Merge pull request #440 from toasted-nutbread/collapse-emphatic-sequences

Add support for collapsing emphatic character sequences
This commit is contained in:
toasted-nutbread 2020-04-12 11:42:46 -04:00 committed by GitHub
commit 649adb13d8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 129 additions and 6 deletions

View File

@ -388,7 +388,8 @@
"convertNumericCharacters", "convertNumericCharacters",
"convertAlphabeticCharacters", "convertAlphabeticCharacters",
"convertHiraganaToKatakana", "convertHiraganaToKatakana",
"convertKatakanaToHiragana" "convertKatakanaToHiragana",
"collapseEmphaticSequences"
], ],
"properties": { "properties": {
"convertHalfWidthCharacters": { "convertHalfWidthCharacters": {
@ -415,6 +416,11 @@
"type": "string", "type": "string",
"enum": ["false", "true", "variant"], "enum": ["false", "true", "variant"],
"default": "variant" "default": "variant"
},
"collapseEmphaticSequences": {
"type": "string",
"enum": ["false", "true", "full"],
"default": "false"
} }
} }
}, },

View File

@ -82,6 +82,9 @@
const ITERATION_MARK_CODE_POINT = 0x3005; const ITERATION_MARK_CODE_POINT = 0x3005;
// Code points of the kana characters that collapseEmphaticSequences treats as emphatic.
const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063; // U+3063 (hiragana small tsu, "っ")
const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3; // U+30C3 (katakana small tsu, "ッ")
const KANA_PROLONGED_SOUND_MARK_CODE_POINT = 0x30fc; // U+30FC (prolonged sound mark, "ー")
// Existing functions // Existing functions
@ -372,6 +375,40 @@
} }
// Miscellaneous
/**
 * Shortens runs of emphatic kana (small tsu in either script, or the prolonged
 * sound mark) found in the given text.
 *
 * A run is a sequence of the *same* emphatic character. With fullCollapse
 * disabled, only the first character of each run is kept; with fullCollapse
 * enabled, the entire run is removed.
 *
 * @param {string} text The text to process.
 * @param {boolean} fullCollapse Whether runs are removed entirely instead of
 *   being reduced to a single character.
 * @param {?object} sourceMap Optional object exposing combine(index, count).
 *   For every dropped character, combine is called once with the index of the
 *   last code unit emitted so far (clamped to 0) and a count of 1.
 * @returns {string} The text with emphatic runs collapsed.
 */
function collapseEmphaticSequences(text, fullCollapse, sourceMap=null) {
    const emphaticCodePoints = [
        HIRAGANA_SMALL_TSU_CODE_POINT,
        KATAKANA_SMALL_TSU_CODE_POINT,
        KANA_PROLONGED_SOUND_MARK_CODE_POINT
    ];
    const shouldUpdateSourceMap = (sourceMap !== null);

    let output = '';
    // Code point of the emphatic run currently being scanned, or -1 when outside of a run.
    let activeRunCodePoint = -1;

    for (const symbol of text) {
        const codePoint = symbol.codePointAt(0);
        const isEmphatic = emphaticCodePoints.includes(codePoint);
        const startsNewRun = isEmphatic && codePoint !== activeRunCodePoint;
        activeRunCodePoint = isEmphatic ? codePoint : -1;

        // Ordinary characters are always kept; the head of a run survives only
        // when the run is not being removed in its entirety.
        if (!isEmphatic || (startsNewRun && !fullCollapse)) {
            output += symbol;
        } else if (shouldUpdateSourceMap) {
            // The dropped character is merged into whatever was emitted last
            // (or into the first entry when nothing has been emitted yet).
            sourceMap.combine(Math.max(0, output.length - 1), 1);
        }
    }

    return output;
}
// Exports // Exports
Object.assign(jp, { Object.assign(jp, {
@ -383,6 +420,7 @@
convertHalfWidthKanaToFullWidth, convertHalfWidthKanaToFullWidth,
convertAlphabeticToKana, convertAlphabeticToKana,
distributeFurigana, distributeFurigana,
distributeFuriganaInflected distributeFuriganaInflected,
collapseEmphaticSequences
}); });
})(); })();

View File

@ -170,7 +170,8 @@ function profileOptionsCreateDefaults() {
convertNumericCharacters: 'false', convertNumericCharacters: 'false',
convertAlphabeticCharacters: 'false', convertAlphabeticCharacters: 'false',
convertHiraganaToKatakana: 'false', convertHiraganaToKatakana: 'false',
convertKatakanaToHiragana: 'variant' convertKatakanaToHiragana: 'variant',
collapseEmphaticSequences: 'false'
}, },
dictionaries: {}, dictionaries: {},

View File

@ -118,6 +118,7 @@ async function formRead(options) {
options.translation.convertAlphabeticCharacters = $('#translation-convert-alphabetic-characters').val(); options.translation.convertAlphabeticCharacters = $('#translation-convert-alphabetic-characters').val();
options.translation.convertHiraganaToKatakana = $('#translation-convert-hiragana-to-katakana').val(); options.translation.convertHiraganaToKatakana = $('#translation-convert-hiragana-to-katakana').val();
options.translation.convertKatakanaToHiragana = $('#translation-convert-katakana-to-hiragana').val(); options.translation.convertKatakanaToHiragana = $('#translation-convert-katakana-to-hiragana').val();
options.translation.collapseEmphaticSequences = $('#translation-collapse-emphatic-sequences').val();
options.parsing.enableScanningParser = $('#parsing-scan-enable').prop('checked'); options.parsing.enableScanningParser = $('#parsing-scan-enable').prop('checked');
options.parsing.enableMecabParser = $('#parsing-mecab-enable').prop('checked'); options.parsing.enableMecabParser = $('#parsing-mecab-enable').prop('checked');
@ -199,6 +200,7 @@ async function formWrite(options) {
$('#translation-convert-alphabetic-characters').val(options.translation.convertAlphabeticCharacters); $('#translation-convert-alphabetic-characters').val(options.translation.convertAlphabeticCharacters);
$('#translation-convert-hiragana-to-katakana').val(options.translation.convertHiraganaToKatakana); $('#translation-convert-hiragana-to-katakana').val(options.translation.convertHiraganaToKatakana);
$('#translation-convert-katakana-to-hiragana').val(options.translation.convertKatakanaToHiragana); $('#translation-convert-katakana-to-hiragana').val(options.translation.convertKatakanaToHiragana);
$('#translation-collapse-emphatic-sequences').val(options.translation.collapseEmphaticSequences);
$('#parsing-scan-enable').prop('checked', options.parsing.enableScanningParser); $('#parsing-scan-enable').prop('checked', options.parsing.enableScanningParser);
$('#parsing-mecab-enable').prop('checked', options.parsing.enableMecabParser); $('#parsing-mecab-enable').prop('checked', options.parsing.enableMecabParser);

View File

@ -347,17 +347,27 @@ class Translator {
getAllDeinflections(text, options) { getAllDeinflections(text, options) {
const translationOptions = options.translation; const translationOptions = options.translation;
const collapseEmphaticOptions = [[false, false]];
switch (translationOptions.collapseEmphaticSequences) {
case 'true':
collapseEmphaticOptions.push([true, false]);
break;
case 'full':
collapseEmphaticOptions.push([true, false], [true, true]);
break;
}
const textOptionVariantArray = [ const textOptionVariantArray = [
Translator.getTextOptionEntryVariants(translationOptions.convertHalfWidthCharacters), Translator.getTextOptionEntryVariants(translationOptions.convertHalfWidthCharacters),
Translator.getTextOptionEntryVariants(translationOptions.convertNumericCharacters), Translator.getTextOptionEntryVariants(translationOptions.convertNumericCharacters),
Translator.getTextOptionEntryVariants(translationOptions.convertAlphabeticCharacters), Translator.getTextOptionEntryVariants(translationOptions.convertAlphabeticCharacters),
Translator.getTextOptionEntryVariants(translationOptions.convertHiraganaToKatakana), Translator.getTextOptionEntryVariants(translationOptions.convertHiraganaToKatakana),
Translator.getTextOptionEntryVariants(translationOptions.convertKatakanaToHiragana) Translator.getTextOptionEntryVariants(translationOptions.convertKatakanaToHiragana),
collapseEmphaticOptions
]; ];
const deinflections = []; const deinflections = [];
const used = new Set(); const used = new Set();
for (const [halfWidth, numeric, alphabetic, katakana, hiragana] of Translator.getArrayVariants(textOptionVariantArray)) { for (const [halfWidth, numeric, alphabetic, katakana, hiragana, [collapseEmphatic, collapseEmphaticFull]] of Translator.getArrayVariants(textOptionVariantArray)) {
let text2 = text; let text2 = text;
const sourceMap = new TextSourceMap(text2); const sourceMap = new TextSourceMap(text2);
if (halfWidth) { if (halfWidth) {
@ -375,6 +385,9 @@ class Translator {
if (hiragana) { if (hiragana) {
text2 = jp.convertKatakanaToHiragana(text2); text2 = jp.convertKatakanaToHiragana(text2);
} }
if (collapseEmphatic) {
text2 = jp.collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap);
}
for (let i = text2.length; i > 0; --i) { for (let i = text2.length; i > 0; --i) {
const text2Substring = text2.substring(0, i); const text2Substring = text2.substring(0, i);

View File

@ -427,7 +427,7 @@
<p class="help-block"> <p class="help-block">
The conversion options below are listed in the order that the conversions are applied to the input text. The conversion options below are listed in the order that the conversions are applied to the input text.
Each conversion has three possible values: Conversions commonly have three possible values:
</p> </p>
<ul class="help-block"> <ul class="help-block">
@ -490,6 +490,15 @@
<option value="variant">Use both variants</option> <option value="variant">Use both variants</option>
</select> </select>
</div> </div>
<div class="form-group">
<label for="translation-collapse-emphatic-sequences">Collapse emphatic character sequences <span class="label-light">(すっっごーーい &rarr; すっごーい / すごい)</span></label>
<select class="form-control" id="translation-collapse-emphatic-sequences">
<option value="false">Disabled</option>
<option value="true">Collapse into single character</option>
<option value="full">Remove all characters</option>
</select>
</div>
</div> </div>
<div id="popup-content-scanning"> <div id="popup-content-scanning">

View File

@ -393,6 +393,59 @@ function testDistributeFuriganaInflected() {
} }
} }
/**
 * Checks jp.collapseEmphaticSequences against a table of inputs.
 *
 * Each row is [[text, fullCollapse], [expectedText, expectedSourceMapping]].
 * The function is invoked both without and with a TextSourceMap; both results
 * must equal the expected text, and the source map must equal one constructed
 * from the expected mapping.
 */
function testCollapseEmphaticSequences() {
    const data = [
        [['かこい', false], ['かこい', [1, 1, 1]]],
        [['かこい', true], ['かこい', [1, 1, 1]]],
        [['かっこい', false], ['かっこい', [1, 1, 1, 1]]],
        [['かっこい', true], ['かこい', [2, 1, 1]]],
        [['かっっこい', false], ['かっこい', [1, 2, 1, 1]]],
        [['かっっこい', true], ['かこい', [3, 1, 1]]],
        [['かっっっこい', false], ['かっこい', [1, 3, 1, 1]]],
        [['かっっっこい', true], ['かこい', [4, 1, 1]]],

        [['こい', false], ['こい', [1, 1]]],
        [['こい', true], ['こい', [1, 1]]],
        [['っこい', false], ['っこい', [1, 1, 1]]],
        [['っこい', true], ['こい', [2, 1]]],
        [['っっこい', false], ['っこい', [2, 1, 1]]],
        [['っっこい', true], ['こい', [3, 1]]],
        [['っっっこい', false], ['っこい', [3, 1, 1]]],
        [['っっっこい', true], ['こい', [4, 1]]],

        [['すごい', false], ['すごい', [1, 1, 1]]],
        [['すごい', true], ['すごい', [1, 1, 1]]],
        [['すごーい', false], ['すごーい', [1, 1, 1, 1]]],
        [['すごーい', true], ['すごい', [1, 2, 1]]],
        [['すごーーい', false], ['すごーい', [1, 1, 2, 1]]],
        [['すごーーい', true], ['すごい', [1, 3, 1]]],
        [['すっごーい', false], ['すっごーい', [1, 1, 1, 1, 1]]],
        [['すっごーい', true], ['すごい', [2, 2, 1]]],
        [['すっっごーーい', false], ['すっごーい', [1, 2, 1, 2, 1]]],
        [['すっっごーーい', true], ['すごい', [3, 3, 1]]],

        [['', false], ['', []]],
        [['', true], ['', []]],
        [['っ', false], ['っ', [1]]],
        [['っ', true], ['', [1]]],
        [['っっ', false], ['っ', [2]]],
        [['っっ', true], ['', [2]]],
        [['っっっ', false], ['っ', [3]]],
        [['っっっ', true], ['', [3]]]
    ];

    data.forEach(([input, output]) => {
        const [text, fullCollapse] = input;
        const [expected, expectedSourceMapping] = output;

        // Run once without and once with a source map; the returned text must not depend on it.
        const sourceMap = new TextSourceMap(text);
        const withoutSourceMap = jp.collapseEmphaticSequences(text, fullCollapse, null);
        const withSourceMap = jp.collapseEmphaticSequences(text, fullCollapse, sourceMap);

        assert.strictEqual(withoutSourceMap, expected);
        assert.strictEqual(withSourceMap, expected);

        // Rows may omit the mapping, in which case the source map is not checked.
        if (expectedSourceMapping !== undefined) {
            const expectedSourceMap = new TextSourceMap(text, expectedSourceMapping);
            assert.ok(sourceMap.equals(expectedSourceMap));
        }
    });
}
function testIsMoraPitchHigh() { function testIsMoraPitchHigh() {
const data = [ const data = [
[[0, 0], false], [[0, 0], false],
@ -462,6 +515,7 @@ function main() {
testConvertAlphabeticToKana(); testConvertAlphabeticToKana();
testDistributeFurigana(); testDistributeFurigana();
testDistributeFuriganaInflected(); testDistributeFuriganaInflected();
testCollapseEmphaticSequences();
testIsMoraPitchHigh(); testIsMoraPitchHigh();
testGetKanaMorae(); testGetKanaMorae();
} }