Merge pull request #440 from toasted-nutbread/collapse-emphatic-sequences
Add support for collapsing emphatic character sequences
This commit is contained in:
commit
649adb13d8
@ -388,7 +388,8 @@
|
|||||||
"convertNumericCharacters",
|
"convertNumericCharacters",
|
||||||
"convertAlphabeticCharacters",
|
"convertAlphabeticCharacters",
|
||||||
"convertHiraganaToKatakana",
|
"convertHiraganaToKatakana",
|
||||||
"convertKatakanaToHiragana"
|
"convertKatakanaToHiragana",
|
||||||
|
"collapseEmphaticSequences"
|
||||||
],
|
],
|
||||||
"properties": {
|
"properties": {
|
||||||
"convertHalfWidthCharacters": {
|
"convertHalfWidthCharacters": {
|
||||||
@ -415,6 +416,11 @@
|
|||||||
"type": "string",
|
"type": "string",
|
||||||
"enum": ["false", "true", "variant"],
|
"enum": ["false", "true", "variant"],
|
||||||
"default": "variant"
|
"default": "variant"
|
||||||
|
},
|
||||||
|
"collapseEmphaticSequences": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["false", "true", "full"],
|
||||||
|
"default": "false"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -82,6 +82,9 @@
|
|||||||
|
|
||||||
const ITERATION_MARK_CODE_POINT = 0x3005;
|
const ITERATION_MARK_CODE_POINT = 0x3005;
|
||||||
|
|
||||||
|
const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063;
|
||||||
|
const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3;
|
||||||
|
const KANA_PROLONGED_SOUND_MARK_CODE_POINT = 0x30fc;
|
||||||
|
|
||||||
// Existing functions
|
// Existing functions
|
||||||
|
|
||||||
@ -372,6 +375,40 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Miscellaneous
|
||||||
|
|
||||||
|
/**
 * Shortens runs of emphatic kana marks: hiragana small tsu, katakana small tsu
 * and the kana prolonged sound mark.
 *
 * A run is a sequence of consecutive occurrences of the same mark. Without
 * fullCollapse, the first character of each run is kept and the repeats are
 * dropped. With fullCollapse, every character of the run is dropped.
 * All other characters are copied through unchanged.
 *
 * @param {string} text Input text, iterated by code point.
 * @param {boolean} fullCollapse Whether to drop entire runs instead of keeping one character.
 * @param {?object} sourceMap Optional map which is notified, through
 *   combine(index, 1), each time a character is dropped. The index is that of
 *   the last output character so far, clamped to 0 when nothing has been
 *   output yet. NOTE(review): presumably combine merges the following mapping
 *   entry into the one at index - confirm against TextSourceMap.
 * @returns {string} The collapsed text.
 */
function collapseEmphaticSequences(text, fullCollapse, sourceMap=null) {
    let output = '';
    // Code point of the emphatic run currently being read, or -1 outside a run.
    let activeRunCodePoint = -1;

    for (const character of text) {
        const codePoint = character.codePointAt(0);

        let isEmphatic;
        switch (codePoint) {
            case HIRAGANA_SMALL_TSU_CODE_POINT:
            case KATAKANA_SMALL_TSU_CODE_POINT:
            case KANA_PROLONGED_SOUND_MARK_CODE_POINT:
                isEmphatic = true;
                break;
            default:
                isEmphatic = false;
                break;
        }

        const startsNewRun = isEmphatic && codePoint !== activeRunCodePoint;
        activeRunCodePoint = isEmphatic ? codePoint : -1;

        // Ordinary characters are always kept; the head of a run is kept
        // only when the run is not being removed entirely.
        if (!isEmphatic || (startsNewRun && !fullCollapse)) {
            output += character;
            continue;
        }

        // The character was dropped, so report the removal to the source map.
        if (sourceMap !== null) {
            sourceMap.combine(Math.max(0, output.length - 1), 1);
        }
    }

    return output;
}
|
||||||
|
|
||||||
|
|
||||||
// Exports
|
// Exports
|
||||||
|
|
||||||
Object.assign(jp, {
|
Object.assign(jp, {
|
||||||
@ -383,6 +420,7 @@
|
|||||||
convertHalfWidthKanaToFullWidth,
|
convertHalfWidthKanaToFullWidth,
|
||||||
convertAlphabeticToKana,
|
convertAlphabeticToKana,
|
||||||
distributeFurigana,
|
distributeFurigana,
|
||||||
distributeFuriganaInflected
|
distributeFuriganaInflected,
|
||||||
|
collapseEmphaticSequences
|
||||||
});
|
});
|
||||||
})();
|
})();
|
||||||
|
@ -170,7 +170,8 @@ function profileOptionsCreateDefaults() {
|
|||||||
convertNumericCharacters: 'false',
|
convertNumericCharacters: 'false',
|
||||||
convertAlphabeticCharacters: 'false',
|
convertAlphabeticCharacters: 'false',
|
||||||
convertHiraganaToKatakana: 'false',
|
convertHiraganaToKatakana: 'false',
|
||||||
convertKatakanaToHiragana: 'variant'
|
convertKatakanaToHiragana: 'variant',
|
||||||
|
collapseEmphaticSequences: 'false'
|
||||||
},
|
},
|
||||||
|
|
||||||
dictionaries: {},
|
dictionaries: {},
|
||||||
|
@ -118,6 +118,7 @@ async function formRead(options) {
|
|||||||
options.translation.convertAlphabeticCharacters = $('#translation-convert-alphabetic-characters').val();
|
options.translation.convertAlphabeticCharacters = $('#translation-convert-alphabetic-characters').val();
|
||||||
options.translation.convertHiraganaToKatakana = $('#translation-convert-hiragana-to-katakana').val();
|
options.translation.convertHiraganaToKatakana = $('#translation-convert-hiragana-to-katakana').val();
|
||||||
options.translation.convertKatakanaToHiragana = $('#translation-convert-katakana-to-hiragana').val();
|
options.translation.convertKatakanaToHiragana = $('#translation-convert-katakana-to-hiragana').val();
|
||||||
|
options.translation.collapseEmphaticSequences = $('#translation-collapse-emphatic-sequences').val();
|
||||||
|
|
||||||
options.parsing.enableScanningParser = $('#parsing-scan-enable').prop('checked');
|
options.parsing.enableScanningParser = $('#parsing-scan-enable').prop('checked');
|
||||||
options.parsing.enableMecabParser = $('#parsing-mecab-enable').prop('checked');
|
options.parsing.enableMecabParser = $('#parsing-mecab-enable').prop('checked');
|
||||||
@ -199,6 +200,7 @@ async function formWrite(options) {
|
|||||||
$('#translation-convert-alphabetic-characters').val(options.translation.convertAlphabeticCharacters);
|
$('#translation-convert-alphabetic-characters').val(options.translation.convertAlphabeticCharacters);
|
||||||
$('#translation-convert-hiragana-to-katakana').val(options.translation.convertHiraganaToKatakana);
|
$('#translation-convert-hiragana-to-katakana').val(options.translation.convertHiraganaToKatakana);
|
||||||
$('#translation-convert-katakana-to-hiragana').val(options.translation.convertKatakanaToHiragana);
|
$('#translation-convert-katakana-to-hiragana').val(options.translation.convertKatakanaToHiragana);
|
||||||
|
$('#translation-collapse-emphatic-sequences').val(options.translation.collapseEmphaticSequences);
|
||||||
|
|
||||||
$('#parsing-scan-enable').prop('checked', options.parsing.enableScanningParser);
|
$('#parsing-scan-enable').prop('checked', options.parsing.enableScanningParser);
|
||||||
$('#parsing-mecab-enable').prop('checked', options.parsing.enableMecabParser);
|
$('#parsing-mecab-enable').prop('checked', options.parsing.enableMecabParser);
|
||||||
|
@ -347,17 +347,27 @@ class Translator {
|
|||||||
|
|
||||||
getAllDeinflections(text, options) {
|
getAllDeinflections(text, options) {
|
||||||
const translationOptions = options.translation;
|
const translationOptions = options.translation;
|
||||||
|
const collapseEmphaticOptions = [[false, false]];
|
||||||
|
switch (translationOptions.collapseEmphaticSequences) {
|
||||||
|
case 'true':
|
||||||
|
collapseEmphaticOptions.push([true, false]);
|
||||||
|
break;
|
||||||
|
case 'full':
|
||||||
|
collapseEmphaticOptions.push([true, false], [true, true]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
const textOptionVariantArray = [
|
const textOptionVariantArray = [
|
||||||
Translator.getTextOptionEntryVariants(translationOptions.convertHalfWidthCharacters),
|
Translator.getTextOptionEntryVariants(translationOptions.convertHalfWidthCharacters),
|
||||||
Translator.getTextOptionEntryVariants(translationOptions.convertNumericCharacters),
|
Translator.getTextOptionEntryVariants(translationOptions.convertNumericCharacters),
|
||||||
Translator.getTextOptionEntryVariants(translationOptions.convertAlphabeticCharacters),
|
Translator.getTextOptionEntryVariants(translationOptions.convertAlphabeticCharacters),
|
||||||
Translator.getTextOptionEntryVariants(translationOptions.convertHiraganaToKatakana),
|
Translator.getTextOptionEntryVariants(translationOptions.convertHiraganaToKatakana),
|
||||||
Translator.getTextOptionEntryVariants(translationOptions.convertKatakanaToHiragana)
|
Translator.getTextOptionEntryVariants(translationOptions.convertKatakanaToHiragana),
|
||||||
|
collapseEmphaticOptions
|
||||||
];
|
];
|
||||||
|
|
||||||
const deinflections = [];
|
const deinflections = [];
|
||||||
const used = new Set();
|
const used = new Set();
|
||||||
for (const [halfWidth, numeric, alphabetic, katakana, hiragana] of Translator.getArrayVariants(textOptionVariantArray)) {
|
for (const [halfWidth, numeric, alphabetic, katakana, hiragana, [collapseEmphatic, collapseEmphaticFull]] of Translator.getArrayVariants(textOptionVariantArray)) {
|
||||||
let text2 = text;
|
let text2 = text;
|
||||||
const sourceMap = new TextSourceMap(text2);
|
const sourceMap = new TextSourceMap(text2);
|
||||||
if (halfWidth) {
|
if (halfWidth) {
|
||||||
@ -375,6 +385,9 @@ class Translator {
|
|||||||
if (hiragana) {
|
if (hiragana) {
|
||||||
text2 = jp.convertKatakanaToHiragana(text2);
|
text2 = jp.convertKatakanaToHiragana(text2);
|
||||||
}
|
}
|
||||||
|
if (collapseEmphatic) {
|
||||||
|
text2 = jp.collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap);
|
||||||
|
}
|
||||||
|
|
||||||
for (let i = text2.length; i > 0; --i) {
|
for (let i = text2.length; i > 0; --i) {
|
||||||
const text2Substring = text2.substring(0, i);
|
const text2Substring = text2.substring(0, i);
|
||||||
|
@ -427,7 +427,7 @@
|
|||||||
|
|
||||||
<p class="help-block">
|
<p class="help-block">
|
||||||
The conversion options below are listed in the order that the conversions are applied to the input text.
|
The conversion options below are listed in the order that the conversions are applied to the input text.
|
||||||
Each conversion has three possible values:
|
Conversions commonly have three possible values:
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<ul class="help-block">
|
<ul class="help-block">
|
||||||
@ -490,6 +490,15 @@
|
|||||||
<option value="variant">Use both variants</option>
|
<option value="variant">Use both variants</option>
|
||||||
</select>
|
</select>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div class="form-group">
|
||||||
|
<label for="translation-collapse-emphatic-sequences">Collapse emphatic character sequences <span class="label-light">(すっっごーーい → すっごーい / すごい)</span></label>
|
||||||
|
<select class="form-control" id="translation-collapse-emphatic-sequences">
|
||||||
|
<option value="false">Disabled</option>
|
||||||
|
<option value="true">Collapse into single character</option>
|
||||||
|
<option value="full">Remove all characters</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div id="popup-content-scanning">
|
<div id="popup-content-scanning">
|
||||||
|
@ -393,6 +393,59 @@ function testDistributeFuriganaInflected() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Checks jp.collapseEmphaticSequences against a table of inputs, both with
 * and without a source map attached.
 */
function testCollapseEmphaticSequences() {
    // Each case has the form:
    //   [[input text, fullCollapse flag], [expected output, expected source mapping]]
    const cases = [
        // Small tsu in the middle of the text
        [['かこい', false], ['かこい', [1, 1, 1]]],
        [['かこい', true], ['かこい', [1, 1, 1]]],
        [['かっこい', false], ['かっこい', [1, 1, 1, 1]]],
        [['かっこい', true], ['かこい', [2, 1, 1]]],
        [['かっっこい', false], ['かっこい', [1, 2, 1, 1]]],
        [['かっっこい', true], ['かこい', [3, 1, 1]]],
        [['かっっっこい', false], ['かっこい', [1, 3, 1, 1]]],
        [['かっっっこい', true], ['かこい', [4, 1, 1]]],

        // Small tsu at the start of the text
        [['こい', false], ['こい', [1, 1]]],
        [['こい', true], ['こい', [1, 1]]],
        [['っこい', false], ['っこい', [1, 1, 1]]],
        [['っこい', true], ['こい', [2, 1]]],
        [['っっこい', false], ['っこい', [2, 1, 1]]],
        [['っっこい', true], ['こい', [3, 1]]],
        [['っっっこい', false], ['っこい', [3, 1, 1]]],
        [['っっっこい', true], ['こい', [4, 1]]],

        // Prolonged sound mark, alone and combined with small tsu
        [['すごい', false], ['すごい', [1, 1, 1]]],
        [['すごい', true], ['すごい', [1, 1, 1]]],
        [['すごーい', false], ['すごーい', [1, 1, 1, 1]]],
        [['すごーい', true], ['すごい', [1, 2, 1]]],
        [['すごーーい', false], ['すごーい', [1, 1, 2, 1]]],
        [['すごーーい', true], ['すごい', [1, 3, 1]]],
        [['すっごーい', false], ['すっごーい', [1, 1, 1, 1, 1]]],
        [['すっごーい', true], ['すごい', [2, 2, 1]]],
        [['すっっごーーい', false], ['すっごーい', [1, 2, 1, 2, 1]]],
        [['すっっごーーい', true], ['すごい', [3, 3, 1]]],

        // Empty text and text made up only of small tsu
        [['', false], ['', []]],
        [['', true], ['', []]],
        [['っ', false], ['っ', [1]]],
        [['っ', true], ['', [1]]],
        [['っっ', false], ['っ', [2]]],
        [['っっ', true], ['', [2]]],
        [['っっっ', false], ['っ', [3]]],
        [['っっっ', true], ['', [3]]]
    ];

    cases.forEach(([input, output]) => {
        const [source, full] = input;
        const [expectedText, expectedMapping] = output;

        // Run once without a source map and once with one; the returned
        // text must be the same in both modes.
        const trackedMap = new TextSourceMap(source);
        const resultWithoutMap = jp.collapseEmphaticSequences(source, full, null);
        const resultWithMap = jp.collapseEmphaticSequences(source, full, trackedMap);
        assert.strictEqual(resultWithoutMap, expectedText);
        assert.strictEqual(resultWithMap, expectedText);

        if (expectedMapping === undefined) {
            return;
        }
        assert.ok(trackedMap.equals(new TextSourceMap(source, expectedMapping)));
    });
}
|
||||||
|
|
||||||
function testIsMoraPitchHigh() {
|
function testIsMoraPitchHigh() {
|
||||||
const data = [
|
const data = [
|
||||||
[[0, 0], false],
|
[[0, 0], false],
|
||||||
@ -462,6 +515,7 @@ function main() {
|
|||||||
testConvertAlphabeticToKana();
|
testConvertAlphabeticToKana();
|
||||||
testDistributeFurigana();
|
testDistributeFurigana();
|
||||||
testDistributeFuriganaInflected();
|
testDistributeFuriganaInflected();
|
||||||
|
testCollapseEmphaticSequences();
|
||||||
testIsMoraPitchHigh();
|
testIsMoraPitchHigh();
|
||||||
testGetKanaMorae();
|
testGetKanaMorae();
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user