Furigana distribution improvements (#1157)

* Improve furigana when reading kana is not an exact match

* Simplify group structure

* Return consistent type

* Add more tests

* Remove redundant isAmbiguous assignment

* Simplify group usage

* Add helper function

* Optimize returned arrays

* Use variable

* Remove s(p)lice calls

* Reduce number of convertKatakanaToHiragana calls

* Optimize text length access

* Optimize reading substring

* Move segmentize to a separate function

* Use var

* Use _createFuriganaSegment

* Optimize distributeFuriganaInflected
This commit is contained in:
toasted-nutbread 2020-12-22 11:02:19 -05:00 committed by GitHub
parent b083e9f08f
commit a354becd51
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 418 additions and 55 deletions

View File

@ -406,72 +406,40 @@ const JapaneseUtil = (() => {
distributeFurigana(expression, reading) {
if (!reading || reading === expression) {
// Same
return [{furigana: '', text: expression}];
return [this._createFuriganaSegment(expression, '')];
}
let isAmbiguous = false;
const segmentize = (reading2, groups) => {
if (groups.length === 0 || isAmbiguous) {
return [];
}
const group = groups[0];
if (group.mode === 'kana') {
if (this.convertKatakanaToHiragana(reading2).startsWith(this.convertKatakanaToHiragana(group.text))) {
const readingLeft = reading2.substring(group.text.length);
const segs = segmentize(readingLeft, groups.splice(1));
if (segs) {
return [{text: group.text, furigana: ''}].concat(segs);
}
}
} else {
let foundSegments = null;
for (let i = reading2.length; i >= group.text.length; --i) {
const readingUsed = reading2.substring(0, i);
const readingLeft = reading2.substring(i);
const segs = segmentize(readingLeft, groups.slice(1));
if (segs) {
if (foundSegments !== null) {
// more than one way to segmentize the tail, mark as ambiguous
isAmbiguous = true;
return null;
}
foundSegments = [{text: group.text, furigana: readingUsed}].concat(segs);
}
// there is only one way to segmentize the last non-kana group
if (groups.length === 1) {
break;
}
}
return foundSegments;
}
};
const groups = [];
let modePrev = null;
let groupPre = null;
let isKanaPre = null;
for (const c of expression) {
const codePoint = c.codePointAt(0);
const modeCurr = this.isCodePointKanji(codePoint) || codePoint === ITERATION_MARK_CODE_POINT ? 'kanji' : 'kana';
if (modeCurr === modePrev) {
groups[groups.length - 1].text += c;
const isKana = !(this.isCodePointKanji(codePoint) || codePoint === ITERATION_MARK_CODE_POINT);
if (isKana === isKanaPre) {
groupPre.text += c;
} else {
groups.push({mode: modeCurr, text: c});
modePrev = modeCurr;
groupPre = {isKana, text: c, textNormalized: null};
groups.push(groupPre);
isKanaPre = isKana;
}
}
for (const group of groups) {
if (group.isKana) {
group.textNormalized = this.convertKatakanaToHiragana(group.text);
}
}
const segments = segmentize(reading, groups);
if (segments && !isAmbiguous) {
const readingNormalized = this.convertKatakanaToHiragana(reading);
const segments = this._segmentizeFurigana(reading, readingNormalized, groups, 0);
if (segments !== null) {
return segments;
}
// Fallback
return [{furigana: reading, text: expression}];
return [this._createFuriganaSegment(expression, reading)];
}
distributeFuriganaInflected(expression, reading, source) {
const output = [];
let stemLength = 0;
const shortest = Math.min(source.length, expression.length);
const sourceHiragana = this.convertKatakanaToHiragana(source);
@ -486,15 +454,13 @@ const JapaneseUtil = (() => {
0,
offset === 0 ? reading.length : reading.length - expression.length + stemLength
);
for (const segment of this.distributeFurigana(stemExpression, stemReading)) {
output.push(segment);
}
const result = this.distributeFurigana(stemExpression, stemReading);
if (stemLength !== source.length) {
output.push({text: source.substring(stemLength), furigana: ''});
result.push(this._createFuriganaSegment(source.substring(stemLength), ''));
}
return output;
return result;
}
// Miscellaneous
@ -532,6 +498,62 @@ const JapaneseUtil = (() => {
// Private
_createFuriganaSegment(text, furigana) {
return {text, furigana};
}
_segmentizeFurigana(reading, readingNormalized, groups, groupsStart) {
const groupCount = groups.length - groupsStart;
if (groupCount <= 0) {
return [];
}
const group = groups[groupsStart];
const {isKana, text} = group;
const textLength = text.length;
if (isKana) {
const {textNormalized} = group;
if (readingNormalized.startsWith(textNormalized)) {
const segments = this._segmentizeFurigana(
reading.substring(textLength),
readingNormalized.substring(textLength),
groups,
groupsStart + 1
);
if (segments !== null) {
const furigana = reading.startsWith(text) ? '' : reading.substring(0, textLength);
segments.unshift(this._createFuriganaSegment(text, furigana));
return segments;
}
}
return null;
} else {
let result = null;
for (let i = reading.length; i >= textLength; --i) {
const segments = this._segmentizeFurigana(
reading.substring(i),
readingNormalized.substring(i),
groups,
groupsStart + 1
);
if (segments !== null) {
if (result !== null) {
// More than one way to segmentize the tail; mark as ambiguous
return null;
}
const furigana = reading.substring(0, i);
segments.unshift(this._createFuriganaSegment(text, furigana));
result = segments;
}
// There is only one way to segmentize the last non-kana group
if (groupCount === 1) {
break;
}
}
return result;
}
}
_getWanakana() {
const wanakana = this._wanakana;
if (wanakana === null) { throw new Error('Functions which use WanaKana are not supported in this context'); }

View File

@ -353,6 +353,347 @@ function testDistributeFurigana() {
[
{text: 'かいぬ', furigana: ''}
]
],
// Misc
[
['月', 'か'],
[
{text: '月', furigana: 'か'}
]
],
[
['月', 'カ'],
[
{text: '月', furigana: 'カ'}
]
],
// Mismatched kana readings
[
['有り難う', 'アリガトウ'],
[
{text: '有', furigana: 'ア'},
{text: 'り', furigana: 'リ'},
{text: '難', furigana: 'ガト'},
{text: 'う', furigana: 'ウ'}
]
],
[
['ありがとう', 'アリガトウ'],
[
{text: 'ありがとう', furigana: 'アリガトウ'}
]
],
// Mismatched kana readings (real examples)
[
['カ月', 'かげつ'],
[
{text: 'カ', furigana: 'か'},
{text: '月', furigana: 'げつ'}
]
],
[
['序ノ口', 'じょのくち'],
[
{text: '序', furigana: 'じょ'},
{text: '', furigana: 'の'},
{text: '口', furigana: 'くち'}
]
],
[
['スズメの涙', 'すずめのなみだ'],
[
{text: 'スズメの', furigana: 'すずめの'},
{text: '涙', furigana: 'なみだ'}
]
],
[
['二カ所', 'にかしょ'],
[
{text: '二', furigana: 'に'},
{text: 'カ', furigana: 'か'},
{text: '所', furigana: 'しょ'}
]
],
[
['八ツ橋', 'やつはし'],
[
{text: '八', furigana: 'や'},
{text: 'ツ', furigana: 'つ'},
{text: '橋', furigana: 'はし'}
]
],
[
['八ツ橋', 'やつはし'],
[
{text: '八', furigana: 'や'},
{text: 'ツ', furigana: 'つ'},
{text: '橋', furigana: 'はし'}
]
],
[
['一カ月', 'いっかげつ'],
[
{text: '一', furigana: 'いっ'},
{text: 'カ', furigana: 'か'},
{text: '月', furigana: 'げつ'}
]
],
[
['一カ所', 'いっかしょ'],
[
{text: '一', furigana: 'いっ'},
{text: 'カ', furigana: 'か'},
{text: '所', furigana: 'しょ'}
]
],
[
['カ所', 'かしょ'],
[
{text: 'カ', furigana: 'か'},
{text: '所', furigana: 'しょ'}
]
],
[
['数カ月', 'すうかげつ'],
[
{text: '数', furigana: 'すう'},
{text: 'カ', furigana: 'か'},
{text: '月', furigana: 'げつ'}
]
],
[
['くノ一', 'くのいち'],
[
{text: 'くノ', furigana: 'くの'},
{text: '一', furigana: 'いち'}
]
],
[
['くノ一', 'くのいち'],
[
{text: 'くノ', furigana: 'くの'},
{text: '一', furigana: 'いち'}
]
],
[
['数カ国', 'すうかこく'],
[
{text: '数', furigana: 'すう'},
{text: 'カ', furigana: 'か'},
{text: '国', furigana: 'こく'}
]
],
[
['数カ所', 'すうかしょ'],
[
{text: '数', furigana: 'すう'},
{text: 'カ', furigana: 'か'},
{text: '所', furigana: 'しょ'}
]
],
[
['壇ノ浦の戦い', 'だんのうらのたたかい'],
[
{text: '壇', furigana: 'だん'},
{text: '', furigana: 'の'},
{text: '浦', furigana: 'うら'},
{text: 'の', furigana: ''},
{text: '戦', furigana: 'たたか'},
{text: 'い', furigana: ''}
]
],
[
['壇ノ浦の戦', 'だんのうらのたたかい'],
[
{text: '壇', furigana: 'だん'},
{text: '', furigana: 'の'},
{text: '浦', furigana: 'うら'},
{text: 'の', furigana: ''},
{text: '戦', furigana: 'たたかい'}
]
],
[
['序ノ口格', 'じょのくちかく'],
[
{text: '序', furigana: 'じょ'},
{text: '', furigana: 'の'},
{text: '口格', furigana: 'くちかく'}
]
],
[
['二カ国語', 'にかこくご'],
[
{text: '二', furigana: 'に'},
{text: 'カ', furigana: 'か'},
{text: '国語', furigana: 'こくご'}
]
],
[
['カ国', 'かこく'],
[
{text: 'カ', furigana: 'か'},
{text: '国', furigana: 'こく'}
]
],
[
['カ国語', 'かこくご'],
[
{text: 'カ', furigana: 'か'},
{text: '国語', furigana: 'こくご'}
]
],
[
['壇ノ浦の合戦', 'だんのうらのかっせん'],
[
{text: '壇', furigana: 'だん'},
{text: '', furigana: 'の'},
{text: '浦', furigana: 'うら'},
{text: 'の', furigana: ''},
{text: '合戦', furigana: 'かっせん'}
]
],
[
['一タ偏', 'いちたへん'],
[
{text: '一', furigana: 'いち'},
{text: 'タ', furigana: 'た'},
{text: '偏', furigana: 'へん'}
]
],
[
['ル又', 'るまた'],
[
{text: 'ル', furigana: 'る'},
{text: '又', furigana: 'また'}
]
],
[
['ノ木偏', 'のぎへん'],
[
{text: '', furigana: 'の'},
{text: '木偏', furigana: 'ぎへん'}
]
],
[
['一ノ貝', 'いちのかい'],
[
{text: '一', furigana: 'いち'},
{text: '', furigana: 'の'},
{text: '貝', furigana: 'かい'}
]
],
[
['虎ノ門事件', 'とらのもんじけん'],
[
{text: '虎', furigana: 'とら'},
{text: '', furigana: 'の'},
{text: '門事件', furigana: 'もんじけん'}
]
],
[
['教育ニ関スル勅語', 'きょういくにかんするちょくご'],
[
{text: '教育', furigana: 'きょういく'},
{text: 'ニ', furigana: 'に'},
{text: '関', furigana: 'かん'},
{text: 'スル', furigana: 'する'},
{text: '勅語', furigana: 'ちょくご'}
]
],
[
['二カ年', 'にかねん'],
[
{text: '二', furigana: 'に'},
{text: 'カ', furigana: 'か'},
{text: '年', furigana: 'ねん'}
]
],
[
['三カ年', 'さんかねん'],
[
{text: '三', furigana: 'さん'},
{text: 'カ', furigana: 'か'},
{text: '年', furigana: 'ねん'}
]
],
[
['四カ年', 'よんかねん'],
[
{text: '四', furigana: 'よん'},
{text: 'カ', furigana: 'か'},
{text: '年', furigana: 'ねん'}
]
],
[
['五カ年', 'ごかねん'],
[
{text: '五', furigana: 'ご'},
{text: 'カ', furigana: 'か'},
{text: '年', furigana: 'ねん'}
]
],
[
['六カ年', 'ろっかねん'],
[
{text: '六', furigana: 'ろっ'},
{text: 'カ', furigana: 'か'},
{text: '年', furigana: 'ねん'}
]
],
[
['七カ年', 'ななかねん'],
[
{text: '七', furigana: 'なな'},
{text: 'カ', furigana: 'か'},
{text: '年', furigana: 'ねん'}
]
],
[
['八カ年', 'はちかねん'],
[
{text: '八', furigana: 'はち'},
{text: 'カ', furigana: 'か'},
{text: '年', furigana: 'ねん'}
]
],
[
['九カ年', 'きゅうかねん'],
[
{text: '九', furigana: 'きゅう'},
{text: 'カ', furigana: 'か'},
{text: '年', furigana: 'ねん'}
]
],
[
['十カ年', 'じゅうかねん'],
[
{text: '十', furigana: 'じゅう'},
{text: 'カ', furigana: 'か'},
{text: '年', furigana: 'ねん'}
]
],
[
['鏡ノ間', 'かがみのま'],
[
{text: '鏡', furigana: 'かがみ'},
{text: '', furigana: 'の'},
{text: '間', furigana: 'ま'}
]
],
[
['鏡ノ間', 'かがみのま'],
[
{text: '鏡', furigana: 'かがみ'},
{text: '', furigana: 'の'},
{text: '間', furigana: 'ま'}
]
],
[
['ページ違反', 'ぺーじいはん'],
[
{text: 'ページ', furigana: 'ぺーじ'},
{text: '違反', furigana: 'いはん'}
]
]
];