Fix incorrect furigana distribution (#1514)

* Improve distributeFuriganaInflected implementation

* Update tests
This commit is contained in:
toasted-nutbread 2021-03-10 20:27:01 -05:00 committed by GitHub
parent 63de9273b3
commit 800ce9ed9e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 80 additions and 21 deletions

View File

@ -466,27 +466,55 @@ const JapaneseUtil = (() => {
} }
// NOTE(review): this span appears to be a side-by-side diff in which the removed
// (left) and added (right) versions of the method share each line, so it is not
// runnable as written. Lines holding a single statement belong to only one version.
//
// Added (right-hand) version, as far as the visible code shows:
//   1. Converts expression, reading and source with convertKatakanaToHiragana.
//   2. Takes the stem length as the common prefix of the converted expression and
//      the converted source (via _getStemLength). If the converted reading shares a
//      longer prefix with the source, the reading becomes the main text instead.
//   3. When the stem is non-empty, runs distributeFurigana(mainText, reading) and
//      keeps its segments up to the stem length. A segment that straddles the stem
//      boundary is replaced by its in-stem text with an empty furigana string.
//   4. Any part of source beyond the stem is appended to the last kept segment when
//      that segment has empty furigana; otherwise it becomes a new segment with
//      empty furigana.
// Returns the resulting array of segments (objects exposing `text` and `furigana`).
distributeFuriganaInflected(expression, reading, source) { distributeFuriganaInflected(expression, reading, source) {
let stemLength = 0; const expressionNormalized = this.convertKatakanaToHiragana(expression);
const shortest = Math.min(source.length, expression.length); const readingNormalized = this.convertKatakanaToHiragana(reading);
const sourceHiragana = this.convertKatakanaToHiragana(source); const sourceNormalized = this.convertKatakanaToHiragana(source);
const expressionHiragana = this.convertKatakanaToHiragana(expression);
while (stemLength < shortest && sourceHiragana[stemLength] === expressionHiragana[stemLength]) {
++stemLength;
}
const offset = source.length - stemLength;
const stemExpression = source.substring(0, source.length - offset); let mainText = expression;
const stemReading = reading.substring( let stemLength = this._getStemLength(expressionNormalized, sourceNormalized);
0,
offset === 0 ? reading.length : reading.length - expression.length + stemLength
);
const result = this.distributeFurigana(stemExpression, stemReading);
if (stemLength !== source.length) { // Check if source is derived from the reading instead of the expression
result.push(this._createFuriganaSegment(source.substring(stemLength), '')); const readingStemLength = this._getStemLength(readingNormalized, sourceNormalized);
if (readingStemLength > stemLength) {
mainText = reading;
stemLength = readingStemLength;
} }
return result; const segments = [];
if (stemLength > 0) {
const segments2 = this.distributeFurigana(mainText, reading);
let consumed = 0;
for (const segment of segments2) {
const {text} = segment;
const start = consumed;
consumed += text.length;
if (consumed < stemLength) {
segments.push(segment);
} else if (consumed === stemLength) {
segments.push(segment);
break;
} else {
if (start < stemLength) {
segments.push(this._createFuriganaSegment(mainText.substring(start, stemLength), ''));
}
break;
}
}
}
if (stemLength < source.length) {
const remainder = source.substring(stemLength);
const segmentCount = segments.length;
if (segmentCount > 0 && segments[segmentCount - 1].furigana.length === 0) {
// Append to the last segment if it has an empty reading
segments[segmentCount - 1].text += remainder;
} else {
// Otherwise, create a new segment
segments.push(this._createFuriganaSegment(remainder, ''));
}
}
return segments;
} }
// Miscellaneous // Miscellaneous
@ -648,6 +676,27 @@ const JapaneseUtil = (() => {
return result; return result;
} }
_getStemLength(text1, text2) {
const minLength = Math.min(text1.length, text2.length);
if (minLength === 0) { return 0; }
let i = 0;
while (true) {
const char1 = text1.codePointAt(i);
const char2 = text2.codePointAt(i);
if (char1 !== char2) { break; }
const charLength = String.fromCodePoint(char1).length;
i += charLength;
if (i >= minLength) {
if (i > minLength) {
i -= charLength; // Don't consume partial UTF16 surrogate characters
}
break;
}
}
return i;
}
} }

View File

@ -729,16 +729,26 @@ function testDistributeFuriganaInflected() {
['美味しい', 'おいしい', '美味しかた'], ['美味しい', 'おいしい', '美味しかた'],
[ [
{text: '美味', furigana: 'おい'}, {text: '美味', furigana: 'おい'},
{text: 'し', furigana: ''}, {text: 'しかた', furigana: ''}
{text: 'かた', furigana: ''}
] ]
], ],
[ [
['食べる', 'たべる', '食べた'], ['食べる', 'たべる', '食べた'],
[ [
{text: '食', furigana: 'た'}, {text: '食', furigana: 'た'},
{text: 'べ', furigana: ''}, {text: 'べた', furigana: ''}
{text: 'た', furigana: ''} ]
],
[
['迄に', 'までに', 'までに'],
[
{text: 'までに', furigana: ''}
]
],
[
['行う', 'おこなう', 'おこなわなかった'],
[
{text: 'おこなわなかった', furigana: ''}
] ]
] ]
]; ];