Fix incorrect furigana distribution (#1514)
* Improve distributeFuriganaInflected implementation
* Update tests
This commit is contained in:
parent
63de9273b3
commit
800ce9ed9e
@ -466,27 +466,55 @@ const JapaneseUtil = (() => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
distributeFuriganaInflected(expression, reading, source) {
|
distributeFuriganaInflected(expression, reading, source) {
|
||||||
let stemLength = 0;
|
const expressionNormalized = this.convertKatakanaToHiragana(expression);
|
||||||
const shortest = Math.min(source.length, expression.length);
|
const readingNormalized = this.convertKatakanaToHiragana(reading);
|
||||||
const sourceHiragana = this.convertKatakanaToHiragana(source);
|
const sourceNormalized = this.convertKatakanaToHiragana(source);
|
||||||
const expressionHiragana = this.convertKatakanaToHiragana(expression);
|
|
||||||
while (stemLength < shortest && sourceHiragana[stemLength] === expressionHiragana[stemLength]) {
|
|
||||||
++stemLength;
|
|
||||||
}
|
|
||||||
const offset = source.length - stemLength;
|
|
||||||
|
|
||||||
const stemExpression = source.substring(0, source.length - offset);
|
let mainText = expression;
|
||||||
const stemReading = reading.substring(
|
let stemLength = this._getStemLength(expressionNormalized, sourceNormalized);
|
||||||
0,
|
|
||||||
offset === 0 ? reading.length : reading.length - expression.length + stemLength
|
|
||||||
);
|
|
||||||
const result = this.distributeFurigana(stemExpression, stemReading);
|
|
||||||
|
|
||||||
if (stemLength !== source.length) {
|
// Check if source is derived from the reading instead of the expression
|
||||||
result.push(this._createFuriganaSegment(source.substring(stemLength), ''));
|
const readingStemLength = this._getStemLength(readingNormalized, sourceNormalized);
|
||||||
|
if (readingStemLength > stemLength) {
|
||||||
|
mainText = reading;
|
||||||
|
stemLength = readingStemLength;
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
const segments = [];
|
||||||
|
if (stemLength > 0) {
|
||||||
|
const segments2 = this.distributeFurigana(mainText, reading);
|
||||||
|
let consumed = 0;
|
||||||
|
for (const segment of segments2) {
|
||||||
|
const {text} = segment;
|
||||||
|
const start = consumed;
|
||||||
|
consumed += text.length;
|
||||||
|
if (consumed < stemLength) {
|
||||||
|
segments.push(segment);
|
||||||
|
} else if (consumed === stemLength) {
|
||||||
|
segments.push(segment);
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
if (start < stemLength) {
|
||||||
|
segments.push(this._createFuriganaSegment(mainText.substring(start, stemLength), ''));
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (stemLength < source.length) {
|
||||||
|
const remainder = source.substring(stemLength);
|
||||||
|
const segmentCount = segments.length;
|
||||||
|
if (segmentCount > 0 && segments[segmentCount - 1].furigana.length === 0) {
|
||||||
|
// Append to the last segment if it has an empty reading
|
||||||
|
segments[segmentCount - 1].text += remainder;
|
||||||
|
} else {
|
||||||
|
// Otherwise, create a new segment
|
||||||
|
segments.push(this._createFuriganaSegment(remainder, ''));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return segments;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Miscellaneous
|
// Miscellaneous
|
||||||
@ -648,6 +676,27 @@ const JapaneseUtil = (() => {
|
|||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_getStemLength(text1, text2) {
|
||||||
|
const minLength = Math.min(text1.length, text2.length);
|
||||||
|
if (minLength === 0) { return 0; }
|
||||||
|
|
||||||
|
let i = 0;
|
||||||
|
while (true) {
|
||||||
|
const char1 = text1.codePointAt(i);
|
||||||
|
const char2 = text2.codePointAt(i);
|
||||||
|
if (char1 !== char2) { break; }
|
||||||
|
const charLength = String.fromCodePoint(char1).length;
|
||||||
|
i += charLength;
|
||||||
|
if (i >= minLength) {
|
||||||
|
if (i > minLength) {
|
||||||
|
i -= charLength; // Don't consume partial UTF16 surrogate characters
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return i;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -729,16 +729,26 @@ function testDistributeFuriganaInflected() {
|
|||||||
['美味しい', 'おいしい', '美味しかた'],
|
['美味しい', 'おいしい', '美味しかた'],
|
||||||
[
|
[
|
||||||
{text: '美味', furigana: 'おい'},
|
{text: '美味', furigana: 'おい'},
|
||||||
{text: 'し', furigana: ''},
|
{text: 'しかた', furigana: ''}
|
||||||
{text: 'かた', furigana: ''}
|
|
||||||
]
|
]
|
||||||
],
|
],
|
||||||
[
|
[
|
||||||
['食べる', 'たべる', '食べた'],
|
['食べる', 'たべる', '食べた'],
|
||||||
[
|
[
|
||||||
{text: '食', furigana: 'た'},
|
{text: '食', furigana: 'た'},
|
||||||
{text: 'べ', furigana: ''},
|
{text: 'べた', furigana: ''}
|
||||||
{text: 'た', furigana: ''}
|
]
|
||||||
|
],
|
||||||
|
[
|
||||||
|
['迄に', 'までに', 'までに'],
|
||||||
|
[
|
||||||
|
{text: 'までに', furigana: ''}
|
||||||
|
]
|
||||||
|
],
|
||||||
|
[
|
||||||
|
['行う', 'おこなう', 'おこなわなかった'],
|
||||||
|
[
|
||||||
|
{text: 'おこなわなかった', furigana: ''}
|
||||||
]
|
]
|
||||||
]
|
]
|
||||||
];
|
];
|
||||||
|
Loading…
Reference in New Issue
Block a user