Improve kana segmentation (#1446)

* Improve edge case furigana distribution for mixed hiragana/katakana

* Update/add tests
This commit is contained in:
toasted-nutbread 2021-02-26 23:23:16 -05:00 committed by GitHub
parent b994414b14
commit 0bf0620c35
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 38 additions and 6 deletions

View File

@ -521,8 +521,11 @@ const JapaneseUtil = (() => {
groupsStart + 1 groupsStart + 1
); );
if (segments !== null) { if (segments !== null) {
const furigana = reading.startsWith(text) ? '' : reading.substring(0, textLength); if (reading.startsWith(text)) {
segments.unshift(this._createFuriganaSegment(text, furigana)); segments.unshift(this._createFuriganaSegment(text, ''));
} else {
segments.unshift(...this._getFuriganaKanaSegments(text, reading));
}
return segments; return segments;
} }
} }
@ -554,6 +557,22 @@ const JapaneseUtil = (() => {
} }
} }
_getFuriganaKanaSegments(text, reading) {
const textLength = text.length;
const newSegments = [];
let start = 0;
let state = (reading[0] === text[0]);
for (let i = 1; i < textLength; ++i) {
const newState = (reading[i] === text[i]);
if (state === newState) { continue; }
newSegments.push(this._createFuriganaSegment(text.substring(start, i), state ? '' : reading.substring(start, i)));
state = newState;
start = i;
}
newSegments.push(this._createFuriganaSegment(text.substring(start, textLength), state ? '' : reading.substring(start, textLength)));
return newSegments;
}
_getWanakana() { _getWanakana() {
const wanakana = this._wanakana; const wanakana = this._wanakana;
if (wanakana === null) { throw new Error('Functions which use WanaKana are not supported in this context'); } if (wanakana === null) { throw new Error('Functions which use WanaKana are not supported in this context'); }

View File

@ -402,7 +402,8 @@ function testDistributeFurigana() {
[ [
['スズメの涙', 'すずめのなみだ'], ['スズメの涙', 'すずめのなみだ'],
[ [
{text: 'スズメの', furigana: 'すずめの'}, {text: 'スズメ', furigana: 'すずめ'},
{text: 'の', furigana: ''},
{text: '涙', furigana: 'なみだ'} {text: '涙', furigana: 'なみだ'}
] ]
], ],
@ -464,14 +465,16 @@ function testDistributeFurigana() {
[ [
['くノ一', 'くのいち'], ['くノ一', 'くのいち'],
[ [
{text: 'くノ', furigana: 'くの'}, {text: 'く', furigana: ''},
{text: '', furigana: 'の'},
{text: '一', furigana: 'いち'} {text: '一', furigana: 'いち'}
] ]
], ],
[ [
['くノ一', 'くのいち'], ['くノ一', 'くのいち'],
[ [
{text: 'くノ', furigana: 'くの'}, {text: 'く', furigana: ''},
{text: '', furigana: 'の'},
{text: '一', furigana: 'いち'} {text: '一', furigana: 'いち'}
] ]
], ],
@ -691,9 +694,19 @@ function testDistributeFurigana() {
[ [
['ページ違反', 'ぺーじいはん'], ['ページ違反', 'ぺーじいはん'],
[ [
{text: 'ページ', furigana: 'ぺーじ'}, {text: 'ペ', furigana: 'ぺ'},
{text: 'ー', furigana: ''},
{text: 'ジ', furigana: 'じ'},
{text: '違反', furigana: 'いはん'} {text: '違反', furigana: 'いはん'}
] ]
],
// Mismatched kana
[
['サボる', 'サボル'],
[
{text: 'サボ', furigana: ''},
{text: 'る', furigana: 'ル'}
]
] ]
]; ];