From 0bf0620c3579a5fe94c529673db105a83d6c3755 Mon Sep 17 00:00:00 2001
From: toasted-nutbread
Date: Fri, 26 Feb 2021 23:23:16 -0500
Subject: [PATCH] Improve kana segmentation (#1446)

* Improve edge case furigana distribution for mixed hiragana/katakana

* Update/add tests
---
 ext/js/language/japanese-util.js | 23 +++++++++++++++++++++--
 test/test-japanese.js            | 21 +++++++++++++++++----
 2 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/ext/js/language/japanese-util.js b/ext/js/language/japanese-util.js
index c2ce9627..e47cdf55 100644
--- a/ext/js/language/japanese-util.js
+++ b/ext/js/language/japanese-util.js
@@ -521,8 +521,11 @@ const JapaneseUtil = (() => {
                         groupsStart + 1
                     );
                     if (segments !== null) {
-                        const furigana = reading.startsWith(text) ? '' : reading.substring(0, textLength);
-                        segments.unshift(this._createFuriganaSegment(text, furigana));
+                        if (reading.startsWith(text)) {
+                            segments.unshift(this._createFuriganaSegment(text, ''));
+                        } else {
+                            segments.unshift(...this._getFuriganaKanaSegments(text, reading));
+                        }
                         return segments;
                     }
                 }
@@ -554,6 +557,22 @@ const JapaneseUtil = (() => {
             }
         }
 
+        _getFuriganaKanaSegments(text, reading) {
+            const textLength = text.length;
+            const newSegments = [];
+            let start = 0;
+            let state = (reading[0] === text[0]);
+            for (let i = 1; i < textLength; ++i) {
+                const newState = (reading[i] === text[i]);
+                if (state === newState) { continue; }
+                newSegments.push(this._createFuriganaSegment(text.substring(start, i), state ? '' : reading.substring(start, i)));
+                state = newState;
+                start = i;
+            }
+            newSegments.push(this._createFuriganaSegment(text.substring(start, textLength), state ? '' : reading.substring(start, textLength)));
+            return newSegments;
+        }
+
         _getWanakana() {
             const wanakana = this._wanakana;
             if (wanakana === null) { throw new Error('Functions which use WanaKana are not supported in this context'); }
diff --git a/test/test-japanese.js b/test/test-japanese.js
index 590d3157..1a4fc494 100644
--- a/test/test-japanese.js
+++ b/test/test-japanese.js
@@ -402,7 +402,8 @@ function testDistributeFurigana() {
         [
             ['スズメの涙', 'すずめのなみだ'],
             [
-                {text: 'スズメの', furigana: 'すずめの'},
+                {text: 'スズメ', furigana: 'すずめ'},
+                {text: 'の', furigana: ''},
                 {text: '涙', furigana: 'なみだ'}
             ]
         ],
@@ -464,14 +465,16 @@ function testDistributeFurigana() {
         [
             ['くノ一', 'くのいち'],
             [
-                {text: 'くノ', furigana: 'くの'},
+                {text: 'く', furigana: ''},
+                {text: 'ノ', furigana: 'の'},
                 {text: '一', furigana: 'いち'}
             ]
         ],
         [
             ['くノ一', 'くのいち'],
             [
-                {text: 'くノ', furigana: 'くの'},
+                {text: 'く', furigana: ''},
+                {text: 'ノ', furigana: 'の'},
                 {text: '一', furigana: 'いち'}
             ]
         ],
@@ -691,9 +694,19 @@ function testDistributeFurigana() {
         [
             ['ページ違反', 'ぺーじいはん'],
             [
-                {text: 'ページ', furigana: 'ぺーじ'},
+                {text: 'ペ', furigana: 'ぺ'},
+                {text: 'ー', furigana: ''},
+                {text: 'ジ', furigana: 'じ'},
                 {text: '違反', furigana: 'いはん'}
             ]
+        ],
+        // Mismatched kana
+        [
+            ['サボる', 'サボル'],
+            [
+                {text: 'サボ', furigana: ''},
+                {text: 'る', furigana: 'ル'}
+            ]
         ]
     ];
 