From d739ccd63f0554f0f880e7463355dd5c4ff166e4 Mon Sep 17 00:00:00 2001 From: toasted-nutbread Date: Sun, 26 Sep 2021 13:29:55 -0400 Subject: [PATCH] Fix japanese codepoint range issues (#1960) * Add CJK_COMPATIBILITY_IDEOGRAPHS_RANGE * Rename CJK_UNIFIED_IDEOGRAPHS_RANGES => CJK_IDEOGRAPH_RANGES * Simplify isKana check * Update tests --- ext/js/language/sandbox/japanese-util.js | 11 ++++++----- test/test-japanese-util.js | 24 +++++++++++++++++++++--- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/ext/js/language/sandbox/japanese-util.js b/ext/js/language/sandbox/japanese-util.js index 9257e1e5..9b58d255 100644 --- a/ext/js/language/sandbox/japanese-util.js +++ b/ext/js/language/sandbox/japanese-util.js @@ -16,7 +16,6 @@ */ const JapaneseUtil = (() => { - const ITERATION_MARK_CODE_POINT = 0x3005; const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063; const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3; const KATAKANA_SMALL_KA_CODE_POINT = 0x30f5; @@ -38,8 +37,9 @@ const JapaneseUtil = (() => { const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE = [0x2b740, 0x2b81f]; const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE = [0x2b820, 0x2ceaf]; const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE = [0x2ceb0, 0x2ebef]; + const CJK_COMPATIBILITY_IDEOGRAPHS_RANGE = [0xf900, 0xfaff]; const CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE = [0x2f800, 0x2fa1f]; - const CJK_UNIFIED_IDEOGRAPHS_RANGES = [ + const CJK_IDEOGRAPH_RANGES = [ CJK_UNIFIED_IDEOGRAPHS_RANGE, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_RANGE, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_RANGE, @@ -47,6 +47,7 @@ const JapaneseUtil = (() => { CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE, + CJK_COMPATIBILITY_IDEOGRAPHS_RANGE, CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE ]; @@ -55,7 +56,7 @@ const JapaneseUtil = (() => { HIRAGANA_RANGE, KATAKANA_RANGE, - ...CJK_UNIFIED_IDEOGRAPHS_RANGES, + ...CJK_IDEOGRAPH_RANGES, [0xff66, 0xff9f], // Halfwidth katakana @@ -204,7 
+205,7 @@ const JapaneseUtil = (() => { // Character code testing functions isCodePointKanji(codePoint) { - return isCodePointInRanges(codePoint, CJK_UNIFIED_IDEOGRAPHS_RANGES); + return isCodePointInRanges(codePoint, CJK_IDEOGRAPH_RANGES); } isCodePointKana(codePoint) { @@ -450,7 +451,7 @@ const JapaneseUtil = (() => { let isKanaPre = null; for (const c of term) { const codePoint = c.codePointAt(0); - const isKana = !(this.isCodePointKanji(codePoint) || codePoint === ITERATION_MARK_CODE_POINT); + const isKana = this.isCodePointKana(codePoint); if (isKana === isKanaPre) { groupPre.text += c; } else { diff --git a/test/test-japanese-util.js b/test/test-japanese-util.js index 48dc9af7..f3b53844 100644 --- a/test/test-japanese-util.js +++ b/test/test-japanese-util.js @@ -33,7 +33,8 @@ function testIsCodePointKanji() { const data = [ ['力方', true], ['\u53f1\u{20b9f}', true], - ['かたカタ々kata、。？,.?', false] + ['かたカタ々kata、。？,.?', false], + ['逸\ufa67', true] ]; for (const [characters, expected] of data) { @@ -65,7 +66,8 @@ function testIsCodePointJapanese() { const data = [ ['かたカタ力方々、。？', true], ['\u53f1\u{20b9f}', true], - ['kata,.?', false] + ['kata,.?', false], + ['逸\ufa67', true] ]; for (const [characters, expected] of data) { @@ -109,7 +111,8 @@ function testIsStringPartiallyJapanese() { ['kata,.?', false], ['かたカタ力方々、。？invalid', true], ['\u53f1\u{20b9f}invalid', true], - ['kata,.?かた', true] + ['kata,.?かた', true], + ['逸\ufa67', true] ]; for (const [string, expected] of data) { @@ -672,6 +675,21 @@ function testDistributeFurigana() { [ {text: 'シック', reading: 'シック・ビルしょうこうぐん'} ] + ], + // Kanji distribution tests + [ + ['逸らす', 'そらす'], + [ + {text: '逸', reading: 'そ'}, + {text: 'らす', reading: ''} + ] + ], + [ + ['\ufa67らす', 'そらす'], + [ + {text: '\ufa67', reading: 'そ'}, + {text: 'らす', reading: ''} + ] ] ];