Fix Japanese codepoint range issues (#1960)

* Add CJK_COMPATIBILITY_IDEOGRAPHS_RANGE

* Rename CJK_UNIFIED_IDEOGRAPHS_RANGES => CJK_IDEOGRAPH_RANGES

* Simplify isKana check

* Update tests
This commit is contained in:
toasted-nutbread 2021-09-26 13:29:55 -04:00 committed by GitHub
parent 25fe3ba514
commit d739ccd63f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 27 additions and 8 deletions

View File

@ -16,7 +16,6 @@
*/ */
const JapaneseUtil = (() => { const JapaneseUtil = (() => {
const ITERATION_MARK_CODE_POINT = 0x3005;
const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063; const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063;
const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3; const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3;
const KATAKANA_SMALL_KA_CODE_POINT = 0x30f5; const KATAKANA_SMALL_KA_CODE_POINT = 0x30f5;
@ -38,8 +37,9 @@ const JapaneseUtil = (() => {
const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE = [0x2b740, 0x2b81f]; const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE = [0x2b740, 0x2b81f];
const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE = [0x2b820, 0x2ceaf]; const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE = [0x2b820, 0x2ceaf];
const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE = [0x2ceb0, 0x2ebef]; const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE = [0x2ceb0, 0x2ebef];
const CJK_COMPATIBILITY_IDEOGRAPHS_RANGE = [0xf900, 0xfaff];
const CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE = [0x2f800, 0x2fa1f]; const CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE = [0x2f800, 0x2fa1f];
const CJK_UNIFIED_IDEOGRAPHS_RANGES = [ const CJK_IDEOGRAPH_RANGES = [
CJK_UNIFIED_IDEOGRAPHS_RANGE, CJK_UNIFIED_IDEOGRAPHS_RANGE,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_RANGE, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_RANGE,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_RANGE, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_RANGE,
@ -47,6 +47,7 @@ const JapaneseUtil = (() => {
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE,
CJK_COMPATIBILITY_IDEOGRAPHS_RANGE,
CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE
]; ];
@ -55,7 +56,7 @@ const JapaneseUtil = (() => {
HIRAGANA_RANGE, HIRAGANA_RANGE,
KATAKANA_RANGE, KATAKANA_RANGE,
...CJK_UNIFIED_IDEOGRAPHS_RANGES, ...CJK_IDEOGRAPH_RANGES,
[0xff66, 0xff9f], // Halfwidth katakana [0xff66, 0xff9f], // Halfwidth katakana
@ -204,7 +205,7 @@ const JapaneseUtil = (() => {
// Character code testing functions // Character code testing functions
isCodePointKanji(codePoint) { isCodePointKanji(codePoint) {
return isCodePointInRanges(codePoint, CJK_UNIFIED_IDEOGRAPHS_RANGES); return isCodePointInRanges(codePoint, CJK_IDEOGRAPH_RANGES);
} }
isCodePointKana(codePoint) { isCodePointKana(codePoint) {
@ -450,7 +451,7 @@ const JapaneseUtil = (() => {
let isKanaPre = null; let isKanaPre = null;
for (const c of term) { for (const c of term) {
const codePoint = c.codePointAt(0); const codePoint = c.codePointAt(0);
const isKana = !(this.isCodePointKanji(codePoint) || codePoint === ITERATION_MARK_CODE_POINT); const isKana = this.isCodePointKana(codePoint);
if (isKana === isKanaPre) { if (isKana === isKanaPre) {
groupPre.text += c; groupPre.text += c;
} else { } else {

View File

@ -33,7 +33,8 @@ function testIsCodePointKanji() {
const data = [ const data = [
['力方', true], ['力方', true],
['\u53f1\u{20b9f}', true], ['\u53f1\u{20b9f}', true],
['かたカタ々kata、。,.?', false] ['かたカタ々kata、。,.?', false],
['逸逸', true]
]; ];
for (const [characters, expected] of data) { for (const [characters, expected] of data) {
@ -65,7 +66,8 @@ function testIsCodePointJapanese() {
const data = [ const data = [
['かたカタ力方々、。?', true], ['かたカタ力方々、。?', true],
['\u53f1\u{20b9f}', true], ['\u53f1\u{20b9f}', true],
['kata,.?', false] ['kata,.?', false],
['逸逸', true]
]; ];
for (const [characters, expected] of data) { for (const [characters, expected] of data) {
@ -109,7 +111,8 @@ function testIsStringPartiallyJapanese() {
['kata,.?', false], ['kata,.?', false],
['かたカタ力方々、。invalid', true], ['かたカタ力方々、。invalid', true],
['\u53f1\u{20b9f}invalid', true], ['\u53f1\u{20b9f}invalid', true],
['kata,.?かた', true] ['kata,.?かた', true],
['逸逸', true]
]; ];
for (const [string, expected] of data) { for (const [string, expected] of data) {
@ -672,6 +675,21 @@ function testDistributeFurigana() {
[ [
{text: 'シック', reading: 'シック・ビルしょうこうぐん'} {text: 'シック', reading: 'シック・ビルしょうこうぐん'}
] ]
],
// Kanji distribution tests
[
['逸らす', 'そらす'],
[
{text: '逸', reading: 'そ'},
{text: 'らす', reading: ''}
]
],
[
['逸らす', 'そらす'],
[
{text: '逸', reading: 'そ'},
{text: 'らす', reading: ''}
]
] ]
]; ];