Fix Japanese codepoint range issues (#1960)
* Add CJK_COMPATIBILITY_IDEOGRAPHS_RANGE
* Rename CJK_UNIFIED_IDEOGRAPHS_RANGES => CJK_IDEOGRAPH_RANGES
* Simplify isKana check
* Update tests
This commit is contained in:
parent
25fe3ba514
commit
d739ccd63f
@ -16,7 +16,6 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
const JapaneseUtil = (() => {
|
const JapaneseUtil = (() => {
|
||||||
const ITERATION_MARK_CODE_POINT = 0x3005;
|
|
||||||
const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063;
|
const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063;
|
||||||
const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3;
|
const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3;
|
||||||
const KATAKANA_SMALL_KA_CODE_POINT = 0x30f5;
|
const KATAKANA_SMALL_KA_CODE_POINT = 0x30f5;
|
||||||
@ -38,8 +37,9 @@ const JapaneseUtil = (() => {
|
|||||||
const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE = [0x2b740, 0x2b81f];
|
const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE = [0x2b740, 0x2b81f];
|
||||||
const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE = [0x2b820, 0x2ceaf];
|
const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE = [0x2b820, 0x2ceaf];
|
||||||
const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE = [0x2ceb0, 0x2ebef];
|
const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE = [0x2ceb0, 0x2ebef];
|
||||||
|
const CJK_COMPATIBILITY_IDEOGRAPHS_RANGE = [0xf900, 0xfaff];
|
||||||
const CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE = [0x2f800, 0x2fa1f];
|
const CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE = [0x2f800, 0x2fa1f];
|
||||||
const CJK_UNIFIED_IDEOGRAPHS_RANGES = [
|
const CJK_IDEOGRAPH_RANGES = [
|
||||||
CJK_UNIFIED_IDEOGRAPHS_RANGE,
|
CJK_UNIFIED_IDEOGRAPHS_RANGE,
|
||||||
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_RANGE,
|
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_RANGE,
|
||||||
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_RANGE,
|
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_RANGE,
|
||||||
@ -47,6 +47,7 @@ const JapaneseUtil = (() => {
|
|||||||
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE,
|
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE,
|
||||||
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE,
|
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE,
|
||||||
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE,
|
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE,
|
||||||
|
CJK_COMPATIBILITY_IDEOGRAPHS_RANGE,
|
||||||
CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE
|
CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE
|
||||||
];
|
];
|
||||||
|
|
||||||
@ -55,7 +56,7 @@ const JapaneseUtil = (() => {
|
|||||||
HIRAGANA_RANGE,
|
HIRAGANA_RANGE,
|
||||||
KATAKANA_RANGE,
|
KATAKANA_RANGE,
|
||||||
|
|
||||||
...CJK_UNIFIED_IDEOGRAPHS_RANGES,
|
...CJK_IDEOGRAPH_RANGES,
|
||||||
|
|
||||||
[0xff66, 0xff9f], // Halfwidth katakana
|
[0xff66, 0xff9f], // Halfwidth katakana
|
||||||
|
|
||||||
@ -204,7 +205,7 @@ const JapaneseUtil = (() => {
|
|||||||
// Character code testing functions
|
// Character code testing functions
|
||||||
|
|
||||||
isCodePointKanji(codePoint) {
|
isCodePointKanji(codePoint) {
|
||||||
return isCodePointInRanges(codePoint, CJK_UNIFIED_IDEOGRAPHS_RANGES);
|
return isCodePointInRanges(codePoint, CJK_IDEOGRAPH_RANGES);
|
||||||
}
|
}
|
||||||
|
|
||||||
isCodePointKana(codePoint) {
|
isCodePointKana(codePoint) {
|
||||||
@ -450,7 +451,7 @@ const JapaneseUtil = (() => {
|
|||||||
let isKanaPre = null;
|
let isKanaPre = null;
|
||||||
for (const c of term) {
|
for (const c of term) {
|
||||||
const codePoint = c.codePointAt(0);
|
const codePoint = c.codePointAt(0);
|
||||||
const isKana = !(this.isCodePointKanji(codePoint) || codePoint === ITERATION_MARK_CODE_POINT);
|
const isKana = this.isCodePointKana(codePoint);
|
||||||
if (isKana === isKanaPre) {
|
if (isKana === isKanaPre) {
|
||||||
groupPre.text += c;
|
groupPre.text += c;
|
||||||
} else {
|
} else {
|
||||||
|
@ -33,7 +33,8 @@ function testIsCodePointKanji() {
|
|||||||
const data = [
|
const data = [
|
||||||
['力方', true],
|
['力方', true],
|
||||||
['\u53f1\u{20b9f}', true],
|
['\u53f1\u{20b9f}', true],
|
||||||
['かたカタ々kata、。?,.?', false]
|
['かたカタ々kata、。?,.?', false],
|
||||||
|
['逸逸', true]
|
||||||
];
|
];
|
||||||
|
|
||||||
for (const [characters, expected] of data) {
|
for (const [characters, expected] of data) {
|
||||||
@ -65,7 +66,8 @@ function testIsCodePointJapanese() {
|
|||||||
const data = [
|
const data = [
|
||||||
['かたカタ力方々、。?', true],
|
['かたカタ力方々、。?', true],
|
||||||
['\u53f1\u{20b9f}', true],
|
['\u53f1\u{20b9f}', true],
|
||||||
['kata,.?', false]
|
['kata,.?', false],
|
||||||
|
['逸逸', true]
|
||||||
];
|
];
|
||||||
|
|
||||||
for (const [characters, expected] of data) {
|
for (const [characters, expected] of data) {
|
||||||
@ -109,7 +111,8 @@ function testIsStringPartiallyJapanese() {
|
|||||||
['kata,.?', false],
|
['kata,.?', false],
|
||||||
['かたカタ力方々、。?invalid', true],
|
['かたカタ力方々、。?invalid', true],
|
||||||
['\u53f1\u{20b9f}invalid', true],
|
['\u53f1\u{20b9f}invalid', true],
|
||||||
['kata,.?かた', true]
|
['kata,.?かた', true],
|
||||||
|
['逸逸', true]
|
||||||
];
|
];
|
||||||
|
|
||||||
for (const [string, expected] of data) {
|
for (const [string, expected] of data) {
|
||||||
@ -672,6 +675,21 @@ function testDistributeFurigana() {
|
|||||||
[
|
[
|
||||||
{text: 'シック', reading: 'シック・ビルしょうこうぐん'}
|
{text: 'シック', reading: 'シック・ビルしょうこうぐん'}
|
||||||
]
|
]
|
||||||
|
],
|
||||||
|
// Kanji distribution tests
|
||||||
|
[
|
||||||
|
['逸らす', 'そらす'],
|
||||||
|
[
|
||||||
|
{text: '逸', reading: 'そ'},
|
||||||
|
{text: 'らす', reading: ''}
|
||||||
|
]
|
||||||
|
],
|
||||||
|
[
|
||||||
|
['逸らす', 'そらす'],
|
||||||
|
[
|
||||||
|
{text: '逸', reading: 'そ'},
|
||||||
|
{text: 'らす', reading: ''}
|
||||||
|
]
|
||||||
]
|
]
|
||||||
];
|
];
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user