/*
 * Copyright (C) 2016-2020  Alex Yatskov
 * Author: Alex Yatskov
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

/**
 * Maps each half-width katakana character (U+FF66..U+FF9D) to a three
 * character string: [plain, with dakuten, with handakuten] in full width.
 * A '-' marks a combination that does not exist.
 *
 * Keys are written as escapes so they cannot be silently folded to their
 * full-width equivalents by Unicode normalization of this source file.
 */
const JP_HALFWIDTH_KATAKANA_MAPPING = new Map([
    ['\uff66', 'ヲヺ-'],
    ['\uff67', 'ァ--'],
    ['\uff68', 'ィ--'],
    ['\uff69', 'ゥ--'],
    ['\uff6a', 'ェ--'],
    ['\uff6b', 'ォ--'],
    ['\uff6c', 'ャ--'],
    ['\uff6d', 'ュ--'],
    ['\uff6e', 'ョ--'],
    ['\uff6f', 'ッ--'],
    ['\uff70', 'ー--'],
    ['\uff71', 'ア--'],
    ['\uff72', 'イ--'],
    ['\uff73', 'ウヴ-'],
    ['\uff74', 'エ--'],
    ['\uff75', 'オ--'],
    ['\uff76', 'カガ-'],
    ['\uff77', 'キギ-'],
    ['\uff78', 'クグ-'],
    ['\uff79', 'ケゲ-'],
    ['\uff7a', 'コゴ-'],
    ['\uff7b', 'サザ-'],
    ['\uff7c', 'シジ-'],
    ['\uff7d', 'スズ-'],
    ['\uff7e', 'セゼ-'],
    ['\uff7f', 'ソゾ-'],
    ['\uff80', 'タダ-'],
    ['\uff81', 'チヂ-'],
    ['\uff82', 'ツヅ-'],
    ['\uff83', 'テデ-'],
    ['\uff84', 'トド-'],
    ['\uff85', 'ナ--'],
    ['\uff86', 'ニ--'],
    ['\uff87', 'ヌ--'],
    ['\uff88', 'ネ--'],
    ['\uff89', 'ノ--'],
    ['\uff8a', 'ハバパ'],
    ['\uff8b', 'ヒビピ'],
    ['\uff8c', 'フブプ'],
    ['\uff8d', 'ヘベペ'],
    ['\uff8e', 'ホボポ'],
    ['\uff8f', 'マ--'],
    ['\uff90', 'ミ--'],
    ['\uff91', 'ム--'],
    ['\uff92', 'メ--'],
    ['\uff93', 'モ--'],
    ['\uff94', 'ヤ--'],
    ['\uff95', 'ユ--'],
    ['\uff96', 'ヨ--'],
    ['\uff97', 'ラ--'],
    ['\uff98', 'リ--'],
    ['\uff99', 'ル--'],
    ['\uff9a', 'レ--'],
    ['\uff9b', 'ロ--'],
    ['\uff9c', 'ワ--'],
    ['\uff9d', 'ン--']
]);

/**
 * Tests whether the first UTF-16 code unit of `c` lies in one of the two
 * kanji ranges handled here: U+4E00..U+9FAF or U+3400..U+4DBF.
 * @param {string} c Character to test (only the first code unit is read).
 * @returns {boolean}
 */
function jpIsKanji(c) {
    const code = c.charCodeAt(0);
    return (
        (code >= 0x4e00 && code < 0x9fb0) ||
        (code >= 0x3400 && code < 0x4dc0)
    );
}

/**
 * Tests whether the first UTF-16 code unit of `c` is hiragana or katakana.
 * @param {string} c Character to test (only the first code unit is read).
 * @returns {boolean}
 */
function jpIsKana(c) {
    const code = c.charCodeAt(0);
    return (
        (code >= 0x3041 && code <= 0x3096) || // hiragana
        (code >= 0x30a1 && code <= 0x30fc) // katakana
    );
}

/**
 * Tests whether the first UTF-16 code unit of `c` is a full-width roman
 * letter (either case) or a full-width digit.
 * @param {string} c Character to test (only the first code unit is read).
 * @returns {boolean}
 */
function jpIsCharFullWidth(c) {
    const code = c.charCodeAt(0);
    return (
        (code >= 0xff21 && code <= 0xff3a) || // full width upper case roman letters
        (code >= 0xff41 && code <= 0xff5a) || // full width lower case roman letters
        (code >= 0xff10 && code <= 0xff19) // full width numbers
    );
}

/**
 * Tests whether the first UTF-16 code unit of `c` is in U+FF66..U+FF9F
 * (half-width katakana, including the two half-width voicing marks).
 * @param {string} c Character to test (only the first code unit is read).
 * @returns {boolean}
 */
function jpIsKanaHalfWidth(c) {
    const code = c.charCodeAt(0);
    return (code >= 0xff66 && code <= 0xff9f); // half width katakana
}

/**
 * Tests whether `c` is kanji, kana, a full-width letter/digit or
 * half-width katakana, as defined by the predicates above.
 * @param {string} c Character to test.
 * @returns {boolean}
 */
function jpIsCharacterJapanese(c) {
    return (
        jpIsKanji(c) ||
        jpIsKana(c) ||
        jpIsCharFullWidth(c) ||
        jpIsKanaHalfWidth(c)
    );
}

/**
 * Tests whether at least one character of `text` satisfies
 * jpIsCharacterJapanese.
 * @param {string} text Text to scan.
 * @returns {boolean} false for an empty string.
 */
function jpIsAnyCharacterJapanese(text) {
    for (const c of text) {
        if (jpIsCharacterJapanese(c)) {
            return true;
        }
    }
    return false;
}

/**
 * Converts every character that wanakana classifies as katakana to
 * hiragana; all other characters are copied unchanged.
 * Requires the global `wanakana` library.
 * @param {string} text
 * @returns {string}
 */
function jpKatakanaToHiragana(text) {
    let result = '';
    for (const c of text) {
        if (wanakana.isKatakana(c)) {
            result += wanakana.toHiragana(c);
        } else {
            result += c;
        }
    }
    return result;
}

/**
 * Converts every character that wanakana classifies as hiragana to
 * katakana; all other characters are copied unchanged.
 * Requires the global `wanakana` library.
 * @param {string} text
 * @returns {string}
 */
function jpHiraganaToKatakana(text) {
    let result = '';
    for (const c of text) {
        if (wanakana.isHiragana(c)) {
            result += wanakana.toKatakana(c);
        } else {
            result += c;
        }
    }
    return result;
}

/**
 * Thin wrapper around wanakana.toRomaji.
 * @param {string} text
 * @returns {string}
 */
function jpToRomaji(text) {
    return wanakana.toRomaji(text);
}

/**
 * Converts a reading fragment according to `readingMode`.
 *
 * - 'hiragana' / 'katakana': converts the reading ('' when it is falsy).
 * - 'romaji': romanizes the reading; when there is no reading and the
 *   expression fragment starts with kana, romanizes the expression instead.
 * - anything else: returns the reading unchanged.
 *
 * @param {string} expressionFragment Text the reading belongs to.
 * @param {string} [readingFragment] Reading of the fragment, may be empty/undefined.
 * @param {string} readingMode 'hiragana', 'katakana', 'romaji' or any other value.
 * @returns {string|undefined} The converted reading; may be the original falsy value.
 */
function jpConvertReading(expressionFragment, readingFragment, readingMode) {
    switch (readingMode) {
        case 'hiragana':
            return jpKatakanaToHiragana(readingFragment || '');
        case 'katakana':
            return jpHiraganaToKatakana(readingFragment || '');
        case 'romaji':
            if (readingFragment) {
                return jpToRomaji(readingFragment);
            } else {
                // jpIsKana only inspects the first code unit of the expression.
                if (jpIsKana(expressionFragment)) {
                    return jpToRomaji(expressionFragment);
                }
            }
            return readingFragment;
        default:
            return readingFragment;
    }
}

/**
 * Splits `expression` into runs of kanji and non-kanji characters and
 * assigns to each kanji run the part of `reading` that belongs to it.
 *
 * @param {string} expression Text to annotate.
 * @param {string} reading Reading of the whole expression.
 * @returns {Array<{text: string, furigana?: string}>} One segment per run.
 *   Kana runs carry no `furigana`. When the reading is empty, cannot be
 *   matched, or can be matched in more than one way, a single segment
 *   `{furigana: reading, text: expression}` is returned instead.
 */
function jpDistributeFurigana(expression, reading) {
    const fallback = [{furigana: reading, text: expression}];
    if (!reading) {
        return fallback;
    }

    let isAmbiguous = false;

    // Recursively matches `reading` against `groups`. Returns an array of
    // segments on success and a falsy value when no match exists.
    const segmentize = (reading, groups) => {
        if (groups.length === 0 || isAmbiguous) {
            return [];
        }

        const group = groups[0];
        if (group.mode === 'kana') {
            // A kana run must appear literally (ignoring the kana script)
            // at the start of the remaining reading.
            if (jpKatakanaToHiragana(reading).startsWith(jpKatakanaToHiragana(group.text))) {
                const readingLeft = reading.substring(group.text.length);
                // slice() rather than splice(): do not mutate the caller's array.
                const segs = segmentize(readingLeft, groups.slice(1));
                if (segs) {
                    return [{text: group.text}].concat(segs);
                }
            }
            return null;
        } else {
            let foundSegments = null;
            // Try the longest candidate reading first; a kanji run consumes
            // at least as many reading characters as it has characters.
            for (let i = reading.length; i >= group.text.length; --i) {
                const readingUsed = reading.substring(0, i);
                const readingLeft = reading.substring(i);
                const segs = segmentize(readingLeft, groups.slice(1));
                if (segs) {
                    if (foundSegments !== null) {
                        // more than one way to segmentize the tail, mark as ambiguous
                        isAmbiguous = true;
                        return null;
                    }
                    foundSegments = [{text: group.text, furigana: readingUsed}].concat(segs);
                }
                // there is only one way to segmentize the last non-kana group
                if (groups.length === 1) {
                    break;
                }
            }
            return foundSegments;
        }
    };

    const groups = [];
    let modePrev = null;
    for (const c of expression) {
        const modeCurr = jpIsKanji(c) || c.charCodeAt(0) === 0x3005 /* noma */ ? 'kanji' : 'kana';
        if (modeCurr === modePrev) {
            groups[groups.length - 1].text += c;
        } else {
            groups.push({mode: modeCurr, text: c});
            modePrev = modeCurr;
        }
    }

    const segments = segmentize(reading, groups);
    if (segments && !isAmbiguous) {
        return segments;
    }
    return fallback;
}

/**
 * Distributes furigana over the part of `source` that it shares (ignoring
 * the kana script) with `expression`, and appends the rest of `source` as a
 * plain segment.
 *
 * @param {string} expression Expression that `reading` belongs to.
 * @param {string} reading Reading of `expression`.
 * @param {string} source Text to annotate; compared against `expression`
 *   character by character from the start.
 * @returns {Array<{text: string, furigana?: string}>}
 */
function jpDistributeFuriganaInflected(expression, reading, source) {
    const output = [];

    // Length of the common prefix of source and expression.
    let stemLength = 0;
    const shortest = Math.min(source.length, expression.length);
    const sourceHiragana = jpKatakanaToHiragana(source);
    const expressionHiragana = jpKatakanaToHiragana(expression);
    while (stemLength < shortest && sourceHiragana[stemLength] === expressionHiragana[stemLength]) {
        ++stemLength;
    }
    const offset = source.length - stemLength;

    const stemExpression = source.substring(0, stemLength);
    // When source is entirely stem, use the full reading; otherwise drop as
    // many trailing reading characters as the expression has past the stem.
    const stemReading = reading.substring(
        0,
        offset === 0 ? reading.length : reading.length - expression.length + stemLength
    );
    for (const segment of jpDistributeFurigana(stemExpression, stemReading)) {
        output.push(segment);
    }

    if (stemLength !== source.length) {
        output.push({text: source.substring(stemLength)});
    }

    return output;
}

/**
 * Replaces half-width katakana with full-width katakana. A half-width
 * dakuten (U+FF9E) or handakuten (U+FF9F) following a character is merged
 * into it when the voiced form exists; otherwise the mark is left in place.
 *
 * @param {string} text Input text.
 * @param {number[]} [sourceMapping] Optional array, modified in place: each
 *   time two input characters merge into one output character, the entry
 *   after the current output position is removed and added to the entry at
 *   that position.
 * @returns {string}
 */
function jpConvertHalfWidthKanaToFullWidth(text, sourceMapping) {
    let result = '';
    const ii = text.length;
    const hasSourceMapping = Array.isArray(sourceMapping);

    for (let i = 0; i < ii; ++i) {
        const c = text[i];
        const mapping = JP_HALFWIDTH_KATAKANA_MAPPING.get(c);
        if (typeof mapping !== 'string') {
            result += c;
            continue;
        }

        let index = 0;
        // charCodeAt yields NaN past the end, which matches no case.
        switch (text.charCodeAt(i + 1)) {
            case 0xff9e: // dakuten
                index = 1;
                break;
            case 0xff9f: // handakuten
                index = 2;
                break;
        }

        let c2 = mapping[index];
        if (index > 0) {
            if (c2 === '-') { // invalid
                index = 0;
                c2 = mapping[0];
            } else {
                // consume the voicing mark as well
                ++i;
            }
        }

        if (hasSourceMapping && index > 0) {
            index = result.length;
            const v = sourceMapping.splice(index + 1, 1)[0];
            sourceMapping[index] += v;
        }
        result += c2;
    }

    return result;
}

/**
 * Replaces ASCII digits '0'..'9' with their full-width forms
 * (U+FF10..U+FF19); every other character is copied unchanged.
 * @param {string} text
 * @returns {string}
 */
function jpConvertNumericTofullWidth(text) {
    let result = '';
    for (let i = 0, ii = text.length; i < ii; ++i) {
        let c = text.charCodeAt(i);
        if (c >= 0x30 && c <= 0x39) { // ['0', '9']
            c += 0xff10 - 0x30; // 0xff10 = '0' full width
            result += String.fromCharCode(c);
        } else {
            result += text[i];
        }
    }
    return result;
}

/**
 * Converts runs of roman letters (ASCII or full width, either case) to
 * hiragana via jpToHiragana; all other characters are copied unchanged.
 *
 * @param {string} text Input text.
 * @param {number[]} [sourceMapping] Optional array, modified in place by
 *   jpToHiragana. When omitted, no mapping is produced.
 * @returns {string}
 */
function jpConvertAlphabeticToKana(text, sourceMapping) {
    let part = '';
    let result = '';
    const ii = text.length;

    // NOTE(review): a mapping whose length already equals the text length is
    // reset to all 1s, discarding any values the caller put there, while a
    // mapping of any other length is left untouched. This looks inverted
    // (`!==` may have been intended) but is preserved here - confirm against
    // callers before changing.
    if (Array.isArray(sourceMapping) && sourceMapping.length === ii) {
        sourceMapping.fill(1);
    }

    for (let i = 0; i < ii; ++i) {
        // Normalize any supported letter to an offset 0..25.
        let c = text.charCodeAt(i);
        if (c >= 0x41 && c <= 0x5a) { // ['A', 'Z']
            c -= 0x41;
        } else if (c >= 0x61 && c <= 0x7a) { // ['a', 'z']
            c -= 0x61;
        } else if (c >= 0xff21 && c <= 0xff3a) { // ['A', 'Z'] full width
            c -= 0xff21;
        } else if (c >= 0xff41 && c <= 0xff5a) { // ['a', 'z'] full width
            c -= 0xff41;
        } else {
            // Not a letter: flush the pending run, then copy the character.
            if (part.length > 0) {
                result += jpToHiragana(part, sourceMapping, result.length);
                part = '';
            }
            result += text[i];
            continue;
        }
        part += String.fromCharCode(c + 0x61); // + 'a'
    }

    if (part.length > 0) {
        result += jpToHiragana(part, sourceMapping, result.length);
    }
    return result;
}

/**
 * Converts `text` to hiragana with wanakana.toHiragana and, when a mapping
 * array is supplied, updates it in place so that it has one entry per
 * output character.
 *
 * @param {string} text Romanized text to convert.
 * @param {number[]} [sourceMapping] Optional array, modified in place.
 *   Entries of input characters that combine into one output group are
 *   summed into the first entry; extra output characters of a group get a
 *   0 entry.
 * @param {number} [sourceMappingStart] Index in `sourceMapping` that
 *   corresponds to the first character of `text`; defaults to 0 when not a
 *   number.
 * @returns {string} The converted text.
 */
function jpToHiragana(text, sourceMapping, sourceMappingStart) {
    const result = wanakana.toHiragana(text);

    // Generate source mapping
    if (Array.isArray(sourceMapping)) {
        if (typeof sourceMappingStart !== 'number') {
            sourceMappingStart = 0;
        }
        let i = 0;
        let resultPos = 0;
        const ii = text.length;
        while (i < ii) {
            // Find smallest matching substring
            let iNext = i + 1;
            let resultPosNext = result.length;
            while (iNext < ii) {
                const t = wanakana.toHiragana(text.substring(0, iNext));
                if (t === result.substring(0, t.length)) {
                    resultPosNext = t.length;
                    break;
                }
                ++iNext;
            }

            // Merge characters
            const removals = iNext - i - 1;
            if (removals > 0) {
                let sum = 0;
                const vs = sourceMapping.splice(sourceMappingStart + 1, removals);
                for (const v of vs) {
                    sum += v;
                }
                sourceMapping[sourceMappingStart] += sum;
            }
            ++sourceMappingStart;

            // Empty elements
            const additions = resultPosNext - resultPos - 1;
            for (let j = 0; j < additions; ++j) {
                sourceMapping.splice(sourceMappingStart, 0, 0);
                ++sourceMappingStart;
            }

            i = iNext;
            resultPos = resultPosNext;
        }
    }

    return result;
}