/*
 * Copyright (C) 2016-2020 Alex Yatskov <alex@foosoft.net>
 * Author: Alex Yatskov <alex@foosoft.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Tests whether a character is treated as kanji.
 *
 * Only the first UTF-16 code unit of `c` is inspected. It counts as kanji
 * when it lies in one of two half-open ranges: [0x3400, 0x4dc0) or
 * [0x4e00, 0x9fb0).
 *
 * @param {string} c Character to test.
 * @returns {boolean} True when the first code unit is inside either range.
 */
function jpIsKanji(c) {
    const unit = c.charCodeAt(0);
    if (unit >= 0x3400 && unit < 0x4dc0) {
        return true;
    }
    return unit >= 0x4e00 && unit < 0x9fb0;
}
|
|
|
|
|
|
|
|
/**
 * Tests whether the input is kana by delegating to `wanakana.isKana`.
 *
 * @param {string} c Text to test; passed to WanaKana unchanged.
 * @returns {*} Whatever `wanakana.isKana` returns for `c`.
 */
function jpIsKana(c) {
    const isKana = wanakana.isKana(c);
    return isKana;
}
|
|
|
|
|
2019-11-13 13:51:47 +02:00
|
|
|
/**
 * Reports whether a string contains at least one character that is
 * recognised as kanji (`jpIsKanji`) or kana (`jpIsKana`).
 *
 * The string is examined one code point at a time and the scan stops at
 * the first match.
 *
 * @param {string} text Text to scan.
 * @returns {boolean} True if any character matches, false otherwise
 *     (including for an empty string).
 */
function jpIsJapaneseText(text) {
    return [...text].some((ch) => jpIsKanji(ch) || jpIsKana(ch));
}
|
|
|
|
|
2017-07-18 23:07:46 -07:00
|
|
|
/**
 * Converts the katakana in a string to hiragana, leaving everything else
 * untouched.
 *
 * The string is processed one code point at a time: a character for which
 * `wanakana.isKatakana` is truthy is replaced by `wanakana.toHiragana` of
 * that character; any other character is copied as-is.
 *
 * @param {string} text Text to convert.
 * @returns {string} The converted text.
 */
function jpKatakanaToHiragana(text) {
    let converted = '';
    for (const ch of text) {
        converted += wanakana.isKatakana(ch) ? wanakana.toHiragana(ch) : ch;
    }
    return converted;
}
|
2017-08-20 16:07:55 +02:00
|
|
|
|
2019-11-13 20:24:11 +02:00
|
|
|
/**
 * Converts the hiragana in a string to katakana, leaving everything else
 * untouched.
 *
 * The string is processed one code point at a time: a character for which
 * `wanakana.isHiragana` is truthy is replaced by `wanakana.toKatakana` of
 * that character; any other character is copied as-is.
 *
 * @param {string} text Text to convert.
 * @returns {string} The converted text.
 */
function jpHiraganaToKatakana(text) {
    let converted = '';
    for (const ch of text) {
        converted += wanakana.isHiragana(ch) ? wanakana.toKatakana(ch) : ch;
    }
    return converted;
}
|
|
|
|
|
|
|
|
/**
 * Romanises text by delegating to `wanakana.toRomaji`.
 *
 * @param {string} text Text to romanise; passed to WanaKana unchanged.
 * @returns {*} Whatever `wanakana.toRomaji` returns for `text`.
 */
function jpToRomaji(text) {
    const romaji = wanakana.toRomaji(text);
    return romaji;
}
|
|
|
|
|
|
|
|
/**
 * Converts a reading fragment to the requested script.
 *
 * - 'hiragana': the reading (or '' when it is falsy) with katakana
 *   converted to hiragana.
 * - 'katakana': the reading (or '' when it is falsy) with hiragana
 *   converted to katakana.
 * - 'romaji': the romanised reading when there is one; otherwise the
 *   romanised expression if `jpIsKana` accepts it; otherwise the (falsy)
 *   reading is returned unchanged.
 * - any other mode: the reading is returned unchanged.
 *
 * @param {string} expressionFragment Expression text the reading belongs to.
 * @param {string} readingFragment Reading text; may be empty or missing.
 * @param {string} readingMode One of 'hiragana', 'katakana', 'romaji'.
 * @returns {*} The converted reading, or `readingFragment` itself when no
 *     conversion applies.
 */
function jpConvertReading(expressionFragment, readingFragment, readingMode) {
    if (readingMode === 'hiragana') {
        return jpKatakanaToHiragana(readingFragment || '');
    }

    if (readingMode === 'katakana') {
        return jpHiraganaToKatakana(readingFragment || '');
    }

    if (readingMode === 'romaji') {
        if (readingFragment) {
            return jpToRomaji(readingFragment);
        }
        // No reading available: a kana-only expression is its own reading.
        return jpIsKana(expressionFragment) ? jpToRomaji(expressionFragment) : readingFragment;
    }

    return readingFragment;
}
|
|
|
|
|
2017-08-26 11:57:34 -07:00
|
|
|
/**
 * Splits an expression into runs of kanji and non-kanji text and assigns
 * to each kanji run the part of the reading that belongs to it.
 *
 * The expression is first cut into groups: consecutive characters that are
 * kanji (per `jpIsKanji`) or the iteration mark U+3005 form a 'kanji'
 * group, everything else forms a 'kana' group. The reading is then matched
 * against the groups recursively:
 *   - a kana group must appear at the start of the remaining reading
 *     (compared after converting katakana to hiragana on both sides);
 *   - a kanji group may take any prefix of the remaining reading that is
 *     at least as long as the group itself.
 * If exactly one assignment fits, it is returned. If none fits, or more
 * than one fits, a single segment covering the whole expression is
 * returned instead.
 *
 * NOTE(review): once all groups are consumed, any reading that is still
 * left over is dropped without failing the match. Callers that pass a
 * reading longer than the expression may depend on this - confirm before
 * tightening it.
 *
 * @param {string} expression Text to annotate.
 * @param {string} reading Reading of the whole expression.
 * @returns {Array<{text: string, furigana?: string}>} Segments in order.
 *     Kana segments carry no `furigana` property. The fallback result is
 *     `[{furigana: reading, text: expression}]`.
 */
function jpDistributeFurigana(expression, reading) {
    const fallback = [{furigana: reading, text: expression}];
    if (!reading) {
        return fallback;
    }

    // Shared across all recursive calls; once set, the search is abandoned.
    let isAmbiguous = false;

    // Returns the segments for `groups` matched against `reading`, or a
    // falsy value when they cannot be matched.
    const segmentize = (reading, groups) => {
        if (groups.length === 0 || isAmbiguous) {
            return [];
        }

        const group = groups[0];
        if (group.mode === 'kana') {
            if (jpKatakanaToHiragana(reading).startsWith(jpKatakanaToHiragana(group.text))) {
                const readingLeft = reading.substring(group.text.length);
                // slice, not splice: the caller's array must not be modified.
                const segs = segmentize(readingLeft, groups.slice(1));
                if (segs) {
                    return [{text: group.text}].concat(segs);
                }
            }
            return null;
        } else {
            let foundSegments = null;
            // Try the longest candidate reading first; every kanji takes at
            // least one character of reading.
            for (let i = reading.length; i >= group.text.length; --i) {
                const readingUsed = reading.substring(0, i);
                const readingLeft = reading.substring(i);
                const segs = segmentize(readingLeft, groups.slice(1));
                if (segs) {
                    if (foundSegments !== null) {
                        // more than one way to segmentize the tail, mark as ambiguous
                        isAmbiguous = true;
                        return null;
                    }
                    foundSegments = [{text: group.text, furigana: readingUsed}].concat(segs);
                }
                // there is only one way to segmentize the last non-kana group
                if (groups.length === 1) {
                    break;
                }
            }
            return foundSegments;
        }
    };

    // Partition the expression into alternating kanji / kana groups.
    const groups = [];
    let modePrev = null;
    for (const c of expression) {
        const modeCurr = jpIsKanji(c) || c.charCodeAt(0) === 0x3005 /* noma */ ? 'kanji' : 'kana';
        if (modeCurr === modePrev) {
            groups[groups.length - 1].text += c;
        } else {
            groups.push({mode: modeCurr, text: c});
            modePrev = modeCurr;
        }
    }

    const segments = segmentize(reading, groups);
    if (segments && !isAmbiguous) {
        return segments;
    }
    return fallback;
}
|
2019-11-03 05:08:57 +02:00
|
|
|
|
|
|
|
/**
 * Distributes furigana over an inflected form of a dictionary expression.
 *
 * The stem is the longest common prefix of `source` and `expression`,
 * compared after converting katakana to hiragana on both. Furigana is
 * distributed over the stem with `jpDistributeFurigana`; whatever follows
 * the stem in `source` is appended as one segment without furigana.
 *
 * The reading handed to `jpDistributeFurigana` is the whole reading when
 * the stem covers all of `source`; otherwise it is the reading shortened
 * by the number of expression characters that lie beyond the stem.
 *
 * @param {string} expression Dictionary form of the term.
 * @param {string} reading Reading of the dictionary form.
 * @param {string} source Text as it appeared (the inflected form).
 * @returns {Array<{text: string, furigana?: string}>} Segments for the
 *     stem followed, when `source` extends past the stem, by a `{text}`
 *     segment holding the remainder of `source`.
 */
function jpDistributeFuriganaInflected(expression, reading, source) {
    const limit = Math.min(source.length, expression.length);
    const normalizedSource = jpKatakanaToHiragana(source);
    const normalizedExpression = jpKatakanaToHiragana(expression);

    // Length of the prefix shared by source and expression.
    let stemLength = 0;
    while (stemLength < limit && normalizedSource[stemLength] === normalizedExpression[stemLength]) {
        ++stemLength;
    }

    const isSourceFullyMatched = stemLength === source.length;
    const stemReadingEnd = isSourceFullyMatched ?
        reading.length :
        reading.length - expression.length + stemLength;

    const segments = [
        ...jpDistributeFurigana(
            source.substring(0, stemLength),
            reading.substring(0, stemReadingEnd)
        )
    ];

    if (!isSourceFullyMatched) {
        segments.push({text: source.substring(stemLength)});
    }

    return segments;
}
|