add mecab support
This commit is contained in:
parent
3881457e4e
commit
41020289ab
@ -21,6 +21,7 @@
|
||||
<script src="/mixed/js/extension.js"></script>
|
||||
|
||||
<script src="/bg/js/anki.js"></script>
|
||||
<script src="/bg/js/mecab.js"></script>
|
||||
<script src="/bg/js/api.js"></script>
|
||||
<script src="/bg/js/audio.js"></script>
|
||||
<script src="/bg/js/backend-api-forwarder.js"></script>
|
||||
|
@ -91,25 +91,10 @@ async function apiTextParse(text, optionsContext) {
|
||||
definitions = dictTermsSort(definitions);
|
||||
const {expression, reading} = definitions[0];
|
||||
const source = text.slice(0, sourceLength);
|
||||
|
||||
let stemLength = 0;
|
||||
const shortest = Math.min(source.length, expression.length);
|
||||
while (stemLength < shortest && source[stemLength] === expression[stemLength]) {
|
||||
++stemLength;
|
||||
for (const {text, furigana} of jpDistributeFuriganaInflected(expression, reading, source)) {
|
||||
// can't use 'furigana' in templates
|
||||
term.push({text, reading: furigana});
|
||||
}
|
||||
const offset = source.length - stemLength;
|
||||
|
||||
for (const {text, furigana} of jpDistributeFurigana(
|
||||
source.slice(0, offset === 0 ? source.length : source.length - offset),
|
||||
reading.slice(0, offset === 0 ? reading.length : reading.length - expression.length + stemLength)
|
||||
)) {
|
||||
term.push({text, reading: furigana || ''});
|
||||
}
|
||||
|
||||
if (stemLength !== source.length) {
|
||||
term.push({text: source.slice(stemLength)});
|
||||
}
|
||||
|
||||
text = text.slice(source.length);
|
||||
} else {
|
||||
term.push({text: text[0]});
|
||||
@ -120,6 +105,33 @@ async function apiTextParse(text, optionsContext) {
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
 * Parses `text` through the backend's `mecab` client and converts every parsed
 * token into a list of `{text, reading}` segments.
 *
 * Each line returned by `mecab.parseText` contributes one entry per token,
 * followed by a single `[{text: '\n'}]` entry marking the end of that line.
 *
 * @param text - text handed to `mecab.parseText`.
 * @param optionsContext - forwarded to `apiOptionsGet`.
 * @returns a promise for the array of segment lists.
 */
async function apiTextParseMecab(text, optionsContext) {
    // The resolved options are not used below; the call is still awaited first.
    await apiOptionsGet(optionsContext);
    const parsedLines = await utilBackend().mecab.parseText(text);

    // Templates can't use the name 'furigana', so it is exposed as 'reading'.
    const toSegment = ({text, furigana}) => ({text, reading: furigana});

    const results = [];
    for (const parsedLine of parsedLines) {
        for (const {expression, reading, source} of parsedLine) {
            const hasReading = expression && reading;
            const term = hasReading ?
                Array.from(
                    jpDistributeFuriganaInflected(expression, jpKatakanaToHiragana(reading), source),
                    toSegment
                ) :
                [{text: source}];
            results.push(term);
        }
        results.push([{text: '\n'}]);
    }
    return results;
}
|
||||
|
||||
async function apiKanjiFind(text, optionsContext) {
|
||||
const options = await apiOptionsGet(optionsContext);
|
||||
const definitions = await utilBackend().translator.findKanji(text, options);
|
||||
|
@ -21,6 +21,7 @@ class Backend {
|
||||
constructor() {
|
||||
this.translator = new Translator();
|
||||
this.anki = new AnkiNull();
|
||||
this.mecab = new Mecab();
|
||||
this.options = null;
|
||||
this.optionsContext = {
|
||||
depth: 0,
|
||||
@ -181,6 +182,7 @@ Backend.messageHandlers = {
|
||||
kanjiFind: ({text, optionsContext}) => apiKanjiFind(text, optionsContext),
|
||||
termsFind: ({text, details, optionsContext}) => apiTermsFind(text, details, optionsContext),
|
||||
textParse: ({text, optionsContext}) => apiTextParse(text, optionsContext),
|
||||
textParseMecab: ({text, optionsContext}) => apiTextParseMecab(text, optionsContext),
|
||||
definitionAdd: ({definition, mode, context, optionsContext}) => apiDefinitionAdd(definition, mode, context, optionsContext),
|
||||
definitionsAddable: ({definitions, modes, optionsContext}) => apiDefinitionsAddable(definitions, modes, optionsContext),
|
||||
noteView: ({noteId}) => apiNoteView(noteId),
|
||||
|
63
ext/bg/js/mecab.js
Normal file
63
ext/bg/js/mecab.js
Normal file
@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright (C) 2019 Alex Yatskov <alex@foosoft.net>
|
||||
* Author: Alex Yatskov <alex@foosoft.net>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
/**
 * Client for the native-messaging host registered under the name 'mecab'.
 *
 * Every request is tagged with an increasing sequence number so that replies
 * arriving on the shared port can be routed back to the promise that issued
 * the request. Requests that receive no reply within `Mecab.timeout`
 * milliseconds are rejected.
 */
class Mecab {
    constructor() {
        // Pending requests keyed by sequence number; values are {callback, timer}.
        this.listeners = {};
        this.sequence = 0;
        this.startListener();
    }

    /**
     * Sends a 'parse_text' request for `text`.
     * @param text - text to be parsed by the native host.
     * @returns a promise resolving with the `data` field of the host's reply.
     */
    async parseText(text) {
        return await this.invoke('parse_text', {text});
    }

    /**
     * Opens the native port and installs the reply dispatcher.
     */
    startListener() {
        this.port = chrome.runtime.connectNative('mecab');
        this.port.onMessage.addListener((message) => {
            const {sequence, data} = message;
            const {callback, timer} = this.listeners[sequence] || {};
            // Replies for unknown or already timed-out sequences are ignored.
            if (typeof callback !== 'function') {
                return;
            }
            clearTimeout(timer);
            delete this.listeners[sequence];
            callback(data);
        });
    }

    /**
     * Posts `{action, params, sequence}` to the native host and waits for the
     * reply carrying the same sequence number.
     * @param action - action name understood by the native host.
     * @param params - parameters for the action.
     * @returns a promise resolving with the reply's `data`; it rejects with an
     *   Error on timeout or when posting the message fails.
     */
    invoke(action, params) {
        return new Promise((resolve, reject) => {
            const sequence = this.sequence++;
            // Read once so the delay and the reported value always agree.
            const timeout = Mecab.timeout;

            const timer = setTimeout(() => {
                delete this.listeners[sequence];
                reject(new Error(`Mecab invoke timed out in ${timeout} ms`));
            }, timeout);

            this.listeners[sequence] = {callback: resolve, timer};

            try {
                this.port.postMessage({action, params, sequence});
            } catch (e) {
                // Don't leave a dangling timer/listener for a request that was never sent.
                clearTimeout(timer);
                delete this.listeners[sequence];
                reject(e);
            }
        });
    }
}

// Milliseconds to wait for a reply before a request is rejected.
Mecab.timeout = 1000;
|
@ -74,7 +74,8 @@ class QueryParser {
|
||||
preview: true
|
||||
});
|
||||
|
||||
const results = await apiTextParse(text, this.search.getOptionsContext());
|
||||
// const results = await apiTextParse(text, this.search.getOptionsContext());
|
||||
const results = await apiTextParseMecab(text, this.search.getOptionsContext());
|
||||
|
||||
const content = await apiTemplateRender('query-parser.html', {
|
||||
terms: results.map((term) => {
|
||||
|
@ -33,6 +33,10 @@ function apiTextParse(text, optionsContext) {
|
||||
return utilInvoke('textParse', {text, optionsContext});
|
||||
}
|
||||
|
||||
/**
 * Forwards a parse request to the 'textParseMecab' action through utilInvoke.
 *
 * @param text - text to parse.
 * @param optionsContext - options context passed along with the request.
 * @returns whatever utilInvoke returns for the request.
 */
function apiTextParseMecab(text, optionsContext) {
    const params = {text, optionsContext};
    return utilInvoke('textParseMecab', params);
}
|
||||
|
||||
/**
 * Forwards a kanji lookup to the 'kanjiFind' action through utilInvoke.
 *
 * @param text - text to look up.
 * @param optionsContext - options context passed along with the request.
 * @returns whatever utilInvoke returns for the request.
 */
function apiKanjiFind(text, optionsContext) {
    const params = {text, optionsContext};
    return utilInvoke('kanjiFind', params);
}
|
||||
|
@ -42,7 +42,8 @@
|
||||
"<all_urls>",
|
||||
"storage",
|
||||
"clipboardWrite",
|
||||
"unlimitedStorage"
|
||||
"unlimitedStorage",
|
||||
"nativeMessaging"
|
||||
],
|
||||
"optional_permissions": [
|
||||
"clipboardRead"
|
||||
|
@ -61,12 +61,11 @@ function jpDistributeFurigana(expression, reading) {
|
||||
|
||||
const group = groups[0];
|
||||
if (group.mode === 'kana') {
|
||||
if (reading.startsWith(group.text)) {
|
||||
const readingUsed = reading.substring(0, group.text.length);
|
||||
if (jpKatakanaToHiragana(reading).startsWith(jpKatakanaToHiragana(group.text))) {
|
||||
const readingLeft = reading.substring(group.text.length);
|
||||
const segs = segmentize(readingLeft, groups.splice(1));
|
||||
if (segs) {
|
||||
return [{text: readingUsed}].concat(segs);
|
||||
return [{text: group.text}].concat(segs);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -95,3 +94,33 @@ function jpDistributeFurigana(expression, reading) {
|
||||
|
||||
return segmentize(reading, groups) || fallback;
|
||||
}
|
||||
|
||||
/**
 * Distributes furigana over `source`, treating it as an inflected form of
 * `expression`.
 *
 * The leading part of `source` that lines up with `expression` (the stem) is
 * passed to jpDistributeFurigana together with the matching prefix of
 * `reading`; any remaining characters of `source` are appended as one plain
 * `{text}` segment.
 *
 * @param expression - expression the reading belongs to.
 * @param reading - reading of `expression`.
 * @param source - text as it actually appears.
 * @returns array of segments: the stem's segments, then the optional remainder.
 */
function jpDistributeFuriganaInflected(expression, reading, source) {
    const limit = Math.min(source.length, expression.length);
    const sourceKana = jpKatakanaToHiragana(source);
    const expressionKana = jpKatakanaToHiragana(expression);

    // Advance while positions agree. Non-kana source characters always count as
    // agreeing, because an expression can use a kanji that differs from the source.
    let stemLength = 0;
    for (; stemLength < limit; ++stemLength) {
        const mismatch = sourceKana[stemLength] !== expressionKana[stemLength];
        if (jpIsKana(source[stemLength]) && mismatch) {
            break;
        }
    }

    const coversWholeSource = stemLength === source.length;
    const readingEnd = coversWholeSource ?
        reading.length :
        reading.length - expression.length + stemLength;

    const output = [
        ...jpDistributeFurigana(source.slice(0, stemLength), reading.slice(0, readingEnd))
    ];

    if (!coversWholeSource) {
        output.push({text: source.slice(stemLength)});
    }

    return output;
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user