add mecab support

This commit is contained in:
siikamiika 2019-11-03 05:08:57 +02:00
parent 3881457e4e
commit 41020289ab
8 changed files with 136 additions and 23 deletions

View File

@ -21,6 +21,7 @@
<script src="/mixed/js/extension.js"></script>
<script src="/bg/js/anki.js"></script>
<script src="/bg/js/mecab.js"></script>
<script src="/bg/js/api.js"></script>
<script src="/bg/js/audio.js"></script>
<script src="/bg/js/backend-api-forwarder.js"></script>

View File

@ -91,25 +91,10 @@ async function apiTextParse(text, optionsContext) {
definitions = dictTermsSort(definitions);
const {expression, reading} = definitions[0];
const source = text.slice(0, sourceLength);
let stemLength = 0;
const shortest = Math.min(source.length, expression.length);
while (stemLength < shortest && source[stemLength] === expression[stemLength]) {
++stemLength;
for (const {text, furigana} of jpDistributeFuriganaInflected(expression, reading, source)) {
// can't use 'furigana' in templates
term.push({text, reading: furigana});
}
const offset = source.length - stemLength;
for (const {text, furigana} of jpDistributeFurigana(
source.slice(0, offset === 0 ? source.length : source.length - offset),
reading.slice(0, offset === 0 ? reading.length : reading.length - expression.length + stemLength)
)) {
term.push({text, reading: furigana || ''});
}
if (stemLength !== source.length) {
term.push({text: source.slice(stemLength)});
}
text = text.slice(source.length);
} else {
term.push({text: text[0]});
@ -120,6 +105,33 @@ async function apiTextParse(text, optionsContext) {
return results;
}
/**
 * Parses `text` with the backend's MeCab client and converts every parsed
 * token into a list of `{text, reading}` segments.
 *
 * Each token contributes one entry to the result. After the tokens of each
 * parsed line, a `[{text: '\n'}]` entry is appended.
 *
 * @param {string} text - Text handed to `mecab.parseText`.
 * @param {object} optionsContext - Context forwarded to `apiOptionsGet`.
 * @returns {Promise<Array<Array<object>>>} One segment list per token, plus
 *     one newline entry per parsed line.
 */
async function apiTextParseMecab(text, optionsContext) {
    // The resolved options are not consulted below; the call is kept so that
    // anything it does (including rejecting) still happens first.
    await apiOptionsGet(optionsContext);

    const parser = utilBackend().mecab;
    const results = [];
    const parsedLines = await parser.parseText(text);

    for (const line of parsedLines) {
        for (const token of line) {
            const {expression, reading, source} = token;

            // Tokens lacking an expression or a reading are passed through as plain text.
            if (!(expression && reading)) {
                results.push([{text: source}]);
                continue;
            }

            const segments = jpDistributeFuriganaInflected(
                expression,
                jpKatakanaToHiragana(reading),
                source
            );
            // 'furigana' can't be used in templates, so it is exposed as 'reading'.
            results.push([...segments].map((segment) => ({
                text: segment.text,
                reading: segment.furigana
            })));
        }
        results.push([{text: '\n'}]);
    }

    return results;
}
async function apiKanjiFind(text, optionsContext) {
const options = await apiOptionsGet(optionsContext);
const definitions = await utilBackend().translator.findKanji(text, options);

View File

@ -21,6 +21,7 @@ class Backend {
constructor() {
this.translator = new Translator();
this.anki = new AnkiNull();
this.mecab = new Mecab();
this.options = null;
this.optionsContext = {
depth: 0,
@ -181,6 +182,7 @@ Backend.messageHandlers = {
kanjiFind: ({text, optionsContext}) => apiKanjiFind(text, optionsContext),
termsFind: ({text, details, optionsContext}) => apiTermsFind(text, details, optionsContext),
textParse: ({text, optionsContext}) => apiTextParse(text, optionsContext),
textParseMecab: ({text, optionsContext}) => apiTextParseMecab(text, optionsContext),
definitionAdd: ({definition, mode, context, optionsContext}) => apiDefinitionAdd(definition, mode, context, optionsContext),
definitionsAddable: ({definitions, modes, optionsContext}) => apiDefinitionsAddable(definitions, modes, optionsContext),
noteView: ({noteId}) => apiNoteView(noteId),

63
ext/bg/js/mecab.js Normal file
View File

@ -0,0 +1,63 @@
/*
* Copyright (C) 2019 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * Client for the native messaging host registered under the name 'mecab'.
 *
 * Requests are posted as `{action, params, sequence}`. A response is matched
 * to its request through the `sequence` field and resolves the pending promise
 * with the response's `data` field.
 */
class Mecab {
    constructor() {
        // Pending requests keyed by sequence number: {callback, timer}.
        this.listeners = {};
        this.sequence = 0;
        this.startListener();
    }

    /**
     * Asks the native host to parse `text`.
     *
     * @param {string} text - Text to parse.
     * @returns {Promise<*>} Resolves with the `data` field of the host's reply.
     */
    async parseText(text) {
        return await this.invoke('parse_text', {text});
    }

    /**
     * Opens the native messaging port and routes incoming messages to the
     * pending request with the same sequence number.
     */
    startListener() {
        this.port = chrome.runtime.connectNative('mecab');
        this.port.onMessage.addListener((message) => {
            const {sequence, data} = message;
            const listener = this.listeners[sequence];
            if (!listener) {
                // Unknown or already timed-out request; nothing left to resolve.
                return;
            }
            clearTimeout(listener.timer);
            delete this.listeners[sequence];
            listener.callback(data);
        });
    }

    /**
     * Sends a request to the native host and waits for the matching reply.
     *
     * @param {string} action - Action name understood by the native host.
     * @param {object} params - Parameters sent along with the action.
     * @returns {Promise<*>} Resolves with the reply's `data`. Rejects with an
     *     `Error` when no reply arrives within `Mecab.timeout` milliseconds,
     *     or with whatever `postMessage` throws.
     */
    invoke(action, params) {
        return new Promise((resolve, reject) => {
            const sequence = this.sequence++;
            // Read once so the timer and the error message always agree.
            const timeout = Mecab.timeout;

            const timer = setTimeout(() => {
                delete this.listeners[sequence];
                reject(new Error(`Mecab invoke timed out in ${timeout} ms`));
            }, timeout);
            this.listeners[sequence] = {callback: resolve, timer};

            try {
                this.port.postMessage({action, params, sequence});
            } catch (error) {
                // The request never left; drop the timer and bookkeeping right away.
                clearTimeout(timer);
                delete this.listeners[sequence];
                reject(error);
            }
        });
    }
}

// Milliseconds to wait for a reply from the native host before giving up.
Mecab.timeout = 1000;

View File

@ -74,7 +74,8 @@ class QueryParser {
preview: true
});
const results = await apiTextParse(text, this.search.getOptionsContext());
// const results = await apiTextParse(text, this.search.getOptionsContext());
const results = await apiTextParseMecab(text, this.search.getOptionsContext());
const content = await apiTemplateRender('query-parser.html', {
terms: results.map((term) => {

View File

@ -33,6 +33,10 @@ function apiTextParse(text, optionsContext) {
return utilInvoke('textParse', {text, optionsContext});
}
/**
 * Forwards a MeCab text-parse request through `utilInvoke` using the
 * 'textParseMecab' action.
 *
 * @param {string} text - Text to parse.
 * @param {object} optionsContext - Options context sent with the request.
 * @returns {*} Whatever `utilInvoke` returns for this request.
 */
function apiTextParseMecab(text, optionsContext) {
    const params = {text, optionsContext};
    return utilInvoke('textParseMecab', params);
}
/**
 * Forwards a kanji lookup request through `utilInvoke` using the 'kanjiFind'
 * action.
 *
 * @param {string} text - Text to look up.
 * @param {object} optionsContext - Options context sent with the request.
 * @returns {*} Whatever `utilInvoke` returns for this request.
 */
function apiKanjiFind(text, optionsContext) {
    const params = {text, optionsContext};
    return utilInvoke('kanjiFind', params);
}

View File

@ -42,7 +42,8 @@
"<all_urls>",
"storage",
"clipboardWrite",
"unlimitedStorage"
"unlimitedStorage",
"nativeMessaging"
],
"optional_permissions": [
"clipboardRead"

View File

@ -61,12 +61,11 @@ function jpDistributeFurigana(expression, reading) {
const group = groups[0];
if (group.mode === 'kana') {
if (reading.startsWith(group.text)) {
const readingUsed = reading.substring(0, group.text.length);
if (jpKatakanaToHiragana(reading).startsWith(jpKatakanaToHiragana(group.text))) {
const readingLeft = reading.substring(group.text.length);
const segs = segmentize(readingLeft, groups.splice(1));
if (segs) {
return [{text: readingUsed}].concat(segs);
return [{text: group.text}].concat(segs);
}
}
} else {
@ -95,3 +94,33 @@ function jpDistributeFurigana(expression, reading) {
return segmentize(reading, groups) || fallback;
}
/**
 * Distributes furigana over the leading part of `source` that lines up with
 * `expression`, then appends whatever is left of `source` as a plain-text
 * segment.
 *
 * @param {string} expression - Expression compared against `source`.
 * @param {string} reading - Reading that belongs to `expression`.
 * @param {string} source - The text as it actually appeared.
 * @returns {Array<object>} Segments produced by `jpDistributeFurigana` for the
 *     shared stem, followed by a `{text}` segment for the remainder of
 *     `source` when the stem does not cover all of it.
 */
function jpDistributeFuriganaInflected(expression, reading, source) {
    const limit = Math.min(source.length, expression.length);
    const sourceNormalized = jpKatakanaToHiragana(source);
    const expressionNormalized = jpKatakanaToHiragana(expression);

    // Grow the stem until a kana character of the source disagrees with the
    // expression (katakana and hiragana are treated alike). Non-kana source
    // characters never end the stem, because an expression can use a kanji
    // that differs from the one in the source.
    let stemLength = 0;
    for (; stemLength < limit; ++stemLength) {
        const kanaMismatch =
            jpIsKana(source[stemLength]) &&
            sourceNormalized[stemLength] !== expressionNormalized[stemLength];
        if (kanaMismatch) {
            break;
        }
    }

    // When the stem spans the whole source the full reading is used; otherwise
    // the reading is shortened by the length of the expression's unmatched tail.
    const coversWholeSource = stemLength === source.length;
    const readingEnd = coversWholeSource ?
        reading.length :
        reading.length - expression.length + stemLength;

    const segments = [
        ...jpDistributeFurigana(source.slice(0, stemLength), reading.slice(0, readingEnd))
    ];
    if (!coversWholeSource) {
        segments.push({text: source.slice(stemLength)});
    }
    return segments;
}