From 41020289ab68ef22a0691a9f268a79d6a706df6b Mon Sep 17 00:00:00 2001 From: siikamiika Date: Sun, 3 Nov 2019 05:08:57 +0200 Subject: [PATCH] add mecab support --- ext/bg/background.html | 1 + ext/bg/js/api.js | 48 +++++++++++++++--------- ext/bg/js/backend.js | 2 + ext/bg/js/mecab.js | 63 ++++++++++++++++++++++++++++++++ ext/bg/js/search-query-parser.js | 3 +- ext/fg/js/api.js | 4 ++ ext/manifest.json | 3 +- ext/mixed/js/japanese.js | 35 ++++++++++++++++-- 8 files changed, 136 insertions(+), 23 deletions(-) create mode 100644 ext/bg/js/mecab.js diff --git a/ext/bg/background.html b/ext/bg/background.html index bbfbd1e1..6e6e7c26 100644 --- a/ext/bg/background.html +++ b/ext/bg/background.html @@ -21,6 +21,7 @@ + diff --git a/ext/bg/js/api.js b/ext/bg/js/api.js index 7c9a72a7..2ab01af3 100644 --- a/ext/bg/js/api.js +++ b/ext/bg/js/api.js @@ -91,25 +91,10 @@ async function apiTextParse(text, optionsContext) { definitions = dictTermsSort(definitions); const {expression, reading} = definitions[0]; const source = text.slice(0, sourceLength); - - let stemLength = 0; - const shortest = Math.min(source.length, expression.length); - while (stemLength < shortest && source[stemLength] === expression[stemLength]) { - ++stemLength; + for (const {text, furigana} of jpDistributeFuriganaInflected(expression, reading, source)) { + // can't use 'furigana' in templates + term.push({text, reading: furigana}); } - const offset = source.length - stemLength; - - for (const {text, furigana} of jpDistributeFurigana( - source.slice(0, offset === 0 ? source.length : source.length - offset), - reading.slice(0, offset === 0 ? 
/**
 * Parses text through the backend's MeCab client and splits every parsed token
 * into display segments.
 *
 * For tokens that come back with both an expression and a reading, the reading
 * is passed through jpKatakanaToHiragana and distributed over the token's
 * source text with jpDistributeFuriganaInflected. Tokens lacking either field
 * are emitted as a single plain-text segment.
 *
 * @param {string} text - Text handed to `mecab.parseText`.
 * @param {object} optionsContext - Kept so the signature matches the other
 *     api* functions; this function does not read it.
 * @returns {Promise<Array<Array<{text: string, reading: (string|undefined)}>>>}
 *     One term (array of segments) per parsed token, with a `[{text: '\n'}]`
 *     term appended after every parsed line.
 */
async function apiTextParseMecab(text, optionsContext) {
    const mecab = utilBackend().mecab;

    const results = [];
    for (const parsedLine of await mecab.parseText(text)) {
        for (const {expression, reading, source} of parsedLine) {
            const term = [];
            if (expression && reading) {
                const segments = jpDistributeFuriganaInflected(
                    expression,
                    jpKatakanaToHiragana(reading),
                    source
                );
                for (const segment of segments) {
                    // can't use 'furigana' in templates, so it is exposed as 'reading'
                    term.push({text: segment.text, reading: segment.furigana});
                }
            } else {
                term.push({text: source});
            }
            results.push(term);
        }
        // Terminate each parsed line with a newline term.
        results.push([{text: '\n'}]);
    }
    return results;
}
context, optionsContext}) => apiDefinitionAdd(definition, mode, context, optionsContext), definitionsAddable: ({definitions, modes, optionsContext}) => apiDefinitionsAddable(definitions, modes, optionsContext), noteView: ({noteId}) => apiNoteView(noteId), diff --git a/ext/bg/js/mecab.js b/ext/bg/js/mecab.js new file mode 100644 index 00000000..dc46ded2 --- /dev/null +++ b/ext/bg/js/mecab.js @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2019 Alex Yatskov + * Author: Alex Yatskov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
/**
 * Client for the native-messaging host registered as 'mecab'.
 *
 * Every request is tagged with an increasing sequence number. A response
 * carrying the same sequence number resolves the matching pending promise;
 * a request that gets no response within `Mecab.timeout` milliseconds is
 * rejected.
 */
class Mecab {
    constructor() {
        // Pending requests keyed by sequence number: {callback, timer}.
        this.listeners = {};
        this.sequence = 0;
        this.startListener();
    }

    /**
     * Sends a 'parse_text' request for the given text.
     *
     * @param {string} text
     * @returns {Promise<*>} Resolves with the `data` field of the host's response.
     */
    async parseText(text) {
        return await this.invoke('parse_text', {text});
    }

    /**
     * Opens the native port and routes incoming messages to pending requests.
     * Messages whose sequence number has no pending request (unknown, already
     * answered, or timed out) are ignored.
     */
    startListener() {
        this.port = chrome.runtime.connectNative('mecab');
        this.port.onMessage.addListener((message) => {
            const {sequence, data} = message;
            const {callback, timer} = this.listeners[sequence] || {};
            if (timer) {
                clearTimeout(timer);
                delete this.listeners[sequence];
                callback(data);
            }
        });
    }

    /**
     * Posts `{action, params, sequence}` to the native host and waits for the reply.
     *
     * @param {string} action
     * @param {object} params
     * @returns {Promise<*>} Resolves with the response data; rejects with an
     *     Error on timeout, or with whatever `postMessage` throws.
     */
    invoke(action, params) {
        return new Promise((resolve, reject) => {
            const sequence = this.sequence++;
            // Read once so the timer and the error message always agree.
            const timeout = Mecab.timeout;

            const timer = setTimeout(() => {
                delete this.listeners[sequence];
                reject(new Error(`Mecab invoke timed out in ${timeout} ms`));
            }, timeout);

            this.listeners[sequence] = {callback: resolve, timer};

            try {
                this.port.postMessage({action, params, sequence});
            } catch (error) {
                // Do not leave a dangling timer/listener when the message could not be sent.
                clearTimeout(timer);
                delete this.listeners[sequence];
                reject(error);
            }
        });
    }
}

// Milliseconds to wait for a response before rejecting a request.
Mecab.timeout = 1000;
/**
 * Distributes furigana over `source`, a possibly inflected occurrence of
 * `expression`.
 *
 * The leading part of `source` that lines up with `expression` (the stem) is
 * handed to jpDistributeFurigana together with the front portion of `reading`;
 * whatever follows the stem in `source` is appended as one plain `{text}`
 * segment.
 *
 * @param {string} expression - Expression the reading belongs to.
 * @param {string} reading - Reading to distribute.
 * @param {string} source - Text as it appears in the input.
 * @returns {Array<object>} Segments returned by jpDistributeFurigana for the
 *     stem, followed by a `{text}` segment for the rest of `source` when the
 *     stem does not cover all of it.
 */
function jpDistributeFuriganaInflected(expression, reading, source) {
    const sourceHiragana = jpKatakanaToHiragana(source);
    const expressionHiragana = jpKatakanaToHiragana(expression);
    const shortest = Math.min(source.length, expression.length);

    let stemLength = 0;
    while (
        stemLength < shortest &&
        // sometimes an expression can use a kanji that's different from the source,
        // so only kana positions are required to agree (compared as hiragana)
        (!jpIsKana(source[stemLength]) || sourceHiragana[stemLength] === expressionHiragana[stemLength])
    ) {
        ++stemLength;
    }

    const wholeSourceMatched = stemLength === source.length;

    // When the whole source matched, the complete reading is used. Otherwise the
    // reading is shortened by the number of expression characters after the stem.
    // NOTE(review): this end index goes negative if the reading is shorter than
    // that unmatched tail, and slice() then counts from the end - confirm callers
    // cannot supply such input.
    const readingEnd = wholeSourceMatched ?
        reading.length :
        reading.length - expression.length + stemLength;

    const output = [
        ...jpDistributeFurigana(source.slice(0, stemLength), reading.slice(0, readingEnd))
    ];

    if (!wholeSourceMatched) {
        output.push({text: source.slice(stemLength)});
    }

    return output;
}