Merge pull request #456 from siikamiika/parse-text-refactor

Parse text refactor
This commit is contained in:
siikamiika 2020-04-17 01:32:01 +03:00 committed by GitHub
commit 8c16a6e580
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 150 additions and 143 deletions

View File

@ -84,7 +84,6 @@ class Backend {
['kanjiFind', {handler: this._onApiKanjiFind.bind(this), async: true}], ['kanjiFind', {handler: this._onApiKanjiFind.bind(this), async: true}],
['termsFind', {handler: this._onApiTermsFind.bind(this), async: true}], ['termsFind', {handler: this._onApiTermsFind.bind(this), async: true}],
['textParse', {handler: this._onApiTextParse.bind(this), async: true}], ['textParse', {handler: this._onApiTextParse.bind(this), async: true}],
['textParseMecab', {handler: this._onApiTextParseMecab.bind(this), async: true}],
['definitionAdd', {handler: this._onApiDefinitionAdd.bind(this), async: true}], ['definitionAdd', {handler: this._onApiDefinitionAdd.bind(this), async: true}],
['definitionsAddable', {handler: this._onApiDefinitionsAddable.bind(this), async: true}], ['definitionsAddable', {handler: this._onApiDefinitionsAddable.bind(this), async: true}],
['noteView', {handler: this._onApiNoteView.bind(this), async: true}], ['noteView', {handler: this._onApiNoteView.bind(this), async: true}],
@ -314,6 +313,60 @@ class Backend {
return await this.dictionaryImporter.import(this.database, archiveSource, onProgress, details); return await this.dictionaryImporter.import(this.database, archiveSource, onProgress, details);
} }
async _textParseScanning(text, options) {
const results = [];
while (text.length > 0) {
const term = [];
const [definitions, sourceLength] = await this.translator.findTerms(
'simple',
text.substring(0, options.scanning.length),
{},
options
);
if (definitions.length > 0 && sourceLength > 0) {
dictTermsSort(definitions);
const {expression, reading} = definitions[0];
const source = text.substring(0, sourceLength);
for (const {text: text2, furigana} of jp.distributeFuriganaInflected(expression, reading, source)) {
const reading2 = jp.convertReading(text2, furigana, options.parsing.readingMode);
term.push({text: text2, reading: reading2});
}
text = text.substring(source.length);
} else {
const reading = jp.convertReading(text[0], '', options.parsing.readingMode);
term.push({text: text[0], reading});
text = text.substring(1);
}
results.push(term);
}
return results;
}
async _textParseMecab(text, options) {
const results = [];
const rawResults = await this.mecab.parseText(text);
for (const [mecabName, parsedLines] of Object.entries(rawResults)) {
const result = [];
for (const parsedLine of parsedLines) {
for (const {expression, reading, source} of parsedLine) {
const term = [];
for (const {text: text2, furigana} of jp.distributeFuriganaInflected(
expression.length > 0 ? expression : source,
jp.convertKatakanaToHiragana(reading),
source
)) {
const reading2 = jp.convertReading(text2, furigana, options.parsing.readingMode);
term.push({text: text2, reading: reading2});
}
result.push(term);
}
result.push([{text: '\n', reading: ''}]);
}
results.push([mecabName, result]);
}
return results;
}
// Message handlers // Message handlers
_onApiYomichanCoreReady(_params, sender) { _onApiYomichanCoreReady(_params, sender) {
@ -405,61 +458,27 @@ class Backend {
async _onApiTextParse({text, optionsContext}) { async _onApiTextParse({text, optionsContext}) {
const options = this.getOptions(optionsContext); const options = this.getOptions(optionsContext);
const results = []; const results = [];
while (text.length > 0) {
const term = [];
const [definitions, sourceLength] = await this.translator.findTerms(
'simple',
text.substring(0, options.scanning.length),
{},
options
);
if (definitions.length > 0) {
dictTermsSort(definitions);
const {expression, reading} = definitions[0];
const source = text.substring(0, sourceLength);
for (const {text: text2, furigana} of jp.distributeFuriganaInflected(expression, reading, source)) {
const reading2 = jp.convertReading(text2, furigana, options.parsing.readingMode);
term.push({text: text2, reading: reading2});
}
text = text.substring(source.length);
} else {
const reading = jp.convertReading(text[0], null, options.parsing.readingMode);
term.push({text: text[0], reading});
text = text.substring(1);
}
results.push(term);
}
return results;
}
async _onApiTextParseMecab({text, optionsContext}) { if (options.parsing.enableScanningParser) {
const options = this.getOptions(optionsContext); results.push({
const results = []; source: 'scanning-parser',
const rawResults = await this.mecab.parseText(text); id: 'scan',
for (const [mecabName, parsedLines] of Object.entries(rawResults)) { content: await this._textParseScanning(text, options)
const result = []; });
for (const parsedLine of parsedLines) {
for (const {expression, reading, source} of parsedLine) {
const term = [];
if (expression !== null && reading !== null) {
for (const {text: text2, furigana} of jp.distributeFuriganaInflected(
expression,
jp.convertKatakanaToHiragana(reading),
source
)) {
const reading2 = jp.convertReading(text2, furigana, options.parsing.readingMode);
term.push({text: text2, reading: reading2});
}
} else {
const reading2 = jp.convertReading(source, null, options.parsing.readingMode);
term.push({text: source, reading: reading2});
}
result.push(term);
}
result.push([{text: '\n'}]);
}
results.push([mecabName, result]);
} }
if (options.parsing.enableMecabParser) {
const mecabResults = await this._textParseMecab(text, options);
for (const [mecabDictName, mecabDictResults] of mecabResults) {
results.push({
source: 'mecab',
dictionary: mecabDictName,
id: `mecab-${mecabDictName}`,
content: mecabDictResults
});
}
}
return results; return results;
} }

View File

@ -124,25 +124,25 @@
return wanakana.toRomaji(text); return wanakana.toRomaji(text);
} }
function convertReading(expressionFragment, readingFragment, readingMode) { function convertReading(expression, reading, readingMode) {
switch (readingMode) { switch (readingMode) {
case 'hiragana': case 'hiragana':
return convertKatakanaToHiragana(readingFragment || ''); return convertKatakanaToHiragana(reading);
case 'katakana': case 'katakana':
return convertHiraganaToKatakana(readingFragment || ''); return convertHiraganaToKatakana(reading);
case 'romaji': case 'romaji':
if (readingFragment) { if (reading) {
return convertToRomaji(readingFragment); return convertToRomaji(reading);
} else { } else {
if (isStringEntirelyKana(expressionFragment)) { if (isStringEntirelyKana(expression)) {
return convertToRomaji(expressionFragment); return convertToRomaji(expression);
} }
} }
return readingFragment; return reading;
case 'none': case 'none':
return null; return '';
default: default:
return readingFragment; return reading;
} }
} }
@ -300,7 +300,7 @@
const readingLeft = reading2.substring(group.text.length); const readingLeft = reading2.substring(group.text.length);
const segs = segmentize(readingLeft, groups.splice(1)); const segs = segmentize(readingLeft, groups.splice(1));
if (segs) { if (segs) {
return [{text: group.text}].concat(segs); return [{text: group.text, furigana: ''}].concat(segs);
} }
} }
} else { } else {
@ -368,7 +368,7 @@
} }
if (stemLength !== source.length) { if (stemLength !== source.length) {
output.push({text: source.substring(stemLength)}); output.push({text: source.substring(stemLength), furigana: ''});
} }
return output; return output;

View File

@ -40,7 +40,36 @@ class Mecab {
} }
async parseText(text) { async parseText(text) {
return await this.invoke('parse_text', {text}); const rawResults = await this.invoke('parse_text', {text});
// {
// 'mecab-name': [
// // line1
// [
// {str expression: 'expression', str reading: 'reading', str source: 'source'},
// {str expression: 'expression2', str reading: 'reading2', str source: 'source2'}
// ],
// line2,
// ...
// ],
// 'mecab-name2': [...]
// }
const results = {};
for (const [mecabName, parsedLines] of Object.entries(rawResults)) {
const result = [];
for (const parsedLine of parsedLines) {
const line = [];
for (const {expression, reading, source} of parsedLine) {
line.push({
expression: expression || '',
reading: reading || '',
source: source || ''
});
}
result.push(line);
}
results[mecabName] = result;
}
return results;
} }
startListener() { startListener() {

View File

@ -36,7 +36,7 @@ class QueryParserGenerator {
const termContainer = this._templateHandler.instantiate(preview ? 'term-preview' : 'term'); const termContainer = this._templateHandler.instantiate(preview ? 'term-preview' : 'term');
for (const segment of term) { for (const segment of term) {
if (!segment.text.trim()) { continue; } if (!segment.text.trim()) { continue; }
if (!segment.reading || !segment.reading.trim()) { if (!segment.reading.trim()) {
termContainer.appendChild(this.createSegmentText(segment.text)); termContainer.appendChild(this.createSegmentText(segment.text));
} else { } else {
termContainer.appendChild(this.createSegment(segment)); termContainer.appendChild(this.createSegment(segment));
@ -71,7 +71,17 @@ class QueryParserGenerator {
for (const parseResult of parseResults) { for (const parseResult of parseResults) {
const optionContainer = this._templateHandler.instantiate('select-option'); const optionContainer = this._templateHandler.instantiate('select-option');
optionContainer.value = parseResult.id; optionContainer.value = parseResult.id;
optionContainer.textContent = parseResult.name; switch (parseResult.source) {
case 'scanning-parser':
optionContainer.textContent = 'Scanning parser';
break;
case 'mecab':
optionContainer.textContent = `MeCab: ${parseResult.dictionary}`;
break;
default:
optionContainer.textContent = 'Unrecognized dictionary';
break;
}
optionContainer.defaultSelected = selectedParser === parseResult.id; optionContainer.defaultSelected = selectedParser === parseResult.id;
selectContainer.appendChild(optionContainer); selectContainer.appendChild(optionContainer);
} }

View File

@ -21,7 +21,6 @@
* apiOptionsSet * apiOptionsSet
* apiTermsFind * apiTermsFind
* apiTextParse * apiTextParse
* apiTextParseMecab
* docSentenceExtract * docSentenceExtract
*/ */
@ -128,7 +127,7 @@ class QueryParser extends TextScanner {
this.setPreview(text); this.setPreview(text);
this.parseResults = await this.parseText(text); this.parseResults = await apiTextParse(text, this.getOptionsContext());
this.refreshSelectedParser(); this.refreshSelectedParser();
this.renderParserSelect(); this.renderParserSelect();
@ -137,33 +136,11 @@ class QueryParser extends TextScanner {
this.setSpinnerVisible(false); this.setSpinnerVisible(false);
} }
async parseText(text) {
const results = [];
if (this.options.parsing.enableScanningParser) {
results.push({
name: 'Scanning parser',
id: 'scan',
parsedText: await apiTextParse(text, this.getOptionsContext())
});
}
if (this.options.parsing.enableMecabParser) {
const mecabResults = await apiTextParseMecab(text, this.getOptionsContext());
for (const [mecabDictName, mecabDictResults] of mecabResults) {
results.push({
name: `MeCab: ${mecabDictName}`,
id: `mecab-${mecabDictName}`,
parsedText: mecabDictResults
});
}
}
return results;
}
setPreview(text) { setPreview(text) {
const previewTerms = []; const previewTerms = [];
for (let i = 0, ii = text.length; i < ii; i += 2) { for (let i = 0, ii = text.length; i < ii; i += 2) {
const tempText = text.substring(i, i + 2); const tempText = text.substring(i, i + 2);
previewTerms.push([{text: tempText}]); previewTerms.push([{text: tempText, reading: ''}]);
} }
this.queryParser.textContent = ''; this.queryParser.textContent = '';
this.queryParser.appendChild(this.queryParserGenerator.createParseResult(previewTerms, true)); this.queryParser.appendChild(this.queryParserGenerator.createParseResult(previewTerms, true));
@ -183,6 +160,6 @@ class QueryParser extends TextScanner {
const parseResult = this.getParseResult(); const parseResult = this.getParseResult();
this.queryParser.textContent = ''; this.queryParser.textContent = '';
if (!parseResult) { return; } if (!parseResult) { return; }
this.queryParser.appendChild(this.queryParserGenerator.createParseResult(parseResult.parsedText)); this.queryParser.appendChild(this.queryParserGenerator.createParseResult(parseResult.content));
} }
} }

View File

@ -44,10 +44,6 @@ function apiTextParse(text, optionsContext) {
return _apiInvoke('textParse', {text, optionsContext}); return _apiInvoke('textParse', {text, optionsContext});
} }
/**
 * Invokes the 'textParseMecab' API action.
 *
 * @param {string} text The text to parse.
 * @param {object} optionsContext Context forwarded unchanged with the request.
 * @returns {*} Whatever `_apiInvoke` returns for this action.
 */
function apiTextParseMecab(text, optionsContext) {
    const params = {text, optionsContext};
    return _apiInvoke('textParseMecab', params);
}
function apiKanjiFind(text, optionsContext) { function apiKanjiFind(text, optionsContext) {
return _apiInvoke('kanjiFind', {text, optionsContext}); return _apiInvoke('kanjiFind', {text, optionsContext});
} }

View File

@ -176,19 +176,19 @@ function testConvertReading() {
[['アリガトウ', 'アリガトウ', 'hiragana'], 'ありがとう'], [['アリガトウ', 'アリガトウ', 'hiragana'], 'ありがとう'],
[['アリガトウ', 'アリガトウ', 'katakana'], 'アリガトウ'], [['アリガトウ', 'アリガトウ', 'katakana'], 'アリガトウ'],
[['アリガトウ', 'アリガトウ', 'romaji'], 'arigatou'], [['アリガトウ', 'アリガトウ', 'romaji'], 'arigatou'],
[['アリガトウ', 'アリガトウ', 'none'], null], [['アリガトウ', 'アリガトウ', 'none'], ''],
[['アリガトウ', 'アリガトウ', 'default'], 'アリガトウ'], [['アリガトウ', 'アリガトウ', 'default'], 'アリガトウ'],
[['ありがとう', 'ありがとう', 'hiragana'], 'ありがとう'], [['ありがとう', 'ありがとう', 'hiragana'], 'ありがとう'],
[['ありがとう', 'ありがとう', 'katakana'], 'アリガトウ'], [['ありがとう', 'ありがとう', 'katakana'], 'アリガトウ'],
[['ありがとう', 'ありがとう', 'romaji'], 'arigatou'], [['ありがとう', 'ありがとう', 'romaji'], 'arigatou'],
[['ありがとう', 'ありがとう', 'none'], null], [['ありがとう', 'ありがとう', 'none'], ''],
[['ありがとう', 'ありがとう', 'default'], 'ありがとう'], [['ありがとう', 'ありがとう', 'default'], 'ありがとう'],
[['有り難う', 'ありがとう', 'hiragana'], 'ありがとう'], [['有り難う', 'ありがとう', 'hiragana'], 'ありがとう'],
[['有り難う', 'ありがとう', 'katakana'], 'アリガトウ'], [['有り難う', 'ありがとう', 'katakana'], 'アリガトウ'],
[['有り難う', 'ありがとう', 'romaji'], 'arigatou'], [['有り難う', 'ありがとう', 'romaji'], 'arigatou'],
[['有り難う', 'ありがとう', 'none'], null], [['有り難う', 'ありがとう', 'none'], ''],
[['有り難う', 'ありがとう', 'default'], 'ありがとう'], [['有り難う', 'ありがとう', 'default'], 'ありがとう'],
// Cases with falsy readings // Cases with falsy readings
@ -196,44 +196,20 @@ function testConvertReading() {
[['ありがとう', '', 'hiragana'], ''], [['ありがとう', '', 'hiragana'], ''],
[['ありがとう', '', 'katakana'], ''], [['ありがとう', '', 'katakana'], ''],
[['ありがとう', '', 'romaji'], 'arigatou'], [['ありがとう', '', 'romaji'], 'arigatou'],
[['ありがとう', '', 'none'], null], [['ありがとう', '', 'none'], ''],
[['ありがとう', '', 'default'], ''], [['ありがとう', '', 'default'], ''],
[['ありがとう', null, 'hiragana'], ''],
[['ありがとう', null, 'katakana'], ''],
[['ありがとう', null, 'romaji'], 'arigatou'],
[['ありがとう', null, 'none'], null],
[['ありがとう', null, 'default'], null],
[['ありがとう', void 0, 'hiragana'], ''],
[['ありがとう', void 0, 'katakana'], ''],
[['ありがとう', void 0, 'romaji'], 'arigatou'],
[['ありがとう', void 0, 'none'], null],
[['ありがとう', void 0, 'default'], void 0],
// Cases with falsy readings and kanji expressions // Cases with falsy readings and kanji expressions
[['有り難う', '', 'hiragana'], ''], [['有り難う', '', 'hiragana'], ''],
[['有り難う', '', 'katakana'], ''], [['有り難う', '', 'katakana'], ''],
[['有り難う', '', 'romaji'], ''], [['有り難う', '', 'romaji'], ''],
[['有り難う', '', 'none'], null], [['有り難う', '', 'none'], ''],
[['有り難う', '', 'default'], ''], [['有り難う', '', 'default'], '']
[['有り難う', null, 'hiragana'], ''],
[['有り難う', null, 'katakana'], ''],
[['有り難う', null, 'romaji'], null],
[['有り難う', null, 'none'], null],
[['有り難う', null, 'default'], null],
[['有り難う', void 0, 'hiragana'], ''],
[['有り難う', void 0, 'katakana'], ''],
[['有り難う', void 0, 'romaji'], void 0],
[['有り難う', void 0, 'none'], null],
[['有り難う', void 0, 'default'], void 0]
]; ];
for (const [[expressionFragment, readingFragment, readingMode], expected] of data) { for (const [[expression, reading, readingMode], expected] of data) {
assert.strictEqual(jp.convertReading(expressionFragment, readingFragment, readingMode), expected); assert.strictEqual(jp.convertReading(expression, reading, readingMode), expected);
} }
} }
@ -303,9 +279,9 @@ function testDistributeFurigana() {
['有り難う', 'ありがとう'], ['有り難う', 'ありがとう'],
[ [
{text: '有', furigana: 'あ'}, {text: '有', furigana: 'あ'},
{text: 'り'}, {text: 'り', furigana: ''},
{text: '難', furigana: 'がと'}, {text: '難', furigana: 'がと'},
{text: 'う'} {text: 'う', furigana: ''}
] ]
], ],
[ [
@ -317,23 +293,23 @@ function testDistributeFurigana() {
[ [
['お祝い', 'おいわい'], ['お祝い', 'おいわい'],
[ [
{text: 'お'}, {text: 'お', furigana: ''},
{text: '祝', furigana: 'いわ'}, {text: '祝', furigana: 'いわ'},
{text: 'い'} {text: 'い', furigana: ''}
] ]
], ],
[ [
['美味しい', 'おいしい'], ['美味しい', 'おいしい'],
[ [
{text: '美味', furigana: 'おい'}, {text: '美味', furigana: 'おい'},
{text: 'しい'} {text: 'しい', furigana: ''}
] ]
], ],
[ [
['食べ物', 'たべもの'], ['食べ物', 'たべもの'],
[ [
{text: '食', furigana: 'た'}, {text: '食', furigana: 'た'},
{text: 'べ'}, {text: 'べ', furigana: ''},
{text: '物', furigana: 'もの'} {text: '物', furigana: 'もの'}
] ]
], ],
@ -341,9 +317,9 @@ function testDistributeFurigana() {
['試し切り', 'ためしぎり'], ['試し切り', 'ためしぎり'],
[ [
{text: '試', furigana: 'ため'}, {text: '試', furigana: 'ため'},
{text: 'し'}, {text: 'し', furigana: ''},
{text: '切', furigana: 'ぎ'}, {text: '切', furigana: 'ぎ'},
{text: 'り'} {text: 'り', furigana: ''}
] ]
], ],
// Ambiguous // Ambiguous
@ -373,16 +349,16 @@ function testDistributeFuriganaInflected() {
['美味しい', 'おいしい', '美味しかた'], ['美味しい', 'おいしい', '美味しかた'],
[ [
{text: '美味', furigana: 'おい'}, {text: '美味', furigana: 'おい'},
{text: 'し'}, {text: 'し', furigana: ''},
{text: 'かた'} {text: 'かた', furigana: ''}
] ]
], ],
[ [
['食べる', 'たべる', '食べた'], ['食べる', 'たべる', '食べた'],
[ [
{text: '食', furigana: 'た'}, {text: '食', furigana: 'た'},
{text: 'べ'}, {text: 'べ', furigana: ''},
{text: 'た'} {text: 'た', furigana: ''}
] ]
] ]
]; ];