Merge pull request #238 from toasted-nutbread/deinflector-optimization

Deinflector optimization
This commit is contained in:
Alex Yatskov 2019-10-05 19:49:54 -07:00 committed by GitHub
commit 14a5e3ce20
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 141 additions and 112 deletions

View File

@ -19,51 +19,74 @@
class Deinflector { class Deinflector {
constructor(reasons) { constructor(reasons) {
this.reasons = reasons; this.reasons = Deinflector.normalizeReasons(reasons);
} }
deinflect(source) { deinflect(source) {
const results = [{ const results = [{
source, source,
term: source, term: source,
rules: [], rules: 0,
definitions: [], definitions: [],
reasons: [] reasons: []
}]; }];
for (let i = 0; i < results.length; ++i) { for (let i = 0; i < results.length; ++i) {
const entry = results[i]; const {rules, term, reasons} = results[i];
for (const [reason, variants] of this.reasons) {
for (const reason in this.reasons) { for (const [kanaIn, kanaOut, rulesIn, rulesOut] of variants) {
for (const variant of this.reasons[reason]) { if (
let accept = entry.rules.length === 0; (rules !== 0 && (rules & rulesIn) === 0) ||
if (!accept) { !term.endsWith(kanaIn) ||
for (const rule of entry.rules) { (term.length - kanaIn.length + kanaOut.length) <= 0
if (variant.rulesIn.includes(rule)) { ) {
accept = true;
break;
}
}
}
if (!accept || !entry.term.endsWith(variant.kanaIn)) {
continue;
}
const term = entry.term.slice(0, -variant.kanaIn.length) + variant.kanaOut;
if (term.length === 0) {
continue; continue;
} }
results.push({ results.push({
source, source,
term, term: term.slice(0, -kanaIn.length) + kanaOut,
rules: variant.rulesOut, rules: rulesOut,
definitions: [], definitions: [],
reasons: [reason, ...entry.reasons] reasons: [reason, ...reasons]
}); });
} }
} }
} }
return results; return results;
} }
static normalizeReasons(reasons) {
const normalizedReasons = [];
for (const reason in reasons) {
const variants = [];
for (const {kanaIn, kanaOut, rulesIn, rulesOut} of reasons[reason]) {
variants.push([
kanaIn,
kanaOut,
Deinflector.rulesToRuleFlags(rulesIn),
Deinflector.rulesToRuleFlags(rulesOut)
]);
}
normalizedReasons.push([reason, variants]);
}
return normalizedReasons;
}
static rulesToRuleFlags(rules) {
const ruleTypes = Deinflector.ruleTypes;
let value = 0;
for (const rule of rules) {
value |= ruleTypes[rule];
}
return value;
}
} }
// Maps each rule name to its own single-bit flag so that a set of rules can
// be stored as one integer and tested with bitwise AND.
Deinflector.ruleTypes = {
    'v1':    1 << 0, // Verb ichidan
    'v5':    1 << 1, // Verb godan
    'vs':    1 << 2, // Verb suru
    'vk':    1 << 3, // Verb kuru
    'adj-i': 1 << 4, // Adjective i
    'iru':   1 << 5, // Intermediate -iru endings for progressive or perfect tense
};

View File

@ -238,8 +238,10 @@ class Translator {
const definitions = await this.database.findTermsBulk(uniqueDeinflectionTerms, titles); const definitions = await this.database.findTermsBulk(uniqueDeinflectionTerms, titles);
for (const definition of definitions) { for (const definition of definitions) {
const definitionRules = Deinflector.rulesToRuleFlags(definition.rules);
for (const deinflection of uniqueDeinflectionArrays[definition.index]) { for (const deinflection of uniqueDeinflectionArrays[definition.index]) {
if (Translator.definitionContainsAnyRule(definition, deinflection.rules)) { const deinflectionRules = deinflection.rules;
if (deinflectionRules === 0 || (definitionRules & deinflectionRules) !== 0) {
deinflection.definitions.push(definition); deinflection.definitions.push(definition);
} }
} }
@ -248,19 +250,6 @@ class Translator {
return deinflections.filter(e => e.definitions.length > 0); return deinflections.filter(e => e.definitions.length > 0);
} }
static definitionContainsAnyRule(definition, rules) {
if (rules.length === 0) {
return true;
}
const definitionRules = definition.rules;
for (const rule of rules) {
if (definitionRules.includes(rule)) {
return true;
}
}
return false;
}
getDeinflections(text) { getDeinflections(text) {
const deinflections = []; const deinflections = [];

View File

@ -1186,7 +1186,7 @@
"kanaIn": "て", "kanaIn": "て",
"kanaOut": "る", "kanaOut": "る",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v1", "v1",
@ -1197,7 +1197,7 @@
"kanaIn": "いて", "kanaIn": "いて",
"kanaOut": "く", "kanaOut": "く",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1207,7 +1207,7 @@
"kanaIn": "いで", "kanaIn": "いで",
"kanaOut": "ぐ", "kanaOut": "ぐ",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1217,7 +1217,7 @@
"kanaIn": "きて", "kanaIn": "きて",
"kanaOut": "くる", "kanaOut": "くる",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"vk" "vk"
@ -1227,7 +1227,7 @@
"kanaIn": "くて", "kanaIn": "くて",
"kanaOut": "い", "kanaOut": "い",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"adj-i" "adj-i"
@ -1237,7 +1237,7 @@
"kanaIn": "して", "kanaIn": "して",
"kanaOut": "す", "kanaOut": "す",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1247,7 +1247,7 @@
"kanaIn": "して", "kanaIn": "して",
"kanaOut": "する", "kanaOut": "する",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"vs" "vs"
@ -1257,7 +1257,7 @@
"kanaIn": "って", "kanaIn": "って",
"kanaOut": "う", "kanaOut": "う",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1267,7 +1267,7 @@
"kanaIn": "って", "kanaIn": "って",
"kanaOut": "つ", "kanaOut": "つ",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1277,7 +1277,7 @@
"kanaIn": "って", "kanaIn": "って",
"kanaOut": "る", "kanaOut": "る",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1287,7 +1287,7 @@
"kanaIn": "んで", "kanaIn": "んで",
"kanaOut": "ぬ", "kanaOut": "ぬ",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1297,7 +1297,7 @@
"kanaIn": "んで", "kanaIn": "んで",
"kanaOut": "ぶ", "kanaOut": "ぶ",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1307,7 +1307,7 @@
"kanaIn": "んで", "kanaIn": "んで",
"kanaOut": "む", "kanaOut": "む",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1317,7 +1317,7 @@
"kanaIn": "のたもうて", "kanaIn": "のたもうて",
"kanaOut": "のたまう", "kanaOut": "のたまう",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1327,7 +1327,7 @@
"kanaIn": "いって", "kanaIn": "いって",
"kanaOut": "いく", "kanaOut": "いく",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1337,7 +1337,7 @@
"kanaIn": "おうて", "kanaIn": "おうて",
"kanaOut": "おう", "kanaOut": "おう",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1347,7 +1347,7 @@
"kanaIn": "こうて", "kanaIn": "こうて",
"kanaOut": "こう", "kanaOut": "こう",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1357,7 +1357,7 @@
"kanaIn": "そうて", "kanaIn": "そうて",
"kanaOut": "そう", "kanaOut": "そう",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1367,7 +1367,7 @@
"kanaIn": "とうて", "kanaIn": "とうて",
"kanaOut": "とう", "kanaOut": "とう",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1377,7 +1377,7 @@
"kanaIn": "行って", "kanaIn": "行って",
"kanaOut": "行く", "kanaOut": "行く",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1387,7 +1387,7 @@
"kanaIn": "逝って", "kanaIn": "逝って",
"kanaOut": "逝く", "kanaOut": "逝く",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1397,7 +1397,7 @@
"kanaIn": "往って", "kanaIn": "往って",
"kanaOut": "往く", "kanaOut": "往く",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1407,7 +1407,7 @@
"kanaIn": "請うて", "kanaIn": "請うて",
"kanaOut": "請う", "kanaOut": "請う",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1417,7 +1417,7 @@
"kanaIn": "乞うて", "kanaIn": "乞うて",
"kanaOut": "乞う", "kanaOut": "乞う",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1427,7 +1427,7 @@
"kanaIn": "恋うて", "kanaIn": "恋うて",
"kanaOut": "恋う", "kanaOut": "恋う",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1437,7 +1437,7 @@
"kanaIn": "問うて", "kanaIn": "問うて",
"kanaOut": "問う", "kanaOut": "問う",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1447,7 +1447,7 @@
"kanaIn": "負うて", "kanaIn": "負うて",
"kanaOut": "負う", "kanaOut": "負う",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1457,7 +1457,7 @@
"kanaIn": "沿うて", "kanaIn": "沿うて",
"kanaOut": "沿う", "kanaOut": "沿う",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1467,7 +1467,7 @@
"kanaIn": "添うて", "kanaIn": "添うて",
"kanaOut": "添う", "kanaOut": "添う",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1477,7 +1477,7 @@
"kanaIn": "副うて", "kanaIn": "副うて",
"kanaOut": "副う", "kanaOut": "副う",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -1487,21 +1487,11 @@
"kanaIn": "厭うて", "kanaIn": "厭うて",
"kanaOut": "厭う", "kanaOut": "厭う",
"rulesIn": [ "rulesIn": [
"iru" "iru"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
] ]
},
{
"kanaIn": "で",
"kanaOut": "",
"rulesIn": [
"iru"
],
"rulesOut": [
"neg-de"
]
} }
], ],
"-zu": [ "-zu": [
@ -2233,8 +2223,7 @@
"kanaIn": "ない", "kanaIn": "ない",
"kanaOut": "る", "kanaOut": "る",
"rulesIn": [ "rulesIn": [
"adj-i", "adj-i"
"neg-de"
], ],
"rulesOut": [ "rulesOut": [
"v1", "v1",
@ -2245,8 +2234,7 @@
"kanaIn": "かない", "kanaIn": "かない",
"kanaOut": "く", "kanaOut": "く",
"rulesIn": [ "rulesIn": [
"adj-i", "adj-i"
"neg-de"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -2256,8 +2244,7 @@
"kanaIn": "がない", "kanaIn": "がない",
"kanaOut": "ぐ", "kanaOut": "ぐ",
"rulesIn": [ "rulesIn": [
"adj-i", "adj-i"
"neg-de"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -2267,8 +2254,7 @@
"kanaIn": "くない", "kanaIn": "くない",
"kanaOut": "い", "kanaOut": "い",
"rulesIn": [ "rulesIn": [
"adj-i", "adj-i"
"neg-de"
], ],
"rulesOut": [ "rulesOut": [
"adj-i" "adj-i"
@ -2278,8 +2264,7 @@
"kanaIn": "こない", "kanaIn": "こない",
"kanaOut": "くる", "kanaOut": "くる",
"rulesIn": [ "rulesIn": [
"adj-i", "adj-i"
"neg-de"
], ],
"rulesOut": [ "rulesOut": [
"vk" "vk"
@ -2289,8 +2274,7 @@
"kanaIn": "さない", "kanaIn": "さない",
"kanaOut": "す", "kanaOut": "す",
"rulesIn": [ "rulesIn": [
"adj-i", "adj-i"
"neg-de"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -2300,8 +2284,7 @@
"kanaIn": "しない", "kanaIn": "しない",
"kanaOut": "する", "kanaOut": "する",
"rulesIn": [ "rulesIn": [
"adj-i", "adj-i"
"neg-de"
], ],
"rulesOut": [ "rulesOut": [
"vs" "vs"
@ -2311,8 +2294,7 @@
"kanaIn": "たない", "kanaIn": "たない",
"kanaOut": "つ", "kanaOut": "つ",
"rulesIn": [ "rulesIn": [
"adj-i", "adj-i"
"neg-de"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -2322,8 +2304,7 @@
"kanaIn": "なない", "kanaIn": "なない",
"kanaOut": "ぬ", "kanaOut": "ぬ",
"rulesIn": [ "rulesIn": [
"adj-i", "adj-i"
"neg-de"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -2333,8 +2314,7 @@
"kanaIn": "ばない", "kanaIn": "ばない",
"kanaOut": "ぶ", "kanaOut": "ぶ",
"rulesIn": [ "rulesIn": [
"adj-i", "adj-i"
"neg-de"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -2344,8 +2324,7 @@
"kanaIn": "まない", "kanaIn": "まない",
"kanaOut": "む", "kanaOut": "む",
"rulesIn": [ "rulesIn": [
"adj-i", "adj-i"
"neg-de"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -2355,8 +2334,7 @@
"kanaIn": "らない", "kanaIn": "らない",
"kanaOut": "る", "kanaOut": "る",
"rulesIn": [ "rulesIn": [
"adj-i", "adj-i"
"neg-de"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -2366,8 +2344,7 @@
"kanaIn": "わない", "kanaIn": "わない",
"kanaOut": "う", "kanaOut": "う",
"rulesIn": [ "rulesIn": [
"adj-i", "adj-i"
"neg-de"
], ],
"rulesOut": [ "rulesOut": [
"v5" "v5"
@ -3681,8 +3658,8 @@
], ],
"progressive or perfect": [ "progressive or perfect": [
{ {
"kanaIn": "いる", "kanaIn": "いる",
"kanaOut": "", "kanaOut": "",
"rulesIn": [ "rulesIn": [
"v1" "v1"
], ],
@ -3691,8 +3668,8 @@
] ]
}, },
{ {
"kanaIn": "る", "kanaIn": "ておる",
"kanaOut": "", "kanaOut": "",
"rulesIn": [ "rulesIn": [
"v1" "v1"
], ],
@ -3701,14 +3678,54 @@
] ]
}, },
{ {
"kanaIn": "る", "kanaIn": "る",
"kanaOut": "", "kanaOut": "",
"rulesIn": [ "rulesIn": [
"v1" "v1"
], ],
"rulesOut": [ "rulesOut": [
"iru" "iru"
] ]
},
{
"kanaIn": "でいる",
"kanaOut": "で",
"rulesIn": [
"v1"
],
"rulesOut": [
"iru"
]
},
{
"kanaIn": "でおる",
"kanaOut": "で",
"rulesIn": [
"v1"
],
"rulesOut": [
"iru"
]
},
{
"kanaIn": "とる",
"kanaOut": "て",
"rulesIn": [
"v1"
],
"rulesOut": [
"iru"
]
},
{
"kanaIn": "ないでいる",
"kanaOut": "ない",
"rulesIn": [
"v1"
],
"rulesOut": [
"adj-i"
]
} }
] ]
} }