1

Fixing deinflection bugs

This commit is contained in:
Alex Yatskov 2016-05-09 14:33:52 -07:00
parent 83fb78ba28
commit 66bf7b895f
4 changed files with 1469 additions and 1371 deletions

View File

@ -72,7 +72,7 @@
"v1",
"v5",
"vk",
"vs-"
"vs"
]
},
{
@ -89,7 +89,7 @@
"kanaIn": "ちゃう",
"kanaOut": "る",
"tagsIn": [
"v5.*"
"v5"
],
"tagsOut": [
"v1",
@ -100,7 +100,7 @@
"kanaIn": "いじゃう",
"kanaOut": "ぐ",
"tagsIn": [
"v5.*"
"v5"
],
"tagsOut": [
"v5"
@ -110,7 +110,7 @@
"kanaIn": "いちゃう",
"kanaOut": "く",
"tagsIn": [
"v5.*"
"v5"
],
"tagsOut": [
"v5"
@ -120,7 +120,7 @@
"kanaIn": "きちゃう",
"kanaOut": "くる",
"tagsIn": [
"v5.*"
"v5"
],
"tagsOut": [
"vk"
@ -130,7 +130,7 @@
"kanaIn": "しちゃう",
"kanaOut": "す",
"tagsIn": [
"v5.*"
"v5"
],
"tagsOut": [
"v5"
@ -140,17 +140,17 @@
"kanaIn": "しちゃう",
"kanaOut": "する",
"tagsIn": [
"v5.*"
"v5"
],
"tagsOut": [
"vs-"
"vs"
]
},
{
"kanaIn": "っちゃう",
"kanaOut": "う",
"tagsIn": [
"v5.*"
"v5"
],
"tagsOut": [
"v5"
@ -160,7 +160,7 @@
"kanaIn": "っちゃう",
"kanaOut": "く",
"tagsIn": [
"v5.*"
"v5"
],
"tagsOut": [
"v5"
@ -170,7 +170,7 @@
"kanaIn": "っちゃう",
"kanaOut": "つ",
"tagsIn": [
"v5.*"
"v5"
],
"tagsOut": [
"v5"
@ -180,7 +180,7 @@
"kanaIn": "っちゃう",
"kanaOut": "る",
"tagsIn": [
"v5.*"
"v5"
],
"tagsOut": [
"v5"
@ -190,7 +190,7 @@
"kanaIn": "んじゃう",
"kanaOut": "ぬ",
"tagsIn": [
"v5.*"
"v5"
],
"tagsOut": [
"v5"
@ -200,7 +200,7 @@
"kanaIn": "んじゃう",
"kanaOut": "ぶ",
"tagsIn": [
"v5.*"
"v5"
],
"tagsOut": [
"v5"
@ -210,7 +210,7 @@
"kanaIn": "んじゃう",
"kanaOut": "む",
"tagsIn": [
"v5.*"
"v5"
],
"tagsOut": [
"v5"
@ -272,7 +272,7 @@
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -379,7 +379,7 @@
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -502,7 +502,7 @@
"v1"
],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -625,7 +625,7 @@
"adj-i"
],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -726,7 +726,7 @@
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -833,7 +833,7 @@
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -948,7 +948,7 @@
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -1055,7 +1055,7 @@
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -1107,6 +1107,105 @@
]
}
],
"-nu": [
{
"kanaIn": "ぬ",
"kanaOut": "る",
"tagsIn": [],
"tagsOut": [
"v1",
"vk"
]
},
{
"kanaIn": "かぬ",
"kanaOut": "く",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "がぬ",
"kanaOut": "ぐ",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "こぬ",
"kanaOut": "くる",
"tagsIn": [],
"tagsOut": [
"vk"
]
},
{
"kanaIn": "さぬ",
"kanaOut": "す",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "せぬ",
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs"
]
},
{
"kanaIn": "たぬ",
"kanaOut": "つ",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "なぬ",
"kanaOut": "ぬ",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "ばぬ",
"kanaOut": "ぶ",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "まぬ",
"kanaOut": "む",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "らぬ",
"kanaOut": "る",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "わぬ",
"kanaOut": "う",
"tagsIn": [],
"tagsOut": [
"v5"
]
}
],
"adv": [
{
"kanaIn": "く",
@ -1145,7 +1244,7 @@
"v1"
],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -1340,7 +1439,7 @@
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -1348,7 +1447,7 @@
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs-"
"vs"
]
}
],
@ -1361,7 +1460,7 @@
"v1",
"v5",
"vk",
"vs-"
"vs"
]
}
],
@ -1684,7 +1783,7 @@
"adj-i"
],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -1786,7 +1885,7 @@
"v1"
],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -1899,7 +1998,7 @@
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -2022,7 +2121,7 @@
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -2121,7 +2220,7 @@
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -2228,7 +2327,7 @@
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -2327,7 +2426,7 @@
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -2434,7 +2533,7 @@
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs-"
"vs"
]
},
{
@ -2701,7 +2800,7 @@
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs-"
"vs"
]
}
]

View File

@ -18,20 +18,14 @@
import codecs
import json
import re
#
# Deinflection
#
class Deinflection:
def __init__(self, term, tags=list(), rule=str()):
self.children = list()
def __init__(self, term, tags=[], rule=''):
self.children = []
self.term = term
self.tags = tags
self.rule = rule
self.success = False
def validate(self, validator):
@ -40,25 +34,29 @@ class Deinflection:
return True
for tag in self.tags:
if self.searchTags(tag, tags):
if tag in tags:
return True
def deinflect(self, validator, rules):
if self.validate(validator):
child = Deinflection(self.term)
child = Deinflection(self.term, self.tags)
self.children.append(child)
for rule, variants in rules.items():
for variant in variants:
tagsIn = variant['tagsIn']
tagsOut = variant['tagsOut']
kanaIn = variant['kanaIn']
kanaOut = variant['kanaOut']
for v in variants:
tagsIn = v['tagsIn']
tagsOut = v['tagsOut']
kanaIn = v['kanaIn']
kanaOut = v['kanaOut']
allowed = len(self.tags) == 0
for tag in self.tags:
if self.searchTags(tag, tagsIn):
#
# TODO: Handle addons through tags.json or rules.json
#
if tag in tagsIn:
allowed = True
break
@ -66,40 +64,29 @@ class Deinflection:
continue
term = self.term[:-len(kanaIn)] + kanaOut
child = Deinflection(term, tagsOut, rule)
if child.deinflect(validator, rules):
self.children.append(child)
if len(self.children) > 0:
return True
def searchTags(self, tag, tags):
for t in tags:
if re.search(tag, t):
return True
return len(self.children) > 0
def gather(self):
if len(self.children) == 0:
return [{'root': self.term, 'rules': list()}]
return [{'root': self.term, 'tags': self.tags, 'rules': []}]
paths = list()
paths = []
for child in self.children:
for path in child.gather():
if self.rule:
path['rules'].append(self.rule)
path['source'] = self.term
paths.append(path)
return paths
#
# Deinflector
#
class Deinflector:
def __init__(self, filename):
with codecs.open(filename, 'rb', 'utf-8') as fp:

View File

@ -26,7 +26,7 @@ class Dictionary:
self.indices = set()
def findTerm(self, word, wildcards=False):
def findTerm(self, text, wildcards=False):
self.requireIndex('Vocab', 'expression')
self.requireIndex('Vocab', 'reading')
self.requireIndex('VocabGloss', 'vocabId')
@ -34,30 +34,45 @@ class Dictionary:
cursor = self.db.cursor()
definitions = []
cursor.execute('SELECT * FROM Vocab WHERE expression {0} ? OR reading=?'.format('LIKE' if wildcards else '='), (word, word))
for vocabId, expression, reading, tags in cursor:
cursor.execute('SELECT * FROM Vocab WHERE expression {0} ? OR reading=?'.format('LIKE' if wildcards else '='), (text, text))
for vocabId, expression, reading, tags in cursor.fetchall():
tags = tags.split()
cursor.execute('SELECT glossary From VocabGloss WHERE vocabId=?', (vocabId,))
glossary = map(operator.itemgetter(0), cursor)
#
# TODO: Handle addons through data.
#
addons = []
for tag in tags:
if tag.startswith('v5') and tag != 'v5':
addons.append('v5')
elif tag.startswith('vs-'):
addons.append('vs')
definitions.append({
'id': vocabId,
'expression': expression,
'reading': reading,
'tags': tags.split(),
'glossary': '; '.join(glossary)
'glossary': '; '.join(glossary),
'tags': tags + addons,
'addons': addons
})
return definitions
def findCharacter(self, character):
assert len(character) == 1
def findKanji(self, text):
assert len(text) == 1
self.requireIndex('Kanji', 'character')
self.requireIndex('KanjiGloss', 'kanjiId')
cursor = self.db.cursor()
cursor.execute('SELECT * FROM Kanji WHERE character=? LIMIT 1', character)
cursor.execute('SELECT * FROM Kanji WHERE character=? LIMIT 1', text)
query = cursor.fetchone()
if query is None:
return
@ -67,6 +82,7 @@ class Dictionary:
glossary = map(operator.itemgetter(0), cursor)
return {
'id': kanjiId,
'character': character,
'kunyomi': kunyomi,
'onyomi': onyomi,

View File

@ -17,7 +17,6 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import operator
import util
@ -29,36 +28,36 @@ class Translator:
def findTerm(self, text, wildcards=False):
text = util.sanitize(text, wildcards=wildcards)
groups = {}
groups = {}
for i in xrange(len(text), 0, -1):
term = text[:i]
deinflections = self.deinflector.deinflect(term, self.validator)
if deinflections is None:
self.processTerm(groups, term, wildcards=wildcards)
else:
for deinflection in deinflections:
self.processTerm(groups, **deinflection)
results = map(self.formatResult, groups.items())
results = filter(operator.truth, results)
results = sorted(results, key=lambda d: (len(d['source']), 'P' in d['tags'], -len(d['rules'])), reverse=True)
dfs = self.deinflector.deinflect(term, lambda term: [d['tags'] for d in self.dictionary.findTerm(term)])
if dfs is None:
continue
for df in dfs:
self.processTerm(groups, **df)
definitions = groups.values()
definitions = sorted(definitions, key=lambda d: (len(d['source']), 'P' in d['tags'], -len(d['rules'])), reverse=True)
length = 0
for result in results:
for result in definitions:
length = max(length, len(result['source']))
return results, length
return definitions, length
def findCharacters(self, text):
text = util.sanitize(text, kana=False)
processed = {}
results = []
for c in text:
if c not in processed:
match = self.dictionary.findCharacter(c)
match = self.dictionary.findKanji(c)
if match is not None:
results.append(match)
processed[c] = match
@ -66,26 +65,23 @@ class Translator:
return results
def processTerm(self, groups, source, rules=list(), root=str(), wildcards=False):
root = root or source
def processTerm(self, groups, source, tags, rules=[], root='', wildcards=False):
for entry in self.dictionary.findTerm(root, wildcards):
key = entry['expression'], entry['reading'], entry['glossary']
if key not in groups:
groups[key] = entry['tags'], source, rules
if entry['id'] in groups:
continue
matched = len(tags) == 0
for tag in tags:
if tag in entry['tags']:
matched = True
break
def formatResult(self, group):
(expression, reading, glossary), (tags, source, rules) = group
return {
'expression': expression,
'glossary': glossary,
'reading': reading,
'rules': rules,
if matched:
groups[entry['id']] = {
'expression': entry['expression'],
'reading': entry['reading'],
'glossary': entry['glossary'],
'tags': entry['tags'],
'source': source,
'tags': tags
'rules': rules
}
def validator(self, term):
return [d['tags'] for d in self.dictionary.findTerm(term)]