1

Fixing deinflection bugs

This commit is contained in:
Alex Yatskov 2016-05-09 14:33:52 -07:00
parent 83fb78ba28
commit 66bf7b895f
4 changed files with 1469 additions and 1371 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -18,20 +18,14 @@
import codecs
import json
import re
#
# Deinflection
#
class Deinflection:
def __init__(self, term, tags=list(), rule=str()):
self.children = list()
self.term = term
self.tags = tags
self.rule = rule
self.success = False
def __init__(self, term, tags=[], rule=''):
self.children = []
self.term = term
self.tags = tags
self.rule = rule
def validate(self, validator):
@@ -40,25 +34,29 @@ class Deinflection:
return True
for tag in self.tags:
if self.searchTags(tag, tags):
if tag in tags:
return True
def deinflect(self, validator, rules):
if self.validate(validator):
child = Deinflection(self.term)
child = Deinflection(self.term, self.tags)
self.children.append(child)
for rule, variants in rules.items():
for variant in variants:
tagsIn = variant['tagsIn']
tagsOut = variant['tagsOut']
kanaIn = variant['kanaIn']
kanaOut = variant['kanaOut']
for v in variants:
tagsIn = v['tagsIn']
tagsOut = v['tagsOut']
kanaIn = v['kanaIn']
kanaOut = v['kanaOut']
allowed = len(self.tags) == 0
for tag in self.tags:
if self.searchTags(tag, tagsIn):
#
# TODO: Handle addons through tags.json or rules.json
#
if tag in tagsIn:
allowed = True
break
@@ -66,40 +64,29 @@ class Deinflection:
continue
term = self.term[:-len(kanaIn)] + kanaOut
child = Deinflection(term, tagsOut, rule)
if child.deinflect(validator, rules):
self.children.append(child)
if len(self.children) > 0:
return True
def searchTags(self, tag, tags):
for t in tags:
if re.search(tag, t):
return True
return len(self.children) > 0
def gather(self):
if len(self.children) == 0:
return [{'root': self.term, 'rules': list()}]
return [{'root': self.term, 'tags': self.tags, 'rules': []}]
paths = list()
paths = []
for child in self.children:
for path in child.gather():
if self.rule:
path['rules'].append(self.rule)
path['source'] = self.term
paths.append(path)
return paths
#
# Deinflector
#
class Deinflector:
def __init__(self, filename):
with codecs.open(filename, 'rb', 'utf-8') as fp:

View File

@@ -26,7 +26,7 @@ class Dictionary:
self.indices = set()
def findTerm(self, word, wildcards=False):
def findTerm(self, text, wildcards=False):
self.requireIndex('Vocab', 'expression')
self.requireIndex('Vocab', 'reading')
self.requireIndex('VocabGloss', 'vocabId')
@@ -34,30 +34,45 @@ class Dictionary:
cursor = self.db.cursor()
definitions = []
cursor.execute('SELECT * FROM Vocab WHERE expression {0} ? OR reading=?'.format('LIKE' if wildcards else '='), (word, word))
for vocabId, expression, reading, tags in cursor:
cursor.execute('SELECT * FROM Vocab WHERE expression {0} ? OR reading=?'.format('LIKE' if wildcards else '='), (text, text))
for vocabId, expression, reading, tags in cursor.fetchall():
tags = tags.split()
cursor.execute('SELECT glossary From VocabGloss WHERE vocabId=?', (vocabId,))
glossary = map(operator.itemgetter(0), cursor)
#
# TODO: Handle addons through data.
#
addons = []
for tag in tags:
if tag.startswith('v5') and tag != 'v5':
addons.append('v5')
elif tag.startswith('vs-'):
addons.append('vs')
definitions.append({
'id': vocabId,
'expression': expression,
'reading': reading,
'tags': tags.split(),
'glossary': '; '.join(glossary)
'glossary': '; '.join(glossary),
'tags': tags + addons,
'addons': addons
})
return definitions
def findCharacter(self, character):
assert len(character) == 1
def findKanji(self, text):
assert len(text) == 1
self.requireIndex('Kanji', 'character')
self.requireIndex('KanjiGloss', 'kanjiId')
cursor = self.db.cursor()
cursor.execute('SELECT * FROM Kanji WHERE character=? LIMIT 1', character)
cursor.execute('SELECT * FROM Kanji WHERE character=? LIMIT 1', text)
query = cursor.fetchone()
if query is None:
return
@@ -67,6 +82,7 @@ class Dictionary:
glossary = map(operator.itemgetter(0), cursor)
return {
'id': kanjiId,
'character': character,
'kunyomi': kunyomi,
'onyomi': onyomi,

View File

@@ -17,7 +17,6 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import operator
import util
@@ -28,37 +27,37 @@ class Translator:
def findTerm(self, text, wildcards=False):
text = util.sanitize(text, wildcards=wildcards)
groups = {}
text = util.sanitize(text, wildcards=wildcards)
groups = {}
for i in xrange(len(text), 0, -1):
term = text[:i]
deinflections = self.deinflector.deinflect(term, self.validator)
if deinflections is None:
self.processTerm(groups, term, wildcards=wildcards)
else:
for deinflection in deinflections:
self.processTerm(groups, **deinflection)
results = map(self.formatResult, groups.items())
results = filter(operator.truth, results)
results = sorted(results, key=lambda d: (len(d['source']), 'P' in d['tags'], -len(d['rules'])), reverse=True)
dfs = self.deinflector.deinflect(term, lambda term: [d['tags'] for d in self.dictionary.findTerm(term)])
if dfs is None:
continue
for df in dfs:
self.processTerm(groups, **df)
definitions = groups.values()
definitions = sorted(definitions, key=lambda d: (len(d['source']), 'P' in d['tags'], -len(d['rules'])), reverse=True)
length = 0
for result in results:
for result in definitions:
length = max(length, len(result['source']))
return results, length
return definitions, length
def findCharacters(self, text):
text = util.sanitize(text, kana=False)
text = util.sanitize(text, kana=False)
processed = {}
results = []
for c in text:
if c not in processed:
match = self.dictionary.findCharacter(c)
match = self.dictionary.findKanji(c)
if match is not None:
results.append(match)
processed[c] = match
@@ -66,26 +65,23 @@ class Translator:
return results
def processTerm(self, groups, source, rules=list(), root=str(), wildcards=False):
root = root or source
def processTerm(self, groups, source, tags, rules=[], root='', wildcards=False):
for entry in self.dictionary.findTerm(root, wildcards):
key = entry['expression'], entry['reading'], entry['glossary']
if key not in groups:
groups[key] = entry['tags'], source, rules
if entry['id'] in groups:
continue
matched = len(tags) == 0
for tag in tags:
if tag in entry['tags']:
matched = True
break
def formatResult(self, group):
(expression, reading, glossary), (tags, source, rules) = group
return {
'expression': expression,
'glossary': glossary,
'reading': reading,
'rules': rules,
'source': source,
'tags': tags
}
def validator(self, term):
return [d['tags'] for d in self.dictionary.findTerm(term)]
if matched:
groups[entry['id']] = {
'expression': entry['expression'],
'reading': entry['reading'],
'glossary': entry['glossary'],
'tags': entry['tags'],
'source': source,
'rules': rules
}