1

Fixing deinflection bugs

This commit is contained in:
Alex Yatskov 2016-05-09 14:33:52 -07:00
parent 83fb78ba28
commit 66bf7b895f
4 changed files with 1469 additions and 1371 deletions

View File

@ -72,7 +72,7 @@
"v1", "v1",
"v5", "v5",
"vk", "vk",
"vs-" "vs"
] ]
}, },
{ {
@ -89,7 +89,7 @@
"kanaIn": "ちゃう", "kanaIn": "ちゃう",
"kanaOut": "る", "kanaOut": "る",
"tagsIn": [ "tagsIn": [
"v5.*" "v5"
], ],
"tagsOut": [ "tagsOut": [
"v1", "v1",
@ -100,7 +100,7 @@
"kanaIn": "いじゃう", "kanaIn": "いじゃう",
"kanaOut": "ぐ", "kanaOut": "ぐ",
"tagsIn": [ "tagsIn": [
"v5.*" "v5"
], ],
"tagsOut": [ "tagsOut": [
"v5" "v5"
@ -110,7 +110,7 @@
"kanaIn": "いちゃう", "kanaIn": "いちゃう",
"kanaOut": "く", "kanaOut": "く",
"tagsIn": [ "tagsIn": [
"v5.*" "v5"
], ],
"tagsOut": [ "tagsOut": [
"v5" "v5"
@ -120,7 +120,7 @@
"kanaIn": "きちゃう", "kanaIn": "きちゃう",
"kanaOut": "くる", "kanaOut": "くる",
"tagsIn": [ "tagsIn": [
"v5.*" "v5"
], ],
"tagsOut": [ "tagsOut": [
"vk" "vk"
@ -130,7 +130,7 @@
"kanaIn": "しちゃう", "kanaIn": "しちゃう",
"kanaOut": "す", "kanaOut": "す",
"tagsIn": [ "tagsIn": [
"v5.*" "v5"
], ],
"tagsOut": [ "tagsOut": [
"v5" "v5"
@ -140,17 +140,17 @@
"kanaIn": "しちゃう", "kanaIn": "しちゃう",
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [ "tagsIn": [
"v5.*" "v5"
], ],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
"kanaIn": "っちゃう", "kanaIn": "っちゃう",
"kanaOut": "う", "kanaOut": "う",
"tagsIn": [ "tagsIn": [
"v5.*" "v5"
], ],
"tagsOut": [ "tagsOut": [
"v5" "v5"
@ -160,7 +160,7 @@
"kanaIn": "っちゃう", "kanaIn": "っちゃう",
"kanaOut": "く", "kanaOut": "く",
"tagsIn": [ "tagsIn": [
"v5.*" "v5"
], ],
"tagsOut": [ "tagsOut": [
"v5" "v5"
@ -170,7 +170,7 @@
"kanaIn": "っちゃう", "kanaIn": "っちゃう",
"kanaOut": "つ", "kanaOut": "つ",
"tagsIn": [ "tagsIn": [
"v5.*" "v5"
], ],
"tagsOut": [ "tagsOut": [
"v5" "v5"
@ -180,7 +180,7 @@
"kanaIn": "っちゃう", "kanaIn": "っちゃう",
"kanaOut": "る", "kanaOut": "る",
"tagsIn": [ "tagsIn": [
"v5.*" "v5"
], ],
"tagsOut": [ "tagsOut": [
"v5" "v5"
@ -190,7 +190,7 @@
"kanaIn": "んじゃう", "kanaIn": "んじゃう",
"kanaOut": "ぬ", "kanaOut": "ぬ",
"tagsIn": [ "tagsIn": [
"v5.*" "v5"
], ],
"tagsOut": [ "tagsOut": [
"v5" "v5"
@ -200,7 +200,7 @@
"kanaIn": "んじゃう", "kanaIn": "んじゃう",
"kanaOut": "ぶ", "kanaOut": "ぶ",
"tagsIn": [ "tagsIn": [
"v5.*" "v5"
], ],
"tagsOut": [ "tagsOut": [
"v5" "v5"
@ -210,7 +210,7 @@
"kanaIn": "んじゃう", "kanaIn": "んじゃう",
"kanaOut": "む", "kanaOut": "む",
"tagsIn": [ "tagsIn": [
"v5.*" "v5"
], ],
"tagsOut": [ "tagsOut": [
"v5" "v5"
@ -272,7 +272,7 @@
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [], "tagsIn": [],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -379,7 +379,7 @@
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [], "tagsIn": [],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -502,7 +502,7 @@
"v1" "v1"
], ],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -625,7 +625,7 @@
"adj-i" "adj-i"
], ],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -726,7 +726,7 @@
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [], "tagsIn": [],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -833,7 +833,7 @@
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [], "tagsIn": [],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -948,7 +948,7 @@
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [], "tagsIn": [],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -1055,7 +1055,7 @@
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [], "tagsIn": [],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -1107,6 +1107,105 @@
] ]
} }
], ],
"-nu": [
{
"kanaIn": "ぬ",
"kanaOut": "る",
"tagsIn": [],
"tagsOut": [
"v1",
"vk"
]
},
{
"kanaIn": "かぬ",
"kanaOut": "く",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "がぬ",
"kanaOut": "ぐ",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "こぬ",
"kanaOut": "くる",
"tagsIn": [],
"tagsOut": [
"vk"
]
},
{
"kanaIn": "さぬ",
"kanaOut": "す",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "せぬ",
"kanaOut": "する",
"tagsIn": [],
"tagsOut": [
"vs"
]
},
{
"kanaIn": "たぬ",
"kanaOut": "つ",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "なぬ",
"kanaOut": "ぬ",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "ばぬ",
"kanaOut": "ぶ",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "まぬ",
"kanaOut": "む",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "らぬ",
"kanaOut": "る",
"tagsIn": [],
"tagsOut": [
"v5"
]
},
{
"kanaIn": "わぬ",
"kanaOut": "う",
"tagsIn": [],
"tagsOut": [
"v5"
]
}
],
"adv": [ "adv": [
{ {
"kanaIn": "く", "kanaIn": "く",
@ -1145,7 +1244,7 @@
"v1" "v1"
], ],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -1340,7 +1439,7 @@
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [], "tagsIn": [],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -1348,7 +1447,7 @@
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [], "tagsIn": [],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
} }
], ],
@ -1361,7 +1460,7 @@
"v1", "v1",
"v5", "v5",
"vk", "vk",
"vs-" "vs"
] ]
} }
], ],
@ -1684,7 +1783,7 @@
"adj-i" "adj-i"
], ],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -1786,7 +1885,7 @@
"v1" "v1"
], ],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -1899,7 +1998,7 @@
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [], "tagsIn": [],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -2022,7 +2121,7 @@
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [], "tagsIn": [],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -2121,7 +2220,7 @@
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [], "tagsIn": [],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -2228,7 +2327,7 @@
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [], "tagsIn": [],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -2327,7 +2426,7 @@
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [], "tagsIn": [],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -2434,7 +2533,7 @@
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [], "tagsIn": [],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
}, },
{ {
@ -2701,7 +2800,7 @@
"kanaOut": "する", "kanaOut": "する",
"tagsIn": [], "tagsIn": [],
"tagsOut": [ "tagsOut": [
"vs-" "vs"
] ]
} }
] ]

View File

@ -18,20 +18,14 @@
import codecs import codecs
import json import json
import re
#
# Deinflection
#
class Deinflection: class Deinflection:
def __init__(self, term, tags=list(), rule=str()): def __init__(self, term, tags=[], rule=''):
self.children = list() self.children = []
self.term = term self.term = term
self.tags = tags self.tags = tags
self.rule = rule self.rule = rule
self.success = False
def validate(self, validator): def validate(self, validator):
@ -40,25 +34,29 @@ class Deinflection:
return True return True
for tag in self.tags: for tag in self.tags:
if self.searchTags(tag, tags): if tag in tags:
return True return True
def deinflect(self, validator, rules): def deinflect(self, validator, rules):
if self.validate(validator): if self.validate(validator):
child = Deinflection(self.term) child = Deinflection(self.term, self.tags)
self.children.append(child) self.children.append(child)
for rule, variants in rules.items(): for rule, variants in rules.items():
for variant in variants: for v in variants:
tagsIn = variant['tagsIn'] tagsIn = v['tagsIn']
tagsOut = variant['tagsOut'] tagsOut = v['tagsOut']
kanaIn = variant['kanaIn'] kanaIn = v['kanaIn']
kanaOut = variant['kanaOut'] kanaOut = v['kanaOut']
allowed = len(self.tags) == 0 allowed = len(self.tags) == 0
for tag in self.tags: for tag in self.tags:
if self.searchTags(tag, tagsIn): #
# TODO: Handle addons through tags.json or rules.json
#
if tag in tagsIn:
allowed = True allowed = True
break break
@ -66,40 +64,29 @@ class Deinflection:
continue continue
term = self.term[:-len(kanaIn)] + kanaOut term = self.term[:-len(kanaIn)] + kanaOut
child = Deinflection(term, tagsOut, rule) child = Deinflection(term, tagsOut, rule)
if child.deinflect(validator, rules): if child.deinflect(validator, rules):
self.children.append(child) self.children.append(child)
if len(self.children) > 0: return len(self.children) > 0
return True
def searchTags(self, tag, tags):
for t in tags:
if re.search(tag, t):
return True
def gather(self): def gather(self):
if len(self.children) == 0: if len(self.children) == 0:
return [{'root': self.term, 'rules': list()}] return [{'root': self.term, 'tags': self.tags, 'rules': []}]
paths = list() paths = []
for child in self.children: for child in self.children:
for path in child.gather(): for path in child.gather():
if self.rule: if self.rule:
path['rules'].append(self.rule) path['rules'].append(self.rule)
path['source'] = self.term path['source'] = self.term
paths.append(path) paths.append(path)
return paths return paths
#
# Deinflector
#
class Deinflector: class Deinflector:
def __init__(self, filename): def __init__(self, filename):
with codecs.open(filename, 'rb', 'utf-8') as fp: with codecs.open(filename, 'rb', 'utf-8') as fp:

View File

@ -26,7 +26,7 @@ class Dictionary:
self.indices = set() self.indices = set()
def findTerm(self, word, wildcards=False): def findTerm(self, text, wildcards=False):
self.requireIndex('Vocab', 'expression') self.requireIndex('Vocab', 'expression')
self.requireIndex('Vocab', 'reading') self.requireIndex('Vocab', 'reading')
self.requireIndex('VocabGloss', 'vocabId') self.requireIndex('VocabGloss', 'vocabId')
@ -34,30 +34,45 @@ class Dictionary:
cursor = self.db.cursor() cursor = self.db.cursor()
definitions = [] definitions = []
cursor.execute('SELECT * FROM Vocab WHERE expression {0} ? OR reading=?'.format('LIKE' if wildcards else '='), (word, word)) cursor.execute('SELECT * FROM Vocab WHERE expression {0} ? OR reading=?'.format('LIKE' if wildcards else '='), (text, text))
for vocabId, expression, reading, tags in cursor: for vocabId, expression, reading, tags in cursor.fetchall():
tags = tags.split()
cursor.execute('SELECT glossary From VocabGloss WHERE vocabId=?', (vocabId,)) cursor.execute('SELECT glossary From VocabGloss WHERE vocabId=?', (vocabId,))
glossary = map(operator.itemgetter(0), cursor) glossary = map(operator.itemgetter(0), cursor)
#
# TODO: Handle addons through data.
#
addons = []
for tag in tags:
if tag.startswith('v5') and tag != 'v5':
addons.append('v5')
elif tag.startswith('vs-'):
addons.append('vs')
definitions.append({ definitions.append({
'id': vocabId,
'expression': expression, 'expression': expression,
'reading': reading, 'reading': reading,
'tags': tags.split(), 'glossary': '; '.join(glossary),
'glossary': '; '.join(glossary) 'tags': tags + addons,
'addons': addons
}) })
return definitions return definitions
def findCharacter(self, character): def findKanji(self, text):
assert len(character) == 1 assert len(text) == 1
self.requireIndex('Kanji', 'character') self.requireIndex('Kanji', 'character')
self.requireIndex('KanjiGloss', 'kanjiId') self.requireIndex('KanjiGloss', 'kanjiId')
cursor = self.db.cursor() cursor = self.db.cursor()
cursor.execute('SELECT * FROM Kanji WHERE character=? LIMIT 1', character) cursor.execute('SELECT * FROM Kanji WHERE character=? LIMIT 1', text)
query = cursor.fetchone() query = cursor.fetchone()
if query is None: if query is None:
return return
@ -67,6 +82,7 @@ class Dictionary:
glossary = map(operator.itemgetter(0), cursor) glossary = map(operator.itemgetter(0), cursor)
return { return {
'id': kanjiId,
'character': character, 'character': character,
'kunyomi': kunyomi, 'kunyomi': kunyomi,
'onyomi': onyomi, 'onyomi': onyomi,

View File

@ -17,7 +17,6 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
import operator
import util import util
@ -29,36 +28,36 @@ class Translator:
def findTerm(self, text, wildcards=False): def findTerm(self, text, wildcards=False):
text = util.sanitize(text, wildcards=wildcards) text = util.sanitize(text, wildcards=wildcards)
groups = {}
groups = {}
for i in xrange(len(text), 0, -1): for i in xrange(len(text), 0, -1):
term = text[:i] term = text[:i]
deinflections = self.deinflector.deinflect(term, self.validator)
if deinflections is None:
self.processTerm(groups, term, wildcards=wildcards)
else:
for deinflection in deinflections:
self.processTerm(groups, **deinflection)
results = map(self.formatResult, groups.items()) dfs = self.deinflector.deinflect(term, lambda term: [d['tags'] for d in self.dictionary.findTerm(term)])
results = filter(operator.truth, results) if dfs is None:
results = sorted(results, key=lambda d: (len(d['source']), 'P' in d['tags'], -len(d['rules'])), reverse=True) continue
for df in dfs:
self.processTerm(groups, **df)
definitions = groups.values()
definitions = sorted(definitions, key=lambda d: (len(d['source']), 'P' in d['tags'], -len(d['rules'])), reverse=True)
length = 0 length = 0
for result in results: for result in definitions:
length = max(length, len(result['source'])) length = max(length, len(result['source']))
return results, length return definitions, length
def findCharacters(self, text): def findCharacters(self, text):
text = util.sanitize(text, kana=False) text = util.sanitize(text, kana=False)
processed = {} processed = {}
results = [] results = []
for c in text: for c in text:
if c not in processed: if c not in processed:
match = self.dictionary.findCharacter(c) match = self.dictionary.findKanji(c)
if match is not None: if match is not None:
results.append(match) results.append(match)
processed[c] = match processed[c] = match
@ -66,26 +65,23 @@ class Translator:
return results return results
def processTerm(self, groups, source, rules=list(), root=str(), wildcards=False): def processTerm(self, groups, source, tags, rules=[], root='', wildcards=False):
root = root or source
for entry in self.dictionary.findTerm(root, wildcards): for entry in self.dictionary.findTerm(root, wildcards):
key = entry['expression'], entry['reading'], entry['glossary'] if entry['id'] in groups:
if key not in groups: continue
groups[key] = entry['tags'], source, rules
matched = len(tags) == 0
for tag in tags:
if tag in entry['tags']:
matched = True
break
def formatResult(self, group): if matched:
(expression, reading, glossary), (tags, source, rules) = group groups[entry['id']] = {
return { 'expression': entry['expression'],
'expression': expression, 'reading': entry['reading'],
'glossary': glossary, 'glossary': entry['glossary'],
'reading': reading, 'tags': entry['tags'],
'rules': rules,
'source': source, 'source': source,
'tags': tags 'rules': rules
} }
def validator(self, term):
return [d['tags'] for d in self.dictionary.findTerm(term)]