Fixing deinflection bugs
This commit is contained in:
parent
83fb78ba28
commit
66bf7b895f
File diff suppressed because it is too large
Load Diff
@ -18,20 +18,14 @@
|
||||
|
||||
import codecs
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
#
|
||||
# Deinflection
|
||||
#
|
||||
|
||||
class Deinflection:
|
||||
def __init__(self, term, tags=None, rule=''):
    """Create a node in the deinflection tree.

    term -- the (possibly already deinflected) text this node represents.
    tags -- iterable of tag strings attached to this node; omitted or
            None means "no tags".
    rule -- name of the rule that produced this node's term; the empty
            string means no rule was applied.
    """
    # Child nodes are attached later by deinflect().
    self.children = []
    self.term = term
    # Copy so that every node owns its tag list: a shared default list (or
    # a list owned by the caller) must never be aliased between nodes.
    self.tags = [] if tags is None else list(tags)
    self.rule = rule
|
||||
|
||||
|
||||
def validate(self, validator):
|
||||
@ -40,25 +34,29 @@ class Deinflection:
|
||||
return True
|
||||
|
||||
for tag in self.tags:
|
||||
if self.searchTags(tag, tags):
|
||||
if tag in tags:
|
||||
return True
|
||||
|
||||
|
||||
def deinflect(self, validator, rules):
|
||||
if self.validate(validator):
|
||||
child = Deinflection(self.term)
|
||||
child = Deinflection(self.term, self.tags)
|
||||
self.children.append(child)
|
||||
|
||||
for rule, variants in rules.items():
|
||||
for variant in variants:
|
||||
tagsIn = variant['tagsIn']
|
||||
tagsOut = variant['tagsOut']
|
||||
kanaIn = variant['kanaIn']
|
||||
kanaOut = variant['kanaOut']
|
||||
for v in variants:
|
||||
tagsIn = v['tagsIn']
|
||||
tagsOut = v['tagsOut']
|
||||
kanaIn = v['kanaIn']
|
||||
kanaOut = v['kanaOut']
|
||||
|
||||
allowed = len(self.tags) == 0
|
||||
for tag in self.tags:
|
||||
if self.searchTags(tag, tagsIn):
|
||||
#
|
||||
# TODO: Handle addons through tags.json or rules.json
|
||||
#
|
||||
|
||||
if tag in tagsIn:
|
||||
allowed = True
|
||||
break
|
||||
|
||||
@ -66,40 +64,29 @@ class Deinflection:
|
||||
continue
|
||||
|
||||
term = self.term[:-len(kanaIn)] + kanaOut
|
||||
|
||||
child = Deinflection(term, tagsOut, rule)
|
||||
if child.deinflect(validator, rules):
|
||||
self.children.append(child)
|
||||
|
||||
if len(self.children) > 0:
|
||||
return True
|
||||
|
||||
|
||||
def searchTags(self, tag, tags):
|
||||
for t in tags:
|
||||
if re.search(tag, t):
|
||||
return True
|
||||
return len(self.children) > 0
|
||||
|
||||
|
||||
def gather(self):
    """Flatten the subtree rooted at this node into a list of path dicts.

    A node without children yields a single dict with the keys 'root'
    (this node's term), 'tags' (a copy of this node's tags) and 'rules'
    (an empty list).

    A node with children returns every path gathered from its children.
    On the way back up, each node appends its own rule (when it has one)
    to the path's 'rules' list and overwrites 'source' with its own term,
    so 'source' ends up holding the term of the node gather() was first
    called on.
    """
    if not self.children:
        # Hand out a copy of the tags so callers cannot mutate this node.
        return [{'root': self.term, 'tags': list(self.tags), 'rules': []}]

    paths = []
    for child in self.children:
        for path in child.gather():
            if self.rule:
                path['rules'].append(self.rule)

            path['source'] = self.term
            paths.append(path)

    return paths
|
||||
|
||||
|
||||
#
|
||||
# Deinflector
|
||||
#
|
||||
|
||||
class Deinflector:
|
||||
def __init__(self, filename):
|
||||
with codecs.open(filename, 'rb', 'utf-8') as fp:
|
||||
|
@ -26,7 +26,7 @@ class Dictionary:
|
||||
self.indices = set()
|
||||
|
||||
|
||||
def findTerm(self, word, wildcards=False):
|
||||
def findTerm(self, text, wildcards=False):
|
||||
self.requireIndex('Vocab', 'expression')
|
||||
self.requireIndex('Vocab', 'reading')
|
||||
self.requireIndex('VocabGloss', 'vocabId')
|
||||
@ -34,30 +34,45 @@ class Dictionary:
|
||||
cursor = self.db.cursor()
|
||||
|
||||
definitions = []
|
||||
cursor.execute('SELECT * FROM Vocab WHERE expression {0} ? OR reading=?'.format('LIKE' if wildcards else '='), (word, word))
|
||||
for vocabId, expression, reading, tags in cursor:
|
||||
cursor.execute('SELECT * FROM Vocab WHERE expression {0} ? OR reading=?'.format('LIKE' if wildcards else '='), (text, text))
|
||||
for vocabId, expression, reading, tags in cursor.fetchall():
|
||||
tags = tags.split()
|
||||
|
||||
cursor.execute('SELECT glossary From VocabGloss WHERE vocabId=?', (vocabId,))
|
||||
glossary = map(operator.itemgetter(0), cursor)
|
||||
|
||||
#
|
||||
# TODO: Handle addons through data.
|
||||
#
|
||||
|
||||
addons = []
|
||||
for tag in tags:
|
||||
if tag.startswith('v5') and tag != 'v5':
|
||||
addons.append('v5')
|
||||
elif tag.startswith('vs-'):
|
||||
addons.append('vs')
|
||||
|
||||
definitions.append({
|
||||
'id': vocabId,
|
||||
'expression': expression,
|
||||
'reading': reading,
|
||||
'tags': tags.split(),
|
||||
'glossary': '; '.join(glossary)
|
||||
'glossary': '; '.join(glossary),
|
||||
'tags': tags + addons,
|
||||
'addons': addons
|
||||
})
|
||||
|
||||
return definitions
|
||||
|
||||
|
||||
def findCharacter(self, character):
|
||||
assert len(character) == 1
|
||||
def findKanji(self, text):
|
||||
assert len(text) == 1
|
||||
|
||||
self.requireIndex('Kanji', 'character')
|
||||
self.requireIndex('KanjiGloss', 'kanjiId')
|
||||
|
||||
cursor = self.db.cursor()
|
||||
|
||||
cursor.execute('SELECT * FROM Kanji WHERE character=? LIMIT 1', character)
|
||||
cursor.execute('SELECT * FROM Kanji WHERE character=? LIMIT 1', text)
|
||||
query = cursor.fetchone()
|
||||
if query is None:
|
||||
return
|
||||
@ -67,6 +82,7 @@ class Dictionary:
|
||||
glossary = map(operator.itemgetter(0), cursor)
|
||||
|
||||
return {
|
||||
'id': kanjiId,
|
||||
'character': character,
|
||||
'kunyomi': kunyomi,
|
||||
'onyomi': onyomi,
|
||||
|
@ -17,7 +17,6 @@
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
import operator
|
||||
import util
|
||||
|
||||
|
||||
@ -28,37 +27,37 @@ class Translator:
|
||||
|
||||
|
||||
def findTerm(self, text, wildcards=False):
    """Find dictionary definitions for every leading substring of text.

    The text is sanitized first, then each prefix (longest first) is run
    through the deinflector; every deinflection it reports is handed to
    processTerm(), which collects matching entries into a shared dict.

    Returns a (definitions, length) tuple. definitions is sorted so that
    entries with a longer 'source' come first, then entries tagged 'P',
    then entries with fewer 'rules'. length is the longest 'source'
    length among the definitions, or 0 when nothing was found.
    """
    text = util.sanitize(text, wildcards=wildcards)

    # Built once instead of on every pass: given a candidate term, report
    # the tag list of each dictionary entry found for it.
    def validator(candidate):
        return [d['tags'] for d in self.dictionary.findTerm(candidate)]

    groups = {}
    for i in xrange(len(text), 0, -1):
        dfs = self.deinflector.deinflect(text[:i], validator)
        if dfs is None:
            continue

        # NOTE(review): `wildcards` is only used for sanitizing; it is not
        # forwarded to processTerm() here -- confirm that is intended.
        for df in dfs:
            self.processTerm(groups, **df)

    definitions = sorted(
        groups.values(),
        key=lambda d: (len(d['source']), 'P' in d['tags'], -len(d['rules'])),
        reverse=True
    )

    # The primary sort key is the source length in descending order, so the
    # first definition (if any) carries the longest source.
    length = len(definitions[0]['source']) if definitions else 0

    return definitions, length
|
||||
|
||||
|
||||
def findCharacters(self, text):
|
||||
text = util.sanitize(text, kana=False)
|
||||
text = util.sanitize(text, kana=False)
|
||||
|
||||
processed = {}
|
||||
results = []
|
||||
|
||||
for c in text:
|
||||
if c not in processed:
|
||||
match = self.dictionary.findCharacter(c)
|
||||
match = self.dictionary.findKanji(c)
|
||||
if match is not None:
|
||||
results.append(match)
|
||||
processed[c] = match
|
||||
@ -66,26 +65,23 @@ class Translator:
|
||||
return results
|
||||
|
||||
|
||||
def processTerm(self, groups, source, tags, rules=None, root='', wildcards=False):
    """Look up root in the dictionary and record matching entries in groups.

    groups    -- dict keyed by entry id; updated in place. An id that is
                 already present is never overwritten.
    source    -- the text the lookup originated from; stored on each result.
    tags      -- tags required by the deinflection. An entry matches when
                 this is empty or when it shares at least one tag with it.
    rules     -- rule names applied to reach root; a copy is stored on each
                 result. Omitted or None means no rules.
    root      -- the term to search for; when empty, source is searched.
    wildcards -- passed through to the dictionary lookup.
    """
    # Without an explicit root the source text itself is the search term;
    # searching for the empty string is never useful.
    root = root or source
    rules = () if rules is None else rules

    for entry in self.dictionary.findTerm(root, wildcards):
        if entry['id'] in groups:
            continue

        if tags and not any(tag in entry['tags'] for tag in tags):
            continue

        groups[entry['id']] = {
            'expression': entry['expression'],
            'reading': entry['reading'],
            'glossary': entry['glossary'],
            'tags': entry['tags'],
            'source': source,
            # Each result owns its list so results never alias each other
            # or the caller's argument.
            'rules': list(rules)
        }
|
||||
|
Loading…
x
Reference in New Issue
Block a user