1

Deinflector now properly working

Former-commit-id: c77faa975904ad9a6560aa8070f6f05f14c45d23
This commit is contained in:
Alex Yatskov 2013-11-09 10:17:53 -08:00
parent 44f12fcf5c
commit e7aeb2f9df

View File

@@ -19,6 +19,7 @@
import codecs import codecs
import json import json
import re
# #
@@ -26,15 +27,14 @@ import json
# #
class Deinflection: class Deinflection:
def __init__(self, term, parent=None, tags=list(), rule=str()): def __init__(self, term, tags=list(), rule=str()):
self.children = list() self.children = list()
self.term = term self.term = term
self.parent = parent
self.tags = tags self.tags = tags
self.rule = rule self.rule = rule
def deinflect(self, validator, rules): def deinflect(self, validator, rules, candidates):
for rule, variants in rules.items(): for rule, variants in rules.items():
for variant in variants: for variant in variants:
tagsIn = variant['tagsIn'] tagsIn = variant['tagsIn']
@@ -42,40 +42,57 @@ class Deinflection:
kanaIn = variant['kanaIn'] kanaIn = variant['kanaIn']
kanaOut = variant['kanaOut'] kanaOut = variant['kanaOut']
allowed = not self.tags allowed = len(self.tags) == 0
for tag in self.tags: for tag in self.tags:
if tag in tagsIn: if self.searchTags(tag, tagsIn):
allowed = True allowed = True
break
if not allowed: if not allowed or not self.term.endswith(kanaIn):
continue continue
for i in xrange(len(kanaIn), len(self.term) + 1): term = self.term[:-len(kanaIn)] + kanaOut
term = self.term[:i] candidates.update([term])
if not term.endswith(kanaIn):
continue
rebase = term[:-len(kanaIn)] + kanaOut child = Deinflection(term, tagsOut, rule)
if validator(rebase, self.tags): if child.deinflect(validator, rules, candidates):
child = Deinflection(rebase, term, tagsOut, rule) self.children.append(child)
self.children.append(child)
child.deinflect(validator, rules) if len(self.children) > 0:
return True
for tags in validator(self.term):
for tag in self.tags:
if self.searchTags(tag, tags):
return True
def dump(self, depth=0): def searchTags(self, tag, tags):
result = u'%s%s' % (u'\t' * depth, self.term) for t in tags:
if self.rule: if re.search(tag, t):
result += u' (%s %s)' % (self.parent, self.rule) return True
result += u'\n'
def gather(self):
if len(self.children) == 0:
endpoint = {
'root': self.term,
'term': self.term,
'rules': [self.rule] if self.rule else list()
}
return [endpoint]
paths = list()
for child in self.children: for child in self.children:
result += child.dump(depth + 1) for path in child.gather():
if self.rule:
path['rules'].append(self.rule)
else:
path['term'] = self.term
paths.append(path)
return result return paths
def __str__(self):
return self.dump()
# #
@@ -89,6 +106,7 @@ class Deinflector:
def deinflect(self, term, validator=lambda term, tags: True): def deinflect(self, term, validator=lambda term, tags: True):
candidates = set()
node = Deinflection(term) node = Deinflection(term)
node.deinflect(validator, self.rules) node.deinflect(validator, self.rules, candidates)
return node return node.gather(), candidates