127 lines
3.8 KiB
Python
127 lines
3.8 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright (C) 2011 Alex Yatskov
|
|
# This module is based on Rikaichan code written by Jonathan Zarate
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
import codecs
|
|
|
|
|
|
try:
    _text_type = unicode  # Python 2: coerce to the Unicode text type
except NameError:
    _text_type = str  # Python 3: str already is the Unicode text type


class Deinflector:
    """Rule-based deinflector that strips inflected endings from a word.

    The rule table is read from a tab-separated text file (see ``load``).
    ``deinflect`` then produces every candidate stem reachable from a word
    by repeatedly applying the rules, and ``validate`` checks a candidate's
    type mask against dictionary tags.
    """

    class Rule:
        """A single suffix rewrite rule.

        Attributes:
            source: suffix that must end the inflected stem.
            target: suffix that replaces ``source``.
            types: bit mask; the rule applies to a result only when
                ``result.types & types`` is non-zero, and ``types >> 8``
                becomes the type mask of the stem it produces.
            reason: index into ``Deinflector.conjugations`` naming the rule.
        """

        def __init__(self, source, target, types, reason):
            self.source = _text_type(source)
            self.target = _text_type(target)
            self.types = int(types)
            self.reason = int(reason)

    class Result:
        """A candidate stem together with how it was derived.

        Attributes:
            stem: the candidate text.
            types: bit mask of the types this candidate may have.
            conjugations: names of the rules applied to reach ``stem``; the
                most recently applied rule comes first.
        """

        def __init__(self, stem, types, conjugations):
            self.stem = _text_type(stem)
            self.types = int(types)
            self.conjugations = list(conjugations)

    def __init__(self, filename=None):
        """Create a deinflector, optionally loading rules from ``filename``.

        With no filename the rule table starts out empty. The return value
        of ``load`` is not checked here, so a failed load also leaves the
        table empty.
        """
        if filename is None:
            self.close()
        else:
            self.load(filename)

    def close(self):
        """Discard all loaded conjugation names and rules."""
        self.conjugations = []
        self.rules = {}

    def load(self, filename):
        """Replace the current rule table with the contents of ``filename``.

        The file is read as UTF-8. Its first line is a header and is
        ignored. Every following line is split on tabs: a line with one
        field is a conjugation name (appended in file order, so rule
        ``reason`` values index into that order); a line with four fields
        is a rule ``source, target, types, reason``.

        Returns:
            True on success. False when the file cannot be read, is empty
            (the header is missing), contains a line with any other field
            count, or contains a rule whose numeric fields are not
            integers. On False the table is left empty.
        """
        self.close()

        try:
            with codecs.open(filename, 'rb', 'utf-8') as fp:
                lines = [line.strip() for line in fp]
        except IOError:
            return False

        # An empty file does not even carry the header line; report it as
        # malformed instead of failing with IndexError.
        if not lines:
            return False

        # Skip the first line, which is the file header.
        for line in lines[1:]:
            fields = line.split('\t')
            field_count = len(fields)

            if field_count == 1:
                self.conjugations.append(fields[0])
            elif field_count == 4:
                try:
                    rule = self.Rule(*fields)
                except ValueError:
                    # Non-numeric types/reason: same outcome as any other
                    # malformed line.
                    self.close()
                    return False
                # Rules are grouped by the length of the suffix they match.
                self.rules.setdefault(len(rule.source), []).append(rule)
            else:
                self.close()
                return False

        return True

    def deinflect(self, word):
        """Return every candidate stem reachable from ``word``.

        Returns:
            A list of ``(stem, reasons, types)`` tuples, where ``reasons``
            is the ``', '``-joined list of applied rule names. The first
            entry is always ``word`` itself with an empty ``reasons`` and a
            type mask of ``0xff``.
        """
        results = [self.Result(word, 0xff, [])]
        # Maps each stem already produced to its index in ``results``.
        have = {word: 0}

        # The rule table does not change while deinflecting, so order the
        # groups (longest suffix first) once instead of once per result.
        groups = sorted(self.rules.items(), reverse=True)

        # ``results`` grows while it is being iterated, so stems produced
        # here are themselves deinflected on later iterations.
        for result in results:
            stem = result.stem

            for length, group in groups:
                if length > len(stem):
                    continue

                for rule in group:
                    # ``result.types`` is re-read for every rule because it
                    # may be widened below when a rule maps back onto an
                    # already-known stem.
                    if result.types & rule.types == 0:
                        continue
                    if stem[-length:] != rule.source:
                        continue

                    new = stem[:len(stem) - len(rule.source)] + rule.target
                    if len(new) <= 1:
                        continue

                    if new in have:
                        # Merge the type bits into the existing entry. A
                        # separate name is used on purpose: rebinding
                        # ``result`` here would make the remaining rules
                        # run against the wrong stem.
                        existing = results[have[new]]
                        existing.types |= (rule.types >> 8)
                        continue

                    have[new] = len(results)

                    conjugations = (
                        [self.conjugations[rule.reason]] + result.conjugations
                    )
                    results.append(
                        self.Result(new, rule.types >> 8, conjugations)
                    )

        return [
            (entry.stem, u', '.join(entry.conjugations), entry.types)
            for entry in results
        ]

    def validate(self, types, tags):
        """Return True when any of ``tags`` is allowed by the ``types`` mask.

        Bit 1 accepts the tag ``'v1'``, bit 2 tags starting with ``'v5'``,
        bit 4 ``'adj-i'``, bit 8 ``'vk'`` and bit 16 tags starting with
        ``'vs-'``. Returns False when no tag matches, including when
        ``tags`` is empty.
        """
        for tag in tags:
            if (
                types & 1 and tag == 'v1' or
                types & 2 and tag.startswith('v5') or
                types & 4 and tag == 'adj-i' or
                types & 8 and tag == 'vk' or
                types & 16 and tag.startswith('vs-')
            ):
                return True

        return False
|