1
yomichan-anki/languages/japanese/deinflect.py
2011-10-08 09:32:51 -07:00

126 lines
3.8 KiB
Python

# -*- coding: utf-8 -*-
# Copyright (C) 2011 Alex Yatskov
# This module is based on Rikaichan code written by Jonathan Zarate
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import codecs
try:
    _text_type = unicode  # Python 2: coerce values to the unicode text type
except NameError:
    _text_type = str  # Python 3: str is already the unicode text type


class Deinflector:
    """Suffix-rewriting deinflector driven by a table of rules.

    Attributes:
        conjugations: list of conjugation names; rules refer to them by index.
        rules: dict mapping a suffix length to the list of Rule objects whose
            source suffix has that length.
    """

    class Rule:
        """One suffix rewrite rule.

        Attributes:
            source: suffix a candidate must end with for the rule to apply.
            target: text that replaces the matched suffix.
            types: bit mask. The rule applies only when it shares at least
                one bit with the candidate's type mask; `types >> 8` becomes
                the type mask of the stem the rule produces.
            reason: index into Deinflector.conjugations.
        """

        def __init__(self, source, target, types, reason):
            self.source = _text_type(source)
            self.target = _text_type(target)
            self.types = int(types)
            self.reason = int(reason)

    class Result:
        """A candidate stem together with how it was derived.

        Attributes:
            stem: the candidate word.
            types: bit mask of the types this candidate may have.
            conjugations: conjugation names, most recently applied rule first.
        """

        def __init__(self, stem, types, conjugations):
            self.stem = _text_type(stem)
            self.types = int(types)
            self.conjugations = list(conjugations)

    def __init__(self, filename=None):
        """Create a deinflector, loading rules from `filename` when given.

        The return value of load() is not inspected here; when loading fails
        the instance is simply left without rules.
        """
        if filename is None:
            self.close()
        else:
            self.load(filename)

    def close(self):
        """Discard all loaded conjugation names and rules."""
        self.conjugations = list()
        self.rules = dict()

    def load(self, filename):
        """Load conjugation names and rules from a UTF-8 text file.

        The first line is a header and is ignored. Every following line is
        stripped and split on tabs: a single field is appended to
        `conjugations`, four fields form a Rule (source, target, types,
        reason). Any previously loaded data is discarded first.

        Returns:
            True on success. False when the file cannot be opened, is empty
            (no header line), or contains a malformed line; in the malformed
            case the instance is reset to the empty state.
        """
        self.close()

        try:
            with codecs.open(filename, 'rb', 'utf-8') as fp:
                lines = [line.strip() for line in fp.readlines()]
        except IOError:
            return False

        # The header line is mandatory; an empty file is treated as invalid
        # instead of failing with an IndexError.
        if not lines:
            return False

        # ignore the first line which is the file header
        for line in lines[1:]:
            fields = line.split('\t')
            fieldCount = len(fields)

            if fieldCount == 1:
                self.conjugations.append(fields[0])
            elif fieldCount == 4:
                try:
                    rule = self.Rule(*fields)
                except ValueError:
                    # Non-numeric types/reason: reject the file the same way
                    # as a line with the wrong number of fields.
                    self.close()
                    return False
                self.rules.setdefault(len(rule.source), list()).append(rule)
            else:
                self.close()
                return False

        return True

    def deinflect(self, word):
        """Return every candidate stem reachable from `word` via the rules.

        Returns:
            A list of (stem, reasons, types) tuples, where `reasons` is the
            comma-separated chain of conjugation names (most recent first).
            The first entry is always `word` itself with types 0xff and an
            empty reason string.
        """
        results = [self.Result(word, 0xff, list())]
        have = {word: 0}  # stem -> index of its entry in `results`

        # Longest suffixes are tried first. The order does not depend on the
        # candidate being processed, so it is computed once.
        groups = sorted(self.rules.items(), key=lambda item: item[0], reverse=True)

        # `results` doubles as the work queue: entries appended below are
        # visited by this same loop later on.
        for result in results:
            stem = result.stem
            for length, group in groups:
                if length > len(stem):
                    continue

                suffix = stem[-length:]
                for rule in group:
                    if (result.types & rule.types) == 0 or suffix != rule.source:
                        continue

                    new = stem[:len(stem) - len(rule.source)] + rule.target
                    if len(new) <= 1:
                        continue

                    if new in have:
                        # Stem already recorded: only widen its type mask.
                        # A separate name is used on purpose; rebinding
                        # `result` here would make the remaining rules of
                        # this pass run against the wrong candidate.
                        existing = results[have[new]]
                        existing.types |= (rule.types >> 8)
                        continue

                    have[new] = len(results)
                    conjugations = [self.conjugations[rule.reason]] + result.conjugations
                    results.append(self.Result(new, rule.types >> 8, conjugations))

        return [
            (result.stem, u', '.join(result.conjugations), result.types)
            for result in results
        ]

    def validate(self, types, tags):
        """Return True when any tag in `tags` is allowed by the `types` mask.

        Bit to tag mapping: 1 -> 'v1', 2 -> tags starting with 'v5',
        4 -> 'adj-i', 8 -> 'vk', 16 -> tags starting with 'vs-'.
        """
        for tag in tags:
            valid = (
                (types & 1 and tag == 'v1') or
                (types & 2 and tag[:2] == 'v5') or
                (types & 4 and tag == 'adj-i') or
                (types & 8 and tag == 'vk') or
                (types & 16 and tag[:3] == 'vs-')
            )
            if valid:
                return True

        return False