# -*- coding: utf-8 -*-

# Copyright (C) 2011  Alex Yatskov
# This module is based on Rikaichan code written by Jonathan Zarate
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import codecs


# The text type is ``unicode`` on Python 2 and ``str`` on Python 3; resolve it
# once so the classes below work on both.
try:
    _text_type = unicode
except NameError:
    _text_type = str


class Deinflector:
    """Rule-based deinflector that rewrites word endings using a rule table.

    The table is read from a tab-separated UTF-8 file (see ``load``) and is
    stored in two attributes:

    * ``conjugations`` -- list of conjugation names; a rule refers to one of
      them by index through its ``reason`` field.
    * ``rules`` -- dict mapping the length of a rule's source ending to the
      list of rules having a source ending of that length.
    """

    class Rule:
        """A single rewrite rule: replace the ending ``source`` by ``target``.

        ``types`` is a bit mask.  It is AND-ed with the type mask of the word
        being deinflected to decide whether the rule applies, and
        ``types >> 8`` becomes the type mask of the word the rule produces.
        ``reason`` is an index into ``Deinflector.conjugations``.
        """

        def __init__(self, source, target, types, reason):
            self.source = _text_type(source)
            self.target = _text_type(target)
            self.types = int(types)
            self.reason = int(reason)

    class Result:
        """A candidate stem with its type mask and the conjugations undone."""

        def __init__(self, stem, types, conjugations):
            self.stem = _text_type(stem)
            self.types = int(types)
            self.conjugations = list(conjugations)

    def __init__(self, filename=None):
        """Create a deinflector, loading rules from ``filename`` if given.

        The return value of ``load`` is not checked here; after a failed load
        the instance may hold empty or partial tables.
        """
        if filename is None:
            self.close()
        else:
            self.load(filename)

    def close(self):
        """Discard all loaded conjugation names and rules."""
        self.conjugations = list()
        self.rules = dict()

    def load(self, filename):
        """Replace the current tables with the contents of ``filename``.

        The first line of the file is treated as a header and ignored.  Every
        other line is split on tabs: a line with one field is appended to
        ``conjugations``; a line with four fields is a rule
        (source, target, types, reason).  Any other field count, or a rule
        whose ``types``/``reason`` is not an integer, makes the file invalid.

        Returns True on success.  Returns False if the file cannot be opened
        or read (in which case the tables are left empty, or partially filled
        if the first failure happens after some lines were stored -- see
        below) or if the file is invalid (tables are emptied).
        """
        self.close()

        try:
            with codecs.open(filename, 'rb', 'utf-8') as fp:
                lines = [line.strip() for line in fp.readlines()]
        except IOError:
            return False

        # Skip the header line; slicing (rather than ``del lines[0]``) keeps
        # an empty file from raising IndexError.
        for line in lines[1:]:
            fields = line.split('\t')
            field_count = len(fields)

            if field_count == 1:
                self.conjugations.append(fields[0])
            elif field_count == 4:
                try:
                    rule = self.Rule(*fields)
                except ValueError:
                    # Non-numeric types/reason column: treat like any other
                    # malformed line instead of letting the error escape.
                    self.close()
                    return False
                self.rules.setdefault(len(rule.source), list()).append(rule)
            else:
                self.close()
                return False

        return True

    def deinflect(self, word):
        """Return every form reachable from ``word`` by applying the rules.

        The result is a list of ``(stem, conjugations, types)`` tuples where
        ``conjugations`` is the chain of conjugation names joined by ``', '``
        (most recently undone first).  The first tuple is always ``word``
        itself with an empty chain and the type mask ``0xff``.

        Raises IndexError if a matching rule's ``reason`` is not a valid index
        into ``conjugations``.
        """
        results = [self.Result(word, 0xff, list())]
        have = {word: 0}  # stem -> index of its entry in ``results``

        # The rule table does not change while deinflecting, so order the
        # groups (longest source ending first) once up front.
        groups = sorted(self.rules.items(), reverse=True)

        # ``results`` is a work list: stems produced below are appended and
        # are themselves processed by later iterations.
        index = 0
        while index < len(results):
            result = results[index]
            index += 1
            stem = result.stem

            for length, group in groups:
                if length > len(stem):
                    continue

                ending = stem[-length:]
                for rule in group:
                    if result.types & rule.types == 0 or ending != rule.source:
                        continue

                    new = stem[:len(stem) - len(rule.source)] + rule.target
                    if len(new) <= 1:
                        continue

                    if new in have:
                        # Already produced: only merge the type bits.  A
                        # separate name is used so that the candidate being
                        # processed is not replaced for the remaining rules.
                        existing = results[have[new]]
                        existing.types |= (rule.types >> 8)
                        continue

                    have[new] = len(results)
                    conjugations = (
                        [self.conjugations[rule.reason]] + result.conjugations
                    )
                    results.append(
                        self.Result(new, rule.types >> 8, conjugations)
                    )

        return [
            (result.stem, u', '.join(result.conjugations), result.types)
            for result in results
        ]

    def validate(self, types, tags):
        """Return True if any tag in ``tags`` is accepted by ``types``.

        Each bit of ``types`` accepts one family of tags:

        * ``1``  -- the tag ``'v1'``
        * ``2``  -- tags starting with ``'v5'``
        * ``4``  -- the tag ``'adj-i'``
        * ``8``  -- the tag ``'vk'``
        * ``16`` -- tags starting with ``'vs-'``
        """
        for tag in tags:
            if types & 1 and tag == 'v1':
                return True
            if types & 2 and tag.startswith('v5'):
                return True
            if types & 4 and tag == 'adj-i':
                return True
            if types & 8 and tag == 'vk':
                return True
            if types & 16 and tag.startswith('vs-'):
                return True

        return False