# -*- coding: utf-8 -*-

# Copyright (C) 2011 Alex Yatskov
# This module is based on Rikaichan code written by Jonathan Zarate
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


import re
from operator import itemgetter


class Translator:
    def __init__(self, deinflector, dictionary):
        self.deinflector = deinflector
        self.dictionary = dictionary


    def wordSearch(self, word, limit, group):
        # Normalize any katakana in the input to hiragana before lookup.
        source, indices = self.convertKatakana(word)

        groups = dict()
        length = 0
        count = 0

        # Trim the source one character at a time, deinflecting each prefix
        # and looking the candidate stems up in the dictionary.
        while source:
            for i, (stem, conjugations, types) in enumerate(self.deinflector.deinflect(source)):
                for expression, reading, glossary in self.dictionary.findWord(stem):
                    if count >= limit:
                        break

                    # Deinflected candidates (i > 0) must be consistent with
                    # the part-of-speech tags embedded in the glossary.
                    if i > 0:
                        tags = re.split('[,()]', glossary)
                        if not self.deinflector.validate(types, tags):
                            continue

                    # Track the longest match in terms of the original input.
                    length = max(length, indices[len(source) - 1] + 1)

                    key = (expression, glossary, conjugations, source)
                    if key in groups:
                        readings = groups[key][0]
                        if reading and reading not in readings:
                            readings.append(reading)
                            if not group:
                                count += 1
                    else:
                        readings = [reading] if reading else list()
                        groups[key] = (readings, count)
                        count += 1

            source = source[:-1]

        # Expand the grouped matches into result tuples, sort them by
        # discovery order, then drop the sort key.
        results = list()
        for (expression, glossary, conjugations, source), (readings, count) in groups.items():
            if group:
                readings = u', '.join(readings)
                results.append((expression, readings, glossary, conjugations, source, count))
            elif readings:
                for reading in readings:
                    results.append((expression, reading, glossary, conjugations, source, count))
            else:
                results.append((expression, unicode(), glossary, conjugations, source, count))

        results = [result[:5] for result in sorted(results, key=itemgetter(5))]
        return results, length


    def convertKatakana(self, word):
        kanaHalf = [
            0x3092, 0x3041, 0x3043, 0x3045, 0x3047, 0x3049, 0x3083, 0x3085,
            0x3087, 0x3063, 0x30fc, 0x3042, 0x3044, 0x3046, 0x3048, 0x304a,
            0x304b, 0x304d, 0x304f, 0x3051, 0x3053, 0x3055, 0x3057, 0x3059,
            0x305b, 0x305d, 0x305f, 0x3061, 0x3064, 0x3066, 0x3068, 0x306a,
            0x306b, 0x306c, 0x306d, 0x306e, 0x306f, 0x3072, 0x3075, 0x3078,
            0x307b, 0x307e, 0x307f, 0x3080, 0x3081, 0x3082, 0x3084, 0x3086,
            0x3088, 0x3089, 0x308a, 0x308b, 0x308c, 0x308d, 0x308f, 0x3093
        ]
        kanaVoiced = [
            0x30f4, 0xff74, 0xff75, 0x304c, 0x304e, 0x3050, 0x3052, 0x3054,
            0x3056, 0x3058, 0x305a, 0x305c, 0x305e, 0x3060, 0x3062, 0x3065,
            0x3067, 0x3069, 0xff85, 0xff86, 0xff87, 0xff88, 0xff89, 0x3070,
            0x3073, 0x3076, 0x3079, 0x307c
        ]
        kanaSemiVoiced = [
            0x3071, 0x3074, 0x3077, 0x307a, 0x307d
        ]

        indices = dict()
        result = unicode()
        ordPrev = 0

        for i, char in enumerate(word):
            ordCurr = ord(char)

            if ordCurr <= 0x3000: # break upon hitting non-japanese characters
                break

            if 0x30a1 <= ordCurr <= 0x30f3: # full-width katakana to hiragana
                ordCurr -= 0x60
            elif 0xff66 <= ordCurr <= 0xff9d: # half-width katakana to hiragana
                ordCurr = kanaHalf[ordCurr - 0xff66]
            elif ordCurr == 0xff9e: # voiced (used in half-width katakana) to hiragana
                if 0xff73 <= ordPrev <= 0xff8e:
                    result = result[:-1]
                    ordCurr = kanaVoiced[ordPrev - 0xff73]
            elif ordCurr == 0xff9f: # semi-voiced (used in half-width katakana) to hiragana
                if 0xff8a <= ordPrev <= 0xff8e:
                    result = result[:-1]
                    ordCurr = kanaSemiVoiced[ordPrev - 0xff8a]
            elif ordCurr == 0xff5e: # ignore Japanese ~
                ordPrev = 0
                continue

            indices[len(result)] = i
            result += unichr(ordCurr)
            ordPrev = ord(char)

        return result, indices
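

# Example usage (a minimal sketch, not part of the original module): the
# Deinflector and Dictionary collaborators are defined elsewhere and their
# constructors are not documented here, so they are stubbed out with None.
# convertKatakana does not touch them, so this exercises only the kana
# normalization; wordSearch would need real deinflector/dictionary objects.
if __name__ == '__main__':
    translator = Translator(deinflector=None, dictionary=None)
    converted, indices = translator.convertKatakana(u'テーブルtable')
    print converted.encode('utf-8')  # conversion stops at 'table': てーぶる
    print indices                    # maps offsets in the converted text to input offsets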