1
yomichan-anki/yomi_base/japanese/translate.py

139 lines
5.1 KiB
Python
Raw Normal View History

2011-08-28 18:01:32 +00:00
# -*- coding: utf-8 -*-
2011-10-27 15:22:26 +00:00
2011-08-28 18:01:32 +00:00
# Copyright (C) 2011 Alex Yatskov
# This module is based on Rikaichan code written by Jonathan Zarate
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import re
from operator import itemgetter
try:
    _unichr = unichr  # Python 2: chr() would return a byte string
except NameError:
    _unichr = chr  # Python 3: chr() already returns text


class Translator:
    """Look up dictionary entries for the leading word of a Japanese string.

    The translator is given two collaborators:

    * ``deinflector`` -- must provide ``deinflect(source)``, iterating over
      ``(stem, conjugations, types)`` tuples, and ``validate(types, tags)``.
    * ``dictionary`` -- must provide ``findWord(stem)``, iterating over
      ``(expression, reading, glossary)`` tuples.
    """

    # Delimiters used to split a glossary string into the tag list that is
    # handed to ``deinflector.validate``.
    _TAG_DELIMITERS = re.compile(r'[,()]')

    # Replacement code points for the characters 0xff66..0xff9d, indexed by
    # (code point - 0xff66).
    _KANA_HALF = (
        0x3092, 0x3041, 0x3043, 0x3045, 0x3047, 0x3049, 0x3083, 0x3085,
        0x3087, 0x3063, 0x30fc, 0x3042, 0x3044, 0x3046, 0x3048, 0x304a,
        0x304b, 0x304d, 0x304f, 0x3051, 0x3053, 0x3055, 0x3057, 0x3059,
        0x305b, 0x305d, 0x305f, 0x3061, 0x3064, 0x3066, 0x3068, 0x306a,
        0x306b, 0x306c, 0x306d, 0x306e, 0x306f, 0x3072, 0x3075, 0x3078,
        0x307b, 0x307e, 0x307f, 0x3080, 0x3081, 0x3082, 0x3084, 0x3086,
        0x3088, 0x3089, 0x308a, 0x308b, 0x308c, 0x308d, 0x308f, 0x3093,
    )

    # Replacement used when the voiced mark 0xff9e follows one of the
    # characters 0xff73..0xff8e, indexed by (previous code point - 0xff73).
    _KANA_VOICED = (
        0x30f4, 0xff74, 0xff75, 0x304c, 0x304e, 0x3050, 0x3052, 0x3054,
        0x3056, 0x3058, 0x305a, 0x305c, 0x305e, 0x3060, 0x3062, 0x3065,
        0x3067, 0x3069, 0xff85, 0xff86, 0xff87, 0xff88, 0xff89, 0x3070,
        0x3073, 0x3076, 0x3079, 0x307c,
    )

    # Replacement used when the semi-voiced mark 0xff9f follows one of the
    # characters 0xff8a..0xff8e, indexed by (previous code point - 0xff8a).
    _KANA_SEMI_VOICED = (
        0x3071, 0x3074, 0x3077, 0x307a, 0x307d,
    )

    def __init__(self, deinflector, dictionary):
        """Store the deinflector and dictionary used by the searches."""
        self.deinflector = deinflector
        self.dictionary = dictionary

    def wordSearch(self, word, limit, group):
        """Find dictionary entries matching the longest prefixes of ``word``.

        ``word`` is first normalized with ``convertKatakana``.  The normalized
        text is then searched prefix by prefix, longest first: every stem
        produced by the deinflector is looked up in the dictionary.

        Parameters:
            word:  text to search; only its leading run of characters above
                   U+3000 takes part in the search.
            limit: maximum number of results to collect.
            group: when true, all readings of one entry are merged into a
                   single result (joined with ", "); otherwise each reading
                   becomes a result of its own.

        Returns:
            A ``(results, length)`` tuple.  ``results`` is a list of
            ``(expression, reading, glossary, conjugations, source)`` tuples in
            the order the entries were first found.  ``length`` is the number
            of leading characters of ``word`` covered by the longest prefix
            that produced a match (0 when nothing matched).
        """
        source, indices = self.convertKatakana(word)

        groups = dict()
        length = 0
        count = 0

        # Stop every loop level as soon as the limit is reached: nothing found
        # afterwards could be added, so further deinflection and dictionary
        # lookups would be wasted work.
        while source and count < limit:
            deinflections = self.deinflector.deinflect(source)
            for i, (stem, conjugations, types) in enumerate(deinflections):
                if count >= limit:
                    break

                for expression, reading, glossary in self.dictionary.findWord(stem):
                    if count >= limit:
                        break

                    # The first deinflection is accepted as-is (presumably it
                    # is the unmodified source -- confirm against the
                    # deinflector); later ones must agree with the tags
                    # embedded in the glossary.
                    if i > 0:
                        tags = self._TAG_DELIMITERS.split(glossary)
                        if not self.deinflector.validate(types, tags):
                            continue

                    # Map the end of the normalized prefix back to a position
                    # in the original word.
                    length = max(length, indices[len(source) - 1] + 1)

                    key = (expression, glossary, conjugations, source)
                    if key in groups:
                        readings = groups[key][0]
                        if reading and reading not in readings:
                            readings.append(reading)
                            # Ungrouped output emits one result per reading,
                            # so an extra reading counts against the limit.
                            if not group:
                                count += 1
                    else:
                        readings = [reading] if reading else list()
                        # Remember the discovery order for the final sort.
                        groups[key] = (readings, count)
                        count += 1

            source = source[:-1]

        results = list()
        for (expression, glossary, conjugations, matched), (readings, order) in groups.items():
            if group:
                merged = u', '.join(readings)
                results.append((expression, merged, glossary, conjugations, matched, order))
            elif readings:
                for reading in readings:
                    results.append((expression, reading, glossary, conjugations, matched, order))
            else:
                results.append((expression, u'', glossary, conjugations, matched, order))

        # Restore discovery order (dict iteration order is not relied upon),
        # then drop the ordering key from each result.
        results.sort(key=itemgetter(5))
        return [result[:5] for result in results], length

    def convertKatakana(self, word):
        """Normalize the leading Japanese part of ``word`` to hiragana.

        Conversion stops at the first character whose code point is at or
        below U+3000.  Full-width katakana (0x30a1..0x30f3) and half-width
        katakana (0xff66..0xff9d) are replaced through the lookup tables, a
        half-width voiced / semi-voiced mark is merged with the character
        before it, and the full-width tilde (0xff5e) is dropped.

        Returns:
            A ``(result, indices)`` tuple where ``result`` is the converted
            text and ``indices`` maps each position in ``result`` to the index
            in ``word`` of the last input character that contributed to it.
        """
        indices = dict()
        converted = list()
        ordPrev = 0

        for i, char in enumerate(word):
            ordCurr = ord(char)
            if ordCurr <= 0x3000:
                # break upon hitting non-japanese characters
                break

            if 0x30a1 <= ordCurr <= 0x30f3:
                # full-width katakana to hiragana
                ordCurr -= 0x60
            elif 0xff66 <= ordCurr <= 0xff9d:
                # half-width katakana to hiragana
                ordCurr = self._KANA_HALF[ordCurr - 0xff66]
            elif ordCurr == 0xff9e:
                # voiced (used in half-width katakana) to hiragana; replaces
                # the character emitted for the previous input character
                if 0xff73 <= ordPrev <= 0xff8e:
                    converted.pop()
                    ordCurr = self._KANA_VOICED[ordPrev - 0xff73]
            elif ordCurr == 0xff9f:
                # semi-voiced (used in half-width katakana) to hiragana;
                # replaces the character emitted for the previous one
                if 0xff8a <= ordPrev <= 0xff8e:
                    converted.pop()
                    ordCurr = self._KANA_SEMI_VOICED[ordPrev - 0xff8a]
            elif ordCurr == 0xff5e:
                # ignore Japanese ~ (and forget the previous character so a
                # following voicing mark is not merged across it)
                ordPrev = 0
                continue

            indices[len(converted)] = i
            converted.append(_unichr(ordCurr))
            ordPrev = ord(char)

        return u''.join(converted), indices