#!/usr/bin/env python
#
# Copyright (C) 2011  Alex Yatskov
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see http://www.gnu.org/licenses/.
#

"""Build an SQLite database from KANJIDIC, KRADFILE and EDICT text files.

The script is written to run unchanged on Python 2.7 and Python 3.
"""

import codecs
import io
import optparse
import os
import re
import sqlite3
import sys


GRAMMAR_TAGS = {
    'adj',      # former adjective classification (being removed)
    'adj-f',    # noun or verb acting prenominally (other than the above)
    'adj-i',    # adjective (keiyoushi)
    'adj-na',   # adjectival nouns or quasi-adjectives (keiyodoshi)
    'adj-no',   # nouns which may take the genitive case particle `no'
    'adj-pn',   # pre-noun adjectival (rentaishi)
    'adj-t',    # `taru' adjective
    'adv',      # adverb (fukushi)
    'adv-n',    # adverbial noun
    'adv-to',   # adverb taking the `to' particle
    'aux',      # auxiliary
    'aux-adj',  # auxiliary adjective
    'aux-v',    # auxiliary verb
    'conj',     # conjunction
    'ctr',      # counter
    'exp',      # Expressions (phrases, clauses, etc.)
    'int',      # interjection (kandoushi)
    'iv',       # irregular verb
    'n',        # noun (common) (futsuumeishi)
    'n-adv',    # adverbial noun (fukushitekimeishi)
    'n-pref',   # noun, used as a prefix
    'n-suf',    # noun, used as a suffix
    'n-t',      # noun (temporal) (jisoumeishi)
    'num',      # numeric
    'pn',       # pronoun
    'pref',     # prefix
    'prt',      # particle
    'suf',      # suffix
    'v1',       # Ichidan verb
    'v2a-s',    # Nidan verb with 'u' ending (archaic)
    'v4h',      # Yodan verb with `hu/fu' ending (archaic)
    'v4r',      # Yodan verb with `ru' ending (archaic)
    'v5',       # Godan verb (not completely classified)
    'v5aru',    # Godan verb - -aru special class
    'v5b',      # Godan verb with `bu' ending
    'v5g',      # Godan verb with `gu' ending
    'v5k',      # Godan verb with `ku' ending
    'v5k-s',    # Godan verb - iku/yuku special class
    'v5m',      # Godan verb with `mu' ending
    'v5n',      # Godan verb with `nu' ending
    'v5r',      # Godan verb with `ru' ending
    'v5r-i',    # Godan verb with `ru' ending (irregular verb)
    'v5s',      # Godan verb with `su' ending
    'v5t',      # Godan verb with `tsu' ending
    'v5u',      # Godan verb with `u' ending
    'v5u-s',    # Godan verb with `u' ending (special class)
    'v5uru',    # Godan verb - uru old class verb (old form of Eru)
    'v5z',      # Godan verb with `zu' ending
    'vi',       # intransitive verb
    'vk',       # kuru verb - special class
    'vn',       # irregular nu verb
    'vs',       # noun or participle which takes the aux. verb suru
    'vs-c',     # su verb - precursor to the modern suru
    'vs-i',     # suru verb - irregular
    'vs-s',     # suru verb - special class
    'vt',       # transitive verb
    'vz',       # Ichidan verb - zuru verb - (alternative form of -jiru verbs)
}

# Patterns are compiled once here instead of once per dictionary line.
_MEANING_RE = re.compile(r'\{([^\}]+)\}')       # {meaning} in a KANJIDIC line
_READING_RE = re.compile(r'\[([^\]]+)\]')       # [reading] in an EDICT head
_SENSE_NUMBER_RE = re.compile(r'\(\d+\)\s*')    # (1), (2), ... sense markers
_TAG_GROUP_RE = re.compile(r'\(([^\)\]]+)\)')   # (a,b,c) parenthesised groups


def isHiragana(c):
    """Return True if the code point of character c is in [0x3040, 0x30a0)."""
    return 0x3040 <= ord(c) < 0x30a0


def isKatakana(c):
    """Return True if the code point of character c is in [0x30a0, 0x3100)."""
    return 0x30a0 <= ord(c) < 0x3100


def loadDefinitions(path):
    """Read an EUC-JP encoded text file and return its meaningful lines.

    A progress message naming the file is printed first.  The result is a
    list of the file's lines with empty lines and lines starting with '#'
    removed.
    """
    # A single parenthesised argument prints identically on Python 2 and 3.
    print('Parsing "{0}"...'.format(path))

    # io.open decodes to unicode text on both Python 2 and Python 3.
    with io.open(path, encoding='euc-jp') as fp:
        lines = fp.read().splitlines()

    # Build a real list: on Python 3 filter() would return a lazy iterator.
    return [line for line in lines if line and not line.startswith('#')]


def parseKanjiDic(path):
    """Parse a KANJIDIC file into a list of tuples.

    Each tuple is (character, onYomi, kunYomi, meanings).  The first
    whitespace-separated field of a line is the character.  Of the remaining
    fields, those containing at least one katakana character are joined with
    ', ' into onYomi, and those containing at least one hiragana character
    are joined with ', ' into kunYomi.  The texts found inside {braces} are
    joined with '; ' into meanings.  Lines holding only whitespace are
    skipped.
    """
    results = []
    for line in loadDefinitions(path):
        segments = line.split()
        if not segments:
            continue

        character = segments[0]
        fields = segments[1:]

        # any() is used because a nested filter() object is always truthy on
        # Python 3, which would classify every field as a reading.
        kunYomi = ', '.join(
            field for field in fields if any(isHiragana(c) for c in field))
        onYomi = ', '.join(
            field for field in fields if any(isKatakana(c) for c in field))
        meanings = '; '.join(_MEANING_RE.findall(line))

        results.append((character, onYomi, kunYomi, meanings))

    return results


def writeKanjiDic(cursor, values):
    """Recreate the Kanji table and fill it with parseKanjiDic() tuples.

    values is an iterable of (character, onYomi, kunYomi, meanings) tuples.
    """
    cursor.execute('DROP TABLE IF EXISTS Kanji')
    cursor.execute(
        'CREATE TABLE Kanji('
        'character TEXT, kunYomi TEXT, onYomi TEXT, meanings TEXT)')

    # The columns are named explicitly because the tuples carry onYomi before
    # kunYomi while the table declares kunYomi first; a positional insert
    # would store each reading list under the other column.
    cursor.executemany(
        'INSERT INTO Kanji(character, onYomi, kunYomi, meanings) '
        'VALUES(?, ?, ?, ?)',
        values)


def parseKradFile(path):
    """Parse a KRADFILE into a list of (character, radicals) tuples.

    Each line is split on single spaces.  The first field is the character,
    the second field is discarded, and the remaining fields are joined with
    ', ' into radicals.  Lines whose first field is empty are skipped.
    """
    results = []
    for line in loadDefinitions(path):
        segments = line.split(' ')
        character = segments[0]
        if not character:
            continue

        radicals = ', '.join(segments[2:])
        results.append((character, radicals))

    return results


def writeKradFile(cursor, values):
    """Recreate the Radicals table and fill it with parseKradFile() tuples."""
    cursor.execute('DROP TABLE IF EXISTS Radicals')
    cursor.execute('CREATE TABLE Radicals(character TEXT, radicals TEXT)')
    cursor.executemany('INSERT INTO Radicals VALUES(?, ?)', values)


def parseEdict(path):
    """Parse an EDICT file into a list of tuples.

    Each tuple is (term, reading, definitions, tags):

    * term is the text before the first space of the part of the line that
      precedes the first '/'.
    * reading is the text inside [brackets] in the field following the term,
      or None when that field holds no bracketed text.
    * definitions is the non-empty '/'-separated glosses joined with '; ',
      with numeric sense markers such as '(1)' removed.
    * tags is the sorted, ', '-joined set of GRAMMAR_TAGS members found in
      the comma-separated parenthesised groups of definitions.

    Lines with an empty term are skipped.
    """
    results = []
    for line in loadDefinitions(path):
        segments = line.split('/')
        expression = segments[0].split(' ')

        term = expression[0]
        if not term:
            continue

        # A malformed head without any space has no second field at all.
        readingField = expression[1] if len(expression) > 1 else ''
        match = _READING_RE.search(readingField)
        reading = None if match is None else match.group(1)

        definitions = '; '.join(gloss for gloss in segments[1:] if gloss)
        definitions = _SENSE_NUMBER_RE.sub('', definitions)

        found = set()
        for group in _TAG_GROUP_RE.findall(definitions):
            found.update(group.split(','))
        tags = ', '.join(sorted(found.intersection(GRAMMAR_TAGS)))

        results.append((term, reading, definitions, tags))

    return results


def writeEdict(cursor, values):
    """Recreate the Terms table and fill it with parseEdict() tuples."""
    cursor.execute('DROP TABLE IF EXISTS Terms')
    cursor.execute(
        'CREATE TABLE Terms('
        'expression TEXT, reading TEXT, definitions TEXT, tags TEXT)')
    cursor.executemany('INSERT INTO Terms VALUES(?, ?, ?, ?)', values)


def build(path, kanjidic, kradfile, edict):
    """Write the selected dictionaries into the SQLite database at path.

    kanjidic, kradfile and edict are source file paths; a source whose
    argument is None is skipped and its table is left untouched.  All writes
    happen in one transaction, and the connection is closed before
    returning, also when parsing or writing raises.
    """
    db = sqlite3.connect(path)
    try:
        # The connection's context manager commits on success and rolls back
        # on error, but it does not close the connection.
        with db:
            cursor = db.cursor()

            if kanjidic is not None:
                writeKanjiDic(cursor, parseKanjiDic(kanjidic))

            if kradfile is not None:
                writeKradFile(cursor, parseKradFile(kradfile))

            if edict is not None:
                writeEdict(cursor, parseEdict(edict))
    finally:
        db.close()


def main(argv=None):
    """Command-line entry point.

    argv is the list of arguments to parse; when it is None the process
    arguments are used.  The first positional argument is the database path.
    Usage help is printed when no positional argument is given.
    """
    parser = optparse.OptionParser(usage='%prog [options] DATABASE')
    parser.add_option('--kanjidic', dest='kanjidic')
    parser.add_option('--kradfile', dest='kradfile')
    parser.add_option('--edict', dest='edict')

    options, args = parser.parse_args(argv)

    if not args:
        parser.print_help()
    else:
        build(args[0], options.kanjidic, options.kradfile, options.edict)


if __name__ == '__main__':
    main()