1
yomichan-anki/util/compile.py
Alex Yatskov 1a9dcc69ff Changing the way radicals are stored in the database
Former-commit-id: 11daa98be108a3fbb1e1b6c91aeb824d3380318b
2013-11-08 13:12:00 -08:00

204 lines
6.6 KiB
Python
Executable File

#!/usr/bin/env python
#
# Copyright (C) 2011 Alex Yatskov
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import codecs
import optparse
import os
import re
import sqlite3
import sys
# Part-of-speech markers that parseEdict keeps when it extracts the
# parenthesised tag groups from an entry's definitions; any other
# parenthesised marker is discarded.
GRAMMAR_TAGS = {
    # --- adjectives ---
    'adj',      # former adjective classification (being removed)
    'adj-f',    # noun or verb acting prenominally (other than the above)
    'adj-i',    # adjective (keiyoushi)
    'adj-na',   # adjectival noun or quasi-adjective (keiyodoshi)
    'adj-no',   # noun which may take the genitive case particle 'no'
    'adj-pn',   # pre-noun adjectival (rentaishi)
    'adj-t',    # 'taru' adjective

    # --- adverbs ---
    'adv',      # adverb (fukushi)
    'adv-n',    # adverbial noun
    'adv-to',   # adverb taking the 'to' particle

    # --- auxiliaries ---
    'aux',      # auxiliary
    'aux-adj',  # auxiliary adjective
    'aux-v',    # auxiliary verb

    # --- miscellaneous word classes ---
    'conj',     # conjunction
    'ctr',      # counter
    'exp',      # expression (phrase, clause, etc.)
    'int',      # interjection (kandoushi)
    'iv',       # irregular verb

    # --- nouns ---
    'n',        # common noun (futsuumeishi)
    'n-adv',    # adverbial noun (fukushitekimeishi)
    'n-pref',   # noun used as a prefix
    'n-suf',    # noun used as a suffix
    'n-t',      # temporal noun (jisoumeishi)

    # --- numerals, pronouns, affixes, particles ---
    'num',      # numeric
    'pn',       # pronoun
    'pref',     # prefix
    'prt',      # particle
    'suf',      # suffix

    # --- verbs ---
    'v1',       # Ichidan verb
    'v2a-s',    # Nidan verb with 'u' ending (archaic)
    'v4h',      # Yodan verb with 'hu/fu' ending (archaic)
    'v4r',      # Yodan verb with 'ru' ending (archaic)
    'v5',       # Godan verb (not completely classified)
    'v5aru',    # Godan verb, -aru special class
    'v5b',      # Godan verb with 'bu' ending
    'v5g',      # Godan verb with 'gu' ending
    'v5k',      # Godan verb with 'ku' ending
    'v5k-s',    # Godan verb, iku/yuku special class
    'v5m',      # Godan verb with 'mu' ending
    'v5n',      # Godan verb with 'nu' ending
    'v5r',      # Godan verb with 'ru' ending
    'v5r-i',    # Godan verb with 'ru' ending (irregular verb)
    'v5s',      # Godan verb with 'su' ending
    'v5t',      # Godan verb with 'tsu' ending
    'v5u',      # Godan verb with 'u' ending
    'v5u-s',    # Godan verb with 'u' ending (special class)
    'v5uru',    # Godan verb, uru old class verb (old form of Eru)
    'v5z',      # Godan verb with 'zu' ending
    'vi',       # intransitive verb
    'vk',       # kuru verb, special class
    'vn',       # irregular nu verb
    'vs',       # noun or participle which takes the aux. verb suru
    'vs-c',     # su verb, precursor to the modern suru
    'vs-i',     # suru verb, irregular
    'vs-s',     # suru verb, special class
    'vt',       # transitive verb
    'vz',       # Ichidan verb, zuru verb (alternative form of -jiru verbs)
}
def isHiragana(c):
    """Return True when the single character c lies in U+3040..U+309F.

    ord() is applied to c, so c must be exactly one character long.
    """
    codepoint = ord(c)
    return codepoint >= 0x3040 and codepoint < 0x30a0
def isKatakana(c):
    """Return True when the single character c lies in U+30A0..U+30FF.

    ord() is applied to c, so c must be exactly one character long.
    """
    first, end = 0x30a0, 0x3100  # half-open range [first, end)
    return first <= ord(c) < end
def loadDefinitions(path):
    """Read an EUC-JP encoded dictionary file and return its content lines.

    A progress message naming the file is printed before reading.

    Args:
        path: Location of the dictionary file to read.

    Returns:
        A list of decoded lines in file order, without line terminators,
        excluding empty lines and lines whose first character is '#'.
    """
    # A single parenthesised argument prints identically on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print('Parsing "{0}"...'.format(path))

    with codecs.open(path, encoding='euc-jp') as fp:
        lines = fp.read().splitlines()

    # A comprehension yields a real list on every Python version; filter()
    # only does so on Python 2.
    return [line for line in lines if line and not line.startswith('#')]
def parseKanjiDic(path):
    """Parse a kanji dictionary file into per-character rows.

    For every line returned by loadDefinitions, the first
    whitespace-separated field is taken as the character. Of the remaining
    fields, those containing at least one hiragana character are joined
    with ', ' as the kun readings, and those containing at least one
    katakana character are joined with ', ' as the on readings. Every
    brace-enclosed fragment of the line is collected and joined with '; '
    as the meanings.

    Args:
        path: Location of the dictionary file.

    Returns:
        A list of (character, onYomi, kunYomi, meanings) tuples. Note the
        order: the on readings come before the kun readings.
    """
    # Raw string avoids invalid-escape warnings; compiled once, not per line.
    meaningPattern = re.compile(r'\{([^\}]+)\}')

    results = list()
    for line in loadDefinitions(path):
        segments = line.split()
        if not segments:
            # Whitespace-only line: there is no character field to read.
            continue

        character = segments[0]
        fields = segments[1:]

        # any() gives an explicit boolean. The previous nested filter() was
        # only falsy for "no match" on Python 2, where filtering a string
        # returns a string; on Python 3 the filter object is always truthy.
        kunYomi = ', '.join([x for x in fields if any(isHiragana(c) for c in x)])
        onYomi = ', '.join([x for x in fields if any(isKatakana(c) for c in x)])
        meanings = '; '.join(meaningPattern.findall(line))

        results.append((character, onYomi, kunYomi, meanings))

    return results
def writeKanjiDic(cursor, values):
    """Recreate the Kanji table and fill it with the given rows.

    Any existing Kanji table is dropped first.

    Args:
        cursor: Database cursor on which the statements are executed.
        values: Iterable of (character, onYomi, kunYomi, meanings) tuples,
            the order produced by parseKanjiDic.
    """
    cursor.execute('DROP TABLE IF EXISTS Kanji')
    cursor.execute('CREATE TABLE Kanji(character TEXT, kunYomi TEXT, onYomi TEXT, meanings TEXT)')

    # The tuples carry onYomi before kunYomi while the table declares
    # kunYomi first, so the target columns are named explicitly. A purely
    # positional INSERT stored each reading under the other column's name.
    cursor.executemany(
        'INSERT INTO Kanji(character, onYomi, kunYomi, meanings) VALUES(?, ?, ?, ?)',
        values)
def parseKradFile(path):
    """Parse a radical decomposition file into (character, radicals) rows.

    Each line returned by loadDefinitions is split on single spaces. The
    first field is the character; the fields from index 2 onward are joined
    back together with single spaces as the radicals. The field at index 1
    is dropped (presumably a separator token; confirm against the input
    format).

    Args:
        path: Location of the radical file.

    Returns:
        A list of (character, radicals) tuples, one per line.
    """
    rows = (entry.split(' ') for entry in loadDefinitions(path))
    return [(fields[0], ' '.join(fields[2:])) for fields in rows]
def writeKradFile(cursor, values):
    """Recreate the Radicals table and fill it with the given rows.

    Args:
        cursor: Database cursor on which the statements are executed.
        values: Iterable of (character, radicals) tuples.
    """
    # Drop any previous table, then create a fresh empty one.
    for statement in (
        'DROP TABLE IF EXISTS Radicals',
        'CREATE TABLE Radicals(character TEXT, radicals TEXT)',
    ):
        cursor.execute(statement)

    cursor.executemany('INSERT INTO Radicals VALUES(?, ?)', values)
def parseEdict(path):
    """Parse a term dictionary file into per-term rows.

    Each line returned by loadDefinitions is split on '/'. The part before
    the first '/' is split on single spaces: its first field is the term,
    and a bracketed reading is looked for in its second field. The
    remaining non-empty '/'-separated parts are joined with '; ' to form
    the definitions, from which parenthesised numbers such as "(1)" and any
    whitespace after them are removed. The comma-separated contents of
    every remaining parenthesised group are then matched against
    GRAMMAR_TAGS to build the tag list.

    Args:
        path: Location of the dictionary file.

    Returns:
        A list of (term, reading, definitions, tags) tuples. reading is
        None when no bracketed reading is present; tags is a
        space-separated string of the recognised tags in sorted order.
    """
    # Raw strings avoid invalid-escape warnings on modern Python, and
    # compiling up front hoists the pattern lookup out of the per-line loop.
    readingPattern = re.compile(r'\[([^\]]+)\]')
    senseNumberPattern = re.compile(r'\(\d+\)\s*')
    tagGroupPattern = re.compile(r'\(([^\)\]]+)\)')

    results = list()
    for line in loadDefinitions(path):
        segments = line.split('/')
        expression = segments[0].split(' ')
        term = expression[0]

        # A header without any space has no second field; treat it as having
        # no reading rather than failing with IndexError.
        match = readingPattern.search(expression[1]) if len(expression) > 1 else None
        reading = None if match is None else match.group(1)

        definitions = '; '.join([x for x in segments[1:] if x])
        definitions = senseNumberPattern.sub('', definitions)

        tags = set()
        for group in tagGroupPattern.findall(definitions):
            tags.update(group.split(','))
        tags = ' '.join(sorted(tags.intersection(GRAMMAR_TAGS)))

        results.append((term, reading, definitions, tags))

    return results
def writeEdict(cursor, values):
    """Recreate the Terms table and fill it with the given rows.

    Args:
        cursor: Database cursor on which the statements are executed.
        values: Iterable of (expression, reading, definitions, tags) tuples.
    """
    dropTable = 'DROP TABLE IF EXISTS Terms'
    createTable = 'CREATE TABLE Terms(expression TEXT, reading TEXT, definitions TEXT, tags TEXT)'
    insertRow = 'INSERT INTO Terms VALUES(?, ?, ?, ?)'

    cursor.execute(dropTable)
    cursor.execute(createTable)
    cursor.executemany(insertRow, values)
def build(path, kanjidic, kradfile, edict):
    """Compile the given dictionary sources into one SQLite database.

    Each source that is not None is parsed and written to its own table,
    replacing any table of the same name already present in the database.

    Args:
        path: Location of the SQLite database file to write.
        kanjidic: Path of the kanji dictionary source, or None to skip it.
        kradfile: Path of the radical source, or None to skip it.
        edict: Path of the term dictionary source, or None to skip it.
    """
    db = sqlite3.connect(path)
    try:
        # The connection's context manager commits on success and rolls back
        # on error, but it does not close the connection.
        with db:
            cursor = db.cursor()

            if kanjidic is not None:
                writeKanjiDic(cursor, parseKanjiDic(kanjidic))

            if kradfile is not None:
                writeKradFile(cursor, parseKradFile(kradfile))

            if edict is not None:
                writeEdict(cursor, parseEdict(edict))
    finally:
        # Release the database file handle explicitly.
        db.close()
def main():
    """Parse the command line and build the dictionary database.

    The first positional argument is the database path; the optional
    --kanjidic, --kradfile and --edict options name the source files to
    compile. With no positional argument the usage help is printed instead.
    """
    parser = optparse.OptionParser()
    for name in ('kanjidic', 'kradfile', 'edict'):
        parser.add_option('--' + name, dest=name)

    options, args = parser.parse_args()
    if not args:
        # No output path given: show usage and do nothing else.
        parser.print_help()
        return

    build(args[0], options.kanjidic, options.kradfile, options.edict)


# Run the compiler only when this file is executed as a script.
if __name__ == '__main__':
    main()