1

Updating database compile script to take parameters and to support kanjidic and kradfile

This commit is contained in:
Alex Yatskov 2013-11-08 11:08:04 -08:00
parent edd8f8354b
commit ea90c4ceb0

View File

@ -1,26 +1,27 @@
#!/usr/bin/env python #!/usr/bin/env python
import codecs
import optparse
import os import os
import re import re
import sys
import codecs
import sqlite3 import sqlite3
import sys
GRAMMAR_TAGS = { GRAMMAR_TAGS = {
'adj', # former adjective classification (being removed)
'adj-f', # noun or verb acting prenominally (other than the above)
'adj-i', # adjective (keiyoushi) 'adj-i', # adjective (keiyoushi)
'adj-na', # adjectival nouns or quasi-adjectives (keiyodoshi) 'adj-na', # adjectival nouns or quasi-adjectives (keiyodoshi)
'adj-no', # nouns which may take the genitive case particle `no' 'adj-no', # nouns which may take the genitive case particle `no'
'adj-pn', # pre-noun adjectival (rentaishi) 'adj-pn', # pre-noun adjectival (rentaishi)
'adj-t', # `taru' adjective 'adj-t', # `taru' adjective
'adj-f', # noun or verb acting prenominally (other than the above)
'adj', # former adjective classification (being removed)
'adv', # adverb (fukushi) 'adv', # adverb (fukushi)
'adv-n', # adverbial noun 'adv-n', # adverbial noun
'adv-to', # adverb taking the `to' particle 'adv-to', # adverb taking the `to' particle
'aux', # auxiliary 'aux', # auxiliary
'aux-v', # auxiliary verb
'aux-adj', # auxiliary adjective 'aux-adj', # auxiliary adjective
'aux-v', # auxiliary verb
'conj', # conjunction 'conj', # conjunction
'ctr', # counter 'ctr', # counter
'exp', # Expressions (phrases, clauses, etc.) 'exp', # Expressions (phrases, clauses, etc.)
@ -56,7 +57,6 @@ GRAMMAR_TAGS = {
'v5u-s', # Godan verb with `u' ending (special class) 'v5u-s', # Godan verb with `u' ending (special class)
'v5uru', # Godan verb - uru old class verb (old form of Eru) 'v5uru', # Godan verb - uru old class verb (old form of Eru)
'v5z', # Godan verb with `zu' ending 'v5z', # Godan verb with `zu' ending
'vz', # Ichidan verb - zuru verb - (alternative form of -jiru verbs)
'vi', # intransitive verb 'vi', # intransitive verb
'vk', # kuru verb - special class 'vk', # kuru verb - special class
'vn', # irregular nu verb 'vn', # irregular nu verb
@ -65,8 +65,10 @@ GRAMMAR_TAGS = {
'vs-i', # suru verb - irregular 'vs-i', # suru verb - irregular
'vs-s', # suru verb - special class 'vs-s', # suru verb - special class
'vt', # transitive verb 'vt', # transitive verb
'vz', # Ichidan verb - zuru verb - (alternative form of -jiru verbs)
} }
def isHiragana(c): def isHiragana(c):
return 0x3040 <= ord(c) < 0x30a0 return 0x3040 <= ord(c) < 0x30a0
@ -86,43 +88,37 @@ def parseKanjiDic(path):
for line in loadDefinitions('kanjidic'): for line in loadDefinitions('kanjidic'):
segments = line.split() segments = line.split()
results.append({ character = segments[0]
'character': segments[0], kunYomi = ','.join(filter(lambda x: filter(isHiragana, x), segments[1:])),
'onyomi': filter(lambda x: filter(isKatakana, x), segments[1:]), onYomi = ','.join(filter(lambda x: filter(isKatakana, x), segments[1:])),
'kunyomi': filter(lambda x: filter(isHiragana, x), segments[1:]), meanings = ','.join(re.findall('\{([^\}]+)\}', line))
'meanings': re.findall('\{([^\}]+)\}', line) results.append((character, onYomi, kunYomi, meanings))
})
return results return results
def writeKanjiDic(cursor, values): def writeKanjiDic(cursor, values):
pass cursor.execute('DROP TABLE IF EXISTS Kanji')
cursor.execute('CREATE TABLE Radicals(character TEXT, kunYomi TEXT, onYomi TEXT, meanings TEXT)')
cursor.executemany('INSERT INTO Radicals VALUES(?, ?, ?, ?)', values)
def parseKradFile(path): def parseKradFile(path):
radsByChar = dict() results = list()
charsByRad = dict()
for line in loadDefinitions(path): for line in loadDefinitions(path):
segments = line.split(' ') segments = line.split(' ')
character = segments[0] character = segments[0]
radicals = segments[2:] radicals = ','.join(segments[2:])
results.append((character, radicals))
radsByChar[character] = radicals
for radical in radicals:
charsByRad[radical] = charsByRad.get(radical, list()) + [character]
results = {
'radsByChar': radsByChar,
'charsByRad': charsByRad
}
return results return results
def writeKradFile(cursor, values): def writeKradFile(cursor, values):
pass cursor.execute('DROP TABLE IF EXISTS Radicals')
cursor.execute('CREATE TABLE Radicals(character TEXT, radicals TEXT)')
cursor.executemany('INSERT INTO Radicals VALUES(?, ?)', values)
def parseEdict(path): def parseEdict(path):
@ -153,12 +149,12 @@ def parseEdict(path):
def writeEdict(cursor, values): def writeEdict(cursor, values):
cursor.execute('drop table if exists Edict') cursor.execute('DROP TABLE IF EXISTS Terms')
cursor.execute('create table Edict(term text, reading text, definitions text, tags text)') cursor.execute('CREATE TABLE Terms(expression TEXT, reading TEXT, definitions TEXT, tags TEXT)')
cursor.executemany('insert into Edict values(?, ?, ?, ?)', values) cursor.executemany('INSERT INTO Terms VALUES(?, ?, ?, ?)', values)
def main(path, kanjidic=None, kradfile=None, edict=None): def build(path, kanjidic, kradfile, edict):
with sqlite3.connect(path) as db: with sqlite3.connect(path) as db:
cursor = db.cursor() cursor = db.cursor()
@ -172,5 +168,19 @@ def main(path, kanjidic=None, kradfile=None, edict=None):
writeEdict(cursor, parseEdict(edict)) writeEdict(cursor, parseEdict(edict))
def main():
parser = optparse.OptionParser()
parser.add_option('--kanjidic', dest='kanjidic')
parser.add_option('--kradfile', dest='kradfile')
parser.add_option('--edict', dest='edict')
options, args = parser.parse_args()
if len(args) == 0:
parser.print_help()
else:
build(args[0], options.kanjidic, options.kradfile, options.edict)
if __name__ == '__main__': if __name__ == '__main__':
main('dictionary.db', edict='data/edict') main()