Updating database compile script to take parameters and to support kanjidic and kradfile
This commit is contained in:
parent
edd8f8354b
commit
ea90c4ceb0
@ -1,26 +1,27 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import codecs
|
||||||
|
import optparse
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
|
||||||
import codecs
|
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
GRAMMAR_TAGS = {
|
GRAMMAR_TAGS = {
|
||||||
|
'adj', # former adjective classification (being removed)
|
||||||
|
'adj-f', # noun or verb acting prenominally (other than the above)
|
||||||
'adj-i', # adjective (keiyoushi)
|
'adj-i', # adjective (keiyoushi)
|
||||||
'adj-na', # adjectival nouns or quasi-adjectives (keiyodoshi)
|
'adj-na', # adjectival nouns or quasi-adjectives (keiyodoshi)
|
||||||
'adj-no', # nouns which may take the genitive case particle `no'
|
'adj-no', # nouns which may take the genitive case particle `no'
|
||||||
'adj-pn', # pre-noun adjectival (rentaishi)
|
'adj-pn', # pre-noun adjectival (rentaishi)
|
||||||
'adj-t', # `taru' adjective
|
'adj-t', # `taru' adjective
|
||||||
'adj-f', # noun or verb acting prenominally (other than the above)
|
|
||||||
'adj', # former adjective classification (being removed)
|
|
||||||
'adv', # adverb (fukushi)
|
'adv', # adverb (fukushi)
|
||||||
'adv-n', # adverbial noun
|
'adv-n', # adverbial noun
|
||||||
'adv-to', # adverb taking the `to' particle
|
'adv-to', # adverb taking the `to' particle
|
||||||
'aux', # auxiliary
|
'aux', # auxiliary
|
||||||
'aux-v', # auxiliary verb
|
|
||||||
'aux-adj', # auxiliary adjective
|
'aux-adj', # auxiliary adjective
|
||||||
|
'aux-v', # auxiliary verb
|
||||||
'conj', # conjunction
|
'conj', # conjunction
|
||||||
'ctr', # counter
|
'ctr', # counter
|
||||||
'exp', # Expressions (phrases, clauses, etc.)
|
'exp', # Expressions (phrases, clauses, etc.)
|
||||||
@ -56,7 +57,6 @@ GRAMMAR_TAGS = {
|
|||||||
'v5u-s', # Godan verb with `u' ending (special class)
|
'v5u-s', # Godan verb with `u' ending (special class)
|
||||||
'v5uru', # Godan verb - uru old class verb (old form of Eru)
|
'v5uru', # Godan verb - uru old class verb (old form of Eru)
|
||||||
'v5z', # Godan verb with `zu' ending
|
'v5z', # Godan verb with `zu' ending
|
||||||
'vz', # Ichidan verb - zuru verb - (alternative form of -jiru verbs)
|
|
||||||
'vi', # intransitive verb
|
'vi', # intransitive verb
|
||||||
'vk', # kuru verb - special class
|
'vk', # kuru verb - special class
|
||||||
'vn', # irregular nu verb
|
'vn', # irregular nu verb
|
||||||
@ -65,8 +65,10 @@ GRAMMAR_TAGS = {
|
|||||||
'vs-i', # suru verb - irregular
|
'vs-i', # suru verb - irregular
|
||||||
'vs-s', # suru verb - special class
|
'vs-s', # suru verb - special class
|
||||||
'vt', # transitive verb
|
'vt', # transitive verb
|
||||||
|
'vz', # Ichidan verb - zuru verb - (alternative form of -jiru verbs)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def isHiragana(c):
    """Return True when the single character ``c`` falls in the hiragana range.

    The accepted range is the half-open interval of code points
    ``0x3040 <= ord(c) < 0x30a0``.
    """
    codepoint = ord(c)
    return codepoint >= 0x3040 and codepoint < 0x30a0
|
||||||
|
|
||||||
@ -86,43 +88,37 @@ def parseKanjiDic(path):
|
|||||||
|
|
||||||
for line in loadDefinitions('kanjidic'):
|
for line in loadDefinitions('kanjidic'):
|
||||||
segments = line.split()
|
segments = line.split()
|
||||||
results.append({
|
character = segments[0]
|
||||||
'character': segments[0],
|
kunYomi = ','.join(filter(lambda x: filter(isHiragana, x), segments[1:])),
|
||||||
'onyomi': filter(lambda x: filter(isKatakana, x), segments[1:]),
|
onYomi = ','.join(filter(lambda x: filter(isKatakana, x), segments[1:])),
|
||||||
'kunyomi': filter(lambda x: filter(isHiragana, x), segments[1:]),
|
meanings = ','.join(re.findall('\{([^\}]+)\}', line))
|
||||||
'meanings': re.findall('\{([^\}]+)\}', line)
|
results.append((character, onYomi, kunYomi, meanings))
|
||||||
})
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def writeKanjiDic(cursor, values):
    """Rebuild the ``Kanji`` table from parsed kanjidic rows.

    Args:
        cursor: An open sqlite3 cursor to execute the statements on.
        values: Iterable of 4-tuples in the order the kanjidic parser
            produces them: ``(character, onYomi, kunYomi, meanings)``.
    """
    # Drop and create the SAME table. Previously 'Kanji' was dropped while
    # 'Radicals' was created, which both collided with the table owned by
    # writeKradFile and failed on re-runs with "table already exists".
    cursor.execute('DROP TABLE IF EXISTS Kanji')
    cursor.execute('CREATE TABLE Kanji(character TEXT, kunYomi TEXT, onYomi TEXT, meanings TEXT)')
    # Name the columns explicitly: the incoming tuples carry onYomi before
    # kunYomi, whereas the schema declares kunYomi first, so a positional
    # insert would store the readings in swapped columns.
    cursor.executemany(
        'INSERT INTO Kanji(character, onYomi, kunYomi, meanings) VALUES(?, ?, ?, ?)',
        values)
|
||||||
|
|
||||||
|
|
||||||
def parseKradFile(path):
    """Parse a kradfile into a list of ``(character, radicals)`` tuples.

    Each line yielded by ``loadDefinitions(path)`` is split on single
    spaces. The first field is the character, the second field is skipped
    (presumably the ':' separator -- confirm against the data file), and
    the remaining fields are joined with commas to form the radicals string.
    """
    fieldSets = (line.split(' ') for line in loadDefinitions(path))
    return [(fields[0], ','.join(fields[2:])) for fields in fieldSets]
|
||||||
|
|
||||||
|
|
||||||
def writeKradFile(cursor, values):
    """Rebuild the ``Radicals`` table and fill it with ``values``.

    Args:
        cursor: An open sqlite3 cursor to execute the statements on.
        values: Iterable of ``(character, radicals)`` 2-tuples.
    """
    # Schema reset first, then a single bulk insert.
    for statement in (
        'DROP TABLE IF EXISTS Radicals',
        'CREATE TABLE Radicals(character TEXT, radicals TEXT)',
    ):
        cursor.execute(statement)
    cursor.executemany('INSERT INTO Radicals VALUES(?, ?)', values)
|
||||||
|
|
||||||
|
|
||||||
def parseEdict(path):
|
def parseEdict(path):
|
||||||
@ -153,12 +149,12 @@ def parseEdict(path):
|
|||||||
|
|
||||||
|
|
||||||
def writeEdict(cursor, values):
    """Rebuild the ``Terms`` table and fill it with ``values``.

    Args:
        cursor: An open sqlite3 cursor to execute the statements on.
        values: Iterable of 4-tuples matching the table's column order
            ``(expression, reading, definitions, tags)``.
    """
    dropSql = 'DROP TABLE IF EXISTS Terms'
    createSql = 'CREATE TABLE Terms(expression TEXT, reading TEXT, definitions TEXT, tags TEXT)'
    insertSql = 'INSERT INTO Terms VALUES(?, ?, ?, ?)'

    # Start from an empty table so repeated builds do not accumulate rows.
    cursor.execute(dropSql)
    cursor.execute(createSql)
    cursor.executemany(insertSql, values)
|
||||||
|
|
||||||
|
|
||||||
def main(path, kanjidic=None, kradfile=None, edict=None):
|
def build(path, kanjidic, kradfile, edict):
|
||||||
with sqlite3.connect(path) as db:
|
with sqlite3.connect(path) as db:
|
||||||
cursor = db.cursor()
|
cursor = db.cursor()
|
||||||
|
|
||||||
@ -172,5 +168,19 @@ def main(path, kanjidic=None, kradfile=None, edict=None):
|
|||||||
writeEdict(cursor, parseEdict(edict))
|
writeEdict(cursor, parseEdict(edict))
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point.

    Accepts the optional ``--kanjidic``, ``--kradfile`` and ``--edict``
    options plus a positional argument that is passed to ``build`` as its
    first parameter. With no positional arguments the usage help is printed
    and nothing is built.
    """
    parser = optparse.OptionParser()
    for flag, dest in (
        ('--kanjidic', 'kanjidic'),
        ('--kradfile', 'kradfile'),
        ('--edict', 'edict'),
    ):
        parser.add_option(flag, dest=dest)

    options, args = parser.parse_args()

    # Guard clause: without a positional argument there is nothing to build.
    if not args:
        parser.print_help()
        return

    build(args[0], options.kanjidic, options.kradfile, options.edict)


if __name__ == '__main__':
    main()
|
||||||
|
Loading…
Reference in New Issue
Block a user