Updating database compile script to take parameters and to support kanjidic and kradfile

2013-11-08 11:08:04 -08:00 · 2013-11-08 11:08:04 -08:00 · ea90c4ceb0
commit ea90c4ceb0
parent edd8f8354b
1 changed file with 41 additions and 31 deletions
--- a/util/compile.py
+++ b/util/compile.py
@@ -1,26 +1,27 @@
 #!/usr/bin/env python

+import codecs
+import optparse
 import os
 import re
-import sys
-import codecs
 import sqlite3
+import sys


 GRAMMAR_TAGS = {
+    'adj',      # former adjective classification (being removed)
+    'adj-f',    # noun or verb acting prenominally (other than the above)
    'adj-i',    # adjective (keiyoushi)
    'adj-na',   # adjectival nouns or quasi-adjectives (keiyodoshi)
    'adj-no',   # nouns which may take the genitive case particle `no'
    'adj-pn',   # pre-noun adjectival (rentaishi)
    'adj-t',    # `taru' adjective
-    'adj-f',    # noun or verb acting prenominally (other than the above)
-    'adj',      # former adjective classification (being removed)
    'adv',      # adverb (fukushi)
    'adv-n',    # adverbial noun
    'adv-to',   # adverb taking the `to' particle
    'aux',      # auxiliary
-    'aux-v',    # auxiliary verb
    'aux-adj',  # auxiliary adjective
+    'aux-v',    # auxiliary verb
    'conj',     # conjunction
    'ctr',      # counter
    'exp',      # Expressions (phrases, clauses, etc.)
@@ -56,7 +57,6 @@ GRAMMAR_TAGS = {
    'v5u-s',    # Godan verb with `u' ending (special class)
    'v5uru',    # Godan verb - uru old class verb (old form of Eru)
    'v5z',      # Godan verb with `zu' ending
-    'vz',       # Ichidan verb - zuru verb - (alternative form of -jiru verbs)
    'vi',       # intransitive verb
    'vk',       # kuru verb - special class
    'vn',       # irregular nu verb
@@ -65,8 +65,10 @@ GRAMMAR_TAGS = {
    'vs-i',     # suru verb - irregular
    'vs-s',     # suru verb - special class
    'vt',       # transitive verb
+    'vz',       # Ichidan verb - zuru verb - (alternative form of -jiru verbs)
 }

+
 def isHiragana(c):
    return 0x3040 <= ord(c) < 0x30a0

@@ -86,43 +88,37 @@ def parseKanjiDic(path):

    for line in loadDefinitions('kanjidic'):
        segments = line.split()
-        results.append({
-            'character': segments[0],
-            'onyomi': filter(lambda x: filter(isKatakana, x), segments[1:]),
-            'kunyomi': filter(lambda x: filter(isHiragana, x), segments[1:]),
-            'meanings': re.findall('\{([^\}]+)\}', line)
-        })
+        character = segments[0]
+        kunYomi = ','.join(filter(lambda x: filter(isHiragana, x), segments[1:]))  # plain string: a trailing comma here would make a 1-tuple
+        onYomi = ','.join(filter(lambda x: filter(isKatakana, x), segments[1:]))  # plain string, so sqlite3 can bind it as a parameter
+        meanings = ','.join(re.findall('\{([^\}]+)\}', line))  # text of every {...} group on the line
+        results.append((character, onYomi, kunYomi, meanings))  # row order: character, onYomi, kunYomi, meanings

    return results


 def writeKanjiDic(cursor, values):
-    pass
+    cursor.execute('DROP TABLE IF EXISTS Kanji')  # the table is rebuilt from scratch on every run
+    cursor.execute('CREATE TABLE Kanji(character TEXT, kunYomi TEXT, onYomi TEXT, meanings TEXT)')  # own table; Radicals belongs to writeKradFile
+    cursor.executemany('INSERT INTO Kanji(character, onYomi, kunYomi, meanings) VALUES(?, ?, ?, ?)', values)  # columns named to match the parsed tuple order


 def parseKradFile(path):
-    radsByChar = dict()
-    charsByRad = dict()
+    results = list()  # one (character, radicals) tuple per definition line

    for line in loadDefinitions(path):
        segments = line.split(' ')
        character = segments[0]
-        radicals = segments[2:]
-
-        radsByChar[character] = radicals
-        for radical in radicals:
-            charsByRad[radical] = charsByRad.get(radical, list()) + [character]
-
-    results = {
-        'radsByChar': radsByChar,
-        'charsByRad': charsByRad
-    }
+        radicals = ','.join(segments[2:])  # segments[1] is skipped (presumably the ':' separator -- confirm against the KRADFILE format)
+        results.append((character, radicals))  # radicals are stored as a single comma-separated string

    return results


 def writeKradFile(cursor, values):
-    pass
+    cursor.execute('DROP TABLE IF EXISTS Radicals')  # discard any previous table so each run starts empty
+    cursor.execute('CREATE TABLE Radicals(character TEXT, radicals TEXT)')
+    cursor.executemany('INSERT INTO Radicals VALUES(?, ?)', values)  # values: iterable of 2-item rows in column order


 def parseEdict(path):
@@ -153,12 +149,12 @@ def parseEdict(path):


 def writeEdict(cursor, values):
-    cursor.execute('drop table if exists Edict')
-    cursor.execute('create table Edict(term text, reading text, definitions text, tags text)')
-    cursor.executemany('insert into Edict values(?, ?, ?, ?)', values)
+    cursor.execute('DROP TABLE IF EXISTS Terms')  # discard any previous table so each run starts empty
+    cursor.execute('CREATE TABLE Terms(expression TEXT, reading TEXT, definitions TEXT, tags TEXT)')
+    cursor.executemany('INSERT INTO Terms VALUES(?, ?, ?, ?)', values)  # values: iterable of 4-item rows in column order


-def main(path, kanjidic=None, kradfile=None, edict=None):
+def build(path, kanjidic, kradfile, edict):
    with sqlite3.connect(path) as db:
        cursor = db.cursor()

@@ -172,5 +168,19 @@ def main(path, kanjidic=None, kradfile=None, edict=None):
            writeEdict(cursor, parseEdict(edict))


+def main():
+    parser = optparse.OptionParser()
+    parser.add_option('--kanjidic', dest='kanjidic')  # each option is optional; an unset option is left as None
+    parser.add_option('--kradfile', dest='kradfile')
+    parser.add_option('--edict', dest='edict')
+
+    options, args = parser.parse_args()  # args holds the positional arguments
+
+    if len(args) == 0:  # no positional argument given: show usage instead of building
+        parser.print_help()
+    else:
+        build(args[0], options.kanjidic, options.kradfile, options.edict)  # args[0] becomes build()'s path; extra positionals are ignored
+
+
 if __name__ == '__main__':
-    main('dictionary.db', edict='data/edict')
+    main()  # takes no arguments; main() parses the command line itself