From c43d1da885f9a210d87c10b77d3ade9f584c82fc Mon Sep 17 00:00:00 2001 From: Alex Yatskov Date: Sat, 9 Nov 2013 17:37:53 -0800 Subject: [PATCH] Renaming columns in database, making sure P tag gets parsed Former-commit-id: 0794e83218672f47fc467e2d7f24bede994c94d7 --- build_dict.sh | 2 +- util/compile.py | 27 ++++++++++++++------------- yomi_base/japanese/data/dictionary.db | 4 ++-- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/build_dict.sh b/build_dict.sh index bcca4e8..f24b19c 100755 --- a/build_dict.sh +++ b/build_dict.sh @@ -3,7 +3,7 @@ KANJIDIC=util/data/kanjidic KRADFILE=util/data/kradfile EDICT=util/data/edict -DICT=yomi_base/japanese2/data/dictionary.db +DICT=yomi_base/japanese/data/dictionary.db rm $DICT util/compile.py --kanjidic $KANJIDIC --kradfile $KRADFILE --edict $EDICT $DICT diff --git a/util/compile.py b/util/compile.py index 8df1e24..1a80ada 100755 --- a/util/compile.py +++ b/util/compile.py @@ -25,7 +25,8 @@ import sqlite3 import sys -GRAMMAR_TAGS = { +PARSED_TAGS = { + 'P', # common word 'adj', # former adjective classification (being removed) 'adj-f', # noun or verb acting prenominally (other than the above) 'adj-i', # adjective (keiyoushi) @@ -106,17 +107,17 @@ def parseKanjiDic(path): for line in loadDefinitions(path): segments = line.split() character = segments[0] - kunYomi = ', '.join(filter(lambda x: filter(isHiragana, x), segments[1:])) - onYomi = ', '.join(filter(lambda x: filter(isKatakana, x), segments[1:])) - meanings = '; '.join(re.findall('\{([^\}]+)\}', line)) - results.append((character, onYomi, kunYomi, meanings)) + kunyomi = ', '.join(filter(lambda x: filter(isHiragana, x), segments[1:])) + onyomi = ', '.join(filter(lambda x: filter(isKatakana, x), segments[1:])) + glossary = '; '.join(re.findall('\{([^\}]+)\}', line)) + results.append((character, onyomi, kunyomi, glossary)) return results def writeKanjiDic(cursor, values): cursor.execute('DROP TABLE IF EXISTS Kanji') - cursor.execute('CREATE TABLE Kanji(character TEXT, kunYomi TEXT, onYomi TEXT, meanings TEXT)') + cursor.execute('CREATE TABLE Kanji(character TEXT, kunyomi TEXT, onyomi TEXT, glossary TEXT)') cursor.executemany('INSERT INTO Kanji VALUES(?, ?, ?, ?)', values) @@ -149,25 +150,25 @@ def parseEdict(path): match = re.search('\[([^\]]+)\]', expression[1]) reading = None if match is None else match.group(1) - definitions = filter(lambda x: len(x) > 0, segments[1:]) - definitions = '; '.join(definitions) - definitions = re.sub('\(\d+\)\s*', str(), definitions) + glossary = filter(lambda x: len(x) > 0, segments[1:]) + glossary = '; '.join(glossary) + glossary = re.sub('\(\d+\)\s*', str(), glossary) tags = list() - for group in re.findall('\(([^\)\]]+)\)', definitions): + for group in re.findall('\(([^\)\]]+)\)', glossary): tags.extend(group.split(',')) - tags = set(tags).intersection(GRAMMAR_TAGS) + tags = set(tags).intersection(PARSED_TAGS) tags = ' '.join(sorted(tags)) - results.append((term, reading, definitions, tags)) + results.append((term, reading, glossary, tags)) return results def writeEdict(cursor, values): cursor.execute('DROP TABLE IF EXISTS Terms') - cursor.execute('CREATE TABLE Terms(expression TEXT, reading TEXT, definitions TEXT, tags TEXT)') + cursor.execute('CREATE TABLE Terms(expression TEXT, reading TEXT, glossary TEXT, tags TEXT)') cursor.executemany('INSERT INTO Terms VALUES(?, ?, ?, ?)', values) diff --git a/yomi_base/japanese/data/dictionary.db b/yomi_base/japanese/data/dictionary.db index 2f95b7f..54c1129 100644 --- a/yomi_base/japanese/data/dictionary.db +++ b/yomi_base/japanese/data/dictionary.db @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da31b200f6362ba5041bbb848d9c7e3d991a96dfe395d18255333107f21a205c -size 20322304 +oid sha256:4718fcf7ca6fbb26611ba5246e75faed0a4d8ccb994e811724a5c5ca1b9e182a +size 20370432