1

Renaming columns in database, making sure P tag gets parsed

Former-commit-id: 0794e83218672f47fc467e2d7f24bede994c94d7
This commit is contained in:
Alex Yatskov 2013-11-09 17:37:53 -08:00
parent 4329d6c047
commit c43d1da885
3 changed files with 17 additions and 16 deletions

View File

@ -3,7 +3,7 @@
KANJIDIC=util/data/kanjidic KANJIDIC=util/data/kanjidic
KRADFILE=util/data/kradfile KRADFILE=util/data/kradfile
EDICT=util/data/edict EDICT=util/data/edict
DICT=yomi_base/japanese2/data/dictionary.db DICT=yomi_base/japanese/data/dictionary.db
rm $DICT rm $DICT
util/compile.py --kanjidic $KANJIDIC --kradfile $KRADFILE --edict $EDICT $DICT util/compile.py --kanjidic $KANJIDIC --kradfile $KRADFILE --edict $EDICT $DICT

View File

@ -25,7 +25,8 @@ import sqlite3
import sys import sys
GRAMMAR_TAGS = { PARSED_TAGS = {
'P', # common word
'adj', # former adjective classification (being removed) 'adj', # former adjective classification (being removed)
'adj-f', # noun or verb acting prenominally (other than the above) 'adj-f', # noun or verb acting prenominally (other than the above)
'adj-i', # adjective (keiyoushi) 'adj-i', # adjective (keiyoushi)
@ -106,17 +107,17 @@ def parseKanjiDic(path):
for line in loadDefinitions(path): for line in loadDefinitions(path):
segments = line.split() segments = line.split()
character = segments[0] character = segments[0]
kunYomi = ', '.join(filter(lambda x: filter(isHiragana, x), segments[1:])) kunyomi = ', '.join(filter(lambda x: filter(isHiragana, x), segments[1:]))
onYomi = ', '.join(filter(lambda x: filter(isKatakana, x), segments[1:])) onyomi = ', '.join(filter(lambda x: filter(isKatakana, x), segments[1:]))
meanings = '; '.join(re.findall('\{([^\}]+)\}', line)) glossary = '; '.join(re.findall('\{([^\}]+)\}', line))
results.append((character, onYomi, kunYomi, meanings)) results.append((character, onyomi, kunyomi, glossary))
return results return results
def writeKanjiDic(cursor, values): def writeKanjiDic(cursor, values):
cursor.execute('DROP TABLE IF EXISTS Kanji') cursor.execute('DROP TABLE IF EXISTS Kanji')
cursor.execute('CREATE TABLE Kanji(character TEXT, kunYomi TEXT, onYomi TEXT, meanings TEXT)') cursor.execute('CREATE TABLE Kanji(character TEXT, kunyomi TEXT, onyomi TEXT, glossary TEXT)')
cursor.executemany('INSERT INTO Kanji VALUES(?, ?, ?, ?)', values) cursor.executemany('INSERT INTO Kanji VALUES(?, ?, ?, ?)', values)
@ -149,25 +150,25 @@ def parseEdict(path):
match = re.search('\[([^\]]+)\]', expression[1]) match = re.search('\[([^\]]+)\]', expression[1])
reading = None if match is None else match.group(1) reading = None if match is None else match.group(1)
definitions = filter(lambda x: len(x) > 0, segments[1:]) glossary = filter(lambda x: len(x) > 0, segments[1:])
definitions = '; '.join(definitions) glossary = '; '.join(glossary)
definitions = re.sub('\(\d+\)\s*', str(), definitions) glossary = re.sub('\(\d+\)\s*', str(), glossary)
tags = list() tags = list()
for group in re.findall('\(([^\)\]]+)\)', definitions): for group in re.findall('\(([^\)\]]+)\)', glossary):
tags.extend(group.split(',')) tags.extend(group.split(','))
tags = set(tags).intersection(GRAMMAR_TAGS) tags = set(tags).intersection(PARSED_TAGS)
tags = ' '.join(sorted(tags)) tags = ' '.join(sorted(tags))
results.append((term, reading, definitions, tags)) results.append((term, reading, glossary, tags))
return results return results
def writeEdict(cursor, values): def writeEdict(cursor, values):
cursor.execute('DROP TABLE IF EXISTS Terms') cursor.execute('DROP TABLE IF EXISTS Terms')
cursor.execute('CREATE TABLE Terms(expression TEXT, reading TEXT, definitions TEXT, tags TEXT)') cursor.execute('CREATE TABLE Terms(expression TEXT, reading TEXT, glossary TEXT, tags TEXT)')
cursor.executemany('INSERT INTO Terms VALUES(?, ?, ?, ?)', values) cursor.executemany('INSERT INTO Terms VALUES(?, ?, ?, ?)', values)

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1 version https://git-lfs.github.com/spec/v1
oid sha256:da31b200f6362ba5041bbb848d9c7e3d991a96dfe395d18255333107f21a205c oid sha256:4718fcf7ca6fbb26611ba5246e75faed0a4d8ccb994e811724a5c5ca1b9e182a
size 20322304 size 20370432