1

Renaming columns in database, making sure P tag gets parsed

Former-commit-id: 0794e83218672f47fc467e2d7f24bede994c94d7
This commit is contained in:
Alex Yatskov 2013-11-09 17:37:53 -08:00
parent 4329d6c047
commit c43d1da885
3 changed files with 17 additions and 16 deletions

View File

@ -3,7 +3,7 @@
# Source data files handed to util/compile.py below.
KANJIDIC=util/data/kanjidic
KRADFILE=util/data/kradfile
EDICT=util/data/edict
# Database file handed to util/compile.py as its positional argument.
DICT=yomi_base/japanese/data/dictionary.db
# Remove any previous database first; -f keeps a missing file (e.g. the
# first build) from being reported as an error.
rm -f "$DICT"
util/compile.py --kanjidic "$KANJIDIC" --kradfile "$KRADFILE" --edict "$EDICT" "$DICT"

View File

@ -25,7 +25,8 @@ import sqlite3
import sys
GRAMMAR_TAGS = {
PARSED_TAGS = {
'P', # common word
'adj', # former adjective classification (being removed)
'adj-f', # noun or verb acting prenominally (other than the above)
'adj-i', # adjective (keiyoushi)
@ -106,17 +107,17 @@ def parseKanjiDic(path):
for line in loadDefinitions(path):
segments = line.split()
character = segments[0]
kunYomi = ', '.join(filter(lambda x: filter(isHiragana, x), segments[1:]))
onYomi = ', '.join(filter(lambda x: filter(isKatakana, x), segments[1:]))
meanings = '; '.join(re.findall('\{([^\}]+)\}', line))
results.append((character, onYomi, kunYomi, meanings))
kunyomi = ', '.join(filter(lambda x: filter(isHiragana, x), segments[1:]))
onyomi = ', '.join(filter(lambda x: filter(isKatakana, x), segments[1:]))
glossary = '; '.join(re.findall('\{([^\}]+)\}', line))
results.append((character, onyomi, kunyomi, glossary))
return results
def writeKanjiDic(cursor, values):
    """Recreate the Kanji table and insert the given rows.

    cursor -- database cursor providing execute() and executemany()
    values -- iterable of 4-tuples ordered as
              (character, onyomi, kunyomi, glossary); this is the order
              in which the kanji parser in this file appends its results
              (presumably the caller passes those rows here -- confirm
              against the call site)
    """
    cursor.execute('DROP TABLE IF EXISTS Kanji')
    cursor.execute('CREATE TABLE Kanji(character TEXT, kunyomi TEXT, onyomi TEXT, glossary TEXT)')
    # The table declares kunyomi before onyomi, while the rows carry onyomi
    # first. A positional INSERT would therefore store each reading in the
    # other reading's column, so the target columns are named explicitly in
    # the order the row tuples use.
    cursor.executemany(
        'INSERT INTO Kanji(character, onyomi, kunyomi, glossary) VALUES(?, ?, ?, ?)',
        values
    )
@ -149,25 +150,25 @@ def parseEdict(path):
match = re.search('\[([^\]]+)\]', expression[1])
reading = None if match is None else match.group(1)
definitions = filter(lambda x: len(x) > 0, segments[1:])
definitions = '; '.join(definitions)
definitions = re.sub('\(\d+\)\s*', str(), definitions)
glossary = filter(lambda x: len(x) > 0, segments[1:])
glossary = '; '.join(glossary)
glossary = re.sub('\(\d+\)\s*', str(), glossary)
tags = list()
for group in re.findall('\(([^\)\]]+)\)', definitions):
for group in re.findall('\(([^\)\]]+)\)', glossary):
tags.extend(group.split(','))
tags = set(tags).intersection(GRAMMAR_TAGS)
tags = set(tags).intersection(PARSED_TAGS)
tags = ' '.join(sorted(tags))
results.append((term, reading, definitions, tags))
results.append((term, reading, glossary, tags))
return results
def writeEdict(cursor, values):
    """Recreate the Terms table and insert the given rows.

    cursor -- database cursor providing execute() and executemany()
    values -- iterable of 4-tuples ordered as
              (expression, reading, glossary, tags); reading may be None
    """
    cursor.execute('DROP TABLE IF EXISTS Terms')
    # Only the current schema is created; issuing a second CREATE TABLE for
    # the same table name would fail because the table already exists.
    cursor.execute('CREATE TABLE Terms(expression TEXT, reading TEXT, glossary TEXT, tags TEXT)')
    # Columns are named so the insert does not depend on declaration order.
    cursor.executemany(
        'INSERT INTO Terms(expression, reading, glossary, tags) VALUES(?, ?, ?, ?)',
        values
    )

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:da31b200f6362ba5041bbb848d9c7e3d991a96dfe395d18255333107f21a205c
size 20322304
oid sha256:4718fcf7ca6fbb26611ba5246e75faed0a4d8ccb994e811724a5c5ca1b9e182a
size 20370432