Rename database columns (kunYomi/onYomi → kunyomi/onyomi, meanings/definitions → glossary) and make sure the P tag gets parsed
Former-commit-id: 0794e83218672f47fc467e2d7f24bede994c94d7
This commit is contained in:
parent
4329d6c047
commit
c43d1da885
@ -3,7 +3,7 @@
|
||||
KANJIDIC=util/data/kanjidic
|
||||
KRADFILE=util/data/kradfile
|
||||
EDICT=util/data/edict
|
||||
DICT=yomi_base/japanese2/data/dictionary.db
|
||||
DICT=yomi_base/japanese/data/dictionary.db
|
||||
|
||||
rm $DICT
|
||||
util/compile.py --kanjidic $KANJIDIC --kradfile $KRADFILE --edict $EDICT $DICT
|
||||
|
@ -25,7 +25,8 @@ import sqlite3
|
||||
import sys
|
||||
|
||||
|
||||
GRAMMAR_TAGS = {
|
||||
PARSED_TAGS = {
|
||||
'P', # common word
|
||||
'adj', # former adjective classification (being removed)
|
||||
'adj-f', # noun or verb acting prenominally (other than the above)
|
||||
'adj-i', # adjective (keiyoushi)
|
||||
@ -106,17 +107,17 @@ def parseKanjiDic(path):
|
||||
for line in loadDefinitions(path):
|
||||
segments = line.split()
|
||||
character = segments[0]
|
||||
kunYomi = ', '.join(filter(lambda x: filter(isHiragana, x), segments[1:]))
|
||||
onYomi = ', '.join(filter(lambda x: filter(isKatakana, x), segments[1:]))
|
||||
meanings = '; '.join(re.findall('\{([^\}]+)\}', line))
|
||||
results.append((character, onYomi, kunYomi, meanings))
|
||||
kunyomi = ', '.join(filter(lambda x: filter(isHiragana, x), segments[1:]))
|
||||
onyomi = ', '.join(filter(lambda x: filter(isKatakana, x), segments[1:]))
|
||||
glossary = '; '.join(re.findall('\{([^\}]+)\}', line))
|
||||
results.append((character, onyomi, kunyomi, glossary))
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def writeKanjiDic(cursor, values):
|
||||
cursor.execute('DROP TABLE IF EXISTS Kanji')
|
||||
cursor.execute('CREATE TABLE Kanji(character TEXT, kunYomi TEXT, onYomi TEXT, meanings TEXT)')
|
||||
cursor.execute('CREATE TABLE Kanji(character TEXT, kunyomi TEXT, onyomi TEXT, glossary TEXT)')
|
||||
cursor.executemany('INSERT INTO Kanji VALUES(?, ?, ?, ?)', values)
|
||||
|
||||
|
||||
@ -149,25 +150,25 @@ def parseEdict(path):
|
||||
match = re.search('\[([^\]]+)\]', expression[1])
|
||||
reading = None if match is None else match.group(1)
|
||||
|
||||
definitions = filter(lambda x: len(x) > 0, segments[1:])
|
||||
definitions = '; '.join(definitions)
|
||||
definitions = re.sub('\(\d+\)\s*', str(), definitions)
|
||||
glossary = filter(lambda x: len(x) > 0, segments[1:])
|
||||
glossary = '; '.join(glossary)
|
||||
glossary = re.sub('\(\d+\)\s*', str(), glossary)
|
||||
|
||||
tags = list()
|
||||
for group in re.findall('\(([^\)\]]+)\)', definitions):
|
||||
for group in re.findall('\(([^\)\]]+)\)', glossary):
|
||||
tags.extend(group.split(','))
|
||||
|
||||
tags = set(tags).intersection(GRAMMAR_TAGS)
|
||||
tags = set(tags).intersection(PARSED_TAGS)
|
||||
tags = ' '.join(sorted(tags))
|
||||
|
||||
results.append((term, reading, definitions, tags))
|
||||
results.append((term, reading, glossary, tags))
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def writeEdict(cursor, values):
|
||||
cursor.execute('DROP TABLE IF EXISTS Terms')
|
||||
cursor.execute('CREATE TABLE Terms(expression TEXT, reading TEXT, definitions TEXT, tags TEXT)')
|
||||
cursor.execute('CREATE TABLE Terms(expression TEXT, reading TEXT, glossary TEXT, tags TEXT)')
|
||||
cursor.executemany('INSERT INTO Terms VALUES(?, ?, ?, ?)', values)
|
||||
|
||||
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:da31b200f6362ba5041bbb848d9c7e3d991a96dfe395d18255333107f21a205c
|
||||
size 20322304
|
||||
oid sha256:4718fcf7ca6fbb26611ba5246e75faed0a4d8ccb994e811724a5c5ca1b9e182a
|
||||
size 20370432
|
||||
|
Loading…
Reference in New Issue
Block a user