Adding converted json dictionaries

This commit is contained in:
Alex Yatskov 2016-03-20 11:43:28 -07:00
parent 23327d7e35
commit fbb7635594
3 changed files with 34 additions and 56 deletions

2
.gitattributes vendored
View File

@ -1,2 +1,2 @@
util/data/* filter=lfs diff=lfs merge=lfs -text util/data/* filter=lfs diff=lfs merge=lfs -text
ext/jp/data/rules.json filter=lfs diff=lfs merge=lfs -text ext/jp/data/* filter=lfs diff=lfs merge=lfs -text

View File

@ -3,7 +3,7 @@
KANJIDIC=util/data/kanjidic KANJIDIC=util/data/kanjidic
EDICT=util/data/edict EDICT=util/data/edict
ENAMDICT=util/data/enamdict ENAMDICT=util/data/enamdict
DICT=ext/jp/data/dict.json KRADFILE=util/data/kradfile
DICT_DIR=ext/jp/data
[ -f $DICT ] && rm $DICT util/compile.py --kanjidic $KANJIDIC --kradfile $KRADFILE --edict $EDICT --enamdict $ENAMDICT $DICT_DIR
util/compile.py --kanjidic $KANJIDIC --edict $EDICT $DICT --enamdict $ENAMDICT

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2013 Alex Yatskov # Copyright (C) 2016 Alex Yatskov <alex@foosoft.net>
# Author: Alex Yatskov <alex@foosoft.net>
# #
# This program is free software: you can redistribute it and/or modify # This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by # it under the terms of the GNU General Public License as published by
@ -18,11 +18,10 @@
import codecs import codecs
import json
import optparse import optparse
import os import os.path
import re import re
import sqlite3
import sys
PARSED_TAGS = { PARSED_TAGS = {
@ -97,44 +96,38 @@ PARSED_TAGS = {
} }
def isHiragana(c): def is_hiragana(c):
return 0x3040 <= ord(c) < 0x30a0 return 0x3040 <= ord(c) < 0x30a0
def isKatakana(c): def is_katakana(c):
return 0x30a0 <= ord(c) < 0x3100 return 0x30a0 <= ord(c) < 0x3100
def loadDefinitions(path): def load_definitions(path):
print 'Parsing "{0}"...'.format(path) print('Parsing "{0}"...'.format(path))
with codecs.open(path, encoding='euc-jp') as fp: with codecs.open(path, encoding='euc-jp') as fp:
return filter(lambda x: x and x[0] != '#', fp.read().splitlines()) return filter(lambda x: x and x[0] != '#', fp.read().splitlines())
def parseKanjiDic(path): def parse_kanji_dic(path):
results = list() results = []
for line in loadDefinitions(path): for line in load_definitions(path):
segments = line.split() segments = line.split()
character = segments[0] character = segments[0]
kunyomi = ', '.join(filter(lambda x: filter(isHiragana, x), segments[1:])) kunyomi = ', '.join(filter(lambda x: filter(is_hiragana, x), segments[1:]))
onyomi = ', '.join(filter(lambda x: filter(isKatakana, x), segments[1:])) onyomi = ', '.join(filter(lambda x: filter(is_katakana, x), segments[1:]))
glossary = '; '.join(re.findall('\{([^\}]+)\}', line)) glossary = '; '.join(re.findall('\{([^\}]+)\}', line))
results.append((character, kunyomi, onyomi, glossary)) results.append((character, kunyomi, onyomi, glossary))
return results return results
def writeKanjiDic(cursor, values): def parse_krad_file(path):
cursor.execute('DROP TABLE IF EXISTS Kanji') results = []
cursor.execute('CREATE TABLE Kanji(character TEXT, kunyomi TEXT, onyomi TEXT, glossary TEXT)')
cursor.executemany('INSERT INTO Kanji VALUES(?, ?, ?, ?)', values)
for line in load_definitions(path):
def parseKradFile(path):
results = list()
for line in loadDefinitions(path):
segments = line.split(' ') segments = line.split(' ')
character = segments[0] character = segments[0]
radicals = ' '.join(segments[2:]) radicals = ' '.join(segments[2:])
@ -143,16 +136,10 @@ def parseKradFile(path):
return results return results
def writeKradFile(cursor, values): def parse_edict(path):
cursor.execute('DROP TABLE IF EXISTS Radicals') results = []
cursor.execute('CREATE TABLE Radicals(character TEXT, radicals TEXT)')
cursor.executemany('INSERT INTO Radicals VALUES(?, ?)', values)
for line in load_definitions(path):
def parseEdict(path):
results = list()
for line in loadDefinitions(path):
segments = line.split('/') segments = line.split('/')
expression = segments[0].split(' ') expression = segments[0].split(' ')
@ -164,7 +151,7 @@ def parseEdict(path):
glossary = '; '.join(glossary) glossary = '; '.join(glossary)
glossary = re.sub('\(\d+\)\s*', str(), glossary) glossary = re.sub('\(\d+\)\s*', str(), glossary)
tags = list() tags = []
for group in re.findall('\(([^\)\]]+)\)', glossary): for group in re.findall('\(([^\)\]]+)\)', glossary):
tags.extend(group.split(',')) tags.extend(group.split(','))
@ -176,27 +163,18 @@ def parseEdict(path):
return results return results
def writeEdict(cursor, values): def build_dict(output_dir, input_file, parser):
cursor.execute('DROP TABLE IF EXISTS Terms') if input_file is not None:
cursor.execute('CREATE TABLE Terms(expression TEXT, reading TEXT, glossary TEXT, tags TEXT)') base = os.path.splitext(os.path.basename(input_file))[0]
cursor.executemany('INSERT INTO Terms VALUES(?, ?, ?, ?)', values) with open(os.path.join(output_dir, base) + '.json', 'w') as fp:
json.dump(parser(input_file), fp)
def build(path, kanjidic, kradfile, edict, enamdict): def build(dict_dir, kanjidic, kradfile, edict, enamdict):
with sqlite3.connect(path) as db: build_dict(dict_dir, kanjidic, parse_kanji_dic)
if kanjidic is not None: build_dict(dict_dir, kradfile, parse_krad_file)
writeKanjiDic(db, parseKanjiDic(kanjidic)) build_dict(dict_dir, edict, parse_edict)
build_dict(dict_dir, enamdict, parse_edict)
if kradfile is not None:
writeKradFile(db, parseKradFile(kradfile))
terms = []
if edict is not None:
terms += parseEdict(edict)
if enamdict is not None:
terms += parseEdict(enamdict)
if len(terms) > 0:
writeEdict(db, terms)
def main(): def main():