Adding converted json dictionaries
This commit is contained in:
parent
23327d7e35
commit
fbb7635594
2
.gitattributes
vendored
2
.gitattributes
vendored
@ -1,2 +1,2 @@
|
|||||||
util/data/* filter=lfs diff=lfs merge=lfs -text
|
util/data/* filter=lfs diff=lfs merge=lfs -text
|
||||||
ext/jp/data/rules.json filter=lfs diff=lfs merge=lfs -text
|
ext/jp/data/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
KANJIDIC=util/data/kanjidic
|
KANJIDIC=util/data/kanjidic
|
||||||
EDICT=util/data/edict
|
EDICT=util/data/edict
|
||||||
ENAMDICT=util/data/enamdict
|
ENAMDICT=util/data/enamdict
|
||||||
DICT=ext/jp/data/dict.json
|
KRADFILE=util/data/kradfile
|
||||||
|
DICT_DIR=ext/jp/data
|
||||||
|
|
||||||
[ -f $DICT ] && rm $DICT
|
util/compile.py --kanjidic $KANJIDIC --kradfile $KRADFILE --edict $EDICT --enamdict $ENAMDICT $DICT_DIR
|
||||||
util/compile.py --kanjidic $KANJIDIC --edict $EDICT $DICT --enamdict $ENAMDICT
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
# Copyright (C) 2013 Alex Yatskov
|
# Copyright (C) 2016 Alex Yatskov <alex@foosoft.net>
|
||||||
|
# Author: Alex Yatskov <alex@foosoft.net>
|
||||||
#
|
#
|
||||||
# This program is free software: you can redistribute it and/or modify
|
# This program is free software: you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License as published by
|
# it under the terms of the GNU General Public License as published by
|
||||||
@ -18,11 +18,10 @@
|
|||||||
|
|
||||||
|
|
||||||
import codecs
|
import codecs
|
||||||
|
import json
|
||||||
import optparse
|
import optparse
|
||||||
import os
|
import os.path
|
||||||
import re
|
import re
|
||||||
import sqlite3
|
|
||||||
import sys
|
|
||||||
|
|
||||||
|
|
||||||
PARSED_TAGS = {
|
PARSED_TAGS = {
|
||||||
@ -97,44 +96,38 @@ PARSED_TAGS = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def isHiragana(c):
|
def is_hiragana(c):
|
||||||
return 0x3040 <= ord(c) < 0x30a0
|
return 0x3040 <= ord(c) < 0x30a0
|
||||||
|
|
||||||
|
|
||||||
def isKatakana(c):
|
def is_katakana(c):
|
||||||
return 0x30a0 <= ord(c) < 0x3100
|
return 0x30a0 <= ord(c) < 0x3100
|
||||||
|
|
||||||
|
|
||||||
def loadDefinitions(path):
|
def load_definitions(path):
|
||||||
print 'Parsing "{0}"...'.format(path)
|
print('Parsing "{0}"...'.format(path))
|
||||||
with codecs.open(path, encoding='euc-jp') as fp:
|
with codecs.open(path, encoding='euc-jp') as fp:
|
||||||
return filter(lambda x: x and x[0] != '#', fp.read().splitlines())
|
return filter(lambda x: x and x[0] != '#', fp.read().splitlines())
|
||||||
|
|
||||||
|
|
||||||
def parseKanjiDic(path):
|
def parse_kanji_dic(path):
|
||||||
results = list()
|
results = []
|
||||||
|
|
||||||
for line in loadDefinitions(path):
|
for line in load_definitions(path):
|
||||||
segments = line.split()
|
segments = line.split()
|
||||||
character = segments[0]
|
character = segments[0]
|
||||||
kunyomi = ', '.join(filter(lambda x: filter(isHiragana, x), segments[1:]))
|
kunyomi = ', '.join(filter(lambda x: filter(is_hiragana, x), segments[1:]))
|
||||||
onyomi = ', '.join(filter(lambda x: filter(isKatakana, x), segments[1:]))
|
onyomi = ', '.join(filter(lambda x: filter(is_katakana, x), segments[1:]))
|
||||||
glossary = '; '.join(re.findall('\{([^\}]+)\}', line))
|
glossary = '; '.join(re.findall('\{([^\}]+)\}', line))
|
||||||
results.append((character, kunyomi, onyomi, glossary))
|
results.append((character, kunyomi, onyomi, glossary))
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def writeKanjiDic(cursor, values):
|
def parse_krad_file(path):
|
||||||
cursor.execute('DROP TABLE IF EXISTS Kanji')
|
results = []
|
||||||
cursor.execute('CREATE TABLE Kanji(character TEXT, kunyomi TEXT, onyomi TEXT, glossary TEXT)')
|
|
||||||
cursor.executemany('INSERT INTO Kanji VALUES(?, ?, ?, ?)', values)
|
|
||||||
|
|
||||||
|
for line in load_definitions(path):
|
||||||
def parseKradFile(path):
|
|
||||||
results = list()
|
|
||||||
|
|
||||||
for line in loadDefinitions(path):
|
|
||||||
segments = line.split(' ')
|
segments = line.split(' ')
|
||||||
character = segments[0]
|
character = segments[0]
|
||||||
radicals = ' '.join(segments[2:])
|
radicals = ' '.join(segments[2:])
|
||||||
@ -143,16 +136,10 @@ def parseKradFile(path):
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def writeKradFile(cursor, values):
|
def parse_edict(path):
|
||||||
cursor.execute('DROP TABLE IF EXISTS Radicals')
|
results = []
|
||||||
cursor.execute('CREATE TABLE Radicals(character TEXT, radicals TEXT)')
|
|
||||||
cursor.executemany('INSERT INTO Radicals VALUES(?, ?)', values)
|
|
||||||
|
|
||||||
|
for line in load_definitions(path):
|
||||||
def parseEdict(path):
|
|
||||||
results = list()
|
|
||||||
|
|
||||||
for line in loadDefinitions(path):
|
|
||||||
segments = line.split('/')
|
segments = line.split('/')
|
||||||
|
|
||||||
expression = segments[0].split(' ')
|
expression = segments[0].split(' ')
|
||||||
@ -164,7 +151,7 @@ def parseEdict(path):
|
|||||||
glossary = '; '.join(glossary)
|
glossary = '; '.join(glossary)
|
||||||
glossary = re.sub('\(\d+\)\s*', str(), glossary)
|
glossary = re.sub('\(\d+\)\s*', str(), glossary)
|
||||||
|
|
||||||
tags = list()
|
tags = []
|
||||||
for group in re.findall('\(([^\)\]]+)\)', glossary):
|
for group in re.findall('\(([^\)\]]+)\)', glossary):
|
||||||
tags.extend(group.split(','))
|
tags.extend(group.split(','))
|
||||||
|
|
||||||
@ -176,27 +163,18 @@ def parseEdict(path):
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def writeEdict(cursor, values):
|
def build_dict(output_dir, input_file, parser):
|
||||||
cursor.execute('DROP TABLE IF EXISTS Terms')
|
if input_file is not None:
|
||||||
cursor.execute('CREATE TABLE Terms(expression TEXT, reading TEXT, glossary TEXT, tags TEXT)')
|
base = os.path.splitext(os.path.basename(input_file))[0]
|
||||||
cursor.executemany('INSERT INTO Terms VALUES(?, ?, ?, ?)', values)
|
with open(os.path.join(output_dir, base) + '.json', 'w') as fp:
|
||||||
|
json.dump(parser(input_file), fp)
|
||||||
|
|
||||||
|
|
||||||
def build(path, kanjidic, kradfile, edict, enamdict):
|
def build(dict_dir, kanjidic, kradfile, edict, enamdict):
|
||||||
with sqlite3.connect(path) as db:
|
build_dict(dict_dir, kanjidic, parse_kanji_dic)
|
||||||
if kanjidic is not None:
|
build_dict(dict_dir, kradfile, parse_krad_file)
|
||||||
writeKanjiDic(db, parseKanjiDic(kanjidic))
|
build_dict(dict_dir, edict, parse_edict)
|
||||||
|
build_dict(dict_dir, enamdict, parse_edict)
|
||||||
if kradfile is not None:
|
|
||||||
writeKradFile(db, parseKradFile(kradfile))
|
|
||||||
|
|
||||||
terms = []
|
|
||||||
if edict is not None:
|
|
||||||
terms += parseEdict(edict)
|
|
||||||
if enamdict is not None:
|
|
||||||
terms += parseEdict(enamdict)
|
|
||||||
if len(terms) > 0:
|
|
||||||
writeEdict(db, terms)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
Loading…
Reference in New Issue
Block a user