#!/usr/bin/env python

# Copyright (C) 2016 Alex Yatskov <alex@foosoft.net>
# Author: Alex Yatskov <alex@foosoft.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


import codecs
import json
import optparse
import os.path
import re


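# EDICT/ENAMDICT markings (parts of speech, usage and field tags, name types,
# the priority marker 'P', etc.) that are kept when parsing definitions; any
# other parenthesized tag is discarded.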
PARSED_TAGS = {
    'Buddh', 'MA', 'X', 'abbr', 'adj', 'adj-f', 'adj-i', 'adj-na', 'adj-no',
    'adj-pn', 'adj-t', 'adv', 'adv-n', 'adv-to', 'arch', 'ateji', 'aux',
    'aux-adj', 'aux-v', 'c', 'chn', 'col', 'comp', 'conj', 'ctr', 'derog',
    'eK', 'ek', 'exp', 'f', 'fam', 'fem', 'food', 'g', 'geom', 'gikun',
    'gram', 'h', 'hon', 'hum', 'iK', 'id', 'ik', 'int', 'io', 'iv', 'ling',
    'm', 'm-sl', 'male', 'male-sl', 'math', 'mil', 'n', 'n-adv', 'n-pref',
    'n-suf', 'n-t', 'num', 'oK', 'obs', 'obsc', 'ok', 'on-mim', 'P', 'p',
    'physics', 'pn', 'poet', 'pol', 'pr', 'pref', 'prt', 'rare', 's',
    'sens', 'sl', 'st', 'suf', 'u', 'uK', 'uk', 'v1', 'v2a-s', 'v4h',
    'v4r', 'v5', 'v5aru', 'v5b', 'v5g', 'v5k', 'v5k-s', 'v5m', 'v5n',
    'v5r', 'v5r-i', 'v5s', 'v5t', 'v5u', 'v5u-s', 'v5uru', 'v5z', 'vi',
    'vk', 'vn', 'vs', 'vs-c', 'vs-i', 'vs-s', 'vt', 'vulg', 'vz'
}


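# Kana checks use Unicode code point ranges: hiragana occupy U+3040-U+309F,
# katakana U+30A0-U+30FF.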
def is_hiragana(c):
    return 0x3040 <= ord(c) < 0x30a0


def is_katakana(c):
    return 0x30a0 <= ord(c) < 0x3100


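# Read a dictionary source file and return its non-empty, non-comment lines.
# The EDICT family of files is distributed in EUC-JP encoding.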
def load_definitions(path):
    print('Parsing "{0}"...'.format(path))
    with codecs.open(path, encoding='euc-jp') as fp:
        return filter(lambda x: x and x[0] != '#', fp.read().splitlines())


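# Each KANJIDIC line holds one kanji followed by reference codes, katakana
# on'yomi, hiragana kun'yomi, and English meanings enclosed in {braces}.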
def parse_kanji_dic(path):
    results = {}

    for line in load_definitions(path):
        segments = line.split()
        character = segments[0]
        # A reading segment counts as kun'yomi if it contains any hiragana,
        # and as on'yomi if it contains any katakana.
        kunyomi = ' '.join(filter(lambda x: list(filter(is_hiragana, x)), segments[1:]))
        onyomi = ' '.join(filter(lambda x: list(filter(is_katakana, x)), segments[1:]))
        glossary = re.findall(r'\{([^\}]+)\}', line)
        results[character] = (kunyomi or None, onyomi or None, glossary)

    return results


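# EDICT/ENAMDICT entries are single lines of the form:
#   EXPRESSION [READING] /(tag,...) gloss/gloss/.../
# The bracketed reading is optional; parenthesized tags are pooled per entry.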
def parse_edict(path):
    results = []

    for line in load_definitions(path):
        segments = line.split('/')

        exp_parts = segments[0].split(' ')
        expression = exp_parts[0]

        reading_match = re.search(r'\[([^\]]+)\]', exp_parts[1])
        reading = None if reading_match is None else reading_match.group(1)

        defs = []
        tags = set()

        for index, dfn in enumerate(filter(None, segments[1:])):
            # Split each definition into its leading (tag) groups and the gloss text.
            dfn_match = re.search(r'^((?:\((?:[\w\-\,\:]*)*\)\s*)*)(.*)$', dfn)

            tags_raw = set(filter(None, re.split(r'[\s\(\),]', dfn_match.group(1))))
            tags_raw = tags_raw.intersection(PARSED_TAGS)
            tags = tags.union(tags_raw)

            gloss = dfn_match.group(2).strip()
            if len(gloss) == 0:
                continue

            # A new tag group (or the first gloss) starts a new sense;
            # otherwise the gloss joins the current sense.
            if index == 0 or len(dfn_match.group(1)) > 0:
                defs.append([gloss])
            else:
                defs[-1].append(gloss)

        result = [expression, reading, ' '.join(tags)]
        result += map(lambda x: '; '.join(x), defs)

        results.append(result)

    # Map every expression and reading to the indices of its definitions.
    indices = {}
    for i, d in enumerate(results):
        for key in d[:2]:
            if key is not None:
                values = indices.get(key, [])
                values.append(i)
                indices[key] = values

    return {'defs': results, 'indices': indices}


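# Parse a single input file (if one was given) and write the result to
# <output_dir>/<basename>.json as compact JSON.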
def build_dict(output_dir, input_file, parser):
    if input_file is not None:
        base = os.path.splitext(os.path.basename(input_file))[0]
        with open(os.path.join(output_dir, base) + '.json', 'w') as fp:
            # json.dump(parser(input_file), fp, sort_keys=True, indent=4, separators=(',', ': '))
            json.dump(parser(input_file), fp, separators=(',', ':'))


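# ENAMDICT uses the same line format as EDICT, so both go through parse_edict.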
def build(dict_dir, kanjidic, edict, enamdict):
    build_dict(dict_dir, kanjidic, parse_kanji_dic)
    build_dict(dict_dir, edict, parse_edict)
    build_dict(dict_dir, enamdict, parse_edict)


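# Example invocation (script and dictionary file names are illustrative; any
# subset of the options may be given, the positional argument is the output
# directory):
#   python compile.py --kanjidic kanjidic --edict edict --enamdict enamdict dict_dir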
def main():
    parser = optparse.OptionParser()
    parser.add_option('--kanjidic', dest='kanjidic')
    parser.add_option('--edict', dest='edict')
    parser.add_option('--enamdict', dest='enamdict')

    options, args = parser.parse_args()

    if len(args) == 0:
        parser.print_help()
    else:
        build(args[0], options.kanjidic, options.edict, options.enamdict)


if __name__ == '__main__':
    main()