256 lines
8.9 KiB
Python
Executable File
256 lines
8.9 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
# Copyright (C) 2016 Alex Yatskov <alex@foosoft.net>
|
|
# Author: Alex Yatskov <alex@foosoft.net>
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
import codecs
|
|
import json
|
|
import optparse
|
|
import os.path
|
|
import re
|
|
|
|
|
|
# Tags recognised while parsing dictionary entries, mapped to a human-readable
# description. parse_edict() keeps only tags whose key appears in this table;
# within this file the descriptions themselves are never read.
PARSED_TAGS = {
    'Buddh': 'Buddhist term',
    'MA': 'martial arts term',
    'X': 'rude or X-rated term',
    'abbr': 'abbreviation',
    'adj': 'former adjective classification (being removed)',
    'adj-f': 'noun or verb acting prenominally (other than the above)',
    'adj-i': 'adjective (keiyoushi)',
    'adj-na': 'adjectival nouns or quasi-adjectives (keiyodoshi)',
    'adj-no': 'nouns which may take the genitive case particle "no"',
    'adj-pn': 'pre-noun adjectival (rentaishi)',
    'adj-t': '"taru" adjective',
    'adv': 'adverb (fukushi)',
    'adv-n': 'adverbial noun',
    'adv-to': 'adverb taking the "to" particle',
    'arch': 'archaism',
    'ateji': 'ateji (phonetic) reading',
    'aux': 'auxiliary',
    'aux-adj': 'auxiliary adjective',
    'aux-v': 'auxiliary verb',
    'c': 'company name',
    'chn': 'children\'s language',
    'col': 'colloquialism',
    'comp': 'computer terminology',
    'conj': 'conjunction',
    'ctr': 'counter',
    'derog': 'derogatory term',
    'eK': 'exclusively kanji',
    'ek': 'exclusively kana',
    'exp': 'Expressions (phrases, clauses, etc.)',
    'f': 'female given name',
    'fam': 'familiar language',
    'fem': 'female term or language',
    'food': 'food term',
    'g': 'given name, as-yet not classified by sex',
    'geom': 'geometry term',
    'gikun': 'gikun (meaning) reading',
    'gram': 'grammatical term',
    'h': 'full (usually family plus given) name of a particular person',
    'hon': 'honorific or respectful (sonkeigo) language',
    'hum': 'humble (kenjougo) language',
    'iK': 'word containing irregular kanji usage',
    'id': 'idiomatic expression',
    'ik': 'word containing irregular kana usage',
    'int': 'interjection (kandoushi)',
    'io': 'irregular okurigana usage',
    'iv': 'irregular verb',
    'ling': 'linguistics terminology',
    'm': 'male given name',
    'm-sl': 'manga slang',
    'male': 'male term or language',
    'male-sl': 'male slang',
    'math': 'mathematics',
    'mil': 'military',
    'n': 'noun (common) (futsuumeishi)',
    'n-adv': 'adverbial noun (fukushitekimeishi)',
    'n-pref': 'noun, used as a prefix',
    'n-suf': 'noun, used as a suffix',
    'n-t': 'noun (temporal) (jisoumeishi)',
    'num': 'numeric',
    'oK': 'word containing out-dated kanji',
    'obs': 'obsolete term',
    'obsc': 'obscure term',
    'ok': 'out-dated or obsolete kana usage',
    'on-mim': 'onomatopoeic or mimetic word',
    'P': 'popular term',
    'p': 'place-name',
    'physics': 'physics terminology',
    'pn': 'pronoun',
    'poet': 'poetical term',
    'pol': 'polite (teineigo) language',
    'pr': 'product name',
    'pref': 'prefix',
    'prt': 'particle',
    'rare': 'rare (now replaced by "obsc")',
    's': 'surname',
    'sens': 'sensitive word',
    'sl': 'slang',
    'st': 'stations',
    'suf': 'suffix',
    'u': 'person name, either given or surname, as-yet unclassified',
    'uK': 'word usually written using kanji alone',
    'uk': 'word usually written using kana alone',
    'v1': 'Ichidan verb',
    'v2a-s': 'Nidan verb with "u" ending (archaic)',
    'v4h': 'Yodan verb with "hu/fu" ending (archaic)',
    'v4r': 'Yodan verb with "ru" ending (archaic)',
    'v5': 'Godan verb (not completely classified)',
    # The specific Godan sub-classes below are intentionally disabled:
    # fixup_godan_verbs() rewrites every tag starting with 'v5' to plain 'v5'
    # before the tags are matched against this table.
    # 'v5aru': 'Godan verb - -aru special class',
    # 'v5b': 'Godan verb with "bu" ending',
    # 'v5g': 'Godan verb with "gu" ending',
    # 'v5k': 'Godan verb with "ku" ending',
    # 'v5k-s': 'Godan verb - iku/yuku special class',
    # 'v5m': 'Godan verb with "mu" ending',
    # 'v5n': 'Godan verb with "nu" ending',
    # 'v5r': 'Godan verb with "ru" ending',
    # 'v5r-i': 'Godan verb with "ru" ending (irregular verb)',
    # 'v5s': 'Godan verb with "su" ending',
    # 'v5t': 'Godan verb with "tsu" ending',
    # 'v5u': 'Godan verb with "u" ending',
    # 'v5u-s': 'Godan verb with "u" ending (special class)',
    # 'v5uru': 'Godan verb - uru old class verb (old form of Eru)',
    # 'v5z': 'Godan verb with "zu" ending',
    'vi': 'intransitive verb',
    'vk': 'kuru verb - special class',
    'vn': 'irregular nu verb',
    'vs': 'noun or participle which takes the aux. verb suru',
    'vs-c': 'su verb - precursor to the modern suru',
    'vs-i': 'suru verb - irregular',
    'vs-s': 'suru verb - special class',
    'vt': 'transitive verb',
    'vulg': 'vulgar expression or word',
    'vz': 'Ichidan verb - zuru verb - (alternative form of -jiru verbs)',
}
|
|
|
|
|
|
def is_hiragana(c):
    """Return True if the single character *c* has a code point in [0x3040, 0x30a0)."""
    code_point = ord(c)
    return code_point >= 0x3040 and code_point < 0x30a0
|
|
|
|
|
|
def is_katakana(c):
    """Return True if the single character *c* has a code point in [0x30a0, 0x3100)."""
    code_point = ord(c)
    return code_point >= 0x30a0 and code_point < 0x3100
|
|
|
|
|
|
def load_definitions(path):
    """Read an EUC-JP encoded file and return its meaningful lines.

    A progress message naming *path* is printed first. The whole file is
    decoded as EUC-JP and split into lines; empty lines and lines whose first
    character is '#' are dropped.

    Args:
        path: Path of the file to read.

    Returns:
        A list of the remaining lines (without line terminators), in file
        order. A concrete list is returned on every Python version so the
        result can be iterated more than once and measured with len(); the
        previous implementation returned a single-use iterator on Python 3.
    """
    print('Parsing "{0}"...'.format(path))
    with codecs.open(path, encoding='euc-jp') as fp:
        lines = fp.read().splitlines()
    return [line for line in lines if line and not line.startswith('#')]
|
|
|
|
|
|
def parse_kanji_dic(path):
    """Parse a kanji dictionary file into a mapping keyed by character.

    Every line returned by load_definitions() is split on whitespace. The
    first field is the character. Of the remaining fields, those containing
    at least one hiragana character are collected as the kun reading and
    those containing at least one katakana character as the on reading.
    Every ``{...}`` group found on the line contributes to the glossary.

    Args:
        path: Path of the dictionary file, forwarded to load_definitions().

    Returns:
        A dict mapping each character to a ``(kunyomi, onyomi, glossary)``
        tuple. The readings are space-joined strings, or None when no field
        qualified; the glossary is a '; '-joined string (possibly empty).
    """
    # Raw string: '\{' and '\}' are not valid escapes in a normal literal and
    # trigger warnings on recent Python versions. Compiled once, outside the loop.
    gloss_pattern = re.compile(r'\{([^\}]+)\}')

    results = {}
    for line in load_definitions(path):
        segments = line.split()
        if not segments:
            # A whitespace-only line has no character field; skip it instead
            # of failing with IndexError.
            continue

        character = segments[0]
        fields = segments[1:]
        kunyomi = ' '.join(
            field for field in fields if any(is_hiragana(c) for c in field)
        )
        onyomi = ' '.join(
            field for field in fields if any(is_katakana(c) for c in field)
        )
        glossary = '; '.join(gloss_pattern.findall(line))
        results[character] = (kunyomi or None, onyomi or None, glossary)

    return results
|
|
|
|
|
|
def fixup_godan_verbs(tags):
    """Return *tags* as a set, with every tag starting with 'v5' replaced by 'v5'."""
    return {'v5' if tag.startswith('v5') else tag for tag in tags}
|
|
|
|
|
|
def parse_edict(path):
    """Parse an EDICT-style file into definition rows plus a lookup index.

    Each line returned by load_definitions() is split on '/'. The first
    segment holds the expression and, optionally, a reading in square
    brackets as its second space-separated token. Every following non-empty
    segment is a definition, optionally prefixed by parenthesised tag groups.
    A definition that carries a tag prefix starts a new sense group; one
    without a prefix is appended to the current group.

    Args:
        path: Path of the dictionary file, forwarded to load_definitions().

    Returns:
        A dict with two keys:
          'defs':    list of rows ``[expression, reading, tags, sense, ...]``
                     where *reading* may be None, *tags* is a space-joined
                     string of recognised tags in sorted order, and each
                     *sense* is a '; '-joined string of glosses.
          'indices': dict mapping every expression and every non-None
                     reading to the list of row positions in 'defs'.
    """
    # Raw strings avoid the invalid '\[' / '\]' escapes of the old literal.
    reading_pattern = re.compile(r'\[([^\]]+)\]')
    # The inner character class is repeated once only. The previous nested
    # form `(?:[...]*)*` matched exactly the same text but could backtrack
    # exponentially on a long parenthesised word not followed by ')'.
    dfn_pattern = re.compile(r'^((?:\([\w\-\,\:]*\)\s*)*)(.*)$')
    tag_separator = re.compile(r'[\s\(\),]')
    known_tags = set(PARSED_TAGS)

    results = []
    for line in load_definitions(path):
        segments = line.split('/')

        exp_parts = segments[0].split(' ')
        expression = exp_parts[0]
        # Guard against a head segment without any space (no second token).
        reading_match = None
        if len(exp_parts) > 1:
            reading_match = reading_pattern.search(exp_parts[1])
        reading = None if reading_match is None else reading_match.group(1)

        defs = []
        tags = set()

        for dfn in filter(None, segments[1:]):
            dfn_match = dfn_pattern.search(dfn)
            tag_prefix = dfn_match.group(1)

            gloss = dfn_match.group(2).strip()
            if not gloss:
                # NOTE(review): segments consisting only of parenthesised
                # markers are skipped together with their tags, as before.
                continue

            tags_raw = set(filter(None, tag_separator.split(tag_prefix)))
            tags_raw = fixup_godan_verbs(tags_raw)
            tags |= tags_raw & known_tags

            # `not defs` (rather than "first segment") also covers the case
            # where earlier segments were skipped for having an empty gloss,
            # which previously led to defs[-1] on an empty list.
            if not defs or tag_prefix:
                defs.append([gloss])
            else:
                defs[-1].append(gloss)

        # Sorted so the output does not depend on set iteration order.
        result = [expression, reading, ' '.join(sorted(tags))]
        result.extend('; '.join(sense) for sense in defs)
        results.append(result)

    indices = {}
    for position, row in enumerate(results):
        for key in row[:2]:
            if key is not None:
                indices.setdefault(key, []).append(position)

    return {'defs': results, 'indices': indices}
|
|
|
|
|
|
def build_dict(output_dir, input_file, parser):
    """Run *parser* on *input_file* and write the result as compact JSON.

    Nothing happens when *input_file* is None. Otherwise the output file is
    ``<output_dir>/<input base name without extension>.json`` and it receives
    ``parser(input_file)`` serialised without any extra whitespace.
    """
    if input_file is None:
        return

    stem, _ = os.path.splitext(os.path.basename(input_file))
    target = os.path.join(output_dir, stem) + '.json'
    with open(target, 'w') as fp:
        # The parser runs only after the output file has been opened.
        json.dump(parser(input_file), fp, separators=(',', ':'))
|
|
|
|
|
|
def build(dict_dir, kanjidic, edict, enamdict):
    """Build the JSON dictionaries for each supplied input file into *dict_dir*.

    *kanjidic* is handled by parse_kanji_dic; *edict* and *enamdict* are both
    handled by parse_edict. The inputs are processed in that order, each via
    build_dict().
    """
    jobs = (
        (kanjidic, parse_kanji_dic),
        (edict, parse_edict),
        (enamdict, parse_edict),
    )
    for source, parser in jobs:
        build_dict(dict_dir, source, parser)
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Accepts the options --kanjidic, --edict and --enamdict. The first
    positional argument is passed to build() as the output directory; when no
    positional argument is given the usage help is printed instead.
    """
    parser = optparse.OptionParser()
    for name in ('kanjidic', 'edict', 'enamdict'):
        parser.add_option('--' + name, dest=name)

    options, args = parser.parse_args()
    if args:
        build(args[0], options.kanjidic, options.edict, options.enamdict)
    else:
        parser.print_help()


if __name__ == '__main__':
    main()
|