#!/usr/bin/env python

# Copyright (C) 2016 Alex Yatskov <alex@foosoft.net>
# Author: Alex Yatskov <alex@foosoft.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import codecs
|
|
|
|
import optparse
|
2016-03-20 18:43:28 +00:00
|
|
|
import os.path
|
2016-03-20 18:05:44 +00:00
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
# Tag codes that parse_edict() keeps when it scans the parenthesised,
# comma-separated groups of a glossary; any other code is discarded.
PARSED_TAGS = {
    'P',       # common word
    'adj',     # former adjective classification (being removed)
    'adj-f',   # noun or verb acting prenominally (other than the above)
    'adj-i',   # adjective (keiyoushi)
    'adj-na',  # adjectival nouns or quasi-adjectives (keiyodoshi)
    'adj-no',  # nouns which may take the genitive case particle `no'
    'adj-pn',  # pre-noun adjectival (rentaishi)
    'adj-t',   # `taru' adjective
    'adv',     # adverb (fukushi)
    'adv-n',   # adverbial noun
    'adv-to',  # adverb taking the `to' particle
    'aux',     # auxiliary
    'aux-adj', # auxiliary adjective
    'aux-v',   # auxiliary verb
    'c',       # company name
    'conj',    # conjunction
    'ctr',     # counter
    'exp',     # Expressions (phrases, clauses, etc.)
    'f',       # female given name
    'g',       # given name, as-yet not classified by sex
    'h',       # full (usually family plus given) name of a particular person
    'int',     # interjection (kandoushi)
    'iv',      # irregular verb
    'm',       # male given name
    'n',       # noun (common) (futsuumeishi)
    'n-adv',   # adverbial noun (fukushitekimeishi)
    'n-pref',  # noun, used as a prefix
    'n-suf',   # noun, used as a suffix
    'n-t',     # noun (temporal) (jisoumeishi)
    'num',     # numeric
    'p',       # place-name
    'pn',      # pronoun
    'pr',      # product name
    'pref',    # prefix
    'prt',     # particle
    's',       # surname
    'st',      # stations
    'suf',     # suffix
    'u',       # person name, either given or surname, as-yet unclassified
    'v1',      # Ichidan verb
    'v2a-s',   # Nidan verb with 'u' ending (archaic)
    'v4h',     # Yodan verb with `hu/fu' ending (archaic)
    'v4r',     # Yodan verb with `ru' ending (archaic)
    'v5',      # Godan verb (not completely classified)
    'v5aru',   # Godan verb - -aru special class
    'v5b',     # Godan verb with `bu' ending
    'v5g',     # Godan verb with `gu' ending
    'v5k',     # Godan verb with `ku' ending
    'v5k-s',   # Godan verb - iku/yuku special class
    'v5m',     # Godan verb with `mu' ending
    'v5n',     # Godan verb with `nu' ending
    'v5r',     # Godan verb with `ru' ending
    'v5r-i',   # Godan verb with `ru' ending (irregular verb)
    'v5s',     # Godan verb with `su' ending
    'v5t',     # Godan verb with `tsu' ending
    'v5u',     # Godan verb with `u' ending
    'v5u-s',   # Godan verb with `u' ending (special class)
    'v5uru',   # Godan verb - uru old class verb (old form of Eru)
    'v5z',     # Godan verb with `zu' ending
    'vi',      # intransitive verb
    'vk',      # kuru verb - special class
    'vn',      # irregular nu verb
    'vs',      # noun or participle which takes the aux. verb suru
    'vs-c',    # su verb - precursor to the modern suru
    'vs-i',    # suru verb - irregular
    'vs-s',    # suru verb - special class
    'vt',      # transitive verb
    'vz',      # Ichidan verb - zuru verb - (alternative form of -jiru verbs)
}
|
|
|
|
|
|
|
|
|
2016-03-20 18:43:28 +00:00
|
|
|
def is_hiragana(c):
    """Return True when the single character *c* is in the Hiragana range.

    This is a plain code-point check: U+3040 inclusive up to U+30A0
    exclusive. ``ord`` raises TypeError if *c* is not exactly one character.
    """
    return 0x3040 <= ord(c) < 0x30a0
|
|
|
|
|
|
|
|
|
2016-03-20 18:43:28 +00:00
|
|
|
def is_katakana(c):
    """Return True when the single character *c* is in the Katakana range.

    This is a plain code-point check: U+30A0 inclusive up to U+3100
    exclusive. ``ord`` raises TypeError if *c* is not exactly one character.
    """
    return 0x30a0 <= ord(c) < 0x3100
|
|
|
|
|
|
|
|
|
2016-03-20 18:43:28 +00:00
|
|
|
def load_definitions(path):
    """Read an EUC-JP encoded file and return its usable lines.

    A progress message naming *path* is printed first. The file is decoded
    as EUC-JP and split into lines; empty lines and lines whose first
    character is ``#`` are dropped.

    Returns a list of the remaining lines, without line terminators.
    """
    print('Parsing "{0}"...'.format(path))
    with codecs.open(path, encoding='euc-jp') as fp:
        # Build a real list rather than returning filter(): on Python 3
        # filter() yields a one-shot iterator, on Python 2 it yields a list.
        return [
            line for line in fp.read().splitlines()
            if line and line[0] != '#'
        ]
|
|
|
|
|
|
|
|
|
2016-03-20 18:43:28 +00:00
|
|
|
def parse_kanji_dic(path):
    """Parse the file at *path* into (character, kunyomi, onyomi, glossary).

    Each line returned by load_definitions() is split on whitespace:

    * the first token is the character;
    * remaining tokens containing at least one hiragana character are
      joined with ``', '`` to form the kunyomi field;
    * remaining tokens containing at least one katakana character are
      joined with ``', '`` to form the onyomi field;
    * every ``{...}`` group on the line is joined with ``'; '`` to form
      the glossary.

    Returns a list of 4-tuples of strings, one per parsed line.
    """
    results = []

    for line in load_definitions(path):
        segments = line.split()
        if not segments:
            # Whitespace-only line: there is no character token to read.
            continue

        character = segments[0]
        fields = segments[1:]

        # any() is required here: a nested filter() is always truthy on
        # Python 3, which would put every token into both reading fields.
        kunyomi = ', '.join(
            field for field in fields
            if any(is_hiragana(c) for c in field)
        )
        onyomi = ', '.join(
            field for field in fields
            if any(is_katakana(c) for c in field)
        )

        glossary = '; '.join(re.findall(r'\{([^\}]+)\}', line))
        results.append((character, kunyomi, onyomi, glossary))

    return results
|
|
|
|
|
|
|
|
|
2016-03-20 18:43:28 +00:00
|
|
|
def parse_edict(path):
    """Parse the file at *path* into (term, reading, glossary, tags) tuples.

    Each line returned by load_definitions() is split on ``/``:

    * the text before the first ``/`` is split on spaces; its first piece
      is the term, and a ``[...]`` group in its second piece (if any) is
      the reading, otherwise the reading is ``''``;
    * the remaining non-empty pieces are joined with ``'; '`` and
      parenthesised numbers such as ``(1)`` are removed to form the
      glossary;
    * parenthesised, comma-separated codes in the glossary that appear in
      PARSED_TAGS are joined with spaces, in sorted order, to form the
      tags field.

    Returns a list of 4-tuples of strings. The entry built from the first
    parsed line is dropped.
    """
    # Compiled once; raw strings avoid invalid-escape warnings.
    reading_re = re.compile(r'\[([^\]]+)\]')
    sense_number_re = re.compile(r'\(\d+\)\s*')
    tag_group_re = re.compile(r'\(([^\)\]]+)\)')

    results = []
    for line in load_definitions(path):
        segments = line.split('/')

        expression = segments[0].split(' ')
        term = expression[0]
        # A head with no space has no second piece; treat it as having no
        # reading instead of raising IndexError.
        match = reading_re.search(expression[1]) if len(expression) > 1 else None
        reading = '' if match is None else match.group(1)

        glossary = '; '.join(segment for segment in segments[1:] if segment)
        glossary = sense_number_re.sub('', glossary)

        tags = set()
        for group in tag_group_re.findall(glossary):
            tags.update(group.split(','))
        # Sorted so the output does not depend on set iteration order.
        tags = ' '.join(sorted(tags.intersection(PARSED_TAGS)))

        results.append((term, reading, glossary, tags))

    # NOTE(review): the first entry is skipped, presumably because the
    # file's first line is a header record - confirm against the input data.
    return results[1:]
|
2016-03-20 18:05:44 +00:00
|
|
|
|
|
|
|
|
2016-03-20 18:43:28 +00:00
|
|
|
def build_dict(output_dir, input_file, parser):
    """Convert *input_file* with *parser* and write the rows to a file.

    The output is ``<output_dir>/<input base name>.csv`` encoded as UTF-8.
    Each item yielded by ``parser(input_file)`` must be an iterable of
    strings; its fields are joined with tabs and terminated by a newline.

    Nothing is done when *input_file* is None.
    """
    if input_file is None:
        return

    base = os.path.splitext(os.path.basename(input_file))[0]
    output_path = os.path.join(output_dir, base) + '.csv'

    with codecs.open(output_path, 'wb', encoding='utf-8') as fp:
        for d in parser(input_file):
            fp.write('\t'.join(d) + '\n')
|
2016-03-20 18:05:44 +00:00
|
|
|
|
|
|
|
|
2016-04-01 03:03:39 +00:00
|
|
|
def build(dict_dir, kanjidic, edict, enamdict):
    """Build the output files for each supplied dictionary into *dict_dir*.

    *kanjidic* is converted with parse_kanji_dic(); *edict* and *enamdict*
    are both converted with parse_edict(). Any of the three paths may be
    None, in which case build_dict() skips it.
    """
    build_dict(dict_dir, kanjidic, parse_kanji_dic)
    build_dict(dict_dir, edict, parse_edict)
    build_dict(dict_dir, enamdict, parse_edict)
|
2016-03-20 18:05:44 +00:00
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Accepts the options ``--kanjidic``, ``--edict`` and ``--enamdict``
    (each a path to an input file) and one positional argument, the output
    directory. When no positional argument is given the option help is
    printed and nothing is built.
    """
    parser = optparse.OptionParser()
    parser.add_option('--kanjidic', dest='kanjidic')
    parser.add_option('--edict', dest='edict')
    parser.add_option('--enamdict', dest='enamdict')

    options, args = parser.parse_args()

    if not args:
        parser.print_help()
    else:
        build(args[0], options.kanjidic, options.edict, options.enamdict)
|
2016-03-20 18:05:44 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()
|