yomichan/util/compile.py

#!/usr/bin/env python

# Copyright (C) 2016  Alex Yatskov <alex@foosoft.net>
# Author: Alex Yatskov <alex@foosoft.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import codecs
import optparse
import os.path
import re


# Markers that parse_edict() keeps as tags: every parenthesised,
# comma-separated marker found in a glossary is intersected with this set,
# so anything not listed here is dropped from the tag column.
PARSED_TAGS = {
    'P',       # common word
    'adj',     # former adjective classification (being removed)
    'adj-f',   # noun or verb acting prenominally (other than the above)
    'adj-i',   # adjective (keiyoushi)
    'adj-na',  # adjectival nouns or quasi-adjectives (keiyodoshi)
    'adj-no',  # nouns which may take the genitive case particle `no'
    'adj-pn',  # pre-noun adjectival (rentaishi)
    'adj-t',   # `taru' adjective
    'adv',     # adverb (fukushi)
    'adv-n',   # adverbial noun
    'adv-to',  # adverb taking the `to' particle
    'aux',     # auxiliary
    'aux-adj', # auxiliary adjective
    'aux-v',   # auxiliary verb
    'c',       # company name
    'conj',    # conjunction
    'ctr',     # counter
    'exp',     # Expressions (phrases, clauses, etc.)
    'f',       # female given name
    'g',       # given name, as-yet not classified by sex
    'h',       # full (usually family plus given) name of a particular person
    'int',     # interjection (kandoushi)
    'iv',      # irregular verb
    'm',       # male given name
    'n',       # noun (common) (futsuumeishi)
    'n-adv',   # adverbial noun (fukushitekimeishi)
    'n-pref',  # noun, used as a prefix
    'n-suf',   # noun, used as a suffix
    'n-t',     # noun (temporal) (jisoumeishi)
    'num',     # numeric
    'p',       # place-name
    'pn',      # pronoun
    'pr',      # product name
    'pref' ,   # prefix
    'prt',     # particle
    's',       # surname
    'st',      # stations
    'suf',     # suffix
    'u',       # person name, either given or surname, as-yet unclassified
    'v1',      # Ichidan verb
    'v2a-s',   # Nidan verb with 'u' ending (archaic)
    'v4h',     # Yodan verb with `hu/fu' ending (archaic)
    'v4r',     # Yodan verb with `ru' ending (archaic)
    'v5',      # Godan verb (not completely classified)
    'v5aru',   # Godan verb - -aru special class
    'v5b',     # Godan verb with `bu' ending
    'v5g',     # Godan verb with `gu' ending
    'v5k',     # Godan verb with `ku' ending
    'v5k-s',   # Godan verb - iku/yuku special class
    'v5m',     # Godan verb with `mu' ending
    'v5n',     # Godan verb with `nu' ending
    'v5r',     # Godan verb with `ru' ending
    'v5r-i',   # Godan verb with `ru' ending (irregular verb)
    'v5s',     # Godan verb with `su' ending
    'v5t',     # Godan verb with `tsu' ending
    'v5u',     # Godan verb with `u' ending
    'v5u-s',   # Godan verb with `u' ending (special class)
    'v5uru',   # Godan verb - uru old class verb (old form of Eru)
    'v5z',     # Godan verb with `zu' ending
    'vi',      # intransitive verb
    'vk',      # kuru verb - special class
    'vn',      # irregular nu verb
    'vs',      # noun or participle which takes the aux. verb suru
    'vs-c',    # su verb - precursor to the modern suru
    'vs-i',    # suru verb - irregular
    'vs-s',    # suru verb - special class
    'vt',      # transitive verb
    'vz',      # Ichidan verb - zuru verb - (alternative form of -jiru verbs)
}


def is_hiragana(c):
    """Return True when the single character *c* has a code point in U+3040..U+309F."""
    code_point = ord(c)
    return code_point >= 0x3040 and code_point < 0x30a0


def is_katakana(c):
    """Return True when the single character *c* has a code point in U+30A0..U+30FF."""
    lower_bound, upper_bound = 0x30a0, 0x3100
    return lower_bound <= ord(c) < upper_bound


def load_definitions(path):
    """Read an EUC-JP encoded text file and return its meaningful lines.

    A progress message naming *path* is printed first. The whole file is then
    decoded and split into lines; empty lines and lines whose first character
    is '#' are dropped.

    Returns:
        A list of the remaining lines, without line terminators. A real list
        is returned (rather than a bare ``filter()``) so that the result can
        be indexed, measured and iterated repeatedly on Python 3 as well as
        on Python 2.
    """
    print('Parsing "{0}"...'.format(path))
    with codecs.open(path, encoding='euc-jp') as fp:
        text = fp.read()

    # Filtering happens after the file is closed; only the decoded text is needed.
    return [line for line in text.splitlines() if line and line[0] != '#']


def parse_kanji_dic(path):
    """Parse a kanji dictionary file into per-character records.

    Each line returned by load_definitions() is split on whitespace. The
    first field is taken as the character; the remaining fields are
    classified by the script of the characters they contain.

    Returns:
        A list of ``(character, kunyomi, onyomi, glossary)`` tuples where
        *kunyomi* is the ', '-joined fields containing at least one hiragana
        character, *onyomi* is the ', '-joined fields containing at least one
        katakana character, and *glossary* is the '; '-joined contents of
        every ``{...}`` group on the line.
    """
    meaning_pattern = re.compile(r'\{([^\}]+)\}')
    results = []

    for line in load_definitions(path):
        segments = line.split()
        if not segments:
            # A whitespace-only line has no character field; skip it rather
            # than fail on segments[0].
            continue

        character, fields = segments[0], segments[1:]

        # any() rather than a nested filter(): on Python 3 filter() returns an
        # iterator object that is always truthy, which would accept every
        # field regardless of its script.
        kunyomi = ', '.join(f for f in fields if any(is_hiragana(c) for c in f))
        onyomi = ', '.join(f for f in fields if any(is_katakana(c) for c in f))
        glossary = '; '.join(meaning_pattern.findall(line))
        results.append((character, kunyomi, onyomi, glossary))

    return results


def parse_edict(path):
    """Parse an EDICT-style file into term records.

    Each line returned by load_definitions() is split on '/'. The part before
    the first slash is the header (term, optionally followed by a reading in
    square brackets); the non-empty parts after it form the glossary.

    Returns:
        A list of ``(term, reading, glossary, tags)`` tuples. *reading* is ''
        when the header carries no ``[...]`` group. *glossary* is the
        '; '-joined glosses with sense numbers such as ``(1)`` removed.
        *tags* is a space-separated, alphabetically sorted string of the
        parenthesised markers in the glossary that appear in PARSED_TAGS.
        The record built from the first line is not included in the result.
    """
    reading_pattern = re.compile(r'\[([^\]]+)\]')
    sense_number_pattern = re.compile(r'\(\d+\)\s*')
    marker_pattern = re.compile(r'\(([^\)\]]+)\)')

    results = []
    for line in load_definitions(path):
        segments = line.split('/')

        expression = segments[0].split(' ')
        term = expression[0]
        # A header without any space has no second field; treat that as
        # "no reading" instead of failing on expression[1].
        match = reading_pattern.search(expression[1]) if len(expression) > 1 else None
        reading = '' if match is None else match.group(1)

        glossary = '; '.join(gloss for gloss in segments[1:] if gloss)
        glossary = sense_number_pattern.sub('', glossary)

        markers = set()
        for group in marker_pattern.findall(glossary):
            markers.update(group.split(','))

        # Sorted so the generated output is identical from run to run; plain
        # set iteration order is not stable across interpreter invocations.
        tags = ' '.join(sorted(markers.intersection(PARSED_TAGS)))

        results.append((term, reading, glossary, tags))

    # The first record is discarded -- presumably the file's header entry
    # (NOTE(review): confirm against the input files).
    return results[1:]


def build_dict(output_dir, input_file, parser):
    """Convert one dictionary source file into a tab-separated '.csv' file.

    Does nothing when *input_file* is None. Otherwise the output file is
    created in *output_dir*, named after the base name of *input_file* with
    its extension replaced by '.csv', and encoded as UTF-8. Every record
    produced by ``parser(input_file)`` becomes one line whose fields are
    joined with tabs.
    """
    if input_file is None:
        return

    stem, _ = os.path.splitext(os.path.basename(input_file))
    output_path = os.path.join(output_dir, stem) + '.csv'

    with codecs.open(output_path, 'wb', encoding='utf-8') as fp:
        for record in parser(input_file):
            row = '\t'.join(record)
            fp.write(row + '\n')


def build(dict_dir, kanjidic, edict, enamdict):
    """Convert each supplied source file into *dict_dir*.

    *kanjidic* is handled by parse_kanji_dic; *edict* and *enamdict* are both
    handled by parse_edict. The conversions run in that order, and
    build_dict() ignores any source given as None.
    """
    jobs = (
        (kanjidic, parse_kanji_dic),
        (edict, parse_edict),
        (enamdict, parse_edict),
    )
    for source, parse in jobs:
        build_dict(dict_dir, source, parse)


def main():
    """Command-line entry point.

    Accepts the optional ``--kanjidic``, ``--edict`` and ``--enamdict``
    options plus a positional output directory. Without any positional
    argument the usage help is printed; otherwise the first positional
    argument is passed to build() together with the three option values.
    """
    parser = optparse.OptionParser()
    for name in ('kanjidic', 'edict', 'enamdict'):
        parser.add_option('--' + name, dest=name)

    options, args = parser.parse_args()

    if not args:
        parser.print_help()
        return

    build(args[0], options.kanjidic, options.edict, options.enamdict)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
Adding source dictionary files 2016-03-20 18:05:44 +00:00			`#!/usr/bin/env python`

Adding converted json dictionaries 2016-03-20 18:43:28 +00:00			`# Copyright (C) 2016 Alex Yatskov <alex@foosoft.net>`
			`# Author: Alex Yatskov <alex@foosoft.net>`
Adding source dictionary files 2016-03-20 18:05:44 +00:00			`#`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <http://www.gnu.org/licenses/>.`


			`import codecs`
			`import optparse`
Adding converted json dictionaries 2016-03-20 18:43:28 +00:00			`import os.path`
Adding source dictionary files 2016-03-20 18:05:44 +00:00			`import re`


			`PARSED_TAGS = {`
			`'P', # common word`
			`'adj', # former adjective classification (being removed)`
			`'adj-f', # noun or verb acting prenominally (other than the above)`
			`'adj-i', # adjective (keiyoushi)`
			`'adj-na', # adjectival nouns or quasi-adjectives (keiyodoshi)`
			'adj-no', # nouns which may take the genitive case particle `no'
			`'adj-pn', # pre-noun adjectival (rentaishi)`
			'adj-t', # `taru' adjective
			`'adv', # adverb (fukushi)`
			`'adv-n', # adverbial noun`
			'adv-to', # adverb taking the `to' particle
			`'aux', # auxiliary`
			`'aux-adj', # auxiliary adjective`
			`'aux-v', # auxiliary verb`
			`'c', # company name`
			`'conj', # conjunction`
			`'ctr', # counter`
			`'exp', # Expressions (phrases, clauses, etc.)`
			`'f', # female given name`
			`'g', # given name, as-yet not classified by sex`
			`'h', # full (usually family plus given) name of a particular person`
			`'int', # interjection (kandoushi)`
			`'iv', # irregular verb`
			`'m', # male given name`
			`'n', # noun (common) (futsuumeishi)`
			`'n-adv', # adverbial noun (fukushitekimeishi)`
			`'n-pref', # noun, used as a prefix`
			`'n-suf', # noun, used as a suffix`
			`'n-t', # noun (temporal) (jisoumeishi)`
			`'num', # numeric`
			`'p', # place-name`
			`'pn', # pronoun`
			`'pr', # product name`
			`'pref' , # prefix`
			`'prt', # particle`
			`'s', # surname`
			`'st', # stations`
			`'suf', # suffix`
			`'u', # person name, either given or surname, as-yet unclassified`
			`'v1', # Ichidan verb`
			`'v2a-s', # Nidan verb with 'u' ending (archaic)`
			'v4h', # Yodan verb with `hu/fu' ending (archaic)
			'v4r', # Yodan verb with `ru' ending (archaic)
			`'v5', # Godan verb (not completely classified)`
			`'v5aru', # Godan verb - -aru special class`
			'v5b', # Godan verb with `bu' ending
			'v5g', # Godan verb with `gu' ending
			'v5k', # Godan verb with `ku' ending
			`'v5k-s', # Godan verb - iku/yuku special class`
			'v5m', # Godan verb with `mu' ending
			'v5n', # Godan verb with `nu' ending
			'v5r', # Godan verb with `ru' ending
			'v5r-i', # Godan verb with `ru' ending (irregular verb)
			'v5s', # Godan verb with `su' ending
			'v5t', # Godan verb with `tsu' ending
			'v5u', # Godan verb with `u' ending
			'v5u-s', # Godan verb with `u' ending (special class)
			`'v5uru', # Godan verb - uru old class verb (old form of Eru)`
			'v5z', # Godan verb with `zu' ending
			`'vi', # intransitive verb`
			`'vk', # kuru verb - special class`
			`'vn', # irregular nu verb`
			`'vs', # noun or participle which takes the aux. verb suru`
			`'vs-c', # su verb - precursor to the modern suru`
			`'vs-i', # suru verb - irregular`
			`'vs-s', # suru verb - special class`
			`'vt', # transitive verb`
			`'vz', # Ichidan verb - zuru verb - (alternative form of -jiru verbs)`
			`}`


Adding converted json dictionaries 2016-03-20 18:43:28 +00:00			`def is_hiragana(c):`
Adding source dictionary files 2016-03-20 18:05:44 +00:00			`return 0x3040 <= ord(c) < 0x30a0`


Adding converted json dictionaries 2016-03-20 18:43:28 +00:00			`def is_katakana(c):`
Adding source dictionary files 2016-03-20 18:05:44 +00:00			`return 0x30a0 <= ord(c) < 0x3100`


Adding converted json dictionaries 2016-03-20 18:43:28 +00:00			`def load_definitions(path):`
			`print('Parsing "{0}"...'.format(path))`
Adding source dictionary files 2016-03-20 18:05:44 +00:00			`with codecs.open(path, encoding='euc-jp') as fp:`
			`return filter(lambda x: x and x[0] != '#', fp.read().splitlines())`


Adding converted json dictionaries 2016-03-20 18:43:28 +00:00			`def parse_kanji_dic(path):`
Moving large files to CSV format, deleting unused kradfile 2016-04-01 03:03:39 +00:00			`results = []`
Adding source dictionary files 2016-03-20 18:05:44 +00:00
Adding converted json dictionaries 2016-03-20 18:43:28 +00:00			`for line in load_definitions(path):`
Adding source dictionary files 2016-03-20 18:05:44 +00:00			`segments = line.split()`
			`character = segments[0]`
Adding converted json dictionaries 2016-03-20 18:43:28 +00:00			`kunyomi = ', '.join(filter(lambda x: filter(is_hiragana, x), segments[1:]))`
			`onyomi = ', '.join(filter(lambda x: filter(is_katakana, x), segments[1:]))`
Adding source dictionary files 2016-03-20 18:05:44 +00:00			`glossary = '; '.join(re.findall('\{([^\}]+)\}', line))`
Moving large files to CSV format, deleting unused kradfile 2016-04-01 03:03:39 +00:00			`results.append((character, kunyomi, onyomi, glossary))`
Adding source dictionary files 2016-03-20 18:05:44 +00:00
			`return results`


Adding converted json dictionaries 2016-03-20 18:43:28 +00:00			`def parse_edict(path):`
Moving large files to CSV format, deleting unused kradfile 2016-04-01 03:03:39 +00:00			`results = []`
Adding converted json dictionaries 2016-03-20 18:43:28 +00:00			`for line in load_definitions(path):`
Adding source dictionary files 2016-03-20 18:05:44 +00:00			`segments = line.split('/')`

			`expression = segments[0].split(' ')`
			`term = expression[0]`
			`match = re.search('\[([^\]]+)\]', expression[1])`
Moving large files to CSV format, deleting unused kradfile 2016-04-01 03:03:39 +00:00			`reading = '' if match is None else match.group(1)`
Adding source dictionary files 2016-03-20 18:05:44 +00:00
Dictionary updates 2016-03-20 20:23:21 +00:00			`glossary = '; '.join(filter(lambda x: len(x) > 0, segments[1:]))`
			`glossary = re.sub('\(\d+\)\s*', '', glossary)`
Adding source dictionary files 2016-03-20 18:05:44 +00:00
Adding converted json dictionaries 2016-03-20 18:43:28 +00:00			`tags = []`
Adding source dictionary files 2016-03-20 18:05:44 +00:00			`for group in re.findall('\(([^\)\]]+)\)', glossary):`
			`tags.extend(group.split(','))`

Dictionary updates 2016-03-20 20:23:21 +00:00			`tags = set(tags).intersection(PARSED_TAGS)`
			`tags = ' '.join(tags)`
Adding source dictionary files 2016-03-20 18:05:44 +00:00
Moving large files to CSV format, deleting unused kradfile 2016-04-01 03:03:39 +00:00			`results.append((term, reading, glossary, tags))`
Simple lookup now works. 2016-03-21 00:15:40 +00:00
Moving large files to CSV format, deleting unused kradfile 2016-04-01 03:03:39 +00:00			`return results[1:]`
Adding source dictionary files 2016-03-20 18:05:44 +00:00

Adding converted json dictionaries 2016-03-20 18:43:28 +00:00			`def build_dict(output_dir, input_file, parser):`
			`if input_file is not None:`
			`base = os.path.splitext(os.path.basename(input_file))[0]`
Moving large files to CSV format, deleting unused kradfile 2016-04-01 03:03:39 +00:00			`with codecs.open(os.path.join(output_dir, base) + '.csv', 'wb', encoding='utf-8') as fp:`
			`for d in parser(input_file):`
			`fp.write('\t'.join(d) + '\n')`
Adding source dictionary files 2016-03-20 18:05:44 +00:00

Moving large files to CSV format, deleting unused kradfile 2016-04-01 03:03:39 +00:00			`def build(dict_dir, kanjidic, edict, enamdict):`
Adding converted json dictionaries 2016-03-20 18:43:28 +00:00			`build_dict(dict_dir, kanjidic, parse_kanji_dic)`
			`build_dict(dict_dir, edict, parse_edict)`
			`build_dict(dict_dir, enamdict, parse_edict)`
Adding source dictionary files 2016-03-20 18:05:44 +00:00

			`def main():`
			`parser = optparse.OptionParser()`
			`parser.add_option('--kanjidic', dest='kanjidic')`
			`parser.add_option('--edict', dest='edict')`
			`parser.add_option('--enamdict', dest='enamdict')`

			`options, args = parser.parse_args()`

			`if len(args) == 0:`
			`parser.print_help()`
			`else:`
Moving large files to CSV format, deleting unused kradfile 2016-04-01 03:03:39 +00:00			`build(args[0], options.kanjidic, options.edict, options.enamdict)`
Adding source dictionary files 2016-03-20 18:05:44 +00:00

			`if __name__ == '__main__':`
			`main()`