From cfab4c31eca220ecbab1096b8d11ba7d0d45ed26 Mon Sep 17 00:00:00 2001 From: Alex Yatskov Date: Tue, 12 Apr 2016 20:17:40 -0700 Subject: [PATCH] Updating dictionary format again --- util/compile.py | 219 +++++++++++++++++++++++++++++------------------- 1 file changed, 135 insertions(+), 84 deletions(-) diff --git a/util/compile.py b/util/compile.py index 790ebfc7..7510aa9b 100755 --- a/util/compile.py +++ b/util/compile.py @@ -18,80 +18,114 @@ import codecs +import json import optparse import os.path import re PARSED_TAGS = { - 'P', # common word - 'adj', # former adjective classification (being removed) - 'adj-f', # noun or verb acting prenominally (other than the above) - 'adj-i', # adjective (keiyoushi) - 'adj-na', # adjectival nouns or quasi-adjectives (keiyodoshi) - 'adj-no', # nouns which may take the genitive case particle `no' - 'adj-pn', # pre-noun adjectival (rentaishi) - 'adj-t', # `taru' adjective - 'adv', # adverb (fukushi) - 'adv-n', # adverbial noun - 'adv-to', # adverb taking the `to' particle - 'aux', # auxiliary - 'aux-adj', # auxiliary adjective - 'aux-v', # auxiliary verb - 'c', # company name - 'conj', # conjunction - 'ctr', # counter - 'exp', # Expressions (phrases, clauses, etc.) 
- 'f', # female given name - 'g', # given name, as-yet not classified by sex - 'h', # full (usually family plus given) name of a particular person - 'int', # interjection (kandoushi) - 'iv', # irregular verb - 'm', # male given name - 'n', # noun (common) (futsuumeishi) - 'n-adv', # adverbial noun (fukushitekimeishi) - 'n-pref', # noun, used as a prefix - 'n-suf', # noun, used as a suffix - 'n-t', # noun (temporal) (jisoumeishi) - 'num', # numeric - 'p', # place-name - 'pn', # pronoun - 'pr', # product name - 'pref' , # prefix - 'prt', # particle - 's', # surname - 'st', # stations - 'suf', # suffix - 'u', # person name, either given or surname, as-yet unclassified - 'v1', # Ichidan verb - 'v2a-s', # Nidan verb with 'u' ending (archaic) - 'v4h', # Yodan verb with `hu/fu' ending (archaic) - 'v4r', # Yodan verb with `ru' ending (archaic) - 'v5', # Godan verb (not completely classified) - 'v5aru', # Godan verb - -aru special class - 'v5b', # Godan verb with `bu' ending - 'v5g', # Godan verb with `gu' ending - 'v5k', # Godan verb with `ku' ending - 'v5k-s', # Godan verb - iku/yuku special class - 'v5m', # Godan verb with `mu' ending - 'v5n', # Godan verb with `nu' ending - 'v5r', # Godan verb with `ru' ending - 'v5r-i', # Godan verb with `ru' ending (irregular verb) - 'v5s', # Godan verb with `su' ending - 'v5t', # Godan verb with `tsu' ending - 'v5u', # Godan verb with `u' ending - 'v5u-s', # Godan verb with `u' ending (special class) - 'v5uru', # Godan verb - uru old class verb (old form of Eru) - 'v5z', # Godan verb with `zu' ending - 'vi', # intransitive verb - 'vk', # kuru verb - special class - 'vn', # irregular nu verb - 'vs', # noun or participle which takes the aux. 
verb suru - 'vs-c', # su verb - precursor to the modern suru - 'vs-i', # suru verb - irregular - 'vs-s', # suru verb - special class - 'vt', # transitive verb - 'vz', # Ichidan verb - zuru verb - (alternative form of -jiru verbs) + 'Buddh': 'Buddhist term', + 'MA': 'martial arts term', + 'X': 'rude or X-rated term', + 'abbr': 'abbreviation', + 'adj': 'former adjective classification (being removed)', + 'adj-f': 'noun or verb acting prenominally (other than the above)', + 'adj-i': 'adjective (keiyoushi)', + 'adj-na': 'adjectival nouns or quasi-adjectives (keiyodoshi)', + 'adj-no': 'nouns which may take the genitive case particle "no"', + 'adj-pn': 'pre-noun adjectival (rentaishi)', + 'adj-t': '"taru" adjective', + 'adv': 'adverb (fukushi)', + 'adv-n': 'adverbial noun', + 'adv-to': 'adverb taking the "to" particle', + 'arch': 'archaism', + 'ateji': 'ateji (phonetic) reading', + 'aux': 'auxiliary', + 'aux-adj': 'auxiliary adjective', + 'aux-v': 'auxiliary verb', + 'chn': 'children\'s language', + 'col': 'colloquialism', + 'comp': 'computer terminology', + 'conj': 'conjunction', + 'ctr': 'counter', + 'derog': 'derogatory term', + 'eK': 'exclusively kanji', + 'ek': 'exclusively kana', + 'exp': 'Expressions (phrases, clauses, etc.)', + 'fam': 'familiar language', + 'fem': 'female term or language', + 'food': 'food term', + 'geom': 'geometry term', + 'gikun': 'gikun (meaning) reading', + 'gram': 'grammatical term', + 'hon': 'honorific or respectful (sonkeigo) language', + 'hum': 'humble (kenjougo) language', + 'iK': 'word containing irregular kanji usage', + 'id': 'idiomatic expression', + 'ik': 'word containing irregular kana usage', + 'int': 'interjection (kandoushi)', + 'io': 'irregular okurigana usage', + 'iv': 'irregular verb', + 'ling': 'linguistics terminology', + 'm-sl': 'manga slang', + 'male': 'male term or language', + 'male-sl': 'male slang', + 'math': 'mathematics', + 'mil': 'military', + 'n': 'noun (common) (futsuumeishi)', + 'n-adv': 'adverbial noun 
(fukushitekimeishi)', + 'n-pref': 'noun, used as a prefix', + 'n-suf': 'noun, used as a suffix', + 'n-t': 'noun (temporal) (jisoumeishi)', + 'num': 'numeric', + 'oK': 'word containing out-dated kanji', + 'obs': 'obsolete term', + 'obsc': 'obscure term', + 'ok': 'out-dated or obsolete kana usage', + 'on-mim': 'onomatopoeic or mimetic word', + 'physics': 'physics terminology', + 'pn': 'pronoun', + 'poet': 'poetical term', + 'pol': 'polite (teineigo) language', + 'pref': 'prefix', + 'prt': 'particle', + 'rare': 'rare (now replaced by "obsc")', + 'sens': 'sensitive word', + 'sl': 'slang', + 'suf': 'suffix', + 'uK': 'word usually written using kanji alone', + 'uk': 'word usually written using kana alone', + 'v1': 'Ichidan verb', + 'v2a-s': 'Nidan verb with "u" ending (archaic)', + 'v4h': 'Yodan verb with "hu/fu" ending (archaic)', + 'v4r': 'Yodan verb with "ru" ending (archaic)', + 'v5': 'Godan verb (not completely classified)', + 'v5aru': 'Godan verb - -aru special class', + 'v5b': 'Godan verb with "bu" ending', + 'v5g': 'Godan verb with "gu" ending', + 'v5k': 'Godan verb with "ku" ending', + 'v5k-s': 'Godan verb - iku/yuku special class', + 'v5m': 'Godan verb with "mu" ending', + 'v5n': 'Godan verb with "nu" ending', + 'v5r': 'Godan verb with "ru" ending', + 'v5r-i': 'Godan verb with "ru" ending (irregular verb)', + 'v5s': 'Godan verb with "su" ending', + 'v5t': 'Godan verb with "tsu" ending', + 'v5u': 'Godan verb with "u" ending', + 'v5u-s': 'Godan verb with "u" ending (special class)', + 'v5uru': 'Godan verb - uru old class verb (old form of Eru)', + 'v5z': 'Godan verb with "zu" ending', + 'vi': 'intransitive verb', + 'vk': 'kuru verb - special class', + 'vn': 'irregular nu verb', + 'vs': 'noun or participle which takes the aux. 
verb suru', + 'vs-c': 'su verb - precursor to the modern suru', + 'vs-i': 'suru verb - irregular', + 'vs-s': 'suru verb - special class', + 'vt': 'transitive verb', + 'vulg': 'vulgar expression or word', + 'vz': 'Ichidan verb - zuru verb - (alternative form of -jiru verbs)', } @@ -128,32 +162,49 @@ def parse_edict(path): for line in load_definitions(path): segments = line.split('/') - expression = segments[0].split(' ') - term = expression[0] - match = re.search('\[([^\]]+)\]', expression[1]) - reading = '' if match is None else match.group(1) - - glossary = '; '.join(filter(lambda x: len(x) > 0, segments[1:])) - glossary = re.sub('\(\d+\)\s*', '', glossary) + exp_parts = segments[0].split(' ') + expression = exp_parts[0] + reading_match = re.search('\[([^\]]+)\]', exp_parts[1]) + reading = None if reading_match is None else reading_match.group(1) + defs = [] tags = [] - for group in re.findall('\(([^\)\]]+)\)', glossary): - tags.extend(group.split(',')) - tags = set(tags).intersection(PARSED_TAGS) - tags = ' '.join(tags) + for index, dfn in enumerate(filter(None, segments[1:])): + dfn_match = re.search(r'^((?:\((?:[\w\-\,\:]*)*\)\s+)*)(.*)$', dfn) + gloss = dfn_match.group(2) - results.append((term, reading, glossary, tags)) + if index == 0: + tags_raw = set(filter(None, re.split(r'[\s\(\),]', dfn_match.group(1)))) + tags = tags_raw.intersection(set(PARSED_TAGS.keys())) - return results[1:] + if index == 0 or len(dfn_match.group(1)) > 0: + defs.append([gloss]) + else: + defs[-1].append(gloss) + + result = [expression, reading, ' '.join(tags)] + result += map(lambda x: '; '.join(x), defs) + + results.append(result) + + indices = {} + for i, d in enumerate(results): + for key in d[:2]: + if key is not None: + values = indices.get(key, []) + values.append(i) + indices[key] = values + + return {'defs': results, 'indices': indices} def build_dict(output_dir, input_file, parser): if input_file is not None: base = os.path.splitext(os.path.basename(input_file))[0] - with 
codecs.open(os.path.join(output_dir, base) + '.csv', 'wb', encoding='utf-8') as fp: - for d in parser(input_file): - fp.write('\t'.join(d) + '\n') + with open(os.path.join(output_dir, base) + '.json', 'w') as fp: + # json.dump(parser(input_file), fp, sort_keys=True, indent=4, separators=(',', ': ')) + json.dump(parser(input_file), fp, separators=(',', ':')) def build(dict_dir, kanjidic, edict, enamdict):