Simplify compile script

This commit is contained in:
Alex Yatskov 2016-04-29 22:25:33 -07:00
parent a7e5a23e2d
commit 61993db702

View File

@@ -25,118 +25,118 @@ import re
PARSED_TAGS = { PARSED_TAGS = {
'Buddh': 'Buddhist term', 'Buddh',
'MA': 'martial arts term', 'MA',
'X': 'rude or X-rated term', 'X',
'abbr': 'abbreviation', 'abbr',
'adj': 'former adjective classification (being removed)', 'adj',
'adj-f': 'noun or verb acting prenominally (other than the above)', 'adj-f',
'adj-i': 'adjective (keiyoushi)', 'adj-i',
'adj-na': 'adjectival nouns or quasi-adjectives (keiyodoshi)', 'adj-na',
'adj-no': 'nouns which may take the genitive case particle "no"', 'adj-no',
'adj-pn': 'pre-noun adjectival (rentaishi)', 'adj-pn',
'adj-t': '"taru" adjective', 'adj-t',
'adv': 'adverb (fukushi)', 'adv',
'adv-n': 'adverbial noun', 'adv-n',
'adv-to': 'adverb taking the "to" particle', 'adv-to',
'arch': 'archaism', 'arch',
'ateji': 'ateji (phonetic) reading', 'ateji',
'aux': 'auxiliary', 'aux',
'aux-adj': 'auxiliary adjective', 'aux-adj',
'aux-v': 'auxiliary verb', 'aux-v',
'c': 'company name', 'c',
'chn': 'children\'s language', 'chn',
'col': 'colloquialism', 'col',
'comp': 'computer terminology', 'comp',
'conj': 'conjunction', 'conj',
'ctr': 'counter', 'ctr',
'derog': 'derogatory term', 'derog',
'eK': 'exclusively kanji', 'eK',
'ek': 'exclusively kana', 'ek',
'exp': 'Expressions (phrases, clauses, etc.)', 'exp',
'f': 'female given name', 'f',
'fam': 'familiar language', 'fam',
'fem': 'female term or language', 'fem',
'food': 'food term', 'food',
'g': 'given name, as-yet not classified by sex', 'g',
'geom': 'geometry term', 'geom',
'gikun': 'gikun (meaning) reading', 'gikun',
'gram': 'grammatical term', 'gram',
'h': 'full (usually family plus given) name of a particular person', 'h',
'hon': 'honorific or respectful (sonkeigo) language', 'hon',
'hum': 'humble (kenjougo) language', 'hum',
'iK': 'word containing irregular kanji usage', 'iK',
'id': 'idiomatic expression', 'id',
'ik': 'word containing irregular kana usage', 'ik',
'int': 'interjection (kandoushi)', 'int',
'io': 'irregular okurigana usage', 'io',
'iv': 'irregular verb', 'iv',
'ling': 'linguistics terminology', 'ling',
'm': 'male given name', 'm',
'm-sl': 'manga slang', 'm-sl',
'male': 'male term or language', 'male',
'male-sl': 'male slang', 'male-sl',
'math': 'mathematics', 'math',
'mil': 'military', 'mil',
'n': 'noun (common) (futsuumeishi)', 'n',
'n-adv': 'adverbial noun (fukushitekimeishi)', 'n-adv',
'n-pref': 'noun, used as a prefix', 'n-pref',
'n-suf': 'noun, used as a suffix', 'n-suf',
'n-t': 'noun (temporal) (jisoumeishi)', 'n-t',
'num': 'numeric', 'num',
'oK': 'word containing out-dated kanji', 'oK',
'obs': 'obsolete term', 'obs',
'obsc': 'obscure term', 'obsc',
'ok': 'out-dated or obsolete kana usage', 'ok',
'on-mim': 'onomatopoeic or mimetic word', 'on-mim',
'P': 'popular term', 'P',
'p': 'place-name', 'p',
'physics': 'physics terminology', 'physics',
'pn': 'pronoun', 'pn',
'poet': 'poetical term', 'poet',
'pol': 'polite (teineigo) language', 'pol',
'pr': 'product name', 'pr',
'pref': 'prefix', 'pref',
'prt': 'particle', 'prt',
'rare': 'rare (now replaced by "obsc")', 'rare',
's': 'surname', 's',
'sens': 'sensitive word', 'sens',
'sl': 'slang', 'sl',
'st': 'stations', 'st',
'suf': 'suffix', 'suf',
'u': 'person name, either given or surname, as-yet unclassified', 'u',
'uK': 'word usually written using kanji alone', 'uK',
'uk': 'word usually written using kana alone', 'uk',
'v1': 'Ichidan verb', 'v1',
'v2a-s': 'Nidan verb with "u" ending (archaic)', 'v2a-s',
'v4h': 'Yodan verb with "hu/fu" ending (archaic)', 'v4h',
'v4r': 'Yodan verb with "ru" ending (archaic)', 'v4r',
'v5': 'Godan verb (not completely classified)', 'v5',
'v5aru': 'Godan verb - -aru special class', 'v5aru',
'v5b': 'Godan verb with "bu" ending', 'v5b',
'v5g': 'Godan verb with "gu" ending', 'v5g',
'v5k': 'Godan verb with "ku" ending', 'v5k',
'v5k-s': 'Godan verb - iku/yuku special class', 'v5k-s',
'v5m': 'Godan verb with "mu" ending', 'v5m',
'v5n': 'Godan verb with "nu" ending', 'v5n',
'v5r': 'Godan verb with "ru" ending', 'v5r',
'v5r-i': 'Godan verb with "ru" ending (irregular verb)', 'v5r-i',
'v5s': 'Godan verb with "su" ending', 'v5s',
'v5t': 'Godan verb with "tsu" ending', 'v5t',
'v5u': 'Godan verb with "u" ending', 'v5u',
'v5u-s': 'Godan verb with "u" ending (special class)', 'v5u-s',
'v5uru': 'Godan verb - uru old class verb (old form of Eru)', 'v5uru',
'v5z': 'Godan verb with "zu" ending', 'v5z',
'vi': 'intransitive verb', 'vi',
'vk': 'kuru verb - special class', 'vk',
'vn': 'irregular nu verb', 'vn',
'vs': 'noun or participle which takes the aux. verb suru', 'vs',
'vs-c': 'su verb - precursor to the modern suru', 'vs-c',
'vs-i': 'suru verb - irregular', 'vs-i',
'vs-s': 'suru verb - special class', 'vs-s',
'vt': 'transitive ver', 'vt',
'vulg': 'vulgar expression or word', 'vulg',
'vz': 'Ichidan verb - zuru verb - (alternative form of -jiru verbs)', 'vz'
} }
@@ -184,7 +184,7 @@ def parse_edict(path):
dfn_match = re.search(r'^((?:\((?:[\w\-\,\:]*)*\)\s*)*)(.*)$', dfn) dfn_match = re.search(r'^((?:\((?:[\w\-\,\:]*)*\)\s*)*)(.*)$', dfn)
tags_raw = set(filter(None, re.split(r'[\s\(\),]', dfn_match.group(1)))) tags_raw = set(filter(None, re.split(r'[\s\(\),]', dfn_match.group(1))))
tags_raw = tags_raw.intersection(set(PARSED_TAGS.keys())) tags_raw = tags_raw.intersection(PARSED_TAGS)
tags = tags.union(tags_raw) tags = tags.union(tags_raw)
gloss = dfn_match.group(2).strip() gloss = dfn_match.group(2).strip()