Dictionary updates

Alex Yatskov 2016-03-20 13:23:21 -07:00
parent 4f19856f44
commit 7208872b9f


@@ -111,7 +111,7 @@ def load_definitions(path):
 def parse_kanji_dic(path):
-    results = []
+    results = {}
     for line in load_definitions(path):
         segments = line.split()
@@ -119,26 +119,25 @@ def parse_kanji_dic(path):
         kunyomi = ', '.join(filter(lambda x: filter(is_hiragana, x), segments[1:]))
         onyomi = ', '.join(filter(lambda x: filter(is_katakana, x), segments[1:]))
         glossary = '; '.join(re.findall('\{([^\}]+)\}', line))
-        results.append((character, kunyomi, onyomi, glossary))
+        results[character] = (kunyomi, onyomi, glossary)

     return results


 def parse_krad_file(path):
-    results = []
+    results = {}
     for line in load_definitions(path):
         segments = line.split(' ')
         character = segments[0]
         radicals = ' '.join(segments[2:])
-        results.append((character, radicals))
+        results[character] = radicals

     return results


 def parse_edict(path):
-    results = {}
+    defs = []
     for line in load_definitions(path):
         segments = line.split('/')
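The two hunks above change parse_kanji_dic and parse_krad_file from returning lists of (character, ...) tuples to returning dicts keyed by character, so a consumer can look a kanji up directly instead of scanning a list. A minimal sketch of the new shape, using hand-written sample entries and a hypothetical lookup_kanji helper (neither is part of this commit):

# Shape produced by the updated parse_kanji_dic: character -> (kunyomi, onyomi, glossary).
# The entry below is an illustrative sample, not real KANJIDIC output.
kanji_dict = {
    '食': ('た.べる, く.う', 'ショク, ジキ', 'eat; food'),
}

# Shape produced by the updated parse_krad_file: character -> space-joined radicals.
krad_dict = {
    '食': '人 良',
}

def lookup_kanji(character):
    # Hypothetical helper: two O(1) dict lookups replace the old list scans.
    entry = kanji_dict.get(character)
    if entry is None:
        return None
    kunyomi, onyomi, glossary = entry
    return {
        'character': character,
        'kunyomi': kunyomi,
        'onyomi': onyomi,
        'glossary': glossary,
        'radicals': krad_dict.get(character, ''),
    }

print(lookup_kanji('食'))

Keying by character also means the json.dump call in build_dict now emits a JSON object indexed by kanji rather than an array of tuples.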
@@ -147,26 +146,46 @@ def parse_edict(path):
         match = re.search('\[([^\]]+)\]', expression[1])
         reading = None if match is None else match.group(1)

-        glossary = filter(lambda x: len(x) > 0, segments[1:])
-        glossary = '; '.join(glossary)
-        glossary = re.sub('\(\d+\)\s*', str(), glossary)
+        glossary = '; '.join(filter(lambda x: len(x) > 0, segments[1:]))
+        glossary = re.sub('\(\d+\)\s*', '', glossary)

         tags = []
         for group in re.findall('\(([^\)\]]+)\)', glossary):
             tags.extend(group.split(','))
-        tags = list(set(tags).intersection(PARSED_TAGS))
-
-        defs = results.get(term, [])
-        defs.append((reading, glossary, tags))
-        results[term] = defs
-
-    return results
+        tags = set(tags).intersection(PARSED_TAGS)
+        tags = ' '.join(tags)
+
+        defs.append((term, reading, glossary, tags))
+
+    term_indices = {}
+    reading_indices = {}
+
+    for i, d in enumerate(defs):
+        term, reading = d[:2]
+
+        if term is not None:
+            term_list = term_indices.get(term, [])
+            term_list.append(i)
+            term_indices[term] = term_list
+
+        if reading is not None:
+            reading_list = reading_indices.get(reading, [])
+            reading_list.append(i)
+            reading_indices[reading] = reading_list
+
+    return {
+        'defs': defs,
+        't_idx': term_indices,
+        'r_idx': reading_indices
+    }


 def build_dict(output_dir, input_file, parser):
     if input_file is not None:
         base = os.path.splitext(os.path.basename(input_file))[0]
         with open(os.path.join(output_dir, base) + '.json', 'w') as fp:
-            # json.dump(parser(input_file), fp, sort_keys=True, indent=4, separators=(',', ': '))
             json.dump(parser(input_file), fp)
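With this change, parse_edict returns a flat list of definitions plus two side indices, 't_idx' and 'r_idx', mapping each term and each reading to the positions of its rows in 'defs'. A rough sketch of how that structure could be queried, using a hand-built example and a hypothetical find_term helper (neither is part of this commit):

import json

# Illustrative entries mirroring the new return value of parse_edict:
# 'defs' holds (term, reading, glossary, tags) rows, while 't_idx' and
# 'r_idx' map terms and readings to row positions.
edict = {
    'defs': [
        ('食べる', 'たべる', 'to eat', 'v1 vt'),
        ('食う', 'くう', 'to eat; to devour', 'v5u vt'),
    ],
    't_idx': {'食べる': [0], '食う': [1]},
    'r_idx': {'たべる': [0], 'くう': [1]},
}

def find_term(dictionary, text):
    # Hypothetical lookup: gather row positions from both indices, then
    # pull the matching rows out of 'defs'.
    indices = dictionary['t_idx'].get(text, []) + dictionary['r_idx'].get(text, [])
    return [dictionary['defs'][i] for i in sorted(set(indices))]

print(find_term(edict, 'たべる'))

# The structure survives the json.dump round trip performed by build_dict;
# the only change on reload is that tuples in 'defs' come back as lists.
restored = json.loads(json.dumps(edict))
print(find_term(restored, '食う'))

Splitting the data into one definition list plus per-key index lists keeps entries that share a term or a reading from being duplicated, at the cost of one extra indirection on lookup.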