Moving large files to CSV format, deleting unused kradfile

This commit is contained in:
Alex Yatskov 2016-03-31 20:03:39 -07:00
parent b97e75ba32
commit 7eadff3457
4 changed files with 54 additions and 87 deletions

View File

@@ -3,7 +3,6 @@
KANJIDIC=util/data/kanjidic KANJIDIC=util/data/kanjidic
EDICT=util/data/edict EDICT=util/data/edict
ENAMDICT=util/data/enamdict ENAMDICT=util/data/enamdict
KRADFILE=util/data/kradfile DICT_DIR=ext/bg/data
DICT_DIR=ext/jp/data
util/compile.py --kanjidic $KANJIDIC --kradfile $KRADFILE --edict $EDICT --enamdict $ENAMDICT $DICT_DIR util/compile.py --kanjidic $KANJIDIC --edict $EDICT --enamdict $ENAMDICT $DICT_DIR

View File

@@ -19,43 +19,33 @@
class Dictionary { class Dictionary {
constructor() { constructor() {
this.termDicts = []; this.terms = [];
this.kanjiDicts = []; this.termIndices = {};
this.kanji = [];
this.kanjiIndices = {};
} }
addTermDict(termDict) { addTermDict(terms) {
this.termDicts.push(termDict); let index = this.terms.length;
for (const [e, r, g, t] in terms) {
this.storeIndex(this.termIndices, e, index);
this.storeIndex(this.termIndices, r, index++);
this.terms.push([e, r, g, t]);
}
} }
addKanjiDict(kanjiDict) { addKanjiDict(kanji) {
this.kanjiDicts.push(kanjiDict); let index = this.kanji.length;
for (const [c, k, o, g] in kanji) {
this.storeIndex(this.kanjiIndices, c, index++);
this.kanji.push([c, k, o, g]);
}
} }
findTerm(term) { findTerm(term) {
let results = []; return (this.termIndices[term] || []).map(index => {
for (let dict of this.termDicts) { const [e, r, g, t] = this.terms[index];
results = results.concat(this.findTermInDict(term, dict));
}
return results;
}
findKanji(kanji) {
const results = [];
for (let dict of this.kanjiDicts) {
const result = this.findKanjiInDict(kanji, dict);
if (result !== null) {
results.push(result);
}
}
return results;
}
findTermInDict(term, dict) {
return (dict.indices[term] || []).map(index => {
const [e, r, g, t] = dict.defs[index];
return { return {
id: index, id: index,
expression: e, expression: e,
@@ -66,12 +56,8 @@ class Dictionary {
}); });
} }
findKanjiInDict(kanji, dict) { findKanji(kanji) {
const def = dict.defs[kanji]; return (this.kanjiIndices[kanji] || []).map(index => {
if (def === null) {
return null;
}
const [c, k, o, g] = def; const [c, k, o, g] = def;
return { return {
id: kanji.charCodeAt(0), id: kanji.charCodeAt(0),
@@ -80,5 +66,14 @@ class Dictionary {
onyomi: o, onyomi: o,
glossary: g glossary: g
}; };
});
}
storeIndex(indices, term, index) {
if (term.length > 0) {
const indices = this.termIndices[term] || [];
indices.push(term);
this.termIndices[term] = indices;
}
} }
} }

View File

@@ -27,7 +27,9 @@ class Client {
this.popup.classList.add('yomichan-popup'); this.popup.classList.add('yomichan-popup');
this.popup.addEventListener('mousedown', (e) => e.stopPropagation()); this.popup.addEventListener('mousedown', (e) => e.stopPropagation());
this.popup.addEventListener('scroll', (e) => e.stopPropagation()); this.popup.addEventListener('scroll', (e) => e.stopPropagation());
document.body.appendChild(this.popup);
const base = document.body.appendChild('div');
base.createShadowRoot().appendChild(this.popup);
chrome.runtime.onMessage.addListener(this.onMessage.bind(this)); chrome.runtime.onMessage.addListener(this.onMessage.bind(this));
window.addEventListener('mousedown', this.onMouseDown.bind(this)); window.addEventListener('mousedown', this.onMouseDown.bind(this));

View File

@@ -18,7 +18,6 @@
import codecs import codecs
import json
import optparse import optparse
import os.path import os.path
import re import re
@@ -111,7 +110,7 @@ def load_definitions(path):
def parse_kanji_dic(path): def parse_kanji_dic(path):
results = {} results = []
for line in load_definitions(path): for line in load_definitions(path):
segments = line.split() segments = line.split()
@@ -119,32 +118,20 @@ def parse_kanji_dic(path):
kunyomi = ', '.join(filter(lambda x: filter(is_hiragana, x), segments[1:])) kunyomi = ', '.join(filter(lambda x: filter(is_hiragana, x), segments[1:]))
onyomi = ', '.join(filter(lambda x: filter(is_katakana, x), segments[1:])) onyomi = ', '.join(filter(lambda x: filter(is_katakana, x), segments[1:]))
glossary = '; '.join(re.findall('\{([^\}]+)\}', line)) glossary = '; '.join(re.findall('\{([^\}]+)\}', line))
results[character] = (kunyomi, onyomi, glossary) results.append((character, kunyomi, onyomi, glossary))
return results
def parse_krad_file(path):
results = {}
for line in load_definitions(path):
segments = line.split(' ')
character = segments[0]
radicals = ' '.join(segments[2:])
results[character] = radicals;
return results return results
def parse_edict(path): def parse_edict(path):
defs = [] results = []
for line in load_definitions(path): for line in load_definitions(path):
segments = line.split('/') segments = line.split('/')
expression = segments[0].split(' ') expression = segments[0].split(' ')
term = expression[0] term = expression[0]
match = re.search('\[([^\]]+)\]', expression[1]) match = re.search('\[([^\]]+)\]', expression[1])
reading = None if match is None else match.group(1) reading = '' if match is None else match.group(1)
glossary = '; '.join(filter(lambda x: len(x) > 0, segments[1:])) glossary = '; '.join(filter(lambda x: len(x) > 0, segments[1:]))
glossary = re.sub('\(\d+\)\s*', '', glossary) glossary = re.sub('\(\d+\)\s*', '', glossary)
@@ -156,30 +143,21 @@ def parse_edict(path):
tags = set(tags).intersection(PARSED_TAGS) tags = set(tags).intersection(PARSED_TAGS)
tags = ' '.join(tags) tags = ' '.join(tags)
defs.append((term, reading, glossary, tags)) results.append((term, reading, glossary, tags))
indices = {} return results[1:]
for i, d in enumerate(defs):
for key in d[:2]:
if key is not None:
values = indices.get(key, [])
values.append(i)
indices[key] = values
return {'defs': defs, 'indices': indices}
def build_dict(output_dir, input_file, parser): def build_dict(output_dir, input_file, parser):
if input_file is not None: if input_file is not None:
base = os.path.splitext(os.path.basename(input_file))[0] base = os.path.splitext(os.path.basename(input_file))[0]
with open(os.path.join(output_dir, base) + '.json', 'w') as fp: with codecs.open(os.path.join(output_dir, base) + '.csv', 'wb', encoding='utf-8') as fp:
# json.dump(parser(input_file), fp, sort_keys=True, indent=4, separators=(',', ': ')) for d in parser(input_file):
json.dump(parser(input_file), fp) fp.write('\t'.join(d) + '\n')
def build(dict_dir, kanjidic, kradfile, edict, enamdict): def build(dict_dir, kanjidic, edict, enamdict):
build_dict(dict_dir, kanjidic, parse_kanji_dic) build_dict(dict_dir, kanjidic, parse_kanji_dic)
build_dict(dict_dir, kradfile, parse_krad_file)
build_dict(dict_dir, edict, parse_edict) build_dict(dict_dir, edict, parse_edict)
build_dict(dict_dir, enamdict, parse_edict) build_dict(dict_dir, enamdict, parse_edict)
@@ -187,7 +165,6 @@ def build(dict_dir, kanjidic, kradfile, edict, enamdict):
def main(): def main():
parser = optparse.OptionParser() parser = optparse.OptionParser()
parser.add_option('--kanjidic', dest='kanjidic') parser.add_option('--kanjidic', dest='kanjidic')
parser.add_option('--kradfile', dest='kradfile')
parser.add_option('--edict', dest='edict') parser.add_option('--edict', dest='edict')
parser.add_option('--enamdict', dest='enamdict') parser.add_option('--enamdict', dest='enamdict')
@@ -196,13 +173,7 @@ def main():
if len(args) == 0: if len(args) == 0:
parser.print_help() parser.print_help()
else: else:
build( build(args[0], options.kanjidic, options.edict, options.enamdict)
args[0],
options.kanjidic,
options.kradfile,
options.edict,
options.enamdict
)
if __name__ == '__main__': if __name__ == '__main__':