Moving large files to CSV format, deleting unused kradfile
This commit is contained in:
parent
b97e75ba32
commit
7eadff3457
@ -3,7 +3,6 @@
|
|||||||
KANJIDIC=util/data/kanjidic
|
KANJIDIC=util/data/kanjidic
|
||||||
EDICT=util/data/edict
|
EDICT=util/data/edict
|
||||||
ENAMDICT=util/data/enamdict
|
ENAMDICT=util/data/enamdict
|
||||||
KRADFILE=util/data/kradfile
|
DICT_DIR=ext/bg/data
|
||||||
DICT_DIR=ext/jp/data
|
|
||||||
|
|
||||||
util/compile.py --kanjidic $KANJIDIC --kradfile $KRADFILE --edict $EDICT --enamdict $ENAMDICT $DICT_DIR
|
util/compile.py --kanjidic $KANJIDIC --edict $EDICT --enamdict $ENAMDICT $DICT_DIR
|
||||||
|
@ -19,43 +19,33 @@
|
|||||||
|
|
||||||
class Dictionary {
|
class Dictionary {
|
||||||
constructor() {
|
constructor() {
|
||||||
this.termDicts = [];
|
this.terms = [];
|
||||||
this.kanjiDicts = [];
|
this.termIndices = {};
|
||||||
|
|
||||||
|
this.kanji = [];
|
||||||
|
this.kanjiIndices = {};
|
||||||
}
|
}
|
||||||
|
|
||||||
addTermDict(termDict) {
|
addTermDict(terms) {
|
||||||
this.termDicts.push(termDict);
|
let index = this.terms.length;
|
||||||
|
for (const [e, r, g, t] in terms) {
|
||||||
|
this.storeIndex(this.termIndices, e, index);
|
||||||
|
this.storeIndex(this.termIndices, r, index++);
|
||||||
|
this.terms.push([e, r, g, t]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
addKanjiDict(kanjiDict) {
|
addKanjiDict(kanji) {
|
||||||
this.kanjiDicts.push(kanjiDict);
|
let index = this.kanji.length;
|
||||||
|
for (const [c, k, o, g] in kanji) {
|
||||||
|
this.storeIndex(this.kanjiIndices, c, index++);
|
||||||
|
this.kanji.push([c, k, o, g]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
findTerm(term) {
|
findTerm(term) {
|
||||||
let results = [];
|
return (this.termIndices[term] || []).map(index => {
|
||||||
for (let dict of this.termDicts) {
|
const [e, r, g, t] = this.terms[index];
|
||||||
results = results.concat(this.findTermInDict(term, dict));
|
|
||||||
}
|
|
||||||
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
|
|
||||||
findKanji(kanji) {
|
|
||||||
const results = [];
|
|
||||||
for (let dict of this.kanjiDicts) {
|
|
||||||
const result = this.findKanjiInDict(kanji, dict);
|
|
||||||
if (result !== null) {
|
|
||||||
results.push(result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
|
|
||||||
findTermInDict(term, dict) {
|
|
||||||
return (dict.indices[term] || []).map(index => {
|
|
||||||
const [e, r, g, t] = dict.defs[index];
|
|
||||||
return {
|
return {
|
||||||
id: index,
|
id: index,
|
||||||
expression: e,
|
expression: e,
|
||||||
@ -66,12 +56,8 @@ class Dictionary {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
findKanjiInDict(kanji, dict) {
|
findKanji(kanji) {
|
||||||
const def = dict.defs[kanji];
|
return (this.kanjiIndices[kanji] || []).map(index => {
|
||||||
if (def === null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
const [c, k, o, g] = def;
|
const [c, k, o, g] = def;
|
||||||
return {
|
return {
|
||||||
id: kanji.charCodeAt(0),
|
id: kanji.charCodeAt(0),
|
||||||
@ -80,5 +66,14 @@ class Dictionary {
|
|||||||
onyomi: o,
|
onyomi: o,
|
||||||
glossary: g
|
glossary: g
|
||||||
};
|
};
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
storeIndex(indices, term, index) {
|
||||||
|
if (term.length > 0) {
|
||||||
|
const indices = this.termIndices[term] || [];
|
||||||
|
indices.push(term);
|
||||||
|
this.termIndices[term] = indices;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -27,7 +27,9 @@ class Client {
|
|||||||
this.popup.classList.add('yomichan-popup');
|
this.popup.classList.add('yomichan-popup');
|
||||||
this.popup.addEventListener('mousedown', (e) => e.stopPropagation());
|
this.popup.addEventListener('mousedown', (e) => e.stopPropagation());
|
||||||
this.popup.addEventListener('scroll', (e) => e.stopPropagation());
|
this.popup.addEventListener('scroll', (e) => e.stopPropagation());
|
||||||
document.body.appendChild(this.popup);
|
|
||||||
|
const base = document.body.appendChild('div');
|
||||||
|
base.createShadowRoot().appendChild(this.popup);
|
||||||
|
|
||||||
chrome.runtime.onMessage.addListener(this.onMessage.bind(this));
|
chrome.runtime.onMessage.addListener(this.onMessage.bind(this));
|
||||||
window.addEventListener('mousedown', this.onMouseDown.bind(this));
|
window.addEventListener('mousedown', this.onMouseDown.bind(this));
|
||||||
|
@ -18,7 +18,6 @@
|
|||||||
|
|
||||||
|
|
||||||
import codecs
|
import codecs
|
||||||
import json
|
|
||||||
import optparse
|
import optparse
|
||||||
import os.path
|
import os.path
|
||||||
import re
|
import re
|
||||||
@ -111,7 +110,7 @@ def load_definitions(path):
|
|||||||
|
|
||||||
|
|
||||||
def parse_kanji_dic(path):
|
def parse_kanji_dic(path):
|
||||||
results = {}
|
results = []
|
||||||
|
|
||||||
for line in load_definitions(path):
|
for line in load_definitions(path):
|
||||||
segments = line.split()
|
segments = line.split()
|
||||||
@ -119,32 +118,20 @@ def parse_kanji_dic(path):
|
|||||||
kunyomi = ', '.join(filter(lambda x: filter(is_hiragana, x), segments[1:]))
|
kunyomi = ', '.join(filter(lambda x: filter(is_hiragana, x), segments[1:]))
|
||||||
onyomi = ', '.join(filter(lambda x: filter(is_katakana, x), segments[1:]))
|
onyomi = ', '.join(filter(lambda x: filter(is_katakana, x), segments[1:]))
|
||||||
glossary = '; '.join(re.findall('\{([^\}]+)\}', line))
|
glossary = '; '.join(re.findall('\{([^\}]+)\}', line))
|
||||||
results[character] = (kunyomi, onyomi, glossary)
|
results.append((character, kunyomi, onyomi, glossary))
|
||||||
|
|
||||||
return results
|
|
||||||
|
|
||||||
|
|
||||||
def parse_krad_file(path):
|
|
||||||
results = {}
|
|
||||||
|
|
||||||
for line in load_definitions(path):
|
|
||||||
segments = line.split(' ')
|
|
||||||
character = segments[0]
|
|
||||||
radicals = ' '.join(segments[2:])
|
|
||||||
results[character] = radicals;
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def parse_edict(path):
|
def parse_edict(path):
|
||||||
defs = []
|
results = []
|
||||||
for line in load_definitions(path):
|
for line in load_definitions(path):
|
||||||
segments = line.split('/')
|
segments = line.split('/')
|
||||||
|
|
||||||
expression = segments[0].split(' ')
|
expression = segments[0].split(' ')
|
||||||
term = expression[0]
|
term = expression[0]
|
||||||
match = re.search('\[([^\]]+)\]', expression[1])
|
match = re.search('\[([^\]]+)\]', expression[1])
|
||||||
reading = None if match is None else match.group(1)
|
reading = '' if match is None else match.group(1)
|
||||||
|
|
||||||
glossary = '; '.join(filter(lambda x: len(x) > 0, segments[1:]))
|
glossary = '; '.join(filter(lambda x: len(x) > 0, segments[1:]))
|
||||||
glossary = re.sub('\(\d+\)\s*', '', glossary)
|
glossary = re.sub('\(\d+\)\s*', '', glossary)
|
||||||
@ -156,30 +143,21 @@ def parse_edict(path):
|
|||||||
tags = set(tags).intersection(PARSED_TAGS)
|
tags = set(tags).intersection(PARSED_TAGS)
|
||||||
tags = ' '.join(tags)
|
tags = ' '.join(tags)
|
||||||
|
|
||||||
defs.append((term, reading, glossary, tags))
|
results.append((term, reading, glossary, tags))
|
||||||
|
|
||||||
indices = {}
|
return results[1:]
|
||||||
for i, d in enumerate(defs):
|
|
||||||
for key in d[:2]:
|
|
||||||
if key is not None:
|
|
||||||
values = indices.get(key, [])
|
|
||||||
values.append(i)
|
|
||||||
indices[key] = values
|
|
||||||
|
|
||||||
return {'defs': defs, 'indices': indices}
|
|
||||||
|
|
||||||
|
|
||||||
def build_dict(output_dir, input_file, parser):
|
def build_dict(output_dir, input_file, parser):
|
||||||
if input_file is not None:
|
if input_file is not None:
|
||||||
base = os.path.splitext(os.path.basename(input_file))[0]
|
base = os.path.splitext(os.path.basename(input_file))[0]
|
||||||
with open(os.path.join(output_dir, base) + '.json', 'w') as fp:
|
with codecs.open(os.path.join(output_dir, base) + '.csv', 'wb', encoding='utf-8') as fp:
|
||||||
# json.dump(parser(input_file), fp, sort_keys=True, indent=4, separators=(',', ': '))
|
for d in parser(input_file):
|
||||||
json.dump(parser(input_file), fp)
|
fp.write('\t'.join(d) + '\n')
|
||||||
|
|
||||||
|
|
||||||
def build(dict_dir, kanjidic, kradfile, edict, enamdict):
|
def build(dict_dir, kanjidic, edict, enamdict):
|
||||||
build_dict(dict_dir, kanjidic, parse_kanji_dic)
|
build_dict(dict_dir, kanjidic, parse_kanji_dic)
|
||||||
build_dict(dict_dir, kradfile, parse_krad_file)
|
|
||||||
build_dict(dict_dir, edict, parse_edict)
|
build_dict(dict_dir, edict, parse_edict)
|
||||||
build_dict(dict_dir, enamdict, parse_edict)
|
build_dict(dict_dir, enamdict, parse_edict)
|
||||||
|
|
||||||
@ -187,7 +165,6 @@ def build(dict_dir, kanjidic, kradfile, edict, enamdict):
|
|||||||
def main():
|
def main():
|
||||||
parser = optparse.OptionParser()
|
parser = optparse.OptionParser()
|
||||||
parser.add_option('--kanjidic', dest='kanjidic')
|
parser.add_option('--kanjidic', dest='kanjidic')
|
||||||
parser.add_option('--kradfile', dest='kradfile')
|
|
||||||
parser.add_option('--edict', dest='edict')
|
parser.add_option('--edict', dest='edict')
|
||||||
parser.add_option('--enamdict', dest='enamdict')
|
parser.add_option('--enamdict', dest='enamdict')
|
||||||
|
|
||||||
@ -196,13 +173,7 @@ def main():
|
|||||||
if len(args) == 0:
|
if len(args) == 0:
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
else:
|
else:
|
||||||
build(
|
build(args[0], options.kanjidic, options.edict, options.enamdict)
|
||||||
args[0],
|
|
||||||
options.kanjidic,
|
|
||||||
options.kradfile,
|
|
||||||
options.edict,
|
|
||||||
options.enamdict
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
Loading…
Reference in New Issue
Block a user