Moving large files to CSV format, deleting unused kradfile

2016-03-31 20:03:39 -07:00 · 2016-03-31 20:03:39 -07:00 · 7eadff3457
commit 7eadff3457
parent b97e75ba32
4 changed files with 54 additions and 87 deletions
--- a/build_dict.sh
+++ b/build_dict.sh
@ -3,7 +3,6 @@
 KANJIDIC=util/data/kanjidic
 EDICT=util/data/edict
 ENAMDICT=util/data/enamdict
-KRADFILE=util/data/kradfile
-DICT_DIR=ext/jp/data
+DICT_DIR=ext/bg/data

-util/compile.py --kanjidic $KANJIDIC --kradfile $KRADFILE --edict $EDICT --enamdict $ENAMDICT $DICT_DIR
+util/compile.py --kanjidic $KANJIDIC --edict $EDICT --enamdict $ENAMDICT $DICT_DIR
--- a/ext/bg/dictionary.js
+++ b/ext/bg/dictionary.js
@ -19,43 +19,33 @@

 class Dictionary {
    constructor() {
-        this.termDicts  = [];
-        this.kanjiDicts = [];
+        this.terms       = [];
+        this.termIndices = {};
+
+        this.kanji        = [];
+        this.kanjiIndices = {};
    }

-    addTermDict(termDict) {
-        this.termDicts.push(termDict);
+    addTermDict(terms) { // append [expression, reading, glossary, tags] rows and index each by expression and reading
+        let index = this.terms.length; // position the first new row will occupy in this.terms
+        for (const [e, r, g, t] of terms) { // for...of yields the rows; for...in would yield key strings ("0", "1", ...)
+            this.storeIndex(this.termIndices, e, index);
+            this.storeIndex(this.termIndices, r, index++);
+            this.terms.push([e, r, g, t]);
+        }
     }

-    addKanjiDict(kanjiDict) {
-        this.kanjiDicts.push(kanjiDict);
+    addKanjiDict(kanji) { // append [character, kunyomi, onyomi, glossary] rows and index each by character
+        let index = this.kanji.length; // position the first new row will occupy in this.kanji
+        for (const [c, k, o, g] of kanji) { // for...of yields the rows; for...in would yield key strings ("0", "1", ...)
+            this.storeIndex(this.kanjiIndices, c, index++);
+            this.kanji.push([c, k, o, g]);
+        }
     }

-
    findTerm(term) {
-        let results = [];
-        for (let dict of this.termDicts) {
-            results = results.concat(this.findTermInDict(term, dict));
-        }
-
-        return results;
-    }
-
-    findKanji(kanji) {
-        const results = [];
-        for (let dict of this.kanjiDicts) {
-            const result = this.findKanjiInDict(kanji, dict);
-            if (result !== null) {
-                results.push(result);
-            }
-        }
-
-        return results;
-    }
-
-    findTermInDict(term, dict) {
-        return (dict.indices[term] || []).map(index => {
-            const [e, r, g, t] = dict.defs[index];
+        return (this.termIndices[term] || []).map(index => {
+            const [e, r, g, t] = this.terms[index];
            return {
                id:         index,
                expression: e,
@ -66,19 +56,24 @@ class Dictionary {
        });
    }

-    findKanjiInDict(kanji, dict) {
-        const def = dict.defs[kanji];
-        if (def === null) {
-            return null;
-        }
+    findKanji(kanji) { // return one definition object per row indexed under the character `kanji` (empty array if none)
+        return (this.kanjiIndices[kanji] || []).map(index => {
+            const [c, k, o, g] = this.kanji[index]; // fetch the stored row; the old local `def` no longer exists
+            return {
+                id:        kanji.charCodeAt(0),
+                character: c,
+                kunyomi:   k,
+                onyomi:    o,
+                glossary:  g
+            };
+        });
+    }

-        const [c, k, o, g] = def;
-        return {
-            id:        kanji.charCodeAt(0),
-            character: c,
-            kunyomi:   k,
-            onyomi:    o,
-            glossary:  g
-        };
+    storeIndex(indices, term, index) { // record in the map `indices` that row number `index` is reachable via key `term`
+        if (term.length > 0) { // empty keys (e.g. an entry without a reading) are not indexed
+            const positions = indices[term] || []; // use the caller's map so kanji and term indices stay separate
+            positions.push(index); // store the row position, not the key itself
+            indices[term] = positions;
+        }
     }
 }
--- a/ext/client.js
+++ b/ext/client.js
@ -27,7 +27,9 @@ class Client {
        this.popup.classList.add('yomichan-popup');
        this.popup.addEventListener('mousedown', (e) => e.stopPropagation());
        this.popup.addEventListener('scroll', (e) => e.stopPropagation());
-        document.body.appendChild(this.popup);
+
+        const base = document.body.appendChild('div');
+        base.createShadowRoot().appendChild(this.popup);

        chrome.runtime.onMessage.addListener(this.onMessage.bind(this));
        window.addEventListener('mousedown', this.onMouseDown.bind(this));
--- a/util/compile.py
+++ b/util/compile.py
@ -18,7 +18,6 @@


 import codecs
-import json
 import optparse
 import os.path
 import re
@ -111,7 +110,7 @@ def load_definitions(path):


 def parse_kanji_dic(path):
-    results = {}
+    results = []

    for line in load_definitions(path):
        segments = line.split()
@ -119,32 +118,20 @@ def parse_kanji_dic(path):
        kunyomi = ', '.join(filter(lambda x: filter(is_hiragana, x), segments[1:]))
        onyomi = ', '.join(filter(lambda x: filter(is_katakana, x), segments[1:]))
        glossary = '; '.join(re.findall('\{([^\}]+)\}', line))
-        results[character] = (kunyomi, onyomi, glossary)
-
-    return results
-
-
-def parse_krad_file(path):
-    results = {}
-
-    for line in load_definitions(path):
-        segments = line.split(' ')
-        character = segments[0]
-        radicals = ' '.join(segments[2:])
-        results[character] = radicals;
+        results.append((character, kunyomi, onyomi, glossary))

    return results


 def parse_edict(path):
-    defs = []
+    results = []
    for line in load_definitions(path):
        segments = line.split('/')

        expression = segments[0].split(' ')
        term = expression[0]
        match = re.search('\[([^\]]+)\]', expression[1])
-        reading = None if match is None else match.group(1)
+        reading = '' if match is None else match.group(1)

        glossary = '; '.join(filter(lambda x: len(x) > 0, segments[1:]))
        glossary = re.sub('\(\d+\)\s*', '', glossary)
@ -156,30 +143,21 @@ def parse_edict(path):
        tags = set(tags).intersection(PARSED_TAGS)
        tags = ' '.join(tags)

-        defs.append((term, reading, glossary, tags))
+        results.append((term, reading, glossary, tags))

-    indices = {}
-    for i, d in enumerate(defs):
-        for key in d[:2]:
-            if key is not None:
-                values = indices.get(key, [])
-                values.append(i)
-                indices[key] = values
-
-    return {'defs': defs, 'indices': indices}
+    return results[1:]


 def build_dict(output_dir, input_file, parser):
    if input_file is not None:
        base = os.path.splitext(os.path.basename(input_file))[0]
-        with open(os.path.join(output_dir, base) + '.json', 'w') as fp:
-            # json.dump(parser(input_file), fp, sort_keys=True, indent=4, separators=(',', ': '))
-            json.dump(parser(input_file), fp)
+        with codecs.open(os.path.join(output_dir, base) + '.csv', 'wb', encoding='utf-8') as fp:
+            for d in parser(input_file):
+                fp.write('\t'.join(d) + '\n')


-def build(dict_dir, kanjidic, kradfile, edict, enamdict):
+def build(dict_dir, kanjidic, edict, enamdict):
    build_dict(dict_dir, kanjidic, parse_kanji_dic)
-    build_dict(dict_dir, kradfile, parse_krad_file)
    build_dict(dict_dir, edict, parse_edict)
    build_dict(dict_dir, enamdict, parse_edict)

@ -187,7 +165,6 @@ def build(dict_dir, kanjidic, kradfile, edict, enamdict):
 def main():
    parser = optparse.OptionParser()
    parser.add_option('--kanjidic', dest='kanjidic')
-    parser.add_option('--kradfile', dest='kradfile')
    parser.add_option('--edict', dest='edict')
    parser.add_option('--enamdict', dest='enamdict')

@ -196,13 +173,7 @@ def main():
    if len(args) == 0:
        parser.print_help()
    else:
-        build(
-            args[0],
-            options.kanjidic,
-            options.kradfile,
-            options.edict,
-            options.enamdict
-        )
+        build(args[0], options.kanjidic, options.edict, options.enamdict)


 if __name__ == '__main__':