diff --git a/common.go b/common.go index 0a0cd45..28bed5c 100644 --- a/common.go +++ b/common.go @@ -33,10 +33,43 @@ import ( "strings" ) -type dbTagMeta struct { - Category string `json:"category,omitempty"` - Notes string `json:"notes,omitempty"` - Order int `json:"order,omitempty"` +const databaseVersion = 2 + +type dbRecord []interface{} +type dbRecordList []dbRecord + +type dbTag struct { + Name string + Category string + Order int + Notes string +} + +type dbTagList []dbTag + +func (meta dbTagList) crush() dbRecordList { + var results dbRecordList + for _, m := range meta { + results = append(results, dbRecord{m.Name, m.Category, m.Order, m.Notes}) + } + + return results +} + +type dbFrequency struct { + Expression string + Count int +} + +type dbFrequencyList []dbFrequency + +func (freqs dbFrequencyList) crush() dbRecordList { + var results dbRecordList + for _, f := range freqs { + results = append(results, dbRecord{f.Expression, f.Count}) + } + + return results } type dbTerm struct { @@ -58,10 +91,10 @@ func (term *dbTerm) addRules(rules ...string) { term.Rules = appendStringUnique(term.Rules, rules...) } -func (terms dbTermList) crush() [][]interface{} { - var results [][]interface{} +func (terms dbTermList) crush() dbRecordList { + var results dbRecordList for _, t := range terms { - result := []interface{}{ + result := dbRecord{ t.Expression, t.Reading, strings.Join(t.Tags, " "), @@ -97,10 +130,10 @@ func (kanji *dbKanji) addTags(tags ...string) { } } -func (kanji dbKanjiList) crush() [][]interface{} { - var results [][]interface{} +func (kanji dbKanjiList) crush() dbRecordList { + var results dbRecordList for _, k := range kanji { - result := []interface{}{ + result := dbRecord{ k.Character, strings.Join(k.Onyomi, " "), strings.Join(k.Kunyomi, " "), @@ -117,13 +150,11 @@ func (kanji dbKanjiList) crush() [][]interface{} { return results } -func writeDb(outputPath, title, revision string, termRecords [][]interface{}, kanjiRecords [][]interface{}, tagMeta map[string]dbTagMeta, stride int, pretty bool) error { - const DB_VERSION = 1 - +func writeDb(outputPath, title, revision string, recordData map[string]dbRecordList, stride int, pretty bool) error { var zbuff bytes.Buffer zip := zip.NewWriter(&zbuff) - marshalJson := func(obj interface{}, pretty bool) ([]byte, error) { + marshalJSON := func(obj interface{}, pretty bool) ([]byte, error) { if pretty { return json.MarshalIndent(obj, "", " ") } @@ -131,7 +162,7 @@ func writeDb(outputPath, title, revision string, termRecords [][]interface{}, ka return json.Marshal(obj) } - writeDbRecords := func(prefix string, records [][]interface{}) (int, error) { + writeDbRecords := func(prefix string, records dbRecordList) (int, error) { recordCount := len(records) bankCount := 0 @@ -142,7 +173,7 @@ func writeDb(outputPath, title, revision string, termRecords [][]interface{}, ka indexDst = recordCount } - bytes, err := marshalJson(records[indexSrc:indexDst], pretty) + bytes, err := marshalJSON(records[indexSrc:indexDst], pretty) if err != nil { return 0, err } @@ -156,7 +187,7 @@ func writeDb(outputPath, title, revision string, termRecords [][]interface{}, ka return 0, err } - bankCount += 1 + bankCount++ } return bankCount, nil @@ -164,28 +195,22 @@ func writeDb(outputPath, title, revision string, termRecords [][]interface{}, ka var err error var db struct { - Title string `json:"title"` - Version int `json:"version"` - Revision string `json:"revision"` - TagMeta map[string]dbTagMeta `json:"tagMeta"` - TermBanks int `json:"termBanks"` - KanjiBanks int `json:"kanjiBanks"` + Title string `json:"title"` + Version int `json:"version"` + Revision string `json:"revision"` } db.Title = title - db.Version = DB_VERSION + db.Version = databaseVersion db.Revision = revision - db.TagMeta = tagMeta - if db.TermBanks, err = writeDbRecords("term", termRecords); err != nil { - return err + for recordType, recordEntries := range recordData { + if _, err := writeDbRecords(recordType, recordEntries); err != nil { + return err + } } - if db.KanjiBanks, err = writeDbRecords("kanji", kanjiRecords); err != nil { - return err - } - - bytes, err := marshalJson(db, pretty) + bytes, err := marshalJSON(db, pretty) if err != nil { return err } @@ -234,8 +259,13 @@ func hasString(needle string, haystack []string) bool { } func detectFormat(path string) (string, error) { - if filepath.Ext(path) == ".sqlite" { + switch filepath.Ext(path) { + case ".sqlite": return "rikai", nil + case ".kanji_freq": + return "kanji_freq", nil + case ".term_freq": + return "term_freq", nil } switch filepath.Base(path) { diff --git a/edict.go b/edict.go index c47312c..f728d2a 100644 --- a/edict.go +++ b/edict.go @@ -29,7 +29,7 @@ import ( "github.com/FooSoft/jmdict" ) -const JMDICT_REVISION = "jmdict3" +const jmdictRevision = "jmdict3" func jmdictBuildRules(term *dbTerm) { for _, tag := range term.Tags { @@ -71,17 +71,17 @@ func jmdictAddPriorities(term *dbTerm, priorities ...string) { } } -func jmdictBuildTagMeta(entities map[string]string) map[string]dbTagMeta { - tags := map[string]dbTagMeta{ - "news": {Notes: "appears frequently in Mainichi Shimbun", Category: "frequent", Order: -2}, - "ichi": {Notes: "listed as common in Ichimango Goi Bunruishuu", Category: "frequent", Order: -2}, - "spec": {Notes: "common words not included in frequency lists", Category: "frequent", Order: -2}, - "gai": {Notes: "common loanword", Category: "frequent", Order: -2}, - "P": {Notes: "popular term", Category: "popular", Order: -10}, +func jmdictBuildTagMeta(entities map[string]string) dbTagList { + tags := dbTagList{ + dbTag{Name: "news", Notes: "appears frequently in Mainichi Shimbun", Category: "frequent", Order: -2}, + dbTag{Name: "ichi", Notes: "listed as common in Ichimango Goi Bunruishuu", Category: "frequent", Order: -2}, + dbTag{Name: "spec", Notes: "common words not included in frequency lists", Category: "frequent", Order: -2}, + dbTag{Name: "gai", Notes: "common loanword", Category: "frequent", Order: -2}, + dbTag{Name: "P", Notes: "popular term", Category: "popular", Order: -10}, } for name, value := range entities { - tag := dbTagMeta{Notes: value} + tag := dbTag{Name: name, Notes: value} switch name { case "exp", "id": @@ -92,7 +92,7 @@ func jmdictBuildTagMeta(entities map[string]string) map[string]dbTagMeta { tag.Order = -4 } - tags[name] = tag + tags = append(tags, tag) } return tags @@ -227,13 +227,16 @@ func jmdictExportDb(inputPath, outputPath, language, title string, stride int, p title = "JMdict" } + recordData := map[string]dbRecordList{ + "terms": terms.crush(), + "tags": jmdictBuildTagMeta(entities).crush(), + } + return writeDb( outputPath, title, - JMDICT_REVISION, - terms.crush(), - nil, - jmdictBuildTagMeta(entities), + jmdictRevision, + recordData, stride, pretty, ) diff --git a/enamdict.go b/enamdict.go index f732de8..4dee4ff 100644 --- a/enamdict.go +++ b/enamdict.go @@ -28,13 +28,13 @@ import ( "github.com/FooSoft/jmdict" ) -const JMNEDICT_REVISION = "jmnedict1" +const jmnedictRevision = "jmnedict1" -func jmnedictBuildTagMeta(entities map[string]string) map[string]dbTagMeta { - tags := make(map[string]dbTagMeta) +func jmnedictBuildTagMeta(entities map[string]string) dbTagList { + var tags dbTagList for name, value := range entities { - tag := dbTagMeta{Notes: value} + tag := dbTag{Name: name, Notes: value} switch name { case "company", "fem", "given", "masc", "organization", "person", "place", "product", "station", "surname", "unclass", "work": @@ -42,7 +42,7 @@ func jmnedictBuildTagMeta(entities map[string]string) map[string]dbTagMeta { tag.Order = 4 } - tags[name] = tag + tags = append(tags, tag) } return tags @@ -118,13 +118,16 @@ func jmnedictExportDb(inputPath, outputPath, language, title string, stride int, title = "JMnedict" } + recordData := map[string]dbRecordList{ + "terms": terms.crush(), + "tags": jmnedictBuildTagMeta(entities).crush(), + } + return writeDb( outputPath, title, - JMNEDICT_REVISION, - terms.crush(), - nil, - jmnedictBuildTagMeta(entities), + jmnedictRevision, + recordData, stride, pretty, ) diff --git a/epwing.go b/epwing.go index 3f1b846..ebb596c 100644 --- a/epwing.go +++ b/epwing.go @@ -200,13 +200,16 @@ func epwingExportDb(inputPath, outputPath, language, title string, stride int, p title = strings.Join(titles, ", ") } + recordData := map[string]dbRecordList{ + "kanji": kanji.crush(), + "terms": terms.crush(), + } + return writeDb( outputPath, title, strings.Join(revisions, ";"), - terms.crush(), - kanji.crush(), - nil, + recordData, stride, pretty, ) diff --git a/frequency.go b/frequency.go new file mode 100644 index 0000000..fcb0c30 --- /dev/null +++ b/frequency.go @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2017 Alex Yatskov + * Author: Alex Yatskov + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package main + +import ( + "bufio" + "os" + "strconv" + "strings" +) + +const frequencyRevision = "frequency1" + +func frequncyExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error { + reader, err := os.Open(inputPath) + if err != nil { + return err + } + defer reader.Close() + + var frequencies dbFrequencyList + for scanner := bufio.NewScanner(reader); scanner.Scan(); { + line := scanner.Text() + if strings.HasPrefix(line, "#") { + continue + } + + parts := strings.Split(line, "\t") + if len(parts) != 2 { + continue + } + + expression := parts[0] + count, err := strconv.Atoi(parts[1]) + if err != nil { + continue + } + + frequencies = append(frequencies, dbFrequency{expression, count}) + } + + recordData := map[string]dbRecordList{ + "frequencies": frequencies.crush(), + } + + return writeDb( + outputPath, + title, + frequencyRevision, + recordData, + stride, + pretty, + ) +} diff --git a/kanjidic.go b/kanjidic.go index 2935fca..9392b6f 100644 --- a/kanjidic.go +++ b/kanjidic.go @@ -30,7 +30,7 @@ import ( "github.com/FooSoft/jmdict" ) -const KANJIDIC_REVISION = "kanjidic1" +const kanjidicRevision = "kanjidic1" func kanjidicExtractKanji(entry jmdict.KanjidicCharacter, language string) *dbKanji { if entry.ReadingMeaning == nil { @@ -116,26 +116,29 @@ func kanjidicExportDb(inputPath, outputPath, language, title string, stride int, } } - tagMeta := map[string]dbTagMeta{ - "jouyou": {Notes: "included in list of regular-use characters", Category: "frequent", Order: -5}, - "jinmeiyou": {Notes: "included in list of characters for use in personal names", Category: "frequent", Order: -5}, - "jlpt": {Notes: "corresponding Japanese Language Proficiency Test level"}, - "grade": {Notes: "school grade level at which the character is taught"}, - "strokes": {Notes: "number of strokes needed to write the character"}, - "heisig": {Notes: "frame number in Remembering the Kanji"}, + tags := dbTagList{ + dbTag{Name: "jouyou", Notes: "included in list of regular-use characters", Category: "frequent", Order: -5}, + dbTag{Name: "jinmeiyou", Notes: "included in list of characters for use in personal names", Category: "frequent", Order: -5}, + dbTag{Name: "jlpt", Notes: "corresponding Japanese Language Proficiency Test level"}, + dbTag{Name: "grade", Notes: "school grade level at which the character is taught"}, + dbTag{Name: "strokes", Notes: "number of strokes needed to write the character"}, + dbTag{Name: "heisig", Notes: "frame number in Remembering the Kanji"}, } if title == "" { title = "KANJIDIC2" } + recordData := map[string]dbRecordList{ + "kanji": kanji.crush(), + "tags": tags.crush(), + } + return writeDb( outputPath, title, - KANJIDIC_REVISION, - nil, - kanji.crush(), - tagMeta, + kanjidicRevision, + recordData, stride, pretty, ) diff --git a/rikai.go b/rikai.go index 16fadf5..98c3a2c 100644 --- a/rikai.go +++ b/rikai.go @@ -30,7 +30,7 @@ import ( _ "github.com/mattn/go-sqlite3" ) -const RIKAI_REVISION = "rikai2" +const rikaiRevision = "rikai2" type rikaiEntry struct { kanji string @@ -158,21 +158,24 @@ func rikaiExportDb(inputPath, outputPath, language, title string, stride int, pr title = "Rikai" } - entities := map[string]dbTagMeta{ - "P": {Category: "popular", Order: -10}, - "exp": {Category: "expression", Order: -5}, - "id": {Category: "expression", Order: -5}, - "arch": {Category: "archaism", Order: -4}, - "iK": {Category: "archaism", Order: -4}, + tags := dbTagList{ + dbTag{Name: "P", Category: "popular", Order: -10}, + dbTag{Name: "exp", Category: "expression", Order: -5}, + dbTag{Name: "id", Category: "expression", Order: -5}, + dbTag{Name: "arch", Category: "archaism", Order: -4}, + dbTag{Name: "iK", Category: "archaism", Order: -4}, + } + + recordData := map[string]dbRecordList{ + "terms": terms.crush(), + "tags": tags.crush(), } return writeDb( outputPath, title, - RIKAI_REVISION, - terms.crush(), - nil, - entities, + rikaiRevision, + recordData, stride, pretty, )