From 1f95077e7bba48665bc09d7faaf6b91ac22d4a70 Mon Sep 17 00:00:00 2001 From: Alex Yatskov Date: Sun, 18 Dec 2016 11:46:47 -0800 Subject: [PATCH] Cleanup & fixes --- common.go | 2 +- daijirin.go | 7 +++++++ edict.go | 54 +++++++++++++++++++++++++++++------------------------ enamdict.go | 10 +++++----- epwing.go | 9 +-------- kanjidic.go | 10 +++++----- main.go | 8 ++++---- 7 files changed, 53 insertions(+), 47 deletions(-) diff --git a/common.go b/common.go index 2c7c8e9..2ee9369 100644 --- a/common.go +++ b/common.go @@ -116,7 +116,7 @@ func (kanji dbKanjiList) crush() [][]interface{} { func writeDb(outputDir, title string, termRecords [][]interface{}, kanjiRecords [][]interface{}, tagMeta map[string]dbTagMeta, pretty bool) error { const DB_VERSION = 1 - const BANK_STRIDE = 50000 + const BANK_STRIDE = 10000 marshalJson := func(obj interface{}, pretty bool) ([]byte, error) { if pretty { diff --git a/daijirin.go b/daijirin.go index 44bf91b..32b8cf2 100644 --- a/daijirin.go +++ b/daijirin.go @@ -27,6 +27,13 @@ import ( "strings" ) +type daijirinExtractor struct { + partsExp *regexp.Regexp + phonExp *regexp.Regexp + variantExp *regexp.Regexp + annotExp *regexp.Regexp +} + func makeDaijirinExtractor() epwingExtractor { return &daijirinExtractor{ partsExp: regexp.MustCompile(`([^(【〖]+)(?:【(.*)】)?(?:〖(.*)〗)?(?:((.*)))?`), diff --git a/edict.go b/edict.go index 920b447..49f256c 100644 --- a/edict.go +++ b/edict.go @@ -29,7 +29,7 @@ import ( "github.com/FooSoft/jmdict" ) -func computeJmdictRules(term *dbTerm) { +func jmdictBuildRules(term *dbTerm) { for _, tag := range term.Tags { switch tag { case "adj-i", "v1", "vk", "vs": @@ -42,7 +42,7 @@ func computeJmdictRules(term *dbTerm) { } } -func computeJmdictScore(term *dbTerm) { +func jmdictBuildScore(term *dbTerm) { term.Score = 0 for _, tag := range term.Tags { switch tag { @@ -54,31 +54,37 @@ func computeJmdictScore(term *dbTerm) { } } -func computeJmdictTagMeta(entities map[string]string) map[string]dbTagMeta { +func jmdictAddPriorities(term *dbTerm, priorities ...string) { + for _, priority := range priorities { + switch priority { + case "news1", "ichi1", "spec1", "gai1": + term.addTags("P") + fallthrough + case "news2", "ichi2", "spec2", "gai2": + term.addTags(priority[:len(priority)-1]) + } + } +} + +func jmdictBuildTagMeta(entities map[string]string) map[string]dbTagMeta { tags := map[string]dbTagMeta{ - "news1": {Notes: "appears frequently in Mainichi Shimbun (top listing)", Category: "frequent", Order: 3}, - "ichi1": {Notes: "listed as common in Ichimango Goi Bunruishuu (top listing)", Category: "frequent", Order: 3}, - "spec1": {Notes: "common words not included in frequency lists (top listing)", Category: "frequent", Order: 3}, - "gai1": {Notes: "common loanword (top listing)", Category: "frequent", Order: 3}, - "news2": {Notes: "appears frequently in Mainichi Shimbun (bottom listing)", Order: 3}, - "ichi2": {Notes: "listed as common in Ichimango Goi Bunruishuu (bottom listing)", Order: 3}, - "spec2": {Notes: "common words not included in frequency lists (bottom listing)", Order: 3}, - "gai2": {Notes: "common loanword (bottom listing)", Order: 3}, + "news": {Notes: "appears frequently in Mainichi Shimbun"}, + "ichi": {Notes: "listed as common in Ichimango Goi Bunruishuu"}, + "spec": {Notes: "common words not included in frequency lists"}, + "gai": {Notes: "common loanword"}, + "P": {Notes: "popular term", Category: "popular", Order: -10}, } for name, value := range entities { tag := dbTagMeta{Notes: value} switch name { - case "gai1", "ichi1", "news1", "spec1": - tag.Category = "frequent" - tag.Order = 1 case "exp", "id": tag.Category = "expression" - tag.Order = 2 + tag.Order = -5 case "arch", "iK": tag.Category = "archaism" - tag.Order = 2 + tag.Order = -5 } tags[name] = tag @@ -87,7 +93,7 @@ func computeJmdictTagMeta(entities map[string]string) map[string]dbTagMeta { return tags } -func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm { +func jmdictExtractTerms(edictEntry jmdict.JmdictEntry) []dbTerm { var terms []dbTerm convert := func(reading jmdict.JmdictReading, kanji *jmdict.JmdictKanji) { @@ -100,7 +106,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm { if kanji == nil { termBase.Expression = reading.Reading - termBase.addTags(reading.Priorities...) + jmdictAddPriorities(&termBase, reading.Priorities...) } else { termBase.Expression = kanji.Expression termBase.Reading = reading.Reading @@ -108,7 +114,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm { for _, priority := range kanji.Priorities { if hasString(priority, reading.Priorities) { - termBase.addTags(priority) + jmdictAddPriorities(&termBase, priority) } } } @@ -133,8 +139,8 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm { term.Glossary = append(term.Glossary, glossary.Content) } - computeJmdictRules(&term) - computeJmdictScore(&term) + jmdictBuildRules(&term) + jmdictBuildScore(&term) terms = append(terms, term) } @@ -155,7 +161,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm { return terms } -func exportJmdictDb(outputDir, title string, reader io.Reader, flags int) error { +func jmdictExportDb(outputDir, title string, reader io.Reader, flags int) error { dict, entities, err := jmdict.LoadJmdictNoTransform(reader) if err != nil { return err @@ -163,7 +169,7 @@ func exportJmdictDb(outputDir, title string, reader io.Reader, flags int) error var terms dbTermList for _, entry := range dict.Entries { - terms = append(terms, extractJmdictTerms(entry)...) + terms = append(terms, jmdictExtractTerms(entry)...) } return writeDb( @@ -171,7 +177,7 @@ func exportJmdictDb(outputDir, title string, reader io.Reader, flags int) error title, terms.crush(), nil, - computeJmdictTagMeta(entities), + jmdictBuildTagMeta(entities), flags&flagPretty == flagPretty, ) } diff --git a/enamdict.go b/enamdict.go index 5cb3de1..fd592b2 100644 --- a/enamdict.go +++ b/enamdict.go @@ -28,7 +28,7 @@ import ( "github.com/FooSoft/jmdict" ) -func computeJmnedictTagMeta(entities map[string]string) map[string]dbTagMeta { +func jmnedictBuildTagMeta(entities map[string]string) map[string]dbTagMeta { tags := make(map[string]dbTagMeta) for name, value := range entities { @@ -46,7 +46,7 @@ func computeJmnedictTagMeta(entities map[string]string) map[string]dbTagMeta { return tags } -func extractJmnedictTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm { +func jmnedictExtractTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm { var terms []dbTerm convert := func(reading jmdict.JmnedictReading, kanji *jmdict.JmnedictKanji) { @@ -95,7 +95,7 @@ func extractJmnedictTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm { return terms } -func exportJmnedictDb(outputDir, title string, reader io.Reader, flags int) error { +func jmnedictExportDb(outputDir, title string, reader io.Reader, flags int) error { dict, entities, err := jmdict.LoadJmnedictNoTransform(reader) if err != nil { return err @@ -103,7 +103,7 @@ func exportJmnedictDb(outputDir, title string, reader io.Reader, flags int) erro var terms dbTermList for _, e := range dict.Entries { - terms = append(terms, extractJmnedictTerms(e)...) + terms = append(terms, jmnedictExtractTerms(e)...) } return writeDb( @@ -111,7 +111,7 @@ func exportJmnedictDb(outputDir, title string, reader io.Reader, flags int) erro title, terms.crush(), nil, - computeJmnedictTagMeta(entities), + jmnedictBuildTagMeta(entities), flags&flagPretty == flagPretty, ) } diff --git a/epwing.go b/epwing.go index c051509..f18bbf3 100644 --- a/epwing.go +++ b/epwing.go @@ -55,14 +55,7 @@ type epwingExtractor interface { getFontWide() map[int]string } -type daijirinExtractor struct { - partsExp *regexp.Regexp - phonExp *regexp.Regexp - variantExp *regexp.Regexp - annotExp *regexp.Regexp -} - -func exportEpwingDb(outputDir, title string, reader io.Reader, flags int) error { +func epwingExportDb(outputDir, title string, reader io.Reader, flags int) error { data, err := ioutil.ReadAll(reader) if err != nil { return err diff --git a/kanjidic.go b/kanjidic.go index c78e0ec..8b65e26 100644 --- a/kanjidic.go +++ b/kanjidic.go @@ -30,7 +30,7 @@ import ( "github.com/FooSoft/jmdict" ) -func extractKanjidicKanji(entry jmdict.KanjidicCharacter) dbKanji { +func kanjidicExtractKanji(entry jmdict.KanjidicCharacter) dbKanji { kanji := dbKanji{Character: entry.Literal} if level := entry.Misc.JlptLevel; level != nil { @@ -78,7 +78,7 @@ func extractKanjidicKanji(entry jmdict.KanjidicCharacter) dbKanji { return kanji } -func exportKanjidicDb(outputDir, title string, reader io.Reader, flags int) error { +func kanjidicExportDb(outputDir, title string, reader io.Reader, flags int) error { dict, err := jmdict.LoadKanjidic(reader) if err != nil { return err @@ -86,12 +86,12 @@ func exportKanjidicDb(outputDir, title string, reader io.Reader, flags int) erro var kanji dbKanjiList for _, entry := range dict.Characters { - kanji = append(kanji, extractKanjidicKanji(entry)) + kanji = append(kanji, kanjidicExtractKanji(entry)) } tagMeta := map[string]dbTagMeta{ - "jouyou": {Notes: "included in list of regular-use characters", Category: "frequent", Order: 3}, - "jinmeiyou": {Notes: "included in list of characters for use in personal names", Category: "frequent", Order: 3}, + "jouyou": {Notes: "included in list of regular-use characters", Category: "frequent", Order: -5}, + "jinmeiyou": {Notes: "included in list of characters for use in personal names", Category: "frequent", Order: -5}, "jlpt": {Notes: "corresponding Japanese Language Proficiency Test level"}, "grade": {Notes: "school grade level at which the character is taught"}, "strokes": {Notes: "number of strokes needed to write the character"}, diff --git a/main.go b/main.go index 47fcda7..0fc8e9a 100644 --- a/main.go +++ b/main.go @@ -44,10 +44,10 @@ func usage() { func exportDb(inputPath, outputDir, format, title string, flags int) error { handlers := map[string]func(string, string, io.Reader, int) error{ - "edict": exportJmdictDb, - "enamdict": exportJmnedictDb, - "kanjidic": exportKanjidicDb, - "epwing": exportEpwingDb, + "edict": jmdictExportDb, + "enamdict": jmnedictExportDb, + "kanjidic": kanjidicExportDb, + "epwing": epwingExportDb, } handler, ok := handlers[format]