From 853d0b33dc30dde465a8d62798b17467df0088a5 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sun, 22 Jan 2023 14:14:33 -0600 Subject: [PATCH 01/19] Use empty interface type for dictionary glossaries Necessary for structured content support --- common.go | 6 +++--- daijirin.go | 4 ++-- daijisen.go | 4 ++-- enamdict.go | 4 +++- gakken.go | 4 ++-- kotowaza.go | 2 +- koujien.go | 4 ++-- meikyou.go | 4 ++-- shougakukan2.go | 2 +- wadai.go | 2 +- 10 files changed, 19 insertions(+), 17 deletions(-) diff --git a/common.go b/common.go index aa566c0..f3ff84c 100644 --- a/common.go +++ b/common.go @@ -21,7 +21,7 @@ const ( const databaseFormat = 3 -type dbRecord []interface{} +type dbRecord []any type dbRecordList []dbRecord type dbTag struct { @@ -46,7 +46,7 @@ func (meta dbTagList) crush() dbRecordList { type dbMeta struct { Expression string Mode string - Data interface{} + Data any } type dbMetaList []dbMeta @@ -66,7 +66,7 @@ type dbTerm struct { DefinitionTags []string Rules []string Score int - Glossary []string + Glossary []any Sequence int TermTags []string } diff --git a/daijirin.go b/daijirin.go index 2c2b190..abc30e6 100644 --- a/daijirin.go +++ b/daijirin.go @@ -65,7 +65,7 @@ func (e *daijirinExtractor) extractTerms(entry zig.BookEntry, sequence int) []db for _, reading := range readings { term := dbTerm{ Expression: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } @@ -79,7 +79,7 @@ func (e *daijirinExtractor) extractTerms(entry zig.BookEntry, sequence int) []db term := dbTerm{ Expression: expression, Reading: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } diff --git a/daijisen.go b/daijisen.go index 5d663df..332bc46 100644 --- a/daijisen.go +++ b/daijisen.go @@ -70,7 +70,7 @@ func (e *daijisenExtractor) extractTerms(entry zig.BookEntry, sequence int) []db if len(expressions) == 0 { term := dbTerm{ Expression: reading, - Glossary: []string{entry.Text}, + Glossary:
[]any{entry.Text}, Sequence: sequence, } @@ -82,7 +82,7 @@ func (e *daijisenExtractor) extractTerms(entry zig.BookEntry, sequence int) []db term := dbTerm{ Expression: expression, Reading: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } diff --git a/enamdict.go b/enamdict.go index f3df513..be12d5b 100644 --- a/enamdict.go +++ b/enamdict.go @@ -53,7 +53,9 @@ func jmnedictExtractTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm { } for _, trans := range enamdictEntry.Translations { - term.Glossary = append(term.Glossary, trans.Translations...) + for _, translation := range trans.Translations { + term.Glossary = append(term.Glossary, translation) + } term.addDefinitionTags(trans.NameTypes...) } diff --git a/gakken.go b/gakken.go index b25f989..58e96b4 100644 --- a/gakken.go +++ b/gakken.go @@ -90,7 +90,7 @@ func (e *gakkenExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTe for _, reading := range readings { term := dbTerm{ Expression: reading, - Glossary: []string{entryText}, + Glossary: []any{entryText}, Sequence: sequence, } @@ -107,7 +107,7 @@ func (e *gakkenExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTe term := dbTerm{ Expression: expression, Reading: reading, - Glossary: []string{entryText}, + Glossary: []any{entryText}, Sequence: sequence, } diff --git a/kotowaza.go b/kotowaza.go index 7f713ae..fca8f7d 100644 --- a/kotowaza.go +++ b/kotowaza.go @@ -72,7 +72,7 @@ func (e *kotowazaExtractor) extractTerms(entry zig.BookEntry, sequence int) []db term := dbTerm{ Expression: expression, Reading: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } diff --git a/koujien.go b/koujien.go index 89b7379..049d5a0 100644 --- a/koujien.go +++ b/koujien.go @@ -75,7 +75,7 @@ func (e *koujienExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT for _, reading := range readings { term := dbTerm{ Expression: reading, - Glossary: 
[]string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } @@ -89,7 +89,7 @@ func (e *koujienExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT term := dbTerm{ Expression: expression, Reading: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } diff --git a/meikyou.go b/meikyou.go index 78a3081..2ea33fe 100644 --- a/meikyou.go +++ b/meikyou.go @@ -106,7 +106,7 @@ func (e *meikyouExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT for _, reading := range readings { term := dbTerm{ Expression: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } @@ -120,7 +120,7 @@ func (e *meikyouExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT term := dbTerm{ Expression: expression, Reading: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } diff --git a/shougakukan2.go b/shougakukan2.go index 5b06ea6..7b16549 100644 --- a/shougakukan2.go +++ b/shougakukan2.go @@ -93,7 +93,7 @@ func (e *shougakukan2Extractor) extractTerms(entry zig.BookEntry, sequence int) terms = append(terms, dbTerm{ Expression: expression, Reading: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, }) } diff --git a/wadai.go b/wadai.go index 2507b92..0226f64 100644 --- a/wadai.go +++ b/wadai.go @@ -74,7 +74,7 @@ func (e *wadaiExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTer term := dbTerm{ Expression: expression, Reading: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } From 56f98959677969e6e9f77a56bfd99fdd555a1e42 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sun, 22 Jan 2023 14:27:02 -0600 Subject: [PATCH 02/19] Add struct for handling index.json data --- common.go | 42 ++++++++++++++++++++++++++---------------- edict.go | 14 ++++++++++---- enamdict.go | 15 ++++++++++----- epwing.go | 13 ++++++++++--- frequency.go | 
15 ++++++++++----- kanjidic.go | 15 ++++++++++----- rikai.go | 15 ++++++++++----- 7 files changed, 86 insertions(+), 43 deletions(-) diff --git a/common.go b/common.go index f3ff84c..ec331d6 100644 --- a/common.go +++ b/common.go @@ -19,8 +19,6 @@ const ( DefaultTitle = "" ) -const databaseFormat = 3 - type dbRecord []any type dbRecordList []dbRecord @@ -142,11 +140,34 @@ func (kanji dbKanjiList) crush() dbRecordList { return results } -func writeDb(outputPath, title, revision string, sequenced bool, recordData map[string]dbRecordList, stride int, pretty bool) error { +type dbIndex struct { + Title string `json:"title"` + Format int `json:"format"` + Revision string `json:"revision"` + Sequenced bool `json:"sequenced"` + Author string `json:"author"` + Url string `json:"url"` + Description string `json:"description"` + Attribution string `json:"attribution"` +} + +func (index *dbIndex) setDefaults() { + if index.Format == 0 { + index.Format = 3 + } + if index.Author == "" { + index.Author = "yomichan-import" + } + if index.Url == "" { + index.Url = "https://github.com/FooSoft/yomichan-import" + } +} + +func writeDb(outputPath string, index dbIndex, recordData map[string]dbRecordList, stride int, pretty bool) error { var zbuff bytes.Buffer zip := zip.NewWriter(&zbuff) - marshalJSON := func(obj interface{}, pretty bool) ([]byte, error) { + marshalJSON := func(obj any, pretty bool) ([]byte, error) { if pretty { return json.MarshalIndent(obj, "", " ") } @@ -186,17 +207,6 @@ func writeDb(outputPath, title, revision string, sequenced bool, recordData map[ } var err error - var db struct { - Title string `json:"title"` - Format int `json:"format"` - Revision string `json:"revision"` - Sequenced bool `json:"sequenced"` - } - - db.Title = title - db.Format = databaseFormat - db.Revision = revision - db.Sequenced = sequenced for recordType, recordEntries := range recordData { if _, err := writeDbRecords(recordType, recordEntries); err != nil { @@ -204,7 +214,7 @@ func 
writeDb(outputPath, title, revision string, sequenced bool, recordData map[ } } - bytes, err := marshalJSON(db, pretty) + bytes, err := marshalJSON(index, pretty) if err != nil { return err } diff --git a/edict.go b/edict.go index f30dfdb..b6326f0 100644 --- a/edict.go +++ b/edict.go @@ -7,7 +7,7 @@ import ( "foosoft.net/projects/jmdict" ) -const jmdictRevision = "jmdict4" +const edrdgAttribution = "This publication has included material from the JMdict (EDICT, etc.) dictionary files in accordance with the licence provisions of the Electronic Dictionaries Research Group. See http://www.edrdg.org/" func jmdictBuildRules(term *dbTerm) { for _, tag := range term.DefinitionTags { @@ -234,11 +234,17 @@ func jmdictExportDb(inputPath, outputPath, language, title string, stride int, p "tag": jmdictBuildTagMeta(entities).crush(), } + index := dbIndex{ + Title: title, + Revision: "jmdict4", + Sequenced: true, + Attribution: edrdgAttribution, + } + index.setDefaults() + return writeDb( outputPath, - title, - jmdictRevision, - true, + index, recordData, stride, pretty, diff --git a/enamdict.go b/enamdict.go index be12d5b..e0c1cb0 100644 --- a/enamdict.go +++ b/enamdict.go @@ -6,8 +6,6 @@ import ( "foosoft.net/projects/jmdict" ) -const jmnedictRevision = "jmnedict1" - func jmnedictBuildTagMeta(entities map[string]string) dbTagList { var tags dbTagList @@ -103,11 +101,18 @@ func jmnedictExportDb(inputPath, outputPath, language, title string, stride int, "tag": jmnedictBuildTagMeta(entities).crush(), } + index := dbIndex{ + Title: title, + Revision: "jmnedict1", + Sequenced: true, + Description: "", + Attribution: edrdgAttribution, + } + index.setDefaults() + return writeDb( outputPath, - title, - jmnedictRevision, - true, + index, recordData, stride, pretty, diff --git a/epwing.go b/epwing.go index 37516c6..83b54b8 100644 --- a/epwing.go +++ b/epwing.go @@ -101,11 +101,18 @@ func epwingExportDb(inputPath, outputPath, language, title string, stride int, p "term": terms.crush(), } 
+ index := dbIndex{ + Title: title, + Revision: strings.Join(revisions, ";"), + Sequenced: true, + Description: "", + Attribution: "", + } + index.setDefaults() + return writeDb( outputPath, - title, - strings.Join(revisions, ";"), - true, + index, recordData, stride, pretty, diff --git a/frequency.go b/frequency.go index 694ed67..5d9f06a 100644 --- a/frequency.go +++ b/frequency.go @@ -7,8 +7,6 @@ import ( "strings" ) -const frequencyRevision = "frequency1" - func frequencyTermsExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error { return frequncyExportDb(inputPath, outputPath, language, title, stride, pretty, "term_meta") } @@ -57,11 +55,18 @@ func frequncyExportDb(inputPath, outputPath, language, title string, stride int, key: frequencies.crush(), } + index := dbIndex{ + Title: title, + Revision: "frequency1", + Sequenced: false, + Description: "", + Attribution: "", + } + index.setDefaults() + return writeDb( outputPath, - title, - frequencyRevision, - false, + index, recordData, stride, pretty, diff --git a/kanjidic.go b/kanjidic.go index 37bebdd..e1c42d9 100644 --- a/kanjidic.go +++ b/kanjidic.go @@ -7,8 +7,6 @@ import ( "foosoft.net/projects/jmdict" ) -const kanjidicRevision = "kanjidic2" - func kanjidicExtractKanji(entry jmdict.KanjidicCharacter, language string) *dbKanji { if entry.ReadingMeaning == nil { return nil @@ -161,11 +159,18 @@ func kanjidicExportDb(inputPath, outputPath, language, title string, stride int, "tag": tags.crush(), } + index := dbIndex{ + Title: title, + Revision: "kanjidic2", + Sequenced: false, + Description: "", + Attribution: edrdgAttribution, + } + index.setDefaults() + return writeDb( outputPath, - title, - kanjidicRevision, - false, + index, recordData, stride, pretty, diff --git a/rikai.go b/rikai.go index 651bc44..f3b6b12 100644 --- a/rikai.go +++ b/rikai.go @@ -8,8 +8,6 @@ import ( _ "github.com/mattn/go-sqlite3" ) -const rikaiRevision = "rikai2" - type rikaiEntry struct { kanji string kana 
string @@ -154,11 +152,18 @@ func rikaiExportDb(inputPath, outputPath, language, title string, stride int, pr "tag": tags.crush(), } + index := dbIndex{ + Title: title, + Revision: "rikai2", + Sequenced: true, + Description: "", + Attribution: "", + } + index.setDefaults() + return writeDb( outputPath, - title, - rikaiRevision, - true, + index, recordData, stride, pretty, From 73fb99286583a0ac1f82efabf9172e06ec796968 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sun, 22 Jan 2023 14:32:45 -0600 Subject: [PATCH 03/19] Add intersection and union functions for string arrays --- common.go | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/common.go b/common.go index ec331d6..5f2dab3 100644 --- a/common.go +++ b/common.go @@ -262,6 +262,39 @@ func hasString(needle string, haystack []string) bool { return false } +func intersection(s1, s2 []string) []string { + s := []string{} + m := make(map[string]bool) + for _, e := range s1 { + m[e] = true + } + for _, e := range s2 { + if m[e] { + s = append(s, e) + m[e] = false + } + } + return s +} + +func union(s1, s2 []string) []string { + s := []string{} + m := make(map[string]bool) + for _, e := range s1 { + if !m[e] { + s = append(s, e) + m[e] = true + } + } + for _, e := range s2 { + if !m[e] { + s = append(s, e) + m[e] = true + } + } + return s +} + func detectFormat(path string) (string, error) { switch filepath.Ext(path) { case ".sqlite": From abc28bb19dc7bc6f7efba09c0720b21fbbc19b11 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sun, 22 Jan 2023 14:37:18 -0600 Subject: [PATCH 04/19] Add new JMdict version --- common.go | 5 +- edict.go | 252 ------------------------------- go.mod | 1 + go.sum | 2 + jmdict.go | 221 +++++++++++++++++++++++++++ jmdictConstants.go | 215 ++++++++++++++++++++++++++ jmdictForms.go | 254 +++++++++++++++++++++++++++++++ jmdictGlossary.go | 300 +++++++++++++++++++++++++++++++++++++ jmdictHeadword.go | 267 +++++++++++++++++++++++++++++++++ jmdictMetadata.go | 158 
++++++++++++++++++++ jmdictReferences.go | 166 +++++++++++++++++++++ jmdictTags.go | 348 +++++++++++++++++++++++++++++++++++++++++++ structuredContent.go | 192 ++++++++++++++++++++++++ 13 files changed, 2127 insertions(+), 254 deletions(-) delete mode 100644 edict.go create mode 100644 jmdict.go create mode 100644 jmdictConstants.go create mode 100644 jmdictForms.go create mode 100644 jmdictGlossary.go create mode 100644 jmdictHeadword.go create mode 100644 jmdictMetadata.go create mode 100644 jmdictReferences.go create mode 100644 jmdictTags.go create mode 100644 structuredContent.go diff --git a/common.go b/common.go index 5f2dab3..9d6b2aa 100644 --- a/common.go +++ b/common.go @@ -306,7 +306,7 @@ func detectFormat(path string) (string, error) { } switch filepath.Base(path) { - case "JMdict", "JMdict.xml", "JMdict_e", "JMdict_e.xml": + case "JMdict", "JMdict.xml", "JMdict_e", "JMdict_e.xml", "JMdict_e_examp": return "edict", nil case "JMnedict", "JMnedict.xml": return "enamdict", nil @@ -336,7 +336,8 @@ func detectFormat(path string) (string, error) { func ExportDb(inputPath, outputPath, format, language, title string, stride int, pretty bool) error { handlers := map[string]func(string, string, string, string, int, bool) error{ - "edict": jmdictExportDb, + "edict": jmdExportDb, + "forms": formsExportDb, "enamdict": jmnedictExportDb, "epwing": epwingExportDb, "kanjidic": kanjidicExportDb, diff --git a/edict.go b/edict.go deleted file mode 100644 index b6326f0..0000000 --- a/edict.go +++ /dev/null @@ -1,252 +0,0 @@ -package yomichan - -import ( - "os" - "strings" - - "foosoft.net/projects/jmdict" -) - -const edrdgAttribution = "This publication has included material from the JMdict (EDICT, etc.) dictionary files in accordance with the licence provisions of the Electronic Dictionaries Research Group. 
See http://www.edrdg.org/" - -func jmdictBuildRules(term *dbTerm) { - for _, tag := range term.DefinitionTags { - switch tag { - case "adj-i", "v1", "vk", "vz": - term.addRules(tag) - default: - if strings.HasPrefix(tag, "v5") { - term.addRules("v5") - } else if strings.HasPrefix(tag, "vs-") { - term.addRules("vs") - } - } - } -} - -func jmdictBuildScore(term *dbTerm) { - for _, tag := range term.DefinitionTags { - switch tag { - case "arch": - term.Score -= 100 - } - } - for _, tag := range term.TermTags { - switch tag { - case "news", "ichi", "spec", "gai1": - term.Score += 100 - case "P": - term.Score += 500 - case "iK", "ik", "ok", "oK", "io", "oik": - term.Score -= 100 - } - } -} - -func jmdictAddPriorities(term *dbTerm, priorities ...string) { - for _, priority := range priorities { - switch priority { - case "news1", "ichi1", "spec1", "gai1": - term.addTermTags("P") - fallthrough - case "news2", "ichi2", "spec2", "gai2": - term.addTermTags(priority[:len(priority)-1]) - } - } -} - -func jmdictBuildTagMeta(entities map[string]string) dbTagList { - tags := dbTagList{ - dbTag{Name: "news", Notes: "appears frequently in Mainichi Shimbun", Category: "frequent", Order: -2}, - dbTag{Name: "ichi", Notes: "listed as common in Ichimango Goi Bunruishuu", Category: "frequent", Order: -2}, - dbTag{Name: "spec", Notes: "common words not included in frequency lists", Category: "frequent", Order: -2}, - dbTag{Name: "gai", Notes: "common loanword", Category: "frequent", Order: -2}, - dbTag{Name: "P", Notes: "popular term", Category: "popular", Order: -10, Score: 10}, - } - - for name, value := range entities { - tag := dbTag{Name: name, Notes: value} - - switch name { - case "exp", "id": - tag.Category = "expression" - tag.Order = -5 - case "arch": - tag.Category = "archaism" - tag.Order = -4 - case "iK", "ik", "ok", "oK", "io", "oik": - tag.Score = -5 - case "adj-f", "adj-i", "adj-ix", "adj-ku", "adj-na", "adj-nari", "adj-no", "adj-pn", "adj-shiku", "adj-t", "adv", "adv-to", 
"aux-adj", - "aux", "aux-v", "conj", "cop-da", "ctr", "int", "n-adv", "n", "n-pref", "n-pr", "n-suf", "n-t", "num", "pn", "pref", "prt", "suf", - "unc", "v1", "v1-s", "v2a-s", "v2b-k", "v2d-s", "v2g-k", "v2g-s", "v2h-k", "v2h-s", "v2k-k", "v2k-s", "v2m-s", "v2n-s", "v2r-k", - "v2r-s", "v2s-s", "v2t-k", "v2t-s", "v2w-s", "v2y-k", "v2y-s", "v2z-s", "v4b", "v4h", "v4k", "v4m", "v4r", "v4s", "v4t", "v5aru", - "v5b", "v5g", "v5k", "v5k-s", "v5m", "v5n", "v5r-i", "v5r", "v5s", "v5t", "v5u", "v5u-s", "vi", "vk", "vn", "vr", "vs-c", "vs-i", - "vs", "vs-s", "vt", "vz": - tag.Category = "partOfSpeech" - tag.Order = -3 - } - - tags = append(tags, tag) - } - - return tags -} - -func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm { - var terms []dbTerm - - convert := func(reading jmdict.JmdictReading, kanji *jmdict.JmdictKanji) { - if kanji != nil && reading.Restrictions != nil && !hasString(kanji.Expression, reading.Restrictions) { - return - } - - var termBase dbTerm - termBase.addTermTags(reading.Information...) - - if kanji == nil { - termBase.Expression = reading.Reading - jmdictAddPriorities(&termBase, reading.Priorities...) - } else { - termBase.Expression = kanji.Expression - termBase.Reading = reading.Reading - termBase.addTermTags(kanji.Information...) 
- - for _, priority := range kanji.Priorities { - if hasString(priority, reading.Priorities) { - jmdictAddPriorities(&termBase, priority) - } - } - } - - var partsOfSpeech []string - for index, sense := range edictEntry.Sense { - - if len(sense.PartsOfSpeech) != 0 { - partsOfSpeech = sense.PartsOfSpeech - } - - if sense.RestrictedReadings != nil && !hasString(reading.Reading, sense.RestrictedReadings) { - continue - } - - if kanji != nil && sense.RestrictedKanji != nil && !hasString(kanji.Expression, sense.RestrictedKanji) { - continue - } - - term := dbTerm{ - Reading: termBase.Reading, - Expression: termBase.Expression, - Score: len(edictEntry.Sense) - index, - Sequence: edictEntry.Sequence, - } - - for _, glossary := range sense.Glossary { - if glossary.Language == nil && language == "" || glossary.Language != nil && language == *glossary.Language { - term.Glossary = append(term.Glossary, glossary.Content) - } - } - - if len(term.Glossary) == 0 { - continue - } - - term.addDefinitionTags(termBase.DefinitionTags...) - term.addTermTags(termBase.TermTags...) - term.addDefinitionTags(partsOfSpeech...) - term.addDefinitionTags(sense.Fields...) - term.addDefinitionTags(sense.Misc...) - term.addDefinitionTags(sense.Dialects...) 
- - jmdictBuildRules(&term) - jmdictBuildScore(&term) - - terms = append(terms, term) - } - } - - if len(edictEntry.Kanji) > 0 { - for _, kanji := range edictEntry.Kanji { - for _, reading := range edictEntry.Readings { - if reading.NoKanji == nil { - convert(reading, &kanji) - } - } - } - for _, reading := range edictEntry.Readings { - if reading.NoKanji != nil { - convert(reading, nil) - } - } - } else { - for _, reading := range edictEntry.Readings { - convert(reading, nil) - } - } - - return terms -} - -func jmdictExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error { - reader, err := os.Open(inputPath) - if err != nil { - return err - } - defer reader.Close() - - dict, entities, err := jmdict.LoadJmdictNoTransform(reader) - if err != nil { - return err - } - - var langTag string - switch language { - case "dutch": - langTag = "dut" - case "french": - langTag = "fre" - case "german": - langTag = "ger" - case "hungarian": - langTag = "hun" - case "italian": - langTag = "ita" - case "russian": - langTag = "rus" - case "slovenian": - langTag = "slv" - case "spanish": - langTag = "spa" - case "swedish": - langTag = "swe" - } - - var terms dbTermList - for _, entry := range dict.Entries { - terms = append(terms, jmdictExtractTerms(entry, langTag)...) 
- } - - if title == "" { - title = "JMdict" - } - - recordData := map[string]dbRecordList{ - "term": terms.crush(), - "tag": jmdictBuildTagMeta(entities).crush(), - } - - index := dbIndex{ - Title: title, - Revision: "jmdict4", - Sequenced: true, - Attribution: edrdgAttribution, - } - index.setDefaults() - - return writeDb( - outputPath, - index, - recordData, - stride, - pretty, - ) -} diff --git a/go.mod b/go.mod index 0bca3dd..4f31a22 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( foosoft.net/projects/zero-epwing-go v0.0.0-20220704035039-bc008453615d github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e github.com/mattn/go-sqlite3 v1.14.14 + golang.org/x/exp v0.0.0-20221207211629-99ab8fa1c11f ) require golang.org/x/text v0.3.7 // indirect diff --git a/go.sum b/go.sum index ca51ada..4dd5f91 100644 --- a/go.sum +++ b/go.sum @@ -6,5 +6,7 @@ github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e h1:wSQCJiig/QkoUnpvelSP github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e/go.mod h1:5G2EjwzgZUPnnReoKvPWVneT8APYbyKkihDVAHUi0II= github.com/mattn/go-sqlite3 v1.14.14 h1:qZgc/Rwetq+MtyE18WhzjokPD93dNqLGNT3QJuLvBGw= github.com/mattn/go-sqlite3 v1.14.14/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= +golang.org/x/exp v0.0.0-20221207211629-99ab8fa1c11f h1:90Jq/vvGVDsqj8QqCynjFw9MCerDguSMODLYII416Y8= +golang.org/x/exp v0.0.0-20221207211629-99ab8fa1c11f/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= diff --git a/jmdict.go b/jmdict.go new file mode 100644 index 0000000..74809e7 --- /dev/null +++ b/jmdict.go @@ -0,0 +1,221 @@ +package yomichan + +import ( + "os" + "regexp" + "strconv" + "strings" + + "foosoft.net/projects/jmdict" + "golang.org/x/exp/slices" +) + +func grammarRules(partsOfSpeech []string) []string { + rules := []string{} + for _, partOfSpeech := range partsOfSpeech { + switch 
partOfSpeech { + case "adj-i", "vk", "vz": + rules = append(rules, partOfSpeech) + default: + if strings.HasPrefix(partOfSpeech, "v5") { + rules = append(rules, "v5") + } else if strings.HasPrefix(partOfSpeech, "v1") { + rules = append(rules, "v1") + } else if strings.HasPrefix(partOfSpeech, "vs-") { + rules = append(rules, "vs") + } + } + } + return rules +} + +func calculateTermScore(senseNumber int, headword headword) int { + const senseWeight int = 1 + const entryPositionWeight int = 100 + const priorityWeight int = 10000 + + score := 0 + score -= (senseNumber - 1) * senseWeight + score -= headword.Index * entryPositionWeight + score += headword.Score() * priorityWeight + + return score +} + +func doDisplaySenseNumberTag(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) bool { + // Display sense numbers if the entry has more than one sense + // or if the headword is found in multiple entries. + hash := headword.Hash() + if meta.seqToSenseCount[entry.Sequence] > 1 { + return true + } else if len(meta.headwordHashToSeqs[hash]) > 1 { + return true + } else { + return false + } +} + +func jmdictPublicationDate(dictionary jmdict.Jmdict) string { + dateEntry := dictionary.Entries[len(dictionary.Entries)-1] + r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`) + jmdictDate := r.FindString(dateEntry.Sense[0].Glossary[0].Content) + return jmdictDate +} + +func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm { + term := baseFormsTerm(entry) + term.Expression = headword.Expression + term.Reading = headword.Reading + + term.addTermTags(headword.TermTags...) 
+ + term.addDefinitionTags("forms") + senseNumber := meta.seqToSenseCount[entry.Sequence] + 1 + term.Score = calculateTermScore(senseNumber, headword) + return term +} + +func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm { + term := dbTerm{ + Expression: headword.Expression, + Sequence: -entry.Sequence, + } + for _, sense := range entry.Sense { + rules := grammarRules(sense.PartsOfSpeech) + term.addRules(rules...) + } + term.addTermTags(headword.TermTags...) + term.Score = calculateTermScore(0, headword) + + redirectHeadword := meta.seqToMainHeadword[entry.Sequence] + expHash := redirectHeadword.ExpHash() + doDisplayReading := (len(meta.expHashToReadings[expHash]) > 1) + + content := contentSpan( + contentAttr{fontSize: "130%"}, + "⟶", + redirectHeadword.ToInternalLink(doDisplayReading), + ) + + term.Glossary = []any{contentStructure(content)} + return term +} + +func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm { + term := dbTerm{ + Expression: headword.Expression, + Reading: headword.Reading, + Sequence: entry.Sequence, + } + + term.Glossary = createGlossary(sense, meta) + + term.addTermTags(headword.TermTags...) + + if doDisplaySenseNumberTag(headword, entry, meta) { + senseNumberTag := strconv.Itoa(senseNumber) + term.addDefinitionTags(senseNumberTag) + } + term.addDefinitionTags(sense.PartsOfSpeech...) + term.addDefinitionTags(sense.Fields...) + term.addDefinitionTags(sense.Misc...) + term.addDefinitionTags(sense.Dialects...) + + rules := grammarRules(sense.PartsOfSpeech) + term.addRules(rules...)
+ + term.Score = calculateTermScore(senseNumber, headword) + + return term +} + +func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) ([]dbTerm, bool) { + if meta.seqToSenseCount[entry.Sequence] == 0 { + return nil, false + } + if headword.IsSearchOnly { + searchTerm := createSearchTerm(headword, entry, meta) + return []dbTerm{searchTerm}, true + } + terms := []dbTerm{} + senseNumber := 1 + for _, sense := range entry.Sense { + if !glossaryContainsLanguage(sense.Glossary, meta.language) { + continue + } + if sense.RestrictedReadings != nil && !slices.Contains(sense.RestrictedReadings, headword.Reading) { + senseNumber += 1 + continue + } + if sense.RestrictedKanji != nil && !slices.Contains(sense.RestrictedKanji, headword.Expression) { + senseNumber += 1 + continue + } + senseTerm := createSenseTerm(sense, senseNumber, headword, entry, meta) + senseNumber += 1 + terms = append(terms, senseTerm) + } + + if meta.hasMultipleForms[entry.Sequence] { + formsTerm := createFormsTerm(headword, entry, meta) + terms = append(terms, formsTerm) + } + return terms, true +} + +func jmdExportDb(inputPath string, outputPath string, languageName string, title string, stride int, pretty bool) error { + reader, err := os.Open(inputPath) + if err != nil { + return err + } + defer reader.Close() + + dictionary, entities, err := jmdict.LoadJmdictNoTransform(reader) + if err != nil { + return err + } + + meta := newJmdictMetadata(dictionary, languageName) + + terms := dbTermList{} + for _, entry := range dictionary.Entries { + headwords := extractHeadwords(entry) + for _, headword := range headwords { + if newTerms, ok := extractTerms(headword, entry, meta); ok { + terms = append(terms, newTerms...) + } + } + } + + tags := dbTagList{} + tags = append(tags, entityTags(entities)...) + tags = append(tags, senseNumberTags(meta.maxSenseCount)...) + tags = append(tags, newsFrequencyTags()...) + tags = append(tags, customDbTags()...) 
+ + recordData := map[string]dbRecordList{ + "term": terms.crush(), + "tag": tags.crush(), + } + + if title == "" { + title = "JMdict" + } + jmdictDate := jmdictPublicationDate(dictionary) + + index := dbIndex{ + Title: title, + Revision: "JMdict." + jmdictDate, + Sequenced: true, + Attribution: edrdgAttribution, + } + index.setDefaults() + + return writeDb( + outputPath, + index, + recordData, + stride, + pretty, + ) +} diff --git a/jmdictConstants.go b/jmdictConstants.go new file mode 100644 index 0000000..1d49194 --- /dev/null +++ b/jmdictConstants.go @@ -0,0 +1,215 @@ +package yomichan + +type LangCode struct { + language string + code string +} + +const edrdgAttribution = "This publication has included material from the JMdict (EDICT, etc.) dictionary files in accordance with the licence provisions of the Electronic Dictionaries Research Group. See http://www.edrdg.org/" + +const prioritySymbol = "โ˜…" +const rareKanjiSymbol = "๐Ÿ…" +const irregularSymbol = "โš " +const outdatedSymbol = "โ›ฌ" +const defaultSymbol = "ใŠ’" + +const priorityTagName = "โญ" +const rareKanjiTagName = "R" +const irregularTagName = "โš ๏ธ" +const outdatedTagName = "โ›ฌ" +const atejiTagName = "ateji" +const gikunTagName = "gikun" + +const langMarker = "'๐ŸŒ '" +const noteMarker = "'๐Ÿ“ '" +const infoMarker = "'โ„น๏ธ '" +const refMarker = "'โžก๏ธ '" +const antonymMarker = "'๐Ÿ”„ '" + +var ISOtoFlag = map[string]string{ + "": "'๐Ÿ‡ฌ๐Ÿ‡ง '", + "eng": "'๐Ÿ‡ฌ๐Ÿ‡ง '", + "dut": "'๐Ÿ‡ณ๐Ÿ‡ฑ '", + "fre": "'๐Ÿ‡ซ๐Ÿ‡ท '", + "ger": "'๐Ÿ‡ฉ๐Ÿ‡ช '", + "hun": "'๐Ÿ‡ญ๐Ÿ‡บ '", + "ita": "'๐Ÿ‡ฎ๐Ÿ‡น '", + "jpn": "'๐Ÿ‡ฏ๐Ÿ‡ต '", + "rus": "'๐Ÿ‡ท๐Ÿ‡บ '", + "slv": "'๐Ÿ‡ธ๐Ÿ‡ฎ '", + "spa": "'๐Ÿ‡ช๐Ÿ‡ธ '", + "swe": "'๐Ÿ‡ธ๐Ÿ‡ช '", +} + +var langNameToCode = map[string]string{ + "": "eng", + "english": "eng", + "dutch": "dut", + "french": "fre", + "german": "ger", + "hungarian": "hun", + "italian": "ita", + "russian": "rus", + "slovenian": "slv", + "spanish": "spa", + "swedish": "swe", +} + +var 
glossTypeCodeToName = map[LangCode]string{ + LangCode{"eng", "lit"}: "literally", + LangCode{"eng", "fig"}: "figuratively", + LangCode{"eng", "expl"}: "", // don't need to tell the user that an explanation is an explanation + LangCode{"eng", "tm"}: "trademark", +} + +var refNoteHint = map[LangCode]string{ + LangCode{"eng", "xref"}: "see", + LangCode{"eng", "ant"}: "antonym", +} + +var sourceLangTypeCodeToType = map[LangCode]string{ + LangCode{"eng", "part"}: "partial", + LangCode{"eng", ""}: "", // implied "full" +} + +var langCodeToName = map[LangCode]string{ + LangCode{"eng", "afr"}: "Afrikaans", + LangCode{"eng", "ain"}: "Ainu", + LangCode{"eng", "alg"}: "Algonquian", + LangCode{"eng", "amh"}: "Amharic", + LangCode{"eng", "ara"}: "Arabic", + LangCode{"eng", "arn"}: "Mapudungun", + LangCode{"eng", "bnt"}: "Bantu", + LangCode{"eng", "bre"}: "Breton", + LangCode{"eng", "bul"}: "Bulgarian", + LangCode{"eng", "bur"}: "Burmese", + LangCode{"eng", "chi"}: "Chinese", + LangCode{"eng", "chn"}: "Chinook Jargon", + LangCode{"eng", "cze"}: "Czech", + LangCode{"eng", "dan"}: "Danish", + LangCode{"eng", "dut"}: "Dutch", + LangCode{"eng", "eng"}: "English", + LangCode{"eng", "epo"}: "Esperanto", + LangCode{"eng", "est"}: "Estonian", + LangCode{"eng", "fil"}: "Filipino", + LangCode{"eng", "fin"}: "Finnish", + LangCode{"eng", "fre"}: "French", + LangCode{"eng", "geo"}: "Georgian", + LangCode{"eng", "ger"}: "German", + LangCode{"eng", "glg"}: "Galician", + LangCode{"eng", "grc"}: "Ancient Greek", + LangCode{"eng", "gre"}: "Modern Greek", + LangCode{"eng", "haw"}: "Hawaiian", + LangCode{"eng", "heb"}: "Hebrew", + LangCode{"eng", "hin"}: "Hindi", + LangCode{"eng", "hun"}: "Hungarian", + LangCode{"eng", "ice"}: "Icelandic", + LangCode{"eng", "ind"}: "Indonesian", + LangCode{"eng", "ita"}: "Italian", + LangCode{"eng", "khm"}: "Khmer", + LangCode{"eng", "kor"}: "Korean", + LangCode{"eng", "kur"}: "Kurdish", + LangCode{"eng", "lat"}: "Latin", + LangCode{"eng", "mal"}: "Malayalam", + 
LangCode{"eng", "mao"}: "Maori", + LangCode{"eng", "may"}: "Malay", + LangCode{"eng", "mnc"}: "Manchu", + LangCode{"eng", "mol"}: "Moldavian", // ISO 639 deprecated (https://iso639-3.sil.org/code/mol) + LangCode{"eng", "mon"}: "Mongolian", + LangCode{"eng", "nor"}: "Norwegian", + LangCode{"eng", "per"}: "Persian", + LangCode{"eng", "pol"}: "Polish", + LangCode{"eng", "por"}: "Portuguese", + LangCode{"eng", "rum"}: "Romanian", + LangCode{"eng", "rus"}: "Russian", + LangCode{"eng", "san"}: "Sanskrit", + LangCode{"eng", "scr"}: "Croatian", // Code doesn't seem to exist in ISO 639. Should be "hrv" instead? (https://iso639-3.sil.org/code/hrv) + LangCode{"eng", "slo"}: "Slovak", + LangCode{"eng", "slv"}: "Slovenian", + LangCode{"eng", "som"}: "Somali", + LangCode{"eng", "spa"}: "Spanish", + LangCode{"eng", "swa"}: "Swahili", + LangCode{"eng", "swe"}: "Swedish", + LangCode{"eng", "tah"}: "Tahitian", + LangCode{"eng", "tam"}: "Tamil", + LangCode{"eng", "tgl"}: "Tagalog", + LangCode{"eng", "tha"}: "Thai", + LangCode{"eng", "tib"}: "Tibetan", + LangCode{"eng", "tur"}: "Turkish", + LangCode{"eng", "ukr"}: "Ukrainian", + LangCode{"eng", "urd"}: "Urdu", + LangCode{"eng", "vie"}: "Vietnamese", + LangCode{"eng", "yid"}: "Yiddish", +} + +// https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry +var ISOtoHTML = map[string]string{ + "afr": "af", // Afrikaans + "ain": "ain", // Ainu + "alg": "alg", // Algonquian + "amh": "am", // Amharic + "ara": "ar", // Arabic + "arn": "arn", // Mapudungun + "bnt": "bnt", // Bantu + "bre": "br", // Breton + "bul": "bg", // Bulgarian + "bur": "my", // Burmese + "chi": "zh", // Chinese + "chn": "chn", // Chinook Jargon + "cze": "cs", // Czech + "dan": "da", // Danish + "dut": "nl", // Dutch + "eng": "en", // English + "epo": "eo", // Esperanto + "est": "et", // Estonian + "fil": "fil", // Filipino + "fin": "fi", // Finnish + "fre": "fr", // French + "geo": "ka", // Georgian + "ger": "de", // German + "glg": "gl", // 
Galician + "grc": "grc", // Ancient Greek + "gre": "el", // Modern Greek + "haw": "haw", // Hawaiian + "heb": "he", // Hebrew + "hin": "hi", // Hindi + "hun": "hu", // Hungarian + "ice": "is", // Icelandic + "ind": "id", // Indonesian + "ita": "it", // Italian + "jpn": "ja", // Japanese + "khm": "km", // Khmer + "kor": "ko", // Korean + "kur": "ku", // Kurdish + "lat": "la", // Latin + "mal": "ml", // Malayalam + "mao": "mi", // Maori + "may": "ms", // Malay + "mnc": "mnc", // Manchu + "mol": "ro", // Moldavian + "mon": "mn", // Mongolian + "nor": "no", // Norwegian + "per": "fa", // Persian + "pol": "pl", // Polish + "por": "pt", // Portuguese + "rum": "ro", // Romanian + "rus": "ru", // Russian + "san": "sa", // Sanskrit + "scr": "hr", // Croatian + "slo": "sk", // Slovak + "slv": "sl", // Slovenian + "som": "so", // Somali + "spa": "es", // Spanish + "swa": "sw", // Swahili + "swe": "sv", // Swedish + "tah": "ty", // Tahitian + "tam": "ta", // Tamil + "tgl": "tl", // Tagalog + "tha": "th", // Thai + "tib": "bo", // Tibetan + "tur": "tr", // Turkish + "ukr": "uk", // Ukrainian + "urd": "ur", // Urdu + "vie": "vi", // Vietnamese + "yid": "yi", // Yiddish +} diff --git a/jmdictForms.go b/jmdictForms.go new file mode 100644 index 0000000..76eba34 --- /dev/null +++ b/jmdictForms.go @@ -0,0 +1,254 @@ +package yomichan + +import ( + "os" + "strings" + + "foosoft.net/projects/jmdict" + "golang.org/x/exp/slices" +) + +func kata2hira(word string) string { + charMap := func(character rune) rune { + if (character >= 'ใ‚ก' && character <= 'ใƒถ') || (character >= 'ใƒฝ' && character <= 'ใƒพ') { + return character - 0x60 + } else { + return character + } + } + return strings.Map(charMap, word) +} + +func (h *headword) InfoSymbols() string { + infoSymbols := []string{} + if h.IsPriority { + infoSymbols = append(infoSymbols, prioritySymbol) + } + if h.IsRareKanji { + infoSymbols = append(infoSymbols, rareKanjiSymbol) + } + if h.IsIrregular { + infoSymbols = append(infoSymbols, 
irregularSymbol) + } + if h.IsOutdated { + infoSymbols = append(infoSymbols, outdatedSymbol) + } + return strings.Join(infoSymbols[:], " | ") +} + +func (h *headword) GlossText() string { + gloss := h.Expression + if h.IsAteji { + gloss = "ใ€ˆ" + gloss + "ใ€‰" + } + symbolText := h.InfoSymbols() + if symbolText != "" { + gloss += "๏ผˆ" + symbolText + "๏ผ‰" + } + return gloss +} + +func (h *headword) TableColHeaderText() string { + text := h.KanjiForm() + if h.IsAteji { + text = "ใ€ˆ" + text + "ใ€‰" + } + return text +} + +func (h *headword) TableRowHeaderText() string { + text := h.Reading + if h.IsGikun { + text = "ใ€ˆ" + text + "ใ€‰" + } + return text +} + +func (h *headword) TableCellText() string { + text := h.InfoSymbols() + if text == "" { + return defaultSymbol + } else { + return text + } +} + +func (h *headword) KanjiForm() string { + if h.IsKanaOnly() { + return "โˆ…" + } else { + return h.Expression + } +} + +func jmdNeedsFormTable(headwords []headword) bool { + // Does the entry contain more than 1 distinct reading? + // E.g. ใƒใ‚ซใŒใ„ and ใฐใ‹ใŒใ„ are not distinct. 
+ uniqueReading := "" + for _, h := range headwords { + if h.IsGikun { + return true + } else if h.IsSearchOnly { + continue + } else if h.IsKanaOnly() { + continue + } else if uniqueReading == "" { + uniqueReading = kata2hira(h.Reading) + } else if uniqueReading != kata2hira(h.Reading) { + return true + } + } + return false +} + +type formTableData struct { + kanjiForms []string + readings []string + colHeaderText map[string]string + rowHeaderText map[string]string + cellText map[string]map[string]string +} + +func tableData(headwords []headword) formTableData { + d := formTableData{ + kanjiForms: []string{}, + readings: []string{}, + colHeaderText: make(map[string]string), + rowHeaderText: make(map[string]string), + cellText: make(map[string]map[string]string), + } + for _, h := range headwords { + if h.IsSearchOnly { + continue + } + kanjiForm := h.KanjiForm() + if !slices.Contains(d.kanjiForms, kanjiForm) { + d.kanjiForms = append(d.kanjiForms, kanjiForm) + d.colHeaderText[kanjiForm] = h.TableColHeaderText() + } + reading := h.Reading + if !slices.Contains(d.readings, reading) { + d.readings = append(d.readings, reading) + d.rowHeaderText[reading] = h.TableRowHeaderText() + d.cellText[reading] = make(map[string]string) + } + d.cellText[reading][kanjiForm] = h.TableCellText() + } + return d +} + +func formsTableGlossary(headwords []headword) []any { + d := tableData(headwords) + + attr := contentAttr{} + centeredAttr := contentAttr{textAlign: "center"} + leftAttr := contentAttr{textAlign: "left"} + + cornerCell := contentTableHeadCell(attr, "") // empty cell in upper left corner + headRowCells := []any{cornerCell} + for _, kanjiForm := range d.kanjiForms { + content := d.colHeaderText[kanjiForm] + cell := contentTableHeadCell(centeredAttr, content) + headRowCells = append(headRowCells, cell) + } + headRow := contentTableRow(attr, headRowCells...) 
+ tableRows := []any{headRow} + for _, reading := range d.readings { + rowHeadCellText := d.rowHeaderText[reading] + rowHeadCell := contentTableHeadCell(leftAttr, rowHeadCellText) + rowCells := []any{rowHeadCell} + for _, kanjiForm := range d.kanjiForms { + text := d.cellText[reading][kanjiForm] + rowCell := contentTableCell(centeredAttr, text) + rowCells = append(rowCells, rowCell) + } + tableRow := contentTableRow(attr, rowCells...) + tableRows = append(tableRows, tableRow) + } + tableAttr := contentAttr{data: map[string]string{"content": "formsTable"}} + contentTable := contentTable(tableAttr, tableRows...) + content := contentStructure(contentTable) + return []any{content} +} + +func formsGlossary(headwords []headword) []any { + glossary := []any{} + for _, h := range headwords { + if h.IsSearchOnly { + continue + } + text := h.GlossText() + glossary = append(glossary, text) + } + return glossary +} + +func baseFormsTerm(entry jmdict.JmdictEntry) dbTerm { + term := dbTerm{Sequence: entry.Sequence} + headwords := extractHeadwords(entry) + if jmdNeedsFormTable(headwords) { + term.Glossary = formsTableGlossary(headwords) + } else { + term.Glossary = formsGlossary(headwords) + } + for _, sense := range entry.Sense { + rules := grammarRules(sense.PartsOfSpeech) + term.addRules(rules...) 
+ } + return term +} + +func formsExportDb(inputPath, outputPath, languageName, title string, stride int, pretty bool) error { + reader, err := os.Open(inputPath) + if err != nil { + return err + } + defer reader.Close() + + dictionary, _, err := jmdict.LoadJmdictNoTransform(reader) + if err != nil { + return err + } + + terms := dbTermList{} + for _, entry := range dictionary.Entries { + baseTerm := baseFormsTerm(entry) + headwords := extractHeadwords(entry) + for _, h := range headwords { + term := baseTerm + if h.IsSearchOnly { + term.Sequence = -term.Sequence + } + term.Expression = h.Expression + term.Reading = h.Reading + terms = append(terms, term) + } + } + + if title == "" { + title = "JMdict Forms" + } + + recordData := map[string]dbRecordList{ + "term": terms.crush(), + "tag": dbRecordList{}, + } + + jmdictDate := jmdictPublicationDate(dictionary) + + index := dbIndex{ + Title: title, + Revision: "JMdict." + jmdictDate, + Sequenced: true, + Attribution: edrdgAttribution, + } + index.setDefaults() + + return writeDb( + outputPath, + index, + recordData, + stride, + pretty, + ) +} diff --git a/jmdictGlossary.go b/jmdictGlossary.go new file mode 100644 index 0000000..0260cbf --- /dev/null +++ b/jmdictGlossary.go @@ -0,0 +1,300 @@ +package yomichan + +import ( + "fmt" + "strconv" + + "foosoft.net/projects/jmdict" +) + +func glossaryContainsLanguage(glossary []jmdict.JmdictGlossary, language string) bool { + hasGlosses := false + for _, gloss := range glossary { + if glossContainsLanguage(gloss, language) { + hasGlosses = true + break + } + } + return hasGlosses +} + +func glossContainsLanguage(gloss jmdict.JmdictGlossary, language string) bool { + if gloss.Language == nil && language != "eng" { + return false + } else if gloss.Language != nil && language != *gloss.Language { + return false + } else { + return true + } +} + +func makeGlossListItem(gloss jmdict.JmdictGlossary, language string) any { + contents := []any{gloss.Content} + listItem := 
contentListItem(contentAttr{}, contents...) + return listItem +} + +func makeInfoGlossListItem(gloss jmdict.JmdictGlossary, language string) any { + // Prepend gloss with "type" (literal, figurative, trademark, etc.) + glossTypeCode := *gloss.Type + contents := []any{} + if name, ok := glossTypeCodeToName[LangCode{language, glossTypeCode}]; ok { + if name != "" { + italicStyle := contentAttr{fontStyle: "italic"} + contents = append(contents, contentSpan(italicStyle, "("+name+")"), " ") + } + } else { + fmt.Println("Unknown glossary type code " + *gloss.Type + " for build language " + language) + contents = append(contents, "["+glossTypeCode+"] ") + } + contents = append(contents, gloss.Content) + listItem := contentListItem(contentAttr{}, contents...) + return listItem +} + +func makeSourceLangListItem(sourceLanguage jmdict.JmdictSource, language string) any { + contents := []any{} + + var srcLangCode string + if sourceLanguage.Language == nil { + srcLangCode = "eng" + } else { + srcLangCode = *sourceLanguage.Language + } + + // Format: [Language] ([Partial?], [Wasei?]): [Original word?] 
+ // [Language] + if langName, ok := langCodeToName[LangCode{language, srcLangCode}]; ok { + contents = append(contents, langName) + } else { + contents = append(contents, srcLangCode) + fmt.Println("Unable to convert ISO 639 code " + srcLangCode + " to its full name in language " + language) + } + + // ([Partial?], [Wasei?]) + var sourceLangTypeCode string + if sourceLanguage.Type == nil { + sourceLangTypeCode = "" + } else { + sourceLangTypeCode = *sourceLanguage.Type + } + var sourceLangType string + if val, ok := sourceLangTypeCodeToType[LangCode{language, sourceLangTypeCode}]; ok { + sourceLangType = val + } else { + sourceLangType = sourceLangTypeCode + fmt.Println("Unknown source language type code " + sourceLangTypeCode + " for build language " + language) + } + if sourceLangType != "" && sourceLanguage.Wasei == "y" { + contents = append(contents, " ("+sourceLangType+", wasei)") + } else if sourceLangType != "" { + contents = append(contents, " ("+sourceLangType+")") + } else if sourceLanguage.Wasei == "y" { + contents = append(contents, " (wasei)") + } + + // : [Original word?] + if sourceLanguage.Content != "" { + contents = append(contents, ": ") + attr := contentAttr{lang: ISOtoHTML[srcLangCode]} + contents = append(contents, contentSpan(attr, sourceLanguage.Content)) + } + + listItem := contentListItem(contentAttr{}, contents...) + return listItem +} + +func makeReferenceListItem(reference string, refType string, meta jmdictMetadata) any { + contents := []any{} + attr := contentAttr{} + + hint := refNoteHint[LangCode{meta.language, refType}] + contents = append(contents, hint+": ") + + refHeadword, senseNumber, ok := parseReference(reference) + if !ok { + contents = append(contents, "ใ€"+reference+"ใ€‘") + return contentListItem(attr, contents...) + } + + sequence, ok := meta.referenceToSeq[reference] + if !ok { + contents = append(contents, "ใ€"+reference+"ใ€‘") + return contentListItem(attr, contents...) 
+ } + + targetSense := senseID{ + sequence: sequence, + number: senseNumber, + } + + expHash := refHeadword.ExpHash() + doDisplayReading := (len(meta.expHashToReadings[expHash]) > 1) + doDisplaySenseNumber := (meta.seqToSenseCount[targetSense.sequence] > 1) + refGlossAttr := contentAttr{ + fontSize: "65%", + verticalAlign: "middle", + data: map[string]string{"content": "refGlosses"}, + } + + contents = append(contents, refHeadword.ToInternalLink(doDisplayReading)) + if doDisplaySenseNumber { + contents = append(contents, contentSpan(refGlossAttr, " "+strconv.Itoa(targetSense.number)+". "+meta.condensedGlosses[targetSense])) + } else { + contents = append(contents, contentSpan(refGlossAttr, " "+meta.condensedGlosses[targetSense])) + } + + listItem := contentListItem(attr, contents...) + return listItem +} + +func makeExampleListItem(sentence jmdict.JmdictExampleSentence) any { + if sentence.Lang == "jpn" { + return contentListItem(contentAttr{}, sentence.Text) + } else { + attr := contentAttr{ + lang: ISOtoHTML[sentence.Lang], + listStyleType: ISOtoFlag[sentence.Lang], + } + return contentListItem(attr, sentence.Text) + } +} + +func listAttr(lang string, listStyleType string, dataContent string) contentAttr { + return contentAttr{ + lang: lang, + listStyleType: listStyleType, + data: map[string]string{"content": dataContent}, + } +} + +func needsStructuredContent(sense jmdict.JmdictSense, language string) bool { + for _, gloss := range sense.Glossary { + if glossContainsLanguage(gloss, language) && gloss.Type != nil { + return true + } + } + if len(sense.SourceLanguages) > 0 { + return true + } else if len(sense.Information) > 0 { + return true + } else if len(sense.Antonyms) > 0 { + return true + } else if len(sense.References) > 0 { + return true + } else if len(sense.Examples) > 0 { + return true + } else { + return false + } +} + +func createGlossaryContent(sense jmdict.JmdictSense, meta jmdictMetadata) any { + glossaryContents := []any{} + + // Add normal 
glosses + glossListItems := []any{} + for _, gloss := range sense.Glossary { + if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil { + listItem := makeGlossListItem(gloss, meta.language) + glossListItems = append(glossListItems, listItem) + } + } + if len(glossListItems) > 0 { + attr := listAttr(ISOtoHTML[meta.language], "circle", "glossary") + list := contentUnorderedList(attr, glossListItems...) + glossaryContents = append(glossaryContents, list) + } + + // Add information glosses + infoGlossListItems := []any{} + for _, gloss := range sense.Glossary { + if glossContainsLanguage(gloss, meta.language) && gloss.Type != nil { + listItem := makeInfoGlossListItem(gloss, meta.language) + infoGlossListItems = append(infoGlossListItems, listItem) + } + } + if len(infoGlossListItems) > 0 { + attr := listAttr(ISOtoHTML[meta.language], infoMarker, "infoGlossary") + list := contentUnorderedList(attr, infoGlossListItems...) + glossaryContents = append(glossaryContents, list) + } + + // Add language-of-origin / loanword information + sourceLangListItems := []any{} + for _, sourceLanguage := range sense.SourceLanguages { + listItem := makeSourceLangListItem(sourceLanguage, meta.language) + sourceLangListItems = append(sourceLangListItems, listItem) + } + if len(sourceLangListItems) > 0 { + attr := listAttr(ISOtoHTML[meta.language], langMarker, "sourceLanguages") + list := contentUnorderedList(attr, sourceLangListItems...) + glossaryContents = append(glossaryContents, list) + } + + // Add sense notes + noteListItems := []any{} + for _, information := range sense.Information { + listItem := contentListItem(contentAttr{}, information) + noteListItems = append(noteListItems, listItem) + } + if len(noteListItems) > 0 { + attr := listAttr(ISOtoHTML["jpn"], noteMarker, "notes") // notes often contain japanese text + list := contentUnorderedList(attr, noteListItems...) 
+ glossaryContents = append(glossaryContents, list) + } + + // Add antonyms + antonymListItems := []any{} + for _, antonym := range sense.Antonyms { + listItem := makeReferenceListItem(antonym, "ant", meta) + antonymListItems = append(antonymListItems, listItem) + } + if len(antonymListItems) > 0 { + attr := listAttr(ISOtoHTML[meta.language], antonymMarker, "antonyms") + list := contentUnorderedList(attr, antonymListItems...) + glossaryContents = append(glossaryContents, list) + } + + // Add cross-references + referenceListItems := []any{} + for _, reference := range sense.References { + listItem := makeReferenceListItem(reference, "xref", meta) + referenceListItems = append(referenceListItems, listItem) + } + if len(referenceListItems) > 0 { + attr := listAttr(ISOtoHTML[meta.language], refMarker, "references") + list := contentUnorderedList(attr, referenceListItems...) + glossaryContents = append(glossaryContents, list) + } + + // Add example sentences + exampleListItems := []any{} + for _, example := range sense.Examples { + for _, sentence := range example.Sentences { + listItem := makeExampleListItem(sentence) + exampleListItems = append(exampleListItems, listItem) + } + } + if len(exampleListItems) > 0 { + attr := listAttr(ISOtoHTML["jpn"], ISOtoFlag["jpn"], "examples") + list := contentUnorderedList(attr, exampleListItems...) + glossaryContents = append(glossaryContents, list) + } + + return contentStructure(glossaryContents...) 
+} + +func createGlossary(sense jmdict.JmdictSense, meta jmdictMetadata) []any { + glossary := []any{} + if needsStructuredContent(sense, meta.language) { + glossary = append(glossary, createGlossaryContent(sense, meta)) + } else { + for _, gloss := range sense.Glossary { + if glossContainsLanguage(gloss, meta.language) { + glossary = append(glossary, gloss.Content) + } + } + } + return glossary +} diff --git a/jmdictHeadword.go b/jmdictHeadword.go new file mode 100644 index 0000000..a1a75cb --- /dev/null +++ b/jmdictHeadword.go @@ -0,0 +1,267 @@ +package yomichan + +import ( + "fmt" + "hash/fnv" + "regexp" + "strconv" + + "foosoft.net/projects/jmdict" + "golang.org/x/exp/slices" +) + +type headword struct { + Expression string + Reading string + TermTags []string + Index int + IsPriority bool + IsIrregular bool + IsOutdated bool + IsRareKanji bool + IsSearchOnly bool + IsAteji bool + IsGikun bool +} + +type hash uint64 + +func (h *headword) Hash() hash { + return hashText(h.Expression + "โž" + h.Reading) +} + +func (h *headword) ExpHash() hash { + return hashText(h.Expression + "โž" + h.Expression) +} + +func (h *headword) ReadingHash() hash { + return hashText(h.Reading + "โž" + h.Reading) +} + +func hashText(s string) hash { + h := fnv.New64a() + h.Write([]byte(s)) + return hash(h.Sum64()) +} + +func (h *headword) IsKanaOnly() bool { + if h.Expression != h.Reading { + return false + } + for _, char := range h.Expression { + if char >= 'ใ' && char <= 'ใƒฟ' { + // hiragana and katakana range + continue + } else if char >= '๏ฝฅ' && char <= '๏พŸ' { + // halfwidth katakana range + continue + } else if char == 'ใ€œ' { + continue + } else { + return false + } + } + return true +} + +func (h *headword) Score() int { + score := 0 + if h.IsPriority { + score += 1 + } + if h.IsIrregular { + score -= 5 + } + if h.IsOutdated { + score -= 5 + } + if h.IsRareKanji { + score -= 5 + } + if h.IsSearchOnly { + score -= 5 + } + return score +} + +func (h *headword) 
ToInternalLink(includeReading bool) any { + if !includeReading || h.Expression == h.Reading { + return contentInternalLink( + contentAttr{lang: ISOtoHTML["jpn"]}, + h.Expression, + ) + } else { + return contentSpan( + contentAttr{lang: ISOtoHTML["jpn"]}, + contentInternalLink(contentAttr{}, h.Expression), + "๏ผˆ", + contentInternalLink(contentAttr{}, h.Reading), + "๏ผ‰", + ) + } +} + +func (h *headword) SetFlags(infoTags, freqTags []string) { + priorityTags := []string{"ichi1", "news1", "gai1", "spec1", "spec2"} + for _, priorityTag := range priorityTags { + if slices.Contains(freqTags, priorityTag) { + h.IsPriority = true + break + } + } + for _, infoTag := range infoTags { + switch infoTag { + case "iK", "ik", "io": + h.IsIrregular = true + case "oK", "ok": + h.IsOutdated = true + case "sK", "sk": + h.IsSearchOnly = true + case "rK": + h.IsRareKanji = true + case "ateji": + h.IsAteji = true + case "gikun": + h.IsGikun = true + } + } + if h.IsOutdated && h.IsRareKanji { + h.IsRareKanji = false + } +} + +func (h *headword) SetTermTags(freqTags []string) { + h.TermTags = []string{} + if h.IsPriority { + h.TermTags = append(h.TermTags, priorityTagName) + } + for _, tag := range freqTags { + isNewsFreqTag, _ := regexp.MatchString(`nf\d\d`, tag) + if isNewsFreqTag { + // nf tags are divided into ranks of 500 + // (nf01 to nf48), but it will be easier + // for the user to read 1k, 2k, etc. 
+ var i int + if _, err := fmt.Sscanf(tag, "nf%2d", &i); err == nil { + i = (i + (i % 2)) / 2 + newsTag := "news" + strconv.Itoa(i) + "k" + h.TermTags = append(h.TermTags, newsTag) + } + } else if tag == "news1" || tag == "news2" { + continue + } else { + tagWithoutTheNumber := tag[:len(tag)-1] // "ichi", "gai", or "spec" + h.TermTags = append(h.TermTags, tagWithoutTheNumber) + } + } + if h.IsIrregular { + h.TermTags = append(h.TermTags, irregularTagName) + } + if h.IsOutdated { + h.TermTags = append(h.TermTags, outdatedTagName) + } + if h.IsRareKanji { + h.TermTags = append(h.TermTags, rareKanjiTagName) + } + if h.IsAteji { + h.TermTags = append(h.TermTags, atejiTagName) + } + if h.IsGikun { + h.TermTags = append(h.TermTags, gikunTagName) + } +} + +func newHeadword(kanji *jmdict.JmdictKanji, reading *jmdict.JmdictReading) headword { + h := headword{} + infoTags := []string{} + freqTags := []string{} + if kanji == nil { + h.Expression = reading.Reading + h.Reading = reading.Reading + infoTags = reading.Information + freqTags = reading.Priorities + } else if reading == nil { + // should only apply to search-only kanji terms + h.Expression = kanji.Expression + h.Reading = "" + infoTags = kanji.Information + freqTags = kanji.Priorities + } else { + h.Expression = kanji.Expression + h.Reading = reading.Reading + infoTags = union(kanji.Information, reading.Information) + freqTags = intersection(kanji.Priorities, reading.Priorities) + } + h.SetFlags(infoTags, freqTags) + h.SetTermTags(freqTags) + return h +} + +func areAllKanjiIrregular(allKanji []jmdict.JmdictKanji) bool { + // If every kanji form is rare or irregular, then we'll make + // kana-only headwords for each kana form. 
+ if len(allKanji) == 0 { + return false + } + for _, kanji := range allKanji { + h := newHeadword(&kanji, nil) + kanjiIsIrregular := h.IsRareKanji || h.IsIrregular || h.IsOutdated || h.IsSearchOnly + if !kanjiIsIrregular { + return false + } + } + return true +} + +func extractHeadwords(entry jmdict.JmdictEntry) []headword { + headwords := []headword{} + allKanjiAreIrregular := areAllKanjiIrregular(entry.Kanji) + + if allKanjiAreIrregular { + // Adding the reading-only terms before kanji+reading + // terms here for the sake of the Index property, + // which affects the yomichan term ranking. + for _, reading := range entry.Readings { + h := newHeadword(nil, &reading) + h.Index = len(headwords) + headwords = append(headwords, h) + } + } + + for _, kanji := range entry.Kanji { + if slices.Contains(kanji.Information, "sK") { + // Search-only kanji forms do not have associated readings. + h := newHeadword(&kanji, nil) + h.Index = len(headwords) + headwords = append(headwords, h) + continue + } + for _, reading := range entry.Readings { + if reading.NoKanji != nil { + continue + } else if slices.Contains(reading.Information, "sk") { + // Search-only kana forms do not have associated kanji forms. 
+ continue + } else if reading.Restrictions != nil && !slices.Contains(reading.Restrictions, kanji.Expression) { + continue + } else { + h := newHeadword(&kanji, &reading) + h.Index = len(headwords) + headwords = append(headwords, h) + } + } + } + + if !allKanjiAreIrregular { + noKanjiInEntry := (len(entry.Kanji) == 0) + for _, reading := range entry.Readings { + if reading.NoKanji != nil || noKanjiInEntry || slices.Contains(reading.Information, "sk") { + h := newHeadword(nil, &reading) + h.Index = len(headwords) + headwords = append(headwords, h) + } + } + } + + return headwords +} diff --git a/jmdictMetadata.go b/jmdictMetadata.go new file mode 100644 index 0000000..ec92827 --- /dev/null +++ b/jmdictMetadata.go @@ -0,0 +1,158 @@ +package yomichan + +import ( + "strings" + + "foosoft.net/projects/jmdict" + "golang.org/x/exp/slices" +) + +type sequence = int + +type jmdictMetadata struct { + language string + condensedGlosses map[senseID]string + seqToSenseCount map[sequence]int + seqToMainHeadword map[sequence]headword + expHashToReadings map[hash][]string + headwordHashToSeqs map[hash][]sequence + references []string + referenceToSeq map[string]sequence + hashToSearchValues map[hash][]searchValue + seqToSearchHashes map[sequence][]searchHash + hasMultipleForms map[sequence]bool + maxSenseCount int +} + +type senseID struct { + sequence sequence + number int +} + +func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEntry) { + + // Determine how many senses are in this entry for this language + if _, ok := meta.seqToSenseCount[entry.Sequence]; !ok { + senseCount := 0 + for _, entrySense := range entry.Sense { + for _, gloss := range entrySense.Glossary { + if glossContainsLanguage(gloss, meta.language) { + senseCount += 1 + break + } + } + } + meta.seqToSenseCount[entry.Sequence] = senseCount + } + + if meta.seqToSenseCount[entry.Sequence] == 0 { + return + } + + // main headwords (first ones that are found in entries). 
+ if _, ok := meta.seqToMainHeadword[entry.Sequence]; !ok { + meta.seqToMainHeadword[entry.Sequence] = headword + } + + // hash the term pair so we can determine if it's used + // in more than one JMdict entry later. + headwordHash := headword.Hash() + if !slices.Contains(meta.headwordHashToSeqs[headwordHash], entry.Sequence) { + meta.headwordHashToSeqs[headwordHash] = append(meta.headwordHashToSeqs[headwordHash], entry.Sequence) + } + + // hash the expression so that we can determine if we + // need to disambiguate it by displaying its reading + // in reference notes later. + expHash := headword.ExpHash() + if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) { + meta.expHashToReadings[expHash] = append(meta.expHashToReadings[expHash], headword.Reading) + } + + // e.g. for JMdict (English) we expect to end up with + // seqToHashedHeadwords[1260670] == ใ€ๅ…ƒใƒปใ‚‚ใจใ€‘ใ€ใ€ๅ…ƒใƒปๅ…ƒใ€‘ใ€ใ€ใ‚‚ใจใƒปใ‚‚ใจใ€‘ใ€ใ€ๆœฌใƒปใ‚‚ใจใ€‘ใ€ใ€ๆœฌใƒปๆœฌใ€‘ใ€ใ€็ด ใƒปใ‚‚ใจใ€‘ใ€ใ€็ด ใƒป็ด ใ€‘ใ€ใ€ๅŸบใƒปใ‚‚ใจใ€‘ใ€ใ€ๅŸบใƒปๅŸบใ€‘ + // used for correlating references to sequence numbers later. 
+ searchHashes := []searchHash{ + searchHash{headwordHash, headword.IsPriority}, + searchHash{expHash, headword.IsPriority}, + searchHash{headword.ReadingHash(), headword.IsPriority}, + } + for _, x := range searchHashes { + if !slices.Contains(meta.seqToSearchHashes[entry.Sequence], x) { + meta.seqToSearchHashes[entry.Sequence] = append(meta.seqToSearchHashes[entry.Sequence], x) + } + } + + currentSenseNumber := 1 + for _, entrySense := range entry.Sense { + if !glossaryContainsLanguage(entrySense.Glossary, meta.language) { + continue + } + if entrySense.RestrictedReadings != nil && !slices.Contains(entrySense.RestrictedReadings, headword.Reading) { + currentSenseNumber += 1 + continue + } + if entrySense.RestrictedKanji != nil && !slices.Contains(entrySense.RestrictedKanji, headword.Expression) { + currentSenseNumber += 1 + continue + } + + allReferences := append(entrySense.References, entrySense.Antonyms...) + for _, reference := range allReferences { + meta.references = append(meta.references, reference) + } + + currentSense := senseID{entry.Sequence, currentSenseNumber} + if meta.condensedGlosses[currentSense] == "" { + glosses := []string{} + for _, gloss := range entrySense.Glossary { + if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil { + glosses = append(glosses, gloss.Content) + } + } + meta.condensedGlosses[currentSense] = strings.Join(glosses, "; ") + } + currentSenseNumber += 1 + } +} + +func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata { + meta := jmdictMetadata{ + language: langNameToCode[languageName], + seqToSenseCount: make(map[sequence]int), + condensedGlosses: make(map[senseID]string), + seqToMainHeadword: make(map[sequence]headword), + expHashToReadings: make(map[hash][]string), + seqToSearchHashes: make(map[sequence][]searchHash), + headwordHashToSeqs: make(map[hash][]sequence), + references: []string{}, + hashToSearchValues: nil, + referenceToSeq: nil, + hasMultipleForms: 
make(map[sequence]bool), + maxSenseCount: 0, + } + + for _, entry := range dictionary.Entries { + headwords := extractHeadwords(entry) + formCount := 0 + for _, headword := range headwords { + meta.AddHeadword(headword, entry) + if !headword.IsSearchOnly { + formCount += 1 + } + } + meta.hasMultipleForms[entry.Sequence] = (formCount > 1) + } + + // this correlation process will be unnecessary once JMdict + // includes sequence numbers in its cross-reference data + meta.MakeReferenceToSeqMap() + + for _, senseCount := range meta.seqToSenseCount { + if meta.maxSenseCount < senseCount { + meta.maxSenseCount = senseCount + } + } + + return meta +} diff --git a/jmdictReferences.go b/jmdictReferences.go new file mode 100644 index 0000000..71a7501 --- /dev/null +++ b/jmdictReferences.go @@ -0,0 +1,166 @@ +package yomichan + +import ( + "fmt" + "strconv" + "strings" +) + +/* + * In the future, JMdict will be updated to include sequence numbers + * with each cross reference. At that time, most of the functions and + * types defined in this file will become unnecessary. see: + * https://www.edrdg.org/jmdict_edict_list/2022/msg00008.html + */ + +type searchValue struct { + sequence sequence + index int + isPriority bool +} + +type searchHash struct { + hash hash + isPriority bool +} + +func parseReference(reference string) (headword, int, bool) { + // Reference strings in JMDict currently consist of 3 parts at + // most, separated by ใƒป characters. The latter two parts are + // optional. When the sense number is not specified, it is + // implied to be the first sense. 
+ var h headword + var senseNumber int + ok := true + refParts := strings.Split(reference, "ใƒป") + if len(refParts) == 1 { + // (Kanji) or (Reading) + h = headword{Expression: refParts[0], Reading: refParts[0]} + senseNumber = 1 + } else if len(refParts) == 2 { + // [Kanji + (Reading or Sense)] or (Reading + Sense) + val, err := strconv.Atoi(refParts[1]) + if err == nil { + h = headword{Expression: refParts[0], Reading: refParts[0]} + senseNumber = val + } else { + h = headword{Expression: refParts[0], Reading: refParts[1]} + senseNumber = 1 + } + } else if len(refParts) == 3 { + // Expression + Reading + Sense + h = headword{Expression: refParts[0], Reading: refParts[1]} + val, err := strconv.Atoi(strings.TrimSpace(refParts[2])) + if err == nil { + senseNumber = val + } else { + errortext := "Unexpected format (3rd part not integer) for x-ref \"" + reference + "\"" + fmt.Println(errortext) + ok = false + } + } else { + errortext := "Unexpected format for x-ref \"" + reference + "\"" + fmt.Println(errortext) + ok = false + } + return h, senseNumber, ok +} + +func (meta *jmdictMetadata) MakeReferenceToSeqMap() { + + meta.referenceToSeq = make(map[string]sequence) + meta.MakeHashToSearchValuesMap() + + for _, reference := range meta.references { + if meta.referenceToSeq[reference] != 0 { + continue + } + seq := meta.FindBestSequence(reference) + if seq != 0 { + meta.referenceToSeq[reference] = seq + } else { + fmt.Println("Unable to convert reference to sequence number: `" + reference + "`") + } + } +} + +func (meta *jmdictMetadata) MakeHashToSearchValuesMap() { + meta.hashToSearchValues = make(map[hash][]searchValue) + for seq, searchHashes := range meta.seqToSearchHashes { + for score, searchHash := range searchHashes { + searchValue := searchValue{ + sequence: seq, + index: score, + isPriority: searchHash.isPriority, + } + meta.hashToSearchValues[searchHash.hash] = + append(meta.hashToSearchValues[searchHash.hash], searchValue) + } + } +} + +/* + * Generally, 
correspondence is determined by the order in which term + * pairs are extracted from each JMdict entry. Take for example the + * JMdict entry for ご本, which contains a reference to 本 (without a + * reading specified). To correlate this reference with a sequence + * number, our program searches each entry for the hash of【本・本】. + * There are two entries in which it is found in JMdict (English): + * + * sequence 1260670: 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】 + * sequence 1522150: 【本・ほん】、【本・本】、【ほん・ほん】 + * + * Because 【本・本】 is closer to the beginning of the array in the + * latter (i.e., has the lowest index), sequence number 1522150 is + * returned. + * + * In situations in which multiple sequences are found with the same + * score, the entry with a priority tag ("news1", "ichi1", "spec1", + * "spec2", "gai1") is given preference. This mostly affects + * katakana-only loanwords like ラグ. + * + * To improve accuracy, this method also checks to see if the + * reference's specified sense number really exists in the + * corresponding entry. For example, sequence 1582850 【如何で・いかんで】 + * has a reference to sense #2 of いかん (no kanji specified), which + * could belong to 13 different sequences. However, sequences 1582850 + * and 2829697 are the only 2 of those 13 which contain more than one + * sense. Incidentally, sequence 1582850 is the correct match. + * + * All else being equal, the entry with the smallest sequence number + * is chosen. References in the JMdict file are currently ambiguous, + * and getting this perfect won't be possible until sequence numbers + * are explicitly identified in these references.
See: + * https://github.com/JMdictProject/JMdictIssues/issues/61 + */ +func (meta *jmdictMetadata) FindBestSequence(reference string) sequence { + bestSeq := 0 + lowestIndex := 100000 + bestIsPriority := false + headword, senseNumber, ok := parseReference(reference) + if !ok { + return bestSeq + } + hash := headword.Hash() + for _, seqScore := range meta.hashToSearchValues[hash] { + if meta.seqToSenseCount[seqScore.sequence] < senseNumber { + // entry must contain the specified sense + continue + } else if lowestIndex < seqScore.index { + // lower indices are better + continue + } else if (lowestIndex == seqScore.index) && (bestIsPriority && !seqScore.isPriority) { + // if scores match, check priority + continue + } else if (lowestIndex == seqScore.index) && (bestIsPriority == seqScore.isPriority) && (bestSeq < seqScore.sequence) { + // if scores and priority match, check sequence number. + // lower sequence numbers are better + continue + } else { + lowestIndex = seqScore.index + bestSeq = seqScore.sequence + bestIsPriority = seqScore.isPriority + } + } + return bestSeq +} diff --git a/jmdictTags.go b/jmdictTags.go new file mode 100644 index 0000000..b444c47 --- /dev/null +++ b/jmdictTags.go @@ -0,0 +1,348 @@ +package yomichan + +import ( + "fmt" + "strconv" + + "golang.org/x/exp/slices" +) + +func senseNumberTags(maxSenseCount int) []dbTag { + tags := []dbTag{} + for i := 1; i <= maxSenseCount; i++ { + tag := dbTag{ + Name: strconv.Itoa(i), + Order: -10, // these tags will appear on the left side + Notes: "JMdict Sense #" + strconv.Itoa(i), + } + tags = append(tags, tag) + } + return tags +} + +func newsFrequencyTags() []dbTag { + // 24,000 ranks divided into 24 tags, news1k ... 
news24k + tags := []dbTag{} + for i := 1; i <= 24; i++ { + tagName := "news" + strconv.Itoa(i) + "k" + var startRank string + if i == 1 { + startRank = "1" + } else { + // technically should be ",001", but that looks odd + startRank = strconv.Itoa(i-1) + ",000" + } + endRank := strconv.Itoa(i) + ",000" + tag := dbTag{ + Name: tagName, + Order: -2, + Score: 0, + Category: "frequent", + Notes: "ranked between the top " + startRank + " and " + endRank + " words in a frequency analysis of the Mainichi Shimbun (1990s)", + } + tags = append(tags, tag) + } + return tags +} + +func entityTags(entities map[string]string) []dbTag { + tags := knownEntityTags() + for name, notes := range entities { + idx := slices.IndexFunc(tags, func(t dbTag) bool { return t.Name == name }) + if idx != -1 { + tags[idx].Notes = notes + } else { + fmt.Println("Unknown tag type \"" + name + "\": " + notes) + unknownTag := dbTag{Name: name, Notes: notes} + tags = append(tags, unknownTag) + } + } + return tags +} + +func customDbTags() []dbTag { + return []dbTag{ + dbTag{Name: priorityTagName, Order: -10, Score: 10, Category: "popular", Notes: "high priority term"}, + dbTag{Name: rareKanjiTagName, Order: 0, Score: -5, Category: "archaism", Notes: "rarely-used kanji form of this expression"}, + dbTag{Name: irregularTagName, Order: 0, Score: -5, Category: "archaism", Notes: "irregular form of this expression"}, + dbTag{Name: outdatedTagName, Order: 0, Score: -5, Category: "archaism", Notes: "outdated form of this expression"}, + dbTag{Name: "ichi", Order: -2, Score: 0, Category: "frequent", Notes: "included in Ichimango Goi Bunruishuu (๏ผ‘ไธ‡่ชž่ชžๅฝ™ๅˆ†้กž้›†)"}, + dbTag{Name: "spec", Order: -2, Score: 0, Category: "frequent", Notes: "specified as common by JMdict editors"}, + dbTag{Name: "gai", Order: -2, Score: 0, Category: "frequent", Notes: "common loanword (gairaigoใƒปๅค–ๆฅ่ชž)"}, + dbTag{Name: "forms", Order: 0, Score: 0, Category: "", Notes: "other surface forms and readings"}, + } +} + 
+func knownEntityTags() []dbTag { + return []dbTag{ + // see: https://www.edrdg.org/jmdictdb/cgi-bin/edhelp.py?svc=jmdict&sid=#kwabbr + // additional descriptions at the beginning of the JMdict file + + // reading info + dbTag{Name: "gikun", Order: 0, Score: 0, Category: ""}, // gikun (meaning as reading) or jukujikun (special kanji reading) + dbTag{Name: "ik", Order: 0, Score: -5, Category: ""}, // word containing irregular kana usage + dbTag{Name: "ok", Order: 0, Score: -5, Category: ""}, // out-dated or obsolete kana usage + dbTag{Name: "sk", Order: 0, Score: -5, Category: ""}, // search-only kana form + + // kanji info + /* kanji info also has a "ik" entity that would go here if not already for the re_inf tag */ + dbTag{Name: "ateji", Order: 0, Score: 0, Category: ""}, // ateji (phonetic) reading + dbTag{Name: "iK", Order: 0, Score: -5, Category: ""}, // word containing irregular kanji usage + dbTag{Name: "io", Order: 0, Score: -5, Category: ""}, // irregular okurigana usage + dbTag{Name: "oK", Order: 0, Score: -5, Category: ""}, // word containing out-dated kanji or kanji usage + dbTag{Name: "rK", Order: 0, Score: -5, Category: ""}, // rarely-used kanji form + dbTag{Name: "sK", Order: 0, Score: -5, Category: ""}, // search-only kanji form + + // miscellaneous sense info + dbTag{Name: "abbr", Order: 0, Score: 0, Category: ""}, // abbreviation + dbTag{Name: "arch", Order: -4, Score: 0, Category: "archaism"}, // archaism + dbTag{Name: "char", Order: 0, Score: 0, Category: ""}, // character + dbTag{Name: "chn", Order: 0, Score: 0, Category: ""}, // children's language + dbTag{Name: "col", Order: 0, Score: 0, Category: ""}, // colloquialism + dbTag{Name: "company", Order: 0, Score: 0, Category: ""}, // company name + dbTag{Name: "creat", Order: 0, Score: 0, Category: ""}, // creature + dbTag{Name: "dated", Order: -4, Score: 0, Category: "archaism"}, // dated term + dbTag{Name: "dei", Order: 0, Score: 0, Category: ""}, // deity + dbTag{Name: "derog", Order: 0, 
Score: 0, Category: ""}, // derogatory + dbTag{Name: "doc", Order: 0, Score: 0, Category: ""}, // document + dbTag{Name: "euph", Order: 0, Score: 0, Category: ""}, // euphemistic + dbTag{Name: "ev", Order: 0, Score: 0, Category: ""}, // event + dbTag{Name: "fam", Order: 0, Score: 0, Category: ""}, // familiar language + dbTag{Name: "fem", Order: 0, Score: 0, Category: ""}, // female term or language + dbTag{Name: "fict", Order: 0, Score: 0, Category: ""}, // fiction + dbTag{Name: "form", Order: 0, Score: 0, Category: ""}, // formal or literary term + dbTag{Name: "given", Order: 0, Score: 0, Category: ""}, // given name or forename, gender not specified + dbTag{Name: "group", Order: 0, Score: 0, Category: ""}, // group + dbTag{Name: "hist", Order: 0, Score: 0, Category: ""}, // historical term + dbTag{Name: "hon", Order: 0, Score: 0, Category: ""}, // honorific or respectful (sonkeigo) language + dbTag{Name: "hum", Order: 0, Score: 0, Category: ""}, // humble (kenjougo) language + dbTag{Name: "id", Order: -5, Score: 0, Category: "expression"}, // idiomatic expression + dbTag{Name: "joc", Order: 0, Score: 0, Category: ""}, // jocular, humorous term + dbTag{Name: "leg", Order: 0, Score: 0, Category: ""}, // legend + dbTag{Name: "m-sl", Order: 0, Score: 0, Category: ""}, // manga slang + dbTag{Name: "male", Order: 0, Score: 0, Category: ""}, // male term or language + dbTag{Name: "myth", Order: 0, Score: 0, Category: ""}, // mythology + dbTag{Name: "net-sl", Order: 0, Score: 0, Category: ""}, // Internet slang + dbTag{Name: "obj", Order: 0, Score: 0, Category: ""}, // object + dbTag{Name: "obs", Order: -4, Score: 0, Category: "archaism"}, // obsolete term + dbTag{Name: "on-mim", Order: 0, Score: 0, Category: ""}, // onomatopoeic or mimetic word + dbTag{Name: "organization", Order: 0, Score: 0, Category: ""}, // organization name + dbTag{Name: "oth", Order: 0, Score: 0, Category: ""}, // other + dbTag{Name: "person", Order: 0, Score: 0, Category: ""}, // full name of a 
particular person + dbTag{Name: "place", Order: 0, Score: 0, Category: ""}, // place name + dbTag{Name: "poet", Order: 0, Score: 0, Category: ""}, // poetical term + dbTag{Name: "pol", Order: 0, Score: 0, Category: ""}, // polite (teineigo) language + dbTag{Name: "product", Order: 0, Score: 0, Category: ""}, // product name + dbTag{Name: "proverb", Order: 0, Score: 0, Category: "expression"}, // proverb + dbTag{Name: "quote", Order: 0, Score: 0, Category: "expression"}, // quotation + dbTag{Name: "rare", Order: -4, Score: 0, Category: "archaism"}, // rare + dbTag{Name: "relig", Order: 0, Score: 0, Category: ""}, // religion + dbTag{Name: "sens", Order: 0, Score: 0, Category: ""}, // sensitive + dbTag{Name: "serv", Order: 0, Score: 0, Category: ""}, // service + dbTag{Name: "ship", Order: 0, Score: 0, Category: ""}, // ship name + dbTag{Name: "sl", Order: 0, Score: 0, Category: ""}, // slang + dbTag{Name: "station", Order: 0, Score: 0, Category: ""}, // railway station + dbTag{Name: "surname", Order: 0, Score: 0, Category: ""}, // family or surname + dbTag{Name: "uk", Order: 0, Score: 0, Category: ""}, // word usually written using kana alone + dbTag{Name: "unclass", Order: 0, Score: 0, Category: ""}, // unclassified name + dbTag{Name: "vulg", Order: 0, Score: 0, Category: ""}, // vulgar expression or word + dbTag{Name: "work", Order: 0, Score: 0, Category: ""}, // work of art, literature, music, etc. 
name + dbTag{Name: "X", Order: 0, Score: 0, Category: ""}, // rude or X-rated term (not displayed in educational software) + dbTag{Name: "yoji", Order: 0, Score: 0, Category: ""}, // yojijukugo + + // part-of-speech info + dbTag{Name: "adj-f", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun or verb acting prenominally + dbTag{Name: "adj-i", Order: -3, Score: 0, Category: "partOfSpeech"}, // adjective (keiyoushi) + dbTag{Name: "adj-ix", Order: -3, Score: 0, Category: "partOfSpeech"}, // adjective (keiyoushi) - yoi/ii class + dbTag{Name: "adj-kari", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'kari' adjective (archaic) + dbTag{Name: "adj-ku", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'ku' adjective (archaic) + dbTag{Name: "adj-na", Order: -3, Score: 0, Category: "partOfSpeech"}, // adjectival nouns or quasi-adjectives (keiyodoshi) + dbTag{Name: "adj-nari", Order: -3, Score: 0, Category: "partOfSpeech"}, // archaic/formal form of na-adjective + dbTag{Name: "adj-no", Order: -3, Score: 0, Category: "partOfSpeech"}, // nouns which may take the genitive case particle 'no' + dbTag{Name: "adj-pn", Order: -3, Score: 0, Category: "partOfSpeech"}, // pre-noun adjectival (rentaishi) + dbTag{Name: "adj-shiku", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'shiku' adjective (archaic) + dbTag{Name: "adj-t", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'taru' adjective + dbTag{Name: "adv", Order: -3, Score: 0, Category: "partOfSpeech"}, // adverb (fukushi) + dbTag{Name: "adv-to", Order: -3, Score: 0, Category: "partOfSpeech"}, // adverb taking the 'to' particle + dbTag{Name: "aux", Order: -3, Score: 0, Category: "partOfSpeech"}, // auxiliary + dbTag{Name: "aux-adj", Order: -3, Score: 0, Category: "partOfSpeech"}, // auxiliary adjective + dbTag{Name: "aux-v", Order: -3, Score: 0, Category: "partOfSpeech"}, // auxiliary verb + dbTag{Name: "conj", Order: -3, Score: 0, Category: "partOfSpeech"}, // conjunction + dbTag{Name: "cop", Order: -3, Score: 0, 
Category: "partOfSpeech"}, // copula + dbTag{Name: "ctr", Order: -3, Score: 0, Category: "partOfSpeech"}, // counter + dbTag{Name: "exp", Order: -5, Score: 0, Category: "expression"}, // expressions (phrases, clauses, etc.) + dbTag{Name: "int", Order: -3, Score: 0, Category: "partOfSpeech"}, // interjection (kandoushi) + dbTag{Name: "n", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun (common) (futsuumeishi) + dbTag{Name: "n-adv", Order: -3, Score: 0, Category: "partOfSpeech"}, // adverbial noun (fukushitekimeishi) + dbTag{Name: "n-pr", Order: -3, Score: 0, Category: "partOfSpeech"}, // proper noun + dbTag{Name: "n-pref", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun, used as a prefix + dbTag{Name: "n-suf", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun, used as a suffix + dbTag{Name: "n-t", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun (temporal) (jisoumeishi) + dbTag{Name: "num", Order: -3, Score: 0, Category: "partOfSpeech"}, // numeric + dbTag{Name: "pn", Order: -3, Score: 0, Category: "partOfSpeech"}, // pronoun + dbTag{Name: "pref", Order: -3, Score: 0, Category: "partOfSpeech"}, // prefix + dbTag{Name: "prt", Order: -3, Score: 0, Category: "partOfSpeech"}, // particle + dbTag{Name: "suf", Order: -3, Score: 0, Category: "partOfSpeech"}, // suffix + dbTag{Name: "unc", Order: -3, Score: 0, Category: "partOfSpeech"}, // unclassified + dbTag{Name: "v-unspec", Order: -3, Score: 0, Category: "partOfSpeech"}, // verb unspecified + dbTag{Name: "v1", Order: -3, Score: 0, Category: "partOfSpeech"}, // Ichidan verb + dbTag{Name: "v1-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Ichidan verb - kureru special class + dbTag{Name: "v2a-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb with 'u' ending (archaic) + dbTag{Name: "v2b-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'bu' ending (archaic) + dbTag{Name: "v2b-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan 
verb (lower class) with 'bu' ending (archaic) + dbTag{Name: "v2d-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'dzu' ending (archaic) + dbTag{Name: "v2d-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'dzu' ending (archaic) + dbTag{Name: "v2g-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'gu' ending (archaic) + dbTag{Name: "v2g-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'gu' ending (archaic) + dbTag{Name: "v2h-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'hu/fu' ending (archaic) + dbTag{Name: "v2h-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'hu/fu' ending (archaic) + dbTag{Name: "v2k-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'ku' ending (archaic) + dbTag{Name: "v2k-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'ku' ending (archaic) + dbTag{Name: "v2m-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'mu' ending (archaic) + dbTag{Name: "v2m-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'mu' ending (archaic) + dbTag{Name: "v2n-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'nu' ending (archaic) + dbTag{Name: "v2r-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'ru' ending (archaic) + dbTag{Name: "v2r-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'ru' ending (archaic) + dbTag{Name: "v2s-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'su' ending (archaic) + dbTag{Name: "v2t-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'tsu' ending (archaic) + dbTag{Name: "v2t-s", Order: -3, Score: 0, Category: 
"partOfSpeech"}, // Nidan verb (lower class) with 'tsu' ending (archaic) + dbTag{Name: "v2w-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'u' ending and 'we' conjugation (archaic) + dbTag{Name: "v2y-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'yu' ending (archaic) + dbTag{Name: "v2y-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'yu' ending (archaic) + dbTag{Name: "v2z-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'zu' ending (archaic) + dbTag{Name: "v4b", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'bu' ending (archaic) + dbTag{Name: "v4g", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'gu' ending (archaic) + dbTag{Name: "v4h", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'hu/fu' ending (archaic) + dbTag{Name: "v4k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'ku' ending (archaic) + dbTag{Name: "v4m", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'mu' ending (archaic) + dbTag{Name: "v4n", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'nu' ending (archaic) + dbTag{Name: "v4r", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'ru' ending (archaic) + dbTag{Name: "v4s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'su' ending (archaic) + dbTag{Name: "v4t", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'tsu' ending (archaic) + dbTag{Name: "v5aru", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb - -aru special class + dbTag{Name: "v5b", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'bu' ending + dbTag{Name: "v5g", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'gu' ending + dbTag{Name: "v5k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'ku' ending + 
dbTag{Name: "v5k-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb - Iku/Yuku special class + dbTag{Name: "v5m", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'mu' ending + dbTag{Name: "v5n", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'nu' ending + dbTag{Name: "v5r", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'ru' ending + dbTag{Name: "v5r-i", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'ru' ending (irregular verb) + dbTag{Name: "v5s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'su' ending + dbTag{Name: "v5t", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'tsu' ending + dbTag{Name: "v5u", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'u' ending + dbTag{Name: "v5u-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'u' ending (special class) + dbTag{Name: "v5uru", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb - Uru old class verb (old form of Eru) + dbTag{Name: "vi", Order: -3, Score: 0, Category: "partOfSpeech"}, // intransitive verb + dbTag{Name: "vk", Order: -3, Score: 0, Category: "partOfSpeech"}, // Kuru verb - special class + dbTag{Name: "vn", Order: -3, Score: 0, Category: "partOfSpeech"}, // irregular nu verb + dbTag{Name: "vr", Order: -3, Score: 0, Category: "partOfSpeech"}, // irregular ru verb, plain form ends with -ri + dbTag{Name: "vs", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun or participle which takes the aux. 
verb suru + dbTag{Name: "vs-c", Order: -3, Score: 0, Category: "partOfSpeech"}, // su verb - precursor to the modern suru + dbTag{Name: "vs-i", Order: -3, Score: 0, Category: "partOfSpeech"}, // suru verb - included + dbTag{Name: "vs-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // suru verb - special class + dbTag{Name: "vt", Order: -3, Score: 0, Category: "partOfSpeech"}, // transitive verb + dbTag{Name: "vz", Order: -3, Score: 0, Category: "partOfSpeech"}, // Ichidan verb - zuru verb (alternative form of -jiru verbs) + + // usage domain + dbTag{Name: "agric", Order: 0, Score: 0, Category: ""}, // agriculture + dbTag{Name: "anat", Order: 0, Score: 0, Category: ""}, // anatomy + dbTag{Name: "archeol", Order: 0, Score: 0, Category: ""}, // archeology + dbTag{Name: "archit", Order: 0, Score: 0, Category: ""}, // architecture + dbTag{Name: "art", Order: 0, Score: 0, Category: ""}, // art, aesthetics + dbTag{Name: "astron", Order: 0, Score: 0, Category: ""}, // astronomy + dbTag{Name: "audvid", Order: 0, Score: 0, Category: ""}, // audiovisual + dbTag{Name: "aviat", Order: 0, Score: 0, Category: ""}, // aviation + dbTag{Name: "baseb", Order: 0, Score: 0, Category: ""}, // baseball + dbTag{Name: "biochem", Order: 0, Score: 0, Category: ""}, // biochemistry + dbTag{Name: "biol", Order: 0, Score: 0, Category: ""}, // biology + dbTag{Name: "bot", Order: 0, Score: 0, Category: ""}, // botany + dbTag{Name: "Buddh", Order: 0, Score: 0, Category: ""}, // Buddhism + dbTag{Name: "bus", Order: 0, Score: 0, Category: ""}, // business + dbTag{Name: "cards", Order: 0, Score: 0, Category: ""}, // card games + dbTag{Name: "chem", Order: 0, Score: 0, Category: ""}, // chemistry + dbTag{Name: "Christn", Order: 0, Score: 0, Category: ""}, // Christianity + dbTag{Name: "cloth", Order: 0, Score: 0, Category: ""}, // clothing + dbTag{Name: "comp", Order: 0, Score: 0, Category: ""}, // computing + dbTag{Name: "cryst", Order: 0, Score: 0, Category: ""}, // crystallography + dbTag{Name: 
"dent", Order: 0, Score: 0, Category: ""}, // dentistry + dbTag{Name: "ecol", Order: 0, Score: 0, Category: ""}, // ecology + dbTag{Name: "econ", Order: 0, Score: 0, Category: ""}, // economics + dbTag{Name: "elec", Order: 0, Score: 0, Category: ""}, // electricity, elec. eng. + dbTag{Name: "electr", Order: 0, Score: 0, Category: ""}, // electronics + dbTag{Name: "embryo", Order: 0, Score: 0, Category: ""}, // embryology + dbTag{Name: "engr", Order: 0, Score: 0, Category: ""}, // engineering + dbTag{Name: "ent", Order: 0, Score: 0, Category: ""}, // entomology + dbTag{Name: "film", Order: 0, Score: 0, Category: ""}, // film + dbTag{Name: "finc", Order: 0, Score: 0, Category: ""}, // finance + dbTag{Name: "fish", Order: 0, Score: 0, Category: ""}, // fishing + dbTag{Name: "food", Order: 0, Score: 0, Category: ""}, // food, cooking + dbTag{Name: "gardn", Order: 0, Score: 0, Category: ""}, // gardening, horticulture + dbTag{Name: "genet", Order: 0, Score: 0, Category: ""}, // genetics + dbTag{Name: "geogr", Order: 0, Score: 0, Category: ""}, // geography + dbTag{Name: "geol", Order: 0, Score: 0, Category: ""}, // geology + dbTag{Name: "geom", Order: 0, Score: 0, Category: ""}, // geometry + dbTag{Name: "go", Order: 0, Score: 0, Category: ""}, // go (game) + dbTag{Name: "golf", Order: 0, Score: 0, Category: ""}, // golf + dbTag{Name: "gramm", Order: 0, Score: 0, Category: ""}, // grammar + dbTag{Name: "grmyth", Order: 0, Score: 0, Category: ""}, // Greek mythology + dbTag{Name: "hanaf", Order: 0, Score: 0, Category: ""}, // hanafuda + dbTag{Name: "horse", Order: 0, Score: 0, Category: ""}, // horse racing + dbTag{Name: "kabuki", Order: 0, Score: 0, Category: ""}, // kabuki + dbTag{Name: "law", Order: 0, Score: 0, Category: ""}, // law + dbTag{Name: "ling", Order: 0, Score: 0, Category: ""}, // linguistics + dbTag{Name: "logic", Order: 0, Score: 0, Category: ""}, // logic + dbTag{Name: "MA", Order: 0, Score: 0, Category: ""}, // martial arts + dbTag{Name: "mahj", Order: 
0, Score: 0, Category: ""}, // mahjong + dbTag{Name: "manga", Order: 0, Score: 0, Category: ""}, // manga + dbTag{Name: "math", Order: 0, Score: 0, Category: ""}, // mathematics + dbTag{Name: "mech", Order: 0, Score: 0, Category: ""}, // mechanical engineering + dbTag{Name: "med", Order: 0, Score: 0, Category: ""}, // medicine + dbTag{Name: "met", Order: 0, Score: 0, Category: ""}, // meteorology + dbTag{Name: "mil", Order: 0, Score: 0, Category: ""}, // military + dbTag{Name: "mining", Order: 0, Score: 0, Category: ""}, // mining + dbTag{Name: "music", Order: 0, Score: 0, Category: ""}, // music + dbTag{Name: "noh", Order: 0, Score: 0, Category: ""}, // noh + dbTag{Name: "ornith", Order: 0, Score: 0, Category: ""}, // ornithology + dbTag{Name: "paleo", Order: 0, Score: 0, Category: ""}, // paleontology + dbTag{Name: "pathol", Order: 0, Score: 0, Category: ""}, // pathology + dbTag{Name: "pharm", Order: 0, Score: 0, Category: ""}, // pharmacy + dbTag{Name: "phil", Order: 0, Score: 0, Category: ""}, // philosophy + dbTag{Name: "photo", Order: 0, Score: 0, Category: ""}, // photography + dbTag{Name: "physics", Order: 0, Score: 0, Category: ""}, // physics + dbTag{Name: "physiol", Order: 0, Score: 0, Category: ""}, // physiology + dbTag{Name: "politics", Order: 0, Score: 0, Category: ""}, // politics + dbTag{Name: "print", Order: 0, Score: 0, Category: ""}, // printing + dbTag{Name: "psy", Order: 0, Score: 0, Category: ""}, // psychiatry + dbTag{Name: "psyanal", Order: 0, Score: 0, Category: ""}, // psychoanalysis + dbTag{Name: "psych", Order: 0, Score: 0, Category: ""}, // psychology + dbTag{Name: "rail", Order: 0, Score: 0, Category: ""}, // railway + dbTag{Name: "rommyth", Order: 0, Score: 0, Category: ""}, // Roman mythology + dbTag{Name: "Shinto", Order: 0, Score: 0, Category: ""}, // Shinto + dbTag{Name: "shogi", Order: 0, Score: 0, Category: ""}, // shogi + dbTag{Name: "ski", Order: 0, Score: 0, Category: ""}, // skiing + dbTag{Name: "sports", Order: 0, Score: 
0, Category: ""}, // sports + dbTag{Name: "stat", Order: 0, Score: 0, Category: ""}, // statistics + dbTag{Name: "stockm", Order: 0, Score: 0, Category: ""}, // stock market + dbTag{Name: "sumo", Order: 0, Score: 0, Category: ""}, // sumo + dbTag{Name: "telec", Order: 0, Score: 0, Category: ""}, // telecommunications + dbTag{Name: "tradem", Order: 0, Score: 0, Category: ""}, // trademark + dbTag{Name: "tv", Order: 0, Score: 0, Category: ""}, // television + dbTag{Name: "vidg", Order: 0, Score: 0, Category: ""}, // video games + dbTag{Name: "zool", Order: 0, Score: 0, Category: ""}, // zoology + + // dialect + dbTag{Name: "bra", Order: 0, Score: 0, Category: ""}, // Brazilian + dbTag{Name: "hob", Order: 0, Score: 0, Category: ""}, // Hokkaido-ben + dbTag{Name: "ksb", Order: 0, Score: 0, Category: ""}, // Kansai-ben + dbTag{Name: "ktb", Order: 0, Score: 0, Category: ""}, // Kantou-ben + dbTag{Name: "kyb", Order: 0, Score: 0, Category: ""}, // Kyoto-ben + dbTag{Name: "kyu", Order: 0, Score: 0, Category: ""}, // Kyuushuu-ben + dbTag{Name: "nab", Order: 0, Score: 0, Category: ""}, // Nagano-ben + dbTag{Name: "osb", Order: 0, Score: 0, Category: ""}, // Osaka-ben + dbTag{Name: "rkb", Order: 0, Score: 0, Category: ""}, // Ryuukyuu-ben + dbTag{Name: "thb", Order: 0, Score: 0, Category: ""}, // Touhoku-ben + dbTag{Name: "tsb", Order: 0, Score: 0, Category: ""}, // Tosa-ben + dbTag{Name: "tsug", Order: 0, Score: 0, Category: ""}, // Tsugaru-ben + } +} diff --git a/structuredContent.go b/structuredContent.go new file mode 100644 index 0000000..ded8229 --- /dev/null +++ b/structuredContent.go @@ -0,0 +1,192 @@ +package yomichan + +type contentAttr struct { + lang string + fontStyle string // normal, italic + fontWeight string // normal, bold + fontSize string // small, medium, large, smaller, 80%, 125%, etc. 
+ textDecorationLine []string // underline, overline, line-through + verticalAlign string // baseline, sub, super, text-top, text-bottom, middle, top, bottom + textAlign string // start, end, left, right, center, justify, justify-all, match-parent + marginTop int + marginLeft int + marginRight int + marginBottom int + listStyleType string + data map[string]string +} + +// if the array contains adjacent strings, concatenate them. +// ex: ["one", "two", content_structure, "four"] -> ["onetwo", content_structure, "four"] +// if the array only contains strings, return a concatenated string. +// ex: ["one", "two"] -> "onetwo" +func contentReduce(contents []any) any { + if len(contents) == 1 { + return contents[0] + } + newContents := []any{} + var accumulator string + for _, content := range contents { + switch v := content.(type) { + case string: + accumulator = accumulator + v + default: + if accumulator != "" { + newContents = append(newContents, accumulator) + accumulator = "" + } + newContents = append(newContents, content) + } + } + if accumulator != "" { + newContents = append(newContents, accumulator) + } + if len(newContents) == 1 { + return newContents[0] + } else { + return newContents + } +} + +func contentStructure(contents ...any) map[string]any { + return map[string]any{ + "type": "structured-content", + "content": contentReduce(contents), + } +} + +func contentRuby(attr contentAttr, ruby string, contents ...any) map[string]any { + rubyContent := map[string]any{ + "tag": "ruby", + "content": []any{ + contentReduce(contents), + map[string]string{"tag": "rp", "content": "("}, + map[string]string{"tag": "rt", "content": ruby}, + map[string]string{"tag": "rp", "content": ")"}, + }, + } + if attr.lang != "" { + rubyContent["lang"] = attr.lang + } + if len(attr.data) != 0 { + rubyContent["data"] = attr.data + } + return rubyContent +} + +func contentInternalLink(attr contentAttr, query string, contents ...any) map[string]any { + linkContent := map[string]any{ + 
"tag": "a", + "href": "?query=" + query + "&wildcards=off", + } + if len(contents) == 0 { + linkContent["content"] = query + } else { + linkContent["content"] = contentReduce(contents) + } + if attr.lang != "" { + linkContent["lang"] = attr.lang + } + if len(attr.data) != 0 { + linkContent["data"] = attr.data + } + return linkContent +} + +func contentSpan(attr contentAttr, contents ...any) map[string]any { + return contentStyledContainer(attr, "span", contents...) +} + +func contentDiv(attr contentAttr, contents ...any) map[string]any { + return contentStyledContainer(attr, "div", contents...) +} + +func contentListItem(attr contentAttr, contents ...any) map[string]any { + return contentStyledContainer(attr, "li", contents...) +} + +func contentOrderedList(attr contentAttr, contents ...any) map[string]any { + return contentStyledContainer(attr, "ol", contents...) +} + +func contentUnorderedList(attr contentAttr, contents ...any) map[string]any { + return contentStyledContainer(attr, "ul", contents...) +} + +func contentTable(attr contentAttr, contents ...any) map[string]any { + return contentStyledContainer(attr, "table", contents...) +} + +func contentTableHead(attr contentAttr, contents ...any) map[string]any { + return contentStyledContainer(attr, "thead", contents...) +} + +func contentTableBody(attr contentAttr, contents ...any) map[string]any { + return contentStyledContainer(attr, "tbody", contents...) +} + +func contentTableRow(attr contentAttr, contents ...any) map[string]any { + return contentStyledContainer(attr, "tr", contents...) +} + +func contentTableHeadCell(attr contentAttr, contents ...any) map[string]any { + return contentStyledContainer(attr, "th", contents...) +} + +func contentTableCell(attr contentAttr, contents ...any) map[string]any { + return contentStyledContainer(attr, "td", contents...) 
+} + +func contentStyledContainer(attr contentAttr, tag string, contents ...any) map[string]any { + container := map[string]any{"tag": tag} + container["content"] = contentReduce(contents) + if attr.lang != "" { + container["lang"] = attr.lang + } + if len(attr.data) != 0 { + container["data"] = attr.data + } + style := contentStyle(attr) + if len(style) != 0 { + container["style"] = style + } + return container +} + +func contentStyle(attr contentAttr) map[string]any { + style := make(map[string]any) + if attr.fontStyle != "" { + style["fontStyle"] = attr.fontStyle + } + if attr.fontWeight != "" { + style["fontWeight"] = attr.fontWeight + } + if attr.fontSize != "" { + style["fontSize"] = attr.fontSize + } + if len(attr.textDecorationLine) != 0 { + style["textDecorationLine"] = attr.textDecorationLine + } + if attr.verticalAlign != "" { + style["verticalAlign"] = attr.verticalAlign + } + if attr.textAlign != "" { + style["textAlign"] = attr.textAlign + } + if attr.marginTop != 0 { + style["marginTop"] = attr.marginTop + } + if attr.marginLeft != 0 { + style["marginLeft"] = attr.marginLeft + } + if attr.marginRight != 0 { + style["marginRight"] = attr.marginRight + } + if attr.marginBottom != 0 { + style["marginBottom"] = attr.marginBottom + } + if attr.listStyleType != "" { + style["listStyleType"] = attr.listStyleType + } + return style +} From 972dc6c4e99f21adb8e64e691da9604a23ffb33d Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sun, 22 Jan 2023 14:40:39 -0600 Subject: [PATCH 05/19] Update dictionary build script --- scripts/build_dicts.sh | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/scripts/build_dicts.sh b/scripts/build_dicts.sh index b91d9a5..764d1d0 100755 --- a/scripts/build_dicts.sh +++ b/scripts/build_dicts.sh @@ -5,13 +5,23 @@ go get foosoft.net/projects/yomichan-import/yomichan mkdir -p src mkdir -p dst -if [ ! 
-f src/JMdict ]; then - wget http://ftp.monash.edu/pub/nihongo/JMdict.gz - gunzip -c JMdict.gz > src/JMdict -fi +function refresh_source () { + NOW=$(date '+%s') + YESTERDAY=$((NOW - 86400)) # 86,400 seconds in 24 hours + if [ ! -f "src/$1" ]; then + wget "ftp.edrdg.org/pub/Nihongo/$1.gz" + gunzip -c "$1.gz" > "src/$1" + elif [[ $YESTERDAY -gt $(date -r "src/$1" '+%s') ]]; then + rsync "ftp.edrdg.org::nihongo/$1" "src/$1" + fi +} +refresh_source "JMdict_e_examp" +yomichan -language="english" -title="JMdict" src/JMdict_e_examp dst/jmdict_english_with_examples.zip + +refresh_source "JMdict" +yomichan -language="english" -title="JMdict" src/JMdict dst/jmdict_english.zip yomichan -language="dutch" -title="JMdict (Dutch)" src/JMdict dst/jmdict_dutch.zip -yomichan -language="english" -title="JMdict (English)" src/JMdict dst/jmdict_english.zip yomichan -language="french" -title="JMdict (French)" src/JMdict dst/jmdict_french.zip yomichan -language="german" -title="JMdict (German)" src/JMdict dst/jmdict_german.zip yomichan -language="hungarian" -title="JMdict (Hungarian)" src/JMdict dst/jmdict_hungarian.zip @@ -20,19 +30,13 @@ yomichan -language="slovenian" -title="JMdict (Slovenian)" src/JMdict dst/jmdict yomichan -language="spanish" -title="JMdict (Spanish)" src/JMdict dst/jmdict_spanish.zip yomichan -language="swedish" -title="JMdict (Swedish)" src/JMdict dst/jmdict_swedish.zip -if [ ! -f src/JMnedict.xml ]; then - wget http://ftp.monash.edu/pub/nihongo/JMnedict.xml.gz - gunzip -c JMnedict.xml.gz > src/JMnedict.xml -fi +yomichan -format="forms" -title="JMdict Forms" src/JMdict dst/jmdict_forms.zip +refresh_source "JMnedict.xml" yomichan src/JMnedict.xml dst/jmnedict.zip -if [ ! 
-f src/kanjidic2.xml ]; then - wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz - gunzip -c kanjidic2.xml.gz > src/kanjidic2.xml -fi - -yomichan -language="english" -title="KANJIDIC (English)" src/kanjidic2.xml dst/kanjidic_english.zip +refresh_source "kanjidic2.xml" +yomichan -language="english" -title="KANJIDIC" src/kanjidic2.xml dst/kanjidic_english.zip yomichan -language="french" -title="KANJIDIC (French)" src/kanjidic2.xml dst/kanjidic_french.zip yomichan -language="portuguese" -title="KANJIDIC (Portuguese)" src/kanjidic2.xml dst/kanjidic_portuguese.zip yomichan -language="spanish" -title="KANJIDIC (Spanish)" src/kanjidic2.xml dst/kanjidic_spanish.zip From 8451803bfd2e6f516e1c55464c96cc209f3336d6 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sun, 22 Jan 2023 15:00:13 -0600 Subject: [PATCH 06/19] Update copyright --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index f13e263..3901c0e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2016-2022 Alex Yatskov +Copyright 2016-2023 Yomichan-Import Authors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in From d8a3b420ee8bb56f8c025d4173c8b64588884dcd Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sun, 22 Jan 2023 17:55:27 -0600 Subject: [PATCH 07/19] Exclude "search" and "forms" terms from non-English dictionaries This allows a user to install the English version and another version without cluttering their setup with duplicated information. If a user doesn't want to use the English version, they can get the "search" and "forms" terms by installing the separate jmdict_forms file. 
--- jmdict.go | 10 +++++++--- jmdictForms.go | 12 ++++++++---- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/jmdict.go b/jmdict.go index 74809e7..6de8877 100644 --- a/jmdict.go +++ b/jmdict.go @@ -134,8 +134,12 @@ func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetada return nil, false } if headword.IsSearchOnly { - searchTerm := createSearchTerm(headword, entry, meta) - return []dbTerm{searchTerm}, true + if meta.language == "eng" { + searchTerm := createSearchTerm(headword, entry, meta) + return []dbTerm{searchTerm}, true + } else { + return nil, false + } } terms := []dbTerm{} senseNumber := 1 @@ -156,7 +160,7 @@ func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetada terms = append(terms, senseTerm) } - if meta.hasMultipleForms[entry.Sequence] { + if meta.hasMultipleForms[entry.Sequence] && meta.language == "eng" { formsTerm := createFormsTerm(headword, entry, meta) terms = append(terms, formsTerm) } diff --git a/jmdictForms.go b/jmdictForms.go index 76eba34..15b894d 100644 --- a/jmdictForms.go +++ b/jmdictForms.go @@ -210,17 +210,21 @@ func formsExportDb(inputPath, outputPath, languageName, title string, stride int return err } + meta := newJmdictMetadata(dictionary, languageName) + terms := dbTermList{} for _, entry := range dictionary.Entries { baseTerm := baseFormsTerm(entry) headwords := extractHeadwords(entry) for _, h := range headwords { - term := baseTerm + var term dbTerm if h.IsSearchOnly { - term.Sequence = -term.Sequence + term = createSearchTerm(h, entry, meta) + } else { + term = baseTerm + term.Expression = h.Expression + term.Reading = h.Reading } - term.Expression = h.Expression - term.Reading = h.Reading terms = append(terms, term) } } From 6726c5245b0d6b1bd2d5a98bbf025d4feb055353 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Mon, 23 Jan 2023 14:09:50 -0600 Subject: [PATCH 08/19] Rename variables for consistency --- jmdictReferences.go | 34 +++++++++++++++++++--------------- 
1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/jmdictReferences.go b/jmdictReferences.go index 71a7501..aa5d229 100644 --- a/jmdictReferences.go +++ b/jmdictReferences.go @@ -87,10 +87,10 @@ func (meta *jmdictMetadata) MakeReferenceToSeqMap() { func (meta *jmdictMetadata) MakeHashToSearchValuesMap() { meta.hashToSearchValues = make(map[hash][]searchValue) for seq, searchHashes := range meta.seqToSearchHashes { - for score, searchHash := range searchHashes { + for idx, searchHash := range searchHashes { searchValue := searchValue{ sequence: seq, - index: score, + index: idx, isPriority: searchHash.isPriority, } meta.hashToSearchValues[searchHash.hash] = @@ -100,6 +100,10 @@ func (meta *jmdictMetadata) MakeHashToSearchValuesMap() { } /* + * This function attempts to convert a JMdict reference string into a + * single definite sequence number. These reference strings are often + * ambiguous, so we have to resort to using heuristics. + * * Generally, correspondence is determined by the order in which term * pairs are extracted from each JMdict entry. Take for example the * JMdict entry for ご本, which contains a reference to 本 (without a @@ -115,7 +119,7 @@ func (meta *jmdictMetadata) MakeHashToSearchValuesMap() { * returned. * * In situations in which multiple sequences are found with the same - * score, the entry with a priority tag ("news1", "ichi1", "spec1", + * index, the entry with a priority tag ("news1", "ichi1", "spec1", * "spec2", "gai1") is given preference. This mostly affects * katakana-only loanwords like ラグ. * @@ -129,8 +133,8 @@ func (meta *jmdictMetadata) MakeHashToSearchValuesMap() { * * All else being equal, the entry with the smallest sequence number * is chosen. References in the JMdict file are currently ambiguous, - * and getting this perfect won't be possible until sequence numbers - * are explictly identified in these references. 
See: + * and getting this perfect won't be possible until reference sequence + * numbers are included in the file. See: * https://github.com/JMdictProject/JMdictIssues/issues/61 */ func (meta *jmdictMetadata) FindBestSequence(reference string) sequence { @@ -142,24 +146,24 @@ func (meta *jmdictMetadata) FindBestSequence(reference string) sequence { return bestSeq } hash := headword.Hash() - for _, seqScore := range meta.hashToSearchValues[hash] { - if meta.seqToSenseCount[seqScore.sequence] < senseNumber { + for _, v := range meta.hashToSearchValues[hash] { + if meta.seqToSenseCount[v.sequence] < senseNumber { // entry must contain the specified sense continue - } else if lowestIndex < seqScore.index { + } else if lowestIndex < v.index { // lower indices are better continue - } else if (lowestIndex == seqScore.index) && (bestIsPriority && !seqScore.isPriority) { - // if scores match, check priority + } else if (lowestIndex == v.index) && (bestIsPriority && !v.isPriority) { + // if indices match, check priority continue - } else if (lowestIndex == seqScore.index) && (bestIsPriority == seqScore.isPriority) && (bestSeq < seqScore.sequence) { - // if scores and priority match, check sequence number. + } else if (lowestIndex == v.index) && (bestIsPriority == v.isPriority) && (bestSeq < v.sequence) { + // if indices and priority match, check sequence number. 
// lower sequence numbers are better continue } else { - lowestIndex = seqScore.index - bestSeq = seqScore.sequence - bestIsPriority = seqScore.isPriority + lowestIndex = v.index + bestSeq = v.sequence + bestIsPriority = v.isPriority } } return bestSeq From d606f729cfbdf2a5f4eb5e5a7903b9854097f359 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Mon, 23 Jan 2023 14:13:22 -0600 Subject: [PATCH 09/19] Use secondary frequency tags in term score calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a term has a frequency tag, it should return higher in search results than a match which does not have a tag. For example, a search for 素性 should return すじょう rather than そせい, because the former has a "news" frequency tag. --- jmdictHeadword.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/jmdictHeadword.go b/jmdictHeadword.go index a1a75cb..4ead44c 100644 --- a/jmdictHeadword.go +++ b/jmdictHeadword.go @@ -16,6 +16,7 @@ type headword struct { TermTags []string Index int IsPriority bool + IsFrequent bool IsIrregular bool IsOutdated bool IsRareKanji bool @@ -69,6 +70,9 @@ func (h *headword) Score() int { if h.IsPriority { score += 1 } + if h.IsFrequent { + score += 1 + } if h.IsIrregular { score -= 5 } @@ -109,6 +113,9 @@ func (h *headword) SetFlags(infoTags, freqTags []string) { break } } + if len(freqTags) > 1 { + h.IsFrequent = true + } for _, infoTag := range infoTags { switch infoTag { case "iK", "ik", "io": From ef1e74447d51854826ee81be7d5a28783f16dff2 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Mon, 23 Jan 2023 23:52:42 -0600 Subject: [PATCH 10/19] Include term tags and scores in standalone forms dictionary --- jmdictForms.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/jmdictForms.go b/jmdictForms.go index 15b894d..def7f7a 100644 --- a/jmdictForms.go +++ b/jmdictForms.go @@ -81,7 +81,7 @@ func (h *headword) KanjiForm() string { } } -func 
jmdNeedsFormTable(headwords []headword) bool { +func needsFormTable(headwords []headword) bool { // Does the entry contain more than 1 distinct reading? // E.g. バカがい and ばかがい are not distinct. uniqueReading := "" @@ -186,7 +186,7 @@ func formsGlossary(headwords []headword) []any { func baseFormsTerm(entry jmdict.JmdictEntry) dbTerm { term := dbTerm{Sequence: entry.Sequence} headwords := extractHeadwords(entry) - if jmdNeedsFormTable(headwords) { + if needsFormTable(headwords) { term.Glossary = formsTableGlossary(headwords) } else { term.Glossary = formsGlossary(headwords) @@ -224,6 +224,8 @@ func formsExportDb(inputPath, outputPath, languageName, title string, stride int term = baseTerm term.Expression = h.Expression term.Reading = h.Reading + term.addTermTags(h.TermTags...) + term.Score = calculateTermScore(0, h) } terms = append(terms, term) } From 96358e3eb548d6706b00a203cd5aa126ad622b35 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Tue, 24 Jan 2023 08:55:24 -0600 Subject: [PATCH 11/19] Fix function parameter Sense numbers start at 1, not 0 --- jmdict.go | 2 +- jmdictForms.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jmdict.go b/jmdict.go index 6de8877..7283907 100644 --- a/jmdict.go +++ b/jmdict.go @@ -85,7 +85,7 @@ func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMe term.addRules(rules...) } term.addTermTags(headword.TermTags...) - term.Score = calculateTermScore(0, headword) + term.Score = calculateTermScore(1, headword) redirectHeadword := meta.seqToMainHeadword[entry.Sequence] expHash := redirectHeadword.ExpHash() diff --git a/jmdictForms.go b/jmdictForms.go index def7f7a..032291d 100644 --- a/jmdictForms.go +++ b/jmdictForms.go @@ -225,7 +225,7 @@ func formsExportDb(inputPath, outputPath, languageName, title string, stride int term.Expression = h.Expression term.Reading = h.Reading term.addTermTags(h.TermTags...) 
- term.Score = calculateTermScore(0, h) + term.Score = calculateTermScore(1, h) } terms = append(terms, term) } From 406067eeddf50c1a105234bd4ff34594d2ed90fe Mon Sep 17 00:00:00 2001 From: stephenmk Date: Tue, 24 Jan 2023 13:02:50 -0600 Subject: [PATCH 12/19] Include entity tags in standalone forms dictionary --- jmdictForms.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/jmdictForms.go b/jmdictForms.go index 032291d..9d21ac4 100644 --- a/jmdictForms.go +++ b/jmdictForms.go @@ -205,7 +205,7 @@ func formsExportDb(inputPath, outputPath, languageName, title string, stride int } defer reader.Close() - dictionary, _, err := jmdict.LoadJmdictNoTransform(reader) + dictionary, entities, err := jmdict.LoadJmdictNoTransform(reader) if err != nil { return err } @@ -231,13 +231,18 @@ func formsExportDb(inputPath, outputPath, languageName, title string, stride int } } + tags := dbTagList{} + tags = append(tags, entityTags(entities)...) + tags = append(tags, newsFrequencyTags()...) + tags = append(tags, customDbTags()...) + if title == "" { title = "JMdict Forms" } recordData := map[string]dbRecordList{ "term": terms.crush(), - "tag": dbRecordList{}, + "tag": tags.crush(), } jmdictDate := jmdictPublicationDate(dictionary) From 7bd967915c4597c38a93d79cb89a5f305ddbedca Mon Sep 17 00:00:00 2001 From: stephenmk Date: Wed, 25 Jan 2023 18:26:47 -0600 Subject: [PATCH 13/19] Add "forms" term in special circumstances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a headword appears in multiple entries, then each entry needs a corresponding "forms" term in the output dictionary. For example, 軽卒 is the only headword in entry 2275730, but 軽卒 also appears as an irregular form in entry 1252910. If a "forms" term is not included for the former entry, then it will appear that 軽卒 is irregular for all senses in the output dictionary. 
--- jmdict.go | 59 ++++++++++++++++++++++++++++++++++---------------- jmdictForms.go | 19 ++++++++-------- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/jmdict.go b/jmdict.go index 7283907..362b14d 100644 --- a/jmdict.go +++ b/jmdict.go @@ -62,7 +62,21 @@ func jmdictPublicationDate(dictionary jmdict.Jmdict) string { return jmdictDate } -func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm { +func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) { + // Don't add "forms" terms to non-English dictionaries. + // Information would be duplicated if users installed more + // than one version. + if meta.language != "eng" { + return dbTerm{}, false + } + // Don't need a "forms" term for entries with one unique + // headword which does not appear in any other entries. + if !meta.hasMultipleForms[entry.Sequence] { + if len(meta.headwordHashToSeqs[headword.Hash()]) == 1 { + return dbTerm{}, false + } + } + term := baseFormsTerm(entry) term.Expression = headword.Expression term.Reading = headword.Reading @@ -72,10 +86,17 @@ func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMet term.addDefinitionTags("forms") senseNumber := meta.seqToSenseCount[entry.Sequence] + 1 term.Score = calculateTermScore(senseNumber, headword) - return term + return term, true } -func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm { +func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) { + // Don't add "search" terms to non-English dictionaries. + // Information would be duplicated if users installed more + // than one version. 
+ if meta.language != "eng" { + return dbTerm{}, false + } + term := dbTerm{ Expression: headword.Expression, Sequence: -entry.Sequence, @@ -98,10 +119,17 @@ func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMe ) term.Glossary = []any{contentStructure(content)} - return term + return term, true } -func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm { +func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) { + if sense.RestrictedReadings != nil && !slices.Contains(sense.RestrictedReadings, headword.Reading) { + return dbTerm{}, false + } + if sense.RestrictedKanji != nil && !slices.Contains(sense.RestrictedKanji, headword.Expression) { + return dbTerm{}, false + } + term := dbTerm{ Expression: headword.Expression, Reading: headword.Reading, @@ -126,7 +154,7 @@ func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headwor term.Score = calculateTermScore(senseNumber, headword) - return term + return term, true } func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) ([]dbTerm, bool) { @@ -134,8 +162,7 @@ func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetada return nil, false } if headword.IsSearchOnly { - if meta.language == "eng" { - searchTerm := createSearchTerm(headword, entry, meta) + if searchTerm, ok := createSearchTerm(headword, entry, meta); ok { return []dbTerm{searchTerm}, true } else { return nil, false @@ -145,25 +172,19 @@ func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetada senseNumber := 1 for _, sense := range entry.Sense { if !glossaryContainsLanguage(sense.Glossary, meta.language) { + // Do not increment sense number continue } - if sense.RestrictedReadings != nil && !slices.Contains(sense.RestrictedReadings, headword.Reading) { - senseNumber += 1 - 
continue + if senseTerm, ok := createSenseTerm(sense, senseNumber, headword, entry, meta); ok { + terms = append(terms, senseTerm) } - if sense.RestrictedKanji != nil && !slices.Contains(sense.RestrictedKanji, headword.Expression) { - senseNumber += 1 - continue - } - senseTerm := createSenseTerm(sense, senseNumber, headword, entry, meta) senseNumber += 1 - terms = append(terms, senseTerm) } - if meta.hasMultipleForms[entry.Sequence] && meta.language == "eng" { - formsTerm := createFormsTerm(headword, entry, meta) + if formsTerm, ok := createFormsTerm(headword, entry, meta); ok { terms = append(terms, formsTerm) } + return terms, true } diff --git a/jmdictForms.go b/jmdictForms.go index 9d21ac4..af4bba6 100644 --- a/jmdictForms.go +++ b/jmdictForms.go @@ -210,23 +210,24 @@ func formsExportDb(inputPath, outputPath, languageName, title string, stride int return err } - meta := newJmdictMetadata(dictionary, languageName) + meta := newJmdictMetadata(dictionary, "english") terms := dbTermList{} for _, entry := range dictionary.Entries { baseTerm := baseFormsTerm(entry) headwords := extractHeadwords(entry) for _, h := range headwords { - var term dbTerm if h.IsSearchOnly { - term = createSearchTerm(h, entry, meta) - } else { - term = baseTerm - term.Expression = h.Expression - term.Reading = h.Reading - term.addTermTags(h.TermTags...) - term.Score = calculateTermScore(1, h) + if term, ok := createSearchTerm(h, entry, meta); ok { + terms = append(terms, term) + } + continue } + term := baseTerm + term.Expression = h.Expression + term.Reading = h.Reading + term.addTermTags(h.TermTags...) 
+ term.Score = calculateTermScore(1, h) terms = append(terms, term) } } From 517ef3d052541731b0821cd32248c028278a29af Mon Sep 17 00:00:00 2001 From: stephenmk Date: Fri, 27 Jan 2023 19:09:12 -0600 Subject: [PATCH 14/19] Fix bug in term score assignments This commit ensures that terms are grouped among their entries of origin and displayed in correct sequential order in Yomichan's default result grouping mode, "Group term-reading pairs." --- jmdict.go | 16 ++++++++++------ jmdictForms.go | 2 +- jmdictMetadata.go | 23 +++++++++++++++++++++++ 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/jmdict.go b/jmdict.go index 362b14d..4c54db6 100644 --- a/jmdict.go +++ b/jmdict.go @@ -29,13 +29,15 @@ func grammarRules(partsOfSpeech []string) []string { return rules } -func calculateTermScore(senseNumber int, headword headword) int { +func calculateTermScore(senseNumber int, depth int, headword headword) int { const senseWeight int = 1 - const entryPositionWeight int = 100 - const priorityWeight int = 10000 + const depthWeight int = 100 + const entryPositionWeight int = 10000 + const priorityWeight int = 1000000 score := 0 score -= (senseNumber - 1) * senseWeight + score -= depth * depthWeight score -= headword.Index * entryPositionWeight score += headword.Score() * priorityWeight @@ -85,7 +87,8 @@ func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMet term.addDefinitionTags("forms") senseNumber := meta.seqToSenseCount[entry.Sequence] + 1 - term.Score = calculateTermScore(senseNumber, headword) + entryDepth := meta.entryDepth[entry.Sequence] + term.Score = calculateTermScore(senseNumber, entryDepth, headword) return term, true } @@ -106,7 +109,7 @@ func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMe term.addRules(rules...) } term.addTermTags(headword.TermTags...) 
- term.Score = calculateTermScore(1, headword) + term.Score = calculateTermScore(1, 0, headword) redirectHeadword := meta.seqToMainHeadword[entry.Sequence] expHash := redirectHeadword.ExpHash() @@ -152,7 +155,8 @@ func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headwor rules := grammarRules(sense.PartsOfSpeech) term.addRules(rules...) - term.Score = calculateTermScore(senseNumber, headword) + entryDepth := meta.entryDepth[entry.Sequence] + term.Score = calculateTermScore(senseNumber, entryDepth, headword) return term, true } diff --git a/jmdictForms.go b/jmdictForms.go index af4bba6..4964233 100644 --- a/jmdictForms.go +++ b/jmdictForms.go @@ -227,7 +227,7 @@ func formsExportDb(inputPath, outputPath, languageName, title string, stride int term.Expression = h.Expression term.Reading = h.Reading term.addTermTags(h.TermTags...) - term.Score = calculateTermScore(1, h) + term.Score = calculateTermScore(1, 0, h) terms = append(terms, term) } } diff --git a/jmdictMetadata.go b/jmdictMetadata.go index ec92827..99af862 100644 --- a/jmdictMetadata.go +++ b/jmdictMetadata.go @@ -20,6 +20,7 @@ type jmdictMetadata struct { referenceToSeq map[string]sequence hashToSearchValues map[hash][]searchValue seqToSearchHashes map[sequence][]searchHash + entryDepth map[sequence]int hasMultipleForms map[sequence]bool maxSenseCount int } @@ -29,6 +30,26 @@ type senseID struct { number int } +func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, entrySequence sequence) { + // This is to ensure that terms are grouped among their + // entries of origin and displayed in correct sequential order + maxDepth := 0 + for _, headword := range headwords { + hash := headword.Hash() + for _, seq := range meta.headwordHashToSeqs[hash] { + seqDepth := meta.entryDepth[seq] + if seqDepth == 0 { + meta.entryDepth[seq] = 1 + seqDepth = 1 + } + if maxDepth < seqDepth+1 { + maxDepth = seqDepth + 1 + } + } + } + meta.entryDepth[entrySequence] = maxDepth +} + func (meta 
*jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEntry) { // Determine how many senses are in this entry for this language @@ -128,6 +149,7 @@ func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMeta references: []string{}, hashToSearchValues: nil, referenceToSeq: nil, + entryDepth: make(map[sequence]int), hasMultipleForms: make(map[sequence]bool), maxSenseCount: 0, } @@ -141,6 +163,7 @@ func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMeta formCount += 1 } } + meta.CalculateEntryDepth(headwords, entry.Sequence) meta.hasMultipleForms[entry.Sequence] = (formCount > 1) } From 184dd45dbcd9350b2556442d129120bf31e60cb1 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sat, 28 Jan 2023 18:17:06 -0600 Subject: [PATCH 15/19] Use snake_case in filenames --- jmdictConstants.go => jmdict_constants.go | 0 jmdictForms.go => jmdict_forms.go | 0 jmdictGlossary.go => jmdict_glossary.go | 0 jmdictHeadword.go => jmdict_headword.go | 0 jmdictMetadata.go => jmdict_metadata.go | 0 jmdictReferences.go => jmdict_references.go | 0 jmdictTags.go => jmdict_tags.go | 0 structuredContent.go => structured_content.go | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename jmdictConstants.go => jmdict_constants.go (100%) rename jmdictForms.go => jmdict_forms.go (100%) rename jmdictGlossary.go => jmdict_glossary.go (100%) rename jmdictHeadword.go => jmdict_headword.go (100%) rename jmdictMetadata.go => jmdict_metadata.go (100%) rename jmdictReferences.go => jmdict_references.go (100%) rename jmdictTags.go => jmdict_tags.go (100%) rename structuredContent.go => structured_content.go (100%) diff --git a/jmdictConstants.go b/jmdict_constants.go similarity index 100% rename from jmdictConstants.go rename to jmdict_constants.go diff --git a/jmdictForms.go b/jmdict_forms.go similarity index 100% rename from jmdictForms.go rename to jmdict_forms.go diff --git a/jmdictGlossary.go b/jmdict_glossary.go similarity index 100% rename from 
jmdictGlossary.go rename to jmdict_glossary.go diff --git a/jmdictHeadword.go b/jmdict_headword.go similarity index 100% rename from jmdictHeadword.go rename to jmdict_headword.go diff --git a/jmdictMetadata.go b/jmdict_metadata.go similarity index 100% rename from jmdictMetadata.go rename to jmdict_metadata.go diff --git a/jmdictReferences.go b/jmdict_references.go similarity index 100% rename from jmdictReferences.go rename to jmdict_references.go diff --git a/jmdictTags.go b/jmdict_tags.go similarity index 100% rename from jmdictTags.go rename to jmdict_tags.go diff --git a/structuredContent.go b/structured_content.go similarity index 100% rename from structuredContent.go rename to structured_content.go From abbe18314537935e1680af14d73657085543c249 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sat, 28 Jan 2023 18:39:08 -0600 Subject: [PATCH 16/19] Simplify logic for `index.json` struct --- common.go | 1 + enamdict.go | 2 -- epwing.go | 9 +++------ frequency.go | 9 +++------ jmdict.go | 1 - jmdict_forms.go | 1 - kanjidic.go | 2 -- rikai.go | 9 +++------ 8 files changed, 10 insertions(+), 24 deletions(-) diff --git a/common.go b/common.go index 9d6b2aa..613a255 100644 --- a/common.go +++ b/common.go @@ -214,6 +214,7 @@ func writeDb(outputPath string, index dbIndex, recordData map[string]dbRecordLis } } + index.setDefaults() bytes, err := marshalJSON(index, pretty) if err != nil { return err diff --git a/enamdict.go b/enamdict.go index e0c1cb0..78b886d 100644 --- a/enamdict.go +++ b/enamdict.go @@ -105,10 +105,8 @@ func jmnedictExportDb(inputPath, outputPath, language, title string, stride int, Title: title, Revision: "jmnedict1", Sequenced: true, - Description: "", Attribution: edrdgAttribution, } - index.setDefaults() return writeDb( outputPath, diff --git a/epwing.go b/epwing.go index 83b54b8..c7b2136 100644 --- a/epwing.go +++ b/epwing.go @@ -102,13 +102,10 @@ func epwingExportDb(inputPath, outputPath, language, title string, stride int, p } index := dbIndex{ 
- Title: title, - Revision: strings.Join(revisions, ";"), - Sequenced: true, - Description: "", - Attribution: "", + Title: title, + Revision: strings.Join(revisions, ";"), + Sequenced: true, } - index.setDefaults() return writeDb( outputPath, diff --git a/frequency.go b/frequency.go index 5d9f06a..310856c 100644 --- a/frequency.go +++ b/frequency.go @@ -56,13 +56,10 @@ func frequncyExportDb(inputPath, outputPath, language, title string, stride int, } index := dbIndex{ - Title: title, - Revision: "frequency1", - Sequenced: false, - Description: "", - Attribution: "", + Title: title, + Revision: "frequency1", + Sequenced: false, } - index.setDefaults() return writeDb( outputPath, diff --git a/jmdict.go b/jmdict.go index 4c54db6..746f7a0 100644 --- a/jmdict.go +++ b/jmdict.go @@ -238,7 +238,6 @@ func jmdExportDb(inputPath string, outputPath string, languageName string, title Sequenced: true, Attribution: edrdgAttribution, } - index.setDefaults() return writeDb( outputPath, diff --git a/jmdict_forms.go b/jmdict_forms.go index 4964233..59df010 100644 --- a/jmdict_forms.go +++ b/jmdict_forms.go @@ -254,7 +254,6 @@ func formsExportDb(inputPath, outputPath, languageName, title string, stride int Sequenced: true, Attribution: edrdgAttribution, } - index.setDefaults() return writeDb( outputPath, diff --git a/kanjidic.go b/kanjidic.go index e1c42d9..5474aed 100644 --- a/kanjidic.go +++ b/kanjidic.go @@ -163,10 +163,8 @@ func kanjidicExportDb(inputPath, outputPath, language, title string, stride int, Title: title, Revision: "kanjidic2", Sequenced: false, - Description: "", Attribution: edrdgAttribution, } - index.setDefaults() return writeDb( outputPath, diff --git a/rikai.go b/rikai.go index f3b6b12..bfc5307 100644 --- a/rikai.go +++ b/rikai.go @@ -153,13 +153,10 @@ func rikaiExportDb(inputPath, outputPath, language, title string, stride int, pr } index := dbIndex{ - Title: title, - Revision: "rikai2", - Sequenced: true, - Description: "", - Attribution: "", + Title: title, 
+ Revision: "rikai2", + Sequenced: true, } - index.setDefaults() return writeDb( outputPath, From 8b4b8999599d18766910edaa37f08ca9807fad26 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sun, 29 Jan 2023 14:06:50 -0600 Subject: [PATCH 17/19] Hide new JMdict structured content features behind "extra" option Require `-language=english_extra` to produce the complete version of the new JMdict dictionary file. If and when we determine that all the new features are ready to be included in the dictionary by default, we can remove this logic. --- jmdict.go | 13 +++++++++++-- jmdict_constants.go | 23 ++++++++++++----------- jmdict_forms.go | 2 +- jmdict_glossary.go | 2 +- jmdict_metadata.go | 2 ++ scripts/build_dicts.sh | 5 +++-- 6 files changed, 30 insertions(+), 17 deletions(-) diff --git a/jmdict.go b/jmdict.go index 746f7a0..ceb5835 100644 --- a/jmdict.go +++ b/jmdict.go @@ -1,6 +1,7 @@ package yomichan import ( + "errors" "os" "regexp" "strconv" @@ -48,7 +49,11 @@ func doDisplaySenseNumberTag(headword headword, entry jmdict.JmdictEntry, meta j // Display sense numbers if the entry has more than one sense // or if the headword is found in multiple entries. hash := headword.Hash() - if meta.seqToSenseCount[entry.Sequence] > 1 { + if !meta.extraMode { + return false + } else if meta.language != "eng" { + return false + } else if meta.seqToSenseCount[entry.Sequence] > 1 { return true } else if len(meta.headwordHashToSeqs[hash]) > 1 { return true @@ -68,7 +73,7 @@ func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMet // Don't add "forms" terms to non-English dictionaries. // Information would be duplicated if users installed more // than one version. 
- if meta.language != "eng" { + if meta.language != "eng" || !meta.extraMode { return dbTerm{}, false } // Don't need a "forms" term for entries with one unique @@ -193,6 +198,10 @@ func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetada } func jmdExportDb(inputPath string, outputPath string, languageName string, title string, stride int, pretty bool) error { + if _, ok := langNameToCode[languageName]; !ok { + return errors.New("Unrecognized language parameter: " + languageName) + } + reader, err := os.Open(inputPath) if err != nil { return err diff --git a/jmdict_constants.go b/jmdict_constants.go index 1d49194..cb74233 100644 --- a/jmdict_constants.go +++ b/jmdict_constants.go @@ -42,17 +42,18 @@ var ISOtoFlag = map[string]string{ } var langNameToCode = map[string]string{ - "": "eng", - "english": "eng", - "dutch": "dut", - "french": "fre", - "german": "ger", - "hungarian": "hun", - "italian": "ita", - "russian": "rus", - "slovenian": "slv", - "spanish": "spa", - "swedish": "swe", + "": "eng", + "english": "eng", + "english_extra": "eng", + "dutch": "dut", + "french": "fre", + "german": "ger", + "hungarian": "hun", + "italian": "ita", + "russian": "rus", + "slovenian": "slv", + "spanish": "spa", + "swedish": "swe", } var glossTypeCodeToName = map[LangCode]string{ diff --git a/jmdict_forms.go b/jmdict_forms.go index 59df010..5d01de5 100644 --- a/jmdict_forms.go +++ b/jmdict_forms.go @@ -210,7 +210,7 @@ func formsExportDb(inputPath, outputPath, languageName, title string, stride int return err } - meta := newJmdictMetadata(dictionary, "english") + meta := newJmdictMetadata(dictionary, "") terms := dbTermList{} for _, entry := range dictionary.Entries { diff --git a/jmdict_glossary.go b/jmdict_glossary.go index 0260cbf..d116981 100644 --- a/jmdict_glossary.go +++ b/jmdict_glossary.go @@ -287,7 +287,7 @@ func createGlossaryContent(sense jmdict.JmdictSense, meta jmdictMetadata) any { func createGlossary(sense jmdict.JmdictSense, meta 
jmdictMetadata) []any { glossary := []any{} - if needsStructuredContent(sense, meta.language) { + if meta.extraMode && needsStructuredContent(sense, meta.language) { glossary = append(glossary, createGlossaryContent(sense, meta)) } else { for _, gloss := range sense.Glossary { diff --git a/jmdict_metadata.go b/jmdict_metadata.go index 99af862..98e35d9 100644 --- a/jmdict_metadata.go +++ b/jmdict_metadata.go @@ -23,6 +23,7 @@ type jmdictMetadata struct { entryDepth map[sequence]int hasMultipleForms map[sequence]bool maxSenseCount int + extraMode bool } type senseID struct { @@ -152,6 +153,7 @@ func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMeta entryDepth: make(map[sequence]int), hasMultipleForms: make(map[sequence]bool), maxSenseCount: 0, + extraMode: languageName == "english_extra", } for _, entry := range dictionary.Entries { diff --git a/scripts/build_dicts.sh b/scripts/build_dicts.sh index 764d1d0..df63ac6 100755 --- a/scripts/build_dicts.sh +++ b/scripts/build_dicts.sh @@ -17,10 +17,11 @@ function refresh_source () { } refresh_source "JMdict_e_examp" -yomichan -language="english" -title="JMdict" src/JMdict_e_examp dst/jmdict_english_with_examples.zip +yomichan -language="english_extra" -title="JMdict" src/JMdict_e_examp dst/jmdict_english_extra_with_examples.zip refresh_source "JMdict" -yomichan -language="english" -title="JMdict" src/JMdict dst/jmdict_english.zip +yomichan -language="english_extra" -title="JMdict" src/JMdict dst/jmdict_english_extra.zip +yomichan -language="english" -title="JMdict (English)" src/JMdict dst/jmdict_english.zip yomichan -language="dutch" -title="JMdict (Dutch)" src/JMdict dst/jmdict_dutch.zip yomichan -language="french" -title="JMdict (French)" src/JMdict dst/jmdict_french.zip yomichan -language="german" -title="JMdict (German)" src/JMdict dst/jmdict_german.zip From aab031972c4cef099eafb30fbc2ae8a96ee8e842 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sun, 29 Jan 2023 20:06:46 -0600 Subject: [PATCH 
18/19] Simplify declaration of constants --- jmdict_constants.go | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/jmdict_constants.go b/jmdict_constants.go index cb74233..5424836 100644 --- a/jmdict_constants.go +++ b/jmdict_constants.go @@ -5,26 +5,28 @@ type LangCode struct { code string } -const edrdgAttribution = "This publication has included material from the JMdict (EDICT, etc.) dictionary files in accordance with the licence provisions of the Electronic Dictionaries Research Group. See http://www.edrdg.org/" +const ( + edrdgAttribution = "This publication has included material from the JMdict (EDICT, etc.) dictionary files in accordance with the licence provisions of the Electronic Dictionaries Research Group. See http://www.edrdg.org/" -const prioritySymbol = "★" -const rareKanjiSymbol = "🅁" -const irregularSymbol = "⚠" -const outdatedSymbol = "⛬" -const defaultSymbol = "㊒" + prioritySymbol = "★" + rareKanjiSymbol = "🅁" + irregularSymbol = "⚠" + outdatedSymbol = "⛬" + defaultSymbol = "㊒" -const priorityTagName = "⭐" -const rareKanjiTagName = "R" -const irregularTagName = "⚠️" -const outdatedTagName = "⛬" -const atejiTagName = "ateji" -const gikunTagName = "gikun" + priorityTagName = "⭐" + rareKanjiTagName = "R" + irregularTagName = "⚠️" + outdatedTagName = "⛬" + atejiTagName = "ateji" + gikunTagName = "gikun" -const langMarker = "'🌐 '" -const noteMarker = "'📝 '" -const infoMarker = "'ℹ️ '" -const refMarker = "'➡️ '" -const antonymMarker = "'🔄 '" + langMarker = "'🌐 '" + noteMarker = "'📝 '" + infoMarker = "'ℹ️ '" + refMarker = "'➡️ '" + antonymMarker = "'🔄 '" +) var ISOtoFlag = map[string]string{ "": "'🇬🇧 '", From 0b328e1e0715b178c0c335f3c90919d82f0bb45d Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sun, 29 Jan 2023 22:34:13 -0600 Subject: [PATCH 19/19] Add support for undocumented frequency and information tags Custom 
dictionary files using the JMdict XML format may contain nonstandard frequency and information tags. --- jmdict_headword.go | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/jmdict_headword.go b/jmdict_headword.go index 4ead44c..19a4bba 100644 --- a/jmdict_headword.go +++ b/jmdict_headword.go @@ -130,6 +130,9 @@ func (h *headword) SetFlags(infoTags, freqTags []string) { h.IsAteji = true case "gikun": h.IsGikun = true + default: + fmt.Println("Unknown information tag type: " + infoTag) + h.TermTags = append(h.TermTags, infoTag) } } if h.IsOutdated && h.IsRareKanji { @@ -138,16 +141,16 @@ func (h *headword) SetFlags(infoTags, freqTags []string) { } func (h *headword) SetTermTags(freqTags []string) { - h.TermTags = []string{} if h.IsPriority { h.TermTags = append(h.TermTags, priorityTagName) } + knownFreqTags := []string{"ichi1", "ichi2", "gai1", "gai2", "spec1", "spec2"} for _, tag := range freqTags { isNewsFreqTag, _ := regexp.MatchString(`nf\d\d`, tag) if isNewsFreqTag { // nf tags are divided into ranks of 500 - // (nf01 to nf48), but it will be easier - // for the user to read 1k, 2k, etc. + // (nf01 to nf48). Let's combine them into + // ranks of 1k (news1k, news2k, ..., news24k). var i int if _, err := fmt.Sscanf(tag, "nf%2d", &i); err == nil { i = (i + (i % 2)) / 2 @@ -155,10 +158,15 @@ func (h *headword) SetTermTags(freqTags []string) { h.TermTags = append(h.TermTags, newsTag) } } else if tag == "news1" || tag == "news2" { + // News tags are derived from the nf + // rankings, so these are not needed. continue - } else { - tagWithoutTheNumber := tag[:len(tag)-1] // "ichi", "gai", or "spec" + } else if slices.Contains(knownFreqTags, tag) { + tagWithoutTheNumber := tag[:len(tag)-1] h.TermTags = append(h.TermTags, tagWithoutTheNumber) + } else { + fmt.Println("Unknown frequency tag type: " + tag) + h.TermTags = append(h.TermTags, tag) } } if h.IsIrregular {