diff --git a/LICENSE b/LICENSE index f13e263..3901c0e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2016-2022 Alex Yatskov +Copyright 2016-2023 Yomichan-Import Authors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/common.go b/common.go index aa566c0..613a255 100644 --- a/common.go +++ b/common.go @@ -19,9 +19,7 @@ const ( DefaultTitle = "" ) -const databaseFormat = 3 - -type dbRecord []interface{} +type dbRecord []any type dbRecordList []dbRecord type dbTag struct { @@ -46,7 +44,7 @@ func (meta dbTagList) crush() dbRecordList { type dbMeta struct { Expression string Mode string - Data interface{} + Data any } type dbMetaList []dbMeta @@ -66,7 +64,7 @@ type dbTerm struct { DefinitionTags []string Rules []string Score int - Glossary []string + Glossary []any Sequence int TermTags []string } @@ -142,11 +140,34 @@ func (kanji dbKanjiList) crush() dbRecordList { return results } -func writeDb(outputPath, title, revision string, sequenced bool, recordData map[string]dbRecordList, stride int, pretty bool) error { +type dbIndex struct { + Title string `json:"title"` + Format int `json:"format"` + Revision string `json:"revision"` + Sequenced bool `json:"sequenced"` + Author string `json:"author"` + Url string `json:"url"` + Description string `json:"description"` + Attribution string `json:"attribution"` +} + +func (index *dbIndex) setDefaults() { + if index.Format == 0 { + index.Format = 3 + } + if index.Author == "" { + index.Author = "yomichan-import" + } + if index.Url == "" { + index.Url = "https://github.com/FooSoft/yomichan-import" + } +} + +func writeDb(outputPath string, index dbIndex, recordData map[string]dbRecordList, stride int, pretty bool) error { var zbuff bytes.Buffer zip := zip.NewWriter(&zbuff) - marshalJSON := func(obj interface{}, pretty bool) ([]byte, error) { + marshalJSON := func(obj any, pretty bool) ([]byte, 
error) { if pretty { return json.MarshalIndent(obj, "", " ") } @@ -186,17 +207,6 @@ func writeDb(outputPath, title, revision string, sequenced bool, recordData map[ } var err error - var db struct { - Title string `json:"title"` - Format int `json:"format"` - Revision string `json:"revision"` - Sequenced bool `json:"sequenced"` - } - - db.Title = title - db.Format = databaseFormat - db.Revision = revision - db.Sequenced = sequenced for recordType, recordEntries := range recordData { if _, err := writeDbRecords(recordType, recordEntries); err != nil { @@ -204,7 +214,8 @@ func writeDb(outputPath, title, revision string, sequenced bool, recordData map[ } } - bytes, err := marshalJSON(db, pretty) + index.setDefaults() + bytes, err := marshalJSON(index, pretty) if err != nil { return err } @@ -252,6 +263,39 @@ func hasString(needle string, haystack []string) bool { return false } +func intersection(s1, s2 []string) []string { + s := []string{} + m := make(map[string]bool) + for _, e := range s1 { + m[e] = true + } + for _, e := range s2 { + if m[e] { + s = append(s, e) + m[e] = false + } + } + return s +} + +func union(s1, s2 []string) []string { + s := []string{} + m := make(map[string]bool) + for _, e := range s1 { + if !m[e] { + s = append(s, e) + m[e] = true + } + } + for _, e := range s2 { + if !m[e] { + s = append(s, e) + m[e] = true + } + } + return s +} + func detectFormat(path string) (string, error) { switch filepath.Ext(path) { case ".sqlite": @@ -263,7 +307,7 @@ func detectFormat(path string) (string, error) { } switch filepath.Base(path) { - case "JMdict", "JMdict.xml", "JMdict_e", "JMdict_e.xml": + case "JMdict", "JMdict.xml", "JMdict_e", "JMdict_e.xml", "JMdict_e_examp": return "edict", nil case "JMnedict", "JMnedict.xml": return "enamdict", nil @@ -293,7 +337,8 @@ func detectFormat(path string) (string, error) { func ExportDb(inputPath, outputPath, format, language, title string, stride int, pretty bool) error { handlers := map[string]func(string, 
string, string, string, int, bool) error{ - "edict": jmdictExportDb, + "edict": jmdExportDb, + "forms": formsExportDb, "enamdict": jmnedictExportDb, "epwing": epwingExportDb, "kanjidic": kanjidicExportDb, diff --git a/daijirin.go b/daijirin.go index 2c2b190..abc30e6 100644 --- a/daijirin.go +++ b/daijirin.go @@ -65,7 +65,7 @@ func (e *daijirinExtractor) extractTerms(entry zig.BookEntry, sequence int) []db for _, reading := range readings { term := dbTerm{ Expression: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } @@ -79,7 +79,7 @@ func (e *daijirinExtractor) extractTerms(entry zig.BookEntry, sequence int) []db term := dbTerm{ Expression: expression, Reading: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } diff --git a/daijisen.go b/daijisen.go index 5d663df..332bc46 100644 --- a/daijisen.go +++ b/daijisen.go @@ -70,7 +70,7 @@ func (e *daijisenExtractor) extractTerms(entry zig.BookEntry, sequence int) []db if len(expressions) == 0 { term := dbTerm{ Expression: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } @@ -82,7 +82,7 @@ func (e *daijisenExtractor) extractTerms(entry zig.BookEntry, sequence int) []db term := dbTerm{ Expression: expression, Reading: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } diff --git a/edict.go b/edict.go deleted file mode 100644 index f30dfdb..0000000 --- a/edict.go +++ /dev/null @@ -1,246 +0,0 @@ -package yomichan - -import ( - "os" - "strings" - - "foosoft.net/projects/jmdict" -) - -const jmdictRevision = "jmdict4" - -func jmdictBuildRules(term *dbTerm) { - for _, tag := range term.DefinitionTags { - switch tag { - case "adj-i", "v1", "vk", "vz": - term.addRules(tag) - default: - if strings.HasPrefix(tag, "v5") { - term.addRules("v5") - } else if strings.HasPrefix(tag, "vs-") { - term.addRules("vs") - } - } - } -} - -func jmdictBuildScore(term 
*dbTerm) { - for _, tag := range term.DefinitionTags { - switch tag { - case "arch": - term.Score -= 100 - } - } - for _, tag := range term.TermTags { - switch tag { - case "news", "ichi", "spec", "gai1": - term.Score += 100 - case "P": - term.Score += 500 - case "iK", "ik", "ok", "oK", "io", "oik": - term.Score -= 100 - } - } -} - -func jmdictAddPriorities(term *dbTerm, priorities ...string) { - for _, priority := range priorities { - switch priority { - case "news1", "ichi1", "spec1", "gai1": - term.addTermTags("P") - fallthrough - case "news2", "ichi2", "spec2", "gai2": - term.addTermTags(priority[:len(priority)-1]) - } - } -} - -func jmdictBuildTagMeta(entities map[string]string) dbTagList { - tags := dbTagList{ - dbTag{Name: "news", Notes: "appears frequently in Mainichi Shimbun", Category: "frequent", Order: -2}, - dbTag{Name: "ichi", Notes: "listed as common in Ichimango Goi Bunruishuu", Category: "frequent", Order: -2}, - dbTag{Name: "spec", Notes: "common words not included in frequency lists", Category: "frequent", Order: -2}, - dbTag{Name: "gai", Notes: "common loanword", Category: "frequent", Order: -2}, - dbTag{Name: "P", Notes: "popular term", Category: "popular", Order: -10, Score: 10}, - } - - for name, value := range entities { - tag := dbTag{Name: name, Notes: value} - - switch name { - case "exp", "id": - tag.Category = "expression" - tag.Order = -5 - case "arch": - tag.Category = "archaism" - tag.Order = -4 - case "iK", "ik", "ok", "oK", "io", "oik": - tag.Score = -5 - case "adj-f", "adj-i", "adj-ix", "adj-ku", "adj-na", "adj-nari", "adj-no", "adj-pn", "adj-shiku", "adj-t", "adv", "adv-to", "aux-adj", - "aux", "aux-v", "conj", "cop-da", "ctr", "int", "n-adv", "n", "n-pref", "n-pr", "n-suf", "n-t", "num", "pn", "pref", "prt", "suf", - "unc", "v1", "v1-s", "v2a-s", "v2b-k", "v2d-s", "v2g-k", "v2g-s", "v2h-k", "v2h-s", "v2k-k", "v2k-s", "v2m-s", "v2n-s", "v2r-k", - "v2r-s", "v2s-s", "v2t-k", "v2t-s", "v2w-s", "v2y-k", "v2y-s", "v2z-s", "v4b", 
"v4h", "v4k", "v4m", "v4r", "v4s", "v4t", "v5aru", - "v5b", "v5g", "v5k", "v5k-s", "v5m", "v5n", "v5r-i", "v5r", "v5s", "v5t", "v5u", "v5u-s", "vi", "vk", "vn", "vr", "vs-c", "vs-i", - "vs", "vs-s", "vt", "vz": - tag.Category = "partOfSpeech" - tag.Order = -3 - } - - tags = append(tags, tag) - } - - return tags -} - -func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm { - var terms []dbTerm - - convert := func(reading jmdict.JmdictReading, kanji *jmdict.JmdictKanji) { - if kanji != nil && reading.Restrictions != nil && !hasString(kanji.Expression, reading.Restrictions) { - return - } - - var termBase dbTerm - termBase.addTermTags(reading.Information...) - - if kanji == nil { - termBase.Expression = reading.Reading - jmdictAddPriorities(&termBase, reading.Priorities...) - } else { - termBase.Expression = kanji.Expression - termBase.Reading = reading.Reading - termBase.addTermTags(kanji.Information...) - - for _, priority := range kanji.Priorities { - if hasString(priority, reading.Priorities) { - jmdictAddPriorities(&termBase, priority) - } - } - } - - var partsOfSpeech []string - for index, sense := range edictEntry.Sense { - - if len(sense.PartsOfSpeech) != 0 { - partsOfSpeech = sense.PartsOfSpeech - } - - if sense.RestrictedReadings != nil && !hasString(reading.Reading, sense.RestrictedReadings) { - continue - } - - if kanji != nil && sense.RestrictedKanji != nil && !hasString(kanji.Expression, sense.RestrictedKanji) { - continue - } - - term := dbTerm{ - Reading: termBase.Reading, - Expression: termBase.Expression, - Score: len(edictEntry.Sense) - index, - Sequence: edictEntry.Sequence, - } - - for _, glossary := range sense.Glossary { - if glossary.Language == nil && language == "" || glossary.Language != nil && language == *glossary.Language { - term.Glossary = append(term.Glossary, glossary.Content) - } - } - - if len(term.Glossary) == 0 { - continue - } - - term.addDefinitionTags(termBase.DefinitionTags...) 
- term.addTermTags(termBase.TermTags...) - term.addDefinitionTags(partsOfSpeech...) - term.addDefinitionTags(sense.Fields...) - term.addDefinitionTags(sense.Misc...) - term.addDefinitionTags(sense.Dialects...) - - jmdictBuildRules(&term) - jmdictBuildScore(&term) - - terms = append(terms, term) - } - } - - if len(edictEntry.Kanji) > 0 { - for _, kanji := range edictEntry.Kanji { - for _, reading := range edictEntry.Readings { - if reading.NoKanji == nil { - convert(reading, &kanji) - } - } - } - for _, reading := range edictEntry.Readings { - if reading.NoKanji != nil { - convert(reading, nil) - } - } - } else { - for _, reading := range edictEntry.Readings { - convert(reading, nil) - } - } - - return terms -} - -func jmdictExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error { - reader, err := os.Open(inputPath) - if err != nil { - return err - } - defer reader.Close() - - dict, entities, err := jmdict.LoadJmdictNoTransform(reader) - if err != nil { - return err - } - - var langTag string - switch language { - case "dutch": - langTag = "dut" - case "french": - langTag = "fre" - case "german": - langTag = "ger" - case "hungarian": - langTag = "hun" - case "italian": - langTag = "ita" - case "russian": - langTag = "rus" - case "slovenian": - langTag = "slv" - case "spanish": - langTag = "spa" - case "swedish": - langTag = "swe" - } - - var terms dbTermList - for _, entry := range dict.Entries { - terms = append(terms, jmdictExtractTerms(entry, langTag)...) 
- } - - if title == "" { - title = "JMdict" - } - - recordData := map[string]dbRecordList{ - "term": terms.crush(), - "tag": jmdictBuildTagMeta(entities).crush(), - } - - return writeDb( - outputPath, - title, - jmdictRevision, - true, - recordData, - stride, - pretty, - ) -} diff --git a/enamdict.go b/enamdict.go index f3df513..78b886d 100644 --- a/enamdict.go +++ b/enamdict.go @@ -6,8 +6,6 @@ import ( "foosoft.net/projects/jmdict" ) -const jmnedictRevision = "jmnedict1" - func jmnedictBuildTagMeta(entities map[string]string) dbTagList { var tags dbTagList @@ -53,7 +51,9 @@ func jmnedictExtractTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm { } for _, trans := range enamdictEntry.Translations { - term.Glossary = append(term.Glossary, trans.Translations...) + for _, translation := range trans.Translations { + term.Glossary = append(term.Glossary, translation) + } term.addDefinitionTags(trans.NameTypes...) } @@ -101,11 +101,16 @@ func jmnedictExportDb(inputPath, outputPath, language, title string, stride int, "tag": jmnedictBuildTagMeta(entities).crush(), } + index := dbIndex{ + Title: title, + Revision: "jmnedict1", + Sequenced: true, + Attribution: edrdgAttribution, + } + return writeDb( outputPath, - title, - jmnedictRevision, - true, + index, recordData, stride, pretty, diff --git a/epwing.go b/epwing.go index 37516c6..c7b2136 100644 --- a/epwing.go +++ b/epwing.go @@ -101,11 +101,15 @@ func epwingExportDb(inputPath, outputPath, language, title string, stride int, p "term": terms.crush(), } + index := dbIndex{ + Title: title, + Revision: strings.Join(revisions, ";"), + Sequenced: true, + } + return writeDb( outputPath, - title, - strings.Join(revisions, ";"), - true, + index, recordData, stride, pretty, diff --git a/frequency.go b/frequency.go index 694ed67..310856c 100644 --- a/frequency.go +++ b/frequency.go @@ -7,8 +7,6 @@ import ( "strings" ) -const frequencyRevision = "frequency1" - func frequencyTermsExportDb(inputPath, outputPath, language, title 
string, stride int, pretty bool) error { return frequncyExportDb(inputPath, outputPath, language, title, stride, pretty, "term_meta") } @@ -57,11 +55,15 @@ func frequncyExportDb(inputPath, outputPath, language, title string, stride int, key: frequencies.crush(), } + index := dbIndex{ + Title: title, + Revision: "frequency1", + Sequenced: false, + } + return writeDb( outputPath, - title, - frequencyRevision, - false, + index, recordData, stride, pretty, diff --git a/gakken.go b/gakken.go index b25f989..58e96b4 100644 --- a/gakken.go +++ b/gakken.go @@ -90,7 +90,7 @@ func (e *gakkenExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTe for _, reading := range readings { term := dbTerm{ Expression: reading, - Glossary: []string{entryText}, + Glossary: []any{entryText}, Sequence: sequence, } @@ -107,7 +107,7 @@ func (e *gakkenExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTe term := dbTerm{ Expression: expression, Reading: reading, - Glossary: []string{entryText}, + Glossary: []any{entryText}, Sequence: sequence, } diff --git a/go.mod b/go.mod index 0bca3dd..4f31a22 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( foosoft.net/projects/zero-epwing-go v0.0.0-20220704035039-bc008453615d github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e github.com/mattn/go-sqlite3 v1.14.14 + golang.org/x/exp v0.0.0-20221207211629-99ab8fa1c11f ) require golang.org/x/text v0.3.7 // indirect diff --git a/go.sum b/go.sum index ca51ada..4dd5f91 100644 --- a/go.sum +++ b/go.sum @@ -6,5 +6,7 @@ github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e h1:wSQCJiig/QkoUnpvelSP github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e/go.mod h1:5G2EjwzgZUPnnReoKvPWVneT8APYbyKkihDVAHUi0II= github.com/mattn/go-sqlite3 v1.14.14 h1:qZgc/Rwetq+MtyE18WhzjokPD93dNqLGNT3QJuLvBGw= github.com/mattn/go-sqlite3 v1.14.14/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= +golang.org/x/exp v0.0.0-20221207211629-99ab8fa1c11f 
h1:90Jq/vvGVDsqj8QqCynjFw9MCerDguSMODLYII416Y8= +golang.org/x/exp v0.0.0-20221207211629-99ab8fa1c11f/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= diff --git a/jmdict.go b/jmdict.go new file mode 100644 index 0000000..ceb5835 --- /dev/null +++ b/jmdict.go @@ -0,0 +1,258 @@ +package yomichan + +import ( + "errors" + "os" + "regexp" + "strconv" + "strings" + + "foosoft.net/projects/jmdict" + "golang.org/x/exp/slices" +) + +func grammarRules(partsOfSpeech []string) []string { + rules := []string{} + for _, partOfSpeech := range partsOfSpeech { + switch partOfSpeech { + case "adj-i", "vk", "vz": + rules = append(rules, partOfSpeech) + default: + if strings.HasPrefix(partOfSpeech, "v5") { + rules = append(rules, "v5") + } else if strings.HasPrefix(partOfSpeech, "v1") { + rules = append(rules, "v1") + } else if strings.HasPrefix(partOfSpeech, "vs-") { + rules = append(rules, "vs") + } + } + } + return rules +} + +func calculateTermScore(senseNumber int, depth int, headword headword) int { + const senseWeight int = 1 + const depthWeight int = 100 + const entryPositionWeight int = 10000 + const priorityWeight int = 1000000 + + score := 0 + score -= (senseNumber - 1) * senseWeight + score -= depth * depthWeight + score -= headword.Index * entryPositionWeight + score += headword.Score() * priorityWeight + + return score +} + +func doDisplaySenseNumberTag(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) bool { + // Display sense numbers if the entry has more than one sense + // or if the headword is found in multiple entries. 
+ hash := headword.Hash() + if !meta.extraMode { + return false + } else if meta.language != "eng" { + return false + } else if meta.seqToSenseCount[entry.Sequence] > 1 { + return true + } else if len(meta.headwordHashToSeqs[hash]) > 1 { + return true + } else { + return false + } +} + +func jmdictPublicationDate(dictionary jmdict.Jmdict) string { + dateEntry := dictionary.Entries[len(dictionary.Entries)-1] + r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`) + jmdictDate := r.FindString(dateEntry.Sense[0].Glossary[0].Content) + return jmdictDate +} + +func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) { + // Don't add "forms" terms to non-English dictionaries. + // Information would be duplicated if users installed more + // than one version. + if meta.language != "eng" || !meta.extraMode { + return dbTerm{}, false + } + // Don't need a "forms" term for entries with one unique + // headword which does not appear in any other entries. + if !meta.hasMultipleForms[entry.Sequence] { + if len(meta.headwordHashToSeqs[headword.Hash()]) == 1 { + return dbTerm{}, false + } + } + + term := baseFormsTerm(entry) + term.Expression = headword.Expression + term.Reading = headword.Reading + + term.addTermTags(headword.TermTags...) + + term.addDefinitionTags("forms") + senseNumber := meta.seqToSenseCount[entry.Sequence] + 1 + entryDepth := meta.entryDepth[entry.Sequence] + term.Score = calculateTermScore(senseNumber, entryDepth, headword) + return term, true +} + +func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) { + // Don't add "search" terms to non-English dictionaries. + // Information would be duplicated if users installed more + // than one version. 
+ if meta.language != "eng" { + return dbTerm{}, false + } + + term := dbTerm{ + Expression: headword.Expression, + Sequence: -entry.Sequence, + } + for _, sense := range entry.Sense { + rules := grammarRules(sense.PartsOfSpeech) + term.addRules(rules...) + } + term.addTermTags(headword.TermTags...) + term.Score = calculateTermScore(1, 0, headword) + + redirectHeadword := meta.seqToMainHeadword[entry.Sequence] + expHash := redirectHeadword.ExpHash() + doDisplayReading := (len(meta.expHashToReadings[expHash]) > 1) + + content := contentSpan( + contentAttr{fontSize: "130%"}, + "โŸถ", + redirectHeadword.ToInternalLink(doDisplayReading), + ) + + term.Glossary = []any{contentStructure(content)} + return term, true +} + +func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) { + if sense.RestrictedReadings != nil && !slices.Contains(sense.RestrictedReadings, headword.Reading) { + return dbTerm{}, false + } + if sense.RestrictedKanji != nil && !slices.Contains(sense.RestrictedKanji, headword.Expression) { + return dbTerm{}, false + } + + term := dbTerm{ + Expression: headword.Expression, + Reading: headword.Reading, + Sequence: entry.Sequence, + } + + term.Glossary = createGlossary(sense, meta) + + term.addTermTags(headword.TermTags...) + + if doDisplaySenseNumberTag(headword, entry, meta) { + senseNumberTag := strconv.Itoa(senseNumber) + term.addDefinitionTags(senseNumberTag) + } + term.addDefinitionTags(sense.PartsOfSpeech...) + term.addDefinitionTags(sense.Fields...) + term.addDefinitionTags(sense.Misc...) + term.addDefinitionTags(sense.Dialects...) + + rules := grammarRules(sense.PartsOfSpeech) + term.addRules(rules...) 
+ + entryDepth := meta.entryDepth[entry.Sequence] + term.Score = calculateTermScore(senseNumber, entryDepth, headword) + + return term, true +} + +func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) ([]dbTerm, bool) { + if meta.seqToSenseCount[entry.Sequence] == 0 { + return nil, false + } + if headword.IsSearchOnly { + if searchTerm, ok := createSearchTerm(headword, entry, meta); ok { + return []dbTerm{searchTerm}, true + } else { + return nil, false + } + } + terms := []dbTerm{} + senseNumber := 1 + for _, sense := range entry.Sense { + if !glossaryContainsLanguage(sense.Glossary, meta.language) { + // Do not increment sense number + continue + } + if senseTerm, ok := createSenseTerm(sense, senseNumber, headword, entry, meta); ok { + terms = append(terms, senseTerm) + } + senseNumber += 1 + } + + if formsTerm, ok := createFormsTerm(headword, entry, meta); ok { + terms = append(terms, formsTerm) + } + + return terms, true +} + +func jmdExportDb(inputPath string, outputPath string, languageName string, title string, stride int, pretty bool) error { + if _, ok := langNameToCode[languageName]; !ok { + return errors.New("Unrecognized language parameter: " + languageName) + } + + reader, err := os.Open(inputPath) + if err != nil { + return err + } + defer reader.Close() + + dictionary, entities, err := jmdict.LoadJmdictNoTransform(reader) + if err != nil { + return err + } + + meta := newJmdictMetadata(dictionary, languageName) + + terms := dbTermList{} + for _, entry := range dictionary.Entries { + headwords := extractHeadwords(entry) + for _, headword := range headwords { + if newTerms, ok := extractTerms(headword, entry, meta); ok { + terms = append(terms, newTerms...) + } + } + } + + tags := dbTagList{} + tags = append(tags, entityTags(entities)...) + tags = append(tags, senseNumberTags(meta.maxSenseCount)...) + tags = append(tags, newsFrequencyTags()...) + tags = append(tags, customDbTags()...) 
+ + recordData := map[string]dbRecordList{ + "term": terms.crush(), + "tag": tags.crush(), + } + + if title == "" { + title = "JMdict" + } + jmdictDate := jmdictPublicationDate(dictionary) + + index := dbIndex{ + Title: title, + Revision: "JMdict." + jmdictDate, + Sequenced: true, + Attribution: edrdgAttribution, + } + + return writeDb( + outputPath, + index, + recordData, + stride, + pretty, + ) +} diff --git a/jmdict_constants.go b/jmdict_constants.go new file mode 100644 index 0000000..5424836 --- /dev/null +++ b/jmdict_constants.go @@ -0,0 +1,218 @@ +package yomichan + +type LangCode struct { + language string + code string +} + +const ( + edrdgAttribution = "This publication has included material from the JMdict (EDICT, etc.) dictionary files in accordance with the licence provisions of the Electronic Dictionaries Research Group. See http://www.edrdg.org/" + + prioritySymbol = "โ˜…" + rareKanjiSymbol = "๐Ÿ…" + irregularSymbol = "โš " + outdatedSymbol = "โ›ฌ" + defaultSymbol = "ใŠ’" + + priorityTagName = "โญ" + rareKanjiTagName = "R" + irregularTagName = "โš ๏ธ" + outdatedTagName = "โ›ฌ" + atejiTagName = "ateji" + gikunTagName = "gikun" + + langMarker = "'๐ŸŒ '" + noteMarker = "'๐Ÿ“ '" + infoMarker = "'โ„น๏ธ '" + refMarker = "'โžก๏ธ '" + antonymMarker = "'๐Ÿ”„ '" +) + +var ISOtoFlag = map[string]string{ + "": "'๐Ÿ‡ฌ๐Ÿ‡ง '", + "eng": "'๐Ÿ‡ฌ๐Ÿ‡ง '", + "dut": "'๐Ÿ‡ณ๐Ÿ‡ฑ '", + "fre": "'๐Ÿ‡ซ๐Ÿ‡ท '", + "ger": "'๐Ÿ‡ฉ๐Ÿ‡ช '", + "hun": "'๐Ÿ‡ญ๐Ÿ‡บ '", + "ita": "'๐Ÿ‡ฎ๐Ÿ‡น '", + "jpn": "'๐Ÿ‡ฏ๐Ÿ‡ต '", + "rus": "'๐Ÿ‡ท๐Ÿ‡บ '", + "slv": "'๐Ÿ‡ธ๐Ÿ‡ฎ '", + "spa": "'๐Ÿ‡ช๐Ÿ‡ธ '", + "swe": "'๐Ÿ‡ธ๐Ÿ‡ช '", +} + +var langNameToCode = map[string]string{ + "": "eng", + "english": "eng", + "english_extra": "eng", + "dutch": "dut", + "french": "fre", + "german": "ger", + "hungarian": "hun", + "italian": "ita", + "russian": "rus", + "slovenian": "slv", + "spanish": "spa", + "swedish": "swe", +} + +var glossTypeCodeToName = map[LangCode]string{ + LangCode{"eng", "lit"}: "literally", + 
LangCode{"eng", "fig"}: "figuratively", + LangCode{"eng", "expl"}: "", // don't need to tell the user that an explanation is an explanation + LangCode{"eng", "tm"}: "trademark", +} + +var refNoteHint = map[LangCode]string{ + LangCode{"eng", "xref"}: "see", + LangCode{"eng", "ant"}: "antonym", +} + +var sourceLangTypeCodeToType = map[LangCode]string{ + LangCode{"eng", "part"}: "partial", + LangCode{"eng", ""}: "", // implied "full" +} + +var langCodeToName = map[LangCode]string{ + LangCode{"eng", "afr"}: "Afrikaans", + LangCode{"eng", "ain"}: "Ainu", + LangCode{"eng", "alg"}: "Algonquian", + LangCode{"eng", "amh"}: "Amharic", + LangCode{"eng", "ara"}: "Arabic", + LangCode{"eng", "arn"}: "Mapudungun", + LangCode{"eng", "bnt"}: "Bantu", + LangCode{"eng", "bre"}: "Breton", + LangCode{"eng", "bul"}: "Bulgarian", + LangCode{"eng", "bur"}: "Burmese", + LangCode{"eng", "chi"}: "Chinese", + LangCode{"eng", "chn"}: "Chinook Jargon", + LangCode{"eng", "cze"}: "Czech", + LangCode{"eng", "dan"}: "Danish", + LangCode{"eng", "dut"}: "Dutch", + LangCode{"eng", "eng"}: "English", + LangCode{"eng", "epo"}: "Esperanto", + LangCode{"eng", "est"}: "Estonian", + LangCode{"eng", "fil"}: "Filipino", + LangCode{"eng", "fin"}: "Finnish", + LangCode{"eng", "fre"}: "French", + LangCode{"eng", "geo"}: "Georgian", + LangCode{"eng", "ger"}: "German", + LangCode{"eng", "glg"}: "Galician", + LangCode{"eng", "grc"}: "Ancient Greek", + LangCode{"eng", "gre"}: "Modern Greek", + LangCode{"eng", "haw"}: "Hawaiian", + LangCode{"eng", "heb"}: "Hebrew", + LangCode{"eng", "hin"}: "Hindi", + LangCode{"eng", "hun"}: "Hungarian", + LangCode{"eng", "ice"}: "Icelandic", + LangCode{"eng", "ind"}: "Indonesian", + LangCode{"eng", "ita"}: "Italian", + LangCode{"eng", "khm"}: "Khmer", + LangCode{"eng", "kor"}: "Korean", + LangCode{"eng", "kur"}: "Kurdish", + LangCode{"eng", "lat"}: "Latin", + LangCode{"eng", "mal"}: "Malayalam", + LangCode{"eng", "mao"}: "Maori", + LangCode{"eng", "may"}: "Malay", + LangCode{"eng", 
"mnc"}: "Manchu", + LangCode{"eng", "mol"}: "Moldavian", // ISO 639 deprecated (https://iso639-3.sil.org/code/mol) + LangCode{"eng", "mon"}: "Mongolian", + LangCode{"eng", "nor"}: "Norwegian", + LangCode{"eng", "per"}: "Persian", + LangCode{"eng", "pol"}: "Polish", + LangCode{"eng", "por"}: "Portuguese", + LangCode{"eng", "rum"}: "Romanian", + LangCode{"eng", "rus"}: "Russian", + LangCode{"eng", "san"}: "Sanskrit", + LangCode{"eng", "scr"}: "Croatian", // Code doesn't seem to exist in ISO 639. Should be "hrv" instead? (https://iso639-3.sil.org/code/hrv) + LangCode{"eng", "slo"}: "Slovak", + LangCode{"eng", "slv"}: "Slovenian", + LangCode{"eng", "som"}: "Somali", + LangCode{"eng", "spa"}: "Spanish", + LangCode{"eng", "swa"}: "Swahili", + LangCode{"eng", "swe"}: "Swedish", + LangCode{"eng", "tah"}: "Tahitian", + LangCode{"eng", "tam"}: "Tamil", + LangCode{"eng", "tgl"}: "Tagalog", + LangCode{"eng", "tha"}: "Thai", + LangCode{"eng", "tib"}: "Tibetan", + LangCode{"eng", "tur"}: "Turkish", + LangCode{"eng", "ukr"}: "Ukrainian", + LangCode{"eng", "urd"}: "Urdu", + LangCode{"eng", "vie"}: "Vietnamese", + LangCode{"eng", "yid"}: "Yiddish", +} + +// https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry +var ISOtoHTML = map[string]string{ + "afr": "af", // Afrikaans + "ain": "ain", // Ainu + "alg": "alg", // Algonquian + "amh": "am", // Amharic + "ara": "ar", // Arabic + "arn": "arn", // Mapudungun + "bnt": "bnt", // Bantu + "bre": "br", // Breton + "bul": "bg", // Bulgarian + "bur": "my", // Burmese + "chi": "zh", // Chinese + "chn": "chn", // Chinook Jargon + "cze": "cs", // Czech + "dan": "da", // Danish + "dut": "nl", // Dutch + "eng": "en", // English + "epo": "eo", // Esperanto + "est": "et", // Estonian + "fil": "fil", // Filipino + "fin": "fi", // Finnish + "fre": "fr", // French + "geo": "ka", // Georgian + "ger": "de", // German + "glg": "gl", // Galician + "grc": "grc", // Ancient Greek + "gre": "el", // Modern Greek + "haw": "haw", // 
Hawaiian + "heb": "he", // Hebrew + "hin": "hi", // Hindi + "hun": "hu", // Hungarian + "ice": "is", // Icelandic + "ind": "id", // Indonesian + "ita": "it", // Italian + "jpn": "ja", // Japanese + "khm": "km", // Khmer + "kor": "ko", // Korean + "kur": "ku", // Kurdish + "lat": "la", // Latin + "mal": "ml", // Malayalam + "mao": "mi", // Maori + "may": "ms", // Malay + "mnc": "mnc", // Manchu + "mol": "ro", // Moldavian + "mon": "mn", // Mongolian + "nor": "no", // Norwegian + "per": "fa", // Persian + "pol": "pl", // Polish + "por": "pt", // Portuguese + "rum": "ro", // Romanian + "rus": "ru", // Russian + "san": "sa", // Sanskrit + "scr": "hr", // Croatian + "slo": "sk", // Slovak + "slv": "sl", // Slovenian + "som": "so", // Somali + "spa": "es", // Spanish + "swa": "sw", // Swahili + "swe": "sv", // Swedish + "tah": "ty", // Tahitian + "tam": "ta", // Tamil + "tgl": "tl", // Tagalog + "tha": "th", // Thai + "tib": "bo", // Tibetan + "tur": "tr", // Turkish + "ukr": "uk", // Ukrainian + "urd": "ur", // Urdu + "vie": "vi", // Vietnamese + "yid": "yi", // Yiddish +} diff --git a/jmdict_forms.go b/jmdict_forms.go new file mode 100644 index 0000000..5d01de5 --- /dev/null +++ b/jmdict_forms.go @@ -0,0 +1,265 @@ +package yomichan + +import ( + "os" + "strings" + + "foosoft.net/projects/jmdict" + "golang.org/x/exp/slices" +) + +func kata2hira(word string) string { + charMap := func(character rune) rune { + if (character >= 'ใ‚ก' && character <= 'ใƒถ') || (character >= 'ใƒฝ' && character <= 'ใƒพ') { + return character - 0x60 + } else { + return character + } + } + return strings.Map(charMap, word) +} + +func (h *headword) InfoSymbols() string { + infoSymbols := []string{} + if h.IsPriority { + infoSymbols = append(infoSymbols, prioritySymbol) + } + if h.IsRareKanji { + infoSymbols = append(infoSymbols, rareKanjiSymbol) + } + if h.IsIrregular { + infoSymbols = append(infoSymbols, irregularSymbol) + } + if h.IsOutdated { + infoSymbols = append(infoSymbols, 
outdatedSymbol) + } + return strings.Join(infoSymbols[:], " | ") +} + +func (h *headword) GlossText() string { + gloss := h.Expression + if h.IsAteji { + gloss = "ใ€ˆ" + gloss + "ใ€‰" + } + symbolText := h.InfoSymbols() + if symbolText != "" { + gloss += "๏ผˆ" + symbolText + "๏ผ‰" + } + return gloss +} + +func (h *headword) TableColHeaderText() string { + text := h.KanjiForm() + if h.IsAteji { + text = "ใ€ˆ" + text + "ใ€‰" + } + return text +} + +func (h *headword) TableRowHeaderText() string { + text := h.Reading + if h.IsGikun { + text = "ใ€ˆ" + text + "ใ€‰" + } + return text +} + +func (h *headword) TableCellText() string { + text := h.InfoSymbols() + if text == "" { + return defaultSymbol + } else { + return text + } +} + +func (h *headword) KanjiForm() string { + if h.IsKanaOnly() { + return "โˆ…" + } else { + return h.Expression + } +} + +func needsFormTable(headwords []headword) bool { + // Does the entry contain more than 1 distinct reading? + // E.g. ใƒใ‚ซใŒใ„ and ใฐใ‹ใŒใ„ are not distinct. 
+ uniqueReading := "" + for _, h := range headwords { + if h.IsGikun { + return true + } else if h.IsSearchOnly { + continue + } else if h.IsKanaOnly() { + continue + } else if uniqueReading == "" { + uniqueReading = kata2hira(h.Reading) + } else if uniqueReading != kata2hira(h.Reading) { + return true + } + } + return false +} + +type formTableData struct { + kanjiForms []string + readings []string + colHeaderText map[string]string + rowHeaderText map[string]string + cellText map[string]map[string]string +} + +func tableData(headwords []headword) formTableData { + d := formTableData{ + kanjiForms: []string{}, + readings: []string{}, + colHeaderText: make(map[string]string), + rowHeaderText: make(map[string]string), + cellText: make(map[string]map[string]string), + } + for _, h := range headwords { + if h.IsSearchOnly { + continue + } + kanjiForm := h.KanjiForm() + if !slices.Contains(d.kanjiForms, kanjiForm) { + d.kanjiForms = append(d.kanjiForms, kanjiForm) + d.colHeaderText[kanjiForm] = h.TableColHeaderText() + } + reading := h.Reading + if !slices.Contains(d.readings, reading) { + d.readings = append(d.readings, reading) + d.rowHeaderText[reading] = h.TableRowHeaderText() + d.cellText[reading] = make(map[string]string) + } + d.cellText[reading][kanjiForm] = h.TableCellText() + } + return d +} + +func formsTableGlossary(headwords []headword) []any { + d := tableData(headwords) + + attr := contentAttr{} + centeredAttr := contentAttr{textAlign: "center"} + leftAttr := contentAttr{textAlign: "left"} + + cornerCell := contentTableHeadCell(attr, "") // empty cell in upper left corner + headRowCells := []any{cornerCell} + for _, kanjiForm := range d.kanjiForms { + content := d.colHeaderText[kanjiForm] + cell := contentTableHeadCell(centeredAttr, content) + headRowCells = append(headRowCells, cell) + } + headRow := contentTableRow(attr, headRowCells...) 
+	tableRows := []any{headRow}
+	for _, reading := range d.readings {
+		rowHeadCellText := d.rowHeaderText[reading]
+		rowHeadCell := contentTableHeadCell(leftAttr, rowHeadCellText)
+		rowCells := []any{rowHeadCell}
+		for _, kanjiForm := range d.kanjiForms {
+			text := d.cellText[reading][kanjiForm]
+			rowCell := contentTableCell(centeredAttr, text)
+			rowCells = append(rowCells, rowCell)
+		}
+		tableRow := contentTableRow(attr, rowCells...)
+		tableRows = append(tableRows, tableRow)
+	}
+	tableAttr := contentAttr{data: map[string]string{"content": "formsTable"}}
+	contentTable := contentTable(tableAttr, tableRows...)
+	content := contentStructure(contentTable)
+	return []any{content}
+}
+
+// formsGlossary lists the gloss text of every headword that is not
+// search-only, in headword order. The result is never nil.
+func formsGlossary(headwords []headword) []any {
+	glossary := []any{}
+	for _, h := range headwords {
+		if !h.IsSearchOnly {
+			glossary = append(glossary, h.GlossText())
+		}
+	}
+	return glossary
+}
+
+func baseFormsTerm(entry jmdict.JmdictEntry) dbTerm {
+	term := dbTerm{Sequence: entry.Sequence}
+	headwords := extractHeadwords(entry)
+	if needsFormTable(headwords) {
+		term.Glossary = formsTableGlossary(headwords)
+	} else {
+		term.Glossary = formsGlossary(headwords)
+	}
+	for _, sense := range entry.Sense {
+		rules := grammarRules(sense.PartsOfSpeech)
+		term.addRules(rules...)
+ } + return term +} + +func formsExportDb(inputPath, outputPath, languageName, title string, stride int, pretty bool) error { + reader, err := os.Open(inputPath) + if err != nil { + return err + } + defer reader.Close() + + dictionary, entities, err := jmdict.LoadJmdictNoTransform(reader) + if err != nil { + return err + } + + meta := newJmdictMetadata(dictionary, "") + + terms := dbTermList{} + for _, entry := range dictionary.Entries { + baseTerm := baseFormsTerm(entry) + headwords := extractHeadwords(entry) + for _, h := range headwords { + if h.IsSearchOnly { + if term, ok := createSearchTerm(h, entry, meta); ok { + terms = append(terms, term) + } + continue + } + term := baseTerm + term.Expression = h.Expression + term.Reading = h.Reading + term.addTermTags(h.TermTags...) + term.Score = calculateTermScore(1, 0, h) + terms = append(terms, term) + } + } + + tags := dbTagList{} + tags = append(tags, entityTags(entities)...) + tags = append(tags, newsFrequencyTags()...) + tags = append(tags, customDbTags()...) + + if title == "" { + title = "JMdict Forms" + } + + recordData := map[string]dbRecordList{ + "term": terms.crush(), + "tag": tags.crush(), + } + + jmdictDate := jmdictPublicationDate(dictionary) + + index := dbIndex{ + Title: title, + Revision: "JMdict." 
+ jmdictDate,
+		Sequenced:   true,
+		Attribution: edrdgAttribution,
+	}
+
+	return writeDb(
+		outputPath,
+		index,
+		recordData,
+		stride,
+		pretty,
+	)
+}
diff --git a/jmdict_glossary.go b/jmdict_glossary.go
new file mode 100644
index 0000000..d116981
--- /dev/null
+++ b/jmdict_glossary.go
@@ -0,0 +1,300 @@
+package yomichan
+
+import (
+	"fmt"
+	"strconv"
+
+	"foosoft.net/projects/jmdict"
+)
+
+// glossaryContainsLanguage reports whether at least one gloss in the
+// glossary is written in the given language.
+func glossaryContainsLanguage(glossary []jmdict.JmdictGlossary, language string) bool {
+	for _, gloss := range glossary {
+		if glossContainsLanguage(gloss, language) {
+			return true
+		}
+	}
+	return false
+}
+
+// glossContainsLanguage reports whether the gloss is in the given
+// language. A gloss with a nil Language counts as "eng".
+func glossContainsLanguage(gloss jmdict.JmdictGlossary, language string) bool {
+	glossLanguage := "eng"
+	if gloss.Language != nil {
+		glossLanguage = *gloss.Language
+	}
+	return language == glossLanguage
+}
+
+// makeGlossListItem wraps the gloss content in a list item with empty
+// attributes. The language parameter is not used here.
+func makeGlossListItem(gloss jmdict.JmdictGlossary, language string) any {
+	return contentListItem(contentAttr{}, gloss.Content)
+}
+
+func makeInfoGlossListItem(gloss jmdict.JmdictGlossary, language string) any {
+	// Prepend gloss with "type" (literal, figurative, trademark, etc.)
+	glossTypeCode := *gloss.Type
+	contents := []any{}
+	if name, ok := glossTypeCodeToName[LangCode{language, glossTypeCode}]; ok {
+		if name != "" {
+			italicStyle := contentAttr{fontStyle: "italic"}
+			contents = append(contents, contentSpan(italicStyle, "("+name+")"), " ")
+		}
+	} else {
+		fmt.Println("Unknown glossary type code " + *gloss.Type + " for build language " + language)
+		contents = append(contents, "["+glossTypeCode+"] ")
+	}
+	contents = append(contents, gloss.Content)
+	listItem := contentListItem(contentAttr{}, contents...)
+ return listItem +} + +func makeSourceLangListItem(sourceLanguage jmdict.JmdictSource, language string) any { + contents := []any{} + + var srcLangCode string + if sourceLanguage.Language == nil { + srcLangCode = "eng" + } else { + srcLangCode = *sourceLanguage.Language + } + + // Format: [Language] ([Partial?], [Wasei?]): [Original word?] + // [Language] + if langName, ok := langCodeToName[LangCode{language, srcLangCode}]; ok { + contents = append(contents, langName) + } else { + contents = append(contents, srcLangCode) + fmt.Println("Unable to convert ISO 639 code " + srcLangCode + " to its full name in language " + language) + } + + // ([Partial?], [Wasei?]) + var sourceLangTypeCode string + if sourceLanguage.Type == nil { + sourceLangTypeCode = "" + } else { + sourceLangTypeCode = *sourceLanguage.Type + } + var sourceLangType string + if val, ok := sourceLangTypeCodeToType[LangCode{language, sourceLangTypeCode}]; ok { + sourceLangType = val + } else { + sourceLangType = sourceLangTypeCode + fmt.Println("Unknown source language type code " + sourceLangTypeCode + " for build language " + language) + } + if sourceLangType != "" && sourceLanguage.Wasei == "y" { + contents = append(contents, " ("+sourceLangType+", wasei)") + } else if sourceLangType != "" { + contents = append(contents, " ("+sourceLangType+")") + } else if sourceLanguage.Wasei == "y" { + contents = append(contents, " (wasei)") + } + + // : [Original word?] + if sourceLanguage.Content != "" { + contents = append(contents, ": ") + attr := contentAttr{lang: ISOtoHTML[srcLangCode]} + contents = append(contents, contentSpan(attr, sourceLanguage.Content)) + } + + listItem := contentListItem(contentAttr{}, contents...) 
+ return listItem +} + +func makeReferenceListItem(reference string, refType string, meta jmdictMetadata) any { + contents := []any{} + attr := contentAttr{} + + hint := refNoteHint[LangCode{meta.language, refType}] + contents = append(contents, hint+": ") + + refHeadword, senseNumber, ok := parseReference(reference) + if !ok { + contents = append(contents, "ใ€"+reference+"ใ€‘") + return contentListItem(attr, contents...) + } + + sequence, ok := meta.referenceToSeq[reference] + if !ok { + contents = append(contents, "ใ€"+reference+"ใ€‘") + return contentListItem(attr, contents...) + } + + targetSense := senseID{ + sequence: sequence, + number: senseNumber, + } + + expHash := refHeadword.ExpHash() + doDisplayReading := (len(meta.expHashToReadings[expHash]) > 1) + doDisplaySenseNumber := (meta.seqToSenseCount[targetSense.sequence] > 1) + refGlossAttr := contentAttr{ + fontSize: "65%", + verticalAlign: "middle", + data: map[string]string{"content": "refGlosses"}, + } + + contents = append(contents, refHeadword.ToInternalLink(doDisplayReading)) + if doDisplaySenseNumber { + contents = append(contents, contentSpan(refGlossAttr, " "+strconv.Itoa(targetSense.number)+". "+meta.condensedGlosses[targetSense])) + } else { + contents = append(contents, contentSpan(refGlossAttr, " "+meta.condensedGlosses[targetSense])) + } + + listItem := contentListItem(attr, contents...) 
+	return listItem
+}
+
+// makeExampleListItem wraps one example sentence in a list item. A
+// sentence whose Lang is not "jpn" also gets the lang and list-style
+// values looked up in ISOtoHTML and ISOtoFlag.
+func makeExampleListItem(sentence jmdict.JmdictExampleSentence) any {
+	attr := contentAttr{}
+	if sentence.Lang != "jpn" {
+		attr.lang = ISOtoHTML[sentence.Lang]
+		attr.listStyleType = ISOtoFlag[sentence.Lang]
+	}
+	return contentListItem(attr, sentence.Text)
+}
+
+// listAttr builds list attributes from a language, a list marker, and a
+// data map that stores dataContent under the "content" key.
+func listAttr(lang string, listStyleType string, dataContent string) contentAttr {
+	attr := contentAttr{lang: lang, listStyleType: listStyleType}
+	attr.data = map[string]string{"content": dataContent}
+	return attr
+}
+
+// needsStructuredContent reports whether the sense carries anything
+// beyond untyped glosses: a typed gloss in the given language, or any
+// source language, sense note, antonym, cross-reference or example.
+func needsStructuredContent(sense jmdict.JmdictSense, language string) bool {
+	// A gloss with a non-nil Type in the requested language is enough.
+	for _, gloss := range sense.Glossary {
+		if gloss.Type != nil && glossContainsLanguage(gloss, language) {
+			return true
+		}
+	}
+	// Otherwise any one kind of extra sense data decides it.
+	switch {
+	case len(sense.SourceLanguages) > 0,
+		len(sense.Information) > 0,
+		len(sense.Antonyms) > 0,
+		len(sense.References) > 0,
+		len(sense.Examples) > 0:
+		return true
+	}
+	return false
+}
+
+func createGlossaryContent(sense jmdict.JmdictSense, meta jmdictMetadata) any {
+	glossaryContents := []any{}
+
+	// Add normal glosses
+	glossListItems := []any{}
+	for _, gloss := range sense.Glossary {
+		if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
+			listItem := makeGlossListItem(gloss, meta.language)
+			glossListItems = append(glossListItems, listItem)
+		}
+	}
+	if len(glossListItems) > 0 {
+		attr := listAttr(ISOtoHTML[meta.language], "circle", "glossary")
+		list := contentUnorderedList(attr, glossListItems...)
+ glossaryContents = append(glossaryContents, list) + } + + // Add information glosses + infoGlossListItems := []any{} + for _, gloss := range sense.Glossary { + if glossContainsLanguage(gloss, meta.language) && gloss.Type != nil { + listItem := makeInfoGlossListItem(gloss, meta.language) + infoGlossListItems = append(infoGlossListItems, listItem) + } + } + if len(infoGlossListItems) > 0 { + attr := listAttr(ISOtoHTML[meta.language], infoMarker, "infoGlossary") + list := contentUnorderedList(attr, infoGlossListItems...) + glossaryContents = append(glossaryContents, list) + } + + // Add language-of-origin / loanword information + sourceLangListItems := []any{} + for _, sourceLanguage := range sense.SourceLanguages { + listItem := makeSourceLangListItem(sourceLanguage, meta.language) + sourceLangListItems = append(sourceLangListItems, listItem) + } + if len(sourceLangListItems) > 0 { + attr := listAttr(ISOtoHTML[meta.language], langMarker, "sourceLanguages") + list := contentUnorderedList(attr, sourceLangListItems...) + glossaryContents = append(glossaryContents, list) + } + + // Add sense notes + noteListItems := []any{} + for _, information := range sense.Information { + listItem := contentListItem(contentAttr{}, information) + noteListItems = append(noteListItems, listItem) + } + if len(noteListItems) > 0 { + attr := listAttr(ISOtoHTML["jpn"], noteMarker, "notes") // notes often contain japanese text + list := contentUnorderedList(attr, noteListItems...) + glossaryContents = append(glossaryContents, list) + } + + // Add antonyms + antonymListItems := []any{} + for _, antonym := range sense.Antonyms { + listItem := makeReferenceListItem(antonym, "ant", meta) + antonymListItems = append(antonymListItems, listItem) + } + if len(antonymListItems) > 0 { + attr := listAttr(ISOtoHTML[meta.language], antonymMarker, "antonyms") + list := contentUnorderedList(attr, antonymListItems...) 
+ glossaryContents = append(glossaryContents, list) + } + + // Add cross-references + referenceListItems := []any{} + for _, reference := range sense.References { + listItem := makeReferenceListItem(reference, "xref", meta) + referenceListItems = append(referenceListItems, listItem) + } + if len(referenceListItems) > 0 { + attr := listAttr(ISOtoHTML[meta.language], refMarker, "references") + list := contentUnorderedList(attr, referenceListItems...) + glossaryContents = append(glossaryContents, list) + } + + // Add example sentences + exampleListItems := []any{} + for _, example := range sense.Examples { + for _, sentence := range example.Sentences { + listItem := makeExampleListItem(sentence) + exampleListItems = append(exampleListItems, listItem) + } + } + if len(exampleListItems) > 0 { + attr := listAttr(ISOtoHTML["jpn"], ISOtoFlag["jpn"], "examples") + list := contentUnorderedList(attr, exampleListItems...) + glossaryContents = append(glossaryContents, list) + } + + return contentStructure(glossaryContents...) 
+} + +func createGlossary(sense jmdict.JmdictSense, meta jmdictMetadata) []any { + glossary := []any{} + if meta.extraMode && needsStructuredContent(sense, meta.language) { + glossary = append(glossary, createGlossaryContent(sense, meta)) + } else { + for _, gloss := range sense.Glossary { + if glossContainsLanguage(gloss, meta.language) { + glossary = append(glossary, gloss.Content) + } + } + } + return glossary +} diff --git a/jmdict_headword.go b/jmdict_headword.go new file mode 100644 index 0000000..19a4bba --- /dev/null +++ b/jmdict_headword.go @@ -0,0 +1,282 @@ +package yomichan + +import ( + "fmt" + "hash/fnv" + "regexp" + "strconv" + + "foosoft.net/projects/jmdict" + "golang.org/x/exp/slices" +) + +type headword struct { + Expression string + Reading string + TermTags []string + Index int + IsPriority bool + IsFrequent bool + IsIrregular bool + IsOutdated bool + IsRareKanji bool + IsSearchOnly bool + IsAteji bool + IsGikun bool +} + +type hash uint64 + +func (h *headword) Hash() hash { + return hashText(h.Expression + "โž" + h.Reading) +} + +func (h *headword) ExpHash() hash { + return hashText(h.Expression + "โž" + h.Expression) +} + +func (h *headword) ReadingHash() hash { + return hashText(h.Reading + "โž" + h.Reading) +} + +func hashText(s string) hash { + h := fnv.New64a() + h.Write([]byte(s)) + return hash(h.Sum64()) +} + +func (h *headword) IsKanaOnly() bool { + if h.Expression != h.Reading { + return false + } + for _, char := range h.Expression { + if char >= 'ใ' && char <= 'ใƒฟ' { + // hiragana and katakana range + continue + } else if char >= '๏ฝฅ' && char <= '๏พŸ' { + // halfwidth katakana range + continue + } else if char == 'ใ€œ' { + continue + } else { + return false + } + } + return true +} + +func (h *headword) Score() int { + score := 0 + if h.IsPriority { + score += 1 + } + if h.IsFrequent { + score += 1 + } + if h.IsIrregular { + score -= 5 + } + if h.IsOutdated { + score -= 5 + } + if h.IsRareKanji { + score -= 5 + } + if 
h.IsSearchOnly { + score -= 5 + } + return score +} + +func (h *headword) ToInternalLink(includeReading bool) any { + if !includeReading || h.Expression == h.Reading { + return contentInternalLink( + contentAttr{lang: ISOtoHTML["jpn"]}, + h.Expression, + ) + } else { + return contentSpan( + contentAttr{lang: ISOtoHTML["jpn"]}, + contentInternalLink(contentAttr{}, h.Expression), + "๏ผˆ", + contentInternalLink(contentAttr{}, h.Reading), + "๏ผ‰", + ) + } +} + +func (h *headword) SetFlags(infoTags, freqTags []string) { + priorityTags := []string{"ichi1", "news1", "gai1", "spec1", "spec2"} + for _, priorityTag := range priorityTags { + if slices.Contains(freqTags, priorityTag) { + h.IsPriority = true + break + } + } + if len(freqTags) > 1 { + h.IsFrequent = true + } + for _, infoTag := range infoTags { + switch infoTag { + case "iK", "ik", "io": + h.IsIrregular = true + case "oK", "ok": + h.IsOutdated = true + case "sK", "sk": + h.IsSearchOnly = true + case "rK": + h.IsRareKanji = true + case "ateji": + h.IsAteji = true + case "gikun": + h.IsGikun = true + default: + fmt.Println("Unknown information tag type: " + infoTag) + h.TermTags = append(h.TermTags, infoTag) + } + } + if h.IsOutdated && h.IsRareKanji { + h.IsRareKanji = false + } +} + +func (h *headword) SetTermTags(freqTags []string) { + if h.IsPriority { + h.TermTags = append(h.TermTags, priorityTagName) + } + knownFreqTags := []string{"ichi1", "ichi2", "gai1", "gai2", "spec1", "spec2"} + for _, tag := range freqTags { + isNewsFreqTag, _ := regexp.MatchString(`nf\d\d`, tag) + if isNewsFreqTag { + // nf tags are divided into ranks of 500 + // (nf01 to nf48). Let's combine them into + // ranks of 1k (news1k, news2k, ..., news24k). 
+ var i int + if _, err := fmt.Sscanf(tag, "nf%2d", &i); err == nil { + i = (i + (i % 2)) / 2 + newsTag := "news" + strconv.Itoa(i) + "k" + h.TermTags = append(h.TermTags, newsTag) + } + } else if tag == "news1" || tag == "news2" { + // News tags are derived from the nf + // rankings, so these are not needed. + continue + } else if slices.Contains(knownFreqTags, tag) { + tagWithoutTheNumber := tag[:len(tag)-1] + h.TermTags = append(h.TermTags, tagWithoutTheNumber) + } else { + fmt.Println("Unknown frequency tag type: " + tag) + h.TermTags = append(h.TermTags, tag) + } + } + if h.IsIrregular { + h.TermTags = append(h.TermTags, irregularTagName) + } + if h.IsOutdated { + h.TermTags = append(h.TermTags, outdatedTagName) + } + if h.IsRareKanji { + h.TermTags = append(h.TermTags, rareKanjiTagName) + } + if h.IsAteji { + h.TermTags = append(h.TermTags, atejiTagName) + } + if h.IsGikun { + h.TermTags = append(h.TermTags, gikunTagName) + } +} + +func newHeadword(kanji *jmdict.JmdictKanji, reading *jmdict.JmdictReading) headword { + h := headword{} + infoTags := []string{} + freqTags := []string{} + if kanji == nil { + h.Expression = reading.Reading + h.Reading = reading.Reading + infoTags = reading.Information + freqTags = reading.Priorities + } else if reading == nil { + // should only apply to search-only kanji terms + h.Expression = kanji.Expression + h.Reading = "" + infoTags = kanji.Information + freqTags = kanji.Priorities + } else { + h.Expression = kanji.Expression + h.Reading = reading.Reading + infoTags = union(kanji.Information, reading.Information) + freqTags = intersection(kanji.Priorities, reading.Priorities) + } + h.SetFlags(infoTags, freqTags) + h.SetTermTags(freqTags) + return h +} + +func areAllKanjiIrregular(allKanji []jmdict.JmdictKanji) bool { + // If every kanji form is rare or irregular, then we'll make + // kana-only headwords for each kana form. 
+ if len(allKanji) == 0 { + return false + } + for _, kanji := range allKanji { + h := newHeadword(&kanji, nil) + kanjiIsIrregular := h.IsRareKanji || h.IsIrregular || h.IsOutdated || h.IsSearchOnly + if !kanjiIsIrregular { + return false + } + } + return true +} + +func extractHeadwords(entry jmdict.JmdictEntry) []headword { + headwords := []headword{} + allKanjiAreIrregular := areAllKanjiIrregular(entry.Kanji) + + if allKanjiAreIrregular { + // Adding the reading-only terms before kanji+reading + // terms here for the sake of the Index property, + // which affects the yomichan term ranking. + for _, reading := range entry.Readings { + h := newHeadword(nil, &reading) + h.Index = len(headwords) + headwords = append(headwords, h) + } + } + + for _, kanji := range entry.Kanji { + if slices.Contains(kanji.Information, "sK") { + // Search-only kanji forms do not have associated readings. + h := newHeadword(&kanji, nil) + h.Index = len(headwords) + headwords = append(headwords, h) + continue + } + for _, reading := range entry.Readings { + if reading.NoKanji != nil { + continue + } else if slices.Contains(reading.Information, "sk") { + // Search-only kana forms do not have associated kanji forms. 
+ continue + } else if reading.Restrictions != nil && !slices.Contains(reading.Restrictions, kanji.Expression) { + continue + } else { + h := newHeadword(&kanji, &reading) + h.Index = len(headwords) + headwords = append(headwords, h) + } + } + } + + if !allKanjiAreIrregular { + noKanjiInEntry := (len(entry.Kanji) == 0) + for _, reading := range entry.Readings { + if reading.NoKanji != nil || noKanjiInEntry || slices.Contains(reading.Information, "sk") { + h := newHeadword(nil, &reading) + h.Index = len(headwords) + headwords = append(headwords, h) + } + } + } + + return headwords +} diff --git a/jmdict_metadata.go b/jmdict_metadata.go new file mode 100644 index 0000000..98e35d9 --- /dev/null +++ b/jmdict_metadata.go @@ -0,0 +1,183 @@ +package yomichan + +import ( + "strings" + + "foosoft.net/projects/jmdict" + "golang.org/x/exp/slices" +) + +type sequence = int + +type jmdictMetadata struct { + language string + condensedGlosses map[senseID]string + seqToSenseCount map[sequence]int + seqToMainHeadword map[sequence]headword + expHashToReadings map[hash][]string + headwordHashToSeqs map[hash][]sequence + references []string + referenceToSeq map[string]sequence + hashToSearchValues map[hash][]searchValue + seqToSearchHashes map[sequence][]searchHash + entryDepth map[sequence]int + hasMultipleForms map[sequence]bool + maxSenseCount int + extraMode bool +} + +type senseID struct { + sequence sequence + number int +} + +func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, entrySequence sequence) { + // This is to ensure that terms are grouped among their + // entries of origin and displayed in correct sequential order + maxDepth := 0 + for _, headword := range headwords { + hash := headword.Hash() + for _, seq := range meta.headwordHashToSeqs[hash] { + seqDepth := meta.entryDepth[seq] + if seqDepth == 0 { + meta.entryDepth[seq] = 1 + seqDepth = 1 + } + if maxDepth < seqDepth+1 { + maxDepth = seqDepth + 1 + } + } + } + meta.entryDepth[entrySequence] = 
maxDepth +} + +func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEntry) { + + // Determine how many senses are in this entry for this language + if _, ok := meta.seqToSenseCount[entry.Sequence]; !ok { + senseCount := 0 + for _, entrySense := range entry.Sense { + for _, gloss := range entrySense.Glossary { + if glossContainsLanguage(gloss, meta.language) { + senseCount += 1 + break + } + } + } + meta.seqToSenseCount[entry.Sequence] = senseCount + } + + if meta.seqToSenseCount[entry.Sequence] == 0 { + return + } + + // main headwords (first ones that are found in entries). + if _, ok := meta.seqToMainHeadword[entry.Sequence]; !ok { + meta.seqToMainHeadword[entry.Sequence] = headword + } + + // hash the term pair so we can determine if it's used + // in more than one JMdict entry later. + headwordHash := headword.Hash() + if !slices.Contains(meta.headwordHashToSeqs[headwordHash], entry.Sequence) { + meta.headwordHashToSeqs[headwordHash] = append(meta.headwordHashToSeqs[headwordHash], entry.Sequence) + } + + // hash the expression so that we can determine if we + // need to disambiguate it by displaying its reading + // in reference notes later. + expHash := headword.ExpHash() + if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) { + meta.expHashToReadings[expHash] = append(meta.expHashToReadings[expHash], headword.Reading) + } + + // e.g. for JMdict (English) we expect to end up with + // seqToHashedHeadwords[1260670] == ใ€ๅ…ƒใƒปใ‚‚ใจใ€‘ใ€ใ€ๅ…ƒใƒปๅ…ƒใ€‘ใ€ใ€ใ‚‚ใจใƒปใ‚‚ใจใ€‘ใ€ใ€ๆœฌใƒปใ‚‚ใจใ€‘ใ€ใ€ๆœฌใƒปๆœฌใ€‘ใ€ใ€็ด ใƒปใ‚‚ใจใ€‘ใ€ใ€็ด ใƒป็ด ใ€‘ใ€ใ€ๅŸบใƒปใ‚‚ใจใ€‘ใ€ใ€ๅŸบใƒปๅŸบใ€‘ + // used for correlating references to sequence numbers later. 
+	searchHashes := []searchHash{
+		{headwordHash, headword.IsPriority},
+		{expHash, headword.IsPriority},
+		{headword.ReadingHash(), headword.IsPriority},
+	}
+	for _, x := range searchHashes {
+		if !slices.Contains(meta.seqToSearchHashes[entry.Sequence], x) {
+			meta.seqToSearchHashes[entry.Sequence] = append(meta.seqToSearchHashes[entry.Sequence], x)
+		}
+	}
+
+	currentSenseNumber := 1
+	for _, entrySense := range entry.Sense {
+		if !glossaryContainsLanguage(entrySense.Glossary, meta.language) {
+			continue // senses without a gloss in meta.language take no number
+		}
+		if entrySense.RestrictedReadings != nil && !slices.Contains(entrySense.RestrictedReadings, headword.Reading) {
+			currentSenseNumber++ // restricted to other readings, but the number is still used up
+			continue
+		}
+		if entrySense.RestrictedKanji != nil && !slices.Contains(entrySense.RestrictedKanji, headword.Expression) {
+			currentSenseNumber++ // restricted to other kanji forms, but the number is still used up
+			continue
+		}
+
+		// Append onto meta.references directly, references before antonyms.
+		// append(References, Antonyms...) could write into References' spare capacity.
+		meta.references = append(meta.references, entrySense.References...)
+		meta.references = append(meta.references, entrySense.Antonyms...)
+
+		currentSense := senseID{entry.Sequence, currentSenseNumber}
+		if meta.condensedGlosses[currentSense] == "" {
+			glosses := []string{}
+			for _, gloss := range entrySense.Glossary {
+				if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
+					glosses = append(glosses, gloss.Content)
+				}
+			}
+			meta.condensedGlosses[currentSense] = strings.Join(glosses, "; ")
+		}
+		currentSenseNumber++
+	}
+}
+
+func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata {
+	meta := jmdictMetadata{
+		language:           langNameToCode[languageName],
+		seqToSenseCount:    make(map[sequence]int),
+		condensedGlosses:   make(map[senseID]string),
+		seqToMainHeadword:  make(map[sequence]headword),
+		expHashToReadings:  make(map[hash][]string),
+		seqToSearchHashes:  make(map[sequence][]searchHash),
+		headwordHashToSeqs: make(map[hash][]sequence),
+		references:         []string{},
+		hashToSearchValues: nil,
+		referenceToSeq:     nil,
+		entryDepth:         make(map[sequence]int),
+ hasMultipleForms: make(map[sequence]bool), + maxSenseCount: 0, + extraMode: languageName == "english_extra", + } + + for _, entry := range dictionary.Entries { + headwords := extractHeadwords(entry) + formCount := 0 + for _, headword := range headwords { + meta.AddHeadword(headword, entry) + if !headword.IsSearchOnly { + formCount += 1 + } + } + meta.CalculateEntryDepth(headwords, entry.Sequence) + meta.hasMultipleForms[entry.Sequence] = (formCount > 1) + } + + // this correlation process will be unnecessary once JMdict + // includes sequence numbers in its cross-reference data + meta.MakeReferenceToSeqMap() + + for _, senseCount := range meta.seqToSenseCount { + if meta.maxSenseCount < senseCount { + meta.maxSenseCount = senseCount + } + } + + return meta +} diff --git a/jmdict_references.go b/jmdict_references.go new file mode 100644 index 0000000..aa5d229 --- /dev/null +++ b/jmdict_references.go @@ -0,0 +1,170 @@ +package yomichan + +import ( + "fmt" + "strconv" + "strings" +) + +/* + * In the future, JMdict will be updated to include sequence numbers + * with each cross reference. At that time, most of the functions and + * types defined in this file will become unnecessary. see: + * https://www.edrdg.org/jmdict_edict_list/2022/msg00008.html + */ + +type searchValue struct { + sequence sequence + index int + isPriority bool +} + +type searchHash struct { + hash hash + isPriority bool +} + +func parseReference(reference string) (headword, int, bool) { + // Reference strings in JMDict currently consist of 3 parts at + // most, separated by ใƒป characters. The latter two parts are + // optional. When the sense number is not specified, it is + // implied to be the first sense. 
+ var h headword + var senseNumber int + ok := true + refParts := strings.Split(reference, "ใƒป") + if len(refParts) == 1 { + // (Kanji) or (Reading) + h = headword{Expression: refParts[0], Reading: refParts[0]} + senseNumber = 1 + } else if len(refParts) == 2 { + // [Kanji + (Reading or Sense)] or (Reading + Sense) + val, err := strconv.Atoi(refParts[1]) + if err == nil { + h = headword{Expression: refParts[0], Reading: refParts[0]} + senseNumber = val + } else { + h = headword{Expression: refParts[0], Reading: refParts[1]} + senseNumber = 1 + } + } else if len(refParts) == 3 { + // Expression + Reading + Sense + h = headword{Expression: refParts[0], Reading: refParts[1]} + val, err := strconv.Atoi(strings.TrimSpace(refParts[2])) + if err == nil { + senseNumber = val + } else { + errortext := "Unexpected format (3rd part not integer) for x-ref \"" + reference + "\"" + fmt.Println(errortext) + ok = false + } + } else { + errortext := "Unexpected format for x-ref \"" + reference + "\"" + fmt.Println(errortext) + ok = false + } + return h, senseNumber, ok +} + +func (meta *jmdictMetadata) MakeReferenceToSeqMap() { + + meta.referenceToSeq = make(map[string]sequence) + meta.MakeHashToSearchValuesMap() + + for _, reference := range meta.references { + if meta.referenceToSeq[reference] != 0 { + continue + } + seq := meta.FindBestSequence(reference) + if seq != 0 { + meta.referenceToSeq[reference] = seq + } else { + fmt.Println("Unable to convert reference to sequence number: `" + reference + "`") + } + } +} + +func (meta *jmdictMetadata) MakeHashToSearchValuesMap() { + meta.hashToSearchValues = make(map[hash][]searchValue) + for seq, searchHashes := range meta.seqToSearchHashes { + for idx, searchHash := range searchHashes { + searchValue := searchValue{ + sequence: seq, + index: idx, + isPriority: searchHash.isPriority, + } + meta.hashToSearchValues[searchHash.hash] = + append(meta.hashToSearchValues[searchHash.hash], searchValue) + } + } +} + +/* + * This function 
attempts to convert a JMdict reference string into a + * single definite sequence number. These reference strings are often + * ambiguous, so we have to resort to using heuristics. + * + * Generally, correspondence is determined by the order in which term + * pairs are extracted from each JMdict entry. Take for example the + * JMdict entry for ご本, which contains a reference to 本 (without a + * reading specified). To correlate this reference with a sequence + * number, our program searches each entry for the hash of【本・本】. + * There are two entries in which it is found in JMdict (English): + * + * sequence 1260670: 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】 + * sequence 1522150: 【本・ほん】、【本・本】、【ほん・ほん】 + * + * Because 【本・本】 is closer to the beginning of the array in the + * latter (i.e., has the lowest index), sequence number 1522150 is + * returned. + * + * In situations in which multiple sequences are found with the same + * index, the entry with a priority tag ("news1", "ichi1", "spec1", + * "spec2", "gai1") is given preference. This mostly affects + * katakana-only loanwords like ラグ. + * + * To improve accuracy, this method also checks to see if the + * reference's specified sense number really exists in the + * corresponding entry. For example, sequence 1582850 【如何で・いかんで】 + * has a reference to sense #2 of いかん (no kanji specified), which + * could belong to 13 different sequences. However, sequences 1582850 + * and 2829697 are the only 2 of those 13 which contain more than one + * sense. Incidentally, sequence 1582850 is the correct match. + * + * All else being equal, the entry with the smallest sequence number + * is chosen.
References in the JMdict file are currently ambiguous,
+ * and getting this perfect won't be possible until reference sequence
+ * numbers are included in the file. See:
+ * https://github.com/JMdictProject/JMdictIssues/issues/61
+ */
+func (meta *jmdictMetadata) FindBestSequence(reference string) sequence {
+	refHeadword, senseNumber, ok := parseReference(reference)
+	if !ok {
+		// unparseable reference: report "no sequence"
+		return 0
+	}
+
+	// Running winner. The sentinel index 100000 is assumed to exceed any
+	// real index, so the first acceptable candidate replaces it.
+	best := searchValue{sequence: 0, index: 100000, isPriority: false}
+
+	for _, candidate := range meta.hashToSearchValues[refHeadword.Hash()] {
+		sameIndex := best.index == candidate.index
+		samePriority := best.isPriority == candidate.isPriority
+		switch {
+		case meta.seqToSenseCount[candidate.sequence] < senseNumber:
+			// entry must contain the specified sense
+		case best.index < candidate.index:
+			// lower indices are better
+		case sameIndex && best.isPriority && !candidate.isPriority:
+			// if indices match, a priority entry is kept
+		case sameIndex && samePriority && best.sequence < candidate.sequence:
+			// if indices and priority match, the lower sequence
+			// number is kept
+		default:
+			best = candidate
+		}
+	}
+
+	return best.sequence
+}
diff --git a/jmdict_tags.go b/jmdict_tags.go
new file mode 100644
index 0000000..b444c47
--- /dev/null
+++ b/jmdict_tags.go
@@ -0,0 +1,348 @@
+package yomichan
+
+import (
+	"fmt"
+	"strconv"
+
+	"golang.org/x/exp/slices"
+)
+
+// senseNumberTags returns one tag per sense number from 1 through
+// maxSenseCount, each named by the number itself. A count below 1
+// gives an empty, non-nil list.
+func senseNumberTags(maxSenseCount int) []dbTag {
+	tags := []dbTag{}
+	for number := 1; number <= maxSenseCount; number++ {
+		label := strconv.Itoa(number)
+		// Order -10: these tags will appear on the left side
+		tags = append(tags, dbTag{Name: label, Order: -10, Notes: "JMdict Sense #" + label})
+	}
+	return tags
+}
+
+func newsFrequencyTags() []dbTag {
+	// 24,000 ranks divided into 24 tags, news1k ...
news24k + tags := []dbTag{} + for i := 1; i <= 24; i++ { + tagName := "news" + strconv.Itoa(i) + "k" + var startRank string + if i == 1 { + startRank = "1" + } else { + // technically should be ",001", but that looks odd + startRank = strconv.Itoa(i-1) + ",000" + } + endRank := strconv.Itoa(i) + ",000" + tag := dbTag{ + Name: tagName, + Order: -2, + Score: 0, + Category: "frequent", + Notes: "ranked between the top " + startRank + " and " + endRank + " words in a frequency analysis of the Mainichi Shimbun (1990s)", + } + tags = append(tags, tag) + } + return tags +} + +func entityTags(entities map[string]string) []dbTag { + tags := knownEntityTags() + for name, notes := range entities { + idx := slices.IndexFunc(tags, func(t dbTag) bool { return t.Name == name }) + if idx != -1 { + tags[idx].Notes = notes + } else { + fmt.Println("Unknown tag type \"" + name + "\": " + notes) + unknownTag := dbTag{Name: name, Notes: notes} + tags = append(tags, unknownTag) + } + } + return tags +} + +func customDbTags() []dbTag { + return []dbTag{ + dbTag{Name: priorityTagName, Order: -10, Score: 10, Category: "popular", Notes: "high priority term"}, + dbTag{Name: rareKanjiTagName, Order: 0, Score: -5, Category: "archaism", Notes: "rarely-used kanji form of this expression"}, + dbTag{Name: irregularTagName, Order: 0, Score: -5, Category: "archaism", Notes: "irregular form of this expression"}, + dbTag{Name: outdatedTagName, Order: 0, Score: -5, Category: "archaism", Notes: "outdated form of this expression"}, + dbTag{Name: "ichi", Order: -2, Score: 0, Category: "frequent", Notes: "included in Ichimango Goi Bunruishuu (๏ผ‘ไธ‡่ชž่ชžๅฝ™ๅˆ†้กž้›†)"}, + dbTag{Name: "spec", Order: -2, Score: 0, Category: "frequent", Notes: "specified as common by JMdict editors"}, + dbTag{Name: "gai", Order: -2, Score: 0, Category: "frequent", Notes: "common loanword (gairaigoใƒปๅค–ๆฅ่ชž)"}, + dbTag{Name: "forms", Order: 0, Score: 0, Category: "", Notes: "other surface forms and readings"}, + } +} + 
+func knownEntityTags() []dbTag { + return []dbTag{ + // see: https://www.edrdg.org/jmdictdb/cgi-bin/edhelp.py?svc=jmdict&sid=#kwabbr + // additional descriptions at the beginning of the JMdict file + + // reading info + dbTag{Name: "gikun", Order: 0, Score: 0, Category: ""}, // gikun (meaning as reading) or jukujikun (special kanji reading) + dbTag{Name: "ik", Order: 0, Score: -5, Category: ""}, // word containing irregular kana usage + dbTag{Name: "ok", Order: 0, Score: -5, Category: ""}, // out-dated or obsolete kana usage + dbTag{Name: "sk", Order: 0, Score: -5, Category: ""}, // search-only kana form + + // kanji info + /* kanji info also has a "ik" entity that would go here if not already for the re_inf tag */ + dbTag{Name: "ateji", Order: 0, Score: 0, Category: ""}, // ateji (phonetic) reading + dbTag{Name: "iK", Order: 0, Score: -5, Category: ""}, // word containing irregular kanji usage + dbTag{Name: "io", Order: 0, Score: -5, Category: ""}, // irregular okurigana usage + dbTag{Name: "oK", Order: 0, Score: -5, Category: ""}, // word containing out-dated kanji or kanji usage + dbTag{Name: "rK", Order: 0, Score: -5, Category: ""}, // rarely-used kanji form + dbTag{Name: "sK", Order: 0, Score: -5, Category: ""}, // search-only kanji form + + // miscellaneous sense info + dbTag{Name: "abbr", Order: 0, Score: 0, Category: ""}, // abbreviation + dbTag{Name: "arch", Order: -4, Score: 0, Category: "archaism"}, // archaism + dbTag{Name: "char", Order: 0, Score: 0, Category: ""}, // character + dbTag{Name: "chn", Order: 0, Score: 0, Category: ""}, // children's language + dbTag{Name: "col", Order: 0, Score: 0, Category: ""}, // colloquialism + dbTag{Name: "company", Order: 0, Score: 0, Category: ""}, // company name + dbTag{Name: "creat", Order: 0, Score: 0, Category: ""}, // creature + dbTag{Name: "dated", Order: -4, Score: 0, Category: "archaism"}, // dated term + dbTag{Name: "dei", Order: 0, Score: 0, Category: ""}, // deity + dbTag{Name: "derog", Order: 0, 
Score: 0, Category: ""}, // derogatory + dbTag{Name: "doc", Order: 0, Score: 0, Category: ""}, // document + dbTag{Name: "euph", Order: 0, Score: 0, Category: ""}, // euphemistic + dbTag{Name: "ev", Order: 0, Score: 0, Category: ""}, // event + dbTag{Name: "fam", Order: 0, Score: 0, Category: ""}, // familiar language + dbTag{Name: "fem", Order: 0, Score: 0, Category: ""}, // female term or language + dbTag{Name: "fict", Order: 0, Score: 0, Category: ""}, // fiction + dbTag{Name: "form", Order: 0, Score: 0, Category: ""}, // formal or literary term + dbTag{Name: "given", Order: 0, Score: 0, Category: ""}, // given name or forename, gender not specified + dbTag{Name: "group", Order: 0, Score: 0, Category: ""}, // group + dbTag{Name: "hist", Order: 0, Score: 0, Category: ""}, // historical term + dbTag{Name: "hon", Order: 0, Score: 0, Category: ""}, // honorific or respectful (sonkeigo) language + dbTag{Name: "hum", Order: 0, Score: 0, Category: ""}, // humble (kenjougo) language + dbTag{Name: "id", Order: -5, Score: 0, Category: "expression"}, // idiomatic expression + dbTag{Name: "joc", Order: 0, Score: 0, Category: ""}, // jocular, humorous term + dbTag{Name: "leg", Order: 0, Score: 0, Category: ""}, // legend + dbTag{Name: "m-sl", Order: 0, Score: 0, Category: ""}, // manga slang + dbTag{Name: "male", Order: 0, Score: 0, Category: ""}, // male term or language + dbTag{Name: "myth", Order: 0, Score: 0, Category: ""}, // mythology + dbTag{Name: "net-sl", Order: 0, Score: 0, Category: ""}, // Internet slang + dbTag{Name: "obj", Order: 0, Score: 0, Category: ""}, // object + dbTag{Name: "obs", Order: -4, Score: 0, Category: "archaism"}, // obsolete term + dbTag{Name: "on-mim", Order: 0, Score: 0, Category: ""}, // onomatopoeic or mimetic word + dbTag{Name: "organization", Order: 0, Score: 0, Category: ""}, // organization name + dbTag{Name: "oth", Order: 0, Score: 0, Category: ""}, // other + dbTag{Name: "person", Order: 0, Score: 0, Category: ""}, // full name of a 
particular person + dbTag{Name: "place", Order: 0, Score: 0, Category: ""}, // place name + dbTag{Name: "poet", Order: 0, Score: 0, Category: ""}, // poetical term + dbTag{Name: "pol", Order: 0, Score: 0, Category: ""}, // polite (teineigo) language + dbTag{Name: "product", Order: 0, Score: 0, Category: ""}, // product name + dbTag{Name: "proverb", Order: 0, Score: 0, Category: "expression"}, // proverb + dbTag{Name: "quote", Order: 0, Score: 0, Category: "expression"}, // quotation + dbTag{Name: "rare", Order: -4, Score: 0, Category: "archaism"}, // rare + dbTag{Name: "relig", Order: 0, Score: 0, Category: ""}, // religion + dbTag{Name: "sens", Order: 0, Score: 0, Category: ""}, // sensitive + dbTag{Name: "serv", Order: 0, Score: 0, Category: ""}, // service + dbTag{Name: "ship", Order: 0, Score: 0, Category: ""}, // ship name + dbTag{Name: "sl", Order: 0, Score: 0, Category: ""}, // slang + dbTag{Name: "station", Order: 0, Score: 0, Category: ""}, // railway station + dbTag{Name: "surname", Order: 0, Score: 0, Category: ""}, // family or surname + dbTag{Name: "uk", Order: 0, Score: 0, Category: ""}, // word usually written using kana alone + dbTag{Name: "unclass", Order: 0, Score: 0, Category: ""}, // unclassified name + dbTag{Name: "vulg", Order: 0, Score: 0, Category: ""}, // vulgar expression or word + dbTag{Name: "work", Order: 0, Score: 0, Category: ""}, // work of art, literature, music, etc. 
name + dbTag{Name: "X", Order: 0, Score: 0, Category: ""}, // rude or X-rated term (not displayed in educational software) + dbTag{Name: "yoji", Order: 0, Score: 0, Category: ""}, // yojijukugo + + // part-of-speech info + dbTag{Name: "adj-f", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun or verb acting prenominally + dbTag{Name: "adj-i", Order: -3, Score: 0, Category: "partOfSpeech"}, // adjective (keiyoushi) + dbTag{Name: "adj-ix", Order: -3, Score: 0, Category: "partOfSpeech"}, // adjective (keiyoushi) - yoi/ii class + dbTag{Name: "adj-kari", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'kari' adjective (archaic) + dbTag{Name: "adj-ku", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'ku' adjective (archaic) + dbTag{Name: "adj-na", Order: -3, Score: 0, Category: "partOfSpeech"}, // adjectival nouns or quasi-adjectives (keiyodoshi) + dbTag{Name: "adj-nari", Order: -3, Score: 0, Category: "partOfSpeech"}, // archaic/formal form of na-adjective + dbTag{Name: "adj-no", Order: -3, Score: 0, Category: "partOfSpeech"}, // nouns which may take the genitive case particle 'no' + dbTag{Name: "adj-pn", Order: -3, Score: 0, Category: "partOfSpeech"}, // pre-noun adjectival (rentaishi) + dbTag{Name: "adj-shiku", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'shiku' adjective (archaic) + dbTag{Name: "adj-t", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'taru' adjective + dbTag{Name: "adv", Order: -3, Score: 0, Category: "partOfSpeech"}, // adverb (fukushi) + dbTag{Name: "adv-to", Order: -3, Score: 0, Category: "partOfSpeech"}, // adverb taking the 'to' particle + dbTag{Name: "aux", Order: -3, Score: 0, Category: "partOfSpeech"}, // auxiliary + dbTag{Name: "aux-adj", Order: -3, Score: 0, Category: "partOfSpeech"}, // auxiliary adjective + dbTag{Name: "aux-v", Order: -3, Score: 0, Category: "partOfSpeech"}, // auxiliary verb + dbTag{Name: "conj", Order: -3, Score: 0, Category: "partOfSpeech"}, // conjunction + dbTag{Name: "cop", Order: -3, Score: 0, 
Category: "partOfSpeech"}, // copula + dbTag{Name: "ctr", Order: -3, Score: 0, Category: "partOfSpeech"}, // counter + dbTag{Name: "exp", Order: -5, Score: 0, Category: "expression"}, // expressions (phrases, clauses, etc.) + dbTag{Name: "int", Order: -3, Score: 0, Category: "partOfSpeech"}, // interjection (kandoushi) + dbTag{Name: "n", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun (common) (futsuumeishi) + dbTag{Name: "n-adv", Order: -3, Score: 0, Category: "partOfSpeech"}, // adverbial noun (fukushitekimeishi) + dbTag{Name: "n-pr", Order: -3, Score: 0, Category: "partOfSpeech"}, // proper noun + dbTag{Name: "n-pref", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun, used as a prefix + dbTag{Name: "n-suf", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun, used as a suffix + dbTag{Name: "n-t", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun (temporal) (jisoumeishi) + dbTag{Name: "num", Order: -3, Score: 0, Category: "partOfSpeech"}, // numeric + dbTag{Name: "pn", Order: -3, Score: 0, Category: "partOfSpeech"}, // pronoun + dbTag{Name: "pref", Order: -3, Score: 0, Category: "partOfSpeech"}, // prefix + dbTag{Name: "prt", Order: -3, Score: 0, Category: "partOfSpeech"}, // particle + dbTag{Name: "suf", Order: -3, Score: 0, Category: "partOfSpeech"}, // suffix + dbTag{Name: "unc", Order: -3, Score: 0, Category: "partOfSpeech"}, // unclassified + dbTag{Name: "v-unspec", Order: -3, Score: 0, Category: "partOfSpeech"}, // verb unspecified + dbTag{Name: "v1", Order: -3, Score: 0, Category: "partOfSpeech"}, // Ichidan verb + dbTag{Name: "v1-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Ichidan verb - kureru special class + dbTag{Name: "v2a-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb with 'u' ending (archaic) + dbTag{Name: "v2b-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'bu' ending (archaic) + dbTag{Name: "v2b-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan 
verb (lower class) with 'bu' ending (archaic) + dbTag{Name: "v2d-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'dzu' ending (archaic) + dbTag{Name: "v2d-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'dzu' ending (archaic) + dbTag{Name: "v2g-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'gu' ending (archaic) + dbTag{Name: "v2g-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'gu' ending (archaic) + dbTag{Name: "v2h-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'hu/fu' ending (archaic) + dbTag{Name: "v2h-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'hu/fu' ending (archaic) + dbTag{Name: "v2k-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'ku' ending (archaic) + dbTag{Name: "v2k-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'ku' ending (archaic) + dbTag{Name: "v2m-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'mu' ending (archaic) + dbTag{Name: "v2m-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'mu' ending (archaic) + dbTag{Name: "v2n-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'nu' ending (archaic) + dbTag{Name: "v2r-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'ru' ending (archaic) + dbTag{Name: "v2r-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'ru' ending (archaic) + dbTag{Name: "v2s-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'su' ending (archaic) + dbTag{Name: "v2t-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'tsu' ending (archaic) + dbTag{Name: "v2t-s", Order: -3, Score: 0, Category: 
"partOfSpeech"}, // Nidan verb (lower class) with 'tsu' ending (archaic) + dbTag{Name: "v2w-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'u' ending and 'we' conjugation (archaic) + dbTag{Name: "v2y-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'yu' ending (archaic) + dbTag{Name: "v2y-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'yu' ending (archaic) + dbTag{Name: "v2z-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'zu' ending (archaic) + dbTag{Name: "v4b", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'bu' ending (archaic) + dbTag{Name: "v4g", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'gu' ending (archaic) + dbTag{Name: "v4h", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'hu/fu' ending (archaic) + dbTag{Name: "v4k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'ku' ending (archaic) + dbTag{Name: "v4m", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'mu' ending (archaic) + dbTag{Name: "v4n", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'nu' ending (archaic) + dbTag{Name: "v4r", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'ru' ending (archaic) + dbTag{Name: "v4s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'su' ending (archaic) + dbTag{Name: "v4t", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'tsu' ending (archaic) + dbTag{Name: "v5aru", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb - -aru special class + dbTag{Name: "v5b", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'bu' ending + dbTag{Name: "v5g", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'gu' ending + dbTag{Name: "v5k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'ku' ending + 
dbTag{Name: "v5k-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb - Iku/Yuku special class + dbTag{Name: "v5m", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'mu' ending + dbTag{Name: "v5n", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'nu' ending + dbTag{Name: "v5r", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'ru' ending + dbTag{Name: "v5r-i", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'ru' ending (irregular verb) + dbTag{Name: "v5s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'su' ending + dbTag{Name: "v5t", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'tsu' ending + dbTag{Name: "v5u", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'u' ending + dbTag{Name: "v5u-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'u' ending (special class) + dbTag{Name: "v5uru", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb - Uru old class verb (old form of Eru) + dbTag{Name: "vi", Order: -3, Score: 0, Category: "partOfSpeech"}, // intransitive verb + dbTag{Name: "vk", Order: -3, Score: 0, Category: "partOfSpeech"}, // Kuru verb - special class + dbTag{Name: "vn", Order: -3, Score: 0, Category: "partOfSpeech"}, // irregular nu verb + dbTag{Name: "vr", Order: -3, Score: 0, Category: "partOfSpeech"}, // irregular ru verb, plain form ends with -ri + dbTag{Name: "vs", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun or participle which takes the aux. 
verb suru + dbTag{Name: "vs-c", Order: -3, Score: 0, Category: "partOfSpeech"}, // su verb - precursor to the modern suru + dbTag{Name: "vs-i", Order: -3, Score: 0, Category: "partOfSpeech"}, // suru verb - included + dbTag{Name: "vs-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // suru verb - special class + dbTag{Name: "vt", Order: -3, Score: 0, Category: "partOfSpeech"}, // transitive verb + dbTag{Name: "vz", Order: -3, Score: 0, Category: "partOfSpeech"}, // Ichidan verb - zuru verb (alternative form of -jiru verbs) + + // usage domain + dbTag{Name: "agric", Order: 0, Score: 0, Category: ""}, // agriculture + dbTag{Name: "anat", Order: 0, Score: 0, Category: ""}, // anatomy + dbTag{Name: "archeol", Order: 0, Score: 0, Category: ""}, // archeology + dbTag{Name: "archit", Order: 0, Score: 0, Category: ""}, // architecture + dbTag{Name: "art", Order: 0, Score: 0, Category: ""}, // art, aesthetics + dbTag{Name: "astron", Order: 0, Score: 0, Category: ""}, // astronomy + dbTag{Name: "audvid", Order: 0, Score: 0, Category: ""}, // audiovisual + dbTag{Name: "aviat", Order: 0, Score: 0, Category: ""}, // aviation + dbTag{Name: "baseb", Order: 0, Score: 0, Category: ""}, // baseball + dbTag{Name: "biochem", Order: 0, Score: 0, Category: ""}, // biochemistry + dbTag{Name: "biol", Order: 0, Score: 0, Category: ""}, // biology + dbTag{Name: "bot", Order: 0, Score: 0, Category: ""}, // botany + dbTag{Name: "Buddh", Order: 0, Score: 0, Category: ""}, // Buddhism + dbTag{Name: "bus", Order: 0, Score: 0, Category: ""}, // business + dbTag{Name: "cards", Order: 0, Score: 0, Category: ""}, // card games + dbTag{Name: "chem", Order: 0, Score: 0, Category: ""}, // chemistry + dbTag{Name: "Christn", Order: 0, Score: 0, Category: ""}, // Christianity + dbTag{Name: "cloth", Order: 0, Score: 0, Category: ""}, // clothing + dbTag{Name: "comp", Order: 0, Score: 0, Category: ""}, // computing + dbTag{Name: "cryst", Order: 0, Score: 0, Category: ""}, // crystallography + dbTag{Name: 
"dent", Order: 0, Score: 0, Category: ""}, // dentistry + dbTag{Name: "ecol", Order: 0, Score: 0, Category: ""}, // ecology + dbTag{Name: "econ", Order: 0, Score: 0, Category: ""}, // economics + dbTag{Name: "elec", Order: 0, Score: 0, Category: ""}, // electricity, elec. eng. + dbTag{Name: "electr", Order: 0, Score: 0, Category: ""}, // electronics + dbTag{Name: "embryo", Order: 0, Score: 0, Category: ""}, // embryology + dbTag{Name: "engr", Order: 0, Score: 0, Category: ""}, // engineering + dbTag{Name: "ent", Order: 0, Score: 0, Category: ""}, // entomology + dbTag{Name: "film", Order: 0, Score: 0, Category: ""}, // film + dbTag{Name: "finc", Order: 0, Score: 0, Category: ""}, // finance + dbTag{Name: "fish", Order: 0, Score: 0, Category: ""}, // fishing + dbTag{Name: "food", Order: 0, Score: 0, Category: ""}, // food, cooking + dbTag{Name: "gardn", Order: 0, Score: 0, Category: ""}, // gardening, horticulture + dbTag{Name: "genet", Order: 0, Score: 0, Category: ""}, // genetics + dbTag{Name: "geogr", Order: 0, Score: 0, Category: ""}, // geography + dbTag{Name: "geol", Order: 0, Score: 0, Category: ""}, // geology + dbTag{Name: "geom", Order: 0, Score: 0, Category: ""}, // geometry + dbTag{Name: "go", Order: 0, Score: 0, Category: ""}, // go (game) + dbTag{Name: "golf", Order: 0, Score: 0, Category: ""}, // golf + dbTag{Name: "gramm", Order: 0, Score: 0, Category: ""}, // grammar + dbTag{Name: "grmyth", Order: 0, Score: 0, Category: ""}, // Greek mythology + dbTag{Name: "hanaf", Order: 0, Score: 0, Category: ""}, // hanafuda + dbTag{Name: "horse", Order: 0, Score: 0, Category: ""}, // horse racing + dbTag{Name: "kabuki", Order: 0, Score: 0, Category: ""}, // kabuki + dbTag{Name: "law", Order: 0, Score: 0, Category: ""}, // law + dbTag{Name: "ling", Order: 0, Score: 0, Category: ""}, // linguistics + dbTag{Name: "logic", Order: 0, Score: 0, Category: ""}, // logic + dbTag{Name: "MA", Order: 0, Score: 0, Category: ""}, // martial arts + dbTag{Name: "mahj", Order: 
0, Score: 0, Category: ""}, // mahjong + dbTag{Name: "manga", Order: 0, Score: 0, Category: ""}, // manga + dbTag{Name: "math", Order: 0, Score: 0, Category: ""}, // mathematics + dbTag{Name: "mech", Order: 0, Score: 0, Category: ""}, // mechanical engineering + dbTag{Name: "med", Order: 0, Score: 0, Category: ""}, // medicine + dbTag{Name: "met", Order: 0, Score: 0, Category: ""}, // meteorology + dbTag{Name: "mil", Order: 0, Score: 0, Category: ""}, // military + dbTag{Name: "mining", Order: 0, Score: 0, Category: ""}, // mining + dbTag{Name: "music", Order: 0, Score: 0, Category: ""}, // music + dbTag{Name: "noh", Order: 0, Score: 0, Category: ""}, // noh + dbTag{Name: "ornith", Order: 0, Score: 0, Category: ""}, // ornithology + dbTag{Name: "paleo", Order: 0, Score: 0, Category: ""}, // paleontology + dbTag{Name: "pathol", Order: 0, Score: 0, Category: ""}, // pathology + dbTag{Name: "pharm", Order: 0, Score: 0, Category: ""}, // pharmacy + dbTag{Name: "phil", Order: 0, Score: 0, Category: ""}, // philosophy + dbTag{Name: "photo", Order: 0, Score: 0, Category: ""}, // photography + dbTag{Name: "physics", Order: 0, Score: 0, Category: ""}, // physics + dbTag{Name: "physiol", Order: 0, Score: 0, Category: ""}, // physiology + dbTag{Name: "politics", Order: 0, Score: 0, Category: ""}, // politics + dbTag{Name: "print", Order: 0, Score: 0, Category: ""}, // printing + dbTag{Name: "psy", Order: 0, Score: 0, Category: ""}, // psychiatry + dbTag{Name: "psyanal", Order: 0, Score: 0, Category: ""}, // psychoanalysis + dbTag{Name: "psych", Order: 0, Score: 0, Category: ""}, // psychology + dbTag{Name: "rail", Order: 0, Score: 0, Category: ""}, // railway + dbTag{Name: "rommyth", Order: 0, Score: 0, Category: ""}, // Roman mythology + dbTag{Name: "Shinto", Order: 0, Score: 0, Category: ""}, // Shinto + dbTag{Name: "shogi", Order: 0, Score: 0, Category: ""}, // shogi + dbTag{Name: "ski", Order: 0, Score: 0, Category: ""}, // skiing + dbTag{Name: "sports", Order: 0, Score: 
0, Category: ""}, // sports + dbTag{Name: "stat", Order: 0, Score: 0, Category: ""}, // statistics + dbTag{Name: "stockm", Order: 0, Score: 0, Category: ""}, // stock market + dbTag{Name: "sumo", Order: 0, Score: 0, Category: ""}, // sumo + dbTag{Name: "telec", Order: 0, Score: 0, Category: ""}, // telecommunications + dbTag{Name: "tradem", Order: 0, Score: 0, Category: ""}, // trademark + dbTag{Name: "tv", Order: 0, Score: 0, Category: ""}, // television + dbTag{Name: "vidg", Order: 0, Score: 0, Category: ""}, // video games + dbTag{Name: "zool", Order: 0, Score: 0, Category: ""}, // zoology + + // dialect + dbTag{Name: "bra", Order: 0, Score: 0, Category: ""}, // Brazilian + dbTag{Name: "hob", Order: 0, Score: 0, Category: ""}, // Hokkaido-ben + dbTag{Name: "ksb", Order: 0, Score: 0, Category: ""}, // Kansai-ben + dbTag{Name: "ktb", Order: 0, Score: 0, Category: ""}, // Kantou-ben + dbTag{Name: "kyb", Order: 0, Score: 0, Category: ""}, // Kyoto-ben + dbTag{Name: "kyu", Order: 0, Score: 0, Category: ""}, // Kyuushuu-ben + dbTag{Name: "nab", Order: 0, Score: 0, Category: ""}, // Nagano-ben + dbTag{Name: "osb", Order: 0, Score: 0, Category: ""}, // Osaka-ben + dbTag{Name: "rkb", Order: 0, Score: 0, Category: ""}, // Ryuukyuu-ben + dbTag{Name: "thb", Order: 0, Score: 0, Category: ""}, // Touhoku-ben + dbTag{Name: "tsb", Order: 0, Score: 0, Category: ""}, // Tosa-ben + dbTag{Name: "tsug", Order: 0, Score: 0, Category: ""}, // Tsugaru-ben + } +} diff --git a/kanjidic.go b/kanjidic.go index 37bebdd..5474aed 100644 --- a/kanjidic.go +++ b/kanjidic.go @@ -7,8 +7,6 @@ import ( "foosoft.net/projects/jmdict" ) -const kanjidicRevision = "kanjidic2" - func kanjidicExtractKanji(entry jmdict.KanjidicCharacter, language string) *dbKanji { if entry.ReadingMeaning == nil { return nil @@ -161,11 +159,16 @@ func kanjidicExportDb(inputPath, outputPath, language, title string, stride int, "tag": tags.crush(), } + index := dbIndex{ + Title: title, + Revision: "kanjidic2", + Sequenced: 
false, + Attribution: edrdgAttribution, + } + return writeDb( outputPath, - title, - kanjidicRevision, - false, + index, recordData, stride, pretty, diff --git a/kotowaza.go b/kotowaza.go index 7f713ae..fca8f7d 100644 --- a/kotowaza.go +++ b/kotowaza.go @@ -72,7 +72,7 @@ func (e *kotowazaExtractor) extractTerms(entry zig.BookEntry, sequence int) []db term := dbTerm{ Expression: expression, Reading: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } diff --git a/koujien.go b/koujien.go index 89b7379..049d5a0 100644 --- a/koujien.go +++ b/koujien.go @@ -75,7 +75,7 @@ func (e *koujienExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT for _, reading := range readings { term := dbTerm{ Expression: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } @@ -89,7 +89,7 @@ func (e *koujienExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT term := dbTerm{ Expression: expression, Reading: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } diff --git a/meikyou.go b/meikyou.go index 78a3081..2ea33fe 100644 --- a/meikyou.go +++ b/meikyou.go @@ -106,7 +106,7 @@ func (e *meikyouExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT for _, reading := range readings { term := dbTerm{ Expression: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } @@ -120,7 +120,7 @@ func (e *meikyouExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT term := dbTerm{ Expression: expression, Reading: reading, - Glossary: []string{entry.Text}, + Glossary: []any{entry.Text}, Sequence: sequence, } diff --git a/rikai.go b/rikai.go index 651bc44..bfc5307 100644 --- a/rikai.go +++ b/rikai.go @@ -8,8 +8,6 @@ import ( _ "github.com/mattn/go-sqlite3" ) -const rikaiRevision = "rikai2" - type rikaiEntry struct { kanji string kana string @@ -154,11 +152,15 @@ func rikaiExportDb(inputPath, 
outputPath, language, title string, stride int, pr "tag": tags.crush(), } + index := dbIndex{ + Title: title, + Revision: "rikai2", + Sequenced: true, + } + return writeDb( outputPath, - title, - rikaiRevision, - true, + index, recordData, stride, pretty, diff --git a/scripts/build_dicts.sh b/scripts/build_dicts.sh index b91d9a5..df63ac6 100755 --- a/scripts/build_dicts.sh +++ b/scripts/build_dicts.sh @@ -5,13 +5,24 @@ go get foosoft.net/projects/yomichan-import/yomichan mkdir -p src mkdir -p dst -if [ ! -f src/JMdict ]; then - wget http://ftp.monash.edu/pub/nihongo/JMdict.gz - gunzip -c JMdict.gz > src/JMdict -fi +function refresh_source () { + NOW=$(date '+%s') + YESTERDAY=$((NOW - 86400)) # 86,400 seconds in 24 hours + if [ ! -f "src/$1" ]; then + wget "ftp.edrdg.org/pub/Nihongo/$1.gz" + gunzip -c "$1.gz" > "src/$1" + elif [[ $YESTERDAY -gt $(date -r "src/$1" '+%s') ]]; then + rsync "ftp.edrdg.org::nihongo/$1" "src/$1" + fi +} -yomichan -language="dutch" -title="JMdict (Dutch)" src/JMdict dst/jmdict_dutch.zip +refresh_source "JMdict_e_examp" +yomichan -language="english_extra" -title="JMdict" src/JMdict_e_examp dst/jmdict_english_extra_with_examples.zip + +refresh_source "JMdict" +yomichan -language="english_extra" -title="JMdict" src/JMdict dst/jmdict_english_extra.zip yomichan -language="english" -title="JMdict (English)" src/JMdict dst/jmdict_english.zip +yomichan -language="dutch" -title="JMdict (Dutch)" src/JMdict dst/jmdict_dutch.zip yomichan -language="french" -title="JMdict (French)" src/JMdict dst/jmdict_french.zip yomichan -language="german" -title="JMdict (German)" src/JMdict dst/jmdict_german.zip yomichan -language="hungarian" -title="JMdict (Hungarian)" src/JMdict dst/jmdict_hungarian.zip @@ -20,19 +31,13 @@ yomichan -language="slovenian" -title="JMdict (Slovenian)" src/JMdict dst/jmdict yomichan -language="spanish" -title="JMdict (Spanish)" src/JMdict dst/jmdict_spanish.zip yomichan -language="swedish" -title="JMdict (Swedish)" src/JMdict 
// contentAttr holds the optional attributes that the content* builders copy
// onto a structured-content element. A zero-valued field means "not set" and
// is left out of the generated element.
type contentAttr struct {
	lang               string
	fontStyle          string   // normal, italic
	fontWeight         string   // normal, bold
	fontSize           string   // small, medium, large, smaller, 80%, 125%, etc.
	textDecorationLine []string // underline, overline, line-through
	verticalAlign      string   // baseline, sub, super, text-top, text-bottom, middle, top, bottom
	textAlign          string   // start, end, left, right, center, justify, justify-all, match-parent
	marginTop          int
	marginLeft         int
	marginRight        int
	marginBottom       int
	listStyleType      string
	data               map[string]string
}

// contentReduce collapses a list of content items into its shortest form.
//
// Adjacent strings are concatenated:
//
//	["one", "two", content_structure, "four"] -> ["onetwo", content_structure, "four"]
//
// If only a single item remains, that item is returned instead of a list:
//
//	["one", "two"] -> "onetwo"
//
// A list holding exactly one item returns that item unchanged. Empty strings
// contribute nothing, so a list made up only of empty strings (or an empty
// list) yields an empty []any.
func contentReduce(contents []any) any {
	if len(contents) == 1 {
		return contents[0]
	}
	reduced := []any{}
	pending := "" // run of adjacent strings not yet appended to reduced
	flush := func() {
		if pending != "" {
			reduced = append(reduced, pending)
			pending = ""
		}
	}
	for _, content := range contents {
		if text, ok := content.(string); ok {
			pending += text
			continue
		}
		flush()
		reduced = append(reduced, content)
	}
	flush()
	if len(reduced) == 1 {
		return reduced[0]
	}
	return reduced
}

// contentSetCommonAttrs copies the lang and data attributes onto element when
// they are set in attr.
func contentSetCommonAttrs(element map[string]any, attr contentAttr) {
	if attr.lang != "" {
		element["lang"] = attr.lang
	}
	if len(attr.data) != 0 {
		element["data"] = attr.data
	}
}

// contentStructure wraps the reduced contents in a top-level
// "structured-content" object.
func contentStructure(contents ...any) map[string]any {
	return map[string]any{
		"type":    "structured-content",
		"content": contentReduce(contents),
	}
}

// contentRuby builds a "ruby" element: the reduced contents followed by the
// ruby text in an "rt" element surrounded by parenthesis "rp" elements.
// Only the lang and data attributes of attr are used.
func contentRuby(attr contentAttr, ruby string, contents ...any) map[string]any {
	rubyContent := map[string]any{
		"tag": "ruby",
		"content": []any{
			contentReduce(contents),
			map[string]string{"tag": "rp", "content": "("},
			map[string]string{"tag": "rt", "content": ruby},
			map[string]string{"tag": "rp", "content": ")"},
		},
	}
	contentSetCommonAttrs(rubyContent, attr)
	return rubyContent
}

// contentInternalLink builds an "a" element whose href is a query for the
// given text with wildcards turned off. When no contents are given, the query
// itself is used as the link text. Only the lang and data attributes of attr
// are used.
//
// NOTE(review): query is inserted into the href verbatim (not URL-escaped);
// a query containing "&" or "#" would presumably yield a malformed link —
// confirm whether callers can pass such text.
func contentInternalLink(attr contentAttr, query string, contents ...any) map[string]any {
	linkContent := map[string]any{
		"tag":  "a",
		"href": "?query=" + query + "&wildcards=off",
	}
	if len(contents) == 0 {
		linkContent["content"] = query
	} else {
		linkContent["content"] = contentReduce(contents)
	}
	contentSetCommonAttrs(linkContent, attr)
	return linkContent
}

// contentSpan builds a "span" element.
func contentSpan(attr contentAttr, contents ...any) map[string]any {
	return contentStyledContainer(attr, "span", contents...)
}

// contentDiv builds a "div" element.
func contentDiv(attr contentAttr, contents ...any) map[string]any {
	return contentStyledContainer(attr, "div", contents...)
}

// contentListItem builds an "li" element.
func contentListItem(attr contentAttr, contents ...any) map[string]any {
	return contentStyledContainer(attr, "li", contents...)
}

// contentOrderedList builds an "ol" element.
func contentOrderedList(attr contentAttr, contents ...any) map[string]any {
	return contentStyledContainer(attr, "ol", contents...)
}

// contentUnorderedList builds a "ul" element.
func contentUnorderedList(attr contentAttr, contents ...any) map[string]any {
	return contentStyledContainer(attr, "ul", contents...)
}

// contentTable builds a "table" element.
func contentTable(attr contentAttr, contents ...any) map[string]any {
	return contentStyledContainer(attr, "table", contents...)
}

// contentTableHead builds a "thead" element.
func contentTableHead(attr contentAttr, contents ...any) map[string]any {
	return contentStyledContainer(attr, "thead", contents...)
}

// contentTableBody builds a "tbody" element.
func contentTableBody(attr contentAttr, contents ...any) map[string]any {
	return contentStyledContainer(attr, "tbody", contents...)
}

// contentTableRow builds a "tr" element.
func contentTableRow(attr contentAttr, contents ...any) map[string]any {
	return contentStyledContainer(attr, "tr", contents...)
}

// contentTableHeadCell builds a "th" element.
func contentTableHeadCell(attr contentAttr, contents ...any) map[string]any {
	return contentStyledContainer(attr, "th", contents...)
}

// contentTableCell builds a "td" element.
func contentTableCell(attr contentAttr, contents ...any) map[string]any {
	return contentStyledContainer(attr, "td", contents...)
}

// contentStyledContainer builds an element with the given tag holding the
// reduced contents. The lang and data attributes are added when set, and a
// "style" object is added only when attr sets at least one style property.
func contentStyledContainer(attr contentAttr, tag string, contents ...any) map[string]any {
	container := map[string]any{
		"tag":     tag,
		"content": contentReduce(contents),
	}
	contentSetCommonAttrs(container, attr)
	if style := contentStyle(attr); len(style) != 0 {
		container["style"] = style
	}
	return container
}

// contentStyle converts the style-related fields of attr into a style map,
// leaving out every field that still has its zero value. The returned map is
// never nil but may be empty.
func contentStyle(attr contentAttr) map[string]any {
	style := make(map[string]any)
	setText := func(key, value string) {
		if value != "" {
			style[key] = value
		}
	}
	setMargin := func(key string, value int) {
		if value != 0 {
			style[key] = value
		}
	}

	setText("fontStyle", attr.fontStyle)
	setText("fontWeight", attr.fontWeight)
	setText("fontSize", attr.fontSize)
	if len(attr.textDecorationLine) != 0 {
		style["textDecorationLine"] = attr.textDecorationLine
	}
	setText("verticalAlign", attr.verticalAlign)
	setText("textAlign", attr.textAlign)
	setMargin("marginTop", attr.marginTop)
	setMargin("marginLeft", attr.marginLeft)
	setMargin("marginRight", attr.marginRight)
	setMargin("marginBottom", attr.marginBottom)
	setText("listStyleType", attr.listStyleType)
	return style
}