Add new JMdict version
This commit is contained in:
parent
73fb992865
commit
abc28bb19d
@ -306,7 +306,7 @@ func detectFormat(path string) (string, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
switch filepath.Base(path) {
|
switch filepath.Base(path) {
|
||||||
case "JMdict", "JMdict.xml", "JMdict_e", "JMdict_e.xml":
|
case "JMdict", "JMdict.xml", "JMdict_e", "JMdict_e.xml", "JMdict_e_examp":
|
||||||
return "edict", nil
|
return "edict", nil
|
||||||
case "JMnedict", "JMnedict.xml":
|
case "JMnedict", "JMnedict.xml":
|
||||||
return "enamdict", nil
|
return "enamdict", nil
|
||||||
@ -336,7 +336,8 @@ func detectFormat(path string) (string, error) {
|
|||||||
|
|
||||||
func ExportDb(inputPath, outputPath, format, language, title string, stride int, pretty bool) error {
|
func ExportDb(inputPath, outputPath, format, language, title string, stride int, pretty bool) error {
|
||||||
handlers := map[string]func(string, string, string, string, int, bool) error{
|
handlers := map[string]func(string, string, string, string, int, bool) error{
|
||||||
"edict": jmdictExportDb,
|
"edict": jmdExportDb,
|
||||||
|
"forms": formsExportDb,
|
||||||
"enamdict": jmnedictExportDb,
|
"enamdict": jmnedictExportDb,
|
||||||
"epwing": epwingExportDb,
|
"epwing": epwingExportDb,
|
||||||
"kanjidic": kanjidicExportDb,
|
"kanjidic": kanjidicExportDb,
|
||||||
|
252
edict.go
252
edict.go
@ -1,252 +0,0 @@
|
|||||||
package yomichan
|
|
||||||
|
|
||||||
import (
|
|
||||||
"os"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"foosoft.net/projects/jmdict"
|
|
||||||
)
|
|
||||||
|
|
||||||
const edrdgAttribution = "This publication has included material from the JMdict (EDICT, etc.) dictionary files in accordance with the licence provisions of the Electronic Dictionaries Research Group. See http://www.edrdg.org/"
|
|
||||||
|
|
||||||
func jmdictBuildRules(term *dbTerm) {
|
|
||||||
for _, tag := range term.DefinitionTags {
|
|
||||||
switch tag {
|
|
||||||
case "adj-i", "v1", "vk", "vz":
|
|
||||||
term.addRules(tag)
|
|
||||||
default:
|
|
||||||
if strings.HasPrefix(tag, "v5") {
|
|
||||||
term.addRules("v5")
|
|
||||||
} else if strings.HasPrefix(tag, "vs-") {
|
|
||||||
term.addRules("vs")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func jmdictBuildScore(term *dbTerm) {
|
|
||||||
for _, tag := range term.DefinitionTags {
|
|
||||||
switch tag {
|
|
||||||
case "arch":
|
|
||||||
term.Score -= 100
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, tag := range term.TermTags {
|
|
||||||
switch tag {
|
|
||||||
case "news", "ichi", "spec", "gai1":
|
|
||||||
term.Score += 100
|
|
||||||
case "P":
|
|
||||||
term.Score += 500
|
|
||||||
case "iK", "ik", "ok", "oK", "io", "oik":
|
|
||||||
term.Score -= 100
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func jmdictAddPriorities(term *dbTerm, priorities ...string) {
|
|
||||||
for _, priority := range priorities {
|
|
||||||
switch priority {
|
|
||||||
case "news1", "ichi1", "spec1", "gai1":
|
|
||||||
term.addTermTags("P")
|
|
||||||
fallthrough
|
|
||||||
case "news2", "ichi2", "spec2", "gai2":
|
|
||||||
term.addTermTags(priority[:len(priority)-1])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func jmdictBuildTagMeta(entities map[string]string) dbTagList {
|
|
||||||
tags := dbTagList{
|
|
||||||
dbTag{Name: "news", Notes: "appears frequently in Mainichi Shimbun", Category: "frequent", Order: -2},
|
|
||||||
dbTag{Name: "ichi", Notes: "listed as common in Ichimango Goi Bunruishuu", Category: "frequent", Order: -2},
|
|
||||||
dbTag{Name: "spec", Notes: "common words not included in frequency lists", Category: "frequent", Order: -2},
|
|
||||||
dbTag{Name: "gai", Notes: "common loanword", Category: "frequent", Order: -2},
|
|
||||||
dbTag{Name: "P", Notes: "popular term", Category: "popular", Order: -10, Score: 10},
|
|
||||||
}
|
|
||||||
|
|
||||||
for name, value := range entities {
|
|
||||||
tag := dbTag{Name: name, Notes: value}
|
|
||||||
|
|
||||||
switch name {
|
|
||||||
case "exp", "id":
|
|
||||||
tag.Category = "expression"
|
|
||||||
tag.Order = -5
|
|
||||||
case "arch":
|
|
||||||
tag.Category = "archaism"
|
|
||||||
tag.Order = -4
|
|
||||||
case "iK", "ik", "ok", "oK", "io", "oik":
|
|
||||||
tag.Score = -5
|
|
||||||
case "adj-f", "adj-i", "adj-ix", "adj-ku", "adj-na", "adj-nari", "adj-no", "adj-pn", "adj-shiku", "adj-t", "adv", "adv-to", "aux-adj",
|
|
||||||
"aux", "aux-v", "conj", "cop-da", "ctr", "int", "n-adv", "n", "n-pref", "n-pr", "n-suf", "n-t", "num", "pn", "pref", "prt", "suf",
|
|
||||||
"unc", "v1", "v1-s", "v2a-s", "v2b-k", "v2d-s", "v2g-k", "v2g-s", "v2h-k", "v2h-s", "v2k-k", "v2k-s", "v2m-s", "v2n-s", "v2r-k",
|
|
||||||
"v2r-s", "v2s-s", "v2t-k", "v2t-s", "v2w-s", "v2y-k", "v2y-s", "v2z-s", "v4b", "v4h", "v4k", "v4m", "v4r", "v4s", "v4t", "v5aru",
|
|
||||||
"v5b", "v5g", "v5k", "v5k-s", "v5m", "v5n", "v5r-i", "v5r", "v5s", "v5t", "v5u", "v5u-s", "vi", "vk", "vn", "vr", "vs-c", "vs-i",
|
|
||||||
"vs", "vs-s", "vt", "vz":
|
|
||||||
tag.Category = "partOfSpeech"
|
|
||||||
tag.Order = -3
|
|
||||||
}
|
|
||||||
|
|
||||||
tags = append(tags, tag)
|
|
||||||
}
|
|
||||||
|
|
||||||
return tags
|
|
||||||
}
|
|
||||||
|
|
||||||
func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm {
|
|
||||||
var terms []dbTerm
|
|
||||||
|
|
||||||
convert := func(reading jmdict.JmdictReading, kanji *jmdict.JmdictKanji) {
|
|
||||||
if kanji != nil && reading.Restrictions != nil && !hasString(kanji.Expression, reading.Restrictions) {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
var termBase dbTerm
|
|
||||||
termBase.addTermTags(reading.Information...)
|
|
||||||
|
|
||||||
if kanji == nil {
|
|
||||||
termBase.Expression = reading.Reading
|
|
||||||
jmdictAddPriorities(&termBase, reading.Priorities...)
|
|
||||||
} else {
|
|
||||||
termBase.Expression = kanji.Expression
|
|
||||||
termBase.Reading = reading.Reading
|
|
||||||
termBase.addTermTags(kanji.Information...)
|
|
||||||
|
|
||||||
for _, priority := range kanji.Priorities {
|
|
||||||
if hasString(priority, reading.Priorities) {
|
|
||||||
jmdictAddPriorities(&termBase, priority)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var partsOfSpeech []string
|
|
||||||
for index, sense := range edictEntry.Sense {
|
|
||||||
|
|
||||||
if len(sense.PartsOfSpeech) != 0 {
|
|
||||||
partsOfSpeech = sense.PartsOfSpeech
|
|
||||||
}
|
|
||||||
|
|
||||||
if sense.RestrictedReadings != nil && !hasString(reading.Reading, sense.RestrictedReadings) {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if kanji != nil && sense.RestrictedKanji != nil && !hasString(kanji.Expression, sense.RestrictedKanji) {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
term := dbTerm{
|
|
||||||
Reading: termBase.Reading,
|
|
||||||
Expression: termBase.Expression,
|
|
||||||
Score: len(edictEntry.Sense) - index,
|
|
||||||
Sequence: edictEntry.Sequence,
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, glossary := range sense.Glossary {
|
|
||||||
if glossary.Language == nil && language == "" || glossary.Language != nil && language == *glossary.Language {
|
|
||||||
term.Glossary = append(term.Glossary, glossary.Content)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(term.Glossary) == 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
term.addDefinitionTags(termBase.DefinitionTags...)
|
|
||||||
term.addTermTags(termBase.TermTags...)
|
|
||||||
term.addDefinitionTags(partsOfSpeech...)
|
|
||||||
term.addDefinitionTags(sense.Fields...)
|
|
||||||
term.addDefinitionTags(sense.Misc...)
|
|
||||||
term.addDefinitionTags(sense.Dialects...)
|
|
||||||
|
|
||||||
jmdictBuildRules(&term)
|
|
||||||
jmdictBuildScore(&term)
|
|
||||||
|
|
||||||
terms = append(terms, term)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(edictEntry.Kanji) > 0 {
|
|
||||||
for _, kanji := range edictEntry.Kanji {
|
|
||||||
for _, reading := range edictEntry.Readings {
|
|
||||||
if reading.NoKanji == nil {
|
|
||||||
convert(reading, &kanji)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, reading := range edictEntry.Readings {
|
|
||||||
if reading.NoKanji != nil {
|
|
||||||
convert(reading, nil)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for _, reading := range edictEntry.Readings {
|
|
||||||
convert(reading, nil)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return terms
|
|
||||||
}
|
|
||||||
|
|
||||||
func jmdictExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
|
|
||||||
reader, err := os.Open(inputPath)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
defer reader.Close()
|
|
||||||
|
|
||||||
dict, entities, err := jmdict.LoadJmdictNoTransform(reader)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
var langTag string
|
|
||||||
switch language {
|
|
||||||
case "dutch":
|
|
||||||
langTag = "dut"
|
|
||||||
case "french":
|
|
||||||
langTag = "fre"
|
|
||||||
case "german":
|
|
||||||
langTag = "ger"
|
|
||||||
case "hungarian":
|
|
||||||
langTag = "hun"
|
|
||||||
case "italian":
|
|
||||||
langTag = "ita"
|
|
||||||
case "russian":
|
|
||||||
langTag = "rus"
|
|
||||||
case "slovenian":
|
|
||||||
langTag = "slv"
|
|
||||||
case "spanish":
|
|
||||||
langTag = "spa"
|
|
||||||
case "swedish":
|
|
||||||
langTag = "swe"
|
|
||||||
}
|
|
||||||
|
|
||||||
var terms dbTermList
|
|
||||||
for _, entry := range dict.Entries {
|
|
||||||
terms = append(terms, jmdictExtractTerms(entry, langTag)...)
|
|
||||||
}
|
|
||||||
|
|
||||||
if title == "" {
|
|
||||||
title = "JMdict"
|
|
||||||
}
|
|
||||||
|
|
||||||
recordData := map[string]dbRecordList{
|
|
||||||
"term": terms.crush(),
|
|
||||||
"tag": jmdictBuildTagMeta(entities).crush(),
|
|
||||||
}
|
|
||||||
|
|
||||||
index := dbIndex{
|
|
||||||
Title: title,
|
|
||||||
Revision: "jmdict4",
|
|
||||||
Sequenced: true,
|
|
||||||
Attribution: edrdgAttribution,
|
|
||||||
}
|
|
||||||
index.setDefaults()
|
|
||||||
|
|
||||||
return writeDb(
|
|
||||||
outputPath,
|
|
||||||
index,
|
|
||||||
recordData,
|
|
||||||
stride,
|
|
||||||
pretty,
|
|
||||||
)
|
|
||||||
}
|
|
1
go.mod
1
go.mod
@ -7,6 +7,7 @@ require (
|
|||||||
foosoft.net/projects/zero-epwing-go v0.0.0-20220704035039-bc008453615d
|
foosoft.net/projects/zero-epwing-go v0.0.0-20220704035039-bc008453615d
|
||||||
github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e
|
github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e
|
||||||
github.com/mattn/go-sqlite3 v1.14.14
|
github.com/mattn/go-sqlite3 v1.14.14
|
||||||
|
golang.org/x/exp v0.0.0-20221207211629-99ab8fa1c11f
|
||||||
)
|
)
|
||||||
|
|
||||||
require golang.org/x/text v0.3.7 // indirect
|
require golang.org/x/text v0.3.7 // indirect
|
||||||
|
2
go.sum
2
go.sum
@ -6,5 +6,7 @@ github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e h1:wSQCJiig/QkoUnpvelSP
|
|||||||
github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e/go.mod h1:5G2EjwzgZUPnnReoKvPWVneT8APYbyKkihDVAHUi0II=
|
github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e/go.mod h1:5G2EjwzgZUPnnReoKvPWVneT8APYbyKkihDVAHUi0II=
|
||||||
github.com/mattn/go-sqlite3 v1.14.14 h1:qZgc/Rwetq+MtyE18WhzjokPD93dNqLGNT3QJuLvBGw=
|
github.com/mattn/go-sqlite3 v1.14.14 h1:qZgc/Rwetq+MtyE18WhzjokPD93dNqLGNT3QJuLvBGw=
|
||||||
github.com/mattn/go-sqlite3 v1.14.14/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
|
github.com/mattn/go-sqlite3 v1.14.14/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
|
||||||
|
golang.org/x/exp v0.0.0-20221207211629-99ab8fa1c11f h1:90Jq/vvGVDsqj8QqCynjFw9MCerDguSMODLYII416Y8=
|
||||||
|
golang.org/x/exp v0.0.0-20221207211629-99ab8fa1c11f/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc=
|
||||||
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
|
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
|
||||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||||
|
221
jmdict.go
Normal file
221
jmdict.go
Normal file
@ -0,0 +1,221 @@
|
|||||||
|
package yomichan
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"regexp"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"foosoft.net/projects/jmdict"
|
||||||
|
"golang.org/x/exp/slices"
|
||||||
|
)
|
||||||
|
|
||||||
|
func grammarRules(partsOfSpeech []string) []string {
|
||||||
|
rules := []string{}
|
||||||
|
for _, partOfSpeech := range partsOfSpeech {
|
||||||
|
switch partOfSpeech {
|
||||||
|
case "adj-i", "vk", "vz":
|
||||||
|
rules = append(rules, partOfSpeech)
|
||||||
|
default:
|
||||||
|
if strings.HasPrefix(partOfSpeech, "v5") {
|
||||||
|
rules = append(rules, "v5")
|
||||||
|
} else if strings.HasPrefix(partOfSpeech, "v1") {
|
||||||
|
rules = append(rules, "v1")
|
||||||
|
} else if strings.HasPrefix(partOfSpeech, "vs-") {
|
||||||
|
rules = append(rules, "vs")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rules
|
||||||
|
}
|
||||||
|
|
||||||
|
func calculateTermScore(senseNumber int, headword headword) int {
|
||||||
|
const senseWeight int = 1
|
||||||
|
const entryPositionWeight int = 100
|
||||||
|
const priorityWeight int = 10000
|
||||||
|
|
||||||
|
score := 0
|
||||||
|
score -= (senseNumber - 1) * senseWeight
|
||||||
|
score -= headword.Index * entryPositionWeight
|
||||||
|
score += headword.Score() * priorityWeight
|
||||||
|
|
||||||
|
return score
|
||||||
|
}
|
||||||
|
|
||||||
|
func doDisplaySenseNumberTag(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) bool {
|
||||||
|
// Display sense numbers if the entry has more than one sense
|
||||||
|
// or if the headword is found in multiple entries.
|
||||||
|
hash := headword.Hash()
|
||||||
|
if meta.seqToSenseCount[entry.Sequence] > 1 {
|
||||||
|
return true
|
||||||
|
} else if len(meta.headwordHashToSeqs[hash]) > 1 {
|
||||||
|
return true
|
||||||
|
} else {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func jmdictPublicationDate(dictionary jmdict.Jmdict) string {
|
||||||
|
dateEntry := dictionary.Entries[len(dictionary.Entries)-1]
|
||||||
|
r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`)
|
||||||
|
jmdictDate := r.FindString(dateEntry.Sense[0].Glossary[0].Content)
|
||||||
|
return jmdictDate
|
||||||
|
}
|
||||||
|
|
||||||
|
func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm {
|
||||||
|
term := baseFormsTerm(entry)
|
||||||
|
term.Expression = headword.Expression
|
||||||
|
term.Reading = headword.Reading
|
||||||
|
|
||||||
|
term.addTermTags(headword.TermTags...)
|
||||||
|
|
||||||
|
term.addDefinitionTags("forms")
|
||||||
|
senseNumber := meta.seqToSenseCount[entry.Sequence] + 1
|
||||||
|
term.Score = calculateTermScore(senseNumber, headword)
|
||||||
|
return term
|
||||||
|
}
|
||||||
|
|
||||||
|
func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm {
|
||||||
|
term := dbTerm{
|
||||||
|
Expression: headword.Expression,
|
||||||
|
Sequence: -entry.Sequence,
|
||||||
|
}
|
||||||
|
for _, sense := range entry.Sense {
|
||||||
|
rules := grammarRules(sense.PartsOfSpeech)
|
||||||
|
term.addRules(rules...)
|
||||||
|
}
|
||||||
|
term.addTermTags(headword.TermTags...)
|
||||||
|
term.Score = calculateTermScore(0, headword)
|
||||||
|
|
||||||
|
redirectHeadword := meta.seqToMainHeadword[entry.Sequence]
|
||||||
|
expHash := redirectHeadword.ExpHash()
|
||||||
|
doDisplayReading := (len(meta.expHashToReadings[expHash]) > 1)
|
||||||
|
|
||||||
|
content := contentSpan(
|
||||||
|
contentAttr{fontSize: "130%"},
|
||||||
|
"⟶",
|
||||||
|
redirectHeadword.ToInternalLink(doDisplayReading),
|
||||||
|
)
|
||||||
|
|
||||||
|
term.Glossary = []any{contentStructure(content)}
|
||||||
|
return term
|
||||||
|
}
|
||||||
|
|
||||||
|
func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm {
|
||||||
|
term := dbTerm{
|
||||||
|
Expression: headword.Expression,
|
||||||
|
Reading: headword.Reading,
|
||||||
|
Sequence: entry.Sequence,
|
||||||
|
}
|
||||||
|
|
||||||
|
term.Glossary = createGlossary(sense, meta)
|
||||||
|
|
||||||
|
term.addTermTags(headword.TermTags...)
|
||||||
|
|
||||||
|
if doDisplaySenseNumberTag(headword, entry, meta) {
|
||||||
|
senseNumberTag := strconv.Itoa(senseNumber)
|
||||||
|
term.addDefinitionTags(senseNumberTag)
|
||||||
|
}
|
||||||
|
term.addDefinitionTags(sense.PartsOfSpeech...)
|
||||||
|
term.addDefinitionTags(sense.Fields...)
|
||||||
|
term.addDefinitionTags(sense.Misc...)
|
||||||
|
term.addDefinitionTags(sense.Dialects...)
|
||||||
|
|
||||||
|
rules := grammarRules(sense.PartsOfSpeech)
|
||||||
|
term.addRules(rules...)
|
||||||
|
|
||||||
|
term.Score = calculateTermScore(senseNumber, headword)
|
||||||
|
|
||||||
|
return term
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) ([]dbTerm, bool) {
|
||||||
|
if meta.seqToSenseCount[entry.Sequence] == 0 {
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
if headword.IsSearchOnly {
|
||||||
|
searchTerm := createSearchTerm(headword, entry, meta)
|
||||||
|
return []dbTerm{searchTerm}, true
|
||||||
|
}
|
||||||
|
terms := []dbTerm{}
|
||||||
|
senseNumber := 1
|
||||||
|
for _, sense := range entry.Sense {
|
||||||
|
if !glossaryContainsLanguage(sense.Glossary, meta.language) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if sense.RestrictedReadings != nil && !slices.Contains(sense.RestrictedReadings, headword.Reading) {
|
||||||
|
senseNumber += 1
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if sense.RestrictedKanji != nil && !slices.Contains(sense.RestrictedKanji, headword.Expression) {
|
||||||
|
senseNumber += 1
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
senseTerm := createSenseTerm(sense, senseNumber, headword, entry, meta)
|
||||||
|
senseNumber += 1
|
||||||
|
terms = append(terms, senseTerm)
|
||||||
|
}
|
||||||
|
|
||||||
|
if meta.hasMultipleForms[entry.Sequence] {
|
||||||
|
formsTerm := createFormsTerm(headword, entry, meta)
|
||||||
|
terms = append(terms, formsTerm)
|
||||||
|
}
|
||||||
|
return terms, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func jmdExportDb(inputPath string, outputPath string, languageName string, title string, stride int, pretty bool) error {
|
||||||
|
reader, err := os.Open(inputPath)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer reader.Close()
|
||||||
|
|
||||||
|
dictionary, entities, err := jmdict.LoadJmdictNoTransform(reader)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
meta := newJmdictMetadata(dictionary, languageName)
|
||||||
|
|
||||||
|
terms := dbTermList{}
|
||||||
|
for _, entry := range dictionary.Entries {
|
||||||
|
headwords := extractHeadwords(entry)
|
||||||
|
for _, headword := range headwords {
|
||||||
|
if newTerms, ok := extractTerms(headword, entry, meta); ok {
|
||||||
|
terms = append(terms, newTerms...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tags := dbTagList{}
|
||||||
|
tags = append(tags, entityTags(entities)...)
|
||||||
|
tags = append(tags, senseNumberTags(meta.maxSenseCount)...)
|
||||||
|
tags = append(tags, newsFrequencyTags()...)
|
||||||
|
tags = append(tags, customDbTags()...)
|
||||||
|
|
||||||
|
recordData := map[string]dbRecordList{
|
||||||
|
"term": terms.crush(),
|
||||||
|
"tag": tags.crush(),
|
||||||
|
}
|
||||||
|
|
||||||
|
if title == "" {
|
||||||
|
title = "JMdict"
|
||||||
|
}
|
||||||
|
jmdictDate := jmdictPublicationDate(dictionary)
|
||||||
|
|
||||||
|
index := dbIndex{
|
||||||
|
Title: title,
|
||||||
|
Revision: "JMdict." + jmdictDate,
|
||||||
|
Sequenced: true,
|
||||||
|
Attribution: edrdgAttribution,
|
||||||
|
}
|
||||||
|
index.setDefaults()
|
||||||
|
|
||||||
|
return writeDb(
|
||||||
|
outputPath,
|
||||||
|
index,
|
||||||
|
recordData,
|
||||||
|
stride,
|
||||||
|
pretty,
|
||||||
|
)
|
||||||
|
}
|
215
jmdictConstants.go
Normal file
215
jmdictConstants.go
Normal file
@ -0,0 +1,215 @@
|
|||||||
|
package yomichan
|
||||||
|
|
||||||
|
type LangCode struct {
|
||||||
|
language string
|
||||||
|
code string
|
||||||
|
}
|
||||||
|
|
||||||
|
const edrdgAttribution = "This publication has included material from the JMdict (EDICT, etc.) dictionary files in accordance with the licence provisions of the Electronic Dictionaries Research Group. See http://www.edrdg.org/"
|
||||||
|
|
||||||
|
const prioritySymbol = "★"
|
||||||
|
const rareKanjiSymbol = "🅁"
|
||||||
|
const irregularSymbol = "⚠"
|
||||||
|
const outdatedSymbol = "⛬"
|
||||||
|
const defaultSymbol = "㊒"
|
||||||
|
|
||||||
|
const priorityTagName = "⭐"
|
||||||
|
const rareKanjiTagName = "R"
|
||||||
|
const irregularTagName = "⚠️"
|
||||||
|
const outdatedTagName = "⛬"
|
||||||
|
const atejiTagName = "ateji"
|
||||||
|
const gikunTagName = "gikun"
|
||||||
|
|
||||||
|
const langMarker = "'🌐 '"
|
||||||
|
const noteMarker = "'📝 '"
|
||||||
|
const infoMarker = "'ℹ️ '"
|
||||||
|
const refMarker = "'➡️ '"
|
||||||
|
const antonymMarker = "'🔄 '"
|
||||||
|
|
||||||
|
var ISOtoFlag = map[string]string{
|
||||||
|
"": "'🇬🇧 '",
|
||||||
|
"eng": "'🇬🇧 '",
|
||||||
|
"dut": "'🇳🇱 '",
|
||||||
|
"fre": "'🇫🇷 '",
|
||||||
|
"ger": "'🇩🇪 '",
|
||||||
|
"hun": "'🇭🇺 '",
|
||||||
|
"ita": "'🇮🇹 '",
|
||||||
|
"jpn": "'🇯🇵 '",
|
||||||
|
"rus": "'🇷🇺 '",
|
||||||
|
"slv": "'🇸🇮 '",
|
||||||
|
"spa": "'🇪🇸 '",
|
||||||
|
"swe": "'🇸🇪 '",
|
||||||
|
}
|
||||||
|
|
||||||
|
var langNameToCode = map[string]string{
|
||||||
|
"": "eng",
|
||||||
|
"english": "eng",
|
||||||
|
"dutch": "dut",
|
||||||
|
"french": "fre",
|
||||||
|
"german": "ger",
|
||||||
|
"hungarian": "hun",
|
||||||
|
"italian": "ita",
|
||||||
|
"russian": "rus",
|
||||||
|
"slovenian": "slv",
|
||||||
|
"spanish": "spa",
|
||||||
|
"swedish": "swe",
|
||||||
|
}
|
||||||
|
|
||||||
|
var glossTypeCodeToName = map[LangCode]string{
|
||||||
|
LangCode{"eng", "lit"}: "literally",
|
||||||
|
LangCode{"eng", "fig"}: "figuratively",
|
||||||
|
LangCode{"eng", "expl"}: "", // don't need to tell the user that an explanation is an explanation
|
||||||
|
LangCode{"eng", "tm"}: "trademark",
|
||||||
|
}
|
||||||
|
|
||||||
|
var refNoteHint = map[LangCode]string{
|
||||||
|
LangCode{"eng", "xref"}: "see",
|
||||||
|
LangCode{"eng", "ant"}: "antonym",
|
||||||
|
}
|
||||||
|
|
||||||
|
var sourceLangTypeCodeToType = map[LangCode]string{
|
||||||
|
LangCode{"eng", "part"}: "partial",
|
||||||
|
LangCode{"eng", ""}: "", // implied "full"
|
||||||
|
}
|
||||||
|
|
||||||
|
var langCodeToName = map[LangCode]string{
|
||||||
|
LangCode{"eng", "afr"}: "Afrikaans",
|
||||||
|
LangCode{"eng", "ain"}: "Ainu",
|
||||||
|
LangCode{"eng", "alg"}: "Algonquian",
|
||||||
|
LangCode{"eng", "amh"}: "Amharic",
|
||||||
|
LangCode{"eng", "ara"}: "Arabic",
|
||||||
|
LangCode{"eng", "arn"}: "Mapudungun",
|
||||||
|
LangCode{"eng", "bnt"}: "Bantu",
|
||||||
|
LangCode{"eng", "bre"}: "Breton",
|
||||||
|
LangCode{"eng", "bul"}: "Bulgarian",
|
||||||
|
LangCode{"eng", "bur"}: "Burmese",
|
||||||
|
LangCode{"eng", "chi"}: "Chinese",
|
||||||
|
LangCode{"eng", "chn"}: "Chinook Jargon",
|
||||||
|
LangCode{"eng", "cze"}: "Czech",
|
||||||
|
LangCode{"eng", "dan"}: "Danish",
|
||||||
|
LangCode{"eng", "dut"}: "Dutch",
|
||||||
|
LangCode{"eng", "eng"}: "English",
|
||||||
|
LangCode{"eng", "epo"}: "Esperanto",
|
||||||
|
LangCode{"eng", "est"}: "Estonian",
|
||||||
|
LangCode{"eng", "fil"}: "Filipino",
|
||||||
|
LangCode{"eng", "fin"}: "Finnish",
|
||||||
|
LangCode{"eng", "fre"}: "French",
|
||||||
|
LangCode{"eng", "geo"}: "Georgian",
|
||||||
|
LangCode{"eng", "ger"}: "German",
|
||||||
|
LangCode{"eng", "glg"}: "Galician",
|
||||||
|
LangCode{"eng", "grc"}: "Ancient Greek",
|
||||||
|
LangCode{"eng", "gre"}: "Modern Greek",
|
||||||
|
LangCode{"eng", "haw"}: "Hawaiian",
|
||||||
|
LangCode{"eng", "heb"}: "Hebrew",
|
||||||
|
LangCode{"eng", "hin"}: "Hindi",
|
||||||
|
LangCode{"eng", "hun"}: "Hungarian",
|
||||||
|
LangCode{"eng", "ice"}: "Icelandic",
|
||||||
|
LangCode{"eng", "ind"}: "Indonesian",
|
||||||
|
LangCode{"eng", "ita"}: "Italian",
|
||||||
|
LangCode{"eng", "khm"}: "Khmer",
|
||||||
|
LangCode{"eng", "kor"}: "Korean",
|
||||||
|
LangCode{"eng", "kur"}: "Kurdish",
|
||||||
|
LangCode{"eng", "lat"}: "Latin",
|
||||||
|
LangCode{"eng", "mal"}: "Malayalam",
|
||||||
|
LangCode{"eng", "mao"}: "Maori",
|
||||||
|
LangCode{"eng", "may"}: "Malay",
|
||||||
|
LangCode{"eng", "mnc"}: "Manchu",
|
||||||
|
LangCode{"eng", "mol"}: "Moldavian", // ISO 639 deprecated (https://iso639-3.sil.org/code/mol)
|
||||||
|
LangCode{"eng", "mon"}: "Mongolian",
|
||||||
|
LangCode{"eng", "nor"}: "Norwegian",
|
||||||
|
LangCode{"eng", "per"}: "Persian",
|
||||||
|
LangCode{"eng", "pol"}: "Polish",
|
||||||
|
LangCode{"eng", "por"}: "Portuguese",
|
||||||
|
LangCode{"eng", "rum"}: "Romanian",
|
||||||
|
LangCode{"eng", "rus"}: "Russian",
|
||||||
|
LangCode{"eng", "san"}: "Sanskrit",
|
||||||
|
LangCode{"eng", "scr"}: "Croatian", // Code doesn't seem to exist in ISO 639. Should be "hrv" instead? (https://iso639-3.sil.org/code/hrv)
|
||||||
|
LangCode{"eng", "slo"}: "Slovak",
|
||||||
|
LangCode{"eng", "slv"}: "Slovenian",
|
||||||
|
LangCode{"eng", "som"}: "Somali",
|
||||||
|
LangCode{"eng", "spa"}: "Spanish",
|
||||||
|
LangCode{"eng", "swa"}: "Swahili",
|
||||||
|
LangCode{"eng", "swe"}: "Swedish",
|
||||||
|
LangCode{"eng", "tah"}: "Tahitian",
|
||||||
|
LangCode{"eng", "tam"}: "Tamil",
|
||||||
|
LangCode{"eng", "tgl"}: "Tagalog",
|
||||||
|
LangCode{"eng", "tha"}: "Thai",
|
||||||
|
LangCode{"eng", "tib"}: "Tibetan",
|
||||||
|
LangCode{"eng", "tur"}: "Turkish",
|
||||||
|
LangCode{"eng", "ukr"}: "Ukrainian",
|
||||||
|
LangCode{"eng", "urd"}: "Urdu",
|
||||||
|
LangCode{"eng", "vie"}: "Vietnamese",
|
||||||
|
LangCode{"eng", "yid"}: "Yiddish",
|
||||||
|
}
|
||||||
|
|
||||||
|
// https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
|
||||||
|
var ISOtoHTML = map[string]string{
|
||||||
|
"afr": "af", // Afrikaans
|
||||||
|
"ain": "ain", // Ainu
|
||||||
|
"alg": "alg", // Algonquian
|
||||||
|
"amh": "am", // Amharic
|
||||||
|
"ara": "ar", // Arabic
|
||||||
|
"arn": "arn", // Mapudungun
|
||||||
|
"bnt": "bnt", // Bantu
|
||||||
|
"bre": "br", // Breton
|
||||||
|
"bul": "bg", // Bulgarian
|
||||||
|
"bur": "my", // Burmese
|
||||||
|
"chi": "zh", // Chinese
|
||||||
|
"chn": "chn", // Chinook Jargon
|
||||||
|
"cze": "cs", // Czech
|
||||||
|
"dan": "da", // Danish
|
||||||
|
"dut": "nl", // Dutch
|
||||||
|
"eng": "en", // English
|
||||||
|
"epo": "eo", // Esperanto
|
||||||
|
"est": "et", // Estonian
|
||||||
|
"fil": "fil", // Filipino
|
||||||
|
"fin": "fi", // Finnish
|
||||||
|
"fre": "fr", // French
|
||||||
|
"geo": "ka", // Georgian
|
||||||
|
"ger": "de", // German
|
||||||
|
"glg": "gl", // Galician
|
||||||
|
"grc": "grc", // Ancient Greek
|
||||||
|
"gre": "el", // Modern Greek
|
||||||
|
"haw": "haw", // Hawaiian
|
||||||
|
"heb": "he", // Hebrew
|
||||||
|
"hin": "hi", // Hindi
|
||||||
|
"hun": "hu", // Hungarian
|
||||||
|
"ice": "is", // Icelandic
|
||||||
|
"ind": "id", // Indonesian
|
||||||
|
"ita": "it", // Italian
|
||||||
|
"jpn": "ja", // Japanese
|
||||||
|
"khm": "km", // Khmer
|
||||||
|
"kor": "ko", // Korean
|
||||||
|
"kur": "ku", // Kurdish
|
||||||
|
"lat": "la", // Latin
|
||||||
|
"mal": "ml", // Malayalam
|
||||||
|
"mao": "mi", // Maori
|
||||||
|
"may": "ms", // Malay
|
||||||
|
"mnc": "mnc", // Manchu
|
||||||
|
"mol": "ro", // Moldavian
|
||||||
|
"mon": "mn", // Mongolian
|
||||||
|
"nor": "no", // Norwegian
|
||||||
|
"per": "fa", // Persian
|
||||||
|
"pol": "pl", // Polish
|
||||||
|
"por": "pt", // Portuguese
|
||||||
|
"rum": "ro", // Romanian
|
||||||
|
"rus": "ru", // Russian
|
||||||
|
"san": "sa", // Sanskrit
|
||||||
|
"scr": "hr", // Croatian
|
||||||
|
"slo": "sk", // Slovak
|
||||||
|
"slv": "sl", // Slovenian
|
||||||
|
"som": "so", // Somali
|
||||||
|
"spa": "es", // Spanish
|
||||||
|
"swa": "sw", // Swahili
|
||||||
|
"swe": "sv", // Swedish
|
||||||
|
"tah": "ty", // Tahitian
|
||||||
|
"tam": "ta", // Tamil
|
||||||
|
"tgl": "tl", // Tagalog
|
||||||
|
"tha": "th", // Thai
|
||||||
|
"tib": "bo", // Tibetan
|
||||||
|
"tur": "tr", // Turkish
|
||||||
|
"ukr": "uk", // Ukrainian
|
||||||
|
"urd": "ur", // Urdu
|
||||||
|
"vie": "vi", // Vietnamese
|
||||||
|
"yid": "yi", // Yiddish
|
||||||
|
}
|
254
jmdictForms.go
Normal file
254
jmdictForms.go
Normal file
@ -0,0 +1,254 @@
|
|||||||
|
package yomichan
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"foosoft.net/projects/jmdict"
|
||||||
|
"golang.org/x/exp/slices"
|
||||||
|
)
|
||||||
|
|
||||||
|
func kata2hira(word string) string {
|
||||||
|
charMap := func(character rune) rune {
|
||||||
|
if (character >= 'ァ' && character <= 'ヶ') || (character >= 'ヽ' && character <= 'ヾ') {
|
||||||
|
return character - 0x60
|
||||||
|
} else {
|
||||||
|
return character
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return strings.Map(charMap, word)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *headword) InfoSymbols() string {
|
||||||
|
infoSymbols := []string{}
|
||||||
|
if h.IsPriority {
|
||||||
|
infoSymbols = append(infoSymbols, prioritySymbol)
|
||||||
|
}
|
||||||
|
if h.IsRareKanji {
|
||||||
|
infoSymbols = append(infoSymbols, rareKanjiSymbol)
|
||||||
|
}
|
||||||
|
if h.IsIrregular {
|
||||||
|
infoSymbols = append(infoSymbols, irregularSymbol)
|
||||||
|
}
|
||||||
|
if h.IsOutdated {
|
||||||
|
infoSymbols = append(infoSymbols, outdatedSymbol)
|
||||||
|
}
|
||||||
|
return strings.Join(infoSymbols[:], " | ")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *headword) GlossText() string {
|
||||||
|
gloss := h.Expression
|
||||||
|
if h.IsAteji {
|
||||||
|
gloss = "〈" + gloss + "〉"
|
||||||
|
}
|
||||||
|
symbolText := h.InfoSymbols()
|
||||||
|
if symbolText != "" {
|
||||||
|
gloss += "(" + symbolText + ")"
|
||||||
|
}
|
||||||
|
return gloss
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *headword) TableColHeaderText() string {
|
||||||
|
text := h.KanjiForm()
|
||||||
|
if h.IsAteji {
|
||||||
|
text = "〈" + text + "〉"
|
||||||
|
}
|
||||||
|
return text
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *headword) TableRowHeaderText() string {
|
||||||
|
text := h.Reading
|
||||||
|
if h.IsGikun {
|
||||||
|
text = "〈" + text + "〉"
|
||||||
|
}
|
||||||
|
return text
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *headword) TableCellText() string {
|
||||||
|
text := h.InfoSymbols()
|
||||||
|
if text == "" {
|
||||||
|
return defaultSymbol
|
||||||
|
} else {
|
||||||
|
return text
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *headword) KanjiForm() string {
|
||||||
|
if h.IsKanaOnly() {
|
||||||
|
return "∅"
|
||||||
|
} else {
|
||||||
|
return h.Expression
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func jmdNeedsFormTable(headwords []headword) bool {
|
||||||
|
// Does the entry contain more than 1 distinct reading?
|
||||||
|
// E.g. バカがい and ばかがい are not distinct.
|
||||||
|
uniqueReading := ""
|
||||||
|
for _, h := range headwords {
|
||||||
|
if h.IsGikun {
|
||||||
|
return true
|
||||||
|
} else if h.IsSearchOnly {
|
||||||
|
continue
|
||||||
|
} else if h.IsKanaOnly() {
|
||||||
|
continue
|
||||||
|
} else if uniqueReading == "" {
|
||||||
|
uniqueReading = kata2hira(h.Reading)
|
||||||
|
} else if uniqueReading != kata2hira(h.Reading) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
type formTableData struct {
|
||||||
|
kanjiForms []string
|
||||||
|
readings []string
|
||||||
|
colHeaderText map[string]string
|
||||||
|
rowHeaderText map[string]string
|
||||||
|
cellText map[string]map[string]string
|
||||||
|
}
|
||||||
|
|
||||||
|
// tableData collects the distinct kanji forms (columns) and readings (rows)
// of the given headwords, plus the header and cell labels for each, skipping
// search-only headwords. Column/row order follows first appearance.
func tableData(headwords []headword) formTableData {
	d := formTableData{
		kanjiForms:    []string{},
		readings:      []string{},
		colHeaderText: make(map[string]string),
		rowHeaderText: make(map[string]string),
		cellText:      make(map[string]map[string]string),
	}
	for _, h := range headwords {
		if h.IsSearchOnly {
			continue
		}
		kanjiForm := h.KanjiForm()
		if !slices.Contains(d.kanjiForms, kanjiForm) {
			// New column: remember its order and header label.
			d.kanjiForms = append(d.kanjiForms, kanjiForm)
			d.colHeaderText[kanjiForm] = h.TableColHeaderText()
		}
		reading := h.Reading
		if !slices.Contains(d.readings, reading) {
			// New row: remember its order, header label, and cell map.
			d.readings = append(d.readings, reading)
			d.rowHeaderText[reading] = h.TableRowHeaderText()
			d.cellText[reading] = make(map[string]string)
		}
		d.cellText[reading][kanjiForm] = h.TableCellText()
	}
	return d
}
|
||||||
|
|
||||||
|
// formsTableGlossary renders the headwords as a single structured-content
// glossary item containing a table: kanji forms as columns, readings as
// rows, and info symbols in the cells. The table carries a
// data-content="formsTable" attribute for styling.
func formsTableGlossary(headwords []headword) []any {
	d := tableData(headwords)

	attr := contentAttr{}
	centeredAttr := contentAttr{textAlign: "center"}
	leftAttr := contentAttr{textAlign: "left"}

	cornerCell := contentTableHeadCell(attr, "") // empty cell in upper left corner
	headRowCells := []any{cornerCell}
	for _, kanjiForm := range d.kanjiForms {
		content := d.colHeaderText[kanjiForm]
		cell := contentTableHeadCell(centeredAttr, content)
		headRowCells = append(headRowCells, cell)
	}
	headRow := contentTableRow(attr, headRowCells...)
	tableRows := []any{headRow}
	for _, reading := range d.readings {
		// Each body row starts with the reading as a left-aligned header cell.
		rowHeadCellText := d.rowHeaderText[reading]
		rowHeadCell := contentTableHeadCell(leftAttr, rowHeadCellText)
		rowCells := []any{rowHeadCell}
		for _, kanjiForm := range d.kanjiForms {
			text := d.cellText[reading][kanjiForm]
			rowCell := contentTableCell(centeredAttr, text)
			rowCells = append(rowCells, rowCell)
		}
		tableRow := contentTableRow(attr, rowCells...)
		tableRows = append(tableRows, tableRow)
	}
	tableAttr := contentAttr{data: map[string]string{"content": "formsTable"}}
	// NOTE: this local deliberately shadows the contentTable function name
	// within the remainder of this scope.
	contentTable := contentTable(tableAttr, tableRows...)
	content := contentStructure(contentTable)
	return []any{content}
}
|
||||||
|
|
||||||
|
func formsGlossary(headwords []headword) []any {
|
||||||
|
glossary := []any{}
|
||||||
|
for _, h := range headwords {
|
||||||
|
if h.IsSearchOnly {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
text := h.GlossText()
|
||||||
|
glossary = append(glossary, text)
|
||||||
|
}
|
||||||
|
return glossary
|
||||||
|
}
|
||||||
|
|
||||||
|
// baseFormsTerm builds the shared dbTerm skeleton for a JMdict entry in the
// "forms" dictionary: sequence number, a glossary listing the entry's forms
// (as a table when the forms are varied enough), and deinflection rules
// derived from every sense's parts of speech. Expression/Reading are filled
// in per headword by the caller.
func baseFormsTerm(entry jmdict.JmdictEntry) dbTerm {
	term := dbTerm{Sequence: entry.Sequence}
	headwords := extractHeadwords(entry)
	if jmdNeedsFormTable(headwords) {
		term.Glossary = formsTableGlossary(headwords)
	} else {
		term.Glossary = formsGlossary(headwords)
	}
	for _, sense := range entry.Sense {
		rules := grammarRules(sense.PartsOfSpeech)
		term.addRules(rules...)
	}
	return term
}
|
||||||
|
|
||||||
|
// formsExportDb converts a JMdict XML file into a Yomichan "JMdict Forms"
// dictionary: one term per headword whose glossary lists the entry's other
// forms rather than translations. languageName is unused here (form data is
// language-independent); the parameter exists to match the exporter
// signature expected by ExportDb.
func formsExportDb(inputPath, outputPath, languageName, title string, stride int, pretty bool) error {
	reader, err := os.Open(inputPath)
	if err != nil {
		return err
	}
	defer reader.Close()

	dictionary, _, err := jmdict.LoadJmdictNoTransform(reader)
	if err != nil {
		return err
	}

	terms := dbTermList{}
	for _, entry := range dictionary.Entries {
		baseTerm := baseFormsTerm(entry)
		headwords := extractHeadwords(entry)
		for _, h := range headwords {
			term := baseTerm // copy the shared skeleton
			if h.IsSearchOnly {
				// Negated sequence marks search-only variants.
				term.Sequence = -term.Sequence
			}
			term.Expression = h.Expression
			term.Reading = h.Reading
			terms = append(terms, term)
		}
	}

	if title == "" {
		title = "JMdict Forms"
	}

	recordData := map[string]dbRecordList{
		"term": terms.crush(),
		"tag":  dbRecordList{}, // forms dictionary defines no tags
	}

	jmdictDate := jmdictPublicationDate(dictionary)

	index := dbIndex{
		Title:       title,
		Revision:    "JMdict." + jmdictDate,
		Sequenced:   true,
		Attribution: edrdgAttribution,
	}
	index.setDefaults()

	return writeDb(
		outputPath,
		index,
		recordData,
		stride,
		pretty,
	)
}
|
300
jmdictGlossary.go
Normal file
300
jmdictGlossary.go
Normal file
@ -0,0 +1,300 @@
|
|||||||
|
package yomichan
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
"foosoft.net/projects/jmdict"
|
||||||
|
)
|
||||||
|
|
||||||
|
func glossaryContainsLanguage(glossary []jmdict.JmdictGlossary, language string) bool {
|
||||||
|
hasGlosses := false
|
||||||
|
for _, gloss := range glossary {
|
||||||
|
if glossContainsLanguage(gloss, language) {
|
||||||
|
hasGlosses = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return hasGlosses
|
||||||
|
}
|
||||||
|
|
||||||
|
func glossContainsLanguage(gloss jmdict.JmdictGlossary, language string) bool {
|
||||||
|
if gloss.Language == nil && language != "eng" {
|
||||||
|
return false
|
||||||
|
} else if gloss.Language != nil && language != *gloss.Language {
|
||||||
|
return false
|
||||||
|
} else {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func makeGlossListItem(gloss jmdict.JmdictGlossary, language string) any {
|
||||||
|
contents := []any{gloss.Content}
|
||||||
|
listItem := contentListItem(contentAttr{}, contents...)
|
||||||
|
return listItem
|
||||||
|
}
|
||||||
|
|
||||||
|
// makeInfoGlossListItem builds a list item for a typed gloss, prefixing the
// gloss text with its localized type name in italics (or the raw type code
// in brackets when no localization exists).
//
// Precondition: gloss.Type must be non-nil — it is dereferenced immediately;
// callers filter on gloss.Type != nil before calling.
func makeInfoGlossListItem(gloss jmdict.JmdictGlossary, language string) any {
	// Prepend gloss with "type" (literal, figurative, trademark, etc.)
	glossTypeCode := *gloss.Type
	contents := []any{}
	if name, ok := glossTypeCodeToName[LangCode{language, glossTypeCode}]; ok {
		// An empty localized name means "show no prefix" for this type.
		if name != "" {
			italicStyle := contentAttr{fontStyle: "italic"}
			contents = append(contents, contentSpan(italicStyle, "("+name+")"), " ")
		}
	} else {
		fmt.Println("Unknown glossary type code " + *gloss.Type + " for build language " + language)
		contents = append(contents, "["+glossTypeCode+"] ")
	}
	contents = append(contents, gloss.Content)
	listItem := contentListItem(contentAttr{}, contents...)
	return listItem
}
|
||||||
|
|
||||||
|
// makeSourceLangListItem builds a loanword-origin list item, formatted as:
//
//	[Language] ([Partial?], [Wasei?]): [Original word?]
//
// Missing lookups fall back to the raw codes and log a warning to stdout.
// A nil source language code is treated as English ("eng").
func makeSourceLangListItem(sourceLanguage jmdict.JmdictSource, language string) any {
	contents := []any{}

	var srcLangCode string
	if sourceLanguage.Language == nil {
		srcLangCode = "eng"
	} else {
		srcLangCode = *sourceLanguage.Language
	}

	// Format: [Language] ([Partial?], [Wasei?]): [Original word?]
	// [Language]
	if langName, ok := langCodeToName[LangCode{language, srcLangCode}]; ok {
		contents = append(contents, langName)
	} else {
		contents = append(contents, srcLangCode)
		fmt.Println("Unable to convert ISO 639 code " + srcLangCode + " to its full name in language " + language)
	}

	// ([Partial?], [Wasei?])
	var sourceLangTypeCode string
	if sourceLanguage.Type == nil {
		sourceLangTypeCode = ""
	} else {
		sourceLangTypeCode = *sourceLanguage.Type
	}
	var sourceLangType string
	if val, ok := sourceLangTypeCodeToType[LangCode{language, sourceLangTypeCode}]; ok {
		sourceLangType = val
	} else {
		sourceLangType = sourceLangTypeCode
		fmt.Println("Unknown source language type code " + sourceLangTypeCode + " for build language " + language)
	}
	// Emit the parenthesized qualifier only when there is something to say.
	if sourceLangType != "" && sourceLanguage.Wasei == "y" {
		contents = append(contents, " ("+sourceLangType+", wasei)")
	} else if sourceLangType != "" {
		contents = append(contents, " ("+sourceLangType+")")
	} else if sourceLanguage.Wasei == "y" {
		contents = append(contents, " (wasei)")
	}

	// : [Original word?]
	if sourceLanguage.Content != "" {
		contents = append(contents, ": ")
		// Tag the original word with its own language for correct rendering.
		attr := contentAttr{lang: ISOtoHTML[srcLangCode]}
		contents = append(contents, contentSpan(attr, sourceLanguage.Content))
	}

	listItem := contentListItem(contentAttr{}, contents...)
	return listItem
}
|
||||||
|
|
||||||
|
// makeReferenceListItem builds a cross-reference (or antonym) list item.
// refType selects the localized hint label ("xref" or "ant"). When the
// reference string cannot be parsed or resolved to a sequence number, the
// raw reference is shown in 【】 brackets as a non-linked fallback;
// otherwise an internal link is emitted, disambiguated with the reading
// when the expression is ambiguous, and followed by a condensed gloss
// (with the sense number when the target entry has multiple senses).
func makeReferenceListItem(reference string, refType string, meta jmdictMetadata) any {
	contents := []any{}
	attr := contentAttr{}

	hint := refNoteHint[LangCode{meta.language, refType}]
	contents = append(contents, hint+": ")

	refHeadword, senseNumber, ok := parseReference(reference)
	if !ok {
		// Unparseable reference: show it verbatim, unlinked.
		contents = append(contents, "【"+reference+"】")
		return contentListItem(attr, contents...)
	}

	sequence, ok := meta.referenceToSeq[reference]
	if !ok {
		// Parsed but unresolvable to an entry: same unlinked fallback.
		contents = append(contents, "【"+reference+"】")
		return contentListItem(attr, contents...)
	}

	targetSense := senseID{
		sequence: sequence,
		number:   senseNumber,
	}

	expHash := refHeadword.ExpHash()
	// Show the reading only when the bare expression maps to more than one
	// reading; show the sense number only when the entry has several senses.
	doDisplayReading := (len(meta.expHashToReadings[expHash]) > 1)
	doDisplaySenseNumber := (meta.seqToSenseCount[targetSense.sequence] > 1)
	refGlossAttr := contentAttr{
		fontSize:      "65%",
		verticalAlign: "middle",
		data:          map[string]string{"content": "refGlosses"},
	}

	contents = append(contents, refHeadword.ToInternalLink(doDisplayReading))
	if doDisplaySenseNumber {
		contents = append(contents, contentSpan(refGlossAttr, " "+strconv.Itoa(targetSense.number)+". "+meta.condensedGlosses[targetSense]))
	} else {
		contents = append(contents, contentSpan(refGlossAttr, " "+meta.condensedGlosses[targetSense]))
	}

	listItem := contentListItem(attr, contents...)
	return listItem
}
|
||||||
|
|
||||||
|
func makeExampleListItem(sentence jmdict.JmdictExampleSentence) any {
|
||||||
|
if sentence.Lang == "jpn" {
|
||||||
|
return contentListItem(contentAttr{}, sentence.Text)
|
||||||
|
} else {
|
||||||
|
attr := contentAttr{
|
||||||
|
lang: ISOtoHTML[sentence.Lang],
|
||||||
|
listStyleType: ISOtoFlag[sentence.Lang],
|
||||||
|
}
|
||||||
|
return contentListItem(attr, sentence.Text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func listAttr(lang string, listStyleType string, dataContent string) contentAttr {
|
||||||
|
return contentAttr{
|
||||||
|
lang: lang,
|
||||||
|
listStyleType: listStyleType,
|
||||||
|
data: map[string]string{"content": dataContent},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func needsStructuredContent(sense jmdict.JmdictSense, language string) bool {
|
||||||
|
for _, gloss := range sense.Glossary {
|
||||||
|
if glossContainsLanguage(gloss, language) && gloss.Type != nil {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(sense.SourceLanguages) > 0 {
|
||||||
|
return true
|
||||||
|
} else if len(sense.Information) > 0 {
|
||||||
|
return true
|
||||||
|
} else if len(sense.Antonyms) > 0 {
|
||||||
|
return true
|
||||||
|
} else if len(sense.References) > 0 {
|
||||||
|
return true
|
||||||
|
} else if len(sense.Examples) > 0 {
|
||||||
|
return true
|
||||||
|
} else {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func createGlossaryContent(sense jmdict.JmdictSense, meta jmdictMetadata) any {
|
||||||
|
glossaryContents := []any{}
|
||||||
|
|
||||||
|
// Add normal glosses
|
||||||
|
glossListItems := []any{}
|
||||||
|
for _, gloss := range sense.Glossary {
|
||||||
|
if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
|
||||||
|
listItem := makeGlossListItem(gloss, meta.language)
|
||||||
|
glossListItems = append(glossListItems, listItem)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(glossListItems) > 0 {
|
||||||
|
attr := listAttr(ISOtoHTML[meta.language], "circle", "glossary")
|
||||||
|
list := contentUnorderedList(attr, glossListItems...)
|
||||||
|
glossaryContents = append(glossaryContents, list)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add information glosses
|
||||||
|
infoGlossListItems := []any{}
|
||||||
|
for _, gloss := range sense.Glossary {
|
||||||
|
if glossContainsLanguage(gloss, meta.language) && gloss.Type != nil {
|
||||||
|
listItem := makeInfoGlossListItem(gloss, meta.language)
|
||||||
|
infoGlossListItems = append(infoGlossListItems, listItem)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(infoGlossListItems) > 0 {
|
||||||
|
attr := listAttr(ISOtoHTML[meta.language], infoMarker, "infoGlossary")
|
||||||
|
list := contentUnorderedList(attr, infoGlossListItems...)
|
||||||
|
glossaryContents = append(glossaryContents, list)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add language-of-origin / loanword information
|
||||||
|
sourceLangListItems := []any{}
|
||||||
|
for _, sourceLanguage := range sense.SourceLanguages {
|
||||||
|
listItem := makeSourceLangListItem(sourceLanguage, meta.language)
|
||||||
|
sourceLangListItems = append(sourceLangListItems, listItem)
|
||||||
|
}
|
||||||
|
if len(sourceLangListItems) > 0 {
|
||||||
|
attr := listAttr(ISOtoHTML[meta.language], langMarker, "sourceLanguages")
|
||||||
|
list := contentUnorderedList(attr, sourceLangListItems...)
|
||||||
|
glossaryContents = append(glossaryContents, list)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add sense notes
|
||||||
|
noteListItems := []any{}
|
||||||
|
for _, information := range sense.Information {
|
||||||
|
listItem := contentListItem(contentAttr{}, information)
|
||||||
|
noteListItems = append(noteListItems, listItem)
|
||||||
|
}
|
||||||
|
if len(noteListItems) > 0 {
|
||||||
|
attr := listAttr(ISOtoHTML["jpn"], noteMarker, "notes") // notes often contain japanese text
|
||||||
|
list := contentUnorderedList(attr, noteListItems...)
|
||||||
|
glossaryContents = append(glossaryContents, list)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add antonyms
|
||||||
|
antonymListItems := []any{}
|
||||||
|
for _, antonym := range sense.Antonyms {
|
||||||
|
listItem := makeReferenceListItem(antonym, "ant", meta)
|
||||||
|
antonymListItems = append(antonymListItems, listItem)
|
||||||
|
}
|
||||||
|
if len(antonymListItems) > 0 {
|
||||||
|
attr := listAttr(ISOtoHTML[meta.language], antonymMarker, "antonyms")
|
||||||
|
list := contentUnorderedList(attr, antonymListItems...)
|
||||||
|
glossaryContents = append(glossaryContents, list)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add cross-references
|
||||||
|
referenceListItems := []any{}
|
||||||
|
for _, reference := range sense.References {
|
||||||
|
listItem := makeReferenceListItem(reference, "xref", meta)
|
||||||
|
referenceListItems = append(referenceListItems, listItem)
|
||||||
|
}
|
||||||
|
if len(referenceListItems) > 0 {
|
||||||
|
attr := listAttr(ISOtoHTML[meta.language], refMarker, "references")
|
||||||
|
list := contentUnorderedList(attr, referenceListItems...)
|
||||||
|
glossaryContents = append(glossaryContents, list)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add example sentences
|
||||||
|
exampleListItems := []any{}
|
||||||
|
for _, example := range sense.Examples {
|
||||||
|
for _, sentence := range example.Sentences {
|
||||||
|
listItem := makeExampleListItem(sentence)
|
||||||
|
exampleListItems = append(exampleListItems, listItem)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(exampleListItems) > 0 {
|
||||||
|
attr := listAttr(ISOtoHTML["jpn"], ISOtoFlag["jpn"], "examples")
|
||||||
|
list := contentUnorderedList(attr, exampleListItems...)
|
||||||
|
glossaryContents = append(glossaryContents, list)
|
||||||
|
}
|
||||||
|
|
||||||
|
return contentStructure(glossaryContents...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func createGlossary(sense jmdict.JmdictSense, meta jmdictMetadata) []any {
|
||||||
|
glossary := []any{}
|
||||||
|
if needsStructuredContent(sense, meta.language) {
|
||||||
|
glossary = append(glossary, createGlossaryContent(sense, meta))
|
||||||
|
} else {
|
||||||
|
for _, gloss := range sense.Glossary {
|
||||||
|
if glossContainsLanguage(gloss, meta.language) {
|
||||||
|
glossary = append(glossary, gloss.Content)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return glossary
|
||||||
|
}
|
267
jmdictHeadword.go
Normal file
267
jmdictHeadword.go
Normal file
@ -0,0 +1,267 @@
|
|||||||
|
package yomichan
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"hash/fnv"
|
||||||
|
"regexp"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
"foosoft.net/projects/jmdict"
|
||||||
|
"golang.org/x/exp/slices"
|
||||||
|
)
|
||||||
|
|
||||||
|
// headword is one expression/reading pairing of a JMdict entry, together
// with the flags derived from the entry's kanji/reading info tags.
type headword struct {
	Expression   string   // written form (equals Reading for kana-only terms)
	Reading      string   // kana reading (empty for search-only kanji forms)
	TermTags     []string // Yomichan term tags, populated by SetTermTags
	Index        int      // position within the entry's extracted headwords
	IsPriority   bool     // carries a priority frequency tag (ichi1, news1, …)
	IsIrregular  bool     // iK/ik/io info tag
	IsOutdated   bool     // oK/ok info tag
	IsRareKanji  bool     // rK info tag (cleared when also outdated; see SetFlags)
	IsSearchOnly bool     // sK/sk info tag: findable but not displayed as a form
	IsAteji      bool     // kanji used phonetically
	IsGikun      bool     // non-standard reading
}
|
||||||
|
|
||||||
|
type hash uint64
|
||||||
|
|
||||||
|
// Hash digests the expression/reading pair, joined with U+241E (symbol for
// record separator) so the two fields cannot collide by concatenation.
func (h *headword) Hash() hash {
	return hashText(h.Expression + "␞" + h.Reading)
}
|
||||||
|
|
||||||
|
// ExpHash digests the expression alone. The expression is repeated on both
// sides of the separator so the value equals Hash() of a headword whose
// reading equals its expression — the form produced when parsing a bare
// reference string.
func (h *headword) ExpHash() hash {
	return hashText(h.Expression + "␞" + h.Expression)
}
|
||||||
|
|
||||||
|
// ReadingHash digests the reading alone, mirroring ExpHash's doubled form.
func (h *headword) ReadingHash() hash {
	return hashText(h.Reading + "␞" + h.Reading)
}
|
||||||
|
|
||||||
|
// hashText returns the 64-bit FNV-1a digest of s.
func hashText(s string) hash {
	h := fnv.New64a()
	// Write on an fnv hash never returns an error.
	h.Write([]byte(s))
	return hash(h.Sum64())
}
|
||||||
|
|
||||||
|
// IsKanaOnly reports whether this headword has no distinct written form:
// its expression equals its reading and consists entirely of kana
// characters (plus the wave dash). Note the first range, ぁ(U+3041) to
// ヿ(U+30FF), also admits the punctuation between the hiragana and
// katakana blocks (e.g. ・ and ー), which is presumably intended for terms
// like katakana loanwords.
func (h *headword) IsKanaOnly() bool {
	if h.Expression != h.Reading {
		return false
	}
	for _, char := range h.Expression {
		if char >= 'ぁ' && char <= 'ヿ' {
			// hiragana and katakana range
			continue
		} else if char >= '・' && char <= '゚' {
			// halfwidth katakana range
			continue
		} else if char == '〜' {
			continue
		} else {
			return false
		}
	}
	return true
}
|
||||||
|
|
||||||
|
func (h *headword) Score() int {
|
||||||
|
score := 0
|
||||||
|
if h.IsPriority {
|
||||||
|
score += 1
|
||||||
|
}
|
||||||
|
if h.IsIrregular {
|
||||||
|
score -= 5
|
||||||
|
}
|
||||||
|
if h.IsOutdated {
|
||||||
|
score -= 5
|
||||||
|
}
|
||||||
|
if h.IsRareKanji {
|
||||||
|
score -= 5
|
||||||
|
}
|
||||||
|
if h.IsSearchOnly {
|
||||||
|
score -= 5
|
||||||
|
}
|
||||||
|
return score
|
||||||
|
}
|
||||||
|
|
||||||
|
// ToInternalLink renders the headword as a Yomichan internal search link.
// When includeReading is true and the reading differs from the expression,
// both are emitted as separate links in the form 「expression(reading)」;
// otherwise a single link on the expression suffices.
func (h *headword) ToInternalLink(includeReading bool) any {
	if !includeReading || h.Expression == h.Reading {
		return contentInternalLink(
			contentAttr{lang: ISOtoHTML["jpn"]},
			h.Expression,
		)
	} else {
		return contentSpan(
			contentAttr{lang: ISOtoHTML["jpn"]},
			contentInternalLink(contentAttr{}, h.Expression),
			"(",
			contentInternalLink(contentAttr{}, h.Reading),
			")",
		)
	}
}
|
||||||
|
|
||||||
|
// SetFlags derives the headword's boolean flags from JMdict info tags
// (kanji/reading information codes) and frequency tags (priority codes).
func (h *headword) SetFlags(infoTags, freqTags []string) {
	// Any of these frequency codes marks the form as priority.
	priorityTags := []string{"ichi1", "news1", "gai1", "spec1", "spec2"}
	for _, priorityTag := range priorityTags {
		if slices.Contains(freqTags, priorityTag) {
			h.IsPriority = true
			break
		}
	}
	for _, infoTag := range infoTags {
		switch infoTag {
		case "iK", "ik", "io":
			h.IsIrregular = true
		case "oK", "ok":
			h.IsOutdated = true
		case "sK", "sk":
			h.IsSearchOnly = true
		case "rK":
			h.IsRareKanji = true
		case "ateji":
			h.IsAteji = true
		case "gikun":
			h.IsGikun = true
		}
	}
	// "Outdated" subsumes "rare kanji"; don't double-penalize/tag.
	if h.IsOutdated && h.IsRareKanji {
		h.IsRareKanji = false
	}
}
|
||||||
|
|
||||||
|
func (h *headword) SetTermTags(freqTags []string) {
|
||||||
|
h.TermTags = []string{}
|
||||||
|
if h.IsPriority {
|
||||||
|
h.TermTags = append(h.TermTags, priorityTagName)
|
||||||
|
}
|
||||||
|
for _, tag := range freqTags {
|
||||||
|
isNewsFreqTag, _ := regexp.MatchString(`nf\d\d`, tag)
|
||||||
|
if isNewsFreqTag {
|
||||||
|
// nf tags are divided into ranks of 500
|
||||||
|
// (nf01 to nf48), but it will be easier
|
||||||
|
// for the user to read 1k, 2k, etc.
|
||||||
|
var i int
|
||||||
|
if _, err := fmt.Sscanf(tag, "nf%2d", &i); err == nil {
|
||||||
|
i = (i + (i % 2)) / 2
|
||||||
|
newsTag := "news" + strconv.Itoa(i) + "k"
|
||||||
|
h.TermTags = append(h.TermTags, newsTag)
|
||||||
|
}
|
||||||
|
} else if tag == "news1" || tag == "news2" {
|
||||||
|
continue
|
||||||
|
} else {
|
||||||
|
tagWithoutTheNumber := tag[:len(tag)-1] // "ichi", "gai", or "spec"
|
||||||
|
h.TermTags = append(h.TermTags, tagWithoutTheNumber)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if h.IsIrregular {
|
||||||
|
h.TermTags = append(h.TermTags, irregularTagName)
|
||||||
|
}
|
||||||
|
if h.IsOutdated {
|
||||||
|
h.TermTags = append(h.TermTags, outdatedTagName)
|
||||||
|
}
|
||||||
|
if h.IsRareKanji {
|
||||||
|
h.TermTags = append(h.TermTags, rareKanjiTagName)
|
||||||
|
}
|
||||||
|
if h.IsAteji {
|
||||||
|
h.TermTags = append(h.TermTags, atejiTagName)
|
||||||
|
}
|
||||||
|
if h.IsGikun {
|
||||||
|
h.TermTags = append(h.TermTags, gikunTagName)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// newHeadword builds a headword from a kanji element, a reading element, or
// both. With both present, info tags are unioned while priority tags are
// intersected (a pairing is only priority if both its parts are). Exactly
// one of kanji/reading may be nil; a nil reading is expected only for
// search-only kanji forms.
func newHeadword(kanji *jmdict.JmdictKanji, reading *jmdict.JmdictReading) headword {
	h := headword{}
	infoTags := []string{}
	freqTags := []string{}
	if kanji == nil {
		// Kana-only headword: expression and reading coincide.
		h.Expression = reading.Reading
		h.Reading = reading.Reading
		infoTags = reading.Information
		freqTags = reading.Priorities
	} else if reading == nil {
		// should only apply to search-only kanji terms
		h.Expression = kanji.Expression
		h.Reading = ""
		infoTags = kanji.Information
		freqTags = kanji.Priorities
	} else {
		h.Expression = kanji.Expression
		h.Reading = reading.Reading
		infoTags = union(kanji.Information, reading.Information)
		freqTags = intersection(kanji.Priorities, reading.Priorities)
	}
	h.SetFlags(infoTags, freqTags)
	h.SetTermTags(freqTags)
	return h
}
|
||||||
|
|
||||||
|
func areAllKanjiIrregular(allKanji []jmdict.JmdictKanji) bool {
|
||||||
|
// If every kanji form is rare or irregular, then we'll make
|
||||||
|
// kana-only headwords for each kana form.
|
||||||
|
if len(allKanji) == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for _, kanji := range allKanji {
|
||||||
|
h := newHeadword(&kanji, nil)
|
||||||
|
kanjiIsIrregular := h.IsRareKanji || h.IsIrregular || h.IsOutdated || h.IsSearchOnly
|
||||||
|
if !kanjiIsIrregular {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractHeadwords expands a JMdict entry into its headwords: every valid
// kanji×reading pairing (honoring reading restrictions), plus search-only
// kanji forms on their own, plus kana-only headwords for no-kanji readings.
// When all kanji forms are irregular, kana-only headwords are emitted for
// every reading — and placed first. Each headword's Index records its
// position, which feeds the Yomichan term ranking.
func extractHeadwords(entry jmdict.JmdictEntry) []headword {
	headwords := []headword{}
	allKanjiAreIrregular := areAllKanjiIrregular(entry.Kanji)

	if allKanjiAreIrregular {
		// Adding the reading-only terms before kanji+reading
		// terms here for the sake of the Index property,
		// which affects the yomichan term ranking.
		for _, reading := range entry.Readings {
			h := newHeadword(nil, &reading)
			h.Index = len(headwords)
			headwords = append(headwords, h)
		}
	}

	for _, kanji := range entry.Kanji {
		if slices.Contains(kanji.Information, "sK") {
			// Search-only kanji forms do not have associated readings.
			h := newHeadword(&kanji, nil)
			h.Index = len(headwords)
			headwords = append(headwords, h)
			continue
		}
		for _, reading := range entry.Readings {
			if reading.NoKanji != nil {
				// Reading explicitly not tied to any kanji form.
				continue
			} else if slices.Contains(reading.Information, "sk") {
				// Search-only kana forms do not have associated kanji forms.
				continue
			} else if reading.Restrictions != nil && !slices.Contains(reading.Restrictions, kanji.Expression) {
				// Reading restricted to other kanji forms.
				continue
			} else {
				h := newHeadword(&kanji, &reading)
				h.Index = len(headwords)
				headwords = append(headwords, h)
			}
		}
	}

	if !allKanjiAreIrregular {
		// Kana-only headwords: no-kanji readings, entries without kanji,
		// and search-only kana forms.
		noKanjiInEntry := (len(entry.Kanji) == 0)
		for _, reading := range entry.Readings {
			if reading.NoKanji != nil || noKanjiInEntry || slices.Contains(reading.Information, "sk") {
				h := newHeadword(nil, &reading)
				h.Index = len(headwords)
				headwords = append(headwords, h)
			}
		}
	}

	return headwords
}
|
158
jmdictMetadata.go
Normal file
158
jmdictMetadata.go
Normal file
@ -0,0 +1,158 @@
|
|||||||
|
package yomichan
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"foosoft.net/projects/jmdict"
|
||||||
|
"golang.org/x/exp/slices"
|
||||||
|
)
|
||||||
|
|
||||||
|
type sequence = int
|
||||||
|
|
||||||
|
// jmdictMetadata aggregates per-dictionary lookup tables built in a first
// pass over all entries (newJmdictMetadata), used later to render glossary
// structured content and to resolve textual cross-references.
type jmdictMetadata struct {
	language          string                // target language code (e.g. "eng")
	condensedGlosses  map[senseID]string    // "; "-joined untyped glosses per sense
	seqToSenseCount   map[sequence]int      // senses with glosses in this language
	seqToMainHeadword map[sequence]headword // first headword seen per entry
	expHashToReadings map[hash][]string     // distinct readings per bare expression
	headwordHashToSeqs map[hash][]sequence  // entries sharing an exp/reading pair
	references        []string              // all xref/antonym strings, in order seen
	referenceToSeq    map[string]sequence   // resolved by MakeReferenceToSeqMap
	hashToSearchValues map[hash][]searchValue // reference-resolution lookup (built later)
	seqToSearchHashes map[sequence][]searchHash // hashes a reference may match per entry
	hasMultipleForms  map[sequence]bool     // entry has >1 non-search-only headword
	maxSenseCount     int                   // largest per-entry sense count
}
|
||||||
|
|
||||||
|
// senseID identifies one sense of one entry: the entry's sequence number
// and the 1-based sense position (counted per build language).
type senseID struct {
	sequence sequence
	number   int
}
|
||||||
|
|
||||||
|
func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEntry) {
|
||||||
|
|
||||||
|
// Determine how many senses are in this entry for this language
|
||||||
|
if _, ok := meta.seqToSenseCount[entry.Sequence]; !ok {
|
||||||
|
senseCount := 0
|
||||||
|
for _, entrySense := range entry.Sense {
|
||||||
|
for _, gloss := range entrySense.Glossary {
|
||||||
|
if glossContainsLanguage(gloss, meta.language) {
|
||||||
|
senseCount += 1
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
meta.seqToSenseCount[entry.Sequence] = senseCount
|
||||||
|
}
|
||||||
|
|
||||||
|
if meta.seqToSenseCount[entry.Sequence] == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// main headwords (first ones that are found in entries).
|
||||||
|
if _, ok := meta.seqToMainHeadword[entry.Sequence]; !ok {
|
||||||
|
meta.seqToMainHeadword[entry.Sequence] = headword
|
||||||
|
}
|
||||||
|
|
||||||
|
// hash the term pair so we can determine if it's used
|
||||||
|
// in more than one JMdict entry later.
|
||||||
|
headwordHash := headword.Hash()
|
||||||
|
if !slices.Contains(meta.headwordHashToSeqs[headwordHash], entry.Sequence) {
|
||||||
|
meta.headwordHashToSeqs[headwordHash] = append(meta.headwordHashToSeqs[headwordHash], entry.Sequence)
|
||||||
|
}
|
||||||
|
|
||||||
|
// hash the expression so that we can determine if we
|
||||||
|
// need to disambiguate it by displaying its reading
|
||||||
|
// in reference notes later.
|
||||||
|
expHash := headword.ExpHash()
|
||||||
|
if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) {
|
||||||
|
meta.expHashToReadings[expHash] = append(meta.expHashToReadings[expHash], headword.Reading)
|
||||||
|
}
|
||||||
|
|
||||||
|
// e.g. for JMdict (English) we expect to end up with
|
||||||
|
// seqToHashedHeadwords[1260670] == 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】
|
||||||
|
// used for correlating references to sequence numbers later.
|
||||||
|
searchHashes := []searchHash{
|
||||||
|
searchHash{headwordHash, headword.IsPriority},
|
||||||
|
searchHash{expHash, headword.IsPriority},
|
||||||
|
searchHash{headword.ReadingHash(), headword.IsPriority},
|
||||||
|
}
|
||||||
|
for _, x := range searchHashes {
|
||||||
|
if !slices.Contains(meta.seqToSearchHashes[entry.Sequence], x) {
|
||||||
|
meta.seqToSearchHashes[entry.Sequence] = append(meta.seqToSearchHashes[entry.Sequence], x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
currentSenseNumber := 1
|
||||||
|
for _, entrySense := range entry.Sense {
|
||||||
|
if !glossaryContainsLanguage(entrySense.Glossary, meta.language) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if entrySense.RestrictedReadings != nil && !slices.Contains(entrySense.RestrictedReadings, headword.Reading) {
|
||||||
|
currentSenseNumber += 1
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if entrySense.RestrictedKanji != nil && !slices.Contains(entrySense.RestrictedKanji, headword.Expression) {
|
||||||
|
currentSenseNumber += 1
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
allReferences := append(entrySense.References, entrySense.Antonyms...)
|
||||||
|
for _, reference := range allReferences {
|
||||||
|
meta.references = append(meta.references, reference)
|
||||||
|
}
|
||||||
|
|
||||||
|
currentSense := senseID{entry.Sequence, currentSenseNumber}
|
||||||
|
if meta.condensedGlosses[currentSense] == "" {
|
||||||
|
glosses := []string{}
|
||||||
|
for _, gloss := range entrySense.Glossary {
|
||||||
|
if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
|
||||||
|
glosses = append(glosses, gloss.Content)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
meta.condensedGlosses[currentSense] = strings.Join(glosses, "; ")
|
||||||
|
}
|
||||||
|
currentSenseNumber += 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// newJmdictMetadata makes a first pass over the whole dictionary, folding
// every headword of every entry into the metadata tables, resolving
// textual cross-references to sequence numbers, and computing the maximum
// per-entry sense count.
func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata {
	meta := jmdictMetadata{
		language:           langNameToCode[languageName],
		seqToSenseCount:    make(map[sequence]int),
		condensedGlosses:   make(map[senseID]string),
		seqToMainHeadword:  make(map[sequence]headword),
		expHashToReadings:  make(map[hash][]string),
		seqToSearchHashes:  make(map[sequence][]searchHash),
		headwordHashToSeqs: make(map[hash][]sequence),
		references:         []string{},
		hashToSearchValues: nil, // built by MakeReferenceToSeqMap
		referenceToSeq:     nil, // built by MakeReferenceToSeqMap
		hasMultipleForms:   make(map[sequence]bool),
		maxSenseCount:      0,
	}

	for _, entry := range dictionary.Entries {
		headwords := extractHeadwords(entry)
		formCount := 0
		for _, headword := range headwords {
			meta.AddHeadword(headword, entry)
			// Search-only forms don't count toward the displayed forms.
			if !headword.IsSearchOnly {
				formCount += 1
			}
		}
		meta.hasMultipleForms[entry.Sequence] = (formCount > 1)
	}

	// this correlation process will be unnecessary once JMdict
	// includes sequence numbers in its cross-reference data
	meta.MakeReferenceToSeqMap()

	for _, senseCount := range meta.seqToSenseCount {
		if meta.maxSenseCount < senseCount {
			meta.maxSenseCount = senseCount
		}
	}

	return meta
}
|
166
jmdictReferences.go
Normal file
166
jmdictReferences.go
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
package yomichan
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* In the future, JMdict will be updated to include sequence numbers
|
||||||
|
* with each cross reference. At that time, most of the functions and
|
||||||
|
* types defined in this file will become unnecessary. see:
|
||||||
|
* https://www.edrdg.org/jmdict_edict_list/2022/msg00008.html
|
||||||
|
*/
|
||||||
|
|
||||||
|
type searchValue struct {
|
||||||
|
sequence sequence
|
||||||
|
index int
|
||||||
|
isPriority bool
|
||||||
|
}
|
||||||
|
|
||||||
|
type searchHash struct {
|
||||||
|
hash hash
|
||||||
|
isPriority bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseReference(reference string) (headword, int, bool) {
|
||||||
|
// Reference strings in JMDict currently consist of 3 parts at
|
||||||
|
// most, separated by ・ characters. The latter two parts are
|
||||||
|
// optional. When the sense number is not specified, it is
|
||||||
|
// implied to be the first sense.
|
||||||
|
var h headword
|
||||||
|
var senseNumber int
|
||||||
|
ok := true
|
||||||
|
refParts := strings.Split(reference, "・")
|
||||||
|
if len(refParts) == 1 {
|
||||||
|
// (Kanji) or (Reading)
|
||||||
|
h = headword{Expression: refParts[0], Reading: refParts[0]}
|
||||||
|
senseNumber = 1
|
||||||
|
} else if len(refParts) == 2 {
|
||||||
|
// [Kanji + (Reading or Sense)] or (Reading + Sense)
|
||||||
|
val, err := strconv.Atoi(refParts[1])
|
||||||
|
if err == nil {
|
||||||
|
h = headword{Expression: refParts[0], Reading: refParts[0]}
|
||||||
|
senseNumber = val
|
||||||
|
} else {
|
||||||
|
h = headword{Expression: refParts[0], Reading: refParts[1]}
|
||||||
|
senseNumber = 1
|
||||||
|
}
|
||||||
|
} else if len(refParts) == 3 {
|
||||||
|
// Expression + Reading + Sense
|
||||||
|
h = headword{Expression: refParts[0], Reading: refParts[1]}
|
||||||
|
val, err := strconv.Atoi(strings.TrimSpace(refParts[2]))
|
||||||
|
if err == nil {
|
||||||
|
senseNumber = val
|
||||||
|
} else {
|
||||||
|
errortext := "Unexpected format (3rd part not integer) for x-ref \"" + reference + "\""
|
||||||
|
fmt.Println(errortext)
|
||||||
|
ok = false
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
errortext := "Unexpected format for x-ref \"" + reference + "\""
|
||||||
|
fmt.Println(errortext)
|
||||||
|
ok = false
|
||||||
|
}
|
||||||
|
return h, senseNumber, ok
|
||||||
|
}
|
||||||
|
|
||||||
|
func (meta *jmdictMetadata) MakeReferenceToSeqMap() {
|
||||||
|
|
||||||
|
meta.referenceToSeq = make(map[string]sequence)
|
||||||
|
meta.MakeHashToSearchValuesMap()
|
||||||
|
|
||||||
|
for _, reference := range meta.references {
|
||||||
|
if meta.referenceToSeq[reference] != 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seq := meta.FindBestSequence(reference)
|
||||||
|
if seq != 0 {
|
||||||
|
meta.referenceToSeq[reference] = seq
|
||||||
|
} else {
|
||||||
|
fmt.Println("Unable to convert reference to sequence number: `" + reference + "`")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (meta *jmdictMetadata) MakeHashToSearchValuesMap() {
|
||||||
|
meta.hashToSearchValues = make(map[hash][]searchValue)
|
||||||
|
for seq, searchHashes := range meta.seqToSearchHashes {
|
||||||
|
for score, searchHash := range searchHashes {
|
||||||
|
searchValue := searchValue{
|
||||||
|
sequence: seq,
|
||||||
|
index: score,
|
||||||
|
isPriority: searchHash.isPriority,
|
||||||
|
}
|
||||||
|
meta.hashToSearchValues[searchHash.hash] =
|
||||||
|
append(meta.hashToSearchValues[searchHash.hash], searchValue)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Generally, correspondence is determined by the order in which term
|
||||||
|
* pairs are extracted from each JMdict entry. Take for example the
|
||||||
|
* JMdict entry for ご本, which contains a reference to 本 (without a
|
||||||
|
* reading specified). To correlate this reference with a sequence
|
||||||
|
* number, our program searches each entry for the hash of【本・本】.
|
||||||
|
* There are two entries in which it is found in JMdict (English):
|
||||||
|
*
|
||||||
|
* sequence 1260670: 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】
|
||||||
|
* sequence 1522150: 【本・ほん】、【本・本】、【ほん・ほん】
|
||||||
|
*
|
||||||
|
* Because 【本・本】 is closer to the beginning of the array in the
|
||||||
|
* latter (i.e., has the lowest index), sequence number 1522150 is
|
||||||
|
* returned.
|
||||||
|
*
|
||||||
|
* In situations in which multiple sequences are found with the same
|
||||||
|
* score, the entry with a priority tag ("news1", "ichi1", "spec1",
|
||||||
|
* "spec2", "gai1") is given preference. This mostly affects
|
||||||
|
* katakana-only loanwords like ラグ.
|
||||||
|
*
|
||||||
|
* To improve accuracy, this method also checks to see if the
|
||||||
|
* reference's specified sense number really exists in the
|
||||||
|
* corresponding entry. For example, sequence 1582850 【如何で・いかんで】
|
||||||
|
* has a reference to sense #2 of いかん (no kanji specified), which
|
||||||
|
* could belong to 13 different sequences. However, sequences 1582850
|
||||||
|
* and 2829697 are the only 2 of those 13 which contain more than one
|
||||||
|
* sense. Incidentally, sequence 1582850 is the correct match.
|
||||||
|
*
|
||||||
|
* All else being equal, the entry with the smallest sequence number
|
||||||
|
* is chosen. References in the JMdict file are currently ambiguous,
|
||||||
|
* and getting this perfect won't be possible until sequence numbers
|
||||||
|
* are explictly identified in these references. See:
|
||||||
|
* https://github.com/JMdictProject/JMdictIssues/issues/61
|
||||||
|
*/
|
||||||
|
func (meta *jmdictMetadata) FindBestSequence(reference string) sequence {
|
||||||
|
bestSeq := 0
|
||||||
|
lowestIndex := 100000
|
||||||
|
bestIsPriority := false
|
||||||
|
headword, senseNumber, ok := parseReference(reference)
|
||||||
|
if !ok {
|
||||||
|
return bestSeq
|
||||||
|
}
|
||||||
|
hash := headword.Hash()
|
||||||
|
for _, seqScore := range meta.hashToSearchValues[hash] {
|
||||||
|
if meta.seqToSenseCount[seqScore.sequence] < senseNumber {
|
||||||
|
// entry must contain the specified sense
|
||||||
|
continue
|
||||||
|
} else if lowestIndex < seqScore.index {
|
||||||
|
// lower indices are better
|
||||||
|
continue
|
||||||
|
} else if (lowestIndex == seqScore.index) && (bestIsPriority && !seqScore.isPriority) {
|
||||||
|
// if scores match, check priority
|
||||||
|
continue
|
||||||
|
} else if (lowestIndex == seqScore.index) && (bestIsPriority == seqScore.isPriority) && (bestSeq < seqScore.sequence) {
|
||||||
|
// if scores and priority match, check sequence number.
|
||||||
|
// lower sequence numbers are better
|
||||||
|
continue
|
||||||
|
} else {
|
||||||
|
lowestIndex = seqScore.index
|
||||||
|
bestSeq = seqScore.sequence
|
||||||
|
bestIsPriority = seqScore.isPriority
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return bestSeq
|
||||||
|
}
|
348
jmdictTags.go
Normal file
348
jmdictTags.go
Normal file
@ -0,0 +1,348 @@
|
|||||||
|
package yomichan
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
"golang.org/x/exp/slices"
|
||||||
|
)
|
||||||
|
|
||||||
|
func senseNumberTags(maxSenseCount int) []dbTag {
|
||||||
|
tags := []dbTag{}
|
||||||
|
for i := 1; i <= maxSenseCount; i++ {
|
||||||
|
tag := dbTag{
|
||||||
|
Name: strconv.Itoa(i),
|
||||||
|
Order: -10, // these tags will appear on the left side
|
||||||
|
Notes: "JMdict Sense #" + strconv.Itoa(i),
|
||||||
|
}
|
||||||
|
tags = append(tags, tag)
|
||||||
|
}
|
||||||
|
return tags
|
||||||
|
}
|
||||||
|
|
||||||
|
func newsFrequencyTags() []dbTag {
|
||||||
|
// 24,000 ranks divided into 24 tags, news1k ... news24k
|
||||||
|
tags := []dbTag{}
|
||||||
|
for i := 1; i <= 24; i++ {
|
||||||
|
tagName := "news" + strconv.Itoa(i) + "k"
|
||||||
|
var startRank string
|
||||||
|
if i == 1 {
|
||||||
|
startRank = "1"
|
||||||
|
} else {
|
||||||
|
// technically should be ",001", but that looks odd
|
||||||
|
startRank = strconv.Itoa(i-1) + ",000"
|
||||||
|
}
|
||||||
|
endRank := strconv.Itoa(i) + ",000"
|
||||||
|
tag := dbTag{
|
||||||
|
Name: tagName,
|
||||||
|
Order: -2,
|
||||||
|
Score: 0,
|
||||||
|
Category: "frequent",
|
||||||
|
Notes: "ranked between the top " + startRank + " and " + endRank + " words in a frequency analysis of the Mainichi Shimbun (1990s)",
|
||||||
|
}
|
||||||
|
tags = append(tags, tag)
|
||||||
|
}
|
||||||
|
return tags
|
||||||
|
}
|
||||||
|
|
||||||
|
func entityTags(entities map[string]string) []dbTag {
|
||||||
|
tags := knownEntityTags()
|
||||||
|
for name, notes := range entities {
|
||||||
|
idx := slices.IndexFunc(tags, func(t dbTag) bool { return t.Name == name })
|
||||||
|
if idx != -1 {
|
||||||
|
tags[idx].Notes = notes
|
||||||
|
} else {
|
||||||
|
fmt.Println("Unknown tag type \"" + name + "\": " + notes)
|
||||||
|
unknownTag := dbTag{Name: name, Notes: notes}
|
||||||
|
tags = append(tags, unknownTag)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return tags
|
||||||
|
}
|
||||||
|
|
||||||
|
func customDbTags() []dbTag {
|
||||||
|
return []dbTag{
|
||||||
|
dbTag{Name: priorityTagName, Order: -10, Score: 10, Category: "popular", Notes: "high priority term"},
|
||||||
|
dbTag{Name: rareKanjiTagName, Order: 0, Score: -5, Category: "archaism", Notes: "rarely-used kanji form of this expression"},
|
||||||
|
dbTag{Name: irregularTagName, Order: 0, Score: -5, Category: "archaism", Notes: "irregular form of this expression"},
|
||||||
|
dbTag{Name: outdatedTagName, Order: 0, Score: -5, Category: "archaism", Notes: "outdated form of this expression"},
|
||||||
|
dbTag{Name: "ichi", Order: -2, Score: 0, Category: "frequent", Notes: "included in Ichimango Goi Bunruishuu (1万語語彙分類集)"},
|
||||||
|
dbTag{Name: "spec", Order: -2, Score: 0, Category: "frequent", Notes: "specified as common by JMdict editors"},
|
||||||
|
dbTag{Name: "gai", Order: -2, Score: 0, Category: "frequent", Notes: "common loanword (gairaigo・外来語)"},
|
||||||
|
dbTag{Name: "forms", Order: 0, Score: 0, Category: "", Notes: "other surface forms and readings"},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func knownEntityTags() []dbTag {
|
||||||
|
return []dbTag{
|
||||||
|
// see: https://www.edrdg.org/jmdictdb/cgi-bin/edhelp.py?svc=jmdict&sid=#kwabbr
|
||||||
|
// additional descriptions at the beginning of the JMdict file
|
||||||
|
|
||||||
|
// <re_inf> reading info
|
||||||
|
dbTag{Name: "gikun", Order: 0, Score: 0, Category: ""}, // gikun (meaning as reading) or jukujikun (special kanji reading)
|
||||||
|
dbTag{Name: "ik", Order: 0, Score: -5, Category: ""}, // word containing irregular kana usage
|
||||||
|
dbTag{Name: "ok", Order: 0, Score: -5, Category: ""}, // out-dated or obsolete kana usage
|
||||||
|
dbTag{Name: "sk", Order: 0, Score: -5, Category: ""}, // search-only kana form
|
||||||
|
|
||||||
|
// <ke_inf> kanji info
|
||||||
|
/* kanji info also has a "ik" entity that would go here if not already for the re_inf tag */
|
||||||
|
dbTag{Name: "ateji", Order: 0, Score: 0, Category: ""}, // ateji (phonetic) reading
|
||||||
|
dbTag{Name: "iK", Order: 0, Score: -5, Category: ""}, // word containing irregular kanji usage
|
||||||
|
dbTag{Name: "io", Order: 0, Score: -5, Category: ""}, // irregular okurigana usage
|
||||||
|
dbTag{Name: "oK", Order: 0, Score: -5, Category: ""}, // word containing out-dated kanji or kanji usage
|
||||||
|
dbTag{Name: "rK", Order: 0, Score: -5, Category: ""}, // rarely-used kanji form
|
||||||
|
dbTag{Name: "sK", Order: 0, Score: -5, Category: ""}, // search-only kanji form
|
||||||
|
|
||||||
|
// <misc> miscellaneous sense info
|
||||||
|
dbTag{Name: "abbr", Order: 0, Score: 0, Category: ""}, // abbreviation
|
||||||
|
dbTag{Name: "arch", Order: -4, Score: 0, Category: "archaism"}, // archaism
|
||||||
|
dbTag{Name: "char", Order: 0, Score: 0, Category: ""}, // character
|
||||||
|
dbTag{Name: "chn", Order: 0, Score: 0, Category: ""}, // children's language
|
||||||
|
dbTag{Name: "col", Order: 0, Score: 0, Category: ""}, // colloquialism
|
||||||
|
dbTag{Name: "company", Order: 0, Score: 0, Category: ""}, // company name
|
||||||
|
dbTag{Name: "creat", Order: 0, Score: 0, Category: ""}, // creature
|
||||||
|
dbTag{Name: "dated", Order: -4, Score: 0, Category: "archaism"}, // dated term
|
||||||
|
dbTag{Name: "dei", Order: 0, Score: 0, Category: ""}, // deity
|
||||||
|
dbTag{Name: "derog", Order: 0, Score: 0, Category: ""}, // derogatory
|
||||||
|
dbTag{Name: "doc", Order: 0, Score: 0, Category: ""}, // document
|
||||||
|
dbTag{Name: "euph", Order: 0, Score: 0, Category: ""}, // euphemistic
|
||||||
|
dbTag{Name: "ev", Order: 0, Score: 0, Category: ""}, // event
|
||||||
|
dbTag{Name: "fam", Order: 0, Score: 0, Category: ""}, // familiar language
|
||||||
|
dbTag{Name: "fem", Order: 0, Score: 0, Category: ""}, // female term or language
|
||||||
|
dbTag{Name: "fict", Order: 0, Score: 0, Category: ""}, // fiction
|
||||||
|
dbTag{Name: "form", Order: 0, Score: 0, Category: ""}, // formal or literary term
|
||||||
|
dbTag{Name: "given", Order: 0, Score: 0, Category: ""}, // given name or forename, gender not specified
|
||||||
|
dbTag{Name: "group", Order: 0, Score: 0, Category: ""}, // group
|
||||||
|
dbTag{Name: "hist", Order: 0, Score: 0, Category: ""}, // historical term
|
||||||
|
dbTag{Name: "hon", Order: 0, Score: 0, Category: ""}, // honorific or respectful (sonkeigo) language
|
||||||
|
dbTag{Name: "hum", Order: 0, Score: 0, Category: ""}, // humble (kenjougo) language
|
||||||
|
dbTag{Name: "id", Order: -5, Score: 0, Category: "expression"}, // idiomatic expression
|
||||||
|
dbTag{Name: "joc", Order: 0, Score: 0, Category: ""}, // jocular, humorous term
|
||||||
|
dbTag{Name: "leg", Order: 0, Score: 0, Category: ""}, // legend
|
||||||
|
dbTag{Name: "m-sl", Order: 0, Score: 0, Category: ""}, // manga slang
|
||||||
|
dbTag{Name: "male", Order: 0, Score: 0, Category: ""}, // male term or language
|
||||||
|
dbTag{Name: "myth", Order: 0, Score: 0, Category: ""}, // mythology
|
||||||
|
dbTag{Name: "net-sl", Order: 0, Score: 0, Category: ""}, // Internet slang
|
||||||
|
dbTag{Name: "obj", Order: 0, Score: 0, Category: ""}, // object
|
||||||
|
dbTag{Name: "obs", Order: -4, Score: 0, Category: "archaism"}, // obsolete term
|
||||||
|
dbTag{Name: "on-mim", Order: 0, Score: 0, Category: ""}, // onomatopoeic or mimetic word
|
||||||
|
dbTag{Name: "organization", Order: 0, Score: 0, Category: ""}, // organization name
|
||||||
|
dbTag{Name: "oth", Order: 0, Score: 0, Category: ""}, // other
|
||||||
|
dbTag{Name: "person", Order: 0, Score: 0, Category: ""}, // full name of a particular person
|
||||||
|
dbTag{Name: "place", Order: 0, Score: 0, Category: ""}, // place name
|
||||||
|
dbTag{Name: "poet", Order: 0, Score: 0, Category: ""}, // poetical term
|
||||||
|
dbTag{Name: "pol", Order: 0, Score: 0, Category: ""}, // polite (teineigo) language
|
||||||
|
dbTag{Name: "product", Order: 0, Score: 0, Category: ""}, // product name
|
||||||
|
dbTag{Name: "proverb", Order: 0, Score: 0, Category: "expression"}, // proverb
|
||||||
|
dbTag{Name: "quote", Order: 0, Score: 0, Category: "expression"}, // quotation
|
||||||
|
dbTag{Name: "rare", Order: -4, Score: 0, Category: "archaism"}, // rare
|
||||||
|
dbTag{Name: "relig", Order: 0, Score: 0, Category: ""}, // religion
|
||||||
|
dbTag{Name: "sens", Order: 0, Score: 0, Category: ""}, // sensitive
|
||||||
|
dbTag{Name: "serv", Order: 0, Score: 0, Category: ""}, // service
|
||||||
|
dbTag{Name: "ship", Order: 0, Score: 0, Category: ""}, // ship name
|
||||||
|
dbTag{Name: "sl", Order: 0, Score: 0, Category: ""}, // slang
|
||||||
|
dbTag{Name: "station", Order: 0, Score: 0, Category: ""}, // railway station
|
||||||
|
dbTag{Name: "surname", Order: 0, Score: 0, Category: ""}, // family or surname
|
||||||
|
dbTag{Name: "uk", Order: 0, Score: 0, Category: ""}, // word usually written using kana alone
|
||||||
|
dbTag{Name: "unclass", Order: 0, Score: 0, Category: ""}, // unclassified name
|
||||||
|
dbTag{Name: "vulg", Order: 0, Score: 0, Category: ""}, // vulgar expression or word
|
||||||
|
dbTag{Name: "work", Order: 0, Score: 0, Category: ""}, // work of art, literature, music, etc. name
|
||||||
|
dbTag{Name: "X", Order: 0, Score: 0, Category: ""}, // rude or X-rated term (not displayed in educational software)
|
||||||
|
dbTag{Name: "yoji", Order: 0, Score: 0, Category: ""}, // yojijukugo
|
||||||
|
|
||||||
|
// <pos> part-of-speech info
|
||||||
|
dbTag{Name: "adj-f", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun or verb acting prenominally
|
||||||
|
dbTag{Name: "adj-i", Order: -3, Score: 0, Category: "partOfSpeech"}, // adjective (keiyoushi)
|
||||||
|
dbTag{Name: "adj-ix", Order: -3, Score: 0, Category: "partOfSpeech"}, // adjective (keiyoushi) - yoi/ii class
|
||||||
|
dbTag{Name: "adj-kari", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'kari' adjective (archaic)
|
||||||
|
dbTag{Name: "adj-ku", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'ku' adjective (archaic)
|
||||||
|
dbTag{Name: "adj-na", Order: -3, Score: 0, Category: "partOfSpeech"}, // adjectival nouns or quasi-adjectives (keiyodoshi)
|
||||||
|
dbTag{Name: "adj-nari", Order: -3, Score: 0, Category: "partOfSpeech"}, // archaic/formal form of na-adjective
|
||||||
|
dbTag{Name: "adj-no", Order: -3, Score: 0, Category: "partOfSpeech"}, // nouns which may take the genitive case particle 'no'
|
||||||
|
dbTag{Name: "adj-pn", Order: -3, Score: 0, Category: "partOfSpeech"}, // pre-noun adjectival (rentaishi)
|
||||||
|
dbTag{Name: "adj-shiku", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'shiku' adjective (archaic)
|
||||||
|
dbTag{Name: "adj-t", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'taru' adjective
|
||||||
|
dbTag{Name: "adv", Order: -3, Score: 0, Category: "partOfSpeech"}, // adverb (fukushi)
|
||||||
|
dbTag{Name: "adv-to", Order: -3, Score: 0, Category: "partOfSpeech"}, // adverb taking the 'to' particle
|
||||||
|
dbTag{Name: "aux", Order: -3, Score: 0, Category: "partOfSpeech"}, // auxiliary
|
||||||
|
dbTag{Name: "aux-adj", Order: -3, Score: 0, Category: "partOfSpeech"}, // auxiliary adjective
|
||||||
|
dbTag{Name: "aux-v", Order: -3, Score: 0, Category: "partOfSpeech"}, // auxiliary verb
|
||||||
|
dbTag{Name: "conj", Order: -3, Score: 0, Category: "partOfSpeech"}, // conjunction
|
||||||
|
dbTag{Name: "cop", Order: -3, Score: 0, Category: "partOfSpeech"}, // copula
|
||||||
|
dbTag{Name: "ctr", Order: -3, Score: 0, Category: "partOfSpeech"}, // counter
|
||||||
|
dbTag{Name: "exp", Order: -5, Score: 0, Category: "expression"}, // expressions (phrases, clauses, etc.)
|
||||||
|
dbTag{Name: "int", Order: -3, Score: 0, Category: "partOfSpeech"}, // interjection (kandoushi)
|
||||||
|
dbTag{Name: "n", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun (common) (futsuumeishi)
|
||||||
|
dbTag{Name: "n-adv", Order: -3, Score: 0, Category: "partOfSpeech"}, // adverbial noun (fukushitekimeishi)
|
||||||
|
dbTag{Name: "n-pr", Order: -3, Score: 0, Category: "partOfSpeech"}, // proper noun
|
||||||
|
dbTag{Name: "n-pref", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun, used as a prefix
|
||||||
|
dbTag{Name: "n-suf", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun, used as a suffix
|
||||||
|
dbTag{Name: "n-t", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun (temporal) (jisoumeishi)
|
||||||
|
dbTag{Name: "num", Order: -3, Score: 0, Category: "partOfSpeech"}, // numeric
|
||||||
|
dbTag{Name: "pn", Order: -3, Score: 0, Category: "partOfSpeech"}, // pronoun
|
||||||
|
dbTag{Name: "pref", Order: -3, Score: 0, Category: "partOfSpeech"}, // prefix
|
||||||
|
dbTag{Name: "prt", Order: -3, Score: 0, Category: "partOfSpeech"}, // particle
|
||||||
|
dbTag{Name: "suf", Order: -3, Score: 0, Category: "partOfSpeech"}, // suffix
|
||||||
|
dbTag{Name: "unc", Order: -3, Score: 0, Category: "partOfSpeech"}, // unclassified
|
||||||
|
dbTag{Name: "v-unspec", Order: -3, Score: 0, Category: "partOfSpeech"}, // verb unspecified
|
||||||
|
dbTag{Name: "v1", Order: -3, Score: 0, Category: "partOfSpeech"}, // Ichidan verb
|
||||||
|
dbTag{Name: "v1-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Ichidan verb - kureru special class
|
||||||
|
dbTag{Name: "v2a-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb with 'u' ending (archaic)
|
||||||
|
dbTag{Name: "v2b-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'bu' ending (archaic)
|
||||||
|
dbTag{Name: "v2b-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'bu' ending (archaic)
|
||||||
|
dbTag{Name: "v2d-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'dzu' ending (archaic)
|
||||||
|
dbTag{Name: "v2d-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'dzu' ending (archaic)
|
||||||
|
dbTag{Name: "v2g-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'gu' ending (archaic)
|
||||||
|
dbTag{Name: "v2g-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'gu' ending (archaic)
|
||||||
|
dbTag{Name: "v2h-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'hu/fu' ending (archaic)
|
||||||
|
dbTag{Name: "v2h-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'hu/fu' ending (archaic)
|
||||||
|
dbTag{Name: "v2k-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'ku' ending (archaic)
|
||||||
|
dbTag{Name: "v2k-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'ku' ending (archaic)
|
||||||
|
dbTag{Name: "v2m-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'mu' ending (archaic)
|
||||||
|
dbTag{Name: "v2m-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'mu' ending (archaic)
|
||||||
|
dbTag{Name: "v2n-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'nu' ending (archaic)
|
||||||
|
dbTag{Name: "v2r-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'ru' ending (archaic)
|
||||||
|
dbTag{Name: "v2r-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'ru' ending (archaic)
|
||||||
|
dbTag{Name: "v2s-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'su' ending (archaic)
|
||||||
|
dbTag{Name: "v2t-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'tsu' ending (archaic)
|
||||||
|
dbTag{Name: "v2t-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'tsu' ending (archaic)
|
||||||
|
dbTag{Name: "v2w-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'u' ending and 'we' conjugation (archaic)
|
||||||
|
dbTag{Name: "v2y-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'yu' ending (archaic)
|
||||||
|
dbTag{Name: "v2y-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'yu' ending (archaic)
|
||||||
|
dbTag{Name: "v2z-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'zu' ending (archaic)
|
||||||
|
dbTag{Name: "v4b", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'bu' ending (archaic)
|
||||||
|
dbTag{Name: "v4g", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'gu' ending (archaic)
|
||||||
|
dbTag{Name: "v4h", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'hu/fu' ending (archaic)
|
||||||
|
dbTag{Name: "v4k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'ku' ending (archaic)
|
||||||
|
dbTag{Name: "v4m", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'mu' ending (archaic)
|
||||||
|
dbTag{Name: "v4n", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'nu' ending (archaic)
|
||||||
|
dbTag{Name: "v4r", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'ru' ending (archaic)
|
||||||
|
dbTag{Name: "v4s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'su' ending (archaic)
|
||||||
|
dbTag{Name: "v4t", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'tsu' ending (archaic)
|
||||||
|
dbTag{Name: "v5aru", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb - -aru special class
|
||||||
|
dbTag{Name: "v5b", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'bu' ending
|
||||||
|
dbTag{Name: "v5g", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'gu' ending
|
||||||
|
dbTag{Name: "v5k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'ku' ending
|
||||||
|
dbTag{Name: "v5k-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb - Iku/Yuku special class
|
||||||
|
dbTag{Name: "v5m", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'mu' ending
|
||||||
|
dbTag{Name: "v5n", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'nu' ending
|
||||||
|
dbTag{Name: "v5r", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'ru' ending
|
||||||
|
dbTag{Name: "v5r-i", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'ru' ending (irregular verb)
|
||||||
|
dbTag{Name: "v5s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'su' ending
|
||||||
|
dbTag{Name: "v5t", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'tsu' ending
|
||||||
|
dbTag{Name: "v5u", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'u' ending
|
||||||
|
dbTag{Name: "v5u-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'u' ending (special class)
|
||||||
|
dbTag{Name: "v5uru", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb - Uru old class verb (old form of Eru)
|
||||||
|
dbTag{Name: "vi", Order: -3, Score: 0, Category: "partOfSpeech"}, // intransitive verb
|
||||||
|
dbTag{Name: "vk", Order: -3, Score: 0, Category: "partOfSpeech"}, // Kuru verb - special class
|
||||||
|
dbTag{Name: "vn", Order: -3, Score: 0, Category: "partOfSpeech"}, // irregular nu verb
|
||||||
|
dbTag{Name: "vr", Order: -3, Score: 0, Category: "partOfSpeech"}, // irregular ru verb, plain form ends with -ri
|
||||||
|
dbTag{Name: "vs", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun or participle which takes the aux. verb suru
|
||||||
|
dbTag{Name: "vs-c", Order: -3, Score: 0, Category: "partOfSpeech"}, // su verb - precursor to the modern suru
|
||||||
|
dbTag{Name: "vs-i", Order: -3, Score: 0, Category: "partOfSpeech"}, // suru verb - included
|
||||||
|
dbTag{Name: "vs-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // suru verb - special class
|
||||||
|
dbTag{Name: "vt", Order: -3, Score: 0, Category: "partOfSpeech"}, // transitive verb
|
||||||
|
dbTag{Name: "vz", Order: -3, Score: 0, Category: "partOfSpeech"}, // Ichidan verb - zuru verb (alternative form of -jiru verbs)
|
||||||
|
|
||||||
|
// <field> usage domain
|
||||||
|
dbTag{Name: "agric", Order: 0, Score: 0, Category: ""}, // agriculture
|
||||||
|
dbTag{Name: "anat", Order: 0, Score: 0, Category: ""}, // anatomy
|
||||||
|
dbTag{Name: "archeol", Order: 0, Score: 0, Category: ""}, // archeology
|
||||||
|
dbTag{Name: "archit", Order: 0, Score: 0, Category: ""}, // architecture
|
||||||
|
dbTag{Name: "art", Order: 0, Score: 0, Category: ""}, // art, aesthetics
|
||||||
|
dbTag{Name: "astron", Order: 0, Score: 0, Category: ""}, // astronomy
|
||||||
|
dbTag{Name: "audvid", Order: 0, Score: 0, Category: ""}, // audiovisual
|
||||||
|
dbTag{Name: "aviat", Order: 0, Score: 0, Category: ""}, // aviation
|
||||||
|
dbTag{Name: "baseb", Order: 0, Score: 0, Category: ""}, // baseball
|
||||||
|
dbTag{Name: "biochem", Order: 0, Score: 0, Category: ""}, // biochemistry
|
||||||
|
dbTag{Name: "biol", Order: 0, Score: 0, Category: ""}, // biology
|
||||||
|
dbTag{Name: "bot", Order: 0, Score: 0, Category: ""}, // botany
|
||||||
|
dbTag{Name: "Buddh", Order: 0, Score: 0, Category: ""}, // Buddhism
|
||||||
|
dbTag{Name: "bus", Order: 0, Score: 0, Category: ""}, // business
|
||||||
|
dbTag{Name: "cards", Order: 0, Score: 0, Category: ""}, // card games
|
||||||
|
dbTag{Name: "chem", Order: 0, Score: 0, Category: ""}, // chemistry
|
||||||
|
dbTag{Name: "Christn", Order: 0, Score: 0, Category: ""}, // Christianity
|
||||||
|
dbTag{Name: "cloth", Order: 0, Score: 0, Category: ""}, // clothing
|
||||||
|
dbTag{Name: "comp", Order: 0, Score: 0, Category: ""}, // computing
|
||||||
|
dbTag{Name: "cryst", Order: 0, Score: 0, Category: ""}, // crystallography
|
||||||
|
dbTag{Name: "dent", Order: 0, Score: 0, Category: ""}, // dentistry
|
||||||
|
dbTag{Name: "ecol", Order: 0, Score: 0, Category: ""}, // ecology
|
||||||
|
dbTag{Name: "econ", Order: 0, Score: 0, Category: ""}, // economics
|
||||||
|
dbTag{Name: "elec", Order: 0, Score: 0, Category: ""}, // electricity, elec. eng.
|
||||||
|
dbTag{Name: "electr", Order: 0, Score: 0, Category: ""}, // electronics
|
||||||
|
dbTag{Name: "embryo", Order: 0, Score: 0, Category: ""}, // embryology
|
||||||
|
dbTag{Name: "engr", Order: 0, Score: 0, Category: ""}, // engineering
|
||||||
|
dbTag{Name: "ent", Order: 0, Score: 0, Category: ""}, // entomology
|
||||||
|
dbTag{Name: "film", Order: 0, Score: 0, Category: ""}, // film
|
||||||
|
dbTag{Name: "finc", Order: 0, Score: 0, Category: ""}, // finance
|
||||||
|
dbTag{Name: "fish", Order: 0, Score: 0, Category: ""}, // fishing
|
||||||
|
dbTag{Name: "food", Order: 0, Score: 0, Category: ""}, // food, cooking
|
||||||
|
dbTag{Name: "gardn", Order: 0, Score: 0, Category: ""}, // gardening, horticulture
|
||||||
|
dbTag{Name: "genet", Order: 0, Score: 0, Category: ""}, // genetics
|
||||||
|
dbTag{Name: "geogr", Order: 0, Score: 0, Category: ""}, // geography
|
||||||
|
dbTag{Name: "geol", Order: 0, Score: 0, Category: ""}, // geology
|
||||||
|
dbTag{Name: "geom", Order: 0, Score: 0, Category: ""}, // geometry
|
||||||
|
dbTag{Name: "go", Order: 0, Score: 0, Category: ""}, // go (game)
|
||||||
|
dbTag{Name: "golf", Order: 0, Score: 0, Category: ""}, // golf
|
||||||
|
dbTag{Name: "gramm", Order: 0, Score: 0, Category: ""}, // grammar
|
||||||
|
dbTag{Name: "grmyth", Order: 0, Score: 0, Category: ""}, // Greek mythology
|
||||||
|
dbTag{Name: "hanaf", Order: 0, Score: 0, Category: ""}, // hanafuda
|
||||||
|
dbTag{Name: "horse", Order: 0, Score: 0, Category: ""}, // horse racing
|
||||||
|
dbTag{Name: "kabuki", Order: 0, Score: 0, Category: ""}, // kabuki
|
||||||
|
dbTag{Name: "law", Order: 0, Score: 0, Category: ""}, // law
|
||||||
|
dbTag{Name: "ling", Order: 0, Score: 0, Category: ""}, // linguistics
|
||||||
|
dbTag{Name: "logic", Order: 0, Score: 0, Category: ""}, // logic
|
||||||
|
dbTag{Name: "MA", Order: 0, Score: 0, Category: ""}, // martial arts
|
||||||
|
dbTag{Name: "mahj", Order: 0, Score: 0, Category: ""}, // mahjong
|
||||||
|
dbTag{Name: "manga", Order: 0, Score: 0, Category: ""}, // manga
|
||||||
|
dbTag{Name: "math", Order: 0, Score: 0, Category: ""}, // mathematics
|
||||||
|
dbTag{Name: "mech", Order: 0, Score: 0, Category: ""}, // mechanical engineering
|
||||||
|
dbTag{Name: "med", Order: 0, Score: 0, Category: ""}, // medicine
|
||||||
|
dbTag{Name: "met", Order: 0, Score: 0, Category: ""}, // meteorology
|
||||||
|
dbTag{Name: "mil", Order: 0, Score: 0, Category: ""}, // military
|
||||||
|
dbTag{Name: "mining", Order: 0, Score: 0, Category: ""}, // mining
|
||||||
|
dbTag{Name: "music", Order: 0, Score: 0, Category: ""}, // music
|
||||||
|
dbTag{Name: "noh", Order: 0, Score: 0, Category: ""}, // noh
|
||||||
|
dbTag{Name: "ornith", Order: 0, Score: 0, Category: ""}, // ornithology
|
||||||
|
dbTag{Name: "paleo", Order: 0, Score: 0, Category: ""}, // paleontology
|
||||||
|
dbTag{Name: "pathol", Order: 0, Score: 0, Category: ""}, // pathology
|
||||||
|
dbTag{Name: "pharm", Order: 0, Score: 0, Category: ""}, // pharmacy
|
||||||
|
dbTag{Name: "phil", Order: 0, Score: 0, Category: ""}, // philosophy
|
||||||
|
dbTag{Name: "photo", Order: 0, Score: 0, Category: ""}, // photography
|
||||||
|
dbTag{Name: "physics", Order: 0, Score: 0, Category: ""}, // physics
|
||||||
|
dbTag{Name: "physiol", Order: 0, Score: 0, Category: ""}, // physiology
|
||||||
|
dbTag{Name: "politics", Order: 0, Score: 0, Category: ""}, // politics
|
||||||
|
dbTag{Name: "print", Order: 0, Score: 0, Category: ""}, // printing
|
||||||
|
dbTag{Name: "psy", Order: 0, Score: 0, Category: ""}, // psychiatry
|
||||||
|
dbTag{Name: "psyanal", Order: 0, Score: 0, Category: ""}, // psychoanalysis
|
||||||
|
dbTag{Name: "psych", Order: 0, Score: 0, Category: ""}, // psychology
|
||||||
|
dbTag{Name: "rail", Order: 0, Score: 0, Category: ""}, // railway
|
||||||
|
dbTag{Name: "rommyth", Order: 0, Score: 0, Category: ""}, // Roman mythology
|
||||||
|
dbTag{Name: "Shinto", Order: 0, Score: 0, Category: ""}, // Shinto
|
||||||
|
dbTag{Name: "shogi", Order: 0, Score: 0, Category: ""}, // shogi
|
||||||
|
dbTag{Name: "ski", Order: 0, Score: 0, Category: ""}, // skiing
|
||||||
|
dbTag{Name: "sports", Order: 0, Score: 0, Category: ""}, // sports
|
||||||
|
dbTag{Name: "stat", Order: 0, Score: 0, Category: ""}, // statistics
|
||||||
|
dbTag{Name: "stockm", Order: 0, Score: 0, Category: ""}, // stock market
|
||||||
|
dbTag{Name: "sumo", Order: 0, Score: 0, Category: ""}, // sumo
|
||||||
|
dbTag{Name: "telec", Order: 0, Score: 0, Category: ""}, // telecommunications
|
||||||
|
dbTag{Name: "tradem", Order: 0, Score: 0, Category: ""}, // trademark
|
||||||
|
dbTag{Name: "tv", Order: 0, Score: 0, Category: ""}, // television
|
||||||
|
dbTag{Name: "vidg", Order: 0, Score: 0, Category: ""}, // video games
|
||||||
|
dbTag{Name: "zool", Order: 0, Score: 0, Category: ""}, // zoology
|
||||||
|
|
||||||
|
// <dial> dialect
|
||||||
|
dbTag{Name: "bra", Order: 0, Score: 0, Category: ""}, // Brazilian
|
||||||
|
dbTag{Name: "hob", Order: 0, Score: 0, Category: ""}, // Hokkaido-ben
|
||||||
|
dbTag{Name: "ksb", Order: 0, Score: 0, Category: ""}, // Kansai-ben
|
||||||
|
dbTag{Name: "ktb", Order: 0, Score: 0, Category: ""}, // Kantou-ben
|
||||||
|
dbTag{Name: "kyb", Order: 0, Score: 0, Category: ""}, // Kyoto-ben
|
||||||
|
dbTag{Name: "kyu", Order: 0, Score: 0, Category: ""}, // Kyuushuu-ben
|
||||||
|
dbTag{Name: "nab", Order: 0, Score: 0, Category: ""}, // Nagano-ben
|
||||||
|
dbTag{Name: "osb", Order: 0, Score: 0, Category: ""}, // Osaka-ben
|
||||||
|
dbTag{Name: "rkb", Order: 0, Score: 0, Category: ""}, // Ryuukyuu-ben
|
||||||
|
dbTag{Name: "thb", Order: 0, Score: 0, Category: ""}, // Touhoku-ben
|
||||||
|
dbTag{Name: "tsb", Order: 0, Score: 0, Category: ""}, // Tosa-ben
|
||||||
|
dbTag{Name: "tsug", Order: 0, Score: 0, Category: ""}, // Tsugaru-ben
|
||||||
|
}
|
||||||
|
}
|
192
structuredContent.go
Normal file
192
structuredContent.go
Normal file
@ -0,0 +1,192 @@
|
|||||||
|
package yomichan
import (
	"net/url"
)
// contentAttr bundles the optional presentation attributes that can be
// attached to a structured-content element. Zero-valued fields are
// omitted from the generated output by the content* builder functions.
type contentAttr struct {
	lang               string   // language tag emitted as the element's "lang" attribute
	fontStyle          string   // normal, italic
	fontWeight         string   // normal, bold
	fontSize           string   // small, medium, large, smaller, 80%, 125%, etc.
	textDecorationLine []string // underline, overline, line-through
	verticalAlign      string   // baseline, sub, super, text-top, text-bottom, middle, top, bottom
	textAlign          string   // start, end, left, right, center, justify, justify-all, match-parent
	marginTop          int
	marginLeft         int
	marginRight        int
	marginBottom       int
	listStyleType      string // list marker style for list containers (e.g. "none", "circle")
	data               map[string]string // extra key/value pairs emitted as the element's "data" attribute
}
|
||||||
|
|
||||||
|
// contentReduce collapses a content array by concatenating runs of
// adjacent strings. When the result holds exactly one element, that
// element is returned directly rather than a one-element slice.
//
// Examples:
//
//	["one", "two", obj, "four"] -> ["onetwo", obj, "four"]
//	["one", "two"]              -> "onetwo"
func contentReduce(contents []any) any {
	if len(contents) == 1 {
		return contents[0]
	}

	merged := []any{}
	pending := ""

	// flush appends any buffered string run to merged and clears it.
	flush := func() {
		if pending != "" {
			merged = append(merged, pending)
			pending = ""
		}
	}

	for _, item := range contents {
		if text, ok := item.(string); ok {
			pending += text
			continue
		}
		flush()
		merged = append(merged, item)
	}
	flush()

	if len(merged) == 1 {
		return merged[0]
	}
	return merged
}
|
||||||
|
|
||||||
|
func contentStructure(contents ...any) map[string]any {
|
||||||
|
return map[string]any{
|
||||||
|
"type": "structured-content",
|
||||||
|
"content": contentReduce(contents),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func contentRuby(attr contentAttr, ruby string, contents ...any) map[string]any {
|
||||||
|
rubyContent := map[string]any{
|
||||||
|
"tag": "ruby",
|
||||||
|
"content": []any{
|
||||||
|
contentReduce(contents),
|
||||||
|
map[string]string{"tag": "rp", "content": "("},
|
||||||
|
map[string]string{"tag": "rt", "content": ruby},
|
||||||
|
map[string]string{"tag": "rp", "content": ")"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
if attr.lang != "" {
|
||||||
|
rubyContent["lang"] = attr.lang
|
||||||
|
}
|
||||||
|
if len(attr.data) != 0 {
|
||||||
|
rubyContent["data"] = attr.data
|
||||||
|
}
|
||||||
|
return rubyContent
|
||||||
|
}
|
||||||
|
|
||||||
|
func contentInternalLink(attr contentAttr, query string, contents ...any) map[string]any {
|
||||||
|
linkContent := map[string]any{
|
||||||
|
"tag": "a",
|
||||||
|
"href": "?query=" + query + "&wildcards=off",
|
||||||
|
}
|
||||||
|
if len(contents) == 0 {
|
||||||
|
linkContent["content"] = query
|
||||||
|
} else {
|
||||||
|
linkContent["content"] = contentReduce(contents)
|
||||||
|
}
|
||||||
|
if attr.lang != "" {
|
||||||
|
linkContent["lang"] = attr.lang
|
||||||
|
}
|
||||||
|
if len(attr.data) != 0 {
|
||||||
|
linkContent["data"] = attr.data
|
||||||
|
}
|
||||||
|
return linkContent
|
||||||
|
}
|
||||||
|
|
||||||
|
func contentSpan(attr contentAttr, contents ...any) map[string]any {
|
||||||
|
return contentStyledContainer(attr, "span", contents...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func contentDiv(attr contentAttr, contents ...any) map[string]any {
|
||||||
|
return contentStyledContainer(attr, "div", contents...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func contentListItem(attr contentAttr, contents ...any) map[string]any {
|
||||||
|
return contentStyledContainer(attr, "li", contents...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func contentOrderedList(attr contentAttr, contents ...any) map[string]any {
|
||||||
|
return contentStyledContainer(attr, "ol", contents...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func contentUnorderedList(attr contentAttr, contents ...any) map[string]any {
|
||||||
|
return contentStyledContainer(attr, "ul", contents...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func contentTable(attr contentAttr, contents ...any) map[string]any {
|
||||||
|
return contentStyledContainer(attr, "table", contents...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func contentTableHead(attr contentAttr, contents ...any) map[string]any {
|
||||||
|
return contentStyledContainer(attr, "thead", contents...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func contentTableBody(attr contentAttr, contents ...any) map[string]any {
|
||||||
|
return contentStyledContainer(attr, "tbody", contents...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func contentTableRow(attr contentAttr, contents ...any) map[string]any {
|
||||||
|
return contentStyledContainer(attr, "tr", contents...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func contentTableHeadCell(attr contentAttr, contents ...any) map[string]any {
|
||||||
|
return contentStyledContainer(attr, "th", contents...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func contentTableCell(attr contentAttr, contents ...any) map[string]any {
|
||||||
|
return contentStyledContainer(attr, "td", contents...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func contentStyledContainer(attr contentAttr, tag string, contents ...any) map[string]any {
|
||||||
|
container := map[string]any{"tag": tag}
|
||||||
|
container["content"] = contentReduce(contents)
|
||||||
|
if attr.lang != "" {
|
||||||
|
container["lang"] = attr.lang
|
||||||
|
}
|
||||||
|
if len(attr.data) != 0 {
|
||||||
|
container["data"] = attr.data
|
||||||
|
}
|
||||||
|
style := contentStyle(attr)
|
||||||
|
if len(style) != 0 {
|
||||||
|
container["style"] = style
|
||||||
|
}
|
||||||
|
return container
|
||||||
|
}
|
||||||
|
|
||||||
|
func contentStyle(attr contentAttr) map[string]any {
|
||||||
|
style := make(map[string]any)
|
||||||
|
if attr.fontStyle != "" {
|
||||||
|
style["fontStyle"] = attr.fontStyle
|
||||||
|
}
|
||||||
|
if attr.fontWeight != "" {
|
||||||
|
style["fontWeight"] = attr.fontWeight
|
||||||
|
}
|
||||||
|
if attr.fontSize != "" {
|
||||||
|
style["fontSize"] = attr.fontSize
|
||||||
|
}
|
||||||
|
if len(attr.textDecorationLine) != 0 {
|
||||||
|
style["textDecorationLine"] = attr.textDecorationLine
|
||||||
|
}
|
||||||
|
if attr.verticalAlign != "" {
|
||||||
|
style["verticalAlign"] = attr.verticalAlign
|
||||||
|
}
|
||||||
|
if attr.textAlign != "" {
|
||||||
|
style["textAlign"] = attr.textAlign
|
||||||
|
}
|
||||||
|
if attr.marginTop != 0 {
|
||||||
|
style["marginTop"] = attr.marginTop
|
||||||
|
}
|
||||||
|
if attr.marginLeft != 0 {
|
||||||
|
style["marginLeft"] = attr.marginLeft
|
||||||
|
}
|
||||||
|
if attr.marginRight != 0 {
|
||||||
|
style["marginRight"] = attr.marginRight
|
||||||
|
}
|
||||||
|
if attr.marginBottom != 0 {
|
||||||
|
style["marginBottom"] = attr.marginBottom
|
||||||
|
}
|
||||||
|
if attr.listStyleType != "" {
|
||||||
|
style["listStyleType"] = attr.listStyleType
|
||||||
|
}
|
||||||
|
return style
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user