1

Merge pull request #40 from stephenmk/master

New version of JMdict for Yomichan
This commit is contained in:
Alexei Yatskov 2023-01-29 22:30:04 -08:00 committed by GitHub
commit 74de4ce9e5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
28 changed files with 2359 additions and 320 deletions

View File

@ -1,4 +1,4 @@
Copyright 2016-2022 Alex Yatskov
Copyright 2016-2023 Yomichan-Import Authors
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in

View File

@ -19,9 +19,7 @@ const (
DefaultTitle = ""
)
const databaseFormat = 3
type dbRecord []interface{}
type dbRecord []any
type dbRecordList []dbRecord
type dbTag struct {
@ -46,7 +44,7 @@ func (meta dbTagList) crush() dbRecordList {
type dbMeta struct {
Expression string
Mode string
Data interface{}
Data any
}
type dbMetaList []dbMeta
@ -66,7 +64,7 @@ type dbTerm struct {
DefinitionTags []string
Rules []string
Score int
Glossary []string
Glossary []any
Sequence int
TermTags []string
}
@ -142,11 +140,34 @@ func (kanji dbKanjiList) crush() dbRecordList {
return results
}
func writeDb(outputPath, title, revision string, sequenced bool, recordData map[string]dbRecordList, stride int, pretty bool) error {
// dbIndex holds the dictionary-level metadata that writeDb marshals to JSON
// after calling setDefaults on it.
type dbIndex struct {
	Title       string `json:"title"`
	Format      int    `json:"format"`
	Revision    string `json:"revision"`
	Sequenced   bool   `json:"sequenced"`
	Author      string `json:"author"`
	Url         string `json:"url"`
	Description string `json:"description"`
	Attribution string `json:"attribution"`
}

// setDefaults fills in Format, Author and Url when they still hold their zero
// values. Fields that the caller already set are left alone.
func (index *dbIndex) setDefaults() {
	const (
		fallbackFormat = 3
		fallbackAuthor = "yomichan-import"
		fallbackUrl    = "https://github.com/FooSoft/yomichan-import"
	)

	if index.Format == 0 {
		index.Format = fallbackFormat
	}
	if len(index.Author) == 0 {
		index.Author = fallbackAuthor
	}
	if len(index.Url) == 0 {
		index.Url = fallbackUrl
	}
}
func writeDb(outputPath string, index dbIndex, recordData map[string]dbRecordList, stride int, pretty bool) error {
var zbuff bytes.Buffer
zip := zip.NewWriter(&zbuff)
marshalJSON := func(obj interface{}, pretty bool) ([]byte, error) {
marshalJSON := func(obj any, pretty bool) ([]byte, error) {
if pretty {
return json.MarshalIndent(obj, "", " ")
}
@ -186,17 +207,6 @@ func writeDb(outputPath, title, revision string, sequenced bool, recordData map[
}
var err error
var db struct {
Title string `json:"title"`
Format int `json:"format"`
Revision string `json:"revision"`
Sequenced bool `json:"sequenced"`
}
db.Title = title
db.Format = databaseFormat
db.Revision = revision
db.Sequenced = sequenced
for recordType, recordEntries := range recordData {
if _, err := writeDbRecords(recordType, recordEntries); err != nil {
@ -204,7 +214,8 @@ func writeDb(outputPath, title, revision string, sequenced bool, recordData map[
}
}
bytes, err := marshalJSON(db, pretty)
index.setDefaults()
bytes, err := marshalJSON(index, pretty)
if err != nil {
return err
}
@ -252,6 +263,39 @@ func hasString(needle string, haystack []string) bool {
return false
}
// intersection returns the strings that occur in both s1 and s2. The result
// follows the order of s2, contains each string at most once, and is a
// non-nil empty slice when nothing is shared.
func intersection(s1, s2 []string) []string {
	pending := make(map[string]bool)
	for _, item := range s1 {
		pending[item] = true
	}

	shared := []string{}
	for _, item := range s2 {
		if !pending[item] {
			continue
		}
		shared = append(shared, item)
		// Drop the entry so a repeated occurrence in s2 is not emitted again.
		delete(pending, item)
	}
	return shared
}
// union returns every distinct string from s1 followed by every distinct
// string from s2 that was not already seen, preserving first-occurrence
// order. The result is a non-nil empty slice when both inputs are empty.
func union(s1, s2 []string) []string {
	seen := make(map[string]bool)
	merged := []string{}
	for _, list := range [][]string{s1, s2} {
		for _, item := range list {
			if seen[item] {
				continue
			}
			seen[item] = true
			merged = append(merged, item)
		}
	}
	return merged
}
func detectFormat(path string) (string, error) {
switch filepath.Ext(path) {
case ".sqlite":
@ -263,7 +307,7 @@ func detectFormat(path string) (string, error) {
}
switch filepath.Base(path) {
case "JMdict", "JMdict.xml", "JMdict_e", "JMdict_e.xml":
case "JMdict", "JMdict.xml", "JMdict_e", "JMdict_e.xml", "JMdict_e_examp":
return "edict", nil
case "JMnedict", "JMnedict.xml":
return "enamdict", nil
@ -293,7 +337,8 @@ func detectFormat(path string) (string, error) {
func ExportDb(inputPath, outputPath, format, language, title string, stride int, pretty bool) error {
handlers := map[string]func(string, string, string, string, int, bool) error{
"edict": jmdictExportDb,
"edict": jmdExportDb,
"forms": formsExportDb,
"enamdict": jmnedictExportDb,
"epwing": epwingExportDb,
"kanjidic": kanjidicExportDb,

View File

@ -65,7 +65,7 @@ func (e *daijirinExtractor) extractTerms(entry zig.BookEntry, sequence int) []db
for _, reading := range readings {
term := dbTerm{
Expression: reading,
Glossary: []string{entry.Text},
Glossary: []any{entry.Text},
Sequence: sequence,
}
@ -79,7 +79,7 @@ func (e *daijirinExtractor) extractTerms(entry zig.BookEntry, sequence int) []db
term := dbTerm{
Expression: expression,
Reading: reading,
Glossary: []string{entry.Text},
Glossary: []any{entry.Text},
Sequence: sequence,
}

View File

@ -70,7 +70,7 @@ func (e *daijisenExtractor) extractTerms(entry zig.BookEntry, sequence int) []db
if len(expressions) == 0 {
term := dbTerm{
Expression: reading,
Glossary: []string{entry.Text},
Glossary: []any{entry.Text},
Sequence: sequence,
}
@ -82,7 +82,7 @@ func (e *daijisenExtractor) extractTerms(entry zig.BookEntry, sequence int) []db
term := dbTerm{
Expression: expression,
Reading: reading,
Glossary: []string{entry.Text},
Glossary: []any{entry.Text},
Sequence: sequence,
}

246
edict.go
View File

@ -1,246 +0,0 @@
package yomichan
import (
"os"
"strings"
"foosoft.net/projects/jmdict"
)
const jmdictRevision = "jmdict4"
// jmdictBuildRules derives rule identifiers from the term's definition tags
// and registers them through term.addRules. The tags "adj-i", "v1", "vk" and
// "vz" are passed through unchanged; any tag starting with "v5" becomes "v5"
// and any tag starting with "vs-" becomes "vs". Other tags add nothing.
func jmdictBuildRules(term *dbTerm) {
	for _, tag := range term.DefinitionTags {
		switch {
		case tag == "adj-i", tag == "v1", tag == "vk", tag == "vz":
			term.addRules(tag)
		case strings.HasPrefix(tag, "v5"):
			term.addRules("v5")
		case strings.HasPrefix(tag, "vs-"):
			term.addRules("vs")
		}
	}
}
// jmdictBuildScore adjusts term.Score according to the tags already attached
// to the term:
//
//   - definition tag "arch": -100
//   - term tags "news", "ichi", "spec", "gai": +100 each
//   - term tag "P": +500
//   - term tags "iK", "ik", "ok", "oK", "io", "oik": -100 each
//
// jmdictAddPriorities strips the trailing digit from priority codes before
// adding them as term tags, so the loanword priority arrives here as "gai"
// (the name jmdictBuildTagMeta also declares). The original case list only
// tested "gai1", which never matched; "gai1" is kept alongside "gai" so any
// input that did match before still does.
func jmdictBuildScore(term *dbTerm) {
	for _, tag := range term.DefinitionTags {
		if tag == "arch" {
			term.Score -= 100
		}
	}

	for _, tag := range term.TermTags {
		switch tag {
		case "news", "ichi", "spec", "gai", "gai1":
			term.Score += 100
		case "P":
			term.Score += 500
		case "iK", "ik", "ok", "oK", "io", "oik":
			term.Score -= 100
		}
	}
}
// jmdictAddPriorities converts priority codes into term tags. For each
// recognised code ("news", "ichi", "spec" or "gai" followed by 1 or 2) the
// code minus its trailing digit is added as a term tag; codes ending in 1
// additionally add the "P" tag first. Unrecognised codes are ignored.
func jmdictAddPriorities(term *dbTerm, priorities ...string) {
	for _, priority := range priorities {
		switch priority {
		case "news1", "ichi1", "spec1", "gai1":
			term.addTermTags("P")
		case "news2", "ichi2", "spec2", "gai2":
			// recognised, but no "P" tag
		default:
			continue
		}
		term.addTermTags(priority[:len(priority)-1])
	}
}
// jmdictBuildTagMeta builds the tag list for the dictionary. It starts with
// five fixed frequency/popularity tags and then appends one tag per entry of
// entities, using the map key as the tag name and the map value as its notes.
// Selected names additionally receive a category, order and/or score.
// NOTE(review): entities is ranged over directly, so the order of the
// appended tags follows Go's unspecified map iteration order.
func jmdictBuildTagMeta(entities map[string]string) dbTagList {
tags := dbTagList{
dbTag{Name: "news", Notes: "appears frequently in Mainichi Shimbun", Category: "frequent", Order: -2},
dbTag{Name: "ichi", Notes: "listed as common in Ichimango Goi Bunruishuu", Category: "frequent", Order: -2},
dbTag{Name: "spec", Notes: "common words not included in frequency lists", Category: "frequent", Order: -2},
dbTag{Name: "gai", Notes: "common loanword", Category: "frequent", Order: -2},
dbTag{Name: "P", Notes: "popular term", Category: "popular", Order: -10, Score: 10},
}
for name, value := range entities {
tag := dbTag{Name: name, Notes: value}
switch name {
case "exp", "id":
tag.Category = "expression"
tag.Order = -5
case "arch":
tag.Category = "archaism"
tag.Order = -4
case "iK", "ik", "ok", "oK", "io", "oik":
// these tags only get a negative score; category and order stay at zero values
tag.Score = -5
case "adj-f", "adj-i", "adj-ix", "adj-ku", "adj-na", "adj-nari", "adj-no", "adj-pn", "adj-shiku", "adj-t", "adv", "adv-to", "aux-adj",
"aux", "aux-v", "conj", "cop-da", "ctr", "int", "n-adv", "n", "n-pref", "n-pr", "n-suf", "n-t", "num", "pn", "pref", "prt", "suf",
"unc", "v1", "v1-s", "v2a-s", "v2b-k", "v2d-s", "v2g-k", "v2g-s", "v2h-k", "v2h-s", "v2k-k", "v2k-s", "v2m-s", "v2n-s", "v2r-k",
"v2r-s", "v2s-s", "v2t-k", "v2t-s", "v2w-s", "v2y-k", "v2y-s", "v2z-s", "v4b", "v4h", "v4k", "v4m", "v4r", "v4s", "v4t", "v5aru",
"v5b", "v5g", "v5k", "v5k-s", "v5m", "v5n", "v5r-i", "v5r", "v5s", "v5t", "v5u", "v5u-s", "vi", "vk", "vn", "vr", "vs-c", "vs-i",
"vs", "vs-s", "vt", "vz":
tag.Category = "partOfSpeech"
tag.Order = -3
}
// names that match no case are still appended, with only Name and Notes set
tags = append(tags, tag)
}
return tags
}
// jmdictExtractTerms turns one JMdict entry into a list of terms, one per
// (kanji, reading, sense) combination that passes the entry's restriction
// lists and has at least one gloss in the requested language. An empty
// language selects glosses whose Language pointer is nil; otherwise the gloss
// language must equal language exactly.
func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm {
var terms []dbTerm
// convert appends the terms for one reading, optionally paired with a kanji
// form (kanji == nil means the reading itself is the expression).
convert := func(reading jmdict.JmdictReading, kanji *jmdict.JmdictKanji) {
// skip pairings that the reading's restriction list does not allow
if kanji != nil && reading.Restrictions != nil && !hasString(kanji.Expression, reading.Restrictions) {
return
}
var termBase dbTerm
termBase.addTermTags(reading.Information...)
if kanji == nil {
termBase.Expression = reading.Reading
jmdictAddPriorities(&termBase, reading.Priorities...)
} else {
termBase.Expression = kanji.Expression
termBase.Reading = reading.Reading
termBase.addTermTags(kanji.Information...)
// only priorities listed on both the kanji form and the reading are applied
for _, priority := range kanji.Priorities {
if hasString(priority, reading.Priorities) {
jmdictAddPriorities(&termBase, priority)
}
}
}
// partsOfSpeech carries over from the most recent sense that declared any;
// it is updated before the restriction checks, so skipped senses still
// contribute their parts of speech to later ones.
var partsOfSpeech []string
for index, sense := range edictEntry.Sense {
if len(sense.PartsOfSpeech) != 0 {
partsOfSpeech = sense.PartsOfSpeech
}
if sense.RestrictedReadings != nil && !hasString(reading.Reading, sense.RestrictedReadings) {
continue
}
if kanji != nil && sense.RestrictedKanji != nil && !hasString(kanji.Expression, sense.RestrictedKanji) {
continue
}
// base score decreases with the sense's position in the entry
term := dbTerm{
Reading: termBase.Reading,
Expression: termBase.Expression,
Score: len(edictEntry.Sense) - index,
Sequence: edictEntry.Sequence,
}
for _, glossary := range sense.Glossary {
if glossary.Language == nil && language == "" || glossary.Language != nil && language == *glossary.Language {
term.Glossary = append(term.Glossary, glossary.Content)
}
}
// senses without any gloss in the requested language produce no term
if len(term.Glossary) == 0 {
continue
}
term.addDefinitionTags(termBase.DefinitionTags...)
term.addTermTags(termBase.TermTags...)
term.addDefinitionTags(partsOfSpeech...)
term.addDefinitionTags(sense.Fields...)
term.addDefinitionTags(sense.Misc...)
term.addDefinitionTags(sense.Dialects...)
jmdictBuildRules(&term)
jmdictBuildScore(&term)
terms = append(terms, term)
}
}
if len(edictEntry.Kanji) > 0 {
// readings without the NoKanji marker are paired with every kanji form
for _, kanji := range edictEntry.Kanji {
for _, reading := range edictEntry.Readings {
if reading.NoKanji == nil {
convert(reading, &kanji)
}
}
}
// readings carrying the NoKanji marker are emitted on their own
for _, reading := range edictEntry.Readings {
if reading.NoKanji != nil {
convert(reading, nil)
}
}
} else {
// entries without kanji forms: every reading is its own expression
for _, reading := range edictEntry.Readings {
convert(reading, nil)
}
}
return terms
}
// jmdictExportDb reads the JMdict file at inputPath, extracts terms for the
// requested language and writes them together with the tag metadata through
// writeDb. Language names without a known three-letter code select the
// empty language tag. An empty title defaults to "JMdict".
func jmdictExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
	file, err := os.Open(inputPath)
	if err != nil {
		return err
	}
	defer file.Close()

	dict, entities, err := jmdict.LoadJmdictNoTransform(file)
	if err != nil {
		return err
	}

	// A missing key yields "", matching the behaviour for unlisted languages.
	langTag := map[string]string{
		"dutch":     "dut",
		"french":    "fre",
		"german":    "ger",
		"hungarian": "hun",
		"italian":   "ita",
		"russian":   "rus",
		"slovenian": "slv",
		"spanish":   "spa",
		"swedish":   "swe",
	}[language]

	var terms dbTermList
	for _, entry := range dict.Entries {
		terms = append(terms, jmdictExtractTerms(entry, langTag)...)
	}

	if len(title) == 0 {
		title = "JMdict"
	}

	records := map[string]dbRecordList{
		"term": terms.crush(),
		"tag":  jmdictBuildTagMeta(entities).crush(),
	}
	return writeDb(outputPath, title, jmdictRevision, true, records, stride, pretty)
}

View File

@ -6,8 +6,6 @@ import (
"foosoft.net/projects/jmdict"
)
const jmnedictRevision = "jmnedict1"
func jmnedictBuildTagMeta(entities map[string]string) dbTagList {
var tags dbTagList
@ -53,7 +51,9 @@ func jmnedictExtractTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
}
for _, trans := range enamdictEntry.Translations {
term.Glossary = append(term.Glossary, trans.Translations...)
for _, translation := range trans.Translations {
term.Glossary = append(term.Glossary, translation)
}
term.addDefinitionTags(trans.NameTypes...)
}
@ -101,11 +101,16 @@ func jmnedictExportDb(inputPath, outputPath, language, title string, stride int,
"tag": jmnedictBuildTagMeta(entities).crush(),
}
index := dbIndex{
Title: title,
Revision: "jmnedict1",
Sequenced: true,
Attribution: edrdgAttribution,
}
return writeDb(
outputPath,
title,
jmnedictRevision,
true,
index,
recordData,
stride,
pretty,

View File

@ -101,11 +101,15 @@ func epwingExportDb(inputPath, outputPath, language, title string, stride int, p
"term": terms.crush(),
}
index := dbIndex{
Title: title,
Revision: strings.Join(revisions, ";"),
Sequenced: true,
}
return writeDb(
outputPath,
title,
strings.Join(revisions, ";"),
true,
index,
recordData,
stride,
pretty,

View File

@ -7,8 +7,6 @@ import (
"strings"
)
const frequencyRevision = "frequency1"
// frequencyTermsExportDb forwards all of its arguments to frequncyExportDb,
// passing "term_meta" as the final argument, and returns its error.
func frequencyTermsExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
return frequncyExportDb(inputPath, outputPath, language, title, stride, pretty, "term_meta")
}
@ -57,11 +55,15 @@ func frequncyExportDb(inputPath, outputPath, language, title string, stride int,
key: frequencies.crush(),
}
index := dbIndex{
Title: title,
Revision: "frequency1",
Sequenced: false,
}
return writeDb(
outputPath,
title,
frequencyRevision,
false,
index,
recordData,
stride,
pretty,

View File

@ -90,7 +90,7 @@ func (e *gakkenExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTe
for _, reading := range readings {
term := dbTerm{
Expression: reading,
Glossary: []string{entryText},
Glossary: []any{entryText},
Sequence: sequence,
}
@ -107,7 +107,7 @@ func (e *gakkenExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTe
term := dbTerm{
Expression: expression,
Reading: reading,
Glossary: []string{entryText},
Glossary: []any{entryText},
Sequence: sequence,
}

1
go.mod
View File

@ -7,6 +7,7 @@ require (
foosoft.net/projects/zero-epwing-go v0.0.0-20220704035039-bc008453615d
github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e
github.com/mattn/go-sqlite3 v1.14.14
golang.org/x/exp v0.0.0-20221207211629-99ab8fa1c11f
)
require golang.org/x/text v0.3.7 // indirect

2
go.sum
View File

@ -6,5 +6,7 @@ github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e h1:wSQCJiig/QkoUnpvelSP
github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e/go.mod h1:5G2EjwzgZUPnnReoKvPWVneT8APYbyKkihDVAHUi0II=
github.com/mattn/go-sqlite3 v1.14.14 h1:qZgc/Rwetq+MtyE18WhzjokPD93dNqLGNT3QJuLvBGw=
github.com/mattn/go-sqlite3 v1.14.14/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
golang.org/x/exp v0.0.0-20221207211629-99ab8fa1c11f h1:90Jq/vvGVDsqj8QqCynjFw9MCerDguSMODLYII416Y8=
golang.org/x/exp v0.0.0-20221207211629-99ab8fa1c11f/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc=
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=

258
jmdict.go Normal file
View File

@ -0,0 +1,258 @@
package yomichan
import (
"errors"
"os"
"regexp"
"strconv"
"strings"
"foosoft.net/projects/jmdict"
"golang.org/x/exp/slices"
)
// grammarRules maps part-of-speech tags to rule identifiers. "adj-i", "vk"
// and "vz" are kept as they are; tags starting with "v5", "v1" or "vs-" are
// collapsed to "v5", "v1" and "vs" respectively; everything else is dropped.
// One rule is emitted per matching input tag (duplicates are not removed) and
// the result is never nil.
func grammarRules(partsOfSpeech []string) []string {
	rules := []string{}
	for _, pos := range partsOfSpeech {
		switch {
		case pos == "adj-i" || pos == "vk" || pos == "vz":
			rules = append(rules, pos)
		case strings.HasPrefix(pos, "v5"):
			rules = append(rules, "v5")
		case strings.HasPrefix(pos, "v1"):
			rules = append(rules, "v1")
		case strings.HasPrefix(pos, "vs-"):
			rules = append(rules, "vs")
		}
	}
	return rules
}
// calculateTermScore combines four ranking signals into one integer, each
// weighted two orders of magnitude apart: the headword's own score (added),
// and its index, the entry depth and the sense number (all subtracted).
// A senseNumber of 1 contributes nothing.
func calculateTermScore(senseNumber int, depth int, headword headword) int {
	const (
		senseWeight         = 1
		depthWeight         = 100
		entryPositionWeight = 10000
		priorityWeight      = 1000000
	)

	bonus := headword.Score() * priorityWeight
	penalty := (senseNumber-1)*senseWeight +
		depth*depthWeight +
		headword.Index*entryPositionWeight
	return bonus - penalty
}
// doDisplaySenseNumberTag reports whether sense-number tags should be shown
// for this headword. They are only shown in extra mode for the "eng"
// language, and then only if the entry has more than one sense or the
// headword's hash is associated with more than one entry sequence.
func doDisplaySenseNumberTag(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) bool {
	hash := headword.Hash()
	switch {
	case !meta.extraMode, meta.language != "eng":
		return false
	case meta.seqToSenseCount[entry.Sequence] > 1:
		return true
	default:
		return len(meta.headwordHashToSeqs[hash]) > 1
	}
}
// jmdictDatePattern matches a YYYY-MM-DD date. It is compiled once at package
// level rather than on every call.
var jmdictDatePattern = regexp.MustCompile(`\d{4}-\d{2}-\d{2}`)

// jmdictPublicationDate returns the first YYYY-MM-DD date found in the first
// gloss of the first sense of the dictionary's last entry. It returns the
// empty string when the dictionary has no entries, when that last entry has
// no sense or gloss, or when the gloss contains no date. The original indexed
// these slices unconditionally and panicked on such input.
func jmdictPublicationDate(dictionary jmdict.Jmdict) string {
	if len(dictionary.Entries) == 0 {
		return ""
	}
	dateEntry := dictionary.Entries[len(dictionary.Entries)-1]
	if len(dateEntry.Sense) == 0 || len(dateEntry.Sense[0].Glossary) == 0 {
		return ""
	}
	return jmdictDatePattern.FindString(dateEntry.Sense[0].Glossary[0].Content)
}
// createFormsTerm builds the "forms" term for a headword. The second result
// is false when no term should be emitted: outside extra mode, for any
// language other than "eng" (the information would be duplicated if users
// installed more than one version), or when the entry has no multiple forms
// and the headword's hash belongs to exactly one entry sequence.
func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
	if !meta.extraMode || meta.language != "eng" {
		return dbTerm{}, false
	}

	seq := entry.Sequence
	if !meta.hasMultipleForms[seq] && len(meta.headwordHashToSeqs[headword.Hash()]) == 1 {
		return dbTerm{}, false
	}

	formsTerm := baseFormsTerm(entry)
	formsTerm.Expression = headword.Expression
	formsTerm.Reading = headword.Reading
	formsTerm.addTermTags(headword.TermTags...)
	formsTerm.addDefinitionTags("forms")

	// Scored as if it were one sense past the entry's last counted sense.
	formsTerm.Score = calculateTermScore(meta.seqToSenseCount[seq]+1, meta.entryDepth[seq], headword)
	return formsTerm, true
}
// createSearchTerm builds a term whose only glossary item is structured
// content linking to the entry's main headword. The term has no reading, a
// negated sequence number, the rules of every sense in the entry and the
// headword's term tags. The second result is false for any language other
// than "eng", since the information would be duplicated if users installed
// more than one version.
func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
	if meta.language != "eng" {
		return dbTerm{}, false
	}

	var term dbTerm
	term.Expression = headword.Expression
	term.Sequence = -entry.Sequence
	for i := range entry.Sense {
		term.addRules(grammarRules(entry.Sense[i].PartsOfSpeech)...)
	}
	term.addTermTags(headword.TermTags...)
	term.Score = calculateTermScore(1, 0, headword)

	target := meta.seqToMainHeadword[entry.Sequence]
	// Show the reading in the link only when the expression has several.
	showReading := len(meta.expHashToReadings[target.ExpHash()]) > 1
	arrow := contentSpan(
		contentAttr{fontSize: "130%"},
		"⟶",
		target.ToInternalLink(showReading),
	)
	term.Glossary = []any{contentStructure(arrow)}
	return term, true
}
// createSenseTerm builds the term for one sense of an entry as seen from one
// headword. The second result is false when the sense restricts its readings
// or kanji forms and this headword is not among them.
func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
	if sense.RestrictedReadings != nil && !slices.Contains(sense.RestrictedReadings, headword.Reading) ||
		sense.RestrictedKanji != nil && !slices.Contains(sense.RestrictedKanji, headword.Expression) {
		return dbTerm{}, false
	}

	term := dbTerm{
		Expression: headword.Expression,
		Reading:    headword.Reading,
		Sequence:   entry.Sequence,
		Glossary:   createGlossary(sense, meta),
	}
	term.addTermTags(headword.TermTags...)

	// Definition tags: optional sense number first, then the sense's own tags.
	if doDisplaySenseNumberTag(headword, entry, meta) {
		term.addDefinitionTags(strconv.Itoa(senseNumber))
	}
	for _, tags := range [][]string{sense.PartsOfSpeech, sense.Fields, sense.Misc, sense.Dialects} {
		term.addDefinitionTags(tags...)
	}

	term.addRules(grammarRules(sense.PartsOfSpeech)...)
	term.Score = calculateTermScore(senseNumber, meta.entryDepth[entry.Sequence], headword)
	return term, true
}
// extractTerms returns all terms for one headword of one entry. Entries with
// a recorded sense count of zero yield (nil, false). Search-only headwords
// yield just their search term. Otherwise one term is produced per sense that
// has a gloss in meta.language and accepts this headword, followed by the
// optional "forms" term.
func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) ([]dbTerm, bool) {
	if meta.seqToSenseCount[entry.Sequence] == 0 {
		return nil, false
	}

	if headword.IsSearchOnly {
		searchTerm, ok := createSearchTerm(headword, entry, meta)
		if !ok {
			return nil, false
		}
		return []dbTerm{searchTerm}, true
	}

	terms := []dbTerm{}
	senseNumber := 0
	for _, sense := range entry.Sense {
		// Senses without a gloss in the target language do not consume a number.
		if !glossaryContainsLanguage(sense.Glossary, meta.language) {
			continue
		}
		senseNumber++
		if senseTerm, ok := createSenseTerm(sense, senseNumber, headword, entry, meta); ok {
			terms = append(terms, senseTerm)
		}
	}

	if formsTerm, ok := createFormsTerm(headword, entry, meta); ok {
		terms = append(terms, formsTerm)
	}
	return terms, true
}
// jmdExportDb converts the JMdict file at inputPath into a dictionary written
// through writeDb. languageName must be a key of langNameToCode, otherwise an
// error is returned before the input is opened. An empty title defaults to
// "JMdict"; the revision is "JMdict." followed by the publication date read
// from the dictionary.
func jmdExportDb(inputPath string, outputPath string, languageName string, title string, stride int, pretty bool) error {
	if _, known := langNameToCode[languageName]; !known {
		return errors.New("Unrecognized language parameter: " + languageName)
	}
	if title == "" {
		title = "JMdict"
	}

	file, err := os.Open(inputPath)
	if err != nil {
		return err
	}
	defer file.Close()

	dictionary, entities, err := jmdict.LoadJmdictNoTransform(file)
	if err != nil {
		return err
	}

	meta := newJmdictMetadata(dictionary, languageName)

	terms := dbTermList{}
	for _, entry := range dictionary.Entries {
		for _, hw := range extractHeadwords(entry) {
			newTerms, ok := extractTerms(hw, entry, meta)
			if !ok {
				continue
			}
			terms = append(terms, newTerms...)
		}
	}

	tags := dbTagList{}
	tags = append(tags, entityTags(entities)...)
	tags = append(tags, senseNumberTags(meta.maxSenseCount)...)
	tags = append(tags, newsFrequencyTags()...)
	tags = append(tags, customDbTags()...)

	records := map[string]dbRecordList{
		"term": terms.crush(),
		"tag":  tags.crush(),
	}

	index := dbIndex{
		Title:       title,
		Revision:    "JMdict." + jmdictPublicationDate(dictionary),
		Sequenced:   true,
		Attribution: edrdgAttribution,
	}
	return writeDb(outputPath, index, records, stride, pretty)
}

218
jmdict_constants.go Normal file
View File

@ -0,0 +1,218 @@
package yomichan
// LangCode is a (language, code) pair of strings used as the key type of the
// lookup tables in this file.
type LangCode struct {
	language, code string
}
const (
edrdgAttribution = "This publication has included material from the JMdict (EDICT, etc.) dictionary files in accordance with the licence provisions of the Electronic Dictionaries Research Group. See http://www.edrdg.org/"
prioritySymbol = "★"
rareKanjiSymbol = "🅁"
irregularSymbol = "⚠"
outdatedSymbol = "⛬"
defaultSymbol = "㊒"
priorityTagName = "⭐"
rareKanjiTagName = "R"
irregularTagName = "⚠️"
outdatedTagName = "⛬"
atejiTagName = "ateji"
gikunTagName = "gikun"
langMarker = "'🌐 '"
noteMarker = "'📝 '"
infoMarker = "' '"
refMarker = "'➡️ '"
antonymMarker = "'🔄 '"
)
// ISOtoFlag maps a three-letter language code to a flag emoji. Each value is
// the emoji followed by a space, wrapped in single quotes. The empty code
// maps to the same value as "eng".
var ISOtoFlag = map[string]string{
"": "'🇬🇧 '",
"eng": "'🇬🇧 '",
"dut": "'🇳🇱 '",
"fre": "'🇫🇷 '",
"ger": "'🇩🇪 '",
"hun": "'🇭🇺 '",
"ita": "'🇮🇹 '",
"jpn": "'🇯🇵 '",
"rus": "'🇷🇺 '",
"slv": "'🇸🇮 '",
"spa": "'🇪🇸 '",
"swe": "'🇸🇪 '",
}
// langNameToCode maps a language name to its three-letter language code.
// The empty name, "english" and "english_extra" all map to "eng".
var langNameToCode = map[string]string{
"": "eng",
"english": "eng",
"english_extra": "eng",
"dutch": "dut",
"french": "fre",
"german": "ger",
"hungarian": "hun",
"italian": "ita",
"russian": "rus",
"slovenian": "slv",
"spanish": "spa",
"swedish": "swe",
}
// glossTypeCodeToName maps a (language, gloss type code) pair to the name
// of that gloss type. Only "eng" entries are defined.
var glossTypeCodeToName = map[LangCode]string{
	{"eng", "lit"}:  "literally",
	{"eng", "fig"}:  "figuratively",
	{"eng", "expl"}: "", // an explanation does not need to be labelled as one
	{"eng", "tm"}:   "trademark",
}
// refNoteHint maps a (language, reference type code) pair to a short hint
// word. Only "eng" entries are defined.
var refNoteHint = map[LangCode]string{
	{"eng", "xref"}: "see",
	{"eng", "ant"}:  "antonym",
}
// sourceLangTypeCodeToType maps a (language, source-language type code) pair
// to the name of that type. Only "eng" entries are defined.
var sourceLangTypeCodeToType = map[LangCode]string{
	{"eng", "part"}: "partial",
	{"eng", ""}:     "", // an empty code implies "full"
}
var langCodeToName = map[LangCode]string{
LangCode{"eng", "afr"}: "Afrikaans",
LangCode{"eng", "ain"}: "Ainu",
LangCode{"eng", "alg"}: "Algonquian",
LangCode{"eng", "amh"}: "Amharic",
LangCode{"eng", "ara"}: "Arabic",
LangCode{"eng", "arn"}: "Mapudungun",
LangCode{"eng", "bnt"}: "Bantu",
LangCode{"eng", "bre"}: "Breton",
LangCode{"eng", "bul"}: "Bulgarian",
LangCode{"eng", "bur"}: "Burmese",
LangCode{"eng", "chi"}: "Chinese",
LangCode{"eng", "chn"}: "Chinook Jargon",
LangCode{"eng", "cze"}: "Czech",
LangCode{"eng", "dan"}: "Danish",
LangCode{"eng", "dut"}: "Dutch",
LangCode{"eng", "eng"}: "English",
LangCode{"eng", "epo"}: "Esperanto",
LangCode{"eng", "est"}: "Estonian",
LangCode{"eng", "fil"}: "Filipino",
LangCode{"eng", "fin"}: "Finnish",
LangCode{"eng", "fre"}: "French",
LangCode{"eng", "geo"}: "Georgian",
LangCode{"eng", "ger"}: "German",
LangCode{"eng", "glg"}: "Galician",
LangCode{"eng", "grc"}: "Ancient Greek",
LangCode{"eng", "gre"}: "Modern Greek",
LangCode{"eng", "haw"}: "Hawaiian",
LangCode{"eng", "heb"}: "Hebrew",
LangCode{"eng", "hin"}: "Hindi",
LangCode{"eng", "hun"}: "Hungarian",
LangCode{"eng", "ice"}: "Icelandic",
LangCode{"eng", "ind"}: "Indonesian",
LangCode{"eng", "ita"}: "Italian",
LangCode{"eng", "khm"}: "Khmer",
LangCode{"eng", "kor"}: "Korean",
LangCode{"eng", "kur"}: "Kurdish",
LangCode{"eng", "lat"}: "Latin",
LangCode{"eng", "mal"}: "Malayalam",
LangCode{"eng", "mao"}: "Maori",
LangCode{"eng", "may"}: "Malay",
LangCode{"eng", "mnc"}: "Manchu",
LangCode{"eng", "mol"}: "Moldavian", // ISO 639 deprecated (https://iso639-3.sil.org/code/mol)
LangCode{"eng", "mon"}: "Mongolian",
LangCode{"eng", "nor"}: "Norwegian",
LangCode{"eng", "per"}: "Persian",
LangCode{"eng", "pol"}: "Polish",
LangCode{"eng", "por"}: "Portuguese",
LangCode{"eng", "rum"}: "Romanian",
LangCode{"eng", "rus"}: "Russian",
LangCode{"eng", "san"}: "Sanskrit",
LangCode{"eng", "scr"}: "Croatian", // Code doesn't seem to exist in ISO 639. Should be "hrv" instead? (https://iso639-3.sil.org/code/hrv)
LangCode{"eng", "slo"}: "Slovak",
LangCode{"eng", "slv"}: "Slovenian",
LangCode{"eng", "som"}: "Somali",
LangCode{"eng", "spa"}: "Spanish",
LangCode{"eng", "swa"}: "Swahili",
LangCode{"eng", "swe"}: "Swedish",
LangCode{"eng", "tah"}: "Tahitian",
LangCode{"eng", "tam"}: "Tamil",
LangCode{"eng", "tgl"}: "Tagalog",
LangCode{"eng", "tha"}: "Thai",
LangCode{"eng", "tib"}: "Tibetan",
LangCode{"eng", "tur"}: "Turkish",
LangCode{"eng", "ukr"}: "Ukrainian",
LangCode{"eng", "urd"}: "Urdu",
LangCode{"eng", "vie"}: "Vietnamese",
LangCode{"eng", "yid"}: "Yiddish",
}
// https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
var ISOtoHTML = map[string]string{
"afr": "af", // Afrikaans
"ain": "ain", // Ainu
"alg": "alg", // Algonquian
"amh": "am", // Amharic
"ara": "ar", // Arabic
"arn": "arn", // Mapudungun
"bnt": "bnt", // Bantu
"bre": "br", // Breton
"bul": "bg", // Bulgarian
"bur": "my", // Burmese
"chi": "zh", // Chinese
"chn": "chn", // Chinook Jargon
"cze": "cs", // Czech
"dan": "da", // Danish
"dut": "nl", // Dutch
"eng": "en", // English
"epo": "eo", // Esperanto
"est": "et", // Estonian
"fil": "fil", // Filipino
"fin": "fi", // Finnish
"fre": "fr", // French
"geo": "ka", // Georgian
"ger": "de", // German
"glg": "gl", // Galician
"grc": "grc", // Ancient Greek
"gre": "el", // Modern Greek
"haw": "haw", // Hawaiian
"heb": "he", // Hebrew
"hin": "hi", // Hindi
"hun": "hu", // Hungarian
"ice": "is", // Icelandic
"ind": "id", // Indonesian
"ita": "it", // Italian
"jpn": "ja", // Japanese
"khm": "km", // Khmer
"kor": "ko", // Korean
"kur": "ku", // Kurdish
"lat": "la", // Latin
"mal": "ml", // Malayalam
"mao": "mi", // Maori
"may": "ms", // Malay
"mnc": "mnc", // Manchu
"mol": "ro", // Moldavian
"mon": "mn", // Mongolian
"nor": "no", // Norwegian
"per": "fa", // Persian
"pol": "pl", // Polish
"por": "pt", // Portuguese
"rum": "ro", // Romanian
"rus": "ru", // Russian
"san": "sa", // Sanskrit
"scr": "hr", // Croatian
"slo": "sk", // Slovak
"slv": "sl", // Slovenian
"som": "so", // Somali
"spa": "es", // Spanish
"swa": "sw", // Swahili
"swe": "sv", // Swedish
"tah": "ty", // Tahitian
"tam": "ta", // Tamil
"tgl": "tl", // Tagalog
"tha": "th", // Thai
"tib": "bo", // Tibetan
"tur": "tr", // Turkish
"ukr": "uk", // Ukrainian
"urd": "ur", // Urdu
"vie": "vi", // Vietnamese
"yid": "yi", // Yiddish
}

265
jmdict_forms.go Normal file
View File

@ -0,0 +1,265 @@
package yomichan
import (
"os"
"strings"
"foosoft.net/projects/jmdict"
"golang.org/x/exp/slices"
)
// kata2hira returns word with katakana converted to hiragana. Runes in the
// ranges 'ァ'..'ヶ' and 'ヽ'..'ヾ' are shifted down by 0x60; every other rune
// is left unchanged.
func kata2hira(word string) string {
	const kanaOffset = 0x60

	return strings.Map(func(r rune) rune {
		switch {
		case 'ァ' <= r && r <= 'ヶ', 'ヽ' <= r && r <= 'ヾ':
			return r - kanaOffset
		}
		return r
	}, word)
}
// InfoSymbols returns the symbols for the headword's priority, rare-kanji,
// irregular and outdated flags, in that order, joined by " | ". It returns
// the empty string when none of the flags is set.
func (h *headword) InfoSymbols() string {
	flags := []struct {
		set    bool
		symbol string
	}{
		{h.IsPriority, prioritySymbol},
		{h.IsRareKanji, rareKanjiSymbol},
		{h.IsIrregular, irregularSymbol},
		{h.IsOutdated, outdatedSymbol},
	}

	symbols := []string{}
	for _, flag := range flags {
		if flag.set {
			symbols = append(symbols, flag.symbol)
		}
	}
	return strings.Join(symbols, " | ")
}
// GlossText returns the headword's expression, wrapped in 〈〉 when the
// headword is ateji, followed by its info symbols if it has any.
func (h *headword) GlossText() string {
	text := h.Expression
	if h.IsAteji {
		text = "〈" + text + "〉"
	}
	if symbols := h.InfoSymbols(); symbols != "" {
		text += "" + symbols + ""
	}
	return text
}
// TableColHeaderText returns the headword's kanji form for use as a column
// header, wrapped in 〈〉 when the headword is ateji.
func (h *headword) TableColHeaderText() string {
	kanjiForm := h.KanjiForm()
	if !h.IsAteji {
		return kanjiForm
	}
	return "〈" + kanjiForm + "〉"
}
// TableRowHeaderText returns the headword's reading for use as a row header,
// wrapped in 〈〉 when the headword is gikun.
func (h *headword) TableRowHeaderText() string {
	if !h.IsGikun {
		return h.Reading
	}
	return "〈" + h.Reading + "〉"
}
// TableCellText returns the headword's info symbols, or defaultSymbol when
// the headword has none.
func (h *headword) TableCellText() string {
	if symbols := h.InfoSymbols(); symbols != "" {
		return symbols
	}
	return defaultSymbol
}
// KanjiForm returns the headword's expression, or the placeholder "∅" when
// the headword is kana-only.
func (h *headword) KanjiForm() string {
	if h.IsKanaOnly() {
		return "∅"
	}
	return h.Expression
}
// needsFormTable reports whether the headwords call for a table: true as
// soon as a gikun headword is met, or when two headwords that are neither
// search-only nor kana-only have readings that still differ after katakana
// is folded to hiragana (so バカがい and ばかがい count as the same reading).
func needsFormTable(headwords []headword) bool {
	firstReading := ""
	for _, h := range headwords {
		switch {
		case h.IsGikun:
			return true
		case h.IsSearchOnly, h.IsKanaOnly():
			continue
		}

		reading := kata2hira(h.Reading)
		if firstReading == "" {
			firstReading = reading
		} else if firstReading != reading {
			return true
		}
	}
	return false
}
// formTableData collects what is needed to lay out a forms table: the column
// keys (kanji forms) and row keys (readings) in order, the header text for
// each key, and the cell text indexed by reading and then by kanji form.
type formTableData struct {
	kanjiForms, readings         []string
	colHeaderText, rowHeaderText map[string]string
	cellText                     map[string]map[string]string
}
// tableData gathers the rows, columns and cells of a forms table from the
// headwords that are not search-only. Kanji forms and readings are recorded
// in first-seen order; when the same (reading, kanji form) pair occurs more
// than once, the later headword's cell text wins.
func tableData(headwords []headword) formTableData {
	data := formTableData{
		kanjiForms:    []string{},
		readings:      []string{},
		colHeaderText: map[string]string{},
		rowHeaderText: map[string]string{},
		cellText:      map[string]map[string]string{},
	}

	for _, h := range headwords {
		if h.IsSearchOnly {
			continue
		}

		// A header entry exists exactly for the keys already listed, so the
		// header maps double as the "already seen" check.
		kanjiForm := h.KanjiForm()
		if _, seen := data.colHeaderText[kanjiForm]; !seen {
			data.kanjiForms = append(data.kanjiForms, kanjiForm)
			data.colHeaderText[kanjiForm] = h.TableColHeaderText()
		}

		reading := h.Reading
		if _, seen := data.rowHeaderText[reading]; !seen {
			data.readings = append(data.readings, reading)
			data.rowHeaderText[reading] = h.TableRowHeaderText()
			data.cellText[reading] = map[string]string{}
		}

		data.cellText[reading][kanjiForm] = h.TableCellText()
	}
	return data
}
// formsTableGlossary renders the headwords as a single structured-content
// table: one column per kanji form, one row per reading, with an empty
// corner cell at the top left. The table carries the data attribute
// content="formsTable". The result is a one-element glossary.
func formsTableGlossary(headwords []headword) []any {
	data := tableData(headwords)

	plain := contentAttr{}
	centered := contentAttr{textAlign: "center"}
	leftAligned := contentAttr{textAlign: "left"}

	// Header row: empty corner cell, then the kanji forms.
	headerCells := []any{contentTableHeadCell(plain, "")}
	for _, kanjiForm := range data.kanjiForms {
		headerCells = append(headerCells, contentTableHeadCell(centered, data.colHeaderText[kanjiForm]))
	}
	rows := []any{contentTableRow(plain, headerCells...)}

	// Body rows: reading header, then one cell per kanji form.
	for _, reading := range data.readings {
		cells := []any{contentTableHeadCell(leftAligned, data.rowHeaderText[reading])}
		for _, kanjiForm := range data.kanjiForms {
			cells = append(cells, contentTableCell(centered, data.cellText[reading][kanjiForm]))
		}
		rows = append(rows, contentTableRow(plain, cells...))
	}

	table := contentTable(contentAttr{data: map[string]string{"content": "formsTable"}}, rows...)
	return []any{contentStructure(table)}
}
// formsGlossary returns the gloss text of every headword that is not
// search-only, in input order. The result is never nil.
func formsGlossary(headwords []headword) []any {
	items := []any{}
	for _, h := range headwords {
		if !h.IsSearchOnly {
			items = append(items, h.GlossText())
		}
	}
	return items
}
// baseFormsTerm builds the headword-independent part of an entry's "forms"
// term: its sequence number, a glossary listing the entry's headwords (as a
// table when needsFormTable says so, otherwise as a plain list) and the
// rules of every sense.
func baseFormsTerm(entry jmdict.JmdictEntry) dbTerm {
	formsTerm := dbTerm{Sequence: entry.Sequence}

	if headwords := extractHeadwords(entry); needsFormTable(headwords) {
		formsTerm.Glossary = formsTableGlossary(headwords)
	} else {
		formsTerm.Glossary = formsGlossary(headwords)
	}

	for i := range entry.Sense {
		formsTerm.addRules(grammarRules(entry.Sense[i].PartsOfSpeech)...)
	}
	return formsTerm
}
// formsExportDb reads the JMdict file at inputPath and writes a dictionary of
// surface forms to outputPath. Every non-search-only headword gets a copy of
// the entry's base forms term; search-only headwords get a search term when
// createSearchTerm produces one. An empty title defaults to "JMdict Forms".
//
// languageName is accepted for signature parity with the other exporters but
// is not used here: the metadata is built with an empty language name.
// Errors from opening or parsing the input, or from writeDb, are returned
// unchanged.
func formsExportDb(inputPath, outputPath, languageName, title string, stride int, pretty bool) error {
	if title == "" {
		title = "JMdict Forms"
	}

	reader, err := os.Open(inputPath)
	if err != nil {
		return err
	}
	defer reader.Close()

	dictionary, entities, err := jmdict.LoadJmdictNoTransform(reader)
	if err != nil {
		return err
	}

	meta := newJmdictMetadata(dictionary, "")

	terms := dbTermList{}
	for _, entry := range dictionary.Entries {
		formsTerm := baseFormsTerm(entry)
		for _, hw := range extractHeadwords(entry) {
			if !hw.IsSearchOnly {
				// Start from a copy of the shared term and specialize it.
				term := formsTerm
				term.Expression = hw.Expression
				term.Reading = hw.Reading
				term.addTermTags(hw.TermTags...)
				term.Score = calculateTermScore(1, 0, hw)
				terms = append(terms, term)
				continue
			}
			if searchTerm, ok := createSearchTerm(hw, entry, meta); ok {
				terms = append(terms, searchTerm)
			}
		}
	}

	tags := dbTagList{}
	tags = append(tags, entityTags(entities)...)
	tags = append(tags, newsFrequencyTags()...)
	tags = append(tags, customDbTags()...)

	recordData := map[string]dbRecordList{
		"term": terms.crush(),
		"tag":  tags.crush(),
	}

	index := dbIndex{
		Title:       title,
		Revision:    "JMdict." + jmdictPublicationDate(dictionary),
		Sequenced:   true,
		Attribution: edrdgAttribution,
	}

	return writeDb(outputPath, index, recordData, stride, pretty)
}

300
jmdict_glossary.go Normal file
View File

@ -0,0 +1,300 @@
package yomichan
import (
"fmt"
"strconv"
"foosoft.net/projects/jmdict"
)
// glossaryContainsLanguage reports whether at least one gloss in glossary is
// written in the given language (see glossContainsLanguage).
func glossaryContainsLanguage(glossary []jmdict.JmdictGlossary, language string) bool {
	for _, gloss := range glossary {
		if glossContainsLanguage(gloss, language) {
			return true
		}
	}
	return false
}
// glossContainsLanguage reports whether gloss is written in language.
// A gloss without an explicit language attribute counts as "eng".
func glossContainsLanguage(gloss jmdict.JmdictGlossary, language string) bool {
	if gloss.Language == nil {
		return language == "eng"
	}
	return language == *gloss.Language
}
// makeGlossListItem wraps the text of a plain gloss in a list item with
// default attributes. The language parameter is not used.
func makeGlossListItem(gloss jmdict.JmdictGlossary, language string) any {
	return contentListItem(contentAttr{}, gloss.Content)
}
// makeInfoGlossListItem builds a list item for a gloss that carries a type
// (literal, figurative, trademark, etc.), prefixing the gloss text with that
// type. gloss.Type must be non-nil; it is dereferenced unconditionally.
//
// A known type code with a non-empty name is shown as an italic "(name)"
// followed by a space; a known code with an empty name adds no prefix. An
// unknown code is logged and shown as "[code] ".
func makeInfoGlossListItem(gloss jmdict.JmdictGlossary, language string) any {
	typeCode := *gloss.Type
	contents := []any{}

	name, known := glossTypeCodeToName[LangCode{language, typeCode}]
	switch {
	case !known:
		fmt.Println("Unknown glossary type code " + typeCode + " for build language " + language)
		contents = append(contents, "["+typeCode+"] ")
	case name != "":
		label := contentSpan(contentAttr{fontStyle: "italic"}, "("+name+")")
		contents = append(contents, label, " ")
	}

	contents = append(contents, gloss.Content)
	return contentListItem(contentAttr{}, contents...)
}
// makeSourceLangListItem builds a list item describing the language of origin
// of a loanword, laid out as
//
//	[Language] ([Partial?], [Wasei?]): [Original word?]
//
// A missing source language attribute is treated as "eng". Language codes and
// source-type codes that cannot be translated for the build language are
// logged and shown verbatim.
func makeSourceLangListItem(sourceLanguage jmdict.JmdictSource, language string) any {
	srcLangCode := "eng"
	if sourceLanguage.Language != nil {
		srcLangCode = *sourceLanguage.Language
	}

	contents := []any{}

	// [Language]
	langName, known := langCodeToName[LangCode{language, srcLangCode}]
	if !known {
		contents = append(contents, srcLangCode)
		fmt.Println("Unable to convert ISO 639 code " + srcLangCode + " to its full name in language " + language)
	} else {
		contents = append(contents, langName)
	}

	// ([Partial?], [Wasei?])
	typeCode := ""
	if sourceLanguage.Type != nil {
		typeCode = *sourceLanguage.Type
	}
	typeName, known := sourceLangTypeCodeToType[LangCode{language, typeCode}]
	if !known {
		typeName = typeCode
		fmt.Println("Unknown source language type code " + typeCode + " for build language " + language)
	}
	isWasei := sourceLanguage.Wasei == "y"
	switch {
	case typeName != "" && isWasei:
		contents = append(contents, " ("+typeName+", wasei)")
	case typeName != "":
		contents = append(contents, " ("+typeName+")")
	case isWasei:
		contents = append(contents, " (wasei)")
	}

	// : [Original word?]
	if sourceLanguage.Content != "" {
		original := contentSpan(contentAttr{lang: ISOtoHTML[srcLangCode]}, sourceLanguage.Content)
		contents = append(contents, ": ", original)
	}

	return contentListItem(contentAttr{}, contents...)
}
// makeReferenceListItem builds a list item for a cross-reference or antonym
// note. The item starts with the hint text registered for refType in the
// build language, followed by ": ".
//
// When the reference string cannot be parsed, or has no sequence number in
// meta.referenceToSeq, the raw reference is shown inside 【】 brackets.
// Otherwise the item holds an internal link to the referenced headword and a
// small span with the condensed glosses of the target sense. The reading is
// added to the link when the expression has more than one known reading, and
// the sense number is shown when the target entry has more than one sense.
func makeReferenceListItem(reference string, refType string, meta jmdictMetadata) any {
	attr := contentAttr{}
	contents := []any{refNoteHint[LangCode{meta.language, refType}] + ": "}

	// Fallback used whenever the reference cannot be resolved to an entry.
	unresolved := func() any {
		contents = append(contents, "【"+reference+"】")
		return contentListItem(attr, contents...)
	}

	refHeadword, senseNumber, ok := parseReference(reference)
	if !ok {
		return unresolved()
	}
	seq, found := meta.referenceToSeq[reference]
	if !found {
		return unresolved()
	}

	target := senseID{sequence: seq, number: senseNumber}
	showReading := len(meta.expHashToReadings[refHeadword.ExpHash()]) > 1
	showSenseNumber := meta.seqToSenseCount[target.sequence] > 1

	glossAttr := contentAttr{
		fontSize:      "65%",
		verticalAlign: "middle",
		data:          map[string]string{"content": "refGlosses"},
	}

	contents = append(contents, refHeadword.ToInternalLink(showReading))

	glossText := " " + meta.condensedGlosses[target]
	if showSenseNumber {
		glossText = " " + strconv.Itoa(target.number) + ". " + meta.condensedGlosses[target]
	}
	contents = append(contents, contentSpan(glossAttr, glossText))

	return contentListItem(attr, contents...)
}
// makeExampleListItem builds a list item for one example sentence. Japanese
// sentences ("jpn") use default attributes; sentences in any other language
// get that language's HTML lang code and its flag as the list marker.
func makeExampleListItem(sentence jmdict.JmdictExampleSentence) any {
	attr := contentAttr{}
	if sentence.Lang != "jpn" {
		attr = contentAttr{
			lang:          ISOtoHTML[sentence.Lang],
			listStyleType: ISOtoFlag[sentence.Lang],
		}
	}
	return contentListItem(attr, sentence.Text)
}
// listAttr returns the attributes for a glossary list: its language, its
// list marker, and a data map whose "content" key is set to dataContent.
func listAttr(lang string, listStyleType string, dataContent string) contentAttr {
	attr := contentAttr{lang: lang, listStyleType: listStyleType}
	attr.data = map[string]string{"content": dataContent}
	return attr
}
// needsStructuredContent reports whether a sense has anything that plain
// string glosses cannot express: a typed gloss in the build language, source
// language information, sense notes, antonyms, cross-references or example
// sentences.
func needsStructuredContent(sense jmdict.JmdictSense, language string) bool {
	for _, gloss := range sense.Glossary {
		if gloss.Type != nil && glossContainsLanguage(gloss, language) {
			return true
		}
	}
	return len(sense.SourceLanguages) > 0 ||
		len(sense.Information) > 0 ||
		len(sense.Antonyms) > 0 ||
		len(sense.References) > 0 ||
		len(sense.Examples) > 0
}
// createGlossaryContent builds the structured-content glossary for a sense.
// It emits up to seven unordered lists, in this order and only when they are
// non-empty: plain glosses, typed ("information") glosses, source languages,
// sense notes, antonyms, cross-references and example sentences.
func createGlossaryContent(sense jmdict.JmdictSense, meta jmdictMetadata) any {
	glossaryContents := []any{}

	// addList appends items as one unordered list, skipping empty sections.
	addList := func(lang string, marker string, dataContent string, items []any) {
		if len(items) == 0 {
			return
		}
		attr := listAttr(lang, marker, dataContent)
		glossaryContents = append(glossaryContents, contentUnorderedList(attr, items...))
	}

	buildLang := ISOtoHTML[meta.language]
	japanese := ISOtoHTML["jpn"]

	// Glosses in the build language, split into plain and typed ones.
	var plainGlosses, infoGlosses []any
	for _, gloss := range sense.Glossary {
		if !glossContainsLanguage(gloss, meta.language) {
			continue
		}
		if gloss.Type == nil {
			plainGlosses = append(plainGlosses, makeGlossListItem(gloss, meta.language))
		} else {
			infoGlosses = append(infoGlosses, makeInfoGlossListItem(gloss, meta.language))
		}
	}
	addList(buildLang, "circle", "glossary", plainGlosses)
	addList(buildLang, infoMarker, "infoGlossary", infoGlosses)

	// Language-of-origin / loanword information.
	var sourceLangs []any
	for _, sourceLanguage := range sense.SourceLanguages {
		sourceLangs = append(sourceLangs, makeSourceLangListItem(sourceLanguage, meta.language))
	}
	addList(buildLang, langMarker, "sourceLanguages", sourceLangs)

	// Sense notes; tagged as Japanese because notes often contain Japanese text.
	var notes []any
	for _, information := range sense.Information {
		notes = append(notes, contentListItem(contentAttr{}, information))
	}
	addList(japanese, noteMarker, "notes", notes)

	// Antonyms.
	var antonyms []any
	for _, antonym := range sense.Antonyms {
		antonyms = append(antonyms, makeReferenceListItem(antonym, "ant", meta))
	}
	addList(buildLang, antonymMarker, "antonyms", antonyms)

	// Cross-references.
	var references []any
	for _, reference := range sense.References {
		references = append(references, makeReferenceListItem(reference, "xref", meta))
	}
	addList(buildLang, refMarker, "references", references)

	// Example sentences.
	var examples []any
	for _, example := range sense.Examples {
		for _, sentence := range example.Sentences {
			examples = append(examples, makeExampleListItem(sentence))
		}
	}
	addList(japanese, ISOtoFlag["jpn"], "examples", examples)

	return contentStructure(glossaryContents...)
}
// createGlossary returns the glossary for a sense. In extra mode, a sense
// that needs structured content yields a single structured-content item;
// otherwise the glossary is the plain text of every gloss in the build
// language. The result is never nil.
func createGlossary(sense jmdict.JmdictSense, meta jmdictMetadata) []any {
	if meta.extraMode && needsStructuredContent(sense, meta.language) {
		return []any{createGlossaryContent(sense, meta)}
	}

	glossary := []any{}
	for _, gloss := range sense.Glossary {
		if glossContainsLanguage(gloss, meta.language) {
			glossary = append(glossary, gloss.Content)
		}
	}
	return glossary
}

282
jmdict_headword.go Normal file
View File

@ -0,0 +1,282 @@
package yomichan
import (
"fmt"
"hash/fnv"
"regexp"
"strconv"
"foosoft.net/projects/jmdict"
"golang.org/x/exp/slices"
)
// headword is one expression/reading pairing extracted from a JMdict entry,
// together with the flags and term tags derived from its information and
// priority tags.
type headword struct {
	Expression string   // written form; equal to Reading for kana-only headwords
	Reading    string   // kana reading; empty for search-only kanji forms
	TermTags   []string // tag names attached to the exported term
	Index      int      // position within the list returned by extractHeadwords

	IsPriority   bool // has one of the ichi1/news1/gai1/spec1/spec2 priority tags
	IsFrequent   bool // carries more than one priority/frequency tag
	IsIrregular  bool // iK, ik or io
	IsOutdated   bool // oK or ok
	IsRareKanji  bool // rK (cleared when the form is also outdated)
	IsSearchOnly bool // sK or sk
	IsAteji      bool // ateji
	IsGikun      bool // gikun
}

// hash is a 64-bit FNV-1a digest as produced by hashText.
type hash uint64

// Hash identifies the expression/reading pair.
func (h *headword) Hash() hash {
	return hashText(h.Expression + "␞" + h.Reading)
}

// ExpHash identifies the expression alone. It is built like Hash with the
// expression in both positions, so it equals the Hash of a headword whose
// reading is the same string as its expression.
func (h *headword) ExpHash() hash {
	return hashText(h.Expression + "␞" + h.Expression)
}

// ReadingHash identifies the reading alone, built like ExpHash but from the
// reading.
func (h *headword) ReadingHash() hash {
	return hashText(h.Reading + "␞" + h.Reading)
}

// hashText returns the 64-bit FNV-1a digest of s.
func hashText(s string) hash {
	digest := fnv.New64a()
	digest.Write([]byte(s)) // hash.Hash documents that Write never returns an error
	return hash(digest.Sum64())
}

// IsKanaOnly reports whether the headword is written purely in kana: the
// expression equals the reading and every rune is hiragana, katakana,
// halfwidth katakana or a wave dash.
func (h *headword) IsKanaOnly() bool {
	if h.Expression != h.Reading {
		return false
	}
	for _, char := range h.Expression {
		switch {
		case char >= 'ぁ' && char <= 'ヿ':
			// hiragana and katakana range
		case char >= '\uFF65' && char <= '\uFF9F':
			// Halfwidth katakana range. The bounds are written as escapes
			// because the halfwidth literals are easily converted to their
			// fullwidth look-alikes (U+30FB, U+309A) by width-normalizing
			// tools, which would turn this into an empty range.
		case char == '〜':
		default:
			return false
		}
	}
	return true
}

// Score ranks the headword against other forms: +1 each for priority and
// frequent forms, -5 each for irregular, outdated, rare-kanji and search-only
// forms.
func (h *headword) Score() int {
	score := 0
	for _, bonus := range []bool{h.IsPriority, h.IsFrequent} {
		if bonus {
			score += 1
		}
	}
	for _, penalty := range []bool{h.IsIrregular, h.IsOutdated, h.IsRareKanji, h.IsSearchOnly} {
		if penalty {
			score -= 5
		}
	}
	return score
}
// ToInternalLink returns structured content linking to this headword.
//
// When includeReading is false, or the reading is identical to the
// expression, the result is a single internal link to the expression.
// Otherwise it is a span holding a link to the expression followed by a link
// to the reading wrapped in fullwidth parentheses, so the two links do not
// run together. Both variants are tagged with the Japanese HTML lang code.
func (h *headword) ToInternalLink(includeReading bool) any {
	japanese := contentAttr{lang: ISOtoHTML["jpn"]}
	if !includeReading || h.Expression == h.Reading {
		return contentInternalLink(japanese, h.Expression)
	}
	return contentSpan(
		japanese,
		contentInternalLink(contentAttr{}, h.Expression),
		"\uff08", // fullwidth left parenthesis
		contentInternalLink(contentAttr{}, h.Reading),
		"\uff09", // fullwidth right parenthesis
	)
}
// SetFlags derives the boolean flags of the headword from its JMdict
// information tags (infoTags) and priority/frequency tags (freqTags).
//
// Flags are only ever switched on, with one exception: a form that is both
// outdated and rare-kanji is treated as outdated only. Information tags that
// are not recognized are logged and kept verbatim as term tags.
func (h *headword) SetFlags(infoTags, freqTags []string) {
	for _, freqTag := range freqTags {
		switch freqTag {
		case "ichi1", "news1", "gai1", "spec1", "spec2":
			h.IsPriority = true
		}
	}
	if len(freqTags) > 1 {
		h.IsFrequent = true
	}

	for _, infoTag := range infoTags {
		switch infoTag {
		case "ateji":
			h.IsAteji = true
		case "gikun":
			h.IsGikun = true
		case "rK":
			h.IsRareKanji = true
		case "sK", "sk":
			h.IsSearchOnly = true
		case "oK", "ok":
			h.IsOutdated = true
		case "iK", "ik", "io":
			h.IsIrregular = true
		default:
			fmt.Println("Unknown information tag type: " + infoTag)
			h.TermTags = append(h.TermTags, infoTag)
		}
	}

	// "Outdated" takes precedence over "rare kanji".
	if h.IsOutdated {
		h.IsRareKanji = false
	}
}
// newsFrequencyTagPattern recognizes JMdict "nfXX" news-frequency tags.
// It is compiled once here instead of on every tag of every headword. The
// pattern is intentionally the same unanchored expression used before.
var newsFrequencyTagPattern = regexp.MustCompile(`nf\d\d`)

// SetTermTags appends the term tags implied by the headword's flags and by
// its priority/frequency tags, in this order: the priority tag, one tag per
// entry of freqTags, then the irregular, outdated, rare-kanji, ateji and
// gikun tags. SetFlags must have been called first, since the flags are read
// here.
//
// Frequency tags are rewritten as follows:
//   - "nfXX" tags (ranks of 500, nf01 to nf48) are merged pairwise into ranks
//     of 1k: news1k, news2k, ..., news24k;
//   - "news1" and "news2" are dropped, because the nf rankings already cover
//     them;
//   - the known ichi/gai/spec tags lose their trailing digit;
//   - anything else is logged and kept verbatim.
func (h *headword) SetTermTags(freqTags []string) {
	if h.IsPriority {
		h.TermTags = append(h.TermTags, priorityTagName)
	}

	knownFreqTags := []string{"ichi1", "ichi2", "gai1", "gai2", "spec1", "spec2"}
	for _, tag := range freqTags {
		switch {
		case newsFrequencyTagPattern.MatchString(tag):
			var rank int
			if _, err := fmt.Sscanf(tag, "nf%2d", &rank); err == nil {
				rank = (rank + (rank % 2)) / 2 // round the 500-rank up to its 1k bucket
				h.TermTags = append(h.TermTags, "news"+strconv.Itoa(rank)+"k")
			}
		case tag == "news1" || tag == "news2":
			// Derived from the nf rankings, so not needed.
		case slices.Contains(knownFreqTags, tag):
			tagWithoutTheNumber := tag[:len(tag)-1]
			h.TermTags = append(h.TermTags, tagWithoutTheNumber)
		default:
			fmt.Println("Unknown frequency tag type: " + tag)
			h.TermTags = append(h.TermTags, tag)
		}
	}

	if h.IsIrregular {
		h.TermTags = append(h.TermTags, irregularTagName)
	}
	if h.IsOutdated {
		h.TermTags = append(h.TermTags, outdatedTagName)
	}
	if h.IsRareKanji {
		h.TermTags = append(h.TermTags, rareKanjiTagName)
	}
	if h.IsAteji {
		h.TermTags = append(h.TermTags, atejiTagName)
	}
	if h.IsGikun {
		h.TermTags = append(h.TermTags, gikunTagName)
	}
}
// newHeadword builds a headword from a kanji form, a reading, or both, and
// derives its flags and term tags.
//
//   - kanji == nil: kana-only headword; the reading is used as the expression
//     and supplies the information and priority tags.
//   - reading == nil: kanji form without a reading (should only apply to
//     search-only kanji forms); the kanji form supplies the tags.
//   - both given: information tags are the union of both sides, priority
//     tags the intersection.
//
// At least one of the two arguments must be non-nil.
func newHeadword(kanji *jmdict.JmdictKanji, reading *jmdict.JmdictReading) headword {
	var (
		h        headword
		infoTags []string
		freqTags []string
	)

	switch {
	case kanji == nil:
		h.Expression, h.Reading = reading.Reading, reading.Reading
		infoTags, freqTags = reading.Information, reading.Priorities
	case reading == nil:
		h.Expression, h.Reading = kanji.Expression, ""
		infoTags, freqTags = kanji.Information, kanji.Priorities
	default:
		h.Expression, h.Reading = kanji.Expression, reading.Reading
		infoTags = union(kanji.Information, reading.Information)
		freqTags = intersection(kanji.Priorities, reading.Priorities)
	}

	h.SetFlags(infoTags, freqTags)
	h.SetTermTags(freqTags)
	return h
}
// areAllKanjiIrregular reports whether an entry has kanji forms and every one
// of them is rare, irregular, outdated or search-only. When that is the case,
// kana-only headwords are made for each kana form. An entry without kanji
// forms yields false.
func areAllKanjiIrregular(allKanji []jmdict.JmdictKanji) bool {
	for i := range allKanji {
		h := newHeadword(&allKanji[i], nil)
		if !(h.IsRareKanji || h.IsIrregular || h.IsOutdated || h.IsSearchOnly) {
			return false
		}
	}
	return len(allKanji) > 0
}
// extractHeadwords lists every headword of a JMdict entry. Each headword's
// Index is its position in the returned slice, which is never nil.
//
// The order is:
//  1. if every kanji form is irregular (see areAllKanjiIrregular), one
//     kana-only headword per reading; they come first for the sake of the
//     Index property, which affects the yomichan term ranking;
//  2. for each kanji form: a reading-less headword if it is search-only
//     ("sK"), otherwise one headword per compatible reading (readings marked
//     no-kanji or search-only "sk", or restricted to other kanji forms, are
//     skipped);
//  3. unless step 1 applied, one kana-only headword for every reading that is
//     marked no-kanji or search-only, or for every reading when the entry has
//     no kanji forms at all.
func extractHeadwords(entry jmdict.JmdictEntry) []headword {
	headwords := []headword{}
	add := func(h headword) {
		h.Index = len(headwords)
		headwords = append(headwords, h)
	}

	kanaFirst := areAllKanjiIrregular(entry.Kanji)
	if kanaFirst {
		for i := range entry.Readings {
			add(newHeadword(nil, &entry.Readings[i]))
		}
	}

	for i := range entry.Kanji {
		kanji := &entry.Kanji[i]
		if slices.Contains(kanji.Information, "sK") {
			// Search-only kanji forms do not have associated readings.
			add(newHeadword(kanji, nil))
			continue
		}
		for j := range entry.Readings {
			reading := &entry.Readings[j]
			switch {
			case reading.NoKanji != nil:
				// reading is not paired with any kanji form
			case slices.Contains(reading.Information, "sk"):
				// Search-only kana forms do not have associated kanji forms.
			case reading.Restrictions != nil && !slices.Contains(reading.Restrictions, kanji.Expression):
				// reading is restricted to other kanji forms
			default:
				add(newHeadword(kanji, reading))
			}
		}
	}

	if !kanaFirst {
		entryHasNoKanji := len(entry.Kanji) == 0
		for i := range entry.Readings {
			reading := &entry.Readings[i]
			if reading.NoKanji != nil || entryHasNoKanji || slices.Contains(reading.Information, "sk") {
				add(newHeadword(nil, reading))
			}
		}
	}

	return headwords
}

183
jmdict_metadata.go Normal file
View File

@ -0,0 +1,183 @@
package yomichan
import (
"strings"
"foosoft.net/projects/jmdict"
"golang.org/x/exp/slices"
)
// sequence is a JMdict entry sequence number.
type sequence = int
// jmdictMetadata holds the lookup tables gathered from a whole JMdict file
// by newJmdictMetadata before any terms are exported.
type jmdictMetadata struct {
// build language code, looked up from the language name
language string
// glosses of each sense in the build language, joined with "; "
condensedGlosses map[senseID]string
// number of senses per entry that have a gloss in the build language
seqToSenseCount map[sequence]int
// first headword added for each entry
seqToMainHeadword map[sequence]headword
// readings seen for each expression hash (headword.ExpHash)
expHashToReadings map[hash][]string
// entries in which each headword hash (headword.Hash) occurs
headwordHashToSeqs map[hash][]sequence
// every cross-reference and antonym string collected; may hold duplicates
references []string
// resolved references; filled by MakeReferenceToSeqMap
referenceToSeq map[string]sequence
// inverse of seqToSearchHashes; filled by MakeHashToSearchValuesMap
hashToSearchValues map[hash][]searchValue
// distinct search hashes of each entry, in the order they were added
seqToSearchHashes map[sequence][]searchHash
// per-entry depth; filled by CalculateEntryDepth
entryDepth map[sequence]int
// true when an entry has more than one headword that is not search-only
hasMultipleForms map[sequence]bool
// largest value in seqToSenseCount
maxSenseCount int
// true when the language name is "english_extra"
extraMode bool
}
// senseID identifies one sense of one entry. AddHeadword numbers the senses
// from 1, counting only senses that have a gloss in the build language.
type senseID struct {
sequence sequence
number int
}
// CalculateEntryDepth records a depth for entrySequence that is one greater
// than the deepest entry sharing any of the given headwords. Entries found
// through the shared headwords that have no depth yet are given depth 1
// first. An entry none of whose headwords is registered in
// headwordHashToSeqs gets depth 0.
//
// This is to ensure that terms are grouped among their entries of origin and
// displayed in correct sequential order.
func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, entrySequence sequence) {
	deepest := 0
	for i := range headwords {
		for _, seq := range meta.headwordHashToSeqs[headwords[i].Hash()] {
			depth := meta.entryDepth[seq]
			if depth == 0 {
				depth = 1
				meta.entryDepth[seq] = depth
			}
			if candidate := depth + 1; candidate > deepest {
				deepest = candidate
			}
		}
	}
	meta.entryDepth[entrySequence] = deepest
}
// AddHeadword registers one headword of entry in the metadata tables.
//
// On the first call for an entry it counts the senses that have a gloss in
// the build language. Entries without such senses are ignored. Otherwise
// the headword is recorded as the entry's main headword (if it is the
// first), its hashes are indexed, and for every sense that applies to this
// headword the cross-references and antonyms are collected and the condensed
// gloss text is stored.
func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEntry) {
	seq := entry.Sequence

	// Determine how many senses are in this entry for this language.
	senseCount, counted := meta.seqToSenseCount[seq]
	if !counted {
		for _, entrySense := range entry.Sense {
			if glossaryContainsLanguage(entrySense.Glossary, meta.language) {
				senseCount += 1
			}
		}
		meta.seqToSenseCount[seq] = senseCount
	}
	if senseCount == 0 {
		return
	}

	// Main headwords (first ones that are found in entries).
	if _, ok := meta.seqToMainHeadword[seq]; !ok {
		meta.seqToMainHeadword[seq] = headword
	}

	// Hash the term pair so we can determine if it's used in more than one
	// JMdict entry later.
	headwordHash := headword.Hash()
	if !slices.Contains(meta.headwordHashToSeqs[headwordHash], seq) {
		meta.headwordHashToSeqs[headwordHash] = append(meta.headwordHashToSeqs[headwordHash], seq)
	}

	// Hash the expression so that we can determine if we need to
	// disambiguate it by displaying its reading in reference notes later.
	expHash := headword.ExpHash()
	if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) {
		meta.expHashToReadings[expHash] = append(meta.expHashToReadings[expHash], headword.Reading)
	}

	// e.g. for JMdict (English) we expect to end up with
	// seqToSearchHashes[1260670] == 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】
	// used for correlating references to sequence numbers later.
	searchHashes := []searchHash{
		{hash: headwordHash, isPriority: headword.IsPriority},
		{hash: expHash, isPriority: headword.IsPriority},
		{hash: headword.ReadingHash(), isPriority: headword.IsPriority},
	}
	for _, x := range searchHashes {
		if !slices.Contains(meta.seqToSearchHashes[seq], x) {
			meta.seqToSearchHashes[seq] = append(meta.seqToSearchHashes[seq], x)
		}
	}

	// Senses are numbered from 1, counting only senses in the build
	// language, including those that do not apply to this headword.
	senseNumber := 0
	for _, entrySense := range entry.Sense {
		if !glossaryContainsLanguage(entrySense.Glossary, meta.language) {
			continue
		}
		senseNumber += 1

		if entrySense.RestrictedReadings != nil && !slices.Contains(entrySense.RestrictedReadings, headword.Reading) {
			continue
		}
		if entrySense.RestrictedKanji != nil && !slices.Contains(entrySense.RestrictedKanji, headword.Expression) {
			continue
		}

		// Append straight to meta.references. Building a temporary with
		// append(entrySense.References, ...) could write into the spare
		// capacity of the entry's own slice.
		meta.references = append(meta.references, entrySense.References...)
		meta.references = append(meta.references, entrySense.Antonyms...)

		currentSense := senseID{seq, senseNumber}
		if meta.condensedGlosses[currentSense] == "" {
			glosses := []string{}
			for _, gloss := range entrySense.Glossary {
				if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
					glosses = append(glosses, gloss.Content)
				}
			}
			meta.condensedGlosses[currentSense] = strings.Join(glosses, "; ")
		}
	}
}
// newJmdictMetadata scans the whole dictionary once and returns the lookup
// tables needed while exporting terms. languageName selects the build
// language through langNameToCode; the name "english_extra" additionally
// switches on extra mode.
func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata {
	// hashToSearchValues and referenceToSeq start out nil; they are created
	// by MakeReferenceToSeqMap below.
	meta := jmdictMetadata{
		language:           langNameToCode[languageName],
		extraMode:          languageName == "english_extra",
		references:         []string{},
		condensedGlosses:   make(map[senseID]string),
		seqToSenseCount:    make(map[sequence]int),
		seqToMainHeadword:  make(map[sequence]headword),
		seqToSearchHashes:  make(map[sequence][]searchHash),
		expHashToReadings:  make(map[hash][]string),
		headwordHashToSeqs: make(map[hash][]sequence),
		entryDepth:         make(map[sequence]int),
		hasMultipleForms:   make(map[sequence]bool),
	}

	for _, entry := range dictionary.Entries {
		entryHeadwords := extractHeadwords(entry)
		visibleForms := 0
		for _, hw := range entryHeadwords {
			meta.AddHeadword(hw, entry)
			if hw.IsSearchOnly {
				continue
			}
			visibleForms += 1
		}
		meta.CalculateEntryDepth(entryHeadwords, entry.Sequence)
		meta.hasMultipleForms[entry.Sequence] = visibleForms > 1
	}

	// This correlation process will be unnecessary once JMdict includes
	// sequence numbers in its cross-reference data.
	meta.MakeReferenceToSeqMap()

	for _, count := range meta.seqToSenseCount {
		if count > meta.maxSenseCount {
			meta.maxSenseCount = count
		}
	}
	return meta
}

170
jmdict_references.go Normal file
View File

@ -0,0 +1,170 @@
package yomichan
import (
"fmt"
"strconv"
"strings"
)
/*
* In the future, JMdict will be updated to include sequence numbers
* with each cross reference. At that time, most of the functions and
* types defined in this file will become unnecessary. see:
* https://www.edrdg.org/jmdict_edict_list/2022/msg00008.html
*/
// searchValue is one candidate entry for a hashed reference lookup, as
// stored in jmdictMetadata.hashToSearchValues.
type searchValue struct {
// entry the hash was registered for
sequence sequence
// position of the hash within that entry's list of search hashes
index int
// priority flag copied from the corresponding searchHash
isPriority bool
}
// searchHash is a headword hash registered for an entry, together with the
// priority flag of the headword it was computed from.
type searchHash struct {
hash hash
isPriority bool
}
// parseReference splits a JMdict reference string into the headword it names
// and a sense number.
//
// Reference strings currently consist of at most 3 parts separated by ・
// characters; the latter two parts are optional:
//
//	1 part:  (Kanji) or (Reading)
//	2 parts: [Kanji + (Reading or Sense)] or (Reading + Sense)
//	3 parts: Expression + Reading + Sense
//
// When the sense number is not specified, it is implied to be the first
// sense. When no separate reading is given, the reading is set to the same
// string as the expression. The boolean result is false, and a message is
// logged, when the string has more than 3 parts or the third part is not an
// integer.
func parseReference(reference string) (headword, int, bool) {
	var (
		h           headword
		senseNumber int
	)
	ok := true

	parts := strings.Split(reference, "・")
	switch len(parts) {
	case 1:
		h = headword{Expression: parts[0], Reading: parts[0]}
		senseNumber = 1
	case 2:
		// The second part is a sense number if it is an integer, and a
		// reading otherwise.
		if number, err := strconv.Atoi(parts[1]); err != nil {
			h = headword{Expression: parts[0], Reading: parts[1]}
			senseNumber = 1
		} else {
			h = headword{Expression: parts[0], Reading: parts[0]}
			senseNumber = number
		}
	case 3:
		h = headword{Expression: parts[0], Reading: parts[1]}
		if number, err := strconv.Atoi(strings.TrimSpace(parts[2])); err != nil {
			fmt.Println("Unexpected format (3rd part not integer) for x-ref \"" + reference + "\"")
			ok = false
		} else {
			senseNumber = number
		}
	default:
		fmt.Println("Unexpected format for x-ref \"" + reference + "\"")
		ok = false
	}

	return h, senseNumber, ok
}
// MakeReferenceToSeqMap resolves every collected reference string to a
// sequence number with FindBestSequence and stores the result in
// meta.referenceToSeq. It rebuilds meta.hashToSearchValues first.
//
// meta.references may hold the same string many times. Each distinct string
// is looked up once, so a reference that cannot be resolved is searched for
// and reported a single time.
func (meta *jmdictMetadata) MakeReferenceToSeqMap() {
	meta.referenceToSeq = make(map[string]sequence)
	meta.MakeHashToSearchValuesMap()

	attempted := make(map[string]struct{})
	for _, reference := range meta.references {
		if _, done := attempted[reference]; done {
			continue
		}
		attempted[reference] = struct{}{}

		if seq := meta.FindBestSequence(reference); seq != 0 {
			meta.referenceToSeq[reference] = seq
		} else {
			fmt.Println("Unable to convert reference to sequence number: `" + reference + "`")
		}
	}
}
// MakeHashToSearchValuesMap inverts meta.seqToSearchHashes: for every search
// hash it lists the entries registered under it, together with the position
// of the hash in that entry's list and its priority flag. Any previous
// contents of meta.hashToSearchValues are replaced.
func (meta *jmdictMetadata) MakeHashToSearchValuesMap() {
	lookup := make(map[hash][]searchValue)
	for seq, hashes := range meta.seqToSearchHashes {
		for position, registered := range hashes {
			lookup[registered.hash] = append(lookup[registered.hash], searchValue{
				sequence:   seq,
				index:      position,
				isPriority: registered.isPriority,
			})
		}
	}
	meta.hashToSearchValues = lookup
}
/*
 * FindBestSequence attempts to convert a JMdict reference string into a
 * single definite sequence number, returning 0 when the reference cannot
 * be parsed or no entry matches. Reference strings are often ambiguous,
 * so heuristics are used.
 *
 * Generally, correspondence is determined by the order in which term
 * pairs are extracted from each JMdict entry. Take for example the
 * JMdict entry for ご本, which contains a reference to 本 (without a
 * reading specified). To correlate this reference with a sequence
 * number, each entry is searched for the hash of 【本・本】. There are
 * two entries in which it is found in JMdict (English):
 *
 * sequence 1260670: 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】
 * sequence 1522150: 【本・ほん】、【本・本】、【ほん・ほん】
 *
 * Because 【本・本】 is closer to the beginning of the array in the
 * latter (i.e., has the lowest index), sequence number 1522150 is
 * returned.
 *
 * In situations in which multiple sequences are found with the same
 * index, the entry with a priority tag ("news1", "ichi1", "spec1",
 * "spec2", "gai1") is given preference. This mostly affects
 * katakana-only loanwords like ラグ.
 *
 * To improve accuracy, this method also checks that the reference's
 * specified sense number really exists in the corresponding entry. For
 * example, sequence 1582850 (如何で, いかんで) has a reference to sense
 * #2 of いかん (no kanji specified), which could belong to 13 different
 * sequences. However, sequences 1582850 and 2829697 are the only 2 of
 * those 13 which contain more than one sense. Incidentally, sequence
 * 1582850 is the correct match.
 *
 * All else being equal, the entry with the smallest sequence number is
 * chosen. References in the JMdict file are currently ambiguous, and
 * getting this perfect won't be possible until reference sequence
 * numbers are included in the file. See:
 * https://github.com/JMdictProject/JMdictIssues/issues/61
 */
func (meta *jmdictMetadata) FindBestSequence(reference string) sequence {
	bestSeq := 0
	lowestIndex := 100000
	bestIsPriority := false

	refHeadword, senseNumber, ok := parseReference(reference)
	if !ok {
		return bestSeq
	}

	for _, candidate := range meta.hashToSearchValues[refHeadword.Hash()] {
		sameIndex := candidate.index == lowestIndex
		switch {
		case meta.seqToSenseCount[candidate.sequence] < senseNumber:
			// entry must contain the specified sense
			continue
		case candidate.index > lowestIndex:
			// lower indices are better
			continue
		case sameIndex && bestIsPriority && !candidate.isPriority:
			// if indices match, check priority
			continue
		case sameIndex && bestIsPriority == candidate.isPriority && candidate.sequence > bestSeq:
			// if indices and priority match, check sequence number;
			// lower sequence numbers are better
			continue
		}
		lowestIndex = candidate.index
		bestSeq = candidate.sequence
		bestIsPriority = candidate.isPriority
	}
	return bestSeq
}

348
jmdict_tags.go Normal file
View File

@ -0,0 +1,348 @@
package yomichan
import (
"fmt"
"strconv"
"golang.org/x/exp/slices"
)
// senseNumberTags returns one tag per sense number from 1 to maxSenseCount.
// The tag name is the number itself. The result is never nil.
func senseNumberTags(maxSenseCount int) []dbTag {
	tags := []dbTag{}
	for number := 1; number <= maxSenseCount; number++ {
		label := strconv.Itoa(number)
		tags = append(tags, dbTag{
			Name:  label,
			Order: -10, // these tags will appear on the left side
			Notes: "JMdict Sense #" + label,
		})
	}
	return tags
}
// newsFrequencyTags returns the 24 tags news1k ... news24k, which divide
// 24,000 frequency ranks into groups of one thousand.
func newsFrequencyTags() []dbTag {
	tags := []dbTag{}
	for group := 1; group <= 24; group++ {
		// Technically the lower bound should be ",001", but that looks odd.
		startRank := "1"
		if group != 1 {
			startRank = strconv.Itoa(group-1) + ",000"
		}
		endRank := strconv.Itoa(group) + ",000"

		tags = append(tags, dbTag{
			Name:     "news" + strconv.Itoa(group) + "k",
			Order:    -2,
			Score:    0,
			Category: "frequent",
			Notes:    "ranked between the top " + startRank + " and " + endRank + " words in a frequency analysis of the Mainichi Shimbun (1990s)",
		})
	}
	return tags
}
// entityTags returns the known entity tags with their notes replaced by the
// descriptions found in entities (tag name -> description). Entities that
// are not among the known tags are logged and appended as new tags carrying
// only a name and notes.
func entityTags(entities map[string]string) []dbTag {
	tags := knownEntityTags()
	for name, notes := range entities {
		hasName := func(t dbTag) bool { return t.Name == name }
		position := slices.IndexFunc(tags, hasName)
		if position == -1 {
			fmt.Println("Unknown tag type \"" + name + "\": " + notes)
			tags = append(tags, dbTag{Name: name, Notes: notes})
			continue
		}
		tags[position].Notes = notes
	}
	return tags
}
// customDbTags returns the tags defined by this exporter rather than by
// JMdict entities: the priority marker, the rare/irregular/outdated form
// markers, the merged ichi/spec/gai frequency tags and the "forms" tag.
// Fields left out of a literal keep their zero value.
func customDbTags() []dbTag {
	return []dbTag{
		// priority and irregularity markers
		{Name: priorityTagName, Order: -10, Score: 10, Category: "popular", Notes: "high priority term"},
		{Name: rareKanjiTagName, Score: -5, Category: "archaism", Notes: "rarely-used kanji form of this expression"},
		{Name: irregularTagName, Score: -5, Category: "archaism", Notes: "irregular form of this expression"},
		{Name: outdatedTagName, Score: -5, Category: "archaism", Notes: "outdated form of this expression"},
		// frequency tags with their trailing digit removed
		{Name: "ichi", Order: -2, Category: "frequent", Notes: "included in Ichimango Goi Bunruishuu (1万語語彙分類集)"},
		{Name: "spec", Order: -2, Category: "frequent", Notes: "specified as common by JMdict editors"},
		{Name: "gai", Order: -2, Category: "frequent", Notes: "common loanword (gairaigo・外来語)"},
		// tag of the forms dictionary
		{Name: "forms", Notes: "other surface forms and readings"},
	}
}
// knownEntityTags returns the tag definitions for the JMdict entity
// abbreviations listed below, grouped by the XML element in which each
// entity appears. Only non-zero fields are spelled out; Notes is never set
// here, and the trailing comment on each entry records the entity's meaning.
//
// see: https://www.edrdg.org/jmdictdb/cgi-bin/edhelp.py?svc=jmdict&sid=#kwabbr
// additional descriptions at the beginning of the JMdict file
func knownEntityTags() []dbTag {
	return []dbTag{
		// <re_inf> reading info
		{Name: "gikun"}, // gikun (meaning as reading) or jukujikun (special kanji reading)
		{Name: "ik", Score: -5}, // word containing irregular kana usage
		{Name: "ok", Score: -5}, // out-dated or obsolete kana usage
		{Name: "sk", Score: -5}, // search-only kana form

		// <ke_inf> kanji info
		// (kanji info also has a "ik" entity that would go here if not already for the re_inf tag)
		{Name: "ateji"}, // ateji (phonetic) reading
		{Name: "iK", Score: -5}, // word containing irregular kanji usage
		{Name: "io", Score: -5}, // irregular okurigana usage
		{Name: "oK", Score: -5}, // word containing out-dated kanji or kanji usage
		{Name: "rK", Score: -5}, // rarely-used kanji form
		{Name: "sK", Score: -5}, // search-only kanji form

		// <misc> miscellaneous sense info
		{Name: "abbr"}, // abbreviation
		{Name: "arch", Order: -4, Category: "archaism"}, // archaism
		{Name: "char"}, // character
		{Name: "chn"}, // children's language
		{Name: "col"}, // colloquialism
		{Name: "company"}, // company name
		{Name: "creat"}, // creature
		{Name: "dated", Order: -4, Category: "archaism"}, // dated term
		{Name: "dei"}, // deity
		{Name: "derog"}, // derogatory
		{Name: "doc"}, // document
		{Name: "euph"}, // euphemistic
		{Name: "ev"}, // event
		{Name: "fam"}, // familiar language
		{Name: "fem"}, // female term or language
		{Name: "fict"}, // fiction
		{Name: "form"}, // formal or literary term
		{Name: "given"}, // given name or forename, gender not specified
		{Name: "group"}, // group
		{Name: "hist"}, // historical term
		{Name: "hon"}, // honorific or respectful (sonkeigo) language
		{Name: "hum"}, // humble (kenjougo) language
		{Name: "id", Order: -5, Category: "expression"}, // idiomatic expression
		{Name: "joc"}, // jocular, humorous term
		{Name: "leg"}, // legend
		{Name: "m-sl"}, // manga slang
		{Name: "male"}, // male term or language
		{Name: "myth"}, // mythology
		{Name: "net-sl"}, // Internet slang
		{Name: "obj"}, // object
		{Name: "obs", Order: -4, Category: "archaism"}, // obsolete term
		{Name: "on-mim"}, // onomatopoeic or mimetic word
		{Name: "organization"}, // organization name
		{Name: "oth"}, // other
		{Name: "person"}, // full name of a particular person
		{Name: "place"}, // place name
		{Name: "poet"}, // poetical term
		{Name: "pol"}, // polite (teineigo) language
		{Name: "product"}, // product name
		{Name: "proverb", Category: "expression"}, // proverb
		{Name: "quote", Category: "expression"}, // quotation
		{Name: "rare", Order: -4, Category: "archaism"}, // rare
		{Name: "relig"}, // religion
		{Name: "sens"}, // sensitive
		{Name: "serv"}, // service
		{Name: "ship"}, // ship name
		{Name: "sl"}, // slang
		{Name: "station"}, // railway station
		{Name: "surname"}, // family or surname
		{Name: "uk"}, // word usually written using kana alone
		{Name: "unclass"}, // unclassified name
		{Name: "vulg"}, // vulgar expression or word
		{Name: "work"}, // work of art, literature, music, etc. name
		{Name: "X"}, // rude or X-rated term (not displayed in educational software)
		{Name: "yoji"}, // yojijukugo

		// <pos> part-of-speech info
		{Name: "adj-f", Order: -3, Category: "partOfSpeech"}, // noun or verb acting prenominally
		{Name: "adj-i", Order: -3, Category: "partOfSpeech"}, // adjective (keiyoushi)
		{Name: "adj-ix", Order: -3, Category: "partOfSpeech"}, // adjective (keiyoushi) - yoi/ii class
		{Name: "adj-kari", Order: -3, Category: "partOfSpeech"}, // 'kari' adjective (archaic)
		{Name: "adj-ku", Order: -3, Category: "partOfSpeech"}, // 'ku' adjective (archaic)
		{Name: "adj-na", Order: -3, Category: "partOfSpeech"}, // adjectival nouns or quasi-adjectives (keiyodoshi)
		{Name: "adj-nari", Order: -3, Category: "partOfSpeech"}, // archaic/formal form of na-adjective
		{Name: "adj-no", Order: -3, Category: "partOfSpeech"}, // nouns which may take the genitive case particle 'no'
		{Name: "adj-pn", Order: -3, Category: "partOfSpeech"}, // pre-noun adjectival (rentaishi)
		{Name: "adj-shiku", Order: -3, Category: "partOfSpeech"}, // 'shiku' adjective (archaic)
		{Name: "adj-t", Order: -3, Category: "partOfSpeech"}, // 'taru' adjective
		{Name: "adv", Order: -3, Category: "partOfSpeech"}, // adverb (fukushi)
		{Name: "adv-to", Order: -3, Category: "partOfSpeech"}, // adverb taking the 'to' particle
		{Name: "aux", Order: -3, Category: "partOfSpeech"}, // auxiliary
		{Name: "aux-adj", Order: -3, Category: "partOfSpeech"}, // auxiliary adjective
		{Name: "aux-v", Order: -3, Category: "partOfSpeech"}, // auxiliary verb
		{Name: "conj", Order: -3, Category: "partOfSpeech"}, // conjunction
		{Name: "cop", Order: -3, Category: "partOfSpeech"}, // copula
		{Name: "ctr", Order: -3, Category: "partOfSpeech"}, // counter
		{Name: "exp", Order: -5, Category: "expression"}, // expressions (phrases, clauses, etc.)
		{Name: "int", Order: -3, Category: "partOfSpeech"}, // interjection (kandoushi)
		{Name: "n", Order: -3, Category: "partOfSpeech"}, // noun (common) (futsuumeishi)
		{Name: "n-adv", Order: -3, Category: "partOfSpeech"}, // adverbial noun (fukushitekimeishi)
		{Name: "n-pr", Order: -3, Category: "partOfSpeech"}, // proper noun
		{Name: "n-pref", Order: -3, Category: "partOfSpeech"}, // noun, used as a prefix
		{Name: "n-suf", Order: -3, Category: "partOfSpeech"}, // noun, used as a suffix
		{Name: "n-t", Order: -3, Category: "partOfSpeech"}, // noun (temporal) (jisoumeishi)
		{Name: "num", Order: -3, Category: "partOfSpeech"}, // numeric
		{Name: "pn", Order: -3, Category: "partOfSpeech"}, // pronoun
		{Name: "pref", Order: -3, Category: "partOfSpeech"}, // prefix
		{Name: "prt", Order: -3, Category: "partOfSpeech"}, // particle
		{Name: "suf", Order: -3, Category: "partOfSpeech"}, // suffix
		{Name: "unc", Order: -3, Category: "partOfSpeech"}, // unclassified
		{Name: "v-unspec", Order: -3, Category: "partOfSpeech"}, // verb unspecified
		{Name: "v1", Order: -3, Category: "partOfSpeech"}, // Ichidan verb
		{Name: "v1-s", Order: -3, Category: "partOfSpeech"}, // Ichidan verb - kureru special class
		{Name: "v2a-s", Order: -3, Category: "partOfSpeech"}, // Nidan verb with 'u' ending (archaic)
		{Name: "v2b-k", Order: -3, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'bu' ending (archaic)
		{Name: "v2b-s", Order: -3, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'bu' ending (archaic)
		{Name: "v2d-k", Order: -3, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'dzu' ending (archaic)
		{Name: "v2d-s", Order: -3, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'dzu' ending (archaic)
		{Name: "v2g-k", Order: -3, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'gu' ending (archaic)
		{Name: "v2g-s", Order: -3, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'gu' ending (archaic)
		{Name: "v2h-k", Order: -3, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'hu/fu' ending (archaic)
		{Name: "v2h-s", Order: -3, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'hu/fu' ending (archaic)
		{Name: "v2k-k", Order: -3, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'ku' ending (archaic)
		{Name: "v2k-s", Order: -3, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'ku' ending (archaic)
		{Name: "v2m-k", Order: -3, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'mu' ending (archaic)
		{Name: "v2m-s", Order: -3, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'mu' ending (archaic)
		{Name: "v2n-s", Order: -3, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'nu' ending (archaic)
		{Name: "v2r-k", Order: -3, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'ru' ending (archaic)
		{Name: "v2r-s", Order: -3, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'ru' ending (archaic)
		{Name: "v2s-s", Order: -3, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'su' ending (archaic)
		{Name: "v2t-k", Order: -3, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'tsu' ending (archaic)
		{Name: "v2t-s", Order: -3, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'tsu' ending (archaic)
		{Name: "v2w-s", Order: -3, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'u' ending and 'we' conjugation (archaic)
		{Name: "v2y-k", Order: -3, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'yu' ending (archaic)
		{Name: "v2y-s", Order: -3, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'yu' ending (archaic)
		{Name: "v2z-s", Order: -3, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'zu' ending (archaic)
		{Name: "v4b", Order: -3, Category: "partOfSpeech"}, // Yodan verb with 'bu' ending (archaic)
		{Name: "v4g", Order: -3, Category: "partOfSpeech"}, // Yodan verb with 'gu' ending (archaic)
		{Name: "v4h", Order: -3, Category: "partOfSpeech"}, // Yodan verb with 'hu/fu' ending (archaic)
		{Name: "v4k", Order: -3, Category: "partOfSpeech"}, // Yodan verb with 'ku' ending (archaic)
		{Name: "v4m", Order: -3, Category: "partOfSpeech"}, // Yodan verb with 'mu' ending (archaic)
		{Name: "v4n", Order: -3, Category: "partOfSpeech"}, // Yodan verb with 'nu' ending (archaic)
		{Name: "v4r", Order: -3, Category: "partOfSpeech"}, // Yodan verb with 'ru' ending (archaic)
		{Name: "v4s", Order: -3, Category: "partOfSpeech"}, // Yodan verb with 'su' ending (archaic)
		{Name: "v4t", Order: -3, Category: "partOfSpeech"}, // Yodan verb with 'tsu' ending (archaic)
		{Name: "v5aru", Order: -3, Category: "partOfSpeech"}, // Godan verb - -aru special class
		{Name: "v5b", Order: -3, Category: "partOfSpeech"}, // Godan verb with 'bu' ending
		{Name: "v5g", Order: -3, Category: "partOfSpeech"}, // Godan verb with 'gu' ending
		{Name: "v5k", Order: -3, Category: "partOfSpeech"}, // Godan verb with 'ku' ending
		{Name: "v5k-s", Order: -3, Category: "partOfSpeech"}, // Godan verb - Iku/Yuku special class
		{Name: "v5m", Order: -3, Category: "partOfSpeech"}, // Godan verb with 'mu' ending
		{Name: "v5n", Order: -3, Category: "partOfSpeech"}, // Godan verb with 'nu' ending
		{Name: "v5r", Order: -3, Category: "partOfSpeech"}, // Godan verb with 'ru' ending
		{Name: "v5r-i", Order: -3, Category: "partOfSpeech"}, // Godan verb with 'ru' ending (irregular verb)
		{Name: "v5s", Order: -3, Category: "partOfSpeech"}, // Godan verb with 'su' ending
		{Name: "v5t", Order: -3, Category: "partOfSpeech"}, // Godan verb with 'tsu' ending
		{Name: "v5u", Order: -3, Category: "partOfSpeech"}, // Godan verb with 'u' ending
		{Name: "v5u-s", Order: -3, Category: "partOfSpeech"}, // Godan verb with 'u' ending (special class)
		{Name: "v5uru", Order: -3, Category: "partOfSpeech"}, // Godan verb - Uru old class verb (old form of Eru)
		{Name: "vi", Order: -3, Category: "partOfSpeech"}, // intransitive verb
		{Name: "vk", Order: -3, Category: "partOfSpeech"}, // Kuru verb - special class
		{Name: "vn", Order: -3, Category: "partOfSpeech"}, // irregular nu verb
		{Name: "vr", Order: -3, Category: "partOfSpeech"}, // irregular ru verb, plain form ends with -ri
		{Name: "vs", Order: -3, Category: "partOfSpeech"}, // noun or participle which takes the aux. verb suru
		{Name: "vs-c", Order: -3, Category: "partOfSpeech"}, // su verb - precursor to the modern suru
		{Name: "vs-i", Order: -3, Category: "partOfSpeech"}, // suru verb - included
		{Name: "vs-s", Order: -3, Category: "partOfSpeech"}, // suru verb - special class
		{Name: "vt", Order: -3, Category: "partOfSpeech"}, // transitive verb
		{Name: "vz", Order: -3, Category: "partOfSpeech"}, // Ichidan verb - zuru verb (alternative form of -jiru verbs)

		// <field> usage domain
		{Name: "agric"}, // agriculture
		{Name: "anat"}, // anatomy
		{Name: "archeol"}, // archeology
		{Name: "archit"}, // architecture
		{Name: "art"}, // art, aesthetics
		{Name: "astron"}, // astronomy
		{Name: "audvid"}, // audiovisual
		{Name: "aviat"}, // aviation
		{Name: "baseb"}, // baseball
		{Name: "biochem"}, // biochemistry
		{Name: "biol"}, // biology
		{Name: "bot"}, // botany
		{Name: "Buddh"}, // Buddhism
		{Name: "bus"}, // business
		{Name: "cards"}, // card games
		{Name: "chem"}, // chemistry
		{Name: "Christn"}, // Christianity
		{Name: "cloth"}, // clothing
		{Name: "comp"}, // computing
		{Name: "cryst"}, // crystallography
		{Name: "dent"}, // dentistry
		{Name: "ecol"}, // ecology
		{Name: "econ"}, // economics
		{Name: "elec"}, // electricity, elec. eng.
		{Name: "electr"}, // electronics
		{Name: "embryo"}, // embryology
		{Name: "engr"}, // engineering
		{Name: "ent"}, // entomology
		{Name: "film"}, // film
		{Name: "finc"}, // finance
		{Name: "fish"}, // fishing
		{Name: "food"}, // food, cooking
		{Name: "gardn"}, // gardening, horticulture
		{Name: "genet"}, // genetics
		{Name: "geogr"}, // geography
		{Name: "geol"}, // geology
		{Name: "geom"}, // geometry
		{Name: "go"}, // go (game)
		{Name: "golf"}, // golf
		{Name: "gramm"}, // grammar
		{Name: "grmyth"}, // Greek mythology
		{Name: "hanaf"}, // hanafuda
		{Name: "horse"}, // horse racing
		{Name: "kabuki"}, // kabuki
		{Name: "law"}, // law
		{Name: "ling"}, // linguistics
		{Name: "logic"}, // logic
		{Name: "MA"}, // martial arts
		{Name: "mahj"}, // mahjong
		{Name: "manga"}, // manga
		{Name: "math"}, // mathematics
		{Name: "mech"}, // mechanical engineering
		{Name: "med"}, // medicine
		{Name: "met"}, // meteorology
		{Name: "mil"}, // military
		{Name: "mining"}, // mining
		{Name: "music"}, // music
		{Name: "noh"}, // noh
		{Name: "ornith"}, // ornithology
		{Name: "paleo"}, // paleontology
		{Name: "pathol"}, // pathology
		{Name: "pharm"}, // pharmacy
		{Name: "phil"}, // philosophy
		{Name: "photo"}, // photography
		{Name: "physics"}, // physics
		{Name: "physiol"}, // physiology
		{Name: "politics"}, // politics
		{Name: "print"}, // printing
		{Name: "psy"}, // psychiatry
		{Name: "psyanal"}, // psychoanalysis
		{Name: "psych"}, // psychology
		{Name: "rail"}, // railway
		{Name: "rommyth"}, // Roman mythology
		{Name: "Shinto"}, // Shinto
		{Name: "shogi"}, // shogi
		{Name: "ski"}, // skiing
		{Name: "sports"}, // sports
		{Name: "stat"}, // statistics
		{Name: "stockm"}, // stock market
		{Name: "sumo"}, // sumo
		{Name: "telec"}, // telecommunications
		{Name: "tradem"}, // trademark
		{Name: "tv"}, // television
		{Name: "vidg"}, // video games
		{Name: "zool"}, // zoology

		// <dial> dialect
		{Name: "bra"}, // Brazilian
		{Name: "hob"}, // Hokkaido-ben
		{Name: "ksb"}, // Kansai-ben
		{Name: "ktb"}, // Kantou-ben
		{Name: "kyb"}, // Kyoto-ben
		{Name: "kyu"}, // Kyuushuu-ben
		{Name: "nab"}, // Nagano-ben
		{Name: "osb"}, // Osaka-ben
		{Name: "rkb"}, // Ryuukyuu-ben
		{Name: "thb"}, // Touhoku-ben
		{Name: "tsb"}, // Tosa-ben
		{Name: "tsug"}, // Tsugaru-ben
	}
}

View File

@ -7,8 +7,6 @@ import (
"foosoft.net/projects/jmdict"
)
const kanjidicRevision = "kanjidic2"
func kanjidicExtractKanji(entry jmdict.KanjidicCharacter, language string) *dbKanji {
if entry.ReadingMeaning == nil {
return nil
@ -161,11 +159,16 @@ func kanjidicExportDb(inputPath, outputPath, language, title string, stride int,
"tag": tags.crush(),
}
index := dbIndex{
Title: title,
Revision: "kanjidic2",
Sequenced: false,
Attribution: edrdgAttribution,
}
return writeDb(
outputPath,
title,
kanjidicRevision,
false,
index,
recordData,
stride,
pretty,

View File

@ -72,7 +72,7 @@ func (e *kotowazaExtractor) extractTerms(entry zig.BookEntry, sequence int) []db
term := dbTerm{
Expression: expression,
Reading: reading,
Glossary: []string{entry.Text},
Glossary: []any{entry.Text},
Sequence: sequence,
}

View File

@ -75,7 +75,7 @@ func (e *koujienExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT
for _, reading := range readings {
term := dbTerm{
Expression: reading,
Glossary: []string{entry.Text},
Glossary: []any{entry.Text},
Sequence: sequence,
}
@ -89,7 +89,7 @@ func (e *koujienExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT
term := dbTerm{
Expression: expression,
Reading: reading,
Glossary: []string{entry.Text},
Glossary: []any{entry.Text},
Sequence: sequence,
}

View File

@ -106,7 +106,7 @@ func (e *meikyouExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT
for _, reading := range readings {
term := dbTerm{
Expression: reading,
Glossary: []string{entry.Text},
Glossary: []any{entry.Text},
Sequence: sequence,
}
@ -120,7 +120,7 @@ func (e *meikyouExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT
term := dbTerm{
Expression: expression,
Reading: reading,
Glossary: []string{entry.Text},
Glossary: []any{entry.Text},
Sequence: sequence,
}

View File

@ -8,8 +8,6 @@ import (
_ "github.com/mattn/go-sqlite3"
)
const rikaiRevision = "rikai2"
type rikaiEntry struct {
kanji string
kana string
@ -154,11 +152,15 @@ func rikaiExportDb(inputPath, outputPath, language, title string, stride int, pr
"tag": tags.crush(),
}
index := dbIndex{
Title: title,
Revision: "rikai2",
Sequenced: true,
}
return writeDb(
outputPath,
title,
rikaiRevision,
true,
index,
recordData,
stride,
pretty,

View File

@ -5,13 +5,24 @@ go get foosoft.net/projects/yomichan-import/yomichan
mkdir -p src
mkdir -p dst
if [ ! -f src/JMdict ]; then
wget http://ftp.monash.edu/pub/nihongo/JMdict.gz
gunzip -c JMdict.gz > src/JMdict
fi
# refresh_source NAME
#
# Makes sure src/NAME holds a usable copy of the EDRDG source file NAME.
#   - If src/NAME does not exist, NAME.gz is fetched with wget and unpacked.
#   - If src/NAME exists but its modification time (per `date -r`) is more
#     than 24 hours in the past, it is updated with rsync.
# Returns non-zero when the initial download or decompression fails.
function refresh_source () {
    local name="$1"
    local now yesterday
    now=$(date '+%s')
    yesterday=$((now - 86400)) # 86,400 seconds in 24 hours
    if [ ! -f "src/$name" ]; then
        # Unpack into a temporary file first. Redirecting straight into
        # src/$name would create an empty or truncated file when wget or
        # gunzip fails, and later runs would accept it as a valid download.
        if wget "ftp.edrdg.org/pub/Nihongo/$name.gz" && gunzip -c "$name.gz" > "src/$name.tmp"; then
            mv "src/$name.tmp" "src/$name"
        else
            rm -f "src/$name.tmp"
            return 1
        fi
    elif [[ $yesterday -gt $(date -r "src/$name" '+%s') ]]; then
        rsync "ftp.edrdg.org::nihongo/$name" "src/$name"
    fi
}
yomichan -language="dutch" -title="JMdict (Dutch)" src/JMdict dst/jmdict_dutch.zip
refresh_source "JMdict_e_examp"
yomichan -language="english_extra" -title="JMdict" src/JMdict_e_examp dst/jmdict_english_extra_with_examples.zip
refresh_source "JMdict"
yomichan -language="english_extra" -title="JMdict" src/JMdict dst/jmdict_english_extra.zip
yomichan -language="english" -title="JMdict (English)" src/JMdict dst/jmdict_english.zip
yomichan -language="dutch" -title="JMdict (Dutch)" src/JMdict dst/jmdict_dutch.zip
yomichan -language="french" -title="JMdict (French)" src/JMdict dst/jmdict_french.zip
yomichan -language="german" -title="JMdict (German)" src/JMdict dst/jmdict_german.zip
yomichan -language="hungarian" -title="JMdict (Hungarian)" src/JMdict dst/jmdict_hungarian.zip
@ -20,19 +31,13 @@ yomichan -language="slovenian" -title="JMdict (Slovenian)" src/JMdict dst/jmdict
yomichan -language="spanish" -title="JMdict (Spanish)" src/JMdict dst/jmdict_spanish.zip
yomichan -language="swedish" -title="JMdict (Swedish)" src/JMdict dst/jmdict_swedish.zip
if [ ! -f src/JMnedict.xml ]; then
wget http://ftp.monash.edu/pub/nihongo/JMnedict.xml.gz
gunzip -c JMnedict.xml.gz > src/JMnedict.xml
fi
yomichan -format="forms" -title="JMdict Forms" src/JMdict dst/jmdict_forms.zip
refresh_source "JMnedict.xml"
yomichan src/JMnedict.xml dst/jmnedict.zip
if [ ! -f src/kanjidic2.xml ]; then
wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz
gunzip -c kanjidic2.xml.gz > src/kanjidic2.xml
fi
yomichan -language="english" -title="KANJIDIC (English)" src/kanjidic2.xml dst/kanjidic_english.zip
refresh_source "kanjidic2.xml"
yomichan -language="english" -title="KANJIDIC" src/kanjidic2.xml dst/kanjidic_english.zip
yomichan -language="french" -title="KANJIDIC (French)" src/kanjidic2.xml dst/kanjidic_french.zip
yomichan -language="portuguese" -title="KANJIDIC (Portuguese)" src/kanjidic2.xml dst/kanjidic_portuguese.zip
yomichan -language="spanish" -title="KANJIDIC (Spanish)" src/kanjidic2.xml dst/kanjidic_spanish.zip

View File

@ -93,7 +93,7 @@ func (e *shougakukan2Extractor) extractTerms(entry zig.BookEntry, sequence int)
terms = append(terms, dbTerm{
Expression: expression,
Reading: reading,
Glossary: []string{entry.Text},
Glossary: []any{entry.Text},
Sequence: sequence,
})
}

192
structured_content.go Normal file
View File

@ -0,0 +1,192 @@
package yomichan
// contentAttr bundles the optional attributes accepted by the structured
// content builders in this file. The builders only emit a field when it is
// non-empty (strings, slices, maps) or non-zero (margins).
type contentAttr struct {
	// lang is written to the element's "lang" key.
	lang string

	// Style properties, written to the element's "style" object by contentStyle.
	fontStyle          string   // normal, italic
	fontWeight         string   // normal, bold
	fontSize           string   // small, medium, large, smaller, 80%, 125%, etc.
	textDecorationLine []string // underline, overline, line-through
	verticalAlign      string   // baseline, sub, super, text-top, text-bottom, middle, top, bottom
	textAlign          string   // start, end, left, right, center, justify, justify-all, match-parent

	marginTop, marginLeft, marginRight, marginBottom int

	listStyleType string

	// data is written to the element's "data" key.
	data map[string]string
}
// contentReduce simplifies a list of content items.
//
// Runs of adjacent strings are concatenated into a single string:
//
//	["one", "two", node, "four"] -> ["onetwo", node, "four"]
//
// When the simplified list has exactly one element, that element is returned
// on its own instead of a one-element list:
//
//	["one", "two"] -> "onetwo"
//
// A list that already has exactly one element is returned as that element
// without further inspection. A run that concatenates to the empty string
// contributes nothing to the result.
func contentReduce(contents []any) any {
	if len(contents) == 1 {
		return contents[0]
	}

	reduced := []any{}
	pending := ""

	// flush moves the text gathered so far (if any) into the result.
	flush := func() {
		if pending != "" {
			reduced = append(reduced, pending)
			pending = ""
		}
	}

	for _, item := range contents {
		if text, isString := item.(string); isString {
			pending += text
			continue
		}
		flush()
		reduced = append(reduced, item)
	}
	flush()

	if len(reduced) != 1 {
		return reduced
	}
	return reduced[0]
}
// contentStructure builds the top-level node of type "structured-content"
// whose "content" is the reduced form of contents (see contentReduce).
func contentStructure(contents ...any) map[string]any {
	node := make(map[string]any, 2)
	node["type"] = "structured-content"
	node["content"] = contentReduce(contents)
	return node
}
// contentRuby builds a "ruby" element: the reduced contents followed by the
// ruby text in an "rt" element, which is wrapped in "rp" elements holding
// "(" and ")". attr.lang and attr.data are copied onto the element when set;
// the style-related fields of attr are not used.
func contentRuby(attr contentAttr, ruby string, contents ...any) map[string]any {
	base := contentReduce(contents)
	openParen := map[string]string{"tag": "rp", "content": "("}
	annotation := map[string]string{"tag": "rt", "content": ruby}
	closeParen := map[string]string{"tag": "rp", "content": ")"}

	node := map[string]any{
		"tag":     "ruby",
		"content": []any{base, openParen, annotation, closeParen},
	}
	if lang := attr.lang; lang != "" {
		node["lang"] = lang
	}
	if data := attr.data; len(data) != 0 {
		node["data"] = data
	}
	return node
}
// contentInternalLink builds an "a" element whose href is
// "?query=" + query + "&wildcards=off". The link text is the reduced contents,
// or the query itself when no contents are given. attr.lang and attr.data are
// copied onto the element when set; the style-related fields of attr are not
// used.
func contentInternalLink(attr contentAttr, query string, contents ...any) map[string]any {
	var label any = query
	if len(contents) != 0 {
		label = contentReduce(contents)
	}

	node := map[string]any{
		"tag":     "a",
		"href":    "?query=" + query + "&wildcards=off",
		"content": label,
	}
	if lang := attr.lang; lang != "" {
		node["lang"] = lang
	}
	if data := attr.data; len(data) != 0 {
		node["data"] = data
	}
	return node
}
// contentSpan builds a "span" element via contentStyledContainer.
func contentSpan(attr contentAttr, contents ...any) map[string]any {
	const tag = "span"
	return contentStyledContainer(attr, tag, contents...)
}
// contentDiv builds a "div" element via contentStyledContainer.
func contentDiv(attr contentAttr, contents ...any) map[string]any {
	const tag = "div"
	return contentStyledContainer(attr, tag, contents...)
}
// contentListItem builds an "li" element via contentStyledContainer.
func contentListItem(attr contentAttr, contents ...any) map[string]any {
	const tag = "li"
	return contentStyledContainer(attr, tag, contents...)
}
// contentOrderedList builds an "ol" element via contentStyledContainer.
func contentOrderedList(attr contentAttr, contents ...any) map[string]any {
	const tag = "ol"
	return contentStyledContainer(attr, tag, contents...)
}
// contentUnorderedList builds a "ul" element via contentStyledContainer.
func contentUnorderedList(attr contentAttr, contents ...any) map[string]any {
	const tag = "ul"
	return contentStyledContainer(attr, tag, contents...)
}
// contentTable builds a "table" element via contentStyledContainer.
func contentTable(attr contentAttr, contents ...any) map[string]any {
	const tag = "table"
	return contentStyledContainer(attr, tag, contents...)
}
// contentTableHead builds a "thead" element via contentStyledContainer.
func contentTableHead(attr contentAttr, contents ...any) map[string]any {
	const tag = "thead"
	return contentStyledContainer(attr, tag, contents...)
}
// contentTableBody builds a "tbody" element via contentStyledContainer.
func contentTableBody(attr contentAttr, contents ...any) map[string]any {
	const tag = "tbody"
	return contentStyledContainer(attr, tag, contents...)
}
// contentTableRow builds a "tr" element via contentStyledContainer.
func contentTableRow(attr contentAttr, contents ...any) map[string]any {
	const tag = "tr"
	return contentStyledContainer(attr, tag, contents...)
}
// contentTableHeadCell builds a "th" element via contentStyledContainer.
func contentTableHeadCell(attr contentAttr, contents ...any) map[string]any {
	const tag = "th"
	return contentStyledContainer(attr, tag, contents...)
}
// contentTableCell builds a "td" element via contentStyledContainer.
func contentTableCell(attr contentAttr, contents ...any) map[string]any {
	const tag = "td"
	return contentStyledContainer(attr, tag, contents...)
}
// contentStyledContainer builds an element with the given tag whose "content"
// is the reduced form of contents. The "lang", "data" and "style" keys are
// only added when attr supplies a non-empty value for them; "style" is the
// map produced by contentStyle.
func contentStyledContainer(attr contentAttr, tag string, contents ...any) map[string]any {
	node := map[string]any{
		"tag":     tag,
		"content": contentReduce(contents),
	}
	if attr.lang != "" {
		node["lang"] = attr.lang
	}
	if len(attr.data) != 0 {
		node["data"] = attr.data
	}
	if css := contentStyle(attr); len(css) != 0 {
		node["style"] = css
	}
	return node
}
// contentStyle converts the style-related fields of attr into a map keyed by
// property name. Empty strings, an empty textDecorationLine list and zero
// margins are left out, so the returned map is empty (but never nil) when
// attr carries no style information. attr.lang and attr.data are not
// included.
func contentStyle(attr contentAttr) map[string]any {
	css := map[string]any{}

	textProps := []struct{ key, value string }{
		{"fontStyle", attr.fontStyle},
		{"fontWeight", attr.fontWeight},
		{"fontSize", attr.fontSize},
		{"verticalAlign", attr.verticalAlign},
		{"textAlign", attr.textAlign},
		{"listStyleType", attr.listStyleType},
	}
	for _, prop := range textProps {
		if prop.value != "" {
			css[prop.key] = prop.value
		}
	}

	if len(attr.textDecorationLine) != 0 {
		css["textDecorationLine"] = attr.textDecorationLine
	}

	margins := []struct {
		key   string
		value int
	}{
		{"marginTop", attr.marginTop},
		{"marginLeft", attr.marginLeft},
		{"marginRight", attr.marginRight},
		{"marginBottom", attr.marginBottom},
	}
	for _, margin := range margins {
		if margin.value != 0 {
			css[margin.key] = margin.value
		}
	}

	return css
}

View File

@ -74,7 +74,7 @@ func (e *wadaiExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTer
term := dbTerm{
Expression: expression,
Reading: reading,
Glossary: []string{entry.Text},
Glossary: []any{entry.Text},
Sequence: sequence,
}