Merge pull request #40 from stephenmk/master
New version of JMdict for Yomichan
This commit is contained in:
commit
74de4ce9e5
2
LICENSE
2
LICENSE
@ -1,4 +1,4 @@
|
||||
Copyright 2016-2022 Alex Yatskov
|
||||
Copyright 2016-2023 Yomichan-Import Authors
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
this software and associated documentation files (the "Software"), to deal in
|
||||
|
87
common.go
87
common.go
@ -19,9 +19,7 @@ const (
|
||||
DefaultTitle = ""
|
||||
)
|
||||
|
||||
const databaseFormat = 3
|
||||
|
||||
type dbRecord []interface{}
|
||||
type dbRecord []any
|
||||
type dbRecordList []dbRecord
|
||||
|
||||
type dbTag struct {
|
||||
@ -46,7 +44,7 @@ func (meta dbTagList) crush() dbRecordList {
|
||||
type dbMeta struct {
|
||||
Expression string
|
||||
Mode string
|
||||
Data interface{}
|
||||
Data any
|
||||
}
|
||||
|
||||
type dbMetaList []dbMeta
|
||||
@ -66,7 +64,7 @@ type dbTerm struct {
|
||||
DefinitionTags []string
|
||||
Rules []string
|
||||
Score int
|
||||
Glossary []string
|
||||
Glossary []any
|
||||
Sequence int
|
||||
TermTags []string
|
||||
}
|
||||
@ -142,11 +140,34 @@ func (kanji dbKanjiList) crush() dbRecordList {
|
||||
return results
|
||||
}
|
||||
|
||||
func writeDb(outputPath, title, revision string, sequenced bool, recordData map[string]dbRecordList, stride int, pretty bool) error {
|
||||
// dbIndex is the dictionary index record that writeDb marshals to JSON.
// Fields left at their zero value can be filled in by setDefaults.
type dbIndex struct {
	Title       string `json:"title"`
	Format      int    `json:"format"`
	Revision    string `json:"revision"`
	Sequenced   bool   `json:"sequenced"`
	Author      string `json:"author"`
	Url         string `json:"url"`
	Description string `json:"description"`
	Attribution string `json:"attribution"`
}

// setDefaults fills Format, Author and Url with the package defaults when
// they are still at their zero value. All other fields are left untouched.
func (idx *dbIndex) setDefaults() {
	const (
		fallbackFormat = 3
		fallbackAuthor = "yomichan-import"
		fallbackURL    = "https://github.com/FooSoft/yomichan-import"
	)

	if idx.Format == 0 {
		idx.Format = fallbackFormat
	}
	if idx.Author == "" {
		idx.Author = fallbackAuthor
	}
	if idx.Url == "" {
		idx.Url = fallbackURL
	}
}
|
||||
|
||||
func writeDb(outputPath string, index dbIndex, recordData map[string]dbRecordList, stride int, pretty bool) error {
|
||||
var zbuff bytes.Buffer
|
||||
zip := zip.NewWriter(&zbuff)
|
||||
|
||||
marshalJSON := func(obj interface{}, pretty bool) ([]byte, error) {
|
||||
marshalJSON := func(obj any, pretty bool) ([]byte, error) {
|
||||
if pretty {
|
||||
return json.MarshalIndent(obj, "", " ")
|
||||
}
|
||||
@ -186,17 +207,6 @@ func writeDb(outputPath, title, revision string, sequenced bool, recordData map[
|
||||
}
|
||||
|
||||
var err error
|
||||
var db struct {
|
||||
Title string `json:"title"`
|
||||
Format int `json:"format"`
|
||||
Revision string `json:"revision"`
|
||||
Sequenced bool `json:"sequenced"`
|
||||
}
|
||||
|
||||
db.Title = title
|
||||
db.Format = databaseFormat
|
||||
db.Revision = revision
|
||||
db.Sequenced = sequenced
|
||||
|
||||
for recordType, recordEntries := range recordData {
|
||||
if _, err := writeDbRecords(recordType, recordEntries); err != nil {
|
||||
@ -204,7 +214,8 @@ func writeDb(outputPath, title, revision string, sequenced bool, recordData map[
|
||||
}
|
||||
}
|
||||
|
||||
bytes, err := marshalJSON(db, pretty)
|
||||
index.setDefaults()
|
||||
bytes, err := marshalJSON(index, pretty)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@ -252,6 +263,39 @@ func hasString(needle string, haystack []string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// intersection returns the strings that occur in both s1 and s2. Results
// follow the order of s2 and each value is reported at most once. The
// returned slice is never nil.
func intersection(s1, s2 []string) []string {
	// pending holds the values of s1 that have not been emitted yet.
	pending := make(map[string]struct{})
	for _, item := range s1 {
		pending[item] = struct{}{}
	}

	common := []string{}
	for _, item := range s2 {
		if _, ok := pending[item]; !ok {
			continue
		}
		common = append(common, item)
		delete(pending, item) // prevents emitting the same value twice
	}
	return common
}
|
||||
|
||||
// union returns every distinct string of s1 followed by every distinct
// string of s2 that was not already seen, preserving first-occurrence order.
// The returned slice is never nil.
func union(s1, s2 []string) []string {
	seen := make(map[string]struct{})
	merged := []string{}

	for _, list := range [][]string{s1, s2} {
		for _, item := range list {
			if _, dup := seen[item]; dup {
				continue
			}
			seen[item] = struct{}{}
			merged = append(merged, item)
		}
	}
	return merged
}
|
||||
|
||||
func detectFormat(path string) (string, error) {
|
||||
switch filepath.Ext(path) {
|
||||
case ".sqlite":
|
||||
@ -263,7 +307,7 @@ func detectFormat(path string) (string, error) {
|
||||
}
|
||||
|
||||
switch filepath.Base(path) {
|
||||
case "JMdict", "JMdict.xml", "JMdict_e", "JMdict_e.xml":
|
||||
case "JMdict", "JMdict.xml", "JMdict_e", "JMdict_e.xml", "JMdict_e_examp":
|
||||
return "edict", nil
|
||||
case "JMnedict", "JMnedict.xml":
|
||||
return "enamdict", nil
|
||||
@ -293,7 +337,8 @@ func detectFormat(path string) (string, error) {
|
||||
|
||||
func ExportDb(inputPath, outputPath, format, language, title string, stride int, pretty bool) error {
|
||||
handlers := map[string]func(string, string, string, string, int, bool) error{
|
||||
"edict": jmdictExportDb,
|
||||
"edict": jmdExportDb,
|
||||
"forms": formsExportDb,
|
||||
"enamdict": jmnedictExportDb,
|
||||
"epwing": epwingExportDb,
|
||||
"kanjidic": kanjidicExportDb,
|
||||
|
@ -65,7 +65,7 @@ func (e *daijirinExtractor) extractTerms(entry zig.BookEntry, sequence int) []db
|
||||
for _, reading := range readings {
|
||||
term := dbTerm{
|
||||
Expression: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Glossary: []any{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
@ -79,7 +79,7 @@ func (e *daijirinExtractor) extractTerms(entry zig.BookEntry, sequence int) []db
|
||||
term := dbTerm{
|
||||
Expression: expression,
|
||||
Reading: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Glossary: []any{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
|
@ -70,7 +70,7 @@ func (e *daijisenExtractor) extractTerms(entry zig.BookEntry, sequence int) []db
|
||||
if len(expressions) == 0 {
|
||||
term := dbTerm{
|
||||
Expression: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Glossary: []any{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
@ -82,7 +82,7 @@ func (e *daijisenExtractor) extractTerms(entry zig.BookEntry, sequence int) []db
|
||||
term := dbTerm{
|
||||
Expression: expression,
|
||||
Reading: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Glossary: []any{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
|
246
edict.go
246
edict.go
@ -1,246 +0,0 @@
|
||||
package yomichan
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"foosoft.net/projects/jmdict"
|
||||
)
|
||||
|
||||
const jmdictRevision = "jmdict4"
|
||||
|
||||
func jmdictBuildRules(term *dbTerm) {
|
||||
for _, tag := range term.DefinitionTags {
|
||||
switch tag {
|
||||
case "adj-i", "v1", "vk", "vz":
|
||||
term.addRules(tag)
|
||||
default:
|
||||
if strings.HasPrefix(tag, "v5") {
|
||||
term.addRules("v5")
|
||||
} else if strings.HasPrefix(tag, "vs-") {
|
||||
term.addRules("vs")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func jmdictBuildScore(term *dbTerm) {
|
||||
for _, tag := range term.DefinitionTags {
|
||||
switch tag {
|
||||
case "arch":
|
||||
term.Score -= 100
|
||||
}
|
||||
}
|
||||
for _, tag := range term.TermTags {
|
||||
switch tag {
|
||||
case "news", "ichi", "spec", "gai1":
|
||||
term.Score += 100
|
||||
case "P":
|
||||
term.Score += 500
|
||||
case "iK", "ik", "ok", "oK", "io", "oik":
|
||||
term.Score -= 100
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func jmdictAddPriorities(term *dbTerm, priorities ...string) {
|
||||
for _, priority := range priorities {
|
||||
switch priority {
|
||||
case "news1", "ichi1", "spec1", "gai1":
|
||||
term.addTermTags("P")
|
||||
fallthrough
|
||||
case "news2", "ichi2", "spec2", "gai2":
|
||||
term.addTermTags(priority[:len(priority)-1])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func jmdictBuildTagMeta(entities map[string]string) dbTagList {
|
||||
tags := dbTagList{
|
||||
dbTag{Name: "news", Notes: "appears frequently in Mainichi Shimbun", Category: "frequent", Order: -2},
|
||||
dbTag{Name: "ichi", Notes: "listed as common in Ichimango Goi Bunruishuu", Category: "frequent", Order: -2},
|
||||
dbTag{Name: "spec", Notes: "common words not included in frequency lists", Category: "frequent", Order: -2},
|
||||
dbTag{Name: "gai", Notes: "common loanword", Category: "frequent", Order: -2},
|
||||
dbTag{Name: "P", Notes: "popular term", Category: "popular", Order: -10, Score: 10},
|
||||
}
|
||||
|
||||
for name, value := range entities {
|
||||
tag := dbTag{Name: name, Notes: value}
|
||||
|
||||
switch name {
|
||||
case "exp", "id":
|
||||
tag.Category = "expression"
|
||||
tag.Order = -5
|
||||
case "arch":
|
||||
tag.Category = "archaism"
|
||||
tag.Order = -4
|
||||
case "iK", "ik", "ok", "oK", "io", "oik":
|
||||
tag.Score = -5
|
||||
case "adj-f", "adj-i", "adj-ix", "adj-ku", "adj-na", "adj-nari", "adj-no", "adj-pn", "adj-shiku", "adj-t", "adv", "adv-to", "aux-adj",
|
||||
"aux", "aux-v", "conj", "cop-da", "ctr", "int", "n-adv", "n", "n-pref", "n-pr", "n-suf", "n-t", "num", "pn", "pref", "prt", "suf",
|
||||
"unc", "v1", "v1-s", "v2a-s", "v2b-k", "v2d-s", "v2g-k", "v2g-s", "v2h-k", "v2h-s", "v2k-k", "v2k-s", "v2m-s", "v2n-s", "v2r-k",
|
||||
"v2r-s", "v2s-s", "v2t-k", "v2t-s", "v2w-s", "v2y-k", "v2y-s", "v2z-s", "v4b", "v4h", "v4k", "v4m", "v4r", "v4s", "v4t", "v5aru",
|
||||
"v5b", "v5g", "v5k", "v5k-s", "v5m", "v5n", "v5r-i", "v5r", "v5s", "v5t", "v5u", "v5u-s", "vi", "vk", "vn", "vr", "vs-c", "vs-i",
|
||||
"vs", "vs-s", "vt", "vz":
|
||||
tag.Category = "partOfSpeech"
|
||||
tag.Order = -3
|
||||
}
|
||||
|
||||
tags = append(tags, tag)
|
||||
}
|
||||
|
||||
return tags
|
||||
}
|
||||
|
||||
func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm {
|
||||
var terms []dbTerm
|
||||
|
||||
convert := func(reading jmdict.JmdictReading, kanji *jmdict.JmdictKanji) {
|
||||
if kanji != nil && reading.Restrictions != nil && !hasString(kanji.Expression, reading.Restrictions) {
|
||||
return
|
||||
}
|
||||
|
||||
var termBase dbTerm
|
||||
termBase.addTermTags(reading.Information...)
|
||||
|
||||
if kanji == nil {
|
||||
termBase.Expression = reading.Reading
|
||||
jmdictAddPriorities(&termBase, reading.Priorities...)
|
||||
} else {
|
||||
termBase.Expression = kanji.Expression
|
||||
termBase.Reading = reading.Reading
|
||||
termBase.addTermTags(kanji.Information...)
|
||||
|
||||
for _, priority := range kanji.Priorities {
|
||||
if hasString(priority, reading.Priorities) {
|
||||
jmdictAddPriorities(&termBase, priority)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var partsOfSpeech []string
|
||||
for index, sense := range edictEntry.Sense {
|
||||
|
||||
if len(sense.PartsOfSpeech) != 0 {
|
||||
partsOfSpeech = sense.PartsOfSpeech
|
||||
}
|
||||
|
||||
if sense.RestrictedReadings != nil && !hasString(reading.Reading, sense.RestrictedReadings) {
|
||||
continue
|
||||
}
|
||||
|
||||
if kanji != nil && sense.RestrictedKanji != nil && !hasString(kanji.Expression, sense.RestrictedKanji) {
|
||||
continue
|
||||
}
|
||||
|
||||
term := dbTerm{
|
||||
Reading: termBase.Reading,
|
||||
Expression: termBase.Expression,
|
||||
Score: len(edictEntry.Sense) - index,
|
||||
Sequence: edictEntry.Sequence,
|
||||
}
|
||||
|
||||
for _, glossary := range sense.Glossary {
|
||||
if glossary.Language == nil && language == "" || glossary.Language != nil && language == *glossary.Language {
|
||||
term.Glossary = append(term.Glossary, glossary.Content)
|
||||
}
|
||||
}
|
||||
|
||||
if len(term.Glossary) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
term.addDefinitionTags(termBase.DefinitionTags...)
|
||||
term.addTermTags(termBase.TermTags...)
|
||||
term.addDefinitionTags(partsOfSpeech...)
|
||||
term.addDefinitionTags(sense.Fields...)
|
||||
term.addDefinitionTags(sense.Misc...)
|
||||
term.addDefinitionTags(sense.Dialects...)
|
||||
|
||||
jmdictBuildRules(&term)
|
||||
jmdictBuildScore(&term)
|
||||
|
||||
terms = append(terms, term)
|
||||
}
|
||||
}
|
||||
|
||||
if len(edictEntry.Kanji) > 0 {
|
||||
for _, kanji := range edictEntry.Kanji {
|
||||
for _, reading := range edictEntry.Readings {
|
||||
if reading.NoKanji == nil {
|
||||
convert(reading, &kanji)
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, reading := range edictEntry.Readings {
|
||||
if reading.NoKanji != nil {
|
||||
convert(reading, nil)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for _, reading := range edictEntry.Readings {
|
||||
convert(reading, nil)
|
||||
}
|
||||
}
|
||||
|
||||
return terms
|
||||
}
|
||||
|
||||
func jmdictExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
|
||||
reader, err := os.Open(inputPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer reader.Close()
|
||||
|
||||
dict, entities, err := jmdict.LoadJmdictNoTransform(reader)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var langTag string
|
||||
switch language {
|
||||
case "dutch":
|
||||
langTag = "dut"
|
||||
case "french":
|
||||
langTag = "fre"
|
||||
case "german":
|
||||
langTag = "ger"
|
||||
case "hungarian":
|
||||
langTag = "hun"
|
||||
case "italian":
|
||||
langTag = "ita"
|
||||
case "russian":
|
||||
langTag = "rus"
|
||||
case "slovenian":
|
||||
langTag = "slv"
|
||||
case "spanish":
|
||||
langTag = "spa"
|
||||
case "swedish":
|
||||
langTag = "swe"
|
||||
}
|
||||
|
||||
var terms dbTermList
|
||||
for _, entry := range dict.Entries {
|
||||
terms = append(terms, jmdictExtractTerms(entry, langTag)...)
|
||||
}
|
||||
|
||||
if title == "" {
|
||||
title = "JMdict"
|
||||
}
|
||||
|
||||
recordData := map[string]dbRecordList{
|
||||
"term": terms.crush(),
|
||||
"tag": jmdictBuildTagMeta(entities).crush(),
|
||||
}
|
||||
|
||||
return writeDb(
|
||||
outputPath,
|
||||
title,
|
||||
jmdictRevision,
|
||||
true,
|
||||
recordData,
|
||||
stride,
|
||||
pretty,
|
||||
)
|
||||
}
|
17
enamdict.go
17
enamdict.go
@ -6,8 +6,6 @@ import (
|
||||
"foosoft.net/projects/jmdict"
|
||||
)
|
||||
|
||||
const jmnedictRevision = "jmnedict1"
|
||||
|
||||
func jmnedictBuildTagMeta(entities map[string]string) dbTagList {
|
||||
var tags dbTagList
|
||||
|
||||
@ -53,7 +51,9 @@ func jmnedictExtractTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
|
||||
}
|
||||
|
||||
for _, trans := range enamdictEntry.Translations {
|
||||
term.Glossary = append(term.Glossary, trans.Translations...)
|
||||
for _, translation := range trans.Translations {
|
||||
term.Glossary = append(term.Glossary, translation)
|
||||
}
|
||||
term.addDefinitionTags(trans.NameTypes...)
|
||||
}
|
||||
|
||||
@ -101,11 +101,16 @@ func jmnedictExportDb(inputPath, outputPath, language, title string, stride int,
|
||||
"tag": jmnedictBuildTagMeta(entities).crush(),
|
||||
}
|
||||
|
||||
index := dbIndex{
|
||||
Title: title,
|
||||
Revision: "jmnedict1",
|
||||
Sequenced: true,
|
||||
Attribution: edrdgAttribution,
|
||||
}
|
||||
|
||||
return writeDb(
|
||||
outputPath,
|
||||
title,
|
||||
jmnedictRevision,
|
||||
true,
|
||||
index,
|
||||
recordData,
|
||||
stride,
|
||||
pretty,
|
||||
|
10
epwing.go
10
epwing.go
@ -101,11 +101,15 @@ func epwingExportDb(inputPath, outputPath, language, title string, stride int, p
|
||||
"term": terms.crush(),
|
||||
}
|
||||
|
||||
index := dbIndex{
|
||||
Title: title,
|
||||
Revision: strings.Join(revisions, ";"),
|
||||
Sequenced: true,
|
||||
}
|
||||
|
||||
return writeDb(
|
||||
outputPath,
|
||||
title,
|
||||
strings.Join(revisions, ";"),
|
||||
true,
|
||||
index,
|
||||
recordData,
|
||||
stride,
|
||||
pretty,
|
||||
|
12
frequency.go
12
frequency.go
@ -7,8 +7,6 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
const frequencyRevision = "frequency1"
|
||||
|
||||
func frequencyTermsExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
|
||||
return frequncyExportDb(inputPath, outputPath, language, title, stride, pretty, "term_meta")
|
||||
}
|
||||
@ -57,11 +55,15 @@ func frequncyExportDb(inputPath, outputPath, language, title string, stride int,
|
||||
key: frequencies.crush(),
|
||||
}
|
||||
|
||||
index := dbIndex{
|
||||
Title: title,
|
||||
Revision: "frequency1",
|
||||
Sequenced: false,
|
||||
}
|
||||
|
||||
return writeDb(
|
||||
outputPath,
|
||||
title,
|
||||
frequencyRevision,
|
||||
false,
|
||||
index,
|
||||
recordData,
|
||||
stride,
|
||||
pretty,
|
||||
|
@ -90,7 +90,7 @@ func (e *gakkenExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTe
|
||||
for _, reading := range readings {
|
||||
term := dbTerm{
|
||||
Expression: reading,
|
||||
Glossary: []string{entryText},
|
||||
Glossary: []any{entryText},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
@ -107,7 +107,7 @@ func (e *gakkenExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTe
|
||||
term := dbTerm{
|
||||
Expression: expression,
|
||||
Reading: reading,
|
||||
Glossary: []string{entryText},
|
||||
Glossary: []any{entryText},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
|
1
go.mod
1
go.mod
@ -7,6 +7,7 @@ require (
|
||||
foosoft.net/projects/zero-epwing-go v0.0.0-20220704035039-bc008453615d
|
||||
github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e
|
||||
github.com/mattn/go-sqlite3 v1.14.14
|
||||
golang.org/x/exp v0.0.0-20221207211629-99ab8fa1c11f
|
||||
)
|
||||
|
||||
require golang.org/x/text v0.3.7 // indirect
|
||||
|
2
go.sum
2
go.sum
@ -6,5 +6,7 @@ github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e h1:wSQCJiig/QkoUnpvelSP
|
||||
github.com/andlabs/ui v0.0.0-20200610043537-70a69d6ae31e/go.mod h1:5G2EjwzgZUPnnReoKvPWVneT8APYbyKkihDVAHUi0II=
|
||||
github.com/mattn/go-sqlite3 v1.14.14 h1:qZgc/Rwetq+MtyE18WhzjokPD93dNqLGNT3QJuLvBGw=
|
||||
github.com/mattn/go-sqlite3 v1.14.14/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
|
||||
golang.org/x/exp v0.0.0-20221207211629-99ab8fa1c11f h1:90Jq/vvGVDsqj8QqCynjFw9MCerDguSMODLYII416Y8=
|
||||
golang.org/x/exp v0.0.0-20221207211629-99ab8fa1c11f/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc=
|
||||
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
|
||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||
|
258
jmdict.go
Normal file
258
jmdict.go
Normal file
@ -0,0 +1,258 @@
|
||||
package yomichan
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"foosoft.net/projects/jmdict"
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
// grammarRules maps part-of-speech codes to inflection rule names.
// "adj-i", "vk" and "vz" map to themselves; codes starting with "v5", "v1"
// or "vs-" map to "v5", "v1" and "vs" respectively; every other code is
// dropped. One rule is emitted per matching input, so duplicates are kept.
// The returned slice is never nil.
func grammarRules(partsOfSpeech []string) []string {
	rules := []string{}
	for _, pos := range partsOfSpeech {
		switch {
		case pos == "adj-i", pos == "vk", pos == "vz":
			rules = append(rules, pos)
		case strings.HasPrefix(pos, "v5"):
			rules = append(rules, "v5")
		case strings.HasPrefix(pos, "v1"):
			rules = append(rules, "v1")
		case strings.HasPrefix(pos, "vs-"):
			rules = append(rules, "vs")
		}
	}
	return rules
}
|
||||
|
||||
func calculateTermScore(senseNumber int, depth int, headword headword) int {
|
||||
const senseWeight int = 1
|
||||
const depthWeight int = 100
|
||||
const entryPositionWeight int = 10000
|
||||
const priorityWeight int = 1000000
|
||||
|
||||
score := 0
|
||||
score -= (senseNumber - 1) * senseWeight
|
||||
score -= depth * depthWeight
|
||||
score -= headword.Index * entryPositionWeight
|
||||
score += headword.Score() * priorityWeight
|
||||
|
||||
return score
|
||||
}
|
||||
|
||||
func doDisplaySenseNumberTag(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) bool {
|
||||
// Display sense numbers if the entry has more than one sense
|
||||
// or if the headword is found in multiple entries.
|
||||
hash := headword.Hash()
|
||||
if !meta.extraMode {
|
||||
return false
|
||||
} else if meta.language != "eng" {
|
||||
return false
|
||||
} else if meta.seqToSenseCount[entry.Sequence] > 1 {
|
||||
return true
|
||||
} else if len(meta.headwordHashToSeqs[hash]) > 1 {
|
||||
return true
|
||||
} else {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func jmdictPublicationDate(dictionary jmdict.Jmdict) string {
|
||||
dateEntry := dictionary.Entries[len(dictionary.Entries)-1]
|
||||
r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`)
|
||||
jmdictDate := r.FindString(dateEntry.Sense[0].Glossary[0].Content)
|
||||
return jmdictDate
|
||||
}
|
||||
|
||||
func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
|
||||
// Don't add "forms" terms to non-English dictionaries.
|
||||
// Information would be duplicated if users installed more
|
||||
// than one version.
|
||||
if meta.language != "eng" || !meta.extraMode {
|
||||
return dbTerm{}, false
|
||||
}
|
||||
// Don't need a "forms" term for entries with one unique
|
||||
// headword which does not appear in any other entries.
|
||||
if !meta.hasMultipleForms[entry.Sequence] {
|
||||
if len(meta.headwordHashToSeqs[headword.Hash()]) == 1 {
|
||||
return dbTerm{}, false
|
||||
}
|
||||
}
|
||||
|
||||
term := baseFormsTerm(entry)
|
||||
term.Expression = headword.Expression
|
||||
term.Reading = headword.Reading
|
||||
|
||||
term.addTermTags(headword.TermTags...)
|
||||
|
||||
term.addDefinitionTags("forms")
|
||||
senseNumber := meta.seqToSenseCount[entry.Sequence] + 1
|
||||
entryDepth := meta.entryDepth[entry.Sequence]
|
||||
term.Score = calculateTermScore(senseNumber, entryDepth, headword)
|
||||
return term, true
|
||||
}
|
||||
|
||||
func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
|
||||
// Don't add "search" terms to non-English dictionaries.
|
||||
// Information would be duplicated if users installed more
|
||||
// than one version.
|
||||
if meta.language != "eng" {
|
||||
return dbTerm{}, false
|
||||
}
|
||||
|
||||
term := dbTerm{
|
||||
Expression: headword.Expression,
|
||||
Sequence: -entry.Sequence,
|
||||
}
|
||||
for _, sense := range entry.Sense {
|
||||
rules := grammarRules(sense.PartsOfSpeech)
|
||||
term.addRules(rules...)
|
||||
}
|
||||
term.addTermTags(headword.TermTags...)
|
||||
term.Score = calculateTermScore(1, 0, headword)
|
||||
|
||||
redirectHeadword := meta.seqToMainHeadword[entry.Sequence]
|
||||
expHash := redirectHeadword.ExpHash()
|
||||
doDisplayReading := (len(meta.expHashToReadings[expHash]) > 1)
|
||||
|
||||
content := contentSpan(
|
||||
contentAttr{fontSize: "130%"},
|
||||
"⟶",
|
||||
redirectHeadword.ToInternalLink(doDisplayReading),
|
||||
)
|
||||
|
||||
term.Glossary = []any{contentStructure(content)}
|
||||
return term, true
|
||||
}
|
||||
|
||||
func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
|
||||
if sense.RestrictedReadings != nil && !slices.Contains(sense.RestrictedReadings, headword.Reading) {
|
||||
return dbTerm{}, false
|
||||
}
|
||||
if sense.RestrictedKanji != nil && !slices.Contains(sense.RestrictedKanji, headword.Expression) {
|
||||
return dbTerm{}, false
|
||||
}
|
||||
|
||||
term := dbTerm{
|
||||
Expression: headword.Expression,
|
||||
Reading: headword.Reading,
|
||||
Sequence: entry.Sequence,
|
||||
}
|
||||
|
||||
term.Glossary = createGlossary(sense, meta)
|
||||
|
||||
term.addTermTags(headword.TermTags...)
|
||||
|
||||
if doDisplaySenseNumberTag(headword, entry, meta) {
|
||||
senseNumberTag := strconv.Itoa(senseNumber)
|
||||
term.addDefinitionTags(senseNumberTag)
|
||||
}
|
||||
term.addDefinitionTags(sense.PartsOfSpeech...)
|
||||
term.addDefinitionTags(sense.Fields...)
|
||||
term.addDefinitionTags(sense.Misc...)
|
||||
term.addDefinitionTags(sense.Dialects...)
|
||||
|
||||
rules := grammarRules(sense.PartsOfSpeech)
|
||||
term.addRules(rules...)
|
||||
|
||||
entryDepth := meta.entryDepth[entry.Sequence]
|
||||
term.Score = calculateTermScore(senseNumber, entryDepth, headword)
|
||||
|
||||
return term, true
|
||||
}
|
||||
|
||||
func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) ([]dbTerm, bool) {
|
||||
if meta.seqToSenseCount[entry.Sequence] == 0 {
|
||||
return nil, false
|
||||
}
|
||||
if headword.IsSearchOnly {
|
||||
if searchTerm, ok := createSearchTerm(headword, entry, meta); ok {
|
||||
return []dbTerm{searchTerm}, true
|
||||
} else {
|
||||
return nil, false
|
||||
}
|
||||
}
|
||||
terms := []dbTerm{}
|
||||
senseNumber := 1
|
||||
for _, sense := range entry.Sense {
|
||||
if !glossaryContainsLanguage(sense.Glossary, meta.language) {
|
||||
// Do not increment sense number
|
||||
continue
|
||||
}
|
||||
if senseTerm, ok := createSenseTerm(sense, senseNumber, headword, entry, meta); ok {
|
||||
terms = append(terms, senseTerm)
|
||||
}
|
||||
senseNumber += 1
|
||||
}
|
||||
|
||||
if formsTerm, ok := createFormsTerm(headword, entry, meta); ok {
|
||||
terms = append(terms, formsTerm)
|
||||
}
|
||||
|
||||
return terms, true
|
||||
}
|
||||
|
||||
func jmdExportDb(inputPath string, outputPath string, languageName string, title string, stride int, pretty bool) error {
|
||||
if _, ok := langNameToCode[languageName]; !ok {
|
||||
return errors.New("Unrecognized language parameter: " + languageName)
|
||||
}
|
||||
|
||||
reader, err := os.Open(inputPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer reader.Close()
|
||||
|
||||
dictionary, entities, err := jmdict.LoadJmdictNoTransform(reader)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
meta := newJmdictMetadata(dictionary, languageName)
|
||||
|
||||
terms := dbTermList{}
|
||||
for _, entry := range dictionary.Entries {
|
||||
headwords := extractHeadwords(entry)
|
||||
for _, headword := range headwords {
|
||||
if newTerms, ok := extractTerms(headword, entry, meta); ok {
|
||||
terms = append(terms, newTerms...)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tags := dbTagList{}
|
||||
tags = append(tags, entityTags(entities)...)
|
||||
tags = append(tags, senseNumberTags(meta.maxSenseCount)...)
|
||||
tags = append(tags, newsFrequencyTags()...)
|
||||
tags = append(tags, customDbTags()...)
|
||||
|
||||
recordData := map[string]dbRecordList{
|
||||
"term": terms.crush(),
|
||||
"tag": tags.crush(),
|
||||
}
|
||||
|
||||
if title == "" {
|
||||
title = "JMdict"
|
||||
}
|
||||
jmdictDate := jmdictPublicationDate(dictionary)
|
||||
|
||||
index := dbIndex{
|
||||
Title: title,
|
||||
Revision: "JMdict." + jmdictDate,
|
||||
Sequenced: true,
|
||||
Attribution: edrdgAttribution,
|
||||
}
|
||||
|
||||
return writeDb(
|
||||
outputPath,
|
||||
index,
|
||||
recordData,
|
||||
stride,
|
||||
pretty,
|
||||
)
|
||||
}
|
218
jmdict_constants.go
Normal file
218
jmdict_constants.go
Normal file
@ -0,0 +1,218 @@
|
||||
package yomichan
|
||||
|
||||
// LangCode is a two-part lookup key used by the translation tables in this
// file: language comes first, code second. In every table visible here the
// language part is "eng" — presumably the language the looked-up text is
// written in; confirm against callers.
type LangCode struct {
	language, code string
}
|
||||
|
||||
// edrdgAttribution is the attribution text stored in the index of
// dictionaries built from EDRDG data.
const edrdgAttribution = "This publication has included material from the JMdict (EDICT, etc.) dictionary files in accordance with the licence provisions of the Electronic Dictionaries Research Group. See http://www.edrdg.org/"

// Symbols for headword categories.
// NOTE(review): where these are rendered is not visible here.
const (
	prioritySymbol  = "★"
	rareKanjiSymbol = "🅁"
	irregularSymbol = "⚠"
	outdatedSymbol  = "⛬"
	defaultSymbol   = "㊒"
)

// Tag names for headword categories.
const (
	priorityTagName  = "⭐"
	rareKanjiTagName = "R"
	irregularTagName = "⚠️"
	outdatedTagName  = "⛬"
	atejiTagName     = "ateji"
	gikunTagName     = "gikun"
)

// Marker strings. The embedded single quotes and trailing space are part of
// each value (presumably a CSS string literal — confirm against callers).
const (
	langMarker    = "'🌐 '"
	noteMarker    = "'📝 '"
	infoMarker    = "'ℹ️ '"
	refMarker     = "'➡️ '"
	antonymMarker = "'🔄 '"
)
|
||||
|
||||
// ISOtoFlag maps a three-letter language code to a quoted flag-emoji marker
// string. The empty code maps to the same marker as "eng". The embedded
// single quotes and trailing space are part of each value.
var ISOtoFlag = map[string]string{
	// English (also the default when no code is given)
	"":    "'🇬🇧 '",
	"eng": "'🇬🇧 '",

	"dut": "'🇳🇱 '",
	"fre": "'🇫🇷 '",
	"ger": "'🇩🇪 '",
	"hun": "'🇭🇺 '",
	"ita": "'🇮🇹 '",
	"jpn": "'🇯🇵 '",
	"rus": "'🇷🇺 '",
	"slv": "'🇸🇮 '",
	"spa": "'🇪🇸 '",
	"swe": "'🇸🇪 '",
}
|
||||
|
||||
// langNameToCode maps the language names accepted by the exporter to
// three-letter language codes. jmdExportDb rejects any name that is not a
// key of this map. The empty name, "english" and "english_extra" all map to
// "eng".
var langNameToCode = map[string]string{
	// English variants
	"":              "eng",
	"english":       "eng",
	"english_extra": "eng",

	// Other languages
	"dutch":     "dut",
	"french":    "fre",
	"german":    "ger",
	"hungarian": "hun",
	"italian":   "ita",
	"russian":   "rus",
	"slovenian": "slv",
	"spanish":   "spa",
	"swedish":   "swe",
}
|
||||
|
||||
var glossTypeCodeToName = map[LangCode]string{
|
||||
LangCode{"eng", "lit"}: "literally",
|
||||
LangCode{"eng", "fig"}: "figuratively",
|
||||
LangCode{"eng", "expl"}: "", // don't need to tell the user that an explanation is an explanation
|
||||
LangCode{"eng", "tm"}: "trademark",
|
||||
}
|
||||
|
||||
var refNoteHint = map[LangCode]string{
|
||||
LangCode{"eng", "xref"}: "see",
|
||||
LangCode{"eng", "ant"}: "antonym",
|
||||
}
|
||||
|
||||
var sourceLangTypeCodeToType = map[LangCode]string{
|
||||
LangCode{"eng", "part"}: "partial",
|
||||
LangCode{"eng", ""}: "", // implied "full"
|
||||
}
|
||||
|
||||
var langCodeToName = map[LangCode]string{
|
||||
LangCode{"eng", "afr"}: "Afrikaans",
|
||||
LangCode{"eng", "ain"}: "Ainu",
|
||||
LangCode{"eng", "alg"}: "Algonquian",
|
||||
LangCode{"eng", "amh"}: "Amharic",
|
||||
LangCode{"eng", "ara"}: "Arabic",
|
||||
LangCode{"eng", "arn"}: "Mapudungun",
|
||||
LangCode{"eng", "bnt"}: "Bantu",
|
||||
LangCode{"eng", "bre"}: "Breton",
|
||||
LangCode{"eng", "bul"}: "Bulgarian",
|
||||
LangCode{"eng", "bur"}: "Burmese",
|
||||
LangCode{"eng", "chi"}: "Chinese",
|
||||
LangCode{"eng", "chn"}: "Chinook Jargon",
|
||||
LangCode{"eng", "cze"}: "Czech",
|
||||
LangCode{"eng", "dan"}: "Danish",
|
||||
LangCode{"eng", "dut"}: "Dutch",
|
||||
LangCode{"eng", "eng"}: "English",
|
||||
LangCode{"eng", "epo"}: "Esperanto",
|
||||
LangCode{"eng", "est"}: "Estonian",
|
||||
LangCode{"eng", "fil"}: "Filipino",
|
||||
LangCode{"eng", "fin"}: "Finnish",
|
||||
LangCode{"eng", "fre"}: "French",
|
||||
LangCode{"eng", "geo"}: "Georgian",
|
||||
LangCode{"eng", "ger"}: "German",
|
||||
LangCode{"eng", "glg"}: "Galician",
|
||||
LangCode{"eng", "grc"}: "Ancient Greek",
|
||||
LangCode{"eng", "gre"}: "Modern Greek",
|
||||
LangCode{"eng", "haw"}: "Hawaiian",
|
||||
LangCode{"eng", "heb"}: "Hebrew",
|
||||
LangCode{"eng", "hin"}: "Hindi",
|
||||
LangCode{"eng", "hun"}: "Hungarian",
|
||||
LangCode{"eng", "ice"}: "Icelandic",
|
||||
LangCode{"eng", "ind"}: "Indonesian",
|
||||
LangCode{"eng", "ita"}: "Italian",
|
||||
LangCode{"eng", "khm"}: "Khmer",
|
||||
LangCode{"eng", "kor"}: "Korean",
|
||||
LangCode{"eng", "kur"}: "Kurdish",
|
||||
LangCode{"eng", "lat"}: "Latin",
|
||||
LangCode{"eng", "mal"}: "Malayalam",
|
||||
LangCode{"eng", "mao"}: "Maori",
|
||||
LangCode{"eng", "may"}: "Malay",
|
||||
LangCode{"eng", "mnc"}: "Manchu",
|
||||
LangCode{"eng", "mol"}: "Moldavian", // ISO 639 deprecated (https://iso639-3.sil.org/code/mol)
|
||||
LangCode{"eng", "mon"}: "Mongolian",
|
||||
LangCode{"eng", "nor"}: "Norwegian",
|
||||
LangCode{"eng", "per"}: "Persian",
|
||||
LangCode{"eng", "pol"}: "Polish",
|
||||
LangCode{"eng", "por"}: "Portuguese",
|
||||
LangCode{"eng", "rum"}: "Romanian",
|
||||
LangCode{"eng", "rus"}: "Russian",
|
||||
LangCode{"eng", "san"}: "Sanskrit",
|
||||
LangCode{"eng", "scr"}: "Croatian", // Code doesn't seem to exist in ISO 639. Should be "hrv" instead? (https://iso639-3.sil.org/code/hrv)
|
||||
LangCode{"eng", "slo"}: "Slovak",
|
||||
LangCode{"eng", "slv"}: "Slovenian",
|
||||
LangCode{"eng", "som"}: "Somali",
|
||||
LangCode{"eng", "spa"}: "Spanish",
|
||||
LangCode{"eng", "swa"}: "Swahili",
|
||||
LangCode{"eng", "swe"}: "Swedish",
|
||||
LangCode{"eng", "tah"}: "Tahitian",
|
||||
LangCode{"eng", "tam"}: "Tamil",
|
||||
LangCode{"eng", "tgl"}: "Tagalog",
|
||||
LangCode{"eng", "tha"}: "Thai",
|
||||
LangCode{"eng", "tib"}: "Tibetan",
|
||||
LangCode{"eng", "tur"}: "Turkish",
|
||||
LangCode{"eng", "ukr"}: "Ukrainian",
|
||||
LangCode{"eng", "urd"}: "Urdu",
|
||||
LangCode{"eng", "vie"}: "Vietnamese",
|
||||
LangCode{"eng", "yid"}: "Yiddish",
|
||||
}
|
||||
|
||||
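// ISOtoHTML maps the three-letter language codes used in this package to the
// language subtags placed in HTML "lang" attributes of structured content.
// Subtags are taken from the IANA registry:
// https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry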
|
||||
var ISOtoHTML = map[string]string{
|
||||
"afr": "af", // Afrikaans
|
||||
"ain": "ain", // Ainu
|
||||
"alg": "alg", // Algonquian
|
||||
"amh": "am", // Amharic
|
||||
"ara": "ar", // Arabic
|
||||
"arn": "arn", // Mapudungun
|
||||
"bnt": "bnt", // Bantu
|
||||
"bre": "br", // Breton
|
||||
"bul": "bg", // Bulgarian
|
||||
"bur": "my", // Burmese
|
||||
"chi": "zh", // Chinese
|
||||
"chn": "chn", // Chinook Jargon
|
||||
"cze": "cs", // Czech
|
||||
"dan": "da", // Danish
|
||||
"dut": "nl", // Dutch
|
||||
"eng": "en", // English
|
||||
"epo": "eo", // Esperanto
|
||||
"est": "et", // Estonian
|
||||
"fil": "fil", // Filipino
|
||||
"fin": "fi", // Finnish
|
||||
"fre": "fr", // French
|
||||
"geo": "ka", // Georgian
|
||||
"ger": "de", // German
|
||||
"glg": "gl", // Galician
|
||||
"grc": "grc", // Ancient Greek
|
||||
"gre": "el", // Modern Greek
|
||||
"haw": "haw", // Hawaiian
|
||||
"heb": "he", // Hebrew
|
||||
"hin": "hi", // Hindi
|
||||
"hun": "hu", // Hungarian
|
||||
"ice": "is", // Icelandic
|
||||
"ind": "id", // Indonesian
|
||||
"ita": "it", // Italian
|
||||
"jpn": "ja", // Japanese
|
||||
"khm": "km", // Khmer
|
||||
"kor": "ko", // Korean
|
||||
"kur": "ku", // Kurdish
|
||||
"lat": "la", // Latin
|
||||
"mal": "ml", // Malayalam
|
||||
"mao": "mi", // Maori
|
||||
"may": "ms", // Malay
|
||||
"mnc": "mnc", // Manchu
|
||||
"mol": "ro", // Moldavian
|
||||
"mon": "mn", // Mongolian
|
||||
"nor": "no", // Norwegian
|
||||
"per": "fa", // Persian
|
||||
"pol": "pl", // Polish
|
||||
"por": "pt", // Portuguese
|
||||
"rum": "ro", // Romanian
|
||||
"rus": "ru", // Russian
|
||||
"san": "sa", // Sanskrit
|
||||
"scr": "hr", // Croatian
|
||||
"slo": "sk", // Slovak
|
||||
"slv": "sl", // Slovenian
|
||||
"som": "so", // Somali
|
||||
"spa": "es", // Spanish
|
||||
"swa": "sw", // Swahili
|
||||
"swe": "sv", // Swedish
|
||||
"tah": "ty", // Tahitian
|
||||
"tam": "ta", // Tamil
|
||||
"tgl": "tl", // Tagalog
|
||||
"tha": "th", // Thai
|
||||
"tib": "bo", // Tibetan
|
||||
"tur": "tr", // Turkish
|
||||
"ukr": "uk", // Ukrainian
|
||||
"urd": "ur", // Urdu
|
||||
"vie": "vi", // Vietnamese
|
||||
"yid": "yi", // Yiddish
|
||||
}
|
265
jmdict_forms.go
Normal file
265
jmdict_forms.go
Normal file
@ -0,0 +1,265 @@
|
||||
package yomichan
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"foosoft.net/projects/jmdict"
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
// kata2hira returns word with every katakana character replaced by its
// hiragana counterpart. All other characters are passed through unchanged.
func kata2hira(word string) string {
	// Each hiragana code point sits exactly 0x60 below its katakana twin.
	const kanaOffset = 0x60
	return strings.Map(func(r rune) rune {
		switch {
		case r >= 'ァ' && r <= 'ヶ', r >= 'ヽ' && r <= 'ヾ':
			return r - kanaOffset
		default:
			return r
		}
	}, word)
}
|
||||
|
||||
func (h *headword) InfoSymbols() string {
|
||||
infoSymbols := []string{}
|
||||
if h.IsPriority {
|
||||
infoSymbols = append(infoSymbols, prioritySymbol)
|
||||
}
|
||||
if h.IsRareKanji {
|
||||
infoSymbols = append(infoSymbols, rareKanjiSymbol)
|
||||
}
|
||||
if h.IsIrregular {
|
||||
infoSymbols = append(infoSymbols, irregularSymbol)
|
||||
}
|
||||
if h.IsOutdated {
|
||||
infoSymbols = append(infoSymbols, outdatedSymbol)
|
||||
}
|
||||
return strings.Join(infoSymbols[:], " | ")
|
||||
}
|
||||
|
||||
func (h *headword) GlossText() string {
|
||||
gloss := h.Expression
|
||||
if h.IsAteji {
|
||||
gloss = "〈" + gloss + "〉"
|
||||
}
|
||||
symbolText := h.InfoSymbols()
|
||||
if symbolText != "" {
|
||||
gloss += "(" + symbolText + ")"
|
||||
}
|
||||
return gloss
|
||||
}
|
||||
|
||||
func (h *headword) TableColHeaderText() string {
|
||||
text := h.KanjiForm()
|
||||
if h.IsAteji {
|
||||
text = "〈" + text + "〉"
|
||||
}
|
||||
return text
|
||||
}
|
||||
|
||||
func (h *headword) TableRowHeaderText() string {
|
||||
text := h.Reading
|
||||
if h.IsGikun {
|
||||
text = "〈" + text + "〉"
|
||||
}
|
||||
return text
|
||||
}
|
||||
|
||||
func (h *headword) TableCellText() string {
|
||||
text := h.InfoSymbols()
|
||||
if text == "" {
|
||||
return defaultSymbol
|
||||
} else {
|
||||
return text
|
||||
}
|
||||
}
|
||||
|
||||
func (h *headword) KanjiForm() string {
|
||||
if h.IsKanaOnly() {
|
||||
return "∅"
|
||||
} else {
|
||||
return h.Expression
|
||||
}
|
||||
}
|
||||
|
||||
func needsFormTable(headwords []headword) bool {
|
||||
// Does the entry contain more than 1 distinct reading?
|
||||
// E.g. バカがい and ばかがい are not distinct.
|
||||
uniqueReading := ""
|
||||
for _, h := range headwords {
|
||||
if h.IsGikun {
|
||||
return true
|
||||
} else if h.IsSearchOnly {
|
||||
continue
|
||||
} else if h.IsKanaOnly() {
|
||||
continue
|
||||
} else if uniqueReading == "" {
|
||||
uniqueReading = kata2hira(h.Reading)
|
||||
} else if uniqueReading != kata2hira(h.Reading) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// formTableData holds everything needed to render the forms table of an
// entry: one column per kanji form and one row per reading.
type formTableData struct {
|
||||
// kanjiForms lists the distinct kanji forms in order of first appearance.
kanjiForms []string
|
||||
// readings lists the distinct readings in order of first appearance.
readings []string
|
||||
// colHeaderText maps a kanji form to its column header text.
colHeaderText map[string]string
|
||||
// rowHeaderText maps a reading to its row header text.
rowHeaderText map[string]string
|
||||
// cellText maps reading -> kanji form -> text of the table cell.
cellText map[string]map[string]string
|
||||
}
|
||||
|
||||
func tableData(headwords []headword) formTableData {
|
||||
d := formTableData{
|
||||
kanjiForms: []string{},
|
||||
readings: []string{},
|
||||
colHeaderText: make(map[string]string),
|
||||
rowHeaderText: make(map[string]string),
|
||||
cellText: make(map[string]map[string]string),
|
||||
}
|
||||
for _, h := range headwords {
|
||||
if h.IsSearchOnly {
|
||||
continue
|
||||
}
|
||||
kanjiForm := h.KanjiForm()
|
||||
if !slices.Contains(d.kanjiForms, kanjiForm) {
|
||||
d.kanjiForms = append(d.kanjiForms, kanjiForm)
|
||||
d.colHeaderText[kanjiForm] = h.TableColHeaderText()
|
||||
}
|
||||
reading := h.Reading
|
||||
if !slices.Contains(d.readings, reading) {
|
||||
d.readings = append(d.readings, reading)
|
||||
d.rowHeaderText[reading] = h.TableRowHeaderText()
|
||||
d.cellText[reading] = make(map[string]string)
|
||||
}
|
||||
d.cellText[reading][kanjiForm] = h.TableCellText()
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
func formsTableGlossary(headwords []headword) []any {
|
||||
d := tableData(headwords)
|
||||
|
||||
attr := contentAttr{}
|
||||
centeredAttr := contentAttr{textAlign: "center"}
|
||||
leftAttr := contentAttr{textAlign: "left"}
|
||||
|
||||
cornerCell := contentTableHeadCell(attr, "") // empty cell in upper left corner
|
||||
headRowCells := []any{cornerCell}
|
||||
for _, kanjiForm := range d.kanjiForms {
|
||||
content := d.colHeaderText[kanjiForm]
|
||||
cell := contentTableHeadCell(centeredAttr, content)
|
||||
headRowCells = append(headRowCells, cell)
|
||||
}
|
||||
headRow := contentTableRow(attr, headRowCells...)
|
||||
tableRows := []any{headRow}
|
||||
for _, reading := range d.readings {
|
||||
rowHeadCellText := d.rowHeaderText[reading]
|
||||
rowHeadCell := contentTableHeadCell(leftAttr, rowHeadCellText)
|
||||
rowCells := []any{rowHeadCell}
|
||||
for _, kanjiForm := range d.kanjiForms {
|
||||
text := d.cellText[reading][kanjiForm]
|
||||
rowCell := contentTableCell(centeredAttr, text)
|
||||
rowCells = append(rowCells, rowCell)
|
||||
}
|
||||
tableRow := contentTableRow(attr, rowCells...)
|
||||
tableRows = append(tableRows, tableRow)
|
||||
}
|
||||
tableAttr := contentAttr{data: map[string]string{"content": "formsTable"}}
|
||||
contentTable := contentTable(tableAttr, tableRows...)
|
||||
content := contentStructure(contentTable)
|
||||
return []any{content}
|
||||
}
|
||||
|
||||
func formsGlossary(headwords []headword) []any {
|
||||
glossary := []any{}
|
||||
for _, h := range headwords {
|
||||
if h.IsSearchOnly {
|
||||
continue
|
||||
}
|
||||
text := h.GlossText()
|
||||
glossary = append(glossary, text)
|
||||
}
|
||||
return glossary
|
||||
}
|
||||
|
||||
func baseFormsTerm(entry jmdict.JmdictEntry) dbTerm {
|
||||
term := dbTerm{Sequence: entry.Sequence}
|
||||
headwords := extractHeadwords(entry)
|
||||
if needsFormTable(headwords) {
|
||||
term.Glossary = formsTableGlossary(headwords)
|
||||
} else {
|
||||
term.Glossary = formsGlossary(headwords)
|
||||
}
|
||||
for _, sense := range entry.Sense {
|
||||
rules := grammarRules(sense.PartsOfSpeech)
|
||||
term.addRules(rules...)
|
||||
}
|
||||
return term
|
||||
}
|
||||
|
||||
func formsExportDb(inputPath, outputPath, languageName, title string, stride int, pretty bool) error {
|
||||
reader, err := os.Open(inputPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer reader.Close()
|
||||
|
||||
dictionary, entities, err := jmdict.LoadJmdictNoTransform(reader)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
meta := newJmdictMetadata(dictionary, "")
|
||||
|
||||
terms := dbTermList{}
|
||||
for _, entry := range dictionary.Entries {
|
||||
baseTerm := baseFormsTerm(entry)
|
||||
headwords := extractHeadwords(entry)
|
||||
for _, h := range headwords {
|
||||
if h.IsSearchOnly {
|
||||
if term, ok := createSearchTerm(h, entry, meta); ok {
|
||||
terms = append(terms, term)
|
||||
}
|
||||
continue
|
||||
}
|
||||
term := baseTerm
|
||||
term.Expression = h.Expression
|
||||
term.Reading = h.Reading
|
||||
term.addTermTags(h.TermTags...)
|
||||
term.Score = calculateTermScore(1, 0, h)
|
||||
terms = append(terms, term)
|
||||
}
|
||||
}
|
||||
|
||||
tags := dbTagList{}
|
||||
tags = append(tags, entityTags(entities)...)
|
||||
tags = append(tags, newsFrequencyTags()...)
|
||||
tags = append(tags, customDbTags()...)
|
||||
|
||||
if title == "" {
|
||||
title = "JMdict Forms"
|
||||
}
|
||||
|
||||
recordData := map[string]dbRecordList{
|
||||
"term": terms.crush(),
|
||||
"tag": tags.crush(),
|
||||
}
|
||||
|
||||
jmdictDate := jmdictPublicationDate(dictionary)
|
||||
|
||||
index := dbIndex{
|
||||
Title: title,
|
||||
Revision: "JMdict." + jmdictDate,
|
||||
Sequenced: true,
|
||||
Attribution: edrdgAttribution,
|
||||
}
|
||||
|
||||
return writeDb(
|
||||
outputPath,
|
||||
index,
|
||||
recordData,
|
||||
stride,
|
||||
pretty,
|
||||
)
|
||||
}
|
300
jmdict_glossary.go
Normal file
300
jmdict_glossary.go
Normal file
@ -0,0 +1,300 @@
|
||||
package yomichan
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
|
||||
"foosoft.net/projects/jmdict"
|
||||
)
|
||||
|
||||
func glossaryContainsLanguage(glossary []jmdict.JmdictGlossary, language string) bool {
|
||||
hasGlosses := false
|
||||
for _, gloss := range glossary {
|
||||
if glossContainsLanguage(gloss, language) {
|
||||
hasGlosses = true
|
||||
break
|
||||
}
|
||||
}
|
||||
return hasGlosses
|
||||
}
|
||||
|
||||
func glossContainsLanguage(gloss jmdict.JmdictGlossary, language string) bool {
|
||||
if gloss.Language == nil && language != "eng" {
|
||||
return false
|
||||
} else if gloss.Language != nil && language != *gloss.Language {
|
||||
return false
|
||||
} else {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
func makeGlossListItem(gloss jmdict.JmdictGlossary, language string) any {
|
||||
contents := []any{gloss.Content}
|
||||
listItem := contentListItem(contentAttr{}, contents...)
|
||||
return listItem
|
||||
}
|
||||
|
||||
func makeInfoGlossListItem(gloss jmdict.JmdictGlossary, language string) any {
|
||||
// Prepend gloss with "type" (literal, figurative, trademark, etc.)
|
||||
glossTypeCode := *gloss.Type
|
||||
contents := []any{}
|
||||
if name, ok := glossTypeCodeToName[LangCode{language, glossTypeCode}]; ok {
|
||||
if name != "" {
|
||||
italicStyle := contentAttr{fontStyle: "italic"}
|
||||
contents = append(contents, contentSpan(italicStyle, "("+name+")"), " ")
|
||||
}
|
||||
} else {
|
||||
fmt.Println("Unknown glossary type code " + *gloss.Type + " for build language " + language)
|
||||
contents = append(contents, "["+glossTypeCode+"] ")
|
||||
}
|
||||
contents = append(contents, gloss.Content)
|
||||
listItem := contentListItem(contentAttr{}, contents...)
|
||||
return listItem
|
||||
}
|
||||
|
||||
func makeSourceLangListItem(sourceLanguage jmdict.JmdictSource, language string) any {
|
||||
contents := []any{}
|
||||
|
||||
var srcLangCode string
|
||||
if sourceLanguage.Language == nil {
|
||||
srcLangCode = "eng"
|
||||
} else {
|
||||
srcLangCode = *sourceLanguage.Language
|
||||
}
|
||||
|
||||
// Format: [Language] ([Partial?], [Wasei?]): [Original word?]
|
||||
// [Language]
|
||||
if langName, ok := langCodeToName[LangCode{language, srcLangCode}]; ok {
|
||||
contents = append(contents, langName)
|
||||
} else {
|
||||
contents = append(contents, srcLangCode)
|
||||
fmt.Println("Unable to convert ISO 639 code " + srcLangCode + " to its full name in language " + language)
|
||||
}
|
||||
|
||||
// ([Partial?], [Wasei?])
|
||||
var sourceLangTypeCode string
|
||||
if sourceLanguage.Type == nil {
|
||||
sourceLangTypeCode = ""
|
||||
} else {
|
||||
sourceLangTypeCode = *sourceLanguage.Type
|
||||
}
|
||||
var sourceLangType string
|
||||
if val, ok := sourceLangTypeCodeToType[LangCode{language, sourceLangTypeCode}]; ok {
|
||||
sourceLangType = val
|
||||
} else {
|
||||
sourceLangType = sourceLangTypeCode
|
||||
fmt.Println("Unknown source language type code " + sourceLangTypeCode + " for build language " + language)
|
||||
}
|
||||
if sourceLangType != "" && sourceLanguage.Wasei == "y" {
|
||||
contents = append(contents, " ("+sourceLangType+", wasei)")
|
||||
} else if sourceLangType != "" {
|
||||
contents = append(contents, " ("+sourceLangType+")")
|
||||
} else if sourceLanguage.Wasei == "y" {
|
||||
contents = append(contents, " (wasei)")
|
||||
}
|
||||
|
||||
// : [Original word?]
|
||||
if sourceLanguage.Content != "" {
|
||||
contents = append(contents, ": ")
|
||||
attr := contentAttr{lang: ISOtoHTML[srcLangCode]}
|
||||
contents = append(contents, contentSpan(attr, sourceLanguage.Content))
|
||||
}
|
||||
|
||||
listItem := contentListItem(contentAttr{}, contents...)
|
||||
return listItem
|
||||
}
|
||||
|
||||
func makeReferenceListItem(reference string, refType string, meta jmdictMetadata) any {
|
||||
contents := []any{}
|
||||
attr := contentAttr{}
|
||||
|
||||
hint := refNoteHint[LangCode{meta.language, refType}]
|
||||
contents = append(contents, hint+": ")
|
||||
|
||||
refHeadword, senseNumber, ok := parseReference(reference)
|
||||
if !ok {
|
||||
contents = append(contents, "【"+reference+"】")
|
||||
return contentListItem(attr, contents...)
|
||||
}
|
||||
|
||||
sequence, ok := meta.referenceToSeq[reference]
|
||||
if !ok {
|
||||
contents = append(contents, "【"+reference+"】")
|
||||
return contentListItem(attr, contents...)
|
||||
}
|
||||
|
||||
targetSense := senseID{
|
||||
sequence: sequence,
|
||||
number: senseNumber,
|
||||
}
|
||||
|
||||
expHash := refHeadword.ExpHash()
|
||||
doDisplayReading := (len(meta.expHashToReadings[expHash]) > 1)
|
||||
doDisplaySenseNumber := (meta.seqToSenseCount[targetSense.sequence] > 1)
|
||||
refGlossAttr := contentAttr{
|
||||
fontSize: "65%",
|
||||
verticalAlign: "middle",
|
||||
data: map[string]string{"content": "refGlosses"},
|
||||
}
|
||||
|
||||
contents = append(contents, refHeadword.ToInternalLink(doDisplayReading))
|
||||
if doDisplaySenseNumber {
|
||||
contents = append(contents, contentSpan(refGlossAttr, " "+strconv.Itoa(targetSense.number)+". "+meta.condensedGlosses[targetSense]))
|
||||
} else {
|
||||
contents = append(contents, contentSpan(refGlossAttr, " "+meta.condensedGlosses[targetSense]))
|
||||
}
|
||||
|
||||
listItem := contentListItem(attr, contents...)
|
||||
return listItem
|
||||
}
|
||||
|
||||
func makeExampleListItem(sentence jmdict.JmdictExampleSentence) any {
|
||||
if sentence.Lang == "jpn" {
|
||||
return contentListItem(contentAttr{}, sentence.Text)
|
||||
} else {
|
||||
attr := contentAttr{
|
||||
lang: ISOtoHTML[sentence.Lang],
|
||||
listStyleType: ISOtoFlag[sentence.Lang],
|
||||
}
|
||||
return contentListItem(attr, sentence.Text)
|
||||
}
|
||||
}
|
||||
|
||||
func listAttr(lang string, listStyleType string, dataContent string) contentAttr {
|
||||
return contentAttr{
|
||||
lang: lang,
|
||||
listStyleType: listStyleType,
|
||||
data: map[string]string{"content": dataContent},
|
||||
}
|
||||
}
|
||||
|
||||
func needsStructuredContent(sense jmdict.JmdictSense, language string) bool {
|
||||
for _, gloss := range sense.Glossary {
|
||||
if glossContainsLanguage(gloss, language) && gloss.Type != nil {
|
||||
return true
|
||||
}
|
||||
}
|
||||
if len(sense.SourceLanguages) > 0 {
|
||||
return true
|
||||
} else if len(sense.Information) > 0 {
|
||||
return true
|
||||
} else if len(sense.Antonyms) > 0 {
|
||||
return true
|
||||
} else if len(sense.References) > 0 {
|
||||
return true
|
||||
} else if len(sense.Examples) > 0 {
|
||||
return true
|
||||
} else {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func createGlossaryContent(sense jmdict.JmdictSense, meta jmdictMetadata) any {
|
||||
glossaryContents := []any{}
|
||||
|
||||
// Add normal glosses
|
||||
glossListItems := []any{}
|
||||
for _, gloss := range sense.Glossary {
|
||||
if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
|
||||
listItem := makeGlossListItem(gloss, meta.language)
|
||||
glossListItems = append(glossListItems, listItem)
|
||||
}
|
||||
}
|
||||
if len(glossListItems) > 0 {
|
||||
attr := listAttr(ISOtoHTML[meta.language], "circle", "glossary")
|
||||
list := contentUnorderedList(attr, glossListItems...)
|
||||
glossaryContents = append(glossaryContents, list)
|
||||
}
|
||||
|
||||
// Add information glosses
|
||||
infoGlossListItems := []any{}
|
||||
for _, gloss := range sense.Glossary {
|
||||
if glossContainsLanguage(gloss, meta.language) && gloss.Type != nil {
|
||||
listItem := makeInfoGlossListItem(gloss, meta.language)
|
||||
infoGlossListItems = append(infoGlossListItems, listItem)
|
||||
}
|
||||
}
|
||||
if len(infoGlossListItems) > 0 {
|
||||
attr := listAttr(ISOtoHTML[meta.language], infoMarker, "infoGlossary")
|
||||
list := contentUnorderedList(attr, infoGlossListItems...)
|
||||
glossaryContents = append(glossaryContents, list)
|
||||
}
|
||||
|
||||
// Add language-of-origin / loanword information
|
||||
sourceLangListItems := []any{}
|
||||
for _, sourceLanguage := range sense.SourceLanguages {
|
||||
listItem := makeSourceLangListItem(sourceLanguage, meta.language)
|
||||
sourceLangListItems = append(sourceLangListItems, listItem)
|
||||
}
|
||||
if len(sourceLangListItems) > 0 {
|
||||
attr := listAttr(ISOtoHTML[meta.language], langMarker, "sourceLanguages")
|
||||
list := contentUnorderedList(attr, sourceLangListItems...)
|
||||
glossaryContents = append(glossaryContents, list)
|
||||
}
|
||||
|
||||
// Add sense notes
|
||||
noteListItems := []any{}
|
||||
for _, information := range sense.Information {
|
||||
listItem := contentListItem(contentAttr{}, information)
|
||||
noteListItems = append(noteListItems, listItem)
|
||||
}
|
||||
if len(noteListItems) > 0 {
|
||||
attr := listAttr(ISOtoHTML["jpn"], noteMarker, "notes") // notes often contain japanese text
|
||||
list := contentUnorderedList(attr, noteListItems...)
|
||||
glossaryContents = append(glossaryContents, list)
|
||||
}
|
||||
|
||||
// Add antonyms
|
||||
antonymListItems := []any{}
|
||||
for _, antonym := range sense.Antonyms {
|
||||
listItem := makeReferenceListItem(antonym, "ant", meta)
|
||||
antonymListItems = append(antonymListItems, listItem)
|
||||
}
|
||||
if len(antonymListItems) > 0 {
|
||||
attr := listAttr(ISOtoHTML[meta.language], antonymMarker, "antonyms")
|
||||
list := contentUnorderedList(attr, antonymListItems...)
|
||||
glossaryContents = append(glossaryContents, list)
|
||||
}
|
||||
|
||||
// Add cross-references
|
||||
referenceListItems := []any{}
|
||||
for _, reference := range sense.References {
|
||||
listItem := makeReferenceListItem(reference, "xref", meta)
|
||||
referenceListItems = append(referenceListItems, listItem)
|
||||
}
|
||||
if len(referenceListItems) > 0 {
|
||||
attr := listAttr(ISOtoHTML[meta.language], refMarker, "references")
|
||||
list := contentUnorderedList(attr, referenceListItems...)
|
||||
glossaryContents = append(glossaryContents, list)
|
||||
}
|
||||
|
||||
// Add example sentences
|
||||
exampleListItems := []any{}
|
||||
for _, example := range sense.Examples {
|
||||
for _, sentence := range example.Sentences {
|
||||
listItem := makeExampleListItem(sentence)
|
||||
exampleListItems = append(exampleListItems, listItem)
|
||||
}
|
||||
}
|
||||
if len(exampleListItems) > 0 {
|
||||
attr := listAttr(ISOtoHTML["jpn"], ISOtoFlag["jpn"], "examples")
|
||||
list := contentUnorderedList(attr, exampleListItems...)
|
||||
glossaryContents = append(glossaryContents, list)
|
||||
}
|
||||
|
||||
return contentStructure(glossaryContents...)
|
||||
}
|
||||
|
||||
func createGlossary(sense jmdict.JmdictSense, meta jmdictMetadata) []any {
|
||||
glossary := []any{}
|
||||
if meta.extraMode && needsStructuredContent(sense, meta.language) {
|
||||
glossary = append(glossary, createGlossaryContent(sense, meta))
|
||||
} else {
|
||||
for _, gloss := range sense.Glossary {
|
||||
if glossContainsLanguage(gloss, meta.language) {
|
||||
glossary = append(glossary, gloss.Content)
|
||||
}
|
||||
}
|
||||
}
|
||||
return glossary
|
||||
}
|
282
jmdict_headword.go
Normal file
282
jmdict_headword.go
Normal file
@ -0,0 +1,282 @@
|
||||
package yomichan
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"regexp"
|
||||
"strconv"
|
||||
|
||||
"foosoft.net/projects/jmdict"
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
// headword is one written form of a dictionary entry: an expression paired
// with a reading, plus flags derived from the entry's information and
// frequency tags (see SetFlags and SetTermTags).
type headword struct {
|
||||
// Expression is the kanji form, or the reading itself when the headword
// was built without a kanji form.
Expression string
|
||||
// Reading is the kana reading; empty when the headword was built from a
// kanji form alone.
Reading string
|
||||
// TermTags are the tag names accumulated by SetFlags and SetTermTags.
TermTags []string
|
||||
// Index is the headword's position in the list built by extractHeadwords.
Index int
|
||||
// IsPriority is set when the frequency tags include ichi1, news1, gai1,
// spec1 or spec2.
IsPriority bool
|
||||
// IsFrequent is set when the headword has more than one frequency tag.
IsFrequent bool
|
||||
// IsIrregular is set by the information tags iK, ik and io.
IsIrregular bool
|
||||
// IsOutdated is set by the information tags oK and ok.
IsOutdated bool
|
||||
// IsRareKanji is set by the information tag rK, unless the headword is
// also outdated.
IsRareKanji bool
|
||||
// IsSearchOnly is set by the information tags sK and sk.
IsSearchOnly bool
|
||||
// IsAteji is set by the information tag ateji.
IsAteji bool
|
||||
// IsGikun is set by the information tag gikun.
IsGikun bool
|
||||
}
|
||||
|
||||
// hash is the 64-bit digest of a text key, as produced by hashText.
type hash uint64
|
||||
|
||||
func (h *headword) Hash() hash {
|
||||
return hashText(h.Expression + "␞" + h.Reading)
|
||||
}
|
||||
|
||||
func (h *headword) ExpHash() hash {
|
||||
return hashText(h.Expression + "␞" + h.Expression)
|
||||
}
|
||||
|
||||
func (h *headword) ReadingHash() hash {
|
||||
return hashText(h.Reading + "␞" + h.Reading)
|
||||
}
|
||||
|
||||
func hashText(s string) hash {
|
||||
h := fnv.New64a()
|
||||
h.Write([]byte(s))
|
||||
return hash(h.Sum64())
|
||||
}
|
||||
|
||||
func (h *headword) IsKanaOnly() bool {
|
||||
if h.Expression != h.Reading {
|
||||
return false
|
||||
}
|
||||
for _, char := range h.Expression {
|
||||
if char >= 'ぁ' && char <= 'ヿ' {
|
||||
// hiragana and katakana range
|
||||
continue
|
||||
} else if char >= '・' && char <= '゚' {
|
||||
// halfwidth katakana range
|
||||
continue
|
||||
} else if char == '〜' {
|
||||
continue
|
||||
} else {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (h *headword) Score() int {
|
||||
score := 0
|
||||
if h.IsPriority {
|
||||
score += 1
|
||||
}
|
||||
if h.IsFrequent {
|
||||
score += 1
|
||||
}
|
||||
if h.IsIrregular {
|
||||
score -= 5
|
||||
}
|
||||
if h.IsOutdated {
|
||||
score -= 5
|
||||
}
|
||||
if h.IsRareKanji {
|
||||
score -= 5
|
||||
}
|
||||
if h.IsSearchOnly {
|
||||
score -= 5
|
||||
}
|
||||
return score
|
||||
}
|
||||
|
||||
func (h *headword) ToInternalLink(includeReading bool) any {
|
||||
if !includeReading || h.Expression == h.Reading {
|
||||
return contentInternalLink(
|
||||
contentAttr{lang: ISOtoHTML["jpn"]},
|
||||
h.Expression,
|
||||
)
|
||||
} else {
|
||||
return contentSpan(
|
||||
contentAttr{lang: ISOtoHTML["jpn"]},
|
||||
contentInternalLink(contentAttr{}, h.Expression),
|
||||
"(",
|
||||
contentInternalLink(contentAttr{}, h.Reading),
|
||||
")",
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
func (h *headword) SetFlags(infoTags, freqTags []string) {
|
||||
priorityTags := []string{"ichi1", "news1", "gai1", "spec1", "spec2"}
|
||||
for _, priorityTag := range priorityTags {
|
||||
if slices.Contains(freqTags, priorityTag) {
|
||||
h.IsPriority = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if len(freqTags) > 1 {
|
||||
h.IsFrequent = true
|
||||
}
|
||||
for _, infoTag := range infoTags {
|
||||
switch infoTag {
|
||||
case "iK", "ik", "io":
|
||||
h.IsIrregular = true
|
||||
case "oK", "ok":
|
||||
h.IsOutdated = true
|
||||
case "sK", "sk":
|
||||
h.IsSearchOnly = true
|
||||
case "rK":
|
||||
h.IsRareKanji = true
|
||||
case "ateji":
|
||||
h.IsAteji = true
|
||||
case "gikun":
|
||||
h.IsGikun = true
|
||||
default:
|
||||
fmt.Println("Unknown information tag type: " + infoTag)
|
||||
h.TermTags = append(h.TermTags, infoTag)
|
||||
}
|
||||
}
|
||||
if h.IsOutdated && h.IsRareKanji {
|
||||
h.IsRareKanji = false
|
||||
}
|
||||
}
|
||||
|
||||
func (h *headword) SetTermTags(freqTags []string) {
|
||||
if h.IsPriority {
|
||||
h.TermTags = append(h.TermTags, priorityTagName)
|
||||
}
|
||||
knownFreqTags := []string{"ichi1", "ichi2", "gai1", "gai2", "spec1", "spec2"}
|
||||
for _, tag := range freqTags {
|
||||
isNewsFreqTag, _ := regexp.MatchString(`nf\d\d`, tag)
|
||||
if isNewsFreqTag {
|
||||
// nf tags are divided into ranks of 500
|
||||
// (nf01 to nf48). Let's combine them into
|
||||
// ranks of 1k (news1k, news2k, ..., news24k).
|
||||
var i int
|
||||
if _, err := fmt.Sscanf(tag, "nf%2d", &i); err == nil {
|
||||
i = (i + (i % 2)) / 2
|
||||
newsTag := "news" + strconv.Itoa(i) + "k"
|
||||
h.TermTags = append(h.TermTags, newsTag)
|
||||
}
|
||||
} else if tag == "news1" || tag == "news2" {
|
||||
// News tags are derived from the nf
|
||||
// rankings, so these are not needed.
|
||||
continue
|
||||
} else if slices.Contains(knownFreqTags, tag) {
|
||||
tagWithoutTheNumber := tag[:len(tag)-1]
|
||||
h.TermTags = append(h.TermTags, tagWithoutTheNumber)
|
||||
} else {
|
||||
fmt.Println("Unknown frequency tag type: " + tag)
|
||||
h.TermTags = append(h.TermTags, tag)
|
||||
}
|
||||
}
|
||||
if h.IsIrregular {
|
||||
h.TermTags = append(h.TermTags, irregularTagName)
|
||||
}
|
||||
if h.IsOutdated {
|
||||
h.TermTags = append(h.TermTags, outdatedTagName)
|
||||
}
|
||||
if h.IsRareKanji {
|
||||
h.TermTags = append(h.TermTags, rareKanjiTagName)
|
||||
}
|
||||
if h.IsAteji {
|
||||
h.TermTags = append(h.TermTags, atejiTagName)
|
||||
}
|
||||
if h.IsGikun {
|
||||
h.TermTags = append(h.TermTags, gikunTagName)
|
||||
}
|
||||
}
|
||||
|
||||
func newHeadword(kanji *jmdict.JmdictKanji, reading *jmdict.JmdictReading) headword {
|
||||
h := headword{}
|
||||
infoTags := []string{}
|
||||
freqTags := []string{}
|
||||
if kanji == nil {
|
||||
h.Expression = reading.Reading
|
||||
h.Reading = reading.Reading
|
||||
infoTags = reading.Information
|
||||
freqTags = reading.Priorities
|
||||
} else if reading == nil {
|
||||
// should only apply to search-only kanji terms
|
||||
h.Expression = kanji.Expression
|
||||
h.Reading = ""
|
||||
infoTags = kanji.Information
|
||||
freqTags = kanji.Priorities
|
||||
} else {
|
||||
h.Expression = kanji.Expression
|
||||
h.Reading = reading.Reading
|
||||
infoTags = union(kanji.Information, reading.Information)
|
||||
freqTags = intersection(kanji.Priorities, reading.Priorities)
|
||||
}
|
||||
h.SetFlags(infoTags, freqTags)
|
||||
h.SetTermTags(freqTags)
|
||||
return h
|
||||
}
|
||||
|
||||
func areAllKanjiIrregular(allKanji []jmdict.JmdictKanji) bool {
|
||||
// If every kanji form is rare or irregular, then we'll make
|
||||
// kana-only headwords for each kana form.
|
||||
if len(allKanji) == 0 {
|
||||
return false
|
||||
}
|
||||
for _, kanji := range allKanji {
|
||||
h := newHeadword(&kanji, nil)
|
||||
kanjiIsIrregular := h.IsRareKanji || h.IsIrregular || h.IsOutdated || h.IsSearchOnly
|
||||
if !kanjiIsIrregular {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func extractHeadwords(entry jmdict.JmdictEntry) []headword {
|
||||
headwords := []headword{}
|
||||
allKanjiAreIrregular := areAllKanjiIrregular(entry.Kanji)
|
||||
|
||||
if allKanjiAreIrregular {
|
||||
// Adding the reading-only terms before kanji+reading
|
||||
// terms here for the sake of the Index property,
|
||||
// which affects the yomichan term ranking.
|
||||
for _, reading := range entry.Readings {
|
||||
h := newHeadword(nil, &reading)
|
||||
h.Index = len(headwords)
|
||||
headwords = append(headwords, h)
|
||||
}
|
||||
}
|
||||
|
||||
for _, kanji := range entry.Kanji {
|
||||
if slices.Contains(kanji.Information, "sK") {
|
||||
// Search-only kanji forms do not have associated readings.
|
||||
h := newHeadword(&kanji, nil)
|
||||
h.Index = len(headwords)
|
||||
headwords = append(headwords, h)
|
||||
continue
|
||||
}
|
||||
for _, reading := range entry.Readings {
|
||||
if reading.NoKanji != nil {
|
||||
continue
|
||||
} else if slices.Contains(reading.Information, "sk") {
|
||||
// Search-only kana forms do not have associated kanji forms.
|
||||
continue
|
||||
} else if reading.Restrictions != nil && !slices.Contains(reading.Restrictions, kanji.Expression) {
|
||||
continue
|
||||
} else {
|
||||
h := newHeadword(&kanji, &reading)
|
||||
h.Index = len(headwords)
|
||||
headwords = append(headwords, h)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !allKanjiAreIrregular {
|
||||
noKanjiInEntry := (len(entry.Kanji) == 0)
|
||||
for _, reading := range entry.Readings {
|
||||
if reading.NoKanji != nil || noKanjiInEntry || slices.Contains(reading.Information, "sk") {
|
||||
h := newHeadword(nil, &reading)
|
||||
h.Index = len(headwords)
|
||||
headwords = append(headwords, h)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return headwords
|
||||
}
|
183
jmdict_metadata.go
Normal file
183
jmdict_metadata.go
Normal file
@ -0,0 +1,183 @@
|
||||
package yomichan
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"foosoft.net/projects/jmdict"
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
// sequence is an entry's sequence number (the Sequence field of an entry).
type sequence = int
|
||||
|
||||
// jmdictMetadata gathers lookup tables about a dictionary that are consulted
// while its terms are being built.
type jmdictMetadata struct {
|
||||
// language is the code of the language whose glosses are being exported.
language string
|
||||
// condensedGlosses maps a sense to the short gloss text shown next to
// references pointing at it.
condensedGlosses map[senseID]string
|
||||
// seqToSenseCount maps an entry to its number of senses that have a
// gloss in the build language.
seqToSenseCount map[sequence]int
|
||||
// seqToMainHeadword maps an entry to the first headword recorded for it.
seqToMainHeadword map[sequence]headword
|
||||
// expHashToReadings maps an expression hash to the distinct readings
// recorded for that expression.
expHashToReadings map[hash][]string
|
||||
// headwordHashToSeqs maps a headword hash to the entries that contain
// that headword.
headwordHashToSeqs map[hash][]sequence
|
||||
references []string
|
||||
// referenceToSeq maps a reference string to the entry it resolves to.
referenceToSeq map[string]sequence
|
||||
hashToSearchValues map[hash][]searchValue
|
||||
// seqToSearchHashes maps an entry to the search hashes recorded for its
// headwords.
seqToSearchHashes map[sequence][]searchHash
|
||||
// entryDepth maps an entry to the depth assigned by CalculateEntryDepth.
entryDepth map[sequence]int
|
||||
hasMultipleForms map[sequence]bool
|
||||
maxSenseCount int
|
||||
// extraMode enables structured-content glossaries (see createGlossary).
extraMode bool
|
||||
}
|
||||
|
||||
// senseID identifies one sense of one entry.
type senseID struct {
|
||||
// sequence is the sequence number of the entry containing the sense.
sequence sequence
|
||||
// number is the sense's number within the entry.
number int
|
||||
}
|
||||
|
||||
func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, entrySequence sequence) {
|
||||
// This is to ensure that terms are grouped among their
|
||||
// entries of origin and displayed in correct sequential order
|
||||
maxDepth := 0
|
||||
for _, headword := range headwords {
|
||||
hash := headword.Hash()
|
||||
for _, seq := range meta.headwordHashToSeqs[hash] {
|
||||
seqDepth := meta.entryDepth[seq]
|
||||
if seqDepth == 0 {
|
||||
meta.entryDepth[seq] = 1
|
||||
seqDepth = 1
|
||||
}
|
||||
if maxDepth < seqDepth+1 {
|
||||
maxDepth = seqDepth + 1
|
||||
}
|
||||
}
|
||||
}
|
||||
meta.entryDepth[entrySequence] = maxDepth
|
||||
}
|
||||
|
||||
func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEntry) {
|
||||
|
||||
// Determine how many senses are in this entry for this language
|
||||
if _, ok := meta.seqToSenseCount[entry.Sequence]; !ok {
|
||||
senseCount := 0
|
||||
for _, entrySense := range entry.Sense {
|
||||
for _, gloss := range entrySense.Glossary {
|
||||
if glossContainsLanguage(gloss, meta.language) {
|
||||
senseCount += 1
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
meta.seqToSenseCount[entry.Sequence] = senseCount
|
||||
}
|
||||
|
||||
if meta.seqToSenseCount[entry.Sequence] == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// main headwords (first ones that are found in entries).
|
||||
if _, ok := meta.seqToMainHeadword[entry.Sequence]; !ok {
|
||||
meta.seqToMainHeadword[entry.Sequence] = headword
|
||||
}
|
||||
|
||||
// hash the term pair so we can determine if it's used
|
||||
// in more than one JMdict entry later.
|
||||
headwordHash := headword.Hash()
|
||||
if !slices.Contains(meta.headwordHashToSeqs[headwordHash], entry.Sequence) {
|
||||
meta.headwordHashToSeqs[headwordHash] = append(meta.headwordHashToSeqs[headwordHash], entry.Sequence)
|
||||
}
|
||||
|
||||
// hash the expression so that we can determine if we
|
||||
// need to disambiguate it by displaying its reading
|
||||
// in reference notes later.
|
||||
expHash := headword.ExpHash()
|
||||
if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) {
|
||||
meta.expHashToReadings[expHash] = append(meta.expHashToReadings[expHash], headword.Reading)
|
||||
}
|
||||
|
||||
// e.g. for JMdict (English) we expect to end up with
|
||||
// seqToHashedHeadwords[1260670] == 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】
|
||||
// used for correlating references to sequence numbers later.
|
||||
searchHashes := []searchHash{
|
||||
searchHash{headwordHash, headword.IsPriority},
|
||||
searchHash{expHash, headword.IsPriority},
|
||||
searchHash{headword.ReadingHash(), headword.IsPriority},
|
||||
}
|
||||
for _, x := range searchHashes {
|
||||
if !slices.Contains(meta.seqToSearchHashes[entry.Sequence], x) {
|
||||
meta.seqToSearchHashes[entry.Sequence] = append(meta.seqToSearchHashes[entry.Sequence], x)
|
||||
}
|
||||
}
|
||||
|
||||
currentSenseNumber := 1
|
||||
for _, entrySense := range entry.Sense {
|
||||
if !glossaryContainsLanguage(entrySense.Glossary, meta.language) {
|
||||
continue
|
||||
}
|
||||
if entrySense.RestrictedReadings != nil && !slices.Contains(entrySense.RestrictedReadings, headword.Reading) {
|
||||
currentSenseNumber += 1
|
||||
continue
|
||||
}
|
||||
if entrySense.RestrictedKanji != nil && !slices.Contains(entrySense.RestrictedKanji, headword.Expression) {
|
||||
currentSenseNumber += 1
|
||||
continue
|
||||
}
|
||||
|
||||
allReferences := append(entrySense.References, entrySense.Antonyms...)
|
||||
for _, reference := range allReferences {
|
||||
meta.references = append(meta.references, reference)
|
||||
}
|
||||
|
||||
currentSense := senseID{entry.Sequence, currentSenseNumber}
|
||||
if meta.condensedGlosses[currentSense] == "" {
|
||||
glosses := []string{}
|
||||
for _, gloss := range entrySense.Glossary {
|
||||
if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
|
||||
glosses = append(glosses, gloss.Content)
|
||||
}
|
||||
}
|
||||
meta.condensedGlosses[currentSense] = strings.Join(glosses, "; ")
|
||||
}
|
||||
currentSenseNumber += 1
|
||||
}
|
||||
}
|
||||
|
||||
func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata {
|
||||
meta := jmdictMetadata{
|
||||
language: langNameToCode[languageName],
|
||||
seqToSenseCount: make(map[sequence]int),
|
||||
condensedGlosses: make(map[senseID]string),
|
||||
seqToMainHeadword: make(map[sequence]headword),
|
||||
expHashToReadings: make(map[hash][]string),
|
||||
seqToSearchHashes: make(map[sequence][]searchHash),
|
||||
headwordHashToSeqs: make(map[hash][]sequence),
|
||||
references: []string{},
|
||||
hashToSearchValues: nil,
|
||||
referenceToSeq: nil,
|
||||
entryDepth: make(map[sequence]int),
|
||||
hasMultipleForms: make(map[sequence]bool),
|
||||
maxSenseCount: 0,
|
||||
extraMode: languageName == "english_extra",
|
||||
}
|
||||
|
||||
for _, entry := range dictionary.Entries {
|
||||
headwords := extractHeadwords(entry)
|
||||
formCount := 0
|
||||
for _, headword := range headwords {
|
||||
meta.AddHeadword(headword, entry)
|
||||
if !headword.IsSearchOnly {
|
||||
formCount += 1
|
||||
}
|
||||
}
|
||||
meta.CalculateEntryDepth(headwords, entry.Sequence)
|
||||
meta.hasMultipleForms[entry.Sequence] = (formCount > 1)
|
||||
}
|
||||
|
||||
// this correlation process will be unnecessary once JMdict
|
||||
// includes sequence numbers in its cross-reference data
|
||||
meta.MakeReferenceToSeqMap()
|
||||
|
||||
for _, senseCount := range meta.seqToSenseCount {
|
||||
if meta.maxSenseCount < senseCount {
|
||||
meta.maxSenseCount = senseCount
|
||||
}
|
||||
}
|
||||
|
||||
return meta
|
||||
}
|
170
jmdict_references.go
Normal file
170
jmdict_references.go
Normal file
@ -0,0 +1,170 @@
|
||||
package yomichan
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
/*
|
||||
* In the future, JMdict will be updated to include sequence numbers
|
||||
* with each cross reference. At that time, most of the functions and
|
||||
* types defined in this file will become unnecessary. see:
|
||||
* https://www.edrdg.org/jmdict_edict_list/2022/msg00008.html
|
||||
*/
|
||||
|
||||
type searchValue struct {
|
||||
sequence sequence
|
||||
index int
|
||||
isPriority bool
|
||||
}
|
||||
|
||||
type searchHash struct {
|
||||
hash hash
|
||||
isPriority bool
|
||||
}
|
||||
|
||||
func parseReference(reference string) (headword, int, bool) {
|
||||
// Reference strings in JMDict currently consist of 3 parts at
|
||||
// most, separated by ・ characters. The latter two parts are
|
||||
// optional. When the sense number is not specified, it is
|
||||
// implied to be the first sense.
|
||||
var h headword
|
||||
var senseNumber int
|
||||
ok := true
|
||||
refParts := strings.Split(reference, "・")
|
||||
if len(refParts) == 1 {
|
||||
// (Kanji) or (Reading)
|
||||
h = headword{Expression: refParts[0], Reading: refParts[0]}
|
||||
senseNumber = 1
|
||||
} else if len(refParts) == 2 {
|
||||
// [Kanji + (Reading or Sense)] or (Reading + Sense)
|
||||
val, err := strconv.Atoi(refParts[1])
|
||||
if err == nil {
|
||||
h = headword{Expression: refParts[0], Reading: refParts[0]}
|
||||
senseNumber = val
|
||||
} else {
|
||||
h = headword{Expression: refParts[0], Reading: refParts[1]}
|
||||
senseNumber = 1
|
||||
}
|
||||
} else if len(refParts) == 3 {
|
||||
// Expression + Reading + Sense
|
||||
h = headword{Expression: refParts[0], Reading: refParts[1]}
|
||||
val, err := strconv.Atoi(strings.TrimSpace(refParts[2]))
|
||||
if err == nil {
|
||||
senseNumber = val
|
||||
} else {
|
||||
errortext := "Unexpected format (3rd part not integer) for x-ref \"" + reference + "\""
|
||||
fmt.Println(errortext)
|
||||
ok = false
|
||||
}
|
||||
} else {
|
||||
errortext := "Unexpected format for x-ref \"" + reference + "\""
|
||||
fmt.Println(errortext)
|
||||
ok = false
|
||||
}
|
||||
return h, senseNumber, ok
|
||||
}
|
||||
|
||||
func (meta *jmdictMetadata) MakeReferenceToSeqMap() {
|
||||
|
||||
meta.referenceToSeq = make(map[string]sequence)
|
||||
meta.MakeHashToSearchValuesMap()
|
||||
|
||||
for _, reference := range meta.references {
|
||||
if meta.referenceToSeq[reference] != 0 {
|
||||
continue
|
||||
}
|
||||
seq := meta.FindBestSequence(reference)
|
||||
if seq != 0 {
|
||||
meta.referenceToSeq[reference] = seq
|
||||
} else {
|
||||
fmt.Println("Unable to convert reference to sequence number: `" + reference + "`")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (meta *jmdictMetadata) MakeHashToSearchValuesMap() {
|
||||
meta.hashToSearchValues = make(map[hash][]searchValue)
|
||||
for seq, searchHashes := range meta.seqToSearchHashes {
|
||||
for idx, searchHash := range searchHashes {
|
||||
searchValue := searchValue{
|
||||
sequence: seq,
|
||||
index: idx,
|
||||
isPriority: searchHash.isPriority,
|
||||
}
|
||||
meta.hashToSearchValues[searchHash.hash] =
|
||||
append(meta.hashToSearchValues[searchHash.hash], searchValue)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This function attempts to convert a JMdict reference string into a
|
||||
* single definite sequence number. These reference strings are often
|
||||
* ambiguous, so we have to resort to using heuristics.
|
||||
*
|
||||
* Generally, correspondence is determined by the order in which term
|
||||
* pairs are extracted from each JMdict entry. Take for example the
|
||||
* JMdict entry for ご本, which contains a reference to 本 (without a
|
||||
* reading specified). To correlate this reference with a sequence
|
||||
* number, our program searches each entry for the hash of【本・本】.
|
||||
* There are two entries in which it is found in JMdict (English):
|
||||
*
|
||||
* sequence 1260670: 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】
|
||||
* sequence 1522150: 【本・ほん】、【本・本】、【ほん・ほん】
|
||||
*
|
||||
* Because 【本・本】 is closer to the beginning of the array in the
|
||||
* latter (i.e., has the lowest index), sequence number 1522150 is
|
||||
* returned.
|
||||
*
|
||||
* In situations in which multiple sequences are found with the same
|
||||
* index, the entry with a priority tag ("news1", "ichi1", "spec1",
|
||||
* "spec2", "gai1") is given preference. This mostly affects
|
||||
* katakana-only loanwords like ラグ.
|
||||
*
|
||||
* To improve accuracy, this method also checks to see if the
|
||||
* reference's specified sense number really exists in the
|
||||
* corresponding entry. For example, sequence 1582850 【如何で・いかんで】
|
||||
* has a reference to sense #2 of いかん (no kanji specified), which
|
||||
* could belong to 13 different sequences. However, sequences 1582850
|
||||
* and 2829697 are the only 2 of those 13 which contain more than one
|
||||
* sense. Incidentally, sequence 1582850 is the correct match.
|
||||
*
|
||||
* All else being equal, the entry with the smallest sequence number
|
||||
* is chosen. References in the JMdict file are currently ambiguous,
|
||||
* and getting this perfect won't be possible until reference sequence
|
||||
* numbers are included in the file. See:
|
||||
* https://github.com/JMdictProject/JMdictIssues/issues/61
|
||||
*/
|
||||
func (meta *jmdictMetadata) FindBestSequence(reference string) sequence {
|
||||
bestSeq := 0
|
||||
lowestIndex := 100000
|
||||
bestIsPriority := false
|
||||
headword, senseNumber, ok := parseReference(reference)
|
||||
if !ok {
|
||||
return bestSeq
|
||||
}
|
||||
hash := headword.Hash()
|
||||
for _, v := range meta.hashToSearchValues[hash] {
|
||||
if meta.seqToSenseCount[v.sequence] < senseNumber {
|
||||
// entry must contain the specified sense
|
||||
continue
|
||||
} else if lowestIndex < v.index {
|
||||
// lower indices are better
|
||||
continue
|
||||
} else if (lowestIndex == v.index) && (bestIsPriority && !v.isPriority) {
|
||||
// if indices match, check priority
|
||||
continue
|
||||
} else if (lowestIndex == v.index) && (bestIsPriority == v.isPriority) && (bestSeq < v.sequence) {
|
||||
// if indices and priority match, check sequence number.
|
||||
// lower sequence numbers are better
|
||||
continue
|
||||
} else {
|
||||
lowestIndex = v.index
|
||||
bestSeq = v.sequence
|
||||
bestIsPriority = v.isPriority
|
||||
}
|
||||
}
|
||||
return bestSeq
|
||||
}
|
348
jmdict_tags.go
Normal file
348
jmdict_tags.go
Normal file
@ -0,0 +1,348 @@
|
||||
package yomichan
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
func senseNumberTags(maxSenseCount int) []dbTag {
|
||||
tags := []dbTag{}
|
||||
for i := 1; i <= maxSenseCount; i++ {
|
||||
tag := dbTag{
|
||||
Name: strconv.Itoa(i),
|
||||
Order: -10, // these tags will appear on the left side
|
||||
Notes: "JMdict Sense #" + strconv.Itoa(i),
|
||||
}
|
||||
tags = append(tags, tag)
|
||||
}
|
||||
return tags
|
||||
}
|
||||
|
||||
func newsFrequencyTags() []dbTag {
|
||||
// 24,000 ranks divided into 24 tags, news1k ... news24k
|
||||
tags := []dbTag{}
|
||||
for i := 1; i <= 24; i++ {
|
||||
tagName := "news" + strconv.Itoa(i) + "k"
|
||||
var startRank string
|
||||
if i == 1 {
|
||||
startRank = "1"
|
||||
} else {
|
||||
// technically should be ",001", but that looks odd
|
||||
startRank = strconv.Itoa(i-1) + ",000"
|
||||
}
|
||||
endRank := strconv.Itoa(i) + ",000"
|
||||
tag := dbTag{
|
||||
Name: tagName,
|
||||
Order: -2,
|
||||
Score: 0,
|
||||
Category: "frequent",
|
||||
Notes: "ranked between the top " + startRank + " and " + endRank + " words in a frequency analysis of the Mainichi Shimbun (1990s)",
|
||||
}
|
||||
tags = append(tags, tag)
|
||||
}
|
||||
return tags
|
||||
}
|
||||
|
||||
func entityTags(entities map[string]string) []dbTag {
|
||||
tags := knownEntityTags()
|
||||
for name, notes := range entities {
|
||||
idx := slices.IndexFunc(tags, func(t dbTag) bool { return t.Name == name })
|
||||
if idx != -1 {
|
||||
tags[idx].Notes = notes
|
||||
} else {
|
||||
fmt.Println("Unknown tag type \"" + name + "\": " + notes)
|
||||
unknownTag := dbTag{Name: name, Notes: notes}
|
||||
tags = append(tags, unknownTag)
|
||||
}
|
||||
}
|
||||
return tags
|
||||
}
|
||||
|
||||
func customDbTags() []dbTag {
|
||||
return []dbTag{
|
||||
dbTag{Name: priorityTagName, Order: -10, Score: 10, Category: "popular", Notes: "high priority term"},
|
||||
dbTag{Name: rareKanjiTagName, Order: 0, Score: -5, Category: "archaism", Notes: "rarely-used kanji form of this expression"},
|
||||
dbTag{Name: irregularTagName, Order: 0, Score: -5, Category: "archaism", Notes: "irregular form of this expression"},
|
||||
dbTag{Name: outdatedTagName, Order: 0, Score: -5, Category: "archaism", Notes: "outdated form of this expression"},
|
||||
dbTag{Name: "ichi", Order: -2, Score: 0, Category: "frequent", Notes: "included in Ichimango Goi Bunruishuu (1万語語彙分類集)"},
|
||||
dbTag{Name: "spec", Order: -2, Score: 0, Category: "frequent", Notes: "specified as common by JMdict editors"},
|
||||
dbTag{Name: "gai", Order: -2, Score: 0, Category: "frequent", Notes: "common loanword (gairaigo・外来語)"},
|
||||
dbTag{Name: "forms", Order: 0, Score: 0, Category: "", Notes: "other surface forms and readings"},
|
||||
}
|
||||
}
|
||||
|
||||
func knownEntityTags() []dbTag {
|
||||
return []dbTag{
|
||||
// see: https://www.edrdg.org/jmdictdb/cgi-bin/edhelp.py?svc=jmdict&sid=#kwabbr
|
||||
// additional descriptions at the beginning of the JMdict file
|
||||
|
||||
// <re_inf> reading info
|
||||
dbTag{Name: "gikun", Order: 0, Score: 0, Category: ""}, // gikun (meaning as reading) or jukujikun (special kanji reading)
|
||||
dbTag{Name: "ik", Order: 0, Score: -5, Category: ""}, // word containing irregular kana usage
|
||||
dbTag{Name: "ok", Order: 0, Score: -5, Category: ""}, // out-dated or obsolete kana usage
|
||||
dbTag{Name: "sk", Order: 0, Score: -5, Category: ""}, // search-only kana form
|
||||
|
||||
// <ke_inf> kanji info
|
||||
/* kanji info also has a "ik" entity that would go here if not already for the re_inf tag */
|
||||
dbTag{Name: "ateji", Order: 0, Score: 0, Category: ""}, // ateji (phonetic) reading
|
||||
dbTag{Name: "iK", Order: 0, Score: -5, Category: ""}, // word containing irregular kanji usage
|
||||
dbTag{Name: "io", Order: 0, Score: -5, Category: ""}, // irregular okurigana usage
|
||||
dbTag{Name: "oK", Order: 0, Score: -5, Category: ""}, // word containing out-dated kanji or kanji usage
|
||||
dbTag{Name: "rK", Order: 0, Score: -5, Category: ""}, // rarely-used kanji form
|
||||
dbTag{Name: "sK", Order: 0, Score: -5, Category: ""}, // search-only kanji form
|
||||
|
||||
// <misc> miscellaneous sense info
|
||||
dbTag{Name: "abbr", Order: 0, Score: 0, Category: ""}, // abbreviation
|
||||
dbTag{Name: "arch", Order: -4, Score: 0, Category: "archaism"}, // archaism
|
||||
dbTag{Name: "char", Order: 0, Score: 0, Category: ""}, // character
|
||||
dbTag{Name: "chn", Order: 0, Score: 0, Category: ""}, // children's language
|
||||
dbTag{Name: "col", Order: 0, Score: 0, Category: ""}, // colloquialism
|
||||
dbTag{Name: "company", Order: 0, Score: 0, Category: ""}, // company name
|
||||
dbTag{Name: "creat", Order: 0, Score: 0, Category: ""}, // creature
|
||||
dbTag{Name: "dated", Order: -4, Score: 0, Category: "archaism"}, // dated term
|
||||
dbTag{Name: "dei", Order: 0, Score: 0, Category: ""}, // deity
|
||||
dbTag{Name: "derog", Order: 0, Score: 0, Category: ""}, // derogatory
|
||||
dbTag{Name: "doc", Order: 0, Score: 0, Category: ""}, // document
|
||||
dbTag{Name: "euph", Order: 0, Score: 0, Category: ""}, // euphemistic
|
||||
dbTag{Name: "ev", Order: 0, Score: 0, Category: ""}, // event
|
||||
dbTag{Name: "fam", Order: 0, Score: 0, Category: ""}, // familiar language
|
||||
dbTag{Name: "fem", Order: 0, Score: 0, Category: ""}, // female term or language
|
||||
dbTag{Name: "fict", Order: 0, Score: 0, Category: ""}, // fiction
|
||||
dbTag{Name: "form", Order: 0, Score: 0, Category: ""}, // formal or literary term
|
||||
dbTag{Name: "given", Order: 0, Score: 0, Category: ""}, // given name or forename, gender not specified
|
||||
dbTag{Name: "group", Order: 0, Score: 0, Category: ""}, // group
|
||||
dbTag{Name: "hist", Order: 0, Score: 0, Category: ""}, // historical term
|
||||
dbTag{Name: "hon", Order: 0, Score: 0, Category: ""}, // honorific or respectful (sonkeigo) language
|
||||
dbTag{Name: "hum", Order: 0, Score: 0, Category: ""}, // humble (kenjougo) language
|
||||
dbTag{Name: "id", Order: -5, Score: 0, Category: "expression"}, // idiomatic expression
|
||||
dbTag{Name: "joc", Order: 0, Score: 0, Category: ""}, // jocular, humorous term
|
||||
dbTag{Name: "leg", Order: 0, Score: 0, Category: ""}, // legend
|
||||
dbTag{Name: "m-sl", Order: 0, Score: 0, Category: ""}, // manga slang
|
||||
dbTag{Name: "male", Order: 0, Score: 0, Category: ""}, // male term or language
|
||||
dbTag{Name: "myth", Order: 0, Score: 0, Category: ""}, // mythology
|
||||
dbTag{Name: "net-sl", Order: 0, Score: 0, Category: ""}, // Internet slang
|
||||
dbTag{Name: "obj", Order: 0, Score: 0, Category: ""}, // object
|
||||
dbTag{Name: "obs", Order: -4, Score: 0, Category: "archaism"}, // obsolete term
|
||||
dbTag{Name: "on-mim", Order: 0, Score: 0, Category: ""}, // onomatopoeic or mimetic word
|
||||
dbTag{Name: "organization", Order: 0, Score: 0, Category: ""}, // organization name
|
||||
dbTag{Name: "oth", Order: 0, Score: 0, Category: ""}, // other
|
||||
dbTag{Name: "person", Order: 0, Score: 0, Category: ""}, // full name of a particular person
|
||||
dbTag{Name: "place", Order: 0, Score: 0, Category: ""}, // place name
|
||||
dbTag{Name: "poet", Order: 0, Score: 0, Category: ""}, // poetical term
|
||||
dbTag{Name: "pol", Order: 0, Score: 0, Category: ""}, // polite (teineigo) language
|
||||
dbTag{Name: "product", Order: 0, Score: 0, Category: ""}, // product name
|
||||
dbTag{Name: "proverb", Order: 0, Score: 0, Category: "expression"}, // proverb
|
||||
dbTag{Name: "quote", Order: 0, Score: 0, Category: "expression"}, // quotation
|
||||
dbTag{Name: "rare", Order: -4, Score: 0, Category: "archaism"}, // rare
|
||||
dbTag{Name: "relig", Order: 0, Score: 0, Category: ""}, // religion
|
||||
dbTag{Name: "sens", Order: 0, Score: 0, Category: ""}, // sensitive
|
||||
dbTag{Name: "serv", Order: 0, Score: 0, Category: ""}, // service
|
||||
dbTag{Name: "ship", Order: 0, Score: 0, Category: ""}, // ship name
|
||||
dbTag{Name: "sl", Order: 0, Score: 0, Category: ""}, // slang
|
||||
dbTag{Name: "station", Order: 0, Score: 0, Category: ""}, // railway station
|
||||
dbTag{Name: "surname", Order: 0, Score: 0, Category: ""}, // family or surname
|
||||
dbTag{Name: "uk", Order: 0, Score: 0, Category: ""}, // word usually written using kana alone
|
||||
dbTag{Name: "unclass", Order: 0, Score: 0, Category: ""}, // unclassified name
|
||||
dbTag{Name: "vulg", Order: 0, Score: 0, Category: ""}, // vulgar expression or word
|
||||
dbTag{Name: "work", Order: 0, Score: 0, Category: ""}, // work of art, literature, music, etc. name
|
||||
dbTag{Name: "X", Order: 0, Score: 0, Category: ""}, // rude or X-rated term (not displayed in educational software)
|
||||
dbTag{Name: "yoji", Order: 0, Score: 0, Category: ""}, // yojijukugo
|
||||
|
||||
// <pos> part-of-speech info
|
||||
dbTag{Name: "adj-f", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun or verb acting prenominally
|
||||
dbTag{Name: "adj-i", Order: -3, Score: 0, Category: "partOfSpeech"}, // adjective (keiyoushi)
|
||||
dbTag{Name: "adj-ix", Order: -3, Score: 0, Category: "partOfSpeech"}, // adjective (keiyoushi) - yoi/ii class
|
||||
dbTag{Name: "adj-kari", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'kari' adjective (archaic)
|
||||
dbTag{Name: "adj-ku", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'ku' adjective (archaic)
|
||||
dbTag{Name: "adj-na", Order: -3, Score: 0, Category: "partOfSpeech"}, // adjectival nouns or quasi-adjectives (keiyodoshi)
|
||||
dbTag{Name: "adj-nari", Order: -3, Score: 0, Category: "partOfSpeech"}, // archaic/formal form of na-adjective
|
||||
dbTag{Name: "adj-no", Order: -3, Score: 0, Category: "partOfSpeech"}, // nouns which may take the genitive case particle 'no'
|
||||
dbTag{Name: "adj-pn", Order: -3, Score: 0, Category: "partOfSpeech"}, // pre-noun adjectival (rentaishi)
|
||||
dbTag{Name: "adj-shiku", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'shiku' adjective (archaic)
|
||||
dbTag{Name: "adj-t", Order: -3, Score: 0, Category: "partOfSpeech"}, // 'taru' adjective
|
||||
dbTag{Name: "adv", Order: -3, Score: 0, Category: "partOfSpeech"}, // adverb (fukushi)
|
||||
dbTag{Name: "adv-to", Order: -3, Score: 0, Category: "partOfSpeech"}, // adverb taking the 'to' particle
|
||||
dbTag{Name: "aux", Order: -3, Score: 0, Category: "partOfSpeech"}, // auxiliary
|
||||
dbTag{Name: "aux-adj", Order: -3, Score: 0, Category: "partOfSpeech"}, // auxiliary adjective
|
||||
dbTag{Name: "aux-v", Order: -3, Score: 0, Category: "partOfSpeech"}, // auxiliary verb
|
||||
dbTag{Name: "conj", Order: -3, Score: 0, Category: "partOfSpeech"}, // conjunction
|
||||
dbTag{Name: "cop", Order: -3, Score: 0, Category: "partOfSpeech"}, // copula
|
||||
dbTag{Name: "ctr", Order: -3, Score: 0, Category: "partOfSpeech"}, // counter
|
||||
dbTag{Name: "exp", Order: -5, Score: 0, Category: "expression"}, // expressions (phrases, clauses, etc.)
|
||||
dbTag{Name: "int", Order: -3, Score: 0, Category: "partOfSpeech"}, // interjection (kandoushi)
|
||||
dbTag{Name: "n", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun (common) (futsuumeishi)
|
||||
dbTag{Name: "n-adv", Order: -3, Score: 0, Category: "partOfSpeech"}, // adverbial noun (fukushitekimeishi)
|
||||
dbTag{Name: "n-pr", Order: -3, Score: 0, Category: "partOfSpeech"}, // proper noun
|
||||
dbTag{Name: "n-pref", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun, used as a prefix
|
||||
dbTag{Name: "n-suf", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun, used as a suffix
|
||||
dbTag{Name: "n-t", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun (temporal) (jisoumeishi)
|
||||
dbTag{Name: "num", Order: -3, Score: 0, Category: "partOfSpeech"}, // numeric
|
||||
dbTag{Name: "pn", Order: -3, Score: 0, Category: "partOfSpeech"}, // pronoun
|
||||
dbTag{Name: "pref", Order: -3, Score: 0, Category: "partOfSpeech"}, // prefix
|
||||
dbTag{Name: "prt", Order: -3, Score: 0, Category: "partOfSpeech"}, // particle
|
||||
dbTag{Name: "suf", Order: -3, Score: 0, Category: "partOfSpeech"}, // suffix
|
||||
dbTag{Name: "unc", Order: -3, Score: 0, Category: "partOfSpeech"}, // unclassified
|
||||
dbTag{Name: "v-unspec", Order: -3, Score: 0, Category: "partOfSpeech"}, // verb unspecified
|
||||
dbTag{Name: "v1", Order: -3, Score: 0, Category: "partOfSpeech"}, // Ichidan verb
|
||||
dbTag{Name: "v1-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Ichidan verb - kureru special class
|
||||
dbTag{Name: "v2a-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb with 'u' ending (archaic)
|
||||
dbTag{Name: "v2b-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'bu' ending (archaic)
|
||||
dbTag{Name: "v2b-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'bu' ending (archaic)
|
||||
dbTag{Name: "v2d-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'dzu' ending (archaic)
|
||||
dbTag{Name: "v2d-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'dzu' ending (archaic)
|
||||
dbTag{Name: "v2g-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'gu' ending (archaic)
|
||||
dbTag{Name: "v2g-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'gu' ending (archaic)
|
||||
dbTag{Name: "v2h-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'hu/fu' ending (archaic)
|
||||
dbTag{Name: "v2h-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'hu/fu' ending (archaic)
|
||||
dbTag{Name: "v2k-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'ku' ending (archaic)
|
||||
dbTag{Name: "v2k-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'ku' ending (archaic)
|
||||
dbTag{Name: "v2m-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'mu' ending (archaic)
|
||||
dbTag{Name: "v2m-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'mu' ending (archaic)
|
||||
dbTag{Name: "v2n-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'nu' ending (archaic)
|
||||
dbTag{Name: "v2r-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'ru' ending (archaic)
|
||||
dbTag{Name: "v2r-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'ru' ending (archaic)
|
||||
dbTag{Name: "v2s-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'su' ending (archaic)
|
||||
dbTag{Name: "v2t-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'tsu' ending (archaic)
|
||||
dbTag{Name: "v2t-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'tsu' ending (archaic)
|
||||
dbTag{Name: "v2w-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'u' ending and 'we' conjugation (archaic)
|
||||
dbTag{Name: "v2y-k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (upper class) with 'yu' ending (archaic)
|
||||
dbTag{Name: "v2y-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'yu' ending (archaic)
|
||||
dbTag{Name: "v2z-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Nidan verb (lower class) with 'zu' ending (archaic)
|
||||
dbTag{Name: "v4b", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'bu' ending (archaic)
|
||||
dbTag{Name: "v4g", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'gu' ending (archaic)
|
||||
dbTag{Name: "v4h", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'hu/fu' ending (archaic)
|
||||
dbTag{Name: "v4k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'ku' ending (archaic)
|
||||
dbTag{Name: "v4m", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'mu' ending (archaic)
|
||||
dbTag{Name: "v4n", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'nu' ending (archaic)
|
||||
dbTag{Name: "v4r", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'ru' ending (archaic)
|
||||
dbTag{Name: "v4s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'su' ending (archaic)
|
||||
dbTag{Name: "v4t", Order: -3, Score: 0, Category: "partOfSpeech"}, // Yodan verb with 'tsu' ending (archaic)
|
||||
dbTag{Name: "v5aru", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb - -aru special class
|
||||
dbTag{Name: "v5b", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'bu' ending
|
||||
dbTag{Name: "v5g", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'gu' ending
|
||||
dbTag{Name: "v5k", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'ku' ending
|
||||
dbTag{Name: "v5k-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb - Iku/Yuku special class
|
||||
dbTag{Name: "v5m", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'mu' ending
|
||||
dbTag{Name: "v5n", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'nu' ending
|
||||
dbTag{Name: "v5r", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'ru' ending
|
||||
dbTag{Name: "v5r-i", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'ru' ending (irregular verb)
|
||||
dbTag{Name: "v5s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'su' ending
|
||||
dbTag{Name: "v5t", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'tsu' ending
|
||||
dbTag{Name: "v5u", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'u' ending
|
||||
dbTag{Name: "v5u-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb with 'u' ending (special class)
|
||||
dbTag{Name: "v5uru", Order: -3, Score: 0, Category: "partOfSpeech"}, // Godan verb - Uru old class verb (old form of Eru)
|
||||
dbTag{Name: "vi", Order: -3, Score: 0, Category: "partOfSpeech"}, // intransitive verb
|
||||
dbTag{Name: "vk", Order: -3, Score: 0, Category: "partOfSpeech"}, // Kuru verb - special class
|
||||
dbTag{Name: "vn", Order: -3, Score: 0, Category: "partOfSpeech"}, // irregular nu verb
|
||||
dbTag{Name: "vr", Order: -3, Score: 0, Category: "partOfSpeech"}, // irregular ru verb, plain form ends with -ri
|
||||
dbTag{Name: "vs", Order: -3, Score: 0, Category: "partOfSpeech"}, // noun or participle which takes the aux. verb suru
|
||||
dbTag{Name: "vs-c", Order: -3, Score: 0, Category: "partOfSpeech"}, // su verb - precursor to the modern suru
|
||||
dbTag{Name: "vs-i", Order: -3, Score: 0, Category: "partOfSpeech"}, // suru verb - included
|
||||
dbTag{Name: "vs-s", Order: -3, Score: 0, Category: "partOfSpeech"}, // suru verb - special class
|
||||
dbTag{Name: "vt", Order: -3, Score: 0, Category: "partOfSpeech"}, // transitive verb
|
||||
dbTag{Name: "vz", Order: -3, Score: 0, Category: "partOfSpeech"}, // Ichidan verb - zuru verb (alternative form of -jiru verbs)
|
||||
|
||||
// <field> usage domain
|
||||
dbTag{Name: "agric", Order: 0, Score: 0, Category: ""}, // agriculture
|
||||
dbTag{Name: "anat", Order: 0, Score: 0, Category: ""}, // anatomy
|
||||
dbTag{Name: "archeol", Order: 0, Score: 0, Category: ""}, // archeology
|
||||
dbTag{Name: "archit", Order: 0, Score: 0, Category: ""}, // architecture
|
||||
dbTag{Name: "art", Order: 0, Score: 0, Category: ""}, // art, aesthetics
|
||||
dbTag{Name: "astron", Order: 0, Score: 0, Category: ""}, // astronomy
|
||||
dbTag{Name: "audvid", Order: 0, Score: 0, Category: ""}, // audiovisual
|
||||
dbTag{Name: "aviat", Order: 0, Score: 0, Category: ""}, // aviation
|
||||
dbTag{Name: "baseb", Order: 0, Score: 0, Category: ""}, // baseball
|
||||
dbTag{Name: "biochem", Order: 0, Score: 0, Category: ""}, // biochemistry
|
||||
dbTag{Name: "biol", Order: 0, Score: 0, Category: ""}, // biology
|
||||
dbTag{Name: "bot", Order: 0, Score: 0, Category: ""}, // botany
|
||||
dbTag{Name: "Buddh", Order: 0, Score: 0, Category: ""}, // Buddhism
|
||||
dbTag{Name: "bus", Order: 0, Score: 0, Category: ""}, // business
|
||||
dbTag{Name: "cards", Order: 0, Score: 0, Category: ""}, // card games
|
||||
dbTag{Name: "chem", Order: 0, Score: 0, Category: ""}, // chemistry
|
||||
dbTag{Name: "Christn", Order: 0, Score: 0, Category: ""}, // Christianity
|
||||
dbTag{Name: "cloth", Order: 0, Score: 0, Category: ""}, // clothing
|
||||
dbTag{Name: "comp", Order: 0, Score: 0, Category: ""}, // computing
|
||||
dbTag{Name: "cryst", Order: 0, Score: 0, Category: ""}, // crystallography
|
||||
dbTag{Name: "dent", Order: 0, Score: 0, Category: ""}, // dentistry
|
||||
dbTag{Name: "ecol", Order: 0, Score: 0, Category: ""}, // ecology
|
||||
dbTag{Name: "econ", Order: 0, Score: 0, Category: ""}, // economics
|
||||
dbTag{Name: "elec", Order: 0, Score: 0, Category: ""}, // electricity, elec. eng.
|
||||
dbTag{Name: "electr", Order: 0, Score: 0, Category: ""}, // electronics
|
||||
dbTag{Name: "embryo", Order: 0, Score: 0, Category: ""}, // embryology
|
||||
dbTag{Name: "engr", Order: 0, Score: 0, Category: ""}, // engineering
|
||||
dbTag{Name: "ent", Order: 0, Score: 0, Category: ""}, // entomology
|
||||
dbTag{Name: "film", Order: 0, Score: 0, Category: ""}, // film
|
||||
dbTag{Name: "finc", Order: 0, Score: 0, Category: ""}, // finance
|
||||
dbTag{Name: "fish", Order: 0, Score: 0, Category: ""}, // fishing
|
||||
dbTag{Name: "food", Order: 0, Score: 0, Category: ""}, // food, cooking
|
||||
dbTag{Name: "gardn", Order: 0, Score: 0, Category: ""}, // gardening, horticulture
|
||||
dbTag{Name: "genet", Order: 0, Score: 0, Category: ""}, // genetics
|
||||
dbTag{Name: "geogr", Order: 0, Score: 0, Category: ""}, // geography
|
||||
dbTag{Name: "geol", Order: 0, Score: 0, Category: ""}, // geology
|
||||
dbTag{Name: "geom", Order: 0, Score: 0, Category: ""}, // geometry
|
||||
dbTag{Name: "go", Order: 0, Score: 0, Category: ""}, // go (game)
|
||||
dbTag{Name: "golf", Order: 0, Score: 0, Category: ""}, // golf
|
||||
dbTag{Name: "gramm", Order: 0, Score: 0, Category: ""}, // grammar
|
||||
dbTag{Name: "grmyth", Order: 0, Score: 0, Category: ""}, // Greek mythology
|
||||
dbTag{Name: "hanaf", Order: 0, Score: 0, Category: ""}, // hanafuda
|
||||
dbTag{Name: "horse", Order: 0, Score: 0, Category: ""}, // horse racing
|
||||
dbTag{Name: "kabuki", Order: 0, Score: 0, Category: ""}, // kabuki
|
||||
dbTag{Name: "law", Order: 0, Score: 0, Category: ""}, // law
|
||||
dbTag{Name: "ling", Order: 0, Score: 0, Category: ""}, // linguistics
|
||||
dbTag{Name: "logic", Order: 0, Score: 0, Category: ""}, // logic
|
||||
dbTag{Name: "MA", Order: 0, Score: 0, Category: ""}, // martial arts
|
||||
dbTag{Name: "mahj", Order: 0, Score: 0, Category: ""}, // mahjong
|
||||
dbTag{Name: "manga", Order: 0, Score: 0, Category: ""}, // manga
|
||||
dbTag{Name: "math", Order: 0, Score: 0, Category: ""}, // mathematics
|
||||
dbTag{Name: "mech", Order: 0, Score: 0, Category: ""}, // mechanical engineering
|
||||
dbTag{Name: "med", Order: 0, Score: 0, Category: ""}, // medicine
|
||||
dbTag{Name: "met", Order: 0, Score: 0, Category: ""}, // meteorology
|
||||
dbTag{Name: "mil", Order: 0, Score: 0, Category: ""}, // military
|
||||
dbTag{Name: "mining", Order: 0, Score: 0, Category: ""}, // mining
|
||||
dbTag{Name: "music", Order: 0, Score: 0, Category: ""}, // music
|
||||
dbTag{Name: "noh", Order: 0, Score: 0, Category: ""}, // noh
|
||||
dbTag{Name: "ornith", Order: 0, Score: 0, Category: ""}, // ornithology
|
||||
dbTag{Name: "paleo", Order: 0, Score: 0, Category: ""}, // paleontology
|
||||
dbTag{Name: "pathol", Order: 0, Score: 0, Category: ""}, // pathology
|
||||
dbTag{Name: "pharm", Order: 0, Score: 0, Category: ""}, // pharmacy
|
||||
dbTag{Name: "phil", Order: 0, Score: 0, Category: ""}, // philosophy
|
||||
dbTag{Name: "photo", Order: 0, Score: 0, Category: ""}, // photography
|
||||
dbTag{Name: "physics", Order: 0, Score: 0, Category: ""}, // physics
|
||||
dbTag{Name: "physiol", Order: 0, Score: 0, Category: ""}, // physiology
|
||||
dbTag{Name: "politics", Order: 0, Score: 0, Category: ""}, // politics
|
||||
dbTag{Name: "print", Order: 0, Score: 0, Category: ""}, // printing
|
||||
dbTag{Name: "psy", Order: 0, Score: 0, Category: ""}, // psychiatry
|
||||
dbTag{Name: "psyanal", Order: 0, Score: 0, Category: ""}, // psychoanalysis
|
||||
dbTag{Name: "psych", Order: 0, Score: 0, Category: ""}, // psychology
|
||||
dbTag{Name: "rail", Order: 0, Score: 0, Category: ""}, // railway
|
||||
dbTag{Name: "rommyth", Order: 0, Score: 0, Category: ""}, // Roman mythology
|
||||
dbTag{Name: "Shinto", Order: 0, Score: 0, Category: ""}, // Shinto
|
||||
dbTag{Name: "shogi", Order: 0, Score: 0, Category: ""}, // shogi
|
||||
dbTag{Name: "ski", Order: 0, Score: 0, Category: ""}, // skiing
|
||||
dbTag{Name: "sports", Order: 0, Score: 0, Category: ""}, // sports
|
||||
dbTag{Name: "stat", Order: 0, Score: 0, Category: ""}, // statistics
|
||||
dbTag{Name: "stockm", Order: 0, Score: 0, Category: ""}, // stock market
|
||||
dbTag{Name: "sumo", Order: 0, Score: 0, Category: ""}, // sumo
|
||||
dbTag{Name: "telec", Order: 0, Score: 0, Category: ""}, // telecommunications
|
||||
dbTag{Name: "tradem", Order: 0, Score: 0, Category: ""}, // trademark
|
||||
dbTag{Name: "tv", Order: 0, Score: 0, Category: ""}, // television
|
||||
dbTag{Name: "vidg", Order: 0, Score: 0, Category: ""}, // video games
|
||||
dbTag{Name: "zool", Order: 0, Score: 0, Category: ""}, // zoology
|
||||
|
||||
// <dial> dialect
|
||||
dbTag{Name: "bra", Order: 0, Score: 0, Category: ""}, // Brazilian
|
||||
dbTag{Name: "hob", Order: 0, Score: 0, Category: ""}, // Hokkaido-ben
|
||||
dbTag{Name: "ksb", Order: 0, Score: 0, Category: ""}, // Kansai-ben
|
||||
dbTag{Name: "ktb", Order: 0, Score: 0, Category: ""}, // Kantou-ben
|
||||
dbTag{Name: "kyb", Order: 0, Score: 0, Category: ""}, // Kyoto-ben
|
||||
dbTag{Name: "kyu", Order: 0, Score: 0, Category: ""}, // Kyuushuu-ben
|
||||
dbTag{Name: "nab", Order: 0, Score: 0, Category: ""}, // Nagano-ben
|
||||
dbTag{Name: "osb", Order: 0, Score: 0, Category: ""}, // Osaka-ben
|
||||
dbTag{Name: "rkb", Order: 0, Score: 0, Category: ""}, // Ryuukyuu-ben
|
||||
dbTag{Name: "thb", Order: 0, Score: 0, Category: ""}, // Touhoku-ben
|
||||
dbTag{Name: "tsb", Order: 0, Score: 0, Category: ""}, // Tosa-ben
|
||||
dbTag{Name: "tsug", Order: 0, Score: 0, Category: ""}, // Tsugaru-ben
|
||||
}
|
||||
}
|
13
kanjidic.go
13
kanjidic.go
@ -7,8 +7,6 @@ import (
|
||||
"foosoft.net/projects/jmdict"
|
||||
)
|
||||
|
||||
const kanjidicRevision = "kanjidic2"
|
||||
|
||||
func kanjidicExtractKanji(entry jmdict.KanjidicCharacter, language string) *dbKanji {
|
||||
if entry.ReadingMeaning == nil {
|
||||
return nil
|
||||
@ -161,11 +159,16 @@ func kanjidicExportDb(inputPath, outputPath, language, title string, stride int,
|
||||
"tag": tags.crush(),
|
||||
}
|
||||
|
||||
index := dbIndex{
|
||||
Title: title,
|
||||
Revision: "kanjidic2",
|
||||
Sequenced: false,
|
||||
Attribution: edrdgAttribution,
|
||||
}
|
||||
|
||||
return writeDb(
|
||||
outputPath,
|
||||
title,
|
||||
kanjidicRevision,
|
||||
false,
|
||||
index,
|
||||
recordData,
|
||||
stride,
|
||||
pretty,
|
||||
|
@ -72,7 +72,7 @@ func (e *kotowazaExtractor) extractTerms(entry zig.BookEntry, sequence int) []db
|
||||
term := dbTerm{
|
||||
Expression: expression,
|
||||
Reading: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Glossary: []any{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
|
@ -75,7 +75,7 @@ func (e *koujienExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT
|
||||
for _, reading := range readings {
|
||||
term := dbTerm{
|
||||
Expression: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Glossary: []any{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
@ -89,7 +89,7 @@ func (e *koujienExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT
|
||||
term := dbTerm{
|
||||
Expression: expression,
|
||||
Reading: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Glossary: []any{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
|
@ -106,7 +106,7 @@ func (e *meikyouExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT
|
||||
for _, reading := range readings {
|
||||
term := dbTerm{
|
||||
Expression: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Glossary: []any{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
@ -120,7 +120,7 @@ func (e *meikyouExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbT
|
||||
term := dbTerm{
|
||||
Expression: expression,
|
||||
Reading: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Glossary: []any{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
|
12
rikai.go
12
rikai.go
@ -8,8 +8,6 @@ import (
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
const rikaiRevision = "rikai2"
|
||||
|
||||
type rikaiEntry struct {
|
||||
kanji string
|
||||
kana string
|
||||
@ -154,11 +152,15 @@ func rikaiExportDb(inputPath, outputPath, language, title string, stride int, pr
|
||||
"tag": tags.crush(),
|
||||
}
|
||||
|
||||
index := dbIndex{
|
||||
Title: title,
|
||||
Revision: "rikai2",
|
||||
Sequenced: true,
|
||||
}
|
||||
|
||||
return writeDb(
|
||||
outputPath,
|
||||
title,
|
||||
rikaiRevision,
|
||||
true,
|
||||
index,
|
||||
recordData,
|
||||
stride,
|
||||
pretty,
|
||||
|
@ -5,13 +5,24 @@ go get foosoft.net/projects/yomichan-import/yomichan
|
||||
mkdir -p src
|
||||
mkdir -p dst
|
||||
|
||||
if [ ! -f src/JMdict ]; then
|
||||
wget http://ftp.monash.edu/pub/nihongo/JMdict.gz
|
||||
gunzip -c JMdict.gz > src/JMdict
|
||||
function refresh_source () {
|
||||
NOW=$(date '+%s')
|
||||
YESTERDAY=$((NOW - 86400)) # 86,400 seconds in 24 hours
|
||||
if [ ! -f "src/$1" ]; then
|
||||
wget "ftp.edrdg.org/pub/Nihongo/$1.gz"
|
||||
gunzip -c "$1.gz" > "src/$1"
|
||||
elif [[ $YESTERDAY -gt $(date -r "src/$1" '+%s') ]]; then
|
||||
rsync "ftp.edrdg.org::nihongo/$1" "src/$1"
|
||||
fi
|
||||
}
|
||||
|
||||
yomichan -language="dutch" -title="JMdict (Dutch)" src/JMdict dst/jmdict_dutch.zip
|
||||
refresh_source "JMdict_e_examp"
|
||||
yomichan -language="english_extra" -title="JMdict" src/JMdict_e_examp dst/jmdict_english_extra_with_examples.zip
|
||||
|
||||
refresh_source "JMdict"
|
||||
yomichan -language="english_extra" -title="JMdict" src/JMdict dst/jmdict_english_extra.zip
|
||||
yomichan -language="english" -title="JMdict (English)" src/JMdict dst/jmdict_english.zip
|
||||
yomichan -language="dutch" -title="JMdict (Dutch)" src/JMdict dst/jmdict_dutch.zip
|
||||
yomichan -language="french" -title="JMdict (French)" src/JMdict dst/jmdict_french.zip
|
||||
yomichan -language="german" -title="JMdict (German)" src/JMdict dst/jmdict_german.zip
|
||||
yomichan -language="hungarian" -title="JMdict (Hungarian)" src/JMdict dst/jmdict_hungarian.zip
|
||||
@ -20,19 +31,13 @@ yomichan -language="slovenian" -title="JMdict (Slovenian)" src/JMdict dst/jmdict
|
||||
yomichan -language="spanish" -title="JMdict (Spanish)" src/JMdict dst/jmdict_spanish.zip
|
||||
yomichan -language="swedish" -title="JMdict (Swedish)" src/JMdict dst/jmdict_swedish.zip
|
||||
|
||||
if [ ! -f src/JMnedict.xml ]; then
|
||||
wget http://ftp.monash.edu/pub/nihongo/JMnedict.xml.gz
|
||||
gunzip -c JMnedict.xml.gz > src/JMnedict.xml
|
||||
fi
|
||||
yomichan -format="forms" -title="JMdict Forms" src/JMdict dst/jmdict_forms.zip
|
||||
|
||||
refresh_source "JMnedict.xml"
|
||||
yomichan src/JMnedict.xml dst/jmnedict.zip
|
||||
|
||||
if [ ! -f src/kanjidic2.xml ]; then
|
||||
wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz
|
||||
gunzip -c kanjidic2.xml.gz > src/kanjidic2.xml
|
||||
fi
|
||||
|
||||
yomichan -language="english" -title="KANJIDIC (English)" src/kanjidic2.xml dst/kanjidic_english.zip
|
||||
refresh_source "kanjidic2.xml"
|
||||
yomichan -language="english" -title="KANJIDIC" src/kanjidic2.xml dst/kanjidic_english.zip
|
||||
yomichan -language="french" -title="KANJIDIC (French)" src/kanjidic2.xml dst/kanjidic_french.zip
|
||||
yomichan -language="portuguese" -title="KANJIDIC (Portuguese)" src/kanjidic2.xml dst/kanjidic_portuguese.zip
|
||||
yomichan -language="spanish" -title="KANJIDIC (Spanish)" src/kanjidic2.xml dst/kanjidic_spanish.zip
|
||||
|
@ -93,7 +93,7 @@ func (e *shougakukan2Extractor) extractTerms(entry zig.BookEntry, sequence int)
|
||||
terms = append(terms, dbTerm{
|
||||
Expression: expression,
|
||||
Reading: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Glossary: []any{entry.Text},
|
||||
Sequence: sequence,
|
||||
})
|
||||
}
|
||||
|
192
structured_content.go
Normal file
192
structured_content.go
Normal file
@ -0,0 +1,192 @@
|
||||
package yomichan
|
||||
|
||||
// contentAttr collects the optional attributes and style properties that the
// content* builder functions copy onto a structured-content node. A field left
// at its zero value is treated as "not set" by those builders and is omitted
// from the generated node.
type contentAttr struct {
	lang               string
	fontStyle          string   // normal, italic
	fontWeight         string   // normal, bold
	fontSize           string   // small, medium, large, smaller, 80%, 125%, etc.
	textDecorationLine []string // underline, overline, line-through
	verticalAlign      string   // baseline, sub, super, text-top, text-bottom, middle, top, bottom
	textAlign          string   // start, end, left, right, center, justify, justify-all, match-parent
	marginTop          int
	marginLeft         int
	marginRight        int
	marginBottom       int
	listStyleType      string
	data               map[string]string
}
|
||||
|
||||
// contentReduce collapses a list of content items into the most compact
// equivalent value.
//
// Adjacent strings are concatenated:
//
//	["one", "two", node, "four"] -> ["onetwo", node, "four"]
//
// If the result consists of a single item, that item is returned on its own
// instead of a one-element slice:
//
//	["one", "two"] -> "onetwo"
//
// A slice holding exactly one item is returned as that item without any
// further processing. Strings that concatenate to "" are dropped, so an input
// with no non-empty content yields an empty (non-nil) slice.
func contentReduce(contents []any) any {
	if len(contents) == 1 {
		return contents[0]
	}

	// Deliberately non-nil: an empty result must stay an empty slice rather
	// than a nil one.
	newContents := []any{}
	var accumulator string
	for _, content := range contents {
		switch v := content.(type) {
		case string:
			accumulator += v
		default:
			// A non-string item ends the current run of strings; flush the
			// run first so the original ordering is preserved.
			if accumulator != "" {
				newContents = append(newContents, accumulator)
				accumulator = ""
			}
			newContents = append(newContents, content)
		}
	}
	if accumulator != "" {
		newContents = append(newContents, accumulator)
	}

	if len(newContents) == 1 {
		return newContents[0]
	}
	return newContents
}
|
||||
|
||||
func contentStructure(contents ...any) map[string]any {
|
||||
return map[string]any{
|
||||
"type": "structured-content",
|
||||
"content": contentReduce(contents),
|
||||
}
|
||||
}
|
||||
|
||||
func contentRuby(attr contentAttr, ruby string, contents ...any) map[string]any {
|
||||
rubyContent := map[string]any{
|
||||
"tag": "ruby",
|
||||
"content": []any{
|
||||
contentReduce(contents),
|
||||
map[string]string{"tag": "rp", "content": "("},
|
||||
map[string]string{"tag": "rt", "content": ruby},
|
||||
map[string]string{"tag": "rp", "content": ")"},
|
||||
},
|
||||
}
|
||||
if attr.lang != "" {
|
||||
rubyContent["lang"] = attr.lang
|
||||
}
|
||||
if len(attr.data) != 0 {
|
||||
rubyContent["data"] = attr.data
|
||||
}
|
||||
return rubyContent
|
||||
}
|
||||
|
||||
func contentInternalLink(attr contentAttr, query string, contents ...any) map[string]any {
|
||||
linkContent := map[string]any{
|
||||
"tag": "a",
|
||||
"href": "?query=" + query + "&wildcards=off",
|
||||
}
|
||||
if len(contents) == 0 {
|
||||
linkContent["content"] = query
|
||||
} else {
|
||||
linkContent["content"] = contentReduce(contents)
|
||||
}
|
||||
if attr.lang != "" {
|
||||
linkContent["lang"] = attr.lang
|
||||
}
|
||||
if len(attr.data) != 0 {
|
||||
linkContent["data"] = attr.data
|
||||
}
|
||||
return linkContent
|
||||
}
|
||||
|
||||
func contentSpan(attr contentAttr, contents ...any) map[string]any {
|
||||
return contentStyledContainer(attr, "span", contents...)
|
||||
}
|
||||
|
||||
func contentDiv(attr contentAttr, contents ...any) map[string]any {
|
||||
return contentStyledContainer(attr, "div", contents...)
|
||||
}
|
||||
|
||||
func contentListItem(attr contentAttr, contents ...any) map[string]any {
|
||||
return contentStyledContainer(attr, "li", contents...)
|
||||
}
|
||||
|
||||
func contentOrderedList(attr contentAttr, contents ...any) map[string]any {
|
||||
return contentStyledContainer(attr, "ol", contents...)
|
||||
}
|
||||
|
||||
func contentUnorderedList(attr contentAttr, contents ...any) map[string]any {
|
||||
return contentStyledContainer(attr, "ul", contents...)
|
||||
}
|
||||
|
||||
func contentTable(attr contentAttr, contents ...any) map[string]any {
|
||||
return contentStyledContainer(attr, "table", contents...)
|
||||
}
|
||||
|
||||
func contentTableHead(attr contentAttr, contents ...any) map[string]any {
|
||||
return contentStyledContainer(attr, "thead", contents...)
|
||||
}
|
||||
|
||||
func contentTableBody(attr contentAttr, contents ...any) map[string]any {
|
||||
return contentStyledContainer(attr, "tbody", contents...)
|
||||
}
|
||||
|
||||
func contentTableRow(attr contentAttr, contents ...any) map[string]any {
|
||||
return contentStyledContainer(attr, "tr", contents...)
|
||||
}
|
||||
|
||||
func contentTableHeadCell(attr contentAttr, contents ...any) map[string]any {
|
||||
return contentStyledContainer(attr, "th", contents...)
|
||||
}
|
||||
|
||||
func contentTableCell(attr contentAttr, contents ...any) map[string]any {
|
||||
return contentStyledContainer(attr, "td", contents...)
|
||||
}
|
||||
|
||||
func contentStyledContainer(attr contentAttr, tag string, contents ...any) map[string]any {
|
||||
container := map[string]any{"tag": tag}
|
||||
container["content"] = contentReduce(contents)
|
||||
if attr.lang != "" {
|
||||
container["lang"] = attr.lang
|
||||
}
|
||||
if len(attr.data) != 0 {
|
||||
container["data"] = attr.data
|
||||
}
|
||||
style := contentStyle(attr)
|
||||
if len(style) != 0 {
|
||||
container["style"] = style
|
||||
}
|
||||
return container
|
||||
}
|
||||
|
||||
func contentStyle(attr contentAttr) map[string]any {
|
||||
style := make(map[string]any)
|
||||
if attr.fontStyle != "" {
|
||||
style["fontStyle"] = attr.fontStyle
|
||||
}
|
||||
if attr.fontWeight != "" {
|
||||
style["fontWeight"] = attr.fontWeight
|
||||
}
|
||||
if attr.fontSize != "" {
|
||||
style["fontSize"] = attr.fontSize
|
||||
}
|
||||
if len(attr.textDecorationLine) != 0 {
|
||||
style["textDecorationLine"] = attr.textDecorationLine
|
||||
}
|
||||
if attr.verticalAlign != "" {
|
||||
style["verticalAlign"] = attr.verticalAlign
|
||||
}
|
||||
if attr.textAlign != "" {
|
||||
style["textAlign"] = attr.textAlign
|
||||
}
|
||||
if attr.marginTop != 0 {
|
||||
style["marginTop"] = attr.marginTop
|
||||
}
|
||||
if attr.marginLeft != 0 {
|
||||
style["marginLeft"] = attr.marginLeft
|
||||
}
|
||||
if attr.marginRight != 0 {
|
||||
style["marginRight"] = attr.marginRight
|
||||
}
|
||||
if attr.marginBottom != 0 {
|
||||
style["marginBottom"] = attr.marginBottom
|
||||
}
|
||||
if attr.listStyleType != "" {
|
||||
style["listStyleType"] = attr.listStyleType
|
||||
}
|
||||
return style
|
||||
}
|
Loading…
Reference in New Issue
Block a user