New JMnedict version
This commit is contained in:
parent
b826dbf264
commit
8281301869
118
enamdict.go
118
enamdict.go
@ -1,118 +0,0 @@
|
|||||||
package yomichan
|
|
||||||
|
|
||||||
import (
|
|
||||||
"os"
|
|
||||||
|
|
||||||
"foosoft.net/projects/jmdict"
|
|
||||||
)
|
|
||||||
|
|
||||||
func jmnedictBuildTagMeta(entities map[string]string) dbTagList {
|
|
||||||
var tags dbTagList
|
|
||||||
|
|
||||||
for name, value := range entities {
|
|
||||||
tag := dbTag{Name: name, Notes: value}
|
|
||||||
|
|
||||||
switch name {
|
|
||||||
case "company", "fem", "given", "masc", "organization", "person", "place", "product", "station", "surname", "unclass", "work":
|
|
||||||
tag.Category = "name"
|
|
||||||
tag.Order = 4
|
|
||||||
}
|
|
||||||
|
|
||||||
tags = append(tags, tag)
|
|
||||||
}
|
|
||||||
|
|
||||||
return tags
|
|
||||||
}
|
|
||||||
|
|
||||||
func jmnedictExtractTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
|
|
||||||
var terms []dbTerm
|
|
||||||
|
|
||||||
convert := func(reading jmdict.JmnedictReading, kanji *jmdict.JmnedictKanji) {
|
|
||||||
if kanji != nil && hasString(kanji.Expression, reading.Restrictions) {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
var term dbTerm
|
|
||||||
term.Sequence = enamdictEntry.Sequence
|
|
||||||
term.addTermTags(reading.Information...)
|
|
||||||
|
|
||||||
if kanji == nil {
|
|
||||||
term.Expression = reading.Reading
|
|
||||||
} else {
|
|
||||||
term.Expression = kanji.Expression
|
|
||||||
term.Reading = reading.Reading
|
|
||||||
term.addTermTags(kanji.Information...)
|
|
||||||
|
|
||||||
for _, priority := range kanji.Priorities {
|
|
||||||
if hasString(priority, reading.Priorities) {
|
|
||||||
term.addTermTags(priority)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, trans := range enamdictEntry.Translations {
|
|
||||||
for _, translation := range trans.Translations {
|
|
||||||
term.Glossary = append(term.Glossary, translation)
|
|
||||||
}
|
|
||||||
term.addDefinitionTags(trans.NameTypes...)
|
|
||||||
}
|
|
||||||
|
|
||||||
terms = append(terms, term)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(enamdictEntry.Kanji) > 0 {
|
|
||||||
for _, kanji := range enamdictEntry.Kanji {
|
|
||||||
for _, reading := range enamdictEntry.Readings {
|
|
||||||
convert(reading, &kanji)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for _, reading := range enamdictEntry.Readings {
|
|
||||||
convert(reading, nil)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return terms
|
|
||||||
}
|
|
||||||
|
|
||||||
func jmnedictExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
|
|
||||||
reader, err := os.Open(inputPath)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
defer reader.Close()
|
|
||||||
|
|
||||||
dict, entities, err := jmdict.LoadJmnedictNoTransform(reader)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
var terms dbTermList
|
|
||||||
for _, entry := range dict.Entries {
|
|
||||||
terms = append(terms, jmnedictExtractTerms(entry)...)
|
|
||||||
}
|
|
||||||
|
|
||||||
if title == "" {
|
|
||||||
title = "JMnedict"
|
|
||||||
}
|
|
||||||
|
|
||||||
recordData := map[string]dbRecordList{
|
|
||||||
"term": terms.crush(),
|
|
||||||
"tag": jmnedictBuildTagMeta(entities).crush(),
|
|
||||||
}
|
|
||||||
|
|
||||||
index := dbIndex{
|
|
||||||
Title: title,
|
|
||||||
Revision: "jmnedict1",
|
|
||||||
Sequenced: true,
|
|
||||||
Attribution: edrdgAttribution,
|
|
||||||
}
|
|
||||||
|
|
||||||
return writeDb(
|
|
||||||
outputPath,
|
|
||||||
index,
|
|
||||||
recordData,
|
|
||||||
stride,
|
|
||||||
pretty,
|
|
||||||
)
|
|
||||||
}
|
|
@ -8,17 +8,6 @@ import (
|
|||||||
"golang.org/x/exp/slices"
|
"golang.org/x/exp/slices"
|
||||||
)
|
)
|
||||||
|
|
||||||
// kata2hira returns word with every katakana character in the ranges ァ-ヶ and
// ヽ-ヾ shifted down by 0x60 to its hiragana counterpart. All other characters
// are left as they are.
func kata2hira(word string) string {
	return strings.Map(func(r rune) rune {
		switch {
		case r >= 'ァ' && r <= 'ヶ', r >= 'ヽ' && r <= 'ヾ':
			return r - 0x60
		}
		return r
	}, word)
}
|
|
||||||
|
|
||||||
func (h *headword) InfoSymbols() string {
|
func (h *headword) InfoSymbols() string {
|
||||||
infoSymbols := []string{}
|
infoSymbols := []string{}
|
||||||
if h.IsPriority {
|
if h.IsPriority {
|
||||||
@ -93,8 +82,8 @@ func needsFormTable(headwords []headword) bool {
|
|||||||
} else if h.IsKanaOnly() {
|
} else if h.IsKanaOnly() {
|
||||||
continue
|
continue
|
||||||
} else if uniqueReading == "" {
|
} else if uniqueReading == "" {
|
||||||
uniqueReading = kata2hira(h.Reading)
|
uniqueReading = katakanaToHiragana(h.Reading)
|
||||||
} else if uniqueReading != kata2hira(h.Reading) {
|
} else if uniqueReading != katakanaToHiragana(h.Reading) {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -99,7 +99,7 @@ func knownEntityTags() []dbTag {
|
|||||||
dbTag{Name: "char", Order: 0, Score: 0, Category: ""}, // character
|
dbTag{Name: "char", Order: 0, Score: 0, Category: ""}, // character
|
||||||
dbTag{Name: "chn", Order: 0, Score: 0, Category: ""}, // children's language
|
dbTag{Name: "chn", Order: 0, Score: 0, Category: ""}, // children's language
|
||||||
dbTag{Name: "col", Order: 0, Score: 0, Category: ""}, // colloquialism
|
dbTag{Name: "col", Order: 0, Score: 0, Category: ""}, // colloquialism
|
||||||
dbTag{Name: "company", Order: 0, Score: 0, Category: ""}, // company name
|
dbTag{Name: "company", Order: 4, Score: 0, Category: "name"}, // company name
|
||||||
dbTag{Name: "creat", Order: 0, Score: 0, Category: ""}, // creature
|
dbTag{Name: "creat", Order: 0, Score: 0, Category: ""}, // creature
|
||||||
dbTag{Name: "dated", Order: -4, Score: 0, Category: "archaism"}, // dated term
|
dbTag{Name: "dated", Order: -4, Score: 0, Category: "archaism"}, // dated term
|
||||||
dbTag{Name: "dei", Order: 0, Score: 0, Category: ""}, // deity
|
dbTag{Name: "dei", Order: 0, Score: 0, Category: ""}, // deity
|
||||||
@ -108,10 +108,10 @@ func knownEntityTags() []dbTag {
|
|||||||
dbTag{Name: "euph", Order: 0, Score: 0, Category: ""}, // euphemistic
|
dbTag{Name: "euph", Order: 0, Score: 0, Category: ""}, // euphemistic
|
||||||
dbTag{Name: "ev", Order: 0, Score: 0, Category: ""}, // event
|
dbTag{Name: "ev", Order: 0, Score: 0, Category: ""}, // event
|
||||||
dbTag{Name: "fam", Order: 0, Score: 0, Category: ""}, // familiar language
|
dbTag{Name: "fam", Order: 0, Score: 0, Category: ""}, // familiar language
|
||||||
dbTag{Name: "fem", Order: 0, Score: 0, Category: ""}, // female term or language
|
dbTag{Name: "fem", Order: 4, Score: 0, Category: "name"}, // female term, language, or name
|
||||||
dbTag{Name: "fict", Order: 0, Score: 0, Category: ""}, // fiction
|
dbTag{Name: "fict", Order: 0, Score: 0, Category: ""}, // fiction
|
||||||
dbTag{Name: "form", Order: 0, Score: 0, Category: ""}, // formal or literary term
|
dbTag{Name: "form", Order: 0, Score: 0, Category: ""}, // formal or literary term
|
||||||
dbTag{Name: "given", Order: 0, Score: 0, Category: ""}, // given name or forename, gender not specified
|
dbTag{Name: "given", Order: 4, Score: 0, Category: "name"}, // given name or forename, gender not specified
|
||||||
dbTag{Name: "group", Order: 0, Score: 0, Category: ""}, // group
|
dbTag{Name: "group", Order: 0, Score: 0, Category: ""}, // group
|
||||||
dbTag{Name: "hist", Order: 0, Score: 0, Category: ""}, // historical term
|
dbTag{Name: "hist", Order: 0, Score: 0, Category: ""}, // historical term
|
||||||
dbTag{Name: "hon", Order: 0, Score: 0, Category: ""}, // honorific or respectful (sonkeigo) language
|
dbTag{Name: "hon", Order: 0, Score: 0, Category: ""}, // honorific or respectful (sonkeigo) language
|
||||||
@ -120,19 +120,20 @@ func knownEntityTags() []dbTag {
|
|||||||
dbTag{Name: "joc", Order: 0, Score: 0, Category: ""}, // jocular, humorous term
|
dbTag{Name: "joc", Order: 0, Score: 0, Category: ""}, // jocular, humorous term
|
||||||
dbTag{Name: "leg", Order: 0, Score: 0, Category: ""}, // legend
|
dbTag{Name: "leg", Order: 0, Score: 0, Category: ""}, // legend
|
||||||
dbTag{Name: "m-sl", Order: 0, Score: 0, Category: ""}, // manga slang
|
dbTag{Name: "m-sl", Order: 0, Score: 0, Category: ""}, // manga slang
|
||||||
dbTag{Name: "male", Order: 0, Score: 0, Category: ""}, // male term or language
|
dbTag{Name: "male", Order: 4, Score: 0, Category: "name"}, // male term, language, or name
|
||||||
|
dbTag{Name: "masc", Order: 4, Score: 0, Category: "name"}, // male term, language, or name
|
||||||
dbTag{Name: "myth", Order: 0, Score: 0, Category: ""}, // mythology
|
dbTag{Name: "myth", Order: 0, Score: 0, Category: ""}, // mythology
|
||||||
dbTag{Name: "net-sl", Order: 0, Score: 0, Category: ""}, // Internet slang
|
dbTag{Name: "net-sl", Order: 0, Score: 0, Category: ""}, // Internet slang
|
||||||
dbTag{Name: "obj", Order: 0, Score: 0, Category: ""}, // object
|
dbTag{Name: "obj", Order: 0, Score: 0, Category: ""}, // object
|
||||||
dbTag{Name: "obs", Order: -4, Score: 0, Category: "archaism"}, // obsolete term
|
dbTag{Name: "obs", Order: -4, Score: 0, Category: "archaism"}, // obsolete term
|
||||||
dbTag{Name: "on-mim", Order: 0, Score: 0, Category: ""}, // onomatopoeic or mimetic word
|
dbTag{Name: "on-mim", Order: 0, Score: 0, Category: ""}, // onomatopoeic or mimetic word
|
||||||
dbTag{Name: "organization", Order: 0, Score: 0, Category: ""}, // organization name
|
dbTag{Name: "organization", Order: 4, Score: 0, Category: "name"}, // organization name
|
||||||
dbTag{Name: "oth", Order: 0, Score: 0, Category: ""}, // other
|
dbTag{Name: "oth", Order: 0, Score: 0, Category: ""}, // other
|
||||||
dbTag{Name: "person", Order: 0, Score: 0, Category: ""}, // full name of a particular person
|
dbTag{Name: "person", Order: 4, Score: 0, Category: "name"}, // full name of a particular person
|
||||||
dbTag{Name: "place", Order: 0, Score: 0, Category: ""}, // place name
|
dbTag{Name: "place", Order: 4, Score: 0, Category: "name"}, // place name
|
||||||
dbTag{Name: "poet", Order: 0, Score: 0, Category: ""}, // poetical term
|
dbTag{Name: "poet", Order: 0, Score: 0, Category: ""}, // poetical term
|
||||||
dbTag{Name: "pol", Order: 0, Score: 0, Category: ""}, // polite (teineigo) language
|
dbTag{Name: "pol", Order: 0, Score: 0, Category: ""}, // polite (teineigo) language
|
||||||
dbTag{Name: "product", Order: 0, Score: 0, Category: ""}, // product name
|
dbTag{Name: "product", Order: 4, Score: 0, Category: "name"}, // product name
|
||||||
dbTag{Name: "proverb", Order: 0, Score: 0, Category: "expression"}, // proverb
|
dbTag{Name: "proverb", Order: 0, Score: 0, Category: "expression"}, // proverb
|
||||||
dbTag{Name: "quote", Order: 0, Score: 0, Category: "expression"}, // quotation
|
dbTag{Name: "quote", Order: 0, Score: 0, Category: "expression"}, // quotation
|
||||||
dbTag{Name: "rare", Order: -4, Score: 0, Category: "archaism"}, // rare
|
dbTag{Name: "rare", Order: -4, Score: 0, Category: "archaism"}, // rare
|
||||||
@ -141,12 +142,12 @@ func knownEntityTags() []dbTag {
|
|||||||
dbTag{Name: "serv", Order: 0, Score: 0, Category: ""}, // service
|
dbTag{Name: "serv", Order: 0, Score: 0, Category: ""}, // service
|
||||||
dbTag{Name: "ship", Order: 0, Score: 0, Category: ""}, // ship name
|
dbTag{Name: "ship", Order: 0, Score: 0, Category: ""}, // ship name
|
||||||
dbTag{Name: "sl", Order: 0, Score: 0, Category: ""}, // slang
|
dbTag{Name: "sl", Order: 0, Score: 0, Category: ""}, // slang
|
||||||
dbTag{Name: "station", Order: 0, Score: 0, Category: ""}, // railway station
|
dbTag{Name: "station", Order: 4, Score: 0, Category: "name"}, // railway station
|
||||||
dbTag{Name: "surname", Order: 0, Score: 0, Category: ""}, // family or surname
|
dbTag{Name: "surname", Order: 4, Score: 0, Category: "name"}, // family or surname
|
||||||
dbTag{Name: "uk", Order: 0, Score: 0, Category: ""}, // word usually written using kana alone
|
dbTag{Name: "uk", Order: 0, Score: 0, Category: ""}, // word usually written using kana alone
|
||||||
dbTag{Name: "unclass", Order: 0, Score: 0, Category: ""}, // unclassified name
|
dbTag{Name: "unclass", Order: 4, Score: 0, Category: "name"}, // unclassified name
|
||||||
dbTag{Name: "vulg", Order: 0, Score: 0, Category: ""}, // vulgar expression or word
|
dbTag{Name: "vulg", Order: 0, Score: 0, Category: ""}, // vulgar expression or word
|
||||||
dbTag{Name: "work", Order: 0, Score: 0, Category: ""}, // work of art, literature, music, etc. name
|
dbTag{Name: "work", Order: 4, Score: 0, Category: "name"}, // work of art, literature, music, etc. name
|
||||||
dbTag{Name: "X", Order: 0, Score: 0, Category: ""}, // rude or X-rated term (not displayed in educational software)
|
dbTag{Name: "X", Order: 0, Score: 0, Category: ""}, // rude or X-rated term (not displayed in educational software)
|
||||||
dbTag{Name: "yoji", Order: 0, Score: 0, Category: ""}, // yojijukugo
|
dbTag{Name: "yoji", Order: 0, Score: 0, Category: ""}, // yojijukugo
|
||||||
|
|
||||||
|
132
jmnedict.go
Normal file
132
jmnedict.go
Normal file
@ -0,0 +1,132 @@
|
|||||||
|
package yomichan
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"regexp"
|
||||||
|
|
||||||
|
"foosoft.net/projects/jmdict"
|
||||||
|
)
|
||||||
|
|
||||||
|
func jmnedictPublicationDate(dictionary jmdict.Jmnedict) string {
|
||||||
|
if len(dictionary.Entries) == 0 {
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
dateEntry := dictionary.Entries[len(dictionary.Entries)-1]
|
||||||
|
if len(dateEntry.Translations) == 0 || len(dateEntry.Translations[0].Translations) == 0 {
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`)
|
||||||
|
jmnedictDate := r.FindString(dateEntry.Translations[0].Translations[0])
|
||||||
|
if jmnedictDate != "" {
|
||||||
|
return jmnedictDate
|
||||||
|
} else {
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func jmnedictSenseTerm(headword headword, seq sequence, sense jmdict.JmnedictTranslation, senseNumber int) dbTerm {
|
||||||
|
term := dbTerm{
|
||||||
|
Expression: headword.Expression,
|
||||||
|
Reading: headword.Reading,
|
||||||
|
Sequence: seq,
|
||||||
|
}
|
||||||
|
for _, gloss := range sense.Translations {
|
||||||
|
term.Glossary = append(term.Glossary, gloss)
|
||||||
|
}
|
||||||
|
term.addDefinitionTags(sense.NameTypes...)
|
||||||
|
term.Score = calculateTermScore(senseNumber, 0, headword)
|
||||||
|
return term
|
||||||
|
}
|
||||||
|
|
||||||
|
func jmnedictTerms(headword headword, entry jmdict.JmnedictEntry, g genericTermInfo) []dbTerm {
|
||||||
|
terms := []dbTerm{}
|
||||||
|
for idx, sense := range entry.Translations {
|
||||||
|
if g.IsGenericName(headword, sense.Translations) {
|
||||||
|
g.AddGlosses(headword.Expression, sense.NameTypes, headword.Reading)
|
||||||
|
} else {
|
||||||
|
g.AddUsedSequence(entry.Sequence)
|
||||||
|
senseTerm := jmnedictSenseTerm(headword, entry.Sequence, sense, idx+1)
|
||||||
|
terms = append(terms, senseTerm)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return terms
|
||||||
|
}
|
||||||
|
|
||||||
|
func jmnedictHeadwords(entry jmdict.JmnedictEntry) (headwords []headword) {
|
||||||
|
// Note that JMnedict doesn't (currently) use priority tags,
|
||||||
|
// frequency tags, or any sort of reading/kanji restrictions.
|
||||||
|
for _, reading := range entry.Readings {
|
||||||
|
for _, kanji := range entry.Kanji {
|
||||||
|
h := headword{
|
||||||
|
Expression: kanji.Expression,
|
||||||
|
Reading: reading.Reading,
|
||||||
|
}
|
||||||
|
h.Index = len(headwords)
|
||||||
|
headwords = append(headwords, h)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(entry.Kanji) == 0 {
|
||||||
|
for _, reading := range entry.Readings {
|
||||||
|
h := headword{
|
||||||
|
Expression: reading.Reading,
|
||||||
|
Reading: reading.Reading,
|
||||||
|
}
|
||||||
|
h.Index = len(headwords)
|
||||||
|
headwords = append(headwords, h)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return headwords
|
||||||
|
}
|
||||||
|
|
||||||
|
func jmnedictExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
|
||||||
|
reader, err := os.Open(inputPath)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer reader.Close()
|
||||||
|
|
||||||
|
dictionary, entities, err := jmdict.LoadJmnedictNoTransform(reader)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
genericTermInfo := newGenericTermInfo()
|
||||||
|
|
||||||
|
terms := dbTermList{}
|
||||||
|
for _, entry := range dictionary.Entries {
|
||||||
|
headwords := jmnedictHeadwords(entry)
|
||||||
|
for _, headword := range headwords {
|
||||||
|
newTerms := jmnedictTerms(headword, entry, genericTermInfo)
|
||||||
|
terms = append(terms, newTerms...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
terms = append(terms, genericTermInfo.Terms()...)
|
||||||
|
|
||||||
|
tags := dbTagList{}
|
||||||
|
tags = append(tags, entityTags(entities)...)
|
||||||
|
|
||||||
|
recordData := map[string]dbRecordList{
|
||||||
|
"term": terms.crush(),
|
||||||
|
"tag": tags.crush(),
|
||||||
|
}
|
||||||
|
|
||||||
|
if title == "" {
|
||||||
|
title = "JMnedict"
|
||||||
|
}
|
||||||
|
jmnedictDate := jmnedictPublicationDate(dictionary)
|
||||||
|
|
||||||
|
index := dbIndex{
|
||||||
|
Title: title,
|
||||||
|
Revision: "JMnedict." + jmnedictDate,
|
||||||
|
Sequenced: true,
|
||||||
|
Attribution: edrdgAttribution,
|
||||||
|
}
|
||||||
|
|
||||||
|
return writeDb(
|
||||||
|
outputPath,
|
||||||
|
index,
|
||||||
|
recordData,
|
||||||
|
stride,
|
||||||
|
pretty,
|
||||||
|
)
|
||||||
|
}
|
80
jmnedict_generic_terms.go
Normal file
80
jmnedict_generic_terms.go
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
package yomichan
|
||||||
|
|
||||||
|
import (
|
||||||
|
"golang.org/x/exp/slices"
|
||||||
|
)
|
||||||
|
|
||||||
|
// genericTermMap maps an expression to a map from tag to the list of glosses
// collected for that expression/tag pair.
type genericTermMap map[string]map[string][]string

// genericTermInfo accumulates the glosses of names classified as generic and
// tracks sequence numbers so that fresh, unused ones can be handed out.
type genericTermInfo struct {
	// expression -> tag -> glosses, filled by AddGlosses.
	expressionToTagToGlosses genericTermMap
	// Sequence numbers that are already taken; consulted by NewSequence.
	usedSequences map[sequence]bool
	// The sequence number most recently returned by NewSequence.
	currentSequence sequence
}
|
||||||
|
|
||||||
|
func newGenericTermInfo() genericTermInfo {
|
||||||
|
return genericTermInfo{
|
||||||
|
expressionToTagToGlosses: genericTermMap{},
|
||||||
|
usedSequences: map[sequence]bool{},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *genericTermInfo) NewSequence() sequence {
|
||||||
|
seq := i.currentSequence + 1
|
||||||
|
for i.usedSequences[seq] {
|
||||||
|
seq += 1
|
||||||
|
}
|
||||||
|
i.AddUsedSequence(seq)
|
||||||
|
i.currentSequence = seq
|
||||||
|
return seq
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *genericTermInfo) AddUsedSequence(s sequence) {
|
||||||
|
i.usedSequences[s] = true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *genericTermInfo) AddGlosses(exp string, tags []string, gloss string) {
|
||||||
|
if i.expressionToTagToGlosses[exp] == nil {
|
||||||
|
i.expressionToTagToGlosses[exp] = map[string][]string{}
|
||||||
|
}
|
||||||
|
for _, tag := range tags {
|
||||||
|
glosses := i.expressionToTagToGlosses[exp][tag]
|
||||||
|
if !slices.Contains(glosses, gloss) {
|
||||||
|
glosses = append(glosses, gloss)
|
||||||
|
i.expressionToTagToGlosses[exp][tag] = glosses
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *genericTermInfo) IsGenericName(headword headword, definitions []string) bool {
|
||||||
|
if headword.IsKanaOnly() {
|
||||||
|
// No reason to process these terms.
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
isGenericName := true
|
||||||
|
for _, definition := range definitions {
|
||||||
|
if !isTransliteration(definition, headword.Reading) {
|
||||||
|
isGenericName = false
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return isGenericName
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *genericTermInfo) Terms() (terms []dbTerm) {
|
||||||
|
for expression, tagToGlosses := range i.expressionToTagToGlosses {
|
||||||
|
seq := i.NewSequence()
|
||||||
|
for tag, glosses := range tagToGlosses {
|
||||||
|
term := dbTerm{
|
||||||
|
Expression: expression,
|
||||||
|
Sequence: seq,
|
||||||
|
}
|
||||||
|
for _, gloss := range glosses {
|
||||||
|
term.Glossary = append(term.Glossary, gloss)
|
||||||
|
}
|
||||||
|
term.addDefinitionTags(tag)
|
||||||
|
terms = append(terms, term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return terms
|
||||||
|
}
|
254
jmnedict_text_util.go
Normal file
254
jmnedict_text_util.go
Normal file
@ -0,0 +1,254 @@
|
|||||||
|
package yomichan
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"golang.org/x/exp/slices"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Returns text with every katakana character in the ranges ァ-ヶ and ヽ-ヾ
// replaced by the hiragana character 0x60 code points below it. All other
// characters are left unchanged.
func katakanaToHiragana(text string) string {
	const kanaOffset = 0x60

	toHiragana := func(r rune) rune {
		switch {
		case 'ァ' <= r && r <= 'ヶ':
			return r - kanaOffset
		case 'ヽ' <= r && r <= 'ヾ':
			return r - kanaOffset
		default:
			return r
		}
	}

	return strings.Map(toHiragana, text)
}
|
||||||
|
|
||||||
|
// Replace hiragana iteration marks with the appropriate characters.
// E.g. "さゝき" -> "ささき"; "たゞの" -> "ただの"
//
// The plain mark ゝ repeats the preceding character and the voiced mark ゞ
// becomes the code point following the preceding character. All ゝ marks are
// resolved before any ゞ mark. A mark at the very start of the text has
// nothing to repeat and is left as it is.
//
// Each mark kind is resolved in one bounded left-to-right pass, so the
// function always terminates, including on input such as "ゝゞ" where the
// replacement for ゞ is ゞ itself. A leading mark does not stop later marks of
// the same kind from being resolved.
func replaceIterationMarks(text string) string {
	// Leave text without marks untouched (no rune round trip).
	if !strings.ContainsAny(text, "ゝゞ") {
		return text
	}

	iterationMarks := [...]struct {
		char   rune
		offset rune
	}{
		{'ゝ', 0x00},
		{'ゞ', 0x01},
	}

	runes := []rune(text)
	for _, mark := range iterationMarks {
		// Start at 1: a mark at index 0 has no preceding character.
		for idx := 1; idx < len(runes); idx++ {
			if runes[idx] == mark.char {
				runes[idx] = runes[idx-1] + mark.offset
			}
		}
	}
	return string(runes)
}
|
||||||
|
|
||||||
|
// Returns an array of the input text split into segments.
|
||||||
|
// E.g. "しょくぎょう" -> ["しょ", "く", "ぎょ", "う"]
|
||||||
|
// Returns nil if no segmentation is possible.
|
||||||
|
func makeKanaSegments(kana string) (segments []string) {
|
||||||
|
hiragana := replaceIterationMarks(katakanaToHiragana(kana))
|
||||||
|
kanaRunes := []rune{}
|
||||||
|
for _, kanaRune := range hiragana {
|
||||||
|
kanaRunes = append(kanaRunes, kanaRune)
|
||||||
|
}
|
||||||
|
kanaRuneCount := len(kanaRunes)
|
||||||
|
for i := 0; i < kanaRuneCount; i++ {
|
||||||
|
for j := 0; j < kanaRuneCount-i; j++ {
|
||||||
|
segment := string(kanaRunes[i : kanaRuneCount-j])
|
||||||
|
if _, ok := kanaSegmentToRomajiList[segment]; ok {
|
||||||
|
segments = append(segments, segment)
|
||||||
|
i = kanaRuneCount - j - 1
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if j == kanaRuneCount-i-1 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return segments
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns a set containing every non-empty prefix of the input text, where
// prefixes are taken byte by byte.
// E.g. "nihon" -> {"n", "ni", "nih", "niho", "nihon"}
func makeSubstringMap(text string) map[string]bool {
	prefixes := map[string]bool{}
	for end := len(text); end > 0; end-- {
		prefixes[text[:end]] = true
	}
	return prefixes
}
|
||||||
|
|
||||||
|
// Determines if the input text is a valid romaji representation of
|
||||||
|
// the input kana.
|
||||||
|
//
|
||||||
|
// The strategy is to calculate every possible romaji representation
|
||||||
|
// of a given string of kana and check if the input text is one of
|
||||||
|
// them. Since the number of combinations grows very large for long
|
||||||
|
// strings of kana, we need to prune invalid branches from the
|
||||||
|
// combination tree along the way.
|
||||||
|
func isTransliteration(text string, kana string) bool {
|
||||||
|
romaji := strings.TrimSpace(strings.ToLower(text))
|
||||||
|
validSubstrings := makeSubstringMap(romaji)
|
||||||
|
kanaSegments := makeKanaSegments(kana)
|
||||||
|
possibilities := []string{""}
|
||||||
|
for _, segment := range kanaSegments {
|
||||||
|
newPossibilities := map[string]bool{}
|
||||||
|
for _, x := range possibilities {
|
||||||
|
for _, y := range kanaSegmentToRomajiList[segment] {
|
||||||
|
z := x + y
|
||||||
|
newPossibilities[z] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
possibilities = nil
|
||||||
|
for z := range newPossibilities {
|
||||||
|
if validSubstrings[z] {
|
||||||
|
possibilities = append(possibilities, z)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if possibilities == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return slices.Contains(possibilities, romaji)
|
||||||
|
}
|
||||||
|
|
||||||
|
// kanaSegmentToRomajiList maps a hiragana segment (a single kana or a
// two-kana combination) to the romaji spellings accepted for it. An empty
// string among the spellings means the segment may contribute nothing to the
// romanization.
var kanaSegmentToRomajiList = map[string][]string{
	"ぁ": {"", "a"},
	"ぃ": {"", "i"},
	"ぅ": {"", "u"},
	"ぇ": {"", "e"},
	"ぉ": {"", "o"},
	"ゃ": {"ya"},
	"ゅ": {"yu"},
	"ょ": {"yo"},
	"ゎ": {"wa"},
	"っ": {"", "k", "g", "s", "z", "t", "d", "f", "h", "b", "p", "n", "m", "y", "w", "c"},
	"ー": {"", "a", "i", "u", "e", "o", "-"},
	"あ": {"", "a", "ā", "wa", "wā"},
	"い": {"", "i", "ī", "wi", "wī"},
	"う": {"", "u", "ū", "wu", "wū"},
	"え": {"", "e", "ē", "we", "wē"},
	"お": {"", "o", "ō", "wo", "wō"},
	"ゔ": {"vu", "vū", "bu", "bū"},
	"か": {"ka", "kā"},
	"が": {"ga", "gā"},
	"き": {"ki", "kī"},
	"ぎ": {"gi", "gī"},
	"く": {"ku", "kū"},
	"ぐ": {"gu", "gū"},
	"け": {"ke", "kē"},
	"げ": {"ge", "gē"},
	"こ": {"ko", "kō"},
	"ご": {"go", "gō"},
	"さ": {"sa", "sā"},
	"ざ": {"za", "zā"},
	"し": {"si", "sī", "shi", "shī"},
	"じ": {"zi", "zī", "ji", "jī"},
	"す": {"su", "sū"},
	"ず": {"zu", "zū"},
	"せ": {"se", "sē"},
	"ぜ": {"ze", "zē"},
	"そ": {"so", "sō"},
	"ぞ": {"zo", "zō"},
	"た": {"ta", "tā"},
	"だ": {"da", "dā"},
	"ち": {"ti", "tī", "chi", "chī"},
	"ぢ": {"di", "dī", "dhi", "dhī", "ji", "jī", "dji", "djī", "dzi", "dzī"},
	"つ": {"tu", "tū", "tsu", "tsū"},
	"づ": {"du", "dū", "dzu", "dzū", "zu", "zū"},
	"て": {"te", "tē"},
	"で": {"de", "dē"},
	"と": {"to", "tō"},
	"ど": {"do", "dō"},
	"な": {"na", "nā"},
	"に": {"ni", "nī"},
	"ぬ": {"nu", "nū"},
	"ね": {"ne", "nē"},
	"の": {"no", "nō"},
	"は": {"ha", "hā", "wa", "wā", "a", "ā"},
	"ば": {"ba", "bā"},
	"ぱ": {"pa", "pā"},
	"ひ": {"hi", "hī", "i", "ī"},
	"び": {"bi", "bī"},
	"ぴ": {"pi", "pī"},
	"ふ": {"hu", "hū", "fu", "fū", "u", "ū"},
	"ぶ": {"bu", "bū"},
	"ぷ": {"pu", "pū"},
	"へ": {"he", "hē", "e", "ē"},
	"べ": {"be", "bē"},
	"ぺ": {"pe", "pē"},
	"ほ": {"ho", "hō", "o", "ō"},
	"ぼ": {"bo", "bō"},
	"ぽ": {"po", "pō"},
	"ま": {"ma", "mā"},
	"み": {"mi", "mī"},
	"む": {"mu", "mū"},
	"め": {"me", "mē"},
	"も": {"mo", "mō"},
	"や": {"ya", "yā"},
	"ゆ": {"yu", "yū"},
	"よ": {"yo", "yō"},
	"ら": {"ra", "rā"},
	"り": {"ri", "rī"},
	"る": {"ru", "rū"},
	"れ": {"re", "rē"},
	"ろ": {"ro", "rō"},
	"わ": {"wa", "wā"},
	"ゐ": {"wi", "wī", "i", "ī"},
	"ゑ": {"we", "wē", "e", "ē"},
	"を": {"wo", "wō", "o", "ō"},
	"ん": {"n", "n'", "m"},
	"うぁ": {"wa", "wā", "ua", "uā"},
	"うぃ": {"wi", "wī", "ui", "uī"},
	"うぇ": {"we", "wē", "ue", "uē"},
	"うぉ": {"wo", "wō", "uo", "uō"},
	"きゃ": {"kya", "kyā"},
	"きゅ": {"kyu", "kyū"},
	"きょ": {"kyo", "kyō"},
	"ぎゃ": {"gya", "gyā"},
	"ぎゅ": {"gyu", "gyū"},
	"ぎょ": {"gyo", "gyō"},
	"くゎ": {"kwa", "kwā"},
	"くゅ": {"kyu", "kyū"},
	"しぇ": {"she", "shē", "shie", "shiē"},
	"しゃ": {"sha", "shā", "sya", "syā"},
	"しゅ": {"shu", "shū", "syu", "syū"},
	"しょ": {"sho", "shō", "syo", "syō"},
	"じぇ": {"je", "jē"},
	"じゃ": {"ja", "jā", "jya", "jyā"},
	"じゅ": {"ju", "jū", "jyu", "jyū"},
	"じょ": {"jo", "jō", "jyo", "jyō"},
	"ちぁ": {"cha", "chā", "chia", "chiā"},
	"ちぇ": {"che", "chē", "chie", "chiē"},
	"ちゃ": {"cha", "chā", "tya", "tyā"},
	"ちゅ": {"chu", "chū", "tyu", "tyū"},
	"ちょ": {"cho", "chō", "tyo", "tyō"},
	"ぢゃ": {"ja", "jā", "jya", "jyā", "dya", "dyā"},
	"ぢゅ": {"ju", "jū", "jyu", "jyū", "dyu", "dyū"},
	"ぢょ": {"jo", "jō", "jyo", "jyō", "dyo", "dyō"},
	"つぁ": {"tsa", "tsā", "tsua", "tsuā"},
	"つぇ": {"tse", "tsē", "tsue", "tsuē"},
	"てぃ": {"ti", "tī", "tei", "teī"},
	"でぃ": {"di", "dī", "dei", "deī"},
	"でゅ": {"dyu", "dyū", "deyu", "deyū"},
	"にゃ": {"nya", "nyā"},
	"にゅ": {"nyu", "nyū"},
	"にょ": {"nyo", "nyō"},
	"ひゃ": {"hya", "hyā"},
	"ひゅ": {"hyu", "hyū"},
	"ひょ": {"hyo", "hyō"},
	"びゃ": {"bya", "byā"},
	"びゅ": {"byu", "byū"},
	"びょ": {"byo", "byō"},
	"ぴゃ": {"pya", "pyā"},
	"ぴゅ": {"pyu", "pyū"},
	"ぴょ": {"pyo", "pyō"},
	"ふぁ": {"fa", "fā"},
	"ふぃ": {"fi", "fī"},
	"ふぇ": {"fe", "fē"},
	"ふぉ": {"fo", "fō"},
	"みゃ": {"mya", "myā"},
	"みゅ": {"myu", "myū"},
	"みょ": {"myo", "myō"},
	"りゃ": {"rya", "ryā"},
	"りゅ": {"ryu", "ryū"},
	"りょ": {"ryo", "ryō"},
}
|
Loading…
Reference in New Issue
Block a user