159 lines
4.9 KiB
Go
159 lines
4.9 KiB
Go
package yomichan
|
|
|
|
import (
|
|
"strings"
|
|
|
|
"foosoft.net/projects/jmdict"
|
|
"golang.org/x/exp/slices"
|
|
)
|
|
|
|
// sequence is an alias for int. In this file, JMdict entry sequence
// numbers (entry.Sequence) are used as map keys of this type.
type sequence = int
|
|
|
|
type jmdictMetadata struct {
|
|
language string
|
|
condensedGlosses map[senseID]string
|
|
seqToSenseCount map[sequence]int
|
|
seqToMainHeadword map[sequence]headword
|
|
expHashToReadings map[hash][]string
|
|
headwordHashToSeqs map[hash][]sequence
|
|
references []string
|
|
referenceToSeq map[string]sequence
|
|
hashToSearchValues map[hash][]searchValue
|
|
seqToSearchHashes map[sequence][]searchHash
|
|
hasMultipleForms map[sequence]bool
|
|
maxSenseCount int
|
|
}
|
|
|
|
type senseID struct {
|
|
sequence sequence
|
|
number int
|
|
}
|
|
|
|
func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEntry) {
|
|
|
|
// Determine how many senses are in this entry for this language
|
|
if _, ok := meta.seqToSenseCount[entry.Sequence]; !ok {
|
|
senseCount := 0
|
|
for _, entrySense := range entry.Sense {
|
|
for _, gloss := range entrySense.Glossary {
|
|
if glossContainsLanguage(gloss, meta.language) {
|
|
senseCount += 1
|
|
break
|
|
}
|
|
}
|
|
}
|
|
meta.seqToSenseCount[entry.Sequence] = senseCount
|
|
}
|
|
|
|
if meta.seqToSenseCount[entry.Sequence] == 0 {
|
|
return
|
|
}
|
|
|
|
// main headwords (first ones that are found in entries).
|
|
if _, ok := meta.seqToMainHeadword[entry.Sequence]; !ok {
|
|
meta.seqToMainHeadword[entry.Sequence] = headword
|
|
}
|
|
|
|
// hash the term pair so we can determine if it's used
|
|
// in more than one JMdict entry later.
|
|
headwordHash := headword.Hash()
|
|
if !slices.Contains(meta.headwordHashToSeqs[headwordHash], entry.Sequence) {
|
|
meta.headwordHashToSeqs[headwordHash] = append(meta.headwordHashToSeqs[headwordHash], entry.Sequence)
|
|
}
|
|
|
|
// hash the expression so that we can determine if we
|
|
// need to disambiguate it by displaying its reading
|
|
// in reference notes later.
|
|
expHash := headword.ExpHash()
|
|
if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) {
|
|
meta.expHashToReadings[expHash] = append(meta.expHashToReadings[expHash], headword.Reading)
|
|
}
|
|
|
|
// e.g. for JMdict (English) we expect to end up with
|
|
// seqToHashedHeadwords[1260670] == 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】
|
|
// used for correlating references to sequence numbers later.
|
|
searchHashes := []searchHash{
|
|
searchHash{headwordHash, headword.IsPriority},
|
|
searchHash{expHash, headword.IsPriority},
|
|
searchHash{headword.ReadingHash(), headword.IsPriority},
|
|
}
|
|
for _, x := range searchHashes {
|
|
if !slices.Contains(meta.seqToSearchHashes[entry.Sequence], x) {
|
|
meta.seqToSearchHashes[entry.Sequence] = append(meta.seqToSearchHashes[entry.Sequence], x)
|
|
}
|
|
}
|
|
|
|
currentSenseNumber := 1
|
|
for _, entrySense := range entry.Sense {
|
|
if !glossaryContainsLanguage(entrySense.Glossary, meta.language) {
|
|
continue
|
|
}
|
|
if entrySense.RestrictedReadings != nil && !slices.Contains(entrySense.RestrictedReadings, headword.Reading) {
|
|
currentSenseNumber += 1
|
|
continue
|
|
}
|
|
if entrySense.RestrictedKanji != nil && !slices.Contains(entrySense.RestrictedKanji, headword.Expression) {
|
|
currentSenseNumber += 1
|
|
continue
|
|
}
|
|
|
|
allReferences := append(entrySense.References, entrySense.Antonyms...)
|
|
for _, reference := range allReferences {
|
|
meta.references = append(meta.references, reference)
|
|
}
|
|
|
|
currentSense := senseID{entry.Sequence, currentSenseNumber}
|
|
if meta.condensedGlosses[currentSense] == "" {
|
|
glosses := []string{}
|
|
for _, gloss := range entrySense.Glossary {
|
|
if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
|
|
glosses = append(glosses, gloss.Content)
|
|
}
|
|
}
|
|
meta.condensedGlosses[currentSense] = strings.Join(glosses, "; ")
|
|
}
|
|
currentSenseNumber += 1
|
|
}
|
|
}
|
|
|
|
func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata {
|
|
meta := jmdictMetadata{
|
|
language: langNameToCode[languageName],
|
|
seqToSenseCount: make(map[sequence]int),
|
|
condensedGlosses: make(map[senseID]string),
|
|
seqToMainHeadword: make(map[sequence]headword),
|
|
expHashToReadings: make(map[hash][]string),
|
|
seqToSearchHashes: make(map[sequence][]searchHash),
|
|
headwordHashToSeqs: make(map[hash][]sequence),
|
|
references: []string{},
|
|
hashToSearchValues: nil,
|
|
referenceToSeq: nil,
|
|
hasMultipleForms: make(map[sequence]bool),
|
|
maxSenseCount: 0,
|
|
}
|
|
|
|
for _, entry := range dictionary.Entries {
|
|
headwords := extractHeadwords(entry)
|
|
formCount := 0
|
|
for _, headword := range headwords {
|
|
meta.AddHeadword(headword, entry)
|
|
if !headword.IsSearchOnly {
|
|
formCount += 1
|
|
}
|
|
}
|
|
meta.hasMultipleForms[entry.Sequence] = (formCount > 1)
|
|
}
|
|
|
|
// this correlation process will be unnecessary once JMdict
|
|
// includes sequence numbers in its cross-reference data
|
|
meta.MakeReferenceToSeqMap()
|
|
|
|
for _, senseCount := range meta.seqToSenseCount {
|
|
if meta.maxSenseCount < senseCount {
|
|
meta.maxSenseCount = senseCount
|
|
}
|
|
}
|
|
|
|
return meta
|
|
}
|