1
yomichan-import/jmdictMetadata.go
stephenmk 517ef3d052
Fix bug in term score assignments
This commit ensures that terms are grouped among their entries of
origin and displayed in correct sequential order in Yomichan's default
result grouping mode, "Group term-reading pairs."
2023-01-27 19:09:12 -06:00

182 lines
5.6 KiB
Go

package yomichan
import (
"strings"
"foosoft.net/projects/jmdict"
"golang.org/x/exp/slices"
)
// sequence is the numeric identifier of a JMdict entry (the value of
// entry.Sequence); it is the key of the per-entry maps in jmdictMetadata.
type sequence = int
// jmdictMetadata holds lookup tables that are built in one pass over all
// dictionary entries (see newJmdictMetadata) for a single target language.
type jmdictMetadata struct {
// language is the code looked up from langNameToCode for the requested
// language name; glosses are filtered against it.
language string
// condensedGlosses maps a sense to its glosses joined with "; ".
condensedGlosses map[senseID]string
// seqToSenseCount maps an entry to the number of its senses that have
// at least one gloss in language.
seqToSenseCount map[sequence]int
// seqToMainHeadword maps an entry to the first headword added for it.
seqToMainHeadword map[sequence]headword
// expHashToReadings maps an expression hash to the distinct readings
// seen with that expression.
expHashToReadings map[hash][]string
// headwordHashToSeqs maps a headword hash to the distinct entries that
// contain that headword.
headwordHashToSeqs map[hash][]sequence
// references accumulates the reference and antonym strings of the
// senses visited by AddHeadword.
references []string
// referenceToSeq starts out nil in newJmdictMetadata.
// NOTE(review): presumably filled in by MakeReferenceToSeqMap - confirm.
referenceToSeq map[string]sequence
// hashToSearchValues starts out nil in newJmdictMetadata.
// NOTE(review): it is not populated by the code visible here - confirm
// where it is built.
hashToSearchValues map[hash][]searchValue
// seqToSearchHashes maps an entry to the distinct headword, expression
// and reading hashes of its headwords, each paired with the headword's
// priority flag.
seqToSearchHashes map[sequence][]searchHash
// entryDepth maps an entry to the depth assigned by CalculateEntryDepth.
entryDepth map[sequence]int
// hasMultipleForms is true for entries with more than one headword
// that is not search-only.
hasMultipleForms map[sequence]bool
// maxSenseCount is the largest value stored in seqToSenseCount.
maxSenseCount int
}
// senseID identifies one sense of one entry; it is the key type of
// jmdictMetadata.condensedGlosses.
type senseID struct {
// sequence is the sequence number of the entry that owns the sense.
sequence sequence
// number is the position of the sense, counted from 1, among the
// entry's senses that contain the target language (see AddHeadword).
number int
}
// CalculateEntryDepth assigns a depth to the entry identified by
// entrySequence so that terms can be grouped among their entries of
// origin and displayed in correct sequential order.
//
// Every entry indexed in headwordHashToSeqs under one of the given
// headwords is inspected. An inspected entry whose depth is still zero is
// given depth 1. The depth stored for entrySequence is one more than the
// largest depth inspected, or zero when none of the headwords is indexed.
func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, entrySequence sequence) {
	deepest := 0
	for _, hw := range headwords {
		sharing := meta.headwordHashToSeqs[hw.Hash()]
		for _, other := range sharing {
			depth := meta.entryDepth[other]
			if depth == 0 {
				// First time this entry is seen: start it at depth 1.
				depth = 1
				meta.entryDepth[other] = depth
			}
			if candidate := depth + 1; candidate > deepest {
				deepest = candidate
			}
		}
	}
	meta.entryDepth[entrySequence] = deepest
}
// AddHeadword records the metadata contributed by one headword of a
// JMdict entry.
//
// The first time an entry is seen, the number of its senses that contain
// at least one gloss in meta.language is counted and cached. Entries with
// no such sense are ignored entirely. Otherwise the headword is:
//   - remembered as the entry's main headword if it is the first one seen,
//   - indexed by headword hash (hash -> entry sequences),
//   - indexed by expression hash (hash -> readings),
//   - registered, together with its priority flag, under the entry's
//     search hashes (headword, expression and reading hash).
//
// Finally, for every sense in meta.language that is not restricted to
// other readings or kanji, the sense's references and antonyms are
// collected and its condensed gloss ("; "-joined glosses without a type)
// is stored if none has been stored yet.
func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEntry) {
	seq := entry.Sequence

	// Determine how many senses are in this entry for this language.
	// The count is computed once per entry and cached.
	senseCount, counted := meta.seqToSenseCount[seq]
	if !counted {
		for _, entrySense := range entry.Sense {
			for _, gloss := range entrySense.Glossary {
				if glossContainsLanguage(gloss, meta.language) {
					senseCount++
					break
				}
			}
		}
		meta.seqToSenseCount[seq] = senseCount
	}
	if senseCount == 0 {
		return
	}

	// Main headwords (first ones that are found in entries).
	if _, ok := meta.seqToMainHeadword[seq]; !ok {
		meta.seqToMainHeadword[seq] = headword
	}

	// Hash the term pair so we can determine if it's used
	// in more than one JMdict entry later.
	headwordHash := headword.Hash()
	if !slices.Contains(meta.headwordHashToSeqs[headwordHash], seq) {
		meta.headwordHashToSeqs[headwordHash] = append(meta.headwordHashToSeqs[headwordHash], seq)
	}

	// Hash the expression so that we can determine if we
	// need to disambiguate it by displaying its reading
	// in reference notes later.
	expHash := headword.ExpHash()
	if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) {
		meta.expHashToReadings[expHash] = append(meta.expHashToReadings[expHash], headword.Reading)
	}

	// e.g. for JMdict (English) we expect to end up with
	// seqToSearchHashes[1260670] == 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】
	// used for correlating references to sequence numbers later.
	searchHashes := []searchHash{
		{headwordHash, headword.IsPriority},
		{expHash, headword.IsPriority},
		{headword.ReadingHash(), headword.IsPriority},
	}
	for _, x := range searchHashes {
		if !slices.Contains(meta.seqToSearchHashes[seq], x) {
			meta.seqToSearchHashes[seq] = append(meta.seqToSearchHashes[seq], x)
		}
	}

	nextSenseNumber := 1
	for _, entrySense := range entry.Sense {
		// Senses without the target language are not numbered at all.
		if !glossaryContainsLanguage(entrySense.Glossary, meta.language) {
			continue
		}

		// Every remaining sense consumes a number, even when it turns
		// out not to apply to this particular headword.
		senseNumber := nextSenseNumber
		nextSenseNumber++

		if entrySense.RestrictedReadings != nil && !slices.Contains(entrySense.RestrictedReadings, headword.Reading) {
			continue
		}
		if entrySense.RestrictedKanji != nil && !slices.Contains(entrySense.RestrictedKanji, headword.Expression) {
			continue
		}

		// Append straight onto meta.references. Appending the antonyms
		// onto entrySense.References first could write into spare
		// capacity of a backing array that belongs to the entry.
		meta.references = append(meta.references, entrySense.References...)
		meta.references = append(meta.references, entrySense.Antonyms...)

		currentSense := senseID{seq, senseNumber}
		if meta.condensedGlosses[currentSense] == "" {
			var glosses []string
			for _, gloss := range entrySense.Glossary {
				if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
					glosses = append(glosses, gloss.Content)
				}
			}
			meta.condensedGlosses[currentSense] = strings.Join(glosses, "; ")
		}
	}
}
// newJmdictMetadata builds the metadata tables for dictionary, restricted
// to the language whose code langNameToCode gives for languageName.
//
// Each entry's headwords are registered with AddHeadword, the entry's
// depth is computed with CalculateEntryDepth, and the entry is flagged as
// having multiple forms when more than one of its headwords is not
// search-only. Afterwards MakeReferenceToSeqMap is run and maxSenseCount
// is raised to the largest per-entry sense count.
func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata {
	// referenceToSeq, hashToSearchValues and maxSenseCount are
	// deliberately left at their zero values (nil, nil and 0) here.
	meta := jmdictMetadata{
		language:           langNameToCode[languageName],
		condensedGlosses:   make(map[senseID]string),
		seqToSenseCount:    make(map[sequence]int),
		seqToMainHeadword:  make(map[sequence]headword),
		expHashToReadings:  make(map[hash][]string),
		headwordHashToSeqs: make(map[hash][]sequence),
		references:         []string{},
		seqToSearchHashes:  make(map[sequence][]searchHash),
		entryDepth:         make(map[sequence]int),
		hasMultipleForms:   make(map[sequence]bool),
	}

	for _, entry := range dictionary.Entries {
		forms := extractHeadwords(entry)
		visibleForms := 0
		for _, form := range forms {
			meta.AddHeadword(form, entry)
			if form.IsSearchOnly {
				continue
			}
			visibleForms++
		}
		meta.CalculateEntryDepth(forms, entry.Sequence)
		meta.hasMultipleForms[entry.Sequence] = visibleForms > 1
	}

	// This correlation process will be unnecessary once JMdict
	// includes sequence numbers in its cross-reference data.
	meta.MakeReferenceToSeqMap()

	// Computed only after the reference map has been built, starting from
	// whatever value maxSenseCount holds at that point.
	largest := meta.maxSenseCount
	for _, count := range meta.seqToSenseCount {
		if count > largest {
			largest = count
		}
	}
	meta.maxSenseCount = largest

	return meta
}