// Source: yomichan-import/jmdict_metadata.go (180 lines, 5.4 KiB, Go)

package yomichan
import (
"strings"
"foosoft.net/projects/jmdict"
"golang.org/x/exp/slices"
)
// sequence is an alias for int. In this file it is used for the values
// taken from entry.Sequence, which key most of the metadata maps.
type sequence = int
type jmdictMetadata struct {
language string
condensedGlosses map[senseID]string
seqToSenseCount map[sequence]int
seqToPartsOfSpeech map[sequence][]string
2023-01-22 20:37:18 +00:00
seqToMainHeadword map[sequence]headword
expHashToReadings map[hash][]string
headwordHashToSeqs map[hash][]sequence
references []string
referenceToSeq map[string]sequence
hashToSearchValues map[hash][]searchValue
seqToSearchHashes map[sequence][]searchHash
entryDepth map[sequence]int
2023-01-22 20:37:18 +00:00
hasMultipleForms map[sequence]bool
maxSenseCount int
extraMode bool
2023-01-22 20:37:18 +00:00
}
// senseID identifies one sense of an entry. AddEntry builds it
// positionally as senseID{entry.Sequence, senseCount}, so the field
// order must not change.
type senseID struct {
// sequence is the sequence number of the entry that owns the sense.
sequence sequence
// number is the 1-based position of the sense among the senses that
// AddEntry counted for the target language.
number int
}
// CalculateEntryDepth stores a depth for the entry seq in meta.entryDepth.
//
// The depth is one more than the largest depth of any entry registered in
// meta.headwordHashToSeqs under the hash of one of the given headwords.
// Entries met along the way that have no depth yet (zero) are given a
// depth of 1 first. If no entry is found at all, seq gets a depth of 0.
//
// This is to ensure that terms are grouped among their entries of origin
// and displayed in correct sequential order.
func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, seq sequence) {
	deepest := 0
	for _, hw := range headwords {
		for _, relatedSeq := range meta.headwordHashToSeqs[hw.Hash()] {
			depth := meta.entryDepth[relatedSeq]
			if depth == 0 {
				// First time this entry is seen: give it the base depth.
				depth = 1
				meta.entryDepth[relatedSeq] = depth
			}
			if candidate := depth + 1; candidate > deepest {
				deepest = candidate
			}
		}
	}
	meta.entryDepth[seq] = deepest
}
func (meta *jmdictMetadata) AddEntry(entry jmdict.JmdictEntry) {
partsOfSpeech := []string{}
senseCount := 0
for _, sense := range entry.Sense {
// Only English-language senses contain part-of-speech info,
// but other languages need them for deinflection rules.
for _, pos := range sense.PartsOfSpeech {
if !slices.Contains(partsOfSpeech, pos) {
partsOfSpeech = append(partsOfSpeech, pos)
2023-01-22 20:37:18 +00:00
}
}
if glossaryContainsLanguage(sense.Glossary, meta.language) {
senseCount += 1
} else {
continue
}
for _, reference := range sense.References {
meta.references = append(meta.references, reference)
}
for _, antonym := range sense.Antonyms {
meta.references = append(meta.references, antonym)
}
currentSenseID := senseID{entry.Sequence, senseCount}
glosses := []string{}
for _, gloss := range sense.Glossary {
if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
glosses = append(glosses, gloss.Content)
}
}
meta.condensedGlosses[currentSenseID] = strings.Join(glosses, "; ")
2023-01-22 20:37:18 +00:00
}
meta.seqToPartsOfSpeech[entry.Sequence] = partsOfSpeech
meta.seqToSenseCount[entry.Sequence] = senseCount
}
2023-01-22 20:37:18 +00:00
func (meta *jmdictMetadata) AddHeadword(headword headword, seq sequence) {
if meta.seqToSenseCount[seq] == 0 {
2023-01-22 20:37:18 +00:00
return
}
// main headwords (first ones that are found in entries).
if _, ok := meta.seqToMainHeadword[seq]; !ok {
meta.seqToMainHeadword[seq] = headword
2023-01-22 20:37:18 +00:00
}
// hash the term pair so we can determine if it's used
// in more than one JMdict entry later.
headwordHash := headword.Hash()
if !slices.Contains(meta.headwordHashToSeqs[headwordHash], seq) {
meta.headwordHashToSeqs[headwordHash] =
append(meta.headwordHashToSeqs[headwordHash], seq)
2023-01-22 20:37:18 +00:00
}
// hash the expression so that we can determine if we
// need to disambiguate it by displaying its reading
// in reference notes later.
expHash := headword.ExpHash()
if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) {
meta.expHashToReadings[expHash] =
append(meta.expHashToReadings[expHash], headword.Reading)
2023-01-22 20:37:18 +00:00
}
// e.g. for JMdict (English) we expect to end up with
// seqToHashedHeadwords[1260670] == 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】
// used for correlating references to sequence numbers later.
searchHashes := []searchHash{
searchHash{headwordHash, headword.IsPriority},
searchHash{expHash, headword.IsPriority},
searchHash{headword.ReadingHash(), headword.IsPriority},
}
for _, x := range searchHashes {
if !slices.Contains(meta.seqToSearchHashes[seq], x) {
meta.seqToSearchHashes[seq] = append(meta.seqToSearchHashes[seq], x)
2023-01-22 20:37:18 +00:00
}
}
}
func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata {
meta := jmdictMetadata{
language: langNameToCode[languageName],
seqToSenseCount: make(map[sequence]int),
seqToPartsOfSpeech: make(map[sequence][]string),
2023-01-22 20:37:18 +00:00
condensedGlosses: make(map[senseID]string),
seqToMainHeadword: make(map[sequence]headword),
expHashToReadings: make(map[hash][]string),
seqToSearchHashes: make(map[sequence][]searchHash),
headwordHashToSeqs: make(map[hash][]sequence),
references: []string{},
hashToSearchValues: nil,
referenceToSeq: nil,
entryDepth: make(map[sequence]int),
2023-01-22 20:37:18 +00:00
hasMultipleForms: make(map[sequence]bool),
maxSenseCount: 0,
extraMode: languageName == "english_extra",
2023-01-22 20:37:18 +00:00
}
for _, entry := range dictionary.Entries {
meta.AddEntry(entry)
2023-01-22 20:37:18 +00:00
headwords := extractHeadwords(entry)
formCount := 0
for _, headword := range headwords {
meta.AddHeadword(headword, entry.Sequence)
2023-01-22 20:37:18 +00:00
if !headword.IsSearchOnly {
formCount += 1
}
}
meta.CalculateEntryDepth(headwords, entry.Sequence)
2023-01-22 20:37:18 +00:00
meta.hasMultipleForms[entry.Sequence] = (formCount > 1)
}
// this correlation process will be unnecessary once JMdict
// includes sequence numbers in its cross-reference data
meta.MakeReferenceToSeqMap()
for _, senseCount := range meta.seqToSenseCount {
if meta.maxSenseCount < senseCount {
meta.maxSenseCount = senseCount
}
}
return meta
}