1
yomichan-import/jmdict_metadata.go
stephenmk 7bff70b71c
JMdict: Ensure part-of-speech info is added in non-English versions
Only English-language senses in JMdict contain part-of-speech tags.
This info is displayed to users in definition tags and also used
for deinflecting verbs and adjectives during term lookups.

The old version of Yomichan-Import took the PoS tags from the final
sense in the English version of an entry and applied them to every
sense of every other language. For example, 川・かわ has two senses in
English JMdict: a noun sense and a suffix sense. Therefore every sense
of 川・かわ in every other language was tagged as a suffix.

Instead, I suggest gathering all distinct PoS tags from each English
entry and applying them all to each non-English sense. Every
non-English sense of 川・かわ will therefore be tagged as both a noun
and suffix.
2023-02-02 10:44:16 -06:00

180 lines
5.4 KiB
Go

package yomichan
import (
"strings"
"foosoft.net/projects/jmdict"
"golang.org/x/exp/slices"
)
type sequence = int
type jmdictMetadata struct {
language string
condensedGlosses map[senseID]string
seqToSenseCount map[sequence]int
seqToPartsOfSpeech map[sequence][]string
seqToMainHeadword map[sequence]headword
expHashToReadings map[hash][]string
headwordHashToSeqs map[hash][]sequence
references []string
referenceToSeq map[string]sequence
hashToSearchValues map[hash][]searchValue
seqToSearchHashes map[sequence][]searchHash
entryDepth map[sequence]int
hasMultipleForms map[sequence]bool
maxSenseCount int
extraMode bool
}
type senseID struct {
sequence sequence
number int
}
func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, seq sequence) {
// This is to ensure that terms are grouped among their
// entries of origin and displayed in correct sequential order
maxDepth := 0
for _, headword := range headwords {
hash := headword.Hash()
for _, seq := range meta.headwordHashToSeqs[hash] {
seqDepth := meta.entryDepth[seq]
if seqDepth == 0 {
meta.entryDepth[seq] = 1
seqDepth = 1
}
if maxDepth < seqDepth+1 {
maxDepth = seqDepth + 1
}
}
}
meta.entryDepth[seq] = maxDepth
}
func (meta *jmdictMetadata) AddEntry(entry jmdict.JmdictEntry) {
partsOfSpeech := []string{}
senseCount := 0
for _, sense := range entry.Sense {
// Only English-language senses contain part-of-speech info,
// but other languages need them for deinflection rules.
for _, pos := range sense.PartsOfSpeech {
if !slices.Contains(partsOfSpeech, pos) {
partsOfSpeech = append(partsOfSpeech, pos)
}
}
if glossaryContainsLanguage(sense.Glossary, meta.language) {
senseCount += 1
} else {
continue
}
for _, reference := range sense.References {
meta.references = append(meta.references, reference)
}
for _, antonym := range sense.Antonyms {
meta.references = append(meta.references, antonym)
}
currentSenseID := senseID{entry.Sequence, senseCount}
glosses := []string{}
for _, gloss := range sense.Glossary {
if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
glosses = append(glosses, gloss.Content)
}
}
meta.condensedGlosses[currentSenseID] = strings.Join(glosses, "; ")
}
meta.seqToPartsOfSpeech[entry.Sequence] = partsOfSpeech
meta.seqToSenseCount[entry.Sequence] = senseCount
}
func (meta *jmdictMetadata) AddHeadword(headword headword, seq sequence) {
if meta.seqToSenseCount[seq] == 0 {
return
}
// main headwords (first ones that are found in entries).
if _, ok := meta.seqToMainHeadword[seq]; !ok {
meta.seqToMainHeadword[seq] = headword
}
// hash the term pair so we can determine if it's used
// in more than one JMdict entry later.
headwordHash := headword.Hash()
if !slices.Contains(meta.headwordHashToSeqs[headwordHash], seq) {
meta.headwordHashToSeqs[headwordHash] =
append(meta.headwordHashToSeqs[headwordHash], seq)
}
// hash the expression so that we can determine if we
// need to disambiguate it by displaying its reading
// in reference notes later.
expHash := headword.ExpHash()
if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) {
meta.expHashToReadings[expHash] =
append(meta.expHashToReadings[expHash], headword.Reading)
}
// e.g. for JMdict (English) we expect to end up with
// seqToHashedHeadwords[1260670] == 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】
// used for correlating references to sequence numbers later.
searchHashes := []searchHash{
searchHash{headwordHash, headword.IsPriority},
searchHash{expHash, headword.IsPriority},
searchHash{headword.ReadingHash(), headword.IsPriority},
}
for _, x := range searchHashes {
if !slices.Contains(meta.seqToSearchHashes[seq], x) {
meta.seqToSearchHashes[seq] = append(meta.seqToSearchHashes[seq], x)
}
}
}
func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata {
meta := jmdictMetadata{
language: langNameToCode[languageName],
seqToSenseCount: make(map[sequence]int),
seqToPartsOfSpeech: make(map[sequence][]string),
condensedGlosses: make(map[senseID]string),
seqToMainHeadword: make(map[sequence]headword),
expHashToReadings: make(map[hash][]string),
seqToSearchHashes: make(map[sequence][]searchHash),
headwordHashToSeqs: make(map[hash][]sequence),
references: []string{},
hashToSearchValues: nil,
referenceToSeq: nil,
entryDepth: make(map[sequence]int),
hasMultipleForms: make(map[sequence]bool),
maxSenseCount: 0,
extraMode: languageName == "english_extra",
}
for _, entry := range dictionary.Entries {
meta.AddEntry(entry)
headwords := extractHeadwords(entry)
formCount := 0
for _, headword := range headwords {
meta.AddHeadword(headword, entry.Sequence)
if !headword.IsSearchOnly {
formCount += 1
}
}
meta.CalculateEntryDepth(headwords, entry.Sequence)
meta.hasMultipleForms[entry.Sequence] = (formCount > 1)
}
// this correlation process will be unnecessary once JMdict
// includes sequence numbers in its cross-reference data
meta.MakeReferenceToSeqMap()
for _, senseCount := range meta.seqToSenseCount {
if meta.maxSenseCount < senseCount {
meta.maxSenseCount = senseCount
}
}
return meta
}