1

JMdict: Ensure part-of-speech info is added in non-English versions

Only English-language senses in JMdict contain part-of-speech tags.
This info is displayed to users in definition tags and also used
for deinflecting verbs and adjectives during term lookups.

The old version of Yomichan-Import took the PoS tags from the final
sense in the English version of an entry and applied them to every
sense of every other language. For example, 川・かわ has two senses in
English JMdict: a noun sense and a suffix sense. Therefore every sense
of 川・かわ in every other language was tagged only as a suffix, and the
noun tag was lost.

Instead, this commit gathers all distinct PoS tags from the English
senses of each entry and applies them all to each non-English sense of
that entry. Every non-English sense of 川・かわ will therefore be tagged
as both a noun and a suffix.
This commit is contained in:
stephenmk 2023-02-02 10:44:16 -06:00
parent 19d6d0bb43
commit 7bff70b71c
No known key found for this signature in database
GPG Key ID: B6DA730DB06235F1
2 changed files with 59 additions and 56 deletions

View File

@ -162,6 +162,13 @@ func jmdictSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headwor
senseNumberTag := strconv.Itoa(senseNumber) senseNumberTag := strconv.Itoa(senseNumber)
term.addDefinitionTags(senseNumberTag) term.addDefinitionTags(senseNumberTag)
} }
if len(sense.PartsOfSpeech) == 0 && meta.language != "eng" {
// This is a hack to provide part-of-speech info to
// non-English versions of JMdict.
sense.PartsOfSpeech = meta.seqToPartsOfSpeech[entry.Sequence]
}
term.addDefinitionTags(sense.PartsOfSpeech...) term.addDefinitionTags(sense.PartsOfSpeech...)
term.addDefinitionTags(sense.Fields...) term.addDefinitionTags(sense.Fields...)
term.addDefinitionTags(sense.Misc...) term.addDefinitionTags(sense.Misc...)

View File

@ -13,6 +13,7 @@ type jmdictMetadata struct {
language string language string
condensedGlosses map[senseID]string condensedGlosses map[senseID]string
seqToSenseCount map[sequence]int seqToSenseCount map[sequence]int
seqToPartsOfSpeech map[sequence][]string
seqToMainHeadword map[sequence]headword seqToMainHeadword map[sequence]headword
expHashToReadings map[hash][]string expHashToReadings map[hash][]string
headwordHashToSeqs map[hash][]sequence headwordHashToSeqs map[hash][]sequence
@ -31,7 +32,7 @@ type senseID struct {
number int number int
} }
func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, entrySequence sequence) { func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, seq sequence) {
// This is to ensure that terms are grouped among their // This is to ensure that terms are grouped among their
// entries of origin and displayed in correct sequential order // entries of origin and displayed in correct sequential order
maxDepth := 0 maxDepth := 0
@ -48,39 +49,63 @@ func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, entrySeque
} }
} }
} }
meta.entryDepth[entrySequence] = maxDepth meta.entryDepth[seq] = maxDepth
} }
func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEntry) { func (meta *jmdictMetadata) AddEntry(entry jmdict.JmdictEntry) {
partsOfSpeech := []string{}
// Determine how many senses are in this entry for this language senseCount := 0
if _, ok := meta.seqToSenseCount[entry.Sequence]; !ok { for _, sense := range entry.Sense {
senseCount := 0 // Only English-language senses contain part-of-speech info,
for _, entrySense := range entry.Sense { // but other languages need them for deinflection rules.
for _, gloss := range entrySense.Glossary { for _, pos := range sense.PartsOfSpeech {
if glossContainsLanguage(gloss, meta.language) { if !slices.Contains(partsOfSpeech, pos) {
senseCount += 1 partsOfSpeech = append(partsOfSpeech, pos)
break
}
} }
} }
meta.seqToSenseCount[entry.Sequence] = senseCount
}
if meta.seqToSenseCount[entry.Sequence] == 0 { if glossaryContainsLanguage(sense.Glossary, meta.language) {
senseCount += 1
} else {
continue
}
for _, reference := range sense.References {
meta.references = append(meta.references, reference)
}
for _, antonym := range sense.Antonyms {
meta.references = append(meta.references, antonym)
}
currentSenseID := senseID{entry.Sequence, senseCount}
glosses := []string{}
for _, gloss := range sense.Glossary {
if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
glosses = append(glosses, gloss.Content)
}
}
meta.condensedGlosses[currentSenseID] = strings.Join(glosses, "; ")
}
meta.seqToPartsOfSpeech[entry.Sequence] = partsOfSpeech
meta.seqToSenseCount[entry.Sequence] = senseCount
}
func (meta *jmdictMetadata) AddHeadword(headword headword, seq sequence) {
if meta.seqToSenseCount[seq] == 0 {
return return
} }
// main headwords (first ones that are found in entries). // main headwords (first ones that are found in entries).
if _, ok := meta.seqToMainHeadword[entry.Sequence]; !ok { if _, ok := meta.seqToMainHeadword[seq]; !ok {
meta.seqToMainHeadword[entry.Sequence] = headword meta.seqToMainHeadword[seq] = headword
} }
// hash the term pair so we can determine if it's used // hash the term pair so we can determine if it's used
// in more than one JMdict entry later. // in more than one JMdict entry later.
headwordHash := headword.Hash() headwordHash := headword.Hash()
if !slices.Contains(meta.headwordHashToSeqs[headwordHash], entry.Sequence) { if !slices.Contains(meta.headwordHashToSeqs[headwordHash], seq) {
meta.headwordHashToSeqs[headwordHash] = append(meta.headwordHashToSeqs[headwordHash], entry.Sequence) meta.headwordHashToSeqs[headwordHash] =
append(meta.headwordHashToSeqs[headwordHash], seq)
} }
// hash the expression so that we can determine if we // hash the expression so that we can determine if we
@ -88,7 +113,8 @@ func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEn
// in reference notes later. // in reference notes later.
expHash := headword.ExpHash() expHash := headword.ExpHash()
if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) { if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) {
meta.expHashToReadings[expHash] = append(meta.expHashToReadings[expHash], headword.Reading) meta.expHashToReadings[expHash] =
append(meta.expHashToReadings[expHash], headword.Reading)
} }
// e.g. for JMdict (English) we expect to end up with // e.g. for JMdict (English) we expect to end up with
@ -100,48 +126,17 @@ func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEn
searchHash{headword.ReadingHash(), headword.IsPriority}, searchHash{headword.ReadingHash(), headword.IsPriority},
} }
for _, x := range searchHashes { for _, x := range searchHashes {
if !slices.Contains(meta.seqToSearchHashes[entry.Sequence], x) { if !slices.Contains(meta.seqToSearchHashes[seq], x) {
meta.seqToSearchHashes[entry.Sequence] = append(meta.seqToSearchHashes[entry.Sequence], x) meta.seqToSearchHashes[seq] = append(meta.seqToSearchHashes[seq], x)
} }
} }
currentSenseNumber := 1
for _, entrySense := range entry.Sense {
if !glossaryContainsLanguage(entrySense.Glossary, meta.language) {
continue
}
if entrySense.RestrictedReadings != nil && !slices.Contains(entrySense.RestrictedReadings, headword.Reading) {
currentSenseNumber += 1
continue
}
if entrySense.RestrictedKanji != nil && !slices.Contains(entrySense.RestrictedKanji, headword.Expression) {
currentSenseNumber += 1
continue
}
allReferences := append(entrySense.References, entrySense.Antonyms...)
for _, reference := range allReferences {
meta.references = append(meta.references, reference)
}
currentSense := senseID{entry.Sequence, currentSenseNumber}
if meta.condensedGlosses[currentSense] == "" {
glosses := []string{}
for _, gloss := range entrySense.Glossary {
if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
glosses = append(glosses, gloss.Content)
}
}
meta.condensedGlosses[currentSense] = strings.Join(glosses, "; ")
}
currentSenseNumber += 1
}
} }
func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata { func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata {
meta := jmdictMetadata{ meta := jmdictMetadata{
language: langNameToCode[languageName], language: langNameToCode[languageName],
seqToSenseCount: make(map[sequence]int), seqToSenseCount: make(map[sequence]int),
seqToPartsOfSpeech: make(map[sequence][]string),
condensedGlosses: make(map[senseID]string), condensedGlosses: make(map[senseID]string),
seqToMainHeadword: make(map[sequence]headword), seqToMainHeadword: make(map[sequence]headword),
expHashToReadings: make(map[hash][]string), expHashToReadings: make(map[hash][]string),
@ -157,10 +152,11 @@ func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMeta
} }
for _, entry := range dictionary.Entries { for _, entry := range dictionary.Entries {
meta.AddEntry(entry)
headwords := extractHeadwords(entry) headwords := extractHeadwords(entry)
formCount := 0 formCount := 0
for _, headword := range headwords { for _, headword := range headwords {
meta.AddHeadword(headword, entry) meta.AddHeadword(headword, entry.Sequence)
if !headword.IsSearchOnly { if !headword.IsSearchOnly {
formCount += 1 formCount += 1
} }