1
yomichan-import/jmdict_references.go
2023-01-28 18:17:06 -06:00

171 lines
5.7 KiB
Go

package yomichan
import (
"fmt"
"strconv"
"strings"
)
/*
* In the future, JMdict will be updated to include sequence numbers
* with each cross reference. At that time, most of the functions and
* types defined in this file will become unnecessary. See:
* https://www.edrdg.org/jmdict_edict_list/2022/msg00008.html
*/
// searchValue records one occurrence of a headword hash: the entry it
// was found in and where it sits in that entry's list of search
// hashes. Values are built by MakeHashToSearchValuesMap and ranked
// against each other by FindBestSequence.
type searchValue struct {
// sequence is the sequence number of the entry containing the hash.
sequence sequence
// index is the position of the hash within that entry's search-hash
// list. FindBestSequence prefers lower indices.
index int
// isPriority is copied from the originating searchHash and breaks
// ties between candidates that share the same index.
isPriority bool
}
// searchHash pairs a headword hash with a priority flag. Lists of
// searchHash are read per sequence number from
// jmdictMetadata.seqToSearchHashes when the reverse lookup table is
// built.
type searchHash struct {
// hash is the key under which this headword is indexed in
// hashToSearchValues.
hash hash
// isPriority presumably marks headwords that carry a JMdict priority
// tag ("news1", "ichi1", ...); it is populated outside this file, so
// confirm against the code that fills seqToSearchHashes.
isPriority bool
}
// parseReference splits a JMdict cross-reference string into the
// headword it names and the sense number it points at.
//
// Reference strings in JMDict currently consist of 3 parts at most,
// separated by ・ characters. The latter two parts are optional:
//
//	expression
//	expression・reading
//	expression・sense
//	expression・reading・sense
//
// When the sense number is not specified, it is implied to be the
// first sense. When no reading is specified, the expression is used
// as the reading as well.
//
// The returned bool is false when the reference has more than three
// parts, or when the third part is not an integer; a diagnostic is
// printed to standard output in both cases and the sense number is 0.
func parseReference(reference string) (headword, int, bool) {
	refParts := strings.Split(reference, "・")

	switch len(refParts) {
	case 1:
		// (Kanji) or (Reading)
		return headword{Expression: refParts[0], Reading: refParts[0]}, 1, true

	case 2:
		// [Kanji + (Reading or Sense)] or (Reading + Sense)
		//
		// Surrounding whitespace is trimmed before the integer check,
		// exactly as for the third part below; otherwise a padded
		// sense number would be mistaken for a reading.
		if val, err := strconv.Atoi(strings.TrimSpace(refParts[1])); err == nil {
			return headword{Expression: refParts[0], Reading: refParts[0]}, val, true
		}
		return headword{Expression: refParts[0], Reading: refParts[1]}, 1, true

	case 3:
		// Expression + Reading + Sense
		h := headword{Expression: refParts[0], Reading: refParts[1]}
		val, err := strconv.Atoi(strings.TrimSpace(refParts[2]))
		if err != nil {
			fmt.Println("Unexpected format (3rd part not integer) for x-ref \"" + reference + "\"")
			return h, 0, false
		}
		return h, val, true

	default:
		fmt.Println("Unexpected format for x-ref \"" + reference + "\"")
		return headword{}, 0, false
	}
}
// MakeReferenceToSeqMap rebuilds meta.referenceToSeq, mapping each
// distinct reference string in meta.references to the sequence number
// chosen by FindBestSequence.
//
// The hash lookup table is regenerated first because FindBestSequence
// reads it. References that cannot be resolved (FindBestSequence
// returns 0) are left out of the map and reported on standard output.
// Each distinct reference is resolved, and if necessary reported,
// only once, no matter how many times it occurs in meta.references.
func (meta *jmdictMetadata) MakeReferenceToSeqMap() {
	meta.referenceToSeq = make(map[string]sequence)
	meta.MakeHashToSearchValuesMap()

	// Tracks every reference already looked up, including failed ones,
	// which never make it into referenceToSeq.
	attempted := make(map[string]struct{})

	for _, reference := range meta.references {
		if _, done := attempted[reference]; done {
			continue
		}
		attempted[reference] = struct{}{}

		seq := meta.FindBestSequence(reference)
		if seq == 0 {
			fmt.Println("Unable to convert reference to sequence number: `" + reference + "`")
			continue
		}
		meta.referenceToSeq[reference] = seq
	}
}
// MakeHashToSearchValuesMap rebuilds meta.hashToSearchValues, the
// reverse index of meta.seqToSearchHashes. For every search hash of
// every sequence it records the owning sequence number, the hash's
// position within that sequence's list, and its priority flag.
//
// Any previous contents of hashToSearchValues are discarded.
func (meta *jmdictMetadata) MakeHashToSearchValuesMap() {
	byHash := make(map[hash][]searchValue)

	for seq, hashes := range meta.seqToSearchHashes {
		for pos, entry := range hashes {
			occurrence := searchValue{
				sequence:   seq,
				index:      pos,
				isPriority: entry.isPriority,
			}
			byHash[entry.hash] = append(byHash[entry.hash], occurrence)
		}
	}

	meta.hashToSearchValues = byHash
}
/*
 * FindBestSequence attempts to convert a JMdict reference string into
 * a single definite sequence number. These reference strings are often
 * ambiguous, so we have to resort to using heuristics.
 *
 * Generally, correspondence is determined by the order in which term
 * pairs are extracted from each JMdict entry. Take for example the
 * JMdict entry for ご本, which contains a reference to 本 (without a
 * reading specified). To correlate this reference with a sequence
 * number, our program searches each entry for the hash of【本・本】.
 * There are two entries in which it is found in JMdict (English):
 *
 * sequence 1260670: 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】
 * sequence 1522150: 【本・ほん】、【本・本】、【ほん・ほん】
 *
 * Because 【本・本】 is closer to the beginning of the array in the
 * latter (i.e., has the lowest index), sequence number 1522150 is
 * returned.
 *
 * In situations in which multiple sequences are found with the same
 * index, the entry with a priority tag ("news1", "ichi1", "spec1",
 * "spec2", "gai1") is given preference. This mostly affects
 * katakana-only loanwords like ラグ.
 *
 * To improve accuracy, this method also checks to see if the
 * reference's specified sense number really exists in the
 * corresponding entry. For example, sequence 1582850 【如何で・いかんで】
 * has a reference to sense #2 of いかん (no kanji specified), which
 * could belong to 13 different sequences. However, sequences 1582850
 * and 2829697 are the only 2 of those 13 which contain more than one
 * sense. Incidentally, sequence 1582850 is the correct match.
 *
 * All else being equal, the entry with the smallest sequence number
 * is chosen. References in the JMdict file are currently ambiguous,
 * and getting this perfect won't be possible until reference sequence
 * numbers are included in the file. See:
 * https://github.com/JMdictProject/JMdictIssues/issues/61
 *
 * The zero sequence is returned when the reference cannot be parsed
 * or when no entry with enough senses contains the headword. The
 * lookup reads meta.hashToSearchValues, so that table must have been
 * built (see MakeHashToSearchValuesMap) before this method is called.
 */
func (meta *jmdictMetadata) FindBestSequence(reference string) sequence {
	var best searchValue
	found := false

	word, senseNumber, ok := parseReference(reference)
	if !ok {
		return best.sequence
	}

	for _, v := range meta.hashToSearchValues[word.Hash()] {
		if meta.seqToSenseCount[v.sequence] < senseNumber {
			// entry must contain the specified sense
			continue
		}

		var better bool
		switch {
		case !found:
			// The first eligible candidate always wins; no sentinel
			// index is needed, so arbitrarily large indices still match.
			better = true
		case v.index != best.index:
			// lower indices are better
			better = v.index < best.index
		case v.isPriority != best.isPriority:
			// if indices match, check priority
			better = v.isPriority
		default:
			// if indices and priority match, check sequence number.
			// lower sequence numbers are better
			better = v.sequence < best.sequence
		}

		if better {
			best = v
			found = true
		}
	}

	return best.sequence
}