// yomichan-import/jmdict_references.go

package yomichan

import (
	"fmt"
	"strconv"
	"strings"
)

/*
 * In the future, JMdict will be updated to include sequence numbers
 * with each cross reference. At that time, most of the functions and
 * types defined in this file will become unnecessary. See:
 * https://www.edrdg.org/jmdict_edict_list/2022/msg00008.html
 */

// searchValue describes one occurrence of a headword hash inside a
// JMdict entry. MakeHashToSearchValuesMap builds these values:
// sequence is the entry the hash was found in, index is the position
// of the hash within that entry's list of search hashes, and
// isPriority is copied from the searchHash at that position.
// FindBestSequence compares candidates on index first, then
// isPriority, then sequence.
type searchValue struct {
	sequence   sequence
	index      int
	isPriority bool
}

// searchHash pairs a headword hash with a priority flag. The lists
// held in jmdictMetadata.seqToSearchHashes are made of these values;
// MakeHashToSearchValuesMap copies isPriority into each searchValue
// it creates, and FindBestSequence uses that flag to break ties
// between candidates found at the same index.
type searchHash struct {
	hash       hash
	isPriority bool
}

// parseReference splits a JMdict cross-reference string into the
// headword it names and the sense number it points at.
//
// Reference strings consist of at most three parts separated by ・
// characters; the latter two parts are optional:
//
//	Kanji or Reading               -> sense 1
//	Kanji・Reading                  -> sense 1
//	Kanji・Sense or Reading・Sense   -> given sense
//	Kanji・Reading・Sense            -> given sense
//
// When no reading is given, the single text part is used as both the
// Expression and the Reading of the returned headword. When no sense
// number is given, the first sense is implied.
//
// The boolean result is false when the reference has more than three
// parts, or when the third part is not an integer. In both cases a
// message is printed to standard output and the returned sense number
// is 0.
func parseReference(reference string) (headword, int, bool) {
	refParts := strings.Split(reference, "・")
	switch len(refParts) {
	case 1:
		// (Kanji) or (Reading)
		return headword{Expression: refParts[0], Reading: refParts[0]}, 1, true

	case 2:
		// [Kanji + (Reading or Sense)] or (Reading + Sense)
		//
		// Whitespace around the candidate sense number is ignored,
		// exactly as it is in the three-part form below, so that the
		// two forms treat the same sense text the same way.
		if val, err := strconv.Atoi(strings.TrimSpace(refParts[1])); err == nil {
			return headword{Expression: refParts[0], Reading: refParts[0]}, val, true
		}
		// Not a number, so the second part is a reading.
		return headword{Expression: refParts[0], Reading: refParts[1]}, 1, true

	case 3:
		// Expression + Reading + Sense
		h := headword{Expression: refParts[0], Reading: refParts[1]}
		val, err := strconv.Atoi(strings.TrimSpace(refParts[2]))
		if err != nil {
			fmt.Println("Unexpected format (3rd part not integer) for x-ref \"" + reference + "\"")
			return h, 0, false
		}
		return h, val, true

	default:
		fmt.Println("Unexpected format for x-ref \"" + reference + "\"")
		return headword{}, 0, false
	}
}

// MakeReferenceToSeqMap rebuilds meta.referenceToSeq, the table that
// maps each cross-reference string in meta.references to the sequence
// number chosen for it by FindBestSequence.
//
// The hash lookup table is rebuilt first (MakeHashToSearchValuesMap),
// since FindBestSequence reads it. References for which no sequence
// can be found are left out of the map and reported once on standard
// output.
func (meta *jmdictMetadata) MakeReferenceToSeqMap() {
	meta.referenceToSeq = make(map[string]sequence)
	meta.MakeHashToSearchValuesMap()

	// The same reference string may occur many times. Remember every
	// string that has been looked up, whether or not the lookup
	// succeeded, so that an unresolvable reference is neither searched
	// for nor reported more than once.
	attempted := make(map[string]struct{})

	for _, reference := range meta.references {
		if _, done := attempted[reference]; done {
			continue
		}
		attempted[reference] = struct{}{}

		seq := meta.FindBestSequence(reference)
		if seq == 0 {
			fmt.Println("Unable to convert reference to sequence number: `" + reference + "`")
			continue
		}
		meta.referenceToSeq[reference] = seq
	}
}

// MakeHashToSearchValuesMap rebuilds meta.hashToSearchValues, the
// inverse of meta.seqToSearchHashes: for every hash it lists each
// (sequence, position) at which that hash appears, together with the
// priority flag stored alongside the hash.
func (meta *jmdictMetadata) MakeHashToSearchValuesMap() {
	byHash := make(map[hash][]searchValue)
	for seq, entryHashes := range meta.seqToSearchHashes {
		for position := range entryHashes {
			found := entryHashes[position]
			occurrence := searchValue{
				sequence:   seq,
				index:      position,
				isPriority: found.isPriority,
			}
			byHash[found.hash] = append(byHash[found.hash], occurrence)
		}
	}
	meta.hashToSearchValues = byHash
}

/*
 * This function attempts to convert a JMdict reference string into a
 * single definite sequence number. These reference strings are often
 * ambiguous, so we have to resort to using heuristics.
 *
 * Generally, correspondence is determined by the order in which term
 * pairs are extracted from each JMdict entry. Take for example the
 * JMdict entry for ご本, which contains a reference to 本 (without a
 * reading specified). To correlate this reference with a sequence
 * number, our program searches each entry for the hash of【本・本】.
 * There are two entries in which it is found in JMdict (English):
 *
 * sequence 1260670: 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】
 * sequence 1522150: 【本・ほん】、【本・本】、【ほん・ほん】
 *
 * Because 【本・本】 is closer to the beginning of the array in the
 * latter (i.e., has the lowest index), sequence number 1522150 is
 * returned.
 *
 * In situations in which multiple sequences are found with the same
 * index, the entry with a priority tag ("news1", "ichi1", "spec1",
 * "spec2", "gai1") is given preference. This mostly affects
 * katakana-only loanwords like ラグ.
 *
 * To improve accuracy, this method also checks to see if the
 * reference's specified sense number really exists in the
 * corresponding entry. For example, sequence 1582850 【如何で・いかんで】
 * has a reference to sense #2 of いかん (no kanji specified), which
 * could belong to 13 different sequences. However, sequences 1582850
 * and 2829697 are the only 2 of those 13 which contain more than one
 * sense. Incidentally, sequence 1582850 is the correct match.
 *
 * All else being equal, the entry with the smallest sequence number
 * is chosen. References in the JMdict file are currently ambiguous,
 * and getting this perfect won't be possible until reference sequence
 * numbers are included in the file.  See:
 * https://github.com/JMdictProject/JMdictIssues/issues/61
 */
func (meta *jmdictMetadata) FindBestSequence(reference string) sequence {
	target, wantedSense, parsed := parseReference(reference)
	if !parsed {
		return 0
	}

	// State of the best candidate seen so far. The index starts at a
	// large sentinel so that the first acceptable candidate wins.
	best := 0
	bestIndex := 100000
	bestHasPriority := false

	for _, candidate := range meta.hashToSearchValues[target.Hash()] {
		// The entry must actually contain the referenced sense.
		if meta.seqToSenseCount[candidate.sequence] < wantedSense {
			continue
		}
		// A headword found later in its entry never beats one found
		// earlier.
		if candidate.index > bestIndex {
			continue
		}
		if candidate.index == bestIndex {
			// Same position: a priority entry is not displaced by a
			// non-priority one.
			if bestHasPriority && !candidate.isPriority {
				continue
			}
			// Same position and same priority: keep the lower
			// sequence number.
			if bestHasPriority == candidate.isPriority && candidate.sequence > best {
				continue
			}
		}
		best = candidate.sequence
		bestIndex = candidate.index
		bestHasPriority = candidate.isPriority
	}
	return best
}