yomichan-import/jmdict_references.go

package yomichan

import (
	"fmt"
	"strconv"
	"strings"
)

/*
 * In the future, JMdict will be updated to include sequence numbers
 * with each cross reference. At that time, most of the functions and
 * types defined in this file will become unnecessary.  See:
 * https://www.edrdg.org/jmdict_edict_list/2022/msg00008.html
 */

// searchValue records one occurrence of a search hash inside a JMdict
// entry. Values are grouped by hash in jmdictMetadata.hashToSearchValues
// and compared against each other by FindBestSequence.
type searchValue struct {
	// sequence is the sequence number of the entry containing the hash.
	sequence   sequence
	// index is the position of the hash within that entry's list of
	// search hashes; FindBestSequence prefers lower indices.
	index      int
	// isPriority is copied from the searchHash this value was built
	// from and is used as a tie-breaker when indices are equal.
	isPriority bool
}

// searchHash is one element of the per-entry lists stored in
// jmdictMetadata.seqToSearchHashes.
type searchHash struct {
	// hash is the key under which the entry can be looked up; it is
	// compared against headword.Hash() when resolving a reference.
	hash       hash
	// isPriority is carried over into the derived searchValue.
	// NOTE(review): presumably set when the originating headword has a
	// priority tag -- confirm where seqToSearchHashes is populated.
	isPriority bool
}

// parseReference splits a JMdict cross-reference string into the
// headword it names and the sense number it points at.
//
// Reference strings in JMdict currently consist of 3 parts at most,
// separated by ・ characters. The latter two parts are optional:
//
//	Kanji (or Reading)
//	Kanji・Reading, Kanji・Sense or Reading・Sense
//	Kanji・Reading・Sense
//
// When no reading is given, the first part is used for both the
// Expression and the Reading of the returned headword. When no sense
// number is given, it is implied to be the first sense. Whitespace
// around the sense number is ignored in both the two-part and the
// three-part form.
//
// The returned bool is false (and a message is printed) when the
// reference has more than three parts, or when the third part of a
// three-part reference is not an integer. In that case the sense
// number is 0 and the result should not be used.
func parseReference(reference string) (headword, int, bool) {
	parts := strings.Split(reference, "・")
	switch len(parts) {
	case 1:
		// (Kanji) or (Reading)
		return headword{Expression: parts[0], Reading: parts[0]}, 1, true
	case 2:
		// [Kanji + (Reading or Sense)] or (Reading + Sense).
		// A numeric second part is a sense number; anything else is
		// taken to be a reading.
		if sense, err := strconv.Atoi(strings.TrimSpace(parts[1])); err == nil {
			return headword{Expression: parts[0], Reading: parts[0]}, sense, true
		}
		return headword{Expression: parts[0], Reading: parts[1]}, 1, true
	case 3:
		// Expression + Reading + Sense
		h := headword{Expression: parts[0], Reading: parts[1]}
		sense, err := strconv.Atoi(strings.TrimSpace(parts[2]))
		if err != nil {
			fmt.Println("Unexpected format (3rd part not integer) for x-ref \"" + reference + "\"")
			return h, 0, false
		}
		return h, sense, true
	default:
		fmt.Println("Unexpected format for x-ref \"" + reference + "\"")
		return headword{}, 0, false
	}
}

// MakeReferenceToSeqMap rebuilds meta.referenceToSeq, mapping each
// reference string in meta.references to the sequence number chosen
// by FindBestSequence. The hash lookup table is rebuilt first because
// FindBestSequence reads it. References that cannot be resolved are
// reported on standard output and left out of the map.
func (meta *jmdictMetadata) MakeReferenceToSeqMap() {
	meta.referenceToSeq = make(map[string]sequence)
	meta.MakeHashToSearchValuesMap()

	for _, ref := range meta.references {
		// Only non-zero sequences are ever stored, so presence in the
		// map means this reference has already been resolved.
		if _, resolved := meta.referenceToSeq[ref]; resolved {
			continue
		}
		if found := meta.FindBestSequence(ref); found == 0 {
			fmt.Println("Unable to convert reference to sequence number: `" + ref + "`")
		} else {
			meta.referenceToSeq[ref] = found
		}
	}
}

// MakeHashToSearchValuesMap rebuilds meta.hashToSearchValues, the
// inverse of meta.seqToSearchHashes: for every hash it lists each
// entry (sequence) containing that hash, together with the hash's
// position in the entry's list and its priority flag.
func (meta *jmdictMetadata) MakeHashToSearchValuesMap() {
	byHash := make(map[hash][]searchValue)
	for seq, hashes := range meta.seqToSearchHashes {
		for pos, sh := range hashes {
			occurrence := searchValue{
				sequence:   seq,
				index:      pos,
				isPriority: sh.isPriority,
			}
			byHash[sh.hash] = append(byHash[sh.hash], occurrence)
		}
	}
	meta.hashToSearchValues = byHash
}

/*
 * This function attempts to convert a JMdict reference string into a
 * single definite sequence number. These reference strings are often
 * ambiguous, so we have to resort to using heuristics.
 *
 * Generally, correspondence is determined by the order in which term
 * pairs are extracted from each JMdict entry. Take for example the
 * JMdict entry for ご本, which contains a reference to 本 (without a
 * reading specified). To correlate this reference with a sequence
 * number, our program searches each entry for the hash of【本・本】.
 * There are two entries in which it is found in JMdict (English):
 *
 * sequence 1260670: 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】
 * sequence 1522150: 【本・ほん】、【本・本】、【ほん・ほん】
 *
 * Because 【本・本】 is closer to the beginning of the array in the
 * latter (i.e., has the lowest index), sequence number 1522150 is
 * returned.
 *
 * In situations in which multiple sequences are found with the same
 * index, the entry with a priority tag ("news1", "ichi1", "spec1",
 * "spec2", "gai1") is given preference. This mostly affects
 * katakana-only loanwords like ラグ.
 *
 * To improve accuracy, this method also checks to see if the
 * reference's specified sense number really exists in the
 * corresponding entry. For example, sequence 1582850 【如何で・いかんで】
 * has a reference to sense #2 of いかん (no kanji specified), which
 * could belong to 13 different sequences. However, sequences 1582850
 * and 2829697 are the only 2 of those 13 which contain more than one
 * sense. Incidentally, sequence 1582850 is the correct match.
 *
 * All else being equal, the entry with the smallest sequence number
 * is chosen. References in the JMdict file are currently ambiguous,
 * and getting this perfect won't be possible until reference sequence
 * numbers are included in the file.  See:
 * https://github.com/JMdictProject/JMdictIssues/issues/61
 */
func (meta *jmdictMetadata) FindBestSequence(reference string) sequence {
	target, wantedSense, parsed := parseReference(reference)
	if !parsed {
		// Unparseable reference: 0 signals "no sequence found".
		return 0
	}

	// Running best candidate. chosenIndex starts at a sentinel larger
	// than any index expected in practice, so the first acceptable
	// candidate always wins.
	chosenSeq := 0
	chosenIndex := 100000
	chosenIsPriority := false

	for _, cand := range meta.hashToSearchValues[target.Hash()] {
		switch {
		case meta.seqToSenseCount[cand.sequence] < wantedSense:
			// entry must contain the specified sense
			continue
		case cand.index > chosenIndex:
			// lower indices are better
			continue
		case cand.index == chosenIndex && chosenIsPriority && !cand.isPriority:
			// on equal indices, a priority entry beats a non-priority one
			continue
		case cand.index == chosenIndex && chosenIsPriority == cand.isPriority && cand.sequence > chosenSeq:
			// on equal indices and priority, lower sequence numbers
			// are better
			continue
		}
		chosenSeq, chosenIndex, chosenIsPriority = cand.sequence, cand.index, cand.isPriority
	}
	return chosenSeq
}
Add new JMdict version 2023-01-22 20:37:18 +00:00			`package yomichan`

			`import (`
			`"fmt"`
			`"strconv"`
			`"strings"`
			`)`

			`/*`
			`* In the future, JMdict will be updated to include sequence numbers`
			`* with each cross reference. At that time, most of the functions and`
			`* types defined in this file will become unnecessary. see:`
			`* https://www.edrdg.org/jmdict_edict_list/2022/msg00008.html`
			`*/`

			`type searchValue struct {`
			`sequence sequence`
			`index int`
			`isPriority bool`
			`}`

			`type searchHash struct {`
			`hash hash`
			`isPriority bool`
			`}`

			`func parseReference(reference string) (headword, int, bool) {`
			`// Reference strings in JMDict currently consist of 3 parts at`
			`// most, separated by ・ characters. The latter two parts are`
			`// optional. When the sense number is not specified, it is`
			`// implied to be the first sense.`
			`var h headword`
			`var senseNumber int`
			`ok := true`
			`refParts := strings.Split(reference, "・")`
			`if len(refParts) == 1 {`
			`// (Kanji) or (Reading)`
			`h = headword{Expression: refParts[0], Reading: refParts[0]}`
			`senseNumber = 1`
			`} else if len(refParts) == 2 {`
			`// [Kanji + (Reading or Sense)] or (Reading + Sense)`
			`val, err := strconv.Atoi(refParts[1])`
			`if err == nil {`
			`h = headword{Expression: refParts[0], Reading: refParts[0]}`
			`senseNumber = val`
			`} else {`
			`h = headword{Expression: refParts[0], Reading: refParts[1]}`
			`senseNumber = 1`
			`}`
			`} else if len(refParts) == 3 {`
			`// Expression + Reading + Sense`
			`h = headword{Expression: refParts[0], Reading: refParts[1]}`
			`val, err := strconv.Atoi(strings.TrimSpace(refParts[2]))`
			`if err == nil {`
			`senseNumber = val`
			`} else {`
			`errortext := "Unexpected format (3rd part not integer) for x-ref \"" + reference + "\""`
			`fmt.Println(errortext)`
			`ok = false`
			`}`
			`} else {`
			`errortext := "Unexpected format for x-ref \"" + reference + "\""`
			`fmt.Println(errortext)`
			`ok = false`
			`}`
			`return h, senseNumber, ok`
			`}`

			`func (meta *jmdictMetadata) MakeReferenceToSeqMap() {`

			`meta.referenceToSeq = make(map[string]sequence)`
			`meta.MakeHashToSearchValuesMap()`

			`for _, reference := range meta.references {`
			`if meta.referenceToSeq[reference] != 0 {`
			`continue`
			`}`
			`seq := meta.FindBestSequence(reference)`
			`if seq != 0 {`
			`meta.referenceToSeq[reference] = seq`
			`} else {`
			fmt.Println("Unable to convert reference to sequence number: `" + reference + "`")
			`}`
			`}`
			`}`

			`func (meta *jmdictMetadata) MakeHashToSearchValuesMap() {`
			`meta.hashToSearchValues = make(map[hash][]searchValue)`
			`for seq, searchHashes := range meta.seqToSearchHashes {`
Rename variables for consistency 2023-01-23 20:09:50 +00:00			`for idx, searchHash := range searchHashes {`
Add new JMdict version 2023-01-22 20:37:18 +00:00			`searchValue := searchValue{`
			`sequence: seq,`
Rename variables for consistency 2023-01-23 20:09:50 +00:00			`index: idx,`
Add new JMdict version 2023-01-22 20:37:18 +00:00			`isPriority: searchHash.isPriority,`
			`}`
			`meta.hashToSearchValues[searchHash.hash] =`
			`append(meta.hashToSearchValues[searchHash.hash], searchValue)`
			`}`
			`}`
			`}`

			`/*`
Rename variables for consistency 2023-01-23 20:09:50 +00:00			`* This function attemps to convert a JMdict reference string into a`
			`* single definite sequence number. These reference strings are often`
			`* ambiguous, so we have to resort to using heuristics.`
			`*`
Add new JMdict version 2023-01-22 20:37:18 +00:00			`* Generally, correspondence is determined by the order in which term`
			`* pairs are extracted from each JMdict entry. Take for example the`
			`* JMdict entry for ご本, which contains a reference to 本 (without a`
			`* reading specified). To correlate this reference with a sequence`
			`* number, our program searches each entry for the hash of【本・本】.`
			`* There are two entries in which it is found in JMdict (English):`
			`*`
			`* sequence 1260670: 【元・もと】、【元・元】、【もと・もと】、【本・もと】、【本・本】、【素・もと】、【素・素】、【基・もと】、【基・基】`
			`* sequence 1522150: 【本・ほん】、【本・本】、【ほん・ほん】`
			`*`
			`* Because 【本・本】 is closer to the beginning of the array in the`
			`* latter (i.e., has the lowest index), sequence number 1522150 is`
			`* returned.`
			`*`
			`* In situations in which multiple sequences are found with the same`
Rename variables for consistency 2023-01-23 20:09:50 +00:00			`* index, the entry with a priority tag ("news1", "ichi1", "spec1",`
Add new JMdict version 2023-01-22 20:37:18 +00:00			`* "spec2", "gai1") is given preference. This mostly affects`
			`* katakana-only loanwords like ラグ.`
			`*`
			`* To improve accuracy, this method also checks to see if the`
			`* reference's specified sense number really exists in the`
			`* corresponding entry. For example, sequence 1582850 【如何で・いかんで】`
			`* has a reference to sense #2 of いかん (no kanji specified), which`
			`* could belong to 13 different sequences. However, sequences 1582850`
			`* and 2829697 are the only 2 of those 13 which contain more than one`
			`* sense. Incidentally, sequence 1582850 is the correct match.`
			`*`
			`* All else being equal, the entry with the smallest sequence number`
			`* is chosen. References in the JMdict file are currently ambiguous,`
Rename variables for consistency 2023-01-23 20:09:50 +00:00			`* and getting this perfect won't be possible until reference sequence`
			`* numbers are included in the file. See:`
Add new JMdict version 2023-01-22 20:37:18 +00:00			`* https://github.com/JMdictProject/JMdictIssues/issues/61`
			`*/`
			`func (meta *jmdictMetadata) FindBestSequence(reference string) sequence {`
			`bestSeq := 0`
			`lowestIndex := 100000`
			`bestIsPriority := false`
			`headword, senseNumber, ok := parseReference(reference)`
			`if !ok {`
			`return bestSeq`
			`}`
			`hash := headword.Hash()`
Rename variables for consistency 2023-01-23 20:09:50 +00:00			`for _, v := range meta.hashToSearchValues[hash] {`
			`if meta.seqToSenseCount[v.sequence] < senseNumber {`
Add new JMdict version 2023-01-22 20:37:18 +00:00			`// entry must contain the specified sense`
			`continue`
Rename variables for consistency 2023-01-23 20:09:50 +00:00			`} else if lowestIndex < v.index {`
Add new JMdict version 2023-01-22 20:37:18 +00:00			`// lower indices are better`
			`continue`
Rename variables for consistency 2023-01-23 20:09:50 +00:00			`} else if (lowestIndex == v.index) && (bestIsPriority && !v.isPriority) {`
			`// if indices match, check priority`
Add new JMdict version 2023-01-22 20:37:18 +00:00			`continue`
Rename variables for consistency 2023-01-23 20:09:50 +00:00			`} else if (lowestIndex == v.index) && (bestIsPriority == v.isPriority) && (bestSeq < v.sequence) {`
			`// if indices and priority match, check sequence number.`
Add new JMdict version 2023-01-22 20:37:18 +00:00			`// lower sequence numbers are better`
			`continue`
			`} else {`
Rename variables for consistency 2023-01-23 20:09:50 +00:00			`lowestIndex = v.index`
			`bestSeq = v.sequence`
			`bestIsPriority = v.isPriority`
Add new JMdict version 2023-01-22 20:37:18 +00:00			`}`
			`}`
			`return bestSeq`
			`}`