2023-01-22 20:37:18 +00:00
|
|
|
package yomichan
|
|
|
|
|
|
|
|
import (
|
2023-01-29 20:06:50 +00:00
|
|
|
"errors"
|
2023-01-22 20:37:18 +00:00
|
|
|
"os"
|
|
|
|
"regexp"
|
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
|
|
|
|
"foosoft.net/projects/jmdict"
|
|
|
|
"golang.org/x/exp/slices"
|
|
|
|
)
|
|
|
|
|
|
|
|
// grammarRules maps JMdict part-of-speech tags to the rule identifiers
// attached to a term.
//
// The tags "adj-i", "vk" and "vz" are passed through unchanged. Any tag
// beginning with "v5" yields "v5", any tag beginning with "v1" yields "v1",
// and any tag beginning with "vs-" yields "vs". All other tags are ignored.
//
// One rule is emitted per matching tag, in input order; duplicates are not
// removed here. The result is always a non-nil slice.
func grammarRules(partsOfSpeech []string) []string {
	matched := []string{}
	for _, pos := range partsOfSpeech {
		switch {
		case pos == "adj-i" || pos == "vk" || pos == "vz":
			matched = append(matched, pos)
		case strings.HasPrefix(pos, "v5"):
			matched = append(matched, "v5")
		case strings.HasPrefix(pos, "v1"):
			matched = append(matched, "v1")
		case strings.HasPrefix(pos, "vs-"):
			matched = append(matched, "vs")
		}
	}
	return matched
}
|
|
|
|
|
2023-01-28 01:09:12 +00:00
|
|
|
func calculateTermScore(senseNumber int, depth int, headword headword) int {
|
2023-01-22 20:37:18 +00:00
|
|
|
const senseWeight int = 1
|
2023-01-28 01:09:12 +00:00
|
|
|
const depthWeight int = 100
|
|
|
|
const entryPositionWeight int = 10000
|
|
|
|
const priorityWeight int = 1000000
|
2023-01-22 20:37:18 +00:00
|
|
|
|
|
|
|
score := 0
|
|
|
|
score -= (senseNumber - 1) * senseWeight
|
2023-01-28 01:09:12 +00:00
|
|
|
score -= depth * depthWeight
|
2023-01-22 20:37:18 +00:00
|
|
|
score -= headword.Index * entryPositionWeight
|
|
|
|
score += headword.Score() * priorityWeight
|
|
|
|
|
|
|
|
return score
|
|
|
|
}
|
|
|
|
|
|
|
|
func doDisplaySenseNumberTag(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) bool {
|
|
|
|
// Display sense numbers if the entry has more than one sense
|
|
|
|
// or if the headword is found in multiple entries.
|
|
|
|
hash := headword.Hash()
|
2023-01-29 20:06:50 +00:00
|
|
|
if !meta.extraMode {
|
|
|
|
return false
|
|
|
|
} else if meta.language != "eng" {
|
|
|
|
return false
|
|
|
|
} else if meta.seqToSenseCount[entry.Sequence] > 1 {
|
2023-01-22 20:37:18 +00:00
|
|
|
return true
|
|
|
|
} else if len(meta.headwordHashToSeqs[hash]) > 1 {
|
|
|
|
return true
|
|
|
|
} else {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func jmdictPublicationDate(dictionary jmdict.Jmdict) string {
|
2023-02-04 07:42:08 +00:00
|
|
|
unknownDate := "unknown"
|
|
|
|
idx := len(dictionary.Entries) - 1
|
2023-01-30 19:26:26 +00:00
|
|
|
if len(dictionary.Entries) == 0 {
|
2023-02-04 07:42:08 +00:00
|
|
|
return unknownDate
|
|
|
|
} else if len(dictionary.Entries[idx].Sense) == 0 {
|
|
|
|
return unknownDate
|
|
|
|
} else if len(dictionary.Entries[idx].Sense[0].Glossary) == 0 {
|
|
|
|
return unknownDate
|
2023-01-30 19:26:26 +00:00
|
|
|
}
|
2023-02-04 07:42:08 +00:00
|
|
|
dateGloss := dictionary.Entries[idx].Sense[0].Glossary[0].Content
|
2023-01-22 20:37:18 +00:00
|
|
|
r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`)
|
2023-02-04 07:42:08 +00:00
|
|
|
date := r.FindString(dateGloss)
|
|
|
|
if date != "" {
|
|
|
|
return date
|
2023-01-30 19:26:26 +00:00
|
|
|
} else {
|
2023-02-04 07:42:08 +00:00
|
|
|
return unknownDate
|
2023-01-30 19:26:26 +00:00
|
|
|
}
|
2023-01-22 20:37:18 +00:00
|
|
|
}
|
|
|
|
|
2023-02-02 01:14:37 +00:00
|
|
|
func jmdictFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
|
2023-01-26 00:26:47 +00:00
|
|
|
// Don't add "forms" terms to non-English dictionaries.
|
|
|
|
// Information would be duplicated if users installed more
|
|
|
|
// than one version.
|
2023-01-29 20:06:50 +00:00
|
|
|
if meta.language != "eng" || !meta.extraMode {
|
2023-01-26 00:26:47 +00:00
|
|
|
return dbTerm{}, false
|
|
|
|
}
|
|
|
|
// Don't need a "forms" term for entries with one unique
|
|
|
|
// headword which does not appear in any other entries.
|
|
|
|
if !meta.hasMultipleForms[entry.Sequence] {
|
|
|
|
if len(meta.headwordHashToSeqs[headword.Hash()]) == 1 {
|
|
|
|
return dbTerm{}, false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-02-02 21:50:57 +00:00
|
|
|
term := baseFormsTerm(entry, meta)
|
2023-01-22 20:37:18 +00:00
|
|
|
term.Expression = headword.Expression
|
|
|
|
term.Reading = headword.Reading
|
|
|
|
|
|
|
|
term.addTermTags(headword.TermTags...)
|
|
|
|
term.addDefinitionTags("forms")
|
2023-02-02 21:50:57 +00:00
|
|
|
|
2023-01-22 20:37:18 +00:00
|
|
|
senseNumber := meta.seqToSenseCount[entry.Sequence] + 1
|
2023-01-28 01:09:12 +00:00
|
|
|
entryDepth := meta.entryDepth[entry.Sequence]
|
|
|
|
term.Score = calculateTermScore(senseNumber, entryDepth, headword)
|
2023-02-02 21:50:57 +00:00
|
|
|
|
2023-01-26 00:26:47 +00:00
|
|
|
return term, true
|
2023-01-22 20:37:18 +00:00
|
|
|
}
|
|
|
|
|
2023-02-02 01:14:37 +00:00
|
|
|
func jmdictSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
|
2023-01-26 00:26:47 +00:00
|
|
|
// Don't add "search" terms to non-English dictionaries.
|
|
|
|
// Information would be duplicated if users installed more
|
|
|
|
// than one version.
|
|
|
|
if meta.language != "eng" {
|
|
|
|
return dbTerm{}, false
|
|
|
|
}
|
|
|
|
|
2023-01-22 20:37:18 +00:00
|
|
|
term := dbTerm{
|
|
|
|
Expression: headword.Expression,
|
|
|
|
Sequence: -entry.Sequence,
|
|
|
|
}
|
2023-02-02 21:50:57 +00:00
|
|
|
|
|
|
|
partsOfSpeech := meta.seqToPartsOfSpeech[entry.Sequence]
|
|
|
|
rules := grammarRules(partsOfSpeech)
|
|
|
|
term.addRules(rules...)
|
|
|
|
|
2023-01-22 20:37:18 +00:00
|
|
|
term.addTermTags(headword.TermTags...)
|
2023-01-28 01:09:12 +00:00
|
|
|
term.Score = calculateTermScore(1, 0, headword)
|
2023-01-22 20:37:18 +00:00
|
|
|
|
|
|
|
redirectHeadword := meta.seqToMainHeadword[entry.Sequence]
|
|
|
|
expHash := redirectHeadword.ExpHash()
|
|
|
|
doDisplayReading := (len(meta.expHashToReadings[expHash]) > 1)
|
|
|
|
|
|
|
|
content := contentSpan(
|
|
|
|
contentAttr{fontSize: "130%"},
|
|
|
|
"⟶",
|
|
|
|
redirectHeadword.ToInternalLink(doDisplayReading),
|
|
|
|
)
|
|
|
|
|
|
|
|
term.Glossary = []any{contentStructure(content)}
|
2023-01-26 00:26:47 +00:00
|
|
|
return term, true
|
2023-01-22 20:37:18 +00:00
|
|
|
}
|
|
|
|
|
2023-02-02 01:14:37 +00:00
|
|
|
func jmdictSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
|
2023-01-26 00:26:47 +00:00
|
|
|
if sense.RestrictedReadings != nil && !slices.Contains(sense.RestrictedReadings, headword.Reading) {
|
|
|
|
return dbTerm{}, false
|
|
|
|
}
|
|
|
|
if sense.RestrictedKanji != nil && !slices.Contains(sense.RestrictedKanji, headword.Expression) {
|
|
|
|
return dbTerm{}, false
|
|
|
|
}
|
|
|
|
|
2023-01-22 20:37:18 +00:00
|
|
|
term := dbTerm{
|
|
|
|
Expression: headword.Expression,
|
|
|
|
Reading: headword.Reading,
|
|
|
|
Sequence: entry.Sequence,
|
|
|
|
}
|
|
|
|
|
|
|
|
term.Glossary = createGlossary(sense, meta)
|
|
|
|
|
|
|
|
term.addTermTags(headword.TermTags...)
|
|
|
|
|
|
|
|
if doDisplaySenseNumberTag(headword, entry, meta) {
|
|
|
|
senseNumberTag := strconv.Itoa(senseNumber)
|
|
|
|
term.addDefinitionTags(senseNumberTag)
|
|
|
|
}
|
2023-02-02 16:44:16 +00:00
|
|
|
|
|
|
|
if len(sense.PartsOfSpeech) == 0 && meta.language != "eng" {
|
|
|
|
// This is a hack to provide part-of-speech info to
|
|
|
|
// non-English versions of JMdict.
|
|
|
|
sense.PartsOfSpeech = meta.seqToPartsOfSpeech[entry.Sequence]
|
|
|
|
}
|
|
|
|
|
2023-01-22 20:37:18 +00:00
|
|
|
term.addDefinitionTags(sense.PartsOfSpeech...)
|
|
|
|
term.addDefinitionTags(sense.Fields...)
|
|
|
|
term.addDefinitionTags(sense.Misc...)
|
|
|
|
term.addDefinitionTags(sense.Dialects...)
|
|
|
|
|
|
|
|
rules := grammarRules(sense.PartsOfSpeech)
|
|
|
|
term.addRules(rules...)
|
|
|
|
|
2023-01-28 01:09:12 +00:00
|
|
|
entryDepth := meta.entryDepth[entry.Sequence]
|
|
|
|
term.Score = calculateTermScore(senseNumber, entryDepth, headword)
|
2023-01-22 20:37:18 +00:00
|
|
|
|
2023-01-26 00:26:47 +00:00
|
|
|
return term, true
|
2023-01-22 20:37:18 +00:00
|
|
|
}
|
|
|
|
|
2023-02-02 01:14:37 +00:00
|
|
|
func jmdictTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) ([]dbTerm, bool) {
|
2023-01-22 20:37:18 +00:00
|
|
|
if meta.seqToSenseCount[entry.Sequence] == 0 {
|
|
|
|
return nil, false
|
|
|
|
}
|
|
|
|
if headword.IsSearchOnly {
|
2023-02-02 01:14:37 +00:00
|
|
|
if searchTerm, ok := jmdictSearchTerm(headword, entry, meta); ok {
|
2023-01-22 23:55:27 +00:00
|
|
|
return []dbTerm{searchTerm}, true
|
|
|
|
} else {
|
|
|
|
return nil, false
|
|
|
|
}
|
2023-01-22 20:37:18 +00:00
|
|
|
}
|
|
|
|
terms := []dbTerm{}
|
|
|
|
senseNumber := 1
|
|
|
|
for _, sense := range entry.Sense {
|
|
|
|
if !glossaryContainsLanguage(sense.Glossary, meta.language) {
|
2023-01-26 00:26:47 +00:00
|
|
|
// Do not increment sense number
|
2023-01-22 20:37:18 +00:00
|
|
|
continue
|
|
|
|
}
|
2023-02-02 01:14:37 +00:00
|
|
|
if senseTerm, ok := jmdictSenseTerm(sense, senseNumber, headword, entry, meta); ok {
|
2023-01-26 00:26:47 +00:00
|
|
|
terms = append(terms, senseTerm)
|
2023-01-22 20:37:18 +00:00
|
|
|
}
|
|
|
|
senseNumber += 1
|
|
|
|
}
|
|
|
|
|
2023-02-02 01:14:37 +00:00
|
|
|
if formsTerm, ok := jmdictFormsTerm(headword, entry, meta); ok {
|
2023-01-22 20:37:18 +00:00
|
|
|
terms = append(terms, formsTerm)
|
|
|
|
}
|
2023-01-26 00:26:47 +00:00
|
|
|
|
2023-01-22 20:37:18 +00:00
|
|
|
return terms, true
|
|
|
|
}
|
|
|
|
|
2023-02-02 01:14:37 +00:00
|
|
|
func jmdictExportDb(inputPath string, outputPath string, languageName string, title string, stride int, pretty bool) error {
|
2023-01-29 20:06:50 +00:00
|
|
|
if _, ok := langNameToCode[languageName]; !ok {
|
|
|
|
return errors.New("Unrecognized language parameter: " + languageName)
|
|
|
|
}
|
|
|
|
|
2023-01-22 20:37:18 +00:00
|
|
|
reader, err := os.Open(inputPath)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer reader.Close()
|
|
|
|
|
|
|
|
dictionary, entities, err := jmdict.LoadJmdictNoTransform(reader)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
meta := newJmdictMetadata(dictionary, languageName)
|
|
|
|
|
|
|
|
terms := dbTermList{}
|
|
|
|
for _, entry := range dictionary.Entries {
|
|
|
|
headwords := extractHeadwords(entry)
|
|
|
|
for _, headword := range headwords {
|
2023-02-02 01:14:37 +00:00
|
|
|
if newTerms, ok := jmdictTerms(headword, entry, meta); ok {
|
2023-01-22 20:37:18 +00:00
|
|
|
terms = append(terms, newTerms...)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
tags := dbTagList{}
|
|
|
|
tags = append(tags, entityTags(entities)...)
|
|
|
|
tags = append(tags, senseNumberTags(meta.maxSenseCount)...)
|
|
|
|
tags = append(tags, newsFrequencyTags()...)
|
|
|
|
tags = append(tags, customDbTags()...)
|
|
|
|
|
|
|
|
recordData := map[string]dbRecordList{
|
|
|
|
"term": terms.crush(),
|
|
|
|
"tag": tags.crush(),
|
|
|
|
}
|
|
|
|
|
|
|
|
if title == "" {
|
|
|
|
title = "JMdict"
|
|
|
|
}
|
|
|
|
jmdictDate := jmdictPublicationDate(dictionary)
|
|
|
|
|
|
|
|
index := dbIndex{
|
|
|
|
Title: title,
|
|
|
|
Revision: "JMdict." + jmdictDate,
|
|
|
|
Sequenced: true,
|
|
|
|
Attribution: edrdgAttribution,
|
|
|
|
}
|
|
|
|
|
|
|
|
return writeDb(
|
|
|
|
outputPath,
|
|
|
|
index,
|
|
|
|
recordData,
|
|
|
|
stride,
|
|
|
|
pretty,
|
|
|
|
)
|
|
|
|
}
|