Compare commits
10 Commits
8281301869
...
00dc44386e
Author | SHA1 | Date | |
---|---|---|---|
00dc44386e | |||
|
f4da17e228 | ||
|
ecf22da5a3 | ||
|
a9d85dc720 | ||
|
70611a51c4 | ||
|
dffbec6337 | ||
|
5755b79341 | ||
|
7bff70b71c | ||
|
19d6d0bb43 | ||
|
3b420f8b6c |
@ -1,5 +1,8 @@
|
|||||||
# Yomichan Import
|
# Yomichan Import
|
||||||
|
|
||||||
|
*Note: this project is no longer maintained. Please see [this
|
||||||
|
post](https://foosoft.net/posts/sunsetting-the-yomichan-project/) for more information.*
|
||||||
|
|
||||||
Yomichan Import allows users of the [Yomichan](https://foosoft.net/projects/yomichan) extension to import custom
|
Yomichan Import allows users of the [Yomichan](https://foosoft.net/projects/yomichan) extension to import custom
|
||||||
dictionary files. It currently supports the following formats:
|
dictionary files. It currently supports the following formats:
|
||||||
|
|
||||||
|
18
common.go
18
common.go
@ -9,6 +9,8 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"golang.org/x/exp/slices"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@ -116,7 +118,7 @@ type dbKanjiList []dbKanji
|
|||||||
|
|
||||||
func (kanji *dbKanji) addTags(tags ...string) {
|
func (kanji *dbKanji) addTags(tags ...string) {
|
||||||
for _, tag := range tags {
|
for _, tag := range tags {
|
||||||
if !hasString(tag, kanji.Tags) {
|
if !slices.Contains(kanji.Tags, tag) {
|
||||||
kanji.Tags = append(kanji.Tags, tag)
|
kanji.Tags = append(kanji.Tags, tag)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -245,7 +247,7 @@ func writeDb(outputPath string, index dbIndex, recordData map[string]dbRecordLis
|
|||||||
|
|
||||||
func appendStringUnique(target []string, source ...string) []string {
|
func appendStringUnique(target []string, source ...string) []string {
|
||||||
for _, str := range source {
|
for _, str := range source {
|
||||||
if !hasString(str, target) {
|
if !slices.Contains(target, str) {
|
||||||
target = append(target, str)
|
target = append(target, str)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -253,16 +255,6 @@ func appendStringUnique(target []string, source ...string) []string {
|
|||||||
return target
|
return target
|
||||||
}
|
}
|
||||||
|
|
||||||
func hasString(needle string, haystack []string) bool {
|
|
||||||
for _, value := range haystack {
|
|
||||||
if needle == value {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
func intersection(s1, s2 []string) []string {
|
func intersection(s1, s2 []string) []string {
|
||||||
s := []string{}
|
s := []string{}
|
||||||
m := make(map[string]bool)
|
m := make(map[string]bool)
|
||||||
@ -337,7 +329,7 @@ func detectFormat(path string) (string, error) {
|
|||||||
|
|
||||||
func ExportDb(inputPath, outputPath, format, language, title string, stride int, pretty bool) error {
|
func ExportDb(inputPath, outputPath, format, language, title string, stride int, pretty bool) error {
|
||||||
handlers := map[string]func(string, string, string, string, int, bool) error{
|
handlers := map[string]func(string, string, string, string, int, bool) error{
|
||||||
"edict": jmdExportDb,
|
"edict": jmdictExportDb,
|
||||||
"forms": formsExportDb,
|
"forms": formsExportDb,
|
||||||
"enamdict": jmnedictExportDb,
|
"enamdict": jmnedictExportDb,
|
||||||
"epwing": epwingExportDb,
|
"epwing": epwingExportDb,
|
||||||
|
@ -8,14 +8,14 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func frequencyTermsExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
|
func frequencyTermsExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
|
||||||
return frequncyExportDb(inputPath, outputPath, language, title, stride, pretty, "term_meta")
|
return frequencyExportDb(inputPath, outputPath, language, title, stride, pretty, "term_meta")
|
||||||
}
|
}
|
||||||
|
|
||||||
func frequencyKanjiExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
|
func frequencyKanjiExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
|
||||||
return frequncyExportDb(inputPath, outputPath, language, title, stride, pretty, "kanji_meta")
|
return frequencyExportDb(inputPath, outputPath, language, title, stride, pretty, "kanji_meta")
|
||||||
}
|
}
|
||||||
|
|
||||||
func frequncyExportDb(inputPath, outputPath, language, title string, stride int, pretty bool, key string) error {
|
func frequencyExportDb(inputPath, outputPath, language, title string, stride int, pretty bool, key string) error {
|
||||||
reader, err := os.Open(inputPath)
|
reader, err := os.Open(inputPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
58
jmdict.go
58
jmdict.go
@ -63,23 +63,26 @@ func doDisplaySenseNumberTag(headword headword, entry jmdict.JmdictEntry, meta j
|
|||||||
}
|
}
|
||||||
|
|
||||||
func jmdictPublicationDate(dictionary jmdict.Jmdict) string {
|
func jmdictPublicationDate(dictionary jmdict.Jmdict) string {
|
||||||
|
unknownDate := "unknown"
|
||||||
|
idx := len(dictionary.Entries) - 1
|
||||||
if len(dictionary.Entries) == 0 {
|
if len(dictionary.Entries) == 0 {
|
||||||
return "unknown"
|
return unknownDate
|
||||||
}
|
} else if len(dictionary.Entries[idx].Sense) == 0 {
|
||||||
dateEntry := dictionary.Entries[len(dictionary.Entries)-1]
|
return unknownDate
|
||||||
if len(dateEntry.Sense) == 0 || len(dateEntry.Sense[0].Glossary) == 0 {
|
} else if len(dictionary.Entries[idx].Sense[0].Glossary) == 0 {
|
||||||
return "unknown"
|
return unknownDate
|
||||||
}
|
}
|
||||||
|
dateGloss := dictionary.Entries[idx].Sense[0].Glossary[0].Content
|
||||||
r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`)
|
r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`)
|
||||||
jmdictDate := r.FindString(dateEntry.Sense[0].Glossary[0].Content)
|
date := r.FindString(dateGloss)
|
||||||
if jmdictDate != "" {
|
if date != "" {
|
||||||
return jmdictDate
|
return date
|
||||||
} else {
|
} else {
|
||||||
return "unknown"
|
return unknownDate
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
|
func jmdictFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
|
||||||
// Don't add "forms" terms to non-English dictionaries.
|
// Don't add "forms" terms to non-English dictionaries.
|
||||||
// Information would be duplicated if users installed more
|
// Information would be duplicated if users installed more
|
||||||
// than one version.
|
// than one version.
|
||||||
@ -94,20 +97,21 @@ func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMet
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
term := baseFormsTerm(entry)
|
term := baseFormsTerm(entry, meta)
|
||||||
term.Expression = headword.Expression
|
term.Expression = headword.Expression
|
||||||
term.Reading = headword.Reading
|
term.Reading = headword.Reading
|
||||||
|
|
||||||
term.addTermTags(headword.TermTags...)
|
term.addTermTags(headword.TermTags...)
|
||||||
|
|
||||||
term.addDefinitionTags("forms")
|
term.addDefinitionTags("forms")
|
||||||
|
|
||||||
senseNumber := meta.seqToSenseCount[entry.Sequence] + 1
|
senseNumber := meta.seqToSenseCount[entry.Sequence] + 1
|
||||||
entryDepth := meta.entryDepth[entry.Sequence]
|
entryDepth := meta.entryDepth[entry.Sequence]
|
||||||
term.Score = calculateTermScore(senseNumber, entryDepth, headword)
|
term.Score = calculateTermScore(senseNumber, entryDepth, headword)
|
||||||
|
|
||||||
return term, true
|
return term, true
|
||||||
}
|
}
|
||||||
|
|
||||||
func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
|
func jmdictSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
|
||||||
// Don't add "search" terms to non-English dictionaries.
|
// Don't add "search" terms to non-English dictionaries.
|
||||||
// Information would be duplicated if users installed more
|
// Information would be duplicated if users installed more
|
||||||
// than one version.
|
// than one version.
|
||||||
@ -119,10 +123,11 @@ func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMe
|
|||||||
Expression: headword.Expression,
|
Expression: headword.Expression,
|
||||||
Sequence: -entry.Sequence,
|
Sequence: -entry.Sequence,
|
||||||
}
|
}
|
||||||
for _, sense := range entry.Sense {
|
|
||||||
rules := grammarRules(sense.PartsOfSpeech)
|
partsOfSpeech := meta.seqToPartsOfSpeech[entry.Sequence]
|
||||||
|
rules := grammarRules(partsOfSpeech)
|
||||||
term.addRules(rules...)
|
term.addRules(rules...)
|
||||||
}
|
|
||||||
term.addTermTags(headword.TermTags...)
|
term.addTermTags(headword.TermTags...)
|
||||||
term.Score = calculateTermScore(1, 0, headword)
|
term.Score = calculateTermScore(1, 0, headword)
|
||||||
|
|
||||||
@ -140,7 +145,7 @@ func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMe
|
|||||||
return term, true
|
return term, true
|
||||||
}
|
}
|
||||||
|
|
||||||
func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
|
func jmdictSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
|
||||||
if sense.RestrictedReadings != nil && !slices.Contains(sense.RestrictedReadings, headword.Reading) {
|
if sense.RestrictedReadings != nil && !slices.Contains(sense.RestrictedReadings, headword.Reading) {
|
||||||
return dbTerm{}, false
|
return dbTerm{}, false
|
||||||
}
|
}
|
||||||
@ -162,6 +167,13 @@ func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headwor
|
|||||||
senseNumberTag := strconv.Itoa(senseNumber)
|
senseNumberTag := strconv.Itoa(senseNumber)
|
||||||
term.addDefinitionTags(senseNumberTag)
|
term.addDefinitionTags(senseNumberTag)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if len(sense.PartsOfSpeech) == 0 && meta.language != "eng" {
|
||||||
|
// This is a hack to provide part-of-speech info to
|
||||||
|
// non-English versions of JMdict.
|
||||||
|
sense.PartsOfSpeech = meta.seqToPartsOfSpeech[entry.Sequence]
|
||||||
|
}
|
||||||
|
|
||||||
term.addDefinitionTags(sense.PartsOfSpeech...)
|
term.addDefinitionTags(sense.PartsOfSpeech...)
|
||||||
term.addDefinitionTags(sense.Fields...)
|
term.addDefinitionTags(sense.Fields...)
|
||||||
term.addDefinitionTags(sense.Misc...)
|
term.addDefinitionTags(sense.Misc...)
|
||||||
@ -176,12 +188,12 @@ func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headwor
|
|||||||
return term, true
|
return term, true
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) ([]dbTerm, bool) {
|
func jmdictTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) ([]dbTerm, bool) {
|
||||||
if meta.seqToSenseCount[entry.Sequence] == 0 {
|
if meta.seqToSenseCount[entry.Sequence] == 0 {
|
||||||
return nil, false
|
return nil, false
|
||||||
}
|
}
|
||||||
if headword.IsSearchOnly {
|
if headword.IsSearchOnly {
|
||||||
if searchTerm, ok := createSearchTerm(headword, entry, meta); ok {
|
if searchTerm, ok := jmdictSearchTerm(headword, entry, meta); ok {
|
||||||
return []dbTerm{searchTerm}, true
|
return []dbTerm{searchTerm}, true
|
||||||
} else {
|
} else {
|
||||||
return nil, false
|
return nil, false
|
||||||
@ -194,20 +206,20 @@ func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetada
|
|||||||
// Do not increment sense number
|
// Do not increment sense number
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if senseTerm, ok := createSenseTerm(sense, senseNumber, headword, entry, meta); ok {
|
if senseTerm, ok := jmdictSenseTerm(sense, senseNumber, headword, entry, meta); ok {
|
||||||
terms = append(terms, senseTerm)
|
terms = append(terms, senseTerm)
|
||||||
}
|
}
|
||||||
senseNumber += 1
|
senseNumber += 1
|
||||||
}
|
}
|
||||||
|
|
||||||
if formsTerm, ok := createFormsTerm(headword, entry, meta); ok {
|
if formsTerm, ok := jmdictFormsTerm(headword, entry, meta); ok {
|
||||||
terms = append(terms, formsTerm)
|
terms = append(terms, formsTerm)
|
||||||
}
|
}
|
||||||
|
|
||||||
return terms, true
|
return terms, true
|
||||||
}
|
}
|
||||||
|
|
||||||
func jmdExportDb(inputPath string, outputPath string, languageName string, title string, stride int, pretty bool) error {
|
func jmdictExportDb(inputPath string, outputPath string, languageName string, title string, stride int, pretty bool) error {
|
||||||
if _, ok := langNameToCode[languageName]; !ok {
|
if _, ok := langNameToCode[languageName]; !ok {
|
||||||
return errors.New("Unrecognized language parameter: " + languageName)
|
return errors.New("Unrecognized language parameter: " + languageName)
|
||||||
}
|
}
|
||||||
@ -229,7 +241,7 @@ func jmdExportDb(inputPath string, outputPath string, languageName string, title
|
|||||||
for _, entry := range dictionary.Entries {
|
for _, entry := range dictionary.Entries {
|
||||||
headwords := extractHeadwords(entry)
|
headwords := extractHeadwords(entry)
|
||||||
for _, headword := range headwords {
|
for _, headword := range headwords {
|
||||||
if newTerms, ok := extractTerms(headword, entry, meta); ok {
|
if newTerms, ok := jmdictTerms(headword, entry, meta); ok {
|
||||||
terms = append(terms, newTerms...)
|
terms = append(terms, newTerms...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -172,18 +172,20 @@ func formsGlossary(headwords []headword) []any {
|
|||||||
return glossary
|
return glossary
|
||||||
}
|
}
|
||||||
|
|
||||||
func baseFormsTerm(entry jmdict.JmdictEntry) dbTerm {
|
func baseFormsTerm(entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm {
|
||||||
term := dbTerm{Sequence: entry.Sequence}
|
term := dbTerm{Sequence: entry.Sequence}
|
||||||
headwords := extractHeadwords(entry)
|
headwords := extractHeadwords(entry)
|
||||||
|
|
||||||
if needsFormTable(headwords) {
|
if needsFormTable(headwords) {
|
||||||
term.Glossary = formsTableGlossary(headwords)
|
term.Glossary = formsTableGlossary(headwords)
|
||||||
} else {
|
} else {
|
||||||
term.Glossary = formsGlossary(headwords)
|
term.Glossary = formsGlossary(headwords)
|
||||||
}
|
}
|
||||||
for _, sense := range entry.Sense {
|
|
||||||
rules := grammarRules(sense.PartsOfSpeech)
|
partsOfSpeech := meta.seqToPartsOfSpeech[entry.Sequence]
|
||||||
|
rules := grammarRules(partsOfSpeech)
|
||||||
term.addRules(rules...)
|
term.addRules(rules...)
|
||||||
}
|
|
||||||
return term
|
return term
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -203,11 +205,11 @@ func formsExportDb(inputPath, outputPath, languageName, title string, stride int
|
|||||||
|
|
||||||
terms := dbTermList{}
|
terms := dbTermList{}
|
||||||
for _, entry := range dictionary.Entries {
|
for _, entry := range dictionary.Entries {
|
||||||
baseTerm := baseFormsTerm(entry)
|
baseTerm := baseFormsTerm(entry, meta)
|
||||||
headwords := extractHeadwords(entry)
|
headwords := extractHeadwords(entry)
|
||||||
for _, h := range headwords {
|
for _, h := range headwords {
|
||||||
if h.IsSearchOnly {
|
if h.IsSearchOnly {
|
||||||
if term, ok := createSearchTerm(h, entry, meta); ok {
|
if term, ok := jmdictSearchTerm(h, entry, meta); ok {
|
||||||
terms = append(terms, term)
|
terms = append(terms, term)
|
||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
|
@ -13,6 +13,7 @@ type jmdictMetadata struct {
|
|||||||
language string
|
language string
|
||||||
condensedGlosses map[senseID]string
|
condensedGlosses map[senseID]string
|
||||||
seqToSenseCount map[sequence]int
|
seqToSenseCount map[sequence]int
|
||||||
|
seqToPartsOfSpeech map[sequence][]string
|
||||||
seqToMainHeadword map[sequence]headword
|
seqToMainHeadword map[sequence]headword
|
||||||
expHashToReadings map[hash][]string
|
expHashToReadings map[hash][]string
|
||||||
headwordHashToSeqs map[hash][]sequence
|
headwordHashToSeqs map[hash][]sequence
|
||||||
@ -31,7 +32,7 @@ type senseID struct {
|
|||||||
number int
|
number int
|
||||||
}
|
}
|
||||||
|
|
||||||
func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, entrySequence sequence) {
|
func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, seq sequence) {
|
||||||
// This is to ensure that terms are grouped among their
|
// This is to ensure that terms are grouped among their
|
||||||
// entries of origin and displayed in correct sequential order
|
// entries of origin and displayed in correct sequential order
|
||||||
maxDepth := 0
|
maxDepth := 0
|
||||||
@ -48,39 +49,63 @@ func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, entrySeque
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
meta.entryDepth[entrySequence] = maxDepth
|
meta.entryDepth[seq] = maxDepth
|
||||||
}
|
}
|
||||||
|
|
||||||
func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEntry) {
|
func (meta *jmdictMetadata) AddEntry(entry jmdict.JmdictEntry) {
|
||||||
|
partsOfSpeech := []string{}
|
||||||
// Determine how many senses are in this entry for this language
|
|
||||||
if _, ok := meta.seqToSenseCount[entry.Sequence]; !ok {
|
|
||||||
senseCount := 0
|
senseCount := 0
|
||||||
for _, entrySense := range entry.Sense {
|
for _, sense := range entry.Sense {
|
||||||
for _, gloss := range entrySense.Glossary {
|
// Only English-language senses contain part-of-speech info,
|
||||||
if glossContainsLanguage(gloss, meta.language) {
|
// but other languages need them for deinflection rules.
|
||||||
|
for _, pos := range sense.PartsOfSpeech {
|
||||||
|
if !slices.Contains(partsOfSpeech, pos) {
|
||||||
|
partsOfSpeech = append(partsOfSpeech, pos)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if glossaryContainsLanguage(sense.Glossary, meta.language) {
|
||||||
senseCount += 1
|
senseCount += 1
|
||||||
break
|
} else {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, reference := range sense.References {
|
||||||
|
meta.references = append(meta.references, reference)
|
||||||
|
}
|
||||||
|
for _, antonym := range sense.Antonyms {
|
||||||
|
meta.references = append(meta.references, antonym)
|
||||||
|
}
|
||||||
|
|
||||||
|
currentSenseID := senseID{entry.Sequence, senseCount}
|
||||||
|
glosses := []string{}
|
||||||
|
for _, gloss := range sense.Glossary {
|
||||||
|
if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
|
||||||
|
glosses = append(glosses, gloss.Content)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
meta.condensedGlosses[currentSenseID] = strings.Join(glosses, "; ")
|
||||||
}
|
}
|
||||||
|
meta.seqToPartsOfSpeech[entry.Sequence] = partsOfSpeech
|
||||||
meta.seqToSenseCount[entry.Sequence] = senseCount
|
meta.seqToSenseCount[entry.Sequence] = senseCount
|
||||||
}
|
}
|
||||||
|
|
||||||
if meta.seqToSenseCount[entry.Sequence] == 0 {
|
func (meta *jmdictMetadata) AddHeadword(headword headword, seq sequence) {
|
||||||
|
if meta.seqToSenseCount[seq] == 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// main headwords (first ones that are found in entries).
|
// main headwords (first ones that are found in entries).
|
||||||
if _, ok := meta.seqToMainHeadword[entry.Sequence]; !ok {
|
if _, ok := meta.seqToMainHeadword[seq]; !ok {
|
||||||
meta.seqToMainHeadword[entry.Sequence] = headword
|
meta.seqToMainHeadword[seq] = headword
|
||||||
}
|
}
|
||||||
|
|
||||||
// hash the term pair so we can determine if it's used
|
// hash the term pair so we can determine if it's used
|
||||||
// in more than one JMdict entry later.
|
// in more than one JMdict entry later.
|
||||||
headwordHash := headword.Hash()
|
headwordHash := headword.Hash()
|
||||||
if !slices.Contains(meta.headwordHashToSeqs[headwordHash], entry.Sequence) {
|
if !slices.Contains(meta.headwordHashToSeqs[headwordHash], seq) {
|
||||||
meta.headwordHashToSeqs[headwordHash] = append(meta.headwordHashToSeqs[headwordHash], entry.Sequence)
|
meta.headwordHashToSeqs[headwordHash] =
|
||||||
|
append(meta.headwordHashToSeqs[headwordHash], seq)
|
||||||
}
|
}
|
||||||
|
|
||||||
// hash the expression so that we can determine if we
|
// hash the expression so that we can determine if we
|
||||||
@ -88,7 +113,8 @@ func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEn
|
|||||||
// in reference notes later.
|
// in reference notes later.
|
||||||
expHash := headword.ExpHash()
|
expHash := headword.ExpHash()
|
||||||
if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) {
|
if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) {
|
||||||
meta.expHashToReadings[expHash] = append(meta.expHashToReadings[expHash], headword.Reading)
|
meta.expHashToReadings[expHash] =
|
||||||
|
append(meta.expHashToReadings[expHash], headword.Reading)
|
||||||
}
|
}
|
||||||
|
|
||||||
// e.g. for JMdict (English) we expect to end up with
|
// e.g. for JMdict (English) we expect to end up with
|
||||||
@ -100,48 +126,17 @@ func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEn
|
|||||||
searchHash{headword.ReadingHash(), headword.IsPriority},
|
searchHash{headword.ReadingHash(), headword.IsPriority},
|
||||||
}
|
}
|
||||||
for _, x := range searchHashes {
|
for _, x := range searchHashes {
|
||||||
if !slices.Contains(meta.seqToSearchHashes[entry.Sequence], x) {
|
if !slices.Contains(meta.seqToSearchHashes[seq], x) {
|
||||||
meta.seqToSearchHashes[entry.Sequence] = append(meta.seqToSearchHashes[entry.Sequence], x)
|
meta.seqToSearchHashes[seq] = append(meta.seqToSearchHashes[seq], x)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
currentSenseNumber := 1
|
|
||||||
for _, entrySense := range entry.Sense {
|
|
||||||
if !glossaryContainsLanguage(entrySense.Glossary, meta.language) {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if entrySense.RestrictedReadings != nil && !slices.Contains(entrySense.RestrictedReadings, headword.Reading) {
|
|
||||||
currentSenseNumber += 1
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if entrySense.RestrictedKanji != nil && !slices.Contains(entrySense.RestrictedKanji, headword.Expression) {
|
|
||||||
currentSenseNumber += 1
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
allReferences := append(entrySense.References, entrySense.Antonyms...)
|
|
||||||
for _, reference := range allReferences {
|
|
||||||
meta.references = append(meta.references, reference)
|
|
||||||
}
|
|
||||||
|
|
||||||
currentSense := senseID{entry.Sequence, currentSenseNumber}
|
|
||||||
if meta.condensedGlosses[currentSense] == "" {
|
|
||||||
glosses := []string{}
|
|
||||||
for _, gloss := range entrySense.Glossary {
|
|
||||||
if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
|
|
||||||
glosses = append(glosses, gloss.Content)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
meta.condensedGlosses[currentSense] = strings.Join(glosses, "; ")
|
|
||||||
}
|
|
||||||
currentSenseNumber += 1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata {
|
func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata {
|
||||||
meta := jmdictMetadata{
|
meta := jmdictMetadata{
|
||||||
language: langNameToCode[languageName],
|
language: langNameToCode[languageName],
|
||||||
seqToSenseCount: make(map[sequence]int),
|
seqToSenseCount: make(map[sequence]int),
|
||||||
|
seqToPartsOfSpeech: make(map[sequence][]string),
|
||||||
condensedGlosses: make(map[senseID]string),
|
condensedGlosses: make(map[senseID]string),
|
||||||
seqToMainHeadword: make(map[sequence]headword),
|
seqToMainHeadword: make(map[sequence]headword),
|
||||||
expHashToReadings: make(map[hash][]string),
|
expHashToReadings: make(map[hash][]string),
|
||||||
@ -157,10 +152,11 @@ func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMeta
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, entry := range dictionary.Entries {
|
for _, entry := range dictionary.Entries {
|
||||||
|
meta.AddEntry(entry)
|
||||||
headwords := extractHeadwords(entry)
|
headwords := extractHeadwords(entry)
|
||||||
formCount := 0
|
formCount := 0
|
||||||
for _, headword := range headwords {
|
for _, headword := range headwords {
|
||||||
meta.AddHeadword(headword, entry)
|
meta.AddHeadword(headword, entry.Sequence)
|
||||||
if !headword.IsSearchOnly {
|
if !headword.IsSearchOnly {
|
||||||
formCount += 1
|
formCount += 1
|
||||||
}
|
}
|
||||||
|
@ -96,39 +96,39 @@ func knownEntityTags() []dbTag {
|
|||||||
// <misc> miscellaneous sense info
|
// <misc> miscellaneous sense info
|
||||||
dbTag{Name: "abbr", Order: 0, Score: 0, Category: ""}, // abbreviation
|
dbTag{Name: "abbr", Order: 0, Score: 0, Category: ""}, // abbreviation
|
||||||
dbTag{Name: "arch", Order: -4, Score: 0, Category: "archaism"}, // archaism
|
dbTag{Name: "arch", Order: -4, Score: 0, Category: "archaism"}, // archaism
|
||||||
dbTag{Name: "char", Order: 0, Score: 0, Category: ""}, // character
|
dbTag{Name: "char", Order: 4, Score: 0, Category: "name"}, // character
|
||||||
dbTag{Name: "chn", Order: 0, Score: 0, Category: ""}, // children's language
|
dbTag{Name: "chn", Order: 0, Score: 0, Category: ""}, // children's language
|
||||||
dbTag{Name: "col", Order: 0, Score: 0, Category: ""}, // colloquialism
|
dbTag{Name: "col", Order: 0, Score: 0, Category: ""}, // colloquialism
|
||||||
dbTag{Name: "company", Order: 4, Score: 0, Category: "name"}, // company name
|
dbTag{Name: "company", Order: 4, Score: 0, Category: "name"}, // company name
|
||||||
dbTag{Name: "creat", Order: 0, Score: 0, Category: ""}, // creature
|
dbTag{Name: "creat", Order: 4, Score: 0, Category: "name"}, // creature
|
||||||
dbTag{Name: "dated", Order: -4, Score: 0, Category: "archaism"}, // dated term
|
dbTag{Name: "dated", Order: -4, Score: 0, Category: "archaism"}, // dated term
|
||||||
dbTag{Name: "dei", Order: 0, Score: 0, Category: ""}, // deity
|
dbTag{Name: "dei", Order: 4, Score: 0, Category: "name"}, // deity
|
||||||
dbTag{Name: "derog", Order: 0, Score: 0, Category: ""}, // derogatory
|
dbTag{Name: "derog", Order: 0, Score: 0, Category: ""}, // derogatory
|
||||||
dbTag{Name: "doc", Order: 0, Score: 0, Category: ""}, // document
|
dbTag{Name: "doc", Order: 4, Score: 0, Category: "name"}, // document
|
||||||
dbTag{Name: "euph", Order: 0, Score: 0, Category: ""}, // euphemistic
|
dbTag{Name: "euph", Order: 0, Score: 0, Category: ""}, // euphemistic
|
||||||
dbTag{Name: "ev", Order: 0, Score: 0, Category: ""}, // event
|
dbTag{Name: "ev", Order: 4, Score: 0, Category: "name"}, // event
|
||||||
dbTag{Name: "fam", Order: 0, Score: 0, Category: ""}, // familiar language
|
dbTag{Name: "fam", Order: 0, Score: 0, Category: ""}, // familiar language
|
||||||
dbTag{Name: "fem", Order: 4, Score: 0, Category: "name"}, // female term, language, or name
|
dbTag{Name: "fem", Order: 4, Score: 0, Category: "name"}, // female term, language, or name
|
||||||
dbTag{Name: "fict", Order: 0, Score: 0, Category: ""}, // fiction
|
dbTag{Name: "fict", Order: 4, Score: 0, Category: "name"}, // fiction
|
||||||
dbTag{Name: "form", Order: 0, Score: 0, Category: ""}, // formal or literary term
|
dbTag{Name: "form", Order: 0, Score: 0, Category: ""}, // formal or literary term
|
||||||
dbTag{Name: "given", Order: 4, Score: 0, Category: "name"}, // given name or forename, gender not specified
|
dbTag{Name: "given", Order: 4, Score: 0, Category: "name"}, // given name or forename, gender not specified
|
||||||
dbTag{Name: "group", Order: 0, Score: 0, Category: ""}, // group
|
dbTag{Name: "group", Order: 4, Score: 0, Category: "name"}, // group
|
||||||
dbTag{Name: "hist", Order: 0, Score: 0, Category: ""}, // historical term
|
dbTag{Name: "hist", Order: 0, Score: 0, Category: ""}, // historical term
|
||||||
dbTag{Name: "hon", Order: 0, Score: 0, Category: ""}, // honorific or respectful (sonkeigo) language
|
dbTag{Name: "hon", Order: 0, Score: 0, Category: ""}, // honorific or respectful (sonkeigo) language
|
||||||
dbTag{Name: "hum", Order: 0, Score: 0, Category: ""}, // humble (kenjougo) language
|
dbTag{Name: "hum", Order: 0, Score: 0, Category: ""}, // humble (kenjougo) language
|
||||||
dbTag{Name: "id", Order: -5, Score: 0, Category: "expression"}, // idiomatic expression
|
dbTag{Name: "id", Order: -5, Score: 0, Category: "expression"}, // idiomatic expression
|
||||||
dbTag{Name: "joc", Order: 0, Score: 0, Category: ""}, // jocular, humorous term
|
dbTag{Name: "joc", Order: 0, Score: 0, Category: ""}, // jocular, humorous term
|
||||||
dbTag{Name: "leg", Order: 0, Score: 0, Category: ""}, // legend
|
dbTag{Name: "leg", Order: 4, Score: 0, Category: "name"}, // legend
|
||||||
dbTag{Name: "m-sl", Order: 0, Score: 0, Category: ""}, // manga slang
|
dbTag{Name: "m-sl", Order: 0, Score: 0, Category: ""}, // manga slang
|
||||||
dbTag{Name: "male", Order: 4, Score: 0, Category: "name"}, // male term, language, or name
|
dbTag{Name: "male", Order: 4, Score: 0, Category: "name"}, // male term, language, or name
|
||||||
dbTag{Name: "masc", Order: 4, Score: 0, Category: "name"}, // male term, language, or name
|
dbTag{Name: "masc", Order: 4, Score: 0, Category: "name"}, // male term, language, or name
|
||||||
dbTag{Name: "myth", Order: 0, Score: 0, Category: ""}, // mythology
|
dbTag{Name: "myth", Order: 4, Score: 0, Category: "name"}, // mythology
|
||||||
dbTag{Name: "net-sl", Order: 0, Score: 0, Category: ""}, // Internet slang
|
dbTag{Name: "net-sl", Order: 0, Score: 0, Category: ""}, // Internet slang
|
||||||
dbTag{Name: "obj", Order: 0, Score: 0, Category: ""}, // object
|
dbTag{Name: "obj", Order: 4, Score: 0, Category: "name"}, // object
|
||||||
dbTag{Name: "obs", Order: -4, Score: 0, Category: "archaism"}, // obsolete term
|
dbTag{Name: "obs", Order: -4, Score: 0, Category: "archaism"}, // obsolete term
|
||||||
dbTag{Name: "on-mim", Order: 0, Score: 0, Category: ""}, // onomatopoeic or mimetic word
|
dbTag{Name: "on-mim", Order: 0, Score: 0, Category: ""}, // onomatopoeic or mimetic word
|
||||||
dbTag{Name: "organization", Order: 4, Score: 0, Category: "name"}, // organization name
|
dbTag{Name: "organization", Order: 4, Score: 0, Category: "name"}, // organization name
|
||||||
dbTag{Name: "oth", Order: 0, Score: 0, Category: ""}, // other
|
dbTag{Name: "oth", Order: 4, Score: 0, Category: "name"}, // other
|
||||||
dbTag{Name: "person", Order: 4, Score: 0, Category: "name"}, // full name of a particular person
|
dbTag{Name: "person", Order: 4, Score: 0, Category: "name"}, // full name of a particular person
|
||||||
dbTag{Name: "place", Order: 4, Score: 0, Category: "name"}, // place name
|
dbTag{Name: "place", Order: 4, Score: 0, Category: "name"}, // place name
|
||||||
dbTag{Name: "poet", Order: 0, Score: 0, Category: ""}, // poetical term
|
dbTag{Name: "poet", Order: 0, Score: 0, Category: ""}, // poetical term
|
||||||
@ -137,10 +137,10 @@ func knownEntityTags() []dbTag {
|
|||||||
dbTag{Name: "proverb", Order: 0, Score: 0, Category: "expression"}, // proverb
|
dbTag{Name: "proverb", Order: 0, Score: 0, Category: "expression"}, // proverb
|
||||||
dbTag{Name: "quote", Order: 0, Score: 0, Category: "expression"}, // quotation
|
dbTag{Name: "quote", Order: 0, Score: 0, Category: "expression"}, // quotation
|
||||||
dbTag{Name: "rare", Order: -4, Score: 0, Category: "archaism"}, // rare
|
dbTag{Name: "rare", Order: -4, Score: 0, Category: "archaism"}, // rare
|
||||||
dbTag{Name: "relig", Order: 0, Score: 0, Category: ""}, // religion
|
dbTag{Name: "relig", Order: 4, Score: 0, Category: "name"}, // religion
|
||||||
dbTag{Name: "sens", Order: 0, Score: 0, Category: ""}, // sensitive
|
dbTag{Name: "sens", Order: 0, Score: 0, Category: ""}, // sensitive
|
||||||
dbTag{Name: "serv", Order: 0, Score: 0, Category: ""}, // service
|
dbTag{Name: "serv", Order: 4, Score: 0, Category: "name"}, // service
|
||||||
dbTag{Name: "ship", Order: 0, Score: 0, Category: ""}, // ship name
|
dbTag{Name: "ship", Order: 4, Score: 0, Category: "name"}, // ship name
|
||||||
dbTag{Name: "sl", Order: 0, Score: 0, Category: ""}, // slang
|
dbTag{Name: "sl", Order: 0, Score: 0, Category: ""}, // slang
|
||||||
dbTag{Name: "station", Order: 4, Score: 0, Category: "name"}, // railway station
|
dbTag{Name: "station", Order: 4, Score: 0, Category: "name"}, // railway station
|
||||||
dbTag{Name: "surname", Order: 4, Score: 0, Category: "name"}, // family or surname
|
dbTag{Name: "surname", Order: 4, Score: 0, Category: "name"}, // family or surname
|
||||||
|
21
jmnedict.go
21
jmnedict.go
@ -8,19 +8,22 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func jmnedictPublicationDate(dictionary jmdict.Jmnedict) string {
|
func jmnedictPublicationDate(dictionary jmdict.Jmnedict) string {
|
||||||
|
unknownDate := "unknown"
|
||||||
|
idx := len(dictionary.Entries) - 1
|
||||||
if len(dictionary.Entries) == 0 {
|
if len(dictionary.Entries) == 0 {
|
||||||
return "unknown"
|
return unknownDate
|
||||||
}
|
} else if len(dictionary.Entries[idx].Translations) == 0 {
|
||||||
dateEntry := dictionary.Entries[len(dictionary.Entries)-1]
|
return unknownDate
|
||||||
if len(dateEntry.Translations) == 0 || len(dateEntry.Translations[0].Translations) == 0 {
|
} else if len(dictionary.Entries[idx].Translations[0].Translations) == 0 {
|
||||||
return "unknown"
|
return unknownDate
|
||||||
}
|
}
|
||||||
|
dateGloss := dictionary.Entries[idx].Translations[0].Translations[0]
|
||||||
r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`)
|
r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`)
|
||||||
jmnedictDate := r.FindString(dateEntry.Translations[0].Translations[0])
|
date := r.FindString(dateGloss)
|
||||||
if jmnedictDate != "" {
|
if date != "" {
|
||||||
return jmnedictDate
|
return date
|
||||||
} else {
|
} else {
|
||||||
return "unknown"
|
return unknownDate
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -44,10 +44,7 @@ func replaceIterationMarks(text string) string {
|
|||||||
// Returns nil if no segmentation is possible.
|
// Returns nil if no segmentation is possible.
|
||||||
func makeKanaSegments(kana string) (segments []string) {
|
func makeKanaSegments(kana string) (segments []string) {
|
||||||
hiragana := replaceIterationMarks(katakanaToHiragana(kana))
|
hiragana := replaceIterationMarks(katakanaToHiragana(kana))
|
||||||
kanaRunes := []rune{}
|
kanaRunes := []rune(hiragana)
|
||||||
for _, kanaRune := range hiragana {
|
|
||||||
kanaRunes = append(kanaRunes, kanaRune)
|
|
||||||
}
|
|
||||||
kanaRuneCount := len(kanaRunes)
|
kanaRuneCount := len(kanaRunes)
|
||||||
for i := 0; i < kanaRuneCount; i++ {
|
for i := 0; i < kanaRuneCount; i++ {
|
||||||
for j := 0; j < kanaRuneCount-i; j++ {
|
for j := 0; j < kanaRuneCount-i; j++ {
|
||||||
|
Loading…
Reference in New Issue
Block a user