1
yomichan-import/jmnedict.go

136 lines
3.3 KiB
Go
Raw Normal View History

2023-02-02 00:55:03 +00:00
package yomichan
import (
"os"
"regexp"
"foosoft.net/projects/jmdict"
)
func jmnedictPublicationDate(dictionary jmdict.Jmnedict) string {
unknownDate := "unknown"
idx := len(dictionary.Entries) - 1
2023-02-02 00:55:03 +00:00
if len(dictionary.Entries) == 0 {
return unknownDate
} else if len(dictionary.Entries[idx].Translations) == 0 {
return unknownDate
} else if len(dictionary.Entries[idx].Translations[0].Translations) == 0 {
return unknownDate
2023-02-02 00:55:03 +00:00
}
dateGloss := dictionary.Entries[idx].Translations[0].Translations[0]
2023-02-02 00:55:03 +00:00
r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`)
date := r.FindString(dateGloss)
if date != "" {
return date
2023-02-02 00:55:03 +00:00
} else {
return unknownDate
2023-02-02 00:55:03 +00:00
}
}
func jmnedictSenseTerm(headword headword, seq sequence, sense jmdict.JmnedictTranslation, senseNumber int) dbTerm {
term := dbTerm{
Expression: headword.Expression,
Reading: headword.Reading,
Sequence: seq,
}
for _, gloss := range sense.Translations {
term.Glossary = append(term.Glossary, gloss)
}
term.addDefinitionTags(sense.NameTypes...)
term.Score = calculateTermScore(senseNumber, 0, headword)
return term
}
func jmnedictTerms(headword headword, entry jmdict.JmnedictEntry, g genericTermInfo) []dbTerm {
terms := []dbTerm{}
for idx, sense := range entry.Translations {
if g.IsGenericName(headword, sense.Translations) {
g.AddGlosses(headword.Expression, sense.NameTypes, headword.Reading)
} else {
g.AddUsedSequence(entry.Sequence)
senseTerm := jmnedictSenseTerm(headword, entry.Sequence, sense, idx+1)
terms = append(terms, senseTerm)
}
}
return terms
}
func jmnedictHeadwords(entry jmdict.JmnedictEntry) (headwords []headword) {
// Note that JMnedict doesn't (currently) use priority tags,
// frequency tags, or any sort of reading/kanji restrictions.
for _, reading := range entry.Readings {
for _, kanji := range entry.Kanji {
h := headword{
Expression: kanji.Expression,
Reading: reading.Reading,
}
h.Index = len(headwords)
headwords = append(headwords, h)
}
}
if len(entry.Kanji) == 0 {
for _, reading := range entry.Readings {
h := headword{
Expression: reading.Reading,
Reading: reading.Reading,
}
h.Index = len(headwords)
headwords = append(headwords, h)
}
}
return headwords
}
func jmnedictExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
reader, err := os.Open(inputPath)
if err != nil {
return err
}
defer reader.Close()
dictionary, entities, err := jmdict.LoadJmnedictNoTransform(reader)
if err != nil {
return err
}
genericTermInfo := newGenericTermInfo()
terms := dbTermList{}
for _, entry := range dictionary.Entries {
headwords := jmnedictHeadwords(entry)
for _, headword := range headwords {
newTerms := jmnedictTerms(headword, entry, genericTermInfo)
terms = append(terms, newTerms...)
}
}
terms = append(terms, genericTermInfo.Terms()...)
tags := dbTagList{}
tags = append(tags, entityTags(entities)...)
recordData := map[string]dbRecordList{
"term": terms.crush(),
"tag": tags.crush(),
}
if title == "" {
title = "JMnedict"
}
jmnedictDate := jmnedictPublicationDate(dictionary)
index := dbIndex{
Title: title,
Revision: "JMnedict." + jmnedictDate,
Sequenced: true,
Attribution: edrdgAttribution,
}
return writeDb(
outputPath,
index,
recordData,
stride,
pretty,
)
}