1

Merge pull request #41 from stephenmk/master

New version of JMnedict (the proper name dictionary)
This commit is contained in:
Alexei Yatskov 2023-02-05 09:57:17 -08:00 committed by GitHub
commit f4da17e228
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 604 additions and 254 deletions

View File

@ -9,6 +9,8 @@ import (
"os"
"path/filepath"
"strings"
"golang.org/x/exp/slices"
)
const (
@ -116,7 +118,7 @@ type dbKanjiList []dbKanji
func (kanji *dbKanji) addTags(tags ...string) {
for _, tag := range tags {
if !hasString(tag, kanji.Tags) {
if !slices.Contains(kanji.Tags, tag) {
kanji.Tags = append(kanji.Tags, tag)
}
}
@ -245,7 +247,7 @@ func writeDb(outputPath string, index dbIndex, recordData map[string]dbRecordLis
func appendStringUnique(target []string, source ...string) []string {
for _, str := range source {
if !hasString(str, target) {
if !slices.Contains(target, str) {
target = append(target, str)
}
}
@ -253,16 +255,6 @@ func appendStringUnique(target []string, source ...string) []string {
return target
}
func hasString(needle string, haystack []string) bool {
for _, value := range haystack {
if needle == value {
return true
}
}
return false
}
func intersection(s1, s2 []string) []string {
s := []string{}
m := make(map[string]bool)
@ -337,7 +329,7 @@ func detectFormat(path string) (string, error) {
func ExportDb(inputPath, outputPath, format, language, title string, stride int, pretty bool) error {
handlers := map[string]func(string, string, string, string, int, bool) error{
"edict": jmdExportDb,
"edict": jmdictExportDb,
"forms": formsExportDb,
"enamdict": jmnedictExportDb,
"epwing": epwingExportDb,

View File

@ -1,118 +0,0 @@
package yomichan
import (
"os"
"foosoft.net/projects/jmdict"
)
// jmnedictBuildTagMeta turns the entity table into a list of tag records.
// Every entity yields one tag whose Name is the entity name and whose Notes
// is the entity value. Entities that denote a kind of name are additionally
// assigned the "name" category and order 4.
func jmnedictBuildTagMeta(entities map[string]string) dbTagList {
	// isNameType reports whether the entity is one of the name-type codes.
	isNameType := func(entity string) bool {
		switch entity {
		case "company", "fem", "given", "masc", "organization", "person", "place", "product", "station", "surname", "unclass", "work":
			return true
		}
		return false
	}

	var result dbTagList
	for entity, description := range entities {
		meta := dbTag{Name: entity, Notes: description}
		if isNameType(entity) {
			meta.Category = "name"
			meta.Order = 4
		}
		result = append(result, meta)
	}
	return result
}
// jmnedictExtractTerms builds the terms for a single JMnedict entry.
//
// When the entry has kanji forms, one term is attempted per (kanji, reading)
// pair; otherwise one term is produced per reading with the reading used as
// the expression. Every term carries the glosses and name-type tags of all
// of the entry's translations.
func jmnedictExtractTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
	var result []dbTerm

	// emit appends the term for one reading, optionally paired with a kanji form.
	emit := func(reading jmdict.JmnedictReading, kanji *jmdict.JmnedictKanji) {
		term := dbTerm{Sequence: enamdictEntry.Sequence}

		switch {
		case kanji == nil:
			term.addTermTags(reading.Information...)
			term.Expression = reading.Reading
		case hasString(kanji.Expression, reading.Restrictions):
			// Pairs whose kanji is listed in the reading's restrictions are skipped.
			return
		default:
			term.addTermTags(reading.Information...)
			term.Expression = kanji.Expression
			term.Reading = reading.Reading
			term.addTermTags(kanji.Information...)
			// Only priorities shared by the kanji and the reading become tags.
			for _, priority := range kanji.Priorities {
				if !hasString(priority, reading.Priorities) {
					continue
				}
				term.addTermTags(priority)
			}
		}

		for _, group := range enamdictEntry.Translations {
			for _, gloss := range group.Translations {
				term.Glossary = append(term.Glossary, gloss)
			}
			term.addDefinitionTags(group.NameTypes...)
		}
		result = append(result, term)
	}

	if len(enamdictEntry.Kanji) == 0 {
		for _, reading := range enamdictEntry.Readings {
			emit(reading, nil)
		}
		return result
	}

	for k := range enamdictEntry.Kanji {
		for _, reading := range enamdictEntry.Readings {
			emit(reading, &enamdictEntry.Kanji[k])
		}
	}
	return result
}
// jmnedictExportDb reads the JMnedict file at inputPath, converts every entry
// into terms and the entity table into tags, and writes the result to
// outputPath via writeDb. An empty title defaults to "JMnedict". The language
// argument is accepted for signature compatibility and is not used here.
// Errors from opening or parsing the input, or from writeDb, are returned as-is.
func jmnedictExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
	source, err := os.Open(inputPath)
	if err != nil {
		return err
	}
	defer source.Close()

	names, entities, err := jmdict.LoadJmnedictNoTransform(source)
	if err != nil {
		return err
	}

	var allTerms dbTermList
	for _, entry := range names.Entries {
		allTerms = append(allTerms, jmnedictExtractTerms(entry)...)
	}

	if title == "" {
		title = "JMnedict"
	}

	records := map[string]dbRecordList{
		"term": allTerms.crush(),
		"tag":  jmnedictBuildTagMeta(entities).crush(),
	}

	header := dbIndex{
		Title:       title,
		Revision:    "jmnedict1",
		Sequenced:   true,
		Attribution: edrdgAttribution,
	}

	return writeDb(outputPath, header, records, stride, pretty)
}

View File

@ -8,14 +8,14 @@ import (
)
func frequencyTermsExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
return frequncyExportDb(inputPath, outputPath, language, title, stride, pretty, "term_meta")
return frequencyExportDb(inputPath, outputPath, language, title, stride, pretty, "term_meta")
}
func frequencyKanjiExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
return frequncyExportDb(inputPath, outputPath, language, title, stride, pretty, "kanji_meta")
return frequencyExportDb(inputPath, outputPath, language, title, stride, pretty, "kanji_meta")
}
func frequncyExportDb(inputPath, outputPath, language, title string, stride int, pretty bool, key string) error {
func frequencyExportDb(inputPath, outputPath, language, title string, stride int, pretty bool, key string) error {
reader, err := os.Open(inputPath)
if err != nil {
return err

View File

@ -63,13 +63,26 @@ func doDisplaySenseNumberTag(headword headword, entry jmdict.JmdictEntry, meta j
}
func jmdictPublicationDate(dictionary jmdict.Jmdict) string {
dateEntry := dictionary.Entries[len(dictionary.Entries)-1]
unknownDate := "unknown"
idx := len(dictionary.Entries) - 1
if len(dictionary.Entries) == 0 {
return unknownDate
} else if len(dictionary.Entries[idx].Sense) == 0 {
return unknownDate
} else if len(dictionary.Entries[idx].Sense[0].Glossary) == 0 {
return unknownDate
}
dateGloss := dictionary.Entries[idx].Sense[0].Glossary[0].Content
r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`)
jmdictDate := r.FindString(dateEntry.Sense[0].Glossary[0].Content)
return jmdictDate
date := r.FindString(dateGloss)
if date != "" {
return date
} else {
return unknownDate
}
}
func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
func jmdictFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
// Don't add "forms" terms to non-English dictionaries.
// Information would be duplicated if users installed more
// than one version.
@ -84,20 +97,21 @@ func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMet
}
}
term := baseFormsTerm(entry)
term := baseFormsTerm(entry, meta)
term.Expression = headword.Expression
term.Reading = headword.Reading
term.addTermTags(headword.TermTags...)
term.addDefinitionTags("forms")
senseNumber := meta.seqToSenseCount[entry.Sequence] + 1
entryDepth := meta.entryDepth[entry.Sequence]
term.Score = calculateTermScore(senseNumber, entryDepth, headword)
return term, true
}
func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
func jmdictSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
// Don't add "search" terms to non-English dictionaries.
// Information would be duplicated if users installed more
// than one version.
@ -109,10 +123,11 @@ func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMe
Expression: headword.Expression,
Sequence: -entry.Sequence,
}
for _, sense := range entry.Sense {
rules := grammarRules(sense.PartsOfSpeech)
partsOfSpeech := meta.seqToPartsOfSpeech[entry.Sequence]
rules := grammarRules(partsOfSpeech)
term.addRules(rules...)
}
term.addTermTags(headword.TermTags...)
term.Score = calculateTermScore(1, 0, headword)
@ -130,7 +145,7 @@ func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMe
return term, true
}
func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
func jmdictSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
if sense.RestrictedReadings != nil && !slices.Contains(sense.RestrictedReadings, headword.Reading) {
return dbTerm{}, false
}
@ -152,6 +167,13 @@ func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headwor
senseNumberTag := strconv.Itoa(senseNumber)
term.addDefinitionTags(senseNumberTag)
}
if len(sense.PartsOfSpeech) == 0 && meta.language != "eng" {
// This is a hack to provide part-of-speech info to
// non-English versions of JMdict.
sense.PartsOfSpeech = meta.seqToPartsOfSpeech[entry.Sequence]
}
term.addDefinitionTags(sense.PartsOfSpeech...)
term.addDefinitionTags(sense.Fields...)
term.addDefinitionTags(sense.Misc...)
@ -166,12 +188,12 @@ func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headwor
return term, true
}
func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) ([]dbTerm, bool) {
func jmdictTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) ([]dbTerm, bool) {
if meta.seqToSenseCount[entry.Sequence] == 0 {
return nil, false
}
if headword.IsSearchOnly {
if searchTerm, ok := createSearchTerm(headword, entry, meta); ok {
if searchTerm, ok := jmdictSearchTerm(headword, entry, meta); ok {
return []dbTerm{searchTerm}, true
} else {
return nil, false
@ -184,20 +206,20 @@ func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetada
// Do not increment sense number
continue
}
if senseTerm, ok := createSenseTerm(sense, senseNumber, headword, entry, meta); ok {
if senseTerm, ok := jmdictSenseTerm(sense, senseNumber, headword, entry, meta); ok {
terms = append(terms, senseTerm)
}
senseNumber += 1
}
if formsTerm, ok := createFormsTerm(headword, entry, meta); ok {
if formsTerm, ok := jmdictFormsTerm(headword, entry, meta); ok {
terms = append(terms, formsTerm)
}
return terms, true
}
func jmdExportDb(inputPath string, outputPath string, languageName string, title string, stride int, pretty bool) error {
func jmdictExportDb(inputPath string, outputPath string, languageName string, title string, stride int, pretty bool) error {
if _, ok := langNameToCode[languageName]; !ok {
return errors.New("Unrecognized language parameter: " + languageName)
}
@ -219,7 +241,7 @@ func jmdExportDb(inputPath string, outputPath string, languageName string, title
for _, entry := range dictionary.Entries {
headwords := extractHeadwords(entry)
for _, headword := range headwords {
if newTerms, ok := extractTerms(headword, entry, meta); ok {
if newTerms, ok := jmdictTerms(headword, entry, meta); ok {
terms = append(terms, newTerms...)
}
}

View File

@ -8,17 +8,6 @@ import (
"golang.org/x/exp/slices"
)
func kata2hira(word string) string {
charMap := func(character rune) rune {
if (character >= 'ァ' && character <= 'ヶ') || (character >= 'ヽ' && character <= 'ヾ') {
return character - 0x60
} else {
return character
}
}
return strings.Map(charMap, word)
}
func (h *headword) InfoSymbols() string {
infoSymbols := []string{}
if h.IsPriority {
@ -93,8 +82,8 @@ func needsFormTable(headwords []headword) bool {
} else if h.IsKanaOnly() {
continue
} else if uniqueReading == "" {
uniqueReading = kata2hira(h.Reading)
} else if uniqueReading != kata2hira(h.Reading) {
uniqueReading = katakanaToHiragana(h.Reading)
} else if uniqueReading != katakanaToHiragana(h.Reading) {
return true
}
}
@ -183,18 +172,20 @@ func formsGlossary(headwords []headword) []any {
return glossary
}
func baseFormsTerm(entry jmdict.JmdictEntry) dbTerm {
func baseFormsTerm(entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm {
term := dbTerm{Sequence: entry.Sequence}
headwords := extractHeadwords(entry)
if needsFormTable(headwords) {
term.Glossary = formsTableGlossary(headwords)
} else {
term.Glossary = formsGlossary(headwords)
}
for _, sense := range entry.Sense {
rules := grammarRules(sense.PartsOfSpeech)
partsOfSpeech := meta.seqToPartsOfSpeech[entry.Sequence]
rules := grammarRules(partsOfSpeech)
term.addRules(rules...)
}
return term
}
@ -214,11 +205,11 @@ func formsExportDb(inputPath, outputPath, languageName, title string, stride int
terms := dbTermList{}
for _, entry := range dictionary.Entries {
baseTerm := baseFormsTerm(entry)
baseTerm := baseFormsTerm(entry, meta)
headwords := extractHeadwords(entry)
for _, h := range headwords {
if h.IsSearchOnly {
if term, ok := createSearchTerm(h, entry, meta); ok {
if term, ok := jmdictSearchTerm(h, entry, meta); ok {
terms = append(terms, term)
}
continue

View File

@ -13,6 +13,7 @@ type jmdictMetadata struct {
language string
condensedGlosses map[senseID]string
seqToSenseCount map[sequence]int
seqToPartsOfSpeech map[sequence][]string
seqToMainHeadword map[sequence]headword
expHashToReadings map[hash][]string
headwordHashToSeqs map[hash][]sequence
@ -31,7 +32,7 @@ type senseID struct {
number int
}
func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, entrySequence sequence) {
func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, seq sequence) {
// This is to ensure that terms are grouped among their
// entries of origin and displayed in correct sequential order
maxDepth := 0
@ -48,39 +49,63 @@ func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, entrySeque
}
}
}
meta.entryDepth[entrySequence] = maxDepth
meta.entryDepth[seq] = maxDepth
}
func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEntry) {
// Determine how many senses are in this entry for this language
if _, ok := meta.seqToSenseCount[entry.Sequence]; !ok {
func (meta *jmdictMetadata) AddEntry(entry jmdict.JmdictEntry) {
partsOfSpeech := []string{}
senseCount := 0
for _, entrySense := range entry.Sense {
for _, gloss := range entrySense.Glossary {
if glossContainsLanguage(gloss, meta.language) {
senseCount += 1
break
for _, sense := range entry.Sense {
// Only English-language senses contain part-of-speech info,
// but other languages need them for deinflection rules.
for _, pos := range sense.PartsOfSpeech {
if !slices.Contains(partsOfSpeech, pos) {
partsOfSpeech = append(partsOfSpeech, pos)
}
}
}
meta.seqToSenseCount[entry.Sequence] = senseCount
}
if meta.seqToSenseCount[entry.Sequence] == 0 {
if glossaryContainsLanguage(sense.Glossary, meta.language) {
senseCount += 1
} else {
continue
}
for _, reference := range sense.References {
meta.references = append(meta.references, reference)
}
for _, antonym := range sense.Antonyms {
meta.references = append(meta.references, antonym)
}
currentSenseID := senseID{entry.Sequence, senseCount}
glosses := []string{}
for _, gloss := range sense.Glossary {
if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
glosses = append(glosses, gloss.Content)
}
}
meta.condensedGlosses[currentSenseID] = strings.Join(glosses, "; ")
}
meta.seqToPartsOfSpeech[entry.Sequence] = partsOfSpeech
meta.seqToSenseCount[entry.Sequence] = senseCount
}
func (meta *jmdictMetadata) AddHeadword(headword headword, seq sequence) {
if meta.seqToSenseCount[seq] == 0 {
return
}
// main headwords (first ones that are found in entries).
if _, ok := meta.seqToMainHeadword[entry.Sequence]; !ok {
meta.seqToMainHeadword[entry.Sequence] = headword
if _, ok := meta.seqToMainHeadword[seq]; !ok {
meta.seqToMainHeadword[seq] = headword
}
// hash the term pair so we can determine if it's used
// in more than one JMdict entry later.
headwordHash := headword.Hash()
if !slices.Contains(meta.headwordHashToSeqs[headwordHash], entry.Sequence) {
meta.headwordHashToSeqs[headwordHash] = append(meta.headwordHashToSeqs[headwordHash], entry.Sequence)
if !slices.Contains(meta.headwordHashToSeqs[headwordHash], seq) {
meta.headwordHashToSeqs[headwordHash] =
append(meta.headwordHashToSeqs[headwordHash], seq)
}
// hash the expression so that we can determine if we
@ -88,7 +113,8 @@ func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEn
// in reference notes later.
expHash := headword.ExpHash()
if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) {
meta.expHashToReadings[expHash] = append(meta.expHashToReadings[expHash], headword.Reading)
meta.expHashToReadings[expHash] =
append(meta.expHashToReadings[expHash], headword.Reading)
}
// e.g. for JMdict (English) we expect to end up with
@ -100,48 +126,17 @@ func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEn
searchHash{headword.ReadingHash(), headword.IsPriority},
}
for _, x := range searchHashes {
if !slices.Contains(meta.seqToSearchHashes[entry.Sequence], x) {
meta.seqToSearchHashes[entry.Sequence] = append(meta.seqToSearchHashes[entry.Sequence], x)
if !slices.Contains(meta.seqToSearchHashes[seq], x) {
meta.seqToSearchHashes[seq] = append(meta.seqToSearchHashes[seq], x)
}
}
currentSenseNumber := 1
for _, entrySense := range entry.Sense {
if !glossaryContainsLanguage(entrySense.Glossary, meta.language) {
continue
}
if entrySense.RestrictedReadings != nil && !slices.Contains(entrySense.RestrictedReadings, headword.Reading) {
currentSenseNumber += 1
continue
}
if entrySense.RestrictedKanji != nil && !slices.Contains(entrySense.RestrictedKanji, headword.Expression) {
currentSenseNumber += 1
continue
}
allReferences := append(entrySense.References, entrySense.Antonyms...)
for _, reference := range allReferences {
meta.references = append(meta.references, reference)
}
currentSense := senseID{entry.Sequence, currentSenseNumber}
if meta.condensedGlosses[currentSense] == "" {
glosses := []string{}
for _, gloss := range entrySense.Glossary {
if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
glosses = append(glosses, gloss.Content)
}
}
meta.condensedGlosses[currentSense] = strings.Join(glosses, "; ")
}
currentSenseNumber += 1
}
}
func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata {
meta := jmdictMetadata{
language: langNameToCode[languageName],
seqToSenseCount: make(map[sequence]int),
seqToPartsOfSpeech: make(map[sequence][]string),
condensedGlosses: make(map[senseID]string),
seqToMainHeadword: make(map[sequence]headword),
expHashToReadings: make(map[hash][]string),
@ -157,10 +152,11 @@ func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMeta
}
for _, entry := range dictionary.Entries {
meta.AddEntry(entry)
headwords := extractHeadwords(entry)
formCount := 0
for _, headword := range headwords {
meta.AddHeadword(headword, entry)
meta.AddHeadword(headword, entry.Sequence)
if !headword.IsSearchOnly {
formCount += 1
}

View File

@ -96,57 +96,58 @@ func knownEntityTags() []dbTag {
// <misc> miscellaneous sense info
dbTag{Name: "abbr", Order: 0, Score: 0, Category: ""}, // abbreviation
dbTag{Name: "arch", Order: -4, Score: 0, Category: "archaism"}, // archaism
dbTag{Name: "char", Order: 0, Score: 0, Category: ""}, // character
dbTag{Name: "char", Order: 4, Score: 0, Category: "name"}, // character
dbTag{Name: "chn", Order: 0, Score: 0, Category: ""}, // children's language
dbTag{Name: "col", Order: 0, Score: 0, Category: ""}, // colloquialism
dbTag{Name: "company", Order: 0, Score: 0, Category: ""}, // company name
dbTag{Name: "creat", Order: 0, Score: 0, Category: ""}, // creature
dbTag{Name: "company", Order: 4, Score: 0, Category: "name"}, // company name
dbTag{Name: "creat", Order: 4, Score: 0, Category: "name"}, // creature
dbTag{Name: "dated", Order: -4, Score: 0, Category: "archaism"}, // dated term
dbTag{Name: "dei", Order: 0, Score: 0, Category: ""}, // deity
dbTag{Name: "dei", Order: 4, Score: 0, Category: "name"}, // deity
dbTag{Name: "derog", Order: 0, Score: 0, Category: ""}, // derogatory
dbTag{Name: "doc", Order: 0, Score: 0, Category: ""}, // document
dbTag{Name: "doc", Order: 4, Score: 0, Category: "name"}, // document
dbTag{Name: "euph", Order: 0, Score: 0, Category: ""}, // euphemistic
dbTag{Name: "ev", Order: 0, Score: 0, Category: ""}, // event
dbTag{Name: "ev", Order: 4, Score: 0, Category: "name"}, // event
dbTag{Name: "fam", Order: 0, Score: 0, Category: ""}, // familiar language
dbTag{Name: "fem", Order: 0, Score: 0, Category: ""}, // female term or language
dbTag{Name: "fict", Order: 0, Score: 0, Category: ""}, // fiction
dbTag{Name: "fem", Order: 4, Score: 0, Category: "name"}, // female term, language, or name
dbTag{Name: "fict", Order: 4, Score: 0, Category: "name"}, // fiction
dbTag{Name: "form", Order: 0, Score: 0, Category: ""}, // formal or literary term
dbTag{Name: "given", Order: 0, Score: 0, Category: ""}, // given name or forename, gender not specified
dbTag{Name: "group", Order: 0, Score: 0, Category: ""}, // group
dbTag{Name: "given", Order: 4, Score: 0, Category: "name"}, // given name or forename, gender not specified
dbTag{Name: "group", Order: 4, Score: 0, Category: "name"}, // group
dbTag{Name: "hist", Order: 0, Score: 0, Category: ""}, // historical term
dbTag{Name: "hon", Order: 0, Score: 0, Category: ""}, // honorific or respectful (sonkeigo) language
dbTag{Name: "hum", Order: 0, Score: 0, Category: ""}, // humble (kenjougo) language
dbTag{Name: "id", Order: -5, Score: 0, Category: "expression"}, // idiomatic expression
dbTag{Name: "joc", Order: 0, Score: 0, Category: ""}, // jocular, humorous term
dbTag{Name: "leg", Order: 0, Score: 0, Category: ""}, // legend
dbTag{Name: "leg", Order: 4, Score: 0, Category: "name"}, // legend
dbTag{Name: "m-sl", Order: 0, Score: 0, Category: ""}, // manga slang
dbTag{Name: "male", Order: 0, Score: 0, Category: ""}, // male term or language
dbTag{Name: "myth", Order: 0, Score: 0, Category: ""}, // mythology
dbTag{Name: "male", Order: 4, Score: 0, Category: "name"}, // male term, language, or name
dbTag{Name: "masc", Order: 4, Score: 0, Category: "name"}, // male term, language, or name
dbTag{Name: "myth", Order: 4, Score: 0, Category: "name"}, // mythology
dbTag{Name: "net-sl", Order: 0, Score: 0, Category: ""}, // Internet slang
dbTag{Name: "obj", Order: 0, Score: 0, Category: ""}, // object
dbTag{Name: "obj", Order: 4, Score: 0, Category: "name"}, // object
dbTag{Name: "obs", Order: -4, Score: 0, Category: "archaism"}, // obsolete term
dbTag{Name: "on-mim", Order: 0, Score: 0, Category: ""}, // onomatopoeic or mimetic word
dbTag{Name: "organization", Order: 0, Score: 0, Category: ""}, // organization name
dbTag{Name: "oth", Order: 0, Score: 0, Category: ""}, // other
dbTag{Name: "person", Order: 0, Score: 0, Category: ""}, // full name of a particular person
dbTag{Name: "place", Order: 0, Score: 0, Category: ""}, // place name
dbTag{Name: "organization", Order: 4, Score: 0, Category: "name"}, // organization name
dbTag{Name: "oth", Order: 4, Score: 0, Category: "name"}, // other
dbTag{Name: "person", Order: 4, Score: 0, Category: "name"}, // full name of a particular person
dbTag{Name: "place", Order: 4, Score: 0, Category: "name"}, // place name
dbTag{Name: "poet", Order: 0, Score: 0, Category: ""}, // poetical term
dbTag{Name: "pol", Order: 0, Score: 0, Category: ""}, // polite (teineigo) language
dbTag{Name: "product", Order: 0, Score: 0, Category: ""}, // product name
dbTag{Name: "product", Order: 4, Score: 0, Category: "name"}, // product name
dbTag{Name: "proverb", Order: 0, Score: 0, Category: "expression"}, // proverb
dbTag{Name: "quote", Order: 0, Score: 0, Category: "expression"}, // quotation
dbTag{Name: "rare", Order: -4, Score: 0, Category: "archaism"}, // rare
dbTag{Name: "relig", Order: 0, Score: 0, Category: ""}, // religion
dbTag{Name: "relig", Order: 4, Score: 0, Category: "name"}, // religion
dbTag{Name: "sens", Order: 0, Score: 0, Category: ""}, // sensitive
dbTag{Name: "serv", Order: 0, Score: 0, Category: ""}, // service
dbTag{Name: "ship", Order: 0, Score: 0, Category: ""}, // ship name
dbTag{Name: "serv", Order: 4, Score: 0, Category: "name"}, // service
dbTag{Name: "ship", Order: 4, Score: 0, Category: "name"}, // ship name
dbTag{Name: "sl", Order: 0, Score: 0, Category: ""}, // slang
dbTag{Name: "station", Order: 0, Score: 0, Category: ""}, // railway station
dbTag{Name: "surname", Order: 0, Score: 0, Category: ""}, // family or surname
dbTag{Name: "station", Order: 4, Score: 0, Category: "name"}, // railway station
dbTag{Name: "surname", Order: 4, Score: 0, Category: "name"}, // family or surname
dbTag{Name: "uk", Order: 0, Score: 0, Category: ""}, // word usually written using kana alone
dbTag{Name: "unclass", Order: 0, Score: 0, Category: ""}, // unclassified name
dbTag{Name: "unclass", Order: 4, Score: 0, Category: "name"}, // unclassified name
dbTag{Name: "vulg", Order: 0, Score: 0, Category: ""}, // vulgar expression or word
dbTag{Name: "work", Order: 0, Score: 0, Category: ""}, // work of art, literature, music, etc. name
dbTag{Name: "work", Order: 4, Score: 0, Category: "name"}, // work of art, literature, music, etc. name
dbTag{Name: "X", Order: 0, Score: 0, Category: ""}, // rude or X-rated term (not displayed in educational software)
dbTag{Name: "yoji", Order: 0, Score: 0, Category: ""}, // yojijukugo

135
jmnedict.go Normal file
View File

@ -0,0 +1,135 @@
package yomichan
import (
"os"
"regexp"
"foosoft.net/projects/jmdict"
)
// jmnedictPublicationDate extracts a YYYY-MM-DD date from the first
// translation string of the first translation group of the dictionary's last
// entry. It returns "unknown" when the dictionary has no entries, when that
// entry has no translation text, or when the text contains no date.
// NOTE(review): presumably the final JMnedict entry is the one recording the
// file's creation date - confirm against the upstream data.
func jmnedictPublicationDate(dictionary jmdict.Jmnedict) string {
	const unknownDate = "unknown"

	entries := dictionary.Entries
	if len(entries) == 0 {
		return unknownDate
	}

	groups := entries[len(entries)-1].Translations
	if len(groups) == 0 || len(groups[0].Translations) == 0 {
		return unknownDate
	}

	datePattern := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`)
	if found := datePattern.FindString(groups[0].Translations[0]); found != "" {
		return found
	}
	return unknownDate
}
// jmnedictSenseTerm builds the term for one translation group ("sense") of a
// JMnedict entry as seen from the given headword. The group's translations
// become the glossary, its name types become definition tags, and the score
// is computed by calculateTermScore from the sense number with a depth of 0.
func jmnedictSenseTerm(headword headword, seq sequence, sense jmdict.JmnedictTranslation, senseNumber int) dbTerm {
	var term dbTerm
	term.Expression = headword.Expression
	term.Reading = headword.Reading
	term.Sequence = seq

	for i := range sense.Translations {
		term.Glossary = append(term.Glossary, sense.Translations[i])
	}

	term.addDefinitionTags(sense.NameTypes...)
	term.Score = calculateTermScore(senseNumber, 0, headword)
	return term
}
// jmnedictTerms returns the terms for one headword of a JMnedict entry.
//
// Translation groups that g classifies as generic names are not turned into
// terms here; instead the headword's reading is recorded in g under the
// headword's expression and the group's name types. Every other group marks
// the entry's sequence as used and yields a sense term numbered from 1.
// The returned slice is never nil.
func jmnedictTerms(headword headword, entry jmdict.JmnedictEntry, g genericTermInfo) []dbTerm {
	result := []dbTerm{}
	for i := range entry.Translations {
		group := entry.Translations[i]
		if !g.IsGenericName(headword, group.Translations) {
			g.AddUsedSequence(entry.Sequence)
			result = append(result, jmnedictSenseTerm(headword, entry.Sequence, group, i+1))
			continue
		}
		g.AddGlosses(headword.Expression, group.NameTypes, headword.Reading)
	}
	return result
}
// jmnedictHeadwords lists the headwords of a JMnedict entry. With kanji forms
// present there is one headword per (reading, kanji) pair, ordered by reading
// first; without kanji forms each reading serves as both expression and
// reading. Index is the headword's position in the returned slice.
func jmnedictHeadwords(entry jmdict.JmnedictEntry) (headwords []headword) {
	// Note that JMnedict doesn't (currently) use priority tags,
	// frequency tags, or any sort of reading/kanji restrictions.
	if len(entry.Kanji) == 0 {
		for _, reading := range entry.Readings {
			headwords = append(headwords, headword{
				Expression: reading.Reading,
				Reading:    reading.Reading,
				Index:      len(headwords),
			})
		}
		return headwords
	}

	for _, reading := range entry.Readings {
		for _, kanji := range entry.Kanji {
			headwords = append(headwords, headword{
				Expression: kanji.Expression,
				Reading:    reading.Reading,
				Index:      len(headwords),
			})
		}
	}
	return headwords
}
// jmnedictExportDb reads the JMnedict file at inputPath and writes it to
// outputPath through writeDb.
//
// Terms are built per headword of every entry; the terms accumulated for
// generic names are appended afterwards. The revision string is "JMnedict."
// followed by the date reported by jmnedictPublicationDate. An empty title
// defaults to "JMnedict". The language argument is not used by this function.
// Errors from opening or parsing the input, or from writeDb, are returned as-is.
func jmnedictExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
	input, err := os.Open(inputPath)
	if err != nil {
		return err
	}
	defer input.Close()

	dictionary, entities, err := jmdict.LoadJmnedictNoTransform(input)
	if err != nil {
		return err
	}

	generics := newGenericTermInfo()
	terms := dbTermList{}
	for _, entry := range dictionary.Entries {
		for _, hw := range jmnedictHeadwords(entry) {
			terms = append(terms, jmnedictTerms(hw, entry, generics)...)
		}
	}
	// Generic-name terms can only be produced once every entry has been seen.
	terms = append(terms, generics.Terms()...)

	tags := append(dbTagList{}, entityTags(entities)...)

	records := map[string]dbRecordList{
		"term": terms.crush(),
		"tag":  tags.crush(),
	}

	if title == "" {
		title = "JMnedict"
	}

	header := dbIndex{
		Title:       title,
		Revision:    "JMnedict." + jmnedictPublicationDate(dictionary),
		Sequenced:   true,
		Attribution: edrdgAttribution,
	}

	return writeDb(outputPath, header, records, stride, pretty)
}

80
jmnedict_generic_terms.go Normal file
View File

@ -0,0 +1,80 @@
package yomichan
import (
"golang.org/x/exp/slices"
)
// genericTermMap groups glosses by expression and then by tag:
// expression -> tag -> glosses.
type genericTermMap map[string]map[string][]string
// genericTermInfo accumulates the glosses of generic names and tracks which
// sequence numbers are already taken so that new ones can be allocated.
type genericTermInfo struct {
// Glosses recorded by AddGlosses, keyed by expression and tag.
expressionToTagToGlosses genericTermMap
// Set of sequence numbers marked as used via AddUsedSequence.
usedSequences map[sequence]bool
// The sequence most recently handed out by NewSequence (zero initially).
currentSequence sequence
}
// newGenericTermInfo returns a genericTermInfo whose maps are initialised and
// empty, ready for AddGlosses and AddUsedSequence.
func newGenericTermInfo() genericTermInfo {
	var info genericTermInfo
	info.expressionToTagToGlosses = make(genericTermMap)
	info.usedSequences = make(map[sequence]bool)
	return info
}
// NewSequence allocates the smallest unused sequence number greater than the
// one it returned last, marks it as used, remembers it, and returns it.
func (i *genericTermInfo) NewSequence() sequence {
	candidate := i.currentSequence + 1
	for {
		if taken := i.usedSequences[candidate]; !taken {
			break
		}
		candidate += 1
	}
	i.usedSequences[candidate] = true
	i.currentSequence = candidate
	return candidate
}
// AddUsedSequence marks s as taken so that NewSequence will never return it.
func (i *genericTermInfo) AddUsedSequence(s sequence) {
i.usedSequences[s] = true
}
// AddGlosses records gloss under exp for each of the given tags, skipping
// tags that already hold that gloss. The per-expression map is created on
// first use, even when tags is empty.
func (i *genericTermInfo) AddGlosses(exp string, tags []string, gloss string) {
	byTag := i.expressionToTagToGlosses[exp]
	if byTag == nil {
		byTag = make(map[string][]string)
		i.expressionToTagToGlosses[exp] = byTag
	}
	for _, tag := range tags {
		if slices.Contains(byTag[tag], gloss) {
			continue
		}
		byTag[tag] = append(byTag[tag], gloss)
	}
}
// IsGenericName reports whether every definition is merely a transliteration
// of the headword's reading. Kana-only headwords are never generic. A
// non-kana headword with no definitions at all is reported as generic.
func (i *genericTermInfo) IsGenericName(headword headword, definitions []string) bool {
	if headword.IsKanaOnly() {
		// No reason to process these terms.
		return false
	}
	for _, definition := range definitions {
		if !isTransliteration(definition, headword.Reading) {
			return false
		}
	}
	return true
}
// Terms converts the accumulated generic-name glosses into terms.
//
// Each expression receives one freshly allocated sequence number, shared by
// all of its terms, and yields one term per tag carrying that tag's glosses.
//
// Expressions and tags are visited in sorted order. Go randomises map
// iteration, so ranging over the maps directly would assign different
// sequence numbers and emit terms in a different order on every run, making
// the exported dictionary non-reproducible.
func (i *genericTermInfo) Terms() (terms []dbTerm) {
	expressions := make([]string, 0, len(i.expressionToTagToGlosses))
	for expression := range i.expressionToTagToGlosses {
		expressions = append(expressions, expression)
	}
	slices.Sort(expressions)

	for _, expression := range expressions {
		tagToGlosses := i.expressionToTagToGlosses[expression]

		tags := make([]string, 0, len(tagToGlosses))
		for tag := range tagToGlosses {
			tags = append(tags, tag)
		}
		slices.Sort(tags)

		// Allocated once per expression, as before, even if it has no tags.
		seq := i.NewSequence()
		for _, tag := range tags {
			term := dbTerm{
				Expression: expression,
				Sequence:   seq,
			}
			for _, gloss := range tagToGlosses[tag] {
				term.Glossary = append(term.Glossary, gloss)
			}
			term.addDefinitionTags(tag)
			terms = append(terms, term)
		}
	}
	return terms
}

251
jmnedict_text_util.go Normal file
View File

@ -0,0 +1,251 @@
package yomichan
import (
"strings"
"golang.org/x/exp/slices"
)
// Returns a copy of text in which every katakana character in the ranges
// 'ァ'..'ヶ' and 'ヽ'..'ヾ' is replaced by its hiragana counterpart (the code
// point 0x60 lower). All other characters are left untouched.
func katakanaToHiragana(text string) string {
	return strings.Map(func(r rune) rune {
		switch {
		case r >= 'ァ' && r <= 'ヶ', r >= 'ヽ' && r <= 'ヾ':
			return r - 0x60
		default:
			return r
		}
	}, text)
}
// Replace hiragana iteration marks with the appropriate characters.
// E.g. "さゝき" -> "ささき"; "たゞの" -> "ただの"
//
// 'ゝ' becomes a copy of the preceding character and 'ゞ' becomes the code
// point one above the preceding character. A mark at the very start of the
// text has nothing to repeat and is left as it is.
//
// The text is processed in a single left-to-right pass, so the function
// always terminates, even when a replacement is itself an iteration mark
// (e.g. "ゝゞ"). A leading mark does not stop later marks from being replaced.
func replaceIterationMarks(text string) string {
	runes := []rune(text)
	for idx := 1; idx < len(runes); idx++ {
		// runes[idx-1] has already been resolved by the previous iteration.
		switch runes[idx] {
		case 'ゝ':
			runes[idx] = runes[idx-1]
		case 'ゞ':
			runes[idx] = runes[idx-1] + 0x01
		}
	}
	return string(runes)
}
// Returns an array of the input text split into segments.
// E.g. "しょくぎょう" -> ["しょ", "く", "ぎょ", "う"]
// Returns nil if no segmentation is possible.
//
// The input is first normalised to hiragana with iteration marks expanded.
// Segmentation is greedy: at each position the longest run of characters
// that is a key of kanaSegmentToRomajiList is taken.
func makeKanaSegments(kana string) (segments []string) {
	runes := []rune(replaceIterationMarks(katakanaToHiragana(kana)))
	total := len(runes)

	for start := 0; start < total; {
		// Shrink the candidate from the right until it is a known segment.
		end := total
		for ; end > start; end-- {
			if _, known := kanaSegmentToRomajiList[string(runes[start:end])]; known {
				break
			}
		}
		if end == start {
			// Not even the single character at start is known.
			return nil
		}
		segments = append(segments, string(runes[start:end]))
		start = end
	}
	return segments
}
// Returns a set containing every non-empty prefix of the input text,
// taken left to right one byte at a time.
// E.g. "nihon" -> ["n", "ni", "nih", "niho", "nihon"]
func makeSubstringMap(text string) map[string]bool {
	prefixes := make(map[string]bool)
	for end := len(text); end > 0; end-- {
		prefixes[text[:end]] = true
	}
	return prefixes
}
// Determines if the input text is a valid romaji representation of
// the input kana. The text is lower-cased and trimmed before comparison.
//
// The kana is split into segments, and the romaji spellings of each segment
// are appended in turn to every candidate built so far. Because the number
// of combinations grows quickly with the length of the kana, a candidate is
// kept only while it is still a prefix of the text. The text is accepted
// when it is one of the candidates left after the final segment.
func isTransliteration(text string, kana string) bool {
	romaji := strings.TrimSpace(strings.ToLower(text))
	prefixes := makeSubstringMap(romaji)

	candidates := []string{""}
	for _, segment := range makeKanaSegments(kana) {
		spellings := kanaSegmentToRomajiList[segment]
		seen := map[string]bool{}
		var surviving []string
		for _, head := range candidates {
			for _, tail := range spellings {
				joined := head + tail
				if seen[joined] {
					continue
				}
				seen[joined] = true
				// Prune branches that can no longer grow into the text.
				if prefixes[joined] {
					surviving = append(surviving, joined)
				}
			}
		}
		if surviving == nil {
			return false
		}
		candidates = surviving
	}
	return slices.Contains(candidates, romaji)
}
// kanaSegmentToRomajiList maps a hiragana segment (a single kana, or a
// kana followed by a small kana) to the romaji spellings accepted for
// it. An empty spelling means the segment may contribute nothing to
// the romaji.
var kanaSegmentToRomajiList = map[string][]string{
	// Small kana, sokuon and the long-vowel mark.
	"ぁ": {"", "a"},
	"ぃ": {"", "i"},
	"ぅ": {"", "u"},
	"ぇ": {"", "e"},
	"ぉ": {"", "o"},
	"ゃ": {"ya"},
	"ゅ": {"yu"},
	"ょ": {"yo"},
	"ゎ": {"wa"},
	"っ": {"", "k", "g", "s", "z", "t", "d", "f", "h", "b", "p", "n", "m", "y", "w", "c"},
	"ー": {"", "a", "i", "u", "e", "o", "-"},

	// Single kana.
	"あ": {"", "a", "ā", "wa", "wā"},
	"い": {"", "i", "ī", "wi", "wī"},
	"う": {"", "u", "ū", "wu", "wū"},
	"え": {"", "e", "ē", "we", "wē"},
	"お": {"", "o", "ō", "wo", "wō"},
	"ゔ": {"vu", "vū", "bu", "bū"},
	"か": {"ka", "kā"},
	"が": {"ga", "gā"},
	"き": {"ki", "kī"},
	"ぎ": {"gi", "gī"},
	"く": {"ku", "kū"},
	"ぐ": {"gu", "gū"},
	"け": {"ke", "kē"},
	"げ": {"ge", "gē"},
	"こ": {"ko", "kō"},
	"ご": {"go", "gō"},
	"さ": {"sa", "sā"},
	"ざ": {"za", "zā"},
	"し": {"si", "sī", "shi", "shī"},
	"じ": {"zi", "zī", "ji", "jī"},
	"す": {"su", "sū"},
	"ず": {"zu", "zū"},
	"せ": {"se", "sē"},
	"ぜ": {"ze", "zē"},
	"そ": {"so", "sō"},
	"ぞ": {"zo", "zō"},
	"た": {"ta", "tā"},
	"だ": {"da", "dā"},
	"ち": {"ti", "tī", "chi", "chī"},
	"ぢ": {"di", "dī", "dhi", "dhī", "ji", "jī", "dji", "djī", "dzi", "dzī"},
	"つ": {"tu", "tū", "tsu", "tsū"},
	"づ": {"du", "dū", "dzu", "dzū", "zu", "zū"},
	"て": {"te", "tē"},
	"で": {"de", "dē"},
	"と": {"to", "tō"},
	"ど": {"do", "dō"},
	"な": {"na", "nā"},
	"に": {"ni", "nī"},
	"ぬ": {"nu", "nū"},
	"ね": {"ne", "nē"},
	"の": {"no", "nō"},
	"は": {"ha", "hā", "wa", "wā", "a", "ā"},
	"ば": {"ba", "bā"},
	"ぱ": {"pa", "pā"},
	"ひ": {"hi", "hī", "i", "ī"},
	"び": {"bi", "bī"},
	"ぴ": {"pi", "pī"},
	"ふ": {"hu", "hū", "fu", "fū", "u", "ū"},
	"ぶ": {"bu", "bū"},
	"ぷ": {"pu", "pū"},
	"へ": {"he", "hē", "e", "ē"},
	"べ": {"be", "bē"},
	"ぺ": {"pe", "pē"},
	"ほ": {"ho", "hō", "o", "ō"},
	"ぼ": {"bo", "bō"},
	"ぽ": {"po", "pō"},
	"ま": {"ma", "mā"},
	"み": {"mi", "mī"},
	"む": {"mu", "mū"},
	"め": {"me", "mē"},
	"も": {"mo", "mō"},
	"や": {"ya", "yā"},
	"ゆ": {"yu", "yū"},
	"よ": {"yo", "yō"},
	"ら": {"ra", "rā"},
	"り": {"ri", "rī"},
	"る": {"ru", "rū"},
	"れ": {"re", "rē"},
	"ろ": {"ro", "rō"},
	"わ": {"wa", "wā"},
	"ゐ": {"wi", "wī", "i", "ī"},
	"ゑ": {"we", "wē", "e", "ē"},
	"を": {"wo", "wō", "o", "ō"},
	"ん": {"n", "n'", "m"},

	// Two-kana combinations.
	"うぁ": {"wa", "wā", "ua", "uā"},
	"うぃ": {"wi", "wī", "ui", "uī"},
	"うぇ": {"we", "wē", "ue", "uē"},
	"うぉ": {"wo", "wō", "uo", "uō"},
	"きゃ": {"kya", "kyā"},
	"きゅ": {"kyu", "kyū"},
	"きょ": {"kyo", "kyō"},
	"ぎゃ": {"gya", "gyā"},
	"ぎゅ": {"gyu", "gyū"},
	"ぎょ": {"gyo", "gyō"},
	"くゎ": {"kwa", "kwā"},
	"くゅ": {"kyu", "kyū"},
	"しぇ": {"she", "shē", "shie", "shiē"},
	"しゃ": {"sha", "shā", "sya", "syā"},
	"しゅ": {"shu", "shū", "syu", "syū"},
	"しょ": {"sho", "shō", "syo", "syō"},
	"じぇ": {"je", "jē"},
	"じゃ": {"ja", "jā", "jya", "jyā"},
	"じゅ": {"ju", "jū", "jyu", "jyū"},
	"じょ": {"jo", "jō", "jyo", "jyō"},
	"ちぁ": {"cha", "chā", "chia", "chiā"},
	"ちぇ": {"che", "chē", "chie", "chiē"},
	"ちゃ": {"cha", "chā", "tya", "tyā"},
	"ちゅ": {"chu", "chū", "tyu", "tyū"},
	"ちょ": {"cho", "chō", "tyo", "tyō"},
	"ぢゃ": {"ja", "jā", "jya", "jyā", "dya", "dyā"},
	"ぢゅ": {"ju", "jū", "jyu", "jyū", "dyu", "dyū"},
	"ぢょ": {"jo", "jō", "jyo", "jyō", "dyo", "dyō"},
	"つぁ": {"tsa", "tsā", "tsua", "tsuā"},
	"つぇ": {"tse", "tsē", "tsue", "tsuē"},
	"てぃ": {"ti", "tī", "tei", "teī"},
	"でぃ": {"di", "dī", "dei", "deī"},
	"でゅ": {"dyu", "dyū", "deyu", "deyū"},
	"にゃ": {"nya", "nyā"},
	"にゅ": {"nyu", "nyū"},
	"にょ": {"nyo", "nyō"},
	"ひゃ": {"hya", "hyā"},
	"ひゅ": {"hyu", "hyū"},
	"ひょ": {"hyo", "hyō"},
	"びゃ": {"bya", "byā"},
	"びゅ": {"byu", "byū"},
	"びょ": {"byo", "byō"},
	"ぴゃ": {"pya", "pyā"},
	"ぴゅ": {"pyu", "pyū"},
	"ぴょ": {"pyo", "pyō"},
	"ふぁ": {"fa", "fā"},
	"ふぃ": {"fi", "fī"},
	"ふぇ": {"fe", "fē"},
	"ふぉ": {"fo", "fō"},
	"みゃ": {"mya", "myā"},
	"みゅ": {"myu", "myū"},
	"みょ": {"myo", "myō"},
	"りゃ": {"rya", "ryā"},
	"りゅ": {"ryu", "ryū"},
	"りょ": {"ryo", "ryō"},
}