1

Compare commits

...

10 Commits

Author SHA1 Message Date
00dc44386e Update maintenance info 2023-02-25 12:43:03 -08:00
Alexei Yatskov
f4da17e228
Merge pull request #41 from stephenmk/master
New version of JMnedict (the proper name dictionary)
2023-02-05 09:57:17 -08:00
stephenmk
ecf22da5a3
Improve readability of publication date functions 2023-02-04 01:42:08 -06:00
stephenmk
a9d85dc720
Simplify string -> runes conversion 2023-02-03 22:07:41 -06:00
stephenmk
70611a51c4
Fix typo 2023-02-03 15:51:52 -06:00
stephenmk
dffbec6337
Designate more JMnedict category tags 2023-02-02 20:15:28 -06:00
stephenmk
5755b79341
Use cached part-of-speech values 2023-02-02 15:50:57 -06:00
stephenmk
7bff70b71c
JMdict: Ensure part-of-speech info is added in non-English versions
Only English-language senses in JMdict contain part-of-speech tags.
This info is displayed to users in definition tags and also used
for deinflecting verbs and adjectives during term lookups.

The old version of Yomichan-Import took the PoS tags from the final
sense in the English version of an entry and applied them to every
sense of every other language. For example, 川・かわ has two senses in
English JMdict: a noun sense and a suffix sense. Therefore every sense
of 川・かわ in every other language was tagged as a suffix.

Instead, I suggest gathering all distinct PoS tags from each English
entry and applying them all to each non-English sense. Every
non-English sense of 川・かわ will therefore be tagged as both a noun
and suffix.
2023-02-02 10:44:16 -06:00
stephenmk
19d6d0bb43
Rename some jmdict functions 2023-02-01 19:14:37 -06:00
stephenmk
3b420f8b6c
Use library implementation of Contains function 2023-02-01 18:57:35 -06:00
9 changed files with 135 additions and 130 deletions

View File

@ -1,5 +1,8 @@
# Yomichan Import # Yomichan Import
*Note: this project is no longer maintained. Please see [this
post](https://foosoft.net/posts/sunsetting-the-yomichan-project/) for more information.*
Yomichan Import allows users of the [Yomichan](https://foosoft.net/projects/yomichan) extension to import custom Yomichan Import allows users of the [Yomichan](https://foosoft.net/projects/yomichan) extension to import custom
dictionary files. It currently supports the following formats: dictionary files. It currently supports the following formats:

View File

@ -9,6 +9,8 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"strings" "strings"
"golang.org/x/exp/slices"
) )
const ( const (
@ -116,7 +118,7 @@ type dbKanjiList []dbKanji
func (kanji *dbKanji) addTags(tags ...string) { func (kanji *dbKanji) addTags(tags ...string) {
for _, tag := range tags { for _, tag := range tags {
if !hasString(tag, kanji.Tags) { if !slices.Contains(kanji.Tags, tag) {
kanji.Tags = append(kanji.Tags, tag) kanji.Tags = append(kanji.Tags, tag)
} }
} }
@ -245,7 +247,7 @@ func writeDb(outputPath string, index dbIndex, recordData map[string]dbRecordLis
func appendStringUnique(target []string, source ...string) []string { func appendStringUnique(target []string, source ...string) []string {
for _, str := range source { for _, str := range source {
if !hasString(str, target) { if !slices.Contains(target, str) {
target = append(target, str) target = append(target, str)
} }
} }
@ -253,16 +255,6 @@ func appendStringUnique(target []string, source ...string) []string {
return target return target
} }
func hasString(needle string, haystack []string) bool {
for _, value := range haystack {
if needle == value {
return true
}
}
return false
}
func intersection(s1, s2 []string) []string { func intersection(s1, s2 []string) []string {
s := []string{} s := []string{}
m := make(map[string]bool) m := make(map[string]bool)
@ -337,7 +329,7 @@ func detectFormat(path string) (string, error) {
func ExportDb(inputPath, outputPath, format, language, title string, stride int, pretty bool) error { func ExportDb(inputPath, outputPath, format, language, title string, stride int, pretty bool) error {
handlers := map[string]func(string, string, string, string, int, bool) error{ handlers := map[string]func(string, string, string, string, int, bool) error{
"edict": jmdExportDb, "edict": jmdictExportDb,
"forms": formsExportDb, "forms": formsExportDb,
"enamdict": jmnedictExportDb, "enamdict": jmnedictExportDb,
"epwing": epwingExportDb, "epwing": epwingExportDb,

View File

@ -8,14 +8,14 @@ import (
) )
func frequencyTermsExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error { func frequencyTermsExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
return frequncyExportDb(inputPath, outputPath, language, title, stride, pretty, "term_meta") return frequencyExportDb(inputPath, outputPath, language, title, stride, pretty, "term_meta")
} }
func frequencyKanjiExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error { func frequencyKanjiExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
return frequncyExportDb(inputPath, outputPath, language, title, stride, pretty, "kanji_meta") return frequencyExportDb(inputPath, outputPath, language, title, stride, pretty, "kanji_meta")
} }
func frequncyExportDb(inputPath, outputPath, language, title string, stride int, pretty bool, key string) error { func frequencyExportDb(inputPath, outputPath, language, title string, stride int, pretty bool, key string) error {
reader, err := os.Open(inputPath) reader, err := os.Open(inputPath)
if err != nil { if err != nil {
return err return err

View File

@ -63,23 +63,26 @@ func doDisplaySenseNumberTag(headword headword, entry jmdict.JmdictEntry, meta j
} }
func jmdictPublicationDate(dictionary jmdict.Jmdict) string { func jmdictPublicationDate(dictionary jmdict.Jmdict) string {
unknownDate := "unknown"
idx := len(dictionary.Entries) - 1
if len(dictionary.Entries) == 0 { if len(dictionary.Entries) == 0 {
return "unknown" return unknownDate
} } else if len(dictionary.Entries[idx].Sense) == 0 {
dateEntry := dictionary.Entries[len(dictionary.Entries)-1] return unknownDate
if len(dateEntry.Sense) == 0 || len(dateEntry.Sense[0].Glossary) == 0 { } else if len(dictionary.Entries[idx].Sense[0].Glossary) == 0 {
return "unknown" return unknownDate
} }
dateGloss := dictionary.Entries[idx].Sense[0].Glossary[0].Content
r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`) r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`)
jmdictDate := r.FindString(dateEntry.Sense[0].Glossary[0].Content) date := r.FindString(dateGloss)
if jmdictDate != "" { if date != "" {
return jmdictDate return date
} else { } else {
return "unknown" return unknownDate
} }
} }
func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) { func jmdictFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
// Don't add "forms" terms to non-English dictionaries. // Don't add "forms" terms to non-English dictionaries.
// Information would be duplicated if users installed more // Information would be duplicated if users installed more
// than one version. // than one version.
@ -94,20 +97,21 @@ func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMet
} }
} }
term := baseFormsTerm(entry) term := baseFormsTerm(entry, meta)
term.Expression = headword.Expression term.Expression = headword.Expression
term.Reading = headword.Reading term.Reading = headword.Reading
term.addTermTags(headword.TermTags...) term.addTermTags(headword.TermTags...)
term.addDefinitionTags("forms") term.addDefinitionTags("forms")
senseNumber := meta.seqToSenseCount[entry.Sequence] + 1 senseNumber := meta.seqToSenseCount[entry.Sequence] + 1
entryDepth := meta.entryDepth[entry.Sequence] entryDepth := meta.entryDepth[entry.Sequence]
term.Score = calculateTermScore(senseNumber, entryDepth, headword) term.Score = calculateTermScore(senseNumber, entryDepth, headword)
return term, true return term, true
} }
func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) { func jmdictSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
// Don't add "search" terms to non-English dictionaries. // Don't add "search" terms to non-English dictionaries.
// Information would be duplicated if users installed more // Information would be duplicated if users installed more
// than one version. // than one version.
@ -119,10 +123,11 @@ func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMe
Expression: headword.Expression, Expression: headword.Expression,
Sequence: -entry.Sequence, Sequence: -entry.Sequence,
} }
for _, sense := range entry.Sense {
rules := grammarRules(sense.PartsOfSpeech) partsOfSpeech := meta.seqToPartsOfSpeech[entry.Sequence]
term.addRules(rules...) rules := grammarRules(partsOfSpeech)
} term.addRules(rules...)
term.addTermTags(headword.TermTags...) term.addTermTags(headword.TermTags...)
term.Score = calculateTermScore(1, 0, headword) term.Score = calculateTermScore(1, 0, headword)
@ -140,7 +145,7 @@ func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMe
return term, true return term, true
} }
func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) { func jmdictSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) {
if sense.RestrictedReadings != nil && !slices.Contains(sense.RestrictedReadings, headword.Reading) { if sense.RestrictedReadings != nil && !slices.Contains(sense.RestrictedReadings, headword.Reading) {
return dbTerm{}, false return dbTerm{}, false
} }
@ -162,6 +167,13 @@ func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headwor
senseNumberTag := strconv.Itoa(senseNumber) senseNumberTag := strconv.Itoa(senseNumber)
term.addDefinitionTags(senseNumberTag) term.addDefinitionTags(senseNumberTag)
} }
if len(sense.PartsOfSpeech) == 0 && meta.language != "eng" {
// This is a hack to provide part-of-speech info to
// non-English versions of JMdict.
sense.PartsOfSpeech = meta.seqToPartsOfSpeech[entry.Sequence]
}
term.addDefinitionTags(sense.PartsOfSpeech...) term.addDefinitionTags(sense.PartsOfSpeech...)
term.addDefinitionTags(sense.Fields...) term.addDefinitionTags(sense.Fields...)
term.addDefinitionTags(sense.Misc...) term.addDefinitionTags(sense.Misc...)
@ -176,12 +188,12 @@ func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headwor
return term, true return term, true
} }
func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) ([]dbTerm, bool) { func jmdictTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) ([]dbTerm, bool) {
if meta.seqToSenseCount[entry.Sequence] == 0 { if meta.seqToSenseCount[entry.Sequence] == 0 {
return nil, false return nil, false
} }
if headword.IsSearchOnly { if headword.IsSearchOnly {
if searchTerm, ok := createSearchTerm(headword, entry, meta); ok { if searchTerm, ok := jmdictSearchTerm(headword, entry, meta); ok {
return []dbTerm{searchTerm}, true return []dbTerm{searchTerm}, true
} else { } else {
return nil, false return nil, false
@ -194,20 +206,20 @@ func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetada
// Do not increment sense number // Do not increment sense number
continue continue
} }
if senseTerm, ok := createSenseTerm(sense, senseNumber, headword, entry, meta); ok { if senseTerm, ok := jmdictSenseTerm(sense, senseNumber, headword, entry, meta); ok {
terms = append(terms, senseTerm) terms = append(terms, senseTerm)
} }
senseNumber += 1 senseNumber += 1
} }
if formsTerm, ok := createFormsTerm(headword, entry, meta); ok { if formsTerm, ok := jmdictFormsTerm(headword, entry, meta); ok {
terms = append(terms, formsTerm) terms = append(terms, formsTerm)
} }
return terms, true return terms, true
} }
func jmdExportDb(inputPath string, outputPath string, languageName string, title string, stride int, pretty bool) error { func jmdictExportDb(inputPath string, outputPath string, languageName string, title string, stride int, pretty bool) error {
if _, ok := langNameToCode[languageName]; !ok { if _, ok := langNameToCode[languageName]; !ok {
return errors.New("Unrecognized language parameter: " + languageName) return errors.New("Unrecognized language parameter: " + languageName)
} }
@ -229,7 +241,7 @@ func jmdExportDb(inputPath string, outputPath string, languageName string, title
for _, entry := range dictionary.Entries { for _, entry := range dictionary.Entries {
headwords := extractHeadwords(entry) headwords := extractHeadwords(entry)
for _, headword := range headwords { for _, headword := range headwords {
if newTerms, ok := extractTerms(headword, entry, meta); ok { if newTerms, ok := jmdictTerms(headword, entry, meta); ok {
terms = append(terms, newTerms...) terms = append(terms, newTerms...)
} }
} }

View File

@ -172,18 +172,20 @@ func formsGlossary(headwords []headword) []any {
return glossary return glossary
} }
func baseFormsTerm(entry jmdict.JmdictEntry) dbTerm { func baseFormsTerm(entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm {
term := dbTerm{Sequence: entry.Sequence} term := dbTerm{Sequence: entry.Sequence}
headwords := extractHeadwords(entry) headwords := extractHeadwords(entry)
if needsFormTable(headwords) { if needsFormTable(headwords) {
term.Glossary = formsTableGlossary(headwords) term.Glossary = formsTableGlossary(headwords)
} else { } else {
term.Glossary = formsGlossary(headwords) term.Glossary = formsGlossary(headwords)
} }
for _, sense := range entry.Sense {
rules := grammarRules(sense.PartsOfSpeech) partsOfSpeech := meta.seqToPartsOfSpeech[entry.Sequence]
term.addRules(rules...) rules := grammarRules(partsOfSpeech)
} term.addRules(rules...)
return term return term
} }
@ -203,11 +205,11 @@ func formsExportDb(inputPath, outputPath, languageName, title string, stride int
terms := dbTermList{} terms := dbTermList{}
for _, entry := range dictionary.Entries { for _, entry := range dictionary.Entries {
baseTerm := baseFormsTerm(entry) baseTerm := baseFormsTerm(entry, meta)
headwords := extractHeadwords(entry) headwords := extractHeadwords(entry)
for _, h := range headwords { for _, h := range headwords {
if h.IsSearchOnly { if h.IsSearchOnly {
if term, ok := createSearchTerm(h, entry, meta); ok { if term, ok := jmdictSearchTerm(h, entry, meta); ok {
terms = append(terms, term) terms = append(terms, term)
} }
continue continue

View File

@ -13,6 +13,7 @@ type jmdictMetadata struct {
language string language string
condensedGlosses map[senseID]string condensedGlosses map[senseID]string
seqToSenseCount map[sequence]int seqToSenseCount map[sequence]int
seqToPartsOfSpeech map[sequence][]string
seqToMainHeadword map[sequence]headword seqToMainHeadword map[sequence]headword
expHashToReadings map[hash][]string expHashToReadings map[hash][]string
headwordHashToSeqs map[hash][]sequence headwordHashToSeqs map[hash][]sequence
@ -31,7 +32,7 @@ type senseID struct {
number int number int
} }
func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, entrySequence sequence) { func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, seq sequence) {
// This is to ensure that terms are grouped among their // This is to ensure that terms are grouped among their
// entries of origin and displayed in correct sequential order // entries of origin and displayed in correct sequential order
maxDepth := 0 maxDepth := 0
@ -48,39 +49,63 @@ func (meta *jmdictMetadata) CalculateEntryDepth(headwords []headword, entrySeque
} }
} }
} }
meta.entryDepth[entrySequence] = maxDepth meta.entryDepth[seq] = maxDepth
} }
func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEntry) { func (meta *jmdictMetadata) AddEntry(entry jmdict.JmdictEntry) {
partsOfSpeech := []string{}
// Determine how many senses are in this entry for this language senseCount := 0
if _, ok := meta.seqToSenseCount[entry.Sequence]; !ok { for _, sense := range entry.Sense {
senseCount := 0 // Only English-language senses contain part-of-speech info,
for _, entrySense := range entry.Sense { // but other languages need them for deinflection rules.
for _, gloss := range entrySense.Glossary { for _, pos := range sense.PartsOfSpeech {
if glossContainsLanguage(gloss, meta.language) { if !slices.Contains(partsOfSpeech, pos) {
senseCount += 1 partsOfSpeech = append(partsOfSpeech, pos)
break
}
} }
} }
meta.seqToSenseCount[entry.Sequence] = senseCount
}
if meta.seqToSenseCount[entry.Sequence] == 0 { if glossaryContainsLanguage(sense.Glossary, meta.language) {
senseCount += 1
} else {
continue
}
for _, reference := range sense.References {
meta.references = append(meta.references, reference)
}
for _, antonym := range sense.Antonyms {
meta.references = append(meta.references, antonym)
}
currentSenseID := senseID{entry.Sequence, senseCount}
glosses := []string{}
for _, gloss := range sense.Glossary {
if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
glosses = append(glosses, gloss.Content)
}
}
meta.condensedGlosses[currentSenseID] = strings.Join(glosses, "; ")
}
meta.seqToPartsOfSpeech[entry.Sequence] = partsOfSpeech
meta.seqToSenseCount[entry.Sequence] = senseCount
}
func (meta *jmdictMetadata) AddHeadword(headword headword, seq sequence) {
if meta.seqToSenseCount[seq] == 0 {
return return
} }
// main headwords (first ones that are found in entries). // main headwords (first ones that are found in entries).
if _, ok := meta.seqToMainHeadword[entry.Sequence]; !ok { if _, ok := meta.seqToMainHeadword[seq]; !ok {
meta.seqToMainHeadword[entry.Sequence] = headword meta.seqToMainHeadword[seq] = headword
} }
// hash the term pair so we can determine if it's used // hash the term pair so we can determine if it's used
// in more than one JMdict entry later. // in more than one JMdict entry later.
headwordHash := headword.Hash() headwordHash := headword.Hash()
if !slices.Contains(meta.headwordHashToSeqs[headwordHash], entry.Sequence) { if !slices.Contains(meta.headwordHashToSeqs[headwordHash], seq) {
meta.headwordHashToSeqs[headwordHash] = append(meta.headwordHashToSeqs[headwordHash], entry.Sequence) meta.headwordHashToSeqs[headwordHash] =
append(meta.headwordHashToSeqs[headwordHash], seq)
} }
// hash the expression so that we can determine if we // hash the expression so that we can determine if we
@ -88,7 +113,8 @@ func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEn
// in reference notes later. // in reference notes later.
expHash := headword.ExpHash() expHash := headword.ExpHash()
if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) { if !slices.Contains(meta.expHashToReadings[expHash], headword.Reading) {
meta.expHashToReadings[expHash] = append(meta.expHashToReadings[expHash], headword.Reading) meta.expHashToReadings[expHash] =
append(meta.expHashToReadings[expHash], headword.Reading)
} }
// e.g. for JMdict (English) we expect to end up with // e.g. for JMdict (English) we expect to end up with
@ -100,48 +126,17 @@ func (meta *jmdictMetadata) AddHeadword(headword headword, entry jmdict.JmdictEn
searchHash{headword.ReadingHash(), headword.IsPriority}, searchHash{headword.ReadingHash(), headword.IsPriority},
} }
for _, x := range searchHashes { for _, x := range searchHashes {
if !slices.Contains(meta.seqToSearchHashes[entry.Sequence], x) { if !slices.Contains(meta.seqToSearchHashes[seq], x) {
meta.seqToSearchHashes[entry.Sequence] = append(meta.seqToSearchHashes[entry.Sequence], x) meta.seqToSearchHashes[seq] = append(meta.seqToSearchHashes[seq], x)
} }
} }
currentSenseNumber := 1
for _, entrySense := range entry.Sense {
if !glossaryContainsLanguage(entrySense.Glossary, meta.language) {
continue
}
if entrySense.RestrictedReadings != nil && !slices.Contains(entrySense.RestrictedReadings, headword.Reading) {
currentSenseNumber += 1
continue
}
if entrySense.RestrictedKanji != nil && !slices.Contains(entrySense.RestrictedKanji, headword.Expression) {
currentSenseNumber += 1
continue
}
allReferences := append(entrySense.References, entrySense.Antonyms...)
for _, reference := range allReferences {
meta.references = append(meta.references, reference)
}
currentSense := senseID{entry.Sequence, currentSenseNumber}
if meta.condensedGlosses[currentSense] == "" {
glosses := []string{}
for _, gloss := range entrySense.Glossary {
if glossContainsLanguage(gloss, meta.language) && gloss.Type == nil {
glosses = append(glosses, gloss.Content)
}
}
meta.condensedGlosses[currentSense] = strings.Join(glosses, "; ")
}
currentSenseNumber += 1
}
} }
func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata { func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMetadata {
meta := jmdictMetadata{ meta := jmdictMetadata{
language: langNameToCode[languageName], language: langNameToCode[languageName],
seqToSenseCount: make(map[sequence]int), seqToSenseCount: make(map[sequence]int),
seqToPartsOfSpeech: make(map[sequence][]string),
condensedGlosses: make(map[senseID]string), condensedGlosses: make(map[senseID]string),
seqToMainHeadword: make(map[sequence]headword), seqToMainHeadword: make(map[sequence]headword),
expHashToReadings: make(map[hash][]string), expHashToReadings: make(map[hash][]string),
@ -157,10 +152,11 @@ func newJmdictMetadata(dictionary jmdict.Jmdict, languageName string) jmdictMeta
} }
for _, entry := range dictionary.Entries { for _, entry := range dictionary.Entries {
meta.AddEntry(entry)
headwords := extractHeadwords(entry) headwords := extractHeadwords(entry)
formCount := 0 formCount := 0
for _, headword := range headwords { for _, headword := range headwords {
meta.AddHeadword(headword, entry) meta.AddHeadword(headword, entry.Sequence)
if !headword.IsSearchOnly { if !headword.IsSearchOnly {
formCount += 1 formCount += 1
} }

View File

@ -96,39 +96,39 @@ func knownEntityTags() []dbTag {
// <misc> miscellaneous sense info // <misc> miscellaneous sense info
dbTag{Name: "abbr", Order: 0, Score: 0, Category: ""}, // abbreviation dbTag{Name: "abbr", Order: 0, Score: 0, Category: ""}, // abbreviation
dbTag{Name: "arch", Order: -4, Score: 0, Category: "archaism"}, // archaism dbTag{Name: "arch", Order: -4, Score: 0, Category: "archaism"}, // archaism
dbTag{Name: "char", Order: 0, Score: 0, Category: ""}, // character dbTag{Name: "char", Order: 4, Score: 0, Category: "name"}, // character
dbTag{Name: "chn", Order: 0, Score: 0, Category: ""}, // children's language dbTag{Name: "chn", Order: 0, Score: 0, Category: ""}, // children's language
dbTag{Name: "col", Order: 0, Score: 0, Category: ""}, // colloquialism dbTag{Name: "col", Order: 0, Score: 0, Category: ""}, // colloquialism
dbTag{Name: "company", Order: 4, Score: 0, Category: "name"}, // company name dbTag{Name: "company", Order: 4, Score: 0, Category: "name"}, // company name
dbTag{Name: "creat", Order: 0, Score: 0, Category: ""}, // creature dbTag{Name: "creat", Order: 4, Score: 0, Category: "name"}, // creature
dbTag{Name: "dated", Order: -4, Score: 0, Category: "archaism"}, // dated term dbTag{Name: "dated", Order: -4, Score: 0, Category: "archaism"}, // dated term
dbTag{Name: "dei", Order: 0, Score: 0, Category: ""}, // deity dbTag{Name: "dei", Order: 4, Score: 0, Category: "name"}, // deity
dbTag{Name: "derog", Order: 0, Score: 0, Category: ""}, // derogatory dbTag{Name: "derog", Order: 0, Score: 0, Category: ""}, // derogatory
dbTag{Name: "doc", Order: 0, Score: 0, Category: ""}, // document dbTag{Name: "doc", Order: 4, Score: 0, Category: "name"}, // document
dbTag{Name: "euph", Order: 0, Score: 0, Category: ""}, // euphemistic dbTag{Name: "euph", Order: 0, Score: 0, Category: ""}, // euphemistic
dbTag{Name: "ev", Order: 0, Score: 0, Category: ""}, // event dbTag{Name: "ev", Order: 4, Score: 0, Category: "name"}, // event
dbTag{Name: "fam", Order: 0, Score: 0, Category: ""}, // familiar language dbTag{Name: "fam", Order: 0, Score: 0, Category: ""}, // familiar language
dbTag{Name: "fem", Order: 4, Score: 0, Category: "name"}, // female term, language, or name dbTag{Name: "fem", Order: 4, Score: 0, Category: "name"}, // female term, language, or name
dbTag{Name: "fict", Order: 0, Score: 0, Category: ""}, // fiction dbTag{Name: "fict", Order: 4, Score: 0, Category: "name"}, // fiction
dbTag{Name: "form", Order: 0, Score: 0, Category: ""}, // formal or literary term dbTag{Name: "form", Order: 0, Score: 0, Category: ""}, // formal or literary term
dbTag{Name: "given", Order: 4, Score: 0, Category: "name"}, // given name or forename, gender not specified dbTag{Name: "given", Order: 4, Score: 0, Category: "name"}, // given name or forename, gender not specified
dbTag{Name: "group", Order: 0, Score: 0, Category: ""}, // group dbTag{Name: "group", Order: 4, Score: 0, Category: "name"}, // group
dbTag{Name: "hist", Order: 0, Score: 0, Category: ""}, // historical term dbTag{Name: "hist", Order: 0, Score: 0, Category: ""}, // historical term
dbTag{Name: "hon", Order: 0, Score: 0, Category: ""}, // honorific or respectful (sonkeigo) language dbTag{Name: "hon", Order: 0, Score: 0, Category: ""}, // honorific or respectful (sonkeigo) language
dbTag{Name: "hum", Order: 0, Score: 0, Category: ""}, // humble (kenjougo) language dbTag{Name: "hum", Order: 0, Score: 0, Category: ""}, // humble (kenjougo) language
dbTag{Name: "id", Order: -5, Score: 0, Category: "expression"}, // idiomatic expression dbTag{Name: "id", Order: -5, Score: 0, Category: "expression"}, // idiomatic expression
dbTag{Name: "joc", Order: 0, Score: 0, Category: ""}, // jocular, humorous term dbTag{Name: "joc", Order: 0, Score: 0, Category: ""}, // jocular, humorous term
dbTag{Name: "leg", Order: 0, Score: 0, Category: ""}, // legend dbTag{Name: "leg", Order: 4, Score: 0, Category: "name"}, // legend
dbTag{Name: "m-sl", Order: 0, Score: 0, Category: ""}, // manga slang dbTag{Name: "m-sl", Order: 0, Score: 0, Category: ""}, // manga slang
dbTag{Name: "male", Order: 4, Score: 0, Category: "name"}, // male term, language, or name dbTag{Name: "male", Order: 4, Score: 0, Category: "name"}, // male term, language, or name
dbTag{Name: "masc", Order: 4, Score: 0, Category: "name"}, // male term, language, or name dbTag{Name: "masc", Order: 4, Score: 0, Category: "name"}, // male term, language, or name
dbTag{Name: "myth", Order: 0, Score: 0, Category: ""}, // mythology dbTag{Name: "myth", Order: 4, Score: 0, Category: "name"}, // mythology
dbTag{Name: "net-sl", Order: 0, Score: 0, Category: ""}, // Internet slang dbTag{Name: "net-sl", Order: 0, Score: 0, Category: ""}, // Internet slang
dbTag{Name: "obj", Order: 0, Score: 0, Category: ""}, // object dbTag{Name: "obj", Order: 4, Score: 0, Category: "name"}, // object
dbTag{Name: "obs", Order: -4, Score: 0, Category: "archaism"}, // obsolete term dbTag{Name: "obs", Order: -4, Score: 0, Category: "archaism"}, // obsolete term
dbTag{Name: "on-mim", Order: 0, Score: 0, Category: ""}, // onomatopoeic or mimetic word dbTag{Name: "on-mim", Order: 0, Score: 0, Category: ""}, // onomatopoeic or mimetic word
dbTag{Name: "organization", Order: 4, Score: 0, Category: "name"}, // organization name dbTag{Name: "organization", Order: 4, Score: 0, Category: "name"}, // organization name
dbTag{Name: "oth", Order: 0, Score: 0, Category: ""}, // other dbTag{Name: "oth", Order: 4, Score: 0, Category: "name"}, // other
dbTag{Name: "person", Order: 4, Score: 0, Category: "name"}, // full name of a particular person dbTag{Name: "person", Order: 4, Score: 0, Category: "name"}, // full name of a particular person
dbTag{Name: "place", Order: 4, Score: 0, Category: "name"}, // place name dbTag{Name: "place", Order: 4, Score: 0, Category: "name"}, // place name
dbTag{Name: "poet", Order: 0, Score: 0, Category: ""}, // poetical term dbTag{Name: "poet", Order: 0, Score: 0, Category: ""}, // poetical term
@ -137,10 +137,10 @@ func knownEntityTags() []dbTag {
dbTag{Name: "proverb", Order: 0, Score: 0, Category: "expression"}, // proverb dbTag{Name: "proverb", Order: 0, Score: 0, Category: "expression"}, // proverb
dbTag{Name: "quote", Order: 0, Score: 0, Category: "expression"}, // quotation dbTag{Name: "quote", Order: 0, Score: 0, Category: "expression"}, // quotation
dbTag{Name: "rare", Order: -4, Score: 0, Category: "archaism"}, // rare dbTag{Name: "rare", Order: -4, Score: 0, Category: "archaism"}, // rare
dbTag{Name: "relig", Order: 0, Score: 0, Category: ""}, // religion dbTag{Name: "relig", Order: 4, Score: 0, Category: "name"}, // religion
dbTag{Name: "sens", Order: 0, Score: 0, Category: ""}, // sensitive dbTag{Name: "sens", Order: 0, Score: 0, Category: ""}, // sensitive
dbTag{Name: "serv", Order: 0, Score: 0, Category: ""}, // service dbTag{Name: "serv", Order: 4, Score: 0, Category: "name"}, // service
dbTag{Name: "ship", Order: 0, Score: 0, Category: ""}, // ship name dbTag{Name: "ship", Order: 4, Score: 0, Category: "name"}, // ship name
dbTag{Name: "sl", Order: 0, Score: 0, Category: ""}, // slang dbTag{Name: "sl", Order: 0, Score: 0, Category: ""}, // slang
dbTag{Name: "station", Order: 4, Score: 0, Category: "name"}, // railway station dbTag{Name: "station", Order: 4, Score: 0, Category: "name"}, // railway station
dbTag{Name: "surname", Order: 4, Score: 0, Category: "name"}, // family or surname dbTag{Name: "surname", Order: 4, Score: 0, Category: "name"}, // family or surname

View File

@ -8,19 +8,22 @@ import (
) )
func jmnedictPublicationDate(dictionary jmdict.Jmnedict) string { func jmnedictPublicationDate(dictionary jmdict.Jmnedict) string {
unknownDate := "unknown"
idx := len(dictionary.Entries) - 1
if len(dictionary.Entries) == 0 { if len(dictionary.Entries) == 0 {
return "unknown" return unknownDate
} } else if len(dictionary.Entries[idx].Translations) == 0 {
dateEntry := dictionary.Entries[len(dictionary.Entries)-1] return unknownDate
if len(dateEntry.Translations) == 0 || len(dateEntry.Translations[0].Translations) == 0 { } else if len(dictionary.Entries[idx].Translations[0].Translations) == 0 {
return "unknown" return unknownDate
} }
dateGloss := dictionary.Entries[idx].Translations[0].Translations[0]
r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`) r := regexp.MustCompile(`\d{4}-\d{2}-\d{2}`)
jmnedictDate := r.FindString(dateEntry.Translations[0].Translations[0]) date := r.FindString(dateGloss)
if jmnedictDate != "" { if date != "" {
return jmnedictDate return date
} else { } else {
return "unknown" return unknownDate
} }
} }

View File

@ -44,10 +44,7 @@ func replaceIterationMarks(text string) string {
// Returns nil if no segmentation is possible. // Returns nil if no segmentation is possible.
func makeKanaSegments(kana string) (segments []string) { func makeKanaSegments(kana string) (segments []string) {
hiragana := replaceIterationMarks(katakanaToHiragana(kana)) hiragana := replaceIterationMarks(katakanaToHiragana(kana))
kanaRunes := []rune{} kanaRunes := []rune(hiragana)
for _, kanaRune := range hiragana {
kanaRunes = append(kanaRunes, kanaRune)
}
kanaRuneCount := len(kanaRunes) kanaRuneCount := len(kanaRunes)
for i := 0; i < kanaRuneCount; i++ { for i := 0; i < kanaRuneCount; i++ {
for j := 0; j < kanaRuneCount-i; j++ { for j := 0; j < kanaRuneCount-i; j++ {