Merge pull request #11 from siikamiika/dev
changes related to FooSoft/yomichan#84
This commit is contained in:
commit
cc4140fd4c
26
common.go
26
common.go
@ -74,18 +74,24 @@ func (freqs dbMetaList) crush() dbRecordList {
|
||||
}
|
||||
|
||||
type dbTerm struct {
|
||||
Expression string
|
||||
Reading string
|
||||
Tags []string
|
||||
Rules []string
|
||||
Score int
|
||||
Glossary []string
|
||||
Expression string
|
||||
Reading string
|
||||
DefinitionTags []string
|
||||
Rules []string
|
||||
Score int
|
||||
Glossary []string
|
||||
Sequence int
|
||||
TermTags []string
|
||||
}
|
||||
|
||||
type dbTermList []dbTerm
|
||||
|
||||
func (term *dbTerm) addTags(tags ...string) {
|
||||
term.Tags = appendStringUnique(term.Tags, tags...)
|
||||
func (term *dbTerm) addDefinitionTags(tags ...string) {
|
||||
term.DefinitionTags = appendStringUnique(term.DefinitionTags, tags...)
|
||||
}
|
||||
|
||||
func (term *dbTerm) addTermTags(tags ...string) {
|
||||
term.TermTags = appendStringUnique(term.TermTags, tags...)
|
||||
}
|
||||
|
||||
func (term *dbTerm) addRules(rules ...string) {
|
||||
@ -98,10 +104,12 @@ func (terms dbTermList) crush() dbRecordList {
|
||||
result := dbRecord{
|
||||
t.Expression,
|
||||
t.Reading,
|
||||
strings.Join(t.Tags, " "),
|
||||
strings.Join(t.DefinitionTags, " "),
|
||||
strings.Join(t.Rules, " "),
|
||||
t.Score,
|
||||
t.Glossary,
|
||||
t.Sequence,
|
||||
strings.Join(t.TermTags, " "),
|
||||
}
|
||||
|
||||
results = append(results, result)
|
||||
|
@ -47,7 +47,7 @@ func makeDaijirinExtractor() epwingExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
func (e *daijirinExtractor) extractTerms(entry epwingEntry) []dbTerm {
|
||||
func (e *daijirinExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
|
||||
matches := e.partsExp.FindStringSubmatch(entry.Heading)
|
||||
if matches == nil {
|
||||
return nil
|
||||
@ -86,6 +86,7 @@ func (e *daijirinExtractor) extractTerms(entry epwingEntry) []dbTerm {
|
||||
term := dbTerm{
|
||||
Expression: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
e.exportRules(&term, tags)
|
||||
@ -99,6 +100,7 @@ func (e *daijirinExtractor) extractTerms(entry epwingEntry) []dbTerm {
|
||||
Expression: expression,
|
||||
Reading: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
e.exportRules(&term, tags)
|
||||
|
@ -49,7 +49,7 @@ func makeDaijisenExtractor() epwingExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
func (e *daijisenExtractor) extractTerms(entry epwingEntry) []dbTerm {
|
||||
func (e *daijisenExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
|
||||
matches := e.partsExp.FindStringSubmatch(entry.Heading)
|
||||
if matches == nil {
|
||||
return nil
|
||||
@ -88,6 +88,7 @@ func (e *daijisenExtractor) extractTerms(entry epwingEntry) []dbTerm {
|
||||
term := dbTerm{
|
||||
Expression: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
e.exportRules(&term, tags)
|
||||
@ -99,6 +100,7 @@ func (e *daijisenExtractor) extractTerms(entry epwingEntry) []dbTerm {
|
||||
Expression: expression,
|
||||
Reading: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
e.exportRules(&term, tags)
|
||||
|
64
edict.go
64
edict.go
@ -29,10 +29,10 @@ import (
|
||||
"github.com/FooSoft/jmdict"
|
||||
)
|
||||
|
||||
const jmdictRevision = "jmdict3"
|
||||
const jmdictRevision = "jmdict4"
|
||||
|
||||
func jmdictBuildRules(term *dbTerm) {
|
||||
for _, tag := range term.Tags {
|
||||
for _, tag := range term.DefinitionTags {
|
||||
switch tag {
|
||||
case "adj-i", "v1", "vk":
|
||||
term.addRules(tag)
|
||||
@ -47,13 +47,19 @@ func jmdictBuildRules(term *dbTerm) {
|
||||
}
|
||||
|
||||
func jmdictBuildScore(term *dbTerm) {
|
||||
for _, tag := range term.Tags {
|
||||
for _, tag := range term.DefinitionTags {
|
||||
switch tag {
|
||||
case "news", "ichi", "spec", "gai":
|
||||
case "arch":
|
||||
term.Score -= 100
|
||||
}
|
||||
}
|
||||
for _, tag := range term.TermTags {
|
||||
switch tag {
|
||||
case "news", "ichi", "spec", "gai1":
|
||||
term.Score += 100
|
||||
case "P":
|
||||
term.Score += 500
|
||||
case "arch", "iK":
|
||||
case "iK":
|
||||
term.Score -= 100
|
||||
}
|
||||
}
|
||||
@ -63,10 +69,10 @@ func jmdictAddPriorities(term *dbTerm, priorities ...string) {
|
||||
for _, priority := range priorities {
|
||||
switch priority {
|
||||
case "news1", "ichi1", "spec1", "gai1":
|
||||
term.addTags("P")
|
||||
term.addTermTags("P")
|
||||
fallthrough
|
||||
case "news2", "ichi2", "spec2", "gai2":
|
||||
term.addTags(priority[:len(priority)-1])
|
||||
term.addTermTags(priority[:len(priority)-1])
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -90,6 +96,14 @@ func jmdictBuildTagMeta(entities map[string]string) dbTagList {
|
||||
case "arch", "iK":
|
||||
tag.Category = "archaism"
|
||||
tag.Order = -4
|
||||
case "adj-f", "adj-i", "adj-ix", "adj-ku", "adj-na", "adj-nari", "adj-no", "adj-pn", "adj-shiku", "adj-t", "adv", "adv-to", "aux-adj",
|
||||
"aux", "aux-v", "conj", "cop-da", "ctr", "int", "n-adv", "n", "n-pref", "n-pr", "n-suf", "n-t", "num", "pn", "pref", "prt", "suf",
|
||||
"unc", "v1", "v1-s", "v2a-s", "v2b-k", "v2d-s", "v2g-k", "v2g-s", "v2h-k", "v2h-s", "v2k-k", "v2k-s", "v2m-s", "v2n-s", "v2r-k",
|
||||
"v2r-s", "v2s-s", "v2t-k", "v2t-s", "v2w-s", "v2y-k", "v2y-s", "v2z-s", "v4b", "v4h", "v4k", "v4m", "v4r", "v4s", "v4t", "v5aru",
|
||||
"v5b", "v5g", "v5k", "v5k-s", "v5m", "v5n", "v5r-i", "v5r", "v5s", "v5t", "v5u", "v5u-s", "vi", "vk", "vn", "vr", "vs-c", "vs-i",
|
||||
"vs", "vs-s", "vt", "vz":
|
||||
tag.Category = "partOfSpeech"
|
||||
tag.Order = -3
|
||||
}
|
||||
|
||||
tags = append(tags, tag)
|
||||
@ -107,7 +121,7 @@ func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm
|
||||
}
|
||||
|
||||
var termBase dbTerm
|
||||
termBase.addTags(reading.Information...)
|
||||
termBase.addTermTags(reading.Information...)
|
||||
|
||||
if kanji == nil {
|
||||
termBase.Expression = reading.Reading
|
||||
@ -115,7 +129,7 @@ func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm
|
||||
} else {
|
||||
termBase.Expression = kanji.Expression
|
||||
termBase.Reading = reading.Reading
|
||||
termBase.addTags(kanji.Information...)
|
||||
termBase.addTermTags(kanji.Information...)
|
||||
|
||||
for _, priority := range kanji.Priorities {
|
||||
if hasString(priority, reading.Priorities) {
|
||||
@ -126,6 +140,11 @@ func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm
|
||||
|
||||
var partsOfSpeech []string
|
||||
for index, sense := range edictEntry.Sense {
|
||||
|
||||
if len(sense.PartsOfSpeech) != 0 {
|
||||
partsOfSpeech = sense.PartsOfSpeech
|
||||
}
|
||||
|
||||
if sense.RestrictedReadings != nil && !hasString(reading.Reading, sense.RestrictedReadings) {
|
||||
continue
|
||||
}
|
||||
@ -138,6 +157,7 @@ func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm
|
||||
Reading: termBase.Reading,
|
||||
Expression: termBase.Expression,
|
||||
Score: len(edictEntry.Sense) - index,
|
||||
Sequence: edictEntry.Sequence,
|
||||
}
|
||||
|
||||
for _, glossary := range sense.Glossary {
|
||||
@ -150,17 +170,12 @@ func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm
|
||||
continue
|
||||
}
|
||||
|
||||
term.addTags(termBase.Tags...)
|
||||
term.addTags(sense.PartsOfSpeech...)
|
||||
term.addTags(sense.Fields...)
|
||||
term.addTags(sense.Misc...)
|
||||
term.addTags(sense.Dialects...)
|
||||
|
||||
if index == 0 {
|
||||
partsOfSpeech = sense.PartsOfSpeech
|
||||
} else {
|
||||
term.addTags(partsOfSpeech...)
|
||||
}
|
||||
term.addDefinitionTags(termBase.DefinitionTags...)
|
||||
term.addTermTags(termBase.TermTags...)
|
||||
term.addDefinitionTags(partsOfSpeech...)
|
||||
term.addDefinitionTags(sense.Fields...)
|
||||
term.addDefinitionTags(sense.Misc...)
|
||||
term.addDefinitionTags(sense.Dialects...)
|
||||
|
||||
jmdictBuildRules(&term)
|
||||
jmdictBuildScore(&term)
|
||||
@ -172,7 +187,14 @@ func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm
|
||||
if len(edictEntry.Kanji) > 0 {
|
||||
for _, kanji := range edictEntry.Kanji {
|
||||
for _, reading := range edictEntry.Readings {
|
||||
convert(reading, &kanji)
|
||||
if reading.NoKanji == nil {
|
||||
convert(reading, &kanji)
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, reading := range edictEntry.Readings {
|
||||
if reading.NoKanji != nil {
|
||||
convert(reading, nil)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
10
enamdict.go
10
enamdict.go
@ -57,26 +57,26 @@ func jmnedictExtractTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
|
||||
}
|
||||
|
||||
var term dbTerm
|
||||
term.addTags(reading.Information...)
|
||||
term.Sequence = enamdictEntry.Sequence
|
||||
term.addTermTags(reading.Information...)
|
||||
|
||||
if kanji == nil {
|
||||
term.Expression = reading.Reading
|
||||
term.addTags(reading.Information...)
|
||||
} else {
|
||||
term.Expression = kanji.Expression
|
||||
term.Reading = reading.Reading
|
||||
term.addTags(kanji.Information...)
|
||||
term.addTermTags(kanji.Information...)
|
||||
|
||||
for _, priority := range kanji.Priorities {
|
||||
if hasString(priority, reading.Priorities) {
|
||||
term.addTags(priority)
|
||||
term.addTermTags(priority)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, trans := range enamdictEntry.Translations {
|
||||
term.Glossary = append(term.Glossary, trans.Translations...)
|
||||
term.addTags(trans.NameTypes...)
|
||||
term.addDefinitionTags(trans.NameTypes...)
|
||||
}
|
||||
|
||||
terms = append(terms, term)
|
||||
|
@ -55,7 +55,7 @@ type epwingBook struct {
|
||||
}
|
||||
|
||||
type epwingExtractor interface {
|
||||
extractTerms(entry epwingEntry) []dbTerm
|
||||
extractTerms(entry epwingEntry, sequence int) []dbTerm
|
||||
extractKanji(entry epwingEntry) []dbKanji
|
||||
getFontNarrow() map[int]string
|
||||
getFontWide() map[int]string
|
||||
@ -155,6 +155,8 @@ func epwingExportDb(inputPath, outputPath, language, title string, stride int, p
|
||||
|
||||
log.Println("formatting dictionary data...")
|
||||
|
||||
var sequence int
|
||||
|
||||
for _, subbook := range book.Subbooks {
|
||||
if extractor, ok := epwingExtractors[subbook.Title]; ok {
|
||||
fontNarrow := extractor.getFontNarrow()
|
||||
@ -185,8 +187,10 @@ func epwingExportDb(inputPath, outputPath, language, title string, stride int, p
|
||||
entry.Heading = translate(entry.Heading)
|
||||
entry.Text = translate(entry.Text)
|
||||
|
||||
terms = append(terms, extractor.extractTerms(entry)...)
|
||||
terms = append(terms, extractor.extractTerms(entry, sequence)...)
|
||||
kanji = append(kanji, extractor.extractKanji(entry)...)
|
||||
|
||||
sequence++
|
||||
}
|
||||
|
||||
revisions = append(revisions, extractor.getRevision())
|
||||
|
@ -43,7 +43,7 @@ func makeKotowazaExtractor() epwingExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
func (e *kotowazaExtractor) extractTerms(entry epwingEntry) []dbTerm {
|
||||
func (e *kotowazaExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
|
||||
heading := entry.Heading
|
||||
|
||||
queue := []string{heading}
|
||||
@ -93,6 +93,7 @@ func (e *kotowazaExtractor) extractTerms(entry epwingEntry) []dbTerm {
|
||||
Expression: expression,
|
||||
Reading: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
terms = append(terms, term)
|
||||
|
@ -77,7 +77,7 @@ func makeMeikyouExtractor() epwingExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
func (e *meikyouExtractor) extractTerms(entry epwingEntry) []dbTerm {
|
||||
func (e *meikyouExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
|
||||
matches := e.partsExp.FindStringSubmatch(entry.Heading)
|
||||
if matches == nil {
|
||||
return nil
|
||||
@ -127,6 +127,7 @@ func (e *meikyouExtractor) extractTerms(entry epwingEntry) []dbTerm {
|
||||
term := dbTerm{
|
||||
Expression: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
e.exportRules(&term, tags)
|
||||
@ -140,6 +141,7 @@ func (e *meikyouExtractor) extractTerms(entry epwingEntry) []dbTerm {
|
||||
Expression: expression,
|
||||
Reading: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
e.exportRules(&term, tags)
|
||||
|
11
rikai.go
11
rikai.go
@ -39,7 +39,7 @@ type rikaiEntry struct {
|
||||
}
|
||||
|
||||
func rikaiBuildRules(term *dbTerm) {
|
||||
for _, tag := range term.Tags {
|
||||
for _, tag := range term.DefinitionTags {
|
||||
switch tag {
|
||||
case "adj-i", "v1", "vk":
|
||||
term.addRules(tag)
|
||||
@ -54,7 +54,7 @@ func rikaiBuildRules(term *dbTerm) {
|
||||
}
|
||||
|
||||
func rikaiBuildScore(term *dbTerm) {
|
||||
for _, tag := range term.Tags {
|
||||
for _, tag := range term.DefinitionTags {
|
||||
switch tag {
|
||||
case "news", "ichi", "spec", "gai":
|
||||
term.Score++
|
||||
@ -73,6 +73,8 @@ func rikaiExtractTerms(rows *sql.Rows) (dbTermList, error) {
|
||||
readExp := regexp.MustCompile(`\[([^\]]+)\]`)
|
||||
tagExp := regexp.MustCompile(`[\s\(\),]`)
|
||||
|
||||
var sequence int
|
||||
|
||||
for rows.Next() {
|
||||
var (
|
||||
kanji, kana, entry *string
|
||||
@ -104,6 +106,7 @@ func rikaiExtractTerms(rows *sql.Rows) (dbTermList, error) {
|
||||
}
|
||||
|
||||
var term dbTerm
|
||||
term.Sequence = sequence
|
||||
if kana != nil {
|
||||
term.Expression = *kana
|
||||
term.Reading = *kana
|
||||
@ -118,7 +121,7 @@ func rikaiExtractTerms(rows *sql.Rows) (dbTermList, error) {
|
||||
if dfnMatch := dfnExp.FindStringSubmatch(segment); dfnMatch != nil {
|
||||
for _, tag := range tagExp.Split(dfnMatch[1], -1) {
|
||||
if rikaiTagParsed(tag) {
|
||||
term.addTags(tag)
|
||||
term.addDefinitionTags(tag)
|
||||
}
|
||||
}
|
||||
|
||||
@ -132,6 +135,8 @@ func rikaiExtractTerms(rows *sql.Rows) (dbTermList, error) {
|
||||
rikaiBuildScore(&term)
|
||||
|
||||
terms = append(terms, term)
|
||||
|
||||
sequence++
|
||||
}
|
||||
|
||||
return terms, nil
|
||||
|
3
wadai.go
3
wadai.go
@ -45,7 +45,7 @@ func makeWadaiExtractor() epwingExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
func (e *wadaiExtractor) extractTerms(entry epwingEntry) []dbTerm {
|
||||
func (e *wadaiExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
|
||||
matches := e.partsExp.FindStringSubmatch(entry.Heading)
|
||||
if matches == nil {
|
||||
return nil
|
||||
@ -90,6 +90,7 @@ func (e *wadaiExtractor) extractTerms(entry epwingEntry) []dbTerm {
|
||||
Expression: expression,
|
||||
Reading: reading,
|
||||
Glossary: []string{entry.Text},
|
||||
Sequence: sequence,
|
||||
}
|
||||
|
||||
terms = append(terms, term)
|
||||
|
Loading…
Reference in New Issue
Block a user