1

Merge pull request #11 from siikamiika/dev

changes related to FooSoft/yomichan#84
This commit is contained in:
Alex Yatskov 2017-10-13 08:40:17 -07:00 committed by GitHub
commit cc4140fd4c
10 changed files with 92 additions and 45 deletions

View File

@ -74,18 +74,24 @@ func (freqs dbMetaList) crush() dbRecordList {
}
type dbTerm struct {
Expression string
Reading string
Tags []string
Rules []string
Score int
Glossary []string
Expression string
Reading string
DefinitionTags []string
Rules []string
Score int
Glossary []string
Sequence int
TermTags []string
}
// dbTermList is a slice of dbTerm records.
type dbTermList []dbTerm
func (term *dbTerm) addTags(tags ...string) {
term.Tags = appendStringUnique(term.Tags, tags...)
// addDefinitionTags passes the term's current DefinitionTags together with the
// given tags to appendStringUnique and stores the result back on the term.
// NOTE(review): appendStringUnique is defined outside this view; presumably it
// skips values that are already present — confirm.
func (term *dbTerm) addDefinitionTags(tags ...string) {
term.DefinitionTags = appendStringUnique(term.DefinitionTags, tags...)
}
// addTermTags passes the term's current TermTags together with the given tags
// to appendStringUnique and stores the result back on the term.
// NOTE(review): appendStringUnique is defined outside this view; presumably it
// skips values that are already present — confirm.
func (term *dbTerm) addTermTags(tags ...string) {
term.TermTags = appendStringUnique(term.TermTags, tags...)
}
func (term *dbTerm) addRules(rules ...string) {
@ -98,10 +104,12 @@ func (terms dbTermList) crush() dbRecordList {
result := dbRecord{
t.Expression,
t.Reading,
strings.Join(t.Tags, " "),
strings.Join(t.DefinitionTags, " "),
strings.Join(t.Rules, " "),
t.Score,
t.Glossary,
t.Sequence,
strings.Join(t.TermTags, " "),
}
results = append(results, result)

View File

@ -47,7 +47,7 @@ func makeDaijirinExtractor() epwingExtractor {
}
}
func (e *daijirinExtractor) extractTerms(entry epwingEntry) []dbTerm {
func (e *daijirinExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
matches := e.partsExp.FindStringSubmatch(entry.Heading)
if matches == nil {
return nil
@ -86,6 +86,7 @@ func (e *daijirinExtractor) extractTerms(entry epwingEntry) []dbTerm {
term := dbTerm{
Expression: reading,
Glossary: []string{entry.Text},
Sequence: sequence,
}
e.exportRules(&term, tags)
@ -99,6 +100,7 @@ func (e *daijirinExtractor) extractTerms(entry epwingEntry) []dbTerm {
Expression: expression,
Reading: reading,
Glossary: []string{entry.Text},
Sequence: sequence,
}
e.exportRules(&term, tags)

View File

@ -49,7 +49,7 @@ func makeDaijisenExtractor() epwingExtractor {
}
}
func (e *daijisenExtractor) extractTerms(entry epwingEntry) []dbTerm {
func (e *daijisenExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
matches := e.partsExp.FindStringSubmatch(entry.Heading)
if matches == nil {
return nil
@ -88,6 +88,7 @@ func (e *daijisenExtractor) extractTerms(entry epwingEntry) []dbTerm {
term := dbTerm{
Expression: reading,
Glossary: []string{entry.Text},
Sequence: sequence,
}
e.exportRules(&term, tags)
@ -99,6 +100,7 @@ func (e *daijisenExtractor) extractTerms(entry epwingEntry) []dbTerm {
Expression: expression,
Reading: reading,
Glossary: []string{entry.Text},
Sequence: sequence,
}
e.exportRules(&term, tags)

View File

@ -29,10 +29,10 @@ import (
"github.com/FooSoft/jmdict"
)
const jmdictRevision = "jmdict3"
const jmdictRevision = "jmdict4"
func jmdictBuildRules(term *dbTerm) {
for _, tag := range term.Tags {
for _, tag := range term.DefinitionTags {
switch tag {
case "adj-i", "v1", "vk":
term.addRules(tag)
@ -47,13 +47,19 @@ func jmdictBuildRules(term *dbTerm) {
}
func jmdictBuildScore(term *dbTerm) {
for _, tag := range term.Tags {
for _, tag := range term.DefinitionTags {
switch tag {
case "news", "ichi", "spec", "gai":
case "arch":
term.Score -= 100
}
}
for _, tag := range term.TermTags {
switch tag {
case "news", "ichi", "spec", "gai1":
term.Score += 100
case "P":
term.Score += 500
case "arch", "iK":
case "iK":
term.Score -= 100
}
}
@ -63,10 +69,10 @@ func jmdictAddPriorities(term *dbTerm, priorities ...string) {
for _, priority := range priorities {
switch priority {
case "news1", "ichi1", "spec1", "gai1":
term.addTags("P")
term.addTermTags("P")
fallthrough
case "news2", "ichi2", "spec2", "gai2":
term.addTags(priority[:len(priority)-1])
term.addTermTags(priority[:len(priority)-1])
}
}
}
@ -90,6 +96,14 @@ func jmdictBuildTagMeta(entities map[string]string) dbTagList {
case "arch", "iK":
tag.Category = "archaism"
tag.Order = -4
case "adj-f", "adj-i", "adj-ix", "adj-ku", "adj-na", "adj-nari", "adj-no", "adj-pn", "adj-shiku", "adj-t", "adv", "adv-to", "aux-adj",
"aux", "aux-v", "conj", "cop-da", "ctr", "int", "n-adv", "n", "n-pref", "n-pr", "n-suf", "n-t", "num", "pn", "pref", "prt", "suf",
"unc", "v1", "v1-s", "v2a-s", "v2b-k", "v2d-s", "v2g-k", "v2g-s", "v2h-k", "v2h-s", "v2k-k", "v2k-s", "v2m-s", "v2n-s", "v2r-k",
"v2r-s", "v2s-s", "v2t-k", "v2t-s", "v2w-s", "v2y-k", "v2y-s", "v2z-s", "v4b", "v4h", "v4k", "v4m", "v4r", "v4s", "v4t", "v5aru",
"v5b", "v5g", "v5k", "v5k-s", "v5m", "v5n", "v5r-i", "v5r", "v5s", "v5t", "v5u", "v5u-s", "vi", "vk", "vn", "vr", "vs-c", "vs-i",
"vs", "vs-s", "vt", "vz":
tag.Category = "partOfSpeech"
tag.Order = -3
}
tags = append(tags, tag)
@ -107,7 +121,7 @@ func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm
}
var termBase dbTerm
termBase.addTags(reading.Information...)
termBase.addTermTags(reading.Information...)
if kanji == nil {
termBase.Expression = reading.Reading
@ -115,7 +129,7 @@ func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm
} else {
termBase.Expression = kanji.Expression
termBase.Reading = reading.Reading
termBase.addTags(kanji.Information...)
termBase.addTermTags(kanji.Information...)
for _, priority := range kanji.Priorities {
if hasString(priority, reading.Priorities) {
@ -126,6 +140,11 @@ func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm
var partsOfSpeech []string
for index, sense := range edictEntry.Sense {
if len(sense.PartsOfSpeech) != 0 {
partsOfSpeech = sense.PartsOfSpeech
}
if sense.RestrictedReadings != nil && !hasString(reading.Reading, sense.RestrictedReadings) {
continue
}
@ -138,6 +157,7 @@ func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm
Reading: termBase.Reading,
Expression: termBase.Expression,
Score: len(edictEntry.Sense) - index,
Sequence: edictEntry.Sequence,
}
for _, glossary := range sense.Glossary {
@ -150,17 +170,12 @@ func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm
continue
}
term.addTags(termBase.Tags...)
term.addTags(sense.PartsOfSpeech...)
term.addTags(sense.Fields...)
term.addTags(sense.Misc...)
term.addTags(sense.Dialects...)
if index == 0 {
partsOfSpeech = sense.PartsOfSpeech
} else {
term.addTags(partsOfSpeech...)
}
term.addDefinitionTags(termBase.DefinitionTags...)
term.addTermTags(termBase.TermTags...)
term.addDefinitionTags(partsOfSpeech...)
term.addDefinitionTags(sense.Fields...)
term.addDefinitionTags(sense.Misc...)
term.addDefinitionTags(sense.Dialects...)
jmdictBuildRules(&term)
jmdictBuildScore(&term)
@ -172,7 +187,14 @@ func jmdictExtractTerms(edictEntry jmdict.JmdictEntry, language string) []dbTerm
if len(edictEntry.Kanji) > 0 {
for _, kanji := range edictEntry.Kanji {
for _, reading := range edictEntry.Readings {
convert(reading, &kanji)
if reading.NoKanji == nil {
convert(reading, &kanji)
}
}
}
for _, reading := range edictEntry.Readings {
if reading.NoKanji != nil {
convert(reading, nil)
}
}
} else {

View File

@ -57,26 +57,26 @@ func jmnedictExtractTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
}
var term dbTerm
term.addTags(reading.Information...)
term.Sequence = enamdictEntry.Sequence
term.addTermTags(reading.Information...)
if kanji == nil {
term.Expression = reading.Reading
term.addTags(reading.Information...)
} else {
term.Expression = kanji.Expression
term.Reading = reading.Reading
term.addTags(kanji.Information...)
term.addTermTags(kanji.Information...)
for _, priority := range kanji.Priorities {
if hasString(priority, reading.Priorities) {
term.addTags(priority)
term.addTermTags(priority)
}
}
}
for _, trans := range enamdictEntry.Translations {
term.Glossary = append(term.Glossary, trans.Translations...)
term.addTags(trans.NameTypes...)
term.addDefinitionTags(trans.NameTypes...)
}
terms = append(terms, term)

View File

@ -55,7 +55,7 @@ type epwingBook struct {
}
type epwingExtractor interface {
extractTerms(entry epwingEntry) []dbTerm
extractTerms(entry epwingEntry, sequence int) []dbTerm
extractKanji(entry epwingEntry) []dbKanji
getFontNarrow() map[int]string
getFontWide() map[int]string
@ -155,6 +155,8 @@ func epwingExportDb(inputPath, outputPath, language, title string, stride int, p
log.Println("formatting dictionary data...")
var sequence int
for _, subbook := range book.Subbooks {
if extractor, ok := epwingExtractors[subbook.Title]; ok {
fontNarrow := extractor.getFontNarrow()
@ -185,8 +187,10 @@ func epwingExportDb(inputPath, outputPath, language, title string, stride int, p
entry.Heading = translate(entry.Heading)
entry.Text = translate(entry.Text)
terms = append(terms, extractor.extractTerms(entry)...)
terms = append(terms, extractor.extractTerms(entry, sequence)...)
kanji = append(kanji, extractor.extractKanji(entry)...)
sequence++
}
revisions = append(revisions, extractor.getRevision())

View File

@ -43,7 +43,7 @@ func makeKotowazaExtractor() epwingExtractor {
}
}
func (e *kotowazaExtractor) extractTerms(entry epwingEntry) []dbTerm {
func (e *kotowazaExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
heading := entry.Heading
queue := []string{heading}
@ -93,6 +93,7 @@ func (e *kotowazaExtractor) extractTerms(entry epwingEntry) []dbTerm {
Expression: expression,
Reading: reading,
Glossary: []string{entry.Text},
Sequence: sequence,
}
terms = append(terms, term)

View File

@ -77,7 +77,7 @@ func makeMeikyouExtractor() epwingExtractor {
}
}
func (e *meikyouExtractor) extractTerms(entry epwingEntry) []dbTerm {
func (e *meikyouExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
matches := e.partsExp.FindStringSubmatch(entry.Heading)
if matches == nil {
return nil
@ -127,6 +127,7 @@ func (e *meikyouExtractor) extractTerms(entry epwingEntry) []dbTerm {
term := dbTerm{
Expression: reading,
Glossary: []string{entry.Text},
Sequence: sequence,
}
e.exportRules(&term, tags)
@ -140,6 +141,7 @@ func (e *meikyouExtractor) extractTerms(entry epwingEntry) []dbTerm {
Expression: expression,
Reading: reading,
Glossary: []string{entry.Text},
Sequence: sequence,
}
e.exportRules(&term, tags)

View File

@ -39,7 +39,7 @@ type rikaiEntry struct {
}
func rikaiBuildRules(term *dbTerm) {
for _, tag := range term.Tags {
for _, tag := range term.DefinitionTags {
switch tag {
case "adj-i", "v1", "vk":
term.addRules(tag)
@ -54,7 +54,7 @@ func rikaiBuildRules(term *dbTerm) {
}
func rikaiBuildScore(term *dbTerm) {
for _, tag := range term.Tags {
for _, tag := range term.DefinitionTags {
switch tag {
case "news", "ichi", "spec", "gai":
term.Score++
@ -73,6 +73,8 @@ func rikaiExtractTerms(rows *sql.Rows) (dbTermList, error) {
readExp := regexp.MustCompile(`\[([^\]]+)\]`)
tagExp := regexp.MustCompile(`[\s\(\),]`)
var sequence int
for rows.Next() {
var (
kanji, kana, entry *string
@ -104,6 +106,7 @@ func rikaiExtractTerms(rows *sql.Rows) (dbTermList, error) {
}
var term dbTerm
term.Sequence = sequence
if kana != nil {
term.Expression = *kana
term.Reading = *kana
@ -118,7 +121,7 @@ func rikaiExtractTerms(rows *sql.Rows) (dbTermList, error) {
if dfnMatch := dfnExp.FindStringSubmatch(segment); dfnMatch != nil {
for _, tag := range tagExp.Split(dfnMatch[1], -1) {
if rikaiTagParsed(tag) {
term.addTags(tag)
term.addDefinitionTags(tag)
}
}
@ -132,6 +135,8 @@ func rikaiExtractTerms(rows *sql.Rows) (dbTermList, error) {
rikaiBuildScore(&term)
terms = append(terms, term)
sequence++
}
return terms, nil

View File

@ -45,7 +45,7 @@ func makeWadaiExtractor() epwingExtractor {
}
}
func (e *wadaiExtractor) extractTerms(entry epwingEntry) []dbTerm {
func (e *wadaiExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
matches := e.partsExp.FindStringSubmatch(entry.Heading)
if matches == nil {
return nil
@ -90,6 +90,7 @@ func (e *wadaiExtractor) extractTerms(entry epwingEntry) []dbTerm {
Expression: expression,
Reading: reading,
Glossary: []string{entry.Text},
Sequence: sequence,
}
terms = append(terms, term)