1

Cleanup & fixes

This commit is contained in:
Alex Yatskov 2016-12-18 11:46:47 -08:00
parent 764a3c4e0f
commit 1f95077e7b
7 changed files with 53 additions and 47 deletions

View File

@ -116,7 +116,7 @@ func (kanji dbKanjiList) crush() [][]interface{} {
func writeDb(outputDir, title string, termRecords [][]interface{}, kanjiRecords [][]interface{}, tagMeta map[string]dbTagMeta, pretty bool) error { func writeDb(outputDir, title string, termRecords [][]interface{}, kanjiRecords [][]interface{}, tagMeta map[string]dbTagMeta, pretty bool) error {
const DB_VERSION = 1 const DB_VERSION = 1
const BANK_STRIDE = 50000 const BANK_STRIDE = 10000
marshalJson := func(obj interface{}, pretty bool) ([]byte, error) { marshalJson := func(obj interface{}, pretty bool) ([]byte, error) {
if pretty { if pretty {

View File

@ -27,6 +27,13 @@ import (
"strings" "strings"
) )
// daijirinExtractor bundles the regular expressions used by the Daijirin
// extractor. makeDaijirinExtractor builds an instance and returns it as an
// epwingExtractor.
type daijirinExtractor struct {
// partsExp is compiled by makeDaijirinExtractor; its pattern captures a
// leading run of text followed by optional 【…】 and 〖…〗 groups.
partsExp *regexp.Regexp
// NOTE(review): the patterns for the remaining expressions are not visible
// in this chunk — confirm their purpose against makeDaijirinExtractor.
phonExp *regexp.Regexp
variantExp *regexp.Regexp
annotExp *regexp.Regexp
}
func makeDaijirinExtractor() epwingExtractor { func makeDaijirinExtractor() epwingExtractor {
return &daijirinExtractor{ return &daijirinExtractor{
partsExp: regexp.MustCompile(`([^(【〖]+)(?:【(.*)】)?(?:〖(.*)〗)?(?:(.*))?`), partsExp: regexp.MustCompile(`([^(【〖]+)(?:【(.*)】)?(?:〖(.*)〗)?(?:(.*))?`),

View File

@ -29,7 +29,7 @@ import (
"github.com/FooSoft/jmdict" "github.com/FooSoft/jmdict"
) )
func computeJmdictRules(term *dbTerm) { func jmdictBuildRules(term *dbTerm) {
for _, tag := range term.Tags { for _, tag := range term.Tags {
switch tag { switch tag {
case "adj-i", "v1", "vk", "vs": case "adj-i", "v1", "vk", "vs":
@ -42,7 +42,7 @@ func computeJmdictRules(term *dbTerm) {
} }
} }
func computeJmdictScore(term *dbTerm) { func jmdictBuildScore(term *dbTerm) {
term.Score = 0 term.Score = 0
for _, tag := range term.Tags { for _, tag := range term.Tags {
switch tag { switch tag {
@ -54,31 +54,37 @@ func computeJmdictScore(term *dbTerm) {
} }
} }
// jmdictAddPriorities converts JMdict priority markers into tags on term.
// Each recognized marker is added with its final character removed, so
// "news1" and "news2" both yield the tag "news". The "...1" markers also add
// the "P" tag. Priorities not listed in the switch are ignored.
func computeJmdictTagMeta(entities map[string]string) map[string]dbTagMeta { func jmdictAddPriorities(term *dbTerm, priorities ...string) {
for _, priority := range priorities {
switch priority {
case "news1", "ichi1", "spec1", "gai1":
// The "...1" markers are tagged "P" first, then deliberately fall
// through so they also receive the digit-stripped tag below.
term.addTags("P")
fallthrough
case "news2", "ichi2", "spec2", "gai2":
// Strip the trailing digit to get the base tag name.
term.addTags(priority[:len(priority)-1])
}
}
}
func jmdictBuildTagMeta(entities map[string]string) map[string]dbTagMeta {
tags := map[string]dbTagMeta{ tags := map[string]dbTagMeta{
"news1": {Notes: "appears frequently in Mainichi Shimbun (top listing)", Category: "frequent", Order: 3}, "news": {Notes: "appears frequently in Mainichi Shimbun"},
"ichi1": {Notes: "listed as common in Ichimango Goi Bunruishuu (top listing)", Category: "frequent", Order: 3}, "ichi": {Notes: "listed as common in Ichimango Goi Bunruishuu"},
"spec1": {Notes: "common words not included in frequency lists (top listing)", Category: "frequent", Order: 3}, "spec": {Notes: "common words not included in frequency lists"},
"gai1": {Notes: "common loanword (top listing)", Category: "frequent", Order: 3}, "gai": {Notes: "common loanword"},
"news2": {Notes: "appears frequently in Mainichi Shimbun (bottom listing)", Order: 3}, "P": {Notes: "popular term", Category: "popular", Order: -10},
"ichi2": {Notes: "listed as common in Ichimango Goi Bunruishuu (bottom listing)", Order: 3},
"spec2": {Notes: "common words not included in frequency lists (bottom listing)", Order: 3},
"gai2": {Notes: "common loanword (bottom listing)", Order: 3},
} }
for name, value := range entities { for name, value := range entities {
tag := dbTagMeta{Notes: value} tag := dbTagMeta{Notes: value}
switch name { switch name {
case "gai1", "ichi1", "news1", "spec1":
tag.Category = "frequent"
tag.Order = 1
case "exp", "id": case "exp", "id":
tag.Category = "expression" tag.Category = "expression"
tag.Order = 2 tag.Order = -5
case "arch", "iK": case "arch", "iK":
tag.Category = "archaism" tag.Category = "archaism"
tag.Order = 2 tag.Order = -5
} }
tags[name] = tag tags[name] = tag
@ -87,7 +93,7 @@ func computeJmdictTagMeta(entities map[string]string) map[string]dbTagMeta {
return tags return tags
} }
func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm { func jmdictExtractTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
var terms []dbTerm var terms []dbTerm
convert := func(reading jmdict.JmdictReading, kanji *jmdict.JmdictKanji) { convert := func(reading jmdict.JmdictReading, kanji *jmdict.JmdictKanji) {
@ -100,7 +106,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
if kanji == nil { if kanji == nil {
termBase.Expression = reading.Reading termBase.Expression = reading.Reading
termBase.addTags(reading.Priorities...) jmdictAddPriorities(&termBase, reading.Priorities...)
} else { } else {
termBase.Expression = kanji.Expression termBase.Expression = kanji.Expression
termBase.Reading = reading.Reading termBase.Reading = reading.Reading
@ -108,7 +114,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
for _, priority := range kanji.Priorities { for _, priority := range kanji.Priorities {
if hasString(priority, reading.Priorities) { if hasString(priority, reading.Priorities) {
termBase.addTags(priority) jmdictAddPriorities(&termBase, priority)
} }
} }
} }
@ -133,8 +139,8 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
term.Glossary = append(term.Glossary, glossary.Content) term.Glossary = append(term.Glossary, glossary.Content)
} }
computeJmdictRules(&term) jmdictBuildRules(&term)
computeJmdictScore(&term) jmdictBuildScore(&term)
terms = append(terms, term) terms = append(terms, term)
} }
@ -155,7 +161,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
return terms return terms
} }
func exportJmdictDb(outputDir, title string, reader io.Reader, flags int) error { func jmdictExportDb(outputDir, title string, reader io.Reader, flags int) error {
dict, entities, err := jmdict.LoadJmdictNoTransform(reader) dict, entities, err := jmdict.LoadJmdictNoTransform(reader)
if err != nil { if err != nil {
return err return err
@ -163,7 +169,7 @@ func exportJmdictDb(outputDir, title string, reader io.Reader, flags int) error
var terms dbTermList var terms dbTermList
for _, entry := range dict.Entries { for _, entry := range dict.Entries {
terms = append(terms, extractJmdictTerms(entry)...) terms = append(terms, jmdictExtractTerms(entry)...)
} }
return writeDb( return writeDb(
@ -171,7 +177,7 @@ func exportJmdictDb(outputDir, title string, reader io.Reader, flags int) error
title, title,
terms.crush(), terms.crush(),
nil, nil,
computeJmdictTagMeta(entities), jmdictBuildTagMeta(entities),
flags&flagPretty == flagPretty, flags&flagPretty == flagPretty,
) )
} }

View File

@ -28,7 +28,7 @@ import (
"github.com/FooSoft/jmdict" "github.com/FooSoft/jmdict"
) )
func computeJmnedictTagMeta(entities map[string]string) map[string]dbTagMeta { func jmnedictBuildTagMeta(entities map[string]string) map[string]dbTagMeta {
tags := make(map[string]dbTagMeta) tags := make(map[string]dbTagMeta)
for name, value := range entities { for name, value := range entities {
@ -46,7 +46,7 @@ func computeJmnedictTagMeta(entities map[string]string) map[string]dbTagMeta {
return tags return tags
} }
func extractJmnedictTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm { func jmnedictExtractTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
var terms []dbTerm var terms []dbTerm
convert := func(reading jmdict.JmnedictReading, kanji *jmdict.JmnedictKanji) { convert := func(reading jmdict.JmnedictReading, kanji *jmdict.JmnedictKanji) {
@ -95,7 +95,7 @@ func extractJmnedictTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
return terms return terms
} }
func exportJmnedictDb(outputDir, title string, reader io.Reader, flags int) error { func jmnedictExportDb(outputDir, title string, reader io.Reader, flags int) error {
dict, entities, err := jmdict.LoadJmnedictNoTransform(reader) dict, entities, err := jmdict.LoadJmnedictNoTransform(reader)
if err != nil { if err != nil {
return err return err
@ -103,7 +103,7 @@ func exportJmnedictDb(outputDir, title string, reader io.Reader, flags int) erro
var terms dbTermList var terms dbTermList
for _, e := range dict.Entries { for _, e := range dict.Entries {
terms = append(terms, extractJmnedictTerms(e)...) terms = append(terms, jmnedictExtractTerms(e)...)
} }
return writeDb( return writeDb(
@ -111,7 +111,7 @@ func exportJmnedictDb(outputDir, title string, reader io.Reader, flags int) erro
title, title,
terms.crush(), terms.crush(),
nil, nil,
computeJmnedictTagMeta(entities), jmnedictBuildTagMeta(entities),
flags&flagPretty == flagPretty, flags&flagPretty == flagPretty,
) )
} }

View File

@ -55,14 +55,7 @@ type epwingExtractor interface {
getFontWide() map[int]string getFontWide() map[int]string
} }
type daijirinExtractor struct { func epwingExportDb(outputDir, title string, reader io.Reader, flags int) error {
partsExp *regexp.Regexp
phonExp *regexp.Regexp
variantExp *regexp.Regexp
annotExp *regexp.Regexp
}
func exportEpwingDb(outputDir, title string, reader io.Reader, flags int) error {
data, err := ioutil.ReadAll(reader) data, err := ioutil.ReadAll(reader)
if err != nil { if err != nil {
return err return err

View File

@ -30,7 +30,7 @@ import (
"github.com/FooSoft/jmdict" "github.com/FooSoft/jmdict"
) )
func extractKanjidicKanji(entry jmdict.KanjidicCharacter) dbKanji { func kanjidicExtractKanji(entry jmdict.KanjidicCharacter) dbKanji {
kanji := dbKanji{Character: entry.Literal} kanji := dbKanji{Character: entry.Literal}
if level := entry.Misc.JlptLevel; level != nil { if level := entry.Misc.JlptLevel; level != nil {
@ -78,7 +78,7 @@ func extractKanjidicKanji(entry jmdict.KanjidicCharacter) dbKanji {
return kanji return kanji
} }
func exportKanjidicDb(outputDir, title string, reader io.Reader, flags int) error { func kanjidicExportDb(outputDir, title string, reader io.Reader, flags int) error {
dict, err := jmdict.LoadKanjidic(reader) dict, err := jmdict.LoadKanjidic(reader)
if err != nil { if err != nil {
return err return err
@ -86,12 +86,12 @@ func exportKanjidicDb(outputDir, title string, reader io.Reader, flags int) erro
var kanji dbKanjiList var kanji dbKanjiList
for _, entry := range dict.Characters { for _, entry := range dict.Characters {
kanji = append(kanji, extractKanjidicKanji(entry)) kanji = append(kanji, kanjidicExtractKanji(entry))
} }
tagMeta := map[string]dbTagMeta{ tagMeta := map[string]dbTagMeta{
"jouyou": {Notes: "included in list of regular-use characters", Category: "frequent", Order: 3}, "jouyou": {Notes: "included in list of regular-use characters", Category: "frequent", Order: -5},
"jinmeiyou": {Notes: "included in list of characters for use in personal names", Category: "frequent", Order: 3}, "jinmeiyou": {Notes: "included in list of characters for use in personal names", Category: "frequent", Order: -5},
"jlpt": {Notes: "corresponding Japanese Language Proficiency Test level"}, "jlpt": {Notes: "corresponding Japanese Language Proficiency Test level"},
"grade": {Notes: "school grade level at which the character is taught"}, "grade": {Notes: "school grade level at which the character is taught"},
"strokes": {Notes: "number of strokes needed to write the character"}, "strokes": {Notes: "number of strokes needed to write the character"},

View File

@ -44,10 +44,10 @@ func usage() {
func exportDb(inputPath, outputDir, format, title string, flags int) error { func exportDb(inputPath, outputDir, format, title string, flags int) error {
handlers := map[string]func(string, string, io.Reader, int) error{ handlers := map[string]func(string, string, io.Reader, int) error{
"edict": exportJmdictDb, "edict": jmdictExportDb,
"enamdict": exportJmnedictDb, "enamdict": jmnedictExportDb,
"kanjidic": exportKanjidicDb, "kanjidic": kanjidicExportDb,
"epwing": exportEpwingDb, "epwing": epwingExportDb,
} }
handler, ok := handlers[format] handler, ok := handlers[format]