1

Cleanup & fixes

This commit is contained in:
Alex Yatskov 2016-12-18 11:46:47 -08:00
parent 764a3c4e0f
commit 1f95077e7b
7 changed files with 53 additions and 47 deletions

View File

@ -116,7 +116,7 @@ func (kanji dbKanjiList) crush() [][]interface{} {
func writeDb(outputDir, title string, termRecords [][]interface{}, kanjiRecords [][]interface{}, tagMeta map[string]dbTagMeta, pretty bool) error {
const DB_VERSION = 1
const BANK_STRIDE = 50000
const BANK_STRIDE = 10000
marshalJson := func(obj interface{}, pretty bool) ([]byte, error) {
if pretty {

View File

@ -27,6 +27,13 @@ import (
"strings"
)
// daijirinExtractor carries the regular expressions used by the Daijirin
// extractor. All four fields are *regexp.Regexp values.
// NOTE(review): the individual role of each expression is inferred from the
// field names only — confirm against the methods that use them.
type daijirinExtractor struct {
	partsExp, phonExp, variantExp, annotExp *regexp.Regexp
}
func makeDaijirinExtractor() epwingExtractor {
return &daijirinExtractor{
partsExp: regexp.MustCompile(`([^(【〖]+)(?:【(.*)】)?(?:〖(.*)〗)?(?:(.*))?`),

View File

@ -29,7 +29,7 @@ import (
"github.com/FooSoft/jmdict"
)
func computeJmdictRules(term *dbTerm) {
func jmdictBuildRules(term *dbTerm) {
for _, tag := range term.Tags {
switch tag {
case "adj-i", "v1", "vk", "vs":
@ -42,7 +42,7 @@ func computeJmdictRules(term *dbTerm) {
}
}
func computeJmdictScore(term *dbTerm) {
func jmdictBuildScore(term *dbTerm) {
term.Score = 0
for _, tag := range term.Tags {
switch tag {
@ -54,31 +54,37 @@ func computeJmdictScore(term *dbTerm) {
}
}
func computeJmdictTagMeta(entities map[string]string) map[string]dbTagMeta {
// jmdictAddPriorities adds tags to term for each recognised priority marker.
//
// For "news1", "ichi1", "spec1" and "gai1" it first adds the tag "P" and then
// the marker with its final character removed. For "news2", "ichi2", "spec2"
// and "gai2" it adds only the marker with its final character removed. Any
// other marker is ignored.
func jmdictAddPriorities(term *dbTerm, priorities ...string) {
	for _, marker := range priorities {
		topListing := false

		switch marker {
		case "news1", "ichi1", "spec1", "gai1":
			topListing = true
		case "news2", "ichi2", "spec2", "gai2":
			// Recognised, but does not earn the "P" tag.
		default:
			continue
		}

		if topListing {
			term.addTags("P")
		}

		// Drop the trailing character so "news1" and "news2" both become "news".
		base := marker[:len(marker)-1]
		term.addTags(base)
	}
}
func jmdictBuildTagMeta(entities map[string]string) map[string]dbTagMeta {
tags := map[string]dbTagMeta{
"news1": {Notes: "appears frequently in Mainichi Shimbun (top listing)", Category: "frequent", Order: 3},
"ichi1": {Notes: "listed as common in Ichimango Goi Bunruishuu (top listing)", Category: "frequent", Order: 3},
"spec1": {Notes: "common words not included in frequency lists (top listing)", Category: "frequent", Order: 3},
"gai1": {Notes: "common loanword (top listing)", Category: "frequent", Order: 3},
"news2": {Notes: "appears frequently in Mainichi Shimbun (bottom listing)", Order: 3},
"ichi2": {Notes: "listed as common in Ichimango Goi Bunruishuu (bottom listing)", Order: 3},
"spec2": {Notes: "common words not included in frequency lists (bottom listing)", Order: 3},
"gai2": {Notes: "common loanword (bottom listing)", Order: 3},
"news": {Notes: "appears frequently in Mainichi Shimbun"},
"ichi": {Notes: "listed as common in Ichimango Goi Bunruishuu"},
"spec": {Notes: "common words not included in frequency lists"},
"gai": {Notes: "common loanword"},
"P": {Notes: "popular term", Category: "popular", Order: -10},
}
for name, value := range entities {
tag := dbTagMeta{Notes: value}
switch name {
case "gai1", "ichi1", "news1", "spec1":
tag.Category = "frequent"
tag.Order = 1
case "exp", "id":
tag.Category = "expression"
tag.Order = 2
tag.Order = -5
case "arch", "iK":
tag.Category = "archaism"
tag.Order = 2
tag.Order = -5
}
tags[name] = tag
@ -87,7 +93,7 @@ func computeJmdictTagMeta(entities map[string]string) map[string]dbTagMeta {
return tags
}
func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
func jmdictExtractTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
var terms []dbTerm
convert := func(reading jmdict.JmdictReading, kanji *jmdict.JmdictKanji) {
@ -100,7 +106,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
if kanji == nil {
termBase.Expression = reading.Reading
termBase.addTags(reading.Priorities...)
jmdictAddPriorities(&termBase, reading.Priorities...)
} else {
termBase.Expression = kanji.Expression
termBase.Reading = reading.Reading
@ -108,7 +114,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
for _, priority := range kanji.Priorities {
if hasString(priority, reading.Priorities) {
termBase.addTags(priority)
jmdictAddPriorities(&termBase, priority)
}
}
}
@ -133,8 +139,8 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
term.Glossary = append(term.Glossary, glossary.Content)
}
computeJmdictRules(&term)
computeJmdictScore(&term)
jmdictBuildRules(&term)
jmdictBuildScore(&term)
terms = append(terms, term)
}
@ -155,7 +161,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
return terms
}
func exportJmdictDb(outputDir, title string, reader io.Reader, flags int) error {
func jmdictExportDb(outputDir, title string, reader io.Reader, flags int) error {
dict, entities, err := jmdict.LoadJmdictNoTransform(reader)
if err != nil {
return err
@ -163,7 +169,7 @@ func exportJmdictDb(outputDir, title string, reader io.Reader, flags int) error
var terms dbTermList
for _, entry := range dict.Entries {
terms = append(terms, extractJmdictTerms(entry)...)
terms = append(terms, jmdictExtractTerms(entry)...)
}
return writeDb(
@ -171,7 +177,7 @@ func exportJmdictDb(outputDir, title string, reader io.Reader, flags int) error
title,
terms.crush(),
nil,
computeJmdictTagMeta(entities),
jmdictBuildTagMeta(entities),
flags&flagPretty == flagPretty,
)
}

View File

@ -28,7 +28,7 @@ import (
"github.com/FooSoft/jmdict"
)
func computeJmnedictTagMeta(entities map[string]string) map[string]dbTagMeta {
func jmnedictBuildTagMeta(entities map[string]string) map[string]dbTagMeta {
tags := make(map[string]dbTagMeta)
for name, value := range entities {
@ -46,7 +46,7 @@ func computeJmnedictTagMeta(entities map[string]string) map[string]dbTagMeta {
return tags
}
func extractJmnedictTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
func jmnedictExtractTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
var terms []dbTerm
convert := func(reading jmdict.JmnedictReading, kanji *jmdict.JmnedictKanji) {
@ -95,7 +95,7 @@ func extractJmnedictTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
return terms
}
func exportJmnedictDb(outputDir, title string, reader io.Reader, flags int) error {
func jmnedictExportDb(outputDir, title string, reader io.Reader, flags int) error {
dict, entities, err := jmdict.LoadJmnedictNoTransform(reader)
if err != nil {
return err
@ -103,7 +103,7 @@ func exportJmnedictDb(outputDir, title string, reader io.Reader, flags int) erro
var terms dbTermList
for _, e := range dict.Entries {
terms = append(terms, extractJmnedictTerms(e)...)
terms = append(terms, jmnedictExtractTerms(e)...)
}
return writeDb(
@ -111,7 +111,7 @@ func exportJmnedictDb(outputDir, title string, reader io.Reader, flags int) erro
title,
terms.crush(),
nil,
computeJmnedictTagMeta(entities),
jmnedictBuildTagMeta(entities),
flags&flagPretty == flagPretty,
)
}

View File

@ -55,14 +55,7 @@ type epwingExtractor interface {
getFontWide() map[int]string
}
// daijirinExtractor carries the regular expressions used by the Daijirin
// extractor. All four fields are *regexp.Regexp values.
// NOTE(review): the individual role of each expression is inferred from the
// field names only — confirm against the methods that use them.
type daijirinExtractor struct {
	partsExp, phonExp, variantExp, annotExp *regexp.Regexp
}
func exportEpwingDb(outputDir, title string, reader io.Reader, flags int) error {
func epwingExportDb(outputDir, title string, reader io.Reader, flags int) error {
data, err := ioutil.ReadAll(reader)
if err != nil {
return err

View File

@ -30,7 +30,7 @@ import (
"github.com/FooSoft/jmdict"
)
func extractKanjidicKanji(entry jmdict.KanjidicCharacter) dbKanji {
func kanjidicExtractKanji(entry jmdict.KanjidicCharacter) dbKanji {
kanji := dbKanji{Character: entry.Literal}
if level := entry.Misc.JlptLevel; level != nil {
@ -78,7 +78,7 @@ func extractKanjidicKanji(entry jmdict.KanjidicCharacter) dbKanji {
return kanji
}
func exportKanjidicDb(outputDir, title string, reader io.Reader, flags int) error {
func kanjidicExportDb(outputDir, title string, reader io.Reader, flags int) error {
dict, err := jmdict.LoadKanjidic(reader)
if err != nil {
return err
@ -86,12 +86,12 @@ func exportKanjidicDb(outputDir, title string, reader io.Reader, flags int) erro
var kanji dbKanjiList
for _, entry := range dict.Characters {
kanji = append(kanji, extractKanjidicKanji(entry))
kanji = append(kanji, kanjidicExtractKanji(entry))
}
tagMeta := map[string]dbTagMeta{
"jouyou": {Notes: "included in list of regular-use characters", Category: "frequent", Order: 3},
"jinmeiyou": {Notes: "included in list of characters for use in personal names", Category: "frequent", Order: 3},
"jouyou": {Notes: "included in list of regular-use characters", Category: "frequent", Order: -5},
"jinmeiyou": {Notes: "included in list of characters for use in personal names", Category: "frequent", Order: -5},
"jlpt": {Notes: "corresponding Japanese Language Proficiency Test level"},
"grade": {Notes: "school grade level at which the character is taught"},
"strokes": {Notes: "number of strokes needed to write the character"},

View File

@ -44,10 +44,10 @@ func usage() {
func exportDb(inputPath, outputDir, format, title string, flags int) error {
handlers := map[string]func(string, string, io.Reader, int) error{
"edict": exportJmdictDb,
"enamdict": exportJmnedictDb,
"kanjidic": exportKanjidicDb,
"epwing": exportEpwingDb,
"edict": jmdictExportDb,
"enamdict": jmnedictExportDb,
"kanjidic": kanjidicExportDb,
"epwing": epwingExportDb,
}
handler, ok := handlers[format]