Cleanup & fixes
This commit is contained in:
parent
764a3c4e0f
commit
1f95077e7b
@ -116,7 +116,7 @@ func (kanji dbKanjiList) crush() [][]interface{} {
|
||||
|
||||
func writeDb(outputDir, title string, termRecords [][]interface{}, kanjiRecords [][]interface{}, tagMeta map[string]dbTagMeta, pretty bool) error {
|
||||
const DB_VERSION = 1
|
||||
const BANK_STRIDE = 50000
|
||||
const BANK_STRIDE = 10000
|
||||
|
||||
marshalJson := func(obj interface{}, pretty bool) ([]byte, error) {
|
||||
if pretty {
|
||||
|
@ -27,6 +27,13 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
// daijirinExtractor groups the compiled regular expressions used by the
// Daijirin EPWING extractor. Instances are built by makeDaijirinExtractor,
// which returns the value as an epwingExtractor.
type daijirinExtractor struct {
|
||||
// partsExp is compiled in makeDaijirinExtractor from a pattern with an
// unbracketed leading group followed by optional 【…】, 〖…〗 and (…) groups.
partsExp *regexp.Regexp
|
||||
// NOTE(review): the patterns for the three fields below are not visible in
// this hunk; their purposes are presumably implied by the names — confirm.
phonExp *regexp.Regexp
|
||||
variantExp *regexp.Regexp
|
||||
annotExp *regexp.Regexp
|
||||
}
|
||||
|
||||
func makeDaijirinExtractor() epwingExtractor {
|
||||
return &daijirinExtractor{
|
||||
partsExp: regexp.MustCompile(`([^(【〖]+)(?:【(.*)】)?(?:〖(.*)〗)?(?:((.*)))?`),
|
||||
|
54
edict.go
54
edict.go
@ -29,7 +29,7 @@ import (
|
||||
"github.com/FooSoft/jmdict"
|
||||
)
|
||||
|
||||
func computeJmdictRules(term *dbTerm) {
|
||||
func jmdictBuildRules(term *dbTerm) {
|
||||
for _, tag := range term.Tags {
|
||||
switch tag {
|
||||
case "adj-i", "v1", "vk", "vs":
|
||||
@ -42,7 +42,7 @@ func computeJmdictRules(term *dbTerm) {
|
||||
}
|
||||
}
|
||||
|
||||
func computeJmdictScore(term *dbTerm) {
|
||||
func jmdictBuildScore(term *dbTerm) {
|
||||
term.Score = 0
|
||||
for _, tag := range term.Tags {
|
||||
switch tag {
|
||||
@ -54,31 +54,37 @@ func computeJmdictScore(term *dbTerm) {
|
||||
}
|
||||
}
|
||||
|
||||
func computeJmdictTagMeta(entities map[string]string) map[string]dbTagMeta {
|
||||
// jmdictAddPriorities translates JMdict priority markers into tags on term.
//
// For each entry in priorities:
//   - "news1", "ichi1", "spec1", "gai1" add the tag "P" and then, via
//     fallthrough, the marker with its final character removed
//     (e.g. "news1" -> "news").
//   - "news2", "ichi2", "spec2", "gai2" add only the marker with its final
//     character removed (e.g. "news2" -> "news").
//   - any other value is ignored.
func jmdictAddPriorities(term *dbTerm, priorities ...string) {
|
||||
for _, priority := range priorities {
|
||||
switch priority {
|
||||
case "news1", "ichi1", "spec1", "gai1":
|
||||
term.addTags("P")
|
||||
// Deliberate fallthrough: "1"-level markers also get the base tag below.
fallthrough
|
||||
case "news2", "ichi2", "spec2", "gai2":
|
||||
// Drop the trailing level digit so both levels share one base tag name.
term.addTags(priority[:len(priority)-1])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func jmdictBuildTagMeta(entities map[string]string) map[string]dbTagMeta {
|
||||
tags := map[string]dbTagMeta{
|
||||
"news1": {Notes: "appears frequently in Mainichi Shimbun (top listing)", Category: "frequent", Order: 3},
|
||||
"ichi1": {Notes: "listed as common in Ichimango Goi Bunruishuu (top listing)", Category: "frequent", Order: 3},
|
||||
"spec1": {Notes: "common words not included in frequency lists (top listing)", Category: "frequent", Order: 3},
|
||||
"gai1": {Notes: "common loanword (top listing)", Category: "frequent", Order: 3},
|
||||
"news2": {Notes: "appears frequently in Mainichi Shimbun (bottom listing)", Order: 3},
|
||||
"ichi2": {Notes: "listed as common in Ichimango Goi Bunruishuu (bottom listing)", Order: 3},
|
||||
"spec2": {Notes: "common words not included in frequency lists (bottom listing)", Order: 3},
|
||||
"gai2": {Notes: "common loanword (bottom listing)", Order: 3},
|
||||
"news": {Notes: "appears frequently in Mainichi Shimbun"},
|
||||
"ichi": {Notes: "listed as common in Ichimango Goi Bunruishuu"},
|
||||
"spec": {Notes: "common words not included in frequency lists"},
|
||||
"gai": {Notes: "common loanword"},
|
||||
"P": {Notes: "popular term", Category: "popular", Order: -10},
|
||||
}
|
||||
|
||||
for name, value := range entities {
|
||||
tag := dbTagMeta{Notes: value}
|
||||
|
||||
switch name {
|
||||
case "gai1", "ichi1", "news1", "spec1":
|
||||
tag.Category = "frequent"
|
||||
tag.Order = 1
|
||||
case "exp", "id":
|
||||
tag.Category = "expression"
|
||||
tag.Order = 2
|
||||
tag.Order = -5
|
||||
case "arch", "iK":
|
||||
tag.Category = "archaism"
|
||||
tag.Order = 2
|
||||
tag.Order = -5
|
||||
}
|
||||
|
||||
tags[name] = tag
|
||||
@ -87,7 +93,7 @@ func computeJmdictTagMeta(entities map[string]string) map[string]dbTagMeta {
|
||||
return tags
|
||||
}
|
||||
|
||||
func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
|
||||
func jmdictExtractTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
|
||||
var terms []dbTerm
|
||||
|
||||
convert := func(reading jmdict.JmdictReading, kanji *jmdict.JmdictKanji) {
|
||||
@ -100,7 +106,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
|
||||
|
||||
if kanji == nil {
|
||||
termBase.Expression = reading.Reading
|
||||
termBase.addTags(reading.Priorities...)
|
||||
jmdictAddPriorities(&termBase, reading.Priorities...)
|
||||
} else {
|
||||
termBase.Expression = kanji.Expression
|
||||
termBase.Reading = reading.Reading
|
||||
@ -108,7 +114,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
|
||||
|
||||
for _, priority := range kanji.Priorities {
|
||||
if hasString(priority, reading.Priorities) {
|
||||
termBase.addTags(priority)
|
||||
jmdictAddPriorities(&termBase, priority)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -133,8 +139,8 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
|
||||
term.Glossary = append(term.Glossary, glossary.Content)
|
||||
}
|
||||
|
||||
computeJmdictRules(&term)
|
||||
computeJmdictScore(&term)
|
||||
jmdictBuildRules(&term)
|
||||
jmdictBuildScore(&term)
|
||||
|
||||
terms = append(terms, term)
|
||||
}
|
||||
@ -155,7 +161,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
|
||||
return terms
|
||||
}
|
||||
|
||||
func exportJmdictDb(outputDir, title string, reader io.Reader, flags int) error {
|
||||
func jmdictExportDb(outputDir, title string, reader io.Reader, flags int) error {
|
||||
dict, entities, err := jmdict.LoadJmdictNoTransform(reader)
|
||||
if err != nil {
|
||||
return err
|
||||
@ -163,7 +169,7 @@ func exportJmdictDb(outputDir, title string, reader io.Reader, flags int) error
|
||||
|
||||
var terms dbTermList
|
||||
for _, entry := range dict.Entries {
|
||||
terms = append(terms, extractJmdictTerms(entry)...)
|
||||
terms = append(terms, jmdictExtractTerms(entry)...)
|
||||
}
|
||||
|
||||
return writeDb(
|
||||
@ -171,7 +177,7 @@ func exportJmdictDb(outputDir, title string, reader io.Reader, flags int) error
|
||||
title,
|
||||
terms.crush(),
|
||||
nil,
|
||||
computeJmdictTagMeta(entities),
|
||||
jmdictBuildTagMeta(entities),
|
||||
flags&flagPretty == flagPretty,
|
||||
)
|
||||
}
|
||||
|
10
enamdict.go
10
enamdict.go
@ -28,7 +28,7 @@ import (
|
||||
"github.com/FooSoft/jmdict"
|
||||
)
|
||||
|
||||
func computeJmnedictTagMeta(entities map[string]string) map[string]dbTagMeta {
|
||||
func jmnedictBuildTagMeta(entities map[string]string) map[string]dbTagMeta {
|
||||
tags := make(map[string]dbTagMeta)
|
||||
|
||||
for name, value := range entities {
|
||||
@ -46,7 +46,7 @@ func computeJmnedictTagMeta(entities map[string]string) map[string]dbTagMeta {
|
||||
return tags
|
||||
}
|
||||
|
||||
func extractJmnedictTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
|
||||
func jmnedictExtractTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
|
||||
var terms []dbTerm
|
||||
|
||||
convert := func(reading jmdict.JmnedictReading, kanji *jmdict.JmnedictKanji) {
|
||||
@ -95,7 +95,7 @@ func extractJmnedictTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
|
||||
return terms
|
||||
}
|
||||
|
||||
func exportJmnedictDb(outputDir, title string, reader io.Reader, flags int) error {
|
||||
func jmnedictExportDb(outputDir, title string, reader io.Reader, flags int) error {
|
||||
dict, entities, err := jmdict.LoadJmnedictNoTransform(reader)
|
||||
if err != nil {
|
||||
return err
|
||||
@ -103,7 +103,7 @@ func exportJmnedictDb(outputDir, title string, reader io.Reader, flags int) erro
|
||||
|
||||
var terms dbTermList
|
||||
for _, e := range dict.Entries {
|
||||
terms = append(terms, extractJmnedictTerms(e)...)
|
||||
terms = append(terms, jmnedictExtractTerms(e)...)
|
||||
}
|
||||
|
||||
return writeDb(
|
||||
@ -111,7 +111,7 @@ func exportJmnedictDb(outputDir, title string, reader io.Reader, flags int) erro
|
||||
title,
|
||||
terms.crush(),
|
||||
nil,
|
||||
computeJmnedictTagMeta(entities),
|
||||
jmnedictBuildTagMeta(entities),
|
||||
flags&flagPretty == flagPretty,
|
||||
)
|
||||
}
|
||||
|
@ -55,14 +55,7 @@ type epwingExtractor interface {
|
||||
getFontWide() map[int]string
|
||||
}
|
||||
|
||||
type daijirinExtractor struct {
|
||||
partsExp *regexp.Regexp
|
||||
phonExp *regexp.Regexp
|
||||
variantExp *regexp.Regexp
|
||||
annotExp *regexp.Regexp
|
||||
}
|
||||
|
||||
func exportEpwingDb(outputDir, title string, reader io.Reader, flags int) error {
|
||||
func epwingExportDb(outputDir, title string, reader io.Reader, flags int) error {
|
||||
data, err := ioutil.ReadAll(reader)
|
||||
if err != nil {
|
||||
return err
|
||||
|
10
kanjidic.go
10
kanjidic.go
@ -30,7 +30,7 @@ import (
|
||||
"github.com/FooSoft/jmdict"
|
||||
)
|
||||
|
||||
func extractKanjidicKanji(entry jmdict.KanjidicCharacter) dbKanji {
|
||||
func kanjidicExtractKanji(entry jmdict.KanjidicCharacter) dbKanji {
|
||||
kanji := dbKanji{Character: entry.Literal}
|
||||
|
||||
if level := entry.Misc.JlptLevel; level != nil {
|
||||
@ -78,7 +78,7 @@ func extractKanjidicKanji(entry jmdict.KanjidicCharacter) dbKanji {
|
||||
return kanji
|
||||
}
|
||||
|
||||
func exportKanjidicDb(outputDir, title string, reader io.Reader, flags int) error {
|
||||
func kanjidicExportDb(outputDir, title string, reader io.Reader, flags int) error {
|
||||
dict, err := jmdict.LoadKanjidic(reader)
|
||||
if err != nil {
|
||||
return err
|
||||
@ -86,12 +86,12 @@ func exportKanjidicDb(outputDir, title string, reader io.Reader, flags int) erro
|
||||
|
||||
var kanji dbKanjiList
|
||||
for _, entry := range dict.Characters {
|
||||
kanji = append(kanji, extractKanjidicKanji(entry))
|
||||
kanji = append(kanji, kanjidicExtractKanji(entry))
|
||||
}
|
||||
|
||||
tagMeta := map[string]dbTagMeta{
|
||||
"jouyou": {Notes: "included in list of regular-use characters", Category: "frequent", Order: 3},
|
||||
"jinmeiyou": {Notes: "included in list of characters for use in personal names", Category: "frequent", Order: 3},
|
||||
"jouyou": {Notes: "included in list of regular-use characters", Category: "frequent", Order: -5},
|
||||
"jinmeiyou": {Notes: "included in list of characters for use in personal names", Category: "frequent", Order: -5},
|
||||
"jlpt": {Notes: "corresponding Japanese Language Proficiency Test level"},
|
||||
"grade": {Notes: "school grade level at which the character is taught"},
|
||||
"strokes": {Notes: "number of strokes needed to write the character"},
|
||||
|
8
main.go
8
main.go
@ -44,10 +44,10 @@ func usage() {
|
||||
|
||||
func exportDb(inputPath, outputDir, format, title string, flags int) error {
|
||||
handlers := map[string]func(string, string, io.Reader, int) error{
|
||||
"edict": exportJmdictDb,
|
||||
"enamdict": exportJmnedictDb,
|
||||
"kanjidic": exportKanjidicDb,
|
||||
"epwing": exportEpwingDb,
|
||||
"edict": jmdictExportDb,
|
||||
"enamdict": jmnedictExportDb,
|
||||
"kanjidic": kanjidicExportDb,
|
||||
"epwing": epwingExportDb,
|
||||
}
|
||||
|
||||
handler, ok := handlers[format]
|
||||
|
Loading…
Reference in New Issue
Block a user