WIP
This commit is contained in:
parent
ee2318d4bd
commit
879a3828b3
54
common.go
54
common.go
@ -27,37 +27,33 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// dbTagMeta is the metadata recorded for a single tag. writeDb serializes a
// map of these under the index's "tagMeta" key.
type dbTagMeta struct {
	// Notes is the human-readable description of the tag.
	Notes string `json:"notes"`
	// Class is a category label for the tag (e.g. "frequent", "name");
	// empty when the tag has no category.
	Class string `json:"class"`
	// Order is a numeric ordering value for the tag; zero when unset.
	// NOTE(review): how consumers interpret this value is not visible here.
	Order int `json:"order"`
}
|
||||
|
||||
// dbTerm is a single term record prior to being flattened by
// dbTermList.crush.
type dbTerm struct {
	// Expression is the headword as written.
	Expression string
	// Reading is the reading associated with Expression.
	Reading string
	// Tags holds the term's tag names; addTags keeps them unique.
	Tags []string
	// Rules holds rule identifiers; addRules keeps them unique.
	Rules []string
	// Score is a numeric weight for the term.
	Score int
	// Glossary holds the term's definition strings.
	Glossary []string
}
|
||||
|
||||
// dbTermList is a list of dbTerm records; its crush method converts the list
// into [][]string rows.
type dbTermList []dbTerm
|
||||
|
||||
func (term *dbTerm) addTags(tags ...string) {
|
||||
for _, tag := range tags {
|
||||
if !hasString(tag, term.Tags) {
|
||||
term.Tags = append(term.Tags, tag)
|
||||
}
|
||||
}
|
||||
term.Tags = appendStringUnique(term.Tags, tags...)
|
||||
}
|
||||
|
||||
func (term *dbTerm) addTagsPri(tags ...string) {
|
||||
for _, tag := range tags {
|
||||
switch tag {
|
||||
case "news1", "ichi1", "spec1", "gai1":
|
||||
term.addTags("P")
|
||||
fallthrough
|
||||
case "news2", "ichi2", "spec2", "gai2":
|
||||
term.addTags(tag[:len(tag)-1])
|
||||
break
|
||||
}
|
||||
}
|
||||
func (term *dbTerm) addRules(rules ...string) {
|
||||
term.Rules = appendStringUnique(term.Rules, rules...)
|
||||
}
|
||||
|
||||
func (terms dbTermList) crush() [][]string {
|
||||
@ -67,6 +63,8 @@ func (terms dbTermList) crush() [][]string {
|
||||
t.Expression,
|
||||
t.Reading,
|
||||
strings.Join(t.Tags, " "),
|
||||
strings.Join(t.Rules, " "),
|
||||
strconv.Itoa(t.Score),
|
||||
}
|
||||
|
||||
result = append(result, t.Glossary...)
|
||||
@ -111,7 +109,7 @@ func (kanji dbKanjiList) crush() [][]string {
|
||||
return results
|
||||
}
|
||||
|
||||
func writeDb(outputDir, title string, termRecords [][]string, kanjiRecords [][]string, entities map[string]string, pretty bool) error {
|
||||
func writeDb(outputDir, title string, termRecords [][]string, kanjiRecords [][]string, tagMeta map[string]dbTagMeta, pretty bool) error {
|
||||
const DB_VERSION = 1
|
||||
const BANK_STRIDE = 50000
|
||||
|
||||
@ -161,16 +159,16 @@ func writeDb(outputDir, title string, termRecords [][]string, kanjiRecords [][]s
|
||||
|
||||
var err error
|
||||
var db struct {
|
||||
Title string `json:"title"`
|
||||
Version int `json:"version"`
|
||||
Entities map[string]string `json:"entities"`
|
||||
TermBanks int `json:"termBanks"`
|
||||
KanjiBanks int `json:"kanjiBanks"`
|
||||
Title string `json:"title"`
|
||||
Version int `json:"version"`
|
||||
TagMeta map[string]dbTagMeta `json:"tagMeta"`
|
||||
TermBanks int `json:"termBanks"`
|
||||
KanjiBanks int `json:"kanjiBanks"`
|
||||
}
|
||||
|
||||
db.Title = title
|
||||
db.Version = DB_VERSION
|
||||
db.Entities = entities
|
||||
db.TagMeta = tagMeta
|
||||
|
||||
if db.TermBanks, err = writeDbRecords("term", termRecords); err != nil {
|
||||
return err
|
||||
@ -198,6 +196,16 @@ func writeDb(outputDir, title string, termRecords [][]string, kanjiRecords [][]s
|
||||
return nil
|
||||
}
|
||||
|
||||
func appendStringUnique(target []string, source ...string) []string {
|
||||
for _, str := range source {
|
||||
if !hasString(str, target) {
|
||||
target = append(target, str)
|
||||
}
|
||||
}
|
||||
|
||||
return target
|
||||
}
|
||||
|
||||
func hasString(needle string, haystack []string) bool {
|
||||
for _, value := range haystack {
|
||||
if needle == value {
|
||||
|
@ -169,7 +169,7 @@ func (*daijirinExtractor) getTags() map[string]string {
|
||||
"動ワ下二": "",
|
||||
"動ワ五[ハ四]": "",
|
||||
"名": "",
|
||||
"形": "",
|
||||
"形": "adj-i",
|
||||
"形ク": "",
|
||||
"形シク": "",
|
||||
"形動": "",
|
||||
|
71
edict.go
71
edict.go
@ -24,10 +24,72 @@ package main
|
||||
|
||||
import (
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"github.com/FooSoft/jmdict"
|
||||
)
|
||||
|
||||
func computeJmdictRules(term *dbTerm) {
|
||||
for _, tag := range term.Tags {
|
||||
switch tag {
|
||||
case "adj-i":
|
||||
case "v1":
|
||||
case "vk":
|
||||
case "vs":
|
||||
term.addRules(tag)
|
||||
default:
|
||||
if strings.HasPrefix(tag, "v5") {
|
||||
term.addRules("v5")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func computeJmdictScore(term *dbTerm) {
|
||||
term.Score = 0
|
||||
for _, tag := range term.Tags {
|
||||
switch tag {
|
||||
case "gai1":
|
||||
case "ichi1":
|
||||
case "news1":
|
||||
case "spec1":
|
||||
term.Score += 5
|
||||
case "arch":
|
||||
case "iK":
|
||||
term.Score -= 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func computeJmdictTagMeta(entities map[string]string) map[string]dbTagMeta {
|
||||
tags := make(map[string]dbTagMeta)
|
||||
|
||||
for name, value := range entities {
|
||||
tag := dbTagMeta{Notes: value}
|
||||
|
||||
switch name {
|
||||
case "gai1":
|
||||
case "ichi1":
|
||||
case "news1":
|
||||
case "spec1":
|
||||
tag.Class = "frequent"
|
||||
tag.Order = 1
|
||||
case "exp":
|
||||
case "id":
|
||||
tag.Class = "expression"
|
||||
tag.Order = 2
|
||||
case "arch":
|
||||
case "iK":
|
||||
tag.Class = "archaism"
|
||||
tag.Order = 2
|
||||
}
|
||||
|
||||
tags[name] = tag
|
||||
}
|
||||
|
||||
return tags
|
||||
}
|
||||
|
||||
func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
|
||||
var terms []dbTerm
|
||||
|
||||
@ -41,7 +103,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
|
||||
|
||||
if kanji == nil {
|
||||
termBase.Expression = reading.Reading
|
||||
termBase.addTagsPri(reading.Priorities...)
|
||||
termBase.addTags(reading.Priorities...)
|
||||
} else {
|
||||
termBase.Expression = kanji.Expression
|
||||
termBase.Reading = reading.Reading
|
||||
@ -49,7 +111,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
|
||||
|
||||
for _, priority := range kanji.Priorities {
|
||||
if hasString(priority, reading.Priorities) {
|
||||
termBase.addTagsPri(priority)
|
||||
termBase.addTags(priority)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -74,6 +136,9 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
|
||||
term.Glossary = append(term.Glossary, glossary.Content)
|
||||
}
|
||||
|
||||
computeJmdictRules(&term)
|
||||
computeJmdictScore(&term)
|
||||
|
||||
terms = append(terms, term)
|
||||
}
|
||||
}
|
||||
@ -109,7 +174,7 @@ func exportJmdictDb(outputDir, title string, reader io.Reader, flags int) error
|
||||
title,
|
||||
terms.crush(),
|
||||
nil,
|
||||
entities,
|
||||
computeJmdictTagMeta(entities),
|
||||
flags&flagPretty == flagPretty,
|
||||
)
|
||||
}
|
||||
|
33
enamdict.go
33
enamdict.go
@ -28,6 +28,35 @@ import (
|
||||
"github.com/FooSoft/jmdict"
|
||||
)
|
||||
|
||||
func computeJmnedictTagMeta(entities map[string]string) map[string]dbTagMeta {
|
||||
tags := make(map[string]dbTagMeta)
|
||||
|
||||
for name, value := range entities {
|
||||
tag := dbTagMeta{Notes: value}
|
||||
|
||||
switch name {
|
||||
case "company":
|
||||
case "fem":
|
||||
case "given":
|
||||
case "masc":
|
||||
case "organization":
|
||||
case "person":
|
||||
case "place":
|
||||
case "product":
|
||||
case "station":
|
||||
case "surname":
|
||||
case "unclass":
|
||||
case "work":
|
||||
tag.Class = "name"
|
||||
tag.Order = 4
|
||||
}
|
||||
|
||||
tags[name] = tag
|
||||
}
|
||||
|
||||
return tags
|
||||
}
|
||||
|
||||
func extractJmnedictTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
|
||||
var terms []dbTerm
|
||||
|
||||
@ -49,7 +78,7 @@ func extractJmnedictTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
|
||||
|
||||
for _, priority := range kanji.Priorities {
|
||||
if hasString(priority, reading.Priorities) {
|
||||
term.addTagsPri(priority)
|
||||
term.addTags(priority)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -93,7 +122,7 @@ func exportJmnedictDb(outputDir, title string, reader io.Reader, flags int) erro
|
||||
title,
|
||||
terms.crush(),
|
||||
nil,
|
||||
entities,
|
||||
computeJmnedictTagMeta(entities),
|
||||
flags&flagPretty == flagPretty,
|
||||
)
|
||||
}
|
||||
|
13
kanjidic.go
13
kanjidic.go
@ -69,10 +69,8 @@ func extractKanjidicKanji(entry jmdict.KanjidicCharacter) dbKanji {
|
||||
switch r.Type {
|
||||
case "ja_on":
|
||||
kanji.Onyomi = append(kanji.Onyomi, r.Value)
|
||||
break
|
||||
case "ja_kun":
|
||||
kanji.Kunyomi = append(kanji.Kunyomi, r.Value)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -91,12 +89,21 @@ func exportKanjidicDb(outputDir, title string, reader io.Reader, flags int) erro
|
||||
kanji = append(kanji, extractKanjidicKanji(entry))
|
||||
}
|
||||
|
||||
tagMeta := map[string]dbTagMeta{
|
||||
"jouyou": {Notes: "included in list of regular-use characters", Class: "frequent", Order: 3},
|
||||
"jinmeiyou": {Notes: "included in list of characters for use in personal names", Class: "frequent", Order: 3},
|
||||
"jlpt": {Notes: "corresponding Japanese Language Proficiency Test level"},
|
||||
"grade": {Notes: "school grade level at which the character is taught"},
|
||||
"strokes": {Notes: "number of strokes needed to write the character"},
|
||||
"heisig": {Notes: "frame number in Remembering the Kanji"},
|
||||
}
|
||||
|
||||
return writeDb(
|
||||
outputDir,
|
||||
title,
|
||||
nil,
|
||||
kanji.crush(),
|
||||
nil,
|
||||
tagMeta,
|
||||
flags&flagPretty == flagPretty,
|
||||
)
|
||||
}
|
||||
|
2
main.go
2
main.go
@ -37,7 +37,7 @@ const (
|
||||
)
|
||||
|
||||
// usage writes the command-line help to standard error: a usage line naming
// the program (base name of os.Args[0]) and the supported formats, followed
// by the defaults of all registered flags.
func usage() {
	// Only the current usage line is kept; it includes the epwing format.
	fmt.Fprintf(os.Stderr, "Usage: %s [edict|enamdict|kanjidic|epwing] input output\n\n", path.Base(os.Args[0]))
	fmt.Fprintf(os.Stderr, "Parameters:\n")
	flag.PrintDefaults()
}
|
||||
|
Loading…
Reference in New Issue
Block a user