1
This commit is contained in:
Alex Yatskov 2016-12-17 15:48:13 -08:00
parent ee2318d4bd
commit 879a3828b3
6 changed files with 142 additions and 33 deletions

View File

@ -27,37 +27,33 @@ import (
"fmt"
"os"
"path"
"strconv"
"strings"
)
// dbTagMeta holds the descriptive metadata recorded for a single tag. It is
// serialized with the JSON keys "notes", "class" and "order".
type dbTagMeta struct {
// Notes is the human-readable description of the tag.
Notes string `json:"notes"`
// Class is a category label for the tag (for example "frequent" or "name").
Class string `json:"class"`
// Order is an integer rank attached to the tag.
// NOTE(review): presumably a display/sort priority — confirm against the
// consumer of the generated database.
Order int `json:"order"`
}
// dbTerm is a single dictionary term record. dbTermList.crush flattens each
// record into a string row in the order Expression, Reading, Tags, Rules,
// Score, followed by the Glossary entries.
type dbTerm struct {
// Expression is the headword of the term.
Expression string
// Reading is the reading associated with Expression.
Reading string
// Tags are the tag names attached to the term; joined with spaces on output.
Tags []string
// Rules are the rule identifiers attached to the term; joined with spaces on
// output.
Rules []string
// Score is an integer score for the term; written out in decimal form.
Score int
// Glossary holds the definition strings for the term.
Glossary []string
}
// dbTermList is a list of dbTerm records; its crush method converts the list
// into rows of strings.
type dbTermList []dbTerm
// addTags appends each of the given tags to term.Tags, skipping any tag that
// is already present. The de-duplication is delegated to appendStringUnique,
// so no separate membership loop is needed here.
func (term *dbTerm) addTags(tags ...string) {
	term.Tags = appendStringUnique(term.Tags, tags...)
}
func (term *dbTerm) addTagsPri(tags ...string) {
for _, tag := range tags {
switch tag {
case "news1", "ichi1", "spec1", "gai1":
term.addTags("P")
fallthrough
case "news2", "ichi2", "spec2", "gai2":
term.addTags(tag[:len(tag)-1])
break
}
}
// addRules merges the given rule identifiers into term.Rules. Identifiers
// that the term already carries are not added a second time.
func (term *dbTerm) addRules(rules ...string) {
	merged := appendStringUnique(term.Rules, rules...)
	term.Rules = merged
}
func (terms dbTermList) crush() [][]string {
@ -67,6 +63,8 @@ func (terms dbTermList) crush() [][]string {
t.Expression,
t.Reading,
strings.Join(t.Tags, " "),
strings.Join(t.Rules, " "),
strconv.Itoa(t.Score),
}
result = append(result, t.Glossary...)
@ -111,7 +109,7 @@ func (kanji dbKanjiList) crush() [][]string {
return results
}
func writeDb(outputDir, title string, termRecords [][]string, kanjiRecords [][]string, entities map[string]string, pretty bool) error {
func writeDb(outputDir, title string, termRecords [][]string, kanjiRecords [][]string, tagMeta map[string]dbTagMeta, pretty bool) error {
const DB_VERSION = 1
const BANK_STRIDE = 50000
@ -161,16 +159,16 @@ func writeDb(outputDir, title string, termRecords [][]string, kanjiRecords [][]s
var err error
var db struct {
Title string `json:"title"`
Version int `json:"version"`
Entities map[string]string `json:"entities"`
TermBanks int `json:"termBanks"`
KanjiBanks int `json:"kanjiBanks"`
Title string `json:"title"`
Version int `json:"version"`
TagMeta map[string]dbTagMeta `json:"tagMeta"`
TermBanks int `json:"termBanks"`
KanjiBanks int `json:"kanjiBanks"`
}
db.Title = title
db.Version = DB_VERSION
db.Entities = entities
db.TagMeta = tagMeta
if db.TermBanks, err = writeDbRecords("term", termRecords); err != nil {
return err
@ -198,6 +196,16 @@ func writeDb(outputDir, title string, termRecords [][]string, kanjiRecords [][]s
return nil
}
// appendStringUnique returns target extended with every string from source
// that target does not already contain. Membership is checked against the
// growing result, so a value repeated within source is appended only once.
// The relative order of first occurrences is preserved.
func appendStringUnique(target []string, source ...string) []string {
	result := target
	for i := 0; i < len(source); i++ {
		if candidate := source[i]; !hasString(candidate, result) {
			result = append(result, candidate)
		}
	}
	return result
}
func hasString(needle string, haystack []string) bool {
for _, value := range haystack {
if needle == value {

View File

@ -169,7 +169,7 @@ func (*daijirinExtractor) getTags() map[string]string {
"動ワ下二": "",
"動ワ五[ハ四]": "",
"名": "",
"形": "",
"形": "adj-i",
"形ク": "",
"形シク": "",
"形動": "",

View File

@ -24,10 +24,72 @@ package main
import (
"io"
"strings"
"github.com/FooSoft/jmdict"
)
// computeJmdictRules derives the term's rule identifiers from the tags
// already attached to it.
//
// The tags "adj-i", "v1", "vk" and "vs" are passed to term.addRules
// unchanged; any other tag that begins with "v5" contributes the single rule
// "v5". All remaining tags are ignored. term.Tags itself is not modified.
func computeJmdictRules(term *dbTerm) {
	for _, tag := range term.Tags {
		switch tag {
		// Go case clauses never fall through implicitly, so tags that share
		// a body must be listed together in one clause.
		case "adj-i", "v1", "vk", "vs":
			term.addRules(tag)
		default:
			if strings.HasPrefix(tag, "v5") {
				term.addRules("v5")
			}
		}
	}
}
// computeJmdictScore recomputes term.Score from term.Tags.
//
// The score is reset to zero, then each occurrence of "gai1", "ichi1",
// "news1" or "spec1" adds 5 and each occurrence of "arch" or "iK" subtracts
// 1. Other tags do not affect the score.
func computeJmdictScore(term *dbTerm) {
	term.Score = 0
	for _, tag := range term.Tags {
		switch tag {
		// Go case clauses never fall through implicitly, so tags that share
		// a body must be listed together in one clause.
		case "gai1", "ichi1", "news1", "spec1":
			term.Score += 5
		case "arch", "iK":
			term.Score--
		}
	}
}
// computeJmdictTagMeta builds one dbTagMeta per entry of entities, keyed by
// the same name, with Notes set to the entity's value.
//
// Selected names additionally receive a class and order:
//   - "gai1", "ichi1", "news1", "spec1": class "frequent", order 1
//   - "exp", "id": class "expression", order 2
//   - "arch", "iK": class "archaism", order 2
//
// All other names keep an empty Class and zero Order.
func computeJmdictTagMeta(entities map[string]string) map[string]dbTagMeta {
	tags := make(map[string]dbTagMeta, len(entities))

	for name, value := range entities {
		tag := dbTagMeta{Notes: value}

		switch name {
		// Go case clauses never fall through implicitly, so names that share
		// a body must be listed together in one clause.
		case "gai1", "ichi1", "news1", "spec1":
			tag.Class = "frequent"
			tag.Order = 1
		case "exp", "id":
			tag.Class = "expression"
			tag.Order = 2
		case "arch", "iK":
			tag.Class = "archaism"
			tag.Order = 2
		}

		tags[name] = tag
	}

	return tags
}
func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
var terms []dbTerm
@ -41,7 +103,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
if kanji == nil {
termBase.Expression = reading.Reading
termBase.addTagsPri(reading.Priorities...)
termBase.addTags(reading.Priorities...)
} else {
termBase.Expression = kanji.Expression
termBase.Reading = reading.Reading
@ -49,7 +111,7 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
for _, priority := range kanji.Priorities {
if hasString(priority, reading.Priorities) {
termBase.addTagsPri(priority)
termBase.addTags(priority)
}
}
}
@ -74,6 +136,9 @@ func extractJmdictTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
term.Glossary = append(term.Glossary, glossary.Content)
}
computeJmdictRules(&term)
computeJmdictScore(&term)
terms = append(terms, term)
}
}
@ -109,7 +174,7 @@ func exportJmdictDb(outputDir, title string, reader io.Reader, flags int) error
title,
terms.crush(),
nil,
entities,
computeJmdictTagMeta(entities),
flags&flagPretty == flagPretty,
)
}

View File

@ -28,6 +28,35 @@ import (
"github.com/FooSoft/jmdict"
)
// computeJmnedictTagMeta builds one dbTagMeta per entry of entities, keyed by
// the same name, with Notes set to the entity's value.
//
// The names "company", "fem", "given", "masc", "organization", "person",
// "place", "product", "station", "surname", "unclass" and "work" are
// additionally assigned class "name" and order 4. All other names keep an
// empty Class and zero Order.
func computeJmnedictTagMeta(entities map[string]string) map[string]dbTagMeta {
	tags := make(map[string]dbTagMeta, len(entities))

	for name, value := range entities {
		tag := dbTagMeta{Notes: value}

		switch name {
		// Go case clauses never fall through implicitly, so names that share
		// a body must be listed together in one clause.
		case "company", "fem", "given", "masc", "organization", "person",
			"place", "product", "station", "surname", "unclass", "work":
			tag.Class = "name"
			tag.Order = 4
		}

		tags[name] = tag
	}

	return tags
}
func extractJmnedictTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
var terms []dbTerm
@ -49,7 +78,7 @@ func extractJmnedictTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
for _, priority := range kanji.Priorities {
if hasString(priority, reading.Priorities) {
term.addTagsPri(priority)
term.addTags(priority)
}
}
}
@ -93,7 +122,7 @@ func exportJmnedictDb(outputDir, title string, reader io.Reader, flags int) erro
title,
terms.crush(),
nil,
entities,
computeJmnedictTagMeta(entities),
flags&flagPretty == flagPretty,
)
}

View File

@ -69,10 +69,8 @@ func extractKanjidicKanji(entry jmdict.KanjidicCharacter) dbKanji {
switch r.Type {
case "ja_on":
kanji.Onyomi = append(kanji.Onyomi, r.Value)
break
case "ja_kun":
kanji.Kunyomi = append(kanji.Kunyomi, r.Value)
break
}
}
}
@ -91,12 +89,21 @@ func exportKanjidicDb(outputDir, title string, reader io.Reader, flags int) erro
kanji = append(kanji, extractKanjidicKanji(entry))
}
tagMeta := map[string]dbTagMeta{
"jouyou": {Notes: "included in list of regular-use characters", Class: "frequent", Order: 3},
"jinmeiyou": {Notes: "included in list of characters for use in personal names", Class: "frequent", Order: 3},
"jlpt": {Notes: "corresponding Japanese Language Proficiency Test level"},
"grade": {Notes: "school grade level at which the character is taught"},
"strokes": {Notes: "number of strokes needed to write the character"},
"heisig": {Notes: "frame number in Remembering the Kanji"},
}
return writeDb(
outputDir,
title,
nil,
kanji.crush(),
nil,
tagMeta,
flags&flagPretty == flagPretty,
)
}

View File

@ -37,7 +37,7 @@ const (
)
// usage writes the command-line synopsis to standard error and then prints
// the defaults of the registered flags via flag.PrintDefaults. The program
// name in the synopsis is the base name of os.Args[0].
func usage() {
	// Only the current synopsis is printed; the older line that omitted the
	// epwing format duplicated the output.
	fmt.Fprintf(os.Stderr, "Usage: %s [edict|enamdict|kanjidic|epwing] input output\n\n", path.Base(os.Args[0]))
	fmt.Fprintf(os.Stderr, "Parameters:\n")
	flag.PrintDefaults()
}