1

initial work on db redesign

This commit is contained in:
Alex Yatskov 2017-09-10 13:25:11 -07:00
parent f0d72fefaa
commit 532838764b
7 changed files with 201 additions and 82 deletions

View File

@ -33,10 +33,43 @@ import (
"strings" "strings"
) )
type dbTagMeta struct { const databaseVersion = 2
Category string `json:"category,omitempty"`
Notes string `json:"notes,omitempty"` type dbRecord []interface{}
Order int `json:"order,omitempty"` type dbRecordList []dbRecord
// dbTag describes a single tag definition. dbTagList.crush serializes the
// fields positionally as [Name, Category, Order, Notes].
type dbTag struct {
// Name is the tag identifier (for example "news", "P" or "jouyou").
Name string
// Category groups related tags; values used in this project include
// "frequent", "popular", "expression" and "archaism". May be empty.
Category string
// Order is an integer rank attached to the tag; negative values are used
// for the frequency/popularity tags.
// NOTE(review): presumably a display/sort priority — confirm with the consumer.
Order int
// Notes is a human-readable description of the tag. May be empty.
Notes string
}
// dbTagList is an ordered collection of tag definitions.
type dbTagList []dbTag
// crush flattens the tag list into generic records, one per tag, keeping the
// input order. Each record is laid out positionally as
// [Name, Category, Order, Notes]. An empty list yields a nil record list.
func (meta dbTagList) crush() dbRecordList {
	var records dbRecordList
	for i := range meta {
		tag := &meta[i]
		records = append(records, dbRecord{
			tag.Name,
			tag.Category,
			tag.Order,
			tag.Notes,
		})
	}

	return records
}
// dbFrequency pairs an expression with its occurrence count.
// dbFrequencyList.crush serializes it positionally as [Expression, Count].
type dbFrequency struct {
// Expression is the text the count applies to; the frequency importer
// takes it from the first tab-separated field of an input line.
Expression string
// Count is the integer parsed from the second tab-separated field.
Count int
}
// dbFrequencyList is an ordered collection of frequency entries.
type dbFrequencyList []dbFrequency
func (freqs dbFrequencyList) crush() dbRecordList {
var results dbRecordList
for _, f := range freqs {
results = append(results, dbRecord{f.Expression, f.Count})
}
return results
} }
type dbTerm struct { type dbTerm struct {
@ -58,10 +91,10 @@ func (term *dbTerm) addRules(rules ...string) {
term.Rules = appendStringUnique(term.Rules, rules...) term.Rules = appendStringUnique(term.Rules, rules...)
} }
func (terms dbTermList) crush() [][]interface{} { func (terms dbTermList) crush() dbRecordList {
var results [][]interface{} var results dbRecordList
for _, t := range terms { for _, t := range terms {
result := []interface{}{ result := dbRecord{
t.Expression, t.Expression,
t.Reading, t.Reading,
strings.Join(t.Tags, " "), strings.Join(t.Tags, " "),
@ -97,10 +130,10 @@ func (kanji *dbKanji) addTags(tags ...string) {
} }
} }
func (kanji dbKanjiList) crush() [][]interface{} { func (kanji dbKanjiList) crush() dbRecordList {
var results [][]interface{} var results dbRecordList
for _, k := range kanji { for _, k := range kanji {
result := []interface{}{ result := dbRecord{
k.Character, k.Character,
strings.Join(k.Onyomi, " "), strings.Join(k.Onyomi, " "),
strings.Join(k.Kunyomi, " "), strings.Join(k.Kunyomi, " "),
@ -117,13 +150,11 @@ func (kanji dbKanjiList) crush() [][]interface{} {
return results return results
} }
func writeDb(outputPath, title, revision string, termRecords [][]interface{}, kanjiRecords [][]interface{}, tagMeta map[string]dbTagMeta, stride int, pretty bool) error { func writeDb(outputPath, title, revision string, recordData map[string]dbRecordList, stride int, pretty bool) error {
const DB_VERSION = 1
var zbuff bytes.Buffer var zbuff bytes.Buffer
zip := zip.NewWriter(&zbuff) zip := zip.NewWriter(&zbuff)
marshalJson := func(obj interface{}, pretty bool) ([]byte, error) { marshalJSON := func(obj interface{}, pretty bool) ([]byte, error) {
if pretty { if pretty {
return json.MarshalIndent(obj, "", " ") return json.MarshalIndent(obj, "", " ")
} }
@ -131,7 +162,7 @@ func writeDb(outputPath, title, revision string, termRecords [][]interface{}, ka
return json.Marshal(obj) return json.Marshal(obj)
} }
writeDbRecords := func(prefix string, records [][]interface{}) (int, error) { writeDbRecords := func(prefix string, records dbRecordList) (int, error) {
recordCount := len(records) recordCount := len(records)
bankCount := 0 bankCount := 0
@ -142,7 +173,7 @@ func writeDb(outputPath, title, revision string, termRecords [][]interface{}, ka
indexDst = recordCount indexDst = recordCount
} }
bytes, err := marshalJson(records[indexSrc:indexDst], pretty) bytes, err := marshalJSON(records[indexSrc:indexDst], pretty)
if err != nil { if err != nil {
return 0, err return 0, err
} }
@ -156,7 +187,7 @@ func writeDb(outputPath, title, revision string, termRecords [][]interface{}, ka
return 0, err return 0, err
} }
bankCount += 1 bankCount++
} }
return bankCount, nil return bankCount, nil
@ -167,25 +198,19 @@ func writeDb(outputPath, title, revision string, termRecords [][]interface{}, ka
Title string `json:"title"` Title string `json:"title"`
Version int `json:"version"` Version int `json:"version"`
Revision string `json:"revision"` Revision string `json:"revision"`
TagMeta map[string]dbTagMeta `json:"tagMeta"`
TermBanks int `json:"termBanks"`
KanjiBanks int `json:"kanjiBanks"`
} }
db.Title = title db.Title = title
db.Version = DB_VERSION db.Version = databaseVersion
db.Revision = revision db.Revision = revision
db.TagMeta = tagMeta
if db.TermBanks, err = writeDbRecords("term", termRecords); err != nil { for recordType, recordEntries := range recordData {
if _, err := writeDbRecords(recordType, recordEntries); err != nil {
return err return err
} }
if db.KanjiBanks, err = writeDbRecords("kanji", kanjiRecords); err != nil {
return err
} }
bytes, err := marshalJson(db, pretty) bytes, err := marshalJSON(db, pretty)
if err != nil { if err != nil {
return err return err
} }
@ -234,8 +259,13 @@ func hasString(needle string, haystack []string) bool {
} }
func detectFormat(path string) (string, error) { func detectFormat(path string) (string, error) {
if filepath.Ext(path) == ".sqlite" { switch filepath.Ext(path) {
case ".sqlite":
return "rikai", nil return "rikai", nil
case ".kanji_freq":
return "kanji_freq", nil
case ".term_freq":
return "term_freq", nil
} }
switch filepath.Base(path) { switch filepath.Base(path) {

View File

@ -29,7 +29,7 @@ import (
"github.com/FooSoft/jmdict" "github.com/FooSoft/jmdict"
) )
const JMDICT_REVISION = "jmdict3" const jmdictRevision = "jmdict3"
func jmdictBuildRules(term *dbTerm) { func jmdictBuildRules(term *dbTerm) {
for _, tag := range term.Tags { for _, tag := range term.Tags {
@ -71,17 +71,17 @@ func jmdictAddPriorities(term *dbTerm, priorities ...string) {
} }
} }
func jmdictBuildTagMeta(entities map[string]string) map[string]dbTagMeta { func jmdictBuildTagMeta(entities map[string]string) dbTagList {
tags := map[string]dbTagMeta{ tags := dbTagList{
"news": {Notes: "appears frequently in Mainichi Shimbun", Category: "frequent", Order: -2}, dbTag{Name: "news", Notes: "appears frequently in Mainichi Shimbun", Category: "frequent", Order: -2},
"ichi": {Notes: "listed as common in Ichimango Goi Bunruishuu", Category: "frequent", Order: -2}, dbTag{Name: "ichi", Notes: "listed as common in Ichimango Goi Bunruishuu", Category: "frequent", Order: -2},
"spec": {Notes: "common words not included in frequency lists", Category: "frequent", Order: -2}, dbTag{Name: "spec", Notes: "common words not included in frequency lists", Category: "frequent", Order: -2},
"gai": {Notes: "common loanword", Category: "frequent", Order: -2}, dbTag{Name: "gai", Notes: "common loanword", Category: "frequent", Order: -2},
"P": {Notes: "popular term", Category: "popular", Order: -10}, dbTag{Name: "P", Notes: "popular term", Category: "popular", Order: -10},
} }
for name, value := range entities { for name, value := range entities {
tag := dbTagMeta{Notes: value} tag := dbTag{Name: name, Notes: value}
switch name { switch name {
case "exp", "id": case "exp", "id":
@ -92,7 +92,7 @@ func jmdictBuildTagMeta(entities map[string]string) map[string]dbTagMeta {
tag.Order = -4 tag.Order = -4
} }
tags[name] = tag tags = append(tags, tag)
} }
return tags return tags
@ -227,13 +227,16 @@ func jmdictExportDb(inputPath, outputPath, language, title string, stride int, p
title = "JMdict" title = "JMdict"
} }
recordData := map[string]dbRecordList{
"terms": terms.crush(),
"tags": jmdictBuildTagMeta(entities).crush(),
}
return writeDb( return writeDb(
outputPath, outputPath,
title, title,
JMDICT_REVISION, jmdictRevision,
terms.crush(), recordData,
nil,
jmdictBuildTagMeta(entities),
stride, stride,
pretty, pretty,
) )

View File

@ -28,13 +28,13 @@ import (
"github.com/FooSoft/jmdict" "github.com/FooSoft/jmdict"
) )
const JMNEDICT_REVISION = "jmnedict1" const jmnedictRevision = "jmnedict1"
func jmnedictBuildTagMeta(entities map[string]string) map[string]dbTagMeta { func jmnedictBuildTagMeta(entities map[string]string) dbTagList {
tags := make(map[string]dbTagMeta) var tags dbTagList
for name, value := range entities { for name, value := range entities {
tag := dbTagMeta{Notes: value} tag := dbTag{Name: name, Notes: value}
switch name { switch name {
case "company", "fem", "given", "masc", "organization", "person", "place", "product", "station", "surname", "unclass", "work": case "company", "fem", "given", "masc", "organization", "person", "place", "product", "station", "surname", "unclass", "work":
@ -42,7 +42,7 @@ func jmnedictBuildTagMeta(entities map[string]string) map[string]dbTagMeta {
tag.Order = 4 tag.Order = 4
} }
tags[name] = tag tags = append(tags, tag)
} }
return tags return tags
@ -118,13 +118,16 @@ func jmnedictExportDb(inputPath, outputPath, language, title string, stride int,
title = "JMnedict" title = "JMnedict"
} }
recordData := map[string]dbRecordList{
"terms": terms.crush(),
"tags": jmnedictBuildTagMeta(entities).crush(),
}
return writeDb( return writeDb(
outputPath, outputPath,
title, title,
JMNEDICT_REVISION, jmnedictRevision,
terms.crush(), recordData,
nil,
jmnedictBuildTagMeta(entities),
stride, stride,
pretty, pretty,
) )

View File

@ -200,13 +200,16 @@ func epwingExportDb(inputPath, outputPath, language, title string, stride int, p
title = strings.Join(titles, ", ") title = strings.Join(titles, ", ")
} }
recordData := map[string]dbRecordList{
"kanji": kanji.crush(),
"terms": terms.crush(),
}
return writeDb( return writeDb(
outputPath, outputPath,
title, title,
strings.Join(revisions, ";"), strings.Join(revisions, ";"),
terms.crush(), recordData,
kanji.crush(),
nil,
stride, stride,
pretty, pretty,
) )

74
frequency.go Normal file
View File

@ -0,0 +1,74 @@
/*
* Copyright (c) 2017 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"bufio"
"os"
"strconv"
"strings"
)
const frequencyRevision = "frequency1"
// frequncyExportDb builds a frequency dictionary from the tab-separated list
// at inputPath and passes it to writeDb under the "frequencies" record type,
// using frequencyRevision as the revision. The misspelled name is kept as-is
// so existing callers continue to compile.
//
// Every input line is expected to look like "expression<TAB>count". Lines
// that start with "#", lines that do not consist of exactly two tab-separated
// fields, and lines whose count is not a valid integer are skipped.
//
// outputPath, title, stride and pretty are forwarded to writeDb unchanged;
// language is accepted for signature parity but is not used here.
//
// An error is returned if the input cannot be opened, if reading it fails
// part-way (including a line too long for the scanner), or if writeDb fails.
func frequncyExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
	reader, err := os.Open(inputPath)
	if err != nil {
		return err
	}
	defer reader.Close()

	var frequencies dbFrequencyList

	scanner := bufio.NewScanner(reader)
	for scanner.Scan() {
		line := scanner.Text()
		if strings.HasPrefix(line, "#") {
			continue
		}

		parts := strings.Split(line, "\t")
		if len(parts) != 2 {
			continue
		}

		count, err := strconv.Atoi(parts[1])
		if err != nil {
			continue
		}

		frequencies = append(frequencies, dbFrequency{Expression: parts[0], Count: count})
	}

	// Scan reports false for both clean EOF and a read failure; without this
	// check a truncated read would silently produce a partial database.
	if err := scanner.Err(); err != nil {
		return err
	}

	recordData := map[string]dbRecordList{
		"frequencies": frequencies.crush(),
	}

	return writeDb(
		outputPath,
		title,
		frequencyRevision,
		recordData,
		stride,
		pretty,
	)
}

View File

@ -30,7 +30,7 @@ import (
"github.com/FooSoft/jmdict" "github.com/FooSoft/jmdict"
) )
const KANJIDIC_REVISION = "kanjidic1" const kanjidicRevision = "kanjidic1"
func kanjidicExtractKanji(entry jmdict.KanjidicCharacter, language string) *dbKanji { func kanjidicExtractKanji(entry jmdict.KanjidicCharacter, language string) *dbKanji {
if entry.ReadingMeaning == nil { if entry.ReadingMeaning == nil {
@ -116,26 +116,29 @@ func kanjidicExportDb(inputPath, outputPath, language, title string, stride int,
} }
} }
tagMeta := map[string]dbTagMeta{ tags := dbTagList{
"jouyou": {Notes: "included in list of regular-use characters", Category: "frequent", Order: -5}, dbTag{Name: "jouyou", Notes: "included in list of regular-use characters", Category: "frequent", Order: -5},
"jinmeiyou": {Notes: "included in list of characters for use in personal names", Category: "frequent", Order: -5}, dbTag{Name: "jinmeiyou", Notes: "included in list of characters for use in personal names", Category: "frequent", Order: -5},
"jlpt": {Notes: "corresponding Japanese Language Proficiency Test level"}, dbTag{Name: "jlpt", Notes: "corresponding Japanese Language Proficiency Test level"},
"grade": {Notes: "school grade level at which the character is taught"}, dbTag{Name: "grade", Notes: "school grade level at which the character is taught"},
"strokes": {Notes: "number of strokes needed to write the character"}, dbTag{Name: "strokes", Notes: "number of strokes needed to write the character"},
"heisig": {Notes: "frame number in Remembering the Kanji"}, dbTag{Name: "heisig", Notes: "frame number in Remembering the Kanji"},
} }
if title == "" { if title == "" {
title = "KANJIDIC2" title = "KANJIDIC2"
} }
recordData := map[string]dbRecordList{
"kanji": kanji.crush(),
"tags": tags.crush(),
}
return writeDb( return writeDb(
outputPath, outputPath,
title, title,
KANJIDIC_REVISION, kanjidicRevision,
nil, recordData,
kanji.crush(),
tagMeta,
stride, stride,
pretty, pretty,
) )

View File

@ -30,7 +30,7 @@ import (
_ "github.com/mattn/go-sqlite3" _ "github.com/mattn/go-sqlite3"
) )
const RIKAI_REVISION = "rikai2" const rikaiRevision = "rikai2"
type rikaiEntry struct { type rikaiEntry struct {
kanji string kanji string
@ -158,21 +158,24 @@ func rikaiExportDb(inputPath, outputPath, language, title string, stride int, pr
title = "Rikai" title = "Rikai"
} }
entities := map[string]dbTagMeta{ tags := dbTagList{
"P": {Category: "popular", Order: -10}, dbTag{Name: "P", Category: "popular", Order: -10},
"exp": {Category: "expression", Order: -5}, dbTag{Name: "exp", Category: "expression", Order: -5},
"id": {Category: "expression", Order: -5}, dbTag{Name: "id", Category: "expression", Order: -5},
"arch": {Category: "archaism", Order: -4}, dbTag{Name: "arch", Category: "archaism", Order: -4},
"iK": {Category: "archaism", Order: -4}, dbTag{Name: "iK", Category: "archaism", Order: -4},
}
recordData := map[string]dbRecordList{
"terms": terms.crush(),
"tags": tags.crush(),
} }
return writeDb( return writeDb(
outputPath, outputPath,
title, title,
RIKAI_REVISION, rikaiRevision,
terms.crush(), recordData,
nil,
entities,
stride, stride,
pretty, pretty,
) )