1

initial work on db redesign

This commit is contained in:
Alex Yatskov 2017-09-10 13:25:11 -07:00
parent f0d72fefaa
commit 532838764b
7 changed files with 201 additions and 82 deletions

View File

@ -33,10 +33,43 @@ import (
"strings"
)
type dbTagMeta struct {
Category string `json:"category,omitempty"`
Notes string `json:"notes,omitempty"`
Order int `json:"order,omitempty"`
// databaseVersion is the dictionary format version; writeDb stores it in the
// Version field of the index it serializes.
const databaseVersion = 2

// Generic row containers shared by every record kind.
type (
	// dbRecord is a single row: an ordered list of loosely typed column values.
	dbRecord []interface{}

	// dbRecordList is a sequence of rows belonging to one record kind.
	dbRecordList []dbRecord
)

// dbTag holds the metadata attached to a single tag.
type dbTag struct {
	Name     string // tag identifier, e.g. "P" or "news"
	Category string // grouping label, e.g. "frequent" or "popular"
	Order    int    // presumably a relative ordering priority -- confirm against the consumer
	Notes    string // human readable description of the tag
}

// dbTagList is a collection of tags that can be flattened into rows.
type dbTagList []dbTag

// crush flattens every tag into a row laid out as [Name, Category, Order, Notes].
// The result is nil when the list holds no tags.
func (tags dbTagList) crush() dbRecordList {
	var rows dbRecordList
	for i := range tags {
		tag := &tags[i]
		rows = append(rows, dbRecord{tag.Name, tag.Category, tag.Order, tag.Notes})
	}

	return rows
}

// dbFrequency pairs an expression with its occurrence count.
type dbFrequency struct {
	Expression string
	Count      int
}

// dbFrequencyList is a collection of frequency entries that can be flattened into rows.
type dbFrequencyList []dbFrequency

// crush flattens every entry into a row laid out as [Expression, Count].
// The result is nil when the list holds no entries.
func (list dbFrequencyList) crush() dbRecordList {
	var rows dbRecordList
	for i := range list {
		rows = append(rows, dbRecord{list[i].Expression, list[i].Count})
	}

	return rows
}
type dbTerm struct {
@ -58,10 +91,10 @@ func (term *dbTerm) addRules(rules ...string) {
term.Rules = appendStringUnique(term.Rules, rules...)
}
func (terms dbTermList) crush() [][]interface{} {
var results [][]interface{}
func (terms dbTermList) crush() dbRecordList {
var results dbRecordList
for _, t := range terms {
result := []interface{}{
result := dbRecord{
t.Expression,
t.Reading,
strings.Join(t.Tags, " "),
@ -97,10 +130,10 @@ func (kanji *dbKanji) addTags(tags ...string) {
}
}
func (kanji dbKanjiList) crush() [][]interface{} {
var results [][]interface{}
func (kanji dbKanjiList) crush() dbRecordList {
var results dbRecordList
for _, k := range kanji {
result := []interface{}{
result := dbRecord{
k.Character,
strings.Join(k.Onyomi, " "),
strings.Join(k.Kunyomi, " "),
@ -117,13 +150,11 @@ func (kanji dbKanjiList) crush() [][]interface{} {
return results
}
func writeDb(outputPath, title, revision string, termRecords [][]interface{}, kanjiRecords [][]interface{}, tagMeta map[string]dbTagMeta, stride int, pretty bool) error {
const DB_VERSION = 1
func writeDb(outputPath, title, revision string, recordData map[string]dbRecordList, stride int, pretty bool) error {
var zbuff bytes.Buffer
zip := zip.NewWriter(&zbuff)
marshalJson := func(obj interface{}, pretty bool) ([]byte, error) {
marshalJSON := func(obj interface{}, pretty bool) ([]byte, error) {
if pretty {
return json.MarshalIndent(obj, "", " ")
}
@ -131,7 +162,7 @@ func writeDb(outputPath, title, revision string, termRecords [][]interface{}, ka
return json.Marshal(obj)
}
writeDbRecords := func(prefix string, records [][]interface{}) (int, error) {
writeDbRecords := func(prefix string, records dbRecordList) (int, error) {
recordCount := len(records)
bankCount := 0
@ -142,7 +173,7 @@ func writeDb(outputPath, title, revision string, termRecords [][]interface{}, ka
indexDst = recordCount
}
bytes, err := marshalJson(records[indexSrc:indexDst], pretty)
bytes, err := marshalJSON(records[indexSrc:indexDst], pretty)
if err != nil {
return 0, err
}
@ -156,7 +187,7 @@ func writeDb(outputPath, title, revision string, termRecords [][]interface{}, ka
return 0, err
}
bankCount += 1
bankCount++
}
return bankCount, nil
@ -164,28 +195,22 @@ func writeDb(outputPath, title, revision string, termRecords [][]interface{}, ka
var err error
var db struct {
Title string `json:"title"`
Version int `json:"version"`
Revision string `json:"revision"`
TagMeta map[string]dbTagMeta `json:"tagMeta"`
TermBanks int `json:"termBanks"`
KanjiBanks int `json:"kanjiBanks"`
Title string `json:"title"`
Version int `json:"version"`
Revision string `json:"revision"`
}
db.Title = title
db.Version = DB_VERSION
db.Version = databaseVersion
db.Revision = revision
db.TagMeta = tagMeta
if db.TermBanks, err = writeDbRecords("term", termRecords); err != nil {
return err
for recordType, recordEntries := range recordData {
if _, err := writeDbRecords(recordType, recordEntries); err != nil {
return err
}
}
if db.KanjiBanks, err = writeDbRecords("kanji", kanjiRecords); err != nil {
return err
}
bytes, err := marshalJson(db, pretty)
bytes, err := marshalJSON(db, pretty)
if err != nil {
return err
}
@ -234,8 +259,13 @@ func hasString(needle string, haystack []string) bool {
}
func detectFormat(path string) (string, error) {
if filepath.Ext(path) == ".sqlite" {
switch filepath.Ext(path) {
case ".sqlite":
return "rikai", nil
case ".kanji_freq":
return "kanji_freq", nil
case ".term_freq":
return "term_freq", nil
}
switch filepath.Base(path) {

View File

@ -29,7 +29,7 @@ import (
"github.com/FooSoft/jmdict"
)
const JMDICT_REVISION = "jmdict3"
const jmdictRevision = "jmdict3"
func jmdictBuildRules(term *dbTerm) {
for _, tag := range term.Tags {
@ -71,17 +71,17 @@ func jmdictAddPriorities(term *dbTerm, priorities ...string) {
}
}
func jmdictBuildTagMeta(entities map[string]string) map[string]dbTagMeta {
tags := map[string]dbTagMeta{
"news": {Notes: "appears frequently in Mainichi Shimbun", Category: "frequent", Order: -2},
"ichi": {Notes: "listed as common in Ichimango Goi Bunruishuu", Category: "frequent", Order: -2},
"spec": {Notes: "common words not included in frequency lists", Category: "frequent", Order: -2},
"gai": {Notes: "common loanword", Category: "frequent", Order: -2},
"P": {Notes: "popular term", Category: "popular", Order: -10},
func jmdictBuildTagMeta(entities map[string]string) dbTagList {
tags := dbTagList{
dbTag{Name: "news", Notes: "appears frequently in Mainichi Shimbun", Category: "frequent", Order: -2},
dbTag{Name: "ichi", Notes: "listed as common in Ichimango Goi Bunruishuu", Category: "frequent", Order: -2},
dbTag{Name: "spec", Notes: "common words not included in frequency lists", Category: "frequent", Order: -2},
dbTag{Name: "gai", Notes: "common loanword", Category: "frequent", Order: -2},
dbTag{Name: "P", Notes: "popular term", Category: "popular", Order: -10},
}
for name, value := range entities {
tag := dbTagMeta{Notes: value}
tag := dbTag{Name: name, Notes: value}
switch name {
case "exp", "id":
@ -92,7 +92,7 @@ func jmdictBuildTagMeta(entities map[string]string) map[string]dbTagMeta {
tag.Order = -4
}
tags[name] = tag
tags = append(tags, tag)
}
return tags
@ -227,13 +227,16 @@ func jmdictExportDb(inputPath, outputPath, language, title string, stride int, p
title = "JMdict"
}
recordData := map[string]dbRecordList{
"terms": terms.crush(),
"tags": jmdictBuildTagMeta(entities).crush(),
}
return writeDb(
outputPath,
title,
JMDICT_REVISION,
terms.crush(),
nil,
jmdictBuildTagMeta(entities),
jmdictRevision,
recordData,
stride,
pretty,
)

View File

@ -28,13 +28,13 @@ import (
"github.com/FooSoft/jmdict"
)
const JMNEDICT_REVISION = "jmnedict1"
const jmnedictRevision = "jmnedict1"
func jmnedictBuildTagMeta(entities map[string]string) map[string]dbTagMeta {
tags := make(map[string]dbTagMeta)
func jmnedictBuildTagMeta(entities map[string]string) dbTagList {
var tags dbTagList
for name, value := range entities {
tag := dbTagMeta{Notes: value}
tag := dbTag{Name: name, Notes: value}
switch name {
case "company", "fem", "given", "masc", "organization", "person", "place", "product", "station", "surname", "unclass", "work":
@ -42,7 +42,7 @@ func jmnedictBuildTagMeta(entities map[string]string) map[string]dbTagMeta {
tag.Order = 4
}
tags[name] = tag
tags = append(tags, tag)
}
return tags
@ -118,13 +118,16 @@ func jmnedictExportDb(inputPath, outputPath, language, title string, stride int,
title = "JMnedict"
}
recordData := map[string]dbRecordList{
"terms": terms.crush(),
"tags": jmnedictBuildTagMeta(entities).crush(),
}
return writeDb(
outputPath,
title,
JMNEDICT_REVISION,
terms.crush(),
nil,
jmnedictBuildTagMeta(entities),
jmnedictRevision,
recordData,
stride,
pretty,
)

View File

@ -200,13 +200,16 @@ func epwingExportDb(inputPath, outputPath, language, title string, stride int, p
title = strings.Join(titles, ", ")
}
recordData := map[string]dbRecordList{
"kanji": kanji.crush(),
"terms": terms.crush(),
}
return writeDb(
outputPath,
title,
strings.Join(revisions, ";"),
terms.crush(),
kanji.crush(),
nil,
recordData,
stride,
pretty,
)

74
frequency.go Normal file
View File

@ -0,0 +1,74 @@
/*
* Copyright (c) 2017 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"bufio"
"os"
"strconv"
"strings"
)
const frequencyRevision = "frequency1"
// frequncyExportDb converts a tab-separated frequency list into a dictionary
// archive by way of writeDb.
//
// Every input line is expected to hold an expression and an integer count
// separated by a single tab. Lines that start with "#", lines that do not
// split into exactly two tab-separated fields, and lines whose count is not a
// valid integer are skipped. The collected entries are passed to writeDb under
// the "frequencies" record type together with title, frequencyRevision, stride
// and pretty. The language parameter is not used by this function.
//
// An error is returned when the input file cannot be opened, when reading it
// fails part way through (including a line too long for the scanner), or when
// writeDb fails.
//
// NOTE(review): the name is misspelled ("frequncy"); it is kept unchanged
// because callers outside this view refer to it.
func frequncyExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
	reader, err := os.Open(inputPath)
	if err != nil {
		return err
	}
	defer reader.Close()

	var frequencies dbFrequencyList

	scanner := bufio.NewScanner(reader)
	for scanner.Scan() {
		line := scanner.Text()
		if strings.HasPrefix(line, "#") {
			continue
		}

		parts := strings.Split(line, "\t")
		if len(parts) != 2 {
			continue
		}

		count, err := strconv.Atoi(parts[1])
		if err != nil {
			// Malformed counts are tolerated; the line is dropped.
			continue
		}

		frequencies = append(frequencies, dbFrequency{parts[0], count})
	}

	// Scan reports false for both end of input and a read failure; without
	// this check a truncated read would be exported as if it were complete.
	if err := scanner.Err(); err != nil {
		return err
	}

	recordData := map[string]dbRecordList{
		"frequencies": frequencies.crush(),
	}

	return writeDb(
		outputPath,
		title,
		frequencyRevision,
		recordData,
		stride,
		pretty,
	)
}

View File

@ -30,7 +30,7 @@ import (
"github.com/FooSoft/jmdict"
)
const KANJIDIC_REVISION = "kanjidic1"
const kanjidicRevision = "kanjidic1"
func kanjidicExtractKanji(entry jmdict.KanjidicCharacter, language string) *dbKanji {
if entry.ReadingMeaning == nil {
@ -116,26 +116,29 @@ func kanjidicExportDb(inputPath, outputPath, language, title string, stride int,
}
}
tagMeta := map[string]dbTagMeta{
"jouyou": {Notes: "included in list of regular-use characters", Category: "frequent", Order: -5},
"jinmeiyou": {Notes: "included in list of characters for use in personal names", Category: "frequent", Order: -5},
"jlpt": {Notes: "corresponding Japanese Language Proficiency Test level"},
"grade": {Notes: "school grade level at which the character is taught"},
"strokes": {Notes: "number of strokes needed to write the character"},
"heisig": {Notes: "frame number in Remembering the Kanji"},
tags := dbTagList{
dbTag{Name: "jouyou", Notes: "included in list of regular-use characters", Category: "frequent", Order: -5},
dbTag{Name: "jinmeiyou", Notes: "included in list of characters for use in personal names", Category: "frequent", Order: -5},
dbTag{Name: "jlpt", Notes: "corresponding Japanese Language Proficiency Test level"},
dbTag{Name: "grade", Notes: "school grade level at which the character is taught"},
dbTag{Name: "strokes", Notes: "number of strokes needed to write the character"},
dbTag{Name: "heisig", Notes: "frame number in Remembering the Kanji"},
}
if title == "" {
title = "KANJIDIC2"
}
recordData := map[string]dbRecordList{
"kanji": kanji.crush(),
"tags": tags.crush(),
}
return writeDb(
outputPath,
title,
KANJIDIC_REVISION,
nil,
kanji.crush(),
tagMeta,
kanjidicRevision,
recordData,
stride,
pretty,
)

View File

@ -30,7 +30,7 @@ import (
_ "github.com/mattn/go-sqlite3"
)
const RIKAI_REVISION = "rikai2"
const rikaiRevision = "rikai2"
type rikaiEntry struct {
kanji string
@ -158,21 +158,24 @@ func rikaiExportDb(inputPath, outputPath, language, title string, stride int, pr
title = "Rikai"
}
entities := map[string]dbTagMeta{
"P": {Category: "popular", Order: -10},
"exp": {Category: "expression", Order: -5},
"id": {Category: "expression", Order: -5},
"arch": {Category: "archaism", Order: -4},
"iK": {Category: "archaism", Order: -4},
tags := dbTagList{
dbTag{Name: "P", Category: "popular", Order: -10},
dbTag{Name: "exp", Category: "expression", Order: -5},
dbTag{Name: "id", Category: "expression", Order: -5},
dbTag{Name: "arch", Category: "archaism", Order: -4},
dbTag{Name: "iK", Category: "archaism", Order: -4},
}
recordData := map[string]dbRecordList{
"terms": terms.crush(),
"tags": tags.crush(),
}
return writeDb(
outputPath,
title,
RIKAI_REVISION,
terms.crush(),
nil,
entities,
rikaiRevision,
recordData,
stride,
pretty,
)