1
yomichan-import/common.go

356 lines
6.8 KiB
Go
Raw Normal View History

2021-01-01 22:31:58 +00:00
package yomichan
2016-08-07 01:17:02 +00:00
import (
2017-06-26 00:22:17 +00:00
"archive/zip"
"bytes"
2016-08-07 01:17:02 +00:00
"encoding/json"
2017-06-26 01:06:41 +00:00
"errors"
2016-08-23 03:51:30 +00:00
"fmt"
"os"
2016-12-29 01:45:33 +00:00
"path/filepath"
2016-08-07 01:17:02 +00:00
"strings"
"golang.org/x/exp/slices"
2016-08-07 01:17:02 +00:00
)
2021-01-01 22:31:58 +00:00
// Default values for the export options.
const (
	// DefaultFormat is the empty format name; ExportDb treats it as a request
	// to detect the format from the input path.
	DefaultFormat = ""
	// DefaultLanguage is the default (empty) language.
	DefaultLanguage = ""
	// DefaultPretty is the default for pretty-printing — presumably fed to the
	// pretty parameter of ExportDb; confirm against the callers.
	DefaultPretty = false
	// DefaultStride is the default stride — presumably the number of records
	// per bank file passed to ExportDb; confirm against the callers.
	DefaultStride = 10000
	// DefaultTitle is the default (empty) dictionary title.
	DefaultTitle = ""
)
// dbRecord is one row of dictionary data: an ordered list of loosely typed
// values.
type dbRecord []any

// dbRecordList is an ordered collection of records.
type dbRecordList []dbRecord

// dbTag holds the attributes of a single tag.
type dbTag struct {
	Name     string
	Category string
	Order    int
	Notes    string
	Score    int
}

// dbTagList is a list of tags that can be flattened into records with crush.
type dbTagList []dbTag

// crush flattens every tag into a positional record laid out as
// [Name, Category, Order, Notes, Score], preserving the input order.
// It returns nil when the list is empty.
func (meta dbTagList) crush() dbRecordList {
	if len(meta) == 0 {
		return nil
	}

	// The final length is known, so allocate once instead of growing the
	// slice through repeated appends.
	results := make(dbRecordList, len(meta))
	for i, m := range meta {
		results[i] = dbRecord{m.Name, m.Category, m.Order, m.Notes, m.Score}
	}
	return results
}
2017-09-23 06:03:05 +00:00
type dbMeta struct {
2017-09-10 20:25:11 +00:00
Expression string
2017-09-23 06:03:05 +00:00
Mode string
Data any
2017-09-10 20:25:11 +00:00
}
2017-09-23 06:03:05 +00:00
type dbMetaList []dbMeta
2017-09-10 20:25:11 +00:00
2017-09-23 06:03:05 +00:00
func (freqs dbMetaList) crush() dbRecordList {
2017-09-10 20:25:11 +00:00
var results dbRecordList
for _, f := range freqs {
2017-09-23 06:03:05 +00:00
results = append(results, dbRecord{f.Expression, f.Mode, f.Data})
2017-09-10 20:25:11 +00:00
}
return results
2016-12-17 23:48:13 +00:00
}
2016-11-05 20:13:13 +00:00
type dbTerm struct {
2017-10-12 23:46:06 +00:00
Expression string
Reading string
DefinitionTags []string
Rules []string
Score int
Glossary []any
2017-10-12 23:46:06 +00:00
Sequence int
TermTags []string
2016-08-07 01:17:02 +00:00
}
2016-11-05 20:13:13 +00:00
type dbTermList []dbTerm
2017-10-12 23:46:06 +00:00
func (term *dbTerm) addDefinitionTags(tags ...string) {
term.DefinitionTags = appendStringUnique(term.DefinitionTags, tags...)
2016-08-07 01:17:02 +00:00
}
2017-10-12 23:46:06 +00:00
func (term *dbTerm) addTermTags(tags ...string) {
term.TermTags = appendStringUnique(term.TermTags, tags...)
2017-10-11 06:39:29 +00:00
}
2016-12-17 23:48:13 +00:00
func (term *dbTerm) addRules(rules ...string) {
term.Rules = appendStringUnique(term.Rules, rules...)
2016-08-07 20:24:56 +00:00
}
2017-09-10 20:25:11 +00:00
func (terms dbTermList) crush() dbRecordList {
var results dbRecordList
2016-11-05 20:13:13 +00:00
for _, t := range terms {
2017-09-10 20:25:11 +00:00
result := dbRecord{
2016-11-05 20:13:13 +00:00
t.Expression,
t.Reading,
2017-10-12 23:46:06 +00:00
strings.Join(t.DefinitionTags, " "),
2016-12-17 23:48:13 +00:00
strings.Join(t.Rules, " "),
2016-12-18 03:24:08 +00:00
t.Score,
2017-09-16 20:54:28 +00:00
t.Glossary,
2017-10-11 02:30:59 +00:00
t.Sequence,
2017-10-12 23:46:06 +00:00
strings.Join(t.TermTags, " "),
2016-11-05 20:13:13 +00:00
}
results = append(results, result)
2016-08-22 02:51:43 +00:00
}
2016-08-07 01:17:02 +00:00
2016-11-05 20:13:13 +00:00
return results
}
type dbKanji struct {
Character string
Onyomi []string
Kunyomi []string
Tags []string
Meanings []string
2017-09-16 20:54:28 +00:00
Stats map[string]string
2016-11-05 20:13:13 +00:00
}
type dbKanjiList []dbKanji
func (kanji *dbKanji) addTags(tags ...string) {
for _, tag := range tags {
if !slices.Contains(kanji.Tags, tag) {
2016-11-05 20:13:13 +00:00
kanji.Tags = append(kanji.Tags, tag)
}
2016-08-07 01:17:02 +00:00
}
2016-11-05 20:13:13 +00:00
}
2017-09-10 20:25:11 +00:00
func (kanji dbKanjiList) crush() dbRecordList {
var results dbRecordList
2016-11-05 20:13:13 +00:00
for _, k := range kanji {
2017-09-10 20:25:11 +00:00
result := dbRecord{
2016-11-05 20:13:13 +00:00
k.Character,
strings.Join(k.Onyomi, " "),
strings.Join(k.Kunyomi, " "),
strings.Join(k.Tags, " "),
2017-09-16 20:54:28 +00:00
k.Meanings,
k.Stats,
2016-12-18 03:24:08 +00:00
}
2016-11-05 20:13:13 +00:00
results = append(results, result)
2016-08-23 03:51:30 +00:00
}
2016-11-05 20:13:13 +00:00
return results
2016-08-23 03:51:30 +00:00
}
2016-08-07 01:17:02 +00:00
// dbIndex is the dictionary metadata that writeDb serializes as "index.json".
type dbIndex struct {
	Title       string `json:"title"`
	Format      int    `json:"format"`
	Revision    string `json:"revision"`
	Sequenced   bool   `json:"sequenced"`
	Author      string `json:"author"`
	Url         string `json:"url"`
	Description string `json:"description"`
	Attribution string `json:"attribution"`
}

// setDefaults fills in Format, Author and Url when they still hold their zero
// values. Fields that were already set, and all other fields, are untouched.
func (idx *dbIndex) setDefaults() {
	const (
		fallbackFormat = 3
		fallbackAuthor = "yomichan-import"
		fallbackURL    = "https://github.com/FooSoft/yomichan-import"
	)

	if idx.Format == 0 {
		idx.Format = fallbackFormat
	}
	if len(idx.Author) == 0 {
		idx.Author = fallbackAuthor
	}
	if len(idx.Url) == 0 {
		idx.Url = fallbackURL
	}
}
func writeDb(outputPath string, index dbIndex, recordData map[string]dbRecordList, stride int, pretty bool) error {
2017-06-26 00:22:17 +00:00
var zbuff bytes.Buffer
zip := zip.NewWriter(&zbuff)
marshalJSON := func(obj any, pretty bool) ([]byte, error) {
2016-11-05 20:13:13 +00:00
if pretty {
return json.MarshalIndent(obj, "", " ")
}
return json.Marshal(obj)
}
2017-09-10 20:25:11 +00:00
writeDbRecords := func(prefix string, records dbRecordList) (int, error) {
2016-11-06 06:24:57 +00:00
recordCount := len(records)
bankCount := 0
2016-12-29 01:45:33 +00:00
for i := 0; i < recordCount; i += stride {
2016-11-06 06:24:57 +00:00
indexSrc := i
2016-12-29 01:45:33 +00:00
indexDst := i + stride
2016-11-06 06:24:57 +00:00
if indexDst > recordCount {
indexDst = recordCount
}
2017-09-10 20:25:11 +00:00
bytes, err := marshalJSON(records[indexSrc:indexDst], pretty)
2016-11-06 06:24:57 +00:00
if err != nil {
return 0, err
}
2017-06-26 00:22:17 +00:00
zw, err := zip.Create(fmt.Sprintf("%s_bank_%d.json", prefix, i/stride+1))
2016-11-06 06:24:57 +00:00
if err != nil {
return 0, err
}
2017-06-26 00:22:17 +00:00
if _, err := zw.Write(bytes); err != nil {
2016-11-06 06:24:57 +00:00
return 0, err
}
2017-09-10 20:25:11 +00:00
bankCount++
2016-11-06 06:24:57 +00:00
}
return bankCount, nil
2016-11-05 20:13:13 +00:00
}
2016-11-06 06:24:57 +00:00
var err error
2017-09-10 20:25:11 +00:00
for recordType, recordEntries := range recordData {
if _, err := writeDbRecords(recordType, recordEntries); err != nil {
return err
}
2016-08-23 03:51:30 +00:00
}
2016-08-07 01:17:02 +00:00
2023-01-29 00:39:08 +00:00
index.setDefaults()
bytes, err := marshalJSON(index, pretty)
2016-08-23 03:51:30 +00:00
if err != nil {
return err
2016-08-07 01:17:02 +00:00
}
2017-06-26 00:22:17 +00:00
zw, err := zip.Create("index.json")
if err != nil {
return err
}
if _, err := zw.Write(bytes); err != nil {
return err
}
zip.Close()
fp, err := os.Create(outputPath)
2016-08-07 01:17:02 +00:00
if err != nil {
return err
}
2017-06-26 00:22:17 +00:00
if _, err := fp.Write(zbuff.Bytes()); err != nil {
2016-08-23 03:51:30 +00:00
return err
}
2017-06-26 00:22:17 +00:00
return fp.Close()
2016-08-24 16:02:26 +00:00
}
2016-12-17 23:48:13 +00:00
// appendStringUnique appends to target each string of source that target does
// not already contain, and returns the resulting slice. Strings are checked
// one at a time against the growing result, so duplicates within source are
// added only once. Entries already in target are never removed.
func appendStringUnique(target []string, source ...string) []string {
	for _, candidate := range source {
		if slices.Contains(target, candidate) {
			continue
		}
		target = append(target, candidate)
	}
	return target
}
// intersection returns the strings of s2 that also occur in s1. Each string
// is reported at most once, in the order of its first appearance in s2.
// The result is an empty, non-nil slice when there is nothing in common.
func intersection(s1, s2 []string) []string {
	// Set of s1 members that have not been reported yet.
	pending := make(map[string]struct{})
	for _, item := range s1 {
		pending[item] = struct{}{}
	}

	common := []string{}
	for _, item := range s2 {
		if _, ok := pending[item]; !ok {
			continue
		}
		common = append(common, item)
		// Drop the member so that a repeat in s2 is not reported twice.
		delete(pending, item)
	}
	return common
}
// union returns the distinct strings of s1 followed by the distinct strings
// of s2 that were not already taken from s1, each in order of first
// appearance. The result is an empty, non-nil slice when both inputs are
// empty.
func union(s1, s2 []string) []string {
	seen := make(map[string]struct{})
	merged := []string{}

	for _, list := range [][]string{s1, s2} {
		for _, item := range list {
			if _, dup := seen[item]; dup {
				continue
			}
			seen[item] = struct{}{}
			merged = append(merged, item)
		}
	}
	return merged
}
2021-01-02 00:18:55 +00:00
// detectFormat guesses the dictionary format of the input at path.
//
// Checks are applied in order: the file extension, then the base name, and
// finally, for directories, the presence of a "CATALOGS" or "catalogs" entry.
// The first two checks look only at the path string; the file system is
// consulted only when neither of them matches.
//
// It returns the format name on success, the os.Stat error when path cannot
// be examined, and an "unrecognized dictionary format" error when nothing
// matches.
func detectFormat(path string) (string, error) {
	// Formats identified purely by the file extension.
	ext := filepath.Ext(path)
	if ext == ".sqlite" {
		return "rikai", nil
	}
	if ext == ".kanjifreq" {
		return "kanjifreq", nil
	}
	if ext == ".termfreq" {
		return "termfreq", nil
	}

	// Formats identified by a well-known file name.
	knownNames := []struct {
		format string
		names  []string
	}{
		{"edict", []string{"JMdict", "JMdict.xml", "JMdict_e", "JMdict_e.xml", "JMdict_e_examp"}},
		{"enamdict", []string{"JMnedict", "JMnedict.xml"}},
		{"kanjidic", []string{"kanjidic2", "kanjidic2.xml"}},
	}
	base := filepath.Base(path)
	for _, known := range knownNames {
		for _, name := range known.names {
			if base == name {
				return known.format, nil
			}
		}
	}

	// A directory holding a catalog entry (either spelling) is an EPWING
	// dictionary.
	info, err := os.Stat(path)
	if err != nil {
		return "", err
	}
	if info.IsDir() {
		for _, catalog := range []string{"CATALOGS", "catalogs"} {
			if _, statErr := os.Stat(filepath.Join(path, catalog)); statErr == nil {
				return "epwing", nil
			}
		}
	}

	return "", errors.New("unrecognized dictionary format")
}
2021-01-01 22:31:58 +00:00
func ExportDb(inputPath, outputPath, format, language, title string, stride int, pretty bool) error {
handlers := map[string]func(string, string, string, string, int, bool) error{
2023-02-02 01:14:37 +00:00
"edict": jmdictExportDb,
2023-01-22 20:37:18 +00:00
"forms": formsExportDb,
2021-01-01 22:31:58 +00:00
"enamdict": jmnedictExportDb,
"epwing": epwingExportDb,
"kanjidic": kanjidicExportDb,
"rikai": rikaiExportDb,
"kanjifreq": frequencyKanjiExportDb,
"termfreq": frequencyTermsExportDb,
}
var err error
if format == DefaultFormat {
2021-01-02 00:18:55 +00:00
if format, err = detectFormat(inputPath); err != nil {
2021-01-01 22:31:58 +00:00
return err
}
}
handler, ok := handlers[strings.ToLower(format)]
if !ok {
return errors.New("unrecognized dictionary format")
}
return handler(inputPath, outputPath, strings.ToLower(language), title, stride, pretty)
}