1

Auto-detect dictionary format

This commit is contained in:
Alex Yatskov 2016-12-28 17:45:33 -08:00
parent da60bede76
commit a78a5a4a4a
6 changed files with 82 additions and 34 deletions

View File

@ -27,6 +27,7 @@ import (
"fmt"
"os"
"path"
"path/filepath"
"strings"
)
@ -114,9 +115,8 @@ func (kanji dbKanjiList) crush() [][]interface{} {
return results
}
func writeDb(outputDir, title string, revision string, termRecords [][]interface{}, kanjiRecords [][]interface{}, tagMeta map[string]dbTagMeta, pretty bool) error {
func writeDb(outputDir, title string, revision string, termRecords [][]interface{}, kanjiRecords [][]interface{}, tagMeta map[string]dbTagMeta, stride int, pretty bool) error {
const DB_VERSION = 1
const BANK_STRIDE = 10000
marshalJson := func(obj interface{}, pretty bool) ([]byte, error) {
if pretty {
@ -130,9 +130,9 @@ func writeDb(outputDir, title string, revision string, termRecords [][]interface
recordCount := len(records)
bankCount := 0
for i := 0; i < recordCount; i += BANK_STRIDE {
for i := 0; i < recordCount; i += stride {
indexSrc := i
indexDst := i + BANK_STRIDE
indexDst := i + stride
if indexDst > recordCount {
indexDst = recordCount
}
@ -142,7 +142,7 @@ func writeDb(outputDir, title string, revision string, termRecords [][]interface
return 0, err
}
fp, err := os.Create(path.Join(outputDir, fmt.Sprintf("%s_bank_%d.json", prefix, i/BANK_STRIDE+1)))
fp, err := os.Create(path.Join(outputDir, fmt.Sprintf("%s_bank_%d.json", prefix, i/stride+1)))
if err != nil {
return 0, err
}
@ -222,3 +222,29 @@ func hasString(needle string, haystack []string) bool {
return false
}
// detectFormat guesses the dictionary format from the filesystem entry at path.
//
// A directory that contains an entry named "CATALOGS" is reported as "epwing".
// A regular file is identified purely by its base name: "JMdict_e.xml" maps to
// "edict", "JMnedict.xml" to "enamdict" and "kanjidic2.xml" to "kanjidic".
//
// The empty string is returned when path cannot be stat'ed or when nothing
// matches.
func detectFormat(path string) string {
	stat, statErr := os.Stat(path)
	if statErr != nil {
		return ""
	}

	if stat.IsDir() {
		// Only the presence of the CATALOGS entry is checked, not its contents.
		if _, catErr := os.Stat(filepath.Join(path, "CATALOGS")); catErr != nil {
			return ""
		}
		return "epwing"
	}

	// A lookup miss yields the zero value "", i.e. "format unknown".
	formatByName := map[string]string{
		"JMdict_e.xml":  "edict",
		"JMnedict.xml":  "enamdict",
		"kanjidic2.xml": "kanjidic",
	}
	return formatByName[filepath.Base(path)]
}

View File

@ -165,7 +165,7 @@ func jmdictExtractTerms(edictEntry jmdict.JmdictEntry) []dbTerm {
return terms
}
func jmdictExportDb(inputPath, outputDir, title string, pretty bool) error {
func jmdictExportDb(inputPath, outputDir, title string, stride int, pretty bool) error {
reader, err := os.Open(inputPath)
if err != nil {
return err
@ -182,6 +182,10 @@ func jmdictExportDb(inputPath, outputDir, title string, pretty bool) error {
terms = append(terms, jmdictExtractTerms(entry)...)
}
if title == "" {
title = "JMdict"
}
return writeDb(
outputDir,
title,
@ -189,6 +193,7 @@ func jmdictExportDb(inputPath, outputDir, title string, pretty bool) error {
terms.crush(),
nil,
jmdictBuildTagMeta(entities),
stride,
pretty,
)
}

View File

@ -97,7 +97,7 @@ func jmnedictExtractTerms(enamdictEntry jmdict.JmnedictEntry) []dbTerm {
return terms
}
func jmnedictExportDb(inputPath, outputDir, title string, pretty bool) error {
func jmnedictExportDb(inputPath, outputDir, title string, stride int, pretty bool) error {
reader, err := os.Open(inputPath)
if err != nil {
return err
@ -114,6 +114,10 @@ func jmnedictExportDb(inputPath, outputDir, title string, pretty bool) error {
terms = append(terms, jmnedictExtractTerms(e)...)
}
if title == "" {
title = "JMnedict"
}
return writeDb(
outputDir,
title,
@ -121,6 +125,7 @@ func jmnedictExportDb(inputPath, outputDir, title string, pretty bool) error {
terms.crush(),
nil,
jmnedictBuildTagMeta(entities),
stride,
pretty,
)
}

View File

@ -60,7 +60,7 @@ type epwingExtractor interface {
getRevision() string
}
func epwingExportDb(inputPath, outputDir, title string, pretty bool) error {
func epwingExportDb(inputPath, outputDir, title string, stride int, pretty bool) error {
stat, err := os.Stat(inputPath)
if err != nil {
return err
@ -99,9 +99,12 @@ func epwingExportDb(inputPath, outputDir, title string, pretty bool) error {
"三省堂 スーパー大辞林": makeDaijirinExtractor(),
}
var terms dbTermList
var kanji dbKanjiList
var revisions []string
var (
terms dbTermList
kanji dbKanjiList
revisions []string
titles []string
)
for _, subbook := range book.Subbooks {
if extractor, ok := epwingExtractors[subbook.Title]; ok {
@ -138,11 +141,16 @@ func epwingExportDb(inputPath, outputDir, title string, pretty bool) error {
}
revisions = append(revisions, extractor.getRevision())
titles = append(titles, subbook.Title)
} else {
return fmt.Errorf("failed to find compatible extractor for '%s'", subbook.Title)
}
}
if title == "" {
title = strings.Join(titles, ", ")
}
return writeDb(
outputDir,
title,
@ -150,6 +158,7 @@ func epwingExportDb(inputPath, outputDir, title string, pretty bool) error {
terms.crush(),
kanji.crush(),
nil,
stride,
pretty,
)
}

View File

@ -80,7 +80,7 @@ func kanjidicExtractKanji(entry jmdict.KanjidicCharacter) dbKanji {
return kanji
}
func kanjidicExportDb(inputPath, outputDir, title string, pretty bool) error {
func kanjidicExportDb(inputPath, outputDir, title string, stride int, pretty bool) error {
reader, err := os.Open(inputPath)
if err != nil {
return err
@ -106,6 +106,10 @@ func kanjidicExportDb(inputPath, outputDir, title string, pretty bool) error {
"heisig": {Notes: "frame number in Remembering the Kanji"},
}
if title == "" {
title = "KANJIDIC2"
}
return writeDb(
outputDir,
title,
@ -113,6 +117,7 @@ func kanjidicExportDb(inputPath, outputDir, title string, pretty bool) error {
nil,
kanji.crush(),
tagMeta,
stride,
pretty,
)
}

42
main.go
View File

@ -31,17 +31,16 @@ import (
"net/http"
"os"
"path"
"time"
)
// usage writes the command-line synopsis to standard error, using the base
// name of os.Args[0] as the program name, then a "Parameters:" heading, and
// finally the defaults of the defined flags via flag.PrintDefaults.
func usage() {
fmt.Fprintf(os.Stderr, "Usage:\n %s [options] [edict|enamdict|kanjidic|epwing] input-path [output-dir]\n\n", path.Base(os.Args[0]))
fmt.Fprintf(os.Stderr, "Usage:\n %s [options] input-path [output-dir]\n\n", path.Base(os.Args[0]))
fmt.Fprintf(os.Stderr, "Parameters:\n")
flag.PrintDefaults()
}
func exportDb(inputPath, outputDir, format, title string, pretty bool) error {
handlers := map[string]func(string, string, string, bool) error{
func exportDb(inputPath, outputDir, format, title string, stride int, pretty bool) error {
handlers := map[string]func(string, string, string, int, bool) error{
"edict": jmdictExportDb,
"enamdict": jmnedictExportDb,
"kanjidic": kanjidicExportDb,
@ -53,39 +52,43 @@ func exportDb(inputPath, outputDir, format, title string, pretty bool) error {
return errors.New("unrecognized dictionray format")
}
log.Printf("converting '%s' to '%s'...", inputPath, outputDir)
return handler(inputPath, outputDir, title, pretty)
log.Printf("converting '%s' to '%s' in '%s' format...", inputPath, outputDir, format)
return handler(inputPath, outputDir, title, stride, pretty)
}
// serveDb logs a startup message and then serves the contents of serveDir as
// static files over HTTP, listening on all interfaces at the given port.
// It returns whatever error http.ListenAndServe returns.
func serveDb(serveDir string, port int) error {
log.Printf("starting HTTP server on port %d...\n", port)
log.Printf("starting dictionary server on port %d...\n", port)
return http.ListenAndServe(fmt.Sprintf(":%d", port), http.FileServer(http.Dir(serveDir)))
}
func main() {
var (
serve = flag.Bool("serve", false, "serve JSON over HTTP")
format = flag.String("format", "", "dictionary format [edict|enamdict|kanjidic|epwing]")
port = flag.Int("port", 9876, "port to serve JSON on")
pretty = flag.Bool("pretty", false, "output prettified JSON")
serve = flag.Bool("serve", false, "serve JSON over HTTP")
stride = flag.Int("stride", 10000, "dictionary bank stride")
title = flag.String("title", "", "dictionary title")
)
flag.Usage = usage
flag.Parse()
if flag.NArg() != 2 && flag.NArg() != 3 {
if flag.NArg() != 1 && flag.NArg() != 2 {
usage()
os.Exit(2)
}
var (
format = flag.Arg(0)
inputPath = flag.Arg(1)
outputDir string
)
inputPath := flag.Arg(0)
if *format == "" {
if *format = detectFormat(inputPath); *format == "" {
log.Fatal("failed to detect dictionary format")
}
}
if flag.NArg() == 3 {
outputDir = flag.Arg(2)
var outputDir string
if flag.NArg() == 2 {
outputDir = flag.Arg(1)
} else {
var err error
outputDir, err = ioutil.TempDir("", "yomichan_tmp_")
@ -94,12 +97,7 @@ func main() {
}
}
if *title == "" {
t := time.Now()
*title = fmt.Sprintf("%s-%s", format, t.Format("20060102150405"))
}
if err := exportDb(inputPath, outputDir, format, *title, *pretty); err != nil {
if err := exportDb(inputPath, outputDir, *format, *title, *stride, *pretty); err != nil {
log.Fatal(err)
}