From b66d908b23affca52dee1d9583a9a718a5bf430b Mon Sep 17 00:00:00 2001 From: Alex Yatskov Date: Thu, 31 Dec 2020 21:53:10 -0800 Subject: [PATCH] Switch to zig for EPWING parsing --- common.go | 11 +++--- daijirin.go | 6 ++- daijisen.go | 6 ++- epwing.go | 107 +++------------------------------------------------- gakken.go | 6 ++- go.mod | 9 ----- go.sum | 6 --- gui.go | 2 +- kotowaza.go | 6 ++- koujien.go | 6 ++- main.go | 2 +- meikyou.go | 6 ++- wadai.go | 6 ++- 13 files changed, 42 insertions(+), 137 deletions(-) delete mode 100644 go.mod delete mode 100644 go.sum diff --git a/common.go b/common.go index 270acf6..d263bdb 100644 --- a/common.go +++ b/common.go @@ -266,8 +266,8 @@ func hasString(needle string, haystack []string) bool { return false } -func detectFormat(path string) (string, error) { - switch filepath.Ext(path) { +func detectFormat(path *string) (string, error) { + switch filepath.Ext(*path) { case ".sqlite": return "rikai", nil case ".kanjifreq": @@ -276,7 +276,7 @@ func detectFormat(path string) (string, error) { return "termfreq", nil } - switch filepath.Base(path) { + switch filepath.Base(*path) { case "JMdict", "JMdict.xml", "JMdict_e", "JMdict_e.xml": return "edict", nil case "JMnedict", "JMnedict.xml": @@ -284,16 +284,17 @@ func detectFormat(path string) (string, error) { case "kanjidic2", "kanjidic2.xml": return "kanjidic", nil case "CATALOGS": + *path = filepath.Dir(*path) return "epwing", nil } - info, err := os.Stat(path) + info, err := os.Stat(*path) if err != nil { return "", err } if info.IsDir() { - _, err := os.Stat(filepath.Join(path, "CATALOGS")) + _, err := os.Stat(filepath.Join(*path, "CATALOGS")) if err == nil { return "epwing", nil } diff --git a/daijirin.go b/daijirin.go index 47d2054..1617d3d 100644 --- a/daijirin.go +++ b/daijirin.go @@ -25,6 +25,8 @@ package main import ( "regexp" "strings" + + "github.com/FooSoft/zero-epwing-go/zig" ) type daijirinExtractor struct { @@ -47,7 +49,7 @@ func makeDaijirinExtractor() epwingExtractor { } } -func (e *daijirinExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm { +func (e *daijirinExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm { matches := e.partsExp.FindStringSubmatch(entry.Heading) if matches == nil { return nil @@ -112,7 +114,7 @@ func (e *daijirinExtractor) extractTerms(entry epwingEntry, sequence int) []dbTe return terms } -func (*daijirinExtractor) extractKanji(entry epwingEntry) []dbKanji { +func (*daijirinExtractor) extractKanji(entry zig.BookEntry) []dbKanji { return nil } diff --git a/daijisen.go b/daijisen.go index 44e1f32..ef1de4e 100644 --- a/daijisen.go +++ b/daijisen.go @@ -25,6 +25,8 @@ package main import ( "regexp" "strings" + + "github.com/FooSoft/zero-epwing-go/zig" ) type daijisenExtractor struct { @@ -49,7 +51,7 @@ func makeDaijisenExtractor() epwingExtractor { } } -func (e *daijisenExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm { +func (e *daijisenExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm { matches := e.partsExp.FindStringSubmatch(entry.Heading) if matches == nil { return nil @@ -111,7 +113,7 @@ func (e *daijisenExtractor) extractTerms(entry epwingEntry, sequence int) []dbTe return terms } -func (*daijisenExtractor) extractKanji(entry epwingEntry) []dbKanji { +func (*daijisenExtractor) extractKanji(entry zig.BookEntry) []dbKanji { return nil } diff --git a/epwing.go b/epwing.go index 0647735..4205ef1 100644 --- a/epwing.go +++ b/epwing.go @@ -23,123 +23,29 @@ package main import ( - "bufio" - "encoding/json" "fmt" - "io/ioutil" "log" - "os" - "os/exec" - "path/filepath" "regexp" - "runtime" "strconv" "strings" + + "github.com/FooSoft/zero-epwing-go/zig" ) -type epwingEntry struct { - Heading string `json:"heading"` - Text string `json:"text"` -} - -type epwingSubbook struct { - Title string `json:"title"` - Copyright string `json:"copyright"` - Entries []epwingEntry `json:"entries"` -} - -type epwingBook struct { - CharCode string `json:"charCode"` - DiscCode string `json:"discCode"` - Subbooks []epwingSubbook `json:"subbooks"` -} - type epwingExtractor interface { - extractTerms(entry epwingEntry, sequence int) []dbTerm - extractKanji(entry epwingEntry) []dbKanji + extractTerms(entry zig.BookEntry, sequence int) []dbTerm + extractKanji(entry zig.BookEntry) []dbKanji getFontNarrow() map[int]string getFontWide() map[int]string getRevision() string } func epwingExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error { - stat, err := os.Stat(inputPath) + book, err := zig.Load(inputPath) if err != nil { return err } - var toolExec bool - if stat.IsDir() { - toolExec = true - } else if filepath.Base(inputPath) == "CATALOGS" { - inputPath = filepath.Dir(inputPath) - toolExec = true - } - - var data []byte - if toolExec { - exePath, err := os.Executable() - if err != nil { - return err - } - - toolPath := filepath.Join("bin", runtime.GOOS, "zero-epwing") - if runtime.GOOS == "windows" { - toolPath += ".exe" - } - - toolPath = filepath.Join(filepath.Dir(exePath), toolPath) - - if _, err = os.Stat(toolPath); err != nil { - return fmt.Errorf("failed to find zero-epwing in '%s'", toolPath) - } - - cmd := exec.Command(toolPath, "--entries", inputPath) - - stdout, err := cmd.StdoutPipe() - if err != nil { - return err - } - - stderr, err := cmd.StderrPipe() - if err != nil { - return err - } - - log.Printf("invoking zero-epwing from '%s'...\n", toolPath) - if err := cmd.Start(); err != nil { - return err - } - - go func() { - scanner := bufio.NewScanner(stderr) - for scanner.Scan() { - log.Printf("\t > %s\n", scanner.Text()) - } - }() - - if data, err = ioutil.ReadAll(stdout); err != nil { - return err - } - - if err := cmd.Wait(); err != nil { - return err - } - - log.Println("completed zero-epwing processing") - } else { - data, err = ioutil.ReadFile(inputPath) - } - - if err != nil { - return err - } - - var book epwingBook - if err := json.Unmarshal(data, &book); err != nil { - return err - } - translateExp := regexp.MustCompile(`{{([nw])_(\d+)}}`) epwingExtractors := map[string]epwingExtractor{ "三省堂 スーパー大辞林": makeDaijirinExtractor(), @@ -160,12 +66,11 @@ func epwingExportDb(inputPath, outputPath, language, title string, stride int, p kanji dbKanjiList revisions []string titles []string + sequence int ) log.Println("formatting dictionary data...") - var sequence int - for _, subbook := range book.Subbooks { if extractor, ok := epwingExtractors[subbook.Title]; ok { fontNarrow := extractor.getFontNarrow() diff --git a/gakken.go b/gakken.go index 82026ed..4ec0ec6 100644 --- a/gakken.go +++ b/gakken.go @@ -25,6 +25,8 @@ package main import ( "regexp" "strings" + + "github.com/FooSoft/zero-epwing-go/zig" ) type gakkenExtractor struct { @@ -69,7 +71,7 @@ var cosmetics = strings.NewReplacer("(1)", "①", "(2)", "②", "(3)", "③", "( "セ゛", "ゼ", "ソ゛", "ゾ") -func (e *gakkenExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm { +func (e *gakkenExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm { matches := e.partsExp.FindStringSubmatch(entry.Heading) if matches == nil { return nil @@ -140,7 +142,7 @@ func (e *gakkenExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm return terms } -func (*gakkenExtractor) extractKanji(entry epwingEntry) []dbKanji { +func (*gakkenExtractor) extractKanji(entry zig.BookEntry) []dbKanji { return nil } diff --git a/go.mod b/go.mod deleted file mode 100644 index e1829f1..0000000 --- a/go.mod +++ /dev/null @@ -1,9 +0,0 @@ -module github.com/FooSoft/yomichan-import - -go 1.13 - -require ( - github.com/FooSoft/jmdict v0.0.0-20190926045629-808d66c7b050 - github.com/andlabs/ui v0.0.0-20180902183112-867a9e5a498d - github.com/mattn/go-sqlite3 v2.0.2+incompatible -) diff --git a/go.sum b/go.sum deleted file mode 100644 index 3b7b4f4..0000000 --- a/go.sum +++ /dev/null @@ -1,6 +0,0 @@ -github.com/FooSoft/jmdict v0.0.0-20190926045629-808d66c7b050 h1:3VWk8B61jEgfcV+pEgzWO1j7TVXC9g5QaS0J06994Zc= -github.com/FooSoft/jmdict v0.0.0-20190926045629-808d66c7b050/go.mod h1:zo92ezZlNld5cN1iuS0QRAmSsHcpvcqGZLVNKPM4Hlg= -github.com/andlabs/ui v0.0.0-20180902183112-867a9e5a498d h1:4ianvxb8s3oyizgjuWWxGuTAUU+6JStcvj6BuHS4PVY= -github.com/andlabs/ui v0.0.0-20180902183112-867a9e5a498d/go.mod h1:5G2EjwzgZUPnnReoKvPWVneT8APYbyKkihDVAHUi0II= -github.com/mattn/go-sqlite3 v2.0.2+incompatible h1:qzw9c2GNT8UFrgWNDhCTqRqYUSmu/Dav/9Z58LGpk7U= -github.com/mattn/go-sqlite3 v2.0.2+incompatible/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc= diff --git a/gui.go b/gui.go index 65a2e58..d2ac3fd 100644 --- a/gui.go +++ b/gui.go @@ -116,7 +116,7 @@ func gui() error { return } - format, err := detectFormat(inputPath) + format, err := detectFormat(&inputPath) if err != nil { ui.MsgBoxError(window, "Error", "Unable to detect dictionary format") importButton.Enable() diff --git a/kotowaza.go b/kotowaza.go index feddb8b..13ead72 100644 --- a/kotowaza.go +++ b/kotowaza.go @@ -25,6 +25,8 @@ package main import ( "regexp" "strings" + + "github.com/FooSoft/zero-epwing-go/zig" ) type kotowazaExtractor struct { @@ -43,7 +45,7 @@ func makeKotowazaExtractor() epwingExtractor { } } -func (e *kotowazaExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm { +func (e *kotowazaExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm { heading := entry.Heading queue := []string{heading} @@ -104,7 +106,7 @@ func (e *kotowazaExtractor) extractTerms(entry epwingEntry, sequence int) []dbTe return terms } -func (e *kotowazaExtractor) extractKanji(entry epwingEntry) []dbKanji { +func (e *kotowazaExtractor) extractKanji(entry zig.BookEntry) []dbKanji { return nil } diff --git a/koujien.go b/koujien.go index 96af4d7..773e803 100644 --- a/koujien.go +++ b/koujien.go @@ -25,6 +25,8 @@ package main import ( "regexp" "strings" + + "github.com/FooSoft/zero-epwing-go/zig" ) type koujienExtractor struct { @@ -57,7 +59,7 @@ func makeFuzokuExtractor() epwingExtractor { } } -func (e *koujienExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm { +func (e *koujienExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm { matches := e.partsExp.FindStringSubmatch(entry.Heading) if matches == nil { return nil @@ -122,7 +124,7 @@ func (e *koujienExtractor) extractTerms(entry epwingEntry, sequence int) []dbTer return terms } -func (*koujienExtractor) extractKanji(entry epwingEntry) []dbKanji { +func (*koujienExtractor) extractKanji(entry zig.BookEntry) []dbKanji { return nil } diff --git a/main.go b/main.go index b39d4c3..985d9f9 100644 --- a/main.go +++ b/main.go @@ -107,7 +107,7 @@ func main() { if *format == "" { var err error - if *format, err = detectFormat(inputPath); err != nil { + if *format, err = detectFormat(&inputPath); err != nil { log.Fatal(err) } } diff --git a/meikyou.go b/meikyou.go index ed0bf42..bd4d90f 100644 --- a/meikyou.go +++ b/meikyou.go @@ -25,6 +25,8 @@ package main import ( "regexp" "strings" + + "github.com/FooSoft/zero-epwing-go/zig" ) type meikyouExtractor struct { @@ -77,7 +79,7 @@ func makeMeikyouExtractor() epwingExtractor { } } -func (e *meikyouExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm { +func (e *meikyouExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm { matches := e.partsExp.FindStringSubmatch(entry.Heading) if matches == nil { return nil @@ -153,7 +155,7 @@ func (e *meikyouExtractor) extractTerms(entry epwingEntry, sequence int) []dbTer return terms } -func (e *meikyouExtractor) extractKanji(entry epwingEntry) []dbKanji { +func (e *meikyouExtractor) extractKanji(entry zig.BookEntry) []dbKanji { return nil } diff --git a/wadai.go b/wadai.go index 1fddcfb..489e0ab 100644 --- a/wadai.go +++ b/wadai.go @@ -25,6 +25,8 @@ package main import ( "regexp" "strings" + + "github.com/FooSoft/zero-epwing-go/zig" ) type wadaiExtractor struct { @@ -45,7 +47,7 @@ func makeWadaiExtractor() epwingExtractor { } } -func (e *wadaiExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm { +func (e *wadaiExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm { matches := e.partsExp.FindStringSubmatch(entry.Heading) if matches == nil { return nil @@ -104,7 +106,7 @@ func (e *wadaiExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm return terms } -func (e *wadaiExtractor) extractKanji(entry epwingEntry) []dbKanji { +func (e *wadaiExtractor) extractKanji(entry zig.BookEntry) []dbKanji { return nil }