1

Switch to zig for EPWING parsing

This commit is contained in:
Alex Yatskov 2020-12-31 21:53:10 -08:00
parent 50901f7155
commit b66d908b23
13 changed files with 42 additions and 137 deletions

View File

@ -266,8 +266,8 @@ func hasString(needle string, haystack []string) bool {
return false
}
func detectFormat(path string) (string, error) {
switch filepath.Ext(path) {
func detectFormat(path *string) (string, error) {
switch filepath.Ext(*path) {
case ".sqlite":
return "rikai", nil
case ".kanjifreq":
@ -276,7 +276,7 @@ func detectFormat(path string) (string, error) {
return "termfreq", nil
}
switch filepath.Base(path) {
switch filepath.Base(*path) {
case "JMdict", "JMdict.xml", "JMdict_e", "JMdict_e.xml":
return "edict", nil
case "JMnedict", "JMnedict.xml":
@ -284,16 +284,17 @@ func detectFormat(path string) (string, error) {
case "kanjidic2", "kanjidic2.xml":
return "kanjidic", nil
case "CATALOGS":
*path = filepath.Dir(*path)
return "epwing", nil
}
info, err := os.Stat(path)
info, err := os.Stat(*path)
if err != nil {
return "", err
}
if info.IsDir() {
_, err := os.Stat(filepath.Join(path, "CATALOGS"))
_, err := os.Stat(filepath.Join(*path, "CATALOGS"))
if err == nil {
return "epwing", nil
}

View File

@ -25,6 +25,8 @@ package main
import (
"regexp"
"strings"
"github.com/FooSoft/zero-epwing-go/zig"
)
type daijirinExtractor struct {
@ -47,7 +49,7 @@ func makeDaijirinExtractor() epwingExtractor {
}
}
func (e *daijirinExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
func (e *daijirinExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm {
matches := e.partsExp.FindStringSubmatch(entry.Heading)
if matches == nil {
return nil
@ -112,7 +114,7 @@ func (e *daijirinExtractor) extractTerms(entry epwingEntry, sequence int) []dbTe
return terms
}
func (*daijirinExtractor) extractKanji(entry epwingEntry) []dbKanji {
func (*daijirinExtractor) extractKanji(entry zig.BookEntry) []dbKanji {
return nil
}

View File

@ -25,6 +25,8 @@ package main
import (
"regexp"
"strings"
"github.com/FooSoft/zero-epwing-go/zig"
)
type daijisenExtractor struct {
@ -49,7 +51,7 @@ func makeDaijisenExtractor() epwingExtractor {
}
}
func (e *daijisenExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
func (e *daijisenExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm {
matches := e.partsExp.FindStringSubmatch(entry.Heading)
if matches == nil {
return nil
@ -111,7 +113,7 @@ func (e *daijisenExtractor) extractTerms(entry epwingEntry, sequence int) []dbTe
return terms
}
func (*daijisenExtractor) extractKanji(entry epwingEntry) []dbKanji {
func (*daijisenExtractor) extractKanji(entry zig.BookEntry) []dbKanji {
return nil
}

107
epwing.go
View File

@ -23,123 +23,29 @@
package main
import (
"bufio"
"encoding/json"
"fmt"
"io/ioutil"
"log"
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
"github.com/FooSoft/zero-epwing-go/zig"
)
// epwingEntry is a single dictionary entry as decoded from JSON: a heading
// plus the entry's body text.
type epwingEntry struct {
// Heading is read from the "heading" JSON key. The extractors run their
// heading-parsing regular expressions against this field.
Heading string `json:"heading"`
// Text is read from the "text" JSON key.
Text string `json:"text"`
}
// epwingSubbook is one subbook of an EPWING book as decoded from JSON.
type epwingSubbook struct {
// Title is read from the "title" JSON key. epwingExportDb uses it as the
// lookup key when choosing an extractor for the subbook.
Title string `json:"title"`
// Copyright is read from the "copyright" JSON key.
Copyright string `json:"copyright"`
// Entries is read from the "entries" JSON key and holds the subbook's
// entries.
Entries []epwingEntry `json:"entries"`
}
// epwingBook is the top-level JSON document that epwingExportDb unmarshals,
// taken either from the standard output of the zero-epwing tool or from a
// file read directly from disk.
type epwingBook struct {
// CharCode is read from the "charCode" JSON key.
CharCode string `json:"charCode"`
// DiscCode is read from the "discCode" JSON key.
DiscCode string `json:"discCode"`
// Subbooks is read from the "subbooks" JSON key; epwingExportDb ranges over
// it to format each subbook's data.
Subbooks []epwingSubbook `json:"subbooks"`
}
// epwingExtractor is the per-dictionary strategy used by epwingExportDb. One
// implementation is registered per supported subbook title, and the matching
// implementation is looked up by that title.
type epwingExtractor interface {
// extractTerms converts one entry into zero or more dbTerm records; a nil
// result means the entry produced no terms. sequence is supplied by the
// caller.
// extractKanji converts one entry into dbKanji records; a nil result means
// the entry produced no kanji.
extractTerms(entry epwingEntry, sequence int) []dbTerm
extractKanji(entry epwingEntry) []dbKanji
extractTerms(entry zig.BookEntry, sequence int) []dbTerm
extractKanji(entry zig.BookEntry) []dbKanji
// getFontNarrow and getFontWide return lookup tables keyed by an integer
// code. NOTE(review): presumably these resolve the {{n_<code>}} and
// {{w_<code>}} placeholders matched in epwingExportDb — confirm.
getFontNarrow() map[int]string
getFontWide() map[int]string
// getRevision returns a revision string for the extractor.
getRevision() string
}
func epwingExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
stat, err := os.Stat(inputPath)
book, err := zig.Load(inputPath)
if err != nil {
return err
}
var toolExec bool
if stat.IsDir() {
toolExec = true
} else if filepath.Base(inputPath) == "CATALOGS" {
inputPath = filepath.Dir(inputPath)
toolExec = true
}
var data []byte
if toolExec {
exePath, err := os.Executable()
if err != nil {
return err
}
toolPath := filepath.Join("bin", runtime.GOOS, "zero-epwing")
if runtime.GOOS == "windows" {
toolPath += ".exe"
}
toolPath = filepath.Join(filepath.Dir(exePath), toolPath)
if _, err = os.Stat(toolPath); err != nil {
return fmt.Errorf("failed to find zero-epwing in '%s'", toolPath)
}
cmd := exec.Command(toolPath, "--entries", inputPath)
stdout, err := cmd.StdoutPipe()
if err != nil {
return err
}
stderr, err := cmd.StderrPipe()
if err != nil {
return err
}
log.Printf("invoking zero-epwing from '%s'...\n", toolPath)
if err := cmd.Start(); err != nil {
return err
}
go func() {
scanner := bufio.NewScanner(stderr)
for scanner.Scan() {
log.Printf("\t > %s\n", scanner.Text())
}
}()
if data, err = ioutil.ReadAll(stdout); err != nil {
return err
}
if err := cmd.Wait(); err != nil {
return err
}
log.Println("completed zero-epwing processing")
} else {
data, err = ioutil.ReadFile(inputPath)
}
if err != nil {
return err
}
var book epwingBook
if err := json.Unmarshal(data, &book); err != nil {
return err
}
translateExp := regexp.MustCompile(`{{([nw])_(\d+)}}`)
epwingExtractors := map[string]epwingExtractor{
"三省堂 スーパー大辞林": makeDaijirinExtractor(),
@ -160,12 +66,11 @@ func epwingExportDb(inputPath, outputPath, language, title string, stride int, p
kanji dbKanjiList
revisions []string
titles []string
sequence int
)
log.Println("formatting dictionary data...")
var sequence int
for _, subbook := range book.Subbooks {
if extractor, ok := epwingExtractors[subbook.Title]; ok {
fontNarrow := extractor.getFontNarrow()

View File

@ -25,6 +25,8 @@ package main
import (
"regexp"
"strings"
"github.com/FooSoft/zero-epwing-go/zig"
)
type gakkenExtractor struct {
@ -69,7 +71,7 @@ var cosmetics = strings.NewReplacer("(1)", "①", "(2)", "②", "(3)", "③", "(
"セ゛", "ゼ",
"ソ゛", "ゾ")
func (e *gakkenExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
func (e *gakkenExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm {
matches := e.partsExp.FindStringSubmatch(entry.Heading)
if matches == nil {
return nil
@ -140,7 +142,7 @@ func (e *gakkenExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm
return terms
}
func (*gakkenExtractor) extractKanji(entry epwingEntry) []dbKanji {
func (*gakkenExtractor) extractKanji(entry zig.BookEntry) []dbKanji {
return nil
}

9
go.mod
View File

@ -1,9 +0,0 @@
module github.com/FooSoft/yomichan-import
go 1.13
require (
github.com/FooSoft/jmdict v0.0.0-20190926045629-808d66c7b050
github.com/andlabs/ui v0.0.0-20180902183112-867a9e5a498d
github.com/mattn/go-sqlite3 v2.0.2+incompatible
)

6
go.sum
View File

@ -1,6 +0,0 @@
github.com/FooSoft/jmdict v0.0.0-20190926045629-808d66c7b050 h1:3VWk8B61jEgfcV+pEgzWO1j7TVXC9g5QaS0J06994Zc=
github.com/FooSoft/jmdict v0.0.0-20190926045629-808d66c7b050/go.mod h1:zo92ezZlNld5cN1iuS0QRAmSsHcpvcqGZLVNKPM4Hlg=
github.com/andlabs/ui v0.0.0-20180902183112-867a9e5a498d h1:4ianvxb8s3oyizgjuWWxGuTAUU+6JStcvj6BuHS4PVY=
github.com/andlabs/ui v0.0.0-20180902183112-867a9e5a498d/go.mod h1:5G2EjwzgZUPnnReoKvPWVneT8APYbyKkihDVAHUi0II=
github.com/mattn/go-sqlite3 v2.0.2+incompatible h1:qzw9c2GNT8UFrgWNDhCTqRqYUSmu/Dav/9Z58LGpk7U=
github.com/mattn/go-sqlite3 v2.0.2+incompatible/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc=

2
gui.go
View File

@ -116,7 +116,7 @@ func gui() error {
return
}
format, err := detectFormat(inputPath)
format, err := detectFormat(&inputPath)
if err != nil {
ui.MsgBoxError(window, "Error", "Unable to detect dictionary format")
importButton.Enable()

View File

@ -25,6 +25,8 @@ package main
import (
"regexp"
"strings"
"github.com/FooSoft/zero-epwing-go/zig"
)
type kotowazaExtractor struct {
@ -43,7 +45,7 @@ func makeKotowazaExtractor() epwingExtractor {
}
}
func (e *kotowazaExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
func (e *kotowazaExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm {
heading := entry.Heading
queue := []string{heading}
@ -104,7 +106,7 @@ func (e *kotowazaExtractor) extractTerms(entry epwingEntry, sequence int) []dbTe
return terms
}
func (e *kotowazaExtractor) extractKanji(entry epwingEntry) []dbKanji {
func (e *kotowazaExtractor) extractKanji(entry zig.BookEntry) []dbKanji {
return nil
}

View File

@ -25,6 +25,8 @@ package main
import (
"regexp"
"strings"
"github.com/FooSoft/zero-epwing-go/zig"
)
type koujienExtractor struct {
@ -57,7 +59,7 @@ func makeFuzokuExtractor() epwingExtractor {
}
}
func (e *koujienExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
func (e *koujienExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm {
matches := e.partsExp.FindStringSubmatch(entry.Heading)
if matches == nil {
return nil
@ -122,7 +124,7 @@ func (e *koujienExtractor) extractTerms(entry epwingEntry, sequence int) []dbTer
return terms
}
func (*koujienExtractor) extractKanji(entry epwingEntry) []dbKanji {
func (*koujienExtractor) extractKanji(entry zig.BookEntry) []dbKanji {
return nil
}

View File

@ -107,7 +107,7 @@ func main() {
if *format == "" {
var err error
if *format, err = detectFormat(inputPath); err != nil {
if *format, err = detectFormat(&inputPath); err != nil {
log.Fatal(err)
}
}

View File

@ -25,6 +25,8 @@ package main
import (
"regexp"
"strings"
"github.com/FooSoft/zero-epwing-go/zig"
)
type meikyouExtractor struct {
@ -77,7 +79,7 @@ func makeMeikyouExtractor() epwingExtractor {
}
}
func (e *meikyouExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
func (e *meikyouExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm {
matches := e.partsExp.FindStringSubmatch(entry.Heading)
if matches == nil {
return nil
@ -153,7 +155,7 @@ func (e *meikyouExtractor) extractTerms(entry epwingEntry, sequence int) []dbTer
return terms
}
func (e *meikyouExtractor) extractKanji(entry epwingEntry) []dbKanji {
func (e *meikyouExtractor) extractKanji(entry zig.BookEntry) []dbKanji {
return nil
}

View File

@ -25,6 +25,8 @@ package main
import (
"regexp"
"strings"
"github.com/FooSoft/zero-epwing-go/zig"
)
type wadaiExtractor struct {
@ -45,7 +47,7 @@ func makeWadaiExtractor() epwingExtractor {
}
}
func (e *wadaiExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
func (e *wadaiExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm {
matches := e.partsExp.FindStringSubmatch(entry.Heading)
if matches == nil {
return nil
@ -104,7 +106,7 @@ func (e *wadaiExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm
return terms
}
func (e *wadaiExtractor) extractKanji(entry epwingEntry) []dbKanji {
func (e *wadaiExtractor) extractKanji(entry zig.BookEntry) []dbKanji {
return nil
}