WIP
This commit is contained in:
parent
63c8268fcc
commit
8faa2c3354
92
epwing.go
92
epwing.go
@ -24,9 +24,11 @@ package main
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
type epwingEntry struct {
|
type epwingEntry struct {
|
||||||
@ -46,36 +48,62 @@ type epwingBook struct {
|
|||||||
Subbooks []epwingSubbook `json:"subbooks"`
|
Subbooks []epwingSubbook `json:"subbooks"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3934
|
type epwingExtractor interface {
|
||||||
// (?P<kana>[^(【<]+)(?:【(?P<kanji>.*)】)?(?:<\?>(?P<native>.*)<\?>)?(?:((?P<tag>.*)))?
|
extractTerms(entry epwingEntry) []dbTerm
|
||||||
// (?P<kana>[^(【〖]+)(?:【(?P<expression>.*)】)?(?:〖(?P<native>.*)〗)?(?:((?P<tag>.*)))?
|
extractKanji(entry epwingEntry) []dbKanji
|
||||||
// "heading": "きれ‐あが・る【切れ上がる】",
|
}
|
||||||
// "text": "きれ‐あが・る【切れ上がる】\n[動ラ五(四)]上の方まで切れる。また、目尻や額の生え際などが上の方へ上がっている。「―・った目元」\n"
|
|
||||||
// },
|
|
||||||
// {
|
|
||||||
// "heading": "きれ‐あじ【切れ味】‐あぢ",
|
|
||||||
// "text": "きれ‐あじ【切れ味】‐あぢ\n<?>刃物の切れぐあい。「―のいいナイフ」<?>才能・技などの鋭さ。「鋭い―の批評」「―のいいショット」\n"
|
|
||||||
// },
|
|
||||||
// {
|
|
||||||
// "heading": "き‐れい【×綺麗・奇麗】",
|
|
||||||
// "text": "き‐れい【×綺麗・奇麗】\n[形動]<?>[ナリ]<?>色・形などが華やかな美しさをもっているさま。「―な花」「―に着飾る」<?>姿・顔かたちが整っていて美しいさま。「―な脚」「―な女性」<?>声などが快く聞こえるさま。「―な発音」<?>よごれがなく清潔なさま。「手を―に洗う」「―な空気」「―な選挙」<?>男女間に肉体的な交渉がないさま。清純。「―な関係」<?>乱れたところがないさま。整然としているさま。「机の上を―に片づける」<?>(「きれいに」の形で)残りなく物事が行われるさま。すっかり。「―に忘れる」「―にたいらげる」→美しい[用法]\n[派生]きれいさ[名]\n[類語](<?>)美しい・美美(びび)しい・煌(きら)やか・鮮やか・美麗・華麗・華美・鮮麗・流麗・優美・美的/(<?>)麗(うるわ)しい・見目よい・端整・端麗・秀麗・佳麗(かれい)・艶美(えんび)・艶麗(えんれい)・あでやか/(<?>)清い・清らか・清潔・清浄(せいじよう・しようじよう)・清澄・清冽(せいれつ)・無垢(むく)・純潔・潔白(けつぱく)\n"
|
|
||||||
// },
|
|
||||||
|
|
||||||
func extractDaijirinTerms(entry epwingEntry) []dbTerm {
|
type daijirinExtractor struct {
|
||||||
exp := regexp.MustCompile(`(?P<kana>[^(【〖]+)(?:【(?P<expression>.*)】)?(?:〖(?P<native>.*)〗)?(?:((?P<tag>.*)))?`)
|
partsExp *regexp.Regexp
|
||||||
matches := exp.FindStringSubmatch(entry.Heading)
|
phonExp *regexp.Regexp
|
||||||
|
variantExp *regexp.Regexp
|
||||||
|
annotExp *regexp.Regexp
|
||||||
|
}
|
||||||
|
|
||||||
results := make(map[string]string)
|
func makeDaijirinExtractor() epwingExtractor {
|
||||||
for i, name := range exp.SubexpNames() {
|
return &daijirinExtractor{
|
||||||
if i > 0 {
|
partsExp: regexp.MustCompile(`(?P<reading>[^(【〖]+)(?:【(?P<expression>.*)】)?(?:〖(?P<native>.*)〗)?(?:((?P<tag>.*)))?`),
|
||||||
results[name] = matches[i]
|
phonExp: regexp.MustCompile(`[-・]+`),
|
||||||
|
variantExp: regexp.MustCompile(`\((.*)\)`),
|
||||||
|
annotExp: regexp.MustCompile(`(.*)`),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *daijirinExtractor) extractTerms(entry epwingEntry) []dbTerm {
|
||||||
|
var readings []string
|
||||||
|
var expressions []string
|
||||||
|
|
||||||
|
matches := e.partsExp.FindStringSubmatch(entry.Heading)
|
||||||
|
for i, name := range e.partsExp.SubexpNames() {
|
||||||
|
value := matches[i]
|
||||||
|
if i == 0 || len(value) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
switch name {
|
||||||
|
case "expression":
|
||||||
|
expression := e.annotExp.ReplaceAllLiteralString(value, "")
|
||||||
|
for _, split := range strings.Split(expression, `・`) {
|
||||||
|
splitInc := e.variantExp.ReplaceAllString(split, "$1")
|
||||||
|
expressions = append(expressions, splitInc)
|
||||||
|
if split != splitInc {
|
||||||
|
splitExc := e.variantExp.ReplaceAllLiteralString(split, "")
|
||||||
|
expressions = append(expressions, splitExc)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
case "reading":
|
||||||
|
reading := e.phonExp.ReplaceAllLiteralString(value, "")
|
||||||
|
readings = append(readings, reading)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf("%q\n", expressions)
|
||||||
|
fmt.Printf("%q\n", readings)
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractDaijirinKanji(entry epwingEntry) []dbKanji {
|
func (e *daijirinExtractor) extractKanji(entry epwingEntry) []dbKanji {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -90,26 +118,18 @@ func exportEpwingDb(outputDir, title string, reader io.Reader, flags int) error
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
termExtractors := map[string]func(epwingEntry) []dbTerm{
|
epwingExtractors := map[string]epwingExtractor{
|
||||||
"三省堂 スーパー大辞林": extractDaijirinTerms,
|
"三省堂 スーパー大辞林": makeDaijirinExtractor(),
|
||||||
}
|
}
|
||||||
|
|
||||||
var terms dbTermList
|
var terms dbTermList
|
||||||
for _, subbook := range book.Subbooks {
|
|
||||||
if extractor, ok := termExtractors[subbook.Title]; ok {
|
|
||||||
for _, entry := range subbook.Entries {
|
|
||||||
terms = append(terms, extractor(entry)...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
kanjiExtractors := map[string]func(epwingEntry) []dbKanji{}
|
|
||||||
|
|
||||||
var kanji dbKanjiList
|
var kanji dbKanjiList
|
||||||
|
|
||||||
for _, subbook := range book.Subbooks {
|
for _, subbook := range book.Subbooks {
|
||||||
if extractor, ok := kanjiExtractors[subbook.Title]; ok {
|
if extractor, ok := epwingExtractors[subbook.Title]; ok {
|
||||||
for _, entry := range subbook.Entries {
|
for _, entry := range subbook.Entries {
|
||||||
kanji = append(kanji, extractor(entry)...)
|
terms = append(terms, extractor.extractTerms(entry)...)
|
||||||
|
kanji = append(kanji, extractor.extractKanji(entry)...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user