WIP
This commit is contained in:
parent
63c8268fcc
commit
8faa2c3354
92
epwing.go
92
epwing.go
@ -24,9 +24,11 @@ package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type epwingEntry struct {
|
||||
@ -46,36 +48,62 @@ type epwingBook struct {
|
||||
Subbooks []epwingSubbook `json:"subbooks"`
|
||||
}
|
||||
|
||||
// 3934
|
||||
// (?P<kana>[^(【<]+)(?:【(?P<kanji>.*)】)?(?:<\?>(?P<native>.*)<\?>)?(?:((?P<tag>.*)))?
|
||||
// (?P<kana>[^(【〖]+)(?:【(?P<expression>.*)】)?(?:〖(?P<native>.*)〗)?(?:((?P<tag>.*)))?
|
||||
// "heading": "きれ‐あが・る【切れ上がる】",
|
||||
// "text": "きれ‐あが・る【切れ上がる】\n[動ラ五(四)]上の方まで切れる。また、目尻や額の生え際などが上の方へ上がっている。「―・った目元」\n"
|
||||
// },
|
||||
// {
|
||||
// "heading": "きれ‐あじ【切れ味】‐あぢ",
|
||||
// "text": "きれ‐あじ【切れ味】‐あぢ\n<?>刃物の切れぐあい。「―のいいナイフ」<?>才能・技などの鋭さ。「鋭い―の批評」「―のいいショット」\n"
|
||||
// },
|
||||
// {
|
||||
// "heading": "き‐れい【×綺麗・奇麗】",
|
||||
// "text": "き‐れい【×綺麗・奇麗】\n[形動]<?>[ナリ]<?>色・形などが華やかな美しさをもっているさま。「―な花」「―に着飾る」<?>姿・顔かたちが整っていて美しいさま。「―な脚」「―な女性」<?>声などが快く聞こえるさま。「―な発音」<?>よごれがなく清潔なさま。「手を―に洗う」「―な空気」「―な選挙」<?>男女間に肉体的な交渉がないさま。清純。「―な関係」<?>乱れたところがないさま。整然としているさま。「机の上を―に片づける」<?>(「きれいに」の形で)残りなく物事が行われるさま。すっかり。「―に忘れる」「―にたいらげる」→美しい[用法]\n[派生]きれいさ[名]\n[類語](<?>)美しい・美美(びび)しい・煌(きら)やか・鮮やか・美麗・華麗・華美・鮮麗・流麗・優美・美的/(<?>)麗(うるわ)しい・見目よい・端整・端麗・秀麗・佳麗(かれい)・艶美(えんび)・艶麗(えんれい)・あでやか/(<?>)清い・清らか・清潔・清浄(せいじよう・しようじよう)・清澄・清冽(せいれつ)・無垢(むく)・純潔・潔白(けつぱく)\n"
|
||||
// },
|
||||
type epwingExtractor interface {
|
||||
extractTerms(entry epwingEntry) []dbTerm
|
||||
extractKanji(entry epwingEntry) []dbKanji
|
||||
}
|
||||
|
||||
func extractDaijirinTerms(entry epwingEntry) []dbTerm {
|
||||
exp := regexp.MustCompile(`(?P<kana>[^(【〖]+)(?:【(?P<expression>.*)】)?(?:〖(?P<native>.*)〗)?(?:((?P<tag>.*)))?`)
|
||||
matches := exp.FindStringSubmatch(entry.Heading)
|
||||
type daijirinExtractor struct {
|
||||
partsExp *regexp.Regexp
|
||||
phonExp *regexp.Regexp
|
||||
variantExp *regexp.Regexp
|
||||
annotExp *regexp.Regexp
|
||||
}
|
||||
|
||||
results := make(map[string]string)
|
||||
for i, name := range exp.SubexpNames() {
|
||||
if i > 0 {
|
||||
results[name] = matches[i]
|
||||
func makeDaijirinExtractor() epwingExtractor {
|
||||
return &daijirinExtractor{
|
||||
partsExp: regexp.MustCompile(`(?P<reading>[^(【〖]+)(?:【(?P<expression>.*)】)?(?:〖(?P<native>.*)〗)?(?:((?P<tag>.*)))?`),
|
||||
phonExp: regexp.MustCompile(`[-・]+`),
|
||||
variantExp: regexp.MustCompile(`\((.*)\)`),
|
||||
annotExp: regexp.MustCompile(`(.*)`),
|
||||
}
|
||||
}
|
||||
|
||||
func (e *daijirinExtractor) extractTerms(entry epwingEntry) []dbTerm {
|
||||
var readings []string
|
||||
var expressions []string
|
||||
|
||||
matches := e.partsExp.FindStringSubmatch(entry.Heading)
|
||||
for i, name := range e.partsExp.SubexpNames() {
|
||||
value := matches[i]
|
||||
if i == 0 || len(value) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
switch name {
|
||||
case "expression":
|
||||
expression := e.annotExp.ReplaceAllLiteralString(value, "")
|
||||
for _, split := range strings.Split(expression, `・`) {
|
||||
splitInc := e.variantExp.ReplaceAllString(split, "$1")
|
||||
expressions = append(expressions, splitInc)
|
||||
if split != splitInc {
|
||||
splitExc := e.variantExp.ReplaceAllLiteralString(split, "")
|
||||
expressions = append(expressions, splitExc)
|
||||
}
|
||||
}
|
||||
case "reading":
|
||||
reading := e.phonExp.ReplaceAllLiteralString(value, "")
|
||||
readings = append(readings, reading)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf("%q\n", expressions)
|
||||
fmt.Printf("%q\n", readings)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func extractDaijirinKanji(entry epwingEntry) []dbKanji {
|
||||
func (e *daijirinExtractor) extractKanji(entry epwingEntry) []dbKanji {
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -90,26 +118,18 @@ func exportEpwingDb(outputDir, title string, reader io.Reader, flags int) error
|
||||
return err
|
||||
}
|
||||
|
||||
termExtractors := map[string]func(epwingEntry) []dbTerm{
|
||||
"三省堂 スーパー大辞林": extractDaijirinTerms,
|
||||
epwingExtractors := map[string]epwingExtractor{
|
||||
"三省堂 スーパー大辞林": makeDaijirinExtractor(),
|
||||
}
|
||||
|
||||
var terms dbTermList
|
||||
for _, subbook := range book.Subbooks {
|
||||
if extractor, ok := termExtractors[subbook.Title]; ok {
|
||||
for _, entry := range subbook.Entries {
|
||||
terms = append(terms, extractor(entry)...)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kanjiExtractors := map[string]func(epwingEntry) []dbKanji{}
|
||||
|
||||
var kanji dbKanjiList
|
||||
|
||||
for _, subbook := range book.Subbooks {
|
||||
if extractor, ok := kanjiExtractors[subbook.Title]; ok {
|
||||
if extractor, ok := epwingExtractors[subbook.Title]; ok {
|
||||
for _, entry := range subbook.Entries {
|
||||
kanji = append(kanji, extractor(entry)...)
|
||||
terms = append(terms, extractor.extractTerms(entry)...)
|
||||
kanji = append(kanji, extractor.extractKanji(entry)...)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user