1
yomichan-import/rikai.go

297 lines
4.4 KiB
Go
Raw Normal View History

2021-01-01 22:31:58 +00:00
package yomichan
2017-08-19 19:17:18 +00:00
import (
2017-08-19 19:17:31 +00:00
"database/sql"
2017-08-19 21:13:51 +00:00
"regexp"
2017-08-19 19:17:18 +00:00
"strings"
2017-08-19 19:17:31 +00:00
_ "github.com/mattn/go-sqlite3"
2017-08-19 19:17:18 +00:00
)
2017-08-19 21:13:51 +00:00
type rikaiEntry struct {
kanji string
kana string
entry string
}
2017-08-19 19:17:18 +00:00
func rikaiBuildRules(term *dbTerm) {
for _, tag := range term.DefinitionTags {
2017-08-19 19:17:18 +00:00
switch tag {
2020-12-06 04:48:34 +00:00
case "adj-i", "v1", "vk", "vz":
2017-08-19 19:17:18 +00:00
term.addRules(tag)
default:
if strings.HasPrefix(tag, "v5") {
term.addRules("v5")
} else if strings.HasPrefix(tag, "vs") {
term.addRules("vs")
}
}
}
}
func rikaiBuildScore(term *dbTerm) {
for _, tag := range term.DefinitionTags {
2017-08-19 19:17:18 +00:00
switch tag {
case "news", "ichi", "spec", "gai":
2017-09-10 20:37:40 +00:00
term.Score++
case "arch", "iK":
term.Score--
2017-08-19 19:17:18 +00:00
case "P":
term.Score += 5
}
}
}
2017-08-19 21:13:51 +00:00
func rikaiExtractTerms(rows *sql.Rows) (dbTermList, error) {
var terms dbTermList
dfnExp := regexp.MustCompile(`^(?:\(KC\) )?((?:\((?:[\w\-\,\:]*)*\)\s*)*)(.*)$`)
readExp := regexp.MustCompile(`\[([^\]]+)\]`)
tagExp := regexp.MustCompile(`[\s\(\),]`)
var sequence int
2017-08-19 21:13:51 +00:00
for rows.Next() {
var (
kanji, kana, entry *string
)
if err := rows.Scan(&kanji, &kana, &entry); err != nil {
return nil, err
}
if entry == nil {
continue
}
segments := strings.Split(*entry, "/")
indexFirst := 0
if !strings.HasPrefix(segments[0], "") && !strings.HasPrefix(segments[0], "(") {
expParts := strings.Split(segments[0], " ")
if len(expParts) > 1 {
if readMatch := readExp.FindStringSubmatch(expParts[1]); readMatch != nil {
kana = &readMatch[1]
}
}
indexFirst = 1
if indexFirst == len(segments) {
continue
}
}
var term dbTerm
term.Sequence = sequence
2017-08-19 21:13:51 +00:00
if kana != nil {
term.Expression = *kana
term.Reading = *kana
}
if kanji != nil {
term.Expression = *kanji
}
for i := indexFirst; i < len(segments); i++ {
segment := segments[i]
if dfnMatch := dfnExp.FindStringSubmatch(segment); dfnMatch != nil {
for _, tag := range tagExp.Split(dfnMatch[1], -1) {
if rikaiTagParsed(tag) {
term.addDefinitionTags(tag)
2017-08-19 21:13:51 +00:00
}
}
if len(dfnMatch[2]) > 0 {
term.Glossary = append(term.Glossary, dfnMatch[2])
}
}
}
rikaiBuildRules(&term)
rikaiBuildScore(&term)
terms = append(terms, term)
sequence++
2017-08-19 21:13:51 +00:00
}
return terms, nil
}
func rikaiExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
2017-08-19 19:17:31 +00:00
db, err := sql.Open("sqlite3", inputPath)
2017-08-19 19:17:18 +00:00
if err != nil {
return err
}
2017-08-19 19:17:31 +00:00
defer db.Close()
2017-08-19 19:17:18 +00:00
2017-08-19 19:17:31 +00:00
dictRows, err := db.Query("SELECT kanji, kana, entry FROM dict")
2017-08-19 19:17:18 +00:00
if err != nil {
return err
}
2017-08-19 21:13:51 +00:00
terms, err := rikaiExtractTerms(dictRows)
if err != nil {
return err
2017-08-19 19:17:18 +00:00
}
if title == "" {
2017-08-19 19:17:31 +00:00
title = "Rikai"
2017-08-19 19:17:18 +00:00
}
2017-09-10 20:25:11 +00:00
tags := dbTagList{
dbTag{Name: "P", Category: "popular", Order: -10},
dbTag{Name: "exp", Category: "expression", Order: -5},
dbTag{Name: "id", Category: "expression", Order: -5},
dbTag{Name: "arch", Category: "archaism", Order: -4},
dbTag{Name: "iK", Category: "archaism", Order: -4},
}
recordData := map[string]dbRecordList{
2017-09-10 20:45:06 +00:00
"term": terms.crush(),
"tag": tags.crush(),
2017-08-19 21:13:51 +00:00
}
index := dbIndex{
Title: title,
Revision: "rikai2",
Sequenced: true,
Description: "",
Attribution: "",
}
index.setDefaults()
2017-08-19 21:13:51 +00:00
return writeDb(
outputPath,
index,
2017-09-10 20:25:11 +00:00
recordData,
2017-08-19 21:13:51 +00:00
stride,
pretty,
)
}
func rikaiTagParsed(tag string) bool {
tags := []string{
"Buddh",
"MA",
"X",
"abbr",
"adj",
"adj-f",
"adj-i",
"adj-na",
"adj-no",
"adj-pn",
"adj-t",
"adv",
"adv-n",
"adv-to",
"arch",
"ateji",
"aux",
"aux-adj",
"aux-v",
"c",
"chn",
"col",
"comp",
"conj",
"ctr",
"derog",
"eK",
"ek",
"exp",
"f",
"fam",
"fem",
"food",
"g",
"geom",
"gikun",
"gram",
"h",
"hon",
"hum",
"iK",
"id",
"ik",
"int",
"io",
"iv",
"ling",
"m",
"m-sl",
"male",
"male-sl",
"math",
"mil",
"n",
"n-adv",
"n-pref",
"n-suf",
"n-t",
"num",
"oK",
"obs",
"obsc",
"ok",
"on-mim",
"P",
"p",
"physics",
"pn",
"poet",
"pol",
"pr",
"pref",
"prt",
"rare",
"s",
"sens",
"sl",
"st",
"suf",
"u",
"uK",
"uk",
"v1",
"v2a-s",
"v4h",
"v4r",
"v5",
"v5aru",
"v5b",
"v5g",
"v5k",
"v5k-s",
"v5m",
"v5n",
"v5r",
"v5r-i",
"v5s",
"v5t",
"v5u",
"v5u-s",
"v5uru",
"v5z",
"vi",
"vk",
"vn",
"vs",
"vs-c",
"vs-i",
"vs-s",
"vt",
"vulg",
"vz",
}
for _, curr := range tags {
if curr == tag {
return true
}
}
return false
2017-08-19 19:17:18 +00:00
}