1

add support for rikai formatted dicts

This commit is contained in:
Alex Yatskov 2017-08-19 14:13:51 -07:00
parent d3593004fd
commit da6eeb585c
2 changed files with 228 additions and 20 deletions

View File

@ -234,6 +234,10 @@ func hasString(needle string, haystack []string) bool {
}
func detectFormat(path string) (string, error) {
if filepath.Ext(path) == ".sqlite" {
return "rikai", nil
}
switch filepath.Base(path) {
case "JMdict", "JMdict.xml", "JMdict_e", "JMdict_e.xml":
return "edict", nil

244
rikai.go
View File

@ -24,7 +24,7 @@ package main
import (
"database/sql"
"log"
"regexp"
"strings"
_ "github.com/mattn/go-sqlite3"
@ -32,6 +32,12 @@ import (
// RIKAI_REVISION is the revision string handed to writeDb when a Rikai
// dictionary is exported.
const RIKAI_REVISION = "rikai1"
// rikaiEntry groups the three string values of one dictionary record.
// NOTE(review): presumably these mirror the kanji/kana/entry columns scanned
// from the Rikai database rows; confirm against the query.
type rikaiEntry struct {
	kanji, kana, entry string
}
func rikaiBuildRules(term *dbTerm) {
for _, tag := range term.Tags {
switch tag {
@ -59,7 +65,78 @@ func rikaiBuildScore(term *dbTerm) {
}
}
func rikaiExportDb(inputPath, outputDir, title string, stride int, pretty bool) error {
// rikaiExtractTerms drains rows and turns each record into a dbTerm.
//
// Every row is scanned into three nullable strings (kanji, kana, entry).
// Rows whose entry is NULL are skipped. The entry text is split on "/" and
// each segment is matched against a pattern that separates leading
// parenthesised tag groups from the remaining definition text: tags accepted
// by rikaiTagParsed are added to the term, and non-empty definition text is
// appended to the term's glossary. The expression is the kanji value when
// present, otherwise the kana value; the reading is the kana value when
// present. rikaiBuildRules and rikaiBuildScore are applied to each term
// before it is collected.
//
// It returns the collected terms, or a nil list and the error if scanning a
// row or iterating the result set fails.
func rikaiExtractTerms(rows *sql.Rows) (dbTermList, error) {
	var terms dbTermList

	// Optional "(KC) " marker, then any number of "(tag,tag)" groups, then the
	// definition text.
	dfnExp := regexp.MustCompile(`^(?:\(KC\) )?((?:\((?:[\w\-\,\:]*)*\)\s*)*)(.*)$`)
	// Reading enclosed in square brackets.
	readExp := regexp.MustCompile(`\[([^\]]+)\]`)
	// Separators between individual tags inside the captured tag groups.
	tagExp := regexp.MustCompile(`[\s\(\),]`)

	for rows.Next() {
		var kanji, kana, entry *string
		if err := rows.Scan(&kanji, &kana, &entry); err != nil {
			return nil, err
		}

		if entry == nil {
			continue
		}

		segments := strings.Split(*entry, "/")
		indexFirst := 0

		// NOTE(review): strings.HasPrefix(s, "") is always true, so this
		// condition is always false and the branch below never runs as
		// written. The intended prefix literal may have been lost; confirm
		// against the upstream source before changing it.
		if !strings.HasPrefix(segments[0], "") && !strings.HasPrefix(segments[0], "(") {
			expParts := strings.Split(segments[0], " ")
			if len(expParts) > 1 {
				if readMatch := readExp.FindStringSubmatch(expParts[1]); readMatch != nil {
					kana = &readMatch[1]
				}
			}

			indexFirst = 1
			if indexFirst == len(segments) {
				continue
			}
		}

		var term dbTerm
		if kana != nil {
			term.Expression = *kana
			term.Reading = *kana
		}

		// A kanji form, when available, overrides the kana expression.
		if kanji != nil {
			term.Expression = *kanji
		}

		for i := indexFirst; i < len(segments); i++ {
			segment := segments[i]

			if dfnMatch := dfnExp.FindStringSubmatch(segment); dfnMatch != nil {
				for _, tag := range tagExp.Split(dfnMatch[1], -1) {
					if rikaiTagParsed(tag) {
						term.addTags(tag)
					}
				}

				if len(dfnMatch[2]) > 0 {
					term.Glossary = append(term.Glossary, dfnMatch[2])
				}
			}
		}

		rikaiBuildRules(&term)
		rikaiBuildScore(&term)

		terms = append(terms, term)
	}

	// rows.Next returns false both at end of data and on failure; without this
	// check an iteration error would be reported as a successful, truncated
	// result.
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return terms, nil
}
func rikaiExportDb(inputPath, outputPath, language, title string, stride int, pretty bool) error {
db, err := sql.Open("sqlite3", inputPath)
if err != nil {
return err
@ -71,29 +148,156 @@ func rikaiExportDb(inputPath, outputDir, title string, stride int, pretty bool)
return err
}
for dictRows.Next() {
var kanji, kana, entry *string
if err := dictRows.Scan(&kanji, &kana, &entry); err != nil {
return err
}
log.Print(kanji, kana, entry)
terms, err := rikaiExtractTerms(dictRows)
if err != nil {
return err
}
if title == "" {
title = "Rikai"
}
return nil
entities := map[string]dbTagMeta{
"P": {Category: "popular", Order: -10},
"exp": {Category: "expression", Order: -5},
"id": {Category: "expression", Order: -5},
"arch": {Category: "archaism", Order: 5},
"iK": {Category: "archaism", Order: 5},
}
// return writeDb(
// outputDir,
// title,
// RIKAI_REVISION,
// terms.crush(),
// nil,
// rikaiBuildTagMeta(entities),
// stride,
// pretty,
// )
return writeDb(
outputPath,
title,
RIKAI_REVISION,
terms.crush(),
nil,
entities,
stride,
pretty,
)
}
// rikaiTagParsed reports whether tag is one of the tag names this importer
// recognises. The comparison is exact and case-sensitive (for example "P" and
// "p" are both listed, as separate entries).
func rikaiTagParsed(tag string) bool {
	switch tag {
	case "Buddh", "MA", "X",
		"abbr",
		"adj", "adj-f", "adj-i", "adj-na", "adj-no", "adj-pn", "adj-t",
		"adv", "adv-n", "adv-to",
		"arch", "ateji",
		"aux", "aux-adj", "aux-v",
		"c", "chn", "col", "comp", "conj", "ctr",
		"derog",
		"eK", "ek", "exp",
		"f", "fam", "fem", "food",
		"g", "geom", "gikun", "gram",
		"h", "hon", "hum",
		"iK", "id", "ik", "int", "io", "iv",
		"ling",
		"m", "m-sl", "male", "male-sl", "math", "mil",
		"n", "n-adv", "n-pref", "n-suf", "n-t", "num",
		"oK", "obs", "obsc", "ok", "on-mim",
		"P", "p", "physics", "pn", "poet", "pol", "pr", "pref", "prt",
		"rare",
		"s", "sens", "sl", "st", "suf",
		"u", "uK", "uk",
		"v1", "v2a-s", "v4h", "v4r",
		"v5", "v5aru", "v5b", "v5g", "v5k", "v5k-s", "v5m", "v5n",
		"v5r", "v5r-i", "v5s", "v5t", "v5u", "v5u-s", "v5uru", "v5z",
		"vi", "vk", "vn",
		"vs", "vs-c", "vs-i", "vs-s",
		"vt", "vulg", "vz":
		return true
	default:
		return false
	}
}