2023-01-22 20:37:18 +00:00
|
|
|
|
package yomichan
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"os"
|
|
|
|
|
"strings"
|
|
|
|
|
|
|
|
|
|
"foosoft.net/projects/jmdict"
|
|
|
|
|
"golang.org/x/exp/slices"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// kata2hira returns a copy of word in which every rune in the ranges
// 'ァ'..'ヶ' and 'ヽ'..'ヾ' is shifted down by 0x60 code points, which maps
// katakana (and the katakana iteration marks) onto the matching hiragana.
// Every other rune, including the prolonged sound mark 'ー', is kept as is.
func kata2hira(word string) string {
	// Distance between a katakana code point and its hiragana equivalent.
	const kanaOffset = 0x60

	return strings.Map(func(r rune) rune {
		switch {
		case r >= 'ァ' && r <= 'ヶ', r >= 'ヽ' && r <= 'ヾ':
			return r - kanaOffset
		default:
			return r
		}
	}, word)
}
|
|
|
|
|
|
|
|
|
|
func (h *headword) InfoSymbols() string {
|
|
|
|
|
infoSymbols := []string{}
|
|
|
|
|
if h.IsPriority {
|
|
|
|
|
infoSymbols = append(infoSymbols, prioritySymbol)
|
|
|
|
|
}
|
|
|
|
|
if h.IsRareKanji {
|
|
|
|
|
infoSymbols = append(infoSymbols, rareKanjiSymbol)
|
|
|
|
|
}
|
|
|
|
|
if h.IsIrregular {
|
|
|
|
|
infoSymbols = append(infoSymbols, irregularSymbol)
|
|
|
|
|
}
|
|
|
|
|
if h.IsOutdated {
|
|
|
|
|
infoSymbols = append(infoSymbols, outdatedSymbol)
|
|
|
|
|
}
|
|
|
|
|
return strings.Join(infoSymbols[:], " | ")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (h *headword) GlossText() string {
|
|
|
|
|
gloss := h.Expression
|
|
|
|
|
if h.IsAteji {
|
|
|
|
|
gloss = "〈" + gloss + "〉"
|
|
|
|
|
}
|
|
|
|
|
symbolText := h.InfoSymbols()
|
|
|
|
|
if symbolText != "" {
|
|
|
|
|
gloss += "(" + symbolText + ")"
|
|
|
|
|
}
|
|
|
|
|
return gloss
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (h *headword) TableColHeaderText() string {
|
|
|
|
|
text := h.KanjiForm()
|
|
|
|
|
if h.IsAteji {
|
|
|
|
|
text = "〈" + text + "〉"
|
|
|
|
|
}
|
|
|
|
|
return text
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (h *headword) TableRowHeaderText() string {
|
|
|
|
|
text := h.Reading
|
|
|
|
|
if h.IsGikun {
|
|
|
|
|
text = "〈" + text + "〉"
|
|
|
|
|
}
|
|
|
|
|
return text
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (h *headword) TableCellText() string {
|
|
|
|
|
text := h.InfoSymbols()
|
|
|
|
|
if text == "" {
|
|
|
|
|
return defaultSymbol
|
|
|
|
|
} else {
|
|
|
|
|
return text
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (h *headword) KanjiForm() string {
|
|
|
|
|
if h.IsKanaOnly() {
|
|
|
|
|
return "∅"
|
|
|
|
|
} else {
|
|
|
|
|
return h.Expression
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-01-24 05:52:42 +00:00
|
|
|
|
func needsFormTable(headwords []headword) bool {
|
2023-01-22 20:37:18 +00:00
|
|
|
|
// Does the entry contain more than 1 distinct reading?
|
|
|
|
|
// E.g. バカがい and ばかがい are not distinct.
|
|
|
|
|
uniqueReading := ""
|
|
|
|
|
for _, h := range headwords {
|
|
|
|
|
if h.IsGikun {
|
|
|
|
|
return true
|
|
|
|
|
} else if h.IsSearchOnly {
|
|
|
|
|
continue
|
|
|
|
|
} else if h.IsKanaOnly() {
|
|
|
|
|
continue
|
|
|
|
|
} else if uniqueReading == "" {
|
|
|
|
|
uniqueReading = kata2hira(h.Reading)
|
|
|
|
|
} else if uniqueReading != kata2hira(h.Reading) {
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// formTableData holds everything needed to render the forms table: one
// column per kanji form and one row per reading.
type formTableData struct {
	// kanjiForms are the column keys and readings are the row keys.
	kanjiForms, readings []string

	// colHeaderText is keyed by kanji form; rowHeaderText is keyed by reading.
	colHeaderText, rowHeaderText map[string]string

	// cellText is indexed as cellText[reading][kanjiForm].
	cellText map[string]map[string]string
}
|
|
|
|
|
|
|
|
|
|
func tableData(headwords []headword) formTableData {
|
|
|
|
|
d := formTableData{
|
|
|
|
|
kanjiForms: []string{},
|
|
|
|
|
readings: []string{},
|
|
|
|
|
colHeaderText: make(map[string]string),
|
|
|
|
|
rowHeaderText: make(map[string]string),
|
|
|
|
|
cellText: make(map[string]map[string]string),
|
|
|
|
|
}
|
|
|
|
|
for _, h := range headwords {
|
|
|
|
|
if h.IsSearchOnly {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
kanjiForm := h.KanjiForm()
|
|
|
|
|
if !slices.Contains(d.kanjiForms, kanjiForm) {
|
|
|
|
|
d.kanjiForms = append(d.kanjiForms, kanjiForm)
|
|
|
|
|
d.colHeaderText[kanjiForm] = h.TableColHeaderText()
|
|
|
|
|
}
|
|
|
|
|
reading := h.Reading
|
|
|
|
|
if !slices.Contains(d.readings, reading) {
|
|
|
|
|
d.readings = append(d.readings, reading)
|
|
|
|
|
d.rowHeaderText[reading] = h.TableRowHeaderText()
|
|
|
|
|
d.cellText[reading] = make(map[string]string)
|
|
|
|
|
}
|
|
|
|
|
d.cellText[reading][kanjiForm] = h.TableCellText()
|
|
|
|
|
}
|
|
|
|
|
return d
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func formsTableGlossary(headwords []headword) []any {
|
|
|
|
|
d := tableData(headwords)
|
|
|
|
|
|
|
|
|
|
attr := contentAttr{}
|
|
|
|
|
centeredAttr := contentAttr{textAlign: "center"}
|
|
|
|
|
leftAttr := contentAttr{textAlign: "left"}
|
|
|
|
|
|
|
|
|
|
cornerCell := contentTableHeadCell(attr, "") // empty cell in upper left corner
|
|
|
|
|
headRowCells := []any{cornerCell}
|
|
|
|
|
for _, kanjiForm := range d.kanjiForms {
|
|
|
|
|
content := d.colHeaderText[kanjiForm]
|
|
|
|
|
cell := contentTableHeadCell(centeredAttr, content)
|
|
|
|
|
headRowCells = append(headRowCells, cell)
|
|
|
|
|
}
|
|
|
|
|
headRow := contentTableRow(attr, headRowCells...)
|
|
|
|
|
tableRows := []any{headRow}
|
|
|
|
|
for _, reading := range d.readings {
|
|
|
|
|
rowHeadCellText := d.rowHeaderText[reading]
|
|
|
|
|
rowHeadCell := contentTableHeadCell(leftAttr, rowHeadCellText)
|
|
|
|
|
rowCells := []any{rowHeadCell}
|
|
|
|
|
for _, kanjiForm := range d.kanjiForms {
|
|
|
|
|
text := d.cellText[reading][kanjiForm]
|
|
|
|
|
rowCell := contentTableCell(centeredAttr, text)
|
|
|
|
|
rowCells = append(rowCells, rowCell)
|
|
|
|
|
}
|
|
|
|
|
tableRow := contentTableRow(attr, rowCells...)
|
|
|
|
|
tableRows = append(tableRows, tableRow)
|
|
|
|
|
}
|
|
|
|
|
tableAttr := contentAttr{data: map[string]string{"content": "formsTable"}}
|
|
|
|
|
contentTable := contentTable(tableAttr, tableRows...)
|
|
|
|
|
content := contentStructure(contentTable)
|
|
|
|
|
return []any{content}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func formsGlossary(headwords []headword) []any {
|
|
|
|
|
glossary := []any{}
|
|
|
|
|
for _, h := range headwords {
|
|
|
|
|
if h.IsSearchOnly {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
text := h.GlossText()
|
|
|
|
|
glossary = append(glossary, text)
|
|
|
|
|
}
|
|
|
|
|
return glossary
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func baseFormsTerm(entry jmdict.JmdictEntry) dbTerm {
|
|
|
|
|
term := dbTerm{Sequence: entry.Sequence}
|
|
|
|
|
headwords := extractHeadwords(entry)
|
2023-01-24 05:52:42 +00:00
|
|
|
|
if needsFormTable(headwords) {
|
2023-01-22 20:37:18 +00:00
|
|
|
|
term.Glossary = formsTableGlossary(headwords)
|
|
|
|
|
} else {
|
|
|
|
|
term.Glossary = formsGlossary(headwords)
|
|
|
|
|
}
|
|
|
|
|
for _, sense := range entry.Sense {
|
|
|
|
|
rules := grammarRules(sense.PartsOfSpeech)
|
|
|
|
|
term.addRules(rules...)
|
|
|
|
|
}
|
|
|
|
|
return term
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func formsExportDb(inputPath, outputPath, languageName, title string, stride int, pretty bool) error {
|
|
|
|
|
reader, err := os.Open(inputPath)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
defer reader.Close()
|
|
|
|
|
|
2023-01-24 19:02:50 +00:00
|
|
|
|
dictionary, entities, err := jmdict.LoadJmdictNoTransform(reader)
|
2023-01-22 20:37:18 +00:00
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
|
2023-01-26 00:26:47 +00:00
|
|
|
|
meta := newJmdictMetadata(dictionary, "english")
|
2023-01-22 23:55:27 +00:00
|
|
|
|
|
2023-01-22 20:37:18 +00:00
|
|
|
|
terms := dbTermList{}
|
|
|
|
|
for _, entry := range dictionary.Entries {
|
|
|
|
|
baseTerm := baseFormsTerm(entry)
|
|
|
|
|
headwords := extractHeadwords(entry)
|
|
|
|
|
for _, h := range headwords {
|
|
|
|
|
if h.IsSearchOnly {
|
2023-01-26 00:26:47 +00:00
|
|
|
|
if term, ok := createSearchTerm(h, entry, meta); ok {
|
|
|
|
|
terms = append(terms, term)
|
|
|
|
|
}
|
|
|
|
|
continue
|
2023-01-22 20:37:18 +00:00
|
|
|
|
}
|
2023-01-26 00:26:47 +00:00
|
|
|
|
term := baseTerm
|
|
|
|
|
term.Expression = h.Expression
|
|
|
|
|
term.Reading = h.Reading
|
|
|
|
|
term.addTermTags(h.TermTags...)
|
2023-01-28 01:09:12 +00:00
|
|
|
|
term.Score = calculateTermScore(1, 0, h)
|
2023-01-22 20:37:18 +00:00
|
|
|
|
terms = append(terms, term)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-01-24 19:02:50 +00:00
|
|
|
|
tags := dbTagList{}
|
|
|
|
|
tags = append(tags, entityTags(entities)...)
|
|
|
|
|
tags = append(tags, newsFrequencyTags()...)
|
|
|
|
|
tags = append(tags, customDbTags()...)
|
|
|
|
|
|
2023-01-22 20:37:18 +00:00
|
|
|
|
if title == "" {
|
|
|
|
|
title = "JMdict Forms"
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
recordData := map[string]dbRecordList{
|
|
|
|
|
"term": terms.crush(),
|
2023-01-24 19:02:50 +00:00
|
|
|
|
"tag": tags.crush(),
|
2023-01-22 20:37:18 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
jmdictDate := jmdictPublicationDate(dictionary)
|
|
|
|
|
|
|
|
|
|
index := dbIndex{
|
|
|
|
|
Title: title,
|
|
|
|
|
Revision: "JMdict." + jmdictDate,
|
|
|
|
|
Sequenced: true,
|
|
|
|
|
Attribution: edrdgAttribution,
|
|
|
|
|
}
|
|
|
|
|
index.setDefaults()
|
|
|
|
|
|
|
|
|
|
return writeDb(
|
|
|
|
|
outputPath,
|
|
|
|
|
index,
|
|
|
|
|
recordData,
|
|
|
|
|
stride,
|
|
|
|
|
pretty,
|
|
|
|
|
)
|
|
|
|
|
}
|