1
Fork 0
yomichan-import/jmdict_forms.go

257 lines
5.9 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package yomichan
import (
"os"
"strings"
"git.foosoft.net/alex/jmdict"
"golang.org/x/exp/slices"
)
func (h *headword) InfoSymbols() string {
infoSymbols := []string{}
if h.IsPriority {
infoSymbols = append(infoSymbols, prioritySymbol)
}
if h.IsRareKanji {
infoSymbols = append(infoSymbols, rareKanjiSymbol)
}
if h.IsIrregular {
infoSymbols = append(infoSymbols, irregularSymbol)
}
if h.IsOutdated {
infoSymbols = append(infoSymbols, outdatedSymbol)
}
return strings.Join(infoSymbols[:], " | ")
}
func (h *headword) GlossText() string {
gloss := h.Expression
if h.IsAteji {
gloss = "〈" + gloss + "〉"
}
symbolText := h.InfoSymbols()
if symbolText != "" {
gloss += "" + symbolText + ""
}
return gloss
}
func (h *headword) TableColHeaderText() string {
text := h.KanjiForm()
if h.IsAteji {
text = "〈" + text + "〉"
}
return text
}
func (h *headword) TableRowHeaderText() string {
text := h.Reading
if h.IsGikun {
text = "〈" + text + "〉"
}
return text
}
func (h *headword) TableCellText() string {
text := h.InfoSymbols()
if text == "" {
return defaultSymbol
} else {
return text
}
}
func (h *headword) KanjiForm() string {
if h.IsKanaOnly() {
return "∅"
} else {
return h.Expression
}
}
func needsFormTable(headwords []headword) bool {
// Does the entry contain more than 1 distinct reading?
// E.g. バカがい and ばかがい are not distinct.
uniqueReading := ""
for _, h := range headwords {
if h.IsGikun {
return true
} else if h.IsSearchOnly {
continue
} else if h.IsKanaOnly() {
continue
} else if uniqueReading == "" {
uniqueReading = katakanaToHiragana(h.Reading)
} else if uniqueReading != katakanaToHiragana(h.Reading) {
return true
}
}
return false
}
type formTableData struct {
kanjiForms []string
readings []string
colHeaderText map[string]string
rowHeaderText map[string]string
cellText map[string]map[string]string
}
func tableData(headwords []headword) formTableData {
d := formTableData{
kanjiForms: []string{},
readings: []string{},
colHeaderText: make(map[string]string),
rowHeaderText: make(map[string]string),
cellText: make(map[string]map[string]string),
}
for _, h := range headwords {
if h.IsSearchOnly {
continue
}
kanjiForm := h.KanjiForm()
if !slices.Contains(d.kanjiForms, kanjiForm) {
d.kanjiForms = append(d.kanjiForms, kanjiForm)
d.colHeaderText[kanjiForm] = h.TableColHeaderText()
}
reading := h.Reading
if !slices.Contains(d.readings, reading) {
d.readings = append(d.readings, reading)
d.rowHeaderText[reading] = h.TableRowHeaderText()
d.cellText[reading] = make(map[string]string)
}
d.cellText[reading][kanjiForm] = h.TableCellText()
}
return d
}
func formsTableGlossary(headwords []headword) []any {
d := tableData(headwords)
attr := contentAttr{}
centeredAttr := contentAttr{textAlign: "center"}
leftAttr := contentAttr{textAlign: "left"}
cornerCell := contentTableHeadCell(attr, "") // empty cell in upper left corner
headRowCells := []any{cornerCell}
for _, kanjiForm := range d.kanjiForms {
content := d.colHeaderText[kanjiForm]
cell := contentTableHeadCell(centeredAttr, content)
headRowCells = append(headRowCells, cell)
}
headRow := contentTableRow(attr, headRowCells...)
tableRows := []any{headRow}
for _, reading := range d.readings {
rowHeadCellText := d.rowHeaderText[reading]
rowHeadCell := contentTableHeadCell(leftAttr, rowHeadCellText)
rowCells := []any{rowHeadCell}
for _, kanjiForm := range d.kanjiForms {
text := d.cellText[reading][kanjiForm]
rowCell := contentTableCell(centeredAttr, text)
rowCells = append(rowCells, rowCell)
}
tableRow := contentTableRow(attr, rowCells...)
tableRows = append(tableRows, tableRow)
}
tableAttr := contentAttr{data: map[string]string{"content": "formsTable"}}
contentTable := contentTable(tableAttr, tableRows...)
content := contentStructure(contentTable)
return []any{content}
}
func formsGlossary(headwords []headword) []any {
glossary := []any{}
for _, h := range headwords {
if h.IsSearchOnly {
continue
}
text := h.GlossText()
glossary = append(glossary, text)
}
return glossary
}
func baseFormsTerm(entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm {
term := dbTerm{Sequence: entry.Sequence}
headwords := extractHeadwords(entry)
if needsFormTable(headwords) {
term.Glossary = formsTableGlossary(headwords)
} else {
term.Glossary = formsGlossary(headwords)
}
partsOfSpeech := meta.seqToPartsOfSpeech[entry.Sequence]
rules := grammarRules(partsOfSpeech)
term.addRules(rules...)
return term
}
func formsExportDb(inputPath, outputPath, languageName, title string, stride int, pretty bool) error {
reader, err := os.Open(inputPath)
if err != nil {
return err
}
defer reader.Close()
dictionary, entities, err := jmdict.LoadJmdictNoTransform(reader)
if err != nil {
return err
}
meta := newJmdictMetadata(dictionary, "")
terms := dbTermList{}
for _, entry := range dictionary.Entries {
baseTerm := baseFormsTerm(entry, meta)
headwords := extractHeadwords(entry)
for _, h := range headwords {
if h.IsSearchOnly {
if term, ok := jmdictSearchTerm(h, entry, meta); ok {
terms = append(terms, term)
}
continue
}
term := baseTerm
term.Expression = h.Expression
term.Reading = h.Reading
term.addTermTags(h.TermTags...)
term.Score = calculateTermScore(1, 0, h)
terms = append(terms, term)
}
}
tags := dbTagList{}
tags = append(tags, entityTags(entities)...)
tags = append(tags, newsFrequencyTags()...)
tags = append(tags, customDbTags()...)
if title == "" {
title = "JMdict Forms"
}
recordData := map[string]dbRecordList{
"term": terms.crush(),
"tag": tags.crush(),
}
jmdictDate := jmdictPublicationDate(dictionary)
index := dbIndex{
Title: title,
Revision: "JMdict." + jmdictDate,
Sequenced: true,
Attribution: edrdgAttribution,
}
return writeDb(
outputPath,
index,
recordData,
stride,
pretty,
)
}