1
yomichan-import/jmdictForms.go
stephenmk d8a3b420ee
Exclude "search" and "forms" terms from non-English dictionaries
This allows a user to install the English version and another version
without cluttering their setup with duplicated information.

If a user doesn't want to use the English version, they can get the
"search" and "forms" terms by installing the separate jmdict_forms
file.
2023-01-22 17:55:27 -06:00

259 lines
5.9 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package yomichan
import (
"os"
"strings"
"foosoft.net/projects/jmdict"
"golang.org/x/exp/slices"
)
// kata2hira returns a copy of word in which katakana characters are replaced
// by the characters 0x60 code points below them (the hiragana block).
//
// Only runes in the ranges 'ァ'..'ヶ' and 'ヽ'..'ヾ' are shifted; every other
// rune is passed through unchanged.
func kata2hira(word string) string {
	return strings.Map(func(r rune) rune {
		switch {
		case r >= 'ァ' && r <= 'ヶ', r >= 'ヽ' && r <= 'ヾ':
			return r - 0x60
		default:
			return r
		}
	}, word)
}
// InfoSymbols returns the symbols for every flag set on the headword, joined
// with " | ".
//
// Symbols always appear in the fixed order priority, rare kanji, irregular,
// outdated. The result is the empty string when none of the flags is set.
func (h *headword) InfoSymbols() string {
	flags := []struct {
		set    bool
		symbol string
	}{
		{h.IsPriority, prioritySymbol},
		{h.IsRareKanji, rareKanjiSymbol},
		{h.IsIrregular, irregularSymbol},
		{h.IsOutdated, outdatedSymbol},
	}

	symbols := []string{}
	for _, flag := range flags {
		if flag.set {
			symbols = append(symbols, flag.symbol)
		}
	}
	return strings.Join(symbols, " | ")
}
// GlossText returns the plain-text gloss line for the headword: its
// expression (wrapped in 〈〉 when the headword is ateji) followed by its info
// symbols, if it has any.
func (h *headword) GlossText() string {
	text := h.Expression
	if h.IsAteji {
		text = "〈" + text + "〉"
	}
	if symbols := h.InfoSymbols(); symbols != "" {
		// NOTE(review): the delimiters around the symbols are empty strings
		// here, so the symbols are appended directly to the expression.
		// Possibly wrapper characters were intended — confirm.
		text += "" + symbols + ""
	}
	return text
}
// TableColHeaderText returns the column header text for the headword in a
// forms table: its kanji form, wrapped in 〈〉 when the headword is ateji.
func (h *headword) TableColHeaderText() string {
	form := h.KanjiForm()
	if !h.IsAteji {
		return form
	}
	return "〈" + form + "〉"
}
// TableRowHeaderText returns the row header text for the headword in a forms
// table: its reading, wrapped in 〈〉 when the headword is gikun.
func (h *headword) TableRowHeaderText() string {
	if !h.IsGikun {
		return h.Reading
	}
	return "〈" + h.Reading + "〉"
}
// TableCellText returns the text shown in the forms-table cell for the
// headword: its info symbols, or defaultSymbol when it has none.
func (h *headword) TableCellText() string {
	if symbols := h.InfoSymbols(); symbols != "" {
		return symbols
	}
	return defaultSymbol
}
// KanjiForm returns the headword's expression, or the placeholder "∅" when
// the headword is kana-only.
func (h *headword) KanjiForm() string {
	if !h.IsKanaOnly() {
		return h.Expression
	}
	return "∅"
}
// jmdNeedsFormTable reports whether the headwords should be rendered as a
// forms table.
//
// It returns true as soon as a gikun headword is encountered, or when the
// remaining headwords (search-only and kana-only ones are ignored) carry more
// than one distinct reading. Readings are compared after kata2hira
// normalization, so e.g. バカがい and ばかがい are not distinct.
func jmdNeedsFormTable(headwords []headword) bool {
	firstReading := ""
	for _, hw := range headwords {
		switch {
		case hw.IsGikun:
			return true
		case hw.IsSearchOnly, hw.IsKanaOnly():
			// These headwords do not take part in the reading comparison.
			continue
		}

		normalized := kata2hira(hw.Reading)
		if firstReading == "" {
			// First reading that counts; remember it for later comparisons.
			firstReading = normalized
		} else if firstReading != normalized {
			return true
		}
	}
	return false
}
// formTableData holds the contents of a forms table: the ordered column and
// row keys plus the display text for headers and cells.
type formTableData struct {
	// kanjiForms and readings list the column and row keys, respectively.
	kanjiForms, readings []string

	// colHeaderText maps a kanji form to its column header text;
	// rowHeaderText maps a reading to its row header text.
	colHeaderText, rowHeaderText map[string]string

	// cellText maps reading -> kanji form -> cell text.
	cellText map[string]map[string]string
}
// tableData collects the rows, columns and cell texts of a forms table from
// headwords.
//
// Search-only headwords are skipped. Kanji forms (columns) and readings
// (rows) are recorded in the order they are first seen, and their header
// texts come from the first headword that introduced them. When several
// headwords share the same reading and kanji form, the cell text of the last
// one wins.
func tableData(headwords []headword) formTableData {
	table := formTableData{
		kanjiForms:    []string{},
		readings:      []string{},
		colHeaderText: make(map[string]string),
		rowHeaderText: make(map[string]string),
		cellText:      make(map[string]map[string]string),
	}

	for _, hw := range headwords {
		if hw.IsSearchOnly {
			continue
		}

		column := hw.KanjiForm()
		if !slices.Contains(table.kanjiForms, column) {
			table.kanjiForms = append(table.kanjiForms, column)
			table.colHeaderText[column] = hw.TableColHeaderText()
		}

		row := hw.Reading
		if !slices.Contains(table.readings, row) {
			table.readings = append(table.readings, row)
			table.rowHeaderText[row] = hw.TableRowHeaderText()
			// Each new row needs its own inner map before cells are stored.
			table.cellText[row] = make(map[string]string)
		}

		table.cellText[row][column] = hw.TableCellText()
	}

	return table
}
// formsTableGlossary builds a single-element glossary containing a structured
// content table of the headwords' forms.
//
// The table has one column per kanji form and one row per reading, as
// produced by tableData. The first row holds the column headers, preceded by
// an empty corner cell; each following row starts with a left-aligned reading
// header. A reading/kanji-form pair with no recorded cell text yields a cell
// with empty text.
func formsTableGlossary(headwords []headword) []any {
	forms := tableData(headwords)

	var (
		plain    = contentAttr{}
		centered = contentAttr{textAlign: "center"}
		leftSide = contentAttr{textAlign: "left"}
	)

	// Header row: empty upper-left corner cell, then one cell per kanji form.
	header := []any{contentTableHeadCell(plain, "")}
	for _, kanjiForm := range forms.kanjiForms {
		header = append(header, contentTableHeadCell(centered, forms.colHeaderText[kanjiForm]))
	}
	rows := []any{contentTableRow(plain, header...)}

	// Body rows: reading header followed by one cell per kanji form.
	for _, reading := range forms.readings {
		cells := []any{contentTableHeadCell(leftSide, forms.rowHeaderText[reading])}
		for _, kanjiForm := range forms.kanjiForms {
			cells = append(cells, contentTableCell(centered, forms.cellText[reading][kanjiForm]))
		}
		rows = append(rows, contentTableRow(plain, cells...))
	}

	table := contentTable(
		contentAttr{data: map[string]string{"content": "formsTable"}},
		rows...,
	)
	return []any{contentStructure(table)}
}
// formsGlossary builds a plain glossary with one gloss text per headword,
// leaving out search-only headwords. The result is an empty, non-nil slice
// when no headword qualifies.
func formsGlossary(headwords []headword) []any {
	glosses := []any{}
	for _, hw := range headwords {
		if !hw.IsSearchOnly {
			glosses = append(glosses, hw.GlossText())
		}
	}
	return glosses
}
// baseFormsTerm builds the term shared by all non-search headwords of entry.
//
// The term carries the entry's sequence number, a forms glossary (a table
// when jmdNeedsFormTable says one is needed, a plain list otherwise) and the
// grammar rules derived from the parts of speech of every sense. Expression
// and reading are not set here.
func baseFormsTerm(entry jmdict.JmdictEntry) dbTerm {
	headwords := extractHeadwords(entry)

	buildGlossary := formsGlossary
	if jmdNeedsFormTable(headwords) {
		buildGlossary = formsTableGlossary
	}

	term := dbTerm{
		Sequence: entry.Sequence,
		Glossary: buildGlossary(headwords),
	}
	for _, sense := range entry.Sense {
		term.addRules(grammarRules(sense.PartsOfSpeech)...)
	}
	return term
}
// formsExportDb reads the JMdict file at inputPath and writes a "forms"
// dictionary to outputPath.
//
// For every entry, each search-only headword becomes a search term (via
// createSearchTerm) and every other headword becomes a copy of the entry's
// base forms term with that headword's expression and reading filled in.
// When title is empty, "JMdict Forms" is used. stride and pretty are passed
// through to writeDb unchanged.
//
// It returns the error from opening the input, loading the dictionary, or
// writing the database, whichever occurs first.
func formsExportDb(inputPath, outputPath, languageName, title string, stride int, pretty bool) error {
	file, err := os.Open(inputPath)
	if err != nil {
		return err
	}
	defer file.Close()

	// The second return value of the loader is not used here.
	dict, _, err := jmdict.LoadJmdictNoTransform(file)
	if err != nil {
		return err
	}

	meta := newJmdictMetadata(dict, languageName)

	terms := dbTermList{}
	for _, entry := range dict.Entries {
		shared := baseFormsTerm(entry)
		for _, hw := range extractHeadwords(entry) {
			if hw.IsSearchOnly {
				terms = append(terms, createSearchTerm(hw, entry, meta))
				continue
			}
			// Copy the shared term and specialize it for this headword.
			term := shared
			term.Expression = hw.Expression
			term.Reading = hw.Reading
			terms = append(terms, term)
		}
	}

	if title == "" {
		title = "JMdict Forms"
	}

	records := map[string]dbRecordList{
		"term": terms.crush(),
		"tag":  {},
	}

	index := dbIndex{
		Title:       title,
		Revision:    "JMdict." + jmdictPublicationDate(dict),
		Sequenced:   true,
		Attribution: edrdgAttribution,
	}
	index.setDefaults()

	return writeDb(outputPath, index, records, stride, pretty)
}