From 7bd967915c4597c38a93d79cb89a5f305ddbedca Mon Sep 17 00:00:00 2001 From: stephenmk Date: Wed, 25 Jan 2023 18:26:47 -0600 Subject: [PATCH] Add "forms" term in special circumstances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a headword appears in multiple entries, then each entry needs a corresponding "forms" term in the output dictionary. For example, 軽卒 is the only headword in entry 2275730, but 軽卒 also appears as an irregular form in entry 1252910. If a "forms" term is not included for the former entry, then it will appear that 軽卒 is irregular for all senses in the output dictionary. --- jmdict.go | 59 ++++++++++++++++++++++++++++++++++---------------- jmdictForms.go | 19 ++++++++-------- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/jmdict.go b/jmdict.go index 7283907..362b14d 100644 --- a/jmdict.go +++ b/jmdict.go @@ -62,7 +62,21 @@ func jmdictPublicationDate(dictionary jmdict.Jmdict) string { return jmdictDate } -func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm { +func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) { + // Don't add "forms" terms to non-English dictionaries. + // Information would be duplicated if users installed more + // than one version. + if meta.language != "eng" { + return dbTerm{}, false + } + // Don't need a "forms" term for entries with one unique + // headword which does not appear in any other entries. + if !meta.hasMultipleForms[entry.Sequence] { + if len(meta.headwordHashToSeqs[headword.Hash()]) == 1 { + return dbTerm{}, false + } + } + term := baseFormsTerm(entry) term.Expression = headword.Expression term.Reading = headword.Reading @@ -72,10 +86,17 @@ func createFormsTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMet term.addDefinitionTags("forms") senseNumber := meta.seqToSenseCount[entry.Sequence] + 1 term.Score = calculateTermScore(senseNumber, headword) - return term + return term, true } -func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm { +func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) { + // Don't add "search" terms to non-English dictionaries. + // Information would be duplicated if users installed more + // than one version. + if meta.language != "eng" { + return dbTerm{}, false + } + term := dbTerm{ Expression: headword.Expression, Sequence: -entry.Sequence, @@ -98,10 +119,17 @@ func createSearchTerm(headword headword, entry jmdict.JmdictEntry, meta jmdictMe ) term.Glossary = []any{contentStructure(content)} - return term + return term, true } -func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) dbTerm { +func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) (dbTerm, bool) { + if sense.RestrictedReadings != nil && !slices.Contains(sense.RestrictedReadings, headword.Reading) { + return dbTerm{}, false + } + if sense.RestrictedKanji != nil && !slices.Contains(sense.RestrictedKanji, headword.Expression) { + return dbTerm{}, false + } + term := dbTerm{ Expression: headword.Expression, Reading: headword.Reading, @@ -126,7 +154,7 @@ func createSenseTerm(sense jmdict.JmdictSense, senseNumber int, headword headwor term.Score = calculateTermScore(senseNumber, headword) - return term + return term, true } func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetadata) ([]dbTerm, bool) { @@ -134,8 +162,7 @@ func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetada return nil, false } if headword.IsSearchOnly { - if meta.language == "eng" { - searchTerm := createSearchTerm(headword, entry, meta) + if searchTerm, ok := createSearchTerm(headword, entry, meta); ok { return []dbTerm{searchTerm}, true } else { return nil, false @@ -145,25 +172,19 @@ func extractTerms(headword headword, entry jmdict.JmdictEntry, meta jmdictMetada senseNumber := 1 for _, sense := range entry.Sense { if !glossaryContainsLanguage(sense.Glossary, meta.language) { + // Do not increment sense number continue } - if sense.RestrictedReadings != nil && !slices.Contains(sense.RestrictedReadings, headword.Reading) { - senseNumber += 1 - continue + if senseTerm, ok := createSenseTerm(sense, senseNumber, headword, entry, meta); ok { + terms = append(terms, senseTerm) } - if sense.RestrictedKanji != nil && !slices.Contains(sense.RestrictedKanji, headword.Expression) { - senseNumber += 1 - continue - } - senseTerm := createSenseTerm(sense, senseNumber, headword, entry, meta) senseNumber += 1 - terms = append(terms, senseTerm) } - if meta.hasMultipleForms[entry.Sequence] && meta.language == "eng" { - formsTerm := createFormsTerm(headword, entry, meta) + if formsTerm, ok := createFormsTerm(headword, entry, meta); ok { terms = append(terms, formsTerm) } + return terms, true } diff --git a/jmdictForms.go b/jmdictForms.go index 9d21ac4..af4bba6 100644 --- a/jmdictForms.go +++ b/jmdictForms.go @@ -210,23 +210,24 @@ func formsExportDb(inputPath, outputPath, languageName, title string, stride int return err } - meta := newJmdictMetadata(dictionary, languageName) + meta := newJmdictMetadata(dictionary, "english") terms := dbTermList{} for _, entry := range dictionary.Entries { baseTerm := baseFormsTerm(entry) headwords := extractHeadwords(entry) for _, h := range headwords { - var term dbTerm if h.IsSearchOnly { - term = createSearchTerm(h, entry, meta) - } else { - term = baseTerm - term.Expression = h.Expression - term.Reading = h.Reading - term.addTermTags(h.TermTags...) - term.Score = calculateTermScore(1, h) + if term, ok := createSearchTerm(h, entry, meta); ok { + terms = append(terms, term) + } + continue } + term := baseTerm + term.Expression = h.Expression + term.Reading = h.Reading + term.addTermTags(h.TermTags...) + term.Score = calculateTermScore(1, h) terms = append(terms, term) } }