2021-01-01 22:31:58 +00:00
package yomichan
2016-08-07 01:17:02 +00:00
import (
	"os"
	"sort"
	"strings"

	"foosoft.net/projects/jmdict"
)
2023-01-22 20:27:02 +00:00
// edrdgAttribution is the licence notice for material taken from the JMdict
// dictionary files. jmdictExportDb stores it in the Attribution field of the
// exported dictionary index.
const edrdgAttribution = "This publication has included material from the JMdict (EDICT, etc.) dictionary files in accordance with the licence provisions of the Electronic Dictionaries Research Group. See http://www.edrdg.org/"
2016-12-24 05:52:49 +00:00
2016-12-18 19:46:47 +00:00
func jmdictBuildRules ( term * dbTerm ) {
2017-10-12 23:46:06 +00:00
for _ , tag := range term . DefinitionTags {
2016-12-17 23:48:13 +00:00
switch tag {
2020-12-06 04:48:34 +00:00
case "adj-i" , "v1" , "vk" , "vz" :
2016-12-17 23:48:13 +00:00
term . addRules ( tag )
default :
if strings . HasPrefix ( tag , "v5" ) {
term . addRules ( "v5" )
2022-08-14 19:35:20 +00:00
} else if strings . HasPrefix ( tag , "vs-" ) {
2016-12-20 05:42:21 +00:00
term . addRules ( "vs" )
2016-12-17 23:48:13 +00:00
}
}
}
}
2016-12-18 19:46:47 +00:00
func jmdictBuildScore ( term * dbTerm ) {
2017-10-12 23:46:06 +00:00
for _ , tag := range term . DefinitionTags {
2016-12-17 23:48:13 +00:00
switch tag {
2017-10-11 23:58:40 +00:00
case "arch" :
term . Score -= 100
}
}
for _ , tag := range term . TermTags {
switch tag {
case "news" , "ichi" , "spec" , "gai1" :
2017-08-24 03:03:06 +00:00
term . Score += 100
2016-12-21 06:59:17 +00:00
case "P" :
2017-08-24 03:03:06 +00:00
term . Score += 500
2017-10-24 10:24:41 +00:00
case "iK" , "ik" , "ok" , "oK" , "io" , "oik" :
2017-08-24 03:03:06 +00:00
term . Score -= 100
2016-12-17 23:48:13 +00:00
}
}
}
2016-12-18 19:46:47 +00:00
func jmdictAddPriorities ( term * dbTerm , priorities ... string ) {
for _ , priority := range priorities {
switch priority {
case "news1" , "ichi1" , "spec1" , "gai1" :
2017-10-11 06:39:29 +00:00
term . addTermTags ( "P" )
2016-12-18 19:46:47 +00:00
fallthrough
case "news2" , "ichi2" , "spec2" , "gai2" :
2017-10-11 06:39:29 +00:00
term . addTermTags ( priority [ : len ( priority ) - 1 ] )
2016-12-18 19:46:47 +00:00
}
}
}
2017-09-10 20:25:11 +00:00
func jmdictBuildTagMeta ( entities map [ string ] string ) dbTagList {
tags := dbTagList {
dbTag { Name : "news" , Notes : "appears frequently in Mainichi Shimbun" , Category : "frequent" , Order : - 2 } ,
dbTag { Name : "ichi" , Notes : "listed as common in Ichimango Goi Bunruishuu" , Category : "frequent" , Order : - 2 } ,
dbTag { Name : "spec" , Notes : "common words not included in frequency lists" , Category : "frequent" , Order : - 2 } ,
dbTag { Name : "gai" , Notes : "common loanword" , Category : "frequent" , Order : - 2 } ,
2017-10-24 10:24:41 +00:00
dbTag { Name : "P" , Notes : "popular term" , Category : "popular" , Order : - 10 , Score : 10 } ,
2016-12-18 01:08:38 +00:00
}
2016-12-17 23:48:13 +00:00
for name , value := range entities {
2017-09-10 20:25:11 +00:00
tag := dbTag { Name : name , Notes : value }
2016-12-17 23:48:13 +00:00
switch name {
2016-12-18 01:08:38 +00:00
case "exp" , "id" :
2016-12-18 01:39:24 +00:00
tag . Category = "expression"
2016-12-18 19:46:47 +00:00
tag . Order = - 5
2017-10-24 10:24:41 +00:00
case "arch" :
2016-12-18 01:39:24 +00:00
tag . Category = "archaism"
2017-08-24 03:03:06 +00:00
tag . Order = - 4
2017-10-24 10:24:41 +00:00
case "iK" , "ik" , "ok" , "oK" , "io" , "oik" :
tag . Score = - 5
2017-10-12 03:51:00 +00:00
case "adj-f" , "adj-i" , "adj-ix" , "adj-ku" , "adj-na" , "adj-nari" , "adj-no" , "adj-pn" , "adj-shiku" , "adj-t" , "adv" , "adv-to" , "aux-adj" ,
"aux" , "aux-v" , "conj" , "cop-da" , "ctr" , "int" , "n-adv" , "n" , "n-pref" , "n-pr" , "n-suf" , "n-t" , "num" , "pn" , "pref" , "prt" , "suf" ,
"unc" , "v1" , "v1-s" , "v2a-s" , "v2b-k" , "v2d-s" , "v2g-k" , "v2g-s" , "v2h-k" , "v2h-s" , "v2k-k" , "v2k-s" , "v2m-s" , "v2n-s" , "v2r-k" ,
"v2r-s" , "v2s-s" , "v2t-k" , "v2t-s" , "v2w-s" , "v2y-k" , "v2y-s" , "v2z-s" , "v4b" , "v4h" , "v4k" , "v4m" , "v4r" , "v4s" , "v4t" , "v5aru" ,
"v5b" , "v5g" , "v5k" , "v5k-s" , "v5m" , "v5n" , "v5r-i" , "v5r" , "v5s" , "v5t" , "v5u" , "v5u-s" , "vi" , "vk" , "vn" , "vr" , "vs-c" , "vs-i" ,
"vs" , "vs-s" , "vt" , "vz" :
2017-10-13 00:15:00 +00:00
tag . Category = "partOfSpeech"
2017-10-12 03:51:00 +00:00
tag . Order = - 3
2016-12-17 23:48:13 +00:00
}
2017-09-10 20:25:11 +00:00
tags = append ( tags , tag )
2016-12-17 23:48:13 +00:00
}
return tags
}
2017-06-10 23:53:31 +00:00
func jmdictExtractTerms ( edictEntry jmdict . JmdictEntry , language string ) [ ] dbTerm {
2016-11-05 20:13:13 +00:00
var terms [ ] dbTerm
2016-08-07 01:17:02 +00:00
convert := func ( reading jmdict . JmdictReading , kanji * jmdict . JmdictKanji ) {
2016-09-28 04:28:04 +00:00
if kanji != nil && reading . Restrictions != nil && ! hasString ( kanji . Expression , reading . Restrictions ) {
2016-08-07 01:17:02 +00:00
return
}
2016-11-05 20:13:13 +00:00
var termBase dbTerm
2017-10-11 06:39:29 +00:00
termBase . addTermTags ( reading . Information ... )
2016-09-18 18:36:54 +00:00
2016-08-07 01:17:02 +00:00
if kanji == nil {
2016-11-05 20:13:13 +00:00
termBase . Expression = reading . Reading
2016-12-18 19:46:47 +00:00
jmdictAddPriorities ( & termBase , reading . Priorities ... )
2016-08-07 01:17:02 +00:00
} else {
2016-11-05 20:13:13 +00:00
termBase . Expression = kanji . Expression
termBase . Reading = reading . Reading
2017-10-11 06:39:29 +00:00
termBase . addTermTags ( kanji . Information ... )
2016-11-05 20:13:13 +00:00
for _ , priority := range kanji . Priorities {
if hasString ( priority , reading . Priorities ) {
2016-12-18 19:46:47 +00:00
jmdictAddPriorities ( & termBase , priority )
2016-11-05 20:13:13 +00:00
}
}
2016-08-07 01:17:02 +00:00
}
2017-02-19 22:38:10 +00:00
var partsOfSpeech [ ] string
for index , sense := range edictEntry . Sense {
2017-10-12 04:01:36 +00:00
if len ( sense . PartsOfSpeech ) != 0 {
partsOfSpeech = sense . PartsOfSpeech
}
2016-09-28 04:37:13 +00:00
if sense . RestrictedReadings != nil && ! hasString ( reading . Reading , sense . RestrictedReadings ) {
2016-08-07 01:17:02 +00:00
continue
}
2016-09-28 04:37:13 +00:00
if kanji != nil && sense . RestrictedKanji != nil && ! hasString ( kanji . Expression , sense . RestrictedKanji ) {
2016-08-07 01:17:02 +00:00
continue
}
2017-06-10 23:53:31 +00:00
term := dbTerm {
Reading : termBase . Reading ,
Expression : termBase . Expression ,
2017-08-24 03:03:06 +00:00
Score : len ( edictEntry . Sense ) - index ,
2017-10-11 02:30:59 +00:00
Sequence : edictEntry . Sequence ,
2017-06-10 23:53:31 +00:00
}
for _ , glossary := range sense . Glossary {
if glossary . Language == nil && language == "" || glossary . Language != nil && language == * glossary . Language {
term . Glossary = append ( term . Glossary , glossary . Content )
}
}
if len ( term . Glossary ) == 0 {
continue
}
2017-10-12 23:46:06 +00:00
term . addDefinitionTags ( termBase . DefinitionTags ... )
2017-10-11 06:39:29 +00:00
term . addTermTags ( termBase . TermTags ... )
2017-10-12 23:46:06 +00:00
term . addDefinitionTags ( partsOfSpeech ... )
term . addDefinitionTags ( sense . Fields ... )
term . addDefinitionTags ( sense . Misc ... )
term . addDefinitionTags ( sense . Dialects ... )
2016-09-18 18:36:54 +00:00
2016-12-18 19:46:47 +00:00
jmdictBuildRules ( & term )
jmdictBuildScore ( & term )
2016-12-17 23:48:13 +00:00
2016-11-05 20:13:13 +00:00
terms = append ( terms , term )
2016-09-18 18:36:54 +00:00
}
2016-08-07 01:17:02 +00:00
}
if len ( edictEntry . Kanji ) > 0 {
for _ , kanji := range edictEntry . Kanji {
for _ , reading := range edictEntry . Readings {
2017-10-11 23:28:51 +00:00
if reading . NoKanji == nil {
convert ( reading , & kanji )
}
}
}
for _ , reading := range edictEntry . Readings {
2017-10-29 18:42:36 +00:00
if reading . NoKanji != nil {
2017-10-11 23:28:51 +00:00
convert ( reading , nil )
2016-08-07 01:17:02 +00:00
}
}
} else {
for _ , reading := range edictEntry . Readings {
convert ( reading , nil )
}
}
2016-11-05 20:13:13 +00:00
return terms
2016-08-07 01:17:02 +00:00
}
2017-06-26 00:22:17 +00:00
func jmdictExportDb ( inputPath , outputPath , language , title string , stride int , pretty bool ) error {
2016-12-19 01:46:40 +00:00
reader , err := os . Open ( inputPath )
if err != nil {
return err
}
defer reader . Close ( )
2016-08-07 01:17:02 +00:00
dict , entities , err := jmdict . LoadJmdictNoTransform ( reader )
if err != nil {
return err
}
2017-06-11 00:35:58 +00:00
var langTag string
switch language {
case "dutch" :
langTag = "dut"
case "french" :
langTag = "fre"
case "german" :
langTag = "ger"
case "hungarian" :
langTag = "hun"
case "italian" :
langTag = "ita"
case "russian" :
langTag = "rus"
case "slovenian" :
langTag = "slv"
case "spanish" :
langTag = "spa"
case "swedish" :
langTag = "swe"
}
2017-06-10 23:53:31 +00:00
2016-11-05 20:13:13 +00:00
var terms dbTermList
for _ , entry := range dict . Entries {
2017-06-10 23:53:31 +00:00
terms = append ( terms , jmdictExtractTerms ( entry , langTag ) ... )
2016-08-07 01:17:02 +00:00
}
2016-12-29 01:45:33 +00:00
if title == "" {
title = "JMdict"
}
2017-09-10 20:25:11 +00:00
recordData := map [ string ] dbRecordList {
2017-09-10 20:45:06 +00:00
"term" : terms . crush ( ) ,
"tag" : jmdictBuildTagMeta ( entities ) . crush ( ) ,
2017-09-10 20:25:11 +00:00
}
2023-01-22 20:27:02 +00:00
index := dbIndex {
Title : title ,
Revision : "jmdict4" ,
Sequenced : true ,
Attribution : edrdgAttribution ,
}
index . setDefaults ( )
2016-11-05 20:13:13 +00:00
return writeDb (
2017-06-26 00:22:17 +00:00
outputPath ,
2023-01-22 20:27:02 +00:00
index ,
2017-09-10 20:25:11 +00:00
recordData ,
2016-12-29 01:45:33 +00:00
stride ,
2016-12-19 01:31:27 +00:00
pretty ,
2016-11-05 20:13:13 +00:00
)
2016-08-07 01:17:02 +00:00
}