Cleanup
This commit is contained in:
parent
dcc38db022
commit
670ad8bac9
36
common.go
36
common.go
@ -28,9 +28,7 @@ import (
|
|||||||
"regexp"
|
"regexp"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Parser func(decoder *xml.Decoder, element *xml.StartElement) error
|
func parseDict(reader io.Reader, container interface{}, transform bool) (map[string]string, error) {
|
||||||
|
|
||||||
func parseDoc(reader io.Reader, container interface{}, transform bool) (map[string]string, error) {
|
|
||||||
decoder := xml.NewDecoder(reader)
|
decoder := xml.NewDecoder(reader)
|
||||||
|
|
||||||
var entities map[string]string
|
var entities map[string]string
|
||||||
@ -62,38 +60,6 @@ func parseDoc(reader io.Reader, container interface{}, transform bool) (map[stri
|
|||||||
return entities, nil
|
return entities, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func parseDocument(reader io.Reader, transform bool, callback Parser) (map[string]string, error) {
|
|
||||||
decoder := xml.NewDecoder(reader)
|
|
||||||
|
|
||||||
var entities map[string]string
|
|
||||||
for {
|
|
||||||
token, _ := decoder.Token()
|
|
||||||
if token == nil {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
switch startElement := token.(type) {
|
|
||||||
case xml.Directive:
|
|
||||||
directive := token.(xml.Directive)
|
|
||||||
entities = parseEntities(&directive)
|
|
||||||
if transform {
|
|
||||||
decoder.Entity = entities
|
|
||||||
} else {
|
|
||||||
decoder.Entity = make(map[string]string)
|
|
||||||
for k, _ := range entities {
|
|
||||||
decoder.Entity[k] = k
|
|
||||||
}
|
|
||||||
}
|
|
||||||
case xml.StartElement:
|
|
||||||
if err := callback(decoder, &startElement); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return entities, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func parseEntities(d *xml.Directive) map[string]string {
|
func parseEntities(d *xml.Directive) map[string]string {
|
||||||
re := regexp.MustCompile("<!ENTITY\\s([0-9\\-A-z]+)\\s\"(.+)\">")
|
re := regexp.MustCompile("<!ENTITY\\s([0-9\\-A-z]+)\\s\"(.+)\">")
|
||||||
matches := re.FindAllStringSubmatch(string(*d), -1)
|
matches := re.FindAllStringSubmatch(string(*d), -1)
|
||||||
|
@ -22,15 +22,16 @@
|
|||||||
|
|
||||||
package jmdict
|
package jmdict
|
||||||
|
|
||||||
import (
|
import "io"
|
||||||
"encoding/xml"
|
|
||||||
"io"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Entries consist of kanji elements, reading elements,
|
type Jmdict struct {
|
||||||
// general information and sense elements. Each entry must have at
|
// Entries consist of kanji elements, reading elements,
|
||||||
// least one reading element and one sense element. Others are optional.
|
// general information and sense elements. Each entry must have at
|
||||||
type EdictEntry struct {
|
// least one reading element and one sense element. Others are optional.
|
||||||
|
Entries []JmdictEntry `xml:"entry"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type JmdictEntry struct {
|
||||||
// A unique numeric sequence number for each entry
|
// A unique numeric sequence number for each entry
|
||||||
Sequence int `xml:"ent_seq"`
|
Sequence int `xml:"ent_seq"`
|
||||||
|
|
||||||
@ -44,7 +45,7 @@ type EdictEntry struct {
|
|||||||
// included, provided they are associated with appropriate information
|
// included, provided they are associated with appropriate information
|
||||||
// fields. Synonyms are not included; they may be indicated in the
|
// fields. Synonyms are not included; they may be indicated in the
|
||||||
// cross-reference field associated with the sense element.
|
// cross-reference field associated with the sense element.
|
||||||
Kanji []EdictKanji `xml:"k_ele"`
|
Kanji []JmdictKanji `xml:"k_ele"`
|
||||||
|
|
||||||
// The reading element typically contains the valid readings
|
// The reading element typically contains the valid readings
|
||||||
// of the word(s) in the kanji element using modern kanadzukai.
|
// of the word(s) in the kanji element using modern kanadzukai.
|
||||||
@ -52,16 +53,16 @@ type EdictEntry struct {
|
|||||||
// alternative readings of the kanji element. In the absence of a
|
// alternative readings of the kanji element. In the absence of a
|
||||||
// kanji element, i.e. in the case of a word or phrase written
|
// kanji element, i.e. in the case of a word or phrase written
|
||||||
// entirely in kana, these elements will define the entry.
|
// entirely in kana, these elements will define the entry.
|
||||||
Readings []EdictReading `xml:"r_ele"`
|
Readings []JmdictReading `xml:"r_ele"`
|
||||||
|
|
||||||
// The sense element will record the translational equivalent
|
// The sense element will record the translational equivalent
|
||||||
// of the Japanese word, plus other related information. Where there
|
// of the Japanese word, plus other related information. Where there
|
||||||
// are several distinctly different meanings of the word, multiple
|
// are several distinctly different meanings of the word, multiple
|
||||||
// sense elements will be employed.
|
// sense elements will be employed.
|
||||||
Sense []EdictSense `xml:"sense"`
|
Sense []JmdictSense `xml:"sense"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type EdictKanji struct {
|
type JmdictKanji struct {
|
||||||
// This element will contain a word or short phrase in Japanese
|
// This element will contain a word or short phrase in Japanese
|
||||||
// which is written using at least one non-kana character (usually kanji,
|
// which is written using at least one non-kana character (usually kanji,
|
||||||
// but can be other characters). The valid characters are
|
// but can be other characters). The valid characters are
|
||||||
@ -105,7 +106,7 @@ type EdictKanji struct {
|
|||||||
Priorities []string `xml:"ke_pri"`
|
Priorities []string `xml:"ke_pri"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type EdictReading struct {
|
type JmdictReading struct {
|
||||||
// This element content is restricted to kana and related
|
// This element content is restricted to kana and related
|
||||||
// characters such as chouon and kurikaeshi. Kana usage will be
|
// characters such as chouon and kurikaeshi. Kana usage will be
|
||||||
// consistent between the keb and reb elements; e.g. if the keb
|
// consistent between the keb and reb elements; e.g. if the keb
|
||||||
@ -134,7 +135,7 @@ type EdictReading struct {
|
|||||||
Priorities []string `xml:"re_pri"`
|
Priorities []string `xml:"re_pri"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type EdictSource struct {
|
type JmdictSource struct {
|
||||||
Content string `xml:",chardata"`
|
Content string `xml:",chardata"`
|
||||||
|
|
||||||
// The xml:lang attribute defines the language(s) from which
|
// The xml:lang attribute defines the language(s) from which
|
||||||
@ -156,7 +157,7 @@ type EdictSource struct {
|
|||||||
Wasei string `xml:"ls_wasei,attr"`
|
Wasei string `xml:"ls_wasei,attr"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type EdictGlossary struct {
|
type JmdictGlossary struct {
|
||||||
Content string `xml:",chardata"`
|
Content string `xml:",chardata"`
|
||||||
|
|
||||||
// The xml:lang attribute defines the target language of the
|
// The xml:lang attribute defines the target language of the
|
||||||
@ -171,7 +172,7 @@ type EdictGlossary struct {
|
|||||||
Gender string `xml:"g_gend"`
|
Gender string `xml:"g_gend"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type EdictSense struct {
|
type JmdictSense struct {
|
||||||
// These elements, if present, indicate that the sense is restricted
|
// These elements, if present, indicate that the sense is restricted
|
||||||
// to the lexeme represented by the keb and/or reb.
|
// to the lexeme represented by the keb and/or reb.
|
||||||
RestrictedKanji []string `xml:"stagk"`
|
RestrictedKanji []string `xml:"stagk"`
|
||||||
@ -211,7 +212,7 @@ type EdictSense struct {
|
|||||||
// language(s) of a loan-word/gairaigo. If the source language is other
|
// language(s) of a loan-word/gairaigo. If the source language is other
|
||||||
// than English, the language is indicated by the xml:lang attribute.
|
// than English, the language is indicated by the xml:lang attribute.
|
||||||
// The element value (if any) is the source word or phrase.
|
// The element value (if any) is the source word or phrase.
|
||||||
SourceLanguages []EdictSource `xml:"lsource"`
|
SourceLanguages []JmdictSource `xml:"lsource"`
|
||||||
|
|
||||||
// For words specifically associated with regional dialects in
|
// For words specifically associated with regional dialects in
|
||||||
// Japanese, the entity code for that dialect, e.g. ksb for Kansaiben.
|
// Japanese, the entity code for that dialect, e.g. ksb for Kansaiben.
|
||||||
@ -227,25 +228,17 @@ type EdictSense struct {
|
|||||||
// target-language words or phrases which are equivalents to the
|
// target-language words or phrases which are equivalents to the
|
||||||
// Japanese word. This element would normally be present, however it
|
// Japanese word. This element would normally be present, however it
|
||||||
// may be omitted in entries which are purely for a cross-reference.
|
// may be omitted in entries which are purely for a cross-reference.
|
||||||
Glossary []EdictGlossary `xml:"gloss"`
|
Glossary []JmdictGlossary `xml:"gloss"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func LoadEdict(reader io.Reader, transform bool) ([]EdictEntry, map[string]string, error) {
|
func LoadJmdict(reader io.Reader) (Jmdict, map[string]string, error) {
|
||||||
var entries []EdictEntry
|
var dict Jmdict
|
||||||
|
entities, err := parseDict(reader, &dict, true)
|
||||||
entities, err := parseDocument(reader, transform, func(decoder *xml.Decoder, element *xml.StartElement) error {
|
return dict, entities, err
|
||||||
if element.Name.Local != "entry" {
|
}
|
||||||
return nil
|
|
||||||
}
|
func LoadJmdictNoTransform(reader io.Reader) (Jmdict, map[string]string, error) {
|
||||||
|
var dict Jmdict
|
||||||
var entry EdictEntry
|
entities, err := parseDict(reader, &dict, false)
|
||||||
if err := decoder.DecodeElement(&entry, element); err != nil {
|
return dict, entities, err
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
entries = append(entries, entry)
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
|
|
||||||
return entries, entities, err
|
|
||||||
}
|
}
|
@ -22,15 +22,16 @@
|
|||||||
|
|
||||||
package jmdict
|
package jmdict
|
||||||
|
|
||||||
import (
|
import "io"
|
||||||
"encoding/xml"
|
|
||||||
"io"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Entries consist of kanji elements, reading elements
|
type Jmnedict struct {
|
||||||
// name translation elements. Each entry must have at
|
// Entries consist of kanji elements, reading elements
|
||||||
// least one reading element and one sense element. Others are optional.
|
// name translation elements. Each entry must have at
|
||||||
type EnamdictEntry struct {
|
// least one reading element and one sense element. Others are optional.
|
||||||
|
Entries []JmnedictEntry `xml:"entry"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type JmnedictEntry struct {
|
||||||
// A unique numeric sequence number for each entry
|
// A unique numeric sequence number for each entry
|
||||||
Sequence int `xml:"ent_seq"`
|
Sequence int `xml:"ent_seq"`
|
||||||
|
|
||||||
@ -44,7 +45,7 @@ type EnamdictEntry struct {
|
|||||||
// included, provided they are associated with appropriate information
|
// included, provided they are associated with appropriate information
|
||||||
// fields. Synonyms are not included; they may be indicated in the
|
// fields. Synonyms are not included; they may be indicated in the
|
||||||
// cross-reference field associated with the sense element.
|
// cross-reference field associated with the sense element.
|
||||||
Kanji []EnamdictKanji `xml:"k_ele"`
|
Kanji []JmnedictKanji `xml:"k_ele"`
|
||||||
|
|
||||||
// The reading element typically contains the valid readings
|
// The reading element typically contains the valid readings
|
||||||
// of the word(s) in the kanji element using modern kanadzukai.
|
// of the word(s) in the kanji element using modern kanadzukai.
|
||||||
@ -52,14 +53,14 @@ type EnamdictEntry struct {
|
|||||||
// alternative readings of the kanji element. In the absence of a
|
// alternative readings of the kanji element. In the absence of a
|
||||||
// kanji element, i.e. in the case of a word or phrase written
|
// kanji element, i.e. in the case of a word or phrase written
|
||||||
// entirely in kana, these elements will define the entry.
|
// entirely in kana, these elements will define the entry.
|
||||||
Readings []EnamdictReading `xml:"r_ele"`
|
Readings []JmnedictReading `xml:"r_ele"`
|
||||||
|
|
||||||
// The trans element will record the translational equivalent
|
// The trans element will record the translational equivalent
|
||||||
// of the Japanese name, plus other related information.
|
// of the Japanese name, plus other related information.
|
||||||
Translations []EnamdictTranslation `xml:"trans"`
|
Translations []JmnedictTranslation `xml:"trans"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type EnamdictKanji struct {
|
type JmnedictKanji struct {
|
||||||
// This element will contain an entity name in Japanese
|
// This element will contain an entity name in Japanese
|
||||||
// which is written using at least one non-kana character (usually
|
// which is written using at least one non-kana character (usually
|
||||||
// kanji, but can be other characters). The valid
|
// kanji, but can be other characters). The valid
|
||||||
@ -82,7 +83,7 @@ type EnamdictKanji struct {
|
|||||||
Priorities []string `xml:"ke_pri"`
|
Priorities []string `xml:"ke_pri"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type EnamdictReading struct {
|
type JmnedictReading struct {
|
||||||
// This element content is restricted to kana and related
|
// This element content is restricted to kana and related
|
||||||
// characters such as chouon and kurikaeshi. Kana usage will be
|
// characters such as chouon and kurikaeshi. Kana usage will be
|
||||||
// consistent between the keb and reb elements; e.g. if the keb
|
// consistent between the keb and reb elements; e.g. if the keb
|
||||||
@ -104,7 +105,7 @@ type EnamdictReading struct {
|
|||||||
Priorities []string `xml:"re_pri"`
|
Priorities []string `xml:"re_pri"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type EnamdictTranslation struct {
|
type JmnedictTranslation struct {
|
||||||
// The type of name, recorded in the appropriate entity codes.
|
// The type of name, recorded in the appropriate entity codes.
|
||||||
NameTypes []string `xml:"name_type"`
|
NameTypes []string `xml:"name_type"`
|
||||||
|
|
||||||
@ -129,22 +130,14 @@ type EnamdictTranslation struct {
|
|||||||
Language string `xml:"lang,attr"`
|
Language string `xml:"lang,attr"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func LoadEnamdict(reader io.Reader, transform bool) ([]EnamdictEntry, map[string]string, error) {
|
func LoadJmnedict(reader io.Reader) (Jmnedict, map[string]string, error) {
|
||||||
var entries []EnamdictEntry
|
var dic Jmnedict
|
||||||
|
entities, err := parseDict(reader, &dic, true)
|
||||||
entities, err := parseDocument(reader, transform, func(decoder *xml.Decoder, element *xml.StartElement) error {
|
return dic, entities, err
|
||||||
if element.Name.Local != "entry" {
|
}
|
||||||
return nil
|
|
||||||
}
|
func LoadJmnedictNoTransform(reader io.Reader) (Jmnedict, map[string]string, error) {
|
||||||
|
var dic Jmnedict
|
||||||
var entry EnamdictEntry
|
entities, err := parseDict(reader, &dic, false)
|
||||||
if err := decoder.DecodeElement(&entry, element); err != nil {
|
return dic, entities, err
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
entries = append(entries, entry)
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
|
|
||||||
return entries, entities, err
|
|
||||||
}
|
}
|
@ -315,7 +315,7 @@ type KanjidicMeaning struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func LoadKanjidic(reader io.Reader) (Kanjidic, error) {
|
func LoadKanjidic(reader io.Reader) (Kanjidic, error) {
|
||||||
var kanjidic Kanjidic
|
var dic Kanjidic
|
||||||
_, err := parseDoc(reader, &kanjidic, false)
|
_, err := parseDict(reader, &dic, true)
|
||||||
return kanjidic, err
|
return dic, err
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user