diff --git a/common.go b/common.go index 6ff4578..c411887 100644 --- a/common.go +++ b/common.go @@ -28,9 +28,7 @@ import ( "regexp" ) -type Parser func(decoder *xml.Decoder, element *xml.StartElement) error - -func parseDoc(reader io.Reader, container interface{}, transform bool) (map[string]string, error) { +func parseDict(reader io.Reader, container interface{}, transform bool) (map[string]string, error) { decoder := xml.NewDecoder(reader) var entities map[string]string @@ -62,38 +60,6 @@ func parseDoc(reader io.Reader, container interface{}, transform bool) (map[stri return entities, nil } -func parseDocument(reader io.Reader, transform bool, callback Parser) (map[string]string, error) { - decoder := xml.NewDecoder(reader) - - var entities map[string]string - for { - token, _ := decoder.Token() - if token == nil { - break - } - - switch startElement := token.(type) { - case xml.Directive: - directive := token.(xml.Directive) - entities = parseEntities(&directive) - if transform { - decoder.Entity = entities - } else { - decoder.Entity = make(map[string]string) - for k, _ := range entities { - decoder.Entity[k] = k - } - } - case xml.StartElement: - if err := callback(decoder, &startElement); err != nil { - return nil, err - } - } - } - - return entities, nil -} - func parseEntities(d *xml.Directive) map[string]string { re := regexp.MustCompile("") matches := re.FindAllStringSubmatch(string(*d), -1) diff --git a/edict.go b/jmdict.go similarity index 90% rename from edict.go rename to jmdict.go index 6fc7c78..193049e 100644 --- a/edict.go +++ b/jmdict.go @@ -22,15 +22,16 @@ package jmdict -import ( - "encoding/xml" - "io" -) +import "io" -// Entries consist of kanji elements, reading elements, -// general information and sense elements. Each entry must have at -// least one reading element and one sense element. Others are optional. -type EdictEntry struct { +type Jmdict struct { + // Entries consist of kanji elements, reading elements, + // general information and sense elements. Each entry must have at + // least one reading element and one sense element. Others are optional. + Entries []JmdictEntry `xml:"entry"` +} + +type JmdictEntry struct { // A unique numeric sequence number for each entry Sequence int `xml:"ent_seq"` @@ -44,7 +45,7 @@ type EdictEntry struct { // included, provided they are associated with appropriate information // fields. Synonyms are not included; they may be indicated in the // cross-reference field associated with the sense element. - Kanji []EdictKanji `xml:"k_ele"` + Kanji []JmdictKanji `xml:"k_ele"` // The reading element typically contains the valid readings // of the word(s) in the kanji element using modern kanadzukai. @@ -52,16 +53,16 @@ type EdictEntry struct { // alternative readings of the kanji element. In the absence of a // kanji element, i.e. in the case of a word or phrase written // entirely in kana, these elements will define the entry. - Readings []EdictReading `xml:"r_ele"` + Readings []JmdictReading `xml:"r_ele"` // The sense element will record the translational equivalent // of the Japanese word, plus other related information. Where there // are several distinctly different meanings of the word, multiple // sense elements will be employed. - Sense []EdictSense `xml:"sense"` + Sense []JmdictSense `xml:"sense"` } -type EdictKanji struct { +type JmdictKanji struct { // This element will contain a word or short phrase in Japanese // which is written using at least one non-kana character (usually kanji, // but can be other characters). The valid characters are @@ -105,7 +106,7 @@ type EdictKanji struct { Priorities []string `xml:"ke_pri"` } -type EdictReading struct { +type JmdictReading struct { // This element content is restricted to kana and related // characters such as chouon and kurikaeshi. Kana usage will be // consistent between the keb and reb elements; e.g. if the keb @@ -134,7 +135,7 @@ type EdictReading struct { Priorities []string `xml:"re_pri"` } -type EdictSource struct { +type JmdictSource struct { Content string `xml:",chardata"` // The xml:lang attribute defines the language(s) from which @@ -156,7 +157,7 @@ type EdictSource struct { Wasei string `xml:"ls_wasei,attr"` } -type EdictGlossary struct { +type JmdictGlossary struct { Content string `xml:",chardata"` // The xml:lang attribute defines the target language of the @@ -171,7 +172,7 @@ type EdictGlossary struct { Gender string `xml:"g_gend"` } -type EdictSense struct { +type JmdictSense struct { // These elements, if present, indicate that the sense is restricted // to the lexeme represented by the keb and/or reb. RestrictedKanji []string `xml:"stagk"` @@ -211,7 +212,7 @@ type EdictSense struct { // language(s) of a loan-word/gairaigo. If the source language is other // than English, the language is indicated by the xml:lang attribute. // The element value (if any) is the source word or phrase. - SourceLanguages []EdictSource `xml:"lsource"` + SourceLanguages []JmdictSource `xml:"lsource"` // For words specifically associated with regional dialects in // Japanese, the entity code for that dialect, e.g. ksb for Kansaiben. @@ -227,25 +228,17 @@ type EdictSense struct { // target-language words or phrases which are equivalents to the // Japanese word. This element would normally be present, however it // may be omitted in entries which are purely for a cross-reference. - Glossary []EdictGlossary `xml:"gloss"` + Glossary []JmdictGlossary `xml:"gloss"` } -func LoadEdict(reader io.Reader, transform bool) ([]EdictEntry, map[string]string, error) { - var entries []EdictEntry - - entities, err := parseDocument(reader, transform, func(decoder *xml.Decoder, element *xml.StartElement) error { - if element.Name.Local != "entry" { - return nil - } - - var entry EdictEntry - if err := decoder.DecodeElement(&entry, element); err != nil { - return err - } - - entries = append(entries, entry) - return nil - }) - - return entries, entities, err +func LoadJmdict(reader io.Reader) (Jmdict, map[string]string, error) { + var dict Jmdict + entities, err := parseDict(reader, &dict, true) + return dict, entities, err +} + +func LoadJmdictNoTransform(reader io.Reader) (Jmdict, map[string]string, error) { + var dict Jmdict + entities, err := parseDict(reader, &dict, false) + return dict, entities, err } diff --git a/enamdict.go b/jmnedict.go similarity index 84% rename from enamdict.go rename to jmnedict.go index c9cbd20..8648281 100644 --- a/enamdict.go +++ b/jmnedict.go @@ -22,15 +22,16 @@ package jmdict -import ( - "encoding/xml" - "io" -) +import "io" -// Entries consist of kanji elements, reading elements -// name translation elements. Each entry must have at -// least one reading element and one sense element. Others are optional. -type EnamdictEntry struct { +type Jmnedict struct { + // Entries consist of kanji elements, reading elements + // name translation elements. Each entry must have at + // least one reading element and one sense element. Others are optional. + Entries []JmnedictEntry `xml:"entry"` +} + +type JmnedictEntry struct { // A unique numeric sequence number for each entry Sequence int `xml:"ent_seq"` @@ -44,7 +45,7 @@ type EnamdictEntry struct { // included, provided they are associated with appropriate information // fields. Synonyms are not included; they may be indicated in the // cross-reference field associated with the sense element. - Kanji []EnamdictKanji `xml:"k_ele"` + Kanji []JmnedictKanji `xml:"k_ele"` // The reading element typically contains the valid readings // of the word(s) in the kanji element using modern kanadzukai. @@ -52,14 +53,14 @@ type EnamdictEntry struct { // alternative readings of the kanji element. In the absence of a // kanji element, i.e. in the case of a word or phrase written // entirely in kana, these elements will define the entry. - Readings []EnamdictReading `xml:"r_ele"` + Readings []JmnedictReading `xml:"r_ele"` // The trans element will record the translational equivalent // of the Japanese name, plus other related information. - Translations []EnamdictTranslation `xml:"trans"` + Translations []JmnedictTranslation `xml:"trans"` } -type EnamdictKanji struct { +type JmnedictKanji struct { // This element will contain an entity name in Japanese // which is written using at least one non-kana character (usually // kanji, but can be other characters). The valid @@ -82,7 +83,7 @@ type EnamdictKanji struct { Priorities []string `xml:"ke_pri"` } -type EnamdictReading struct { +type JmnedictReading struct { // This element content is restricted to kana and related // characters such as chouon and kurikaeshi. Kana usage will be // consistent between the keb and reb elements; e.g. if the keb @@ -104,7 +105,7 @@ type EnamdictReading struct { Priorities []string `xml:"re_pri"` } -type EnamdictTranslation struct { +type JmnedictTranslation struct { // The type of name, recorded in the appropriate entity codes. NameTypes []string `xml:"name_type"` @@ -129,22 +130,14 @@ type EnamdictTranslation struct { Language string `xml:"lang,attr"` } -func LoadEnamdict(reader io.Reader, transform bool) ([]EnamdictEntry, map[string]string, error) { - var entries []EnamdictEntry - - entities, err := parseDocument(reader, transform, func(decoder *xml.Decoder, element *xml.StartElement) error { - if element.Name.Local != "entry" { - return nil - } - - var entry EnamdictEntry - if err := decoder.DecodeElement(&entry, element); err != nil { - return err - } - - entries = append(entries, entry) - return nil - }) - - return entries, entities, err +func LoadJmnedict(reader io.Reader) (Jmnedict, map[string]string, error) { + var dic Jmnedict + entities, err := parseDict(reader, &dic, true) + return dic, entities, err +} + +func LoadJmnedictNoTransform(reader io.Reader) (Jmnedict, map[string]string, error) { + var dic Jmnedict + entities, err := parseDict(reader, &dic, false) + return dic, entities, err } diff --git a/kanjidic.go b/kanjidic.go index 54c8112..4ff9770 100644 --- a/kanjidic.go +++ b/kanjidic.go @@ -315,7 +315,7 @@ type KanjidicMeaning struct { } func LoadKanjidic(reader io.Reader) (Kanjidic, error) { - var kanjidic Kanjidic - _, err := parseDoc(reader, &kanjidic, false) - return kanjidic, err + var dic Kanjidic + _, err := parseDict(reader, &dic, true) + return dic, err }