Cleanup

2016-08-06 17:43:26 -07:00 · 2016-08-06 17:43:26 -07:00 · 670ad8bac9
commit 670ad8bac9
parent dcc38db022
4 changed files with 58 additions and 106 deletions
--- a/common.go
+++ b/common.go
@@ -28,9 +28,7 @@ import (
 	"regexp"
 )

-type Parser func(decoder *xml.Decoder, element *xml.StartElement) error
-
-func parseDoc(reader io.Reader, container interface{}, transform bool) (map[string]string, error) {
+func parseDict(reader io.Reader, container interface{}, transform bool) (map[string]string, error) {
 	decoder := xml.NewDecoder(reader)

 	var entities map[string]string
@@ -62,38 +60,6 @@ func parseDoc(reader io.Reader, container interface{}, transform bool) (map[stri
 	return entities, nil
 }

-func parseDocument(reader io.Reader, transform bool, callback Parser) (map[string]string, error) {
-	decoder := xml.NewDecoder(reader)
-
-	var entities map[string]string
-	for {
-		token, _ := decoder.Token()
-		if token == nil {
-			break
-		}
-
-		switch startElement := token.(type) {
-		case xml.Directive:
-			directive := token.(xml.Directive)
-			entities = parseEntities(&directive)
-			if transform {
-				decoder.Entity = entities
-			} else {
-				decoder.Entity = make(map[string]string)
-				for k, _ := range entities {
-					decoder.Entity[k] = k
-				}
-			}
-		case xml.StartElement:
-			if err := callback(decoder, &startElement); err != nil {
-				return nil, err
-			}
-		}
-	}
-
-	return entities, nil
-}
-
 func parseEntities(d *xml.Directive) map[string]string {
 	re := regexp.MustCompile("<!ENTITY\\s([0-9\\-A-z]+)\\s\"(.+)\">")
 	matches := re.FindAllStringSubmatch(string(*d), -1)
--- a/jmdict.go
+++ b/jmdict.go
@@ -22,15 +22,16 @@

 package jmdict

-import (
-	"encoding/xml"
-	"io"
-)
+import "io"

-// Entries consist of kanji elements, reading elements,
-// general information and sense elements. Each entry must have at
-// least one reading element and one sense element. Others are optional.
-type EdictEntry struct {
+type Jmdict struct {
+	// Entries consist of kanji elements, reading elements,
+	// general information and sense elements. Each entry must have at
+	// least one reading element and one sense element. Others are optional.
+	Entries []JmdictEntry `xml:"entry"`
+}
+
+type JmdictEntry struct {
 	// A unique numeric sequence number for each entry
 	Sequence int `xml:"ent_seq"`

@@ -44,7 +45,7 @@ type EdictEntry struct {
 	// included, provided they are associated with appropriate information
 	// fields. Synonyms are not included; they may be indicated in the
 	// cross-reference field associated with the sense element.
-	Kanji []EdictKanji `xml:"k_ele"`
+	Kanji []JmdictKanji `xml:"k_ele"`

 	// The reading element typically contains the valid readings
 	// of the word(s) in the kanji element using modern kanadzukai.
@@ -52,16 +53,16 @@ type EdictEntry struct {
 	// alternative readings of the kanji element. In the absence of a
 	// kanji element, i.e. in the case of a word or phrase written
 	// entirely in kana, these elements will define the entry.
-	Readings []EdictReading `xml:"r_ele"`
+	Readings []JmdictReading `xml:"r_ele"`

 	// The sense element will record the translational equivalent
 	// of the Japanese word, plus other related information. Where there
 	// are several distinctly different meanings of the word, multiple
 	// sense elements will be employed.
-	Sense []EdictSense `xml:"sense"`
+	Sense []JmdictSense `xml:"sense"`
 }

-type EdictKanji struct {
+type JmdictKanji struct {
 	// This element will contain a word or short phrase in Japanese
 	// which is written using at least one non-kana character (usually kanji,
 	// but can be other characters). The valid characters are
@@ -105,7 +106,7 @@ type EdictKanji struct {
 	Priorities []string `xml:"ke_pri"`
 }

-type EdictReading struct {
+type JmdictReading struct {
 	// This element content is restricted to kana and related
 	// characters such as chouon and kurikaeshi. Kana usage will be
 	// consistent between the keb and reb elements; e.g. if the keb
@@ -134,7 +135,7 @@ type EdictReading struct {
 	Priorities []string `xml:"re_pri"`
 }

-type EdictSource struct {
+type JmdictSource struct {
 	Content string `xml:",chardata"`

 	// The xml:lang attribute defines the language(s) from which
@@ -156,7 +157,7 @@ type EdictSource struct {
 	Wasei string `xml:"ls_wasei,attr"`
 }

-type EdictGlossary struct {
+type JmdictGlossary struct {
 	Content string `xml:",chardata"`

 	// The xml:lang attribute defines the target language of the
@@ -171,7 +172,7 @@ type EdictGlossary struct {
 	Gender string `xml:"g_gend"`
 }

-type EdictSense struct {
+type JmdictSense struct {
 	// These elements, if present, indicate that the sense is restricted
 	// to the lexeme represented by the keb and/or reb.
 	RestrictedKanji    []string `xml:"stagk"`
@@ -211,7 +212,7 @@ type EdictSense struct {
 	// language(s) of a loan-word/gairaigo. If the source language is other
 	// than English, the language is indicated by the xml:lang attribute.
 	// The element value (if any) is the source word or phrase.
-	SourceLanguages []EdictSource `xml:"lsource"`
+	SourceLanguages []JmdictSource `xml:"lsource"`

 	// For words specifically associated with regional dialects in
 	// Japanese, the entity code for that dialect, e.g. ksb for Kansaiben.
@@ -227,25 +228,17 @@ type EdictSense struct {
 	// target-language words or phrases which are equivalents to the
 	// Japanese word. This element would normally be present, however it
 	// may be omitted in entries which are purely for a cross-reference.
-	Glossary []EdictGlossary `xml:"gloss"`
+	Glossary []JmdictGlossary `xml:"gloss"`
 }

-func LoadEdict(reader io.Reader, transform bool) ([]EdictEntry, map[string]string, error) {
-	var entries []EdictEntry
-
-	entities, err := parseDocument(reader, transform, func(decoder *xml.Decoder, element *xml.StartElement) error {
-		if element.Name.Local != "entry" {
-			return nil
-		}
-
-		var entry EdictEntry
-		if err := decoder.DecodeElement(&entry, element); err != nil {
-			return err
-		}
-
-		entries = append(entries, entry)
-		return nil
-	})
-
-	return entries, entities, err
+func LoadJmdict(reader io.Reader) (Jmdict, map[string]string, error) {
+	var dict Jmdict
+	entities, err := parseDict(reader, &dict, true)
+	return dict, entities, err
+}
+
+func LoadJmdictNoTransform(reader io.Reader) (Jmdict, map[string]string, error) {
+	var dict Jmdict
+	entities, err := parseDict(reader, &dict, false)
+	return dict, entities, err
 }
--- a/jmnedict.go
+++ b/jmnedict.go
@@ -22,15 +22,16 @@

 package jmdict

-import (
-	"encoding/xml"
-	"io"
-)
+import "io"

-// Entries consist of kanji elements, reading elements
-// name translation elements. Each entry must have at
-// least one reading element and one sense element. Others are optional.
-type EnamdictEntry struct {
+type Jmnedict struct {
+	// Entries consist of kanji elements, reading elements
+	// name translation elements. Each entry must have at
+	// least one reading element and one sense element. Others are optional.
+	Entries []JmnedictEntry `xml:"entry"`
+}
+
+type JmnedictEntry struct {
 	// A unique numeric sequence number for each entry
 	Sequence int `xml:"ent_seq"`

@@ -44,7 +45,7 @@ type EnamdictEntry struct {
 	// included, provided they are associated with appropriate information
 	// fields. Synonyms are not included; they may be indicated in the
 	// cross-reference field associated with the sense element.
-	Kanji []EnamdictKanji `xml:"k_ele"`
+	Kanji []JmnedictKanji `xml:"k_ele"`

 	// The reading element typically contains the valid readings
 	// of the word(s) in the kanji element using modern kanadzukai.
@@ -52,14 +53,14 @@ type EnamdictEntry struct {
 	// alternative readings of the kanji element. In the absence of a
 	// kanji element, i.e. in the case of a word or phrase written
 	// entirely in kana, these elements will define the entry.
-	Readings []EnamdictReading `xml:"r_ele"`
+	Readings []JmnedictReading `xml:"r_ele"`

 	// The trans element will record the translational equivalent
 	// of the Japanese name, plus other related information.
-	Translations []EnamdictTranslation `xml:"trans"`
+	Translations []JmnedictTranslation `xml:"trans"`
 }

-type EnamdictKanji struct {
+type JmnedictKanji struct {
 	// This element will contain an entity name in Japanese
 	// which is written using at least one non-kana character (usually
 	// kanji, but can be other characters). The valid
@@ -82,7 +83,7 @@ type EnamdictKanji struct {
 	Priorities []string `xml:"ke_pri"`
 }

-type EnamdictReading struct {
+type JmnedictReading struct {
 	// This element content is restricted to kana and related
 	// characters such as chouon and kurikaeshi. Kana usage will be
 	// consistent between the keb and reb elements; e.g. if the keb
@@ -104,7 +105,7 @@ type EnamdictReading struct {
 	Priorities []string `xml:"re_pri"`
 }

-type EnamdictTranslation struct {
+type JmnedictTranslation struct {
 	// The type of name, recorded in the appropriate entity codes.
 	NameTypes []string `xml:"name_type"`

@@ -129,22 +130,14 @@ type EnamdictTranslation struct {
 	Language string `xml:"lang,attr"`
 }

-func LoadEnamdict(reader io.Reader, transform bool) ([]EnamdictEntry, map[string]string, error) {
-	var entries []EnamdictEntry
-
-	entities, err := parseDocument(reader, transform, func(decoder *xml.Decoder, element *xml.StartElement) error {
-		if element.Name.Local != "entry" {
-			return nil
-		}
-
-		var entry EnamdictEntry
-		if err := decoder.DecodeElement(&entry, element); err != nil {
-			return err
-		}
-
-		entries = append(entries, entry)
-		return nil
-	})
-
-	return entries, entities, err
+func LoadJmnedict(reader io.Reader) (Jmnedict, map[string]string, error) {
+	var dic Jmnedict
+	entities, err := parseDict(reader, &dic, true)
+	return dic, entities, err
+}
+
+func LoadJmnedictNoTransform(reader io.Reader) (Jmnedict, map[string]string, error) {
+	var dic Jmnedict
+	entities, err := parseDict(reader, &dic, false)
+	return dic, entities, err
 }
--- a/kanjidic.go
+++ b/kanjidic.go
@@ -315,7 +315,7 @@ type KanjidicMeaning struct {
 }

 func LoadKanjidic(reader io.Reader) (Kanjidic, error) {
-	var kanjidic Kanjidic
-	_, err := parseDoc(reader, &kanjidic, false)
-	return kanjidic, err
+	var dic Kanjidic
+	_, err := parseDict(reader, &dic, true)
+	return dic, err
 }