This commit is contained in:
Alex Yatskov 2016-08-06 17:43:26 -07:00
parent dcc38db022
commit 670ad8bac9
4 changed files with 58 additions and 106 deletions

View File

@ -28,9 +28,7 @@ import (
"regexp" "regexp"
) )
type Parser func(decoder *xml.Decoder, element *xml.StartElement) error func parseDict(reader io.Reader, container interface{}, transform bool) (map[string]string, error) {
func parseDoc(reader io.Reader, container interface{}, transform bool) (map[string]string, error) {
decoder := xml.NewDecoder(reader) decoder := xml.NewDecoder(reader)
var entities map[string]string var entities map[string]string
@ -62,38 +60,6 @@ func parseDoc(reader io.Reader, container interface{}, transform bool) (map[stri
return entities, nil return entities, nil
} }
// parseDocument streams XML tokens from reader and dispatches them as follows:
//
//   - For each xml.Directive, the entity table is rebuilt with parseEntities
//     and installed on the decoder. When transform is true the table is used
//     as-is; otherwise every entity name is mapped to itself, so references
//     resolve to their own name.
//   - For each xml.StartElement, callback is invoked with the decoder and the
//     element. A non-nil error from callback aborts parsing and is returned.
//
// It returns the entity table produced from the most recent directive (nil if
// no directive was seen). A tokenizer error other than io.EOF is returned to
// the caller instead of being treated as end of input.
func parseDocument(reader io.Reader, transform bool, callback Parser) (map[string]string, error) {
	decoder := xml.NewDecoder(reader)

	var entities map[string]string
	for {
		token, err := decoder.Token()
		if err != nil {
			// io.EOF marks a clean end of the stream; anything else is a
			// read or syntax failure that the caller needs to know about.
			if err == io.EOF {
				break
			}
			return nil, err
		}
		if token == nil {
			break
		}

		switch t := token.(type) {
		case xml.Directive:
			entities = parseEntities(&t)
			if transform {
				decoder.Entity = entities
				continue
			}

			// Identity mapping: each entity reference decodes to its own name.
			passthrough := make(map[string]string, len(entities))
			for name := range entities {
				passthrough[name] = name
			}
			decoder.Entity = passthrough
		case xml.StartElement:
			if err := callback(decoder, &t); err != nil {
				return nil, err
			}
		}
	}

	return entities, nil
}
func parseEntities(d *xml.Directive) map[string]string { func parseEntities(d *xml.Directive) map[string]string {
re := regexp.MustCompile("<!ENTITY\\s([0-9\\-A-z]+)\\s\"(.+)\">") re := regexp.MustCompile("<!ENTITY\\s([0-9\\-A-z]+)\\s\"(.+)\">")
matches := re.FindAllStringSubmatch(string(*d), -1) matches := re.FindAllStringSubmatch(string(*d), -1)

View File

@ -22,15 +22,16 @@
package jmdict package jmdict
import ( import "io"
"encoding/xml"
"io"
)
type Jmdict struct {
// Entries consist of kanji elements, reading elements, // Entries consist of kanji elements, reading elements,
// general information and sense elements. Each entry must have at // general information and sense elements. Each entry must have at
// least one reading element and one sense element. Others are optional. // least one reading element and one sense element. Others are optional.
type EdictEntry struct { Entries []JmdictEntry `xml:"entry"`
}
type JmdictEntry struct {
// A unique numeric sequence number for each entry // A unique numeric sequence number for each entry
Sequence int `xml:"ent_seq"` Sequence int `xml:"ent_seq"`
@ -44,7 +45,7 @@ type EdictEntry struct {
// included, provided they are associated with appropriate information // included, provided they are associated with appropriate information
// fields. Synonyms are not included; they may be indicated in the // fields. Synonyms are not included; they may be indicated in the
// cross-reference field associated with the sense element. // cross-reference field associated with the sense element.
Kanji []EdictKanji `xml:"k_ele"` Kanji []JmdictKanji `xml:"k_ele"`
// The reading element typically contains the valid readings // The reading element typically contains the valid readings
// of the word(s) in the kanji element using modern kanadzukai. // of the word(s) in the kanji element using modern kanadzukai.
@ -52,16 +53,16 @@ type EdictEntry struct {
// alternative readings of the kanji element. In the absence of a // alternative readings of the kanji element. In the absence of a
// kanji element, i.e. in the case of a word or phrase written // kanji element, i.e. in the case of a word or phrase written
// entirely in kana, these elements will define the entry. // entirely in kana, these elements will define the entry.
Readings []EdictReading `xml:"r_ele"` Readings []JmdictReading `xml:"r_ele"`
// The sense element will record the translational equivalent // The sense element will record the translational equivalent
// of the Japanese word, plus other related information. Where there // of the Japanese word, plus other related information. Where there
// are several distinctly different meanings of the word, multiple // are several distinctly different meanings of the word, multiple
// sense elements will be employed. // sense elements will be employed.
Sense []EdictSense `xml:"sense"` Sense []JmdictSense `xml:"sense"`
} }
type EdictKanji struct { type JmdictKanji struct {
// This element will contain a word or short phrase in Japanese // This element will contain a word or short phrase in Japanese
// which is written using at least one non-kana character (usually kanji, // which is written using at least one non-kana character (usually kanji,
// but can be other characters). The valid characters are // but can be other characters). The valid characters are
@ -105,7 +106,7 @@ type EdictKanji struct {
Priorities []string `xml:"ke_pri"` Priorities []string `xml:"ke_pri"`
} }
type EdictReading struct { type JmdictReading struct {
// This element content is restricted to kana and related // This element content is restricted to kana and related
// characters such as chouon and kurikaeshi. Kana usage will be // characters such as chouon and kurikaeshi. Kana usage will be
// consistent between the keb and reb elements; e.g. if the keb // consistent between the keb and reb elements; e.g. if the keb
@ -134,7 +135,7 @@ type EdictReading struct {
Priorities []string `xml:"re_pri"` Priorities []string `xml:"re_pri"`
} }
type EdictSource struct { type JmdictSource struct {
Content string `xml:",chardata"` Content string `xml:",chardata"`
// The xml:lang attribute defines the language(s) from which // The xml:lang attribute defines the language(s) from which
@ -156,7 +157,7 @@ type EdictSource struct {
Wasei string `xml:"ls_wasei,attr"` Wasei string `xml:"ls_wasei,attr"`
} }
type EdictGlossary struct { type JmdictGlossary struct {
Content string `xml:",chardata"` Content string `xml:",chardata"`
// The xml:lang attribute defines the target language of the // The xml:lang attribute defines the target language of the
@ -171,7 +172,7 @@ type EdictGlossary struct {
Gender string `xml:"g_gend"` Gender string `xml:"g_gend"`
} }
type EdictSense struct { type JmdictSense struct {
// These elements, if present, indicate that the sense is restricted // These elements, if present, indicate that the sense is restricted
// to the lexeme represented by the keb and/or reb. // to the lexeme represented by the keb and/or reb.
RestrictedKanji []string `xml:"stagk"` RestrictedKanji []string `xml:"stagk"`
@ -211,7 +212,7 @@ type EdictSense struct {
// language(s) of a loan-word/gairaigo. If the source language is other // language(s) of a loan-word/gairaigo. If the source language is other
// than English, the language is indicated by the xml:lang attribute. // than English, the language is indicated by the xml:lang attribute.
// The element value (if any) is the source word or phrase. // The element value (if any) is the source word or phrase.
SourceLanguages []EdictSource `xml:"lsource"` SourceLanguages []JmdictSource `xml:"lsource"`
// For words specifically associated with regional dialects in // For words specifically associated with regional dialects in
// Japanese, the entity code for that dialect, e.g. ksb for Kansaiben. // Japanese, the entity code for that dialect, e.g. ksb for Kansaiben.
@ -227,25 +228,17 @@ type EdictSense struct {
// target-language words or phrases which are equivalents to the // target-language words or phrases which are equivalents to the
// Japanese word. This element would normally be present, however it // Japanese word. This element would normally be present, however it
// may be omitted in entries which are purely for a cross-reference. // may be omitted in entries which are purely for a cross-reference.
Glossary []EdictGlossary `xml:"gloss"` Glossary []JmdictGlossary `xml:"gloss"`
} }
func LoadEdict(reader io.Reader, transform bool) ([]EdictEntry, map[string]string, error) { func LoadJmdict(reader io.Reader) (Jmdict, map[string]string, error) {
var entries []EdictEntry var dict Jmdict
entities, err := parseDict(reader, &dict, true)
entities, err := parseDocument(reader, transform, func(decoder *xml.Decoder, element *xml.StartElement) error { return dict, entities, err
if element.Name.Local != "entry" {
return nil
} }
var entry EdictEntry func LoadJmdictNoTransform(reader io.Reader) (Jmdict, map[string]string, error) {
if err := decoder.DecodeElement(&entry, element); err != nil { var dict Jmdict
return err entities, err := parseDict(reader, &dict, false)
} return dict, entities, err
entries = append(entries, entry)
return nil
})
return entries, entities, err
} }

View File

@ -22,15 +22,16 @@
package jmdict package jmdict
import ( import "io"
"encoding/xml"
"io"
)
type Jmnedict struct {
// Entries consist of kanji elements, reading elements // Entries consist of kanji elements, reading elements
// name translation elements. Each entry must have at // name translation elements. Each entry must have at
// least one reading element and one sense element. Others are optional. // least one reading element and one sense element. Others are optional.
type EnamdictEntry struct { Entries []JmnedictEntry `xml:"entry"`
}
type JmnedictEntry struct {
// A unique numeric sequence number for each entry // A unique numeric sequence number for each entry
Sequence int `xml:"ent_seq"` Sequence int `xml:"ent_seq"`
@ -44,7 +45,7 @@ type EnamdictEntry struct {
// included, provided they are associated with appropriate information // included, provided they are associated with appropriate information
// fields. Synonyms are not included; they may be indicated in the // fields. Synonyms are not included; they may be indicated in the
// cross-reference field associated with the sense element. // cross-reference field associated with the sense element.
Kanji []EnamdictKanji `xml:"k_ele"` Kanji []JmnedictKanji `xml:"k_ele"`
// The reading element typically contains the valid readings // The reading element typically contains the valid readings
// of the word(s) in the kanji element using modern kanadzukai. // of the word(s) in the kanji element using modern kanadzukai.
@ -52,14 +53,14 @@ type EnamdictEntry struct {
// alternative readings of the kanji element. In the absence of a // alternative readings of the kanji element. In the absence of a
// kanji element, i.e. in the case of a word or phrase written // kanji element, i.e. in the case of a word or phrase written
// entirely in kana, these elements will define the entry. // entirely in kana, these elements will define the entry.
Readings []EnamdictReading `xml:"r_ele"` Readings []JmnedictReading `xml:"r_ele"`
// The trans element will record the translational equivalent // The trans element will record the translational equivalent
// of the Japanese name, plus other related information. // of the Japanese name, plus other related information.
Translations []EnamdictTranslation `xml:"trans"` Translations []JmnedictTranslation `xml:"trans"`
} }
type EnamdictKanji struct { type JmnedictKanji struct {
// This element will contain an entity name in Japanese // This element will contain an entity name in Japanese
// which is written using at least one non-kana character (usually // which is written using at least one non-kana character (usually
// kanji, but can be other characters). The valid // kanji, but can be other characters). The valid
@ -82,7 +83,7 @@ type EnamdictKanji struct {
Priorities []string `xml:"ke_pri"` Priorities []string `xml:"ke_pri"`
} }
type EnamdictReading struct { type JmnedictReading struct {
// This element content is restricted to kana and related // This element content is restricted to kana and related
// characters such as chouon and kurikaeshi. Kana usage will be // characters such as chouon and kurikaeshi. Kana usage will be
// consistent between the keb and reb elements; e.g. if the keb // consistent between the keb and reb elements; e.g. if the keb
@ -104,7 +105,7 @@ type EnamdictReading struct {
Priorities []string `xml:"re_pri"` Priorities []string `xml:"re_pri"`
} }
type EnamdictTranslation struct { type JmnedictTranslation struct {
// The type of name, recorded in the appropriate entity codes. // The type of name, recorded in the appropriate entity codes.
NameTypes []string `xml:"name_type"` NameTypes []string `xml:"name_type"`
@ -129,22 +130,14 @@ type EnamdictTranslation struct {
Language string `xml:"lang,attr"` Language string `xml:"lang,attr"`
} }
func LoadEnamdict(reader io.Reader, transform bool) ([]EnamdictEntry, map[string]string, error) { func LoadJmnedict(reader io.Reader) (Jmnedict, map[string]string, error) {
var entries []EnamdictEntry var dic Jmnedict
entities, err := parseDict(reader, &dic, true)
entities, err := parseDocument(reader, transform, func(decoder *xml.Decoder, element *xml.StartElement) error { return dic, entities, err
if element.Name.Local != "entry" {
return nil
} }
var entry EnamdictEntry func LoadJmnedictNoTransform(reader io.Reader) (Jmnedict, map[string]string, error) {
if err := decoder.DecodeElement(&entry, element); err != nil { var dic Jmnedict
return err entities, err := parseDict(reader, &dic, false)
} return dic, entities, err
entries = append(entries, entry)
return nil
})
return entries, entities, err
} }

View File

@ -315,7 +315,7 @@ type KanjidicMeaning struct {
} }
func LoadKanjidic(reader io.Reader) (Kanjidic, error) { func LoadKanjidic(reader io.Reader) (Kanjidic, error) {
var kanjidic Kanjidic var dic Kanjidic
_, err := parseDoc(reader, &kanjidic, false) _, err := parseDict(reader, &dic, true)
return kanjidic, err return dic, err
} }