From 71c8138456755b6ec5bb26d63c045baccaa38d9b Mon Sep 17 00:00:00 2001
From: Alex Yatskov <alex@foosoft.net>
Date: Sat, 30 Jul 2016 11:01:41 -0700
Subject: [PATCH] External reference to jmdict

---
 edict.go | 256 +------------------------------------------------------
 1 file changed, 3 insertions(+), 253 deletions(-)

diff --git a/edict.go b/edict.go
index 0a55904..3d751b1 100644
--- a/edict.go
+++ b/edict.go
@@ -23,217 +23,14 @@
 package main
 
 import (
-	"encoding/xml"
 	"io"
 	"log"
-	"regexp"
+
+	"github.com/FooSoft/jmdict"
 )
 
-type edictKanji struct {
-	// This element will contain a word or short phrase in Japanese
-	// which is written using at least one non-kana character (usually kanji,
-	// but can be other characters). The valid characters are
-	// kanji, kana, related characters such as chouon and kurikaeshi, and
-	// in exceptional cases, letters from other alphabets.
-	Expression string `xml:"keb"`
-
-	// This is a coded information field related specifically to the
-	// orthography of the keb, and will typically indicate some unusual
-	// aspect, such as okurigana irregularity.
-	Information []string `xml:"ke_inf"`
-
-	// This and the equivalent re_pri field are provided to record
-	// information about the relative priority of the entry,  and consist
-	// of codes indicating the word appears in various references which
-	// can be taken as an indication of the frequency with which the word
-	// is used. This field is intended for use either by applications which
-	// want to concentrate on entries of  a particular priority, or to
-	// generate subset files.
-	// The current values in this field are:
-	// - news1/2: appears in the "wordfreq" file compiled by Alexandre Girardi
-	// from the Mainichi Shimbun. (See the Monash ftp archive for a copy.)
-	// Words in the first 12,000 in that file are marked "news1" and words
-	// in the second 12,000 are marked "news2".
-	// - ichi1/2: appears in the "Ichimango goi bunruishuu", Senmon Kyouiku
-	// Publishing, Tokyo, 1998.  (The entries marked "ichi2" were
-	// demoted from ichi1 because they were observed to have low
-	// frequencies in the WWW and newspapers.)
-	// - spec1 and spec2: a small number of words use this marker when they
-	// are detected as being common, but are not included in other lists.
-	// - gai1/2: common loanwords, based on the wordfreq file.
-	// - nfxx: this is an indicator of frequency-of-use ranking in the
-	// wordfreq file. "xx" is the number of the set of 500 words in which
-	// the entry can be found, with "01" assigned to the first 500, "02"
-	// to the second, and so on. (The entries with news1, ichi1, spec1 and
-	// gai1 values are marked with a "(P)" in the EDICT and EDICT2
-	// files.)
-	// The reason both the kanji and reading elements are tagged is because
-	// on occasions a priority is only associated with a particular
-	// kanji/reading pair.
-	Priority []string `xml:"ke_pri"`
-}
-
-type edictReading struct {
-	// This element content is restricted to kana and related
-	// characters such as chouon and kurikaeshi. Kana usage will be
-	// consistent between the keb and reb elements; e.g. if the keb
-	// contains katakana, so too will the reb.
-	Reading string `xml:"reb"`
-
-	// This element, which will usually have a null value, indicates
-	// that the reb, while associated with the keb, cannot be regarded
-	// as a true reading of the kanji. It is typically used for words
-	// such as foreign place names, gairaigo which can be in kanji or
-	// katakana, etc.
-	NoKanji *string `xml:"re_nokanji"`
-
-	// This element is used to indicate when the reading only applies
-	// to a subset of the keb elements in the entry. In its absence, all
-	// readings apply to all kanji elements. The contents of this element
-	// must exactly match those of one of the keb elements.
-	Restrictions []string `xml:"re_restr"`
-
-	// General coded information pertaining to the specific reading.
-	// Typically it will be used to indicate some unusual aspect of
-	// the reading.
-	Information []string `xml:"re_inf"`
-
-	// See the comment on ke_pri above.
-	Priority []string `xml:"re_pri"`
-}
-
-type edictSource struct {
-	Content string `xml:",chardata"`
-
-	// The xml:lang attribute defines the language(s) from which
-	// a loanword is drawn.  It will be coded using the three-letter language
-	// code from the ISO 639-2 standard. When absent, the value "eng" (i.e.
-	// English) is the default value. The bibliographic (B) codes are used.
-	Language string `xml:"lang,attr"`
-
-	// The ls_type attribute indicates whether the lsource element
-	// fully or partially describes the source word or phrase of the
-	// loanword. If absent, it will have the implied value of "full".
-	// Otherwise it will contain "part".
-	Type string `xml:"ls_type,attr"`
-
-	// The ls_wasei attribute indicates that the Japanese word
-	// has been constructed from words in the source language, and
-	// not from an actual phrase in that language. Most commonly used to
-	// indicate "waseieigo".
-	Wasei string `xml:"ls_wasei,attr"`
-}
-
-type edictGlossary struct {
-	Content string `xml:",chardata"`
-
-	// The xml:lang attribute defines the target language of the
-	// gloss. It will be coded using the three-letter language code from
-	// the ISO 639 standard. When absent, the value "eng" (i.e. English)
-	// is the default value.
-	Language string `xml:"lang,attr"`
-
-	// The g_gend attribute defines the gender of the gloss (typically
-	// a noun in the target language. When absent, the gender is either
-	// not relevant or has yet to be provided.
-	Gender string `xml:"g_gend"`
-}
-
-type edictSense struct {
-	// These elements, if present, indicate that the sense is restricted
-	// to the lexeme represented by the keb and/or reb.
-	RestrictKanji   []string `xml:"stagk"`
-	RestrictReading []string `xml:"stagr"`
-
-	// This element is used to indicate a cross-reference to another
-	// entry with a similar or related meaning or sense. The content of
-	// this element is typically a keb or reb element in another entry. In some
-	// cases a keb will be followed by a reb and/or a sense number to provide
-	// a precise target for the cross-reference. Where this happens, a JIS
-	// "centre-dot" (0x2126) is placed between the components of the
-	// cross-reference.
-	References []string `xml:"xref"`
-
-	// This element is used to indicate another entry which is an
-	// antonym of the current entry/sense. The content of this element
-	// must exactly match that of a keb or reb element in another entry.
-	Antonyms []string `xml:"ant"`
-
-	// Part-of-speech information about the entry/sense. Should use
-	// appropriate entity codes. In general where there are multiple senses
-	// in an entry, the part-of-speech of an earlier sense will apply to
-	// later senses unless there is a new part-of-speech indicated.
-	PartOfSpeech []string `xml:"pos"`
-
-	// Information about the field of application of the entry/sense.
-	// When absent, general application is implied. Entity coding for
-	// specific fields of application.
-	Field []string `xml:"field"`
-
-	// This element is used for other relevant information about
-	// the entry/sense. As with part-of-speech, information will usually
-	// apply to several senses.
-	Misc []string `xml:"misc"`
-
-	// This element records the information about the source
-	// language(s) of a loan-word/gairaigo. If the source language is other
-	// than English, the language is indicated by the xml:lang attribute.
-	// The element value (if any) is the source word or phrase.
-	SourceLanguage []edictSource `xml:"lsource"`
-
-	// For words specifically associated with regional dialects in
-	// Japanese, the entity code for that dialect, e.g. ksb for Kansaiben.
-	Dialect []string `xml:"dial"`
-
-	// The sense-information elements provided for additional
-	// information to be recorded about a sense. Typical usage would
-	// be to indicate such things as level of currency of a sense, the
-	// regional variations, etc.
-	Information []string `xml:"s_inf"`
-
-	// Within each sense will be one or more "glosses", i.e.
-	// target-language words or phrases which are equivalents to the
-	// Japanese word. This element would normally be present, however it
-	// may be omitted in entries which are purely for a cross-reference.
-	Glossary []edictGlossary `xml:"gloss"`
-}
-
-// Entries consist of kanji elements, reading elements,
-// general information and sense elements. Each entry must have at
-// least one reading element and one sense element. Others are optional.
-type edictEntry struct {
-	// A unique numeric sequence number for each entry
-	Sequence int `xml:"ent_seq"`
-
-	// The kanji element, or in its absence, the reading element, is
-	// the defining component of each entry.
-	// The overwhelming majority of entries will have a single kanji
-	// element associated with a word in Japanese. Where there are
-	// multiple kanji elements within an entry, they will be orthographical
-	// variants of the same word, either using variations in okurigana, or
-	// alternative and equivalent kanji. Common "mis-spellings" may be
-	// included, provided they are associated with appropriate information
-	// fields. Synonyms are not included; they may be indicated in the
-	// cross-reference field associated with the sense element.
-	Kanji []edictKanji `xml:"k_ele"`
-
-	// The reading element typically contains the valid readings
-	// of the word(s) in the kanji element using modern kanadzukai.
-	// Where there are multiple reading elements, they will typically be
-	// alternative readings of the kanji element. In the absence of a
-	// kanji element, i.e. in the case of a word or phrase written
-	// entirely in kana, these elements will define the entry.
-	Reading []edictReading `xml:"r_ele"`
-
-	// The sense element will record the translational equivalent
-	// of the Japanese word, plus other related information. Where there
-	// are several distinctly different meanings of the word, multiple
-	// sense elements will be employed.
-	Sense []edictSense `xml:"sense"`
-}
-
 func processEdict(reader io.Reader, writer io.Writer) error {
-	entries, err := loadEdict(reader)
+	entries, err := jmdict.LoadEdict(reader)
 
 	for _, entry := range entries {
 		if len(entry.Reading) > 0 {
@@ -245,50 +42,3 @@ func processEdict(reader io.Reader, writer io.Writer) error {
 
 	return err
 }
-
-func loadEdict(reader io.Reader) ([]edictEntry, error) {
-	var (
-		err     error
-		entries []edictEntry
-	)
-
-	decoder := xml.NewDecoder(reader)
-
-	for {
-		token, _ := decoder.Token()
-		if token == nil {
-			break
-		}
-
-		switch startElement := token.(type) {
-		case xml.Directive:
-			directive := token.(xml.Directive)
-			if decoder.Entity, err = parseEntities(&directive); err != nil {
-				return nil, err
-			}
-		case xml.StartElement:
-			if startElement.Name.Local == "entry" {
-				var entry edictEntry
-				if err := decoder.DecodeElement(&entry, &startElement); err != nil {
-					return nil, err
-				}
-
-				entries = append(entries, entry)
-			}
-		}
-	}
-
-	return entries, nil
-}
-
-func parseEntities(d *xml.Directive) (map[string]string, error) {
-	re := regexp.MustCompile("<!ENTITY\\s([0-9\\-A-z]+)\\s\"(.+)\">")
-	matches := re.FindAllStringSubmatch(string(*d), -1)
-
-	entities := make(map[string]string)
-	for _, match := range matches {
-		entities[match[1]] = match[2]
-	}
-
-	return entities, nil
-}