Documenting

2016-07-27 09:11:29 -07:00 · 2016-07-27 09:11:29 -07:00 · 74e86bd280
commit 74e86bd280
parent d4dc16f62d
1 changed file with 152 additions and 21 deletions
--- a/edict.go
+++ b/edict.go
@@ -24,39 +24,170 @@ package main

 import "io"

-type edictKanjiElement struct {
-	Expression  string   `xml:"keb"`
+type edictKanji struct {
+	// This element will contain a word or short phrase in Japanese
+	// which is written using at least one non-kana character (usually kanji,
+	// but can be other characters). The valid characters are
+	// kanji, kana, related characters such as chouon and kurikaeshi, and
+	// in exceptional cases, letters from other alphabets.
+	Expression string `xml:"keb"`
+
+	// This is a coded information field related specifically to the
+	// orthography of the keb, and will typically indicate some unusual
+	// aspect, such as okurigana irregularity.
 	Information []string `xml:"ke_inf"`
-	Priority    []string `xml:"ke_pri"`
+
+	// This and the equivalent re_pri field are provided to record
+	// information about the relative priority of the entry, and consist
+	// of codes indicating the word appears in various references which
+	// can be taken as an indication of the frequency with which the word
+	// is used. This field is intended for use either by applications which
+	// want to concentrate on entries of a particular priority, or to
+	// generate subset files.
+	// The current values in this field are:
+	// - news1/2: appears in the "wordfreq" file compiled by Alexandre Girardi
+	// from the Mainichi Shimbun. (See the Monash ftp archive for a copy.)
+	// Words in the first 12,000 in that file are marked "news1" and words
+	// in the second 12,000 are marked "news2".
+	// - ichi1/2: appears in the "Ichimango goi bunruishuu", Senmon Kyouiku
+	// Publishing, Tokyo, 1998.  (The entries marked "ichi2" were
+	// demoted from ichi1 because they were observed to have low
+	// frequencies in the WWW and newspapers.)
+	// - spec1 and spec2: a small number of words use this marker when they
+	// are detected as being common, but are not included in other lists.
+	// - gai1/2: common loanwords, based on the wordfreq file.
+	// - nfxx: this is an indicator of frequency-of-use ranking in the
+	// wordfreq file. "xx" is the number of the set of 500 words in which
+	// the entry can be found, with "01" assigned to the first 500, "02"
+	// to the second, and so on. (The entries with news1, ichi1, spec1 and
+	// gai1 values are marked with a "(P)" in the EDICT and EDICT2
+	// files.)
+	// The reason both the kanji and reading elements are tagged is that
+	// on occasion a priority is only associated with a particular
+	// kanji/reading pair.
+	Priority []string `xml:"ke_pri"`
 }

-type edictReadingElement struct {
-	Reading      string   `xml:"reb"`
-	NoKanji      string   `xml:"re_nokanji"`
+type edictReading struct {
+	// This element content is restricted to kana and related
+	// characters such as chouon and kurikaeshi. Kana usage will be
+	// consistent between the keb and reb elements; e.g. if the keb
+	// contains katakana, so too will the reb.
+	Reading string `xml:"reb"`
+
+	// This element, which will usually have a null value, indicates
+	// that the reb, while associated with the keb, cannot be regarded
+	// as a true reading of the kanji. It is typically used for words
+	// such as foreign place names, gairaigo which can be in kanji or
+	// katakana, etc.
+	NoKanji string `xml:"re_nokanji"`
+
+	// This element is used to indicate when the reading only applies
+	// to a subset of the keb elements in the entry. In its absence, all
+	// readings apply to all kanji elements. The contents of this element
+	// must exactly match those of one of the keb elements.
 	Restrictions []string `xml:"re_restr"`
-	Information  []string `xml:"re_inf"`
-	Priority     []string `xml:"re_pri"`
+
+	// General coded information pertaining to the specific reading.
+	// Typically it will be used to indicate some unusual aspect of
+	// the reading.
+	Information []string `xml:"re_inf"`
+
+	// See the comment on ke_pri above.
+	Priority []string `xml:"re_pri"`
 }

 type edictSense struct {
+	// These elements, if present, indicate that the sense is restricted
+	// to the lexeme represented by the keb and/or reb.
 	RestrictKanji   []string `xml:"stagk"`
 	RestrictReading []string `xml:"stagr"`
-	References      []string `xml:"xref"`
-	Antonyms        []string `xml:"ant"`
-	PartOfSpeech    []string `xml:"pos"`
-	Field           []string `xml:"field"`
-	Misc            []string `xml:"misc"`
-	SourceLanguage  []string `xml:"lsource"`
-	Dialect         []string `xml:"dial"`
-	Information     []string `xml:"s_inf"`
-	Glossary        []string `xml:"gloss"`
+
+	// This element is used to indicate a cross-reference to another
+	// entry with a similar or related meaning or sense. The content of
+	// this element is typically a keb or reb element in another entry. In some
+	// cases a keb will be followed by a reb and/or a sense number to provide
+	// a precise target for the cross-reference. Where this happens, a JIS
+	// "centre-dot" (JIS X 0208 0x2126, i.e. Unicode U+30FB) is placed
+	// between the components of the cross-reference.
+	References []string `xml:"xref"`
+
+	// This element is used to indicate another entry which is an
+	// antonym of the current entry/sense. The content of this element
+	// must exactly match that of a keb or reb element in another entry.
+	Antonyms []string `xml:"ant"`
+
+	// Part-of-speech information about the entry/sense. Should use
+	// appropriate entity codes. In general where there are multiple senses
+	// in an entry, the part-of-speech of an earlier sense will apply to
+	// later senses unless there is a new part-of-speech indicated.
+	PartOfSpeech []string `xml:"pos"`
+
+	// Information about the field of application of the entry/sense.
+	// When absent, general application is implied. Entity codes are used
+	// for specific fields of application.
+	Field []string `xml:"field"`
+
+	// This element is used for other relevant information about
+	// the entry/sense. As with part-of-speech, information will usually
+	// apply to several senses.
+	Misc []string `xml:"misc"`
+
+	// This element records the information about the source
+	// language(s) of a loan-word/gairaigo. If the source language is other
+	// than English, the language is indicated by the xml:lang attribute.
+	// The element value (if any) is the source word or phrase.
+	SourceLanguage []string `xml:"lsource"`
+
+	// For words specifically associated with regional dialects in
+	// Japanese, the entity code for that dialect, e.g. ksb for Kansaiben.
+	Dialect []string `xml:"dial"`
+
+	// The sense-information elements are provided so that additional
+	// information can be recorded about a sense. Typical usage would
+	// be to indicate such things as level of currency of a sense, the
+	// regional variations, etc.
+	Information []string `xml:"s_inf"`
+
+	// Within each sense will be one or more "glosses", i.e.
+	// target-language words or phrases which are equivalents to the
+	// Japanese word. This element would normally be present, however it
+	// may be omitted in entries which are purely for a cross-reference.
+	Glossary []string `xml:"gloss"`
 }

+// Entries consist of kanji elements, reading elements,
+// general information and sense elements. Each entry must have at
+// least one reading element and one sense element. Others are optional.
 type edictEntry struct {
-	Sequence int                   `xml:"ent_seq"`
-	Kanji    []edictKanjiElement   `xml:"k_ele"`
-	Reading  []edictReadingElement `xml:"r_ele"`
-	Sense    []edictSense          `xml:"sense"`
+	// A unique numeric sequence number for each entry.
+	Sequence int `xml:"ent_seq"`
+
+	// The kanji element, or in its absence, the reading element, is
+	// the defining component of each entry.
+	// The overwhelming majority of entries will have a single kanji
+	// element associated with a word in Japanese. Where there are
+	// multiple kanji elements within an entry, they will be orthographical
+	// variants of the same word, either using variations in okurigana, or
+	// alternative and equivalent kanji. Common "mis-spellings" may be
+	// included, provided they are associated with appropriate information
+	// fields. Synonyms are not included; they may be indicated in the
+	// cross-reference field associated with the sense element.
+	Kanji []edictKanji `xml:"k_ele"`
+
+	// The reading element typically contains the valid readings
+	// of the word(s) in the kanji element using modern kanadzukai.
+	// Where there are multiple reading elements, they will typically be
+	// alternative readings of the kanji element. In the absence of a
+	// kanji element, i.e. in the case of a word or phrase written
+	// entirely in kana, these elements will define the entry.
+	Reading []edictReading `xml:"r_ele"`
+
+	// The sense element will record the translational equivalent
+	// of the Japanese word, plus other related information. Where there
+	// are several distinctly different meanings of the word, multiple
+	// sense elements will be employed.
+	Sense []edictSense `xml:"sense"`
 }

 func processEdict(reader io.Reader, writer io.Writer) error {