This commit is contained in:
Alex Yatskov 2016-07-30 17:30:45 -07:00
parent 554c64d773
commit 15dc6933e5
2 changed files with 206 additions and 43 deletions

View File

@ -25,8 +25,43 @@ package jmdict
import (
"encoding/xml"
"io"
"log"
)
// edictEntry is the decoding target for a single <entry> element of the
// dictionary XML; LoadEdict decodes one value of this type per entry.
//
// Entries consist of kanji elements, reading elements,
// general information and sense elements. Each entry must have at
// least one reading element and one sense element. Others are optional.
type edictEntry struct {
// A unique numeric sequence number for each entry (the ent_seq element).
Sequence int `xml:"ent_seq"`
// The kanji element, or in its absence, the reading element, is
// the defining component of each entry.
// The overwhelming majority of entries will have a single kanji
// element associated with a word in Japanese. Where there are
// multiple kanji elements within an entry, they will be orthographical
// variants of the same word, either using variations in okurigana, or
// alternative and equivalent kanji. Common "mis-spellings" may be
// included, provided they are associated with appropriate information
// fields. Synonyms are not included; they may be indicated in the
// cross-reference field associated with the sense element.
Kanji []edictKanji `xml:"k_ele"`
// The reading element typically contains the valid readings
// of the word(s) in the kanji element using modern kanadzukai.
// Where there are multiple reading elements, they will typically be
// alternative readings of the kanji element. In the absence of a
// kanji element, i.e. in the case of a word or phrase written
// entirely in kana, these elements will define the entry.
Reading []edictReading `xml:"r_ele"`
// The sense element will record the translational equivalent
// of the Japanese word, plus other related information. Where there
// are several distinctly different meanings of the word, multiple
// sense elements will be employed.
Sense []edictSense `xml:"sense"`
}
type edictKanji struct {
// This element will contain a word or short phrase in Japanese
// which is written using at least one non-kana character (usually kanji,
@ -196,48 +231,10 @@ type edictSense struct {
Glossary []edictGlossary `xml:"gloss"`
}
// edictEntry is the decoding target for a single <entry> element of the
// dictionary XML; LoadEdict decodes one value of this type per entry.
//
// Entries consist of kanji elements, reading elements,
// general information and sense elements. Each entry must have at
// least one reading element and one sense element. Others are optional.
type edictEntry struct {
// A unique numeric sequence number for each entry (the ent_seq element).
Sequence int `xml:"ent_seq"`
// The kanji element, or in its absence, the reading element, is
// the defining component of each entry.
// The overwhelming majority of entries will have a single kanji
// element associated with a word in Japanese. Where there are
// multiple kanji elements within an entry, they will be orthographical
// variants of the same word, either using variations in okurigana, or
// alternative and equivalent kanji. Common "mis-spellings" may be
// included, provided they are associated with appropriate information
// fields. Synonyms are not included; they may be indicated in the
// cross-reference field associated with the sense element.
Kanji []edictKanji `xml:"k_ele"`
// The reading element typically contains the valid readings
// of the word(s) in the kanji element using modern kanadzukai.
// Where there are multiple reading elements, they will typically be
// alternative readings of the kanji element. In the absence of a
// kanji element, i.e. in the case of a word or phrase written
// entirely in kana, these elements will define the entry.
Reading []edictReading `xml:"r_ele"`
// The sense element will record the translational equivalent
// of the Japanese word, plus other related information. Where there
// are several distinctly different meanings of the word, multiple
// sense elements will be employed.
Sense []edictSense `xml:"sense"`
}
func LoadEdict(reader io.Reader) ([]edictEntry, error) {
var (
err error
entries []edictEntry
)
func LoadEdict(reader io.Reader) ([]edictEntry, map[string]string, error) {
decoder := xml.NewDecoder(reader)
var entries []edictEntry
for {
token, _ := decoder.Token()
if token == nil {
@ -247,14 +244,16 @@ func LoadEdict(reader io.Reader) ([]edictEntry, error) {
switch startElement := token.(type) {
case xml.Directive:
directive := token.(xml.Directive)
var err error
if decoder.Entity, err = parseEntities(&directive); err != nil {
return nil, err
return nil, nil, err
}
log.Print(decoder.Entity)
case xml.StartElement:
if startElement.Name.Local == "entry" {
var entry edictEntry
if err := decoder.DecodeElement(&entry, &startElement); err != nil {
return nil, err
return nil, nil, err
}
entries = append(entries, entry)
@ -262,5 +261,5 @@ func LoadEdict(reader io.Reader) ([]edictEntry, error) {
}
}
return entries, nil
return entries, decoder.Entity, nil
}

164
enamdict.go Normal file
View File

@ -0,0 +1,164 @@
/*
* Copyright (c) 2016 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package jmdict
import (
"encoding/xml"
"io"
"log"
)
// enamdictEntry is the decoding target for a single <entry> element of the
// name dictionary XML; LoadEnamdict decodes one value of this type per entry.
//
// Entries consist of kanji elements, reading elements and
// name translation elements. Each entry must have at
// least one reading element and one translation (trans) element.
// Others are optional.
type enamdictEntry struct {
// A unique numeric sequence number for each entry (the ent_seq element).
Sequence int `xml:"ent_seq"`
// The kanji element, or in its absence, the reading element, is
// the defining component of each entry.
// The overwhelming majority of entries will have a single kanji
// element associated with an entity name in Japanese. Where there are
// multiple kanji elements within an entry, they will be orthographical
// variants of the same word, either using variations in okurigana, or
// alternative and equivalent kanji. Common "mis-spellings" may be
// included, provided they are associated with appropriate information
// fields. Synonyms are not included; they may be indicated in the
// cross-reference (xref) field associated with the trans element.
Kanji []enamdictKanji `xml:"k_ele"`
// The reading element typically contains the valid readings
// of the word(s) in the kanji element using modern kanadzukai.
// Where there are multiple reading elements, they will typically be
// alternative readings of the kanji element. In the absence of a
// kanji element, i.e. in the case of a word or phrase written
// entirely in kana, these elements will define the entry.
Reading []enamdictReading `xml:"r_ele"`
// The trans element will record the translational equivalent
// of the Japanese name, plus other related information.
Translation []enamTranslation `xml:"trans"`
}
// enamdictKanji holds the contents of one k_ele (kanji element) of an
// entry: the written form plus its orthography and priority codes.
type enamdictKanji struct {
// This element will contain an entity name in Japanese
// which is written using at least one non-kana character (usually
// kanji, but can be other characters). The valid
// characters are kanji, kana, related characters such as chouon and
// kurikaeshi, and in exceptional cases, letters from other alphabets.
Expression string `xml:"keb"`
// This is a coded information field related specifically to the
// orthography of the keb, and will typically indicate some unusual
// aspect, such as okurigana irregularity.
Information []string `xml:"ke_inf"`
// This and the equivalent re_pri field are provided to record
// information about the relative priority of the entry, and are for
// use either by applications which want to concentrate on entries of
// a particular priority, or to generate subset files. The reason
// both the kanji and reading elements are tagged is because on
// occasions a priority is only associated with a particular
// kanji/reading pair.
Priority []string `xml:"ke_pri"`
}
// enamdictReading holds the contents of one r_ele (reading element) of an
// entry: the kana reading plus its restriction, information and priority
// codes.
type enamdictReading struct {
// This element content is restricted to kana and related
// characters such as chouon and kurikaeshi. Kana usage will be
// consistent between the keb and reb elements; e.g. if the keb
// contains katakana, so too will the reb.
Reading string `xml:"reb"`
// This element is used to indicate when the reading only applies
// to a subset of the keb elements in the entry. In its absence, all
// readings apply to all kanji elements. The contents of this element
// must exactly match those of one of the keb elements.
Restrictions []string `xml:"re_restr"`
// General coded information pertaining to the specific reading.
// Typically it will be used to indicate some unusual aspect of
// the reading.
Information []string `xml:"re_inf"`
// Relative priority of the reading; see the comment on the ke_pri
// field (enamdictKanji.Priority) for how priority codes are used.
Priority []string `xml:"re_pri"`
}
// enamTranslation holds the contents of one trans element of an entry:
// the name-type codes, cross-references and translated forms of the name.
type enamTranslation struct {
// The type of name, recorded in the appropriate entity codes.
NameType []string `xml:"name_type"`
// This element is used to indicate a cross-reference to another
// entry with a similar or related meaning or sense. The content of
// this element is typically a keb or reb element in another entry. In some
// cases a keb will be followed by a reb and/or a sense number to provide
// a precise target for the cross-reference. Where this happens, a JIS
// "centre-dot" (0x2126) is placed between the components of the
// cross-reference.
References []string `xml:"xref"`
// The actual translations of the name, usually as a transcription
// into the target language.
Translations []string `xml:"trans_det"`
// The xml:lang attribute defines the target language of the
// translated name. It will be coded using the three-letter language
// code from the ISO 639-2 standard. When absent, the value "eng"
// (i.e. English) is the default value. The bibliographic (B) codes
// are used.
// NOTE(review): this tag reads a "lang" attribute from the trans element
// itself. If the source data carries xml:lang on each trans_det child
// instead (as the JMnedict DTD appears to declare), this field will stay
// empty — confirm against real input. The "eng" default described above
// is not applied by this struct; an absent attribute decodes as "".
Language string `xml:"lang,attr"`
}
// LoadEnamdict reads a name-dictionary XML document from reader and decodes
// every <entry> element it encounters into an enamdictEntry.
//
// When a directive token (such as the DOCTYPE) is encountered, it is passed
// to parseEntities and the returned map is installed as the decoder's
// Entity table, which encoding/xml consults when it meets non-standard
// entity references in the elements that follow.
//
// It returns the decoded entries in document order together with the entity
// map that was installed on the decoder (nil if no directive was seen).
// On any failure — a tokenizer error, a parseEntities error, or a failure
// to decode an entry — it returns nil, nil and the error; partial results
// are never returned.
func LoadEnamdict(reader io.Reader) ([]enamdictEntry, map[string]string, error) {
	decoder := xml.NewDecoder(reader)

	var entries []enamdictEntry
	for {
		token, err := decoder.Token()
		if err == io.EOF {
			// Normal end of input.
			break
		}
		if err != nil {
			// Malformed XML or a read failure: report it instead of
			// silently returning whatever was decoded so far.
			return nil, nil, err
		}

		switch tok := token.(type) {
		case xml.Directive:
			entities, err := parseEntities(&tok)
			if err != nil {
				return nil, nil, err
			}
			decoder.Entity = entities
			// Existing diagnostic output, kept unchanged.
			log.Print(decoder.Entity)
		case xml.StartElement:
			if tok.Name.Local != "entry" {
				continue
			}

			var entry enamdictEntry
			if err := decoder.DecodeElement(&entry, &tok); err != nil {
				return nil, nil, err
			}
			entries = append(entries, entry)
		}
	}

	return entries, decoder.Entity, nil
}