/* * Copyright (c) 2016 Alex Yatskov * Author: Alex Yatskov * * Permission is hereby granted, free of charge, to any person obtaining a copy of * this software and associated documentation files (the "Software"), to deal in * the Software without restriction, including without limitation the rights to * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of * the Software, and to permit persons to whom the Software is furnished to do so, * subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package jmdict import ( "encoding/xml" "io" ) // Entries consist of kanji elements, reading elements // name translation elements. Each entry must have at // least one reading element and one sense element. Others are optional. type EnamdictEntry struct { // A unique numeric sequence number for each entry Sequence int `xml:"ent_seq"` // The kanji element, or in its absence, the reading element, is // the defining component of each entry. // The overwhelming majority of entries will have a single kanji // element associated with an entity name in Japanese. Where there are // multiple kanji elements within an entry, they will be orthographical // variants of the same word, either using variations in okurigana, or // alternative and equivalent kanji. Common "mis-spellings" may be // included, provided they are associated with appropriate information // fields. Synonyms are not included; they may be indicated in the // cross-reference field associated with the sense element. Kanji []EnamdictKanji `xml:"k_ele"` // The reading element typically contains the valid readings // of the word(s) in the kanji element using modern kanadzukai. // Where there are multiple reading elements, they will typically be // alternative readings of the kanji element. In the absence of a // kanji element, i.e. in the case of a word or phrase written // entirely in kana, these elements will define the entry. Reading []EnamdictReading `xml:"r_ele"` // The trans element will record the translational equivalent // of the Japanese name, plus other related information. Translation []EnamdictTranslation `xml:"trans"` } type EnamdictKanji struct { // This element will contain an entity name in Japanese // which is written using at least one non-kana character (usually // kanji, but can be other characters). The valid // characters are kanji, kana, related characters such as chouon and // kurikaeshi, and in exceptional cases, letters from other alphabets. Expression string `xml:"keb"` // This is a coded information field related specifically to the // orthography of the keb, and will typically indicate some unusual // aspect, such as okurigana irregularity. Information []string `xml:"ke_inf"` // This and the equivalent re_pri field are provided to record // information about the relative priority of the entry, and are for // use either by applications which want to concentrate on entries of // a particular priority, or to generate subset files. The reason // both the kanji and reading elements are tagged is because on // occasions a priority is only associated with a particular // kanji/reading pair. Priority []string `xml:"ke_pri"` } type EnamdictReading struct { // This element content is restricted to kana and related // characters such as chouon and kurikaeshi. Kana usage will be // consistent between the keb and reb elements; e.g. if the keb // contains katakana, so too will the reb. Reading string `xml:"reb"` // This element is used to indicate when the reading only applies // to a subset of the keb elements in the entry. In its absence, all // readings apply to all kanji elements. The contents of this element // must exactly match those of one of the keb elements. Restrictions []string `xml:"re_restr"` // General coded information pertaining to the specific reading. // Typically it will be used to indicate some unusual aspect of // the reading. Information []string `xml:"re_inf"` // See the comment on ke_pri above. Priority []string `xml:"re_pri"` } type EnamdictTranslation struct { // The type of name, recorded in the appropriate entity codes. NameType []string `xml:"name_type"` // This element is used to indicate a cross-reference to another // entry with a similar or related meaning or sense. The content of // this element is typically a keb or reb element in another entry. In some // cases a keb will be followed by a reb and/or a sense number to provide // a precise target for the cross-reference. Where this happens, a JIS // "centre-dot" (0x2126) is placed between the components of the // cross-reference. References []string `xml:"xref"` // The actual translations of the name, usually as a transcription // into the target language. Translations []string `xml:"trans_det"` // The xml:lang attribute defines the target language of the // translated name. It will be coded using the three-letter language // code from the ISO 639-2 standard. When absent, the value "eng" // (i.e. English) is the default value. The bibliographic (B) codes // are used. Language string `xml:"lang,attr"` } func LoadEnamdict(reader io.Reader, transform bool) ([]EnamdictEntry, map[string]string, error) { var entries []EnamdictEntry entities, err := parseEntries(reader, transform, func(decoder *xml.Decoder, element *xml.StartElement) error { if element.Name.Local != "entry" { return nil } var entry EnamdictEntry if err := decoder.DecodeElement(&entry, element); err != nil { return err } entries = append(entries, entry) return nil }) return entries, entities, err }