diff --git a/epwing.go b/epwing.go index 8ad84b5..ec17aac 100644 --- a/epwing.go +++ b/epwing.go @@ -138,6 +138,7 @@ func epwingExportDb(inputPath, outputDir, title string, stride int, pretty bool) epwingExtractors := map[string]epwingExtractor{ "三省堂 スーパー大辞林": makeDaijirinExtractor(), "大辞泉": makeDaijisenExtractor(), + "明鏡国語辞典": makeMeikyoExtractor(), } var ( diff --git a/meikyo.go b/meikyo.go new file mode 100644 index 0000000..e87f036 --- /dev/null +++ b/meikyo.go @@ -0,0 +1,782 @@ +package main + +import ( + "regexp" + "strings" +) + +type meikyoExtractor struct { + partsExp *regexp.Regexp + expForeignMetaExp *regexp.Regexp + expShapesExp *regexp.Regexp + expBracketedExp *regexp.Regexp + expTermsExp *regexp.Regexp + readGroupExp *regexp.Regexp + metaExp *regexp.Regexp +} + +func makeMeikyoExtractor() epwingExtractor { + var foreignMeta = []string{ + "和製", + "中国", + "朝鮮", + "イタリア", + "スペイン", + "ドイツ", + "フランス", + "オランダ", + "ポルトガル", + "ギリシア", + "アラビア", + "チベット", + "タガログ", + "ヘブライ", + "ヒンディー", + "マレーシア", + "ラテン", + "アフリカーンス", + "ロシア", + "ハワイ", + "マレー", + "スウェーデン", + "ノルウェー・デンマーク", + "フィンランド", + "サンスクリット", + "ポーランド", + } + return &meikyoExtractor{ + partsExp: regexp.MustCompile(`([^(【〖[]+)(?:【(.*)】)?(?:\[(.*)\])?(?:((.*)))?`), + expForeignMetaExp: regexp.MustCompile(strings.Join(foreignMeta, "|")), + expShapesExp: regexp.MustCompile(`[▼▽]+`), + expBracketedExp: regexp.MustCompile(`(?:[〈《])([^〉》]*)(?:[〉》])`), + expTermsExp: regexp.MustCompile(`([^(]*)?(?:((.*)))?`), + metaExp: regexp.MustCompile(`〘([^〙]*)〙`), + readGroupExp: regexp.MustCompile(`[‐・]+`), + } +} + +func (e *meikyoExtractor) extractTerms(entry epwingEntry) []dbTerm { + matches := e.partsExp.FindStringSubmatch(entry.Heading) + if matches == nil { + return nil + } + + var expressions, readings []string + if expression := matches[2]; len(expression) > 0 { + expression = e.expShapesExp.ReplaceAllLiteralString(expression, "") + expression = e.expBracketedExp.ReplaceAllString(expression, "$1") + if termsMatches := e.expTermsExp.FindStringSubmatch(expression); termsMatches != nil { + termsMatches = termsMatches[1:] + for _, terms := range termsMatches { + if len(terms) > 0 { + for _, split := range strings.Split(terms, "・") { + expressions = append(expressions, split) + } + } + } + } + } + + if expression := matches[3]; len(expression) > 0 { + expression = e.expForeignMetaExp.ReplaceAllLiteralString(expression, "") + expression = strings.Replace(expression, "+", " ", -1) + for _, split := range strings.Split(expression, "・") { + expressions = append(expressions, split) + } + } + + if reading := matches[1]; len(reading) > 0 { + reading = e.readGroupExp.ReplaceAllLiteralString(reading, "") + readings = append(readings, reading) + } + + var tags []string + for _, split := range strings.Split(entry.Text, "\n") { + if matches := e.metaExp.FindStringSubmatch(split); matches != nil { + for _, tag := range strings.Split(matches[1], "・") { + tags = append(tags, tag) + } + } + } + + var terms []dbTerm + if len(expressions) == 0 { + for _, reading := range readings { + term := dbTerm{ + Expression: reading, + Glossary: []string{entry.Text}, + } + + e.exportRules(&term, tags) + terms = append(terms, term) + } + + } else { + for _, expression := range expressions { + for _, reading := range readings { + term := dbTerm{ + Expression: expression, + Reading: reading, + Glossary: []string{entry.Text}, + } + + e.exportRules(&term, tags) + terms = append(terms, term) + } + } + } + + return terms +} + +func (e *meikyoExtractor) extractKanji(entry epwingEntry) []dbKanji { + return nil +} + +func (e *meikyoExtractor) exportRules(term *dbTerm, tags []string) { + for _, tag := range tags { + if tag == "名" { + term.addRules("n") + } else if tag == "代" { + term.addRules("pn") + } else if tag == "連体" { + term.addRules("adj-pn") + } else if tag == "副" { + term.addRules("adv") + } else if tag == "副ト" || tag == "副トニ" || tag == "トニ" { + term.addRules("adv-to") + } else if tag == "副助" { + term.addRules("adv") + term.addRules("prt") + } else if tag == "格助" || tag == "終助" { + term.addRules("prt") + } else if tag == "接" { + term.addRules("conj") + } else if tag == "接助" { + term.addRules("conj") + term.addRules("prt") + } else if tag == "接尾" { + term.addRules("suf") + } else if tag == "接頭" { + term.addRules("pref") + } else if tag == "補形" { + term.addRules("aux-adj") + } else if strings.HasPrefix(tag, "助動") || strings.HasPrefix(tag, "補動") { + term.addRules("aux-v") + } else if tag == "形動トタル" { + term.addRules("adj-t") + term.addRules("adv-to") + } else if tag == "形" { + term.addRules("adj-i") + } else if tag == "形動" { + term.addRules("adj-na") + } + + if strings.Contains(tag, "他") { + term.addRules("vt") + } + if strings.Contains(tag, "自") { + term.addRules("vi") + } + if strings.Contains(tag, "一") { + term.addRules("v1") + } else if strings.ContainsAny(tag, "二四五") && tag != "二" && tag != " トニ" { + term.addRules("v5") + } else if strings.Contains(tag, "サ変") && (strings.HasSuffix(term.Expression, "する") || strings.HasSuffix(term.Expression, "為る")) { + term.addRules("vs") + } else if term.Expression == "来る" { + term.addRules("vk") + } + } +} + +func (*meikyoExtractor) getRevision() string { + return "meikyo1" +} + +func (*meikyoExtractor) getFontNarrow() map[int]string { + return map[int]string{ + 41249: " ", + 41250: "¡", + 41251: "¢", + 41252: "£", + 41253: "¤", + 41254: "¥", + 41255: "¦", + 41256: "§", + 41257: "¨", + 41258: "©", + 41259: "ª", + 41260: "«", + 41261: "¬", + 41262: "­", + 41263: "®", + 41264: "¯", + 41265: "°", + 41266: "±", + 41267: "²", + 41268: "³", + 41269: "´", + 41270: "µ", + 41271: "¶", + 41272: "·", + 41273: "¸", + 41274: "¹", + 41275: "º", + 41276: "»", + 41277: "¼", + 41278: "½", + 41279: "¾", + 41280: "¿", + 41281: "À", + 41282: "Á", + 41283: "Â", + 41284: "Ã", + 41285: "Ä", + 41286: "Å", + 41287: "Æ", + 41288: "Ç", + 41289: "È", + 41290: "É", + 41291: "Ê", + 41292: "Ë", + 41293: "Ì", + 41294: "Í", + 41295: "Î", + 41296: "Ï", + 41297: "Ð", + 41298: "Ñ", + 41299: "Ò", + 41300: "Ó", + 41301: "Ô", + 41302: "Õ", + 41303: "Ö", + 41304: "×", + 41305: "Ø", + 41306: "Ù", + 41307: "Ú", + 41308: "Û", + 41309: "Ü", + 41310: "Ý", + 41311: "Þ", + 41312: "ß", + 41313: "à", + 41314: "á", + 41315: "â", + 41316: "ã", + 41317: "ä", + 41318: "å", + 41319: "æ", + 41320: "ç", + 41321: "è", + 41322: "é", + 41323: "ê", + 41324: "ë", + 41325: "ì", + 41326: "í", + 41327: "î", + 41328: "ï", + 41329: "ð", + 41330: "ñ", + 41331: "ò", + 41332: "ó", + 41333: "ô", + 41334: "õ", + 41335: "ö", + 41336: "÷", + 41337: "ø", + 41338: "ù", + 41339: "ú", + 41340: "û", + 41341: "ü", + 41342: "ý", + 41505: "þ", + 41506: "ÿ", + 41507: "Ā", + 41508: "ā", + 41509: "Ă", + 41510: "ă", + 41511: "Ą", + 41512: "ą", + 41513: "Ć", + 41514: "ć", + 41515: "Ĉ", + 41516: "ĉ", + 41517: "Ċ", + 41518: "ċ", + 41519: "Č", + 41520: "č", + 41521: "Ď", + 41522: "ď", + 41523: "Đ", + 41524: "đ", + 41525: "Ē", + 41526: "ē", + 41527: "Ĕ", + 41528: "ĕ", + 41529: "Ė", + 41530: "ė", + 41531: "Ę", + 41532: "ę", + 41533: "Ě", + 41534: "ě", + 41535: "Ĝ", + 41536: "ĝ", + 41537: "Ğ", + 41538: "ğ", + 41539: "Ġ", + 41540: "ġ", + 41541: "Ģ", + 41542: "ģ", + 41543: "Ĥ", + 41544: "ĥ", + 41545: "Ħ", + 41546: "ħ", + 41547: "Ĩ", + 41548: "ĩ", + 41549: "Ī", + 41550: "ī", + 41551: "Ĭ", + 41552: "ĭ", + 41553: "Į", + 41554: "į", + 41555: "İ", + 41556: "ı", + 41557: "IJ", + 41558: "ij", + 41559: "Ĵ", + 41560: "ĵ", + 41561: "Ķ", + 41562: "ķ", + 41563: "ĸ", + 41564: "Ĺ", + 41565: "ĺ", + 41566: "Ļ", + 41567: "ļ", + 41568: "Ľ", + 41569: "ľ", + 41570: "Ŀ", + 41571: "ŀ", + 41572: "Ł", + 41573: "ł", + 41574: "Ń", + 41575: "ń", + 41576: "Ņ", + 41577: "ņ", + 41578: "Ň", + 41579: "ň", + 41580: "ʼn", + 41581: "Ŋ", + 41582: "ŋ", + 41583: "Ō", + 41584: "ō", + 41585: "Ŏ", + 41586: "ŏ", + 41587: "Ő", + 41588: "ő", + 41589: "Œ", + 41590: "œ", + 41591: "Ŕ", + 41592: "ŕ", + 41593: "Ŗ", + 41594: "ŗ", + 41595: "Ř", + 41596: "ř", + 41597: "Ś", + 41598: "ś", + 41761: "Ŝ", + 41762: "ŝ", + 41763: "Ş", + 41764: "ş", + 41765: "Š", + 41766: "š", + 41767: "Ţ", + 41768: "ţ", + 41769: "Ť", + 41770: "ť", + 41771: "Ŧ", + 41772: "ŧ", + 41773: "Ũ", + 41774: "ũ", + 41775: "Ū", + 41776: "ū", + 41777: "Ŭ", + 41778: "ŭ", + 41779: "Ů", + 41780: "ů", + 41781: "Ű", + 41782: "ű", + 41783: "Ų", + 41784: "ų", + 41785: "Ŵ", + 41786: "ŵ", + 41787: "Ŷ", + 41788: "ŷ", + 41789: "Ÿ", + 41790: "Ź", + 41791: "ź", + 41792: "Ż", + 41793: "ż", + 41794: "Ž", + 41795: "ž", + 41796: "ſ", + 41797: "Ǎ", + 41798: "ǎ", + 41799: "Ǐ", + 41800: "ǐ", + 41801: "Ǒ", + 41802: "ǒ", + 41803: "Ǔ", + 41804: "ǔ", + 41805: "ƒ", + 41806: "ˆ", + 41807: "˜", + 41808: "ɔ", + 41809: "ɔ̀", + 41810: "ɔ́", + 41811: "ǝ", + 41812: "ǝ̀", + 41813: "ǝ́", + 41814: "ʌ", + 41815: "ʌ̀", + 41816: "ʌ́", + 41817: "", + 41818: "ɑ", + 41819: "ɑ̀", + 41820: "ɑ́", + 41821: "ʃ", + 41822: "ʊ", + 41823: "θ", + 41824: "ʒ", + 41825: "ɒ", + 41826: "ǽ", + 41827: "ɚ", + 41828: "ɡ", + 41829: "ʤ", + 41830: "ʧ", + 41831: "-", + 41832: ".", + 41833: "¯", + 41834: "℉", + 41835: "Ⅰ", + 41836: "Ⅱ", + 41837: "Ⅲ", + 41838: "Ⅳ", + 41839: "Ⅴ", + 41840: "Ⅹ", + 41841: "↕", + 41842: "■", + 41843: "°", + 41844: "∛", + 41845: "∜", + 41846: "∥", + 41847: "〻", + 41848: "≣", + 41849: "≺", + 41850: "≻", + 41851: "∧", + 41852: "", + 41853: "♠", + 41854: "♣", + 42017: "♥", + 42018: "♦", + 42019: "♩", + 42020: "♮", + 42021: "√", + } +} + +func (*meikyoExtractor) getFontWide() map[int]string { + return map[int]string{ + 45089: "鄧", + 45090: "疒", + 45091: "©", + 45092: "æ", + 45093: "æ̀", + 45094: "ǽ", + 45095: "①", + 45096: "➁", + 45097: "③", + 45098: "④", + 45099: "⑤", + 45100: "⑥", + 45101: "⑦", + 45102: "⑧", + 45103: "⑨", + 45104: "⑩", + 45105: "⑪", + 45106: "⑫", + 45107: "⑬", + 45108: "⑭", + 45109: "⑮", + 45110: "⑯", + 45111: "⑰", + 45112: "⑱", + 45113: "⑲", + 45114: "⑳", + 45115: "⑴", + 45116: "⑵", + 45117: "⑶", + 45118: "〘", + 45119: "〙", + 45120: "\", + 45121: "/", + 45122: "㋐", + 45123: "㋑", + 45124: "㋒", + 45125: "㋓", + 45126: "㋔", + 45127: "㋕", + 45128: "㋖", + 45129: "㋗", + 45130: "㋘", + 45131: "㋙", + 45132: "㋚", + 45133: "㋛", + 45134: "㋜", + 45135: "㋝", + 45136: "㋞", + 45137: "㋟", + 45138: "㋠", + 45139: "㋡", + 45140: "㋢", + 45141: "㋣", + 45142: "丰", + 45143: "仐", + 45144: "你", + 45145: "俏", + 45146: "俠", + 45147: "偓", + 45148: "儈", + 45149: "", + 45150: "厴", + 45151: "呍", + 45152: "啞", + 45153: "嘻", + 45154: "噦", + 45155: "噯", + 45156: "嚙", + 45157: "嚢", + 45158: "埵", + 45159: "塡", + 45160: "增", + 45161: "壔", + 45162: "妤", + 45163: "婟", + 45164: "孒", + 45165: "尩", + 45166: "屢", + 45167: "弴", + 45168: "彽", + 45169: "德", + 45170: "憍", + 45171: "扌", + 45172: "挍", + 45173: "挘", + 45174: "挵", + 45175: "捥", + 45176: "搔", + 45177: "摑", + 45178: "撿", + 45179: "擊", + 45180: "擤", + 45181: "攙", + 45182: "攩", + 45345: "昻", + 45346: "晳", + 45347: "枘", + 45348: "栱", + 45349: "桛", + 45350: "梂", + 45351: "梘", + 45352: "梣", + 45353: "梲", + 45354: "梻", + 45355: "棰", + 45356: "楉", + 45357: "楤", + 45358: "榨", + 45359: "樏", + 45360: "樝", + 45361: "橅", + 45362: "橐", + 45363: "橫", + 45364: "檝", + 45365: "檞", + 45366: "櫧", + 45367: "氵", + 45368: "洄", + 45369: "湑", + 45370: "潑", + 45371: "濹", + 45372: "瀆", + 45373: "瀨", + 45374: "灬", + 45375: "炷", + 45376: "炻", + 45377: "焰", + 45378: "煆", + 45379: "煠", + 45380: "熅", + 45381: "牓", + 45382: "玕", + 45383: "瑇", + 45384: "疒", + 45385: "痀", + 45386: "痎", + 45387: "痹", + 45388: "瘙", + 45389: "瘦", + 45390: "瘭", + 45391: "癤", + 45392: "皂", + 45393: "盬", + 45394: "眴", + 45395: "眶", + 45396: "睺", + 45397: "矠", + 45398: "矻", + 45399: "硨", + 45400: "磲", + 45401: "祆", + 45402: "禱", + 45403: "稭", + 45404: "穇", + 45405: "窠", + 45406: "笧", + 45407: "筕", + 45408: "篊", + 45409: "篖", + 45410: "簎", + 45411: "簶", + 45412: "籡", + 45413: "籹", + 45414: "粑", + 45415: "糈", + 45416: "糗", + 45417: "糝", + 45418: "絇", + 45419: "綠", + 45420: "緖", + 45421: "縕", + 45422: "繇", + 45423: "繡", + 45424: "繫", + 45425: "胳", + 45426: "腭", + 45427: "舢", + 45428: "苆", + 45429: "萁", + 45430: "萊", + 45431: "蒴", + 45432: "蔞", + 45433: "蔣", + 45434: "蔲", + 45435: "蕺", + 45436: "薰", + 45437: "蘞", + 45438: "蘩", + 45601: "虯", + 45602: "蛽", + 45603: "蜱", + 45604: "蜾", + 45605: "蝲", + 45606: "螈", + 45607: "蟎", + 45608: "蟖", + 45609: "蠃", + 45610: "蠆", + 45611: "蠊", + 45612: "蠟", + 45613: "袘", + 45614: "袪", + 45615: "裑", + 45616: "襬", + 45617: "豇", + 45618: "賴", + 45619: "跆", + 45620: "跑", + 45621: "踠", + 45622: "軀", + 45623: "辨", + 45624: "邌", + 45625: "醞", + 45626: "醱", + 45627: "鈸", + 45628: "鎺", + 45629: "雞", + 45630: "韛", + 45631: "頰", + 45632: "顖", + 45633: "顚", + 45634: "顬", + 45635: "飥", + 45636: "餺", + 45637: "駃", + 45638: "騠", + 45639: "驎", + 45640: "骶", + 45641: "魬", + 45642: "魳", + 45643: "鮄", + 45644: "鮧", + 45645: "鮬", + 45646: "鮸", + 45647: "鯁", + 45648: "鯎", + 45649: "鯥", + 45650: "鯧", + 45651: "鰙", + 45652: "鰶", + 45653: "鱁", + 45654: "鱏", + 45655: "鱓", + 45656: "鱝", + 45657: "鱩", + 45658: "鱪", + 45659: "鱮", + 45660: "鱰", + 45661: "鱲", + 45662: "鱵", + 45663: "鵇", + 45664: "鵼", + 45665: "鶀", + 45666: "鷉", + 45667: "鷗", + 45668: "鸊", + 45669: "鹼", + 45670: "麨", + 45671: "麬", + 45672: "麴", + 45673: "黑", + 45674: "鼯", + 45675: "鼹", + 45676: "爛", + 45677: "朗", + 45678: "塚", + 45679: "神", + 45680: "祥", + 45681: "福", + 45682: "﨟", + 45683: "諸", + 45684: "都", + 45685: "-", + 45686: "~", + 45687: "¢", + 45688: "£", + 45689: "〓", + 45690: "〰", + 45691: "㊀", + 45692: "㊁", + 45693: "㊂", + 45694: "㊃", + 45857: "㊙", + 45858: "㋤", + 45859: "懀", + 45860: "杮", + 45861: "〓", + 45862: "", + 45863: "○", + 45864: "", + } +}