#!/usr/bin/env python

#
# Copyright (C) 2011  Alex Yatskov
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import codecs
import optparse
import os
import re
import sqlite3
import sys


GRAMMAR_TAGS = {
    'adj',      # former adjective classification (being removed)
    'adj-f',    # noun or verb acting prenominally (other than the above)
    'adj-i',    # adjective (keiyoushi)
    'adj-na',   # adjectival nouns or quasi-adjectives (keiyodoshi)
    'adj-no',   # nouns which may take the genitive case particle `no'
    'adj-pn',   # pre-noun adjectival (rentaishi)
    'adj-t',    # `taru' adjective
    'adv',      # adverb (fukushi)
    'adv-n',    # adverbial noun
    'adv-to',   # adverb taking the `to' particle
    'aux',      # auxiliary
    'aux-adj',  # auxiliary adjective
    'aux-v',    # auxiliary verb
    'conj',     # conjunction
    'ctr',      # counter
    'exp',      # Expressions (phrases, clauses, etc.)
    'int',      # interjection (kandoushi)
    'iv',       # irregular verb
    'n',        # noun (common) (futsuumeishi)
    'n-adv',    # adverbial noun (fukushitekimeishi)
    'n-pref',   # noun, used as a prefix
    'n-suf',    # noun, used as a suffix
    'n-t',      # noun (temporal) (jisoumeishi)
    'num',      # numeric
    'pn',       # pronoun
    'pref' ,    # prefix
    'prt',      # particle
    'suf',      # suffix
    'v1',       # Ichidan verb
    'v2a-s',    # Nidan verb with 'u' ending (archaic)
    'v4h',      # Yodan verb with `hu/fu' ending (archaic)
    'v4r',      # Yodan verb with `ru' ending (archaic)
    'v5',       # Godan verb (not completely classified)
    'v5aru',    # Godan verb - -aru special class
    'v5b',      # Godan verb with `bu' ending
    'v5g',      # Godan verb with `gu' ending
    'v5k',      # Godan verb with `ku' ending
    'v5k-s',    # Godan verb - iku/yuku special class
    'v5m',      # Godan verb with `mu' ending
    'v5n',      # Godan verb with `nu' ending
    'v5r',      # Godan verb with `ru' ending
    'v5r-i',    # Godan verb with `ru' ending (irregular verb)
    'v5s',      # Godan verb with `su' ending
    'v5t',      # Godan verb with `tsu' ending
    'v5u',      # Godan verb with `u' ending
    'v5u-s',    # Godan verb with `u' ending (special class)
    'v5uru',    # Godan verb - uru old class verb (old form of Eru)
    'v5z',      # Godan verb with `zu' ending
    'vi',       # intransitive verb
    'vk',       # kuru verb - special class
    'vn',       # irregular nu verb
    'vs',       # noun or participle which takes the aux. verb suru
    'vs-c',     # su verb - precursor to the modern suru
    'vs-i',     # suru verb - irregular
    'vs-s',     # suru verb - special class
    'vt',       # transitive verb
    'vz',       # Ichidan verb - zuru verb - (alternative form of -jiru verbs)
}


def isHiragana(c):
    return 0x3040 <= ord(c) < 0x30a0


def isKatakana(c):
    return 0x30a0 <= ord(c) < 0x3100


def loadDefinitions(path):
    print 'Parsing "{0}"...'.format(path)
    with codecs.open(path, encoding='euc-jp') as fp:
        return filter(lambda x: x and x[0] != '#', fp.read().splitlines())


def parseKanjiDic(path):
    results = list()

    for line in loadDefinitions(path):
        segments = line.split()
        character = segments[0]
        kunYomi = ', '.join(filter(lambda x: filter(isHiragana, x), segments[1:]))
        onYomi = ', '.join(filter(lambda x: filter(isKatakana, x), segments[1:]))
        meanings = '; '.join(re.findall('\{([^\}]+)\}', line))
        results.append((character, onYomi, kunYomi, meanings))

    return results


def writeKanjiDic(cursor, values):
    cursor.execute('DROP TABLE IF EXISTS Kanji')
    cursor.execute('CREATE TABLE Kanji(character TEXT, kunYomi TEXT, onYomi TEXT, meanings TEXT)')
    cursor.executemany('INSERT INTO Kanji VALUES(?, ?, ?, ?)', values)


def parseKradFile(path):
    results = list()

    for line in loadDefinitions(path):
        segments = line.split(' ')
        character = segments[0]
        radicals = ', '.join(segments[2:])
        results.append((character, radicals))

    return results


def writeKradFile(cursor, values):
    cursor.execute('DROP TABLE IF EXISTS Radicals')
    cursor.execute('CREATE TABLE Radicals(character TEXT, radicals TEXT)')
    cursor.executemany('INSERT INTO Radicals VALUES(?, ?)', values)


def parseEdict(path):
    results = list()

    for line in loadDefinitions(path):
        segments = line.split('/')

        expression = segments[0].split(' ')
        term = expression[0]
        match = re.search('\[([^\]]+)\]', expression[1])
        reading = None if match is None else match.group(1)

        definitions = filter(lambda x: len(x) > 0, segments[1:])
        definitions = '; '.join(definitions)
        definitions = re.sub('\(\d+\)\s*', str(), definitions)

        tags = list()
        for group in re.findall('\(([^\)\]]+)\)', definitions):
            tags.extend(group.split(','))

        tags = set(tags).intersection(GRAMMAR_TAGS)
        tags = ', '.join(sorted(tags))

        results.append((term, reading, definitions, tags))

    return results


def writeEdict(cursor, values):
    cursor.execute('DROP TABLE IF EXISTS Terms')
    cursor.execute('CREATE TABLE Terms(expression TEXT, reading TEXT, definitions TEXT, tags TEXT)')
    cursor.executemany('INSERT INTO Terms VALUES(?, ?, ?, ?)', values)


def build(path, kanjidic, kradfile, edict):
    with sqlite3.connect(path) as db:
        cursor = db.cursor()

        if kanjidic is not None:
            writeKanjiDic(cursor, parseKanjiDic(kanjidic))

        if kradfile is not None:
            writeKradFile(cursor, parseKradFile(kradfile))

        if edict is not None:
            writeEdict(cursor, parseEdict(edict))


def main():
    parser = optparse.OptionParser()
    parser.add_option('--kanjidic', dest='kanjidic')
    parser.add_option('--kradfile', dest='kradfile')
    parser.add_option('--edict', dest='edict')

    options, args = parser.parse_args()

    if len(args) == 0:
        parser.print_help()
    else:
        build(args[0], options.kanjidic, options.kradfile, options.edict)


if __name__ == '__main__':
    main()