/* vim:set ts=8 sts=4 sw=4 tw=0: */
/*
 * migemo.c -
 *
 * Written By:  MURAOKA Taro <koron@tka.att.ne.jp>
 * Last Change: 19-Jun-2004.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "wordbuf.h"
#include "wordlist.h"
#include "mnode.h"
#include "rxgen.h"
#include "romaji.h"
#include "filename.h"
#include "charset.h"
#include "migemo.h"

/*
 * Default file names.  migemo_open() looks for the four *.dat conversion
 * tables in the directory of the dictionary it is given.  DICT_MIGEMO is not
 * referenced by the code shown in this file.
 */
#define DICT_MIGEMO "migemo-dict"
#define DICT_ROMA2HIRA "roma2hira.dat"
#define DICT_HIRA2KATA "hira2kata.dat"
#define DICT_HAN2ZEN "han2zen.dat"
#define DICT_ZEN2HAN "zen2han.dat"
/* Letters that add_dubious_roma() treats as vowels. */
#define VOWELS_STRING "aiueo"
/* Not referenced in the code shown here; presumably a read-buffer size used
 * for charset detection -- TODO confirm. */
#define BUFLEN_DETECT_CHARSET 4096

/* EXPORTS decorates the public functions; it expands to dllexport only when
 * building with Borland C and to nothing otherwise. */
#ifdef __BORLANDC__
# define EXPORTS __declspec(dllexport)
#else
# define EXPORTS
#endif

/* Callback that receives one candidate word.  Every call made in this file
 * passes the migemo object itself as "data". */
typedef int (*MIGEMO_PROC_ADDWORD)(void* data, unsigned char* word);

/* State of one migemo object (created by migemo_open()). */
struct _migemo
{
    int enable;			/* 1 once a migemo dictionary has been loaded */
    mtree_p mtree;		/* word tree holding the migemo dictionary */
    int charset;		/* CHARSET_NONE until detected from a dictionary file */
    romaji* roma2hira;		/* conversion table, loaded from roma2hira.dat */
    romaji* hira2kata;		/* conversion table, loaded from hira2kata.dat */
    romaji* han2zen;		/* conversion table, loaded from han2zen.dat */
    romaji* zen2han;		/* conversion table, loaded from zen2han.dat */
    rxgen* rx;			/* regular-expression generator */
    MIGEMO_PROC_ADDWORD addword; /* sink for candidate words; set by migemo_query() */
    CHARSET_PROC_CHAR2INT char2int; /* character decoder of the detected charset, or NULL */
};

    /*
     * Open dict_file in text mode and feed it to mnode_load() together with
     * the given tree.  Returns whatever mnode_load() returns, or NULL when the
     * file cannot be opened.
     */
    static mtree_p
load_mtree_dictionary(mtree_p mtree, const char* dict_file)
{
    mtree_p loaded = NULL;
    FILE *stream = fopen(dict_file, "rt");

    if (stream != NULL)
    {
	loaded = mnode_load(mtree, stream);
	fclose(stream);
    }
    return loaded;
}

    /*
     * Load dict_file into the object's word tree.  When the object has no
     * charset yet, the charset is detected from the file first and the matching
     * char<->int converters are installed into the regexp generator.
     */
    static mtree_p
load_mtree_dictionary2(migemo* obj, const char* dict_file)
{
    int undetected = (obj->charset == CHARSET_NONE);

    if (undetected)
    {
	CHARSET_PROC_CHAR2INT decoder = NULL;
	CHARSET_PROC_INT2CHAR encoder = NULL;

	/* Pick the conversion procedures that belong to the file's charset. */
	obj->charset = charset_detect_file(dict_file);
	charset_getproc(obj->charset, &decoder, &encoder);

	if (decoder != NULL)
	{
	    migemo_setproc_char2int(obj, (MIGEMO_PROC_CHAR2INT)decoder);
	    /* Kept on the object as well; parse_query() uses it. */
	    obj->char2int = decoder;
	}
	if (encoder != NULL)
	    migemo_setproc_int2char(obj, (MIGEMO_PROC_INT2CHAR)encoder);
    }

    return load_mtree_dictionary(obj->mtree, dict_file);
}

    /*
     * Write "<dir>/<file>" into buf.  No bounds checking is done: buf must
     * have room for strlen(dir) + strlen(file) + 2 bytes.
     */
    static void
dircat(char* buf, const char* dir, const char* file)
{
    size_t dir_len = strlen(dir);
    size_t file_len = strlen(file);

    memcpy(buf, dir, dir_len);
    buf[dir_len] = '/';
    /* "+ 1" carries the terminating NUL of file along. */
    memcpy(buf + dir_len + 1, file, file_len + 1);
}

/*
 * migemo interfaces
 */

/**
 * Load an additional dictionary or data file into a Migemo object.
 * dict_file names the file to read and dict_id tells what kind of data it
 * holds; it must be one of:
 *
 *  <dl>
 *  <dt>MIGEMO_DICTID_MIGEMO</dt>
 *	<dd>migemo-dict dictionary</dd>
 *  <dt>MIGEMO_DICTID_ROMA2HIRA</dt>
 *	<dd>table stored in the object's roma2hira slot</dd>
 *  <dt>MIGEMO_DICTID_HIRA2KATA</dt>
 *	<dd>table stored in the object's hira2kata slot</dd>
 *  <dt>MIGEMO_DICTID_HAN2ZEN</dt>
 *	<dd>table stored in the object's han2zen slot</dd>
 *  <dt>MIGEMO_DICTID_ZEN2HAN</dt>
 *	<dd>table stored in the object's zen2han slot</dd>
 *  </dl>
 *
 *  The return value is the id of the data actually loaded (dict_id), or the
 *  following value when loading failed, an argument was NULL, or dict_id is
 *  not one of the ids above:
 *
 *  <dl><dt>MIGEMO_DICTID_INVALID</dt></dl>
 * @param obj Migemo object
 * @param dict_id kind of the file
 * @param dict_file path of the file
 * @returns dict_id on success, MIGEMO_DICTID_INVALID on failure
 */
    EXPORTS
    int
migemo_load(migemo* obj, int dict_id, const char* dict_file)
{
    romaji *dict;

    /* Both pointers are dereferenced below, so either one being NULL is an
     * error (the test used to be "!obj && dict_file", which let NULL
     * through). */
    if (!obj || !dict_file)
	return MIGEMO_DICTID_INVALID;

    if (dict_id == MIGEMO_DICTID_MIGEMO)
    {
	/* Load the migemo dictionary into the word tree. */
	mtree_p mtree = load_mtree_dictionary2(obj, dict_file);

	if (mtree == NULL)
	    return MIGEMO_DICTID_INVALID;
	obj->mtree = mtree;
	obj->enable = 1;
	return dict_id;			/* Loaded successfully */
    }

    /* Every other id selects one of the conversion tables. */
    switch (dict_id)
    {
	case MIGEMO_DICTID_ROMA2HIRA:
	    dict = obj->roma2hira;
	    break;
	case MIGEMO_DICTID_HIRA2KATA:
	    dict = obj->hira2kata;
	    break;
	case MIGEMO_DICTID_HAN2ZEN:
	    dict = obj->han2zen;
	    break;
	case MIGEMO_DICTID_ZEN2HAN:
	    dict = obj->zen2han;
	    break;
	default:
	    dict = NULL;		/* Unknown id */
	    break;
    }

    /* romaji_load() reports success with 0. */
    if (dict && romaji_load(dict, dict_file) == 0)
	return dict_id;
    return MIGEMO_DICTID_INVALID;
}

/**
 * Create a Migemo object.  Returns the new object, or NULL when it could not
 * be created.  When dict is not NULL it is loaded as the migemo-dict
 * dictionary, and if that succeeds the following files are looked up in the
 * same directory:
 *
 *  <dl>
 *  <dt>roma2hira.dat</dt>
 *  <dt>hira2kata.dat</dt>
 *  <dt>han2zen.dat</dt>
 *  <dt>zen2han.dat</dt>
 *  </dl>
 *
 * The result of loading these four files is not checked, so a missing file is
 * not an error.  They are skipped altogether when dict is too long for the
 * internal path buffers.  When dict is NULL no file is read at all; files can
 * be added afterwards with migemo_load().
 * @param dict path of migemo-dict, or NULL to load nothing
 * @returns the created Migemo object, or NULL
 */
    EXPORTS
    migemo*
migemo_open(const char* dict)
{
    migemo *obj;

    /* Build the migemo object and each of its members. */
    if (!(obj = (migemo*)calloc(1, sizeof(migemo))))
	return obj;
    obj->enable = 0;
    obj->mtree = mnode_open(NULL);
    obj->charset = CHARSET_NONE;
    obj->rx = rxgen_open();
    obj->roma2hira =	romaji_open();
    obj->hira2kata =	romaji_open();
    obj->han2zen =	romaji_open();
    obj->zen2han =	romaji_open();
    if (!obj->rx || !obj->roma2hira || !obj->hira2kata || !obj->han2zen
	    || !obj->zen2han)
    {
	migemo_close(obj);
	return obj = NULL;
    }

    /* A dictionary was given: load it, then the tables that sit next to it. */
    if (dict)
    {
#ifndef _MAX_PATH
# define _MAX_PATH 1024 /* arbitrary fallback */
#endif
	char dir[_MAX_PATH];
	char roma_dict[_MAX_PATH];
	char kata_dict[_MAX_PATH];
	char h2z_dict[_MAX_PATH];
	char z2h_dict[_MAX_PATH];
	const char *tmp;
	mtree_p mtree;
	size_t longest = sizeof(DICT_ROMA2HIRA);	/* includes the NUL */
	int have_paths = 0;

	if (sizeof(DICT_HIRA2KATA) > longest)
	    longest = sizeof(DICT_HIRA2KATA);
	if (sizeof(DICT_HAN2ZEN) > longest)
	    longest = sizeof(DICT_HAN2ZEN);
	if (sizeof(DICT_ZEN2HAN) > longest)
	    longest = sizeof(DICT_ZEN2HAN);

	/*
	 * The copies below are unbounded, so make sure "<dir>/<file>" fits
	 * before touching the buffers.  The directory part is taken to be no
	 * longer than dict itself (or "." when empty) -- NOTE(review): confirm
	 * that filename_directory() never writes more than strlen(dict) + 1
	 * bytes.
	 */
	if (strlen(dict) + 2 + longest <= sizeof(dir))
	{
	    filename_directory(dir, dict);
	    tmp = strlen(dir) ? dir : ".";
	    dircat(roma_dict, tmp, DICT_ROMA2HIRA);
	    dircat(kata_dict, tmp, DICT_HIRA2KATA);
	    dircat(h2z_dict,  tmp, DICT_HAN2ZEN);
	    dircat(z2h_dict,  tmp, DICT_ZEN2HAN);
	    have_paths = 1;
	}

	mtree = load_mtree_dictionary2(obj, dict);
	if (mtree)
	{
	    obj->mtree = mtree;
	    obj->enable = 1;
	    if (have_paths)
	    {
		/* Best effort: results are deliberately not checked. */
		romaji_load(obj->roma2hira, roma_dict);
		romaji_load(obj->hira2kata, kata_dict);
		romaji_load(obj->han2zen, h2z_dict);
		romaji_load(obj->zen2han, z2h_dict);
	    }
	}
    }
    return obj;
}

/**
 * Destroy a Migemo object and release the resources it holds.  Passing NULL
 * is allowed and does nothing.
 * @param obj Migemo object to destroy
 */
    EXPORTS
    void
migemo_close(migemo* obj)
{
    if (obj == NULL)
	return;

    /* Members may be NULL when migemo_open() failed half way. */
    if (obj->zen2han != NULL)
	romaji_close(obj->zen2han);
    if (obj->han2zen != NULL)
	romaji_close(obj->han2zen);
    if (obj->hira2kata != NULL)
	romaji_close(obj->hira2kata);
    if (obj->roma2hira != NULL)
	romaji_close(obj->roma2hira);
    if (obj->rx != NULL)
	rxgen_close(obj->rx);
    if (obj->mtree != NULL)
	mnode_close(obj->mtree);

    free(obj);
}

/*
 * query version 2
 */

/*
 * mnode_traverse() callback: hand every word in the node's word list to the
 * object's addword procedure.  "data" is the migemo object.
 */
    static void
migemo_query_proc(mnode* p, void* data)
{
    migemo *owner = (migemo*)data;
    wordlist_p item = p->list;

    while (item != NULL)
    {
	owner->addword(owner, item->ptr);
	item = item->next;
    }
}

/*
 * Look query up in the word tree and, when a node is found, traverse from it
 * with migemo_query_proc() so that the words found are passed to addword.
 */
    static void
add_mnode_query(migemo* object, unsigned char* query)
{
    mnode *hit = mnode_query(object->mtree, query);

    if (hit == NULL)
	return;
    mnode_traverse(hit, migemo_query_proc, object);
}

    /*
     * Convert query with the roma2hira table.  When the conversion leaves
     * nothing over (stop is NULL) the result, its hira2kata conversion and the
     * zen2han conversion of the latter are added as candidates, and the
     * dictionary is searched with the first two.
     *
     * Returns 1 when romaji_convert() reported a stop position (the query
     * was not converted completely), 0 otherwise.  A NULL conversion result
     * is skipped rather than passed on.
     */
    static int
add_roma(migemo* object, unsigned char* query)
{
    unsigned char *stop = NULL;	/* defined even if the callee leaves it alone */
    unsigned char *hira, *kata, *han;

    hira = romaji_convert(object->roma2hira, query, &stop);
    if (!hira)
	return stop ? 1 : 0;

    if (!stop)
    {
	object->addword(object, hira);
	/* Dictionary lookup with the converted string. */
	add_mnode_query(object, hira);

	kata = romaji_convert2(object->hira2kata, hira, NULL, 0);
	if (kata)
	{
	    object->addword(object, kata);

	    /* Also offer the zen2han form of the hira2kata result. */
	    han = romaji_convert2(object->zen2han, kata, NULL, 0);
	    if (han)
	    {
		object->addword(object, han);
		romaji_release(object->zen2han, han);
	    }

	    /* Dictionary lookup with the hira2kata result. */
	    add_mnode_query(object, kata);
	    romaji_release(object->hira2kata, kata);
	}
    }
    romaji_release(object->roma2hira, hira);

    return stop ? 1 : 0;
}

/*
 * Retry an incomplete romaji conversion: when query does not end in one of
 * [aiueo], append each vowel in turn and convert again.  In addition, when
 * the query is a single character or its second to last character is a
 * vowel, replace the last character by "xn" (if it is 'n') or by "xtu"
 * (otherwise) and convert once more.
 *
 * The rx parameter is not used.
 */
    static void
add_dubious_roma(migemo* object, rxgen* rx, unsigned char* query)
{
    static const char candidate[] = VOWELS_STRING;
    const char *ptr;
    unsigned char *buf;
    size_t len;

    (void)rx;

    len = strlen((const char*)query);
    if (len == 0)
	return;
    /* Room for the NUL plus the longest extension ("xtu"). */
    buf = malloc(len + 1 + 3);
    if (!buf)
	return;
    memcpy(buf, query, len + 1);
    buf[len + 1] = '\0';	/* terminator for "query + one vowel" */

    if (!strchr(candidate, buf[len - 1]))
    {
	/* Supply each of [aiueo] in turn. */
	for (ptr = candidate; *ptr; ++ptr)
	{
	    buf[len] = (unsigned char)*ptr;
	    add_roma(object, buf);
	}
	/* Single character, or the one before the last is a vowel. */
	if (len < 2 || strchr(candidate, buf[len - 2]))
	{
	    /* Overwrites the last character and the vowel left by the loop. */
	    strcpy((char*)&buf[len - 1], buf[len - 1] == 'n' ? "xn" : "xtu");
	    add_roma(object, buf);
	}
    }

    free(buf);
}

/*
 * Byte length of the character that starts at p, which must not point at the
 * terminating NUL.  Falls back to 1 when no char2int procedure is set or it
 * reports less than 1, and never reaches beyond the terminating NUL.
 */
    static int
query_charlen(migemo* object, const unsigned char* p)
{
    int len, i;

    if (!object->char2int || (len = object->char2int(p, NULL)) < 1)
	return 1;
    for (i = 1; i < len; ++i)
	if (p[i] == '\0')
	    return i;		/* truncated multi-byte character */
    return len;
}

/*
 * Split query into segments and return them as a word list.
 *
 * A segment normally runs up to (not including) the next single-byte
 * uppercase letter.  A segment whose first two bytes are both single-byte
 * uppercase letters instead runs up to the next single-byte character that
 * is not uppercase.  Multi-byte characters never end a segment.
 *
 * Returns NULL for an empty query or when a list node cannot be allocated;
 * migemo_query() treats a NULL list as an error.
 */
    static wordlist_p
parse_query(migemo* object, const unsigned char* query)
{
    const unsigned char *curr = query;
    wordlist_p querylist = NULL, *pp = &querylist;

    /* Testing the terminator first keeps an empty query from being stepped
     * over. */
    while (*curr != '\0')
    {
	const unsigned char *start = curr;
	int len, upper;

	len = query_charlen(object, curr);
	/* curr[1] is only read when *curr is a letter, hence not the NUL. */
	upper = (len == 1 && isupper(*curr) && isupper(curr[1]));
	curr += len;
	while (*curr != '\0')
	{
	    len = query_charlen(object, curr);
	    if (len == 1 && (isupper(*curr) != 0) != upper)
		break;
	    curr += len;
	}

	/* Register the segment [start, curr). */
	*pp = wordlist_open_len(start, curr - start);
	if (*pp == NULL)
	{
	    /* Allocation failed: drop what was collected so far. */
	    if (querylist)
		wordlist_close(querylist);
	    return NULL;
	}
	pp = &(*pp)->next;
    }
    return querylist;
}

/*
 * Turn one segment into candidates: the segment itself, its dictionary
 * entries, its han2zen and zen2han conversions, and everything add_roma()
 * and (for an incomplete conversion) add_dubious_roma() produce.  Arguments
 * are not validated.  Always returns 1.
 */
    static int
query_a_word(migemo* object, unsigned char* query)
{
    unsigned char *converted;

    /* The query as typed is always a candidate ... */
    object->addword(object, query);
    /* ... and so is whatever the dictionary has for it. */
    add_mnode_query(object, query);

    /* han2zen form of the query. */
    converted = romaji_convert2(object->han2zen, query, NULL, 0);
    if (converted)
    {
	object->addword(object, converted);
	romaji_release(object->han2zen, converted);
    }

    /* zen2han form of the query. */
    converted = romaji_convert2(object->zen2han, query, NULL, 0);
    if (converted)
    {
	object->addword(object, converted);
	romaji_release(object->zen2han, converted);
    }

    /* Romaji conversions; retry with guesses when something was left over. */
    if (add_roma(object, query) != 0)
	add_dubious_roma(object, object->rx, query);

    return 1;
}

    /*
     * addword procedure used by migemo_query(): pass the word on to the
     * object's regexp generator and return the result of rxgen_add().
     */
    static int
addword_rxgen(migemo* object, unsigned char* word)
{
    rxgen *engine = object->rx;

    return rxgen_add(engine, word);
}

/**
 * Convert the string given as query into a regular expression.  The query is
 * split into segments, a pattern is generated for each segment, and the
 * patterns are concatenated.  The returned string must be handed to
 * #migemo_release() when it is no longer needed.
 * @param object Migemo object
 * @param query query string
 * @returns the regular expression, or NULL when object, its generator or
 * query is NULL, when the query yields no segment, or when the output buffer
 * cannot be created.  Release with #migemo_release().
 */
    EXPORTS
    unsigned char*
migemo_query(migemo* object, const unsigned char* query)
{
    unsigned char *pattern = NULL;
    wordlist_p words, word;
    wordbuf_p out;

    if (!object || !object->rx || !query)
	return NULL;

    words = parse_query(object, query);
    if (words == NULL)
	return NULL;		/* nothing to convert */

    out = wordbuf_open();
    if (out != NULL)
    {
	/* Feed each segment to the generator and collect its pattern. */
	object->addword = (MIGEMO_PROC_ADDWORD)addword_rxgen;
	rxgen_reset(object->rx);
	for (word = words; word != NULL; word = word->next)
	{
	    unsigned char *piece;

	    query_a_word(object, word->ptr);
	    piece = rxgen_generate(object->rx);
	    rxgen_reset(object->rx);
	    wordbuf_cat(out, piece);
	    rxgen_release(object->rx, piece);
	}

	/* Detach the text from the buffer so that it outlives it. */
	pattern = out->buf;
	out->buf = NULL;
	wordbuf_close(out);
    }

    wordlist_close(words);
    return pattern;
}

/**
 * Release a regular expression obtained from migemo_query().
 * @param p Migemo object (not used)
 * @param string regular expression to release
 */
    EXPORTS
    void
migemo_release(migemo* p, unsigned char* string)
{
    (void)p;
    free(string);
}

/**
 * Set one of the meta characters (operators) used in the regular expressions
 * a Migemo object generates.  index selects the operator and op is its new
 * text.  The values and defaults below are those given by the original
 * documentation; the operators themselves are kept by the regexp generator.
 *
 *  <dl>
 *  <dt>MIGEMO_OPINDEX_OR</dt>
 *	<dd>Alternation.  Default "|".  For vim use "\|".</dd>
 *  <dt>MIGEMO_OPINDEX_NEST_IN</dt>
 *	<dd>Opening parenthesis of a group.  Default "(".  For vim "\%(" keeps
 *	the group out of \\1 - \\9; for Perl "(?:" can be used likewise.</dd>
 *  <dt>MIGEMO_OPINDEX_NEST_OUT</dt>
 *	<dd>Closing parenthesis of a group.  Default ")".  For vim use
 *	"\)".</dd>
 *  <dt>MIGEMO_OPINDEX_SELECT_IN</dt>
 *	<dd>Opening bracket of a character selection.  Default "[".</dd>
 *  <dt>MIGEMO_OPINDEX_SELECT_OUT</dt>
 *	<dd>Closing bracket of a character selection.  Default "]".</dd>
 *  <dt>MIGEMO_OPINDEX_NEWLINE</dt>
 *	<dd>Pattern inserted between characters to match white space or line
 *	breaks.  Default "" (nothing).  For vim use "\_s*".</dd>
 *  </dl>
 *
 * @param object Migemo object
 * @param index operator identifier
 * @param op operator string
 * @returns 1 when rxgen_set_operator() returned 0, otherwise 0 (also for a
 * NULL object).
 */
    EXPORTS
    int
migemo_set_operator(migemo* object, int index, const unsigned char* op)
{
    int status;

    if (!object)
	return 0;

    /* The generator answers 0 for success; callers of this API get 1. */
    status = rxgen_set_operator(object->rx, index, op);
    if (status != 0)
	return 0;
    return 1;
}

/**
 * Get one of the meta characters (operators) used in the regular expressions
 * a Migemo object generates.  See migemo_set_operator() for index.
 * @param object Migemo object
 * @param index operator identifier
 * @returns what rxgen_get_operator() returns for index, or NULL when object
 * is NULL.
 */
    EXPORTS
    const unsigned char*
migemo_get_operator(migemo* object, int index)
{
    if (!object)
	return NULL;
    return rxgen_get_operator(object->rx, index);
}

/**
 * Install the char-to-int conversion procedure in the object's regexp
 * generator.  Does nothing when object is NULL.  See MIGEMO_PROC_CHAR2INT
 * for the procedure type.
 * @param object Migemo object
 * @param proc conversion procedure
 */
    EXPORTS
    void
migemo_setproc_char2int(migemo* object, MIGEMO_PROC_CHAR2INT proc)
{
    if (!object)
	return;
    rxgen_setproc_char2int(object->rx, (RXGEN_PROC_CHAR2INT)proc);
}

/**
 * Install the int-to-char conversion procedure in the object's regexp
 * generator.  Does nothing when object is NULL.  See MIGEMO_PROC_INT2CHAR
 * for the procedure type.
 * @param object Migemo object
 * @param proc conversion procedure
 */
    EXPORTS
    void
migemo_setproc_int2char(migemo* object, MIGEMO_PROC_INT2CHAR proc)
{
    if (!object)
	return;
    rxgen_setproc_int2char(object->rx, (RXGEN_PROC_INT2CHAR)proc);
}

/**
 * Tell whether a migemo dictionary has been loaded into the object.  The
 * flag is set by migemo_open() and migemo_load() once the dictionary has
 * been read successfully.
 * @param obj Migemo object
 * @returns the object's enable flag (non-zero when loaded), 0 when obj is
 * NULL.
 */
    EXPORTS
    int
migemo_is_enable(migemo* obj)
{
    if (!obj)
	return 0;
    return obj->enable;
}

#if 1
/*
 * Hidden function, mainly for debugging: pass the object's word tree to
 * mnode_print().  Does nothing when object is NULL.
 */
    EXPORTS
    void
migemo_print(migemo* object)
{
    if (!object)
	return;
    mnode_print(object->mtree, NULL);
}
#endif
