/* 
 * Copyright (C) 2005  Network Applied Communication Laboratory Co., Ltd.
 *
 * This file is part of Rast.
 * See the file COPYING for redistribution information.
 *
 */

#ifndef RAST_ENCODING_H
#define RAST_ENCODING_H

/**
 * @file encoding.h Encoding modules, text normalization and tokenizers
 */

#include <apr_pools.h>

#include "rast/rast.h"
#include "rast/error.h"
#include "rast/macros.h"

RAST_EXTERN_C_BEGIN

/**
 * @defgroup encoding encoding
 * @{
 */

/*
 * Forward declaration: rast_char_t and rast_tokenizer_t below hold pointers
 * to an encoding module, while the module's own function pointers take
 * those types as arguments, so the name must exist before either is defined.
 */
typedef struct rast_encoding_module_t rast_encoding_module_t;

/**
 * A structure that represents a character.
 * The character is referred to through a pointer and a byte count; the
 * structure holds no storage for the character data itself.
 */
typedef struct {
    /** The encoding module associated with this character */
    rast_encoding_module_t *encoding_module;
    /** Pointer to the character data (NOTE(review): presumably the first
     *  byte of the character inside the tokenized string -- confirm) */
    const char *ptr;
    /** Byte count of the character (NOTE(review): presumably the number of
     *  bytes starting at ptr that make up the character -- confirm) */
    int nbytes;
} rast_char_t;

/**
 * A structure that represents a token.
 * The token is referred to through a pointer and a byte count; the
 * structure holds no storage for the token text itself.
 */
typedef struct {
    /** Pointer to the token text (NOTE(review): presumably points into the
     *  string being tokenized -- confirm) */
    const char *ptr;
    /** Byte count of the token (NOTE(review): presumably its length in
     *  bytes starting at ptr -- confirm) */
    int nbytes;
    /** Character count of the token (NOTE(review): presumably its length in
     *  characters of the current encoding -- confirm) */
    int nchars;
    /** Position of the token (NOTE(review): unit and origin are not visible
     *  in this header -- confirm against the tokenizer implementation) */
    rast_pos_t pos;
    /** Completeness flag (NOTE(review): presumably non-zero when the token
     *  is complete; exact meaning is not visible in this header) */
    int is_complete;
} rast_token_t;

/**
 * A structure that represents a tokenizer.
 * This is the state passed to the rast_*_tokenizer_* functions and to the
 * tokenizer callbacks of rast_encoding_module_t.
 */
typedef struct {
    /** The encoding module associated with this tokenizer */
    rast_encoding_module_t *encoding_module;
    /** An APR pool kept with the tokenizer (NOTE(review): presumably the
     *  pool passed to the *_tokenizer_create function -- confirm) */
    apr_pool_t *pool;
    /** Position in the input (NOTE(review): presumably the current read
     *  position, advanced by the *_tokenizer_next functions -- confirm) */
    const unsigned char *ptr;
    /** End of the input (NOTE(review): presumably one past the last input
     *  byte -- confirm) */
    const unsigned char *ptr_end;
    /** Position counter (NOTE(review): unit and origin are not visible in
     *  this header -- confirm) */
    rast_pos_t pos;
    /** Opaque pointer; its meaning is not visible in this header */
    void *context;
} rast_tokenizer_t;

/**
 * A structure that represents an encoding module definition.
 * Apart from the encoding name, every member is a function pointer that
 * supplies one encoding-specific operation.
 */
struct rast_encoding_module_t {
    /** The encoding name (NOTE(review): presumably the name that
     *  rast_get_encoding_module() matches against -- confirm) */
    const char *encoding;
    /**
     * Return the length of the current character.
     * @param tokenizer The tokenizer.
     * @param[out] len The length of the current character in bytes
     * @return RAST_OK if succeeded, error otherwise
     */
    rast_error_t *(*get_char_len)(rast_tokenizer_t *tokenizer,
                                  rast_size_t *len);

    /**
     * Return the current token.
     * @param tokenizer The tokenizer.
     * @param[out] token The current token
     * @return RAST_OK if succeeded, error otherwise
     */
    rast_error_t *(*get_token)(rast_tokenizer_t *tokenizer,
                               rast_token_t *token);

    /**
     * Return the offset to the next token.
     * @param tokenizer The tokenizer.
     * @param[out] byte_offset The byte offset to the next token
     * @param[out] char_offset The character offset to the next token
     * @return RAST_OK if succeeded, error otherwise
     */
    rast_error_t *(*get_next_offset)(rast_tokenizer_t *tokenizer,
                                     rast_size_t *byte_offset,
                                     rast_size_t *char_offset);

    /**
     * Normalize text.  This function may change the number of characters.
     * The function returns nothing; the result is delivered through dst
     * and dst_len.
     * @param pool The pool to allocate the memory out of
     * @param src The source string
     * @param src_len The length of the source string
     * @param[out] dst The destination string
     * @param[out] dst_len The length of the destination string
     */
    void (*normalize_text)(apr_pool_t *pool,
                           const char *src, rast_size_t src_len,
                           char **dst, rast_size_t *dst_len);

    /**
     * Normalize each character in src.  This function should not change
     * the number of characters.
     * The function returns nothing; the result is delivered through dst
     * and dst_len.
     * @param pool The pool to allocate the memory out of
     * @param src The source string
     * @param src_len The length of the source string
     * @param[out] dst The destination string
     * @param[out] dst_len The length of the destination string
     */
    void (*normalize_chars)(apr_pool_t *pool,
                            const char *src, rast_size_t src_len,
                            char **dst, rast_size_t *dst_len);

    /**
     * Check whether a character is a space character or not.
     * @param ch The character to check
     * @return 1 if ch is a space character, 0 otherwise
     */
    int (*is_space)(rast_char_t *ch);
};

/**
 * Load encoding modules.
 * @param dirname The name of the directory where encoding modules are located.
 * @return RAST_OK if succeeded, error otherwise
 * @see rast_unload_encoding_modules
 * @see rast_get_encoding_module
 */
rast_error_t *rast_load_encoding_modules(const char *dirname);

/**
 * Unload encoding modules.
 * Takes no arguments.
 * @return RAST_OK if succeeded, error otherwise
 * @see rast_load_encoding_modules
 */
rast_error_t *rast_unload_encoding_modules(void);

/**
 * Get an encoding module.
 * @param name The name of the encoding module
 * @param[out] module The encoding module
 * @return RAST_OK if succeeded, error otherwise
 * @see rast_load_encoding_modules
 */
rast_error_t *rast_get_encoding_module(const char *name,
                                       rast_encoding_module_t **module);

/**
 * Normalize the text with the specified encoding_module.  This function
 * first calls rast_encoding_module_t::normalize_text, then calls
 * rast_encoding_module_t::normalize_chars.
 * @param encoding_module The encoding module for normalizing.
 * @param s The source string
 * @param nbytes The length of the source string in bytes.
 * @param[out] new_nbytes The length of the normalized string in bytes.
 * @param pool The pool to allocate the memory out of
 * @return The normalized string
 */
char *rast_normalize_text(rast_encoding_module_t *encoding_module,
                          const char *s, rast_size_t nbytes,
                          rast_size_t *new_nbytes, apr_pool_t *pool);

/**
 * Create a character tokenizer.
 * NOTE(review): judging by rast_char_tokenizer_get_current(), this tokenizer
 * yields one rast_char_t at a time -- confirm against the implementation.
 * @param pool The pool to allocate the memory out of
 * @param encoding_module The encoding module for the source string
 * @param s The source string
 * @param nbytes The length of the source string in bytes
 * @return The created tokenizer
 */
rast_tokenizer_t *rast_char_tokenizer_create(
    apr_pool_t *pool, rast_encoding_module_t *encoding_module,
    const char *s, rast_size_t nbytes);

/**
 * Advance a character tokenizer.
 * NOTE(review): presumably moves to the next character -- confirm.
 * @param tokenizer The tokenizer.
 * @return RAST_OK if succeeded, error otherwise
 */
rast_error_t *rast_char_tokenizer_next(rast_tokenizer_t *tokenizer);

/**
 * Get the current character of a character tokenizer.
 * @param tokenizer The tokenizer.
 * @param[out] ch The current character
 * @return RAST_OK if succeeded, error otherwise
 */
rast_error_t *rast_char_tokenizer_get_current(rast_tokenizer_t *tokenizer,
                                              rast_char_t *ch);

/**
 * Check whether a character tokenizer is done.
 * NOTE(review): presumably "done" means the input is exhausted -- confirm.
 * @param tokenizer The tokenizer.
 * @return Non-zero if the tokenizer is done (NOTE(review): exact value not
 *         visible in this header), 0 otherwise
 */
int rast_char_tokenizer_is_done(rast_tokenizer_t *tokenizer);

/**
 * Create a register tokenizer.
 * NOTE(review): presumably the tokenizer used when registering (indexing)
 * text, as opposed to rast_search_tokenizer_create() -- confirm.
 * @param pool The pool to allocate the memory out of
 * @param encoding_module The encoding module for the source string
 * @param s The source string
 * @param nbytes The length of the source string in bytes
 * @return The created tokenizer
 */
rast_tokenizer_t *rast_register_tokenizer_create(
    apr_pool_t *pool, rast_encoding_module_t *encoding_module,
    const char *s, rast_size_t nbytes);

/**
 * Advance a register tokenizer.
 * NOTE(review): presumably moves to the next token -- confirm.
 * @param tokenizer The tokenizer.
 * @return RAST_OK if succeeded, error otherwise
 */
rast_error_t *rast_register_tokenizer_next(rast_tokenizer_t *tokenizer);

/**
 * Get the current token of a register tokenizer.
 * @param tokenizer The tokenizer.
 * @param[out] token The current token
 * @return RAST_OK if succeeded, error otherwise
 */
rast_error_t *rast_register_tokenizer_get_current(rast_tokenizer_t *tokenizer,
                                                  rast_token_t *token);

/**
 * Check whether a register tokenizer is done.
 * NOTE(review): presumably "done" means the input is exhausted -- confirm.
 * @param tokenizer The tokenizer.
 * @return Non-zero if the tokenizer is done (NOTE(review): exact value not
 *         visible in this header), 0 otherwise
 */
int rast_register_tokenizer_is_done(rast_tokenizer_t *tokenizer);

/**
 * Create a search tokenizer.
 * NOTE(review): presumably the tokenizer used for search queries, as
 * opposed to rast_register_tokenizer_create() -- confirm.
 * @param pool The pool to allocate the memory out of
 * @param encoding_module The encoding module for the source string
 * @param s The source string
 * @param nbytes The length of the source string in bytes
 * @return The created tokenizer
 */
rast_tokenizer_t *rast_search_tokenizer_create(
    apr_pool_t *pool, rast_encoding_module_t *encoding_module,
    const char *s, rast_size_t nbytes);

/**
 * Advance a search tokenizer.
 * NOTE(review): presumably moves to the next token -- confirm.
 * @param tokenizer The tokenizer.
 * @return RAST_OK if succeeded, error otherwise
 */
rast_error_t *rast_search_tokenizer_next(rast_tokenizer_t *tokenizer);

/**
 * Get the current token of a search tokenizer.
 * @param tokenizer The tokenizer.
 * @param[out] token The current token
 * @return RAST_OK if succeeded, error otherwise
 */
rast_error_t *rast_search_tokenizer_get_current(rast_tokenizer_t *tokenizer,
                                                rast_token_t *token);

/**
 * Check whether a search tokenizer is done.
 * NOTE(review): presumably "done" means the input is exhausted -- confirm.
 * @param tokenizer The tokenizer.
 * @return Non-zero if the tokenizer is done (NOTE(review): exact value not
 *         visible in this header), 0 otherwise
 */
int rast_search_tokenizer_is_done(rast_tokenizer_t *tokenizer);

/**
 * Count characters in a string.
 * NOTE(review): behavior inferred from the name and signature; the handling
 * of malformed input and the use of pool are not visible in this header.
 * @param encoding_module The encoding module for the source string
 * @param s The source string
 * @param nbytes The length of the source string in bytes
 * @param pool An APR pool (NOTE(review): presumably used for temporary
 *        allocations -- confirm)
 * @return The number of characters (NOTE(review): error signalling, if any,
 *         is not visible in this header)
 */
int rast_count_chars(rast_encoding_module_t *encoding_module,
                     const char *s, rast_size_t nbytes, apr_pool_t *pool);
/**
 * Check whether a character is a space character or not.
 * NOTE(review): presumably dispatches to rast_encoding_module_t::is_space
 * of ch->encoding_module and shares its 1/0 return convention -- confirm.
 * @param ch The character to check
 * @return Non-zero if ch is a space character, 0 otherwise
 */
int rast_char_is_space(rast_char_t *ch);

/** @} */

RAST_EXTERN_C_END

#endif /* RAST_ENCODING_H */

/* vim: set filetype=c sw=4 expandtab : */
