/*
 * CESU-8 to UTF-8 converter
 *
 * SPDX-FileType: SOURCE
 * SPDX-FileCopyrightText: Michael Bäuerle
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include "assert.h"
#include "string.h"

#include "libuciconv-0/iconv.h"  /* Always include main header file first */
#include "check_nul.h"
#include "iconv_cesu-8.h"
#include "nonident.h"


/*
 * Check for UTF-8 continuation octet (bit pattern 10xxxxxx)
 *
 * The argument is parenthesized so that the cast applies to the complete
 * argument expression and not only to its first operand.
 */
#define UCIC0_I_UTF8_CONTINUATION(c)  (0x80 == ((unsigned char)(c) & 0xC0))


/* ========================================================================== */
/*
 * Encode a codepoint beyond the BMP as a four-octet UTF-8 sequence
 *
 * The buffer 'buf' must provide memory for four bytes (no terminator is
 * written). The codepoint 'ucp' must be in the range U+10000 to U+10FFFF,
 * this is checked with assertions only.
 */
static void ucic0_i_encode_pair_to_utf8(char *buf, long int ucp)
{
    size_t pos = 4;  /* One past the octet that is written next */

    assert(0x00FFFFL <  ucp);  /* Not within Unicode Plane 0 (BMP) */
    assert(0x10FFFFL >= ucp);  /* Not beyond Unicode Plane 16 */

    /* Write the three continuation octets back to front (six bits each) */
    while (1U < pos)
    {
        const unsigned char payload = (unsigned char)(ucp & 0x3FL);

        buf[--pos] = 0x80 | payload;
        ucp >>= 6;
    }

    /* The remaining three bits are stored in the leading octet */
    buf[0] = 0xF0 | (unsigned char)(ucp & 0x07L);
}


/* ========================================================================== */
/*
 * Structural check of a UTF-8 sequence
 *
 * Sequence lengths from 1 to 4 are accepted. A single octet must be ASCII
 * (not greater than 0x7F). For longer sequences every octet after the first
 * one must be a continuation octet. The leading octet of a multi-octet
 * sequence is not inspected here.
 *
 * Returns zero (false / no error) for valid, nonzero otherwise.
 */
static ucic0_i_bool ucic0_i_check_invalid(const char *data,  size_t seqlen)
{
    size_t pos = 0;

    switch (seqlen)
    {
    case 1:
        /* Single octet: Only ASCII is valid */
        return (0x7FU < (unsigned char)data[0]) ? 1 : 0;
    case 2:
    case 3:
    case 4:
        break;
    default:
        /* Length out of range */
        return 1;
    }

    /* All octets behind the leading one must be continuation octets */
    while (seqlen > ++pos)
    {
        if (!UCIC0_I_UTF8_CONTINUATION(data[pos]))
            return 1;
    }

    return 0;
}


/* ========================================================================== */
/*
 * Decode UTF-8 sequence to Unicode codepoint
 *
 * The sequence is validated with ucic0_i_check_invalid() first. No range
 * checks are applied to the result, therefore unassigned, private and
 * surrogate codepoints are returned like any other codepoint.
 *
 * Returns Unicode codepoint or -1 on error.
 */
static long int ucic0_i_decode_utf8(const char *s, size_t seqlen)
{
    /* Payload bits of the leading octet, indexed by sequence length */
    static const unsigned char lead_mask[5] = { 0x00, 0x7F, 0x1F, 0x0F, 0x07 };
    long int                   result       = -1;
    size_t                     pos          = 1;

    if (ucic0_i_check_invalid(s, seqlen))
        return result;

    /* The check above accepted the length, it is in the range 1 to 4 here */
    result = (long int)(s[0] & lead_mask[seqlen]);

    /* Append six payload bits for every continuation octet */
    for (; seqlen > pos; ++pos)
        result = (result << 6) | (long int)(s[pos] & 0x3F);

    return result;
}


/* ========================================================================== */
/*
 * Synchronize to next UTF-8 non-continuation octet
 *
 * ucic0_i_nonident() is called for the octet at the current input position
 * (unconditionally) and then for every directly following octet that is a
 * UTF-8 continuation octet. The loop also stops if the input is exhausted or
 * if ucic0_i_check_nul() returns nonzero for the current octet.
 *
 * Returns zero (false / no error) on success, nonzero if ucic0_i_nonident()
 * reported an error.
 */
static ucic0_i_bool ucic0_i_resync(ucic0_i_state *state)
{
    ucic0_i_bool leading = 1;  /* Current octet is the first one handled */

    for (;;)
    {
        char octet;

        if (!*(state->inlen))
            break;

        /* Input index is the consumed length (start minus remaining) */
        octet = state->inarray[state->inlen_start - *(state->inlen)];

        /* Behind the first octet only continuation octets are skipped */
        if (!leading && !UCIC0_I_UTF8_CONTINUATION(octet))
            break;

        /* Check for NUL control character */
        if (ucic0_i_check_nul(state, octet))
            break;

        leading = 0;
        if (ucic0_i_nonident(state))
            return 1;
    }

    return 0;
}


/* ========================================================================== */
/*
 * Calculate length of UTF-8 sequence from its leading octet
 *
 * Only the bit pattern of 'c' is inspected:
 *    0xxxxxxx -> 1, 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4
 * Continuation octets (10xxxxxx) and 11111xxx are no valid leading octets.
 *
 * Returns UTF-8 sequence length on success or zero otherwise.
 */
static size_t ucic0_i_get_sequence_length(const char c)
{
    /* Compare as unsigned value so that signedness of 'char' is irrelevant */
    const unsigned char lead = (unsigned char)c;

    if (0x80U > lead)
        return 1;  /* ASCII */
    if (0xC0U > lead)
        return 0;  /* Continuation octet */
    if (0xE0U > lead)
        return 2;
    if (0xF0U > lead)
        return 3;
    if (0xF8U > lead)
        return 4;

    return 0;  /* 11111xxx */
}


/* ========================================================================== */
/*
 * Check for truncated UTF-8 sequence or surrogate pair
 *
 * A sequence of 'seqlen' octets starting at 'seq' is treated as truncated if
 * fewer than 'seqlen' octets are left in the input, or (unless the flag
 * UCIC0_ICONV_IGNORE_NULL is set) if one of its octets is NUL.
 *
 * Returns nonzero if sequence is truncated.
 */
static ucic0_i_bool ucic0_i_check_incomplete(const ucic0_i_state *state,
                                             const char *seq,
                                             const size_t seqlen)
{
    /* Not enough input left (must be checked before 'seq' is scanned) */
    if (seqlen > *(state->inlen))
        return 1;

    /* With UCIC0_ICONV_IGNORE_NULL the scan for NUL octets is skipped */
    if (UCIC0_ICONV_IGNORE_NULL & state->flag)
        return 0;

    /* A NUL octet inside of the sequence counts as truncation */
    return (NULL != memchr(seq, 0x00, seqlen)) ? 1 : 0;
}


/* ========================================================================== */
/*
 * Decode surrogate pair
 *
 * 'seq' must point to the three-octet sequence that was decoded to the
 * surrogate codepoint 'hs'. The second half of the pair is expected in the
 * three octets that directly follow (six octets in total).
 *
 * The pair is accepted only if 'hs' is a high surrogate and the following
 * codepoint is a low surrogate. Any other second codepoint (high surrogate
 * or no surrogate at all) makes the pair invalid and a resync is requested,
 * so that the second sequence is not consumed as part of the pair.
 *
 * Returns Unicode codepoint for surrogate pair on success.
 * Returns 0 if resync was requested ('state->resync' is set).
 * Returns -1 for error. 'errno' is set to UCIC0_I_EINVAL for an incomplete
 * pair, the other error path is a failed ucic0_i_nonident() call (presumably
 * that function sets 'errno' itself, verify against its implementation).
 */
static long int ucic0_i_decode_surrogate_pair(ucic0_i_state *state,
                                              const char *seq,
                                              const long int hs)
{
    long int ls = -1;

    if (UCIC0_I_UTF16_SURROGATE_LOW(hs))
    {
        /* Invalid surrogate pair (must start with high surrogate codepoint) */
        state->resync = 1;
        return 0;
    }

    if (ucic0_i_check_incomplete(state, seq, 6U))
    {
        /* Incomplete sequence for surrogate pair at end of input data */
        errno = UCIC0_I_EINVAL;
        return -1;
    }

    {
        /* Second half of the pair starts behind the first 3 octets */
        const char *seq_ls = seq + 3;

        if (3U == ucic0_i_get_sequence_length(seq_ls[0]))
            ls = ucic0_i_decode_utf8(seq_ls, 3U);
        if (0 > ls)
        {
            /* Invalid UTF-8 sequence */
            if (ucic0_i_nonident(state))
                return -1;
            state->resync = 1;
            return 0;
        }
    }

    /*
     * Rejecting only high surrogates is not sufficient here: A valid
     * three-octet sequence for a non-surrogate codepoint (e.g. U+20AC) would
     * be merged with 'hs' into a wrong codepoint.
     * NOTE(review): Assumes that UCIC0_I_UTF16_SURROGATE_LOW() is a range
     * test that gives false for non-surrogate codepoints, confirm against
     * the macro definition.
     */
    if (!UCIC0_I_UTF16_SURROGATE_LOW(ls))
    {
        /* Invalid surrogate pair (low surrogate codepoint must follow) */
        state->resync = 1;
        return 0;
    }

    {
        /* Ten payload bits from every surrogate, offset by 0x10000 */
        long int ucp = ((hs & 0x3FF) << 10) | (ls & 0x3FF);

        ucp += 0x10000L;
        assert(0xFFFFL < ucp);
        return ucp;
    }
}


/* ========================================================================== */
/*
 * Process CESU-8 data
 *
 * Converts one input sequence per iteration:
 * - Sequences that do not decode to a surrogate codepoint are copied to the
 *   output unchanged.
 * - A surrogate pair (two three-octet sequences) is replaced with the
 *   four-octet UTF-8 sequence of the combined codepoint.
 *
 * The loop is left without error if the input is exhausted, if
 * ucic0_i_check_nul() returns nonzero, or if an invalid sequence was found.
 * In the last case 'state->resync' is set and nothing of the invalid
 * sequence is consumed here.
 *
 * Returns zero (false / no error) on success, 'errno' will be set otherwise
 * (UCIC0_I_EINVAL for an incomplete sequence at the end of the input data,
 * UCIC0_I_E2BIG if the output does not provide enough space).
 */
static ucic0_i_bool ucic0_i_process(ucic0_i_state *state)
{
    /* UTF-8 sequence for combined surrogate pair */
    char combined_pair[4] = { 0, 0, 0, 0 };

    while (*(state->inlen))
    {
        /* Indices are the consumed lengths (start minus remaining) */
        const size_t  index_in   = state->inlen_start  - *(state->inlen);
        const size_t  index_out  = state->outlen_start - *(state->outlen);
        const char    c          = state->inarray[index_in];
        size_t        seqlen_in  = ucic0_i_get_sequence_length(c);
        size_t        seqlen_out = seqlen_in;
        const char   *seq        = &(state->inarray)[index_in];
        long int      ucp        = -1;  /* Unicode codepoint */

        if (ucic0_i_check_nul(state, c))
            break;

        if (0U == seqlen_in)
        {
            /* Invalid UTF-8 sequence */
            state->resync = 1;
            break;
        }

        if (ucic0_i_check_incomplete(state, seq, seqlen_in))
        {
            /* Incomplete UTF-8 sequence at end of input data */
            errno = UCIC0_I_EINVAL;
            return 1;
        }

        ucp = ucic0_i_decode_utf8(seq, seqlen_in);
        if (0 > ucp)
        {
            /* Invalid UTF-8 sequence */
            state->resync = 1;
            break;
        }

        if (UCIC0_I_UTF16_SURROGATE(ucp))
        {
            if (3U != seqlen_in)
            {
                /*
                 * Surrogate codepoint from an overlong four-octet sequence
                 * (e.g. F0 8D A0 80 decodes to U+D800). This is reachable
                 * with crafted input and must not be an assertion. The pair
                 * decoder expects three-octet sequences, therefore treat
                 * it as invalid UTF-8 sequence.
                 */
                state->resync = 1;
                break;
            }
            ucp = ucic0_i_decode_surrogate_pair(state, seq, ucp);
            if (-1 == ucp)
                return 1;
            if (0 == ucp)
                break;
            ucic0_i_encode_pair_to_utf8(combined_pair, ucp);
            /* Two three-octet sequences in, one four-octet sequence out */
            seq        = combined_pair;
            seqlen_out = 4;
            seqlen_in  = 6;
        }

        if (*(state->outlen) < seqlen_out)
        {
            /* Not enough space in outarray */
            errno = UCIC0_I_E2BIG;
            return 1;
        }
        (void)memcpy(&(state->outarray)[index_out], seq, seqlen_out);
        *(state->outlen) -= seqlen_out;

        /* Consume input data */
        *(state->inlen) -= seqlen_in;
    }

    return 0;
}


/* ========================================================================== */
/*
 * CESU-8 to UTF-8 converter
 *
 * Alternates between regular processing and resynchronization (selected by
 * 'state->resync') until the input is exhausted, an error occurs or
 * 'state->abort' is set. The resync request is cleared after a successful
 * resynchronization.
 *
 * Returns zero (false / no error) on success, 'errno' will be set otherwise.
 */
ucic0_i_bool ucic0_i_conv_cesu8(ucic0_i_state *state)
{
    for (;;)
    {
        ucic0_i_bool failed = 0;

        if (!*(state->inlen))
            break;

        if (!state->resync)
            failed = ucic0_i_process(state);
        else
        {
            /* Lost synchronization */
            failed = ucic0_i_resync(state);
            if (!failed)
                state->resync = 0;
        }

        if (failed)
            return 1;

        /* Abort request is checked after every step */
        if (state->abort)
            break;
    }

    return 0;
}
