/* ========================================================================== */
/*! \file
 * \brief Test of \c enc_convert_to_utf8_nfc() implementation
 *
 * Copyright (c) 2012-2024 by the developers. See the LICENSE file for details.
 */


/* ========================================================================== */
/* Include headers */

#include "posix.h"  /* Include this first because of feature test macros */

#include <stdio.h>
#include <string.h>

#include "config.h"

#include "encoding.h"
#include "test.h"
#include "test_unicode.h"


/* ========================================================================== */
/* Data types */

/* Parser state: Field of the current test data record that is collected */
enum sm_state
{
   SM_SRC,     /* Collecting codepoints of the source field */
   SM_NFC,     /* Collecting codepoints of the NFC field */
   SM_NFD,     /* Collecting codepoints of the NFD field (last one in record) */
   SM_INVALID  /* Never assigned by the parser in this file */
};


/* ========================================================================== */
/* Constants */

/*
 * Buffer size in codepoint units
 *
 * The replacement list is fully parenthesized so that the macro behaves like
 * a single operand in every expression (e.g. as operand of sizeof).
 */
#define BUFSIZE  ((size_t) 16)

/*
 * Every codepoint can require up to 4 bytes in UTF-8
 * +1 for leading space
 * +1 for NUL termination
 */
#define BUFSIZE_UTF8  (BUFSIZE * (size_t) 4 + (size_t) 2)

#include "uc_test_nfc.c"


/* ========================================================================== */
/*! \addtogroup TEST */
/*! @{ */


/* ========================================================================== */
/* Create octet representation for debug output
 *
 * \param[out] ob         Pointer to buffer for human readable octets
 * \param[in]  len_ob     Size of buffer at address \e ob
 * \param[in]  uc_string  Unicode string in UTF-8 format
 *
 * Every octet of \e uc_string is written as two uppercase hexadecimal digits.
 * The octets are separated by a single space and the result is NUL-terminated
 * (example: "41 C3 A4").
 *
 * \e len_ob must be at least (3 * strlen( \e uc_string )).
 * If the buffer is too small, or if \e uc_string is empty or \c NULL , an empty
 * string is stored in \e ob .
 *
 * \note Nothing is written if \e ob is \c NULL or \e len_ob is zero.
 */

void test_unicode_octets(char*  ob, size_t  len_ob, const char*  uc_string)
{
    static const char  hex[] = "0123456789ABCDEF";
    size_t  len;
    size_t  i;

    /* Without buffer space not even the NUL termination can be stored */
    if (NULL == ob || 0U == len_ob)  { return; }

    len = (NULL != uc_string) ? strlen(uc_string) : 0U;

    /*
     * Empty input must be handled here, otherwise the termination below
     * would be written in front of the buffer (index wraps around).
     * The division avoids an overflow of (3 * len) for the size check.
     */
    if (0U == len || len > len_ob / 3U)
    {
        ob[0] = 0;
        return;
    }

    /* Every octet occupies exactly three bytes: two digits and a separator */
    for (i = 0; len > i; ++i)
    {
        const unsigned int  octet = (unsigned int)(unsigned char)uc_string[i];

        ob[3U * i]      = hex[(octet >> 4) & 0x0FU];
        ob[3U * i + 1U] = hex[octet & 0x0FU];
        ob[3U * i + 2U] = ' ';
    }

    /* Replace the separator after the last octet with the termination */
    ob[3U * len - 1U] = 0;
}


/* ========================================================================== */
/* Print debug information for a failed conformance check
 *
 * \param[in] record    Record number
 * \param[in] input     Data that was passed to the conversion
 * \param[in] result    Data that was returned by the conversion
 * \param[in] expected  Data that should have been returned
 *
 * The data is printed to \c stderr as text and as octets. Nothing except an
 * error message is printed if the locale cannot be set or if the name of the
 * locale contains neither "UTF" nor "utf".
 */

#define TEST_BUFSIZE  60U

static void  test_unicode_report_mismatch(size_t  record, const char*  input,
                                          const char*  result,
                                          const char*  expected)
{
   char  octets[TEST_BUFSIZE];  /* Buffer for human readable octets */
   const char*  locale;

#if CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI
   locale = api_posix_setlocale(API_POSIX_LC_CTYPE, "");
#else  /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */
   locale = NULL;
#endif  /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */

   if(NULL == locale)
   {
      print_error("Setting locale for debug messages failed");
      return;
   }

   /* Print Unicode data only if the terminal uses a Unicode locale */
   if(NULL == strstr(locale, "UTF") && NULL == strstr(locale, "utf"))
   {
      print_error(
         "Debug messages can't be printed with current locale");
      return;
   }

   fprintf(stderr, TEST_TAB "Record number in test data file: %lu\n",
           (unsigned long int) record);
   test_unicode_octets(octets, TEST_BUFSIZE, input);
   fprintf(stderr, TEST_TAB "Input data: \"%s\" (%s)\n", input, octets);
   test_unicode_octets(octets, TEST_BUFSIZE, result);
   fprintf(stderr, TEST_TAB "Result is : \"%s\" (%s)\n", result, octets);
   test_unicode_octets(octets, TEST_BUFSIZE, expected);
   fprintf(stderr, TEST_TAB "Should be : \"%s\" (%s)\n", expected, octets);
}


/* ========================================================================== */
/* Check one record of the official Unicode test data for conformance
 *
 * \param[in] record    Record number (used for debug messages only)
 * \param[in] src_utf8  Source data
 * \param[in] nfc_utf8  Data in normal form C (NFC)
 * \param[in] nfd_utf8  Data in normal form D (NFD)
 *
 * The Unicode standard demands the following for NFC normalization:
 *
 *    nfc_utf8 == toNFC(src_utf8) == toNFC(nfc_utf8) == toNFC(nfd_utf8)
 *
 * The three conversions are executed in this order, the first failure aborts
 * the check. If a conversion returned data that doesn't match, the details are
 * printed for debugging.
 *
 * \return
 * - \c EXIT_SUCCESS on success
 * - \c EXIT_FAILURE on error
 */

static int  test_unicode_conformance(size_t  record, const char*  src_utf8,
                                     const char*  nfc_utf8,
                                     const char*  nfd_utf8)
{
   int  rv = API_POSIX_EXIT_FAILURE;
   const char*  out_src = NULL;  /* toNFC(src_utf8) */
   const char*  out_nfc = NULL;  /* toNFC(nfc_utf8) */
   const char*  out_nfd = NULL;  /* toNFC(nfd_utf8) */
   const char*  tested = NULL;   /* Input of the most recent conversion */
   const char*  wrong = NULL;    /* Converted data that doesn't match */

#if 0
   /* For debugging */
   printf("================\nrecord: %u\n", (unsigned int) record);
   printf("src_utf8: \"%s\"\n", src_utf8);
   printf("nfc_utf8: \"%s\"\n", nfc_utf8);
   printf("nfd_utf8: \"%s\"\n", nfd_utf8);
   printf("================\n");
#endif

   /* Single pass, left with 'break' on the first failure */
   do
   {
      /* nfc_utf8 == toNFC(src_utf8) */
      tested = src_utf8;
      out_src = enc_convert_to_utf8_nfc(ENC_CS_UTF_8, tested);
      if(NULL == out_src)
      {
         print_error("Conversion SRC => NFC failed");
         break;
      }
      if(0 != strcmp(nfc_utf8, out_src))
      {
         print_error("Result mismatch for SRC => NFC");
         wrong = out_src;
         break;
      }

      /* nfc_utf8 == toNFC(nfc_utf8) */
      tested = nfc_utf8;
      out_nfc = enc_convert_to_utf8_nfc(ENC_CS_UTF_8, tested);
      if(NULL == out_nfc)
      {
         print_error("Conversion NFC => NFC failed");
         break;
      }
      if(0 != strcmp(nfc_utf8, out_nfc))
      {
         print_error("Result mismatch for NFC => NFC");
         wrong = out_nfc;
         break;
      }

      /* nfc_utf8 == toNFC(nfd_utf8) */
      tested = nfd_utf8;
      out_nfd = enc_convert_to_utf8_nfc(ENC_CS_UTF_8, tested);
      if(NULL == out_nfd)
      {
         print_error("Conversion NFD => NFC failed");
         break;
      }
      if(0 != strcmp(nfc_utf8, out_nfd))
      {
         print_error("Result mismatch for NFD => NFC");
         wrong = out_nfd;
         break;
      }

      /* All three conversions gave the expected result */
      rv = API_POSIX_EXIT_SUCCESS;
   }
   while(0);

   /* For debugging (only a mismatch provides data that can be printed) */
   if(NULL != wrong)
   {
      test_unicode_report_mismatch(record, tested, wrong, nfc_utf8);
   }

   /* Release memory (a result that is the input itself is not released) */
   if(src_utf8 != out_src)  { enc_free((void*) out_src); }
   if(nfc_utf8 != out_nfc)  { enc_free((void*) out_nfc); }
   if(nfd_utf8 != out_nfd)  { enc_free((void*) out_nfd); }

   return(rv);
}


/* ========================================================================== */
/* Extract official Unicode test data records
 *
 * \return
 * - \c EXIT_SUCCESS on success
 * - \c EXIT_FAILURE on error
 */

static int  test_unicode_part2(void)
{
   int  res = API_POSIX_EXIT_SUCCESS;
   size_t  i = 0;
   long int  ucp = -1L;
   enum sm_state  state = SM_SRC;
   long int  src[BUFSIZE];
   long int  nfc[BUFSIZE];
   long int  nfd[BUFSIZE];
   size_t  src_i = 0;
   size_t  nfc_i = 0;
   size_t  nfd_i = 0;
   char  src_utf8[BUFSIZE_UTF8];
   char  nfc_utf8[BUFSIZE_UTF8];
   char  nfd_utf8[BUFSIZE_UTF8];
   size_t  utf8_i;
   size_t  rec = 0;

   /* Assignment in truth expression is intended */
   while(API_POSIX_EXIT_SUCCESS == res && -1 != (ucp = uc_test_nfc_table[i++]))
   {
      /* Test sequence parser (-2: Field separator, -3: Record separator) */
      switch(state)
      {
         case SM_SRC:
         {
            if(-2L == ucp)  { state = SM_NFC; }
            else if(0L > ucp)
            {
               print_error("Invalid data found");
               res = API_POSIX_EXIT_FAILURE;
            }
            else
            {
               if(BUFSIZE <= src_i)
               {
                  print_error("SRC data buffer too small");
                  res = API_POSIX_EXIT_FAILURE;
               }
               else  { src[src_i++] = ucp; }
            }
            break;
         }
         case SM_NFC:
         {
            if(-2L == ucp)  { state = SM_NFD; }
            else if(0L > ucp)
            {
               print_error("Invalid data found");
               res = API_POSIX_EXIT_FAILURE;
            }
            else
            {
               if(BUFSIZE <= nfc_i)
               {
                  print_error("NFC data buffer too small");
                  res = API_POSIX_EXIT_FAILURE;
               }
               else  { nfc[nfc_i++] = ucp; }
            }
            break;
         }
         case SM_NFD:
         {
            if(-3L == ucp)
            {
               /* Data extraction from record complete, convert data to UTF-8 */
               src_utf8[0] = ' '; utf8_i = 1;
               enc_uc_encode_utf8(src_utf8, &utf8_i, src, &src_i);
               src_utf8[utf8_i] = 0;
               nfc_utf8[0] = ' '; utf8_i = 1;
               enc_uc_encode_utf8(nfc_utf8, &utf8_i, nfc, &nfc_i);
               nfc_utf8[utf8_i] = 0;
               nfd_utf8[0] = ' '; utf8_i = 1;
               enc_uc_encode_utf8(nfd_utf8, &utf8_i, nfd, &nfd_i);
               nfd_utf8[utf8_i] = 0;
               if(src_i || nfc_i || nfd_i)
               {
                  print_error("Encoding test data to UTF-8 failed");
                  res = API_POSIX_EXIT_FAILURE;
               }
               else
               {
                  /* Execute Unicode conformance checks */
                  res = test_unicode_conformance(rec++,
                                                 src_utf8, nfc_utf8, nfd_utf8);
                  /* Extract next record */
                  state = SM_SRC;
               }
            }
            else if(0L > ucp)
            {
               print_error("Invalid data found");
               res = API_POSIX_EXIT_FAILURE;
            }
            else
            {
               if(BUFSIZE <= nfd_i)
               {
                  print_error("NFD data buffer too small");
                  res = API_POSIX_EXIT_FAILURE;
               }
               else  { nfd[nfd_i++] = ucp; }
            }
            break;
         }
         default:
         {
            print_error("Parser state machine error");
            res = API_POSIX_EXIT_FAILURE;
            break;
         }
      }
   }

   return(res);
}


/* ========================================================================== */
/*! \brief Test \c enc_convert_to_utf8_nfc() implementation
 *
 * \note
 * The UTF-7 transformation format uses base64 encoded UTF-16BE as internal
 * representation. Therefore all Unicode codepoints beyond the BMP must be
 * encoded using surrogate codepoints (that are forbidden in UTF-8).
 *
 * Part 1: The following cases are tested:
 * - ASCII only (trivial)
 * - Unicode already in NFC normalization (NFC quick check)
 * - Unicode precomposed but with composition exception (requires lookup table)
 * - Unicode with NFD normalization (trivial canonical composition)
 * - Unicode with noncanonical order A (canonical reordering and composition)
 * - Unicode with noncanonical order B (canonical reordering)
 * - Unicode singleton (decomposition to another single codepoint)
 * - Unicode algorithmic composition (used for hangul syllables)
 * - UTF-7 to UTF-8 conversion (and conversion from NFD to NFC normalization)
 * - UTF-7 to UTF-8 conversion (with a codepoint that requires a surrogate pair)
 * - UTF-7 to UTF-8 conversion (shift sequence terminated by SP or end-of-data)
 *
 * Part 2: The Unicode normalization conformance test data file is used.
 * This part is executed only if part 1 was successful.
 *
 * \note
 * For part 2 all test strings are prepended with a space because our
 * normalization implementation will intentionally strip "defective combining
 * character sequences" at the start of strings (even if they are not
 * "ill-formed" according to the standard).
 *
 * \return
 * - \c EXIT_SUCCESS on success
 * - \c EXIT_FAILURE on error
 */

int  test_unicode(void)
{
#define TS_NUM  (size_t) 11  /* Number of test strings */
#define TS_UTF7  (size_t) 8  /* First index of UTF-7 section */
   /* Input data (index must match the expected results in 'rs') */
   static const char*  ts[TS_NUM] =
   {
      /* UTF-8 section */
      "This is an ASCII string",
      "This is an Unicode string: \xC3\xA4word",
      "This is an Unicode string: \xE0\xAD\x9Cword",
      "This is an Unicode string: a\xCC\x88word",
      "This is an Unicode string: start\xCE\xB1\xCC\x94\xCC\x81\xCD\x85word",
      "This is an Unicode string: start\xCE\xB1\xCC\x81\xCC\x94\xCD\x85word",
      "This is an Unicode string: \xE2\x84\xA6word",
      "Composition of hangul jamo: \xE1\x84\x91\xE1\x85\xB1\xE1\x86\xB6",
      /* UTF-7 section */
      "This is an Unicode string: hundertf+AHUDCA-nfzig",
      "This is an Unicode string: Violinschl+APw-ssel (+2DTdHg-)",
      "Shift sequence terminated by SP or end-of-data: A+AMQ- O+ANY U+ANw"
   };
   /* Expected results (UTF-8 with NFC normalization) */
   static const char*  rs[TS_NUM] =  {
      /* UTF-8 section */
      "This is an ASCII string",
      "This is an Unicode string: \xC3\xA4word",
      "This is an Unicode string: \xE0\xAC\xA1\xE0\xAC\xBCword",
      "This is an Unicode string: \xC3\xA4word",
      "This is an Unicode string: start\xE1\xBE\x85word",
      "This is an Unicode string: start\xE1\xBE\xB4\xCC\x94word",
      "This is an Unicode string: \xCE\xA9word",
      "Composition of hangul jamo: \xED\x93\x9B",
      /* UTF-7 section */
      "This is an Unicode string: hundertf\xC3\xBCnfzig",
      "This is an Unicode string: Violinschl\xC3\xBCssel (\xF0\x9D\x84\x9E)",
      "Shift sequence terminated by SP or end-of-data: A\xC3\x84 O\xC3\x96 U\xC3\x9C"
   };
   int  res = API_POSIX_EXIT_SUCCESS;
   size_t  i;
   const char*  buf;
   const char*  loc;

   /* Part 1: Check with internal test data */
   for(i = 0; i < TS_NUM; ++i)
   {
      /* Entries starting at index TS_UTF7 are encoded with UTF-7 */
      if(TS_UTF7 <= i)
      {
         buf = enc_convert_to_utf8_nfc(ENC_CS_UTF_7, ts[i]);
      }
      else
      {
         buf = enc_convert_to_utf8_nfc(ENC_CS_UTF_8, ts[i]);
      }
      if(NULL == buf)
      {
         print_error("Conversion of data to UTF-8 failed");
         res = API_POSIX_EXIT_FAILURE;
         break;
      }
      if(strcmp(rs[i], buf))
      {
         print_error("Result is not correct");
         /* For debugging */
#if CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI
         loc = api_posix_setlocale(API_POSIX_LC_CTYPE, "");
#else  /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */
         loc = NULL;
#endif  /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */
         if(NULL == loc)
         {
            print_error("Setting locale for debug messages failed");
         }
         else
         {
            /* Print Unicode data only if the terminal uses a Unicode locale */
            if(NULL == strstr(loc, "UTF") && NULL == strstr(loc, "utf"))
            {
               print_error(
                  "Debug messages can't be printed with current locale");
            }
            else
            {
               fprintf(stderr, TEST_TAB "Input data: \"%s\"\n", ts[i]);
               fprintf(stderr, TEST_TAB "Result is : \"%s\"\n", buf);
               fprintf(stderr, TEST_TAB "Should be : \"%s\"\n", rs[i]);
            }
         }
         res = API_POSIX_EXIT_FAILURE;
      }

      /*
       * Release memory (a result that is the input itself is not released).
       * This must be done for a mismatch too, therefore the loop is left
       * after this point and not inside the mismatch handling above.
       */
      if(buf != ts[i])  { enc_free((void*) buf); }
      if(API_POSIX_EXIT_SUCCESS != res)  { break; }
   }

   /* Part 2: Check with external test data (from Unicode data file) */
   if(API_POSIX_EXIT_SUCCESS == res)
   {
      res = test_unicode_part2();
   }

   return(res);
}


/*! @} */

/* EOF */
