/*
 *  SPL - The SPL Programming Language
 *  Copyright (C) 2004, 2005  Clifford Wolf <clifford@clifford.at>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *  utf8.c: Simple utf8 conversion and check functions
 */

#define _GNU_SOURCE

#include <string.h>
#include <stdlib.h>
#include <assert.h>

#include "spl.h"
#include "utf8tab.h"

/*
 * One slot of a charset conversion table: a small byte buffer that the
 * conversion functions in this file read as a C string (via strlen/strcpy).
 */
typedef unsigned char charset_map_table_type[4];

/*
 * Lookup list pairing a charset name with its conversion table.
 * The conversion functions index a table with (byte value - 128).
 * The list ends with an all-NULL sentinel entry; scanning stops at the
 * first entry whose table pointer is NULL.
 */
static struct {
	charset_map_table_type *table;
	char *name;
} charset_map[] = {
	{ .table = UTF8TAB_ISO8859_1, .name = "ascii"     },
	{ .table = UTF8TAB_ISO8859_1, .name = "latin_1"   },
	{ .table = UTF8TAB_ISO8859_1, .name = "iso8859_1" },
	{ .table = NULL,              .name = NULL        }
};

/*
 * Walk a NUL-terminated string and verify that it is structurally
 * well-formed UTF-8: every character must be a 1..4 byte sequence whose
 * lead byte announces the length and whose remaining bytes all carry the
 * 10xxxxxx continuation pattern.
 *
 * Returns 0 if the whole string passes, otherwise a pointer to the first
 * byte of the first sequence that does not.
 */
const char *spl_utf8_check(const char *text)
{
	const unsigned char *cursor = (const unsigned char *)text;

	while (*cursor)
	{
		int seq_len;

		/* Classify the lead byte; the four patterns are mutually exclusive. */
		if ((cursor[0] & 0x80) == 0x00)
			seq_len = 1;
		else if ((cursor[0] & 0xE0) == 0xC0)
			seq_len = 2;
		else if ((cursor[0] & 0xF0) == 0xE0)
			seq_len = 3;
		else if ((cursor[0] & 0xF8) == 0xF0)
			seq_len = 4;
		else
			return (const char *)cursor;

		/*
		 * Check the continuation bytes in order. A NUL terminator fails
		 * the pattern test, so we never look past the end of the string.
		 */
		for (int k = 1; k < seq_len; k++)
			if ((cursor[k] & 0xC0) != 0x80)
				return (const char *)cursor;

		cursor += seq_len;
	}

	return 0;
}

/*
 * Convert a NUL-terminated string in the given 8-bit charset to UTF-8.
 *
 * text:    input string in the source charset
 * charset: charset name; "utf_8" or one of the names in charset_map
 *
 * Bytes below 128 are copied unchanged; every other byte is replaced by
 * the string stored in the charset's table at index (byte - 128).
 *
 * Returns a newly allocated string (malloc/strdup; release with free()),
 * or 0 if the charset is unknown or memory could not be allocated.
 */
char *spl_utf8_import(const char *text, const char *charset)
{
	const unsigned char *utext = (const unsigned char *)text;
	charset_map_table_type *tab = 0;

	/* Input is declared to be UTF-8 already: just hand back a copy. */
	if (!strcmp("utf_8", charset))
		return strdup(text);

	for (int i=0; charset_map[i].table; i++)
		if (!strcmp(charset_map[i].name, charset))
			tab = charset_map[i].table;

	if (!tab)
		return 0;

	/* First pass: work out how many bytes the converted text needs. */
	size_t result_len = 0;

	for (const unsigned char *t = utext; *t; t++)
		if ((*t & 0x80) == 0) result_len++;
		else result_len += strlen((const char*)tab[*t-128]);

	char *result = malloc(result_len+1);

	/* Out of memory: report failure instead of writing through NULL. */
	if (!result)
		return 0;

	/* Second pass: emit the converted bytes. */
	char *r = result;

	for (const unsigned char *t = utext; *t; t++)
		if ((*t & 0x80) == 0) *(r++) = *t;
		else {
			const char *seq = (const char*)tab[*t-128];
			size_t seq_len = strlen(seq);

			memcpy(r, seq, seq_len);
			r += seq_len;
		}

	/* Both passes must agree on the output size. */
	assert(r == result+result_len);
	result[result_len] = 0;
	return result;
}

/*
 * Helper for spl_utf8_export: length in bytes (2..4) of the structurally
 * well-formed multi-byte UTF-8 sequence starting at t, or 0 if the bytes
 * at t do not form one. A NUL byte fails the continuation test, so the
 * scan never runs past the end of the string.
 */
static int spl_utf8_export_seq_len(const unsigned char *t)
{
	int len;

	if ((t[0] & 0xE0) == 0xC0) len = 2;
	else if ((t[0] & 0xF0) == 0xE0) len = 3;
	else if ((t[0] & 0xF8) == 0xF0) len = 4;
	else return 0;

	for (int k = 1; k < len; k++)
		if ((t[k] & 0xC0) != 0x80)
			return 0;

	return len;
}

/*
 * Convert a NUL-terminated UTF-8 string to the given 8-bit charset.
 *
 * text:    input string, expected to be UTF-8
 * charset: charset name; "utf_8" or one of the names in charset_map
 *
 * Bytes below 128 are copied unchanged. A multi-byte sequence equal to
 * table entry i becomes the single byte 128+i. A well-formed sequence
 * with no table entry becomes one '?'. If the input is not UTF-8 at some
 * position, one '?' is emitted for every byte of the run of bytes with
 * the high bit set that starts there.
 *
 * Returns a newly allocated string (release with free()), or 0 if the
 * charset is unknown or memory could not be allocated.
 */
char *spl_utf8_export(const char *text, const char *charset)
{
	const unsigned char *utext = (const unsigned char *)text;
	charset_map_table_type *tab = 0;

	/* Target is UTF-8 itself: just hand back a copy. */
	if (!strcmp("utf_8", charset))
		return strdup(text);

	for (int i=0; charset_map[i].table; i++)
		if (!strcmp(charset_map[i].name, charset))
			tab = charset_map[i].table;

	if (!tab)
		return 0;

	/*
	 * Every output byte written below consumes at least one input byte,
	 * so the output can never be longer than the input.
	 */
	size_t result_len = 0;
	char *result = malloc(strlen(text)+1);

	if (!result)
		return 0;

	const unsigned char *t = utext;

	while (*t)
	{
		/* Plain 7-bit byte: identical in both encodings. */
		if ((*t & 0x80) == 0) {
			result[result_len++] = *(t++);
			continue;
		}

		/*
		 * Look for a table entry that matches the bytes at t. The
		 * comparison length is the entry's own length; empty entries
		 * are skipped because they would match anything. Entries are
		 * read as C strings, the same way spl_utf8_import reads them.
		 */
		int mapped = 0;

		for (int i=0; i<128; i++) {
			const char *entry = (const char*)tab[i];
			size_t entry_len = strlen(entry);

			if (entry_len && !strncmp((const char*)t, entry, entry_len)) {
				result[result_len++] = (char)(128+i);
				t += entry_len;
				mapped = 1;
				break;
			}
		}

		if (mapped)
			continue;

		/* No representation in the target charset. */
		result[result_len++] = '?';

		/* Well-formed but unmapped character: skip it as a whole. */
		int seq_len = spl_utf8_export_seq_len(t);

		if (seq_len) {
			t += seq_len;
			continue;
		}

		/*
		 * Input is not UTF-8. This should not happen. The '?' above
		 * stands for the offending byte; emit one more for each
		 * following byte with the high bit set, and resume at the
		 * first 7-bit byte (or the terminator) without skipping it.
		 */
		for (t++; *t & 0x80; t++)
			result[result_len++] = '?';
	}

	result[result_len++] = 0;

	/*
	 * Trim the buffer to the size actually used. If shrinking fails the
	 * original buffer is still valid and terminated, so return that
	 * instead of leaking it.
	 */
	char *trimmed = realloc(result, result_len);

	return trimmed ? trimmed : result;
}

