/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */

/* 
   approximate.c : function to transliterate unicode into any character set

   Copyright (C) 1999 Robert Brady

   The Gnome Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   The Gnome Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the Gnome Library; see the file COPYING.LIB.  If not,
   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unicode.h>

/* Charset name for the terminal; not referenced anywhere in the visible source. */
#define TERMINAL_CHARSET "UTF-8"

/* 
 * Information on many Unicode issues, including UTF-8 xterms and Unicode
 * fonts for X11, is available from:
 * 
 *   http://www.cl.cam.ac.uk/~mgk25/unicode.html
 *
 */


/* WARNING : This code is really messy */

/* One row of the approximation table: a code point and its fallback spellings. */
struct thing {
	unicode_char_t ch; /* Unicode code point this entry applies to; 0 marks the end of the table */
	const char *alternatives; /* replacement strings, each terminated by '\t' (the separator) */
};

/*
 * Approximation table searched by get_alternatives().
 *
 * Each entry maps one Unicode code point to a list of replacement strings.
 * Every replacement is terminated by '\t' and the list is ordered by
 * preference.  The replacements are handed to a converter whose source
 * charset is UTF-8, so the non-ASCII ones below rely on this file being
 * saved as UTF-8.  The table ends with an entry whose code point is 0.
 */
struct thing things[] = {

/* Convert Unicode IPA to ASCII IPA.
 *
 * THETA is rather annoying: in ASCII IPA, theta is 'T', but for Greek
 * it should be transliterated 'th'.
 *
 * This would not be a problem except that IPA theta and GREEK theta were
 * wrongly unified by the Unicode people, and they are dragging their
 * heels fixing it. (There is no LATIN SMALL LETTER THETA in the Beta of
 * Unicode 3. :(
 */

	{ 0x131, "I\t"  },
	{ 0x259, "@\t"  },
	{ 0x25b, "E\t"  },
	{ 0x283, "S\t"  },
	{ 0x28a, "U\t"  },
	{ 0x2c8, "'\t"  },

/*
 * Transliterate Greek into Latin, rather crudely. Doing this properly
 * takes some smarts, and is beyond the scope of this approximator.
 * For now, anyway.
 *
 * One other problem with this is that it fails to deal with titlecase. If
 * there was some Greek:
 *
 * GREEK CAPITAL LETTER PHI
 * GREEK SMALL LETTER EPSILON
 *
 * it would be converted to "PHe".
 *
 * Again, fixing this is possible, but another project.
 */

	{ 0x391, "A\t" }, /* alpha */
	{ 0x392, "B\t" }, /* beta */
	{ 0x393, "G\t" }, /* gamma */
	{ 0x394, "D\t" }, /* delta */
	{ 0x395, "E\t" }, /* epsilon */
	{ 0x396, "Z\t" }, /* zeta */
	{ 0x397, /* E with acute? */ "E\t" }, /* eta */

	{ 0x398, "Þ\tTH\t" }, /* theta  : Thorn first, then "TH" */

	{ 0x399, "I\t" },      /* iota */
	{ 0x39a, "K\t" },      /* kappa */
	{ 0x39b, "L\t" },      /* lambda */
	{ 0x39c, "M\t" },      /* mu */
	{ 0x39d, "N\t" },      /* nu */
	{ 0x39e, "X\t" },      /* xi */
	{ 0x39f, "O\t" },/* o breve? */  /* omicron */
	{ 0x3a0, "P\t" },      /* pi */
	{ 0x3a1, "R\t" },      /* rho */
	/* U+03A2 is unassigned (there is no capital final sigma). */
	{ 0x3a3, "S\t" },      /* sigma */
	{ 0x3a4, "T\t" },      /* tau */
	{ 0x3a5, "U\t" },      /* upsilon */
	{ 0x3a6, "PH\t" },     /* phi : ? maybe 'f' ? */
	{ 0x3a7, "CH\t" },     /* chi : ? consider C-caron ? */
	{ 0x3a8, "PS\t" },     /* psi */
	{ 0x3a9, "O\t" },      /* omega */




	{ 0x3b1, "a\t" }, /* alpha */
	{ 0x3b2, "b\t" }, /* beta */
	{ 0x3b3, "g\t" }, /* gamma */
	{ 0x3b4, "d\t" }, /* delta */
	{ 0x3b5, "e\t" }, /* epsilon */
	{ 0x3b6, "z\t" }, /* zeta */
	{ 0x3b7, /* E with acute? */ "e\t" }, /* eta */

	{ 0x3b8, "þ\tth\t" }, /* theta  : thorn first, then "th" */

	{ 0x3b9, "i\t" },      /* iota */
	{ 0x3ba, "k\t" },      /* kappa */
	{ 0x3bb, "l\t" },      /* lambda */
	{ 0x3bc, "m\t" },      /* mu */
	{ 0x3bd, "n\t" },      /* nu */
	{ 0x3be, "x\t" },      /* xi */
	{ 0x3bf, "o\t" },/* o breve? */  /* omicron */
	{ 0x3c0, "p\t" },      /* pi */
	{ 0x3c1, "r\t" },      /* rho */
	{ 0x3c2, "s\t" },      /* final sigma */
	{ 0x3c3, "s\t" },      /* sigma */
	{ 0x3c4, "t\t" },      /* tau */
	{ 0x3c5, "u\t" },      /* upsilon */
	{ 0x3c6, "ph\t" },     /* phi */
	{ 0x3c7, "ch\t" },     /* chi */
	{ 0x3c8, "ps\t" },     /* psi */
	{ 0x3c9, "o\t" },      /* omega */

	{ 0x1f73, "é\t"},     /* epsilon acute */

/*
 * Convert Roman numerals to conventional ASCII sequences.
 *
 * These, and many other things like them, can be pulled straight out of
 * the compatibility decompositions in UnicodeData. Ideally, they should be
 * generated by a script.
 *
 * NOTE(review): U+2170..U+217B are the SMALL ROMAN NUMERAL code points but
 * are spelled in capitals here, as before; confirm whether lowercase
 * output ("iii") is wanted instead.
 */

	{ 0x2170, "I\t" } ,     /* one */
	{ 0x2171, "II\t" } ,    /* two */
	{ 0x2172, "III\t" } ,   /* three */
	{ 0x2173, "IV\t" } ,    /* four */
	{ 0x2174, "V\t" } ,     /* five */
	{ 0x2175, "VI\t" } ,    /* six */
	{ 0x2176, "VII\t" } ,   /* seven */
	{ 0x2177, "VIII\t" } ,  /* eight */
	{ 0x2178, "IX\t" } ,    /* nine */
	{ 0x2179, "X\t" } ,     /* ten */
	{ 0x217a, "XI\t" } ,    /* eleven */
	{ 0x217b, "XII\t" } ,   /* twelve */
	{ 0, 0 },               /* end-of-table sentinel */
};

/*
 * Find the replacement list for the code point `a`.
 *
 * Walks the `things` table from the start until the sentinel entry whose
 * code point is 0.  Returns the matching entry's tab-separated alternatives
 * string, or NULL when the table has no entry for `a`.
 */
const char *get_alternatives(unicode_char_t a) {
	const struct thing *entry;

	for (entry = things; entry->ch != 0; entry++) {
		if (entry->ch == a) {
			return entry->alternatives;
		}
	}

	/* Reached the sentinel without a match. */
	return NULL;
}
  
const unsigned char *convert_utf8_to(const char *str, const char *to_what) {
	int tmplen = strlen(str)*4, strl = strlen(str);
	char *tmp = malloc(strlen(str)*4), *tmp2 = tmp;
	iconv_t i = unicode_iconv_open(to_what, "UTF-8");
	while (strl) {
	i_wish_c_had_multilevel_break:
		if (unicode_iconv(i, &str, &strl, &tmp2, &tmplen)==-1) {
			const char *s2;
			unicode_char_t result;
			char character_to_use = '?';
			if (unicode_get_utf8(str, &result)!=NULL) {

				char *m = get_alternatives(result);
				while (m && *m) {
					char *f = strchr(m, '\t');
					int l = f - m;
					if (!f) {
						break;
					}
					if (unicode_iconv(i, &m, &l, &tmp2, &tmplen)!=-1) {
						s2 = str;
						str = unicode_next_utf8(str);
						strl -= str - s2;
						goto i_wish_c_had_multilevel_break;
					} else {
					}
					m = f+1;
				}
			}
			
			s2 = str;
			str = unicode_next_utf8(str);
			strl -= str - s2;
			printf("Argh : dunno what to do with : U+%04x\n", result);
			
			*(tmp2++) = character_to_use;
			tmplen--;
		}
	}
	*tmp2 = 0;
	unicode_iconv_close(i);
	return tmp;
}

/*
 * Demo driver: converts a sample UTF-8 string containing U+2172 (small
 * Roman numeral three) to ISO-8859-1 and prints the result.
 *
 * Returns 0 on success, 1 if the conversion returned NULL.
 *
 * NOTE(review): libunicode may need unicode_init() to be called before any
 * other library function - confirm against the installed unicode.h.
 */
int
main (int argc, char**argv)
{
	const char *demo = "William Henry Gates ⅲ";
	const unsigned char *converted;

	(void)argc;
	(void)argv;

	converted = convert_utf8_to(demo, "ISO-8859-1");
	if (converted == NULL) {
		fprintf(stderr, "conversion to ISO-8859-1 failed\n");
		return 1;
	}

	printf("%s\n", (const char *)converted);

	/* The buffer comes from malloc() inside convert_utf8_to(). */
	free((void *)converted);

	return 0;
}
