From 6ed652d87201c190125877cf6912cd4f2e4269b5 Mon Sep 17 00:00:00 2001
From: Natanael Copa
Date: Wed, 22 Oct 2008 13:44:07 +0200
Subject: initial commit

---
 Makefile |  64 ++++++++++++++
 README   |   4 +
 TODO     |   2 +
 iconv.c  | 299 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 iconv.h  |  29 +++++++
 test.c   |  73 ++++++++++++++++
 6 files changed, 471 insertions(+)
 create mode 100644 Makefile
 create mode 100644 README
 create mode 100644 TODO
 create mode 100644 iconv.c
 create mode 100644 iconv.h
 create mode 100644 test.c

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..f22d82d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,64 @@
+NAME= uiconv
+VERSION= 0.2.3
+
+LIB= lib
+INCLUDE= include
+PREFIX= /usr/local
+DESTDIR=
+
+CFLAGS= -Wall -Werror -Os
+CFLAGS+= -std=gnu99 -fPIC
+
+TARGETS= libiconv.a
+
+DISTFILES= Makefile README TODO iconv.c iconv.h test.c
+
+.PHONY: all static shared clean test dist install
+
+all: static
+
+static: libiconv.a
+
+shared: libiconv.so
+
+libiconv.a: iconv.o
+	$(AR) -cr $@ $<
+
+libiconv.so.0: iconv.o
+	$(CC) $(LDFLAGS) -o $@ -shared $<
+
+libiconv.so: libiconv.so.0
+	ln -sf $< $@
+
+%.o: %.c
+	$(CC) $(CFLAGS) -c -I. $<
+
+clean:
+	rm -f *.a *.so *.so.0 *.o test-gnu test-uiconv
+
+test-gnu: test.c libiconv.a
+	$(CC) -o $@ $(CFLAGS) $(LDFLAGS) $<
+
+# Link libiconv.a itself: without it the test resolves iconv() from libc.
+test-uiconv: test.c libiconv.a
+	$(CC) -I. -o $@ $(CFLAGS) $(LDFLAGS) $< libiconv.a
+
+test: test-gnu test-uiconv
+	./test-uiconv ABC `printf 'a\303\270'` `printf 'a\370'`
+
+P=$(NAME)-$(VERSION)
+dist: $(P).tar.gz
+
+$(P).tar.gz: $(DISTFILES)
+	rm -rf $(P)
+	mkdir $(P)
+	cp $(DISTFILES) $(P)/
+	tar -zcf $@ $(P)
+	rm -rf $(P)
+
+install: libiconv.a
+	mkdir -p $(DESTDIR)$(PREFIX)/$(LIB)
+	mkdir -p $(DESTDIR)$(PREFIX)/$(INCLUDE)
+	cp libiconv.a $(DESTDIR)$(PREFIX)/$(LIB)
+	cp iconv.h $(DESTDIR)$(PREFIX)/$(INCLUDE)
+
diff --git a/README b/README
new file mode 100644
index 0000000..d7681d0
--- /dev/null
+++ b/README
@@ -0,0 +1,4 @@
+Minimal iconv implementation
+
+Handles ASCII, ISO-8859-1 and UTF-8 (characters of at most 2 bytes in length)
+
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..4ba3307
--- /dev/null
+++ b/TODO
@@ -0,0 +1,2 @@
+Support multiple names for the same codeset, e.g. UTF8 = UTF-8
+
diff --git a/iconv.c b/iconv.c
new file mode 100644
index 0000000..533da0a
--- /dev/null
+++ b/iconv.c
@@ -0,0 +1,299 @@
+/*
+ * Minimal iconv(3) replacement: converts between ASCII, ISO-8859-1 and
+ * the one- and two-byte forms of UTF-8 (code points up to U+07FF).
+ */
+
+#include <ctype.h>
+#include <errno.h>
+
+#define __need_size_t
+#include <stddef.h>
+#include <string.h>
+
+#include "iconv.h"
+
+#ifndef UNUSED
+#define UNUSED __attribute__ ((__unused__))
+#endif
+
+/* Longest byte sequence that any codec below consumes or produces. */
+#define MAX_CHAR_LEN 2
+
+static int decode_ascii(int *scalar, const unsigned char *inbuf, int count);
+static int decode_iso_8859_1(int *scalar, const unsigned char *inbuf,
+			     int count);
+static int decode_utf_8(int *scalar, const unsigned char *inbuf, int count);
+
+static int encode_ascii(int scalar, unsigned char *outbuf);
+static int encode_iso_8859_1(int scalar, unsigned char *outbuf);
+static int encode_utf_8(int scalar, unsigned char *outbuf);
+
+enum {
+	ASCII = 0,
+	UTF_8,
+	ISO_8859_1
+};
+
+/*
+ * One supported codeset.
+ *
+ * decode() reads one character from inbuf, of which count bytes are
+ * available, stores its scalar value and returns the number of bytes
+ * consumed, or -1 with errno set.  encode() stores the scalar in outbuf
+ * and returns the number of bytes written, or 0 when the codeset cannot
+ * represent it.
+ */
+struct converter {
+	int code;
+	const char *name;
+	int (*decode)(int *scalar, const unsigned char *inbuf, int count);
+	int (*encode)(int scalar, unsigned char *outbuf);
+};
+
+/* Codeset table; the entry with code == -1 terminates it. */
+static const struct converter supported_codesets[] = {
+	{
+		.code = ASCII,
+		.name = "ASCII",
+		.decode = &decode_ascii,
+		.encode = &encode_ascii,
+	}, {
+		.code = UTF_8,
+		.name = "UTF-8",
+		.decode = &decode_utf_8,
+		.encode = &encode_utf_8,
+	}, {
+		.code = ISO_8859_1,
+		.name = "ISO-8859-1",
+		.decode = &decode_iso_8859_1,
+		.encode = &encode_iso_8859_1
+	}, {
+		.code = -1,
+	}
+};
+
+/* Number of real entries in supported_codesets (terminator excluded). */
+#define NUM_CODESETS \
+	((int) (sizeof(supported_codesets) / sizeof(supported_codesets[0])) - 1)
+
+/* ASCII */
+static int decode_ascii(int *scalar, const unsigned char *inbuf,
+			int UNUSED count)
+{
+	if (*inbuf >= 0x80) {
+		errno = EILSEQ;
+		return -1;
+	}
+	*scalar = *inbuf;
+	return 1;
+}
+
+static int encode_ascii(int scalar, unsigned char *outbuf)
+{
+	if (scalar >= 0x80)
+		return 0;
+	*outbuf = scalar;
+	return 1;
+}
+
+/* ISO-8859-1 */
+static int decode_iso_8859_1(int *scalar, const unsigned char *inbuf,
+			     int UNUSED count)
+{
+	*scalar = *inbuf;
+	return 1;
+}
+
+static int encode_iso_8859_1(int scalar, unsigned char *outbuf)
+{
+	if (scalar > 0xff)
+		return 0;
+	*outbuf = scalar;
+	return 1;
+}
+
+
+/* UTF-8 */
+/* scalar value                          utf-8
+ * 00000000 00000000 0zzzzzzz            0zzzzzzz
+ * 00000000 00000yyy yyzzzzzz            110yyyyy 10zzzzzz
+ * 00000000 xxxxyyyy yyzzzzzz            1110xxxx 10yyyyyy 10zzzzzz
+ * 000wwwxx xxxxyyyy yyzzzzzz            11110www 10xxxxxx 10yyyyyy 10zzzzzz
+ *
+ * This implementation only supports the 2 first variants
+ */
+static int decode_utf_8(int *scalar, const unsigned char *inbuf, int count)
+{
+	int lead = inbuf[0];
+
+	if (lead < 0x80) {
+		*scalar = lead;
+		return 1;
+	}
+
+	/*
+	 * Only 110yyyyy lead bytes are supported.  0xc0 and 0xc1 can only
+	 * start overlong encodings of ASCII, so they are rejected as well.
+	 */
+	if ((lead & 0xe0) != 0xc0 || lead < 0xc2) {
+		errno = EILSEQ;
+		return -1;
+	}
+
+	/* Valid lead byte but the continuation byte is not available yet. */
+	if (count < 2) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	if ((inbuf[1] & 0xc0) != 0x80) {
+		errno = EILSEQ;
+		return -1;
+	}
+
+	/* All five payload bits of the lead byte belong to the scalar. */
+	*scalar = ((lead & 0x1f) << 6) | (inbuf[1] & 0x3f);
+	return 2;
+}
+
+static int encode_utf_8(int scalar, unsigned char *outbuf)
+{
+	if (scalar < 0x80) {
+		*outbuf = scalar;
+		return 1;
+	}
+
+	if (scalar > 0x7ff)
+		return 0;
+
+	outbuf[0] = (scalar >> 6) | 0xc0;
+	outbuf[1] = (scalar & 0x3f) | 0x80;
+	return 2;
+}
+
+/* Normalize a codeset name in place: upper case it and map '_' to '-'. */
+static void toupperstr(char *p)
+{
+	for (; p && *p; p++) {
+		if (*p == '_')
+			*p = '-';
+		else
+			/* <ctype.h> requires an unsigned char value. */
+			*p = toupper((unsigned char) *p);
+	}
+}
+
+/*
+ * Return the index in supported_codesets of the codeset called str
+ * (compared case-insensitively, '_' and '-' being equivalent), or -1.
+ */
+static int find_converter(const char *str)
+{
+	int i;
+	char buf[16];
+	size_t len = strlen(str);
+
+	/* Every supported name fits in buf, so a longer one cannot match. */
+	if (len >= sizeof(buf))
+		return -1;
+	memcpy(buf, str, len + 1);
+	toupperstr(buf);
+	for (i = 0; supported_codesets[i].code != -1; i++) {
+		if (strcmp(buf, supported_codesets[i].name) == 0)
+			return i;
+	}
+	return -1;
+}
+
+/*
+ * Return a descriptor for converting fromcode to tocode, or (iconv_t) -1
+ * with errno set to EINVAL when either name is NULL or unsupported.
+ */
+iconv_t iconv_open(const char *tocode, const char *fromcode)
+{
+	int src, dest;
+
+	if (tocode == NULL || fromcode == NULL) {
+		errno = EINVAL;
+		return (iconv_t) -1;
+	}
+
+	src = find_converter(fromcode);
+	dest = find_converter(tocode);
+	if (src == -1 || dest == -1) {
+		errno = EINVAL;
+		return (iconv_t) -1;
+	}
+
+	/* Target index in the high byte, source index in the low byte. */
+	return (iconv_t) ((dest << 8) | src);
+}
+
+/* Descriptors own no resources, so there is nothing to release. */
+int iconv_close(iconv_t UNUSED cd)
+{
+	return 0;
+}
+
+/*
+ * Convert the *insize bytes at *inbuf, storing at most *outsize bytes at
+ * *outbuf, and advance both pointer/size pairs past what was converted.
+ *
+ * Returns the number of characters that were dropped because the target
+ * codeset cannot represent them, or (size_t) -1 with errno set to EBADF
+ * (bad descriptor), EILSEQ (invalid input), EINVAL (truncated input) or
+ * E2BIG (output full).  Returns 0 without converting when inbuf, *inbuf,
+ * outbuf or *outbuf is NULL.
+ */
+size_t iconv(iconv_t cd, const char **inbuf, size_t *insize,
+	     char **outbuf, size_t *outsize)
+{
+	const struct converter *in, *out;
+	size_t nonidentical = 0;
+
+	/* Reject descriptors that do not index the codeset table. */
+	if (cd < 0 || (cd & 0xff) >= NUM_CODESETS || (cd >> 8) >= NUM_CODESETS) {
+		errno = EBADF;
+		return (size_t) -1;
+	}
+	in = &supported_codesets[cd & 0xff];
+	out = &supported_codesets[cd >> 8];
+
+	if (!inbuf || !*inbuf || !outbuf || !*outbuf)
+		return 0;
+
+	while (*insize) {
+		unsigned char encoded[MAX_CHAR_LEN];
+		int scalar, infwd, outfwd;
+		/* No decoder looks further ahead than MAX_CHAR_LEN bytes. */
+		int avail = *insize > MAX_CHAR_LEN ? MAX_CHAR_LEN : (int) *insize;
+
+		infwd = in->decode(&scalar, (const unsigned char *) *inbuf, avail);
+		if (infwd < 0)
+			return (size_t) -1;
+
+		/*
+		 * Encode into scratch space so nothing is stored in the
+		 * caller's buffer before the remaining room is checked.
+		 */
+		outfwd = out->encode(scalar, encoded);
+		if ((size_t) outfwd > *outsize) {
+			errno = E2BIG;
+			return (size_t) -1;
+		}
+		memcpy(*outbuf, encoded, outfwd);
+
+		/* Unrepresentable characters are dropped; count them. */
+		if (outfwd == 0)
+			nonidentical++;
+
+		*inbuf += infwd;
+		*insize -= infwd;
+		*outbuf += outfwd;
+		*outsize -= outfwd;
+	}
+	/* Terminate the output, room permitting, without advancing *outbuf. */
+	if (*outsize)
+		**outbuf = '\0';
+	return nonidentical;
+}
diff --git a/iconv.h b/iconv.h
new file mode 100644
index 0000000..42b3ea3
--- /dev/null
+++ b/iconv.h
@@ -0,0 +1,29 @@
+#ifndef ICONV_H
+#define ICONV_H
+
+#include <stddef.h>
+
+/*
+ * __THROW is supplied by the C library headers on glibc and uClibc.  Fall
+ * back to plain declarations when it has not been defined.
+ */
+#ifdef __THROW
+#define UICONV_THROW __THROW
+#else
+#define UICONV_THROW
+#endif
+
+/* Conversion descriptor; (iconv_t) -1 denotes an invalid descriptor. */
+typedef int iconv_t;
+
+/* Open a descriptor converting fromcode to tocode; (iconv_t) -1 on error. */
+iconv_t iconv_open(const char *tocode, const char *fromcode) UICONV_THROW;
+
+/* Convert *inbytesleft bytes at *inbuf into *outbuf; (size_t) -1 on error. */
+size_t iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
+	     char **outbuf, size_t *outbytesleft) UICONV_THROW;
+
+/* Release a descriptor; always returns 0. */
+int iconv_close(iconv_t cd) UICONV_THROW;
+
+#endif
diff --git a/test.c b/test.c
new file mode 100644
index 0000000..1cb8011
--- /dev/null
+++ b/test.c
@@ -0,0 +1,73 @@
+#include <errno.h>
+#include <libgen.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "iconv.h"
+
+/* Print the bytes of str as hex; an empty string is shown as "00". */
+static void print_hex(const char *str)
+{
+	printf("%.2x", (unsigned char) *str);
+	/* Do not step past the terminator of an empty string. */
+	if (*str == '\0')
+		return;
+	for (str++; *str; str++)
+		printf(" %.2x", (unsigned char) *str);
+}
+
+/* Convert str with cd and print input, output and any error. */
+static void convert_string(iconv_t cd, const char *str)
+{
+	char outbuf[256];
+	char *outptr = outbuf;
+	const char *inptr = str;
+	/* Hold one byte back so outbuf always stays NUL-terminated. */
+	size_t outsize = sizeof(outbuf) - 1;
+	size_t insize = strlen(str);
+	int err = 0;
+
+	memset(outbuf, 0, sizeof(outbuf));
+	print_hex(str);
+	printf(" -> ");
+	/* iconv() returns size_t, so compare with (size_t) -1, not < 0. */
+	if (iconv(cd, &inptr, &insize, &outptr, &outsize) == (size_t) -1)
+		err = errno;
+	print_hex(outbuf);
+	if (err)
+		printf(" (%s)", strerror(err));
+	printf("\n");
+}
+
+/* Convert every command line argument from codeset from to codeset to. */
+static void convert_args(const char *from, const char *to, int argc,
+			 char **argv)
+{
+	iconv_t cd;
+	int i;
+
+	printf("\n>>> %s: %s -> %s\n", basename(argv[0]), from, to);
+	cd = iconv_open(to, from);
+	if (cd == (iconv_t) -1) {
+		printf("iconv_open(\"%s\", \"%s\"): %s\n", to, from, strerror(errno));
+		return;
+	}
+
+	for (i = 1; i < argc; i++)
+		convert_string(cd, argv[i]);
+
+	iconv_close(cd);
+}
+
+/* Try every pairing of the known codesets plus one invalid name. */
+int main(int argc, char **argv)
+{
+	const char *codesets[] = { "ASCII", "ISO-8859-1", "UTF-8", "invalid", NULL };
+	const char **from, **to;
+
+	for (from = codesets; *from; from++)
+		for (to = codesets; *to; to++)
+			convert_args(*from, *to, argc, argv);
+
+	return 0;
+}
-- 
cgit v1.2.3