diff options
Diffstat (limited to 'libc/string/_collate.c')
| -rw-r--r-- | libc/string/_collate.c | 686 | 
1 files changed, 686 insertions, 0 deletions
diff --git a/libc/string/_collate.c b/libc/string/_collate.c new file mode 100644 index 000000000..35fe7dc1b --- /dev/null +++ b/libc/string/_collate.c @@ -0,0 +1,686 @@ +/* + * Copyright (C) 2002     Manuel Novoa III + * Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org> + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. + */ + +/*  Dec 20, 2002 + *  Initial test implementation of strcoll, strxfrm, wcscoll, and wcsxfrm. + *  The code needs to be cleaned up a good bit, but I'd like to see people + *  test it out. + * + */ + +#include "_string.h" +#include <ctype.h> +#include <locale.h> +#include <stdlib.h> +#include <errno.h> +#include <assert.h> + +extern size_t __strlcpy(char *__restrict dst, const char *__restrict src, +                      size_t n) attribute_hidden; + +#ifdef WANT_WIDE +extern int __wcscmp (__const wchar_t *__s1, __const wchar_t *__s2) attribute_hidden; +extern size_t __wcsxfrm (wchar_t *__restrict __s1, +		       __const wchar_t *__restrict __s2, size_t __n) attribute_hidden; +#endif +#ifdef __UCLIBC_HAS_XLOCALE__ +extern int __strcoll_l (__const char *__s1, __const char *__s2, __locale_t __l) attribute_hidden; +extern size_t __strxfrm_l (char *__dest, __const char *__src, size_t __n, __locale_t __l) attribute_hidden; +extern int __wcscoll_l (__const wchar_t *__s1, __const wchar_t *__s2, __locale_t __loc) attribute_hidden; +extern size_t __wcsxfrm_l (wchar_t *__s1, __const wchar_t *__s2, size_t __n, __locale_t __loc) attribute_hidden; +#endif + +#ifdef __UCLIBC_HAS_LOCALE__ +#if defined(L_strxfrm) || defined(L_strxfrm_l) || defined(L_wcsxfrm) || defined(L_wcsxfrm_l) + +#ifdef L_strxfrm +#ifndef WANT_WIDE +#error WANT_WIDE should be defined for L_strxfrm +#endif +#ifdef L_wcsxfrm +#error L_wcsxfrm already defined for L_strxfrm +#endif +#endif /* L_strxfrm */ + +#if defined(L_strxfrm) || defined(L_strxfrm_l) + +#define wcscoll   strcoll +#define __wcscoll   __strcoll +#define wcscoll_l strcoll_l +#define __wcscoll_l __strcoll_l +#define wcsxfrm   strxfrm +#define __wcsxfrm   __strxfrm +#define wcsxfrm_l strxfrm_l +#define __wcsxfrm_l __strxfrm_l + +#undef WANT_WIDE +#undef Wvoid +#undef Wchar +#undef Wuchar +#undef Wint + +#define Wchar char + +#endif /* defined(L_strxfrm) || defined(L_strxfrm_l) */ + +#if defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) + +int attribute_hidden __wcscoll (const Wchar *s0, const Wchar *s1) +{ +	return __wcscoll_l(s0, s1, __UCLIBC_CURLOCALE ); +} +strong_alias(__wcscoll,wcscoll) + +size_t attribute_hidden __wcsxfrm(Wchar *__restrict ws1, const Wchar *__restrict ws2, size_t n) +{ +	return __wcsxfrm_l(ws1, ws2, n, __UCLIBC_CURLOCALE ); +} +strong_alias(__wcsxfrm,wcsxfrm) + +#else  /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */ + + +#if 0 +#define CUR_COLLATE (&__UCLIBC_CURLOCALE_DATA.collate) +#else +#define CUR_COLLATE (& __LOCALE_PTR->collate) +#endif + +#define MAX_PENDING 8 + +typedef struct { +	const Wchar *s; +	const Wchar *eob;			/* end of backward */ + +	__uwchar_t weight; +	__uwchar_t ui_weight;		/* undefined or invalid */ +	int colitem; +	int weightidx; +	int rule; +	size_t position; +	/* should be wchar_t.  if wchar < 0 do EILSEQ? */ +	__uwchar_t *cip; +	__uwchar_t ci_pending[MAX_PENDING];	/* nul-terminated */ + +	char *back_buf; +	char *bbe;					/* end of back_buf (actual last... not 1 past end) */ +	char *bp;					/* ptr into backbuf, NULL if not in backward mode */ +	char ibb[128]; +	size_t bb_size; + +	int ru_pushed; +} col_state_t; + + +#define WEIGHT_MASK	0x3fffU +#define RULE_MASK	0xc000U + +#define RULE_FORWARD  (1 << 14) +#define RULE_POSITION (1 << 15) + +#define UI_IDX		(WEIGHT_MASK-6) +#define POSIT_IDX	(WEIGHT_MASK-5) +#define RANGE_IDX	(WEIGHT_MASK-4) +#define UNDEF_IDX	(WEIGHT_MASK-3) +#define INVAL_IDX	(WEIGHT_MASK-2) +#define DITTO_IDX   (WEIGHT_MASK-1) + + +#undef TRACE +#if 0 +#define TRACE(X)	printf X +#else +#define TRACE(X)	((void)0) +#endif + +static int lookup(wchar_t wc   __LOCALE_PARAM ) +{ +	unsigned int sc, n, i0, i1; + +	if (((__uwchar_t) wc) > 0xffffU) { +		return 0; +	} + +	sc = wc & CUR_COLLATE->ti_mask; +	wc >>= CUR_COLLATE->ti_shift; +	n = wc & CUR_COLLATE->ii_mask; +	wc >>= CUR_COLLATE->ii_shift; + +	i0 = CUR_COLLATE->wcs2colidt_tbl[wc]; +	i0 <<= CUR_COLLATE->ii_shift; +	i1 = CUR_COLLATE->wcs2colidt_tbl[CUR_COLLATE->ii_len + i0 + n]; +	i1 <<= CUR_COLLATE->ti_shift; +	return CUR_COLLATE->wcs2colidt_tbl[CUR_COLLATE->ii_len + CUR_COLLATE->ti_len + i1 + sc]; + +} + +static void init_col_state(col_state_t *cs, const Wchar *wcs) +{ +	__memset(cs, 0, sizeof(col_state_t)); +	cs->s = wcs; +	cs->bp = cs->back_buf = cs->ibb; +	cs->bb_size = 128; +	cs->bbe = cs->back_buf + (cs->bb_size -1); +} + +static void next_weight(col_state_t *cs, int pass   __LOCALE_PARAM ) +{ +	int r, w, ru, ri, popping_backup_stack; +	ssize_t n; +	const uint16_t *p; +#ifdef WANT_WIDE +#define WC (*cs->s) +#define N (1) +#else  /* WANT_WIDE */ +	wchar_t WC; +	size_t n0, nx; +#define N n0 + +#endif /* WANT_WIDE */ + +	do { + +		if (cs->ru_pushed) { +			ru = cs->ru_pushed; +			TRACE(("ru_pushed = %d\n", ru)); +			cs->ru_pushed = 0; +			goto POSITION_SKIP; +		} + +#ifdef __UCLIBC_MJN3_ONLY__ +#warning should we walk pendings backwards? +#endif +		if (cs->cip) {			/* possible pending weight */ +			if ((r = *(cs->cip++)) == 0) { +				cs->cip = NULL; +				continue; +			} +			cs->weightidx = r & WEIGHT_MASK; +			assert(cs->weightidx); +/* 			assert(cs->weightidx != WEIGHT_MASK); */ +		} else {				/* get the next collation item from the string */ +			TRACE(("clearing popping flag\n")); +			popping_backup_stack = 0; + +		IGNORE_LOOP: +			/* keep first pos as 0 for a sentinal */ +			if (*cs->bp) {				/* pending backward chars */ +			POP_BACKUP: +				popping_backup_stack = 1; +				TRACE(("setting popping flag\n")); +				n = 0; +				if (*cs->bp > 0) {		/* singles pending */ +					cs->s -= 1; +					if ((*cs->bp -= 1) == 0) { +						cs->bp -= 1; +					} +				} else {				/* last was a multi */ +					cs->s += *cs->bp; +					cs->bp -= 1; +				} +			} else if (!*cs->s) { /* not in backward mode and end of string */ +				cs->weight = 0; +				return; +			} else { +				cs->position += 1; +			} + +		BACK_LOOP: +#ifdef WANT_WIDE +			n = 1; +			cs->colitem = r = lookup(*cs->s   __LOCALE_ARG ); +#else  /* WANT_WIDE */ +			n = n0 = __locale_mbrtowc_l(&WC, cs->s, __LOCALE_PTR); +			if (n < 0) { +				__set_errno(EILSEQ); +				cs->weight = 0; +				return; +			} +			cs->colitem = r = lookup(WC   __LOCALE_ARG ); +#endif /* WANT_WIDE */ + +			TRACE((" r=%d WC=%#lx\n", r, (unsigned long)(WC))); + +			if (r > CUR_COLLATE->max_col_index) { /* starting char for one or more sequences */ +				p = CUR_COLLATE->multistart_tbl; +				p += p[r-CUR_COLLATE->max_col_index -1]; +				do { +					n = N; +					r = *p++; +					do { +						if (!*p) {		/* found it */ +							cs->colitem = r; +							TRACE(("    found multi %d\n", n)); +							goto FOUND; +						} +#ifdef WANT_WIDE +						/* the lookup check here is safe since we're assured that *p is a valid colidx */ +						if (!cs->s[n] || (lookup(cs->s[n]   __LOCALE_ARG ) != *p)) { +							do {} while (*p++); +							break; +						} +						++p; +						++n; +#else  /* WANT_WIDE */ +						if (cs->s[n]) { +							nx = __locale_mbrtowc_l(&WC, cs->s + n, __LOCALE_PTR); +							if (nx < 0) { +								__set_errno(EILSEQ); +								cs->weight = 0; +								return; +							} +						} +						if (!cs->s[n] || (lookup(WC   __LOCALE_ARG ) != *p)) { +							do {} while (*p++); +							break; +						} +						++p; +						n += nx; /* Only gets here if cs->s[n] != 0, so nx is set. */ +#endif /* WANT_WIDE */ +					} while (1); +				} while (1); +			} else if (r == 0) {		/* illegal, undefined, or part of a range */ +				if ((CUR_COLLATE->range_count) +#ifdef __UCLIBC_MJN3_ONLY__ +#warning .. need to introduce range as a collating item? +#endif +					&& (((__uwchar_t)(WC - CUR_COLLATE->range_low)) <= CUR_COLLATE->range_count) +					) {					/* part of a range */ +					/* Note: cs->colitem = 0 already. */ +					TRACE(("    found range\n")); +					ru = CUR_COLLATE->ruletable[CUR_COLLATE->range_rule_offset*CUR_COLLATE->MAX_WEIGHTS + pass]; +					assert((ru & WEIGHT_MASK) != DITTO_IDX); +					if ((ru & WEIGHT_MASK) == WEIGHT_MASK) { +						ru = (ru & RULE_MASK) | RANGE_IDX; +						cs->weight = CUR_COLLATE->range_base_weight + (WC - CUR_COLLATE->range_low); +					} +					goto RANGE_SKIP_TO; +				} else if (((__uwchar_t)(WC)) <= 0x7fffffffUL) { /* legal but undefined */ +				UNDEFINED: +					/* Note: cs->colitem = 0 already. */ +					ri = CUR_COLLATE->undefined_idx; +					assert(ri != 0); /* implicit undefined isn't supported */ + +					TRACE(("    found explicit UNDEFINED\n")); +#ifdef __UCLIBC_MJN3_ONLY__ +#warning right now single weight locales do not support .. +#endif +					if (CUR_COLLATE->num_weights == 1) { +						TRACE(("    single weight UNDEFINED\n")); +						cs->weightidx = RANGE_IDX; +						cs->weight = ri; +						cs->s += n; +						goto PROCESS_WEIGHT; +					} + +					ri = CUR_COLLATE->index2ruleidx[ri - 1]; +					ru = CUR_COLLATE->ruletable[ri * CUR_COLLATE->MAX_WEIGHTS + pass]; +					assert((ru & WEIGHT_MASK) != WEIGHT_MASK); /* TODO: handle ".." */ +					if ((ru & WEIGHT_MASK) == DITTO_IDX) { +						cs->colitem = CUR_COLLATE->undefined_idx; +					} +					goto RANGE_SKIP_TO; +				} else {		/* illegal */ +					TRACE(("    found illegal\n")); +					__set_errno(EINVAL); +					/* We put all illegals in the same equiv class with maximal weight, +					 * and ignore them after the first pass. */ +					if (pass > 0) { +						cs->s += n; +						goto IGNORE_LOOP; +					} +					ru = (RULE_FORWARD | RANGE_IDX); +					cs->weight = 0xffffU; +					goto RANGE_SKIP_TO; +				} +			} else if (CUR_COLLATE->num_weights == 1) { +				TRACE(("    single weight\n")); +				cs->weightidx = RANGE_IDX; +				cs->weight = cs->colitem; +				cs->s += n; +				goto PROCESS_WEIGHT; +			} else { +				TRACE(("    normal\n")); +			} + +			/* if we get here, it is a normal char either singlely weighted, undefined, or in a range */ +		FOUND: +			ri = CUR_COLLATE->index2ruleidx[cs->colitem - 1]; +			TRACE((" ri=%d ", ri)); +#ifdef __UCLIBC_MJN3_ONLY__ +#warning make sure this is correct +#endif +			if (!ri) { +				TRACE(("NOT IN THIS LOCALE\n")); +				goto UNDEFINED; +			} +			ru = CUR_COLLATE->ruletable[ri * CUR_COLLATE->MAX_WEIGHTS + pass]; + +		RANGE_SKIP_TO: + +#ifdef __UCLIBC_MJN3_ONLY__ +#warning ignoreables probably should not interrupt backwards processing, but this is wrong +#endif +/* 			if (!(ru & WEIGHT_MASK)) { */ +/* 				TRACE(("IGNORE\n")); */ +/* 				cs->s += n; */ +/* 				continue; */ +/* 			} */ + + +			TRACE((" rule = %#x  weight = %#x  popping = %d  s = %p  eob = %p\n", +				   ru & RULE_MASK, ru & WEIGHT_MASK, popping_backup_stack, +				   cs->s, cs->eob)); +			/* now we need to check if we're going backwards... */ + +			if (!popping_backup_stack) { +				if (!(ru & RULE_MASK)) { /* backward */ +					TRACE(("backwards\n")); +					assert(cs->bp <= cs->bbe); +					if (cs->bp == cs->bbe) { +						if (cs->back_buf == cs->ibb) { /* was using internal buffer */ +							cs->bp = malloc(cs->bb_size + 128); +							if (!cs->bp) { +								__set_errno(ENOMEM); +#ifdef __UCLIBC_MJN3_ONLY__ +#warning what to do here? +#endif +								cs->weight = 0; +								return; +							} +							__memcpy(cs->bp, cs->back_buf, cs->bb_size); + +						} else { +							cs->bp = realloc(cs->back_buf, cs->bb_size + 128); +							if (!cs->bp) { +								__set_errno(ENOMEM); +#ifdef __UCLIBC_MJN3_ONLY__ +#warning what to do here? +#endif +								cs->weight = 0; +								return; +							} +						} +						cs->bb_size += 128; +						cs->bbe = cs->bp + (cs->bbe - cs->back_buf); +						cs->back_buf = cs->bp; +						cs->bp = cs->bbe; + +					} +					if (n==1) {			/* single char */ +						if (*cs->bp && (((unsigned char)(*cs->bp)) < CHAR_MAX)) { +							*cs->bp += 1; /* increment last single's count */ +						} else {	  /* last was a multi, or just starting */ +							if (!cs->bp) { +								cs->bp = cs->back_buf; +							} else { +								assert(cs->bp < cs->bbe); +								++cs->bp; +							} +							*cs->bp = 1; +						} +					} else {			/* multichar */ +						assert(n>1); +						assert(cs->bp < cs->bbe); +						*++cs->bp = -n; +					} +					cs->s += n; +					if (*cs->s) { +						goto BACK_LOOP; +					} +					/* end-of-string so start popping */ +					cs->eob = cs->s; +					TRACE(("popping\n")); +					goto POP_BACKUP; +				} else if (*cs->bp) { /* was going backward but this element isn't */ +					/* discard current and use previous backward element */ +					assert(!cs->cip); +					cs->eob = cs->s; +					TRACE(("popping\n")); +					goto POP_BACKUP; +				} else {				/* was and still going forward */ +					TRACE(("forwards\n")); +					if ((ru & (RULE_POSITION|WEIGHT_MASK)) > RULE_POSITION) { +						assert(ru & WEIGHT_MASK); +						cs->ru_pushed = ru; +						cs->weight = cs->position; +#ifdef __UCLIBC_MJN3_ONLY__ +#warning devel code +#endif +						cs->position = 0;	/* reset to reduce size for strcoll? */ +						cs->s += n; +						cs->weightidx = RANGE_IDX; +						goto PROCESS_WEIGHT; +					} +				} +			} else {					/* popping backwards stack */ +				TRACE(("popping (continued)\n")); +				if (!*cs->bp) { +					cs->s = cs->eob; +				} +				cs->s -= n; +			} + +			cs->s += n; +		POSITION_SKIP: +			cs->weightidx = ru & WEIGHT_MASK; +			cs->rule = ru & RULE_MASK; +		} + +#ifdef __UCLIBC_MJN3_ONLY__ +#warning for pending we only want the weight... _not_ the rule +#endif +		if (!cs->weightidx) {	/* ignore */ +			continue; +		} + +	PROCESS_WEIGHT: +		assert(cs->weightidx); + + +		if (((unsigned int)(cs->weightidx - UI_IDX)) <= (INVAL_IDX-UI_IDX)) { +			if (cs->weightidx == UI_IDX) { +				cs->weight = cs->ui_weight; +			} +			return; +		} + +		assert(cs->weightidx != WEIGHT_MASK); +		if (cs->weightidx == DITTO_IDX) { /* want the weight of the current collating item */ +			TRACE(("doing ditto\n")); +			w = CUR_COLLATE->index2weight[cs->colitem -1]; +		} else if (cs->weightidx <= CUR_COLLATE->max_col_index) { /* normal */ +			TRACE(("doing normal\n")); +			w = CUR_COLLATE->index2weight[cs->weightidx -1]; +		} else {				/* a string */ +			TRACE(("doing string\n")); +			assert(!(cs->weightidx & RULE_MASK)); +			/* note: iso14561 allows null string here */ +			p = CUR_COLLATE->weightstr + (cs->weightidx - (CUR_COLLATE->max_col_index + 2)); +			if (*p & WEIGHT_MASK) { +				r = 0; +				do { +					assert(r < MAX_PENDING); +					cs->ci_pending[r++] = *p++; +				} while (*p & WEIGHT_MASK); +				cs->cip = cs->ci_pending; +			} +			continue; +		} + +		cs->weight = w; +		return; +	} while (1); +} + +int attribute_hidden __UCXL(wcscoll) (const Wchar *s0, const Wchar *s1   __LOCALE_PARAM ) +{ +	col_state_t ws[2]; +	int pass; + +	if (!CUR_COLLATE->num_weights) { /* C locale */ +#ifdef WANT_WIDE +		return __wcscmp(s0, s1); +#else  /* WANT_WIDE */ +		return __strcmp(s0, s1); +#endif /* WANT_WIDE */ +	} + +	pass = 0; +	do {						/* loop through the weights levels */ +		init_col_state(ws, s0); +		init_col_state(ws+1, s1); +		do {					/* loop through the strings */ +			/* for each string, get the next weight */ +			next_weight(ws, pass   __LOCALE_ARG ); +			next_weight(ws+1, pass   __LOCALE_ARG ); +			TRACE(("w0=%lu  w1=%lu\n", +				   (unsigned long) ws[0].weight, +				   (unsigned long) ws[1].weight)); + +			if (ws[0].weight != ws[1].weight) { +				return ws[0].weight - ws[1].weight; +			} +		} while (ws[0].weight); +	} while (++pass < CUR_COLLATE->num_weights); + +	return 0; +} +__UCXL_ALIAS(wcscoll) + +#ifdef WANT_WIDE + +size_t attribute_hidden __UCXL(wcsxfrm)(wchar_t *__restrict ws1, const wchar_t *__restrict ws2, +					 size_t n   __LOCALE_PARAM ) +{ +	col_state_t cs; +	size_t count; +	int pass; + +	if (!CUR_COLLATE->num_weights) { /* C locale */ +		return __wcsxfrm(ws1, ws2, n); +	} + +#ifdef __UCLIBC_MJN3_ONLY__ +#warning handle empty string as a special case +#endif + +	count = pass = 0; +	do {						/* loop through the weights levels */ +		init_col_state(&cs, ws2); +		do {					/* loop through the string */ +			next_weight(&cs, pass   __LOCALE_ARG ); +			TRACE(("weight=%lu (%#lx)\n", (unsigned long) cs.weight, (unsigned long) cs.weight)); +			if (count < n) { +				ws1[count] = cs.weight +1; +			} +			++count; +			TRACE(("--------------------------------------------\n")); +		} while (cs.weight); +		if (count <= n) {		/* overwrite the trailing 0 end-of-pass marker */ +			ws1[count-1] = 1; +		} +		TRACE(("--------------------  pass %d  --------------------\n", pass)); +	} while (++pass < CUR_COLLATE->num_weights); +	if (count <= n) {			/* oops... change it back */ +		ws1[count-1] = 0; +	} +	return count-1; +} + +__UCXL_ALIAS(wcsxfrm) + +#else  /* WANT_WIDE */ + +static const unsigned long bound[] = { +	1UL << 7, +	1UL << 11, +	1UL << 16, +	1UL << 21, +	1UL << 26, +}; + +static unsigned char first[] = { +	0x0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc +}; + +/* Use an extension of UTF-8 to store a 32 bit val in max 6 bytes. */ + +static size_t store(unsigned char *s, size_t count, size_t n, __uwchar_t weight) +{ +	int i, r; + +	i = 0; +	do { +		if (weight < bound[i]) { +			break; +		} +	} while (++i < sizeof(bound)/sizeof(bound[0])); + +	r = i+1; +	if (i + count < n) { +		s += count; +		s[0] = first[i]; +		while (i) { +			s[i] = 0x80 | (weight & 0x3f); +			weight >>= 6; +			--i; +		} +		s[0] |= weight; +	} + +	return r; +} + +size_t attribute_hidden __UCXL(strxfrm)(char *__restrict ws1, const char *__restrict ws2, size_t n +					 __LOCALE_PARAM ) +{ +	col_state_t cs; +	size_t count, inc; +	int pass; + +	if (!CUR_COLLATE->num_weights) { /* C locale */ +		return __strlcpy(ws1, ws2, n); +	} + +#ifdef __UCLIBC_MJN3_ONLY__ +#warning handle empty string as a special case +#endif + +	inc = count = pass = 0; +	do {						/* loop through the weights levels */ +		init_col_state(&cs, ws2); +		do {					/* loop through the string */ +			next_weight(&cs, pass   __LOCALE_ARG ); +			TRACE(("weight=%lu (%#lx)\n", (unsigned long) cs.weight, (unsigned long) cs.weight)); +			inc = store((unsigned char *)ws1, count, n, cs.weight + 1); +			count += inc; +			TRACE(("--------------------------------------------\n")); +		} while (cs.weight); +		/* overwrite the trailing 0 end-of-pass marker */ +		assert(inc == 1); +		if (count <= n) { +			ws1[count-1] = 1; +		} +		TRACE(("--------------------  pass %d  --------------------\n", pass)); +	} while (++pass < CUR_COLLATE->num_weights); +	if (count <= n) {			/* oops... change it back */ +		ws1[count-1] = 0; +	} +	return count-1; +} + +__UCXL_ALIAS(strxfrm) + +#endif /* WANT_WIDE */ + +#endif /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */ + +#endif /* defined(L_strxfrm) || defined(L_strxfrm_l) || defined(L_wcsxfrm) || defined(L_wcsxfrm_l) */ + +#endif /* __UCLIBC_HAS_LOCALE__ */ +/**********************************************************************/  | 
