/*  Part of SWI-Prolog

    Author:        Jan Wielemaker and Anjo Anjewierden
    E-mail:        J.Wielemaker@vu.nl
    WWW:           http://www.swi-prolog.org
    Copyright (c)  2011-2023, University of Amsterdam
                              VU University Amsterdam
                              SWI-Prolog Solutions b.v.
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.

    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in
       the documentation and/or other materials provided with the
       distribution.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.
*/

#include "pl-incl.h"
#include "pl-arith.h"
#include "pl-ctype.h"
#include "pl-utf8.h"
#include "../pl-codelist.h"
#include "../pl-write.h"
#include "../pl-prims.h"
/* System headers for names this file uses directly: errno/EINVAL,
   sprintf() and log().
*/
#include <errno.h>
#include <stdio.h>
#include <math.h>
#if HAVE_LIMITS_H
#include <limits.h>                     /* solaris compatibility */
#endif

#undef LD
#define LD LOCAL_LD

static int      text_representation_error(PL_chars_t *text, IOENC enc);
static int      text_error(PL_chars_t *text, int rc);


                 /*******************************
                 *      UNIFIED TEXT STUFF      *
                 *******************************/

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
bufsize_text() returns the number of bytes needed to store `len` units of
`text`.  A unit is sizeof(pl_wchar_t) bytes for ENC_WCHAR and one byte for
ENC_ISO_LATIN_1, ENC_ASCII, ENC_UTF8, ENC_UTF16BE, ENC_UTF16LE and
ENC_ANSI.  Any other encoding trips the assertion.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

static inline size_t
bufsize_text(PL_chars_t *text, size_t len)
{ size_t unit;

  switch(text->encoding)
  { case ENC_ISO_LATIN_1:
    case ENC_ASCII:
    case ENC_UTF8:
    case ENC_UTF16BE:
    case ENC_UTF16LE:
    case ENC_ANSI:
      unit = sizeof(char);
      break;
    case ENC_WCHAR:
      unit = sizeof(pl_wchar_t);
      break;
    default:
      assert(0);
      unit = sizeof(char);
      /*NOTREACHED*/
  }

  return len*unit;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_save_text() copies the text, including its terminator, to storage
selected by `flags` and the current text->storage:

  - BUF_MALLOC requested and the text is not yet PL_CHARS_MALLOC: copy to
    PL_malloc()ed memory and mark the text PL_CHARS_MALLOC.
  - Otherwise, PL_CHARS_LOCAL text is copied to a BUF_STACK buffer and
    marked PL_CHARS_RING.
  - Otherwise, PL_CHARS_MALLOC text is copied to a BUF_STACK buffer, the
    malloc()ed copy is released and the text is marked PL_CHARS_RING.
  - Any other storage is left untouched.

Returns TRUE, or FALSE if PL_malloc() returned NULL.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_save_text(PL_chars_t *text, int flags)
{ if ( (flags & BUF_MALLOC) && text->storage != PL_CHARS_MALLOC )
  { size_t bl = bufsize_text(text, text->length+1);
    void *new = PL_malloc(bl);

    if ( new )
    { memcpy(new, text->text.t, bl);
      text->text.t = new;
      text->storage = PL_CHARS_MALLOC;
    } else
    { return FALSE;
    }
  } else if ( text->storage == PL_CHARS_LOCAL )
  { Buffer b = findBuffer(BUF_STACK);
    size_t bl = bufsize_text(text, text->length+1);

    addMultipleBuffer(b, text->text.t, bl, char);
    text->text.t = baseBuffer(b, char);
    text->storage = PL_CHARS_RING;
  } else if ( text->storage == PL_CHARS_MALLOC )
  { Buffer b = findBuffer(BUF_STACK);
    size_t bl = bufsize_text(text, text->length+1);

    addMultipleBuffer(b, text->text.t, bl, char);
    PL_free_text(text);                 /* release the malloc()ed original */
    text->text.t = baseBuffer(b, char);
    text->storage = PL_CHARS_RING;
  }

  return TRUE;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_from_stack_text() moves a string from  the   stack,  so  it won't get
corrupted if GC/shift comes along.

Nothing happens if `flags` holds BUF_ALLOW_STACK or the text is not
PL_CHARS_STACK.  Otherwise the text (with terminator) is copied into
text->buf if it fits (storage becomes PL_CHARS_LOCAL) or into a BUF_STACK
buffer (storage becomes PL_CHARS_RING).  Always returns TRUE.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

static int
PL_from_stack_text(PL_chars_t *text, int flags)
{ if ( !(flags&BUF_ALLOW_STACK) )
  { if ( text->storage == PL_CHARS_STACK )
    { size_t bl = bufsize_text(text, text->length+1);

      if ( bl < sizeof(text->buf) )
      { memcpy(text->buf, text->text.t, bl);
        text->text.t = text->buf;
        text->storage = PL_CHARS_LOCAL;
      } else
      { Buffer b = findBuffer(BUF_STACK);

        addMultipleBuffer(b, text->text.t, bl, char);
        text->text.t = baseBuffer(b, char);
        text->storage = PL_CHARS_RING;
      }
    }
  }

  return TRUE;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_text_length() returns the length of a canonical text.  If wchar_t is
two bytes and the text is ENC_WCHAR, a unit for which IS_UTF16_LEAD()
holds is counted together with the unit that follows it.  In all other
cases the result is text->length.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

size_t
PL_text_length(const PL_chars_t *text)
{ assert(text->canonical);

#if SIZEOF_WCHAR_T == 2
  if ( text->encoding == ENC_WCHAR )
  { const wchar_t *s = (const wchar_t *)text->text.t;
    const wchar_t *e = &s[text->length];
    size_t count = 0;

    while(s < e)
    { int c = *s++;

      count++;
      if ( IS_UTF16_LEAD(c) )
        s++;                            /* skip the unit after the lead */
    }

    return count;
  }
#endif

  return text->length;
}


#define INT64_DIGITS 20

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ui64toa() writes `val` in `base` to `out`, followed by a 0-byte, and
returns a pointer to that 0-byte.  The scratch buffer holds INT64_DIGITS
digits and the digit table has 16 entries, so `base` must be in the range
10..16.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

static char *
ui64toa(uint64_t val, char *out, int base)
{ char tmpBuf[INT64_DIGITS + 1];
  char *ptrOrg = tmpBuf + INT64_DIGITS;
  char *ptr = ptrOrg;
  size_t nbDigs;
  static const char digits[] = "0123456789abcdef";

  assert(base >= 10 && base <= 16);     /* see tmpBuf[] and digits[] */

  do
  { int rem = val % base;

    *--ptr = digits[rem];               /* digits are produced last-first */
    val /= base;
  } while ( val );

  nbDigs = ptrOrg - ptr;
  memcpy(out, ptr, nbDigs);
  out += nbDigs;
  *out = '\0';

  return out;                           /* points to the END */
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
i64toa() is the signed variant of ui64toa(): a negative value is written
as '-' followed by its magnitude.  The negation is performed on the
unsigned representation.  Returns a pointer to the terminating 0-byte.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

static char *
i64toa(int64_t val, char *out, int base)
{ if ( val < 0 )
  { *out++ = '-';
    val = -(uint64_t)val;
  }

  return ui64toa((uint64_t)val, out, base);
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_get_text() fills `text` from the term `l`.  The CVT_* bits in `flags`
select which kinds of term are accepted.  The branches are tried in this
order:

  - CVT_ATOM and an atom.  If the atom is nil and CVT_LIST is also set,
    it is handled by the list branch instead.
  - CVT_STRING and a string.  The text is moved off the stack unless
    BUF_ALLOW_STACK is set.
  - CVT_RATIONAL/CVT_INTEGER and a rational/integer.  The number is
    printed in base 16 if all CVT_XINTEGER bits are set and base 10
    otherwise.  Rationals are written as <num>r<den>.
  - CVT_FLOAT and a float, using format_float().
  - CVT_LIST: a code or character list, first tried as 8-bit and then,
    if the status is CVT_wide, as wide characters.
  - CVT_VARIABLE and an unbound variable, using varName().
  - CVT_WRITE, CVT_WRITE_CANONICAL or CVT_WRITEQ: the term is written to
    a memory stream.  With REP_UTF8 the stream is UTF-8; otherwise
    ISO Latin-1 is tried first and ENC_WCHAR second.

Returns TRUE on success, 2 if CVT_VARNOFAIL is set and the term is
unbound or a partial list, and otherwise FALSE or, with CVT_EXCEPTION,
the result of PL_error()/PL_representation_error().
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_get_text(DECL_LD term_t l, PL_chars_t *text, int flags)
{ word w = valHandle(l);

  if ( (flags & CVT_ATOM) && isAtom(w) )
  { if ( isNil(w) && (flags&CVT_LIST) )
      goto case_list;
    if ( !get_atom_text(w, text) )
      goto maybe_write;
  } else if ( (flags & CVT_STRING) && isString(w) )
  { if ( !get_string_text(w, text) )
      goto maybe_write;
    if ( !PL_from_stack_text(text, flags) )
      return FALSE;                     /* no memory */
  } else if ( ((flags&CVT_RATIONAL) && isRational(w)) ||
              ((flags&CVT_INTEGER)  && isInteger(w)) )
  { number n;
    int base = (flags&CVT_XINTEGER)==CVT_XINTEGER ? 16 : 10;

    PL_get_number(l, &n);
    switch(n.type)
    { case V_INTEGER:
      { char *ep = i64toa(n.value.i, text->buf, base);

        text->text.t  = text->buf;
        text->length  = ep-text->text.t;
        text->storage = PL_CHARS_LOCAL;
        break;
      }
#ifdef O_BIGNUM
      case V_MPZ:
      { size_t sz = (double)mpz_sizeinbase(n.value.mpz, 2)*log(10)/log(2)*1.2 + 2;
        Buffer b  = findBuffer(BUF_STACK);

        if ( !growBuffer(b, sz) )
          outOfCore();
        mpz_get_str(b->base, base, n.value.mpz);
        b->top = b->base + strlen(b->base);
        text->text.t  = baseBuffer(b, char);
        text->length  = entriesBuffer(b, char);
        text->storage = PL_CHARS_RING;

        break;
      }
      case V_MPQ:
      { size_t sz = (double)( mpz_sizeinbase(mpq_numref(n.value.mpq), 2) +
                              mpz_sizeinbase(mpq_denref(n.value.mpq), 2) +
                              4 ) * log(10)/log(2) * 1.2;
        Buffer b  = findBuffer(BUF_STACK);

        if ( !growBuffer(b, sz) )
          outOfCore();
        mpz_get_str(b->base, base, mpq_numref(n.value.mpq));
        b->top = b->base + strlen(b->base);
        *b->top++ = 'r';                /* '/' under some condition? */
        mpz_get_str(b->top, base, mpq_denref(n.value.mpq));
        b->top += strlen(b->top);
        text->text.t  = baseBuffer(b, char);
        text->length  = entriesBuffer(b, char);
        text->storage = PL_CHARS_RING;

        break;
      }
#endif
      default:
        assert(0);
    }
    text->encoding  = ENC_ISO_LATIN_1;
    text->canonical = TRUE;
  } else if ( (flags & CVT_FLOAT) && isFloat(w) )
  { format_float(valFloat(w), text->buf);
    text->text.t    = text->buf;
    text->length    = strlen(text->text.t);
    text->encoding  = ENC_ISO_LATIN_1;
    text->storage   = PL_CHARS_LOCAL;
    text->canonical = TRUE;
  } else if ( (flags & CVT_LIST) )
  { Buffer b;
    CVT_result result;

  case_list:                            /* also entered for nil, see above */
    if ( (b = codes_or_chars_to_buffer(l, BUF_STACK, FALSE, &result)) )
    { text->length = entriesBuffer(b, char);
      addBuffer(b, EOS, char);
      text->text.t = baseBuffer(b, char);
      text->encoding = ENC_ISO_LATIN_1;
    } else if ( result.status == CVT_wide &&
                (b = codes_or_chars_to_buffer(l, BUF_STACK, TRUE, &result)) )
    { text->length = entriesBuffer(b, pl_wchar_t);
      addBuffer(b, EOS, pl_wchar_t);
      text->text.w = baseBuffer(b, pl_wchar_t);
      text->encoding = ENC_WCHAR;
    } else if ( (flags & (CVT_WRITE|CVT_WRITE_CANONICAL|CVT_WRITEQ)) )
    { goto case_write;
    } else
    { if ( (flags & CVT_VARNOFAIL) && result.status == CVT_partial )
        return 2;

      if ( (flags & CVT_EXCEPTION) )
      { switch(result.status)
        { case CVT_partial:
            return PL_error(NULL, 0, NULL, ERR_INSTANTIATION);
          case CVT_nolist:
            goto error;
          case CVT_nocode:
          case CVT_nochar:
          { term_t culprit = PL_new_term_ref();
            atom_t type;

            *valTermRef(culprit) = result.culprit;
            if ( result.status == CVT_nocode )
              type = ATOM_character_code;
            else
              type = ATOM_character;

            return PL_error(NULL, 0, NULL, ERR_TYPE, type, culprit);
          }
          case CVT_representation:
            return PL_representation_error("character_code");
          default:
            break;
        }
      }
      goto error;
    }

    text->storage   = PL_CHARS_RING;
    text->canonical = TRUE;
  } else if ( (flags & CVT_VARIABLE) && isVar(w) )
  { text->text.t    = varName(l, text->buf);
    text->length    = strlen(text->text.t);
    text->encoding  = ENC_ISO_LATIN_1;
    text->storage   = PL_CHARS_LOCAL;
    text->canonical = TRUE;
  } else if ( (flags & (CVT_WRITE|CVT_WRITE_CANONICAL|CVT_WRITEQ)) )
  { IOENC encodings[3];
    IOENC *enc;
    char *r;
    int wflags;

  case_write:
    if ( (flags&REP_UTF8) )
    { encodings[0] = ENC_UTF8;
      encodings[1] = ENC_UNKNOWN;
    } else
    { encodings[0] = ENC_ISO_LATIN_1;
      encodings[1] = ENC_WCHAR;
      encodings[2] = ENC_UNKNOWN;
    }

    if ( (flags&CVT_WRITEQ) )
      wflags = PL_WRT_QUOTED|PL_WRT_NUMBERVARS;
    else if ( (flags&CVT_WRITE_CANONICAL) )
      wflags = (PL_WRT_QUOTED|PL_WRT_QUOTE_NON_ASCII|
                PL_WRT_IGNOREOPS|PL_WRT_VARNAMES|
                PL_WRT_NODOTINATOM|PL_WRT_BRACETERMS);
    else
      wflags = PL_WRT_NUMBERVARS;

    int rc = FALSE;
    BEGIN_NUMBERVARS(TRUE);
    nv_options options =
    { .functor = FUNCTOR_isovar1,
      .on_attvar = AV_SKIP,
      .singletons = PL_is_acyclic(l),
      .numbered_check = FALSE
    };

    /* A failing numberVars() leaves rc at FALSE and continues to
       END_NUMBERVARS() below instead of jumping out of the
       BEGIN_NUMBERVARS()/END_NUMBERVARS() pair.
    */
    if ( numberVars(l, &options, 0) != NV_ERROR )
    { for(enc = encodings; *enc != ENC_UNKNOWN; enc++)
      { size_t size;
        IOSTREAM *fd;

        r = text->buf;
        size = sizeof(text->buf);
        if ( !(fd = Sopenmem(&r, &size, "w")) )
          break;                        /* no stream: fail via rc == FALSE */
        fd->encoding = *enc;
        if ( PL_write_term(fd, l, 1200, wflags) &&
             Sputcode(EOS, fd) >= 0 &&
             Sflush(fd) >= 0 )
        { text->encoding = *enc;
          /* the stream either filled text->buf or replaced r */
          text->storage = (r == text->buf ? PL_CHARS_LOCAL : PL_CHARS_MALLOC);
          text->canonical = TRUE;

          if ( *enc == ENC_WCHAR )
          { text->length = (size/sizeof(pl_wchar_t))-1;
            text->text.w = (pl_wchar_t *)r;
          } else
          { text->length = size-1;
            text->text.t = r;
          }

          Sclose(fd);
          rc = TRUE;
          break;
        } else
        { Sclose(fd);
          /* a Latin-1 failure is retried with the next encoding */
          if ( *enc == ENC_ISO_LATIN_1 && enc[1] != ENC_UNKNOWN )
            PL_clear_exception();
          if ( r != text->buf )
            Sfree(r);
        }
      }
    }
    END_NUMBERVARS(TRUE);

    if ( rc )
      goto out;
    goto error;
  } else
  { goto error;
  }

out:
  return TRUE;

maybe_write:
  if ( (flags & (CVT_WRITE|CVT_WRITE_CANONICAL|CVT_WRITEQ)) )
    goto case_write;

error:
  if ( canBind(w) && (flags & CVT_VARNOFAIL) )
    return 2;

  if ( (flags & CVT_EXCEPTION) )
  { atom_t expected;

    if ( (flags & CVT_LIST) && !(flags&(CVT_ATOM|CVT_NUMBER)) )
      expected = ATOM_list;             /* List and/or string object */
    else if ( flags & CVT_LIST )
      expected = ATOM_text;
    else if ( (flags & CVT_ATOM) && w == ATOM_nil )
      expected = ATOM_atom;             /* [] \== '[]' */
    else if ( flags & CVT_NUMBER )
      expected = ATOM_atomic;
    else
      expected = ATOM_atom;

    return PL_error(NULL, 0, NULL, ERR_TYPE, expected, l);
  }

  fail;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
textToAtom() canonicalises `text` and looks up the atom for it, using
lookupAtom() for ISO Latin-1 text and lookupUCSAtom() otherwise.  If
canonicalisation does not return TRUE, the code is passed to text_error()
and 0 is returned.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

atom_t
textToAtom(PL_chars_t *text)
{ GET_LD
  atom_t a;
  int rc;

  PL_STRINGS_MARK();
  if ( (rc=PL_canonicalise_text(text)) == TRUE )
  { if ( text->encoding == ENC_ISO_LATIN_1 )
      a = lookupAtom(text->text.t, text->length);
    else
      a = lookupUCSAtom(text->text.w, text->length);
  } else
  { text_error(text, rc);
    a = 0;
  }
  PL_STRINGS_RELEASE();

  return a;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
textToString() canonicalises `text` and creates a string from it, using
globalString() for ISO Latin-1 text and globalWString() otherwise.  If
canonicalisation does not return TRUE, the code is passed to text_error()
and 0 is returned.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

word
textToString(PL_chars_t *text)
{ GET_LD
  word w;
  int rc;

  PL_STRINGS_MARK();
  if ( (rc=PL_canonicalise_text(text)) == TRUE )
  { if ( text->encoding == ENC_ISO_LATIN_1 )
      w = globalString(text->length, text->text.t);
    else
      w = globalWString(text->length, text->text.w);
  } else
  { text_error(text, rc);
    w = 0;
  }
  PL_STRINGS_RELEASE();

  return w;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
globalSpaceRequirement() computes the size, in units of sizeof(word),
that unify_text() checks with hasGlobalSpace() before turning `text` into
a string: the text including one terminating unit, rounded to words,
plus 2.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

static size_t
globalSpaceRequirement(PL_chars_t *text)
{ size_t len;

  if ( text->encoding == ENC_ISO_LATIN_1 )
  { len = text->length+1;
  } else
  { len = (text->length+1)*sizeof(pl_wchar_t);
  }

  return 2 + (len+sizeof(word))/sizeof(word);
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NOTE(review): several statements in the remainder of this file are
damaged.  At the places marked NOTE(review) below, everything from a `<`
up to the next `>` appears to have been removed (typically the
`<e; s++) ...` part of a loop).  Those statements are kept exactly as
found and must be restored from a pristine copy of the file before this
code can compile.

unify_text() unifies `term` with `text`, represented as `type`:

  - PL_ATOM: an atom created by textToAtom().
  - PL_STRING: a string created by textToString().
  - PL_CODE_LIST / PL_CHAR_LIST: a list of codes or characters.  Empty
    text yields nil, or makes `tail` refer to `term` if `tail` is given.
    Otherwise the list is built with INIT_SEQ_STRING(),
    EXTEND_SEQ_CODES()/EXTEND_SEQ_CHARS() and CLOSE_SEQ_STRING().
    ISO Latin-1, wide, UTF-8 and locale multibyte (ENC_ANSI) text are
    handled; the latter two are decoded on the fly.

Before building a string or list the text is moved off the stack with
PL_from_stack_text() if hasGlobalSpace() reports too little space.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#define unify_text(term, tail, text, type) \
        LDFUNC(unify_text, term, tail, text, type)

static int
unify_text(DECL_LD term_t term, term_t tail, PL_chars_t *text, int type)
{ switch(type)
  { case PL_ATOM:
    { atom_t a = textToAtom(text);

      if ( a )
      { int rval = _PL_unify_atomic(term, a);

        PL_unregister_atom(a);
        return rval;
      }
      return FALSE;
    }
    case PL_STRING:
    { word w;
      int rc;

      if ( (rc=PL_canonicalise_text(text)) == TRUE )
      { if ( hasGlobalSpace(globalSpaceRequirement(text)) ||
             PL_from_stack_text(text, 0) )
        { if ( (w = textToString(text)) )
            return _PL_unify_atomic(term, w);
        }
        return FALSE;
      } else
        return text_error(text, rc);
    }
    case PL_CODE_LIST:
    case PL_CHAR_LIST:
    { if ( !hasGlobalSpace(text->length*3+1) &&
           !PL_from_stack_text(text, 0) )
        return FALSE;

      if ( text->length == 0 )
      { if ( tail )
        { PL_put_term(tail, term);
          return TRUE;
        } else
        { return PL_unify_nil(term);
        }
      } else
      { term_t l = PL_new_term_ref();
        Word p0, p;

        switch(text->encoding)
        { case ENC_ISO_LATIN_1:
          { const unsigned char *s = (const unsigned char *)text->text.t;
            const unsigned char *e = &s[text->length];

            if ( !(p0 = p = INIT_SEQ_STRING(text->length)) )
              return FALSE;

            if ( type == PL_CODE_LIST )
            { for( ; s < e; s++)
                p = EXTEND_SEQ_CODES(p, *s);
            } else
            { for( ; s < e; s++)
                p = EXTEND_SEQ_CHARS(p, *s);
            }
            break;
          }
          case ENC_WCHAR:
          { const pl_wchar_t *s = (const pl_wchar_t *)text->text.t;
            const pl_wchar_t *e = &s[text->length];

            if ( !(p0 = p = INIT_SEQ_STRING(text->length)) )
              return FALSE;

            if ( type == PL_CODE_LIST )
            { while(s < e)
              { int c;

                s = get_wchar(s, &c);
                p = EXTEND_SEQ_CODES(p, c);
              }
            } else
            { while(s < e)
              { int c;

                s = get_wchar(s, &c);
                p = EXTEND_SEQ_CHARS(p, c);
              }
            }
            break;
          }
          case ENC_UTF8:
          { const char *s = text->text.t;
            const char *e = &s[text->length];
            /* the list is sized by utf8_strlen(), not by text->length */
            size_t len = utf8_strlen(s, text->length);

            if ( !(p0 = p = INIT_SEQ_STRING(len)) )
              return FALSE;

            if ( type == PL_CODE_LIST )
            { while (s < e)
              { int chr;

                PL_utf8_code_point(&s, e, &chr);
                p = EXTEND_SEQ_CODES(p, chr);
              }
            } else
            { while (s < e)
              { int chr;

                PL_utf8_code_point(&s, e, &chr);
                p = EXTEND_SEQ_CHARS(p, chr);
              }
            }
            break;
          }
          case ENC_ANSI:
          { const char *s = text->text.t;
            size_t rc, n = text->length;
            size_t len = 0;
            mbstate_t mbs;
            wchar_t wc;

            /* first pass: validate and count the characters */
            memset(&mbs, 0, sizeof(mbs));
            while( n > 0 )
            { if ( (rc=mbrtowc(&wc, s, n, &mbs)) == (size_t)-1 || rc == 0 )
                return PL_error(NULL, 0, "cannot represent text in current locale",
                                ERR_REPRESENTATION, ATOM_encoding);

              len++;
              n -= rc;
              s += rc;
            }

            if ( !(p0 = p = INIT_SEQ_STRING(len)) )
              return FALSE;

            /* second pass: decode again and emit the list elements */
            n = text->length;
            s = text->text.t;
            memset(&mbs, 0, sizeof(mbs));
            while(n > 0)
            { rc = mbrtowc(&wc, s, n, &mbs);

              if ( type == PL_CODE_LIST )
                p = EXTEND_SEQ_CODES(p, wc);
              else
                p = EXTEND_SEQ_CHARS(p, wc);

              s += rc;
              n -= rc;
            }
            break;
          }
          default:
          { assert(0);

            return FALSE;
          }
        }

        return CLOSE_SEQ_STRING(p, p0, tail, term, l );
      }
    }
    default:
    { assert(0);

      return FALSE;
    }
  }
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_unify_text() is the public entry for unify_text().  It fetches the
local data with GET_LD and runs the unification between
PL_STRINGS_MARK() and PL_STRINGS_RELEASE().
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_unify_text(term_t term, term_t tail, PL_chars_t *text, int type)
{ GET_LD
  int rc;

  PL_STRINGS_MARK();
  rc = unify_text(term, tail, text, type);
  PL_STRINGS_RELEASE();

  return rc;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_seek_text_from() returns the position `offset` characters away from
`start` in the text `t`, or NULL if that position is outside the text.
`offset` may be negative.  An offset of 0 returns `start` unchanged.

If wchar_t is two bytes, a unit for which IS_UTF16_LEAD() holds (forward)
or IS_UTF16_TRAIL() holds (backward) is stepped over together with its
neighbour.  Only ENC_ISO_LATIN_1 and ENC_WCHAR text is supported.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

static const void*
PL_seek_text_from(const PL_chars_t *t, const void *start, ssize_t offset)
{ if ( offset == 0 )
    return start;

  switch(t->encoding)
  { case ENC_ISO_LATIN_1:
    { const char *s = start;

      s += offset;
      /* the position just past the last character is valid */
      if ( s < t->text.t || s > t->text.t+t->length )
        return NULL;
      return s;
    }
    case ENC_WCHAR:
    {
#if SIZEOF_WCHAR_T == 2
      const wchar_t *s = start;

      if ( offset > 0 )
      { const wchar_t *e = &t->text.w[t->length];

        while ( offset-- > 0 && s < e )
        { int c = *s++;

          if ( IS_UTF16_LEAD(c) )
            s++;
        }
        return s <= e ? s : NULL;
      } else
      { while ( offset++ < 0 && s > t->text.w )
        { int c = *--s;

          if ( IS_UTF16_TRAIL(c) )
            s--;
        }
        return offset == 0 ? s : NULL;
      }
#else
      const wchar_t *s = start;

      s += offset;
      if ( s < t->text.w || s > t->text.w+t->length )
        return NULL;
      return s;
#endif
    }
    default:
      assert(0);
      return NULL;
  }
}


/* PL_seek_text() seeks `offset` characters from the start of `t`. */

static const void*
PL_seek_text(const PL_chars_t *t, size_t offset)
{ return PL_seek_text_from(t, t->text.t, offset);
}


/* PL_seek_text_rev() seeks `offset` characters back from the end of `t`.
   An offset of 0 yields the end of the text.
*/

static const void*
PL_seek_text_rev(const PL_chars_t *t, size_t offset)
{ ssize_t off = offset;
  const void *from;

  switch(t->encoding)
  { case ENC_ISO_LATIN_1:
      from = &t->text.t[t->length];
      break;
    case ENC_WCHAR:
      from = &t->text.w[t->length];
      break;
    default:
      assert(0);
      from = NULL;
  }

  return PL_seek_text_from(t, from, -off);
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_unify_text_range() unifies `term` with the `len` characters of `text`
that start at character `offset`, represented as `type`.  The text must
be canonical.  Returns FALSE if the range does not lie inside the text.

A single character requested as PL_ATOM is unified through codeToAtom().
In all other cases a PL_chars_t that points into `text` is handed to
PL_unify_text().

NOTE(review): `ae` is computed from `as` before `as` is tested for NULL.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_unify_text_range(term_t term, const PL_chars_t *text,
                    size_t offset, size_t len, int type)
{ const void *as = PL_seek_text(text, offset);
  const void *ae = PL_seek_text_from(text, as, len);

  assert(text->canonical);
  if ( !as || !ae )
    return FALSE;                       /* offset > length */

  if ( len == 1 && type == PL_ATOM && ae > as )
  { GET_LD
    int c;

    if ( text->encoding == ENC_ISO_LATIN_1 )
    { const unsigned char *s = as;
      c = s[0];
    } else
    { const wchar_t *s = as;
      get_wchar(s, &c);
    }
    return PL_unify_atom(term, codeToAtom(c));
  }

  PL_chars_t sub;

  /* sub shares the memory of text; it never gets PL_CHARS_MALLOC here */
  sub.storage = text->storage == PL_CHARS_STACK ? PL_CHARS_STACK : PL_CHARS_HEAP;
  if ( text->encoding == ENC_ISO_LATIN_1 )
  { sub.text.t = (char*)as;
    sub.length = (const char*)ae - (const char *)as;
    sub.encoding = ENC_ISO_LATIN_1;
    sub.canonical = TRUE;
  } else
  { sub.text.w = (wchar_t*)as;
    sub.length = (const wchar_t*)ae - (const wchar_t*)as;
    sub.encoding = ENC_WCHAR;
    /* presumably because the range may hold no wide characters - TODO confirm */
    sub.canonical = FALSE;
  }

  int rc = PL_unify_text(term, 0, &sub, type);
  PL_free_text(&sub);
  return rc;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
int PL_promote_text(PL_chars_t *text)

Promote a text to UCS if it is currently 8-bit text.  There is one branch
per storage class: PL_CHARS_MALLOC text gets a new PL_malloc()ed array,
PL_CHARS_LOCAL text that still fits is widened inside text->buf, and
anything else goes to a BUF_STACK buffer and becomes PL_CHARS_RING.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_promote_text(PL_chars_t *text)
{ if ( text->encoding != ENC_WCHAR )
  { if ( text->storage == PL_CHARS_MALLOC )
    { pl_wchar_t *new = PL_malloc(sizeof(pl_wchar_t)*(text->length+1));
      pl_wchar_t *t = new;
      const unsigned char *s = (const unsigned char *)text->text.t;
      const unsigned char *e = &s[text->length];

      /* NOTE(review): damaged; the copy loop and the head of a call on
         text->text.t are missing */
      while(stext.t);
      text->text.w = new;

      text->encoding = ENC_WCHAR;
    } else if ( text->storage == PL_CHARS_LOCAL &&
                (text->length+1)*sizeof(pl_wchar_t) < sizeof(text->buf) )
    { unsigned char buf[sizeof(text->buf)];
      unsigned char *f = buf;
      unsigned char *e = &buf[text->length];
      pl_wchar_t *t = (pl_wchar_t*)text->buf;

      /* work from a copy: the wide result overwrites text->buf */
      memcpy(buf, text->buf, text->length*sizeof(char));
      /* NOTE(review): damaged; the loop body is missing */
      while(fencoding = ENC_WCHAR;
    } else
    { Buffer b = findBuffer(BUF_STACK);
      const unsigned char *s = (const unsigned char *)text->text.t;
      const unsigned char *e = &s[text->length];

      /* NOTE(review): damaged; the loop body is missing */
      for( ; stext.w = baseBuffer(b, pl_wchar_t);
      text->encoding = ENC_WCHAR;
      text->storage = PL_CHARS_RING;
    }
  }

  succeed;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_demote_text() is the inverse of PL_promote_text(): it turns text that
is not ISO Latin-1 into ISO Latin-1, again with one branch per storage
class.  All failure paths go through the label `reperr`, which returns
FALSE or, if `flags` holds CVT_EXCEPTION, the result of
text_representation_error().

NOTE(review): the loop conditions are damaged; what remains compares
something against 0xff before jumping to `reperr`.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

static int
PL_demote_text(PL_chars_t *text, int flags)
{ if ( text->encoding != ENC_ISO_LATIN_1 )
  { if ( text->storage == PL_CHARS_MALLOC )
    { char *new = PL_malloc(sizeof(char)*(text->length+1));
      char *t = new;
      const pl_wchar_t *s = (const pl_wchar_t *)text->text.t;
      const pl_wchar_t *e = &s[text->length];

      /* NOTE(review): damaged loop head */
      while(s 0xff )
        { PL_free(new);

        reperr:
          if ( (flags&CVT_EXCEPTION) )
            return text_representation_error(text, ENC_ISO_LATIN_1);

          return FALSE;
        }
        *t++ = *s++ & 0xff;
      }
      *t = EOS;

      PL_free(text->text.t);
      text->text.t = new;

      text->encoding = ENC_ISO_LATIN_1;
    } else if ( text->storage == PL_CHARS_LOCAL )
    { pl_wchar_t buf[sizeof(text->buf)/sizeof(pl_wchar_t)];
      pl_wchar_t *f = buf;
      pl_wchar_t *e = &buf[text->length];
      char *t = text->buf;

      /* work from a copy: the narrow result overwrites text->buf */
      memcpy(buf, text->buf, text->length*sizeof(pl_wchar_t));
      /* NOTE(review): damaged loop head */
      while(f 0xff )
          goto reperr;
        *t++ = *f++ & 0xff;
      }
      *t = EOS;
      text->encoding = ENC_ISO_LATIN_1;
    } else
    { Buffer b = findBuffer(BUF_STACK);
      const pl_wchar_t *s = (const pl_wchar_t*)text->text.w;
      const pl_wchar_t *e = &s[text->length];

      /* NOTE(review): damaged loop head */
      for( ; s 0xff )
        { unfindBuffer(b, BUF_STACK);
          goto reperr;
        }
        addBuffer(b, *s&0xff, char);
      }
      addBuffer(b, EOS, char);

      text->text.t = baseBuffer(b, char);
      text->storage = PL_CHARS_RING;
      text->encoding = ENC_ISO_LATIN_1;
    }
  }

  succeed;
}


/* can_demote() returns TRUE for ISO Latin-1 text.  For other text it
   scans the wide characters and returns FALSE from inside the scan.
   NOTE(review): the loop head is damaged; what remains compares
   something against 0xff.
*/

static int
can_demote(PL_chars_t *text)
{ if ( text->encoding != ENC_ISO_LATIN_1 )
  { const pl_wchar_t *w = (const pl_wchar_t*)text->text.w;
    const pl_wchar_t *e = &w[text->length];

    /* NOTE(review): damaged loop head */
    for(; w 0xff )
        return FALSE;
    }
  }

  return TRUE;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Convert text to 8-bit according to flags.   May hold REP_UTF8 to convert
to UTF-8, REP_MB to convert to locale 8-bit representation or nothing to
convert to ISO Latin-1. This predicate can   fail  if the text cannot be
represented.

NOTE(review): a large span is missing below.  It starts inside the loop
of wctobuffer() and ends inside the first test of what, judging from the
error message further down and from the call PL_mb_text(text, REP_MB)
elsewhere in this file, is PL_mb_text().  The end of wctobuffer(), the
definition of utf8tobuffer() (which is called below) and the head of the
conversion function, including the declarations of `target`, `norep` and
`flags`, are gone.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

static int
wctobuffer(wchar_t c, mbstate_t *mbs, Buffer buf)
{ char b[PL_MB_LEN_MAX];
  size_t n;

  /* convert one wide character to the locale multibyte encoding */
  if ( (n=wcrtomb(b, c, mbs)) != (size_t)-1 )
  { size_t i;

    /* NOTE(review): damaged; see the comment above */
    for(i=0; iencoding != target )
  { Buffer b = findBuffer(BUF_STACK);

    switch(text->encoding)
    { case ENC_ISO_LATIN_1:
      { const unsigned char *s = (const unsigned char*)text->text.t;
        const unsigned char *e = &s[text->length];

        if ( target == ENC_UTF8 )
        { /* NOTE(review): damaged; the rest of this case and the head of
             the wide character case are missing */
          for( ; stext.w;
        const pl_wchar_t *e = &w[text->length];

        if ( target == ENC_UTF8 )
        { while( w < e )
          { int c;

            w = get_wchar(w, &c);
            utf8tobuffer(c, b);
          }
          addBuffer(b, 0, char);
        } else /* if ( target == ENC_MB ) */
        { mbstate_t mbs;

          memset(&mbs, 0, sizeof(mbs));
          /* NOTE(review): damaged; the loop and the end of the switch
             are missing */
          for( ; wlength = sizeOfBuffer(b)-1;
    text->text.t = baseBuffer(b, char);
    text->encoding = target;
    text->storage = PL_CHARS_RING;
  }

  succeed;

rep_error:
  if ( (flags & CVT_EXCEPTION) )
  { char msg[128];

    sprintf(msg,
            "Cannot represent char U%04x using %s encoding",
            norep,
            target == ENC_ISO_LATIN_1 ? "ISO Latin-1" : "current locale");

    return PL_error(NULL, 0, msg, ERR_REPRESENTATION, ATOM_encoding);
  }

  fail;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NOTE(review): a span is missing below.  It starts inside the loop of
flip_shorts() and ends inside the first test of what, judging from the
calls elsewhere in this file, is PL_canonicalise_text().  The definition
of native_byte_order(), which is called below, is presumably part of the
missing text.

The remaining code brings a text into canonical form.  From what is
visible, canonical text is either ISO Latin-1 or ENC_WCHAR, and
ENC_WCHAR text is demoted with PL_demote_text() if no wide character was
seen.  The result is TRUE or one of the ERR_TEXT_* codes that
text_error() translates.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

static void
flip_shorts(unsigned char *s, size_t len)
{ unsigned char *e = s+len;

  /* NOTE(review): damaged; see the comment above */
  for(; scanonical )
  { switch(text->encoding )
    { case ENC_OCTET:
        text->encoding = ENC_ISO_LATIN_1;
        /*FALLTHROUGH*/
      case ENC_ISO_LATIN_1:
        text->canonical = TRUE;
        break;                          /* nothing to do */
      case ENC_WCHAR:
      { const pl_wchar_t *w;
        const pl_wchar_t *e;
        int wide;

#if SIZEOF_WCHAR_T == 2
      case_wchar:                       /* entered from the UTF-16 case */
#endif
        w = (const pl_wchar_t*)text->text.w;
        e = &w[text->length];
        wide = FALSE;

#if SIZEOF_WCHAR_T == 2
        /* NOTE(review): damaged; the surrogate pair validation and the
           #else branch that scans full-width wchar_t are incomplete */
        for(; w 0xff )
            wide = TRUE;
          if ( IS_UTF16_LEAD(*w) )
          { if ( w+1 0xff )
            wide = TRUE;
          if ( !VALID_CODE_POINT(*w) )
            return ERR_TEXT_INVALID_CODE_POINT;
        }
#endif

        if ( !wide )
          return PL_demote_text(text, 0);

        return TRUE;
      }
      case ENC_UTF16LE:                 /* assume text->length is in bytes */
      case ENC_UTF16BE:
      { if ( text->length%2 != 0 )
          return ERR_TEXT_INCOMPLETE_MULTIBYTE_SEQUENCE;

#if SIZEOF_WCHAR_T == 2
        /* wchar_t is a UTF-16 unit: fix the byte order, then continue
           as ENC_WCHAR */
        if ( !native_byte_order(text->encoding) )
        { if ( text->storage == PL_CHARS_HEAP )
            PL_save_text(text, BUF_MALLOC);     /* copy before swapping */
          flip_shorts((unsigned char*)text->text.t, text->length);
        }
        text->encoding = ENC_WCHAR;
        text->length /= sizeof(wchar_t);
        goto case_wchar;
#else /*SIZEOF_WCHAR_T!=2*/
        size_t len = text->length/sizeof(short);
        size_t code_points = 0;
        const unsigned short *w, *e;
        int wide = FALSE;

        if ( !native_byte_order(text->encoding) )
        { if ( text->storage == PL_CHARS_HEAP )
            PL_save_text(text, BUF_MALLOC);     /* copy before swapping */
          flip_shorts((unsigned char*)text->text.t, text->length);
        }

        w = (const unsigned short *)text->text.t;
        e = &w[len];

        /* NOTE(review): damaged; the validation/counting loop is
           incomplete */
        for(; w 0xff )
            wide = TRUE;
          if ( IS_UTF16_LEAD(*w) )
          { if ( w+1text.t;

        if ( wide )
        { pl_wchar_t *t, *to = PL_malloc(sizeof(pl_wchar_t)*(code_points+1));

          /* NOTE(review): damaged; the decoding loop is missing */
          for(t=to; wlength = len;
          text->encoding = ENC_WCHAR;
          if ( text->storage == PL_CHARS_MALLOC )
            PL_free(text->text.t);
          else
            text->storage = PL_CHARS_MALLOC;

          text->text.w = to;
        } else
        { unsigned char *t, *to = PL_malloc(code_points+1);

          /* NOTE(review): damaged; the decoding loop is missing */
          for(t=to; wlength = len;
          text->encoding = ENC_ISO_LATIN_1;
          if ( text->storage == PL_CHARS_MALLOC )
            PL_free(text->text.t);
          else
            text->storage = PL_CHARS_MALLOC;

          text->text.t = (char*)to;
        }

        return TRUE;
#endif /*SIZEOF_WCHAR_T==2*/
      }
      case ENC_UTF8:
      { const char *s = text->text.t;
        const char *e = &s[text->length];

        /* NOTE(review): damaged; the scan and the test selecting this
           branch are missing */
        while(sencoding = ENC_ISO_LATIN_1;
          text->canonical = TRUE;
        } else
        { int chr;
          int wide = FALSE;
          size_t len = s - text->text.t;

          /* NOTE(review): damaged; the decoding step is missing */
          while(s 0xff )                /* requires wide characters */
              wide = TRUE;
            len++;
#if SIZEOF_WCHAR_T == 2
            if ( chr > 0xffff )         /* needs two UTF-16 units */
              len++;
#endif
          }

          s = (const char *)text->text.t;
          text->length = len;

          if ( wide )
          { pl_wchar_t *t, *to = PL_malloc(sizeof(pl_wchar_t)*(len+1));

            /* NOTE(review): damaged; the decoding loop is missing */
            for(t=to; sencoding = ENC_WCHAR;
            if ( text->storage == PL_CHARS_MALLOC )
              PL_free(text->text.t);
            text->text.w = to;
            text->storage = PL_CHARS_MALLOC;
          } else
          { char *t, *to = PL_malloc(len+1);

            /* NOTE(review): damaged; the decoding loop is missing */
            for(t=to; sencoding = ENC_ISO_LATIN_1;
            if ( text->storage == PL_CHARS_MALLOC )
              PL_free(text->text.t);
            text->text.t = to;
            text->storage = PL_CHARS_MALLOC;
          }

          text->canonical = TRUE;
        }

        return TRUE;
      }
      case ENC_ANSI:
      { mbstate_t mbs;
        size_t len = 0;
        int wide = FALSE;
        char *s = text->text.t;
        size_t rc, n = text->length;
        wchar_t wc;

        /* first pass: validate, count and detect wide characters */
        memset(&mbs, 0, sizeof(mbs));
        while( n > 0 )
        { if ( (rc=mbrtowc(&wc, s, n, &mbs)) == (size_t)-1 || rc == 0)
            return ERR_TEXT_ILLEGAL_MULTIBYTE_SEQUENCE;

          if ( wc > 0xff )
            wide = TRUE;
          len++;
#if SIZEOF_WCHAR_T == 2
          if ( wc > 0xffff )
            len++;
#endif
          n -= rc;
          s += rc;
        }

        if ( n == 0 )
        { const char *from = text->text.t;
          void *do_free;

          n = text->length;
          memset(&mbs, 0, sizeof(mbs));

          /* the input is freed only after it has been converted */
          if ( text->storage == PL_CHARS_MALLOC )
            do_free = text->text.t;
          else
            do_free = NULL;

          if ( !wide )
          { char *to;

            text->encoding = ENC_ISO_LATIN_1;
            if ( len+1 < sizeof(text->buf) )
            { text->text.t = text->buf;
              text->storage = PL_CHARS_LOCAL;
            } else
            { text->text.t = PL_malloc(len+1);
              text->storage = PL_CHARS_MALLOC;
            }

            to = text->text.t;
            while( n > 0 )
            { rc = mbrtowc(&wc, from, n, &mbs);

              *to++ = (char)wc;
              n -= rc;
              from += rc;
            }
            *to = EOS;
          } else
          { wchar_t *to;
            char b2[sizeof(text->buf)];

            text->encoding = ENC_WCHAR;
            if ( len+1 < sizeof(text->buf)/sizeof(wchar_t) )
            { if ( text->text.t == text->buf )
              { /* input and output share text->buf: read from a copy */
                memcpy(b2, text->buf, sizeof(text->buf));
                from = b2;
              }
              text->text.w = (wchar_t*)text->buf;
            } else
            { text->text.w = PL_malloc((len+1)*sizeof(wchar_t));
              text->storage = PL_CHARS_MALLOC;
            }

            to = text->text.w;
            while( n > 0 )
            { rc = mbrtowc(&wc, from, n, &mbs);

              to = put_wchar(to, wc);
              n -= rc;
              from += rc;
            }
            *to = EOS;
          }

          text->length = len;
          text->canonical = TRUE;
          if ( do_free )
            PL_free(do_free);

          return TRUE;
        }

        fail;
      }
      default:
        assert(0);
    }
  }

  return TRUE;
}


/* text_representation_error() raises a representation error whose
   message names the encoding `enc`.  `text` is not used.  Returns the
   result of PL_error().
*/

static int
text_representation_error(PL_chars_t *text, IOENC enc)
{ char msg[100];

  Ssnprintf(msg, sizeof(msg),
            "cannot represent text using encoding %s",
            PL_atom_chars(PL_encoding_to_atom(enc)));

  return PL_error(NULL, 0, msg, ERR_REPRESENTATION, ATOM_encoding);
}


/* text_error() turns an ERR_TEXT_* code, as returned by
   PL_canonicalise_text(), into a syntax or representation error and
   returns the result of raising it.  `text` is not used.  An unknown
   code trips the assertion.
*/

static int
text_error(PL_chars_t *text, int rc)
{ (void)text;

  switch(rc)
  { case ERR_TEXT_ILLEGAL_UTF8:
      return PL_syntax_error("illegal_utf8_sequence", NULL);
    case ERR_TEXT_ILLEGAL_UTF16:
      return PL_syntax_error("illegal_utf16_sequence", NULL);
    case ERR_TEXT_ILLEGAL_MULTIBYTE_SEQUENCE:
      return PL_syntax_error("illegal_multibyte_sequence", NULL);
    case ERR_TEXT_INCOMPLETE_MULTIBYTE_SEQUENCE:
      return PL_syntax_error("incomplete_multibyte_sequence", NULL);
    case ERR_TEXT_INVALID_CODE_POINT:
      return PL_representation_error("code_point");
    default:
      assert(0);
      return FALSE;
  }
}


/* PL_canonicalise_text_ex() is PL_canonicalise_text() with error
   reporting: any result other than TRUE is passed to text_error().
*/

int
PL_canonicalise_text_ex(PL_chars_t *text)
{ int rc;

  if ( (rc=PL_canonicalise_text(text)) == TRUE )
    return TRUE;

  return text_error(text, rc);
}


/* PL_free_text() releases the text if its storage is PL_CHARS_MALLOC and
   the pointer is not NULL.  Other storage classes are left alone, and
   `text` itself is not modified.
*/

void
PL_free_text(PL_chars_t *text)
{ if ( text->storage == PL_CHARS_MALLOC && text->text.t )
    PL_free(text->text.t);
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Recode a text to the given   encoding. Currently only supports re-coding
to UTF-8 for ENC_ASCII, ENC_ISO_LATIN_1, ENC_WCHAR and ENC_ANSI.

NOTE(review): the code below also has branches for recoding to
ENC_ISO_LATIN_1, ENC_OCTET, ENC_ASCII, ENC_UTF16LE and ENC_UTF16BE, and
one that ends in PL_mb_text(text, REP_MB), so the sentence above looks
outdated.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

/* addUTF16Buffer_unit() appends the 16-bit unit `c` to `b`.  The two
   bytes are swapped if native_byte_order(enc) is false.
*/

static void
addUTF16Buffer_unit(Buffer b, int c, IOENC enc)
{ if ( native_byte_order(enc) )
  { addBuffer(b, c, unsigned short);
  } else
  { union
    { unsigned short s;
      char c[2];
    } swap;

    swap.s = c;
    char t = swap.c[0];
    swap.c[0] = swap.c[1];
    swap.c[1] = t;
    addBuffer(b, swap.s, unsigned short);
  }
}


/* addUTF16Buffer() appends the code point `c` to `b` as UTF-16 in
   encoding `enc`.  Code points above 0xffff are split by utf16_encode()
   and added as two units.
*/

static void
addUTF16Buffer(Buffer b, int c, IOENC enc)
{ if ( c > 0xffff )
  { int l, t;

    utf16_encode(c, &l, &t);
    addUTF16Buffer_unit(b, l, enc);
    addUTF16Buffer_unit(b, t, enc);
  } else
    addUTF16Buffer_unit(b, c, enc);
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_text_recode() converts `text` to `encoding`.  Nothing happens if the
text already has that encoding.  Converted text is built in a BUF_STACK
buffer and marked PL_CHARS_RING.  The branches that recode to
ISO Latin-1, bytes, ASCII or UTF-16 assert that the text is canonical.

NOTE(review): several loops below are damaged, and the label
`swap_to_utf8` that two branches jump to is no longer visible.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_text_recode(PL_chars_t *text, IOENC encoding)
{ if ( text->encoding != encoding )
  { switch(encoding)
    { case ENC_UTF8:
      { Buffer b;

        switch(text->encoding)
        { case ENC_ASCII:
            text->encoding = ENC_UTF8;
            break;
          case ENC_ISO_LATIN_1:
          { const unsigned char *s = (const unsigned char *)text->text.t;
            const unsigned char *e = &s[text->length];

            /* NOTE(review): damaged; the scan that decides whether a
               conversion is needed is missing */
            for( ; stext.t;
                goto convert_utf8;
              }
            }
                                        /* ASCII; nothing to do */
            text->encoding = ENC_UTF8;
            break;

          convert_utf8:
            b = findBuffer(BUF_STACK);
            /* NOTE(review): damaged; the conversion loop and the label
               swap_to_utf8 are missing */
            for( ; slength = entriesBuffer(b, char);
            addBuffer(b, EOS, char);
            text->text.t = baseBuffer(b, char);
            text->encoding = ENC_UTF8;
            text->storage = PL_CHARS_RING;

            break;
          }
          case ENC_WCHAR:
          { const pl_wchar_t *s = text->text.w;
            const pl_wchar_t *e = &s[text->length];

            b = findBuffer(BUF_STACK);
            while( s < e )
            { int c;

              s = get_wchar(s, &c);
              utf8tobuffer(c, b);
            }
            goto swap_to_utf8;
          }
          case ENC_ANSI:
          { mbstate_t mbs;
            size_t rc, n = text->length;
            wchar_t wc;
            const char *s = (const char *)text->text.t;

            b = findBuffer(BUF_STACK);
            memset(&mbs, 0, sizeof(mbs));
            while( n > 0 )
            { if ( (rc=mbrtowc(&wc, s, n, &mbs)) == (size_t)-1 || rc == 0)
                return FALSE;           /* encoding error */

              utf8tobuffer(wc, b);
              n -= rc;
              s += rc;
            }
            if ( n == 0 )
              goto swap_to_utf8;

            return FALSE;
          }
          default:
            assert(0);
            return FALSE;
        }

        return TRUE;
      case ENC_ISO_LATIN_1:             /* --> ISO Latin 1 */
      case ENC_OCTET:                   /* --> bytes */
      case ENC_ASCII:                   /* --> ASCII */
      { assert(text->canonical);

        switch(text->encoding)
        { case ENC_WCHAR:
            return text_representation_error(text, encoding);
          case ENC_ISO_LATIN_1:
            if ( encoding == ENC_ASCII )
            { const unsigned char *s =(const unsigned char *)text->text.t;
              const unsigned char *e = &s[text->length];

              /* NOTE(review): damaged loop head; what remains compares
                 something against 128 */
              while(s= 128 )
                  return text_representation_error(text, encoding);
              }
            }
            text->canonical = FALSE;
            text->encoding = encoding;
            return TRUE;
          default:
            assert(0);
            return FALSE;
        }
      }
      case ENC_UTF16LE:                 /* --> UTF-16 */
      case ENC_UTF16BE:
      { Buffer b;

        assert(text->canonical);
        switch(text->encoding)
        { case ENC_ISO_LATIN_1:
          { b = findBuffer(BUF_STACK);
            const unsigned char *s = (const unsigned char *)text->text.t;
            const unsigned char *e = &s[text->length];

            /* NOTE(review): damaged; the conversion loop is missing */
            for( ; slength = entriesBuffer(b, char);
            addBuffer(b, EOS, short);
            text->text.t = baseBuffer(b, char);
            text->encoding = encoding;
            text->storage = PL_CHARS_RING;
            return TRUE;
          }
          case ENC_WCHAR:
          { b = findBuffer(BUF_STACK);
            const wchar_t *s = text->text.w;
            const wchar_t *e = &s[text->length];

            /* NOTE(review): damaged; the conversion loop, the end of the
               UTF-16 case and the head of the case that calls
               PL_mb_text() are missing */
            for( ; scanonical);
        return PL_mb_text(text, REP_MB);
      }
      default:
        assert(0);
        return FALSE;
    }
    }
  } else
    return TRUE;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_cmp_text(PL_chars_t *t1, size_t o1, PL_chars_t *t2, size_t o2, size_t len)

Compares two substrings of two text representations.  The substrings
start at character o1 of t1 and o2 of t2.  At most `len` characters are
compared.  Either text may be ISO Latin-1 or ENC_WCHAR.

Returns CMP_EQUAL if `len` characters were equal.  If a text runs out
before that, the text that ends first is the smaller one; t1 is checked
first.  Otherwise the first differing pair of characters decides.  If
only one start offset is valid, the text with the valid offset is the
greater one.  If neither is valid the result is CMP_EQUAL.

The helper macros CMP_LENGTH and CMP_CHARS use the locals s, q, e1, e2,
c1 and c2 of the loops below.  They are not #undef'ed afterwards.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_cmp_text(PL_chars_t *t1, size_t o1, PL_chars_t *t2, size_t o2,
            size_t len)
{ const void *s1 = PL_seek_text(t1, o1);
  const void *s2 = PL_seek_text(t2, o2);

#define CMP_LENGTH \
        if ( len-- == 0 ) return CMP_EQUAL; \
        if ( (const void*)s >= e1 ) return CMP_LESS; \
        if ( (const void*)q >= e2 ) return CMP_GREATER
#define CMP_CHARS \
        if ( c1 != c2 ) \
          return (c1 > c2 ? CMP_GREATER : CMP_LESS)

  if ( s1 && s2 )
  { const void *e1 = PL_seek_text_rev(t1, 0);   /* end of t1 */
    const void *e2 = PL_seek_text_rev(t2, 0);   /* end of t2 */

    if ( t1->encoding == ENC_ISO_LATIN_1 && t2->encoding == ENC_ISO_LATIN_1 )
    { const unsigned char *s = s1;
      const unsigned char *q = s2;

      for(;;)
      { int c1, c2;

        CMP_LENGTH;
        c1 = *s++;
        c2 = *q++;
        CMP_CHARS;
      }
    } else if ( t1->encoding == ENC_WCHAR && t2->encoding == ENC_WCHAR )
    { const pl_wchar_t *s = s1;
      const pl_wchar_t *q = s2;

      for(;;)
      { int c1, c2;

        CMP_LENGTH;
        s = get_wchar(s, &c1);
        q = get_wchar(q, &c2);
        CMP_CHARS;
      }
    } else if ( t1->encoding == ENC_ISO_LATIN_1 && t2->encoding == ENC_WCHAR )
    { const unsigned char *s = s1;
      const pl_wchar_t *q = s2;

      for(;;)
      { int c1, c2;

        CMP_LENGTH;
        c1 = *s++;
        q = get_wchar(q, &c2);
        CMP_CHARS;
      }
    } else
    { /* remaining combination: t1 is read as wide, t2 as 8-bit */
      const pl_wchar_t *s = s1;
      const unsigned char *q = s2;

      for(;;)
      { int c1, c2;

        CMP_LENGTH;
        s = get_wchar(s, &c1);
        c2 = *q++;
        CMP_CHARS;
      }
    }
  } else if ( s1 /*&& !s2*/ )
  { return CMP_GREATER;
  } else if ( s2 /*&& !s1*/ )
  { return CMP_LESS;
  } else
  { return CMP_EQUAL;
  }
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_concat_text() concatenates the `n` texts in `text` into `result`.  The
result is marked canonical.  It is ISO Latin-1 if the flag `latin` is
still TRUE after the first loop and ENC_WCHAR otherwise.  It is stored in
result->buf (PL_CHARS_LOCAL) if it fits, including the terminator, and in
PL_malloc()ed memory (PL_CHARS_MALLOC) otherwise.  Always returns TRUE.

NOTE(review): all four loops are damaged.  Presumably the first one sums
the lengths and clears `latin` for wide input - TODO confirm against a
pristine copy.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_concat_text(int n, PL_chars_t **text, PL_chars_t *result)
{ size_t total_length = 0;
  int latin = TRUE;
  int i;

  /* NOTE(review): damaged; the loop body is incomplete */
  for(i=0; ilength;
  }

  result->canonical = TRUE;
  result->length = total_length;

  if ( latin )
  { char *to;

    result->encoding = ENC_ISO_LATIN_1;
    if ( total_length+1 < sizeof(result->buf) )
    { result->text.t = result->buf;
      result->storage = PL_CHARS_LOCAL;
    } else
    { result->text.t = PL_malloc(total_length+1);
      result->storage = PL_CHARS_MALLOC;
    }

    /* NOTE(review): damaged; the head of the copy call is missing */
    for(to=result->text.t, i=0; itext.t, text[i]->length);
      to += text[i]->length;
    }
    *to = EOS;
  } else
  { pl_wchar_t *to;

    result->encoding = ENC_WCHAR;
    if ( total_length+1 < sizeof(result->buf)/sizeof(pl_wchar_t) )
    { result->text.w = (pl_wchar_t*)result->buf;
      result->storage = PL_CHARS_LOCAL;
    } else
    { result->text.w = PL_malloc((total_length+1)*sizeof(pl_wchar_t));
      result->storage = PL_CHARS_MALLOC;
    }

    /* NOTE(review): damaged; the loop head is incomplete */
    for(to=result->text.w, i=0; iencoding == ENC_WCHAR )
      { memcpy(to, text[i]->text.w, text[i]->length*sizeof(pl_wchar_t));
        to += text[i]->length;
      } else
      { const unsigned char *f = (const unsigned char *)text[i]->text.t;
        const unsigned char *e = &f[text[i]->length];

        /* NOTE(review): damaged; the widening loop and the head of what
           looks like an assertion on the written length are missing */
        while(ftext.w) == total_length);
    *to = EOS;
  }

  return TRUE;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Sopen_text() opens a stream for reading the text `txt`, created with
Sopen_string() over bufsize_text(txt, txt->length) bytes.  The stream
gets the encoding of the text.  Only mode "r" is supported; any other
mode sets errno to EINVAL and returns NULL.

NOTE(review): the result of Sopen_string() is used without a NULL check.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

IOSTREAM *
Sopen_text(PL_chars_t *txt, const char *mode)
{ IOSTREAM *stream;

  if ( !streq(mode, "r") )
  { errno = EINVAL;
    return NULL;
  }

  stream = Sopen_string(NULL,
                        txt->text.t,
                        bufsize_text(txt, txt->length),
                        mode);
  stream->encoding = txt->encoding;

  return stream;
}