From a136b663bd57769ec38589a78fe0c3f0758091e8 Mon Sep 17 00:00:00 2001 From: Juan Jose Garcia Ripoll Date: Sun, 20 Feb 2011 23:52:24 +0000 Subject: [PATCH] Embed the Unicode database in the C library as a sequence of C arrays. --- src/Makefile.in | 4 --- src/c/char_ctype.d | 30 +++++++++++---------- src/c/main.d | 66 ---------------------------------------------- src/configure | 2 ++ src/configure.in | 2 ++ src/h/external.h | 6 ----- 6 files changed, 20 insertions(+), 90 deletions(-) diff --git a/src/Makefile.in b/src/Makefile.in index 03a6bf628..472e0f9bc 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -69,8 +69,6 @@ ecl/external.h: c/ecl/external.h c/ecl/external.h cp -rf $(srcdir)/h/*.h $(srcdir)/h/impl ecl/ bin/ecl$(EXE): ecl_min$(EXE) compile.lsp ecl/external.h build-stamp $(top_srcdir)/lsp/*.lsp - cp $(top_srcdir)/../contrib/unicode/ucd.dat . - cp $(top_srcdir)/../contrib/unicode/ucd16.dat . if [ -f CROSS-COMPILER ]; then \ ./CROSS-COMPILER compile; \ else \ @@ -144,8 +142,6 @@ install: for i in $(TARGETS); do \ $(INSTALL_PROGRAM) $$i $(DESTDIR)$(bindir); \ done - $(INSTALL_DATA) ucd.dat $(DESTDIR)$(ecldir)/ - $(INSTALL_DATA) ucd16.dat $(DESTDIR)$(ecldir)/ if [ -d encodings ]; then \ $(mkinstalldirs) $(DESTDIR)$(ecldir)/encodings; \ for i in ./encodings/*; do \ diff --git a/src/c/char_ctype.d b/src/c/char_ctype.d index 57f8a3e86..57679df3f 100644 --- a/src/c/char_ctype.d +++ b/src/c/char_ctype.d @@ -67,16 +67,20 @@ ecl_char_downcase(ecl_character code) #else /* ECL_UNICODE */ +extern const unsigned char ecl_ucd_misc_table[]; +extern const unsigned char *ecl_ucd_page_table[]; +extern const unsigned char ecl_ucd_page_table_1[]; + /* * 21-bits Unicode (0 to #x110000 char codes) */ #if ECL_UNICODE > 16 -static uint8_t * +const unsigned char * ucd_char_data(ecl_character code) { - unsigned char page = cl_core.ucd_pages[code >> 8]; - return cl_core.ucd_data + ((cl_index)page << 10) + 4 * (code & 0xFF); + const unsigned char *page = ecl_ucd_page_table[code >> 8]; + return page + (4 * (code & 0xFF)); } static cl_index @@ -85,7 +89,7 @@ ucd_value_0(ecl_character code) return ucd_char_data(code)[0]; } -#define read_case_bytes(c) (c[0] + (c[1] << 8) + (c[3] << 16)) +#define read_case_bytes(c) (c[1] + (c[2] << 8) + (c[3] << 16)) #endif /* @@ -94,11 +98,11 @@ ucd_value_0(ecl_character code) */ #if ECL_UNICODE <= 16 -static uint8_t * +const unsigned char * ucd_char_data(ecl_character code) { - unsigned char page = cl_core.ucd_pages[code >> 8]; - return cl_core.ucd_data + ((cl_index)page * (256 * 3)) + 3 * (code & 0xFF); + const unsigned char *page = ecl_ucd_page_table[code >> 8]; + return page + (3 * (code & 0xFF)); } static cl_index @@ -107,19 +111,19 @@ ucd_value_0(ecl_character code) return ucd_char_data(code)[0]; } -#define read_case_bytes(c) (c[0] + (c[1] << 8)) +#define read_case_bytes(c) (c[1] + (c[2] << 8)) #endif static int ucd_general_category(ecl_character code) { - return cl_core.ucd_misc[8 * ucd_value_0(code)]; + return ecl_ucd_misc_table[8 * ucd_value_0(code)]; } static int ucd_decimal_digit(ecl_character code) { - return cl_core.ucd_misc[3 + 8 * ucd_value_0(code)]; + return ecl_ucd_misc_table[3 + 8 * ucd_value_0(code)]; } bool @@ -163,9 +167,8 @@ ecl_alphanumericp(ecl_character i) ecl_character ecl_char_upcase(ecl_character code) { - uint8_t *c = ucd_char_data(code); + const unsigned char *c = ucd_char_data(code); if (c[0] == 1) { - c++; return read_case_bytes(c); } else { return code; @@ -175,9 +178,8 @@ ecl_char_upcase(ecl_character code) ecl_character ecl_char_downcase(ecl_character code) { - uint8_t *c = ucd_char_data(code); + const unsigned char *c = ucd_char_data(code); if (c[0] == 0) { - c++; return read_case_bytes(c); } else { return code; diff --git a/src/c/main.d b/src/c/main.d index f659a01cc..b92b0b936 100644 --- a/src/c/main.d +++ b/src/c/main.d @@ -262,61 +262,6 @@ cl_shutdown(void) ecl_set_option(ECL_OPT_BOOTED, -1); } -#ifdef ECL_UNICODE -static void -read_char_database() -{ -#if ECL_UNICODE > 16 -#define UCD "ucd.dat" -#else -#define UCD "ucd16.dat" -#endif - cl_object s = si_base_string_concatenate(2, - si_get_library_pathname(), - make_constant_base_string(UCD)); - cl_object output = Cnil; - FILE *f = fopen((char *)s->base_string.self, "rb"); - printf("%s\n", UCD); - if (f) { - cl_index size, read; - if (!fseek(f, 0, SEEK_END)) { - size = ftell(f); - fseek(f, 0, SEEK_SET); - output = ecl_alloc_simple_vector(size, aet_b8); - read = 0; - while (read < size) { - cl_index res; - res = fread(output->vector.self.b8 + read, 1, size - read, f); - if (res > 0) { - read += res; - } else { - output = Cnil; - break; - } - } - } - fclose(f); - } - if (output == Cnil) { - printf("Unable to read Unicode database: %s\n", s->base_string.self); - abort(); - } else { - uint8_t *p = output->vector.self.b8; - cl_core.unicode_database = output; - cl_core.ucd_misc = p + 2; - cl_core.ucd_pages = cl_core.ucd_misc + (p[0] + (p[1]<<8)); -#if ECL_UNICODE > 16 - cl_core.ucd_data = cl_core.ucd_pages + (0x110000 / 256); -#else - cl_core.ucd_data = cl_core.ucd_pages + (65536 / 256); -#endif - } - ECL_SET(@'si::+unicode-database+', output); -} -#else -#define read_char_database() (void)0 -#endif - ecl_def_ct_single_float(default_rehash_size,1.5f,static,const); ecl_def_ct_single_float(default_rehash_threshold,0.75f,static,const); ecl_def_ct_base_string(str_common_lisp,"COMMON-LISP",11,static,const); @@ -456,12 +401,6 @@ struct cl_core_struct cl_core = { #endif Cnil, /* signal_queue */ -#ifdef ECL_UNICODE - Cnil, /* unicode_database */ - NULL, /* ucd_misc */ - NULL, /* ucd_pages */ - NULL, /* ucd_data */ -#endif NULL, /* default_sigmask */ #ifdef ECL_THREADS @@ -629,11 +568,6 @@ cl_boot(int argc, char **argv) ECL_SET(@'mp::*current-process*', env->own_process); #endif - /* - * Initialize Unicode character database. - */ - read_char_database(); - /* * Load character names. The following hash table is a map * from names to character codes and viceversa. Note that we diff --git a/src/configure b/src/configure index e0d96e19b..14d6c74f6 100755 --- a/src/configure +++ b/src/configure @@ -15257,6 +15257,7 @@ _ACEOF CHAR_CODE_LIMIT=65536 ECL_CHARACTER=$ECL_INT16_T + EXTRA_OBJS="$EXTRA_OBJS unicode/ucd16.o unicode/ucd16-0000.o unicode/ucd16-0016.o unicode/ucd16-0032.o unicode/ucd16-0048.o unicode/ucd16-0064.o" else cat >>confdefs.h <<\_ACEOF @@ -15265,6 +15266,7 @@ _ACEOF CHAR_CODE_LIMIT=1114112 ECL_CHARACTER=$ECL_INT32_T + EXTRA_OBJS="$EXTRA_OBJS unicode/ucd.o unicode/ucd-0000.o unicode/ucd-0016.o unicode/ucd-0032.o unicode/ucd-0048.o unicode/ucd-0064.o unicode/ucd-0080.o unicode/ucd-0096.o" fi else CHAR_CODE_LIMIT=256 diff --git a/src/configure.in b/src/configure.in index 69c168772..5ce6f8660 100644 --- a/src/configure.in +++ b/src/configure.in @@ -871,10 +871,12 @@ if test "x${enable_unicode}" != "xno"; then AC_DEFINE(ECL_UNICODE, [16], [Support for Unicode]) CHAR_CODE_LIMIT=65536 ECL_CHARACTER=$ECL_INT16_T + EXTRA_OBJS="$EXTRA_OBJS unicode/ucd16.o unicode/ucd16-0000.o unicode/ucd16-0016.o unicode/ucd16-0032.o unicode/ucd16-0048.o unicode/ucd16-0064.o" else AC_DEFINE(ECL_UNICODE, [21], [Support for Unicode]) CHAR_CODE_LIMIT=1114112 ECL_CHARACTER=$ECL_INT32_T + EXTRA_OBJS="$EXTRA_OBJS unicode/ucd.o unicode/ucd-0000.o unicode/ucd-0016.o unicode/ucd-0032.o unicode/ucd-0048.o unicode/ucd-0064.o unicode/ucd-0080.o unicode/ucd-0096.o" fi else CHAR_CODE_LIMIT=256 diff --git a/src/h/external.h b/src/h/external.h index fdda59a4d..3795a78cc 100755 --- a/src/h/external.h +++ b/src/h/external.h @@ -231,12 +231,6 @@ struct cl_core_struct { #endif cl_object signal_queue; -#ifdef ECL_UNICODE - cl_object unicode_database; - uint8_t *ucd_misc; - uint8_t *ucd_pages; - uint8_t *ucd_data; -#endif void *default_sigmask; #ifdef ECL_THREADS