diff --git a/src/c/file.d b/src/c/file.d index e818203d1..058d677f8 100755 --- a/src/c/file.d +++ b/src/c/file.d @@ -4457,6 +4457,77 @@ seq_in_unread_char(cl_object strm, ecl_character c) strm->stream.byte_stack = ECL_NIL; } +#ifdef ecl_uint16_t +static ecl_character +seq_in_ucs2_read_char(cl_object strm) +{ + cl_fixnum curr_pos = SEQ_INPUT_POSITION(strm); + cl_fixnum last = SEQ_INPUT_LIMIT(strm); + if (curr_pos >= last) { + return EOF; + } + cl_object vector = SEQ_INPUT_VECTOR(strm); + ecl_character c = vector->vector.self.b16[curr_pos++]; + cl_object err; + if (c >= 0xD800 && c <= 0xDBFF) { + if (curr_pos >= last) { + err = ecl_list1(ecl_make_fixnum(c)); + goto DECODING_ERROR; + } + ecl_character aux = vector->vector.self.b16[curr_pos++]; + if (aux < 0xDC00 || aux > 0xDFFF) { + err = cl_list(2, ecl_make_fixnum(c), ecl_make_fixnum(aux)); + goto DECODING_ERROR; + } + c = ((c & 0x3FFF) << 10) + (aux & 0x3FFF) + 0x10000; + } + SEQ_INPUT_POSITION(strm) = curr_pos; + return c; + cl_object code; + DECODING_ERROR: + code = _ecl_funcall4(@'ext::decoding-error', strm, + cl_stream_external_format(strm), + err); + if (Null(code)) { + /* Go for next character */ + return seq_in_ucs2_read_char(strm); + } else { + /* Return supplied character */ + return ecl_char_code(code); + } +} + +static void +seq_in_ucs2_unread_char(cl_object strm, ecl_character c) +{ + if (c >= 0x10000) { + SEQ_INPUT_POSITION(strm) -= 2; + } else { + SEQ_INPUT_POSITION(strm) -= 1; + } +} +#endif + +#ifdef ecl_uint32_t +static ecl_character +seq_in_ucs4_read_char(cl_object strm) +{ + cl_fixnum curr_pos = SEQ_INPUT_POSITION(strm); + if (curr_pos >= SEQ_INPUT_LIMIT(strm)) { + return EOF; + } + cl_object vector = SEQ_INPUT_VECTOR(strm); + SEQ_INPUT_POSITION(strm) += 1; + return vector->vector.self.b32[curr_pos]; +} + +static void +seq_in_ucs4_unread_char(cl_object strm, ecl_character c) +{ + SEQ_INPUT_POSITION(strm) -= 1; +} +#endif + static int seq_in_listen(cl_object strm) { @@ -4530,11 +4601,10 @@ make_sequence_input_stream(cl_object vector, cl_index istart, cl_index iend, cl_object type_name; int byte_size; int flags = 0; - if (!ECL_VECTORP(vector) || - ecl_aet_size[type = ecl_array_elttype(vector)] != 1) - { - FEerror("MAKE-SEQUENCE-INPUT-STREAM only accepts vectors whose element has a size of 1 byte.~%~A", 1, vector); - } + if (!ECL_VECTORP(vector)) { + FEwrong_type_nth_arg(@[ext::make-sequence-input-stream], 1, vector, @[vector]); + } + type = ecl_array_elttype(vector); type_name = ecl_elttype_to_symbol(type); byte_size = ecl_normalize_stream_element_type(type_name); /* Character streams always get some external format. For binary @@ -4545,9 +4615,32 @@ make_sequence_input_stream(cl_object vector, cl_index istart, cl_index iend, if (!byte_size && Null(external_format)) { external_format = @':default'; } - set_stream_elt_type(strm, byte_size, flags, external_format); - /* Override byte size */ - if (byte_size) strm->stream.byte_size = 8; + if (ecl_aet_size[type] == 1) { + set_stream_elt_type(strm, byte_size, flags, external_format); + /* Override byte size */ + if (byte_size) strm->stream.byte_size = 8; + } +#ifdef ecl_uint16_t + else if (ecl_aet_size[type] == 2 && external_format == @':ucs-2') { + IO_STREAM_ELT_TYPE(strm) = @'character'; + strm->stream.format = @':ucs-2'; + strm->stream.byte_size = 2*8; + strm->stream.ops->read_char = seq_in_ucs2_read_char; + strm->stream.ops->unread_char = seq_in_ucs2_unread_char; + } +#endif +#ifdef ecl_uint32_t + else if (ecl_aet_size[type] == 4 && external_format == @':ucs-4') { + IO_STREAM_ELT_TYPE(strm) = @'character'; + strm->stream.format = @':ucs-4'; + strm->stream.byte_size = 4*8; + strm->stream.ops->read_char = seq_in_ucs4_read_char; + strm->stream.ops->unread_char = seq_in_ucs4_unread_char; + } +#endif + else { + FEerror("Illegal combination of external-format ~A and input vector ~A for MAKE-SEQUENCE-INPUT-STREAM.~%", 2, external_format, vector); + } SEQ_INPUT_VECTOR(strm) = vector; SEQ_INPUT_POSITION(strm) = istart; SEQ_INPUT_LIMIT(strm) = iend; @@ -4570,6 +4663,18 @@ make_sequence_input_stream(cl_object vector, cl_index istart, cl_index iend, * SEQUENCE OUTPUT STREAMS */ +static void +seq_out_enlarge_vector(cl_object strm) +{ + cl_object vector = SEQ_OUTPUT_VECTOR(strm); + if (!ECL_ADJUSTABLE_ARRAY_P(vector)) { + FEerror("Can't adjust the dimensions of the sequence of sequence stream ~A", 1, strm); + } + vector = _ecl_funcall3(@'adjust-array', vector, + ecl_ash(ecl_make_fixnum(vector->vector.dim), 1)); + SEQ_OUTPUT_VECTOR(strm) = vector; +} + static cl_index seq_out_write_byte8(cl_object strm, unsigned char *c, cl_index n) { @@ -4580,13 +4685,7 @@ seq_out_write_byte8(cl_object strm, unsigned char *c, cl_index n) cl_fixnum last = vector->vector.dim; cl_fixnum delta = last - curr_pos; if (delta < n) { - /* Not enough space, enlarge */ - if (!ECL_ADJUSTABLE_ARRAY_P(vector)) { - FEerror("Can't adjust the dimensions of the sequence of sequence stream ~A", 1, strm); - } - vector = _ecl_funcall3(@'adjust-array', vector, - ecl_ash(ecl_make_fixnum(last), 1)); - SEQ_OUTPUT_VECTOR(strm) = vector; + seq_out_enlarge_vector(strm); goto AGAIN; } memcpy(vector->vector.self.bc + curr_pos, c, n); @@ -4597,6 +4696,55 @@ seq_out_write_byte8(cl_object strm, unsigned char *c, cl_index n) return n; } +#ifdef ecl_uint16_t +static ecl_character +seq_out_ucs2_write_char(cl_object strm, ecl_character c) +{ + AGAIN: + { + cl_object vector = SEQ_OUTPUT_VECTOR(strm); + cl_fixnum curr_pos = SEQ_OUTPUT_POSITION(strm); + cl_fixnum n = (c >= 0x10000) ? 2 : 1; + if (vector->vector.dim - curr_pos < n) { + seq_out_enlarge_vector(strm); + goto AGAIN; + } + if (c >= 0x10000) { + c -= 0x10000; + vector->vector.self.b16[curr_pos++] = (c >> 10) | 0xD800; + vector->vector.self.b16[curr_pos++] = (c & 0x3FFF) | 0xDC00; + } else { + vector->vector.self.b16[curr_pos++] = c; + } + SEQ_OUTPUT_POSITION(strm) = curr_pos; + if (vector->vector.fillp < curr_pos) + vector->vector.fillp = curr_pos; + } + return c; +} +#endif + +#ifdef ecl_uint32_t +static ecl_character +seq_out_ucs4_write_char(cl_object strm, ecl_character c) +{ + AGAIN: + { + cl_object vector = SEQ_OUTPUT_VECTOR(strm); + cl_fixnum curr_pos = SEQ_OUTPUT_POSITION(strm); + if (vector->vector.dim - curr_pos < 1) { + seq_out_enlarge_vector(strm); + goto AGAIN; + } + vector->vector.self.b32[curr_pos++] = c; + SEQ_OUTPUT_POSITION(strm) = curr_pos; + if (vector->vector.fillp < curr_pos) + vector->vector.fillp = curr_pos; + } + return c; +} +#endif + static cl_object seq_out_get_position(cl_object strm) { @@ -4661,11 +4809,10 @@ make_sequence_output_stream(cl_object vector, cl_object external_format) cl_object type_name; int byte_size; int flags = 0; - if (!ECL_VECTORP(vector) || - ecl_aet_size[type = ecl_array_elttype(vector)] != 1) - { - FEerror("MAKE-SEQUENCE-OUTPUT-STREAM only accepts vectors whose element has a size of 1 byte.~%~A", 1, vector); - } + if (!ECL_VECTORP(vector)) { + FEwrong_type_nth_arg(@[ext::make-sequence-output-stream], 1, vector, @[vector]); + } + type = ecl_array_elttype(vector); type_name = ecl_elttype_to_symbol(type); byte_size = ecl_normalize_stream_element_type(type_name); /* Character streams always get some external format. For binary @@ -4676,9 +4823,30 @@ make_sequence_output_stream(cl_object vector, cl_object external_format) if (!byte_size && Null(external_format)) { external_format = @':default'; } - set_stream_elt_type(strm, byte_size, flags, external_format); - /* Override byte size */ - if (byte_size) strm->stream.byte_size = 8; + if (ecl_aet_size[type] == 1) { + set_stream_elt_type(strm, byte_size, flags, external_format); + /* Override byte size */ + if (byte_size) strm->stream.byte_size = 8; + } +#ifdef ecl_uint16_t + else if (ecl_aet_size[type] == 2 && external_format == @':ucs-2') { + IO_STREAM_ELT_TYPE(strm) = @'character'; + strm->stream.format = @':ucs-2'; + strm->stream.byte_size = 2*8; + strm->stream.ops->write_char = seq_out_ucs2_write_char; + } +#endif +#ifdef ecl_uint32_t + else if (ecl_aet_size[type] == 4 && external_format == @':ucs-4') { + IO_STREAM_ELT_TYPE(strm) = @'character'; + strm->stream.format = @':ucs-4'; + strm->stream.byte_size = 4*8; + strm->stream.ops->write_char = seq_out_ucs4_write_char; + } +#endif + else { + FEerror("Illegal combination of external-format ~A and output vector ~A for MAKE-SEQUENCE-OUTPUT-STREAM.~%", 2, external_format, vector); + } SEQ_OUTPUT_VECTOR(strm) = vector; SEQ_OUTPUT_POSITION(strm) = vector->vector.fillp; return strm; @@ -5203,7 +5371,29 @@ ecl_normalize_stream_element_type(cl_object element_type) return -8; } else if (element_type == @'unsigned-byte' || element_type == @'ext::byte8') { return 8; - } else if (element_type == @':default') { + } +#ifdef ecl_uint16_t + else if (element_type == @'ext::integer16') { + return -16; + } else if (element_type == @'ext::byte16') { + return 16; + } +#endif +#ifdef ecl_uint32_t + else if (element_type == @'ext::integer32') { + return -32; + } else if (element_type == @'ext::byte32') { + return 32; + } +#endif +#ifdef ecl_uint64_t + else if (element_type == @'ext::integer64') { + return -64; + } else if (element_type == @'ext::byte64') { + return 64; + } +#endif + else if (element_type == @':default') { return 0; } else if (element_type == @'base-char' || element_type == @'character') { return 0; diff --git a/src/c/string.d b/src/c/string.d index 07749c99f..2a4dc010c 100644 --- a/src/c/string.d +++ b/src/c/string.d @@ -70,7 +70,7 @@ do_make_string(cl_index s, ecl_character code) @) /* - Make a string of a certain size, with some eading zeros to + Make a string of a certain size, with some leading zeros to keep C happy. The string must be adjustable, to allow further growth. (See unixfsys.c for its use). */ @@ -895,3 +895,174 @@ nstring_case(cl_narg narg, cl_object fun, ecl_casefun casefun, ecl_va_list ARGS) } @(return output); @) + +@(defun ext::octets-to-string (input &key + (external_format @':default') + (start ecl_make_fixnum(0)) + (end ECL_NIL)) + cl_object output; + cl_index input_size; + cl_object input_stream; + cl_index output_size; + cl_object ret; + @ + output = si_get_buffer_string(); + input_stream = si_make_sequence_input_stream(7, input, + @':external-format', external_format, + @':start', start, + @':end', end); + /* INV: MAKE-SEQUENCE-INPUT-STREAM checks types of start and end indices */ + input_size = (Null(end) ? ecl_length(input) : ecl_fixnum(end)) - ecl_fixnum(start); + output_size = 0; + do { + output->base_string.fillp = output->base_string.dim; + output_size += ecl_to_unsigned_integer(si_do_read_sequence(output, input_stream, + ecl_make_fixnum(output_size), + ecl_make_fixnum(output->base_string.dim))); + if (output_size < output->base_string.dim) { + break; + } + output = _ecl_funcall3(@'adjust-array', output, + ecl_make_fixnum(input_size > output_size + ? input_size + : output_size + 128)); + } while (1); + output->base_string.fillp = output_size; + if (ecl_fits_in_base_string(output)) { + ret = si_copy_to_simple_base_string(output); + } else { + ret = cl_copy_seq(output); + } + si_put_buffer_string(output); + @(return ret); +@) + +@(defun ext::string-to-octets (input &key + (external_format @':default') + (start ecl_make_fixnum(0)) + (end ECL_NIL) + (null_terminate ECL_NIL) + (element_type @'ext::byte8')) + cl_object output; + cl_object output_stream; + @ + output = si_make_vector(element_type, /* element-type */ + cl_length(input), /* length */ + ECL_T, /* adjustable */ + ecl_make_fixnum(0), /* fillp */ + ECL_NIL, /* displaced */ + ECL_NIL); /* displaced-offset */ + output_stream = si_make_sequence_output_stream(3, output, + @':external-format', external_format); + si_do_write_sequence(input, output_stream, start, end); + if (!Null(null_terminate)) { + ecl_write_char(0, output_stream); + } + @(return output); +@) + +cl_object +ecl_decode_from_cstring(const char *s, cl_fixnum len, cl_object encoding) +{ + volatile cl_object ret; + ECL_HANDLER_CASE_BEGIN(ecl_process_env(), ecl_list1(@'ext::character-decoding-error')) { + ret = si_octets_to_string(3, ecl_make_constant_base_string(s, len), @':external-format', encoding); + } ECL_HANDLER_CASE(1, c) { + ret = c; /* suppress "unused variable `c`" warning */ + ret = OBJNULL; + } ECL_HANDLER_CASE_END; + return ret; +} + +cl_fixnum +ecl_encode_to_cstring(char *output, cl_fixnum output_len, cl_object input, cl_object encoding) +{ + volatile cl_fixnum ret; + ECL_HANDLER_CASE_BEGIN(ecl_process_env(), ecl_list1(@'ext::character-encoding-error')) { + cl_object output_vec = si_string_to_octets(3, input, @':external-format', encoding); + ret = output_vec->vector.fillp + 1; + if (ret <= output_len) { + memcpy(output, output_vec->vector.self.b8, (ret-1)*sizeof(char)); + output[ret-1] = 0; /* null-terminator */ + } + } ECL_HANDLER_CASE(1, c) { + input = c; /* suppress "unused variable `c`" warning */ + ret = -1; + } ECL_HANDLER_CASE_END; + return ret; +} + +#ifdef HAVE_WCHAR_H +cl_object +ecl_decode_from_unicode_wstring(const wchar_t *s, cl_fixnum len) +{ + cl_object input; + cl_object elttype; + cl_object encoding; + volatile cl_object ret; + if (len < 0) { + len = wcslen(s); + } + switch (sizeof(wchar_t)) { + case 1: + elttype = @'ext::byte8'; + encoding = @':utf-8'; + break; + case 2: + elttype = @'ext::byte16'; + encoding = @':ucs-2'; + break; + case 4: + elttype = @'ext::byte32'; + encoding = @':ucs-4'; + break; + default: + ecl_internal_error("Unexpected sizeof(wchar_t)"); + } + input = si_make_vector(elttype, ecl_make_fixnum(len), ECL_NIL, ECL_NIL, ECL_NIL, ECL_NIL); + memcpy(input->vector.self.b8, s, len*sizeof(wchar_t)); + ECL_HANDLER_CASE_BEGIN(ecl_process_env(), ecl_list1(@'ext::character-decoding-error')) { + ret = si_octets_to_string(3, input, @':external-format', encoding); + } ECL_HANDLER_CASE(1, c) { + ret = c; /* suppress "unused variable `c`" warning */ + ret = OBJNULL; + } ECL_HANDLER_CASE_END; + return ret; +} + +cl_fixnum +ecl_encode_to_unicode_wstring(wchar_t *output, cl_fixnum output_len, cl_object input) +{ + cl_object elttype; + cl_object encoding; + volatile cl_fixnum ret; + switch (sizeof(wchar_t)) { + case 1: + elttype = @'ext::byte8'; + encoding = @':utf-8'; + break; + case 2: + elttype = @'ext::byte16'; + encoding = @':ucs-2'; + break; + case 4: + elttype = @'ext::byte32'; + encoding = @':ucs-4'; + break; + default: + ecl_internal_error("Unexpected sizeof(wchar_t)"); + } + ECL_HANDLER_CASE_BEGIN(ecl_process_env(), ecl_list1(@'ext::character-encoding-error')) { + cl_object output_vec = si_string_to_octets(5, input, @':external-format', encoding, @':element-type', elttype); + ret = output_vec->vector.fillp + 1; + if (ret <= output_len) { + memcpy(output, output_vec->vector.self.b8, (ret-1)*sizeof(wchar_t)); + output[ret-1] = 0; /* null-terminator */ + } + } ECL_HANDLER_CASE(1, c) { + input = c; /* suppress "unused variable `c`" warning */ + ret = -1; + } ECL_HANDLER_CASE_END; + return ret; +} +#endif diff --git a/src/c/symbols_list.h b/src/c/symbols_list.h index 360987d1c..f75b4b020 100755 --- a/src/c/symbols_list.h +++ b/src/c/symbols_list.h @@ -2183,6 +2183,9 @@ cl_symbols[] = { {EXT_ "STREAM-ENCODING-ERROR" ECL_FUN(NULL, NULL, -1) ECL_VAR(EXT_ORDINARY, OBJNULL)}, {EXT_ "DECODING-ERROR" ECL_FUN(NULL, NULL, 3) ECL_VAR(EXT_ORDINARY, OBJNULL)}, {EXT_ "ENCODING-ERROR" ECL_FUN(NULL, NULL, 3) ECL_VAR(EXT_ORDINARY, OBJNULL)}, +{EXT_ "OCTETS-TO-STRING" ECL_FUN("si_octets_to_string", si_octets_to_string, -2) ECL_VAR(EXT_ORDINARY, OBJNULL)}, +{EXT_ "STRING-TO-OCTETS" ECL_FUN("si_string_to_octets", si_string_to_octets, -2) ECL_VAR(EXT_ORDINARY, OBJNULL)}, +{KEY_ "NULL-TERMINATE" ECL_FUN(NULL, NULL, -1) ECL_VAR(KEYWORD, OBJNULL)}, /* #endif ECL_UNICODE */ {SYS_ "PROPER-LIST-P" ECL_FUN("si_proper_list_p", si_proper_list_p, 1) ECL_VAR(SI_ORDINARY, OBJNULL)}, diff --git a/src/configure b/src/configure index 2e6c70bbd..f65d8c6d7 100755 --- a/src/configure +++ b/src/configure @@ -10099,6 +10099,18 @@ $as_echo "#define ECL_UNICODE 21" >>confdefs.h $as_echo "#define ECL_UNICODE_NAMES 1" >>confdefs.h EXTRA_OBJS="$EXTRA_OBJS unicode/ucd_names_char.o unicode/ucd_names_codes.o unicode/ucd_names_pair.o unicode/ucd_names_str.o" + for ac_header in wchar.h +do : + ac_fn_c_check_header_mongrel "$LINENO" "wchar.h" "ac_cv_header_wchar_h" "$ac_includes_default" +if test "x$ac_cv_header_wchar_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_WCHAR_H 1 +_ACEOF + +fi + +done + else CHAR_CODE_LIMIT=256 ECL_CHARACTER="int" diff --git a/src/configure.ac b/src/configure.ac index 29f86fe36..f294a37fa 100644 --- a/src/configure.ac +++ b/src/configure.ac @@ -908,6 +908,7 @@ if test "x${enable_unicode}" != "xno"; then fi AC_DEFINE([ECL_UNICODE_NAMES], [1], [Link in the database of Unicode names]) EXTRA_OBJS="$EXTRA_OBJS unicode/ucd_names_char.o unicode/ucd_names_codes.o unicode/ucd_names_pair.o unicode/ucd_names_str.o" + AC_CHECK_HEADERS([wchar.h]) else CHAR_CODE_LIMIT=256 ECL_CHARACTER="int" diff --git a/src/doc/help.lsp b/src/doc/help.lsp index 39b0d0aa3..f2bf7b2fa 100644 --- a/src/doc/help.lsp +++ b/src/doc/help.lsp @@ -2301,6 +2301,15 @@ Evaluates FORMs in order from left to right. If any FORM evaluates to non- NIL, quits and returns that (single) value. If the last FORM is reached, returns whatever values it returns.") +(docfun ext::octets-to-string function (input &key + (external-format :default) + (start 0) + (end nil)) " +Decode a sequence of octets into a string according to the given +external format. The bounding index designators start and end optionally +denote a subsequence to be decoded. +") + (docfun output-stream-p function (stream) " Returns T if STREAM can handle output operations; NIL otherwise.") @@ -3014,6 +3023,17 @@ Similar to STRING>=, but ignores cases.") Returns a copy of STRING with the specified characters removed from the right end. CHAR-SPEC must be a sequence of characters.") +(docfun ext::string-to-octets function (input &key + (external-format :default) + (start 0) + (end nil) + (null-terminate nil)) " +Encode a string into a sequence of octets according to the given +external format. The bounding index designators start and end +optionally denote a subsequence to be encoded. If null-terminate is +true, add a terminating null byte. +") + (docfun si::string-to-object function (string) " ECL specific. Equivalent to (READ-FROM-STRING STRING), but is much faster.") diff --git a/src/doc/manual/standards/streams.txi b/src/doc/manual/standards/streams.txi index 698578d08..717b3fdaa 100644 --- a/src/doc/manual/standards/streams.txi +++ b/src/doc/manual/standards/streams.txi @@ -152,6 +152,13 @@ Return the POSIX file descriptor of @var{file-stream} as an integer @subsubsection External Format Extensions +@lspdef ext:*default-external-format* +@defvar ext:*default-external-format* +Default external format to use for reading from streams, dealing with +filenames, etc. The default is to use utf-8 encoding if ECL is built +with Unicode support. +@end defvar + @lspdef ext:all-encodings @defun ext:all-encodings Return a list of all supported external formats diff --git a/src/doc/manual/standards/strings.txi b/src/doc/manual/standards/strings.txi index e19dd4b14..b27f192b7 100644 --- a/src/doc/manual/standards/strings.txi +++ b/src/doc/manual/standards/strings.txi @@ -67,6 +67,56 @@ The counterpart of the previous function is @coderef{ecl_char_set}, which implem Both functions check the type of their arguments and verify that the indices do not exceed the string boundaries. Otherwise they signal a @code{serious-condition}. @end deftypefun +@subsubsection Converting Unicode strings +Converting between different encodings. See @ref{Streams - External formats} for a list of supported encodings (external formats). + +@subsubheading Functions +@cppdef si_octets_to_string +@lspdef ext:octets-to-string +@defun ext:octets-to-string octets &key (external-format :default) (start 0) (end nil) +Decode a sequence of octets (i.e. 8-bit bytes) into a string according +to the given external format. @var{octets} must be a vector whose +elements have a size of 8-bit. The bounding index designators +@var{start} and @var{end} optionally denote a subsequence to be decoded. +Signals an @coderef{ext:character-decoding-error} if the decoding fails. +@end defun + +@cppdef si_string_to_octets +@lspdef ext:string-to-octets +@defun ext:string-to-octets string &key (external-format :default) (start 0) (end nil) (null-terminate nil) +Encode a string into a sequence of octets according to the given +external format. The bounding index designators @var{start} and +@var{end} optionally denote a subsequence to be encoded. If +@var{null-terminate} is true, add a terminating null byte. Signals an +@coderef{ext:character-encoding-error} if the encoding fails. +@end defun + +@cppdef ecl_decode_from_cstring +@deftypefun cl_object ecl_decode_from_cstring (const char *string, cl_fixnum length, cl_object external_format) +Decode a C string of the given length into a Lisp string using the +specified external format. If @var{length} is -1, the length is +determined by @code{strlen}. Returns @code{NULL} if the decoding fails. +@end deftypefun + +@cppdef ecl_encode_to_cstring +@deftypefun cl_fixnum ecl_encode_to_cstring (char *output, cl_fixnum output_length, cl_object input, cl_object external_format) +Encode the Lisp string @var{input} into a C string of the given length +using the specified external format. Returns the number of characters +necessary to encode the Lisp string (including the null terminator). If +this is larger than @var{output_length}, @var{output} is unchanged. +Returns -1 if the encoding fails. +@end deftypefun + +@cppdef ecl_decode_from_unicode_wstring +@cppdef ecl_encode_to_unicode_wstring +@deftypefun cl_object ecl_decode_from_unicode_wstring (const wchar_t *string, cl_fixnum length) +@deftypefunx cl_fixnum ecl_encode_to_unicode_wstring (wchar_t *output, cl_fixnum output_length, cl_object input) +These functions work the same as @coderef{ecl_decode_from_cstring}, +@coderef{ecl_encode_to_cstring}, except that the external format used is +either utf-8, utf-16 or utf-32 depending on whether +@code{sizeof(wchar_t)} is 1, 2, or 4 respectively. +@end deftypefun + @subsubsection ANSI dictionary Common Lisp and C equivalence diff --git a/src/ecl/configpre.h b/src/ecl/configpre.h index 17b67b980..6b886dd8b 100644 --- a/src/ecl/configpre.h +++ b/src/ecl/configpre.h @@ -593,6 +593,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_VFORK_H +/* Define to 1 if you have the header file. */ +#undef HAVE_WCHAR_H + /* Define to 1 if `fork' works. */ #undef HAVE_WORKING_FORK diff --git a/src/h/config.h.in b/src/h/config.h.in index 1550bdcd3..2ee7f42e9 100644 --- a/src/h/config.h.in +++ b/src/h/config.h.in @@ -232,6 +232,9 @@ typedef unsigned char ecl_base_char; /* feenableexcept is available */ #undef HAVE_FEENABLEEXCEPT +/* wide-strings are available */ +#undef HAVE_WCHAR_H + /* * C macros for inlining, denoting probable code paths and other stuff * that makes better code. Most of it is GCC specific. diff --git a/src/h/external.h b/src/h/external.h index a32ef1c63..354eeb4c4 100755 --- a/src/h/external.h +++ b/src/h/external.h @@ -1697,6 +1697,15 @@ extern ECL_API bool ecl_member_char(ecl_character c, cl_object char_bag); extern ECL_API bool ecl_fits_in_base_string(cl_object s); extern ECL_API ecl_character ecl_char(cl_object s, cl_index i); extern ECL_API ecl_character ecl_char_set(cl_object s, cl_index i, ecl_character c); +extern ECL_API cl_object si_octets_to_string _ECL_ARGS((cl_narg narg, cl_object input, ...)); +extern ECL_API cl_object si_string_to_octets _ECL_ARGS((cl_narg narg, cl_object input, ...)); +extern ECL_API cl_object ecl_decode_from_cstring(const char *s, cl_fixnum len, cl_object encoding); +extern ECL_API cl_fixnum ecl_encode_to_cstring(char *output, cl_fixnum output_len, cl_object input, cl_object encoding); +#ifdef HAVE_WCHAR_H +#include +extern ECL_API cl_object ecl_decode_from_unicode_wstring(const wchar_t *s, cl_fixnum len); +extern ECL_API cl_fixnum ecl_encode_to_unicode_wstring(wchar_t *output, cl_fixnum output_len, cl_object input); +#endif /* structure.c */ diff --git a/src/lsp/cmdline.lsp b/src/lsp/cmdline.lsp index 403fc6e1a..406c521c3 100644 --- a/src/lsp/cmdline.lsp +++ b/src/lsp/cmdline.lsp @@ -82,9 +82,11 @@ appeared after a '--'.") ("--c-stack" 1 (ext:set-limit 'ext:c-stack (read-from-string 1))) ("--trap-fpe" 0 (si::trap-fpe t t)) ("--no-trap-fpe" 0 (si::trap-fpe t nil)) - ("--encoding" 1 (dolist (i (list *standard-input* *standard-output* - *error-output* *trace-output*)) - (setf (stream-external-format i) (read-from-string 1)))) + ("--encoding" 1 (let ((enc (read-from-string 1))) + (setf ext::*default-external-format* enc) + (dolist (i (list *standard-input* *standard-output* + *error-output* *trace-output*)) + (setf (stream-external-format i) enc)))) ("--input-encoding" 1 (setf (stream-external-format *standard-input*) (read-from-string 1))) ("--output-encoding" 1 diff --git a/src/tests/normal-tests/embedding.lsp b/src/tests/normal-tests/embedding.lsp index 43df6ea27..100c6b76f 100644 --- a/src/tests/normal-tests/embedding.lsp +++ b/src/tests/normal-tests/embedding.lsp @@ -302,3 +302,103 @@ int main(int narg, char **argv) return 0; }")) (test-C-program c-code)))) + +;;; Date: 2021-08-13 (Marius Gerbershagen) +;;; Description: +;;; +;;; Verify that ecl_decode_from_cstring, ecl_encode_to_cstring and +;;; wide string equivalents work correctly +;;; +#+unicode +(test emb.0005.decode/encode-cstrings + (is-true + (let* ((c-code " +#include + +int main(int argc, char** argv) { + cl_boot(argc, argv); + + cl_object utf_8 = ecl_make_keyword(\"UTF-8\"); + + unsigned char invalid[3] = {0xff, 0xfe, 0}; + if (ecl_decode_from_cstring(invalid, -1, utf_8) != NULL) { + return -1; + } + + unsigned char x[9] = {240, 159, 145, 137, 240, 159, 145, 136, 0}; + cl_object s = cl_make_string(1, ecl_make_fixnum(2)); + ecl_char_set(s, 0, 128073); + ecl_char_set(s, 1, 128072); + + if (!ecl_equal(s, ecl_decode_from_cstring(x, -1, utf_8)) + || !ecl_equal(s, ecl_decode_from_cstring(x, 8, utf_8))) { + return -2; + } + + unsigned char y[9]; + if (ecl_encode_to_cstring(y, 9, s, utf_8) != 9) { + return -3; + } + for (int i = 0; i < 9; i++) { + if (x[i] != y[i]) { + return -4; + } + } + + if (ecl_encode_to_cstring(y, 1, s, utf_8) != 9) { + return -5; + } + + if (ecl_encode_to_cstring(y, 9, s, ecl_make_keyword(\"US-ASCII\")) != -1) { + return -6; + } + +#ifdef HAVE_WCHAR_H + if (sizeof(wchar_t) == 2) { + wchar_t u[5] = {55357, 64585, 55357, 64584, 0}; + if (!ecl_equal(s, ecl_decode_from_unicode_wstring(u, -1)) + || !ecl_equal(s, ecl_decode_from_unicode_wstring(u, 4))) { + return -7; + } + + wchar_t v[5]; + if (ecl_encode_to_unicode_wstring(v, 5, s) != 5) { + return -8; + } + for (int i = 0; i < 5; i++) { + if (u[i] != v[i]) { + return -9; + } + } + + if (ecl_encode_to_unicode_wstring(v, 1, s) != 5) { + return -10; + } + } else if (sizeof(wchar_t) == 4) { + wchar_t u[3] = {128073, 128072, 0}; + if (!ecl_equal(s, ecl_decode_from_unicode_wstring(u, -1)) + || !ecl_equal(s, ecl_decode_from_unicode_wstring(u, 2))) { + return -7; + } + + wchar_t v[3]; + if (ecl_encode_to_unicode_wstring(v, 3, s) != 3) { + return -8; + } + for (int i = 0; i < 3; i++) { + if (u[i] != v[i]) { + return -9; + } + } + + if (ecl_encode_to_unicode_wstring(v, 1, s) != 3) { + return -10; + } + } +#endif + + cl_shutdown(); + return 0; +} +")) + (test-C-program c-code))))