diff --git a/src/c/string.d b/src/c/string.d index a45ea0750..2a4dc010c 100644 --- a/src/c/string.d +++ b/src/c/string.d @@ -960,3 +960,109 @@ nstring_case(cl_narg narg, cl_object fun, ecl_casefun casefun, ecl_va_list ARGS) } @(return output); @) + +cl_object +ecl_decode_from_cstring(const char *s, cl_fixnum len, cl_object encoding) +{ + volatile cl_object ret; + ECL_HANDLER_CASE_BEGIN(ecl_process_env(), ecl_list1(@'ext::character-decoding-error')) { + ret = si_octets_to_string(3, ecl_make_constant_base_string(s, len), @':external-format', encoding); + } ECL_HANDLER_CASE(1, c) { + ret = c; /* suppress "unused variable `c`" warning */ + ret = OBJNULL; + } ECL_HANDLER_CASE_END; + return ret; +} + +cl_fixnum +ecl_encode_to_cstring(char *output, cl_fixnum output_len, cl_object input, cl_object encoding) +{ + volatile cl_fixnum ret; + ECL_HANDLER_CASE_BEGIN(ecl_process_env(), ecl_list1(@'ext::character-encoding-error')) { + cl_object output_vec = si_string_to_octets(3, input, @':external-format', encoding); + ret = output_vec->vector.fillp + 1; + if (ret <= output_len) { + memcpy(output, output_vec->vector.self.b8, (ret-1)*sizeof(char)); + output[ret-1] = 0; /* null-terminator */ + } + } ECL_HANDLER_CASE(1, c) { + input = c; /* suppress "unused variable `c`" warning */ + ret = -1; + } ECL_HANDLER_CASE_END; + return ret; +} + +#ifdef HAVE_WCHAR_H +cl_object +ecl_decode_from_unicode_wstring(const wchar_t *s, cl_fixnum len) +{ + cl_object input; + cl_object elttype; + cl_object encoding; + volatile cl_object ret; + if (len < 0) { + len = wcslen(s); + } + switch (sizeof(wchar_t)) { + case 1: + elttype = @'ext::byte8'; + encoding = @':utf-8'; + break; + case 2: + elttype = @'ext::byte16'; + encoding = @':ucs-2'; + break; + case 4: + elttype = @'ext::byte32'; + encoding = @':ucs-4'; + break; + default: + ecl_internal_error("Unexpected sizeof(wchar_t)"); + } + input = si_make_vector(elttype, ecl_make_fixnum(len), ECL_NIL, ECL_NIL, ECL_NIL, ECL_NIL); + memcpy(input->vector.self.b8, s, len*sizeof(wchar_t)); + ECL_HANDLER_CASE_BEGIN(ecl_process_env(), ecl_list1(@'ext::character-decoding-error')) { + ret = si_octets_to_string(3, input, @':external-format', encoding); + } ECL_HANDLER_CASE(1, c) { + ret = c; /* suppress "unused variable `c`" warning */ + ret = OBJNULL; + } ECL_HANDLER_CASE_END; + return ret; +} + +cl_fixnum +ecl_encode_to_unicode_wstring(wchar_t *output, cl_fixnum output_len, cl_object input) +{ + cl_object elttype; + cl_object encoding; + volatile cl_fixnum ret; + switch (sizeof(wchar_t)) { + case 1: + elttype = @'ext::byte8'; + encoding = @':utf-8'; + break; + case 2: + elttype = @'ext::byte16'; + encoding = @':ucs-2'; + break; + case 4: + elttype = @'ext::byte32'; + encoding = @':ucs-4'; + break; + default: + ecl_internal_error("Unexpected sizeof(wchar_t)"); + } + ECL_HANDLER_CASE_BEGIN(ecl_process_env(), ecl_list1(@'ext::character-encoding-error')) { + cl_object output_vec = si_string_to_octets(5, input, @':external-format', encoding, @':element-type', elttype); + ret = output_vec->vector.fillp + 1; + if (ret <= output_len) { + memcpy(output, output_vec->vector.self.b8, (ret-1)*sizeof(wchar_t)); + output[ret-1] = 0; /* null-terminator */ + } + } ECL_HANDLER_CASE(1, c) { + input = c; /* suppress "unused variable `c`" warning */ + ret = -1; + } ECL_HANDLER_CASE_END; + return ret; +} +#endif diff --git a/src/configure b/src/configure index 2e6c70bbd..f65d8c6d7 100755 --- a/src/configure +++ b/src/configure @@ -10099,6 +10099,18 @@ $as_echo "#define ECL_UNICODE 21" >>confdefs.h $as_echo "#define ECL_UNICODE_NAMES 1" >>confdefs.h EXTRA_OBJS="$EXTRA_OBJS unicode/ucd_names_char.o unicode/ucd_names_codes.o unicode/ucd_names_pair.o unicode/ucd_names_str.o" + for ac_header in wchar.h +do : + ac_fn_c_check_header_mongrel "$LINENO" "wchar.h" "ac_cv_header_wchar_h" "$ac_includes_default" +if test "x$ac_cv_header_wchar_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_WCHAR_H 1 +_ACEOF + +fi + +done + else CHAR_CODE_LIMIT=256 ECL_CHARACTER="int" diff --git a/src/configure.ac b/src/configure.ac index 29f86fe36..f294a37fa 100644 --- a/src/configure.ac +++ b/src/configure.ac @@ -908,6 +908,7 @@ if test "x${enable_unicode}" != "xno"; then fi AC_DEFINE([ECL_UNICODE_NAMES], [1], [Link in the database of Unicode names]) EXTRA_OBJS="$EXTRA_OBJS unicode/ucd_names_char.o unicode/ucd_names_codes.o unicode/ucd_names_pair.o unicode/ucd_names_str.o" + AC_CHECK_HEADERS([wchar.h]) else CHAR_CODE_LIMIT=256 ECL_CHARACTER="int" diff --git a/src/doc/manual/standards/strings.txi b/src/doc/manual/standards/strings.txi index 02c44a47f..b27f192b7 100644 --- a/src/doc/manual/standards/strings.txi +++ b/src/doc/manual/standards/strings.txi @@ -90,6 +90,33 @@ external format. The bounding index designators @var{start} and @var{null-terminate} is true, add a terminating null byte. Signals an @coderef{ext:character-encoding-error} if the encoding fails. @end defun + +@cppdef ecl_decode_from_cstring +@deftypefun cl_object ecl_decode_from_cstring (const char *string, cl_fixnum length, cl_object external_format) +Decode a C string of the given length into a Lisp string using the +specified external format. If @var{length} is -1, the length is +determined by @code{strlen}. Returns @code{NULL} if the decoding fails. +@end deftypefun + +@cppdef ecl_encode_to_cstring +@deftypefun cl_fixnum ecl_encode_to_cstring (char *output, cl_fixnum output_length, cl_object input, cl_object external_format) +Encode the Lisp string @var{input} into a C string of the given length +using the specified external format. Returns the number of characters +necessary to encode the Lisp string (including the null terminator). If +this is larger than @var{output_length}, @var{output} is unchanged. +Returns -1 if the encoding fails. +@end deftypefun + +@cppdef ecl_decode_from_unicode_wstring +@cppdef ecl_encode_to_unicode_wstring +@deftypefun cl_object ecl_decode_from_unicode_wstring (const wchar_t *string, cl_fixnum length) +@deftypefunx cl_fixnum ecl_encode_to_unicode_wstring (wchar_t *output, cl_fixnum output_length, cl_object input) +These functions work the same as @coderef{ecl_decode_from_cstring}, +@coderef{ecl_encode_to_cstring}, except that the external format used is +either utf-8, utf-16 or utf-32 depending on whether +@code{sizeof(wchar_t)} is 1, 2, or 4 respectively. +@end deftypefun + @subsubsection ANSI dictionary Common Lisp and C equivalence diff --git a/src/ecl/configpre.h b/src/ecl/configpre.h index 17b67b980..6b886dd8b 100644 --- a/src/ecl/configpre.h +++ b/src/ecl/configpre.h @@ -593,6 +593,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_VFORK_H +/* Define to 1 if you have the header file. */ +#undef HAVE_WCHAR_H + /* Define to 1 if `fork' works. */ #undef HAVE_WORKING_FORK diff --git a/src/h/config.h.in b/src/h/config.h.in index 1550bdcd3..2ee7f42e9 100644 --- a/src/h/config.h.in +++ b/src/h/config.h.in @@ -232,6 +232,9 @@ typedef unsigned char ecl_base_char; /* feenableexcept is available */ #undef HAVE_FEENABLEEXCEPT +/* wide-strings are available */ +#undef HAVE_WCHAR_H + /* * C macros for inlining, denoting probable code paths and other stuff * that makes better code. Most of it is GCC specific. diff --git a/src/h/external.h b/src/h/external.h index efcf5fcc5..354eeb4c4 100755 --- a/src/h/external.h +++ b/src/h/external.h @@ -1699,6 +1699,13 @@ extern ECL_API ecl_character ecl_char(cl_object s, cl_index i); extern ECL_API ecl_character ecl_char_set(cl_object s, cl_index i, ecl_character c); extern ECL_API cl_object si_octets_to_string _ECL_ARGS((cl_narg narg, cl_object input, ...)); extern ECL_API cl_object si_string_to_octets _ECL_ARGS((cl_narg narg, cl_object input, ...)); +extern ECL_API cl_object ecl_decode_from_cstring(const char *s, cl_fixnum len, cl_object encoding); +extern ECL_API cl_fixnum ecl_encode_to_cstring(char *output, cl_fixnum output_len, cl_object input, cl_object encoding); +#ifdef HAVE_WCHAR_H +#include +extern ECL_API cl_object ecl_decode_from_unicode_wstring(const wchar_t *s, cl_fixnum len); +extern ECL_API cl_fixnum ecl_encode_to_unicode_wstring(wchar_t *output, cl_fixnum output_len, cl_object input); +#endif /* structure.c */ diff --git a/src/tests/normal-tests/embedding.lsp b/src/tests/normal-tests/embedding.lsp index 43df6ea27..100c6b76f 100644 --- a/src/tests/normal-tests/embedding.lsp +++ b/src/tests/normal-tests/embedding.lsp @@ -302,3 +302,103 @@ int main(int narg, char **argv) return 0; }")) (test-C-program c-code)))) + +;;; Date: 2021-08-13 (Marius Gerbershagen) +;;; Description: +;;; +;;; Verify that ecl_decode_from_cstring, ecl_encode_to_cstring and +;;; wide string equivalents work correctly +;;; +#+unicode +(test emb.0005.decode/encode-cstrings + (is-true + (let* ((c-code " +#include + +int main(int argc, char** argv) { + cl_boot(argc, argv); + + cl_object utf_8 = ecl_make_keyword(\"UTF-8\"); + + unsigned char invalid[3] = {0xff, 0xfe, 0}; + if (ecl_decode_from_cstring(invalid, -1, utf_8) != NULL) { + return -1; + } + + unsigned char x[9] = {240, 159, 145, 137, 240, 159, 145, 136, 0}; + cl_object s = cl_make_string(1, ecl_make_fixnum(2)); + ecl_char_set(s, 0, 128073); + ecl_char_set(s, 1, 128072); + + if (!ecl_equal(s, ecl_decode_from_cstring(x, -1, utf_8)) + || !ecl_equal(s, ecl_decode_from_cstring(x, 8, utf_8))) { + return -2; + } + + unsigned char y[9]; + if (ecl_encode_to_cstring(y, 9, s, utf_8) != 9) { + return -3; + } + for (int i = 0; i < 9; i++) { + if (x[i] != y[i]) { + return -4; + } + } + + if (ecl_encode_to_cstring(y, 1, s, utf_8) != 9) { + return -5; + } + + if (ecl_encode_to_cstring(y, 9, s, ecl_make_keyword(\"US-ASCII\")) != -1) { + return -6; + } + +#ifdef HAVE_WCHAR_H + if (sizeof(wchar_t) == 2) { + wchar_t u[5] = {55357, 64585, 55357, 64584, 0}; + if (!ecl_equal(s, ecl_decode_from_unicode_wstring(u, -1)) + || !ecl_equal(s, ecl_decode_from_unicode_wstring(u, 4))) { + return -7; + } + + wchar_t v[5]; + if (ecl_encode_to_unicode_wstring(v, 5, s) != 5) { + return -8; + } + for (int i = 0; i < 5; i++) { + if (u[i] != v[i]) { + return -9; + } + } + + if (ecl_encode_to_unicode_wstring(v, 1, s) != 5) { + return -10; + } + } else if (sizeof(wchar_t) == 4) { + wchar_t u[3] = {128073, 128072, 0}; + if (!ecl_equal(s, ecl_decode_from_unicode_wstring(u, -1)) + || !ecl_equal(s, ecl_decode_from_unicode_wstring(u, 2))) { + return -7; + } + + wchar_t v[3]; + if (ecl_encode_to_unicode_wstring(v, 3, s) != 3) { + return -8; + } + for (int i = 0; i < 3; i++) { + if (u[i] != v[i]) { + return -9; + } + } + + if (ecl_encode_to_unicode_wstring(v, 1, s) != 3) { + return -10; + } + } +#endif + + cl_shutdown(); + return 0; +} +")) + (test-C-program c-code))))