add convenience methods for de-/encoding strings and wide-strings from C

2026-03-15 09:20:23 -07:00 · 2021-08-13 11:11:20 +02:00 · 2021-08-13 11:11:20 +02:00 · a488595241
commit a488595241
parent 548309e165
8 changed files with 259 additions and 0 deletions
--- a/src/c/string.d
+++ b/src/c/string.d
@ -960,3 +960,109 @@ nstring_case(cl_narg narg, cl_object fun, ecl_casefun casefun, ecl_va_list ARGS)
  }
  @(return output);
@)
+
+cl_object
+ecl_decode_from_cstring(const char *s, cl_fixnum len, cl_object encoding)
+{
+  volatile cl_object ret;
+  ECL_HANDLER_CASE_BEGIN(ecl_process_env(), ecl_list1(@'ext::character-decoding-error')) {
+    ret = si_octets_to_string(3, ecl_make_constant_base_string(s, len), @':external-format', encoding);
+  } ECL_HANDLER_CASE(1, c) {
+    ret = c; /* suppress "unused variable `c`" warning */
+    ret = OBJNULL;
+  } ECL_HANDLER_CASE_END;
+  return ret;
+}
+
+cl_fixnum
+ecl_encode_to_cstring(char *output, cl_fixnum output_len, cl_object input, cl_object encoding)
+{
+  volatile cl_fixnum ret;
+  ECL_HANDLER_CASE_BEGIN(ecl_process_env(), ecl_list1(@'ext::character-encoding-error')) {
+    cl_object output_vec = si_string_to_octets(3, input, @':external-format', encoding);
+    ret = output_vec->vector.fillp + 1;
+    if (ret <= output_len) {
+      memcpy(output, output_vec->vector.self.b8, (ret-1)*sizeof(char));
+      output[ret-1] = 0; /* null-terminator */
+    }
+  } ECL_HANDLER_CASE(1, c) {
+    input = c; /* suppress "unused variable `c`" warning */
+    ret = -1;
+  } ECL_HANDLER_CASE_END;
+  return ret;
+}
+
+#ifdef HAVE_WCHAR_H
+cl_object
+ecl_decode_from_unicode_wstring(const wchar_t *s, cl_fixnum len)
+{
+  cl_object input;
+  cl_object elttype;
+  cl_object encoding;
+  volatile cl_object ret;
+  if (len < 0) {
+    len = wcslen(s);
+  }
+  switch (sizeof(wchar_t)) {
+  case 1:
+    elttype = @'ext::byte8';
+    encoding = @':utf-8';
+    break;
+  case 2:
+    elttype = @'ext::byte16';
+    encoding = @':ucs-2';
+    break;
+  case 4:
+    elttype = @'ext::byte32';
+    encoding = @':ucs-4';
+    break;
+  default:
+    ecl_internal_error("Unexpected sizeof(wchar_t)");
+  }
+  input = si_make_vector(elttype, ecl_make_fixnum(len), ECL_NIL, ECL_NIL, ECL_NIL, ECL_NIL);
+  memcpy(input->vector.self.b8, s, len*sizeof(wchar_t));
+  ECL_HANDLER_CASE_BEGIN(ecl_process_env(), ecl_list1(@'ext::character-decoding-error')) {
+    ret = si_octets_to_string(3, input, @':external-format', encoding);
+  } ECL_HANDLER_CASE(1, c) {
+    ret = c; /* suppress "unused variable `c`" warning */
+    ret = OBJNULL;
+  } ECL_HANDLER_CASE_END;
+  return ret;
+}
+
+cl_fixnum
+ecl_encode_to_unicode_wstring(wchar_t *output, cl_fixnum output_len, cl_object input)
+{
+  cl_object elttype;
+  cl_object encoding;
+  volatile cl_fixnum ret;
+  switch (sizeof(wchar_t)) {
+  case 1:
+    elttype = @'ext::byte8';
+    encoding = @':utf-8';
+    break;
+  case 2:
+    elttype = @'ext::byte16';
+    encoding = @':ucs-2';
+    break;
+  case 4:
+    elttype = @'ext::byte32';
+    encoding = @':ucs-4';
+    break;
+  default:
+    ecl_internal_error("Unexpected sizeof(wchar_t)");
+  }
+  ECL_HANDLER_CASE_BEGIN(ecl_process_env(), ecl_list1(@'ext::character-encoding-error')) {
+    cl_object output_vec = si_string_to_octets(5, input, @':external-format', encoding, @':element-type', elttype);
+    ret = output_vec->vector.fillp + 1;
+    if (ret <= output_len) {
+      memcpy(output, output_vec->vector.self.b8, (ret-1)*sizeof(wchar_t));
+      output[ret-1] = 0; /* null-terminator */
+    }
+  } ECL_HANDLER_CASE(1, c) {
+    input = c; /* suppress "unused variable `c`" warning */
+    ret = -1;
+  } ECL_HANDLER_CASE_END;
+  return ret;
+}
+#endif
--- a/src/configure
+++ b/src/configure
@ -10099,6 +10099,18 @@ $as_echo "#define ECL_UNICODE 21" >>confdefs.h
 $as_echo "#define ECL_UNICODE_NAMES 1" >>confdefs.h

   EXTRA_OBJS="$EXTRA_OBJS unicode/ucd_names_char.o unicode/ucd_names_codes.o unicode/ucd_names_pair.o unicode/ucd_names_str.o"
+	for ac_header in wchar.h
+do :
+  ac_fn_c_check_header_mongrel "$LINENO" "wchar.h" "ac_cv_header_wchar_h" "$ac_includes_default"
+if test "x$ac_cv_header_wchar_h" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_WCHAR_H 1
+_ACEOF
+
+fi
+
+done
+
 else
   CHAR_CODE_LIMIT=256
   ECL_CHARACTER="int"
--- a/src/configure.ac
+++ b/src/configure.ac
@ -908,6 +908,7 @@ if test "x${enable_unicode}" != "xno"; then
   fi
   AC_DEFINE([ECL_UNICODE_NAMES], [1], [Link in the database of Unicode names])
   EXTRA_OBJS="$EXTRA_OBJS unicode/ucd_names_char.o unicode/ucd_names_codes.o unicode/ucd_names_pair.o unicode/ucd_names_str.o"
+	AC_CHECK_HEADERS([wchar.h])
 else
   CHAR_CODE_LIMIT=256
   ECL_CHARACTER="int"
--- a/src/doc/manual/standards/strings.txi
+++ b/src/doc/manual/standards/strings.txi
@ -90,6 +90,33 @@ external format. The bounding index designators @var{start} and
@var{null-terminate} is true, add a terminating null byte. Signals an
@coderef{ext:character-encoding-error} if the encoding fails.
@end defun
+
+@cppdef ecl_decode_from_cstring
+@deftypefun cl_object ecl_decode_from_cstring (const char *string, cl_fixnum length, cl_object external_format)
+Decode a C string of the given length into a Lisp string using the
+specified external format. If @var{length} is -1, the length is
+determined by @code{strlen}. Returns @code{NULL} if the decoding fails.
+@end deftypefun
+
+@cppdef ecl_encode_to_cstring
+@deftypefun cl_fixnum ecl_encode_to_cstring (char *output, cl_fixnum output_length, cl_object input, cl_object external_format)
+Encode the Lisp string @var{input} into a C string of the given length
+using the specified external format. Returns the number of characters
+necessary to encode the Lisp string (including the null terminator). If
+this is larger than @var{output_length}, @var{output} is unchanged.
+Returns -1 if the encoding fails.
+@end deftypefun
+
+@cppdef ecl_decode_from_unicode_wstring
+@cppdef ecl_encode_to_unicode_wstring
+@deftypefun cl_object ecl_decode_from_unicode_wstring (const wchar_t *string, cl_fixnum length)
+@deftypefunx cl_fixnum ecl_encode_to_unicode_wstring (wchar_t *output, cl_fixnum output_length, cl_object input)
+These functions work the same as @coderef{ecl_decode_from_cstring},
+@coderef{ecl_encode_to_cstring}, except that the external format used is
+either utf-8, utf-16 or utf-32 depending on whether
+@code{sizeof(wchar_t)} is 1, 2, or 4 respectively.
+@end deftypefun
+
@subsubsection ANSI dictionary
 Common Lisp and C equivalence

--- a/src/ecl/configpre.h
+++ b/src/ecl/configpre.h
@ -593,6 +593,9 @@
 /* Define to 1 if you have the <vfork.h> header file. */
 #undef HAVE_VFORK_H

+/* Define to 1 if you have the <wchar.h> header file. */
+#undef HAVE_WCHAR_H
+
 /* Define to 1 if `fork' works. */
 #undef HAVE_WORKING_FORK

--- a/src/h/config.h.in
+++ b/src/h/config.h.in
@ -232,6 +232,9 @@ typedef unsigned char ecl_base_char;
 /* feenableexcept is available */
 #undef HAVE_FEENABLEEXCEPT

+/* wide-strings are available */
+#undef HAVE_WCHAR_H
+
 /*
 * C macros for inlining, denoting probable code paths and other stuff
 * that makes better code. Most of it is GCC specific.
--- a/src/h/external.h
+++ b/src/h/external.h
@ -1699,6 +1699,13 @@ extern ECL_API ecl_character ecl_char(cl_object s, cl_index i);
 extern ECL_API ecl_character ecl_char_set(cl_object s, cl_index i, ecl_character c);
 extern ECL_API cl_object si_octets_to_string _ECL_ARGS((cl_narg narg, cl_object input, ...));
 extern ECL_API cl_object si_string_to_octets _ECL_ARGS((cl_narg narg, cl_object input, ...));
+extern ECL_API cl_object ecl_decode_from_cstring(const char *s, cl_fixnum len, cl_object encoding);
+extern ECL_API cl_fixnum ecl_encode_to_cstring(char *output, cl_fixnum output_len, cl_object input, cl_object encoding);
+#ifdef HAVE_WCHAR_H
+#include <wchar.h>
+extern ECL_API cl_object ecl_decode_from_unicode_wstring(const wchar_t *s, cl_fixnum len);
+extern ECL_API cl_fixnum ecl_encode_to_unicode_wstring(wchar_t *output, cl_fixnum output_len, cl_object input);
+#endif

 /* structure.c */

--- a/src/tests/normal-tests/embedding.lsp
+++ b/src/tests/normal-tests/embedding.lsp
@ -302,3 +302,103 @@ int main(int narg, char **argv)
        return 0;
 }"))
     (test-C-program c-code))))
+
+;;; Date: 2021-08-13 (Marius Gerbershagen)
+;;; Description:
+;;;
+;;;     Verify that ecl_decode_from_cstring, ecl_encode_to_cstring and
+;;;     wide string equivalents work correctly
+;;;
+#+unicode
+(test emb.0005.decode/encode-cstrings
+  (is-true
+   (let* ((c-code "
+#include <ecl/ecl.h>
+
+int main(int argc, char** argv) {
+	cl_boot(argc, argv);
+
+   cl_object utf_8 = ecl_make_keyword(\"UTF-8\");
+
+	unsigned char invalid[3] = {0xff, 0xfe, 0};
+   if (ecl_decode_from_cstring(invalid, -1, utf_8) != NULL) {
+      return -1;
+   }
+
+   unsigned char x[9] = {240, 159, 145, 137, 240, 159, 145, 136, 0};
+   cl_object s = cl_make_string(1, ecl_make_fixnum(2));
+	ecl_char_set(s, 0, 128073);
+	ecl_char_set(s, 1, 128072);
+
+   if (!ecl_equal(s, ecl_decode_from_cstring(x, -1, utf_8))
+       || !ecl_equal(s, ecl_decode_from_cstring(x, 8, utf_8))) {
+      return -2;
+   }
+
+   unsigned char y[9];
+   if (ecl_encode_to_cstring(y, 9, s, utf_8) != 9) {
+      return -3;
+   }
+   for (int i = 0; i < 9; i++) {
+      if (x[i] != y[i]) {
+         return -4;
+      }
+   }
+
+   if (ecl_encode_to_cstring(y, 1, s, utf_8) != 9) {
+      return -5;
+   }
+
+   if (ecl_encode_to_cstring(y, 9, s, ecl_make_keyword(\"US-ASCII\")) != -1) {
+      return -6;
+   }
+
+#ifdef HAVE_WCHAR_H
+   if (sizeof(wchar_t) == 2) {
+      wchar_t u[5] = {55357, 64585, 55357, 64584, 0};
+      if (!ecl_equal(s, ecl_decode_from_unicode_wstring(u, -1))
+          || !ecl_equal(s, ecl_decode_from_unicode_wstring(u, 4))) {
+         return -7;
+      }
+
+      wchar_t v[5];
+      if (ecl_encode_to_unicode_wstring(v, 5, s) != 5) {
+         return -8;
+      }
+      for (int i = 0; i < 5; i++) {
+         if (u[i] != v[i]) {
+            return -9;
+         }
+      }
+      
+      if (ecl_encode_to_unicode_wstring(v, 1, s) != 5) {
+         return -10;
+      }
+   } else if (sizeof(wchar_t) == 4) {
+      wchar_t u[3] = {128073, 128072, 0};
+      if (!ecl_equal(s, ecl_decode_from_unicode_wstring(u, -1))
+          || !ecl_equal(s, ecl_decode_from_unicode_wstring(u, 2))) {
+         return -7;
+      }
+
+      wchar_t v[3];
+      if (ecl_encode_to_unicode_wstring(v, 3, s) != 3) {
+         return -8;
+      }
+      for (int i = 0; i < 3; i++) {
+         if (u[i] != v[i]) {
+            return -9;
+         }
+      }
+      
+      if (ecl_encode_to_unicode_wstring(v, 1, s) != 3) {
+         return -10;
+      }
+   }
+#endif
+
+	cl_shutdown();
+	return 0;
+}
+"))
+     (test-C-program c-code))))