From 55af7bae8568ec1f74fdceb46fc9d2bd2bca3887 Mon Sep 17 00:00:00 2001 From: Marius Gerbershagen Date: Sat, 27 Feb 2021 19:51:50 +0100 Subject: [PATCH] strings: add functions to encode/decode strings into byte sequences API copied from sbcl. --- src/c/string.d | 67 +++++++++++++++++++++++++++- src/c/symbols_list.h | 3 ++ src/doc/help.lsp | 20 +++++++++ src/doc/manual/standards/strings.txi | 23 ++++++++++ src/h/external.h | 2 + 5 files changed, 114 insertions(+), 1 deletion(-) diff --git a/src/c/string.d b/src/c/string.d index 07749c99f..a45ea0750 100644 --- a/src/c/string.d +++ b/src/c/string.d @@ -70,7 +70,7 @@ do_make_string(cl_index s, ecl_character code) @) /* - Make a string of a certain size, with some eading zeros to + Make a string of a certain size, with some leading zeros to keep C happy. The string must be adjustable, to allow further growth. (See unixfsys.c for its use). */ @@ -895,3 +895,68 @@ nstring_case(cl_narg narg, cl_object fun, ecl_casefun casefun, ecl_va_list ARGS) } @(return output); @) + +@(defun ext::octets-to-string (input &key + (external_format @':default') + (start ecl_make_fixnum(0)) + (end ECL_NIL)) + cl_object output; + cl_index input_size; + cl_object input_stream; + cl_index output_size; + cl_object ret; + @ + output = si_get_buffer_string(); + input_stream = si_make_sequence_input_stream(7, input, + @':external-format', external_format, + @':start', start, + @':end', end); + /* INV: MAKE-SEQUENCE-INPUT-STREAM checks types of start and end indices */ + input_size = (Null(end) ? ecl_length(input) : ecl_fixnum(end)) - ecl_fixnum(start); + output_size = 0; + do { + output->base_string.fillp = output->base_string.dim; + output_size += ecl_to_unsigned_integer(si_do_read_sequence(output, input_stream, + ecl_make_fixnum(output_size), + ecl_make_fixnum(output->base_string.dim))); + if (output_size < output->base_string.dim) { + break; + } + output = _ecl_funcall3(@'adjust-array', output, + ecl_make_fixnum(input_size > output_size + ? input_size + : output_size + 128)); + } while (1); + output->base_string.fillp = output_size; + if (ecl_fits_in_base_string(output)) { + ret = si_copy_to_simple_base_string(output); + } else { + ret = cl_copy_seq(output); + } + si_put_buffer_string(output); + @(return ret); +@) + +@(defun ext::string-to-octets (input &key + (external_format @':default') + (start ecl_make_fixnum(0)) + (end ECL_NIL) + (null_terminate ECL_NIL) + (element_type @'ext::byte8')) + cl_object output; + cl_object output_stream; + @ + output = si_make_vector(element_type, /* element-type */ + cl_length(input), /* length */ + ECL_T, /* adjustable */ + ecl_make_fixnum(0), /* fillp */ + ECL_NIL, /* displaced */ + ECL_NIL); /* displaced-offset */ + output_stream = si_make_sequence_output_stream(3, output, + @':external-format', external_format); + si_do_write_sequence(input, output_stream, start, end); + if (!Null(null_terminate)) { + ecl_write_char(0, output_stream); + } + @(return output); +@) diff --git a/src/c/symbols_list.h b/src/c/symbols_list.h index 360987d1c..f75b4b020 100755 --- a/src/c/symbols_list.h +++ b/src/c/symbols_list.h @@ -2183,6 +2183,9 @@ cl_symbols[] = { {EXT_ "STREAM-ENCODING-ERROR" ECL_FUN(NULL, NULL, -1) ECL_VAR(EXT_ORDINARY, OBJNULL)}, {EXT_ "DECODING-ERROR" ECL_FUN(NULL, NULL, 3) ECL_VAR(EXT_ORDINARY, OBJNULL)}, {EXT_ "ENCODING-ERROR" ECL_FUN(NULL, NULL, 3) ECL_VAR(EXT_ORDINARY, OBJNULL)}, +{EXT_ "OCTETS-TO-STRING" ECL_FUN("si_octets_to_string", si_octets_to_string, -2) ECL_VAR(EXT_ORDINARY, OBJNULL)}, +{EXT_ "STRING-TO-OCTETS" ECL_FUN("si_string_to_octets", si_string_to_octets, -2) ECL_VAR(EXT_ORDINARY, OBJNULL)}, +{KEY_ "NULL-TERMINATE" ECL_FUN(NULL, NULL, -1) ECL_VAR(KEYWORD, OBJNULL)}, /* #endif ECL_UNICODE */ {SYS_ "PROPER-LIST-P" ECL_FUN("si_proper_list_p", si_proper_list_p, 1) ECL_VAR(SI_ORDINARY, OBJNULL)}, diff --git a/src/doc/help.lsp b/src/doc/help.lsp index 39b0d0aa3..f2bf7b2fa 100644 --- a/src/doc/help.lsp +++ b/src/doc/help.lsp @@ -2301,6 +2301,15 @@ Evaluates FORMs in order from left to right. If any FORM evaluates to non- NIL, quits and returns that (single) value. If the last FORM is reached, returns whatever values it returns.") +(docfun ext::octets-to-string function (input &key + (external-format :default) + (start 0) + (end nil)) " +Decode a sequence of octets into a string according to the given +external format. The bounding index designators start and end optionally +denote a subsequence to be decoded. +") + (docfun output-stream-p function (stream) " Returns T if STREAM can handle output operations; NIL otherwise.") @@ -3014,6 +3023,17 @@ Similar to STRING>=, but ignores cases.") Returns a copy of STRING with the specified characters removed from the right end. CHAR-SPEC must be a sequence of characters.") +(docfun ext::string-to-octets function (input &key + (external-format :default) + (start 0) + (end nil) + (null-terminate nil)) " +Encode a string into a sequence of octets according to the given +external format. The bounding index designators start and end +optionally denote a subsequence to be encoded. If null-terminate is +true, add a terminating null byte. +") + (docfun si::string-to-object function (string) " ECL specific. Equivalent to (READ-FROM-STRING STRING), but is much faster.") diff --git a/src/doc/manual/standards/strings.txi b/src/doc/manual/standards/strings.txi index e19dd4b14..02c44a47f 100644 --- a/src/doc/manual/standards/strings.txi +++ b/src/doc/manual/standards/strings.txi @@ -67,6 +67,29 @@ The counterpart of the previous function is @coderef{ecl_char_set}, which implem Both functions check the type of their arguments and verify that the indices do not exceed the string boundaries. Otherwise they signal a @code{serious-condition}. @end deftypefun +@subsubsection Converting Unicode strings +Converting between different encodings. See @ref{Streams - External formats} for a list of supported encodings (external formats). + +@subsubheading Functions +@cppdef si_octets_to_string +@lspdef ext:octets-to-string +@defun ext:octets-to-string octets &key (external-format :default) (start 0) (end nil) +Decode a sequence of octets (i.e. 8-bit bytes) into a string according +to the given external format. @var{octets} must be a vector whose +elements have a size of 8-bit. The bounding index designators +@var{start} and @var{end} optionally denote a subsequence to be decoded. +Signals an @coderef{ext:character-decoding-error} if the decoding fails. +@end defun + +@cppdef si_string_to_octets +@lspdef ext:string-to-octets +@defun ext:string-to-octets string &key (external-format :default) (start 0) (end nil) (null-terminate nil) +Encode a string into a sequence of octets according to the given +external format. The bounding index designators @var{start} and +@var{end} optionally denote a subsequence to be encoded. If +@var{null-terminate} is true, add a terminating null byte. Signals an +@coderef{ext:character-encoding-error} if the encoding fails. +@end defun @subsubsection ANSI dictionary Common Lisp and C equivalence diff --git a/src/h/external.h b/src/h/external.h index a32ef1c63..efcf5fcc5 100755 --- a/src/h/external.h +++ b/src/h/external.h @@ -1697,6 +1697,8 @@ extern ECL_API bool ecl_member_char(ecl_character c, cl_object char_bag); extern ECL_API bool ecl_fits_in_base_string(cl_object s); extern ECL_API ecl_character ecl_char(cl_object s, cl_index i); extern ECL_API ecl_character ecl_char_set(cl_object s, cl_index i, ecl_character c); +extern ECL_API cl_object si_octets_to_string _ECL_ARGS((cl_narg narg, cl_object input, ...)); +extern ECL_API cl_object si_string_to_octets _ECL_ARGS((cl_narg narg, cl_object input, ...)); /* structure.c */