diff --git a/src/CHANGELOG b/src/CHANGELOG index 248ed655f..bffae4dae 100644 --- a/src/CHANGELOG +++ b/src/CHANGELOG @@ -25,9 +25,9 @@ ECL 9.1.0: as a base-string. - ECL supports external formats. They may be a symbol, denoting the encoding - or an encoding option, or a list of symbols. Valid symbols are :DEFAULT, - :LATIN-1, :ISO-8859-1, :UTF-8, :UCS-{2,4}{,BE,LE} :CR, :LF and :CRLF. Default - option is :LF. + or an encoding option, an association table between bytes and unicode codes + or a list of these. Valid symbols are :DEFAULT, :LATIN-1, :ISO-8859-1, + :UTF-8, :UCS-{2,4}{,BE,LE} :CR, :LF and :CRLF. Default option is :LF. * Bugs fixed: diff --git a/src/c/file.d b/src/c/file.d index 75e433fef..77ed3173a 100644 --- a/src/c/file.d +++ b/src/c/file.d @@ -725,6 +725,56 @@ ucs_2_encoder(cl_object stream, unsigned char *buffer, int c) return 2 + ucs_2be_encoder(stream, buffer+2, c); } +/* + * UCS-2 BOM ENDIAN + */ + +static int +user_decoder(cl_object stream, cl_eformat_read_byte8 read_byte8, cl_object source) +{ + cl_object table = stream->stream.format_table; + cl_object character, byte; + unsigned char buffer[2]; + if (read_byte8(source, buffer, 1) < 1) { + return EOF; + } + byte = MAKE_FIXNUM(buffer); + character = ecl_gethash_safe(MAKE_FIXNUM(buffer[0]), table, Cnil); + if (Null(character)) { + return EOF-2; + } + if (character == Ct) { + if (read_byte8(source, buffer+1, 1) < 1) { + return EOF; + } + character = ecl_gethash_safe(MAKE_FIXNUM((buffer[0]<<8+buffer[1])), + table, Cnil); + if (Null(character)) { + return EOF-2; + } + } + return CHAR_CODE(character); +} + +static int +user_encoder(cl_object stream, unsigned char *buffer, int c) +{ + cl_object byte = ecl_gethash_safe(CODE_CHAR(c), stream->stream.format_table, Cnil); + if (Null(byte)) { + return 0; + } else { + cl_fixnum code = fix(byte); + if (c > 0xFF) { + buffer[1] = c & 8; c >>= 8; + buffer[0] = c; + return 2; + } else { + buffer[0] = c; + return 1; + } + } +} + /* * UTF-8 */ @@ -2519,15 +2569,14 @@ const struct ecl_file_ops input_file_ops = { static int -parse_external_format(cl_object strm, cl_object format) +parse_external_format(cl_object stream, cl_object format) { if (CONSP(format)) { int flags = 0; do { - flags |= parse_external_format(strm, ECL_CONS_CAR(format)); + flags |= parse_external_format(stream, ECL_CONS_CAR(format)); format = cl_cdr(format); } while (CONSP(format)); - printf("%d\n", flags); return flags; } if (FIXNUMP(format)) { @@ -2582,6 +2631,10 @@ parse_external_format(cl_object strm, cl_object format) /* Binary stream */ return 0; } + if (type_of(format) == t_hashtable) { + stream->stream.format_table = format; + return ECL_STREAM_USER_FORMAT; + } FEerror("Unknown external format: ~A", 1, format); return ECL_STREAM_DEFAULT_FORMAT; } @@ -2668,6 +2721,13 @@ set_stream_elt_type(cl_object stream, cl_fixnum byte_size, int flags, stream->stream.decoder = ucs_4le_decoder; break; #endif + case ECL_STREAM_USER_FORMAT: + IO_STREAM_ELT_TYPE(stream) = @'character'; + byte_size = 8; + stream->stream.format = stream->stream.format_table; + stream->stream.encoder = user_encoder; + stream->stream.decoder = user_decoder; + break; default: FEerror("Invalid external format code ~D with ~A", 1, MAKE_FIXNUM(flags), external_format); diff --git a/src/h/object.h b/src/h/object.h index dca9d6bf0..7720003f4 100644 --- a/src/h/object.h +++ b/src/h/object.h @@ -526,12 +526,12 @@ enum { ECL_STREAM_UCS_4 = 6, ECL_STREAM_UCS_4LE = 7, ECL_STREAM_UCS_4BE = 8, - ECL_STREAM_8BIT = 5, - ECL_STREAM_CR = 8, - ECL_STREAM_LF = 16, - ECL_STREAM_SIGNED_BYTES = 32, - ECL_STREAM_C_STREAM = 64, - ECL_STREAM_MIGHT_SEEK = 128 + ECL_STREAM_USER_FORMAT = 9, + ECL_STREAM_CR = 16, + ECL_STREAM_LF = 32, + ECL_STREAM_SIGNED_BYTES = 64, + ECL_STREAM_C_STREAM = 128, + ECL_STREAM_MIGHT_SEEK = 256 }; typedef int (*cl_eformat_encoder)(cl_object stream, unsigned char *buffer, int c); @@ -556,8 +556,7 @@ struct ecl_stream { cl_object format; /* external format */ cl_eformat_encoder encoder; cl_eformat_decoder decoder; - cl_object encoder_table; - cl_object decoder_table; + cl_object format_table; int flags; /* character table, flags, etc */ }; diff --git a/src/lsp/iolib.lsp b/src/lsp/iolib.lsp index 29b6be26f..a91257491 100644 --- a/src/lsp/iolib.lsp +++ b/src/lsp/iolib.lsp @@ -287,3 +287,24 @@ the one defined in the ANSI standard. *print-base* is 10, *print-array* is t, (print-unreadable-object-function ,object ,stream ,type ,identity #'.print-unreadable-object-body.)) `(print-unreadable-object-function ,object ,stream ,type ,identity nil))) + +(defun ext::load-encoding (name) + (let ((filename (make-pathname :name (symbol-name name) :defaults "SYS:encodings;"))) + (print name) + (unless (probe-file filename) + (error "Unable to find mapping file ~A for encoding ~A" filename name)) + (with-open-file (s filename :direction :input) + (read s)))) + +(defun ext::make-encoding (mapping) + (when (symbolp mapping) + (setf mapping (load-encoding mapping))) + (let ((output (make-hash-table :size 512 :test 'eq))) + (dolist (record mapping output) + (let* ((byte (car record)) + (unicode (cdr record)) + (unicode-char (code-char unicode))) + (when (> byte #xFF) + (setf (gethash (ash byte -8) output) t)) + (setf (gethash byte output) unicode-char) + (setf (gethash unicode-char output) byte)))))