diff --git a/contrib/encodings/ISO-2022-JP b/contrib/encodings/ISO-2022-JP new file mode 100644 index 000000000..999ce3edc --- /dev/null +++ b/contrib/encodings/ISO-2022-JP @@ -0,0 +1,55 @@ +(cl:unless (cl:find-package :ISO-2022-JP) + (make-package :ISO-2022-JP)) +(in-package :ISO-2022-JP) + +(defun compose (bytes) + (loop for i in bytes + with cum = 0 + do (setf cum (+ (ash cum 8) i)) + finally (return cum))) + +(defun mapping-hash-table (sequence &optional (mask 0)) + (loop with hash = (make-hash-table :size (floor (* 1.5 (length sequence))) + :test 'eq) + for i from 0 below (length sequence) by 2 + for multibyte = (elt sequence i) + for codepoint = (elt sequence (1+ i)) + for unicode-char = (code-char codepoint) + when (zerop (logand multibyte mask)) + do (progn + (setf (gethash multibyte hash) unicode-char) + (setf (gethash unicode-char hash) multibyte) + (when (> multibyte #xFF) + (setf (gethash (ash multibyte -8) hash) t))) + finally (return hash))) + +(defun multimap (escapes tables) + (loop for seq in escapes + for table in tables + for table-cons = (member table tables :test 'eq) + do (progn + ;; Change escape sequence into byte codes + (setf seq (mapcar #'char-code seq)) + ;; Store it in the hash table + (setf (gethash t table) seq) + (loop for other-table in tables + do (loop for i from 1 + for precedings = (butlast seq i) + while precedings + do (setf (gethash (compose precedings) other-table) t) + finally (setf (gethash (compose seq) other-table) table-cons))))) + (nconc tables tables)) + +(defparameter ext::iso-2022-jp + (let* ((ascii-no-esc (mapping-hash-table (loop for i from 0 to 127 + unless (= i (char-code #\esc)) + nconc (list i i)))) + (jis208 (mapping-hash-table (ext::load-encoding :jisx0208) #x8080)) + (jis201 (mapping-hash-table (ext::load-encoding :jisx0201) #x80))) + (multimap '((#\Esc #\( #\B) + (#\Esc #\( #\J) + (#\Esc #\$ #\@) + (#\Esc #\$ #\B)) + (list ascii-no-esc jis201 jis208 jis208)))) + +(delete-package :ISO-2022-JP) diff --git a/contrib/encodings/ISO-2022-JP-1 b/contrib/encodings/ISO-2022-JP-1 new file mode 100644 index 000000000..aba8da88d --- /dev/null +++ b/contrib/encodings/ISO-2022-JP-1 @@ -0,0 +1,57 @@ +(cl:unless (cl:find-package :ISO-2022-JP-1) + (make-package :ISO-2022-JP-1)) +(in-package :ISO-2022-JP-1) + +(defun compose (bytes) + (loop for i in bytes + with cum = 0 + do (setf cum (+ (ash cum 8) i)) + finally (return cum))) + +(defun mapping-hash-table (sequence &optional (mask 0)) + (loop with hash = (make-hash-table :size (floor (* 1.5 (length sequence))) + :test 'eq) + for i from 0 below (length sequence) by 2 + for multibyte = (elt sequence i) + for codepoint = (elt sequence (1+ i)) + for unicode-char = (code-char codepoint) + when (zerop (logand multibyte mask)) + do (progn + (setf (gethash multibyte hash) unicode-char) + (setf (gethash unicode-char hash) multibyte) + (when (> multibyte #xFF) + (setf (gethash (ash multibyte -8) hash) t))) + finally (return hash))) + +(defun multimap (escapes tables) + (loop for seq in escapes + for table in tables + for table-cons = (member table tables :test 'eq) + do (progn + ;; Change escape sequence into byte codes + (setf seq (mapcar #'char-code seq)) + ;; Store it in the hash table + (setf (gethash t table) seq) + (loop for other-table in tables + do (loop for i from 1 + for precedings = (butlast seq i) + while precedings + do (setf (gethash (compose precedings) other-table) t) + finally (setf (gethash (compose seq) other-table) table-cons))))) + (nconc tables tables)) + +(defparameter ext::iso-2022-jp-1 + (let* ((ascii-no-esc (mapping-hash-table (loop for i from 0 to 127 + unless (= i (char-code #\esc)) + nconc (list i i)))) + (jis212 (mapping-hash-table (ext::load-encoding :jisx0212) #x8080)) + (jis208 (mapping-hash-table (ext::load-encoding :jisx0208) #x8080)) + (jis201 (mapping-hash-table (ext::load-encoding :jisx0201) #x80))) + (multimap '((#\Esc #\( #\B) + (#\Esc #\( #\J) + (#\Esc #\$ #\@) + (#\Esc #\$ #\B) + (#\Esc #\$ #\( #\D)) + (list ascii-no-esc jis201 jis208 jis208 jis212)))) + +(delete-package :ISO-2022-JP-1) diff --git a/contrib/encodings/JISX0201.BIN b/contrib/encodings/JISX0201.BIN new file mode 100644 index 000000000..c66e30fb3 Binary files /dev/null and b/contrib/encodings/JISX0201.BIN differ diff --git a/contrib/encodings/JISX0208.BIN b/contrib/encodings/JISX0208.BIN new file mode 100644 index 000000000..3b3a15370 Binary files /dev/null and b/contrib/encodings/JISX0208.BIN differ diff --git a/contrib/encodings/JISX0212.BIN b/contrib/encodings/JISX0212.BIN new file mode 100644 index 000000000..8b5cb4fda Binary files /dev/null and b/contrib/encodings/JISX0212.BIN differ diff --git a/contrib/encodings/SHIFT-JIS.BIN b/contrib/encodings/SHIFT-JIS.BIN new file mode 100644 index 000000000..9beafe5c9 Binary files /dev/null and b/contrib/encodings/SHIFT-JIS.BIN differ diff --git a/contrib/encodings/generate.lisp b/contrib/encodings/generate.lisp index b7a93d630..95ad3f035 100644 --- a/contrib/encodings/generate.lisp +++ b/contrib/encodings/generate.lisp @@ -1,6 +1,6 @@ ;;; -*- Mode: Lisp; Syntax: Common-Lisp; -*- ;;; -;;; Copyright (c) 2009, Giuseppe Attardi. +;;; Copyright (c) 2009, Juan Jose Garcia-Ripoll ;;; ;;; This program is free software; you can redistribute it and/or ;;; modify it under the terms of the GNU Library General Public @@ -9,130 +9,18 @@ ;;; ;;; See file '../Copyright' for full details. -(defconstant +sequence-type+ '(unsigned-byte 16)) - -(defun read-mapping (url) - (let ((command (format nil "curl \"~A\" | sed '/^#.*$/d;s,0x,#x,g;s,U+\\([0-9A-Fa-f]*\\),#x\\1,g;s,#UNDEFINED,NIL # UNDEFINED,g;/LEAD BYTE/d' | grep -v '' | sed 's,# .*$,,g;/#x.*/!d' > tmp.txt" url))) - (unless (zerop (si::system command)) - (error "Unable to retrieve file ~A" url))) - (let ((mapping '())) - (with-open-file (s "tmp.txt" :direction :input :external-format :utf-8) - (loop for line = (read-line s nil nil) - while line - do (with-input-from-string (aux line) - (let ((byte 0) - (unicode 0)) - (when (and (setf byte (read aux nil nil)) - (setf unicode (read aux nil nil))) - (unless (and (typep byte +sequence-type+) - (typep unicode +sequence-type+)) - (error "Sequence type ~A is unable to capture this encoding (codes ~X and ~X found)" - +sequence-type+ byte unicode)) - (setf mapping (list* unicode byte mapping))))))) - (unless mapping - (error "Error reading file ~A" url)) - (si::system "rm -f tmp.txt") - (print (reduce #'max mapping :initial-value 0)) - (make-array (length mapping) :element-type +sequence-type+ :initial-contents (nreverse mapping)))) - -(defun generate-mapping (name url output-file) - (let* ((mapping (read-mapping url))) - (format t "~&;;; Generating ~A~%;;; ~Tfrom ~A" output-file url) - (force-output t) - (if (pathname-type output-file) - (with-open-file (s output-file :direction :output :if-exists :supersede - :element-type +sequence-type+ :external-format :big-endian) - (write-byte (length mapping) s) - (write-sequence mapping s)) - (with-open-file (s output-file :direction :output :if-exists :supersede) - (print mapping s))))) - -(defconstant +all-mappings+ - '(("ATARIST" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/ATARIST.TXT") - - ("ISO-8859-1" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT") - ("ISO-8859-2" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT") - ("ISO-8859-3" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-3.TXT") - ("ISO-8859-4" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-4.TXT") - ("ISO-8859-5" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-5.TXT") - ("ISO-8859-6" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-6.TXT") - ("ISO-8859-7" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT") - ("ISO-8859-8" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-8.TXT") - ("ISO-8859-9" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-9.TXT") - ("ISO-8859-10" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-10.TXT") - ("ISO-8859-11" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-11.TXT") - ("ISO-8859-13" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-13.TXT") - ("ISO-8859-14" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-14.TXT") - ("ISO-8859-15" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-15.TXT") - ("ISO-8859-16" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-16.TXT") - ("KOI8-R" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-R.TXT") - ("KOI8-U" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-U.TXT") - ("CP-856" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/CP856.TXT") - ("CP-856" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/CP856.TXT") - - ("DOS-CP437" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP437.TXT") - ("DOS-CP737" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP737.TXT") - ("DOS-CP775" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP775.TXT") - ("DOS-CP850" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP850.TXT") - ("DOS-CP852" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP852.TXT") - ("DOS-CP855" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP855.TXT") - ("DOS-CP857" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP857.TXT") - ("DOS-CP860" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP860.TXT") - ("DOS-CP861" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP861.TXT") - ("DOS-CP862" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP862.TXT") - ("DOS-CP863" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP863.TXT") - ("DOS-CP864" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP864.TXT") - ("DOS-CP865" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP865.TXT") - ("DOS-CP866" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT") - ("DOS-CP869" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP869.TXT") - ("DOS-CP874" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP874.TXT") - - ; Redundant WINDOWS-CP874 DOS-CP874 - ;("WINDOWS-CP874" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP874.TXT") - - ("WINDOWS-CP932" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT" "BIN") - ("WINDOWS-CP936" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT" "BIN") - ("WINDOWS-CP949" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT" "BIN") - ("WINDOWS-CP950" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT" "BIN") - - ("WINDOWS-CP1250" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT") - ("WINDOWS-CP1251" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT") - ("WINDOWS-CP1252" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT") - ("WINDOWS-CP1253" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1253.TXT") - ("WINDOWS-CP1254" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1254.TXT") - ("WINDOWS-CP1255" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1255.TXT") - ("WINDOWS-CP1256" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1256.TXT") - ("WINDOWS-CP1257" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1257.TXT") - ("WINDOWS-CP1258" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1258.TXT") - - ;("JISX0201" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0201.TXT") - ;("JISX0212" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT") - ;("SHIFT-JIS" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT") - - ;Unable to parse because they output more than one Unicode character - ;("SJIS-0213" "http://x0213.org/codetable/sjis-0213-2004-std.txt") - ;("EUC-JISX0213" "http://x0213.org/codetable/euc-jis") - )) - -(defun copy-file (in out) - (let ((buffer (make-array 8192 :element-type '(unsigned-byte 8)))) - (format t "~%;;; Copying ~A to ~A" in out) - (with-open-file (sin in :direction :input :element-type '(unsigned-byte 8)) - (with-open-file (sout out :direction :output :element-type '(unsigned-byte 8) - :if-exists :supersede :if-does-not-exist :create) - (loop for nbytes = (read-sequence buffer sin) - until (zerop nbytes) - do (write-sequence buffer sout :end nbytes)))))) +(load (merge-pathnames "tools" *load-pathname*)) (loop for entry in +all-mappings+ for name = (first entry) - for url = (second entry) - for type = (or (third entry) "BIN") - for orig = (make-pathname :name name :type type :defaults "ext:encodings;") - for copy = (ensure-directories-exist (make-pathname :name name :type type :defaults "build:encodings;")) + for orig = (make-pathname :name name :type "BIN" :defaults "ext:encodings;") + for copy = (merge-pathnames "build:encodings;" orig) do (progn (unless (probe-file orig) - (generate-mapping name url orig)) + (let ((mapping (if (equalp name "JISX0208") + (mapcar #'rest (read-mapping name 3)) + (read-mapping name)))) + (dump-mapping-array mapping orig))) (copy-file orig copy))) (defconstant +aliases+ @@ -197,4 +85,7 @@ do (with-open-file (out filename :direction :output :if-exists :supersede :if-does-not-exist :create :element-type 'base-char) (format t "~%;;; Creating alias ~A -> ~A, ~A" alias name filename) - (princ name out)))) + (format out "(defparameter ext::~A 'ext::~A)" alias name)))) + +(copy-file "ext:encodings;ISO-2022-JP" "build:encodings;ISO-2022-JP") +(copy-file "ext:encodings;ISO-2022-JP-1" "build:encodings;ISO-2022-JP-1") diff --git a/contrib/encodings/tools.lisp b/contrib/encodings/tools.lisp new file mode 100644 index 000000000..0b13a391d --- /dev/null +++ b/contrib/encodings/tools.lisp @@ -0,0 +1,194 @@ +;;; -*- Mode: Lisp; Syntax: Common-Lisp; -*- +;;; +;;; Copyright (c) 2009, Juan Jose Garcia-Ripoll +;;; +;;; This program is free software; you can redistribute it and/or +;;; modify it under the terms of the GNU Library General Public +;;; License as published by the Free Software Foundation; either +;;; version 2 of the License, or (at your option) any later version. +;;; +;;; See file '../Copyright' for full details. + +(defconstant +sequence-type+ '(unsigned-byte 16)) + +(defconstant +source-pathname+ + (make-pathname :name nil :type nil + :defaults (merge-pathnames "ext:;sources;" *load-pathname*))) + +(defconstant +all-mappings+ + '(("ATARIST" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/ATARIST.TXT") + + ("ISO-8859-1" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT") + ("ISO-8859-2" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT") + ("ISO-8859-3" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-3.TXT") + ("ISO-8859-4" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-4.TXT") + ("ISO-8859-5" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-5.TXT") + ("ISO-8859-6" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-6.TXT") + ("ISO-8859-7" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT") + ("ISO-8859-8" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-8.TXT") + ("ISO-8859-9" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-9.TXT") + ("ISO-8859-10" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-10.TXT") + ("ISO-8859-11" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-11.TXT") + ("ISO-8859-13" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-13.TXT") + ("ISO-8859-14" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-14.TXT") + ("ISO-8859-15" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-15.TXT") + ("ISO-8859-16" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-16.TXT") + ("KOI8-R" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-R.TXT") + ("KOI8-U" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-U.TXT") + ("CP-856" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/CP856.TXT") + ("CP-856" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/CP856.TXT") + + ("DOS-CP437" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP437.TXT") + ("DOS-CP737" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP737.TXT") + ("DOS-CP775" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP775.TXT") + ("DOS-CP850" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP850.TXT") + ("DOS-CP852" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP852.TXT") + ("DOS-CP855" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP855.TXT") + ("DOS-CP857" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP857.TXT") + ("DOS-CP860" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP860.TXT") + ("DOS-CP861" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP861.TXT") + ("DOS-CP862" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP862.TXT") + ("DOS-CP863" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP863.TXT") + ("DOS-CP864" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP864.TXT") + ("DOS-CP865" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP865.TXT") + ("DOS-CP866" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT") + ("DOS-CP869" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP869.TXT") + ("DOS-CP874" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP874.TXT") + + ; Redundant WINDOWS-CP874 DOS-CP874 + ;("WINDOWS-CP874" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP874.TXT") + + ("WINDOWS-CP932" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT") + ("WINDOWS-CP936" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT") + ("WINDOWS-CP949" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT") + ("WINDOWS-CP950" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT") + + ("WINDOWS-CP1250" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT") + ("WINDOWS-CP1251" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT") + ("WINDOWS-CP1252" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT") + ("WINDOWS-CP1253" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1253.TXT") + ("WINDOWS-CP1254" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1254.TXT") + ("WINDOWS-CP1255" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1255.TXT") + ("WINDOWS-CP1256" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1256.TXT") + ("WINDOWS-CP1257" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1257.TXT") + ("WINDOWS-CP1258" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1258.TXT") + + ("JISX0201" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0201.TXT") + ("JISX0208" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT" + ;; Fixes compatible with libiconv: we replace a reverse solidus with a + ;; fullwidth reverse solidus, so that JISX0208 does not contain characters + ;; in the ASCII range (Needed by ISO-2022-JP-1) + ((#x815F #x2140 #xff3c))) + + ("JISX0212" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT" + ;; Fixes compatible with libiconv: we replace a tilde with a + ;; fullwidth tilde, so that JISX0212 does not contain characters + ;; in the ASCII range (Needed by ISO-2022-JP-1) + ((#x2237 #xff5e))) + + ("SHIFT-JIS" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT") + + ;Unable to parse because they output more than one Unicode character + ;("SJIS-0213" "http://x0213.org/codetable/sjis-0213-2004-std.txt") + ;("EUC-JISX0213" "http://x0213.org/codetable/euc-jis") + )) + +(defun download (filename url) + (unless (probe-file filename) + (let ((command (format nil "curl \"~A\" > ~A" url filename))) + (unless (zerop (si::system command)) + (error "Unable to retrieve file ~A" url))))) + +(defun reformat (line) + (loop with l = (length line) + for i from 0 below l + for c = (char line i) + do (cond ((eql c #\#) + (return (if (zerop i) "" (subseq line 0 (1- i))))) + ((not (standard-char-p c)) + (setf (char line i) #\space)) + ((and (eql c #\0) + (let ((j (1+ i))) + (and (< j l) (member (char line j) '(#\x #\X))))) + (setf (char line i) #\#))) + finally (return line))) + +(defun read-mapping (name &optional (n 2)) + (let* ((source-file (make-pathname :name name :defaults +source-pathname+)) + (record (find name +all-mappings+ :key #'first :test #'equalp)) + (fixes (third record)) + (source-url (fourth record))) + (unless (probe-file source-file) + (unless source-url + (error "Unknown encoding ~A" name)) + (download file source-url)) + (with-open-file (in source-file :direction :input) + (loop with output = '() + for line = (reformat (read-line in nil nil)) + while line + unless (zerop (length line)) + do (with-input-from-string (aux line) + (let ((byte-list (loop for byte = (read aux nil nil) + while byte + collect byte))) + (unless (/= (length byte-list) n) + (loop for i in fixes + when (= (first i) (first byte-list)) + do (progn (setf byte-list i) (return))) + (push byte-list output)))) + finally (return (nreverse output)))))) + +(defun mapping-hash-table (mapping) + (loop with hash = (make-hash-table :size (floor (* 1.5 (length mapping))) + :test 'eq) + for (multibyte codepoint) in mapping + for unicode-char = (code-char codepoint) + do (progn + (setf (gethash multibyte hash) unicode-char) + (setf (gethash unicode-char hash) multibyte) + (when (> multibyte #xFF) + (setf (gethash (ash multibyte -8) hash) t))) + finally (return hash))) + +(defun dump-mapping-array (mapping-assoc output-file) + (let* ((mapping-list (reduce #'nconc mapping-assoc)) + (mapping-array (make-array (length mapping-list) :element-type +sequence-type+ + :initial-contents mapping-list))) + (format t "~%;;; Generating ~A" output-file) + (force-output t) + (with-open-file (s output-file :direction :output :if-exists :supersede + :element-type +sequence-type+ :external-format :big-endian) + (write-byte (length mapping-array) s) + (write-sequence mapping-array s)))) + +(defun copy-file (in out) + (let ((buffer (make-array 8192 :element-type '(unsigned-byte 8)))) + (format t "~%;;; Copying ~A to ~A" in out) + (with-open-file (sin in :direction :input :element-type '(unsigned-byte 8)) + (with-open-file (sout out :direction :output :element-type '(unsigned-byte 8) + :if-exists :supersede :if-does-not-exist :create) + (loop for nbytes = (read-sequence buffer sin) + until (zerop nbytes) + do (write-sequence buffer sout :end nbytes)))))) + +(defun all-valid-unicode-chars (mapping) + (if (consp mapping) + (loop for sublist on mapping + for i from 0 below 10 + until (and (eq sublist mapping) (plusp i)) + collect (all-valid-unicode-chars (first sublist))) + (concatenate 'string (loop for key being the hash-key in mapping + when (characterp key) + collect key)))) + +(defun compare-hashes (h1 h2) + (flet ((h1-in-h2 (h1 h2) + (loop for k being the hash-key in h1 using (hash-value v) + for v2 = (gethash k h2 nil) + unless (or (consp v2) (consp v) (equal v v2)) + do (progn (print (list h1 k v h2 k v2)) + (error) + (return nil)) + finally (return t)))) + (and (h1-in-h2 h1 h2) + (h1-in-h2 h2 h1)))) diff --git a/src/c/file.d b/src/c/file.d index 3acfa31da..023067eb1 100644 --- a/src/c/file.d +++ b/src/c/file.d @@ -60,6 +60,12 @@ #define ecl_ftello ftello #endif +/* Maximum number of bytes required to encode a character. + * This currently corresponds to (4 + 2) for the ISO-2022-JP-* encodings + * with 4 being the charset prefix, 2 for the character. + */ +#define ENCODING_BUFFER_MAX_SIZE 6 + static cl_index ecl_read_byte8(cl_object stream, unsigned char *c, cl_index n); static cl_index ecl_write_byte8(cl_object stream, unsigned char *c, cl_index n); @@ -565,7 +571,7 @@ eformat_unread_char(cl_object strm, int c) } { cl_object l = Cnil; - unsigned char buffer[10]; + unsigned char buffer[2*ENCODING_BUFFER_MAX_SIZE]; int ndx = 0; cl_fixnum i = strm->stream.last_code[0]; if (i != EOF) { @@ -598,7 +604,7 @@ eformat_read_char(cl_object strm) static int eformat_write_char(cl_object strm, int c) { - unsigned char buffer[4]; + unsigned char buffer[ENCODING_BUFFER_MAX_SIZE]; int nbytes = strm->stream.encoder(strm, buffer, c); if (nbytes == 0) { character_size_overflow(strm, c); @@ -926,7 +932,7 @@ ucs_2_encoder(cl_object stream, unsigned char *buffer, int c) } /* - * USER DEFINED ENCODINGS + * USER DEFINED ENCODINGS. SIMPLE CASE. */ static int @@ -975,6 +981,84 @@ user_encoder(cl_object stream, unsigned char *buffer, int c) } } +/* + * USER DEFINED ENCODINGS. SIMPLE CASE. + */ + +static int +user_multistate_decoder(cl_object stream, cl_eformat_read_byte8 read_byte8, + cl_object source) +{ + cl_object table_list = stream->stream.format_table; + cl_object table = ECL_CONS_CAR(table_list); + cl_object character; + cl_fixnum i, j; + unsigned char buffer[ENCODING_BUFFER_MAX_SIZE]; + for (i = j = 0; i < ENCODING_BUFFER_MAX_SIZE; i++) { + if (read_byte8(source, buffer+i, 1) < 1) { + return EOF; + } + j = (j << 8) | buffer[i]; + character = ecl_gethash_safe(MAKE_FIXNUM(j), table, Cnil); + if (CHARACTERP(character)) { + return CHAR_CODE(character); + } + if (Null(character)) { + invalid_codepoint(stream, buffer[0]); + } + if (character == Ct) { + /* Need more characters */ + continue; + } + if (CONSP(character)) { + /* Changed the state. */ + stream->stream.format_table = table_list = character; + table = ECL_CONS_CAR(table_list); + i = j = 0; + continue; + } + break; + } + FEerror("Internal error in decoder table.", 0); +} + +static int +user_multistate_encoder(cl_object stream, unsigned char *buffer, int c) +{ + cl_object table_list = stream->stream.format_table; + cl_object p = table_list; + do { + cl_object table = ECL_CONS_CAR(p); + cl_object byte = ecl_gethash_safe(CODE_CHAR(c), table, Cnil); + if (!Null(byte)) { + cl_fixnum code = fix(byte); + int n = 0; + if (p != table_list) { + /* Must output a escape sequence */ + cl_object x = ecl_gethash_safe(Ct, table, Cnil); + while (!Null(x)) { + buffer[0] = fix(ECL_CONS_CAR(x)); + buffer++; + x = ECL_CONS_CDR(x); + n++; + } + stream->stream.format_table = p; + } + if (code > 0xFF) { + buffer[1] = code & 0xFF; code >>= 8; + buffer[0] = code; + return n+2; + } else { + buffer[0] = code; + return n+1; + } + } + p = ECL_CONS_CDR(p); + } while (p != table_list); + /* Exhausted all lists */ + return 0; +} + /* * UTF-8 */ @@ -2880,7 +2964,7 @@ set_stream_elt_type(cl_object stream, cl_fixnum byte_size, int flags, switch (flags & ECL_STREAM_FORMAT) { case ECL_STREAM_BINARY: IO_STREAM_ELT_TYPE(stream) = cl_list(2, t, MAKE_FIXNUM(byte_size)); - stream->stream.format = Cnil; + stream->stream.format = t; stream->stream.ops->read_char = not_character_read_char; stream->stream.ops->write_char = not_character_write_char; break; @@ -2944,8 +3028,13 @@ set_stream_elt_type(cl_object stream, cl_fixnum byte_size, int flags, IO_STREAM_ELT_TYPE(stream) = @'character'; byte_size = 8; stream->stream.format = stream->stream.format_table; - stream->stream.encoder = user_encoder; - stream->stream.decoder = user_decoder; + if (CONSP(stream->stream.format)) { + stream->stream.encoder = user_multistate_encoder; + stream->stream.decoder = user_multistate_decoder; + } else { + stream->stream.encoder = user_encoder; + stream->stream.decoder = user_decoder; + } break; case ECL_STREAM_US_ASCII: IO_STREAM_ELT_TYPE(stream) = @'base-char'; @@ -2967,20 +3056,21 @@ set_stream_elt_type(cl_object stream, cl_fixnum byte_size, int flags, FEerror("Invalid or unsupported external format ~A with code ~D", 2, external_format, MAKE_FIXNUM(flags)); } + t = @':LF'; if (stream->stream.ops->write_char == eformat_write_char && (flags & ECL_STREAM_CR)) { cl_object key; if (flags & ECL_STREAM_LF) { stream->stream.ops->read_char = eformat_read_char_crlf; stream->stream.ops->write_char = eformat_write_char_crlf; - key = @':CRLF'; + t = @':CRLF'; } else { stream->stream.ops->read_char = eformat_read_char_cr; stream->stream.ops->write_char = eformat_write_char_cr; - key = @':CR'; + t = @':CR'; } - stream->stream.format = cl_list(2, key, stream->stream.format); } + stream->stream.format = cl_list(2, stream->stream.format, t); { cl_object (*read_byte)(cl_object); void (*write_byte)(cl_object,cl_object); diff --git a/src/lsp/iolib.lsp b/src/lsp/iolib.lsp index 4b6d0f684..f0c62520c 100644 --- a/src/lsp/iolib.lsp +++ b/src/lsp/iolib.lsp @@ -291,8 +291,8 @@ the one defined in the ANSI standard. *print-base* is 10, *print-array* is t, (defun ext:load-encoding (name) (let ((filename (make-pathname :name (symbol-name name) :defaults "sys:encodings;"))) (cond ((probe-file filename) - (with-open-file (s filename :direction :input) - (read s))) + (load filename :verbose nil) + name) ((probe-file (setf filename (make-pathname :type "BIN" :defaults filename))) (with-open-file (in filename :element-type '(unsigned-byte 16) :external-format :big-endian)