mirror of
https://gitlab.com/embeddable-common-lisp/ecl.git
synced 2025-12-06 02:40:26 -08:00
202 lines
10 KiB
Common Lisp
202 lines
10 KiB
Common Lisp
;;; -*- Mode: Lisp; Syntax: Common-Lisp; -*-
|
|
;;;
|
|
;;; Copyright (c) 2009, Juan Jose Garcia-Ripoll
|
|
;;;
|
|
;;; This program is free software; you can redistribute it and/or
|
|
;;; modify it under the terms of the GNU Library General Public
|
|
;;; License as published by the Free Software Foundation; either
|
|
;;; version 2 of the License, or (at your option) any later version.
|
|
;;;
|
|
;;; See file '../Copyright' for full details.
|
|
|
|
(defconstant +sequence-type+ '(unsigned-byte 16))
|
|
|
|
(defconstant +source-pathname+
|
|
(make-pathname :name nil :type nil
|
|
:directory (append (pathname-directory *load-pathname*)
|
|
(list "sources"))
|
|
:host (pathname-host *load-pathname*)
|
|
:device (pathname-device *load-pathname*)))
|
|
|
|
(defconstant +all-mappings+
|
|
'(("ATARIST" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/ATARIST.TXT")
|
|
|
|
("ISO-8859-1" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT")
|
|
("ISO-8859-2" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT")
|
|
("ISO-8859-3" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-3.TXT")
|
|
("ISO-8859-4" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-4.TXT")
|
|
("ISO-8859-5" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-5.TXT")
|
|
("ISO-8859-6" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-6.TXT")
|
|
("ISO-8859-7" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT")
|
|
("ISO-8859-8" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-8.TXT")
|
|
("ISO-8859-9" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-9.TXT")
|
|
("ISO-8859-10" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-10.TXT")
|
|
("ISO-8859-11" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-11.TXT")
|
|
("ISO-8859-13" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-13.TXT")
|
|
("ISO-8859-14" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-14.TXT")
|
|
("ISO-8859-15" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-15.TXT")
|
|
("ISO-8859-16" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-16.TXT")
|
|
("KOI8-R" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-R.TXT")
|
|
("KOI8-U" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-U.TXT")
|
|
("CP-856" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/CP856.TXT")
|
|
("CP-856" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/CP856.TXT")
|
|
|
|
("DOS-CP437" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP437.TXT")
|
|
("DOS-CP737" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP737.TXT")
|
|
("DOS-CP775" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP775.TXT")
|
|
("DOS-CP850" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP850.TXT")
|
|
("DOS-CP852" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP852.TXT")
|
|
("DOS-CP855" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP855.TXT")
|
|
("DOS-CP857" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP857.TXT")
|
|
("DOS-CP860" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP860.TXT")
|
|
("DOS-CP861" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP861.TXT")
|
|
("DOS-CP862" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP862.TXT")
|
|
("DOS-CP863" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP863.TXT")
|
|
("DOS-CP864" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP864.TXT")
|
|
("DOS-CP865" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP865.TXT")
|
|
("DOS-CP866" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT")
|
|
("DOS-CP869" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP869.TXT")
|
|
("DOS-CP874" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP874.TXT")
|
|
|
|
; Redundant WINDOWS-CP874 DOS-CP874
|
|
;("WINDOWS-CP874" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP874.TXT")
|
|
|
|
("WINDOWS-CP932" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT")
|
|
("WINDOWS-CP936" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT")
|
|
("WINDOWS-CP949" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT")
|
|
("WINDOWS-CP950" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT")
|
|
|
|
("WINDOWS-CP1250" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT")
|
|
("WINDOWS-CP1251" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT")
|
|
("WINDOWS-CP1252" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT")
|
|
("WINDOWS-CP1253" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1253.TXT")
|
|
("WINDOWS-CP1254" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1254.TXT")
|
|
("WINDOWS-CP1255" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1255.TXT")
|
|
("WINDOWS-CP1256" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1256.TXT")
|
|
("WINDOWS-CP1257" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1257.TXT")
|
|
("WINDOWS-CP1258" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1258.TXT")
|
|
|
|
("JISX0201" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0201.TXT")
|
|
("JISX0208" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT"
|
|
;; Fixes compatible with libiconv: we replace a reverse solidus with a
|
|
;; fullwidth reverse solidus, so that JISX0208 does not contain characters
|
|
;; in the ASCII range (Needed by ISO-2022-JP-1)
|
|
((#x815F #x2140 #xff3c)))
|
|
|
|
("JISX0212" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT"
|
|
;; Fixes compatible with libiconv: we replace a tilde with a
|
|
;; fullwidth tilde, so that JISX0212 does not contain characters
|
|
;; in the ASCII range (Needed by ISO-2022-JP-1)
|
|
((#x2237 #xff5e)))
|
|
|
|
("SHIFT-JIS" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT")
|
|
|
|
;Unable to parse because they output more than one Unicode character
|
|
;("SJIS-0213" "http://x0213.org/codetable/sjis-0213-2004-std.txt")
|
|
;("EUC-JISX0213" "http://x0213.org/codetable/euc-jis")
|
|
))
|
|
|
|
(defun download (filename url)
|
|
(unless (probe-file filename)
|
|
(let ((command (format nil "curl \"~A\" > ~A" url filename)))
|
|
(unless (zerop (si::system command))
|
|
(error "Unable to retrieve file ~A" url)))))
|
|
|
|
(defun reformat (line)
|
|
(loop with l = (length line)
|
|
for i from 0 below l
|
|
for c = (char line i)
|
|
do (cond ((eql c #\#)
|
|
(return (if (zerop i) "" (subseq line 0 (1- i)))))
|
|
((not (standard-char-p c))
|
|
(setf (char line i) #\space))
|
|
((and (eql c #\0)
|
|
(let ((j (1+ i)))
|
|
(and (< j l) (member (char line j) '(#\x #\X)))))
|
|
(setf (char line i) #\#)))
|
|
finally (return line)))
|
|
|
|
(defun read-mapping (name &optional (n 2))
|
|
(let* ((source-file (make-pathname :name name :defaults +source-pathname+))
|
|
(record (find name +all-mappings+ :key #'first :test #'equalp))
|
|
(fixes (third record))
|
|
(source-url (fourth record)))
|
|
(unless (probe-file source-file)
|
|
(unless source-url
|
|
(error "Unknown encoding ~A" name))
|
|
(download file source-url))
|
|
(with-open-file (in source-file :direction :input)
|
|
(loop with output = '()
|
|
for line = (reformat (read-line in nil nil))
|
|
while line
|
|
unless (zerop (length line))
|
|
do (with-input-from-string (aux line)
|
|
(let ((byte-list (loop for byte = (read aux nil nil)
|
|
while byte
|
|
collect byte)))
|
|
(unless (/= (length byte-list) n)
|
|
(loop for i in fixes
|
|
when (= (first i) (first byte-list))
|
|
do (progn (setf byte-list i) (return)))
|
|
(push byte-list output))))
|
|
finally (return (nreverse output))))))
|
|
|
|
(defun mapping-hash-table (mapping)
|
|
(loop with hash = (make-hash-table :size (floor (* 1.5 (length mapping)))
|
|
:test 'eq)
|
|
for (multibyte codepoint) in mapping
|
|
for unicode-char = (code-char codepoint)
|
|
do (progn
|
|
(setf (gethash multibyte hash) unicode-char)
|
|
(setf (gethash unicode-char hash) multibyte)
|
|
(when (> multibyte #xFF)
|
|
(setf (gethash (ash multibyte -8) hash) t)))
|
|
finally (return hash)))
|
|
|
|
(defun dump-mapping-array (mapping-assoc output-file)
|
|
(let* ((mapping-list (reduce #'nconc mapping-assoc))
|
|
(mapping-array (make-array (length mapping-list) :element-type +sequence-type+
|
|
:initial-contents mapping-list)))
|
|
(format t "~%;;; Generating ~A" output-file)
|
|
(force-output t)
|
|
(with-open-file (s output-file :direction :output :if-exists :supersede
|
|
:element-type +sequence-type+ :external-format :big-endian)
|
|
(write-byte (length mapping-array) s)
|
|
(write-sequence mapping-array s))))
|
|
|
|
(defun copy-encoding-file (in out)
|
|
(let ((buffer (make-array 8192 :element-type '(unsigned-byte 8))))
|
|
(format t "~%;;; Copying ~A to ~A" in out)
|
|
(with-open-file (sin in :direction :input :element-type '(unsigned-byte 8))
|
|
(with-open-file (sout out :direction :output :element-type '(unsigned-byte 8)
|
|
:if-exists :supersede :if-does-not-exist :create)
|
|
(loop for nbytes = (read-sequence buffer sin)
|
|
until (zerop nbytes)
|
|
do (write-sequence buffer sout :end nbytes))))))
|
|
|
|
(defun all-valid-unicode-chars (mapping)
|
|
(cond ((consp mapping)
|
|
(loop for sublist on mapping
|
|
for i from 0 below 10
|
|
until (and (eq sublist mapping) (plusp i))
|
|
collect (all-valid-unicode-chars (first sublist))))
|
|
((hash-table-p mapping)
|
|
(concatenate 'string (loop for key being the hash-key in mapping
|
|
when (characterp key)
|
|
collect key)))
|
|
((eq mapping :iso-8859-1)
|
|
(coerce 'string (loop for i from 0 to 255 collect (code-char i))))
|
|
(t
|
|
(error "Unknown encoding"))))
|
|
|
|
(defun compare-hashes (h1 h2)
|
|
(flet ((h1-in-h2 (h1 h2)
|
|
(loop for k being the hash-key in h1 using (hash-value v)
|
|
for v2 = (gethash k h2 nil)
|
|
unless (or (consp v2) (consp v) (equal v v2))
|
|
do (progn (print (list h1 k v h2 k v2))
|
|
(error)
|
|
(return nil))
|
|
finally (return t))))
|
|
(and (h1-in-h2 h1 h2)
|
|
(h1-in-h2 h2 h1))))
|