mirror of
https://gitlab.com/embeddable-common-lisp/ecl.git
synced 2026-01-13 21:02:47 -08:00
412 lines
19 KiB
Common Lisp
412 lines
19 KiB
Common Lisp
;;; Common
|
|
|
|
(defparameter *extension-directory*
|
|
(make-pathname :directory (pathname-directory *load-truename*)))
|
|
|
|
(defparameter *page-size-exponent* 8)
|
|
|
|
(defun cp-high (cp)
|
|
(ash cp (- *page-size-exponent*)))
|
|
|
|
(defun cp-low (cp)
|
|
(ldb (byte *page-size-exponent* 0) cp))
|
|
|
|
;;; Generator
|
|
|
|
(defstruct ucd misc transform)
|
|
|
|
(defparameter *unicode-character-database*
|
|
(make-pathname :directory (pathname-directory *load-truename*)))
|
|
|
|
(defparameter *ucd-base* nil)
|
|
(defparameter *unicode-names* (make-hash-table))
|
|
|
|
(defparameter *last-uppercase* nil)
|
|
(defparameter *uppercase-transition-count* 0)
|
|
(defparameter *different-titlecases* nil)
|
|
(defparameter *different-numerics* nil)
|
|
(defparameter *name-size* 0)
|
|
(defparameter *misc-hash* (make-hash-table :test #'equal))
|
|
(defparameter *misc-index* -1)
|
|
(defparameter *misc-table* nil)
|
|
(defparameter *misc-mapping* nil)
|
|
(defparameter *both-cases* nil)
|
|
(defparameter *decompositions* nil)
|
|
(defparameter *decomposition-length-max* nil)
|
|
(defparameter *decomposition-types* nil)
|
|
(defparameter *decomposition-base* nil)
|
|
|
|
(defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
|
|
bidi-mirrored cl-both-case-p)
|
|
(let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
|
|
bidi-mirrored cl-both-case-p))
|
|
(index (gethash list *misc-hash*)))
|
|
(or index
|
|
(progn
|
|
(vector-push list *misc-table*)
|
|
(setf (gethash list *misc-hash*)
|
|
(incf *misc-index*))))))
|
|
|
|
(defun compare-misc-entry (left right)
|
|
(destructuring-bind (left-gc-index left-bidi-index left-ccc-index
|
|
left-decimal-digit left-digit left-bidi-mirrored
|
|
left-cl-both-case-p)
|
|
left
|
|
(destructuring-bind (right-gc-index right-bidi-index right-ccc-index
|
|
right-decimal-digit right-digit right-bidi-mirrored
|
|
right-cl-both-case-p)
|
|
right
|
|
(or (and left-cl-both-case-p (not right-cl-both-case-p))
|
|
(and (or left-cl-both-case-p (not right-cl-both-case-p))
|
|
(or (< left-gc-index right-gc-index)
|
|
(and (= left-gc-index right-gc-index)
|
|
(or (< left-bidi-index right-bidi-index)
|
|
(and (= left-bidi-index right-bidi-index)
|
|
(or (< left-ccc-index right-ccc-index)
|
|
(and (= left-ccc-index right-ccc-index)
|
|
(or (string< left-decimal-digit
|
|
right-decimal-digit)
|
|
(and (string= left-decimal-digit
|
|
right-decimal-digit)
|
|
(or (string< left-digit right-digit)
|
|
(and (string= left-digit
|
|
right-digit)
|
|
(string< left-bidi-mirrored
|
|
right-bidi-mirrored))))))))))))))))
|
|
|
|
(defun build-misc-table ()
|
|
(sort *misc-table* #'compare-misc-entry)
|
|
(setq *misc-mapping* (make-array (1+ *misc-index*)))
|
|
(loop for i from 0 to *misc-index*
|
|
do (setf (aref *misc-mapping*
|
|
(gethash (aref *misc-table* i) *misc-hash*))
|
|
i)))
|
|
|
|
(defun slurp-ucd ()
|
|
(setq *last-uppercase* nil)
|
|
(setq *uppercase-transition-count* 0)
|
|
(setq *different-titlecases* nil)
|
|
(setq *different-numerics* nil)
|
|
(setq *name-size* 0)
|
|
(setq *misc-hash* (make-hash-table :test #'equal))
|
|
(setq *misc-index* -1)
|
|
(setq *misc-table* (make-array 256 :fill-pointer 0))
|
|
(setq *both-cases* nil)
|
|
(setq *decompositions* 0)
|
|
(setq *decomposition-types* (make-hash-table :test #'equal))
|
|
(setq *decomposition-length-max* 0)
|
|
(setq *decomposition-base* (make-array (ash #x110000
|
|
(- *page-size-exponent*))
|
|
:initial-element nil))
|
|
(setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*))
|
|
:initial-element nil))
|
|
(with-open-file (*standard-input*
|
|
(make-pathname :name "UnicodeData" :type "txt"
|
|
:defaults *extension-directory*)
|
|
:direction :input :external-format '(:utf-8 :crlf))
|
|
(loop for line = (read-line nil nil)
|
|
while line
|
|
do (slurp-ucd-line line)))
|
|
(second-pass)
|
|
(build-misc-table)
|
|
*decompositions*)
|
|
|
|
(defun split-string (line character)
|
|
(loop for prev-position = 0 then (1+ position)
|
|
for position = (position character line :start prev-position)
|
|
collect (subseq line prev-position position)
|
|
do (unless position
|
|
(loop-finish))))
|
|
|
|
(defun init-indices (strings)
|
|
(let ((hash (make-hash-table :test #'equal)))
|
|
(loop for string in strings
|
|
for index from 0
|
|
do (setf (gethash string hash) index))
|
|
hash))
|
|
|
|
(defparameter *general-categories*
|
|
(init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
|
|
"Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
|
|
"Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
|
|
(defparameter *bidi-classes*
|
|
(init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
|
|
"NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
|
|
|
|
|
|
(defparameter *block-first* nil)
|
|
|
|
(defun normalize-character-name (name)
|
|
(when (find #\_ name)
|
|
(error "Bad name for a character: ~A" name))
|
|
(unless (or (zerop (length name)) (find #\< name) (find #\> name))
|
|
(substitute #\_ #\Space name)))
|
|
|
|
;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
|
|
;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
|
|
;;; D800 -- F8FF : surrogates and private use
|
|
;;; 20000 -- 2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
|
|
;;; F0000 -- FFFFD : private use
|
|
;;; 100000 -- 10FFFD: private use
|
|
(defun encode-ucd-line (line code-point)
|
|
(destructuring-bind (name general-category canonical-combining-class
|
|
bidi-class decomposition-type-and-mapping
|
|
decimal-digit digit numeric bidi-mirrored
|
|
unicode-1-name iso-10646-comment simple-uppercase
|
|
simple-lowercase simple-titlecase)
|
|
line
|
|
(declare (ignore unicode-1-name iso-10646-comment))
|
|
(if (and (> (length name) 8)
|
|
(string= ", First>" name :start2 (- (length name) 8)))
|
|
(progn
|
|
(setq *block-first* code-point)
|
|
nil)
|
|
(let* ((gc-index (or (gethash general-category *general-categories*)
|
|
(error "unknown general category ~A"
|
|
general-category)))
|
|
(bidi-index (or (gethash bidi-class *bidi-classes*)
|
|
(error "unknown bidirectional class ~A"
|
|
bidi-class)))
|
|
(ccc-index (parse-integer canonical-combining-class))
|
|
(digit-index (unless (string= "" decimal-digit)
|
|
(parse-integer decimal-digit)))
|
|
(upper-index (unless (string= "" simple-uppercase)
|
|
(parse-integer simple-uppercase :radix 16)))
|
|
(lower-index (unless (string= "" simple-lowercase)
|
|
(parse-integer simple-lowercase :radix 16)))
|
|
(title-index (unless (string= "" simple-titlecase)
|
|
(parse-integer simple-titlecase :radix 16)))
|
|
(cl-both-case-p
|
|
(not (null (or (and (= gc-index 0) lower-index)
|
|
(and (= gc-index 1) upper-index)))))
|
|
(misc-index (hash-misc gc-index bidi-index ccc-index
|
|
decimal-digit digit bidi-mirrored
|
|
cl-both-case-p)))
|
|
(declare (ignore digit-index))
|
|
(incf *name-size* (length name))
|
|
(when (string/= "" decomposition-type-and-mapping)
|
|
(let ((split (split-string decomposition-type-and-mapping
|
|
#\Space)))
|
|
(when (char= #\< (aref (first split) 0))
|
|
(setf (gethash (pop split) *decomposition-types*) t))
|
|
(unless (aref *decomposition-base* (cp-high code-point))
|
|
(setf (aref *decomposition-base* (cp-high code-point))
|
|
(make-array (ash 1 *page-size-exponent*)
|
|
:initial-element nil)))
|
|
(setf (aref (aref *decomposition-base* (cp-high code-point))
|
|
(cp-low code-point))
|
|
(mapcar #'(lambda (string)
|
|
(parse-integer string :radix 16))
|
|
split))
|
|
(setq *decomposition-length-max*
|
|
(max *decomposition-length-max* (length split)))
|
|
(incf *decompositions* (length split))))
|
|
(when (and (string/= "" simple-uppercase)
|
|
(string/= "" simple-lowercase))
|
|
(push (list code-point upper-index lower-index) *both-cases*))
|
|
(when (string/= simple-uppercase simple-titlecase)
|
|
(push (cons code-point title-index) *different-titlecases*))
|
|
(when (string/= digit numeric)
|
|
(push (cons code-point numeric) *different-numerics*))
|
|
(cond
|
|
((= gc-index 8)
|
|
(unless *last-uppercase*
|
|
(incf *uppercase-transition-count*))
|
|
(setq *last-uppercase* t))
|
|
(t
|
|
(when *last-uppercase*
|
|
(incf *uppercase-transition-count*))
|
|
(setq *last-uppercase* nil)))
|
|
(when (> ccc-index 255)
|
|
(error "canonical combining class too large ~A" ccc-index))
|
|
(let ((result (make-ucd :misc misc-index
|
|
:transform (or upper-index lower-index 0))))
|
|
(when (and (> (length name) 7)
|
|
(string= ", Last>" name :start2 (- (length name) 7)))
|
|
(let ((page-start (ash (+ *block-first*
|
|
(ash 1 *page-size-exponent*)
|
|
-1)
|
|
(- *page-size-exponent*)))
|
|
(page-end (ash code-point (- *page-size-exponent*))))
|
|
(loop for point from *block-first*
|
|
below (ash page-start *page-size-exponent*)
|
|
do (setf (aref (aref *ucd-base* (cp-high point))
|
|
(cp-low point))
|
|
result))
|
|
(loop for page from page-start below page-end
|
|
do (setf (aref *ucd-base* page)
|
|
(make-array (ash 1 *page-size-exponent*)
|
|
:initial-element result)))
|
|
(loop for point from (ash page-end *page-size-exponent*)
|
|
below code-point
|
|
do (setf (aref (aref *ucd-base* (cp-high point))
|
|
(cp-low point))
|
|
result))))
|
|
(values result (normalize-character-name name)))))))
|
|
|
|
(defun slurp-ucd-line (line)
|
|
(let* ((split-line (split-string line #\;))
|
|
(code-point (parse-integer (first split-line) :radix 16))
|
|
(code-high (ash code-point (- *page-size-exponent*)))
|
|
(code-low (ldb (byte *page-size-exponent* 0) code-point)))
|
|
(unless (aref *ucd-base* code-high)
|
|
(setf (aref *ucd-base* code-high)
|
|
(make-array (ash 1 *page-size-exponent*)
|
|
:initial-element nil)))
|
|
(multiple-value-bind (encoding name)
|
|
(encode-ucd-line (cdr split-line) code-point)
|
|
(setf (aref (aref *ucd-base* code-high) code-low) encoding
|
|
(gethash code-point *unicode-names*) name))))
|
|
|
|
(defun second-pass ()
|
|
(loop for i from 0 below (length *ucd-base*)
|
|
when (aref *ucd-base* i)
|
|
do (loop for j from 0 below (length (aref *ucd-base* i))
|
|
for result = (aref (aref *ucd-base* i) j)
|
|
when result
|
|
when (let* ((transform-point (ucd-transform result))
|
|
(transform-high (ash transform-point
|
|
(- *page-size-exponent*)))
|
|
(transform-low (ldb (byte *page-size-exponent* 0)
|
|
transform-point)))
|
|
(and (plusp transform-point)
|
|
(/= (ucd-transform
|
|
(aref (aref *ucd-base* transform-high)
|
|
transform-low))
|
|
(+ (ash i *page-size-exponent*) j))))
|
|
do (destructuring-bind (gc-index bidi-index ccc-index
|
|
decimal-digit digit bidi-mirrored
|
|
cl-both-case-p)
|
|
(aref *misc-table* (ucd-misc result))
|
|
(declare (ignore cl-both-case-p))
|
|
(format t "~A~%" (+ (ash i *page-size-exponent*) j))
|
|
(setf (ucd-misc result)
|
|
(hash-misc gc-index bidi-index ccc-index
|
|
decimal-digit digit bidi-mirrored
|
|
nil))))))
|
|
|
|
(defun write-3-byte (triplet stream)
|
|
(write-byte (ldb (byte 8 0) triplet) stream)
|
|
(write-byte (ldb (byte 8 8) triplet) stream)
|
|
(write-byte (ldb (byte 8 16) triplet) stream))
|
|
|
|
(defun digit-to-byte (digit)
|
|
(if (string= "" digit)
|
|
255
|
|
(parse-integer digit)))
|
|
|
|
(defun output ()
|
|
(let ((hash (make-hash-table :test #'equalp))
|
|
(index 0))
|
|
(loop for page across *ucd-base*
|
|
do (when page
|
|
(unless (gethash page hash)
|
|
(setf (gethash page hash)
|
|
(incf index)))))
|
|
(let ((array (make-array (1+ index))))
|
|
(maphash #'(lambda (key value)
|
|
(setf (aref array value) key))
|
|
hash)
|
|
(setf (aref array 0)
|
|
(make-array (ash 1 *page-size-exponent*) :initial-element nil))
|
|
(with-open-file (stream (make-pathname :name "ucd"
|
|
:type "dat"
|
|
:defaults *extension-directory*)
|
|
:direction :output
|
|
:element-type '(unsigned-byte 8)
|
|
:if-exists :supersede
|
|
:if-does-not-exist :create)
|
|
(let ((offset (* (length *misc-table*) 8)))
|
|
(write-byte (mod offset 256) stream)
|
|
(write-byte (floor offset 256) stream))
|
|
(loop for (gc-index bidi-index ccc-index decimal-digit digit
|
|
bidi-mirrored)
|
|
across *misc-table*
|
|
do (write-byte gc-index stream)
|
|
do (write-byte bidi-index stream)
|
|
do (write-byte ccc-index stream)
|
|
do (write-byte (digit-to-byte decimal-digit) stream)
|
|
do (write-byte (digit-to-byte digit) stream)
|
|
do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
|
|
do (write-byte 0 stream)
|
|
do (write-byte 0 stream))
|
|
(loop for page across *ucd-base*
|
|
do (write-byte (if page (gethash page hash) 0) stream))
|
|
(loop for page across array
|
|
do (loop for entry across page
|
|
do (write-byte (if entry
|
|
(aref *misc-mapping* (ucd-misc entry))
|
|
255)
|
|
stream)
|
|
do (write-3-byte (if entry (ucd-transform entry) 0)
|
|
stream))))))
|
|
#+(or)
|
|
(with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
|
|
:defaults *extension-directory*)
|
|
:direction :output
|
|
:if-exists :supersede
|
|
:if-does-not-exist :create)
|
|
(with-standard-io-syntax
|
|
(write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
|
|
(maphash (lambda (code name)
|
|
(when name
|
|
(print code f)
|
|
(prin1 name f)))
|
|
*unicode-names*))
|
|
(setf *unicode-names* nil))
|
|
(with-open-file (*standard-output*
|
|
(make-pathname :name "numerics"
|
|
:type "lisp-expr"
|
|
:defaults *extension-directory*)
|
|
:direction :output
|
|
:if-exists :supersede
|
|
:if-does-not-exist :create)
|
|
(let ((*print-pretty* t))
|
|
(prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
|
|
*different-numerics*))))
|
|
(with-open-file (*standard-output*
|
|
(make-pathname :name "titlecases"
|
|
:type "lisp-expr"
|
|
:defaults *extension-directory*)
|
|
:direction :output
|
|
:if-exists :supersede
|
|
:if-does-not-exist :create)
|
|
(let ((*print-pretty* t))
|
|
(prin1 *different-titlecases*)))
|
|
(with-open-file (*standard-output*
|
|
(make-pathname :name "misc"
|
|
:type "lisp-expr"
|
|
:defaults *extension-directory*)
|
|
:direction :output
|
|
:if-exists :supersede
|
|
:if-does-not-exist :create)
|
|
(let ((*print-pretty* t))
|
|
(prin1 `(:length ,(length *misc-table*)
|
|
:uppercase ,(loop for (gc-index) across *misc-table*
|
|
for i from 0
|
|
when (= gc-index 0)
|
|
collect i)
|
|
:lowercase ,(loop for (gc-index) across *misc-table*
|
|
for i from 0
|
|
when (= gc-index 1)
|
|
collect i)
|
|
:titlecase ,(loop for (gc-index) across *misc-table*
|
|
for i from 0
|
|
when (= gc-index 2)
|
|
collect i)))))
|
|
(values))
|
|
|
|
(defun read-compiled-ucd ()
|
|
(with-open-file (stream (make-pathname :name "ucd"
|
|
:type "dat"
|
|
:defaults *extension-directory*)
|
|
:direction :input
|
|
:element-type '(unsigned-byte 8))
|
|
(let ((length (file-length stream)))
|
|
(setq *compiled-ucd*
|
|
(make-array length :element-type '(unsigned-byte 8)))
|
|
(read-sequence *compiled-ucd* stream)))
|
|
(values))
|
|
|
|
(slurp-ucd)
|
|
(output)
|