mirror of
https://gitlab.com/embeddable-common-lisp/ecl.git
synced 2025-12-05 18:30:24 -08:00
Provide some comments on what the code is doing. Increase size of integers used in encoding pairs of consecutive words (needed to encode the current ucd). Introduce additional bidirectional classes. Provide new ucd.h header with enums for general-category and bidirectional class. Add update-ucd.sh script to automate the updating process.
144 lines
4.6 KiB
Common Lisp
144 lines
4.6 KiB
Common Lisp
(declaim (optimize (debug 0) (speed 3)))
|
|
|
|
(setf *print-circle* t)
|
|
|
|
(defun compute-pairs (data table)
|
|
;; Compute the frequency of consecutive pairs of words and store
|
|
;; them in table.
|
|
(clrhash table)
|
|
(loop with max = 0
|
|
with max-pair = nil
|
|
for (code name . l) in data
|
|
do (loop for l2 on l
|
|
for a = (first l2)
|
|
for b = (second l2)
|
|
while b
|
|
do (let* ((pair (cons a b))
|
|
(freq (gethash pair table)))
|
|
(setf (gethash pair table)
|
|
(setf freq (if freq (1+ freq) 1))
|
|
a b)
|
|
(when (> freq max)
|
|
(setf max freq max-pair pair))))
|
|
finally (return (cons max max-pair))))
|
|
|
|
(defun replace-pair (pair code data)
|
|
;; For all characters in data, replace the pair representing two
|
|
;; consecutive words by code (which is a single integer).
|
|
;; Returns the total number of pairs left.
|
|
(let ((old-a (car pair))
|
|
(old-b (cdr pair)))
|
|
(loop with more = 0
|
|
for (ucd-code name . l) in data
|
|
do (loop with l2 = l
|
|
for a = (first l2)
|
|
for b = (second l2)
|
|
while b
|
|
do (when (and (eql a old-a) (eql b old-b))
|
|
;; replace (a b . rest) with (code . rest)
|
|
(setf (car l2) code
|
|
(cdr l2) (cddr l2)))
|
|
do (setf l2 (cdr l2)))
|
|
(incf more (1- (length l)))
|
|
finally (return more))))
|
|
|
|
(defun compress (data)
|
|
;; Compress data by replacing the pair of consecutive words with
|
|
;; maximum frequency with a new code until no pairs with frequency >
|
|
;; 1 are left.
|
|
(loop with last-length = 0
|
|
with table = (make-hash-table :size 2048 :test #'equal)
|
|
with pairs = '()
|
|
for new-symbol from (1+ *last-word-index*)
|
|
for (frequency . pair) = (compute-pairs data table)
|
|
while (and pair (> frequency 1))
|
|
do
|
|
(format t "~%;;; ~A, ~D -> ~D, ~D left" pair frequency new-symbol
|
|
(replace-pair pair new-symbol data))
|
|
(setf pairs (acons new-symbol pair pairs))
|
|
finally
|
|
;; There are no redundant pairs. We just define ad-hoc new
|
|
;; symbols for all remaining strings.
|
|
(loop with n = new-symbol
|
|
for (code name . l) in data
|
|
do (loop with l2 = l
|
|
for a = (first l2)
|
|
for b = (second l2)
|
|
while b
|
|
do (setf pairs (acons n (cons a b) pairs)
|
|
(car l2) n
|
|
(cdr l2) (cddr l2)
|
|
n (1+ n))))
|
|
(print 'finished)
|
|
(return-from compress (nreverse pairs))))
|
|
|
|
(progn
|
|
(defparameter *compressed-data* (copy-tree *data*))
|
|
(defparameter *paired-data* (compress *compressed-data*)))
|
|
|
|
(defparameter *last-code* (first (first (last *paired-data*))))
|
|
|
|
(defparameter *code-ndx-size* (ceiling (integer-length *last-code*) 8))
|
|
|
|
(defparameter *pair-table-size* (* (length *paired-data*)
|
|
(* 2 *code-ndx-size*)))
|
|
|
|
(defparameter *code-to-name-bytes*
|
|
(* (length *compressed-data*)
|
|
(+ 3 ; Size of Unicode code
|
|
;; Size of index into the data table
|
|
*code-ndx-size*)))
|
|
|
|
(defparameter *sorted-names-bytes*
|
|
;; The sorted list of character names is just a list of indices into
|
|
;; the *code-to-name-bytes* table
|
|
(* (length *compressed-data*) *code-ndx-size*))
|
|
|
|
(defparameter *word-dictionary*
|
|
(+ *words-array-bytes*))
|
|
|
|
(format t "
|
|
;;; Codes dictionary = ~D bytes
|
|
;;; Pair table size = ~D bytes
|
|
;;; Code to names table = ~D bytes
|
|
;;; Names to codes table = ~D bytes
|
|
;;; Total = ~D bytes
|
|
"
|
|
*word-dictionary*
|
|
*pair-table-size*
|
|
*code-to-name-bytes*
|
|
*sorted-names-bytes*
|
|
(+
|
|
*word-dictionary*
|
|
*pair-table-size*
|
|
*code-to-name-bytes*
|
|
*sorted-names-bytes*
|
|
))
|
|
|
|
;;; WITH HANGUL
|
|
;;; Codes dictionary = 78566 bytes
|
|
;;; Pair table size = 198752 bytes
|
|
;;; Code to names table = 164570 bytes
|
|
;;; Names to codes table = 65828 bytes
|
|
;;; Total = 507716 bytes
|
|
|
|
;;; WITHOUT HANGUL
|
|
;;; Codes dictionary = 78555 bytes
|
|
;;; Pair table size = 150868 bytes
|
|
;;; Code to names table = 108710 bytes
|
|
;;; Names to codes table = 43484 bytes
|
|
;;; Total = 381617 bytes
|
|
|
|
;;; Without HANGUL (split by space and -)
|
|
;;; Codes dictionary = 58258 bytes
|
|
;;; Pair table size = 160576 bytes
|
|
;;; Code to names table = 108710 bytes
|
|
;;; Names to codes table = 43484 bytes
|
|
;;; Total = 371028 bytes
|
|
|
|
;;; With HANGUL (split by space and -)
|
|
;;; Codes dictionary = 58269 bytes
|
|
;;; Pair table size = 208460 bytes
|
|
;;; Code to names table = 164570 bytes
|
|
;;; Names to codes table = 65828 bytes
|
|
;;; Total = 497127 bytes
|