Added the Unicode character database to the C library

This commit is contained in:
Juan Jose Garcia Ripoll 2012-12-26 11:04:27 +01:00
parent b6519e5c7a
commit ebafa5f275
14 changed files with 81861 additions and 6 deletions

View file

@ -0,0 +1,123 @@
(defun split-words (text &key (set '(#\Space)) (exclude t))
(loop with start = 0
with output = '()
with elt-type = (array-element-type text)
for i from 0 below (length text)
for c across text
when (member c set)
do (setf output (list* (make-array (+ (- i start) (if exclude 0 1))
:element-type elt-type
:displaced-to text
:displaced-index-offset start)
output)
start (1+ i))
finally (return (nreverse (list* (make-array (- i start)
:element-type elt-type
:displaced-to text
:displaced-index-offset start)
output)))))
(defun encode-words (words hash)
(loop for word in words
collect (or (gethash word hash)
(let* ((word (copy-seq word))
(ndx (hash-table-count hash)))
(setf (gethash word hash) (1+ ndx))))))
(defun fixup-hangul-syllables (dictionary)
;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
(let* ((sbase #xac00)
(lbase #x1100)
(vbase #x1161)
(tbase #x11a7)
(scount 11172)
(lcount 19)
(vcount 21)
(tcount 28)
(ncount (* vcount tcount))
(table (make-hash-table)))
(with-open-file (*standard-input*
(make-pathname :name "Jamo" :type "txt"))
(loop for line = (read-line nil nil)
while line
if (position #\; line)
do (add-jamo-information line table)))
(loop for sindex from 0 below scount
for l = (+ lbase (floor sindex ncount))
for v = (+ vbase (floor (mod sindex ncount) tcount))
for tee = (+ tbase (mod sindex tcount))
for name = (list* "HANGUL_" "SYLLABLE_"
(gethash l table) (gethash v table)
(unless (= tee tbase) (list (gethash tee table))))
for code = (+ sbase sindex)
collect (list* code (apply #'concatenate 'string name)
(encode-words name dictionary)))))
(defun add-jamo-information (line table)
(let* ((split (split-words line :set '(#\;) :exclude t))
(code (parse-integer (first split) :radix 16))
(syllable (string-trim '(#\Space)
(subseq (second split) 0 (position #\# (second split))))))
(setf (gethash code table) syllable)))
(defvar *words*)
(defparameter *data*
(with-open-file (in "~/src/sbcl/tools-for-build/UnicodeData.txt" :direction :input)
(loop with words = (setf *words* (make-hash-table :size 1024 :test #'equal))
for ucd-line = (read-line in nil nil nil)
while ucd-line
nconc (let* ((ucd-data (split-words ucd-line :set '(#\;)))
(code (first ucd-data))
(name (second ucd-data)))
(unless (eql (char name 0) #\<)
(setf name (substitute #\_ #\Space name))
(list (list* (parse-integer code :radix 16)
name
(encode-words (split-words
name
:set '(#\Space #\_ #\-)
:exclude nil)
words))))))))
(print (length *data*))
(print (first (last *data*)))
;#+(or)
(progn
(setf *data*
(sort (nconc (fixup-hangul-syllables *words*) *data*)
#'<
:key #'car))
(print (length *data*))
(print (first (last *data*))))
(defparameter *words-array*
(loop with array = (make-array (1+ (hash-table-count *words*)))
for k being the hash-key in *words* using (hash-value v)
do (setf (aref array v) k)
finally (return array)))
(defparameter *last-word-index* (1- (length *words-array*)))
(defparameter *words-array-bytes*
(loop for c across *words-array*
sum (1+ (length c))))
(defun code-to-string (code)
(aref *words-array* code))
(defparameter *flattened-data*
(loop for (code name . rest) in *data*
nconc (append rest (list 0))))
(defparameter *group-names*
(loop with output = '()
with start = (first (first *data*))
with last = start
for (code name . rest) in *data*
do (when (>= (- code last) 2)
(setf output (cons (list start last) output)
start code))
(setf last code)
finally (return (nreverse (cons (list start code) output)))))

View file

@ -0,0 +1,298 @@
(defparameter *destination*
(merge-pathnames "../../src/c/unicode/"
(or *load-truename* *compile-pathname*)))
(let* ((translated-data (copy-tree *compressed-data*))
(pairs (copy-tree *paired-data*))
(first-code (loop for (pair-code . pair) in pairs minimize pair-code))
(last-code (loop for (pair-code . pair) in pairs maximize pair-code)))
;;
;; We make sure that for each character there is a unique pair which is not
;; used anywhere else
;;
(loop with used-code = (make-array (1+ last-code) :initial-element nil)
for line in translated-data
for pair-code = (third line)
do (cond ((/= (length line) 3)
(error "Error in compressed data: too long code ~A" line))
((or (aref used-code pair-code)
(< pair-code first-code))
(let ((new-pair (cons pair-code 0)))
(setf pairs (acons (incf last-code) new-pair pairs)
(third line) last-code)))
(t
(setf (aref used-code pair-code) t))))
;;
;; We now renumber all pairs.
;;
(let ((translation-table (make-array (1+ last-code) :initial-element nil))
(counter -1))
(flet ((add-code (code)
(or (aref translation-table code)
(setf (aref translation-table code) (incf counter))))
(translate (old-code)
(or (aref translation-table old-code)
(error "Unknown code ~A" old-code))))
;; First of all we add the words
(loop for i from 0 below first-code
do (add-code i))
;; Then we add all pairs that represent characters, so that they
;; are consecutive, too.
(loop for line in translated-data
do (setf (third line) (add-code (third line))))
;; Finally, we add the remaining pairs
(loop for record in pairs
do (setf (car record) (add-code (car record))))
;; ... and we fix the definitions
(loop for (code . pair) in pairs
do (setf (car pair) (translate (car pair))
(cdr pair) (translate (cdr pair))))))
(defparameter *sorted-compressed-data* translated-data)
(defparameter *sorted-pairs* (sort pairs #'< :key #'car))
(print 'finished)
)
(defparameter *grouped-characters*
(loop with last-ucd-code = nil
with start-ucd-code = nil
with start-code = nil
with output = '()
with aux = '()
for n from (third (first *sorted-compressed-data*))
for line in *sorted-compressed-data*
for (ucd-code name code) = line
do (cond ((/= code n)
(error "Codes in *sorted-compressed-data* are not consecutive:~%~A"
(cons line (subseq aux 0 10))))
((null start-ucd-code)
(setf start-ucd-code ucd-code
start-code code))
((= last-ucd-code (1- ucd-code))
)
(t
(push (list start-ucd-code last-ucd-code start-code)
output)
(setf start-ucd-code ucd-code
start-code code)))
(setf last-ucd-code ucd-code aux (cons line aux))
finally (return (nreverse output))))
(with-open-file (s (merge-pathnames "ucd_names.h" *destination*)
:direction :output
:if-exists :supersede)
(format s "/*
* UNICODE NAMES DATABASE
*/
#ifndef ECL_UCD_NAMES_H
#define ECL_UCD_NAMES_H 1
#define ECL_UCD_FIRST_PAIR ~D
#define ECL_UCD_TOTAL_PAIRS ~D
#define ECL_UCD_TOTAL_GROUPS ~D
#define ECL_UCD_LARGEST_CHAR_NAME ~D
#define ECL_UCD_TOTAL_NAMES ~D
typedef struct {
unsigned char codes[4];
} ecl_ucd_names_pair_type;
typedef struct {
int smallest, largest, pair_code;
} ecl_ucd_names_char_group;
typedef struct {
unsigned char pair[2];
unsigned char code[3];
} ecl_ucd_code_and_pair;
extern const ecl_ucd_names_pair_type ecl_ucd_names_pair[ECL_UCD_TOTAL_PAIRS];
extern const ecl_ucd_names_char_group ecl_ucd_names_char[ECL_UCD_TOTAL_GROUPS];
extern const char *ecl_ucd_names_word[ECL_UCD_FIRST_PAIR];
extern const ecl_ucd_code_and_pair ecl_ucd_sorted_pairs[ECL_UCD_TOTAL_NAMES];
#endif
"
(1+ *last-word-index*)
(length *sorted-pairs*)
(length *grouped-characters*)
(loop for (code name . rest) in *compressed-data*
maximize (length name))
(length *compressed-data*)
))
(with-open-file (s (merge-pathnames "ucd_names_pair.c" *destination*)
:direction :output
:if-exists :supersede)
(format s "/*
* Pairs of symbols.
*/
#include <ecl/ecl.h>
#include \"ucd_names.h\"
const ecl_ucd_names_pair_type ecl_ucd_names_pair[ECL_UCD_TOTAL_PAIRS] = {
"
(length *sorted-pairs*) (length *sorted-pairs*))
(loop for i from 0
for (pair-code . (a . b)) in *sorted-pairs*
do (format s "~A{~D, ~D, ~D, ~D}~%"
(if (plusp i) "," "")
(logand a #xff) (ash a -8)
(logand b #xff) (ash b -8)
))
(format s "};~%"))
(with-open-file (s (merge-pathnames "ucd_names_codes.c" *destination*)
:direction :output
:if-exists :supersede)
(format s "/*
* Sorted character names.
*/
#include <ecl/ecl.h>
#include \"ucd_names.h\"
const ecl_ucd_code_and_pair ecl_ucd_sorted_pairs[ECL_UCD_TOTAL_NAMES] = {
")
(loop with l = (sort (copy-tree *sorted-compressed-data*) #'string<= :key #'second)
for (ucd-code name code) in l
for i from 0
do (format s "~A{{~D, ~D}, {~D, ~D, ~D}}~%"
(if (plusp i) "," "")
(logand code #xff) (ash code -8)
(logand ucd-code #xff) (logand (ash ucd-code -8) #xff)
(logand (ash ucd-code -16) #xff)))
(format s "};"))
(with-open-file (s (merge-pathnames "ucd_names_str.c" *destination*)
:direction :output
:if-exists :supersede)
(format s "/*
* Dictionary words.
*/
#include <ecl/ecl.h>
#include \"ucd_names.h\"
const char *ecl_ucd_names_word[ECL_UCD_FIRST_PAIR] = {
")
(loop for i from 0
for c across *words-array*
do (format s "~A~S~%" (if (plusp i) "," "") (or c "")))
(format s "};~%"))
(with-open-file (s (merge-pathnames "ucd_names_char.c" *destination*)
:direction :output
:if-exists :supersede)
(format s "/*
* Dictionary words.
*/
#include <string.h>
#include <ecl/ecl.h>
#include \"ucd_names.h\"
const ecl_ucd_names_char_group ecl_ucd_names_char[ECL_UCD_TOTAL_GROUPS] = {
"
(length *grouped-characters*))
(loop for i from 0
for (start end pair-code) in *grouped-characters*
do (format s "~A{~D,~D,~D}~%" (if (plusp i) "," "")
start end pair-code))
(format s "};
static int
search_pair(ecl_character c)
{
int mid, low = 0, up = ECL_UCD_TOTAL_GROUPS-1;
do {
mid = (up + low) / 2;
if (c < ecl_ucd_names_char[mid].smallest)
up = mid-1;
else if (c > ecl_ucd_names_char[mid].largest)
low = mid+1;
else
return (c - ecl_ucd_names_char[mid].smallest) +
ecl_ucd_names_char[mid].pair_code;
} while (low <= up && (low >= 0) && (up < ECL_UCD_TOTAL_GROUPS));
return -1;
}
static void
fill_pair_name(char *buffer, int pair)
{
if (pair < ECL_UCD_FIRST_PAIR) {
strncat(buffer, ecl_ucd_names_word[pair], ECL_UCD_LARGEST_CHAR_NAME+1);
/*
printf(\"text=%s\\n\", ecl_ucd_names_word[pair]);
*/
} else {
const ecl_ucd_names_pair_type p = ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR];
/*
printf(\"ndx=%d\\n\", pair - ECL_UCD_FIRST_PAIR);
printf(\"c0=%d\\n\", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[0]);
printf(\"c1=%d\\n\", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[1]);
printf(\"c2=%d\\n\", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[2]);
printf(\"c3=%d\\n\", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[3]);
*/
fill_pair_name(buffer, (((unsigned int)p.codes[1]) << 8) | p.codes[0]);
fill_pair_name(buffer, (((unsigned int)p.codes[3]) << 8) | p.codes[2]);
}
}
cl_object
_ecl_ucd_code_to_name(ecl_character c)
{
int pair = search_pair(c);
if (pair < 0)
return ECL_NIL;
else {
char buffer[ECL_UCD_LARGEST_CHAR_NAME+1];
buffer[0] = 0;
fill_pair_name(buffer, pair);
return make_base_string_copy(buffer);
}
}
cl_object
_ecl_ucd_name_to_code(cl_object name)
{
int mid, low = 0, up = ECL_UCD_TOTAL_NAMES-1;
int l = ecl_length(name);
if (l <= ECL_UCD_LARGEST_CHAR_NAME) {
char buffer1[ECL_UCD_LARGEST_CHAR_NAME+1];
char buffer2[ECL_UCD_LARGEST_CHAR_NAME+1];
for (mid = 0; mid < l; mid++) {
ecl_character c = ecl_char_upcase(ecl_char(name, mid));
buffer1[mid] = c;
if (c < 32 || c > 127) /* All character names are [-A-Z_0-9]* */
return ECL_NIL;
}
buffer1[mid] = 0;
do {
ecl_ucd_code_and_pair p = ecl_ucd_sorted_pairs[mid = (low + up) / 2];
int flag, pair = ((unsigned int)p.pair[1] << 8) | p.pair[0];
buffer2[0] = 0;
fill_pair_name(buffer2, pair);
flag = strcmp(buffer1, buffer2);
/*
printf(\"[%d,%d,%d] %s <> (%d)%s -> %d\\n\",
low, mid, up, buffer1, pair, buffer2, flag);
*/
if (flag == 0) {
return ecl_make_fixnum(((unsigned int)p.code[2] << 16) |
((unsigned int)p.code[1] << 8) |
p.code[0]);
} else if (flag < 0) {
up = mid - 1;
} else {
low = mid + 1;
}
} while (low <= up);
}
return ECL_NIL;
}
"))
;(ext:run-program "/bin/sh" '("-c" "cp *.c *.h ~/devel/ecl/src/c/unicode/"))

View file

@ -0,0 +1,138 @@
(load "./load-names.lisp")
(declaim (optimize (debug 0) (speed 3)))
(setf *print-circle* t)
(defun compute-pairs (data table)
(clrhash table)
(loop with max = 0
with max-pair = nil
for (code name . l) in data
do (loop for l2 on l
for a = (car l2)
for b = (cadr l2)
while b
do (let* ((pair (cons a b))
(c (gethash pair table)))
(setf (gethash pair table)
(setf c (if c (1+ c) 1))
a b)
(when (> c max)
(setf max c max-pair pair))))
finally (return (cons max max-pair))))
(defun replace-pair (pair code data)
(let ((old-a (car pair))
(old-b (cdr pair)))
(loop with more = 0
for (ucd-code name . l) in data
do (loop with l2 = l
for a = (first l2)
for b = (second l2)
while b
do (when (and (eql a old-a) (eql b old-b))
;; replace (a b . c) with (pair . c)
(setf (car l2) code
(cdr l2) (cddr l2)))
do (setf l2 (cdr l2)))
do (setf more (+ more (1- (length l))))
finally (return more))))
(defun compress (data)
(loop with last-length = 0
with table = (make-hash-table :size 2048 :test #'equal)
with pairs = '()
for new-symbol from (1+ *last-word-index*)
for (frequency . pair) = (compute-pairs data table)
while (and pair (> frequency 1))
do
(format t "~%;;; ~A, ~D -> ~D, ~D left" pair frequency new-symbol
(replace-pair pair new-symbol data))
(setf pairs (acons new-symbol pair pairs))
finally
;; There are no redundant pairs. We just define ad-hoc new
;; symbols for all remaining strings.
(loop with n = new-symbol
for (code name . l) in data
do (loop with l2 = l
for a = (first l2)
for b = (second l2)
while b
do (setf pairs (acons n (cons a b) pairs)
(car l2) n
(cdr l2) (cddr l2)
n (1+ n))))
(print 'finished)
(return-from compress (nreverse pairs))))
(progn
(defparameter *compressed-data* (copy-tree *data*))
(defparameter *paired-data* (compress *compressed-data*)))
(defparameter *last-code* (first (first (last *paired-data*))))
(defparameter *code-ndx-size* (ceiling (integer-length *last-code*) 8))
(defparameter *pair-table-size* (* (length *paired-data*)
(* 2 *code-ndx-size*)))
(defparameter *code-to-name-bytes*
(* (length *compressed-data*)
(+ 3 ; Size of Unicode code
;; Size of index into the data table
*code-ndx-size*)))
(defparameter *sorted-names-bytes*
;; The sorted list of character names is just a list of indices into
;; the *code-to-name-bytes* table
(* (length *compressed-data*) *code-ndx-size*))
(defparameter *word-dictionary*
(+ *words-array-bytes*))
(format t "
;;; Codes dictionary = ~D bytes
;;; Pair table size = ~D bytes
;;; Code to names table = ~D bytes
;;; Names to codes table = ~D bytes
;;; Total = ~D bytes
"
*word-dictionary*
*pair-table-size*
*code-to-name-bytes*
*sorted-names-bytes*
(+
*word-dictionary*
*pair-table-size*
*code-to-name-bytes*
*sorted-names-bytes*
))
;;; WITH HANGUL
;;; Codes dictionary = 78566 bytes
;;; Pair table size = 198752 bytes
;;; Code to names table = 164570 bytes
;;; Names to codes table = 65828 bytes
;;; Total = 507716 bytes
;;; WITHOUT HANGUL
;;; Codes dictionary = 78555 bytes
;;; Pair table size = 150868 bytes
;;; Code to names table = 108710 bytes
;;; Names to codes table = 43484 bytes
;;; Total = 381617 bytes
;;; Without HANGUL (split by space and -)
;;; Codes dictionary = 58258 bytes
;;; Pair table size = 160576 bytes
;;; Code to names table = 108710 bytes
;;; Names to codes table = 43484 bytes
;;; Total = 371028 bytes
;;; With HANGUL (split by space and -)
;;; Codes dictionary = 58269 bytes
;;; Pair table size = 208460 bytes
;;; Code to names table = 164570 bytes
;;; Names to codes table = 65828 bytes
;;; Total = 497127 bytes

View file

@ -465,7 +465,15 @@ cl_char_name(cl_object c)
{
ecl_character code = ecl_char_code(c);
cl_object output;
if (code > 127) {
if (code <= 127) {
output = ecl_gethash_safe(ecl_make_fixnum(code), cl_core.char_names, ECL_NIL);
}
#ifdef ECL_UNICODE_NAMES
else if (!Null(output = _ecl_ucd_code_to_name(code))) {
(void)0;
}
#endif
else {
ecl_base_char name[8];
ecl_base_char *start;
name[7] = 0;
@ -482,8 +490,6 @@ cl_char_name(cl_object c)
}
start[0] = 'U';
output = make_base_string_copy((const char*)start);
} else {
output = ecl_gethash_safe(ecl_make_fixnum(code), cl_core.char_names, ECL_NIL);
}
@(return output);
}
@ -491,13 +497,21 @@ cl_char_name(cl_object c)
cl_object
cl_name_char(cl_object name)
{
const cl_env_ptr the_env = ecl_process_env();
cl_object c;
cl_index l;
name = cl_string(name);
c = ecl_gethash_safe(name, cl_core.char_names, ECL_NIL);
if (c != ECL_NIL) {
c = ECL_CODE_CHAR(ecl_fixnum(c));
} else if (ecl_stringp(name) && (l = ecl_length(name))) {
ecl_return1(the_env, ECL_CODE_CHAR(ecl_fixnum(c)));
}
#ifdef ECL_UNICODE_NAMES
c = _ecl_ucd_name_to_code(name);
if (c != ECL_NIL) {
ecl_return1(the_env, cl_code_char(c));
}
#endif
if (ecl_stringp(name) && (l = ecl_length(name))) {
c = cl_char(name, ecl_make_fixnum(0));
if (l == 1) {
(void)0;
@ -516,5 +530,5 @@ cl_name_char(cl_object name)
}
}
}
@(return c);
ecl_return1(the_env, c);
}

31
src/c/unicode/ucd_names.h Normal file
View file

@ -0,0 +1,31 @@
/*
* UNICODE NAMES DATABASE
*/
#ifndef ECL_UCD_NAMES_H
#define ECL_UCD_NAMES_H 1
#define ECL_UCD_FIRST_PAIR 9699
#define ECL_UCD_TOTAL_PAIRS 37993
#define ECL_UCD_TOTAL_GROUPS 481
#define ECL_UCD_LARGEST_CHAR_NAME 83
#define ECL_UCD_TOTAL_NAMES 32914
typedef struct {
unsigned char codes[4];
} ecl_ucd_names_pair_type;
typedef struct {
int smallest, largest, pair_code;
} ecl_ucd_names_char_group;
typedef struct {
unsigned char pair[2];
unsigned char code[3];
} ecl_ucd_code_and_pair;
extern const ecl_ucd_names_pair_type ecl_ucd_names_pair[ECL_UCD_TOTAL_PAIRS];
extern const ecl_ucd_names_char_group ecl_ucd_names_char[ECL_UCD_TOTAL_GROUPS];
extern const char *ecl_ucd_names_word[ECL_UCD_FIRST_PAIR];
extern const ecl_ucd_code_and_pair ecl_ucd_sorted_pairs[ECL_UCD_TOTAL_NAMES];
#endif

View file

@ -0,0 +1,584 @@
/*
* Dictionary words.
*/
#include <string.h>
#include <ecl/ecl.h>
#include "ucd_names.h"
const ecl_ucd_names_char_group ecl_ucd_names_char[ECL_UCD_TOTAL_GROUPS] = {
{32,126,9699}
,{160,887,9794}
,{890,894,10522}
,{900,906,10527}
,{908,908,10534}
,{910,929,10535}
,{931,1317,10555}
,{1329,1366,10942}
,{1369,1375,10980}
,{1377,1415,10987}
,{1417,1418,11026}
,{1425,1479,11028}
,{1488,1514,11083}
,{1520,1524,11110}
,{1536,1539,11115}
,{1542,1563,11119}
,{1566,1567,11141}
,{1569,1630,11143}
,{1632,1805,11205}
,{1807,1866,11379}
,{1869,1969,11439}
,{1984,2042,11540}
,{2048,2093,11599}
,{2096,2110,11645}
,{2304,2361,11660}
,{2364,2382,11718}
,{2384,2389,11737}
,{2392,2418,11743}
,{2425,2431,11770}
,{2433,2435,11777}
,{2437,2444,11780}
,{2447,2448,11788}
,{2451,2472,11790}
,{2474,2480,11812}
,{2482,2482,11819}
,{2486,2489,11820}
,{2492,2500,11824}
,{2503,2504,11833}
,{2507,2510,11835}
,{2519,2519,11839}
,{2524,2525,11840}
,{2527,2531,11842}
,{2534,2555,11847}
,{2561,2563,11869}
,{2565,2570,11872}
,{2575,2576,11878}
,{2579,2600,11880}
,{2602,2608,11902}
,{2610,2611,11909}
,{2613,2614,11911}
,{2616,2617,11913}
,{2620,2620,11915}
,{2622,2626,11916}
,{2631,2632,11921}
,{2635,2637,11923}
,{2641,2641,11926}
,{2649,2652,11927}
,{2654,2654,11931}
,{2662,2677,11932}
,{2689,2691,11948}
,{2693,2701,11951}
,{2703,2705,11960}
,{2707,2728,11963}
,{2730,2736,11985}
,{2738,2739,11992}
,{2741,2745,11994}
,{2748,2757,11999}
,{2759,2761,12009}
,{2763,2765,12012}
,{2768,2768,12015}
,{2784,2787,12016}
,{2790,2799,12020}
,{2801,2801,12030}
,{2817,2819,12031}
,{2821,2828,12034}
,{2831,2832,12042}
,{2835,2856,12044}
,{2858,2864,12066}
,{2866,2867,12073}
,{2869,2873,12075}
,{2876,2884,12080}
,{2887,2888,12089}
,{2891,2893,12091}
,{2902,2903,12094}
,{2908,2909,12096}
,{2911,2915,12098}
,{2918,2929,12103}
,{2946,2947,12115}
,{2949,2954,12117}
,{2958,2960,12123}
,{2962,2965,12126}
,{2969,2970,12130}
,{2972,2972,12132}
,{2974,2975,12133}
,{2979,2980,12135}
,{2984,2986,12137}
,{2990,3001,12140}
,{3006,3010,12152}
,{3014,3016,12157}
,{3018,3021,12160}
,{3024,3024,12164}
,{3031,3031,12165}
,{3046,3066,12166}
,{3073,3075,12187}
,{3077,3084,12190}
,{3086,3088,12198}
,{3090,3112,12201}
,{3114,3123,12224}
,{3125,3129,12234}
,{3133,3140,12239}
,{3142,3144,12247}
,{3146,3149,12250}
,{3157,3158,12254}
,{3160,3161,12256}
,{3168,3171,12258}
,{3174,3183,12262}
,{3192,3199,12272}
,{3202,3203,12280}
,{3205,3212,12282}
,{3214,3216,12290}
,{3218,3240,12293}
,{3242,3251,12316}
,{3253,3257,12326}
,{3260,3268,12331}
,{3270,3272,12340}
,{3274,3277,12343}
,{3285,3286,12347}
,{3294,3294,12349}
,{3296,3299,12350}
,{3302,3311,12354}
,{3313,3314,12364}
,{3330,3331,12366}
,{3333,3340,12368}
,{3342,3344,12376}
,{3346,3368,12379}
,{3370,3385,12402}
,{3389,3396,12418}
,{3398,3400,12426}
,{3402,3405,12429}
,{3415,3415,12433}
,{3424,3427,12434}
,{3430,3445,12438}
,{3449,3455,12454}
,{3458,3459,12461}
,{3461,3478,12463}
,{3482,3505,12481}
,{3507,3515,12505}
,{3517,3517,12514}
,{3520,3526,12515}
,{3530,3530,12522}
,{3535,3540,12523}
,{3542,3542,12529}
,{3544,3551,12530}
,{3570,3572,12538}
,{3585,3642,12541}
,{3647,3675,12599}
,{3713,3714,12628}
,{3716,3716,12630}
,{3719,3720,12631}
,{3722,3722,12633}
,{3725,3725,12634}
,{3732,3735,12635}
,{3737,3743,12639}
,{3745,3747,12646}
,{3749,3749,12649}
,{3751,3751,12650}
,{3754,3755,12651}
,{3757,3769,12653}
,{3771,3773,12666}
,{3776,3780,12669}
,{3782,3782,12674}
,{3784,3789,12675}
,{3792,3801,12681}
,{3804,3805,12691}
,{3840,3911,12693}
,{3913,3948,12765}
,{3953,3979,12801}
,{3984,3991,12828}
,{3993,4028,12836}
,{4030,4044,12872}
,{4046,4056,12887}
,{4096,4293,12898}
,{4304,4348,13096}
,{4352,4680,13141}
,{4682,4685,13470}
,{4688,4694,13474}
,{4696,4696,13481}
,{4698,4701,13482}
,{4704,4744,13486}
,{4746,4749,13527}
,{4752,4784,13531}
,{4786,4789,13564}
,{4792,4798,13568}
,{4800,4800,13575}
,{4802,4805,13576}
,{4808,4822,13580}
,{4824,4880,13595}
,{4882,4885,13652}
,{4888,4954,13656}
,{4959,4988,13723}
,{4992,5017,13753}
,{5024,5108,13779}
,{5120,5788,13864}
,{5792,5872,14533}
,{5888,5900,14614}
,{5902,5908,14627}
,{5920,5942,14634}
,{5952,5971,14657}
,{5984,5996,14677}
,{5998,6000,14690}
,{6002,6003,14693}
,{6016,6109,14695}
,{6112,6121,14789}
,{6128,6137,14799}
,{6144,6158,14809}
,{6160,6169,14824}
,{6176,6263,14834}
,{6272,6314,14922}
,{6320,6389,14965}
,{6400,6428,15035}
,{6432,6443,15064}
,{6448,6459,15076}
,{6464,6464,15088}
,{6468,6509,15089}
,{6512,6516,15131}
,{6528,6571,15136}
,{6576,6601,15180}
,{6608,6618,15206}
,{6622,6683,15217}
,{6686,6750,15279}
,{6752,6780,15344}
,{6783,6793,15373}
,{6800,6809,15384}
,{6816,6829,15394}
,{6912,6987,15408}
,{6992,7036,15484}
,{7040,7082,15529}
,{7086,7097,15572}
,{7168,7223,15584}
,{7227,7241,15640}
,{7245,7295,15655}
,{7376,7410,15706}
,{7424,7654,15741}
,{7677,7957,15972}
,{7960,7965,16253}
,{7968,8005,16259}
,{8008,8013,16297}
,{8016,8023,16303}
,{8025,8025,16311}
,{8027,8027,16312}
,{8029,8029,16313}
,{8031,8061,16314}
,{8064,8116,16345}
,{8118,8132,16398}
,{8134,8147,16413}
,{8150,8155,16427}
,{8157,8175,16433}
,{8178,8180,16452}
,{8182,8190,16455}
,{8192,8292,16464}
,{8298,8305,16565}
,{8308,8334,16573}
,{8336,8340,16600}
,{8352,8376,16605}
,{8400,8432,16630}
,{8448,8585,16663}
,{8592,9192,16801}
,{9216,9254,17402}
,{9280,9290,17441}
,{9312,9933,17452}
,{9935,9953,18074}
,{9955,9955,18093}
,{9960,9983,18094}
,{9985,9988,18118}
,{9990,9993,18122}
,{9996,10023,18126}
,{10025,10059,18154}
,{10061,10061,18189}
,{10063,10066,18190}
,{10070,10078,18194}
,{10081,10132,18203}
,{10136,10159,18255}
,{10161,10174,18279}
,{10176,10186,18293}
,{10188,10188,18304}
,{10192,11084,18305}
,{11088,11097,19198}
,{11264,11310,19208}
,{11312,11358,19255}
,{11360,11505,19302}
,{11513,11557,19448}
,{11568,11621,19493}
,{11631,11631,19547}
,{11648,11670,19548}
,{11680,11686,19571}
,{11688,11694,19578}
,{11696,11702,19585}
,{11704,11710,19592}
,{11712,11718,19599}
,{11720,11726,19606}
,{11728,11734,19613}
,{11736,11742,19620}
,{11744,11825,19627}
,{11904,11929,19709}
,{11931,12019,19735}
,{12032,12245,19824}
,{12272,12283,20038}
,{12288,12351,20050}
,{12353,12438,20114}
,{12441,12543,20200}
,{12549,12589,20303}
,{12593,12686,20344}
,{12688,12727,20438}
,{12736,12771,20478}
,{12784,12830,20514}
,{12832,13054,20561}
,{13056,13311,20784}
,{19904,19967,21040}
,{40960,42124,21104}
,{42128,42182,22269}
,{42192,42539,22324}
,{42560,42591,22672}
,{42594,42611,22704}
,{42620,42647,22722}
,{42656,42743,22750}
,{42752,42892,22838}
,{43003,43051,22979}
,{43056,43065,23028}
,{43072,43127,23038}
,{43136,43204,23094}
,{43214,43225,23163}
,{43232,43259,23175}
,{43264,43347,23203}
,{43359,43388,23287}
,{43392,43469,23317}
,{43471,43481,23395}
,{43486,43487,23406}
,{43520,43574,23408}
,{43584,43597,23463}
,{43600,43609,23477}
,{43612,43643,23487}
,{43648,43714,23519}
,{43739,43743,23586}
,{43968,44013,23591}
,{44016,44025,23637}
,{44032,55203,23647}
,{55216,55238,34819}
,{55243,55291,34842}
,{63744,64045,34891}
,{64048,64109,35193}
,{64112,64217,35255}
,{64256,64262,35361}
,{64275,64279,35368}
,{64285,64310,35373}
,{64312,64316,35399}
,{64318,64318,35404}
,{64320,64321,35405}
,{64323,64324,35407}
,{64326,64433,35409}
,{64467,64831,35517}
,{64848,64911,35882}
,{64914,64967,35946}
,{65008,65021,36000}
,{65024,65049,36014}
,{65056,65062,36040}
,{65072,65106,36047}
,{65108,65126,36082}
,{65128,65131,36101}
,{65136,65140,36105}
,{65142,65276,36110}
,{65279,65279,36245}
,{65281,65470,36246}
,{65474,65479,36436}
,{65482,65487,36442}
,{65490,65495,36448}
,{65498,65500,36454}
,{65504,65510,36457}
,{65512,65518,36464}
,{65529,65533,36471}
,{65536,65547,36476}
,{65549,65574,36488}
,{65576,65594,36514}
,{65596,65597,36533}
,{65599,65613,36535}
,{65616,65629,36550}
,{65664,65786,36564}
,{65792,65794,36687}
,{65799,65843,36690}
,{65847,65930,36735}
,{65936,65947,36819}
,{66000,66045,36831}
,{66176,66204,36877}
,{66208,66256,36906}
,{66304,66334,36955}
,{66336,66339,36986}
,{66352,66378,36990}
,{66432,66461,37017}
,{66463,66499,37047}
,{66504,66517,37084}
,{66560,66717,37098}
,{66720,66729,37256}
,{67584,67589,37266}
,{67592,67592,37272}
,{67594,67637,37273}
,{67639,67640,37317}
,{67644,67644,37319}
,{67647,67669,37320}
,{67671,67679,37343}
,{67840,67867,37352}
,{67871,67897,37380}
,{67903,67903,37407}
,{68096,68099,37408}
,{68101,68102,37412}
,{68108,68115,37414}
,{68117,68119,37422}
,{68121,68147,37425}
,{68152,68154,37452}
,{68159,68167,37455}
,{68176,68184,37464}
,{68192,68223,37473}
,{68352,68405,37505}
,{68409,68437,37559}
,{68440,68466,37588}
,{68472,68479,37615}
,{68608,68680,37623}
,{69216,69246,37696}
,{69760,69825,37727}
,{73728,74606,37793}
,{74752,74850,38672}
,{74864,74867,38771}
,{77824,78894,38775}
,{118784,119029,39846}
,{119040,119078,40092}
,{119081,119261,40131}
,{119296,119365,40312}
,{119552,119638,40382}
,{119648,119665,40469}
,{119808,119892,40487}
,{119894,119964,40572}
,{119966,119967,40643}
,{119970,119970,40645}
,{119973,119974,40646}
,{119977,119980,40648}
,{119982,119993,40652}
,{119995,119995,40664}
,{119997,120003,40665}
,{120005,120069,40672}
,{120071,120074,40737}
,{120077,120084,40741}
,{120086,120092,40749}
,{120094,120121,40756}
,{120123,120126,40784}
,{120128,120132,40788}
,{120134,120134,40793}
,{120138,120144,40794}
,{120146,120485,40801}
,{120488,120779,41141}
,{120782,120831,41433}
,{126976,127019,41483}
,{127024,127123,41527}
,{127232,127242,41627}
,{127248,127278,41638}
,{127281,127281,41669}
,{127293,127293,41670}
,{127295,127295,41671}
,{127298,127298,41672}
,{127302,127302,41673}
,{127306,127310,41674}
,{127319,127319,41679}
,{127327,127327,41680}
,{127353,127353,41681}
,{127355,127356,41682}
,{127359,127359,41684}
,{127370,127373,41685}
,{127376,127376,41689}
,{127488,127488,41690}
,{127504,127537,41691}
,{127552,127560,41725}
,{194560,195101,41734}
,{917505,917505,42276}
,{917536,917631,42277}
};
static int
search_pair(ecl_character c)
{
int mid, low = 0, up = ECL_UCD_TOTAL_GROUPS-1;
do {
mid = (up + low) / 2;
if (c < ecl_ucd_names_char[mid].smallest)
up = mid-1;
else if (c > ecl_ucd_names_char[mid].largest)
low = mid+1;
else
return (c - ecl_ucd_names_char[mid].smallest) +
ecl_ucd_names_char[mid].pair_code;
} while (low <= up && (low >= 0) && (up < ECL_UCD_TOTAL_GROUPS));
return -1;
}
static void
fill_pair_name(char *buffer, int pair)
{
if (pair < ECL_UCD_FIRST_PAIR) {
strncat(buffer, ecl_ucd_names_word[pair], ECL_UCD_LARGEST_CHAR_NAME+1);
/*
printf("text=%s\n", ecl_ucd_names_word[pair]);
*/
} else {
const ecl_ucd_names_pair_type p = ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR];
/*
printf("ndx=%d\n", pair - ECL_UCD_FIRST_PAIR);
printf("c0=%d\n", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[0]);
printf("c1=%d\n", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[1]);
printf("c2=%d\n", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[2]);
printf("c3=%d\n", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[3]);
*/
fill_pair_name(buffer, (((unsigned int)p.codes[1]) << 8) | p.codes[0]);
fill_pair_name(buffer, (((unsigned int)p.codes[3]) << 8) | p.codes[2]);
}
}
cl_object
_ecl_ucd_code_to_name(ecl_character c)
{
int pair = search_pair(c);
if (pair < 0)
return ECL_NIL;
else {
char buffer[ECL_UCD_LARGEST_CHAR_NAME+1];
buffer[0] = 0;
fill_pair_name(buffer, pair);
return make_base_string_copy(buffer);
}
}
cl_object
_ecl_ucd_name_to_code(cl_object name)
{
int mid, low = 0, up = ECL_UCD_TOTAL_NAMES-1;
int l = ecl_length(name);
if (l <= ECL_UCD_LARGEST_CHAR_NAME) {
char buffer1[ECL_UCD_LARGEST_CHAR_NAME+1];
char buffer2[ECL_UCD_LARGEST_CHAR_NAME+1];
for (mid = 0; mid < l; mid++) {
ecl_character c = ecl_char_upcase(ecl_char(name, mid));
buffer1[mid] = c;
if (c < 32 || c > 127) /* All character names are [-A-Z_0-9]* */
return ECL_NIL;
}
buffer1[mid] = 0;
do {
ecl_ucd_code_and_pair p = ecl_ucd_sorted_pairs[mid = (low + up) / 2];
int flag, pair = ((unsigned int)p.pair[1] << 8) | p.pair[0];
buffer2[0] = 0;
fill_pair_name(buffer2, pair);
flag = strcmp(buffer1, buffer2);
/*
printf("[%d,%d,%d] %s <> (%d)%s -> %d\n",
low, mid, up, buffer1, pair, buffer2, flag);
*/
if (flag == 0) {
return ecl_make_fixnum(((unsigned int)p.code[2] << 16) |
((unsigned int)p.code[1] << 8) |
p.code[0]);
} else if (flag < 0) {
up = mid - 1;
} else {
low = mid + 1;
}
} while (low <= up);
}
return ECL_NIL;
}

File diff suppressed because it is too large Load diff

38002
src/c/unicode/ucd_names_pair.c Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

17
src/configure vendored
View file

@ -806,6 +806,7 @@ with_profile_cflags
with_newcmp
with_extra_files
with_init_form
with_unicode_names
with_x
'
ac_precious_vars='build_alias
@ -1522,6 +1523,7 @@ Optional Packages:
--with-extra-files list of additional source files (default="")
--with-init-form lisp forms to execute at startup
(default="(si::top-level t)")
--with-unicode-names link in the database of Unicode names (YES,no)
--with-x use the X Window System
Some influential environment variables:
@ -2989,6 +2991,15 @@ else
fi
# Check whether --with-unicode-names was given.
if test "${with_unicode_names+set}" = set; then :
withval=$with_unicode_names;
else
with_unicode_names="yes"
fi
ecldir="${libdir}/ecl-${PACKAGE_VERSION}"
test -z "${docdir}" && docdir="${datadir}/doc/ecl-${PACKAGE_VERSION}"
@ -9403,6 +9414,12 @@ $as_echo "#define ECL_UNICODE 21" >>confdefs.h
ECL_CHARACTER=$ECL_INT32_T
EXTRA_OBJS="$EXTRA_OBJS unicode/ucd.o unicode/ucd-0000.o unicode/ucd-0016.o unicode/ucd-0032.o unicode/ucd-0048.o unicode/ucd-0064.o unicode/ucd-0080.o unicode/ucd-0096.o"
fi
if test "${with_unicode_names}" = "yes"; then
$as_echo "#define ECL_UNICODE_NAMES 1" >>confdefs.h
EXTRA_OBJS="$EXTRA_OBJS unicode/ucd_names_char.o unicode/ucd_names_codes.o unicode/ucd_names_pair.o unicode/ucd_names_str.o"
fi
else
CHAR_CODE_LIMIT=256
ECL_CHARACTER="int"

View file

@ -294,6 +294,12 @@ AC_ARG_WITH(init-form,
[lisp forms to execute at startup (default="(si::top-level t)")]),
[with_init_form="${withval}"], [with_init_form=""])
AC_ARG_WITH(unicode-names,
AS_HELP_STRING( [--with-unicode-names],
[link in the database of Unicode names]
[(YES,no)]),
[],[with_unicode_names="yes"])
dnl -----------------------------------------------------------------------
dnl Installation directories
ecldir="${libdir}/ecl-${PACKAGE_VERSION}"
@ -852,6 +858,10 @@ if test "x${enable_unicode}" != "xno"; then
ECL_CHARACTER=$ECL_INT32_T
EXTRA_OBJS="$EXTRA_OBJS unicode/ucd.o unicode/ucd-0000.o unicode/ucd-0016.o unicode/ucd-0032.o unicode/ucd-0048.o unicode/ucd-0064.o unicode/ucd-0080.o unicode/ucd-0096.o"
fi
if test "${with_unicode_names}" = "yes"; then
AC_DEFINE(ECL_UNICODE_NAMES, [1], [Link in the database of Unicode names])
EXTRA_OBJS="$EXTRA_OBJS unicode/ucd_names_char.o unicode/ucd_names_codes.o unicode/ucd_names_pair.o unicode/ucd_names_str.o"
fi
else
CHAR_CODE_LIMIT=256
ECL_CHARACTER="int"

View file

@ -102,6 +102,8 @@
/* Support for Unicode strings */
#undef ECL_UNICODE
/* Link in the Unicode names for all characters (takes ~0.5 Mb) */
#undef ECL_UNICODE_NAMES
/* Allow STREAM operations to work on arbitrary objects */
#undef ECL_CLOS_STREAMS

View file

@ -1901,6 +1901,10 @@ extern ECL_API cl_object si_coerce_to_base_string(cl_object x);
extern ECL_API cl_object si_coerce_to_extended_string(cl_object x);
#define ecl_alloc_simple_extended_string(l) ecl_alloc_simple_vector((l),ecl_aet_ch)
extern ECL_API cl_object ecl_alloc_adjustable_extended_string(cl_index l);
# ifdef ECL_UNICODE_NAMES
extern ECL_API cl_object _ecl_ucd_code_to_name(ecl_character c);
extern ECL_API cl_object _ecl_ucd_name_to_code(cl_object name);
# endif
#else
#define si_base_char_p cl_characterp
#define si_base_string_p cl_stringp

View file

@ -1428,6 +1428,7 @@ package."
(*print-readably* nil)
(*print-pretty* nil)
(*print-circle* t)
(*print-length* 2)
(*readtable* (or *break-readtable* *readtable*))
(*break-message* (format nil "~&Condition of type: ~A~%~A~%"
(type-of condition) condition))