Added the Unicode character database to the C library

This commit is contained in:
Juan Jose Garcia Ripoll 2012-12-26 11:04:27 +01:00
parent b6519e5c7a
commit ebafa5f275
14 changed files with 81861 additions and 6 deletions

View file

@ -0,0 +1,298 @@
(defparameter *destination*
(merge-pathnames "../../src/c/unicode/"
(or *load-truename* *compile-pathname*)))
(let* ((translated-data (copy-tree *compressed-data*))
(pairs (copy-tree *paired-data*))
(first-code (loop for (pair-code . pair) in pairs minimize pair-code))
(last-code (loop for (pair-code . pair) in pairs maximize pair-code)))
;;
;; We make sure that for each character there is a unique pair which is not
;; used anywhere else
;;
(loop with used-code = (make-array (1+ last-code) :initial-element nil)
for line in translated-data
for pair-code = (third line)
do (cond ((/= (length line) 3)
(error "Error in compressed data: too long code ~A" line))
((or (aref used-code pair-code)
(< pair-code first-code))
(let ((new-pair (cons pair-code 0)))
(setf pairs (acons (incf last-code) new-pair pairs)
(third line) last-code)))
(t
(setf (aref used-code pair-code) t))))
;;
;; We now renumber all pairs.
;;
(let ((translation-table (make-array (1+ last-code) :initial-element nil))
(counter -1))
(flet ((add-code (code)
(or (aref translation-table code)
(setf (aref translation-table code) (incf counter))))
(translate (old-code)
(or (aref translation-table old-code)
(error "Unknown code ~A" old-code))))
;; First of all we add the words
(loop for i from 0 below first-code
do (add-code i))
;; Then we add all pairs that represent characters, so that they
;; are consecutive, too.
(loop for line in translated-data
do (setf (third line) (add-code (third line))))
;; Finally, we add the remaining pairs
(loop for record in pairs
do (setf (car record) (add-code (car record))))
;; ... and we fix the definitions
(loop for (code . pair) in pairs
do (setf (car pair) (translate (car pair))
(cdr pair) (translate (cdr pair))))))
(defparameter *sorted-compressed-data* translated-data)
(defparameter *sorted-pairs* (sort pairs #'< :key #'car))
(print 'finished)
)
(defparameter *grouped-characters*
(loop with last-ucd-code = nil
with start-ucd-code = nil
with start-code = nil
with output = '()
with aux = '()
for n from (third (first *sorted-compressed-data*))
for line in *sorted-compressed-data*
for (ucd-code name code) = line
do (cond ((/= code n)
(error "Codes in *sorted-compressed-data* are not consecutive:~%~A"
(cons line (subseq aux 0 10))))
((null start-ucd-code)
(setf start-ucd-code ucd-code
start-code code))
((= last-ucd-code (1- ucd-code))
)
(t
(push (list start-ucd-code last-ucd-code start-code)
output)
(setf start-ucd-code ucd-code
start-code code)))
(setf last-ucd-code ucd-code aux (cons line aux))
finally (return (nreverse output))))
(with-open-file (s (merge-pathnames "ucd_names.h" *destination*)
:direction :output
:if-exists :supersede)
(format s "/*
* UNICODE NAMES DATABASE
*/
#ifndef ECL_UCD_NAMES_H
#define ECL_UCD_NAMES_H 1
#define ECL_UCD_FIRST_PAIR ~D
#define ECL_UCD_TOTAL_PAIRS ~D
#define ECL_UCD_TOTAL_GROUPS ~D
#define ECL_UCD_LARGEST_CHAR_NAME ~D
#define ECL_UCD_TOTAL_NAMES ~D
typedef struct {
unsigned char codes[4];
} ecl_ucd_names_pair_type;
typedef struct {
int smallest, largest, pair_code;
} ecl_ucd_names_char_group;
typedef struct {
unsigned char pair[2];
unsigned char code[3];
} ecl_ucd_code_and_pair;
extern const ecl_ucd_names_pair_type ecl_ucd_names_pair[ECL_UCD_TOTAL_PAIRS];
extern const ecl_ucd_names_char_group ecl_ucd_names_char[ECL_UCD_TOTAL_GROUPS];
extern const char *ecl_ucd_names_word[ECL_UCD_FIRST_PAIR];
extern const ecl_ucd_code_and_pair ecl_ucd_sorted_pairs[ECL_UCD_TOTAL_NAMES];
#endif
"
(1+ *last-word-index*)
(length *sorted-pairs*)
(length *grouped-characters*)
(loop for (code name . rest) in *compressed-data*
maximize (length name))
(length *compressed-data*)
))
(with-open-file (s (merge-pathnames "ucd_names_pair.c" *destination*)
:direction :output
:if-exists :supersede)
(format s "/*
* Pairs of symbols.
*/
#include <ecl/ecl.h>
#include \"ucd_names.h\"
const ecl_ucd_names_pair_type ecl_ucd_names_pair[ECL_UCD_TOTAL_PAIRS] = {
"
(length *sorted-pairs*) (length *sorted-pairs*))
(loop for i from 0
for (pair-code . (a . b)) in *sorted-pairs*
do (format s "~A{~D, ~D, ~D, ~D}~%"
(if (plusp i) "," "")
(logand a #xff) (ash a -8)
(logand b #xff) (ash b -8)
))
(format s "};~%"))
(with-open-file (s (merge-pathnames "ucd_names_codes.c" *destination*)
:direction :output
:if-exists :supersede)
(format s "/*
* Sorted character names.
*/
#include <ecl/ecl.h>
#include \"ucd_names.h\"
const ecl_ucd_code_and_pair ecl_ucd_sorted_pairs[ECL_UCD_TOTAL_NAMES] = {
")
(loop with l = (sort (copy-tree *sorted-compressed-data*) #'string<= :key #'second)
for (ucd-code name code) in l
for i from 0
do (format s "~A{{~D, ~D}, {~D, ~D, ~D}}~%"
(if (plusp i) "," "")
(logand code #xff) (ash code -8)
(logand ucd-code #xff) (logand (ash ucd-code -8) #xff)
(logand (ash ucd-code -16) #xff)))
(format s "};"))
(with-open-file (s (merge-pathnames "ucd_names_str.c" *destination*)
:direction :output
:if-exists :supersede)
(format s "/*
* Dictionary words.
*/
#include <ecl/ecl.h>
#include \"ucd_names.h\"
const char *ecl_ucd_names_word[ECL_UCD_FIRST_PAIR] = {
")
(loop for i from 0
for c across *words-array*
do (format s "~A~S~%" (if (plusp i) "," "") (or c "")))
(format s "};~%"))
(with-open-file (s (merge-pathnames "ucd_names_char.c" *destination*)
:direction :output
:if-exists :supersede)
(format s "/*
* Dictionary words.
*/
#include <string.h>
#include <ecl/ecl.h>
#include \"ucd_names.h\"
const ecl_ucd_names_char_group ecl_ucd_names_char[ECL_UCD_TOTAL_GROUPS] = {
"
(length *grouped-characters*))
(loop for i from 0
for (start end pair-code) in *grouped-characters*
do (format s "~A{~D,~D,~D}~%" (if (plusp i) "," "")
start end pair-code))
(format s "};
static int
search_pair(ecl_character c)
{
int mid, low = 0, up = ECL_UCD_TOTAL_GROUPS-1;
do {
mid = (up + low) / 2;
if (c < ecl_ucd_names_char[mid].smallest)
up = mid-1;
else if (c > ecl_ucd_names_char[mid].largest)
low = mid+1;
else
return (c - ecl_ucd_names_char[mid].smallest) +
ecl_ucd_names_char[mid].pair_code;
} while (low <= up && (low >= 0) && (up < ECL_UCD_TOTAL_GROUPS));
return -1;
}
static void
fill_pair_name(char *buffer, int pair)
{
if (pair < ECL_UCD_FIRST_PAIR) {
strncat(buffer, ecl_ucd_names_word[pair], ECL_UCD_LARGEST_CHAR_NAME+1);
/*
printf(\"text=%s\\n\", ecl_ucd_names_word[pair]);
*/
} else {
const ecl_ucd_names_pair_type p = ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR];
/*
printf(\"ndx=%d\\n\", pair - ECL_UCD_FIRST_PAIR);
printf(\"c0=%d\\n\", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[0]);
printf(\"c1=%d\\n\", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[1]);
printf(\"c2=%d\\n\", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[2]);
printf(\"c3=%d\\n\", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[3]);
*/
fill_pair_name(buffer, (((unsigned int)p.codes[1]) << 8) | p.codes[0]);
fill_pair_name(buffer, (((unsigned int)p.codes[3]) << 8) | p.codes[2]);
}
}
cl_object
_ecl_ucd_code_to_name(ecl_character c)
{
int pair = search_pair(c);
if (pair < 0)
return ECL_NIL;
else {
char buffer[ECL_UCD_LARGEST_CHAR_NAME+1];
buffer[0] = 0;
fill_pair_name(buffer, pair);
return make_base_string_copy(buffer);
}
}
cl_object
_ecl_ucd_name_to_code(cl_object name)
{
int mid, low = 0, up = ECL_UCD_TOTAL_NAMES-1;
int l = ecl_length(name);
if (l <= ECL_UCD_LARGEST_CHAR_NAME) {
char buffer1[ECL_UCD_LARGEST_CHAR_NAME+1];
char buffer2[ECL_UCD_LARGEST_CHAR_NAME+1];
for (mid = 0; mid < l; mid++) {
ecl_character c = ecl_char_upcase(ecl_char(name, mid));
buffer1[mid] = c;
if (c < 32 || c > 127) /* All character names are [-A-Z_0-9]* */
return ECL_NIL;
}
buffer1[mid] = 0;
do {
ecl_ucd_code_and_pair p = ecl_ucd_sorted_pairs[mid = (low + up) / 2];
int flag, pair = ((unsigned int)p.pair[1] << 8) | p.pair[0];
buffer2[0] = 0;
fill_pair_name(buffer2, pair);
flag = strcmp(buffer1, buffer2);
/*
printf(\"[%d,%d,%d] %s <> (%d)%s -> %d\\n\",
low, mid, up, buffer1, pair, buffer2, flag);
*/
if (flag == 0) {
return ecl_make_fixnum(((unsigned int)p.code[2] << 16) |
((unsigned int)p.code[1] << 8) |
p.code[0]);
} else if (flag < 0) {
up = mid - 1;
} else {
low = mid + 1;
}
} while (low <= up);
}
return ECL_NIL;
}
"))
;(ext:run-program "/bin/sh" '("-c" "cp *.c *.h ~/devel/ecl/src/c/unicode/"))