ecl/contrib/unicode/names-pairs-sort.lisp

(defparameter *destination*
  (merge-pathnames "../../src/c/unicode/"
                   (or *load-truename* *compile-pathname*)))

(let* ((translated-data (copy-tree *compressed-data*))
       (pairs (copy-tree *paired-data*))
       (first-code (loop for (pair-code . pair) in pairs minimize pair-code))
       (last-code (loop for (pair-code . pair) in pairs maximize pair-code)))
  ;;
  ;; We make sure that for each character there is a unique pair which is not
  ;; used anywhere else
  ;;
  (loop with used-code = (make-array (1+ last-code) :initial-element nil)
     for line in translated-data
     for pair-code = (third line)
     do (cond ((/= (length line) 3)
               (error "Error in compressed data: too long code ~A" line))
              ((or (aref used-code pair-code)
                   (< pair-code first-code))
               (let ((new-pair (cons pair-code 0)))
                 (setf pairs (acons (incf last-code) new-pair pairs)
                       (third line) last-code)))
              (t
               (setf (aref used-code pair-code) t))))
  ;;
  ;; We now renumber all pairs.
  ;;
  (let ((translation-table (make-array (1+ last-code) :initial-element nil))
        (counter -1))
    (flet ((add-code (code)
             (or (aref translation-table code)
                 (setf (aref translation-table code) (incf counter))))
           (translate (old-code)
             (or (aref translation-table old-code)
                 (error "Unknown code ~A" old-code))))
      ;; First of all we add the words
      (loop for i from 0 below first-code
         do (add-code i))
      ;; Then we add all pairs that represent characters, so that they
      ;; are consecutive, too.
      (loop for line in translated-data
         do (setf (third line) (add-code (third line))))
      ;; Finally, we add the remaining pairs
      (loop for record in pairs
         do (setf (car record) (add-code (car record))))
      ;; ... and we fix the definitions
      (loop for (code . pair) in pairs
         do (setf (car pair) (translate (car pair))
                  (cdr pair) (translate (cdr pair))))))
  (defparameter *sorted-compressed-data* translated-data)
  (defparameter *sorted-pairs* (sort pairs #'< :key #'car))
  (print 'finished)
  )

(defparameter *grouped-characters*
  (loop with last-ucd-code = nil
     with start-ucd-code = nil
     with start-code = nil
     with output = '()
     with aux = '()
     for n from (third (first *sorted-compressed-data*))
     for line in *sorted-compressed-data*
     for (ucd-code name code) = line
     do (cond ((/= code n)
               (error "Codes in *sorted-compressed-data* are not consecutive:~%~A"
                      (cons line (subseq aux 0 10))))
              ((null start-ucd-code)
               (setf start-ucd-code ucd-code
                     start-code code))
              ((= last-ucd-code (1- ucd-code))
               )
              (t
               (push (list start-ucd-code last-ucd-code start-code)
                     output)
               (setf start-ucd-code ucd-code
                     start-code code)))
       (setf last-ucd-code ucd-code aux (cons line aux))
     finally (return (nreverse output))))

(with-open-file (s (merge-pathnames "ucd_names.h" *destination*)
                   :direction :output
                   :if-exists :supersede)
  (format s "/*
 * UNICODE NAMES DATABASE
 */
#ifndef ECL_UCD_NAMES_H
#define ECL_UCD_NAMES_H 1

#define ECL_UCD_FIRST_PAIR ~D
#define ECL_UCD_TOTAL_PAIRS ~D
#define ECL_UCD_TOTAL_GROUPS ~D
#define ECL_UCD_LARGEST_CHAR_NAME ~D
#define ECL_UCD_TOTAL_NAMES ~D

typedef struct {
  unsigned char codes[4];
} ecl_ucd_names_pair_type;

typedef struct {
  int smallest, largest, pair_code;
} ecl_ucd_names_char_group;

typedef struct {
  unsigned char pair[2];
  unsigned char code[3];
} ecl_ucd_code_and_pair;

extern const ecl_ucd_names_pair_type ecl_ucd_names_pair[ECL_UCD_TOTAL_PAIRS];
extern const ecl_ucd_names_char_group ecl_ucd_names_char[ECL_UCD_TOTAL_GROUPS];
extern const char *ecl_ucd_names_word[ECL_UCD_FIRST_PAIR];
extern const ecl_ucd_code_and_pair ecl_ucd_sorted_pairs[ECL_UCD_TOTAL_NAMES];

#endif
"
          (1+ *last-word-index*)
          (length *sorted-pairs*)
          (length *grouped-characters*)
          (loop for (code name . rest) in *compressed-data*
               maximize (length name))
          (length *compressed-data*)
          ))

(with-open-file (s (merge-pathnames "ucd_names_pair.c" *destination*)
                   :direction :output
                   :if-exists :supersede)
  (format s "/*
 * Pairs of symbols.
 */

#include <ecl/ecl.h>
#include \"ucd_names.h\"

const ecl_ucd_names_pair_type ecl_ucd_names_pair[ECL_UCD_TOTAL_PAIRS] = {
"
          (length *sorted-pairs*) (length *sorted-pairs*))
  (loop for i from 0
     for (pair-code . (a . b)) in *sorted-pairs*
     do (format s "~A{~D, ~D, ~D, ~D}~%"
                (if (plusp i) "," "")
                (logand a #xff) (ash a -8)
                (logand b #xff) (ash b -8)
                ))
  (format s "};~%"))

(with-open-file (s (merge-pathnames "ucd_names_codes.c" *destination*)
                   :direction :output
                   :if-exists :supersede)
  (format s "/*
 * Sorted character names.
 */

#include <ecl/ecl.h>
#include \"ucd_names.h\"

const ecl_ucd_code_and_pair ecl_ucd_sorted_pairs[ECL_UCD_TOTAL_NAMES] = {
")
  (loop with l = (sort (copy-tree *sorted-compressed-data*) #'string<= :key #'second)
     for (ucd-code name code) in l
     for i from 0
     do (format s "~A{{~D, ~D}, {~D, ~D, ~D}}~%"
                (if (plusp i) "," "")
                (logand code #xff) (ash code -8)
                (logand ucd-code #xff) (logand (ash ucd-code -8) #xff)
                (logand (ash ucd-code -16) #xff)))
  (format s "};"))

(with-open-file (s (merge-pathnames "ucd_names_str.c" *destination*)
                   :direction :output
                   :if-exists :supersede)
  (format s "/*
 * Dictionary words.
 */

#include <ecl/ecl.h>
#include \"ucd_names.h\"

const char *ecl_ucd_names_word[ECL_UCD_FIRST_PAIR] = {
")
  (loop for i from 0
     for c across *words-array*
     do (format s "~A~S~%" (if (plusp i) "," "") (or c "")))
  (format s "};~%"))

(with-open-file (s (merge-pathnames "ucd_names_char.c" *destination*)
                   :direction :output
                   :if-exists :supersede)
  (format s "/*
 * Dictionary words.
 */

#include <string.h>
#include <ecl/ecl.h>
#include \"ucd_names.h\"

const ecl_ucd_names_char_group ecl_ucd_names_char[ECL_UCD_TOTAL_GROUPS] = {
"
          (length *grouped-characters*))
  (loop for i from 0
     for (start end pair-code) in *grouped-characters*
     do (format s "~A{~D,~D,~D}~%" (if (plusp i) "," "")
                start end pair-code))
  (format s "};

static int
search_pair(ecl_character c)
{
  int mid, low = 0, up = ECL_UCD_TOTAL_GROUPS-1;
  do {
    mid = (up + low) / 2;
    if (c < ecl_ucd_names_char[mid].smallest)
      up = mid-1;
    else if (c > ecl_ucd_names_char[mid].largest)
      low = mid+1;
    else
      return (c - ecl_ucd_names_char[mid].smallest) +
             ecl_ucd_names_char[mid].pair_code;
  } while (low <= up && (low >= 0) && (up < ECL_UCD_TOTAL_GROUPS));
  return -1;
}

static void
fill_pair_name(char *buffer, int pair)
{
  if (pair < ECL_UCD_FIRST_PAIR) {
    strncat(buffer, ecl_ucd_names_word[pair], ECL_UCD_LARGEST_CHAR_NAME+1);
/*
    printf(\"text=%s\\n\", ecl_ucd_names_word[pair]);
 */
  } else {
    const ecl_ucd_names_pair_type p = ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR];
/*
    printf(\"ndx=%d\\n\", pair - ECL_UCD_FIRST_PAIR);
    printf(\"c0=%d\\n\", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[0]);
    printf(\"c1=%d\\n\", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[1]);
    printf(\"c2=%d\\n\", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[2]);
    printf(\"c3=%d\\n\", ecl_ucd_names_pair[pair - ECL_UCD_FIRST_PAIR].codes[3]);
 */
    fill_pair_name(buffer, (((unsigned int)p.codes[1]) << 8) | p.codes[0]);
    fill_pair_name(buffer, (((unsigned int)p.codes[3]) << 8) | p.codes[2]);
  }
}

cl_object
_ecl_ucd_code_to_name(ecl_character c)
{
  int pair = search_pair(c);
  if (pair < 0)
    return ECL_NIL;
  else {
    char buffer[ECL_UCD_LARGEST_CHAR_NAME+1];
    buffer[0] = 0;
    fill_pair_name(buffer, pair);
    return make_base_string_copy(buffer);
  }
}

cl_object
_ecl_ucd_name_to_code(cl_object name)
{
  int mid, low = 0, up = ECL_UCD_TOTAL_NAMES-1;
  int l = ecl_length(name);
  if (l <= ECL_UCD_LARGEST_CHAR_NAME) {
    char buffer1[ECL_UCD_LARGEST_CHAR_NAME+1];
    char buffer2[ECL_UCD_LARGEST_CHAR_NAME+1];
    for (mid = 0; mid < l; mid++) {
      ecl_character c = ecl_char_upcase(ecl_char(name, mid));
      buffer1[mid] = c;
      if (c < 32 || c > 127) /* All character names are [-A-Z_0-9]* */
        return ECL_NIL;
    }
    buffer1[mid] = 0;
    do {
      ecl_ucd_code_and_pair p = ecl_ucd_sorted_pairs[mid = (low + up) / 2];
      int flag, pair = ((unsigned int)p.pair[1] << 8) | p.pair[0];
      buffer2[0] = 0;
      fill_pair_name(buffer2, pair);
      flag = strcmp(buffer1, buffer2);
/*
      printf(\"[%d,%d,%d] %s <> (%d)%s -> %d\\n\",
             low, mid, up, buffer1, pair, buffer2, flag);
 */
      if (flag == 0) {
        return ecl_make_fixnum(((unsigned int)p.code[2] << 16) |
                               ((unsigned int)p.code[1] << 8) |
                               p.code[0]);
      } else if (flag < 0) {
        up = mid - 1;
      } else {
        low = mid + 1;
      }
    } while (low <= up);
  }
  return ECL_NIL;
}

"))

;(ext:run-program "/bin/sh" '("-c" "cp *.c *.h ~/devel/ecl/src/c/unicode/"))