From 63d3360340ccff14ec7520100e2fc800cb71b49d Mon Sep 17 00:00:00 2001 From: Tomek Kurcz Date: Tue, 22 Aug 2017 09:57:06 +0200 Subject: [PATCH] Port `Characters' section to texinfo --- src/doc/new-doc/standards/characters.txi | 155 +++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 src/doc/new-doc/standards/characters.txi diff --git a/src/doc/new-doc/standards/characters.txi b/src/doc/new-doc/standards/characters.txi new file mode 100644 index 000000000..881313382 --- /dev/null +++ b/src/doc/new-doc/standards/characters.txi @@ -0,0 +1,155 @@ +@node Characters +@section Characters + +ECL is fully ANSI Common-Lisp compliant in all aspects of the character data type, with the following peculiarities. + + +@subsection Unicode vs. POSIX locale +There are two ways of building ECL: with C or with Unicode character codes. These build modes are accessed using the @code{--disable-unicode} and @code{--enable-unicode} configuration options, the last one being the default. + +When using C characters we are actually relying on the char type of the C language, using the C library functions for tasks such as character conversions, comparison, etc. In this case characters are typically 8 bit wide and the character order and collation are determines by the current POSIX or C locale. This is not very accurate, leaves out many languages and character encodings but it is sufficient for small applications that do not need multilingual support. + +When no option is specified ECL builds with support for a larger character set, the Unicode 6.0 standard. This uses 24 bit large character codes, also known as @dfn{codepoints}, with a large database of character properties which include their nature (alphanumeric, numeric, etc), their case, their collation properties, whether they are standalone or composing characters, etc. + +@subsubsection Character types +If compiled without Unicode support, ECL all characters are implemented using 8-bit codes and the type @code{extended-char} is empty. If compiled with Unicode support, characters are implemented using 24 bits and the @code{extended-char} type covers characters above code 255. +@multitable @columnfractions .33 .33 .33 +@headitem Type @tab With Unicode @tab Without Unicode +@item standard-char @tab #\Newline,32-126 @tab #\Newline,32-126 +@item base-char @tab 0-255 @tab 0-255 +@item extended-char @tab - @tab 256-16777215 +@end multitable + +@subsubsection Character names +All characters have a name. For non-printing characters between 0 and 32, and for 127 we use the ordinary ASCII names. Characters above 127 are printed and read using hexadecimal Unicode notation, with a U followed by 24 bit hexadecimal number, as in @code{U0126}. + +@float Table, tab:example-char-names +@multitable {aaaaaaaaaaa} {aaa} +@headitem Character @tab Code +@item #\Null @tab 0 +@item #\Ack @tab 1 +@item #\Bell @tab 7 +@item #\Backspace @tab 8 +@item #\Tab @tab 9 +@item #\Newline @tab 10 +@item #\Linefeed @tab 10 +@item #\Page @tab 12 +@item #\Esc @tab 27 +@item #\Escape @tab 27 +@item #\Space @tab 32 +@item #\Rubout @tab 127 +@item #\U0080 @tab 128 +@end multitable +@end float + +Note that @code{#\Linefeed} is synonymous with @code{#\Newline} and thus is a member of @code{standard-char}. + + +@subsection @code{#\Newline} characters +Internally, ECL represents the @code{#\Newline} character by a single code. However, when using external formats, ECL may parse character pairs as a single @code{#\Newline}, and viceversa, use multiple characters to represent a single @code{#\Newline}. +@c TODO: add an @xref to Stream -> External formats once it's written + + + +@subsection C Reference + +@subsubsection C types +C character types +@subsubheading Type names +@multitable {aaaaaaaaaaaaa} {aaaaaaaaa} +@item ecl_character @tab character +@item ecl_base_char @tab base-char +@end multitable +@subsubheading Description +ECL defines two C types to hold its characters: @code{ecl_base_char} and @code{ecl_character}. + + When ECL is built without Unicode, they both coincide and typically match @code{unsigned char}, to cover the 256 codes that are needed. + + When ECL is built with Unicode, the two types are no longer equivalent, with @code{ecl_character} being larger. + +For your code to be portable and future proof, use both types to really express what you intend to do. + +@subsubsection Constructors +Creating and extracting characters from Lisp objects +@subsubheading Functions +@deftypefun cl_object ECL_CODE_CHAR (ecl_character @var{code}); +@end deftypefun +@deftypefun ecl_character ECL_CHAR_CODE (cl_object @var{o}); +@end deftypefun +@deftypefun cl_object ecl_char_code (ecl_character @var{code}); +@end deftypefun +@deftypefun ecl_base_char ecl_base_char_code (cl_object @var{o}); +@end deftypefun +@subsubheading Description +These functions and macros convert back and forth from C character types to Lisp. The macros @code{ECL_CHAR_CODE} and @code{ECL_CODE_CHAR} perform this coercion without checking the arguments. The functions @code{ecl_char_code} and @code{ecl_base_char_code}, on the other hand, verify that the argument has the right type and signal an error otherwise. + +@subsubsection Predicates +C predicates for Lisp characters +@subsubheading Functions +@deftypefun bool ecl_base_char_p (ecl_character @var{c}); +@end deftypefun +@deftypefun bool ecl_alpha_char_p (ecl_character @var{c}); +@end deftypefun +@deftypefun bool ecl_alphanumeric_p (ecl_character @var{c}); +@end deftypefun +@deftypefun bool ecl_graphic_char_p (ecl_character @var{c}); +@end deftypefun +@deftypefun bool ecl_digit_p (ecl_character @var{c}); +@end deftypefun +@deftypefun bool ecl_standard_char_p (ecl_character @var{c}); +@end deftypefun +@subsubheading Description +These functions are equivalent to their Lisp equivalents but return C booleans. + +@subsubsection Character case +C functions related to the character case +@subsubheading Functions +@deftypefun bool ecl_upper_case_p (ecl_character @var{c}); +@end deftypefun +@deftypefun bool ecl_lower_case_p (ecl_character @var{c}); +@end deftypefun +@deftypefun bool ecl_both_case_p (ecl_character @var{c}); +@end deftypefun +@deftypefun ecl_character ecl_char_downcase (ecl_character @var{c}); +@end deftypefun +@deftypefun ecl_character ecl_char_upcase (ecl_character @var{c}); +@end deftypefun +@subsubheading Description +These functions check or change the case of a character. Note that in an Unicode context, the output of these functions might not be accurate (for instance when the uppercase character has two or more codepoints). + +@subsubsection ANSI Dictionary +Common Lisp and C equivalence +@subsubheading Synopsis +@multitable {aaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} +@headitem Lisp symbol @tab C function +@item char= @tab cl_object cl_charE(cl_narg narg, ...) +@item char/= @tab cl_object cl_charNE(cl_narg narg, ...) +@item char< @tab cl_object cl_charL(cl_narg narg, ...) +@item char> @tab cl_object cl_charG(cl_narg narg, ...) +@item char<= @tab cl_object cl_charLE(cl_narg narg, ...) +@item char>= @tab cl_object cl_charGE(cl_narg narg, ...) +@item char-equal @tab cl_object cl_char_equal(cl_narg narg, ...) +@item char-not-equal @tab cl_object cl_char_not_equal(cl_narg narg, ...) +@item char-lessp @tab cl_object cl_char_lessp(cl_narg narg, ...) +@item char-greaterp @tab cl_object cl_char_greaterp(cl_narg narg, ...) +@item char-not-greaterp @tab cl_object cl_char_not_greaterp(cl_narg narg, ...) +@item char-not-lessp @tab cl_object cl_char_not_lessp(cl_narg narg, ...) +@item character @tab cl_object cl_character(cl_object char_designator) +@item characterp @tab cl_object cl_characterp(cl_object object) +@item alpha-char-p @tab cl_object cl_alpha_char_p(cl_object character) +@item alphanumericp @tab cl_object cl_alphanumericp(cl_object character) +@item digit-char @tab cl_object cl_digit_char(cl_narg narg, cl_object character, ...) +@item digit-char-p @tab cl_object cl_digit_char_p(cl_narg narg, cl_object character, ...) +@item graphic-char-p @tab cl_object cl_graphic_char_p(cl_object character) +@item standard-char-p @tab cl_object cl_standard_char_p(cl_object character) +@item char_upcase @tab cl_object cl_char_upcase(cl_object character) +@item char-downcase @tab cl_object cl_char_downcase(cl_object character) +@item upper-case-p @tab cl_object cl_upper_case_p(cl_object character) +@item lower-case-p @tab cl_object cl_lower_case_p(cl_object character) +@item both-case-p @tab cl_object cl_both_case_p(cl_object character) +@item char-code @tab cl_object cl_char_code(cl_object character) +@item char-int @tab cl_object cl_char_int(cl_object character) +@item code-char @tab cl_object cl_code_char(cl_object code) +@item char-name @tab cl_object cl_char_name(cl_object character) +@item name-char @tab cl_object cl_name_char(cl_object name) +@end multitable