1
Fork 0
mirror of git://git.sv.gnu.org/emacs.git synced 2026-01-23 21:13:52 -08:00

(ctext-no-compositions, compound-text-with-extensions): New coding systems.

(x-ctext-with-extensions, ctext-with-extensions): Aliases for
compound-text-with-extensions.
(non-standard-icccm-encodings-alist, non-standard-designations-alist): New
variables.
(ctext-post-read-conversion, ctext-pre-write-conversion): New functions.
This commit is contained in:
Eli Zaretskii 2002-02-22 10:45:22 +00:00
parent 102ddfc133
commit ec2b93f7b5

View file

@ -405,6 +405,188 @@ is treated as a character."
(define-coding-system-alias 'x-ctext 'compound-text)
(define-coding-system-alias 'ctext 'compound-text)
;; Same as compound-text, but doesn't produce composition escape
;; sequences. Used in post-read and pre-write conversions of
;; ctext-with-extensions, below.
(make-coding-system
'ctext-no-compositions 2 ?x
"Compound text based generic encoding for decoding unknown messages.
Like `compound-text', but does not produce escape sequences for compositions."
'((ascii t) (latin-iso8859-1 katakana-jisx0201 t) t t
nil ascii-eol ascii-cntl nil locking-shift single-shift nil nil nil
init-bol nil nil)
'((safe-charsets . t)
(mime-charset . x-ctext)))
(defvar non-standard-icccm-encodings-alist
'(("ISO8859-15" . latin-iso8859-15)
("ISO8859-14" . latin-iso8859-14)
("KOI8-R" . koi8-r)
("BIG5-0" . big5))
"Alist of font charset names defined by XLFD, and the corresponding Emacs
charsets or coding systems.")
;; Functions to support "Non-Standard Character Set Encodings" defined
;; by the ICCCM spec. We support that by converting the leading
;; sequence of the ``extended segment'' to the corresponding ISO-2022
;; sequences (if the leading sequence names an Emacs charset), or decode
;; the segment (if it names a coding system). Encoding does the reverse.
(defun ctext-post-read-conversion (len)
"Decode LEN characters encoded as Compound Text with Extended Segments."
(buffer-disable-undo) ; minimize consing due to insertions and deletions
(narrow-to-region (point) (+ (point) len))
(save-match-data
(let ((pt (point-marker))
(oldpt (point-marker))
(newpt (make-marker))
(modified-p (buffer-modified-p))
(case-fold-search nil)
last-coding-system-used
encoding textlen chset)
(while (re-search-forward
"\\(\e\\)%/[0-4]\\([\200-\377][\200-\377]\\)\\([^\002]+\\)\002"
nil 'move)
(set-marker newpt (point))
(set-marker pt (match-beginning 0))
(setq encoding (match-string 3))
(setq textlen (- (+ (* (- (aref (match-string 2) 0) 128) 128)
(- (aref (match-string 2) 1) 128))
(1+ (length encoding))))
(setq
chset (cdr (assoc-ignore-case encoding
non-standard-icccm-encodings-alist)))
(cond ((null chset)
;; This charset is not supported--leave this extended
;; segment unaltered and skip over it.
(goto-char (+ (point) textlen)))
((charsetp chset)
;; If it's a charset, replace the leading escape sequence
;; with a standard ISO-2022 sequence. We will decode all
;; such segments later, in one go, when we exit the loop
;; or find an extended segment that names a coding
;; system, not a charset.
(replace-match
(concat "\\1"
(if (= 0 (charset-iso-graphic-plane chset))
;; GL charsets
(if (= 1 (charset-dimension chset)) "(" "$(")
;; GR charsets
(if (= 96 (charset-chars chset))
"-"
(if (= 1 (charset-dimension chset)) ")" "$)")))
(string (charset-iso-final-char chset)))
t)
(goto-char (+ (point) textlen)))
((coding-system-p chset)
;; If it's a coding system, we need to decode the segment
;; right away. But first, decode what we've skipped
;; across until now.
(when (> pt oldpt)
(decode-coding-region oldpt pt 'ctext-no-compositions))
(delete-region pt newpt)
(set-marker newpt (+ newpt textlen))
(decode-coding-region pt newpt chset)
(goto-char newpt)
(set-marker oldpt newpt))))
;; Decode what's left.
(when (> (point) oldpt)
(decode-coding-region oldpt (point) 'ctext-no-compositions))
;; This buffer started as unibyte, because the string we get from
;; the X selection is a unibyte string. We must now make it
;; multibyte, so that the decoded text is inserted as multibyte
;; into its buffer.
(set-buffer-multibyte t)
(set-buffer-modified-p modified-p)
(- (point-max) (point-min)))))
(defvar non-standard-designations-alist
'(("$(0" . (big5 "big5-0" 2))
("$(1" . (big5 "big5-0" 2))
("-V" . (t "iso8859-10" 1))
("-Y" . (t "iso8859-13" 1))
("-_" . (t "iso8859-14" 1))
("-b" . (t "iso8859-15" 1))
("-f" . (t "iso8859-16" 1)))
"Alist of ctext control sequences that introduce character sets which
are not in the list of approved ICCCM encodings, and the corresponding
coding system, identifier string, and number of octets per encoded
character.
Each element has the form (CTLSEQ . (ENCODING CHARSET NOCTETS)). CTLSEQ
is the control sequence (sans the leading ESC) that introduces the character
set in the text encoded by compound-text. ENCODING is a coding system
symbol; if it is t, it means that the ctext coding system already encodes
the text correctly, and only the leading control sequence needs to be altered.
If ENCODING is a coding system, we need to re-encode the text with that
coding system. CHARSET is the ICCCM name of the charset we need to put into
the leading control sequence. NOCTETS is the number of octets (bytes) that
encode each character in this charset. NOCTETS can be 0 (meaning the number
of octets per character is variable), 1, 2, 3, or 4.")
(defun ctext-pre-write-conversion (from to)
"Encode characters between FROM and TO as Compound Text w/Extended Segments."
(buffer-disable-undo) ; minimize consing due to insertions and deletions
(narrow-to-region from to)
(encode-coding-region from to 'ctext-no-compositions)
;; Replace ISO-2022 charset designations with extended segments, for
;; those charsets that are not part of the official X registry.
(save-match-data
(goto-char (point-min))
(let ((newpt (make-marker))
(case-fold-search nil)
pt desig encode-info encoding chset noctets textlen)
(set-buffer-multibyte nil)
(while (re-search-forward "\e\\(\$([01]\\|-[VY_bf]\\)" nil 'move)
(setq desig (match-string 1)
pt (point-marker)
encode-info (cdr (assoc desig non-standard-designations-alist))
encoding (car encode-info)
chset (cadr encode-info)
noctets (car (cddr encode-info)))
(skip-chars-forward "^\e")
(set-marker newpt (point))
(cond
((eq encoding t) ; only the leading sequence needs to be changed
(setq textlen (+ (- newpt pt) (length chset) 1))
(replace-match (format "\e%%/%d%c%c%s"
noctets
(+ (/ textlen 128) 128)
(+ (% textlen 128) 128)
chset)
t t))
((coding-system-p encoding) ; need to recode the entire segment...
(set-marker pt (match-beginning 0))
(decode-coding-region pt newpt 'ctext-no-compositions)
(set-buffer-multibyte t)
(encode-coding-region pt newpt encoding)
(set-buffer-multibyte nil)
(setq textlen (+ (- newpt pt) (length chset) 1))
(goto-char pt)
(insert (format "\e%%/%d%c%c%s"
noctets
(+ (/ textlen 128) 128)
(+ (% textlen 128) 128)
chset))))
(goto-char newpt))))
(set-buffer-multibyte t)
nil)
(make-coding-system
'compound-text-with-extensions 5 ?x
"Compound text encoding with ICCCM Extended Segment extensions.
This coding system should be used only for X selections. It is inappropriate
for decoding and encoding files, process I/O, etc."
nil
'((post-read-conversion . ctext-post-read-conversion)
(pre-write-conversion . ctext-pre-write-conversion)))
(define-coding-system-alias
'x-ctext-with-extensions 'compound-text-with-extensions)
(define-coding-system-alias
'ctext-with-extensions 'compound-text-with-extensions)
(make-coding-system
'iso-safe 2 ?-
"Convert all characters but ASCII to `?'."