mirror of
git://git.sv.gnu.org/emacs.git
synced 2025-12-24 14:30:43 -08:00
Don't distort character ranges in rx translation
The Emacs regexp engine interprets character ranges from ASCII to raw bytes, such as [a-\xfe], as not including non-ASCII Unicode at all; ranges from non-ASCII Unicode to raw bytes, such as [ü-\x91], are ignored entirely. To make rx produce a translation that works as intended, split ranges that go from ordinary characters to raw bytes. Such ranges may appear from set manipulation and regexp optimisation. * lisp/emacs-lisp/rx.el (rx--generate-alt): Split intervals that straddle the char-raw boundary when rendering a string regexp from an interval set. * test/lisp/emacs-lisp/rx-tests.el (rx-char-any-raw-byte): Add test cases.
This commit is contained in:
parent
7446a8c34e
commit
157e735ce8
2 changed files with 17 additions and 1 deletions
|
|
@ -98,7 +98,17 @@
|
|||
"[\177Å\211\326-\377]"))
|
||||
;; Split range; \177-\377ÿ should not be optimized to \177-\377.
|
||||
(should (equal (rx (any "\177-\377" ?ÿ))
|
||||
"[\177ÿ\200-\377]")))
|
||||
"[\177ÿ\200-\377]"))
|
||||
;; Range between normal chars and raw bytes: must be split to be parsed
|
||||
;; correctly by the Emacs regexp engine.
|
||||
(should (equal
|
||||
(rx (any (0 . #x3fffff)) (any (?G . #x3fff9a)) (any (?Ü . #x3ffff2)))
|
||||
"[\0-\x3fff7f\x80-\xff][G-\x3fff7f\x80-\x9a][Ü-\x3fff7f\x80-\xf2]"))
|
||||
;; As above but with ranges in string form. For historical reasons,
|
||||
;; we special-case ASCII-to-raw ranges to exclude non-ASCII unicode.
|
||||
(should (equal
|
||||
(rx (any "\x00-\xff") (any "G-\x9a") (any "Ü-\xf2"))
|
||||
"[\0-\x7f\x80-\xff][G-\x7f\x80-\x9a][Ü-\x3fff7f\x80-\xf2]")))
|
||||
|
||||
(ert-deftest rx-any ()
|
||||
(should (equal (rx (any ?A (?C . ?D) "F-H" "J-L" "M" "N-P" "Q" "RS"))
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue