mirror of
git://git.sv.gnu.org/emacs.git
synced 2026-01-04 02:51:31 -08:00
New rx implementation
* lisp/emacs-lisp/rx.el:
* test/lisp/emacs-lisp/rx-tests.el:
* doc/lispref/searching.texi (Rx Constructs):
Rewrite rx for correctness, clarity, and performance.  The new
implementation retains full compatibility and has more comprehensive
tests.
* lisp/emacs-lisp/re-builder.el (reb-rx-font-lock-keywords):
Adapt to changes in internal variables in rx.el.
This commit is contained in:
parent
a773a64748
commit
2ed71227c6
4 changed files with 1039 additions and 1015 deletions
|
|
@ -1,4 +1,4 @@
|
|||
;;; rx-tests.el --- test for rx.el functions -*- lexical-binding: t -*-
|
||||
;;; rx-tests.el --- tests for rx.el -*- lexical-binding: t -*-
|
||||
|
||||
;; Copyright (C) 2016-2019 Free Software Foundation, Inc.
|
||||
|
||||
|
|
@ -17,21 +17,44 @@
|
|||
;; You should have received a copy of the GNU General Public License
|
||||
;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
;;; Commentary:
|
||||
|
||||
(require 'ert)
|
||||
(require 'rx)
|
||||
|
||||
;;; Code:
|
||||
;; Checks sequence translation.  The expected strings show that string
;; arguments are regexp-quoted and concatenated, that nested
;; `seq'/`:'/`and'/`sequence' forms flatten into one run, and that empty
;; strings and an empty `seq' contribute nothing to the result.
(ert-deftest rx-seq ()
|
||||
(should (equal (rx "a.b" "*" "c")
|
||||
"a\\.b\\*c"))
|
||||
(should (equal (rx (seq "a" (: "b" (and "c" (sequence "d" nonl)
|
||||
"e")
|
||||
"f")
|
||||
"g"))
|
||||
"abcd.efg"))
|
||||
(should (equal (rx "a$" "b")
|
||||
"a\\$b"))
|
||||
(should (equal (rx bol "a" "b" ?c eol)
|
||||
"^abc$"))
|
||||
(should (equal (rx "a" "" "b")
|
||||
"ab"))
|
||||
(should (equal (rx (seq))
|
||||
""))
|
||||
(should (equal (rx "" (or "ab" nonl) "")
|
||||
"ab\\|.")))
|
||||
|
||||
;; Checks `or'/`|' translation.  The expected strings show that a nested
;; alternative is flattened into its parent, that branches appear in the
;; order written, and that two alternatives in sequence are each wrapped in
;; a shy group.  An `or' with no branches is expected to give "\\`a\\`".
(ert-deftest rx-or ()
|
||||
(should (equal (rx (or "ab" (| "c" nonl) "de"))
|
||||
"ab\\|c\\|.\\|de"))
|
||||
(should (equal (rx (or "ab" "abc" "a"))
|
||||
"\\(?:ab\\|abc\\|a\\)"))
|
||||
(should (equal (rx (| nonl "a") (| "b" blank))
|
||||
"\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)"))
|
||||
(should (equal (rx (|))
|
||||
"\\`a\\`")))
|
||||
|
||||
(ert-deftest rx-char-any ()
|
||||
"Test character alternatives with `]' and `-' (Bug#25123)."
|
||||
(should (string-match
|
||||
(should (equal
|
||||
(rx string-start (1+ (char (?\] . ?\{) (?< . ?\]) (?- . ?:)))
|
||||
string-end)
|
||||
(apply #'string (nconc (number-sequence ?\] ?\{)
|
||||
(number-sequence ?< ?\])
|
||||
(number-sequence ?- ?:))))))
|
||||
"\\`[.-:<-{-]+\\'")))
|
||||
|
||||
(ert-deftest rx-char-any-range-nl ()
|
||||
"Test character alternatives with LF as a range endpoint."
|
||||
|
|
@ -40,28 +63,72 @@
|
|||
(should (equal (rx (any "\a-\n"))
|
||||
"[\a-\n]")))
|
||||
|
||||
;; A reversed range (`a' to `Z', whose end precedes its start) must signal
;; an error, both in string form and in cons form.
(ert-deftest rx-char-any-range-bad ()
|
||||
(should-error (rx (any "0-9a-Z")))
|
||||
(should-error (rx (any (?0 . ?9) (?a . ?Z)))))
|
||||
|
||||
(ert-deftest rx-char-any-raw-byte ()
|
||||
"Test raw bytes in character alternatives."
|
||||
|
||||
;; The multibyteness of the rx return value sometimes depends on whether
|
||||
;; the test had been byte-compiled or not, so we add explicit conversions.
|
||||
|
||||
;; Separate raw characters.
|
||||
(should (equal (string-match-p (rx (any "\326A\333B"))
|
||||
"X\326\333")
|
||||
1))
|
||||
(should (equal (string-to-multibyte (rx (any "\326A\333B")))
|
||||
(string-to-multibyte "[AB\326\333]")))
|
||||
;; Range of raw characters, unibyte.
|
||||
(should (equal (string-match-p (rx (any "\200-\377"))
|
||||
"ÿA\310B")
|
||||
2))
|
||||
(should (equal (string-to-multibyte (rx (any "\200-\377")))
|
||||
(string-to-multibyte "[\200-\377]")))
|
||||
|
||||
;; Range of raw characters, multibyte.
|
||||
(should (equal (string-match-p (rx (any "Å\211\326-\377\177"))
|
||||
"XY\355\177\327")
|
||||
2))
|
||||
(should (equal (rx (any "Å\211\326-\377\177"))
|
||||
"[\177Å\211\326-\377]"))
|
||||
;; Split range; \177-\377ÿ should not be optimised to \177-\377.
|
||||
(should (equal (string-match-p (rx (any "\177-\377" ?ÿ))
|
||||
"ÿA\310B")
|
||||
0)))
|
||||
(should (equal (rx (any "\177-\377" ?ÿ))
|
||||
"[\177ÿ\200-\377]")))
|
||||
|
||||
;; Checks `any'/`in'/`char' and their negated forms (`not-char', `not').
;; The expected strings show items emitted in sorted order with adjacent
;; ranges merged (e.g. "J-L" "M" "N-P" "Q" "RS" become J-S), and the special
;; characters `]', `^' and `-' placed so that each bracket expression stays
;; well-formed, with or without character classes.  The last two cases
;; cover an empty set and its negation.
(ert-deftest rx-any ()
|
||||
(should (equal (rx (any ?A (?C . ?D) "F-H" "J-L" "M" "N-P" "Q" "RS"))
|
||||
"[ACDF-HJ-S]"))
|
||||
(should (equal (rx (in "a!f" ?c) (char "q-z" "0-3")
|
||||
(not-char "a-e1-5") (not (in "A-M" ?q)))
|
||||
"[!acf][0-3q-z][^1-5a-e][^A-Mq]"))
|
||||
(should (equal (rx (any "^") (any "]") (any "-")
|
||||
(not (any "^")) (not (any "]")) (not (any "-")))
|
||||
"\\^]-[^^][^]][^-]"))
|
||||
(should (equal (rx (any "]" "^") (any "]" "-") (any "-" "^")
|
||||
(not (any "]" "^")) (not (any "]" "-"))
|
||||
(not (any "-" "^")))
|
||||
"[]^][]-][-^][^]^][^]-][^-^]"))
|
||||
(should (equal (rx (any "]" "^" "-") (not (any "]" "^" "-")))
|
||||
"[]^-][^]^-]"))
|
||||
(should (equal (rx (any "-" ascii) (any "^" ascii) (any "]" ascii))
|
||||
"[[:ascii:]-][[:ascii:]^][][:ascii:]]"))
|
||||
(should (equal (rx (not (any "-" ascii)) (not (any "^" ascii))
|
||||
(not (any "]" ascii)))
|
||||
"[^[:ascii:]-][^[:ascii:]^][^][:ascii:]]"))
|
||||
(should (equal (rx (any "-]" ascii) (any "^]" ascii) (any "-^" ascii))
|
||||
"[][:ascii:]-][]^[:ascii:]][[:ascii:]^-]"))
|
||||
(should (equal (rx (not (any "-]" ascii)) (not (any "^]" ascii))
|
||||
(not (any "-^" ascii)))
|
||||
"[^][:ascii:]-][^]^[:ascii:]][^[:ascii:]^-]"))
|
||||
(should (equal (rx (any "-]^" ascii) (not (any "-]^" ascii)))
|
||||
"[]^[:ascii:]-][^]^[:ascii:]-]"))
|
||||
(should (equal (rx (any "^" lower upper) (not (any "^" lower upper)))
|
||||
"[[:lower:]^[:upper:]][^[:lower:]^[:upper:]]"))
|
||||
(should (equal (rx (any "-" lower upper) (not (any "-" lower upper)))
|
||||
"[[:lower:][:upper:]-][^[:lower:][:upper:]-]"))
|
||||
(should (equal (rx (any "]" lower upper) (not (any "]" lower upper)))
|
||||
"[][:lower:][:upper:]][^][:lower:][:upper:]]"))
|
||||
(should (equal (rx (any "-a" "c-" "f-f" "--/*--"))
|
||||
"[*-/acf]"))
|
||||
(should (equal (rx (any "]-a" ?-) (not (any "]-a" ?-)))
|
||||
"[]-a-][^]-a-]"))
|
||||
(should (equal (rx (any "--]") (not (any "--]"))
|
||||
(any "-" "^-a") (not (any "-" "^-a")))
|
||||
"[].-\\-][^].-\\-][-^-a][^-^-a]"))
|
||||
(should (equal (rx (not (any "!a" "0-8" digit nonascii)))
|
||||
"[^!0-8a[:digit:][:nonascii:]]"))
|
||||
;; Empty set: expected to give "\\`a\\`"; its negation is expected to
|
||||
;; give a regexp matching any character, newline included.
(should (equal (rx (any) (not (any)))
|
||||
"\\`a\\`\\(?:.\\|\n\\)"))
|
||||
(should (equal (rx (any "") (not (any "")))
|
||||
"\\`a\\`\\(?:.\\|\n\\)")))
|
||||
|
||||
(ert-deftest rx-pcase ()
|
||||
(should (equal (pcase "a 1 2 3 1 1 b"
|
||||
|
|
@ -71,7 +138,11 @@
|
|||
(backref u) space
|
||||
(backref 1))
|
||||
(list u v)))
|
||||
'("1" "3"))))
|
||||
'("1" "3")))
|
||||
(let ((k "blue"))
|
||||
(should (equal (pcase "<blue>"
|
||||
((rx "<" (literal k) ">") 'ok))
|
||||
'ok))))
|
||||
|
||||
(ert-deftest rx-kleene ()
|
||||
"Test greedy and non-greedy repetition operators."
|
||||
|
|
@ -94,71 +165,158 @@
|
|||
(should (equal (rx (maximal-match
|
||||
(seq (* "a") (+ "b") (\? "c") (?\s "d")
|
||||
(*? "e") (+? "f") (\?? "g") (?? "h"))))
|
||||
"a*b+c?d?e*?f+?g??h??")))
|
||||
"a*b+c?d?e*?f+?g??h??"))
|
||||
(should (equal (rx "a" (*) (+ (*)) (? (*) (+)) "b")
|
||||
"ab")))
|
||||
|
||||
;; NOTE(review): a test named `rx-or' also appears earlier in this diff.
;; These rows look like the pre-change definition (old side of the hunk)
;; rather than a second live test -- confirm against the commit.
;; It matches "abc" against three orderings of the same branches and
;; expects the first branch that matches to win, then checks that an
;; empty `or' equals `regexp-unmatchable'.
(ert-deftest rx-or ()
|
||||
;; Test or-pattern reordering (Bug#34641).
|
||||
(let ((s "abc"))
|
||||
(should (equal (and (string-match (rx (or "abc" "ab" "a")) s)
|
||||
(match-string 0 s))
|
||||
"abc"))
|
||||
(should (equal (and (string-match (rx (or "ab" "abc" "a")) s)
|
||||
(match-string 0 s))
|
||||
"ab"))
|
||||
(should (equal (and (string-match (rx (or "a" "ab" "abc")) s)
|
||||
(match-string 0 s))
|
||||
"a")))
|
||||
;; Test zero-argument `or'.
|
||||
(should (equal (rx (or)) regexp-unmatchable)))
|
||||
;; Checks counted repetition: `=', `>=', `**' and `repeat' are expected to
;; render as \{N\}, \{N,\} and \{N,M\}; a range with equal bounds (0 0)
;; is expected to collapse to \{0\}; and repetition forms given no operand
;; are expected to contribute nothing to the result.
(ert-deftest rx-repeat ()
|
||||
(should (equal (rx (= 3 "a") (>= 51 "b")
|
||||
(** 2 11 "c") (repeat 6 "d") (repeat 4 8 "e"))
|
||||
"a\\{3\\}b\\{51,\\}c\\{2,11\\}d\\{6\\}e\\{4,8\\}"))
|
||||
(should (equal (rx (= 0 "k") (>= 0 "l") (** 0 0 "m") (repeat 0 "n")
|
||||
(repeat 0 0 "o"))
|
||||
"k\\{0\\}l\\{0,\\}m\\{0\\}n\\{0\\}o\\{0\\}"))
|
||||
(should (equal (rx (opt (0+ "a")))
|
||||
"\\(?:a*\\)?"))
|
||||
;; NOTE(review): unlike the `(opt (0+ "a"))' case just above, no shy
|
||||
;; group is expected around a\{4\} before the trailing `?' -- confirm
|
||||
;; this is the intended rendering.
(should (equal (rx (opt (= 4 "a")))
|
||||
"a\\{4\\}?"))
|
||||
(should (equal (rx "a" (** 3 7) (= 4) (>= 3) (= 4 (>= 7) (= 2)) "b")
|
||||
"ab")))
|
||||
|
||||
;; NOTE(review): a test named `rx-seq' also appears earlier in this diff.
;; These rows look like the pre-change definition (old side of the hunk)
;; -- confirm against the commit.
(ert-deftest rx-seq ()
|
||||
;; Test zero-argument `seq'.
|
||||
(should (equal (rx (seq)) "")))
|
||||
;; Checks the translation of atomic rx symbols: `anything', line/string/
;; buffer anchors, point/word/symbol boundaries, and the named character
;; classes.  Each `should' lists a symbol together with its synonyms and
;; expects every synonym to render as the same regexp fragment.
(ert-deftest rx-atoms ()
|
||||
(should (equal (rx anything)
|
||||
".\\|\n"))
|
||||
(should (equal (rx line-start not-newline nonl any line-end)
|
||||
"^...$"))
|
||||
(should (equal (rx bol string-start string-end buffer-start buffer-end
|
||||
bos eos bot eot eol)
|
||||
"^\\`\\'\\`\\'\\`\\'\\`\\'$"))
|
||||
(should (equal (rx point word-start word-end bow eow symbol-start symbol-end
|
||||
word-boundary not-word-boundary not-wordchar)
|
||||
"\\=\\<\\>\\<\\>\\_<\\_>\\b\\B\\W"))
|
||||
(should (equal (rx digit numeric num control cntrl)
|
||||
"[[:digit:]][[:digit:]][[:digit:]][[:cntrl:]][[:cntrl:]]"))
|
||||
(should (equal (rx hex-digit hex xdigit blank)
|
||||
"[[:xdigit:]][[:xdigit:]][[:xdigit:]][[:blank:]]"))
|
||||
(should (equal (rx graph graphic print printing)
|
||||
"[[:graph:]][[:graph:]][[:print:]][[:print:]]"))
|
||||
(should (equal (rx alphanumeric alnum letter alphabetic alpha)
|
||||
"[[:alnum:]][[:alnum:]][[:alpha:]][[:alpha:]][[:alpha:]]"))
|
||||
(should (equal (rx ascii nonascii lower lower-case)
|
||||
"[[:ascii:]][[:nonascii:]][[:lower:]][[:lower:]]"))
|
||||
(should (equal (rx punctuation punct space whitespace white)
|
||||
"[[:punct:]][[:punct:]][[:space:]][[:space:]][[:space:]]"))
|
||||
(should (equal (rx upper upper-case word wordchar)
|
||||
"[[:upper:]][[:upper:]][[:word:]][[:word:]]"))
|
||||
(should (equal (rx unibyte multibyte)
|
||||
"[[:unibyte:]][[:multibyte:]]")))
|
||||
|
||||
;; Test helper: assert that REGEXP matches STRING and, when MATCH is given,
;; that the whole match (`match-string' 0) equals MATCH.
;; STRING is bound once via `macroexp-let2' so it is not evaluated again
;; for each use, and `ert-info' attaches a "Matching ... to ..." message
;; around the assertions.
(defmacro rx-tests--match (regexp string &optional match)
|
||||
(macroexp-let2 nil strexp string
|
||||
`(ert-info ((format "Matching %S to %S" ',regexp ,strexp))
|
||||
(should (string-match ,regexp ,strexp))
|
||||
,@(when match
|
||||
`((should (equal (match-string 0 ,strexp) ,match)))))))
|
||||
;; Checks that each named syntax class in a `(syntax NAME)' form renders
;; as \s followed by the corresponding syntax-class character.
(ert-deftest rx-syntax ()
|
||||
(should (equal (rx (syntax whitespace) (syntax punctuation)
|
||||
(syntax word) (syntax symbol)
|
||||
(syntax open-parenthesis) (syntax close-parenthesis))
|
||||
"\\s-\\s.\\sw\\s_\\s(\\s)"))
|
||||
(should (equal (rx (syntax string-quote) (syntax paired-delimiter)
|
||||
(syntax escape) (syntax character-quote)
|
||||
(syntax comment-start) (syntax comment-end)
|
||||
(syntax string-delimiter) (syntax comment-delimiter))
|
||||
"\\s\"\\s$\\s\\\\s/\\s<\\s>\\s|\\s!")))
|
||||
|
||||
;; Checks `literal' and `regexp' forms whose argument is a variable rather
;; than a string constant.  The matches show that `regexp' treats the
;; variable's value as a regexp while `literal' treats it as plain text,
;; and that such forms can be nested inside `or', repetition, grouping and
;; optional constructs.
(ert-deftest rx-nonstring-expr ()
|
||||
(let ((bee "b")
|
||||
(vowel "[aeiou]"))
|
||||
(rx-tests--match (rx "a" (literal bee) "c") "abc")
|
||||
(rx-tests--match (rx "a" (regexp bee) "c") "abc")
|
||||
(rx-tests--match (rx "a" (or (regexp bee) "xy") "c") "abc")
|
||||
(rx-tests--match (rx "a" (or "xy" (regexp bee)) "c") "abc")
|
||||
(should-not (string-match (rx (or (regexp bee) "xy")) ""))
|
||||
(rx-tests--match (rx "a" (= 3 (regexp bee)) "c") "abbbc")
|
||||
(rx-tests--match (rx "x" (= 3 (regexp vowel)) "z") "xeoez")
|
||||
(should-not (string-match (rx "x" (= 3 (regexp vowel)) "z") "xe[]z"))
|
||||
(rx-tests--match (rx "x" (= 3 (literal vowel)) "z")
|
||||
"x[aeiou][aeiou][aeiou]z")
|
||||
(rx-tests--match (rx "x" (repeat 1 (regexp vowel)) "z") "xaz")
|
||||
(rx-tests--match (rx "x" (repeat 1 2 (regexp vowel)) "z") "xaz")
|
||||
(rx-tests--match (rx "x" (repeat 1 2 (regexp vowel)) "z") "xauz")
|
||||
(rx-tests--match (rx "x" (>= 1 (regexp vowel)) "z") "xaiiz")
|
||||
(rx-tests--match (rx "x" (** 1 2 (regexp vowel)) "z") "xaiz")
|
||||
(rx-tests--match (rx "x" (group (regexp vowel)) "z") "xaz")
|
||||
(rx-tests--match (rx "x" (group-n 1 (regexp vowel)) "z") "xaz")
|
||||
(rx-tests--match (rx "x" (? (regexp vowel)) "z") "xz")))
|
||||
;; Checks that each named character category in a `(category NAME)' form
;; renders as \c followed by the category's character.  The last case
;; shows that a raw category character is accepted as well, and that
;; `not' around a category renders as \C.
(ert-deftest rx-category ()
|
||||
(should (equal (rx (category space-for-indent) (category base)
|
||||
(category consonant) (category base-vowel)
|
||||
(category upper-diacritical-mark)
|
||||
(category lower-diacritical-mark)
|
||||
(category tone-mark) (category symbol)
|
||||
(category digit)
|
||||
(category vowel-modifying-diacritical-mark)
|
||||
(category vowel-sign) (category semivowel-lower)
|
||||
(category not-at-end-of-line)
|
||||
(category not-at-beginning-of-line))
|
||||
"\\c \\c.\\c0\\c1\\c2\\c3\\c4\\c5\\c6\\c7\\c8\\c9\\c<\\c>"))
|
||||
(should (equal (rx (category alpha-numeric-two-byte)
|
||||
(category chinese-two-byte) (category greek-two-byte)
|
||||
(category japanese-hiragana-two-byte)
|
||||
(category indian-two-byte)
|
||||
(category japanese-katakana-two-byte)
|
||||
(category strong-left-to-right)
|
||||
(category korean-hangul-two-byte)
|
||||
(category strong-right-to-left)
|
||||
(category cyrillic-two-byte)
|
||||
(category combining-diacritic))
|
||||
"\\cA\\cC\\cG\\cH\\cI\\cK\\cL\\cN\\cR\\cY\\c^"))
|
||||
(should (equal (rx (category ascii) (category arabic) (category chinese)
|
||||
(category ethiopic) (category greek) (category korean)
|
||||
(category indian) (category japanese)
|
||||
(category japanese-katakana) (category latin)
|
||||
(category lao) (category tibetan))
|
||||
"\\ca\\cb\\cc\\ce\\cg\\ch\\ci\\cj\\ck\\cl\\co\\cq"))
|
||||
(should (equal (rx (category japanese-roman) (category thai)
|
||||
(category vietnamese) (category hebrew)
|
||||
(category cyrillic) (category can-break))
|
||||
"\\cr\\ct\\cv\\cw\\cy\\c|"))
|
||||
(should (equal (rx (category ?g) (not (category ?~)))
|
||||
"\\cg\\C~")))
|
||||
|
||||
;; The first four cases show that wrapping a variable-supplied regexp in
;; `minimal-match'/`maximal-match' does not change what it matches: the
;; greediness written inside the regexp string wins.  The last two show
;; that the wrappers do take effect on an rx-level `0+' around such a
;; regexp.
(ert-deftest rx-nonstring-expr-non-greedy ()
|
||||
"`rx's greediness can't affect runtime regexp parts."
|
||||
(let ((ad-min "[ad]*?")
|
||||
(ad-max "[ad]*")
|
||||
(ad "[ad]"))
|
||||
(rx-tests--match (rx "c" (regexp ad-min) "a") "cdaaada" "cda")
|
||||
(rx-tests--match (rx "c" (regexp ad-max) "a") "cdaaada" "cdaaada")
|
||||
(rx-tests--match (rx "c" (minimal-match (regexp ad-max)) "a") "cdaaada" "cdaaada")
|
||||
(rx-tests--match (rx "c" (maximal-match (regexp ad-min)) "a") "cdaaada" "cda")
|
||||
(rx-tests--match (rx "c" (minimal-match (0+ (regexp ad))) "a") "cdaaada" "cda")
|
||||
(rx-tests--match (rx "c" (maximal-match (0+ (regexp ad))) "a") "cdaaada" "cdaaada")))
|
||||
;; Checks `not' applied to non-set operands: a word boundary renders as
;; \B, a character class as a negated bracket expression, a syntax class
;; as \S and a category as \C.
(ert-deftest rx-not ()
|
||||
(should (equal (rx (not word-boundary))
|
||||
"\\B"))
|
||||
(should (equal (rx (not ascii) (not lower-case) (not wordchar))
|
||||
"[^[:ascii:]][^[:lower:]][^[:word:]]"))
|
||||
(should (equal (rx (not (syntax punctuation)) (not (syntax escape)))
|
||||
"\\S.\\S\\"))
|
||||
(should (equal (rx (not (category tone-mark)) (not (category lao)))
|
||||
"\\C4\\Co")))
|
||||
|
||||
;; Checks capture groups: `group'/`submatch' render as \(...\),
;; `group-n'/`submatch-n' as explicitly numbered \(?N:...\), and `backref'
;; as \N.  Groups with an empty body are accepted as well.
(ert-deftest rx-group ()
|
||||
(should (equal (rx (group nonl) (submatch "x")
|
||||
(group-n 3 "y") (submatch-n 13 "z") (backref 1))
|
||||
"\\(.\\)\\(x\\)\\(?3:y\\)\\(?13:z\\)\\1"))
|
||||
(should (equal (rx (group) (group-n 2))
|
||||
"\\(\\)\\(?2:\\)")))
|
||||
|
||||
;; Checks `regexp'/`regex' with constant and variable arguments.  The
;; expected strings show "abc" and a variable holding "a*" wrapped in a
;; shy group when other material follows, "[de]" inserted as is, and no
;; wrapping when the variable regexp is the only non-empty item.
(ert-deftest rx-regexp ()
|
||||
(should (equal (rx (regexp "abc") (regex "[de]"))
|
||||
"\\(?:abc\\)[de]"))
|
||||
(let ((x "a*"))
|
||||
(should (equal (rx (regexp x) "b")
|
||||
"\\(?:a*\\)b"))
|
||||
(should (equal (rx "" (regexp x) (eval ""))
|
||||
"a*"))))
|
||||
|
||||
;; Checks `eval': the value of the Lisp expression is itself treated as an
;; rx form (a constructed `(syntax symbol)' list renders as \s_), and an
;; expression yielding the empty string contributes nothing.
(ert-deftest rx-eval ()
|
||||
(should (equal (rx (eval (list 'syntax 'symbol)))
|
||||
"\\s_"))
|
||||
(should (equal (rx "a" (eval (concat)) "b")
|
||||
"ab")))
|
||||
|
||||
;; Checks `literal' with an arbitrary Lisp expression as argument: its
;; string value is regexp-quoted (character 42, `*', becomes \*), and a
;; multi-character literal under `opt' is wrapped in a shy group.
(ert-deftest rx-literal ()
|
||||
(should (equal (rx (literal (char-to-string 42)) nonl)
|
||||
"\\*."))
|
||||
(let ((x "a+b"))
|
||||
(should (equal (rx (opt (literal (upcase x))))
|
||||
"\\(?:A\\+B\\)?"))))
|
||||
|
||||
;; Checks the second argument of `rx-to-string': without it the result is
;; wrapped in a shy group; with a non-nil value the same form is returned
;; without the enclosing group.
(ert-deftest rx-to-string ()
|
||||
(should (equal (rx-to-string '(or nonl "\nx"))
|
||||
"\\(?:.\\|\nx\\)"))
|
||||
(should (equal (rx-to-string '(or nonl "\nx") t)
|
||||
".\\|\nx")))
|
||||
|
||||
|
||||
;; Checks user extensions bound through `rx-constituents'.  `beta' is an
;; alias for `gamma', which maps to the regexp string "a*b"; `delta' maps
;; to a function entry and `epsilon' is an alias for `delta'.  The expected
;; output shows the function receiving the whole form, head symbol
;; included, since "<(delta a b c)>" and "<(epsilon d e)>" are what get
;; quoted into the result.
(ert-deftest rx-constituents ()
|
||||
(let ((rx-constituents
|
||||
(append '((beta . gamma)
|
||||
(gamma . "a*b")
|
||||
(delta . ((lambda (form)
|
||||
(regexp-quote (format "<%S>" form)))
|
||||
1 nil symbolp))
|
||||
(epsilon . delta))
|
||||
rx-constituents)))
|
||||
(should (equal (rx-to-string '(seq (+ beta) nonl gamma) t)
|
||||
"\\(?:a*b\\)+.\\(?:a*b\\)"))
|
||||
(should (equal (rx-to-string '(seq (delta a b c) (* (epsilon d e))) t)
|
||||
"\\(?:<(delta a b c)>\\)\\(?:<(epsilon d e)>\\)*"))))
|
||||
|
||||
;; Checks that `rx-to-string' accepts `literal' and `regexp' forms with
;; constant string arguments and that the resulting regexp matches "abc".
(ert-deftest rx-to-string-lisp-forms ()
|
||||
(rx-tests--match (rx-to-string '(seq "a" (literal "b") "c")) "abc")
|
||||
(rx-tests--match (rx-to-string '(seq "a" (regexp "b") "c")) "abc"))
|
||||
|
||||
(provide 'rx-tests)
|
||||
;;; rx-tests.el ends here
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue