1
Fork 0
mirror of git://git.sv.gnu.org/emacs.git synced 2026-01-07 04:10:27 -08:00

Improve handling of non-ASCII characters in 'transpose-regions'

* src/editfns.c (Ftranspose_regions): Separate code related to character
semantics from that related to byte semantics and in that way leverage
optimizations for regions of equal length with respect to both
semantics.  Move and update comments dating back to the initial
implementation.
* test/src/editfns-tests.el (editfns-tests--transpose-regions-tests)
(editfns-tests--transpose-regions-markups)
(editfns-tests--transpose-regions): New test and accompanying variables.
This commit is contained in:
Jens Schmidt 2025-12-17 22:20:15 +01:00 committed by Stefan Monnier
parent 875e42d501
commit 0aabe62b64
2 changed files with 296 additions and 63 deletions

View file

@ -4525,7 +4525,8 @@ ring. */)
ptrdiff_t len1_byte, len_mid_byte, len2_byte;
unsigned char *start1_addr, *start2_addr, *temp;
INTERVAL cur_intv, tmp_interval1, tmp_interval2, tmp_interval3;
INTERVAL cur_intv, tmp_interval1, tmp_interval2;
INTERVAL tmp_interval_mid, tmp_interval3;
Lisp_Object buf;
XSETBUFFER (buf, current_buffer);
@ -4540,7 +4541,9 @@ ring. */)
end2 = XFIXNAT (endr2);
gap = GPT;
/* Swap the regions if they're reversed. */
/* Swap the regions if they're reversed. We do not swap the
corresponding Lisp objects as well, since we reference these only
to clear text properties in both regions. */
if (start2 < end1)
{
register ptrdiff_t glumph = start1;
@ -4560,28 +4563,6 @@ ring. */)
else if ((start1 == end1 || start2 == end2) && end1 == start2)
return Qnil;
/* The possibilities are:
1. Adjacent (contiguous) regions, or separate but equal regions
(no, really equal, in this case!), or
2. Separate regions of unequal size.
The worst case is usually No. 2. It means that (aside from
potential need for getting the gap out of the way), there also
needs to be a shifting of the text between the two regions. So
if they are spread far apart, we are that much slower... sigh. */
/* It must be pointed out that the really studly thing to do would
be not to move the gap at all, but to leave it in place and work
around it if necessary. This would be extremely efficient,
especially considering that people are likely to do
transpositions near where they are working interactively, which
is exactly where the gap would be found. However, such code
would be much harder to write and to read. So, if you are
reading this comment and are feeling squirrely, by all means have
a go! I just didn't feel like doing it, so I will simply move
the gap the minimum distance to get it out of the way, and then
deal with an unbroken array. */
start1_byte = CHAR_TO_BYTE (start1);
end2_byte = CHAR_TO_BYTE (end2);
@ -4597,6 +4578,22 @@ ring. */)
/* Run the before-change-functions *before* we move the gap. */
modify_text (start1, end2);
/* It must be pointed out that the really studly thing to do would
be not to move the gap at all, but to leave it in place and work
around it if necessary. This would be extremely efficient,
especially considering that people are likely to do
transpositions near where they are working interactively, which
is exactly where the gap would be found. However, such code
would be much harder to write and to read. So, if you are
reading this comment and are feeling squirrely, by all means have
a go! I just didn't feel like doing it, so I will simply move
the gap the minimum distance to get it out of the way, and then
deal with an unbroken array. */
/* Hmmm... how about checking to see if the gap is large
enough to use as the temporary storage? That would avoid an
allocation... interesting. Later, don't fool with it now. */
/* Make sure the gap won't interfere, by moving it out of the text
we will operate on. */
if (start1 < gap && gap < end2)
@ -4637,16 +4634,36 @@ ring. */)
}
#endif
/* Hmmm... how about checking to see if the gap is large
enough to use as the temporary storage? That would avoid an
allocation... interesting. Later, don't fool with it now. */
/* The possibilities are:
1. Regions of equal size, possibly even adjacent (contiguous).
2. Regions of unequal size.
In case 1. we can leave the "mid", that is, the region between the
two regions untouched.
The worst case is usually No. 2. It means that (aside from
potential need for getting the gap out of the way), there also
needs to be a shifting of the text between the two regions. So
if they are spread far apart, we are that much slower... sigh. */
/* As an additional difficulty, we have to carefully consider byte vs.
character semantics: Maintaining undo and text properties needs to
be done in terms of characters, swapping text in memory needs to be
done in terms of bytes.
Handling case 1. mentioned above in a special way is beneficial
both for undo/text properties and for memory swapping, only we have
to consider case 1. for the character-related bits (len1 == len2)
and case 1. for the byte-related bits (len1_byte == len2_byte)
separately. */
tmp_interval1 = copy_intervals (cur_intv, start1, len1);
tmp_interval2 = copy_intervals (cur_intv, start2, len2);
USE_SAFE_ALLOCA;
if (len1_byte == len2_byte && len1 == len2)
/* Regions are same size, though, how nice. */
/* The char lengths also have to match, for text-properties. */
len_mid = start2 - end1;
len_mid_byte = start2_byte - end1_byte;
if (len1 == len2)
{
if (end1 == start2) /* Merge the two parts into a single one. */
record_change (start1, (end2 - start1));
@ -4663,7 +4680,24 @@ ring. */)
tmp_interval3 = validate_interval_range (buf, &startr2, &endr2, 0);
if (tmp_interval3)
set_text_properties_1 (startr2, endr2, Qnil, buf, tmp_interval3);
}
else
/* Regions have different length, character-wise. Handle undo and
text properties for both regions as one long piece of text
spanning both regions and the mid. But while doing so, save the
intervals of the mid to later restore them in their new
position. */
{
record_change (start1, (end2 - start1));
tmp_interval_mid = copy_intervals (cur_intv, end1, len_mid);
tmp_interval3 = validate_interval_range (buf, &startr1, &endr2, 0);
if (tmp_interval3)
set_text_properties_1 (startr1, endr2, Qnil, buf, tmp_interval3);
}
USE_SAFE_ALLOCA;
if (len1_byte == len2_byte)
{
temp = SAFE_ALLOCA (len1_byte);
start1_addr = BYTE_POS_ADDR (start1_byte);
start2_addr = BYTE_POS_ADDR (start2_byte);
@ -4671,42 +4705,37 @@ ring. */)
memcpy (start1_addr, start2_addr, len2_byte);
memcpy (start2_addr, temp, len1_byte);
}
else
else if (len1_byte < len2_byte) /* Second region larger than first */
{
/* holds region 2 */
temp = SAFE_ALLOCA (len2_byte);
start1_addr = BYTE_POS_ADDR (start1_byte);
start2_addr = BYTE_POS_ADDR (start2_byte);
memcpy (temp, start2_addr, len2_byte);
memcpy (start1_addr + len_mid_byte + len2_byte, start1_addr, len1_byte);
memmove (start1_addr + len2_byte, start1_addr + len1_byte, len_mid_byte);
memcpy (start1_addr, temp, len2_byte);
}
else
/* Second region smaller than first. */
{
/* holds region 1 */
temp = SAFE_ALLOCA (len1_byte);
start1_addr = BYTE_POS_ADDR (start1_byte);
start2_addr = BYTE_POS_ADDR (start2_byte);
memcpy (temp, start1_addr, len1_byte);
memcpy (start1_addr, start2_addr, len2_byte);
memmove (start1_addr + len2_byte, start1_addr + len1_byte, len_mid_byte);
memcpy (start1_addr + len2_byte + len_mid_byte, temp, len1_byte);
}
SAFE_FREE ();
if (len1 != len2)
/* Restore intervals of the mid. */
{
len_mid = start2 - end1;
len_mid_byte = start2_byte - end1_byte;
record_change (start1, (end2 - start1));
INTERVAL tmp_interval_mid = copy_intervals (cur_intv, end1, len_mid);
tmp_interval3 = validate_interval_range (buf, &startr1, &endr2, 0);
if (tmp_interval3)
set_text_properties_1 (startr1, endr2, Qnil, buf, tmp_interval3);
if (len1_byte < len2_byte) /* Second region larger than first */
{
/* holds region 2 */
temp = SAFE_ALLOCA (len2_byte);
start1_addr = BYTE_POS_ADDR (start1_byte);
start2_addr = BYTE_POS_ADDR (start2_byte);
memcpy (temp, start2_addr, len2_byte);
memcpy (start1_addr + len_mid_byte + len2_byte, start1_addr, len1_byte);
memmove (start1_addr + len2_byte, start1_addr + len1_byte, len_mid_byte);
memcpy (start1_addr, temp, len2_byte);
}
else
/* Second region smaller than first. */
{
/* holds region 1 */
temp = SAFE_ALLOCA (len1_byte);
start1_addr = BYTE_POS_ADDR (start1_byte);
start2_addr = BYTE_POS_ADDR (start2_byte);
memcpy (temp, start1_addr, len1_byte);
memcpy (start1_addr, start2_addr, len2_byte);
memmove (start1_addr + len2_byte, start1_addr + len1_byte, len_mid_byte);
memcpy (start1_addr + len2_byte + len_mid_byte, temp, len1_byte);
}
graft_intervals_into_buffer (tmp_interval_mid, start1 + len2,
len_mid, current_buffer, 0);
}
SAFE_FREE ();
graft_intervals_into_buffer (tmp_interval1, end2 - len1,
len1, current_buffer, 0);
graft_intervals_into_buffer (tmp_interval2, start1,

View file

@ -190,6 +190,210 @@
(should (equal-including-properties
str1 (buffer-substring (+ (point-min) 5) (+ (point-min) 7)))))))
(defconst editfns-tests--transpose-regions-tests
'(;; adjacent regions with one being empty
("" "foo" "" "" "" [0 3 0 0 0])
("" "" "" "baz" "" [0 0 0 3 0])
;; For the following tests, assume that characters from the range
;; [a-z] are 1 byte long in Emacs's internal text representation,
;; while LATIN SMALL LETTER [AO] WITH DIAERESIS is 2 bytes long.
;; (len1 == len2) && (end1 == start2) && (len1_byte == len2_byte)
("" "fo(o" "" "b)az" "" [0 3 0 3 0])
;; (len1 == len2) && (end1 != start2) && (len1_byte == len2_byte)
("" "fo(o" "[bar]" "b)az" "" [0 3 3 3 0])
;; (len1 != len2) && (end1 != start2) && (len1_byte < len2_byte)
("" "fo(o" "[bar]" "baaz)" "" [0 3 3 4 0])
;; (len1 != len2) && (end1 != start2) && (len1_byte > len2_byte)
("" "(fooo" "[bar]" "baz)" "" [0 4 3 3 0])
;; (len1 == len2) && (end1 == start2) && (len1_byte < len2_byte)
("" "fo(o" "" "b)äz" "" [0 3 0 4 0])
;; (len1 == len2) && (end1 == start2) && (len1_byte > len2_byte)
("" "fo(ö" "" "b)az" "" [0 4 0 3 0])
;; (len1 == len2) && (end1 != start2) && (len1_byte > len2_byte)
("" "fo(o" "[bar]" "b)äz" "" [0 3 3 4 0])
;; (len1 == len2) && (end1 != start2) && (len1_byte > len2_byte)
("" "fo(ö" "[bar]" "b)az" "" [0 4 3 3 0])
;; (len1 != len2) && (end1 == start2) && (len1_byte == len2_byte)
("" "fo(ö" "" "baaz)" "" [0 4 0 4 0])
;; (len1 != len2) && (end1 == start2) && (len1_byte == len2_byte)
("" "(fooo" "" "bäz)" "" [0 4 0 4 0])
;; (len1 != len2) && (end1 != start2) && (len1_byte == len2_byte)
("" "fo(ö" "[bar]" "baaz)" "" [0 4 3 4 0])
;; (len1 != len2) && (end1 != start2) && (len1_byte == len2_byte)
("" "(fooo" "[bar]" "bäz)" "" [0 4 3 4 0])
;; Going entirely non-ASCII. Assume plain greek small letters are
;; two bytes long in Emacs's internal text representation, GREEK
;; SMALL LETTER ALPHA WITH PSILI is three bytes long.
;; To cover the initial patch from bug#70122, define a test
;; consisting of three three-letter strings REG1 MID REG2, with
;; (length REG1) == (length REG2) but (byte-length REG1) !=
;; (byte-length REG2) ...
("ἀ(ρχή" "φ[ωω" "β){αρ" "β<ἀ]ζ}" "τέλ>ος" [9 6 6 7 10])
;; ... and a test with (length REG1) == (length REG2) and
;; (byte-length REG1) == (byte-length REG2).
("ἀ(ρχή" "φ[ωω" "β){αρ" "β<α]ζ}" "τέλ>ος" [9 6 6 6 10])
;; Define the moral equivalent of
;; `editfns-tests--transpose-equal-but-not'.
(" " "(ab)" "[SPC]" "{é}" " " [1 2 3 2 1])
;; Likewise, for the testcase from bug#70122 in
;; https://debbugs.gnu.org/cgi/bugreport.cgi?bug=70122#5.
("" "" "(a):\n[b]: \x2113\x2080\n" "{v}: scaling" "" [0 0 13 10 0])
;; Likewise, for the testcase from bug#70122 in
;; https://debbugs.gnu.org/cgi/bugreport.cgi?bug=70122#52.
("(Query replace (default abc → d): )" "abc" "[ → ]" "d" "" [35 3 5 1 0]))
"List of test strings and their markup to test `transpose-regions'.
Each element of this list should be a list
HEAD REG1 MID REG2 TAIL BYTE-LENGTHS
where the first five elements are (possibly empty) string snippets and
the sixth element is a five-element vector providing the lengths of the
string snippets, counted in bytes in Emacs's internal text
representation.
Test `editfns-tests--transpose-regions' inserts the five snippets into
its temporary buffer, adds text properties to them as described for
variable `editfns-tests--transpose-regions-markups', transposes REG1 and
REG2, probably undoes the change, and at each stage ensures that all
involved entities look as expected.")
(defconst editfns-tests--transpose-regions-markups
'("()" "[]" "{}" "<>")
"List of two-characters strings \"BE\" describing text property markup.
For each element in this list, test `editfns-tests--transpose-regions'
searches once for regular expression \"B.+E\" in its temporary buffer,
adds a text property `markup' with value \"BE\" to the matching text,
and then removes the markup characters B and E around the matching text.
The test searches in the buffer with all test snippets already inserted,
so characters B and E can originate from different snippets, and the
various B's and E's of different markup items do not need to nest.")
(ert-deftest editfns-tests--transpose-regions ()
"Test function `transpose-regions'.
Execute tests as described by `editfns-tests--transpose-regions-tests'."
(dolist (test editfns-tests--transpose-regions-tests)
(dolist (leave-markers '(nil t))
(message "test: %S leave-markers: %S" test leave-markers)
(with-temp-buffer
(let ((test (take 5 test))
(blengthv (nth 5 test))
(smarkers nil) ; Separator markers.
(pmarkers nil) ; Property markers.
(pmpos nil) ; Their positions before transposing.
(strings nil) ; Net text snippets, propertized.
(blengths nil) ; Their lengths in bytes.
(tstrings nil) ; Net text snippets with REG1/2 transposed.
(test-undo nil)
p beg end beg2 end2)
(buffer-enable-undo)
;; Insert text snippets. While doing so, create the separator
;; markers which we need later to determine the net text
;; snippets.
(cl-assert (eq (length test) 5))
(setq p test)
(while (cdr p)
(insert (car p))
(push (point-marker) smarkers)
(setq p (cdr p)))
(insert (car p))
(setq smarkers (nreverse smarkers))
;; Propertize them according to markup, remove markup
;; characters, add property markers.
(dolist (markup editfns-tests--transpose-regions-markups)
(cl-assert (eq (length markup) 2))
(goto-char (point-min))
(when (search-forward-regexp
(concat "\\("
(regexp-quote (substring markup 0 1))
".+"
(regexp-quote (substring markup 1 2))
"\\)")
nil t)
(setq beg (copy-marker (match-beginning 1))
end (copy-marker (match-end 1)))
(delete-region beg (1+ beg))
(delete-region (1- end) end)
(add-text-properties beg end (list 'markup markup))
(push beg pmarkers)
(push end pmarkers)))
(setq pmarkers (sort pmarkers)
pmpos (mapcar #'marker-position pmarkers))
;; Determine net text snippets, plain and with transposed REG1
;; and REG2. Determine the byte lengths of the net text
;; snippets and ensure they meet our expectation.
(setq p smarkers
beg (point-min))
(while p
(push (buffer-substring beg (car p)) strings)
(push (- (position-bytes (car p)) (position-bytes beg))
blengths)
(setq beg (car p) p (cdr p)))
(push (buffer-substring beg (point-max)) strings)
(push (- (position-bytes (point-max)) (position-bytes beg))
blengths)
(setq strings (nreverse strings)
blengths (nreverse blengths))
(setq tstrings (list (nth 0 strings) (nth 3 strings)
(nth 2 strings) (nth 1 strings)
(nth 4 strings)))
(should (equal blengthv (apply #'vector blengths)))
;; Transpose REG1 and REG2. Some transpositions might not
;; generate undo, keep track of that in flag `test-undo'.
(setq beg (+ 1 (length (nth 0 strings)))
end (+ beg (length (nth 1 strings)))
beg2 (+ end (length (nth 2 strings)))
end2 (+ beg2 (length (nth 3 strings))))
(undo-boundary)
(transpose-regions beg end beg2 end2 leave-markers)
(when (car buffer-undo-list)
(setq test-undo t))
(undo-boundary)
;; Check resulting buffer text and its properties.
(should (equal-including-properties
(buffer-string)
(mapconcat #'identity tstrings)))
;; Check property marker positions.
(if leave-markers
(should (equal (mapcar #'marker-position pmarkers) pmpos))
;; Meh. This more or less blindly duplicates function
;; transpose_markers, since I have been too lazy to
;; reproduce the arithmetics myself.
(setq pmpos
(mapcar
(lambda (pos)
(cond
((< pos beg) pos)
((>= pos end2) pos)
((< pos end) (+ pos (+ (- end2 beg2) (- beg2 end))))
((< pos beg2) (+ pos (- (- end2 beg2) (- end beg))))
(t (- pos (+ (- end beg) (- beg2 end))))))
pmpos))
(should (equal (mapcar #'marker-position pmarkers) pmpos)))
;; Undo the transposition and check text and properties again,
;; if needed. This does not undo any marker transpositions as
;; per the comment before the call to transpose_markers in
;; Ftranspose_regions, so nothing to check on the marker side
;; after the undo.
(when test-undo
(undo)
(should (equal-including-properties
(buffer-string)
(mapconcat #'identity strings))))
;; Be nice and clean up markers.
(dolist (marker smarkers) (set-marker marker nil))
(dolist (marker pmarkers) (set-marker marker nil)))))))
(ert-deftest format-c-float ()
(should-error (format "%c" 0.5)))