* regex.c (RE_TARGET_MULTIBYTE_P): New macro.

(GET_CHAR_BEFORE_2): Check target_multibyte, not multibyte. If that is zero, convert an eight-bit char to multibyte. (MAKE_CHAR_MULTIBYTE, CHAR_LEADING_CODE): New dummy new macros for non-emacs case. (PATFETCH): Convert an eight-bit char to multibyte. (HANDLE_UNIBYTE_RANGE): New macro. (regex_compile): Setup the compiled pattern for multibyte chars even if the given regex string is unibyte. Use PATFETCH_RAW instead of PATFETCH in many places. To handle `charset' specification of unibyte, call HANDLE_UNIBYTE_RANGE. Use bitmap only for ASCII chars. (analyse_first) <exactn>: Simplified because the compiled pattern is multibyte. <charset_not>: Setup fastmap from bitmap only for ASCII chars. <charset>: Use CHAR_LEADING_CODE to get leading codes. <categoryspec>: If multibyte, setup fastmap only for ASCII chars here. (re_compile_fastmap) [emacs]: Call analyse_first with the arg multibyte always 1. (re_search_2) In emacs, set the locale variable multibyte to 1, otherwise to 0. New local variable target_multibyte. Check it to decide the multibyteness of STR1 and STR2. If target_multibyte is zero, convert unibyte chars to multibyte before translating and checking fastmap. (TARGET_CHAR_AND_LENGTH): New macro. (re_match_2_internal): In emacs, set the locale variable multibyte to 1, otherwise to 0. New local variable target_multibyte. Check it to decide the multibyteness of STR1 and STR2. Use TARGET_CHAR_AND_LENGTH to fetch a character from D. <charset, charset_not>: If multibyte is nonzero, check fastmap only for ASCII chars. Call bcmp_translate with target_multibyte, not with multibyte. <begline>: Declare the local variable C as `unsigned'. (bcmp_translate): Change the last arg name to target_multibyte.
2026-03-14 10:51:20 -07:00 · 2002-09-03 04:09:40 +00:00 · 2002-09-03 04:09:40 +00:00 · bf2164799a
commit bf2164799a
parent 66f089b275
1 changed files with 270 additions and 119 deletions
--- a/src/regex.c
+++ b/src/regex.c
@ -146,6 +146,7 @@
 # define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))

 # define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
+# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
 # define RE_STRING_CHAR(p, s) \
  (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
 # define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
@ -154,17 +155,21 @@
 /* Set C a (possibly multibyte) character before P.  P points into a
   string which is the virtual concatenation of STR1 (which ends at
   END1) or STR2 (which ends at END2).  */
-# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)		\
-  do {									\
-    if (multibyte)							\
-       {						    		\
-	 re_char *dtemp = (p) == (str2) ? (end1) : (p);		    	\
-	 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
-	 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp));		\
-	 c = STRING_CHAR (dtemp, (p) - dtemp);				\
-       }						    		\
-     else						    		\
-       (c = ((p) == (str2) ? (end1) : (p))[-1]);			\
+# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)		     \
+  do {									     \
+    if (target_multibyte)						     \
+      {									     \
+	re_char *dtemp = (p) == (str2) ? (end1) : (p);			     \
+	re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
+	while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp));		     \
+	c = STRING_CHAR (dtemp, (p) - dtemp);				     \
+      }									     \
+    else								     \
+      {									     \
+	(c = ((p) == (str2) ? (end1) : (p))[-1]);			     \
+	if (multibyte)							     \
+	  MAKE_CHAR_MULTIBYTE (c);					     \
+      }									     \
  } while (0)


@ -233,6 +238,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1 };
 # define CHARSET_LEADING_CODE_BASE(c) 0
 # define MAX_MULTIBYTE_LENGTH 1
 # define RE_MULTIBYTE_P(x) 0
+# define RE_TARGET_MULTIBYTE_P(x) 0
 # define WORD_BOUNDARY_P(c1, c2) (0)
 # define CHAR_HEAD_P(p) (1)
 # define SINGLE_BYTE_CHAR_P(c) (1)
@ -248,6 +254,8 @@ enum syntaxcode { Swhitespace = 0, Sword = 1 };
 # define MAKE_CHAR(charset, c1, c2) (c1)
 # define BYTE8_TO_CHAR(c) (c)
 # define CHAR_BYTE8_P(c) (0)
+# define MAKE_CHAR_MULTIBYTE(c) 0
+# define CHAR_LEADING_CODE(c) (c)
 #endif /* not emacs */

 #ifndef RE_TRANSLATE
@ -1665,6 +1673,8 @@ static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
 #define PATFETCH(c)							\
  do {									\
    PATFETCH_RAW (c);							\
+    if (! multibyte)							\
+      MAKE_CHAR_MULTIBYTE (c);						\
    c = TRANSLATE (c);							\
  } while (0)

@ -1917,6 +1927,54 @@ struct range_table_work_area
 #define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))


+#ifdef emacs
+
+/* It is better to implement this jumbo macro by a function, but it's
+   not that easy because macros called within it assumes various
+   variables being defined.  */
+
+#define HANDLE_UNIBYTE_RANGE(work_area, c1, c2)				\
+  do {									\
+    int char_table[257];						\
+    int i, j, c;							\
+									\
+    char_table[(c1) - 1] = -2;	/* head sentinel */			\
+    for (i = (c1); i <= (c2); i++)					\
+      char_table[i] = TRANSLATE (unibyte_char_to_multibyte (i));	\
+    char_table[i] = MAX_CHAR + 2;	/* tail sentinel */		\
+									\
+    /* As the number of data is small (at most 128) and we can expect	\
+       that data in char_table are mostly sorted, we use fairly simple	\
+       `insertion sort'.  */						\
+    for (i = (c1) + 1; i <= (c2); i++)					\
+      {									\
+	c = char_table[i];						\
+	j = i;								\
+	while (char_table[j - 1] > c)					\
+	  char_table[j] = char_table[j - 1], j--;			\
+	char_table[j] = c;						\
+      }									\
+									\
+    for (i = (c1); i <= (c2); i++)					\
+      {									\
+	c = char_table[i];						\
+	if (! IS_REAL_ASCII (c))					\
+	  break;							\
+	SET_LIST_BIT (c);						\
+      }									\
+    while (i <= (c2))							\
+      {									\
+	c = char_table[i];						\
+	for (j = i + 1; j <= (c2); j++)					\
+	  if (char_table[j] - c != j - i)				\
+	    break;							\
+	SET_RANGE_TABLE_WORK_AREA ((work_area), c, char_table[j - 1]);	\
+	i = j;								\
+      }									\
+  } while (0)
+
+#endif /* emacs */
+
 /* Get the next unsigned number in the uncompiled pattern.  */
 #define GET_UNSIGNED_NUMBER(num)					\
 do { if (p != pend)							\
@ -2264,7 +2322,7 @@ regex_compile (pattern, size, syntax, bufp)
  /* Loop through the uncompiled pattern until we're at the end.  */
  while (p != pend)
    {
-      PATFETCH (c);
+      PATFETCH_RAW (c);

      switch (c)
 	{
@ -2346,15 +2404,15 @@ regex_compile (pattern, size, syntax, bufp)
 		    if (p+1 == pend)
 		      FREE_STACK_RETURN (REG_EESCAPE);
 		    if (p[1] == '+' || p[1] == '?')
-		      PATFETCH (c); /* Gobble up the backslash.  */
+		      PATFETCH_RAW (c); /* Gobble up the backslash.  */
 		    else
 		      break;
 		  }
 		else
 		  break;
 		/* If we get here, we found another repeat character.  */
-		PATFETCH (c);
-	       }
+		PATFETCH_RAW (c);
+	      }

 	    /* Star, etc. applied to an empty pattern is equivalent
 	       to an empty pattern.  */
@ -2495,14 +2553,14 @@ regex_compile (pattern, size, syntax, bufp)

 		if (p == pend) FREE_STACK_RETURN (REG_EBRACK);

-		PATFETCH (c);
+		PATFETCH_RAW (c);

 		/* \ might escape characters inside [...] and [^...].  */
 		if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
 		  {
 		    if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);

-		    PATFETCH (c);
+		    PATFETCH_RAW (c);
 		    escaped_char = true;
 		  }
 		else
@ -2528,7 +2586,7 @@ regex_compile (pattern, size, syntax, bufp)
 		    unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
 		    const unsigned char *class_beg;

-		    PATFETCH (c);
+		    PATFETCH_RAW (c);
 		    c1 = 0;
 		    class_beg = p;

@ -2537,7 +2595,7 @@ regex_compile (pattern, size, syntax, bufp)

 		    for (;;)
 		      {
-		        PATFETCH (c);
+		        PATFETCH_RAW (c);
 		        if ((c == ':' && *p == ']') || p == pend)
 		          break;
 			if (c1 < CHAR_CLASS_MAX_LENGTH)
@ -2564,7 +2622,7 @@ regex_compile (pattern, size, syntax, bufp)

                        /* Throw away the ] at the end of the character
                           class.  */
-                        PATFETCH (c);
+                        PATFETCH_RAW (c);

                        if (p == pend) FREE_STACK_RETURN (REG_EBRACK);

@ -2573,17 +2631,20 @@ regex_compile (pattern, size, syntax, bufp)
 			   is_digit, is_cntrl, and is_xdigit, since
 			   they can only match ASCII characters.  We
 			   don't need to handle them for multibyte.
-			   They are distinguished by a negative wctype.  */
-
-			if (multibyte)
-			  SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
-							 re_wctype_to_bit (cc));
+			   They are distinguished by a negative wctype.
+			   We need this only for Emacs.  */
+#ifdef emacs
+			SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
+						       re_wctype_to_bit (cc));
+#endif

                        for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
 			  {
-			    int translated = TRANSLATE (ch);
-			    if (re_iswctype (btowc (ch), cc))
-			      SET_LIST_BIT (translated);
+			    MAKE_CHAR_MULTIBYTE (ch);
+			    ch = TRANSLATE (ch);
+			    if (IS_REAL_ASCII (ch)
+				& re_iswctype (btowc (ch), cc))
+			      SET_LIST_BIT (ch);
 			  }

 			/* Repeat the loop. */
@ -2606,35 +2667,51 @@ regex_compile (pattern, size, syntax, bufp)
 		  {

 		    /* Discard the `-'. */
-		    PATFETCH (c1);
+		    PATFETCH_RAW (c1);

 		    /* Fetch the character which ends the range. */
-		    PATFETCH (c1);
-
-		    if (SINGLE_BYTE_CHAR_P (c)
-			&& ! SINGLE_BYTE_CHAR_P (c1))
+		    PATFETCH_RAW (c1);
+#ifdef emacs
+		    if (multibyte)
 		      {
-			/* Handle a range starting with a character
-			   fitting in a bitmap to a character not
-			   fitting in a bitmap (thus require range
-			   table).  We use both a bitmap (for the
-			   range from C to 255) and a range table (for
-			   the remaining range).  Here, we setup only
-			   a range table.  A bitmap is setup later.  */
-			re_wchar_t c2
-			  = CHAR_BYTE8_P (c1) ? BYTE8_TO_CHAR (0x80) : 256;
-
-			SET_RANGE_TABLE_WORK_AREA (range_table_work, c2, c1);
-			c1 = 255;
+			c = TRANSLATE (c);
+			c1 = TRANSLATE (c1);
+			if (! IS_REAL_ASCII (c1))
+			  {
+			    SET_RANGE_TABLE_WORK_AREA (range_table_work,
+						       c, c1);
+			    c1 = 127;
+			  }
 		      }
+		    else
+		      {
+			if (! IS_REAL_ASCII (c1))
+			  {
+			    int c2 = MAX (c, 128);
+
+			    HANDLE_UNIBYTE_RANGE (range_table_work, c2, c1);
+			    c1 = 127;
+			  }
+		      }
+#endif
 		  }
 		else
-		  /* Range from C to C. */
-		  c1 = c;
+		  {
+		    /* Range from C to C. */
+		    if (! multibyte)
+		      MAKE_CHAR_MULTIBYTE (c);
+		    c = TRANSLATE (c);
+		    if (IS_REAL_ASCII (c))
+		      c1 = c;
+		    else
+		      {
+			SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c);
+			c = -1;	/* Suppress setting bitmap.  */
+		      }
+		  }

-		/* Set the range ... */
-		if (SINGLE_BYTE_CHAR_P (c))
-		  /* ... into bitmap.  */
+		/* Set the range into bitmap */
+		if (c >= 0)
 		  {
 		    re_wchar_t this_char;
 		    int range_start = c, range_end = c1;
@ -2653,9 +2730,6 @@ regex_compile (pattern, size, syntax, bufp)
 			  SET_LIST_BIT (TRANSLATE (this_char));
 		      }
 		  }
-		else
-		  /* ... into range table.  */
-		  SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
 	      }

 	    /* Discard any (non)matching list bytes that are all 0 at the
@ -2750,7 +2824,7 @@ regex_compile (pattern, size, syntax, bufp)
 		    /* Look for a special (?...) construct */
 		    if ((syntax & RE_SHY_GROUPS) && *p == '?')
 		      {
-			PATFETCH (c); /* Gobble up the '?'.  */
+			PATFETCH_RAW (c); /* Gobble up the '?'.  */
 			PATFETCH (c);
 			switch (c)
 			  {
@ -3230,10 +3304,10 @@ regex_compile (pattern, size, syntax, bufp)
 	  {
 	    int len;

-	    if (multibyte)
-	      len = CHAR_STRING (c, b);
-	    else
-	      *b = c, len = 1;
+	    if (! multibyte)
+	      MAKE_CHAR_MULTIBYTE (c);
+	    c = TRANSLATE (c);
+	    len = CHAR_STRING (c, b);
 	    b += len;
 	    (*pending_exact) += len;
 	  }
@ -3439,6 +3513,8 @@ group_in_compile_stack (compile_stack, regnum)
   bother filling it up (obviously) and only return whether the
   pattern could potentially match the empty string.

+   MULTIBYTE is always 1 for Emacs, and 0 otherwise.
+
   Return 1  if p..pend might match the empty string.
   Return 0  if p..pend matches at least one char.
   Return -1 if fastmap was not updated accurately.  */
@ -3505,14 +3581,11 @@ analyse_first (p, pend, fastmap, multibyte)

 	case exactn:
 	  if (fastmap)
-	    {
-	      int c = RE_STRING_CHAR (p + 1, pend - p);
-
-	      if (SINGLE_BYTE_CHAR_P (c))
-		fastmap[c] = 1;
-	      else
-		fastmap[p[1]] = 1;
-	    }
+	    /* If multibyte is nonzero, the first byte of each
+	       character is an ASCII or a leading code.  Otherwise,
+	       each byte is a character.  Thus, this works in both
+	       cases. */
+	    fastmap[p[1]] = 1;
 	  break;


@ -3524,14 +3597,17 @@ analyse_first (p, pend, fastmap, multibyte)


 	case charset_not:
-	  /* Chars beyond end of bitmap are possible matches.
-	     All the single-byte codes can occur in multibyte buffers.
-	     So any that are not listed in the charset
-	     are possible matches, even in multibyte buffers.  */
 	  if (!fastmap) break;
-	  for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
-	       j < (1 << BYTEWIDTH); j++)
-	    fastmap[j] = 1;
+	  {
+	    /* Chars beyond end of bitmap are possible matches.  */
+	    /* Emacs uses the bitmap only for ASCII characters.  */
+	    int limit = multibyte ? 128 : (1 << BYTEWIDTH);
+
+	    for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
+		 j < limit; j++)
+	      fastmap[j] = 1;
+	  }
+
 	  /* Fallthrough */
 	case charset:
 	  if (!fastmap) break;
@ -3542,7 +3618,7 @@ analyse_first (p, pend, fastmap, multibyte)
 	      fastmap[j] = 1;

 	  if ((not && multibyte)
-	      /* Any character set can possibly contain a character
+	      /* Any leading code can possibly start a character
 		 which doesn't match the specified set of characters.  */
 	      || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
 		  && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
@ -3562,11 +3638,10 @@ analyse_first (p, pend, fastmap, multibyte)
 	  else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
 		   && match_any_multibyte_characters == false)
 	    {
-	      /* Set fastmap[I] to 1 where I is a base leading code of each
+	      /* Set fastmap[I] to 1 where I is a leading code of each
 		 multibyte characer in the range table. */
 	      int c, count;
-	      unsigned char buf1[MAX_MULTIBYTE_LENGTH];
-	      unsigned char buf2[MAX_MULTIBYTE_LENGTH];
+	      unsigned char lc1, lc2;

 	      /* Make P points the range table.  `+ 2' is to skip flag
 		 bits for a character class.  */
@ -3578,11 +3653,11 @@ analyse_first (p, pend, fastmap, multibyte)
 		{
 		  /* Extract the start and end of each range.  */
 		  EXTRACT_CHARACTER (c, p);
-		  CHAR_STRING (c, buf1);
+		  lc1 = CHAR_LEADING_CODE (c);
 		  p += 3;
 		  EXTRACT_CHARACTER (c, p);
-		  CHAR_STRING (c, buf2);
-		  for (j = buf1[0]; j <= buf2[0]; j++)
+		  lc2 = CHAR_LEADING_CODE (c);
+		  for (j = lc1; j <= lc2; j++)
 		    fastmap[j] = 1;
 		}
 	    }
@ -3608,7 +3683,7 @@ analyse_first (p, pend, fastmap, multibyte)
 	  if (!fastmap) break;
 	  not = (re_opcode_t)p[-1] == notcategoryspec;
 	  k = *p++;
-	  for (j = 0; j < (1 << BYTEWIDTH); j++)
+	  for (j = (multibyte ? 127 : (1 << BYTEWIDTH)); j >= 0; j--)
 	    if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
 	      fastmap[j] = 1;

@ -3754,7 +3829,15 @@ re_compile_fastmap (bufp)
  bufp->fastmap_accurate = 1;	    /* It will be when we're done.  */

  analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
-			    fastmap, RE_MULTIBYTE_P (bufp));
+			    fastmap, 
+#ifdef emacs
+			    /* The compiled pattern buffer is always
+			       setup for multibyte characters.  */
+			    1
+#else
+			    0
+#endif
+			    );
  bufp->can_be_null = (analysis != 0);
  return 0;
 } /* re_compile_fastmap */
@ -3860,8 +3943,14 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
  int endpos = startpos + range;
  boolean anchored_start;

-  /* Nonzero if we have to concern multibyte character.	 */
-  const boolean multibyte = RE_MULTIBYTE_P (bufp);
+  /* Nonzero if BUFP is setup for multibyte characters.  */
+#ifdef emacs
+  const boolean multibyte = 1;
+#else 
+  const boolean multibyte = 0;
+#endif
+  /* Nonzero if STR1 and STR2 contains multibyte characters.  */
+  const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);

  /* Check for out-of-range STARTPOS.  */
  if (startpos < 0 || startpos > total_size)
@ -3950,7 +4039,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
 		 inside the loop.  */
 	      if (RE_TRANSLATE_P (translate))
 		{
-		  if (multibyte)
+		  if (target_multibyte)
 		    while (range > lim)
 		      {
 			int buf_charlen;
@ -3959,13 +4048,24 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
 							 buf_charlen);

 			buf_ch = RE_TRANSLATE (translate, buf_ch);
-			if (buf_ch >= 0400
-			    || fastmap[buf_ch])
+			if (fastmap[CHAR_LEADING_CODE (buf_ch)])
 			  break;

 			range -= buf_charlen;
 			d += buf_charlen;
 		      }
+		  else if (multibyte)
+		    while (range > lim)
+		      {
+			buf_ch = *d;
+			MAKE_CHAR_MULTIBYTE (buf_ch);
+			buf_ch = RE_TRANSLATE (translate, buf_ch);
+			if (fastmap[CHAR_LEADING_CODE (buf_ch)])
+			  break;
+
+			d++;
+			range--;
+		      }
 		  else
 		    while (range > lim
 			   && !fastmap[RE_TRANSLATE (translate, *d)])
@ -3974,6 +4074,16 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
 			range--;
 		      }
 		}
+	      else if (multibyte && ! target_multibyte)
+		{
+		  buf_ch = *d;
+		  MAKE_CHAR_MULTIBYTE (buf_ch);
+		  if (fastmap[CHAR_LEADING_CODE (buf_ch)])
+		    break;
+
+		  d++;
+		  range--;
+		}
 	      else
 		while (range > lim && !fastmap[*d])
 		  {
@ -3989,10 +4099,11 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
 			  ? size2 + size1 - startpos
 			  : size1 - startpos);
 	      buf_ch = RE_STRING_CHAR (d, room);
+	      if (! target_multibyte)
+		MAKE_CHAR_MULTIBYTE (buf_ch);
 	      buf_ch = TRANSLATE (buf_ch);

-	      if (! (buf_ch >= 0400
-		     || fastmap[buf_ch]))
+	      if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
 		goto advance;
 	    }
 	}
@ -4022,7 +4133,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
      else if (range > 0)
 	{
 	  /* Update STARTPOS to the next character boundary.  */
-	  if (multibyte)
+	  if (target_multibyte)
 	    {
 	      re_char *p = POS_ADDR_VSTRING (startpos);
 	      re_char *pend = STOP_ADDR_VSTRING (startpos);
@ -4045,7 +4156,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
 	  startpos--;

 	  /* Update STARTPOS to the previous character boundary.  */
-	  if (multibyte)
+	  if (target_multibyte)
 	    {
 	      re_char *p = POS_ADDR_VSTRING (startpos);
 	      int len = 0;
@ -4502,6 +4613,17 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
 }
 WEAK_ALIAS (__re_match_2, re_match_2)

+#ifdef emacs
+#define TARGET_CHAR_AND_LENGTH(d, len, actual_len)	\
+  (target_multibyte					\
+   ? STRING_CHAR_AND_LENGTH (d, len, actual_len)	\
+   : (actual_len = 1, unibyte_char_to_multibyte (*d)))
+#else
+#define TARGET_CHAR_AND_LENGTH(d, len, actual_len)	\
+  (actual_len = 1, *d)
+#endif
+
+
 /* This is a separate function so that we can force an alloca cleanup
   afterwards.	*/
 static int
@ -4541,8 +4663,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
  /* We use this to map every character in the string.	*/
  RE_TRANSLATE_TYPE translate = bufp->translate;

-  /* Nonzero if we have to concern multibyte character.	 */
-  const boolean multibyte = RE_MULTIBYTE_P (bufp);
+  /* Nonzero if BUFP is setup for multibyte characters.	 */
+#ifdef emacs
+  const boolean multibyte = 1;
+#else
+  const boolean multibyte = 0;
+#endif
+  /* Nonzero if STR1 and STR2 contains multibyte characters.  */
+  const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);

  /* Failure point stack.  Each place that can handle a failure further
     down the line pushes a failure point on this stack.  It consists of
@ -4907,7 +5035,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)

 		    PREFETCH ();
 		    pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
-		    buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
+		    buf_ch = TARGET_CHAR_AND_LENGTH (d, dend - d, buf_charlen);

 		    if (RE_TRANSLATE (translate, buf_ch)
 			!= pat_ch)
@ -4936,16 +5064,37 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
 	    }
 	  else
 	    {
-	      do
-		{
-		  PREFETCH ();
-		  if (*d++ != *p++)
-		    {
-		      d = dfail;
-		      goto fail;
-		    }
-		}
-	      while (--mcnt);
+	      if (multibyte == target_multibyte)
+		do
+		  {
+		    PREFETCH ();
+		    if (*d++ != *p++)
+		      {
+			d = dfail;
+			goto fail;
+		      }
+		  }
+		while (--mcnt);
+	      else		/* i.e. multibyte && ! target_multibyte */
+		do
+		  {
+		    int pat_charlen, buf_charlen;
+		    unsigned int pat_ch, buf_ch;
+
+		    PREFETCH ();
+		    pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
+		    buf_ch = TARGET_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
+
+		    if (pat_ch != buf_ch)
+		      {
+			d = dfail;
+			goto fail;
+		      }
+		    p += pat_charlen;
+		    d += buf_charlen;
+		    mcnt -= pat_charlen;
+		  }
+		while (mcnt > 0);
 	    }
 	  break;

@ -4959,7 +5108,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
 	    DEBUG_PRINT1 ("EXECUTING anychar.\n");

 	    PREFETCH ();
-	    buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
+	    buf_ch = TARGET_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
 	    buf_ch = TRANSLATE (buf_ch);

 	    if ((!(bufp->syntax & RE_DOT_NEWLINE)
@ -5003,10 +5152,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
 	      }

 	    PREFETCH ();
-	    c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
+	    c = TARGET_CHAR_AND_LENGTH (d, dend - d, len);
 	    c = TRANSLATE (c); /* The character to match.  */

-	    if (SINGLE_BYTE_CHAR_P (c))
+	    if (! multibyte || IS_REAL_ASCII (c))
 	      {			/* Lookup bitmap.  */
 		/* Cast to `unsigned' instead of `unsigned char' in
 		   case the bit list is a full 32 bytes long.  */
@ -5146,7 +5295,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
 		/* Compare that many; failure if mismatch, else move
 		   past them.  */
 		if (RE_TRANSLATE_P (translate)
-		    ? bcmp_translate (d, d2, mcnt, translate, multibyte)
+		    ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
 		    : memcmp (d, d2, mcnt))
 		  {
 		    d = dfail;
@ -5169,7 +5318,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
 	    }
 	  else
 	    {
-	      unsigned char c;
+	      unsigned c;
 	      GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
 	      if (c == '\n')
 		break;
@ -5421,6 +5570,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
 		 is the character at D, and S2 is the syntax of C2.  */
 	      re_wchar_t c1, c2;
 	      int s1, s2;
+	      int dummy;
 #ifdef emacs
 	      int offset = PTR_TO_OFFSET (d - 1);
 	      int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
@ -5432,7 +5582,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
 	      UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
 #endif
 	      PREFETCH_NOLIMIT ();
-	      c2 = RE_STRING_CHAR (d, dend - d);
+	      c2 = TARGET_CHAR_AND_LENGTH (d, dend - d, dummy);
 	      s2 = SYNTAX (c2);

 	      if (/* Case 2: Only one of S1 and S2 is Sword.  */
@ -5461,13 +5611,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
 		 is the character at D, and S2 is the syntax of C2.  */
 	      re_wchar_t c1, c2;
 	      int s1, s2;
+	      int dummy;
 #ifdef emacs
 	      int offset = PTR_TO_OFFSET (d);
 	      int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
 	      UPDATE_SYNTAX_TABLE (charpos);
 #endif
 	      PREFETCH ();
-	      c2 = RE_STRING_CHAR (d, dend - d);
+	      c2 = TARGET_CHAR_AND_LENGTH (d, dend - d, dummy);
 	      s2 = SYNTAX (c2);
 	
 	      /* Case 2: S2 is not Sword. */
@ -5505,6 +5656,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
 		 is the character at D, and S2 is the syntax of C2.  */
 	      re_wchar_t c1, c2;
 	      int s1, s2;
+	      int dummy;
 #ifdef emacs
 	      int offset = PTR_TO_OFFSET (d) - 1;
 	      int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
@ -5521,7 +5673,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
 	      if (!AT_STRINGS_END (d))
 		{
 		  PREFETCH_NOLIMIT ();
-		  c2 = RE_STRING_CHAR (d, dend - d);
+		  c2 = TARGET_CHAR_AND_LENGTH (d, dend - d, dummy);
 #ifdef emacs
 		  UPDATE_SYNTAX_TABLE_FORWARD (charpos);
 #endif
@ -5552,8 +5704,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
 	    int len;
 	    re_wchar_t c;

-	    c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
-
+	    c = TARGET_CHAR_AND_LENGTH (d, dend - d, len);
 	    if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
 	      goto fail;
 	    d += len;
@ -5589,7 +5740,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
 	    int len;
 	    re_wchar_t c;

-	    c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
+	    c = TARGET_CHAR_AND_LENGTH (d, dend - d, len);

 	    if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
 	      goto fail;
@ -5665,11 +5816,11 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
   bytes; nonzero otherwise.  */

 static int
-bcmp_translate (s1, s2, len, translate, multibyte)
+bcmp_translate (s1, s2, len, translate, target_multibyte)
     re_char *s1, *s2;
     register int len;
     RE_TRANSLATE_TYPE translate;
-     const int multibyte;
+     const int target_multibyte;
 {
  register re_char *p1 = s1, *p2 = s2;
  re_char *p1_end = s1 + len;
@ -5682,8 +5833,8 @@ bcmp_translate (s1, s2, len, translate, multibyte)
      int p1_charlen, p2_charlen;
      re_wchar_t p1_ch, p2_ch;

-      p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
-      p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
+      p1_ch = TARGET_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
+      p2_ch = TARGET_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);

      if (RE_TRANSLATE (translate, p1_ch)
 	  != RE_TRANSLATE (translate, p2_ch))