From xemacs-m  Wed Feb 26 18:13:02 1997
Received: from mailbox1.ucsd.edu (mailbox1.ucsd.edu [132.239.1.53])
	by xemacs.org (8.8.5/8.8.5) with ESMTP id SAA18675
	for <xemacs-beta@xemacs.org>; Wed, 26 Feb 1997 18:12:53 -0600 (CST)
Received: from sdnp5.ucsd.edu (sdnp5.ucsd.edu [132.239.79.10]) by mailbox1.ucsd.edu (8.8.5/8.6.9) with SMTP id QAA28353 for <xemacs-beta@xemacs.org>; Wed, 26 Feb 1997 16:12:49 -0800 (PST)
Received: by sdnp5.ucsd.edu (SMI-8.6/SMI-SVR4)
	id QAA08638; Wed, 26 Feb 1997 16:15:10 -0800
Sender: dmoore@sdnp5.ucsd.edu
To: XEmacs Beta Mailing List <xemacs-beta@xemacs.org>
Subject: regex.c patch
X-Face: "oX;zS#-JU$-,WKSzG.1gGE]x^cIg!hW.dq>.f6pzS^A+(k!T|M:}5{_%>Io<>L&{hO7W4cicOQ|>/lZ1G(m%7iaCf,6Qgk0%%Bz7b2-W3jd0m_UG\Y;?]}4s0O-U)uox>P3JN)9cm]O\@,vy2e{`3pb!"pqmRy3peB90*2L
Mail-Copies-To: never
Mime-Version: 1.0 (generated by tm-edit 7.105)
Content-Type: text/plain; charset=US-ASCII
From: David Moore <dmoore@ucsd.edu>
Date: 26 Feb 1997 16:15:09 -0800
Message-ID: <rvwwrv3y0i.fsf@sdnp5.ucsd.edu>
Lines: 199
X-Mailer: Gnus v5.4.8/XEmacs 20.1


This patch fixes some MULE-specific bugs in regex.c and at least one
non-MULE-specific bug.  It's not as efficient as possible; some of the
INC_CHARPTR and charcount_to_bytecount calls could probably be combined
to do the check just once, if someone so wished.

It fixes all the regexp bugs I've seen reported recently.  This is also
the first time I've run Gnus under MULE, and that still seems to be
working.

--- regex.c.orig	Wed Feb 26 16:04:21 1997
+++ regex.c	Wed Feb 26 16:03:47 1997
@@ -3775,12 +3775,19 @@
 /* Using the compiled pattern in BUFP->buffer, first tries to match the
    virtual concatenation of STRING1 and STRING2, starting first at index
    STARTPOS, then at STARTPOS + 1, and so on.
+
+   With MULE, STARTPOS is a byte position, not a char position.  And the
+   search will increment STARTPOS by the width of the current leading
+   character.
    
    STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
    
    RANGE is how far to scan while trying to match.  RANGE = 0 means try
    only at STARTPOS; in general, the last start tried is STARTPOS +
    RANGE.
+
+   With MULE, RANGE is a byte count, not a char count.  The last start
+   tried is the last character boundary <= STARTPOS + RANGE.
    
    In REGS, return the indices of the virtual concatenation of STRING1
    and STRING2 that matched the entire BUFP->buffer and its contained
@@ -3813,8 +3820,13 @@
     
   /* Fix up RANGE if it might eventually take us outside
      the virtual concatenation of STRING1 and STRING2.  */
+#if 0
   if (endpos < -1)
     range = -1 - startpos;
+#else
+  if (endpos < 0)
+    range = 0 - startpos;
+#endif
   else if (endpos > total_size)
     range = total_size - startpos;
 
@@ -3862,25 +3874,33 @@
 	  /* whose stupid idea was it anyway to make this
 	     function take two strings to match?? */
 	  int lim = 0;
-	  unsigned char *p;
+	  register CONST unsigned char *d;
 	  int irange = range;
+
 	  if (startpos < size1 && startpos + range >= size1)
 	    lim = range - (size1 - startpos);
 
-	  p = ((unsigned char *)
-	       &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
-	  p--;
+	  d = ((CONST unsigned char *)
+	       (startpos >= size1 ? string2 - size1 : string1) + startpos);
+	  DEC_CHARPTR(d);
 
 	  if (translate)
-	    {
-	      while (range > lim && translate[*p++] != '\n')
-		range--;
-	    }
+#ifdef MULE
+	    while (range > lim && (*d >= 0x80 || translate[*d] != '\n'))
+#else
+	    while (range > lim && translate[*d] != '\n')
+#endif
+	      {
+		INC_CHARPTR(d);
+		range -= charcount_to_bytecount (d, 1);
+	      }
 	  else
-	    {
-	      while (range > lim && *p++ != '\n')
-		range--;
-	    }
+	    while (range > lim && *d != '\n')
+	      {
+		INC_CHARPTR(d);
+		range -= charcount_to_bytecount (d, 1);
+	      }
+
 	  startpos += irange - range;
 	}
 #endif /* REGEX_BEGLINE_CHECK */
@@ -3893,35 +3913,47 @@
 	{
 	  if (range > 0)	/* Searching forwards.  */
 	    {
-	      register CONST char *d;
+	      register CONST unsigned char *d;
 	      register int lim = 0;
 	      int irange = range;
 
               if (startpos < size1 && startpos + range >= size1)
                 lim = range - (size1 - startpos);
 
-	      d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
+	      d = ((CONST unsigned char *)
+		   (startpos >= size1 ? string2 - size1 : string1) + startpos);
    
               /* Written out as an if-else to avoid testing `translate'
                  inside the loop.  */
 	      if (translate)
-                while (range > lim
-                       && !fastmap[(unsigned char)
-				   translate[(unsigned char) *d++]])
-                  range--;
+#ifdef MULE
+                while (range > lim && *d < 0x80 && !fastmap[translate[*d]])
+#else
+                while (range > lim && !fastmap[translate[*d]])
+#endif
+		  {
+		    range -= charcount_to_bytecount (d, 1);
+		    INC_CHARPTR(d);
+		  }
 	      else
-                while (range > lim && !fastmap[(unsigned char) *d++])
-                  range--;
+                while (range > lim && !fastmap[*d])
+		  {
+		    range -= charcount_to_bytecount (d, 1);
+		    INC_CHARPTR(d);
+		  }
 
 	      startpos += irange - range;
 	    }
 	  else				/* Searching backwards.  */
 	    {
-	      register char c = (size1 == 0 || startpos >= size1
-                                 ? string2[startpos - size1] 
-                                 : string1[startpos]);
-
+	      register unsigned char c = (size1 == 0 || startpos >= size1
+					  ? string2[startpos - size1] 
+					  : string1[startpos]);
+#ifdef MULE
+	      if (c < 0x80 && !fastmap[(unsigned char) TRANSLATE (c)])
+#else
 	      if (!fastmap[(unsigned char) TRANSLATE (c)])
+#endif
 		goto advance;
 	    }
 	}
@@ -3951,17 +3983,28 @@
 
     advance:
       if (!range) 
-        break;
-      else if (range > 0) 
-        {
-          range--; 
-          startpos++;
-        }
-      else
-        {
-          range++; 
-          startpos--;
-        }
+	break;
+      else {
+	register CONST unsigned char *d;
+	Charcount d_size;
+
+	d = ((CONST unsigned char *)
+	     (startpos >= size1 ? string2 - size1 : string1) + startpos);
+
+	if (range > 0) 
+	  {
+	    d_size = charcount_to_bytecount (d, 1);
+	    range -= d_size;
+	    startpos += d_size;
+	  }
+	else
+	  {
+	    DEC_CHARPTR(d);
+	    d_size = charcount_to_bytecount (d, 1);
+	    range += d_size;
+	    startpos -= d_size;
+	  }
+      }
     }
   return -1;
 } /* re_search_2 */


-- 
David Moore <dmoore@ucsd.edu>       | Computer Systems Lab      __o
UCSD Dept. Computer Science - 0114  | Work: (619) 534-8604    _ \<,_
La Jolla, CA 92093-0114             | Fax:  (619) 534-1445   (_)/ (_)
<URL:http://oj.egbt.org/dmoore/>    | In a cloud bones of steel.

