[svn] Minor improvements to fd_read_hunk.

author hniksic <devnull@localhost>

Fri, 1 Jul 2005 23:37:50 +0000 (16:37 -0700)

committer hniksic <devnull@localhost>

Fri, 1 Jul 2005 23:37:50 +0000 (16:37 -0700)
author hniksic <devnull@localhost>
Fri, 1 Jul 2005 23:37:50 +0000 (16:37 -0700)
committer hniksic <devnull@localhost>
Fri, 1 Jul 2005 23:37:50 +0000 (16:37 -0700)
diff --git a/src/ChangeLog b/src/ChangeLog

index a0fabf772e5c3fe9d39433f9b6cd63607f636137..6777851db985e6a06a895072663b9de30ce9914e 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,11 @@
+2005-07-02  Hrvoje Niksic  <hniksic@xemacs.org>
+
+       * http.c (response_head_terminator): Minor optimization.
+
+       * retr.c (fd_read_hunk): Call terminator with pointer to the start
+       of the data and the pointer to the current data.  Changed all
+       callers.
+
  2005-07-01  Hrvoje Niksic  <hniksic@xemacs.org>
  
         * url.c (url_parse): Make sure u->params is not initialized for
diff --git a/src/http.c b/src/http.c

index 4d1c16ebc917f72dbb3f94876f4abe98b9d2dbc4..e281cd5232f1f00944c2f200e6608d4ce0c79cfa 100644 (file)
--- a/src/http.c
+++ b/src/http.c
@@ -416,40 +416,51 @@ post_file (int sock, const char *file_name, wgint promised_size)
    return 0;
  }
  \f
+/* Determine whether [START, PEEKED + PEEKLEN) contains an empty line.
+   If so, return the pointer to the position after the line, otherwise
+   return NULL.  This is used as callback to fd_read_hunk.  The data
+   between START and PEEKED has been read and cannot be "unread"; the
+   data after PEEKED has only been peeked.  */
+
  static const char *
-response_head_terminator (const char *hunk, int oldlen, int peeklen)
+response_head_terminator (const char *start, const char *peeked, int peeklen)
  {
-  const char *start, *end;
+  const char *p, *end;
  
    /* If at first peek, verify whether HUNK starts with "HTTP".  If
       not, this is a HTTP/0.9 request and we must bail out without
       reading anything.  */
-  if (oldlen == 0 && 0 != memcmp (hunk, "HTTP", MIN (peeklen, 4)))
-    return hunk;
-
-  if (oldlen < 4)
-    start = hunk;
-  else
-    start = hunk + oldlen - 4;
-  end = hunk + oldlen + peeklen;
-
-  for (; start < end - 1; start++)
-    if (*start == '\n')
+  if (start == peeked && 0 != memcmp (start, "HTTP", MIN (peeklen, 4)))
+    return start;
+
+  /* Look for "\n[\r]\n", and return the following position if found.
+     Start two chars before the current to cover the possibility that
+     part of the terminator (e.g. "\n\r") arrived in the previous
+     batch.  */
+  p = peeked - start < 2 ? start : peeked - 2;
+  end = peeked + peeklen;
+
+  /* Check for \n\r\n or \n\n anywhere in [p, end-2). */
+  for (; p < end - 2; p++)
+    if (*p == '\n')
        {
-       if (start < end - 2
-           && start[1] == '\r'
-           && start[2] == '\n')
-         return start + 3;
-       if (start[1] == '\n')
-         return start + 2;
+       if (p[1] == '\r' && p[2] == '\n')
+         return p + 3;
+       else if (p[1] == '\n')
+         return p + 2;
        }
+  /* p==end-2: check for \n\n directly preceding END. */
+  if (p[0] == '\n' && p[1] == '\n')
+    return p + 2;
+
    return NULL;
  }
  
-/* The maximum size of a single HTTP response we care to read.  This
-   is not meant to impose an arbitrary limit, but to protect the user
-   from Wget slurping up available memory upon encountering malicious
-   or buggy server output.  Define it to 0 to remove the limit.  */
+/* The maximum size of a single HTTP response we care to read.  Rather
+   than being a limit of the reader implementation, this limit
+   prevents Wget from slurping all available memory upon encountering
+   malicious or buggy server output, thus protecting the user.  Define
+   it to 0 to remove the limit.  */
  
  #define HTTP_RESPONSE_MAX_SIZE 65536
  
diff --git a/src/retr.c b/src/retr.c

index 450619360b639bdac49a4353a7c8a52ee32e01e0..918fb5de6df611a4b328d501925a2fa8f11ac2fe 100644 (file)
--- a/src/retr.c
+++ b/src/retr.c
@@ -336,22 +336,35 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
    return ret;
  }
  \f
-/* Read a hunk of data from FD, up until a terminator.  The terminator
-   is whatever the TERMINATOR function determines it to be; for
-   example, it can be a line of data, or the head of an HTTP response.
-   The function returns the data read allocated with malloc.
-
-   In case of error, NULL is returned.  In case of EOF and no data
-   read, NULL is returned and errno set to 0.  In case of EOF with
-   data having been read, the data is returned, but it will
-   (obviously) not contain the terminator.
+/* Read a hunk of data from FD, up until a terminator.  The hunk is
+   limited by whatever the TERMINATOR callback chooses as its
+   terminator.  For example, if terminator stops at newline, the hunk
+   will consist of a line of data; if terminator stops at two
+   newlines, it can be used to read the head of an HTTP response.
+   Upon determining the boundary, the function returns the data (up to
+   the terminator) in malloc-allocated storage.
+
+   In case of read error, NULL is returned.  In case of EOF and no
+   data read, NULL is returned and errno set to 0.  In case of having
+   read some data, but encountering EOF before seeing the terminator,
+   the data that has been read is returned, but it will (obviously)
+   not contain the terminator.
+
+   The TERMINATOR function is called with three arguments: the
+   beginning of the data read so far, the beginning of the current
+   block of peeked-at data, and the length of the current block.
+   Depending on its needs, the function is free to choose whether to
+   analyze all data or just the newly arrived data.  If TERMINATOR
+   returns NULL, it means that the terminator has not been seen.
+   Otherwise it should return a pointer to the character immediately
+   following the terminator.
  
     The idea is to be able to read a line of input, or otherwise a hunk
     of text, such as the head of an HTTP request, without crossing the
     boundary, so that the next call to fd_read etc. reads the data
     after the hunk.  To achieve that, this function does the following:
  
-   1. Peek at available data.
+   1. Peek at incoming data.
  
     2. Determine whether the peeked data, along with the previously
        read data, includes the terminator.
@@ -396,12 +409,13 @@ fd_read_hunk (int fd, hunk_terminator_t terminator, long sizehint, long maxsize)
           xfree (hunk);
           return NULL;
         }
-      end = terminator (hunk, tail, pklen);
+      end = terminator (hunk, hunk + tail, pklen);
        if (end)
         {
           /* The data contains the terminator: we'll drain the data up
              to the end of the terminator.  */
           remain = end - (hunk + tail);
+         assert (remain >= 0);
           if (remain == 0)
             {
               /* No more data needs to be read. */
@@ -471,11 +485,11 @@ fd_read_hunk (int fd, hunk_terminator_t terminator, long sizehint, long maxsize)
  }
  
  static const char *
-line_terminator (const char *hunk, int oldlen, int peeklen)
+line_terminator (const char *start, const char *peeked, int peeklen)
  {
-  const char *p = memchr (hunk + oldlen, '\n', peeklen);
+  const char *p = memchr (peeked, '\n', peeklen);
    if (p)
-    /* p+1 because we want the line to include '\n' */
+    /* p+1 because the line must include '\n' */
      return p + 1;
    return NULL;
  }
diff --git a/src/retr.h b/src/retr.h

index 305ced4f94bee212d871a750144e2eb26dae48e0..441471ac14ea11fdf7905f956e71d7a3a0bb4e11 100644 (file)
--- a/src/retr.h
+++ b/src/retr.h
@@ -45,7 +45,7 @@ enum {
  
  int fd_read_body (int, FILE *, wgint, wgint, wgint *, wgint *, double *, int);
  
-typedef const char *(*hunk_terminator_t) (const char *, int, int);
+typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
  
  char *fd_read_hunk (int, hunk_terminator_t, long, long);
  char *fd_read_line (int);
author	hniksic <devnull@localhost>
	Fri, 1 Jul 2005 23:37:50 +0000 (16:37 -0700)
committer	hniksic <devnull@localhost>
	Fri, 1 Jul 2005 23:37:50 +0000 (16:37 -0700)
src/ChangeLog		patch \| blob \| history
src/http.c		patch \| blob \| history
src/retr.c		patch \| blob \| history
src/retr.h		patch \| blob \| history