[svn] Update copyright blurbs with the year 2000.

[wget] / src / url.c
diff --git a/src/url.c b/src/url.c

index b00484e4feaa9406ded4feba51b7b8c4be065542..6b2423a7781736f3cc4a95ea054c2c6e4ace8079 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -1,12 +1,12 @@
  /* URL handling.
-   Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
+   Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
  
  This file is part of Wget.
  
  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
  
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -53,14 +53,18 @@ extern int errno;
  
  /* A list of unsafe characters for encoding, as per RFC1738.  '@' and
     ':' (not listed in RFC) were added because of user/password
-   encoding, and \033 for safe printing.  */
+   encoding.  */
  
  #ifndef WINDOWS
-# define URL_UNSAFE " <>\"#%{}|\\^~[]`@:\033"
+# define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
  #else  /* WINDOWS */
-# define URL_UNSAFE " <>\"%{}|\\^[]`\033"
+# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
  #endif /* WINDOWS */
  
+#define UNSAFE_CHAR(c) (   ((unsigned char)(c) <= ' ')  /* controls, SPC */ \
+                       || ((unsigned char)(c) >  '~')  /* DEL and 8-bit */ \
+                       || strchr (URL_UNSAFE_CHARS, (c))) /* listed chars */
+
  /* If S contains unsafe characters, free it and replace it with a
     version that doesn't.  */
  #define URL_CLEANSE(s) do                      \
@@ -78,6 +82,11 @@ extern int errno;
  /* Is a directory ".."?  */
  #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  
+#if 0
+static void path_simplify_with_kludge PARAMS ((char *));
+#endif
+static int urlpath_length PARAMS ((const char *));
+
  /* NULL-terminated list of strings to be recognized as prototypes (URL
     schemes).  Note that recognized doesn't mean supported -- only HTTP
     and FTP are currently supported.
@@ -154,9 +163,9 @@ skip_url (const char *url)
  {
    int i;
  
-  if (toupper (url[0]) == 'U'
-      && toupper (url[1]) == 'R'
-      && toupper (url[2]) == 'L'
+  if (TOUPPER (url[0]) == 'U'
+      && TOUPPER (url[1]) == 'R'
+      && TOUPPER (url[2]) == 'L'
        && url[3] == ':')
      {
        /* Skip blanks.  */
@@ -172,7 +181,7 @@ int
  contains_unsafe (const char *s)
  {
    for (; *s; s++)
-    if (strchr (URL_UNSAFE, *s))
+    if (UNSAFE_CHAR (*s))
        return 1;
    return 0;
  }
@@ -209,8 +218,8 @@ decode_string (char *s)
    *p = '\0';
  }
  
-/* Encodes the unsafe characters (listed in URL_UNSAFE) in a given
-   string, returning a malloc-ed %XX encoded string.  */
+/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
+   given string, returning a malloc-ed %XX encoded string.  */
  char *
  encode_string (const char *s)
  {
@@ -220,12 +229,12 @@ encode_string (const char *s)
  
    b = s;
    for (i = 0; *s; s++, i++)
-    if (strchr (URL_UNSAFE, *s))
+    if (UNSAFE_CHAR (*s))
        i += 2; /* Two more characters (hex digits) */
    res = (char *)xmalloc (i + 1);
    s = b;
    for (p = res; *s; s++)
-    if (strchr (URL_UNSAFE, *s))
+    if (UNSAFE_CHAR (*s))
        {
         const unsigned char c = *s;
         *p++ = '%';
@@ -396,7 +405,7 @@ parseurl (const char *url, struct urlinfo *u, int strict)
      }
    /* If protocol is recognizable, but unsupported, bail out, else
       suppose unknown.  */
-  if (recognizable && !sup_protos[i].name)
+  if (recognizable && i == ARRAY_SIZE (sup_protos))
      return URLUNKNOWN;
    else if (i == ARRAY_SIZE (sup_protos))
      type = URLUNKNOWN;
@@ -458,28 +467,13 @@ parseurl (const char *url, struct urlinfo *u, int strict)
    if (type == URLHTTP)
      while (url[i] && url[i] == '/')
        ++i;
-
-  /* dfb: break "path" into "path" and "qstring" if the URL is HTTP 
-     if it's not an HTTP url, set l to the last character, so the 
-     xmalloc and strncpy work as desired */
-  if (type == URLHTTP) {
-    for (l = i; url[l] && url[l] != '?'; l++);
-    if (l != strlen(url)) {
-      /* copy the query string, including the '?' into u->qstring */
-      u->qstring = (char *)xmalloc (strlen (url + l) + 8);
-      strcpy (u->qstring, url + l);
-    }
-  } else {
-    l = strlen(url);
-  }
-  
-
-  u->path = strdupdelim (url + i, url + l);
+  u->path = (char *)xmalloc (strlen (url + i) + 8);
+  strcpy (u->path, url + i);
    if (type == URLFTP)
      {
        u->ftp_type = process_ftp_type (u->path);
        /* #### We don't handle type `d' correctly yet.  */
-      if (!u->ftp_type || toupper (u->ftp_type) == 'D')
+      if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
         u->ftp_type = 'I';
      }
    DEBUGP (("opath %s -> ", u->path));
@@ -495,8 +489,6 @@ parseurl (const char *url, struct urlinfo *u, int strict)
    /* Parse the directory.  */
    parse_dir (u->path, &u->dir, &u->file);
    DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
-  if (type == URLHTTP && u->qstring) 
-    DEBUGP (("query-string %s -> ", u->qstring));
    /* Simplify the directory.  */
    path_simplify (u->dir);
    /* Remove the leading `/' in HTTP.  */
@@ -516,11 +508,18 @@ parseurl (const char *url, struct urlinfo *u, int strict)
    strcat (u->path, *u->dir ? "/" : "");
    strcat (u->path, u->file);
    URL_CLEANSE (u->path);
+  DEBUGP (("newpath: %s\n", u->path));
    /* Create the clean URL.  */
    u->url = str_url (u, 0);
    return URLOK;
  }
  \f
+/* Variants of DOTP and DDOTP for parse_dir(): X is "." (resp. "..")
+   when what follows is the end of the string or a '?' (query).  */
+#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
+#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.')      \
+                    && (!*((x) + 2) || *((x) + 2) == '?'))
+
  /* Build the directory and filename components of the path.  Both
     components are *separately* malloc-ed strings!  It does not change
     the contents of path.
@@ -532,13 +531,16 @@ parse_dir (const char *path, char **dir, char **file)
  {
    int i, l;
  
-  for (i = l = strlen (path); i && path[i] != '/'; i--);
+  l = urlpath_length (path);
+  for (i = l; i && path[i] != '/'; i--);
+
    if (!i && *path != '/')   /* Just filename */
      {
-      if (DOTP (path) || DDOTP (path))
+      if (PD_DOTP (path) || PD_DDOTP (path))
         {
-         *dir = xstrdup (path);
-         *file = xstrdup ("");
+         *dir = strdupdelim (path, path + l);
+         *file = xstrdup (path + l); /* normally empty, but could
+                                         contain ?... */
         }
        else
         {
@@ -548,10 +550,11 @@ parse_dir (const char *path, char **dir, char **file)
      }
    else if (!i)                 /* /filename */
      {
-      if (DOTP (path + 1) || DDOTP (path + 1))
+      if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
         {
-         *dir = xstrdup (path);
-         *file = xstrdup ("");
+         *dir = strdupdelim (path, path + l);
+         *file = xstrdup (path + l); /* normally empty, but could
+                                         contain ?... */
         }
        else
         {
@@ -561,15 +564,16 @@ parse_dir (const char *path, char **dir, char **file)
      }
    else /* Nonempty directory with or without a filename */
      {
-      if (DOTP (path + i + 1) || DDOTP (path + i + 1))
+      if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
         {
-         *dir = xstrdup (path);
-         *file = xstrdup ("");
+         *dir = strdupdelim (path, path + l);
+         *file = xstrdup (path + l); /* normally empty, but could
+                                         contain ?... */
         }
        else
         {
           *dir = strdupdelim (path, path + i);
-         *file = strdupdelim (path + i + 1, path + l + 1);
+         *file = xstrdup (path + i + 1);
         }
      }
  }
@@ -635,15 +639,16 @@ process_ftp_type (char *path)
      return '\0';
  }
  \f
-/* Return the URL as fine-formed string, with a proper protocol, port
-   number, directory and optional user/password.  If HIDE is non-zero,
-   password will be hidden.  The forbidden characters in the URL will
-   be cleansed.  */
+/* Return the URL as fine-formed string, with a proper protocol,
+   optional port number, directory and optional user/password.  If
+   HIDE is non-zero, password will be hidden.  The forbidden
+   characters in the URL will be cleansed.  */
  char *
  str_url (const struct urlinfo *u, int hide)
  {
    char *res, *host, *user, *passwd, *proto_name, *dir, *file;
-  int i, l, ln, lu, lh, lp, lf, ld, lq;
+  int i, l, ln, lu, lh, lp, lf, ld;
+  unsigned short proto_default_port;
  
    /* Look for the protocol name.  */
    for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
@@ -652,6 +657,7 @@ str_url (const struct urlinfo *u, int hide)
    if (i == ARRAY_SIZE (sup_protos))
      return NULL;
    proto_name = sup_protos[i].name;
+  proto_default_port = sup_protos[i].port;
    host = CLEANDUP (u->host);
    dir = CLEANDUP (u->dir);
    file = CLEANDUP (u->file);
@@ -670,7 +676,7 @@ str_url (const struct urlinfo *u, int hide)
      {
        char *tmp = (char *)xmalloc (strlen (dir) + 3);
        /*sprintf (tmp, "%%2F%s", dir + 1);*/
-      *tmp = '%';
+      tmp[0] = '%';
        tmp[1] = '2';
        tmp[2] = 'F';
        strcpy (tmp + 3, dir + 1);
@@ -684,8 +690,7 @@ str_url (const struct urlinfo *u, int hide)
    lh = strlen (host);
    ld = strlen (dir);
    lf = strlen (file);
-  lq = (u->proto == URLHTTP && u->qstring) ? strlen (u->qstring) : 0;
-  res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + lq + 20); /* safe sex */
+  res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
    /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
       (user ? user : ""), (passwd ? ":" : ""),
       (passwd ? passwd : ""), (user ? "@" : ""),
@@ -707,24 +712,21 @@ str_url (const struct urlinfo *u, int hide)
      }
    memcpy (res + l, host, lh);
    l += lh;
-  res[l++] = ':';
-  long_to_string (res + l, (long)u->port);
-  l += numdigit (u->port);
+  if (u->port != proto_default_port)
+    {
+      res[l++] = ':';
+      long_to_string (res + l, (long)u->port);
+      l += numdigit (u->port);
+    }
    res[l++] = '/';
    memcpy (res + l, dir, ld);
    l += ld;
    if (*dir)
      res[l++] = '/';
    strcpy (res + l, file);
-  l += lf;
    free (host);
    free (dir);
    free (file);
-  if (u->qstring)
-    {
-      /* copy in the raw query string to avoid munging arguments */
-      memcpy (res + l, u->qstring, lq);
-    }
    FREE_MAYBE (user);
    FREE_MAYBE (passwd);
    return res;
@@ -845,7 +847,8 @@ get_urls_file (const char *file)
  
     If SILENT is non-zero, do not barf on baseless relative links.  */
  urlpos *
-get_urls_html (const char *file, const char *this_url, int silent)
+get_urls_html (const char *file, const char *this_url, int silent,
+              int dash_p_leaf_HTML)
  {
    long nread;
    FILE *fp;
@@ -874,7 +877,8 @@ get_urls_html (const char *file, const char *this_url, int silent)
    first_time = 1;
    /* Iterate over the URLs in BUF, picked by htmlfindurl().  */
    for (buf = orig_buf;
-       (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time));
+       (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time,
+                          dash_p_leaf_HTML));
         buf += step)
      {
        int i, no_proto;
@@ -882,6 +886,7 @@ get_urls_html (const char *file, const char *this_url, int silent)
        const char *pbuf = buf;
        char *constr, *base;
        const char *cbase;
+      char *needs_freeing, *url_data;
  
        first_time = 0;
  
@@ -902,16 +907,27 @@ get_urls_html (const char *file, const char *this_url, int silent)
        if (!size)
         break;
  
+      /* It would be nice if we could avoid allocating memory in this
+         loop, but I don't see an easy way.  To process the entities,
+         we need to either copy the data, or change it destructively.
+         I choose the former.
+
+        We have two pointers: needs_freeing and url_data, because the
+        code below does things like url_data += <something>, and we
+        want to pass the original string to free(). */
+      needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
+      size = strlen (url_data);
+
        for (i = 0; protostrings[i]; i++)
         {
-         if (!strncasecmp (protostrings[i], pbuf,
+         if (!strncasecmp (protostrings[i], url_data,
                             MINVAL (strlen (protostrings[i]), size)))
             break;
         }
        /* Check for http:RELATIVE_URI.  See below for details.  */
        if (protostrings[i]
-         && !(strncasecmp (pbuf, "http:", 5) == 0
-              && strncasecmp (pbuf, "http://", 7) != 0))
+         && !(strncasecmp (url_data, "http:", 5) == 0
+              && strncasecmp (url_data, "http://", 7) != 0))
         {
           no_proto = 0;
         }
@@ -922,20 +938,23 @@ get_urls_html (const char *file, const char *this_url, int silent)
              relative URI-s as <a href="http:URL">.  Just strip off the
              silly leading "http:" (as well as any leading blanks
              before it).  */
-         if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
-           pbuf += 5, size -= 5;
+         if ((size > 5) && !strncasecmp ("http:", url_data, 5))
+           url_data += 5, size -= 5;
         }
        if (!no_proto)
         {
           for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
             {
-             if (!strncasecmp (sup_protos[i].name, pbuf,
+             if (!strncasecmp (sup_protos[i].name, url_data,
                                MINVAL (strlen (sup_protos[i].name), size)))
                 break;
             }
           /* Do *not* accept a non-supported protocol.  */
           if (i == ARRAY_SIZE (sup_protos))
-           continue;
+           {
+             free (needs_freeing);
+             continue;
+           }
         }
        if (no_proto)
         {
@@ -958,13 +977,14 @@ get_urls_html (const char *file, const char *this_url, int silent)
                   /* Use malloc, not alloca because this is called in
                       a loop. */
                   char *temp = (char *)malloc (size + 1);
-                 strncpy (temp, pbuf, size);
+                 strncpy (temp, url_data, size);
                   temp[size] = '\0';
                   logprintf (LOG_NOTQUIET,
                              _("Error (%s): Link %s without a base provided.\n"),
                              file, temp);
                   free (temp);
                 }
+             free (needs_freeing);
               continue;
             }
           if (this_url)
@@ -979,17 +999,18 @@ get_urls_html (const char *file, const char *this_url, int silent)
                   logprintf (LOG_NOTQUIET, _("\
  Error (%s): Base %s relative, without referer URL.\n"),
                              file, cbase);
+                 free (needs_freeing);
                   continue;
                 }
               base = xstrdup (cbase);
             }
-         constr = construct (base, pbuf, size, no_proto);
+         constr = construct (base, url_data, size, no_proto);
           free (base);
         }
        else /* has proto */
         {
           constr = (char *)xmalloc (size + 1);
-         strncpy (constr, pbuf, size);
+         strncpy (constr, url_data, size);
           constr[size] = '\0';
         }
  #ifdef DEBUG
@@ -1001,7 +1022,7 @@ Error (%s): Base %s relative, without referer URL.\n"),
           tmp2 = html_base ();
           /* Use malloc, not alloca because this is called in a loop. */
           tmp = (char *)xmalloc (size + 1);
-         strncpy (tmp, pbuf, size);
+         strncpy (tmp, url_data, size);
           tmp[size] = '\0';
           logprintf (LOG_ALWAYS,
                      "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
@@ -1022,14 +1043,15 @@ Error (%s): Base %s relative, without referer URL.\n"),
        memset (current, 0, sizeof (*current));
        current->next = NULL;
        current->url = constr;
-      current->size = size;
-      current->pos = pbuf - orig_buf;
+      current->size = step;
+      current->pos = buf - orig_buf;
        /* A URL is relative if the host and protocol are not named,
          and the name does not start with `/'.  */
-      if (no_proto && *pbuf != '/')
+      if (no_proto && *url_data != '/')
         current->flags |= (URELATIVE | UNOPROTO);
        else if (no_proto)
         current->flags |= UNOPROTO;
+      free (needs_freeing);
      }
    free (orig_buf);
  
@@ -1279,8 +1301,28 @@ url_filename (const struct urlinfo *u)
    return name;
  }
  
+/* Length of URL up to, but not including, the first '?' (if any).  */
+static int
+urlpath_length (const char *url)
+{
+  const char *p = url;
+  while (*p && *p != '?')
+    ++p;
+  return p - url;
+}
+
+/* Return the last C in [B, E], or NULL.  B itself is examined too.  */
+static const char *
+find_last_char (const char *b, const char *e, char c)
+{
+  while (e > b && *e != c)
+    e--;
+  return (e >= b && *e == c) ? e : NULL;
+}
+
  /* Construct an absolute URL, given a (possibly) relative one.  This
-   is more tricky than it might seem, but it works.  */
+   gets tricky if you want to cover all the "reasonable" cases, but
+   I'm satisfied with the result.  */
  static char *
  construct (const char *url, const char *sub, int subsize, int no_proto)
  {
@@ -1288,65 +1330,126 @@ construct (const char *url, const char *sub, int subsize, int no_proto)
  
    if (no_proto)
      {
-      int i;
+      const char *end = url + urlpath_length (url);
  
        if (*sub != '/')
         {
-         for (i = strlen (url); i && url[i] != '/'; i--);
-         if (!i || (url[i] == url[i - 1]))
+         /* SUB is a relative URL: we need to replace everything
+            after last slash (possibly empty) with SUB.
+
+            So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
+            our result should be "whatever/foo/qux/xyzzy".  */
+         int need_explicit_slash = 0;
+         int span;
+         const char *start_insert;
+         const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */
+         if (!last_slash)
             {
-             int l = strlen (url);
-             char *t = (char *)alloca (l + 2);
-             strcpy (t, url);
-             t[l] = '/';
-             t[l + 1] = '\0';
-             url = t;
-             i = l;
+             /* No slash found at all.  Append SUB to what we have,
+                but we'll need a slash as a separator.
+
+                Example: if url == "foo" and sub == "qux/xyzzy", then
+                we cannot just append sub to url, because we'd get
+                "fooqux/xyzzy", whereas what we want is
+                "foo/qux/xyzzy".
+
+                To make sure the / gets inserted, we set
+                need_explicit_slash to 1.  We also set start_insert
+                to end + 1, so that the length calculations work out
+                correctly for one more (slash) character.  Accessing
+                that character is fine, since it will be the
+                delimiter, '\0' or '?'.  */
+             /* example: "foo?..." */
+             /*               ^    ('?' gets changed to '/') */
+             start_insert = end + 1;
+             need_explicit_slash = 1;
             }
-         constr = (char *)xmalloc (i + 1 + subsize + 1);
-         strncpy (constr, url, i + 1);
-         constr[i + 1] = '\0';
-         strncat (constr, sub, subsize);
+         else
+           {
+             /* example: "whatever/foo/bar" */
+             /*                        ^    */
+             start_insert = last_slash + 1;
+           }
+
+         span = start_insert - url;
+         constr = (char *)xmalloc (span + subsize + 1);
+         if (span)
+           memcpy (constr, url, span);
+         if (need_explicit_slash)
+           constr[span - 1] = '/';
+         if (subsize)
+           memcpy (constr + span, sub, subsize);
+         constr[span + subsize] = '\0';
         }
        else /* *sub == `/' */
         {
-         int fl;
-
-         i = 0;
-         do
-           {
-             for (; url[i] && url[i] != '/'; i++);
-             if (!url[i])
-               break;
-             fl = (url[i] == url[i + 1] && url[i + 1] == '/');
-             if (fl)
-               i += 2;
-           }
-         while (fl);
-         if (!url[i])
-           {
-             int l = strlen (url);
-             char *t = (char *)alloca (l + 2);
-             strcpy (t, url);
-             t[l] = '/';
-             t[l + 1] = '\0';
-             url = t;
-           }
-         constr = (char *)xmalloc (i + 1 + subsize + 1);
-         strncpy (constr, url, i);
-         constr[i] = '\0';
-         strncat (constr + i, sub, subsize);
-         constr[i + subsize] = '\0';
-       } /* *sub == `/' */
+         /* SUB is an absolute path: we need to replace everything
+             after (and including) the FIRST slash with SUB.
+
+            So, if URL is "http://host/whatever/foo/bar", and SUB is
+            "/qux/xyzzy", our result should be
+            "http://host/qux/xyzzy".  */
+         int span;
+         const char *slash, *start_insert;
+         const char *pos = url;
+         int seen_slash_slash = 0;
+         /* We're looking for the first slash, but want to ignore
+             double slash. */
+       again:
+         slash = memchr (pos, '/', end - pos);
+         if (slash && !seen_slash_slash)
+           if (*(slash + 1) == '/')
+             {
+               pos = slash + 2;
+               seen_slash_slash = 1;
+               goto again;
+             }
+
+         /* At this point, SLASH is the location of the first / after
+            "//", or the first slash altogether.  START_INSERT is the
+            pointer to the location where SUB will be inserted.  When
+            examining the last two examples, keep in mind that SUB
+            begins with '/'. */
+
+         if (!slash && !seen_slash_slash)
+           /* example: "foo" */
+           /*           ^    */
+           start_insert = url;
+         else if (!slash && seen_slash_slash)
+           /* example: "http://foo" */
+           /*                     ^ */
+           start_insert = end;
+         else if (slash && !seen_slash_slash)
+           /* example: "foo/bar" */
+           /*           ^        */
+           start_insert = url;
+         else if (slash && seen_slash_slash)
+           /* example: "http://something/" */
+           /*                           ^  */
+           start_insert = slash;
+
+         span = start_insert - url;
+         constr = (char *)xmalloc (span + subsize + 1);
+         if (span)
+           memcpy (constr, url, span);
+         if (subsize)
+           memcpy (constr + span, sub, subsize);
+         constr[span + subsize] = '\0';
+       }
      }
    else /* !no_proto */
      {
-      constr = (char *)xmalloc (subsize + 1);
-      strncpy (constr, sub, subsize);
-      constr[subsize] = '\0';
+      constr = strdupdelim (sub, sub + subsize);
      }
    return constr;
  }
+
+/* construct () wrapper: derives SUBSIZE and NO_PROTO from NEW_URL itself.  */
+char *
+url_concat (const char *base_url, const char *new_url)
+{
+  return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));
+}
  \f
  /* Optimize URL by host, destructively replacing u->host with realhost
     (u->host).  Do this regardless of opt.simple_check.  */
@@ -1362,6 +1465,32 @@ opt_url (struct urlinfo *u)
    free (u->url);
    u->url = str_url (u, 0);
  }
+
+/* This beautiful kludge is fortunately not needed, as I've made
+   parse_dir do the (almost) right thing, so that a query can never
+   become a part of directory.  */
+#if 0
+/* Run path_simplify on PATH while protecting the query part: cut the
+   string at the first '?', simplify what precedes it, then put the
+   '?' back and move the query to the new end of the path.  */
+void
+path_simplify_with_kludge (char *path)
+{
+  char *query = strchr (path, '?');
+  if (query)
+    /* Temporarily end the string at the '?'; writing into PATH is
+       fine because path_simplify modifies it in place as well. */
+    *query = '\0';
+  path_simplify (path);
+  if (query)
+    {
+      char *newend = path + strlen (path);
+      *query = '?';
+      if (newend != query)
+       memmove (newend, query, strlen (query) + 1);
+    }
+}
+#endif
  \f
  /* Returns proxy host address, in accordance with PROTO.  */
  char *
@@ -1390,9 +1519,10 @@ no_proxy_match (const char *host, const char **no_proxy)
  void
  convert_links (const char *file, urlpos *l)
  {
-  FILE *fp;
-  char *buf, *p, *p2;
-  long size;
+  FILE               *fp;
+  char               *buf, *p, *p2;
+  downloaded_file_t  downloaded_file_return;
+  long               size;
  
    logprintf (LOG_VERBOSE, _("Converting %s... "), file);
    /* Read from the file....  */
@@ -1406,7 +1536,10 @@ convert_links (const char *file, urlpos *l)
    /* ...to a buffer.  */
    load_file (fp, &buf, &size);
    fclose (fp);
-  if (opt.backup_converted && downloaded_file(CHECK_FOR_FILE, file))
+
+  downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file);
+
+  if (opt.backup_converted && downloaded_file_return)
      /* Rather than just writing over the original .html file with the converted
         version, save the former to *.orig.  Note we only do this for files we've
         _successfully_ downloaded, so we don't clobber .orig files sitting around
@@ -1414,15 +1547,31 @@ convert_links (const char *file, urlpos *l)
      {
        /* Construct the backup filename as the original name plus ".orig". */
        size_t         filename_len = strlen(file);
-      char*          filename_plus_orig_suffix = malloc(filename_len +
-                                                       sizeof(".orig"));
+      char*          filename_plus_orig_suffix;
        boolean        already_wrote_backup_file = FALSE;
        slist*         converted_file_ptr;
        static slist*  converted_files = NULL;
  
-      /* Would a single s[n]printf() call be faster? */
-      strcpy(filename_plus_orig_suffix, file);
-      strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+      if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
+       {
+         /* Just write "orig" over "html".  We need to do it this way because
+            when we're checking to see if we've downloaded the file before (to
+            see if we can skip downloading it), we don't know if it's a
+            text/html file.  Therefore we don't know yet at that stage that -E
+            is going to cause us to tack on ".html", so we need to compare
+            vs. the original URL plus ".orig", not the original URL plus
+            ".html.orig". */
+         filename_plus_orig_suffix = xmalloc(filename_len + 1);
+         strcpy(filename_plus_orig_suffix, file);
+         strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
+       }
+      else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
+       {
+         /* Append ".orig" to the name. */
+         filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig"));
+         strcpy(filename_plus_orig_suffix, file);
+         strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+       }
  
        /* We can get called twice on the same URL thanks to the
          convert_all_links() call in main().  If we write the .orig file each
@@ -1455,7 +1604,7 @@ convert_links (const char *file, urlpos *l)
              thought I could just add a field to the urlpos structure saying
              that we'd written a .orig file for this URL, but that didn't work,
              so I had to make this separate list. */
-         converted_file_ptr = malloc(sizeof(slist));
+         converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
           converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
           converted_file_ptr->next = converted_files;
           converted_files = converted_file_ptr;
@@ -1472,8 +1621,8 @@ convert_links (const char *file, urlpos *l)
        free (buf);
        return;
      }
-  /* [If someone understands why multiple URLs can correspond to one local file,
-     can they please add a comment here...?] */
+  /* Presumably we have to loop through multiple URLs here (even though we're
+     only talking about a single local file) because of the -O option. */
    for (p = buf; l; l = l->next)
      {
        if (l->pos >= size)
@@ -1495,6 +1644,7 @@ convert_links (const char *file, urlpos *l)
        for (p2 = buf + l->pos; p < p2; p++)
         putc (*p, fp);
        if (l->flags & UABS2REL)
+       /* Convert absolute URL to relative. */
         {
           char *newname = construct_relative (file, l->local_name);
           fprintf (fp, "%s", newname);
@@ -1504,6 +1654,7 @@ convert_links (const char *file, urlpos *l)
         }
        p += l->size;
      }
+  /* Output the rest of the file. */
    if (p - buf < size)
      {
        for (p2 = buf + size; p < p2; p++)
@@ -1583,23 +1734,34 @@ add_url (urlpos *l, const char *url, const char *file)
  }
  
  
-/* Remembers which files have been downloaded.  Should be called with
-   add_or_check == ADD_FILE for each file we actually download successfully
-   (i.e. not for ones we have failures on or that we skip due to -N).  If you
-   just want to check if a file has been previously added without adding it,
-   call with add_or_check == CHECK_FOR_FILE.  Please be sure to call this
-   function with local filenames, not remote URLs -- by some means that isn't
-   commented well enough for me understand, multiple remote URLs can apparently
-   correspond to a single local file. */
-boolean
-downloaded_file (downloaded_file_t  add_or_check, const char*  file)
+/* Remembers which files have been downloaded.  In the standard case, should be
+   called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
+   download successfully (i.e. not for ones we have failures on or that we skip
+   due to -N).
+
+   When we've downloaded a file and tacked on a ".html" extension due to -E,
+   call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
+   FILE_DOWNLOADED_NORMALLY.
+
+   If you just want to check if a file has been previously added without adding
+   it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
+   with local filenames, not remote URLs. */
+downloaded_file_t
+downloaded_file (downloaded_file_t  mode, const char*  file)
  {
-  boolean        found_file = FALSE;
-  static slist*  downloaded_files = NULL;
-  slist*         rover = downloaded_files;
+  typedef struct _downloaded_file_list
+  {
+    char*                          file;
+    downloaded_file_t              download_type;
+    struct _downloaded_file_list*  next;
+  } downloaded_file_list;
+  
+  boolean                       found_file = FALSE;
+  static downloaded_file_list*  downloaded_files = NULL;
+  downloaded_file_list*         rover = downloaded_files;
  
    while (rover != NULL)
-    if (strcmp(rover->string, file) == 0)
+    if (strcmp(rover->file, file) == 0)
        {
         found_file = TRUE;
         break;
@@ -1608,17 +1770,18 @@ downloaded_file (downloaded_file_t  add_or_check, const char*  file)
        rover = rover->next;
  
    if (found_file)
-    return TRUE;  /* file had already been downloaded */
+    return rover->download_type;  /* file had already been downloaded */
    else
      {
-      if (add_or_check == ADD_FILE)
+      if (mode != CHECK_FOR_FILE)
         {
-         rover = malloc(sizeof(slist));
-         rover->string = xstrdup(file);  /* die on out-of-mem. */
+         rover = xmalloc(sizeof(*rover));
+         rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
+         rover->download_type = mode;
           rover->next = downloaded_files;
           downloaded_files = rover;
         }
  
-      return FALSE;  /* file had not already been downloaded */
+      return FILE_NOT_ALREADY_DOWNLOADED;
      }
  }