/* URL handling.
- Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
+ Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
#endif /* WINDOWS */
-#define UNSAFE_CHAR(c) (((c) >= 0 && (c) <= 32) \
+#define UNSAFE_CHAR(c) ( ((unsigned char)(c) <= ' ') /* ASCII 32 */ \
+ || ((unsigned char)(c) > '~') /* ASCII 127 */ \
|| strchr (URL_UNSAFE_CHARS, c))
/* If S contains unsafe characters, free it and replace it with a
/* Is a directory ".."? */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
+#if 0
+static void path_simplify_with_kludge PARAMS ((char *));
+#endif
+static int urlpath_length PARAMS ((const char *));
+
/* NULL-terminated list of strings to be recognized as prototypes (URL
schemes). Note that recognized doesn't mean supported -- only HTTP
and FTP are currently supported.
*p = '\0';
}
-/* Encodes the unsafe characters (listed in URL_UNSAFE_CHARS) in a
+/* Encode the unsafe characters (as determined by URL_UNSAFE) in a
given string, returning a malloc-ed %XX encoded string. */
char *
encode_string (const char *s)
}
/* If protocol is recognizable, but unsupported, bail out, else
suppose unknown. */
- if (recognizable && !sup_protos[i].name)
+ if (recognizable && i == ARRAY_SIZE (sup_protos))
return URLUNKNOWN;
else if (i == ARRAY_SIZE (sup_protos))
type = URLUNKNOWN;
strcat (u->path, *u->dir ? "/" : "");
strcat (u->path, u->file);
URL_CLEANSE (u->path);
+ DEBUGP (("newpath: %s\n", u->path));
/* Create the clean URL. */
u->url = str_url (u, 0);
return URLOK;
}
\f
+/* Special versions of DOTP and DDOTP for parse_dir(): unlike the
+   plain versions, they also accept "." and ".." when followed by a
+   query string ("?...").  */
+
+/* Is X the path component "." (or ".?query")?  */
+#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
+/* Is X the path component ".." (or "..?query")?  The second dot must
+   be tested at offset 1; testing *(x) twice, as the old code did,
+   made any string whose first char is '.' and whose third char is
+   '\0' or '?' (e.g. ".x") look like "..".  */
+#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
+                     && (!*((x) + 2) || *((x) + 2) == '?'))
+
/* Build the directory and filename components of the path. Both
components are *separately* malloc-ed strings! It does not change
the contents of path.
{
int i, l;
- for (i = l = strlen (path); i && path[i] != '/'; i--);
+ l = urlpath_length (path);
+ for (i = l; i && path[i] != '/'; i--);
+
if (!i && *path != '/') /* Just filename */
{
- if (DOTP (path) || DDOTP (path))
+ if (PD_DOTP (path) || PD_DDOTP (path))
{
- *dir = xstrdup (path);
- *file = xstrdup ("");
+ *dir = strdupdelim (path, path + l);
+ *file = xstrdup (path + l); /* normally empty, but could
+ contain ?... */
}
else
{
}
else if (!i) /* /filename */
{
- if (DOTP (path + 1) || DDOTP (path + 1))
+ if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
{
- *dir = xstrdup (path);
- *file = xstrdup ("");
+ *dir = strdupdelim (path, path + l);
+ *file = xstrdup (path + l); /* normally empty, but could
+ contain ?... */
}
else
{
}
else /* Nonempty directory with or without a filename */
{
- if (DOTP (path + i + 1) || DDOTP (path + i + 1))
+ if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
{
- *dir = xstrdup (path);
- *file = xstrdup ("");
+ *dir = strdupdelim (path, path + l);
+ *file = xstrdup (path + l); /* normally empty, but could
+ contain ?... */
}
else
{
*dir = strdupdelim (path, path + i);
- *file = strdupdelim (path + i + 1, path + l + 1);
+ *file = xstrdup (path + i + 1);
}
}
}
return '\0';
}
\f
-/* Return the URL as fine-formed string, with a proper protocol, port
- number, directory and optional user/password. If HIDE is non-zero,
- password will be hidden. The forbidden characters in the URL will
- be cleansed. */
+/* Return the URL as fine-formed string, with a proper protocol,
+ optional port number, directory and optional user/password. If
+ HIDE is non-zero, password will be hidden. The forbidden
+ characters in the URL will be cleansed. */
char *
str_url (const struct urlinfo *u, int hide)
{
char *res, *host, *user, *passwd, *proto_name, *dir, *file;
int i, l, ln, lu, lh, lp, lf, ld;
+ unsigned short proto_default_port;
/* Look for the protocol name. */
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
if (i == ARRAY_SIZE (sup_protos))
return NULL;
proto_name = sup_protos[i].name;
+ proto_default_port = sup_protos[i].port;
host = CLEANDUP (u->host);
dir = CLEANDUP (u->dir);
file = CLEANDUP (u->file);
{
char *tmp = (char *)xmalloc (strlen (dir) + 3);
/*sprintf (tmp, "%%2F%s", dir + 1);*/
- *tmp = '%';
+ tmp[0] = '%';
tmp[1] = '2';
tmp[2] = 'F';
strcpy (tmp + 3, dir + 1);
}
memcpy (res + l, host, lh);
l += lh;
- res[l++] = ':';
- long_to_string (res + l, (long)u->port);
- l += numdigit (u->port);
+ if (u->port != proto_default_port)
+ {
+ res[l++] = ':';
+ long_to_string (res + l, (long)u->port);
+ l += numdigit (u->port);
+ }
res[l++] = '/';
memcpy (res + l, dir, ld);
l += ld;
If SILENT is non-zero, do not barf on baseless relative links. */
urlpos *
-get_urls_html (const char *file, const char *this_url, int silent)
+get_urls_html (const char *file, const char *this_url, int silent,
+ int dash_p_leaf_HTML)
{
long nread;
FILE *fp;
first_time = 1;
/* Iterate over the URLs in BUF, picked by htmlfindurl(). */
for (buf = orig_buf;
- (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time));
+ (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time,
+ dash_p_leaf_HTML));
buf += step)
{
int i, no_proto;
const char *pbuf = buf;
char *constr, *base;
const char *cbase;
+ char *needs_freeing, *url_data;
first_time = 0;
if (!size)
break;
+ /* It would be nice if we could avoid allocating memory in this
+ loop, but I don't see an easy way. To process the entities,
+ we need to either copy the data, or change it destructively.
+ I choose the former.
+
+ We have two pointers: needs_freeing and url_data, because the
+      code below does things like url_data += <something>, and we
+ want to pass the original string to free(). */
+ needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
+ size = strlen (url_data);
+
for (i = 0; protostrings[i]; i++)
{
- if (!strncasecmp (protostrings[i], pbuf,
+ if (!strncasecmp (protostrings[i], url_data,
MINVAL (strlen (protostrings[i]), size)))
break;
}
/* Check for http:RELATIVE_URI. See below for details. */
if (protostrings[i]
- && !(strncasecmp (pbuf, "http:", 5) == 0
- && strncasecmp (pbuf, "http://", 7) != 0))
+ && !(strncasecmp (url_data, "http:", 5) == 0
+ && strncasecmp (url_data, "http://", 7) != 0))
{
no_proto = 0;
}
relative URI-s as <a href="http:URL">. Just strip off the
silly leading "http:" (as well as any leading blanks
before it). */
- if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
- pbuf += 5, size -= 5;
+ if ((size > 5) && !strncasecmp ("http:", url_data, 5))
+ url_data += 5, size -= 5;
}
if (!no_proto)
{
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
{
- if (!strncasecmp (sup_protos[i].name, pbuf,
+ if (!strncasecmp (sup_protos[i].name, url_data,
MINVAL (strlen (sup_protos[i].name), size)))
break;
}
/* Do *not* accept a non-supported protocol. */
if (i == ARRAY_SIZE (sup_protos))
- continue;
+ {
+ free (needs_freeing);
+ continue;
+ }
}
if (no_proto)
{
/* Use malloc, not alloca because this is called in
a loop. */
char *temp = (char *)malloc (size + 1);
- strncpy (temp, pbuf, size);
+ strncpy (temp, url_data, size);
temp[size] = '\0';
logprintf (LOG_NOTQUIET,
_("Error (%s): Link %s without a base provided.\n"),
file, temp);
free (temp);
}
+ free (needs_freeing);
continue;
}
if (this_url)
logprintf (LOG_NOTQUIET, _("\
Error (%s): Base %s relative, without referer URL.\n"),
file, cbase);
+ free (needs_freeing);
continue;
}
base = xstrdup (cbase);
}
- constr = construct (base, pbuf, size, no_proto);
+ constr = construct (base, url_data, size, no_proto);
free (base);
}
else /* has proto */
{
constr = (char *)xmalloc (size + 1);
- strncpy (constr, pbuf, size);
+ strncpy (constr, url_data, size);
constr[size] = '\0';
}
#ifdef DEBUG
tmp2 = html_base ();
/* Use malloc, not alloca because this is called in a loop. */
tmp = (char *)xmalloc (size + 1);
- strncpy (tmp, pbuf, size);
+ strncpy (tmp, url_data, size);
tmp[size] = '\0';
logprintf (LOG_ALWAYS,
"file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
memset (current, 0, sizeof (*current));
current->next = NULL;
current->url = constr;
- current->size = size;
- current->pos = pbuf - orig_buf;
+ current->size = step;
+ current->pos = buf - orig_buf;
/* A URL is relative if the host and protocol are not named,
and the name does not start with `/'. */
- if (no_proto && *pbuf != '/')
+ if (no_proto && *url_data != '/')
current->flags |= (URELATIVE | UNOPROTO);
else if (no_proto)
current->flags |= UNOPROTO;
+ free (needs_freeing);
}
free (orig_buf);
return name;
}
+/* Return the length of URL's path component -- i.e. the number of
+   characters up to, but not including, the first '?' (the start of
+   the query string), or the full strlen() if the URL carries no
+   query string.  */
+static int
+urlpath_length (const char *url)
+{
+  const char *q = strchr (url, '?');
+  if (q)
+    return q - url;
+  return strlen (url);
+}
+
+/* Scan backward from E down to (but not including) B and return a
+   pointer to the last occurrence of character C, or NULL if C does
+   not occur.  Note the asymmetric endpoints: *E itself is examined,
+   while *B is not.  */
+static const char *
+find_last_char (const char *b, const char *e, char c)
+{
+  for (; e > b; e--)
+    if (*e == c)
+      return e;
+  return NULL;
+}
+
/* Construct an absolute URL, given a (possibly) relative one. This
- is more tricky than it might seem, but it works. */
+ gets tricky if you want to cover all the "reasonable" cases, but
+ I'm satisfied with the result. */
static char *
construct (const char *url, const char *sub, int subsize, int no_proto)
{
if (no_proto)
{
- int i;
+ const char *end = url + urlpath_length (url);
if (*sub != '/')
{
- for (i = strlen (url); i && url[i] != '/'; i--);
- if (!i || (url[i] == url[i - 1]))
+ /* SUB is a relative URL: we need to replace everything
+ after last slash (possibly empty) with SUB.
+
+ So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
+ our result should be "whatever/foo/qux/xyzzy". */
+ int need_explicit_slash = 0;
+ int span;
+ const char *start_insert;
+ const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */
+ if (!last_slash)
+ {
+ /* No slash found at all. Append SUB to what we have,
+ but we'll need a slash as a separator.
+
+ Example: if url == "foo" and sub == "qux/xyzzy", then
+ we cannot just append sub to url, because we'd get
+ "fooqux/xyzzy", whereas what we want is
+ "foo/qux/xyzzy".
+
+ To make sure the / gets inserted, we set
+ need_explicit_slash to 1. We also set start_insert
+ to end + 1, so that the length calculations work out
+ correctly for one more (slash) character. Accessing
+ that character is fine, since it will be the
+ delimiter, '\0' or '?'. */
+ /* example: "foo?..." */
+ /* ^ ('?' gets changed to '/') */
+ start_insert = end + 1;
+ need_explicit_slash = 1;
+ }
+ else
{
- int l = strlen (url);
- char *t = (char *)alloca (l + 2);
- strcpy (t, url);
- t[l] = '/';
- t[l + 1] = '\0';
- url = t;
- i = l;
+ /* example: "whatever/foo/bar" */
+ /* ^ */
+ start_insert = last_slash + 1;
}
- constr = (char *)xmalloc (i + 1 + subsize + 1);
- strncpy (constr, url, i + 1);
- constr[i + 1] = '\0';
- strncat (constr, sub, subsize);
+
+ span = start_insert - url;
+ constr = (char *)xmalloc (span + subsize + 1);
+ if (span)
+ memcpy (constr, url, span);
+ if (need_explicit_slash)
+ constr[span - 1] = '/';
+ if (subsize)
+ memcpy (constr + span, sub, subsize);
+ constr[span + subsize] = '\0';
}
else /* *sub == `/' */
{
- int fl;
-
- i = 0;
- do
- {
- for (; url[i] && url[i] != '/'; i++);
- if (!url[i])
- break;
- fl = (url[i] == url[i + 1] && url[i + 1] == '/');
- if (fl)
- i += 2;
- }
- while (fl);
- if (!url[i])
- {
- int l = strlen (url);
- char *t = (char *)alloca (l + 2);
- strcpy (t, url);
- t[l] = '/';
- t[l + 1] = '\0';
- url = t;
- }
- constr = (char *)xmalloc (i + 1 + subsize + 1);
- strncpy (constr, url, i);
- constr[i] = '\0';
- strncat (constr + i, sub, subsize);
- constr[i + subsize] = '\0';
- } /* *sub == `/' */
+ /* SUB is an absolute path: we need to replace everything
+ after (and including) the FIRST slash with SUB.
+
+ So, if URL is "http://host/whatever/foo/bar", and SUB is
+ "/qux/xyzzy", our result should be
+ "http://host/qux/xyzzy". */
+ int span;
+ const char *slash, *start_insert;
+ const char *pos = url;
+ int seen_slash_slash = 0;
+ /* We're looking for the first slash, but want to ignore
+ double slash. */
+ again:
+ slash = memchr (pos, '/', end - pos);
+ if (slash && !seen_slash_slash)
+ if (*(slash + 1) == '/')
+ {
+ pos = slash + 2;
+ seen_slash_slash = 1;
+ goto again;
+ }
+
+ /* At this point, SLASH is the location of the first / after
+ "//", or the first slash altogether. START_INSERT is the
+ pointer to the location where SUB will be inserted. When
+ examining the last two examples, keep in mind that SUB
+ begins with '/'. */
+
+ if (!slash && !seen_slash_slash)
+ /* example: "foo" */
+ /* ^ */
+ start_insert = url;
+ else if (!slash && seen_slash_slash)
+ /* example: "http://foo" */
+ /* ^ */
+ start_insert = end;
+ else if (slash && !seen_slash_slash)
+ /* example: "foo/bar" */
+ /* ^ */
+ start_insert = url;
+ else if (slash && seen_slash_slash)
+ /* example: "http://something/" */
+ /* ^ */
+ start_insert = slash;
+
+ span = start_insert - url;
+ constr = (char *)xmalloc (span + subsize + 1);
+ if (span)
+ memcpy (constr, url, span);
+ if (subsize)
+ memcpy (constr + span, sub, subsize);
+ constr[span + subsize] = '\0';
+ }
}
else /* !no_proto */
{
- constr = (char *)xmalloc (subsize + 1);
- strncpy (constr, sub, subsize);
- constr[subsize] = '\0';
+ constr = strdupdelim (sub, sub + subsize);
}
return constr;
}
+
+/* Resolve NEW_URL against BASE_URL, like construct() above, but with
+   a saner caller interface: the substring length and the
+   has-protocol flag are computed here rather than by every caller.
+   Returns a freshly malloc-ed string that the caller must free.  */
+char *
+url_concat (const char *base_url, const char *new_url)
+{
+  return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));
+}
\f
/* Optimize URL by host, destructively replacing u->host with realhost
(u->host). Do this regardless of opt.simple_check. */
free (u->url);
u->url = str_url (u, 0);
}
+
+/* This beautiful kludge is fortunately not needed, as I've made
+   parse_dir do the (almost) right thing, so that a query can never
+   become a part of directory.  Kept under #if 0 for reference only;
+   it is never compiled.  */
+#if 0
+/* Call path_simplify, but make sure that the part after the
+   question-mark, if any, is not destroyed by path_simplify's
+   "optimizations".  Works by temporarily truncating PATH at the '?',
+   simplifying the prefix, then splicing the query back on.  */
+void
+path_simplify_with_kludge (char *path)
+{
+  char *query = strchr (path, '?');
+  if (query)
+    /* path_simplify also works destructively, so we also have the
+       license to write. */
+    *query = '\0';
+  path_simplify (path);
+  if (query)
+    {
+      char *newend = path + strlen (path);
+      *query = '?';
+      /* memmove, not memcpy: the source and destination regions may
+         overlap when the simplified path is shorter than the
+         original.  */
+      if (newend != query)
+	memmove (newend, query, strlen (query) + 1);
+    }
+}
+#endif
\f
/* Returns proxy host address, in accordance with PROTO. */
char *
void
convert_links (const char *file, urlpos *l)
{
- FILE *fp;
- char *buf, *p, *p2;
- long size;
+ FILE *fp;
+ char *buf, *p, *p2;
+ downloaded_file_t downloaded_file_return;
+ long size;
logprintf (LOG_VERBOSE, _("Converting %s... "), file);
/* Read from the file.... */
/* ...to a buffer. */
load_file (fp, &buf, &size);
fclose (fp);
- if (opt.backup_converted && downloaded_file(CHECK_FOR_FILE, file))
+
+ downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file);
+
+ if (opt.backup_converted && downloaded_file_return)
/* Rather than just writing over the original .html file with the converted
version, save the former to *.orig. Note we only do this for files we've
_successfully_ downloaded, so we don't clobber .orig files sitting around
{
/* Construct the backup filename as the original name plus ".orig". */
size_t filename_len = strlen(file);
- char* filename_plus_orig_suffix = malloc(filename_len +
- sizeof(".orig"));
+ char* filename_plus_orig_suffix;
boolean already_wrote_backup_file = FALSE;
slist* converted_file_ptr;
static slist* converted_files = NULL;
- /* Would a single s[n]printf() call be faster? */
- strcpy(filename_plus_orig_suffix, file);
- strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+ if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
+ {
+ /* Just write "orig" over "html". We need to do it this way because
+ when we're checking to see if we've downloaded the file before (to
+ see if we can skip downloading it), we don't know if it's a
+ text/html file. Therefore we don't know yet at that stage that -E
+ is going to cause us to tack on ".html", so we need to compare
+ vs. the original URL plus ".orig", not the original URL plus
+ ".html.orig". */
+ filename_plus_orig_suffix = xmalloc(filename_len + 1);
+ strcpy(filename_plus_orig_suffix, file);
+ strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
+ }
+ else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
+ {
+ /* Append ".orig" to the name. */
+ filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig"));
+ strcpy(filename_plus_orig_suffix, file);
+ strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+ }
/* We can get called twice on the same URL thanks to the
convert_all_links() call in main(). If we write the .orig file each
thought I could just add a field to the urlpos structure saying
that we'd written a .orig file for this URL, but that didn't work,
so I had to make this separate list. */
- converted_file_ptr = malloc(sizeof(slist));
+ converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
converted_file_ptr->next = converted_files;
converted_files = converted_file_ptr;
free (buf);
return;
}
- /* [If someone understands why multiple URLs can correspond to one local file,
- can they please add a comment here...?] */
+ /* Presumably we have to loop through multiple URLs here (even though we're
+ only talking about a single local file) because of the -O option. */
for (p = buf; l; l = l->next)
{
if (l->pos >= size)
for (p2 = buf + l->pos; p < p2; p++)
putc (*p, fp);
if (l->flags & UABS2REL)
+ /* Convert absolute URL to relative. */
{
char *newname = construct_relative (file, l->local_name);
fprintf (fp, "%s", newname);
}
p += l->size;
}
+ /* Output the rest of the file. */
if (p - buf < size)
{
for (p2 = buf + size; p < p2; p++)
}
-/* Remembers which files have been downloaded. Should be called with
- add_or_check == ADD_FILE for each file we actually download successfully
- (i.e. not for ones we have failures on or that we skip due to -N). If you
- just want to check if a file has been previously added without adding it,
- call with add_or_check == CHECK_FOR_FILE. Please be sure to call this
- function with local filenames, not remote URLs -- by some means that isn't
- commented well enough for me understand, multiple remote URLs can apparently
- correspond to a single local file. */
-boolean
-downloaded_file (downloaded_file_t add_or_check, const char* file)
+/* Remembers which files have been downloaded. In the standard case, should be
+ called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
+ download successfully (i.e. not for ones we have failures on or that we skip
+ due to -N).
+
+ When we've downloaded a file and tacked on a ".html" extension due to -E,
+ call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
+ FILE_DOWNLOADED_NORMALLY.
+
+ If you just want to check if a file has been previously added without adding
+ it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
+ with local filenames, not remote URLs. */
+downloaded_file_t
+downloaded_file (downloaded_file_t mode, const char* file)
{
- boolean found_file = FALSE;
- static slist* downloaded_files = NULL;
- slist* rover = downloaded_files;
+ typedef struct _downloaded_file_list
+ {
+ char* file;
+ downloaded_file_t download_type;
+ struct _downloaded_file_list* next;
+ } downloaded_file_list;
+
+ boolean found_file = FALSE;
+ static downloaded_file_list* downloaded_files = NULL;
+ downloaded_file_list* rover = downloaded_files;
while (rover != NULL)
- if (strcmp(rover->string, file) == 0)
+ if (strcmp(rover->file, file) == 0)
{
found_file = TRUE;
break;
rover = rover->next;
if (found_file)
- return TRUE; /* file had already been downloaded */
+ return rover->download_type; /* file had already been downloaded */
else
{
- if (add_or_check == ADD_FILE)
+ if (mode != CHECK_FOR_FILE)
{
- rover = malloc(sizeof(slist));
- rover->string = xstrdup(file); /* die on out-of-mem. */
+ rover = xmalloc(sizeof(*rover));
+ rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
+ rover->download_type = mode;
rover->next = downloaded_files;
downloaded_files = rover;
}
- return FALSE; /* file had not already been downloaded */
+ return FILE_NOT_ALREADY_DOWNLOADED;
}
}