X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Furl.c;h=eea36a10d865d0e7ad7cea42735921a73ca560d2;hb=7b5ad90acfc8c101a6cf919cd2a00217f0194e93;hp=a0747a566d0b07156a9af5007310f20ad0870f46;hpb=0dd418242a66f82def061205fc6366ae63223723;p=wget diff --git a/src/url.c b/src/url.c index a0747a56..eea36a10 100644 --- a/src/url.c +++ b/src/url.c @@ -1,12 +1,12 @@ /* URL handling. - Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc. + Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc. This file is part of Wget. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. +the Free Software Foundation; either version 2 of the License, or (at +your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -38,7 +38,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "utils.h" #include "url.h" #include "host.h" -#include "html.h" #ifndef errno extern int errno; @@ -48,22 +47,12 @@ extern int errno; #define DEFAULT_HTTP_PORT 80 #define DEFAULT_FTP_PORT 21 -/* URL separator (for findurl) */ -#define URL_SEPARATOR "!\"#'(),>`{}|<>" +/* Table of Unsafe chars. This is intialized in + init_unsafe_char_table. */ -/* A list of unsafe characters for encoding, as per RFC1738. '@' and - ':' (not listed in RFC) were added because of user/password - encoding. */ +static char unsafe_char_table[256]; -#ifndef WINDOWS -# define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:" -#else /* WINDOWS */ -# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`" -#endif /* WINDOWS */ - -#define UNSAFE_CHAR(c) ( ((unsigned char)(c) <= ' ') /* ASCII 32 */ \ - || ((unsigned char)(c) > '~') /* ASCII 127 */ \ - || strchr (URL_UNSAFE_CHARS, c)) +#define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)]) /* If S contains unsafe characters, free it and replace it with a version that doesn't. */ @@ -72,7 +61,7 @@ extern int errno; if (contains_unsafe (s)) \ { \ char *uc_tmp = encode_string (s); \ - free (s); \ + xfree (s); \ (s) = uc_tmp; \ } \ } while (0) @@ -82,6 +71,11 @@ extern int errno; /* Is a directory ".."? */ #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2))) +#if 0 +static void path_simplify_with_kludge PARAMS ((char *)); +#endif +static int urlpath_length PARAMS ((const char *)); + /* NULL-terminated list of strings to be recognized as prototypes (URL schemes). Note that recognized doesn't mean supported -- only HTTP and FTP are currently supported. @@ -171,6 +165,35 @@ skip_url (const char *url) return 0; } +/* Unsafe chars: + - anything <= 32; + - stuff from rfc1738 ("<>\"#%{}|\\^~[]`"); + - @ and :, for user/password encoding. + - everything over 127 (but we don't bother with recording those. */ +void +init_unsafe_char_table (void) +{ + int i; + for (i = 0; i < 256; i++) + if (i < 32 || i >= 127 + || i == ' ' + || i == '<' + || i == '>' + || i == '\"' + || i == '#' + || i == '%' + || i == '{' + || i == '}' + || i == '|' + || i == '\\' + || i == '^' + || i == '~' + || i == '[' + || i == ']' + || i == '`') + unsafe_char_table[i] = 1; +} + /* Returns 1 if the string contains unsafe characters, 0 otherwise. */ int contains_unsafe (const char *s) @@ -291,7 +314,7 @@ skip_proto (const char *url) /* Returns 1 if the URL begins with a protocol (supported or unsupported), 0 otherwise. 
*/ -static int +int has_proto (const char *url) { char **s; @@ -355,7 +378,7 @@ freeurl (struct urlinfo *u, int complete) if (u->proxy) freeurl (u->proxy, 1); if (complete) - free (u); + xfree (u); return; } @@ -400,7 +423,7 @@ parseurl (const char *url, struct urlinfo *u, int strict) } /* If protocol is recognizable, but unsupported, bail out, else suppose unknown. */ - if (recognizable && !sup_protos[i].name) + if (recognizable && i == ARRAY_SIZE (sup_protos)) return URLUNKNOWN; else if (i == ARRAY_SIZE (sup_protos)) type = URLUNKNOWN; @@ -503,11 +526,18 @@ parseurl (const char *url, struct urlinfo *u, int strict) strcat (u->path, *u->dir ? "/" : ""); strcat (u->path, u->file); URL_CLEANSE (u->path); + DEBUGP (("newpath: %s\n", u->path)); /* Create the clean URL. */ u->url = str_url (u, 0); return URLOK; } +/* Special versions of DOTP and DDOTP for parse_dir(). */ + +#define PD_DOTP(x) ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?')) +#define PD_DDOTP(x) ((*(x) == '.') && (*(x) == '.') \ + && (!*((x) + 2) || *((x) + 2) == '?')) + /* Build the directory and filename components of the path. Both components are *separately* malloc-ed strings! It does not change the contents of path. @@ -519,13 +549,16 @@ parse_dir (const char *path, char **dir, char **file) { int i, l; - for (i = l = strlen (path); i && path[i] != '/'; i--); + l = urlpath_length (path); + for (i = l; i && path[i] != '/'; i--); + if (!i && *path != '/') /* Just filename */ { - if (DOTP (path) || DDOTP (path)) + if (PD_DOTP (path) || PD_DDOTP (path)) { - *dir = xstrdup (path); - *file = xstrdup (""); + *dir = strdupdelim (path, path + l); + *file = xstrdup (path + l); /* normally empty, but could + contain ?... */ } else { @@ -535,10 +568,11 @@ parse_dir (const char *path, char **dir, char **file) } else if (!i) /* /filename */ { - if (DOTP (path + 1) || DDOTP (path + 1)) + if (PD_DOTP (path + 1) || PD_DDOTP (path + 1)) { - *dir = xstrdup (path); - *file = xstrdup (""); + *dir = strdupdelim (path, path + l); + *file = xstrdup (path + l); /* normally empty, but could + contain ?... */ } else { @@ -548,15 +582,16 @@ parse_dir (const char *path, char **dir, char **file) } else /* Nonempty directory with or without a filename */ { - if (DOTP (path + i + 1) || DDOTP (path + i + 1)) + if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1)) { - *dir = xstrdup (path); - *file = xstrdup (""); + *dir = strdupdelim (path, path + l); + *file = xstrdup (path + l); /* normally empty, but could + contain ?... */ } else { *dir = strdupdelim (path, path + i); - *file = strdupdelim (path + i + 1, path + l + 1); + *file = xstrdup (path + i + 1); } } } @@ -622,10 +657,10 @@ process_ftp_type (char *path) return '\0'; } -/* Return the URL as fine-formed string, with a proper protocol, port - number, directory and optional user/password. If HIDE is non-zero, - password will be hidden. The forbidden characters in the URL will - be cleansed. */ +/* Return the URL as fine-formed string, with a proper protocol, + optional port number, directory and optional user/password. If + HIDE is non-zero, password will be hidden. The forbidden + characters in the URL will be cleansed. 
*/ char * str_url (const struct urlinfo *u, int hide) { @@ -659,11 +694,11 @@ str_url (const struct urlinfo *u, int hide) { char *tmp = (char *)xmalloc (strlen (dir) + 3); /*sprintf (tmp, "%%2F%s", dir + 1);*/ - *tmp = '%'; + tmp[0] = '%'; tmp[1] = '2'; tmp[2] = 'F'; strcpy (tmp + 3, dir + 1); - free (dir); + xfree (dir); dir = tmp; } @@ -707,9 +742,9 @@ str_url (const struct urlinfo *u, int hide) if (*dir) res[l++] = '/'; strcpy (res + l, file); - free (host); - free (dir); - free (file); + xfree (host); + xfree (dir); + xfree (file); FREE_MAYBE (user); FREE_MAYBE (passwd); return res; @@ -748,279 +783,54 @@ url_equal (const char *url1, const char *url2) return res; } -/* Find URL of format scheme:hostname[:port]/dir in a buffer. The - buffer may contain pretty much anything; no errors are signaled. */ -static const char * -findurl (const char *buf, int howmuch, int *count) -{ - char **prot; - const char *s1, *s2; - - for (s1 = buf; howmuch; s1++, howmuch--) - for (prot = protostrings; *prot; prot++) - if (howmuch <= strlen (*prot)) - continue; - else if (!strncasecmp (*prot, s1, strlen (*prot))) - { - for (s2 = s1, *count = 0; - howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) && - !strchr (URL_SEPARATOR, *s2); - s2++, (*count)++, howmuch--); - return s1; - } - return NULL; -} - -/* Scans the file for signs of URL-s. Returns a vector of pointers, - each pointer representing a URL string. The file is *not* assumed - to be HTML. */ urlpos * get_urls_file (const char *file) { - long nread; - FILE *fp; - char *buf; - const char *pbuf; - int size; - urlpos *first, *current, *old; - - if (file && !HYPHENP (file)) - { - fp = fopen (file, "rb"); - if (!fp) - { - logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); - return NULL; - } - } - else - fp = stdin; - /* Load the file. */ - load_file (fp, &buf, &nread); - if (file && !HYPHENP (file)) - fclose (fp); - DEBUGP (("Loaded %s (size %ld).\n", file, nread)); - first = current = NULL; - /* Fill the linked list with URLs. */ - for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size)); - pbuf += size) - { - /* Allocate the space. */ - old = current; - current = (urlpos *)xmalloc (sizeof (urlpos)); - if (old) - old->next = current; - memset (current, 0, sizeof (*current)); - current->next = NULL; - current->url = (char *)xmalloc (size + 1); - memcpy (current->url, pbuf, size); - current->url[size] = '\0'; - if (!first) - first = current; - } - /* Free the buffer. */ - free (buf); - - return first; -} - -/* Similar to get_urls_file, but for HTML files. FILE is scanned as - an HTML document using htmlfindurl(), which see. get_urls_html() - constructs the HTML-s from the relative href-s. + struct file_memory *fm; + urlpos *head, *tail; + const char *text, *text_end; - If SILENT is non-zero, do not barf on baseless relative links. */ -urlpos * -get_urls_html (const char *file, const char *this_url, int silent, - int dash_p_leaf_HTML) -{ - long nread; - FILE *fp; - char *orig_buf; - const char *buf; - int step, first_time; - urlpos *first, *current, *old; - - if (file && !HYPHENP (file)) + /* Load the file. */ + fm = read_file (file); + if (!fm) { - fp = fopen (file, "rb"); - if (!fp) - { - logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); - return NULL; - } + logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); + return NULL; } - else - fp = stdin; - /* Load the file. 
*/ - load_file (fp, &orig_buf, &nread); - if (file && !HYPHENP (file)) - fclose (fp); - DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread)); - first = current = NULL; - first_time = 1; - /* Iterate over the URLs in BUF, picked by htmlfindurl(). */ - for (buf = orig_buf; - (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time, - dash_p_leaf_HTML)); - buf += step) + DEBUGP (("Loaded %s (size %ld).\n", file, fm->length)); + head = tail = NULL; + text = fm->content; + text_end = fm->content + fm->length; + while (text < text_end) { - int i, no_proto; - int size = step; - const char *pbuf = buf; - char *constr, *base; - const char *cbase; - - first_time = 0; - - /* A frequent phenomenon that needs to be handled are pages - generated by brain-damaged HTML generators, which refer to to - URI-s as . We simply ignore - any spaces at the beginning or at the end of the string. - This is probably not strictly correct, but that's what the - browsers do, so we may follow. May the authors of "WYSIWYG" - HTML tools burn in hell for the damage they've inflicted! */ - while ((pbuf < buf + step) && ISSPACE (*pbuf)) - { - ++pbuf; - --size; - } - while (size && ISSPACE (pbuf[size - 1])) - --size; - if (!size) - break; - - for (i = 0; protostrings[i]; i++) - { - if (!strncasecmp (protostrings[i], pbuf, - MINVAL (strlen (protostrings[i]), size))) - break; - } - /* Check for http:RELATIVE_URI. See below for details. */ - if (protostrings[i] - && !(strncasecmp (pbuf, "http:", 5) == 0 - && strncasecmp (pbuf, "http://", 7) != 0)) - { - no_proto = 0; - } + const char *line_beg = text; + const char *line_end = memchr (text, '\n', text_end - text); + if (!line_end) + line_end = text_end; else + ++line_end; + text = line_end; + while (line_beg < line_end + && ISSPACE (*line_beg)) + ++line_beg; + while (line_end > line_beg + 1 + && ISSPACE (*(line_end - 1))) + --line_end; + if (line_end > line_beg) { - no_proto = 1; - /* This is for extremely brain-damaged pages that refer to - relative URI-s as . Just strip off the - silly leading "http:" (as well as any leading blanks - before it). */ - if ((size > 5) && !strncasecmp ("http:", pbuf, 5)) - pbuf += 5, size -= 5; - } - if (!no_proto) - { - for (i = 0; i < ARRAY_SIZE (sup_protos); i++) - { - if (!strncasecmp (sup_protos[i].name, pbuf, - MINVAL (strlen (sup_protos[i].name), size))) - break; - } - /* Do *not* accept a non-supported protocol. */ - if (i == ARRAY_SIZE (sup_protos)) - continue; - } - if (no_proto) - { - /* First, construct the base, which can be relative itself. - - Criteria for creating the base are: - 1) html_base created by - 2) current URL - 3) base provided from the command line */ - cbase = html_base (); - if (!cbase) - cbase = this_url; - if (!cbase) - cbase = opt.base_href; - if (!cbase) /* Error condition -- a baseless - relative link. */ - { - if (!opt.quiet && !silent) - { - /* Use malloc, not alloca because this is called in - a loop. */ - char *temp = (char *)malloc (size + 1); - strncpy (temp, pbuf, size); - temp[size] = '\0'; - logprintf (LOG_NOTQUIET, - _("Error (%s): Link %s without a base provided.\n"), - file, temp); - free (temp); - } - continue; - } - if (this_url) - base = construct (this_url, cbase, strlen (cbase), - !has_proto (cbase)); + urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos)); + memset (entry, 0, sizeof (*entry)); + entry->next = NULL; + entry->url = strdupdelim (line_beg, line_end); + if (!head) + head = entry; else - { - /* Base must now be absolute, with host name and - protocol. 
*/ - if (!has_proto (cbase)) - { - logprintf (LOG_NOTQUIET, _("\ -Error (%s): Base %s relative, without referer URL.\n"), - file, cbase); - continue; - } - base = xstrdup (cbase); - } - constr = construct (base, pbuf, size, no_proto); - free (base); + tail->next = entry; + tail = entry; } - else /* has proto */ - { - constr = (char *)xmalloc (size + 1); - strncpy (constr, pbuf, size); - constr[size] = '\0'; - } -#ifdef DEBUG - if (opt.debug) - { - char *tmp; - const char *tmp2; - - tmp2 = html_base (); - /* Use malloc, not alloca because this is called in a loop. */ - tmp = (char *)xmalloc (size + 1); - strncpy (tmp, pbuf, size); - tmp[size] = '\0'; - logprintf (LOG_ALWAYS, - "file %s; this_url %s; base %s\nlink: %s; constr: %s\n", - file, this_url ? this_url : "(null)", - tmp2 ? tmp2 : "(null)", tmp, constr); - free (tmp); - } -#endif - - /* Allocate the space. */ - old = current; - current = (urlpos *)xmalloc (sizeof (urlpos)); - if (old) - old->next = current; - if (!first) - first = current; - /* Fill the values. */ - memset (current, 0, sizeof (*current)); - current->next = NULL; - current->url = constr; - current->size = size; - current->pos = pbuf - orig_buf; - /* A URL is relative if the host and protocol are not named, - and the name does not start with `/'. */ - if (no_proto && *pbuf != '/') - current->flags |= (URELATIVE | UNOPROTO); - else if (no_proto) - current->flags |= UNOPROTO; } - free (orig_buf); - - return first; + read_file_free (fm); + return head; } /* Free the linked list of urlpos. */ @@ -1030,9 +840,9 @@ free_urlpos (urlpos *l) while (l) { urlpos *next = l->next; - free (l->url); + xfree (l->url); FREE_MAYBE (l->local_name); - free (l); + xfree (l); l = next; } } @@ -1085,7 +895,7 @@ mkalldirs (const char *path) { if (S_ISDIR (st.st_mode)) { - free (t); + xfree (t); return 0; } else @@ -1109,7 +919,7 @@ mkalldirs (const char *path) res = make_directory (t); if (res != 0) logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno)); - free (t); + xfree (t); return res; } @@ -1153,7 +963,7 @@ mkstruct (const struct urlinfo *u) if (opt.add_hostdir && !opt.simple_check) { char *nhost = realhost (host); - free (host); + xfree (host); host = nhost; } /* Add dir_prefix and hostname (if required) to the beginning of @@ -1176,7 +986,7 @@ mkstruct (const struct urlinfo *u) else dirpref = ""; } - free (host); + xfree (host); /* If there is a prefix, prepend it. */ if (*dirpref) @@ -1199,7 +1009,7 @@ mkstruct (const struct urlinfo *u) /* Finally, construct the full name. */ res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1); sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file); - free (dir); + xfree (dir); return res; } @@ -1233,7 +1043,7 @@ url_filename (const struct urlinfo *u) char *nfile = (char *)xmalloc (strlen (opt.dir_prefix) + 1 + strlen (file) + 1); sprintf (nfile, "%s/%s", opt.dir_prefix, file); - free (file); + xfree (file); file = nfile; } } @@ -1262,29 +1072,37 @@ url_filename (const struct urlinfo *u) /* Find a unique name. */ name = unique_name (file); - free (file); + xfree (file); return name; } -/* Like strlen(), except if `?' is present in the URL and its protocol - is HTTP, act as if `?' is the end of the string. Needed for the - correct implementation of `construct' below, at least until we code - up proper parsing of URLs. */ +/* Like strlen(), but allow the URL to be ended with '?'. 
*/ static int -urllen_http_hack (const char *url) +urlpath_length (const char *url) { - if ((!strncmp (url, "http://", 7) - || !strncmp (url, "https://", 7))) - { - const char *q = strchr (url, '?'); - if (q) - return q - url; - } + const char *q = strchr (url, '?'); + if (q) + return q - url; return strlen (url); } -/* Construct an absolute URL, given a (possibly) relative one. This - is more tricky than it might seem, but it works. */ +/* Find the last occurrence of character C in the range [b, e), or + NULL, if none are present. This is almost completely equivalent to + { *e = '\0'; return strrchr(b); }, except that it doesn't change + the contents of the string. */ +static const char * +find_last_char (const char *b, const char *e, char c) +{ + for (; e > b; e--) + if (*e == c) + return e; + return NULL; +} + +/* Construct a URL by concatenating an absolute URL and a path, which + may or may not be absolute. This tries to behave "reasonably" in + all foreseeable cases. It employs little specific knowledge about + protocols or URL-specific stuff -- it just works on strings. */ static char * construct (const char *url, const char *sub, int subsize, int no_proto) { @@ -1292,62 +1110,124 @@ construct (const char *url, const char *sub, int subsize, int no_proto) if (no_proto) { - int i; + const char *end = url + urlpath_length (url); if (*sub != '/') { - for (i = urllen_http_hack (url); i && url[i] != '/'; i--); - if (!i || (url[i] == url[i - 1])) + /* SUB is a relative URL: we need to replace everything + after last slash (possibly empty) with SUB. + + So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy", + our result should be "whatever/foo/qux/xyzzy". */ + int need_explicit_slash = 0; + int span; + const char *start_insert; + const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */ + if (!last_slash) { - int l = urllen_http_hack (url); - char *t = (char *)alloca (l + 2); - memcpy (t, url, l); - t[l] = '/'; - t[l + 1] = '\0'; - url = t; - i = l; + /* No slash found at all. Append SUB to what we have, + but we'll need a slash as a separator. + + Example: if url == "foo" and sub == "qux/xyzzy", then + we cannot just append sub to url, because we'd get + "fooqux/xyzzy", whereas what we want is + "foo/qux/xyzzy". + + To make sure the / gets inserted, we set + need_explicit_slash to 1. We also set start_insert + to end + 1, so that the length calculations work out + correctly for one more (slash) character. Accessing + that character is fine, since it will be the + delimiter, '\0' or '?'. */ + /* example: "foo?..." */ + /* ^ ('?' 
gets changed to '/') */ + start_insert = end + 1; + need_explicit_slash = 1; } - constr = (char *)xmalloc (i + 1 + subsize + 1); - strncpy (constr, url, i + 1); - constr[i + 1] = '\0'; - strncat (constr, sub, subsize); - } - else /* *sub == `/' */ - { - int fl; - - i = 0; - do + else if (last_slash && last_slash != url && *(last_slash - 1) == '/') { - for (; url[i] && url[i] != '/'; i++); - if (!url[i]) - break; - fl = (url[i] == url[i + 1] && url[i + 1] == '/'); - if (fl) - i += 2; + /* example: http://host" */ + /* ^ */ + start_insert = end + 1; + need_explicit_slash = 1; } - while (fl); - if (!url[i]) + else { - int l = urllen_http_hack (url); - char *t = (char *)alloca (l + 2); - strcpy (t, url); - t[l] = '/'; - t[l + 1] = '\0'; - url = t; + /* example: "whatever/foo/bar" */ + /* ^ */ + start_insert = last_slash + 1; } - constr = (char *)xmalloc (i + 1 + subsize + 1); - strncpy (constr, url, i); - constr[i] = '\0'; - strncat (constr + i, sub, subsize); - constr[i + subsize] = '\0'; - } /* *sub == `/' */ + + span = start_insert - url; + constr = (char *)xmalloc (span + subsize + 1); + if (span) + memcpy (constr, url, span); + if (need_explicit_slash) + constr[span - 1] = '/'; + if (subsize) + memcpy (constr + span, sub, subsize); + constr[span + subsize] = '\0'; + } + else /* *sub == `/' */ + { + /* SUB is an absolute path: we need to replace everything + after (and including) the FIRST slash with SUB. + + So, if URL is "http://host/whatever/foo/bar", and SUB is + "/qux/xyzzy", our result should be + "http://host/qux/xyzzy". */ + int span; + const char *slash; + const char *start_insert = NULL; /* for gcc to shut up. */ + const char *pos = url; + int seen_slash_slash = 0; + /* We're looking for the first slash, but want to ignore + double slash. */ + again: + slash = memchr (pos, '/', end - pos); + if (slash && !seen_slash_slash) + if (*(slash + 1) == '/') + { + pos = slash + 2; + seen_slash_slash = 1; + goto again; + } + + /* At this point, SLASH is the location of the first / after + "//", or the first slash altogether. START_INSERT is the + pointer to the location where SUB will be inserted. When + examining the last two examples, keep in mind that SUB + begins with '/'. */ + + if (!slash && !seen_slash_slash) + /* example: "foo" */ + /* ^ */ + start_insert = url; + else if (!slash && seen_slash_slash) + /* example: "http://foo" */ + /* ^ */ + start_insert = end; + else if (slash && !seen_slash_slash) + /* example: "foo/bar" */ + /* ^ */ + start_insert = url; + else if (slash && seen_slash_slash) + /* example: "http://something/" */ + /* ^ */ + start_insert = slash; + + span = start_insert - url; + constr = (char *)xmalloc (span + subsize + 1); + if (span) + memcpy (constr, url, span); + if (subsize) + memcpy (constr + span, sub, subsize); + constr[span + subsize] = '\0'; + } } else /* !no_proto */ { - constr = (char *)xmalloc (subsize + 1); - strncpy (constr, sub, subsize); - constr[subsize] = '\0'; + constr = strdupdelim (sub, sub + subsize); } return constr; } @@ -1366,13 +1246,39 @@ opt_url (struct urlinfo *u) { /* Find the "true" host. */ char *host = realhost (u->host); - free (u->host); + xfree (u->host); u->host = host; assert (u->dir != NULL); /* the URL must have been parsed */ /* Refresh the printed representation. */ - free (u->url); + xfree (u->url); u->url = str_url (u, 0); } + +/* This beautiful kludge is fortunately not needed, as I've made + parse_dir do the (almost) right thing, so that a query can never + become a part of directory. 
*/ +#if 0 +/* Call path_simplify, but make sure that the part after the + question-mark, if any, is not destroyed by path_simplify's + "optimizations". */ +void +path_simplify_with_kludge (char *path) +{ + char *query = strchr (path, '?'); + if (query) + /* path_simplify also works destructively, so we also have the + license to write. */ + *query = '\0'; + path_simplify (path); + if (query) + { + char *newend = path + strlen (path); + *query = '?'; + if (newend != query) + memmove (newend, query, strlen (query) + 1); + } +} +#endif /* Returns proxy host address, in accordance with PROTO. */ char * @@ -1396,103 +1302,58 @@ no_proxy_match (const char *host, const char **no_proxy) return !sufmatch (no_proxy, host); } +static void write_backup_file PARAMS ((const char *, downloaded_file_t)); + /* Change the links in an HTML document. Accepts a structure that defines the positions of all the links. */ void convert_links (const char *file, urlpos *l) { + struct file_memory *fm; FILE *fp; - char *buf, *p, *p2; + char *p; downloaded_file_t downloaded_file_return; - long size; logprintf (LOG_VERBOSE, _("Converting %s... "), file); - /* Read from the file.... */ - fp = fopen (file, "rb"); - if (!fp) + + { + /* First we do a "dry run": go through the list L and see whether + any URL needs to be converted in the first place. If not, just + leave the file alone. */ + int count = 0; + urlpos *dry = l; + for (dry = l; dry; dry = dry->next) + if (dry->convert != CO_NOCONVERT) + ++count; + if (!count) + { + logputs (LOG_VERBOSE, _("nothing to do.\n")); + return; + } + } + + fm = read_file (file); + if (!fm) { logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"), file, strerror (errno)); return; } - /* ...to a buffer. */ - load_file (fp, &buf, &size); - fclose (fp); - - downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file); + downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file); if (opt.backup_converted && downloaded_file_return) - /* Rather than just writing over the original .html file with the converted - version, save the former to *.orig. Note we only do this for files we've - _successfully_ downloaded, so we don't clobber .orig files sitting around - from previous invocations. */ - { - /* Construct the backup filename as the original name plus ".orig". */ - size_t filename_len = strlen(file); - char* filename_plus_orig_suffix; - boolean already_wrote_backup_file = FALSE; - slist* converted_file_ptr; - static slist* converted_files = NULL; - - if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED) - { - /* Just write "orig" over "html". We need to do it this way because - when we're checking to see if we've downloaded the file before (to - see if we can skip downloading it), we don't know if it's a - text/html file. Therefore we don't know yet at that stage that -E - is going to cause us to tack on ".html", so we need to compare - vs. the original URL plus ".orig", not the original URL plus - ".html.orig". */ - filename_plus_orig_suffix = xmalloc(filename_len + 1); - strcpy(filename_plus_orig_suffix, file); - strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig"); - } - else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */ - { - /* Append ".orig" to the name. 
*/ - filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig")); - strcpy(filename_plus_orig_suffix, file); - strcpy(filename_plus_orig_suffix + filename_len, ".orig"); - } + write_backup_file (file, downloaded_file_return); - /* We can get called twice on the same URL thanks to the - convert_all_links() call in main(). If we write the .orig file each - time in such a case, it'll end up containing the first-pass conversion, - not the original file. So, see if we've already been called on this - file. */ - converted_file_ptr = converted_files; - while (converted_file_ptr != NULL) - if (strcmp(converted_file_ptr->string, file) == 0) - { - already_wrote_backup_file = TRUE; - break; - } - else - converted_file_ptr = converted_file_ptr->next; - - if (!already_wrote_backup_file) - { - /* Rename to .orig before former gets written over. */ - if (rename(file, filename_plus_orig_suffix) != 0) - logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"), - file, filename_plus_orig_suffix, strerror (errno)); - - /* Remember that we've already written a .orig backup for this file. - Note that we never free this memory since we need it till the - convert_all_links() call, which is one of the last things the - program does before terminating. BTW, I'm not sure if it would be - safe to just set 'converted_file_ptr->string' to 'file' below, - rather than making a copy of the string... Another note is that I - thought I could just add a field to the urlpos structure saying - that we'd written a .orig file for this URL, but that didn't work, - so I had to make this separate list. */ - converted_file_ptr = xmalloc(sizeof(*converted_file_ptr)); - converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */ - converted_file_ptr->next = converted_files; - converted_files = converted_file_ptr; - } - - free(filename_plus_orig_suffix); + /* Before opening the file for writing, unlink the file. This is + important if the data in FM is mmaped. In such case, nulling the + file, which is what fopen() below does, would make us read all + zeroes from the mmaped region. */ + if (unlink (file) < 0 && errno != ENOENT) + { + logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"), + file, strerror (errno)); + read_file_free (fm); + return; } /* Now open the file for writing. */ fp = fopen (file, "wb"); @@ -1500,50 +1361,66 @@ convert_links (const char *file, urlpos *l) { logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"), file, strerror (errno)); - free (buf); + read_file_free (fm); return; } - /* Presumably we have to loop through multiple URLs here (even though we're - only talking about a single local file) because of the -O option. */ - for (p = buf; l; l = l->next) + /* Here we loop through all the URLs in file, replacing those of + them that are downloaded with relative references. */ + p = fm->content; + for (; l; l = l->next) { - if (l->pos >= size) + char *url_start = fm->content + l->pos; + if (l->pos >= fm->length) { DEBUGP (("Something strange is going on. Please investigate.")); break; } - /* If the URL already is relative or it is not to be converted - for some other reason (e.g. because of not having been - downloaded in the first place), skip it. */ - if ((l->flags & URELATIVE) || !(l->flags & UABS2REL)) + /* If the URL is not to be converted, skip it. 
*/ + if (l->convert == CO_NOCONVERT) { - DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url, - l->pos, l->flags)); + DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos)); continue; } - /* Else, reach the position of the offending URL, echoing - everything up to it to the outfile. */ - for (p2 = buf + l->pos; p < p2; p++) - putc (*p, fp); - if (l->flags & UABS2REL) - /* Convert absolute URL to relative. */ + + /* Echo the file contents, up to the offending URL's opening + quote, to the outfile. */ + fwrite (p, 1, url_start - p, fp); + p = url_start; + if (l->convert == CO_CONVERT_TO_RELATIVE) { + /* Convert absolute URL to relative. */ char *newname = construct_relative (file, l->local_name); - fprintf (fp, "%s", newname); - DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n", + char *quoted_newname = html_quote_string (newname); + putc (*p, fp); /* quoting char */ + fputs (quoted_newname, fp); + p += l->size - 1; + putc (*p, fp); /* close quote */ + ++p; + xfree (newname); + xfree (quoted_newname); + DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n", l->url, newname, l->pos, file)); - free (newname); } - p += l->size; + else if (l->convert == CO_CONVERT_TO_COMPLETE) + { + /* Convert the link to absolute URL. */ + char *newlink = l->url; + char *quoted_newlink = html_quote_string (newlink); + putc (*p, fp); /* quoting char */ + fputs (quoted_newlink, fp); + p += l->size - 1; + putc (*p, fp); /* close quote */ + ++p; + xfree (quoted_newlink); + DEBUGP (("TO_COMPLETE: to %s at position %d in %s.\n", + newlink, l->pos, file)); + } } /* Output the rest of the file. */ - if (p - buf < size) - { - for (p2 = buf + size; p < p2; p++) - putc (*p, fp); - } + if (p - fm->content < fm->length) + fwrite (p, 1, fm->length - (p - fm->content), fp); fclose (fp); - free (buf); + read_file_free (fm); logputs (LOG_VERBOSE, _("done.\n")); } @@ -1615,6 +1492,99 @@ add_url (urlpos *l, const char *url, const char *file) return t; } +static void +write_backup_file (const char *file, downloaded_file_t downloaded_file_return) +{ + /* Rather than just writing over the original .html file with the + converted version, save the former to *.orig. Note we only do + this for files we've _successfully_ downloaded, so we don't + clobber .orig files sitting around from previous invocations. */ + + /* Construct the backup filename as the original name plus ".orig". */ + size_t filename_len = strlen(file); + char* filename_plus_orig_suffix; + boolean already_wrote_backup_file = FALSE; + slist* converted_file_ptr; + static slist* converted_files = NULL; + + if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED) + { + /* Just write "orig" over "html". We need to do it this way + because when we're checking to see if we've downloaded the + file before (to see if we can skip downloading it), we don't + know if it's a text/html file. Therefore we don't know yet + at that stage that -E is going to cause us to tack on + ".html", so we need to compare vs. the original URL plus + ".orig", not the original URL plus ".html.orig". */ + filename_plus_orig_suffix = alloca (filename_len + 1); + strcpy(filename_plus_orig_suffix, file); + strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig"); + } + else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */ + { + /* Append ".orig" to the name. 
*/ + filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig")); + strcpy(filename_plus_orig_suffix, file); + strcpy(filename_plus_orig_suffix + filename_len, ".orig"); + } + + /* We can get called twice on the same URL thanks to the + convert_all_links() call in main(). If we write the .orig file + each time in such a case, it'll end up containing the first-pass + conversion, not the original file. So, see if we've already been + called on this file. */ + converted_file_ptr = converted_files; + while (converted_file_ptr != NULL) + if (strcmp(converted_file_ptr->string, file) == 0) + { + already_wrote_backup_file = TRUE; + break; + } + else + converted_file_ptr = converted_file_ptr->next; + + if (!already_wrote_backup_file) + { + /* Rename to .orig before former gets written over. */ + if (rename(file, filename_plus_orig_suffix) != 0) + logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"), + file, filename_plus_orig_suffix, strerror (errno)); + + /* Remember that we've already written a .orig backup for this file. + Note that we never free this memory since we need it till the + convert_all_links() call, which is one of the last things the + program does before terminating. BTW, I'm not sure if it would be + safe to just set 'converted_file_ptr->string' to 'file' below, + rather than making a copy of the string... Another note is that I + thought I could just add a field to the urlpos structure saying + that we'd written a .orig file for this URL, but that didn't work, + so I had to make this separate list. + + This [adding a field to the urlpos structure] didn't work + because convert_file() is called twice: once after all its + sublinks have been retrieved in recursive_retrieve(), and + once at the end of the day in convert_all_links(). The + original linked list collected in recursive_retrieve() is + lost after the first invocation of convert_links(), and + convert_all_links() makes a new one (it calls get_urls_html() + for each file it covers.) That's why your approach didn't + work. The way to make it work is perhaps to make this flag a + field in the `urls_html' list. */ + + converted_file_ptr = xmalloc(sizeof(*converted_file_ptr)); + converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */ + converted_file_ptr->next = converted_files; + converted_files = converted_file_ptr; + } +} + +typedef struct _downloaded_file_list { + char* file; + downloaded_file_t download_type; + struct _downloaded_file_list* next; +} downloaded_file_list; + +static downloaded_file_list *downloaded_files; /* Remembers which files have been downloaded. 
In the standard case, should be called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually @@ -1631,15 +1601,7 @@ add_url (urlpos *l, const char *url, const char *file) downloaded_file_t downloaded_file (downloaded_file_t mode, const char* file) { - typedef struct _downloaded_file_list - { - char* file; - downloaded_file_t download_type; - struct _downloaded_file_list* next; - } downloaded_file_list; - boolean found_file = FALSE; - static downloaded_file_list* downloaded_files = NULL; downloaded_file_list* rover = downloaded_files; while (rover != NULL) @@ -1667,3 +1629,23 @@ downloaded_file (downloaded_file_t mode, const char* file) return FILE_NOT_ALREADY_DOWNLOADED; } } + +void +downloaded_files_free (void) +{ + downloaded_file_list* rover = downloaded_files; + while (rover) + { + downloaded_file_list *next = rover->next; + xfree (rover->file); + xfree (rover); + rover = next; + } +} + +/* Initialization of static stuff. */ +void +url_init (void) +{ + init_unsafe_char_table (); +}
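

The patch above replaces wget's strchr()-based UNSAFE_CHAR macro with a 256-entry table filled once by init_unsafe_char_table(), so classifying a byte becomes a single array lookup. The standalone sketch below is not wget code (init_unsafe, UNSAFE and the set string are illustrative names); it builds an equivalent table for the byte classes the new initializer marks: controls and space, DEL and anything above ASCII 126, and the RFC 1738 unsafe set.

#include <stdio.h>
#include <string.h>

static char unsafe[256];

/* Fill the table once at startup; afterwards each test is one lookup
   instead of a strchr() call per character. */
static void
init_unsafe (void)
{
  static const char *set = "<>\"#%{}|\\^~[]`";
  int i;
  for (i = 0; i < 256; i++)
    if (i <= 32 || i >= 127 || strchr (set, i))
      unsafe[i] = 1;
}

#define UNSAFE(c) (unsafe[(unsigned char) (c)])

int
main (void)
{
  const char *url = "http://host/~user/some file#frag";
  const char *p;
  init_unsafe ();
  for (p = url; *p; p++)
    if (UNSAFE (*p))
      printf ("unsafe byte 0x%02x ('%c')\n", (unsigned char) *p, *p);
  return 0;
}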
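
The rewritten construct() joins a base URL and a link using only string operations: a relative link replaces everything after the base path's last slash, a link beginning with '/' replaces everything after the host part (the first single slash following "scheme://"), and urlpath_length() makes a trailing "?query" end the path. The sketch below covers only those two common cases (merge() and path_length() are made-up names; the real function also handles bases with no slash at all via need_explicit_slash and detects "//" explicitly).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Length of the path part of URL, stopping at '?', like urlpath_length. */
static size_t
path_length (const char *url)
{
  const char *q = strchr (url, '?');
  return q ? (size_t) (q - url) : strlen (url);
}

static char *
merge (const char *base, const char *link)
{
  size_t end = path_length (base);
  size_t cut;
  char *res;

  if (*link == '/')
    {
      /* Absolute path: keep "scheme://host", drop the old path. */
      const char *host = strstr (base, "//");
      const char *slash = host ? strchr (host + 2, '/') : strchr (base, '/');
      cut = slash ? (size_t) (slash - base) : end;
    }
  else
    {
      /* Relative link: keep everything up to and including the last '/'. */
      size_t i = end;
      while (i > 0 && base[i - 1] != '/')
        i--;
      cut = i;
    }

  res = malloc (cut + strlen (link) + 1);
  memcpy (res, base, cut);
  strcpy (res + cut, link);
  return res;
}

int
main (void)
{
  char *a = merge ("http://host/dir/page.html", "img/pic.gif");
  char *b = merge ("http://host/dir/page.html", "/other/doc.html");
  printf ("%s\n%s\n", a, b);   /* http://host/dir/img/pic.gif
                                  http://host/other/doc.html   */
  free (a);
  free (b);
  return 0;
}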
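
The new get_urls_file() loads the whole file with read_file(), walks it line by line with memchr, trims surrounding whitespace, and appends each non-empty line to a singly linked list through head/tail pointers, which keeps input order with O(1) appends. A plain-stdio approximation of that loop follows (struct entry and collect_lines are illustrative names; wget uses urlpos and strdupdelim).

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry { char *url; struct entry *next; };

static struct entry *
collect_lines (const char *text, size_t length)
{
  struct entry *head = NULL, *tail = NULL;
  const char *end = text + length;

  while (text < end)
    {
      const char *beg = text;
      const char *eol = memchr (text, '\n', end - text);
      const char *stop = eol ? eol : end;

      text = eol ? eol + 1 : end;       /* advance to the next line */

      /* Trim leading and trailing whitespace, as the patch does. */
      while (beg < stop && isspace ((unsigned char) *beg))
        beg++;
      while (stop > beg && isspace ((unsigned char) stop[-1]))
        stop--;

      if (stop > beg)                   /* skip blank lines */
        {
          size_t n = stop - beg;
          struct entry *e = malloc (sizeof *e);
          e->url = malloc (n + 1);
          memcpy (e->url, beg, n);
          e->url[n] = '\0';
          e->next = NULL;
          if (tail)                     /* O(1) append keeps input order */
            tail->next = e;
          else
            head = e;
          tail = e;
        }
    }
  return head;
}

int
main (void)
{
  const char *buf = "http://a.example/\n  http://b.example/x  \n\n";
  struct entry *e = collect_lines (buf, strlen (buf));
  for (; e; e = e->next)
    printf ("%s\n", e->url);
  return 0;
}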
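
convert_links() now works from an in-memory copy of the document and rewrites it span by span: bytes before each recorded link offset are copied verbatim, the replacement text is written in its place, and the old text is skipped by its recorded length. The sketch below shows only that copy/substitute/skip mechanic with made-up names (struct span, rewrite); the patch additionally re-emits the quote characters around each link and passes the replacement through html_quote_string().

#include <stdio.h>
#include <string.h>

struct span { size_t pos, size; const char *replacement; };

static void
rewrite (const char *content, size_t length,
         const struct span *spans, size_t nspans, FILE *out)
{
  const char *p = content;
  size_t i;
  for (i = 0; i < nspans; i++)
    {
      const char *start = content + spans[i].pos;
      fwrite (p, 1, start - p, out);           /* copy untouched bytes  */
      fputs (spans[i].replacement, out);       /* substitute the span   */
      p = start + spans[i].size;               /* skip the old text     */
    }
  fwrite (p, 1, (content + length) - p, out);  /* copy the tail         */
}

int
main (void)
{
  /* The URL starts at offset 9 and is 18 bytes long; replacing it with
     a relative name yields: <a href="a.html">a</a> */
  const char *doc = "<a href=\"http://host/a.html\">a</a>";
  struct span s[] = { { 9, 18, "a.html" } };
  rewrite (doc, strlen (doc), s, 1, stdout);
  putchar ('\n');
  return 0;
}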