[svn] Rewrite parsing and handling of URLs.

[wget] / src / url.c
diff --git a/src/url.c b/src/url.c

index 582023e8a6fa98ff254311b733a755f01dea8e1b..e1685ab9aa8554093b0ba6d7bd6e817f1b951324 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -49,27 +49,26 @@ extern int errno;
  
  static int urlpath_length PARAMS ((const char *));
  
  
  static int urlpath_length PARAMS ((const char *));
  
-struct proto
+struct scheme_data
  {
  {
-  char *name;
-  uerr_t ind;
-  unsigned short port;
+  char *leading_string;
+  int default_port;
  };
  
  };
  
-/* Supported protocols: */
-static struct proto sup_protos[] =
+/* Supported schemes: */
+static struct scheme_data supported_schemes[] =
  {
  {
-  { "http://", URLHTTP, DEFAULT_HTTP_PORT },
+  { "http://",  DEFAULT_HTTP_PORT },
  #ifdef HAVE_SSL
  #ifdef HAVE_SSL
-  { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
+  { "https://", DEFAULT_HTTPS_PORT },
  #endif
  #endif
-  { "ftp://", URLFTP, DEFAULT_FTP_PORT }
+  { "ftp://",   DEFAULT_FTP_PORT },
+
+  /* SCHEME_INVALID */
+  { NULL,       -1 }
  };
  
  };
  
-static void parse_dir PARAMS ((const char *, char **, char **));
-static uerr_t parse_uname PARAMS ((const char *, char **, char **));
  static char *construct_relative PARAMS ((const char *, const char *));
  static char *construct_relative PARAMS ((const char *, const char *));
-static char process_ftp_type PARAMS ((char *));
  
  \f
  /* Support for encoding and decoding of URL strings.  We determine
  
  \f
  /* Support for encoding and decoding of URL strings.  We determine
@@ -87,17 +86,11 @@ enum {
  
  #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
  
  
  #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
  
-/* rfc1738 reserved chars.  We don't use this yet; preservation of
-   reserved chars will be implemented when I integrate the new
-   `reencode_string' function.  */
+/* rfc1738 reserved chars, preserved from encoding.  */
  
  #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
  
  
  #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
  
-/* Unsafe chars:
-   - anything <= 32;
-   - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
-   - '@' and ':'; needed for encoding URL username and password.
-   - anything >= 127. */
+/* rfc1738 unsafe chars, plus some more.  */
  
  #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
  
  
  #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
  
@@ -107,10 +100,10 @@ const static unsigned char urlchr_table[256] =
    U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
    U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
    U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
    U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
    U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
    U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
-  U,  0,  U,  U,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
+  U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
    0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
    0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
-  0,  0,  U,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
+  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
   RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
   RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
@@ -229,39 +222,217 @@ encode_string (const char *s)
      }                                          \
  } while (0)
  \f
      }                                          \
  } while (0)
  \f
-/* Returns the protocol type if URL's protocol is supported, or
-   URLUNKNOWN if not.  */
-uerr_t
-urlproto (const char *url)
-{
-  int i;
+enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
  
  
-  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
-    if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
-      return sup_protos[i].ind;
-  for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
-  if (url[i] == ':')
+/* Decide whether to encode, decode, or pass through the char at P.
+   This used to be a macro, but it got a little too convoluted.  */
+static inline enum copy_method
+decide_copy_method (const char *p)
+{
+  if (*p == '%')
      {
      {
-      for (++i; url[i] && url[i] != '/'; i++)
-       if (!ISDIGIT (url[i]))
-         return URLBADPORT;
-      if (url[i - 1] == ':')
-       return URLFTP;
+      if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
+       {
+         /* %xx sequence: decode it, unless it would decode to an
+            unsafe or a reserved char; in that case, leave it as
+            is. */
+         char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
+           XCHAR_TO_XDIGIT (*(p + 2));
+
+         if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
+           return CM_PASSTHROUGH;
+         else
+           return CM_DECODE;
+       }
        else
        else
-       return URLHTTP;
+       /* Garbled %.. sequence: encode `%'. */
+       return CM_ENCODE;
      }
      }
+  else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
+    return CM_ENCODE;
    else
    else
-    return URLHTTP;
+    return CM_PASSTHROUGH;
+}
+
+/* Translate a %-quoting (but possibly non-conformant) input string S
+   into a %-quoting (and conformant) output string.  If no characters
+   are encoded or decoded, return the same string S; otherwise, return
+   a freshly allocated string with the new contents.
+
+   After a URL has been run through this function, the protocols that
+   use `%' as the quote character can use the resulting string as-is,
+   while those that don't call decode_string() to get to the intended
+   data.  This function is also stable: after an input string is
+   transformed the first time, all further transformations of the
+   result yield the same result string.
+
+   Let's discuss why this function is needed.
+
+   Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
+   space character would mess up the HTTP request, it needs to be
+   quoted, like this:
+
+       GET /abc%20def HTTP/1.0
+
+   So it appears that the unsafe chars need to be quoted, as with
+   encode_string.  But what if we're requested to download
+   `abc%20def'?  Remember that %-encoding is valid URL syntax, so what
+   the user meant was a literal space, and he was kind enough to quote
+   it.  In that case, Wget should obviously leave the `%20' as is, and
+   send the same request as above.  So in this case we may not call
+   encode_string.
+
+   But what if the requested URI is `abc%20 def'?  If we call
+   encode_string, we end up with `/abc%2520%20def', which is almost
+   certainly not intended.  If we don't call encode_string, we are
+   left with the embedded space and cannot send the request.  What the
+   user meant was for Wget to request `/abc%20%20def', and this is
+   where reencode_string kicks in.
+
+   Wget used to solve this by first decoding %-quotes, and then
+   encoding all the "unsafe" characters found in the resulting string.
+   This was wrong because it didn't preserve certain URL special
+   (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
+   == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
+   whether we considered `+' reserved (it is).  One of these results
+   is inevitable because by the second step we would lose information
+   on whether the `+' was originally encoded or not.  Both results
+   were wrong because in CGI parameters + means space, while %2B means
+   literal plus.  reencode_string correctly translates the above to
+   "a%2B+b", i.e. returns the original string.
+
+   This function uses an algorithm proposed by Anon Sricharoenchai:
+
+   1. Encode all URL_UNSAFE and the "%" that are not followed by 2
+      hexdigits.
+
+   2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
+      "+".
+
+   ...except that this code conflates the two steps, and decides
+   whether to encode, decode, or pass through each character in turn.
+   The function still uses two passes, but their logic is the same --
+   the first pass exists merely for the sake of allocation.  Another
+   small difference is that we include `+' to URL_RESERVED.
+
+   Anon's test case:
+
+   "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
+   ->
+   "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
+
+   Simpler test cases:
+
+   "foo bar"         -> "foo%20bar"
+   "foo%20bar"       -> "foo%20bar"
+   "foo %20bar"      -> "foo%20%20bar"
+   "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
+   "foo%25%20bar"    -> "foo%25%20bar"
+   "foo%2%20bar"     -> "foo%252%20bar"
+   "foo+bar"         -> "foo+bar"            (plus is reserved!)
+   "foo%2b+bar"      -> "foo%2b+bar"  */
+
+char *
+reencode_string (const char *s)
+{
+  const char *p1;
+  char *newstr, *p2;
+  int oldlen, newlen;
+
+  int encode_count = 0;
+  int decode_count = 0;
+
+  /* First, pass through the string to see if there's anything to do,
+     and to calculate the new length.  */
+  for (p1 = s; *p1; p1++)
+    {
+      switch (decide_copy_method (p1))
+       {
+       case CM_ENCODE:
+         ++encode_count;
+         break;
+       case CM_DECODE:
+         ++decode_count;
+         break;
+       case CM_PASSTHROUGH:
+         break;
+       }
+    }
+
+  if (!encode_count && !decode_count)
+    /* The string is good as it is. */
+    return (char *)s;          /* C const model sucks. */
+
+  oldlen = p1 - s;
+  /* Each encoding adds two characters (hex digits), while each
+     decoding removes two characters.  */
+  newlen = oldlen + 2 * (encode_count - decode_count);
+  newstr = xmalloc (newlen + 1);
+
+  p1 = s;
+  p2 = newstr;
+
+  while (*p1)
+    {
+      switch (decide_copy_method (p1))
+       {
+       case CM_ENCODE:
+         {
+           char c = *p1++;
+           *p2++ = '%';
+           *p2++ = XDIGIT_TO_XCHAR (c >> 4);
+           *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
+         }
+         break;
+       case CM_DECODE:
+         *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
+                  + (XCHAR_TO_XDIGIT (*(p1 + 2))));
+         p1 += 3;              /* skip %xx */
+         break;
+       case CM_PASSTHROUGH:
+         *p2++ = *p1++;
+       }
+    }
+  *p2 = '\0';
+  assert (p2 - newstr == newlen);
+  return newstr;
  }
  
  }
  
-/* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
-   part is found, returns 0.  */
+/* Run PTR_VAR through reencode_string.  If a new string is consed,
+   free PTR_VAR and make it point to the new storage.  Obviously,
+   PTR_VAR needs to be an lvalue.  */
+
+#define REENCODE(ptr_var) do {                 \
+  char *rf_new = reencode_string (ptr_var);    \
+  if (rf_new != ptr_var)                       \
+    {                                          \
+      xfree (ptr_var);                         \
+      ptr_var = rf_new;                                \
+    }                                          \
+} while (0)
+\f
+/* Returns the scheme type if the scheme is supported, or
+   SCHEME_INVALID if not.  */
+enum url_scheme
+url_scheme (const char *url)
+{
+  int i;
+
+  for (i = 0; supported_schemes[i].leading_string; i++)
+    if (!strncasecmp (url, supported_schemes[i].leading_string,
+                     strlen (supported_schemes[i].leading_string)))
+      return (enum url_scheme)i;
+  return SCHEME_INVALID;
+}
+
+/* Return the number of characters needed to skip the scheme part of
+   the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
  int
  int
-skip_proto (const char *url)
+url_skip_scheme (const char *url)
  {
    const char *p = url;
  
  {
    const char *p = url;
  
-  /* Skip protocol name.  We allow `-' and `+' because of `whois++',
+  /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
       etc. */
    while (ISALNUM (*p) || *p == '-' || *p == '+')
      ++p;
       etc. */
    while (ISALNUM (*p) || *p == '-' || *p == '+')
      ++p;
@@ -277,10 +448,10 @@ skip_proto (const char *url)
    return p - url;
  }
  
    return p - url;
  }
  
-/* Returns 1 if the URL begins with a protocol (supported or
+/* Returns 1 if the URL begins with a scheme (supported or
     unsupported), 0 otherwise.  */
  int
     unsupported), 0 otherwise.  */
  int
-has_proto (const char *url)
+url_has_scheme (const char *url)
  {
    const char *p = url;
    while (ISALNUM (*p) || *p == '-' || *p == '+')
  {
    const char *p = url;
    while (ISALNUM (*p) || *p == '-' || *p == '+')
@@ -288,474 +459,479 @@ has_proto (const char *url)
    return *p == ':';
  }
  
    return *p == ':';
  }
  
+int
+scheme_default_port (enum url_scheme scheme)
+{
+  return supported_schemes[scheme].default_port;
+}
+
  /* Skip the username and password, if present here.  The function
     should be called *not* with the complete URL, but with the part
  /* Skip the username and password, if present here.  The function
     should be called *not* with the complete URL, but with the part
-   right after the protocol.
+   right after the scheme.
  
     If no username and password are found, return 0.  */
  int
  
     If no username and password are found, return 0.  */
  int
-skip_uname (const char *url)
+url_skip_uname (const char *url)
  {
    const char *p;
  {
    const char *p;
-  const char *q = NULL;
-  for (p = url ; *p && *p != '/'; p++)
-    if (*p == '@') q = p;
-  /* If a `@' was found before the first occurrence of `/', skip
-     it.  */
-  if (q != NULL)
-    return q - url + 1;
-  else
+
+  /* Look for '@' that comes before '/' or '?'. */
+  p = (const char *)strpbrk (url, "/?@");
+  if (!p || *p != '@')
      return 0;
      return 0;
-}
-\f
-/* Allocate a new urlinfo structure, fill it with default values and
-   return a pointer to it.  */
-struct urlinfo *
-newurl (void)
-{
-  struct urlinfo *u;
  
  
-  u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
-  memset (u, 0, sizeof (*u));
-  u->proto = URLUNKNOWN;
-  return u;
+  return p - url + 1;
  }
  
  }
  
-/* Perform a "deep" free of the urlinfo structure.  The structure
-   should have been created with newurl, but need not have been used.
-   If free_pointer is non-0, free the pointer itself.  */
-void
-freeurl (struct urlinfo *u, int complete)
-{
-  assert (u != NULL);
-  FREE_MAYBE (u->url);
-  FREE_MAYBE (u->host);
-  FREE_MAYBE (u->path);
-  FREE_MAYBE (u->file);
-  FREE_MAYBE (u->dir);
-  FREE_MAYBE (u->user);
-  FREE_MAYBE (u->passwd);
-  FREE_MAYBE (u->local);
-  FREE_MAYBE (u->referer);
-  if (u->proxy)
-    freeurl (u->proxy, 1);
-  if (complete)
-    xfree (u);
-  return;
-}
-\f
-/* Extract the given URL of the form
-   (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
-   1. hostname (terminated with `/' or `:')
-   2. port number (terminated with `/'), or chosen for the protocol
-   3. dirname (everything after hostname)
-   Most errors are handled.  No allocation is done, you must supply
-   pointers to allocated memory.
-   ...and a host of other stuff :-)
-
-   - Recognizes hostname:dir/file for FTP and
-     hostname (:portnum)?/dir/file for HTTP.
-   - Parses the path to yield directory and file
-   - Parses the URL to yield the username and passwd (if present)
-   - Decodes the strings, in case they contain "forbidden" characters
-   - Writes the result to struct urlinfo
-
-   If the argument STRICT is set, it recognizes only the canonical
-   form.  */
-uerr_t
-parseurl (const char *url, struct urlinfo *u, int strict)
+static int
+parse_uname (const char *str, int len, char **user, char **passwd)
  {
  {
-  int i, l, abs_ftp;
-  int recognizable;            /* Recognizable URL is the one where
-                                 the protocol name was explicitly
-                                 named, i.e. it wasn't deduced from
-                                 the URL format.  */
-  uerr_t type;
-
-  DEBUGP (("parseurl (\"%s\") -> ", url));
-  recognizable = has_proto (url);
-  if (strict && !recognizable)
-    return URLUNKNOWN;
-  for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
+  char *colon;
+
+  if (len == 0)
+    /* Empty user name not allowed. */
+    return 0;
+
+  colon = memchr (str, ':', len);
+  if (colon == str)
+    /* Empty user name again. */
+    return 0;
+
+  if (colon)
      {
      {
-      l = strlen (sup_protos[i].name);
-      if (!strncasecmp (sup_protos[i].name, url, l))
-       break;
+      int pwlen = len - (colon + 1 - str);
+      *passwd = xmalloc (pwlen + 1);
+      memcpy (*passwd, colon + 1, pwlen);
+      (*passwd)[pwlen] = '\0';
+      len -= pwlen + 1;
      }
      }
-  /* If protocol is recognizable, but unsupported, bail out, else
-     suppose unknown.  */
-  if (recognizable && i == ARRAY_SIZE (sup_protos))
-    return URLUNKNOWN;
-  else if (i == ARRAY_SIZE (sup_protos))
-    type = URLUNKNOWN;
    else
    else
-    u->proto = type = sup_protos[i].ind;
-
-  if (type == URLUNKNOWN)
-    l = 0;
-  /* Allow a username and password to be specified (i.e. just skip
-     them for now).  */
-  if (recognizable)
-    l += skip_uname (url + l);
-  for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
-  if (i == l)
-    return URLBADHOST;
-  /* Get the hostname.  */
-  u->host = strdupdelim (url + l, url + i);
-  DEBUGP (("host %s -> ", u->host));
-
-  /* Assume no port has been given.  */
-  u->port = 0;
-  if (url[i] == ':')
-    {
-      /* We have a colon delimiting the hostname.  It could mean that
-        a port number is following it, or a directory.  */
-      if (ISDIGIT (url[++i]))    /* A port number */
-       {
-         if (type == URLUNKNOWN)
-           u->proto = type = URLHTTP;
-         for (; url[i] && url[i] != '/'; i++)
-           if (ISDIGIT (url[i]))
-             u->port = 10 * u->port + (url[i] - '0');
-           else
-             return URLBADPORT;
-         if (!u->port)
-           return URLBADPORT;
-         DEBUGP (("port %hu -> ", u->port));
-       }
-      else if (type == URLUNKNOWN) /* or a directory */
-       u->proto = type = URLFTP;
-      else                      /* or just a misformed port number */
-       return URLBADPORT;
-    }
-  else if (type == URLUNKNOWN)
-    u->proto = type = URLHTTP;
-  if (!u->port)
+    *passwd = NULL;
+
+  *user = xmalloc (len + 1);
+  memcpy (*user, str, len);
+  (*user)[len] = '\0';
+
+  return 1;
+}
+
+/* Used by main.c: detect URLs written using the "shorthand" URL forms
+   popularized by Netscape and NcFTP.  HTTP shorthands look like this:
+
+   www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
+   www.foo.com[:port]            -> http://www.foo.com[:port]
+
+   FTP shorthands look like this:
+
+   foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
+   foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
+
+   If the URL needs not or cannot be rewritten, return NULL.  */
+char *
+rewrite_shorthand_url (const char *url)
+{
+  const char *p;
+
+  if (url_has_scheme (url))
+    return NULL;
+
+  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
+     latter Netscape.  */
+  for (p = url; *p && *p != ':' && *p != '/'; p++)
+    ;
+
+  if (p == url)
+    return NULL;
+
+  if (*p == ':')
      {
      {
-      int ind;
-      for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
-       if (sup_protos[ind].ind == type)
-         break;
-      if (ind == ARRAY_SIZE (sup_protos))
-       return URLUNKNOWN;
-      u->port = sup_protos[ind].port;
+      const char *pp, *path;
+      char *res;
+      /* If the characters after the colon and before the next slash
+        or end of string are all digits, it's HTTP.  */
+      int digits = 0;
+      for (pp = p + 1; ISDIGIT (*pp); pp++)
+       ++digits;
+      if (digits > 0
+         && (*pp == '/' || *pp == '\0'))
+       goto http;
+
+      /* Prepend "ftp://" to the entire URL... */
+      path = p + 1;
+      res = xmalloc (6 + strlen (url) + 1);
+      sprintf (res, "ftp://%s", url);
+      /* ...and replace ':' with '/'. */
+      res[6 + (p - url)] = '/';
+      return res;
      }
      }
-  /* Some delimiter troubles...  */
-  if (url[i] == '/' && url[i - 1] != ':')
-    ++i;
-  if (type == URLHTTP)
-    while (url[i] && url[i] == '/')
-      ++i;
-  u->path = (char *)xmalloc (strlen (url + i) + 8);
-  strcpy (u->path, url + i);
-  if (type == URLFTP)
+  else
      {
      {
-      u->ftp_type = process_ftp_type (u->path);
-      /* #### We don't handle type `d' correctly yet.  */
-      if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
-       u->ftp_type = 'I';
-      DEBUGP (("ftp_type %c -> ", u->ftp_type));
+      char *res;
+    http:
+      /* Just prepend "http://" to what we have. */
+      res = xmalloc (7 + strlen (url) + 1);
+      sprintf (res, "http://%s", url);
+      return res;
      }
      }
-  DEBUGP (("opath %s -> ", u->path));
-  /* Parse the username and password (if existing).  */
-  parse_uname (url, &u->user, &u->passwd);
-  /* Decode the strings, as per RFC 1738.  */
-  decode_string (u->host);
-  decode_string (u->path);
-  if (u->user)
-    decode_string (u->user);
-  if (u->passwd)
-    decode_string (u->passwd);
-  /* Parse the directory.  */
-  parse_dir (u->path, &u->dir, &u->file);
-  DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
-  /* Simplify the directory.  */
-  path_simplify (u->dir);
-  /* Remove the leading `/' in HTTP.  */
-  if (type == URLHTTP && *u->dir == '/')
-    strcpy (u->dir, u->dir + 1);
-  DEBUGP (("ndir %s\n", u->dir));
-  /* Strip trailing `/'.  */
-  l = strlen (u->dir);
-  if (l > 1 && u->dir[l - 1] == '/')
-    u->dir[l - 1] = '\0';
-  /* Re-create the path: */
-  abs_ftp = (u->proto == URLFTP && *u->dir == '/');
-  /*  sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
-      abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
-  strcpy (u->path, abs_ftp ? "%2F" : "/");
-  strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
-  strcat (u->path, *u->dir ? "/" : "");
-  strcat (u->path, u->file);
-  ENCODE (u->path);
-  DEBUGP (("newpath: %s\n", u->path));
-  /* Create the clean URL.  */
-  u->url = str_url (u, 0);
-  return URLOK;
  }
  \f
  }
  \f
-/* Special versions of DOTP and DDOTP for parse_dir().  They work like
-   DOTP and DDOTP, but they also recognize `?' as end-of-string
-   delimiter.  This is needed for correct handling of query
-   strings.  */
+static void parse_path PARAMS ((const char *, char **, char **));
  
  
-#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
-#define PD_DDOTP(x) ((*(x) == '.') && (*(x) == '.')            \
-                    && (!*((x) + 2) || *((x) + 2) == '?'))
+static char *
+strpbrk_or_eos (const char *s, const char *accept)
+{
+  char *p = strpbrk (s, accept);
+  if (!p)
+    p = (char *)s + strlen (s);
+  return p;
+}
  
  
-/* Build the directory and filename components of the path.  Both
-   components are *separately* malloc-ed strings!  It does not change
-   the contents of path.
+static char *parse_errors[] = {
+#define PE_NO_ERROR            0
+  "No error",
+#define PE_UNRECOGNIZED_SCHEME 1
+  "Unrecognized scheme",
+#define PE_EMPTY_HOST          2
+  "Empty host",
+#define PE_BAD_PORT_NUMBER     3
+  "Bad port number",
+#define PE_INVALID_USER_NAME   4
+  "Invalid user name"
+};
  
  
-   If the path ends with "." or "..", they are (correctly) counted as
-   directories.  */
-static void
-parse_dir (const char *path, char **dir, char **file)
+#define SETERR(p, v) do {                      \
+  if (p)                                       \
+    *(p) = (v);                                        \
+} while (0)
+
+/* Parse a URL.
+
+   Return a new struct url if successful, NULL on error.  In case of
+   error, and if ERROR is not NULL, also set *ERROR to the appropriate
+   error code. */
+struct url *
+url_parse (const char *url, int *error)
  {
  {
-  int i, l;
+  struct url *u;
+  const char *p;
+
+  enum url_scheme scheme;
+
+  const char *uname_b,     *uname_e;
+  const char *host_b,      *host_e;
+  const char *path_b,      *path_e;
+  const char *params_b,    *params_e;
+  const char *query_b,     *query_e;
+  const char *fragment_b,  *fragment_e;
+
+  int port;
+  char *user = NULL, *passwd = NULL;
  
  
-  l = urlpath_length (path);
-  for (i = l; i && path[i] != '/'; i--);
+  const char *url_orig = url;
  
  
-  if (!i && *path != '/')   /* Just filename */
+  p = url = reencode_string (url);
+
+  scheme = url_scheme (url);
+  if (scheme == SCHEME_INVALID)
      {
      {
-      if (PD_DOTP (path) || PD_DDOTP (path))
-       {
-         *dir = strdupdelim (path, path + l);
-         *file = xstrdup (path + l); /* normally empty, but could
-                                         contain ?... */
-       }
-      else
-       {
-         *dir = xstrdup ("");     /* This is required because of FTP */
-         *file = xstrdup (path);
-       }
+      SETERR (error, PE_UNRECOGNIZED_SCHEME);
+      return NULL;
      }
      }
-  else if (!i)                 /* /filename */
+
+  p += strlen (supported_schemes[scheme].leading_string);
+  uname_b = p;
+  p += url_skip_uname (p);
+  uname_e = p;
+
+  /* scheme://user:pass@host[:port]... */
+  /*                    ^              */
+
+  /* We attempt to break down the URL into the components path,
+     params, query, and fragment.  They are ordered like this:
+
+       scheme://host[:port][/path][;params][?query][#fragment]  */
+
+  params_b   = params_e   = NULL;
+  query_b    = query_e    = NULL;
+  fragment_b = fragment_e = NULL;
+
+  host_b = p;
+  p = strpbrk_or_eos (p, ":/;?#");
+  host_e = p;
+
+  if (host_b == host_e)
      {
      {
-      if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
-       {
-         *dir = strdupdelim (path, path + l);
-         *file = xstrdup (path + l); /* normally empty, but could
-                                         contain ?... */
-       }
-      else
-       {
-         *dir = xstrdup ("/");
-         *file = xstrdup (path + 1);
-       }
+      SETERR (error, PE_EMPTY_HOST);
+      return NULL;
      }
      }
-  else /* Nonempty directory with or without a filename */
+
+  port = scheme_default_port (scheme);
+  if (*p == ':')
      {
      {
-      if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
-       {
-         *dir = strdupdelim (path, path + l);
-         *file = xstrdup (path + l); /* normally empty, but could
-                                         contain ?... */
-       }
-      else
+      const char *port_b, *port_e, *pp;
+
+      /* scheme://host:port/tralala */
+      /*              ^             */
+      ++p;
+      port_b = p;
+      p = strpbrk_or_eos (p, "/;?#");
+      port_e = p;
+
+      if (port_b == port_e)
         {
         {
-         *dir = strdupdelim (path, path + i);
-         *file = xstrdup (path + i + 1);
+         /* http://host:/whatever */
+         /*             ^         */
+         SETERR (error, PE_BAD_PORT_NUMBER);
+         return NULL;
         }
         }
-    }
-}
  
  
-/* Find the optional username and password within the URL, as per
-   RFC1738.  The returned user and passwd char pointers are
-   malloc-ed.  */
-static uerr_t
-parse_uname (const char *url, char **user, char **passwd)
-{
-  int l;
-  const char *p, *q, *col;
-  char **where;
-
-  *user = NULL;
-  *passwd = NULL;
-
-  /* Look for the end of the protocol string.  */
-  l = skip_proto (url);
-  if (!l)
-    return URLUNKNOWN;
-  /* Add protocol offset.  */
-  url += l;
-  /* Is there an `@' character?  */
-  for (p = url; *p && *p != '/'; p++)
-    if (*p == '@')
-      break;
-  /* If not, return.  */
-  if (*p != '@')
-    return URLOK;
-  /* Else find the username and password.  */
-  for (p = q = col = url; *p && *p != '/'; p++)
-    {
-      if (*p == ':' && !*user)
+      for (port = 0, pp = port_b; pp < port_e; pp++)
         {
         {
-         *user = (char *)xmalloc (p - url + 1);
-         memcpy (*user, url, p - url);
-         (*user)[p - url] = '\0';
-         col = p + 1;
+         if (!ISDIGIT (*pp))
+           {
+             /* http://host:12randomgarbage/blah */
+             /*               ^                  */
+             SETERR (error, PE_BAD_PORT_NUMBER);
+             return NULL;
+           }
+         port = 10 * port + (*pp - '0');
         }
         }
-      if (*p == '@') q = p;
      }
      }
-  /* Decide whether you have only the username or both.  */
-  where = *user ? passwd : user;
-  *where = (char *)xmalloc (q - col + 1);
-  memcpy (*where, col, q - col);
-  (*where)[q - col] = '\0';
-  return URLOK;
-}
  
  
-/* If PATH ends with `;type=X', return the character X.  */
-static char
-process_ftp_type (char *path)
-{
-  int len = strlen (path);
-
-  if (len >= 7
-      && !memcmp (path + len - 7, ";type=", 6))
+  if (*p == '/')
      {
      {
-      path[len - 7] = '\0';
-      return path[len - 1];
+      ++p;
+      path_b = p;
+      p = strpbrk_or_eos (p, ";?#");
+      path_e = p;
      }
    else
      }
    else
-    return '\0';
-}
-\f
-/* Return the URL as fine-formed string, with a proper protocol, optional port
-   number, directory and optional user/password.  If `hide' is non-zero (as it
-   is when we're calling this on a URL we plan to print, but not when calling it
-   to canonicalize a URL for use within the program), password will be hidden.
-   The forbidden characters in the URL will be cleansed.  */
-char *
-str_url (const struct urlinfo *u, int hide)
-{
-  char *res, *host, *user, *passwd, *proto_name, *dir, *file;
-  int i, l, ln, lu, lh, lp, lf, ld;
-  unsigned short proto_default_port;
+    {
+      /* Path is not allowed not to exist. */
+      path_b = path_e = p;
+    }
  
  
-  /* Look for the protocol name.  */
-  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
-    if (sup_protos[i].ind == u->proto)
-      break;
-  if (i == ARRAY_SIZE (sup_protos))
-    return NULL;
-  proto_name = sup_protos[i].name;
-  proto_default_port = sup_protos[i].port;
-  host = encode_string (u->host);
-  dir = encode_string (u->dir);
-  file = encode_string (u->file);
-  user = passwd = NULL;
-  if (u->user)
-    user = encode_string (u->user);
-  if (u->passwd)
+  if (*p == ';')
      {
      {
-      if (hide)
-       /* Don't output the password, or someone might see it over the user's
-          shoulder (or in saved wget output).  Don't give away the number of
-          characters in the password, either, as we did in past versions of
-          this code, when we replaced the password characters with 'x's. */
-       passwd = xstrdup("<password>");
-      else
-       passwd = encode_string (u->passwd);
+      ++p;
+      params_b = p;
+      p = strpbrk_or_eos (p, "?#");
+      params_e = p;
      }
      }
-  if (u->proto == URLFTP && *dir == '/')
+  if (*p == '?')
      {
      {
-      char *tmp = (char *)xmalloc (strlen (dir) + 3);
-      /*sprintf (tmp, "%%2F%s", dir + 1);*/
-      tmp[0] = '%';
-      tmp[1] = '2';
-      tmp[2] = 'F';
-      strcpy (tmp + 3, dir + 1);
-      xfree (dir);
-      dir = tmp;
+      ++p;
+      query_b = p;
+      p = strpbrk_or_eos (p, "#");
+      query_e = p;
      }
      }
+  if (*p == '#')
+    {
+      ++p;
+      fragment_b = p;
+      p += strlen (p);
+      fragment_e = p;
+    }
+  assert (*p == 0);
  
  
-  ln = strlen (proto_name);
-  lu = user ? strlen (user) : 0;
-  lp = passwd ? strlen (passwd) : 0;
-  lh = strlen (host);
-  ld = strlen (dir);
-  lf = strlen (file);
-  res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
-  /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
-     (user ? user : ""), (passwd ? ":" : ""),
-     (passwd ? passwd : ""), (user ? "@" : ""),
-     host, u->port, dir, *dir ? "/" : "", file); */
-  l = 0;
-  memcpy (res, proto_name, ln);
-  l += ln;
-  if (user)
+  if (uname_b != uname_e)
      {
      {
-      memcpy (res + l, user, lu);
-      l += lu;
-      if (passwd)
+      /* http://user:pass@host */
+      /*        ^         ^    */
+      /*     uname_b   uname_e */
+      if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
         {
         {
-         res[l++] = ':';
-         memcpy (res + l, passwd, lp);
-         l += lp;
+         SETERR (error, PE_INVALID_USER_NAME);
+         return NULL;
         }
         }
-      res[l++] = '@';
      }
      }
-  memcpy (res + l, host, lh);
-  l += lh;
-  if (u->port != proto_default_port)
+
+  u = (struct url *)xmalloc (sizeof (struct url));
+  memset (u, 0, sizeof (*u));
+
+  if (url == url_orig)
+    u->url    = xstrdup (url);
+  else
+    u->url    = (char *)url;
+
+  u->scheme = scheme;
+  u->host   = strdupdelim (host_b, host_e);
+  u->port   = port;
+  u->user   = user;
+  u->passwd = passwd;
+
+  u->path = strdupdelim (path_b, path_e);
+  path_simplify (u->path);
+
+  if (params_b)
+    u->params = strdupdelim (params_b, params_e);
+  if (query_b)
+    u->query = strdupdelim (query_b, query_e);
+  if (fragment_b)
+    u->fragment = strdupdelim (fragment_b, fragment_e);
+
+  parse_path (u->path, &u->dir, &u->file);
+
+  return u;
+}
+
+const char *
+url_error (int error_code)
+{
+  assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
+  return parse_errors[error_code];
+}
+
+static void
+parse_path (const char *quoted_path, char **dir, char **file)
+{
+  char *path, *last_slash;
+
+  STRDUP_ALLOCA (path, quoted_path);
+  decode_string (path);
+
+  last_slash = strrchr (path, '/');
+  if (!last_slash)
      {
      {
-      res[l++] = ':';
-      long_to_string (res + l, (long)u->port);
-      l += numdigit (u->port);
+      *dir = xstrdup ("");
+      *file = xstrdup (path);
+    }
+  else
+    {
+      *dir = strdupdelim (path, last_slash);
+      *file = xstrdup (last_slash + 1);
      }
      }
-  res[l++] = '/';
-  memcpy (res + l, dir, ld);
-  l += ld;
-  if (*dir)
-    res[l++] = '/';
-  strcpy (res + l, file);
-  xfree (host);
-  xfree (dir);
-  xfree (file);
-  FREE_MAYBE (user);
-  FREE_MAYBE (passwd);
-  return res;
  }
  
  }
  
-/* Check whether two URL-s are equivalent, i.e. pointing to the same
-   location.  Uses parseurl to parse them, and compares the canonical
-   forms.
+/* Note: URL's "full path" is the path with the query string and
+   params appended.  The "fragment" (#foo) is intentionally ignored,
+   but that might be changed.  For example, if the original URL was
+   "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
+   the full path will be "/foo/bar/baz;bullshit?querystring".  */
  
  
-   Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
-   return 0 on error.  */
-/* Do not compile unused code. */
-#if 0
-int
-url_equal (const char *url1, const char *url2)
+/* Return the length of the full path, without the terminating
+   zero.  */
+
+static int
+full_path_length (const struct url *url)
  {
  {
-  struct urlinfo *u1, *u2;
-  uerr_t err;
-  int res;
+  int len = 0;
+
+#define FROB(el) if (url->el) len += 1 + strlen (url->el)
+
+  FROB (path);
+  FROB (params);
+  FROB (query);
+
+#undef FROB
+
+  return len;
+}
  
  
-  u1 = newurl ();
-  err = parseurl (url1, u1, 0);
-  if (err != URLOK)
+/* Write out the full path. */
+
+static void
+full_path_write (const struct url *url, char *where)
+{
+#define FROB(el, chr) do {                     \
+  char *f_el = url->el;                                \
+  if (f_el) {                                  \
+    int l = strlen (f_el);                     \
+    *where++ = chr;                            \
+    memcpy (where, f_el, l);                   \
+    where += l;                                        \
+  }                                            \
+} while (0)
+
+  FROB (path, '/');
+  FROB (params, ';');
+  FROB (query, '?');
+
+#undef FROB
+}
+
+/* Public function for getting the "full path". */
+char *
+url_full_path (const struct url *url)
+{
+  int length = full_path_length (url);
+  char *full_path = (char *)xmalloc(length + 1);
+
+  full_path_write (url, full_path);
+  full_path[length] = '\0';
+
+  return full_path;
+}
+
+/* Sync u->path and u->url with u->dir and u->file. */
+static void
+sync_path (struct url *url)
+{
+  char *newpath;
+
+  xfree (url->path);
+
+  if (!*url->dir)
      {
      {
-      freeurl (u1, 1);
-      return 0;
+      newpath = xstrdup (url->file);
+      REENCODE (newpath);
      }
      }
-  u2 = newurl ();
-  err = parseurl (url2, u2, 0);
-  if (err != URLOK)
+  else
      {
      {
-      freeurl (u1, 1);
-      freeurl (u2, 1);
-      return 0;
+      int dirlen = strlen (url->dir);
+      int filelen = strlen (url->file);
+
+      newpath = xmalloc (dirlen + 1 + filelen + 1);
+      memcpy (newpath, url->dir, dirlen);
+      newpath[dirlen] = '/';
+      memcpy (newpath + dirlen + 1, url->file, filelen);
+      newpath[dirlen + 1 + filelen] = '\0';
+      REENCODE (newpath);
      }
      }
-  res = !strcmp (u1->url, u2->url);
-  freeurl (u1, 1);
-  freeurl (u2, 1);
-  return res;
+
+  url->path = newpath;
+
+  /* Synchronize u->url. */
+  xfree (url->url);
+  url->url = url_string (url, 0);
+}
+
+/* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
+   This way we can sync u->path and u->url when they get changed.  */
+
+void
+url_set_dir (struct url *url, const char *newdir)
+{
+  xfree (url->dir);
+  url->dir = xstrdup (newdir);
+  sync_path (url);
+}
+
+void
+url_set_file (struct url *url, const char *newfile)
+{
+  xfree (url->file);
+  url->file = xstrdup (newfile);
+  sync_path (url);
+}
+
+void
+url_free (struct url *url)
+{
+  xfree (url->host);
+  xfree (url->path);
+  xfree (url->url);
+
+  FREE_MAYBE (url->params);
+  FREE_MAYBE (url->query);
+  FREE_MAYBE (url->fragment);
+  FREE_MAYBE (url->user);
+  FREE_MAYBE (url->passwd);
+  FREE_MAYBE (url->dir);
+  FREE_MAYBE (url->file);
+
+  xfree (url);
  }
  }
-#endif /* 0 */
  \f
  urlpos *
  get_urls_file (const char *file)
  \f
  urlpos *
  get_urls_file (const char *file)
@@ -910,14 +1086,11 @@ count_slashes (const char *s)
  /* Return the path name of the URL-equivalent file name, with a
     remote-like structure of directories.  */
  static char *
  /* Return the path name of the URL-equivalent file name, with a
     remote-like structure of directories.  */
  static char *
-mkstruct (const struct urlinfo *u)
+mkstruct (const struct url *u)
  {
    char *host, *dir, *file, *res, *dirpref;
    int l;
  
  {
    char *host, *dir, *file, *res, *dirpref;
    int l;
  
-  assert (u->dir != NULL);
-  assert (u->host != NULL);
-
    if (opt.cut_dirs)
      {
        char *ptr = u->dir + (*u->dir == '/');
    if (opt.cut_dirs)
      {
        char *ptr = u->dir + (*u->dir == '/');
@@ -986,42 +1159,67 @@ mkstruct (const struct urlinfo *u)
    return res;
  }
  
    return res;
  }
  
-/* Return a malloced copy of S, but protect any '/' characters. */
+/* Compose a file name out of BASE, an unescaped file name, and QUERY,
+   an escaped query string.  The trick is to make sure that unsafe
+   characters in BASE are escaped, and that slashes in QUERY are also
+   escaped.  */
  
  static char *
  
  static char *
-file_name_protect_query_string (const char *s)
+compose_file_name (char *base, char *query)
  {
  {
-  const char *from;
-  char *to, *dest;
-  int destlen = 0;
-  for (from = s; *from; from++)
+  char result[256];
+  char *from;
+  char *to = result;
+
+  /* Copy BASE to RESULT and encode all unsafe characters.  */
+  from = base;
+  while (*from && to - result < sizeof (result))
      {
      {
-      ++destlen;
-      if (*from == '/')
-       destlen += 2;           /* each / gets replaced with %2F, so
-                                  it adds two more chars.  */
+      if (UNSAFE_CHAR (*from))
+       {
+         const unsigned char c = *from++;
+         *to++ = '%';
+         *to++ = XDIGIT_TO_XCHAR (c >> 4);
+         *to++ = XDIGIT_TO_XCHAR (c & 0xf);
+       }
+      else
+       *to++ = *from++;
      }
      }
-  dest = (char *)xmalloc (destlen + 1);
-  for (from = s, to = dest; *from; from++)
+
+  if (query && to - result < sizeof (result))
      {
      {
-      if (*from != '/')
-       *to++ = *from;
-      else
+      *to++ = '?';
+
+      /* Copy QUERY to RESULT and encode all '/' characters. */
+      from = query;
+      while (*from && to - result < sizeof (result))
         {
         {
-         *to++ = '%';
-         *to++ = '2';
-         *to++ = 'F';
+         if (*from == '/')
+           {
+             *to++ = '%';
+             *to++ = '2';
+             *to++ = 'F';
+             ++from;
+           }
+         else
+           *to++ = *from++;
         }
      }
         }
      }
-  assert (to - dest == destlen);
-  *to = '\0';
-  return dest;
+
+  if (to - result < sizeof (result))
+    *to = '\0';
+  else
+    /* Truncate input which is too long, presumably due to a huge
+       query string.  */
+    result[sizeof (result) - 1] = '\0';
+
+  return xstrdup (result);
  }
  
  /* Create a unique filename, corresponding to a given URL.  Calls
     mkstruct if necessary.  Does *not* actually create any directories.  */
  char *
  }
  
  /* Create a unique filename, corresponding to a given URL.  Calls
     mkstruct if necessary.  Does *not* actually create any directories.  */
  char *
-url_filename (const struct urlinfo *u)
+url_filename (const struct url *u)
  {
    char *file, *name;
    int have_prefix = 0;         /* whether we must prepend opt.dir_prefix */
  {
    char *file, *name;
    int have_prefix = 0;         /* whether we must prepend opt.dir_prefix */
@@ -1033,23 +1231,9 @@ url_filename (const struct urlinfo *u)
      }
    else
      {
      }
    else
      {
-      if (!*u->file)
-       file = xstrdup ("index.html");
-      else
-       {
-         /* If the URL came with a query string, u->file will contain
-            a question mark followed by query string contents.  These
-            contents can contain '/' which would make us create
-            unwanted directories.  These slashes must be protected
-            explicitly.  */
-         if (!strchr (u->file, '/'))
-           file = xstrdup (u->file);
-         else
-           {
-             /*assert (strchr (u->file, '?') != NULL);*/
-             file = file_name_protect_query_string (u->file);
-           }
-       }
+      char *base = *u->file ? u->file : "index.html";
+      char *query = u->query && *u->query ? u->query : NULL;
+      file = compose_file_name (base, query);
      }
  
    if (!have_prefix)
      }
  
    if (!have_prefix)
@@ -1123,7 +1307,7 @@ find_last_char (const char *b, const char *e, char c)
     Either of the URIs may be absolute or relative, complete with the
     host name, or path only.  This tries to behave "reasonably" in all
     foreseeable cases.  It employs little specific knowledge about
     Either of the URIs may be absolute or relative, complete with the
     host name, or path only.  This tries to behave "reasonably" in all
     foreseeable cases.  It employs little specific knowledge about
-   protocols or URL-specific stuff -- it just works on strings.
+   schemes or URL-specific stuff -- it just works on strings.
  
     The parameters LINKLENGTH is useful if LINK is not zero-terminated.
     See uri_merge for a gentler interface to this functionality.
  
     The parameters LINKLENGTH is useful if LINK is not zero-terminated.
     See uri_merge for a gentler interface to this functionality.
@@ -1131,11 +1315,11 @@ find_last_char (const char *b, const char *e, char c)
     #### This function should handle `./' and `../' so that the evil
     path_simplify can go.  */
  static char *
     #### This function should handle `./' and `../' so that the evil
     path_simplify can go.  */
  static char *
-uri_merge_1 (const char *base, const char *link, int linklength, int no_proto)
+uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
  {
    char *constr;
  
  {
    char *constr;
  
-  if (no_proto)
+  if (no_scheme)
      {
        const char *end = base + urlpath_length (base);
  
      {
        const char *end = base + urlpath_length (base);
  
@@ -1252,7 +1436,7 @@ uri_merge_1 (const char *base, const char *link, int linklength, int no_proto)
           constr[span + linklength] = '\0';
         }
      }
           constr[span + linklength] = '\0';
         }
      }
-  else /* !no_proto */
+  else /* !no_scheme */
      {
        constr = strdupdelim (link, link + linklength);
      }
      {
        constr = strdupdelim (link, link + linklength);
      }
@@ -1265,42 +1449,140 @@ uri_merge_1 (const char *base, const char *link, int linklength, int no_proto)
  char *
  uri_merge (const char *base, const char *link)
  {
  char *
  uri_merge (const char *base, const char *link)
  {
-  return uri_merge_1 (base, link, strlen (link), !has_proto (link));
+  return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
  }
  \f
  }
  \f
-/* Optimize URL by host, destructively replacing u->host with realhost
-   (u->host).  Do this regardless of opt.simple_check.  */
-void
-opt_url (struct urlinfo *u)
+#define APPEND(p, s) do {                      \
+  int len = strlen (s);                                \
+  memcpy (p, s, len);                          \
+  p += len;                                    \
+} while (0)
+
+/* Use this instead of password when the actual password is supposed
+   to be hidden.  We intentionally use a generic string without giving
+   away the number of characters in the password, like previous
+   versions did.  */
+#define HIDDEN_PASSWORD "*password*"
+
+/* Recreate the URL string from the data in URL.
+
+   If HIDE is non-zero (as it is when we're calling this on a URL we
+   plan to print, but not when calling it to canonicalize a URL for
+   use within the program), password will be hidden.  Unsafe
+   characters in the URL will be quoted.  */
+
+char *
+url_string (const struct url *url, int hide_password)
  {
  {
-  /* Find the "true" host.  */
-  char *host = realhost (u->host);
-  xfree (u->host);
-  u->host = host;
-  assert (u->dir != NULL);      /* the URL must have been parsed */
-  /* Refresh the printed representation.  */
-  xfree (u->url);
-  u->url = str_url (u, 0);
+  int size;
+  char *result, *p;
+  char *quoted_user = NULL, *quoted_passwd = NULL;
+
+  int scheme_port  = supported_schemes[url->scheme].default_port;
+  char *scheme_str = supported_schemes[url->scheme].leading_string;
+  int fplen = full_path_length (url);
+
+  assert (scheme_str != NULL);
+
+  /* Make sure the user name and password are quoted. */
+  if (url->user)
+    {
+      quoted_user = encode_string_maybe (url->user);
+      if (url->passwd)
+       {
+         if (hide_password)
+           quoted_passwd = HIDDEN_PASSWORD;
+         else
+           quoted_passwd = encode_string_maybe (url->passwd);
+       }
+    }
+
+  size = (strlen (scheme_str)
+         + strlen (url->host)
+         + fplen
+         + 1);
+  if (url->port != scheme_port)
+    size += 1 + numdigit (url->port);
+  if (quoted_user)
+    {
+      size += 1 + strlen (quoted_user);
+      if (quoted_passwd)
+       size += 1 + strlen (quoted_passwd);
+    }
+
+  p = result = xmalloc (size);
+
+  APPEND (p, scheme_str);
+  if (quoted_user)
+    {
+      APPEND (p, quoted_user);
+      if (quoted_passwd)
+       {
+         *p++ = ':';
+         APPEND (p, quoted_passwd);
+       }
+      *p++ = '@';
+    }
+
+  APPEND (p, url->host);
+  if (url->port != scheme_port)
+    {
+      *p++ = ':';
+      long_to_string (p, url->port);
+      p += strlen (p);
+    }
+
+  full_path_write (url, p);
+  p += fplen;
+  *p++ = '\0';
+
+  assert (p - result == size);
+
+  if (quoted_user && quoted_user != url->user)
+    xfree (quoted_user);
+  if (quoted_passwd && !hide_password
+      && quoted_passwd != url->passwd)
+    xfree (quoted_passwd);
+
+  return result;
  }
  \f
  }
  \f
-/* Returns proxy host address, in accordance with PROTO.  */
+/* Returns proxy host address, in accordance with SCHEME.  */
  char *
  char *
-getproxy (uerr_t proto)
+getproxy (enum url_scheme scheme)
  {
  {
-  char *proxy;
+  char *proxy = NULL;
+  char *rewritten_url;
+  static char rewritten_storage[1024];
  
  
-  if (proto == URLHTTP)
-    proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
-  else if (proto == URLFTP)
-    proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
+  switch (scheme)
+    {
+    case SCHEME_HTTP:
+      proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
+      break;
  #ifdef HAVE_SSL
  #ifdef HAVE_SSL
-  else if (proto == URLHTTPS)
-    proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
-#endif /* HAVE_SSL */
-  else
-    proxy = NULL;
+    case SCHEME_HTTPS:
+      proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
+      break;
+#endif
+    case SCHEME_FTP:
+      proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
+      break;
+    case SCHEME_INVALID:
+      break;
+    }
    if (!proxy || !*proxy)
      return NULL;
    if (!proxy || !*proxy)
      return NULL;
+
+  /* Handle shorthands. */
+  rewritten_url = rewrite_shorthand_url (proxy);
+  if (rewritten_url)
+    {
+      strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
+      rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
+      proxy = rewritten_storage;
+    }
+
    return proxy;
  }
  
    return proxy;
  }