[svn] Improved --restrict-file-names to accept ",nocontrol".

[wget] / src / url.c
diff --git a/src/url.c b/src/url.c

index 7a44a8050d12710669875606b2034dace0abb401..307da8d572c42a0e053d2a87c285a6406afd1a3a 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -1,5 +1,6 @@
  /* URL handling.
-   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003
+   Free Software Foundation, Inc.
  
  This file is part of GNU Wget.
  
@@ -15,7 +16,17 @@ GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
  along with Wget; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables.  You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL".  If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so.  If you do not wish to do
+so, delete this exception statement from your version.  */
  
  #include <config.h>
  
@@ -48,6 +59,11 @@ extern int errno;
  /* Is X ".."?  */
  #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  
+static const int NS_INADDRSZ  = 4;
+static const int NS_IN6ADDRSZ = 16;
+static const int NS_INT16SZ = 2;
+
+
  struct scheme_data
  {
    char *leading_string;
@@ -80,24 +96,22 @@ static int path_simplify PARAMS ((char *));
     code assumes ASCII character set and 8-bit chars.  */
  
  enum {
+  /* rfc1738 reserved chars, preserved from encoding.  */
    urlchr_reserved = 1,
+
+  /* rfc1738 unsafe chars, plus some more.  */
    urlchr_unsafe   = 2
  };
  
+#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
+#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
+#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
+
+/* Shorthands for the table: */
  #define R  urlchr_reserved
  #define U  urlchr_unsafe
  #define RU R|U
  
-#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
-
-/* rfc1738 reserved chars, preserved from encoding.  */
-
-#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
-
-/* rfc1738 unsafe chars, plus some more.  */
-
-#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
-
  const static unsigned char urlchr_table[256] =
  {
    U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
@@ -111,7 +125,7 @@ const static unsigned char urlchr_table[256] =
   RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
-  0,  0,  0,  U,   U,  U,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
+  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
    U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
@@ -127,15 +141,21 @@ const static unsigned char urlchr_table[256] =
    U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
    U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  };
+#undef R
+#undef U
+#undef RU
+
+/* URL-unescape the string S.
  
-/* Decodes the forms %xy in a URL to the character the hexadecimal
-   code of which is xy.  xy are hexadecimal digits from
-   [0123456789ABCDEF] (case-insensitive).  If x or y are not
-   hex-digits or `%' precedes `\0', the sequence is inserted
-   literally.  */
+   This is done by transforming the sequences "%HH" to the character
+   represented by the hexadecimal digits HH.  If % is not followed by
+   two hexadecimal digits, it is inserted literally.
+
+   The transformation is done in place.  If you need the original
+   string intact, make a copy before calling this function.  */
  
  static void
-decode_string (char *s)
+url_unescape (char *s)
  {
    char *t = s;                 /* t - tortoise */
    char *h = s;                 /* h - hare     */
@@ -160,10 +180,15 @@ decode_string (char *s)
    *t = '\0';
  }
  
-/* Like encode_string, but return S if there are no unsafe chars.  */
+/* The core of url_escape_* functions.  Escapes the characters that
+   match the provided mask in urlchr_table.
+
+   If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
+   will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
+   freshly allocated string will be returned in all cases.  */
  
  static char *
-encode_string_maybe (const char *s)
+url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
  {
    const char *p1;
    char *p2, *newstr;
@@ -171,11 +196,11 @@ encode_string_maybe (const char *s)
    int addition = 0;
  
    for (p1 = s; *p1; p1++)
-    if (UNSAFE_CHAR (*p1))
+    if (urlchr_test (*p1, mask))
        addition += 2;           /* Two more characters (hex digits) */
  
    if (!addition)
-    return (char *)s;
+    return allow_passthrough ? (char *)s : xstrdup (s);
  
    newlen = (p1 - s) + addition;
    newstr = (char *)xmalloc (newlen + 1);
@@ -184,7 +209,8 @@ encode_string_maybe (const char *s)
    p2 = newstr;
    while (*p1)
      {
-      if (UNSAFE_CHAR (*p1))
+      /* Quote the characters that match the test mask. */
+      if (urlchr_test (*p1, mask))
         {
           unsigned char c = *p1++;
           *p2++ = '%';
@@ -194,37 +220,29 @@ encode_string_maybe (const char *s)
        else
         *p2++ = *p1++;
      }
-  *p2 = '\0';
    assert (p2 - newstr == newlen);
+  *p2 = '\0';
  
    return newstr;
  }
  
-/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
-   given string, returning a malloc-ed %XX encoded string.  */
-  
+/* URL-escape the unsafe characters (see urlchr_table) in a given
+   string, returning a freshly allocated string.  */
+
  char *
-encode_string (const char *s)
+url_escape (const char *s)
  {
-  char *encoded = encode_string_maybe (s);
-  if (encoded != s)
-    return encoded;
-  else
-    return xstrdup (s);
+  return url_escape_1 (s, urlchr_unsafe, 0);
  }
  
-/* Encode unsafe characters in PTR to %xx.  If such encoding is done,
-   the old value of PTR is freed and PTR is made to point to the newly
-   allocated storage.  */
-
-#define ENCODE(ptr) do {                       \
-  char *e_new = encode_string_maybe (ptr);     \
-  if (e_new != ptr)                            \
-    {                                          \
-      xfree (ptr);                             \
-      ptr = e_new;                             \
-    }                                          \
-} while (0)
+/* URL-escape the unsafe characters (see urlchr_table) in a given
+   string.  If no characters are unsafe, S is returned.  */
+
+static char *
+url_escape_allow_passthrough (const char *s)
+{
+  return url_escape_1 (s, urlchr_unsafe, 1);
+}
  \f
  enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
  
@@ -243,7 +261,7 @@ decide_copy_method (const char *p)
           char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
             XCHAR_TO_XDIGIT (*(p + 2));
  
-         if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
+         if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
             return CM_PASSTHROUGH;
           else
             return CM_DECODE;
@@ -252,20 +270,20 @@ decide_copy_method (const char *p)
         /* Garbled %.. sequence: encode `%'. */
         return CM_ENCODE;
      }
-  else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
+  else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
      return CM_ENCODE;
    else
      return CM_PASSTHROUGH;
  }
  
-/* Translate a %-quoting (but possibly non-conformant) input string S
-   into a %-quoting (and conformant) output string.  If no characters
+/* Translate a %-escaped (but possibly non-conformant) input string S
+   into a %-escaped (and conformant) output string.  If no characters
     are encoded or decoded, return the same string S; otherwise, return
     a freshly allocated string with the new contents.
  
     After a URL has been run through this function, the protocols that
     use `%' as the quote character can use the resulting string as-is,
-   while those that don't call decode_string() to get to the intended
+   while those that don't call url_unescape() to get to the intended
     data.  This function is also stable: after an input string is
     transformed the first time, all further transformations of the
     result yield the same result string.
@@ -278,20 +296,21 @@ decide_copy_method (const char *p)
  
         GET /abc%20def HTTP/1.0
  
-   So it appears that the unsafe chars need to be quoted, as with
-   encode_string.  But what if we're requested to download
-   `abc%20def'?  Remember that %-encoding is valid URL syntax, so what
-   the user meant was a literal space, and he was kind enough to quote
-   it.  In that case, Wget should obviously leave the `%20' as is, and
-   send the same request as above.  So in this case we may not call
-   encode_string.
-
-   But what if the requested URI is `abc%20 def'?  If we call
-   encode_string, we end up with `/abc%2520%20def', which is almost
-   certainly not intended.  If we don't call encode_string, we are
-   left with the embedded space and cannot send the request.  What the
+   It appears that the unsafe chars need to be quoted, for example
+   with url_escape.  But what if we're requested to download
+   `abc%20def'?  url_escape transforms "%" to "%25", which would leave
+   us with `abc%2520def'.  This is incorrect -- since %-escapes are
+   part of URL syntax, "%20" is the correct way to denote a literal
+   space on the Wget command line.  This leads us to the conclusion
+   that in that case Wget should not call url_escape, but leave the
+   `%20' as is.
+
+   And what if the requested URI is `abc%20 def'?  If we call
+   url_escape, we end up with `/abc%2520%20def', which is almost
+   certainly not intended.  If we don't call url_escape, we are left
+   with the embedded space and cannot complete the request.  What the
     user meant was for Wget to request `/abc%20%20def', and this is
-   where reencode_string kicks in.
+   where reencode_escapes kicks in.
  
     Wget used to solve this by first decoding %-quotes, and then
     encoding all the "unsafe" characters found in the resulting string.
@@ -302,7 +321,7 @@ decide_copy_method (const char *p)
     is inevitable because by the second step we would lose information
     on whether the `+' was originally encoded or not.  Both results
     were wrong because in CGI parameters + means space, while %2B means
-   literal plus.  reencode_string correctly translates the above to
+   literal plus.  reencode_escapes correctly translates the above to
     "a%2B+b", i.e. returns the original string.
  
     This function uses an algorithm proposed by Anon Sricharoenchai:
@@ -337,7 +356,7 @@ decide_copy_method (const char *p)
     "foo%2b+bar"      -> "foo%2b+bar"  */
  
  static char *
-reencode_string (const char *s)
+reencode_escapes (const char *s)
  {
    const char *p1;
    char *newstr, *p2;
@@ -401,19 +420,6 @@ reencode_string (const char *s)
    assert (p2 - newstr == newlen);
    return newstr;
  }
-
-/* Run PTR_VAR through reencode_string.  If a new string is consed,
-   free PTR_VAR and make it point to the new storage.  Obviously,
-   PTR_VAR needs to be an lvalue.  */
-
-#define REENCODE(ptr_var) do {                 \
-  char *rf_new = reencode_string (ptr_var);    \
-  if (rf_new != ptr_var)                       \
-    {                                          \
-      xfree (ptr_var);                         \
-      ptr_var = rf_new;                                \
-    }                                          \
-} while (0)
  \f
  /* Returns the scheme type if the scheme is supported, or
     SCHEME_INVALID if not.  */
@@ -528,6 +534,11 @@ parse_uname (const char *str, int len, char **user, char **passwd)
    memcpy (*user, str, len);
    (*user)[len] = '\0';
  
+  if (*user)
+    url_unescape (*user);
+  if (*passwd)
+    url_unescape (*passwd);
+
    return 1;
  }
  
@@ -591,6 +602,29 @@ rewrite_shorthand_url (const char *url)
  \f
  static void parse_path PARAMS ((const char *, char **, char **));
  
+/* Like strpbrk, with the exception that it returns the pointer to the
+   terminating zero (end-of-string aka "eos") if no matching character
+   is found.
+
+   Although I normally balk at Gcc-specific optimizations, it probably
+   makes sense here: glibc has optimizations that detect strpbrk being
+   called with literal string as ACCEPT and inline the search.  That
+   optimization is defeated if strpbrk is hidden within the call to
+   another function.  (And no, making strpbrk_or_eos inline doesn't
+   help because the check for literal accept is in the
+   preprocessor.)  */
+
+#ifdef __GNUC__
+
+#define strpbrk_or_eos(s, accept) ({           \
+  char *SOE_p = strpbrk (s, accept);           \
+  if (!SOE_p)                                  \
+    SOE_p = (char *)s + strlen (s);            \
+  SOE_p;                                       \
+})
+
+#else  /* not __GNUC__ */
+
  static char *
  strpbrk_or_eos (const char *s, const char *accept)
  {
@@ -599,6 +633,7 @@ strpbrk_or_eos (const char *s, const char *accept)
      p = (char *)s + strlen (s);
    return p;
  }
+#endif
  
  /* Turn STR into lowercase; return non-zero if a character was
     actually changed. */
@@ -617,16 +652,22 @@ lowercase_str (char *str)
  }
  
  static char *parse_errors[] = {
-#define PE_NO_ERROR            0
+#define PE_NO_ERROR                    0
    "No error",
-#define PE_UNSUPPORTED_SCHEME 1
+#define PE_UNSUPPORTED_SCHEME          1
    "Unsupported scheme",
-#define PE_EMPTY_HOST          2
+#define PE_EMPTY_HOST                  2
    "Empty host",
-#define PE_BAD_PORT_NUMBER     3
+#define PE_BAD_PORT_NUMBER             3
    "Bad port number",
-#define PE_INVALID_USER_NAME   4
-  "Invalid user name"
+#define PE_INVALID_USER_NAME           4
+  "Invalid user name",
+#define PE_UNTERMINATED_IPV6_ADDRESS   5
+  "Unterminated IPv6 numeric address",
+#define PE_IPV6_NOT_SUPPORTED          6
+  "IPv6 addresses not supported",
+#define PE_INVALID_IPV6_ADDRESS                7
+  "Invalid IPv6 numeric address"
  };
  
  #define SETERR(p, v) do {                      \
@@ -634,6 +675,138 @@ static char *parse_errors[] = {
      *(p) = (v);                                        \
  } while (0)
  
+#ifdef ENABLE_IPV6
+/* The following two functions were adapted from glibc. */
+
+static int
+is_valid_ipv4_address (const char *str, const char *end)
+{
+  int saw_digit, octets;
+  int val;
+
+  saw_digit = 0;
+  octets = 0;
+  val = 0;
+
+  while (str < end) {
+    int ch = *str++;
+
+    if (ch >= '0' && ch <= '9') {
+      val = val * 10 + (ch - '0');
+
+      if (val > 255)
+        return 0;
+      if (saw_digit == 0) {
+        if (++octets > 4)
+          return 0;
+        saw_digit = 1;
+      }
+    } else if (ch == '.' && saw_digit == 1) {
+      if (octets == 4)
+        return 0;
+      val = 0;
+      saw_digit = 0;
+    } else
+      return 0;
+  }
+  if (octets < 4)
+    return 0;
+  
+  return 1;
+}
+
+static int
+is_valid_ipv6_address (const char *str, const char *end)
+{
+  static const char xdigits[] = "0123456789abcdef";
+  const char *curtok;
+  int tp;
+  const char *colonp;
+  int saw_xdigit;
+  unsigned int val;
+
+  tp = 0;
+  colonp = NULL;
+
+  if (str == end)
+    return 0;
+  
+  /* Leading :: requires some special handling. */
+  if (*str == ':')
+    {
+      ++str;
+      if (str == end || *str != ':')
+       return 0;
+    }
+
+  curtok = str;
+  saw_xdigit = 0;
+  val = 0;
+
+  while (str < end) {
+    int ch = *str++;
+    const char *pch;
+
+    /* if ch is a number, add it to val. */
+    pch = strchr(xdigits, ch);
+    if (pch != NULL) {
+      val <<= 4;
+      val |= (pch - xdigits);
+      if (val > 0xffff)
+       return 0;
+      saw_xdigit = 1;
+      continue;
+    }
+
+    /* if ch is a colon ... */
+    if (ch == ':') {
+      curtok = str;
+      if (saw_xdigit == 0) {
+       if (colonp != NULL)
+         return 0;
+       colonp = str + tp;
+       continue;
+      } else if (str == end) {
+       return 0;
+      }
+      if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
+       return 0;
+      tp += NS_INT16SZ;
+      saw_xdigit = 0;
+      val = 0;
+      continue;
+    }
+
+    /* if ch is a dot ... */
+    if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
+       is_valid_ipv4_address(curtok, end) == 1) {
+      tp += NS_INADDRSZ;
+      saw_xdigit = 0;
+      break;
+    }
+    
+    return 0;
+  }
+
+  if (saw_xdigit == 1) {
+    if (tp > NS_IN6ADDRSZ - NS_INT16SZ) 
+      return 0;
+    tp += NS_INT16SZ;
+  }
+
+  if (colonp != NULL) {
+    if (tp == NS_IN6ADDRSZ) 
+      return 0;
+    tp = NS_IN6ADDRSZ;
+  }
+
+  if (tp != NS_IN6ADDRSZ)
+    return 0;
+
+  return 1;
+}
+#endif
+
  /* Parse a URL.
  
     Return a new struct url if successful, NULL on error.  In case of
@@ -667,7 +840,7 @@ url_parse (const char *url, int *error)
        return NULL;
      }
  
-  url_encoded = reencode_string (url);
+  url_encoded = reencode_escapes (url);
    p = url_encoded;
  
    p += strlen (supported_schemes[scheme].leading_string);
@@ -688,8 +861,43 @@ url_parse (const char *url, int *error)
    fragment_b = fragment_e = NULL;
  
    host_b = p;
-  p = strpbrk_or_eos (p, ":/;?#");
-  host_e = p;
+
+  if (*p == '[')
+    {
+      /* Handle IPv6 address inside square brackets.  Ideally we'd
+        just look for the terminating ']', but rfc2732 mandates
+        rejecting invalid IPv6 addresses.  */
+
+      /* The address begins after '['. */
+      host_b = p + 1;
+      host_e = strchr (host_b, ']');
+
+      if (!host_e)
+       {
+         SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
+         return NULL;
+       }
+
+#ifdef ENABLE_IPV6
+      /* Check if the IPv6 address is valid. */
+      if (!is_valid_ipv6_address(host_b, host_e))
+       {
+         SETERR (error, PE_INVALID_IPV6_ADDRESS);
+         return NULL;
+       }
+
+      /* Continue parsing after the closing ']'. */
+      p = host_e + 1;
+#else
+      SETERR (error, PE_IPV6_NOT_SUPPORTED);
+      return NULL;
+#endif
+    }
+  else
+    {
+      p = strpbrk_or_eos (p, ":/;?#");
+      host_e = p;
+    }
  
    if (host_b == host_e)
      {
@@ -726,6 +934,7 @@ url_parse (const char *url, int *error)
               SETERR (error, PE_BAD_PORT_NUMBER);
               return NULL;
             }
+         
           port = 10 * port + (*pp - '0');
         }
      }
@@ -756,6 +965,15 @@ url_parse (const char *url, int *error)
        query_b = p;
        p = strpbrk_or_eos (p, "#");
        query_e = p;
+
+      /* Hack that allows users to use '?' (a wildcard character) in
+        FTP URLs without it being interpreted as a query string
+        delimiter.  */
+      if (scheme == SCHEME_FTP)
+       {
+         query_b = query_e = NULL;
+         path_e = p;
+       }
      }
    if (*p == '#')
      {
@@ -813,9 +1031,9 @@ url_parse (const char *url, int *error)
    else
      {
        if (url_encoded == url)
-       u->url    = xstrdup (url);
+       u->url = xstrdup (url);
        else
-       u->url    = url_encoded;
+       u->url = url_encoded;
      }
    url_encoded = NULL;
  
@@ -829,13 +1047,13 @@ url_error (int error_code)
    return parse_errors[error_code];
  }
  
+/* Parse PATH into dir and file.  PATH is extracted from the URL and
+   is URL-escaped.  The function returns unescaped DIR and FILE.  */
+
  static void
-parse_path (const char *quoted_path, char **dir, char **file)
+parse_path (const char *path, char **dir, char **file)
  {
-  char *path, *last_slash;
-
-  STRDUP_ALLOCA (path, quoted_path);
-  decode_string (path);
+  char *last_slash;
  
    last_slash = strrchr (path, '/');
    if (!last_slash)
@@ -848,6 +1066,8 @@ parse_path (const char *quoted_path, char **dir, char **file)
        *dir = strdupdelim (path, last_slash);
        *file = xstrdup (last_slash + 1);
      }
+  url_unescape (*dir);
+  url_unescape (*file);
  }
  
  /* Note: URL's "full path" is the path with the query string and
@@ -913,38 +1133,83 @@ url_full_path (const struct url *url)
    return full_path;
  }
  
-/* Sync u->path and u->url with u->dir and u->file. */
+/* Escape unsafe and reserved characters, except for the slash
+   characters.  */
  
-static void
-sync_path (struct url *url)
+static char *
+url_escape_dir (const char *dir)
  {
-  char *newpath;
+  char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
+  char *h, *t;
+  if (newdir == dir)
+    return (char *)dir;
  
-  xfree (url->path);
+  /* Unescape slashes in NEWDIR. */
  
-  if (!*url->dir)
+  h = newdir;                  /* hare */
+  t = newdir;                  /* tortoise */
+
+  for (; *h; h++, t++)
      {
-      newpath = xstrdup (url->file);
-      REENCODE (newpath);
+      if (*h == '%' && h[1] == '2' && h[2] == 'F')
+       {
+         *t = '/';
+         h += 2;
+       }
+      else
+       *t = *h;
      }
+  *t = '\0';
+
+  return newdir;
+}
+
+/* Sync u->path and u->url with u->dir and u->file.  Called after
+   u->file or u->dir have been changed, typically by the FTP code.  */
+
+static void
+sync_path (struct url *u)
+{
+  char *newpath, *efile, *edir;
+
+  xfree (u->path);
+
+  /* u->dir and u->file are not escaped.  URL-escape them before
+     reassembling them into u->path.  That way, if they contain
+     separators like '?' or even if u->file contains slashes, the
+     path will be correctly assembled.  (u->file can contain slashes
+     if the URL specifies it with %2f, or if an FTP server returns
+     it.)  */
+  edir = url_escape_dir (u->dir);
+  efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
+
+  if (!*edir)
+    newpath = xstrdup (efile);
    else
      {
-      int dirlen = strlen (url->dir);
-      int filelen = strlen (url->file);
+      int dirlen = strlen (edir);
+      int filelen = strlen (efile);
  
-      newpath = xmalloc (dirlen + 1 + filelen + 1);
-      memcpy (newpath, url->dir, dirlen);
-      newpath[dirlen] = '/';
-      memcpy (newpath + dirlen + 1, url->file, filelen);
-      newpath[dirlen + 1 + filelen] = '\0';
-      REENCODE (newpath);
+      /* Copy "DIR/FILE" to newpath. */
+      char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
+      memcpy (p, edir, dirlen);
+      p += dirlen;
+      *p++ = '/';
+      memcpy (p, efile, filelen);
+      p += filelen;
+      *p++ = '\0';
      }
  
-  url->path = newpath;
+  u->path = newpath;
  
-  /* Synchronize u->url. */
-  xfree (url->url);
-  url->url = url_string (url, 0);
+  if (edir != u->dir)
+    xfree (edir);
+  if (efile != u->file)
+    xfree (efile);
+
+  /* Regenerate u->url as well.  */
+  xfree (u->url);
+  u->url = url_string (u, 0);
  }
  
  /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
@@ -1100,8 +1365,6 @@ rotate_backups(const char *fname)
      {
        sprintf (from, "%s.%d", fname, i - 1);
        sprintf (to, "%s.%d", fname, i);
-      /* #### This will fail on machines without the rename() system
-         call.  */
        rename (from, to);
      }
  
@@ -1120,11 +1383,14 @@ mkalldirs (const char *path)
    int res;
  
    p = path + strlen (path);
-  for (; *p != '/' && p != path; p--);
+  for (; *p != '/' && p != path; p--)
+    ;
+
    /* Don't create if it's just a file.  */
    if ((p == path) && (*p != '/'))
      return 0;
    t = strdupdelim (path, p);
+
    /* Check whether the directory exists.  */
    if ((stat (t, &st) == 0))
      {
@@ -1157,205 +1423,288 @@ mkalldirs (const char *path)
    xfree (t);
    return res;
  }
+\f
+/* Functions for constructing the file name out of URL components.  */
  
-static int
-count_slashes (const char *s)
+/* A growable string structure, used by url_file_name and friends.
+   This should perhaps be moved to utils.c.
+
+   The idea is to have a convenient and efficient way to construct a
+   string by having various functions append data to it.  Instead of
+   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
+   functions in question, we pass the pointer to this struct.  */
+
+struct growable {
+  char *base;
+  int size;
+  int tail;
+};
+
+/* Ensure that the string can accept APPEND_SIZE more characters past
+   the current TAIL position.  If necessary, this will grow the string
+   and update its allocated size.  If the string is already large
+   enough to take TAIL+APPEND_SIZE characters, this does nothing.  */
+#define GROW(g, append_size) do {                                      \
+  struct growable *G_ = g;                                             \
+  DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char);       \
+} while (0)
+
+/* Return the tail position of the string. */
+#define TAIL(r) ((r)->base + (r)->tail)
+
+/* Move the tail position by APPEND_COUNT characters. */
+#define TAIL_INCR(r, append_count) ((r)->tail += append_count)
+
+/* Append the string STR to DEST.  NOTICE: the string in DEST is not
+   terminated.  */
+
+static void
+append_string (const char *str, struct growable *dest)
  {
-  int i = 0;
-  while (*s)
-    if (*s++ == '/')
-      ++i;
-  return i;
+  int l = strlen (str);
+  GROW (dest, l);
+  memcpy (TAIL (dest), str, l);
+  TAIL_INCR (dest, l);
  }
  
-/* Return the path name of the URL-equivalent file name, with a
-   remote-like structure of directories.  */
-static char *
-mkstruct (const struct url *u)
+/* Append CH to DEST.  For example, append_char (0, DEST)
+   zero-terminates DEST.  */
+
+static void
+append_char (char ch, struct growable *dest)
  {
-  char *dir, *dir_preencoding;
-  char *file, *res, *dirpref;
-  char *query = u->query && *u->query ? u->query : NULL;
-  int l;
+  GROW (dest, 1);
+  *TAIL (dest) = ch;
+  TAIL_INCR (dest, 1);
+}
  
-  if (opt.cut_dirs)
-    {
-      char *ptr = u->dir + (*u->dir == '/');
-      int slash_count = 1 + count_slashes (ptr);
-      int cut = MINVAL (opt.cut_dirs, slash_count);
-      for (; cut && *ptr; ptr++)
-       if (*ptr == '/')
-         --cut;
-      STRDUP_ALLOCA (dir, ptr);
-    }
-  else
-    dir = u->dir + (*u->dir == '/');
+enum {
+  filechr_not_unix    = 1,     /* unusable on Unix, / and \0 */
+  filechr_not_windows = 2,     /* unusable on Windows, one of \|/<>?:*" */
+  filechr_control     = 4,     /* a control character, e.g. 0-31 */
+};
  
-  /* Check for the true name (or at least a consistent name for saving
-     to directory) of HOST, reusing the hlist if possible.  */
-  if (opt.add_hostdir)
-    {
-      /* Add dir_prefix and hostname (if required) to the beginning of
-        dir.  */
-      dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
-                               + strlen (u->host)
-                               + 1 + numdigit (u->port)
-                               + 1);
-      if (!DOTP (opt.dir_prefix))
-       sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
-      else
-       strcpy (dirpref, u->host);
+#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
  
-      if (u->port != scheme_default_port (u->scheme))
-       {
-         int len = strlen (dirpref);
-         dirpref[len] = ':';
-         number_to_string (dirpref + len + 1, u->port);
-       }
-    }
-  else                         /* not add_hostdir */
-    {
-      if (!DOTP (opt.dir_prefix))
-       dirpref = opt.dir_prefix;
-      else
-       dirpref = "";
-    }
+/* Shorthands for the table: */
+#define U filechr_not_unix
+#define W filechr_not_windows
+#define C filechr_control
  
-  /* If there is a prefix, prepend it.  */
-  if (*dirpref)
-    {
-      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
-      sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
-      dir = newdir;
-    }
+#define UW U|W
+#define UWC U|W|C
  
-  dir_preencoding = dir;
-  dir = reencode_string (dir_preencoding);
+/* Table of characters unsafe under various conditions (see above).
  
-  l = strlen (dir);
-  if (l && dir[l - 1] == '/')
-    dir[l - 1] = '\0';
+   Arguably we could also claim `%' to be unsafe, since we use it as
+   the escape character.  If we ever want to be able to reliably
+   translate a file name back to a URL, this would become
+   crucial.  Right now, it's better to be minimal in escaping.  */
  
-  if (!*u->file)
-    file = "index.html";
-  else
-    file = u->file;
-
-  /* Finally, construct the full name.  */
-  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
-                        + (query ? (1 + strlen (query)) : 0)
-                        + 1);
-  sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
-  if (query)
-    {
-      strcat (res, "?");
-      strcat (res, query);
-    }
-  if (dir != dir_preencoding)
-    xfree (dir);
-  return res;
-}
+const static unsigned char filechr_table[256] =
+{
+UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
+  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
+  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
+  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
+  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
+  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
+  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
+  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
+  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
+  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
+  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
+  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
+  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
+  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
+  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
+  0,  0,  0,  0,   0,  0,  0,  0,   /* x   y   z   {    |   }   ~   DEL */
  
-/* Compose a file name out of BASE, an unescaped file name, and QUERY,
-   an escaped query string.  The trick is to make sure that unsafe
-   characters in BASE are escaped, and that slashes in QUERY are also
-   escaped.  */
+  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
+  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  
-static char *
-compose_file_name (char *base, char *query)
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
+};
+
+/* FN_PORT_SEP is the separator between host and port in file names
+   for non-standard port numbers.  On Unix this is normally ':', as in
+   "www.xemacs.org:4001/index.html".  Under Windows, we set it to +
+   because Windows can't handle ':' in file names.  */
+#define FN_PORT_SEP  (opt.restrict_files_os != restrict_windows ? ':' : '+')
+
+/* FN_QUERY_SEP is the separator between the file name and the URL
+   query, normally '?'.  Since Windows cannot handle '?' as part of
+   file name, we use '@' instead there.  */
+#define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
+
+/* Quote path element, characters in [b, e), as file name, and append
+   the quoted string to DEST.  Each character is quoted as per
+   file_unsafe_char and the corresponding table.  */
+
+static void
+append_uri_pathel (const char *b, const char *e, struct growable *dest)
  {
-  char result[256];
-  char *from;
-  char *to = result;
+  char *pathel;
+  int pathlen;
+
+  const char *p;
+  int quoted, outlen;
+
+  int mask;
+  if (opt.restrict_files_os == restrict_unix)
+    mask = filechr_not_unix;
+  else
+    mask = filechr_not_windows;
+  if (opt.restrict_files_ctrl)
+    mask |= filechr_control;
+
+  /* Copy [b, e) to PATHEL and URL-unescape it. */
+  BOUNDED_TO_ALLOCA (b, e, pathel);
+  url_unescape (pathel);
+  pathlen = strlen (pathel);
+
+  /* Go through PATHEL and check how many characters we'll need to
+     add for file quoting. */
+  quoted = 0;
+  for (p = pathel; *p; p++)
+    if (FILE_CHAR_TEST (*p, mask))
+      ++quoted;
  
-  /* Copy BASE to RESULT and encode all unsafe characters.  */
-  from = base;
-  while (*from && to - result < sizeof (result))
+  /* p - pathel is the string length.  Each quoted char means two
+     additional characters in the string, hence 2*quoted.  */
+  outlen = (p - pathel) + (2 * quoted);
+  GROW (dest, outlen);
+
+  if (!quoted)
      {
-      if (UNSAFE_CHAR (*from))
-       {
-         unsigned char c = *from++;
-         *to++ = '%';
-         *to++ = XDIGIT_TO_XCHAR (c >> 4);
-         *to++ = XDIGIT_TO_XCHAR (c & 0xf);
-       }
-      else
-       *to++ = *from++;
+      /* If there's nothing to quote, we don't need to go through the
+        string the second time.  */
+      memcpy (TAIL (dest), pathel, outlen);
      }
-
-  if (query && to - result < sizeof (result))
+  else
      {
-      *to++ = '?';
-
-      /* Copy QUERY to RESULT and encode all '/' characters. */
-      from = query;
-      while (*from && to - result < sizeof (result))
+      char *q = TAIL (dest);
+      for (p = pathel; *p; p++)
         {
-         if (*from == '/')
+         if (!FILE_CHAR_TEST (*p, mask))
+           *q++ = *p;
+         else
             {
-             *to++ = '%';
-             *to++ = '2';
-             *to++ = 'F';
-             ++from;
+             unsigned char ch = *p;
+             *q++ = '%';
+             *q++ = XDIGIT_TO_XCHAR (ch >> 4);
+             *q++ = XDIGIT_TO_XCHAR (ch & 0xf);
             }
-         else
-           *to++ = *from++;
         }
+      assert (q - TAIL (dest) == outlen);
      }
+  TAIL_INCR (dest, outlen);
+}
  
-  if (to - result < sizeof (result))
-    *to = '\0';
-  else
-    /* Truncate input which is too long, presumably due to a huge
-       query string.  */
-    result[sizeof (result) - 1] = '\0';
+/* Append to DEST the directory structure that corresponds to the
+   directory part of URL's path.  For example, if the URL is
+   http://server/dir1/dir2/file, this appends "/dir1/dir2".
+
+   Each path element ("dir1" and "dir2" in the above example) is
+   examined, url-unescaped, and re-escaped as file name element.
+
+   Additionally, it cuts as many directories from the path as
+   specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
+   will produce "dir2" for the above example.  For 2 or more, it will
+   produce "".
  
-  return xstrdup (result);
+   Each component of the path is quoted for use as file name.  */
+
+static void
+append_dir_structure (const struct url *u, struct growable *dest)
+{
+  char *pathel, *next;
+  int cut = opt.cut_dirs;
+
+  /* Go through the path components, de-URL-quote them, and quote them
+     (if necessary) as file names.  */
+
+  pathel = u->path;
+  for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
+    {
+      if (cut-- > 0)
+       continue;
+      if (pathel == next)
+       /* Ignore empty pathels.  path_simplify should remove
+          occurrences of "//" from the path, but it has special cases
+          for starting / which generates an empty pathel here.  */
+       continue;
+
+      if (dest->tail)
+       append_char ('/', dest);
+      append_uri_pathel (pathel, next, dest);
+    }
  }
  
-/* Create a unique filename, corresponding to a given URL.  Calls
-   mkstruct if necessary.  Does *not* actually create any directories.  */
+/* Return a unique file name that matches the given URL as closely
+   as possible.  Does not create directories on the file system.  */
+
  char *
-url_filename (const struct url *u)
+url_file_name (const struct url *u)
  {
-  char *file, *name;
-  int have_prefix = 0;         /* whether we must prepend opt.dir_prefix */
+  struct growable fnres;
+
+  char *u_file, *u_query;
+  char *fname, *unique;
+
+  fnres.base = NULL;
+  fnres.size = 0;
+  fnres.tail = 0;
  
+  /* Start with the directory prefix, if specified. */
+  if (!DOTP (opt.dir_prefix))
+    append_string (opt.dir_prefix, &fnres);
+
+  /* If "dirstruct" is turned on (typically the case with -r), add
+     the host and port (unless those have been turned off) and
+     directory structure.  */
    if (opt.dirstruct)
      {
-      file = mkstruct (u);
-      have_prefix = 1;
-    }
-  else
-    {
-      char *base = *u->file ? u->file : "index.html";
-      char *query = u->query && *u->query ? u->query : NULL;
-      file = compose_file_name (base, query);
+      if (opt.add_hostdir)
+       {
+         if (fnres.tail)
+           append_char ('/', &fnres);
+         append_string (u->host, &fnres);
+         if (u->port != scheme_default_port (u->scheme))
+           {
+             char portstr[24];
+             number_to_string (portstr, u->port);
+             append_char (FN_PORT_SEP, &fnres);
+             append_string (portstr, &fnres);
+           }
+       }
+
+      append_dir_structure (u, &fnres);
      }
  
-  if (!have_prefix)
+  /* Add the file name. */
+  if (fnres.tail)
+    append_char ('/', &fnres);
+  u_file = *u->file ? u->file : "index.html";
+  append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
+
+  /* Append "?query" to the file name. */
+  u_query = u->query && *u->query ? u->query : NULL;
+  if (u_query)
      {
-      /* Check whether the prefix directory is something other than "."
-        before prepending it.  */
-      if (!DOTP (opt.dir_prefix))
-       {
-         char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
-                                        + 1 + strlen (file) + 1);
-         sprintf (nfile, "%s/%s", opt.dir_prefix, file);
-         xfree (file);
-         file = nfile;
-       }
+      append_char (FN_QUERY_SEP, &fnres);
+      append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
      }
-  /* DOS-ish file systems don't like `%' signs in them; we change it
-     to `@'.  */
-#ifdef WINDOWS
-  {
-    char *p = file;
-    for (p = file; *p; p++)
-      if (*p == '%')
-       *p = '@';
-  }
-#endif /* WINDOWS */
+
+  /* Zero-terminate the file name. */
+  append_char ('\0', &fnres);
+
+  fname = fnres.base;
  
    /* Check the cases in which the unique extensions are not used:
       1) Clobbering is turned off (-nc).
@@ -1364,18 +1713,19 @@ url_filename (const struct url *u)
       4) Hierarchy is built.
  
       The exception is the case when file does exist and is a
-     directory (actually support for bad httpd-s).  */
+     directory (see `mkalldirs' for explanation).  */
+
    if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
-      && !(file_exists_p (file) && !file_non_directory_p (file)))
-    return file;
+      && !(file_exists_p (fname) && !file_non_directory_p (fname)))
+    return fname;
  
-  /* Find a unique name.  */
-  name = unique_name (file);
-  xfree (file);
-  return name;
+  unique = unique_name (fname, 1);
+  if (unique != fname)
+    xfree (fname);
+  return unique;
  }
  
-/* Return the langth of URL's path.  Path is considered to be
+/* Return the length of URL's path.  Path is considered to be
     terminated by one of '?', ';', '#', or by the end of the
     string.  */
  static int
@@ -1488,8 +1838,10 @@ path_simplify (char *path)
        else if (*p == '/')
         {
           /* Remove empty path elements.  Not mandated by rfc1808 et
-            al, but empty path elements are not all that useful, and
-            the rest of Wget might not deal with them well. */
+            al, but it seems like a good idea to get rid of them.
+            Supporting them properly is hard (in which directory do
+            you save http://x.com///y.html?) and they don't seem to
+            bring much gain.  */
           char *q = p;
           while (*q == '/')
             ++q;
@@ -1765,23 +2117,29 @@ url_string (const struct url *url, int hide_password)
    char *scheme_str = supported_schemes[url->scheme].leading_string;
    int fplen = full_path_length (url);
  
+  int brackets_around_host = 0;
+
    assert (scheme_str != NULL);
  
    /* Make sure the user name and password are quoted. */
    if (url->user)
      {
-      quoted_user = encode_string_maybe (url->user);
+      quoted_user = url_escape_allow_passthrough (url->user);
        if (url->passwd)
         {
           if (hide_password)
             quoted_passwd = HIDDEN_PASSWORD;
           else
-           quoted_passwd = encode_string_maybe (url->passwd);
+           quoted_passwd = url_escape_allow_passthrough (url->passwd);
         }
      }
  
+  if (strchr (url->host, ':'))
+    brackets_around_host = 1;
+
    size = (strlen (scheme_str)
           + strlen (url->host)
+         + (brackets_around_host ? 2 : 0)
           + fplen
           + 1);
    if (url->port != scheme_port)
@@ -1807,7 +2165,11 @@ url_string (const struct url *url, int hide_password)
        *p++ = '@';
      }
  
+  if (brackets_around_host)
+    *p++ = '[';
    APPEND (p, url->host);
+  if (brackets_around_host)
+    *p++ = ']';
    if (url->port != scheme_port)
      {
        *p++ = ':';
@@ -1829,15 +2191,20 @@ url_string (const struct url *url, int hide_password)
    return result;
  }
  \f
-/* Returns proxy host address, in accordance with SCHEME.  */
+/* Return the URL of the proxy appropriate for url U.  */
  char *
-getproxy (enum url_scheme scheme)
+getproxy (struct url *u)
  {
    char *proxy = NULL;
    char *rewritten_url;
    static char rewritten_storage[1024];
  
-  switch (scheme)
+  if (!opt.use_proxy)
+    return NULL;
+  if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
+    return NULL;
+
+  switch (u->scheme)
      {
      case SCHEME_HTTP:
        proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
@@ -1856,7 +2223,8 @@ getproxy (enum url_scheme scheme)
    if (!proxy || !*proxy)
      return NULL;
  
-  /* Handle shorthands. */
+  /* Handle shorthands.  `rewritten_storage' is a kludge to allow
+     getproxy() to return static storage. */
    rewritten_url = rewrite_shorthand_url (proxy);
    if (rewritten_url)
      {
@@ -2412,6 +2780,24 @@ downloaded_files_free (void)
        downloaded_files_hash = NULL;
      }
  }
+
+/* Return non-zero if scheme a is similar to scheme b.
+ 
+   Schemes are similar if they are equal.  If SSL is supported, schemes
+   are also similar if one is http (SCHEME_HTTP) and the other is https
+   (SCHEME_HTTPS).  */
+int
+schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
+{
+  if (a == b)
+    return 1;
+#ifdef HAVE_SSL
+  if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
+      || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
+    return 1;
+#endif
+  return 0;
+}
  \f
  #if 0
  /* Debugging and testing support for path_simplify. */