From 0a3697ad652df74ffeec8a97e1d23c343d8ef391 Mon Sep 17 00:00:00 2001 From: hniksic Date: Sun, 14 Sep 2003 15:04:13 -0700 Subject: [PATCH] [svn] New mechanism for quoting file names. Published in . --- NEWS | 11 +- doc/ChangeLog | 5 + doc/wget.texi | 37 ++++ src/ChangeLog | 28 +++ src/connect.c | 1 + src/ftp-ls.c | 7 +- src/ftp.c | 8 +- src/http.c | 4 +- src/init.c | 29 +++ src/main.c | 7 +- src/options.h | 6 + src/url.c | 561 ++++++++++++++++++++++++++++++-------------------- src/url.h | 4 +- 13 files changed, 474 insertions(+), 234 deletions(-) diff --git a/NEWS b/NEWS index 073a95bd..3cf33275 100644 --- a/NEWS +++ b/NEWS @@ -7,8 +7,6 @@ Please send GNU Wget bug reports to . * Changes in Wget 1.9. -** The build process now requires Autoconf 2.5x. - ** It is now possible to specify that POST method be used for HTTP requests. For example, `wget --post-data="id=foo&data=bar" URL' will send a POST request with the specified contents. @@ -32,6 +30,15 @@ considered a fatal error. ** The new option `--dns-cache=off' may be used to prevent Wget from caching DNS lookups. + +** The build process now requires Autoconf 2.5x. + +** Wget no longer quotes characters in local file names that would be +considered "unsafe" as part of URL. Quoting can still occur for +control characters or for '/', but no longer for frequent characters +such as space. You can use the new option --restrict-file-names to +enforce even stricter rules, which is useful when downloading to +Windows partitions. * Wget 1.8.1 is a bugfix release with no user-visible changes. diff --git a/doc/ChangeLog b/doc/ChangeLog index 1f0f1c09..e2570f07 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,8 @@ +2003-09-14 Hrvoje Niksic + + * wget.texi (Download Options): Document the new option + --restrict-file-names and the corresponding wgetrc command. + 2003-09-10 Hrvoje Niksic * wget.texi (Download Options): Documented new option --dns-cache. 
diff --git a/doc/wget.texi b/doc/wget.texi index 19eb439a..4b0bb3c0 100644 --- a/doc/wget.texi +++ b/doc/wget.texi @@ -800,6 +800,39 @@ lookups where they're probably not needed. If you don't understand the above description, you probably won't need this option. + +@cindex file names, restrict +@cindex Windows file names +@itemx --restrict-file-names=none|unix|windows +Restrict characters that may occur in local file names created by Wget +from remote URLs. Characters that are considered @dfn{unsafe} under a +set of restrictions are escaped, i.e. replaced with @samp{%XX}, where +@samp{XX} is the hexadecimal code of the character. + +The default for this option depends on the operating system: on Unix and +Unix-like OS'es, it defaults to ``unix''. Under Windows and Cygwin, it +defaults to ``windows''. Changing the default is useful when you are +using a non-native partition, e.g. when downloading files to a Windows +partition mounted from Linux, or when using NFS-mounted or SMB-mounted +Windows drives. + +When set to ``none'', the only characters that are quoted are those that +are impossible to get into a file name---the NUL character and @samp{/}. +The control characters, newline, etc. are all placed into file names. + +When set to ``unix'', additional unsafe characters are those in the +0--31 range and in the 128--159 range. This is because those characters +are typically not printable. + +When set to ``windows'', all of the above are quoted, along with +@samp{\}, @samp{|}, @samp{:}, @samp{?}, @samp{"}, @samp{*}, @samp{<}, +and @samp{>}. Additionally, Wget in Windows mode uses @samp{+} instead +of @samp{:} to separate host and port in local file names, and uses +@samp{@@} instead of @samp{?} to separate the query portion of the file +name from the rest. Therefore, a URL that would be saved as +@samp{www.xemacs.org:4300/search.pl?input=blah} in Unix mode would be +saved as @samp{www.xemacs.org+4300/search.pl@@input=blah} in Windows +mode. 
@end table @node Directory Options, HTTP Options, Download Options, Invoking @@ -2241,6 +2274,10 @@ Links}). If set to on, remove @sc{ftp} listings downloaded by Wget. Setting it to off is the same as @samp{-nr}. +@item restrict_file_names = none/unix/windows +Restrict the file names generated by Wget from URLs. See +@samp{--restrict-file-names} for a more detailed description. + @item retr_symlinks = on/off When set to on, retrieve symbolic links as if they were plain files; the same as @samp{--retr-symlinks}. diff --git a/src/ChangeLog b/src/ChangeLog index d094f5c0..356082d3 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,31 @@ +2003-09-14 Hrvoje Niksic + + * url.c (append_uri_pathel): Use opt.restrict_file_names when + calling file_unsafe_char. + + * init.c: New command restrict_file_names. + + * main.c (main): New option --restrict-file-names[=windows,unix]. + + * url.c (url_file_name): Renamed from url_filename. + (url_file_name): Add directory and hostdir prefix here, not in + mkstruct. + (append_dir_structure): New function, does part of the work that + used to be in mkstruct. Iterates over path elements in u->path, + calling append_uri_pathel on each one to append it to the file + name. + (append_uri_pathel): URL-unescape a path element and reencode it + with a different set of rules, more appropriate for handling of + files. + (file_unsafe_char): New function, uses a lookup table to decide + whether a character should be escaped for use in file name. + (append_string): New utility function. + (append_char): Ditto. + (file_unsafe_char): New argument restrict_for_windows, decide + whether Windows file names should be escaped in run-time. + + * connect.c: Include to get prototype for abort(). 
+ 2003-09-14 Hrvoje Niksic * utils.c (wtimer_sys_set): Extracted the code that sets the diff --git a/src/connect.c b/src/connect.c index 99f0909d..26dc404d 100644 --- a/src/connect.c +++ b/src/connect.c @@ -30,6 +30,7 @@ so, delete this exception statement from your version. */ #include #include +#include #include #ifdef HAVE_UNISTD_H # include diff --git a/src/ftp-ls.c b/src/ftp-ls.c index 47982777..919b4a60 100644 --- a/src/ftp-ls.c +++ b/src/ftp-ls.c @@ -842,8 +842,8 @@ ftp_index (const char *file, struct url *u, struct fileinfo *f) { char *tmpu, *tmpp; /* temporary, clean user and passwd */ - tmpu = encode_string (u->user); - tmpp = u->passwd ? encode_string (u->passwd) : NULL; + tmpu = url_escape (u->user); + tmpp = u->passwd ? url_escape (u->passwd) : NULL; upwd = (char *)xmalloc (strlen (tmpu) + (tmpp ? (1 + strlen (tmpp)) : 0) + 2); sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : ""); @@ -863,7 +863,8 @@ ftp_index (const char *file, struct url *u, struct fileinfo *f) fprintf (fp, " "); if (f->tstamp != -1) { - /* #### Should we translate the months? */ + /* #### Should we translate the months? Or, even better, use + ISO 8601 dates? */ static char *months[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" diff --git a/src/ftp.c b/src/ftp.c index 3159171f..d70969ad 100644 --- a/src/ftp.c +++ b/src/ftp.c @@ -1025,7 +1025,7 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con) struct stat st; if (!con->target) - con->target = url_filename (u); + con->target = url_file_name (u); if (opt.noclobber && file_exists_p (con->target)) { @@ -1245,7 +1245,7 @@ ftp_get_listing (struct url *u, ccon *con, struct fileinfo **f) /* Find the listing file name. We do it by taking the file name of the URL and replacing the last component with the listing file name. 
*/ - uf = url_filename (u); + uf = url_file_name (u); lf = file_merge (uf, LIST_FILENAME); xfree (uf); DEBUGP ((_("Using `%s' as listing tmp file.\n"), lf)); @@ -1335,7 +1335,7 @@ ftp_retrieve_list (struct url *u, struct fileinfo *f, ccon *con) ofile = xstrdup (u->file); url_set_file (u, f->name); - con->target = url_filename (u); + con->target = url_file_name (u); err = RETROK; dlthis = 1; @@ -1723,7 +1723,7 @@ ftp_loop (struct url *u, int *dt, struct url *proxy) char *filename = (opt.output_document ? xstrdup (opt.output_document) : (con.target ? xstrdup (con.target) - : url_filename (u))); + : url_file_name (u))); res = ftp_index (filename, u, f); if (res == FTPOK && opt.verbose) { diff --git a/src/http.c b/src/http.c index 14176aac..82c6d8de 100644 --- a/src/http.c +++ b/src/http.c @@ -1614,12 +1614,12 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer, hstat.local_file = local_file; else if (local_file) { - *local_file = url_filename (u); + *local_file = url_file_name (u); hstat.local_file = local_file; } else { - dummy = url_filename (u); + dummy = url_file_name (u); hstat.local_file = &dummy; } diff --git a/src/init.c b/src/init.c index 124bfb10..bce2427a 100644 --- a/src/init.c +++ b/src/init.c @@ -100,6 +100,7 @@ CMD_DECLARE (cmd_spec_htmlify); CMD_DECLARE (cmd_spec_mirror); CMD_DECLARE (cmd_spec_progress); CMD_DECLARE (cmd_spec_recursive); +CMD_DECLARE (cmd_spec_restrict_file_names); CMD_DECLARE (cmd_spec_useragent); /* List of recognized commands, each consisting of name, closure and function. 
@@ -188,6 +189,7 @@ static struct { { "reject", &opt.rejects, cmd_vector }, { "relativeonly", &opt.relative_only, cmd_boolean }, { "removelisting", &opt.remove_listing, cmd_boolean }, + { "restrictfilenames", &opt.restrict_file_names, cmd_spec_restrict_file_names }, { "retrsymlinks", &opt.retr_symlinks, cmd_boolean }, { "retryconnrefused", &opt.retry_connrefused, cmd_boolean }, { "robots", &opt.use_robots, cmd_boolean }, @@ -281,6 +283,13 @@ defaults (void) opt.dots_in_line = 50; opt.dns_cache = 1; + + /* The default for file name restriction defaults to the OS type. */ +#if !defined(WINDOWS) && !defined(__CYGWIN__) + opt.restrict_file_names = restrict_shell; +#else + opt.restrict_file_names = restrict_windows; +#endif } /* Return the user's home directory (strdup-ed), or NULL if none is @@ -1008,6 +1017,26 @@ cmd_spec_recursive (const char *com, const char *val, void *closure) return 1; } +static int +cmd_spec_restrict_file_names (const char *com, const char *val, void *closure) +{ + /* The currently accepted values are `none', `unix', and + `windows'. 
*/ + if (0 == strcasecmp (val, "none")) + opt.restrict_file_names = restrict_none; + else if (0 == strcasecmp (val, "unix")) + opt.restrict_file_names = restrict_shell; + else if (0 == strcasecmp (val, "windows")) + opt.restrict_file_names = restrict_windows; + else + { + fprintf (stderr, _("%s: %s: Invalid specification `%s'.\n"), + exec_name, com, val); + return 0; + } + return 1; +} + static int cmd_spec_useragent (const char *com, const char *val, void *closure) { diff --git a/src/main.c b/src/main.c index 67bf55cd..77e1bf30 100644 --- a/src/main.c +++ b/src/main.c @@ -179,10 +179,11 @@ Download:\n\ --bind-address=ADDRESS bind to ADDRESS (hostname or IP) on local host.\n\ --limit-rate=RATE limit download rate to RATE.\n\ --dns-cache=off disable caching DNS lookups.\n\ + --restrict-file-names=MODE restrict chars in file names to MODE.\n\ \n"), stdout); fputs (_("\ Directories:\n\ - -nd --no-directories don\'t create directories.\n\ + -nd, --no-directories don\'t create directories.\n\ -x, --force-directories force creation of directories.\n\ -nH, --no-host-directories don\'t create host directories.\n\ -P, --directory-prefix=PREFIX save files to PREFIX/...\n\ @@ -344,6 +345,7 @@ main (int argc, char *const *argv) { "proxy-user", required_argument, NULL, 143 }, { "quota", required_argument, NULL, 'Q' }, { "reject", required_argument, NULL, 'R' }, + { "restrict-file-names", required_argument, NULL, 176 }, { "save-cookies", required_argument, NULL, 162 }, { "timeout", required_argument, NULL, 'T' }, { "tries", required_argument, NULL, 't' }, @@ -610,6 +612,9 @@ GNU General Public License for more details.\n")); case 175: setval ("dnscache", optarg); break; + case 176: + setval ("restrictfilenames", optarg); + break; case 'A': setval ("accept", optarg); break; diff --git a/src/options.h b/src/options.h index e7eff5e2..7010cd41 100644 --- a/src/options.h +++ b/src/options.h @@ -184,6 +184,12 @@ struct options char *post_data; /* POST query string */ char 
*post_file_name; /* File to post */ + + enum { + restrict_none, + restrict_shell, + restrict_windows + } restrict_file_names; /* whether we restrict file name chars. */ }; extern struct options opt; diff --git a/src/url.c b/src/url.c index eac1cfdd..3a8feb70 100644 --- a/src/url.c +++ b/src/url.c @@ -1,5 +1,6 @@ /* URL handling. - Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc. + Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003 + Free Software Foundation, Inc. This file is part of GNU Wget. @@ -95,24 +96,22 @@ static int path_simplify PARAMS ((char *)); code assumes ASCII character set and 8-bit chars. */ enum { + /* rfc1738 reserved chars, preserved from encoding. */ urlchr_reserved = 1, + + /* rfc1738 unsafe chars, plus some more. */ urlchr_unsafe = 2 }; +#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask)) +#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved) +#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe) + +/* Shorthands for the table: */ #define R urlchr_reserved #define U urlchr_unsafe #define RU R|U -#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask)) - -/* rfc1738 reserved chars, preserved from encoding. */ - -#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved) - -/* rfc1738 unsafe chars, plus some more. */ - -#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe) - const static unsigned char urlchr_table[256] = { U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */ @@ -142,6 +141,9 @@ const static unsigned char urlchr_table[256] = U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, }; +#undef R +#undef U +#undef RU /* Decodes the forms %xy in a URL to the character the hexadecimal code of which is xy. xy are hexadecimal digits from @@ -150,7 +152,7 @@ const static unsigned char urlchr_table[256] = literally. 
*/ static void -decode_string (char *s) +url_unescape (char *s) { char *t = s; /* t - tortoise */ char *h = s; /* h - hare */ @@ -175,10 +177,10 @@ decode_string (char *s) *t = '\0'; } -/* Like encode_string, but return S if there are no unsafe chars. */ +/* Like url_escape, but return S if there are no unsafe chars. */ static char * -encode_string_maybe (const char *s) +url_escape_allow_passthrough (const char *s) { const char *p1; char *p2, *newstr; @@ -186,7 +188,7 @@ encode_string_maybe (const char *s) int addition = 0; for (p1 = s; *p1; p1++) - if (UNSAFE_CHAR (*p1)) + if (URL_UNSAFE_CHAR (*p1)) addition += 2; /* Two more characters (hex digits) */ if (!addition) @@ -199,7 +201,7 @@ encode_string_maybe (const char *s) p2 = newstr; while (*p1) { - if (UNSAFE_CHAR (*p1)) + if (URL_UNSAFE_CHAR (*p1)) { unsigned char c = *p1++; *p2++ = '%'; @@ -215,13 +217,13 @@ encode_string_maybe (const char *s) return newstr; } -/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a +/* Encode the unsafe characters (as determined by URL_UNSAFE_CHAR) in a given string, returning a malloc-ed %XX encoded string. */ char * -encode_string (const char *s) +url_escape (const char *s) { - char *encoded = encode_string_maybe (s); + char *encoded = url_escape_allow_passthrough (s); if (encoded != s) return encoded; else @@ -232,13 +234,13 @@ encode_string (const char *s) the old value of PTR is freed and PTR is made to point to the newly allocated storage. 
*/ -#define ENCODE(ptr) do { \ - char *e_new = encode_string_maybe (ptr); \ - if (e_new != ptr) \ - { \ - xfree (ptr); \ - ptr = e_new; \ - } \ +#define ENCODE(ptr) do { \ + char *e_new = url_escape_allow_passthrough (ptr); \ + if (e_new != ptr) \ + { \ + xfree (ptr); \ + ptr = e_new; \ + } \ } while (0) enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH }; @@ -258,7 +260,7 @@ decide_copy_method (const char *p) char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) + XCHAR_TO_XDIGIT (*(p + 2)); - if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt)) + if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt)) return CM_PASSTHROUGH; else return CM_DECODE; @@ -267,20 +269,20 @@ decide_copy_method (const char *p) /* Garbled %.. sequence: encode `%'. */ return CM_ENCODE; } - else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p)) + else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p)) return CM_ENCODE; else return CM_PASSTHROUGH; } -/* Translate a %-quoting (but possibly non-conformant) input string S - into a %-quoting (and conformant) output string. If no characters +/* Translate a %-escaped (but possibly non-conformant) input string S + into a %-escaped (and conformant) output string. If no characters are encoded or decoded, return the same string S; otherwise, return a freshly allocated string with the new contents. After a URL has been run through this function, the protocols that use `%' as the quote character can use the resulting string as-is, - while those that don't call decode_string() to get to the intended + while those that don't call url_unescape() to get to the intended data. This function is also stable: after an input string is transformed the first time, all further transformations of the result yield the same result string. @@ -293,20 +295,21 @@ decide_copy_method (const char *p) GET /abc%20def HTTP/1.0 - So it appears that the unsafe chars need to be quoted, as with - encode_string. But what if we're requested to download - `abc%20def'? 
Remember that %-encoding is valid URL syntax, so what - the user meant was a literal space, and he was kind enough to quote - it. In that case, Wget should obviously leave the `%20' as is, and - send the same request as above. So in this case we may not call - encode_string. - - But what if the requested URI is `abc%20 def'? If we call - encode_string, we end up with `/abc%2520%20def', which is almost - certainly not intended. If we don't call encode_string, we are - left with the embedded space and cannot send the request. What the + It appears that the unsafe chars need to be quoted, for example + with url_escape. But what if we're requested to download + `abc%20def'? url_escape transforms "%" to "%25", which would leave + us with `abc%2520def'. This is incorrect -- since %-escapes are + part of URL syntax, "%20" is the correct way to denote a literal + space on the Wget command line. This leaves us in the conclusion + that in that case Wget should not call url_escape, but leave the + `%20' as is. + + And what if the requested URI is `abc%20 def'? If we call + url_escape, we end up with `/abc%2520%20def', which is almost + certainly not intended. If we don't call url_escape, we are left + with the embedded space and cannot complete the request. What the user meant was for Wget to request `/abc%20%20def', and this is - where reencode_string kicks in. + where reencode_escapes kicks in. Wget used to solve this by first decoding %-quotes, and then encoding all the "unsafe" characters found in the resulting string. @@ -317,7 +320,7 @@ decide_copy_method (const char *p) is inevitable because by the second step we would lose information on whether the `+' was originally encoded or not. Both results were wrong because in CGI parameters + means space, while %2B means - literal plus. reencode_string correctly translates the above to + literal plus. reencode_escapes correctly translates the above to "a%2B+b", i.e. returns the original string. 
This function uses an algorithm proposed by Anon Sricharoenchai: @@ -352,7 +355,7 @@ decide_copy_method (const char *p) "foo%2b+bar" -> "foo%2b+bar" */ static char * -reencode_string (const char *s) +reencode_escapes (const char *s) { const char *p1; char *newstr, *p2; @@ -417,12 +420,12 @@ reencode_string (const char *s) return newstr; } -/* Run PTR_VAR through reencode_string. If a new string is consed, +/* Run PTR_VAR through reencode_escapes. If a new string is consed, free PTR_VAR and make it point to the new storage. Obviously, PTR_VAR needs to be an lvalue. */ #define REENCODE(ptr_var) do { \ - char *rf_new = reencode_string (ptr_var); \ + char *rf_new = reencode_escapes (ptr_var); \ if (rf_new != ptr_var) \ { \ xfree (ptr_var); \ @@ -544,9 +547,9 @@ parse_uname (const char *str, int len, char **user, char **passwd) (*user)[len] = '\0'; if (*user) - decode_string (*user); + url_unescape (*user); if (*passwd) - decode_string (*passwd); + url_unescape (*passwd); return 1; } @@ -611,6 +614,10 @@ rewrite_shorthand_url (const char *url) static void parse_path PARAMS ((const char *, char **, char **)); +/* Like strpbrk, with the exception that it returns the pointer to the + terminating zero (end-of-string aka "eos") if no matching character + is found. */ + static char * strpbrk_or_eos (const char *s, const char *accept) { @@ -825,7 +832,7 @@ url_parse (const char *url, int *error) return NULL; } - url_encoded = reencode_string (url); + url_encoded = reencode_escapes (url); p = url_encoded; p += strlen (supported_schemes[scheme].leading_string); @@ -1016,9 +1023,9 @@ url_parse (const char *url, int *error) else { if (url_encoded == url) - u->url = xstrdup (url); + u->url = xstrdup (url); else - u->url = url_encoded; + u->url = url_encoded; } url_encoded = NULL; @@ -1032,13 +1039,13 @@ url_error (int error_code) return parse_errors[error_code]; } +/* Parse PATH into dir and file. PATH is extracted from the URL and + is URL-escaped. 
The function returns unescaped DIR and FILE. */ + static void -parse_path (const char *quoted_path, char **dir, char **file) +parse_path (const char *path, char **dir, char **file) { - char *path, *last_slash; - - STRDUP_ALLOCA (path, quoted_path); - decode_string (path); + char *last_slash; last_slash = strrchr (path, '/'); if (!last_slash) @@ -1051,6 +1058,8 @@ parse_path (const char *quoted_path, char **dir, char **file) *dir = strdupdelim (path, last_slash); *file = xstrdup (last_slash + 1); } + url_unescape (*dir); + url_unescape (*file); } /* Note: URL's "full path" is the path with the query string and @@ -1303,8 +1312,6 @@ rotate_backups(const char *fname) { sprintf (from, "%s.%d", fname, i - 1); sprintf (to, "%s.%d", fname, i); - /* #### This will fail on machines without the rename() system - call. */ rename (from, to); } @@ -1323,11 +1330,14 @@ mkalldirs (const char *path) int res; p = path + strlen (path); - for (; *p != '/' && p != path; p--); + for (; *p != '/' && p != path; p--) + ; + /* Don't create if it's just a file. */ if ((p == path) && (*p != '/')) return 0; t = strdupdelim (path, p); + /* Check whether the directory exists. */ if ((stat (t, &st) == 0)) { @@ -1360,194 +1370,302 @@ mkalldirs (const char *path) xfree (t); return res; } + +/* Functions for constructing the file name out of URL components. */ -static int -count_slashes (const char *s) +/* A growable string structure, used by url_file_name and friends. + This should perhaps be moved to utils.c. + + The idea is to have an easy way to construct a string by having + various functions append data to it. Instead of passing the + obligatory BASEVAR, SIZEVAR and TAILPOS to all the functions in + questions, we pass the pointer to this struct. */ + +struct growable { + char *base; + int size; + int tail; +}; + +/* Ensure that the string can accept APPEND_COUNT more characters past + the current TAIL position. If necessary, this will grow the string + and update its allocated size. 
If the string is already large + enough to take TAIL+APPEND_COUNT characters, this does nothing. */ +#define GROW(g, append_size) do { \ + struct growable *G_ = g; \ + DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \ +} while (0) + +/* Return the tail position of the string. */ +#define TAIL(r) ((r)->base + (r)->tail) + +/* Move the tail position by APPEND_COUNT characters. */ +#define TAIL_INCR(r, append_count) ((r)->tail += append_count) + +/* Append the string STR to DEST. NOTICE: the string in DEST is not + terminated. */ + +static void +append_string (const char *str, struct growable *dest) { - int i = 0; - while (*s) - if (*s++ == '/') - ++i; - return i; + int l = strlen (str); + GROW (dest, l); + memcpy (TAIL (dest), str, l); + TAIL_INCR (dest, l); } -/* Return the path name of the URL-equivalent file name, with a - remote-like structure of directories. */ -static char * -mkstruct (const struct url *u) +/* Append CH to DEST. For example, append_char (0, DEST) + zero-terminates DEST. */ + +static void +append_char (char ch, struct growable *dest) { - char *dir, *file; - char *res, *dirpref; - int l; + GROW (dest, 1); + *TAIL (dest) = ch; + TAIL_INCR (dest, 1); +} - if (opt.cut_dirs) - { - char *ptr = u->dir + (*u->dir == '/'); - int slash_count = 1 + count_slashes (ptr); - int cut = MINVAL (opt.cut_dirs, slash_count); - for (; cut && *ptr; ptr++) - if (*ptr == '/') - --cut; - STRDUP_ALLOCA (dir, ptr); - } - else - dir = u->dir + (*u->dir == '/'); +enum { + filechr_unsafe_always = 1, /* always unsafe, e.g. / or \0 */ + filechr_unsafe_shell = 2, /* unsafe for shell use, e.g. control chars */ + filechr_unsafe_windows = 4, /* disallowed on Windows file system */ +}; - /* Check for the true name (or at least a consistent name for saving - to directory) of HOST, reusing the hlist if possible. */ - if (opt.add_hostdir) - { - /* Add dir_prefix and hostname (if required) to the beginning of - dir. 
*/ - dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1 - + strlen (u->host) - + 1 + numdigit (u->port) - + 1); - if (!DOTP (opt.dir_prefix)) - sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host); - else - strcpy (dirpref, u->host); +#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask)) - if (u->port != scheme_default_port (u->scheme)) - { - int len = strlen (dirpref); - dirpref[len] = ':'; - number_to_string (dirpref + len + 1, u->port); - } - } - else /* not add_hostdir */ - { - if (!DOTP (opt.dir_prefix)) - dirpref = opt.dir_prefix; - else - dirpref = ""; - } +/* Shorthands for the table: */ +#define A filechr_unsafe_always +#define S filechr_unsafe_shell +#define W filechr_unsafe_windows - /* If there is a prefix, prepend it. */ - if (*dirpref) - { - char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2); - sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir); - dir = newdir; - } +/* Forbidden chars: - l = strlen (dir); - if (l && dir[l - 1] == '/') - dir[l - 1] = '\0'; + always: \0, / + Unix shell: 0-31, 128-159 + Windows: \, |, /, <, >, ?, :, ", * - if (!*u->file) - file = "index.html"; - else - file = u->file; + Arguably we could also claim `%' to be unsafe, since we use it as + the escape character. If we ever want to be able to reliably + translate file name back to URL, this would become + crucial. Right now, it's better to be minimal in escaping. */ + +const static unsigned char filechr_table[256] = +{ + A, S, S, S, S, S, S, S, /* NUL SOH STX ETX EOT ENQ ACK BEL */ + S, S, S, S, S, S, S, S, /* BS HT LF VT FF CR SO SI */ + S, S, S, S, S, S, S, S, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */ + S, S, S, S, S, S, S, S, /* CAN EM SUB ESC FS GS RS US */ + 0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */ + 0, 0, W, 0, 0, 0, 0, A, /* ( ) * + , - . / */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */ + 0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */ + 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */ + 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */ + 0, 0, 0, 0, W, 0, 0, 0, /* X Y Z [ \ ] ^ _ */ + 0, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */ + 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */ + 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ + 0, 0, 0, 0, W, 0, 0, 0, /* x y z { | } ~ DEL */ - /* Finally, construct the full name. */ - res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) - + 1); - sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file); + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 128-143 */ + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 144-159 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - return res; + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Return non-zero if character CH is unsafe for use in file or + directory name. Called by append_uri_pathel. */ + +static inline int +file_unsafe_char (char ch, int restrict) +{ + int mask = filechr_unsafe_always; + if (restrict == restrict_shell) + mask |= filechr_unsafe_shell; + else if (restrict == restrict_windows) + mask |= (filechr_unsafe_shell | filechr_unsafe_windows); + return FILE_CHAR_TEST (ch, mask); } -/* Compose a file name out of BASE, an unescaped file name, and QUERY, - an escaped query string. The trick is to make sure that unsafe - characters in BASE are escaped, and that slashes in QUERY are also - escaped. */ +/* FN_PORT_SEP is the separator between host and port in file names + for non-standard port numbers. On Unix this is normally ':', as in + "www.xemacs.org:4001/index.html". Under Windows, we set it to + + because Windows can't handle ':' in file names. */ +#define FN_PORT_SEP (opt.restrict_file_names != restrict_windows ? 
':' : '+') -static char * -compose_file_name (char *base, char *query) +/* FN_QUERY_SEP is the separator between the file name and the URL + query, normally '?'. Since Windows cannot handle '?' as part of + file name, we use '@' instead there. */ +#define FN_QUERY_SEP (opt.restrict_file_names != restrict_windows ? '?' : '@') + +/* Quote path element, characters in [b, e), as file name, and append + the quoted string to DEST. Each character is quoted as per + file_unsafe_char and the corresponding table. */ + +static void +append_uri_pathel (const char *b, const char *e, struct growable *dest) { - char result[256]; - char *from; - char *to = result; + char *pathel; + int pathlen; - /* Copy BASE to RESULT and encode all unsafe characters. */ - from = base; - while (*from && to - result < sizeof (result)) - { - if (UNSAFE_CHAR (*from)) - { - unsigned char c = *from++; - *to++ = '%'; - *to++ = XDIGIT_TO_XCHAR (c >> 4); - *to++ = XDIGIT_TO_XCHAR (c & 0xf); - } - else - *to++ = *from++; + const char *p; + int quoted, outlen; + + /* Currently restrict_for_windows is determined at compile time + only. But some users download files to Windows partitions; they + should be able to say --windows-file-names so Wget escapes + characters invalid on Windows. Similar run-time restrictions for + other file systems can be implemented. */ + const int restrict = opt.restrict_file_names; + + /* Copy [b, e) to PATHEL and URL-unescape it. */ + BOUNDED_TO_ALLOCA (b, e, pathel); + url_unescape (pathel); + pathlen = strlen (pathel); + + /* Go through PATHEL and check how many characters we'll need to + add for file quoting. */ + quoted = 0; + for (p = pathel; *p; p++) + if (file_unsafe_char (*p, restrict)) + ++quoted; + + /* p - pathel is the string length. Each quoted char means two + additional characters in the string, hence 2*quoted. 
*/ + outlen = (p - pathel) + (2 * quoted); + GROW (dest, outlen); + + if (!quoted) + { + /* If there's nothing to quote, we don't need to go through the + string the second time. */ + memcpy (TAIL (dest), pathel, outlen); } - - if (query && to - result < sizeof (result)) + else { - *to++ = '?'; - - /* Copy QUERY to RESULT and encode all '/' characters. */ - from = query; - while (*from && to - result < sizeof (result)) + char *q = TAIL (dest); + for (p = pathel; *p; p++) { - if (*from == '/') + if (!file_unsafe_char (*p, restrict)) + *q++ = *p; + else { - *to++ = '%'; - *to++ = '2'; - *to++ = 'F'; - ++from; + unsigned char ch = *p; + *q++ = '%'; + *q++ = XDIGIT_TO_XCHAR (ch >> 4); + *q++ = XDIGIT_TO_XCHAR (ch & 0xf); } - else - *to++ = *from++; } + assert (q - TAIL (dest) == outlen); } + TAIL_INCR (dest, outlen); +} - if (to - result < sizeof (result)) - *to = '\0'; - else - /* Truncate input which is too long, presumably due to a huge - query string. */ - result[sizeof (result) - 1] = '\0'; +/* Append to DEST the directory structure that corresponds to the + directory part of URL's path. For example, if the URL is + http://server/dir1/dir2/file, this appends "/dir1/dir2". + + Each path element ("dir1" and "dir2" in the above example) is + examined, url-unescaped, and re-escaped as file name element. + + Additionally, it cuts as many directories from the path as + specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it + will produce "/dir2" for the above example. For 2 or more, it will + produce "". + + Each component of the path is quoted for use as file name. */ - return xstrdup (result); +static void +append_dir_structure (const struct url *u, struct growable *dest) +{ + char *pathel, *next; + int cut = opt.cut_dirs; + + /* Go through the path components, de-URL-quote them, and quote them + (if necessary) as file names. 
*/
+
+  pathel = u->path;
+  for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
+    {
+      if (cut-- > 0)
+	continue;
+      if (pathel == next)
+	/* Ignore empty pathels.  path_simplify should remove
+	   occurrences of "//" from the path, but it has special cases
+	   for starting / which generates an empty pathel here.  */
+	continue;
+
+      if (dest->tail)
+	append_char ('/', dest);
+      append_uri_pathel (pathel, next, dest);
+    }
 }
 
-/* Create a unique filename, corresponding to a given URL.  Calls
-   mkstruct if necessary.  Does *not* actually create any directories.  */
+/* Return a unique file name that matches the given URL as well as
+   possible.  Does not create directories on the file system.  */
+
 char *
-url_filename (const struct url *u)
+url_file_name (const struct url *u)
 {
-  char *file, *name;
+  struct growable fnres;
+
+  char *u_file, *u_query;
+  char *fname, *unique;
 
-  char *query = u->query && *u->query ? u->query : NULL;
+  fnres.base = NULL;
+  fnres.size = 0;
+  fnres.tail = 0;
 
+  /* Start with the directory prefix, if specified. */
+  if (!DOTP (opt.dir_prefix))
+    append_string (opt.dir_prefix, &fnres);
+
+  /* If "dirstruct" is turned on (typically the case with -r), add
+     the host and port (unless those have been turned off) and
+     directory structure.  */
   if (opt.dirstruct)
     {
-      char *base = mkstruct (u);
-      file = compose_file_name (base, query);
-      xfree (base);
-    }
-  else
-    {
-      char *base = *u->file ? u->file : "index.html";
-      file = compose_file_name (base, query);
-
-      /* Check whether the prefix directory is something other than "."
-	 before prepending it.  */
-      if (!DOTP (opt.dir_prefix))
+      if (opt.add_hostdir)
 	{
-	  /* #### should just realloc FILE and prepend dir_prefix.
*/ - char *nfile = (char *)xmalloc (strlen (opt.dir_prefix) - + 1 + strlen (file) + 1); - sprintf (nfile, "%s/%s", opt.dir_prefix, file); - xfree (file); - file = nfile; + if (fnres.tail) + append_char ('/', &fnres); + append_string (u->host, &fnres); + if (u->port != scheme_default_port (u->scheme)) + { + char portstr[24]; + number_to_string (portstr, u->port); + append_char (FN_PORT_SEP, &fnres); + append_string (portstr, &fnres); + } } + + append_dir_structure (u, &fnres); } - /* DOS-ish file systems don't like `%' signs in them; we change it - to `@'. */ -#ifdef WINDOWS - { - char *p = file; - for (p = file; *p; p++) - if (*p == '%') - *p = '@'; - } -#endif /* WINDOWS */ + /* Add the file name. */ + if (fnres.tail) + append_char ('/', &fnres); + u_file = *u->file ? u->file : "index.html"; + append_uri_pathel (u_file, u_file + strlen (u_file), &fnres); + + /* Append "?query" to the file name. */ + u_query = u->query && *u->query ? u->query : NULL; + if (u_query) + { + append_char (FN_QUERY_SEP, &fnres); + append_uri_pathel (u_query, u_query + strlen (u_query), &fnres); + } + + /* Zero-terminate the file name. */ + append_char ('\0', &fnres); + + fname = fnres.base; /* Check the cases in which the unique extensions are not used: 1) Clobbering is turned off (-nc). @@ -1557,17 +1675,18 @@ url_filename (const struct url *u) The exception is the case when file does exist and is a directory (actually support for bad httpd-s). */ + if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct) - && !(file_exists_p (file) && !file_non_directory_p (file))) - return file; + && !(file_exists_p (fname) && !file_non_directory_p (fname))) + return fnres.base; /* Find a unique name. */ - name = unique_name (file); - xfree (file); - return name; + unique = unique_name (fname); + xfree (fname); + return unique; } -/* Return the langth of URL's path. Path is considered to be +/* Return the length of URL's path. 
Path is considered to be terminated by one of '?', ';', '#', or by the end of the string. */ static int @@ -1680,8 +1799,10 @@ path_simplify (char *path) else if (*p == '/') { /* Remove empty path elements. Not mandated by rfc1808 et - al, but empty path elements are not all that useful, and - the rest of Wget might not deal with them well. */ + al, but it seems like a good idea to get rid of them. + Supporting them properly is hard (in which directory do + you save http://x.com///y.html?) and they don't seem to + bring much gain. */ char *q = p; while (*q == '/') ++q; @@ -1964,13 +2085,13 @@ url_string (const struct url *url, int hide_password) /* Make sure the user name and password are quoted. */ if (url->user) { - quoted_user = encode_string_maybe (url->user); + quoted_user = url_escape_allow_passthrough (url->user); if (url->passwd) { if (hide_password) quoted_passwd = HIDDEN_PASSWORD; else - quoted_passwd = encode_string_maybe (url->passwd); + quoted_passwd = url_escape_allow_passthrough (url->passwd); } } diff --git a/src/url.h b/src/url.h index bd88d950..d80fe54d 100644 --- a/src/url.h +++ b/src/url.h @@ -130,7 +130,7 @@ typedef enum /* Function declarations */ -char *encode_string PARAMS ((const char *)); +char *url_escape PARAMS ((const char *)); struct url *url_parse PARAMS ((const char *, int *)); const char *url_error PARAMS ((int)); @@ -157,7 +157,7 @@ char *uri_merge PARAMS ((const char *, const char *)); void rotate_backups PARAMS ((const char *)); int mkalldirs PARAMS ((const char *)); -char *url_filename PARAMS ((const struct url *)); +char *url_file_name PARAMS ((const struct url *)); char *getproxy PARAMS ((struct url *)); int no_proxy_match PARAMS ((const char *, const char **)); -- 2.39.2