[svn] Improved --restrict-file-names to accept ",nocontrol".

[wget] / src / url.c
diff --git a/src/url.c b/src/url.c

index 3a8feb70e28f646d3cd56f6d3fa95bf0a60cfe4d..307da8d572c42a0e053d2a87c285a6406afd1a3a 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -145,11 +145,14 @@ const static unsigned char urlchr_table[256] =
  #undef U
  #undef RU
  
-/* Decodes the forms %xy in a URL to the character the hexadecimal
-   code of which is xy.  xy are hexadecimal digits from
-   [0123456789ABCDEF] (case-insensitive).  If x or y are not
-   hex-digits or `%' precedes `\0', the sequence is inserted
-   literally.  */
+/* URL-unescape the string S.
+
+   This is done by transforming the sequences "%HH" to the character
+   represented by the hexadecimal digits HH.  If % is not followed by
+   two hexadecimal digits, it is inserted literally.
+
+   The transformation is done in place.  If you need the original
+   string intact, make a copy before calling this function.  */
  
  static void
  url_unescape (char *s)
@@ -177,10 +180,15 @@ url_unescape (char *s)
    *t = '\0';
  }
  
-/* Like url_escape, but return S if there are no unsafe chars.  */
+/* The core of url_escape_* functions.  Escapes the characters that
+   match the provided mask in urlchr_table.
+
+   If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
+   will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
+   freshly allocated string will be returned in all cases.  */
  
  static char *
-url_escape_allow_passthrough (const char *s)
+url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
  {
    const char *p1;
    char *p2, *newstr;
@@ -188,11 +196,11 @@ url_escape_allow_passthrough (const char *s)
    int addition = 0;
  
    for (p1 = s; *p1; p1++)
-    if (URL_UNSAFE_CHAR (*p1))
+    if (urlchr_test (*p1, mask))
        addition += 2;           /* Two more characters (hex digits) */
  
    if (!addition)
-    return (char *)s;
+    return allow_passthrough ? (char *)s : xstrdup (s);
  
    newlen = (p1 - s) + addition;
    newstr = (char *)xmalloc (newlen + 1);
@@ -201,7 +209,8 @@ url_escape_allow_passthrough (const char *s)
    p2 = newstr;
    while (*p1)
      {
-      if (URL_UNSAFE_CHAR (*p1))
+      /* Quote the characters that match the test mask. */
+      if (urlchr_test (*p1, mask))
         {
           unsigned char c = *p1++;
           *p2++ = '%';
@@ -211,37 +220,29 @@ url_escape_allow_passthrough (const char *s)
        else
         *p2++ = *p1++;
      }
-  *p2 = '\0';
    assert (p2 - newstr == newlen);
+  *p2 = '\0';
  
    return newstr;
  }
  
-/* Encode the unsafe characters (as determined by URL_UNSAFE_CHAR) in a
-   given string, returning a malloc-ed %XX encoded string.  */
-  
+/* URL-escape the unsafe characters (see urlchr_table) in a given
+   string, returning a freshly allocated string.  */
+
  char *
  url_escape (const char *s)
  {
-  char *encoded = url_escape_allow_passthrough (s);
-  if (encoded != s)
-    return encoded;
-  else
-    return xstrdup (s);
+  return url_escape_1 (s, urlchr_unsafe, 0);
  }
  
-/* Encode unsafe characters in PTR to %xx.  If such encoding is done,
-   the old value of PTR is freed and PTR is made to point to the newly
-   allocated storage.  */
-
-#define ENCODE(ptr) do {                               \
-  char *e_new = url_escape_allow_passthrough (ptr);    \
-  if (e_new != ptr)                                    \
-    {                                                  \
-      xfree (ptr);                                     \
-      ptr = e_new;                                     \
-    }                                                  \
-} while (0)
+/* URL-escape the unsafe characters (see urlchr_table) in a given
+   string.  If no characters are unsafe, S is returned.  */
+
+static char *
+url_escape_allow_passthrough (const char *s)
+{
+  return url_escape_1 (s, urlchr_unsafe, 1);
+}
  \f
  enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
  
@@ -419,19 +420,6 @@ reencode_escapes (const char *s)
    assert (p2 - newstr == newlen);
    return newstr;
  }
-
-/* Run PTR_VAR through reencode_escapes.  If a new string is consed,
-   free PTR_VAR and make it point to the new storage.  Obviously,
-   PTR_VAR needs to be an lvalue.  */
-
-#define REENCODE(ptr_var) do {                 \
-  char *rf_new = reencode_escapes (ptr_var);   \
-  if (rf_new != ptr_var)                       \
-    {                                          \
-      xfree (ptr_var);                         \
-      ptr_var = rf_new;                                \
-    }                                          \
-} while (0)
  \f
  /* Returns the scheme type if the scheme is supported, or
     SCHEME_INVALID if not.  */
@@ -616,7 +604,26 @@ static void parse_path PARAMS ((const char *, char **, char **));
  
  /* Like strpbrk, with the exception that it returns the pointer to the
     terminating zero (end-of-string aka "eos") if no matching character
-   is found.  */
+   is found.
+
+   Although I normally balk at GCC-specific optimizations, it probably
+   makes sense here: glibc has optimizations that detect strpbrk being
+   called with a literal string as ACCEPT and inline the search.  That
+   optimization is defeated if strpbrk is hidden within the call to
+   another function.  (And no, making strpbrk_or_eos inline doesn't
+   help because the check for a literal ACCEPT is done in the
+   preprocessor.)  */
+
+#ifdef __GNUC__
+
+#define strpbrk_or_eos(s, accept) ({           \
+  char *SOE_p = strpbrk (s, accept);           \
+  if (!SOE_p)                                  \
+    SOE_p = (char *)s + strlen (s);            \
+  SOE_p;                                       \
+})
+
+#else  /* not __GNUC__ */
  
  static char *
  strpbrk_or_eos (const char *s, const char *accept)
@@ -626,6 +633,7 @@ strpbrk_or_eos (const char *s, const char *accept)
      p = (char *)s + strlen (s);
    return p;
  }
+#endif
  
  /* Turn STR into lowercase; return non-zero if a character was
     actually changed. */
@@ -1125,38 +1133,83 @@ url_full_path (const struct url *url)
    return full_path;
  }
  
-/* Sync u->path and u->url with u->dir and u->file. */
+/* Escape unsafe and reserved characters in DIR, except for slashes.
+   Returns DIR itself if nothing was escaped, else a new string.  */
  
-static void
-sync_path (struct url *url)
+static char *
+url_escape_dir (const char *dir)
  {
-  char *newpath;
+  char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
+  char *h, *t;
+  if (newdir == dir)
+    return (char *)dir;
  
-  xfree (url->path);
+  /* Unescape slashes in NEWDIR. */
+
+  h = newdir;                  /* hare */
+  t = newdir;                  /* tortoise */
  
-  if (!*url->dir)
+  for (; *h; h++, t++)
      {
-      newpath = xstrdup (url->file);
-      REENCODE (newpath);
+      if (*h == '%' && h[1] == '2' && h[2] == 'F')
+       {
+         *t = '/';
+         h += 2;
+       }
+      else
+       *t = *h;
      }
+  *t = '\0';
+
+  return newdir;
+}
+
+/* Sync u->path and u->url with u->dir and u->file.  Called after
+   u->file or u->dir has been changed, typically by the FTP code.  */
+
+static void
+sync_path (struct url *u)
+{
+  char *newpath, *efile, *edir;
+
+  xfree (u->path);
+
+  /* u->dir and u->file are not escaped.  URL-escape them before
+     reassembling them into u->path.  That way, if they contain
+     separators like '?' or even if u->file contains slashes, the
+     path will be correctly assembled.  (u->file can contain slashes
+     if the URL specifies it with %2f, or if an FTP server returns
+     it.)  */
+  edir = url_escape_dir (u->dir);
+  efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
+
+  if (!*edir)
+    newpath = xstrdup (efile);
    else
      {
-      int dirlen = strlen (url->dir);
-      int filelen = strlen (url->file);
+      int dirlen = strlen (edir);
+      int filelen = strlen (efile);
  
-      newpath = xmalloc (dirlen + 1 + filelen + 1);
-      memcpy (newpath, url->dir, dirlen);
-      newpath[dirlen] = '/';
-      memcpy (newpath + dirlen + 1, url->file, filelen);
-      newpath[dirlen + 1 + filelen] = '\0';
-      REENCODE (newpath);
+      /* Copy "DIR/FILE" to newpath. */
+      char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
+      memcpy (p, edir, dirlen);
+      p += dirlen;
+      *p++ = '/';
+      memcpy (p, efile, filelen);
+      p += filelen;
+      *p++ = '\0';
      }
  
-  url->path = newpath;
+  u->path = newpath;
  
-  /* Synchronize u->url. */
-  xfree (url->url);
-  url->url = url_string (url, 0);
+  if (edir != u->dir)
+    xfree (edir);
+  if (efile != u->file)
+    xfree (efile);
+
+  /* Regenerate u->url as well.  */
+  xfree (u->url);
+  u->url = url_string (u, 0);
  }
  
  /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
@@ -1376,10 +1429,10 @@ mkalldirs (const char *path)
  /* A growable string structure, used by url_file_name and friends.
     This should perhaps be moved to utils.c.
  
-   The idea is to have an easy way to construct a string by having
-   various functions append data to it.  Instead of passing the
-   obligatory BASEVAR, SIZEVAR and TAILPOS to all the functions in
-   questions, we pass the pointer to this struct.  */
+   The idea is to have a convenient and efficient way to construct a
+   string by having various functions append data to it.  Instead of
+   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
+   functions in question, we pass the pointer to this struct.  */
  
  struct growable {
    char *base;
@@ -1426,23 +1479,22 @@ append_char (char ch, struct growable *dest)
  }
  
  enum {
-  filechr_unsafe_always  = 1,  /* always unsafe, e.g. / or \0 */
-  filechr_unsafe_shell   = 2,  /* unsafe for shell use, e.g. control chars */
-  filechr_unsafe_windows = 2,  /* disallowed on Windows file system */
+  filechr_not_unix    = 1,     /* unusable on Unix, / and \0 */
+  filechr_not_windows = 2,     /* unusable on Windows, one of \|/<>?:*" */
+  filechr_control     = 4,     /* a control character, e.g. 0-31 */
  };
  
  #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
  
  /* Shorthands for the table: */
-#define A filechr_unsafe_always
-#define S filechr_unsafe_shell
-#define W filechr_unsafe_windows
+#define U filechr_not_unix
+#define W filechr_not_windows
+#define C filechr_control
  
-/* Forbidden chars:
+#define UW U|W
+#define UWC U|W|C
  
-   always: \0, /
-   Unix shell: 0-31, 128-159
-   Windows:    \, |, /, <, >, ?, :
+/* Table of characters unsafe under various conditions (see above).
  
     Arguably we could also claim `%' to be unsafe, since we use it as
     the escape character.  If we ever want to be able to reliably
@@ -1451,12 +1503,12 @@ enum {
  
  const static unsigned char filechr_table[256] =
  {
-  A,  S,  S,  S,   S,  S,  S,  S,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
-  S,  S,  S,  S,   S,  S,  S,  S,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
-  S,  S,  S,  S,   S,  S,  S,  S,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
-  S,  S,  S,  S,   S,  S,  S,  S,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
+UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
+  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
+  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
+  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
    0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
-  0,  0,  W,  0,   0,  0,  0,  A,   /* (   )   *   +    ,   -   .   /   */
+  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
    0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
@@ -1468,8 +1520,8 @@ const static unsigned char filechr_table[256] =
    0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* x   y   z   {    |   }   ~   DEL */
  
-  S, S, S, S,  S, S, S, S,  S, S, S, S,  S, S, S, S, /* 128-143 */
-  S, S, S, S,  S, S, S, S,  S, S, S, S,  S, S, S, S, /* 144-159 */
+  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
+  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
    0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
    0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  
@@ -1479,30 +1531,16 @@ const static unsigned char filechr_table[256] =
    0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  };
  
-/* Return non-zero if character CH is unsafe for use in file or
-   directory name.  Called by append_uri_pathel. */
-
-static inline int
-file_unsafe_char (char ch, int restrict)
-{
-  int mask = filechr_unsafe_always;
-  if (restrict == restrict_shell)
-    mask |= filechr_unsafe_shell;
-  else if (restrict == restrict_windows)
-    mask |= (filechr_unsafe_shell | filechr_unsafe_windows);
-  return FILE_CHAR_TEST (ch, mask);
-}
-
  /* FN_PORT_SEP is the separator between host and port in file names
     for non-standard port numbers.  On Unix this is normally ':', as in
     "www.xemacs.org:4001/index.html".  Under Windows, we set it to +
     because Windows can't handle ':' in file names.  */
-#define FN_PORT_SEP  (opt.restrict_file_names != restrict_windows ? ':' : '+')
+#define FN_PORT_SEP  (opt.restrict_files_os != restrict_windows ? ':' : '+')
  
  /* FN_QUERY_SEP is the separator between the file name and the URL
     query, normally '?'.  Since Windows cannot handle '?' as part of
     file name, we use '@' instead there.  */
-#define FN_QUERY_SEP (opt.restrict_file_names != restrict_windows ? '?' : '@')
+#define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
  
  /* Quote path element, characters in [b, e), as file name, and append
     the quoted string to DEST.  Each character is quoted as per
@@ -1517,12 +1555,13 @@ append_uri_pathel (const char *b, const char *e, struct growable *dest)
    const char *p;
    int quoted, outlen;
  
-  /* Currently restrict_for_windows is determined at compile time
-     only.  But some users download files to Windows partitions; they
-     should be able to say --windows-file-names so Wget escapes
-     characters invalid on Windows.  Similar run-time restrictions for
-     other file systems can be implemented.  */
-  const int restrict = opt.restrict_file_names;
+  int mask;
+  if (opt.restrict_files_os == restrict_unix)
+    mask = filechr_not_unix;
+  else
+    mask = filechr_not_windows;
+  if (opt.restrict_files_ctrl)
+    mask |= filechr_control;
  
    /* Copy [b, e) to PATHEL and URL-unescape it. */
    BOUNDED_TO_ALLOCA (b, e, pathel);
@@ -1533,7 +1572,7 @@ append_uri_pathel (const char *b, const char *e, struct growable *dest)
       add for file quoting. */
    quoted = 0;
    for (p = pathel; *p; p++)
-    if (file_unsafe_char (*p, restrict))
+    if (FILE_CHAR_TEST (*p, mask))
        ++quoted;
  
    /* p - pathel is the string length.  Each quoted char means two
@@ -1552,7 +1591,7 @@ append_uri_pathel (const char *b, const char *e, struct growable *dest)
        char *q = TAIL (dest);
        for (p = pathel; *p; p++)
         {
-         if (!file_unsafe_char (*p, restrict))
+         if (!FILE_CHAR_TEST (*p, mask))
             *q++ = *p;
           else
             {
@@ -1674,15 +1713,15 @@ url_file_name (const struct url *u)
       4) Hierarchy is built.
  
       The exception is the case when file does exist and is a
-     directory (actually support for bad httpd-s).  */
+     directory (see `mkalldirs' for explanation).  */
  
    if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
        && !(file_exists_p (fname) && !file_non_directory_p (fname)))
-    return fnres.base;
+    return fname;
  
-  /* Find a unique name.  */
-  unique = unique_name (fname);
-  xfree (fname);
+  unique = unique_name (fname, 1);
+  if (unique != fname)
+    xfree (fname);
    return unique;
  }