[svn] Merge of fix for bugs 20341 and 20410.

[wget] / src / http.c
diff --git a/src/http.c b/src/http.c

index 4752ce3d7eec4ef7910681f63929aa513a22a27b..faeb0e7fd345cbe94826445e27f9fe18bff819f4 100644 (file)
--- a/src/http.c
+++ b/src/http.c
@@ -1,11 +1,11 @@
  /* HTTP support.
-   Copyright (C) 1996-2005 Free Software Foundation, Inc.
+   Copyright (C) 1996-2006 Free Software Foundation, Inc.
  
  This file is part of GNU Wget.
  
  GNU Wget is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.
  
  GNU Wget is distributed in the hope that it will be useful,
@@ -14,8 +14,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  
  In addition, as a special exception, the Free Software Foundation
  gives permission to link the code of its release of Wget with the
@@ -59,6 +58,7 @@ so, delete this exception statement from your version.  */
  # include "gen-md5.h"
  #endif
  #include "convert.h"
+#include "spider.h"
  
  #ifdef TESTING
  #include "test.h"
@@ -737,6 +737,20 @@ resp_free (struct response *resp)
    xfree (resp);
  }
  
+/* Log one response line, the characters [b, e), at LOG_VERBOSE with
+   PREFIX in front, passing a copy through escnonprint first.  Plain
+      logprintf (LOG_VERBOSE, "%s%.*s\n", prefix, (int) (e - b), b);
+   failed to escape the non-printable characters and, in fact,
+   caused crashes in UTF-8 locales.  */
+
+static void
+print_response_line(const char *prefix, const char *b, const char *e)
+{
+  char *copy;
+  BOUNDED_TO_ALLOCA(b, e, copy);
+  logprintf (LOG_VERBOSE, "%s%s\n", prefix, escnonprint(copy));
+}
+
  /* Print the server response, line by line, omitting the trailing CRLF
     from individual header lines, and prefixed with PREFIX.  */
  
@@ -755,9 +769,7 @@ print_server_response (const struct response *resp, const char *prefix)
          --e;
        if (b < e && e[-1] == '\r')
          --e;
-      /* This is safe even on printfs with broken handling of "%.<n>s"
-         because resp->headers ends with \0.  */
-      logprintf (LOG_VERBOSE, "%s%.*s\n", prefix, e - b, b);
+      print_response_line(prefix, b, e);
      }
  }
  
@@ -869,7 +881,7 @@ skip_short_body (int fd, wgint contlen)
  
  bool
  extract_param (const char **source, param_token *name, param_token *value,
-              char separator)
+               char separator)
  {
    const char *p = *source;
  
@@ -877,7 +889,7 @@ extract_param (const char **source, param_token *name, param_token *value,
    if (!*p)
      {
        *source = p;
-      return false;            /* no error; nothing more to extract */
+      return false;             /* no error; nothing more to extract */
      }
  
    /* Extract name. */
@@ -885,9 +897,9 @@ extract_param (const char **source, param_token *name, param_token *value,
    while (*p && !ISSPACE (*p) && *p != '=' && *p != separator) ++p;
    name->e = p;
    if (name->b == name->e)
-    return false;              /* empty name: error */
+    return false;               /* empty name: error */
    while (ISSPACE (*p)) ++p;
-  if (*p == separator || !*p)          /* no value */
+  if (*p == separator || !*p)           /* no value */
      {
        xzero (*value);
        if (*p == separator) ++p;
@@ -895,12 +907,12 @@ extract_param (const char **source, param_token *name, param_token *value,
        return true;
      }
    if (*p != '=')
-    return false;              /* error */
+    return false;               /* error */
  
    /* *p is '=', extract value */
    ++p;
    while (ISSPACE (*p)) ++p;
-  if (*p == '"')               /* quoted */
+  if (*p == '"')                /* quoted */
      {
        value->b = ++p;
        while (*p && *p != '"') ++p;
@@ -911,12 +923,12 @@ extract_param (const char **source, param_token *name, param_token *value,
        while (ISSPACE (*p)) ++p;
        while (*p && *p != separator) ++p;
        if (*p == separator)
-       ++p;
+        ++p;
        else if (*p)
-       /* garbage after closed quote, e.g. foo="bar"baz */
-       return false;
+        /* garbage after closed quote, e.g. foo="bar"baz */
+        return false;
      }
-  else                         /* unquoted */
+  else                          /* unquoted */
      {
        value->b = p;
        while (*p && *p != separator) ++p;
@@ -932,6 +944,23 @@ extract_param (const char **source, param_token *name, param_token *value,
  #undef MAX
  #define MAX(p, q) ((p) > (q) ? (p) : (q))
  
+/* Parse the contents of the `Content-Disposition' header, extracting
+   the information useful to Wget.  Content-Disposition is a header
+   borrowed from MIME; when used in HTTP, it typically serves for
+   specifying the desired file name of the resource.  For example:
+
+       Content-Disposition: attachment; filename="flora.jpg"
+
+   Wget will skip the tokens it doesn't care about, such as
+   "attachment" in the previous example; it will also skip other
+   unrecognized params.  If the header is syntactically correct and
+   contains a file name, a copy of the file name is stored in
+   *filename and true is returned.  Otherwise, the function returns
+   false.
+
+   The file name is stripped of directory components and must not be
+   empty.  */
+
  static bool
  parse_content_disposition (const char *hdr, char **filename)
  {
@@ -939,16 +968,34 @@ parse_content_disposition (const char *hdr, char **filename)
    while (extract_param (&hdr, &name, &value, ';'))
      if (BOUNDED_EQUAL_NO_CASE (name.b, name.e, "filename") && value.b != NULL)
        {
-       /* Make the file name begin at the last slash or backslash. */
+        /* Make the file name begin at the last slash or backslash. */
          const char *last_slash = memrchr (value.b, '/', value.e - value.b);
          const char *last_bs = memrchr (value.b, '\\', value.e - value.b);
          if (last_slash && last_bs)
            value.b = 1 + MAX (last_slash, last_bs);
          else if (last_slash || last_bs)
            value.b = 1 + (last_slash ? last_slash : last_bs);
-       if (value.b == value.e)
-         continue;
-        *filename = strdupdelim (value.b, value.e);
+        if (value.b == value.e)
+          continue;
+        /* Start with the directory prefix, if specified. */
+        if (opt.dir_prefix)
+          {
+            int prefix_length = strlen (opt.dir_prefix);
+            bool add_slash = (opt.dir_prefix[prefix_length - 1] != '/');
+            int total_length;
+
+            if (add_slash) 
+              ++prefix_length;
+            total_length = prefix_length + (value.e - value.b);            
+            *filename = xmalloc (total_length + 1);
+            strcpy (*filename, opt.dir_prefix);
+            if (add_slash) 
+              (*filename)[prefix_length - 1] = '/';
+            memcpy (*filename + prefix_length, value.b, (value.e - value.b));
+            (*filename)[total_length] = '\0';
+          }
+        else
+          *filename = strdupdelim (value.b, value.e);
          return true;
        }
    return false;
@@ -1709,33 +1756,46 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
  
    /* Determine the local filename if needed. Notice that if -O is used 
     * hstat.local_file is set by http_loop to the argument of -O. */
-  if (!hs->local_file)     
+  if (!hs->local_file)
      {
        /* Honor Content-Disposition whether possible. */
-      if (!resp_header_copy (resp, "Content-Disposition", hdrval, sizeof (hdrval))
+      if (!opt.content_disposition
+          || !resp_header_copy (resp, "Content-Disposition", 
+                                hdrval, sizeof (hdrval))
            || !parse_content_disposition (hdrval, &hs->local_file))
          {
-          /* Choose filename according to URL name. */
+          /* Content-Disposition is disabled, missing, or broken.
+           * Choose unique file name according to given URL. */
            hs->local_file = url_file_name (u);
          }
      }
    
    /* TODO: perform this check only once. */
-  if (opt.noclobber && file_exists_p (hs->local_file))
+  if (file_exists_p (hs->local_file))
      {
-      /* If opt.noclobber is turned on and file already exists, do not
-         retrieve the file */
-      logprintf (LOG_VERBOSE, _("\
+      if (opt.noclobber)
+        {
+          /* If opt.noclobber is turned on and file already exists, do not
+             retrieve the file */
+          logprintf (LOG_VERBOSE, _("\
  File `%s' already there; not retrieving.\n\n"), hs->local_file);
-      /* If the file is there, we suppose it's retrieved OK.  */
-      *dt |= RETROKF;
+          /* If the file is there, we suppose it's retrieved OK.  */
+          *dt |= RETROKF;
  
-      /* #### Bogusness alert.  */
-      /* If its suffix is "html" or "htm" or similar, assume text/html.  */
-      if (has_html_suffix_p (hs->local_file))
-        *dt |= TEXTHTML;
+          /* #### Bogusness alert.  */
+          /* If its suffix is "html" or "htm" or similar, assume text/html.  */
+          if (has_html_suffix_p (hs->local_file))
+            *dt |= TEXTHTML;
  
-      return RETROK;
+          return RETROK;
+        }
+      else if (!ALLOW_CLOBBER)
+        {
+          char *unique = unique_name (hs->local_file, true);
+          if (unique != hs->local_file)
+            xfree (hs->local_file);
+          hs->local_file = unique;
+        }
      }
  
    /* Support timestamping */
@@ -1773,7 +1833,7 @@ File `%s' already there; not retrieving.\n\n"), hs->local_file);
            /* Try to stat() the .orig file. */
            if (stat (filename_plus_orig_suffix, &st) == 0)
              {
-              local_dot_orig_file_exists = 1;
+              local_dot_orig_file_exists = true;
                local_filename = filename_plus_orig_suffix;
              }
          }      
@@ -1981,7 +2041,7 @@ File `%s' already there; not retrieving.\n\n"), hs->local_file);
       content-type.  */
    if (!type ||
          0 == strncasecmp (type, TEXTHTML_S, strlen (TEXTHTML_S)) ||
-        0 == strncasecmp (type, TEXTXHTML_S, strlen (TEXTXHTML_S)))
+        0 == strncasecmp (type, TEXTXHTML_S, strlen (TEXTXHTML_S)))    
      *dt |= TEXTHTML;
    else
      *dt &= ~TEXTHTML;
@@ -2104,13 +2164,6 @@ File `%s' already there; not retrieving.\n\n"), hs->local_file);
        return RETRFINISHED;
      }
  
-  /* Print fetch message, if opt.verbose.  */
-  if (opt.verbose)
-    {
-      logprintf (LOG_NOTQUIET, _("Saving to: `%s'\n"), 
-                 HYPHENP (hs->local_file) ? "STDOUT" : hs->local_file);
-    }
-    
    /* Open the local file.  */
    if (!output_stream)
      {
@@ -2147,6 +2200,13 @@ File `%s' already there; not retrieving.\n\n"), hs->local_file);
    else
      fp = output_stream;
  
+  /* Print fetch message, if opt.verbose.  */
+  if (opt.verbose)
+    {
+      logprintf (LOG_NOTQUIET, _("Saving to: `%s'\n"), 
+                 HYPHENP (hs->local_file) ? "STDOUT" : hs->local_file);
+    }
+    
    /* This confuses the timestamping code that checks for file size.
       #### The timestamping code should be smarter about file size.  */
    if (opt.save_headers && hs->restval == 0)
@@ -2195,7 +2255,8 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
             int *dt, struct url *proxy)
  {
    int count;
-  bool got_head = false;         /* used for time-stamping */
+  bool got_head = false;         /* used for time-stamping and filename detection */
+  bool got_name = false;
    char *tms;
    const char *tmrate;
    uerr_t err, ret = TRYLIMEXC;
@@ -2229,7 +2290,10 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
    hstat.referer = referer;
  
    if (opt.output_document)
-    hstat.local_file = xstrdup (opt.output_document);
+    {
+      hstat.local_file = xstrdup (opt.output_document);
+      got_name = true;
+    }
  
    /* Reset the counter. */
    count = 0;
@@ -2245,8 +2309,12 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
        sleep_between_retrievals (count);
        
        /* Get the current time string.  */
-      tms = time_str (NULL);
+      tms = time_str (time (NULL));
        
+      if (opt.spider && !got_head)
+        logprintf (LOG_VERBOSE, _("\
+Spider mode enabled. Check if remote file exists.\n"));
+
        /* Print fetch message, if opt.verbose.  */
        if (opt.verbose)
          {
@@ -2274,13 +2342,14 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
        /* Default document type is empty.  However, if spider mode is
           on or time-stamping is employed, HEAD_ONLY commands is
           encoded within *dt.  */
-      if (opt.spider || (opt.timestamping && !got_head))
+      if (((opt.spider || opt.timestamping) && !got_head) || !got_name)
          *dt |= HEAD_ONLY;
        else
          *dt &= ~HEAD_ONLY;
  
        /* Decide whether or not to restart.  */
        if (opt.always_rest
+          && got_name
            && stat (hstat.local_file, &st) == 0
            && S_ISREG (st.st_mode))
          /* When -c is used, continue from on-disk size.  (Can't use
@@ -2309,7 +2378,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
        err = gethttp (u, &hstat, dt, proxy);
  
        /* Time?  */
-      tms = time_str (NULL);
+      tms = time_str (time (NULL));
        
        /* Get the new location (with or without the redirection).  */
        if (hstat.newloc)
@@ -2365,26 +2434,43 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
            /* All possibilities should have been exhausted.  */
            abort ();
          }
-      
+     
        if (!(*dt & RETROKF))
          {
+          char *hurl = NULL;
            if (!opt.verbose)
              {
                /* #### Ugly ugly ugly! */
-              char *hurl = url_string (u, true);
+              hurl = url_string (u, true);
                logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
-              xfree (hurl);
              }
-          logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
-                     tms, hstat.statcode, escnonprint (hstat.error));
+          /* Maybe we should always keep track of broken links, not just in
+           * spider mode.  */
+          if (opt.spider)
+            {
+              /* #### Again: ugly ugly ugly! */
+              if (!hurl) 
+                hurl = url_string (u, true);
+              nonexisting_url (hurl);
+              logprintf (LOG_NOTQUIET, _("\
+Remote file does not exist -- broken link!!!\n"));
+            }
+          else
+            {
+              logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
+                         tms, hstat.statcode, escnonprint (hstat.error));
+            }
            logputs (LOG_VERBOSE, "\n");
            ret = WRONGCODE;
+          xfree_null (hurl);
            goto exit;
          }
  
        /* Did we get the time-stamp? */
        if (!got_head)
          {
+          bool restart_loop = false;
+
            if (opt.timestamping && !hstat.remote_time)
              {
                logputs (LOG_NOTQUIET, _("\
@@ -2398,53 +2484,95 @@ Last-modified header missing -- time-stamps turned off.\n"));
                  logputs (LOG_VERBOSE, _("\
  Last-modified header invalid -- time-stamp ignored.\n"));
              }
-        }
-
-      /* The time-stamping section.  */
-      if (opt.timestamping && !got_head)
-        {
-          got_head = true;    /* no more time-stamping */
-          *dt &= ~HEAD_ONLY;
-          count = 0;          /* the retrieve count for HEAD is reset */
-          
-          if (hstat.remote_time && tmr != (time_t) (-1))
+      
+          /* The time-stamping section.  */
+          if (opt.timestamping)
              {
-              /* Now time-stamping can be used validly.  Time-stamping
-                 means that if the sizes of the local and remote file
-                 match, and local file is newer than the remote file,
-                 it will not be retrieved.  Otherwise, the normal
-                 download procedure is resumed.  */
-              if (hstat.orig_file_tstamp >= tmr)
+              if (hstat.orig_file_name) /* Perform the following checks only 
+                                           if the file we're supposed to 
+                                           download already exists. */
                  {
-                  if (hstat.contlen == -1 || hstat.orig_file_size == hstat.contlen)
+                  if (hstat.remote_time && 
+                      tmr != (time_t) (-1))
                      {
-                      logprintf (LOG_VERBOSE, _("\
+                      /* Now time-stamping can be used validly.  Time-stamping
+                         means that if the sizes of the local and remote file
+                         match, and local file is newer than the remote file,
+                         it will not be retrieved.  Otherwise, the normal
+                         download procedure is resumed.  */
+                      if (hstat.orig_file_tstamp >= tmr)
+                        {
+                          if (hstat.contlen == -1 
+                              || hstat.orig_file_size == hstat.contlen)
+                            {
+                              logprintf (LOG_VERBOSE, _("\
  Server file no newer than local file `%s' -- not retrieving.\n\n"),
-                                 hstat.orig_file_name);
-                      ret = RETROK;
-                      goto exit;
+                                         hstat.orig_file_name);
+                              ret = RETROK;
+                              goto exit;
+                            }
+                          else
+                            {
+                              logprintf (LOG_VERBOSE, _("\
+The sizes do not match (local %s) -- retrieving.\n"),
+                                         number_to_static_string (local_size));
+                            }
+                        }
+                      else
+                        logputs (LOG_VERBOSE,
+                                 _("Remote file is newer, retrieving.\n"));
+
+                      logputs (LOG_VERBOSE, "\n");
                      }
-                  else
+                }
+              
+              /* free_hstat (&hstat); */
+              hstat.timestamp_checked = true;
+              restart_loop = true;
+            }
+          
+          if (opt.always_rest)
+            {
+              got_name = true;
+              restart_loop = true;
+            }
+          
+          if (opt.spider)
+            {
+              if (opt.recursive)
+                {
+                  if (*dt & TEXTHTML)
+                    {
+                      logputs (LOG_VERBOSE, _("\
+Remote file exists and could contain links to other resources -- retrieving.\n\n"));
+                      restart_loop = true;
+                    }
+                  else 
                      {
                        logprintf (LOG_VERBOSE, _("\
-The sizes do not match (local %s) -- retrieving.\n"),
-                                 number_to_static_string (local_size));
+Remote file exists but does not contain any link -- not retrieving.\n\n"));
+                      ret = RETRUNNEEDED;
+                      goto exit;
                      }
                  }
                else
-                logputs (LOG_VERBOSE,
-                         _("Remote file is newer, retrieving.\n"));
-
-              logputs (LOG_VERBOSE, "\n");
+                {
+                  logprintf (LOG_VERBOSE, _("\
+Remote file exists but recursion is disabled -- not retrieving.\n\n"));
+                  ret = RETRUNNEEDED;
+                  goto exit;
+                }
              }
-          
-          /* free_hstat (&hstat); */
-          hstat.timestamp_checked = true;
-          continue;
+
+          got_head = true;    /* no more time-stamping */
+          *dt &= ~HEAD_ONLY;
+          count = 0;          /* the retrieve count for HEAD is reset */
+
+          if (restart_loop) 
+            continue;
          }
-      
+          
        if ((tmr != (time_t) (-1))
-          && !opt.spider
            && ((hstat.len == hstat.contlen) ||
                ((hstat.res == 0) && (hstat.contlen == -1))))
          {
@@ -2463,14 +2591,6 @@ The sizes do not match (local %s) -- retrieving.\n"),
          }
        /* End of time-stamping section. */
  
-      if (opt.spider)
-        {
-          logprintf (LOG_NOTQUIET, "%d %s\n\n", hstat.statcode,
-                     escnonprint (hstat.error));
-          ret = RETROK;
-          goto exit;
-        }
-
        tmrate = retr_rate (hstat.rd_size, hstat.dltime);
        total_download_time += hstat.dltime;
  
@@ -2760,12 +2880,12 @@ digest_authentication_encode (const char *au, const char *user,
      {
        int i;
        for (i = 0; i < countof (options); i++)
-       if (name.e - name.b == strlen (options[i].name)
-           && 0 == strncmp (name.b, options[i].name, name.e - name.b))
-         {
-           *options[i].variable = strdupdelim (value.b, value.e);
-           break;
-         }
+        if (name.e - name.b == strlen (options[i].name)
+            && 0 == strncmp (name.b, options[i].name, name.e - name.b))
+          {
+            *options[i].variable = strdupdelim (value.b, value.e);
+            break;
+          }
      }
    if (!realm || !nonce || !user || !passwd || !path || !method)
      {
@@ -2929,32 +3049,38 @@ http_cleanup (void)
  
  #ifdef TESTING
  
-char *
+const char *
  test_parse_content_disposition()
  {
    int i;
    struct {
      char *hdrval;    
+    char *opt_dir_prefix;
      char *filename;
      bool result;
    } test_array[] = {
-    { "filename=\"file.ext\"", "file.ext", true },
-    { "attachment; filename=\"file.ext\"", "file.ext", true },
-    { "attachment; filename=\"file.ext\"; dummy", "file.ext", true },
-    { "attachment", NULL, false },    
+    { "filename=\"file.ext\"", NULL, "file.ext", true },
+    { "filename=\"file.ext\"", "somedir", "somedir/file.ext", true },
+    { "attachment; filename=\"file.ext\"", NULL, "file.ext", true },
+    { "attachment; filename=\"file.ext\"", "somedir", "somedir/file.ext", true },
+    { "attachment; filename=\"file.ext\"; dummy", NULL, "file.ext", true },
+    { "attachment; filename=\"file.ext\"; dummy", "somedir", "somedir/file.ext", true },
+    { "attachment", NULL, NULL, false },
+    { "attachment", "somedir", NULL, false },
    };
    
    for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i) 
      {
        char *filename;
-      bool res = parse_content_disposition (test_array[i].hdrval, &filename);
+      bool res;
+
+      opt.dir_prefix = test_array[i].opt_dir_prefix;
+      res = parse_content_disposition (test_array[i].hdrval, &filename);
  
        mu_assert ("test_parse_content_disposition: wrong result", 
                   res == test_array[i].result
                   && (res == false 
                       || 0 == strcmp (test_array[i].filename, filename)));
-
-      /* printf ("test %d: %s\n", i, res == false ? "false" : filename); */
      }
  
    return NULL;