mass change: update copyright years.

[wget] / src / retr.c
diff --git a/src/retr.c b/src/retr.c

index e3d62978f0b1ee4ea38adb1e36358794f4f2f82b..1c587a2cb4bc0dd372915df3ad6a13ac696b03de 100644 (file)
--- a/src/retr.c
+++ b/src/retr.c
@@ -1,6 +1,7 @@
  /* File retrieval.
-   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
-   2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
+   2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation,
+   Inc.
  
  This file is part of GNU Wget.
  
@@ -32,13 +33,12 @@ as that of the covered work.  */
  
  #include <stdio.h>
  #include <stdlib.h>
-#ifdef HAVE_UNISTD_H
-# include <unistd.h>
-#endif /* HAVE_UNISTD_H */
+#include <unistd.h>
  #include <errno.h>
  #include <string.h>
  #include <assert.h>
  
+#include "exits.h"
  #include "utils.h"
  #include "retr.h"
  #include "progress.h"
@@ -52,6 +52,7 @@ as that of the covered work.  */
  #include "convert.h"
  #include "ptimer.h"
  #include "html-url.h"
+#include "iri.h"
  
  /* Total size of downloaded files.  Used to enforce quota.  */
  SUM_SIZE_INT total_downloaded_bytes;
@@ -167,7 +168,18 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
       performance: fast downloads will arrive in large 16K chunks
       (which stdio would write out immediately anyway), and slow
       downloads wouldn't be limited by disk speed.  */
+
+  /* 2005-04-20 SMS.
+     Perhaps it shouldn't hinder performance, but it sure does, at least
+     on VMS (more than 2X).  Rather than speculate on what it should or
+     shouldn't do, it might make more sense to test it.  Even better, it
+     might be nice to explain what possible benefit it could offer, as
+     it appears to be a clear invitation to poor performance with no
+     actual justification.  (Also, why 16K?  Anyone test other values?)
+  */
+#ifndef __VMS
    fflush (out);
+#endif /* ndef __VMS */
    return !ferror (out);
  }
  
@@ -196,8 +208,8 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
  {
    int ret = 0;
  
-  static char dlbuf[16384];
-  int dlbufsize = sizeof (dlbuf);
+  int dlbufsize = BUFSIZ;
+  char *dlbuf = xmalloc (BUFSIZ);
  
    struct ptimer *timer = NULL;
    double last_successful_read_tm = 0;
@@ -212,11 +224,15 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
    bool progress_interactive = false;
  
    bool exact = !!(flags & rb_read_exactly);
+
+  /* Used only by HTTP/HTTPS chunked transfer encoding.  */
+  bool chunked = flags & rb_chunked_transfer_encoding;
    wgint skip = 0;
  
    /* How much data we've read/written.  */
    wgint sum_read = 0;
    wgint sum_written = 0;
+  wgint remaining_chunk_size = 0;
  
    if (flags & rb_skip_startpos)
      skip = startpos;
@@ -226,7 +242,8 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
        /* If we're skipping STARTPOS bytes, pass 0 as the INITIAL
           argument to progress_create because the indicator doesn't
           (yet) know about "skipping" data.  */
-      progress = progress_create (skip ? 0 : startpos, startpos + toread);
+      wgint start = skip ? 0 : startpos;
+      progress = progress_create (start, start + toread);
        progress_interactive = progress_interactive_p (progress);
      }
  
@@ -255,8 +272,36 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
       should be read.  */
    while (!exact || (sum_read < toread))
      {
-      int rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize;
+      int rdsize;
        double tmout = opt.read_timeout;
+
+      if (chunked)
+        {
+          if (remaining_chunk_size == 0)
+            {
+              char *line = fd_read_line (fd);
+              char *endl;
+              if (line == NULL)
+                {
+                  ret = -1;
+                  break;
+                }
+
+              remaining_chunk_size = strtol (line, &endl, 16);
+              if (remaining_chunk_size == 0)
+                {
+                  ret = 0;
+                  if (fd_read_line (fd) == NULL)
+                    ret = -1;
+                  break;
+                }
+            }
+
+          rdsize = MIN (remaining_chunk_size, dlbufsize);
+        }
+      else
+        rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize;
+
        if (progress_interactive)
          {
            /* For interactive progress gauges, always specify a ~1s
@@ -287,7 +332,7 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
        else if (ret <= 0)
          break;                  /* EOF or read error */
  
-      if (progress || opt.limit_rate)
+      if (progress || opt.limit_rate || elapsed)
          {
            ptimer_measure (timer);
            if (ret > 0)
@@ -302,6 +347,16 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
                ret = -2;
                goto out;
              }
+          if (chunked)
+            {
+              remaining_chunk_size -= ret;
+              if (remaining_chunk_size == 0)
+                if (fd_read_line (fd) == NULL)
+                  {
+                    ret = -1;
+                    break;
+                  }
+            }
          }
  
        if (opt.limit_rate)
@@ -332,6 +387,8 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
    if (qtywritten)
      *qtywritten += sum_written;
  
+  free (dlbuf);
+
    return ret;
  }
  \f
@@ -596,15 +653,17 @@ static char *getproxy (struct url *);
     multiple points. */
  
  uerr_t
-retrieve_url (const char *origurl, char **file, char **newloc,
-              const char *refurl, int *dt, bool recursive, struct iri *iri)
+retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
+              char **newloc, const char *refurl, int *dt, bool recursive,
+              struct iri *iri, bool register_status)
  {
    uerr_t result;
    char *url;
    bool location_changed;
+  bool iri_fallbacked = 0;
    int dummy;
    char *mynewloc, *proxy;
-  struct url *u, *proxy_url;
+  struct url *u = orig_parsed, *proxy_url;
    int up_error_code;            /* url parse error code */
    char *local_file;
    int redirection_count = 0;
@@ -625,25 +684,11 @@ retrieve_url (const char *origurl, char **file, char **newloc,
    if (file)
      *file = NULL;
  
- second_try:
-  u = url_parse (url, &up_error_code, iri);
-  if (!u)
-    {
-      char *error = url_error (url, up_error_code);
-      logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
-      xfree (url);
-      xfree (error);
-      return URLERROR;
-    }
-
-  DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote_n (0, url),
-           iri->uri_encoding ? quote_n (1, iri->uri_encoding) : "None",
-           iri->utf8_encode));
-
    if (!refurl)
      refurl = opt.referer;
  
   redirected:
+  /* (also for IRI fallbacking) */
  
    result = NOCONERROR;
    mynewloc = NULL;
@@ -658,7 +703,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
        pi->utf8_encode = false;
  
        /* Parse the proxy URL.  */
-      proxy_url = url_parse (proxy, &up_error_code, NULL);
+      proxy_url = url_parse (proxy, &up_error_code, NULL, true);
        if (!proxy_url)
          {
            char *error = url_error (proxy, up_error_code);
@@ -667,7 +712,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
            xfree (url);
            xfree (error);
            RESTORE_POST_DATA;
-          return PROXERR;
+          result = PROXERR;
+          goto bail;
          }
        if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
          {
@@ -675,7 +721,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
            url_free (proxy_url);
            xfree (url);
            RESTORE_POST_DATA;
-          return PROXERR;
+          result = PROXERR;
+          goto bail;
          }
      }
  
@@ -685,7 +732,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
  #endif
        || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
      {
-      result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
+      result = http_loop (u, orig_parsed, &mynewloc, &local_file, refurl, dt,
+                          proxy_url, iri);
      }
    else if (u->scheme == SCHEME_FTP)
      {
@@ -696,7 +744,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
        if (redirection_count)
          oldrec = glob = false;
  
-      result = ftp_loop (u, dt, proxy_url, recursive, glob);
+      result = ftp_loop (u, &local_file, dt, proxy_url, recursive, glob);
        recursive = oldrec;
  
        /* There is a possibility of having HTTP being redirected to
@@ -739,20 +787,24 @@ retrieve_url (const char *origurl, char **file, char **newloc,
           the content encoding. */
        iri->utf8_encode = opt.enable_iri;
        set_content_encoding (iri, NULL);
+      xfree_null (iri->orig_url);
  
        /* Now, see if this new location makes sense. */
-      newloc_parsed = url_parse (mynewloc, &up_error_code, iri);
+      newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
        if (!newloc_parsed)
          {
            char *error = url_error (mynewloc, up_error_code);
            logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
                       error);
-          url_free (u);
+          if (orig_parsed != u)
+            {
+              url_free (u);
+            }
            xfree (url);
            xfree (mynewloc);
            xfree (error);
            RESTORE_POST_DATA;
-          return result;
+          goto bail;
          }
  
        /* Now mynewloc will become newloc_parsed->url, because if the
@@ -767,16 +819,23 @@ retrieve_url (const char *origurl, char **file, char **newloc,
            logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
                       opt.max_redirect);
            url_free (newloc_parsed);
-          url_free (u);
+          if (orig_parsed != u)
+            {
+              url_free (u);
+            }
            xfree (url);
            xfree (mynewloc);
            RESTORE_POST_DATA;
-          return WRONGCODE;
+          result = WRONGCODE;
+          goto bail;
          }
  
        xfree (url);
        url = mynewloc;
-      url_free (u);
+      if (orig_parsed != u)
+        {
+          url_free (u);
+        }
        u = newloc_parsed;
  
        /* If we're being redirected from POST, we don't want to POST
@@ -794,27 +853,34 @@ retrieve_url (const char *origurl, char **file, char **newloc,
    if (!(*dt & RETROKF) && iri->utf8_encode)
      {
        iri->utf8_encode = false;
-      DEBUGP (("[IRI Fallbacking to non-utf8 for %s\n", quote (url)));
-      goto second_try;
+      if (orig_parsed != u)
+        {
+          url_free (u);
+        }
+      u = url_parse (origurl, NULL, iri, true);
+      if (u)
+        {
+          DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
+          url = xstrdup (u->url);
+          iri_fallbacked = 1;
+          goto redirected;
+        }
+      else
+          DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url)));
      }
  
-  if (local_file && *dt & RETROKF)
+  if (local_file && u && *dt & RETROKF)
      {
        register_download (u->url, local_file);
+
        if (redirection_count && 0 != strcmp (origurl, u->url))
          register_redirection (origurl, u->url);
+
        if (*dt & TEXTHTML)
          register_html (u->url, local_file);
-      if (*dt & RETROKF)
-        {
-          register_download (u->url, local_file);
-          if (redirection_count && 0 != strcmp (origurl, u->url))
-            register_redirection (origurl, u->url);
-          if (*dt & TEXTHTML)
-            register_html (u->url, local_file);
-          if (*dt & TEXTCSS)
-            register_css (u->url, local_file);
-        }
+
+      if (*dt & TEXTCSS)
+        register_css (u->url, local_file);
      }
  
    if (file)
@@ -822,9 +888,12 @@ retrieve_url (const char *origurl, char **file, char **newloc,
    else
      xfree_null (local_file);
  
-  url_free (u);
+  if (orig_parsed != u)
+    {
+      url_free (u);
+    }
  
-  if (redirection_count)
+  if (redirection_count || iri_fallbacked)
      {
        if (newloc)
          *newloc = url;
@@ -840,6 +909,9 @@ retrieve_url (const char *origurl, char **file, char **newloc,
  
    RESTORE_POST_DATA;
  
+bail:
+  if (register_status)
+    inform_exit_status (result);
    return result;
  }
  
@@ -856,7 +928,7 @@ retrieve_from_file (const char *file, bool html, int *count)
    struct urlpos *url_list, *cur_url;
    struct iri *iri = iri_new();
  
-  char *input_file = NULL;
+  char *input_file, *url_file = NULL;
    const char *url = file;
  
    status = RETROK;             /* Suppose everything is OK.  */
@@ -866,24 +938,44 @@ retrieve_from_file (const char *file, bool html, int *count)
    set_uri_encoding (iri, opt.locale, true);
    set_content_encoding (iri, opt.locale);
  
-  if (url_has_scheme (url))
+  if (url_valid_scheme (url))
      {
-      int dt;
+      int dt,url_err;
        uerr_t status;
+      struct url * url_parsed = url_parse(url, &url_err, iri, true);
+
+      if (!url_parsed)
+        {
+          char *error = url_error (url, url_err);
+          logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
+          xfree (error);
+          return URLERROR;
+        }
  
        if (!opt.base_href)
          opt.base_href = xstrdup (url);
  
-      status = retrieve_url (url, &input_file, NULL, NULL, &dt, false, iri);
-      if (status != RETROK)
+      status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
+                             false, iri, true);
+      url_free (url_parsed);
+
+      if (!url_file || (status != RETROK))
          return status;
  
        if (dt & TEXTHTML)
          html = true;
  
-      /* If we have a found a content encoding, use it */
-      if (iri->content_encoding)
+      /* If we have found a content encoding, use it.
+       * (!= on pointers is okay: we're checking for the identical object) */
+      if (iri->content_encoding != opt.locale)
           set_uri_encoding (iri, iri->content_encoding, false);
+
+      /* Reset UTF-8 encode status */
+      iri->utf8_encode = opt.enable_iri;
+      xfree_null (iri->orig_url);
+      iri->orig_url = NULL;
+
+      input_file = url_file;
      }
    else
      input_file = (char *) file;
@@ -891,10 +983,14 @@ retrieve_from_file (const char *file, bool html, int *count)
    url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
                : get_urls_file (input_file));
  
+  xfree_null (url_file);
+
    for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
      {
        char *filename = NULL, *new_file = NULL;
        int dt;
+      struct iri *tmpiri = iri_dup (iri);
+      struct url *parsed_url = NULL;
  
        if (cur_url->ignore_when_downloading)
          continue;
@@ -905,8 +1001,9 @@ retrieve_from_file (const char *file, bool html, int *count)
            break;
          }
  
-      /* Reset UTF-8 encode status */
-      iri->utf8_encode = opt.enable_iri;
+      /* Need to reparse the url, since it didn't have iri information. */
+      if (opt.enable_iri)
+          parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true);
  
        if ((opt.recursive || opt.page_requisites)
            && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
@@ -917,13 +1014,19 @@ retrieve_from_file (const char *file, bool html, int *count)
            if (cur_url->url->scheme == SCHEME_FTP)
              opt.follow_ftp = 1;
  
-          status = retrieve_tree (cur_url->url->url, iri);
+          status = retrieve_tree (parsed_url ? parsed_url : cur_url->url,
+                                  tmpiri);
  
            opt.follow_ftp = old_follow_ftp;
          }
        else
-        status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL,
-                              &dt, opt.recursive, iri);
+        status = retrieve_url (parsed_url ? parsed_url : cur_url->url,
+                               cur_url->url->url, &filename,
+                               &new_file, NULL, &dt, opt.recursive, tmpiri,
+                               true);
+
+      if (parsed_url)
+          url_free (parsed_url);
  
        if (filename && opt.delete_after && file_exists_p (filename))
          {
@@ -937,6 +1040,7 @@ Removing file due to --delete-after in retrieve_from_file():\n"));
  
        xfree_null (new_file);
        xfree_null (filename);
+      iri_free (tmpiri);
      }
  
    /* Free the linked list of URL-s.  */
@@ -1093,18 +1197,12 @@ getproxy (struct url *u)
  /* Returns true if URL would be downloaded through a proxy. */
  
  bool
-url_uses_proxy (const char *url)
+url_uses_proxy (struct url * u)
  {
    bool ret;
-  struct url *u;
-  struct iri *i = iri_new();
-  /* url was given in the command line, so use locale as encoding */
-  set_uri_encoding (i, opt.locale, true);
-  u= url_parse (url, NULL, i);
+  /* U arrives already parsed and is not freed here.  A NULL U is
+     reported as not using a proxy.  */
    if (!u)
      return false;
+  /* getproxy returning non-NULL for U means a proxy would be used.  */
    ret = getproxy (u) != NULL;
-  url_free (u);
    return ret;
  }
  
@@ -1117,3 +1215,33 @@ no_proxy_match (const char *host, const char **no_proxy)
    else
      return sufmatch (no_proxy, host);
  }
+
+/* Set *FILE to point to the local file string.
+
+   When opt.output_document is set, *FILE becomes opt.output_document,
+   but only if output_stream_regular is true; otherwise *FILE is left
+   untouched.  When opt.output_document is not set, *FILE becomes
+   DEFAULT_FILE.  No string is copied: *FILE simply aliases whichever
+   pointer was chosen.  */
+void
+set_local_file (const char **file, const char *default_file)
+{
+  if (opt.output_document)
+    {
+      /* With output_stream_regular false, *FILE keeps its old value.  */
+      if (output_stream_regular)
+        *file = opt.output_document;
+    }
+  else
+    *file = default_file;
+}
+
+/* Return true for an input file's own URL, false otherwise.
+
+   True is returned at most once: only on the first call where
+   INPUT_FILE is non-NULL and url_has_scheme (INPUT_FILE) holds.  That
+   call clears the static FIRST flag, so every later call returns
+   false regardless of its argument.  */
+bool
+input_file_url (const char *input_file)
+{
+  /* Starts out true; cleared by the first match and never set again.  */
+  static bool first = true;
+
+  if (input_file
+      && url_has_scheme (input_file)
+      && first)
+    {
+      first = false;
+      return true;
+    }
+  else
+    return false;
+}