X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fretr.c;h=b667ca2ff3cf6ecb4edca9c1148e856f68b0ecbe;hp=05ffe1d03b460c4cc761177d2f56b801ea7e0277;hb=d763f8bf6d6e13ce006ffab616cc8a77e747a633;hpb=9a2ea3938d09643c6528c3b83b1db4c30f47d981

diff --git a/src/retr.c b/src/retr.c
index 05ffe1d0..b667ca2f 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -51,6 +51,7 @@ as that of the covered work.  */
 #include "hash.h"
 #include "convert.h"
 #include "ptimer.h"
+#include "html-url.h"
 #include "iri.h"
 
 /* Total size of downloaded files.  Used to enforce quota.  */
@@ -167,7 +168,18 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
      performance: fast downloads will arrive in large 16K chunks
      (which stdio would write out immediately anyway), and slow
      downloads wouldn't be limited by disk speed.  */
+
+  /* 2005-04-20 SMS.
+     Perhaps it shouldn't hinder performance, but it sure does, at least
+     on VMS (more than 2X).  Rather than speculate on what it should or
+     shouldn't do, it might make more sense to test it.  Even better, it
+     might be nice to explain what possible benefit it could offer, as
+     it appears to be a clear invitation to poor performance with no
+     actual justification.  (Also, why 16K?  Anyone test other values?)
+  */
+#ifndef __VMS
   fflush (out);
+#endif /* ndef __VMS */
   return !ferror (out);
 }
 
@@ -226,7 +238,8 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
       /* If we're skipping STARTPOS bytes, pass 0 as the INITIAL
          argument to progress_create because the indicator doesn't
          (yet) know about "skipping" data.  */
-      progress = progress_create (skip ? 0 : startpos, startpos + toread);
+      wgint start = skip ? 0 : startpos;
+      progress = progress_create (start, start + toread);
       progress_interactive = progress_interactive_p (progress);
     }
 
@@ -393,7 +406,7 @@ fd_read_hunk (int fd, hunk_terminator_t terminator, long sizehint, long maxsize)
   char *hunk = xmalloc (bufsize);
   int tail = 0;                 /* tail position in HUNK */
 
-  assert (maxsize >= bufsize);
+  assert (!maxsize || maxsize >= bufsize);
 
   while (1)
     {
@@ -596,15 +609,17 @@ static char *getproxy (struct url *);
    multiple points. */
 
 uerr_t
-retrieve_url (const char *origurl, char **file, char **newloc,
-              const char *refurl, int *dt, bool recursive)
+retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
+              char **newloc, const char *refurl, int *dt, bool recursive,
+              struct iri *iri)
 {
   uerr_t result;
   char *url;
   bool location_changed;
+  bool iri_fallbacked = 0;
   int dummy;
   char *mynewloc, *proxy;
-  struct url *u, *proxy_url;
+  struct url *u = orig_parsed, *proxy_url;
   int up_error_code;            /* url parse error code */
   char *local_file;
   int redirection_count = 0;
@@ -613,8 +628,6 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   char *saved_post_data = NULL;
   char *saved_post_file_name = NULL;
 
-  bool utf8_encoded = opt.enable_iri;
-
   /* If dt is NULL, use local storage.  */
   if (!dt)
     {
@@ -627,21 +640,11 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   if (file)
     *file = NULL;
 
- second_try:
-  u = url_parse (url, &up_error_code, &utf8_encoded);
-  if (!u)
-    {
-      logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
-      xfree (url);
-      return URLERROR;
-    }
-
-  /*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/
-
   if (!refurl)
     refurl = opt.referer;
 
  redirected:
+  /* (also for IRI fallbacking) */
 
   result = NOCONERROR;
   mynewloc = NULL;
@@ -651,15 +654,19 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   proxy = getproxy (u);
   if (proxy)
     {
-      /* sXXXav : support IRI for proxy */
-      bool proxy_utf8_encode = false;
+      struct iri *pi = iri_new ();
+      set_uri_encoding (pi, opt.locale, true);
+      pi->utf8_encode = false;
+
       /* Parse the proxy URL.  */
-      proxy_url = url_parse (proxy, &up_error_code, &proxy_utf8_encode);
+      proxy_url = url_parse (proxy, &up_error_code, NULL, true);
       if (!proxy_url)
         {
+          char *error = url_error (proxy, up_error_code);
           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
-                     proxy, url_error (up_error_code));
+                     proxy, error);
           xfree (url);
+          xfree (error);
           RESTORE_POST_DATA;
           return PROXERR;
         }
@@ -679,7 +686,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
 #endif
       || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
     {
-      result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
+      result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
     }
   else if (u->scheme == SCHEME_FTP)
     {
@@ -729,17 +736,26 @@ retrieve_url (const char *origurl, char **file, char **newloc,
       xfree (mynewloc);
       mynewloc = construced_newloc;
 
-      utf8_encoded = opt.enable_iri;
+      /* Reset UTF-8 encoding state, keep the URI encoding and reset
+         the content encoding. */
+      iri->utf8_encode = opt.enable_iri;
+      set_content_encoding (iri, NULL);
+      xfree_null (iri->orig_url);
 
       /* Now, see if this new location makes sense. */
-      newloc_parsed = url_parse (mynewloc, &up_error_code, &utf8_encoded);
+      newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
       if (!newloc_parsed)
         {
+          char *error = url_error (mynewloc, up_error_code);
           logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
-                     url_error (up_error_code));
-          url_free (u);
+                     error);
+          if (orig_parsed != u)
+            {
+              url_free (u);
+            }
           xfree (url);
           xfree (mynewloc);
+          xfree (error);
           RESTORE_POST_DATA;
           return result;
         }
@@ -756,7 +772,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
                      opt.max_redirect);
           url_free (newloc_parsed);
-          url_free (u);
+          if (orig_parsed != u)
+            {
+              url_free (u);
+            }
           xfree (url);
           xfree (mynewloc);
           RESTORE_POST_DATA;
@@ -765,7 +784,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
 
       xfree (url);
       url = mynewloc;
-      url_free (u);
+      if (orig_parsed != u)
+        {
+          url_free (u);
+        }
       u = newloc_parsed;
 
       /* If we're being redirected from POST, we don't want to POST
@@ -780,11 +802,23 @@ retrieve_url (const char *origurl, char **file, char **newloc,
     }
 
   /* Try to not encode in UTF-8 if fetching failed */
-  if (result != RETROK && utf8_encoded)
+  if (!(*dt & RETROKF) && iri->utf8_encode)
     {
-      utf8_encoded = false;
-      /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/
-      goto second_try;
+      iri->utf8_encode = false;
+      if (orig_parsed != u)
+        {
+          url_free (u);
+        }
+      u = url_parse (origurl, NULL, iri, true);
+      if (u)
+        {
+          DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
+          url = xstrdup (u->url);
+          iri_fallbacked = 1;
+          goto redirected;
+        }
+      else
+          DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url)));
     }
 
   if (local_file && *dt & RETROKF)
@@ -794,6 +828,16 @@ retrieve_url (const char *origurl, char **file, char **newloc,
         register_redirection (origurl, u->url);
       if (*dt & TEXTHTML)
         register_html (u->url, local_file);
+      if (*dt & RETROKF)
+        {
+          register_download (u->url, local_file);
+          if (redirection_count && 0 != strcmp (origurl, u->url))
+            register_redirection (origurl, u->url);
+          if (*dt & TEXTHTML)
+            register_html (u->url, local_file);
+          if (*dt & TEXTCSS)
+            register_css (u->url, local_file);
+        }
     }
 
   if (file)
@@ -801,9 +845,12 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   else
     xfree_null (local_file);
 
-  url_free (u);
+  if (orig_parsed != u)
+    {
+      url_free (u);
+    }
 
-  if (redirection_count)
+  if (redirection_count || iri_fallbacked)
     {
       if (newloc)
         *newloc = url;
@@ -833,16 +880,65 @@ retrieve_from_file (const char *file, bool html, int *count)
 {
   uerr_t status;
   struct urlpos *url_list, *cur_url;
+  struct iri *iri = iri_new();
+
+  char *input_file = NULL;
+  const char *url = file;
 
-  url_list = (html ? get_urls_html (file, NULL, NULL)
-              : get_urls_file (file));
   status = RETROK;             /* Suppose everything is OK.  */
   *count = 0;                  /* Reset the URL count.  */
 
+  /* sXXXav : Assume filename and links in the file are in the locale */
+  set_uri_encoding (iri, opt.locale, true);
+  set_content_encoding (iri, opt.locale);
+
+  if (url_has_scheme (url))
+    {
+      int dt,url_err;
+      uerr_t status;
+      struct url * url_parsed = url_parse(url, &url_err, iri, true);
+
+      if (!url_parsed)
+        {
+          char *error = url_error (url, url_err);
+          logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
+          xfree (error);
+          return URLERROR;
+        }
+
+      if (!opt.base_href)
+        opt.base_href = xstrdup (url);
+
+      status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt,
+                             false, iri);
+      if (status != RETROK)
+        return status;
+
+      if (dt & TEXTHTML)
+        html = true;
+
+      /* If we have a found a content encoding, use it.
+       * ( == is okay, because we're checking for identical object) */
+      if (iri->content_encoding != opt.locale)
+	  set_uri_encoding (iri, iri->content_encoding, false);
+
+      /* Reset UTF-8 encode status */
+      iri->utf8_encode = opt.enable_iri;
+      xfree_null (iri->orig_url);
+      iri->orig_url = NULL;
+    }
+  else
+    input_file = (char *) file;
+
+  url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
+              : get_urls_file (input_file));
+
   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
     {
       char *filename = NULL, *new_file = NULL;
       int dt;
+      struct iri *tmpiri = iri_dup (iri);
+      struct url *parsed_url = NULL;
 
       if (cur_url->ignore_when_downloading)
         continue;
@@ -852,6 +948,11 @@ retrieve_from_file (const char *file, bool html, int *count)
           status = QUOTEXC;
           break;
         }
+
+      /* Need to reparse the url, since it didn't have iri information. */
+      if (opt.enable_iri)
+          parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true);
+
       if ((opt.recursive || opt.page_requisites)
           && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
         {
@@ -861,12 +962,18 @@ retrieve_from_file (const char *file, bool html, int *count)
           if (cur_url->url->scheme == SCHEME_FTP)
             opt.follow_ftp = 1;
 
-          status = retrieve_tree (cur_url->url->url);
+          status = retrieve_tree (parsed_url ? parsed_url : cur_url->url,
+                                  tmpiri);
 
           opt.follow_ftp = old_follow_ftp;
         }
       else
-        status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
+        status = retrieve_url (parsed_url ? parsed_url : cur_url->url,
+                               cur_url->url->url, &filename,
+                               &new_file, NULL, &dt, opt.recursive, tmpiri);
+
+      if (parsed_url)
+          url_free (parsed_url);
 
       if (filename && opt.delete_after && file_exists_p (filename))
         {
@@ -880,11 +987,14 @@ Removing file due to --delete-after in retrieve_from_file():\n"));
 
       xfree_null (new_file);
       xfree_null (filename);
+      iri_free (tmpiri);
     }
 
   /* Free the linked list of URL-s.  */
   free_urlpos (url_list);
 
+  iri_free (iri);
+
   return status;
 }
 
@@ -1034,14 +1144,12 @@ getproxy (struct url *u)
 /* Returns true if URL would be downloaded through a proxy. */
 
 bool
-url_uses_proxy (const char *url)
+url_uses_proxy (struct url * u)
 {
-  bool ret, utf8_encode = false;
-  struct url *u = url_parse (url, NULL, &utf8_encode);
+  bool ret;
   if (!u)
     return false;
   ret = getproxy (u) != NULL;
-  url_free (u);
   return ret;
 }
 
@@ -1054,3 +1162,16 @@ no_proxy_match (const char *host, const char **no_proxy)
   else
     return sufmatch (no_proxy, host);
 }
+
+/* Set the file parameter to point to the local file string.  */
+void
+set_local_file (const char **file, const char *default_file)
+{
+  if (opt.output_document)
+    {
+      if (output_stream_regular)
+        *file = opt.output_document;
+    }
+  else
+    *file = default_file;
+}