From: Micah Cowan
Date: Thu, 25 Jun 2009 08:14:11 +0000 (-0700)
Subject: Merge with mainline.
X-Git-Tag: v1.13~338
X-Git-Url: http://sjero.net/git/?p=wget;a=commitdiff_plain;h=4f3dd6817348433eafde04a3c2946f43364de7ef

Merge with mainline.
---

4f3dd6817348433eafde04a3c2946f43364de7ef
diff --cc src/http.c
index 50f0c643,9ed226cb..ae89c46d
--- a/src/http.c
+++ b/src/http.c
@@@ -2359,9 -2355,8 +2371,9 @@@ http_loop (struct url *u, char **newloc
   uerr_t err, ret = TRYLIMEXC;
   time_t tmr = -1;               /* remote time-stamp */
   struct http_stat hstat;        /* HTTP status */
-  struct_stat st;
+  struct_stat st;
   bool send_head_first = true;
+  char *file_name;
 
   /* Assert that no value for *LOCAL_FILE was passed. */
   assert (local_file == NULL || *local_file == NULL);
@@@ -2434,13 -2429,11 +2446,13 @@@ File %s already there; not retrieving.\
   /* Send preliminary HEAD request if -N is given and we have an existing
    * destination file. */
+  file_name = url_file_name (u);
-  if (opt.timestamping
+  if (opt.timestamping && !opt.content_disposition
-      && file_exists_p (url_file_name (u)))
+      && file_exists_p (file_name))
     send_head_first = true;
- 
+  xfree (file_name);
+ 
   /* THE loop */
   do
     {
diff --cc src/main.c
index b8039d6b,a2d40888..69df08a7
--- a/src/main.c
+++ b/src/main.c
@@@ -1178,45 -1202,40 +1202,51 @@@ WARNING: Can't reopen standard output i
   for (t = url; *t; t++)
     {
       char *filename = NULL, *redirected_URL = NULL;
-      int dt;
+      int dt, url_err;
-      struct url *url_parsed = url_parse (*t, &url_err);
++     struct url *url_parsed = url_parse (*t, &url_err, NULL, false);
 
-      if ((opt.recursive || opt.page_requisites)
-          && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
+      if (!url_parsed)
         {
-          int old_follow_ftp = opt.follow_ftp;
- 
-          /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
-          if (url_scheme (*t) == SCHEME_FTP)
-            opt.follow_ftp = 1;
- 
-          status = retrieve_tree (*t, NULL);
- 
-          opt.follow_ftp = old_follow_ftp;
+          char *error = url_error (*t, url_err);
+          logprintf (LOG_NOTQUIET, "%s: %s.\n",*t, error);
+          xfree (error);
+          status = URLERROR;
         }
       else
         {
-          struct iri *i = iri_new ();
-          set_uri_encoding (i, opt.locale, true);
-          status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt,
-                                 opt.recursive, i);
-          iri_free (i);
-        }
+          if ((opt.recursive || opt.page_requisites)
+              && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (url_parsed)))
+            {
+              int old_follow_ftp = opt.follow_ftp;
 
-      if (opt.delete_after && file_exists_p(filename))
-        {
-          DEBUGP (("Removing file due to --delete-after in main():\n"));
-          logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
-          if (unlink (filename))
-            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
-        }
+              /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
+              if (url_scheme (*t) == SCHEME_FTP)
+                opt.follow_ftp = 1;
+
-              status = retrieve_tree (url_parsed);
++             status = retrieve_tree (url_parsed, NULL);
 
-      xfree_null (redirected_URL);
-      xfree_null (filename);
+              opt.follow_ftp = old_follow_ftp;
+            }
+          else
-            status = retrieve_url (url_parsed, *t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
++           {
++             struct iri *i = iri_new ();
++             set_uri_encoding (i, opt.locale, true);
++             status = retrieve_url (url_parsed, *t, &filename, &redirected_URL,
++                                    NULL, &dt, opt.recursive, i);
++             iri_free (i);
++           }
+
+          if (opt.delete_after && file_exists_p(filename))
+            {
+              DEBUGP (("Removing file due to --delete-after in main():\n"));
+              logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
+              if (unlink (filename))
+                logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
+            }
+
+          xfree_null (redirected_URL);
+          xfree_null (filename);
+          url_free (url_parsed);
+        }
     }
 
   /* And then from the input file, if any. */
diff --cc src/recur.c
index 2e067505,95581486..83a9b4ee
--- a/src/recur.c
+++ b/src/recur.c
@@@ -153,9 -160,9 +160,9 @@@ url_dequeue (struct url_queue *queue, s
 }
 
 static bool download_child_p (const struct urlpos *, struct url *, int,
-                               struct url *, struct hash_table *);
+                               struct url *, struct hash_table *, struct iri *);
 -static bool descend_redirect_p (const char *, const char *, int,
 +static bool descend_redirect_p (const char *, struct url *, int,
-                                  struct url *, struct hash_table *);
+                                  struct url *, struct hash_table *, struct iri *);
 
 
 /* Retrieve a part of the web beginning with START_URL.  This used to
@@@ -180,7 -187,7 +187,7 @@@
    options, add it to the queue. */
 uerr_t
- retrieve_tree (struct url *start_url_parsed)
 -retrieve_tree (const char *start_url, struct iri *pi)
++retrieve_tree (struct url *start_url_parsed, struct iri *pi)
 {
   uerr_t status = RETROK;
 
@@@ -191,6 -198,31 +198,21 @@@
      the queue, but haven't been downloaded yet. */
   struct hash_table *blacklist;
 
+  int up_error_code;
-  struct url *start_url_parsed;
+  struct iri *i = iri_new ();
+
+ #define COPYSTR(x)  (x) ? xstrdup(x) : NULL;
+  /* Duplicate pi struct if not NULL */
+  if (pi)
+    {
+      i->uri_encoding = COPYSTR (pi->uri_encoding);
+      i->content_encoding = COPYSTR (pi->content_encoding);
+      i->utf8_encode = pi->utf8_encode;
+    }
+  else
+    set_uri_encoding (i, opt.locale, true);
+ #undef COPYSTR
+
-  start_url_parsed = url_parse (start_url, &up_error_code, i, true);
-  if (!start_url_parsed)
-    {
-      char *error = url_error (start_url, up_error_code);
-      logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, error);
-      xfree (error);
-      return URLERROR;
-    }
- 
   queue = url_queue_new ();
   blacklist = make_string_hash_table (0);
@@@ -253,22 -286,11 +276,12 @@@
         }
       else
         {
-          int dt = 0;
+          int dt = 0, url_err;
           char *redirected = NULL;
-          struct url *url_parsed = url_parse (url, &url_err);
++         struct url *url_parsed = url_parse (url, &url_err, i, false);
 
-          if (!url_parsed)
-            {
-              char *error = url_error (url, url_err);
-              logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
-              xfree (error);
-              status = URLERROR;
-            }
-          else
-            {
-              status = retrieve_url (url_parsed, url, &file, &redirected,
-                                     referer, &dt, false);
-            }
-          status = retrieve_url (url, &file, &redirected, referer, &dt,
-                                 false, i);
++         status = retrieve_url (url_parsed, url, &file, &redirected, referer,
++                                &dt, false, i);
 
           if (html_allowed && file && status == RETROK
               && (dt & RETROKF) && (dt & TEXTHTML))
@@@ -295,8 -317,8 +308,8 @@@
              want to follow it. */
           if (descend)
             {
-              if (!descend_redirect_p (redirected, url, depth,
+              if (!descend_redirect_p (redirected, url_parsed, depth,
-                                       start_url_parsed, blacklist))
+                                       start_url_parsed, blacklist, i))
                 descend = false;
               else
                 /* Make sure that the old pre-redirect form gets
@@@ -656,24 -686,27 +676,25 @@@ download_child_p (const struct urlpos *
    it is merely a simple-minded wrapper around download_child_p.
    */
 static bool
 -descend_redirect_p (const char *redirected, const char *original, int depth,
 +descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
-                     struct url *start_url_parsed, struct hash_table *blacklist)
+                     struct url *start_url_parsed, struct hash_table *blacklist,
+                     struct iri *iri)
 {
-  struct url *orig_parsed, *new_parsed;
+  struct url *new_parsed;
   struct urlpos *upos;
   bool success;
 
-  orig_parsed = url_parse (original, NULL, NULL, false);
   assert (orig_parsed != NULL);
 
-  new_parsed = url_parse (redirected, NULL);
+  new_parsed = url_parse (redirected, NULL, NULL, false);
   assert (new_parsed != NULL);
 
   upos = xnew0 (struct urlpos);
   upos->url = new_parsed;
 
   success = download_child_p (upos, orig_parsed, depth,
-                              start_url_parsed, blacklist);
+                              start_url_parsed, blacklist, iri);
 
-  url_free (orig_parsed);
   url_free (new_parsed);
   xfree (upos);
diff --cc src/recur.h
index 7eeb5642,515a382b..76c0ef5f
--- a/src/recur.h
+++ b/src/recur.h
@@@ -44,6 -42,6 +44,6 @@@ as that of the covered work.  *
 struct urlpos;
 
 void recursive_cleanup (void);
- uerr_t retrieve_tree (struct url *);
 -uerr_t retrieve_tree (const char *, struct iri *);
++uerr_t retrieve_tree (struct url *, struct iri *);
 
 #endif /* RECUR_H */
diff --cc src/res.c
index 20ffe1c8,0320d034..4b0ff82b
--- a/src/res.c
+++ b/src/res.c
@@@ -537,32 -538,22 +538,38 @@@ res_retrieve_file (const char *url, cha
   uerr_t err;
   char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
   int saved_ts_val = opt.timestamping;
-  int saved_sp_val = opt.spider;
+  int saved_sp_val = opt.spider, url_err;
+  struct url * url_parsed;
 
+  /* Copy server URI encoding for a possible IDNA transformation, no need to
+     encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
+  set_uri_encoding (i, iri->uri_encoding, false);
+  i->utf8_encode = false;
+
   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
   *file = NULL;
   opt.timestamping = false;
   opt.spider = false;
-  err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i);
+
-  url_parsed = url_parse (robots_url, &url_err);
++ url_parsed = url_parse (robots_url, &url_err, iri, true);
+  if (!url_parsed)
+    {
+      char *error = url_error (robots_url, url_err);
+      logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
+      xfree (error);
+      err = URLERROR;
+    }
+  else
+    {
+      err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
-                          false);
++                         false, i);
+      url_free(url_parsed);
+    }
+
   opt.timestamping = saved_ts_val;
-  opt.spider = saved_sp_val;
+  opt.spider = saved_sp_val;
   xfree (robots_url);
+  iri_free (i);
 
   if (err != RETROK && *file != NULL)
     {
diff --cc src/retr.c
index ffa84c38,1d9d7478..0fd936d0
--- a/src/retr.c
+++ b/src/retr.c
@@@ -597,8 -596,8 +597,9 @@@ static char *getproxy (struct url *)
    multiple points.
    */
 uerr_t
 -retrieve_url (const char *origurl, char **file, char **newloc,
 -              const char *refurl, int *dt, bool recursive, struct iri *iri)
 +retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
-               char **newloc, const char *refurl, int *dt, bool recursive)
++              char **newloc, const char *refurl, int *dt, bool recursive,
++              struct iri *iri)
 {
   uerr_t result;
   char *url;
@@@ -626,6 -625,21 +627,11 @@@
   if (file)
     *file = NULL;
 
+ second_try:
-  u = url_parse (url, &up_error_code, iri, true);
-  if (!u)
-    {
-      char *error = url_error (url, up_error_code);
-      logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
-      xfree (url);
-      xfree (error);
-      return URLERROR;
-    }
- 
+  DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote_n (0, url),
+           iri->uri_encoding ? quote_n (1, iri->uri_encoding) : "None",
+           iri->utf8_encode));
+
   if (!refurl)
     refurl = opt.referer;
@@@ -836,25 -862,20 +866,30 @@@ retrieve_from_file (const char *file, b
   status = RETROK;             /* Suppose everything is OK.  */
   *count = 0;                  /* Reset the URL count.  */
- 
+
+  /* sXXXav : Assume filename and links in the file are in the locale */
+  set_uri_encoding (iri, opt.locale, true);
+  set_content_encoding (iri, opt.locale);
+
   if (url_has_scheme (url))
     {
-      int dt;
+      int dt,url_err;
       uerr_t status;
-      struct url * url_parsed = url_parse(url, &url_err);
++     struct url * url_parsed = url_parse(url, &url_err, NULL, true);
+
+      if (!url_parsed)
+        {
+          char *error = url_error (url, url_err);
+          logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
+          xfree (error);
+          return URLERROR;
+        }
 
       if (!opt.base_href)
         opt.base_href = xstrdup (url);
 
-      status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt, false);
-      status = retrieve_url (url, &input_file, NULL, NULL, &dt, false, iri);
++     status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt,
++                            false, iri);
       if (status != RETROK)
         return status;
@@@ -886,18 -917,16 +931,16 @@@
           int old_follow_ftp = opt.follow_ftp;
 
           /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
-          if (cur_url->url->scheme == SCHEME_FTP)
+          if (cur_url->url->scheme == SCHEME_FTP)
             opt.follow_ftp = 1;
- 
-          status = retrieve_tree (cur_url->url);
+
-          status = retrieve_tree (cur_url->url->url, iri);
++         status = retrieve_tree (cur_url->url, iri);
 
           opt.follow_ftp = old_follow_ftp;
         }
       else
-        {
-          status = retrieve_url (cur_url->url, cur_url->url->url, &filename,
-                                 &new_file, NULL, &dt, opt.recursive);
-        }
-        status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL,
-                               &dt, opt.recursive, iri);
++       status = retrieve_url (cur_url->url, cur_url->url->url, &filename,
++                              &new_file, NULL, &dt, opt.recursive, iri);
 
       if (filename && opt.delete_after && file_exists_p (filename))
         {
diff --cc src/retr.h
index 72be93b7,bb2e66d3..8854b684
--- a/src/retr.h
+++ b/src/retr.h
@@@ -53,7 -51,8 +53,8 @@@ typedef const char *(*hunk_terminator_t
 char *fd_read_hunk (int, hunk_terminator_t, long, long);
 char *fd_read_line (int);
 
- uerr_t retrieve_url (struct url *, const char *, char **, char **, const char *, int *, bool);
 -uerr_t retrieve_url (const char *, char **, char **, const char *, int *,
 -                     bool, struct iri *);
++uerr_t retrieve_url (struct url *, const char *, char **, char **,
++                     const char *, int *, bool, struct iri *);
 uerr_t retrieve_from_file (const char *, bool, int *);
 
 const char *retr_rate (wgint, double);
diff --cc src/url.c
index d416fcf7,86d099a7..4c22a9fc
--- a/src/url.c
+++ b/src/url.c
@@@ -668,7 -668,7 +668,8 @@@ url_parse (const char *url, int *error
   int port;
   char *user = NULL, *passwd = NULL;
 
-  char *url_encoded = NULL;
-  char *url_encoded = NULL, *new_url = NULL;
++ const char *url_encoded = NULL;
++ char *new_url = NULL;
 
   int error_code;
@@@ -875,7 -904,7 +905,7 @@@
       if (url_encoded == url)
         u->url = xstrdup (url);
       else
--        u->url = url_encoded;
++        u->url = (char *) url_encoded;
     }
 
   return u;
@@@ -883,7 -912,7 +913,7 @@@
  error:
   /* Cleanup in case of error: */
   if (url_encoded && url_encoded != url)
--    xfree (url_encoded);
++    xfree ((char *) url_encoded);
 
   /* Transmit the error code to the caller, if the caller wants to
      know.  */
diff --cc tests/ChangeLog
index 522bd202,d9ba6531..3dfc60a3
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@@ -1,27 -1,19 +1,43 @@@
+ 2008-12-04  Micah Cowan  (not copyrightable)
+ 
+ 	* run-px, Test-idn-robots.px: Added test for robots-file
+ 	downloads.
+ 
+ 	* Test-idn-cmd.px, Test-idn-meta.px, Test-idn-headers.px:
+ 	Fix test names.
+ 
+ 2008-11-26  Micah Cowan  (not copyrightable)
+ 
+ 	* Test-ftp-iri-disabled.px, Test-ftp-iri-fallback.px,
+ 	Test-ftp-iri.px, Test-idn-cmd.px, Test-idn-headers.px,
+ 	Test-idn-meta.px, Test-iri-disabled.px,
+ 	Test-iri-forced-remote.px, Test-iri-list.px, Test-iri.px: More
+ 	module-scope warnings.
+ 
 +2009-06-14  Micah Cowan
 +
 +	* Makefile.am (EXTRA_DIST): Include all the tests, run-px, and
 +	certs/, to make distcheck happy.
 +
 +2009-06-11  Benjamin Wolsey
 +
 +	* Test-proxied-https-auth.px: Take an optional argument for the
 +	top source directory, so we can find the cert and key.
 +
 +	* run-px: Provide the top source directory as an argument, so
 +	scripts can find their way around.
 +
 +2009-04-11  Steven Schubiger
 +
 +	* run-px: Skip testing with real rc files by setting
 +	SYSTEM_WGETRC and WGETRC to /dev/null.
 +
 +2009-02-25  Benjamin Wolsey
 +
 +	* Makefile.am (run-px-tests): Ensure run-px is run from srcdir.
 +
 +	* run-px: Include modules from srcdir.
 +
  2008-11-25  Steven Schubiger
 
  	* WgetTest.pm.in: Remove the magic interpreter line;
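
The recurring pattern across the hunks above is the new calling convention: retrieve_url and
retrieve_tree no longer accept a raw URL string and parse it internally; each caller runs
url_parse itself, reports failures through url_error, and passes the resulting struct url *
together with a struct iri * describing the URI and content encodings. A minimal sketch of
that convention, assembled from the hunks above; example_fetch is a hypothetical wrapper,
not a function in this commit, and it assumes wget's internal headers (url.h, retr.h, iri.h,
log.h, utils.h, wget.h) are in scope:

/* Sketch only: mirrors the post-merge convention seen in src/main.c above.
   "example_fetch" is hypothetical; the other identifiers are the ones the
   hunks themselves use.  */
static uerr_t
example_fetch (const char *url)
{
  int dt = 0, url_err;
  char *file = NULL, *redirected = NULL;
  uerr_t status;

  /* Callers parse the URL themselves now, and report errors via url_error.  */
  struct url *url_parsed = url_parse (url, &url_err, NULL, false);
  if (!url_parsed)
    {
      char *error = url_error (url, url_err);
      logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
      xfree (error);
      return URLERROR;
    }

  /* Callers also supply an iri carrying the locale-derived URI encoding.  */
  struct iri *i = iri_new ();
  set_uri_encoding (i, opt.locale, true);

  /* New 8-argument signature: parsed URL first, original string second.  */
  status = retrieve_url (url_parsed, url, &file, &redirected,
                         NULL, &dt, false, i);

  iri_free (i);
  url_free (url_parsed);
  xfree_null (redirected);
  xfree_null (file);
  return status;
}

Keeping parsing in the callers is what lets retrieve_tree drop its own url_parse/url_error
block (see the src/recur.c hunks) and lets main.c reuse one parsed URL for both the
recursive and the single-download paths.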