+2008-08-01 Joao Ferreira <joao@joaoff.com>
+
+ * NEWS: Added option --default-page to support alternative
+ default names for index.html.
+
2008-06-30 Micah Cowan <micah@cowan.name>
* NEWS: Entries for 1.11.4.
* AUTHORS: Added Steven Schubiger.
+ 2008-06-26 Xavier Saint <wget@sxav.eu>
+
+ * configure.ac: IRI support requires libiconv; check for it.
+
+ 2008-06-14 Xavier Saint <wget@sxav.eu>
+
+ * configure.ac: Add support for IRIs
+
2008-05-29 Micah Cowan <micah@cowan.name>
* po/*.po: Updated from TP (the 1.11.3 set).
+ 2008-08-03 Xavier Saint <wget@sxav.eu>
+
+ * wget.texi: Add option descriptions for the three new
+ options --iri, --locale and --remote-encoding related to
+ IRI support.
+
+ * sample.wgetrc: Add commented lines for the three new
+ commands iri, locale and encoding related to IRI support.
+
+2008-08-03 Micah Cowan <micah@cowan.name>
+
+ * wget.texi: Don't set UPDATED; already set by version.texi.
+ (HTTP Options): Add --default-page option.
+
2008-07-17 Steven Schubiger <stsc@members.fsf.org>
* wget.texi (Logging and Input File Options): Document
@c %**start of header
@setfilename wget.info
@include version.texi
-@set UPDATED Jun 2008
@settitle GNU Wget @value{VERSION} Manual
@c Disable the monstrous rectangles beside overfull hbox-es.
@finalout
Note that @samp{-c} only works with @sc{ftp} servers and with @sc{http}
servers that support the @code{Range} header.
+ @cindex iri support
+ @cindex idn support
+ @item --iri
+
+ Turn on internationalized URI (IRI) support. Use @samp{--iri=no} to
+ turn it off. IRI support is activated by default.
+
+ You can set the default state of IRI support using the @code{iri} command in
+ @file{.wgetrc}. That setting may be overridden from the command line.
+
+ @cindex local encoding
+ @cindex locale
+ @item --locale=@var{encoding}
+
+ Force Wget to use @var{encoding} as the default system encoding. That affects
+ how Wget converts URLs specified as arguments from the locale to @sc{utf-8} for
+ IRI support.
+
+ Wget uses the function @code{nl_langinfo()} and then the @code{CHARSET}
+ environment variable to get the locale. If both fail, @sc{ascii} is used.
+
+ You can set the default locale using the @code{locale} command in
+ @file{.wgetrc}. That setting may be overridden from the command line.
+
@cindex progress indicator
@cindex dot style
@item --progress=@var{type}
``dot'' progress will be favored over ``bar''. To force the bar output,
use @samp{--progress=bar:force}.
+ @cindex remote encoding
+ @item --remote-encoding=@var{encoding}
+
+ Force Wget to use @var{encoding} as the default remote server encoding. That
+ affects how Wget converts URIs found in files from the remote encoding to
+ @sc{utf-8} during a recursive fetch. This option is only useful for
+ IRI support, for the interpretation of non-@sc{ascii} characters.
+
+ For HTTP, the remote encoding can be found in the HTTP @code{Content-Type}
+ header and in the HTML @code{Content-Type http-equiv} meta tag.
+
+ You can set the default encoding using the @code{remoteencoding}
+ command in @file{.wgetrc}. That setting may be overridden from the
+ command line.
+
@item -N
@itemx --timestamping
Turn on time-stamping. @xref{Time-Stamping}, for details.
@section HTTP Options
@table @samp
+@cindex default page name
+@cindex index.html
+@item --default-page=@var{name}
+Use @var{name} as the default file name when it isn't known (i.e., for
+URLs that end in a slash), instead of @file{index.html}.
+
@cindex .html extension
@item -E
@itemx --html-extension
+2008-08-03 Micah Cowan <micah@cowan.name>
+
+ * main.c (print_help): Added --default-page.
+
+2008-08-01 Joao Ferreira <joao@joaoff.com>
+
+ * init.c, main.c, options.h, url.c: Added option --default-page
+ to support alternative default names for index.html.
+
+2008-08-03 Micah Cowan <micah@cowan.name>
+
+ * build_info.c, css-url.c: #include wget.h, not config.h.
+
+2008-08-03 Steven Schubiger <stsc@members.fsf.org>
+
+ * url.c, url.h (url_error): Better messages for unsupported
+ schemes, especially https.
+
+ * html-url.c, recur.c, retr.c: Adjust to new url_error
+ invocation, and free result.
+
2008-07-17 Steven Schubiger <stsc@members.fsf.org>
* retr.c (retrieve_from_file): When given a URL as input file,
* init.c (cleanup): Free the memory associated with the base
option (when DEBUG_MALLOC is defined).
+ 2008-07-02 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h: New function idn_decode() to decode an
+ ASCII-encoded hostname to the locale.
+
+ * host.c: Show the hostname to be resolved both in the locale
+ and ASCII-encoded.
+
2008-06-28 Steven Schubiger <stsc@members.fsf.org>
* retr.c (retrieve_from_file): Allow for reading the links from
an external file (HTTP/FTP).
+ 2008-06-26 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h: New functions locale_to_utf8() and
+ idn_encode() adding basic capabilities of IRI/IDN.
+
+ * url.c: Convert URLs from the locale to UTF-8, allowing basic
+ support of IRI/IDN.
+
2008-06-25 Steven Schubiger <stsc@members.fsf.org>
* ftp.c (getftp): When spidering an FTP URL, emit a diagnostic
* http.c: Make -nv --spider include the file's name when it
exists.
-
+
2008-06-22 Micah Cowan <micah@cowan.name>
* Makefile.am (version.c): Fixed version string invocation so it
string vars pointers-to-const, and moved line lengths
below 80 (in Makefile.am, not in version.c).
+ 2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h: New function check_encoding_name() as
+ a preliminary encoding name check.
+
+ * main.c, iri.c: Make use of check_encoding_name().
+
+ 2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * iri.c: Include missing stringprep.h file and add a
+ cast.
+
+ * init.c: Set default initial values for opt.enable_iri,
+ opt.locale and opt.encoding_remote.
+
+ 2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h: Add a new function find_locale() to find
+ out the local system encoding.
+
+ * main.c: Make use of find_locale().
+
+ 2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * html-url.c: Add "content-type" meta tag parsing for
+ retrieving the page encoding.
+
+ * iri.h: Make the no-op version of parse_charset() return
+ NULL.
+
2008-06-16 Micah Cowan <micah@cowan.name>
* http.c (http_loop): When hstat.len is higher than the
successfully completed content's length, but it's because we
_set_ it that way, don't abort.
+ 2008-06-14 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h: New files.
+
+ * Makefile.am: Add files iri.h and conditional iri.c.
+
+ * build_info.c: Add compiled feature "iri".
+
+ * http.c: Include iri.h and parse the charset from the
+ Content-Type header.
+
+ * init.c, main.c, options.h: If an option isn't supported
+ at compile time, don't remove it; show a dummy message
+ instead when it is used.
+
2008-06-13 Micah Cowan <micah@cowan.name>
* build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL
default.
2008-05-17 Kenny Parnell <k.parnell@gmail.com>
-
+
(cmd_spec_prefer_family): Initialize prefer_family to prefer_none.
2008-05-17 Micah Cowan <micah@cowan.name>
-
+
* main.c (main): Handle Ctrl-D on command-line.
2008-05-15 Steven Schubiger <schubiger@gmail.com>
* options.h: Add an according boolean member to the options
struct.
-
+
* sysdep.h: Comment the defines __EXTENSIONS__ and _GNU_SOURCE
out, because they're now defined independently by config.h.
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
-#include "config.h"
+#include "wget.h"
#include <stdio.h>
char *system_wgetrc = SYSTEM_WGETRC;
#else
"-gettext",
#endif
+
+ #ifdef ENABLE_IRI
+ "+iri",
+ #else
+ "-iri",
+ #endif
+
/* sentinel value */
NULL
};
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;
+ /* Will contain the (last) charset found in 'http-equiv=content-type'
+ meta tags.  */
+ static char *meta_charset;
+
static void
init_interesting (void)
{
return NULL;
}
- url = url_parse (link_uri, NULL);
+ url = url_parse (link_uri, NULL, NULL);
if (!url)
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
ctx->document_file, base, link_uri, complete_uri));
- url = url_parse (complete_uri, NULL);
+ url = url_parse (complete_uri, NULL, NULL);
if (!url)
{
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
entry->link_expect_html = 1;
}
}
+ else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
+ {
+ /* Handle stuff like:
+ <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
+
+ char *mcharset;
+ char *content = find_attr (tag, "content", NULL);
+ if (!content)
+ return;
+
+ mcharset = parse_charset (content);
+ if (!mcharset)
+ return;
+
+ /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
+ xfree_null (meta_charset);
+ meta_charset = mcharset;
+ }
else if (name && 0 == strcasecmp (name, "robots"))
{
/* Handle stuff like:
<base href=...> and does the right thing. */
struct urlpos *
- get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
+ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
+ struct iri *iri)
{
struct file_memory *fm;
struct map_context ctx;
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
NULL, interesting_attributes);
+ /* If meta charset isn't null, override content encoding */
+ if (iri && meta_charset)
+ set_content_encoding (iri, meta_charset);
+
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
if (meta_disallow_follow)
*meta_disallow_follow = ctx.nofollow;
url_text = merged;
}
- url = url_parse (url_text, &up_error_code);
+ url = url_parse (url_text, &up_error_code, NULL);
if (!url)
{
+ char *error = url_error (url_text, up_error_code);
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
- file, url_text, url_error (up_error_code));
+ file, url_text, error);
xfree (url_text);
+ xfree (error);
continue;
}
xfree (url_text);
#ifdef ENABLE_DEBUG
{ "debug", &opt.debug, cmd_boolean },
#endif
+ { "defaultpage", &opt.default_page, cmd_string},
{ "deleteafter", &opt.delete_after, cmd_boolean },
{ "dirprefix", &opt.dir_prefix, cmd_directory },
{ "dirstruct", NULL, cmd_spec_dirstruct },
{ "inet6only", &opt.ipv6_only, cmd_boolean },
#endif
{ "input", &opt.input_filename, cmd_file },
+ { "iri", &opt.enable_iri, cmd_boolean },
{ "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean },
{ "limitrate", &opt.limit_rate, cmd_bytes },
{ "loadcookies", &opt.cookies_input, cmd_file },
+ { "locale", &opt.locale, cmd_string },
{ "logfile", &opt.lfilename, cmd_file },
{ "login", &opt.ftp_user, cmd_string },/* deprecated*/
{ "maxredirect", &opt.max_redirect, cmd_number },
{ "referer", &opt.referer, cmd_string },
{ "reject", &opt.rejects, cmd_vector },
{ "relativeonly", &opt.relative_only, cmd_boolean },
+ { "remoteencoding", &opt.encoding_remote, cmd_string },
{ "removelisting", &opt.remove_listing, cmd_boolean },
{ "restrictfilenames", NULL, cmd_spec_restrict_file_names },
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
opt.restrict_files_case = restrict_no_case_restriction;
opt.max_redirect = 20;
+
+ #ifdef ENABLE_IRI
+ opt.enable_iri = true;
+ #else
+ opt.enable_iri = false;
+ #endif
+ opt.locale = NULL;
+ opt.encoding_remote = NULL;
}
\f
/* Return the user's home directory (strdup-ed), or NULL if none is
{ "cookies", 0, OPT_BOOLEAN, "cookies", -1 },
{ "cut-dirs", 0, OPT_VALUE, "cutdirs", -1 },
{ WHEN_DEBUG ("debug"), 'd', OPT_BOOLEAN, "debug", -1 },
+ { "default-page", 0, OPT_VALUE, "defaultpage", -1 },
{ "delete-after", 0, OPT_BOOLEAN, "deleteafter", -1 },
{ "directories", 0, OPT_BOOLEAN, "dirstruct", -1 },
{ "directory-prefix", 'P', OPT_VALUE, "dirprefix", -1 },
{ "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 },
#endif
{ "input-file", 'i', OPT_VALUE, "input", -1 },
+ { "iri", 0, OPT_BOOLEAN, "iri", -1 },
{ "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 },
{ "level", 'l', OPT_VALUE, "reclevel", -1 },
{ "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
{ "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
+ { "locale", 0, OPT_VALUE, "locale", -1 },
{ "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
{ "mirror", 'm', OPT_BOOLEAN, "mirror", -1 },
{ "no", 'n', OPT__NO, NULL, required_argument },
{ "referer", 0, OPT_VALUE, "referer", -1 },
{ "reject", 'R', OPT_VALUE, "reject", -1 },
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
+ { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1},
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
{ "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
{ "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
--http-password=PASS set http password to PASS.\n"),
N_("\
--no-cache disallow server-cached data.\n"),
+ N_ ("\
+ --default-page=NAME Change the default page name (normally\n\
+ this is `index.html'.).\n"),
N_("\
-E, --html-extension save HTML documents with `.html' extension.\n"),
N_("\
exit (1);
}
+ #ifdef ENABLE_IRI
+ if (opt.enable_iri)
+ {
+ if (opt.locale && !check_encoding_name (opt.locale))
+ opt.locale = NULL;
+
+ if (!opt.locale)
+ opt.locale = find_locale ();
+
+ if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote))
+ opt.encoding_remote = NULL;
+
+ /*logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));*/
+ }
+ #else
+ if (opt.enable_iri || opt.locale || opt.encoding_remote)
+ {
+ /* sXXXav : be more specific... */
+ printf (_("This version does not have support for IRIs\n"));
+ exit (1);
+ }
+ #endif
+
if (opt.ask_passwd)
{
opt.passwd = prompt_for_password ();
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (url_scheme (*t) == SCHEME_FTP)
+ if (url_scheme (*t) == SCHEME_FTP)
opt.follow_ftp = 1;
-
+
status = retrieve_tree (*t);
opt.follow_ftp = old_follow_ftp;
}
else
- status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+ {
+ struct iri *i = iri_new ();
+ set_uri_encoding (i, opt.locale, true);
+ status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt,
+ opt.recursive, i);
+ iri_free (i);
+ }
if (opt.delete_after && file_exists_p(filename))
{
char *input_filename; /* Input filename */
bool force_html; /* Is the input file an HTML file? */
+ char *default_page; /* Alternative default page (index file) */
+
bool spider; /* Is Wget in spider mode? */
char **accepts; /* List of patterns to accept. */
bool content_disposition; /* Honor HTTP Content-Disposition header. */
bool auth_without_challenge; /* Issue Basic authentication creds without
waiting for a challenge. */
+
+ bool enable_iri;
+ char *encoding_remote;
+ char *locale;
};
extern struct options opt;
#include "html-url.h"
#include "css-url.h"
#include "spider.h"
-
+ \f
/* Functions for maintaining the URL queue. */
struct queue_element {
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
+ struct iri *iri; /* sXXXav */
bool css_allowed; /* whether the document is allowed to
be treated as CSS. */
struct queue_element *next; /* next element in queue */
into it. */
static void
- url_enqueue (struct url_queue *queue,
+ url_enqueue (struct url_queue *queue, struct iri *i,
const char *url, const char *referer, int depth,
bool html_allowed, bool css_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
+ qel->iri = i;
qel->url = url;
qel->referer = referer;
qel->depth = depth;
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
+ if (i)
+ DEBUGP (("[IRI Enqueuing %s with %s\n", quote (url),
+ i->uri_encoding ? quote (i->uri_encoding) : "None"));
+
if (queue->tail)
queue->tail->next = qel;
queue->tail = qel;
succeeded, or false if the queue is empty. */
static bool
- url_dequeue (struct url_queue *queue,
+ url_dequeue (struct url_queue *queue, struct iri **i,
const char **url, const char **referer, int *depth,
bool *html_allowed, bool *css_allowed)
{
if (!queue->head)
queue->tail = NULL;
+ *i = qel->iri;
*url = qel->url;
*referer = qel->referer;
*depth = qel->depth;
}
\f
static bool download_child_p (const struct urlpos *, struct url *, int,
- struct url *, struct hash_table *);
+ struct url *, struct hash_table *, struct iri *);
static bool descend_redirect_p (const char *, const char *, int,
- struct url *, struct hash_table *);
+ struct url *, struct hash_table *, struct iri *);
/* Retrieve a part of the web beginning with START_URL. This used to
struct hash_table *blacklist;
int up_error_code;
- struct url *start_url_parsed = url_parse (start_url, &up_error_code);
+ struct url *start_url_parsed;
+ struct iri *i = iri_new ();
+ set_uri_encoding (i, opt.locale, true);
+ start_url_parsed = url_parse (start_url, &up_error_code, i);
if (!start_url_parsed)
{
- logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
- url_error (up_error_code));
+ char *error = url_error (start_url, up_error_code);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, error);
+ xfree (error);
return URLERROR;
}
/* Enqueue the starting URL. Use start_url_parsed->url rather than
just URL so we enqueue the canonical form of the URL. */
- url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
+ url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
+ false);
string_set_add (blacklist, start_url_parsed->url);
while (1)
/* Get the next URL from the queue... */
- if (!url_dequeue (queue,
+ if (!url_dequeue (queue, (struct iri **) &i,
(const char **)&url, (const char **)&referer,
&depth, &html_allowed, &css_allowed))
break;
int dt = 0;
char *redirected = NULL;
- status = retrieve_url (url, &file, &redirected, referer, &dt, false);
+ status = retrieve_url (url, &file, &redirected, referer, &dt,
+ false, i);
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
if (descend)
{
if (!descend_redirect_p (redirected, url, depth,
- start_url_parsed, blacklist))
+ start_url_parsed, blacklist, i))
descend = false;
else
/* Make sure that the old pre-redirect form gets
bool meta_disallow_follow = false;
struct urlpos *children
= is_css ? get_urls_css_file (file, url) :
- get_urls_html (file, url, &meta_disallow_follow);
+ get_urls_html (file, url, &meta_disallow_follow, i);
if (opt.use_robots && meta_disallow_follow)
{
if (children)
{
struct urlpos *child = children;
- struct url *url_parsed = url_parsed = url_parse (url, NULL);
+ struct url *url_parsed = url_parse (url, NULL, i);
+ struct iri *ci;
char *referer_url = url;
bool strip_auth = (url_parsed != NULL
&& url_parsed->user != NULL);
if (dash_p_leaf_HTML && !child->link_inline_p)
continue;
if (download_child_p (child, url_parsed, depth, start_url_parsed,
- blacklist))
+ blacklist, i))
{
- url_enqueue (queue, xstrdup (child->url->url),
+ ci = iri_new ();
+ set_uri_encoding (ci, i->content_encoding, false);
+ url_enqueue (queue, ci, xstrdup (child->url->url),
xstrdup (referer_url), depth + 1,
child->link_expect_html,
child->link_expect_css);
}
}
- if (file
- && (opt.delete_after
+ if (file
+ && (opt.delete_after
|| opt.spider /* opt.recursive is implicitly true */
|| !acceptable (file)))
{
/* Either --delete-after was specified, or we loaded this
- (otherwise unneeded because of --spider or rejected by -R)
- HTML file just to harvest its hyperlinks -- in either case,
+ (otherwise unneeded because of --spider or rejected by -R)
+ HTML file just to harvest its hyperlinks -- in either case,
delete the local file. */
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
opt.delete_after ? "--delete-after" :
- (opt.spider ? "--spider" :
+ (opt.spider ? "--spider" :
"recursive rejection criteria")));
logprintf (LOG_VERBOSE,
(opt.delete_after || opt.spider
xfree (url);
xfree_null (referer);
xfree_null (file);
+ iri_free (i);
}
/* If anything is left of the queue due to a premature exit, free it
char *d1, *d2;
int d3;
bool d4, d5;
- while (url_dequeue (queue,
+ struct iri *d6;
+ while (url_dequeue (queue, (struct iri **)&d6,
(const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
{
+ iri_free (d6);
xfree (d1);
xfree_null (d2);
}
static bool
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
- struct url *start_url_parsed, struct hash_table *blacklist)
+ struct url *start_url_parsed, struct hash_table *blacklist,
+ struct iri *iri)
{
struct url *u = upos->url;
const char *url = u->url;
if (string_set_contains (blacklist, url))
{
- if (opt.spider)
+ if (opt.spider)
{
char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
if (!specs)
{
char *rfile;
- if (res_retrieve_file (url, &rfile))
+ if (res_retrieve_file (url, &rfile, iri))
{
specs = res_parse_from_file (rfile);
static bool
descend_redirect_p (const char *redirected, const char *original, int depth,
- struct url *start_url_parsed, struct hash_table *blacklist)
+ struct url *start_url_parsed, struct hash_table *blacklist,
+ struct iri *iri)
{
struct url *orig_parsed, *new_parsed;
struct urlpos *upos;
bool success;
- orig_parsed = url_parse (original, NULL);
+ orig_parsed = url_parse (original, NULL, NULL);
assert (orig_parsed != NULL);
- new_parsed = url_parse (redirected, NULL);
+ new_parsed = url_parse (redirected, NULL, NULL);
assert (new_parsed != NULL);
upos = xnew0 (struct urlpos);
upos->url = new_parsed;
success = download_child_p (upos, orig_parsed, depth,
- start_url_parsed, blacklist);
+ start_url_parsed, blacklist, iri);
url_free (orig_parsed);
url_free (new_parsed);
uerr_t
retrieve_url (const char *origurl, char **file, char **newloc,
- const char *refurl, int *dt, bool recursive)
+ const char *refurl, int *dt, bool recursive, struct iri *iri)
{
uerr_t result;
char *url;
if (file)
*file = NULL;
- u = url_parse (url, &up_error_code);
+ second_try:
+ u = url_parse (url, &up_error_code, iri);
if (!u)
{
- logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
+ char *error = url_error (url, up_error_code);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
xfree (url);
+ xfree (error);
return URLERROR;
}
+ DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote (url),
+ iri->uri_encoding ? quote (iri->uri_encoding) : "None",
+ iri->utf8_encode));
+
if (!refurl)
refurl = opt.referer;
proxy = getproxy (u);
if (proxy)
{
+ /* sXXXav : could a proxy include a path ??? */
+ struct iri *pi = iri_new ();
+ set_uri_encoding (pi, opt.locale, true);
+ pi->utf8_encode = false;
+
/* Parse the proxy URL. */
- proxy_url = url_parse (proxy, &up_error_code);
+ proxy_url = url_parse (proxy, &up_error_code, NULL);
if (!proxy_url)
{
+ char *error = url_error (proxy, up_error_code);
logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
- proxy, url_error (up_error_code));
+ proxy, error);
xfree (url);
+ xfree (error);
RESTORE_POST_DATA;
return PROXERR;
}
#endif
|| (proxy_url && proxy_url->scheme == SCHEME_HTTP))
{
- result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
+ result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
}
else if (u->scheme == SCHEME_FTP)
{
xfree (mynewloc);
mynewloc = construced_newloc;
+ /* Reset UTF-8 encoding state, keep the URI encoding and reset
+ the content encoding. */
+ iri->utf8_encode = opt.enable_iri;
+ set_content_encoding (iri, NULL);
+
/* Now, see if this new location makes sense. */
- newloc_parsed = url_parse (mynewloc, &up_error_code);
+ newloc_parsed = url_parse (mynewloc, &up_error_code, iri);
if (!newloc_parsed)
{
+ char *error = url_error (mynewloc, up_error_code);
logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
- url_error (up_error_code));
+ error);
url_free (u);
xfree (url);
xfree (mynewloc);
+ xfree (error);
RESTORE_POST_DATA;
return result;
}
goto redirected;
}
- if (local_file)
+ /* Try not to encode in UTF-8 if fetching failed.  */
+ if (!(*dt & RETROKF) && iri->utf8_encode)
{
+ iri->utf8_encode = false;
+ DEBUGP (("[IRI Fallbacking to non-utf8 for %s\n", quote (url)));
+ goto second_try;
+ }
+
+ if (local_file && *dt & RETROKF)
+ {
+ register_download (u->url, local_file);
+ if (redirection_count && 0 != strcmp (origurl, u->url))
+ register_redirection (origurl, u->url);
+ if (*dt & TEXTHTML)
+ register_html (u->url, local_file);
if (*dt & RETROKF)
{
register_download (u->url, local_file);
{
uerr_t status;
struct urlpos *url_list, *cur_url;
+ struct iri *iri = iri_new();
char *input_file = NULL;
const char *url = file;
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
-
+
+ /* sXXXav : Assume filename and links in the file are in the locale */
+ set_content_encoding (iri, opt.locale);
+
if (url_has_scheme (url))
{
int dt;
if (!opt.base_href)
opt.base_href = xstrdup (url);
- status = retrieve_url (url, &input_file, NULL, NULL, &dt, false);
+ status = retrieve_url (url, &input_file, NULL, NULL, &dt, false, iri);
if (status != RETROK)
return status;
else
input_file = (char *) file;
- url_list = (html ? get_urls_html (input_file, NULL, NULL)
+ url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
: get_urls_file (input_file));
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (cur_url->url->scheme == SCHEME_FTP)
+ if (cur_url->url->scheme == SCHEME_FTP)
opt.follow_ftp = 1;
-
+
status = retrieve_tree (cur_url->url->url);
opt.follow_ftp = old_follow_ftp;
}
else
- status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
+ status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL,
+ &dt, opt.recursive, iri);
if (filename && opt.delete_after && file_exists_p (filename))
{
url_uses_proxy (const char *url)
{
bool ret;
- struct url *u = url_parse (url, NULL);
+ struct url *u;
+ struct iri *i = iri_new();
+ /* url was given on the command line, so use the locale as encoding */
+ set_uri_encoding (i, opt.locale, true);
+ u = url_parse (url, NULL, i);
if (!u)
return false;
ret = getproxy (u) != NULL;
#define PE_NO_ERROR 0
N_("No error"),
#define PE_UNSUPPORTED_SCHEME 1
- N_("Unsupported scheme"),
+ N_("Unsupported scheme %s"),
#define PE_INVALID_HOST_NAME 2
N_("Invalid host name"),
#define PE_BAD_PORT_NUMBER 3
error, and if ERROR is not NULL, also set *ERROR to the appropriate
error code. */
struct url *
- url_parse (const char *url, int *error)
+ url_parse (const char *url, int *error, struct iri *iri)
{
struct url *u;
const char *p;
int port;
char *user = NULL, *passwd = NULL;
- char *url_encoded = NULL;
+ char *url_encoded = NULL, *new_url = NULL;
int error_code;
goto error;
}
- url_encoded = reencode_escapes (url);
+ if (iri && iri->utf8_encode)
+ {
+ url_unescape ((char *) url);
+ iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url);
+ if (!iri->utf8_encode)
+ new_url = NULL;
+ }
+
+ url_encoded = reencode_escapes (new_url ? new_url : url);
p = url_encoded;
+ if (new_url && url_encoded != new_url)
+ xfree (new_url);
+
p += strlen (supported_schemes[scheme].leading_string);
uname_b = p;
p = url_skip_credentials (p);
{
url_unescape (u->host);
host_modified = true;
+
+ /* Apply IDNA regardless of iri->utf8_encode status */
+ if (opt.enable_iri && iri)
+ {
+ char *new = idn_encode (iri, u->host);
+ if (new)
+ {
+ xfree (u->host);
+ u->host = new;
+ host_modified = true;
+ }
+ }
}
if (params_b)
if (fragment_b)
u->fragment = strdupdelim (fragment_b, fragment_e);
- if (path_modified || u->fragment || host_modified || path_b == path_e)
+ if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
{
/* If we suspect that a transformation has rendered what
url_string might return different from URL_ENCODED, rebuild
/* Return the error message string from ERROR_CODE, which should have
been retrieved from url_parse. The error message is translated. */
-const char *
-url_error (int error_code)
+char *
+url_error (const char *url, int error_code)
{
assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
- return _(parse_errors[error_code]);
+
+ if (error_code == PE_UNSUPPORTED_SCHEME)
+ {
+ char *error, *p;
+ char *scheme = xstrdup (url);
+ assert (url_has_scheme (url));
+
+ if ((p = strchr (scheme, ':')))
+ *p = '\0';
+ if (!strcasecmp (scheme, "https"))
+ asprintf (&error, _("HTTPS support not compiled in"));
+ else
+ asprintf (&error, _(parse_errors[error_code]), quote (scheme));
+ xfree (scheme);
+
+ return error;
+ }
+ else
+ return xstrdup (_(parse_errors[error_code]));
}
/* Split PATH into DIR and FILE. PATH comes from the URL and is
const char *u_file, *u_query;
char *fname, *unique;
+ char *index_filename = "index.html"; /* The default index file is index.html */
fnres.base = NULL;
fnres.size = 0;
fnres.tail = 0;
+ /* If an alternative index file was defined, change index_filename */
+ if (opt.default_page)
+ index_filename = opt.default_page;
+
/* Start with the directory prefix, if specified. */
if (opt.dir_prefix)
append_string (opt.dir_prefix, &fnres);
/* Add the file name. */
if (fnres.tail)
append_char ('/', &fnres);
- u_file = *u->file ? u->file : "index.html";
+ u_file = *u->file ? u->file : index_filename;
append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
/* Append "?query" to the file name. */
char *url_escape (const char *);
-struct url *url_parse (const char *, int *);
-const char *url_error (int);
+struct url *url_parse (const char *, int *, struct iri *iri);
+char *url_error (const char *, int);
char *url_full_path (const struct url *);
void url_set_dir (struct url *, const char *);
void url_set_file (struct url *, const char *);