Merge with mainline.

author Micah Cowan <micah@cowan.name>

Thu, 25 Jun 2009 08:14:11 +0000 (01:14 -0700)

committer Micah Cowan <micah@cowan.name>

Thu, 25 Jun 2009 08:14:11 +0000 (01:14 -0700)
author Micah Cowan <micah@cowan.name>
Thu, 25 Jun 2009 08:14:11 +0000 (01:14 -0700)
committer Micah Cowan <micah@cowan.name>
Thu, 25 Jun 2009 08:14:11 +0000 (01:14 -0700)
diff --git a/ChangeLog b/ChangeLog

index 8358b3bda9f97163aa77845768ddbec09fff4ac3..659415aa5c1ea3237eb2ff7bfe721639af1c14fe 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -49,6 +49,14 @@
  
         * AUTHORS: Added Steven Schubiger.
  
+2008-06-26  Xavier Saint  <wget@sxav.eu>
+
+       * configure.ac : IRI support requires libiconv; check for it.
+
+2008-06-14  Xavier Saint  <wget@sxav.eu>
+
+       * configure.ac: Add support for IRIs
+
  2008-05-29  Micah Cowan  <micah@cowan.name>
  
         * po/*.po: Updated from TP (the 1.11.3 set).
diff --git a/configure.ac b/configure.ac

index 78fd5e143f39f7d29e912a13b96f72c639fb7955..dcb302fa187cd2d5e3cf69397825098f08e5f8e4 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -462,6 +462,77 @@ else
  fi
  AC_SUBST(COMMENT_IF_NO_POD2MAN)
  
+
+dnl
+dnl Check for IDN/IRIs
+dnl
+
+AC_ARG_ENABLE(iri,
+  AC_HELP_STRING([--disable-iri],[disable IDN/IRIs support]),
+  [case "${enable_iri}" in
+    no)
+      dnl Disable IRIs checking
+      AC_MSG_NOTICE([disabling IRIs at user request])
+      iri=no
+      ;;
+    yes)
+      dnl IRIs explicitly enabled
+      iri=yes
+      force_iri=yes
+      ;;
+    auto)
+      dnl Auto-detect IRI
+      iri=yes
+      ;;
+    *)
+      AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri'])
+      ;;
+    esac
+  ], [
+    dnl If nothing is specified, assume auto-detection
+    iri=yes
+  ]
+)
+
+AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]],
+                                   [Support IDN/IRIs (needs GNU Libidn)]),
+                                   libidn=$withval, libidn="")
+if test "X$iri" != "Xno"; then
+  AM_ICONV
+
+  if test "X$am_cv_func_iconv" != "Xyes"; then
+    iri=no
+    if test "X$force_iri" = "Xyes"; then
+      AC_MSG_ERROR([Libiconv is required for IRIs support])
+    else
+      AC_MSG_NOTICE([disabling IRIs because libiconv wasn't found])
+    fi
+  fi
+fi
+
+if test "X$iri" != "Xno"; then
+  if test "$libidn" != ""; then
+    LDFLAGS="${LDFLAGS} -L$libidn/lib"
+    CPPFLAGS="${CPPFLAGS} -I$libidn/include"
+  fi
+  AC_CHECK_HEADER(idna.h,
+    AC_CHECK_LIB(idn, stringprep_check_version,
+      [iri=yes LIBS="${LIBS} -lidn"], iri=no),
+    iri=no)
+
+  if test "X$iri" != "Xno" ; then
+    AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.])
+    AC_MSG_NOTICE([Enabling support for IRI.])
+  else
+    AC_MSG_WARN([Libidn not found])
+  fi
+fi
+
+
+dnl Needed by src/Makefile.am
+AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
+
+
  dnl
  dnl Create output
  dnl
diff --git a/doc/ChangeLog b/doc/ChangeLog

index 39f390c4b2608817140b3cea88f3b1e2e8ffe301..898e3c6e5822d689c4067c824428e8cb968774a0 100644 (file)
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -70,6 +70,15 @@
         * wget.texi (Robot Exclusion): Fixed typo "downloads" ->
         "download"
  
+2008-08-03  Xavier Saint  <wget@sxav.eu>
+
+       * wget.texi : Add option descriptions for the three new
+       options --iri, --locale and --remote-encoding related to
+       IRI support.
+
+       * sample.wgetrc : Add commented lines for the three new
+       commands iri, locale and remoteencoding related to IRI support.
+
  2008-08-03  Micah Cowan  <micah@cowan.name>
  
         * wget.texi: Don't set UPDATED; already set by version.texi.
diff --git a/doc/sample.wgetrc b/doc/sample.wgetrc

index 62981c8f2bf31c892a3481790612d2695a9c27c9..1ce90dea24ca576f507ead9834ab6027b39a69fa 100644 (file)
--- a/doc/sample.wgetrc
+++ b/doc/sample.wgetrc
@@ -114,3 +114,12 @@
  
  # To try ipv6 addresses first:
  #prefer-family = IPv6
+
+# Set default IRI support state
+#iri = off
+
+# Force the default system encoding
+#locale = UTF-8
+
+# Force the default remote server encoding
+#remoteencoding = UTF-8
diff --git a/doc/wget.texi b/doc/wget.texi

index 92ed7905ba538090266328743d308cd1af5b6da1..252548f8094d54276bf62c6e407165a6ac0e0760 100644 (file)
--- a/doc/wget.texi
+++ b/doc/wget.texi
@@ -675,6 +675,30 @@ Another instance where you'll get a garbled file if you try to use
  Note that @samp{-c} only works with @sc{ftp} servers and with @sc{http}
  servers that support the @code{Range} header.
  
+@cindex iri support
+@cindex idn support
+@item --iri
+
+Turn on internationalized URI (IRI) support. Use @samp{--iri=no} to
+turn it off. IRI support is activated by default.
+
+You can set the default state of IRI support using the @code{iri} command in
+@file{.wgetrc}. That setting may be overridden from the command line.
+
+@cindex local encoding
+@cindex locale
+@item --locale=@var{encoding}
+
+Force Wget to use @var{encoding} as the default system encoding. That affects
+how Wget converts URLs specified as arguments from locale to @sc{utf-8} for
+IRI support.
+
+Wget uses the function @code{nl_langinfo()} and then the @code{CHARSET}
+environment variable to get the locale. If it fails, @sc{ascii} is used.
+
+You can set the default locale using the @code{locale} command in
+@file{.wgetrc}. That setting may be overridden from the command line.
+
  @cindex progress indicator
  @cindex dot style
  @item --progress=@var{type}
@@ -706,6 +730,21 @@ command line.  The exception is that, when the output is not a TTY, the
  ``dot'' progress will be favored over ``bar''.  To force the bar output,
  use @samp{--progress=bar:force}.
  
+@cindex remote encoding
+@item --remote-encoding=@var{encoding}
+
+Force Wget to use @var{encoding} as the default remote server encoding. That
+affects how Wget converts URIs found in files from remote encoding to
+@sc{utf-8} during a recursive fetch. This option is only useful for
+IRI support, for the interpretation of non-@sc{ascii} characters.
+
+For HTTP, remote encoding can be found in HTTP @code{Content-Type}
+header and in HTML @code{Content-Type http-equiv} meta tag.
+
+You can set the default encoding using the @code{remoteencoding}
+command in @file{.wgetrc}. That setting may be overridden from the
+command line.
+
  @item -N
  @itemx --timestamping
  Turn on time-stamping.  @xref{Time-Stamping}, for details.
diff --git a/src/ChangeLog b/src/ChangeLog

index a6dd402cc4836506ad9f86f5c7aa9f5874a42c75..bd833ea008529ef05c031443f608e242919ea480 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -256,11 +256,27 @@
         * init.c (cleanup): Free the memory associated with the base
         option (when DEBUG_MALLOC is defined).
  
+2008-07-02  Xavier Saint  <wget@sxav.eu>
+
+       * iri.c, iri.h  : New function idn_decode() to decode ASCII
+       encoded hostname to the locale.
+
+       * host.c : Show hostname to be resolved both in locale and
+       ASCII encoded.
+
  2008-06-28  Steven Schubiger  <stsc@members.fsf.org>
  
         * retr.c (retrieve_from_file): Allow for reading the links from
         an external file (HTTP/FTP).
  
+2008-06-26  Xavier Saint  <wget@sxav.eu>
+
+       * iri.c, iri.h : New functions locale_to_utf8() and
+       idn_encode() adding basic capabilities of IRI/IDN.
+
+       * url.c : Convert URLs from locale to UTF-8 allowing a basic
+       support of IRI/IDN
+
  2008-06-25  Steven Schubiger  <stsc@members.fsf.org>
  
         * ftp.c (getftp): When spidering a FTP URL, emit a diagnostic
@@ -285,7 +301,7 @@
  
         * http.c: Make -nv --spider include the file's name when it
         exists.
-       
+
  2008-06-22  Micah Cowan  <micah@cowan.name>
  
         * Makefile.am (version.c): Fixed version string invocation so it
@@ -293,12 +309,57 @@
         string vars pointers-to-const, and moved line lengths
         below 80 (in Makefile.am, not in version.c).
  
+2008-06-19  Xavier Saint  <wget@sxav.eu>
+
+       * iri.c, iri.h : New function check_encoding_name() as
+       a preliminary encoding name check.
+
+       * main.c, iri.c : Make use of check_encoding_name().
+
+2008-06-19  Xavier Saint  <wget@sxav.eu>
+
+       * iri.c : Include missing stringprep.h file and add a
+       cast.
+
+       * init.c : set a default initial value for opt.enable_iri,
+       opt.locale and opt.encoding_remote.
+
+2008-06-19  Xavier Saint  <wget@sxav.eu>
+
+       * iri.c, iri.h : Add a new function find_locale() to find
+       out the local system encoding.
+
+       * main.c : Make use of find_locale().
+
+2008-06-19  Xavier Saint  <wget@sxav.eu>
+
+       * html-url.c : Add "content-type" meta tag parsing for
+       retrieving page encoding.
+
+       * iri.h : Make no-op version of parse_charset() return
+       NULL.
+
  2008-06-16  Micah Cowan  <micah@cowan.name>
  
         * http.c (http_loop): When hstat.len is higher than the
         successfully completed content's length, but it's because we
         _set_ it that way, don't abort.
  
+2008-06-14  Xavier Saint  <wget@sxav.eu>
+
+       * iri.c, iri.h : New files.
+
+       * Makefile.am : Add files iri.h and conditional iri.c.
+
+       * build_info.c : Add compiled feature "iri".
+
+       * http.c : include iri.h and parse charset from Content-Type
+       header.
+
+       * init.c, main.c, options.h : if an option isn't supported
+       at compile time, don't get rid of it; show a dummy
+       message instead if it is used.
+
  2008-06-13  Micah Cowan  <micah@cowan.name>
  
         * build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL
@@ -342,11 +403,11 @@
         default.
  
  2008-05-17  Kenny Parnell  <k.parnell@gmail.com>
-       
+
         (cmd_spec_prefer_family): Initialize prefer_family to prefer_none.
  
  2008-05-17  Micah Cowan  <micah@cowan.name>
-       
+
         * main.c (main): Handle Ctrl-D on command-line.
  
  2008-05-15  Steven Schubiger  <schubiger@gmail.com>
@@ -385,7 +446,7 @@
  
         * options.h: Add an according boolean member to the options
         struct.
-       
+
         * sysdep.h: Comment the defines __EXTENSIONS__ and _GNU_SOURCE
         out, because they're now defined independently by config.h.
  
diff --git a/src/Makefile.am b/src/Makefile.am

index 1ced6a90a491fe00b9928c0945e2abc072c12507..58e9b545bd1ef218a1895db9e2a430769f2faab9 100644 (file)
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -30,6 +30,10 @@
  # Version: @VERSION@
  #
  
+if IRI_IS_ENABLED
+IRI_OBJ = iri.c
+endif
+
  # The following line is losing on some versions of make!
  DEFS     = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\"
  LIBS     = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@
@@ -40,8 +44,8 @@ wget_SOURCES = build_info.c cmpt.c connect.c convert.c cookies.c ftp.c    \
                ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
                http.c init.c log.c main.c netrc.c progress.c ptimer.c     \
                recur.c res.c retr.c snprintf.c spider.c url.c             \
-              utils.c                                    \
-              css-url.h connect.h convert.h cookies.h \
+              utils.c $(IRI_OBJ)                                         \
+              css-url.h connect.h convert.h cookies.h                    \
                ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h      \
                http.h http-ntlm.h init.h log.h mswindows.h netrc.h        \
                options.h progress.h ptimer.h recur.h res.h retr.h         \
diff --git a/src/build_info.c b/src/build_info.c

index f60c76ee7f8a76d4561e5340ede902a6c3c81ca8..89ae74f8dcab05365fbdd5c336f5be870d5dc616 100644 (file)
--- a/src/build_info.c
+++ b/src/build_info.c
@@ -103,6 +103,13 @@ const char* (compiled_features[]) =
  #else
    "-gettext",
  #endif
+
+#ifdef ENABLE_IRI
+  "+iri",
+#else
+  "-iri",
+#endif
+
    /* sentinel value */
    NULL
  };
diff --git a/src/connect.c b/src/connect.c

index f46f11c44461017a2dce3549622268f7fc0fea70..0a54c852a89fa804fc212d2c4f1b843800cb704f 100644 (file)
--- a/src/connect.c
+++ b/src/connect.c
@@ -271,9 +271,25 @@ connect_to_ip (const ip_address *ip, int port, const char *print)
    if (print)
      {
        const char *txt_addr = print_address (ip);
-      if (print && 0 != strcmp (print, txt_addr))
-        logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
-                   escnonprint_uri (print), txt_addr, port);
+      if (0 != strcmp (print, txt_addr))
+        {
+                                 char *str = NULL, *name;
+
+          if (opt.enable_iri && (name = idn_decode ((char *) print)) != NULL)
+            {
+              int len = strlen (print) + strlen (name) + 4;
+              str = xmalloc (len);
+              snprintf (str, len, "%s (%s)", name, print);
+              str[len-1] = '\0';
+              xfree (name);
+            }
+
+          logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
+                     str ? str : escnonprint_uri (print), txt_addr, port);
+
+                                       if (str)
+                                         xfree (str);
+        }
        else
          logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port);
      }
diff --git a/src/convert.c b/src/convert.c

index e72a4b0f50f4d8784ac63114c1da2b526ba96450..54004ad08db3eb819d9a4ec427d2d82abb41d276 100644 (file)
--- a/src/convert.c
+++ b/src/convert.c
@@ -96,7 +96,7 @@ convert_links_in_hashtable (struct hash_table *downloaded_set,
  
        /* Parse the file...  */
        urls = is_css ? get_urls_css_file (file, url) :
-                      get_urls_html (file, url, NULL);
+                      get_urls_html (file, url, NULL, NULL);
  
        /* We don't respect meta_disallow_follow here because, even if
           the file is not followed, we might still want to convert the
diff --git a/src/ftp-basic.c b/src/ftp-basic.c

index 265a1e25b3ca178f0272edcf3cd806334e758bd2..5f250959fcb62b521c45494e0a165f6632490d88 100644 (file)
--- a/src/ftp-basic.c
+++ b/src/ftp-basic.c
@@ -68,7 +68,7 @@ ftp_response (int fd, char **ret_line)
          return FTPRERR;
  
        /* Strip trailing CRLF before printing the line, so that
-         escnonprint doesn't include bogus \012 and \015. */
+         quoting doesn't include bogus \012 and \015. */
        p = strchr (line, '\0');
        if (p > line && p[-1] == '\n')
          *--p = '\0';
diff --git a/src/host.c b/src/host.c

index 7b8c418963bd6f329ea1409698b73efd8dd51f99..b9aaebb45c713e43e31f97805421bc65218143ae 100644 (file)
--- a/src/host.c
+++ b/src/host.c
@@ -718,8 +718,24 @@ lookup_host (const char *host, int flags)
    /* No luck with the cache; resolve HOST. */
  
    if (!silent && !numeric_address)
-    logprintf (LOG_VERBOSE, _("Resolving %s... "), 
-               quotearg_style (escape_quoting_style, host));
+    {
+      char *str = NULL, *name;
+
+      if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL)
+        {
+          int len = strlen (host) + strlen (name) + 4;
+          str = xmalloc (len);
+          snprintf (str, len, "%s (%s)", name, host);
+          str[len-1] = '\0';
+          xfree (name);
+        }
+
+      logprintf (LOG_VERBOSE, _("Resolving %s... "),
+                 quotearg_style (escape_quoting_style, str ? str : host));
+
+      if (str)
+        xfree (str);
+    }
  
  #ifdef ENABLE_IPV6
    {
diff --git a/src/html-url.c b/src/html-url.c

index 95df8bf98e35b5d93b21da4003326fa5539b45f7..e6ab232461d8e01170920e4db7f66a4676daeb90 100644 (file)
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -174,6 +174,10 @@ static const char *additional_attributes[] = {
  static struct hash_table *interesting_tags;
  static struct hash_table *interesting_attributes;
  
+/* Will contain the (last) charset found in 'http-equiv=content-type'
+   meta tags  */
+static char *meta_charset;
+
  static void
  init_interesting (void)
  {
@@ -284,7 +288,7 @@ append_url (const char *link_uri, int position, int size,
            return NULL;
          }
  
-      url = url_parse (link_uri, NULL);
+      url = url_parse (link_uri, NULL, NULL, false);
        if (!url)
          {
            DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@@ -303,7 +307,7 @@ append_url (const char *link_uri, int position, int size,
        DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
                 ctx->document_file, base, link_uri, complete_uri));
  
-      url = url_parse (complete_uri, NULL);
+      url = url_parse (complete_uri, NULL, NULL, false);
        if (!url)
          {
            DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@@ -553,6 +557,23 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
            entry->link_expect_html = 1;
          }
      }
+  else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
+    {
+      /* Handle stuff like:
+         <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
+
+      char *mcharset;
+      char *content = find_attr (tag, "content", NULL);
+      if (!content)
+        return;
+
+      mcharset = parse_charset (content);
+      if (!mcharset)
+        return;
+
+      xfree_null (meta_charset);
+      meta_charset = mcharset;
+    }
    else if (name && 0 == strcasecmp (name, "robots"))
      {
        /* Handle stuff like:
@@ -617,7 +638,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
     <base href=...> and does the right thing.  */
  
  struct urlpos *
-get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
+get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
+               struct iri *iri)
  {
    struct file_memory *fm;
    struct map_context ctx;
@@ -657,6 +679,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
    map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
                   NULL, interesting_attributes);
  
+  /* If meta charset isn't null, override content encoding */
+  if (iri && meta_charset)
+    set_content_encoding (iri, meta_charset);
+
    DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
    if (meta_disallow_follow)
      *meta_disallow_follow = ctx.nofollow;
@@ -726,7 +752,7 @@ get_urls_file (const char *file)
            url_text = merged;
          }
  
-      url = url_parse (url_text, &up_error_code);
+      url = url_parse (url_text, &up_error_code, NULL, false);
        if (!url)
          {
            char *error = url_error (url_text, up_error_code);
diff --git a/src/html-url.h b/src/html-url.h

index a94f0db6c62abd01f929febd4ca669fda9c365ba..2e9ec820e376418f8e00353241f4e7f5043ea88b 100644 (file)
--- a/src/html-url.h
+++ b/src/html-url.h
@@ -44,7 +44,7 @@ struct map_context {
  };
  
  struct urlpos *get_urls_file (const char *);
-struct urlpos *get_urls_html (const char *, const char *, bool *);
+struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *);
  struct urlpos *append_url (const char *, int, int, struct map_context *);
  void free_urlpos (struct urlpos *);
  
diff --git a/src/http.c b/src/http.c

index 50f0c6439c776e50a4a75c25f8beb47de4cfcfc5..ae89c46d642fb5e5c6807e88576b5f94422730e4 100644 (file)
--- a/src/http.c
+++ b/src/http.c
@@ -1366,7 +1366,8 @@ free_hstat (struct http_stat *hs)
     If PROXY is non-NULL, the connection will be made to the proxy
     server, and u->url will be requested.  */
  static uerr_t
-gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
+gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
+         struct iri *iri)
  {
    struct request *req;
  
@@ -1925,7 +1926,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
            hs->local_file = url_file_name (u);
          }
      }
-  
+
    /* TODO: perform this check only once. */
    if (!hs->existence_checked && file_exists_p (hs->local_file))
      {
@@ -1996,7 +1997,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
                local_dot_orig_file_exists = true;
                local_filename = filename_plus_orig_suffix;
              }
-        }      
+        }
  
        if (!local_dot_orig_file_exists)
          /* Couldn't stat() <file>.orig, so try to stat() <file>. */
@@ -2061,9 +2062,20 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
        char *tmp = strchr (type, ';');
        if (tmp)
          {
+          /* sXXXav: only needed if IRI support is enabled */
+          char *tmp2 = tmp + 1;
+
            while (tmp > type && c_isspace (tmp[-1]))
              --tmp;
            *tmp = '\0';
+
+          /* Try to get remote encoding if needed */
+          if (opt.enable_iri && !opt.encoding_remote)
+            {
+              tmp = parse_charset (tmp2);
+              if (tmp)
+                set_content_encoding (iri, tmp);
+            }
          }
      }
    hs->newloc = resp_header_strdup (resp, "Location");
@@ -2348,7 +2360,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
     retried, and retried, and retried, and...  */
  uerr_t
  http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
-           int *dt, struct url *proxy)
+           int *dt, struct url *proxy, struct iri *iri)
  {
    int count;
    bool got_head = false;         /* used for time-stamping and filename detection */
@@ -2359,17 +2371,17 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
    uerr_t err, ret = TRYLIMEXC;
    time_t tmr = -1;               /* remote time-stamp */
    struct http_stat hstat;        /* HTTP status */
-  struct_stat st;  
+  struct_stat st;
    bool send_head_first = true;
    char *file_name;
  
    /* Assert that no value for *LOCAL_FILE was passed. */
    assert (local_file == NULL || *local_file == NULL);
-  
+
    /* Set LOCAL_FILE parameter. */
    if (local_file && opt.output_document)
      *local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document);
-  
+
    /* Reset NEWLOC parameter. */
    *newloc = NULL;
  
@@ -2406,7 +2418,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
           retrieve the file. But if the output_document was given, then this
           test was already done and the file didn't exist. Hence the !opt.output_document */
        logprintf (LOG_VERBOSE, _("\
-File %s already there; not retrieving.\n\n"), 
+File %s already there; not retrieving.\n\n"),
                   quote (hstat.local_file));
        /* If the file is there, we suppose it's retrieved OK.  */
        *dt |= RETROKF;
@@ -2422,10 +2434,10 @@ File %s already there; not retrieving.\n\n"),
  
    /* Reset the counter. */
    count = 0;
-  
+
    /* Reset the document type. */
    *dt = 0;
-  
+
    /* Skip preliminary HEAD request if we're not in spider mode AND
     * if -O was given or HTTP Content-Disposition support is disabled. */
    if (!opt.spider
@@ -2435,7 +2447,7 @@ File %s already there; not retrieving.\n\n"),
    /* Send preliminary HEAD request if -N is given and we have an existing 
     * destination file. */
    file_name = url_file_name (u);
-  if (opt.timestamping 
+  if (opt.timestamping
        && !opt.content_disposition
        && file_exists_p (file_name))
      send_head_first = true;
@@ -2447,10 +2459,10 @@ File %s already there; not retrieving.\n\n"),
        /* Increment the pass counter.  */
        ++count;
        sleep_between_retrievals (count);
-      
+
        /* Get the current time string.  */
        tms = datetime_str (time (NULL));
-      
+
        if (opt.spider && !got_head)
          logprintf (LOG_VERBOSE, _("\
  Spider mode enabled. Check if remote file exists.\n"));
@@ -2459,20 +2471,20 @@ Spider mode enabled. Check if remote file exists.\n"));
        if (opt.verbose)
          {
            char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
-          
-          if (count > 1) 
+
+          if (count > 1)
              {
                char tmp[256];
                sprintf (tmp, _("(try:%2d)"), count);
                logprintf (LOG_NOTQUIET, "--%s--  %s  %s\n",
                           tms, tmp, hurl);
              }
-          else 
+          else
              {
                logprintf (LOG_NOTQUIET, "--%s--  %s\n",
                           tms, hurl);
              }
-          
+
  #ifdef WINDOWS
            ws_changetitle (hurl);
  #endif
@@ -2482,7 +2494,7 @@ Spider mode enabled. Check if remote file exists.\n"));
        /* Default document type is empty.  However, if spider mode is
           on or time-stamping is employed, HEAD_ONLY commands is
           encoded within *dt.  */
-      if (send_head_first && !got_head) 
+      if (send_head_first && !got_head)
          *dt |= HEAD_ONLY;
        else
          *dt &= ~HEAD_ONLY;
@@ -2515,11 +2527,11 @@ Spider mode enabled. Check if remote file exists.\n"));
          *dt &= ~SEND_NOCACHE;
  
        /* Try fetching the document, or at least its head.  */
-      err = gethttp (u, &hstat, dt, proxy);
+      err = gethttp (u, &hstat, dt, proxy, iri);
  
        /* Time?  */
        tms = datetime_str (time (NULL));
-      
+
        /* Get the new location (with or without the redirection).  */
        if (hstat.newloc)
          *newloc = xstrdup (hstat.newloc);
@@ -2558,7 +2570,7 @@ Spider mode enabled. Check if remote file exists.\n"));
                           hstat.statcode);
                ret = WRONGCODE;
              }
-          else 
+          else
              {
                ret = NEWLOCATION;
              }
@@ -2574,7 +2586,7 @@ Spider mode enabled. Check if remote file exists.\n"));
            /* All possibilities should have been exhausted.  */
            abort ();
          }
-      
+
        if (!(*dt & RETROKF))
          {
            char *hurl = NULL;
@@ -2593,11 +2605,13 @@ Spider mode enabled. Check if remote file exists.\n"));
                continue;
              }
            /* Maybe we should always keep track of broken links, not just in
-           * spider mode.  */
-          else if (opt.spider)
+           * spider mode.
+           * Don't log error if it was UTF-8 encoded because we will try
+           * once unencoded. */
+          else if (opt.spider && !iri->utf8_encode)
              {
                /* #### Again: ugly ugly ugly! */
-              if (!hurl) 
+              if (!hurl)
                  hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
                nonexisting_url (hurl);
                logprintf (LOG_NOTQUIET, _("\
@@ -2606,7 +2620,7 @@ Remote file does not exist -- broken link!!!\n"));
            else
              {
                logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
-                         tms, hstat.statcode, 
+                         tms, hstat.statcode,
                           quotearg_style (escape_quoting_style, hstat.error));
              }
            logputs (LOG_VERBOSE, "\n");
diff --git a/src/http.h b/src/http.h

index e0e66cea89b7ca454c46ac18ddcec86b5aa7276c..4769e9d376eb82d0e0f2d8d0fcab92562dea8211 100644 (file)
--- a/src/http.h
+++ b/src/http.h
@@ -33,7 +33,7 @@ as that of the covered work.  */
  struct url;
  
  uerr_t http_loop (struct url *, char **, char **, const char *, int *,
-                 struct url *);
+                 struct url *, struct iri *);
  void save_cookies (void);
  void http_cleanup (void);
  time_t http_atotm (const char *);
diff --git a/src/init.c b/src/init.c

index bbe6b585a74926a2a1d9cb0f59e0bd3529a6016b..23f8cb2cfd954271e8f49107ac761a5cb0a69ba3 100644 (file)
--- a/src/init.c
+++ b/src/init.c
@@ -177,9 +177,11 @@ static const struct {
    { "inet6only",        &opt.ipv6_only,         cmd_boolean },
  #endif
    { "input",            &opt.input_filename,    cmd_file },
+  { "iri",              &opt.enable_iri,        cmd_boolean },
    { "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean },
    { "limitrate",        &opt.limit_rate,        cmd_bytes },
    { "loadcookies",      &opt.cookies_input,     cmd_file },
+  { "locale",           &opt.locale,            cmd_string },
    { "logfile",          &opt.lfilename,         cmd_file },
    { "login",            &opt.ftp_user,          cmd_string },/* deprecated*/
    { "maxredirect",      &opt.max_redirect,      cmd_number },
@@ -219,6 +221,7 @@ static const struct {
    { "referer",          &opt.referer,           cmd_string },
    { "reject",           &opt.rejects,           cmd_vector },
    { "relativeonly",     &opt.relative_only,     cmd_boolean },
+  { "remoteencoding",   &opt.encoding_remote,   cmd_string },
    { "removelisting",    &opt.remove_listing,    cmd_boolean },
    { "restrictfilenames", NULL,                  cmd_spec_restrict_file_names },
    { "retrsymlinks",     &opt.retr_symlinks,     cmd_boolean },
@@ -328,6 +331,14 @@ defaults (void)
    opt.max_redirect = 20;
  
    opt.waitretry = 10;
+
+#ifdef ENABLE_IRI
+  opt.enable_iri = true;
+#else
+  opt.enable_iri = false;
+#endif
+  opt.locale = NULL;
+  opt.encoding_remote = NULL;
  }
  \f
  /* Return the user's home directory (strdup-ed), or NULL if none is
diff --git a/src/iri.c b/src/iri.c

new file mode 100644 (file)

index 0000000..b1e0bf8
--- /dev/null
+++ b/src/iri.c
@@ -0,0 +1,350 @@
+/* IRI related functions.
+   Copyright (C) 2008 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at
+your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget.  If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work.  */
+
+#include "wget.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <iconv.h>
+#include <stringprep.h>
+#include <idna.h>
+#include <errno.h>
+
+#include "utils.h"
+
+/* RFC3987 section 3.1 mandates STD3 ASCII RULES */
+#define IDNA_FLAGS  IDNA_USE_STD3_ASCII_RULES
+
+/* Note: locale encoding is kept in options struct (opt.locale) */
+
+static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
+
+
+/* Extract the value of a "charset=XXX" parameter from STR.
+
+   The search for "charset=" is case-insensitive (strcasestr).  The value
+   runs from just after the '=' up to the next whitespace character or the
+   end of the string, and is then passed through check_encoding_name as a
+   minimal sanity check.
+
+   Returns a freshly allocated copy of the value, or NULL when STR is NULL
+   or empty, when no "charset=" is present, or when the value is rejected
+   by check_encoding_name.  */
+char *
+parse_charset (char *str)
+{
+  char *end;
+  char *name;
+
+  if (str == NULL || *str == '\0')
+    return NULL;
+
+  str = strcasestr (str, "charset=");
+  if (str == NULL)
+    return NULL;
+
+  /* Step over the 8 characters of "charset=" to reach the value.  */
+  str += 8;
+
+  /* sXXXav: which chars should be banned ???  For now only whitespace
+     terminates the value.  */
+  for (end = str; *end != '\0' && !c_isspace (*end); end++)
+    ;
+
+  /* sXXXav: could strdupdelim return NULL ? */
+  name = strdupdelim (str, end);
+
+  /* Do a minimum check on the charset value */
+  if (check_encoding_name (name))
+    return name;
+
+  xfree (name);
+  return NULL;
+}
+
+/* Return the character set name that libidn's stringprep_locale_charset
+   reports for the current locale.  The result of that call is cast to a
+   plain char pointer here; callers should still treat the string as
+   read-only.  */
+char *
+find_locale (void)
+{
+  const char *charset = stringprep_locale_charset ();
+
+  return (char *) charset;
+}
+
+/* Basic check of an encoding name.
+
+   Returns true when every character of ENCODING is ASCII and not
+   whitespace (an empty string therefore passes).  On the first offending
+   character a verbose log message is emitted and false is returned.  */
+bool
+check_encoding_name (char *encoding)
+{
+  char *p;
+
+  for (p = encoding; *p != '\0'; p++)
+    {
+      if (c_isascii (*p) && !c_isspace (*p))
+        continue;
+
+      logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding));
+      return false;
+    }
+
+  return true;
+}
+
+/* Try opening an iconv_t descriptor for conversion from locale to UTF-8.
+
+   This is an unimplemented placeholder: it opens nothing, and nothing in
+   this file calls it (locale_to_utf8 opens its own descriptor).  It
+   reports failure explicitly so that control never falls off the end of
+   a function declared to return bool.  */
+static bool
+open_locale_to_utf8 (void)
+{
+  return false;
+}
+
+/* Try converting string STR from the locale encoding (opt.locale) to
+   UTF-8.
+
+   Returns a newly allocated UTF-8 string on success.  Returns STR itself,
+   unchanged, when the locale is unknown or already UTF-8, when iconv has
+   no converter from the locale to UTF-8, or when the conversion fails.
+   Callers therefore have to compare the result against STR to know
+   whether they were handed a new string.  */
+const char *
+locale_to_utf8 (const char *str)
+{
+  iconv_t l2u;
+  char *new;
+  bool converted;
+
+  /* That shouldn't happen, just in case */
+  if (!opt.locale)
+    {
+      logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
+      opt.locale = find_locale ();
+    }
+
+  if (!opt.locale || !strcasecmp (opt.locale, "utf-8"))
+    return str;
+
+  /* iconv_open signals failure by returning (iconv_t)(-1); only then is
+     the conversion unsupported.  */
+  l2u = iconv_open ("UTF-8", opt.locale);
+  if (l2u == (iconv_t)(-1))
+    {
+      logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
+                 quote (opt.locale), quote ("UTF-8"));
+      return str;
+    }
+
+  converted = do_conversion (l2u, (char *) str, strlen (str), &new);
+
+  /* The descriptor is only needed for this one call; release it whatever
+     the outcome so it is not leaked.  */
+  iconv_close (l2u);
+
+  if (converted)
+    return (const char *) new;
+
+  return str;
+}
+
+/* Do the conversion according to the passed conversion descriptor CD.
+
+   IN/INLEN describe the input bytes.  On success, true is returned and
+   *OUT points to a newly allocated, NUL-terminated buffer holding the
+   transcoded string.
+
+   Bytes that iconv rejects as an invalid or incomplete multibyte sequence
+   are copied to the output unchanged, one at a time, and conversion then
+   resumes after them; a verbose message is logged the first time this
+   happens.
+
+   If iconv fails with an errno that is not handled here, false is
+   returned.  In that case *OUT is left pointing at the NUL-terminated
+   partial output, which is still allocated; callers should not rely on
+   its content.  */
+static bool
+do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
+{
+  /* sXXXav : hummm hard to guess... */
+  size_t cap = inlen * 2;       /* usable output bytes, terminator excluded */
+  size_t avail = cap;           /* output bytes still free after WP */
+  bool warned = false;          /* invalid-sequence message already logged */
+  char *buf = xmalloc (cap + 1);
+  char *wp = buf;               /* next output byte; advanced by iconv */
+
+  for (;;)
+    {
+      if (iconv (cd, &in, &inlen, &wp, &avail) != (size_t)(-1))
+        {
+          /* WP never moves past buf + cap, and cap + 1 bytes were
+             allocated, so there is always room for the terminator.  */
+          *wp = '\0';
+          *out = buf;
+          return true;
+        }
+
+      /* Incomplete or invalid multibyte sequence */
+      if (errno == EINVAL || errno == EILSEQ)
+        {
+          if (!warned)
+            {
+              logprintf (LOG_VERBOSE,
+                      "Incomplete or invalide multibyte sequence encountered\n");
+              warned = true;
+            }
+
+          if (avail > 0)
+            {
+              /* Pass the offending byte through untouched and retry
+                 from the following one.  */
+              *wp++ = *in++;
+              inlen--;
+              avail--;
+              continue;
+            }
+
+          /* No room left for even one byte: grow the buffer below; the
+             next iteration hits the same byte again and copies it.  */
+        }
+      else if (errno != E2BIG)  /* Weird, we got an unspecified error */
+        {
+          logprintf (LOG_VERBOSE, "Unhandled errno %d\n", errno);
+          break;
+        }
+
+      /* Output buffer full: move what has been produced so far into a
+         larger buffer.  The amount really written is measured from WP
+         (iconv may stop with a few bytes still unused), and AVAIL is
+         recomputed from the new capacity so iconv is never told about
+         space that does not exist.  The extra +1 guarantees progress
+         even if INLEN were 0.  */
+      {
+        size_t used = (size_t) (wp - buf);
+        size_t newcap = cap + inlen * 2 + 1;
+        char *grown = xmalloc (newcap + 1);
+
+        memcpy (grown, buf, used);
+        xfree (buf);
+        buf = grown;
+        wp = buf + used;
+        cap = newcap;
+        avail = cap - used;
+      }
+    }
+
+  /* Leave the caller a well-formed string even on failure, so that code
+     which inspects *OUT does not read uninitialized memory.  */
+  *wp = '\0';
+  *out = buf;
+  return false;
+}
+
+/* Try to "ASCII encode" UTF-8 host.
+
+   If I is not marked as UTF-8 encoded (i->utf8_encode is false), HOST is
+   first transcoded from the remote encoding to UTF-8 with remote_to_utf8;
+   when that reports false, NULL is returned.  The UTF-8 host is then
+   passed to idna_to_ascii_8z.
+
+   Returns the string produced by idna_to_ascii_8z on success, or NULL on
+   error.  */
+char *
+idn_encode (struct iri *i, char *host)
+{
+  char *utf8_host = NULL;   /* temporary UTF-8 copy of HOST, if one is made */
+  char *ascii = NULL;
+  int ret;
+
+  /* Encode to UTF-8 if not done */
+  if (!i->utf8_encode)
+    {
+      if (!remote_to_utf8 (i, (const char *) host,
+                           (const char **) &utf8_host))
+        return NULL;  /* Nothing to encode or an error occured */
+      host = utf8_host;
+    }
+
+  /* toASCII UTF-8 NULL terminated string */
+  ret = idna_to_ascii_8z (host, &ascii, IDNA_FLAGS);
+
+  /* The temporary UTF-8 copy is no longer needed once toASCII has run;
+     free it on both the success and the failure path.  */
+  xfree_null (utf8_host);
+
+  if (ret != IDNA_SUCCESS)
+    {
+      logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
+                 quote (idna_strerror (ret)));
+      return NULL;
+    }
+
+  return ascii;
+}
+
+/* Try to decode an "ASCII encoded" host with idna_to_unicode_8zlz.
+   Returns the decoded domain (in the locale encoding) on success.  On
+   failure the libidn error is logged at verbose level and NULL is
+   returned.  */
+char *
+idn_decode (char *host)
+{
+  char *decoded;
+  int rc = idna_to_unicode_8zlz (host, &decoded, IDNA_FLAGS);
+
+  if (rc == IDNA_SUCCESS)
+    return decoded;
+
+  logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", rc,
+             quote (idna_strerror (rc)));
+  return NULL;
+}
+
+/* Try to transcode string STR from the remote encoding (I->uri_encoding)
+   to UTF-8.
+
+   Returns true and stores a newly allocated transcoded string in *NEW
+   when the conversion succeeded and actually changed the text.
+
+   Returns false when no remote encoding is set, when iconv has no
+   converter for it, when the conversion fails, or when the converted
+   text is identical to STR (nothing had to be converted).  *NEW must not
+   be used by the caller in any of those cases.  */
+bool
+remote_to_utf8 (struct iri *i, const char *str, const char **new)
+{
+  iconv_t cd;
+  bool converted;
+
+  if (!i->uri_encoding)
+    return false;
+
+  cd = iconv_open ("UTF-8", i->uri_encoding);
+  if (cd == (iconv_t)(-1))
+    return false;
+
+  converted = do_conversion (cd, (char *) str, strlen (str), (char **) new);
+
+  iconv_close (cd);
+
+  /* *NEW only holds a valid string after a successful conversion, so it
+     must not be compared (or freed) when do_conversion failed.  */
+  if (!converted)
+    return false;
+
+  /* Test if something was converted */
+  if (!strcmp (str, *new))
+    {
+      xfree ((char *) *new);
+      *new = NULL;    /* do not leave a dangling pointer behind */
+      return false;
+    }
+
+  return true;
+}
+
+/* Allocate and initialize a new iri structure.
+
+   uri_encoding starts as a copy of opt.encoding_remote when that option
+   is set and as NULL otherwise; content_encoding and orig_url start as
+   NULL; utf8_encode mirrors opt.enable_iri.  Release the result with
+   iri_free.  */
+struct iri *
+iri_new (void)
+{
+  struct iri *fresh = xmalloc (sizeof *fresh);
+
+  if (opt.encoding_remote)
+    fresh->uri_encoding = xstrdup (opt.encoding_remote);
+  else
+    fresh->uri_encoding = NULL;
+
+  fresh->content_encoding = NULL;
+  fresh->orig_url = NULL;
+  fresh->utf8_encode = opt.enable_iri;
+
+  return fresh;
+}
+
+/* Release an iri structure together with the strings it owns
+   (uri_encoding, content_encoding and orig_url, each of which may be
+   NULL).  I itself must not be NULL.  */
+void
+iri_free (struct iri *i)
+{
+  xfree_null (i->orig_url);
+  xfree_null (i->content_encoding);
+  xfree_null (i->uri_encoding);
+  xfree (i);
+}
+
+/* Set uri_encoding of struct iri I to a copy of CHARSET (or to NULL when
+   CHARSET is NULL).
+
+   When the user specified a remote encoding (opt.encoding_remote), the
+   current value is kept unless FORCE is true.  Nothing is reallocated if
+   CHARSET already matches the current value, compared
+   case-insensitively.  */
+void
+set_uri_encoding (struct iri *i, char *charset, bool force)
+{
+  DEBUGP (("URI encoding = %s\n", charset ? quote (charset) : "None"));
+
+  if (opt.encoding_remote && !force)
+    return;
+
+  if (i->uri_encoding != NULL)
+    {
+      if (charset != NULL && strcasecmp (i->uri_encoding, charset) == 0)
+        return;
+      xfree (i->uri_encoding);
+    }
+
+  if (charset != NULL)
+    i->uri_encoding = xstrdup (charset);
+  else
+    i->uri_encoding = NULL;
+}
+
+/* Set content_encoding of struct iri I to a copy of CHARSET (or to NULL
+   when CHARSET is NULL).
+
+   This is a no-op when the user specified a remote encoding
+   (opt.encoding_remote).  Nothing is reallocated if CHARSET already
+   matches the current value, compared case-insensitively.  */
+void
+set_content_encoding (struct iri *i, char *charset)
+{
+  DEBUGP (("URI content encoding = %s\n", charset ? quote (charset) : "None"));
+
+  if (opt.encoding_remote)
+    return;
+
+  if (i->content_encoding != NULL)
+    {
+      if (charset != NULL && strcasecmp (i->content_encoding, charset) == 0)
+        return;
+      xfree (i->content_encoding);
+    }
+
+  if (charset != NULL)
+    i->content_encoding = xstrdup (charset);
+  else
+    i->content_encoding = NULL;
+}
+
diff --git a/src/iri.h b/src/iri.h

new file mode 100644 (file)

index 0000000..6ad2bec
--- /dev/null
+++ b/src/iri.h
@@ -0,0 +1,71 @@
+/* Internationalization related declarations.
+   Copyright (C) 2008 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget.  If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work.  */
+
+#ifndef IRI_H
+#define IRI_H
+
+/* Character-encoding state that travels with a URL being retrieved.
+   The string members are owned by the structure and are released by
+   iri_free.  */
+struct iri {
+  char *uri_encoding;      /* Encoding of the uri to fetch */
+  char *content_encoding;  /* Encoding of links inside the fetched file */
+  char *orig_url;          /* NOTE(review): purpose undocumented; presumably the URL before conversion -- TODO confirm */
+  bool utf8_encode;        /* Will/Is the current url encoded in utf8 */
+};
+
+/* With ENABLE_IRI defined, the functions below are the real
+   implementations.  */
+#ifdef ENABLE_IRI
+
+char *parse_charset (char *str);
+char *find_locale (void);
+bool check_encoding_name (char *encoding);
+const char *locale_to_utf8 (const char *str);
+char *idn_encode (struct iri *i, char *host);
+char *idn_decode (char *host);
+bool remote_to_utf8 (struct iri *i, const char *str, const char **new);
+struct iri *iri_new (void);
+void iri_free (struct iri *i);
+void set_uri_encoding (struct iri *i, char *charset, bool force);
+void set_content_encoding (struct iri *i, char *charset);
+
+#else /* ENABLE_IRI */
+
+/* Without ENABLE_IRI every function is replaced by a macro so that call
+   sites compile unchanged: lookups and conversions yield NULL or false,
+   locale_to_utf8 yields its argument, iri_new yields the address of
+   dummy_iri, and iri_free / set_uri_encoding / set_content_encoding
+   expand to nothing.  */
+
+/* NOTE(review): this defines an object in a header, so every translation
+   unit that includes it gets its own (tentative) definition; linking
+   presumably relies on the toolchain merging them -- consider an extern
+   declaration here plus a single definition in one .c file.  */
+struct iri dummy_iri;
+
+#define parse_charset(str)          NULL
+#define find_locale()               NULL
+#define check_encoding_name(str)    false
+#define locale_to_utf8(str)         (str)
+#define idn_encode(a,b)             NULL
+#define idn_decode(str)             NULL
+#define remote_to_utf8(a,b,c)       false
+#define iri_new()                   (&dummy_iri)
+#define iri_free(a)
+#define set_uri_encoding(a,b,c)
+#define set_content_encoding(a,b)
+
+#endif /* ENABLE_IRI */
+#endif /* IRI_H */
diff --git a/src/log.c b/src/log.c

index e84e5c61c22f45ac4a07cc3d77e0a667e90a3c61..b62bf9dd3781cc1e9d2b7a7e59d0471a6ac6b161 100644 (file)
--- a/src/log.c
+++ b/src/log.c
@@ -43,7 +43,7 @@ as that of the covered work.  */
  #include "utils.h"
  #include "log.h"
  
-/* This file impplement support for "logging".  Logging means printing
+/* This file implements support for "logging".  Logging means printing
     output, plus several additional features:
  
     - Cataloguing output by importance.  You can specify that a log
diff --git a/src/main.c b/src/main.c

index b8039d6b16b1526a31423fd0c6e0a72ce089f674..69df08a73d443c1f8192a88988c67422be1fd4a6 100644 (file)
--- a/src/main.c
+++ b/src/main.c
@@ -202,10 +202,12 @@ static struct cmdline_option option_data[] =
      { "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 },
  #endif
      { "input-file", 'i', OPT_VALUE, "input", -1 },
+    { "iri", 0, OPT_BOOLEAN, "iri", -1 },
      { "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 },
      { "level", 'l', OPT_VALUE, "reclevel", -1 },
      { "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
      { "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
+    { "locale", 0, OPT_VALUE, "locale", -1 },
      { "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
      { "mirror", 'm', OPT_BOOLEAN, "mirror", -1 },
      { "no", 'n', OPT__NO, NULL, required_argument },
@@ -239,6 +241,7 @@ static struct cmdline_option option_data[] =
      { "referer", 0, OPT_VALUE, "referer", -1 },
      { "reject", 'R', OPT_VALUE, "reject", -1 },
      { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
+    { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1},
      { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
      { "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
      { "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
@@ -1077,6 +1080,27 @@ for details.\n\n"));
        exit (1);
      }
  
+#ifdef ENABLE_IRI
+  if (opt.enable_iri)
+    {
+      if (opt.locale && !check_encoding_name (opt.locale))
+        opt.locale = NULL;
+
+      if (!opt.locale)
+        opt.locale = find_locale ();
+
+      if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote))
+        opt.encoding_remote = NULL;
+    }
+#else
+  if (opt.enable_iri || opt.locale || opt.encoding_remote)
+    {
+      /* sXXXav : be more specific... */
+      printf(_("This version does not have support for IRIs\n"));
+      exit(1);
+    }
+#endif
+
    if (opt.ask_passwd)
      {
        opt.passwd = prompt_for_password ();
@@ -1179,7 +1203,7 @@ WARNING: Can't reopen standard output in binary mode;\n\
      {
        char *filename = NULL, *redirected_URL = NULL;
        int dt, url_err;
-      struct url *url_parsed = url_parse (*t, &url_err);
+      struct url *url_parsed = url_parse (*t, &url_err, NULL, false);
  
        if (!url_parsed)
          {
@@ -1199,12 +1223,18 @@ WARNING: Can't reopen standard output in binary mode;\n\
                if (url_scheme (*t) == SCHEME_FTP) 
                  opt.follow_ftp = 1;
            
-              status = retrieve_tree (url_parsed);
+              status = retrieve_tree (url_parsed, NULL);
  
                opt.follow_ftp = old_follow_ftp;
              }
            else
-            status = retrieve_url (url_parsed, *t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+          {
+            struct iri *i = iri_new ();
+            set_uri_encoding (i, opt.locale, true);
+            status = retrieve_url (url_parsed, *t, &filename, &redirected_URL,
+                                   NULL, &dt, opt.recursive, i);
+            iri_free (i);
+          }
  
            if (opt.delete_after && file_exists_p(filename))
              {
diff --git a/src/options.h b/src/options.h

index 18f031c4aa0508c7915b41e77eb728154bbfdb96..8dc7fee2ad3d34bd0e79afae75769b6bbb95ad33 100644 (file)
--- a/src/options.h
+++ b/src/options.h
@@ -235,6 +235,10 @@ struct options
    bool content_disposition;    /* Honor HTTP Content-Disposition header. */
    bool auth_without_challenge;  /* Issue Basic authentication creds without
                                     waiting for a challenge. */
+
+  bool enable_iri;
+  char *encoding_remote;
+  char *locale;
  };
  
  extern struct options opt;
diff --git a/src/recur.c b/src/recur.c

index 2e067505c1c6a521c8ebcef66178ad7347fb8184..83a9b4ee84d5b155196263841ea37214fd3d014c 100644 (file)
--- a/src/recur.c
+++ b/src/recur.c
@@ -51,7 +51,7 @@ as that of the covered work.  */
  #include "html-url.h"
  #include "css-url.h"
  #include "spider.h"
-
+\f
  /* Functions for maintaining the URL queue.  */
  
  struct queue_element {
@@ -60,6 +60,7 @@ struct queue_element {
    int depth;                    /* the depth */
    bool html_allowed;            /* whether the document is allowed to
                                     be treated as HTML. */
+  struct iri *iri;                /* sXXXav */
    bool css_allowed;             /* whether the document is allowed to
                                     be treated as CSS. */
    struct queue_element *next;   /* next element in queue */
@@ -93,11 +94,12 @@ url_queue_delete (struct url_queue *queue)
     into it.  */
  
  static void
-url_enqueue (struct url_queue *queue,
+url_enqueue (struct url_queue *queue, struct iri *i,
               const char *url, const char *referer, int depth,
               bool html_allowed, bool css_allowed)
  {
    struct queue_element *qel = xnew (struct queue_element);
+  qel->iri = i;
    qel->url = url;
    qel->referer = referer;
    qel->depth = depth;
@@ -112,6 +114,10 @@ url_enqueue (struct url_queue *queue,
    DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
    DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
  
+  if (i)
+    DEBUGP (("[IRI Enqueuing %s with %s\n", quote_n (0, url),
+             i->uri_encoding ? quote_n (1, i->uri_encoding) : "None"));
+
    if (queue->tail)
      queue->tail->next = qel;
    queue->tail = qel;
@@ -124,7 +130,7 @@ url_enqueue (struct url_queue *queue,
     succeeded, or false if the queue is empty.  */
  
  static bool
-url_dequeue (struct url_queue *queue,
+url_dequeue (struct url_queue *queue, struct iri **i,
               const char **url, const char **referer, int *depth,
               bool *html_allowed, bool *css_allowed)
  {
@@ -137,6 +143,7 @@ url_dequeue (struct url_queue *queue,
    if (!queue->head)
      queue->tail = NULL;
  
+  *i = qel->iri;
    *url = qel->url;
    *referer = qel->referer;
    *depth = qel->depth;
@@ -153,9 +160,9 @@ url_dequeue (struct url_queue *queue,
  }
  \f
  static bool download_child_p (const struct urlpos *, struct url *, int,
-                              struct url *, struct hash_table *);
+                              struct url *, struct hash_table *, struct iri *);
  static bool descend_redirect_p (const char *, struct url *, int,
-                                struct url *, struct hash_table *);
+                                struct url *, struct hash_table *, struct iri *);
  
  
  /* Retrieve a part of the web beginning with START_URL.  This used to
@@ -180,7 +187,7 @@ static bool descend_redirect_p (const char *, struct url *, int,
            options, add it to the queue. */
  
  uerr_t
-retrieve_tree (struct url *start_url_parsed)
+retrieve_tree (struct url *start_url_parsed, struct iri *pi)
  {
    uerr_t status = RETROK;
  
@@ -191,12 +198,28 @@ retrieve_tree (struct url *start_url_parsed)
       the queue, but haven't been downloaded yet.  */
    struct hash_table *blacklist;
  
+  int up_error_code;
+  struct iri *i = iri_new ();
+
+#define COPYSTR(x)  (x) ? xstrdup(x) : NULL;
+  /* Duplicate pi struct if not NULL */
+  if (pi)
+    {
+      i->uri_encoding = COPYSTR (pi->uri_encoding);
+      i->content_encoding = COPYSTR (pi->content_encoding);
+      i->utf8_encode = pi->utf8_encode;
+    }
+  else
+    set_uri_encoding (i, opt.locale, true);
+#undef COPYSTR
+
    queue = url_queue_new ();
    blacklist = make_string_hash_table (0);
  
    /* Enqueue the starting URL.  Use start_url_parsed->url rather than
       just URL so we enqueue the canonical form of the URL.  */
-  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
+  url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
+               false);
    string_set_add (blacklist, start_url_parsed->url);
  
    while (1)
@@ -215,7 +238,7 @@ retrieve_tree (struct url *start_url_parsed)
  
        /* Get the next URL from the queue... */
  
-      if (!url_dequeue (queue,
+      if (!url_dequeue (queue, (struct iri **) &i,
                          (const char **)&url, (const char **)&referer,
                          &depth, &html_allowed, &css_allowed))
          break;
@@ -255,20 +278,10 @@ retrieve_tree (struct url *start_url_parsed)
          {
            int dt = 0, url_err;
            char *redirected = NULL;
-          struct url *url_parsed = url_parse (url, &url_err);
+          struct url *url_parsed = url_parse (url, &url_err, i, false);
  
-          if (!url_parsed)
-            {
-              char *error = url_error (url, url_err);
-              logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
-              xfree (error);
-              status = URLERROR;
-            }
-          else
-            {
-              status = retrieve_url (url_parsed, url, &file, &redirected,
-                                     referer, &dt, false);
-            }
+          status = retrieve_url (url_parsed, url, &file, &redirected, referer,
+                                 &dt, false, i);
  
            if (html_allowed && file && status == RETROK
                && (dt & RETROKF) && (dt & TEXTHTML))
@@ -296,7 +309,7 @@ retrieve_tree (struct url *start_url_parsed)
                if (descend)
                  {
                    if (!descend_redirect_p (redirected, url_parsed, depth,
-                                           start_url_parsed, blacklist))
+                                           start_url_parsed, blacklist, i))
                      descend = false;
                    else
                      /* Make sure that the old pre-redirect form gets
@@ -349,7 +362,7 @@ retrieve_tree (struct url *start_url_parsed)
            bool meta_disallow_follow = false;
            struct urlpos *children
              = is_css ? get_urls_css_file (file, url) :
-                       get_urls_html (file, url, &meta_disallow_follow);
+                       get_urls_html (file, url, &meta_disallow_follow, i);
  
            if (opt.use_robots && meta_disallow_follow)
              {
@@ -360,7 +373,8 @@ retrieve_tree (struct url *start_url_parsed)
            if (children)
              {
                struct urlpos *child = children;
-              struct url *url_parsed = url_parsed = url_parse (url, NULL);
+              struct url *url_parsed = url_parse (url, NULL, i, false);
+              struct iri *ci;
                char *referer_url = url;
                bool strip_auth = (url_parsed != NULL
                                   && url_parsed->user != NULL);
@@ -377,9 +391,11 @@ retrieve_tree (struct url *start_url_parsed)
                    if (dash_p_leaf_HTML && !child->link_inline_p)
                      continue;
                    if (download_child_p (child, url_parsed, depth, start_url_parsed,
-                                        blacklist))
+                                        blacklist, i))
                      {
-                      url_enqueue (queue, xstrdup (child->url->url),
+                      ci = iri_new ();
+                      set_uri_encoding (ci, i->content_encoding, false);
+                      url_enqueue (queue, ci, xstrdup (child->url->url),
                                     xstrdup (referer_url), depth + 1,
                                     child->link_expect_html,
                                     child->link_expect_css);
@@ -397,18 +413,18 @@ retrieve_tree (struct url *start_url_parsed)
              }
          }
  
-      if (file 
-          && (opt.delete_after 
+      if (file
+          && (opt.delete_after
                || opt.spider /* opt.recursive is implicitely true */
                || !acceptable (file)))
          {
            /* Either --delete-after was specified, or we loaded this
-             (otherwise unneeded because of --spider or rejected by -R) 
-             HTML file just to harvest its hyperlinks -- in either case, 
+             (otherwise unneeded because of --spider or rejected by -R)
+             HTML file just to harvest its hyperlinks -- in either case,
               delete the local file. */
            DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                     opt.delete_after ? "--delete-after" :
-                   (opt.spider ? "--spider" : 
+                   (opt.spider ? "--spider" :
                      "recursive rejection criteria")));
            logprintf (LOG_VERBOSE,
                       (opt.delete_after || opt.spider
@@ -424,6 +440,7 @@ retrieve_tree (struct url *start_url_parsed)
        xfree (url);
        xfree_null (referer);
        xfree_null (file);
+      iri_free (i);
      }
  
    /* If anything is left of the queue due to a premature exit, free it
@@ -432,9 +449,11 @@ retrieve_tree (struct url *start_url_parsed)
      char *d1, *d2;
      int d3;
      bool d4, d5;
-    while (url_dequeue (queue,
+    struct iri *d6;
+    while (url_dequeue (queue, (struct iri **)&d6,
                          (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
        {
+        iri_free (d6);
          xfree (d1);
          xfree_null (d2);
        }
@@ -461,7 +480,8 @@ retrieve_tree (struct url *start_url_parsed)
  
  static bool
  download_child_p (const struct urlpos *upos, struct url *parent, int depth,
-                  struct url *start_url_parsed, struct hash_table *blacklist)
+                  struct url *start_url_parsed, struct hash_table *blacklist,
+                  struct iri *iri)
  {
    struct url *u = upos->url;
    const char *url = u->url;
@@ -471,7 +491,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
  
    if (string_set_contains (blacklist, url))
      {
-      if (opt.spider) 
+      if (opt.spider)
          {
            char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
            DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
@@ -602,7 +622,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
        if (!specs)
          {
            char *rfile;
-          if (res_retrieve_file (url, &rfile))
+          if (res_retrieve_file (url, &rfile, iri))
              {
                specs = res_parse_from_file (rfile);
  
@@ -657,7 +677,8 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
  
  static bool
  descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
-                    struct url *start_url_parsed, struct hash_table *blacklist)
+                    struct url *start_url_parsed, struct hash_table *blacklist,
+                    struct iri *iri)
  {
    struct url *new_parsed;
    struct urlpos *upos;
@@ -665,14 +686,14 @@ descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
  
    assert (orig_parsed != NULL);
  
-  new_parsed = url_parse (redirected, NULL);
+  new_parsed = url_parse (redirected, NULL, NULL, false);
    assert (new_parsed != NULL);
  
    upos = xnew0 (struct urlpos);
    upos->url = new_parsed;
  
    success = download_child_p (upos, orig_parsed, depth,
-                              start_url_parsed, blacklist);
+                              start_url_parsed, blacklist, iri);
  
    url_free (new_parsed);
    xfree (upos);
diff --git a/src/recur.h b/src/recur.h

index 7eeb5642cb3bdd58148db206dff4c923f9da0314..76c0ef5f51f511ccec3ebd32d705d2efe7c0004f 100644 (file)
--- a/src/recur.h
+++ b/src/recur.h
@@ -44,6 +44,6 @@ as that of the covered work.  */
  struct urlpos;
  
  void recursive_cleanup (void);
-uerr_t retrieve_tree (struct url *);
+uerr_t retrieve_tree (struct url *, struct iri *);
  
  #endif /* RECUR_H */
diff --git a/src/res.c b/src/res.c

index 20ffe1c8de45947b1d8cd9c262823151703b6bcd..4b0ff82ba5b5a15ca4cae87e607ea2ac37f016e6 100644 (file)
--- a/src/res.c
+++ b/src/res.c
@@ -532,20 +532,26 @@ res_get_specs (const char *host, int port)
     Return true if robots were retrieved OK, false otherwise.  */
  
  bool
-res_retrieve_file (const char *url, char **file)
+res_retrieve_file (const char *url, char **file, struct iri *iri)
  {
+  struct iri *i = iri_new ();
    uerr_t err;
    char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
    int saved_ts_val = opt.timestamping;
    int saved_sp_val = opt.spider, url_err;
    struct url * url_parsed;
  
+  /* Copy server URI encoding for a possible IDNA transformation, no need to
+     encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
+  set_uri_encoding (i, iri->uri_encoding, false);
+  i->utf8_encode = false;
+
    logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
    *file = NULL;
    opt.timestamping = false;
    opt.spider       = false;
  
-  url_parsed = url_parse (robots_url, &url_err);
+  url_parsed = url_parse (robots_url, &url_err, iri, true);
    if (!url_parsed)
      {
        char *error = url_error (robots_url, url_err);
@@ -556,13 +562,14 @@ res_retrieve_file (const char *url, char **file)
    else
      {
        err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
-                          false);
+                          false, i);
        url_free(url_parsed);
      }
  
    opt.timestamping = saved_ts_val;
-  opt.spider       = saved_sp_val;  
+  opt.spider       = saved_sp_val;
    xfree (robots_url);
+  iri_free (i);
  
    if (err != RETROK && *file != NULL)
      {
diff --git a/src/res.h b/src/res.h

index 94a57750ef8316c3e948ec7ed96d41dd00348ddc..5439eaf912bf3e1a48878cb8534c5a523bbd0782 100644 (file)
--- a/src/res.h
+++ b/src/res.h
@@ -40,7 +40,7 @@ bool res_match_path (const struct robot_specs *, const char *);
  void res_register_specs (const char *, int, struct robot_specs *);
  struct robot_specs *res_get_specs (const char *, int);
  
-bool res_retrieve_file (const char *, char **);
+bool res_retrieve_file (const char *, char **, struct iri *);
  
  bool is_robots_txt_url (const char *);
  
diff --git a/src/retr.c b/src/retr.c

index ffa84c38410ef8238ce752714b2c2fb683f23539..0fd936d0d9f540061cf6274c8d869d19259ad823 100644 (file)
--- a/src/retr.c
+++ b/src/retr.c
@@ -598,7 +598,8 @@ static char *getproxy (struct url *);
  
  uerr_t
  retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
-              char **newloc, const char *refurl, int *dt, bool recursive)
+              char **newloc, const char *refurl, int *dt, bool recursive,
+              struct iri *iri)
  {
    uerr_t result;
    char *url;
@@ -626,6 +627,11 @@ retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
    if (file)
      *file = NULL;
  
+ second_try:
+  DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote_n (0, url),
+           iri->uri_encoding ? quote_n (1, iri->uri_encoding) : "None",
+           iri->utf8_encode));
+
    if (!refurl)
      refurl = opt.referer;
  
@@ -639,8 +645,12 @@ retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
    proxy = getproxy (u);
    if (proxy)
      {
+      struct iri *pi = iri_new ();
+      set_uri_encoding (pi, opt.locale, true);
+      pi->utf8_encode = false;
+
        /* Parse the proxy URL.  */
-      proxy_url = url_parse (proxy, &up_error_code);
+      proxy_url = url_parse (proxy, &up_error_code, NULL, true);
        if (!proxy_url)
          {
            char *error = url_error (proxy, up_error_code);
@@ -667,7 +677,7 @@ retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
  #endif
        || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
      {
-      result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
+      result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
      }
    else if (u->scheme == SCHEME_FTP)
      {
@@ -717,8 +727,14 @@ retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
        xfree (mynewloc);
        mynewloc = construced_newloc;
  
+      /* Reset UTF-8 encoding state, keep the URI encoding and reset
+         the content encoding. */
+      iri->utf8_encode = opt.enable_iri;
+      set_content_encoding (iri, NULL);
+      xfree_null (iri->orig_url);
+
        /* Now, see if this new location makes sense. */
-      newloc_parsed = url_parse (mynewloc, &up_error_code);
+      newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
        if (!newloc_parsed)
          {
            char *error = url_error (mynewloc, up_error_code);
@@ -776,8 +792,21 @@ retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
        goto redirected;
      }
  
-  if (local_file)
+  /* Try to not encode in UTF-8 if fetching failed */
+  if (!(*dt & RETROKF) && iri->utf8_encode)
+    {
+      iri->utf8_encode = false;
+      DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
+      goto second_try;
+    }
+
+  if (local_file && *dt & RETROKF)
      {
+      register_download (u->url, local_file);
+      if (redirection_count && 0 != strcmp (origurl, u->url))
+        register_redirection (origurl, u->url);
+      if (*dt & TEXTHTML)
+        register_html (u->url, local_file);
        if (*dt & RETROKF)
          {
            register_download (u->url, local_file);
@@ -830,18 +859,23 @@ retrieve_from_file (const char *file, bool html, int *count)
  {
    uerr_t status;
    struct urlpos *url_list, *cur_url;
+  struct iri *iri = iri_new();
  
    char *input_file = NULL;
    const char *url = file;
  
    status = RETROK;             /* Suppose everything is OK.  */
    *count = 0;                  /* Reset the URL count.  */
-  
+
+  /* sXXXav : Assume filename and links in the file are in the locale */
+  set_uri_encoding (iri, opt.locale, true);
+  set_content_encoding (iri, opt.locale);
+
    if (url_has_scheme (url))
      {
        int dt,url_err;
        uerr_t status;
-      struct url * url_parsed = url_parse(url, &url_err);
+      struct url * url_parsed = url_parse(url, &url_err, NULL, true);
  
        if (!url_parsed)
          {
@@ -854,17 +888,22 @@ retrieve_from_file (const char *file, bool html, int *count)
        if (!opt.base_href)
          opt.base_href = xstrdup (url);
  
-      status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt, false);
+      status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt,
+                             false, iri);
        if (status != RETROK)
          return status;
  
        if (dt & TEXTHTML)
          html = true;
+
+      /* If we have found a content encoding, use it */
+      if (iri->content_encoding)
+         set_uri_encoding (iri, iri->content_encoding, false);
      }
    else
      input_file = (char *) file;
  
-  url_list = (html ? get_urls_html (input_file, NULL, NULL)
+  url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
                : get_urls_file (input_file));
  
    for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
@@ -880,24 +919,28 @@ retrieve_from_file (const char *file, bool html, int *count)
            status = QUOTEXC;
            break;
          }
+
+      /* Reset UTF-8 encode status */
+      iri->utf8_encode = opt.enable_iri;
+      xfree_null (iri->orig_url);
+      iri->orig_url = NULL;
+
        if ((opt.recursive || opt.page_requisites)
            && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
          {
            int old_follow_ftp = opt.follow_ftp;
  
            /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
-          if (cur_url->url->scheme == SCHEME_FTP) 
+          if (cur_url->url->scheme == SCHEME_FTP)
              opt.follow_ftp = 1;
-          
-          status = retrieve_tree (cur_url->url);
+
+          status = retrieve_tree (cur_url->url, iri);
  
            opt.follow_ftp = old_follow_ftp;
          }
        else
-        {
-          status = retrieve_url (cur_url->url, cur_url->url->url, &filename,
-                                 &new_file, NULL, &dt, opt.recursive);
-        }
+        status = retrieve_url (cur_url->url, cur_url->url->url, &filename,
+                               &new_file, NULL, &dt, opt.recursive, iri);
  
        if (filename && opt.delete_after && file_exists_p (filename))
          {
@@ -916,6 +959,8 @@ Removing file due to --delete-after in retrieve_from_file():\n"));
    /* Free the linked list of URL-s.  */
    free_urlpos (url_list);
  
+  iri_free (iri);
+
    return status;
  }
  
diff --git a/src/retr.h b/src/retr.h

index 72be93b718d067ab11af0394aca81537ba5047eb..8854b68404179a252f4dca1ce165dec1fec26104 100644 (file)
--- a/src/retr.h
+++ b/src/retr.h
@@ -53,7 +53,8 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
  char *fd_read_hunk (int, hunk_terminator_t, long, long);
  char *fd_read_line (int);
  
-uerr_t retrieve_url (struct url *, const char *, char **, char **, const char *, int *, bool);
+uerr_t retrieve_url (struct url *, const char *, char **, char **,
+                     const char *, int *, bool, struct iri *);
  uerr_t retrieve_from_file (const char *, bool, int *);
  
  const char *retr_rate (wgint, double);
diff --git a/src/url.c b/src/url.c

index d416fcf7fae38e6ee5b19276773ace45425d41a5..4c22a9fc6e460c5d34cc63ac81f0d6d3ca69c453 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -649,7 +649,7 @@ static const char *parse_errors[] = {
     error, and if ERROR is not NULL, also set *ERROR to the appropriate
     error code. */
  struct url *
-url_parse (const char *url, int *error)
+url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
  {
    struct url *u;
    const char *p;
@@ -668,7 +668,8 @@ url_parse (const char *url, int *error)
    int port;
    char *user = NULL, *passwd = NULL;
  
-  char *url_encoded = NULL;
+  const char *url_encoded = NULL;
+  char *new_url = NULL;
  
    int error_code;
  
@@ -679,9 +680,26 @@ url_parse (const char *url, int *error)
        goto error;
      }
  
-  url_encoded = reencode_escapes (url);
+  if (iri && iri->utf8_encode)
+    {
+      iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url);
+      if (!iri->utf8_encode)
+        new_url = NULL;
+      else
+        iri->orig_url = xstrdup (url);
+    }
+
+  /* XXX XXX Could that change introduce (security) bugs ???  XXX XXX*/
+  if (percent_encode)
+    url_encoded = reencode_escapes (new_url ? new_url : url);
+  else
+     url_encoded = new_url ? new_url : url;
+
    p = url_encoded;
  
+  if (new_url && url_encoded != new_url)
+    xfree (new_url);
+
    p += strlen (supported_schemes[scheme].leading_string);
    uname_b = p;
    p = url_skip_credentials (p);
@@ -851,6 +869,18 @@ url_parse (const char *url, int *error)
      {
        url_unescape (u->host);
        host_modified = true;
+
+      /* Apply IDNA regardless of iri->utf8_encode status */
+      if (opt.enable_iri && iri)
+        {
+          char *new = idn_encode (iri, u->host);
+          if (new)
+            {
+              xfree (u->host);
+              u->host = new;
+              host_modified = true;
+            }
+        }
      }
  
    if (params_b)
@@ -860,7 +890,7 @@ url_parse (const char *url, int *error)
    if (fragment_b)
      u->fragment = strdupdelim (fragment_b, fragment_e);
  
-  if (path_modified || u->fragment || host_modified || path_b == path_e)
+  if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
      {
        /* If we suspect that a transformation has rendered what
           url_string might return different from URL_ENCODED, rebuild
@@ -875,7 +905,7 @@ url_parse (const char *url, int *error)
        if (url_encoded == url)
          u->url = xstrdup (url);
        else
-        u->url = url_encoded;
+        u->url = (char *) url_encoded;
      }
  
    return u;
@@ -883,7 +913,7 @@ url_parse (const char *url, int *error)
   error:
    /* Cleanup in case of error: */
    if (url_encoded && url_encoded != url)
-    xfree (url_encoded);
+    xfree ((char *) url_encoded);
  
    /* Transmit the error code to the caller, if the caller wants to
       know.  */
@@ -1978,12 +2008,12 @@ schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
  \f
  static int
  getchar_from_escaped_string (const char *str, char *c)
-{  
+{
    const char *p = str;
  
    assert (str && *str);
    assert (c);
-  
+
    if (p[0] == '%')
      {
        if (!c_isxdigit(p[1]) || !c_isxdigit(p[2]))
@@ -2033,7 +2063,7 @@ are_urls_equal (const char *u1, const char *u2)
        p += pp;
        q += qq;
      }
-  
+
    return (*p == 0 && *q == 0 ? true : false);
  }
  \f
@@ -2142,7 +2172,7 @@ test_append_uri_pathel()
    } test_array[] = {
      { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
    };
-  
+
    for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i) 
      {
        struct growable dest;
diff --git a/src/url.h b/src/url.h

index f523e2efd3229104515e4825b29e0559c83503fc..38eafca4b50997aebba6a5f92574c34a93ad0248 100644 (file)
--- a/src/url.h
+++ b/src/url.h
@@ -85,7 +85,7 @@ struct url
  char *url_escape (const char *);
  char *url_escape_unsafe_and_reserved (const char *);
  
-struct url *url_parse (const char *, int *);
+struct url *url_parse (const char *, int *, struct iri *iri, bool percent_encode);
  char *url_error (const char *, int);
  char *url_full_path (const struct url *);
  void url_set_dir (struct url *, const char *);
diff --git a/src/wget.h b/src/wget.h

index d87dfcac85198b4b5f0f305c703029fba68d080d..b17b6709afaac1fcfb65f09b327d8a6365178ffb 100644 (file)
--- a/src/wget.h
+++ b/src/wget.h
@@ -218,6 +218,9 @@ typedef double SUM_SIZE_INT;
  #include "quote.h"
  #include "quotearg.h"
  
+/* Likewise for struct iri definition */
+#include "iri.h"
+
  /* Useful macros used across the code: */
  
  /* The number of elements in an array.  For example:
diff --git a/tests/ChangeLog b/tests/ChangeLog

index 522bd2020c1b57fbaef690882cbe3e1105562827..3dfc60a312935733fd66edf479849ba73a183012 100644 (file)
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,19 @@
+2008-12-04  Micah Cowan  <micah@cowan.name> (not copyrightable)
+
+       * run-px, Test-idn-robots.px: Added test for robots-file
+       downloads.
+
+       * Test-idn-cmd.px, Test-idn-meta.px, Test-idn-headers.px:
+       Fix test names.
+
+2008-11-26  Micah Cowan  <micah@cowan.name>  (not copyrightable)
+
+       * Test-ftp-iri-disabled.px, Test-ftp-iri-fallback.px,
+       Test-ftp-iri.px, Test-idn-cmd.px, Test-idn-headers.px,
+       Test-idn-meta.px, Test-iri-disabled.px,
+       Test-iri-forced-remote.px, Test-iri-list.px, Test-iri.px: More
+       module-scope warnings.
+
  2009-06-14  Micah Cowan  <micah@cowan.name>
  
         * Makefile.am (EXTRA_DIST): Include all the tests, run-px, and
@@ -95,6 +111,51 @@
  
         * run-px: Use strict (thanks Steven Schubiger!).
  
+2008-09-09  Micah Cowan  <micah@cowan.name>
+
+       * Test-idn-cmd.px: Added.
+
+       * run-px: Added Test-idn-cmd.px.
+
+2008-08-28  Micah Cowan  <micah@cowan.name>
+
+       * HTTPServer.pm (run): Allow distinguishing between hostnames,
+       when used as a proxy.
+
+       * Test-idn-headers.px, Test-idn-meta.px: Added.
+
+       * run-px: Added Test-idn-headers.px, Test-idn-meta.px.
+
+       * Test-proxy-auth-basic.px: Use the full URL, rather than just the
+       path (made necessary by the accompanying change to HTTPServer.pm).
+
+2008-08-14  Xavier Saint <wget@sxav.eu>
+       
+       * Test-iri-list.px : Fetch files from a remote list.
+
+2008-08-03  Xavier Saint <wget@sxav.eu>
+
+       * Test-iri.px : HTTP recursive fetch for testing IRI support and
+       fallback.
+
+       * Test-iri-disabled.px : Same file structure as Test-iri.px but with
+       IRI support disabled
+
+       * Test-iri-forced-remote.px : There's a difference between ISO-8859-1
+       and ISO-8859-15 for character 0xA4 (respectively currency sign and
+       euro sign). So with a forced ISO-8859-1 remote encoding, wget should
+       see 0xA4 as a currency sign and transcode it correctly in UTF-8 instead
+       of using the ISO-8859-15 given by the server.
+
+       * Test-ftp-iri.px : Give a file to fetch via FTP in a specific locale
+       and expect wget to fetch the file UTF-8 encoded.
+
+       * Test-ftp-iri-fallback.px : Same as above but wget should fall back on
+       locale encoding to fetch the file.
+
+       * Test-ftp-iri-disabled.px : Same as Test-ftp-iri.px but with IRI support
+       disabled. The UTF-8 encoded file should not be retrieved.
+
  2008-06-22  Micah Cowan  <micah@cowan.name>
  
         * Test-proxied-https-auth.px: Shift exit code so it falls in the
diff --git a/tests/HTTPServer.pm b/tests/HTTPServer.pm

index e3c38e6f5f02c31a580813591b6e21a2726a6dc2..5252b5b8c7c89193857c23cde3eefe8934675cdc 100644 (file)
--- a/tests/HTTPServer.pm
+++ b/tests/HTTPServer.pm
@@ -26,7 +26,8 @@ sub run {
          my $con = $self->accept();
          print STDERR "Accepted a new connection\n" if $log;
          while (my $req = $con->get_request) {
-            my $url_path = $req->url->path;
+            #my $url_path = $req->url->path;
+            my $url_path = $req->url->as_string;
              if ($url_path =~ m{/$}) { # append 'index.html'
                  $url_path .= 'index.html';
              }
diff --git a/tests/Test-ftp-iri-disabled.px b/tests/Test-ftp-iri-disabled.px

new file mode 100755 (executable)

index 0000000..9612286
--- /dev/null
+++ b/tests/Test-ftp-iri-disabled.px
@@ -0,0 +1,51 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use FTPTest;
+
+# FTP fetch with IRI support disabled: the Latin-1 name must be used as typed.
+###############################################################################
+# The same letter (c with cedilla) encoded as ISO-8859-1 and as UTF-8.
+my $ccedilla_l1 = "\xE7";
+my $ccedilla_u8 = "\xC3\xA7";
+
+my $francais = <<EOF;
+Some text.
+EOF
+# No /g needed: the fixture holds a single "\n", which becomes "\r\n".
+$francais =~ s/\n/\r\n/;
+
+# Entries handed to FTPTest as input: path => { content }.  Both spellings
+# are present; the expected output below names only the Latin-1 one.
+my %urls = (
+    "/fran${ccedilla_u8}ais.txt" => {
+        content => $francais,
+    },
+    "/fran${ccedilla_l1}ais.txt" => {
+        content => $francais,
+    },
+);
+# --iri=no turns IRI handling off; the URL carries the raw ISO-8859-1 byte.
+my $cmdline = $WgetTest::WGETPATH . " --iri=no --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    "fran${ccedilla_l1}ais.txt" => {
+        content => $francais,
+    },
+);
+
+###############################################################################
+# Use this script's own name so it is not confused with Test-ftp-iri.px.
+my $the_test = FTPTest->new (name => "Test-ftp-iri-disabled",
+                             input => \%urls, 
+                             cmdline => $cmdline, 
+                             errcode => $expected_error_code, 
+                             output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-ftp-iri-fallback.px b/tests/Test-ftp-iri-fallback.px

new file mode 100755 (executable)

index 0000000..091fd00
--- /dev/null
+++ b/tests/Test-ftp-iri-fallback.px
@@ -0,0 +1,47 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use FTPTest;
+# Only the Latin-1 name exists on the server: wget is expected to fall back
+# to the locale encoding (IRI is left at its default here) and still succeed.
+###############################################################################
+# The same letter (c with cedilla) encoded as ISO-8859-1 and as UTF-8.
+my $ccedilla_l1 = "\xE7";
+my $ccedilla_u8 = "\xC3\xA7";
+
+my $francais = <<EOF;
+Some text.
+EOF
+
+$francais =~ s/\n/\r\n/;
+
+# Entries handed to FTPTest as input: path => { content }.
+my %urls = (
+    "/fran${ccedilla_l1}ais.txt" => {
+        content => $francais,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    "fran${ccedilla_l1}ais.txt" => {
+        content => $francais,
+    },
+);
+
+###############################################################################
+# Use this script's own name so it is not confused with Test-ftp-iri.px.
+my $the_test = FTPTest->new (name => "Test-ftp-iri-fallback",
+                             input => \%urls, 
+                             cmdline => $cmdline, 
+                             errcode => $expected_error_code, 
+                             output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-ftp-iri.px b/tests/Test-ftp-iri.px

new file mode 100755 (executable)

index 0000000..78e2622
--- /dev/null
+++ b/tests/Test-ftp-iri.px
@@ -0,0 +1,48 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use FTPTest;
+
+# The URL is typed in ISO-8859-1; wget is expected to fetch the UTF-8 name.
+###############################################################################
+# The same letter (c with cedilla) encoded as ISO-8859-1 and as UTF-8.
+my $ccedilla_l1 = "\xE7";
+my $ccedilla_u8 = "\xC3\xA7";
+
+my $francais = <<EOF;
+Some text.
+EOF
+
+$francais =~ s/\n/\r\n/;
+
+# Entries handed to FTPTest as input: path => { content }.
+# Only the UTF-8 spelling is present.
+my %urls = (
+    "/fran${ccedilla_u8}ais.txt" => {
+        content => $francais,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    "fran${ccedilla_u8}ais.txt" => {
+        content => $francais,
+    },
+);
+
+###############################################################################
+
+my $the_test = FTPTest->new (name => "Test-ftp-iri",
+                             input => \%urls, 
+                             cmdline => $cmdline, 
+                             errcode => $expected_error_code, 
+                             output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-idn-cmd.px b/tests/Test-idn-cmd.px

new file mode 100755 (executable)

index 0000000..2f97962
--- /dev/null
+++ b/tests/Test-idn-cmd.px
@@ -0,0 +1,51 @@
+#!/usr/bin/perl
+# A host name typed in EUC-JP must be requested under its punycode form.
+use strict;
+use warnings;
+
+use HTTPTest;
+
+# "Kon'nichiwa" <dot> "Japan" as raw bytes (octal escapes), and its punycode.
+my $euc_jp_hostname = "\272\243\306\374\244\317.\306\374\313\334";
+my $punycoded_hostname = 'xn--v9ju72g90p.xn--wgv71a';
+
+###############################################################################
+
+my $result_file = <<EOF;
+Found me!
+EOF
+
+# code, msg, headers, content -- only the punycode URL is present.
+my %urls = (
+    "http://$punycoded_hostname/index.html" => {
+        code => "200",
+        msg => "Yes, please",
+        headers => {
+            'Content-Type' => 'text/plain',
+        },
+        content => $result_file,
+    },
+);
+# -e http_proxy=... points wget at localhost:{{port}} as its HTTP proxy.
+my $cmdline = $WgetTest::WGETPATH . " --debug --iri -rH"
+    . " -e http_proxy=localhost:{{port}} --locale=EUC-JP $euc_jp_hostname";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    "$punycoded_hostname/index.html" => {
+        content => $result_file,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-idn-cmd",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-idn-headers.px b/tests/Test-idn-headers.px

new file mode 100755 (executable)

index 0000000..b94c1cd
--- /dev/null
+++ b/tests/Test-idn-headers.px
@@ -0,0 +1,66 @@
+#!/usr/bin/perl
+# A link whose host is in EUC-JP, on a page whose Content-Type header says
+# charset=EUC-JP, is expected to be fetched from the punycode host.
+use strict;
+use warnings;
+use HTTPTest;
+
+# "Kon'nichiwa" <dot> "Japan" as raw bytes (octal escapes), and its punycode.
+my $euc_jp_hostname = "\272\243\306\374\244\317.\306\374\313\334";
+my $punycoded_hostname = 'xn--v9ju72g90p.xn--wgv71a';
+
+###############################################################################
+
+my $starter_file = <<EOF;
+<a href="http://$euc_jp_hostname/">The link</a>
+EOF
+
+my $result_file = <<EOF;
+Found me!
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    'http://start-here.com/start.html' => {
+        code => "200",
+        msg => "You want fries with that?",
+        headers => {
+            'Content-Type' => 'text/html; charset=EUC-JP',
+        },
+        content => $starter_file,
+    },
+    "http://$punycoded_hostname/index.html" => {
+        code => "200",
+        msg => "Yes, please",
+        headers => {
+            'Content-Type' => 'text/plain',
+        },
+        content => $result_file,
+    },
+);
+# -e http_proxy=... points wget at localhost:{{port}} as its HTTP proxy.
+my $cmdline = $WgetTest::WGETPATH . " --debug --iri -rH"
+    . " -e http_proxy=localhost:{{port}} http://start-here.com/start.html";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    'start-here.com/start.html' => {
+        content => $starter_file,
+    },
+    "$punycoded_hostname/index.html" => {
+        content => $result_file,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-idn-headers",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-idn-meta.px b/tests/Test-idn-meta.px

new file mode 100755 (executable)

index 0000000..2734e1e
--- /dev/null
+++ b/tests/Test-idn-meta.px
@@ -0,0 +1,67 @@
+#!/usr/bin/perl
+# The page's header says charset=UTF-8 but its <meta> tag says EUC-JP; the
+# expected punycode download implies the <meta> charset is the one applied.
+use strict;
+use warnings;
+use HTTPTest;
+
+# "Kon'nichiwa" <dot> "Japan" as raw bytes (octal escapes), and its punycode.
+my $euc_jp_hostname = "\272\243\306\374\244\317.\306\374\313\334";
+my $punycoded_hostname = 'xn--v9ju72g90p.xn--wgv71a';
+
+###############################################################################
+
+my $starter_file = <<EOF;
+<meta http-equiv="Content-Type" content="text/html; charset=EUC-JP" />
+<a href="http://$euc_jp_hostname/">The link</a>
+EOF
+
+my $result_file = <<EOF;
+Found me!
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    'http://start-here.com/start.html' => {
+        code => "200",
+        msg => "You want fries with that?",
+        headers => {
+            'Content-Type' => 'text/html; charset=UTF-8',
+        },
+        content => $starter_file,
+    },
+    "http://$punycoded_hostname/index.html" => {
+        code => "200",
+        msg => "Yes, please",
+        headers => {
+            'Content-Type' => 'text/plain',
+        },
+        content => $result_file,
+    },
+);
+# -e http_proxy=... points wget at localhost:{{port}} as its HTTP proxy.
+my $cmdline = $WgetTest::WGETPATH . " --debug --iri -rH"
+    . " -e http_proxy=localhost:{{port}} http://start-here.com/start.html";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    'start-here.com/start.html' => {
+        content => $starter_file,
+    },
+    "$punycoded_hostname/index.html" => {
+        content => $result_file,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-idn-meta",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-idn-robots.px b/tests/Test-idn-robots.px

new file mode 100755 (executable)

index 0000000..bc9084e
--- /dev/null
+++ b/tests/Test-idn-robots.px
@@ -0,0 +1,78 @@
+#!/usr/bin/perl
+# Recursive fetch from a host typed in EUC-JP: index.html, foo.txt and
+# robots.txt are all expected under the punycode host directory.
+use strict;
+use warnings;
+use HTTPTest;
+
+# "Kon'nichiwa" <dot> "Japan" as raw bytes (octal escapes), and its punycode.
+my $euc_jp_hostname = "\272\243\306\374\244\317.\306\374\313\334";
+my $punycoded_hostname = 'xn--v9ju72g90p.xn--wgv71a';
+
+###############################################################################
+
+my $starter_file = <<EOF;
+<a href="http://$euc_jp_hostname/foo.txt">The link</a>
+EOF
+
+my $result_file = <<EOF;
+Found me!
+EOF
+
+# code, msg, headers, content -- every entry is keyed by the punycode host.
+my %urls = (
+    "http://$punycoded_hostname/index.html" => {
+        code => "200",
+        msg => "Yes, please",
+        headers => {
+            'Content-Type' => 'text/html; charset=EUC-JP',
+        },
+        content => $starter_file,
+    },
+    "http://$punycoded_hostname/foo.txt" => {
+        code => "200",
+        msg => "Uh-huh",
+        headers => {
+            'Content-Type' => 'text/plain',
+        },
+        content => $result_file,
+    },
+    "http://$punycoded_hostname/robots.txt" => {
+        code => "200",
+        msg => "Uh-huh",
+        headers => {
+            'Content-Type' => 'text/plain',
+        },
+        content => '',
+    },
+);
+# -e http_proxy=... points wget at localhost:{{port}} as its HTTP proxy.
+my $cmdline = $WgetTest::WGETPATH . " --debug --iri -rH"
+    . " -e http_proxy=localhost:{{port}} --locale=EUC-JP"
+    . " http://$euc_jp_hostname/";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    "$punycoded_hostname/index.html" => {
+        content => $starter_file,
+    },
+    "$punycoded_hostname/foo.txt" => {
+        content => $result_file,
+    },
+    "$punycoded_hostname/robots.txt" => {
+        content => '',
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-idn-robots",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-iri-disabled.px b/tests/Test-iri-disabled.px

new file mode 100755 (executable)

index 0000000..02fc4d3
--- /dev/null
+++ b/tests/Test-iri-disabled.px
@@ -0,0 +1,197 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use HTTPTest;
+
+# cf. http://en.wikipedia.org/wiki/Latin1
+#     http://en.wikipedia.org/wiki/ISO-8859-15
+
+###############################################################################
+# Recursive HTTP fetch with IRI support switched off (--iri=no).
+# mime : charset found in Content-Type HTTP MIME header
+# meta : charset found in Content-Type meta tag
+#
+# index.html                  mime + file = iso-8859-15
+# p1_français.html            meta + file = iso-8859-1, mime = utf-8
+# p2_één.html                 mime + file = iso-8859-1
+# p3_€€€.html                 meta + file = utf-8, mime = iso-8859-1
+# NOTE(review): %urls in this script serves the p3 pages as text/plain.
+# Each character as a single-byte (Latin-1/-15) and as a UTF-8 byte string.
+my $ccedilla_l15 = "\xE7";
+my $ccedilla_u8 = "\xC3\xA7";
+my $eacute_l1 = "\xE9";
+my $eacute_u8 = "\xC3\xA9";
+my $eurosign_l15 = "\xA4";
+my $eurosign_u8 = "\xE2\x82\xAC";
+
+my $pageindex = <<EOF;
+<html>
+<head>
+  <title>Main Page</title>
+</head>
+<body>
+  <p>
+    Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en fran&ccedil;ais</a>.
+    Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $pagefrancais = <<EOF;
+<html>
+<head>
+  <title>La seule page en français</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
+</head>
+<body>
+  <p>
+    Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $pageeen = <<EOF;
+<html>
+<head>
+  <title>Die enkele nederlandstalige pagina</title>
+</head>
+<body>
+  <p>
+    &Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
+    Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)
+  </p>
+</body>
+</html>
+EOF
+
+my $pageeuro = <<EOF;
+<html>
+<head>
+  <title>Euro page</title>
+</head>
+<body>
+  <p>
+    My tailor isn't rich anymore.
+  </p>
+</body>
+</html>
+EOF
+
+my $page404 = <<EOF;
+<html>
+<head>
+  <title>404</title>
+</head>
+<body>
+  <p>
+    Nop nop nop...
+  </p>
+</body>
+</html>
+EOF
+# Entries handed to HTTPTest as input (code, msg, headers, content).  Each pN
+# page is listed under a UTF-8 and a single-byte percent-encoded path.
+my %urls = (
+    '/index.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-15",
+        },
+        content => $pageindex,
+    },
+    '/robots.txt' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => "",
+    },
+    '/p1_fran%C3%A7ais.html' => {      # UTF-8 encoded
+        code => "200",
+        msg => "File not found",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $pagefrancais,
+    },
+    '/p1_fran%E7ais.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $pagefrancais,
+    },
+    '/p2_%C3%A9%C3%A9n.html' => {      # UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $pageeen,
+    },
+    '/p2_%E9%E9n.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-1",
+        },
+        content => $pageeen,
+    },
+    '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => {        # UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => $pageeuro,
+    },
+    '/p3_%A4%A4%A4.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => $pageeuro,
+    },
+);
+# --iri=no switches IRI support off for this recursive (-r), -nH fetch.
+my $cmdline = $WgetTest::WGETPATH . " --iri=no -nH -r http://localhost:{{port}}/";
+
+my $expected_error_code = 0;
+# Every page is expected under its single-byte name, not the UTF-8 one.
+my %expected_downloaded_files = (
+    'index.html' => {
+        content => $pageindex,
+    },
+    'robots.txt' => {
+        content => "",
+    },
+    "p1_fran${ccedilla_l15}ais.html" => {
+        content => $pagefrancais,
+    },
+    "p2_${eacute_l1}${eacute_l1}n.html" => {
+        content => $pageeen,
+    },
+    "p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html" => {
+        content => $pageeuro,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-iri-disabled",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-iri-forced-remote.px b/tests/Test-iri-forced-remote.px

new file mode 100755 (executable)

index 0000000..8341d51
--- /dev/null
+++ b/tests/Test-iri-forced-remote.px
@@ -0,0 +1,208 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use HTTPTest;
+
+# cf. http://en.wikipedia.org/wiki/Latin1
+#     http://en.wikipedia.org/wiki/ISO-8859-15
+
+###############################################################################
+# Force remote encoding to ISO-8859-1
+#
+# mime : charset found in Content-Type HTTP MIME header
+# meta : charset found in Content-Type meta tag
+#
+# index.html                  mime + file = iso-8859-15
+# p1_français.html            meta + file = iso-8859-1, mime = utf-8
+# p2_één.html                 mime + file = iso-8859-1
+# p3_€€€.html                 no meta tag, mime = text/plain (no charset)
+#
+
+my $ccedilla_l15 = "\xE7";             # c-cedilla, single byte (ISO-8859-15; same byte in ISO-8859-1)
+my $ccedilla_u8 = "\xC3\xA7";          # c-cedilla, UTF-8 byte sequence
+my $eacute_l1 = "\xE9";                # e-acute, single byte (ISO-8859-1)
+my $eacute_u8 = "\xC3\xA9";            # e-acute, UTF-8 byte sequence
+my $eurosign_l15 = "\xA4";             # euro sign when the byte is read as ISO-8859-15
+my $eurosign_u8 = "\xE2\x82\xAC";      # euro sign, UTF-8 byte sequence
+my $currency_l1 = "\xA4";              # same byte as $eurosign_l15, but the generic currency sign in ISO-8859-1
+my $currency_u8 = "\xC2\xA4";          # generic currency sign, UTF-8 byte sequence
+
+my $pageindex = <<EOF;
+<html>
+<head>
+  <title>Main Page</title>
+</head>
+<body>
+  <p>
+    Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en fran&ccedil;ais</a>.
+    Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $pagefrancais = <<EOF;
+<html>
+<head>
+  <title>La seule page en français</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
+</head>
+<body>
+  <p>
+    Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $pageeen = <<EOF;
+<html>
+<head>
+  <title>Die enkele nederlandstalige pagina</title>
+</head>
+<body>
+  <p>
+    &Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
+    Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)
+  </p>
+</body>
+</html>
+EOF
+
+my $pageeuro = <<EOF;
+<html>
+<head>
+  <title>Euro page</title>
+</head>
+<body>
+  <p>
+    My tailor isn't rich anymore.
+  </p>
+</body>
+</html>
+EOF
+
+my $page404 = <<EOF;
+<html>
+<head>
+  <title>404</title>
+</head>
+<body>
+  <p>
+    Nop nop nop...
+  </p>
+</body>
+</html>
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    '/index.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-15",
+        },
+        content => $pageindex,
+    },
+    '/robots.txt' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => "",
+    },
+    '/p1_fran%C3%A7ais.html' => {      # UTF-8 encoded
+        code => "404",
+        msg => "File not found",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $page404,
+    },
+    '/p1_fran%E7ais.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $pagefrancais,
+    },
+    '/p2_%C3%A9%C3%A9n.html' => {      # UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $pageeen,
+    },
+    '/p2_%E9%E9n.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-1",
+        },
+        content => $pageeen,
+    },
+    '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => {        # UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => $pageeuro,
+    },
+    '/p3_%A4%A4%A4.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => $pageeuro,
+    },
+    '/p3_%C2%A4%C2%A4%C2%A4.html' => { # UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => $pageeuro,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --iri --remote-encoding=iso-8859-1 -nH -r http://localhost:{{port}}/";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    'index.html' => {
+        content => $pageindex,
+    },
+    'robots.txt' => {
+        content => "",
+    },
+    "p1_fran${ccedilla_l15}ais.html" => {
+        content => $pagefrancais,
+    },
+    "p2_${eacute_u8}${eacute_u8}n.html" => {
+        content => $pageeen,
+    },
+    "p3_${currency_u8}${currency_u8}${currency_u8}.html" => {
+        content => $pageeuro,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-iri-forced-remote",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-iri-list.px b/tests/Test-iri-list.px

new file mode 100755 (executable)

index 0000000..87cc33c
--- /dev/null
+++ b/tests/Test-iri-list.px
@@ -0,0 +1,174 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use HTTPTest;
+
+# cf. http://en.wikipedia.org/wiki/Latin1
+#     http://en.wikipedia.org/wiki/ISO-8859-15
+###############################################################################
+#
+# mime : charset found in Content-Type HTTP MIME header
+# meta : charset found in Content-Type meta tag
+#
+# index.html                  mime + file = iso-8859-15
+# p1_français.html            meta + file = iso-8859-1, mime = utf-8
+# p2_één.html                 meta + file = utf-8, mime = iso-8859-1
+#
+
+my $ccedilla_l1 = "\xE7";              # c-cedilla, single byte (ISO-8859-1)
+my $ccedilla_u8 = "\xC3\xA7";          # c-cedilla, UTF-8 byte sequence
+my $eacute_l1 = "\xE9";                # e-acute, single byte (ISO-8859-1)
+my $eacute_u8 = "\xC3\xA9";            # e-acute, UTF-8 byte sequence
+
+my $urllist = <<EOF;
+http://localhost:{{port}}/
+http://localhost:{{port}}/p1_fran${ccedilla_l1}ais.html
+http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html
+EOF
+
+my $pageindex = <<EOF;
+<html>
+<head>
+  <title>Main Page</title>
+</head>
+<body>
+  <p>
+       Main page.
+  </p>
+</body>
+</html>
+EOF
+
+my $pagefrancais = <<EOF;
+<html>
+<head>
+  <title>La seule page en français</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
+</head>
+<body>
+  <p>
+    French page.
+  </p>
+</body>
+</html>
+EOF
+
+my $pageeen = <<EOF;
+<html>
+<head>
+  <title>Die enkele nederlandstalige pagina</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
+</head>
+<body>
+  <p>
+    Dutch page.
+  </p>
+</body>
+</html>
+EOF
+
+my $page404 = <<EOF;
+<html>
+<head>
+  <title>404</title>
+</head>
+<body>
+  <p>
+    Nop nop nop...
+  </p>
+</body>
+</html>
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    '/index.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-15",
+        },
+        content => $pageindex,
+    },
+    '/robots.txt' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => "",
+    },
+    '/p1_fran%C3%A7ais.html' => {      # UTF-8 encoded
+        code => "404",
+        msg => "File not found",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $page404,
+    },
+    '/p1_fran%E7ais.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $pagefrancais,
+    },
+    '/p2_%C3%A9%C3%A9n.html' => {      # UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-1",
+        },
+        content => $pageeen,
+    },
+    '/p2_%E9%E9n.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-1",
+        },
+        content => $pageeen,
+    },
+    '/url_list.txt' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain; charset=ISO-8859-1",
+        },
+        content => $urllist,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --iri -d -i http://localhost:{{port}}/url_list.txt";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    'url_list.txt' => {
+        content => $urllist,
+    },
+    'index.html' => {
+        content => $pageindex,
+    },
+    "p1_fran${ccedilla_l1}ais.html" => {
+        content => $pagefrancais,
+    },
+    "p2_${eacute_u8}${eacute_u8}n.html" => {
+        content => $pageeen,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-iri-list",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-iri.px b/tests/Test-iri.px

new file mode 100755 (executable)

index 0000000..738c304
--- /dev/null
+++ b/tests/Test-iri.px
@@ -0,0 +1,225 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use HTTPTest;
+
+# cf. http://en.wikipedia.org/wiki/Latin1
+#     http://en.wikipedia.org/wiki/ISO-8859-15
+
+###############################################################################
+#
+# mime : charset found in Content-Type HTTP MIME header
+# meta : charset found in Content-Type meta tag
+#
+# index.html                  mime + file = iso-8859-15
+# p1_français.html            meta + file = iso-8859-1, mime = utf-8
+# p2_één.html                 meta + file = utf-8, mime = iso-8859-1
+# p3_€€€.html                 meta + file = utf-8, mime = iso-8859-1
+# p4_méér.html                mime + file = utf-8
+#
+
+my $ccedilla_l15 = "\xE7";             # c-cedilla, single byte (ISO-8859-15; same byte in ISO-8859-1)
+my $ccedilla_u8 = "\xC3\xA7";          # c-cedilla, UTF-8 byte sequence
+my $eacute_l1 = "\xE9";                # e-acute, single byte (ISO-8859-1)
+my $eacute_u8 = "\xC3\xA9";            # e-acute, UTF-8 byte sequence
+my $eurosign_l15 = "\xA4";             # euro sign when the byte is read as ISO-8859-15
+my $eurosign_u8 = "\xE2\x82\xAC";      # euro sign, UTF-8 byte sequence
+
+my $pageindex = <<EOF;
+<html>
+<head>
+  <title>Main Page</title>
+</head>
+<body>
+  <p>
+    Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en fran&ccedil;ais</a>.
+    Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $pagefrancais = <<EOF;
+<html>
+<head>
+  <title>La seule page en français</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
+</head>
+<body>
+  <p>
+    Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $pageeen = <<EOF;
+<html>
+<head>
+  <title>Die enkele nederlandstalige pagina</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
+</head>
+<body>
+  <p>
+    &Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
+    Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)<br/>
+    <a href="http://localhost:{{port}}/p4_m${eacute_u8}${eacute_u8}r.html">M&eacute&eacute;r</a>
+  </p>
+</body>
+</html>
+EOF
+
+my $pageeuro = <<EOF;
+<html>
+<head>
+  <title>Euro page</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
+</head>
+<body>
+  <p>
+    My tailor isn't rich anymore.
+  </p>
+</body>
+</html>
+EOF
+
+my $pagemeer = <<EOF;
+<html>
+<head>
+  <title>Bekende supermarkt</title>
+</head>
+<body>
+  <p>
+    Ik ben toch niet gek !
+  </p>
+</body>
+</html>
+EOF
+
+my $page404 = <<EOF;
+<html>
+<head>
+  <title>404</title>
+</head>
+<body>
+  <p>
+    Nop nop nop...
+  </p>
+</body>
+</html>
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    '/index.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-15",
+        },
+        content => $pageindex,
+    },
+    '/robots.txt' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => "",
+    },
+    '/p1_fran%C3%A7ais.html' => {      # UTF-8 encoded
+        code => "404",
+        msg => "File not found",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $page404,
+    },
+    '/p1_fran%E7ais.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $pagefrancais,
+    },
+    '/p2_%C3%A9%C3%A9n.html' => {      # UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-1",
+        },
+        content => $pageeen,
+    },
+    '/p2_%E9%E9n.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-1",
+        },
+        content => $pageeen,
+    },
+    '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => {        # UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain; charset=ISO-8859-1",
+        },
+        content => $pageeuro,
+    },
+    '/p3_%A4%A4%A4.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain; charset=ISO-8859-1",
+        },
+        content => $pageeuro,
+    },
+    '/p4_m%C3%A9%C3%A9r.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain; charset=UTF-8",
+        },
+        content => $pagemeer,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --iri --restrict-file-names=nocontrol -nH -r http://localhost:{{port}}/";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    'index.html' => {
+        content => $pageindex,
+    },
+    'robots.txt' => {
+        content => "",
+    },
+    "p1_fran${ccedilla_l15}ais.html" => {
+        content => $pagefrancais,
+    },
+    "p2_${eacute_u8}${eacute_u8}n.html" => {
+        content => $pageeen,
+    },
+    "p3_${eurosign_u8}${eurosign_u8}${eurosign_u8}.html" => {
+        content => $pageeuro,
+    },
+    "p4_m${eacute_u8}${eacute_u8}r.html" => {
+        content => $pagemeer,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-iri",
+                              input => \%urls,
+                              cmdline => $cmdline,
+                              errcode => $expected_error_code,
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-proxy-auth-basic.px b/tests/Test-proxy-auth-basic.px

index 7b3a638f4c885e9d025f6434ca92ba6c335a607e..033ce0396ae069cddb855bf6f2a7479f103018d4 100755 (executable)
--- a/tests/Test-proxy-auth-basic.px
+++ b/tests/Test-proxy-auth-basic.px
@@ -12,7 +12,7 @@ my $wholefile = "You're all authenticated.\n";
  
  # code, msg, headers, content
  my %urls = (
-    '/needs-auth.txt' => {
+    'http://no.such.domain/needs-auth.txt' => {
          auth_method => 'Basic',
          user => 'fiddle-dee-dee',
          passwd => 'Dodgson',
diff --git a/tests/run-px b/tests/run-px

index 33e4c60075f7b1c3a4a3a32e8ee88beedf6822a1..3b5449bd9059cf57bcb88f431e6b8de60f1e2ee2 100755 (executable)
--- a/tests/run-px
+++ b/tests/run-px
@@ -25,9 +25,20 @@ my @tests = (
      'Test-E-k-K.px',
      'Test-E-k.px',
      'Test-ftp.px',
+    'Test-ftp-iri.px',
+    'Test-ftp-iri-fallback.px',
+    'Test-ftp-iri-disabled.px',
      'Test-HTTP-Content-Disposition-1.px',
      'Test-HTTP-Content-Disposition-2.px',
      'Test-HTTP-Content-Disposition.px',
+    'Test-idn-headers.px',
+    'Test-idn-meta.px',
+    'Test-idn-cmd.px',
+    'Test-idn-robots.px',
+    'Test-iri.px',
+    'Test-iri-disabled.px',
+    'Test-iri-forced-remote.px',
+    'Test-iri-list.px',
      'Test-N-current.px',
      'Test-N-smaller.px',
      'Test-N-no-info.px',
author	Micah Cowan <micah@cowan.name>
	Thu, 25 Jun 2009 08:14:11 +0000 (01:14 -0700)
committer	Micah Cowan <micah@cowan.name>
	Thu, 25 Jun 2009 08:14:11 +0000 (01:14 -0700)
ChangeLog		patch \| blob \| history
configure.ac		patch \| blob \| history
doc/ChangeLog		patch \| blob \| history
doc/sample.wgetrc		patch \| blob \| history
doc/wget.texi		patch \| blob \| history
src/ChangeLog		patch \| blob \| history
src/Makefile.am		patch \| blob \| history
src/build_info.c		patch \| blob \| history
src/connect.c		patch \| blob \| history
src/convert.c		patch \| blob \| history
src/ftp-basic.c		patch \| blob \| history
src/host.c		patch \| blob \| history
src/html-url.c		patch \| blob \| history
src/html-url.h		patch \| blob \| history
src/http.c		patch \| blob \| history
src/http.h		patch \| blob \| history
src/init.c		patch \| blob \| history
src/iri.c	[new file with mode: 0644]	patch \| blob
src/iri.h	[new file with mode: 0644]	patch \| blob
src/log.c		patch \| blob \| history
src/main.c		patch \| blob \| history
src/options.h		patch \| blob \| history
src/recur.c		patch \| blob \| history
src/recur.h		patch \| blob \| history
src/res.c		patch \| blob \| history
src/res.h		patch \| blob \| history
src/retr.c		patch \| blob \| history
src/retr.h		patch \| blob \| history
src/url.c		patch \| blob \| history
src/url.h		patch \| blob \| history
src/wget.h		patch \| blob \| history
tests/ChangeLog		patch \| blob \| history
tests/HTTPServer.pm		patch \| blob \| history
tests/Test-ftp-iri-disabled.px	[new file with mode: 0755]	patch \| blob
tests/Test-ftp-iri-fallback.px	[new file with mode: 0755]	patch \| blob
tests/Test-ftp-iri.px	[new file with mode: 0755]	patch \| blob
tests/Test-idn-cmd.px	[new file with mode: 0755]	patch \| blob
tests/Test-idn-headers.px	[new file with mode: 0755]	patch \| blob
tests/Test-idn-meta.px	[new file with mode: 0755]	patch \| blob
tests/Test-idn-robots.px	[new file with mode: 0755]	patch \| blob
tests/Test-iri-disabled.px	[new file with mode: 0755]	patch \| blob
tests/Test-iri-forced-remote.px	[new file with mode: 0755]	patch \| blob
tests/Test-iri-list.px	[new file with mode: 0755]	patch \| blob
tests/Test-iri.px	[new file with mode: 0755]	patch \| blob
tests/Test-proxy-auth-basic.px		patch \| blob \| history
tests/run-px		patch \| blob \| history