* AUTHORS: Added Steven Schubiger.
+2008-06-26 Xavier Saint <wget@sxav.eu>
+
+ * configure.ac : IRIs support required libiconv, check it.
+
+2008-06-14 Xavier Saint <wget@sxav.eu>
+
+ * configure.ac: Add support for IRIs
+
2008-05-29 Micah Cowan <micah@cowan.name>
* po/*.po: Updated from TP (the 1.11.3 set).
fi
AC_SUBST(COMMENT_IF_NO_POD2MAN)
+
+dnl
+dnl Check for IDN/IRIs
+dnl
+
+AC_ARG_ENABLE(iri,
+ AC_HELP_STRING([--disable-iri],[disable IDN/IRIs support]),
+ [case "${enable_iri}" in
+ no)
+ dnl Disable IRIs checking
+ AC_MSG_NOTICE([disabling IRIs at user request])
+ iri=no
+ ;;
+ yes)
+ dnl IRIs explicitly enabled
+ iri=yes
+ force_iri=yes
+ ;;
+ auto)
+ dnl Auto-detect IRI
+ iri=yes
+ ;;
+ *)
+ AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri'])
+ ;;
+ esac
+ ], [
+ dnl If nothing is specified, assume auto-detection
+ iri=yes
+ ]
+)
+
+AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]],
+ [Support IDN/IRIs (needs GNU Libidn)]),
+ libidn=$withval, libidn="")
+if test "X$iri" != "Xno"; then
+ AM_ICONV
+
+ if test "X$am_cv_func_iconv" != "Xyes"; then
+ iri=no
+ if test "X$force_iri" = "Xyes"; then
+ AC_MSG_ERROR([Libiconv is required for IRIs support])
+ else
+ AC_MSG_NOTICE([disabling IRIs because libiconv wasn't found])
+ fi
+ fi
+fi
+
+if test "X$iri" != "Xno"; then
+ if test "$libidn" != ""; then
+ LDFLAGS="${LDFLAGS} -L$libidn/lib"
+ CPPFLAGS="${CPPFLAGS} -I$libidn/include"
+ fi
+ AC_CHECK_HEADER(idna.h,
+ AC_CHECK_LIB(idn, stringprep_check_version,
+ [iri=yes LIBS="${LIBS} -lidn"], iri=no),
+ iri=no)
+
+ if test "X$iri" != "Xno" ; then
+ AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.])
+ AC_MSG_NOTICE([Enabling support for IRI.])
+ else
+ AC_MSG_WARN([Libidn not found])
+ fi
+fi
+
+
+dnl Needed by src/Makefile.am
+AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
+
+
dnl
dnl Create output
dnl
* init.c (cleanup): Free the memory associated with the base
option (when DEBUG_MALLOC is defined).
+2008-07-02 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h : New function idn_decode() to decode ASCII
+ encoded hostname to the locale.
+
+ * host.c : Show hostname to be resolved both in locale and
+ ASCII encoded.
+
2008-06-28 Steven Schubiger <stsc@members.fsf.org>
* retr.c (retrieve_from_file): Allow for reading the links from
an external file (HTTP/FTP).
+2008-06-26 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h : New functions locale_to_utf8() and
+ idn_encode() adding basic capabilities of IRI/IDN.
+
+ * url.c : Convert URLs from locale to UTF-8 allowing a basic
+ support of IRI/IDN
+
2008-06-25 Steven Schubiger <stsc@members.fsf.org>
* ftp.c (getftp): When spidering a FTP URL, emit a diagnostic
* http.c: Make -nv --spider include the file's name when it
exists.
-
+
2008-06-22 Micah Cowan <micah@cowan.name>
* Makefile.am (version.c): Fixed version string invocation so it
string vars pointers-to-const, and moved line lengths
below 80 (in Makefile.am, not in version.c).
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h : New function check_encoding_name() as
+ a preliminary encoding name check.
+
+ * main.c, iri.c : Make use of check_encoding_name().
+
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * iri.c : Include missing stringprep.h file and add a
+ cast.
+
+ * init.c : set a default initial value for opt.enable_iri,
+ opt.locale and opt.encoding_remote.
+
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h : Add a new function find_locale() to find
+ out the local system encoding.
+
+ * main.c : Make use of find_locale().
+
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * html-url.c : Add "content-type" meta tag parsing for
+ retrieving page encoding.
+
+ * iri.h : Make no-op version of parse_charset() return
+ NULL.
+
2008-06-16 Micah Cowan <micah@cowan.name>
* http.c (http_loop): When hstat.len is higher than the
successfully completed content's length, but it's because we
_set_ it that way, don't abort.
+2008-06-14 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h : New files.
+
+ * Makefile.am : Add files iri.h and conditional iri.c.
+
+ * build_info.c : Add compiled feature "iri".
+
+ * http.c : include iri.h and parse charset from Content-Type
+ header.
+
+ * init.c, main.c, options.h : if an options isn't supported
+ at compiled time, don't get rid off it and show a dummy
+ message instead if they are used.
+
2008-06-13 Micah Cowan <micah@cowan.name>
* build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL
default.
2008-05-17 Kenny Parnell <k.parnell@gmail.com>
-
+
(cmd_spec_prefer_family): Initialize prefer_family to prefer_none.
2008-05-17 Micah Cowan <micah@cowan.name>
-
+
* main.c (main): Handle Ctrl-D on command-line.
2008-05-15 Steven Schubiger <schubiger@gmail.com>
* options.h: Add an according boolean member to the options
struct.
-
+
* sysdep.h: Comment the defines __EXTENSIONS__ and _GNU_SOURCE
out, because they're now defined independently by config.h.
# Version: @VERSION@
#
+if IRI_IS_ENABLED
+IRI_OBJ = iri.c
+endif
+
# The following line is losing on some versions of make!
DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\"
LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@
ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
http.c init.c log.c main.c netrc.c progress.c ptimer.c \
recur.c res.c retr.c snprintf.c spider.c url.c \
- utils.c \
- css-url.h connect.h convert.h cookies.h \
+ utils.c $(IRI_OBJ) \
+ css-url.h connect.h convert.h cookies.h \
ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \
http.h http-ntlm.h init.h log.h mswindows.h netrc.h \
options.h progress.h ptimer.h recur.h res.h retr.h \
#else
"-gettext",
#endif
+
+#ifdef ENABLE_IRI
+ "+iri",
+#else
+ "-iri",
+#endif
+
/* sentinel value */
NULL
};
#include "host.h"
#include "connect.h"
#include "hash.h"
+#include "iri.h"
/* Define sockaddr_storage where unavailable (presumably on IPv4-only
hosts). */
if (print)
{
const char *txt_addr = print_address (ip);
- if (print && 0 != strcmp (print, txt_addr))
- logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
- escnonprint_uri (print), txt_addr, port);
+ if (0 != strcmp (print, txt_addr))
+ {
+ char *str = NULL, *name;
+
+ if (opt.enable_iri && (name = idn_decode ((char *) print)) != NULL)
+ {
+ int len = strlen (print) + strlen (name) + 4;
+ str = xmalloc (len);
+ snprintf (str, len, "%s (%s)", name, print);
+ str[len-1] = '\0';
+ xfree (name);
+ }
+
+ logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
+ str ? str : escnonprint_uri (print), txt_addr, port);
+
+ if (str)
+ xfree (str);
+ }
else
logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port);
}
/* Parse the file... */
urls = is_css ? get_urls_css_file (file, url) :
- get_urls_html (file, url, NULL);
+ get_urls_html (file, url, NULL, NULL);
/* We don't respect meta_disallow_follow here because, even if
the file is not followed, we might still want to convert the
return FTPRERR;
/* Strip trailing CRLF before printing the line, so that
- escnonprint doesn't include bogus \012 and \015. */
+ quotting doesn't include bogus \012 and \015. */
p = strchr (line, '\0');
if (p > line && p[-1] == '\n')
*--p = '\0';
#include "host.h"
#include "url.h"
#include "hash.h"
+#include "iri.h"
#ifndef NO_ADDRESS
# define NO_ADDRESS NO_DATA
/* No luck with the cache; resolve HOST. */
if (!silent && !numeric_address)
- logprintf (LOG_VERBOSE, _("Resolving %s... "),
- quotearg_style (escape_quoting_style, host));
+ {
+ char *str = NULL, *name;
+
+ if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL)
+ {
+ int len = strlen (host) + strlen (name) + 4;
+ str = xmalloc (len);
+ snprintf (str, len, "%s (%s)", name, host);
+ str[len-1] = '\0';
+ xfree (name);
+ }
+
+ logprintf (LOG_VERBOSE, _("Resolving %s... "),
+ quotearg_style (escape_quoting_style, str ? str : host));
+
+ if (str)
+ xfree (str);
+ }
#ifdef ENABLE_IPV6
{
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;
+/* Will contains the (last) charset found in 'http-equiv=content-type'
+ meta tags */
+static char *meta_charset;
+
static void
init_interesting (void)
{
return NULL;
}
- url = url_parse (link_uri, NULL);
+ url = url_parse (link_uri, NULL, NULL);
if (!url)
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
ctx->document_file, base, link_uri, complete_uri));
- url = url_parse (complete_uri, NULL);
+ url = url_parse (complete_uri, NULL, NULL);
if (!url)
{
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
entry->link_expect_html = 1;
}
}
+ else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
+ {
+ /* Handle stuff like:
+ <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
+
+ char *mcharset;
+ char *content = find_attr (tag, "content", NULL);
+ if (!content)
+ return;
+
+ mcharset = parse_charset (content);
+ if (!mcharset)
+ return;
+
+ /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
+ xfree_null (meta_charset);
+ meta_charset = mcharset;
+ }
else if (name && 0 == strcasecmp (name, "robots"))
{
/* Handle stuff like:
<base href=...> and does the right thing. */
struct urlpos *
-get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
+get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
+ struct iri *iri)
{
struct file_memory *fm;
struct map_context ctx;
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
NULL, interesting_attributes);
+ /* If meta charset isn't null, override content encoding */
+ if (iri && meta_charset)
+ set_content_encoding (iri, meta_charset);
+
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
if (meta_disallow_follow)
*meta_disallow_follow = ctx.nofollow;
url_text = merged;
}
- url = url_parse (url_text, &up_error_code);
+ url = url_parse (url_text, &up_error_code, NULL);
if (!url)
{
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
};
struct urlpos *get_urls_file (const char *);
-struct urlpos *get_urls_html (const char *, const char *, bool *);
+struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *);
struct urlpos *append_url (const char *, int, int, struct map_context *);
void free_urlpos (struct urlpos *);
If PROXY is non-NULL, the connection will be made to the proxy
server, and u->url will be requested. */
static uerr_t
-gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
+gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
+ struct iri *iri)
{
struct request *req;
hs->local_file = url_file_name (u);
}
}
-
+
/* TODO: perform this check only once. */
if (!hs->existence_checked && file_exists_p (hs->local_file))
{
local_dot_orig_file_exists = true;
local_filename = filename_plus_orig_suffix;
}
- }
+ }
if (!local_dot_orig_file_exists)
/* Couldn't stat() <file>.orig, so try to stat() <file>. */
char *tmp = strchr (type, ';');
if (tmp)
{
+ /* sXXXav: only needed if IRI support is enabled */
+ char *tmp2 = tmp + 1;
+
while (tmp > type && c_isspace (tmp[-1]))
--tmp;
*tmp = '\0';
+
+ /* Try to get remote encoding if needed */
+ if (opt.enable_iri && !opt.encoding_remote)
+ {
+ tmp = parse_charset (tmp2);
+ if (tmp)
+ set_content_encoding (iri, tmp);
+ }
}
}
hs->newloc = resp_header_strdup (resp, "Location");
retried, and retried, and retried, and... */
uerr_t
http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
- int *dt, struct url *proxy)
+ int *dt, struct url *proxy, struct iri *iri)
{
int count;
bool got_head = false; /* used for time-stamping and filename detection */
uerr_t err, ret = TRYLIMEXC;
time_t tmr = -1; /* remote time-stamp */
struct http_stat hstat; /* HTTP status */
- struct_stat st;
+ struct_stat st;
bool send_head_first = true;
/* Assert that no value for *LOCAL_FILE was passed. */
assert (local_file == NULL || *local_file == NULL);
-
+
/* Set LOCAL_FILE parameter. */
if (local_file && opt.output_document)
*local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document);
-
+
/* Reset NEWLOC parameter. */
*newloc = NULL;
retrieve the file. But if the output_document was given, then this
test was already done and the file didn't exist. Hence the !opt.output_document */
logprintf (LOG_VERBOSE, _("\
-File %s already there; not retrieving.\n\n"),
+File %s already there; not retrieving.\n\n"),
quote (hstat.local_file));
/* If the file is there, we suppose it's retrieved OK. */
*dt |= RETROKF;
/* Reset the counter. */
count = 0;
-
+
/* Reset the document type. */
*dt = 0;
-
+
/* Skip preliminary HEAD request if we're not in spider mode AND
* if -O was given or HTTP Content-Disposition support is disabled. */
if (!opt.spider
/* Send preliminary HEAD request if -N is given and we have an existing
* destination file. */
- if (opt.timestamping
+ if (opt.timestamping
&& !opt.content_disposition
&& file_exists_p (url_file_name (u)))
send_head_first = true;
-
+
/* THE loop */
do
{
/* Increment the pass counter. */
++count;
sleep_between_retrievals (count);
-
+
/* Get the current time string. */
tms = datetime_str (time (NULL));
-
+
if (opt.spider && !got_head)
logprintf (LOG_VERBOSE, _("\
Spider mode enabled. Check if remote file exists.\n"));
if (opt.verbose)
{
char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
-
- if (count > 1)
+
+ if (count > 1)
{
char tmp[256];
sprintf (tmp, _("(try:%2d)"), count);
logprintf (LOG_NOTQUIET, "--%s-- %s %s\n",
tms, tmp, hurl);
}
- else
+ else
{
logprintf (LOG_NOTQUIET, "--%s-- %s\n",
tms, hurl);
}
-
+
#ifdef WINDOWS
ws_changetitle (hurl);
#endif
/* Default document type is empty. However, if spider mode is
on or time-stamping is employed, HEAD_ONLY commands is
encoded within *dt. */
- if (send_head_first && !got_head)
+ if (send_head_first && !got_head)
*dt |= HEAD_ONLY;
else
*dt &= ~HEAD_ONLY;
*dt &= ~SEND_NOCACHE;
/* Try fetching the document, or at least its head. */
- err = gethttp (u, &hstat, dt, proxy);
+ err = gethttp (u, &hstat, dt, proxy, iri);
/* Time? */
tms = datetime_str (time (NULL));
-
+
/* Get the new location (with or without the redirection). */
if (hstat.newloc)
*newloc = xstrdup (hstat.newloc);
hstat.statcode);
ret = WRONGCODE;
}
- else
+ else
{
ret = NEWLOCATION;
}
/* All possibilities should have been exhausted. */
abort ();
}
-
+
if (!(*dt & RETROKF))
{
char *hurl = NULL;
continue;
}
/* Maybe we should always keep track of broken links, not just in
- * spider mode. */
- else if (opt.spider)
+ * spider mode.
+ * Don't log error if it was UTF-8 encoded because we will try
+ * once unencoded. */
+ else if (opt.spider && !iri->utf8_encode)
{
/* #### Again: ugly ugly ugly! */
- if (!hurl)
+ if (!hurl)
hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
nonexisting_url (hurl);
logprintf (LOG_NOTQUIET, _("\
else
{
logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
- tms, hstat.statcode,
+ tms, hstat.statcode,
quotearg_style (escape_quoting_style, hstat.error));
}
logputs (LOG_VERBOSE, "\n");
struct url;
uerr_t http_loop (struct url *, char **, char **, const char *, int *,
- struct url *);
+ struct url *, struct iri *);
void save_cookies (void);
void http_cleanup (void);
time_t http_atotm (const char *);
{ "inet6only", &opt.ipv6_only, cmd_boolean },
#endif
{ "input", &opt.input_filename, cmd_file },
+ { "iri", &opt.enable_iri, cmd_boolean },
{ "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean },
{ "limitrate", &opt.limit_rate, cmd_bytes },
{ "loadcookies", &opt.cookies_input, cmd_file },
+ { "locale", &opt.locale, cmd_string },
{ "logfile", &opt.lfilename, cmd_file },
{ "login", &opt.ftp_user, cmd_string },/* deprecated*/
{ "maxredirect", &opt.max_redirect, cmd_number },
{ "referer", &opt.referer, cmd_string },
{ "reject", &opt.rejects, cmd_vector },
{ "relativeonly", &opt.relative_only, cmd_boolean },
+ { "remoteencoding", &opt.encoding_remote, cmd_string },
{ "removelisting", &opt.remove_listing, cmd_boolean },
{ "restrictfilenames", NULL, cmd_spec_restrict_file_names },
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
opt.restrict_files_case = restrict_no_case_restriction;
opt.max_redirect = 20;
+
+#ifdef ENABLE_IRI
+ opt.enable_iri = true;
+#else
+ opt.enable_iri = false;
+#endif
+ opt.locale = NULL;
+ opt.encoding_remote = NULL;
}
\f
/* Return the user's home directory (strdup-ed), or NULL if none is
--- /dev/null
+/* IRI related functions.
+ Copyright (C) 2008 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at
+your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget. If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
+
+#include "wget.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <iconv.h>
+#include <stringprep.h>
+#include <idna.h>
+#include <errno.h>
+
+#include "utils.h"
+#include "iri.h"
+
+/* RFC3987 section 3.1 mandates STD3 ASCII RULES */
+#define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES
+
+/* Note: locale encoding is kept in options struct (opt.locale) */
+
+static iconv_t locale2utf8;
+
+static bool open_locale_to_utf8 (void);
+static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
+
+
+/* Given a string containing "charset=XXX", return the encoding if found,
+ or NULL otherwise */
+char *
+parse_charset (char *str)
+{
+ char *charset;
+
+ if (!str || !*str)
+ return NULL;
+
+ str = strcasestr (str, "charset=");
+ if (!str)
+ return NULL;
+
+ str += 8;
+ charset = str;
+
+ /* sXXXav: which chars should be banned ??? */
+ while (*charset && !c_isspace (*charset))
+ charset++;
+
+ /* sXXXav: could strdupdelim return NULL ? */
+ charset = strdupdelim (str, charset);
+
+ /* Do a minimum check on the charset value */
+ if (!check_encoding_name (charset))
+ {
+ xfree (charset);
+ return NULL;
+ }
+
+ /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/
+
+ return charset;
+}
+
+/* Find the locale used, or fall back on a default value */
+char *
+find_locale (void)
+{
+ return (char *) stringprep_locale_charset ();
+}
+
+/* Basic check of an encoding name. */
+bool
+check_encoding_name (char *encoding)
+{
+ char *s = encoding;
+
+ while (*s)
+ {
+ if (!c_isascii (*s) || c_isspace (*s))
+ {
+ logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding));
+ return false;
+ }
+
+ s++;
+ }
+
+ return true;
+}
+
+/* Try opening an iconv_t descriptor for conversion from locale to UTF-8 */
+static bool
+open_locale_to_utf8 (void)
+{
+ if (locale2utf8)
+ return true;
+
+ /* sXXXav : That shouldn't happen, just in case */
+ if (!opt.locale)
+ {
+ logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
+ opt.locale = find_locale ();
+ }
+
+ if (!opt.locale)
+ return false;
+
+ locale2utf8 = iconv_open ("UTF-8", opt.locale);
+ if (locale2utf8 != (iconv_t)(-1))
+ return true;
+
+ logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
+ quote (opt.locale), quote ("UTF-8"));
+ locale2utf8 = NULL;
+ return false;
+}
+
+/* Try converting string str from locale to UTF-8. Return a new string
+ on success, or str on error or if conversion isn't needed. */
+const char *
+locale_to_utf8 (const char *str)
+{
+ char *new;
+
+ if (!strcasecmp (opt.locale, "utf-8"))
+ return str;
+
+ if (!open_locale_to_utf8 ())
+ return str;
+
+ if (do_conversion (locale2utf8, (char *) str, strlen ((char *) str), &new))
+ return (const char *) new;
+
+ return str;
+}
+
+/* Do the conversion according to the passed conversion descriptor cd. *out
+ will containes the transcoded string on success. *out content is
+ unspecified otherwise. */
+static bool
+do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
+{
+ /* sXXXav : hummm hard to guess... */
+ size_t len, done, outlen = inlen * 2;
+ int invalid = 0, tooshort = 0;
+ char *s;
+
+ s = xmalloc (outlen + 1);
+ *out = s;
+ len = outlen;
+ done = 0;
+
+ for (;;)
+ {
+ if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1))
+ {
+ *out = s;
+ *(s + len - outlen - done) = '\0';
+ return true;
+ }
+
+ /* Incomplete or invalid multibyte sequence */
+ if (errno == EINVAL || errno == EILSEQ)
+ {
+ if (!invalid)
+ logprintf (LOG_VERBOSE,
+ "Incomplete or invalide multibyte sequence encountered\n");
+
+ invalid++;
+ **out = *in;
+ in++;
+ inlen--;
+ (*out)++;
+ outlen--;
+ }
+ else if (errno == E2BIG) /* Output buffer full */
+ {
+ char *new;
+
+ tooshort++;
+ done = len;
+ outlen = done + inlen * 2;
+ new = xmalloc (outlen + 1);
+ memcpy (new, s, done);
+ xfree (s);
+ s = new;
+ len = outlen;
+ *out = s + done;
+ }
+ else /* Weird, we got an unspecified error */
+ {
+ logprintf (LOG_VERBOSE, "Unhandled errno %d\n", errno);
+ break;
+ }
+ }
+
+ return false;
+}
+
+/* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL
+ on error. */
+char *
+idn_encode (struct iri *i, char *host)
+{
+ char *new;
+ int ret;
+
+ /* Encode to UTF-8 if not done */
+ if (!i->utf8_encode)
+ {
+ if (!remote_to_utf8 (i, (const char *) host, (const char **) &new))
+ {
+ /* Nothing to encode or an error occured */
+ return NULL;
+ }
+
+ host = new;
+ }
+
+ /* toASCII UTF-8 NULL terminated string */
+ ret = idna_to_ascii_8z (host, &new, IDNA_FLAGS);
+ if (ret != IDNA_SUCCESS)
+ {
+ /* sXXXav : free new when needed ! */
+ logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
+ quote (idna_strerror (ret)));
+ return NULL;
+ }
+
+ return new;
+}
+
+/* Try to decode an "ASCII encoded" host. Return the new domain in the locale
+ on success or NULL on error. */
+char *
+idn_decode (char *host)
+{
+ char *new;
+ int ret;
+
+ ret = idna_to_unicode_8zlz (host, &new, IDNA_FLAGS);
+ if (ret != IDNA_SUCCESS)
+ {
+ logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", ret,
+ quote (idna_strerror (ret)));
+ return NULL;
+ }
+
+ return new;
+}
+
+/* Try to transcode string str from remote encoding to UTF-8. On success, *new
+ contains the transcoded string. *new content is unspecified otherwise. */
+bool
+remote_to_utf8 (struct iri *i, const char *str, const char **new)
+{
+ char *r;
+ iconv_t cd;
+ bool ret = false;
+
+ if (opt.encoding_remote)
+ r = opt.encoding_remote;
+ else if (i->uri_encoding)
+ r = i->uri_encoding;
+ else
+ return false;
+
+ cd = iconv_open ("UTF-8", r);
+ if (cd == (iconv_t)(-1))
+ return false;
+
+ if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new))
+ ret = true;
+
+ iconv_close (cd);
+
+ /* Test if something was converted */
+ if (!strcmp (str, *new))
+ {
+ xfree ((char *) *new);
+ return false;
+ }
+
+ return ret;
+}
+
+struct iri *
+iri_new (void)
+{
+ struct iri *i = xmalloc (sizeof (struct iri));
+ i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL;
+ i->content_encoding = NULL;
+ i->utf8_encode = opt.enable_iri;
+}
+
+void
+iri_free (struct iri *i)
+{
+ xfree_null (i->uri_encoding);
+ xfree_null (i->content_encoding);
+ xfree (i);
+}
+
+void
+set_uri_encoding (struct iri *i, char *charset)
+{
+ logprintf (LOG_VERBOSE, "[ uri = `%s'\n", charset);
+ if (opt.encoding_remote)
+ return;
+ if (i->uri_encoding)
+ {
+ if (!strcasecmp (i->uri_encoding, charset))
+ return;
+ xfree (i->uri_encoding);
+ }
+
+ i->uri_encoding = charset ? xstrdup (charset) : NULL;
+}
+
+void
+set_content_encoding (struct iri *i, char *charset)
+{
+ logprintf (LOG_VERBOSE, "[ content = `%s'\n", charset);
+ if (opt.encoding_remote)
+ return;
+ if (i->content_encoding)
+ {
+ if (!strcasecmp (i->content_encoding, charset))
+ return;
+ xfree (i->content_encoding);
+ }
+
+ i->content_encoding = charset ? xstrdup (charset) : NULL;
+}
+
--- /dev/null
+/* Internationalization related declarations.
+ Copyright (C) 2008 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget. If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
+
+#ifndef IRI_H
+#define IRI_H
+
+struct iri {
+ char *uri_encoding; /* Encoding of the uri to fetch */
+ char *content_encoding; /* Encoding of links inside the fetched file */
+ bool utf8_encode; /* Will/Is the current url encoded in utf8 */
+};
+
+#ifdef ENABLE_IRI
+
+char *parse_charset (char *str);
+char *find_locale (void);
+bool check_encoding_name (char *encoding);
+const char *locale_to_utf8 (const char *str);
+char *idn_encode (struct iri *i, char *host);
+char *idn_decode (char *host);
+bool remote_to_utf8 (struct iri *i, const char *str, const char **new);
+struct iri *iri_new (void);
+void iri_free (struct iri *i);
+void set_uri_encoding (struct iri *i, char *charset);
+void set_content_encoding (struct iri *i, char *charset);
+
+#else /* ENABLE_IRI */
+
+struct iri dummy_iri;
+
+#define parse_charset(str) NULL
+#define find_locale() NULL
+#define check_encoding_name(str) false
+#define locale_to_utf8(str) (str)
+#define idn_encode(a,b,c) NULL
+#define idn_decode(str) NULL
+#define remote_to_utf8(a,b,c) false
+#define iri_new() (&dummy_iri)
+#define iri_free(a)
+#define set_uri_encoding(a,b)
+#define set_content_encoding(a,b)
+
+#endif /* ENABLE_IRI */
+#endif /* IRI_H */
#include "utils.h"
#include "log.h"
-/* This file impplement support for "logging". Logging means printing
+/* This file implement support for "logging". Logging means printing
output, plus several additional features:
- Cataloguing output by importance. You can specify that a log
#include <assert.h>
#include <errno.h>
#include <time.h>
+#ifdef ENABLE_IRI
+#include <langinfo.h>
+#endif
#include "utils.h"
#include "init.h"
{ "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 },
#endif
{ "input-file", 'i', OPT_VALUE, "input", -1 },
+ { "iri", 0, OPT_BOOLEAN, "iri", -1 },
{ "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 },
{ "level", 'l', OPT_VALUE, "reclevel", -1 },
{ "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
{ "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
+ { "locale", 0, OPT_VALUE, "locale", -1 },
{ "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
{ "mirror", 'm', OPT_BOOLEAN, "mirror", -1 },
{ "no", 'n', OPT__NO, NULL, required_argument },
{ "referer", 0, OPT_VALUE, "referer", -1 },
{ "reject", 'R', OPT_VALUE, "reject", -1 },
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
+ { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1},
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
{ "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
{ "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
exit (1);
}
+#ifdef ENABLE_IRI
+ if (opt.enable_iri)
+ {
+ if (opt.locale && !check_encoding_name (opt.locale))
+ opt.locale = NULL;
+
+ if (!opt.locale)
+ opt.locale = find_locale ();
+
+ if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote))
+ opt.encoding_remote = NULL;
+
+ /*logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));*/
+ }
+#else
+ if (opt.enable_iri || opt.locale || opt.encoding_remote)
+ {
+ /* sXXXav : be more specific... */
+ printf(_("This version does not have support for IRIs\n"));
+ exit(1);
+ }
+#endif
+
if (opt.ask_passwd)
{
opt.passwd = prompt_for_password ();
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (url_scheme (*t) == SCHEME_FTP)
+ if (url_scheme (*t) == SCHEME_FTP)
opt.follow_ftp = 1;
-
+
status = retrieve_tree (*t);
opt.follow_ftp = old_follow_ftp;
}
else
- status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+ {
+ struct iri *i = iri_new ();
+ set_uri_encoding (i, opt.locale);
+ status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt,
+ opt.recursive, i);
+ iri_free (i);
+ }
if (opt.delete_after && file_exists_p(filename))
{
bool content_disposition; /* Honor HTTP Content-Disposition header. */
bool auth_without_challenge; /* Issue Basic authentication creds without
waiting for a challenge. */
+
+ bool enable_iri;
+ char *encoding_remote;
+ char *locale;
};
extern struct options opt;
#include "html-url.h"
#include "css-url.h"
#include "spider.h"
-
+#include "iri.h"
+\f
/* Functions for maintaining the URL queue. */
struct queue_element {
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
+ struct iri *iri; /* sXXXav */
bool css_allowed; /* whether the document is allowed to
be treated as CSS. */
struct queue_element *next; /* next element in queue */
into it. */
static void
-url_enqueue (struct url_queue *queue,
+url_enqueue (struct url_queue *queue, struct iri *i,
const char *url, const char *referer, int depth,
bool html_allowed, bool css_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
+ qel->iri = i;
qel->url = url;
qel->referer = referer;
qel->depth = depth;
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
+ if (i)
+ printf ("[Enqueuing %s with %s\n", url, i->uri_encoding);
+
if (queue->tail)
queue->tail->next = qel;
queue->tail = qel;
succeeded, or false if the queue is empty. */
static bool
-url_dequeue (struct url_queue *queue,
+url_dequeue (struct url_queue *queue, struct iri **i,
const char **url, const char **referer, int *depth,
bool *html_allowed, bool *css_allowed)
{
if (!queue->head)
queue->tail = NULL;
+ *i = qel->iri;
*url = qel->url;
*referer = qel->referer;
*depth = qel->depth;
}
\f
static bool download_child_p (const struct urlpos *, struct url *, int,
- struct url *, struct hash_table *);
+ struct url *, struct hash_table *, struct iri *);
static bool descend_redirect_p (const char *, const char *, int,
- struct url *, struct hash_table *);
+ struct url *, struct hash_table *, struct iri *);
/* Retrieve a part of the web beginning with START_URL. This used to
struct hash_table *blacklist;
int up_error_code;
- struct url *start_url_parsed = url_parse (start_url, &up_error_code);
+ struct url *start_url_parsed;
+ struct iri *i = iri_new ();
+ set_uri_encoding (i, opt.locale);
+ start_url_parsed = url_parse (start_url, &up_error_code, i);
if (!start_url_parsed)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
/* Enqueue the starting URL. Use start_url_parsed->url rather than
just URL so we enqueue the canonical form of the URL. */
- url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
+ url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
+ false);
string_set_add (blacklist, start_url_parsed->url);
while (1)
/* Get the next URL from the queue... */
- if (!url_dequeue (queue,
+ if (!url_dequeue (queue, (struct iri **) &i,
(const char **)&url, (const char **)&referer,
&depth, &html_allowed, &css_allowed))
break;
int dt = 0;
char *redirected = NULL;
- status = retrieve_url (url, &file, &redirected, referer, &dt, false);
+ status = retrieve_url (url, &file, &redirected, referer, &dt,
+ false, i);
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
if (descend)
{
if (!descend_redirect_p (redirected, url, depth,
- start_url_parsed, blacklist))
+ start_url_parsed, blacklist, i))
descend = false;
else
/* Make sure that the old pre-redirect form gets
bool meta_disallow_follow = false;
struct urlpos *children
= is_css ? get_urls_css_file (file, url) :
- get_urls_html (file, url, &meta_disallow_follow);
+ get_urls_html (file, url, &meta_disallow_follow, i);
if (opt.use_robots && meta_disallow_follow)
{
if (children)
{
struct urlpos *child = children;
- struct url *url_parsed = url_parsed = url_parse (url, NULL);
+ struct url *url_parsed = url_parse (url, NULL, i);
+ struct iri *ci;
char *referer_url = url;
bool strip_auth = (url_parsed != NULL
&& url_parsed->user != NULL);
if (dash_p_leaf_HTML && !child->link_inline_p)
continue;
if (download_child_p (child, url_parsed, depth, start_url_parsed,
- blacklist))
+ blacklist, i))
{
- url_enqueue (queue, xstrdup (child->url->url),
+ ci = iri_new ();
+ set_uri_encoding (ci, i->content_encoding);
+ url_enqueue (queue, ci, xstrdup (child->url->url),
xstrdup (referer_url), depth + 1,
child->link_expect_html,
child->link_expect_css);
}
}
- if (file
- && (opt.delete_after
+ if (file
+ && (opt.delete_after
|| opt.spider /* opt.recursive is implicitely true */
|| !acceptable (file)))
{
/* Either --delete-after was specified, or we loaded this
- (otherwise unneeded because of --spider or rejected by -R)
- HTML file just to harvest its hyperlinks -- in either case,
+ (otherwise unneeded because of --spider or rejected by -R)
+ HTML file just to harvest its hyperlinks -- in either case,
delete the local file. */
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
opt.delete_after ? "--delete-after" :
- (opt.spider ? "--spider" :
+ (opt.spider ? "--spider" :
"recursive rejection criteria")));
logprintf (LOG_VERBOSE,
(opt.delete_after || opt.spider
xfree (url);
xfree_null (referer);
xfree_null (file);
+ iri_free (i);
}
/* If anything is left of the queue due to a premature exit, free it
char *d1, *d2;
int d3;
bool d4, d5;
- while (url_dequeue (queue,
+ struct iri *d6;
+ while (url_dequeue (queue, (struct iri **)&d6,
(const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
{
+ iri_free (d6);
xfree (d1);
xfree_null (d2);
}
static bool
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
- struct url *start_url_parsed, struct hash_table *blacklist)
+ struct url *start_url_parsed, struct hash_table *blacklist,
+ struct iri *iri)
{
struct url *u = upos->url;
const char *url = u->url;
if (string_set_contains (blacklist, url))
{
- if (opt.spider)
+ if (opt.spider)
{
char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
if (!specs)
{
char *rfile;
- if (res_retrieve_file (url, &rfile))
+ if (res_retrieve_file (url, &rfile, iri))
{
specs = res_parse_from_file (rfile);
static bool
descend_redirect_p (const char *redirected, const char *original, int depth,
- struct url *start_url_parsed, struct hash_table *blacklist)
+ struct url *start_url_parsed, struct hash_table *blacklist,
+ struct iri *iri)
{
struct url *orig_parsed, *new_parsed;
struct urlpos *upos;
bool success;
- orig_parsed = url_parse (original, NULL);
+ orig_parsed = url_parse (original, NULL, NULL);
assert (orig_parsed != NULL);
- new_parsed = url_parse (redirected, NULL);
+ new_parsed = url_parse (redirected, NULL, NULL);
assert (new_parsed != NULL);
upos = xnew0 (struct urlpos);
upos->url = new_parsed;
success = download_child_p (upos, orig_parsed, depth,
- start_url_parsed, blacklist);
+ start_url_parsed, blacklist, iri);
url_free (orig_parsed);
url_free (new_parsed);
Return true if robots were retrieved OK, false otherwise. */
bool
-res_retrieve_file (const char *url, char **file)
+res_retrieve_file (const char *url, char **file, struct iri *iri)
{
+ struct iri *i = iri_new ();
uerr_t err;
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
int saved_ts_val = opt.timestamping;
int saved_sp_val = opt.spider;
+ /* Copy server URI encoding for a possible IDNA transformation, no need to
+ encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
+ set_uri_encoding (i, iri->uri_encoding);
+ i->utf8_encode = false;
+
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
*file = NULL;
opt.timestamping = false;
opt.spider = false;
- err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
+ err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i);
opt.timestamping = saved_ts_val;
- opt.spider = saved_sp_val;
+ opt.spider = saved_sp_val;
xfree (robots_url);
+ iri_free (i);
if (err != RETROK && *file != NULL)
{
void res_register_specs (const char *, int, struct robot_specs *);
struct robot_specs *res_get_specs (const char *, int);
-bool res_retrieve_file (const char *, char **);
+bool res_retrieve_file (const char *, char **, struct iri *);
bool is_robots_txt_url (const char *);
#include "hash.h"
#include "convert.h"
#include "ptimer.h"
+#include "iri.h"
#include "html-url.h"
/* Total size of downloaded files. Used to enforce quota. */
uerr_t
retrieve_url (const char *origurl, char **file, char **newloc,
- const char *refurl, int *dt, bool recursive)
+ const char *refurl, int *dt, bool recursive, struct iri *iri)
{
uerr_t result;
char *url;
if (file)
*file = NULL;
- u = url_parse (url, &up_error_code);
+ second_try:
+ u = url_parse (url, &up_error_code, iri);
if (!u)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
return URLERROR;
}
+ printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, iri->uri_encoding, iri->utf8_encode);
+
if (!refurl)
refurl = opt.referer;
proxy = getproxy (u);
if (proxy)
{
+ /* sXXXav : could a proxy include a path ??? */
+ struct iri *pi = iri_new ();
+ set_uri_encoding (pi, opt.locale);
+ pi->utf8_encode = false;
+
/* Parse the proxy URL. */
- proxy_url = url_parse (proxy, &up_error_code);
+ proxy_url = url_parse (proxy, &up_error_code, NULL);
if (!proxy_url)
{
logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
#endif
|| (proxy_url && proxy_url->scheme == SCHEME_HTTP))
{
- result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
+ result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
}
else if (u->scheme == SCHEME_FTP)
{
xfree (mynewloc);
mynewloc = construced_newloc;
+ /* Reset UTF-8 encoding state, keep the URI encoding and reset
+ the content encoding. */
+ iri->utf8_encode = opt.enable_iri;
+ set_content_encoding (iri, NULL);
+
/* Now, see if this new location makes sense. */
- newloc_parsed = url_parse (mynewloc, &up_error_code);
+ newloc_parsed = url_parse (mynewloc, &up_error_code, iri);
if (!newloc_parsed)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
goto redirected;
}
- if (local_file)
+ /* Try to not encode in UTF-8 if fetching failed */
+ if (!(*dt & RETROKF) && iri->utf8_encode)
{
+ iri->utf8_encode = false;
+ printf ("[Fallbacking to non-utf8 for `%s'\n", url);
+ goto second_try;
+ }
+
+ if (local_file && *dt & RETROKF)
+ {
+ register_download (u->url, local_file);
+ if (redirection_count && 0 != strcmp (origurl, u->url))
+ register_redirection (origurl, u->url);
+ if (*dt & TEXTHTML)
+ register_html (u->url, local_file);
if (*dt & RETROKF)
{
register_download (u->url, local_file);
{
uerr_t status;
struct urlpos *url_list, *cur_url;
+ struct iri *iri = iri_new();
char *input_file = NULL;
const char *url = file;
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
-
+
+ /* sXXXav : Assume filename and links in the file are in the locale */
+ set_content_encoding (iri, opt.locale);
+
if (url_has_scheme (url))
{
int dt;
if (!opt.base_href)
opt.base_href = xstrdup (url);
- status = retrieve_url (url, &input_file, NULL, NULL, &dt, false);
+ status = retrieve_url (url, &input_file, NULL, NULL, &dt, false, iri);
if (status != RETROK)
return status;
else
input_file = (char *) file;
- url_list = (html ? get_urls_html (input_file, NULL, NULL)
+ url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
: get_urls_file (input_file));
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (cur_url->url->scheme == SCHEME_FTP)
+ if (cur_url->url->scheme == SCHEME_FTP)
opt.follow_ftp = 1;
-
+
status = retrieve_tree (cur_url->url->url);
opt.follow_ftp = old_follow_ftp;
}
else
- status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
+ status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL,
+ &dt, opt.recursive, iri);
if (filename && opt.delete_after && file_exists_p (filename))
{
url_uses_proxy (const char *url)
{
bool ret;
- struct url *u = url_parse (url, NULL);
+ struct url *u;
+ struct iri *i = iri_new();
+ /* url was given in the command line, so use locale as encoding */
+ set_uri_encoding (i, opt.locale);
+ u= url_parse (url, NULL, i);
if (!u)
return false;
ret = getproxy (u) != NULL;
char *fd_read_hunk (int, hunk_terminator_t, long, long);
char *fd_read_line (int);
-uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool);
+uerr_t retrieve_url (const char *, char **, char **, const char *, int *,
+ bool, struct iri *);
uerr_t retrieve_from_file (const char *, bool, int *);
const char *retr_rate (wgint, double);
#include "utils.h"
#include "url.h"
#include "host.h" /* for is_valid_ipv6_address */
+#include "iri.h"
#ifdef TESTING
#include "test.h"
error, and if ERROR is not NULL, also set *ERROR to the appropriate
error code. */
struct url *
-url_parse (const char *url, int *error)
+url_parse (const char *url, int *error, struct iri *iri)
{
struct url *u;
const char *p;
int port;
char *user = NULL, *passwd = NULL;
- char *url_encoded = NULL;
+ char *url_encoded = NULL, *new_url = NULL;
int error_code;
goto error;
}
- url_encoded = reencode_escapes (url);
+ if (iri && iri->utf8_encode)
+ {
+ url_unescape ((char *) url);
+ iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url);
+ if (!iri->utf8_encode)
+ new_url = NULL;
+ }
+
+ url_encoded = reencode_escapes (new_url ? new_url : url);
p = url_encoded;
+ if (new_url && url_encoded != new_url)
+ xfree (new_url);
+
p += strlen (supported_schemes[scheme].leading_string);
uname_b = p;
p = url_skip_credentials (p);
{
url_unescape (u->host);
host_modified = true;
+
+ /* Apply IDNA regardless of iri->utf8_encode status */
+ if (opt.enable_iri && iri)
+ {
+ char *new = idn_encode (iri, u->host);
+ if (new)
+ {
+ xfree (u->host);
+ u->host = new;
+ host_modified = true;
+ }
+ }
}
if (params_b)
if (fragment_b)
u->fragment = strdupdelim (fragment_b, fragment_e);
- if (path_modified || u->fragment || host_modified || path_b == path_e)
+ if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
{
/* If we suspect that a transformation has rendered what
url_string might return different from URL_ENCODED, rebuild
char *url_escape (const char *);
-struct url *url_parse (const char *, int *);
+struct url *url_parse (const char *, int *, struct iri *iri);
const char *url_error (int);
char *url_full_path (const struct url *);
void url_set_dir (struct url *, const char *);
#include "quote.h"
#include "quotearg.h"
+/* Likewise for struct iri definition */
+#include "iri.h"
+
/* Useful macros used across the code: */
/* The number of elements in an array. For example: