* AUTHORS: Added Steven Schubiger.
+2008-06-26 Xavier Saint <wget@sxav.eu>
+
+ * configure.ac : IRI support requires libiconv; check for it.
+
+2008-06-14 Xavier Saint <wget@sxav.eu>
+
+ * configure.ac: Add support for IRIs
+
2008-05-29 Micah Cowan <micah@cowan.name>
* po/*.po: Updated from TP (the 1.11.3 set).
fi
AC_SUBST(COMMENT_IF_NO_POD2MAN)
+
+dnl
+dnl Check for IDN/IRIs
+dnl
+
+AC_ARG_ENABLE(iri,
+ AC_HELP_STRING([--disable-iri],[disable IDN/IRIs support]),
+ [case "${enable_iri}" in
+ no)
+ dnl Disable IRIs checking
+ AC_MSG_NOTICE([disabling IRIs at user request])
+ iri=no
+ ;;
+ yes)
+ dnl IRIs explicitly enabled
+ iri=yes
+ force_iri=yes
+ ;;
+ auto)
+ dnl Auto-detect IRI
+ iri=yes
+ ;;
+ *)
+ AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri'])
+ ;;
+ esac
+ ], [
+ dnl If nothing is specified, assume auto-detection
+ iri=yes
+ ]
+)
+
+AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]],
+ [Support IDN/IRIs (needs GNU Libidn)]),
+ libidn=$withval, libidn="")
+if test "X$iri" != "Xno"; then
+ AM_ICONV
+
+ if test "X$am_cv_func_iconv" != "Xyes"; then
+ iri=no
+ if test "X$force_iri" = "Xyes"; then
+ AC_MSG_ERROR([Libiconv is required for IRIs support])
+ else
+ AC_MSG_NOTICE([disabling IRIs because libiconv wasn't found])
+ fi
+ fi
+fi
+
+dnl
+dnl Look for GNU Libidn (header and library).  When IRIs were explicitly
+dnl requested with --enable-iri, a missing Libidn is a hard error, just
+dnl like a missing libiconv; otherwise IRI support is silently dropped.
+dnl
+if test "X$iri" != "Xno"; then
+  if test "$libidn" != ""; then
+    LDFLAGS="${LDFLAGS} -L$libidn/lib"
+    CPPFLAGS="${CPPFLAGS} -I$libidn/include"
+  fi
+  AC_CHECK_HEADER(idna.h,
+    AC_CHECK_LIB(idn, stringprep_check_version,
+      [iri=yes LIBS="${LIBS} -lidn"], iri=no),
+    iri=no)
+
+  if test "X$iri" != "Xno" ; then
+    AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.])
+    AC_MSG_NOTICE([Enabling support for IRI.])
+  elif test "X$force_iri" = "Xyes"; then
+    AC_MSG_ERROR([Libidn is required for IRIs support])
+  else
+    AC_MSG_WARN([Libidn not found])
+  fi
+fi
+
+
+dnl Needed by src/Makefile.am
+AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
+
+
dnl
dnl Create output
dnl
+2008-07-02 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h : New function idn_decode() to decode ASCII
+ encoded hostname to the locale.
+
+ * host.c : Show hostname to be resolved both in locale and
+ ASCII encoded.
+
2008-06-28 Steven Schubiger <stsc@members.fsf.org>
* retr.c (retrieve_from_file): Allow for reading the links from
an external file (HTTP/FTP).
+2008-06-26 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h : New functions locale_to_utf8() and
+ idn_encode() adding basic capabilities of IRI/IDN.
+
+ * url.c : Convert URLs from locale to UTF-8 allowing a basic
+ support of IRI/IDN
+
2008-06-25 Steven Schubiger <stsc@members.fsf.org>
* ftp.c (getftp): When spidering a FTP URL, emit a diagnostic
* http.c: Make -nv --spider include the file's name when it
exists.
-
+
2008-06-22 Micah Cowan <micah@cowan.name>
* Makefile.am (version.c): Fixed version string invocation so it
string vars pointers-to-const, and moved line lengths
below 80 (in Makefile.am, not in version.c).
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h : New function check_encoding_name() as
+ a preliminary encoding name check.
+
+ * main.c, iri.c : Make use of check_encoding_name().
+
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * iri.c : Include missing stringprep.h file and add a
+ cast.
+
+ * init.c : set a default initial value for opt.enable_iri,
+ opt.locale and opt.encoding_remote.
+
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h : Add a new function find_locale() to find
+ out the local system encoding.
+
+ * main.c : Make use of find_locale().
+
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * html-url.c : Add "content-type" meta tag parsing for
+ retrieving page encoding.
+
+ * iri.h : Make no-op version of parse_charset() return
+ NULL.
+
2008-06-16 Micah Cowan <micah@cowan.name>
* http.c (http_loop): When hstat.len is higher than the
successfully completed content's length, but it's because we
_set_ it that way, don't abort.
+2008-06-14 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h : New files.
+
+ * Makefile.am : Add files iri.h and conditional iri.c.
+
+ * build_info.c : Add compiled feature "iri".
+
+ * http.c : include iri.h and parse charset from Content-Type
+ header.
+
+ * init.c, main.c, options.h : if an option isn't supported
+ at compile time, don't get rid of it; show a dummy
+ message instead when it is used.
+
2008-06-13 Micah Cowan <micah@cowan.name>
* build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL
default.
2008-05-17 Kenny Parnell <k.parnell@gmail.com>
-
+
(cmd_spec_prefer_family): Initialize prefer_family to prefer_none.
2008-05-17 Micah Cowan <micah@cowan.name>
-
+
* main.c (main): Handle Ctrl-D on command-line.
2008-05-15 Steven Schubiger <schubiger@gmail.com>
* options.h: Add an according boolean member to the options
struct.
-
+
* sysdep.h: Comment the defines __EXTENSIONS__ and _GNU_SOURCE
out, because they're now defined independently by config.h.
# Version: @VERSION@
#
+if IRI_IS_ENABLED
+IRI_OBJ = iri.c
+endif
+
# The following line is losing on some versions of make!
DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\"
LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@
ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
http.c init.c log.c main.c netrc.c progress.c ptimer.c \
recur.c res.c retr.c snprintf.c spider.c url.c \
- utils.c \
- css-url.h connect.h convert.h cookies.h \
+ utils.c $(IRI_OBJ) \
+ css-url.h connect.h convert.h cookies.h \
ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \
http.h http-ntlm.h init.h log.h mswindows.h netrc.h \
options.h progress.h ptimer.h recur.h res.h retr.h \
#else
"-gettext",
#endif
+
+#ifdef ENABLE_IRI
+ "+iri",
+#else
+ "-iri",
+#endif
+
/* sentinel value */
NULL
};
#include "host.h"
#include "connect.h"
#include "hash.h"
+#include "iri.h"
/* Define sockaddr_storage where unavailable (presumably on IPv4-only
hosts). */
if (print)
{
const char *txt_addr = print_address (ip);
- if (print && 0 != strcmp (print, txt_addr))
- logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
- escnonprint_uri (print), txt_addr, port);
+ if (0 != strcmp (print, txt_addr))
+ {
+ char *str = NULL, *name;
+
+ if (opt.enable_iri && (name = idn_decode ((char *) print)) != NULL)
+ {
+ int len = strlen (print) + strlen (name) + 4;
+ str = xmalloc (len);
+ snprintf (str, len, "%s (%s)", name, print);
+ str[len-1] = '\0';
+ xfree (name);
+ }
+
+ logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
+ str ? str : escnonprint_uri (print), txt_addr, port);
+
+ if (str)
+ xfree (str);
+ }
else
logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port);
}
return FTPRERR;
/* Strip trailing CRLF before printing the line, so that
- escnonprint doesn't include bogus \012 and \015. */
+ quoting doesn't include bogus \012 and \015. */
p = strchr (line, '\0');
if (p > line && p[-1] == '\n')
*--p = '\0';
#include "host.h"
#include "url.h"
#include "hash.h"
+#include "iri.h"
#ifndef NO_ADDRESS
# define NO_ADDRESS NO_DATA
/* No luck with the cache; resolve HOST. */
if (!silent && !numeric_address)
- logprintf (LOG_VERBOSE, _("Resolving %s... "),
- quotearg_style (escape_quoting_style, host));
+ {
+ char *str = NULL, *name;
+
+ if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL)
+ {
+ int len = strlen (host) + strlen (name) + 4;
+ str = xmalloc (len);
+ snprintf (str, len, "%s (%s)", name, host);
+ str[len-1] = '\0';
+ xfree (name);
+ }
+
+ logprintf (LOG_VERBOSE, _("Resolving %s... "),
+ quotearg_style (escape_quoting_style, str ? str : host));
+
+ if (str)
+ xfree (str);
+ }
#ifdef ENABLE_IPV6
{
#include "recur.h"
#include "html-url.h"
#include "css-url.h"
+#include "iri.h"
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
return NULL;
}
+ set_ugly_no_encode (true);
url = url_parse (link_uri, NULL);
+ set_ugly_no_encode (false);
if (!url)
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
ctx->document_file, base, link_uri, complete_uri));
+ set_ugly_no_encode (true);
url = url_parse (complete_uri, NULL);
+ set_ugly_no_encode (false);
if (!url)
{
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
entry->link_expect_html = 1;
}
}
+ else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
+ {
+ /* Handle stuff like:
+ <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
+
+ char *mcharset;
+ char *content = find_attr (tag, "content", NULL);
+ if (!content)
+ return;
+
+ mcharset = parse_charset (content);
+ if (!mcharset)
+ return;
+
+ /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
+
+ set_current_charset (mcharset);
+ xfree (mcharset);
+ }
else if (name && 0 == strcasecmp (name, "robots"))
{
/* Handle stuff like:
url_text = merged;
}
+ set_ugly_no_encode (true);
url = url_parse (url_text, &up_error_code);
+ set_ugly_no_encode (false);
if (!url)
{
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
#include "retr.h"
#include "connect.h"
#include "netrc.h"
+#include "iri.h"
#ifdef HAVE_SSL
# include "ssl.h"
#endif
hs->local_file = url_file_name (u);
}
}
-
+
/* TODO: perform this check only once. */
if (!hs->existence_checked && file_exists_p (hs->local_file))
{
local_dot_orig_file_exists = true;
local_filename = filename_plus_orig_suffix;
}
- }
+ }
if (!local_dot_orig_file_exists)
/* Couldn't stat() <file>.orig, so try to stat() <file>. */
char *tmp = strchr (type, ';');
if (tmp)
{
+ /* sXXXav: only needed if IRI support is enabled */
+ char *tmp2 = tmp + 1;
+
while (tmp > type && c_isspace (tmp[-1]))
--tmp;
*tmp = '\0';
+
+ /* Try to get remote encoding if needed */
+ if (opt.enable_iri && !opt.encoding_remote)
+ set_current_charset (parse_charset (tmp2));
}
}
hs->newloc = resp_header_strdup (resp, "Location");
uerr_t err, ret = TRYLIMEXC;
time_t tmr = -1; /* remote time-stamp */
struct http_stat hstat; /* HTTP status */
- struct_stat st;
+ struct_stat st;
bool send_head_first = true;
/* Assert that no value for *LOCAL_FILE was passed. */
assert (local_file == NULL || *local_file == NULL);
-
+
/* Set LOCAL_FILE parameter. */
if (local_file && opt.output_document)
*local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document);
-
+
/* Reset NEWLOC parameter. */
*newloc = NULL;
retrieve the file. But if the output_document was given, then this
test was already done and the file didn't exist. Hence the !opt.output_document */
logprintf (LOG_VERBOSE, _("\
-File %s already there; not retrieving.\n\n"),
+File %s already there; not retrieving.\n\n"),
quote (hstat.local_file));
/* If the file is there, we suppose it's retrieved OK. */
*dt |= RETROKF;
/* Reset the counter. */
count = 0;
-
+
/* Reset the document type. */
*dt = 0;
-
+
/* Skip preliminary HEAD request if we're not in spider mode AND
* if -O was given or HTTP Content-Disposition support is disabled. */
if (!opt.spider
/* Send preliminary HEAD request if -N is given and we have an existing
* destination file. */
- if (opt.timestamping
+ if (opt.timestamping
&& !opt.content_disposition
&& file_exists_p (url_file_name (u)))
send_head_first = true;
-
+
/* THE loop */
do
{
/* Increment the pass counter. */
++count;
sleep_between_retrievals (count);
-
+
/* Get the current time string. */
tms = datetime_str (time (NULL));
-
+
if (opt.spider && !got_head)
logprintf (LOG_VERBOSE, _("\
Spider mode enabled. Check if remote file exists.\n"));
if (opt.verbose)
{
char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
-
- if (count > 1)
+
+ if (count > 1)
{
char tmp[256];
sprintf (tmp, _("(try:%2d)"), count);
logprintf (LOG_NOTQUIET, "--%s-- %s %s\n",
tms, tmp, hurl);
}
- else
+ else
{
logprintf (LOG_NOTQUIET, "--%s-- %s\n",
tms, hurl);
}
-
+
#ifdef WINDOWS
ws_changetitle (hurl);
#endif
/* Default document type is empty. However, if spider mode is
on or time-stamping is employed, HEAD_ONLY commands is
encoded within *dt. */
- if (send_head_first && !got_head)
+ if (send_head_first && !got_head)
*dt |= HEAD_ONLY;
else
*dt &= ~HEAD_ONLY;
/* Time? */
tms = datetime_str (time (NULL));
-
+
/* Get the new location (with or without the redirection). */
if (hstat.newloc)
*newloc = xstrdup (hstat.newloc);
hstat.statcode);
ret = WRONGCODE;
}
- else
+ else
{
ret = NEWLOCATION;
}
/* All possibilities should have been exhausted. */
abort ();
}
-
+
if (!(*dt & RETROKF))
{
char *hurl = NULL;
continue;
}
/* Maybe we should always keep track of broken links, not just in
- * spider mode. */
- else if (opt.spider)
+ * spider mode.
+ * Don't log an error if the URL was UTF-8 encoded, because we will
+ * retry it unencoded. */
+ else if (opt.spider && !get_utf8_encode ())
{
/* #### Again: ugly ugly ugly! */
- if (!hurl)
+ if (!hurl)
hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
nonexisting_url (hurl);
logprintf (LOG_NOTQUIET, _("\
else
{
logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
- tms, hstat.statcode,
+ tms, hstat.statcode,
quotearg_style (escape_quoting_style, hstat.error));
}
logputs (LOG_VERBOSE, "\n");
{ "inet6only", &opt.ipv6_only, cmd_boolean },
#endif
{ "input", &opt.input_filename, cmd_file },
+ { "iri", &opt.enable_iri, cmd_boolean },
{ "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean },
{ "limitrate", &opt.limit_rate, cmd_bytes },
{ "loadcookies", &opt.cookies_input, cmd_file },
+ { "locale", &opt.locale, cmd_string },
{ "logfile", &opt.lfilename, cmd_file },
{ "login", &opt.ftp_user, cmd_string },/* deprecated*/
{ "maxredirect", &opt.max_redirect, cmd_number },
{ "referer", &opt.referer, cmd_string },
{ "reject", &opt.rejects, cmd_vector },
{ "relativeonly", &opt.relative_only, cmd_boolean },
+ { "remoteencoding", &opt.encoding_remote, cmd_string },
{ "removelisting", &opt.remove_listing, cmd_boolean },
{ "restrictfilenames", NULL, cmd_spec_restrict_file_names },
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
opt.restrict_files_case = restrict_no_case_restriction;
opt.max_redirect = 20;
+
+#ifdef ENABLE_IRI
+ opt.enable_iri = true;
+#else
+ opt.enable_iri = false;
+#endif
+ opt.locale = NULL;
+ opt.encoding_remote = NULL;
}
\f
/* Return the user's home directory (strdup-ed), or NULL if none is
--- /dev/null
+/* IRI related functions.
+ Copyright (C) 2008 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at
+your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget. If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
+
+#include "wget.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <iconv.h>
+#include <stringprep.h>
+#include <idna.h>
+#include <errno.h>
+
+#include "utils.h"
+#include "iri.h"
+
+/* RFC3987 section 3.1 mandates STD3 ASCII RULES */
+#define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES
+
+/* Note: locale encoding is kept in options struct (opt.locale) */
+
+/* Hold the encoding used for the current fetch */
+char *remote;
+
+/* Hold the encoding for the future found links */
+char *current;
+
+/* Will/Is the current URL encoded in utf8 ? */
+bool utf8_encode;
+
+/* Force no utf8 encoding for url_parse () */
+bool ugly_no_encode;
+
+static iconv_t locale2utf8;
+
+static bool open_locale_to_utf8 (void);
+static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
+
+
+/* Extract the encoding name from a string containing "charset=XXX", such
+   as the value of a Content-Type header or of a <meta http-equiv> tag.
+
+   The value ends at the first whitespace, at a `;' (the start of the next
+   parameter) or at a closing quote; one opening quote is skipped, so
+   `charset="utf-8"; foo=bar' yields `utf-8'.
+
+   Return a newly allocated string (callers release it with xfree), or
+   NULL if no charset parameter is present, if its value is empty, or if
+   the value is rejected by check_encoding_name ().  */
+char *
+parse_charset (char *str)
+{
+  char *end;
+  char *charset;
+
+  if (!str || !*str)
+    return NULL;
+
+  str = strcasestr (str, "charset=");
+  if (!str)
+    return NULL;
+
+  str += 8;                     /* strlen ("charset=") */
+
+  /* The value may be written as a quoted string.  */
+  if (*str == '"' || *str == '\'')
+    str++;
+
+  end = str;
+  while (*end && !c_isspace (*end)
+         && *end != ';' && *end != '"' && *end != '\'')
+    end++;
+
+  /* "charset=" with nothing after it names no encoding.  */
+  if (end == str)
+    return NULL;
+
+  /* sXXXav: could strdupdelim return NULL ? */
+  charset = strdupdelim (str, end);
+
+  /* Do a minimum check on the charset value */
+  if (!check_encoding_name (charset))
+    {
+      xfree (charset);
+      return NULL;
+    }
+
+  return charset;
+}
+
+/* Return the name of the character set used by the current locale, as
+   reported by libidn's stringprep_locale_charset ().
+   NOTE(review): the result is presumably owned by libidn and must not be
+   freed or modified, despite the non-const return type -- confirm against
+   the libidn documentation.  */
+char *
+find_locale (void)
+{
+  const char *charset = stringprep_locale_charset ();
+
+  return (char *) charset;
+}
+
+/* Sanity-check the encoding name ENCODING: every character has to be
+   ASCII and must not be whitespace.  At the first offending character a
+   verbose message is logged and false is returned; otherwise (including
+   for an empty name) the result is true.  */
+bool
+check_encoding_name (char *encoding)
+{
+  char *p;
+
+  for (p = encoding; *p != '\0'; p++)
+    {
+      bool acceptable = c_isascii (*p) && !c_isspace (*p);
+
+      if (acceptable)
+        continue;
+
+      logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding));
+      return false;
+    }
+
+  return true;
+}
+
+/* Make sure the cached locale -> UTF-8 conversion descriptor
+   (locale2utf8) is open.  Return true when it is usable, false when the
+   locale is unknown or iconv cannot convert from it; in the latter case
+   the cached descriptor is left NULL so that a later call tries again.  */
+static bool
+open_locale_to_utf8 (void)
+{
+  iconv_t cd;
+
+  /* Already opened by an earlier call.  */
+  if (locale2utf8)
+    return true;
+
+  /* sXXXav : That shouldn't happen, just in case */
+  if (!opt.locale)
+    {
+      logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
+      opt.locale = find_locale ();
+
+      if (!opt.locale)
+        return false;
+    }
+
+  cd = iconv_open ("UTF-8", opt.locale);
+  if (cd == (iconv_t)(-1))
+    {
+      logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
+                 quote (opt.locale), quote ("UTF-8"));
+      locale2utf8 = NULL;
+      return false;
+    }
+
+  locale2utf8 = cd;
+  return true;
+}
+
+/* Try converting string STR from the locale encoding to UTF-8.
+
+   Return a newly allocated string on success.  Return STR itself when no
+   conversion is needed (the locale already is UTF-8) or when the
+   conversion cannot be performed, so callers have to compare the result
+   with STR to know whether they received a new string.  */
+const char *
+locale_to_utf8 (const char *str)
+{
+  char *converted = NULL;
+
+  /* opt.locale may still be unset at this point; leave that case to
+     open_locale_to_utf8 (), which looks the locale up, instead of
+     handing a NULL pointer to strcasecmp ().  */
+  if (opt.locale && !strcasecmp (opt.locale, "utf-8"))
+    return str;
+
+  if (!open_locale_to_utf8 ())
+    return str;
+
+  if (do_conversion (locale2utf8, (char *) str, strlen (str), &converted))
+    return converted;
+
+  return str;
+}
+
+/* Convert the INLEN bytes at IN with the conversion descriptor CD.
+
+   On success return true and store in *OUT a newly allocated,
+   NUL-terminated string holding the converted text.  Bytes that iconv
+   rejects as an invalid or incomplete multibyte sequence are copied
+   through unchanged (a verbose message is logged once per call).
+
+   On an unexpected iconv error return false.  *OUT then points at the
+   start of the buffer, NUL-terminated after whatever was converted so
+   far; its content should not be relied upon.  */
+static bool
+do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
+{
+  /* sXXXav : hummm hard to guess... */
+  size_t len = inlen * 2;       /* capacity of S, terminator excluded */
+  size_t outlen = len;          /* room left after *OUT */
+  bool invalid_seen = false;
+  char *s = xmalloc (len + 1);
+
+  /* Invariant kept by every branch below:
+       (*out - s) + outlen == len
+     so s[len - outlen] is always the first unwritten byte and lies
+     inside the allocation (which has one spare byte for the NUL).  */
+  *out = s;
+
+  for (;;)
+    {
+      int err;
+
+      if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1))
+        {
+          s[len - outlen] = '\0';
+          *out = s;
+          return true;
+        }
+
+      err = errno;              /* logging below may clobber errno */
+
+      if ((err == EINVAL || err == EILSEQ) && inlen > 0 && outlen > 0)
+        {
+          /* Incomplete or invalid multibyte sequence: pass one byte
+             through untouched and resume right after it.  */
+          if (!invalid_seen)
+            logprintf (LOG_VERBOSE,
+                       "Incomplete or invalide multibyte sequence encountered\n");
+          invalid_seen = true;
+
+          **out = *in;
+          in++;
+          inlen--;
+          (*out)++;
+          outlen--;
+        }
+      else if (err == E2BIG
+               || ((err == EINVAL || err == EILSEQ) && inlen > 0))
+        {
+          /* Output buffer full (or no room left to pass an invalid byte
+             through): grow it.  E2BIG may be reported while a few bytes
+             are still free, so only the bytes really written are kept,
+             and the room left is increased by exactly what was added.  */
+          size_t used = len - outlen;
+          size_t extra = inlen > 0 ? inlen * 2 : 16;
+          char *bigger;
+
+          len += extra;
+          outlen += extra;
+          bigger = xmalloc (len + 1);
+          memcpy (bigger, s, used);
+          xfree (s);
+          s = bigger;
+          *out = s + used;
+        }
+      else /* Weird, we got an unspecified error */
+        {
+          logprintf (LOG_VERBOSE, "Unhandled errno %d\n", err);
+          break;
+        }
+    }
+
+  /* Leave a well-formed string behind even on failure.  */
+  s[len - outlen] = '\0';
+  *out = s;
+  return false;
+}
+
+/* Try to "ASCII encode" (IDNA ToASCII) the host name HOST.
+
+   UTF8_ENCODED tells whether HOST already is UTF-8.  When it is not, HOST
+   is first transcoded from the remote encoding with remote_to_utf8 ();
+   if that yields nothing (no encoding known, conversion error, or no
+   change) the function gives up and returns NULL.
+
+   Return the new domain on success or NULL on error.  */
+char *
+idn_encode (char *host, bool utf8_encoded)
+{
+  char *utf8_host = NULL;       /* temporary UTF-8 copy owned by us */
+  char *ascii = NULL;
+  int ret;
+
+  /* Encode to UTF-8 if not done using current remote */
+  if (!utf8_encoded)
+    {
+      const char *converted;
+
+      if (!remote_to_utf8 (host, &converted))
+        {
+          /* Nothing to encode or an error occured */
+          return NULL;
+        }
+
+      utf8_host = (char *) converted;
+      host = utf8_host;
+    }
+
+  /* toASCII UTF-8 NULL terminated string */
+  ret = idna_to_ascii_8z (host, &ascii, IDNA_FLAGS);
+
+  /* The UTF-8 copy was only an intermediate step; release it on both
+     the success and the failure path.  */
+  if (utf8_host)
+    xfree (utf8_host);
+
+  if (ret != IDNA_SUCCESS)
+    {
+      logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
+                 quote (idna_strerror (ret)));
+      return NULL;
+    }
+
+  return ascii;
+}
+
+/* Decode HOST, an "ASCII encoded" host name, with libidn's
+   idna_to_unicode_8zlz ().  On success the decoded name is returned (the
+   callers in this patch release it with xfree); on failure the libidn
+   error is logged at verbose level and NULL is returned.  */
+char *
+idn_decode (char *host)
+{
+  char *decoded;
+  int rc = idna_to_unicode_8zlz (host, &decoded, IDNA_FLAGS);
+
+  if (rc == IDNA_SUCCESS)
+    return decoded;
+
+  logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", rc,
+             quote (idna_strerror (rc)));
+  return NULL;
+}
+
+/* Try to transcode string STR from the remote encoding to UTF-8.
+
+   The source encoding is opt.encoding_remote when set, otherwise the
+   current charset.  Return true and store a newly allocated string in
+   *NEW only when the conversion succeeded AND changed the text.  Return
+   false, leaving *NEW untouched, when no encoding is known, when iconv
+   cannot handle it, when the conversion fails, or when the converted
+   text equals STR.  */
+bool
+remote_to_utf8 (const char *str, const char **new)
+{
+  const char *encoding;
+  char *converted = NULL;
+  iconv_t cd;
+  bool converted_ok;
+
+  if (opt.encoding_remote)
+    encoding = opt.encoding_remote;
+  else if (current)
+    encoding = current;
+  else
+    return false;
+
+  cd = iconv_open ("UTF-8", encoding);
+  if (cd == (iconv_t)(-1))
+    return false;
+
+  converted_ok = do_conversion (cd, (char *) str, strlen (str), &converted);
+  iconv_close (cd);
+
+  /* After a failed conversion the output is unspecified: it must be
+     neither compared with STR nor freed.  */
+  if (!converted_ok)
+    return false;
+
+  /* Test if something was converted */
+  if (!strcmp (str, converted))
+    {
+      xfree (converted);
+      return false;
+    }
+
+  *new = converted;
+  return true;
+}
+
+/* Return the encoding recorded for the current fetch, or NULL when none
+   is set.  The pointer is the module's own copy, not a duplicate.  */
+char *
+get_remote_charset (void)
+{
+  return remote;
+}
+
+/* Return the encoding recorded for links found from now on, or NULL when
+   none is set.  The pointer is the module's own copy, not a duplicate.  */
+char *
+get_current_charset (void)
+{
+  return current;
+}
+
+/* Record a private copy of CHARSET as the encoding of links found from
+   now on; a NULL CHARSET clears it.  The previous value is freed.  */
+void
+set_current_charset (char *charset)
+{
+  if (current)
+    xfree (current);
+
+  if (charset)
+    current = xstrdup (charset);
+  else
+    current = NULL;
+}
+
+/* Make the locale encoding (opt.locale) the encoding of links found from
+   now on.  The previous value is freed.  opt.locale may be unset (main ()
+   only fills it in when IRI support is enabled at run time); the current
+   charset is then cleared rather than duplicating a NULL pointer.  */
+void
+set_current_as_locale (void)
+{
+  if (current)
+    xfree (current);
+
+  current = opt.locale ? xstrdup (opt.locale) : NULL;
+}
+
+/* Record a private copy of CHARSET as the encoding used for the current
+   fetch; a NULL CHARSET clears it.  The previous value is freed.  */
+void
+set_remote_charset (char *charset)
+{
+  if (remote)
+    xfree (remote);
+
+  if (charset)
+    remote = xstrdup (charset);
+  else
+    remote = NULL;
+}
+
+/* Use a copy of the current charset as the encoding of the current
+   fetch, or clear the latter when no current charset is set.  The
+   previous remote value is freed.  */
+void
+set_remote_as_current (void)
+{
+  if (remote)
+    xfree (remote);
+
+  if (current)
+    remote = xstrdup (current);
+  else
+    remote = NULL;
+}
+
+/* Reset the "encode the URL in UTF-8" flag to its default, which is
+   whether IRI support is enabled (opt.enable_iri).  */
+void
+reset_utf8_encode (void)
+{
+  bool enabled = opt.enable_iri;
+
+  set_utf8_encode (enabled);
+}
+
+/* Set the flag telling whether the current URL will be / is encoded in
+   UTF-8.  */
+void
+set_utf8_encode (bool encode)
+{
+  utf8_encode = encode;
+}
+
+/* Tell whether the current URL should be encoded in UTF-8.  The override
+   installed by set_ugly_no_encode () wins over the regular flag.  */
+bool
+get_utf8_encode (void)
+{
+  if (ugly_no_encode)
+    return false;
+
+  return utf8_encode;
+}
+
+/* Switch the override that forces get_utf8_encode () to report false
+   (callers in this patch wrap url_parse () with it).  */
+void
+set_ugly_no_encode (bool ugly)
+{
+  ugly_no_encode = ugly;
+}
+
--- /dev/null
+/* Internationalization related declarations.
+ Copyright (C) 2008 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget. If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
+
+#ifndef IRI_H
+#define IRI_H
+
+#ifdef ENABLE_IRI
+
+/* Charset discovery and validation.  */
+char *parse_charset (char *str);
+char *find_locale (void);
+bool check_encoding_name (char *encoding);
+
+/* Conversions to UTF-8 and IDN host name encoding/decoding.  */
+const char *locale_to_utf8 (const char *str);
+bool remote_to_utf8 (const char *str, const char **new);
+char *idn_encode (char *host, bool utf8_encoded);
+char *idn_decode (char *host);
+
+/* "Remote" is the encoding used for the current fetch, "current" the
+   encoding recorded for links found from now on.  */
+char *get_remote_charset (void);
+char *get_current_charset (void);
+void set_current_charset (char *charset);
+void set_current_as_locale (void);
+void set_remote_charset (char *charset);
+void set_remote_as_current (void);
+
+/* Whether the current URL will be / is encoded in UTF-8.  */
+void reset_utf8_encode (void);
+void set_utf8_encode (bool encode);
+bool get_utf8_encode (void);
+
+/* ugly ugly ugly */
+void set_ugly_no_encode (bool ugly);
+
+#else /* ENABLE_IRI */
+
+/* IRI support is compiled out: every entry point becomes a no-op that
+   evaluates to "nothing found" / "nothing done".  */
+#define parse_charset(str) NULL
+#define find_locale() NULL
+#define check_encoding_name(str) false
+#define locale_to_utf8(str) (str)
+#define idn_encode(str,encoded) NULL
+#define idn_decode(str) NULL
+#define get_remote_charset() NULL
+#define get_current_charset() NULL
+#define set_current_charset(str)
+#define set_current_as_locale()
+#define set_remote_charset(str)
+#define set_remote_as_current()
+#define remote_to_utf8(a,b) false
+#define reset_utf8_encode()
+#define set_utf8_encode(a)
+#define get_utf8_encode() false
+#define set_ugly_no_encode(a)
+
+#endif /* ENABLE_IRI */
+#endif /* IRI_H */
#include "utils.h"
#include "log.h"
-/* This file impplement support for "logging". Logging means printing
+/* This file implements support for "logging". Logging means printing
output, plus several additional features:
- Cataloguing output by importance. You can specify that a log
#include <assert.h>
#include <errno.h>
#include <time.h>
+#ifdef ENABLE_IRI
+#include <langinfo.h>
+#endif
#include "utils.h"
#include "init.h"
#include "convert.h"
#include "spider.h"
#include "http.h" /* for save_cookies */
+#include "iri.h"
#include <getopt.h>
#include <getpass.h>
{ "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 },
#endif
{ "input-file", 'i', OPT_VALUE, "input", -1 },
+ { "iri", 0, OPT_BOOLEAN, "iri", -1 },
{ "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 },
{ "level", 'l', OPT_VALUE, "reclevel", -1 },
{ "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
{ "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
+ { "locale", 0, OPT_VALUE, "locale", -1 },
{ "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
{ "mirror", 'm', OPT_BOOLEAN, "mirror", -1 },
{ "no", 'n', OPT__NO, NULL, required_argument },
{ "referer", 0, OPT_VALUE, "referer", -1 },
{ "reject", 'R', OPT_VALUE, "reject", -1 },
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
+ { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1},
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
{ "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
{ "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
exit (1);
}
+#ifdef ENABLE_IRI
+ if (opt.enable_iri)
+ {
+ if (opt.locale && !check_encoding_name (opt.locale))
+ opt.locale = NULL;
+
+ if (!opt.locale)
+ opt.locale = find_locale ();
+
+ if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote))
+ opt.encoding_remote = NULL;
+
+ /*logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));*/
+ }
+#else
+ if (opt.enable_iri || opt.locale || opt.encoding_remote)
+ {
+ /* sXXXav : be more specific... */
+ printf(_("This version does not have support for IRIs\n"));
+ exit(1);
+ }
+#endif
+
if (opt.ask_passwd)
{
opt.passwd = prompt_for_password ();
char *filename = NULL, *redirected_URL = NULL;
int dt;
+ set_current_as_locale ();
+ set_ugly_no_encode (false);
+
if ((opt.recursive || opt.page_requisites)
&& (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
{
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (url_scheme (*t) == SCHEME_FTP)
+ if (url_scheme (*t) == SCHEME_FTP)
opt.follow_ftp = 1;
-
+
status = retrieve_tree (*t);
opt.follow_ftp = old_follow_ftp;
}
else
- status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+ {
+ set_remote_as_current ();
+ status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+ }
if (opt.delete_after && file_exists_p(filename))
{
bool content_disposition; /* Honor HTTP Content-Disposition header. */
bool auth_without_challenge; /* Issue Basic authentication creds without
waiting for a challenge. */
+
+ bool enable_iri;
+ char *encoding_remote;
+ char *locale;
};
extern struct options opt;
#include "html-url.h"
#include "css-url.h"
#include "spider.h"
-
+#include "iri.h"
+\f
/* Functions for maintaining the URL queue. */
struct queue_element {
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
+ char *remote_encoding;
bool css_allowed; /* whether the document is allowed to
be treated as CSS. */
struct queue_element *next; /* next element in queue */
bool html_allowed, bool css_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
+ char *charset = get_current_charset ();
qel->url = url;
qel->referer = referer;
qel->depth = depth;
qel->css_allowed = css_allowed;
qel->next = NULL;
+ if (charset)
+ qel->remote_encoding = xstrdup (charset);
+ else
+ qel->remote_encoding = NULL;
+
++queue->count;
if (queue->count > queue->maxcount)
queue->maxcount = queue->count;
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
+ /*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/
+
if (queue->tail)
queue->tail->next = qel;
queue->tail = qel;
if (!queue->head)
queue->tail = NULL;
+ set_remote_charset (qel->remote_encoding);
+ if (qel->remote_encoding)
+ xfree (qel->remote_encoding);
+
*url = qel->url;
*referer = qel->referer;
*depth = qel->depth;
struct hash_table *blacklist;
int up_error_code;
- struct url *start_url_parsed = url_parse (start_url, &up_error_code);
+ struct url *start_url_parsed;
+ set_ugly_no_encode (true);
+ start_url_parsed= url_parse (start_url, &up_error_code);
+ set_ugly_no_encode (false);
if (!start_url_parsed)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
if (children)
{
struct urlpos *child = children;
- struct url *url_parsed = url_parsed = url_parse (url, NULL);
+ set_ugly_no_encode (true);
+ struct url *url_parsed = url_parse (url, NULL);
+ set_ugly_no_encode (false);
char *referer_url = url;
bool strip_auth = (url_parsed != NULL
&& url_parsed->user != NULL);
}
}
- if (file
- && (opt.delete_after
+ if (file
+ && (opt.delete_after
|| opt.spider /* opt.recursive is implicitely true */
|| !acceptable (file)))
{
/* Either --delete-after was specified, or we loaded this
- (otherwise unneeded because of --spider or rejected by -R)
- HTML file just to harvest its hyperlinks -- in either case,
+ (otherwise unneeded because of --spider or rejected by -R)
+ HTML file just to harvest its hyperlinks -- in either case,
delete the local file. */
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
opt.delete_after ? "--delete-after" :
- (opt.spider ? "--spider" :
+ (opt.spider ? "--spider" :
"recursive rejection criteria")));
logprintf (LOG_VERBOSE,
(opt.delete_after || opt.spider
if (string_set_contains (blacklist, url))
{
- if (opt.spider)
+ if (opt.spider)
{
char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
struct urlpos *upos;
bool success;
+ set_ugly_no_encode (true);
orig_parsed = url_parse (original, NULL);
assert (orig_parsed != NULL);
new_parsed = url_parse (redirected, NULL);
assert (new_parsed != NULL);
+ set_ugly_no_encode (false);
upos = xnew0 (struct urlpos);
upos->url = new_parsed;
#include "hash.h"
#include "convert.h"
#include "ptimer.h"
+#include "iri.h"
#include "html-url.h"
/* Total size of downloaded files. Used to enforce quota. */
if (file)
*file = NULL;
+ reset_utf8_encode ();
+
+ second_try:
u = url_parse (url, &up_error_code);
if (!u)
{
return URLERROR;
}
+ /*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/
+
if (!refurl)
refurl = opt.referer;
proxy = getproxy (u);
if (proxy)
{
+ /* sXXXav : support IRI for proxy */
/* Parse the proxy URL. */
+ set_ugly_no_encode (true);
proxy_url = url_parse (proxy, &up_error_code);
+ set_ugly_no_encode (false);
if (!proxy_url)
{
logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
xfree (mynewloc);
mynewloc = construced_newloc;
+ reset_utf8_encode ();
+
/* Now, see if this new location makes sense. */
newloc_parsed = url_parse (mynewloc, &up_error_code);
if (!newloc_parsed)
goto redirected;
}
- if (local_file)
+ /* Try to not encode in UTF-8 if fetching failed */
+ if (!(*dt & RETROKF) && get_utf8_encode ())
{
+ set_utf8_encode (false);
+ /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/
+ goto second_try;
+ }
+
+ if (local_file && *dt & RETROKF)
+ {
+ register_download (u->url, local_file);
+ if (redirection_count && 0 != strcmp (origurl, u->url))
+ register_redirection (origurl, u->url);
+ if (*dt & TEXTHTML)
+ register_html (u->url, local_file);
if (*dt & RETROKF)
{
register_download (u->url, local_file);
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (cur_url->url->scheme == SCHEME_FTP)
+ if (cur_url->url->scheme == SCHEME_FTP)
opt.follow_ftp = 1;
-
+
status = retrieve_tree (cur_url->url->url);
opt.follow_ftp = old_follow_ftp;
url_uses_proxy (const char *url)
{
bool ret;
- struct url *u = url_parse (url, NULL);
+ struct url *u;
+ set_ugly_no_encode(true);
+ u= url_parse (url, NULL);
+ set_ugly_no_encode(false);
if (!u)
return false;
ret = getproxy (u) != NULL;
#include "utils.h"
#include "url.h"
#include "host.h" /* for is_valid_ipv6_address */
+#include "iri.h"
#ifdef TESTING
#include "test.h"
goto error;
}
+ if (opt.enable_iri && get_utf8_encode ())
+ {
+ const char *new;
+ bool utf8_encode;
+ url_unescape ((char *) url);
+ utf8_encode = remote_to_utf8 (url, &new);
+ set_utf8_encode (utf8_encode);
+ if (utf8_encode)
+ url = new;
+ }
+
url_encoded = reencode_escapes (url);
p = url_encoded;
host_modified = true;
}
+ if (opt.enable_iri)
+ {
+ char *new = idn_encode (u->host, get_utf8_encode ());
+ if (new)
+ {
+ xfree (u->host);
+ u->host = new;
+ host_modified = true;
+ }
+ }
+
if (params_b)
u->params = strdupdelim (params_b, params_e);
if (query_b)
if (fragment_b)
u->fragment = strdupdelim (fragment_b, fragment_e);
- if (path_modified || u->fragment || host_modified || path_b == path_e)
+ if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
{
/* If we suspect that a transformation has rendered what
url_string might return different from URL_ENCODED, rebuild