From: Saint Xavier Date: Sun, 20 Jul 2008 19:45:09 +0000 (+0200) Subject: Automated merge. X-Git-Tag: v1.13~338^2~7^2~6^2~13^2~2 X-Git-Url: http://sjero.net/git/?p=wget;a=commitdiff_plain;h=b30a0dd817886f77a64be9218c5e5399bcbc2e67;hp=b28a6abfe66e03dae1f749d8215f4ba2b7303e5a Automated merge. --- diff --git a/ChangeLog b/ChangeLog index d96ce355..21d380b2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -4,6 +4,14 @@ * AUTHORS: Added Steven Schubiger. +2008-06-26 Xavier Saint + + * configure.ac : IRIs support required libiconv, check it. + +2008-06-14 Xavier Saint + + * configure.ac: Add support for IRIs + 2008-05-29 Micah Cowan * po/*.po: Updated from TP (the 1.11.3 set). diff --git a/configure.ac b/configure.ac index 2ccc703d..fb0c65d1 100644 --- a/configure.ac +++ b/configure.ac @@ -460,6 +460,77 @@ else fi AC_SUBST(COMMENT_IF_NO_POD2MAN) + +dnl +dnl Check for IDN/IRIs +dnl + +AC_ARG_ENABLE(iri, + AC_HELP_STRING([--disable-iri],[disable IDN/IRIs support]), + [case "${enable_iri}" in + no) + dnl Disable IRIs checking + AC_MSG_NOTICE([disabling IRIs at user request]) + iri=no + ;; + yes) + dnl IRIs explicitly enabled + iri=yes + force_iri=yes + ;; + auto) + dnl Auto-detect IRI + iri=yes + ;; + *) + AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri']) + ;; + esac + ], [ + dnl If nothing is specified, assume auto-detection + iri=yes + ] +) + +AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]], + [Support IDN/IRIs (needs GNU Libidn)]), + libidn=$withval, libidn="") +if test "X$iri" != "Xno"; then + AM_ICONV + + if test "X$am_cv_func_iconv" != "Xyes"; then + iri=no + if test "X$force_iri" = "Xyes"; then + AC_MSG_ERROR([Libiconv is required for IRIs support]) + else + AC_MSG_NOTICE([disabling IRIs because libiconv wasn't found]) + fi + fi +fi + +if test "X$iri" != "Xno"; then + if test "$libidn" != ""; then + LDFLAGS="${LDFLAGS} -L$libidn/lib" + CPPFLAGS="${CPPFLAGS} -I$libidn/include" + fi + AC_CHECK_HEADER(idna.h, + AC_CHECK_LIB(idn, stringprep_check_version, + 
[iri=yes LIBS="${LIBS} -lidn"], iri=no), + iri=no) + + if test "X$iri" != "Xno" ; then + AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.]) + AC_MSG_NOTICE([Enabling support for IRI.]) + else + AC_MSG_WARN([Libidn not found]) + fi +fi + + +dnl Needed by src/Makefile.am +AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"]) + + dnl dnl Create output dnl diff --git a/src/ChangeLog b/src/ChangeLog index e551f1c9..02bc331b 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,8 +1,24 @@ +2008-07-02 Xavier Saint + + * iri.c, iri.h : New function idn_decode() to decode ASCII + encoded hostname to the locale. + + * host.c : Show hostname to be resolved both in locale and + ASCII encoded. + 2008-06-28 Steven Schubiger * retr.c (retrieve_from_file): Allow for reading the links from an external file (HTTP/FTP). +2008-06-26 Xavier Saint + + * iri.c, iri.h : New functions locale_to_utf8() and + idn_encode() adding basic capabilities of IRI/IDN. + + * url.c : Convert URLs from locale to UTF-8 allowing a basic + support of IRI/IDN + 2008-06-25 Steven Schubiger * ftp.c (getftp): When spidering a FTP URL, emit a diagnostic @@ -27,7 +43,7 @@ * http.c: Make -nv --spider include the file's name when it exists. - + 2008-06-22 Micah Cowan * Makefile.am (version.c): Fixed version string invocation so it @@ -35,12 +51,57 @@ string vars pointers-to-const, and moved line lengths below 80 (in Makefile.am, not in version.c). +2008-06-19 Xavier Saint + + * iri.c, iri.h : New function check_encoding_name() as + a preliminary encoding name check. + + * main.c, iri.c : Make use of check_encoding_name(). + +2008-06-19 Xavier Saint + + * iri.c : Include missing stringprep.h file and add a + cast. + + * init.c : set a default initial value for opt.enable_iri, + opt.locale and opt.encoding_remote. + +2008-06-19 Xavier Saint + + * iri.c, iri.h : Add a new function find_locale() to find + out the local system encoding. + + * main.c : Make use of find_locale(). 
+ +2008-06-19 Xavier Saint + + * html-url.c : Add "content-type" meta tag parsing for + retrieving page encoding. + + * iri.h : Make no-op version of parse_charset() return + NULL. + 2008-06-16 Micah Cowan * http.c (http_loop): When hstat.len is higher than the successfully completed content's length, but it's because we _set_ it that way, don't abort. +2008-06-14 Xavier Saint + + * iri.c, iri.h : New files. + + * Makefile.am : Add files iri.h and conditional iri.c. + + * build_info.c : Add compiled feature "iri". + + * http.c : include iri.h and parse charset from Content-Type + header. + + * init.c, main.c, options.h : if an options isn't supported + at compiled time, don't get rid off it and show a dummy + message instead if they are used. + 2008-06-13 Micah Cowan * build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL @@ -84,11 +145,11 @@ default. 2008-05-17 Kenny Parnell - + (cmd_spec_prefer_family): Initialize prefer_family to prefer_none. 2008-05-17 Micah Cowan - + * main.c (main): Handle Ctrl-D on command-line. 2008-05-15 Steven Schubiger @@ -127,7 +188,7 @@ * options.h: Add an according boolean member to the options struct. - + * sysdep.h: Comment the defines __EXTENSIONS__ and _GNU_SOURCE out, because they're now defined independently by config.h. diff --git a/src/Makefile.am b/src/Makefile.am index 6db4ac17..edbb592e 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -30,6 +30,10 @@ # Version: @VERSION@ # +if IRI_IS_ENABLED +IRI_OBJ = iri.c +endif + # The following line is losing on some versions of make! 
DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\" LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@ @@ -40,8 +44,8 @@ wget_SOURCES = build_info.c cmpt.c connect.c convert.c cookies.c ftp.c \ ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \ http.c init.c log.c main.c netrc.c progress.c ptimer.c \ recur.c res.c retr.c snprintf.c spider.c url.c \ - utils.c \ - css-url.h connect.h convert.h cookies.h \ + utils.c $(IRI_OBJ) \ + css-url.h connect.h convert.h cookies.h \ ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \ http.h http-ntlm.h init.h log.h mswindows.h netrc.h \ options.h progress.h ptimer.h recur.h res.h retr.h \ diff --git a/src/build_info.c b/src/build_info.c index e330b653..00d5122d 100644 --- a/src/build_info.c +++ b/src/build_info.c @@ -100,6 +100,13 @@ const char* (compiled_features[]) = #else "-gettext", #endif + +#ifdef ENABLE_IRI + "+iri", +#else + "-iri", +#endif + /* sentinel value */ NULL }; diff --git a/src/connect.c b/src/connect.c index 1e8f07e5..6cfdb4b7 100644 --- a/src/connect.c +++ b/src/connect.c @@ -58,6 +58,7 @@ as that of the covered work. */ #include "host.h" #include "connect.h" #include "hash.h" +#include "iri.h" /* Define sockaddr_storage where unavailable (presumably on IPv4-only hosts). */ @@ -266,9 +267,25 @@ connect_to_ip (const ip_address *ip, int port, const char *print) if (print) { const char *txt_addr = print_address (ip); - if (print && 0 != strcmp (print, txt_addr)) - logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "), - escnonprint_uri (print), txt_addr, port); + if (0 != strcmp (print, txt_addr)) + { + char *str = NULL, *name; + + if (opt.enable_iri && (name = idn_decode ((char *) print)) != NULL) + { + int len = strlen (print) + strlen (name) + 4; + str = xmalloc (len); + snprintf (str, len, "%s (%s)", name, print); + str[len-1] = '\0'; + xfree (name); + } + + logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "), + str ? 
str : escnonprint_uri (print), txt_addr, port); + + if (str) + xfree (str); + } else logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port); } diff --git a/src/ftp-basic.c b/src/ftp-basic.c index 265a1e25..5f250959 100644 --- a/src/ftp-basic.c +++ b/src/ftp-basic.c @@ -68,7 +68,7 @@ ftp_response (int fd, char **ret_line) return FTPRERR; /* Strip trailing CRLF before printing the line, so that - escnonprint doesn't include bogus \012 and \015. */ + quotting doesn't include bogus \012 and \015. */ p = strchr (line, '\0'); if (p > line && p[-1] == '\n') *--p = '\0'; diff --git a/src/host.c b/src/host.c index fdb35b1c..1226a274 100644 --- a/src/host.c +++ b/src/host.c @@ -53,6 +53,7 @@ as that of the covered work. */ #include "host.h" #include "url.h" #include "hash.h" +#include "iri.h" #ifndef NO_ADDRESS # define NO_ADDRESS NO_DATA @@ -712,8 +713,24 @@ lookup_host (const char *host, int flags) /* No luck with the cache; resolve HOST. */ if (!silent && !numeric_address) - logprintf (LOG_VERBOSE, _("Resolving %s... "), - quotearg_style (escape_quoting_style, host)); + { + char *str = NULL, *name; + + if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL) + { + int len = strlen (host) + strlen (name) + 4; + str = xmalloc (len); + snprintf (str, len, "%s (%s)", name, host); + str[len-1] = '\0'; + xfree (name); + } + + logprintf (LOG_VERBOSE, _("Resolving %s... "), + quotearg_style (escape_quoting_style, str ? str : host)); + + if (str) + xfree (str); + } #ifdef ENABLE_IPV6 { diff --git a/src/html-url.c b/src/html-url.c index 75bec7d9..ef93a7e4 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -44,6 +44,7 @@ as that of the covered work. 
*/ #include "recur.h" #include "html-url.h" #include "css-url.h" +#include "iri.h" typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *); @@ -284,7 +285,9 @@ append_url (const char *link_uri, int position, int size, return NULL; } + set_ugly_no_encode (true); url = url_parse (link_uri, NULL); + set_ugly_no_encode (false); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", @@ -303,7 +306,9 @@ append_url (const char *link_uri, int position, int size, DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", ctx->document_file, base, link_uri, complete_uri)); + set_ugly_no_encode (true); url = url_parse (complete_uri, NULL); + set_ugly_no_encode (false); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", @@ -553,6 +558,25 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) entry->link_expect_html = 1; } } + else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type")) + { + /* Handle stuff like: + */ + + char *mcharset; + char *content = find_attr (tag, "content", NULL); + if (!content) + return; + + mcharset = parse_charset (content); + if (!mcharset) + return; + + /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/ + + set_current_charset (mcharset); + xfree (mcharset); + } else if (name && 0 == strcasecmp (name, "robots")) { /* Handle stuff like: @@ -726,7 +750,9 @@ get_urls_file (const char *file) url_text = merged; } + set_ugly_no_encode (true); url = url_parse (url_text, &up_error_code); + set_ugly_no_encode (false); if (!url) { logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), diff --git a/src/http.c b/src/http.c index 52f65fed..5ec70d27 100644 --- a/src/http.c +++ b/src/http.c @@ -49,6 +49,7 @@ as that of the covered work. 
*/ #include "retr.h" #include "connect.h" #include "netrc.h" +#include "iri.h" #ifdef HAVE_SSL # include "ssl.h" #endif @@ -1827,7 +1828,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy) hs->local_file = url_file_name (u); } } - + /* TODO: perform this check only once. */ if (!hs->existence_checked && file_exists_p (hs->local_file)) { @@ -1896,7 +1897,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); local_dot_orig_file_exists = true; local_filename = filename_plus_orig_suffix; } - } + } if (!local_dot_orig_file_exists) /* Couldn't stat() .orig, so try to stat() . */ @@ -2048,9 +2049,16 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); char *tmp = strchr (type, ';'); if (tmp) { + /* sXXXav: only needed if IRI support is enabled */ + char *tmp2 = tmp + 1; + while (tmp > type && c_isspace (tmp[-1])) --tmp; *tmp = '\0'; + + /* Try to get remote encoding if needed */ + if (opt.enable_iri && !opt.encoding_remote) + set_current_charset (parse_charset (tmp2)); } } hs->newloc = resp_header_strdup (resp, "Location"); @@ -2336,16 +2344,16 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer, uerr_t err, ret = TRYLIMEXC; time_t tmr = -1; /* remote time-stamp */ struct http_stat hstat; /* HTTP status */ - struct_stat st; + struct_stat st; bool send_head_first = true; /* Assert that no value for *LOCAL_FILE was passed. */ assert (local_file == NULL || *local_file == NULL); - + /* Set LOCAL_FILE parameter. */ if (local_file && opt.output_document) *local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document); - + /* Reset NEWLOC parameter. */ *newloc = NULL; @@ -2382,7 +2390,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer, retrieve the file. But if the output_document was given, then this test was already done and the file didn't exist. 
Hence the !opt.output_document */ logprintf (LOG_VERBOSE, _("\ -File %s already there; not retrieving.\n\n"), +File %s already there; not retrieving.\n\n"), quote (hstat.local_file)); /* If the file is there, we suppose it's retrieved OK. */ *dt |= RETROKF; @@ -2398,10 +2406,10 @@ File %s already there; not retrieving.\n\n"), /* Reset the counter. */ count = 0; - + /* Reset the document type. */ *dt = 0; - + /* Skip preliminary HEAD request if we're not in spider mode AND * if -O was given or HTTP Content-Disposition support is disabled. */ if (!opt.spider @@ -2410,21 +2418,21 @@ File %s already there; not retrieving.\n\n"), /* Send preliminary HEAD request if -N is given and we have an existing * destination file. */ - if (opt.timestamping + if (opt.timestamping && !opt.content_disposition && file_exists_p (url_file_name (u))) send_head_first = true; - + /* THE loop */ do { /* Increment the pass counter. */ ++count; sleep_between_retrievals (count); - + /* Get the current time string. */ tms = datetime_str (time (NULL)); - + if (opt.spider && !got_head) logprintf (LOG_VERBOSE, _("\ Spider mode enabled. Check if remote file exists.\n")); @@ -2433,20 +2441,20 @@ Spider mode enabled. Check if remote file exists.\n")); if (opt.verbose) { char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD); - - if (count > 1) + + if (count > 1) { char tmp[256]; sprintf (tmp, _("(try:%2d)"), count); logprintf (LOG_NOTQUIET, "--%s-- %s %s\n", tms, tmp, hurl); } - else + else { logprintf (LOG_NOTQUIET, "--%s-- %s\n", tms, hurl); } - + #ifdef WINDOWS ws_changetitle (hurl); #endif @@ -2456,7 +2464,7 @@ Spider mode enabled. Check if remote file exists.\n")); /* Default document type is empty. However, if spider mode is on or time-stamping is employed, HEAD_ONLY commands is encoded within *dt. */ - if (send_head_first && !got_head) + if (send_head_first && !got_head) *dt |= HEAD_ONLY; else *dt &= ~HEAD_ONLY; @@ -2493,7 +2501,7 @@ Spider mode enabled. Check if remote file exists.\n")); /* Time? 
*/ tms = datetime_str (time (NULL)); - + /* Get the new location (with or without the redirection). */ if (hstat.newloc) *newloc = xstrdup (hstat.newloc); @@ -2532,7 +2540,7 @@ Spider mode enabled. Check if remote file exists.\n")); hstat.statcode); ret = WRONGCODE; } - else + else { ret = NEWLOCATION; } @@ -2548,7 +2556,7 @@ Spider mode enabled. Check if remote file exists.\n")); /* All possibilities should have been exhausted. */ abort (); } - + if (!(*dt & RETROKF)) { char *hurl = NULL; @@ -2567,11 +2575,13 @@ Spider mode enabled. Check if remote file exists.\n")); continue; } /* Maybe we should always keep track of broken links, not just in - * spider mode. */ - else if (opt.spider) + * spider mode. + * Don't log error if it was utf8 encoded because we will try + * one unencoded. */ + else if (opt.spider && !get_utf8_encode ()) { /* #### Again: ugly ugly ugly! */ - if (!hurl) + if (!hurl) hurl = url_string (u, URL_AUTH_HIDE_PASSWD); nonexisting_url (hurl); logprintf (LOG_NOTQUIET, _("\ @@ -2580,7 +2590,7 @@ Remote file does not exist -- broken link!!!\n")); else { logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), - tms, hstat.statcode, + tms, hstat.statcode, quotearg_style (escape_quoting_style, hstat.error)); } logputs (LOG_VERBOSE, "\n"); diff --git a/src/init.c b/src/init.c index a634fa79..f56aa652 100644 --- a/src/init.c +++ b/src/init.c @@ -181,9 +181,11 @@ static const struct { { "inet6only", &opt.ipv6_only, cmd_boolean }, #endif { "input", &opt.input_filename, cmd_file }, + { "iri", &opt.enable_iri, cmd_boolean }, { "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean }, { "limitrate", &opt.limit_rate, cmd_bytes }, { "loadcookies", &opt.cookies_input, cmd_file }, + { "locale", &opt.locale, cmd_string }, { "logfile", &opt.lfilename, cmd_file }, { "login", &opt.ftp_user, cmd_string },/* deprecated*/ { "maxredirect", &opt.max_redirect, cmd_number }, @@ -223,6 +225,7 @@ static const struct { { "referer", &opt.referer, cmd_string }, { "reject", 
&opt.rejects, cmd_vector }, { "relativeonly", &opt.relative_only, cmd_boolean }, + { "remoteencoding", &opt.encoding_remote, cmd_string }, { "removelisting", &opt.remove_listing, cmd_boolean }, { "restrictfilenames", NULL, cmd_spec_restrict_file_names }, { "retrsymlinks", &opt.retr_symlinks, cmd_boolean }, @@ -330,6 +333,14 @@ defaults (void) opt.restrict_files_case = restrict_no_case_restriction; opt.max_redirect = 20; + +#ifdef ENABLE_IRI + opt.enable_iri = true; +#else + opt.enable_iri = false; +#endif + opt.locale = NULL; + opt.encoding_remote = NULL; } /* Return the user's home directory (strdup-ed), or NULL if none is diff --git a/src/iri.c b/src/iri.c new file mode 100644 index 00000000..c28d4f51 --- /dev/null +++ b/src/iri.c @@ -0,0 +1,394 @@ +/* IRI related functions. + Copyright (C) 2008 Free Software Foundation, Inc. + +This file is part of GNU Wget. + +GNU Wget is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at +your option) any later version. + +GNU Wget is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Wget. If not, see . + +Additional permission under GNU GPL version 3 section 7 + +If you modify this program, or any covered work, by linking or +combining it with the OpenSSL project's OpenSSL library (or a +modified version of that library), containing parts covered by the +terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +grants you additional permission to convey the resulting work. 
+Corresponding Source for a non-source form of such a combination +shall include the source code for the parts of OpenSSL used as well +as that of the covered work. */ + +#include "wget.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "iri.h" + +/* RFC3987 section 3.1 mandates STD3 ASCII RULES */ +#define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES + +/* Note: locale encoding is kept in options struct (opt.locale) */ + +/* Hold the encoding used for the current fetch */ +char *remote; + +/* Hold the encoding for the future found links */ +char *current; + +/* Will/Is the current URL encoded in utf8 ? */ +bool utf8_encode; + +/* Force no utf8 encoding for url_parse () */ +bool ugly_no_encode; + +static iconv_t locale2utf8; + +static bool open_locale_to_utf8 (void); +static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out); + + +/* Given a string containing "charset=XXX", return the encoding if found, + or NULL otherwise */ +char * +parse_charset (char *str) +{ + char *charset; + + if (!str || !*str) + return NULL; + + str = strcasestr (str, "charset="); + if (!str) + return NULL; + + str += 8; + charset = str; + + /* sXXXav: which chars should be banned ??? */ + while (*charset && !c_isspace (*charset)) + charset++; + + /* sXXXav: could strdupdelim return NULL ? */ + charset = strdupdelim (str, charset); + + /* Do a minimum check on the charset value */ + if (!check_encoding_name (charset)) + { + xfree (charset); + return NULL; + } + + /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/ + + return charset; +} + +/* Find the locale used, or fall back on a default value */ +char * +find_locale (void) +{ + return (char *) stringprep_locale_charset (); +} + +/* Basic check of an encoding name. 
*/ +bool +check_encoding_name (char *encoding) +{ + char *s = encoding; + + while (*s) + { + if (!c_isascii (*s) || c_isspace (*s)) + { + logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding)); + return false; + } + + s++; + } + + return true; +} + +/* Try opening an iconv_t descriptor for conversion from locale to UTF-8 */ +static bool +open_locale_to_utf8 (void) +{ + if (locale2utf8) + return true; + + /* sXXXav : That shouldn't happen, just in case */ + if (!opt.locale) + { + logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n"); + opt.locale = find_locale (); + } + + if (!opt.locale) + return false; + + locale2utf8 = iconv_open ("UTF-8", opt.locale); + if (locale2utf8 != (iconv_t)(-1)) + return true; + + logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n", + quote (opt.locale), quote ("UTF-8")); + locale2utf8 = NULL; + return false; +} + +/* Try converting string str from locale to UTF-8. Return a new string + on success, or str on error or if conversion isn't needed. */ +const char * +locale_to_utf8 (const char *str) +{ + char *new; + + if (!strcasecmp (opt.locale, "utf-8")) + return str; + + if (!open_locale_to_utf8 ()) + return str; + + if (do_conversion (locale2utf8, (char *) str, strlen ((char *) str), &new)) + return (const char *) new; + + return str; +} + +/* Do the conversion according to the passed conversion descriptor cd. *out + will contain the transcoded string on success. *out content is + unspecified otherwise. */ +static bool +do_conversion (iconv_t cd, char *in, size_t inlen, char **out) +{ + /* sXXXav : hummm hard to guess...
*/ + size_t len, done, outlen = inlen * 2; + int invalid = 0, tooshort = 0; + char *s; + + s = xmalloc (outlen + 1); + *out = s; + len = outlen; + done = 0; + + for (;;) + { + if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1)) + { + *out = s; + *(s + len - outlen - done) = '\0'; + return true; + } + + /* Incomplete or invalid multibyte sequence */ + if (errno == EINVAL || errno == EILSEQ) + { + if (!invalid) + logprintf (LOG_VERBOSE, + "Incomplete or invalide multibyte sequence encountered\n"); + + invalid++; + **out = *in; + in++; + inlen--; + (*out)++; + outlen--; + } + else if (errno == E2BIG) /* Output buffer full */ + { + char *new; + + tooshort++; + done = len; + outlen = done + inlen * 2; + new = xmalloc (outlen + 1); + memcpy (new, s, done); + xfree (s); + s = new; + len = outlen; + *out = s + done; + } + else /* Weird, we got an unspecified error */ + { + logprintf (LOG_VERBOSE, "Unhandled errno %d\n", errno); + break; + } + } + + return false; +} + +/* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL + on error. */ +char * +idn_encode (char *host, bool utf8_encoded) +{ + char *new; + int ret; + + /* Encode to UTF-8 if not done using current remote */ + if (!utf8_encoded) + { + if (!remote_to_utf8 ((const char *) host, (const char **) &new)) + { + /* Nothing to encode or an error occurred */ + return NULL; + } + + host = new; + } + + /* toASCII UTF-8 NUL-terminated string */ + ret = idna_to_ascii_8z (host, &new, IDNA_FLAGS); + if (ret != IDNA_SUCCESS) + { + /* sXXXav : free new when needed ! */ + logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret, + quote (idna_strerror (ret))); + return NULL; + } + + return new; +} + +/* Try to decode an "ASCII encoded" host. Return the new domain in the locale + on success or NULL on error.
*/ +char * +idn_decode (char *host) +{ + char *new; + int ret; + + ret = idna_to_unicode_8zlz (host, &new, IDNA_FLAGS); + if (ret != IDNA_SUCCESS) + { + logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", ret, + quote (idna_strerror (ret))); + return NULL; + } + + return new; +} + +/* Try to transcode string str from remote encoding to UTF-8. On success, *new + contains the transcoded string. *new content is unspecified otherwise. */ +bool +remote_to_utf8 (const char *str, const char **new) +{ + char *r; + iconv_t cd; + bool ret = false; + + if (opt.encoding_remote) + r = opt.encoding_remote; + else if (current) + r = current; + else + return false; + + cd = iconv_open ("UTF-8", r); + if (cd == (iconv_t)(-1)) + return false; + + if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new)) + ret = true; + + iconv_close (cd); + + /* Test if something was converted */ + if (!strcmp (str, *new)) + { + xfree ((char *) *new); + return false; + } + + return ret; +} + +char *get_remote_charset (void) +{ + return remote; +} + +char *get_current_charset (void) +{ + return current; +} + +void set_current_charset (char *charset) +{ + /*printf("[ current = `%s'\n", charset);*/ + if (current) + xfree (current); + + current = charset ? xstrdup (charset) : NULL; +} + +void set_current_as_locale (void) +{ + /*printf("[ current = locale = `%s'\n", opt.locale);*/ + if (current) + xfree (current); + + /* sXXXav : assert opt.locale NULL ? */ + current = xstrdup (opt.locale); +} + +void +set_remote_charset (char *charset) +{ + /*printf("[ remote = `%s'\n", charset);*/ + if (remote) + xfree (remote); + + remote = charset ? xstrdup (charset) : NULL; +} + +void +set_remote_as_current (void) +{ + /*printf("[ remote = current = `%s'\n", current);*/ + if (remote) + xfree (remote); + + remote = current ? 
xstrdup (current) : NULL; +} + +void reset_utf8_encode (void) +{ + set_utf8_encode (opt.enable_iri); +} + +void set_utf8_encode (bool encode) +{ + utf8_encode = encode; +} + +bool get_utf8_encode (void) +{ + return (!ugly_no_encode && utf8_encode); +} + +void set_ugly_no_encode (bool ugly) +{ + ugly_no_encode = ugly; +} + diff --git a/src/iri.h b/src/iri.h new file mode 100644 index 00000000..50102df4 --- /dev/null +++ b/src/iri.h @@ -0,0 +1,78 @@ +/* Internationalization related declarations. + Copyright (C) 2008 Free Software Foundation, Inc. + +This file is part of GNU Wget. + +GNU Wget is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +GNU Wget is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Wget. If not, see . + +Additional permission under GNU GPL version 3 section 7 + +If you modify this program, or any covered work, by linking or +combining it with the OpenSSL project's OpenSSL library (or a +modified version of that library), containing parts covered by the +terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +grants you additional permission to convey the resulting work. +Corresponding Source for a non-source form of such a combination +shall include the source code for the parts of OpenSSL used as well +as that of the covered work. 
*/ + +#ifndef IRI_H +#define IRI_H + +#ifdef ENABLE_IRI + +char *parse_charset (char *str); +char *find_locale (void); +bool check_encoding_name (char *encoding); +const char *locale_to_utf8 (const char *str); +char *idn_encode (char *host, bool utf8_encoded); +char *idn_decode (char *host); +char *get_remote_charset (void); +char *get_current_charset (void); +void set_current_charset (char *charset); +void set_current_as_locale (void); +void set_current_charset (char *charset); +void set_remote_charset (char *charset); +void set_remote_as_current (void); +bool remote_to_utf8 (const char *str, const char **new); +void reset_utf8_encode (void); +void set_utf8_encode (bool encode); +bool get_utf8_encode (void); + +/* ugly ugly ugly */ +void set_ugly_no_encode (bool ugly); + +#else /* ENABLE_IRI */ + +#define parse_charset(str) NULL +#define find_locale() NULL +#define check_encoding_name(str) false +#define locale_to_utf8(str) (str) +#define idn_encode(str,encoded) NULL +#define idn_decode(str) NULL +#define get_remote_charset() NULL +#define get_current_charset() NULL +#define set_current_charset(str) +#define set_current_as_locale() +#define set_current_charset(str) +#define set_remote_charset(str) +#define set_remote_as_current() +#define remote_to_utf8(a,b) false +#define reset_utf8_encode() +#define set_utf8_encode(a) +#define get_utf8_encode() false +#define set_ugly_no_encode(a) + +#endif /* ENABLE_IRI */ +#endif /* IRI_H */ diff --git a/src/log.c b/src/log.c index e84e5c61..b62bf9dd 100644 --- a/src/log.c +++ b/src/log.c @@ -43,7 +43,7 @@ as that of the covered work. */ #include "utils.h" #include "log.h" -/* This file impplement support for "logging". Logging means printing +/* This file implements support for "logging". Logging means printing output, plus several additional features: - Cataloguing output by importance.
You can specify that a log diff --git a/src/main.c b/src/main.c index 70387c9c..6135a67d 100644 --- a/src/main.c +++ b/src/main.c @@ -43,6 +43,9 @@ as that of the covered work. */ #include #include #include +#ifdef ENABLE_IRI +#include +#endif #include "utils.h" #include "init.h" @@ -54,6 +57,7 @@ as that of the covered work. */ #include "convert.h" #include "spider.h" #include "http.h" /* for save_cookies */ +#include "iri.h" #include #include @@ -200,10 +204,12 @@ static struct cmdline_option option_data[] = { "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 }, #endif { "input-file", 'i', OPT_VALUE, "input", -1 }, + { "iri", 0, OPT_BOOLEAN, "iri", -1 }, { "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 }, { "level", 'l', OPT_VALUE, "reclevel", -1 }, { "limit-rate", 0, OPT_VALUE, "limitrate", -1 }, { "load-cookies", 0, OPT_VALUE, "loadcookies", -1 }, + { "locale", 0, OPT_VALUE, "locale", -1 }, { "max-redirect", 0, OPT_VALUE, "maxredirect", -1 }, { "mirror", 'm', OPT_BOOLEAN, "mirror", -1 }, { "no", 'n', OPT__NO, NULL, required_argument }, @@ -237,6 +243,7 @@ static struct cmdline_option option_data[] = { "referer", 0, OPT_VALUE, "referer", -1 }, { "reject", 'R', OPT_VALUE, "reject", -1 }, { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 }, + { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1}, { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 }, { "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 }, { "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 }, @@ -1058,6 +1065,29 @@ for details.\n\n")); exit (1); } +#ifdef ENABLE_IRI + if (opt.enable_iri) + { + if (opt.locale && !check_encoding_name (opt.locale)) + opt.locale = NULL; + + if (!opt.locale) + opt.locale = find_locale (); + + if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote)) + opt.encoding_remote = NULL; + + /*logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));*/ + } +#else + if (opt.enable_iri || opt.locale || opt.encoding_remote) + 
{ + /* sXXXav : be more specific... */ + printf(_("This version does not have support for IRIs\n")); + exit(1); + } +#endif + if (opt.ask_passwd) { opt.passwd = prompt_for_password (); @@ -1161,21 +1191,27 @@ WARNING: Can't reopen standard output in binary mode;\n\ char *filename = NULL, *redirected_URL = NULL; int dt; + set_current_as_locale (); + set_ugly_no_encode (false); + if ((opt.recursive || opt.page_requisites) && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t))) { int old_follow_ftp = opt.follow_ftp; /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ - if (url_scheme (*t) == SCHEME_FTP) + if (url_scheme (*t) == SCHEME_FTP) opt.follow_ftp = 1; - + status = retrieve_tree (*t); opt.follow_ftp = old_follow_ftp; } else - status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive); + { + set_remote_as_current (); + status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive); + } if (opt.delete_after && file_exists_p(filename)) { diff --git a/src/options.h b/src/options.h index 6a6badb0..723f80a1 100644 --- a/src/options.h +++ b/src/options.h @@ -237,6 +237,10 @@ struct options bool content_disposition; /* Honor HTTP Content-Disposition header. */ bool auth_without_challenge; /* Issue Basic authentication creds without waiting for a challenge. */ + + bool enable_iri; + char *encoding_remote; + char *locale; }; extern struct options opt; diff --git a/src/recur.c b/src/recur.c index 729a14e9..24b80ad4 100644 --- a/src/recur.c +++ b/src/recur.c @@ -51,7 +51,8 @@ as that of the covered work. */ #include "html-url.h" #include "css-url.h" #include "spider.h" - +#include "iri.h" + /* Functions for maintaining the URL queue. */ struct queue_element { @@ -60,6 +61,7 @@ struct queue_element { int depth; /* the depth */ bool html_allowed; /* whether the document is allowed to be treated as HTML. */ + char *remote_encoding; bool css_allowed; /* whether the document is allowed to be treated as CSS. 
*/ struct queue_element *next; /* next element in queue */ @@ -98,6 +100,7 @@ url_enqueue (struct url_queue *queue, bool html_allowed, bool css_allowed) { struct queue_element *qel = xnew (struct queue_element); + char *charset = get_current_charset (); qel->url = url; qel->referer = referer; qel->depth = depth; @@ -105,6 +108,11 @@ url_enqueue (struct url_queue *queue, qel->css_allowed = css_allowed; qel->next = NULL; + if (charset) + qel->remote_encoding = xstrdup (charset); + else + qel->remote_encoding = NULL; + ++queue->count; if (queue->count > queue->maxcount) queue->maxcount = queue->count; @@ -112,6 +120,8 @@ url_enqueue (struct url_queue *queue, DEBUGP (("Enqueuing %s at depth %d\n", url, depth)); DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); + /*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/ + if (queue->tail) queue->tail->next = qel; queue->tail = qel; @@ -137,6 +147,10 @@ url_dequeue (struct url_queue *queue, if (!queue->head) queue->tail = NULL; + set_remote_charset (qel->remote_encoding); + if (qel->remote_encoding) + xfree (qel->remote_encoding); + *url = qel->url; *referer = qel->referer; *depth = qel->depth; @@ -192,8 +206,11 @@ retrieve_tree (const char *start_url) struct hash_table *blacklist; int up_error_code; - struct url *start_url_parsed = url_parse (start_url, &up_error_code); + struct url *start_url_parsed; + set_ugly_no_encode (true); + start_url_parsed= url_parse (start_url, &up_error_code); + set_ugly_no_encode (false); if (!start_url_parsed) { logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, @@ -357,7 +374,9 @@ retrieve_tree (const char *start_url) if (children) { struct urlpos *child = children; - struct url *url_parsed = url_parsed = url_parse (url, NULL); + set_ugly_no_encode (true); + struct url *url_parsed = url_parse (url, NULL); + set_ugly_no_encode (false); char *referer_url = url; bool strip_auth = (url_parsed != NULL && url_parsed->user != NULL); @@ -394,18 +413,18 @@ 
retrieve_tree (const char *start_url) } } - if (file - && (opt.delete_after + if (file + && (opt.delete_after || opt.spider /* opt.recursive is implicitely true */ || !acceptable (file))) { /* Either --delete-after was specified, or we loaded this - (otherwise unneeded because of --spider or rejected by -R) - HTML file just to harvest its hyperlinks -- in either case, + (otherwise unneeded because of --spider or rejected by -R) + HTML file just to harvest its hyperlinks -- in either case, delete the local file. */ DEBUGP (("Removing file due to %s in recursive_retrieve():\n", opt.delete_after ? "--delete-after" : - (opt.spider ? "--spider" : + (opt.spider ? "--spider" : "recursive rejection criteria"))); logprintf (LOG_VERBOSE, (opt.delete_after || opt.spider @@ -470,7 +489,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, if (string_set_contains (blacklist, url)) { - if (opt.spider) + if (opt.spider) { char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD); DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url))); @@ -662,11 +681,13 @@ descend_redirect_p (const char *redirected, const char *original, int depth, struct urlpos *upos; bool success; + set_ugly_no_encode (true); orig_parsed = url_parse (original, NULL); assert (orig_parsed != NULL); new_parsed = url_parse (redirected, NULL); assert (new_parsed != NULL); + set_ugly_no_encode (false); upos = xnew0 (struct urlpos); upos->url = new_parsed; diff --git a/src/retr.c b/src/retr.c index 58e00d2f..7a28ea32 100644 --- a/src/retr.c +++ b/src/retr.c @@ -51,6 +51,7 @@ as that of the covered work. */ #include "hash.h" #include "convert.h" #include "ptimer.h" +#include "iri.h" #include "html-url.h" /* Total size of downloaded files. Used to enforce quota. 
*/ @@ -625,6 +626,9 @@ retrieve_url (const char *origurl, char **file, char **newloc, if (file) *file = NULL; + reset_utf8_encode (); + + second_try: u = url_parse (url, &up_error_code); if (!u) { @@ -633,6 +637,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, return URLERROR; } + /*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/ + if (!refurl) refurl = opt.referer; @@ -646,8 +652,11 @@ retrieve_url (const char *origurl, char **file, char **newloc, proxy = getproxy (u); if (proxy) { + /* sXXXav : support IRI for proxy */ /* Parse the proxy URL. */ + set_ugly_no_encode (true); proxy_url = url_parse (proxy, &up_error_code); + set_ugly_no_encode (false); if (!proxy_url) { logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"), @@ -722,6 +731,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, xfree (mynewloc); mynewloc = construced_newloc; + reset_utf8_encode (); + /* Now, see if this new location makes sense. 
*/ newloc_parsed = url_parse (mynewloc, &up_error_code); if (!newloc_parsed) @@ -770,8 +781,21 @@ retrieve_url (const char *origurl, char **file, char **newloc, goto redirected; } - if (local_file) + /* Try to not encode in UTF-8 if fetching failed */ + if (!(*dt & RETROKF) && get_utf8_encode ()) { + set_utf8_encode (false); + /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/ + goto second_try; + } + + if (local_file && *dt & RETROKF) + { + register_download (u->url, local_file); + if (redirection_count && 0 != strcmp (origurl, u->url)) + register_redirection (origurl, u->url); + if (*dt & TEXTHTML) + register_html (u->url, local_file); if (*dt & RETROKF) { register_download (u->url, local_file); @@ -860,9 +884,9 @@ retrieve_from_file (const char *file, bool html, int *count) int old_follow_ftp = opt.follow_ftp; /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ - if (cur_url->url->scheme == SCHEME_FTP) + if (cur_url->url->scheme == SCHEME_FTP) opt.follow_ftp = 1; - + status = retrieve_tree (cur_url->url->url); opt.follow_ftp = old_follow_ftp; @@ -1039,7 +1063,10 @@ bool url_uses_proxy (const char *url) { bool ret; - struct url *u = url_parse (url, NULL); + struct url *u; + set_ugly_no_encode(true); + u= url_parse (url, NULL); + set_ugly_no_encode(false); if (!u) return false; ret = getproxy (u) != NULL; diff --git a/src/url.c b/src/url.c index f5d621f9..beaf0fb2 100644 --- a/src/url.c +++ b/src/url.c @@ -42,6 +42,7 @@ as that of the covered work. 
*/ #include "utils.h" #include "url.h" #include "host.h" /* for is_valid_ipv6_address */ +#include "iri.h" #ifdef TESTING #include "test.h" @@ -670,6 +671,17 @@ url_parse (const char *url, int *error) goto error; } + if (opt.enable_iri && get_utf8_encode ()) + { + const char *new; + bool utf8_encode; + url_unescape ((char *) url); + utf8_encode = remote_to_utf8 (url, &new); + set_utf8_encode (utf8_encode); + if (utf8_encode) + url = new; + } + url_encoded = reencode_escapes (url); p = url_encoded; @@ -844,6 +856,17 @@ url_parse (const char *url, int *error) host_modified = true; } + if (opt.enable_iri) + { + char *new = idn_encode (u->host, get_utf8_encode ()); + if (new) + { + xfree (u->host); + u->host = new; + host_modified = true; + } + } + if (params_b) u->params = strdupdelim (params_b, params_e); if (query_b) @@ -851,7 +874,7 @@ url_parse (const char *url, int *error) if (fragment_b) u->fragment = strdupdelim (fragment_b, fragment_e); - if (path_modified || u->fragment || host_modified || path_b == path_e) + if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e) { /* If we suspect that a transformation has rendered what url_string might return different from URL_ENCODED, rebuild