From: Saint Xavier Date: Wed, 23 Jul 2008 22:58:10 +0000 (+0200) Subject: Automated merge. X-Git-Tag: v1.13~338^2~7^2~6^2~13 X-Git-Url: http://sjero.net/git/?p=wget;a=commitdiff_plain;h=ccd62071dcbdfc0269813746b9f51ff9c23261db;hp=e4371807f6eb9ee6d0cc7828773e792fbecb5ce5 Automated merge. --- diff --git a/ChangeLog b/ChangeLog index d96ce355..21d380b2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -4,6 +4,14 @@ * AUTHORS: Added Steven Schubiger. +2008-06-26 Xavier Saint + + * configure.ac : IRIs support required libiconv, check it. + +2008-06-14 Xavier Saint + + * configure.ac: Add support for IRIs + 2008-05-29 Micah Cowan * po/*.po: Updated from TP (the 1.11.3 set). diff --git a/configure.ac b/configure.ac index 2ccc703d..fb0c65d1 100644 --- a/configure.ac +++ b/configure.ac @@ -460,6 +460,77 @@ else fi AC_SUBST(COMMENT_IF_NO_POD2MAN) + +dnl +dnl Check for IDN/IRIs +dnl + +AC_ARG_ENABLE(iri, + AC_HELP_STRING([--disable-iri],[disable IDN/IRIs support]), + [case "${enable_iri}" in + no) + dnl Disable IRIs checking + AC_MSG_NOTICE([disabling IRIs at user request]) + iri=no + ;; + yes) + dnl IRIs explicitly enabled + iri=yes + force_iri=yes + ;; + auto) + dnl Auto-detect IRI + iri=yes + ;; + *) + AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri']) + ;; + esac + ], [ + dnl If nothing is specified, assume auto-detection + iri=yes + ] +) + +AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]], + [Support IDN/IRIs (needs GNU Libidn)]), + libidn=$withval, libidn="") +if test "X$iri" != "Xno"; then + AM_ICONV + + if test "X$am_cv_func_iconv" != "Xyes"; then + iri=no + if test "X$force_iri" = "Xyes"; then + AC_MSG_ERROR([Libiconv is required for IRIs support]) + else + AC_MSG_NOTICE([disabling IRIs because libiconv wasn't found]) + fi + fi +fi + +if test "X$iri" != "Xno"; then + if test "$libidn" != ""; then + LDFLAGS="${LDFLAGS} -L$libidn/lib" + CPPFLAGS="${CPPFLAGS} -I$libidn/include" + fi + AC_CHECK_HEADER(idna.h, + AC_CHECK_LIB(idn, stringprep_check_version, + [iri=yes LIBS="${LIBS} -lidn"], iri=no), + iri=no) + + if test "X$iri" != "Xno" ; then + AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.]) + AC_MSG_NOTICE([Enabling support for IRI.]) + else + AC_MSG_WARN([Libidn not found]) + fi +fi + + +dnl Needed by src/Makefile.am +AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"]) + + dnl dnl Create output dnl diff --git a/src/ChangeLog b/src/ChangeLog index 3e08d0d8..fd86c51c 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -7,11 +7,27 @@ * init.c (cleanup): Free the memory associated with the base option (when DEBUG_MALLOC is defined). +2008-07-02 Xavier Saint + + * iri.c, iri.h : New function idn_decode() to decode ASCII + encoded hostname to the locale. + + * host.c : Show hostname to be resolved both in locale and + ASCII encoded. + 2008-06-28 Steven Schubiger * retr.c (retrieve_from_file): Allow for reading the links from an external file (HTTP/FTP). +2008-06-26 Xavier Saint + + * iri.c, iri.h : New functions locale_to_utf8() and + idn_encode() adding basic capabilities of IRI/IDN. + + * url.c : Convert URLs from locale to UTF-8 allowing a basic + support of IRI/IDN + 2008-06-25 Steven Schubiger * ftp.c (getftp): When spidering a FTP URL, emit a diagnostic @@ -36,7 +52,7 @@ * http.c: Make -nv --spider include the file's name when it exists. - + 2008-06-22 Micah Cowan * Makefile.am (version.c): Fixed version string invocation so it @@ -44,12 +60,57 @@ string vars pointers-to-const, and moved line lengths below 80 (in Makefile.am, not in version.c). +2008-06-19 Xavier Saint + + * iri.c, iri.h : New function check_encoding_name() as + a preliminary encoding name check. + + * main.c, iri.c : Make use of check_encoding_name(). + +2008-06-19 Xavier Saint + + * iri.c : Include missing stringprep.h file and add a + cast. + + * init.c : set a default initial value for opt.enable_iri, + opt.locale and opt.encoding_remote. + +2008-06-19 Xavier Saint + + * iri.c, iri.h : Add a new function find_locale() to find + out the local system encoding. + + * main.c : Make use of find_locale(). + +2008-06-19 Xavier Saint + + * html-url.c : Add "content-type" meta tag parsing for + retrieving page encoding. + + * iri.h : Make no-op version of parse_charset() return + NULL. + 2008-06-16 Micah Cowan * http.c (http_loop): When hstat.len is higher than the successfully completed content's length, but it's because we _set_ it that way, don't abort. +2008-06-14 Xavier Saint + + * iri.c, iri.h : New files. + + * Makefile.am : Add files iri.h and conditional iri.c. + + * build_info.c : Add compiled feature "iri". + + * http.c : include iri.h and parse charset from Content-Type + header. + + * init.c, main.c, options.h : if an options isn't supported + at compiled time, don't get rid off it and show a dummy + message instead if they are used. + 2008-06-13 Micah Cowan * build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL @@ -93,11 +154,11 @@ default. 2008-05-17 Kenny Parnell - + (cmd_spec_prefer_family): Initialize prefer_family to prefer_none. 2008-05-17 Micah Cowan - + * main.c (main): Handle Ctrl-D on command-line. 2008-05-15 Steven Schubiger @@ -136,7 +197,7 @@ * options.h: Add an according boolean member to the options struct. - + * sysdep.h: Comment the defines __EXTENSIONS__ and _GNU_SOURCE out, because they're now defined independently by config.h. diff --git a/src/Makefile.am b/src/Makefile.am index 6db4ac17..edbb592e 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -30,6 +30,10 @@ # Version: @VERSION@ # +if IRI_IS_ENABLED +IRI_OBJ = iri.c +endif + # The following line is losing on some versions of make! DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\" LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@ @@ -40,8 +44,8 @@ wget_SOURCES = build_info.c cmpt.c connect.c convert.c cookies.c ftp.c \ ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \ http.c init.c log.c main.c netrc.c progress.c ptimer.c \ recur.c res.c retr.c snprintf.c spider.c url.c \ - utils.c \ - css-url.h connect.h convert.h cookies.h \ + utils.c $(IRI_OBJ) \ + css-url.h connect.h convert.h cookies.h \ ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \ http.h http-ntlm.h init.h log.h mswindows.h netrc.h \ options.h progress.h ptimer.h recur.h res.h retr.h \ diff --git a/src/build_info.c b/src/build_info.c index e330b653..00d5122d 100644 --- a/src/build_info.c +++ b/src/build_info.c @@ -100,6 +100,13 @@ const char* (compiled_features[]) = #else "-gettext", #endif + +#ifdef ENABLE_IRI + "+iri", +#else + "-iri", +#endif + /* sentinel value */ NULL }; diff --git a/src/connect.c b/src/connect.c index 1e8f07e5..6cfdb4b7 100644 --- a/src/connect.c +++ b/src/connect.c @@ -58,6 +58,7 @@ as that of the covered work. */ #include "host.h" #include "connect.h" #include "hash.h" +#include "iri.h" /* Define sockaddr_storage where unavailable (presumably on IPv4-only hosts). */ @@ -266,9 +267,25 @@ connect_to_ip (const ip_address *ip, int port, const char *print) if (print) { const char *txt_addr = print_address (ip); - if (print && 0 != strcmp (print, txt_addr)) - logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "), - escnonprint_uri (print), txt_addr, port); + if (0 != strcmp (print, txt_addr)) + { + char *str = NULL, *name; + + if (opt.enable_iri && (name = idn_decode ((char *) print)) != NULL) + { + int len = strlen (print) + strlen (name) + 4; + str = xmalloc (len); + snprintf (str, len, "%s (%s)", name, print); + str[len-1] = '\0'; + xfree (name); + } + + logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "), + str ? str : escnonprint_uri (print), txt_addr, port); + + if (str) + xfree (str); + } else logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port); } diff --git a/src/convert.c b/src/convert.c index e72a4b0f..54004ad0 100644 --- a/src/convert.c +++ b/src/convert.c @@ -96,7 +96,7 @@ convert_links_in_hashtable (struct hash_table *downloaded_set, /* Parse the file... */ urls = is_css ? get_urls_css_file (file, url) : - get_urls_html (file, url, NULL); + get_urls_html (file, url, NULL, NULL); /* We don't respect meta_disallow_follow here because, even if the file is not followed, we might still want to convert the diff --git a/src/ftp-basic.c b/src/ftp-basic.c index 265a1e25..5f250959 100644 --- a/src/ftp-basic.c +++ b/src/ftp-basic.c @@ -68,7 +68,7 @@ ftp_response (int fd, char **ret_line) return FTPRERR; /* Strip trailing CRLF before printing the line, so that - escnonprint doesn't include bogus \012 and \015. */ + quotting doesn't include bogus \012 and \015. */ p = strchr (line, '\0'); if (p > line && p[-1] == '\n') *--p = '\0'; diff --git a/src/host.c b/src/host.c index fdb35b1c..1226a274 100644 --- a/src/host.c +++ b/src/host.c @@ -53,6 +53,7 @@ as that of the covered work. */ #include "host.h" #include "url.h" #include "hash.h" +#include "iri.h" #ifndef NO_ADDRESS # define NO_ADDRESS NO_DATA @@ -712,8 +713,24 @@ lookup_host (const char *host, int flags) /* No luck with the cache; resolve HOST. */ if (!silent && !numeric_address) - logprintf (LOG_VERBOSE, _("Resolving %s... "), - quotearg_style (escape_quoting_style, host)); + { + char *str = NULL, *name; + + if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL) + { + int len = strlen (host) + strlen (name) + 4; + str = xmalloc (len); + snprintf (str, len, "%s (%s)", name, host); + str[len-1] = '\0'; + xfree (name); + } + + logprintf (LOG_VERBOSE, _("Resolving %s... "), + quotearg_style (escape_quoting_style, str ? str : host)); + + if (str) + xfree (str); + } #ifdef ENABLE_IPV6 { diff --git a/src/html-url.c b/src/html-url.c index 75bec7d9..6e886083 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -174,6 +174,10 @@ static const char *additional_attributes[] = { static struct hash_table *interesting_tags; static struct hash_table *interesting_attributes; +/* Will contains the (last) charset found in 'http-equiv=content-type' + meta tags */ +static char *meta_charset; + static void init_interesting (void) { @@ -284,7 +288,7 @@ append_url (const char *link_uri, int position, int size, return NULL; } - url = url_parse (link_uri, NULL); + url = url_parse (link_uri, NULL, NULL); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", @@ -303,7 +307,7 @@ append_url (const char *link_uri, int position, int size, DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", ctx->document_file, base, link_uri, complete_uri)); - url = url_parse (complete_uri, NULL); + url = url_parse (complete_uri, NULL, NULL); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", @@ -553,6 +557,24 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) entry->link_expect_html = 1; } } + else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type")) + { + /* Handle stuff like: + */ + + char *mcharset; + char *content = find_attr (tag, "content", NULL); + if (!content) + return; + + mcharset = parse_charset (content); + if (!mcharset) + return; + + /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/ + xfree_null (meta_charset); + meta_charset = mcharset; + } else if (name && 0 == strcasecmp (name, "robots")) { /* Handle stuff like: @@ -617,7 +639,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg) and does the right thing. */ struct urlpos * -get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) +get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, + struct iri *iri) { struct file_memory *fm; struct map_context ctx; @@ -657,6 +680,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, NULL, interesting_attributes); + /* If meta charset isn't null, override content encoding */ + if (iri && meta_charset) + set_content_encoding (iri, meta_charset); + DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); if (meta_disallow_follow) *meta_disallow_follow = ctx.nofollow; @@ -726,7 +753,7 @@ get_urls_file (const char *file) url_text = merged; } - url = url_parse (url_text, &up_error_code); + url = url_parse (url_text, &up_error_code, NULL); if (!url) { logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), diff --git a/src/html-url.h b/src/html-url.h index a94f0db6..2e9ec820 100644 --- a/src/html-url.h +++ b/src/html-url.h @@ -44,7 +44,7 @@ struct map_context { }; struct urlpos *get_urls_file (const char *); -struct urlpos *get_urls_html (const char *, const char *, bool *); +struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *); struct urlpos *append_url (const char *, int, int, struct map_context *); void free_urlpos (struct urlpos *); diff --git a/src/http.c b/src/http.c index 52f65fed..589e18ee 100644 --- a/src/http.c +++ b/src/http.c @@ -1364,7 +1364,8 @@ free_hstat (struct http_stat *hs) If PROXY is non-NULL, the connection will be made to the proxy server, and u->url will be requested. */ static uerr_t -gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy) +gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, + struct iri *iri) { struct request *req; @@ -1827,7 +1828,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy) hs->local_file = url_file_name (u); } } - + /* TODO: perform this check only once. */ if (!hs->existence_checked && file_exists_p (hs->local_file)) { @@ -1896,7 +1897,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); local_dot_orig_file_exists = true; local_filename = filename_plus_orig_suffix; } - } + } if (!local_dot_orig_file_exists) /* Couldn't stat() .orig, so try to stat() . */ @@ -2048,9 +2049,20 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); char *tmp = strchr (type, ';'); if (tmp) { + /* sXXXav: only needed if IRI support is enabled */ + char *tmp2 = tmp + 1; + while (tmp > type && c_isspace (tmp[-1])) --tmp; *tmp = '\0'; + + /* Try to get remote encoding if needed */ + if (opt.enable_iri && !opt.encoding_remote) + { + tmp = parse_charset (tmp2); + if (tmp) + set_content_encoding (iri, tmp); + } } } hs->newloc = resp_header_strdup (resp, "Location"); @@ -2325,7 +2337,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); retried, and retried, and retried, and... */ uerr_t http_loop (struct url *u, char **newloc, char **local_file, const char *referer, - int *dt, struct url *proxy) + int *dt, struct url *proxy, struct iri *iri) { int count; bool got_head = false; /* used for time-stamping and filename detection */ @@ -2336,16 +2348,16 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer, uerr_t err, ret = TRYLIMEXC; time_t tmr = -1; /* remote time-stamp */ struct http_stat hstat; /* HTTP status */ - struct_stat st; + struct_stat st; bool send_head_first = true; /* Assert that no value for *LOCAL_FILE was passed. */ assert (local_file == NULL || *local_file == NULL); - + /* Set LOCAL_FILE parameter. */ if (local_file && opt.output_document) *local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document); - + /* Reset NEWLOC parameter. */ *newloc = NULL; @@ -2382,7 +2394,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer, retrieve the file. But if the output_document was given, then this test was already done and the file didn't exist. Hence the !opt.output_document */ logprintf (LOG_VERBOSE, _("\ -File %s already there; not retrieving.\n\n"), +File %s already there; not retrieving.\n\n"), quote (hstat.local_file)); /* If the file is there, we suppose it's retrieved OK. */ *dt |= RETROKF; @@ -2398,10 +2410,10 @@ File %s already there; not retrieving.\n\n"), /* Reset the counter. */ count = 0; - + /* Reset the document type. */ *dt = 0; - + /* Skip preliminary HEAD request if we're not in spider mode AND * if -O was given or HTTP Content-Disposition support is disabled. */ if (!opt.spider @@ -2410,21 +2422,21 @@ File %s already there; not retrieving.\n\n"), /* Send preliminary HEAD request if -N is given and we have an existing * destination file. */ - if (opt.timestamping + if (opt.timestamping && !opt.content_disposition && file_exists_p (url_file_name (u))) send_head_first = true; - + /* THE loop */ do { /* Increment the pass counter. */ ++count; sleep_between_retrievals (count); - + /* Get the current time string. */ tms = datetime_str (time (NULL)); - + if (opt.spider && !got_head) logprintf (LOG_VERBOSE, _("\ Spider mode enabled. Check if remote file exists.\n")); @@ -2433,20 +2445,20 @@ Spider mode enabled. Check if remote file exists.\n")); if (opt.verbose) { char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD); - - if (count > 1) + + if (count > 1) { char tmp[256]; sprintf (tmp, _("(try:%2d)"), count); logprintf (LOG_NOTQUIET, "--%s-- %s %s\n", tms, tmp, hurl); } - else + else { logprintf (LOG_NOTQUIET, "--%s-- %s\n", tms, hurl); } - + #ifdef WINDOWS ws_changetitle (hurl); #endif @@ -2456,7 +2468,7 @@ Spider mode enabled. Check if remote file exists.\n")); /* Default document type is empty. However, if spider mode is on or time-stamping is employed, HEAD_ONLY commands is encoded within *dt. */ - if (send_head_first && !got_head) + if (send_head_first && !got_head) *dt |= HEAD_ONLY; else *dt &= ~HEAD_ONLY; @@ -2489,11 +2501,11 @@ Spider mode enabled. Check if remote file exists.\n")); *dt &= ~SEND_NOCACHE; /* Try fetching the document, or at least its head. */ - err = gethttp (u, &hstat, dt, proxy); + err = gethttp (u, &hstat, dt, proxy, iri); /* Time? */ tms = datetime_str (time (NULL)); - + /* Get the new location (with or without the redirection). */ if (hstat.newloc) *newloc = xstrdup (hstat.newloc); @@ -2532,7 +2544,7 @@ Spider mode enabled. Check if remote file exists.\n")); hstat.statcode); ret = WRONGCODE; } - else + else { ret = NEWLOCATION; } @@ -2548,7 +2560,7 @@ Spider mode enabled. Check if remote file exists.\n")); /* All possibilities should have been exhausted. */ abort (); } - + if (!(*dt & RETROKF)) { char *hurl = NULL; @@ -2567,11 +2579,13 @@ Spider mode enabled. Check if remote file exists.\n")); continue; } /* Maybe we should always keep track of broken links, not just in - * spider mode. */ - else if (opt.spider) + * spider mode. + * Don't log error if it was UTF-8 encoded because we will try + * once unencoded. */ + else if (opt.spider && !iri->utf8_encode) { /* #### Again: ugly ugly ugly! */ - if (!hurl) + if (!hurl) hurl = url_string (u, URL_AUTH_HIDE_PASSWD); nonexisting_url (hurl); logprintf (LOG_NOTQUIET, _("\ @@ -2580,7 +2594,7 @@ Remote file does not exist -- broken link!!!\n")); else { logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), - tms, hstat.statcode, + tms, hstat.statcode, quotearg_style (escape_quoting_style, hstat.error)); } logputs (LOG_VERBOSE, "\n"); diff --git a/src/http.h b/src/http.h index e0e66cea..4769e9d3 100644 --- a/src/http.h +++ b/src/http.h @@ -33,7 +33,7 @@ as that of the covered work. */ struct url; uerr_t http_loop (struct url *, char **, char **, const char *, int *, - struct url *); + struct url *, struct iri *); void save_cookies (void); void http_cleanup (void); time_t http_atotm (const char *); diff --git a/src/init.c b/src/init.c index d4fc10e3..d01a1c80 100644 --- a/src/init.c +++ b/src/init.c @@ -181,9 +181,11 @@ static const struct { { "inet6only", &opt.ipv6_only, cmd_boolean }, #endif { "input", &opt.input_filename, cmd_file }, + { "iri", &opt.enable_iri, cmd_boolean }, { "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean }, { "limitrate", &opt.limit_rate, cmd_bytes }, { "loadcookies", &opt.cookies_input, cmd_file }, + { "locale", &opt.locale, cmd_string }, { "logfile", &opt.lfilename, cmd_file }, { "login", &opt.ftp_user, cmd_string },/* deprecated*/ { "maxredirect", &opt.max_redirect, cmd_number }, @@ -223,6 +225,7 @@ static const struct { { "referer", &opt.referer, cmd_string }, { "reject", &opt.rejects, cmd_vector }, { "relativeonly", &opt.relative_only, cmd_boolean }, + { "remoteencoding", &opt.encoding_remote, cmd_string }, { "removelisting", &opt.remove_listing, cmd_boolean }, { "restrictfilenames", NULL, cmd_spec_restrict_file_names }, { "retrsymlinks", &opt.retr_symlinks, cmd_boolean }, @@ -330,6 +333,14 @@ defaults (void) opt.restrict_files_case = restrict_no_case_restriction; opt.max_redirect = 20; + +#ifdef ENABLE_IRI + opt.enable_iri = true; +#else + opt.enable_iri = false; +#endif + opt.locale = NULL; + opt.encoding_remote = NULL; } /* Return the user's home directory (strdup-ed), or NULL if none is diff --git a/src/iri.c b/src/iri.c new file mode 100644 index 00000000..783aa331 --- /dev/null +++ b/src/iri.c @@ -0,0 +1,362 @@ +/* IRI related functions. + Copyright (C) 2008 Free Software Foundation, Inc. + +This file is part of GNU Wget. + +GNU Wget is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at +your option) any later version. + +GNU Wget is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Wget. If not, see . + +Additional permission under GNU GPL version 3 section 7 + +If you modify this program, or any covered work, by linking or +combining it with the OpenSSL project's OpenSSL library (or a +modified version of that library), containing parts covered by the +terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +grants you additional permission to convey the resulting work. +Corresponding Source for a non-source form of such a combination +shall include the source code for the parts of OpenSSL used as well +as that of the covered work. */ + +#include "wget.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "iri.h" + +/* RFC3987 section 3.1 mandates STD3 ASCII RULES */ +#define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES + +/* Note: locale encoding is kept in options struct (opt.locale) */ + +static iconv_t locale2utf8; + +static bool open_locale_to_utf8 (void); +static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out); + + +/* Given a string containing "charset=XXX", return the encoding if found, + or NULL otherwise */ +char * +parse_charset (char *str) +{ + char *charset; + + if (!str || !*str) + return NULL; + + str = strcasestr (str, "charset="); + if (!str) + return NULL; + + str += 8; + charset = str; + + /* sXXXav: which chars should be banned ??? */ + while (*charset && !c_isspace (*charset)) + charset++; + + /* sXXXav: could strdupdelim return NULL ? */ + charset = strdupdelim (str, charset); + + /* Do a minimum check on the charset value */ + if (!check_encoding_name (charset)) + { + xfree (charset); + return NULL; + } + + /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/ + + return charset; +} + +/* Find the locale used, or fall back on a default value */ +char * +find_locale (void) +{ + return (char *) stringprep_locale_charset (); +} + +/* Basic check of an encoding name. */ +bool +check_encoding_name (char *encoding) +{ + char *s = encoding; + + while (*s) + { + if (!c_isascii (*s) || c_isspace (*s)) + { + logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding)); + return false; + } + + s++; + } + + return true; +} + +/* Try opening an iconv_t descriptor for conversion from locale to UTF-8 */ +static bool +open_locale_to_utf8 (void) +{ + if (locale2utf8) + return true; + + /* sXXXav : That shouldn't happen, just in case */ + if (!opt.locale) + { + logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n"); + opt.locale = find_locale (); + } + + if (!opt.locale) + return false; + + locale2utf8 = iconv_open ("UTF-8", opt.locale); + if (locale2utf8 != (iconv_t)(-1)) + return true; + + logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n", + quote (opt.locale), quote ("UTF-8")); + locale2utf8 = NULL; + return false; +} + +/* Try converting string str from locale to UTF-8. Return a new string + on success, or str on error or if conversion isn't needed. */ +const char * +locale_to_utf8 (const char *str) +{ + char *new; + + if (!strcasecmp (opt.locale, "utf-8")) + return str; + + if (!open_locale_to_utf8 ()) + return str; + + if (do_conversion (locale2utf8, (char *) str, strlen ((char *) str), &new)) + return (const char *) new; + + return str; +} + +/* Do the conversion according to the passed conversion descriptor cd. *out + will containes the transcoded string on success. *out content is + unspecified otherwise. */ +static bool +do_conversion (iconv_t cd, char *in, size_t inlen, char **out) +{ + /* sXXXav : hummm hard to guess... */ + size_t len, done, outlen = inlen * 2; + int invalid = 0, tooshort = 0; + char *s; + + s = xmalloc (outlen + 1); + *out = s; + len = outlen; + done = 0; + + for (;;) + { + if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1)) + { + *out = s; + *(s + len - outlen - done) = '\0'; + return true; + } + + /* Incomplete or invalid multibyte sequence */ + if (errno == EINVAL || errno == EILSEQ) + { + if (!invalid) + logprintf (LOG_VERBOSE, + "Incomplete or invalide multibyte sequence encountered\n"); + + invalid++; + **out = *in; + in++; + inlen--; + (*out)++; + outlen--; + } + else if (errno == E2BIG) /* Output buffer full */ + { + char *new; + + tooshort++; + done = len; + outlen = done + inlen * 2; + new = xmalloc (outlen + 1); + memcpy (new, s, done); + xfree (s); + s = new; + len = outlen; + *out = s + done; + } + else /* Weird, we got an unspecified error */ + { + logprintf (LOG_VERBOSE, "Unhandled errno %d\n", errno); + break; + } + } + + return false; +} + +/* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL + on error. */ +char * +idn_encode (struct iri *i, char *host) +{ + char *new; + int ret; + + /* Encode to UTF-8 if not done */ + if (!i->utf8_encode) + { + if (!remote_to_utf8 (i, (const char *) host, (const char **) &new)) + { + /* Nothing to encode or an error occured */ + return NULL; + } + + host = new; + } + + /* toASCII UTF-8 NULL terminated string */ + ret = idna_to_ascii_8z (host, &new, IDNA_FLAGS); + if (ret != IDNA_SUCCESS) + { + /* sXXXav : free new when needed ! */ + logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret, + quote (idna_strerror (ret))); + return NULL; + } + + return new; +} + +/* Try to decode an "ASCII encoded" host. Return the new domain in the locale + on success or NULL on error. */ +char * +idn_decode (char *host) +{ + char *new; + int ret; + + ret = idna_to_unicode_8zlz (host, &new, IDNA_FLAGS); + if (ret != IDNA_SUCCESS) + { + logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", ret, + quote (idna_strerror (ret))); + return NULL; + } + + return new; +} + +/* Try to transcode string str from remote encoding to UTF-8. On success, *new + contains the transcoded string. *new content is unspecified otherwise. */ +bool +remote_to_utf8 (struct iri *i, const char *str, const char **new) +{ + char *r; + iconv_t cd; + bool ret = false; + + if (opt.encoding_remote) + r = opt.encoding_remote; + else if (i->uri_encoding) + r = i->uri_encoding; + else + return false; + + cd = iconv_open ("UTF-8", r); + if (cd == (iconv_t)(-1)) + return false; + + if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new)) + ret = true; + + iconv_close (cd); + + /* Test if something was converted */ + if (!strcmp (str, *new)) + { + xfree ((char *) *new); + return false; + } + + return ret; +} + +struct iri * +iri_new (void) +{ + struct iri *i = xmalloc (sizeof (struct iri)); + i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL; + i->content_encoding = NULL; + i->utf8_encode = opt.enable_iri; +} + +void +iri_free (struct iri *i) +{ + xfree_null (i->uri_encoding); + xfree_null (i->content_encoding); + xfree (i); +} + +void +set_uri_encoding (struct iri *i, char *charset) +{ + logprintf (LOG_VERBOSE, "[ uri = `%s'\n", charset); + if (opt.encoding_remote) + return; + if (i->uri_encoding) + { + if (!strcasecmp (i->uri_encoding, charset)) + return; + xfree (i->uri_encoding); + } + + i->uri_encoding = charset ? xstrdup (charset) : NULL; +} + +void +set_content_encoding (struct iri *i, char *charset) +{ + logprintf (LOG_VERBOSE, "[ content = `%s'\n", charset); + if (opt.encoding_remote) + return; + if (i->content_encoding) + { + if (!strcasecmp (i->content_encoding, charset)) + return; + xfree (i->content_encoding); + } + + i->content_encoding = charset ? xstrdup (charset) : NULL; +} + diff --git a/src/iri.h b/src/iri.h new file mode 100644 index 00000000..173d0656 --- /dev/null +++ b/src/iri.h @@ -0,0 +1,70 @@ +/* Internationalization related declarations. + Copyright (C) 2008 Free Software Foundation, Inc. + +This file is part of GNU Wget. + +GNU Wget is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +GNU Wget is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Wget. If not, see . + +Additional permission under GNU GPL version 3 section 7 + +If you modify this program, or any covered work, by linking or +combining it with the OpenSSL project's OpenSSL library (or a +modified version of that library), containing parts covered by the +terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +grants you additional permission to convey the resulting work. +Corresponding Source for a non-source form of such a combination +shall include the source code for the parts of OpenSSL used as well +as that of the covered work. */ + +#ifndef IRI_H +#define IRI_H + +struct iri { + char *uri_encoding; /* Encoding of the uri to fetch */ + char *content_encoding; /* Encoding of links inside the fetched file */ + bool utf8_encode; /* Will/Is the current url encoded in utf8 */ +}; + +#ifdef ENABLE_IRI + +char *parse_charset (char *str); +char *find_locale (void); +bool check_encoding_name (char *encoding); +const char *locale_to_utf8 (const char *str); +char *idn_encode (struct iri *i, char *host); +char *idn_decode (char *host); +bool remote_to_utf8 (struct iri *i, const char *str, const char **new); +struct iri *iri_new (void); +void iri_free (struct iri *i); +void set_uri_encoding (struct iri *i, char *charset); +void set_content_encoding (struct iri *i, char *charset); + +#else /* ENABLE_IRI */ + +struct iri dummy_iri; + +#define parse_charset(str) NULL +#define find_locale() NULL +#define check_encoding_name(str) false +#define locale_to_utf8(str) (str) +#define idn_encode(a,b,c) NULL +#define idn_decode(str) NULL +#define remote_to_utf8(a,b,c) false +#define iri_new() (&dummy_iri) +#define iri_free(a) +#define set_uri_encoding(a,b) +#define set_content_encoding(a,b) + +#endif /* ENABLE_IRI */ +#endif /* IRI_H */ diff --git a/src/log.c b/src/log.c index e84e5c61..b62bf9dd 100644 --- a/src/log.c +++ b/src/log.c @@ -43,7 +43,7 @@ as that of the covered work. */ #include "utils.h" #include "log.h" -/* This file impplement support for "logging". Logging means printing +/* This file implement support for "logging". Logging means printing output, plus several additional features: - Cataloguing output by importance. You can specify that a log diff --git a/src/main.c b/src/main.c index 70387c9c..8cee194c 100644 --- a/src/main.c +++ b/src/main.c @@ -43,6 +43,9 @@ as that of the covered work. */ #include #include #include +#ifdef ENABLE_IRI +#include +#endif #include "utils.h" #include "init.h" @@ -200,10 +203,12 @@ static struct cmdline_option option_data[] = { "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 }, #endif { "input-file", 'i', OPT_VALUE, "input", -1 }, + { "iri", 0, OPT_BOOLEAN, "iri", -1 }, { "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 }, { "level", 'l', OPT_VALUE, "reclevel", -1 }, { "limit-rate", 0, OPT_VALUE, "limitrate", -1 }, { "load-cookies", 0, OPT_VALUE, "loadcookies", -1 }, + { "locale", 0, OPT_VALUE, "locale", -1 }, { "max-redirect", 0, OPT_VALUE, "maxredirect", -1 }, { "mirror", 'm', OPT_BOOLEAN, "mirror", -1 }, { "no", 'n', OPT__NO, NULL, required_argument }, @@ -237,6 +242,7 @@ static struct cmdline_option option_data[] = { "referer", 0, OPT_VALUE, "referer", -1 }, { "reject", 'R', OPT_VALUE, "reject", -1 }, { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 }, + { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1}, { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 }, { "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 }, { "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 }, @@ -1058,6 +1064,29 @@ for details.\n\n")); exit (1); } +#ifdef ENABLE_IRI + if (opt.enable_iri) + { + if (opt.locale && !check_encoding_name (opt.locale)) + opt.locale = NULL; + + if (!opt.locale) + opt.locale = find_locale (); + + if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote)) + opt.encoding_remote = NULL; + + /*logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));*/ + } +#else + if (opt.enable_iri || opt.locale || opt.encoding_remote) + { + /* sXXXav : be more specific... */ + printf(_("This version does not have support for IRIs\n")); + exit(1); + } +#endif + if (opt.ask_passwd) { opt.passwd = prompt_for_password (); @@ -1167,15 +1196,21 @@ WARNING: Can't reopen standard output in binary mode;\n\ int old_follow_ftp = opt.follow_ftp; /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ - if (url_scheme (*t) == SCHEME_FTP) + if (url_scheme (*t) == SCHEME_FTP) opt.follow_ftp = 1; - + status = retrieve_tree (*t); opt.follow_ftp = old_follow_ftp; } else - status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive); + { + struct iri *i = iri_new (); + set_uri_encoding (i, opt.locale); + status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, + opt.recursive, i); + iri_free (i); + } if (opt.delete_after && file_exists_p(filename)) { diff --git a/src/options.h b/src/options.h index 6a6badb0..723f80a1 100644 --- a/src/options.h +++ b/src/options.h @@ -237,6 +237,10 @@ struct options bool content_disposition; /* Honor HTTP Content-Disposition header. */ bool auth_without_challenge; /* Issue Basic authentication creds without waiting for a challenge. */ + + bool enable_iri; + char *encoding_remote; + char *locale; }; extern struct options opt; diff --git a/src/recur.c b/src/recur.c index 729a14e9..e2f58d1c 100644 --- a/src/recur.c +++ b/src/recur.c @@ -51,7 +51,8 @@ as that of the covered work. */ #include "html-url.h" #include "css-url.h" #include "spider.h" - +#include "iri.h" + /* Functions for maintaining the URL queue. */ struct queue_element { @@ -60,6 +61,7 @@ struct queue_element { int depth; /* the depth */ bool html_allowed; /* whether the document is allowed to be treated as HTML. */ + struct iri *iri; /* sXXXav */ bool css_allowed; /* whether the document is allowed to be treated as CSS. */ struct queue_element *next; /* next element in queue */ @@ -93,11 +95,12 @@ url_queue_delete (struct url_queue *queue) into it. */ static void -url_enqueue (struct url_queue *queue, +url_enqueue (struct url_queue *queue, struct iri *i, const char *url, const char *referer, int depth, bool html_allowed, bool css_allowed) { struct queue_element *qel = xnew (struct queue_element); + qel->iri = i; qel->url = url; qel->referer = referer; qel->depth = depth; @@ -112,6 +115,9 @@ url_enqueue (struct url_queue *queue, DEBUGP (("Enqueuing %s at depth %d\n", url, depth)); DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); + if (i) + printf ("[Enqueuing %s with %s\n", url, i->uri_encoding); + if (queue->tail) queue->tail->next = qel; queue->tail = qel; @@ -124,7 +130,7 @@ url_enqueue (struct url_queue *queue, succeeded, or false if the queue is empty. */ static bool -url_dequeue (struct url_queue *queue, +url_dequeue (struct url_queue *queue, struct iri **i, const char **url, const char **referer, int *depth, bool *html_allowed, bool *css_allowed) { @@ -137,6 +143,7 @@ url_dequeue (struct url_queue *queue, if (!queue->head) queue->tail = NULL; + *i = qel->iri; *url = qel->url; *referer = qel->referer; *depth = qel->depth; @@ -153,9 +160,9 @@ url_dequeue (struct url_queue *queue, } static bool download_child_p (const struct urlpos *, struct url *, int, - struct url *, struct hash_table *); + struct url *, struct hash_table *, struct iri *); static bool descend_redirect_p (const char *, const char *, int, - struct url *, struct hash_table *); + struct url *, struct hash_table *, struct iri *); /* Retrieve a part of the web beginning with START_URL. This used to @@ -192,8 +199,11 @@ retrieve_tree (const char *start_url) struct hash_table *blacklist; int up_error_code; - struct url *start_url_parsed = url_parse (start_url, &up_error_code); + struct url *start_url_parsed; + struct iri *i = iri_new (); + set_uri_encoding (i, opt.locale); + start_url_parsed = url_parse (start_url, &up_error_code, i); if (!start_url_parsed) { logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, @@ -206,7 +216,8 @@ retrieve_tree (const char *start_url) /* Enqueue the starting URL. Use start_url_parsed->url rather than just URL so we enqueue the canonical form of the URL. */ - url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false); + url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true, + false); string_set_add (blacklist, start_url_parsed->url); while (1) @@ -225,7 +236,7 @@ retrieve_tree (const char *start_url) /* Get the next URL from the queue... */ - if (!url_dequeue (queue, + if (!url_dequeue (queue, (struct iri **) &i, (const char **)&url, (const char **)&referer, &depth, &html_allowed, &css_allowed)) break; @@ -266,7 +277,8 @@ retrieve_tree (const char *start_url) int dt = 0; char *redirected = NULL; - status = retrieve_url (url, &file, &redirected, referer, &dt, false); + status = retrieve_url (url, &file, &redirected, referer, &dt, + false, i); if (html_allowed && file && status == RETROK && (dt & RETROKF) && (dt & TEXTHTML)) @@ -294,7 +306,7 @@ retrieve_tree (const char *start_url) if (descend) { if (!descend_redirect_p (redirected, url, depth, - start_url_parsed, blacklist)) + start_url_parsed, blacklist, i)) descend = false; else /* Make sure that the old pre-redirect form gets @@ -346,7 +358,7 @@ retrieve_tree (const char *start_url) bool meta_disallow_follow = false; struct urlpos *children = is_css ? get_urls_css_file (file, url) : - get_urls_html (file, url, &meta_disallow_follow); + get_urls_html (file, url, &meta_disallow_follow, i); if (opt.use_robots && meta_disallow_follow) { @@ -357,7 +369,8 @@ retrieve_tree (const char *start_url) if (children) { struct urlpos *child = children; - struct url *url_parsed = url_parsed = url_parse (url, NULL); + struct url *url_parsed = url_parse (url, NULL, i); + struct iri *ci; char *referer_url = url; bool strip_auth = (url_parsed != NULL && url_parsed->user != NULL); @@ -374,9 +387,11 @@ retrieve_tree (const char *start_url) if (dash_p_leaf_HTML && !child->link_inline_p) continue; if (download_child_p (child, url_parsed, depth, start_url_parsed, - blacklist)) + blacklist, i)) { - url_enqueue (queue, xstrdup (child->url->url), + ci = iri_new (); + set_uri_encoding (ci, i->content_encoding); + url_enqueue (queue, ci, xstrdup (child->url->url), xstrdup (referer_url), depth + 1, child->link_expect_html, child->link_expect_css); @@ -394,18 +409,18 @@ retrieve_tree (const char *start_url) } } - if (file - && (opt.delete_after + if (file + && (opt.delete_after || opt.spider /* opt.recursive is implicitely true */ || !acceptable (file))) { /* Either --delete-after was specified, or we loaded this - (otherwise unneeded because of --spider or rejected by -R) - HTML file just to harvest its hyperlinks -- in either case, + (otherwise unneeded because of --spider or rejected by -R) + HTML file just to harvest its hyperlinks -- in either case, delete the local file. */ DEBUGP (("Removing file due to %s in recursive_retrieve():\n", opt.delete_after ? "--delete-after" : - (opt.spider ? "--spider" : + (opt.spider ? "--spider" : "recursive rejection criteria"))); logprintf (LOG_VERBOSE, (opt.delete_after || opt.spider @@ -421,6 +436,7 @@ retrieve_tree (const char *start_url) xfree (url); xfree_null (referer); xfree_null (file); + iri_free (i); } /* If anything is left of the queue due to a premature exit, free it @@ -429,9 +445,11 @@ retrieve_tree (const char *start_url) char *d1, *d2; int d3; bool d4, d5; - while (url_dequeue (queue, + struct iri *d6; + while (url_dequeue (queue, (struct iri **)&d6, (const char **)&d1, (const char **)&d2, &d3, &d4, &d5)) { + iri_free (d6); xfree (d1); xfree_null (d2); } @@ -460,7 +478,8 @@ retrieve_tree (const char *start_url) static bool download_child_p (const struct urlpos *upos, struct url *parent, int depth, - struct url *start_url_parsed, struct hash_table *blacklist) + struct url *start_url_parsed, struct hash_table *blacklist, + struct iri *iri) { struct url *u = upos->url; const char *url = u->url; @@ -470,7 +489,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, if (string_set_contains (blacklist, url)) { - if (opt.spider) + if (opt.spider) { char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD); DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url))); @@ -601,7 +620,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, if (!specs) { char *rfile; - if (res_retrieve_file (url, &rfile)) + if (res_retrieve_file (url, &rfile, iri)) { specs = res_parse_from_file (rfile); @@ -656,23 +675,24 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, static bool descend_redirect_p (const char *redirected, const char *original, int depth, - struct url *start_url_parsed, struct hash_table *blacklist) + struct url *start_url_parsed, struct hash_table *blacklist, + struct iri *iri) { struct url *orig_parsed, *new_parsed; struct urlpos *upos; bool success; - orig_parsed = url_parse (original, NULL); + orig_parsed = url_parse (original, NULL, NULL); assert (orig_parsed != NULL); - new_parsed = url_parse (redirected, NULL); + new_parsed = url_parse (redirected, NULL, NULL); assert (new_parsed != NULL); upos = xnew0 (struct urlpos); upos->url = new_parsed; success = download_child_p (upos, orig_parsed, depth, - start_url_parsed, blacklist); + start_url_parsed, blacklist, iri); url_free (orig_parsed); url_free (new_parsed); diff --git a/src/res.c b/src/res.c index 8c35f0e1..69abd12d 100644 --- a/src/res.c +++ b/src/res.c @@ -532,21 +532,28 @@ res_get_specs (const char *host, int port) Return true if robots were retrieved OK, false otherwise. */ bool -res_retrieve_file (const char *url, char **file) +res_retrieve_file (const char *url, char **file, struct iri *iri) { + struct iri *i = iri_new (); uerr_t err; char *robots_url = uri_merge (url, RES_SPECS_LOCATION); int saved_ts_val = opt.timestamping; int saved_sp_val = opt.spider; + /* Copy server URI encoding for a possible IDNA transformation, no need to + encode the full URI in UTF-8 because "robots.txt" is plain ASCII */ + set_uri_encoding (i, iri->uri_encoding); + i->utf8_encode = false; + logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n")); *file = NULL; opt.timestamping = false; opt.spider = false; - err = retrieve_url (robots_url, file, NULL, NULL, NULL, false); + err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i); opt.timestamping = saved_ts_val; - opt.spider = saved_sp_val; + opt.spider = saved_sp_val; xfree (robots_url); + iri_free (i); if (err != RETROK && *file != NULL) { diff --git a/src/res.h b/src/res.h index 94a57750..5439eaf9 100644 --- a/src/res.h +++ b/src/res.h @@ -40,7 +40,7 @@ bool res_match_path (const struct robot_specs *, const char *); void res_register_specs (const char *, int, struct robot_specs *); struct robot_specs *res_get_specs (const char *, int); -bool res_retrieve_file (const char *, char **); +bool res_retrieve_file (const char *, char **, struct iri *); bool is_robots_txt_url (const char *); diff --git a/src/retr.c b/src/retr.c index 0fc46837..ae8ef3ef 100644 --- a/src/retr.c +++ b/src/retr.c @@ -51,6 +51,7 @@ as that of the covered work. */ #include "hash.h" #include "convert.h" #include "ptimer.h" +#include "iri.h" #include "html-url.h" /* Total size of downloaded files. Used to enforce quota. */ @@ -597,7 +598,7 @@ static char *getproxy (struct url *); uerr_t retrieve_url (const char *origurl, char **file, char **newloc, - const char *refurl, int *dt, bool recursive) + const char *refurl, int *dt, bool recursive, struct iri *iri) { uerr_t result; char *url; @@ -625,7 +626,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, if (file) *file = NULL; - u = url_parse (url, &up_error_code); + second_try: + u = url_parse (url, &up_error_code, iri); if (!u) { logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code)); @@ -633,6 +635,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, return URLERROR; } + printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, iri->uri_encoding, iri->utf8_encode); + if (!refurl) refurl = opt.referer; @@ -646,8 +650,13 @@ retrieve_url (const char *origurl, char **file, char **newloc, proxy = getproxy (u); if (proxy) { + /* sXXXav : could a proxy include a path ??? */ + struct iri *pi = iri_new (); + set_uri_encoding (pi, opt.locale); + pi->utf8_encode = false; + /* Parse the proxy URL. */ - proxy_url = url_parse (proxy, &up_error_code); + proxy_url = url_parse (proxy, &up_error_code, NULL); if (!proxy_url) { logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"), @@ -672,7 +681,7 @@ retrieve_url (const char *origurl, char **file, char **newloc, #endif || (proxy_url && proxy_url->scheme == SCHEME_HTTP)) { - result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url); + result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri); } else if (u->scheme == SCHEME_FTP) { @@ -722,8 +731,13 @@ retrieve_url (const char *origurl, char **file, char **newloc, xfree (mynewloc); mynewloc = construced_newloc; + /* Reset UTF-8 encoding state, keep the URI encoding and reset + the content encoding. */ + iri->utf8_encode = opt.enable_iri; + set_content_encoding (iri, NULL); + /* Now, see if this new location makes sense. */ - newloc_parsed = url_parse (mynewloc, &up_error_code); + newloc_parsed = url_parse (mynewloc, &up_error_code, iri); if (!newloc_parsed) { logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc), @@ -770,8 +784,21 @@ retrieve_url (const char *origurl, char **file, char **newloc, goto redirected; } - if (local_file) + /* Try to not encode in UTF-8 if fetching failed */ + if (!(*dt & RETROKF) && iri->utf8_encode) { + iri->utf8_encode = false; + printf ("[Fallbacking to non-utf8 for `%s'\n", url); + goto second_try; + } + + if (local_file && *dt & RETROKF) + { + register_download (u->url, local_file); + if (redirection_count && 0 != strcmp (origurl, u->url)) + register_redirection (origurl, u->url); + if (*dt & TEXTHTML) + register_html (u->url, local_file); if (*dt & RETROKF) { register_download (u->url, local_file); @@ -821,13 +848,17 @@ retrieve_from_file (const char *file, bool html, int *count) { uerr_t status; struct urlpos *url_list, *cur_url; + struct iri *iri = iri_new(); char *input_file = NULL; const char *url = file; status = RETROK; /* Suppose everything is OK. */ *count = 0; /* Reset the URL count. */ - + + /* sXXXav : Assume filename and links in the file are in the locale */ + set_content_encoding (iri, opt.locale); + if (url_has_scheme (url)) { int dt; @@ -836,7 +867,7 @@ retrieve_from_file (const char *file, bool html, int *count) if (!opt.base_href) opt.base_href = xstrdup (url); - status = retrieve_url (url, &input_file, NULL, NULL, &dt, false); + status = retrieve_url (url, &input_file, NULL, NULL, &dt, false, iri); if (status != RETROK) return status; @@ -846,7 +877,7 @@ retrieve_from_file (const char *file, bool html, int *count) else input_file = (char *) file; - url_list = (html ? get_urls_html (input_file, NULL, NULL) + url_list = (html ? get_urls_html (input_file, NULL, NULL, iri) : get_urls_file (input_file)); for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count) @@ -868,15 +899,16 @@ retrieve_from_file (const char *file, bool html, int *count) int old_follow_ftp = opt.follow_ftp; /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ - if (cur_url->url->scheme == SCHEME_FTP) + if (cur_url->url->scheme == SCHEME_FTP) opt.follow_ftp = 1; - + status = retrieve_tree (cur_url->url->url); opt.follow_ftp = old_follow_ftp; } else - status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive); + status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, + &dt, opt.recursive, iri); if (filename && opt.delete_after && file_exists_p (filename)) { @@ -1047,7 +1079,11 @@ bool url_uses_proxy (const char *url) { bool ret; - struct url *u = url_parse (url, NULL); + struct url *u; + struct iri *i = iri_new(); + /* url was given in the command line, so use locale as encoding */ + set_uri_encoding (i, opt.locale); + u= url_parse (url, NULL, i); if (!u) return false; ret = getproxy (u) != NULL; diff --git a/src/retr.h b/src/retr.h index ec55cfda..bb2e66d3 100644 --- a/src/retr.h +++ b/src/retr.h @@ -51,7 +51,8 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int); char *fd_read_hunk (int, hunk_terminator_t, long, long); char *fd_read_line (int); -uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool); +uerr_t retrieve_url (const char *, char **, char **, const char *, int *, + bool, struct iri *); uerr_t retrieve_from_file (const char *, bool, int *); const char *retr_rate (wgint, double); diff --git a/src/url.c b/src/url.c index f5d621f9..c7a3a721 100644 --- a/src/url.c +++ b/src/url.c @@ -42,6 +42,7 @@ as that of the covered work. */ #include "utils.h" #include "url.h" #include "host.h" /* for is_valid_ipv6_address */ +#include "iri.h" #ifdef TESTING #include "test.h" @@ -640,7 +641,7 @@ static const char *parse_errors[] = { error, and if ERROR is not NULL, also set *ERROR to the appropriate error code. */ struct url * -url_parse (const char *url, int *error) +url_parse (const char *url, int *error, struct iri *iri) { struct url *u; const char *p; @@ -659,7 +660,7 @@ url_parse (const char *url, int *error) int port; char *user = NULL, *passwd = NULL; - char *url_encoded = NULL; + char *url_encoded = NULL, *new_url = NULL; int error_code; @@ -670,9 +671,20 @@ url_parse (const char *url, int *error) goto error; } - url_encoded = reencode_escapes (url); + if (iri && iri->utf8_encode) + { + url_unescape ((char *) url); + iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url); + if (!iri->utf8_encode) + new_url = NULL; + } + + url_encoded = reencode_escapes (new_url ? new_url : url); p = url_encoded; + if (new_url && url_encoded != new_url) + xfree (new_url); + p += strlen (supported_schemes[scheme].leading_string); uname_b = p; p = url_skip_credentials (p); @@ -842,6 +854,18 @@ url_parse (const char *url, int *error) { url_unescape (u->host); host_modified = true; + + /* Apply IDNA regardless of iri->utf8_encode status */ + if (opt.enable_iri && iri) + { + char *new = idn_encode (iri, u->host); + if (new) + { + xfree (u->host); + u->host = new; + host_modified = true; + } + } } if (params_b) @@ -851,7 +875,7 @@ url_parse (const char *url, int *error) if (fragment_b) u->fragment = strdupdelim (fragment_b, fragment_e); - if (path_modified || u->fragment || host_modified || path_b == path_e) + if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e) { /* If we suspect that a transformation has rendered what url_string might return different from URL_ENCODED, rebuild diff --git a/src/url.h b/src/url.h index 7c8bcfed..9c49c0b5 100644 --- a/src/url.h +++ b/src/url.h @@ -84,7 +84,7 @@ struct url char *url_escape (const char *); -struct url *url_parse (const char *, int *); +struct url *url_parse (const char *, int *, struct iri *iri); const char *url_error (int); char *url_full_path (const struct url *); void url_set_dir (struct url *, const char *); diff --git a/src/wget.h b/src/wget.h index d87dfcac..b17b6709 100644 --- a/src/wget.h +++ b/src/wget.h @@ -218,6 +218,9 @@ typedef double SUM_SIZE_INT; #include "quote.h" #include "quotearg.h" +/* Likewise for struct iri definition */ +#include "iri.h" + /* Useful macros used across the code: */ /* The number of elements in an array. For example: