From: Micah Cowan Date: Thu, 25 Jun 2009 08:14:11 +0000 (-0700) Subject: Merge with mainline. X-Git-Tag: v1.13~338 X-Git-Url: http://sjero.net/git/?p=wget;a=commitdiff_plain;h=4f3dd6817348433eafde04a3c2946f43364de7ef;hp=-c Merge with mainline. --- 4f3dd6817348433eafde04a3c2946f43364de7ef diff --combined ChangeLog index 8358b3bd,a891c52e..659415aa --- a/ChangeLog +++ b/ChangeLog @@@ -1,28 -1,3 +1,28 @@@ +2009-06-14 Micah Cowan + + * po/Makefile.in.in (distclean): remove en_US.po, too. + + * Makefile.am: Include md5 as a subdir unconditionally. + It may result in useless compilation, and additional risk of + breaking a build of something that isn't actually needed, but + otherwise it's too much of a hassle to manage a failure-free + distcheck. + +2009-06-12 Micah Cowan + + * configure.ac: Check for h_errno declaration. Idea thanks to + Maciej W. Rozycki. + +2009-03-03 Steven Schubiger + + * src/ftp.c, src/http.c, src/main.c, src/recur.h, + tests/Makefile.am: Update the copyright years. + +2009-01-23 Steven Schubiger + + * util/freeopts, util/rmold.pl, util/trunc.c: Remove + unnecessary whitespace. + 2008-11-10 Micah Cowan * MAILING-LIST: Mention Gmane, introduce subsections. @@@ -49,6 -24,14 +49,14 @@@ * AUTHORS: Added Steven Schubiger. + 2008-06-26 Xavier Saint + + * configure.ac : IRIs support required libiconv, check it. + + 2008-06-14 Xavier Saint + + * configure.ac: Add support for IRIs + 2008-05-29 Micah Cowan * po/*.po: Updated from TP (the 1.11.3 set). diff --combined configure.ac index 78fd5e14,fb0c65d1..dcb302fa --- a/configure.ac +++ b/configure.ac @@@ -163,8 -163,6 +163,8 @@@ AC_CHECK_HEADERS(unistd.h sys/time.h AC_CHECK_HEADERS(termios.h sys/ioctl.h sys/select.h utime.h sys/utime.h) AC_CHECK_HEADERS(stdint.h inttypes.h pwd.h wchar.h) +AC_CHECK_DECLS(h_errno,,,[#include ]) + dnl dnl Check sizes of integer types. 
These are used to find n-bit dnl integral types on older systems that fail to provide intN_t and @@@ -462,6 -460,77 +462,77 @@@ els fi AC_SUBST(COMMENT_IF_NO_POD2MAN) + + dnl + dnl Check for IDN/IRIs + dnl + + AC_ARG_ENABLE(iri, + AC_HELP_STRING([--disable-iri],[disable IDN/IRIs support]), + [case "${enable_iri}" in + no) + dnl Disable IRIs checking + AC_MSG_NOTICE([disabling IRIs at user request]) + iri=no + ;; + yes) + dnl IRIs explicitly enabled + iri=yes + force_iri=yes + ;; + auto) + dnl Auto-detect IRI + iri=yes + ;; + *) + AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri']) + ;; + esac + ], [ + dnl If nothing is specified, assume auto-detection + iri=yes + ] + ) + + AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]], + [Support IDN/IRIs (needs GNU Libidn)]), + libidn=$withval, libidn="") + if test "X$iri" != "Xno"; then + AM_ICONV + + if test "X$am_cv_func_iconv" != "Xyes"; then + iri=no + if test "X$force_iri" = "Xyes"; then + AC_MSG_ERROR([Libiconv is required for IRIs support]) + else + AC_MSG_NOTICE([disabling IRIs because libiconv wasn't found]) + fi + fi + fi + + if test "X$iri" != "Xno"; then + if test "$libidn" != ""; then + LDFLAGS="${LDFLAGS} -L$libidn/lib" + CPPFLAGS="${CPPFLAGS} -I$libidn/include" + fi + AC_CHECK_HEADER(idna.h, + AC_CHECK_LIB(idn, stringprep_check_version, + [iri=yes LIBS="${LIBS} -lidn"], iri=no), + iri=no) + + if test "X$iri" != "Xno" ; then + AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.]) + AC_MSG_NOTICE([Enabling support for IRI.]) + else + AC_MSG_WARN([Libidn not found]) + fi + fi + + + dnl Needed by src/Makefile.am + AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"]) + + dnl dnl Create output dnl diff --combined doc/ChangeLog index 39f390c4,dc1d4084..898e3c6e --- a/doc/ChangeLog +++ b/doc/ChangeLog @@@ -1,31 -1,3 +1,31 @@@ +2009-06-20 Micah Cowan + + * wget.texi (Contributors): Added Jay Krell. + +2009-06-14 Micah Cowan + + * Makefile.am (wget.pod): $(srcdir)/version.texi -> version.texi + +2009-06-12 Micah Cowan + + * wget.texi (Download Options): More accuracy on what happens when + -nd is used with -r or -p. + +2009-06-11 Micah Cowan + + * wget.texi (Contributors): Added Xin Zou, Benjamin Wolsley, and + Robert Millan. + +2009-06-11 Joao Ferreira + + * wget.texi (Option Syntax): Fixed contradictory and confusing + explanation of --folow-ftp and negation. + +2009-06-10 Micah Cowan + + * sample.wgetrc: Add "https_proxy" to the proxy examples. Thanks + to Martin Paul for the suggestion. + 2008-11-15 Steven Schubiger * sample.wgetrc: Comment the waitretry "default" value, @@@ -70,6 -42,15 +70,15 @@@ * wget.texi (Robot Exclusion): Fixed typo "downloads" -> "download" + 2008-08-03 Xavier Saint + + * wget.texi : Add option descriptions for the three new + options --iri, --locale and --remote-encoding related to + IRI support. + + * sample.wgetrc : Add commented lines for the three new + command iri, locale and encoding related to IRI support. + 2008-08-03 Micah Cowan * wget.texi: Don't set UPDATED; already set by version.texi. diff --combined doc/sample.wgetrc index 62981c8f,12914aea..1ce90dea --- a/doc/sample.wgetrc +++ b/doc/sample.wgetrc @@@ -73,9 -73,8 +73,9 @@@ # is *not* sent by default. #header = Accept-Language: en -# You can set the default proxies for Wget to use for http and ftp. +# You can set the default proxies for Wget to use for http, https, and ftp. # They will override the value in the environment. 
+#https_proxy = http://proxy.yoyodyne.com:18023/ #http_proxy = http://proxy.yoyodyne.com:18023/ #ftp_proxy = http://proxy.yoyodyne.com:18023/ @@@ -114,3 -113,12 +114,12 @@@ # To try ipv6 addresses first: #prefer-family = IPv6 + + # Set default IRI support state + #iri = off + + # Force the default system encoding + #locale = UTF-8 + + # Force the default remote server encoding + #remoteencoding = UTF-8 diff --combined doc/wget.texi index 92ed7905,a2804fb4..252548f8 --- a/doc/wget.texi +++ b/doc/wget.texi @@@ -396,8 -396,8 +396,8 @@@ the option name; negative options can b @samp{--no-} prefix. This might seem superfluous---if the default for an affirmative option is to not do something, then why provide a way to explicitly turn it off? But the startup file may in fact change -the default. For instance, using @code{follow_ftp = off} in -@file{.wgetrc} makes Wget @emph{not} follow FTP links by default, and +the default. For instance, using @code{follow_ftp = on} in +@file{.wgetrc} makes Wget @emph{follow} FTP links by default, and using @samp{--no-follow-ftp} is the only way to restore the factory default from the command line. @@@ -582,24 -582,23 +582,24 @@@ behavior depends on a few options, incl cases, the local file will be @dfn{clobbered}, or overwritten, upon repeated download. In other cases it will be preserved. -When running Wget without @samp{-N}, @samp{-nc}, @samp{-r}, or @samp{p}, -downloading the same file in the same directory will result in the -original copy of @var{file} being preserved and the second copy being -named @samp{@var{file}.1}. If that file is downloaded yet again, the -third copy will be named @samp{@var{file}.2}, and so on. When -@samp{-nc} is specified, this behavior is suppressed, and Wget will -refuse to download newer copies of @samp{@var{file}}. Therefore, -``@code{no-clobber}'' is actually a misnomer in this mode---it's not -clobbering that's prevented (as the numeric suffixes were already -preventing clobbering), but rather the multiple version saving that's -prevented. - -When running Wget with @samp{-r} or @samp{-p}, but without @samp{-N} -or @samp{-nc}, re-downloading a file will result in the new copy -simply overwriting the old. Adding @samp{-nc} will prevent this -behavior, instead causing the original version to be preserved and any -newer copies on the server to be ignored. +When running Wget without @samp{-N}, @samp{-nc}, @samp{-r}, or +@samp{-p}, downloading the same file in the same directory will result +in the original copy of @var{file} being preserved and the second copy +being named @samp{@var{file}.1}. If that file is downloaded yet +again, the third copy will be named @samp{@var{file}.2}, and so on. +(This is also the behavior with @samp{-nd}, even if @samp{-r} or +@samp{-p} are in effect.) When @samp{-nc} is specified, this behavior +is suppressed, and Wget will refuse to download newer copies of +@samp{@var{file}}. Therefore, ``@code{no-clobber}'' is actually a +misnomer in this mode---it's not clobbering that's prevented (as the +numeric suffixes were already preventing clobbering), but rather the +multiple version saving that's prevented. + +When running Wget with @samp{-r} or @samp{-p}, but without @samp{-N}, +@samp{-nd}, or @samp{-nc}, re-downloading a file will result in the +new copy simply overwriting the old. Adding @samp{-nc} will prevent +this behavior, instead causing the original version to be preserved +and any newer copies on the server to be ignored. 
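(Editorial illustration, not part of the patch: a minimal shell sketch of the clobbering/numbering behaviour described in the wget.texi paragraph just above; the URL example.com/index.html is an assumed placeholder.)

    $ wget http://example.com/index.html      # first copy saved as index.html
    $ wget http://example.com/index.html      # original kept, second copy saved as index.html.1
    $ wget -nc http://example.com/index.html  # refuses to download: index.html already exists
    $ wget -r -p http://example.com/          # without -N, -nd, or -nc, a re-download simply overwrites the old copy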
When running Wget with @samp{-N}, with or without @samp{-r} or @samp{-p}, the decision as to whether or not to download a newer copy @@@ -675,6 -674,30 +675,30 @@@ Another instance where you'll get a gar Note that @samp{-c} only works with @sc{ftp} servers and with @sc{http} servers that support the @code{Range} header. + @cindex iri support + @cindex idn support + @item --iri + + Turn on internationalized URI (IRI) support. Use @samp{--iri=no} to + turn it off. IRI support is activated by default. + + You can set the default state of IRI support using @code{iri} command in + @file{.wgetrc}. That setting may be overridden from the command line. + + @cindex local encoding + @cindex locale + @item --locale=@var{encoding} + + Force Wget to use @var{encoding} as the default system encoding. That affects + how Wget converts URLs specified as arguments from locale to @sc{utf-8} for + IRI support. + + Wget use the function @code{nl_langinfo()} and then the @code{CHARSET} + environment variable to get the locale. If it fails, @sc{ascii} is used. + + You can set the default locale using the @code{locale} command in + @file{.wgetrc}. That setting may be overridden from the command line. + @cindex progress indicator @cindex dot style @item --progress=@var{type} @@@ -706,6 -729,21 +730,21 @@@ command line. The exception is that, w ``dot'' progress will be favored over ``bar''. To force the bar output, use @samp{--progress=bar:force}. + @cindex remote encoding + @item --remote-encoding=@var{encoding} + + Force Wget to use encoding as the default remote server encoding. That + affects how Wget converts URIs found in files from remote encoding to + @sc{utf-8} during a recursive fetch. This options is only useful for + IRI support, for the interpretation of non-@sc{ascii} characters. + + For HTTP, remote encoding can be found in HTTP @code{Content-Type} + header and in HTML @code{Content-Type http-equiv} meta tag. + + You can set the default encoding using the @code{remoteencoding} + command in @file{.wgetrc}. That setting may be overridden from the + command line. + @item -N @itemx --timestamping Turn on time-stamping. @xref{Time-Stamping}, for details. @@@ -3935,7 -3973,6 +3974,7 @@@ Fila Kolodny Alexander Kourakos, Martin Kraemer, Sami Krank, +Jay Krell, @tex $\Sigma\acute{\iota}\mu o\varsigma\; \Xi\varepsilon\nu\iota\tau\acute{\epsilon}\lambda\lambda\eta\varsigma$ @@@ -3966,7 -4003,6 +4005,7 @@@ Aurelien Marchand Matthew J.@: Mellon, Jordan Mendelson, Ted Mielczarek, +Robert Millan, Lin Zhe Min, Jan Minar, Tim Mooney, @@@ -4042,7 -4078,6 +4081,7 @@@ Charles G Waldman Douglas E.@: Wegscheid, Ralf Wildenhues, Joshua David Williams, +Benjamin Wolsey, YAMAZAKI Makoto, Jasmin Zainul, @iftex @@@ -4051,8 -4086,7 +4090,8 @@@ Bojan @v{Z}drnja @ifnottex Bojan Zdrnja, @end ifnottex -Kristijan Zimmer. +Kristijan Zimmer, +Xin Zou. Apologies to all who I accidentally left out, and many thanks to all the subscribers of the Wget mailing list. diff --combined src/ChangeLog index a6dd402c,2d3331f1..bd833ea0 --- a/src/ChangeLog +++ b/src/ChangeLog @@@ -1,144 -1,8 +1,144 @@@ +2009-06-20 Jay Krell + + * sysdep.h (_ALL_SOURCE): (small change) Define the _ALL_SOURCE + macro on INTERIX systems. (I switched the location from ftp.c to + sysdep.h --mjc) + +2009-06-15 Micah Cowan + + * ftp.c (getftp): If we can't accept the connection, return + CONERROR, not whatever the contents of err happens to be. Fixes + bug #25015. 
+ + * retr.c (fd_read_body): Make both args to progress_create + consistent, resulting in an accurate progress display. Fixes bug + #24948. + +2009-06-14 Micah Cowan + + * Makefile.am (wget_SOURCES): css-tokens.h needs to ship with + dist, too. + +2009-06-13 Micah Cowan + + * init.c: Rename setval_internal_wrapper to setval_internal_tilde, + ensure we don't "replace" the tilde unless it's actually + present. Clean up some minor GNU style issues. + +2009-06-13 Julien Pichon + + * init.c: Handle tilde-expansion in wgetrc commands, without + resorting to setting/unsetting globals to change behavior in one + call location. + +2009-06-12 Micah Cowan + + * host.c: Include before . Not + required by POSIX any more, but some older systems (such as + FreeBSD 4.1) still need it, and it doesn't seem like it could + hurt... + + * build_info.c (library): Handle "https" as a feature in its own + right, apart from "gnutls" and "openssl". + + * host.c: Declare h_errno if no declaration is provided. Idea + thanks to Maciej W. Rozycki. + +2009-06-11 Xin Zou + + * http.c (gethttp): Fix some memory leaks. + +2009-06-11 Micah Cowan + + * http.c (http_atotm): Handle potential for setlocale's return + value to be static storage. Thanks to Benjamin Wolsey + . + + * sysdep.h: Need NAMESPACE_TWEAKS on non-Linux glibc-based + systems, too. Thanks to Robert Millan. + +2009-05-28 Steven Schubiger + + * ftp.c (ftp_get_listing): Update the "listing file" + string after calling ftp_loop_internal(). + +2009-05-27 Steven Schubiger + + * ftp.c (ftp_get_listing): Duplicate the "listing file" + string to avoid memory corruption when FOPEN_EXCL_ERR is + encountered. + +2009-05-17 Steven Schubiger + + * progress.c (eta_to_human_short): Fix the remaining hours + to be displayed. Spotted by Tadeu Martins (#26411). + +2009-04-24 Micah Cowan + + * hash.c: Change stdint.h inclusion to use HAVE_STDINT_H, not C99 + check. + + * connect.c: stdint.h inclusion added. + + Thanks to Markus Duft for a similar patch. + +2009-04-20 Micah Cowan + + * Makefile.am (version.c): Fix unportable use of "echo -n". + +2009-04-13 Steven Schubiger + + * ftp.c (ftp_retrieve_list): Move the duplicated code that + determines the local file to a function. + + * http.c (http_loop): Likewise. + + * retr.c (set_local_file): New function. + +2009-04-11 Steven Schubiger + + * init.c (initialize): Run a custom SYSTEM_WGETRC when + provided as an environment variable. + +2009-02-27 Gisle Vanem + + * main.c (main): "freopen (NULL,.." causes an assertion in MSVC + debug-mode. I.e. NULL isn't legal. But the "CONOUT$" device works + fine. + +2009-02-27 Steven Schubiger + + * ftp.c (ftp_loop_internal): Don't claim for FTP retrievals + when writing to standard output either that the document + has been saved. Addresses bug #20520 again. + +2009-02-21 Steven Schubiger + + * http.c (http_loop): When a document is written to + standard output, don't claim it has been saved to a file. + Addresses bug #20520. + +2009-02-18 Steven Schubiger + + * recur.h: Remove the dangling declaration for recursive_cleanup(). + +2009-02-01 Gerardo E. Gidoni + + * main.c, recur.c, recur.h, res.c, retr.c, retr.h: restructured code to + avoid multiple 'url_parse' calls. + 2008-11-13 Micah Cowan * http.c (gethttp): Don't do anything when content-length >= our requested range. 
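(Editorial illustration, not part of the patch: a minimal shell sketch of the IRI-related options introduced by this merge; option and command spellings are taken from the wget.texi and doc/sample.wgetrc hunks above, and the URL is an assumed placeholder.)

    $ wget --iri http://example.com/          # IRI support, enabled by default in this patch
    $ wget --iri=no http://example.com/       # turn IRI support off for this run
    $ wget --locale=UTF-8 --remote-encoding=UTF-8 http://example.com/
    # or the equivalent .wgetrc settings added in doc/sample.wgetrc:
    #   iri = off
    #   locale = UTF-8
    #   remoteencoding = UTF-8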
+2008-11-27 Saint Xavier + + * http.c (gethttp): Move authentication code before filename + allocation avoiding fallbacking on default filename because + "Content-Disposition" header wasn't present before authentcation + has been completed. Fixes bug #24862. + 2008-11-16 Steven Schubiger * main.c: Declare and initialize the numurls counter. @@@ -256,11 -120,27 +256,27 @@@ * init.c (cleanup): Free the memory associated with the base option (when DEBUG_MALLOC is defined). + 2008-07-02 Xavier Saint + + * iri.c, iri.h : New function idn_decode() to decode ASCII + encoded hostname to the locale. + + * host.c : Show hostname to be resolved both in locale and + ASCII encoded. + 2008-06-28 Steven Schubiger * retr.c (retrieve_from_file): Allow for reading the links from an external file (HTTP/FTP). + 2008-06-26 Xavier Saint + + * iri.c, iri.h : New functions locale_to_utf8() and + idn_encode() adding basic capabilities of IRI/IDN. + + * url.c : Convert URLs from locale to UTF-8 allowing a basic + support of IRI/IDN + 2008-06-25 Steven Schubiger * ftp.c (getftp): When spidering a FTP URL, emit a diagnostic @@@ -285,7 -165,7 +301,7 @@@ * http.c: Make -nv --spider include the file's name when it exists. - + 2008-06-22 Micah Cowan * Makefile.am (version.c): Fixed version string invocation so it @@@ -293,12 -173,57 +309,57 @@@ string vars pointers-to-const, and moved line lengths below 80 (in Makefile.am, not in version.c). + 2008-06-19 Xavier Saint + + * iri.c, iri.h : New function check_encoding_name() as + a preliminary encoding name check. + + * main.c, iri.c : Make use of check_encoding_name(). + + 2008-06-19 Xavier Saint + + * iri.c : Include missing stringprep.h file and add a + cast. + + * init.c : set a default initial value for opt.enable_iri, + opt.locale and opt.encoding_remote. + + 2008-06-19 Xavier Saint + + * iri.c, iri.h : Add a new function find_locale() to find + out the local system encoding. + + * main.c : Make use of find_locale(). + + 2008-06-19 Xavier Saint + + * html-url.c : Add "content-type" meta tag parsing for + retrieving page encoding. + + * iri.h : Make no-op version of parse_charset() return + NULL. + 2008-06-16 Micah Cowan * http.c (http_loop): When hstat.len is higher than the successfully completed content's length, but it's because we _set_ it that way, don't abort. + 2008-06-14 Xavier Saint + + * iri.c, iri.h : New files. + + * Makefile.am : Add files iri.h and conditional iri.c. + + * build_info.c : Add compiled feature "iri". + + * http.c : include iri.h and parse charset from Content-Type + header. + + * init.c, main.c, options.h : if an options isn't supported + at compiled time, don't get rid off it and show a dummy + message instead if they are used. + 2008-06-13 Micah Cowan * build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL @@@ -342,11 -267,11 +403,11 @@@ default. 2008-05-17 Kenny Parnell - + (cmd_spec_prefer_family): Initialize prefer_family to prefer_none. 2008-05-17 Micah Cowan - + * main.c (main): Handle Ctrl-D on command-line. 2008-05-15 Steven Schubiger @@@ -385,7 -310,7 +446,7 @@@ * options.h: Add an according boolean member to the options struct. - + * sysdep.h: Comment the defines __EXTENSIONS__ and _GNU_SOURCE out, because they're now defined independently by config.h. diff --combined src/Makefile.am index 1ced6a90,ab830ba0..58e9b545 --- a/src/Makefile.am +++ b/src/Makefile.am @@@ -30,18 -30,22 +30,22 @@@ # Version: @VERSION@ # + if IRI_IS_ENABLED + IRI_OBJ = iri.c + endif + # The following line is losing on some versions of make! 
DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\" LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@ bin_PROGRAMS = wget wget_SOURCES = build_info.c cmpt.c connect.c convert.c cookies.c ftp.c \ - css.l css-url.c \ + css.l css-url.c css-tokens.h \ ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \ http.c init.c log.c main.c netrc.c progress.c ptimer.c \ recur.c res.c retr.c snprintf.c spider.c url.c \ - utils.c \ - css-url.h connect.h convert.h cookies.h \ + utils.c $(IRI_OBJ) \ + css-url.h connect.h convert.h cookies.h \ ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \ http.h http-ntlm.h init.h log.h mswindows.h netrc.h \ options.h progress.h ptimer.h recur.h res.h retr.h \ @@@ -62,7 -66,7 +66,7 @@@ version.c: $(wget_SOURCES) $(LDADD) $( echo '/* version.c */' > $@ echo '/* Autogenerated by Makefile - DO NOT EDIT */' >> $@ echo '' >> $@ - echo -n 'const char *version_string = "@VERSION@"' >> $@ + echo 'const char *version_string = "@VERSION@"' >> $@ -hg log -r . --template='" ({node|short})"\n' 2>/dev/null >> $@ echo ';' >> $@ echo 'const char *compilation_string = "'$(COMPILE)'";' \ diff --combined src/build_info.c index f60c76ee,532dccaf..89ae74f8 --- a/src/build_info.c +++ b/src/build_info.c @@@ -80,12 -80,6 +80,12 @@@ const char* (compiled_features[]) "-md5", #endif +#ifdef HAVE_SSL + "+https", +#else + "-https", +#endif + #ifdef HAVE_LIBGNUTLS "+gnutls", #else @@@ -103,6 -97,13 +103,13 @@@ #else "-gettext", #endif + + #ifdef ENABLE_IRI + "+iri", + #else + "-iri", + #endif + /* sentinel value */ NULL }; diff --combined src/connect.c index f46f11c4,41258d26..0a54c852 --- a/src/connect.c +++ b/src/connect.c @@@ -59,11 -59,6 +59,11 @@@ as that of the covered work. * #include "connect.h" #include "hash.h" +/* Apparently needed for Interix: */ +#ifdef HAVE_STDINT_H +# include +#endif + /* Define sockaddr_storage where unavailable (presumably on IPv4-only hosts). */ @@@ -271,9 -266,25 +271,25 @@@ connect_to_ip (const ip_address *ip, in if (print) { const char *txt_addr = print_address (ip); - if (print && 0 != strcmp (print, txt_addr)) - logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "), - escnonprint_uri (print), txt_addr, port); + if (0 != strcmp (print, txt_addr)) + { + char *str = NULL, *name; + + if (opt.enable_iri && (name = idn_decode ((char *) print)) != NULL) + { + int len = strlen (print) + strlen (name) + 4; + str = xmalloc (len); + snprintf (str, len, "%s (%s)", name, print); + str[len-1] = '\0'; + xfree (name); + } + + logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "), + str ? str : escnonprint_uri (print), txt_addr, port); + + if (str) + xfree (str); + } else logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port); } diff --combined src/host.c index 7b8c4189,bbf40222..b9aaebb4 --- a/src/host.c +++ b/src/host.c @@@ -36,7 -36,6 +36,7 @@@ as that of the covered work. * #include #ifndef WINDOWS +# include # include # include # ifndef __BEOS__ @@@ -59,11 -58,6 +59,11 @@@ # define NO_ADDRESS NO_DATA #endif +#if !HAVE_DECL_H_ERRNO +extern int h_errno; +#endif + + /* Lists of IP addresses that result from running DNS queries. See lookup_host for details. */ @@@ -718,8 -712,24 +718,24 @@@ lookup_host (const char *host, int flag /* No luck with the cache; resolve HOST. */ if (!silent && !numeric_address) - logprintf (LOG_VERBOSE, _("Resolving %s... 
"), - quotearg_style (escape_quoting_style, host)); + { + char *str = NULL, *name; + + if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL) + { + int len = strlen (host) + strlen (name) + 4; + str = xmalloc (len); + snprintf (str, len, "%s (%s)", name, host); + str[len-1] = '\0'; + xfree (name); + } + + logprintf (LOG_VERBOSE, _("Resolving %s... "), + quotearg_style (escape_quoting_style, str ? str : host)); + + if (str) + xfree (str); + } #ifdef ENABLE_IPV6 { diff --combined src/http.c index 50f0c643,9ed226cb..ae89c46d --- a/src/http.c +++ b/src/http.c @@@ -1,6 -1,6 +1,6 @@@ /* HTTP support. Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, - 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. + 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of GNU Wget. @@@ -1366,7 -1366,8 +1366,8 @@@ free_hstat (struct http_stat *hs If PROXY is non-NULL, the connection will be made to the proxy server, and u->url will be requested. */ static uerr_t - gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy) + gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, + struct iri *iri) { struct request *req; @@@ -1815,101 -1816,6 +1816,101 @@@ print_server_response (resp, " "); } + /* Check for keep-alive related responses. */ + if (!inhibit_keep_alive && contlen != -1) + { + if (resp_header_copy (resp, "Keep-Alive", NULL, 0)) + keep_alive = true; + else if (resp_header_copy (resp, "Connection", hdrval, sizeof (hdrval))) + { + if (0 == strcasecmp (hdrval, "Keep-Alive")) + keep_alive = true; + } + } + + if (keep_alive) + /* The server has promised that it will not close the connection + when we're done. This means that we can register it. */ + register_persistent (conn->host, conn->port, sock, using_ssl); + + if (statcode == HTTP_STATUS_UNAUTHORIZED) + { + /* Authorization is required. */ + if (keep_alive && !head_only && skip_short_body (sock, contlen)) + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + pconn.authorized = false; + if (!auth_finished && (user && passwd)) + { + /* IIS sends multiple copies of WWW-Authenticate, one with + the value "negotiate", and other(s) with data. Loop over + all the occurrences and pick the one we recognize. */ + int wapos; + const char *wabeg, *waend; + char *www_authenticate = NULL; + for (wapos = 0; + (wapos = resp_header_locate (resp, "WWW-Authenticate", wapos, + &wabeg, &waend)) != -1; + ++wapos) + if (known_authentication_scheme_p (wabeg, waend)) + { + BOUNDED_TO_ALLOCA (wabeg, waend, www_authenticate); + break; + } + + if (!www_authenticate) + { + /* If the authentication header is missing or + unrecognized, there's no sense in retrying. */ + logputs (LOG_NOTQUIET, _("Unknown authentication scheme.\n")); + } + else if (!basic_auth_finished + || !BEGINS_WITH (www_authenticate, "Basic")) + { + char *pth; + pth = url_full_path (u); + request_set_header (req, "Authorization", + create_authorization_line (www_authenticate, + user, passwd, + request_method (req), + pth, + &auth_finished), + rel_value); + if (BEGINS_WITH (www_authenticate, "NTLM")) + ntlm_seen = true; + else if (!u->user && BEGINS_WITH (www_authenticate, "Basic")) + { + /* Need to register this host as using basic auth, + * so we automatically send creds next time. */ + register_basic_auth_host (u->host); + } + xfree (pth); + xfree_null (message); + resp_free (resp); + xfree (head); + goto retry_with_auth; + } + else + { + /* We already did Basic auth, and it failed. Gotta + * give up. 
*/ + } + } + logputs (LOG_NOTQUIET, _("Authorization failed.\n")); + request_free (req); + xfree_null (message); + resp_free (resp); + xfree (head); + return AUTHFAILED; + } + else /* statcode != HTTP_STATUS_UNAUTHORIZED */ + { + /* Kludge: if NTLM is used, mark the TCP connection as authorized. */ + if (ntlm_seen) + pconn.authorized = true; + } + /* Determine the local filename if needed. Notice that if -O is used * hstat.local_file is set by http_loop to the argument of -O. */ if (!hs->local_file) @@@ -1925,7 -1831,7 +1926,7 @@@ hs->local_file = url_file_name (u); } } - + /* TODO: perform this check only once. */ if (!hs->existence_checked && file_exists_p (hs->local_file)) { @@@ -1944,8 -1850,6 +1945,8 @@@ File %s already there; not retrieving.\ if (has_html_suffix_p (hs->local_file)) *dt |= TEXTHTML; + xfree (head); + xfree_null (message); return RETRUNNEEDED; } else if (!ALLOW_CLOBBER) @@@ -1996,7 -1900,7 +1997,7 @@@ local_dot_orig_file_exists = true; local_filename = filename_plus_orig_suffix; } - } + } if (!local_dot_orig_file_exists) /* Couldn't stat() .orig, so try to stat() . */ @@@ -2044,6 -1948,93 +2045,6 @@@ contlen = parsed; } - /* Check for keep-alive related responses. */ - if (!inhibit_keep_alive && contlen != -1) - { - if (resp_header_copy (resp, "Keep-Alive", NULL, 0)) - keep_alive = true; - else if (resp_header_copy (resp, "Connection", hdrval, sizeof (hdrval))) - { - if (0 == strcasecmp (hdrval, "Keep-Alive")) - keep_alive = true; - } - } - if (keep_alive) - /* The server has promised that it will not close the connection - when we're done. This means that we can register it. */ - register_persistent (conn->host, conn->port, sock, using_ssl); - - if (statcode == HTTP_STATUS_UNAUTHORIZED) - { - /* Authorization is required. */ - if (keep_alive && !head_only && skip_short_body (sock, contlen)) - CLOSE_FINISH (sock); - else - CLOSE_INVALIDATE (sock); - pconn.authorized = false; - if (!auth_finished && (user && passwd)) - { - /* IIS sends multiple copies of WWW-Authenticate, one with - the value "negotiate", and other(s) with data. Loop over - all the occurrences and pick the one we recognize. */ - int wapos; - const char *wabeg, *waend; - char *www_authenticate = NULL; - for (wapos = 0; - (wapos = resp_header_locate (resp, "WWW-Authenticate", wapos, - &wabeg, &waend)) != -1; - ++wapos) - if (known_authentication_scheme_p (wabeg, waend)) - { - BOUNDED_TO_ALLOCA (wabeg, waend, www_authenticate); - break; - } - - if (!www_authenticate) - { - /* If the authentication header is missing or - unrecognized, there's no sense in retrying. */ - logputs (LOG_NOTQUIET, _("Unknown authentication scheme.\n")); - } - else if (!basic_auth_finished - || !BEGINS_WITH (www_authenticate, "Basic")) - { - char *pth; - pth = url_full_path (u); - request_set_header (req, "Authorization", - create_authorization_line (www_authenticate, - user, passwd, - request_method (req), - pth, - &auth_finished), - rel_value); - if (BEGINS_WITH (www_authenticate, "NTLM")) - ntlm_seen = true; - else if (!u->user && BEGINS_WITH (www_authenticate, "Basic")) - { - /* Need to register this host as using basic auth, - * so we automatically send creds next time. */ - register_basic_auth_host (u->host); - } - xfree (pth); - goto retry_with_auth; - } - else - { - /* We already did Basic auth, and it failed. Gotta - * give up. 
*/ - } - } - logputs (LOG_NOTQUIET, _("Authorization failed.\n")); - request_free (req); - return AUTHFAILED; - } - else /* statcode != HTTP_STATUS_UNAUTHORIZED */ - { - /* Kludge: if NTLM is used, mark the TCP connection as authorized. */ - if (ntlm_seen) - pconn.authorized = true; - } request_free (req); hs->statcode = statcode; @@@ -2061,9 -2052,20 +2062,20 @@@ char *tmp = strchr (type, ';'); if (tmp) { + /* sXXXav: only needed if IRI support is enabled */ + char *tmp2 = tmp + 1; + while (tmp > type && c_isspace (tmp[-1])) --tmp; *tmp = '\0'; + + /* Try to get remote encoding if needed */ + if (opt.enable_iri && !opt.encoding_remote) + { + tmp = parse_charset (tmp2); + if (tmp) + set_content_encoding (iri, tmp); + } } } hs->newloc = resp_header_strdup (resp, "Location"); @@@ -2124,7 -2126,6 +2136,7 @@@ else CLOSE_INVALIDATE (sock); xfree_null (type); + xfree (head); return NEWLOCATION; } } @@@ -2180,7 -2181,6 +2192,7 @@@ xfree_null (type); CLOSE_INVALIDATE (sock); /* would be CLOSE_FINISH, but there might be more bytes in the body. */ + xfree (head); return RETRUNNEEDED; } if ((contrange != 0 && contrange != hs->restval) @@@ -2190,7 -2190,6 +2202,7 @@@ Bail out. */ xfree_null (type); CLOSE_INVALIDATE (sock); + xfree (head); return RANGEERR; } if (contlen == -1) @@@ -2254,7 -2253,6 +2266,7 @@@ CLOSE_FINISH (sock); else CLOSE_INVALIDATE (sock); + xfree (head); return RETRFINISHED; } @@@ -2281,7 -2279,6 +2293,7 @@@ _("%s has sprung into existence.\n"), hs->local_file); CLOSE_INVALIDATE (sock); + xfree (head); return FOPEN_EXCL_ERR; } } @@@ -2289,7 -2286,6 +2301,7 @@@ { logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno)); CLOSE_INVALIDATE (sock); + xfree (head); return FOPENERR; } } @@@ -2348,7 -2344,7 +2360,7 @@@ retried, and retried, and retried, and... */ uerr_t http_loop (struct url *u, char **newloc, char **local_file, const char *referer, - int *dt, struct url *proxy) + int *dt, struct url *proxy, struct iri *iri) { int count; bool got_head = false; /* used for time-stamping and filename detection */ @@@ -2359,17 -2355,16 +2371,17 @@@ uerr_t err, ret = TRYLIMEXC; time_t tmr = -1; /* remote time-stamp */ struct http_stat hstat; /* HTTP status */ - struct_stat st; + struct_stat st; bool send_head_first = true; + char *file_name; /* Assert that no value for *LOCAL_FILE was passed. */ assert (local_file == NULL || *local_file == NULL); - + /* Set LOCAL_FILE parameter. */ if (local_file && opt.output_document) *local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document); - + /* Reset NEWLOC parameter. */ *newloc = NULL; @@@ -2406,7 -2401,7 +2418,7 @@@ retrieve the file. But if the output_document was given, then this test was already done and the file didn't exist. Hence the !opt.output_document */ logprintf (LOG_VERBOSE, _("\ - File %s already there; not retrieving.\n\n"), + File %s already there; not retrieving.\n\n"), quote (hstat.local_file)); /* If the file is there, we suppose it's retrieved OK. */ *dt |= RETROKF; @@@ -2422,10 -2417,10 +2434,10 @@@ /* Reset the counter. */ count = 0; - + /* Reset the document type. */ *dt = 0; - + /* Skip preliminary HEAD request if we're not in spider mode AND * if -O was given or HTTP Content-Disposition support is disabled. */ if (!opt.spider @@@ -2434,23 -2429,21 +2446,23 @@@ /* Send preliminary HEAD request if -N is given and we have an existing * destination file. 
*/ + file_name = url_file_name (u); - if (opt.timestamping + if (opt.timestamping && !opt.content_disposition - && file_exists_p (url_file_name (u))) + && file_exists_p (file_name)) send_head_first = true; - + xfree (file_name); + /* THE loop */ do { /* Increment the pass counter. */ ++count; sleep_between_retrievals (count); - + /* Get the current time string. */ tms = datetime_str (time (NULL)); - + if (opt.spider && !got_head) logprintf (LOG_VERBOSE, _("\ Spider mode enabled. Check if remote file exists.\n")); @@@ -2459,20 -2452,20 +2471,20 @@@ if (opt.verbose) { char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD); - - if (count > 1) + + if (count > 1) { char tmp[256]; sprintf (tmp, _("(try:%2d)"), count); logprintf (LOG_NOTQUIET, "--%s-- %s %s\n", tms, tmp, hurl); } - else + else { logprintf (LOG_NOTQUIET, "--%s-- %s\n", tms, hurl); } - + #ifdef WINDOWS ws_changetitle (hurl); #endif @@@ -2482,7 -2475,7 +2494,7 @@@ /* Default document type is empty. However, if spider mode is on or time-stamping is employed, HEAD_ONLY commands is encoded within *dt. */ - if (send_head_first && !got_head) + if (send_head_first && !got_head) *dt |= HEAD_ONLY; else *dt &= ~HEAD_ONLY; @@@ -2515,11 -2508,11 +2527,11 @@@ *dt &= ~SEND_NOCACHE; /* Try fetching the document, or at least its head. */ - err = gethttp (u, &hstat, dt, proxy); + err = gethttp (u, &hstat, dt, proxy, iri); /* Time? */ tms = datetime_str (time (NULL)); - + /* Get the new location (with or without the redirection). */ if (hstat.newloc) *newloc = xstrdup (hstat.newloc); @@@ -2558,7 -2551,7 +2570,7 @@@ hstat.statcode); ret = WRONGCODE; } - else + else { ret = NEWLOCATION; } @@@ -2574,7 -2567,7 +2586,7 @@@ /* All possibilities should have been exhausted. */ abort (); } - + if (!(*dt & RETROKF)) { char *hurl = NULL; @@@ -2593,11 -2586,13 +2605,13 @@@ continue; } /* Maybe we should always keep track of broken links, not just in - * spider mode. */ - else if (opt.spider) + * spider mode. + * Don't log error if it was UTF-8 encoded because we will try + * once unencoded. */ + else if (opt.spider && !iri->utf8_encode) { /* #### Again: ugly ugly ugly! */ - if (!hurl) + if (!hurl) hurl = url_string (u, URL_AUTH_HIDE_PASSWD); nonexisting_url (hurl); logprintf (LOG_NOTQUIET, _("\ @@@ -2606,7 -2601,7 +2620,7 @@@ Remote file does not exist -- broken li else { logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), - tms, hstat.statcode, + tms, hstat.statcode, quotearg_style (escape_quoting_style, hstat.error)); } logputs (LOG_VERBOSE, "\n"); @@@ -2740,8 -2735,16 +2754,8 @@@ Remote file exists.\n\n")) && ((hstat.len == hstat.contlen) || ((hstat.res == 0) && (hstat.contlen == -1)))) { - /* #### This code repeats in http.c and ftp.c. Move it to a - function! */ const char *fl = NULL; - if (opt.output_document) - { - if (output_stream_regular) - fl = opt.output_document; - } - else - fl = hstat.local_file; + set_local_file (&fl, hstat.local_file); if (fl) { time_t newtmr = -1; @@@ -2765,14 -2768,9 +2779,14 @@@ { if (*dt & RETROKF) { + bool write_to_stdout = (opt.output_document && HYPHENP (opt.output_document)); + logprintf (LOG_VERBOSE, - _("%s (%s) - %s saved [%s/%s]\n\n"), - tms, tmrate, quote (hstat.local_file), + write_to_stdout + ? _("%s (%s) - written to stdout %s[%s/%s]\n\n") + : _("%s (%s) - %s saved [%s/%s]\n\n"), + tms, tmrate, + write_to_stdout ? 
"" : quote (hstat.local_file), number_to_static_string (hstat.len), number_to_static_string (hstat.contlen)); logprintf (LOG_NONVERBOSE, @@@ -2801,14 -2799,9 +2815,14 @@@ { if (*dt & RETROKF) { + bool write_to_stdout = (opt.output_document && HYPHENP (opt.output_document)); + logprintf (LOG_VERBOSE, - _("%s (%s) - %s saved [%s]\n\n"), - tms, tmrate, quote (hstat.local_file), + write_to_stdout + ? _("%s (%s) - written to stdout %s[%s]\n\n") + : _("%s (%s) - %s saved [%s]\n\n"), + tms, tmrate, + write_to_stdout ? "" : quote (hstat.local_file), number_to_static_string (hstat.len)); logprintf (LOG_NONVERBOSE, "%s URL:%s [%s] -> \"%s\" [%d]\n", @@@ -2952,7 -2945,6 +2966,7 @@@ http_atotm (const char *time_string Netscape cookie specification.) */ }; const char *oldlocale; + char savedlocale[256]; size_t i; time_t ret = (time_t) -1; @@@ -2960,16 -2952,6 +2974,16 @@@ non-English locales, which we work around by temporarily setting locale to C before invoking strptime. */ oldlocale = setlocale (LC_TIME, NULL); + if (oldlocale) + { + size_t l = strlen (oldlocale); + if (l >= sizeof savedlocale) + savedlocale[0] = '\0'; + else + memcpy (savedlocale, oldlocale, l); + } + else savedlocale[0] = '\0'; + setlocale (LC_TIME, "C"); for (i = 0; i < countof (time_formats); i++) @@@ -2989,8 -2971,7 +3003,8 @@@ } /* Restore the previous locale. */ - setlocale (LC_TIME, oldlocale); + if (savedlocale[0]) + setlocale (LC_TIME, savedlocale); return ret; } diff --combined src/init.c index bbe6b585,5ab0862c..23f8cb2c --- a/src/init.c +++ b/src/init.c @@@ -1,6 -1,6 +1,6 @@@ /* Reading/parsing the initialization file. Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, - 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. + 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of GNU Wget. @@@ -58,6 -58,11 +58,6 @@@ as that of the covered work. * #include "test.h" #endif -/* We want tilde expansion enabled only when reading `.wgetrc' lines; - otherwise, it will be performed by the shell. This variable will - be set by the wgetrc-reading function. 
*/ - -static bool enable_tilde_expansion; #define CMD_DECLARE(func) static bool func (const char *, const char *, void *) @@@ -177,9 -182,11 +177,11 @@@ static const struct { "inet6only", &opt.ipv6_only, cmd_boolean }, #endif { "input", &opt.input_filename, cmd_file }, + { "iri", &opt.enable_iri, cmd_boolean }, { "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean }, { "limitrate", &opt.limit_rate, cmd_bytes }, { "loadcookies", &opt.cookies_input, cmd_file }, + { "locale", &opt.locale, cmd_string }, { "logfile", &opt.lfilename, cmd_file }, { "login", &opt.ftp_user, cmd_string },/* deprecated*/ { "maxredirect", &opt.max_redirect, cmd_number }, @@@ -219,6 -226,7 +221,7 @@@ { "referer", &opt.referer, cmd_string }, { "reject", &opt.rejects, cmd_vector }, { "relativeonly", &opt.relative_only, cmd_boolean }, + { "remoteencoding", &opt.encoding_remote, cmd_string }, { "removelisting", &opt.remove_listing, cmd_boolean }, { "restrictfilenames", NULL, cmd_spec_restrict_file_names }, { "retrsymlinks", &opt.retr_symlinks, cmd_boolean }, @@@ -328,6 -336,14 +331,14 @@@ defaults (void opt.max_redirect = 20; opt.waitretry = 10; + + #ifdef ENABLE_IRI + opt.enable_iri = true; + #else + opt.enable_iri = false; + #endif + opt.locale = NULL; + opt.encoding_remote = NULL; } /* Return the user's home directory (strdup-ed), or NULL if none is @@@ -468,7 -484,6 +479,7 @@@ enum parse_line static enum parse_line parse_line (const char *, char **, char **, int *); static bool setval_internal (int, const char *, const char *); +static bool setval_internal_tilde (int, const char *, const char *); /* Initialize variables from a wgetrc file. Returns zero (failure) if there were errors in the file. */ @@@ -488,6 -503,7 +499,6 @@@ run_wgetrc (const char *file file, strerror (errno)); return true; /* not a fatal error */ } - enable_tilde_expansion = true; ln = 1; while ((line = read_whole_line (fp)) != NULL) { @@@ -499,7 -515,7 +510,7 @@@ { case line_ok: /* If everything is OK, set the value. */ - if (!setval_internal (comind, com, val)) + if (!setval_internal_tilde (comind, com, val)) { fprintf (stderr, _("%s: Error in %s at line %d.\n"), exec_name, file, ln); @@@ -526,6 -542,7 +537,6 @@@ xfree (line); ++ln; } - enable_tilde_expansion = false; fclose (fp); return errcnt == 0; @@@ -536,20 -553,15 +547,20 @@@ void initialize (void) { - char *file; + char *file, *env_sysrc; int ok = true; /* Load the hard-coded defaults. */ defaults (); - - /* If SYSTEM_WGETRC is defined, use it. */ + + /* Run a non-standard system rc file when the according environment + variable has been set. For internal testing purposes only! */ + env_sysrc = getenv ("SYSTEM_WGETRC"); + if (env_sysrc && file_exists_p (env_sysrc)) + ok &= run_wgetrc (env_sysrc); + /* Otherwise, if SYSTEM_WGETRC is defined, use it. */ #ifdef SYSTEM_WGETRC - if (file_exists_p (SYSTEM_WGETRC)) + else if (file_exists_p (SYSTEM_WGETRC)) ok &= run_wgetrc (SYSTEM_WGETRC); #endif /* Override it with your own, if one exists. */ @@@ -662,12 -674,6 +673,12 @@@ parse_line (const char *line, char **co return line_ok; } +#if defined(WINDOWS) || defined(MSDOS) +# define ISSEP(c) ((c) == '/' || (c) == '\\') +#else +# define ISSEP(c) ((c) == '/') +#endif + /* Run commands[comind].action. 
*/ static bool @@@ -678,37 -684,6 +689,37 @@@ setval_internal (int comind, const cha return commands[comind].action (com, val, commands[comind].place); } +static bool +setval_internal_tilde (int comind, const char *com, const char *val) +{ + bool ret; + int homelen; + char *home; + char **pstring; + ret = setval_internal (comind, com, val); + + /* We make tilde expansion for cmd_file and cmd_directory */ + if (((commands[comind].action == cmd_file) || + (commands[comind].action == cmd_directory)) + && ret && (*val == '~' && ISSEP (val[1]))) + { + pstring = commands[comind].place; + home = home_dir (); + if (home) + { + homelen = strlen (home); + while (homelen && ISSEP (home[homelen - 1])) + home[--homelen] = '\0'; + + /* Skip the leading "~/". */ + for (++val; ISSEP (*val); val++) + ; + *pstring = concat_strings (home, "/", val, (char *)0); + } + } + return ret; +} + /* Run command COM with value VAL. If running the command produces an error, report the error and exit. @@@ -844,6 -819,11 +855,6 @@@ cmd_string (const char *com, const cha return true; } -#if defined(WINDOWS) || defined(MSDOS) -# define ISSEP(c) ((c) == '/' || (c) == '\\') -#else -# define ISSEP(c) ((c) == '/') -#endif /* Like the above, but handles tilde-expansion when reading a user's `.wgetrc'. In that case, and if VAL begins with `~', the tilde @@@ -857,7 -837,28 +868,7 @@@ cmd_file (const char *com, const char * /* #### If VAL is empty, perhaps should set *PLACE to NULL. */ - if (!enable_tilde_expansion || !(*val == '~' && ISSEP (val[1]))) - { - noexpand: - *pstring = xstrdup (val); - } - else - { - int homelen; - char *home = home_dir (); - if (!home) - goto noexpand; - - homelen = strlen (home); - while (homelen && ISSEP (home[homelen - 1])) - home[--homelen] = '\0'; - - /* Skip the leading "~/". */ - for (++val; ISSEP (*val); val++) - ; - - *pstring = concat_strings (home, "/", val, (char *) 0); - } + *pstring = xstrdup (val); #if defined(WINDOWS) || defined(MSDOS) /* Convert "\" to "/". */ diff --combined src/main.c index b8039d6b,a2d40888..69df08a7 --- a/src/main.c +++ b/src/main.c @@@ -1,6 -1,6 +1,6 @@@ /* Command line parsing. Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, - 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. + 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of GNU Wget. 
@@@ -202,10 -202,12 +202,12 @@@ static struct cmdline_option option_dat { "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 }, #endif { "input-file", 'i', OPT_VALUE, "input", -1 }, + { "iri", 0, OPT_BOOLEAN, "iri", -1 }, { "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 }, { "level", 'l', OPT_VALUE, "reclevel", -1 }, { "limit-rate", 0, OPT_VALUE, "limitrate", -1 }, { "load-cookies", 0, OPT_VALUE, "loadcookies", -1 }, + { "locale", 0, OPT_VALUE, "locale", -1 }, { "max-redirect", 0, OPT_VALUE, "maxredirect", -1 }, { "mirror", 'm', OPT_BOOLEAN, "mirror", -1 }, { "no", 'n', OPT__NO, NULL, required_argument }, @@@ -239,6 -241,7 +241,7 @@@ { "referer", 0, OPT_VALUE, "referer", -1 }, { "reject", 'R', OPT_VALUE, "reject", -1 }, { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 }, + { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1}, { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 }, { "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 }, { "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 }, @@@ -1077,6 -1080,27 +1080,27 @@@ for details.\n\n")) exit (1); } + #ifdef ENABLE_IRI + if (opt.enable_iri) + { + if (opt.locale && !check_encoding_name (opt.locale)) + opt.locale = NULL; + + if (!opt.locale) + opt.locale = find_locale (); + + if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote)) + opt.encoding_remote = NULL; + } + #else + if (opt.enable_iri || opt.locale || opt.encoding_remote) + { + /* sXXXav : be more specific... */ + printf(_("This version does not have support for IRIs\n")); + exit(1); + } + #endif + if (opt.ask_passwd) { opt.passwd = prompt_for_password (); @@@ -1124,7 -1148,7 +1148,7 @@@ { #ifdef WINDOWS FILE *result; - result = freopen (NULL, "wb", stdout); + result = freopen ("CONOUT$", "wb", stdout); if (result == NULL) { logputs (LOG_NOTQUIET, _("\ @@@ -1178,45 -1202,40 +1202,51 @@@ WARNING: Can't reopen standard output i for (t = url; *t; t++) { char *filename = NULL, *redirected_URL = NULL; - int dt; + int dt, url_err; - struct url *url_parsed = url_parse (*t, &url_err); ++ struct url *url_parsed = url_parse (*t, &url_err, NULL, false); - if ((opt.recursive || opt.page_requisites) - && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t))) + if (!url_parsed) { - int old_follow_ftp = opt.follow_ftp; - - /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ - if (url_scheme (*t) == SCHEME_FTP) - opt.follow_ftp = 1; - - status = retrieve_tree (*t, NULL); - - opt.follow_ftp = old_follow_ftp; + char *error = url_error (*t, url_err); + logprintf (LOG_NOTQUIET, "%s: %s.\n",*t, error); + xfree (error); + status = URLERROR; } else { - struct iri *i = iri_new (); - set_uri_encoding (i, opt.locale, true); - status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, - opt.recursive, i); - iri_free (i); - } + if ((opt.recursive || opt.page_requisites) + && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (url_parsed))) + { + int old_follow_ftp = opt.follow_ftp; - if (opt.delete_after && file_exists_p(filename)) - { - DEBUGP (("Removing file due to --delete-after in main():\n")); - logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename); - if (unlink (filename)) - logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno)); - } + /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ + if (url_scheme (*t) == SCHEME_FTP) + opt.follow_ftp = 1; + - status = retrieve_tree (url_parsed); ++ status = retrieve_tree (url_parsed, NULL); - xfree_null (redirected_URL); - xfree_null (filename); + opt.follow_ftp = 
old_follow_ftp; + } + else - status = retrieve_url (url_parsed, *t, &filename, &redirected_URL, NULL, &dt, opt.recursive); ++ { ++ struct iri *i = iri_new (); ++ set_uri_encoding (i, opt.locale, true); ++ status = retrieve_url (url_parsed, *t, &filename, &redirected_URL, ++ NULL, &dt, opt.recursive, i); ++ iri_free (i); ++ } + + if (opt.delete_after && file_exists_p(filename)) + { + DEBUGP (("Removing file due to --delete-after in main():\n")); + logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename); + if (unlink (filename)) + logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno)); + } + xfree_null (redirected_URL); + xfree_null (filename); + url_free (url_parsed); + } } /* And then from the input file, if any. */ diff --combined src/recur.c index 2e067505,95581486..83a9b4ee --- a/src/recur.c +++ b/src/recur.c @@@ -51,7 -51,7 +51,7 @@@ as that of the covered work. * #include "html-url.h" #include "css-url.h" #include "spider.h" - + /* Functions for maintaining the URL queue. */ struct queue_element { @@@ -60,6 -60,7 +60,7 @@@ int depth; /* the depth */ bool html_allowed; /* whether the document is allowed to be treated as HTML. */ + struct iri *iri; /* sXXXav */ bool css_allowed; /* whether the document is allowed to be treated as CSS. */ struct queue_element *next; /* next element in queue */ @@@ -93,11 -94,12 +94,12 @@@ url_queue_delete (struct url_queue *que into it. */ static void - url_enqueue (struct url_queue *queue, + url_enqueue (struct url_queue *queue, struct iri *i, const char *url, const char *referer, int depth, bool html_allowed, bool css_allowed) { struct queue_element *qel = xnew (struct queue_element); + qel->iri = i; qel->url = url; qel->referer = referer; qel->depth = depth; @@@ -112,6 -114,10 +114,10 @@@ DEBUGP (("Enqueuing %s at depth %d\n", url, depth)); DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); + if (i) + DEBUGP (("[IRI Enqueuing %s with %s\n", quote_n (0, url), + i->uri_encoding ? quote_n (1, i->uri_encoding) : "None")); + if (queue->tail) queue->tail->next = qel; queue->tail = qel; @@@ -124,7 -130,7 +130,7 @@@ succeeded, or false if the queue is empty. */ static bool - url_dequeue (struct url_queue *queue, + url_dequeue (struct url_queue *queue, struct iri **i, const char **url, const char **referer, int *depth, bool *html_allowed, bool *css_allowed) { @@@ -137,6 -143,7 +143,7 @@@ if (!queue->head) queue->tail = NULL; + *i = qel->iri; *url = qel->url; *referer = qel->referer; *depth = qel->depth; @@@ -153,9 -160,9 +160,9 @@@ } static bool download_child_p (const struct urlpos *, struct url *, int, - struct url *, struct hash_table *); + struct url *, struct hash_table *, struct iri *); -static bool descend_redirect_p (const char *, const char *, int, +static bool descend_redirect_p (const char *, struct url *, int, - struct url *, struct hash_table *); + struct url *, struct hash_table *, struct iri *); /* Retrieve a part of the web beginning with START_URL. This used to @@@ -180,7 -187,7 +187,7 @@@ options, add it to the queue. */ uerr_t - retrieve_tree (struct url *start_url_parsed) -retrieve_tree (const char *start_url, struct iri *pi) ++retrieve_tree (struct url *start_url_parsed, struct iri *pi) { uerr_t status = RETROK; @@@ -191,12 -198,38 +198,28 @@@ the queue, but haven't been downloaded yet. */ struct hash_table *blacklist; + int up_error_code; - struct url *start_url_parsed; + struct iri *i = iri_new (); + + #define COPYSTR(x) (x) ? 
xstrdup(x) : NULL; + /* Duplicate pi struct if not NULL */ + if (pi) + { + i->uri_encoding = COPYSTR (pi->uri_encoding); + i->content_encoding = COPYSTR (pi->content_encoding); + i->utf8_encode = pi->utf8_encode; + } + else + set_uri_encoding (i, opt.locale, true); + #undef COPYSTR + - start_url_parsed = url_parse (start_url, &up_error_code, i, true); - if (!start_url_parsed) - { - char *error = url_error (start_url, up_error_code); - logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, error); - xfree (error); - return URLERROR; - } - queue = url_queue_new (); blacklist = make_string_hash_table (0); /* Enqueue the starting URL. Use start_url_parsed->url rather than just URL so we enqueue the canonical form of the URL. */ - url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false); + url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true, + false); string_set_add (blacklist, start_url_parsed->url); while (1) @@@ -215,7 -248,7 +238,7 @@@ /* Get the next URL from the queue... */ - if (!url_dequeue (queue, + if (!url_dequeue (queue, (struct iri **) &i, (const char **)&url, (const char **)&referer, &depth, &html_allowed, &css_allowed)) break; @@@ -253,22 -286,11 +276,12 @@@ } else { - int dt = 0; + int dt = 0, url_err; char *redirected = NULL; - struct url *url_parsed = url_parse (url, &url_err); ++ struct url *url_parsed = url_parse (url, &url_err, i, false); - if (!url_parsed) - { - char *error = url_error (url, url_err); - logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error); - xfree (error); - status = URLERROR; - } - else - { - status = retrieve_url (url_parsed, url, &file, &redirected, - referer, &dt, false); - } - status = retrieve_url (url, &file, &redirected, referer, &dt, - false, i); ++ status = retrieve_url (url_parsed, url, &file, &redirected, referer, ++ &dt, false, i); if (html_allowed && file && status == RETROK && (dt & RETROKF) && (dt & TEXTHTML)) @@@ -295,8 -317,8 +308,8 @@@ want to follow it. */ if (descend) { - if (!descend_redirect_p (redirected, url, depth, + if (!descend_redirect_p (redirected, url_parsed, depth, - start_url_parsed, blacklist)) + start_url_parsed, blacklist, i)) descend = false; else /* Make sure that the old pre-redirect form gets @@@ -307,7 -329,6 +320,7 @@@ xfree (url); url = redirected; } + url_free(url_parsed); } if (opt.spider) @@@ -349,7 -370,7 +362,7 @@@ bool meta_disallow_follow = false; struct urlpos *children = is_css ? 
get_urls_css_file (file, url) : - get_urls_html (file, url, &meta_disallow_follow); + get_urls_html (file, url, &meta_disallow_follow, i); if (opt.use_robots && meta_disallow_follow) { @@@ -360,7 -381,8 +373,8 @@@ if (children) { struct urlpos *child = children; - struct url *url_parsed = url_parsed = url_parse (url, NULL); + struct url *url_parsed = url_parse (url, NULL, i, false); + struct iri *ci; char *referer_url = url; bool strip_auth = (url_parsed != NULL && url_parsed->user != NULL); @@@ -377,9 -399,11 +391,11 @@@ if (dash_p_leaf_HTML && !child->link_inline_p) continue; if (download_child_p (child, url_parsed, depth, start_url_parsed, - blacklist)) + blacklist, i)) { - url_enqueue (queue, xstrdup (child->url->url), + ci = iri_new (); + set_uri_encoding (ci, i->content_encoding, false); + url_enqueue (queue, ci, xstrdup (child->url->url), xstrdup (referer_url), depth + 1, child->link_expect_html, child->link_expect_css); @@@ -397,18 -421,18 +413,18 @@@ } } - if (file - && (opt.delete_after + if (file + && (opt.delete_after || opt.spider /* opt.recursive is implicitely true */ || !acceptable (file))) { /* Either --delete-after was specified, or we loaded this - (otherwise unneeded because of --spider or rejected by -R) - HTML file just to harvest its hyperlinks -- in either case, + (otherwise unneeded because of --spider or rejected by -R) + HTML file just to harvest its hyperlinks -- in either case, delete the local file. */ DEBUGP (("Removing file due to %s in recursive_retrieve():\n", opt.delete_after ? "--delete-after" : - (opt.spider ? "--spider" : + (opt.spider ? "--spider" : "recursive rejection criteria"))); logprintf (LOG_VERBOSE, (opt.delete_after || opt.spider @@@ -424,6 -448,7 +440,7 @@@ xfree (url); xfree_null (referer); xfree_null (file); + iri_free (i); } /* If anything is left of the queue due to a premature exit, free it @@@ -432,15 -457,19 +449,17 @@@ char *d1, *d2; int d3; bool d4, d5; - while (url_dequeue (queue, + struct iri *d6; + while (url_dequeue (queue, (struct iri **)&d6, (const char **)&d1, (const char **)&d2, &d3, &d4, &d5)) { + iri_free (d6); xfree (d1); xfree_null (d2); } } url_queue_delete (queue); - if (start_url_parsed) - url_free (start_url_parsed); string_set_free (blacklist); if (opt.quota && total_downloaded_bytes > opt.quota) @@@ -461,7 -490,8 +480,8 @@@ static bool download_child_p (const struct urlpos *upos, struct url *parent, int depth, - struct url *start_url_parsed, struct hash_table *blacklist) + struct url *start_url_parsed, struct hash_table *blacklist, + struct iri *iri) { struct url *u = upos->url; const char *url = u->url; @@@ -471,7 -501,7 +491,7 @@@ if (string_set_contains (blacklist, url)) { - if (opt.spider) + if (opt.spider) { char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD); DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url))); @@@ -602,7 -632,7 +622,7 @@@ if (!specs) { char *rfile; - if (res_retrieve_file (url, &rfile)) + if (res_retrieve_file (url, &rfile, iri)) { specs = res_parse_from_file (rfile); @@@ -656,24 -686,27 +676,25 @@@ it is merely a simple-minded wrapper around download_child_p. 
*/ static bool -descend_redirect_p (const char *redirected, const char *original, int depth, +descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth, - struct url *start_url_parsed, struct hash_table *blacklist) + struct url *start_url_parsed, struct hash_table *blacklist, + struct iri *iri) { - struct url *orig_parsed, *new_parsed; + struct url *new_parsed; struct urlpos *upos; bool success; - orig_parsed = url_parse (original, NULL, NULL, false); assert (orig_parsed != NULL); - new_parsed = url_parse (redirected, NULL); + new_parsed = url_parse (redirected, NULL, NULL, false); assert (new_parsed != NULL); upos = xnew0 (struct urlpos); upos->url = new_parsed; success = download_child_p (upos, orig_parsed, depth, - start_url_parsed, blacklist); + start_url_parsed, blacklist, iri); - url_free (orig_parsed); url_free (new_parsed); xfree (upos); diff --combined src/recur.h index 7eeb5642,515a382b..76c0ef5f --- a/src/recur.h +++ b/src/recur.h @@@ -1,6 -1,6 +1,6 @@@ /* Declarations for recur.c. Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, - 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. + 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of GNU Wget. @@@ -31,8 -31,6 +31,8 @@@ as that of the covered work. * #ifndef RECUR_H #define RECUR_H +#include "url.h" + /* For most options, 0 means no limits, but with -p in the picture, that causes a problem on the maximum recursion depth variable. To retain backwards compatibility we allow users to consider "0" to be @@@ -44,6 -42,6 +44,6 @@@ struct urlpos; void recursive_cleanup (void); - uerr_t retrieve_tree (struct url *); -uerr_t retrieve_tree (const char *, struct iri *); ++uerr_t retrieve_tree (struct url *, struct iri *); #endif /* RECUR_H */ diff --combined src/res.c index 20ffe1c8,0320d034..4b0ff82b --- a/src/res.c +++ b/src/res.c @@@ -532,37 -532,28 +532,44 @@@ res_get_specs (const char *host, int po Return true if robots were retrieved OK, false otherwise. 
*/ bool - res_retrieve_file (const char *url, char **file) + res_retrieve_file (const char *url, char **file, struct iri *iri) { + struct iri *i = iri_new (); uerr_t err; char *robots_url = uri_merge (url, RES_SPECS_LOCATION); int saved_ts_val = opt.timestamping; - int saved_sp_val = opt.spider; + int saved_sp_val = opt.spider, url_err; + struct url * url_parsed; + /* Copy server URI encoding for a possible IDNA transformation, no need to + encode the full URI in UTF-8 because "robots.txt" is plain ASCII */ + set_uri_encoding (i, iri->uri_encoding, false); + i->utf8_encode = false; + logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n")); *file = NULL; opt.timestamping = false; opt.spider = false; - err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i); + - url_parsed = url_parse (robots_url, &url_err); ++ url_parsed = url_parse (robots_url, &url_err, iri, true); + if (!url_parsed) + { + char *error = url_error (robots_url, url_err); + logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error); + xfree (error); + err = URLERROR; + } + else + { + err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL, - false); ++ false, i); + url_free(url_parsed); + } + opt.timestamping = saved_ts_val; - opt.spider = saved_sp_val; + opt.spider = saved_sp_val; xfree (robots_url); + iri_free (i); if (err != RETROK && *file != NULL) { diff --combined src/retr.c index ffa84c38,1d9d7478..0fd936d0 --- a/src/retr.c +++ b/src/retr.c @@@ -226,8 -226,7 +226,8 @@@ fd_read_body (int fd, FILE *out, wgint /* If we're skipping STARTPOS bytes, pass 0 as the INITIAL argument to progress_create because the indicator doesn't (yet) know about "skipping" data. */ - progress = progress_create (skip ? 0 : startpos, startpos + toread); + wgint start = skip ? 0 : startpos; + progress = progress_create (start, start + toread); progress_interactive = progress_interactive_p (progress); } @@@ -597,15 -596,15 +597,16 @@@ static char *getproxy (struct url *) multiple points. */ uerr_t -retrieve_url (const char *origurl, char **file, char **newloc, - const char *refurl, int *dt, bool recursive, struct iri *iri) +retrieve_url (struct url * orig_parsed, const char *origurl, char **file, - char **newloc, const char *refurl, int *dt, bool recursive) ++ char **newloc, const char *refurl, int *dt, bool recursive, ++ struct iri *iri) { uerr_t result; char *url; bool location_changed; int dummy; char *mynewloc, *proxy; - struct url *u, *proxy_url; + struct url *u = orig_parsed, *proxy_url; int up_error_code; /* url parse error code */ char *local_file; int redirection_count = 0; @@@ -626,6 -625,21 +627,11 @@@ if (file) *file = NULL; + second_try: - u = url_parse (url, &up_error_code, iri, true); - if (!u) - { - char *error = url_error (url, up_error_code); - logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error); - xfree (url); - xfree (error); - return URLERROR; - } - + DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote_n (0, url), + iri->uri_encoding ? quote_n (1, iri->uri_encoding) : "None", + iri->utf8_encode)); + if (!refurl) refurl = opt.referer; @@@ -639,8 -653,12 +645,12 @@@ proxy = getproxy (u); if (proxy) { + struct iri *pi = iri_new (); + set_uri_encoding (pi, opt.locale, true); + pi->utf8_encode = false; + /* Parse the proxy URL. 
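   Note that the proxy URL itself is parsed below with a NULL iri and
   percent_encode set to true, so it is only escape-normalized; no UTF-8
   re-encoding or IDNA conversion is attempted on it.  The pi allocated
   just above records the locale as the URI encoding with UTF-8 encoding
   switched off, presumably because proxy settings come from the
   environment or wgetrc rather than from a remote document.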
*/ - proxy_url = url_parse (proxy, &up_error_code); + proxy_url = url_parse (proxy, &up_error_code, NULL, true); if (!proxy_url) { char *error = url_error (proxy, up_error_code); @@@ -667,7 -685,7 +677,7 @@@ #endif || (proxy_url && proxy_url->scheme == SCHEME_HTTP)) { - result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url); + result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri); } else if (u->scheme == SCHEME_FTP) { @@@ -717,17 -735,20 +727,23 @@@ xfree (mynewloc); mynewloc = construced_newloc; + /* Reset UTF-8 encoding state, keep the URI encoding and reset + the content encoding. */ + iri->utf8_encode = opt.enable_iri; + set_content_encoding (iri, NULL); + xfree_null (iri->orig_url); + /* Now, see if this new location makes sense. */ - newloc_parsed = url_parse (mynewloc, &up_error_code); + newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true); if (!newloc_parsed) { char *error = url_error (mynewloc, up_error_code); logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc), error); - url_free (u); + if (orig_parsed != u) + { + url_free (u); + } xfree (url); xfree (mynewloc); xfree (error); @@@ -747,10 -768,7 +763,10 @@@ logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"), opt.max_redirect); url_free (newloc_parsed); - url_free (u); + if (orig_parsed != u) + { + url_free (u); + } xfree (url); xfree (mynewloc); RESTORE_POST_DATA; @@@ -759,10 -777,7 +775,10 @@@ xfree (url); url = mynewloc; - url_free (u); + if (orig_parsed != u) + { + url_free (u); + } u = newloc_parsed; /* If we're being redirected from POST, we don't want to POST @@@ -776,8 -791,21 +792,21 @@@ goto redirected; } - if (local_file) + /* Try to not encode in UTF-8 if fetching failed */ + if (!(*dt & RETROKF) && iri->utf8_encode) + { + iri->utf8_encode = false; + DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url))); + goto second_try; + } + + if (local_file && *dt & RETROKF) { + register_download (u->url, local_file); + if (redirection_count && 0 != strcmp (origurl, u->url)) + register_redirection (origurl, u->url); + if (*dt & TEXTHTML) + register_html (u->url, local_file); if (*dt & RETROKF) { register_download (u->url, local_file); @@@ -795,10 -823,7 +824,10 @@@ else xfree_null (local_file); - url_free (u); + if (orig_parsed != u) + { + url_free (u); + } if (redirection_count) { @@@ -830,41 -855,41 +859,51 @@@ retrieve_from_file (const char *file, b { uerr_t status; struct urlpos *url_list, *cur_url; + struct iri *iri = iri_new(); char *input_file = NULL; const char *url = file; status = RETROK; /* Suppose everything is OK. */ *count = 0; /* Reset the URL count. 
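   The iri set up just below is seeded from the local locale for both the
   URI encoding and the content encoding, on the assumption (spelled out
   in the comment that follows) that the input file, and any links it
   contains, are written in the user's locale.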
*/ - + + /* sXXXav : Assume filename and links in the file are in the locale */ + set_uri_encoding (iri, opt.locale, true); + set_content_encoding (iri, opt.locale); + if (url_has_scheme (url)) { - int dt; + int dt,url_err; uerr_t status; - struct url * url_parsed = url_parse(url, &url_err); ++ struct url * url_parsed = url_parse(url, &url_err, NULL, true); + + if (!url_parsed) + { + char *error = url_error (url, url_err); + logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error); + xfree (error); + return URLERROR; + } if (!opt.base_href) opt.base_href = xstrdup (url); - status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt, false); - status = retrieve_url (url, &input_file, NULL, NULL, &dt, false, iri); ++ status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt, ++ false, iri); if (status != RETROK) return status; if (dt & TEXTHTML) html = true; + + /* If we have a found a content encoding, use it */ + if (iri->content_encoding) + set_uri_encoding (iri, iri->content_encoding, false); } else input_file = (char *) file; - url_list = (html ? get_urls_html (input_file, NULL, NULL) + url_list = (html ? get_urls_html (input_file, NULL, NULL, iri) : get_urls_file (input_file)); for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count) @@@ -880,24 -905,28 +919,28 @@@ status = QUOTEXC; break; } + + /* Reset UTF-8 encode status */ + iri->utf8_encode = opt.enable_iri; + xfree_null (iri->orig_url); + iri->orig_url = NULL; + if ((opt.recursive || opt.page_requisites) && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url))) { int old_follow_ftp = opt.follow_ftp; /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ - if (cur_url->url->scheme == SCHEME_FTP) + if (cur_url->url->scheme == SCHEME_FTP) opt.follow_ftp = 1; - - status = retrieve_tree (cur_url->url); + - status = retrieve_tree (cur_url->url->url, iri); ++ status = retrieve_tree (cur_url->url, iri); opt.follow_ftp = old_follow_ftp; } else - { - status = retrieve_url (cur_url->url, cur_url->url->url, &filename, - &new_file, NULL, &dt, opt.recursive); - } - status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, - &dt, opt.recursive, iri); ++ status = retrieve_url (cur_url->url, cur_url->url->url, &filename, ++ &new_file, NULL, &dt, opt.recursive, iri); if (filename && opt.delete_after && file_exists_p (filename)) { @@@ -916,6 -945,8 +959,8 @@@ Removing file due to --delete-after in /* Free the linked list of URL-s. */ free_urlpos (url_list); + iri_free (iri); + return status; } @@@ -1065,12 -1096,18 +1110,12 @@@ getproxy (struct url *u /* Returns true if URL would be downloaded through a proxy. */ bool -url_uses_proxy (const char *url) +url_uses_proxy (struct url * u) { bool ret; - struct url *u; - struct iri *i = iri_new(); - /* url was given in the command line, so use locale as encoding */ - set_uri_encoding (i, opt.locale, true); - u= url_parse (url, NULL, i, false); if (!u) return false; ret = getproxy (u) != NULL; - url_free (u); return ret; } @@@ -1083,16 -1120,3 +1128,16 @@@ no_proxy_match (const char *host, cons else return sufmatch (no_proxy, host); } + +/* Set the file parameter to point to the local file string. */ +void +set_local_file (const char **file, const char *default_file) +{ + if (opt.output_document) + { + if (output_stream_regular) + *file = opt.output_document; + } + else + *file = default_file; +} diff --combined src/retr.h index 72be93b7,bb2e66d3..8854b684 --- a/src/retr.h +++ b/src/retr.h @@@ -31,8 -31,6 +31,8 @@@ as that of the covered work. 
* #ifndef RETR_H #define RETR_H +#include "url.h" + /* These global vars should be made static to retr.c and exported via functions! */ extern SUM_SIZE_INT total_downloaded_bytes; @@@ -53,7 -51,8 +53,8 @@@ typedef const char *(*hunk_terminator_t char *fd_read_hunk (int, hunk_terminator_t, long, long); char *fd_read_line (int); - uerr_t retrieve_url (struct url *, const char *, char **, char **, const char *, int *, bool); -uerr_t retrieve_url (const char *, char **, char **, const char *, int *, - bool, struct iri *); ++uerr_t retrieve_url (struct url *, const char *, char **, char **, ++ const char *, int *, bool, struct iri *); uerr_t retrieve_from_file (const char *, bool, int *); const char *retr_rate (wgint, double); @@@ -64,6 -63,6 +65,6 @@@ void sleep_between_retrievals (int) void rotate_backups (const char *); -bool url_uses_proxy (const char *); +bool url_uses_proxy (struct url *); #endif /* RETR_H */ diff --combined src/url.c index d416fcf7,86d099a7..4c22a9fc --- a/src/url.c +++ b/src/url.c @@@ -649,7 -649,7 +649,7 @@@ static const char *parse_errors[] = error, and if ERROR is not NULL, also set *ERROR to the appropriate error code. */ struct url * - url_parse (const char *url, int *error) + url_parse (const char *url, int *error, struct iri *iri, bool percent_encode) { struct url *u; const char *p; @@@ -668,7 -668,7 +668,8 @@@ int port; char *user = NULL, *passwd = NULL; - char *url_encoded = NULL; - char *url_encoded = NULL, *new_url = NULL; ++ const char *url_encoded = NULL; ++ char *new_url = NULL; int error_code; @@@ -679,9 -679,26 +680,26 @@@ goto error; } - url_encoded = reencode_escapes (url); + if (iri && iri->utf8_encode) + { + iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url); + if (!iri->utf8_encode) + new_url = NULL; + else + iri->orig_url = xstrdup (url); + } + + /* XXX XXX Could that change introduce (security) bugs ??? XXX XXX*/ + if (percent_encode) + url_encoded = reencode_escapes (new_url ? new_url : url); + else + url_encoded = new_url ? new_url : url; + p = url_encoded; + if (new_url && url_encoded != new_url) + xfree (new_url); + p += strlen (supported_schemes[scheme].leading_string); uname_b = p; p = url_skip_credentials (p); @@@ -851,6 -868,18 +869,18 @@@ { url_unescape (u->host); host_modified = true; + + /* Apply IDNA regardless of iri->utf8_encode status */ + if (opt.enable_iri && iri) + { + char *new = idn_encode (iri, u->host); + if (new) + { + xfree (u->host); + u->host = new; + host_modified = true; + } + } } if (params_b) @@@ -860,7 -889,7 +890,7 @@@ if (fragment_b) u->fragment = strdupdelim (fragment_b, fragment_e); - if (path_modified || u->fragment || host_modified || path_b == path_e) + if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e) { /* If we suspect that a transformation has rendered what url_string might return different from URL_ENCODED, rebuild @@@ -875,7 -904,7 +905,7 @@@ if (url_encoded == url) u->url = xstrdup (url); else -- u->url = url_encoded; ++ u->url = (char *) url_encoded; } return u; @@@ -883,7 -912,7 +913,7 @@@ error: /* Cleanup in case of error: */ if (url_encoded && url_encoded != url) -- xfree (url_encoded); ++ xfree ((char *) url_encoded); /* Transmit the error code to the caller, if the caller wants to know. 
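   (Interface summary, as far as it can be read off this change: callers
   that already have a struct iri and want the URL re-encoded pass it
   together with percent_encode = true, as in the res.c call
   url_parse (robots_url, &url_err, iri, true); a NULL iri skips the
   UTF-8 and IDNA handling above, and percent_encode = false leaves the
   string's existing escapes untouched.)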
*/ @@@ -1978,12 -2007,12 +2008,12 @@@ schemes_are_similar_p (enum url_scheme static int getchar_from_escaped_string (const char *str, char *c) - { + { const char *p = str; assert (str && *str); assert (c); - + if (p[0] == '%') { if (!c_isxdigit(p[1]) || !c_isxdigit(p[2])) @@@ -2033,7 -2062,7 +2063,7 @@@ are_urls_equal (const char *u1, const c p += pp; q += qq; } - + return (*p == 0 && *q == 0 ? true : false); } @@@ -2142,7 -2171,7 +2172,7 @@@ test_append_uri_pathel( } test_array[] = { { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" }, }; - + for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i) { struct growable dest; diff --combined tests/ChangeLog index 522bd202,d9ba6531..3dfc60a3 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@@ -1,27 -1,19 +1,43 @@@ + 2008-12-04 Micah Cowan (not copyrightable) + + * run-px, Test-idn-robots.px: Added test for robots-file + downloads. + + * Test-idn-cmd.px, Test-idn-meta.px, Test-idn-headers.px: + Fix test names. + + 2008-11-26 Micah Cowan (not copyrightable) + + * Test-ftp-iri-disabled.px, Test-ftp-iri-fallback.px, + Test-ftp-iri.px, Test-idn-cmd.px, Test-idn-headers.px, + Test-idn-meta.px, Test-iri-disabled.px, + Test-iri-forced-remote.px, Test-iri-list.px, Test-iri.px: More + module-scope warnings. + +2009-06-14 Micah Cowan + + * Makefile.am (EXTRA_DIST): Include all the tests, run-px, and + certs/, to make distcheck happy. + +2009-06-11 Benjamin Wolsey + + * Test-proxied-https-auth.px: Take an optional argument for the + top source directory, so we can find the cert and key. + + * run-px: Provide the top source directory as an argument, so + scripts can find their way around. + +2009-04-11 Steven Schubiger + + * run-px: Skip testing with real rc files by setting + SYSTEM_WGETRC and WGETRC to /dev/null. + +2009-02-25 Benjamin Wolsey + + * Makefile.am (run-px-tests): Ensure run-px is run from srcdir. + + * run-px: Include modules from srcdir. + 2008-11-25 Steven Schubiger * WgetTest.pm.in: Remove the magic interpreter line; @@@ -95,6 -87,51 +111,51 @@@ * run-px: Use strict (thanks Steven Schubiger!). + 2008-09-09 Micah Cowan + + * Test-idn-cmd.px: Added. + + * run-px: Added Test-idn-cmd.px. + + 2008-08-28 Micah Cowan + + * HTTPServer.pm (run): Allow distinguishing between hostnames, + when used as a proxy. + + * Test-idn-headers.px, Test-idn-meta.px: Added. + + * run-px: Added Test-idn-headers.px, Test-idn-meta.px. + + * Test-proxy-auth-basic.px: Use the full URL, rather than just the + path (made necessary by the accompanying change to HTTPServer.pm). + + 2008-08-14 Xavier Saint + + * Test-iri-list.px : Fetch files from a remote list. + + 2008-08-03 Xavier Saint + + * Test-iri.px : HTTP recursive fetch for testing IRI support and + fallback. + + * Test-iri-disabled.px : Same file structure as Test-iri.px but with + IRI support disabled + + * Test-iri-forced-remote.px : There's a difference between ISO-8859-1 + and ISO-8859-15 for character 0xA4 (respectively currency sign and + euro sign). So with a forced ISO-8859-1 remote encoding, wget should + see 0xA4 as a currency sign and transcode it correctly in UTF-8 instead + of using the ISO-8859-15 given by the server. + + * Test-ftp-iri.px : Give a file to fetch via FTP in a specific locale + and expect wget to fetch the file UTF-8 encoded. + + * Test-ftp-iri-fallback.px : Same as above but wget should fallback on + locale encoding to fetch the file. + + * Test-ftp-iri.px : Same as Test-ftp-iri.px but with IRI support + disabled. 
The UTF-8 encoded file should not be retrieved. + 2008-06-22 Micah Cowan * Test-proxied-https-auth.px: Shift exit code so it falls in the diff --combined tests/run-px index 33e4c600,01d84995..3b5449bd --- a/tests/run-px +++ b/tests/run-px @@@ -25,9 -25,20 +25,20 @@@ my @tests = 'Test-E-k-K.px', 'Test-E-k.px', 'Test-ftp.px', + 'Test-ftp-iri.px', + 'Test-ftp-iri-fallback.px', + 'Test-ftp-iri-disabled.px', 'Test-HTTP-Content-Disposition-1.px', 'Test-HTTP-Content-Disposition-2.px', 'Test-HTTP-Content-Disposition.px', + 'Test-idn-headers.px', + 'Test-idn-meta.px', + 'Test-idn-cmd.px', + 'Test-idn-robots.px', + 'Test-iri.px', + 'Test-iri-disabled.px', + 'Test-iri-forced-remote.px', + 'Test-iri-list.px', 'Test-N-current.px', 'Test-N-smaller.px', 'Test-N-no-info.px', @@@ -55,22 -66,14 +66,22 @@@ 'Test--spider-r.px', ); +foreach my $var (qw(SYSTEM_WGETRC WGETRC)) { + $ENV{$var} = '/dev/null'; +} + my @tested; foreach my $test (@tests) { print "Running $test\n\n"; - system("$^X $top_srcdir/tests/$test"); + system("$^X -I$top_srcdir/tests $top_srcdir/tests/$test $top_srcdir"); push @tested, { name => $test, result => $? }; } +foreach my $var (qw(SYSTEM_WGETRC WGETRC)) { + delete $ENV{$var}; +} + print "\n"; foreach my $test (@tested) { ($test->{result} == 0)