From: Micah Cowan Date: Wed, 26 Nov 2008 15:08:38 +0000 (-0800) Subject: Merge with mainline. X-Git-Tag: v1.13~338^2~2^2^2~1 X-Git-Url: http://sjero.net/git/?p=wget;a=commitdiff_plain;h=289ff1c86acbd60e09cb15d22df62b8e19942c3e;hp=-c Merge with mainline. --- 289ff1c86acbd60e09cb15d22df62b8e19942c3e diff --combined ChangeLog index c19c374f,ac384a49..a891c52e --- a/ChangeLog +++ b/ChangeLog @@@ -1,3 -1,18 +1,18 @@@ + 2008-11-10 Micah Cowan + + * MAILING-LIST: Mention Gmane, introduce subsections. + + 2008-11-05 Micah Cowan + + * MAILING-LIST: Mention moderation for unsubscribed posts, and + archive location. + + 2008-10-31 Micah Cowan + + * MAILING-LIST: Update information. + + * NEWS: Add mention of mailing list move. + 2008-08-01 Joao Ferreira * NEWS: Added option --default-page to support alternative @@@ -9,14 -24,6 +24,14 @@@ * AUTHORS: Added Steven Schubiger. +2008-06-26 Xavier Saint + + * configure.ac : IRIs support required libiconv, check it. + +2008-06-14 Xavier Saint + + * configure.ac: Add support for IRIs + 2008-05-29 Micah Cowan * po/*.po: Updated from TP (the 1.11.3 set). diff --combined doc/ChangeLog index 94a06283,a879fcb8..02eb44e5 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@@ -1,12 -1,39 +1,48 @@@ + 2008-11-10 Micah Cowan + + * Makefile.am (EXTRA_DIST): Removed no-longer-present + README.maint (shouldn't have been there in the first place). + + * wget.texi (Mailing Lists): Added information aboug Gmane portal, + added subsection headings. + + Update node pointers. + + 2008-11-05 Micah Cowan + + * wget.texi: Move --no-http-keep-alive from FTP Options to HTTP + Options. + (Mailing List): Mention moderation for unsubscribed posts, and + archive location. + + 2008-11-04 Micah Cowan + + * wget.texi, fdl.texi: Updated to FDL version 1.3. + + 2008-10-31 Micah Cowan + + * wget.texi (Mailing List): Update info to reflect change to + bug-wget@gnu.org. + + 2008-09-30 Steven Schubiger + + * wget.texi (Wgetrc Commands): Add default_page, save_headers, + spider and user_agent to the list of recognized commands. + + 2008-09-10 Michael Kessler + + * wget.texi (Robot Exclusion): Fixed typo "downloads" -> + "download" + +2008-08-03 Xavier Saint + + * wget.texi : Add option descriptions for the three new + options --iri, --locale and --remote-encoding related to + IRI support. + + * sample.wgetrc : Add commented lines for the three new + command iri, locale and encoding related to IRI support. + 2008-08-03 Micah Cowan * wget.texi: Don't set UPDATED; already set by version.texi. diff --combined doc/wget.texi index 54e2eb9d,8af74d94..657ec3cf --- a/doc/wget.texi +++ b/doc/wget.texi @@@ -82,7 -82,7 +82,7 @@@ Info entry for @file{wget} @contents @ifnottex - @node Top + @node Top, Overview, (dir), (dir) @top Wget @value{VERSION} @insertcopying @@@ -102,7 -102,7 +102,7 @@@ * Concept Index:: Topics covered by this manual. @end menu - @node Overview + @node Overview, Invoking, Top, Top @chapter Overview @cindex overview @cindex features @@@ -211,7 -211,7 +211,7 @@@ Public License, as published by the Fre file @file{COPYING} that came with GNU Wget, for details). 
@end itemize - @node Invoking + @node Invoking, Recursive Download, Overview, Top @chapter Invoking @cindex invoking @cindex command line @@@ -248,7 -248,7 +248,7 @@@ the command line * Recursive Accept/Reject Options:: @end menu - @node URL Format + @node URL Format, Option Syntax, Invoking, Invoking @section URL Format @cindex URL @cindex URL syntax @@@ -326,7 -326,7 +326,7 @@@ with your favorite browser, like @code{ @c man begin OPTIONS - @node Option Syntax + @node Option Syntax, Basic Startup Options, URL Format, Invoking @section Option Syntax @cindex option syntax @cindex syntax of options @@@ -401,7 -401,7 +401,7 @@@ the default. For instance, using @code using @samp{--no-follow-ftp} is the only way to restore the factory default from the command line. - @node Basic Startup Options + @node Basic Startup Options, Logging and Input File Options, Option Syntax, Invoking @section Basic Startup Options @table @samp @@@ -429,7 -429,7 +429,7 @@@ instances of @samp{-e} @end table - @node Logging and Input File Options + @node Logging and Input File Options, Download Options, Basic Startup Options, Invoking @section Logging and Input File Options @table @samp @@@ -517,7 -517,7 +517,7 @@@ Prepends @var{URL} to relative links re the @samp{-i} option. @end table - @node Download Options + @node Download Options, Directory Options, Logging and Input File Options, Invoking @section Download Options @table @samp @@@ -674,30 -674,6 +674,30 @@@ Another instance where you'll get a gar Note that @samp{-c} only works with @sc{ftp} servers and with @sc{http} servers that support the @code{Range} header. +@cindex iri support +@cindex idn support +@item --iri + +Turn on internationalized URI (IRI) support. Use @samp{--iri=no} to +turn it off. IRI support is activated by default. + +You can set the default state of IRI support using @code{iri} command in +@file{.wgetrc}. That setting may be overridden from the command line. + +@cindex local encoding +@cindex locale +@item --locale=@var{encoding} + +Force Wget to use @var{encoding} as the default system encoding. That affects +how Wget converts URLs specified as arguments from locale to @sc{utf-8} for +IRI support. + +Wget use the function @code{nl_langinfo()} and then the @code{CHARSET} +environment variable to get the locale. If it fails, @sc{ascii} is used. + +You can set the default locale using the @code{locale} command in +@file{.wgetrc}. That setting may be overridden from the command line. + @cindex progress indicator @cindex dot style @item --progress=@var{type} @@@ -729,21 -705,6 +729,21 @@@ command line. The exception is that, w ``dot'' progress will be favored over ``bar''. To force the bar output, use @samp{--progress=bar:force}. +@cindex remote encoding +@item --remote-encoding=@var{encoding} + +Force Wget to use encoding as the default remote server encoding. That +affects how Wget converts URIs found in files from remote encoding to +@sc{utf-8} during a recursive fetch. This options is only useful for +IRI support, for the interpretation of non-@sc{ascii} characters. + +For HTTP, remote encoding can be found in HTTP @code{Content-Type} +header and in HTML @code{Content-Type http-equiv} meta tag. + +You can set the default encoding using the @code{remoteencoding} +command in @file{.wgetrc}. That setting may be overridden from the +command line. + @item -N @itemx --timestamping Turn on time-stamping. @xref{Time-Stamping}, for details. 
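The three IRI-related options documented in the hunk above can be combined in a single invocation. A minimal illustration, assuming a hypothetical URL and encoding values (none of these values come from the patch itself):

    wget --iri --locale=ISO-8859-15 --remote-encoding=UTF-8 http://example.com/index.html

The corresponding startup-file defaults use the iri, locale and remoteencoding commands registered in the src/init.c hunks further down, e.g. in .wgetrc:

    iri = on
    locale = ISO-8859-15
    remoteencoding = UTF-8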
@@@ -1038,7 -999,7 +1038,7 @@@ Prompt for a password for each connecti when @samp{--password} is being used, because they are mutually exclusive. @end table - @node Directory Options + @node Directory Options, HTTP Options, Download Options, Invoking @section Directory Options @table @samp @@@ -1110,7 -1071,7 +1110,7 @@@ i.e. the top of the retrieval tree. Th current directory). @end table - @node HTTP Options + @node HTTP Options, HTTPS (SSL/TLS) Options, Directory Options, Invoking @section HTTP Options @table @samp @@@ -1170,6 -1131,19 +1170,19 @@@ For more information about security iss Considerations}. @end iftex + @cindex Keep-Alive, turning off + @cindex Persistent Connections, disabling + @item --no-http-keep-alive + Turn off the ``keep-alive'' feature for HTTP downloads. Normally, Wget + asks the server to keep the connection open so that, when you download + more than one document from the same server, they get transferred over + the same TCP connection. This saves time and at the same time reduces + the load on the server. + + This option is useful when, for some reason, persistent (keep-alive) + connections don't work for you, for example due to a server bug or due + to the inability of server-side scripts to cope with the connections. + @cindex proxy @cindex cache @item --no-cache @@@ -1444,7 -1418,7 +1457,7 @@@ form-based authentication @end table - @node HTTPS (SSL/TLS) Options + @node HTTPS (SSL/TLS) Options, FTP Options, HTTP Options, Invoking @section HTTPS (SSL/TLS) Options @cindex SSL @@@ -1569,7 -1543,7 +1582,7 @@@ not used), EGD is never contacted. EG systems that support @file{/dev/random}. @end table - @node FTP Options + @node FTP Options, Recursive Retrieval Options, HTTPS (SSL/TLS) Options, Invoking @section FTP Options @table @samp @@@ -1672,22 -1646,9 +1685,9 @@@ Note that when retrieving a file (not specified on the command-line, rather than because it was recursed to, this option has no effect. Symbolic links are always traversed in this case. - - @cindex Keep-Alive, turning off - @cindex Persistent Connections, disabling - @item --no-http-keep-alive - Turn off the ``keep-alive'' feature for HTTP downloads. Normally, Wget - asks the server to keep the connection open so that, when you download - more than one document from the same server, they get transferred over - the same TCP connection. This saves time and at the same time reduces - the load on the server. - - This option is useful when, for some reason, persistent (keep-alive) - connections don't work for you, for example due to a server bug or due - to the inability of server-side scripts to cope with the connections. @end table - @node Recursive Retrieval Options + @node Recursive Retrieval Options, Recursive Accept/Reject Options, FTP Options, Invoking @section Recursive Retrieval Options @table @samp @@@ -1892,7 -1853,7 +1892,7 @@@ If, for whatever reason, you want stric option to turn it on. @end table - @node Recursive Accept/Reject Options + @node Recursive Accept/Reject Options, , Recursive Retrieval Options, Invoking @section Recursive Accept/Reject Options @table @samp @@@ -1987,7 -1948,7 +1987,7 @@@ This is a useful option, since it guara @c man end - @node Recursive Download + @node Recursive Download, Following Links, Invoking, Top @chapter Recursive Download @cindex recursion @cindex retrieving @@@ -2055,7 -2016,7 +2055,7 @@@ about this Recursive retrieval should be used with care. Don't say you were not warned. 
- @node Following Links + @node Following Links, Time-Stamping, Recursive Download, Top @chapter Following Links @cindex links @cindex following links @@@ -2079,7 -2040,7 +2079,7 @@@ links it will follow * FTP Links:: Following FTP links. @end menu - @node Spanning Hosts + @node Spanning Hosts, Types of Files, Following Links, Following Links @section Spanning Hosts @cindex spanning hosts @cindex hosts, spanning @@@ -2136,7 -2097,7 +2136,7 @@@ wget -rH -Dfoo.edu --exclude-domains su @end table - @node Types of Files + @node Types of Files, Directory-Based Limits, Spanning Hosts, Following Links @section Types of Files @cindex types of files @@@ -2241,7 -2202,7 +2241,7 @@@ local filenames, and so @emph{do} contr This behavior, too, is considered less-than-desirable, and may change in a future version of Wget. - @node Directory-Based Limits + @node Directory-Based Limits, Relative Links, Types of Files, Following Links @section Directory-Based Limits @cindex directories @cindex directory limits @@@ -2325,7 -2286,7 +2325,7 @@@ directory, while in @samp{http://foo/ba meaningless, as its parent is @samp{/}). @end table - @node Relative Links + @node Relative Links, FTP Links, Directory-Based Limits, Following Links @section Relative Links @cindex relative links @@@ -2354,7 -2315,7 +2354,7 @@@ to ``just work'' without having to conv This option is probably not very useful and might be removed in a future release. - @node FTP Links + @node FTP Links, , Relative Links, Following Links @section Following FTP Links @cindex following ftp links @@@ -2374,7 -2335,7 +2374,7 @@@ effect on such downloads. On the othe Also note that followed links to @sc{ftp} directories will not be retrieved recursively further. - @node Time-Stamping + @node Time-Stamping, Startup File, Following Links, Top @chapter Time-Stamping @cindex time-stamping @cindex timestamping @@@ -2424,7 -2385,7 +2424,7 @@@ say * FTP Time-Stamping Internals:: @end menu - @node Time-Stamping Usage + @node Time-Stamping Usage, HTTP Time-Stamping Internals, Time-Stamping, Time-Stamping @section Time-Stamping Usage @cindex time-stamping usage @cindex usage, time-stamping @@@ -2480,7 -2441,7 +2480,7 @@@ gives a timestamp. For @sc{http}, thi directory listing with dates in a format that Wget can parse (@pxref{FTP Time-Stamping Internals}). - @node HTTP Time-Stamping Internals + @node HTTP Time-Stamping Internals, FTP Time-Stamping Internals, Time-Stamping Usage, Time-Stamping @section HTTP Time-Stamping Internals @cindex http time-stamping @@@ -2512,7 -2473,7 +2512,7 @@@ with @samp{-N}, server file @samp{@var{ Arguably, @sc{http} time-stamping should be implemented using the @code{If-Modified-Since} request. - @node FTP Time-Stamping Internals + @node FTP Time-Stamping Internals, , HTTP Time-Stamping Internals, Time-Stamping @section FTP Time-Stamping Internals @cindex ftp time-stamping @@@ -2541,7 -2502,7 +2541,7 @@@ that is supported by some @sc{ftp} serv @code{wu-ftpd}), which returns the exact time of the specified file. Wget may support this command in the future. - @node Startup File + @node Startup File, Examples, Time-Stamping, Top @chapter Startup File @cindex startup file @cindex wgetrc @@@ -2569,7 -2530,7 +2569,7 @@@ commands * Sample Wgetrc:: A wgetrc example. 
@end menu - @node Wgetrc Location + @node Wgetrc Location, Wgetrc Syntax, Startup File, Startup File @section Wgetrc Location @cindex wgetrc location @cindex location of wgetrc @@@ -2590,7 -2551,7 +2590,7 @@@ means that in case of collision user's system-wide wgetrc (in @file{/usr/local/etc/wgetrc} by default). Fascist admins, away! - @node Wgetrc Syntax + @node Wgetrc Syntax, Wgetrc Commands, Wgetrc Location, Startup File @section Wgetrc Syntax @cindex wgetrc syntax @cindex syntax of wgetrc @@@ -2617,7 -2578,7 +2617,7 @@@ global @file{wgetrc}, you can do it wit reject = @end example - @node Wgetrc Commands + @node Wgetrc Commands, Sample Wgetrc, Wgetrc Syntax, Startup File @section Wgetrc Commands @cindex wgetrc commands @@@ -2710,6 -2671,9 +2710,9 @@@ Ignore @var{n} remote directory compone @item debug = on/off Debug mode, same as @samp{-d}. + @item default_page = @var{string} + Default page name---the same as @samp{--default-page=@var{string}}. + @item delete_after = on/off Delete after download---the same as @samp{--delete-after}. @@@ -3002,6 -2966,9 +3005,9 @@@ this off Save cookies to @var{file}. The same as @samp{--save-cookies @var{file}}. + @item save_headers = on/off + Same as @samp{--save-headers}. + @item secure_protocol = @var{string} Choose the secure protocol to be used. Legal values are @samp{auto} (the default), @samp{SSLv2}, @samp{SSLv3}, and @samp{TLSv1}. The same @@@ -3014,6 -2981,9 +3020,9 @@@ responses---the same as @samp{-S} @item span_hosts = on/off Same as @samp{-H}. + @item spider = on/off + Same as @samp{--spider}. + @item strict_comments = on/off Same as @samp{--strict-comments}. @@@ -3037,6 -3007,10 +3046,10 @@@ Specify username @var{string} for both This command can be overridden using the @samp{ftp_user} and @samp{http_user} command for @sc{ftp} and @sc{http} respectively. + @item user_agent = @var{string} + User agent identification sent to the HTTP Server---the same as + @samp{--user-agent=@var{string}}. + @item verbose = on/off Turn verbose on/off---the same as @samp{-v}/@samp{-nv}. @@@ -3050,7 -3024,7 +3063,7 @@@ only---the same as @samp{--waitretry=@v turned on by default in the global @file{wgetrc}. @end table - @node Sample Wgetrc + @node Sample Wgetrc, , Wgetrc Commands, Startup File @section Sample Wgetrc @cindex sample wgetrc @@@ -3067,7 -3041,7 +3080,7 @@@ its line @include sample.wgetrc.munged_for_texi_inclusion @end example - @node Examples + @node Examples, Various, Startup File, Top @chapter Examples @cindex examples @@@ -3081,7 -3055,7 +3094,7 @@@ complexity * Very Advanced Usage:: The hairy stuff. @end menu - @node Simple Usage + @node Simple Usage, Advanced Usage, Examples, Examples @section Simple Usage @itemize @bullet @@@ -3134,7 -3108,7 +3147,7 @@@ links index.htm @end example @end itemize - @node Advanced Usage + @node Advanced Usage, Very Advanced Usage, Simple Usage, Examples @section Advanced Usage @itemize @bullet @@@ -3270,7 -3244,7 +3283,7 @@@ wget -O - http://cool.list.com/ | wget @end example @end itemize - @node Very Advanced Usage + @node Very Advanced Usage, , Advanced Usage, Examples @section Very Advanced Usage @cindex mirroring @@@ -3319,7 -3293,7 +3332,7 @@@ wget -m -k -K -E http://www.gnu.org/ - @end itemize @c man end - @node Various + @node Various, Appendices, Examples, Top @chapter Various @cindex various @@@ -3329,14 -3303,14 +3342,14 @@@ This chapter contains all the stuff tha * Proxies:: Support for proxy servers. * Distribution:: Getting the latest version. * Web Site:: GNU Wget's presence on the World Wide Web. 
- * Mailing List:: Wget mailing list for announcements and discussion. + * Mailing Lists:: Wget mailing list for announcements and discussion. * Internet Relay Chat:: Wget's presence on IRC. * Reporting Bugs:: How and where to report bugs. * Portability:: The systems Wget works on. * Signals:: Signal-handling performed by Wget. @end menu - @node Proxies + @node Proxies, Distribution, Various, Various @section Proxies @cindex proxies @@@ -3412,7 -3386,7 +3425,7 @@@ Alternatively, you may use the @samp{pr settings @code{proxy_user} and @code{proxy_password} to set the proxy username and password. - @node Distribution + @node Distribution, Web Site, Proxies, Various @section Distribution @cindex latest version @@@ -3421,7 -3395,7 +3434,7 @@@ master GNU archive site ftp.gnu.org, an Wget @value{VERSION} can be found at @url{ftp://ftp.gnu.org/pub/gnu/wget/wget-@value{VERSION}.tar.gz} - @node Web Site + @node Web Site, Mailing Lists, Distribution, Various @section Web Site @cindex web site @@@ -3430,43 -3404,64 +3443,64 @@@ The official web site for GNU Wget is a information resides at ``The Wget Wgiki'', @url{http://wget.addictivecode.org/}. - @node Mailing List - @section Mailing List + @node Mailing Lists, Internet Relay Chat, Web Site, Various + @section Mailing Lists @cindex mailing list @cindex list - There are several Wget-related mailing lists. The general discussion - list is at @email{wget@@sunsite.dk}. It is the preferred place for - support requests and suggestions, as well as for discussion of - development. You are invited to subscribe. - - To subscribe, simply send mail to @email{wget-subscribe@@sunsite.dk} - and follow the instructions. Unsubscribe by mailing to - @email{wget-unsubscribe@@sunsite.dk}. The mailing list is archived at + @unnumberedsubsec Primary List + + The primary mailinglist for discussion, bug-reports, or questions + about GNU Wget is at @email{bug-wget@@gnu.org}. To subscribe, send an + email to @email{bug-wget-join@@gnu.org}, or visit + @url{http://lists.gnu.org/mailman/listinfo/bug-wget}. + + You do not need to subscribe to send a message to the list; however, + please note that unsubscribed messages are moderated, and may take a + while before they hit the list---@strong{usually around a day}. If + you want your message to show up immediately, please subscribe to the + list before posting. Archives for the list may be found at + @url{http://lists.gnu.org/pipermail/bug-wget/}. + + An NNTP/Usenettish gateway is also available via + @uref{http://gmane.org/about.php,Gmane}. You can see the Gmane + archives at + @url{http://news.gmane.org/gmane.comp.web.wget.general}. Note that the + Gmane archives conveniently include messages from both the current + list, and the previous one. Messages also show up in the Gmane + archives sooner than they do at @url{lists.gnu.org}. + + @unnumberedsubsec Bug Notices List + + Additionally, there is the @email{wget-notify@@addictivecode.org} mailing + list. This is a non-discussion list that receives bug report + notifications from the bug-tracker. To subscribe to this list, + send an email to @email{wget-notify-join@@addictivecode.org}, + or visit @url{http://addictivecode.org/mailman/listinfo/wget-notify}. + + @unnumberedsubsec Obsolete Lists + + Previously, the mailing list @email{wget@@sunsite.dk} was used as the + main discussion list, and another list, + @email{wget-patches@@sunsite.dk} was used for submitting and + discussing patches to GNU Wget. 
+ + Messages from @email{wget@@sunsite.dk} are archived at + @itemize @tie{} + @item @url{http://www.mail-archive.com/wget%40sunsite.dk/} and at - @url{http://news.gmane.org/gmane.comp.web.wget.general}. - - Another mailing list is at @email{wget-patches@@sunsite.dk}, and is - used to submit patches for review by Wget developers. A ``patch'' is - a textual representation of change to source code, readable by both - humans and programs. The - @url{http://wget.addictivecode.org/PatchGuidelines} page - covers the creation and submitting of patches in detail. Please don't - send general suggestions or bug reports to @samp{wget-patches}; use it - only for patch submissions. - - Subscription is the same as above for @email{wget@@sunsite.dk}, except - that you send to @email{wget-patches-subscribe@@sunsite.dk}, instead. - The mailing list is archived at - @url{http://news.gmane.org/gmane.comp.web.wget.patches}. + @item + @url{http://news.gmane.org/gmane.comp.web.wget.general} (which also + continues to archive the current list, @email{bug-wget@@gnu.org}). + @end itemize - Finally, there is the @email{wget-notify@@addictivecode.org} mailing - list. This is a non-discussion list that receives bug report-change - notifications from the bug-tracker. Unlike for the other mailing lists, - subscription is through the @code{mailman} interface at - @url{http://addictivecode.org/mailman/listinfo/wget-notify}. + Messages from @email{wget-patches@@sunsite.dk} are archived at + @itemize @tie{} + @item + @url{http://news.gmane.org/gmane.comp.web.wget.patches}. + @end itemize - @node Internet Relay Chat + @node Internet Relay Chat, Reporting Bugs, Mailing Lists, Various @section Internet Relay Chat @cindex Internet Relay Chat @cindex IRC @@@ -3475,7 -3470,7 +3509,7 @@@ In addition to the mailinglists, we also have a support channel set up via IRC at @code{irc.freenode.org}, @code{#wget}. Come check it out! - @node Reporting Bugs + @node Reporting Bugs, Portability, Internet Relay Chat, Various @section Reporting Bugs @cindex bugs @cindex reporting bugs @@@ -3495,7 -3490,7 +3529,7 @@@ Wget crashes, it's a bug. If Wget doe it's a bug. If things work strange, but you are not sure about the way they are supposed to work, it might well be a bug, but you might want to double-check the documentation and the mailing lists (@pxref{Mailing - List}). + Lists}). @item Try to repeat the bug in as simple circumstances as possible. E.g. if @@@ -3534,7 -3529,7 +3568,7 @@@ safe to try @end enumerate @c man end - @node Portability + @node Portability, Signals, Reporting Bugs, Various @section Portability @cindex portability @cindex operating systems @@@ -3567,7 -3562,7 +3601,7 @@@ Support for building on MS-DOS via DJGP Vanem; a port to VMS is maintained by Steven Schweda, and is available at @url{http://antinode.org/}. - @node Signals + @node Signals, , Portability, Various @section Signals @cindex signal handling @cindex hangup @@@ -3588,7 -3583,7 +3622,7 @@@ SIGHUP received, redirecting output to Other than that, Wget will not try to interfere with signals in any way. @kbd{C-c}, @code{kill -TERM} and @code{kill -KILL} should kill it alike. - @node Appendices + @node Appendices, Copying this manual, Various, Top @chapter Appendices This chapter contains some references I consider useful. @@@ -3599,7 -3594,7 +3633,7 @@@ * Contributors:: People who helped. 
@end menu - @node Robot Exclusion + @node Robot Exclusion, Security Considerations, Appendices, Appendices @section Robot Exclusion @cindex robot exclusion @cindex robots.txt @@@ -3638,7 -3633,7 +3672,7 @@@ avoid. To be found by the robots, the download and parse. Although Wget is not a web robot in the strictest sense of the word, it - can downloads large parts of the site without the user's intervention to + can download large parts of the site without the user's intervention to download an individual page. Because of that, Wget honors RES when downloading recursively. For instance, when you issue: @@@ -3682,7 -3677,7 +3716,7 @@@ robot exclusion, set the @code{robots} @file{.wgetrc}. You can achieve the same effect from the command line using the @code{-e} switch, e.g. @samp{wget -e robots=off @var{url}...}. - @node Security Considerations + @node Security Considerations, Contributors, Robot Exclusion, Appendices @section Security Considerations @cindex security @@@ -3713,7 -3708,7 +3747,7 @@@ being careful when you send debug logs me). @end enumerate - @node Contributors + @node Contributors, , Security Considerations, Appendices @section Contributors @cindex contributors @@@ -4058,17 -4053,21 +4092,21 @@@ Kristijan Zimmer Apologies to all who I accidentally left out, and many thanks to all the subscribers of the Wget mailing list. - @node Copying this manual + @node Copying this manual, Concept Index, Appendices, Top @appendix Copying this manual @menu * GNU Free Documentation License:: Licnse for copying this manual. @end menu + @node GNU Free Documentation License, , Copying this manual, Copying this manual + @appendixsec GNU Free Documentation License + @cindex FDL, GNU Free Documentation License + @include fdl.texi - @node Concept Index + @node Concept Index, , Copying this manual, Top @unnumbered Concept Index @printindex cp diff --combined src/ChangeLog index 5e3a8893,99cd940c..09b6527e --- a/src/ChangeLog +++ b/src/ChangeLog @@@ -1,3 -1,74 +1,74 @@@ + 2008-11-13 Micah Cowan + + * http.c (gethttp): Don't do anything when content-length >= our + requested range. + + 2008-11-12 Micah Cowan + + * ftp-ls.c (ftp_index): HTML-escape dir name in title, h1, a:href. + + 2008-11-12 Alexander Belopolsky + + * url.c, url.h (url_escape_unsafe_and_reserved): Added. + + * ftp-ls.c (ftp_index): URL-escape, rather than HTML-escape, the + filename appearing in the link. + + 2008-11-12 Steven Schubiger + + * main.c (print_version): Hand the relevant + xstrdup/xfree calls back to format_and_print_line(). + + 2008-11-11 Steven Schubiger + + * main.c (format_and_print_line): Move both the memory + allocating and freeing bits upwards to print_version(). + + 2008-11-10 Saint Xavier + + * http.c: Make --auth-no-challenge works with user:pass@ in URLs. + + 2008-11-05 Micah Cowan + + * ftp.c (print_length): Should print humanized "size remaining" + only when it's at least 1k. + + 2008-10-31 Micah Cowan + + * main.c (print_version): Add information about the mailing list. + + 2008-10-31 Alexander Drozdov + + * retr.c (fd_read_hunk): Make assert deal with maxsize == 0. + + * ftp-ls.c (clean_line): Prevent underflow on empty lines. + + 2008-10-26 Gisle Vanem + + * main.c (format_and_print_line): Put variables on top of + blocks (not all compilers are C99). Add an extra '\n' if + SYSTEM_WGETRC isn't defined and printed. + + 2008-09-09 Gisle Vanem + + * url.c (url_error): Use aprintf, not asprintf. 
+ + 2008-09-09 Micah Cowan + + * init.c (home_dir): Save the calculated value for home, + to avoid duplicated work on repeated calls. + (wgetrc_file_name) [WINDOWS]: Define and initialize home var. + + * build_info.c, main.c: Remove unnecessary extern vars + system_wgetrc and locale_dir. + + * main.c: Define program_name for lib/error.c. + + 2008-09-02 Gisle Vanem + + * mswindows.h: Must ensure is included before + we redefine ?vsnprintf(). + 2008-08-08 Steven Schubiger * main.c, utils.h: Removed some dead conditional DEBUG_MALLOC code. @@@ -32,27 -103,11 +103,27 @@@ * init.c (cleanup): Free the memory associated with the base option (when DEBUG_MALLOC is defined). +2008-07-02 Xavier Saint + + * iri.c, iri.h : New function idn_decode() to decode ASCII + encoded hostname to the locale. + + * host.c : Show hostname to be resolved both in locale and + ASCII encoded. + 2008-06-28 Steven Schubiger * retr.c (retrieve_from_file): Allow for reading the links from an external file (HTTP/FTP). +2008-06-26 Xavier Saint + + * iri.c, iri.h : New functions locale_to_utf8() and + idn_encode() adding basic capabilities of IRI/IDN. + + * url.c : Convert URLs from locale to UTF-8 allowing a basic + support of IRI/IDN + 2008-06-25 Steven Schubiger * ftp.c (getftp): When spidering a FTP URL, emit a diagnostic @@@ -77,7 -132,7 +148,7 @@@ * http.c: Make -nv --spider include the file's name when it exists. - + 2008-06-22 Micah Cowan * Makefile.am (version.c): Fixed version string invocation so it @@@ -85,57 -140,12 +156,57 @@@ string vars pointers-to-const, and moved line lengths below 80 (in Makefile.am, not in version.c). +2008-06-19 Xavier Saint + + * iri.c, iri.h : New function check_encoding_name() as + a preliminary encoding name check. + + * main.c, iri.c : Make use of check_encoding_name(). + +2008-06-19 Xavier Saint + + * iri.c : Include missing stringprep.h file and add a + cast. + + * init.c : set a default initial value for opt.enable_iri, + opt.locale and opt.encoding_remote. + +2008-06-19 Xavier Saint + + * iri.c, iri.h : Add a new function find_locale() to find + out the local system encoding. + + * main.c : Make use of find_locale(). + +2008-06-19 Xavier Saint + + * html-url.c : Add "content-type" meta tag parsing for + retrieving page encoding. + + * iri.h : Make no-op version of parse_charset() return + NULL. + 2008-06-16 Micah Cowan * http.c (http_loop): When hstat.len is higher than the successfully completed content's length, but it's because we _set_ it that way, don't abort. +2008-06-14 Xavier Saint + + * iri.c, iri.h : New files. + + * Makefile.am : Add files iri.h and conditional iri.c. + + * build_info.c : Add compiled feature "iri". + + * http.c : include iri.h and parse charset from Content-Type + header. + + * init.c, main.c, options.h : if an options isn't supported + at compiled time, don't get rid off it and show a dummy + message instead if they are used. + 2008-06-13 Micah Cowan * build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL @@@ -179,11 -189,11 +250,11 @@@ default. 2008-05-17 Kenny Parnell - + (cmd_spec_prefer_family): Initialize prefer_family to prefer_none. 2008-05-17 Micah Cowan - + * main.c (main): Handle Ctrl-D on command-line. 2008-05-15 Steven Schubiger @@@ -222,7 -232,7 +293,7 @@@ * options.h: Add an according boolean member to the options struct. - + * sysdep.h: Comment the defines __EXTENSIONS__ and _GNU_SOURCE out, because they're now defined independently by config.h. 
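The 2008-11-10 src/ChangeLog entry above ("Make --auth-no-challenge works with user:pass@ in URLs") changes gethttp() so that, when --auth-no-challenge is given, Basic credentials embedded in the URL are sent up front rather than only after a 401 challenge. A hypothetical command exercising that path (host, path and credentials are made up for illustration; only the option name comes from the patch):

    wget --auth-no-challenge http://user:secret@server.example.test/needs-auth.txt

This is the behaviour the new Test-auth-no-challenge.px and Test-auth-no-challenge-url.px scripts in tests/ are meant to cover.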
diff --combined src/build_info.c index 542fed8a,551b7d94..532dccaf --- a/src/build_info.c +++ b/src/build_info.c @@@ -33,9 -33,6 +33,6 @@@ as that of the covered work. * #include "wget.h" #include - char *system_wgetrc = SYSTEM_WGETRC; - char *locale_dir = LOCALEDIR; - const char* (compiled_features[]) = { @@@ -100,13 -97,6 +97,13 @@@ #else "-gettext", #endif + +#ifdef ENABLE_IRI + "+iri", +#else + "-iri", +#endif + /* sentinel value */ NULL }; diff --combined src/http.c index 589e18ee,cd0dba85..99c9be6f --- a/src/http.c +++ b/src/http.c @@@ -1364,8 -1364,7 +1364,8 @@@ free_hstat (struct http_stat *hs If PROXY is non-NULL, the connection will be made to the proxy server, and u->url will be requested. */ static uerr_t -gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy) +gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, + struct iri *iri) { struct request *req; @@@ -1496,9 -1495,10 +1496,10 @@@ user = user ? user : (opt.http_user ? opt.http_user : opt.user); passwd = passwd ? passwd : (opt.http_passwd ? opt.http_passwd : opt.passwd); - if (user && passwd - && !u->user) /* We only do "site-wide" authentication with "global" - user/password values; URL user/password info overrides. */ + /* We only do "site-wide" authentication with "global" user/password + * values unless --auth-no-challange has been requested; URL user/password + * info overrides. */ + if (user && passwd && (!u->user || opt.auth_without_challenge)) { /* If this is a host for which we've already received a Basic * challenge, we'll go ahead and send Basic authentication creds. */ @@@ -1828,7 -1828,7 +1829,7 @@@ hs->local_file = url_file_name (u); } } - + /* TODO: perform this check only once. */ if (!hs->existence_checked && file_exists_p (hs->local_file)) { @@@ -1897,7 -1897,7 +1898,7 @@@ File %s already there; not retrieving.\ local_dot_orig_file_exists = true; local_filename = filename_plus_orig_suffix; } - } + } if (!local_dot_orig_file_exists) /* Couldn't stat() .orig, so try to stat() . */ @@@ -2049,20 -2049,9 +2050,20 @@@ char *tmp = strchr (type, ';'); if (tmp) { + /* sXXXav: only needed if IRI support is enabled */ + char *tmp2 = tmp + 1; + while (tmp > type && c_isspace (tmp[-1])) --tmp; *tmp = '\0'; + + /* Try to get remote encoding if needed */ + if (opt.enable_iri && !opt.encoding_remote) + { + tmp = parse_charset (tmp2); + if (tmp) + set_content_encoding (iri, tmp); + } } } hs->newloc = resp_header_strdup (resp, "Location"); @@@ -2159,11 -2148,15 +2160,15 @@@ } } - if (statcode == HTTP_STATUS_RANGE_NOT_SATISFIABLE) + if (statcode == HTTP_STATUS_RANGE_NOT_SATISFIABLE + || (hs->restval > 0 && statcode == HTTP_STATUS_OK + && contrange == 0 && hs->restval >= contlen) + ) { /* If `-c' is in use and the file has been fully downloaded (or the remote file has shrunk), Wget effectively requests bytes - after the end of file and the server response with 416. */ + after the end of file and the server response with 416 + (or 200 with a <= Content-Length. */ logputs (LOG_VERBOSE, _("\ \n The file is already fully retrieved; nothing to do.\n\n")); /* In case the caller inspects. */ @@@ -2337,7 -2330,7 +2342,7 @@@ retried, and retried, and retried, and... 
*/ uerr_t http_loop (struct url *u, char **newloc, char **local_file, const char *referer, - int *dt, struct url *proxy) + int *dt, struct url *proxy, struct iri *iri) { int count; bool got_head = false; /* used for time-stamping and filename detection */ @@@ -2348,16 -2341,16 +2353,16 @@@ uerr_t err, ret = TRYLIMEXC; time_t tmr = -1; /* remote time-stamp */ struct http_stat hstat; /* HTTP status */ - struct_stat st; + struct_stat st; bool send_head_first = true; /* Assert that no value for *LOCAL_FILE was passed. */ assert (local_file == NULL || *local_file == NULL); - + /* Set LOCAL_FILE parameter. */ if (local_file && opt.output_document) *local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document); - + /* Reset NEWLOC parameter. */ *newloc = NULL; @@@ -2394,7 -2387,7 +2399,7 @@@ retrieve the file. But if the output_document was given, then this test was already done and the file didn't exist. Hence the !opt.output_document */ logprintf (LOG_VERBOSE, _("\ -File %s already there; not retrieving.\n\n"), +File %s already there; not retrieving.\n\n"), quote (hstat.local_file)); /* If the file is there, we suppose it's retrieved OK. */ *dt |= RETROKF; @@@ -2410,10 -2403,10 +2415,10 @@@ /* Reset the counter. */ count = 0; - + /* Reset the document type. */ *dt = 0; - + /* Skip preliminary HEAD request if we're not in spider mode AND * if -O was given or HTTP Content-Disposition support is disabled. */ if (!opt.spider @@@ -2422,21 -2415,21 +2427,21 @@@ /* Send preliminary HEAD request if -N is given and we have an existing * destination file. */ - if (opt.timestamping + if (opt.timestamping && !opt.content_disposition && file_exists_p (url_file_name (u))) send_head_first = true; - + /* THE loop */ do { /* Increment the pass counter. */ ++count; sleep_between_retrievals (count); - + /* Get the current time string. */ tms = datetime_str (time (NULL)); - + if (opt.spider && !got_head) logprintf (LOG_VERBOSE, _("\ Spider mode enabled. Check if remote file exists.\n")); @@@ -2445,20 -2438,20 +2450,20 @@@ if (opt.verbose) { char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD); - - if (count > 1) + + if (count > 1) { char tmp[256]; sprintf (tmp, _("(try:%2d)"), count); logprintf (LOG_NOTQUIET, "--%s-- %s %s\n", tms, tmp, hurl); } - else + else { logprintf (LOG_NOTQUIET, "--%s-- %s\n", tms, hurl); } - + #ifdef WINDOWS ws_changetitle (hurl); #endif @@@ -2468,7 -2461,7 +2473,7 @@@ /* Default document type is empty. However, if spider mode is on or time-stamping is employed, HEAD_ONLY commands is encoded within *dt. */ - if (send_head_first && !got_head) + if (send_head_first && !got_head) *dt |= HEAD_ONLY; else *dt &= ~HEAD_ONLY; @@@ -2501,11 -2494,11 +2506,11 @@@ *dt &= ~SEND_NOCACHE; /* Try fetching the document, or at least its head. */ - err = gethttp (u, &hstat, dt, proxy); + err = gethttp (u, &hstat, dt, proxy, iri); /* Time? */ tms = datetime_str (time (NULL)); - + /* Get the new location (with or without the redirection). */ if (hstat.newloc) *newloc = xstrdup (hstat.newloc); @@@ -2544,7 -2537,7 +2549,7 @@@ hstat.statcode); ret = WRONGCODE; } - else + else { ret = NEWLOCATION; } @@@ -2560,7 -2553,7 +2565,7 @@@ /* All possibilities should have been exhausted. */ abort (); } - + if (!(*dt & RETROKF)) { char *hurl = NULL; @@@ -2579,13 -2572,11 +2584,13 @@@ continue; } /* Maybe we should always keep track of broken links, not just in - * spider mode. */ - else if (opt.spider) + * spider mode. + * Don't log error if it was UTF-8 encoded because we will try + * once unencoded. 
*/ + else if (opt.spider && !iri->utf8_encode) { /* #### Again: ugly ugly ugly! */ - if (!hurl) + if (!hurl) hurl = url_string (u, URL_AUTH_HIDE_PASSWD); nonexisting_url (hurl); logprintf (LOG_NOTQUIET, _("\ @@@ -2594,7 -2585,7 +2599,7 @@@ Remote file does not exist -- broken li else { logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), - tms, hstat.statcode, + tms, hstat.statcode, quotearg_style (escape_quoting_style, hstat.error)); } logputs (LOG_VERBOSE, "\n"); diff --combined src/init.c index fd71a362,768bebd1..991f570f --- a/src/init.c +++ b/src/init.c @@@ -182,11 -182,9 +182,11 @@@ static const struct { "inet6only", &opt.ipv6_only, cmd_boolean }, #endif { "input", &opt.input_filename, cmd_file }, + { "iri", &opt.enable_iri, cmd_boolean }, { "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean }, { "limitrate", &opt.limit_rate, cmd_bytes }, { "loadcookies", &opt.cookies_input, cmd_file }, + { "locale", &opt.locale, cmd_string }, { "logfile", &opt.lfilename, cmd_file }, { "login", &opt.ftp_user, cmd_string },/* deprecated*/ { "maxredirect", &opt.max_redirect, cmd_number }, @@@ -226,7 -224,6 +226,7 @@@ { "referer", &opt.referer, cmd_string }, { "reject", &opt.rejects, cmd_vector }, { "relativeonly", &opt.relative_only, cmd_boolean }, + { "remoteencoding", &opt.encoding_remote, cmd_string }, { "removelisting", &opt.remove_listing, cmd_boolean }, { "restrictfilenames", NULL, cmd_spec_restrict_file_names }, { "retrsymlinks", &opt.retr_symlinks, cmd_boolean }, @@@ -334,14 -331,6 +334,14 @@@ defaults (void opt.restrict_files_case = restrict_no_case_restriction; opt.max_redirect = 20; + +#ifdef ENABLE_IRI + opt.enable_iri = true; +#else + opt.enable_iri = false; +#endif + opt.locale = NULL; + opt.encoding_remote = NULL; } /* Return the user's home directory (strdup-ed), or NULL if none is @@@ -349,35 -338,41 +349,41 @@@ char * home_dir (void) { - char *home = getenv ("HOME"); + static char buf[PATH_MAX]; + static char *home; if (!home) { + home = getenv ("HOME"); + if (!home) + { #if defined(MSDOS) - /* Under MSDOS, if $HOME isn't defined, use the directory where - `wget.exe' resides. */ - const char *_w32_get_argv0 (void); /* in libwatt.a/pcconfig.c */ - char *p, buf[PATH_MAX]; - - strcpy (buf, _w32_get_argv0 ()); - p = strrchr (buf, '/'); /* djgpp */ - if (!p) - p = strrchr (buf, '\\'); /* others */ - assert (p); - *p = '\0'; - home = buf; + /* Under MSDOS, if $HOME isn't defined, use the directory where + `wget.exe' resides. */ + const char *_w32_get_argv0 (void); /* in libwatt.a/pcconfig.c */ + char *p; + + strcpy (buf, _w32_get_argv0 ()); + p = strrchr (buf, '/'); /* djgpp */ + if (!p) + p = strrchr (buf, '\\'); /* others */ + assert (p); + *p = '\0'; + home = buf; #elif !defined(WINDOWS) - /* If HOME is not defined, try getting it from the password - file. */ - struct passwd *pwd = getpwuid (getuid ()); - if (!pwd || !pwd->pw_dir) - return NULL; - home = pwd->pw_dir; + /* If HOME is not defined, try getting it from the password + file. */ + struct passwd *pwd = getpwuid (getuid ()); + if (!pwd || !pwd->pw_dir) + return NULL; + strcpy (buf, pwd->pw_dir); + home = buf; #else /* !WINDOWS */ - /* Under Windows, if $HOME isn't defined, use the directory where - `wget.exe' resides. */ - home = ws_mypath (); + /* Under Windows, if $HOME isn't defined, use the directory where + `wget.exe' resides. */ + home = ws_mypath (); #endif /* WINDOWS */ + } } return home ? 
xstrdup (home) : NULL; @@@ -403,12 -398,13 +409,13 @@@ wgetrc_env_file_name (void } return NULL; } + /* Check for the existance of '$HOME/.wgetrc' and return it's path if it exists and is set. */ char * wgetrc_user_file_name (void) { - char *home = home_dir(); + char *home = home_dir (); char *file = NULL; if (home) file = aprintf ("%s/.wgetrc", home); @@@ -422,6 -418,7 +429,7 @@@ } return file; } + /* Return the path to the user's .wgetrc. This is either the value of `WGETRC' environment variable, or `$HOME/.wgetrc'. @@@ -430,10 -427,11 +438,11 @@@ char * wgetrc_file_name (void) { + char *home = NULL; char *file = wgetrc_env_file_name (); if (file && *file) return file; - + file = wgetrc_user_file_name (); #ifdef WINDOWS @@@ -441,6 -439,7 +450,7 @@@ `wget.ini' in the directory where `wget.exe' resides; we do this for backward compatibility with previous versions of Wget. SYSTEM_WGETRC should not be defined under WINDOWS. */ + home = home_dir (); if (!file || !file_exists_p (file)) { xfree_null (file); @@@ -449,6 -448,7 +459,7 @@@ if (home) file = aprintf ("%s/wget.ini", home); } + xfree_null (home); #endif /* WINDOWS */ if (!file) diff --combined src/main.c index 414b62bc,b0fceae0..fcf58861 --- a/src/main.c +++ b/src/main.c @@@ -72,8 -72,6 +72,6 @@@ extern char *system_getrc extern char *link_string; /* defined in build_info.c */ extern char *compiled_features[]; - extern char *system_wgetrc; - extern char *locale_dir; /* Used for --version output in print_version */ static const int max_chars_per_line = 72; @@@ -201,12 -199,10 +199,12 @@@ static struct cmdline_option option_dat { "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 }, #endif { "input-file", 'i', OPT_VALUE, "input", -1 }, + { "iri", 0, OPT_BOOLEAN, "iri", -1 }, { "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 }, { "level", 'l', OPT_VALUE, "reclevel", -1 }, { "limit-rate", 0, OPT_VALUE, "limitrate", -1 }, { "load-cookies", 0, OPT_VALUE, "loadcookies", -1 }, + { "locale", 0, OPT_VALUE, "locale", -1 }, { "max-redirect", 0, OPT_VALUE, "maxredirect", -1 }, { "mirror", 'm', OPT_BOOLEAN, "mirror", -1 }, { "no", 'n', OPT__NO, NULL, required_argument }, @@@ -240,7 -236,6 +238,7 @@@ { "referer", 0, OPT_VALUE, "referer", -1 }, { "reject", 'R', OPT_VALUE, "reject", -1 }, { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 }, + { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1}, { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 }, { "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 }, { "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 }, @@@ -711,20 -706,26 +709,26 @@@ prompt_for_password (void and an appropriate number of spaces are added on subsequent lines.*/ static void - format_and_print_line (char* prefix, char* line, - int line_length) + format_and_print_line (const char *prefix, const char *line, + int line_length) { + int leading_spaces; + int remaining_chars; + char *line_dup, *token; + assert (prefix != NULL); assert (line != NULL); + line_dup = xstrdup (line); + if (line_length <= 0) line_length = max_chars_per_line; - const int leading_spaces = strlen (prefix); + leading_spaces = strlen (prefix); printf ("%s", prefix); - int remaining_chars = line_length - leading_spaces; + remaining_chars = line_length - leading_spaces; /* We break on spaces. */ - char* token = strtok (line, " "); + token = strtok (line_dup, " "); while (token != NULL) { /* If however a token is much larger than the maximum @@@ -732,8 -733,9 +736,9 @@@ token on the next line. 
*/ if (remaining_chars <= strlen (token)) { + int j; printf ("\n"); - int j = 0; + j = 0; for (j = 0; j < leading_spaces; j++) { printf (" "); @@@ -746,8 -748,8 +751,8 @@@ } printf ("\n"); - xfree (prefix); - xfree (line); + + xfree (line_dup); } static void @@@ -760,13 -762,15 +765,15 @@@ print_version (void const char *link_title = "Link : "; const char *prefix_spaces = " "; const int prefix_space_length = strlen (prefix_spaces); + char *line; + char *env_wgetrc, *user_wgetrc; + int i; printf ("GNU Wget %s\n", version_string); printf (options_title); /* compiled_features is a char*[]. We limit the characters per line to max_chars_per_line and prefix each line with a constant number of spaces for proper alignment. */ - int i =0; for (i = 0; compiled_features[i] != NULL; ) { int line_length = max_chars_per_line - prefix_space_length; @@@ -785,31 -789,36 +792,36 @@@ /* Handle the case when $WGETRC is unset and $HOME/.wgetrc is absent. */ printf (wgetrc_title); - char *env_wgetrc = wgetrc_env_file_name (); + env_wgetrc = wgetrc_env_file_name (); if (env_wgetrc && *env_wgetrc) { printf ("%s (env)\n%s", env_wgetrc, prefix_spaces); xfree (env_wgetrc); } - char *user_wgetrc = wgetrc_user_file_name (); + user_wgetrc = wgetrc_user_file_name (); if (user_wgetrc) { printf ("%s (user)\n%s", user_wgetrc, prefix_spaces); xfree (user_wgetrc); } - printf ("%s (system)\n", system_wgetrc); + #ifdef SYSTEM_WGETRC + printf ("%s (system)\n", SYSTEM_WGETRC); + #else + putchar ('\n'); + #endif - format_and_print_line (strdup (locale_title), - strdup (locale_dir), + format_and_print_line (locale_title, + LOCALEDIR, max_chars_per_line); - format_and_print_line (strdup (compile_title), - strdup (compilation_string), + format_and_print_line (compile_title, + compilation_string, max_chars_per_line); - format_and_print_line (strdup (link_title), - strdup (link_string), + format_and_print_line (link_title, + link_string, max_chars_per_line); + printf ("\n"); /* TRANSLATORS: When available, an actual copyright character (cirle-c) should be used in preference to "(C)". */ @@@ -826,9 -835,13 +838,13 @@@ There is NO WARRANTY, to the extent per stdout); fputs (_("Currently maintained by Micah Cowan .\n"), stdout); + fputs (_("Please send bug reports and questions to .\n"), + stdout); exit (0); } + char *program_name; /* Needed by lib/error.c. */ + int main (int argc, char **argv) { @@@ -837,6 -850,8 +853,8 @@@ int nurl, status; bool append_to_log = false; + program_name = argv[0]; + i18n_initialize (); /* Construct the name of the executable, without the directory part. */ @@@ -1065,27 -1080,6 +1083,27 @@@ for details.\n\n")) exit (1); } +#ifdef ENABLE_IRI + if (opt.enable_iri) + { + if (opt.locale && !check_encoding_name (opt.locale)) + opt.locale = NULL; + + if (!opt.locale) + opt.locale = find_locale (); + + if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote)) + opt.encoding_remote = NULL; + } +#else + if (opt.enable_iri || opt.locale || opt.encoding_remote) + { + /* sXXXav : be more specific... 
*/ + printf(_("This version does not have support for IRIs\n")); + exit(1); + } +#endif + if (opt.ask_passwd) { opt.passwd = prompt_for_password (); @@@ -1195,21 -1189,15 +1213,21 @@@ WARNING: Can't reopen standard output i int old_follow_ftp = opt.follow_ftp; /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ - if (url_scheme (*t) == SCHEME_FTP) + if (url_scheme (*t) == SCHEME_FTP) opt.follow_ftp = 1; - - status = retrieve_tree (*t); + + status = retrieve_tree (*t, NULL); opt.follow_ftp = old_follow_ftp; } else - status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive); + { + struct iri *i = iri_new (); + set_uri_encoding (i, opt.locale, true); + status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, + opt.recursive, i); + iri_free (i); + } if (opt.delete_after && file_exists_p(filename)) { diff --combined src/retr.c index 28a6d874,21c9002e..e3d62978 --- a/src/retr.c +++ b/src/retr.c @@@ -393,7 -393,7 +393,7 @@@ fd_read_hunk (int fd, hunk_terminator_ char *hunk = xmalloc (bufsize); int tail = 0; /* tail position in HUNK */ - assert (maxsize >= bufsize); + assert (!maxsize || maxsize >= bufsize); while (1) { @@@ -597,7 -597,7 +597,7 @@@ static char *getproxy (struct url *) uerr_t retrieve_url (const char *origurl, char **file, char **newloc, - const char *refurl, int *dt, bool recursive) + const char *refurl, int *dt, bool recursive, struct iri *iri) { uerr_t result; char *url; @@@ -625,8 -625,7 +625,8 @@@ if (file) *file = NULL; - u = url_parse (url, &up_error_code); + second_try: + u = url_parse (url, &up_error_code, iri); if (!u) { char *error = url_error (url, up_error_code); @@@ -636,10 -635,6 +636,10 @@@ return URLERROR; } + DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote_n (0, url), + iri->uri_encoding ? quote_n (1, iri->uri_encoding) : "None", + iri->utf8_encode)); + if (!refurl) refurl = opt.referer; @@@ -653,12 -648,8 +653,12 @@@ proxy = getproxy (u); if (proxy) { + struct iri *pi = iri_new (); + set_uri_encoding (pi, opt.locale, true); + pi->utf8_encode = false; + /* Parse the proxy URL. */ - proxy_url = url_parse (proxy, &up_error_code); + proxy_url = url_parse (proxy, &up_error_code, NULL); if (!proxy_url) { char *error = url_error (proxy, up_error_code); @@@ -685,7 -676,7 +685,7 @@@ #endif || (proxy_url && proxy_url->scheme == SCHEME_HTTP)) { - result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url); + result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri); } else if (u->scheme == SCHEME_FTP) { @@@ -735,13 -726,8 +735,13 @@@ xfree (mynewloc); mynewloc = construced_newloc; + /* Reset UTF-8 encoding state, keep the URI encoding and reset + the content encoding. */ + iri->utf8_encode = opt.enable_iri; + set_content_encoding (iri, NULL); + /* Now, see if this new location makes sense. 
*/ - newloc_parsed = url_parse (mynewloc, &up_error_code); + newloc_parsed = url_parse (mynewloc, &up_error_code, iri); if (!newloc_parsed) { char *error = url_error (mynewloc, up_error_code); @@@ -790,21 -776,8 +790,21 @@@ goto redirected; } - if (local_file) + /* Try to not encode in UTF-8 if fetching failed */ + if (!(*dt & RETROKF) && iri->utf8_encode) { + iri->utf8_encode = false; + DEBUGP (("[IRI Fallbacking to non-utf8 for %s\n", quote (url))); + goto second_try; + } + + if (local_file && *dt & RETROKF) + { + register_download (u->url, local_file); + if (redirection_count && 0 != strcmp (origurl, u->url)) + register_redirection (origurl, u->url); + if (*dt & TEXTHTML) + register_html (u->url, local_file); if (*dt & RETROKF) { register_download (u->url, local_file); @@@ -854,18 -827,13 +854,18 @@@ retrieve_from_file (const char *file, b { uerr_t status; struct urlpos *url_list, *cur_url; + struct iri *iri = iri_new(); char *input_file = NULL; const char *url = file; status = RETROK; /* Suppose everything is OK. */ *count = 0; /* Reset the URL count. */ - + + /* sXXXav : Assume filename and links in the file are in the locale */ + set_uri_encoding (iri, opt.locale, true); + set_content_encoding (iri, opt.locale); + if (url_has_scheme (url)) { int dt; @@@ -874,21 -842,17 +874,21 @@@ if (!opt.base_href) opt.base_href = xstrdup (url); - status = retrieve_url (url, &input_file, NULL, NULL, &dt, false); + status = retrieve_url (url, &input_file, NULL, NULL, &dt, false, iri); if (status != RETROK) return status; if (dt & TEXTHTML) html = true; + + /* If we have a found a content encoding, use it */ + if (iri->content_encoding) + set_uri_encoding (iri, iri->content_encoding, false); } else input_file = (char *) file; - url_list = (html ? get_urls_html (input_file, NULL, NULL) + url_list = (html ? get_urls_html (input_file, NULL, NULL, iri) : get_urls_file (input_file)); for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count) @@@ -904,26 -868,21 +904,26 @@@ status = QUOTEXC; break; } + + /* Reset UTF-8 encode status */ + iri->utf8_encode = opt.enable_iri; + if ((opt.recursive || opt.page_requisites) && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url))) { int old_follow_ftp = opt.follow_ftp; /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ - if (cur_url->url->scheme == SCHEME_FTP) + if (cur_url->url->scheme == SCHEME_FTP) opt.follow_ftp = 1; - - status = retrieve_tree (cur_url->url->url); + + status = retrieve_tree (cur_url->url->url, iri); opt.follow_ftp = old_follow_ftp; } else - status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive); + status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, + &dt, opt.recursive, iri); if (filename && opt.delete_after && file_exists_p (filename)) { @@@ -942,8 -901,6 +942,8 @@@ Removing file due to --delete-after in /* Free the linked list of URL-s. 
*/ free_urlpos (url_list); + iri_free (iri); + return status; } @@@ -1096,11 -1053,7 +1096,11 @@@ boo url_uses_proxy (const char *url) { bool ret; - struct url *u = url_parse (url, NULL); + struct url *u; + struct iri *i = iri_new(); + /* url was given in the command line, so use locale as encoding */ + set_uri_encoding (i, opt.locale, true); + u= url_parse (url, NULL, i); if (!u) return false; ret = getproxy (u) != NULL; diff --combined src/url.c index c937d056,d416fcf7..31614794 --- a/src/url.c +++ b/src/url.c @@@ -252,6 -252,15 +252,15 @@@ url_escape (const char *s return url_escape_1 (s, urlchr_unsafe, false); } + /* URL-escape the unsafe and reserved characters (see urlchr_table) in + a given string, returning a freshly allocated string. */ + + char * + url_escape_unsafe_and_reserved (const char *s) + { + return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false); + } + /* URL-escape the unsafe characters (see urlchr_table) in a given string. If no characters are unsafe, S is returned. */ @@@ -640,7 -649,7 +649,7 @@@ static const char *parse_errors[] = error, and if ERROR is not NULL, also set *ERROR to the appropriate error code. */ struct url * -url_parse (const char *url, int *error) +url_parse (const char *url, int *error, struct iri *iri) { struct url *u; const char *p; @@@ -659,7 -668,7 +668,7 @@@ int port; char *user = NULL, *passwd = NULL; - char *url_encoded = NULL; + char *url_encoded = NULL, *new_url = NULL; int error_code; @@@ -670,20 -679,9 +679,20 @@@ goto error; } - url_encoded = reencode_escapes (url); + if (iri && iri->utf8_encode) + { + url_unescape ((char *) url); + iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url); + if (!iri->utf8_encode) + new_url = NULL; + } + + url_encoded = reencode_escapes (new_url ? 
new_url : url); p = url_encoded; + if (new_url && url_encoded != new_url) + xfree (new_url); + p += strlen (supported_schemes[scheme].leading_string); uname_b = p; p = url_skip_credentials (p); @@@ -853,18 -851,6 +862,18 @@@ { url_unescape (u->host); host_modified = true; + + /* Apply IDNA regardless of iri->utf8_encode status */ + if (opt.enable_iri && iri) + { + char *new = idn_encode (iri, u->host); + if (new) + { + xfree (u->host); + u->host = new; + host_modified = true; + } + } } if (params_b) @@@ -874,7 -860,7 +883,7 @@@ if (fragment_b) u->fragment = strdupdelim (fragment_b, fragment_e); - if (path_modified || u->fragment || host_modified || path_b == path_e) + if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e) { /* If we suspect that a transformation has rendered what url_string might return different from URL_ENCODED, rebuild @@@ -923,9 -909,9 +932,9 @@@ url_error (const char *url, int error_c if ((p = strchr (scheme, ':'))) *p = '\0'; if (!strcasecmp (scheme, "https")) - asprintf (&error, _("HTTPS support not compiled in")); + error = aprintf (_("HTTPS support not compiled in")); else - asprintf (&error, _(parse_errors[error_code]), quote (scheme)); + error = aprintf (_(parse_errors[error_code]), quote (scheme)); xfree (scheme); return error; diff --combined src/url.h index 0748e214,f523e2ef..badd9252 --- a/src/url.h +++ b/src/url.h @@@ -83,8 -83,9 +83,9 @@@ struct ur /* Function declarations */ char *url_escape (const char *); + char *url_escape_unsafe_and_reserved (const char *); -struct url *url_parse (const char *, int *); +struct url *url_parse (const char *, int *, struct iri *iri); char *url_error (const char *, int); char *url_full_path (const struct url *); void url_set_dir (struct url *, const char *); diff --combined tests/ChangeLog index 7751be64,4c1b7de7..25f2ab40 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@@ -1,48 -1,66 +1,111 @@@ + 2008-11-12 Steven Schubiger + + * Test-auth-basic.px, Test-auth-no-challenge.px, + Test-auth-no-challenge-url.px, Test-c-full.px, + Test-c-partial.px, Test-c.px, Test-c-shorter.px, + Test-E-k-K.px, Test-E-k.px, Test-ftp.px, + Test-HTTP-Content-Disposition-1.px, + Test-HTTP-Content-Disposition-2.px, + Test-HTTP-Content-Disposition.px, Test-N-current.px, + Test-N-HTTP-Content-Disposition.px, + Test-N--no-content-disposition.px, + Test-N--no-content-disposition-trivial.px, + Test-N-no-info.px, Test--no-content-disposition.px, + Test--no-content-disposition-trivial.px, Test-N-old.px, + Test-nonexisting-quiet.px, Test-noop.px, Test-np.px, + Test-N.px, Test-N-smaller.px, + Test-O-HTTP-Content-Disposition.px, Test-O-nc.px, + Test-O--no-content-disposition.px, + Test-O--no-content-disposition-trivial.px, + Test-O-nonexisting.px, Test-O.px, + Test-proxy-auth-basic.px, Test-Restrict-Lowercase.px, + Test-Restrict-Uppercase.px, + Test--spider-fail.pxm, Test--spider.px, + Test--spider-r-HTTP-Content-Disposition.px, + Test--spider-r--no-content-disposition.px, + Test--spider-r--no-content-disposition-trivial.px, + Test--spider-r.px: Enforce lexically scoped warnings. + + * Test-proxied-https-auth.px, run-px: Place use strict + before use warnings. + + 2008-11-12 Steven Schubiger + + * FTPServer.pm, FTPTest.pm, HTTPServer.pm, HTTPTest.pm: + Remove the magic interpreter line, because it cannot be + used fully. Substitute -w with use warnings. + + 2008-11-11 Micah Cowan + + * HTTPServer.pm (handle_auth): Allow testing of + --auth-no-challenge. + + * Test-auth-no-challenge.px, Test-auth-no-challenge-url.px: + Added. 
+ + * run-px: Add Test-auth-no-challenge.px, + Test-auth-no-challenge-url.px. + + 2008-11-07 Steven Schubiger + + * run-px: Use some colors for the summary part of the test + output to strengthen the distinction between a successful + or failing run. + + 2008-11-06 Steven Schubiger + + * run-px: When executing test scripts, invoke them with the + current perl executable name as determined by env. + + 2008-11-06 Micah Cowan + + * run-px: Use strict (thanks Steven Schubiger!). + +2008-09-09 Micah Cowan + + * Test-idn-cmd.px: Added. + + * run-px: Added Test-idn-cmd.px. + +2008-08-28 Micah Cowan + + * HTTPServer.pm (run): Allow distinguishing between hostnames, + when used as a proxy. + + * Test-idn-headers.px, Test-idn-meta.px: Added. + + * run-px: Added Test-idn-headers.px, Test-idn-meta.px. + + * Test-proxy-auth-basic.px: Use the full URL, rather than just the + path (made necessary by the accompanying change to HTTPServer.pm). + +2008-08-14 Xavier Saint + + * Test-iri-list.px : Fetch files from a remote list. + +2008-08-03 Xavier Saint + + * Test-iri.px : HTTP recursive fetch for testing IRI support and + fallback. + + * Test-iri-disabled.px : Same file structure as Test-iri.px but with + IRI support disabled + + * Test-iri-forced-remote.px : There's a difference between ISO-8859-1 + and ISO-8859-15 for character 0xA4 (respectively currency sign and + euro sign). So with a forced ISO-8859-1 remote encoding, wget should + see 0xA4 as a currency sign and transcode it correctly in UTF-8 instead + of using the ISO-8859-15 given by the server. + + * Test-ftp-iri.px : Give a file to fetch via FTP in a specific locale + and expect wget to fetch the file UTF-8 encoded. + + * Test-ftp-iri-fallback.px : Same as above but wget should fallback on + locale encoding to fetch the file. + + * Test-ftp-iri.px : Same as Test-ftp-iri.px but with IRI support + disabled. The UTF-8 encoded file should not be retrieved. + 2008-06-22 Micah Cowan * Test-proxied-https-auth.px: Shift exit code so it falls in the diff --combined tests/HTTPServer.pm index 01c36957,ac055c32..7f535358 --- a/tests/HTTPServer.pm +++ b/tests/HTTPServer.pm @@@ -1,8 -1,7 +1,7 @@@ - #!/usr/bin/perl -w - package HTTPServer; use strict; + use warnings; use HTTP::Daemon; use HTTP::Status; @@@ -27,8 -26,7 +26,8 @@@ sub run my $con = $self->accept(); print STDERR "Accepted a new connection\n" if $log; while (my $req = $con->get_request) { - my $url_path = $req->url->path; + #my $url_path = $req->url->path; + my $url_path = $req->url->as_string; if ($url_path =~ m{/$}) { # append 'index.html' $url_path .= 'index.html'; } @@@ -145,8 -143,7 +144,7 @@@ sub handle_auth my $authhdr = $req->header('Authorization'); # Have we sent the challenge yet? - unless (defined $url_rec->{auth_challenged} - && $url_rec->{auth_challenged}) { + unless ($url_rec->{auth_challenged} || $url_rec->{auth_no_challenge}) { # Since we haven't challenged yet, we'd better not # have received authentication (for our testing purposes). if ($authhdr) { @@@ -167,6 -164,9 +165,9 @@@ # failed it. $code = 400; $msg = "You didn't send auth after I sent challenge"; + if ($url_rec->{auth_no_challenge}) { + $msg = "--auth-no-challenge but no auth sent." 
+ } } else { my ($sent_method) = ($authhdr =~ /^(\S+)/g); unless ($sent_method eq $url_rec->{'auth_method'}) { diff --combined tests/Test-proxy-auth-basic.px index e3934d7d,7b3a638f..033ce039 --- a/tests/Test-proxy-auth-basic.px +++ b/tests/Test-proxy-auth-basic.px @@@ -1,6 -1,7 +1,7 @@@ - #!/usr/bin/perl -w + #!/usr/bin/perl use strict; + use warnings; use HTTPTest; @@@ -11,7 -12,7 +12,7 @@@ my $wholefile = "You're all authenticat # code, msg, headers, content my %urls = ( - '/needs-auth.txt' => { + 'http://no.such.domain/needs-auth.txt' => { auth_method => 'Basic', user => 'fiddle-dee-dee', passwd => 'Dodgson', diff --combined tests/run-px index 38520714,0f8f2964..3ab1c444 --- a/tests/run-px +++ b/tests/run-px @@@ -1,11 -1,19 +1,19 @@@ #!/usr/bin/env perl + + use 5.006; + use strict; use warnings; + use Term::ANSIColor ':constants'; + $Term::ANSIColor::AUTORESET = 1; + die "Please specify the top source directory.\n" if (!@ARGV); my $top_srcdir = shift @ARGV; my @tests = ( 'Test-auth-basic.px', + 'Test-auth-no-challenge.px', + 'Test-auth-no-challenge-url.px', 'Test-proxy-auth-basic.px', 'Test-proxied-https-auth.px', 'Test-N-HTTP-Content-Disposition.px', @@@ -17,19 -25,9 +25,19 @@@ 'Test-E-k-K.px', 'Test-E-k.px', 'Test-ftp.px', + 'Test-ftp-iri.px', + 'Test-ftp-iri-fallback.px', + 'Test-ftp-iri-disabled.px', 'Test-HTTP-Content-Disposition-1.px', 'Test-HTTP-Content-Disposition-2.px', 'Test-HTTP-Content-Disposition.px', + 'Test-idn-headers.px', + 'Test-idn-meta.px', + 'Test-idn-cmd.px', + 'Test-iri.px', + 'Test-iri-disabled.px', + 'Test-iri-forced-remote.px', + 'Test-iri-list.px', 'Test-N-current.px', 'Test-N-smaller.px', 'Test-N-no-info.px', @@@ -57,26 -55,56 +65,56 @@@ 'Test--spider-r.px', ); - my @results; + my @tested; - for my $test (@tests) { + foreach my $test (@tests) { print "Running $test\n\n"; - system("$top_srcdir/tests/$test"); - push @results, $?; + system("$^X $top_srcdir/tests/$test"); + push @tested, { name => $test, result => $? }; } - for (my $i=0; $i != @tests; ++$i) { - if ($results[$i] == 0) { - print "pass: "; - } else { - print "FAIL: "; - } - print "$tests[$i]\n"; + print "\n"; + foreach my $test (@tested) { + ($test->{result} == 0) + ? print GREEN 'pass: ' + : print RED 'FAIL: '; + print $test->{name}, "\n"; } + my $count = sub + { + return { + pass => sub { scalar grep $_->{result} == 0, @tested }, + fail => sub { scalar grep $_->{result} != 0, @tested }, + }->{$_[0]}->(); + }; + + my $summary = sub + { + my @lines = ( + "${\scalar @tested} tests were run", + "${\$count->('pass')} PASS, ${\$count->('fail')} FAIL", + ); + my $len_longest = sub + { + local $_ = 0; + foreach my $line (@lines) { + if (length $line > $_) { + $_ = length $line; + } + } + return $_; + }->(); + return join "\n", + '=' x $len_longest, + @lines, + '=' x $len_longest; + }->(); + + print "\n"; + print $count->('fail') + ? RED $summary + : GREEN $summary; print "\n"; - print scalar(@results) . " tests were run\n"; - print scalar(grep $_ == 0, @results) . " PASS\n"; - print scalar(grep $_ != 0, @results) . " FAIL\n"; - exit scalar (grep $_ != 0, @results); + exit $count->('fail');
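
A note on the HTTPServer.pm change above, where run() switches from $req->url->path to $req->url->as_string: when the test server acts as a proxy, the request line carries an absolute URI, so a path-only key can no longer tell hosts apart; this is also why Test-proxy-auth-basic.px now registers its resource under the full http://no.such.domain/needs-auth.txt URL. The short standalone Perl sketch below is not part of the patch; it only illustrates the difference between the two accessors (url() is an alias for uri() in HTTP::Request) and performs no network access.

    #!/usr/bin/env perl
    # Minimal sketch (not part of the patch): show why a proxied request
    # needs the absolute URI as the lookup key. The host name is the same
    # placeholder used by Test-proxy-auth-basic.px.
    use strict;
    use warnings;
    use HTTP::Request;

    # A GET sent through a proxy carries the absolute URI on the request line.
    my $req = HTTP::Request->new(GET => 'http://no.such.domain/needs-auth.txt');

    print $req->uri->path, "\n";       # "/needs-auth.txt"  -- the old lookup key
    print $req->uri->as_string, "\n";  # absolute URI       -- the new lookup key

For tests that go through the proxy, %urls entries keyed by a bare path would no longer match, since the server now compares against whatever appears on the request line; direct requests still present only a path, so the remaining tests are unaffected.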