From 42c78fdd71c311cf96210b709ec0a18ef45ef87f Mon Sep 17 00:00:00 2001 From: Tim Ruehsen Date: Thu, 22 Aug 2013 12:28:11 +0200 Subject: [PATCH] added option --https-only --- doc/ChangeLog | 4 ++ doc/wget.texi | 3 ++ src/ChangeLog | 6 +++ src/init.c | 3 ++ src/main.c | 3 ++ src/options.h | 2 +- src/recur.c | 23 +++++++---- tests/ChangeLog | 6 +++ tests/Makefile.am | 1 + tests/Test--httpsonly-r.px | 79 ++++++++++++++++++++++++++++++++++++++ tests/run-px | 1 + 11 files changed, 122 insertions(+), 9 deletions(-) create mode 100755 tests/Test--httpsonly-r.px diff --git a/doc/ChangeLog b/doc/ChangeLog index bc0fb79f..d283055a 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,7 @@ +2013-08-22 Tim Ruehsen + + * wget.texi: added description for --https-only + 2013-08-13 Hrvoje Niksic * wget.texi (Download Options): Fix misspelling. diff --git a/doc/wget.texi b/doc/wget.texi index ba4612de..cced7edd 100644 --- a/doc/wget.texi +++ b/doc/wget.texi @@ -1606,6 +1606,9 @@ buggy SSL server implementations that make it hard for OpenSSL to choose the correct protocol version. Fortunately, such servers are quite rare. +@item --https-only +When in recursive mode, only HTTPS links are followed. + @cindex SSL certificate, check @item --no-check-certificate Don't check the server certificate against the available certificate diff --git a/src/ChangeLog b/src/ChangeLog index edfb80f6..03a1f6ad 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,9 @@ +2013-08-22 Tim Ruehsen + + * main.c: Add new option --https-only. + * options.h: Likewise. + * recur.c (download_child_p): add check for HTTPS. 
+ 2013-08-09 Tim Ruehsen * gnutls.c (ssl_init): Prevent CA files from being loaded twice diff --git a/src/init.c b/src/init.c index 1c4432b5..033da4f7 100644 --- a/src/init.c +++ b/src/init.c @@ -194,6 +194,9 @@ static const struct { { "httppasswd", &opt.http_passwd, cmd_string }, /* deprecated */ { "httppassword", &opt.http_passwd, cmd_string }, { "httpproxy", &opt.http_proxy, cmd_string }, +#ifdef HAVE_SSL + { "httpsonly", &opt.https_only, cmd_boolean }, +#endif { "httpsproxy", &opt.https_proxy, cmd_string }, { "httpuser", &opt.http_user, cmd_string }, { "ignorecase", &opt.ignore_case, cmd_boolean }, diff --git a/src/main.c b/src/main.c index 6b71a20d..8414f5e5 100644 --- a/src/main.c +++ b/src/main.c @@ -217,6 +217,7 @@ static struct cmdline_option option_data[] = { "http-passwd", 0, OPT_VALUE, "httppassword", -1 }, /* deprecated */ { "http-password", 0, OPT_VALUE, "httppassword", -1 }, { "http-user", 0, OPT_VALUE, "httpuser", -1 }, + { IF_SSL ("https-only"), 0, OPT_BOOLEAN, "httpsonly", -1 }, { "ignore-case", 0, OPT_BOOLEAN, "ignorecase", -1 }, { "ignore-length", 0, OPT_BOOLEAN, "ignorelength", -1 }, { "ignore-tags", 0, OPT_VALUE, "ignoretags", -1 }, @@ -635,6 +636,8 @@ HTTPS (SSL/TLS) options:\n"), N_("\ --secure-protocol=PR choose secure protocol, one of auto, SSLv2,\n\ SSLv3, and TLSv1.\n"), + N_("\ + --https-only only follow secure HTTPS links\n"), N_("\ --no-check-certificate don't validate the server's certificate.\n"), N_("\ diff --git a/src/options.h b/src/options.h index 0a10c9b3..4460c6c6 100644 --- a/src/options.h +++ b/src/options.h @@ -215,9 +215,9 @@ struct options char *ca_directory; /* CA directory (hash files) */ char *ca_cert; /* CA certificate file to use */ - char *random_file; /* file with random data to seed the PRNG */ char *egd_file; /* file name of the egd daemon socket */ + bool https_only; /* whether to follow HTTPS only */ #endif /* HAVE_SSL */ bool cookies; /* whether cookies are used. 
*/ diff --git a/src/recur.c b/src/recur.c index b6ba1d95..edf34d42 100644 --- a/src/recur.c +++ b/src/recur.c @@ -505,15 +505,16 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, } /* Several things to check for: - 1. if scheme is not http, and we don't load it - 2. check for relative links (if relative_only is set) - 3. check for domain - 4. check for no-parent - 5. check for excludes && includes - 6. check for suffix - 7. check for same host (if spanhost is unset), with possible + 1. if scheme is not https and https_only requested + 2. if scheme is not http, and we don't load it + 3. check for relative links (if relative_only is set) + 4. check for domain + 5. check for no-parent + 6. check for excludes && includes + 7. check for suffix + 8. check for same host (if spanhost is unset), with possible gethostbyname baggage - 8. check for robots.txt + 9. check for robots.txt Addendum: If the URL is FTP, and it is to be loaded, only the domain and suffix settings are "stronger". @@ -525,6 +526,12 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, More time- and memory- consuming tests should be put later on the list. */ + if (opt.https_only && u->scheme != SCHEME_HTTPS) + { + DEBUGP (("Not following non-HTTPS links.\n")); + goto out; + } + /* Determine whether URL under consideration has a HTTP-like scheme. */ u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP); diff --git a/tests/ChangeLog b/tests/ChangeLog index 8cd48648..9a58797a 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,9 @@ +2013-08-22 Tim Ruehsen + + * Makefile.am (EXTRA_DIST): Add Test--httpsonly-r.px. + * run-px (tests): Likewise. + * Test--httpsonly-r.px: New file. + 2013-03-12 Darshit Shah * Makefile.am (EXTRA_DIST): Add Test--post-file.px. 
diff --git a/tests/Makefile.am b/tests/Makefile.am index ac6a663f..a4947870 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -132,6 +132,7 @@ EXTRA_DIST = FTPServer.pm FTPTest.pm HTTPServer.pm HTTPTest.pm \ Test--spider-r--no-content-disposition.px \ Test--spider-r--no-content-disposition-trivial.px \ Test--spider-r.px \ + Test--httpsonly-r.px \ run-px certs check_PROGRAMS = unit-tests diff --git a/tests/Test--httpsonly-r.px b/tests/Test--httpsonly-r.px new file mode 100755 index 00000000..019df1aa --- /dev/null +++ b/tests/Test--httpsonly-r.px @@ -0,0 +1,79 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +use HTTPTest; + + +############################################################################### + +my $mainpage = <<EOF; +<html> +<head> + <title>Main Page</title> +</head> +<body> + <p>

+ Some text and a link to a <a href="http://localhost:{{port}}/secondpage.html">second page</a>. + </p>

+</body> +</html> +EOF + +my $secondpage = <<EOF; +<html> +<head> + <title>Second Page</title> +</head> +<body> + <p>

+ Anything. + </p>

+</body> +</html> +EOF + +# code, msg, headers, content +my %urls = ( + '/index.html' => { + code => "200", + msg => "Dontcare", + headers => { + "Content-type" => "text/html", + }, + content => $mainpage, + }, + '/secondpage.html' => { + code => "200", + msg => "Dontcare", + headers => { + "Content-type" => "text/html", + }, + content => $secondpage, + } +); + +my $cmdline = $WgetTest::WGETPATH . " --https-only -r -nH http://localhost:{{port}}/"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + 'index.html' => { + content => $mainpage, + }, +); + +############################################################################### + +my $the_test = HTTPTest->new (name => "Test--httpsonly-r", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +print $expected_error_code."\n"; + +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/run-px b/tests/run-px index 3c35d6f0..14f5e7c4 100755 --- a/tests/run-px +++ b/tests/run-px @@ -81,6 +81,7 @@ my @tests = ( 'Test--spider-r--no-content-disposition.px', 'Test--spider-r--no-content-disposition-trivial.px', 'Test--spider-r.px', + 'Test--httpsonly-r.px', ); foreach my $var (qw(SYSTEM_WGETRC WGETRC)) { -- 2.39.2