X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fres.c;h=eb4caf11c0b26d4a7de70ed1db55f9f334fb1f56;hp=0aea66f35f177734ed043f94df58869ddcba4135;hb=b014f8fae9291e7504c0cca2dd8b9a0035466c03;hpb=b7c6c35be5930695cc64ef2d54d6f421f8511553 diff --git a/src/res.c b/src/res.c index 0aea66f3..eb4caf11 100644 --- a/src/res.c +++ b/src/res.c @@ -1,5 +1,5 @@ /* Support for Robot Exclusion Standard (RES). - Copyright (C) 2001, 2006, 2007 Free Software Foundation, Inc. + Copyright (C) 2001, 2006, 2007, 2008 Free Software Foundation, Inc. This file is part of Wget. @@ -16,15 +16,16 @@ General Public License for more details. You should have received a copy of the GNU General Public License along with Wget. If not, see <http://www.gnu.org/licenses/>. -In addition, as a special exception, the Free Software Foundation -gives permission to link the code of its release of Wget with the -OpenSSL project's "OpenSSL" library (or with modified versions of it -that use the same license as the "OpenSSL" library), and distribute -the linked executables. You must obey the GNU General Public License -in all respects for all of the code used other than "OpenSSL". If you -modify this file, you may extend this exception to your version of the -file, but you are not obligated to do so. If you do not wish to do -so, delete this exception statement from your version. */ +Additional permission under GNU GPL version 3 section 7 + +If you modify this program, or any covered work, by linking or +combining it with the OpenSSL project's OpenSSL library (or a +modified version of that library), containing parts covered by the +terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +grants you additional permission to convey the resulting work. +Corresponding Source for a non-source form of such a combination +shall include the source code for the parts of OpenSSL used as well +as that of the covered work. */ /* This file implements the Robot Exclusion Standard (RES). 
@@ -66,9 +67,7 @@ so, delete this exception statement from your version. */ res_match_path, res_register_specs, res_get_specs, and res_retrieve_file. */ -#ifdef HAVE_CONFIG_H -# include -#endif +#include "wget.h" #include #include @@ -76,7 +75,6 @@ so, delete this exception statement from your version. */ #include #include -#include "wget.h" #include "utils.h" #include "hash.h" #include "url.h" @@ -465,9 +463,9 @@ res_match_path (const struct robot_specs *specs, const char *path) if (matches (specs->paths[i].path, path)) { bool allowedp = specs->paths[i].allowedp; - DEBUGP (("%s path %s because of rule `%s'.\n", + DEBUGP (("%s path %s because of rule %s.\n", allowedp ? "Allowing" : "Rejecting", - path, specs->paths[i].path)); + path, quote (specs->paths[i].path))); return allowedp; } return true; @@ -534,21 +532,44 @@ res_get_specs (const char *host, int port) Return true if robots were retrieved OK, false otherwise. */ bool -res_retrieve_file (const char *url, char **file) +res_retrieve_file (const char *url, char **file, struct iri *iri) { + struct iri *i = iri_new (); uerr_t err; char *robots_url = uri_merge (url, RES_SPECS_LOCATION); int saved_ts_val = opt.timestamping; - int saved_sp_val = opt.spider; + int saved_sp_val = opt.spider, url_err; + struct url * url_parsed; + + /* Copy server URI encoding for a possible IDNA transformation, no need to + encode the full URI in UTF-8 because "robots.txt" is plain ASCII */ + set_uri_encoding (i, iri->uri_encoding, false); + i->utf8_encode = false; logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n")); *file = NULL; opt.timestamping = false; opt.spider = false; - err = retrieve_url (robots_url, file, NULL, NULL, NULL, false); + + url_parsed = url_parse (robots_url, &url_err, iri, true); + if (!url_parsed) + { + char *error = url_error (robots_url, url_err); + logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error); + xfree (error); + err = URLERROR; + } + else + { + err = retrieve_url (url_parsed, 
robots_url, file, NULL, NULL, NULL, + false, i, false); + url_free(url_parsed); + } + opt.timestamping = saved_ts_val; - opt.spider = saved_sp_val; + opt.spider = saved_sp_val; xfree (robots_url); + iri_free (i); if (err != RETROK && *file != NULL) {