/* Support for Robot Exclusion Standard (RES).
- Copyright (C) 2001 Free Software Foundation, Inc.
+ Copyright (C) 2001, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+ Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or (at
+the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.
This program is distributed in the hope that it will be useful, but
General Public License for more details.
You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
-
-In addition, as a special exception, the Free Software Foundation
-gives permission to link the code of its release of Wget with the
-OpenSSL project's "OpenSSL" library (or with modified versions of it
-that use the same license as the "OpenSSL" library), and distribute
-the linked executables. You must obey the GNU General Public License
-in all respects for all of the code used other than "OpenSSL". If you
-modify this file, you may extend this exception to your version of the
-file, but you are not obligated to do so. If you do not wish to do
-so, delete this exception statement from your version. */
+along with Wget. If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
/* This file implements the Robot Exclusion Standard (RES).
res_match_path, res_register_specs, res_get_specs, and
res_retrieve_file. */
-#ifdef HAVE_CONFIG_H
-# include <config.h>
-#endif
+#include "wget.h"
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
-#include "wget.h"
#include "utils.h"
#include "hash.h"
#include "url.h"
#define EOL(p) ((p) >= lineend)
#define SKIP_SPACE(p) do { \
- while (!EOL (p) && ISSPACE (*p)) \
+ while (!EOL (p) && c_isspace (*p)) \
++p; \
} while (0)
lineend to a location preceding the first comment. Real line
ending remains in lineend_real. */
for (lineend = p; lineend < lineend_real; lineend++)
- if ((lineend == p || ISSPACE (*(lineend - 1)))
+ if ((lineend == p || c_isspace (*(lineend - 1)))
&& *lineend == '#')
break;
/* Ignore trailing whitespace in the same way. */
- while (lineend > p && ISSPACE (*(lineend - 1)))
+ while (lineend > p && c_isspace (*(lineend - 1)))
--lineend;
assert (!EOL (p));
field_b = p;
- while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
+ while (!EOL (p) && (c_isalnum (*p) || *p == '-'))
++p;
field_e = p;
SKIP_SPACE (p);
if (field_b == field_e || EOL (p) || *p != ':')
{
- DEBUGP (("Ignoring malformed line %d", line_count));
+ DEBUGP (("Ignoring malformed line %d\n", line_count));
goto next;
}
++p; /* skip ':' */
}
else
{
- DEBUGP (("Ignoring unknown field at line %d", line_count));
+ DEBUGP (("Ignoring unknown field at line %d\n", line_count));
goto next;
}
res_parse_from_file (const char *filename)
{
struct robot_specs *specs;
- struct file_memory *fm = read_file (filename);
+ struct file_memory *fm = wget_read_file (filename);
if (!fm)
{
logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
return NULL;
}
specs = res_parse (fm->content, fm->length);
- read_file_free (fm);
+ wget_read_file_free (fm);
return specs;
}
advance the pointer. */
#define DECODE_MAYBE(c, ptr) do { \
- if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \
+ if (c == '%' && c_isxdigit (ptr[1]) && c_isxdigit (ptr[2])) \
{ \
char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \
if (decoded != '/') \
if (matches (specs->paths[i].path, path))
{
bool allowedp = specs->paths[i].allowedp;
- DEBUGP (("%s path %s because of rule `%s'.\n",
+ DEBUGP (("%s path %s because of rule %s.\n",
allowedp ? "Allowing" : "Rejecting",
- path, specs->paths[i].path));
+ path, quote (specs->paths[i].path)));
return allowedp;
}
return true;
Return true if robots were retrieved OK, false otherwise. */
bool
-res_retrieve_file (const char *url, char **file)
+res_retrieve_file (const char *url, char **file, struct iri *iri)
{
+ struct iri *i = iri_new ();
uerr_t err;
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
+ int saved_ts_val = opt.timestamping;
+ int saved_sp_val = opt.spider, url_err;
+ struct url * url_parsed;
+
+ /* Copy the server's URI encoding for a possible IDNA transformation.
+ There is no need to encode the full URI in UTF-8, because "robots.txt"
+ is plain ASCII.  */
+ set_uri_encoding (i, iri->uri_encoding, false);
+ i->utf8_encode = false;
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
*file = NULL;
- err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
+ opt.timestamping = false;
+ opt.spider = false;
+
+ url_parsed = url_parse (robots_url, &url_err, i, true);
+ if (!url_parsed)
+ {
+ char *error = url_error (robots_url, url_err);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
+ xfree (error);
+ err = URLERROR;
+ }
+ else
+ {
+ err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
+ false, i, false);
+ url_free(url_parsed);
+ }
+
+ opt.timestamping = saved_ts_val;
+ opt.spider = saved_sp_val;
xfree (robots_url);
+ iri_free (i);
if (err != RETROK && *file != NULL)
{
bool ret = are_urls_equal (url, robots_url);
xfree (robots_url);
-
+
return ret;
}
\f
{ "http://www.yoyodyne.com/somepath/", false },
{ "http://www.yoyodyne.com/somepath/robots.txt", false },
};
-
- for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
+
+ for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
{
- mu_assert ("test_is_robots_txt_url: wrong result",
+ mu_assert ("test_is_robots_txt_url: wrong result",
is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);
}