/* Support for Robot Exclusion Standard (RES).
- Copyright (C) 2001, 2006, 2007 Free Software Foundation, Inc.
+ Copyright (C) 2001, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+ Foundation, Inc.
This file is part of Wget.
You should have received a copy of the GNU General Public License
along with Wget. If not, see <http://www.gnu.org/licenses/>.
-In addition, as a special exception, the Free Software Foundation
-gives permission to link the code of its release of Wget with the
-OpenSSL project's "OpenSSL" library (or with modified versions of it
-that use the same license as the "OpenSSL" library), and distribute
-the linked executables. You must obey the GNU General Public License
-in all respects for all of the code used other than "OpenSSL". If you
-modify this file, you may extend this exception to your version of the
-file, but you are not obligated to do so. If you do not wish to do
-so, delete this exception statement from your version. */
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
/* This file implements the Robot Exclusion Standard (RES).
res_match_path, res_register_specs, res_get_specs, and
res_retrieve_file. */
-#ifdef HAVE_CONFIG_H
-# include <config.h>
-#endif
+#include "wget.h"
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
-#include "wget.h"
#include "utils.h"
#include "hash.h"
#include "url.h"
#define EOL(p) ((p) >= lineend)
#define SKIP_SPACE(p) do { \
- while (!EOL (p) && ISSPACE (*p)) \
+ while (!EOL (p) && c_isspace (*p)) \
++p; \
} while (0)
lineend to a location preceding the first comment. Real line
ending remains in lineend_real. */
for (lineend = p; lineend < lineend_real; lineend++)
- if ((lineend == p || ISSPACE (*(lineend - 1)))
+ if ((lineend == p || c_isspace (*(lineend - 1)))
&& *lineend == '#')
break;
/* Ignore trailing whitespace in the same way. */
- while (lineend > p && ISSPACE (*(lineend - 1)))
+ while (lineend > p && c_isspace (*(lineend - 1)))
--lineend;
assert (!EOL (p));
field_b = p;
- while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
+ while (!EOL (p) && (c_isalnum (*p) || *p == '-'))
++p;
field_e = p;
SKIP_SPACE (p);
if (field_b == field_e || EOL (p) || *p != ':')
{
- DEBUGP (("Ignoring malformed line %d", line_count));
+ DEBUGP (("Ignoring malformed line %d\n", line_count));
goto next;
}
++p; /* skip ':' */
}
else
{
- DEBUGP (("Ignoring unknown field at line %d", line_count));
+ DEBUGP (("Ignoring unknown field at line %d\n", line_count));
goto next;
}
res_parse_from_file (const char *filename)
{
struct robot_specs *specs;
- struct file_memory *fm = read_file (filename);
+ struct file_memory *fm = wget_read_file (filename);
if (!fm)
{
logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
return NULL;
}
specs = res_parse (fm->content, fm->length);
- read_file_free (fm);
+ wget_read_file_free (fm);
return specs;
}
advance the pointer. */
#define DECODE_MAYBE(c, ptr) do { \
- if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \
+ if (c == '%' && c_isxdigit (ptr[1]) && c_isxdigit (ptr[2])) \
{ \
char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \
if (decoded != '/') \
if (matches (specs->paths[i].path, path))
{
bool allowedp = specs->paths[i].allowedp;
- DEBUGP (("%s path %s because of rule `%s'.\n",
+ DEBUGP (("%s path %s because of rule %s.\n",
allowedp ? "Allowing" : "Rejecting",
- path, specs->paths[i].path));
+ path, quote (specs->paths[i].path)));
return allowedp;
}
return true;
Return true if robots were retrieved OK, false otherwise. */
bool
-res_retrieve_file (const char *url, char **file)
+res_retrieve_file (const char *url, char **file, struct iri *iri)
{
+ struct iri *i = iri_new ();
uerr_t err;
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
int saved_ts_val = opt.timestamping;
- int saved_sp_val = opt.spider;
+ int saved_sp_val = opt.spider, url_err;
+ struct url * url_parsed;
+
+ /* Copy the server URI encoding for a possible IDNA transformation; there is no
+ need to encode the full URI in UTF-8 because "robots.txt" is plain ASCII. */
+ set_uri_encoding (i, iri->uri_encoding, false);
+ i->utf8_encode = false;
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
*file = NULL;
opt.timestamping = false;
opt.spider = false;
- err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
+
+ url_parsed = url_parse (robots_url, &url_err, i, true);
+ if (!url_parsed)
+ {
+ char *error = url_error (robots_url, url_err);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
+ xfree (error);
+ err = URLERROR;
+ }
+ else
+ {
+ err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
+ false, i, false);
+ url_free(url_parsed);
+ }
+
opt.timestamping = saved_ts_val;
- opt.spider = saved_sp_val;
+ opt.spider = saved_sp_val;
xfree (robots_url);
+ iri_free (i);
if (err != RETROK && *file != NULL)
{
bool ret = are_urls_equal (url, robots_url);
xfree (robots_url);
-
+
return ret;
}
\f
#ifdef TESTING
const char *
-test_is_robots_txt_url()
+test_is_robots_txt_url(void)
{
- int i;
- struct {
- char *url;
+ unsigned i;
+ static const struct {
+ const char *url;
bool expected_result;
} test_array[] = {
{ "http://www.yoyodyne.com/robots.txt", true },
{ "http://www.yoyodyne.com/somepath/", false },
{ "http://www.yoyodyne.com/somepath/robots.txt", false },
};
-
- for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
+
+ for (i = 0; i < countof(test_array); ++i)
{
- mu_assert ("test_is_robots_txt_url: wrong result",
+ mu_assert ("test_is_robots_txt_url: wrong result",
is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);
}