X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fres.c;h=0996c90370104013c0fb1b93b61466bc45988c79;hp=32b53b1beadaeebb33d4cb10ac8ed9a730e49743;hb=2f6aa1d7417df1dfc58597777686fbd77179b9fd;hpb=222e9465b7cef12a75e5ce0d6cc7df60c934566f diff --git a/src/res.c b/src/res.c index 32b53b1b..0996c903 100644 --- a/src/res.c +++ b/src/res.c @@ -1,11 +1,12 @@ /* Support for Robot Exclusion Standard (RES). - Copyright (C) 2001 Free Software Foundation, Inc. + Copyright (C) 2001, 2006, 2007, 2008, 2009, 2010, 2011 Free Software + Foundation, Inc. This file is part of Wget. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at +the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but @@ -14,8 +15,18 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +along with Wget. If not, see . + +Additional permission under GNU GPL version 3 section 7 + +If you modify this program, or any covered work, by linking or +combining it with the OpenSSL project's OpenSSL library (or a +modified version of that library), containing parts covered by the +terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +grants you additional permission to convey the resulting work. +Corresponding Source for a non-source form of such a combination +shall include the source code for the parts of OpenSSL used as well +as that of the covered work. */ /* This file implements the Robot Exclusion Standard (RES). @@ -57,31 +68,28 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ res_match_path, res_register_specs, res_get_specs, and res_retrieve_file. */ -#ifdef HAVE_CONFIG_H -# include -#endif +#include "wget.h" #include #include -#ifdef HAVE_STRING_H -# include -#else -# include -#endif /* HAVE_STRING_H */ +#include #include #include -#include "wget.h" #include "utils.h" #include "hash.h" #include "url.h" #include "retr.h" #include "res.h" +#ifdef TESTING +#include "test.h" +#endif + struct path_info { char *path; - int allowedp; - int user_agent_exact_p; + bool allowedp; + bool user_agent_exact_p; }; struct robot_specs { @@ -98,22 +106,22 @@ struct robot_specs { static void match_user_agent (const char *agent, int length, - int *matches, int *exact_match) + bool *matches, bool *exact_match) { if (length == 1 && *agent == '*') { - *matches = 1; - *exact_match = 0; + *matches = true; + *exact_match = false; } else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget")) { - *matches = 1; - *exact_match = 1; + *matches = true; + *exact_match = true; } else { - *matches = 0; - *exact_match = 0; + *matches = false; + *exact_match = false; } } @@ -122,7 +130,7 @@ match_user_agent (const char *agent, int length, static void add_path (struct robot_specs *specs, const char *path_b, const char *path_e, - int allowedp, int exactp) + bool allowedp, bool exactp) { struct path_info pp; if (path_b < path_e && *path_b == '/') @@ -136,17 +144,17 @@ add_path (struct robot_specs *specs, const char *path_b, const char *path_e, if (specs->count > specs->size) { if (specs->size == 0) - specs->size = 1; + specs->size = 1; else - specs->size <<= 1; + specs->size <<= 1; specs->paths = xrealloc (specs->paths, - specs->size * sizeof (struct path_info)); + specs->size * sizeof (struct path_info)); } specs->paths[specs->count - 1] = pp; } -/* Recreate SPECS->paths with only those paths that have non-zero - user_agent_exact_p. */ +/* Recreate SPECS->paths with only those paths that have + user_agent_exact_p set to true. */ static void prune_non_exact (struct robot_specs *specs) @@ -157,7 +165,7 @@ prune_non_exact (struct robot_specs *specs) for (i = 0; i < specs->count; i++) if (specs->paths[i].user_agent_exact_p) ++cnt; - newpaths = xmalloc (cnt * sizeof (struct path_info)); + newpaths = xnew_array (struct path_info, cnt); for (i = 0, j = 0; i < specs->count; i++) if (specs->paths[i].user_agent_exact_p) newpaths[j++] = specs->paths[i]; @@ -170,12 +178,12 @@ prune_non_exact (struct robot_specs *specs) #define EOL(p) ((p) >= lineend) -#define SKIP_SPACE(p) do { \ - while (!EOL (p) && ISSPACE (*p)) \ - ++p; \ +#define SKIP_SPACE(p) do { \ + while (!EOL (p) && c_isspace (*p)) \ + ++p; \ } while (0) -#define FIELD_IS(string_literal) \ +#define FIELD_IS(string_literal) \ BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal) /* Parse textual RES specs beginning with SOURCE of length LENGTH. @@ -216,22 +224,21 @@ res_parse (const char *source, int length) const char *p = source; const char *end = source + length; - /* non-zero if last applicable user-agent field matches Wget. */ - int user_agent_applies = 0; + /* true if last applicable user-agent field matches Wget. */ + bool user_agent_applies = false; - /* non-zero if last applicable user-agent field *exactly* matches + /* true if last applicable user-agent field *exactly* matches Wget. */ - int user_agent_exact = 0; + bool user_agent_exact = false; /* whether we ever encountered exact user agent. */ - int found_exact = 0; + bool found_exact = false; /* count of allow/disallow lines in the current "record", i.e. after the last `user-agent' instructions. */ int record_count = 0; - struct robot_specs *specs = xmalloc (sizeof (struct robot_specs)); - memset (specs, '\0', sizeof (struct robot_specs)); + struct robot_specs *specs = xnew0 (struct robot_specs); while (1) { @@ -240,114 +247,113 @@ res_parse (const char *source, int length) const char *value_b, *value_e; if (p == end) - break; + break; lineend_real = memchr (p, '\n', end - p); if (lineend_real) - ++lineend_real; + ++lineend_real; else - lineend_real = end; + lineend_real = end; lineend = lineend_real; /* Before doing anything else, check whether the line is empty - or comment-only. */ + or comment-only. */ SKIP_SPACE (p); if (EOL (p) || *p == '#') - goto next; + goto next; /* Make sure the end-of-line comments are respected by setting - lineend to a location preceding the first comment. Real line - ending remains in lineend_real. */ + lineend to a location preceding the first comment. Real line + ending remains in lineend_real. */ for (lineend = p; lineend < lineend_real; lineend++) - if ((lineend == p || ISSPACE (*(lineend - 1))) - && *lineend == '#') - break; + if ((lineend == p || c_isspace (*(lineend - 1))) + && *lineend == '#') + break; /* Ignore trailing whitespace in the same way. */ - while (lineend > p && ISSPACE (*(lineend - 1))) - --lineend; + while (lineend > p && c_isspace (*(lineend - 1))) + --lineend; assert (!EOL (p)); field_b = p; - while (!EOL (p) && (ISALNUM (*p) || *p == '-')) - ++p; + while (!EOL (p) && (c_isalnum (*p) || *p == '-')) + ++p; field_e = p; SKIP_SPACE (p); if (field_b == field_e || EOL (p) || *p != ':') - { - DEBUGP (("Ignoring malformed line %d", line_count)); - goto next; - } - ++p; /* skip ':' */ + { + DEBUGP (("Ignoring malformed line %d", line_count)); + goto next; + } + ++p; /* skip ':' */ SKIP_SPACE (p); value_b = p; while (!EOL (p)) - ++p; + ++p; value_e = p; /* Finally, we have a syntactically valid line. */ if (FIELD_IS ("user-agent")) - { - /* We have to support several cases: - - --previous records-- - - User-Agent: foo - User-Agent: Wget - User-Agent: bar - ... matching record ... - - User-Agent: baz - User-Agent: qux - ... non-matching record ... - - User-Agent: * - ... matching record, but will be pruned later ... - - We have to respect `User-Agent' at the beginning of each - new record simply because we don't know if we're going to - encounter "Wget" among the agents or not. Hence, - match_user_agent is called when record_count != 0. - - But if record_count is 0, we have to keep calling it - until it matches, and if that happens, we must not call - it any more, until the next record. Hence the other part - of the condition. */ - if (record_count != 0 || user_agent_applies == 0) - match_user_agent (value_b, value_e - value_b, - &user_agent_applies, &user_agent_exact); - if (user_agent_exact) - found_exact = 1; - record_count = 0; - } + { + /* We have to support several cases: + + --previous records-- + + User-Agent: foo + User-Agent: Wget + User-Agent: bar + ... matching record ... + + User-Agent: baz + User-Agent: qux + ... non-matching record ... + + User-Agent: * + ... matching record, but will be pruned later ... + + We have to respect `User-Agent' at the beginning of each + new record simply because we don't know if we're going to + encounter "Wget" among the agents or not. Hence, + match_user_agent is called when record_count != 0. + + But if record_count is 0, we have to keep calling it + until it matches, and if that happens, we must not call + it any more, until the next record. Hence the other part + of the condition. */ + if (record_count != 0 || user_agent_applies == false) + match_user_agent (value_b, value_e - value_b, + &user_agent_applies, &user_agent_exact); + if (user_agent_exact) + found_exact = true; + record_count = 0; + } else if (FIELD_IS ("allow")) - { - if (user_agent_applies) - { - add_path (specs, value_b, value_e, 1, user_agent_exact); - } - ++record_count; - } + { + if (user_agent_applies) + { + add_path (specs, value_b, value_e, true, user_agent_exact); + } + ++record_count; + } else if (FIELD_IS ("disallow")) - { - if (user_agent_applies) - { - int allowed = 0; - if (value_b == value_e) - /* Empty "disallow" line means everything is - *allowed*! */ - allowed = 1; - add_path (specs, value_b, value_e, allowed, user_agent_exact); - } - ++record_count; - } + { + if (user_agent_applies) + { + bool allowed = false; + if (value_b == value_e) + /* Empty "disallow" line means everything is *allowed*! */ + allowed = true; + add_path (specs, value_b, value_e, allowed, user_agent_exact); + } + ++record_count; + } else - { - DEBUGP (("Ignoring unknown field at line %d", line_count)); - goto next; - } + { + DEBUGP (("Ignoring unknown field at line %d", line_count)); + goto next; + } next: p = lineend_real; @@ -357,15 +363,15 @@ res_parse (const char *source, int length) if (found_exact) { /* We've encountered an exactly matching user-agent. Throw out - all the stuff with user-agent: *. */ + all the stuff with user-agent: *. */ prune_non_exact (specs); } else if (specs->size > specs->count) { /* add_path normally over-allocates specs->paths. Reallocate it - to the correct size in order to conserve some memory. */ + to the correct size in order to conserve some memory. */ specs->paths = xrealloc (specs->paths, - specs->count * sizeof (struct path_info)); + specs->count * sizeof (struct path_info)); specs->size = specs->count; } @@ -379,15 +385,15 @@ struct robot_specs * res_parse_from_file (const char *filename) { struct robot_specs *specs; - struct file_memory *fm = read_file (filename); + struct file_memory *fm = wget_read_file (filename); if (!fm) { - logprintf (LOG_NOTQUIET, "Cannot open %s: %s", - filename, strerror (errno)); + logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"), + filename, strerror (errno)); return NULL; } specs = res_parse (fm->content, fm->length); - read_file_free (fm); + wget_read_file_free (fm); return specs; } @@ -397,7 +403,7 @@ free_specs (struct robot_specs *specs) int i; for (i = 0; i < specs->count; i++) xfree (specs->paths[i].path); - FREE_MAYBE (specs->paths); + xfree_null (specs->paths); xfree (specs); } @@ -407,25 +413,23 @@ free_specs (struct robot_specs *specs) that number is not a numerical representation of '/', decode C and advance the pointer. */ -#define DECODE_MAYBE(c, ptr) do { \ - if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \ - { \ - char decoded \ - = (XCHAR_TO_XDIGIT (ptr[1]) << 4) + XCHAR_TO_XDIGIT (ptr[2]); \ - if (decoded != '/') \ - { \ - c = decoded; \ - ptr += 2; \ - } \ - } \ +#define DECODE_MAYBE(c, ptr) do { \ + if (c == '%' && c_isxdigit (ptr[1]) && c_isxdigit (ptr[2])) \ + { \ + char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \ + if (decoded != '/') \ + { \ + c = decoded; \ + ptr += 2; \ + } \ + } \ } while (0) -/* The inner matching engine: return non-zero if RECORD_PATH matches +/* The inner matching engine: return true if RECORD_PATH matches URL_PATH. The rules for matching are described at - , - section 3.2.2. */ + , section 3.2.2. */ -static int +static bool matches (const char *record_path, const char *url_path) { const char *rp = record_path; @@ -436,13 +440,13 @@ matches (const char *record_path, const char *url_path) char rc = *rp; char uc = *up; if (!rc) - return 1; + return true; if (!uc) - return 0; + return false; DECODE_MAYBE(rc, rp); DECODE_MAYBE(uc, up); if (rc != uc) - return 0; + return false; } } @@ -450,35 +454,35 @@ matches (const char *record_path, const char *url_path) matches, return its allow/reject status. If none matches, retrieval is by default allowed. */ -int +bool res_match_path (const struct robot_specs *specs, const char *path) { int i; if (!specs) - return 1; + return true; for (i = 0; i < specs->count; i++) if (matches (specs->paths[i].path, path)) { - int allowedp = specs->paths[i].allowedp; - DEBUGP (("%s path %s because of rule `%s'.\n", - allowedp ? "Allowing" : "Rejecting", - path, specs->paths[i].path)); - return allowedp; + bool allowedp = specs->paths[i].allowedp; + DEBUGP (("%s path %s because of rule %s.\n", + allowedp ? "Allowing" : "Rejecting", + path, quote (specs->paths[i].path))); + return allowedp; } - return 1; + return true; } /* Registering the specs. */ -struct hash_table *registered_specs; +static struct hash_table *registered_specs; /* Stolen from cookies.c. */ -#define SET_HOSTPORT(host, port, result) do { \ - int HP_len = strlen (host); \ - result = alloca (HP_len + 1 + numdigit (port) + 1); \ - memcpy (result, host, HP_len); \ - result[HP_len] = ':'; \ - long_to_string (result + HP_len + 1, port); \ +#define SET_HOSTPORT(host, port, result) do { \ + int HP_len = strlen (host); \ + result = alloca (HP_len + 1 + numdigit (port) + 1); \ + memcpy (result, host, HP_len); \ + result[HP_len] = ':'; \ + number_to_string (result + HP_len + 1, port); \ } while (0) /* Register RES specs that below to server on HOST:PORT. They will @@ -494,14 +498,10 @@ res_register_specs (const char *host, int port, struct robot_specs *specs) if (!registered_specs) registered_specs = make_nocase_string_hash_table (0); - /* Required to shut up the compiler. */ - old = NULL; - hp_old = NULL; - - if (hash_table_get_pair (registered_specs, hp, hp_old, old)) + if (hash_table_get_pair (registered_specs, hp, &hp_old, &old)) { if (old) - free_specs (old); + free_specs (old); hash_table_put (registered_specs, hp_old, specs); } else @@ -530,45 +530,115 @@ res_get_specs (const char *host, int port) serves URL. The file will be named according to the currently active rules, and the file name will be returned in *file. - Return non-zero if robots were retrieved OK, zero otherwise. */ + Return true if robots were retrieved OK, false otherwise. */ -int -res_retrieve_file (const char *url, char **file) +bool +res_retrieve_file (const char *url, char **file, struct iri *iri) { + struct iri *i = iri_new (); uerr_t err; char *robots_url = uri_merge (url, RES_SPECS_LOCATION); + int saved_ts_val = opt.timestamping; + int saved_sp_val = opt.spider, url_err; + struct url * url_parsed; + + /* Copy server URI encoding for a possible IDNA transformation, no need to + encode the full URI in UTF-8 because "robots.txt" is plain ASCII */ + set_uri_encoding (i, iri->uri_encoding, false); + i->utf8_encode = false; logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n")); *file = NULL; - err = retrieve_url (robots_url, file, NULL, NULL, NULL); + opt.timestamping = false; + opt.spider = false; + + url_parsed = url_parse (robots_url, &url_err, iri, true); + if (!url_parsed) + { + char *error = url_error (robots_url, url_err); + logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error); + xfree (error); + err = URLERROR; + } + else + { + err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL, + false, i, false); + url_free(url_parsed); + } + + opt.timestamping = saved_ts_val; + opt.spider = saved_sp_val; xfree (robots_url); + iri_free (i); if (err != RETROK && *file != NULL) { /* If the file is not retrieved correctly, but retrieve_url - allocated the file name, deallocate is here so that the - caller doesn't have to worry about it. */ + allocated the file name, deallocate is here so that the + caller doesn't have to worry about it. */ xfree (*file); *file = NULL; } return err == RETROK; } -static int -cleanup_hash_table_mapper (void *key, void *value, void *arg_ignored) +bool +is_robots_txt_url (const char *url) { - xfree (key); - free_specs (value); - return 0; -} + char *robots_url = uri_merge (url, RES_SPECS_LOCATION); + bool ret = are_urls_equal (url, robots_url); + + xfree (robots_url); + return ret; +} + void res_cleanup (void) { if (registered_specs) { - hash_table_map (registered_specs, cleanup_hash_table_mapper, NULL); + hash_table_iterator iter; + for (hash_table_iterate (registered_specs, &iter); + hash_table_iter_next (&iter); + ) + { + xfree (iter.key); + free_specs (iter.value); + } hash_table_destroy (registered_specs); registered_specs = NULL; } } + +#ifdef TESTING + +const char * +test_is_robots_txt_url() +{ + int i; + struct { + char *url; + bool expected_result; + } test_array[] = { + { "http://www.yoyodyne.com/robots.txt", true }, + { "http://www.yoyodyne.com/somepath/", false }, + { "http://www.yoyodyne.com/somepath/robots.txt", false }, + }; + + for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i) + { + mu_assert ("test_is_robots_txt_url: wrong result", + is_robots_txt_url (test_array[i].url) == test_array[i].expected_result); + } + + return NULL; +} + +#endif /* TESTING */ + +/* + * vim: et ts=2 sw=2 + */ +