X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fres.c;h=103bc4e7c8aedcbca3037d7cbce95ded305f2f84;hp=656f2895ec52e971d6f37cbd62138bc802fef049;hb=60c88ee992b501590aeed111a669e99fbff7ef82;hpb=79f66dfd1537edc79cead657d171018c85c05425 diff --git a/src/res.c b/src/res.c index 656f2895..103bc4e7 100644 --- a/src/res.c +++ b/src/res.c @@ -84,6 +84,10 @@ so, delete this exception statement from your version. */ #include "retr.h" #include "res.h" +#ifdef TESTING +#include "test.h" +#endif + struct path_info { char *path; bool allowedp; @@ -104,7 +108,7 @@ struct robot_specs { static void match_user_agent (const char *agent, int length, - bool *matches, bool *exact_match) + bool *matches, bool *exact_match) { if (length == 1 && *agent == '*') { @@ -128,7 +132,7 @@ match_user_agent (const char *agent, int length, static void add_path (struct robot_specs *specs, const char *path_b, const char *path_e, - bool allowedp, bool exactp) + bool allowedp, bool exactp) { struct path_info pp; if (path_b < path_e && *path_b == '/') @@ -142,11 +146,11 @@ add_path (struct robot_specs *specs, const char *path_b, const char *path_e, if (specs->count > specs->size) { if (specs->size == 0) - specs->size = 1; + specs->size = 1; else - specs->size <<= 1; + specs->size <<= 1; specs->paths = xrealloc (specs->paths, - specs->size * sizeof (struct path_info)); + specs->size * sizeof (struct path_info)); } specs->paths[specs->count - 1] = pp; } @@ -176,12 +180,12 @@ prune_non_exact (struct robot_specs *specs) #define EOL(p) ((p) >= lineend) -#define SKIP_SPACE(p) do { \ - while (!EOL (p) && ISSPACE (*p)) \ - ++p; \ +#define SKIP_SPACE(p) do { \ + while (!EOL (p) && ISSPACE (*p)) \ + ++p; \ } while (0) -#define FIELD_IS(string_literal) \ +#define FIELD_IS(string_literal) \ BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal) /* Parse textual RES specs beginning with SOURCE of length LENGTH. @@ -245,113 +249,113 @@ res_parse (const char *source, int length) const char *value_b, *value_e; if (p == end) - break; + break; lineend_real = memchr (p, '\n', end - p); if (lineend_real) - ++lineend_real; + ++lineend_real; else - lineend_real = end; + lineend_real = end; lineend = lineend_real; /* Before doing anything else, check whether the line is empty - or comment-only. */ + or comment-only. */ SKIP_SPACE (p); if (EOL (p) || *p == '#') - goto next; + goto next; /* Make sure the end-of-line comments are respected by setting - lineend to a location preceding the first comment. Real line - ending remains in lineend_real. */ + lineend to a location preceding the first comment. Real line + ending remains in lineend_real. */ for (lineend = p; lineend < lineend_real; lineend++) - if ((lineend == p || ISSPACE (*(lineend - 1))) - && *lineend == '#') - break; + if ((lineend == p || ISSPACE (*(lineend - 1))) + && *lineend == '#') + break; /* Ignore trailing whitespace in the same way. */ while (lineend > p && ISSPACE (*(lineend - 1))) - --lineend; + --lineend; assert (!EOL (p)); field_b = p; while (!EOL (p) && (ISALNUM (*p) || *p == '-')) - ++p; + ++p; field_e = p; SKIP_SPACE (p); if (field_b == field_e || EOL (p) || *p != ':') - { - DEBUGP (("Ignoring malformed line %d", line_count)); - goto next; - } - ++p; /* skip ':' */ + { + DEBUGP (("Ignoring malformed line %d", line_count)); + goto next; + } + ++p; /* skip ':' */ SKIP_SPACE (p); value_b = p; while (!EOL (p)) - ++p; + ++p; value_e = p; /* Finally, we have a syntactically valid line. */ if (FIELD_IS ("user-agent")) - { - /* We have to support several cases: - - --previous records-- - - User-Agent: foo - User-Agent: Wget - User-Agent: bar - ... matching record ... - - User-Agent: baz - User-Agent: qux - ... non-matching record ... - - User-Agent: * - ... matching record, but will be pruned later ... - - We have to respect `User-Agent' at the beginning of each - new record simply because we don't know if we're going to - encounter "Wget" among the agents or not. Hence, - match_user_agent is called when record_count != 0. - - But if record_count is 0, we have to keep calling it - until it matches, and if that happens, we must not call - it any more, until the next record. Hence the other part - of the condition. */ - if (record_count != 0 || user_agent_applies == false) - match_user_agent (value_b, value_e - value_b, - &user_agent_applies, &user_agent_exact); - if (user_agent_exact) - found_exact = true; - record_count = 0; - } + { + /* We have to support several cases: + + --previous records-- + + User-Agent: foo + User-Agent: Wget + User-Agent: bar + ... matching record ... + + User-Agent: baz + User-Agent: qux + ... non-matching record ... + + User-Agent: * + ... matching record, but will be pruned later ... + + We have to respect `User-Agent' at the beginning of each + new record simply because we don't know if we're going to + encounter "Wget" among the agents or not. Hence, + match_user_agent is called when record_count != 0. + + But if record_count is 0, we have to keep calling it + until it matches, and if that happens, we must not call + it any more, until the next record. Hence the other part + of the condition. */ + if (record_count != 0 || user_agent_applies == false) + match_user_agent (value_b, value_e - value_b, + &user_agent_applies, &user_agent_exact); + if (user_agent_exact) + found_exact = true; + record_count = 0; + } else if (FIELD_IS ("allow")) - { - if (user_agent_applies) - { - add_path (specs, value_b, value_e, true, user_agent_exact); - } - ++record_count; - } + { + if (user_agent_applies) + { + add_path (specs, value_b, value_e, true, user_agent_exact); + } + ++record_count; + } else if (FIELD_IS ("disallow")) - { - if (user_agent_applies) - { - bool allowed = false; - if (value_b == value_e) - /* Empty "disallow" line means everything is *allowed*! */ - allowed = true; - add_path (specs, value_b, value_e, allowed, user_agent_exact); - } - ++record_count; - } + { + if (user_agent_applies) + { + bool allowed = false; + if (value_b == value_e) + /* Empty "disallow" line means everything is *allowed*! */ + allowed = true; + add_path (specs, value_b, value_e, allowed, user_agent_exact); + } + ++record_count; + } else - { - DEBUGP (("Ignoring unknown field at line %d", line_count)); - goto next; - } + { + DEBUGP (("Ignoring unknown field at line %d", line_count)); + goto next; + } next: p = lineend_real; @@ -361,15 +365,15 @@ res_parse (const char *source, int length) if (found_exact) { /* We've encountered an exactly matching user-agent. Throw out - all the stuff with user-agent: *. */ + all the stuff with user-agent: *. */ prune_non_exact (specs); } else if (specs->size > specs->count) { /* add_path normally over-allocates specs->paths. Reallocate it - to the correct size in order to conserve some memory. */ + to the correct size in order to conserve some memory. */ specs->paths = xrealloc (specs->paths, - specs->count * sizeof (struct path_info)); + specs->count * sizeof (struct path_info)); specs->size = specs->count; } @@ -387,7 +391,7 @@ res_parse_from_file (const char *filename) if (!fm) { logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"), - filename, strerror (errno)); + filename, strerror (errno)); return NULL; } specs = res_parse (fm->content, fm->length); @@ -411,16 +415,16 @@ free_specs (struct robot_specs *specs) that number is not a numerical representation of '/', decode C and advance the pointer. */ -#define DECODE_MAYBE(c, ptr) do { \ - if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \ - { \ - char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \ - if (decoded != '/') \ - { \ - c = decoded; \ - ptr += 2; \ - } \ - } \ +#define DECODE_MAYBE(c, ptr) do { \ + if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \ + { \ + char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \ + if (decoded != '/') \ + { \ + c = decoded; \ + ptr += 2; \ + } \ + } \ } while (0) /* The inner matching engine: return true if RECORD_PATH matches @@ -438,13 +442,13 @@ matches (const char *record_path, const char *url_path) char rc = *rp; char uc = *up; if (!rc) - return true; + return true; if (!uc) - return false; + return false; DECODE_MAYBE(rc, rp); DECODE_MAYBE(uc, up); if (rc != uc) - return false; + return false; } } @@ -461,11 +465,11 @@ res_match_path (const struct robot_specs *specs, const char *path) for (i = 0; i < specs->count; i++) if (matches (specs->paths[i].path, path)) { - bool allowedp = specs->paths[i].allowedp; - DEBUGP (("%s path %s because of rule `%s'.\n", - allowedp ? "Allowing" : "Rejecting", - path, specs->paths[i].path)); - return allowedp; + bool allowedp = specs->paths[i].allowedp; + DEBUGP (("%s path %s because of rule `%s'.\n", + allowedp ? "Allowing" : "Rejecting", + path, specs->paths[i].path)); + return allowedp; } return true; } @@ -475,12 +479,12 @@ res_match_path (const struct robot_specs *specs, const char *path) static struct hash_table *registered_specs; /* Stolen from cookies.c. */ -#define SET_HOSTPORT(host, port, result) do { \ - int HP_len = strlen (host); \ - result = alloca (HP_len + 1 + numdigit (port) + 1); \ - memcpy (result, host, HP_len); \ - result[HP_len] = ':'; \ - number_to_string (result + HP_len + 1, port); \ +#define SET_HOSTPORT(host, port, result) do { \ + int HP_len = strlen (host); \ + result = alloca (HP_len + 1 + numdigit (port) + 1); \ + memcpy (result, host, HP_len); \ + result[HP_len] = ':'; \ + number_to_string (result + HP_len + 1, port); \ } while (0) /* Register RES specs that below to server on HOST:PORT. They will @@ -499,7 +503,7 @@ res_register_specs (const char *host, int port, struct robot_specs *specs) if (hash_table_get_pair (registered_specs, hp, &hp_old, &old)) { if (old) - free_specs (old); + free_specs (old); hash_table_put (registered_specs, hp_old, specs); } else @@ -544,14 +548,25 @@ res_retrieve_file (const char *url, char **file) if (err != RETROK && *file != NULL) { /* If the file is not retrieved correctly, but retrieve_url - allocated the file name, deallocate is here so that the - caller doesn't have to worry about it. */ + allocated the file name, deallocate is here so that the + caller doesn't have to worry about it. */ xfree (*file); *file = NULL; } return err == RETROK; } +bool +is_robots_txt_url (const char *url) +{ + char *robots_url = uri_merge (url, RES_SPECS_LOCATION); + bool ret = are_urls_equal (url, robots_url); + + xfree (robots_url); + + return ret; +} + void res_cleanup (void) { @@ -559,13 +574,44 @@ res_cleanup (void) { hash_table_iterator iter; for (hash_table_iterate (registered_specs, &iter); - hash_table_iter_next (&iter); - ) - { - xfree (iter.key); - free_specs (iter.value); - } + hash_table_iter_next (&iter); + ) + { + xfree (iter.key); + free_specs (iter.value); + } hash_table_destroy (registered_specs); registered_specs = NULL; } } + +#ifdef TESTING + +const char * +test_is_robots_txt_url() +{ + int i; + struct { + char *url; + bool expected_result; + } test_array[] = { + { "http://www.yoyodyne.com/robots.txt", true }, + { "http://www.yoyodyne.com/somepath/", false }, + { "http://www.yoyodyne.com/somepath/robots.txt", false }, + }; + + for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i) + { + mu_assert ("test_is_robots_txt_url: wrong result", + is_robots_txt_url (test_array[i].url) == test_array[i].expected_result); + } + + return NULL; +} + +#endif /* TESTING */ + +/* + * vim: et ts=2 sw=2 + */ +