From: hniksic
Date: Sun, 18 Nov 2001 02:17:30 +0000 (-0800)
Subject: [svn] Plug in new implementation of RES.
X-Git-Tag: v1.13~2072
X-Git-Url: http://sjero.net/git/?p=wget;a=commitdiff_plain;h=05f90bb302e8d97723d47967e0e88d43897852c0

[svn] Plug in new implementation of RES.
Published in .
---

diff --git a/src/ChangeLog b/src/ChangeLog
index 79255c9c..f1cee06f 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,7 @@
+2001-05-12  Hrvoje Niksic  <hniksic@arsdigita.com>
+
+        * res.c: New file.  Implement all RES-related code here.
+
 2001-11-18  Hrvoje Niksic  <hniksic@arsdigita.com>
 
         * version.c: Wget 1.7.1 is released.

diff --git a/src/Makefile.in b/src/Makefile.in
index 4a440aa9..a38fea4a 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -64,8 +64,9 @@ SSL_OBJ = @SSL_OBJ@
 OBJ = $(ALLOCA) cmpt$o connect$o cookies$o fnmatch$o ftp$o   \
       ftp-basic$o ftp-ls$o $(OPIE_OBJ) getopt$o hash$o       \
       headers$o host$o html-parse$o html-url$o http$o init$o \
-      log$o main$o $(MD5_OBJ) netrc$o rbuf$o recur$o retr$o  \
-      snprintf$o $(SSL_OBJ) url$o utils$o version$o safe-ctype$o
+      log$o main$o $(MD5_OBJ) netrc$o rbuf$o recur$o res$o   \
+      retr$o safe-ctype$o snprintf$o $(SSL_OBJ) url$o        \
+      utils$o version$o
 
 .SUFFIXES:
 .SUFFIXES: .c .o ._c ._o

diff --git a/src/recur.c b/src/recur.c
index faa01e41..497c455f 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -42,6 +42,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
 #include "fnmatch.h"
 #include "host.h"
 #include "hash.h"
+#include "res.h"
 
 #ifndef errno
 extern int errno;
@@ -49,8 +50,6 @@ extern int errno;
 
 extern char *version_string;
 
-#define ROBOTS_FILENAME "robots.txt"
-
 static struct hash_table *dl_file_url_map;
 static struct hash_table *dl_url_file_map;
@@ -61,26 +60,14 @@ static slist *downloaded_html_files;
 /* List of undesirable-to-load URLs.  */
 static struct hash_table *undesirable_urls;
 
-/* List of forbidden locations.  */
-static char **forbidden = NULL;
-
 /* Current recursion depth.  */
 static int depth;
 
 /* Base directory we're recursing from (used by no_parent).  */
 static char *base_dir;
 
-/* The host name for which we last checked robots.  */
-static char *robots_host;
-
 static int first_time = 1;
 
-/* Construct the robots URL.  */
-static struct urlinfo *robots_url PARAMS ((const char *, const char *));
-static uerr_t retrieve_robots PARAMS ((const char *, const char *));
-static char **parse_robots PARAMS ((const char *));
-static int robots_match PARAMS ((struct urlinfo *, char **));
-
 /* Cleanup the data structures associated with recursive retrieving
    (the variables above).  */
 
@@ -105,12 +92,9 @@ recursive_cleanup (void)
       dl_url_file_map = NULL;
     }
   undesirable_urls = NULL;
-  free_vec (forbidden);
-  forbidden = NULL;
   slist_free (downloaded_html_files);
   downloaded_html_files = NULL;
   FREE_MAYBE (base_dir);
-  FREE_MAYBE (robots_host);
   first_time = 1;
 }
 
@@ -137,9 +121,7 @@ recursive_retrieve (const char *file, const char *this_url)
   int meta_disallow_follow;
   int this_url_ftp;            /* See below the explanation */
   uerr_t err;
-  struct urlinfo *rurl;
   urlpos *url_list, *cur_url;
-  char *rfile; /* For robots */
   struct urlinfo *u;
 
   assert (this_url != NULL);
@@ -176,8 +158,6 @@ recursive_retrieve (const char *file, const char *this_url)
         }
       freeurl (u, 1);
       depth = 1;
-      robots_host = NULL;
-      forbidden = NULL;
       first_time = 0;
     }
   else
@@ -399,39 +379,31 @@ recursive_retrieve (const char *file, const char *this_url)
           /* What about robots.txt? */
           if (!inl && opt.use_robots && u->proto == URLHTTP)
             {
-              /* Since Wget knows about only one set of robot rules at a
-                 time, /robots.txt must be reloaded whenever a new host is
-                 accessed.
-
-                 robots_host holds the host the current `forbid' variable
-                 is assigned to.  */
-              if (!robots_host || !same_host (robots_host, u->host))
+              struct robot_specs *specs = res_get_specs (u->host, u->port);
+              if (!specs)
                 {
-                  FREE_MAYBE (robots_host);
-                  /* Now make robots_host the new host, no matter what the
-                     result will be.  So if there is no /robots.txt on the
-                     site, Wget will not retry getting robots all the
-                     time.  */
-                  robots_host = xstrdup (u->host);
-                  free_vec (forbidden);
-                  forbidden = NULL;
-                  err = retrieve_robots (constr, ROBOTS_FILENAME);
-                  if (err == ROBOTSOK)
+                  char *rfile;
+                  if (res_retrieve_file (constr, &rfile))
                     {
-                      rurl = robots_url (constr, ROBOTS_FILENAME);
-                      rfile = url_filename (rurl);
-                      forbidden = parse_robots (rfile);
-                      freeurl (rurl, 1);
+                      specs = res_parse_from_file (rfile);
                       xfree (rfile);
                     }
+                  else
+                    {
+                      /* If we cannot get real specs, at least produce
+                         dummy ones so that we can register them and stop
+                         trying to retrieve them.  */
+                      specs = res_parse ("", 0);
+                    }
+                  res_register_specs (u->host, u->port, specs);
                 }
-              /* Now that we have (or don't have) robots, we can check for
-                 them.  */
-              if (!robots_match (u, forbidden))
+              /* Now that we have (or don't have) robots.txt specs, we can
+                 check what they say.  */
+              if (!res_match_path (specs, u->path))
                 {
-                  DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
-                           ROBOTS_FILENAME));
+                  DEBUGP (("Not following %s because robots.txt forbids it.\n",
+                           constr));
                   string_set_add (undesirable_urls, constr);
                   inl = 1;
                 }
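One detail of the hunk above is worth spelling out: when robots.txt cannot be
fetched, Wget registers specs parsed from an empty string.  Judging from
res_parse further down, such specs contain no path rules at all, so
res_match_path allows every path, while the registered cache entry keeps Wget
from re-requesting robots.txt for each URL on that host.  A hypothetical
assertion, for illustration only:

    struct robot_specs *empty = res_parse ("", 0);
    assert (res_match_path (empty, "/any/path"));  /* nothing is forbidden */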
@@ -650,249 +622,3 @@ convert_all_links (void)
       free_urlpos (urls);
     }
 }
-
-/* Robots support.  */
-
-/* Construct the robots URL.  */
-static struct urlinfo *
-robots_url (const char *url, const char *robots_filename)
-{
-  struct urlinfo *u = newurl ();
-  uerr_t err;
-
-  err = parseurl (url, u, 0);
-  assert (err == URLOK && u->proto == URLHTTP);
-  xfree (u->file);
-  xfree (u->dir);
-  xfree (u->url);
-  u->dir = xstrdup ("");
-  u->file = xstrdup (robots_filename);
-  u->url = str_url (u, 0);
-  return u;
-}
-
-/* Retrieves the robots_filename from the root server directory, if
-   possible.  Returns ROBOTSOK if robots were retrieved OK, and
-   NOROBOTS if robots could not be retrieved for any reason.  */
-static uerr_t
-retrieve_robots (const char *url, const char *robots_filename)
-{
-  int dt;
-  uerr_t err;
-  struct urlinfo *u;
-
-  u = robots_url (url, robots_filename);
-  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
-  err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
-  freeurl (u, 1);
-  if (err == RETROK)
-    return ROBOTSOK;
-  else
-    return NOROBOTS;
-}
-
-/* Parse the robots_filename and return the disallowed path components
-   in a malloc-ed vector of character pointers.
-
-   It should be fully compliant with the syntax as described in the
-   file norobots.txt, adopted by the robots mailing list
-   (robots@webcrawler.com).  */
-static char **
-parse_robots (const char *robots_filename)
-{
-  FILE *fp;
-  char **entries;
-  char *line, *cmd, *str, *p;
-  char *base_version, *version;
-  int num, i;
-  int wget_matched;             /* is the part meant for Wget?  */
-
-  entries = NULL;
-
-  num = 0;
-  fp = fopen (robots_filename, "rb");
-  if (!fp)
-    return NULL;
-
-  /* Kill version number.  */
-  if (opt.useragent)
-    {
-      STRDUP_ALLOCA (base_version, opt.useragent);
-      STRDUP_ALLOCA (version, opt.useragent);
-    }
-  else
-    {
-      int len = 10 + strlen (version_string);
-      base_version = (char *)alloca (len);
-      sprintf (base_version, "Wget/%s", version_string);
-      version = (char *)alloca (len);
-      sprintf (version, "Wget/%s", version_string);
-    }
-  for (p = version; *p; p++)
-    *p = TOLOWER (*p);
-  for (p = base_version; *p && *p != '/'; p++)
-    *p = TOLOWER (*p);
-  *p = '\0';
-
-  /* Setting this to 1 means that Wget considers itself under
-     restrictions by default, even if the User-Agent field is not
-     present.  However, if it finds the user-agent set to anything
-     other than Wget, the rest will be ignored (up to the following
-     User-Agent field).  Thus you may have something like:
-
-     Disallow: 1
-     Disallow: 2
-     User-Agent: stupid-robot
-     Disallow: 3
-     Disallow: 4
-     User-Agent: Wget*
-     Disallow: 5
-     Disallow: 6
-     User-Agent: *
-     Disallow: 7
-
-     In this case the 1, 2, 5, 6 and 7 disallow lines will be
-     stored.  */
-  wget_matched = 1;
-  while ((line = read_whole_line (fp)))
-    {
-      int len = strlen (line);
-      /* Destroy <CR><LF> if present.  */
-      if (len && line[len - 1] == '\n')
-        line[--len] = '\0';
-      if (len && line[len - 1] == '\r')
-        line[--len] = '\0';
-      /* According to specifications, optional space may be at the
-         end...  */
-      DEBUGP (("Line: %s\n", line));
-      /* Skip spaces.  */
-      for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
-      if (!*cmd)
-        {
-          xfree (line);
-          DEBUGP (("(chucked out)\n"));
-          continue;
-        }
-      /* Look for ':'.  */
-      for (str = cmd; *str && *str != ':'; str++);
-      if (!*str)
-        {
-          xfree (line);
-          DEBUGP (("(chucked out)\n"));
-          continue;
-        }
-      /* Zero-terminate the command.  */
-      *str++ = '\0';
-      /* Look for the string beginning...  */
-      for (; *str && ISSPACE (*str); str++);
-      /* Look for comments or trailing spaces and kill them off.  */
-      for (p = str; *p; p++)
-        if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
-          {
-            /* We have found either a shell-style comment `<sp>+#' or some
-               trailing spaces.  Now rewind to the beginning of the spaces
-               and place '\0' there.  */
-            while (p > str && ISSPACE (*p))
-              --p;
-            if (p == str)
-              *p = '\0';
-            else
-              *(p + 1) = '\0';
-            break;
-          }
-      if (!strcasecmp (cmd, "User-agent"))
-        {
-          int match = 0;
-          /* Lowercase the agent string.  */
-          for (p = str; *p; p++)
-            *p = TOLOWER (*p);
-          /* If the string is `*', it matches.  */
-          if (*str == '*' && !*(str + 1))
-            match = 1;
-          else
-            {
-              /* If the string contains wildcards, we'll run it through
-                 fnmatch().  */
-              if (has_wildcards_p (str))
-                {
-                  /* If the string contains '/', compare with the full
-                     version.  Else, compare it to base_version.  */
-                  if (strchr (str, '/'))
-                    match = !fnmatch (str, version, 0);
-                  else
-                    match = !fnmatch (str, base_version, 0);
-                }
-              else              /* Substring search */
-                {
-                  if (strstr (version, str))
-                    match = 1;
-                  else
-                    match = 0;
-                }
-            }
-          /* If Wget is not matched, skip all the entries up to the
-             next User-agent field.  */
-          wget_matched = match;
-        }
-      else if (!wget_matched)
-        {
-          xfree (line);
-          DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
-          continue;
-        }
-      else if (!strcasecmp (cmd, "Disallow"))
-        {
-          /* If "Disallow" is empty, the robot is welcome.  */
-          if (!*str)
-            {
-              free_vec (entries);
-              entries = (char **)xmalloc (sizeof (char *));
-              *entries = NULL;
-              num = 0;
-            }
-          else
-            {
-              entries = (char **)xrealloc (entries, (num + 2) * sizeof (char *));
-              entries[num] = xstrdup (str);
-              entries[++num] = NULL;
-              /* Strip trailing spaces, according to specifications.  */
-              for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--)
-                if (ISSPACE (str[i]))
-                  str[i] = '\0';
-            }
-        }
-      else
-        {
-          /* unknown command */
-          DEBUGP (("(chucked out)\n"));
-        }
-      xfree (line);
-    }
-  fclose (fp);
-  return entries;
-}
-
-/* May the URL url be loaded according to disallowing rules stored in
-   forbidden?  */
-static int
-robots_match (struct urlinfo *u, char **fb)
-{
-  int l;
-
-  if (!fb)
-    return 1;
-  DEBUGP (("Matching %s against: ", u->path));
-  for (; *fb; fb++)
-    {
-      DEBUGP (("%s ", *fb));
-      l = strlen (*fb);
-      /* If dir is fb, we may not load the file.  */
-      if (strncmp (u->path, *fb, l) == 0)
-        {
-          DEBUGP (("matched.\n"));
-          return 0;     /* Matches, i.e. does not load...  */
-        }
-    }
-  DEBUGP (("not matched.\n"));
-  return 1;
-}

diff --git a/src/res.c b/src/res.c
new file mode 100644
index 00000000..7b7f5538
--- /dev/null
+++ b/src/res.c
@@ -0,0 +1,544 @@
+/* Support for Robot Exclusion Standard (RES).
+   Copyright (C) 2001 Free Software Foundation, Inc.
+
+This file is part of Wget.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
+
+/* This file implements the Robot Exclusion Standard (RES).
+
+   RES is a simple protocol that enables site admins to signal to
+   web crawlers that certain parts of the site should not be
+   accessed.  All the admin needs to do is create a "robots.txt" file
+   in the web server root, and use simple commands to allow or
+   disallow access to certain parts of the site.
+
+   The first specification was written by Martijn Koster in 1994, and
+   is still available at <http://www.robotstxt.org/wc/norobots.html>.
+   In 1996, Martijn wrote an Internet Draft specifying an improved RES
+   specification; however, that work was apparently abandoned since
+   the draft expired in 1997 and hasn't been replaced since.  The
+   draft is available at
+   <http://www.robotstxt.org/wc/norobots-rfc.txt>.
+
+   This file implements RES as specified by the draft.  Note that this
+   only handles the "robots.txt" support.  The META tag that controls
+   whether the links should be followed is handled in `html-url.c'.
+
+   Known deviations:
+
+   * The end-of-line comment recognition is more in the spirit of the
+     Bourne Shell (as specified by RES-1994).  That means that
+     "foo#bar" is taken literally, whereas "foo #bar" is interpreted
+     as "foo".  The Draft apparently specifies that both should be
+     interpreted as "foo".
+
+   * We don't recognize sole CR as the line ending.
+
+   * We don't implement an expiry mechanism for /robots.txt specs.  I
+     consider it unnecessary for a relatively short-lived application
+     such as Wget.  Besides, it is highly questionable whether anyone
+     deploys the recommended expiry scheme for robots.txt.
+
+   Entry points are functions res_parse, res_parse_from_file,
+   res_match_path, res_register_specs, res_get_specs, and
+   res_retrieve_file.  */
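To make the first deviation above concrete, here is how this parser would
read a small robots.txt fragment (the paths are made up for illustration):

    User-agent: *
    Disallow: /tmp #scratch dir   ->  rule path "/tmp"  (comment after whitespace)
    Disallow: /a#b                ->  rule path "/a#b"  ('#' without a preceding
                                                         space is kept literally)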
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef HAVE_STRING_H
+# include <string.h>
+#else
+# include <strings.h>
+#endif /* HAVE_STRING_H */
+#include <assert.h>
+#include <errno.h>
+
+#include "wget.h"
+#include "utils.h"
+#include "hash.h"
+#include "url.h"
+#include "retr.h"
+#include "res.h"
+
+struct path_info {
+  char *path;
+  int allowedp;
+  int user_agent_exact_p;
+};
+
+struct robot_specs {
+  int count;
+  int size;
+  struct path_info *paths;
+};
+
+/* Parsing the robot spec.  */
+
+/* Check whether AGENT (a string of length LENGTH) equals "wget" or
+   "*".  If it is either of them, *matches is set to one.  If it is
+   "wget", *exact_match is set to one.  */
+
+static void
+match_user_agent (const char *agent, int length,
+                  int *matches, int *exact_match)
+{
+  if (length == 1 && *agent == '*')
+    {
+      *matches = 1;
+      *exact_match = 0;
+    }
+  else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
+    {
+      *matches = 1;
+      *exact_match = 1;
+    }
+  else
+    {
+      *matches = 0;
+      *exact_match = 0;
+    }
+}
+
+/* Add a path specification between PATH_B and PATH_E as one of the
+   paths in SPECS.  */
+
+static void
+add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
+          int allowedp, int exactp)
+{
+  struct path_info pp;
+  pp.path = strdupdelim (path_b, path_e);
+  pp.allowedp = allowedp;
+  pp.user_agent_exact_p = exactp;
+  ++specs->count;
+  if (specs->count > specs->size)
+    {
+      if (specs->size == 0)
+        specs->size = 1;
+      else
+        specs->size <<= 1;
+      specs->paths = xrealloc (specs->paths,
+                               specs->size * sizeof (struct path_info));
+    }
+  specs->paths[specs->count - 1] = pp;
+}
+
+/* Recreate SPECS->paths with only those paths that have non-zero
+   user_agent_exact_p.  */
+
+static void
+prune_non_exact (struct robot_specs *specs)
+{
+  struct path_info *newpaths;
+  int i, j, cnt;
+  cnt = 0;
+  for (i = 0; i < specs->count; i++)
+    if (specs->paths[i].user_agent_exact_p)
+      ++cnt;
+  newpaths = xmalloc (cnt * sizeof (struct path_info));
+  for (i = 0, j = 0; i < specs->count; i++)
+    if (specs->paths[i].user_agent_exact_p)
+      newpaths[j++] = specs->paths[i];
+  assert (j == cnt);
+  xfree (specs->paths);
+  specs->paths = newpaths;
+  specs->count = cnt;
+  specs->size = cnt;
+}
+
+#define EOL(p) ((p) >= lineend)
+
+#define SKIP_SPACE(p) do {                      \
+  while (!EOL (p) && ISSPACE (*p))              \
+    ++p;                                        \
+} while (0)
+
+#define FIELD_IS(string_literal)                \
+  BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
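For example, the match_user_agent helper above reports the generic wildcard
and an exact "wget" match differently; prune_non_exact later relies on that
distinction to drop the wildcard rules once exact ones are found.  A
hypothetical driver, assuming only the code in this file:

    int applies, exact;

    match_user_agent ("*", 1, &applies, &exact);      /* applies=1, exact=0 */
    match_user_agent ("Wget", 4, &applies, &exact);   /* applies=1, exact=1;
                                                         comparison ignores case */
    match_user_agent ("google", 6, &applies, &exact); /* applies=0, exact=0 */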
+
+/* Parse textual RES specs beginning with SOURCE of length LENGTH.
+   Return a specs object ready to be fed to res_match_path.
+
+   The parsing itself is trivial, but creating a correct SPECS object
+   is trickier than it seems, because RES is surprisingly byzantine if
+   you attempt to implement it correctly.
+
+   A "record" is a block of one or more `User-Agent' lines followed by
+   one or more `Allow' or `Disallow' lines.  A record is accepted by
+   Wget if one of the `User-Agent' lines was "wget", or if the user
+   agent line was "*".
+
+   After all the lines have been read, we examine whether an exact
+   ("wget") user-agent field was specified.  If so, we delete all the
+   lines read under "User-Agent: *" blocks because we have our own
+   Wget-specific blocks.  This enables the admin to say:
+
+       User-Agent: *
+       Disallow: /
+
+       User-Agent: google
+       User-Agent: wget
+       Disallow: /cgi-bin
+
+   This means that to Wget and to Google, /cgi-bin is disallowed,
+   whereas for all other crawlers, everything is disallowed.
+   res_parse is implemented so that the order of records doesn't
+   matter.  In the case above, the "User-Agent: *" could have come
+   after the other one.  */
+
+struct robot_specs *
+res_parse (const char *source, int length)
+{
+  int line_count = 1;
+
+  const char *p = source;
+  const char *end = source + length;
+
+  /* non-zero if last applicable user-agent field matches Wget.  */
+  int user_agent_applies = 0;
+
+  /* non-zero if last applicable user-agent field *exactly* matches
+     Wget.  */
+  int user_agent_exact = 0;
+
+  /* whether we ever encountered an exact user agent.  */
+  int found_exact = 0;
+
+  /* count of allow/disallow lines in the current "record", i.e. after
+     the last `user-agent' instructions.  */
+  int record_count = 0;
+
+  struct robot_specs *specs = xmalloc (sizeof (struct robot_specs));
+  memset (specs, '\0', sizeof (struct robot_specs));
+
+  while (1)
+    {
+      const char *lineend, *lineend_real;
+      const char *field_b, *field_e;
+      const char *value_b, *value_e;
+
+      if (p == end)
+        break;
+      lineend_real = memchr (p, '\n', end - p);
+      if (lineend_real)
+        ++lineend_real;
+      else
+        lineend_real = end;
+      lineend = lineend_real;
+
+      /* Before doing anything else, check whether the line is empty
+         or comment-only.  */
+      SKIP_SPACE (p);
+      if (EOL (p) || *p == '#')
+        goto next;
+
+      /* Make sure the end-of-line comments are respected by setting
+         lineend to a location preceding the first comment.  Real line
+         ending remains in lineend_real.  */
+      for (lineend = p; lineend < lineend_real; lineend++)
+        if ((lineend == p || ISSPACE (*(lineend - 1)))
+            && *lineend == '#')
+          break;
+
+      /* Ignore trailing whitespace in the same way.  */
+      while (lineend > p && ISSPACE (*(lineend - 1)))
+        --lineend;
+
+      assert (!EOL (p));
+
+      field_b = p;
+      while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
+        ++p;
+      field_e = p;
+
+      SKIP_SPACE (p);
+      if (field_b == field_e || EOL (p) || *p != ':')
+        {
+          DEBUGP (("Ignoring malformed line %d", line_count));
+          goto next;
+        }
+      ++p;                      /* skip ':' */
+      SKIP_SPACE (p);
+
+      value_b = p;
+      while (!EOL (p))
+        ++p;
+      value_e = p;
+
+      /* Finally, we have a syntactically valid line.  */
+      if (FIELD_IS ("user-agent"))
+        {
+          /* We have to support several cases:
+
+             --previous records--
+
+             User-Agent: foo
+             User-Agent: Wget
+             User-Agent: bar
+             ... matching record ...
+
+             User-Agent: baz
+             User-Agent: qux
+             ... non-matching record ...
+
+             User-Agent: *
+             ... matching record, but will be pruned later ...
+
+             We have to respect `User-Agent' at the beginning of each
+             new record simply because we don't know if we're going to
+             encounter "Wget" among the agents or not.  Hence,
+             match_user_agent is called when record_count != 0.
+
+             But if record_count is 0, we have to keep calling it
+             until it matches, and if that happens, we must not call
+             it any more, until the next record.  Hence the other part
+             of the condition.  */
+          if (record_count != 0 || user_agent_applies == 0)
+            match_user_agent (value_b, value_e - value_b,
+                              &user_agent_applies, &user_agent_exact);
+          if (user_agent_exact)
+            found_exact = 1;
+          record_count = 0;
+        }
+      else if (FIELD_IS ("allow"))
+        {
+          if (user_agent_applies)
+            {
+              add_path (specs, value_b, value_e, 1, user_agent_exact);
+            }
+          ++record_count;
+        }
+      else if (FIELD_IS ("disallow"))
+        {
+          if (user_agent_applies)
+            {
+              int allowed = 0;
+              if (value_b == value_e)
+                /* Empty "disallow" line means everything is
+                   *allowed*!  */
+                allowed = 1;
+              add_path (specs, value_b, value_e, allowed, user_agent_exact);
+            }
+          ++record_count;
+        }
+      else
+        {
+          DEBUGP (("Ignoring unknown field at line %d", line_count));
+          goto next;
+        }
+
+    next:
+      p = lineend_real;
+      ++line_count;
+    }
+
+  if (found_exact)
+    {
+      /* We've encountered an exactly matching user-agent.  Throw out
+         all the stuff with user-agent: *.  */
+      prune_non_exact (specs);
+    }
+  else if (specs->size > specs->count)
+    {
+      /* add_path normally over-allocates specs->paths.  Reallocate it
+         to the correct size in order to conserve some memory.  */
+      specs->paths = xrealloc (specs->paths,
+                               specs->count * sizeof (struct path_info));
+      specs->size = specs->count;
+    }
+
+  return specs;
+}
+
+/* The same as res_parse, but first map the FILENAME into memory,
+   and then parse it.  */
+
+struct robot_specs *
+res_parse_from_file (const char *filename)
+{
+  struct robot_specs *specs;
+  struct file_memory *fm = read_file (filename);
+  if (!fm)
+    {
+      logprintf (LOG_NOTQUIET, "Cannot open %s: %s", filename,
+                 strerror (errno));
+      return NULL;
+    }
+  specs = res_parse (fm->content, fm->length);
+  read_file_free (fm);
+  return specs;
+}
+
+static void
+free_specs (struct robot_specs *specs)
+{
+  FREE_MAYBE (specs->paths);
+  xfree (specs);
+}
+
+/* Matching of a path according to the specs.  */
+
+/* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
+   that number is not a numerical representation of '/', decode C and
+   advance the pointer.  */
+
+#define DECODE_MAYBE(c, ptr) do {                               \
+  if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2]))       \
+    {                                                           \
+      char decoded                                              \
+        = (XCHAR_TO_XDIGIT (ptr[1]) << 4) + XCHAR_TO_XDIGIT (ptr[2]); \
+      if (decoded != '/')                                       \
+        {                                                       \
+          c = decoded;                                          \
+          ptr += 2;                                             \
+        }                                                       \
+    }                                                           \
+} while (0)
+
+/* The inner matching engine: return non-zero if RECORD_PATH matches
+   URL_PATH.  The rules for matching are described at
+   <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2.  */
+
+static int
+matches (const char *record_path, const char *url_path)
+{
+  const char *rp = record_path;
+  const char *up = url_path;
+
+  for (; ; ++rp, ++up)
+    {
+      char rc = *rp;
+      char uc = *up;
+      if (!rc)
+        return 1;
+      if (!uc)
+        return 0;
+      DECODE_MAYBE (rc, rp);
+      DECODE_MAYBE (uc, up);
+      if (rc != uc)
+        return 0;
+    }
+}
+
+/* Iterate through all paths in SPECS.  For the first one that
+   matches, return its allow/reject status.  If none matches,
+   retrieval is by default allowed.  */
+
+int
+res_match_path (const struct robot_specs *specs, const char *path)
+{
+  int i;
+  if (!specs)
+    return 1;
+  for (i = 0; i < specs->count; i++)
+    if (matches (specs->paths[i].path, path))
+      {
+        int allowedp = specs->paths[i].allowedp;
+        DEBUGP (("%s path %s because of rule `%s'.\n",
+                 allowedp ? "Allowing" : "Rejecting",
+                 path, specs->paths[i].path));
+        return allowedp;
+      }
+  return 1;
+}
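A few worked examples of the matcher above, in the spirit of section 3.2.2 of
the draft (hypothetical assertions; matches is static to this file, so this
is illustration rather than a usable test):

    assert (matches ("/tmp", "/tmp/index.html"));     /* rule is a prefix of the path */
    assert (! matches ("/tmp/", "/tmp"));             /* URL path ends too early */
    assert (matches ("/a%3cd.html", "/a<d.html"));    /* %3C decodes to '<' */
    assert (! matches ("/a%2fb.html", "/a/b.html"));  /* %2F never matches a real '/' */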
+
+/* Registering the specs.  */
+
+struct hash_table *registered_specs;
+
+/* Stolen from cookies.c.  */
+
+#define SET_HOSTPORT(host, port, result) do {           \
+  int HP_len = strlen (host);                           \
+  result = alloca (HP_len + 1 + numdigit (port) + 1);   \
+  memcpy (result, host, HP_len);                        \
+  result[HP_len] = ':';                                 \
+  long_to_string (result + HP_len + 1, port);           \
+} while (0)
+
+/* Register RES specs that belong to the server on HOST:PORT.  They
+   will later be retrievable using res_get_specs.  */
+
+void
+res_register_specs (const char *host, int port, struct robot_specs *specs)
+{
+  struct robot_specs *old;
+  char *hp, *hp_old;
+  SET_HOSTPORT (host, port, hp);
+
+  if (!registered_specs)
+    registered_specs = make_nocase_string_hash_table (0);
+
+  if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
+    {
+      if (old)
+        free_specs (old);
+      hash_table_put (registered_specs, hp_old, specs);
+    }
+  else
+    {
+      hash_table_put (registered_specs, xstrdup (hp), specs);
+    }
+}
+
+/* Get the specs that belong to HOST:PORT.  */
+
+struct robot_specs *
+res_get_specs (const char *host, int port)
+{
+  char *hp;
+  SET_HOSTPORT (host, port, hp);
+  if (!registered_specs)
+    return NULL;
+  return hash_table_get (registered_specs, hp);
+}
+
+/* Loading the robots file.  */
+
+#define RES_SPECS_LOCATION "/robots.txt"
+
+/* Retrieve the robots.txt from the server root of the server that
+   serves URL.  The file will be named according to the currently
+   active rules, and the file name will be returned in *file.
+
+   Return non-zero if robots were retrieved OK, zero otherwise.  */
+
+int
+res_retrieve_file (const char *url, char **file)
+{
+  uerr_t err;
+  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
+
+  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
+  *file = NULL;
+  err = retrieve_url (robots_url, file, NULL, NULL, NULL);
+  xfree (robots_url);
+
+  if (err != RETROK && *file != NULL)
+    {
+      /* If the file is not retrieved correctly, but retrieve_url
+         allocated the file name, deallocate it here so that the
+         caller doesn't have to worry about it.  */
+      xfree (*file);
+      *file = NULL;
+    }
+  return err == RETROK;
+}

diff --git a/src/res.h b/src/res.h
new file mode 100644
index 00000000..97b89778
--- /dev/null
+++ b/src/res.h
@@ -0,0 +1,31 @@
+/* Declarations for res.c.
+   Copyright (C) 2001 Free Software Foundation, Inc.
+
+This file is part of Wget.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
+
+struct robot_specs;
+
+struct robot_specs *res_parse PARAMS ((const char *, int));
+struct robot_specs *res_parse_from_file PARAMS ((const char *));
+
+int res_match_path PARAMS ((const struct robot_specs *, const char *));
+
+void res_register_specs PARAMS ((const char *, int, struct robot_specs *));
+struct robot_specs *res_get_specs PARAMS ((const char *, int));
+
+int res_retrieve_file PARAMS ((const char *, char **));
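Taken together, the intended calling sequence is the one the new recur.c code
follows: consult the per-host cache, fetch and parse robots.txt on a miss,
register the result (even empty specs, so failures are not retried), then
test each candidate path.  A minimal sketch under those assumptions, with a
hypothetical host and port:

    struct robot_specs *specs = res_get_specs ("www.example.com", 80);
    if (!specs)
      {
        char *rfile;
        if (res_retrieve_file ("http://www.example.com/", &rfile))
          {
            specs = res_parse_from_file (rfile);
            xfree (rfile);
          }
        else
          specs = res_parse ("", 0);   /* dummy specs: remember the failure */
        res_register_specs ("www.example.com", 80, specs);
      }

    if (!res_match_path (specs, "/cgi-bin/query"))
      ;  /* robots.txt forbids this path -- skip it */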