#include "fnmatch.h"
#include "host.h"
#include "hash.h"
+#include "res.h"
#ifndef errno
extern int errno;
extern char *version_string;
-#define ROBOTS_FILENAME "robots.txt"
-
static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;
/* List of undesirable-to-load URLs. */
static struct hash_table *undesirable_urls;
-/* List of forbidden locations. */
-static char **forbidden = NULL;
-
/* Current recursion depth. */
static int depth;
/* Base directory we're recursing from (used by no_parent). */
static char *base_dir;
-/* The host name for which we last checked robots. */
-static char *robots_host;
-
static int first_time = 1;
-/* Construct the robots URL. */
-static struct urlinfo *robots_url PARAMS ((const char *, const char *));
-static uerr_t retrieve_robots PARAMS ((const char *, const char *));
-static char **parse_robots PARAMS ((const char *));
-static int robots_match PARAMS ((struct urlinfo *, char **));
-
/* Cleanup the data structures associated with recursive retrieving
(the variables above). */
dl_url_file_map = NULL;
}
undesirable_urls = NULL;
- free_vec (forbidden);
- forbidden = NULL;
slist_free (downloaded_html_files);
downloaded_html_files = NULL;
FREE_MAYBE (base_dir);
- FREE_MAYBE (robots_host);
first_time = 1;
}
int meta_disallow_follow;
int this_url_ftp; /* See below the explanation */
uerr_t err;
- struct urlinfo *rurl;
urlpos *url_list, *cur_url;
- char *rfile; /* For robots */
struct urlinfo *u;
assert (this_url != NULL);
}
freeurl (u, 1);
depth = 1;
- robots_host = NULL;
- forbidden = NULL;
first_time = 0;
}
else
/* What about robots.txt? */
if (!inl && opt.use_robots && u->proto == URLHTTP)
{
- /* Since Wget knows about only one set of robot rules at a
- time, /robots.txt must be reloaded whenever a new host is
- accessed.
-
- robots_host holds the host the current `forbid' variable
- is assigned to. */
- if (!robots_host || !same_host (robots_host, u->host))
+ struct robot_specs *specs = res_get_specs (u->host, u->port);
+ if (!specs)
{
- FREE_MAYBE (robots_host);
- /* Now make robots_host the new host, no matter what the
- result will be. So if there is no /robots.txt on the
- site, Wget will not retry getting robots all the
- time. */
- robots_host = xstrdup (u->host);
- free_vec (forbidden);
- forbidden = NULL;
- err = retrieve_robots (constr, ROBOTS_FILENAME);
- if (err == ROBOTSOK)
+ char *rfile;
+ if (res_retrieve_file (constr, &rfile))
{
- rurl = robots_url (constr, ROBOTS_FILENAME);
- rfile = url_filename (rurl);
- forbidden = parse_robots (rfile);
- freeurl (rurl, 1);
+ specs = res_parse_from_file (rfile);
xfree (rfile);
}
+ else
+ {
+ /* If we cannot get real specs, at least produce
+ dummy ones so that we can register them and stop
+ trying to retrieve them. */
+ specs = res_parse ("", 0);
+ }
+ res_register_specs (u->host, u->port, specs);
}
- /* Now that we have (or don't have) robots, we can check for
- them. */
- if (!robots_match (u, forbidden))
+ /* Now that we have (or don't have) robots.txt specs, we can
+ check what they say. */
+ if (!res_match_path (specs, u->path))
{
- DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
- ROBOTS_FILENAME));
+ DEBUGP (("Not following %s because robots.txt forbids it.\n",
+ constr));
string_set_add (undesirable_urls, constr);
inl = 1;
}
free_urlpos (urls);
}
}
-\f
-/* Robots support. */
-
-/* Construct the robots URL. */
-static struct urlinfo *
-robots_url (const char *url, const char *robots_filename)
-{
- struct urlinfo *u = newurl ();
- uerr_t err;
-
- err = parseurl (url, u, 0);
- assert (err == URLOK && u->proto == URLHTTP);
- xfree (u->file);
- xfree (u->dir);
- xfree (u->url);
- u->dir = xstrdup ("");
- u->file = xstrdup (robots_filename);
- u->url = str_url (u, 0);
- return u;
-}
-
-/* Retrieves the robots_filename from the root server directory, if
- possible. Returns ROBOTSOK if robots were retrieved OK, and
- NOROBOTS if robots could not be retrieved for any reason. */
-static uerr_t
-retrieve_robots (const char *url, const char *robots_filename)
-{
- int dt;
- uerr_t err;
- struct urlinfo *u;
-
- u = robots_url (url, robots_filename);
- logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
- err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
- freeurl (u, 1);
- if (err == RETROK)
- return ROBOTSOK;
- else
- return NOROBOTS;
-}
-
-/* Parse the robots_filename and return the disallowed path components
- in a malloc-ed vector of character pointers.
-
- It should be fully compliant with the syntax as described in the
- file norobots.txt, adopted by the robots mailing list
- (robots@webcrawler.com). */
-static char **
-parse_robots (const char *robots_filename)
-{
- FILE *fp;
- char **entries;
- char *line, *cmd, *str, *p;
- char *base_version, *version;
- int num, i;
- int wget_matched; /* is the part meant for Wget? */
-
- entries = NULL;
-
- num = 0;
- fp = fopen (robots_filename, "rb");
- if (!fp)
- return NULL;
-
- /* Kill version number. */
- if (opt.useragent)
- {
- STRDUP_ALLOCA (base_version, opt.useragent);
- STRDUP_ALLOCA (version, opt.useragent);
- }
- else
- {
- int len = 10 + strlen (version_string);
- base_version = (char *)alloca (len);
- sprintf (base_version, "Wget/%s", version_string);
- version = (char *)alloca (len);
- sprintf (version, "Wget/%s", version_string);
- }
- for (p = version; *p; p++)
- *p = TOLOWER (*p);
- for (p = base_version; *p && *p != '/'; p++)
- *p = TOLOWER (*p);
- *p = '\0';
-
- /* Setting this to 1 means that Wget considers itself under
- restrictions by default, even if the User-Agent field is not
- present. However, if it finds the user-agent set to anything
- other than Wget, the rest will be ignored (up to the following
- User-Agent field). Thus you may have something like:
-
- Disallow: 1
- Disallow: 2
- User-Agent: stupid-robot
- Disallow: 3
- Disallow: 4
- User-Agent: Wget*
- Disallow: 5
- Disallow: 6
- User-Agent: *
- Disallow: 7
-
- In this case the 1, 2, 5, 6 and 7 disallow lines will be
- stored. */
- wget_matched = 1;
- while ((line = read_whole_line (fp)))
- {
- int len = strlen (line);
- /* Destroy <CR><LF> if present. */
- if (len && line[len - 1] == '\n')
- line[--len] = '\0';
- if (len && line[len - 1] == '\r')
- line[--len] = '\0';
- /* According to specifications, optional space may be at the
- end... */
- DEBUGP (("Line: %s\n", line));
- /* Skip spaces. */
- for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
- if (!*cmd)
- {
- xfree (line);
- DEBUGP (("(chucked out)\n"));
- continue;
- }
- /* Look for ':'. */
- for (str = cmd; *str && *str != ':'; str++);
- if (!*str)
- {
- xfree (line);
- DEBUGP (("(chucked out)\n"));
- continue;
- }
- /* Zero-terminate the command. */
- *str++ = '\0';
- /* Look for the string beginning... */
- for (; *str && ISSPACE (*str); str++);
- /* Look for comments or trailing spaces and kill them off. */
- for (p = str; *p; p++)
- if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
- {
- /* We have found either a shell-style comment `<sp>+#' or some
- trailing spaces. Now rewind to the beginning of the spaces
- and place '\0' there. */
- while (p > str && ISSPACE (*p))
- --p;
- if (p == str)
- *p = '\0';
- else
- *(p + 1) = '\0';
- break;
- }
- if (!strcasecmp (cmd, "User-agent"))
- {
- int match = 0;
- /* Lowercase the agent string. */
- for (p = str; *p; p++)
- *p = TOLOWER (*p);
- /* If the string is `*', it matches. */
- if (*str == '*' && !*(str + 1))
- match = 1;
- else
- {
- /* If the string contains wildcards, we'll run it through
- fnmatch(). */
- if (has_wildcards_p (str))
- {
- /* If the string contains '/', compare with the full
- version. Else, compare it to base_version. */
- if (strchr (str, '/'))
- match = !fnmatch (str, version, 0);
- else
- match = !fnmatch (str, base_version, 0);
- }
- else /* Substring search */
- {
- if (strstr (version, str))
- match = 1;
- else
- match = 0;
- }
- }
- /* If Wget is not matched, skip all the entries up to the
- next User-agent field. */
- wget_matched = match;
- }
- else if (!wget_matched)
- {
- xfree (line);
- DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
- continue;
- }
- else if (!strcasecmp (cmd, "Disallow"))
- {
- /* If "Disallow" is empty, the robot is welcome. */
- if (!*str)
- {
- free_vec (entries);
- entries = (char **)xmalloc (sizeof (char *));
- *entries = NULL;
- num = 0;
- }
- else
- {
- entries = (char **)xrealloc (entries, (num + 2)* sizeof (char *));
- entries[num] = xstrdup (str);
- entries[++num] = NULL;
- /* Strip trailing spaces, according to specifications. */
- for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--)
- if (ISSPACE (str[i]))
- str[i] = '\0';
- }
- }
- else
- {
- /* unknown command */
- DEBUGP (("(chucked out)\n"));
- }
- xfree (line);
- }
- fclose (fp);
- return entries;
-}
-
-/* May the URL url be loaded according to disallowing rules stored in
- forbidden? */
-static int
-robots_match (struct urlinfo *u, char **fb)
-{
- int l;
-
- if (!fb)
- return 1;
- DEBUGP (("Matching %s against: ", u->path));
- for (; *fb; fb++)
- {
- DEBUGP (("%s ", *fb));
- l = strlen (*fb);
- /* If dir is fb, we may not load the file. */
- if (strncmp (u->path, *fb, l) == 0)
- {
- DEBUGP (("matched.\n"));
- return 0; /* Matches, i.e. does not load... */
- }
- }
- DEBUGP (("not matched.\n"));
- return 1;
-}
--- /dev/null
+/* Support for Robot Exclusion Standard (RES).
+ Copyright (C) 2001 Free Software Foundation, Inc.
+
+This file is part of Wget.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/* This file implements the Robot Exclusion Standard (RES).
+
+   RES is a simple protocol that enables site admins to signal to
+   web crawlers that certain parts of the site should not be
+   accessed.  All the admin needs to do is create a "robots.txt" file
+   in the web server root, and use simple commands to allow or
+   disallow access to certain parts of the site.
+
+   The first specification was written by Martijn Koster in 1994, and
+   is still available at <http://www.robotstxt.org/wc/norobots.html>.
+   In 1996, Martijn wrote an Internet Draft with an improved version
+   of the specification; however, that work was apparently abandoned:
+   the draft expired in 1997 and has not been replaced since.  The
+   draft is available at
+   <http://www.robotstxt.org/wc/norobots-rfc.html>.
+
+ This file implements RES as specified by the draft. Note that this
+ only handles the "robots.txt" support. The META tag that controls
+ whether the links should be followed is handled in `html-url.c'.
+
+ Known deviations:
+
+ * The end-of-line comment recognition is more in the spirit of the
+ Bourne Shell (as specified by RES-1994). That means that
+ "foo#bar" is taken literally, whereas "foo #bar" is interpreted
+ as "foo". The Draft apparently specifies that both should be
+ interpreted as "foo".
+
+   * We don't recognize a sole CR as a line ending.
+
+   * We don't implement an expiry mechanism for /robots.txt specs.  I
+     consider it unnecessary for a relatively short-lived application
+     such as Wget.  Besides, it is highly questionable whether anyone
+     actually deploys the recommended expiry scheme for robots.txt.
+
+ Entry points are functions res_parse, res_parse_from_file,
+ res_match_path, res_register_specs, res_get_specs, and
+ res_retrieve_file. */
+
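+/* For illustration, a typical caller (cf. the recur.c changes above)
+   strings these entry points together roughly as follows; the
+   variable names here are hypothetical:
+
+     struct robot_specs *specs = res_get_specs (host, port);
+     if (!specs)
+       {
+         char *rfile;
+         if (res_retrieve_file (url, &rfile))
+           {
+             specs = res_parse_from_file (rfile);
+             xfree (rfile);
+           }
+         else
+           specs = res_parse ("", 0);
+         res_register_specs (host, port, specs);
+       }
+     if (!res_match_path (specs, path))
+       ...the URL is not followed...  */
+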
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef HAVE_STRING_H
+# include <string.h>
+#else
+# include <strings.h>
+#endif /* HAVE_STRING_H */
+#include <errno.h>
+#include <assert.h>
+
+#include "wget.h"
+#include "utils.h"
+#include "hash.h"
+#include "url.h"
+#include "retr.h"
+#include "res.h"
+
+struct path_info {
+ char *path;
+ int allowedp;
+ int user_agent_exact_p;
+};
+
+struct robot_specs {
+ int count;
+ int size;
+ struct path_info *paths;
+};
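+
+/* For instance, after parsing "Disallow: /cgi-bin" in a record whose
+   user-agent was "wget", SPECS would contain a single path_info of
+   { path = "/cgi-bin", allowedp = 0, user_agent_exact_p = 1 }.  */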
+\f
+/* Parsing the robot spec. */
+
+/* Check whether AGENT (a string of length LENGTH) equals "wget" or
+   "*".  If it is either of them, *MATCHES is set to one.  If it is
+   "wget", *EXACT_MATCH is also set to one.  */
+
+static void
+match_user_agent (const char *agent, int length,
+ int *matches, int *exact_match)
+{
+ if (length == 1 && *agent == '*')
+ {
+ *matches = 1;
+ *exact_match = 0;
+ }
+ else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
+ {
+ *matches = 1;
+ *exact_match = 1;
+ }
+ else
+ {
+ *matches = 0;
+ *exact_match = 0;
+ }
+}
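+
+/* For instance, AGENT "*" of LENGTH 1 yields *MATCHES = 1 and
+   *EXACT_MATCH = 0; AGENT "Wget" (compared case-insensitively)
+   yields 1 and 1; anything else, including a full version string
+   such as the hypothetical "Wget/1.8", yields 0 and 0.  */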
+
+/* Add a path specification between PATH_B and PATH_E as one of the
+ paths in SPECS. */
+
+static void
+add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
+ int allowedp, int exactp)
+{
+ struct path_info pp;
+ pp.path = strdupdelim (path_b, path_e);
+ pp.allowedp = allowedp;
+ pp.user_agent_exact_p = exactp;
+ ++specs->count;
+ if (specs->count > specs->size)
+ {
+ if (specs->size == 0)
+ specs->size = 1;
+ else
+ specs->size <<= 1;
+ specs->paths = xrealloc (specs->paths,
+ specs->size * sizeof (struct path_info));
+ }
+ specs->paths[specs->count - 1] = pp;
+}
+
+/* Recreate SPECS->paths with only those paths that have non-zero
+ user_agent_exact_p. */
+
+static void
+prune_non_exact (struct robot_specs *specs)
+{
+ struct path_info *newpaths;
+ int i, j, cnt;
+ cnt = 0;
+ for (i = 0; i < specs->count; i++)
+ if (specs->paths[i].user_agent_exact_p)
+ ++cnt;
+ newpaths = xmalloc (cnt * sizeof (struct path_info));
+ for (i = 0, j = 0; i < specs->count; i++)
+ if (specs->paths[i].user_agent_exact_p)
+ newpaths[j++] = specs->paths[i];
+ assert (j == cnt);
+ xfree (specs->paths);
+ specs->paths = newpaths;
+ specs->count = cnt;
+ specs->size = cnt;
+}
+
+#define EOL(p) ((p) >= lineend)
+
+#define SKIP_SPACE(p) do { \
+ while (!EOL (p) && ISSPACE (*p)) \
+ ++p; \
+} while (0)
+
+#define FIELD_IS(string_literal) \
+ BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
+
+/* Parse textual RES specs beginning with SOURCE of length LENGTH.
+   Return a specs object ready to be fed to res_match_path.
+
+   The parsing itself is trivial, but creating a correct SPECS object
+   is trickier than it seems, because RES is surprisingly byzantine if
+   you attempt to implement it correctly.
+
+   A "record" is a block of one or more `User-Agent' lines followed by
+   one or more `Allow' or `Disallow' lines.  A record is accepted by
+   Wget if one of its `User-Agent' lines was "wget", or if the
+   user-agent line was "*".
+
+ After all the lines have been read, we examine whether an exact
+ ("wget") user-agent field was specified. If so, we delete all the
+ lines read under "User-Agent: *" blocks because we have our own
+ Wget-specific blocks. This enables the admin to say:
+
+ User-Agent: *
+ Disallow: /
+
+ User-Agent: google
+ User-Agent: wget
+ Disallow: /cgi-bin
+
+ This means that to Wget and to Google, /cgi-bin is disallowed,
+ whereas for all other crawlers, everything is disallowed.
+ res_parse is implemented so that the order of records doesn't
+ matter. In the case above, the "User-Agent: *" could have come
+ after the other one. */
+
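+/* To make the example above concrete: from that file, res_parse
+   first records the inexact path "/" (under "User-Agent: *") and the
+   exact path "/cgi-bin" (under the record that names "wget").  Since
+   an exact user-agent was seen, prune_non_exact then discards the
+   "/" entry, leaving specs that disallow only /cgi-bin.  */
+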
+struct robot_specs *
+res_parse (const char *source, int length)
+{
+ int line_count = 1;
+
+ const char *p = source;
+ const char *end = source + length;
+
+ /* non-zero if last applicable user-agent field matches Wget. */
+ int user_agent_applies = 0;
+
+ /* non-zero if last applicable user-agent field *exactly* matches
+ Wget. */
+ int user_agent_exact = 0;
+
+ /* whether we ever encountered exact user agent. */
+ int found_exact = 0;
+
+ /* count of allow/disallow lines in the current "record", i.e. after
+ the last `user-agent' instructions. */
+ int record_count = 0;
+
+ struct robot_specs *specs = xmalloc (sizeof (struct robot_specs));
+ memset (specs, '\0', sizeof (struct robot_specs));
+
+ while (1)
+ {
+ const char *lineend, *lineend_real;
+ const char *field_b, *field_e;
+ const char *value_b, *value_e;
+
+ if (p == end)
+ break;
+ lineend_real = memchr (p, '\n', end - p);
+ if (lineend_real)
+ ++lineend_real;
+ else
+ lineend_real = end;
+ lineend = lineend_real;
+
+ /* Before doing anything else, check whether the line is empty
+ or comment-only. */
+ SKIP_SPACE (p);
+ if (EOL (p) || *p == '#')
+ goto next;
+
+ /* Make sure the end-of-line comments are respected by setting
+ lineend to a location preceding the first comment. Real line
+ ending remains in lineend_real. */
+ for (lineend = p; lineend < lineend_real; lineend++)
+ if ((lineend == p || ISSPACE (*(lineend - 1)))
+ && *lineend == '#')
+ break;
+
+ /* Ignore trailing whitespace in the same way. */
+ while (lineend > p && ISSPACE (*(lineend - 1)))
+ --lineend;
+
+ assert (!EOL (p));
+
+ field_b = p;
+ while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
+ ++p;
+ field_e = p;
+
+ SKIP_SPACE (p);
+ if (field_b == field_e || EOL (p) || *p != ':')
+ {
+ DEBUGP (("Ignoring malformed line %d", line_count));
+ goto next;
+ }
+ ++p; /* skip ':' */
+ SKIP_SPACE (p);
+
+ value_b = p;
+ while (!EOL (p))
+ ++p;
+ value_e = p;
+
+ /* Finally, we have a syntactically valid line. */
+ if (FIELD_IS ("user-agent"))
+ {
+ /* We have to support several cases:
+
+ --previous records--
+
+ User-Agent: foo
+ User-Agent: Wget
+ User-Agent: bar
+ ... matching record ...
+
+ User-Agent: baz
+ User-Agent: qux
+ ... non-matching record ...
+
+ User-Agent: *
+ ... matching record, but will be pruned later ...
+
+ We have to respect `User-Agent' at the beginning of each
+ new record simply because we don't know if we're going to
+ encounter "Wget" among the agents or not. Hence,
+ match_user_agent is called when record_count != 0.
+
+ But if record_count is 0, we have to keep calling it
+ until it matches, and if that happens, we must not call
+ it any more, until the next record. Hence the other part
+ of the condition. */
+ if (record_count != 0 || user_agent_applies == 0)
+ match_user_agent (value_b, value_e - value_b,
+ &user_agent_applies, &user_agent_exact);
+ if (user_agent_exact)
+ found_exact = 1;
+ record_count = 0;
+ }
+ else if (FIELD_IS ("allow"))
+ {
+ if (user_agent_applies)
+ {
+ add_path (specs, value_b, value_e, 1, user_agent_exact);
+ }
+ ++record_count;
+ }
+ else if (FIELD_IS ("disallow"))
+ {
+ if (user_agent_applies)
+ {
+ int allowed = 0;
+ if (value_b == value_e)
+ /* Empty "disallow" line means everything is
+ *allowed*! */
+ allowed = 1;
+ add_path (specs, value_b, value_e, allowed, user_agent_exact);
+ }
+ ++record_count;
+ }
+ else
+ {
+ DEBUGP (("Ignoring unknown field at line %d", line_count));
+ goto next;
+ }
+
+ next:
+ p = lineend_real;
+ ++line_count;
+ }
+
+ if (found_exact)
+ {
+ /* We've encountered an exactly matching user-agent. Throw out
+ all the stuff with user-agent: *. */
+ prune_non_exact (specs);
+ }
+ else if (specs->size > specs->count)
+ {
+ /* add_path normally over-allocates specs->paths. Reallocate it
+ to the correct size in order to conserve some memory. */
+ specs->paths = xrealloc (specs->paths,
+ specs->count * sizeof (struct path_info));
+ specs->size = specs->count;
+ }
+
+ return specs;
+}
+
+/* The same as res_parse, but first map FILENAME into memory and then
+   parse it.  */
+
+struct robot_specs *
+res_parse_from_file (const char *filename)
+{
+ struct robot_specs *specs;
+ struct file_memory *fm = read_file (filename);
+ if (!fm)
+ {
+      logprintf (LOG_NOTQUIET, "Cannot open %s: %s\n",
+                 filename, strerror (errno));
+ return NULL;
+ }
+ specs = res_parse (fm->content, fm->length);
+ read_file_free (fm);
+ return specs;
+}
+
+static void
+free_specs (struct robot_specs *specs)
+{
+ FREE_MAYBE (specs->paths);
+ xfree (specs);
+}
+\f
+/* Matching of a path according to the specs. */
+
+/* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
+ that number is not a numerical representation of '/', decode C and
+ advance the pointer. */
+
+#define DECODE_MAYBE(c, ptr) do { \
+ if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \
+ { \
+ char decoded \
+ = (XCHAR_TO_XDIGIT (ptr[1]) << 4) + XCHAR_TO_XDIGIT (ptr[2]); \
+ if (decoded != '/') \
+ { \
+ c = decoded; \
+ ptr += 2; \
+ } \
+ } \
+} while (0)
+
+/* The inner matching engine: return non-zero if RECORD_PATH matches
+ URL_PATH. The rules for matching are described at
+ <http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html>,
+ section 3.2.2. */
+
+static int
+matches (const char *record_path, const char *url_path)
+{
+ const char *rp = record_path;
+ const char *up = url_path;
+
+ for (; ; ++rp, ++up)
+ {
+ char rc = *rp;
+ char uc = *up;
+ if (!rc)
+ return 1;
+ if (!uc)
+ return 0;
+      DECODE_MAYBE (rc, rp);
+      DECODE_MAYBE (uc, up);
+ if (rc != uc)
+ return 0;
+ }
+}
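+
+/* For example (cf. section 3.2.2 of the draft): a record path of
+   "/a%3cd.html" matches the URL path "/a%3Cd.html", since %3c and
+   %3C both decode to '<'.  But because '/' is deliberately left
+   undecoded, "/a%2fb.html" matches "/a%2fb.html" literally, and does
+   not match "/a/b.html".  */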
+
+/* Iterate through all paths in SPECS. For the first one that
+ matches, return its allow/reject status. If none matches,
+ retrieval is by default allowed. */
+
+int
+res_match_path (const struct robot_specs *specs, const char *path)
+{
+ int i;
+ if (!specs)
+ return 1;
+ for (i = 0; i < specs->count; i++)
+ if (matches (specs->paths[i].path, path))
+ {
+ int allowedp = specs->paths[i].allowedp;
+ DEBUGP (("%s path %s because of rule `%s'.\n",
+ allowedp ? "Allowing" : "Rejecting",
+ path, specs->paths[i].path));
+ return allowedp;
+ }
+ return 1;
+}
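+
+/* Continuing the robots.txt example given above res_parse: with
+   those specs, res_match_path (specs, "/cgi-bin/query") returns 0,
+   whereas res_match_path (specs, "/index.html") returns 1, as does
+   any call with NULL SPECS.  */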
+\f
+/* Registering the specs. */
+
+struct hash_table *registered_specs;
+
+/* Stolen from cookies.c. */
+#define SET_HOSTPORT(host, port, result) do { \
+ int HP_len = strlen (host); \
+ result = alloca (HP_len + 1 + numdigit (port) + 1); \
+ memcpy (result, host, HP_len); \
+ result[HP_len] = ':'; \
+ long_to_string (result + HP_len + 1, port); \
+} while (0)
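+
+/* E.g. SET_HOSTPORT ("www.example.com", 80, result) leaves RESULT
+   pointing to the alloca'd string "www.example.com:80".  */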
+
+/* Register the RES specs that belong to the server on HOST:PORT.
+   They will later be retrievable using res_get_specs.  */
+
+void
+res_register_specs (const char *host, int port, struct robot_specs *specs)
+{
+ struct robot_specs *old;
+ char *hp, *hp_old;
+ SET_HOSTPORT (host, port, hp);
+
+ if (!registered_specs)
+ registered_specs = make_nocase_string_hash_table (0);
+
+  if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
+ {
+ if (old)
+ free_specs (old);
+ hash_table_put (registered_specs, hp_old, specs);
+ }
+ else
+ {
+ hash_table_put (registered_specs, xstrdup (hp), specs);
+ }
+}
+
+/* Get the specs that belong to HOST:PORT. */
+
+struct robot_specs *
+res_get_specs (const char *host, int port)
+{
+ char *hp;
+ SET_HOSTPORT (host, port, hp);
+ if (!registered_specs)
+ return NULL;
+ return hash_table_get (registered_specs, hp);
+}
+\f
+/* Loading the robots file. */
+
+#define RES_SPECS_LOCATION "/robots.txt"
+
+/* Retrieve the robots.txt file from the root of the server that
+   serves URL.  The file will be named according to the currently
+   active rules, and the file name will be returned in *FILE.
+
+   Return non-zero if robots were retrieved OK, zero otherwise.  */
+
+int
+res_retrieve_file (const char *url, char **file)
+{
+ uerr_t err;
+ char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
+
+ logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
+ *file = NULL;
+ err = retrieve_url (robots_url, file, NULL, NULL, NULL);
+ xfree (robots_url);
+
+ if (err != RETROK && *file != NULL)
+ {
+      /* If the file is not retrieved correctly, but retrieve_url
+         allocated the file name, deallocate it here so that the
+         caller doesn't have to worry about it.  */
+ xfree (*file);
+ *file = NULL;
+ }
+ return err == RETROK;
+}