1 /* Support for Robot Exclusion Standard (RES).
2 Copyright (C) 2001 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
20 /* This file implements the Robot Exclusion Standard (RES).
22 RES is a simple protocol that enables site admins to signal to
23 web crawlers that certain parts of the site should not be
24 accessed. All the admin needs to do is create a "robots.txt" file
25 in the web server root, and use simple commands to allow or
26 disallow access to certain parts of the site.
28 The first specification was written by Martijn Koster in 1994, and
29 is still available at <http://www.robotstxt.org/wc/norobots.html>.
30 In 1996, Martijn wrote an Internet Draft specifying an improved RES
31 specification; however, that work was apparently abandoned since
32 the draft expired in 1997 and hasn't been replaced since. The
34 <http://www.robotstxt.org/wc/norobots-rfc.html>.
36 This file implements RES as specified by the draft. Note that this
37 only handles the "robots.txt" support. The META tag that controls
38 whether the links should be followed is handled in `html-url.c'.
42 * The end-of-line comment recognition is more in the spirit of the
43 Bourne Shell (as specified by RES-1994). That means that
44 "foo#bar" is taken literally, whereas "foo #bar" is interpreted
45 as "foo". The Draft apparently specifies that both should be
48 * We don't recognize sole CR as the line ending.
50 * We don't implement expiry mechanism for /robots.txt specs. I
51 consider it unnecessary for a relatively short-lived
52 application such as Wget. Besides, it is highly questionable
53 whether anyone deploys the recommended expiry scheme for
56 Entry points are functions res_parse, res_parse_from_file,
57 res_match_path, res_register_specs, res_get_specs, and
70 #endif /* HAVE_STRING_H */
84 int user_agent_exact_p;
90 struct path_info *paths;
93 /* Parsing the robot spec. */
95 /* Check whether AGENT (a string of length LENGTH) equals "wget" or
96 "*". If it is either of them, *matches is set to one. If it is
97 "wget", *exact_match is set to one. */
100 match_user_agent (const char *agent, int length,
101 int *matches, int *exact_match)
103 if (length == 1 && *agent == '*')
108 else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
120 /* Add a path specification between PATH_B and PATH_E as one of the
124 add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
125 int allowedp, int exactp)
128 if (path_b < path_e && *path_b == '/')
129 /* Our path representation doesn't use a leading slash, so remove
132 pp.path = strdupdelim (path_b, path_e);
133 pp.allowedp = allowedp;
134 pp.user_agent_exact_p = exactp;
136 if (specs->count > specs->size)
138 if (specs->size == 0)
142 specs->paths = xrealloc (specs->paths,
143 specs->size * sizeof (struct path_info));
145 specs->paths[specs->count - 1] = pp;
148 /* Recreate SPECS->paths with only those paths that have non-zero
149 user_agent_exact_p. */
152 prune_non_exact (struct robot_specs *specs)
154 struct path_info *newpaths;
157 for (i = 0; i < specs->count; i++)
158 if (specs->paths[i].user_agent_exact_p)
160 newpaths = xmalloc (cnt * sizeof (struct path_info));
161 for (i = 0, j = 0; i < specs->count; i++)
162 if (specs->paths[i].user_agent_exact_p)
163 newpaths[j++] = specs->paths[i];
165 xfree (specs->paths);
166 specs->paths = newpaths;
171 #define EOL(p) ((p) >= lineend)
173 #define SKIP_SPACE(p) do { \
174 while (!EOL (p) && ISSPACE (*p)) \
178 #define FIELD_IS(string_literal) \
179 BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
181 /* Parse textual RES specs beginning with SOURCE of length LENGTH.
182 Return a specs object ready to be fed to res_match_path.
184 The parsing itself is trivial, but creating a correct SPECS object
185 is trickier than it seems, because RES is surprisingly byzantine if
186 you attempt to implement it correctly.
188 A "record" is a block of one or more `User-Agent' lines followed by
189 one or more `Allow' or `Disallow' lines. A record is accepted by
190 Wget if one of the `User-Agent' lines was "wget", or if the user
193 After all the lines have been read, we examine whether an exact
194 ("wget") user-agent field was specified. If so, we delete all the
195 lines read under "User-Agent: *" blocks because we have our own
196 Wget-specific blocks. This enables the admin to say:
205 This means that to Wget and to Google, /cgi-bin is disallowed,
206 whereas for all other crawlers, everything is disallowed.
207 res_parse is implemented so that the order of records doesn't
208 matter. In the case above, the "User-Agent: *" could have come
209 after the other one. */
212 res_parse (const char *source, int length)
216 const char *p = source;
217 const char *end = source + length;
219 /* non-zero if last applicable user-agent field matches Wget. */
220 int user_agent_applies = 0;
222 /* non-zero if last applicable user-agent field *exactly* matches
224 int user_agent_exact = 0;
226 /* whether we ever encountered exact user agent. */
229 /* count of allow/disallow lines in the current "record", i.e. after
230 the last `user-agent' instructions. */
231 int record_count = 0;
233 struct robot_specs *specs = xmalloc (sizeof (struct robot_specs));
234 memset (specs, '\0', sizeof (struct robot_specs));
238 const char *lineend, *lineend_real;
239 const char *field_b, *field_e;
240 const char *value_b, *value_e;
244 lineend_real = memchr (p, '\n', end - p);
249 lineend = lineend_real;
251 /* Before doing anything else, check whether the line is empty
254 if (EOL (p) || *p == '#')
257 /* Make sure the end-of-line comments are respected by setting
258 lineend to a location preceding the first comment. Real line
259 ending remains in lineend_real. */
260 for (lineend = p; lineend < lineend_real; lineend++)
261 if ((lineend == p || ISSPACE (*(lineend - 1)))
265 /* Ignore trailing whitespace in the same way. */
266 while (lineend > p && ISSPACE (*(lineend - 1)))
272 while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
277 if (field_b == field_e || EOL (p) || *p != ':')
279 DEBUGP (("Ignoring malformed line %d", line_count));
290 /* Finally, we have a syntactically valid line. */
291 if (FIELD_IS ("user-agent"))
293 /* We have to support several cases:
300 ... matching record ...
304 ... non-matching record ...
307 ... matching record, but will be pruned later ...
309 We have to respect `User-Agent' at the beginning of each
310 new record simply because we don't know if we're going to
311 encounter "Wget" among the agents or not. Hence,
312 match_user_agent is called when record_count != 0.
314 But if record_count is 0, we have to keep calling it
315 until it matches, and if that happens, we must not call
316 it any more, until the next record. Hence the other part
318 if (record_count != 0 || user_agent_applies == 0)
319 match_user_agent (value_b, value_e - value_b,
320 &user_agent_applies, &user_agent_exact);
321 if (user_agent_exact)
325 else if (FIELD_IS ("allow"))
327 if (user_agent_applies)
329 add_path (specs, value_b, value_e, 1, user_agent_exact);
333 else if (FIELD_IS ("disallow"))
335 if (user_agent_applies)
338 if (value_b == value_e)
339 /* Empty "disallow" line means everything is
342 add_path (specs, value_b, value_e, allowed, user_agent_exact);
348 DEBUGP (("Ignoring unknown field at line %d", line_count));
359 /* We've encountered an exactly matching user-agent. Throw out
360 all the stuff with user-agent: *. */
361 prune_non_exact (specs);
363 else if (specs->size > specs->count)
365 /* add_path normally over-allocates specs->paths. Reallocate it
366 to the correct size in order to conserve some memory. */
367 specs->paths = xrealloc (specs->paths,
368 specs->count * sizeof (struct path_info));
369 specs->size = specs->count;
375 /* The same as res_parse, but first map the FILENAME into memory,
376 and then parse it. */
379 res_parse_from_file (const char *filename)
381 struct robot_specs *specs;
382 struct file_memory *fm = read_file (filename);
385 logprintf (LOG_NOTQUIET, "Cannot open %s: %s",
386 filename, strerror (errno));
389 specs = res_parse (fm->content, fm->length);
395 free_specs (struct robot_specs *specs)
398 for (i = 0; i < specs->count; i++)
399 xfree (specs->paths[i].path);
400 FREE_MAYBE (specs->paths);
404 /* Matching of a path according to the specs. */
406 /* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
407 that number is not a numerical representation of '/', decode C and
408 advance the pointer. */
410 #define DECODE_MAYBE(c, ptr) do { \
411 if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \
414 = (XCHAR_TO_XDIGIT (ptr[1]) << 4) + XCHAR_TO_XDIGIT (ptr[2]); \
415 if (decoded != '/') \
423 /* The inner matching engine: return non-zero if RECORD_PATH matches
424 URL_PATH. The rules for matching are described at
425 <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2. */
428 matches (const char *record_path, const char *url_path)
430 const char *rp = record_path;
431 const char *up = url_path;
441 DECODE_MAYBE(rc, rp);
442 DECODE_MAYBE(uc, up);
448 /* Iterate through all paths in SPECS. For the first one that
449 matches, return its allow/reject status. If none matches,
450 retrieval is by default allowed. */
453 res_match_path (const struct robot_specs *specs, const char *path)
458 for (i = 0; i < specs->count; i++)
459 if (matches (specs->paths[i].path, path))
461 int allowedp = specs->paths[i].allowedp;
462 DEBUGP (("%s path %s because of rule `%s'.\n",
463 allowedp ? "Allowing" : "Rejecting",
464 path, specs->paths[i].path));
470 /* Registering the specs. */
472 static struct hash_table *registered_specs;
474 /* Stolen from cookies.c. */
475 #define SET_HOSTPORT(host, port, result) do { \
476 int HP_len = strlen (host); \
477 result = alloca (HP_len + 1 + numdigit (port) + 1); \
478 memcpy (result, host, HP_len); \
479 result[HP_len] = ':'; \
480 number_to_string (result + HP_len + 1, port); \
483 /* Register RES specs that belong to the server on HOST:PORT. They will
484 later be retrievable using res_get_specs. */
487 res_register_specs (const char *host, int port, struct robot_specs *specs)
489 struct robot_specs *old;
491 SET_HOSTPORT (host, port, hp);
493 if (!registered_specs)
494 registered_specs = make_nocase_string_hash_table (0);
496 /* Required to shut up the compiler. */
500 if (hash_table_get_pair (registered_specs, hp, hp_old, old))
504 hash_table_put (registered_specs, hp_old, specs);
508 hash_table_put (registered_specs, xstrdup (hp), specs);
512 /* Get the specs that belong to HOST:PORT. */
515 res_get_specs (const char *host, int port)
518 SET_HOSTPORT (host, port, hp);
519 if (!registered_specs)
521 return hash_table_get (registered_specs, hp);
524 /* Loading the robots file. */
526 #define RES_SPECS_LOCATION "/robots.txt"
528 /* Retrieve the robots.txt from the server root of the server that
529 serves URL. The file will be named according to the currently
530 active rules, and the file name will be returned in *file.
532 Return non-zero if robots were retrieved OK, zero otherwise. */
535 res_retrieve_file (const char *url, char **file)
538 char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
540 logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
542 err = retrieve_url (robots_url, file, NULL, NULL, NULL);
545 if (err != RETROK && *file != NULL)
547 /* If the file is not retrieved correctly, but retrieve_url
548 allocated the file name, deallocate it here so that the
549 caller doesn't have to worry about it. */
553 return err == RETROK;
557 cleanup_hash_table_mapper (void *key, void *value, void *arg_ignored)
567 if (registered_specs)
569 hash_table_map (registered_specs, cleanup_hash_table_mapper, NULL);
570 hash_table_destroy (registered_specs);
571 registered_specs = NULL;