1 /* Support for Robot Exclusion Standard (RES).
2 Copyright (C) 2001,2006 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software Foundation, Inc.,
18 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
30 /* This file implements the Robot Exclusion Standard (RES).
32 RES is a simple protocol that enables site admins to signal to
33 the web crawlers that certain parts of the site should not be
34 accessed. All the admin needs to do is create a "robots.txt" file
35 in the web server root, and use simple commands to allow or
36 disallow access to certain parts of the site.
38 The first specification was written by Martijn Koster in 1994, and
39 is still available at <http://www.robotstxt.org/wc/norobots.html>.
40 In 1996, Martijn wrote an Internet Draft specifying an improved RES
41 specification; however, that work was apparently abandoned since
42 the draft has expired in 1997 and hasn't been replaced since. The
44 draft is available at <http://www.robotstxt.org/wc/norobots-rfc.html>.
46 This file implements RES as specified by the draft. Note that this
47 only handles the "robots.txt" support. The META tag that controls
48 whether the links should be followed is handled in `html-url.c'.
52 * The end-of-line comment recognition is more in the spirit of the
53 Bourne Shell (as specified by RES-1994). That means that
54 "foo#bar" is taken literally, whereas "foo #bar" is interpreted
55 as "foo". The Draft apparently specifies that both should be interpreted as "foo".
58 * We don't recognize sole CR as the line ending.
60 * We don't implement the expiry mechanism for /robots.txt specs. I
61 consider it unnecessary for a relatively short-lived
62 application such as Wget. Besides, it is highly questionable
63 whether anyone deploys the recommended expiry scheme for
66 Entry points are functions res_parse, res_parse_from_file,
67 res_match_path, res_register_specs, res_get_specs, and res_retrieve_file.
94 bool user_agent_exact_p;
100 struct path_info *paths;
103 /* Parsing the robot spec. */
105 /* Check whether AGENT (a string of length LENGTH) equals "wget" or
106 "*". If it is either of them, *matches is set to true. If it is
107 "wget", *exact_match is set to true. */
110 match_user_agent (const char *agent, int length,
111 bool *matches, bool *exact_match)
113 if (length == 1 && *agent == '*')
116 *exact_match = false;
118 else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
126 *exact_match = false;
130 /* Add a path specification between PATH_B and PATH_E as one of the
134 add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
135 bool allowedp, bool exactp)
138 if (path_b < path_e && *path_b == '/')
139 /* Our path representation doesn't use a leading slash, so remove
142 pp.path = strdupdelim (path_b, path_e);
143 pp.allowedp = allowedp;
144 pp.user_agent_exact_p = exactp;
146 if (specs->count > specs->size)
148 if (specs->size == 0)
152 specs->paths = xrealloc (specs->paths,
153 specs->size * sizeof (struct path_info));
155 specs->paths[specs->count - 1] = pp;
158 /* Recreate SPECS->paths with only those paths that have
159 user_agent_exact_p set to true. */
162 prune_non_exact (struct robot_specs *specs)
164 struct path_info *newpaths;
167 for (i = 0; i < specs->count; i++)
168 if (specs->paths[i].user_agent_exact_p)
170 newpaths = xnew_array (struct path_info, cnt);
171 for (i = 0, j = 0; i < specs->count; i++)
172 if (specs->paths[i].user_agent_exact_p)
173 newpaths[j++] = specs->paths[i];
175 xfree (specs->paths);
176 specs->paths = newpaths;
181 #define EOL(p) ((p) >= lineend)
183 #define SKIP_SPACE(p) do { \
184 while (!EOL (p) && ISSPACE (*p)) \
188 #define FIELD_IS(string_literal) \
189 BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
191 /* Parse textual RES specs beginning with SOURCE of length LENGTH.
192 Return a specs object ready to be fed to res_match_path.
194 The parsing itself is trivial, but creating a correct SPECS object
195 is trickier than it seems, because RES is surprisingly byzantine if
196 you attempt to implement it correctly.
198 A "record" is a block of one or more `User-Agent' lines followed by
199 one or more `Allow' or `Disallow' lines. A record is accepted by
200 Wget if one of the `User-Agent' lines was "wget", or if the user agent line was "*".
203 After all the lines have been read, we examine whether an exact
204 ("wget") user-agent field was specified. If so, we delete all the
205 lines read under "User-Agent: *" blocks because we have our own
206 Wget-specific blocks. This enables the admin to say:
215 This means that to Wget and to Google, /cgi-bin is disallowed,
216 whereas for all other crawlers, everything is disallowed.
217 res_parse is implemented so that the order of records doesn't
218 matter. In the case above, the "User-Agent: *" could have come
219 after the other one. */
222 res_parse (const char *source, int length)
226 const char *p = source;
227 const char *end = source + length;
229 /* true if last applicable user-agent field matches Wget. */
230 bool user_agent_applies = false;
232 /* true if last applicable user-agent field *exactly* matches
234 bool user_agent_exact = false;
236 /* whether we ever encountered exact user agent. */
237 bool found_exact = false;
239 /* count of allow/disallow lines in the current "record", i.e. after
240 the last `user-agent' instructions. */
241 int record_count = 0;
243 struct robot_specs *specs = xnew0 (struct robot_specs);
247 const char *lineend, *lineend_real;
248 const char *field_b, *field_e;
249 const char *value_b, *value_e;
253 lineend_real = memchr (p, '\n', end - p);
258 lineend = lineend_real;
260 /* Before doing anything else, check whether the line is empty
263 if (EOL (p) || *p == '#')
266 /* Make sure the end-of-line comments are respected by setting
267 lineend to a location preceding the first comment. Real line
268 ending remains in lineend_real. */
269 for (lineend = p; lineend < lineend_real; lineend++)
270 if ((lineend == p || ISSPACE (*(lineend - 1)))
274 /* Ignore trailing whitespace in the same way. */
275 while (lineend > p && ISSPACE (*(lineend - 1)))
281 while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
286 if (field_b == field_e || EOL (p) || *p != ':')
288 DEBUGP (("Ignoring malformed line %d", line_count));
299 /* Finally, we have a syntactically valid line. */
300 if (FIELD_IS ("user-agent"))
302 /* We have to support several cases:
309 ... matching record ...
313 ... non-matching record ...
316 ... matching record, but will be pruned later ...
318 We have to respect `User-Agent' at the beginning of each
319 new record simply because we don't know if we're going to
320 encounter "Wget" among the agents or not. Hence,
321 match_user_agent is called when record_count != 0.
323 But if record_count is 0, we have to keep calling it
324 until it matches, and if that happens, we must not call
325 it any more, until the next record. Hence the other part
327 if (record_count != 0 || user_agent_applies == false)
328 match_user_agent (value_b, value_e - value_b,
329 &user_agent_applies, &user_agent_exact);
330 if (user_agent_exact)
334 else if (FIELD_IS ("allow"))
336 if (user_agent_applies)
338 add_path (specs, value_b, value_e, true, user_agent_exact);
342 else if (FIELD_IS ("disallow"))
344 if (user_agent_applies)
346 bool allowed = false;
347 if (value_b == value_e)
348 /* Empty "disallow" line means everything is *allowed*! */
350 add_path (specs, value_b, value_e, allowed, user_agent_exact);
356 DEBUGP (("Ignoring unknown field at line %d", line_count));
367 /* We've encountered an exactly matching user-agent. Throw out
368 all the stuff with user-agent: *. */
369 prune_non_exact (specs);
371 else if (specs->size > specs->count)
373 /* add_path normally over-allocates specs->paths. Reallocate it
374 to the correct size in order to conserve some memory. */
375 specs->paths = xrealloc (specs->paths,
376 specs->count * sizeof (struct path_info));
377 specs->size = specs->count;
383 /* Same as res_parse, but first map the FILENAME into memory,
384 and then parse it. */
387 res_parse_from_file (const char *filename)
389 struct robot_specs *specs;
390 struct file_memory *fm = read_file (filename);
393 logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
394 filename, strerror (errno));
397 specs = res_parse (fm->content, fm->length);
403 free_specs (struct robot_specs *specs)
406 for (i = 0; i < specs->count; i++)
407 xfree (specs->paths[i].path);
408 xfree_null (specs->paths);
412 /* Matching of a path according to the specs. */
414 /* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
415 that number is not a numerical representation of '/', decode C and
416 advance the pointer. */
418 #define DECODE_MAYBE(c, ptr) do { \
419 if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \
421 char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \
422 if (decoded != '/') \
430 /* The inner matching engine: return true if RECORD_PATH matches
431 URL_PATH. The rules for matching are described at
432 <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2. */
435 matches (const char *record_path, const char *url_path)
437 const char *rp = record_path;
438 const char *up = url_path;
448 DECODE_MAYBE(rc, rp);
449 DECODE_MAYBE(uc, up);
455 /* Iterate through all paths in SPECS. For the first one that
456 matches, return its allow/reject status. If none matches,
457 retrieval is by default allowed. */
460 res_match_path (const struct robot_specs *specs, const char *path)
465 for (i = 0; i < specs->count; i++)
466 if (matches (specs->paths[i].path, path))
468 bool allowedp = specs->paths[i].allowedp;
469 DEBUGP (("%s path %s because of rule `%s'.\n",
470 allowedp ? "Allowing" : "Rejecting",
471 path, specs->paths[i].path));
477 /* Registering the specs. */
479 static struct hash_table *registered_specs;
481 /* Stolen from cookies.c. */
482 #define SET_HOSTPORT(host, port, result) do { \
483 int HP_len = strlen (host); \
484 result = alloca (HP_len + 1 + numdigit (port) + 1); \
485 memcpy (result, host, HP_len); \
486 result[HP_len] = ':'; \
487 number_to_string (result + HP_len + 1, port); \
490 /* Register RES specs that belong to the server on HOST:PORT. They will
491 later be retrievable using res_get_specs. */
494 res_register_specs (const char *host, int port, struct robot_specs *specs)
496 struct robot_specs *old;
498 SET_HOSTPORT (host, port, hp);
500 if (!registered_specs)
501 registered_specs = make_nocase_string_hash_table (0);
503 if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
507 hash_table_put (registered_specs, hp_old, specs);
511 hash_table_put (registered_specs, xstrdup (hp), specs);
515 /* Get the specs that belong to HOST:PORT. */
518 res_get_specs (const char *host, int port)
521 SET_HOSTPORT (host, port, hp);
522 if (!registered_specs)
524 return hash_table_get (registered_specs, hp);
527 /* Loading the robots file. */
529 #define RES_SPECS_LOCATION "/robots.txt"
531 /* Retrieve the robots.txt from the server root of the server that
532 serves URL. The file will be named according to the currently
533 active rules, and the file name will be returned in *file.
535 Return true if robots were retrieved OK, false otherwise. */
538 res_retrieve_file (const char *url, char **file)
541 char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
542 int saved_ts_val = opt.timestamping;
543 int saved_sp_val = opt.spider;
545 logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
547 opt.timestamping = false;
549 err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
550 opt.timestamping = saved_ts_val;
551 opt.spider = saved_sp_val;
554 if (err != RETROK && *file != NULL)
556 /* If the file is not retrieved correctly, but retrieve_url
557 allocated the file name, deallocate it here so that the
558 caller doesn't have to worry about it. */
562 return err == RETROK;
566 is_robots_txt_url (const char *url)
568 char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
569 bool ret = are_urls_equal (url, robots_url);
579 if (registered_specs)
581 hash_table_iterator iter;
582 for (hash_table_iterate (registered_specs, &iter);
583 hash_table_iter_next (&iter);
587 free_specs (iter.value);
589 hash_table_destroy (registered_specs);
590 registered_specs = NULL;
597 test_is_robots_txt_url()
602 bool expected_result;
604 { "http://www.yoyodyne.com/robots.txt", true },
605 { "http://www.yoyodyne.com/somepath/", false },
606 { "http://www.yoyodyne.com/somepath/robots.txt", false },
609 for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
611 mu_assert ("test_is_robots_txt_url: wrong result",
612 is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);