1 /* Support for Robot Exclusion Standard (RES).
2 Copyright (C) 2001,2006 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget. If not, see <http://www.gnu.org/licenses/>.
19 In addition, as a special exception, the Free Software Foundation
20 gives permission to link the code of its release of Wget with the
21 OpenSSL project's "OpenSSL" library (or with modified versions of it
22 that use the same license as the "OpenSSL" library), and distribute
23 the linked executables. You must obey the GNU General Public License
24 in all respects for all of the code used other than "OpenSSL". If you
25 modify this file, you may extend this exception to your version of the
26 file, but you are not obligated to do so. If you do not wish to do
27 so, delete this exception statement from your version. */
29 /* This file implements the Robot Exclusion Standard (RES).
31 RES is a simple protocol that enables site admins to signalize to
32 the web crawlers that certain parts of the site should not be
33 accessed. All the admin needs to do is create a "robots.txt" file
34 in the web server root, and use simple commands to allow or
35 disallow access to certain parts of the site.
37 The first specification was written by Martijn Koster in 1994, and
38 is still available at <http://www.robotstxt.org/wc/norobots.html>.
39 In 1996, Martijn wrote an Internet Draft specifying an improved RES
40 specification; however, that work was apparently abandoned since
41 the draft has expired in 1997 and hasn't been replaced since. The
expired draft is still available at
43 <http://www.robotstxt.org/wc/norobots-rfc.html>.
45 This file implements RES as specified by the draft. Note that this
46 only handles the "robots.txt" support. The META tag that controls
47 whether the links should be followed is handled in `html-url.c'.
51 * The end-of-line comment recognition is more in the spirit of the
52 Bourne Shell (as specified by RES-1994). That means that
53 "foo#bar" is taken literally, whereas "foo #bar" is interpreted
54 as "foo". The Draft apparently specifies that both should be
interpreted as "foo".
57 * We don't recognize sole CR as the line ending.
59 * We don't implement expiry mechanism for /robots.txt specs. I
60 consider it non-necessary for a relatively short-lived
61 application such as Wget. Besides, it is highly questionable
62 whether anyone deploys the recommended expiry scheme for
robots.txt.
65 Entry points are functions res_parse, res_parse_from_file,
66 res_match_path, res_register_specs, res_get_specs, and
93 bool user_agent_exact_p;
99 struct path_info *paths;
102 /* Parsing the robot spec. */
104 /* Check whether AGENT (a string of length LENGTH) equals "wget" or
105 "*". If it is either of them, *matches is set to one. If it is
106 "wget", *exact_match is set to one. */
109 match_user_agent (const char *agent, int length,
110 bool *matches, bool *exact_match)
112 if (length == 1 && *agent == '*')
115 *exact_match = false;
117 else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
125 *exact_match = false;
129 /* Add a path specification between PATH_B and PATH_E as one of the
133 add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
134 bool allowedp, bool exactp)
137 if (path_b < path_e && *path_b == '/')
138 /* Our path representation doesn't use a leading slash, so remove
141 pp.path = strdupdelim (path_b, path_e);
142 pp.allowedp = allowedp;
143 pp.user_agent_exact_p = exactp;
145 if (specs->count > specs->size)
147 if (specs->size == 0)
151 specs->paths = xrealloc (specs->paths,
152 specs->size * sizeof (struct path_info));
154 specs->paths[specs->count - 1] = pp;
157 /* Recreate SPECS->paths with only those paths that have
158 user_agent_exact_p set to true. */
161 prune_non_exact (struct robot_specs *specs)
163 struct path_info *newpaths;
166 for (i = 0; i < specs->count; i++)
167 if (specs->paths[i].user_agent_exact_p)
169 newpaths = xnew_array (struct path_info, cnt);
170 for (i = 0, j = 0; i < specs->count; i++)
171 if (specs->paths[i].user_agent_exact_p)
172 newpaths[j++] = specs->paths[i];
174 xfree (specs->paths);
175 specs->paths = newpaths;
180 #define EOL(p) ((p) >= lineend)
182 #define SKIP_SPACE(p) do { \
183 while (!EOL (p) && ISSPACE (*p)) \
187 #define FIELD_IS(string_literal) \
188 BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
190 /* Parse textual RES specs beginning with SOURCE of length LENGTH.
191 Return a specs objects ready to be fed to res_match_path.
193 The parsing itself is trivial, but creating a correct SPECS object
194 is trickier than it seems, because RES is surprisingly byzantine if
195 you attempt to implement it correctly.
197 A "record" is a block of one or more `User-Agent' lines followed by
198 one or more `Allow' or `Disallow' lines. Record is accepted by
199 Wget if one of the `User-Agent' lines was "wget", or if the user
agent line was "*".
202 After all the lines have been read, we examine whether an exact
203 ("wget") user-agent field was specified. If so, we delete all the
204 lines read under "User-Agent: *" blocks because we have our own
205 Wget-specific blocks. This enables the admin to say:
214 This means that to Wget and to Google, /cgi-bin is disallowed,
215 whereas for all other crawlers, everything is disallowed.
216 res_parse is implemented so that the order of records doesn't
217 matter. In the case above, the "User-Agent: *" could have come
218 after the other one. */
221 res_parse (const char *source, int length)
225 const char *p = source;
226 const char *end = source + length;
228 /* true if last applicable user-agent field matches Wget. */
229 bool user_agent_applies = false;
231 /* true if last applicable user-agent field *exactly* matches
233 bool user_agent_exact = false;
235 /* whether we ever encountered exact user agent. */
236 bool found_exact = false;
238 /* count of allow/disallow lines in the current "record", i.e. after
239 the last `user-agent' instructions. */
240 int record_count = 0;
242 struct robot_specs *specs = xnew0 (struct robot_specs);
246 const char *lineend, *lineend_real;
247 const char *field_b, *field_e;
248 const char *value_b, *value_e;
252 lineend_real = memchr (p, '\n', end - p);
257 lineend = lineend_real;
259 /* Before doing anything else, check whether the line is empty
262 if (EOL (p) || *p == '#')
265 /* Make sure the end-of-line comments are respected by setting
266 lineend to a location preceding the first comment. Real line
267 ending remains in lineend_real. */
268 for (lineend = p; lineend < lineend_real; lineend++)
269 if ((lineend == p || ISSPACE (*(lineend - 1)))
273 /* Ignore trailing whitespace in the same way. */
274 while (lineend > p && ISSPACE (*(lineend - 1)))
280 while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
285 if (field_b == field_e || EOL (p) || *p != ':')
287 DEBUGP (("Ignoring malformed line %d", line_count));
298 /* Finally, we have a syntactically valid line. */
299 if (FIELD_IS ("user-agent"))
301 /* We have to support several cases:
308 ... matching record ...
312 ... non-matching record ...
315 ... matching record, but will be pruned later ...
317 We have to respect `User-Agent' at the beginning of each
318 new record simply because we don't know if we're going to
319 encounter "Wget" among the agents or not. Hence,
320 match_user_agent is called when record_count != 0.
322 But if record_count is 0, we have to keep calling it
323 until it matches, and if that happens, we must not call
324 it any more, until the next record. Hence the other part
326 if (record_count != 0 || user_agent_applies == false)
327 match_user_agent (value_b, value_e - value_b,
328 &user_agent_applies, &user_agent_exact);
329 if (user_agent_exact)
333 else if (FIELD_IS ("allow"))
335 if (user_agent_applies)
337 add_path (specs, value_b, value_e, true, user_agent_exact);
341 else if (FIELD_IS ("disallow"))
343 if (user_agent_applies)
345 bool allowed = false;
346 if (value_b == value_e)
347 /* Empty "disallow" line means everything is *allowed*! */
349 add_path (specs, value_b, value_e, allowed, user_agent_exact);
355 DEBUGP (("Ignoring unknown field at line %d", line_count));
366 /* We've encountered an exactly matching user-agent. Throw out
367 all the stuff with user-agent: *. */
368 prune_non_exact (specs);
370 else if (specs->size > specs->count)
372 /* add_path normally over-allocates specs->paths. Reallocate it
373 to the correct size in order to conserve some memory. */
374 specs->paths = xrealloc (specs->paths,
375 specs->count * sizeof (struct path_info));
376 specs->size = specs->count;
382 /* The same like res_parse, but first map the FILENAME into memory,
383 and then parse it. */
386 res_parse_from_file (const char *filename)
388 struct robot_specs *specs;
389 struct file_memory *fm = read_file (filename);
392 logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
393 filename, strerror (errno));
396 specs = res_parse (fm->content, fm->length);
402 free_specs (struct robot_specs *specs)
405 for (i = 0; i < specs->count; i++)
406 xfree (specs->paths[i].path);
407 xfree_null (specs->paths);
411 /* Matching of a path according to the specs. */
413 /* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
414 that number is not a numerical representation of '/', decode C and
415 advance the pointer. */
417 #define DECODE_MAYBE(c, ptr) do { \
418 if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \
420 char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \
421 if (decoded != '/') \
429 /* The inner matching engine: return true if RECORD_PATH matches
430 URL_PATH. The rules for matching are described at
431 <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2. */
434 matches (const char *record_path, const char *url_path)
436 const char *rp = record_path;
437 const char *up = url_path;
447 DECODE_MAYBE(rc, rp);
448 DECODE_MAYBE(uc, up);
454 /* Iterate through all paths in SPECS. For the first one that
455 matches, return its allow/reject status. If none matches,
456 retrieval is by default allowed. */
459 res_match_path (const struct robot_specs *specs, const char *path)
464 for (i = 0; i < specs->count; i++)
465 if (matches (specs->paths[i].path, path))
467 bool allowedp = specs->paths[i].allowedp;
468 DEBUGP (("%s path %s because of rule `%s'.\n",
469 allowedp ? "Allowing" : "Rejecting",
470 path, specs->paths[i].path));
476 /* Registering the specs. */
478 static struct hash_table *registered_specs;
480 /* Stolen from cookies.c. */
481 #define SET_HOSTPORT(host, port, result) do { \
482 int HP_len = strlen (host); \
483 result = alloca (HP_len + 1 + numdigit (port) + 1); \
484 memcpy (result, host, HP_len); \
485 result[HP_len] = ':'; \
486 number_to_string (result + HP_len + 1, port); \
489 /* Register RES specs that below to server on HOST:PORT. They will
490 later be retrievable using res_get_specs. */
493 res_register_specs (const char *host, int port, struct robot_specs *specs)
495 struct robot_specs *old;
497 SET_HOSTPORT (host, port, hp);
499 if (!registered_specs)
500 registered_specs = make_nocase_string_hash_table (0);
502 if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
506 hash_table_put (registered_specs, hp_old, specs);
510 hash_table_put (registered_specs, xstrdup (hp), specs);
514 /* Get the specs that belong to HOST:PORT. */
517 res_get_specs (const char *host, int port)
520 SET_HOSTPORT (host, port, hp);
521 if (!registered_specs)
523 return hash_table_get (registered_specs, hp);
526 /* Loading the robots file. */
528 #define RES_SPECS_LOCATION "/robots.txt"
530 /* Retrieve the robots.txt from the server root of the server that
531 serves URL. The file will be named according to the currently
532 active rules, and the file name will be returned in *file.
534 Return true if robots were retrieved OK, false otherwise. */
537 res_retrieve_file (const char *url, char **file)
540 char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
541 int saved_ts_val = opt.timestamping;
542 int saved_sp_val = opt.spider;
544 logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
546 opt.timestamping = false;
548 err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
549 opt.timestamping = saved_ts_val;
550 opt.spider = saved_sp_val;
553 if (err != RETROK && *file != NULL)
555 /* If the file is not retrieved correctly, but retrieve_url
556 allocated the file name, deallocate is here so that the
557 caller doesn't have to worry about it. */
561 return err == RETROK;
565 is_robots_txt_url (const char *url)
567 char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
568 bool ret = are_urls_equal (url, robots_url);
578 if (registered_specs)
580 hash_table_iterator iter;
581 for (hash_table_iterate (registered_specs, &iter);
582 hash_table_iter_next (&iter);
586 free_specs (iter.value);
588 hash_table_destroy (registered_specs);
589 registered_specs = NULL;
596 test_is_robots_txt_url()
601 bool expected_result;
603 { "http://www.yoyodyne.com/robots.txt", true },
604 { "http://www.yoyodyne.com/somepath/", false },
605 { "http://www.yoyodyne.com/somepath/robots.txt", false },
608 for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
610 mu_assert ("test_is_robots_txt_url: wrong result",
611 is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);