1 /* Support for Robot Exclusion Standard (RES).
2 Copyright (C) 2001, 2006, 2007, 2008 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget. If not, see <http://www.gnu.org/licenses/>.
19 Additional permission under GNU GPL version 3 section 7
21 If you modify this program, or any covered work, by linking or
22 combining it with the OpenSSL project's OpenSSL library (or a
23 modified version of that library), containing parts covered by the
24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
25 grants you additional permission to convey the resulting work.
26 Corresponding Source for a non-source form of such a combination
27 shall include the source code for the parts of OpenSSL used as well
28 as that of the covered work. */
30 /* This file implements the Robot Exclusion Standard (RES).
32 RES is a simple protocol that enables site admins to signalize to
33 the web crawlers that certain parts of the site should not be
34 accessed. All the admin needs to do is create a "robots.txt" file
35 in the web server root, and use simple commands to allow or
36 disallow access to certain parts of the site.
38 The first specification was written by Martijn Koster in 1994, and
39 is still available at <http://www.robotstxt.org/wc/norobots.html>.
40 In 1996, Martijn wrote an Internet Draft specifying an improved RES
41 specification; however, that work was apparently abandoned since
42 the draft has expired in 1997 and hasn't been replaced since. The
43 expired draft is still available at
44 <http://www.robotstxt.org/wc/norobots-rfc.html>.
46 This file implements RES as specified by the draft. Note that this
47 only handles the "robots.txt" support. The META tag that controls
48 whether the links should be followed is handled in `html-url.c'.
52 * The end-of-line comment recognition is more in the spirit of the
53 Bourne Shell (as specified by RES-1994). That means that
54 "foo#bar" is taken literally, whereas "foo #bar" is interpreted
55 as "foo". The Draft apparently specifies that both should be
56 interpreted as "foo".
58 * We don't recognize sole CR as the line ending.
60 * We don't implement expiry mechanism for /robots.txt specs. I
61 consider it non-necessary for a relatively short-lived
62 application such as Wget. Besides, it is highly questionable
63 whether anyone deploys the recommended expiry scheme for
64 robots.txt.
66 Entry points are functions res_parse, res_parse_from_file,
67 res_match_path, res_register_specs, res_get_specs, and
68 res_retrieve_file.
91 bool user_agent_exact_p;
97 struct path_info *paths;
100 /* Parsing the robot spec. */
102 /* Check whether AGENT (a string of length LENGTH) equals "wget" or
103 "*". If it is either of them, *matches is set to one. If it is
104 "wget", *exact_match is set to one. */
107 match_user_agent (const char *agent, int length,
108 bool *matches, bool *exact_match)
110 if (length == 1 && *agent == '*')
113 *exact_match = false;
115 else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
123 *exact_match = false;
127 /* Add a path specification between PATH_B and PATH_E as one of the
131 add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
132 bool allowedp, bool exactp)
135 if (path_b < path_e && *path_b == '/')
136 /* Our path representation doesn't use a leading slash, so remove
139 pp.path = strdupdelim (path_b, path_e);
140 pp.allowedp = allowedp;
141 pp.user_agent_exact_p = exactp;
143 if (specs->count > specs->size)
145 if (specs->size == 0)
149 specs->paths = xrealloc (specs->paths,
150 specs->size * sizeof (struct path_info));
152 specs->paths[specs->count - 1] = pp;
155 /* Recreate SPECS->paths with only those paths that have
156 user_agent_exact_p set to true. */
159 prune_non_exact (struct robot_specs *specs)
161 struct path_info *newpaths;
164 for (i = 0; i < specs->count; i++)
165 if (specs->paths[i].user_agent_exact_p)
167 newpaths = xnew_array (struct path_info, cnt);
168 for (i = 0, j = 0; i < specs->count; i++)
169 if (specs->paths[i].user_agent_exact_p)
170 newpaths[j++] = specs->paths[i];
172 xfree (specs->paths);
173 specs->paths = newpaths;
178 #define EOL(p) ((p) >= lineend)
180 #define SKIP_SPACE(p) do { \
181 while (!EOL (p) && c_isspace (*p)) \
185 #define FIELD_IS(string_literal) \
186 BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
188 /* Parse textual RES specs beginning with SOURCE of length LENGTH.
189 Return a specs objects ready to be fed to res_match_path.
191 The parsing itself is trivial, but creating a correct SPECS object
192 is trickier than it seems, because RES is surprisingly byzantine if
193 you attempt to implement it correctly.
195 A "record" is a block of one or more `User-Agent' lines followed by
196 one or more `Allow' or `Disallow' lines. Record is accepted by
197 Wget if one of the `User-Agent' lines was "wget", or if the user
200 After all the lines have been read, we examine whether an exact
201 ("wget") user-agent field was specified. If so, we delete all the
202 lines read under "User-Agent: *" blocks because we have our own
203 Wget-specific blocks. This enables the admin to say:
212 This means that to Wget and to Google, /cgi-bin is disallowed,
213 whereas for all other crawlers, everything is disallowed.
214 res_parse is implemented so that the order of records doesn't
215 matter. In the case above, the "User-Agent: *" could have come
216 after the other one. */
219 res_parse (const char *source, int length)
223 const char *p = source;
224 const char *end = source + length;
226 /* true if last applicable user-agent field matches Wget. */
227 bool user_agent_applies = false;
229 /* true if last applicable user-agent field *exactly* matches
231 bool user_agent_exact = false;
233 /* whether we ever encountered exact user agent. */
234 bool found_exact = false;
236 /* count of allow/disallow lines in the current "record", i.e. after
237 the last `user-agent' instructions. */
238 int record_count = 0;
240 struct robot_specs *specs = xnew0 (struct robot_specs);
244 const char *lineend, *lineend_real;
245 const char *field_b, *field_e;
246 const char *value_b, *value_e;
250 lineend_real = memchr (p, '\n', end - p);
255 lineend = lineend_real;
257 /* Before doing anything else, check whether the line is empty
260 if (EOL (p) || *p == '#')
263 /* Make sure the end-of-line comments are respected by setting
264 lineend to a location preceding the first comment. Real line
265 ending remains in lineend_real. */
266 for (lineend = p; lineend < lineend_real; lineend++)
267 if ((lineend == p || c_isspace (*(lineend - 1)))
271 /* Ignore trailing whitespace in the same way. */
272 while (lineend > p && c_isspace (*(lineend - 1)))
278 while (!EOL (p) && (c_isalnum (*p) || *p == '-'))
283 if (field_b == field_e || EOL (p) || *p != ':')
285 DEBUGP (("Ignoring malformed line %d", line_count));
296 /* Finally, we have a syntactically valid line. */
297 if (FIELD_IS ("user-agent"))
299 /* We have to support several cases:
306 ... matching record ...
310 ... non-matching record ...
313 ... matching record, but will be pruned later ...
315 We have to respect `User-Agent' at the beginning of each
316 new record simply because we don't know if we're going to
317 encounter "Wget" among the agents or not. Hence,
318 match_user_agent is called when record_count != 0.
320 But if record_count is 0, we have to keep calling it
321 until it matches, and if that happens, we must not call
322 it any more, until the next record. Hence the other part
324 if (record_count != 0 || user_agent_applies == false)
325 match_user_agent (value_b, value_e - value_b,
326 &user_agent_applies, &user_agent_exact);
327 if (user_agent_exact)
331 else if (FIELD_IS ("allow"))
333 if (user_agent_applies)
335 add_path (specs, value_b, value_e, true, user_agent_exact);
339 else if (FIELD_IS ("disallow"))
341 if (user_agent_applies)
343 bool allowed = false;
344 if (value_b == value_e)
345 /* Empty "disallow" line means everything is *allowed*! */
347 add_path (specs, value_b, value_e, allowed, user_agent_exact);
353 DEBUGP (("Ignoring unknown field at line %d", line_count));
364 /* We've encountered an exactly matching user-agent. Throw out
365 all the stuff with user-agent: *. */
366 prune_non_exact (specs);
368 else if (specs->size > specs->count)
370 /* add_path normally over-allocates specs->paths. Reallocate it
371 to the correct size in order to conserve some memory. */
372 specs->paths = xrealloc (specs->paths,
373 specs->count * sizeof (struct path_info));
374 specs->size = specs->count;
380 /* The same like res_parse, but first map the FILENAME into memory,
381 and then parse it. */
384 res_parse_from_file (const char *filename)
386 struct robot_specs *specs;
387 struct file_memory *fm = read_file (filename);
390 logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
391 filename, strerror (errno));
394 specs = res_parse (fm->content, fm->length);
400 free_specs (struct robot_specs *specs)
403 for (i = 0; i < specs->count; i++)
404 xfree (specs->paths[i].path);
405 xfree_null (specs->paths);
409 /* Matching of a path according to the specs. */
411 /* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
412 that number is not a numerical representation of '/', decode C and
413 advance the pointer. */
415 #define DECODE_MAYBE(c, ptr) do { \
416 if (c == '%' && c_isxdigit (ptr[1]) && c_isxdigit (ptr[2])) \
418 char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \
419 if (decoded != '/') \
427 /* The inner matching engine: return true if RECORD_PATH matches
428 URL_PATH. The rules for matching are described at
429 <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2. */
432 matches (const char *record_path, const char *url_path)
434 const char *rp = record_path;
435 const char *up = url_path;
445 DECODE_MAYBE(rc, rp);
446 DECODE_MAYBE(uc, up);
452 /* Iterate through all paths in SPECS. For the first one that
453 matches, return its allow/reject status. If none matches,
454 retrieval is by default allowed. */
457 res_match_path (const struct robot_specs *specs, const char *path)
462 for (i = 0; i < specs->count; i++)
463 if (matches (specs->paths[i].path, path))
465 bool allowedp = specs->paths[i].allowedp;
466 DEBUGP (("%s path %s because of rule %s.\n",
467 allowedp ? "Allowing" : "Rejecting",
468 path, quote (specs->paths[i].path)));
474 /* Registering the specs. */
476 static struct hash_table *registered_specs;
478 /* Stolen from cookies.c. */
479 #define SET_HOSTPORT(host, port, result) do { \
480 int HP_len = strlen (host); \
481 result = alloca (HP_len + 1 + numdigit (port) + 1); \
482 memcpy (result, host, HP_len); \
483 result[HP_len] = ':'; \
484 number_to_string (result + HP_len + 1, port); \
487 /* Register RES specs that below to server on HOST:PORT. They will
488 later be retrievable using res_get_specs. */
491 res_register_specs (const char *host, int port, struct robot_specs *specs)
493 struct robot_specs *old;
495 SET_HOSTPORT (host, port, hp);
497 if (!registered_specs)
498 registered_specs = make_nocase_string_hash_table (0);
500 if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
504 hash_table_put (registered_specs, hp_old, specs);
508 hash_table_put (registered_specs, xstrdup (hp), specs);
512 /* Get the specs that belong to HOST:PORT. */
515 res_get_specs (const char *host, int port)
518 SET_HOSTPORT (host, port, hp);
519 if (!registered_specs)
521 return hash_table_get (registered_specs, hp);
524 /* Loading the robots file. */
526 #define RES_SPECS_LOCATION "/robots.txt"
528 /* Retrieve the robots.txt from the server root of the server that
529 serves URL. The file will be named according to the currently
530 active rules, and the file name will be returned in *file.
532 Return true if robots were retrieved OK, false otherwise. */
535 res_retrieve_file (const char *url, char **file)
538 char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
539 int saved_ts_val = opt.timestamping;
540 int saved_sp_val = opt.spider;
542 logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
544 opt.timestamping = false;
546 err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
547 opt.timestamping = saved_ts_val;
548 opt.spider = saved_sp_val;
551 if (err != RETROK && *file != NULL)
553 /* If the file is not retrieved correctly, but retrieve_url
554 allocated the file name, deallocate is here so that the
555 caller doesn't have to worry about it. */
559 return err == RETROK;
563 is_robots_txt_url (const char *url)
565 char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
566 bool ret = are_urls_equal (url, robots_url);
576 if (registered_specs)
578 hash_table_iterator iter;
579 for (hash_table_iterate (registered_specs, &iter);
580 hash_table_iter_next (&iter);
584 free_specs (iter.value);
586 hash_table_destroy (registered_specs);
587 registered_specs = NULL;
594 test_is_robots_txt_url()
599 bool expected_result;
601 { "http://www.yoyodyne.com/robots.txt", true },
602 { "http://www.yoyodyne.com/somepath/", false },
603 { "http://www.yoyodyne.com/somepath/robots.txt", false },
606 for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
608 mu_assert ("test_is_robots_txt_url: wrong result",
609 is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);