1 /* Support for Robot Exclusion Standard (RES).
2 Copyright (C) 2001 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
20 /* This file implements the Robot Exclusion Standard (RES).
22 RES is a simple protocol that enables site admins to signal to
23 the web crawlers that certain parts of the site should not be
24 accessed. All the admin needs to do is create a "robots.txt" file
25 in the web server root, and use simple commands to allow or
26 disallow access to certain parts of the site.
28 The first specification was written by Martijn Koster in 1994, and
29 is still available at <http://www.robotstxt.org/wc/norobots.html>.
30 In 1996, Martijn wrote an Internet Draft specifying an improved RES
31 specification; however, that work was apparently abandoned since
32 the draft has expired in 1997 and hasn't been replaced since. The
33 draft is still available at
34 <http://www.robotstxt.org/wc/norobots-rfc.html>.
36 This file implements RES as specified by the draft. Note that this
37 only handles the "robots.txt" support. The META tag that controls
38 whether the links should be followed is handled in `html-url.c'.
42 * The end-of-line comment recognition is more in the spirit of the
43 Bourne Shell (as specified by RES-1994). That means that
44 "foo#bar" is taken literally, whereas "foo #bar" is interpreted
45 as "foo". The Draft apparently specifies that both should be
46 interpreted as comments.
48 * We don't recognize sole CR as the line ending.
50 * We don't implement expiry mechanism for /robots.txt specs. I
51 consider it unnecessary for a relatively short-lived
52 application such as Wget. Besides, it is highly questionable
53 whether anyone deploys the recommended expiry scheme for
54 /robots.txt.
56 Entry points are functions res_parse, res_parse_from_file,
57 res_match_path, res_register_specs, res_get_specs, and
58 res_cleanup.
70 #endif /* HAVE_STRING_H */
84 int user_agent_exact_p;
90 struct path_info *paths;
93 /* Parsing the robot spec. */
95 /* Check whether AGENT (a string of length LENGTH) equals "wget" or
96 "*". If it is either of them, *matches is set to one. If it is
97 "wget", *exact_match is set to one. */
/* NOTE(review): several lines of this function (return type, braces,
   and the bodies of both branches) are missing from this chunk; the
   comments below describe only the visible code.  */
100 match_user_agent (const char *agent, int length,
101 int *matches, int *exact_match)
/* A lone "*" is the RES wildcard user-agent: it applies to every
   crawler, so it matches, but it is not an exact "wget" match.  */
103 if (length == 1 && *agent == '*')
/* AGENT is not NUL-terminated, hence the bounded, case-insensitive
   comparison over [agent, agent + length).  */
108 else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
120 /* Add a path specification between PATH_B and PATH_E as one of the
/* NOTE(review): lines of this function (the rest of the comment, the
   return type, braces, the `pp' declaration, and the count/size
   updates) are missing from this chunk; comments describe only what
   is visible.  */
124 add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
125 int allowedp, int exactp)
/* Fill in a new path_info record: a heap copy of the [path_b, path_e)
   substring, the allow/disallow flag, and whether the governing
   User-Agent line matched "wget" exactly.  */
128 pp.path = strdupdelim (path_b, path_e);
129 pp.allowedp = allowedp;
130 pp.user_agent_exact_p = exactp;
/* Grow the paths array geometrically when count exceeds capacity;
   presumably the missing lines bump `count' and double `size' --
   confirm against the full file.  */
132 if (specs->count > specs->size)
134 if (specs->size == 0)
138 specs->paths = xrealloc (specs->paths,
139 specs->size * sizeof (struct path_info));
/* Store the new record in the slot just made available.  */
141 specs->paths[specs->count - 1] = pp;
144 /* Recreate SPECS->paths with only those paths that have non-zero
145 user_agent_exact_p. */
148 prune_non_exact (struct robot_specs *specs)
150 struct path_info *newpaths;
/* First pass: count the entries recorded under an exact ("wget")
   User-Agent match, so the replacement array can be sized exactly.
   NOTE(review): the statement incrementing the counter is not
   visible in this chunk.  */
153 for (i = 0; i < specs->count; i++)
154 if (specs->paths[i].user_agent_exact_p)
156 newpaths = xmalloc (cnt * sizeof (struct path_info));
/* Second pass: copy the surviving entries, preserving file order.  */
157 for (i = 0, j = 0; i < specs->count; i++)
158 if (specs->paths[i].user_agent_exact_p)
159 newpaths[j++] = specs->paths[i];
/* Release the old array and install the pruned one.  The path
   strings themselves are shared by the copied records, so they are
   not freed here.  */
161 xfree (specs->paths);
162 specs->paths = newpaths;
/* True when P has reached the (comment-stripped) end of the current
   line; `lineend' is a local of res_parse below.  */
167 #define EOL(p) ((p) >= lineend)
/* Advance P over whitespace, but never past the end of the line.
   NOTE(review): the continuation lines of this macro (the increment
   and the closing `} while (0)') are missing from this chunk, so no
   comments are inserted inside it.  */
169 #define SKIP_SPACE(p) do { \
170 while (!EOL (p) && ISSPACE (*p)) \
174 #define FIELD_IS(string_literal) \
175 BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
177 /* Parse textual RES specs beginning with SOURCE of length LENGTH.
178 Return a specs objects ready to be fed to res_match_path.
180 The parsing itself is trivial, but creating a correct SPECS object
181 is trickier than it seems, because RES is surprisingly byzantine if
182 you attempt to implement it correctly.
184 A "record" is a block of one or more `User-Agent' lines followed by
185 one or more `Allow' or `Disallow' lines. Record is accepted by
186 Wget if one of the `User-Agent' lines was "wget", or if the user
189 After all the lines have been read, we examine whether an exact
190 ("wget") user-agent field was specified. If so, we delete all the
191 lines read under "User-Agent: *" blocks because we have our own
192 Wget-specific blocks. This enables the admin to say:
201 This means that to Wget and to Google, /cgi-bin is disallowed,
202 whereas for all other crawlers, everything is disallowed.
203 res_parse is implemented so that the order of records doesn't
204 matter. In the case above, the "User-Agent: *" could have come
205 after the other one. */
/* NOTE(review): a number of lines of this function (the return type,
   braces, the line-reading loop header, and several statements) are
   missing from this chunk; the comments below describe only the
   visible code.  */
208 res_parse (const char *source, int length)
/* Parsing cursor and end of the in-memory robots.txt text; the input
   is not NUL-terminated, so END bounds every scan.  */
212 const char *p = source;
213 const char *end = source + length;
215 /* non-zero if last applicable user-agent field matches Wget. */
216 int user_agent_applies = 0;
218 /* non-zero if last applicable user-agent field *exactly* matches
220 int user_agent_exact = 0;
222 /* whether we ever encountered exact user agent. */
225 /* count of allow/disallow lines in the current "record", i.e. after
226 the last `user-agent' instructions. */
227 int record_count = 0;
/* The result object starts out zeroed: no paths, zero count/size.  */
229 struct robot_specs *specs = xmalloc (sizeof (struct robot_specs));
230 memset (specs, '\0', sizeof (struct robot_specs));
/* Per-line state: [field_b, field_e) is the field name and
   [value_b, value_e) the value, both within the current line.
   NOTE(review): these look like locals of the (missing) per-line
   loop body -- confirm against the full file.  */
234 const char *lineend, *lineend_real;
235 const char *field_b, *field_e;
236 const char *value_b, *value_e;
/* Locate the physical end of the current line.  */
240 lineend_real = memchr (p, '\n', end - p);
245 lineend = lineend_real;
247 /* Before doing anything else, check whether the line is empty
250 if (EOL (p) || *p == '#')
253 /* Make sure the end-of-line comments are respected by setting
254 lineend to a location preceding the first comment. Real line
255 ending remains in lineend_real. */
/* A `#' starts a comment only at line start or after whitespace:
   "foo #bar" is commented, "foo#bar" is literal (see the file
   header notes).  */
256 for (lineend = p; lineend < lineend_real; lineend++)
257 if ((lineend == p || ISSPACE (*(lineend - 1)))
261 /* Ignore trailing whitespace in the same way. */
262 while (lineend > p && ISSPACE (*(lineend - 1)))
/* Scan the field name: letters, digits and `-', as in
   "User-Agent" or "Disallow".  */
268 while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
/* A valid line needs a non-empty field name followed by `:'.  */
273 if (field_b == field_e || EOL (p) || *p != ':')
275 DEBUGP (("Ignoring malformed line %d", line_count));
286 /* Finally, we have a syntactically valid line. */
287 if (FIELD_IS ("user-agent"))
289 /* We have to support several cases:
296 ... matching record ...
300 ... non-matching record ...
303 ... matching record, but will be pruned later ...
305 We have to respect `User-Agent' at the beginning of each
306 new record simply because we don't know if we're going to
307 encounter "Wget" among the agents or not. Hence,
308 match_user_agent is called when record_count != 0.
310 But if record_count is 0, we have to keep calling it
311 until it matches, and if that happens, we must not call
312 it any more, until the next record. Hence the other part
314 if (record_count != 0 || user_agent_applies == 0)
315 match_user_agent (value_b, value_e - value_b,
316 &user_agent_applies, &user_agent_exact);
/* Remember that an exact "wget" agent was ever seen; presumably
   a flag consulted after the loop is set in a line missing from
   this chunk.  */
317 if (user_agent_exact)
321 else if (FIELD_IS ("allow"))
/* Record an Allow path, but only under a matching User-Agent.  */
323 if (user_agent_applies)
325 add_path (specs, value_b, value_e, 1, user_agent_exact);
329 else if (FIELD_IS ("disallow"))
331 if (user_agent_applies)
/* Per RES, an empty Disallow value means "allow everything";
   presumably the missing lines set `allowed' accordingly.  */
334 if (value_b == value_e)
335 /* Empty "disallow" line means everything is
338 add_path (specs, value_b, value_e, allowed, user_agent_exact);
344 DEBUGP (("Ignoring unknown field at line %d", line_count));
355 /* We've encountered an exactly matching user-agent. Throw out
356 all the stuff with user-agent: *. */
357 prune_non_exact (specs);
359 else if (specs->size > specs->count)
361 /* add_path normally over-allocates specs->paths. Reallocate it
362 to the correct size in order to conserve some memory. */
363 specs->paths = xrealloc (specs->paths,
364 specs->count * sizeof (struct path_info));
365 specs->size = specs->count;
371 /* Like res_parse, but first map FILENAME into memory,
372 and then parse it. */
375 res_parse_from_file (const char *filename)
377 struct robot_specs *specs;
378 struct file_memory *fm = read_file (filename);
/* NOTE(review): the `if (!fm)' guard, its braces, and the early
   return are not visible in this chunk; this error message
   presumably runs only when read_file failed.  */
381 logprintf (LOG_NOTQUIET, "Cannot open %s: %s",
382 filename, strerror (errno));
/* Parse the whole in-memory copy of the file.  The mapping is
   presumably released in a line missing from this chunk.  */
385 specs = res_parse (fm->content, fm->length);
/* Deallocate a robot_specs object.  NOTE(review): most of this
   function is missing from this chunk; presumably the individual
   path strings are freed before the array itself, and SPECS is
   freed last -- confirm against the full file.  */
391 free_specs (struct robot_specs *specs)
393 FREE_MAYBE (specs->paths);
397 /* Matching of a path according to the specs. */
399 /* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
400 that number is not a numerical representation of '/', decode C and
401 advance the pointer. */
/* NOTE(review): continuation lines of this macro are missing from
   this chunk, so no comments are inserted inside it.  '/' is
   deliberately left %-encoded: an escaped slash is not a path
   separator, so "%2f" must not compare equal to "/".  */
403 #define DECODE_MAYBE(c, ptr) do { \
404 if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \
407 = (XCHAR_TO_XDIGIT (ptr[1]) << 4) + XCHAR_TO_XDIGIT (ptr[2]); \
408 if (decoded != '/') \
416 /* The inner matching engine: return non-zero if RECORD_PATH matches
417 URL_PATH. The rules for matching are described at
418 <http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html>,
422 matches (const char *record_path, const char *url_path)
/* Independent cursors over the record's path and the URL's path.  */
424 const char *rp = record_path;
425 const char *up = url_path;
/* NOTE(review): the character-comparison loop surrounding these
   calls is missing from this chunk.  Both sides are %-decoded on
   the fly so that, e.g., "%7e" compares equal to "~".  */
435 DECODE_MAYBE(rc, rp);
436 DECODE_MAYBE(uc, up);
442 /* Iterate through all paths in SPECS. For the first one that
443 matches, return its allow/reject status. If none matches,
444 retrieval is by default allowed. */
447 res_match_path (const struct robot_specs *specs, const char *path)
/* First match wins: entries are stored in file order, and an exact
   ("wget") agent section has already displaced wildcard entries via
   prune_non_exact during parsing.  */
452 for (i = 0; i < specs->count; i++)
453 if (matches (specs->paths[i].path, path))
455 int allowedp = specs->paths[i].allowedp;
456 DEBUGP (("%s path %s because of rule `%s'.\n",
457 allowedp ? "Allowing" : "Rejecting",
458 path, specs->paths[i].path));
464 /* Registering the specs. */
/* Maps "host:port" keys to struct robot_specs *; created with a
   case-insensitive string hash (see res_register_specs below).  */
466 struct hash_table *registered_specs;
468 /* Stolen from cookies.c. */
/* Build the "HOST:PORT" key in RESULT using alloca, i.e. the string
   lives in the caller's stack frame and must not be stored without
   copying.  NOTE(review): the closing lines of this macro are
   missing from this chunk, so no comments are inserted inside it.  */
469 #define SET_HOSTPORT(host, port, result) do { \
470 int HP_len = strlen (host); \
471 result = alloca (HP_len + 1 + numdigit (port) + 1); \
472 memcpy (result, host, HP_len); \
473 result[HP_len] = ':'; \
474 long_to_string (result + HP_len + 1, port); \
477 /* Register RES specs that belong to the server on HOST:PORT. They will
478 later be retrievable using res_get_specs. */
481 res_register_specs (const char *host, int port, struct robot_specs *specs)
483 struct robot_specs *old;
/* hp is the alloca'd "host:port" lookup key.  */
485 SET_HOSTPORT (host, port, hp);
/* Create the table lazily on first registration.  */
487 if (!registered_specs)
488 registered_specs = make_nocase_string_hash_table (0);
490 /* Required to shut up the compiler. */
/* If specs are already registered for this host:port, reuse the
   existing heap key so it is not leaked; presumably the old specs
   are freed in lines missing from this chunk.
   NOTE(review): hash_table_get_pair normally takes the addresses of
   its out-parameters -- confirm `hp_old'/`old' are passed as
   `&hp_old'/`&old' in the full file.  */
494 if (hash_table_get_pair (registered_specs, hp, hp_old, old))
498 hash_table_put (registered_specs, hp_old, specs);
/* New entry: the stack-allocated key must be heap-copied because the
   table keeps the pointer.  */
502 hash_table_put (registered_specs, xstrdup (hp), specs);
506 /* Get the specs that belong to HOST:PORT. */
509 res_get_specs (const char *host, int port)
512 SET_HOSTPORT (host, port, hp);
/* No table yet means nothing was ever registered; presumably the
   missing line returns NULL here.  */
513 if (!registered_specs)
515 return hash_table_get (registered_specs, hp);
518 /* Loading the robots file. */
520 #define RES_SPECS_LOCATION "/robots.txt"
522 /* Retrieve the robots.txt from the server root of the server that
523 serves URL. The file will be named according to the currently
524 active rules, and the file name will be returned in *file.
526 Return non-zero if robots were retrieved OK, zero otherwise. */
529 res_retrieve_file (const char *url, char **file)
/* Derive ".../robots.txt" from URL.  uri_merge allocates; presumably
   robots_url is freed before returning, in a line missing from this
   chunk.  */
532 char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
534 logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
/* Download to a local file whose name retrieve_url stores in *file.  */
536 err = retrieve_url (robots_url, file, NULL, NULL, NULL);
539 if (err != RETROK && *file != NULL)
541 /* If the file is not retrieved correctly, but retrieve_url
542 allocated the file name, deallocate it here so that the
543 caller doesn't have to worry about it. */
547 return err == RETROK;