1 /* Support for Robot Exclusion Standard (RES).
2 Copyright (C) 2001 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
20 /* This file implements the Robot Exclusion Standard (RES).
22 RES is a simple protocol that enables site admins to signal to
23 the web crawlers that certain parts of the site should not be
24 accessed. All the admin needs to do is create a "robots.txt" file
25 in the web server root, and use simple commands to allow or
26 disallow access to certain parts of the site.
28 The first specification was written by Martijn Koster in 1994, and
29 is still available at <http://www.robotstxt.org/wc/norobots.html>.
30 In 1996, Martijn wrote an Internet Draft specifying an improved RES
31 specification; however, that work was apparently abandoned since
32 the draft expired in 1997 and hasn't been replaced since. The
34 <http://www.robotstxt.org/wc/norobots-rfc.html>.
36 This file implements RES as specified by the draft. Note that this
37 only handles the "robots.txt" support. The META tag that controls
38 whether the links should be followed is handled in `html-url.c'.
42 * The end-of-line comment recognition is more in the spirit of the
43 Bourne Shell (as specified by RES-1994). That means that
44 "foo#bar" is taken literally, whereas "foo #bar" is interpreted
45 as "foo". The Draft apparently specifies that both should be
48 * We don't recognize a lone CR as a line ending.
50 * We don't implement the expiry mechanism for /robots.txt specs. I
51 consider it unnecessary for a relatively short-lived
52 application such as Wget. Besides, it is highly questionable
53 whether anyone deploys the recommended expiry scheme for
56 Entry points are functions res_parse, res_parse_from_file,
57 res_match_path, res_register_specs, res_get_specs, and
70 #endif /* HAVE_STRING_H */
84 int user_agent_exact_p;
90 struct path_info *paths;
93 /* Parsing the robot spec. */
95 /* Check whether AGENT (a string of length LENGTH) equals "wget" or
96 "*". If it is either of them, *matches is set to one. If it is
97 "wget", *exact_match is set to one. */
100 match_user_agent (const char *agent, int length,
101 int *matches, int *exact_match)
103 if (length == 1 && *agent == '*')
108 else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
120 /* Add a path specification between PATH_B and PATH_E as one of the
124 add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
125 int allowedp, int exactp)
128 if (path_b < path_e && *path_b == '/')
129 /* Our path representation doesn't use a leading slash, so remove
132 pp.path = strdupdelim (path_b, path_e);
133 pp.allowedp = allowedp;
134 pp.user_agent_exact_p = exactp;
136 if (specs->count > specs->size)
138 if (specs->size == 0)
142 specs->paths = xrealloc (specs->paths,
143 specs->size * sizeof (struct path_info));
145 specs->paths[specs->count - 1] = pp;
148 /* Recreate SPECS->paths with only those paths that have non-zero
149 user_agent_exact_p. */
152 prune_non_exact (struct robot_specs *specs)
154 struct path_info *newpaths;
157 for (i = 0; i < specs->count; i++)
158 if (specs->paths[i].user_agent_exact_p)
160 newpaths = xmalloc (cnt * sizeof (struct path_info));
161 for (i = 0, j = 0; i < specs->count; i++)
162 if (specs->paths[i].user_agent_exact_p)
163 newpaths[j++] = specs->paths[i];
165 xfree (specs->paths);
166 specs->paths = newpaths;
/* True when P has reached or passed `lineend'.  The macro reads a
   variable named `lineend' from the scope where it is expanded, so it
   can only be used where such a variable is declared.  */
171 #define EOL(p) ((p) >= lineend)
173 #define SKIP_SPACE(p) do { \
174 while (!EOL (p) && ISSPACE (*p)) \
/* Test the current field name against STRING_LITERAL by passing
   `field_b', `field_e' and the literal to BOUNDED_EQUAL_NO_CASE.
   Both `field_b' and `field_e' are taken from the scope where the
   macro is expanded.  BOUNDED_EQUAL_NO_CASE is defined outside this
   file; presumably it compares without regard to case -- confirm
   against its definition.  */
178 #define FIELD_IS(string_literal) \
179 BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
181 /* Parse textual RES specs beginning with SOURCE of length LENGTH.
182 Return a specs object ready to be fed to res_match_path.
184 The parsing itself is trivial, but creating a correct SPECS object
185 is trickier than it seems, because RES is surprisingly byzantine if
186 you attempt to implement it correctly.
188 A "record" is a block of one or more `User-Agent' lines followed by
189 one or more `Allow' or `Disallow' lines. A record is accepted by
190 Wget if one of the `User-Agent' lines was "wget", or if the user
193 After all the lines have been read, we examine whether an exact
194 ("wget") user-agent field was specified. If so, we delete all the
195 lines read under "User-Agent: *" blocks because we have our own
196 Wget-specific blocks. This enables the admin to say:
205 This means that to Wget and to Google, /cgi-bin is disallowed,
206 whereas for all other crawlers, everything is disallowed.
207 res_parse is implemented so that the order of records doesn't
208 matter. In the case above, the "User-Agent: *" could have come
209 after the other one. */
212 res_parse (const char *source, int length)
216 const char *p = source;
217 const char *end = source + length;
219 /* non-zero if last applicable user-agent field matches Wget. */
220 int user_agent_applies = 0;
222 /* non-zero if last applicable user-agent field *exactly* matches
224 int user_agent_exact = 0;
226 /* whether we ever encountered exact user agent. */
229 /* count of allow/disallow lines in the current "record", i.e. after
230 the last `user-agent' instructions. */
231 int record_count = 0;
233 struct robot_specs *specs = xmalloc (sizeof (struct robot_specs));
234 memset (specs, '\0', sizeof (struct robot_specs));
238 const char *lineend, *lineend_real;
239 const char *field_b, *field_e;
240 const char *value_b, *value_e;
244 lineend_real = memchr (p, '\n', end - p);
249 lineend = lineend_real;
251 /* Before doing anything else, check whether the line is empty
254 if (EOL (p) || *p == '#')
257 /* Make sure the end-of-line comments are respected by setting
258 lineend to a location preceding the first comment. Real line
259 ending remains in lineend_real. */
260 for (lineend = p; lineend < lineend_real; lineend++)
261 if ((lineend == p || ISSPACE (*(lineend - 1)))
265 /* Ignore trailing whitespace in the same way. */
266 while (lineend > p && ISSPACE (*(lineend - 1)))
272 while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
277 if (field_b == field_e || EOL (p) || *p != ':')
279 DEBUGP (("Ignoring malformed line %d", line_count));
290 /* Finally, we have a syntactically valid line. */
291 if (FIELD_IS ("user-agent"))
293 /* We have to support several cases:
300 ... matching record ...
304 ... non-matching record ...
307 ... matching record, but will be pruned later ...
309 We have to respect `User-Agent' at the beginning of each
310 new record simply because we don't know if we're going to
311 encounter "Wget" among the agents or not. Hence,
312 match_user_agent is called when record_count != 0.
314 But if record_count is 0, we have to keep calling it
315 until it matches, and if that happens, we must not call
316 it any more, until the next record. Hence the other part
318 if (record_count != 0 || user_agent_applies == 0)
319 match_user_agent (value_b, value_e - value_b,
320 &user_agent_applies, &user_agent_exact);
321 if (user_agent_exact)
325 else if (FIELD_IS ("allow"))
327 if (user_agent_applies)
329 add_path (specs, value_b, value_e, 1, user_agent_exact);
333 else if (FIELD_IS ("disallow"))
335 if (user_agent_applies)
338 if (value_b == value_e)
339 /* Empty "disallow" line means everything is
342 add_path (specs, value_b, value_e, allowed, user_agent_exact);
348 DEBUGP (("Ignoring unknown field at line %d", line_count));
359 /* We've encountered an exactly matching user-agent. Throw out
360 all the stuff with user-agent: *. */
361 prune_non_exact (specs);
363 else if (specs->size > specs->count)
365 /* add_path normally over-allocates specs->paths. Reallocate it
366 to the correct size in order to conserve some memory. */
367 specs->paths = xrealloc (specs->paths,
368 specs->count * sizeof (struct path_info));
369 specs->size = specs->count;
375 /* The same as res_parse, but first map the FILENAME into memory,
376 and then parse it. */
379 res_parse_from_file (const char *filename)
381 struct robot_specs *specs;
382 struct file_memory *fm = read_file (filename);
385 logprintf (LOG_NOTQUIET, "Cannot open %s: %s",
386 filename, strerror (errno));
389 specs = res_parse (fm->content, fm->length);
395 free_specs (struct robot_specs *specs)
398 for (i = 0; i < specs->count; i++)
399 xfree (specs->paths[i].path);
400 FREE_MAYBE (specs->paths);
404 /* Matching of a path according to the specs. */
406 /* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
407 that number is not a numerical representation of '/', decode C and
408 advance the pointer. */
410 #define DECODE_MAYBE(c, ptr) do { \
411 if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \
414 = (XCHAR_TO_XDIGIT (ptr[1]) << 4) + XCHAR_TO_XDIGIT (ptr[2]); \
415 if (decoded != '/') \
423 /* The inner matching engine: return non-zero if RECORD_PATH matches
424 URL_PATH. The rules for matching are described at
425 <http://www.robotstxt.org/wc/norobots-rfc.html>,
429 matches (const char *record_path, const char *url_path)
431 const char *rp = record_path;
432 const char *up = url_path;
442 DECODE_MAYBE(rc, rp);
443 DECODE_MAYBE(uc, up);
449 /* Iterate through all paths in SPECS. For the first one that
450 matches, return its allow/reject status. If none matches,
451 retrieval is by default allowed. */
454 res_match_path (const struct robot_specs *specs, const char *path)
459 for (i = 0; i < specs->count; i++)
460 if (matches (specs->paths[i].path, path))
462 int allowedp = specs->paths[i].allowedp;
463 DEBUGP (("%s path %s because of rule `%s'.\n",
464 allowedp ? "Allowing" : "Rejecting",
465 path, specs->paths[i].path));
471 /* Registering the specs. */
/* Table of the specs registered so far.  It starts out NULL and is
   created on first use by res_register_specs, using
   make_nocase_string_hash_table.  Keys are "HOST:PORT" strings built
   with SET_HOSTPORT; values are the `struct robot_specs *' pointers
   handed to res_register_specs.  */
473 static struct hash_table *registered_specs;
475 /* Stolen from cookies.c. */
476 #define SET_HOSTPORT(host, port, result) do { \
477 int HP_len = strlen (host); \
478 result = alloca (HP_len + 1 + numdigit (port) + 1); \
479 memcpy (result, host, HP_len); \
480 result[HP_len] = ':'; \
481 number_to_string (result + HP_len + 1, port); \
484 /* Register RES specs that belong to the server on HOST:PORT. They will
485 later be retrievable using res_get_specs. */
488 res_register_specs (const char *host, int port, struct robot_specs *specs)
490 struct robot_specs *old;
492 SET_HOSTPORT (host, port, hp);
494 if (!registered_specs)
495 registered_specs = make_nocase_string_hash_table (0);
497 /* Required to shut up the compiler. */
501 if (hash_table_get_pair (registered_specs, hp, hp_old, old))
505 hash_table_put (registered_specs, hp_old, specs);
509 hash_table_put (registered_specs, xstrdup (hp), specs);
513 /* Get the specs that belong to HOST:PORT. */
516 res_get_specs (const char *host, int port)
519 SET_HOSTPORT (host, port, hp);
520 if (!registered_specs)
522 return hash_table_get (registered_specs, hp);
525 /* Loading the robots file. */
/* Location of the robots file.  res_retrieve_file passes this to
   uri_merge together with the URL it was given, and retrieves the
   resulting URL.  */
527 #define RES_SPECS_LOCATION "/robots.txt"
529 /* Retrieve the robots.txt from the server root of the server that
530 serves URL. The file will be named according to the currently
531 active rules, and the file name will be returned in *file.
533 Return non-zero if robots were retrieved OK, zero otherwise. */
536 res_retrieve_file (const char *url, char **file)
539 char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
541 logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
543 err = retrieve_url (robots_url, file, NULL, NULL, NULL);
546 if (err != RETROK && *file != NULL)
548 /* If the file is not retrieved correctly, but retrieve_url
549 allocated the file name, deallocate it here so that the
550 caller doesn't have to worry about it. */
554 return err == RETROK;
558 cleanup_hash_table_mapper (void *key, void *value, void *arg_ignored)
568 if (registered_specs)
570 hash_table_map (registered_specs, cleanup_hash_table_mapper, NULL);
571 hash_table_destroy (registered_specs);
572 registered_specs = NULL;