1 /* Support for Robot Exclusion Standard (RES).
2 Copyright (C) 2001 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
20 /* This file implements the Robot Exclusion Standard (RES).
22 RES is a simple protocol that enables site admins to signal to
23 the web crawlers that certain parts of the site should not be
24 accessed. All the admin needs to do is create a "robots.txt" file
25 in the web server root, and use simple commands to allow or
26 disallow access to certain parts of the site.
28 The first specification was written by Martijn Koster in 1994, and
29 is still available at <http://www.robotstxt.org/wc/norobots.html>.
30 In 1996, Martijn wrote an Internet Draft specifying an improved RES
31 specification; however, that work was apparently abandoned since
32 the draft has expired in 1997 and hasn't been replaced since. The
33 draft is still available at
34 <http://www.robotstxt.org/wc/norobots-rfc.html>.
36 This file implements RES as specified by the draft. Note that this
37 only handles the "robots.txt" support. The META tag that controls
38 whether the links should be followed is handled in `html-url.c'.
42 * The end-of-line comment recognition is more in the spirit of the
43 Bourne Shell (as specified by RES-1994). That means that
44 "foo#bar" is taken literally, whereas "foo #bar" is interpreted
45 as "foo". The Draft apparently specifies that both should be
46 interpreted as comments.
48 * We don't recognize sole CR as the line ending.
50 * We don't implement expiry mechanism for /robots.txt specs. I
51 consider it unnecessary for a relatively short-lived
52 application such as Wget. Besides, it is highly questionable
53 whether anyone deploys the recommended expiry scheme for
54 /robots.txt.
56 Entry points are functions res_parse, res_parse_from_file,
57 res_match_path, res_register_specs, res_get_specs, and
58 res_cleanup.
70 #endif /* HAVE_STRING_H */
84 int user_agent_exact_p;
90 struct path_info *paths;
93 /* Parsing the robot spec. */
95 /* Check whether AGENT (a string of length LENGTH) equals "wget" or
96 "*". If it is either of them, *matches is set to one. If it is
97 "wget", *exact_match is set to one. */
/* NOTE(review): several lines of this function (return type, braces,
   and the bodies of both branches) are missing from this chunk; the
   comments below describe only the visible code.  */
100 match_user_agent (const char *agent, int length,
101 int *matches, int *exact_match)
/* A lone "*" is the RES wildcard user-agent: it applies to every
   crawler, so it matches, but it is not an exact "wget" match.  */
103 if (length == 1 && *agent == '*')
/* AGENT is not NUL-terminated, hence the bounded, case-insensitive
   comparison over [agent, agent + length).  */
108 else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
120 /* Add a path specification between PATH_B and PATH_E as one of the
/* NOTE(review): lines of this function (the rest of the comment, the
   return type, braces, the `pp' declaration, and the count/size
   updates) are missing from this chunk; comments describe only what
   is visible.  */
124 add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
125 int allowedp, int exactp)
/* Fill in a new path_info record: a heap copy of the [path_b, path_e)
   substring, the allow/disallow flag, and whether the governing
   User-Agent line matched "wget" exactly.  */
128 pp.path = strdupdelim (path_b, path_e);
129 pp.allowedp = allowedp;
130 pp.user_agent_exact_p = exactp;
/* Grow the paths array geometrically when count exceeds capacity;
   presumably the missing lines bump `count' and double `size' --
   confirm against the full file.  */
132 if (specs->count > specs->size)
134 if (specs->size == 0)
138 specs->paths = xrealloc (specs->paths,
139 specs->size * sizeof (struct path_info));
/* Store the new record in the slot just made available.  */
141 specs->paths[specs->count - 1] = pp;
144 /* Recreate SPECS->paths with only those paths that have non-zero
145 user_agent_exact_p. */
148 prune_non_exact (struct robot_specs *specs)
150 struct path_info *newpaths;
/* First pass: count the entries recorded under an exact ("wget")
   User-Agent match, so the replacement array can be sized exactly.
   NOTE(review): the statement incrementing the counter is not
   visible in this chunk.  */
153 for (i = 0; i < specs->count; i++)
154 if (specs->paths[i].user_agent_exact_p)
156 newpaths = xmalloc (cnt * sizeof (struct path_info));
/* Second pass: copy the surviving entries, preserving file order.  */
157 for (i = 0, j = 0; i < specs->count; i++)
158 if (specs->paths[i].user_agent_exact_p)
159 newpaths[j++] = specs->paths[i];
/* Release the old array and install the pruned one.  The path
   strings themselves are shared by the copied records, so they are
   not freed here.  */
161 xfree (specs->paths);
162 specs->paths = newpaths;
/* True when P has reached the (comment-stripped) end of the current
   line; `lineend' is a local of res_parse below.  */
167 #define EOL(p) ((p) >= lineend)
/* Advance P over whitespace, but never past the end of the line.
   NOTE(review): the continuation lines of this macro (the increment
   and the closing `} while (0)') are missing from this chunk, so no
   comments are inserted inside it.  */
169 #define SKIP_SPACE(p) do { \
170 while (!EOL (p) && ISSPACE (*p)) \
174 #define FIELD_IS(string_literal) \
175 BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
177 /* Parse textual RES specs beginning with SOURCE of length LENGTH.
178 Return a specs objects ready to be fed to res_match_path.
180 The parsing itself is trivial, but creating a correct SPECS object
181 is trickier than it seems, because RES is surprisingly byzantine if
182 you attempt to implement it correctly.
184 A "record" is a block of one or more `User-Agent' lines followed by
185 one or more `Allow' or `Disallow' lines. Record is accepted by
186 Wget if one of the `User-Agent' lines was "wget", or if the user
189 After all the lines have been read, we examine whether an exact
190 ("wget") user-agent field was specified. If so, we delete all the
191 lines read under "User-Agent: *" blocks because we have our own
192 Wget-specific blocks. This enables the admin to say:
201 This means that to Wget and to Google, /cgi-bin is disallowed,
202 whereas for all other crawlers, everything is disallowed.
203 res_parse is implemented so that the order of records doesn't
204 matter. In the case above, the "User-Agent: *" could have come
205 after the other one. */
/* NOTE(review): a number of lines of this function (the return type,
   braces, the line-reading loop header, and several statements) are
   missing from this chunk; the comments below describe only the
   visible code.  */
208 res_parse (const char *source, int length)
/* Parsing cursor and end of the in-memory robots.txt text; the input
   is not NUL-terminated, so END bounds every scan.  */
212 const char *p = source;
213 const char *end = source + length;
215 /* non-zero if last applicable user-agent field matches Wget. */
216 int user_agent_applies = 0;
218 /* non-zero if last applicable user-agent field *exactly* matches
220 int user_agent_exact = 0;
222 /* whether we ever encountered exact user agent. */
225 /* count of allow/disallow lines in the current "record", i.e. after
226 the last `user-agent' instructions. */
227 int record_count = 0;
/* The result object starts out zeroed: no paths, zero count/size.  */
229 struct robot_specs *specs = xmalloc (sizeof (struct robot_specs));
230 memset (specs, '\0', sizeof (struct robot_specs));
/* Per-line state: [field_b, field_e) is the field name and
   [value_b, value_e) the value, both within the current line.
   NOTE(review): these look like locals of the (missing) per-line
   loop body -- confirm against the full file.  */
234 const char *lineend, *lineend_real;
235 const char *field_b, *field_e;
236 const char *value_b, *value_e;
/* Locate the physical end of the current line.  */
240 lineend_real = memchr (p, '\n', end - p);
245 lineend = lineend_real;
247 /* Before doing anything else, check whether the line is empty
250 if (EOL (p) || *p == '#')
253 /* Make sure the end-of-line comments are respected by setting
254 lineend to a location preceding the first comment. Real line
255 ending remains in lineend_real. */
/* A `#' starts a comment only at line start or after whitespace:
   "foo #bar" is commented, "foo#bar" is literal (see the file
   header notes).  */
256 for (lineend = p; lineend < lineend_real; lineend++)
257 if ((lineend == p || ISSPACE (*(lineend - 1)))
261 /* Ignore trailing whitespace in the same way. */
262 while (lineend > p && ISSPACE (*(lineend - 1)))
/* Scan the field name: letters, digits and `-', as in
   "User-Agent" or "Disallow".  */
268 while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
/* A valid line needs a non-empty field name followed by `:'.  */
273 if (field_b == field_e || EOL (p) || *p != ':')
275 DEBUGP (("Ignoring malformed line %d", line_count));
286 /* Finally, we have a syntactically valid line. */
287 if (FIELD_IS ("user-agent"))
289 /* We have to support several cases:
296 ... matching record ...
300 ... non-matching record ...
303 ... matching record, but will be pruned later ...
305 We have to respect `User-Agent' at the beginning of each
306 new record simply because we don't know if we're going to
307 encounter "Wget" among the agents or not. Hence,
308 match_user_agent is called when record_count != 0.
310 But if record_count is 0, we have to keep calling it
311 until it matches, and if that happens, we must not call
312 it any more, until the next record. Hence the other part
314 if (record_count != 0 || user_agent_applies == 0)
315 match_user_agent (value_b, value_e - value_b,
316 &user_agent_applies, &user_agent_exact);
/* Remember that an exact "wget" agent was ever seen; presumably
   a flag consulted after the loop is set in a line missing from
   this chunk.  */
317 if (user_agent_exact)
321 else if (FIELD_IS ("allow"))
/* Record an Allow path, but only under a matching User-Agent.  */
323 if (user_agent_applies)
325 add_path (specs, value_b, value_e, 1, user_agent_exact);
329 else if (FIELD_IS ("disallow"))
331 if (user_agent_applies)
/* Per RES, an empty Disallow value means "allow everything";
   presumably the missing lines set `allowed' accordingly.  */
334 if (value_b == value_e)
335 /* Empty "disallow" line means everything is
338 add_path (specs, value_b, value_e, allowed, user_agent_exact);
344 DEBUGP (("Ignoring unknown field at line %d", line_count));
355 /* We've encountered an exactly matching user-agent. Throw out
356 all the stuff with user-agent: *. */
357 prune_non_exact (specs);
359 else if (specs->size > specs->count)
361 /* add_path normally over-allocates specs->paths. Reallocate it
362 to the correct size in order to conserve some memory. */
363 specs->paths = xrealloc (specs->paths,
364 specs->count * sizeof (struct path_info));
365 specs->size = specs->count;
371 /* Like res_parse, but first map FILENAME into memory,
372 and then parse it. */
375 res_parse_from_file (const char *filename)
377 struct robot_specs *specs;
378 struct file_memory *fm = read_file (filename);
/* NOTE(review): the `if (!fm)' guard, its braces, and the early
   return are not visible in this chunk; this error message
   presumably runs only when read_file failed.  */
381 logprintf (LOG_NOTQUIET, "Cannot open %s: %s",
382 filename, strerror (errno));
/* Parse the whole in-memory copy of the file.  The mapping is
   presumably released in a line missing from this chunk.  */
385 specs = res_parse (fm->content, fm->length);
/* Deallocate a robot_specs object.  NOTE(review): most of this
   function is missing from this chunk; presumably the individual
   path strings are freed before the array itself, and SPECS is
   freed last -- confirm against the full file.  */
391 free_specs (struct robot_specs *specs)
393 FREE_MAYBE (specs->paths);
397 /* Matching of a path according to the specs. */
399 /* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
400 that number is not a numerical representation of '/', decode C and
401 advance the pointer. */
/* NOTE(review): continuation lines of this macro are missing from
   this chunk, so no comments are inserted inside it.  '/' is
   deliberately left %-encoded: an escaped slash is not a path
   separator, so "%2f" must not compare equal to "/".  */
403 #define DECODE_MAYBE(c, ptr) do { \
404 if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \
407 = (XCHAR_TO_XDIGIT (ptr[1]) << 4) + XCHAR_TO_XDIGIT (ptr[2]); \
408 if (decoded != '/') \
416 /* The inner matching engine: return non-zero if RECORD_PATH matches
417 URL_PATH. The rules for matching are described at
418 <http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html>,
422 matches (const char *record_path, const char *url_path)
/* Independent cursors over the record's path and the URL's path.  */
424 const char *rp = record_path;
425 const char *up = url_path;
/* NOTE(review): the character-comparison loop surrounding these
   calls is missing from this chunk.  Both sides are %-decoded on
   the fly so that, e.g., "%7e" compares equal to "~".  */
435 DECODE_MAYBE(rc, rp);
436 DECODE_MAYBE(uc, up);
442 /* Iterate through all paths in SPECS. For the first one that
443 matches, return its allow/reject status. If none matches,
444 retrieval is by default allowed. */
447 res_match_path (const struct robot_specs *specs, const char *path)
/* First match wins: entries are stored in file order, and an exact
   ("wget") agent section has already displaced wildcard entries via
   prune_non_exact during parsing.  */
452 for (i = 0; i < specs->count; i++)
453 if (matches (specs->paths[i].path, path))
455 int allowedp = specs->paths[i].allowedp;
456 DEBUGP (("%s path %s because of rule `%s'.\n",
457 allowedp ? "Allowing" : "Rejecting",
458 path, specs->paths[i].path));
464 /* Registering the specs. */
/* Maps "host:port" keys to struct robot_specs *; created with a
   case-insensitive string hash (see res_register_specs below).  */
466 struct hash_table *registered_specs;
468 /* Stolen from cookies.c. */
/* Build the "HOST:PORT" key in RESULT using alloca, i.e. the string
   lives in the caller's stack frame and must not be stored without
   copying.  NOTE(review): the closing lines of this macro are
   missing from this chunk, so no comments are inserted inside it.  */
469 #define SET_HOSTPORT(host, port, result) do { \
470 int HP_len = strlen (host); \
471 result = alloca (HP_len + 1 + numdigit (port) + 1); \
472 memcpy (result, host, HP_len); \
473 result[HP_len] = ':'; \
474 long_to_string (result + HP_len + 1, port); \
477 /* Register RES specs that belong to the server on HOST:PORT. They will
478 later be retrievable using res_get_specs. */
481 res_register_specs (const char *host, int port, struct robot_specs *specs)
483 struct robot_specs *old;
/* hp is the alloca'd "host:port" lookup key.  */
485 SET_HOSTPORT (host, port, hp);
/* Create the table lazily on first registration.  */
487 if (!registered_specs)
488 registered_specs = make_nocase_string_hash_table (0);
490 /* Required to shut up the compiler. */
/* If specs are already registered for this host:port, reuse the
   existing heap key so it is not leaked; presumably the old specs
   are freed in lines missing from this chunk.
   NOTE(review): hash_table_get_pair normally takes the addresses of
   its out-parameters -- confirm `hp_old'/`old' are passed as
   `&hp_old'/`&old' in the full file.  */
494 if (hash_table_get_pair (registered_specs, hp, hp_old, old))
498 hash_table_put (registered_specs, hp_old, specs);
/* New entry: the stack-allocated key must be heap-copied because the
   table keeps the pointer.  */
502 hash_table_put (registered_specs, xstrdup (hp), specs);
506 /* Get the specs that belong to HOST:PORT. */
509 res_get_specs (const char *host, int port)
512 SET_HOSTPORT (host, port, hp);
/* No table yet means nothing was ever registered; presumably the
   missing line returns NULL here.  */
513 if (!registered_specs)
515 return hash_table_get (registered_specs, hp);
518 /* Loading the robots file. */
520 #define RES_SPECS_LOCATION "/robots.txt"
522 /* Retrieve the robots.txt from the server root of the server that
523 serves URL. The file will be named according to the currently
524 active rules, and the file name will be returned in *file.
526 Return non-zero if robots were retrieved OK, zero otherwise. */
529 res_retrieve_file (const char *url, char **file)
/* Derive ".../robots.txt" from URL.  uri_merge allocates; presumably
   robots_url is freed before returning, in a line missing from this
   chunk.  */
532 char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
534 logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
/* Download to a local file whose name retrieve_url stores in *file.  */
536 err = retrieve_url (robots_url, file, NULL, NULL, NULL);
539 if (err != RETROK && *file != NULL)
541 /* If the file is not retrieved correctly, but retrieve_url
542 allocated the file name, deallocate it here so that the
543 caller doesn't have to worry about it. */
547 return err == RETROK;