sjero.net Git - wget/blob - src/res.c

   1 /* Support for Robot Exclusion Standard (RES).
   2    Copyright (C) 2001,2006 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3 of the License, or (at
   9 your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  18
  19 In addition, as a special exception, the Free Software Foundation
  20 gives permission to link the code of its release of Wget with the
  21 OpenSSL project's "OpenSSL" library (or with modified versions of it
  22 that use the same license as the "OpenSSL" library), and distribute
  23 the linked executables.  You must obey the GNU General Public License
  24 in all respects for all of the code used other than "OpenSSL".  If you
  25 modify this file, you may extend this exception to your version of the
  26 file, but you are not obligated to do so.  If you do not wish to do
  27 so, delete this exception statement from your version.  */
  28
  29 /* This file implements the Robot Exclusion Standard (RES).
  30
  31    RES is a simple protocol that enables site admins to signalize to
  32    the web crawlers that certain parts of the site should not be
  33    accessed.  All the admin needs to do is create a "robots.txt" file
  34    in the web server root, and use simple commands to allow or
  35    disallow access to certain parts of the site.
  36
  37    The first specification was written by Martijn Koster in 1994, and
  38    is still available at <http://www.robotstxt.org/wc/norobots.html>.
  39    In 1996, Martijn wrote an Internet Draft specifying an improved RES
  40    specification; however, that work was apparently abandoned since
  41    the draft has expired in 1997 and hasn't been replaced since.  The
  42    draft is available at
  43    <http://www.robotstxt.org/wc/norobots-rfc.html>.
  44
  45    This file implements RES as specified by the draft.  Note that this
  46    only handles the "robots.txt" support.  The META tag that controls
  47    whether the links should be followed is handled in `html-url.c'.
  48
  49    Known deviations:
  50
  51    * The end-of-line comment recognition is more in the spirit of the
  52      Bourne Shell (as specified by RES-1994).  That means that
  53      "foo#bar" is taken literally, whereas "foo #bar" is interpreted
  54      as "foo".  The Draft apparently specifies that both should be
  55      interpreted as "foo".
  56
  57    * We don't recognize sole CR as the line ending.
  58
   * We don't implement an expiry mechanism for /robots.txt specs.  I
     consider it unnecessary for a relatively short-lived
     application such as Wget.  Besides, it is highly questionable
     whether anyone deploys the recommended expiry scheme for
     robots.txt.
  64
  65    Entry points are functions res_parse, res_parse_from_file,
  66    res_match_path, res_register_specs, res_get_specs, and
  67    res_retrieve_file.  */
  68
  69 #ifdef HAVE_CONFIG_H
  70 # include <config.h>
  71 #endif
  72
  73 #include <stdio.h>
  74 #include <stdlib.h>
  75 #include <string.h>
  76 #include <errno.h>
  77 #include <assert.h>
  78
  79 #include "wget.h"
  80 #include "utils.h"
  81 #include "hash.h"
  82 #include "url.h"
  83 #include "retr.h"
  84 #include "res.h"
  85
  86 #ifdef TESTING
  87 #include "test.h"
  88 #endif
  89
/* One Allow/Disallow rule taken from a robots.txt record.  */
struct path_info {
  /* Rule path as stored by add_path: a heap copy of the rule's value
     with a single leading '/' removed, if it had one.  */
  char *path;
  /* True if a URL path matching this rule may be retrieved.  */
  bool allowedp;
  /* True if the rule was read under a User-Agent line that
     match_user_agent reported as an exact ("wget") match rather than
     the "*" wildcard.  */
  bool user_agent_exact_p;
};
  95
/* The set of rules collected from one robots.txt file.  */
struct robot_specs {
  /* Number of entries of PATHS that are in use.  */
  int count;
  /* Number of entries PATHS has room for; add_path doubles it when
     COUNT would exceed it.  */
  int size;
  /* Rules in the order they were added; res_match_path returns the
     verdict of the first one that matches.  */
  struct path_info *paths;
};
 101 \f
 102 /* Parsing the robot spec. */
 103
 104 /* Check whether AGENT (a string of length LENGTH) equals "wget" or
 105    "*".  If it is either of them, *matches is set to one.  If it is
 106    "wget", *exact_match is set to one.  */
 107
 108 static void
 109 match_user_agent (const char *agent, int length,
 110                   bool *matches, bool *exact_match)
 111 {
 112   if (length == 1 && *agent == '*')
 113     {
 114       *matches = true;
 115       *exact_match = false;
 116     }
 117   else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
 118     {
 119       *matches = true;
 120       *exact_match = true;
 121     }
 122   else
 123     {
 124       *matches = false;
 125       *exact_match = false;
 126     }
 127 }
 128
 129 /* Add a path specification between PATH_B and PATH_E as one of the
 130    paths in SPECS.  */
 131
 132 static void
 133 add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
 134           bool allowedp, bool exactp)
 135 {
 136   struct path_info pp;
 137   if (path_b < path_e && *path_b == '/')
 138     /* Our path representation doesn't use a leading slash, so remove
 139        one from theirs. */
 140     ++path_b;
 141   pp.path     = strdupdelim (path_b, path_e);
 142   pp.allowedp = allowedp;
 143   pp.user_agent_exact_p = exactp;
 144   ++specs->count;
 145   if (specs->count > specs->size)
 146     {
 147       if (specs->size == 0)
 148         specs->size = 1;
 149       else
 150         specs->size <<= 1;
 151       specs->paths = xrealloc (specs->paths,
 152                                specs->size * sizeof (struct path_info));
 153     }
 154   specs->paths[specs->count - 1] = pp;
 155 }
 156
 157 /* Recreate SPECS->paths with only those paths that have
 158    user_agent_exact_p set to true.  */
 159
 160 static void
 161 prune_non_exact (struct robot_specs *specs)
 162 {
 163   struct path_info *newpaths;
 164   int i, j, cnt;
 165   cnt = 0;
 166   for (i = 0; i < specs->count; i++)
 167     if (specs->paths[i].user_agent_exact_p)
 168       ++cnt;
 169   newpaths = xnew_array (struct path_info, cnt);
 170   for (i = 0, j = 0; i < specs->count; i++)
 171     if (specs->paths[i].user_agent_exact_p)
 172       newpaths[j++] = specs->paths[i];
 173   assert (j == cnt);
 174   xfree (specs->paths);
 175   specs->paths = newpaths;
 176   specs->count = cnt;
 177   specs->size  = cnt;
 178 }
 179
 180 #define EOL(p) ((p) >= lineend)
 181
 182 #define SKIP_SPACE(p) do {              \
 183   while (!EOL (p) && ISSPACE (*p))      \
 184     ++p;                                \
 185 } while (0)
 186
 187 #define FIELD_IS(string_literal)        \
 188   BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
 189
 190 /* Parse textual RES specs beginning with SOURCE of length LENGTH.
 191    Return a specs objects ready to be fed to res_match_path.
 192
 193    The parsing itself is trivial, but creating a correct SPECS object
 194    is trickier than it seems, because RES is surprisingly byzantine if
 195    you attempt to implement it correctly.
 196
 197    A "record" is a block of one or more `User-Agent' lines followed by
 198    one or more `Allow' or `Disallow' lines.  Record is accepted by
 199    Wget if one of the `User-Agent' lines was "wget", or if the user
 200    agent line was "*".
 201
 202    After all the lines have been read, we examine whether an exact
 203    ("wget") user-agent field was specified.  If so, we delete all the
 204    lines read under "User-Agent: *" blocks because we have our own
 205    Wget-specific blocks.  This enables the admin to say:
 206
 207        User-Agent: *
 208        Disallow: /
 209
 210        User-Agent: google
 211        User-Agent: wget
 212        Disallow: /cgi-bin
 213
 214    This means that to Wget and to Google, /cgi-bin is disallowed,
 215    whereas for all other crawlers, everything is disallowed.
 216    res_parse is implemented so that the order of records doesn't
 217    matter.  In the case above, the "User-Agent: *" could have come
 218    after the other one.  */
 219
 220 struct robot_specs *
 221 res_parse (const char *source, int length)
 222 {
 223   int line_count = 1;
 224
 225   const char *p   = source;
 226   const char *end = source + length;
 227
 228   /* true if last applicable user-agent field matches Wget. */
 229   bool user_agent_applies = false;
 230
 231   /* true if last applicable user-agent field *exactly* matches
 232      Wget.  */
 233   bool user_agent_exact = false;
 234
 235   /* whether we ever encountered exact user agent. */
 236   bool found_exact = false;
 237
 238   /* count of allow/disallow lines in the current "record", i.e. after
 239      the last `user-agent' instructions.  */
 240   int record_count = 0;
 241
 242   struct robot_specs *specs = xnew0 (struct robot_specs);
 243
 244   while (1)
 245     {
 246       const char *lineend, *lineend_real;
 247       const char *field_b, *field_e;
 248       const char *value_b, *value_e;
 249
 250       if (p == end)
 251         break;
 252       lineend_real = memchr (p, '\n', end - p);
 253       if (lineend_real)
 254         ++lineend_real;
 255       else
 256         lineend_real = end;
 257       lineend = lineend_real;
 258
 259       /* Before doing anything else, check whether the line is empty
 260          or comment-only. */
 261       SKIP_SPACE (p);
 262       if (EOL (p) || *p == '#')
 263         goto next;
 264
 265       /* Make sure the end-of-line comments are respected by setting
 266          lineend to a location preceding the first comment.  Real line
 267          ending remains in lineend_real.  */
 268       for (lineend = p; lineend < lineend_real; lineend++)
 269         if ((lineend == p || ISSPACE (*(lineend - 1)))
 270             && *lineend == '#')
 271           break;
 272
 273       /* Ignore trailing whitespace in the same way. */
 274       while (lineend > p && ISSPACE (*(lineend - 1)))
 275         --lineend;
 276
 277       assert (!EOL (p));
 278
 279       field_b = p;
 280       while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
 281         ++p;
 282       field_e = p;
 283
 284       SKIP_SPACE (p);
 285       if (field_b == field_e || EOL (p) || *p != ':')
 286         {
 287           DEBUGP (("Ignoring malformed line %d", line_count));
 288           goto next;
 289         }
 290       ++p;                      /* skip ':' */
 291       SKIP_SPACE (p);
 292
 293       value_b = p;
 294       while (!EOL (p))
 295         ++p;
 296       value_e = p;
 297
 298       /* Finally, we have a syntactically valid line. */
 299       if (FIELD_IS ("user-agent"))
 300         {
 301           /* We have to support several cases:
 302
 303              --previous records--
 304
 305              User-Agent: foo
 306              User-Agent: Wget
 307              User-Agent: bar
 308              ... matching record ...
 309
 310              User-Agent: baz
 311              User-Agent: qux
 312              ... non-matching record ...
 313
 314              User-Agent: *
 315              ... matching record, but will be pruned later ...
 316
 317              We have to respect `User-Agent' at the beginning of each
 318              new record simply because we don't know if we're going to
 319              encounter "Wget" among the agents or not.  Hence,
 320              match_user_agent is called when record_count != 0.
 321
 322              But if record_count is 0, we have to keep calling it
 323              until it matches, and if that happens, we must not call
 324              it any more, until the next record.  Hence the other part
 325              of the condition.  */
 326           if (record_count != 0 || user_agent_applies == false)
 327             match_user_agent (value_b, value_e - value_b,
 328                               &user_agent_applies, &user_agent_exact);
 329           if (user_agent_exact)
 330             found_exact = true;
 331           record_count = 0;
 332         }
 333       else if (FIELD_IS ("allow"))
 334         {
 335           if (user_agent_applies)
 336             {
 337               add_path (specs, value_b, value_e, true, user_agent_exact);
 338             }
 339           ++record_count;
 340         }
 341       else if (FIELD_IS ("disallow"))
 342         {
 343           if (user_agent_applies)
 344             {
 345               bool allowed = false;
 346               if (value_b == value_e)
 347                 /* Empty "disallow" line means everything is *allowed*!  */
 348                 allowed = true;
 349               add_path (specs, value_b, value_e, allowed, user_agent_exact);
 350             }
 351           ++record_count;
 352         }
 353       else
 354         {
 355           DEBUGP (("Ignoring unknown field at line %d", line_count));
 356           goto next;
 357         }
 358
 359     next:
 360       p = lineend_real;
 361       ++line_count;
 362     }
 363
 364   if (found_exact)
 365     {
 366       /* We've encountered an exactly matching user-agent.  Throw out
 367          all the stuff with user-agent: *.  */
 368       prune_non_exact (specs);
 369     }
 370   else if (specs->size > specs->count)
 371     {
 372       /* add_path normally over-allocates specs->paths.  Reallocate it
 373          to the correct size in order to conserve some memory.  */
 374       specs->paths = xrealloc (specs->paths,
 375                                specs->count * sizeof (struct path_info));
 376       specs->size = specs->count;
 377     }
 378
 379   return specs;
 380 }
 381
 382 /* The same like res_parse, but first map the FILENAME into memory,
 383    and then parse it.  */
 384
 385 struct robot_specs *
 386 res_parse_from_file (const char *filename)
 387 {
 388   struct robot_specs *specs;
 389   struct file_memory *fm = read_file (filename);
 390   if (!fm)
 391     {
 392       logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
 393                  filename, strerror (errno));
 394       return NULL;
 395     }
 396   specs = res_parse (fm->content, fm->length);
 397   read_file_free (fm);
 398   return specs;
 399 }
 400
 401 static void
 402 free_specs (struct robot_specs *specs)
 403 {
 404   int i;
 405   for (i = 0; i < specs->count; i++)
 406     xfree (specs->paths[i].path);
 407   xfree_null (specs->paths);
 408   xfree (specs);
 409 }
 410 \f
 411 /* Matching of a path according to the specs. */
 412
 413 /* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
 414    that number is not a numerical representation of '/', decode C and
 415    advance the pointer.  */
 416
 417 #define DECODE_MAYBE(c, ptr) do {                               \
 418   if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2]))       \
 419     {                                                           \
 420       char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]);          \
 421       if (decoded != '/')                                       \
 422         {                                                       \
 423           c = decoded;                                          \
 424           ptr += 2;                                             \
 425         }                                                       \
 426     }                                                           \
 427 } while (0)
 428
 429 /* The inner matching engine: return true if RECORD_PATH matches
 430    URL_PATH.  The rules for matching are described at
 431    <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2.  */
 432
 433 static bool
 434 matches (const char *record_path, const char *url_path)
 435 {
 436   const char *rp = record_path;
 437   const char *up = url_path;
 438
 439   for (; ; ++rp, ++up)
 440     {
 441       char rc = *rp;
 442       char uc = *up;
 443       if (!rc)
 444         return true;
 445       if (!uc)
 446         return false;
 447       DECODE_MAYBE(rc, rp);
 448       DECODE_MAYBE(uc, up);
 449       if (rc != uc)
 450         return false;
 451     }
 452 }
 453
 454 /* Iterate through all paths in SPECS.  For the first one that
 455    matches, return its allow/reject status.  If none matches,
 456    retrieval is by default allowed.  */
 457
 458 bool
 459 res_match_path (const struct robot_specs *specs, const char *path)
 460 {
 461   int i;
 462   if (!specs)
 463     return true;
 464   for (i = 0; i < specs->count; i++)
 465     if (matches (specs->paths[i].path, path))
 466       {
 467         bool allowedp = specs->paths[i].allowedp;
 468         DEBUGP (("%s path %s because of rule `%s'.\n",
 469                  allowedp ? "Allowing" : "Rejecting",
 470                  path, specs->paths[i].path));
 471         return allowedp;
 472       }
 473   return true;
 474 }
 475 \f
 476 /* Registering the specs. */
 477
 478 static struct hash_table *registered_specs;
 479
 480 /* Stolen from cookies.c. */
 481 #define SET_HOSTPORT(host, port, result) do {           \
 482   int HP_len = strlen (host);                           \
 483   result = alloca (HP_len + 1 + numdigit (port) + 1);   \
 484   memcpy (result, host, HP_len);                        \
 485   result[HP_len] = ':';                                 \
 486   number_to_string (result + HP_len + 1, port);         \
 487 } while (0)
 488
 489 /* Register RES specs that below to server on HOST:PORT.  They will
 490    later be retrievable using res_get_specs.  */
 491
 492 void
 493 res_register_specs (const char *host, int port, struct robot_specs *specs)
 494 {
 495   struct robot_specs *old;
 496   char *hp, *hp_old;
 497   SET_HOSTPORT (host, port, hp);
 498
 499   if (!registered_specs)
 500     registered_specs = make_nocase_string_hash_table (0);
 501
 502   if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
 503     {
 504       if (old)
 505         free_specs (old);
 506       hash_table_put (registered_specs, hp_old, specs);
 507     }
 508   else
 509     {
 510       hash_table_put (registered_specs, xstrdup (hp), specs);
 511     }
 512 }
 513
 514 /* Get the specs that belong to HOST:PORT. */
 515
 516 struct robot_specs *
 517 res_get_specs (const char *host, int port)
 518 {
 519   char *hp;
 520   SET_HOSTPORT (host, port, hp);
 521   if (!registered_specs)
 522     return NULL;
 523   return hash_table_get (registered_specs, hp);
 524 }
 525 \f
 526 /* Loading the robots file.  */
 527
 528 #define RES_SPECS_LOCATION "/robots.txt"
 529
 530 /* Retrieve the robots.txt from the server root of the server that
 531    serves URL.  The file will be named according to the currently
 532    active rules, and the file name will be returned in *file.
 533
 534    Return true if robots were retrieved OK, false otherwise.  */
 535
 536 bool
 537 res_retrieve_file (const char *url, char **file)
 538 {
 539   uerr_t err;
 540   char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
 541   int saved_ts_val = opt.timestamping;
 542   int saved_sp_val = opt.spider;
 543
 544   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
 545   *file = NULL;
 546   opt.timestamping = false;
 547   opt.spider       = false;
 548   err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
 549   opt.timestamping = saved_ts_val;
 550   opt.spider       = saved_sp_val;
 551   xfree (robots_url);
 552
 553   if (err != RETROK && *file != NULL)
 554     {
 555       /* If the file is not retrieved correctly, but retrieve_url
 556          allocated the file name, deallocate is here so that the
 557          caller doesn't have to worry about it.  */
 558       xfree (*file);
 559       *file = NULL;
 560     }
 561   return err == RETROK;
 562 }
 563 \f
 564 bool
 565 is_robots_txt_url (const char *url)
 566 {
 567   char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
 568   bool ret = are_urls_equal (url, robots_url);
 569
 570   xfree (robots_url);
 571
 572   return ret;
 573 }
 574 \f
 575 void
 576 res_cleanup (void)
 577 {
 578   if (registered_specs)
 579     {
 580       hash_table_iterator iter;
 581       for (hash_table_iterate (registered_specs, &iter);
 582            hash_table_iter_next (&iter);
 583            )
 584         {
 585           xfree (iter.key);
 586           free_specs (iter.value);
 587         }
 588       hash_table_destroy (registered_specs);
 589       registered_specs = NULL;
 590     }
 591 }
 592 \f
 593 #ifdef TESTING
 594
 595 const char *
 596 test_is_robots_txt_url()
 597 {
 598   int i;
 599   struct {
 600     char *url;
 601     bool expected_result;
 602   } test_array[] = {
 603     { "http://www.yoyodyne.com/robots.txt", true },
 604     { "http://www.yoyodyne.com/somepath/", false },
 605     { "http://www.yoyodyne.com/somepath/robots.txt", false },
 606   };
 607
 608   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
 609     {
 610       mu_assert ("test_is_robots_txt_url: wrong result",
 611                  is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);
 612     }
 613
 614   return NULL;
 615 }
 616
 617 #endif /* TESTING */
 618
 619 /*
 620  * vim: et ts=2 sw=2
 621  */
 622