sjero.net Git - wget/blob - src/res.c

   1 /* Support for Robot Exclusion Standard (RES).
   2    Copyright (C) 2001, 2006, 2007, 2008 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3 of the License, or (at
   9 your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  18
  19 Additional permission under GNU GPL version 3 section 7
  20
  21 If you modify this program, or any covered work, by linking or
  22 combining it with the OpenSSL project's OpenSSL library (or a
  23 modified version of that library), containing parts covered by the
  24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  25 grants you additional permission to convey the resulting work.
  26 Corresponding Source for a non-source form of such a combination
  27 shall include the source code for the parts of OpenSSL used as well
  28 as that of the covered work.  */
  29
  30 /* This file implements the Robot Exclusion Standard (RES).
  31
   32    RES is a simple protocol that enables site admins to signal to
   33    web crawlers that certain parts of the site should not be
   34    accessed.  All the admin needs to do is create a "robots.txt" file
   35    in the web server root, and use simple commands to allow or
   36    disallow access to certain parts of the site.
  37
  38    The first specification was written by Martijn Koster in 1994, and
  39    is still available at <http://www.robotstxt.org/wc/norobots.html>.
  40    In 1996, Martijn wrote an Internet Draft specifying an improved RES
  41    specification; however, that work was apparently abandoned since
  42    the draft has expired in 1997 and hasn't been replaced since.  The
  43    draft is available at
  44    <http://www.robotstxt.org/wc/norobots-rfc.html>.
  45
  46    This file implements RES as specified by the draft.  Note that this
  47    only handles the "robots.txt" support.  The META tag that controls
  48    whether the links should be followed is handled in `html-url.c'.
  49
  50    Known deviations:
  51
  52    * The end-of-line comment recognition is more in the spirit of the
  53      Bourne Shell (as specified by RES-1994).  That means that
  54      "foo#bar" is taken literally, whereas "foo #bar" is interpreted
  55      as "foo".  The Draft apparently specifies that both should be
  56      interpreted as "foo".
  57
  58    * We don't recognize sole CR as the line ending.
  59
   60    * We don't implement an expiry mechanism for /robots.txt specs.  I
   61      consider it unnecessary for a relatively short-lived
   62      application such as Wget.  Besides, it is highly questionable
   63      whether anyone deploys the recommended expiry scheme for
   64      robots.txt.
  65
  66    Entry points are functions res_parse, res_parse_from_file,
  67    res_match_path, res_register_specs, res_get_specs, and
  68    res_retrieve_file.  */
  69
  70 #include "wget.h"
  71
  72 #include <stdio.h>
  73 #include <stdlib.h>
  74 #include <string.h>
  75 #include <errno.h>
  76 #include <assert.h>
  77
  78 #include "utils.h"
  79 #include "hash.h"
  80 #include "url.h"
  81 #include "retr.h"
  82 #include "res.h"
  83
  84 #ifdef TESTING
  85 #include "test.h"
  86 #endif
  87
/* One Allow/Disallow rule collected by res_parse.  */
struct path_info {
  /* Heap copy of the rule's path.  add_path strips one leading '/'
     before storing it; free_specs releases it.  */
  char *path;
  /* True for an "allow" rule and for an empty "disallow" rule; false
     for a non-empty "disallow" rule.  */
  bool allowedp;
  /* True if the rule was read while the exact ("wget") user-agent was
     in effect rather than "*"; prune_non_exact filters on this.  */
  bool user_agent_exact_p;
};
  93
/* The list of rules collected from one robots.txt.  */
struct robot_specs {
  /* Number of entries of PATHS that are in use.  */
  int count;
  /* Number of entries PATHS currently has room for; add_path doubles
     it whenever COUNT would exceed it.  */
  int size;
  /* Array of rules, in the order they were added.  May be NULL while
     COUNT is 0 (free_specs releases it with xfree_null).  */
  struct path_info *paths;
};
  99 \f
 100 /* Parsing the robot spec. */
 101
 102 /* Check whether AGENT (a string of length LENGTH) equals "wget" or
 103    "*".  If it is either of them, *matches is set to one.  If it is
 104    "wget", *exact_match is set to one.  */
 105
 106 static void
 107 match_user_agent (const char *agent, int length,
 108                   bool *matches, bool *exact_match)
 109 {
 110   if (length == 1 && *agent == '*')
 111     {
 112       *matches = true;
 113       *exact_match = false;
 114     }
 115   else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
 116     {
 117       *matches = true;
 118       *exact_match = true;
 119     }
 120   else
 121     {
 122       *matches = false;
 123       *exact_match = false;
 124     }
 125 }
 126
 127 /* Add a path specification between PATH_B and PATH_E as one of the
 128    paths in SPECS.  */
 129
 130 static void
 131 add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
 132           bool allowedp, bool exactp)
 133 {
 134   struct path_info pp;
 135   if (path_b < path_e && *path_b == '/')
 136     /* Our path representation doesn't use a leading slash, so remove
 137        one from theirs. */
 138     ++path_b;
 139   pp.path     = strdupdelim (path_b, path_e);
 140   pp.allowedp = allowedp;
 141   pp.user_agent_exact_p = exactp;
 142   ++specs->count;
 143   if (specs->count > specs->size)
 144     {
 145       if (specs->size == 0)
 146         specs->size = 1;
 147       else
 148         specs->size <<= 1;
 149       specs->paths = xrealloc (specs->paths,
 150                                specs->size * sizeof (struct path_info));
 151     }
 152   specs->paths[specs->count - 1] = pp;
 153 }
 154
 155 /* Recreate SPECS->paths with only those paths that have
 156    user_agent_exact_p set to true.  */
 157
 158 static void
 159 prune_non_exact (struct robot_specs *specs)
 160 {
 161   struct path_info *newpaths;
 162   int i, j, cnt;
 163   cnt = 0;
 164   for (i = 0; i < specs->count; i++)
 165     if (specs->paths[i].user_agent_exact_p)
 166       ++cnt;
 167   newpaths = xnew_array (struct path_info, cnt);
 168   for (i = 0, j = 0; i < specs->count; i++)
 169     if (specs->paths[i].user_agent_exact_p)
 170       newpaths[j++] = specs->paths[i];
 171   assert (j == cnt);
 172   xfree (specs->paths);
 173   specs->paths = newpaths;
 174   specs->count = cnt;
 175   specs->size  = cnt;
 176 }
 177
 178 #define EOL(p) ((p) >= lineend)
 179
 180 #define SKIP_SPACE(p) do {              \
 181   while (!EOL (p) && c_isspace (*p))      \
 182     ++p;                                \
 183 } while (0)
 184
 185 #define FIELD_IS(string_literal)        \
 186   BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
 187
 188 /* Parse textual RES specs beginning with SOURCE of length LENGTH.
 189    Return a specs objects ready to be fed to res_match_path.
 190
 191    The parsing itself is trivial, but creating a correct SPECS object
 192    is trickier than it seems, because RES is surprisingly byzantine if
 193    you attempt to implement it correctly.
 194
 195    A "record" is a block of one or more `User-Agent' lines followed by
 196    one or more `Allow' or `Disallow' lines.  Record is accepted by
 197    Wget if one of the `User-Agent' lines was "wget", or if the user
 198    agent line was "*".
 199
 200    After all the lines have been read, we examine whether an exact
 201    ("wget") user-agent field was specified.  If so, we delete all the
 202    lines read under "User-Agent: *" blocks because we have our own
 203    Wget-specific blocks.  This enables the admin to say:
 204
 205        User-Agent: *
 206        Disallow: /
 207
 208        User-Agent: google
 209        User-Agent: wget
 210        Disallow: /cgi-bin
 211
 212    This means that to Wget and to Google, /cgi-bin is disallowed,
 213    whereas for all other crawlers, everything is disallowed.
 214    res_parse is implemented so that the order of records doesn't
 215    matter.  In the case above, the "User-Agent: *" could have come
 216    after the other one.  */
 217
 218 struct robot_specs *
 219 res_parse (const char *source, int length)
 220 {
 221   int line_count = 1;
 222
 223   const char *p   = source;
 224   const char *end = source + length;
 225
 226   /* true if last applicable user-agent field matches Wget. */
 227   bool user_agent_applies = false;
 228
 229   /* true if last applicable user-agent field *exactly* matches
 230      Wget.  */
 231   bool user_agent_exact = false;
 232
 233   /* whether we ever encountered exact user agent. */
 234   bool found_exact = false;
 235
 236   /* count of allow/disallow lines in the current "record", i.e. after
 237      the last `user-agent' instructions.  */
 238   int record_count = 0;
 239
 240   struct robot_specs *specs = xnew0 (struct robot_specs);
 241
 242   while (1)
 243     {
 244       const char *lineend, *lineend_real;
 245       const char *field_b, *field_e;
 246       const char *value_b, *value_e;
 247
 248       if (p == end)
 249         break;
 250       lineend_real = memchr (p, '\n', end - p);
 251       if (lineend_real)
 252         ++lineend_real;
 253       else
 254         lineend_real = end;
 255       lineend = lineend_real;
 256
 257       /* Before doing anything else, check whether the line is empty
 258          or comment-only. */
 259       SKIP_SPACE (p);
 260       if (EOL (p) || *p == '#')
 261         goto next;
 262
 263       /* Make sure the end-of-line comments are respected by setting
 264          lineend to a location preceding the first comment.  Real line
 265          ending remains in lineend_real.  */
 266       for (lineend = p; lineend < lineend_real; lineend++)
 267         if ((lineend == p || c_isspace (*(lineend - 1)))
 268             && *lineend == '#')
 269           break;
 270
 271       /* Ignore trailing whitespace in the same way. */
 272       while (lineend > p && c_isspace (*(lineend - 1)))
 273         --lineend;
 274
 275       assert (!EOL (p));
 276
 277       field_b = p;
 278       while (!EOL (p) && (c_isalnum (*p) || *p == '-'))
 279         ++p;
 280       field_e = p;
 281
 282       SKIP_SPACE (p);
 283       if (field_b == field_e || EOL (p) || *p != ':')
 284         {
 285           DEBUGP (("Ignoring malformed line %d", line_count));
 286           goto next;
 287         }
 288       ++p;                      /* skip ':' */
 289       SKIP_SPACE (p);
 290
 291       value_b = p;
 292       while (!EOL (p))
 293         ++p;
 294       value_e = p;
 295
 296       /* Finally, we have a syntactically valid line. */
 297       if (FIELD_IS ("user-agent"))
 298         {
 299           /* We have to support several cases:
 300
 301              --previous records--
 302
 303              User-Agent: foo
 304              User-Agent: Wget
 305              User-Agent: bar
 306              ... matching record ...
 307
 308              User-Agent: baz
 309              User-Agent: qux
 310              ... non-matching record ...
 311
 312              User-Agent: *
 313              ... matching record, but will be pruned later ...
 314
 315              We have to respect `User-Agent' at the beginning of each
 316              new record simply because we don't know if we're going to
 317              encounter "Wget" among the agents or not.  Hence,
 318              match_user_agent is called when record_count != 0.
 319
 320              But if record_count is 0, we have to keep calling it
 321              until it matches, and if that happens, we must not call
 322              it any more, until the next record.  Hence the other part
 323              of the condition.  */
 324           if (record_count != 0 || user_agent_applies == false)
 325             match_user_agent (value_b, value_e - value_b,
 326                               &user_agent_applies, &user_agent_exact);
 327           if (user_agent_exact)
 328             found_exact = true;
 329           record_count = 0;
 330         }
 331       else if (FIELD_IS ("allow"))
 332         {
 333           if (user_agent_applies)
 334             {
 335               add_path (specs, value_b, value_e, true, user_agent_exact);
 336             }
 337           ++record_count;
 338         }
 339       else if (FIELD_IS ("disallow"))
 340         {
 341           if (user_agent_applies)
 342             {
 343               bool allowed = false;
 344               if (value_b == value_e)
 345                 /* Empty "disallow" line means everything is *allowed*!  */
 346                 allowed = true;
 347               add_path (specs, value_b, value_e, allowed, user_agent_exact);
 348             }
 349           ++record_count;
 350         }
 351       else
 352         {
 353           DEBUGP (("Ignoring unknown field at line %d", line_count));
 354           goto next;
 355         }
 356
 357     next:
 358       p = lineend_real;
 359       ++line_count;
 360     }
 361
 362   if (found_exact)
 363     {
 364       /* We've encountered an exactly matching user-agent.  Throw out
 365          all the stuff with user-agent: *.  */
 366       prune_non_exact (specs);
 367     }
 368   else if (specs->size > specs->count)
 369     {
 370       /* add_path normally over-allocates specs->paths.  Reallocate it
 371          to the correct size in order to conserve some memory.  */
 372       specs->paths = xrealloc (specs->paths,
 373                                specs->count * sizeof (struct path_info));
 374       specs->size = specs->count;
 375     }
 376
 377   return specs;
 378 }
 379
 380 /* The same like res_parse, but first map the FILENAME into memory,
 381    and then parse it.  */
 382
 383 struct robot_specs *
 384 res_parse_from_file (const char *filename)
 385 {
 386   struct robot_specs *specs;
 387   struct file_memory *fm = read_file (filename);
 388   if (!fm)
 389     {
 390       logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
 391                  filename, strerror (errno));
 392       return NULL;
 393     }
 394   specs = res_parse (fm->content, fm->length);
 395   read_file_free (fm);
 396   return specs;
 397 }
 398
 399 static void
 400 free_specs (struct robot_specs *specs)
 401 {
 402   int i;
 403   for (i = 0; i < specs->count; i++)
 404     xfree (specs->paths[i].path);
 405   xfree_null (specs->paths);
 406   xfree (specs);
 407 }
 408 \f
 409 /* Matching of a path according to the specs. */
 410
 411 /* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
 412    that number is not a numerical representation of '/', decode C and
 413    advance the pointer.  */
 414
 415 #define DECODE_MAYBE(c, ptr) do {                               \
 416   if (c == '%' && c_isxdigit (ptr[1]) && c_isxdigit (ptr[2]))       \
 417     {                                                           \
 418       char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]);          \
 419       if (decoded != '/')                                       \
 420         {                                                       \
 421           c = decoded;                                          \
 422           ptr += 2;                                             \
 423         }                                                       \
 424     }                                                           \
 425 } while (0)
 426
 427 /* The inner matching engine: return true if RECORD_PATH matches
 428    URL_PATH.  The rules for matching are described at
 429    <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2.  */
 430
 431 static bool
 432 matches (const char *record_path, const char *url_path)
 433 {
 434   const char *rp = record_path;
 435   const char *up = url_path;
 436
 437   for (; ; ++rp, ++up)
 438     {
 439       char rc = *rp;
 440       char uc = *up;
 441       if (!rc)
 442         return true;
 443       if (!uc)
 444         return false;
 445       DECODE_MAYBE(rc, rp);
 446       DECODE_MAYBE(uc, up);
 447       if (rc != uc)
 448         return false;
 449     }
 450 }
 451
 452 /* Iterate through all paths in SPECS.  For the first one that
 453    matches, return its allow/reject status.  If none matches,
 454    retrieval is by default allowed.  */
 455
 456 bool
 457 res_match_path (const struct robot_specs *specs, const char *path)
 458 {
 459   int i;
 460   if (!specs)
 461     return true;
 462   for (i = 0; i < specs->count; i++)
 463     if (matches (specs->paths[i].path, path))
 464       {
 465         bool allowedp = specs->paths[i].allowedp;
 466         DEBUGP (("%s path %s because of rule %s.\n",
 467                  allowedp ? "Allowing" : "Rejecting",
 468                  path, quote (specs->paths[i].path)));
 469         return allowedp;
 470       }
 471   return true;
 472 }
 473 \f
 474 /* Registering the specs. */
 475
 476 static struct hash_table *registered_specs;
 477
 478 /* Stolen from cookies.c. */
 479 #define SET_HOSTPORT(host, port, result) do {           \
 480   int HP_len = strlen (host);                           \
 481   result = alloca (HP_len + 1 + numdigit (port) + 1);   \
 482   memcpy (result, host, HP_len);                        \
 483   result[HP_len] = ':';                                 \
 484   number_to_string (result + HP_len + 1, port);         \
 485 } while (0)
 486
 487 /* Register RES specs that below to server on HOST:PORT.  They will
 488    later be retrievable using res_get_specs.  */
 489
 490 void
 491 res_register_specs (const char *host, int port, struct robot_specs *specs)
 492 {
 493   struct robot_specs *old;
 494   char *hp, *hp_old;
 495   SET_HOSTPORT (host, port, hp);
 496
 497   if (!registered_specs)
 498     registered_specs = make_nocase_string_hash_table (0);
 499
 500   if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
 501     {
 502       if (old)
 503         free_specs (old);
 504       hash_table_put (registered_specs, hp_old, specs);
 505     }
 506   else
 507     {
 508       hash_table_put (registered_specs, xstrdup (hp), specs);
 509     }
 510 }
 511
 512 /* Get the specs that belong to HOST:PORT. */
 513
 514 struct robot_specs *
 515 res_get_specs (const char *host, int port)
 516 {
 517   char *hp;
 518   SET_HOSTPORT (host, port, hp);
 519   if (!registered_specs)
 520     return NULL;
 521   return hash_table_get (registered_specs, hp);
 522 }
 523 \f
 524 /* Loading the robots file.  */
 525
/* Location of the robots file.  res_retrieve_file and
   is_robots_txt_url pass it to uri_merge together with the URL they
   are given.  */
#define RES_SPECS_LOCATION "/robots.txt"
 527
 528 /* Retrieve the robots.txt from the server root of the server that
 529    serves URL.  The file will be named according to the currently
 530    active rules, and the file name will be returned in *file.
 531
 532    Return true if robots were retrieved OK, false otherwise.  */
 533
 534 bool
 535 res_retrieve_file (const char *url, char **file, struct iri *iri)
 536 {
 537   struct iri *i = iri_new ();
 538   uerr_t err;
 539   char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
 540   int saved_ts_val = opt.timestamping;
 541   int saved_sp_val = opt.spider, url_err;
 542   struct url * url_parsed;
 543
 544   /* Copy server URI encoding for a possible IDNA transformation, no need to
 545      encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
 546   set_uri_encoding (i, iri->uri_encoding, false);
 547   i->utf8_encode = false;
 548
 549   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
 550   *file = NULL;
 551   opt.timestamping = false;
 552   opt.spider       = false;
 553
 554   url_parsed = url_parse (robots_url, &url_err, iri, true);
 555   if (!url_parsed)
 556     {
 557       char *error = url_error (robots_url, url_err);
 558       logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
 559       xfree (error);
 560       err = URLERROR;
 561     }
 562   else
 563     {
 564       err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
 565                           false, i, false);
 566       url_free(url_parsed);
 567     }
 568
 569   opt.timestamping = saved_ts_val;
 570   opt.spider       = saved_sp_val;
 571   xfree (robots_url);
 572   iri_free (i);
 573
 574   if (err != RETROK && *file != NULL)
 575     {
 576       /* If the file is not retrieved correctly, but retrieve_url
 577          allocated the file name, deallocate is here so that the
 578          caller doesn't have to worry about it.  */
 579       xfree (*file);
 580       *file = NULL;
 581     }
 582   return err == RETROK;
 583 }
 584 \f
 585 bool
 586 is_robots_txt_url (const char *url)
 587 {
 588   char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
 589   bool ret = are_urls_equal (url, robots_url);
 590
 591   xfree (robots_url);
 592
 593   return ret;
 594 }
 595 \f
 596 void
 597 res_cleanup (void)
 598 {
 599   if (registered_specs)
 600     {
 601       hash_table_iterator iter;
 602       for (hash_table_iterate (registered_specs, &iter);
 603            hash_table_iter_next (&iter);
 604            )
 605         {
 606           xfree (iter.key);
 607           free_specs (iter.value);
 608         }
 609       hash_table_destroy (registered_specs);
 610       registered_specs = NULL;
 611     }
 612 }
 613 \f
 614 #ifdef TESTING
 615
 616 const char *
 617 test_is_robots_txt_url()
 618 {
 619   int i;
 620   struct {
 621     char *url;
 622     bool expected_result;
 623   } test_array[] = {
 624     { "http://www.yoyodyne.com/robots.txt", true },
 625     { "http://www.yoyodyne.com/somepath/", false },
 626     { "http://www.yoyodyne.com/somepath/robots.txt", false },
 627   };
 628
 629   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
 630     {
 631       mu_assert ("test_is_robots_txt_url: wrong result",
 632                  is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);
 633     }
 634
 635   return NULL;
 636 }
 637
 638 #endif /* TESTING */
 639
 640 /*
 641  * vim: et ts=2 sw=2
 642  */
 643