sjero.net Git - wget/blob - src/res.c

   1 /* Support for Robot Exclusion Standard (RES).
   2    Copyright (C) 2001, 2006, 2007 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3 of the License, or (at
   9 your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  18
  19 In addition, as a special exception, the Free Software Foundation
  20 gives permission to link the code of its release of Wget with the
  21 OpenSSL project's "OpenSSL" library (or with modified versions of it
  22 that use the same license as the "OpenSSL" library), and distribute
  23 the linked executables.  You must obey the GNU General Public License
  24 in all respects for all of the code used other than "OpenSSL".  If you
  25 modify this file, you may extend this exception to your version of the
  26 file, but you are not obligated to do so.  If you do not wish to do
  27 so, delete this exception statement from your version.  */
  28
  29 /* This file implements the Robot Exclusion Standard (RES).
  30
  31    RES is a simple protocol that enables site admins to signal to
  32    the web crawlers that certain parts of the site should not be
  33    accessed.  All the admin needs to do is create a "robots.txt" file
  34    in the web server root, and use simple commands to allow or
  35    disallow access to certain parts of the site.
  36
  37    The first specification was written by Martijn Koster in 1994, and
  38    is still available at <http://www.robotstxt.org/wc/norobots.html>.
  39    In 1996, Martijn wrote an Internet Draft specifying an improved RES
  40    specification; however, that work was apparently abandoned since
  41    the draft has expired in 1997 and hasn't been replaced since.  The
  42    draft is available at
  43    <http://www.robotstxt.org/wc/norobots-rfc.html>.
  44
  45    This file implements RES as specified by the draft.  Note that this
  46    only handles the "robots.txt" support.  The META tag that controls
  47    whether the links should be followed is handled in `html-url.c'.
  48
  49    Known deviations:
  50
  51    * The end-of-line comment recognition is more in the spirit of the
  52      Bourne Shell (as specified by RES-1994).  That means that
  53      "foo#bar" is taken literally, whereas "foo #bar" is interpreted
  54      as "foo".  The Draft apparently specifies that both should be
  55      interpreted as "foo".
  56
  57    * We don't recognize sole CR as the line ending.
  58
  59    * We don't implement expiry mechanism for /robots.txt specs.  I
  60      consider it unnecessary for a relatively short-lived
  61      application such as Wget.  Besides, it is highly questionable
  62      whether anyone deploys the recommended expiry scheme for
  63      robots.txt.
  64
  65    Entry points are functions res_parse, res_parse_from_file,
  66    res_match_path, res_register_specs, res_get_specs, and
  67    res_retrieve_file.  */
  68
  69 #include "wget.h"
  70
  71 #include <stdio.h>
  72 #include <stdlib.h>
  73 #include <string.h>
  74 #include <errno.h>
  75 #include <assert.h>
  76
  77 #include "utils.h"
  78 #include "hash.h"
  79 #include "url.h"
  80 #include "retr.h"
  81 #include "res.h"
  82
  83 #ifdef TESTING
  84 #include "test.h"
  85 #endif
  86
  87 struct path_info {           /* One Allow/Disallow rule collected by res_parse.  */
  88   char *path;                /* Heap copy of the rule path; add_path strips one leading '/'.  */
  89   bool allowedp;             /* True if a URL path matching this rule may be retrieved.  */
  90   bool user_agent_exact_p;   /* True if the rule came from a record naming "wget" rather than "*".  */
  91 };
  92
  93 struct robot_specs {         /* The list of rules parsed from one robots.txt.  */
  94   int count;                 /* Number of entries of PATHS currently in use.  */
  95   int size;                  /* Number of entries PATHS has room for; grown by add_path.  */
  96   struct path_info *paths;   /* Heap array of rules, in the order they were added.  */
  97 };
  98 \f
  99 /* Parsing the robot spec. */
 100
 101 /* Check whether AGENT (a string of length LENGTH) equals "wget" or
 102    "*".  If it is either of them, *matches is set to one.  If it is
 103    "wget", *exact_match is set to one.  */
 104
 105 static void
 106 match_user_agent (const char *agent, int length,
 107                   bool *matches, bool *exact_match)
 108 {
 109   if (length == 1 && *agent == '*')
 110     {
 111       *matches = true;
 112       *exact_match = false;
 113     }
 114   else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
 115     {
 116       *matches = true;
 117       *exact_match = true;
 118     }
 119   else
 120     {
 121       *matches = false;
 122       *exact_match = false;
 123     }
 124 }
 125
 126 /* Add a path specification between PATH_B and PATH_E as one of the
 127    paths in SPECS.  */
 128
 129 static void
 130 add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
 131           bool allowedp, bool exactp)
 132 {
 133   struct path_info pp;
 134   if (path_b < path_e && *path_b == '/')
 135     /* Our path representation doesn't use a leading slash, so remove
 136        one from theirs. */
 137     ++path_b;
 138   pp.path     = strdupdelim (path_b, path_e);
 139   pp.allowedp = allowedp;
 140   pp.user_agent_exact_p = exactp;
 141   ++specs->count;
 142   if (specs->count > specs->size)
 143     {
 144       if (specs->size == 0)
 145         specs->size = 1;
 146       else
 147         specs->size <<= 1;
 148       specs->paths = xrealloc (specs->paths,
 149                                specs->size * sizeof (struct path_info));
 150     }
 151   specs->paths[specs->count - 1] = pp;
 152 }
 153
 154 /* Recreate SPECS->paths with only those paths that have
 155    user_agent_exact_p set to true.  */
 156
 157 static void
 158 prune_non_exact (struct robot_specs *specs)
 159 {
 160   struct path_info *newpaths;
 161   int i, j, cnt;
 162   cnt = 0;
 163   for (i = 0; i < specs->count; i++)
 164     if (specs->paths[i].user_agent_exact_p)
 165       ++cnt;
 166   newpaths = xnew_array (struct path_info, cnt);
 167   for (i = 0, j = 0; i < specs->count; i++)
 168     if (specs->paths[i].user_agent_exact_p)
 169       newpaths[j++] = specs->paths[i];
 170   assert (j == cnt);
 171   xfree (specs->paths);
 172   specs->paths = newpaths;
 173   specs->count = cnt;
 174   specs->size  = cnt;
 175 }
 176
 177 #define EOL(p) ((p) >= lineend)
 178
 179 #define SKIP_SPACE(p) do {              \
 180   while (!EOL (p) && c_isspace (*p))      \
 181     ++p;                                \
 182 } while (0)
 183
 184 #define FIELD_IS(string_literal)        \
 185   BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
 186
 187 /* Parse textual RES specs beginning with SOURCE of length LENGTH.
 188    Return a specs objects ready to be fed to res_match_path.
 189
 190    The parsing itself is trivial, but creating a correct SPECS object
 191    is trickier than it seems, because RES is surprisingly byzantine if
 192    you attempt to implement it correctly.
 193
 194    A "record" is a block of one or more `User-Agent' lines followed by
 195    one or more `Allow' or `Disallow' lines.  Record is accepted by
 196    Wget if one of the `User-Agent' lines was "wget", or if the user
 197    agent line was "*".
 198
 199    After all the lines have been read, we examine whether an exact
 200    ("wget") user-agent field was specified.  If so, we delete all the
 201    lines read under "User-Agent: *" blocks because we have our own
 202    Wget-specific blocks.  This enables the admin to say:
 203
 204        User-Agent: *
 205        Disallow: /
 206
 207        User-Agent: google
 208        User-Agent: wget
 209        Disallow: /cgi-bin
 210
 211    This means that to Wget and to Google, /cgi-bin is disallowed,
 212    whereas for all other crawlers, everything is disallowed.
 213    res_parse is implemented so that the order of records doesn't
 214    matter.  In the case above, the "User-Agent: *" could have come
 215    after the other one.  */
 216
 217 struct robot_specs *
 218 res_parse (const char *source, int length)
 219 {
 220   int line_count = 1;
 221
 222   const char *p   = source;
 223   const char *end = source + length;
 224
 225   /* true if last applicable user-agent field matches Wget. */
 226   bool user_agent_applies = false;
 227
 228   /* true if last applicable user-agent field *exactly* matches
 229      Wget.  */
 230   bool user_agent_exact = false;
 231
 232   /* whether we ever encountered exact user agent. */
 233   bool found_exact = false;
 234
 235   /* count of allow/disallow lines in the current "record", i.e. after
 236      the last `user-agent' instructions.  */
 237   int record_count = 0;
 238
 239   struct robot_specs *specs = xnew0 (struct robot_specs);
 240
 241   while (1)
 242     {
 243       const char *lineend, *lineend_real;
 244       const char *field_b, *field_e;
 245       const char *value_b, *value_e;
 246
 247       if (p == end)
 248         break;
 249       lineend_real = memchr (p, '\n', end - p);
 250       if (lineend_real)
 251         ++lineend_real;
 252       else
 253         lineend_real = end;
 254       lineend = lineend_real;
 255
 256       /* Before doing anything else, check whether the line is empty
 257          or comment-only. */
 258       SKIP_SPACE (p);
 259       if (EOL (p) || *p == '#')
 260         goto next;
 261
 262       /* Make sure the end-of-line comments are respected by setting
 263          lineend to a location preceding the first comment.  Real line
 264          ending remains in lineend_real.  */
 265       for (lineend = p; lineend < lineend_real; lineend++)
 266         if ((lineend == p || c_isspace (*(lineend - 1)))
 267             && *lineend == '#')
 268           break;
 269
 270       /* Ignore trailing whitespace in the same way. */
 271       while (lineend > p && c_isspace (*(lineend - 1)))
 272         --lineend;
 273
 274       assert (!EOL (p));
 275
 276       field_b = p;
 277       while (!EOL (p) && (c_isalnum (*p) || *p == '-'))
 278         ++p;
 279       field_e = p;
 280
 281       SKIP_SPACE (p);
 282       if (field_b == field_e || EOL (p) || *p != ':')
 283         {
 284           DEBUGP (("Ignoring malformed line %d", line_count));
 285           goto next;
 286         }
 287       ++p;                      /* skip ':' */
 288       SKIP_SPACE (p);
 289
 290       value_b = p;
 291       while (!EOL (p))
 292         ++p;
 293       value_e = p;
 294
 295       /* Finally, we have a syntactically valid line. */
 296       if (FIELD_IS ("user-agent"))
 297         {
 298           /* We have to support several cases:
 299
 300              --previous records--
 301
 302              User-Agent: foo
 303              User-Agent: Wget
 304              User-Agent: bar
 305              ... matching record ...
 306
 307              User-Agent: baz
 308              User-Agent: qux
 309              ... non-matching record ...
 310
 311              User-Agent: *
 312              ... matching record, but will be pruned later ...
 313
 314              We have to respect `User-Agent' at the beginning of each
 315              new record simply because we don't know if we're going to
 316              encounter "Wget" among the agents or not.  Hence,
 317              match_user_agent is called when record_count != 0.
 318
 319              But if record_count is 0, we have to keep calling it
 320              until it matches, and if that happens, we must not call
 321              it any more, until the next record.  Hence the other part
 322              of the condition.  */
 323           if (record_count != 0 || user_agent_applies == false)
 324             match_user_agent (value_b, value_e - value_b,
 325                               &user_agent_applies, &user_agent_exact);
 326           if (user_agent_exact)
 327             found_exact = true;
 328           record_count = 0;
 329         }
 330       else if (FIELD_IS ("allow"))
 331         {
 332           if (user_agent_applies)
 333             {
 334               add_path (specs, value_b, value_e, true, user_agent_exact);
 335             }
 336           ++record_count;
 337         }
 338       else if (FIELD_IS ("disallow"))
 339         {
 340           if (user_agent_applies)
 341             {
 342               bool allowed = false;
 343               if (value_b == value_e)
 344                 /* Empty "disallow" line means everything is *allowed*!  */
 345                 allowed = true;
 346               add_path (specs, value_b, value_e, allowed, user_agent_exact);
 347             }
 348           ++record_count;
 349         }
 350       else
 351         {
 352           DEBUGP (("Ignoring unknown field at line %d", line_count));
 353           goto next;
 354         }
 355
 356     next:
 357       p = lineend_real;
 358       ++line_count;
 359     }
 360
 361   if (found_exact)
 362     {
 363       /* We've encountered an exactly matching user-agent.  Throw out
 364          all the stuff with user-agent: *.  */
 365       prune_non_exact (specs);
 366     }
 367   else if (specs->size > specs->count)
 368     {
 369       /* add_path normally over-allocates specs->paths.  Reallocate it
 370          to the correct size in order to conserve some memory.  */
 371       specs->paths = xrealloc (specs->paths,
 372                                specs->count * sizeof (struct path_info));
 373       specs->size = specs->count;
 374     }
 375
 376   return specs;
 377 }
 378
 379 /* The same like res_parse, but first map the FILENAME into memory,
 380    and then parse it.  */
 381
 382 struct robot_specs *
 383 res_parse_from_file (const char *filename)
 384 {
 385   struct robot_specs *specs;
 386   struct file_memory *fm = read_file (filename);
 387   if (!fm)
 388     {
 389       logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
 390                  filename, strerror (errno));
 391       return NULL;
 392     }
 393   specs = res_parse (fm->content, fm->length);
 394   read_file_free (fm);
 395   return specs;
 396 }
 397
 398 static void
 399 free_specs (struct robot_specs *specs)
 400 {
 401   int i;
 402   for (i = 0; i < specs->count; i++)
 403     xfree (specs->paths[i].path);
 404   xfree_null (specs->paths);
 405   xfree (specs);
 406 }
 407 \f
 408 /* Matching of a path according to the specs. */
 409
 410 /* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
 411    that number is not a numerical representation of '/', decode C and
 412    advance the pointer.  */
 413
 414 #define DECODE_MAYBE(c, ptr) do {                               \
 415   if (c == '%' && c_isxdigit (ptr[1]) && c_isxdigit (ptr[2]))       \
 416     {                                                           \
 417       char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]);          \
 418       if (decoded != '/')                                       \
 419         {                                                       \
 420           c = decoded;                                          \
 421           ptr += 2;                                             \
 422         }                                                       \
 423     }                                                           \
 424 } while (0)
 425
 426 /* The inner matching engine: return true if RECORD_PATH matches
 427    URL_PATH.  The rules for matching are described at
 428    <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2.  */
 429
 430 static bool
 431 matches (const char *record_path, const char *url_path)
 432 {
 433   const char *rp = record_path;
 434   const char *up = url_path;
 435
 436   for (; ; ++rp, ++up)
 437     {
 438       char rc = *rp;
 439       char uc = *up;
 440       if (!rc)
 441         return true;
 442       if (!uc)
 443         return false;
 444       DECODE_MAYBE(rc, rp);
 445       DECODE_MAYBE(uc, up);
 446       if (rc != uc)
 447         return false;
 448     }
 449 }
 450
 451 /* Iterate through all paths in SPECS.  For the first one that
 452    matches, return its allow/reject status.  If none matches,
 453    retrieval is by default allowed.  */
 454
 455 bool
 456 res_match_path (const struct robot_specs *specs, const char *path)
 457 {
 458   int i;
 459   if (!specs)
 460     return true;
 461   for (i = 0; i < specs->count; i++)
 462     if (matches (specs->paths[i].path, path))
 463       {
 464         bool allowedp = specs->paths[i].allowedp;
 465         DEBUGP (("%s path %s because of rule `%s'.\n",
 466                  allowedp ? "Allowing" : "Rejecting",
 467                  path, specs->paths[i].path));
 468         return allowedp;
 469       }
 470   return true;
 471 }
 472 \f
 473 /* Registering the specs. */
 474
 475 static struct hash_table *registered_specs; /* "host:port" key -> struct robot_specs *; created on first res_register_specs call.  */
 476
 477 /* Stolen from cookies.c. */
 478 #define SET_HOSTPORT(host, port, result) do {           \
 479   int HP_len = strlen (host);                           \
 480   result = alloca (HP_len + 1 + numdigit (port) + 1);   \
 481   memcpy (result, host, HP_len);                        \
 482   result[HP_len] = ':';                                 \
 483   number_to_string (result + HP_len + 1, port);         \
 484 } while (0)
 485
 486 /* Register RES specs that below to server on HOST:PORT.  They will
 487    later be retrievable using res_get_specs.  */
 488
 489 void
 490 res_register_specs (const char *host, int port, struct robot_specs *specs)
 491 {
 492   struct robot_specs *old;
 493   char *hp, *hp_old;
 494   SET_HOSTPORT (host, port, hp);
 495
 496   if (!registered_specs)
 497     registered_specs = make_nocase_string_hash_table (0);
 498
 499   if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
 500     {
 501       if (old)
 502         free_specs (old);
 503       hash_table_put (registered_specs, hp_old, specs);
 504     }
 505   else
 506     {
 507       hash_table_put (registered_specs, xstrdup (hp), specs);
 508     }
 509 }
 510
 511 /* Get the specs that belong to HOST:PORT. */
 512
 513 struct robot_specs *
 514 res_get_specs (const char *host, int port)
 515 {
 516   char *hp;
 517   SET_HOSTPORT (host, port, hp);
 518   if (!registered_specs)
 519     return NULL;
 520   return hash_table_get (registered_specs, hp);
 521 }
 522 \f
 523 /* Loading the robots file.  */
 524
 525 #define RES_SPECS_LOCATION "/robots.txt"   /* Merged with a URL via uri_merge to name that server's robots file.  */
 526
 527 /* Retrieve the robots.txt from the server root of the server that
 528    serves URL.  The file will be named according to the currently
 529    active rules, and the file name will be returned in *file.
 530
 531    Return true if robots were retrieved OK, false otherwise.  */
 532
 533 bool
 534 res_retrieve_file (const char *url, char **file)
 535 {
 536   uerr_t err;
 537   char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
 538   int saved_ts_val = opt.timestamping;
 539   int saved_sp_val = opt.spider;
 540
 541   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
 542   *file = NULL;
 543   opt.timestamping = false;
 544   opt.spider       = false;
 545   err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
 546   opt.timestamping = saved_ts_val;
 547   opt.spider       = saved_sp_val;
 548   xfree (robots_url);
 549
 550   if (err != RETROK && *file != NULL)
 551     {
 552       /* If the file is not retrieved correctly, but retrieve_url
 553          allocated the file name, deallocate is here so that the
 554          caller doesn't have to worry about it.  */
 555       xfree (*file);
 556       *file = NULL;
 557     }
 558   return err == RETROK;
 559 }
 560 \f
 561 bool
 562 is_robots_txt_url (const char *url)
 563 {
 564   char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
 565   bool ret = are_urls_equal (url, robots_url);
 566
 567   xfree (robots_url);
 568
 569   return ret;
 570 }
 571 \f
 572 void
 573 res_cleanup (void)
 574 {
 575   if (registered_specs)
 576     {
 577       hash_table_iterator iter;
 578       for (hash_table_iterate (registered_specs, &iter);
 579            hash_table_iter_next (&iter);
 580            )
 581         {
 582           xfree (iter.key);
 583           free_specs (iter.value);
 584         }
 585       hash_table_destroy (registered_specs);
 586       registered_specs = NULL;
 587     }
 588 }
 589 \f
 590 #ifdef TESTING
 591
 592 const char *
 593 test_is_robots_txt_url()
 594 {
 595   int i;
 596   struct {
 597     char *url;
 598     bool expected_result;
 599   } test_array[] = {
 600     { "http://www.yoyodyne.com/robots.txt", true },
 601     { "http://www.yoyodyne.com/somepath/", false },
 602     { "http://www.yoyodyne.com/somepath/robots.txt", false },
 603   };
 604
 605   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
 606     {
 607       mu_assert ("test_is_robots_txt_url: wrong result",
 608                  is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);
 609     }
 610
 611   return NULL;
 612 }
 613
 614 #endif /* TESTING */
 615
 616 /*
 617  * vim: et ts=2 sw=2
 618  */
 619