sjero.net Git - wget/blob - src/res.c

   1 /* Support for Robot Exclusion Standard (RES).
   2    Copyright (C) 2001,2006 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software Foundation, Inc.,
  18 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 /* This file implements the Robot Exclusion Standard (RES).
  31
  32    RES is a simple protocol that enables site admins to signalize to
  33    the web crawlers that certain parts of the site should not be
  34    accessed.  All the admin needs to do is create a "robots.txt" file
  35    in the web server root, and use simple commands to allow or
  36    disallow access to certain parts of the site.
  37
  38    The first specification was written by Martijn Koster in 1994, and
  39    is still available at <http://www.robotstxt.org/wc/norobots.html>.
  40    In 1996, Martijn wrote an Internet Draft specifying an improved RES
  41    specification; however, that work was apparently abandoned since
  42    the draft has expired in 1997 and hasn't been replaced since.  The
  43    draft is available at
  44    <http://www.robotstxt.org/wc/norobots-rfc.html>.
  45
  46    This file implements RES as specified by the draft.  Note that this
  47    only handles the "robots.txt" support.  The META tag that controls
  48    whether the links should be followed is handled in `html-url.c'.
  49
  50    Known deviations:
  51
  52    * The end-of-line comment recognition is more in the spirit of the
  53      Bourne Shell (as specified by RES-1994).  That means that
  54      "foo#bar" is taken literally, whereas "foo #bar" is interpreted
  55      as "foo".  The Draft apparently specifies that both should be
  56      interpreted as "foo".
  57
  58    * We don't recognize sole CR as the line ending.
  59
   * We don't implement the expiry mechanism for /robots.txt specs.
     I consider it unnecessary for a relatively short-lived
     application such as Wget.  Besides, it is highly questionable
     whether anyone deploys the recommended expiry scheme for
     robots.txt.
  65
   Entry points are functions res_parse, res_parse_from_file,
   res_match_path, res_register_specs, res_get_specs,
   res_retrieve_file, is_robots_txt_url, and res_cleanup.  */
  69
  70 #ifdef HAVE_CONFIG_H
  71 # include <config.h>
  72 #endif
  73
  74 #include <stdio.h>
  75 #include <stdlib.h>
  76 #include <string.h>
  77 #include <errno.h>
  78 #include <assert.h>
  79
  80 #include "wget.h"
  81 #include "utils.h"
  82 #include "hash.h"
  83 #include "url.h"
  84 #include "retr.h"
  85 #include "res.h"
  86
  87 #ifdef TESTING
  88 #include "test.h"
  89 #endif
  90
/* One Allow/Disallow rule collected from a robots.txt file.  */
struct path_info {
  /* Separately allocated copy of the rule's path text (released with
     xfree in free_specs).  add_path strips one leading slash before
     storing it.  */
  char *path;
  /* Verdict res_match_path returns when this rule matches: true
     allows the retrieval, false rejects it.  */
  bool allowedp;
  /* True when the rule was read under a User-Agent that named "wget"
     rather than "*".  prune_non_exact keeps only such rules.  */
  bool user_agent_exact_p;
};
  96
/* Growable array of the rules parsed from one robots.txt file.  */
struct robot_specs {
  int count;                    /* entries of PATHS currently in use */
  int size;                     /* entries PATHS has room for */
  struct path_info *paths;      /* the rules, in the order they were
                                   added; may be NULL when no rule was
                                   ever added (free_specs guards for
                                   that) */
};
 102 \f
 103 /* Parsing the robot spec. */
 104
 105 /* Check whether AGENT (a string of length LENGTH) equals "wget" or
 106    "*".  If it is either of them, *matches is set to one.  If it is
 107    "wget", *exact_match is set to one.  */
 108
 109 static void
 110 match_user_agent (const char *agent, int length,
 111                   bool *matches, bool *exact_match)
 112 {
 113   if (length == 1 && *agent == '*')
 114     {
 115       *matches = true;
 116       *exact_match = false;
 117     }
 118   else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
 119     {
 120       *matches = true;
 121       *exact_match = true;
 122     }
 123   else
 124     {
 125       *matches = false;
 126       *exact_match = false;
 127     }
 128 }
 129
 130 /* Add a path specification between PATH_B and PATH_E as one of the
 131    paths in SPECS.  */
 132
 133 static void
 134 add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
 135           bool allowedp, bool exactp)
 136 {
 137   struct path_info pp;
 138   if (path_b < path_e && *path_b == '/')
 139     /* Our path representation doesn't use a leading slash, so remove
 140        one from theirs. */
 141     ++path_b;
 142   pp.path     = strdupdelim (path_b, path_e);
 143   pp.allowedp = allowedp;
 144   pp.user_agent_exact_p = exactp;
 145   ++specs->count;
 146   if (specs->count > specs->size)
 147     {
 148       if (specs->size == 0)
 149         specs->size = 1;
 150       else
 151         specs->size <<= 1;
 152       specs->paths = xrealloc (specs->paths,
 153                                specs->size * sizeof (struct path_info));
 154     }
 155   specs->paths[specs->count - 1] = pp;
 156 }
 157
 158 /* Recreate SPECS->paths with only those paths that have
 159    user_agent_exact_p set to true.  */
 160
 161 static void
 162 prune_non_exact (struct robot_specs *specs)
 163 {
 164   struct path_info *newpaths;
 165   int i, j, cnt;
 166   cnt = 0;
 167   for (i = 0; i < specs->count; i++)
 168     if (specs->paths[i].user_agent_exact_p)
 169       ++cnt;
 170   newpaths = xnew_array (struct path_info, cnt);
 171   for (i = 0, j = 0; i < specs->count; i++)
 172     if (specs->paths[i].user_agent_exact_p)
 173       newpaths[j++] = specs->paths[i];
 174   assert (j == cnt);
 175   xfree (specs->paths);
 176   specs->paths = newpaths;
 177   specs->count = cnt;
 178   specs->size  = cnt;
 179 }
 180
 181 #define EOL(p) ((p) >= lineend)
 182
 183 #define SKIP_SPACE(p) do {              \
 184   while (!EOL (p) && ISSPACE (*p))      \
 185     ++p;                                \
 186 } while (0)
 187
 188 #define FIELD_IS(string_literal)        \
 189   BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
 190
 191 /* Parse textual RES specs beginning with SOURCE of length LENGTH.
 192    Return a specs objects ready to be fed to res_match_path.
 193
 194    The parsing itself is trivial, but creating a correct SPECS object
 195    is trickier than it seems, because RES is surprisingly byzantine if
 196    you attempt to implement it correctly.
 197
 198    A "record" is a block of one or more `User-Agent' lines followed by
 199    one or more `Allow' or `Disallow' lines.  Record is accepted by
 200    Wget if one of the `User-Agent' lines was "wget", or if the user
 201    agent line was "*".
 202
 203    After all the lines have been read, we examine whether an exact
 204    ("wget") user-agent field was specified.  If so, we delete all the
 205    lines read under "User-Agent: *" blocks because we have our own
 206    Wget-specific blocks.  This enables the admin to say:
 207
 208        User-Agent: *
 209        Disallow: /
 210
 211        User-Agent: google
 212        User-Agent: wget
 213        Disallow: /cgi-bin
 214
 215    This means that to Wget and to Google, /cgi-bin is disallowed,
 216    whereas for all other crawlers, everything is disallowed.
 217    res_parse is implemented so that the order of records doesn't
 218    matter.  In the case above, the "User-Agent: *" could have come
 219    after the other one.  */
 220
 221 struct robot_specs *
 222 res_parse (const char *source, int length)
 223 {
 224   int line_count = 1;
 225
 226   const char *p   = source;
 227   const char *end = source + length;
 228
 229   /* true if last applicable user-agent field matches Wget. */
 230   bool user_agent_applies = false;
 231
 232   /* true if last applicable user-agent field *exactly* matches
 233      Wget.  */
 234   bool user_agent_exact = false;
 235
 236   /* whether we ever encountered exact user agent. */
 237   bool found_exact = false;
 238
 239   /* count of allow/disallow lines in the current "record", i.e. after
 240      the last `user-agent' instructions.  */
 241   int record_count = 0;
 242
 243   struct robot_specs *specs = xnew0 (struct robot_specs);
 244
 245   while (1)
 246     {
 247       const char *lineend, *lineend_real;
 248       const char *field_b, *field_e;
 249       const char *value_b, *value_e;
 250
 251       if (p == end)
 252         break;
 253       lineend_real = memchr (p, '\n', end - p);
 254       if (lineend_real)
 255         ++lineend_real;
 256       else
 257         lineend_real = end;
 258       lineend = lineend_real;
 259
 260       /* Before doing anything else, check whether the line is empty
 261          or comment-only. */
 262       SKIP_SPACE (p);
 263       if (EOL (p) || *p == '#')
 264         goto next;
 265
 266       /* Make sure the end-of-line comments are respected by setting
 267          lineend to a location preceding the first comment.  Real line
 268          ending remains in lineend_real.  */
 269       for (lineend = p; lineend < lineend_real; lineend++)
 270         if ((lineend == p || ISSPACE (*(lineend - 1)))
 271             && *lineend == '#')
 272           break;
 273
 274       /* Ignore trailing whitespace in the same way. */
 275       while (lineend > p && ISSPACE (*(lineend - 1)))
 276         --lineend;
 277
 278       assert (!EOL (p));
 279
 280       field_b = p;
 281       while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
 282         ++p;
 283       field_e = p;
 284
 285       SKIP_SPACE (p);
 286       if (field_b == field_e || EOL (p) || *p != ':')
 287         {
 288           DEBUGP (("Ignoring malformed line %d", line_count));
 289           goto next;
 290         }
 291       ++p;                      /* skip ':' */
 292       SKIP_SPACE (p);
 293
 294       value_b = p;
 295       while (!EOL (p))
 296         ++p;
 297       value_e = p;
 298
 299       /* Finally, we have a syntactically valid line. */
 300       if (FIELD_IS ("user-agent"))
 301         {
 302           /* We have to support several cases:
 303
 304              --previous records--
 305
 306              User-Agent: foo
 307              User-Agent: Wget
 308              User-Agent: bar
 309              ... matching record ...
 310
 311              User-Agent: baz
 312              User-Agent: qux
 313              ... non-matching record ...
 314
 315              User-Agent: *
 316              ... matching record, but will be pruned later ...
 317
 318              We have to respect `User-Agent' at the beginning of each
 319              new record simply because we don't know if we're going to
 320              encounter "Wget" among the agents or not.  Hence,
 321              match_user_agent is called when record_count != 0.
 322
 323              But if record_count is 0, we have to keep calling it
 324              until it matches, and if that happens, we must not call
 325              it any more, until the next record.  Hence the other part
 326              of the condition.  */
 327           if (record_count != 0 || user_agent_applies == false)
 328             match_user_agent (value_b, value_e - value_b,
 329                               &user_agent_applies, &user_agent_exact);
 330           if (user_agent_exact)
 331             found_exact = true;
 332           record_count = 0;
 333         }
 334       else if (FIELD_IS ("allow"))
 335         {
 336           if (user_agent_applies)
 337             {
 338               add_path (specs, value_b, value_e, true, user_agent_exact);
 339             }
 340           ++record_count;
 341         }
 342       else if (FIELD_IS ("disallow"))
 343         {
 344           if (user_agent_applies)
 345             {
 346               bool allowed = false;
 347               if (value_b == value_e)
 348                 /* Empty "disallow" line means everything is *allowed*!  */
 349                 allowed = true;
 350               add_path (specs, value_b, value_e, allowed, user_agent_exact);
 351             }
 352           ++record_count;
 353         }
 354       else
 355         {
 356           DEBUGP (("Ignoring unknown field at line %d", line_count));
 357           goto next;
 358         }
 359
 360     next:
 361       p = lineend_real;
 362       ++line_count;
 363     }
 364
 365   if (found_exact)
 366     {
 367       /* We've encountered an exactly matching user-agent.  Throw out
 368          all the stuff with user-agent: *.  */
 369       prune_non_exact (specs);
 370     }
 371   else if (specs->size > specs->count)
 372     {
 373       /* add_path normally over-allocates specs->paths.  Reallocate it
 374          to the correct size in order to conserve some memory.  */
 375       specs->paths = xrealloc (specs->paths,
 376                                specs->count * sizeof (struct path_info));
 377       specs->size = specs->count;
 378     }
 379
 380   return specs;
 381 }
 382
 383 /* The same like res_parse, but first map the FILENAME into memory,
 384    and then parse it.  */
 385
 386 struct robot_specs *
 387 res_parse_from_file (const char *filename)
 388 {
 389   struct robot_specs *specs;
 390   struct file_memory *fm = read_file (filename);
 391   if (!fm)
 392     {
 393       logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
 394                  filename, strerror (errno));
 395       return NULL;
 396     }
 397   specs = res_parse (fm->content, fm->length);
 398   read_file_free (fm);
 399   return specs;
 400 }
 401
 402 static void
 403 free_specs (struct robot_specs *specs)
 404 {
 405   int i;
 406   for (i = 0; i < specs->count; i++)
 407     xfree (specs->paths[i].path);
 408   xfree_null (specs->paths);
 409   xfree (specs);
 410 }
 411 \f
 412 /* Matching of a path according to the specs. */
 413
 414 /* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
 415    that number is not a numerical representation of '/', decode C and
 416    advance the pointer.  */
 417
 418 #define DECODE_MAYBE(c, ptr) do {                               \
 419   if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2]))       \
 420     {                                                           \
 421       char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]);          \
 422       if (decoded != '/')                                       \
 423         {                                                       \
 424           c = decoded;                                          \
 425           ptr += 2;                                             \
 426         }                                                       \
 427     }                                                           \
 428 } while (0)
 429
 430 /* The inner matching engine: return true if RECORD_PATH matches
 431    URL_PATH.  The rules for matching are described at
 432    <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2.  */
 433
 434 static bool
 435 matches (const char *record_path, const char *url_path)
 436 {
 437   const char *rp = record_path;
 438   const char *up = url_path;
 439
 440   for (; ; ++rp, ++up)
 441     {
 442       char rc = *rp;
 443       char uc = *up;
 444       if (!rc)
 445         return true;
 446       if (!uc)
 447         return false;
 448       DECODE_MAYBE(rc, rp);
 449       DECODE_MAYBE(uc, up);
 450       if (rc != uc)
 451         return false;
 452     }
 453 }
 454
 455 /* Iterate through all paths in SPECS.  For the first one that
 456    matches, return its allow/reject status.  If none matches,
 457    retrieval is by default allowed.  */
 458
 459 bool
 460 res_match_path (const struct robot_specs *specs, const char *path)
 461 {
 462   int i;
 463   if (!specs)
 464     return true;
 465   for (i = 0; i < specs->count; i++)
 466     if (matches (specs->paths[i].path, path))
 467       {
 468         bool allowedp = specs->paths[i].allowedp;
 469         DEBUGP (("%s path %s because of rule `%s'.\n",
 470                  allowedp ? "Allowing" : "Rejecting",
 471                  path, specs->paths[i].path));
 472         return allowedp;
 473       }
 474   return true;
 475 }
 476 \f
 477 /* Registering the specs. */
 478
/* Table from "host:port" strings to the struct robot_specs registered
   for that server.  It is created on first use by res_register_specs
   and destroyed by res_cleanup, and is NULL while no table exists.
   Key comparison is presumably case-insensitive, since the table is
   made with make_nocase_string_hash_table -- confirm in hash.c.  */
static struct hash_table *registered_specs;
 480
/* Stolen from cookies.c. */
/* Build the string "HOST:PORT" in storage obtained from alloca and
   store its address in RESULT.  Because the buffer comes from alloca
   it is only valid inside the function that expands this macro, and
   must not be returned or stored.  The buffer has room for the host,
   the ':' separator, numdigit (PORT) characters and one more byte;
   number_to_string is presumably what writes the digits and the
   terminating NUL -- confirm against its definition.  HOST is
   evaluated twice.  */
#define SET_HOSTPORT(host, port, result) do {           \
  int HP_len = strlen (host);                           \
  result = alloca (HP_len + 1 + numdigit (port) + 1);   \
  memcpy (result, host, HP_len);                        \
  result[HP_len] = ':';                                 \
  number_to_string (result + HP_len + 1, port);         \
} while (0)
 489
 490 /* Register RES specs that below to server on HOST:PORT.  They will
 491    later be retrievable using res_get_specs.  */
 492
 493 void
 494 res_register_specs (const char *host, int port, struct robot_specs *specs)
 495 {
 496   struct robot_specs *old;
 497   char *hp, *hp_old;
 498   SET_HOSTPORT (host, port, hp);
 499
 500   if (!registered_specs)
 501     registered_specs = make_nocase_string_hash_table (0);
 502
 503   if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
 504     {
 505       if (old)
 506         free_specs (old);
 507       hash_table_put (registered_specs, hp_old, specs);
 508     }
 509   else
 510     {
 511       hash_table_put (registered_specs, xstrdup (hp), specs);
 512     }
 513 }
 514
 515 /* Get the specs that belong to HOST:PORT. */
 516
 517 struct robot_specs *
 518 res_get_specs (const char *host, int port)
 519 {
 520   char *hp;
 521   SET_HOSTPORT (host, port, hp);
 522   if (!registered_specs)
 523     return NULL;
 524   return hash_table_get (registered_specs, hp);
 525 }
 526 \f
 527 /* Loading the robots file.  */
 528
/* Absolute path at which a server's robots.txt is expected.  It is
   combined with a URL through uri_merge to form the robots.txt URL.  */
#define RES_SPECS_LOCATION "/robots.txt"
 530
 531 /* Retrieve the robots.txt from the server root of the server that
 532    serves URL.  The file will be named according to the currently
 533    active rules, and the file name will be returned in *file.
 534
 535    Return true if robots were retrieved OK, false otherwise.  */
 536
 537 bool
 538 res_retrieve_file (const char *url, char **file)
 539 {
 540   uerr_t err;
 541   char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
 542   int saved_ts_val = opt.timestamping;
 543   int saved_sp_val = opt.spider;
 544
 545   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
 546   *file = NULL;
 547   opt.timestamping = false;
 548   opt.spider       = false;
 549   err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
 550   opt.timestamping = saved_ts_val;
 551   opt.spider       = saved_sp_val;
 552   xfree (robots_url);
 553
 554   if (err != RETROK && *file != NULL)
 555     {
 556       /* If the file is not retrieved correctly, but retrieve_url
 557          allocated the file name, deallocate is here so that the
 558          caller doesn't have to worry about it.  */
 559       xfree (*file);
 560       *file = NULL;
 561     }
 562   return err == RETROK;
 563 }
 564 \f
 565 bool
 566 is_robots_txt_url (const char *url)
 567 {
 568   char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
 569   bool ret = are_urls_equal (url, robots_url);
 570
 571   xfree (robots_url);
 572
 573   return ret;
 574 }
 575 \f
 576 void
 577 res_cleanup (void)
 578 {
 579   if (registered_specs)
 580     {
 581       hash_table_iterator iter;
 582       for (hash_table_iterate (registered_specs, &iter);
 583            hash_table_iter_next (&iter);
 584            )
 585         {
 586           xfree (iter.key);
 587           free_specs (iter.value);
 588         }
 589       hash_table_destroy (registered_specs);
 590       registered_specs = NULL;
 591     }
 592 }
 593 \f
 594 #ifdef TESTING
 595
 596 const char *
 597 test_is_robots_txt_url()
 598 {
 599   int i;
 600   struct {
 601     char *url;
 602     bool expected_result;
 603   } test_array[] = {
 604     { "http://www.yoyodyne.com/robots.txt", true },
 605     { "http://www.yoyodyne.com/somepath/", false },
 606     { "http://www.yoyodyne.com/somepath/robots.txt", false },
 607   };
 608
 609   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
 610     {
 611       mu_assert ("test_is_robots_txt_url: wrong result",
 612                  is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);
 613     }
 614
 615   return NULL;
 616 }
 617
 618 #endif /* TESTING */
 619
 620 /*
 621  * vim: et ts=2 sw=2
 622  */
 623