/* Handling of recursive HTTP retrieving.
   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget.  If not, see <http://www.gnu.org/licenses/>.

Additional permission under GNU GPL version 3 section 7

If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work.  */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;              /* the URL to download */
  const char *referer;          /* the referring document */
  int depth;                    /* the depth */
  bool html_allowed;            /* whether the document is allowed to
                                   be treated as HTML. */

  struct queue_element *next;   /* next element in queue */
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};
/* Create a URL queue. */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xnew0 (struct url_queue);
  return queue;
}

/* Delete a URL queue. */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}
/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth, bool html_allowed)
{
  struct queue_element *qel = xnew (struct queue_element);
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->html_allowed = html_allowed;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}
/* Take a URL out of the queue.  Return true if this operation
   succeeded, or false if the queue is empty.  */

static bool
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth,
             bool *html_allowed)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return false;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;
  *html_allowed = qel->html_allowed;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return true;
}
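
/* Illustrative sketch, not part of Wget: the queue above is a plain
   singly-linked FIFO.  Ownership of the strings stays with the caller
   -- retrieve_tree xstrdup()s them when enqueuing and frees them after
   dequeuing; url_dequeue frees only the list node.  A minimal round
   trip looks roughly like this:

     struct url_queue *q = url_queue_new ();
     char *u, *ref;
     int d;
     bool html;

     url_enqueue (q, xstrdup ("http://example.com/"), NULL, 0, true);
     while (url_dequeue (q, (const char **)&u, (const char **)&ref,
                         &d, &html))
       {
         (process U at depth D here)
         xfree (u);
         xfree_null (ref);
       }
     url_queue_delete (q);
*/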
static bool download_child_p (const struct urlpos *, struct url *, int,
                              struct url *, struct hash_table *);
static bool descend_redirect_p (const char *, const char *, int,
                                struct url *, struct hash_table *);
/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree on the
   other hand implements breadth-first traversal of the tree, which
   results in much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue. */
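
/* Condensed sketch of the loop below (illustrative only; the "..."
   stand for arguments elided here, and the real code additionally
   handles redirects, the -p depth exception, robots exclusion and
   --delete-after cleanup):

     url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
     while (url_dequeue (queue, &url, &referer, &depth, &html_allowed))
       {
         status = retrieve_url (url, &file, ...);                step 4
         if (downloaded HTML and depth within --level)           step 5
           for (child = get_urls_html (file, ...); child; child = child->next)
             if (download_child_p (child, ...))                  step 7
               url_enqueue (queue, xstrdup (child->url->url),
                            xstrdup (url), depth + 1, ...);
       }
*/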
uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue;

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist;

  int up_error_code;
  struct url *start_url_parsed = url_parse (start_url, &up_error_code);

  if (!start_url_parsed)
    {
      logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
                 url_error (up_error_code));
      return URLERROR;
    }

  queue = url_queue_new ();
  blacklist = make_string_hash_table (0);

  /* Enqueue the starting URL.  Use start_url_parsed->url rather than
     just URL so we enqueue the canonical form of the URL.  */
  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
  string_set_add (blacklist, start_url_parsed->url);
  while (1)
    {
      bool descend = false;
      char *url, *referer, *file = NULL;
      int depth;
      bool html_allowed;
      bool dash_p_leaf_HTML = false;

      if (opt.quota && total_downloaded_bytes > opt.quota)
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue... */

      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth, &html_allowed))
        break;
      /* ...and download it.  Note that this download is in most cases
         unconditional, as download_child_p already makes sure a file
         doesn't get enqueued twice -- and yet this check is here, and
         not in download_child_p.  This is so that if you run `wget -r
         URL1 URL2', and a random URL is encountered once under URL1
         and again under URL2, but at a different (possibly smaller)
         depth, we want the URL's children to be taken into account
         once.  */
      if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
        {
          file = xstrdup (hash_table_get (dl_url_file_map, url));

          DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
                   url, file));

          if (html_allowed
              && downloaded_html_set
              && string_set_contains (downloaded_html_set, file))
            descend = true;
        }
      else
        {
          int dt = 0;
          char *redirected = NULL;

          status = retrieve_url (url, &file, &redirected, referer, &dt, false);

          if (html_allowed && file && status == RETROK
              && (dt & RETROKF) && (dt & TEXTHTML))
            descend = true;
          if (redirected)
            {
              /* We have been redirected, possibly to another host, or
                 different path, or wherever.  Check whether we really
                 want to follow it.  */
              if (descend)
                {
                  if (!descend_redirect_p (redirected, url, depth,
                                           start_url_parsed, blacklist))
                    descend = false;
                  else
                    /* Make sure that the old pre-redirect form gets
                       blacklisted. */
                    string_set_add (blacklist, url);
                }

              xfree (url);
              url = redirected;
            }
        }

      if (opt.spider)
        visited_url (url, referer);
      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
          if (opt.page_requisites
              && (depth == opt.reclevel || depth == opt.reclevel + 1))
            {
              /* When -p is specified, we are allowed to exceed the
                 maximum depth, but only for the "inline" links,
                 i.e. those that are needed to display the page.
                 Originally this could exceed the depth at most by
                 one, but we allow one more level so that the leaf
                 pages that contain frames can be loaded
                 correctly.  */
              dash_p_leaf_HTML = true;
            }
          else
            {
              /* Either -p wasn't specified or it was and we've
                 already spent the two extra (pseudo-)levels that it
                 affords us, so we need to bail out. */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
              descend = false;
            }
        }
      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains. */

      if (descend)
        {
          bool meta_disallow_follow = false;
          struct urlpos *children
            = get_urls_html (file, url, &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
              free_urlpos (children);
              children = NULL;
            }
          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL);
              char *referer_url = url;
              bool strip_auth = (url_parsed != NULL
                                 && url_parsed->user != NULL);
              assert (url_parsed != NULL);

              /* Strip auth info if present */
              if (strip_auth)
                referer_url = url_string (url_parsed, URL_AUTH_HIDE);

              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (dash_p_leaf_HTML && !child->link_inline_p)
                    continue;
                  if (download_child_p (child, url_parsed, depth, start_url_parsed,
                                        blacklist))
                    {
                      url_enqueue (queue, xstrdup (child->url->url),
                                   xstrdup (referer_url), depth + 1,
                                   child->link_expect_html);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice.  */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              if (strip_auth)
                xfree (referer_url);
              url_free (url_parsed);
              free_urlpos (children);
            }
        }
      if (file
          && (opt.delete_after
              || opt.spider /* opt.recursive is implicitly true */
              || !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             (otherwise unneeded because of --spider or rejected by -R)
             HTML file just to harvest its hyperlinks -- in either case,
             delete the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   (opt.spider ? "--spider" :
                    "recursive rejection criteria")));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after || opt.spider
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
          logputs (LOG_VERBOSE, "\n");
          register_delete_file (file);
        }

      xfree (url);
      xfree_null (referer);
      xfree_null (file);
    }
  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3;
    bool d4;
    while (url_dequeue (queue,
                        (const char **)&d1, (const char **)&d2, &d3, &d4))
      {
        xfree (d1);
        xfree_null (d2);
      }
  }
  url_queue_delete (queue);
  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (opt.quota && total_downloaded_bytes > opt.quota)
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}
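
/* Usage note (an assumption about the caller, not shown in this file):
   when -r or -p is in effect, the main driver invokes recursion once
   per command-line URL, roughly as

     if (opt.recursive || opt.page_requisites)
       status = retrieve_tree (url);

   so the queue, the blacklist, and the depth counters above are all
   local to a single starting point.  */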
/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */

static bool
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
                  struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;
  bool u_scheme_like_http;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      if (opt.spider)
        {
          char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
          DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
          visited_url (url, referrer);
          xfree (referrer);
        }

      DEBUGP (("Already on the black list.\n"));
      goto out;
    }
  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
        gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory- consuming tests should be put later on
     the list.  */
  /* Determine whether URL under consideration has an HTTP-like scheme. */
  u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out.  */
  if (u_scheme_like_http)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out.  */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }
  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for documents needed to display
     the parent page when in -p mode. */
  if (opt.no_parent
      && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
      && 0 == strcasecmp (u->host, start_url_parsed->host)
      && u->port == start_url_parsed->port
      && !(opt.page_requisites && upos->link_inline_p))
    {
      if (!subdir_p (start_url_parsed->dir, u->dir))
        {
          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                   u->dir, start_url_parsed->dir));
          goto out;
        }
    }
  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists. */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }
  /* 6. Check for acceptance/rejection rules.  We ignore these rules
     for directories (no file name to match) and for non-leaf HTMLs,
     which can lead to other files that do need to be downloaded.  (-p
     automatically implies non-leaf because with -p we can, if
     necessary, overstep the maximum depth to get the page requisites.)  */
  if (u->file[0] != '\0'
      && !(has_html_suffix_p (u->file)
           /* The exception only applies to non-leaf HTMLs (but -p
              always implies non-leaf because we can overstep the
              maximum depth to get the requisites):  */
           && (/* non-leaf */
               opt.reclevel == INFINITE_RECURSION
               /* also non-leaf */
               || depth < opt.reclevel - 1
               /* -p, which implies non-leaf (see above) */
               || opt.page_requisites)))
    {
      if (!acceptable (u->file))
        {
          DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                   url, u->file));
          goto out;
        }
    }
  /* 7. */
  if (schemes_are_similar_p (u->scheme, parent->scheme))
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }
  /* 8. */
  if (opt.use_robots && u_scheme_like_http)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile))
            {
              specs = res_parse_from_file (rfile);

              /* Delete the robots.txt file if we chose to either delete the
                 files after downloading or we're just running a spider. */
              if (opt.delete_after || opt.spider)
                {
                  logprintf (LOG_VERBOSE, "Removing %s.\n", rfile);
                  if (unlink (rfile))
                    logprintf (LOG_NOTQUIET, "unlink: %s\n",
                               strerror (errno));
                }

              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them. */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say. */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }

  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  return true;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return false;
}
/* This function determines whether we will consider downloading the
   children of a URL whose download resulted in a redirection,
   possibly to another host, etc.  It is needed very rarely, and thus
   it is merely a simple-minded wrapper around download_child_p.  */

static bool
descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
  bool success;

  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);

  upos = xnew0 (struct urlpos);
  upos->url = new_parsed;

  success = download_child_p (upos, orig_parsed, depth,
                              start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);
  xfree (upos);

  if (!success)
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  return success;
}
/* vim:set sts=2 sw=2 cino+={s: */