sjero.net Git - wget/blob - src/recur.c

   1 /* Handling of recursive HTTP retrieving.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   3    2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #define USE_GNULIB_ALLOC
  32
  33 #include "wget.h"
  34
  35 #include <stdio.h>
  36 #include <stdlib.h>
  37 #include <string.h>
  38 #ifdef HAVE_UNISTD_H
  39 # include <unistd.h>
  40 #endif /* HAVE_UNISTD_H */
  41 #include <errno.h>
  42 #include <assert.h>
  43
  44 #include "url.h"
  45 #include "recur.h"
  46 #include "utils.h"
  47 #include "retr.h"
  48 #include "ftp.h"
  49 #include "host.h"
  50 #include "hash.h"
  51 #include "res.h"
  52 #include "convert.h"
  53 #include "spider.h"
  54 \f
  55 /* Functions for maintaining the URL queue.  */
  56
  57 struct queue_element {
  58   const char *url;              /* the URL to download */
  59   const char *referer;          /* the referring document */
  60   int depth;                    /* the depth */
  61   bool html_allowed;            /* whether the document is allowed to
  62                                    be treated as HTML. */
  63
  64   struct queue_element *next;   /* next element in queue */
  65 };
  66
  67 struct url_queue {
  68   struct queue_element *head;
  69   struct queue_element *tail;
  70   int count, maxcount;
  71 };
  72
  73 /* Create a URL queue. */
  74
  75 static struct url_queue *
  76 url_queue_new (void)
  77 {
  78   struct url_queue *queue = xnew0 (struct url_queue);
  79   return queue;
  80 }
  81
  82 /* Delete a URL queue. */
  83
  84 static void
  85 url_queue_delete (struct url_queue *queue)
  86 {
  87   xfree (queue);
  88 }
  89
  90 /* Enqueue a URL in the queue.  The queue is FIFO: the items will be
  91    retrieved ("dequeued") from the queue in the order they were placed
  92    into it.  */
  93
  94 static void
  95 url_enqueue (struct url_queue *queue,
  96              const char *url, const char *referer, int depth, bool html_allowed)
  97 {
  98   struct queue_element *qel = xnew (struct queue_element);
  99   qel->url = url;
 100   qel->referer = referer;
 101   qel->depth = depth;
 102   qel->html_allowed = html_allowed;
 103   qel->next = NULL;
 104
 105   ++queue->count;
 106   if (queue->count > queue->maxcount)
 107     queue->maxcount = queue->count;
 108
 109   DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
 110   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 111
 112   if (queue->tail)
 113     queue->tail->next = qel;
 114   queue->tail = qel;
 115
 116   if (!queue->head)
 117     queue->head = queue->tail;
 118 }
 119
 120 /* Take a URL out of the queue.  Return true if this operation
 121    succeeded, or false if the queue is empty.  */
 122
 123 static bool
 124 url_dequeue (struct url_queue *queue,
 125              const char **url, const char **referer, int *depth,
 126              bool *html_allowed)
 127 {
 128   struct queue_element *qel = queue->head;
 129
 130   if (!qel)
 131     return false;
 132
 133   queue->head = queue->head->next;
 134   if (!queue->head)
 135     queue->tail = NULL;
 136
 137   *url = qel->url;
 138   *referer = qel->referer;
 139   *depth = qel->depth;
 140   *html_allowed = qel->html_allowed;
 141
 142   --queue->count;
 143
 144   DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
 145   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 146
 147   xfree (qel);
 148   return true;
 149 }
 150 \f
 151 static bool download_child_p (const struct urlpos *, struct url *, int,
 152                               struct url *, struct hash_table *);
 153 static bool descend_redirect_p (const char *, const char *, int,
 154                                 struct url *, struct hash_table *);
 155
 156
 157 /* Retrieve a part of the web beginning with START_URL.  This used to
 158    be called "recursive retrieval", because the old function was
 159    recursive and implemented depth-first search.  retrieve_tree on the
 160    other hand implements breadth-search traversal of the tree, which
 161    results in much nicer ordering of downloads.
 162
 163    The algorithm this function uses is simple:
 164
 165    1. put START_URL in the queue.
 166    2. while there are URLs in the queue:
 167
 168      3. get next URL from the queue.
 169      4. download it.
 170      5. if the URL is HTML and its depth does not exceed maximum depth,
 171         get the list of URLs embedded therein.
 172      6. for each of those URLs do the following:
 173
 174        7. if the URL is not one of those downloaded before, and if it
 175           satisfies the criteria specified by the various command-line
 176           options, add it to the queue. */
 177
 178 uerr_t
 179 retrieve_tree (const char *start_url)
 180 {
 181   uerr_t status = RETROK;
 182
 183   /* The queue of URLs we need to load. */
 184   struct url_queue *queue;
 185
 186   /* The URLs we do not wish to enqueue, because they are already in
 187      the queue, but haven't been downloaded yet.  */
 188   struct hash_table *blacklist;
 189
 190   int up_error_code;
 191   struct url *start_url_parsed = url_parse (start_url, &up_error_code);
 192
 193   if (!start_url_parsed)
 194     {
 195       logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
 196                  url_error (up_error_code));
 197       return URLERROR;
 198     }
 199
 200   queue = url_queue_new ();
 201   blacklist = make_string_hash_table (0);
 202
 203   /* Enqueue the starting URL.  Use start_url_parsed->url rather than
 204      just URL so we enqueue the canonical form of the URL.  */
 205   url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
 206   string_set_add (blacklist, start_url_parsed->url);
 207
 208   while (1)
 209     {
 210       bool descend = false;
 211       char *url, *referer, *file = NULL;
 212       int depth;
 213       bool html_allowed;
 214       bool dash_p_leaf_HTML = false;
 215
 216       if (opt.quota && total_downloaded_bytes > opt.quota)
 217         break;
 218       if (status == FWRITEERR)
 219         break;
 220
 221       /* Get the next URL from the queue... */
 222
 223       if (!url_dequeue (queue,
 224                         (const char **)&url, (const char **)&referer,
 225                         &depth, &html_allowed))
 226         break;
 227
 228       /* ...and download it.  Note that this download is in most cases
 229          unconditional, as download_child_p already makes sure a file
 230          doesn't get enqueued twice -- and yet this check is here, and
 231          not in download_child_p.  This is so that if you run `wget -r
 232          URL1 URL2', and a random URL is encountered once under URL1
 233          and again under URL2, but at a different (possibly smaller)
 234          depth, we want the URL's children to be taken into account
 235          the second time.  */
 236       if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
 237         {
 238           file = xstrdup (hash_table_get (dl_url_file_map, url));
 239
 240           DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
 241                    url, file));
 242
 243           if (html_allowed
 244               && downloaded_html_set
 245               && string_set_contains (downloaded_html_set, file))
 246             descend = true;
 247         }
 248       else
 249         {
 250           int dt = 0;
 251           char *redirected = NULL;
 252
 253           status = retrieve_url (url, &file, &redirected, referer, &dt, false);
 254
 255           if (html_allowed && file && status == RETROK
 256               && (dt & RETROKF) && (dt & TEXTHTML))
 257             descend = true;
 258
 259           if (redirected)
 260             {
 261               /* We have been redirected, possibly to another host, or
 262                  different path, or wherever.  Check whether we really
 263                  want to follow it.  */
 264               if (descend)
 265                 {
 266                   if (!descend_redirect_p (redirected, url, depth,
 267                                            start_url_parsed, blacklist))
 268                     descend = false;
 269                   else
 270                     /* Make sure that the old pre-redirect form gets
 271                        blacklisted. */
 272                     string_set_add (blacklist, url);
 273                 }
 274
 275               xfree (url);
 276               url = redirected;
 277             }
 278         }
 279
 280       if (opt.spider)
 281         {
 282           visited_url (url, referer);
 283         }
 284
 285       if (descend
 286           && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
 287         {
 288           if (opt.page_requisites
 289               && (depth == opt.reclevel || depth == opt.reclevel + 1))
 290             {
 291               /* When -p is specified, we are allowed to exceed the
 292                  maximum depth, but only for the "inline" links,
 293                  i.e. those that are needed to display the page.
 294                  Originally this could exceed the depth at most by
 295                  one, but we allow one more level so that the leaf
 296                  pages that contain frames can be loaded
 297                  correctly.  */
 298               dash_p_leaf_HTML = true;
 299             }
 300           else
 301             {
 302               /* Either -p wasn't specified or it was and we've
 303                  already spent the two extra (pseudo-)levels that it
 304                  affords us, so we need to bail out. */
 305               DEBUGP (("Not descending further; at depth %d, max. %d.\n",
 306                        depth, opt.reclevel));
 307               descend = false;
 308             }
 309         }
 310
 311       /* If the downloaded document was HTML, parse it and enqueue the
 312          links it contains. */
 313
 314       if (descend)
 315         {
 316           bool meta_disallow_follow = false;
 317           struct urlpos *children
 318             = get_urls_html (file, url, &meta_disallow_follow);
 319
 320           if (opt.use_robots && meta_disallow_follow)
 321             {
 322               free_urlpos (children);
 323               children = NULL;
 324             }
 325
 326           if (children)
 327             {
 328               struct urlpos *child = children;
 329               struct url *url_parsed = url_parsed = url_parse (url, NULL);
 330               char *referer_url = url;
 331               bool strip_auth = (url_parsed != NULL
 332                                  && url_parsed->user != NULL);
 333               assert (url_parsed != NULL);
 334
 335               /* Strip auth info if present */
 336               if (strip_auth)
 337                 referer_url = url_string (url_parsed, URL_AUTH_HIDE);
 338
 339               for (; child; child = child->next)
 340                 {
 341                   if (child->ignore_when_downloading)
 342                     continue;
 343                   if (dash_p_leaf_HTML && !child->link_inline_p)
 344                     continue;
 345                   if (download_child_p (child, url_parsed, depth, start_url_parsed,
 346                                         blacklist))
 347                     {
 348                       url_enqueue (queue, xstrdup (child->url->url),
 349                                    xstrdup (referer_url), depth + 1,
 350                                    child->link_expect_html);
 351                       /* We blacklist the URL we have enqueued, because we
 352                          don't want to enqueue (and hence download) the
 353                          same URL twice.  */
 354                       string_set_add (blacklist, child->url->url);
 355                     }
 356                 }
 357
 358               if (strip_auth)
 359                 xfree (referer_url);
 360               url_free (url_parsed);
 361               free_urlpos (children);
 362             }
 363         }
 364
 365       if (file
 366           && (opt.delete_after
 367               || opt.spider /* opt.recursive is implicitely true */
 368               || !acceptable (file)))
 369         {
 370           /* Either --delete-after was specified, or we loaded this
 371              (otherwise unneeded because of --spider or rejected by -R)
 372              HTML file just to harvest its hyperlinks -- in either case,
 373              delete the local file. */
 374           DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
 375                    opt.delete_after ? "--delete-after" :
 376                    (opt.spider ? "--spider" :
 377                     "recursive rejection criteria")));
 378           logprintf (LOG_VERBOSE,
 379                      (opt.delete_after || opt.spider
 380                       ? _("Removing %s.\n")
 381                       : _("Removing %s since it should be rejected.\n")),
 382                      file);
 383           if (unlink (file))
 384             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 385           logputs (LOG_VERBOSE, "\n");
 386           register_delete_file (file);
 387         }
 388
 389       xfree (url);
 390       xfree_null (referer);
 391       xfree_null (file);
 392     }
 393
 394   /* If anything is left of the queue due to a premature exit, free it
 395      now.  */
 396   {
 397     char *d1, *d2;
 398     int d3;
 399     bool d4;
 400     while (url_dequeue (queue,
 401                         (const char **)&d1, (const char **)&d2, &d3, &d4))
 402       {
 403         xfree (d1);
 404         xfree_null (d2);
 405       }
 406   }
 407   url_queue_delete (queue);
 408
 409   if (start_url_parsed)
 410     url_free (start_url_parsed);
 411   string_set_free (blacklist);
 412
 413   if (opt.quota && total_downloaded_bytes > opt.quota)
 414     return QUOTEXC;
 415   else if (status == FWRITEERR)
 416     return FWRITEERR;
 417   else
 418     return RETROK;
 419 }
 420
 421 /* Based on the context provided by retrieve_tree, decide whether a
 422    URL is to be descended to.  This is only ever called from
 423    retrieve_tree, but is in a separate function for clarity.
 424
 425    The most expensive checks (such as those for robots) are memoized
 426    by storing these URLs to BLACKLIST.  This may or may not help.  It
 427    will help if those URLs are encountered many times.  */
 428
 429 static bool
 430 download_child_p (const struct urlpos *upos, struct url *parent, int depth,
 431                   struct url *start_url_parsed, struct hash_table *blacklist)
 432 {
 433   struct url *u = upos->url;
 434   const char *url = u->url;
 435   bool u_scheme_like_http;
 436
 437   DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
 438
 439   if (string_set_contains (blacklist, url))
 440     {
 441       if (opt.spider)
 442         {
 443           char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
 444           DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
 445           visited_url (url, referrer);
 446           xfree (referrer);
 447         }
 448       DEBUGP (("Already on the black list.\n"));
 449       goto out;
 450     }
 451
 452   /* Several things to check for:
 453      1. if scheme is not http, and we don't load it
 454      2. check for relative links (if relative_only is set)
 455      3. check for domain
 456      4. check for no-parent
 457      5. check for excludes && includes
 458      6. check for suffix
 459      7. check for same host (if spanhost is unset), with possible
 460      gethostbyname baggage
 461      8. check for robots.txt
 462
 463      Addendum: If the URL is FTP, and it is to be loaded, only the
 464      domain and suffix settings are "stronger".
 465
 466      Note that .html files will get loaded regardless of suffix rules
 467      (but that is remedied later with unlink) unless the depth equals
 468      the maximum depth.
 469
 470      More time- and memory- consuming tests should be put later on
 471      the list.  */
 472
 473   /* Determine whether URL under consideration has a HTTP-like scheme. */
 474   u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);
 475
 476   /* 1. Schemes other than HTTP are normally not recursed into. */
 477   if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
 478     {
 479       DEBUGP (("Not following non-HTTP schemes.\n"));
 480       goto out;
 481     }
 482
 483   /* 2. If it is an absolute link and they are not followed, throw it
 484      out.  */
 485   if (u_scheme_like_http)
 486     if (opt.relative_only && !upos->link_relative_p)
 487       {
 488         DEBUGP (("It doesn't really look like a relative link.\n"));
 489         goto out;
 490       }
 491
 492   /* 3. If its domain is not to be accepted/looked-up, chuck it
 493      out.  */
 494   if (!accept_domain (u))
 495     {
 496       DEBUGP (("The domain was not accepted.\n"));
 497       goto out;
 498     }
 499
 500   /* 4. Check for parent directory.
 501
 502      If we descended to a different host or changed the scheme, ignore
 503      opt.no_parent.  Also ignore it for documents needed to display
 504      the parent page when in -p mode.  */
 505   if (opt.no_parent
 506       && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
 507       && 0 == strcasecmp (u->host, start_url_parsed->host)
 508       && u->port == start_url_parsed->port
 509       && !(opt.page_requisites && upos->link_inline_p))
 510     {
 511       if (!subdir_p (start_url_parsed->dir, u->dir))
 512         {
 513           DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
 514                    u->dir, start_url_parsed->dir));
 515           goto out;
 516         }
 517     }
 518
 519   /* 5. If the file does not match the acceptance list, or is on the
 520      rejection list, chuck it out.  The same goes for the directory
 521      exclusion and inclusion lists.  */
 522   if (opt.includes || opt.excludes)
 523     {
 524       if (!accdir (u->dir))
 525         {
 526           DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
 527           goto out;
 528         }
 529     }
 530
 531   /* 6. Check for acceptance/rejection rules.  We ignore these rules
 532      for directories (no file name to match) and for non-leaf HTMLs,
 533      which can lead to other files that do need to be downloaded.  (-p
 534      automatically implies non-leaf because with -p we can, if
 535      necesary, overstep the maximum depth to get the page requisites.)  */
 536   if (u->file[0] != '\0'
 537       && !(has_html_suffix_p (u->file)
 538            /* The exception only applies to non-leaf HTMLs (but -p
 539               always implies non-leaf because we can overstep the
 540               maximum depth to get the requisites): */
 541            && (/* non-leaf */
 542                opt.reclevel == INFINITE_RECURSION
 543                /* also non-leaf */
 544                || depth < opt.reclevel - 1
 545                /* -p, which implies non-leaf (see above) */
 546                || opt.page_requisites)))
 547     {
 548       if (!acceptable (u->file))
 549         {
 550           DEBUGP (("%s (%s) does not match acc/rej rules.\n",
 551                    url, u->file));
 552           goto out;
 553         }
 554     }
 555
 556   /* 7. */
 557   if (schemes_are_similar_p (u->scheme, parent->scheme))
 558     if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
 559       {
 560         DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
 561                  u->host, parent->host));
 562         goto out;
 563       }
 564
 565   /* 8. */
 566   if (opt.use_robots && u_scheme_like_http)
 567     {
 568       struct robot_specs *specs = res_get_specs (u->host, u->port);
 569       if (!specs)
 570         {
 571           char *rfile;
 572           if (res_retrieve_file (url, &rfile))
 573             {
 574               specs = res_parse_from_file (rfile);
 575
 576               /* Delete the robots.txt file if we chose to either delete the
 577                  files after downloading or we're just running a spider. */
 578               if (opt.delete_after || opt.spider)
 579                 {
 580                   logprintf (LOG_VERBOSE, "Removing %s.\n", rfile);
 581                   if (unlink (rfile))
 582                       logprintf (LOG_NOTQUIET, "unlink: %s\n",
 583                                  strerror (errno));
 584                 }
 585
 586               xfree (rfile);
 587             }
 588           else
 589             {
 590               /* If we cannot get real specs, at least produce
 591                  dummy ones so that we can register them and stop
 592                  trying to retrieve them.  */
 593               specs = res_parse ("", 0);
 594             }
 595           res_register_specs (u->host, u->port, specs);
 596         }
 597
 598       /* Now that we have (or don't have) robots.txt specs, we can
 599          check what they say.  */
 600       if (!res_match_path (specs, u->path))
 601         {
 602           DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
 603           string_set_add (blacklist, url);
 604           goto out;
 605         }
 606     }
 607
 608   /* The URL has passed all the tests.  It can be placed in the
 609      download queue. */
 610   DEBUGP (("Decided to load it.\n"));
 611
 612   return true;
 613
 614  out:
 615   DEBUGP (("Decided NOT to load it.\n"));
 616
 617   return false;
 618 }
 619
 620 /* This function determines whether we will consider downloading the
 621    children of a URL whose download resulted in a redirection,
 622    possibly to another host, etc.  It is needed very rarely, and thus
 623    it is merely a simple-minded wrapper around download_child_p.  */
 624
 625 static bool
 626 descend_redirect_p (const char *redirected, const char *original, int depth,
 627                     struct url *start_url_parsed, struct hash_table *blacklist)
 628 {
 629   struct url *orig_parsed, *new_parsed;
 630   struct urlpos *upos;
 631   bool success;
 632
 633   orig_parsed = url_parse (original, NULL);
 634   assert (orig_parsed != NULL);
 635
 636   new_parsed = url_parse (redirected, NULL);
 637   assert (new_parsed != NULL);
 638
 639   upos = xnew0 (struct urlpos);
 640   upos->url = new_parsed;
 641
 642   success = download_child_p (upos, orig_parsed, depth,
 643                               start_url_parsed, blacklist);
 644
 645   url_free (orig_parsed);
 646   url_free (new_parsed);
 647   xfree (upos);
 648
 649   if (!success)
 650     DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
 651
 652   return success;
 653 }
 654
 655 /* vim:set sts=2 sw=2 cino+={s: */