sjero.net Git - wget/blob - src/recur.c

   1 /* Handling of recursive HTTP retrieving.
   2    Copyright (C) 1996-2006 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9  (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software Foundation, Inc.,
  18 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <string.h>
  35 #ifdef HAVE_UNISTD_H
  36 # include <unistd.h>
  37 #endif /* HAVE_UNISTD_H */
  38 #include <errno.h>
  39 #include <assert.h>
  40
  41 #include "wget.h"
  42 #include "url.h"
  43 #include "recur.h"
  44 #include "utils.h"
  45 #include "retr.h"
  46 #include "ftp.h"
  47 #include "host.h"
  48 #include "hash.h"
  49 #include "res.h"
  50 #include "convert.h"
  51 #include "spider.h"
  52 \f
  53 /* Functions for maintaining the URL queue.  */
  54
  55 struct queue_element {
  56   const char *url;              /* the URL to download */
  57   const char *referer;          /* the referring document */
  58   int depth;                    /* the depth */
  59   bool html_allowed;            /* whether the document is allowed to
  60                                    be treated as HTML. */
  61
  62   struct queue_element *next;   /* next element in queue */
  63 };
  64
  65 struct url_queue {
  66   struct queue_element *head;
  67   struct queue_element *tail;
  68   int count, maxcount;
  69 };
  70
  71 /* Create a URL queue. */
  72
  73 static struct url_queue *
  74 url_queue_new (void)
  75 {
  76   struct url_queue *queue = xnew0 (struct url_queue);
  77   return queue;
  78 }
  79
  80 /* Delete a URL queue. */
  81
  82 static void
  83 url_queue_delete (struct url_queue *queue)
  84 {
  85   xfree (queue);
  86 }
  87
  88 /* Enqueue a URL in the queue.  The queue is FIFO: the items will be
  89    retrieved ("dequeued") from the queue in the order they were placed
  90    into it.  */
  91
  92 static void
  93 url_enqueue (struct url_queue *queue,
  94              const char *url, const char *referer, int depth, bool html_allowed)
  95 {
  96   struct queue_element *qel = xnew (struct queue_element);
  97   qel->url = url;
  98   qel->referer = referer;
  99   qel->depth = depth;
 100   qel->html_allowed = html_allowed;
 101   qel->next = NULL;
 102
 103   ++queue->count;
 104   if (queue->count > queue->maxcount)
 105     queue->maxcount = queue->count;
 106
 107   DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
 108   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 109
 110   if (queue->tail)
 111     queue->tail->next = qel;
 112   queue->tail = qel;
 113
 114   if (!queue->head)
 115     queue->head = queue->tail;
 116 }
 117
 118 /* Take a URL out of the queue.  Return true if this operation
 119    succeeded, or false if the queue is empty.  */
 120
 121 static bool
 122 url_dequeue (struct url_queue *queue,
 123              const char **url, const char **referer, int *depth,
 124              bool *html_allowed)
 125 {
 126   struct queue_element *qel = queue->head;
 127
 128   if (!qel)
 129     return false;
 130
 131   queue->head = queue->head->next;
 132   if (!queue->head)
 133     queue->tail = NULL;
 134
 135   *url = qel->url;
 136   *referer = qel->referer;
 137   *depth = qel->depth;
 138   *html_allowed = qel->html_allowed;
 139
 140   --queue->count;
 141
 142   DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
 143   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 144
 145   xfree (qel);
 146   return true;
 147 }
 148 \f
 149 static bool download_child_p (const struct urlpos *, struct url *, int,
 150                               struct url *, struct hash_table *);
 151 static bool descend_redirect_p (const char *, const char *, int,
 152                                 struct url *, struct hash_table *);
 153
 154
 155 /* Retrieve a part of the web beginning with START_URL.  This used to
 156    be called "recursive retrieval", because the old function was
 157    recursive and implemented depth-first search.  retrieve_tree on the
 158    other hand implements breadth-search traversal of the tree, which
 159    results in much nicer ordering of downloads.
 160
 161    The algorithm this function uses is simple:
 162
 163    1. put START_URL in the queue.
 164    2. while there are URLs in the queue:
 165
 166      3. get next URL from the queue.
 167      4. download it.
 168      5. if the URL is HTML and its depth does not exceed maximum depth,
 169         get the list of URLs embedded therein.
 170      6. for each of those URLs do the following:
 171
 172        7. if the URL is not one of those downloaded before, and if it
 173           satisfies the criteria specified by the various command-line
 174           options, add it to the queue. */
 175
 176 uerr_t
 177 retrieve_tree (const char *start_url)
 178 {
 179   uerr_t status = RETROK;
 180
 181   /* The queue of URLs we need to load. */
 182   struct url_queue *queue;
 183
 184   /* The URLs we do not wish to enqueue, because they are already in
 185      the queue, but haven't been downloaded yet.  */
 186   struct hash_table *blacklist;
 187
 188   int up_error_code;
 189   struct url *start_url_parsed = url_parse (start_url, &up_error_code);
 190
 191   if (!start_url_parsed)
 192     {
 193       logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
 194                  url_error (up_error_code));
 195       return URLERROR;
 196     }
 197
 198   queue = url_queue_new ();
 199   blacklist = make_string_hash_table (0);
 200
 201   /* Enqueue the starting URL.  Use start_url_parsed->url rather than
 202      just URL so we enqueue the canonical form of the URL.  */
 203   url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
 204   string_set_add (blacklist, start_url_parsed->url);
 205
 206   while (1)
 207     {
 208       bool descend = false;
 209       char *url, *referer, *file = NULL;
 210       int depth;
 211       bool html_allowed;
 212       bool dash_p_leaf_HTML = false;
 213
 214       if (opt.quota && total_downloaded_bytes > opt.quota)
 215         break;
 216       if (status == FWRITEERR)
 217         break;
 218
 219       /* Get the next URL from the queue... */
 220
 221       if (!url_dequeue (queue,
 222                         (const char **)&url, (const char **)&referer,
 223                         &depth, &html_allowed))
 224         break;
 225
 226       /* ...and download it.  Note that this download is in most cases
 227          unconditional, as download_child_p already makes sure a file
 228          doesn't get enqueued twice -- and yet this check is here, and
 229          not in download_child_p.  This is so that if you run `wget -r
 230          URL1 URL2', and a random URL is encountered once under URL1
 231          and again under URL2, but at a different (possibly smaller)
 232          depth, we want the URL's children to be taken into account
 233          the second time.  */
 234       if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
 235         {
 236           file = xstrdup (hash_table_get (dl_url_file_map, url));
 237
 238           DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
 239                    url, file));
 240
 241           if (html_allowed
 242               && downloaded_html_set
 243               && string_set_contains (downloaded_html_set, file))
 244             descend = true;
 245         }
 246       else
 247         {
 248           int dt = 0;
 249           char *redirected = NULL;
 250
 251           status = retrieve_url (url, &file, &redirected, referer, &dt, false);
 252
 253           if (html_allowed && file && status == RETROK
 254               && (dt & RETROKF) && (dt & TEXTHTML))
 255             descend = true;
 256
 257           if (redirected)
 258             {
 259               /* We have been redirected, possibly to another host, or
 260                  different path, or wherever.  Check whether we really
 261                  want to follow it.  */
 262               if (descend)
 263                 {
 264                   if (!descend_redirect_p (redirected, url, depth,
 265                                            start_url_parsed, blacklist))
 266                     descend = false;
 267                   else
 268                     /* Make sure that the old pre-redirect form gets
 269                        blacklisted. */
 270                     string_set_add (blacklist, url);
 271                 }
 272
 273               xfree (url);
 274               url = redirected;
 275             }
 276         }
 277
 278       if (opt.spider)
 279         {
 280           visited_url (url, referer);
 281         }
 282
 283       if (descend
 284           && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
 285         {
 286           if (opt.page_requisites
 287               && (depth == opt.reclevel || depth == opt.reclevel + 1))
 288             {
 289               /* When -p is specified, we are allowed to exceed the
 290                  maximum depth, but only for the "inline" links,
 291                  i.e. those that are needed to display the page.
 292                  Originally this could exceed the depth at most by
 293                  one, but we allow one more level so that the leaf
 294                  pages that contain frames can be loaded
 295                  correctly.  */
 296               dash_p_leaf_HTML = true;
 297             }
 298           else
 299             {
 300               /* Either -p wasn't specified or it was and we've
 301                  already spent the two extra (pseudo-)levels that it
 302                  affords us, so we need to bail out. */
 303               DEBUGP (("Not descending further; at depth %d, max. %d.\n",
 304                        depth, opt.reclevel));
 305               descend = false;
 306             }
 307         }
 308
 309       /* If the downloaded document was HTML, parse it and enqueue the
 310          links it contains. */
 311
 312       if (descend)
 313         {
 314           bool meta_disallow_follow = false;
 315           struct urlpos *children
 316             = get_urls_html (file, url, &meta_disallow_follow);
 317
 318           if (opt.use_robots && meta_disallow_follow)
 319             {
 320               free_urlpos (children);
 321               children = NULL;
 322             }
 323
 324           if (children)
 325             {
 326               struct urlpos *child = children;
 327               struct url *url_parsed = url_parsed = url_parse (url, NULL);
 328               assert (url_parsed != NULL);
 329
 330               for (; child; child = child->next)
 331                 {
 332                   if (child->ignore_when_downloading)
 333                     continue;
 334                   if (dash_p_leaf_HTML && !child->link_inline_p)
 335                     continue;
 336                   if (download_child_p (child, url_parsed, depth, start_url_parsed,
 337                                         blacklist))
 338                     {
 339                       url_enqueue (queue, xstrdup (child->url->url),
 340                                    xstrdup (url), depth + 1,
 341                                    child->link_expect_html);
 342                       /* We blacklist the URL we have enqueued, because we
 343                          don't want to enqueue (and hence download) the
 344                          same URL twice.  */
 345                       string_set_add (blacklist, child->url->url);
 346                     }
 347                 }
 348
 349               url_free (url_parsed);
 350               free_urlpos (children);
 351             }
 352         }
 353
 354       if (file
 355           && (opt.delete_after
 356               || opt.spider /* opt.recursive is implicitely true */
 357               || !acceptable (file)))
 358         {
 359           /* Either --delete-after was specified, or we loaded this
 360              (otherwise unneeded because of --spider or rejected by -R)
 361              HTML file just to harvest its hyperlinks -- in either case,
 362              delete the local file. */
 363           DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
 364                    opt.delete_after ? "--delete-after" :
 365                    (opt.spider ? "--spider" :
 366                     "recursive rejection criteria")));
 367           logprintf (LOG_VERBOSE,
 368                      (opt.delete_after || opt.spider
 369                       ? _("Removing %s.\n")
 370                       : _("Removing %s since it should be rejected.\n")),
 371                      file);
 372           if (unlink (file))
 373             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 374           logputs (LOG_VERBOSE, "\n");
 375           register_delete_file (file);
 376         }
 377
 378       xfree (url);
 379       xfree_null (referer);
 380       xfree_null (file);
 381     }
 382
 383   /* If anything is left of the queue due to a premature exit, free it
 384      now.  */
 385   {
 386     char *d1, *d2;
 387     int d3;
 388     bool d4;
 389     while (url_dequeue (queue,
 390                         (const char **)&d1, (const char **)&d2, &d3, &d4))
 391       {
 392         xfree (d1);
 393         xfree_null (d2);
 394       }
 395   }
 396   url_queue_delete (queue);
 397
 398   if (start_url_parsed)
 399     url_free (start_url_parsed);
 400   string_set_free (blacklist);
 401
 402   if (opt.quota && total_downloaded_bytes > opt.quota)
 403     return QUOTEXC;
 404   else if (status == FWRITEERR)
 405     return FWRITEERR;
 406   else
 407     return RETROK;
 408 }
 409
 410 /* Based on the context provided by retrieve_tree, decide whether a
 411    URL is to be descended to.  This is only ever called from
 412    retrieve_tree, but is in a separate function for clarity.
 413
 414    The most expensive checks (such as those for robots) are memoized
 415    by storing these URLs to BLACKLIST.  This may or may not help.  It
 416    will help if those URLs are encountered many times.  */
 417
 418 static bool
 419 download_child_p (const struct urlpos *upos, struct url *parent, int depth,
 420                   struct url *start_url_parsed, struct hash_table *blacklist)
 421 {
 422   struct url *u = upos->url;
 423   const char *url = u->url;
 424   bool u_scheme_like_http;
 425
 426   DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
 427
 428   if (string_set_contains (blacklist, url))
 429     {
 430       if (opt.spider)
 431         {
 432           char *referrer = url_string (parent, true);
 433           DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url));
 434           visited_url (url, referrer);
 435           xfree (referrer);
 436         }
 437       DEBUGP (("Already on the black list.\n"));
 438       goto out;
 439     }
 440
 441   /* Several things to check for:
 442      1. if scheme is not http, and we don't load it
 443      2. check for relative links (if relative_only is set)
 444      3. check for domain
 445      4. check for no-parent
 446      5. check for excludes && includes
 447      6. check for suffix
 448      7. check for same host (if spanhost is unset), with possible
 449      gethostbyname baggage
 450      8. check for robots.txt
 451
 452      Addendum: If the URL is FTP, and it is to be loaded, only the
 453      domain and suffix settings are "stronger".
 454
 455      Note that .html files will get loaded regardless of suffix rules
 456      (but that is remedied later with unlink) unless the depth equals
 457      the maximum depth.
 458
 459      More time- and memory- consuming tests should be put later on
 460      the list.  */
 461
 462   /* Determine whether URL under consideration has a HTTP-like scheme. */
 463   u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);
 464
 465   /* 1. Schemes other than HTTP are normally not recursed into. */
 466   if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
 467     {
 468       DEBUGP (("Not following non-HTTP schemes.\n"));
 469       goto out;
 470     }
 471
 472   /* 2. If it is an absolute link and they are not followed, throw it
 473      out.  */
 474   if (u_scheme_like_http)
 475     if (opt.relative_only && !upos->link_relative_p)
 476       {
 477         DEBUGP (("It doesn't really look like a relative link.\n"));
 478         goto out;
 479       }
 480
 481   /* 3. If its domain is not to be accepted/looked-up, chuck it
 482      out.  */
 483   if (!accept_domain (u))
 484     {
 485       DEBUGP (("The domain was not accepted.\n"));
 486       goto out;
 487     }
 488
 489   /* 4. Check for parent directory.
 490
 491      If we descended to a different host or changed the scheme, ignore
 492      opt.no_parent.  Also ignore it for documents needed to display
 493      the parent page when in -p mode.  */
 494   if (opt.no_parent
 495       && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
 496       && 0 == strcasecmp (u->host, start_url_parsed->host)
 497       && u->port == start_url_parsed->port
 498       && !(opt.page_requisites && upos->link_inline_p))
 499     {
 500       if (!subdir_p (start_url_parsed->dir, u->dir))
 501         {
 502           DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
 503                    u->dir, start_url_parsed->dir));
 504           goto out;
 505         }
 506     }
 507
 508   /* 5. If the file does not match the acceptance list, or is on the
 509      rejection list, chuck it out.  The same goes for the directory
 510      exclusion and inclusion lists.  */
 511   if (opt.includes || opt.excludes)
 512     {
 513       if (!accdir (u->dir))
 514         {
 515           DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
 516           goto out;
 517         }
 518     }
 519
 520   /* 6. Check for acceptance/rejection rules.  We ignore these rules
 521      for directories (no file name to match) and for non-leaf HTMLs,
 522      which can lead to other files that do need to be downloaded.  (-p
 523      automatically implies non-leaf because with -p we can, if
 524      necesary, overstep the maximum depth to get the page requisites.)  */
 525   if (u->file[0] != '\0'
 526       && !(has_html_suffix_p (u->file)
 527            /* The exception only applies to non-leaf HTMLs (but -p
 528               always implies non-leaf because we can overstep the
 529               maximum depth to get the requisites): */
 530            && (/* non-leaf */
 531                opt.reclevel == INFINITE_RECURSION
 532                /* also non-leaf */
 533                || depth < opt.reclevel - 1
 534                /* -p, which implies non-leaf (see above) */
 535                || opt.page_requisites)))
 536     {
 537       if (!acceptable (u->file))
 538         {
 539           DEBUGP (("%s (%s) does not match acc/rej rules.\n",
 540                    url, u->file));
 541           goto out;
 542         }
 543     }
 544
 545   /* 7. */
 546   if (schemes_are_similar_p (u->scheme, parent->scheme))
 547     if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
 548       {
 549         DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
 550                  u->host, parent->host));
 551         goto out;
 552       }
 553
 554   /* 8. */
 555   if (opt.use_robots && u_scheme_like_http)
 556     {
 557       struct robot_specs *specs = res_get_specs (u->host, u->port);
 558       if (!specs)
 559         {
 560           char *rfile;
 561           if (res_retrieve_file (url, &rfile))
 562             {
 563               specs = res_parse_from_file (rfile);
 564               xfree (rfile);
 565             }
 566           else
 567             {
 568               /* If we cannot get real specs, at least produce
 569                  dummy ones so that we can register them and stop
 570                  trying to retrieve them.  */
 571               specs = res_parse ("", 0);
 572             }
 573           res_register_specs (u->host, u->port, specs);
 574         }
 575
 576       /* Now that we have (or don't have) robots.txt specs, we can
 577          check what they say.  */
 578       if (!res_match_path (specs, u->path))
 579         {
 580           DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
 581           string_set_add (blacklist, url);
 582           goto out;
 583         }
 584     }
 585
 586   /* The URL has passed all the tests.  It can be placed in the
 587      download queue. */
 588   DEBUGP (("Decided to load it.\n"));
 589
 590   return true;
 591
 592  out:
 593   DEBUGP (("Decided NOT to load it.\n"));
 594
 595   return false;
 596 }
 597
 598 /* This function determines whether we will consider downloading the
 599    children of a URL whose download resulted in a redirection,
 600    possibly to another host, etc.  It is needed very rarely, and thus
 601    it is merely a simple-minded wrapper around download_child_p.  */
 602
 603 static bool
 604 descend_redirect_p (const char *redirected, const char *original, int depth,
 605                     struct url *start_url_parsed, struct hash_table *blacklist)
 606 {
 607   struct url *orig_parsed, *new_parsed;
 608   struct urlpos *upos;
 609   bool success;
 610
 611   orig_parsed = url_parse (original, NULL);
 612   assert (orig_parsed != NULL);
 613
 614   new_parsed = url_parse (redirected, NULL);
 615   assert (new_parsed != NULL);
 616
 617   upos = xnew0 (struct urlpos);
 618   upos->url = new_parsed;
 619
 620   success = download_child_p (upos, orig_parsed, depth,
 621                               start_url_parsed, blacklist);
 622
 623   url_free (orig_parsed);
 624   url_free (new_parsed);
 625   xfree (upos);
 626
 627   if (!success)
 628     DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
 629
 630   return success;
 631 }