sjero.net Git - wget/blob - src/recur.c

   1 /* Handling of recursive HTTP retrieving.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   3    2004, 2005, 2006, 2007 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include <config.h>
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #ifdef HAVE_UNISTD_H
  37 # include <unistd.h>
  38 #endif /* HAVE_UNISTD_H */
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "wget.h"
  43 #include "url.h"
  44 #include "recur.h"
  45 #include "utils.h"
  46 #include "retr.h"
  47 #include "ftp.h"
  48 #include "host.h"
  49 #include "hash.h"
  50 #include "res.h"
  51 #include "convert.h"
  52 #include "spider.h"
  53 \f
  54 /* Functions for maintaining the URL queue.  */
  55
  56 struct queue_element {
  57   const char *url;              /* the URL to download */
  58   const char *referer;          /* the referring document */
  59   int depth;                    /* the depth */
  60   bool html_allowed;            /* whether the document is allowed to
  61                                    be treated as HTML. */
  62
  63   struct queue_element *next;   /* next element in queue */
  64 };
  65
  66 struct url_queue {
  67   struct queue_element *head;
  68   struct queue_element *tail;
  69   int count, maxcount;
  70 };
  71
  72 /* Create a URL queue. */
  73
  74 static struct url_queue *
  75 url_queue_new (void)
  76 {
  77   struct url_queue *queue = xnew0 (struct url_queue);
  78   return queue;
  79 }
  80
  81 /* Delete a URL queue. */
  82
  83 static void
  84 url_queue_delete (struct url_queue *queue)
  85 {
  86   xfree (queue);
  87 }
  88
  89 /* Enqueue a URL in the queue.  The queue is FIFO: the items will be
  90    retrieved ("dequeued") from the queue in the order they were placed
  91    into it.  */
  92
  93 static void
  94 url_enqueue (struct url_queue *queue,
  95              const char *url, const char *referer, int depth, bool html_allowed)
  96 {
  97   struct queue_element *qel = xnew (struct queue_element);
  98   qel->url = url;
  99   qel->referer = referer;
 100   qel->depth = depth;
 101   qel->html_allowed = html_allowed;
 102   qel->next = NULL;
 103
 104   ++queue->count;
 105   if (queue->count > queue->maxcount)
 106     queue->maxcount = queue->count;
 107
 108   DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
 109   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 110
 111   if (queue->tail)
 112     queue->tail->next = qel;
 113   queue->tail = qel;
 114
 115   if (!queue->head)
 116     queue->head = queue->tail;
 117 }
 118
 119 /* Take a URL out of the queue.  Return true if this operation
 120    succeeded, or false if the queue is empty.  */
 121
 122 static bool
 123 url_dequeue (struct url_queue *queue,
 124              const char **url, const char **referer, int *depth,
 125              bool *html_allowed)
 126 {
 127   struct queue_element *qel = queue->head;
 128
 129   if (!qel)
 130     return false;
 131
 132   queue->head = queue->head->next;
 133   if (!queue->head)
 134     queue->tail = NULL;
 135
 136   *url = qel->url;
 137   *referer = qel->referer;
 138   *depth = qel->depth;
 139   *html_allowed = qel->html_allowed;
 140
 141   --queue->count;
 142
 143   DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
 144   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 145
 146   xfree (qel);
 147   return true;
 148 }
 149 \f
 150 static bool download_child_p (const struct urlpos *, struct url *, int,
 151                               struct url *, struct hash_table *);
 152 static bool descend_redirect_p (const char *, const char *, int,
 153                                 struct url *, struct hash_table *);
 154
 155
 156 /* Retrieve a part of the web beginning with START_URL.  This used to
 157    be called "recursive retrieval", because the old function was
 158    recursive and implemented depth-first search.  retrieve_tree on the
 159    other hand implements breadth-search traversal of the tree, which
 160    results in much nicer ordering of downloads.
 161
 162    The algorithm this function uses is simple:
 163
 164    1. put START_URL in the queue.
 165    2. while there are URLs in the queue:
 166
 167      3. get next URL from the queue.
 168      4. download it.
 169      5. if the URL is HTML and its depth does not exceed maximum depth,
 170         get the list of URLs embedded therein.
 171      6. for each of those URLs do the following:
 172
 173        7. if the URL is not one of those downloaded before, and if it
 174           satisfies the criteria specified by the various command-line
 175           options, add it to the queue. */
 176
 177 uerr_t
 178 retrieve_tree (const char *start_url)
 179 {
 180   uerr_t status = RETROK;
 181
 182   /* The queue of URLs we need to load. */
 183   struct url_queue *queue;
 184
 185   /* The URLs we do not wish to enqueue, because they are already in
 186      the queue, but haven't been downloaded yet.  */
 187   struct hash_table *blacklist;
 188
 189   int up_error_code;
 190   struct url *start_url_parsed = url_parse (start_url, &up_error_code);
 191
 192   if (!start_url_parsed)
 193     {
 194       logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
 195                  url_error (up_error_code));
 196       return URLERROR;
 197     }
 198
 199   queue = url_queue_new ();
 200   blacklist = make_string_hash_table (0);
 201
 202   /* Enqueue the starting URL.  Use start_url_parsed->url rather than
 203      just URL so we enqueue the canonical form of the URL.  */
 204   url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
 205   string_set_add (blacklist, start_url_parsed->url);
 206
 207   while (1)
 208     {
 209       bool descend = false;
 210       char *url, *referer, *file = NULL;
 211       int depth;
 212       bool html_allowed;
 213       bool dash_p_leaf_HTML = false;
 214
 215       if (opt.quota && total_downloaded_bytes > opt.quota)
 216         break;
 217       if (status == FWRITEERR)
 218         break;
 219
 220       /* Get the next URL from the queue... */
 221
 222       if (!url_dequeue (queue,
 223                         (const char **)&url, (const char **)&referer,
 224                         &depth, &html_allowed))
 225         break;
 226
 227       /* ...and download it.  Note that this download is in most cases
 228          unconditional, as download_child_p already makes sure a file
 229          doesn't get enqueued twice -- and yet this check is here, and
 230          not in download_child_p.  This is so that if you run `wget -r
 231          URL1 URL2', and a random URL is encountered once under URL1
 232          and again under URL2, but at a different (possibly smaller)
 233          depth, we want the URL's children to be taken into account
 234          the second time.  */
 235       if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
 236         {
 237           file = xstrdup (hash_table_get (dl_url_file_map, url));
 238
 239           DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
 240                    url, file));
 241
 242           if (html_allowed
 243               && downloaded_html_set
 244               && string_set_contains (downloaded_html_set, file))
 245             descend = true;
 246         }
 247       else
 248         {
 249           int dt = 0;
 250           char *redirected = NULL;
 251
 252           status = retrieve_url (url, &file, &redirected, referer, &dt, false);
 253
 254           if (html_allowed && file && status == RETROK
 255               && (dt & RETROKF) && (dt & TEXTHTML))
 256             descend = true;
 257
 258           if (redirected)
 259             {
 260               /* We have been redirected, possibly to another host, or
 261                  different path, or wherever.  Check whether we really
 262                  want to follow it.  */
 263               if (descend)
 264                 {
 265                   if (!descend_redirect_p (redirected, url, depth,
 266                                            start_url_parsed, blacklist))
 267                     descend = false;
 268                   else
 269                     /* Make sure that the old pre-redirect form gets
 270                        blacklisted. */
 271                     string_set_add (blacklist, url);
 272                 }
 273
 274               xfree (url);
 275               url = redirected;
 276             }
 277         }
 278
 279       if (opt.spider)
 280         {
 281           visited_url (url, referer);
 282         }
 283
 284       if (descend
 285           && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
 286         {
 287           if (opt.page_requisites
 288               && (depth == opt.reclevel || depth == opt.reclevel + 1))
 289             {
 290               /* When -p is specified, we are allowed to exceed the
 291                  maximum depth, but only for the "inline" links,
 292                  i.e. those that are needed to display the page.
 293                  Originally this could exceed the depth at most by
 294                  one, but we allow one more level so that the leaf
 295                  pages that contain frames can be loaded
 296                  correctly.  */
 297               dash_p_leaf_HTML = true;
 298             }
 299           else
 300             {
 301               /* Either -p wasn't specified or it was and we've
 302                  already spent the two extra (pseudo-)levels that it
 303                  affords us, so we need to bail out. */
 304               DEBUGP (("Not descending further; at depth %d, max. %d.\n",
 305                        depth, opt.reclevel));
 306               descend = false;
 307             }
 308         }
 309
 310       /* If the downloaded document was HTML, parse it and enqueue the
 311          links it contains. */
 312
 313       if (descend)
 314         {
 315           bool meta_disallow_follow = false;
 316           struct urlpos *children
 317             = get_urls_html (file, url, &meta_disallow_follow);
 318
 319           if (opt.use_robots && meta_disallow_follow)
 320             {
 321               free_urlpos (children);
 322               children = NULL;
 323             }
 324
 325           if (children)
 326             {
 327               struct urlpos *child = children;
 328               struct url *url_parsed = url_parsed = url_parse (url, NULL);
 329               char *referer_url = url;
 330               bool strip_auth = (url_parsed != NULL
 331                                  && url_parsed->user != NULL);
 332               assert (url_parsed != NULL);
 333
 334               /* Strip auth info if present */
 335               if (strip_auth)
 336                 referer_url = url_string (url_parsed, URL_AUTH_HIDE);
 337
 338               for (; child; child = child->next)
 339                 {
 340                   if (child->ignore_when_downloading)
 341                     continue;
 342                   if (dash_p_leaf_HTML && !child->link_inline_p)
 343                     continue;
 344                   if (download_child_p (child, url_parsed, depth, start_url_parsed,
 345                                         blacklist))
 346                     {
 347                       url_enqueue (queue, xstrdup (child->url->url),
 348                                    xstrdup (referer_url), depth + 1,
 349                                    child->link_expect_html);
 350                       /* We blacklist the URL we have enqueued, because we
 351                          don't want to enqueue (and hence download) the
 352                          same URL twice.  */
 353                       string_set_add (blacklist, child->url->url);
 354                     }
 355                 }
 356
 357               if (strip_auth)
 358                 xfree (referer_url);
 359               url_free (url_parsed);
 360               free_urlpos (children);
 361             }
 362         }
 363
 364       if (file
 365           && (opt.delete_after
 366               || opt.spider /* opt.recursive is implicitely true */
 367               || !acceptable (file)))
 368         {
 369           /* Either --delete-after was specified, or we loaded this
 370              (otherwise unneeded because of --spider or rejected by -R)
 371              HTML file just to harvest its hyperlinks -- in either case,
 372              delete the local file. */
 373           DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
 374                    opt.delete_after ? "--delete-after" :
 375                    (opt.spider ? "--spider" :
 376                     "recursive rejection criteria")));
 377           logprintf (LOG_VERBOSE,
 378                      (opt.delete_after || opt.spider
 379                       ? _("Removing %s.\n")
 380                       : _("Removing %s since it should be rejected.\n")),
 381                      file);
 382           if (unlink (file))
 383             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 384           logputs (LOG_VERBOSE, "\n");
 385           register_delete_file (file);
 386         }
 387
 388       xfree (url);
 389       xfree_null (referer);
 390       xfree_null (file);
 391     }
 392
 393   /* If anything is left of the queue due to a premature exit, free it
 394      now.  */
 395   {
 396     char *d1, *d2;
 397     int d3;
 398     bool d4;
 399     while (url_dequeue (queue,
 400                         (const char **)&d1, (const char **)&d2, &d3, &d4))
 401       {
 402         xfree (d1);
 403         xfree_null (d2);
 404       }
 405   }
 406   url_queue_delete (queue);
 407
 408   if (start_url_parsed)
 409     url_free (start_url_parsed);
 410   string_set_free (blacklist);
 411
 412   if (opt.quota && total_downloaded_bytes > opt.quota)
 413     return QUOTEXC;
 414   else if (status == FWRITEERR)
 415     return FWRITEERR;
 416   else
 417     return RETROK;
 418 }
 419
 420 /* Based on the context provided by retrieve_tree, decide whether a
 421    URL is to be descended to.  This is only ever called from
 422    retrieve_tree, but is in a separate function for clarity.
 423
 424    The most expensive checks (such as those for robots) are memoized
 425    by storing these URLs to BLACKLIST.  This may or may not help.  It
 426    will help if those URLs are encountered many times.  */
 427
 428 static bool
 429 download_child_p (const struct urlpos *upos, struct url *parent, int depth,
 430                   struct url *start_url_parsed, struct hash_table *blacklist)
 431 {
 432   struct url *u = upos->url;
 433   const char *url = u->url;
 434   bool u_scheme_like_http;
 435
 436   DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
 437
 438   if (string_set_contains (blacklist, url))
 439     {
 440       if (opt.spider)
 441         {
 442           char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
 443           DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url));
 444           visited_url (url, referrer);
 445           xfree (referrer);
 446         }
 447       DEBUGP (("Already on the black list.\n"));
 448       goto out;
 449     }
 450
 451   /* Several things to check for:
 452      1. if scheme is not http, and we don't load it
 453      2. check for relative links (if relative_only is set)
 454      3. check for domain
 455      4. check for no-parent
 456      5. check for excludes && includes
 457      6. check for suffix
 458      7. check for same host (if spanhost is unset), with possible
 459      gethostbyname baggage
 460      8. check for robots.txt
 461
 462      Addendum: If the URL is FTP, and it is to be loaded, only the
 463      domain and suffix settings are "stronger".
 464
 465      Note that .html files will get loaded regardless of suffix rules
 466      (but that is remedied later with unlink) unless the depth equals
 467      the maximum depth.
 468
 469      More time- and memory- consuming tests should be put later on
 470      the list.  */
 471
 472   /* Determine whether URL under consideration has a HTTP-like scheme. */
 473   u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);
 474
 475   /* 1. Schemes other than HTTP are normally not recursed into. */
 476   if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
 477     {
 478       DEBUGP (("Not following non-HTTP schemes.\n"));
 479       goto out;
 480     }
 481
 482   /* 2. If it is an absolute link and they are not followed, throw it
 483      out.  */
 484   if (u_scheme_like_http)
 485     if (opt.relative_only && !upos->link_relative_p)
 486       {
 487         DEBUGP (("It doesn't really look like a relative link.\n"));
 488         goto out;
 489       }
 490
 491   /* 3. If its domain is not to be accepted/looked-up, chuck it
 492      out.  */
 493   if (!accept_domain (u))
 494     {
 495       DEBUGP (("The domain was not accepted.\n"));
 496       goto out;
 497     }
 498
 499   /* 4. Check for parent directory.
 500
 501      If we descended to a different host or changed the scheme, ignore
 502      opt.no_parent.  Also ignore it for documents needed to display
 503      the parent page when in -p mode.  */
 504   if (opt.no_parent
 505       && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
 506       && 0 == strcasecmp (u->host, start_url_parsed->host)
 507       && u->port == start_url_parsed->port
 508       && !(opt.page_requisites && upos->link_inline_p))
 509     {
 510       if (!subdir_p (start_url_parsed->dir, u->dir))
 511         {
 512           DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
 513                    u->dir, start_url_parsed->dir));
 514           goto out;
 515         }
 516     }
 517
 518   /* 5. If the file does not match the acceptance list, or is on the
 519      rejection list, chuck it out.  The same goes for the directory
 520      exclusion and inclusion lists.  */
 521   if (opt.includes || opt.excludes)
 522     {
 523       if (!accdir (u->dir))
 524         {
 525           DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
 526           goto out;
 527         }
 528     }
 529
 530   /* 6. Check for acceptance/rejection rules.  We ignore these rules
 531      for directories (no file name to match) and for non-leaf HTMLs,
 532      which can lead to other files that do need to be downloaded.  (-p
 533      automatically implies non-leaf because with -p we can, if
 534      necesary, overstep the maximum depth to get the page requisites.)  */
 535   if (u->file[0] != '\0'
 536       && !(has_html_suffix_p (u->file)
 537            /* The exception only applies to non-leaf HTMLs (but -p
 538               always implies non-leaf because we can overstep the
 539               maximum depth to get the requisites): */
 540            && (/* non-leaf */
 541                opt.reclevel == INFINITE_RECURSION
 542                /* also non-leaf */
 543                || depth < opt.reclevel - 1
 544                /* -p, which implies non-leaf (see above) */
 545                || opt.page_requisites)))
 546     {
 547       if (!acceptable (u->file))
 548         {
 549           DEBUGP (("%s (%s) does not match acc/rej rules.\n",
 550                    url, u->file));
 551           goto out;
 552         }
 553     }
 554
 555   /* 7. */
 556   if (schemes_are_similar_p (u->scheme, parent->scheme))
 557     if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
 558       {
 559         DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
 560                  u->host, parent->host));
 561         goto out;
 562       }
 563
 564   /* 8. */
 565   if (opt.use_robots && u_scheme_like_http)
 566     {
 567       struct robot_specs *specs = res_get_specs (u->host, u->port);
 568       if (!specs)
 569         {
 570           char *rfile;
 571           if (res_retrieve_file (url, &rfile))
 572             {
 573               specs = res_parse_from_file (rfile);
 574               xfree (rfile);
 575             }
 576           else
 577             {
 578               /* If we cannot get real specs, at least produce
 579                  dummy ones so that we can register them and stop
 580                  trying to retrieve them.  */
 581               specs = res_parse ("", 0);
 582             }
 583           res_register_specs (u->host, u->port, specs);
 584         }
 585
 586       /* Now that we have (or don't have) robots.txt specs, we can
 587          check what they say.  */
 588       if (!res_match_path (specs, u->path))
 589         {
 590           DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
 591           string_set_add (blacklist, url);
 592           goto out;
 593         }
 594     }
 595
 596   /* The URL has passed all the tests.  It can be placed in the
 597      download queue. */
 598   DEBUGP (("Decided to load it.\n"));
 599
 600   return true;
 601
 602  out:
 603   DEBUGP (("Decided NOT to load it.\n"));
 604
 605   return false;
 606 }
 607
 608 /* This function determines whether we will consider downloading the
 609    children of a URL whose download resulted in a redirection,
 610    possibly to another host, etc.  It is needed very rarely, and thus
 611    it is merely a simple-minded wrapper around download_child_p.  */
 612
 613 static bool
 614 descend_redirect_p (const char *redirected, const char *original, int depth,
 615                     struct url *start_url_parsed, struct hash_table *blacklist)
 616 {
 617   struct url *orig_parsed, *new_parsed;
 618   struct urlpos *upos;
 619   bool success;
 620
 621   orig_parsed = url_parse (original, NULL);
 622   assert (orig_parsed != NULL);
 623
 624   new_parsed = url_parse (redirected, NULL);
 625   assert (new_parsed != NULL);
 626
 627   upos = xnew0 (struct urlpos);
 628   upos->url = new_parsed;
 629
 630   success = download_child_p (upos, orig_parsed, depth,
 631                               start_url_parsed, blacklist);
 632
 633   url_free (orig_parsed);
 634   url_free (new_parsed);
 635   xfree (upos);
 636
 637   if (!success)
 638     DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
 639
 640   return success;
 641 }
 642
 643 /* vim:set sts=2 sw=2 cino+={s: */