sjero.net Git - wget/blob - src/recur.c

   1 /* Handling of recursive HTTP retrieving.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9  (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <string.h>
  35 #ifdef HAVE_UNISTD_H
  36 # include <unistd.h>
  37 #endif /* HAVE_UNISTD_H */
  38 #include <errno.h>
  39 #include <assert.h>
  40
  41 #include "wget.h"
  42 #include "url.h"
  43 #include "recur.h"
  44 #include "utils.h"
  45 #include "retr.h"
  46 #include "ftp.h"
  47 #include "host.h"
  48 #include "hash.h"
  49 #include "res.h"
  50 #include "convert.h"
  51
  52 extern char *version_string;
  53 extern SUM_SIZE_INT total_downloaded_bytes;
  54
  55 extern struct hash_table *dl_url_file_map;
  56 extern struct hash_table *downloaded_html_set;
  57 \f
  58 /* Functions for maintaining the URL queue.  */
  59
  60 struct queue_element {
  61   const char *url;              /* the URL to download */
  62   const char *referer;          /* the referring document */
  63   int depth;                    /* the depth */
  64   unsigned int html_allowed :1; /* whether the document is allowed to
  65                                    be treated as HTML. */
  66
  67   struct queue_element *next;   /* next element in queue */
  68 };
  69
  70 struct url_queue {
  71   struct queue_element *head;
  72   struct queue_element *tail;
  73   int count, maxcount;
  74 };
  75
  76 /* Create a URL queue. */
  77
  78 static struct url_queue *
  79 url_queue_new (void)
  80 {
  81   struct url_queue *queue = xnew0 (struct url_queue);
  82   return queue;
  83 }
  84
  85 /* Delete a URL queue. */
  86
  87 static void
  88 url_queue_delete (struct url_queue *queue)
  89 {
  90   xfree (queue);
  91 }
  92
  93 /* Enqueue a URL in the queue.  The queue is FIFO: the items will be
  94    retrieved ("dequeued") from the queue in the order they were placed
  95    into it.  */
  96
  97 static void
  98 url_enqueue (struct url_queue *queue,
  99              const char *url, const char *referer, int depth, bool html_allowed)
 100 {
 101   struct queue_element *qel = xnew (struct queue_element);
 102   qel->url = url;
 103   qel->referer = referer;
 104   qel->depth = depth;
 105   qel->html_allowed = html_allowed;
 106   qel->next = NULL;
 107
 108   ++queue->count;
 109   if (queue->count > queue->maxcount)
 110     queue->maxcount = queue->count;
 111
 112   DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
 113   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 114
 115   if (queue->tail)
 116     queue->tail->next = qel;
 117   queue->tail = qel;
 118
 119   if (!queue->head)
 120     queue->head = queue->tail;
 121 }
 122
 123 /* Take a URL out of the queue.  Return true if this operation
 124    succeeded, or false if the queue is empty.  */
 125
 126 static bool
 127 url_dequeue (struct url_queue *queue,
 128              const char **url, const char **referer, int *depth,
 129              bool *html_allowed)
 130 {
 131   struct queue_element *qel = queue->head;
 132
 133   if (!qel)
 134     return false;
 135
 136   queue->head = queue->head->next;
 137   if (!queue->head)
 138     queue->tail = NULL;
 139
 140   *url = qel->url;
 141   *referer = qel->referer;
 142   *depth = qel->depth;
 143   *html_allowed = qel->html_allowed;
 144
 145   --queue->count;
 146
 147   DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
 148   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 149
 150   xfree (qel);
 151   return true;
 152 }
 153 \f
 154 static bool download_child_p (const struct urlpos *, struct url *, int,
 155                               struct url *, struct hash_table *);
 156 static bool descend_redirect_p (const char *, const char *, int,
 157                                 struct url *, struct hash_table *);
 158
 159
 160 /* Retrieve a part of the web beginning with START_URL.  This used to
 161    be called "recursive retrieval", because the old function was
 162    recursive and implemented depth-first search.  retrieve_tree on the
 163    other hand implements breadth-search traversal of the tree, which
 164    results in much nicer ordering of downloads.
 165
 166    The algorithm this function uses is simple:
 167
 168    1. put START_URL in the queue.
 169    2. while there are URLs in the queue:
 170
 171      3. get next URL from the queue.
 172      4. download it.
 173      5. if the URL is HTML and its depth does not exceed maximum depth,
 174         get the list of URLs embedded therein.
 175      6. for each of those URLs do the following:
 176
 177        7. if the URL is not one of those downloaded before, and if it
 178           satisfies the criteria specified by the various command-line
 179           options, add it to the queue. */
 180
 181 uerr_t
 182 retrieve_tree (const char *start_url)
 183 {
 184   uerr_t status = RETROK;
 185
 186   /* The queue of URLs we need to load. */
 187   struct url_queue *queue;
 188
 189   /* The URLs we do not wish to enqueue, because they are already in
 190      the queue, but haven't been downloaded yet.  */
 191   struct hash_table *blacklist;
 192
 193   int up_error_code;
 194   struct url *start_url_parsed = url_parse (start_url, &up_error_code);
 195
 196   if (!start_url_parsed)
 197     {
 198       logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
 199                  url_error (up_error_code));
 200       return URLERROR;
 201     }
 202
 203   queue = url_queue_new ();
 204   blacklist = make_string_hash_table (0);
 205
 206   /* Enqueue the starting URL.  Use start_url_parsed->url rather than
 207      just URL so we enqueue the canonical form of the URL.  */
 208   url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
 209   string_set_add (blacklist, start_url_parsed->url);
 210
 211   while (1)
 212     {
 213       bool descend = false;
 214       char *url, *referer, *file = NULL;
 215       int depth;
 216       bool html_allowed;
 217       bool dash_p_leaf_HTML = false;
 218
 219       if (opt.quota && total_downloaded_bytes > opt.quota)
 220         break;
 221       if (status == FWRITEERR)
 222         break;
 223
 224       /* Get the next URL from the queue... */
 225
 226       if (!url_dequeue (queue,
 227                         (const char **)&url, (const char **)&referer,
 228                         &depth, &html_allowed))
 229         break;
 230
 231       /* ...and download it.  Note that this download is in most cases
 232          unconditional, as download_child_p already makes sure a file
 233          doesn't get enqueued twice -- and yet this check is here, and
 234          not in download_child_p.  This is so that if you run `wget -r
 235          URL1 URL2', and a random URL is encountered once under URL1
 236          and again under URL2, but at a different (possibly smaller)
 237          depth, we want the URL's children to be taken into account
 238          the second time.  */
 239       if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
 240         {
 241           file = xstrdup (hash_table_get (dl_url_file_map, url));
 242
 243           DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
 244                    url, file));
 245
 246           if (html_allowed
 247               && downloaded_html_set
 248               && string_set_contains (downloaded_html_set, file))
 249             descend = true;
 250         }
 251       else
 252         {
 253           int dt = 0;
 254           char *redirected = NULL;
 255           bool oldrec = opt.recursive;
 256
 257           opt.recursive = false;
 258           status = retrieve_url (url, &file, &redirected, referer, &dt);
 259           opt.recursive = oldrec;
 260
 261           if (html_allowed && file && status == RETROK
 262               && (dt & RETROKF) && (dt & TEXTHTML))
 263             descend = true;
 264
 265           if (redirected)
 266             {
 267               /* We have been redirected, possibly to another host, or
 268                  different path, or wherever.  Check whether we really
 269                  want to follow it.  */
 270               if (descend)
 271                 {
 272                   if (!descend_redirect_p (redirected, url, depth,
 273                                            start_url_parsed, blacklist))
 274                     descend = false;
 275                   else
 276                     /* Make sure that the old pre-redirect form gets
 277                        blacklisted. */
 278                     string_set_add (blacklist, url);
 279                 }
 280
 281               xfree (url);
 282               url = redirected;
 283             }
 284         }
 285
 286       if (descend
 287           && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
 288         {
 289           if (opt.page_requisites
 290               && (depth == opt.reclevel || depth == opt.reclevel + 1))
 291             {
 292               /* When -p is specified, we are allowed to exceed the
 293                  maximum depth, but only for the "inline" links,
 294                  i.e. those that are needed to display the page.
 295                  Originally this could exceed the depth at most by
 296                  one, but we allow one more level so that the leaf
 297                  pages that contain frames can be loaded
 298                  correctly.  */
 299               dash_p_leaf_HTML = true;
 300             }
 301           else
 302             {
 303               /* Either -p wasn't specified or it was and we've
 304                  already spent the two extra (pseudo-)levels that it
 305                  affords us, so we need to bail out. */
 306               DEBUGP (("Not descending further; at depth %d, max. %d.\n",
 307                        depth, opt.reclevel));
 308               descend = false;
 309             }
 310         }
 311
 312       /* If the downloaded document was HTML, parse it and enqueue the
 313          links it contains. */
 314
 315       if (descend)
 316         {
 317           bool meta_disallow_follow = false;
 318           struct urlpos *children
 319             = get_urls_html (file, url, &meta_disallow_follow);
 320
 321           if (opt.use_robots && meta_disallow_follow)
 322             {
 323               free_urlpos (children);
 324               children = NULL;
 325             }
 326
 327           if (children)
 328             {
 329               struct urlpos *child = children;
 330               struct url *url_parsed = url_parsed = url_parse (url, NULL);
 331               assert (url_parsed != NULL);
 332
 333               for (; child; child = child->next)
 334                 {
 335                   if (child->ignore_when_downloading)
 336                     continue;
 337                   if (dash_p_leaf_HTML && !child->link_inline_p)
 338                     continue;
 339                   if (download_child_p (child, url_parsed, depth, start_url_parsed,
 340                                         blacklist))
 341                     {
 342                       url_enqueue (queue, xstrdup (child->url->url),
 343                                    xstrdup (url), depth + 1,
 344                                    child->link_expect_html);
 345                       /* We blacklist the URL we have enqueued, because we
 346                          don't want to enqueue (and hence download) the
 347                          same URL twice.  */
 348                       string_set_add (blacklist, child->url->url);
 349                     }
 350                 }
 351
 352               url_free (url_parsed);
 353               free_urlpos (children);
 354             }
 355         }
 356
 357       if (opt.delete_after || (file && !acceptable (file)))
 358         {
 359           /* Either --delete-after was specified, or we loaded this
 360              otherwise rejected (e.g. by -R) HTML file just so we
 361              could harvest its hyperlinks -- in either case, delete
 362              the local file. */
 363           DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
 364                    opt.delete_after ? "--delete-after" :
 365                    "recursive rejection criteria"));
 366           logprintf (LOG_VERBOSE,
 367                      (opt.delete_after
 368                       ? _("Removing %s.\n")
 369                       : _("Removing %s since it should be rejected.\n")),
 370                      file);
 371           if (unlink (file))
 372             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 373           register_delete_file (file);
 374         }
 375
 376       xfree (url);
 377       xfree_null (referer);
 378       xfree_null (file);
 379     }
 380
 381   /* If anything is left of the queue due to a premature exit, free it
 382      now.  */
 383   {
 384     char *d1, *d2;
 385     int d3;
 386     bool d4;
 387     while (url_dequeue (queue,
 388                         (const char **)&d1, (const char **)&d2, &d3, &d4))
 389       {
 390         xfree (d1);
 391         xfree_null (d2);
 392       }
 393   }
 394   url_queue_delete (queue);
 395
 396   if (start_url_parsed)
 397     url_free (start_url_parsed);
 398   string_set_free (blacklist);
 399
 400   if (opt.quota && total_downloaded_bytes > opt.quota)
 401     return QUOTEXC;
 402   else if (status == FWRITEERR)
 403     return FWRITEERR;
 404   else
 405     return RETROK;
 406 }
 407
 408 /* Based on the context provided by retrieve_tree, decide whether a
 409    URL is to be descended to.  This is only ever called from
 410    retrieve_tree, but is in a separate function for clarity.
 411
 412    The most expensive checks (such as those for robots) are memoized
 413    by storing these URLs to BLACKLIST.  This may or may not help.  It
 414    will help if those URLs are encountered many times.  */
 415
 416 static bool
 417 download_child_p (const struct urlpos *upos, struct url *parent, int depth,
 418                   struct url *start_url_parsed, struct hash_table *blacklist)
 419 {
 420   struct url *u = upos->url;
 421   const char *url = u->url;
 422   bool u_scheme_like_http;
 423
 424   DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
 425
 426   if (string_set_contains (blacklist, url))
 427     {
 428       DEBUGP (("Already on the black list.\n"));
 429       goto out;
 430     }
 431
 432   /* Several things to check for:
 433      1. if scheme is not http, and we don't load it
 434      2. check for relative links (if relative_only is set)
 435      3. check for domain
 436      4. check for no-parent
 437      5. check for excludes && includes
 438      6. check for suffix
 439      7. check for same host (if spanhost is unset), with possible
 440      gethostbyname baggage
 441      8. check for robots.txt
 442
 443      Addendum: If the URL is FTP, and it is to be loaded, only the
 444      domain and suffix settings are "stronger".
 445
 446      Note that .html files will get loaded regardless of suffix rules
 447      (but that is remedied later with unlink) unless the depth equals
 448      the maximum depth.
 449
 450      More time- and memory- consuming tests should be put later on
 451      the list.  */
 452
 453   /* Determine whether URL under consideration has a HTTP-like scheme. */
 454   u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);
 455
 456   /* 1. Schemes other than HTTP are normally not recursed into. */
 457   if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
 458     {
 459       DEBUGP (("Not following non-HTTP schemes.\n"));
 460       goto out;
 461     }
 462
 463   /* 2. If it is an absolute link and they are not followed, throw it
 464      out.  */
 465   if (u_scheme_like_http)
 466     if (opt.relative_only && !upos->link_relative_p)
 467       {
 468         DEBUGP (("It doesn't really look like a relative link.\n"));
 469         goto out;
 470       }
 471
 472   /* 3. If its domain is not to be accepted/looked-up, chuck it
 473      out.  */
 474   if (!accept_domain (u))
 475     {
 476       DEBUGP (("The domain was not accepted.\n"));
 477       goto out;
 478     }
 479
 480   /* 4. Check for parent directory.
 481
 482      If we descended to a different host or changed the scheme, ignore
 483      opt.no_parent.  Also ignore it for documents needed to display
 484      the parent page when in -p mode.  */
 485   if (opt.no_parent
 486       && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
 487       && 0 == strcasecmp (u->host, start_url_parsed->host)
 488       && u->port == start_url_parsed->port
 489       && !(opt.page_requisites && upos->link_inline_p))
 490     {
 491       if (!frontcmp (start_url_parsed->dir, u->dir))
 492         {
 493           DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
 494                    u->dir, start_url_parsed->dir));
 495           goto out;
 496         }
 497     }
 498
 499   /* 5. If the file does not match the acceptance list, or is on the
 500      rejection list, chuck it out.  The same goes for the directory
 501      exclusion and inclusion lists.  */
 502   if (opt.includes || opt.excludes)
 503     {
 504       if (!accdir (u->dir, ALLABS))
 505         {
 506           DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
 507           goto out;
 508         }
 509     }
 510
 511   /* 6. Check for acceptance/rejection rules.  We ignore these rules
 512      for directories (no file name to match) and for non-leaf HTMLs,
 513      which can lead to other files that do need to be downloaded.  (-p
 514      automatically implies non-leaf because with -p we can, if
 515      necesary, overstep the maximum depth to get the page requisites.)  */
 516   if (u->file[0] != '\0'
 517       && !(has_html_suffix_p (u->file)
 518            /* The exception only applies to non-leaf HTMLs (but -p
 519               always implies non-leaf because we can overstep the
 520               maximum depth to get the requisites): */
 521            && (/* non-leaf */
 522                opt.reclevel == INFINITE_RECURSION
 523                /* also non-leaf */
 524                || depth < opt.reclevel - 1
 525                /* -p, which implies non-leaf (see above) */
 526                || opt.page_requisites)))
 527     {
 528       if (!acceptable (u->file))
 529         {
 530           DEBUGP (("%s (%s) does not match acc/rej rules.\n",
 531                    url, u->file));
 532           goto out;
 533         }
 534     }
 535
 536   /* 7. */
 537   if (schemes_are_similar_p (u->scheme, parent->scheme))
 538     if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
 539       {
 540         DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
 541                  u->host, parent->host));
 542         goto out;
 543       }
 544
 545   /* 8. */
 546   if (opt.use_robots && u_scheme_like_http)
 547     {
 548       struct robot_specs *specs = res_get_specs (u->host, u->port);
 549       if (!specs)
 550         {
 551           char *rfile;
 552           if (res_retrieve_file (url, &rfile))
 553             {
 554               specs = res_parse_from_file (rfile);
 555               xfree (rfile);
 556             }
 557           else
 558             {
 559               /* If we cannot get real specs, at least produce
 560                  dummy ones so that we can register them and stop
 561                  trying to retrieve them.  */
 562               specs = res_parse ("", 0);
 563             }
 564           res_register_specs (u->host, u->port, specs);
 565         }
 566
 567       /* Now that we have (or don't have) robots.txt specs, we can
 568          check what they say.  */
 569       if (!res_match_path (specs, u->path))
 570         {
 571           DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
 572           string_set_add (blacklist, url);
 573           goto out;
 574         }
 575     }
 576
 577   /* The URL has passed all the tests.  It can be placed in the
 578      download queue. */
 579   DEBUGP (("Decided to load it.\n"));
 580
 581   return true;
 582
 583  out:
 584   DEBUGP (("Decided NOT to load it.\n"));
 585
 586   return false;
 587 }
 588
 589 /* This function determines whether we will consider downloading the
 590    children of a URL whose download resulted in a redirection,
 591    possibly to another host, etc.  It is needed very rarely, and thus
 592    it is merely a simple-minded wrapper around download_child_p.  */
 593
 594 static bool
 595 descend_redirect_p (const char *redirected, const char *original, int depth,
 596                     struct url *start_url_parsed, struct hash_table *blacklist)
 597 {
 598   struct url *orig_parsed, *new_parsed;
 599   struct urlpos *upos;
 600   bool success;
 601
 602   orig_parsed = url_parse (original, NULL);
 603   assert (orig_parsed != NULL);
 604
 605   new_parsed = url_parse (redirected, NULL);
 606   assert (new_parsed != NULL);
 607
 608   upos = xnew0 (struct urlpos);
 609   upos->url = new_parsed;
 610
 611   success = download_child_p (upos, orig_parsed, depth,
 612                               start_url_parsed, blacklist);
 613
 614   url_free (orig_parsed);
 615   url_free (new_parsed);
 616   xfree (upos);
 617
 618   if (!success)
 619     DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
 620
 621   return success;
 622 }