sjero.net Git - wget/blob - src/recur.c

   1 /* Handling of recursive HTTP retrieving.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   3    2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #ifdef HAVE_UNISTD_H
  37 # include <unistd.h>
  38 #endif /* HAVE_UNISTD_H */
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "url.h"
  43 #include "recur.h"
  44 #include "utils.h"
  45 #include "retr.h"
  46 #include "ftp.h"
  47 #include "host.h"
  48 #include "hash.h"
  49 #include "res.h"
  50 #include "convert.h"
  51 #include "html-url.h"
  52 #include "css-url.h"
  53 #include "spider.h"
  54 #include "iri.h"
  55 \f
  56 /* Functions for maintaining the URL queue.  */
  57
  58 struct queue_element {
  59   const char *url;              /* the URL to download */
  60   const char *referer;          /* the referring document */
  61   int depth;                    /* the depth */
  62   bool html_allowed;            /* whether the document is allowed to
  63                                    be treated as HTML. */
  64   struct iri *iri;                /* sXXXav */
  65   bool css_allowed;             /* whether the document is allowed to
  66                                    be treated as CSS. */
  67   struct queue_element *next;   /* next element in queue */
  68 };
  69
  70 struct url_queue {
  71   struct queue_element *head;
  72   struct queue_element *tail;
  73   int count, maxcount;
  74 };
  75
  76 /* Create a URL queue. */
  77
  78 static struct url_queue *
  79 url_queue_new (void)
  80 {
  81   struct url_queue *queue = xnew0 (struct url_queue);
  82   return queue;
  83 }
  84
  85 /* Delete a URL queue. */
  86
  87 static void
  88 url_queue_delete (struct url_queue *queue)
  89 {
  90   xfree (queue);
  91 }
  92
  93 /* Enqueue a URL in the queue.  The queue is FIFO: the items will be
  94    retrieved ("dequeued") from the queue in the order they were placed
  95    into it.  */
  96
  97 static void
  98 url_enqueue (struct url_queue *queue, struct iri *i,
  99              const char *url, const char *referer, int depth,
 100              bool html_allowed, bool css_allowed)
 101 {
 102   struct queue_element *qel = xnew (struct queue_element);
 103   qel->iri = i;
 104   qel->url = url;
 105   qel->referer = referer;
 106   qel->depth = depth;
 107   qel->html_allowed = html_allowed;
 108   qel->css_allowed = css_allowed;
 109   qel->next = NULL;
 110
 111   ++queue->count;
 112   if (queue->count > queue->maxcount)
 113     queue->maxcount = queue->count;
 114
 115   DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
 116   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 117
 118   if (i)
 119     DEBUGP (("[IRI Enqueuing %s with %s\n", quote (url),
 120                quote (i->uri_encoding)));
 121
 122   if (queue->tail)
 123     queue->tail->next = qel;
 124   queue->tail = qel;
 125
 126   if (!queue->head)
 127     queue->head = queue->tail;
 128 }
 129
 130 /* Take a URL out of the queue.  Return true if this operation
 131    succeeded, or false if the queue is empty.  */
 132
 133 static bool
 134 url_dequeue (struct url_queue *queue, struct iri **i,
 135              const char **url, const char **referer, int *depth,
 136              bool *html_allowed, bool *css_allowed)
 137 {
 138   struct queue_element *qel = queue->head;
 139
 140   if (!qel)
 141     return false;
 142
 143   queue->head = queue->head->next;
 144   if (!queue->head)
 145     queue->tail = NULL;
 146
 147   *i = qel->iri;
 148   *url = qel->url;
 149   *referer = qel->referer;
 150   *depth = qel->depth;
 151   *html_allowed = qel->html_allowed;
 152   *css_allowed = qel->css_allowed;
 153
 154   --queue->count;
 155
 156   DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
 157   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 158
 159   xfree (qel);
 160   return true;
 161 }
 162 \f
 163 static bool download_child_p (const struct urlpos *, struct url *, int,
 164                               struct url *, struct hash_table *, struct iri *);
 165 static bool descend_redirect_p (const char *, const char *, int,
 166                                 struct url *, struct hash_table *, struct iri *);
 167
 168
 169 /* Retrieve a part of the web beginning with START_URL.  This used to
 170    be called "recursive retrieval", because the old function was
 171    recursive and implemented depth-first search.  retrieve_tree on the
 172    other hand implements breadth-search traversal of the tree, which
 173    results in much nicer ordering of downloads.
 174
 175    The algorithm this function uses is simple:
 176
 177    1. put START_URL in the queue.
 178    2. while there are URLs in the queue:
 179
 180      3. get next URL from the queue.
 181      4. download it.
 182      5. if the URL is HTML and its depth does not exceed maximum depth,
 183         get the list of URLs embedded therein.
 184      6. for each of those URLs do the following:
 185
 186        7. if the URL is not one of those downloaded before, and if it
 187           satisfies the criteria specified by the various command-line
 188           options, add it to the queue. */
 189
 190 uerr_t
 191 retrieve_tree (const char *start_url)
 192 {
 193   uerr_t status = RETROK;
 194
 195   /* The queue of URLs we need to load. */
 196   struct url_queue *queue;
 197
 198   /* The URLs we do not wish to enqueue, because they are already in
 199      the queue, but haven't been downloaded yet.  */
 200   struct hash_table *blacklist;
 201
 202   int up_error_code;
 203   struct url *start_url_parsed;
 204   struct iri *i = iri_new ();
 205   set_uri_encoding (i, opt.locale);
 206
 207   start_url_parsed = url_parse (start_url, &up_error_code, i);
 208   if (!start_url_parsed)
 209     {
 210       logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
 211                  url_error (up_error_code));
 212       return URLERROR;
 213     }
 214
 215   queue = url_queue_new ();
 216   blacklist = make_string_hash_table (0);
 217
 218   /* Enqueue the starting URL.  Use start_url_parsed->url rather than
 219      just URL so we enqueue the canonical form of the URL.  */
 220   url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
 221                false);
 222   string_set_add (blacklist, start_url_parsed->url);
 223
 224   while (1)
 225     {
 226       bool descend = false;
 227       char *url, *referer, *file = NULL;
 228       int depth;
 229       bool html_allowed, css_allowed;
 230       bool is_css = false;
 231       bool dash_p_leaf_HTML = false;
 232
 233       if (opt.quota && total_downloaded_bytes > opt.quota)
 234         break;
 235       if (status == FWRITEERR)
 236         break;
 237
 238       /* Get the next URL from the queue... */
 239
 240       if (!url_dequeue (queue, (struct iri **) &i,
 241                         (const char **)&url, (const char **)&referer,
 242                         &depth, &html_allowed, &css_allowed))
 243         break;
 244
 245       /* ...and download it.  Note that this download is in most cases
 246          unconditional, as download_child_p already makes sure a file
 247          doesn't get enqueued twice -- and yet this check is here, and
 248          not in download_child_p.  This is so that if you run `wget -r
 249          URL1 URL2', and a random URL is encountered once under URL1
 250          and again under URL2, but at a different (possibly smaller)
 251          depth, we want the URL's children to be taken into account
 252          the second time.  */
 253       if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
 254         {
 255           file = xstrdup (hash_table_get (dl_url_file_map, url));
 256
 257           DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
 258                    url, file));
 259
 260           /* this sucks, needs to be combined! */
 261           if (html_allowed
 262               && downloaded_html_set
 263               && string_set_contains (downloaded_html_set, file))
 264             {
 265               descend = true;
 266               is_css = false;
 267             }
 268           if (css_allowed
 269               && downloaded_css_set
 270               && string_set_contains (downloaded_css_set, file))
 271             {
 272               descend = true;
 273               is_css = true;
 274             }
 275         }
 276       else
 277         {
 278           int dt = 0;
 279           char *redirected = NULL;
 280
 281           status = retrieve_url (url, &file, &redirected, referer, &dt,
 282                                  false, i);
 283
 284           if (html_allowed && file && status == RETROK
 285               && (dt & RETROKF) && (dt & TEXTHTML))
 286             {
 287               descend = true;
 288               is_css = false;
 289             }
 290
 291           /* a little different, css_allowed can override content type
 292              lots of web servers serve css with an incorrect content type
 293           */
 294           if (file && status == RETROK
 295               && (dt & RETROKF) &&
 296               ((dt & TEXTCSS) || css_allowed))
 297             {
 298               descend = true;
 299               is_css = true;
 300             }
 301
 302           if (redirected)
 303             {
 304               /* We have been redirected, possibly to another host, or
 305                  different path, or wherever.  Check whether we really
 306                  want to follow it.  */
 307               if (descend)
 308                 {
 309                   if (!descend_redirect_p (redirected, url, depth,
 310                                            start_url_parsed, blacklist, i))
 311                     descend = false;
 312                   else
 313                     /* Make sure that the old pre-redirect form gets
 314                        blacklisted. */
 315                     string_set_add (blacklist, url);
 316                 }
 317
 318               xfree (url);
 319               url = redirected;
 320             }
 321         }
 322
 323       if (opt.spider)
 324         {
 325           visited_url (url, referer);
 326         }
 327
 328       if (descend
 329           && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
 330         {
 331           if (opt.page_requisites
 332               && (depth == opt.reclevel || depth == opt.reclevel + 1))
 333             {
 334               /* When -p is specified, we are allowed to exceed the
 335                  maximum depth, but only for the "inline" links,
 336                  i.e. those that are needed to display the page.
 337                  Originally this could exceed the depth at most by
 338                  one, but we allow one more level so that the leaf
 339                  pages that contain frames can be loaded
 340                  correctly.  */
 341               dash_p_leaf_HTML = true;
 342             }
 343           else
 344             {
 345               /* Either -p wasn't specified or it was and we've
 346                  already spent the two extra (pseudo-)levels that it
 347                  affords us, so we need to bail out. */
 348               DEBUGP (("Not descending further; at depth %d, max. %d.\n",
 349                        depth, opt.reclevel));
 350               descend = false;
 351             }
 352         }
 353
 354       /* If the downloaded document was HTML or CSS, parse it and enqueue the
 355          links it contains. */
 356
 357       if (descend)
 358         {
 359           bool meta_disallow_follow = false;
 360           struct urlpos *children
 361             = is_css ? get_urls_css_file (file, url) :
 362                        get_urls_html (file, url, &meta_disallow_follow, i);
 363
 364           if (opt.use_robots && meta_disallow_follow)
 365             {
 366               free_urlpos (children);
 367               children = NULL;
 368             }
 369
 370           if (children)
 371             {
 372               struct urlpos *child = children;
 373               struct url *url_parsed = url_parse (url, NULL, i);
 374               struct iri *ci;
 375               char *referer_url = url;
 376               bool strip_auth = (url_parsed != NULL
 377                                  && url_parsed->user != NULL);
 378               assert (url_parsed != NULL);
 379
 380               /* Strip auth info if present */
 381               if (strip_auth)
 382                 referer_url = url_string (url_parsed, URL_AUTH_HIDE);
 383
 384               for (; child; child = child->next)
 385                 {
 386                   if (child->ignore_when_downloading)
 387                     continue;
 388                   if (dash_p_leaf_HTML && !child->link_inline_p)
 389                     continue;
 390                   if (download_child_p (child, url_parsed, depth, start_url_parsed,
 391                                         blacklist, i))
 392                     {
 393                       ci = iri_new ();
 394                       set_uri_encoding (ci, i->content_encoding);
 395                       url_enqueue (queue, ci, xstrdup (child->url->url),
 396                                    xstrdup (referer_url), depth + 1,
 397                                    child->link_expect_html,
 398                                    child->link_expect_css);
 399                       /* We blacklist the URL we have enqueued, because we
 400                          don't want to enqueue (and hence download) the
 401                          same URL twice.  */
 402                       string_set_add (blacklist, child->url->url);
 403                     }
 404                 }
 405
 406               if (strip_auth)
 407                 xfree (referer_url);
 408               url_free (url_parsed);
 409               free_urlpos (children);
 410             }
 411         }
 412
 413       if (file
 414           && (opt.delete_after
 415               || opt.spider /* opt.recursive is implicitely true */
 416               || !acceptable (file)))
 417         {
 418           /* Either --delete-after was specified, or we loaded this
 419              (otherwise unneeded because of --spider or rejected by -R)
 420              HTML file just to harvest its hyperlinks -- in either case,
 421              delete the local file. */
 422           DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
 423                    opt.delete_after ? "--delete-after" :
 424                    (opt.spider ? "--spider" :
 425                     "recursive rejection criteria")));
 426           logprintf (LOG_VERBOSE,
 427                      (opt.delete_after || opt.spider
 428                       ? _("Removing %s.\n")
 429                       : _("Removing %s since it should be rejected.\n")),
 430                      file);
 431           if (unlink (file))
 432             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 433           logputs (LOG_VERBOSE, "\n");
 434           register_delete_file (file);
 435         }
 436
 437       xfree (url);
 438       xfree_null (referer);
 439       xfree_null (file);
 440       iri_free (i);
 441     }
 442
 443   /* If anything is left of the queue due to a premature exit, free it
 444      now.  */
 445   {
 446     char *d1, *d2;
 447     int d3;
 448     bool d4, d5;
 449     struct iri *d6;
 450     while (url_dequeue (queue, (struct iri **)&d6,
 451                         (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
 452       {
 453         iri_free (d6);
 454         xfree (d1);
 455         xfree_null (d2);
 456       }
 457   }
 458   url_queue_delete (queue);
 459
 460   if (start_url_parsed)
 461     url_free (start_url_parsed);
 462   string_set_free (blacklist);
 463
 464   if (opt.quota && total_downloaded_bytes > opt.quota)
 465     return QUOTEXC;
 466   else if (status == FWRITEERR)
 467     return FWRITEERR;
 468   else
 469     return RETROK;
 470 }
 471
 472 /* Based on the context provided by retrieve_tree, decide whether a
 473    URL is to be descended to.  This is only ever called from
 474    retrieve_tree, but is in a separate function for clarity.
 475
 476    The most expensive checks (such as those for robots) are memoized
 477    by storing these URLs to BLACKLIST.  This may or may not help.  It
 478    will help if those URLs are encountered many times.  */
 479
 480 static bool
 481 download_child_p (const struct urlpos *upos, struct url *parent, int depth,
 482                   struct url *start_url_parsed, struct hash_table *blacklist,
 483                   struct iri *iri)
 484 {
 485   struct url *u = upos->url;
 486   const char *url = u->url;
 487   bool u_scheme_like_http;
 488
 489   DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
 490
 491   if (string_set_contains (blacklist, url))
 492     {
 493       if (opt.spider)
 494         {
 495           char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
 496           DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
 497           visited_url (url, referrer);
 498           xfree (referrer);
 499         }
 500       DEBUGP (("Already on the black list.\n"));
 501       goto out;
 502     }
 503
 504   /* Several things to check for:
 505      1. if scheme is not http, and we don't load it
 506      2. check for relative links (if relative_only is set)
 507      3. check for domain
 508      4. check for no-parent
 509      5. check for excludes && includes
 510      6. check for suffix
 511      7. check for same host (if spanhost is unset), with possible
 512      gethostbyname baggage
 513      8. check for robots.txt
 514
 515      Addendum: If the URL is FTP, and it is to be loaded, only the
 516      domain and suffix settings are "stronger".
 517
 518      Note that .html files will get loaded regardless of suffix rules
 519      (but that is remedied later with unlink) unless the depth equals
 520      the maximum depth.
 521
 522      More time- and memory- consuming tests should be put later on
 523      the list.  */
 524
 525   /* Determine whether URL under consideration has a HTTP-like scheme. */
 526   u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);
 527
 528   /* 1. Schemes other than HTTP are normally not recursed into. */
 529   if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
 530     {
 531       DEBUGP (("Not following non-HTTP schemes.\n"));
 532       goto out;
 533     }
 534
 535   /* 2. If it is an absolute link and they are not followed, throw it
 536      out.  */
 537   if (u_scheme_like_http)
 538     if (opt.relative_only && !upos->link_relative_p)
 539       {
 540         DEBUGP (("It doesn't really look like a relative link.\n"));
 541         goto out;
 542       }
 543
 544   /* 3. If its domain is not to be accepted/looked-up, chuck it
 545      out.  */
 546   if (!accept_domain (u))
 547     {
 548       DEBUGP (("The domain was not accepted.\n"));
 549       goto out;
 550     }
 551
 552   /* 4. Check for parent directory.
 553
 554      If we descended to a different host or changed the scheme, ignore
 555      opt.no_parent.  Also ignore it for documents needed to display
 556      the parent page when in -p mode.  */
 557   if (opt.no_parent
 558       && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
 559       && 0 == strcasecmp (u->host, start_url_parsed->host)
 560       && u->port == start_url_parsed->port
 561       && !(opt.page_requisites && upos->link_inline_p))
 562     {
 563       if (!subdir_p (start_url_parsed->dir, u->dir))
 564         {
 565           DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
 566                    u->dir, start_url_parsed->dir));
 567           goto out;
 568         }
 569     }
 570
 571   /* 5. If the file does not match the acceptance list, or is on the
 572      rejection list, chuck it out.  The same goes for the directory
 573      exclusion and inclusion lists.  */
 574   if (opt.includes || opt.excludes)
 575     {
 576       if (!accdir (u->dir))
 577         {
 578           DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
 579           goto out;
 580         }
 581     }
 582
 583   /* 6. Check for acceptance/rejection rules.  We ignore these rules
 584      for directories (no file name to match) and for non-leaf HTMLs,
 585      which can lead to other files that do need to be downloaded.  (-p
 586      automatically implies non-leaf because with -p we can, if
 587      necesary, overstep the maximum depth to get the page requisites.)  */
 588   if (u->file[0] != '\0'
 589       && !(has_html_suffix_p (u->file)
 590            /* The exception only applies to non-leaf HTMLs (but -p
 591               always implies non-leaf because we can overstep the
 592               maximum depth to get the requisites): */
 593            && (/* non-leaf */
 594                opt.reclevel == INFINITE_RECURSION
 595                /* also non-leaf */
 596                || depth < opt.reclevel - 1
 597                /* -p, which implies non-leaf (see above) */
 598                || opt.page_requisites)))
 599     {
 600       if (!acceptable (u->file))
 601         {
 602           DEBUGP (("%s (%s) does not match acc/rej rules.\n",
 603                    url, u->file));
 604           goto out;
 605         }
 606     }
 607
 608   /* 7. */
 609   if (schemes_are_similar_p (u->scheme, parent->scheme))
 610     if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
 611       {
 612         DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
 613                  u->host, parent->host));
 614         goto out;
 615       }
 616
 617   /* 8. */
 618   if (opt.use_robots && u_scheme_like_http)
 619     {
 620       struct robot_specs *specs = res_get_specs (u->host, u->port);
 621       if (!specs)
 622         {
 623           char *rfile;
 624           if (res_retrieve_file (url, &rfile, iri))
 625             {
 626               specs = res_parse_from_file (rfile);
 627
 628               /* Delete the robots.txt file if we chose to either delete the
 629                  files after downloading or we're just running a spider. */
 630               if (opt.delete_after || opt.spider)
 631                 {
 632                   logprintf (LOG_VERBOSE, "Removing %s.\n", rfile);
 633                   if (unlink (rfile))
 634                       logprintf (LOG_NOTQUIET, "unlink: %s\n",
 635                                  strerror (errno));
 636                 }
 637
 638               xfree (rfile);
 639             }
 640           else
 641             {
 642               /* If we cannot get real specs, at least produce
 643                  dummy ones so that we can register them and stop
 644                  trying to retrieve them.  */
 645               specs = res_parse ("", 0);
 646             }
 647           res_register_specs (u->host, u->port, specs);
 648         }
 649
 650       /* Now that we have (or don't have) robots.txt specs, we can
 651          check what they say.  */
 652       if (!res_match_path (specs, u->path))
 653         {
 654           DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
 655           string_set_add (blacklist, url);
 656           goto out;
 657         }
 658     }
 659
 660   /* The URL has passed all the tests.  It can be placed in the
 661      download queue. */
 662   DEBUGP (("Decided to load it.\n"));
 663
 664   return true;
 665
 666  out:
 667   DEBUGP (("Decided NOT to load it.\n"));
 668
 669   return false;
 670 }
 671
 672 /* This function determines whether we will consider downloading the
 673    children of a URL whose download resulted in a redirection,
 674    possibly to another host, etc.  It is needed very rarely, and thus
 675    it is merely a simple-minded wrapper around download_child_p.  */
 676
 677 static bool
 678 descend_redirect_p (const char *redirected, const char *original, int depth,
 679                     struct url *start_url_parsed, struct hash_table *blacklist,
 680                     struct iri *iri)
 681 {
 682   struct url *orig_parsed, *new_parsed;
 683   struct urlpos *upos;
 684   bool success;
 685
 686   orig_parsed = url_parse (original, NULL, NULL);
 687   assert (orig_parsed != NULL);
 688
 689   new_parsed = url_parse (redirected, NULL, NULL);
 690   assert (new_parsed != NULL);
 691
 692   upos = xnew0 (struct urlpos);
 693   upos->url = new_parsed;
 694
 695   success = download_child_p (upos, orig_parsed, depth,
 696                               start_url_parsed, blacklist, iri);
 697
 698   url_free (orig_parsed);
 699   url_free (new_parsed);
 700   xfree (upos);
 701
 702   if (!success)
 703     DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
 704
 705   return success;
 706 }
 707
 708 /* vim:set sts=2 sw=2 cino+={s: */