sjero.net Git - wget/blob - src/recur.c

   1 /* Handling of recursive HTTP retrieving.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif /* HAVE_STRING_H */
  29 #ifdef HAVE_UNISTD_H
  30 # include <unistd.h>
  31 #endif /* HAVE_UNISTD_H */
  32 #include <errno.h>
  33 #include <assert.h>
  34 #include <sys/types.h>
  35
  36 #include "wget.h"
  37 #include "url.h"
  38 #include "recur.h"
  39 #include "utils.h"
  40 #include "retr.h"
  41 #include "ftp.h"
  42 #include "fnmatch.h"
  43 #include "host.h"
  44 #include "hash.h"
  45 #include "res.h"
  46
  47 #ifndef errno
  48 extern int errno;
  49 #endif
  50
  51 extern char *version_string;
  52
  53 static struct hash_table *dl_file_url_map;
  54 static struct hash_table *dl_url_file_map;
  55
  56 /* List of HTML files downloaded in this Wget run.  Used for link
  57    conversion after Wget is done.  This list should only be traversed
  58    in order.  If you need to check whether a file has been downloaded,
  59    use a hash table, e.g. dl_file_url_map.  */
  60 static slist *downloaded_html_files;
  61 \f
  62 /* Functions for maintaining the URL queue.  */
  63
  64 struct queue_element {
  65   const char *url;
  66   const char *referer;
  67   int depth;
  68   struct queue_element *next;
  69 };
  70
  71 struct url_queue {
  72   struct queue_element *head;
  73   struct queue_element *tail;
  74   int count, maxcount;
  75 };
  76
  77 /* Create a URL queue. */
  78
  79 static struct url_queue *
  80 url_queue_new (void)
  81 {
  82   struct url_queue *queue = xmalloc (sizeof (*queue));
  83   memset (queue, '\0', sizeof (*queue));
  84   return queue;
  85 }
  86
  87 /* Delete a URL queue. */
  88
  89 static void
  90 url_queue_delete (struct url_queue *queue)
  91 {
  92   xfree (queue);
  93 }
  94
  95 /* Enqueue a URL in the queue.  The queue is FIFO: the items will be
  96    retrieved ("dequeued") from the queue in the order they were placed
  97    into it.  */
  98
  99 static void
 100 url_enqueue (struct url_queue *queue,
 101              const char *url, const char *referer, int depth)
 102 {
 103   struct queue_element *qel = xmalloc (sizeof (*qel));
 104   qel->url = url;
 105   qel->referer = referer;
 106   qel->depth = depth;
 107   qel->next = NULL;
 108
 109   ++queue->count;
 110   if (queue->count > queue->maxcount)
 111     queue->maxcount = queue->count;
 112
 113   DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
 114   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 115
 116   if (queue->tail)
 117     queue->tail->next = qel;
 118   queue->tail = qel;
 119
 120   if (!queue->head)
 121     queue->head = queue->tail;
 122 }
 123
 124 /* Take a URL out of the queue.  Return 1 if this operation succeeded,
 125    or 0 if the queue is empty.  */
 126
 127 static int
 128 url_dequeue (struct url_queue *queue,
 129              const char **url, const char **referer, int *depth)
 130 {
 131   struct queue_element *qel = queue->head;
 132
 133   if (!qel)
 134     return 0;
 135
 136   queue->head = queue->head->next;
 137   if (!queue->head)
 138     queue->tail = NULL;
 139
 140   *url = qel->url;
 141   *referer = qel->referer;
 142   *depth = qel->depth;
 143
 144   --queue->count;
 145
 146   DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
 147   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 148
 149   xfree (qel);
 150   return 1;
 151 }
 152 \f
 153 static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
 154                                   struct url *, struct hash_table *));
 155
 156 /* Retrieve a part of the web beginning with START_URL.  This used to
 157    be called "recursive retrieval", because the old function was
 158    recursive and implemented depth-first search.  retrieve_tree on the
 159    other hand implements breadth-search traversal of the tree, which
 160    results in much nicer ordering of downloads.
 161
 162    The algorithm this function uses is simple:
 163
 164    1. put START_URL in the queue.
 165    2. while there are URLs in the queue:
 166
 167      3. get next URL from the queue.
 168      4. download it.
 169      5. if the URL is HTML and its depth does not exceed maximum depth,
 170         get the list of URLs embedded therein.
 171      6. for each of those URLs do the following:
 172
 173        7. if the URL is not one of those downloaded before, and if it
 174           satisfies the criteria specified by the various command-line
 175           options, add it to the queue. */
 176
 177 uerr_t
 178 retrieve_tree (const char *start_url)
 179 {
 180   uerr_t status = RETROK;
 181
 182   /* The queue of URLs we need to load. */
 183   struct url_queue *queue = url_queue_new ();
 184
 185   /* The URLs we do not wish to enqueue, because they are already in
 186      the queue, but haven't been downloaded yet.  */
 187   struct hash_table *blacklist = make_string_hash_table (0);
 188
 189   /* We'll need various components of this, so better get it over with
 190      now. */
 191   struct url *start_url_parsed = url_parse (start_url, NULL);
 192
 193   url_enqueue (queue, xstrdup (start_url), NULL, 0);
 194   string_set_add (blacklist, start_url);
 195
 196   while (1)
 197     {
 198       int descend = 0;
 199       char *url, *referer, *file = NULL;
 200       int depth;
 201       boolean dash_p_leaf_HTML = FALSE;
 202
 203       if (downloaded_exceeds_quota ())
 204         break;
 205
 206       if (status == FWRITEERR)
 207         break;
 208
 209       /* Get the next URL from the queue. */
 210
 211       if (!url_dequeue (queue,
 212                         (const char **)&url, (const char **)&referer,
 213                         &depth))
 214         break;
 215
 216       /* And download it. */
 217
 218       {
 219         int dt = 0;
 220         char *redirected = NULL;
 221         int oldrec = opt.recursive;
 222
 223         opt.recursive = 0;
 224         status = retrieve_url (url, &file, &redirected, NULL, &dt);
 225         opt.recursive = oldrec;
 226
 227         if (redirected)
 228           {
 229             xfree (url);
 230             url = redirected;
 231           }
 232         if (file && status == RETROK
 233             && (dt & RETROKF) && (dt & TEXTHTML))
 234           descend = 1;
 235       }
 236
 237       if (descend
 238           && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
 239         {
 240           if (opt.page_requisites && depth == opt.reclevel)
 241             /* When -p is specified, we can do one more partial
 242                recursion from the "leaf nodes" on the HTML document
 243                tree.  The recursion is partial in that we won't
 244                traverse any <A> or <AREA> tags, nor any <LINK> tags
 245                except for <LINK REL="stylesheet">. */
 246             dash_p_leaf_HTML = TRUE;
 247           else
 248             {
 249               /* Either -p wasn't specified or it was and we've
 250                  already gone the one extra (pseudo-)level that it
 251                  affords us, so we need to bail out. */
 252               DEBUGP (("Not descending further; at depth %d, max. %d.\n",
 253                        depth, opt.reclevel));
 254               descend = 0;
 255             }
 256         }
 257
 258       /* If the downloaded document was HTML, parse it and enqueue the
 259          links it contains. */
 260
 261       if (descend)
 262         {
 263           int meta_disallow_follow = 0;
 264           struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
 265                                                    &meta_disallow_follow);
 266
 267           if (opt.use_robots && meta_disallow_follow)
 268             {
 269               free_urlpos (children);
 270               children = NULL;
 271             }
 272
 273           if (children)
 274             {
 275               struct urlpos *child = children;
 276               struct url *url_parsed = url_parsed = url_parse (url, NULL);
 277               assert (url_parsed != NULL);
 278
 279               for (; child; child = child->next)
 280                 {
 281                   if (child->ignore_when_downloading)
 282                     continue;
 283                   if (descend_url_p (child, url_parsed, depth, start_url_parsed,
 284                                      blacklist))
 285                     {
 286                       url_enqueue (queue, xstrdup (child->url->url),
 287                                    xstrdup (url), depth + 1);
 288                       /* We blacklist the URL we have enqueued, because we
 289                          don't want to enqueue (and hence download) the
 290                          same URL twice.  */
 291                       string_set_add (blacklist, child->url->url);
 292                     }
 293                 }
 294
 295               url_free (url_parsed);
 296               free_urlpos (children);
 297             }
 298         }
 299
 300       if (opt.delete_after || (file && !acceptable (file)))
 301         {
 302           /* Either --delete-after was specified, or we loaded this
 303              otherwise rejected (e.g. by -R) HTML file just so we
 304              could harvest its hyperlinks -- in either case, delete
 305              the local file. */
 306           DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
 307                    opt.delete_after ? "--delete-after" :
 308                    "recursive rejection criteria"));
 309           logprintf (LOG_VERBOSE,
 310                      (opt.delete_after ? _("Removing %s.\n")
 311                       : _("Removing %s since it should be rejected.\n")),
 312                      file);
 313           if (unlink (file))
 314             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 315         }
 316
 317       xfree (url);
 318       FREE_MAYBE (referer);
 319       FREE_MAYBE (file);
 320     }
 321
 322   /* If anything is left of the queue due to a premature exit, free it
 323      now.  */
 324   {
 325     char *d1, *d2;
 326     int d3;
 327     while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
 328       {
 329         xfree (d1);
 330         FREE_MAYBE (d2);
 331       }
 332   }
 333   url_queue_delete (queue);
 334
 335   if (start_url_parsed)
 336     url_free (start_url_parsed);
 337   string_set_free (blacklist);
 338
 339   if (downloaded_exceeds_quota ())
 340     return QUOTEXC;
 341   else if (status == FWRITEERR)
 342     return FWRITEERR;
 343   else
 344     return RETROK;
 345 }
 346
 347 /* Based on the context provided by retrieve_tree, decide whether a
 348    URL is to be descended to.  This is only ever called from
 349    retrieve_tree, but is in a separate function for clarity.
 350
 351    The most expensive checks (such as those for robots) are memoized
 352    by storing these URLs to BLACKLIST.  This may or may not help.  It
 353    will help if those URLs are encountered many times.  */
 354
 355 static int
 356 descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
 357                struct url *start_url_parsed, struct hash_table *blacklist)
 358 {
 359   struct url *u = upos->url;
 360   const char *url = u->url;
 361
 362   DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
 363
 364   if (string_set_contains (blacklist, url))
 365     {
 366       DEBUGP (("Already on the black list.\n"));
 367       goto out;
 368     }
 369
 370   /* Several things to check for:
 371      1. if scheme is not http, and we don't load it
 372      2. check for relative links (if relative_only is set)
 373      3. check for domain
 374      4. check for no-parent
 375      5. check for excludes && includes
 376      6. check for suffix
 377      7. check for same host (if spanhost is unset), with possible
 378      gethostbyname baggage
 379      8. check for robots.txt
 380
 381      Addendum: If the URL is FTP, and it is to be loaded, only the
 382      domain and suffix settings are "stronger".
 383
 384      Note that .html files will get loaded regardless of suffix rules
 385      (but that is remedied later with unlink) unless the depth equals
 386      the maximum depth.
 387
 388      More time- and memory- consuming tests should be put later on
 389      the list.  */
 390
 391   /* 1. Schemes other than HTTP are normally not recursed into. */
 392   if (u->scheme != SCHEME_HTTP
 393       && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
 394     {
 395       DEBUGP (("Not following non-HTTP schemes.\n"));
 396       goto out;
 397     }
 398
 399   /* 2. If it is an absolute link and they are not followed, throw it
 400      out.  */
 401   if (u->scheme == SCHEME_HTTP)
 402     if (opt.relative_only && !upos->link_relative_p)
 403       {
 404         DEBUGP (("It doesn't really look like a relative link.\n"));
 405         goto out;
 406       }
 407
 408   /* 3. If its domain is not to be accepted/looked-up, chuck it
 409      out.  */
 410   if (!accept_domain (u))
 411     {
 412       DEBUGP (("The domain was not accepted.\n"));
 413       goto out;
 414     }
 415
 416   /* 4. Check for parent directory.
 417
 418      If we descended to a different host or changed the scheme, ignore
 419      opt.no_parent.  Also ignore it for -p leaf retrievals.  */
 420   if (opt.no_parent
 421       && u->scheme == parent->scheme
 422       && 0 == strcasecmp (u->host, parent->host)
 423       && u->port == parent->port)
 424     {
 425       if (!frontcmp (parent->dir, u->dir))
 426         {
 427           DEBUGP (("Trying to escape the root directory with no_parent in effect.\n"));
 428           goto out;
 429         }
 430     }
 431
 432   /* 5. If the file does not match the acceptance list, or is on the
 433      rejection list, chuck it out.  The same goes for the directory
 434      exclusion and inclusion lists.  */
 435   if (opt.includes || opt.excludes)
 436     {
 437       if (!accdir (u->dir, ALLABS))
 438         {
 439           DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
 440           goto out;
 441         }
 442     }
 443
 444   /* 6. */
 445   {
 446     char *suf;
 447     /* Check for acceptance/rejection rules.  We ignore these rules
 448        for HTML documents because they might lead to other files which
 449        need to be downloaded.  Of course, we don't know which
 450        documents are HTML before downloading them, so we guess.
 451
 452        A file is subject to acceptance/rejection rules if:
 453
 454        * u->file is not "" (i.e. it is not a directory)
 455        and either:
 456          + there is no file suffix,
 457          + or there is a suffix, but is not "html" or "htm",
 458          + both:
 459            - recursion is not infinite,
 460            - and we are at its very end. */
 461
 462     if (u->file[0] != '\0'
 463         && ((suf = suffix (url)) == NULL
 464             || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
 465             || (opt.reclevel == INFINITE_RECURSION && depth >= opt.reclevel)))
 466       {
 467         if (!acceptable (u->file))
 468           {
 469             DEBUGP (("%s (%s) does not match acc/rej rules.\n",
 470                      url, u->file));
 471             goto out;
 472           }
 473       }
 474   }
 475
 476   /* 7. */
 477   if (u->scheme == parent->scheme)
 478     if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
 479       {
 480         DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
 481                  u->host, parent->host));
 482         goto out;
 483       }
 484
 485   /* 8. */
 486   if (opt.use_robots && u->scheme == SCHEME_HTTP)
 487     {
 488       struct robot_specs *specs = res_get_specs (u->host, u->port);
 489       if (!specs)
 490         {
 491           char *rfile;
 492           if (res_retrieve_file (url, &rfile))
 493             {
 494               specs = res_parse_from_file (rfile);
 495               xfree (rfile);
 496             }
 497           else
 498             {
 499               /* If we cannot get real specs, at least produce
 500                  dummy ones so that we can register them and stop
 501                  trying to retrieve them.  */
 502               specs = res_parse ("", 0);
 503             }
 504           res_register_specs (u->host, u->port, specs);
 505         }
 506
 507       /* Now that we have (or don't have) robots.txt specs, we can
 508          check what they say.  */
 509       if (!res_match_path (specs, u->path))
 510         {
 511           DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
 512           string_set_add (blacklist, url);
 513           goto out;
 514         }
 515     }
 516
 517   /* The URL has passed all the tests.  It can be placed in the
 518      download queue. */
 519   DEBUGP (("Decided to load it.\n"));
 520
 521   return 1;
 522
 523  out:
 524   DEBUGP (("Decided NOT to load it.\n"));
 525
 526   return 0;
 527 }
 528 \f
 529 /* Register that URL has been successfully downloaded to FILE. */
 530
 531 void
 532 register_download (const char *url, const char *file)
 533 {
 534   if (!opt.convert_links)
 535     return;
 536   if (!dl_file_url_map)
 537     dl_file_url_map = make_string_hash_table (0);
 538   if (!dl_url_file_map)
 539     dl_url_file_map = make_string_hash_table (0);
 540
 541   if (!hash_table_contains (dl_file_url_map, file))
 542     hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 543   if (!hash_table_contains (dl_url_file_map, url))
 544     hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 545 }
 546
 547 /* Register that FROM has been redirected to TO.  This assumes that TO
 548    is successfully downloaded and already registered using
 549    register_download() above.  */
 550
 551 void
 552 register_redirection (const char *from, const char *to)
 553 {
 554   char *file;
 555
 556   if (!opt.convert_links)
 557     return;
 558
 559   file = hash_table_get (dl_url_file_map, to);
 560   assert (file != NULL);
 561   if (!hash_table_contains (dl_url_file_map, from))
 562     hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
 563 }
 564
 565 /* Register that URL corresponds to the HTML file FILE. */
 566
 567 void
 568 register_html (const char *url, const char *file)
 569 {
 570   if (!opt.convert_links)
 571     return;
 572   downloaded_html_files = slist_prepend (downloaded_html_files, file);
 573 }
 574
 575 /* convert_links() is called from recursive_retrieve() after we're
 576    done with an HTML file.  This call to convert_links is not complete
 577    because it converts only the downloaded files, and Wget cannot know
 578    which files will be downloaded afterwards.  So, if we have file
 579    fileone.html with:
 580
 581    <a href="/c/something.gif">
 582
 583    and /c/something.gif was not downloaded because it exceeded the
 584    recursion depth, the reference will *not* be changed.
 585
 586    However, later we can encounter /c/something.gif from an "upper"
 587    level HTML (let's call it filetwo.html), and it gets downloaded.
 588
 589    But now we have a problem because /c/something.gif will be
 590    correctly transformed in filetwo.html, but not in fileone.html,
 591    since Wget could not have known that /c/something.gif will be
 592    downloaded in the future.
 593
 594    This is why Wget must, after the whole retrieval, call
 595    convert_all_links to go once more through the entire list of
 596    retrieved HTMLs, and re-convert them.
 597
 598    All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs
 599    in urls_downloaded.  From these two lists information is
 600    extracted.  */
 601 void
 602 convert_all_links (void)
 603 {
 604   slist *html;
 605   struct wget_timer *timer;
 606   long msecs;
 607   int file_count = 0;
 608
 609   timer = wtimer_new ();
 610
 611   /* Destructively reverse downloaded_html_files to get it in the right order.
 612      recursive_retrieve() used slist_prepend() consistently.  */
 613   downloaded_html_files = slist_nreverse (downloaded_html_files);
 614
 615   for (html = downloaded_html_files; html; html = html->next)
 616     {
 617       struct urlpos *urls, *cur_url;
 618       char *url;
 619
 620       DEBUGP (("Rescanning %s\n", html->string));
 621
 622       /* Determine the URL of the HTML file.  get_urls_html will need
 623          it.  */
 624       url = hash_table_get (dl_file_url_map, html->string);
 625       if (url)
 626         DEBUGP (("It should correspond to %s.\n", url));
 627       else
 628         DEBUGP (("I cannot find the corresponding URL.\n"));
 629
 630       /* Parse the HTML file...  */
 631       urls = get_urls_html (html->string, url, FALSE, NULL);
 632
 633       /* We don't respect meta_disallow_follow here because, even if
 634          the file is not followed, we might still want to convert the
 635          links that have been followed from other files.  */
 636
 637       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 638         {
 639           char *local_name;
 640           struct url *u = cur_url->url;
 641
 642           if (cur_url->link_base_p)
 643             {
 644               /* Base references have been resolved by our parser, so
 645                  we turn the base URL into an empty string.  (Perhaps
 646                  we should remove the tag entirely?)  */
 647               cur_url->convert = CO_NULLIFY_BASE;
 648               continue;
 649             }
 650
 651           /* We decide the direction of conversion according to whether
 652              a URL was downloaded.  Downloaded URLs will be converted
 653              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 654           local_name = hash_table_get (dl_url_file_map, u->url);
 655           if (local_name)
 656             DEBUGP (("%s marked for conversion, local %s\n",
 657                      u->url, local_name));
 658
 659           /* Decide on the conversion type.  */
 660           if (local_name)
 661             {
 662               /* We've downloaded this URL.  Convert it to relative
 663                  form.  We do this even if the URL already is in
 664                  relative form, because our directory structure may
 665                  not be identical to that on the server (think `-nd',
 666                  `--cut-dirs', etc.)  */
 667               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 668               cur_url->local_name = xstrdup (local_name);
 669             }
 670           else
 671             {
 672               /* We haven't downloaded this URL.  If it's not already
 673                  complete (including a full host name), convert it to
 674                  that form, so it can be reached while browsing this
 675                  HTML locally.  */
 676               if (!cur_url->link_complete_p)
 677                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 678               cur_url->local_name = NULL;
 679             }
 680         }
 681
 682       /* Convert the links in the file.  */
 683       convert_links (html->string, urls);
 684       ++file_count;
 685
 686       /* Free the data.  */
 687       free_urlpos (urls);
 688     }
 689
 690   msecs = wtimer_elapsed (timer);
 691   wtimer_delete (timer);
 692   logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
 693              file_count, (double)msecs / 1000);
 694 }
 695
 696 /* Cleanup the data structures associated with recursive retrieving
 697    (the variables above).  */
 698 void
 699 recursive_cleanup (void)
 700 {
 701   if (dl_file_url_map)
 702     {
 703       free_keys_and_values (dl_file_url_map);
 704       hash_table_destroy (dl_file_url_map);
 705       dl_file_url_map = NULL;
 706     }
 707   if (dl_url_file_map)
 708     {
 709       free_keys_and_values (dl_url_file_map);
 710       hash_table_destroy (dl_url_file_map);
 711       dl_url_file_map = NULL;
 712     }
 713   slist_free (downloaded_html_files);
 714   downloaded_html_files = NULL;
 715 }