/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <sys/types.h>
extern char *version_string;

static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  This list should only be traversed
   in order.  If you need to check whether a file has been downloaded,
   use a hash table, e.g. dl_file_url_map.  */
static slist *downloaded_html_files;
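
/* A membership test against one of these hash tables is a plain O(1)
   lookup.  A minimal sketch (FILE here stands for any local file name;
   this is illustrative and not part of the original code):

     if (dl_file_url_map && hash_table_contains (dl_file_url_map, FILE))
       ... FILE was downloaded during this run ...  */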

/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;
  const char *referer;
  int depth;
  struct queue_element *next;
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};

/* Create a URL queue. */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xmalloc (sizeof (*queue));
  memset (queue, '\0', sizeof (*queue));
  return queue;
}

/* Delete a URL queue. */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}

/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth)
{
  struct queue_element *qel = xmalloc (sizeof (*qel));
  qel->referer = referer;

  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}

/* Take a URL out of the queue.  Return 1 if this operation succeeded,
   or 0 if the queue is empty.  */

static int
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return 0;

  queue->head = queue->head->next;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  return 1;
}

static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
                                  struct url *, struct hash_table *));

/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree on the
   other hand implements breadth-first traversal of the tree, which
   results in a much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue. */
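
/* For example, with `wget -r -l 2 URL', every link found directly on
   START_URL (depth 1) is dequeued and downloaded before any link
   discovered inside those depth-1 pages (depth 2), because children
   are always appended to the tail of the FIFO queue with depth + 1.  */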

uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue = url_queue_new ();

  /* The URLs we decided we don't want to load. */
  struct hash_table *blacklist = make_string_hash_table (0);

  /* We'll need various components of this, so better get it over with
     now. */
  struct url *start_url_parsed = url_parse (start_url, NULL);

  url_enqueue (queue, xstrdup (start_url), NULL, 0);
  string_set_add (blacklist, start_url);

  while (1)
    {
      char *url, *referer, *file = NULL;
      int depth;
      boolean dash_p_leaf_HTML = FALSE;

      if (downloaded_exceeds_quota ())
        break;

      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue. */

      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth))
        break;

      /* And download it. */

        char *redirected = NULL;
        int oldrec = opt.recursive;

        opt.recursive = 0;
        status = retrieve_url (url, &file, &redirected, NULL, &dt);
        opt.recursive = oldrec;

      if (file && status == RETROK
          && (dt & RETROKF) && (dt & TEXTHTML))

          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        if (opt.page_requisites && depth == opt.reclevel)
          /* When -p is specified, we can do one more partial
             recursion from the "leaf nodes" on the HTML document
             tree.  The recursion is partial in that we won't
             traverse any <A> or <AREA> tags, nor any <LINK> tags
             except for <LINK REL="stylesheet">. */
          /* #### This would be the place to implement the TODO
             entry saying that -p should do two more hops on
             <FRAMESET> pages. */
          dash_p_leaf_HTML = TRUE;
        else
          /* Either -p wasn't specified or it was and we've
             already gone the one extra (pseudo-)level that it
             affords us, so we need to bail out. */
          DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                   depth, opt.reclevel));

      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains. */

          int meta_disallow_follow = 0;
          struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
                                                   &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            free_urlpos (children);

          struct urlpos *child = children;
          struct url *url_parsed = url_parse (url, NULL);
          assert (url_parsed != NULL);

          for (; child; child = child->next)
            {
              if (child->ignore_when_downloading)
                continue;
              if (descend_url_p (child, url_parsed, depth, start_url_parsed,
                                 blacklist))
                {
                  url_enqueue (queue, xstrdup (child->url->url),
                               xstrdup (url), depth + 1);
                  /* We blacklist the URL we have enqueued, because we
                     don't want to enqueue (and hence download) the
                     same URL twice. */
                  string_set_add (blacklist, child->url->url);
                }
            }

          url_free (url_parsed);
          free_urlpos (children);

      if (opt.delete_after || (file && !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             otherwise rejected (e.g. by -R) HTML file just so we
             could harvest its hyperlinks -- in either case, delete
             the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   "recursive rejection criteria"));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
        }

      FREE_MAYBE (referer);

  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3;
    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
      {
        xfree (d1);
        FREE_MAYBE (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}

/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.  */

static int
descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
               struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    DEBUGP (("Already on the black list.\n"));

  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
        gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory-consuming tests should be put later on
     the list.  */

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (u->scheme != SCHEME_HTTP
      && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    DEBUGP (("Not following non-HTTP schemes.\n"));

  /* 2. If it is an absolute link and they are not followed, throw it
     out. */
  if (u->scheme == SCHEME_HTTP)
    if (opt.relative_only && !upos->link_relative_p)
      DEBUGP (("It doesn't really look like a relative link.\n"));

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out. */
  if (!accept_domain (u))
    DEBUGP (("The domain was not accepted.\n"));

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for -p leaf retrievals. */
  if (opt.no_parent
      && u->scheme == parent->scheme
      && 0 == strcasecmp (u->host, parent->host)
      && u->port == parent->port)
    if (!frontcmp (parent->dir, u->dir))
      DEBUGP (("Trying to escape the root directory with no_parent in effect.\n"));

  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists. */
  if (opt.includes || opt.excludes)
    if (!accdir (u->dir, ALLABS))
      DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));

    /* Check for acceptance/rejection rules.  We ignore these rules
       for HTML documents because they might lead to other files which
       need to be downloaded.  Of course, we don't know which
       documents are HTML before downloading them, so we guess.

       A file is subject to acceptance/rejection rules if:

       * u->file is not "" (i.e. it is not a directory)
       and either:
         + there is no file suffix,
         + or there is a suffix, but it is not "html" or "htm",
         + both:
           - recursion is not infinite,
           - and we are at its very end. */
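
    /* A concrete (hypothetical) illustration: with `-A "*.jpg"', a link
       to photo.jpg is tested against the accept list right here, whereas
       page.html is still downloaded -- it may lead to acceptable files --
       and is only removed afterwards by the rejection branch in
       retrieve_tree.  */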

    if (u->file[0] != '\0'
        && ((suf = suffix (url)) == NULL
            || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
            || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
      if (!acceptable (u->file))
        DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                 url, u->file));

  if (u->scheme == parent->scheme)
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
               u->host, parent->host));

  if (opt.use_robots && u->scheme == SCHEME_HTTP)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);

      if (res_retrieve_file (url, &rfile))
        specs = res_parse_from_file (rfile);
      else
        /* If we cannot get real specs, at least produce
           dummy ones so that we can register them and stop
           trying to retrieve them. */
        specs = res_parse ("", 0);

      res_register_specs (u->host, u->port, specs);

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say. */
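      /* For instance (a hypothetical server, purely for illustration),
         a robots.txt saying

             User-agent: *
             Disallow: /private/

         makes res_match_path fail for any u->path under /private/, so
         such URLs are rejected here.  */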
      if (!res_match_path (specs, u->path))
        DEBUGP (("Not following %s because robots.txt forbids it.\n", url));

  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  string_set_add (blacklist, url);

  DEBUGP (("Decided NOT to load it.\n"));

/* Register that URL has been successfully downloaded to FILE. */

void
register_download (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  if (!dl_file_url_map)
    dl_file_url_map = make_string_hash_table (0);
  if (!dl_url_file_map)
    dl_url_file_map = make_string_hash_table (0);

  if (!hash_table_contains (dl_file_url_map, file))
    hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
  if (!hash_table_contains (dl_url_file_map, url))
    hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}
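
/* For illustration (hypothetical names, not from the original code):
   after downloading http://example.com/index.html into the local file
   "example.com/index.html", the two maps hold mirror entries:

     dl_file_url_map: "example.com/index.html" -> "http://example.com/index.html"
     dl_url_file_map: "http://example.com/index.html" -> "example.com/index.html"  */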

/* Register that FROM has been redirected to TO.  This assumes that TO
   has been successfully downloaded and already registered using
   register_download() above. */

void
register_redirection (const char *from, const char *to)
{
  char *file;

  if (!opt.convert_links)
    return;

  file = hash_table_get (dl_url_file_map, to);
  assert (file != NULL);
  if (!hash_table_contains (dl_url_file_map, from))
    hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
}
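
/* E.g. (hypothetical URLs): if http://host/old redirects to
   http://host/new, which was saved as FILE, then both URLs now map to
   FILE in dl_url_file_map, so links pointing at either spelling are
   later converted to the same local file.  */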

/* Register that URL corresponds to the HTML file FILE. */

void
register_html (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  downloaded_html_files = slist_prepend (downloaded_html_files, file);
}

/* convert_links() is called from recursive_retrieve() after we're
   done with an HTML file.  This call to convert_links is not complete
   because it converts only the downloaded files, and Wget cannot know
   which files will be downloaded afterwards.  So, if we have a file
   fileone.html containing:

       <a href="/c/something.gif">

   and /c/something.gif was not downloaded because it exceeded the
   recursion depth, the reference will *not* be changed.

   However, later we can encounter /c/something.gif from an "upper"
   level HTML (let's call it filetwo.html), and it gets downloaded.

   But now we have a problem because /c/something.gif will be
   correctly transformed in filetwo.html, but not in fileone.html,
   since Wget could not have known that /c/something.gif would be
   downloaded in the future.

   This is why Wget must, after the whole retrieval, call
   convert_all_links to go once more through the entire list of
   retrieved HTMLs, and re-convert them.

   All the downloaded HTMLs are kept in downloaded_html_files, and
   downloaded URLs in urls_downloaded.  From these two lists the
   information is extracted.  */

void
convert_all_links (void)
{
  slist *html;

  /* Destructively reverse downloaded_html_files to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently. */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      struct urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));

      /* Determine the URL of the HTML file.  get_urls_html will need
         it. */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
        DEBUGP (("It should correspond to %s.\n", url));
      else
        DEBUGP (("I cannot find the corresponding URL.\n"));

      /* Parse the HTML file... */
      urls = get_urls_html (html->string, url, FALSE, NULL);

      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files. */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;
          struct url *u = cur_url->url;

          if (cur_url->link_base_p)
            {
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?) */
              cur_url->convert = CO_NULLIFY_BASE;
              continue;
            }

          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS. */
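          /* A hypothetical example: a reference to
             http://host/img/logo.png becomes a relative link such as
             "../img/logo.png" if that file was saved locally
             (CO_CONVERT_TO_RELATIVE), while a bare "logo.png" that was
             not downloaded is expanded to the full
             http://host/img/logo.png (CO_CONVERT_TO_COMPLETE).  */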
          local_name = hash_table_get (dl_url_file_map, u->url);
          if (local_name)
            DEBUGP (("%s marked for conversion, local %s\n",
                     u->url, local_name));

          /* Decide on the conversion type. */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.) */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally. */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
            }

      /* Convert the links in the file. */
      convert_links (html->string, urls);

/* Cleanup the data structures associated with recursive retrieving
   (the variables above). */

void
recursive_cleanup (void)
{
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
}