sjero.net Git - wget/blob - src/recur.c

   1 /* Handling of recursive HTTP retrieving.
   2    Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif /* HAVE_STRING_H */
  29 #ifdef HAVE_UNISTD_H
  30 # include <unistd.h>
  31 #endif /* HAVE_UNISTD_H */
  32 #include <errno.h>
  33 #include <assert.h>
  34 #include <sys/types.h>
  35
  36 #include "wget.h"
  37 #include "url.h"
  38 #include "recur.h"
  39 #include "utils.h"
  40 #include "retr.h"
  41 #include "ftp.h"
  42 #include "fnmatch.h"
  43 #include "host.h"
  44 #include "hash.h"
  45 #include "res.h"
  46
  47 #ifndef errno
  48 extern int errno;
  49 #endif
  50
extern char *version_string;

/* Map from local file name to the URL it was downloaded from.  Filled
   by register_download(); keys and values are xstrdup'ed copies that
   recursive_cleanup() releases.  */
static struct hash_table *dl_file_url_map;
/* The inverse map: downloaded URL to local file name.  Filled by
   register_download() alongside dl_file_url_map.  */
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  */
static slist *downloaded_html_files;

/* Set of undesirable-to-load URLs: URLs that were rejected by one of
   the checks in recursive_retrieve(), and URLs that have already been
   chosen for loading.  */
static struct hash_table *undesirable_urls;

/* Current recursion depth.  Set to 1 on the first call to
   recursive_retrieve() and incremented on each nested call.  */
static int depth;

/* Base directory we're recursing from (used by no_parent).  */
static char *base_dir;

/* Non-zero until recursive_retrieve() has performed its one-time
   initialization; set back to 1 by recursive_reset() and
   recursive_cleanup().  */
static int first_time = 1;
  70
  71
  72 /* Cleanup the data structures associated with recursive retrieving
  73    (the variables above).  */
  74 void
  75 recursive_cleanup (void)
  76 {
  77   if (undesirable_urls)
  78     {
  79       string_set_free (undesirable_urls);
  80       undesirable_urls = NULL;
  81     }
  82   if (dl_file_url_map)
  83     {
  84       free_keys_and_values (dl_file_url_map);
  85       hash_table_destroy (dl_file_url_map);
  86       dl_file_url_map = NULL;
  87     }
  88   if (dl_url_file_map)
  89     {
  90       free_keys_and_values (dl_url_file_map);
  91       hash_table_destroy (dl_url_file_map);
  92       dl_url_file_map = NULL;
  93     }
  94   undesirable_urls = NULL;
  95   slist_free (downloaded_html_files);
  96   downloaded_html_files = NULL;
  97   FREE_MAYBE (base_dir);
  98   first_time = 1;
  99 }
 100
 101 /* Reset FIRST_TIME to 1, so that some action can be taken in
 102    recursive_retrieve().  */
 103 void
 104 recursive_reset (void)
 105 {
 106   first_time = 1;
 107 }
 108
 109 /* The core of recursive retrieving.  Endless recursion is avoided by
 110    having all URLs stored to a linked list of URLs, which is checked
 111    before loading any URL.  That way no URL can get loaded twice.
 112
 113    The function also supports specification of maximum recursion depth
 114    and a number of other goodies.  */
 115 uerr_t
 116 recursive_retrieve (const char *file, const char *this_url)
 117 {
 118   char *constr, *filename, *newloc;
 119   char *canon_this_url = NULL;
 120   int dt, inl, dash_p_leaf_HTML = FALSE;
 121   int meta_disallow_follow;
 122   int this_url_ftp;            /* See below the explanation */
 123   uerr_t err;
 124   urlpos *url_list, *cur_url;
 125   struct urlinfo *u;
 126
 127   assert (this_url != NULL);
 128   assert (file != NULL);
 129   /* If quota was exceeded earlier, bail out.  */
 130   if (downloaded_exceeds_quota ())
 131     return QUOTEXC;
 132   /* Cache the current URL in the list.  */
 133   if (first_time)
 134     {
 135       /* These three operations need to be done only once per Wget
 136          run.  They should probably be at a different location.  */
 137       if (!undesirable_urls)
 138         undesirable_urls = make_string_hash_table (0);
 139
 140       hash_table_clear (undesirable_urls);
 141       string_set_add (undesirable_urls, this_url);
 142       /* Enter this_url to the hash table, in original and "enhanced" form.  */
 143       u = newurl ();
 144       err = parseurl (this_url, u, 0);
 145       if (err == URLOK)
 146         {
 147           string_set_add (undesirable_urls, u->url);
 148           if (opt.no_parent)
 149             base_dir = xstrdup (u->dir); /* Set the base dir.  */
 150           /* Set the canonical this_url to be sent as referer.  This
 151              problem exists only when running the first time.  */
 152           canon_this_url = xstrdup (u->url);
 153         }
 154       else
 155         {
 156           DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
 157           base_dir = NULL;
 158         }
 159       freeurl (u, 1);
 160       depth = 1;
 161       first_time = 0;
 162     }
 163   else
 164     ++depth;
 165
 166   if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
 167     /* We've exceeded the maximum recursion depth specified by the user. */
 168     {
 169       if (opt.page_requisites && depth <= opt.reclevel + 1)
 170         /* When -p is specified, we can do one more partial recursion from the
 171            "leaf nodes" on the HTML document tree.  The recursion is partial in
 172            that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
 173            except for <LINK REL="stylesheet">. */
 174         dash_p_leaf_HTML = TRUE;
 175       else
 176         /* Either -p wasn't specified or it was and we've already gone the one
 177            extra (pseudo-)level that it affords us, so we need to bail out. */
 178         {
 179           DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
 180                    depth, opt.reclevel));
 181           --depth;
 182           return RECLEVELEXC;
 183         }
 184     }
 185
 186   /* Determine whether this_url is an FTP URL.  If it is, it means
 187      that the retrieval is done through proxy.  In that case, FTP
 188      links will be followed by default and recursion will not be
 189      turned off when following them.  */
 190   this_url_ftp = (urlproto (this_url) == URLFTP);
 191
 192   /* Get the URL-s from an HTML file: */
 193   url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
 194                             dash_p_leaf_HTML, &meta_disallow_follow);
 195
 196   if (opt.use_robots && meta_disallow_follow)
 197     {
 198       /* The META tag says we are not to follow this file.  Respect
 199          that.  */
 200       free_urlpos (url_list);
 201       url_list = NULL;
 202     }
 203
 204   /* Decide what to do with each of the URLs.  A URL will be loaded if
 205      it meets several requirements, discussed later.  */
 206   for (cur_url = url_list; cur_url; cur_url = cur_url->next)
 207     {
 208       /* If quota was exceeded earlier, bail out.  */
 209       if (downloaded_exceeds_quota ())
 210         break;
 211       /* Parse the URL for convenient use in other functions, as well
 212          as to get the optimized form.  It also checks URL integrity.  */
 213       u = newurl ();
 214       if (parseurl (cur_url->url, u, 0) != URLOK)
 215         {
 216           DEBUGP (("Yuck!  A bad URL.\n"));
 217           freeurl (u, 1);
 218           continue;
 219         }
 220       if (u->proto == URLFILE)
 221         {
 222           DEBUGP (("Nothing to do with file:// around here.\n"));
 223           freeurl (u, 1);
 224           continue;
 225         }
 226       assert (u->url != NULL);
 227       constr = xstrdup (u->url);
 228
 229       /* Several checkings whether a file is acceptable to load:
 230          1. check if URL is ftp, and we don't load it
 231          2. check for relative links (if relative_only is set)
 232          3. check for domain
 233          4. check for no-parent
 234          5. check for excludes && includes
 235          6. check for suffix
 236          7. check for same host (if spanhost is unset), with possible
 237          gethostbyname baggage
 238          8. check for robots.txt
 239
 240          Addendum: If the URL is FTP, and it is to be loaded, only the
 241          domain and suffix settings are "stronger".
 242
 243          Note that .html and (yuck) .htm will get loaded regardless of
 244          suffix rules (but that is remedied later with unlink) unless
 245          the depth equals the maximum depth.
 246
 247          More time- and memory- consuming tests should be put later on
 248          the list.  */
 249
 250       /* inl is set if the URL we are working on (constr) is stored in
 251          undesirable_urls.  Using it is crucial to avoid unnecessary
 252          repeated continuous hits to the hash table.  */
 253       inl = string_set_contains (undesirable_urls, constr);
 254
 255       /* If it is FTP, and FTP is not followed, chuck it out.  */
 256       if (!inl)
 257         if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
 258           {
 259             DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
 260             string_set_add (undesirable_urls, constr);
 261             inl = 1;
 262           }
 263       /* If it is absolute link and they are not followed, chuck it
 264          out.  */
 265       if (!inl && u->proto != URLFTP)
 266         if (opt.relative_only && !cur_url->link_relative_p)
 267           {
 268             DEBUGP (("It doesn't really look like a relative link.\n"));
 269             string_set_add (undesirable_urls, constr);
 270             inl = 1;
 271           }
 272       /* If its domain is not to be accepted/looked-up, chuck it out.  */
 273       if (!inl)
 274         if (!accept_domain (u))
 275           {
 276             DEBUGP (("I don't like the smell of that domain.\n"));
 277             string_set_add (undesirable_urls, constr);
 278             inl = 1;
 279           }
 280       /* Check for parent directory.  */
 281       if (!inl && opt.no_parent
 282           /* If the new URL is FTP and the old was not, ignore
 283              opt.no_parent.  */
 284           && !(!this_url_ftp && u->proto == URLFTP))
 285         {
 286           /* Check for base_dir first.  */
 287           if (!(base_dir && frontcmp (base_dir, u->dir)))
 288             {
 289               /* Failing that, check for parent dir.  */
 290               struct urlinfo *ut = newurl ();
 291               if (parseurl (this_url, ut, 0) != URLOK)
 292                 DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
 293               else if (!frontcmp (ut->dir, u->dir))
 294                 {
 295                   /* Failing that too, kill the URL.  */
 296                   DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
 297                   string_set_add (undesirable_urls, constr);
 298                   inl = 1;
 299                 }
 300               freeurl (ut, 1);
 301             }
 302         }
 303       /* If the file does not match the acceptance list, or is on the
 304          rejection list, chuck it out.  The same goes for the
 305          directory exclude- and include- lists.  */
 306       if (!inl && (opt.includes || opt.excludes))
 307         {
 308           if (!accdir (u->dir, ALLABS))
 309             {
 310               DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
 311               string_set_add (undesirable_urls, constr);
 312               inl = 1;
 313             }
 314         }
 315       if (!inl)
 316         {
 317           char *suf = NULL;
 318           /* We check for acceptance/rejection rules only for non-HTML
 319              documents.  Since we don't know whether they really are
 320              HTML, it will be deduced from (an OR-ed list):
 321
 322              1) u->file is "" (meaning it is a directory)
 323              2) suffix exists, AND:
 324              a) it is "html", OR
 325              b) it is "htm"
 326
 327              If the file *is* supposed to be HTML, it will *not* be
 328             subject to acc/rej rules, unless a finite maximum depth has
 329             been specified and the current depth is the maximum depth. */
 330           if (!
 331               (!*u->file
 332                || (((suf = suffix (constr)) != NULL)
 333                   && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
 334                       && ((opt.reclevel != INFINITE_RECURSION) &&
 335                           (depth != opt.reclevel))))))
 336             {
 337               if (!acceptable (u->file))
 338                 {
 339                   DEBUGP (("%s (%s) does not match acc/rej rules.\n",
 340                           constr, u->file));
 341                   string_set_add (undesirable_urls, constr);
 342                   inl = 1;
 343                 }
 344             }
 345           FREE_MAYBE (suf);
 346         }
 347       /* Optimize the URL (which includes possible DNS lookup) only
 348          after all other possibilities have been exhausted.  */
 349       if (!inl)
 350         {
 351           if (!opt.simple_check)
 352             opt_url (u);
 353           else
 354             {
 355               char *p;
 356               /* Just lowercase the hostname.  */
 357               for (p = u->host; *p; p++)
 358                 *p = TOLOWER (*p);
 359               xfree (u->url);
 360               u->url = str_url (u, 0);
 361             }
 362           xfree (constr);
 363           constr = xstrdup (u->url);
 364           /* After we have canonicalized the URL, check if we have it
 365              on the black list. */
 366           if (string_set_contains (undesirable_urls, constr))
 367             inl = 1;
 368           /* This line is bogus. */
 369           /*string_set_add (undesirable_urls, constr);*/
 370
 371           if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
 372             if (!opt.spanhost && this_url && !same_host (this_url, constr))
 373               {
 374                 DEBUGP (("This is not the same hostname as the parent's.\n"));
 375                 string_set_add (undesirable_urls, constr);
 376                 inl = 1;
 377               }
 378         }
 379       /* What about robots.txt?  */
 380       if (!inl && opt.use_robots && u->proto == URLHTTP)
 381         {
 382           struct robot_specs *specs = res_get_specs (u->host, u->port);
 383           if (!specs)
 384             {
 385               char *rfile;
 386               if (res_retrieve_file (constr, &rfile))
 387                 {
 388                   specs = res_parse_from_file (rfile);
 389                   xfree (rfile);
 390                 }
 391               else
 392                 {
 393                   /* If we cannot get real specs, at least produce
 394                      dummy ones so that we can register them and stop
 395                      trying to retrieve them.  */
 396                   specs = res_parse ("", 0);
 397                 }
 398               res_register_specs (u->host, u->port, specs);
 399             }
 400
 401           /* Now that we have (or don't have) robots.txt specs, we can
 402              check what they say.  */
 403           if (!res_match_path (specs, u->path))
 404             {
 405               DEBUGP (("Not following %s because robots.txt forbids it.\n",
 406                        constr));
 407               string_set_add (undesirable_urls, constr);
 408               inl = 1;
 409             }
 410         }
 411
 412       filename = NULL;
 413       /* If it wasn't chucked out, do something with it.  */
 414       if (!inl)
 415         {
 416           DEBUGP (("I've decided to load it -> "));
 417           /* Add it to the list of already-loaded URL-s.  */
 418           string_set_add (undesirable_urls, constr);
 419           /* Automatically followed FTPs will *not* be downloaded
 420              recursively.  */
 421           if (u->proto == URLFTP)
 422             {
 423               /* Don't you adore side-effects?  */
 424               opt.recursive = 0;
 425             }
 426           /* Reset its type.  */
 427           dt = 0;
 428           /* Retrieve it.  */
 429           retrieve_url (constr, &filename, &newloc,
 430                        canon_this_url ? canon_this_url : this_url, &dt);
 431           if (u->proto == URLFTP)
 432             {
 433               /* Restore...  */
 434               opt.recursive = 1;
 435             }
 436           if (newloc)
 437             {
 438               xfree (constr);
 439               constr = newloc;
 440             }
 441           /* If there was no error, and the type is text/html, parse
 442              it recursively.  */
 443           if (dt & TEXTHTML)
 444             {
 445               if (dt & RETROKF)
 446                 recursive_retrieve (filename, constr);
 447             }
 448           else
 449             DEBUGP (("%s is not text/html so we don't chase.\n",
 450                      filename ? filename: "(null)"));
 451
 452           if (opt.delete_after || (filename && !acceptable (filename)))
 453             /* Either --delete-after was specified, or we loaded this otherwise
 454                rejected (e.g. by -R) HTML file just so we could harvest its
 455                hyperlinks -- in either case, delete the local file. */
 456             {
 457               DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
 458                        opt.delete_after ? "--delete-after" :
 459                        "recursive rejection criteria"));
 460               logprintf (LOG_VERBOSE,
 461                          (opt.delete_after ? _("Removing %s.\n")
 462                           : _("Removing %s since it should be rejected.\n")),
 463                          filename);
 464               if (unlink (filename))
 465                 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 466               dt &= ~RETROKF;
 467             }
 468
 469           /* If everything was OK, and links are to be converted, let's
 470              store the local filename.  */
 471           if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
 472             {
 473               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 474               cur_url->local_name = xstrdup (filename);
 475             }
 476         }
 477       else
 478         DEBUGP (("%s already in list, so we don't load.\n", constr));
 479       /* Free filename and constr.  */
 480       FREE_MAYBE (filename);
 481       FREE_MAYBE (constr);
 482       freeurl (u, 1);
 483       /* Increment the pbuf for the appropriate size.  */
 484     }
 485   if (opt.convert_links && !opt.delete_after)
 486     /* This is merely the first pass: the links that have been
 487        successfully downloaded are converted.  In the second pass,
 488        convert_all_links() will also convert those links that have NOT
 489        been downloaded to their canonical form.  */
 490     convert_links (file, url_list);
 491   /* Free the linked list of URL-s.  */
 492   free_urlpos (url_list);
 493   /* Free the canonical this_url.  */
 494   FREE_MAYBE (canon_this_url);
 495   /* Decrement the recursion depth.  */
 496   --depth;
 497   if (downloaded_exceeds_quota ())
 498     return QUOTEXC;
 499   else
 500     return RETROK;
 501 }
 502 \f
 503 void
 504 register_download (const char *url, const char *file)
 505 {
 506   if (!opt.convert_links)
 507     return;
 508   if (!dl_file_url_map)
 509     dl_file_url_map = make_string_hash_table (0);
 510   hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 511   if (!dl_url_file_map)
 512     dl_url_file_map = make_string_hash_table (0);
 513   hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 514 }
 515
 516 void
 517 register_html (const char *url, const char *file)
 518 {
 519   if (!opt.convert_links)
 520     return;
 521   downloaded_html_files = slist_prepend (downloaded_html_files, file);
 522 }
 523
 524 /* convert_links() is called from recursive_retrieve() after we're
 525    done with an HTML file.  This call to convert_links is not complete
 526    because it converts only the downloaded files, and Wget cannot know
 527    which files will be downloaded afterwards.  So, if we have file
 528    fileone.html with:
 529
 530    <a href="/c/something.gif">
 531
 532    and /c/something.gif was not downloaded because it exceeded the
 533    recursion depth, the reference will *not* be changed.
 534
 535    However, later we can encounter /c/something.gif from an "upper"
 536    level HTML (let's call it filetwo.html), and it gets downloaded.
 537
 538    But now we have a problem because /c/something.gif will be
 539    correctly transformed in filetwo.html, but not in fileone.html,
 540    since Wget could not have known that /c/something.gif will be
 541    downloaded in the future.
 542
 543    This is why Wget must, after the whole retrieval, call
 544    convert_all_links to go once more through the entire list of
 545    retrieved HTMLs, and re-convert them.
 546
 547    All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs
 548    in urls_downloaded.  From these two lists information is
 549    extracted.  */
 550 void
 551 convert_all_links (void)
 552 {
 553   slist *html;
 554
 555   /* Destructively reverse downloaded_html_files to get it in the right order.
 556      recursive_retrieve() used slist_prepend() consistently.  */
 557   downloaded_html_files = slist_nreverse (downloaded_html_files);
 558
 559   for (html = downloaded_html_files; html; html = html->next)
 560     {
 561       urlpos *urls, *cur_url;
 562       char *url;
 563
 564       DEBUGP (("Rescanning %s\n", html->string));
 565       /* Determine the URL of the HTML file.  get_urls_html will need
 566          it.  */
 567       url = hash_table_get (dl_file_url_map, html->string);
 568       if (url)
 569         DEBUGP (("It should correspond to %s.\n", url));
 570       else
 571         DEBUGP (("I cannot find the corresponding URL.\n"));
 572       /* Parse the HTML file...  */
 573       urls = get_urls_html (html->string, url, FALSE, NULL);
 574       /* We don't respect meta_disallow_follow here because, even if
 575          the file is not followed, we might still want to convert the
 576          links that have been followed from other files.  */
 577       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 578         {
 579           char *local_name;
 580
 581           /* The URL must be in canonical form to be compared.  */
 582           struct urlinfo *u = newurl ();
 583           uerr_t res = parseurl (cur_url->url, u, 0);
 584           if (res != URLOK)
 585             {
 586               freeurl (u, 1);
 587               continue;
 588             }
 589           /* We decide the direction of conversion according to whether
 590              a URL was downloaded.  Downloaded URLs will be converted
 591              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 592           local_name = hash_table_get (dl_url_file_map, u->url);
 593           if (local_name)
 594             DEBUGP (("%s marked for conversion, local %s\n",
 595                      u->url, local_name));
 596           /* Decide on the conversion direction.  */
 597           if (local_name)
 598             {
 599               /* We've downloaded this URL.  Convert it to relative
 600                  form.  We do this even if the URL already is in
 601                  relative form, because our directory structure may
 602                  not be identical to that on the server (think `-nd',
 603                  `--cut-dirs', etc.)  */
 604               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 605               cur_url->local_name = xstrdup (local_name);
 606             }
 607           else
 608             {
 609               /* We haven't downloaded this URL.  If it's not already
 610                  complete (including a full host name), convert it to
 611                  that form, so it can be reached while browsing this
 612                  HTML locally.  */
 613               if (!cur_url->link_complete_p)
 614                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 615               cur_url->local_name = NULL;
 616             }
 617           freeurl (u, 1);
 618         }
 619       /* Convert the links in the file.  */
 620       convert_links (html->string, urls);
 621       /* Free the data.  */
 622       free_urlpos (urls);
 623     }
 624 }