sjero.net Git - wget/blob - src/recur.c

   1 /* Handling of recursive HTTP retrieving.
   2    Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif /* HAVE_STRING_H */
  29 #ifdef HAVE_UNISTD_H
  30 # include <unistd.h>
  31 #endif /* HAVE_UNISTD_H */
  32 #include <errno.h>
  33 #include <assert.h>
  34 #include <sys/types.h>
  35
  36 #include "wget.h"
  37 #include "url.h"
  38 #include "recur.h"
  39 #include "utils.h"
  40 #include "retr.h"
  41 #include "ftp.h"
  42 #include "fnmatch.h"
  43 #include "host.h"
  44 #include "hash.h"
  45 #include "res.h"
  46
  47 #ifndef errno
  48 extern int errno;
  49 #endif
  50
  51 extern char *version_string;
  52
/* Map from local file name to the URL it was downloaded from.
   Filled in by register_download() and consulted by
   convert_all_links() to find the URL of each saved HTML file.  */
static struct hash_table *dl_file_url_map;
/* The reverse map, from URL to local file name.  Filled in by
   register_download() and consulted by convert_all_links() to decide
   whether a link points at something that was downloaded.  */
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  */
static slist *downloaded_html_files;

/* Set of undesirable-to-load URLs: both URLs that were rejected and
   URLs that have already been scheduled for loading.  */
static struct hash_table *undesirable_urls;

/* Current recursion depth.  */
static int depth;

/* Base directory we're recursing from (used by no_parent).  */
static char *base_dir;

/* Non-zero until recursive_retrieve() has performed its one-time
   setup; set back to 1 by recursive_reset() and recursive_cleanup().  */
static int first_time = 1;
  70
  71
  72 /* Cleanup the data structures associated with recursive retrieving
  73    (the variables above).  */
  74 void
  75 recursive_cleanup (void)
  76 {
  77   if (undesirable_urls)
  78     {
  79       string_set_free (undesirable_urls);
  80       undesirable_urls = NULL;
  81     }
  82   if (dl_file_url_map)
  83     {
  84       free_keys_and_values (dl_file_url_map);
  85       hash_table_destroy (dl_file_url_map);
  86       dl_file_url_map = NULL;
  87     }
  88   if (dl_url_file_map)
  89     {
  90       free_keys_and_values (dl_url_file_map);
  91       hash_table_destroy (dl_url_file_map);
  92       dl_url_file_map = NULL;
  93     }
  94   undesirable_urls = NULL;
  95   slist_free (downloaded_html_files);
  96   downloaded_html_files = NULL;
  97   FREE_MAYBE (base_dir);
  98   first_time = 1;
  99 }
 100
 101 /* Reset FIRST_TIME to 1, so that some action can be taken in
 102    recursive_retrieve().  */
 103 void
 104 recursive_reset (void)
 105 {
 106   first_time = 1;
 107 }
 108
 109 /* The core of recursive retrieving.  Endless recursion is avoided by
 110    having all URLs stored to a linked list of URLs, which is checked
 111    before loading any URL.  That way no URL can get loaded twice.
 112
 113    The function also supports specification of maximum recursion depth
 114    and a number of other goodies.  */
 115 uerr_t
 116 recursive_retrieve (const char *file, const char *this_url)
 117 {
 118   char *constr, *filename, *newloc;
 119   char *canon_this_url = NULL;
 120   int dt, inl, dash_p_leaf_HTML = FALSE;
 121   int meta_disallow_follow;
 122   int this_url_ftp;            /* See below the explanation */
 123   uerr_t err;
 124   urlpos *url_list, *cur_url;
 125   struct urlinfo *u;
 126
 127   assert (this_url != NULL);
 128   assert (file != NULL);
 129   /* If quota was exceeded earlier, bail out.  */
 130   if (downloaded_exceeds_quota ())
 131     return QUOTEXC;
 132   /* Cache the current URL in the list.  */
 133   if (first_time)
 134     {
 135       /* These three operations need to be done only once per Wget
 136          run.  They should probably be at a different location.  */
 137       if (!undesirable_urls)
 138         undesirable_urls = make_string_hash_table (0);
 139
 140       hash_table_clear (undesirable_urls);
 141       string_set_add (undesirable_urls, this_url);
 142       /* Enter this_url to the hash table, in original and "enhanced" form.  */
 143       u = newurl ();
 144       err = parseurl (this_url, u, 0);
 145       if (err == URLOK)
 146         {
 147           string_set_add (undesirable_urls, u->url);
 148           if (opt.no_parent)
 149             base_dir = xstrdup (u->dir); /* Set the base dir.  */
 150           /* Set the canonical this_url to be sent as referer.  This
 151              problem exists only when running the first time.  */
 152           canon_this_url = xstrdup (u->url);
 153         }
 154       else
 155         {
 156           DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
 157           base_dir = NULL;
 158         }
 159       freeurl (u, 1);
 160       depth = 1;
 161       first_time = 0;
 162     }
 163   else
 164     ++depth;
 165
 166   if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
 167     /* We've exceeded the maximum recursion depth specified by the user. */
 168     {
 169       if (opt.page_requisites && depth <= opt.reclevel + 1)
 170         /* When -p is specified, we can do one more partial recursion from the
 171            "leaf nodes" on the HTML document tree.  The recursion is partial in
 172            that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
 173            except for <LINK REL="stylesheet">. */
 174         dash_p_leaf_HTML = TRUE;
 175       else
 176         /* Either -p wasn't specified or it was and we've already gone the one
 177            extra (pseudo-)level that it affords us, so we need to bail out. */
 178         {
 179           DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
 180                    depth, opt.reclevel));
 181           --depth;
 182           return RECLEVELEXC;
 183         }
 184     }
 185
 186   /* Determine whether this_url is an FTP URL.  If it is, it means
 187      that the retrieval is done through proxy.  In that case, FTP
 188      links will be followed by default and recursion will not be
 189      turned off when following them.  */
 190   this_url_ftp = (url_scheme (this_url) == SCHEME_FTP);
 191
 192   /* Get the URL-s from an HTML file: */
 193   url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
 194                             dash_p_leaf_HTML, &meta_disallow_follow);
 195
 196   if (opt.use_robots && meta_disallow_follow)
 197     {
 198       /* The META tag says we are not to follow this file.  Respect
 199          that.  */
 200       free_urlpos (url_list);
 201       url_list = NULL;
 202     }
 203
 204   /* Decide what to do with each of the URLs.  A URL will be loaded if
 205      it meets several requirements, discussed later.  */
 206   for (cur_url = url_list; cur_url; cur_url = cur_url->next)
 207     {
 208       /* If quota was exceeded earlier, bail out.  */
 209       if (downloaded_exceeds_quota ())
 210         break;
 211       /* Parse the URL for convenient use in other functions, as well
 212          as to get the optimized form.  It also checks URL integrity.  */
 213       u = newurl ();
 214       if (parseurl (cur_url->url, u, 0) != URLOK)
 215         {
 216           DEBUGP (("Yuck!  A bad URL.\n"));
 217           freeurl (u, 1);
 218           continue;
 219         }
 220       assert (u->url != NULL);
 221       constr = xstrdup (u->url);
 222
 223       /* Several checkings whether a file is acceptable to load:
 224          1. check if URL is ftp, and we don't load it
 225          2. check for relative links (if relative_only is set)
 226          3. check for domain
 227          4. check for no-parent
 228          5. check for excludes && includes
 229          6. check for suffix
 230          7. check for same host (if spanhost is unset), with possible
 231          gethostbyname baggage
 232          8. check for robots.txt
 233
 234          Addendum: If the URL is FTP, and it is to be loaded, only the
 235          domain and suffix settings are "stronger".
 236
 237          Note that .html and (yuck) .htm will get loaded regardless of
 238          suffix rules (but that is remedied later with unlink) unless
 239          the depth equals the maximum depth.
 240
 241          More time- and memory- consuming tests should be put later on
 242          the list.  */
 243
 244       /* inl is set if the URL we are working on (constr) is stored in
 245          undesirable_urls.  Using it is crucial to avoid unnecessary
 246          repeated continuous hits to the hash table.  */
 247       inl = string_set_contains (undesirable_urls, constr);
 248
 249       /* If it is FTP, and FTP is not followed, chuck it out.  */
 250       if (!inl)
 251         if (u->scheme == SCHEME_FTP && !opt.follow_ftp && !this_url_ftp)
 252           {
 253             DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
 254             string_set_add (undesirable_urls, constr);
 255             inl = 1;
 256           }
 257       /* If it is absolute link and they are not followed, chuck it
 258          out.  */
 259       if (!inl && u->scheme != SCHEME_FTP)
 260         if (opt.relative_only && !cur_url->link_relative_p)
 261           {
 262             DEBUGP (("It doesn't really look like a relative link.\n"));
 263             string_set_add (undesirable_urls, constr);
 264             inl = 1;
 265           }
 266       /* If its domain is not to be accepted/looked-up, chuck it out.  */
 267       if (!inl)
 268         if (!accept_domain (u))
 269           {
 270             DEBUGP (("I don't like the smell of that domain.\n"));
 271             string_set_add (undesirable_urls, constr);
 272             inl = 1;
 273           }
 274       /* Check for parent directory.  */
 275       if (!inl && opt.no_parent
 276           /* If the new URL is FTP and the old was not, ignore
 277              opt.no_parent.  */
 278           && !(!this_url_ftp && u->scheme == SCHEME_FTP))
 279         {
 280           /* Check for base_dir first.  */
 281           if (!(base_dir && frontcmp (base_dir, u->dir)))
 282             {
 283               /* Failing that, check for parent dir.  */
 284               struct urlinfo *ut = newurl ();
 285               if (parseurl (this_url, ut, 0) != URLOK)
 286                 DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
 287               else if (!frontcmp (ut->dir, u->dir))
 288                 {
 289                   /* Failing that too, kill the URL.  */
 290                   DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
 291                   string_set_add (undesirable_urls, constr);
 292                   inl = 1;
 293                 }
 294               freeurl (ut, 1);
 295             }
 296         }
 297       /* If the file does not match the acceptance list, or is on the
 298          rejection list, chuck it out.  The same goes for the
 299          directory exclude- and include- lists.  */
 300       if (!inl && (opt.includes || opt.excludes))
 301         {
 302           if (!accdir (u->dir, ALLABS))
 303             {
 304               DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
 305               string_set_add (undesirable_urls, constr);
 306               inl = 1;
 307             }
 308         }
 309       if (!inl)
 310         {
 311           char *suf = NULL;
 312           /* We check for acceptance/rejection rules only for non-HTML
 313              documents.  Since we don't know whether they really are
 314              HTML, it will be deduced from (an OR-ed list):
 315
 316              1) u->file is "" (meaning it is a directory)
 317              2) suffix exists, AND:
 318              a) it is "html", OR
 319              b) it is "htm"
 320
 321              If the file *is* supposed to be HTML, it will *not* be
 322             subject to acc/rej rules, unless a finite maximum depth has
 323             been specified and the current depth is the maximum depth. */
 324           if (!
 325               (!*u->file
 326                || (((suf = suffix (constr)) != NULL)
 327                   && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
 328                       && ((opt.reclevel != INFINITE_RECURSION) &&
 329                           (depth != opt.reclevel))))))
 330             {
 331               if (!acceptable (u->file))
 332                 {
 333                   DEBUGP (("%s (%s) does not match acc/rej rules.\n",
 334                           constr, u->file));
 335                   string_set_add (undesirable_urls, constr);
 336                   inl = 1;
 337                 }
 338             }
 339           FREE_MAYBE (suf);
 340         }
 341       /* Optimize the URL (which includes possible DNS lookup) only
 342          after all other possibilities have been exhausted.  */
 343       if (!inl)
 344         {
 345           if (!opt.simple_check)
 346             opt_url (u);
 347           else
 348             {
 349               char *p;
 350               /* Just lowercase the hostname.  */
 351               for (p = u->host; *p; p++)
 352                 *p = TOLOWER (*p);
 353               xfree (u->url);
 354               u->url = str_url (u, 0);
 355             }
 356           xfree (constr);
 357           constr = xstrdup (u->url);
 358           /* After we have canonicalized the URL, check if we have it
 359              on the black list. */
 360           if (string_set_contains (undesirable_urls, constr))
 361             inl = 1;
 362           /* This line is bogus. */
 363           /*string_set_add (undesirable_urls, constr);*/
 364
 365           if (!inl && !((u->scheme == SCHEME_FTP) && !this_url_ftp))
 366             if (!opt.spanhost && this_url && !same_host (this_url, constr))
 367               {
 368                 DEBUGP (("This is not the same hostname as the parent's.\n"));
 369                 string_set_add (undesirable_urls, constr);
 370                 inl = 1;
 371               }
 372         }
 373       /* What about robots.txt?  */
 374       if (!inl && opt.use_robots && u->scheme == SCHEME_FTP)
 375         {
 376           struct robot_specs *specs = res_get_specs (u->host, u->port);
 377           if (!specs)
 378             {
 379               char *rfile;
 380               if (res_retrieve_file (constr, &rfile))
 381                 {
 382                   specs = res_parse_from_file (rfile);
 383                   xfree (rfile);
 384                 }
 385               else
 386                 {
 387                   /* If we cannot get real specs, at least produce
 388                      dummy ones so that we can register them and stop
 389                      trying to retrieve them.  */
 390                   specs = res_parse ("", 0);
 391                 }
 392               res_register_specs (u->host, u->port, specs);
 393             }
 394
 395           /* Now that we have (or don't have) robots.txt specs, we can
 396              check what they say.  */
 397           if (!res_match_path (specs, u->path))
 398             {
 399               DEBUGP (("Not following %s because robots.txt forbids it.\n",
 400                        constr));
 401               string_set_add (undesirable_urls, constr);
 402               inl = 1;
 403             }
 404         }
 405
 406       filename = NULL;
 407       /* If it wasn't chucked out, do something with it.  */
 408       if (!inl)
 409         {
 410           DEBUGP (("I've decided to load it -> "));
 411           /* Add it to the list of already-loaded URL-s.  */
 412           string_set_add (undesirable_urls, constr);
 413           /* Automatically followed FTPs will *not* be downloaded
 414              recursively.  */
 415           if (u->scheme == SCHEME_FTP)
 416             {
 417               /* Don't you adore side-effects?  */
 418               opt.recursive = 0;
 419             }
 420           /* Reset its type.  */
 421           dt = 0;
 422           /* Retrieve it.  */
 423           retrieve_url (constr, &filename, &newloc,
 424                        canon_this_url ? canon_this_url : this_url, &dt);
 425           if (u->scheme == SCHEME_FTP)
 426             {
 427               /* Restore...  */
 428               opt.recursive = 1;
 429             }
 430           if (newloc)
 431             {
 432               xfree (constr);
 433               constr = newloc;
 434             }
 435           /* If there was no error, and the type is text/html, parse
 436              it recursively.  */
 437           if (dt & TEXTHTML)
 438             {
 439               if (dt & RETROKF)
 440                 recursive_retrieve (filename, constr);
 441             }
 442           else
 443             DEBUGP (("%s is not text/html so we don't chase.\n",
 444                      filename ? filename: "(null)"));
 445
 446           if (opt.delete_after || (filename && !acceptable (filename)))
 447             /* Either --delete-after was specified, or we loaded this otherwise
 448                rejected (e.g. by -R) HTML file just so we could harvest its
 449                hyperlinks -- in either case, delete the local file. */
 450             {
 451               DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
 452                        opt.delete_after ? "--delete-after" :
 453                        "recursive rejection criteria"));
 454               logprintf (LOG_VERBOSE,
 455                          (opt.delete_after ? _("Removing %s.\n")
 456                           : _("Removing %s since it should be rejected.\n")),
 457                          filename);
 458               if (unlink (filename))
 459                 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 460               dt &= ~RETROKF;
 461             }
 462
 463           /* If everything was OK, and links are to be converted, let's
 464              store the local filename.  */
 465           if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
 466             {
 467               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 468               cur_url->local_name = xstrdup (filename);
 469             }
 470         }
 471       else
 472         DEBUGP (("%s already in list, so we don't load.\n", constr));
 473       /* Free filename and constr.  */
 474       FREE_MAYBE (filename);
 475       FREE_MAYBE (constr);
 476       freeurl (u, 1);
 477       /* Increment the pbuf for the appropriate size.  */
 478     }
 479   if (opt.convert_links && !opt.delete_after)
 480     /* This is merely the first pass: the links that have been
 481        successfully downloaded are converted.  In the second pass,
 482        convert_all_links() will also convert those links that have NOT
 483        been downloaded to their canonical form.  */
 484     convert_links (file, url_list);
 485   /* Free the linked list of URL-s.  */
 486   free_urlpos (url_list);
 487   /* Free the canonical this_url.  */
 488   FREE_MAYBE (canon_this_url);
 489   /* Decrement the recursion depth.  */
 490   --depth;
 491   if (downloaded_exceeds_quota ())
 492     return QUOTEXC;
 493   else
 494     return RETROK;
 495 }
 496 \f
 497 void
 498 register_download (const char *url, const char *file)
 499 {
 500   if (!opt.convert_links)
 501     return;
 502   if (!dl_file_url_map)
 503     dl_file_url_map = make_string_hash_table (0);
 504   hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 505   if (!dl_url_file_map)
 506     dl_url_file_map = make_string_hash_table (0);
 507   hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 508 }
 509
 510 void
 511 register_html (const char *url, const char *file)
 512 {
 513   if (!opt.convert_links)
 514     return;
 515   downloaded_html_files = slist_prepend (downloaded_html_files, file);
 516 }
 517
 518 /* convert_links() is called from recursive_retrieve() after we're
 519    done with an HTML file.  This call to convert_links is not complete
 520    because it converts only the downloaded files, and Wget cannot know
 521    which files will be downloaded afterwards.  So, if we have file
 522    fileone.html with:
 523
 524    <a href="/c/something.gif">
 525
 526    and /c/something.gif was not downloaded because it exceeded the
 527    recursion depth, the reference will *not* be changed.
 528
 529    However, later we can encounter /c/something.gif from an "upper"
 530    level HTML (let's call it filetwo.html), and it gets downloaded.
 531
 532    But now we have a problem because /c/something.gif will be
 533    correctly transformed in filetwo.html, but not in fileone.html,
 534    since Wget could not have known that /c/something.gif will be
 535    downloaded in the future.
 536
 537    This is why Wget must, after the whole retrieval, call
 538    convert_all_links to go once more through the entire list of
 539    retrieved HTMLs, and re-convert them.
 540
 541    All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs
 542    in urls_downloaded.  From these two lists information is
 543    extracted.  */
 544 void
 545 convert_all_links (void)
 546 {
 547   slist *html;
 548
 549   /* Destructively reverse downloaded_html_files to get it in the right order.
 550      recursive_retrieve() used slist_prepend() consistently.  */
 551   downloaded_html_files = slist_nreverse (downloaded_html_files);
 552
 553   for (html = downloaded_html_files; html; html = html->next)
 554     {
 555       urlpos *urls, *cur_url;
 556       char *url;
 557
 558       DEBUGP (("Rescanning %s\n", html->string));
 559       /* Determine the URL of the HTML file.  get_urls_html will need
 560          it.  */
 561       url = hash_table_get (dl_file_url_map, html->string);
 562       if (url)
 563         DEBUGP (("It should correspond to %s.\n", url));
 564       else
 565         DEBUGP (("I cannot find the corresponding URL.\n"));
 566       /* Parse the HTML file...  */
 567       urls = get_urls_html (html->string, url, FALSE, NULL);
 568       /* We don't respect meta_disallow_follow here because, even if
 569          the file is not followed, we might still want to convert the
 570          links that have been followed from other files.  */
 571       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 572         {
 573           char *local_name;
 574
 575           /* The URL must be in canonical form to be compared.  */
 576           struct urlinfo *u = newurl ();
 577           uerr_t res = parseurl (cur_url->url, u, 0);
 578           if (res != URLOK)
 579             {
 580               freeurl (u, 1);
 581               continue;
 582             }
 583           /* We decide the direction of conversion according to whether
 584              a URL was downloaded.  Downloaded URLs will be converted
 585              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 586           local_name = hash_table_get (dl_url_file_map, u->url);
 587           if (local_name)
 588             DEBUGP (("%s marked for conversion, local %s\n",
 589                      u->url, local_name));
 590           /* Decide on the conversion direction.  */
 591           if (local_name)
 592             {
 593               /* We've downloaded this URL.  Convert it to relative
 594                  form.  We do this even if the URL already is in
 595                  relative form, because our directory structure may
 596                  not be identical to that on the server (think `-nd',
 597                  `--cut-dirs', etc.)  */
 598               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 599               cur_url->local_name = xstrdup (local_name);
 600             }
 601           else
 602             {
 603               /* We haven't downloaded this URL.  If it's not already
 604                  complete (including a full host name), convert it to
 605                  that form, so it can be reached while browsing this
 606                  HTML locally.  */
 607               if (!cur_url->link_complete_p)
 608                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 609               cur_url->local_name = NULL;
 610             }
 611           freeurl (u, 1);
 612         }
 613       /* Convert the links in the file.  */
 614       convert_links (html->string, urls);
 615       /* Free the data.  */
 616       free_urlpos (urls);
 617     }
 618 }