/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.

This file is part of Wget.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */

#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <sys/types.h>

extern char *version_string;

#define ROBOTS_FILENAME "robots.txt"

static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;
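/* The two tables above are mirror images of each other:
   dl_file_url_map maps a local file name to the URL it was retrieved
   from, and dl_url_file_map maps that URL back to the local file
   name.  As a purely illustrative (hypothetical) example, after
   downloading http://www.example.com/index.html the first table
   might map "www.example.com/index.html" to
   "http://www.example.com/index.html", and the second table holds
   the reverse mapping.  */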

/* List of HTML URLs.  */
static slist *urls_html;

/* List of undesirable-to-load URLs.  */
static struct hash_table *undesirable_urls;
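/* Note that this set does double duty: URLs rejected by the checks
   in recursive_retrieve() are added to it, and so is every URL that
   actually gets loaded, so a repeated occurrence of the same link is
   skipped with a single hash lookup.  */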

/* List of forbidden locations.  */
static char **forbidden = NULL;
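/* For instance, after parse_robots() has read a (hypothetical)
   robots.txt containing "Disallow: /cgi-bin/" and "Disallow: /tmp/",
   this vector would hold { "/cgi-bin/", "/tmp/", NULL }.  */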

/* Current recursion depth.  */
static int depth;

/* Base directory we're recursing from (used by no_parent).  */
static char *base_dir;

/* The host name for which we last checked robots.  */
static char *robots_host;

static int first_time = 1;

/* Construct the robots URL.  */
static struct urlinfo *robots_url PARAMS ((const char *, const char *));
static uerr_t retrieve_robots PARAMS ((const char *, const char *));
static char **parse_robots PARAMS ((const char *));
static int robots_match PARAMS ((struct urlinfo *, char **));

/* Cleanup the data structures associated with recursive retrieving
   (the variables above).  */
recursive_cleanup (void)
string_set_free (undesirable_urls);
undesirable_urls = NULL;
free_keys_and_values (dl_file_url_map);
hash_table_destroy (dl_file_url_map);
dl_file_url_map = NULL;
free_keys_and_values (dl_url_file_map);
hash_table_destroy (dl_url_file_map);
dl_url_file_map = NULL;
undesirable_urls = NULL;
free_vec (forbidden);
slist_free (urls_html);
FREE_MAYBE (base_dir);
FREE_MAYBE (robots_host);

/* Reset FIRST_TIME to 1, so that some action can be taken in
   recursive_retrieve().  */
recursive_reset (void)
first_time = 1;

/* The core of recursive retrieving.  Endless recursion is avoided by
   having all URLs stored to a linked list of URLs, which is checked
   before loading any URL.  That way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
recursive_retrieve (const char *file, const char *this_url)
char *constr, *filename, *newloc;
char *canon_this_url = NULL;
int dt, inl, dash_p_leaf_HTML = FALSE;
int meta_disallow_follow;
int this_url_ftp;            /* See the explanation below.  */
struct urlinfo *rurl;
urlpos *url_list, *cur_url;
char *rfile; /* For robots */

assert (this_url != NULL);
assert (file != NULL);
/* If quota was exceeded earlier, bail out.  */
if (downloaded_exceeds_quota ())
  return QUOTEXC;
/* Cache the current URL in the list.  */
/* These three operations need to be done only once per Wget
   run.  They should probably be at a different location.  */
if (!undesirable_urls)
  undesirable_urls = make_string_hash_table (0);
if (!dl_file_url_map)
  dl_file_url_map = make_string_hash_table (0);
if (!dl_url_file_map)
  dl_url_file_map = make_string_hash_table (0);
hash_table_clear (undesirable_urls);
string_set_add (undesirable_urls, this_url);
hash_table_clear (dl_file_url_map);
hash_table_clear (dl_url_file_map);
/* Enter this_url to the hash table, in original and "enhanced" form.  */
err = parseurl (this_url, u, 0);
string_set_add (undesirable_urls, u->url);
hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (u->url));
hash_table_put (dl_url_file_map, xstrdup (u->url), xstrdup (file));
urls_html = slist_prepend (urls_html, file);
base_dir = xstrdup (u->dir); /* Set the base dir.  */
/* Set the canonical this_url to be sent as referer.  This
   problem exists only when running the first time.  */
canon_this_url = xstrdup (u->url);
DEBUGP (("Double yuck!  The *base* URL is broken.\n"));

if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
/* We've exceeded the maximum recursion depth specified by the user. */
if (opt.page_requisites && depth <= opt.reclevel + 1)
/* When -p is specified, we can do one more partial recursion from the
   "leaf nodes" on the HTML document tree.  The recursion is partial in
   that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
   except for <LINK REL="stylesheet">.  */
dash_p_leaf_HTML = TRUE;
/* Either -p wasn't specified or it was and we've already gone the one
   extra (pseudo-)level that it affords us, so we need to bail out.  */
DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
         depth, opt.reclevel));

/* Determine whether this_url is an FTP URL.  If it is, it means
   that the retrieval is done through proxy.  In that case, FTP
   links will be followed by default and recursion will not be
   turned off when following them.  */
this_url_ftp = (urlproto (this_url) == URLFTP);

/* Get the URL-s from an HTML file: */
url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
                          dash_p_leaf_HTML, &meta_disallow_follow);

if (opt.use_robots && meta_disallow_follow)
/* The META tag says we are not to follow this file.  Respect
   that.  */
free_urlpos (url_list);

/* Decide what to do with each of the URLs.  A URL will be loaded if
   it meets several requirements, discussed later.  */
for (cur_url = url_list; cur_url; cur_url = cur_url->next)
/* If quota was exceeded earlier, bail out.  */
if (downloaded_exceeds_quota ())
  break;
/* Parse the URL for convenient use in other functions, as well
   as to get the optimized form.  It also checks URL integrity.  */
if (parseurl (cur_url->url, u, 0) != URLOK)
DEBUGP (("Yuck!  A bad URL.\n"));
if (u->proto == URLFILE)
DEBUGP (("Nothing to do with file:// around here.\n"));
assert (u->url != NULL);
constr = xstrdup (u->url);

/* Several checks to determine whether a file is acceptable to load:
   1. check if URL is ftp, and we don't load it
   2. check for relative links (if relative_only is set)
   3. check for domain
   4. check for no-parent
   5. check for excludes && includes
   6. check for suffix
   7. check for same host (if spanhost is unset), with possible
   gethostbyname baggage
   8. check for robots.txt

   Addendum: If the URL is FTP, and it is to be loaded, only the
   domain and suffix settings are "stronger".

   Note that .html and (yuck) .htm will get loaded regardless of
   suffix rules (but that is remedied later with unlink) unless
   the depth equals the maximum depth.

   More time- and memory-consuming tests should be put later on
   the list.  */

/* inl is set if the URL we are working on (constr) is stored in
   undesirable_urls.  Using it is crucial to avoid unnecessary
   repeated continuous hits to the hash table.  */
inl = string_set_exists (undesirable_urls, constr);

/* If it is FTP, and FTP is not followed, chuck it out.  */
if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
string_set_add (undesirable_urls, constr);
/* If it is an absolute link and such links are not followed, chuck it
   out.  */
if (!inl && u->proto != URLFTP)
if (opt.relative_only && !cur_url->link_relative_p)
DEBUGP (("It doesn't really look like a relative link.\n"));
string_set_add (undesirable_urls, constr);
/* If its domain is not to be accepted/looked-up, chuck it out.  */
if (!accept_domain (u))
DEBUGP (("I don't like the smell of that domain.\n"));
string_set_add (undesirable_urls, constr);
/* Check for parent directory.  */
if (!inl && opt.no_parent
    /* If the new URL is FTP and the old was not, ignore
       opt.no_parent.  */
    && !(!this_url_ftp && u->proto == URLFTP))
/* Check for base_dir first.  */
if (!(base_dir && frontcmp (base_dir, u->dir)))
/* Failing that, check for parent dir.  */
struct urlinfo *ut = newurl ();
if (parseurl (this_url, ut, 0) != URLOK)
DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
else if (!frontcmp (ut->dir, u->dir))
/* Failing that too, kill the URL.  */
DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
string_set_add (undesirable_urls, constr);
/* If the file does not match the acceptance list, or is on the
   rejection list, chuck it out.  The same goes for the
   directory exclude- and include- lists.  */
if (!inl && (opt.includes || opt.excludes))
if (!accdir (u->dir, ALLABS))
DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
string_set_add (undesirable_urls, constr);
/* We check for acceptance/rejection rules only for non-HTML
   documents.  Since we don't know whether they really are
   HTML, it will be deduced from (an OR-ed list):

   1) u->file is "" (meaning it is a directory)
   2) suffix exists, AND:
      a) it is "html", OR
      b) it is "htm"

   If the file *is* supposed to be HTML, it will *not* be
   subject to acc/rej rules, unless a finite maximum depth has
   been specified and the current depth is the maximum depth.  */
|| (((suf = suffix (constr)) != NULL)
    && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
        && ((opt.reclevel != INFINITE_RECURSION) &&
            (depth != opt.reclevel))))))
if (!acceptable (u->file))
DEBUGP (("%s (%s) does not match acc/rej rules.\n",
         constr, u->file));
string_set_add (undesirable_urls, constr);
/* Optimize the URL (which includes possible DNS lookup) only
   after all other possibilities have been exhausted.  */
if (!opt.simple_check)
  opt_url (u);
else
/* Just lowercase the hostname.  */
for (p = u->host; *p; p++)
  *p = TOLOWER (*p);
u->url = str_url (u, 0);
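/* In this branch a host written as, say, WWW.Example.COM (a
   hypothetical name) simply ends up stored as www.example.com, so
   the canonical URL kept in the hash tables uses a single spelling
   of the host.  */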
constr = xstrdup (u->url);
string_set_add (undesirable_urls, constr);
if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
if (!opt.spanhost && this_url && !same_host (this_url, constr))
DEBUGP (("This is not the same hostname as the parent's.\n"));
string_set_add (undesirable_urls, constr);

/* What about robots.txt?  */
if (!inl && opt.use_robots && u->proto == URLHTTP)
/* Since Wget knows about only one set of robot rules at a
   time, /robots.txt must be reloaded whenever a new host is
   accessed.

   robots_host holds the host the current `forbid' variable
   is assigned to.  */
if (!robots_host || !same_host (robots_host, u->host))
FREE_MAYBE (robots_host);
/* Now make robots_host the new host, no matter what the
   result will be.  So if there is no /robots.txt on the
   site, Wget will not retry getting robots all the
   time.  */
robots_host = xstrdup (u->host);
free_vec (forbidden);
err = retrieve_robots (constr, ROBOTS_FILENAME);
rurl = robots_url (constr, ROBOTS_FILENAME);
rfile = url_filename (rurl);
forbidden = parse_robots (rfile);

/* Now that we have (or don't have) robots, we can check for
   them.  */
if (!robots_match (u, forbidden))
DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
         ROBOTS_FILENAME));
string_set_add (undesirable_urls, constr);

/* If it wasn't chucked out, do something with it.  */
DEBUGP (("I've decided to load it -> "));
/* Add it to the list of already-loaded URL-s.  */
string_set_add (undesirable_urls, constr);
/* Automatically followed FTPs will *not* be downloaded
   recursively.  */
if (u->proto == URLFTP)
/* Don't you adore side-effects?  */
/* Reset its type.  */
retrieve_url (constr, &filename, &newloc,
              canon_this_url ? canon_this_url : this_url, &dt);
if (u->proto == URLFTP)
/* In case of convert_links: If there was no error, add it to
   the list of downloaded URLs.  We might need it for
   conversion.  */
if (opt.convert_links && filename)
hash_table_put (dl_file_url_map,
                xstrdup (filename), xstrdup (constr));
hash_table_put (dl_url_file_map,
                xstrdup (constr), xstrdup (filename));
/* If the URL is HTML, note it.  */
urls_html = slist_prepend (urls_html, filename);
/* If there was no error, and the type is text/html, parse
   it recursively.  */
recursive_retrieve (filename, constr);
DEBUGP (("%s is not text/html so we don't chase.\n",
         filename ? filename: "(null)"));

if (opt.delete_after || (filename && !acceptable (filename)))
/* Either --delete-after was specified, or we loaded this otherwise
   rejected (e.g. by -R) HTML file just so we could harvest its
   hyperlinks -- in either case, delete the local file.  */
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
         opt.delete_after ? "--delete-after" :
         "recursive rejection criteria"));
logprintf (LOG_VERBOSE,
           (opt.delete_after ? _("Removing %s.\n")
            : _("Removing %s since it should be rejected.\n")),
           filename);
if (unlink (filename))
logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));

/* If everything was OK, and links are to be converted, let's
   store the local filename.  */
if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
cur_url->convert = CO_CONVERT_TO_RELATIVE;
cur_url->local_name = xstrdup (filename);
525 DEBUGP (("%s already in list, so we don't load.\n", constr));
526 /* Free filename and constr. */
527 FREE_MAYBE (filename);
530 /* Increment the pbuf for the appropriate size. */
532 if (opt.convert_links && !opt.delete_after)
533 /* This is merely the first pass: the links that have been
534 successfully downloaded are converted. In the second pass,
535 convert_all_links() will also convert those links that have NOT
536 been downloaded to their canonical form. */
537 convert_links (file, url_list);
538 /* Free the linked list of URL-s. */
539 free_urlpos (url_list);
540 /* Free the canonical this_url. */
541 FREE_MAYBE (canon_this_url);
542 /* Decrement the recursion depth. */
544 if (downloaded_exceeds_quota ())

/* convert_links() is called from recursive_retrieve() after we're
   done with an HTML file.  This call to convert_links is not complete
   because it converts only the downloaded files, and Wget cannot know
   which files will be downloaded afterwards.  So, if we have file
   fileone.html with:

   <a href="/c/something.gif">

   and /c/something.gif was not downloaded because it exceeded the
   recursion depth, the reference will *not* be changed.

   However, later we can encounter /c/something.gif from an "upper"
   level HTML (let's call it filetwo.html), and it gets downloaded.

   But now we have a problem because /c/something.gif will be
   correctly transformed in filetwo.html, but not in fileone.html,
   since Wget could not have known that /c/something.gif will be
   downloaded in the future.

   This is why Wget must, after the whole retrieval, call
   convert_all_links to go once more through the entire list of
   retrieved HTMLs, and re-convert them.

   All the downloaded HTMLs are kept in urls_html, and downloaded URLs
   in urls_downloaded.  From these two lists information is
   extracted.  */
convert_all_links (void)
/* Destructively reverse urls_html to get it in the right order.
   recursive_retrieve() used slist_prepend() consistently.  */
urls_html = slist_nreverse (urls_html);

for (html = urls_html; html; html = html->next)
urlpos *urls, *cur_url;

DEBUGP (("Rescanning %s\n", html->string));
/* Determine the URL of the HTML file.  get_urls_html will need
   it.  */
url = hash_table_get (dl_file_url_map, html->string);
if (url)
  DEBUGP (("It should correspond to %s.\n", url));
else
  DEBUGP (("I cannot find the corresponding URL.\n"));
/* Parse the HTML file...  */
urls = get_urls_html (html->string, url, FALSE, NULL);
/* We don't respect meta_disallow_follow here because, even if
   the file is not followed, we might still want to convert the
   links that have been followed from other files.  */
for (cur_url = urls; cur_url; cur_url = cur_url->next)
/* The URL must be in canonical form to be compared.  */
struct urlinfo *u = newurl ();
uerr_t res = parseurl (cur_url->url, u, 0);
/* We decide the direction of conversion according to whether
   a URL was downloaded.  Downloaded URLs will be converted
   ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
local_name = hash_table_get (dl_url_file_map, u->url);
DEBUGP (("%s marked for conversion, local %s\n",
         u->url, local_name));
/* Decide on the conversion direction.  */
/* We've downloaded this URL.  Convert it to relative
   form.  We do this even if the URL already is in
   relative form, because our directory structure may
   not be identical to that on the server (think `-nd',
   `--cut-dirs', etc.)  */
cur_url->convert = CO_CONVERT_TO_RELATIVE;
cur_url->local_name = xstrdup (local_name);
/* We haven't downloaded this URL.  If it's not already
   complete (including a full host name), convert it to
   that form, so it can be reached while browsing this
   HTML locally.  */
if (!cur_url->link_complete_p)
cur_url->convert = CO_CONVERT_TO_COMPLETE;
cur_url->local_name = NULL;
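/* As an illustration (with made-up names): if /images/a.gif was
   downloaded to the local file images/a.gif, a reference to it in
   this document is rewritten as a relative link to that local file;
   if it was never downloaded, the reference is instead completed to
   a full URL such as http://www.example.com/images/a.gif, so it
   still works when the saved page is browsed locally.  */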
/* Convert the links in the file.  */
convert_links (html->string, urls);

/* Robots support.  */

/* Construct the robots URL.  */
static struct urlinfo *
robots_url (const char *url, const char *robots_filename)
struct urlinfo *u = newurl ();

err = parseurl (url, u, 0);
assert (err == URLOK && u->proto == URLHTTP);
u->dir = xstrdup ("");
u->file = xstrdup (robots_filename);
u->url = str_url (u, 0);
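/* For instance (hypothetical host): given
   http://www.example.com/some/dir/page.html, this yields a urlinfo
   whose url is http://www.example.com/robots.txt.  */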

/* Retrieves the robots_filename from the root server directory, if
   possible.  Returns ROBOTSOK if robots were retrieved OK, and
   NOROBOTS if robots could not be retrieved for any reason.  */
retrieve_robots (const char *url, const char *robots_filename)
u = robots_url (url, robots_filename);
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
err = retrieve_url (u->url, NULL, NULL, NULL, &dt);

/* Parse the robots_filename and return the disallowed path components
   in a malloc-ed vector of character pointers.

   It should be fully compliant with the syntax as described in the
   file norobots.txt, adopted by the robots mailing list
   (robots@webcrawler.com).  */
parse_robots (const char *robots_filename)
char *line, *cmd, *str, *p;
char *base_version, *version;
int wget_matched;		/* is the part meant for Wget?  */

fp = fopen (robots_filename, "rb");

/* Kill version number.  */
STRDUP_ALLOCA (base_version, opt.useragent);
STRDUP_ALLOCA (version, opt.useragent);
int len = 10 + strlen (version_string);
base_version = (char *)alloca (len);
sprintf (base_version, "Wget/%s", version_string);
version = (char *)alloca (len);
sprintf (version, "Wget/%s", version_string);
for (p = version; *p; p++)
  *p = TOLOWER (*p);
for (p = base_version; *p && *p != '/'; p++)
  *p = TOLOWER (*p);
*p = '\0';

/* Setting this to 1 means that Wget considers itself under
   restrictions by default, even if the User-Agent field is not
   present.  However, if it finds the user-agent set to anything
   other than Wget, the rest will be ignored (up to the following
   User-Agent field).  Thus you may have something like:

   Disallow: 1
   Disallow: 2
   User-Agent: stupid-robot
   Disallow: 3
   Disallow: 4
   User-Agent: Wget
   Disallow: 5
   Disallow: 6
   User-Agent: *
   Disallow: 7

   In this case the 1, 2, 5, 6 and 7 disallow lines will be
   stored.  */
wget_matched = 1;
while ((line = read_whole_line (fp)))
int len = strlen (line);
/* Destroy <CR><LF> if present.  */
if (len && line[len - 1] == '\n')
  line[--len] = '\0';
if (len && line[len - 1] == '\r')
  line[--len] = '\0';
/* According to specifications, optional space may be at the
   end...  */
DEBUGP (("Line: %s\n", line));
for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
DEBUGP (("(chucked out)\n"));
for (str = cmd; *str && *str != ':'; str++);
DEBUGP (("(chucked out)\n"));
/* Zero-terminate the command.  */
*str++ = '\0';
/* Look for the string beginning...  */
for (; *str && ISSPACE (*str); str++);
/* Look for comments or trailing spaces and kill them off.  */
for (p = str; *p; p++)
if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
/* We have found either a shell-style comment `<sp>+#' or some
   trailing spaces.  Now rewind to the beginning of the spaces
   and place '\0' there.  */
while (p > str && ISSPACE (*p))
  --p;
if (!strcasecmp (cmd, "User-agent"))
/* Lowercase the agent string.  */
for (p = str; *p; p++)
  *p = TOLOWER (*p);
/* If the string is `*', it matches.  */
if (*str == '*' && !*(str + 1))
  match = 1;
/* If the string contains wildcards, we'll run it through
   fnmatch().  */
if (has_wildcards_p (str))
/* If the string contains '/', compare with the full
   version.  Else, compare it to base_version.  */
if (strchr (str, '/'))
  match = !fnmatch (str, version, 0);
else
  match = !fnmatch (str, base_version, 0);
else		/* Substring search */
if (strstr (version, str))
  match = 1;
/* If Wget is not matched, skip all the entries up to the
   next User-agent field.  */
wget_matched = match;
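/* To make the rule above concrete (with a hypothetical version
   string "wget/1.7" and base_version "wget"): a field of
   "User-Agent: *" or "User-Agent: Wget" matches via the substring
   search, "User-Agent: wget*" is run through fnmatch() against
   "wget" and also matches, while something like
   "User-Agent: stupid-robot" does not match, so its Disallow lines
   are skipped.  */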
else if (!wget_matched)
DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
else if (!strcasecmp (cmd, "Disallow"))
/* If "Disallow" is empty, the robot is welcome.  */
entries = (char **)xmalloc (sizeof (char *));
entries = (char **)xrealloc (entries, (num + 2)* sizeof (char *));
entries[num] = xstrdup (str);
entries[++num] = NULL;
/* Strip trailing spaces, according to specifications.  */
for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--)
if (ISSPACE (str[i]))
  str[i] = '\0';
/* unknown command */
DEBUGP (("(chucked out)\n"));

/* May the URL url be loaded according to disallowing rules stored in
   forbidden?  */
robots_match (struct urlinfo *u, char **fb)
DEBUGP (("Matching %s against: ", u->path));
DEBUGP (("%s ", *fb));
l = strlen (*fb);
/* If dir is fb, we may not load the file.  */
if (strncmp (u->path, *fb, l) == 0)
DEBUGP (("matched.\n"));
return 0; /* Matches, i.e. does not load...  */
DEBUGP (("not matched.\n"));