sjero.net Git - wget/blob - src/recur.c

   1 /* Handling of recursive HTTP retrieving.
   2    Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif /* HAVE_STRING_H */
  29 #ifdef HAVE_UNISTD_H
  30 # include <unistd.h>
  31 #endif /* HAVE_UNISTD_H */
  32 #include <errno.h>
  33 #include <assert.h>
  34 #include <sys/types.h>
  35
  36 #include "wget.h"
  37 #include "url.h"
  38 #include "recur.h"
  39 #include "utils.h"
  40 #include "retr.h"
  41 #include "ftp.h"
  42 #include "fnmatch.h"
  43 #include "host.h"
  44 #include "hash.h"
  45
  46 #ifndef errno
  47 extern int errno;
  48 #endif
  49
  50 extern char *version_string;
  51
/* Name of the robot exclusion file, requested from the root directory
   of each HTTP host.  */
#define ROBOTS_FILENAME "robots.txt"

/* Maps filled in by register_download() when link conversion is
   enabled: local file name -> URL, and URL -> local file name.  Keys
   and values are heap-allocated copies owned by the tables.  */
static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  */
static slist *downloaded_html_files;

/* Set of undesirable-to-load URLs: those already loaded as well as
   those rejected by one of the checks in recursive_retrieve().  */
static struct hash_table *undesirable_urls;

/* NULL-terminated vector of forbidden locations, as returned by
   parse_robots() for the host named by robots_host.  NULL when no
   rules are known.  */
static char **forbidden = NULL;

/* Current recursion depth.  */
static int depth;

/* Base directory we're recursing from (used by no_parent).  */
static char *base_dir;

/* The host name for which we last checked robots.  */
static char *robots_host;

/* Non-zero until recursive_retrieve() has performed its one-time
   initialization; set back to 1 by recursive_reset() and
   recursive_cleanup().  */
static int first_time = 1;

/* Helpers implementing robots.txt support; defined at the end of this
   file.  */
static struct urlinfo *robots_url PARAMS ((const char *, const char *));
static uerr_t retrieve_robots PARAMS ((const char *, const char *));
static char **parse_robots PARAMS ((const char *));
static int robots_match PARAMS ((struct urlinfo *, char **));
  83
  84
  85 /* Cleanup the data structures associated with recursive retrieving
  86    (the variables above).  */
  87 void
  88 recursive_cleanup (void)
  89 {
  90   if (undesirable_urls)
  91     {
  92       string_set_free (undesirable_urls);
  93       undesirable_urls = NULL;
  94     }
  95   if (dl_file_url_map)
  96     {
  97       free_keys_and_values (dl_file_url_map);
  98       hash_table_destroy (dl_file_url_map);
  99       dl_file_url_map = NULL;
 100     }
 101   if (dl_url_file_map)
 102     {
 103       free_keys_and_values (dl_url_file_map);
 104       hash_table_destroy (dl_url_file_map);
 105       dl_url_file_map = NULL;
 106     }
 107   undesirable_urls = NULL;
 108   free_vec (forbidden);
 109   forbidden = NULL;
 110   slist_free (downloaded_html_files);
 111   downloaded_html_files = NULL;
 112   FREE_MAYBE (base_dir);
 113   FREE_MAYBE (robots_host);
 114   first_time = 1;
 115 }
 116
/* Reset FIRST_TIME to 1, so that the next call to
   recursive_retrieve() runs its one-time initialization again:
   it empties the set of undesirable URLs, recomputes the base
   directory and restarts the depth count at 1.  Nothing is freed
   here.  */
void
recursive_reset (void)
{
  first_time = 1;
}
 124
 125 /* The core of recursive retrieving.  Endless recursion is avoided by
 126    having all URLs stored to a linked list of URLs, which is checked
 127    before loading any URL.  That way no URL can get loaded twice.
 128
 129    The function also supports specification of maximum recursion depth
 130    and a number of other goodies.  */
 131 uerr_t
 132 recursive_retrieve (const char *file, const char *this_url)
 133 {
 134   char *constr, *filename, *newloc;
 135   char *canon_this_url = NULL;
 136   int dt, inl, dash_p_leaf_HTML = FALSE;
 137   int meta_disallow_follow;
 138   int this_url_ftp;            /* See below the explanation */
 139   uerr_t err;
 140   struct urlinfo *rurl;
 141   urlpos *url_list, *cur_url;
 142   char *rfile; /* For robots */
 143   struct urlinfo *u;
 144
 145   assert (this_url != NULL);
 146   assert (file != NULL);
 147   /* If quota was exceeded earlier, bail out.  */
 148   if (downloaded_exceeds_quota ())
 149     return QUOTEXC;
 150   /* Cache the current URL in the list.  */
 151   if (first_time)
 152     {
 153       /* These three operations need to be done only once per Wget
 154          run.  They should probably be at a different location.  */
 155       if (!undesirable_urls)
 156         undesirable_urls = make_string_hash_table (0);
 157
 158       hash_table_clear (undesirable_urls);
 159       string_set_add (undesirable_urls, this_url);
 160       /* Enter this_url to the hash table, in original and "enhanced" form.  */
 161       u = newurl ();
 162       err = parseurl (this_url, u, 0);
 163       if (err == URLOK)
 164         {
 165           string_set_add (undesirable_urls, u->url);
 166           if (opt.no_parent)
 167             base_dir = xstrdup (u->dir); /* Set the base dir.  */
 168           /* Set the canonical this_url to be sent as referer.  This
 169              problem exists only when running the first time.  */
 170           canon_this_url = xstrdup (u->url);
 171         }
 172       else
 173         {
 174           DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
 175           base_dir = NULL;
 176         }
 177       freeurl (u, 1);
 178       depth = 1;
 179       robots_host = NULL;
 180       forbidden = NULL;
 181       first_time = 0;
 182     }
 183   else
 184     ++depth;
 185
 186   if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
 187     /* We've exceeded the maximum recursion depth specified by the user. */
 188     {
 189       if (opt.page_requisites && depth <= opt.reclevel + 1)
 190         /* When -p is specified, we can do one more partial recursion from the
 191            "leaf nodes" on the HTML document tree.  The recursion is partial in
 192            that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
 193            except for <LINK REL="stylesheet">. */
 194         dash_p_leaf_HTML = TRUE;
 195       else
 196         /* Either -p wasn't specified or it was and we've already gone the one
 197            extra (pseudo-)level that it affords us, so we need to bail out. */
 198         {
 199           DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
 200                    depth, opt.reclevel));
 201           --depth;
 202           return RECLEVELEXC;
 203         }
 204     }
 205
 206   /* Determine whether this_url is an FTP URL.  If it is, it means
 207      that the retrieval is done through proxy.  In that case, FTP
 208      links will be followed by default and recursion will not be
 209      turned off when following them.  */
 210   this_url_ftp = (urlproto (this_url) == URLFTP);
 211
 212   /* Get the URL-s from an HTML file: */
 213   url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
 214                             dash_p_leaf_HTML, &meta_disallow_follow);
 215
 216   if (opt.use_robots && meta_disallow_follow)
 217     {
 218       /* The META tag says we are not to follow this file.  Respect
 219          that.  */
 220       free_urlpos (url_list);
 221       url_list = NULL;
 222     }
 223
 224   /* Decide what to do with each of the URLs.  A URL will be loaded if
 225      it meets several requirements, discussed later.  */
 226   for (cur_url = url_list; cur_url; cur_url = cur_url->next)
 227     {
 228       /* If quota was exceeded earlier, bail out.  */
 229       if (downloaded_exceeds_quota ())
 230         break;
 231       /* Parse the URL for convenient use in other functions, as well
 232          as to get the optimized form.  It also checks URL integrity.  */
 233       u = newurl ();
 234       if (parseurl (cur_url->url, u, 0) != URLOK)
 235         {
 236           DEBUGP (("Yuck!  A bad URL.\n"));
 237           freeurl (u, 1);
 238           continue;
 239         }
 240       if (u->proto == URLFILE)
 241         {
 242           DEBUGP (("Nothing to do with file:// around here.\n"));
 243           freeurl (u, 1);
 244           continue;
 245         }
 246       assert (u->url != NULL);
 247       constr = xstrdup (u->url);
 248
 249       /* Several checkings whether a file is acceptable to load:
 250          1. check if URL is ftp, and we don't load it
 251          2. check for relative links (if relative_only is set)
 252          3. check for domain
 253          4. check for no-parent
 254          5. check for excludes && includes
 255          6. check for suffix
 256          7. check for same host (if spanhost is unset), with possible
 257          gethostbyname baggage
 258          8. check for robots.txt
 259
 260          Addendum: If the URL is FTP, and it is to be loaded, only the
 261          domain and suffix settings are "stronger".
 262
 263          Note that .html and (yuck) .htm will get loaded regardless of
 264          suffix rules (but that is remedied later with unlink) unless
 265          the depth equals the maximum depth.
 266
 267          More time- and memory- consuming tests should be put later on
 268          the list.  */
 269
 270       /* inl is set if the URL we are working on (constr) is stored in
 271          undesirable_urls.  Using it is crucial to avoid unnecessary
 272          repeated continuous hits to the hash table.  */
 273       inl = string_set_contains (undesirable_urls, constr);
 274
 275       /* If it is FTP, and FTP is not followed, chuck it out.  */
 276       if (!inl)
 277         if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
 278           {
 279             DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
 280             string_set_add (undesirable_urls, constr);
 281             inl = 1;
 282           }
 283       /* If it is absolute link and they are not followed, chuck it
 284          out.  */
 285       if (!inl && u->proto != URLFTP)
 286         if (opt.relative_only && !cur_url->link_relative_p)
 287           {
 288             DEBUGP (("It doesn't really look like a relative link.\n"));
 289             string_set_add (undesirable_urls, constr);
 290             inl = 1;
 291           }
 292       /* If its domain is not to be accepted/looked-up, chuck it out.  */
 293       if (!inl)
 294         if (!accept_domain (u))
 295           {
 296             DEBUGP (("I don't like the smell of that domain.\n"));
 297             string_set_add (undesirable_urls, constr);
 298             inl = 1;
 299           }
 300       /* Check for parent directory.  */
 301       if (!inl && opt.no_parent
 302           /* If the new URL is FTP and the old was not, ignore
 303              opt.no_parent.  */
 304           && !(!this_url_ftp && u->proto == URLFTP))
 305         {
 306           /* Check for base_dir first.  */
 307           if (!(base_dir && frontcmp (base_dir, u->dir)))
 308             {
 309               /* Failing that, check for parent dir.  */
 310               struct urlinfo *ut = newurl ();
 311               if (parseurl (this_url, ut, 0) != URLOK)
 312                 DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
 313               else if (!frontcmp (ut->dir, u->dir))
 314                 {
 315                   /* Failing that too, kill the URL.  */
 316                   DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
 317                   string_set_add (undesirable_urls, constr);
 318                   inl = 1;
 319                 }
 320               freeurl (ut, 1);
 321             }
 322         }
 323       /* If the file does not match the acceptance list, or is on the
 324          rejection list, chuck it out.  The same goes for the
 325          directory exclude- and include- lists.  */
 326       if (!inl && (opt.includes || opt.excludes))
 327         {
 328           if (!accdir (u->dir, ALLABS))
 329             {
 330               DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
 331               string_set_add (undesirable_urls, constr);
 332               inl = 1;
 333             }
 334         }
 335       if (!inl)
 336         {
 337           char *suf = NULL;
 338           /* We check for acceptance/rejection rules only for non-HTML
 339              documents.  Since we don't know whether they really are
 340              HTML, it will be deduced from (an OR-ed list):
 341
 342              1) u->file is "" (meaning it is a directory)
 343              2) suffix exists, AND:
 344              a) it is "html", OR
 345              b) it is "htm"
 346
 347              If the file *is* supposed to be HTML, it will *not* be
 348             subject to acc/rej rules, unless a finite maximum depth has
 349             been specified and the current depth is the maximum depth. */
 350           if (!
 351               (!*u->file
 352                || (((suf = suffix (constr)) != NULL)
 353                   && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
 354                       && ((opt.reclevel != INFINITE_RECURSION) &&
 355                           (depth != opt.reclevel))))))
 356             {
 357               if (!acceptable (u->file))
 358                 {
 359                   DEBUGP (("%s (%s) does not match acc/rej rules.\n",
 360                           constr, u->file));
 361                   string_set_add (undesirable_urls, constr);
 362                   inl = 1;
 363                 }
 364             }
 365           FREE_MAYBE (suf);
 366         }
 367       /* Optimize the URL (which includes possible DNS lookup) only
 368          after all other possibilities have been exhausted.  */
 369       if (!inl)
 370         {
 371           if (!opt.simple_check)
 372             opt_url (u);
 373           else
 374             {
 375               char *p;
 376               /* Just lowercase the hostname.  */
 377               for (p = u->host; *p; p++)
 378                 *p = TOLOWER (*p);
 379               xfree (u->url);
 380               u->url = str_url (u, 0);
 381             }
 382           xfree (constr);
 383           constr = xstrdup (u->url);
 384           string_set_add (undesirable_urls, constr);
 385           if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
 386             if (!opt.spanhost && this_url && !same_host (this_url, constr))
 387               {
 388                 DEBUGP (("This is not the same hostname as the parent's.\n"));
 389                 string_set_add (undesirable_urls, constr);
 390                 inl = 1;
 391               }
 392         }
 393       /* What about robots.txt?  */
 394       if (!inl && opt.use_robots && u->proto == URLHTTP)
 395         {
 396           /* Since Wget knows about only one set of robot rules at a
 397              time, /robots.txt must be reloaded whenever a new host is
 398              accessed.
 399
 400              robots_host holds the host the current `forbid' variable
 401              is assigned to.  */
 402           if (!robots_host || !same_host (robots_host, u->host))
 403             {
 404               FREE_MAYBE (robots_host);
 405               /* Now make robots_host the new host, no matter what the
 406                  result will be.  So if there is no /robots.txt on the
 407                  site, Wget will not retry getting robots all the
 408                  time.  */
 409               robots_host = xstrdup (u->host);
 410               free_vec (forbidden);
 411               forbidden = NULL;
 412               err = retrieve_robots (constr, ROBOTS_FILENAME);
 413               if (err == ROBOTSOK)
 414                 {
 415                   rurl = robots_url (constr, ROBOTS_FILENAME);
 416                   rfile = url_filename (rurl);
 417                   forbidden = parse_robots (rfile);
 418                   freeurl (rurl, 1);
 419                   xfree (rfile);
 420                 }
 421             }
 422
 423           /* Now that we have (or don't have) robots, we can check for
 424              them.  */
 425           if (!robots_match (u, forbidden))
 426             {
 427               DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
 428                        ROBOTS_FILENAME));
 429               string_set_add (undesirable_urls, constr);
 430               inl = 1;
 431             }
 432         }
 433
 434       filename = NULL;
 435       /* If it wasn't chucked out, do something with it.  */
 436       if (!inl)
 437         {
 438           DEBUGP (("I've decided to load it -> "));
 439           /* Add it to the list of already-loaded URL-s.  */
 440           string_set_add (undesirable_urls, constr);
 441           /* Automatically followed FTPs will *not* be downloaded
 442              recursively.  */
 443           if (u->proto == URLFTP)
 444             {
 445               /* Don't you adore side-effects?  */
 446               opt.recursive = 0;
 447             }
 448           /* Reset its type.  */
 449           dt = 0;
 450           /* Retrieve it.  */
 451           retrieve_url (constr, &filename, &newloc,
 452                        canon_this_url ? canon_this_url : this_url, &dt);
 453           if (u->proto == URLFTP)
 454             {
 455               /* Restore...  */
 456               opt.recursive = 1;
 457             }
 458           if (newloc)
 459             {
 460               xfree (constr);
 461               constr = newloc;
 462             }
 463           /* If there was no error, and the type is text/html, parse
 464              it recursively.  */
 465           if (dt & TEXTHTML)
 466             {
 467               if (dt & RETROKF)
 468                 recursive_retrieve (filename, constr);
 469             }
 470           else
 471             DEBUGP (("%s is not text/html so we don't chase.\n",
 472                      filename ? filename: "(null)"));
 473
 474           if (opt.delete_after || (filename && !acceptable (filename)))
 475             /* Either --delete-after was specified, or we loaded this otherwise
 476                rejected (e.g. by -R) HTML file just so we could harvest its
 477                hyperlinks -- in either case, delete the local file. */
 478             {
 479               DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
 480                        opt.delete_after ? "--delete-after" :
 481                        "recursive rejection criteria"));
 482               logprintf (LOG_VERBOSE,
 483                          (opt.delete_after ? _("Removing %s.\n")
 484                           : _("Removing %s since it should be rejected.\n")),
 485                          filename);
 486               if (unlink (filename))
 487                 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 488               dt &= ~RETROKF;
 489             }
 490
 491           /* If everything was OK, and links are to be converted, let's
 492              store the local filename.  */
 493           if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
 494             {
 495               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 496               cur_url->local_name = xstrdup (filename);
 497             }
 498         }
 499       else
 500         DEBUGP (("%s already in list, so we don't load.\n", constr));
 501       /* Free filename and constr.  */
 502       FREE_MAYBE (filename);
 503       FREE_MAYBE (constr);
 504       freeurl (u, 1);
 505       /* Increment the pbuf for the appropriate size.  */
 506     }
 507   if (opt.convert_links && !opt.delete_after)
 508     /* This is merely the first pass: the links that have been
 509        successfully downloaded are converted.  In the second pass,
 510        convert_all_links() will also convert those links that have NOT
 511        been downloaded to their canonical form.  */
 512     convert_links (file, url_list);
 513   /* Free the linked list of URL-s.  */
 514   free_urlpos (url_list);
 515   /* Free the canonical this_url.  */
 516   FREE_MAYBE (canon_this_url);
 517   /* Decrement the recursion depth.  */
 518   --depth;
 519   if (downloaded_exceeds_quota ())
 520     return QUOTEXC;
 521   else
 522     return RETROK;
 523 }
 524 \f
 525 void
 526 register_download (const char *url, const char *file)
 527 {
 528   if (!opt.convert_links)
 529     return;
 530   if (!dl_file_url_map)
 531     dl_file_url_map = make_string_hash_table (0);
 532   hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 533   if (!dl_url_file_map)
 534     dl_url_file_map = make_string_hash_table (0);
 535   hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 536 }
 537
 538 void
 539 register_html (const char *url, const char *file)
 540 {
 541   if (!opt.convert_links)
 542     return;
 543   downloaded_html_files = slist_prepend (downloaded_html_files, file);
 544 }
 545
 546 /* convert_links() is called from recursive_retrieve() after we're
 547    done with an HTML file.  This call to convert_links is not complete
 548    because it converts only the downloaded files, and Wget cannot know
 549    which files will be downloaded afterwards.  So, if we have file
 550    fileone.html with:
 551
 552    <a href="/c/something.gif">
 553
 554    and /c/something.gif was not downloaded because it exceeded the
 555    recursion depth, the reference will *not* be changed.
 556
 557    However, later we can encounter /c/something.gif from an "upper"
 558    level HTML (let's call it filetwo.html), and it gets downloaded.
 559
 560    But now we have a problem because /c/something.gif will be
 561    correctly transformed in filetwo.html, but not in fileone.html,
 562    since Wget could not have known that /c/something.gif will be
 563    downloaded in the future.
 564
 565    This is why Wget must, after the whole retrieval, call
 566    convert_all_links to go once more through the entire list of
 567    retrieved HTMLs, and re-convert them.
 568
 569    All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs
 570    in urls_downloaded.  From these two lists information is
 571    extracted.  */
 572 void
 573 convert_all_links (void)
 574 {
 575   slist *html;
 576
 577   /* Destructively reverse downloaded_html_files to get it in the right order.
 578      recursive_retrieve() used slist_prepend() consistently.  */
 579   downloaded_html_files = slist_nreverse (downloaded_html_files);
 580
 581   for (html = downloaded_html_files; html; html = html->next)
 582     {
 583       urlpos *urls, *cur_url;
 584       char *url;
 585
 586       DEBUGP (("Rescanning %s\n", html->string));
 587       /* Determine the URL of the HTML file.  get_urls_html will need
 588          it.  */
 589       url = hash_table_get (dl_file_url_map, html->string);
 590       if (url)
 591         DEBUGP (("It should correspond to %s.\n", url));
 592       else
 593         DEBUGP (("I cannot find the corresponding URL.\n"));
 594       /* Parse the HTML file...  */
 595       urls = get_urls_html (html->string, url, FALSE, NULL);
 596       /* We don't respect meta_disallow_follow here because, even if
 597          the file is not followed, we might still want to convert the
 598          links that have been followed from other files.  */
 599       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 600         {
 601           char *local_name;
 602
 603           /* The URL must be in canonical form to be compared.  */
 604           struct urlinfo *u = newurl ();
 605           uerr_t res = parseurl (cur_url->url, u, 0);
 606           if (res != URLOK)
 607             {
 608               freeurl (u, 1);
 609               continue;
 610             }
 611           /* We decide the direction of conversion according to whether
 612              a URL was downloaded.  Downloaded URLs will be converted
 613              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 614           local_name = hash_table_get (dl_url_file_map, u->url);
 615           if (local_name)
 616             DEBUGP (("%s marked for conversion, local %s\n",
 617                      u->url, local_name));
 618           /* Decide on the conversion direction.  */
 619           if (local_name)
 620             {
 621               /* We've downloaded this URL.  Convert it to relative
 622                  form.  We do this even if the URL already is in
 623                  relative form, because our directory structure may
 624                  not be identical to that on the server (think `-nd',
 625                  `--cut-dirs', etc.)  */
 626               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 627               cur_url->local_name = xstrdup (local_name);
 628             }
 629           else
 630             {
 631               /* We haven't downloaded this URL.  If it's not already
 632                  complete (including a full host name), convert it to
 633                  that form, so it can be reached while browsing this
 634                  HTML locally.  */
 635               if (!cur_url->link_complete_p)
 636                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 637               cur_url->local_name = NULL;
 638             }
 639           freeurl (u, 1);
 640         }
 641       /* Convert the links in the file.  */
 642       convert_links (html->string, urls);
 643       /* Free the data.  */
 644       free_urlpos (urls);
 645     }
 646 }
 647 \f
 648 /* Robots support.  */
 649
 650 /* Construct the robots URL.  */
 651 static struct urlinfo *
 652 robots_url (const char *url, const char *robots_filename)
 653 {
 654   struct urlinfo *u = newurl ();
 655   uerr_t err;
 656
 657   err = parseurl (url, u, 0);
 658   assert (err == URLOK && u->proto == URLHTTP);
 659   xfree (u->file);
 660   xfree (u->dir);
 661   xfree (u->url);
 662   u->dir = xstrdup ("");
 663   u->file = xstrdup (robots_filename);
 664   u->url = str_url (u, 0);
 665   return u;
 666 }
 667
 668 /* Retrieves the robots_filename from the root server directory, if
 669    possible.  Returns ROBOTSOK if robots were retrieved OK, and
 670    NOROBOTS if robots could not be retrieved for any reason.  */
 671 static uerr_t
 672 retrieve_robots (const char *url, const char *robots_filename)
 673 {
 674   int dt;
 675   uerr_t err;
 676   struct urlinfo *u;
 677
 678   u = robots_url (url, robots_filename);
 679   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
 680   err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
 681   freeurl (u, 1);
 682   if (err == RETROK)
 683     return ROBOTSOK;
 684   else
 685     return NOROBOTS;
 686 }
 687
 688 /* Parse the robots_filename and return the disallowed path components
 689    in a malloc-ed vector of character pointers.
 690
 691    It should be fully compliant with the syntax as described in the
 692    file norobots.txt, adopted by the robots mailing list
 693    (robots@webcrawler.com).  */
 694 static char **
 695 parse_robots (const char *robots_filename)
 696 {
 697   FILE *fp;
 698   char **entries;
 699   char *line, *cmd, *str, *p;
 700   char *base_version, *version;
 701   int num, i;
 702   int wget_matched;             /* is the part meant for Wget?  */
 703
 704   entries = NULL;
 705
 706   num = 0;
 707   fp = fopen (robots_filename, "rb");
 708   if (!fp)
 709     return NULL;
 710
 711   /* Kill version number.  */
 712   if (opt.useragent)
 713     {
 714       STRDUP_ALLOCA (base_version, opt.useragent);
 715       STRDUP_ALLOCA (version, opt.useragent);
 716     }
 717   else
 718     {
 719       int len = 10 + strlen (version_string);
 720       base_version = (char *)alloca (len);
 721       sprintf (base_version, "Wget/%s", version_string);
 722       version = (char *)alloca (len);
 723       sprintf (version, "Wget/%s", version_string);
 724     }
 725   for (p = version; *p; p++)
 726     *p = TOLOWER (*p);
 727   for (p = base_version; *p && *p != '/'; p++)
 728     *p = TOLOWER (*p);
 729   *p = '\0';
 730
 731   /* Setting this to 1 means that Wget considers itself under
 732      restrictions by default, even if the User-Agent field is not
 733      present.  However, if it finds the user-agent set to anything
 734      other than Wget, the rest will be ignored (up to the following
 735      User-Agent field).  Thus you may have something like:
 736
 737      Disallow: 1
 738      Disallow: 2
 739      User-Agent: stupid-robot
 740      Disallow: 3
 741      Disallow: 4
 742      User-Agent: Wget*
 743      Disallow: 5
 744      Disallow: 6
 745      User-Agent: *
 746      Disallow: 7
 747
 748      In this case the 1, 2, 5, 6 and 7 disallow lines will be
 749      stored.  */
 750   wget_matched = 1;
 751   while ((line = read_whole_line (fp)))
 752     {
 753       int len = strlen (line);
 754       /* Destroy <CR><LF> if present.  */
 755       if (len && line[len - 1] == '\n')
 756         line[--len] = '\0';
 757       if (len && line[len - 1] == '\r')
 758         line[--len] = '\0';
 759       /* According to specifications, optional space may be at the
 760          end...  */
 761       DEBUGP (("Line: %s\n", line));
 762       /* Skip spaces.  */
 763       for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
 764       if (!*cmd)
 765         {
 766           xfree (line);
 767           DEBUGP (("(chucked out)\n"));
 768           continue;
 769         }
 770       /* Look for ':'.  */
 771       for (str = cmd; *str && *str != ':'; str++);
 772       if (!*str)
 773         {
 774           xfree (line);
 775           DEBUGP (("(chucked out)\n"));
 776           continue;
 777         }
 778       /* Zero-terminate the command.  */
 779       *str++ = '\0';
 780       /* Look for the string beginning...  */
 781       for (; *str && ISSPACE (*str); str++);
 782       /* Look for comments or trailing spaces and kill them off.  */
 783       for (p = str; *p; p++)
 784         if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
 785           {
 786             /* We have found either a shell-style comment `<sp>+#' or some
 787                trailing spaces.  Now rewind to the beginning of the spaces
 788                and place '\0' there.  */
 789             while (p > str && ISSPACE (*p))
 790               --p;
 791             if (p == str)
 792               *p = '\0';
 793             else
 794               *(p + 1) = '\0';
 795             break;
 796           }
 797       if (!strcasecmp (cmd, "User-agent"))
 798         {
 799           int match = 0;
 800           /* Lowercase the agent string.  */
 801           for (p = str; *p; p++)
 802             *p = TOLOWER (*p);
 803           /* If the string is `*', it matches.  */
 804           if (*str == '*' && !*(str + 1))
 805             match = 1;
 806           else
 807             {
 808               /* If the string contains wildcards, we'll run it through
 809                  fnmatch().  */
 810               if (has_wildcards_p (str))
 811                 {
 812                   /* If the string contains '/', compare with the full
 813                      version.  Else, compare it to base_version.  */
 814                   if (strchr (str, '/'))
 815                     match = !fnmatch (str, version, 0);
 816                   else
 817                     match = !fnmatch (str, base_version, 0);
 818                 }
 819               else                /* Substring search */
 820                 {
 821                   if (strstr (version, str))
 822                     match = 1;
 823                   else
 824                     match = 0;
 825                 }
 826             }
 827           /* If Wget is not matched, skip all the entries up to the
 828              next User-agent field.  */
 829           wget_matched = match;
 830         }
 831       else if (!wget_matched)
 832         {
 833           xfree (line);
 834           DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
 835           continue;
 836         }
 837       else if (!strcasecmp (cmd, "Disallow"))
 838         {
 839           /* If "Disallow" is empty, the robot is welcome.  */
 840           if (!*str)
 841             {
 842               free_vec (entries);
 843               entries = (char **)xmalloc (sizeof (char *));
 844               *entries = NULL;
 845               num = 0;
 846             }
 847           else
 848             {
 849               entries = (char **)xrealloc (entries, (num + 2)* sizeof (char *));
 850               entries[num] = xstrdup (str);
 851               entries[++num] = NULL;
 852               /* Strip trailing spaces, according to specifications.  */
 853               for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--)
 854                 if (ISSPACE (str[i]))
 855                   str[i] = '\0';
 856             }
 857         }
 858       else
 859         {
 860           /* unknown command */
 861           DEBUGP (("(chucked out)\n"));
 862         }
 863       xfree (line);
 864     }
 865   fclose (fp);
 866   return entries;
 867 }
 868
 869 /* May the URL url be loaded according to disallowing rules stored in
 870    forbidden?  */
 871 static int
 872 robots_match (struct urlinfo *u, char **fb)
 873 {
 874   int l;
 875
 876   if (!fb)
 877     return 1;
 878   DEBUGP (("Matching %s against: ", u->path));
 879   for (; *fb; fb++)
 880     {
 881       DEBUGP (("%s ", *fb));
 882       l = strlen (*fb);
 883       /* If dir is fb, we may not load the file.  */
 884       if (strncmp (u->path, *fb, l) == 0)
 885         {
 886           DEBUGP (("matched.\n"));
 887           return 0; /* Matches, i.e. does not load...  */
 888         }
 889     }
 890   DEBUGP (("not matched.\n"));
 891   return 1;
 892 }