/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.

   This file is part of Wget.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <sys/types.h>

extern char *version_string;

#define ROBOTS_FILENAME "robots.txt"

static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  */
static slist *downloaded_html_files;

/* List of undesirable-to-load URLs.  */
static struct hash_table *undesirable_urls;

/* List of forbidden locations.  */
static char **forbidden = NULL;

/* Current recursion depth.  */
static int depth;

/* Base directory we're recursing from (used by no_parent).  */
static char *base_dir;

/* The host name for which we last checked robots.  */
static char *robots_host;

static int first_time = 1;

/* Construct the robots URL.  */
static struct urlinfo *robots_url PARAMS ((const char *, const char *));
static uerr_t retrieve_robots PARAMS ((const char *, const char *));
static char **parse_robots PARAMS ((const char *));
static int robots_match PARAMS ((struct urlinfo *, char **));
/* Cleanup the data structures associated with recursive retrieving
   (the variables above).  */
void
recursive_cleanup (void)
{
  if (undesirable_urls)
    {
      string_set_free (undesirable_urls);
      undesirable_urls = NULL;
    }
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  undesirable_urls = NULL;
  free_vec (forbidden);
  forbidden = NULL;
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
  FREE_MAYBE (base_dir);
  FREE_MAYBE (robots_host);
}

/* Reset FIRST_TIME to 1, so that some action can be taken in
   recursive_retrieve().  */
void
recursive_reset (void)
{
  first_time = 1;
}
/* The core of recursive retrieving.  Endless recursion is avoided by
   having all URLs stored to a linked list of URLs, which is checked
   before loading any URL.  That way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
uerr_t
recursive_retrieve (const char *file, const char *this_url)
{
  char *constr, *filename, *newloc;
  char *canon_this_url = NULL;
  int dt, inl, dash_p_leaf_HTML = FALSE;
  int meta_disallow_follow;
  int this_url_ftp;            /* See the explanation below.  */
  uerr_t err;
  char *suf;
  struct urlinfo *rurl, *u;
  urlpos *url_list, *cur_url;
  char *rfile; /* For robots */

  assert (this_url != NULL);
  assert (file != NULL);
  /* If quota was exceeded earlier, bail out.  */
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  /* Cache the current URL in the list.  */
  if (first_time)
    {
      /* These three operations need to be done only once per Wget
         run.  They should probably be at a different location.  */
      if (!undesirable_urls)
        undesirable_urls = make_string_hash_table (0);

      hash_table_clear (undesirable_urls);
      string_set_add (undesirable_urls, this_url);
      if (dl_file_url_map)
        hash_table_clear (dl_file_url_map);
      if (dl_url_file_map)
        hash_table_clear (dl_url_file_map);
      /* Enter this_url into the hash table, in original and "enhanced" form.  */
      u = newurl ();
      err = parseurl (this_url, u, 0);
      if (err == URLOK)
        {
          string_set_add (undesirable_urls, u->url);
          if (opt.no_parent)
            base_dir = xstrdup (u->dir); /* Set the base dir.  */
          /* Set the canonical this_url to be sent as referer.  This
             matters only on the first run.  */
          canon_this_url = xstrdup (u->url);
        }
      else
        {
          DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
          base_dir = NULL;
        }
      freeurl (u, 1);
      depth = 1;
      first_time = 0;
    }
  else
    ++depth;
  if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
    {
      /* We've exceeded the maximum recursion depth specified by the user.  */
      if (opt.page_requisites && depth <= opt.reclevel + 1)
        {
          /* When -p is specified, we can do one more partial recursion from the
             "leaf nodes" on the HTML document tree.  The recursion is partial in
             that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
             except for <LINK REL="stylesheet">.  */
          dash_p_leaf_HTML = TRUE;
        }
      else
        {
          /* Either -p wasn't specified or it was and we've already gone the one
             extra (pseudo-)level that it affords us, so we need to bail out.  */
          DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
                   depth, opt.reclevel));
          return RETROK;
        }
    }
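  /* To make the -p interplay concrete (illustrative note, not from
     the original source): with `-l 2 -p', pages at depth 1 and 2 are
     fully recursed into, while HTML reached at depth 3 gets the
     dash_p_leaf_HTML treatment -- its <IMG> and
     <LINK REL="stylesheet"> requisites are still fetched, but its
     <A> and <AREA> links are not followed, so recursion stops
     there.  */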
  /* Determine whether this_url is an FTP URL.  If it is, it means
     that the retrieval is done through proxy.  In that case, FTP
     links will be followed by default and recursion will not be
     turned off when following them.  */
  this_url_ftp = (urlproto (this_url) == URLFTP);

  /* Get the URLs from an HTML file: */
  url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
                            dash_p_leaf_HTML, &meta_disallow_follow);

  if (opt.use_robots && meta_disallow_follow)
    {
      /* The META tag says we are not to follow this file.  Respect
         that.  */
      free_urlpos (url_list);
      url_list = NULL;
    }
  /* Decide what to do with each of the URLs.  A URL will be loaded if
     it meets several requirements, discussed later.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
    {
      /* If quota was exceeded earlier, bail out.  */
      if (downloaded_exceeds_quota ())
        break;
      /* Parse the URL for convenient use in other functions, as well
         as to get the optimized form.  It also checks URL integrity.  */
      u = newurl ();
      if (parseurl (cur_url->url, u, 0) != URLOK)
        {
          DEBUGP (("Yuck!  A bad URL.\n"));
          freeurl (u, 1);
          continue;
        }
      if (u->proto == URLFILE)
        {
          DEBUGP (("Nothing to do with file:// around here.\n"));
          freeurl (u, 1);
          continue;
        }
      assert (u->url != NULL);
      constr = xstrdup (u->url);
      /* Several checks to see whether a file is acceptable to load:
         1. check if URL is ftp, and we don't load it
         2. check for relative links (if relative_only is set)
         3. check for domain
         4. check for no-parent
         5. check for excludes && includes
         6. check for suffix
         7. check for same host (if spanhost is unset), with possible
            gethostbyname baggage
         8. check for robots.txt

         Addendum: If the URL is FTP, and it is to be loaded, only the
         domain and suffix settings are "stronger".

         Note that .html and (yuck) .htm will get loaded regardless of
         suffix rules (but that is remedied later with unlink) unless
         the depth equals the maximum depth.

         More time- and memory-consuming tests should be put later on
         the list.  */
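      /* A note on that ordering (added for clarity, not in the
         original): the checks are roughly sorted by cost.  The
         string_set_exists() lookup below is a cheap hash probe, the
         same-host test may trigger a DNS lookup, and the robots.txt
         test may require a whole network fetch -- hence the cheap,
         purely local tests run first.  */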
      /* inl is set if the URL we are working on (constr) is stored in
         undesirable_urls.  Using it is crucial to avoid unnecessary
         repeated continuous hits to the hash table.  */
      inl = string_set_exists (undesirable_urls, constr);

      /* If it is FTP, and FTP is not followed, chuck it out.  */
      if (!inl)
        if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
          {
            DEBUGP (("Uh, it is FTP but I'm not in the mood to follow FTP.\n"));
            string_set_add (undesirable_urls, constr);
            inl = 1;
          }
      /* If it is an absolute link and absolute links are not to be
         followed, chuck it out.  */
      if (!inl && u->proto != URLFTP)
        if (opt.relative_only && !cur_url->link_relative_p)
          {
            DEBUGP (("It doesn't really look like a relative link.\n"));
            string_set_add (undesirable_urls, constr);
            inl = 1;
          }
      /* If its domain is not to be accepted/looked-up, chuck it out.  */
      if (!inl)
        if (!accept_domain (u))
          {
            DEBUGP (("I don't like the smell of that domain.\n"));
            string_set_add (undesirable_urls, constr);
            inl = 1;
          }
      /* Check for parent directory.  */
      if (!inl && opt.no_parent
          /* If the new URL is FTP and the old was not, ignore
             opt.no_parent.  */
          && !(!this_url_ftp && u->proto == URLFTP))
        {
          /* Check for base_dir first.  */
          if (!(base_dir && frontcmp (base_dir, u->dir)))
            {
              /* Failing that, check for parent dir.  */
              struct urlinfo *ut = newurl ();
              if (parseurl (this_url, ut, 0) != URLOK)
                DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
              else if (!frontcmp (ut->dir, u->dir))
                {
                  /* Failing that too, kill the URL.  */
                  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
                  string_set_add (undesirable_urls, constr);
                  inl = 1;
                }
              freeurl (ut, 1);
            }
        }
      /* If the file does not match the acceptance list, or is on the
         rejection list, chuck it out.  The same goes for the
         directory exclude and include lists.  */
      if (!inl && (opt.includes || opt.excludes))
        {
          if (!accdir (u->dir, ALLABS))
            {
              DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
              string_set_add (undesirable_urls, constr);
              inl = 1;
            }
        }
      /* We check for acceptance/rejection rules only for non-HTML
         documents.  Since we don't know whether they really are
         HTML, it will be deduced from (an OR-ed list):

         1) u->file is "" (meaning it is a directory)
         2) suffix exists, AND:
         3) suffix is neither .html nor .htm

         If the file *is* supposed to be HTML, it will *not* be
         subject to acc/rej rules, unless a finite maximum depth has
         been specified and the current depth is the maximum depth.  */
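      /* Worked example (illustrative, not from the original source):
         with `-R gif -l 3', a link to pic.gif is always matched
         against the rejection list and chucked out, while page.html
         is exempt at depths 1 and 2.  At depth 3 (the maximum),
         page.html would not be recursed into anyway, so it too is
         subject to the acc/rej rules.  */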
          || (((suf = suffix (constr)) != NULL)
              && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
                  && ((opt.reclevel != INFINITE_RECURSION) &&
                      (depth != opt.reclevel))))))
        {
          if (!acceptable (u->file))
            {
              DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                       constr, u->file));
              string_set_add (undesirable_urls, constr);
              inl = 1;
            }
        }
      /* Optimize the URL (which includes possible DNS lookup) only
         after all other possibilities have been exhausted.  */
      if (!inl)
        {
          if (!opt.simple_check)
            opt_url (u);
          else
            {
              char *p;

              /* Just lowercase the hostname.  */
              for (p = u->host; *p; p++)
                *p = TOLOWER (*p);
              xfree (u->url);
              u->url = str_url (u, 0);
            }
          xfree (constr);
          constr = xstrdup (u->url);
          string_set_add (undesirable_urls, constr);
          if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
            if (!opt.spanhost && this_url && !same_host (this_url, constr))
              {
                DEBUGP (("This is not the same hostname as the parent's.\n"));
                string_set_add (undesirable_urls, constr);
                inl = 1;
              }
        }
      /* What about robots.txt?  */
      if (!inl && opt.use_robots && u->proto == URLHTTP)
        {
          /* Since Wget knows about only one set of robot rules at a
             time, /robots.txt must be reloaded whenever a new host is
             accessed.

             robots_host holds the host the current `forbidden' list
             applies to.  */
          if (!robots_host || !same_host (robots_host, u->host))
            {
              FREE_MAYBE (robots_host);
              /* Now make robots_host the new host, no matter what the
                 result will be.  So if there is no /robots.txt on the
                 site, Wget will not retry getting robots all the
                 time.  */
              robots_host = xstrdup (u->host);
              free_vec (forbidden);
              forbidden = NULL;
              err = retrieve_robots (constr, ROBOTS_FILENAME);
              if (err == ROBOTSOK)
                {
                  rurl = robots_url (constr, ROBOTS_FILENAME);
                  rfile = url_filename (rurl);
                  forbidden = parse_robots (rfile);
                  freeurl (rurl, 1);
                  xfree (rfile);
                }
            }

          /* Now that we have (or don't have) robots, we can check for
             them.  */
          if (!robots_match (u, forbidden))
            {
              DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
                       ROBOTS_FILENAME));
              string_set_add (undesirable_urls, constr);
              inl = 1;
            }
        }
      /* If it wasn't chucked out, do something with it.  */
      if (!inl)
        {
          DEBUGP (("I've decided to load it -> "));
          /* Add it to the list of already-loaded URLs.  */
          string_set_add (undesirable_urls, constr);
          /* Automatically followed FTPs will *not* be downloaded
             recursively.  */
          if (u->proto == URLFTP)
            {
              /* Don't you adore side-effects?  */
              opt.recursive = 0;
            }
          /* Reset its type.  */
          dt = 0;
          /* Retrieve it.  */
          retrieve_url (constr, &filename, &newloc,
                        canon_this_url ? canon_this_url : this_url, &dt);
          if (u->proto == URLFTP)
            {
              /* Restore the recursive flag.  */
              opt.recursive = 1;
            }
          if (newloc)
            {
              xfree (constr);
              constr = newloc;
            }
          /* If there was no error, and the type is text/html, parse
             it recursively.  */
          if (dt & TEXTHTML)
            {
              if (dt & RETROKF)
                recursive_retrieve (filename, constr);
            }
          else
            DEBUGP (("%s is not text/html so we don't chase.\n",
                     filename ? filename : "(null)"));
          if (opt.delete_after || (filename && !acceptable (filename)))
            {
              /* Either --delete-after was specified, or we loaded this otherwise
                 rejected (e.g. by -R) HTML file just so we could harvest its
                 hyperlinks -- in either case, delete the local file.  */
              DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                       opt.delete_after ? "--delete-after" :
                       "recursive rejection criteria"));
              logprintf (LOG_VERBOSE,
                         (opt.delete_after ? _("Removing %s.\n")
                          : _("Removing %s since it should be rejected.\n")),
                         filename);
              if (unlink (filename))
                logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
              dt &= ~RETROKF;
            }
          /* If everything was OK, and links are to be converted, let's
             store the local filename.  */
          if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
            {
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (filename);
            }
        }
      else
        DEBUGP (("%s already in list, so we don't load.\n", constr));
      /* Free filename and constr.  */
      FREE_MAYBE (filename);
      FREE_MAYBE (constr);
      freeurl (u, 1);
      /* Increment the pbuf for the appropriate size.  */
    }
  if (opt.convert_links && !opt.delete_after)
    /* This is merely the first pass: the links that have been
       successfully downloaded are converted.  In the second pass,
       convert_all_links() will also convert those links that have NOT
       been downloaded to their canonical form.  */
    convert_links (file, url_list);
  /* Free the linked list of URLs.  */
  free_urlpos (url_list);
  /* Free the canonical this_url.  */
  FREE_MAYBE (canon_this_url);
  /* Decrement the recursion depth.  */
  --depth;
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else
    return RETROK;
}
/* Register that URL has been downloaded to FILE.  */
void
register_download (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  if (!dl_file_url_map)
    dl_file_url_map = make_string_hash_table (0);
  hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
  if (!dl_url_file_map)
    dl_url_file_map = make_string_hash_table (0);
  hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}

/* Register that FILE is an HTML file downloaded from URL.  */
void
register_html (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  downloaded_html_files = slist_prepend (downloaded_html_files, file);
}
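/* Illustrative sketch (not part of the original source): how the two
   maps fit together.  Given a local file name one can recover the URL
   it came from, and vice versa, using the same hash_table_get()
   accessor that convert_all_links() relies on below.  */
#if 0
static void
example_map_roundtrip (const char *file)
{
  char *url = hash_table_get (dl_file_url_map, file);      /* file -> URL */
  if (url)
    {
      char *back = hash_table_get (dl_url_file_map, url);  /* URL -> file */
      DEBUGP (("%s was saved from %s (local copy: %s)\n", file, url, back));
    }
}
#endif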
/* convert_links() is called from recursive_retrieve() after we're
   done with an HTML file.  This call to convert_links is not complete
   because it converts only the downloaded files: Wget cannot know
   which files will be downloaded afterwards.  So, if we have a file
   fileone.html with:

       <a href="/c/something.gif">

   and /c/something.gif was not downloaded because it exceeded the
   recursion depth, the reference will *not* be changed.

   However, later we can encounter /c/something.gif from an "upper"
   level HTML (let's call it filetwo.html), and it gets downloaded.

   But now we have a problem because /c/something.gif will be
   correctly transformed in filetwo.html, but not in fileone.html,
   since Wget could not have known that /c/something.gif would be
   downloaded in the future.

   This is why Wget must, after the whole retrieval, call
   convert_all_links() to go once more through the entire list of
   retrieved HTML files, and re-convert them.

   All the downloaded HTML files are kept in downloaded_html_files,
   and the downloaded URLs in dl_url_file_map.  From these two lists
   the information needed for conversion is extracted.  */
void
convert_all_links (void)
{
  slist *html;

  /* Destructively reverse downloaded_html_files to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently.  */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));
      /* Determine the URL of the HTML file.  get_urls_html() will need
         it.  */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
        DEBUGP (("It should correspond to %s.\n", url));
      else
        DEBUGP (("I cannot find the corresponding URL.\n"));
      /* Parse the HTML file...  */
      urls = get_urls_html (html->string, url, FALSE, NULL);
      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */
      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;

          /* The URL must be in canonical form to be compared.  */
          struct urlinfo *u = newurl ();
          uerr_t res = parseurl (cur_url->url, u, 0);
          if (res != URLOK)
            {
              freeurl (u, 1);
              continue;
            }
          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
          local_name = hash_table_get (dl_url_file_map, u->url);
          if (local_name)
            DEBUGP (("%s marked for conversion, local %s\n",
                     u->url, local_name));
          /* Decide on the conversion direction.  */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
            }
          freeurl (u, 1);
        }
      /* Convert the links in the file.  */
      convert_links (html->string, urls);
      free_urlpos (urls);
    }
}
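/* A concrete before/after picture of the two directions (illustrative
   comment, not from the original source; file names hypothetical).
   Suppose http://host/a/index.html was saved as a/index.html and
   contained:

       <a href="/a/next.html">    -- downloaded, saved as a/next.html
       <img src="/c/pic.gif">     -- not downloaded

   After convert_all_links() the local copy reads:

       <a href="next.html">                  (CO_CONVERT_TO_RELATIVE)
       <img src="http://host/c/pic.gif">     (CO_CONVERT_TO_COMPLETE)  */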
/* Robots support.  */

/* Construct the robots URL.  */
static struct urlinfo *
robots_url (const char *url, const char *robots_filename)
{
  struct urlinfo *u = newurl ();
  uerr_t err;

  err = parseurl (url, u, 0);
  assert (err == URLOK && u->proto == URLHTTP);
  xfree (u->file);
  xfree (u->dir);
  xfree (u->url);
  u->dir = xstrdup ("");
  u->file = xstrdup (robots_filename);
  u->url = str_url (u, 0);
  return u;
}
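/* For example (illustrative): robots_url
   ("http://www.example.com/dir/page.html", ROBOTS_FILENAME) yields a
   urlinfo whose ->url is "http://www.example.com/robots.txt" -- the
   directory is emptied and the file name replaced, so the robots file
   is always fetched from the server root.  */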
/* Retrieves the robots_filename from the root server directory, if
   possible.  Returns ROBOTSOK if robots were retrieved OK, and
   NOROBOTS if robots could not be retrieved for any reason.  */
static uerr_t
retrieve_robots (const char *url, const char *robots_filename)
{
  int dt;
  uerr_t err;
  struct urlinfo *u;

  u = robots_url (url, robots_filename);
  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
  freeurl (u, 1);
  if (err == RETROK)
    return ROBOTSOK;
  else
    return NOROBOTS;
}
/* Parse the robots_filename and return the disallowed path components
   in a malloc-ed vector of character pointers.

   It should be fully compliant with the syntax as described in the
   file norobots.txt, adopted by the robots mailing list
   (robots@webcrawler.com).  */
static char **
parse_robots (const char *robots_filename)
{
  FILE *fp;
  char **entries;
  char *line, *cmd, *str, *p;
  char *base_version, *version;
  int num, i;
  int wget_matched;             /* is the part meant for Wget?  */

  entries = NULL;
  num = 0;

  fp = fopen (robots_filename, "rb");
  if (!fp)
    return NULL;
  /* Kill version number.  */
  if (opt.useragent)
    {
      STRDUP_ALLOCA (base_version, opt.useragent);
      STRDUP_ALLOCA (version, opt.useragent);
    }
  else
    {
      int len = 10 + strlen (version_string);
      base_version = (char *)alloca (len);
      sprintf (base_version, "Wget/%s", version_string);
      version = (char *)alloca (len);
      sprintf (version, "Wget/%s", version_string);
    }
  for (p = version; *p; p++)
    *p = TOLOWER (*p);
  for (p = base_version; *p && *p != '/'; p++)
    *p = TOLOWER (*p);
  *p = '\0';
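  /* E.g. (illustrative): with version_string "1.7" and no custom
     user agent, version becomes "wget/1.7", while base_version is cut
     at the slash, leaving just "wget".  User-agent lines are matched
     against these lowercased forms below.  */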
  /* Setting this to 1 means that Wget considers itself under
     restrictions by default, even if the User-Agent field is not
     present.  However, if it finds the user-agent set to anything
     other than Wget, the rest will be ignored (up to the following
     User-Agent field).  Thus you may have something like:

     Disallow: 1
     Disallow: 2
     User-Agent: stupid-robot
     Disallow: 3
     Disallow: 4
     User-Agent: Wget
     Disallow: 5
     Disallow: 6
     User-Agent: *
     Disallow: 7

     In this case the 1, 2, 5, 6 and 7 disallow lines will be
     stored.  */
  wget_matched = 1;
  while ((line = read_whole_line (fp)))
    {
      int len = strlen (line);
      /* Destroy <CR><LF> if present.  */
      if (len && line[len - 1] == '\n')
        line[--len] = '\0';
      if (len && line[len - 1] == '\r')
        line[--len] = '\0';
      /* According to specifications, optional space may be at the
         end of a line.  */
      DEBUGP (("Line: %s\n", line));
      for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
      if (!*cmd)
        {
          xfree (line);
          DEBUGP (("(chucked out)\n"));
          continue;
        }
      /* Look for ':'.  */
      for (str = cmd; *str && *str != ':'; str++);
      if (!*str)
        {
          xfree (line);
          DEBUGP (("(chucked out)\n"));
          continue;
        }
      /* Zero-terminate the command.  */
      *str++ = '\0';
      /* Look for the string beginning...  */
      for (; *str && ISSPACE (*str); str++);
      /* Look for comments or trailing spaces and kill them off.  */
      for (p = str; *p; p++)
        if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
          {
            /* We have found either a shell-style comment `<sp>+#' or some
               trailing spaces.  Now rewind to the beginning of the spaces
               and place '\0' there.  */
            while (p > str && ISSPACE (*p))
              --p;
            *(p + 1) = '\0';
            break;
          }
      if (!strcasecmp (cmd, "User-agent"))
        {
          int match = 0;
          /* Lowercase the agent string.  */
          for (p = str; *p; p++)
            *p = TOLOWER (*p);
          /* If the string is `*', it matches.  */
          if (*str == '*' && !*(str + 1))
            match = 1;
          else
            {
              /* If the string contains wildcards, we'll run it through
                 fnmatch().  */
              if (has_wildcards_p (str))
                {
                  /* If the string contains '/', compare with the full
                     version.  Else, compare it to base_version.  */
                  if (strchr (str, '/'))
                    match = !fnmatch (str, version, 0);
                  else
                    match = !fnmatch (str, base_version, 0);
                }
              else              /* Substring search */
                {
                  if (strstr (version, str))
                    match = 1;
                  else
                    match = 0;
                }
            }
          /* If Wget is not matched, skip all the entries up to the
             next User-agent field.  */
          wget_matched = match;
        }
      else if (!wget_matched)
        {
          xfree (line);
          DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
          continue;
        }
      else if (!strcasecmp (cmd, "Disallow"))
        {
          /* If "Disallow" is empty, the robot is welcome.  */
          if (!*str)
            {
              free_vec (entries);
              entries = (char **)xmalloc (sizeof (char *));
              *entries = NULL;
              num = 0;
            }
          else
            {
              /* Strip trailing spaces first, according to
                 specifications, so that the stored copy is the
                 stripped one.  */
              for (i = strlen (str) - 1; i >= 0 && ISSPACE (str[i]); i--)
                str[i] = '\0';
              entries = (char **)xrealloc (entries, (num + 2) * sizeof (char *));
              entries[num] = xstrdup (str);
              entries[++num] = NULL;
            }
        }
      else
        {
          /* unknown command */
          xfree (line);
          DEBUGP (("(chucked out)\n"));
          continue;
        }
      xfree (line);
    }
  fclose (fp);
  return entries;
}
/* May the URL url be loaded according to disallowing rules stored in
   forbidden?  */
static int
robots_match (struct urlinfo *u, char **fb)
{
  int l;

  if (!fb)
    return 1;
  DEBUGP (("Matching %s against: ", u->path));
  for (; *fb; fb++)
    {
      DEBUGP (("%s ", *fb));
      l = strlen (*fb);
      /* If dir is fb, we may not load the file.  */
      if (strncmp (u->path, *fb, l) == 0)
        {
          DEBUGP (("matched.\n"));
          return 0; /* Matches, i.e. does not load...  */
        }
    }
  DEBUGP (("not matched.\n"));
  return 1;
}
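/* Example of the prefix semantics above (illustrative): with
   forbidden = { "/cgi-bin/", "/tmp", NULL },

       /cgi-bin/search   -> refused  ("/cgi-bin/" is a prefix)
       /tmp/scratch      -> refused  ("/tmp" is a prefix)
       /tmpest.html      -> refused  (plain prefix match, as the
                                      robots exclusion rules specify)
       /home.html        -> loaded   (no rule matches)  */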