sjero.net Git - wget/blob - src/recur.c

   1 /* Handling of recursive HTTP retrieving.
   2    Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif /* HAVE_STRING_H */
  29 #ifdef HAVE_UNISTD_H
  30 # include <unistd.h>
  31 #endif /* HAVE_UNISTD_H */
  32 #include <errno.h>
  33 #include <assert.h>
  34 #include <ctype.h>
  35 #include <sys/types.h>
  36
  37 #include "wget.h"
  38 #include "url.h"
  39 #include "recur.h"
  40 #include "utils.h"
  41 #include "retr.h"
  42 #include "ftp.h"
  43 #include "fnmatch.h"
  44 #include "host.h"
  45
extern char *version_string;

/* Name of the robot-exclusion file fetched from the root of each
   HTTP host.  */
#define ROBOTS_FILENAME "robots.txt"

/* #### Many of these lists should really be hashtables!  */

/* List of downloaded URLs, each paired with the local file name it
   was saved under.  */
static urlpos *urls_downloaded;

/* List of the local file names of downloaded HTML documents.  */
static slist *urls_html;

/* List of URLs that must not be loaded (again).  recursive_retrieve()
   adds every URL here once it has either decided to load it or
   rejected it.  */
static slist *ulist;

/* Path prefixes disallowed by the robots file of robots_host, or NULL
   if there are none.  */
static char **forbidden = NULL;

/* Current recursion depth.  */
static int depth;

/* Base directory we're recursing from (used by no_parent).  */
static char *base_dir;

/* The host name for which we last checked robots.  */
static char *robots_host;

/* Non-zero until recursive_retrieve() has performed its one-time
   initialization of the variables above.  */
static int first_time = 1;

/* Forward declarations of the robots helpers defined at the end of
   this file.  */
static struct urlinfo *robots_url PARAMS ((const char *, const char *));
static uerr_t retrieve_robots PARAMS ((const char *, const char *));
static char **parse_robots PARAMS ((const char *));
static int robots_match PARAMS ((struct urlinfo *, char **));

  80
  81
  82 /* Cleanup the data structures associated with recursive retrieving
  83    (the variables above).  */
  84 void
  85 recursive_cleanup (void)
  86 {
  87   free_slist (ulist);
  88   ulist = NULL;
  89   free_vec (forbidden);
  90   forbidden = NULL;
  91   free_slist (urls_html);
  92   urls_html = NULL;
  93   free_urlpos (urls_downloaded);
  94   urls_downloaded = NULL;
  95   FREE_MAYBE (base_dir);
  96   FREE_MAYBE (robots_host);
  97   first_time = 1;
  98 }
  99
 100 /* Reset FIRST_TIME to 1, so that some action can be taken in
 101    recursive_retrieve().  */
 102 void
 103 recursive_reset (void)
 104 {
 105   first_time = 1;
 106 }
 107
 108 /* The core of recursive retrieving.  Endless recursion is avoided by
 109    having all URL-s stored to a linked list of URL-s, which is checked
 110    before loading any URL.  That way no URL can get loaded twice.
 111
 112    The function also supports specification of maximum recursion depth
 113    and a number of other goodies.  */
 114 uerr_t
 115 recursive_retrieve (const char *file, const char *this_url)
 116 {
 117   char *constr, *filename, *newloc;
 118   char *canon_this_url = NULL;
 119   int dt, inl;
 120   int this_url_ftp;            /* See below the explanation */
 121   uerr_t err;
 122   struct urlinfo *rurl;
 123   urlpos *url_list, *cur_url;
 124   char *rfile; /* For robots */
 125   struct urlinfo *u;
 126
 127   assert (this_url != NULL);
 128   assert (file != NULL);
 129   /* If quota was exceeded earlier, bail out.  */
 130   if (opt.quota && (opt.downloaded > opt.quota))
 131     return QUOTEXC;
 132   /* Cache the current URL in the list.  */
 133   if (first_time)
 134     {
 135       ulist = add_slist (ulist, this_url, 0);
 136       urls_downloaded = NULL;
 137       urls_html = NULL;
 138       /* Enter this_url to the slist, in original and "enhanced" form.  */
 139       u = newurl ();
 140       err = parseurl (this_url, u, 0);
 141       if (err == URLOK)
 142         {
 143           ulist = add_slist (ulist, u->url, 0);
 144           urls_downloaded = add_url (urls_downloaded, u->url, file);
 145           urls_html = add_slist (urls_html, file, NOSORT);
 146           if (opt.no_parent)
 147             base_dir = xstrdup (u->dir); /* Set the base dir.  */
 148           /* Set the canonical this_url to be sent as referer.  This
 149              problem exists only when running the first time.  */
 150           canon_this_url = xstrdup (u->url);
 151         }
 152       else
 153         {
 154           DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
 155           base_dir = NULL;
 156         }
 157       freeurl (u, 1);
 158       depth = 1;
 159       robots_host = NULL;
 160       forbidden = NULL;
 161       first_time = 0;
 162     }
 163   else
 164     ++depth;
 165
 166   /* Bail out if opt.reclevel is exceeded.  */
 167   if ((opt.reclevel != 0) && (depth > opt.reclevel))
 168     {
 169       DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
 170                depth, opt.reclevel));
 171       --depth;
 172       return RECLEVELEXC;
 173     }
 174
 175   /* Determine whether this_url is an FTP URL.  If it is, it means
 176      that the retrieval is done through proxy.  In that case, FTP
 177      links will be followed by default and recursion will not be
 178      turned off when following them.  */
 179   this_url_ftp = (urlproto (this_url) == URLFTP);
 180
 181   /* Get the URL-s from an HTML file: */
 182   url_list = get_urls_html (file,
 183                             canon_this_url ? canon_this_url : this_url, 0);
 184
 185   /* Decide what to do with each of the URLs.  A URL will be loaded if
 186      it meets several requirements, discussed later.  */
 187   for (cur_url = url_list; cur_url; cur_url = cur_url->next)
 188     {
 189       /* If quota was exceeded earlier, bail out.  */
 190       if (opt.quota && (opt.downloaded > opt.quota))
 191         break;
 192       /* Parse the URL for convenient use in other functions, as well
 193          as to get the optimized form.  It also checks URL integrity.  */
 194       u = newurl ();
 195       if (parseurl (cur_url->url, u, 0) != URLOK)
 196         {
 197           DEBUGP (("Yuck!  A bad URL.\n"));
 198           freeurl (u, 1);
 199           continue;
 200         }
 201       if (u->proto == URLFILE)
 202         {
 203           DEBUGP (("Nothing to do with file:// around here.\n"));
 204           freeurl (u, 1);
 205           continue;
 206         }
 207       assert (u->url != NULL);
 208       constr = xstrdup (u->url);
 209
 210       /* Several checkings whether a file is acceptable to load:
 211          1. check if URL is ftp, and we don't load it
 212          2. check for relative links (if relative_only is set)
 213          3. check for domain
 214          4. check for no-parent
 215          5. check for excludes && includes
 216          6. check for suffix
 217          7. check for same host (if spanhost is unset), with possible
 218          gethostbyname baggage
 219          8. check for robots.txt
 220
 221          Addendum: If the URL is FTP, and it is to be loaded, only the
 222          domain and suffix settings are "stronger".
 223
 224          Note that .html and (yuck) .htm will get loaded regardless of
 225          suffix rules (but that is remedied later with unlink) unless
 226          the depth equals the maximum depth.
 227
 228          More time- and memory- consuming tests should be put later on
 229          the list.  */
 230
 231       /* inl is set if the URL we are working on (constr) is stored in
 232          ulist.  Using it is crucial to avoid the incessant calls to
 233          in_slist, which is quite slow.  */
 234       inl = in_slist (ulist, constr);
 235
 236       /* If it is FTP, and FTP is not followed, chuck it out.  */
 237       if (!inl)
 238         if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
 239           {
 240             DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
 241             ulist = add_slist (ulist, constr, 0);
 242             inl = 1;
 243           }
 244       /* If it is absolute link and they are not followed, chuck it
 245          out.  */
 246       if (!inl && u->proto != URLFTP)
 247         if (opt.relative_only && !(cur_url->flags & URELATIVE))
 248           {
 249             DEBUGP (("It doesn't really look like a relative link.\n"));
 250             ulist = add_slist (ulist, constr, 0);
 251             inl = 1;
 252           }
 253       /* If its domain is not to be accepted/looked-up, chuck it out.  */
 254       if (!inl)
 255         if (!accept_domain (u))
 256           {
 257             DEBUGP (("I don't like the smell of that domain.\n"));
 258             ulist = add_slist (ulist, constr, 0);
 259             inl = 1;
 260           }
 261       /* Check for parent directory.  */
 262       if (!inl && opt.no_parent
 263           /* If the new URL is FTP and the old was not, ignore
 264              opt.no_parent.  */
 265           && !(!this_url_ftp && u->proto == URLFTP))
 266         {
 267           /* Check for base_dir first.  */
 268           if (!(base_dir && frontcmp (base_dir, u->dir)))
 269             {
 270               /* Failing that, check for parent dir.  */
 271               struct urlinfo *ut = newurl ();
 272               if (parseurl (this_url, ut, 0) != URLOK)
 273                 DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
 274               else if (!frontcmp (ut->dir, u->dir))
 275                 {
 276                   /* Failing that too, kill the URL.  */
 277                   DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
 278                   ulist = add_slist (ulist, constr, 0);
 279                   inl = 1;
 280                 }
 281               freeurl (ut, 1);
 282             }
 283         }
 284       /* If the file does not match the acceptance list, or is on the
 285          rejection list, chuck it out.  The same goes for the
 286          directory exclude- and include- lists.  */
 287       if (!inl && (opt.includes || opt.excludes))
 288         {
 289           if (!accdir (u->dir, ALLABS))
 290             {
 291               DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
 292               ulist = add_slist (ulist, constr, 0);
 293               inl = 1;
 294             }
 295         }
 296       if (!inl)
 297         {
 298           char *suf = NULL;
 299           /* We check for acceptance/rejection rules only for non-HTML
 300              documents.  Since we don't know whether they really are
 301              HTML, it will be deduced from (an OR-ed list):
 302
 303              1) u->file is "" (meaning it is a directory)
 304              2) suffix exists, AND:
 305              a) it is "html", OR
 306              b) it is "htm"
 307
 308              If the file *is* supposed to be HTML, it will *not* be
 309             subject to acc/rej rules, unless a finite maximum depth has
 310             been specified and the current depth is the maximum depth. */
 311           if (!
 312               (!*u->file
 313                || (((suf = suffix (constr)) != NULL)
 314                   && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
 315                       && ((opt.reclevel != 0) && (depth != opt.reclevel))))))
 316             {
 317               if (!acceptable (u->file))
 318                 {
 319                   DEBUGP (("%s (%s) does not match acc/rej rules.\n",
 320                           constr, u->file));
 321                   ulist = add_slist (ulist, constr, 0);
 322                   inl = 1;
 323                 }
 324             }
 325           FREE_MAYBE (suf);
 326         }
 327       /* Optimize the URL (which includes possible DNS lookup) only
 328          after all other possibilities have been exhausted.  */
 329       if (!inl)
 330         {
 331           if (!opt.simple_check)
 332             opt_url (u);
 333           else
 334             {
 335               char *p;
 336               /* Just lowercase the hostname.  */
 337               for (p = u->host; *p; p++)
 338                 *p = tolower (*p);
 339               free (u->url);
 340               u->url = str_url (u, 0);
 341             }
 342           free (constr);
 343           constr = xstrdup (u->url);
 344           inl = in_slist (ulist, constr);
 345           if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
 346             if (!opt.spanhost && this_url && !same_host (this_url, constr))
 347               {
 348                 DEBUGP (("This is not the same hostname as the parent's.\n"));
 349                 ulist = add_slist (ulist, constr, 0);
 350                 inl = 1;
 351               }
 352         }
 353       /* What about robots.txt?  */
 354       if (!inl && opt.use_robots && u->proto == URLHTTP)
 355         {
 356           /* Since Wget knows about only one set of robot rules at a
 357              time, /robots.txt must be reloaded whenever a new host is
 358              accessed.
 359
 360              robots_host holds the host the current `forbid' variable
 361              is assigned to.  */
 362           if (!robots_host || !same_host (robots_host, u->host))
 363             {
 364               FREE_MAYBE (robots_host);
 365               /* Now make robots_host the new host, no matter what the
 366                  result will be.  So if there is no /robots.txt on the
 367                  site, Wget will not retry getting robots all the
 368                  time.  */
 369               robots_host = xstrdup (u->host);
 370               free_vec (forbidden);
 371               forbidden = NULL;
 372               err = retrieve_robots (constr, ROBOTS_FILENAME);
 373               if (err == ROBOTSOK)
 374                 {
 375                   rurl = robots_url (constr, ROBOTS_FILENAME);
 376                   rfile = url_filename (rurl);
 377                   forbidden = parse_robots (rfile);
 378                   freeurl (rurl, 1);
 379                   free (rfile);
 380                 }
 381             }
 382
 383           /* Now that we have (or don't have) robots, we can check for
 384              them.  */
 385           if (!robots_match (u, forbidden))
 386             {
 387               DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
 388                        ROBOTS_FILENAME));
 389               ulist = add_slist (ulist, constr, 0);
 390               inl = 1;
 391             }
 392         }
 393
 394       filename = NULL;
 395       /* If it wasn't chucked out, do something with it.  */
 396       if (!inl)
 397         {
 398           DEBUGP (("I've decided to load it -> "));
 399           /* Add it to the list of already-loaded URL-s.  */
 400           ulist = add_slist (ulist, constr, 0);
 401           /* Automatically followed FTPs will *not* be downloaded
 402              recursively.  */
 403           if (u->proto == URLFTP)
 404             {
 405               /* Don't you adore side-effects?  */
 406               opt.recursive = 0;
 407             }
 408           /* Reset its type.  */
 409           dt = 0;
 410           /* Retrieve it.  */
 411           retrieve_url (constr, &filename, &newloc,
 412                        canon_this_url ? canon_this_url : this_url, &dt);
 413           if (u->proto == URLFTP)
 414             {
 415               /* Restore...  */
 416               opt.recursive = 1;
 417             }
 418           if (newloc)
 419             {
 420               free (constr);
 421               constr = newloc;
 422             }
 423           /* In case of convert_links: If there was no error, add it to
 424              the list of downloaded URLs.  We might need it for
 425              conversion.  */
 426           if (opt.convert_links && filename)
 427             {
 428               if (dt & RETROKF)
 429                 {
 430                   urls_downloaded = add_url (urls_downloaded, constr, filename);
 431                   /* If the URL is HTML, note it.  */
 432                   if (dt & TEXTHTML)
 433                     urls_html = add_slist (urls_html, filename, NOSORT);
 434                 }
 435             }
 436           /* If there was no error, and the type is text/html, parse
 437              it recursively.  */
 438           if (dt & TEXTHTML)
 439             {
 440               if (dt & RETROKF)
 441                 recursive_retrieve (filename, constr);
 442             }
 443           else
 444             DEBUGP (("%s is not text/html so we don't chase.\n",
 445                      filename ? filename: "(null)"));
 446           /* If an suffix-rejected file was loaded only because it was HTML,
 447              undo the error now */
 448           if (opt.delete_after || (filename && !acceptable (filename)))
 449             {
 450               logprintf (LOG_VERBOSE,
 451                          (opt.delete_after ? _("Removing %s.\n")
 452                           : _("Removing %s since it should be rejected.\n")),
 453                          filename);
 454               if (unlink (filename))
 455                 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 456               dt &= ~RETROKF;
 457             }
 458           /* If everything was OK, and links are to be converted, let's
 459              store the local filename.  */
 460           if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
 461             {
 462               cur_url->flags |= UABS2REL;
 463               cur_url->local_name = xstrdup (filename);
 464             }
 465         }
 466       DEBUGP (("%s already in list, so we don't load.\n", constr));
 467       /* Free filename and constr.  */
 468       FREE_MAYBE (filename);
 469       FREE_MAYBE (constr);
 470       freeurl (u, 1);
 471       /* Increment the pbuf for the appropriate size.  */
 472     }
 473   if (opt.convert_links)
 474     convert_links (file, url_list);
 475   /* Free the linked list of URL-s.  */
 476   free_urlpos (url_list);
 477   /* Free the canonical this_url.  */
 478   FREE_MAYBE (canon_this_url);
 479   /* Decrement the recursion depth.  */
 480   --depth;
 481   if (opt.quota && (opt.downloaded > opt.quota))
 482     return QUOTEXC;
 483   else
 484     return RETROK;
 485 }
 486 \f
 487 /* Simple calls to convert_links will often fail because only the
 488    downloaded files are converted, and Wget cannot know which files
 489    will be converted in the future.  So, if we have file fileone.html
 490    with:
 491
 492    <a href=/c/something.gif>
 493
 494    and /c/something.gif was not downloaded because it exceeded the
 495    recursion depth, the reference will *not* be changed.
 496
 497    However, later we can encounter /c/something.gif from an "upper"
 498    level HTML (let's call it filetwo.html), and it gets downloaded.
 499
 500    But now we have a problem because /c/something.gif will be
 501    correctly transformed in filetwo.html, but not in fileone.html,
 502    since Wget could not have known that /c/something.gif will be
 503    downloaded in the future.
 504
 505    This is why Wget must, after the whole retrieval, call
 506    convert_all_links to go once more through the entire list of
 507    retrieved HTML-s, and re-convert them.
 508
 509    All the downloaded HTMLs are kept in urls_html, and downloaded URLs
 510    in urls_downloaded.  From these two lists information is
 511    extracted.  */
 512 void
 513 convert_all_links (void)
 514 {
 515   uerr_t res;
 516   urlpos *l1, *l2, *urls;
 517   struct urlinfo *u;
 518   slist *html;
 519   urlpos *urlhtml;
 520
 521   for (html = urls_html; html; html = html->next)
 522     {
 523       DEBUGP (("Rescanning %s\n", html->string));
 524       /* Determine the URL of the HTML file.  get_urls_html will need
 525          it.  */
 526       for (urlhtml = urls_downloaded; urlhtml; urlhtml = urlhtml->next)
 527         if (!strcmp (urlhtml->local_name, html->string))
 528           break;
 529       if (urlhtml)
 530         DEBUGP (("It should correspond to %s.\n", urlhtml->url));
 531       else
 532         DEBUGP (("I cannot find the corresponding URL.\n"));
 533       /* Parse the HTML file...  */
 534       urls = get_urls_html (html->string, urlhtml ? urlhtml->url : NULL, 1);
 535       if (!urls)
 536         continue;
 537       for (l1 = urls; l1; l1 = l1->next)
 538         {
 539           /* The URL must be in canonical form to be compared.  */
 540           u = newurl ();
 541           res = parseurl (l1->url, u, 0);
 542           if (res != URLOK)
 543             {
 544               freeurl (u, 1);
 545               continue;
 546             }
 547           /* We decide the direction of conversion according to whether
 548              a URL was downloaded.  Downloaded URLs will be converted
 549              ABS2REL, whereas non-downloaded will be converted REL2ABS.
 550              Note: not yet implemented; only ABS2REL works.  */
 551           for (l2 = urls_downloaded; l2; l2 = l2->next)
 552             if (!strcmp (l2->url, u->url))
 553               {
 554                 DEBUGP (("%s flagged for conversion, local %s\n",
 555                          l2->url, l2->local_name));
 556                 break;
 557               }
 558           /* Clear the flags.  */
 559           l1->flags &= ~ (UABS2REL | UREL2ABS);
 560           /* Decide on the conversion direction.  */
 561           if (l2)
 562             {
 563               l1->flags |= UABS2REL;
 564               l1->local_name = xstrdup (l2->local_name);
 565             }
 566           else
 567             {
 568               l1->flags |= UREL2ABS;
 569               l1->local_name = NULL;
 570             }
 571           freeurl (u, 1);
 572         }
 573       /* Convert the links in the file.  */
 574       convert_links (html->string, urls);
 575       /* Free the data.  */
 576       free_urlpos (urls);
 577     }
 578 }
 579 \f
 580 /* Robots support.  */
 581
 582 /* Construct the robots URL.  */
 583 static struct urlinfo *
 584 robots_url (const char *url, const char *robots_filename)
 585 {
 586   struct urlinfo *u = newurl ();
 587   uerr_t err;
 588
 589   err = parseurl (url, u, 0);
 590   assert (err == URLOK && u->proto == URLHTTP);
 591   free (u->file);
 592   free (u->dir);
 593   free (u->url);
 594   u->dir = xstrdup ("");
 595   u->file = xstrdup (robots_filename);
 596   u->url = str_url (u, 0);
 597   return u;
 598 }
 599
 600 /* Retrieves the robots_filename from the root server directory, if
 601    possible.  Returns ROBOTSOK if robots were retrieved OK, and
 602    NOROBOTS if robots could not be retrieved for any reason.  */
 603 static uerr_t
 604 retrieve_robots (const char *url, const char *robots_filename)
 605 {
 606   int dt;
 607   uerr_t err;
 608   struct urlinfo *u;
 609
 610   u = robots_url (url, robots_filename);
 611   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
 612   err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
 613   freeurl (u, 1);
 614   if (err == RETROK)
 615     return ROBOTSOK;
 616   else
 617     return NOROBOTS;
 618 }
 619
 620 /* Parse the robots_filename and return the disallowed path components
 621    in a malloc-ed vector of character pointers.
 622
 623    It should be fully compliant with the syntax as described in the
 624    file norobots.txt, adopted by the robots mailing list
 625    (robots@webcrawler.com).  */
 626 static char **
 627 parse_robots (const char *robots_filename)
 628 {
 629   FILE *fp;
 630   char **entries;
 631   char *line, *cmd, *str, *p;
 632   char *base_version, *version;
 633   int len, num, i;
 634   int wget_matched;             /* is the part meant for Wget?  */
 635
 636   entries = NULL;
 637
 638   num = 0;
 639   fp = fopen (robots_filename, "rb");
 640   if (!fp)
 641     return NULL;
 642
 643   /* Kill version number.  */
 644     if (opt.useragent)
 645       {
 646         STRDUP_ALLOCA (base_version, opt.useragent);
 647         STRDUP_ALLOCA (version, opt.useragent);
 648       }
 649     else
 650       {
 651         int len = 10 + strlen (version_string);
 652         base_version = (char *)alloca (len);
 653         sprintf (base_version, "Wget/%s", version_string);
 654         version = (char *)alloca (len);
 655         sprintf (version, "Wget/%s", version_string);
 656       }
 657   for (p = version; *p; p++)
 658     *p = tolower (*p);
 659   for (p = base_version; *p && *p != '/'; p++)
 660     *p = tolower (*p);
 661   *p = '\0';
 662
 663   /* Setting this to 1 means that Wget considers itself under
 664      restrictions by default, even if the User-Agent field is not
 665      present.  However, if it finds the user-agent set to anything
 666      other than Wget, the rest will be ignored (up to the following
 667      User-Agent field).  Thus you may have something like:
 668
 669      Disallow: 1
 670      Disallow: 2
 671      User-Agent: stupid-robot
 672      Disallow: 3
 673      Disallow: 4
 674      User-Agent: Wget*
 675      Disallow: 5
 676      Disallow: 6
 677      User-Agent: *
 678      Disallow: 7
 679
 680      In this case the 1, 2, 5, 6 and 7 disallow lines will be
 681      stored.  */
 682   wget_matched = 1;
 683   while ((line = read_whole_line (fp)))
 684     {
 685       len = strlen (line);
 686       /* Destroy <CR> if there is one.  */
 687       if (len && line[len - 1] == '\r')
 688         line[len - 1] = '\0';
 689       /* According to specifications, optional space may be at the
 690          end...  */
 691       DEBUGP (("Line: %s\n", line));
 692       /* Skip spaces.  */
 693       for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
 694       if (!*cmd)
 695         {
 696           free (line);
 697           DEBUGP (("(chucked out)\n"));
 698           continue;
 699         }
 700       /* Look for ':'.  */
 701       for (str = cmd; *str && *str != ':'; str++);
 702       if (!*str)
 703         {
 704           free (line);
 705           DEBUGP (("(chucked out)\n"));
 706           continue;
 707         }
 708       /* Zero-terminate the command.  */
 709       *str++ = '\0';
 710       /* Look for the string beginning...  */
 711       for (; *str && ISSPACE (*str); str++);
 712       /* Look for comments and kill them off.  */
 713       for (p = str; *p; p++)
 714         if (*p && ISSPACE (*p) && *(p + 1) == '#')
 715           {
 716             /* We have found a shell-style comment `<sp>+ #'.  Now
 717                rewind to the beginning of the spaces and place '\0'
 718                there.  */
 719             while (p > str && ISSPACE (*p))
 720               --p;
 721             if (p == str)
 722               *p = '\0';
 723             else
 724               *(p + 1) = '\0';
 725             break;
 726           }
 727       if (!strcasecmp (cmd, "User-agent"))
 728         {
 729           int match = 0;
 730           /* Lowercase the agent string.  */
 731           for (p = str; *p; p++)
 732             *p = tolower (*p);
 733           /* If the string is `*', it matches.  */
 734           if (*str == '*' && !*(str + 1))
 735             match = 1;
 736           else
 737             {
 738               /* If the string contains wildcards, we'll run it through
 739                  fnmatch().  */
 740               if (has_wildcards_p (str))
 741                 {
 742                   /* If the string contains '/', compare with the full
 743                      version.  Else, compare it to base_version.  */
 744                   if (strchr (str, '/'))
 745                     match = !fnmatch (str, version, 0);
 746                   else
 747                     match = !fnmatch (str, base_version, 0);
 748                 }
 749               else                /* Substring search */
 750                 {
 751                   if (strstr (version, str))
 752                     match = 1;
 753                   else
 754                     match = 0;
 755                 }
 756             }
 757           /* If Wget is not matched, skip all the entries up to the
 758              next User-agent field.  */
 759           wget_matched = match;
 760         }
 761       else if (!wget_matched)
 762         {
 763           free (line);
 764           DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
 765           continue;
 766         }
 767       else if (!strcasecmp (cmd, "Disallow"))
 768         {
 769           /* If "Disallow" is empty, the robot is welcome.  */
 770           if (!*str)
 771             {
 772               free_vec (entries);
 773               entries = (char **)xmalloc (sizeof (char *));
 774               *entries = NULL;
 775               num = 0;
 776             }
 777           else
 778             {
 779               entries = (char **)xrealloc (entries, (num + 2)* sizeof (char *));
 780               entries[num] = xstrdup (str);
 781               entries[++num] = NULL;
 782               /* Strip trailing spaces, according to specifications.  */
 783               for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--)
 784                 if (ISSPACE (str[i]))
 785                   str[i] = '\0';
 786             }
 787         }
 788       else
 789         {
 790           /* unknown command */
 791           DEBUGP (("(chucked out)\n"));
 792         }
 793       free (line);
 794     }
 795   fclose (fp);
 796   return entries;
 797 }
 798
 799 /* May the URL url be loaded according to disallowing rules stored in
 800    forbidden?  */
 801 static int
 802 robots_match (struct urlinfo *u, char **forbidden)
 803 {
 804   int l;
 805
 806   if (!forbidden)
 807     return 1;
 808   DEBUGP (("Matching %s against: ", u->path));
 809   for (; *forbidden; forbidden++)
 810     {
 811       DEBUGP (("%s ", *forbidden));
 812       l = strlen (*forbidden);
 813       /* If dir is forbidden, we may not load the file.  */
 814       if (strncmp (u->path, *forbidden, l) == 0)
 815         {
 816           DEBUGP (("matched.\n"));
 817           return 0; /* Matches, i.e. does not load...  */
 818         }
 819     }
 820   DEBUGP (("not matched.\n"));
 821   return 1;
 822 }