sjero.net Git - wget/blob - src/recur.c

   1 /* Handling of recursive HTTP retrieving.
   2    Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif /* HAVE_STRING_H */
  29 #ifdef HAVE_UNISTD_H
  30 # include <unistd.h>
  31 #endif /* HAVE_UNISTD_H */
  32 #include <errno.h>
  33 #include <assert.h>
  34 #include <ctype.h>
  35 #include <sys/types.h>
  36
  37 #include "wget.h"
  38 #include "url.h"
  39 #include "recur.h"
  40 #include "utils.h"
  41 #include "retr.h"
  42 #include "ftp.h"
  43 #include "fnmatch.h"
  44 #include "host.h"
  45
  46 extern char *version_string;
  47
  48 #define ROBOTS_FILENAME "robots.txt"
  49
  50 /* #### Many of these lists should really be hashtables!  */
  51
  52 /* List of downloaded URLs.  */
  53 static urlpos *urls_downloaded;
  54
  55 /* List of HTML URLs.  */
  56 static slist *urls_html;
  57
  58 /* List of undesirable-to-load URLs.  */
  59 static slist *ulist;
  60
  61 /* List of forbidden locations.  */
  62 static char **forbidden = NULL;
  63
  64 /* Current recursion depth.  */
  65 static int depth;
  66
  67 /* Base directory we're recursing from (used by no_parent).  */
  68 static char *base_dir;
  69
  70 /* The host name for which we last checked robots.  */
  71 static char *robots_host;
  72
  73 static int first_time = 1;
  74
  75 /* Construct the robots URL.  */
  76 static struct urlinfo *robots_url PARAMS ((const char *, const char *));
  77 static uerr_t retrieve_robots PARAMS ((const char *, const char *));
  78 static char **parse_robots PARAMS ((const char *));
  79 static int robots_match PARAMS ((struct urlinfo *, char **));
  80
  81
  82 /* Cleanup the data structures associated with recursive retrieving
  83    (the variables above).  */
  84 void
  85 recursive_cleanup (void)
  86 {
  87   free_slist (ulist);
  88   ulist = NULL;
  89   free_vec (forbidden);
  90   forbidden = NULL;
  91   free_slist (urls_html);
  92   urls_html = NULL;
  93   free_urlpos (urls_downloaded);
  94   urls_downloaded = NULL;
  95   FREE_MAYBE (base_dir);
  96   FREE_MAYBE (robots_host);
  97   first_time = 1;
  98 }
  99
 100 /* Reset FIRST_TIME to 1, so that some action can be taken in
 101    recursive_retrieve().  */
 102 void
 103 recursive_reset (void)
 104 {
 105   first_time = 1;
 106 }
 107
 108 /* The core of recursive retrieving.  Endless recursion is avoided by
 109    having all URLs stored to a linked list of URLs, which is checked
 110    before loading any URL.  That way no URL can get loaded twice.
 111
 112    The function also supports specification of maximum recursion depth
 113    and a number of other goodies.  */
 114 uerr_t
 115 recursive_retrieve (const char *file, const char *this_url)
 116 {
 117   char *constr, *filename, *newloc;
 118   char *canon_this_url = NULL;
 119   int dt, inl, dash_p_leaf_HTML = FALSE;
 120   int this_url_ftp;            /* See below the explanation */
 121   uerr_t err;
 122   struct urlinfo *rurl;
 123   urlpos *url_list, *cur_url;
 124   char *rfile; /* For robots */
 125   struct urlinfo *u;
 126
 127   assert (this_url != NULL);
 128   assert (file != NULL);
 129   /* If quota was exceeded earlier, bail out.  */
 130   if (downloaded_exceeds_quota ())
 131     return QUOTEXC;
 132   /* Cache the current URL in the list.  */
 133   if (first_time)
 134     {
 135       ulist = add_slist (ulist, this_url, 0);
 136       urls_downloaded = NULL;
 137       urls_html = NULL;
 138       /* Enter this_url to the slist, in original and "enhanced" form.  */
 139       u = newurl ();
 140       err = parseurl (this_url, u, 0);
 141       if (err == URLOK)
 142         {
 143           ulist = add_slist (ulist, u->url, 0);
 144           urls_downloaded = add_url (urls_downloaded, u->url, file);
 145           urls_html = add_slist (urls_html, file, NOSORT);
 146           if (opt.no_parent)
 147             base_dir = xstrdup (u->dir); /* Set the base dir.  */
 148           /* Set the canonical this_url to be sent as referer.  This
 149              problem exists only when running the first time.  */
 150           canon_this_url = xstrdup (u->url);
 151         }
 152       else
 153         {
 154           DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
 155           base_dir = NULL;
 156         }
 157       freeurl (u, 1);
 158       depth = 1;
 159       robots_host = NULL;
 160       forbidden = NULL;
 161       first_time = 0;
 162     }
 163   else
 164     ++depth;
 165
 166   if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
 167     /* We've exceeded the maximum recursion depth specified by the user. */
 168     {
 169       if (opt.page_requisites && depth <= opt.reclevel + 1)
 170         /* When -p is specified, we can do one more partial recursion from the
 171            "leaf nodes" on the HTML document tree.  The recursion is partial in
 172            that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
 173            except for <LINK REL="stylesheet">. */
 174         dash_p_leaf_HTML = TRUE;
 175       else
 176         /* Either -p wasn't specified or it was and we've already gone the one
 177            extra (pseudo-)level that it affords us, so we need to bail out. */
 178         {
 179           DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
 180                    depth, opt.reclevel));
 181           --depth;
 182           return RECLEVELEXC;
 183         }
 184     }
 185
 186   /* Determine whether this_url is an FTP URL.  If it is, it means
 187      that the retrieval is done through proxy.  In that case, FTP
 188      links will be followed by default and recursion will not be
 189      turned off when following them.  */
 190   this_url_ftp = (urlproto (this_url) == URLFTP);
 191
 192   /* Get the URL-s from an HTML file: */
 193   url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
 194                             0, dash_p_leaf_HTML);
 195
 196   /* Decide what to do with each of the URLs.  A URL will be loaded if
 197      it meets several requirements, discussed later.  */
 198   for (cur_url = url_list; cur_url; cur_url = cur_url->next)
 199     {
 200       /* If quota was exceeded earlier, bail out.  */
 201       if (downloaded_exceeds_quota ())
 202         break;
 203       /* Parse the URL for convenient use in other functions, as well
 204          as to get the optimized form.  It also checks URL integrity.  */
 205       u = newurl ();
 206       if (parseurl (cur_url->url, u, 0) != URLOK)
 207         {
 208           DEBUGP (("Yuck!  A bad URL.\n"));
 209           freeurl (u, 1);
 210           continue;
 211         }
 212       if (u->proto == URLFILE)
 213         {
 214           DEBUGP (("Nothing to do with file:// around here.\n"));
 215           freeurl (u, 1);
 216           continue;
 217         }
 218       assert (u->url != NULL);
 219       constr = xstrdup (u->url);
 220
 221       /* Several checkings whether a file is acceptable to load:
 222          1. check if URL is ftp, and we don't load it
 223          2. check for relative links (if relative_only is set)
 224          3. check for domain
 225          4. check for no-parent
 226          5. check for excludes && includes
 227          6. check for suffix
 228          7. check for same host (if spanhost is unset), with possible
 229          gethostbyname baggage
 230          8. check for robots.txt
 231
 232          Addendum: If the URL is FTP, and it is to be loaded, only the
 233          domain and suffix settings are "stronger".
 234
 235          Note that .html and (yuck) .htm will get loaded regardless of
 236          suffix rules (but that is remedied later with unlink) unless
 237          the depth equals the maximum depth.
 238
 239          More time- and memory- consuming tests should be put later on
 240          the list.  */
 241
 242       /* inl is set if the URL we are working on (constr) is stored in
 243          ulist.  Using it is crucial to avoid the incessant calls to
 244          in_slist, which is quite slow.  */
 245       inl = in_slist (ulist, constr);
 246
 247       /* If it is FTP, and FTP is not followed, chuck it out.  */
 248       if (!inl)
 249         if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
 250           {
 251             DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
 252             ulist = add_slist (ulist, constr, 0);
 253             inl = 1;
 254           }
 255       /* If it is absolute link and they are not followed, chuck it
 256          out.  */
 257       if (!inl && u->proto != URLFTP)
 258         if (opt.relative_only && !(cur_url->flags & URELATIVE))
 259           {
 260             DEBUGP (("It doesn't really look like a relative link.\n"));
 261             ulist = add_slist (ulist, constr, 0);
 262             inl = 1;
 263           }
 264       /* If its domain is not to be accepted/looked-up, chuck it out.  */
 265       if (!inl)
 266         if (!accept_domain (u))
 267           {
 268             DEBUGP (("I don't like the smell of that domain.\n"));
 269             ulist = add_slist (ulist, constr, 0);
 270             inl = 1;
 271           }
 272       /* Check for parent directory.  */
 273       if (!inl && opt.no_parent
 274           /* If the new URL is FTP and the old was not, ignore
 275              opt.no_parent.  */
 276           && !(!this_url_ftp && u->proto == URLFTP))
 277         {
 278           /* Check for base_dir first.  */
 279           if (!(base_dir && frontcmp (base_dir, u->dir)))
 280             {
 281               /* Failing that, check for parent dir.  */
 282               struct urlinfo *ut = newurl ();
 283               if (parseurl (this_url, ut, 0) != URLOK)
 284                 DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
 285               else if (!frontcmp (ut->dir, u->dir))
 286                 {
 287                   /* Failing that too, kill the URL.  */
 288                   DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
 289                   ulist = add_slist (ulist, constr, 0);
 290                   inl = 1;
 291                 }
 292               freeurl (ut, 1);
 293             }
 294         }
 295       /* If the file does not match the acceptance list, or is on the
 296          rejection list, chuck it out.  The same goes for the
 297          directory exclude- and include- lists.  */
 298       if (!inl && (opt.includes || opt.excludes))
 299         {
 300           if (!accdir (u->dir, ALLABS))
 301             {
 302               DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
 303               ulist = add_slist (ulist, constr, 0);
 304               inl = 1;
 305             }
 306         }
 307       if (!inl)
 308         {
 309           char *suf = NULL;
 310           /* We check for acceptance/rejection rules only for non-HTML
 311              documents.  Since we don't know whether they really are
 312              HTML, it will be deduced from (an OR-ed list):
 313
 314              1) u->file is "" (meaning it is a directory)
 315              2) suffix exists, AND:
 316              a) it is "html", OR
 317              b) it is "htm"
 318
 319              If the file *is* supposed to be HTML, it will *not* be
 320             subject to acc/rej rules, unless a finite maximum depth has
 321             been specified and the current depth is the maximum depth. */
 322           if (!
 323               (!*u->file
 324                || (((suf = suffix (constr)) != NULL)
 325                   && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
 326                       && ((opt.reclevel != INFINITE_RECURSION) &&
 327                           (depth != opt.reclevel))))))
 328             {
 329               if (!acceptable (u->file))
 330                 {
 331                   DEBUGP (("%s (%s) does not match acc/rej rules.\n",
 332                           constr, u->file));
 333                   ulist = add_slist (ulist, constr, 0);
 334                   inl = 1;
 335                 }
 336             }
 337           FREE_MAYBE (suf);
 338         }
 339       /* Optimize the URL (which includes possible DNS lookup) only
 340          after all other possibilities have been exhausted.  */
 341       if (!inl)
 342         {
 343           if (!opt.simple_check)
 344             opt_url (u);
 345           else
 346             {
 347               char *p;
 348               /* Just lowercase the hostname.  */
 349               for (p = u->host; *p; p++)
 350                 *p = TOLOWER (*p);
 351               free (u->url);
 352               u->url = str_url (u, 0);
 353             }
 354           free (constr);
 355           constr = xstrdup (u->url);
 356           inl = in_slist (ulist, constr);
 357           if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
 358             if (!opt.spanhost && this_url && !same_host (this_url, constr))
 359               {
 360                 DEBUGP (("This is not the same hostname as the parent's.\n"));
 361                 ulist = add_slist (ulist, constr, 0);
 362                 inl = 1;
 363               }
 364         }
 365       /* What about robots.txt?  */
 366       if (!inl && opt.use_robots && u->proto == URLHTTP)
 367         {
 368           /* Since Wget knows about only one set of robot rules at a
 369              time, /robots.txt must be reloaded whenever a new host is
 370              accessed.
 371
 372              robots_host holds the host the current `forbid' variable
 373              is assigned to.  */
 374           if (!robots_host || !same_host (robots_host, u->host))
 375             {
 376               FREE_MAYBE (robots_host);
 377               /* Now make robots_host the new host, no matter what the
 378                  result will be.  So if there is no /robots.txt on the
 379                  site, Wget will not retry getting robots all the
 380                  time.  */
 381               robots_host = xstrdup (u->host);
 382               free_vec (forbidden);
 383               forbidden = NULL;
 384               err = retrieve_robots (constr, ROBOTS_FILENAME);
 385               if (err == ROBOTSOK)
 386                 {
 387                   rurl = robots_url (constr, ROBOTS_FILENAME);
 388                   rfile = url_filename (rurl);
 389                   forbidden = parse_robots (rfile);
 390                   freeurl (rurl, 1);
 391                   free (rfile);
 392                 }
 393             }
 394
 395           /* Now that we have (or don't have) robots, we can check for
 396              them.  */
 397           if (!robots_match (u, forbidden))
 398             {
 399               DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
 400                        ROBOTS_FILENAME));
 401               ulist = add_slist (ulist, constr, 0);
 402               inl = 1;
 403             }
 404         }
 405
 406       filename = NULL;
 407       /* If it wasn't chucked out, do something with it.  */
 408       if (!inl)
 409         {
 410           DEBUGP (("I've decided to load it -> "));
 411           /* Add it to the list of already-loaded URL-s.  */
 412           ulist = add_slist (ulist, constr, 0);
 413           /* Automatically followed FTPs will *not* be downloaded
 414              recursively.  */
 415           if (u->proto == URLFTP)
 416             {
 417               /* Don't you adore side-effects?  */
 418               opt.recursive = 0;
 419             }
 420           /* Reset its type.  */
 421           dt = 0;
 422           /* Retrieve it.  */
 423           retrieve_url (constr, &filename, &newloc,
 424                        canon_this_url ? canon_this_url : this_url, &dt);
 425           if (u->proto == URLFTP)
 426             {
 427               /* Restore...  */
 428               opt.recursive = 1;
 429             }
 430           if (newloc)
 431             {
 432               free (constr);
 433               constr = newloc;
 434             }
 435           /* In case of convert_links: If there was no error, add it to
 436              the list of downloaded URLs.  We might need it for
 437              conversion.  */
 438           if (opt.convert_links && filename)
 439             {
 440               if (dt & RETROKF)
 441                 {
 442                   urls_downloaded = add_url (urls_downloaded, constr, filename);
 443                   /* If the URL is HTML, note it.  */
 444                   if (dt & TEXTHTML)
 445                     urls_html = add_slist (urls_html, filename, NOSORT);
 446                 }
 447             }
 448           /* If there was no error, and the type is text/html, parse
 449              it recursively.  */
 450           if (dt & TEXTHTML)
 451             {
 452               if (dt & RETROKF)
 453                 recursive_retrieve (filename, constr);
 454             }
 455           else
 456             DEBUGP (("%s is not text/html so we don't chase.\n",
 457                      filename ? filename: "(null)"));
 458
 459           if (opt.delete_after || (filename && !acceptable (filename)))
 460             /* Either --delete-after was specified, or we loaded this otherwise
 461                rejected (e.g. by -R) HTML file just so we could harvest its
 462                hyperlinks -- in either case, delete the local file. */
 463             {
 464               DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
 465                        opt.delete_after ? "--delete-after" :
 466                        "recursive rejection criteria"));
 467               logprintf (LOG_VERBOSE,
 468                          (opt.delete_after ? _("Removing %s.\n")
 469                           : _("Removing %s since it should be rejected.\n")),
 470                          filename);
 471               if (unlink (filename))
 472                 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 473               dt &= ~RETROKF;
 474             }
 475
 476           /* If everything was OK, and links are to be converted, let's
 477              store the local filename.  */
 478           if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
 479             {
 480               cur_url->flags |= UABS2REL;
 481               cur_url->local_name = xstrdup (filename);
 482             }
 483         }
 484       DEBUGP (("%s already in list, so we don't load.\n", constr));
 485       /* Free filename and constr.  */
 486       FREE_MAYBE (filename);
 487       FREE_MAYBE (constr);
 488       freeurl (u, 1);
 489       /* Increment the pbuf for the appropriate size.  */
 490     }
 491   if (opt.convert_links && !opt.delete_after)
 492     convert_links (file, url_list);
 493   /* Free the linked list of URL-s.  */
 494   free_urlpos (url_list);
 495   /* Free the canonical this_url.  */
 496   FREE_MAYBE (canon_this_url);
 497   /* Decrement the recursion depth.  */
 498   --depth;
 499   if (downloaded_exceeds_quota ())
 500     return QUOTEXC;
 501   else
 502     return RETROK;
 503 }
 504 \f
 505 /* Simple calls to convert_links will often fail because only the
 506    downloaded files are converted, and Wget cannot know which files
 507    will be converted in the future.  So, if we have file fileone.html
 508    with:
 509
 510    <a href=/c/something.gif>
 511
 512    and /c/something.gif was not downloaded because it exceeded the
 513    recursion depth, the reference will *not* be changed.
 514
 515    However, later we can encounter /c/something.gif from an "upper"
 516    level HTML (let's call it filetwo.html), and it gets downloaded.
 517
 518    But now we have a problem because /c/something.gif will be
 519    correctly transformed in filetwo.html, but not in fileone.html,
 520    since Wget could not have known that /c/something.gif will be
 521    downloaded in the future.
 522
 523    This is why Wget must, after the whole retrieval, call
 524    convert_all_links to go once more through the entire list of
 525    retrieved HTMLs, and re-convert them.
 526
 527    All the downloaded HTMLs are kept in urls_html, and downloaded URLs
 528    in urls_downloaded.  From these two lists information is
 529    extracted.  */
 530 void
 531 convert_all_links (void)
 532 {
 533   uerr_t res;
 534   urlpos *l1, *l2, *urls;
 535   struct urlinfo *u;
 536   slist *html;
 537   urlpos *urlhtml;
 538
 539   for (html = urls_html; html; html = html->next)
 540     {
 541       DEBUGP (("Rescanning %s\n", html->string));
 542       /* Determine the URL of the HTML file.  get_urls_html will need
 543          it.  */
 544       for (urlhtml = urls_downloaded; urlhtml; urlhtml = urlhtml->next)
 545         if (!strcmp (urlhtml->local_name, html->string))
 546           break;
 547       if (urlhtml)
 548         DEBUGP (("It should correspond to %s.\n", urlhtml->url));
 549       else
 550         DEBUGP (("I cannot find the corresponding URL.\n"));
 551       /* Parse the HTML file...  */
 552       urls = get_urls_html (html->string, urlhtml ? urlhtml->url : NULL, 1,
 553                             FALSE);
 554       if (!urls)
 555         continue;
 556       for (l1 = urls; l1; l1 = l1->next)
 557         {
 558           /* The URL must be in canonical form to be compared.  */
 559           u = newurl ();
 560           res = parseurl (l1->url, u, 0);
 561           if (res != URLOK)
 562             {
 563               freeurl (u, 1);
 564               continue;
 565             }
 566           /* We decide the direction of conversion according to whether
 567              a URL was downloaded.  Downloaded URLs will be converted
 568              ABS2REL, whereas non-downloaded will be converted REL2ABS.
 569              Note: not yet implemented; only ABS2REL works.  */
 570           for (l2 = urls_downloaded; l2; l2 = l2->next)
 571             if (!strcmp (l2->url, u->url))
 572               {
 573                 DEBUGP (("%s flagged for conversion, local %s\n",
 574                          l2->url, l2->local_name));
 575                 break;
 576               }
 577           /* Clear the flags.  */
 578           l1->flags &= ~ (UABS2REL | UREL2ABS);
 579           /* Decide on the conversion direction.  */
 580           if (l2)
 581             {
 582               l1->flags |= UABS2REL;
 583               l1->local_name = xstrdup (l2->local_name);
 584             }
 585           else
 586             {
 587               l1->flags |= UREL2ABS;
 588               l1->local_name = NULL;
 589             }
 590           freeurl (u, 1);
 591         }
 592       /* Convert the links in the file.  */
 593       convert_links (html->string, urls);
 594       /* Free the data.  */
 595       free_urlpos (urls);
 596     }
 597 }
 598 \f
 599 /* Robots support.  */
 600
 601 /* Construct the robots URL.  */
 602 static struct urlinfo *
 603 robots_url (const char *url, const char *robots_filename)
 604 {
 605   struct urlinfo *u = newurl ();
 606   uerr_t err;
 607
 608   err = parseurl (url, u, 0);
 609   assert (err == URLOK && u->proto == URLHTTP);
 610   free (u->file);
 611   free (u->dir);
 612   free (u->url);
 613   u->dir = xstrdup ("");
 614   u->file = xstrdup (robots_filename);
 615   u->url = str_url (u, 0);
 616   return u;
 617 }
 618
 619 /* Retrieves the robots_filename from the root server directory, if
 620    possible.  Returns ROBOTSOK if robots were retrieved OK, and
 621    NOROBOTS if robots could not be retrieved for any reason.  */
 622 static uerr_t
 623 retrieve_robots (const char *url, const char *robots_filename)
 624 {
 625   int dt;
 626   uerr_t err;
 627   struct urlinfo *u;
 628
 629   u = robots_url (url, robots_filename);
 630   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
 631   err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
 632   freeurl (u, 1);
 633   if (err == RETROK)
 634     return ROBOTSOK;
 635   else
 636     return NOROBOTS;
 637 }
 638
 639 /* Parse the robots_filename and return the disallowed path components
 640    in a malloc-ed vector of character pointers.
 641
 642    It should be fully compliant with the syntax as described in the
 643    file norobots.txt, adopted by the robots mailing list
 644    (robots@webcrawler.com).  */
 645 static char **
 646 parse_robots (const char *robots_filename)
 647 {
 648   FILE *fp;
 649   char **entries;
 650   char *line, *cmd, *str, *p;
 651   char *base_version, *version;
 652   int len, num, i;
 653   int wget_matched;             /* is the part meant for Wget?  */
 654
 655   entries = NULL;
 656
 657   num = 0;
 658   fp = fopen (robots_filename, "rb");
 659   if (!fp)
 660     return NULL;
 661
 662   /* Kill version number.  */
 663     if (opt.useragent)
 664       {
 665         STRDUP_ALLOCA (base_version, opt.useragent);
 666         STRDUP_ALLOCA (version, opt.useragent);
 667       }
 668     else
 669       {
 670         int len = 10 + strlen (version_string);
 671         base_version = (char *)alloca (len);
 672         sprintf (base_version, "Wget/%s", version_string);
 673         version = (char *)alloca (len);
 674         sprintf (version, "Wget/%s", version_string);
 675       }
 676   for (p = version; *p; p++)
 677     *p = TOLOWER (*p);
 678   for (p = base_version; *p && *p != '/'; p++)
 679     *p = TOLOWER (*p);
 680   *p = '\0';
 681
 682   /* Setting this to 1 means that Wget considers itself under
 683      restrictions by default, even if the User-Agent field is not
 684      present.  However, if it finds the user-agent set to anything
 685      other than Wget, the rest will be ignored (up to the following
 686      User-Agent field).  Thus you may have something like:
 687
 688      Disallow: 1
 689      Disallow: 2
 690      User-Agent: stupid-robot
 691      Disallow: 3
 692      Disallow: 4
 693      User-Agent: Wget*
 694      Disallow: 5
 695      Disallow: 6
 696      User-Agent: *
 697      Disallow: 7
 698
 699      In this case the 1, 2, 5, 6 and 7 disallow lines will be
 700      stored.  */
 701   wget_matched = 1;
 702   while ((line = read_whole_line (fp)))
 703     {
 704       len = strlen (line);
 705       /* Destroy <CR> if there is one.  */
 706       if (len && line[len - 1] == '\r')
 707         line[len - 1] = '\0';
 708       /* According to specifications, optional space may be at the
 709          end...  */
 710       DEBUGP (("Line: %s\n", line));
 711       /* Skip spaces.  */
 712       for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
 713       if (!*cmd)
 714         {
 715           free (line);
 716           DEBUGP (("(chucked out)\n"));
 717           continue;
 718         }
 719       /* Look for ':'.  */
 720       for (str = cmd; *str && *str != ':'; str++);
 721       if (!*str)
 722         {
 723           free (line);
 724           DEBUGP (("(chucked out)\n"));
 725           continue;
 726         }
 727       /* Zero-terminate the command.  */
 728       *str++ = '\0';
 729       /* Look for the string beginning...  */
 730       for (; *str && ISSPACE (*str); str++);
 731       /* Look for comments or trailing spaces and kill them off.  */
 732       for (p = str; *p; p++)
 733         if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
 734           {
 735             /* We have found either a shell-style comment `<sp>+#' or some
 736                trailing spaces.  Now rewind to the beginning of the spaces
 737                and place '\0' there.  */
 738             while (p > str && ISSPACE (*p))
 739               --p;
 740             if (p == str)
 741               *p = '\0';
 742             else
 743               *(p + 1) = '\0';
 744             break;
 745           }
 746       if (!strcasecmp (cmd, "User-agent"))
 747         {
 748           int match = 0;
 749           /* Lowercase the agent string.  */
 750           for (p = str; *p; p++)
 751             *p = TOLOWER (*p);
 752           /* If the string is `*', it matches.  */
 753           if (*str == '*' && !*(str + 1))
 754             match = 1;
 755           else
 756             {
 757               /* If the string contains wildcards, we'll run it through
 758                  fnmatch().  */
 759               if (has_wildcards_p (str))
 760                 {
 761                   /* If the string contains '/', compare with the full
 762                      version.  Else, compare it to base_version.  */
 763                   if (strchr (str, '/'))
 764                     match = !fnmatch (str, version, 0);
 765                   else
 766                     match = !fnmatch (str, base_version, 0);
 767                 }
 768               else                /* Substring search */
 769                 {
 770                   if (strstr (version, str))
 771                     match = 1;
 772                   else
 773                     match = 0;
 774                 }
 775             }
 776           /* If Wget is not matched, skip all the entries up to the
 777              next User-agent field.  */
 778           wget_matched = match;
 779         }
 780       else if (!wget_matched)
 781         {
 782           free (line);
 783           DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
 784           continue;
 785         }
 786       else if (!strcasecmp (cmd, "Disallow"))
 787         {
 788           /* If "Disallow" is empty, the robot is welcome.  */
 789           if (!*str)
 790             {
 791               free_vec (entries);
 792               entries = (char **)xmalloc (sizeof (char *));
 793               *entries = NULL;
 794               num = 0;
 795             }
 796           else
 797             {
 798               entries = (char **)xrealloc (entries, (num + 2)* sizeof (char *));
 799               entries[num] = xstrdup (str);
 800               entries[++num] = NULL;
 801               /* Strip trailing spaces, according to specifications.  */
 802               for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--)
 803                 if (ISSPACE (str[i]))
 804                   str[i] = '\0';
 805             }
 806         }
 807       else
 808         {
 809           /* unknown command */
 810           DEBUGP (("(chucked out)\n"));
 811         }
 812       free (line);
 813     }
 814   fclose (fp);
 815   return entries;
 816 }
 817
 818 /* May the URL url be loaded according to disallowing rules stored in
 819    forbidden?  */
 820 static int
 821 robots_match (struct urlinfo *u, char **forbidden)
 822 {
 823   int l;
 824
 825   if (!forbidden)
 826     return 1;
 827   DEBUGP (("Matching %s against: ", u->path));
 828   for (; *forbidden; forbidden++)
 829     {
 830       DEBUGP (("%s ", *forbidden));
 831       l = strlen (*forbidden);
 832       /* If dir is forbidden, we may not load the file.  */
 833       if (strncmp (u->path, *forbidden, l) == 0)
 834         {
 835           DEBUGP (("matched.\n"));
 836           return 0; /* Matches, i.e. does not load...  */
 837         }
 838     }
 839   DEBUGP (("not matched.\n"));
 840   return 1;
 841 }