/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.

   This file is part of Wget.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <sys/types.h>
extern char *version_string;
#define ROBOTS_FILENAME "robots.txt"

/* #### Many of these lists should really be hashtables!  */

/* List of downloaded URLs.  */
static urlpos *urls_downloaded;

/* List of HTML URLs.  */
static slist *urls_html;

/* List of undesirable-to-load URLs.  */
static slist *ulist;

/* List of forbidden locations.  */
static char **forbidden = NULL;

/* Current recursion depth.  */
static int depth;

/* Base directory we're recursing from (used by no_parent).  */
static char *base_dir;

/* The host name for which we last checked robots.  */
static char *robots_host;

static int first_time = 1;
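
/* A note on FIRST_TIME above: it guards the one-time setup in
   recursive_retrieve(), which caches the starting URL and records
   base_dir and the canonical referer, and recursive_reset() simply puts
   it back to 1 so that a later top-level retrieval starts from scratch.  */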
/* Construct the robots URL.  */
static struct urlinfo *robots_url PARAMS ((const char *, const char *));
static uerr_t retrieve_robots PARAMS ((const char *, const char *));
static char **parse_robots PARAMS ((const char *));
static int robots_match PARAMS ((struct urlinfo *, char **));
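
/* How the robots helpers above fit together: robots_url() builds the
   http://host/robots.txt URL for a page, retrieve_robots() fetches it,
   parse_robots() turns the local copy into a NULL-terminated vector of
   disallowed path prefixes, and robots_match() checks a parsed URL
   against that vector before the URL is scheduled for download.  */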
/* Clean up the data structures associated with recursive retrieving
   (the variables above).  */
recursive_cleanup (void)
  free_slist (urls_html);
  free_urlpos (urls_downloaded);
  urls_downloaded = NULL;
  FREE_MAYBE (base_dir);
  FREE_MAYBE (robots_host);
/* Reset FIRST_TIME to 1, so that recursive_retrieve() performs its
   one-time initialization again.  */
recursive_reset (void)
/* The core of recursive retrieving.  Endless recursion is avoided by
   storing all downloaded URLs in a linked list, which is checked before
   any URL is loaded; that way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
recursive_retrieve (const char *file, const char *this_url)
  char *constr, *filename, *newloc;
  char *canon_this_url = NULL;
  int this_url_ftp;             /* See the explanation below.  */
  struct urlinfo *rurl;
  urlpos *url_list, *cur_url;
  char *rfile;                  /* For robots.  */

  assert (this_url != NULL);
  assert (file != NULL);
  /* If quota was exceeded earlier, bail out.  */
  if (opt.quota && (opt.downloaded > opt.quota))
  /* Cache the current URL in the list.  */
  ulist = add_slist (ulist, this_url, 0);
  urls_downloaded = NULL;
  /* Enter this_url into the slist, in original and "enhanced" form.  */
  err = parseurl (this_url, u, 0);
  ulist = add_slist (ulist, u->url, 0);
  urls_downloaded = add_url (urls_downloaded, u->url, file);
  urls_html = add_slist (urls_html, file, NOSORT);
  base_dir = xstrdup (u->dir);  /* Set the base dir.  */
  /* Set the canonical this_url to be sent as the referer.  This
     problem exists only when running for the first time.  */
  canon_this_url = xstrdup (u->url);
  DEBUGP (("Double yuck! The *base* URL is broken.\n"));
  /* Bail out if opt.reclevel is exceeded.  */
  if ((opt.reclevel != 0) && (depth > opt.reclevel))
      DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
               depth, opt.reclevel));
  /* Determine whether this_url is an FTP URL.  If it is, it means
     that the retrieval is done through a proxy.  In that case, FTP
     links will be followed by default and recursion will not be
     turned off when following them.  */
  this_url_ftp = (urlproto (this_url) == URLFTP);

  /* Get the URLs from an HTML file: */
  url_list = get_urls_html (file,
                            canon_this_url ? canon_this_url : this_url, 0);
  /* Decide what to do with each of the URLs.  A URL will be loaded if
     it meets several requirements, discussed later.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
      /* If quota was exceeded earlier, bail out.  */
      if (opt.quota && (opt.downloaded > opt.quota))
      /* Parse the URL for convenient use in other functions, as well
         as to get the optimized form.  It also checks URL integrity.  */
      if (parseurl (cur_url->url, u, 0) != URLOK)
          DEBUGP (("Yuck! A bad URL.\n"));
      if (u->proto == URLFILE)
          DEBUGP (("Nothing to do with file:// around here.\n"));
      assert (u->url != NULL);
      constr = xstrdup (u->url);
      /* Several checks whether a file is acceptable to load:
         1. check if URL is ftp, and we don't load it
         2. check for relative links (if relative_only is set)
         3. check for domain
         4. check for no-parent
         5. check for excludes && includes
         6. check for suffix
         7. check for same host (if spanhost is unset), with possible
            gethostbyname baggage
         8. check for robots.txt

         Addendum: If the URL is FTP, and it is to be loaded, only the
         domain and suffix settings are "stronger".

         Note that .html and (yuck) .htm will get loaded regardless of
         suffix rules (but that is remedied later with unlink).

         More time- and memory-consuming tests should be put later on
         the list.  */

      /* inl is set if the URL we are working on (constr) is stored in
         ulist.  Using it is crucial to avoid the incessant calls to
         in_slist, which is quite slow.  */
      inl = in_slist (ulist, constr);
      /* If it is FTP, and FTP is not followed, chuck it out.  */
      if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
          DEBUGP (("Uh, it is FTP but I'm not in the mood to follow FTP.\n"));
          ulist = add_slist (ulist, constr, 0);
      /* If it is an absolute link and absolute links are not followed,
         chuck it out.  */
      if (!inl && u->proto != URLFTP)
        if (opt.relative_only && !(cur_url->flags & URELATIVE))
            DEBUGP (("It doesn't really look like a relative link.\n"));
            ulist = add_slist (ulist, constr, 0);
      /* If its domain is not to be accepted/looked-up, chuck it out.  */
      if (!accept_domain (u))
          DEBUGP (("I don't like the smell of that domain.\n"));
          ulist = add_slist (ulist, constr, 0);
      /* Check for parent directory.  */
      if (!inl && opt.no_parent
          /* If the new URL is FTP and the old was not, ignore
             opt.no_parent.  */
          && !(!this_url_ftp && u->proto == URLFTP))
          /* Check for base_dir first.  */
          if (!(base_dir && frontcmp (base_dir, u->dir)))
              /* Failing that, check for parent dir.  */
              struct urlinfo *ut = newurl ();
              if (parseurl (this_url, ut, 0) != URLOK)
                DEBUGP (("Double yuck! The *base* URL is broken.\n"));
              else if (!frontcmp (ut->dir, u->dir))
                  /* Failing that too, kill the URL.  */
                  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
                  ulist = add_slist (ulist, constr, 0);
      /* If the file does not match the acceptance list, or is on the
         rejection list, chuck it out.  The same goes for the directory
         exclude- and include-lists.  */
      if (!inl && (opt.includes || opt.excludes))
        if (!accdir (u->dir, ALLABS))
            DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
            ulist = add_slist (ulist, constr, 0);
      /* We check for acceptance/rejection rules only for non-HTML
         documents.  Since we don't know whether they really are HTML,
         it will be deduced from (an OR-ed list):

         1) u->file is "" (meaning it is a directory)
         2) suffix exists, AND:
            a) it is "html", or
            b) it is "htm"

         If the file *is* supposed to be HTML, it will *not* be
         subject to acc/rej rules.  That's why the `!'.  */
      if (!inl && !(!*u->file
                    || (((suf = suffix (constr)) != NULL)
                        && (!strcmp (suf, "html") || !strcmp (suf, "htm")))))
        if (!acceptable (u->file))
            DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                     constr, u->file));
            ulist = add_slist (ulist, constr, 0);
      /* Optimize the URL (which includes possible DNS lookup) only
         after all other possibilities have been exhausted.  */
      if (!opt.simple_check)
          /* Just lowercase the hostname.  */
          for (p = u->host; *p; p++)
          u->url = str_url (u, 0);
      constr = xstrdup (u->url);
      inl = in_slist (ulist, constr);
      if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
        if (!opt.spanhost && this_url && !same_host (this_url, constr))
            DEBUGP (("This is not the same hostname as the parent's.\n"));
            ulist = add_slist (ulist, constr, 0);
      /* What about robots.txt?  */
      if (!inl && opt.use_robots && u->proto == URLHTTP)
          /* Since Wget knows about only one set of robot rules at a
             time, /robots.txt must be reloaded whenever a new host is
             accessed.

             robots_host holds the host for which the current
             `forbidden' rules apply.  */
          if (!robots_host || !same_host (robots_host, u->host))
              FREE_MAYBE (robots_host);
              /* Now make robots_host the new host, no matter what the
                 result will be.  So if there is no /robots.txt on the
                 site, Wget will not retry getting robots all the
                 time.  */
              robots_host = xstrdup (u->host);
              free_vec (forbidden);
              err = retrieve_robots (constr, ROBOTS_FILENAME);
              rurl = robots_url (constr, ROBOTS_FILENAME);
              rfile = url_filename (rurl);
              forbidden = parse_robots (rfile);
          /* Now that we have (or don't have) robots, we can check for
             them.  */
          if (!robots_match (u, forbidden))
              DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
                       ROBOTS_FILENAME));
              ulist = add_slist (ulist, constr, 0);
      /* If it wasn't chucked out, do something with it.  */
      if (!inl)
          DEBUGP (("I've decided to load it -> "));
          /* Add it to the list of already-loaded URLs.  */
          ulist = add_slist (ulist, constr, 0);
          /* Automatically followed FTPs will *not* be downloaded
             recursively.  */
          if (u->proto == URLFTP)
              /* Don't you adore side-effects?  */
          /* Reset its type.  */
          retrieve_url (constr, &filename, &newloc,
                        canon_this_url ? canon_this_url : this_url, &dt);
          if (u->proto == URLFTP)
          /* In case of convert_links: If there was no error, add it to
             the list of downloaded URLs.  We might need it for the
             conversion.  */
          if (opt.convert_links && filename)
              urls_downloaded = add_url (urls_downloaded, constr, filename);
              /* If the URL is HTML, note it.  */
              urls_html = add_slist (urls_html, filename, NOSORT);
          /* If there was no error, and the type is text/html, parse
             it recursively.  */
              recursive_retrieve (filename, constr);
              DEBUGP (("%s is not text/html so we don't chase.\n",
                       filename ? filename : "(null)"));
          /* If a suffix-rejected file was loaded only because it was
             HTML, undo the error now.  */
          if (opt.delete_after || (filename && !acceptable (filename)))
              logprintf (LOG_VERBOSE,
                         (opt.delete_after ? _("Removing %s.\n")
                          : _("Removing %s since it should be rejected.\n")),
                         filename);
              if (unlink (filename))
                logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
          /* If everything was OK, and links are to be converted, let's
             store the local filename.  */
          if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
              cur_url->flags |= UABS2REL;
              cur_url->local_name = xstrdup (filename);
          DEBUGP (("%s already in list, so we don't load.\n", constr));
      /* Free filename and constr.  */
      FREE_MAYBE (filename);

  /* Increment the pbuf for the appropriate size.  */
  if (opt.convert_links)
    convert_links (file, url_list);
  /* Free the linked list of URLs.  */
  free_urlpos (url_list);
  /* Free the canonical this_url.  */
  FREE_MAYBE (canon_this_url);
  /* Decrement the recursion depth.  */
  --depth;
  if (opt.quota && (opt.downloaded > opt.quota))
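
#if 0
/* Illustrative sketch, not part of Wget: a minimal, self-contained
   rendering of the duplicate-avoidance idea used by recursive_retrieve()
   above.  Every URL that has been seen goes into a string list, and the
   list is consulted before anything is loaded, so no URL can be
   retrieved twice.  Wget's own slist, add_slist() and in_slist() play
   this role; the seen_url type and helpers below are hypothetical.  */
#include <stdlib.h>
#include <string.h>

struct seen_url
{
  char *url;
  struct seen_url *next;
};

/* Return non-zero if URL is already in LIST.  */
static int
seen_contains (const struct seen_url *list, const char *url)
{
  for (; list; list = list->next)
    if (!strcmp (list->url, url))
      return 1;
  return 0;
}

/* Prepend URL to *LIST unless it is already there.  */
static void
seen_add (struct seen_url **list, const char *url)
{
  struct seen_url *p;

  if (seen_contains (*list, url))
    return;
  p = malloc (sizeof *p);
  if (!p)
    return;
  p->url = strdup (url);
  p->next = *list;
  *list = p;
}
#endif /* illustrative sketch */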
/* Simple calls to convert_links will often fail because only the
   downloaded files are converted, and Wget cannot know which files
   will be converted in the future.  So, if we have file fileone.html
   with:

       <a href=/c/something.gif>

   and /c/something.gif was not downloaded because it exceeded the
   recursion depth, the reference will *not* be changed.

   However, later we can encounter /c/something.gif from an "upper"
   level HTML (let's call it filetwo.html), and it gets downloaded.

   But now we have a problem because /c/something.gif will be
   correctly transformed in filetwo.html, but not in fileone.html,
   since Wget could not have known that /c/something.gif would be
   downloaded in the future.

   This is why Wget must, after the whole retrieval, call
   convert_all_links to go once more through the entire list of
   retrieved HTMLs, and re-convert them.

   All the downloaded HTMLs are kept in urls_html, and downloaded URLs
   in urls_downloaded.  From these two lists, the information needed
   for the conversion is extracted.  */
convert_all_links (void)
  urlpos *l1, *l2, *urls;

  for (html = urls_html; html; html = html->next)
      DEBUGP (("Rescanning %s\n", html->string));
      /* Determine the URL of the HTML file.  get_urls_html will need
         it.  */
      for (urlhtml = urls_downloaded; urlhtml; urlhtml = urlhtml->next)
        if (!strcmp (urlhtml->local_name, html->string))
          DEBUGP (("It should correspond to %s.\n", urlhtml->url));
          DEBUGP (("I cannot find the corresponding URL.\n"));
      /* Parse the HTML file...  */
      urls = get_urls_html (html->string, urlhtml ? urlhtml->url : NULL, 1);
      for (l1 = urls; l1; l1 = l1->next)
          /* The URL must be in canonical form to be compared.  */
          res = parseurl (l1->url, u, 0);
          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.
             Note: not yet implemented; only ABS2REL works.  */
          for (l2 = urls_downloaded; l2; l2 = l2->next)
            if (!strcmp (l2->url, u->url))
                DEBUGP (("%s flagged for conversion, local %s\n",
                         l2->url, l2->local_name));
          /* Clear the flags.  */
          l1->flags &= ~(UABS2REL | UREL2ABS);
          /* Decide on the conversion direction.  */
              l1->flags |= UABS2REL;
              l1->local_name = xstrdup (l2->local_name);
              l1->flags |= UREL2ABS;
              l1->local_name = NULL;
      /* Convert the links in the file.  */
      convert_links (html->string, urls);
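
#if 0
/* Illustrative sketch, not part of Wget: it condenses the decision that
   convert_all_links() makes for every link found in a re-scanned HTML
   file.  Links whose targets were downloaded are marked for
   absolute-to-relative conversion and given the local file name; all
   others are marked for relative-to-absolute conversion.  The types
   below are hypothetical stand-ins for Wget's urlpos records.  */
#include <string.h>

struct downloaded_url           /* what urls_downloaded records */
{
  const char *url;
  const char *local_name;
  struct downloaded_url *next;
};

enum convert_direction { TO_RELATIVE, TO_ABSOLUTE };

/* Decide how LINK_URL should be rewritten; if its target was
   downloaded, return TO_RELATIVE and report the local file name
   through *LOCAL.  */
static enum convert_direction
conversion_for (const struct downloaded_url *dl, const char *link_url,
                const char **local)
{
  for (; dl; dl = dl->next)
    if (!strcmp (dl->url, link_url))
      {
        *local = dl->local_name;
        return TO_RELATIVE;
      }
  *local = NULL;
  return TO_ABSOLUTE;
}
#endif /* illustrative sketch */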
/* Robots support.  */

/* Construct the robots URL.  */
static struct urlinfo *
robots_url (const char *url, const char *robots_filename)
  struct urlinfo *u = newurl ();

  err = parseurl (url, u, 0);
  assert (err == URLOK && u->proto == URLHTTP);
  u->dir = xstrdup ("");
  u->file = xstrdup (robots_filename);
  u->url = str_url (u, 0);
/* Retrieves the robots_filename from the server's root directory, if
   possible.  Returns ROBOTSOK if robots were retrieved OK, and
   NOROBOTS if robots could not be retrieved for any reason.  */
retrieve_robots (const char *url, const char *robots_filename)
  u = robots_url (url, robots_filename);
  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
/* Parse the robots_filename and return the disallowed path components
   in a NULL-terminated, malloc-ed vector of character pointers.

   It should be fully compliant with the syntax described in the
   file norobots.txt, adopted by the robots mailing list
   (robots@webcrawler.com).  */
parse_robots (const char *robots_filename)
  char *line, *cmd, *str, *p;
  char *base_version, *version;
  int wget_matched;             /* Is this part meant for Wget?  */

  fp = fopen (robots_filename, "rb");

  /* Kill version number.  */
      STRDUP_ALLOCA (base_version, opt.useragent);
      STRDUP_ALLOCA (version, opt.useragent);
      int len = 10 + strlen (version_string);
      base_version = (char *)alloca (len);
      sprintf (base_version, "Wget/%s", version_string);
      version = (char *)alloca (len);
      sprintf (version, "Wget/%s", version_string);
  for (p = version; *p; p++)
  for (p = base_version; *p && *p != '/'; p++)
  /* Setting this to 1 means that Wget considers itself under
     restrictions by default, even if the User-Agent field is not
     present.  However, if it finds the user-agent set to anything
     other than Wget, the rest will be ignored (up to the following
     User-Agent field).  Thus you may have something like:

         Disallow: 1
         Disallow: 2
         User-Agent: stupid-robot
         Disallow: 3
         Disallow: 4
         User-Agent: Wget
         Disallow: 5
         Disallow: 6
         User-Agent: *
         Disallow: 7

     In this case the 1, 2, 5, 6 and 7 disallow lines will be
     stored.  */
  while ((line = read_whole_line (fp)))
      /* Destroy <CR> if there is one.  */
      if (len && line[len - 1] == '\r')
        line[len - 1] = '\0';
      /* According to specifications, optional space may be at the
         end of the line.  */
      DEBUGP (("Line: %s\n", line));
      for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
          DEBUGP (("(chucked out)\n"));
      for (str = cmd; *str && *str != ':'; str++);
          DEBUGP (("(chucked out)\n"));
      /* Zero-terminate the command.  */
      /* Look for the string beginning...  */
      for (; *str && ISSPACE (*str); str++);
      /* Look for comments and kill them off.  */
      for (p = str; *p; p++)
        if (*p && ISSPACE (*p) && *(p + 1) == '#')
            /* We have found a shell-style comment `<sp>+ #'.  Now
               rewind to the beginning of the spaces and place '\0'
               there.  */
            while (p > str && ISSPACE (*p))
      if (!strcasecmp (cmd, "User-agent"))
          /* Lowercase the agent string.  */
          for (p = str; *p; p++)
          /* If the string is `*', it matches.  */
          if (*str == '*' && !*(str + 1))
          /* If the string contains wildcards, we'll run it through
             fnmatch().  */
          if (has_wildcards_p (str))
              /* If the string contains '/', compare with the full
                 version.  Else, compare it to base_version.  */
              if (strchr (str, '/'))
                match = !fnmatch (str, version, 0);
                match = !fnmatch (str, base_version, 0);
          else                  /* Substring search */
              if (strstr (version, str))
          /* If Wget is not matched, skip all the entries up to the
             next User-agent field.  */
          wget_matched = match;
      else if (!wget_matched)
          DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
      else if (!strcasecmp (cmd, "Disallow"))
          /* If "Disallow" is empty, the robot is welcome.  */
              entries = (char **)xmalloc (sizeof (char *));
              entries = (char **)xrealloc (entries, (num + 2) * sizeof (char *));
              entries[num] = xstrdup (str);
              entries[++num] = NULL;
              /* Strip trailing spaces, according to specifications.  */
              for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--)
                if (ISSPACE (str[i]))
          /* unknown command */
          DEBUGP (("(chucked out)\n"));
/* May the URL U be loaded according to the disallowing rules stored
   in FORBIDDEN?  */
robots_match (struct urlinfo *u, char **forbidden)
  DEBUGP (("Matching %s against: ", u->path));
  for (; *forbidden; forbidden++)
      DEBUGP (("%s ", *forbidden));
      l = strlen (*forbidden);
      /* If dir is forbidden, we may not load the file.  */
      if (strncmp (u->path, *forbidden, l) == 0)
          DEBUGP (("matched.\n"));
          return 0;             /* Matches, i.e. does not load...  */
  DEBUGP (("not matched.\n"));
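
#if 0
/* Illustrative sketch, not part of Wget: it shows the data format the
   robots functions above agree on.  parse_robots() yields a
   NULL-terminated vector of disallowed path prefixes; for a robots.txt
   that disallows /cgi-bin/ and /tmp/ that would be
   { "/cgi-bin/", "/tmp/", NULL }, and robots_match() refuses any URL
   whose path begins with one of them.  url_allowed() below is a
   hypothetical, self-contained equivalent of that final check.  */
#include <string.h>

static int
url_allowed (const char *path, char *const *disallowed)
{
  if (!disallowed)
    return 1;                   /* no rules known, everything is welcome */
  for (; *disallowed; disallowed++)
    if (strncmp (path, *disallowed, strlen (*disallowed)) == 0)
      return 0;                 /* a forbidden prefix applies */
  return 1;
}
#endif /* illustrative sketch */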