sjero.net Git - wget/blob - src/recur.c

   1 /* Handling of recursive HTTP retrieving.
   2    Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif /* HAVE_STRING_H */
  29 #ifdef HAVE_UNISTD_H
  30 # include <unistd.h>
  31 #endif /* HAVE_UNISTD_H */
  32 #include <errno.h>
  33 #include <assert.h>
  34 #include <ctype.h>
  35 #include <sys/types.h>
  36
  37 #include "wget.h"
  38 #include "url.h"
  39 #include "recur.h"
  40 #include "utils.h"
  41 #include "retr.h"
  42 #include "ftp.h"
  43 #include "fnmatch.h"
  44 #include "host.h"
  45 #include "hash.h"
  46
  47 #ifndef errno
  48 extern int errno;
  49 #endif
  50
  51 extern char *version_string;
  52
  53 #define ROBOTS_FILENAME "robots.txt"
  54
  55 static struct hash_table *dl_file_url_map;
  56 static struct hash_table *dl_url_file_map;
  57
  58 /* List of HTML URLs.  */
  59 static slist *urls_html;
  60
  61 /* List of undesirable-to-load URLs.  */
  62 static struct hash_table *undesirable_urls;
  63
  64 /* List of forbidden locations.  */
  65 static char **forbidden = NULL;
  66
  67 /* Current recursion depth.  */
  68 static int depth;
  69
  70 /* Base directory we're recursing from (used by no_parent).  */
  71 static char *base_dir;
  72
  73 /* The host name for which we last checked robots.  */
  74 static char *robots_host;
  75
  76 static int first_time = 1;
  77
  78 /* Construct the robots URL.  */
  79 static struct urlinfo *robots_url PARAMS ((const char *, const char *));
  80 static uerr_t retrieve_robots PARAMS ((const char *, const char *));
  81 static char **parse_robots PARAMS ((const char *));
  82 static int robots_match PARAMS ((struct urlinfo *, char **));
  83
  84
  85 /* Cleanup the data structures associated with recursive retrieving
  86    (the variables above).  */
  87 void
  88 recursive_cleanup (void)
  89 {
  90   if (undesirable_urls)
  91     {
  92       string_set_free (undesirable_urls);
  93       undesirable_urls = NULL;
  94     }
  95   if (dl_file_url_map)
  96     {
  97       free_keys_and_values (dl_file_url_map);
  98       hash_table_destroy (dl_file_url_map);
  99       dl_file_url_map = NULL;
 100     }
 101   if (dl_url_file_map)
 102     {
 103       free_keys_and_values (dl_url_file_map);
 104       hash_table_destroy (dl_url_file_map);
 105       dl_url_file_map = NULL;
 106     }
 107   undesirable_urls = NULL;
 108   free_vec (forbidden);
 109   forbidden = NULL;
 110   slist_free (urls_html);
 111   urls_html = NULL;
 112   FREE_MAYBE (base_dir);
 113   FREE_MAYBE (robots_host);
 114   first_time = 1;
 115 }
 116
 117 /* Reset FIRST_TIME to 1, so that some action can be taken in
 118    recursive_retrieve().  */
 119 void
 120 recursive_reset (void)
 121 {
 122   first_time = 1;
 123 }
 124
 125 /* The core of recursive retrieving.  Endless recursion is avoided by
 126    having all URLs stored to a linked list of URLs, which is checked
 127    before loading any URL.  That way no URL can get loaded twice.
 128
 129    The function also supports specification of maximum recursion depth
 130    and a number of other goodies.  */
 131 uerr_t
 132 recursive_retrieve (const char *file, const char *this_url)
 133 {
 134   char *constr, *filename, *newloc;
 135   char *canon_this_url = NULL;
 136   int dt, inl, dash_p_leaf_HTML = FALSE;
 137   int meta_disallow_follow;
 138   int this_url_ftp;            /* See below the explanation */
 139   uerr_t err;
 140   struct urlinfo *rurl;
 141   urlpos *url_list, *cur_url;
 142   char *rfile; /* For robots */
 143   struct urlinfo *u;
 144
 145   assert (this_url != NULL);
 146   assert (file != NULL);
 147   /* If quota was exceeded earlier, bail out.  */
 148   if (downloaded_exceeds_quota ())
 149     return QUOTEXC;
 150   /* Cache the current URL in the list.  */
 151   if (first_time)
 152     {
 153       /* These three operations need to be done only once per Wget
 154          run.  They should probably be at a different location.  */
 155       if (!undesirable_urls)
 156         undesirable_urls = make_string_hash_table (0);
 157       if (!dl_file_url_map)
 158         dl_file_url_map = make_string_hash_table (0);
 159       if (!dl_url_file_map)
 160         dl_url_file_map = make_string_hash_table (0);
 161
 162       hash_table_clear (undesirable_urls);
 163       string_set_add (undesirable_urls, this_url);
 164       hash_table_clear (dl_file_url_map);
 165       hash_table_clear (dl_url_file_map);
 166       urls_html = NULL;
 167       /* Enter this_url to the hash table, in original and "enhanced" form.  */
 168       u = newurl ();
 169       err = parseurl (this_url, u, 0);
 170       if (err == URLOK)
 171         {
 172           string_set_add (undesirable_urls, u->url);
 173           hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (u->url));
 174           hash_table_put (dl_url_file_map, xstrdup (u->url), xstrdup (file));
 175           urls_html = slist_prepend (urls_html, file);
 176           if (opt.no_parent)
 177             base_dir = xstrdup (u->dir); /* Set the base dir.  */
 178           /* Set the canonical this_url to be sent as referer.  This
 179              problem exists only when running the first time.  */
 180           canon_this_url = xstrdup (u->url);
 181         }
 182       else
 183         {
 184           DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
 185           base_dir = NULL;
 186         }
 187       freeurl (u, 1);
 188       depth = 1;
 189       robots_host = NULL;
 190       forbidden = NULL;
 191       first_time = 0;
 192     }
 193   else
 194     ++depth;
 195
 196   if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
 197     /* We've exceeded the maximum recursion depth specified by the user. */
 198     {
 199       if (opt.page_requisites && depth <= opt.reclevel + 1)
 200         /* When -p is specified, we can do one more partial recursion from the
 201            "leaf nodes" on the HTML document tree.  The recursion is partial in
 202            that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
 203            except for <LINK REL="stylesheet">. */
 204         dash_p_leaf_HTML = TRUE;
 205       else
 206         /* Either -p wasn't specified or it was and we've already gone the one
 207            extra (pseudo-)level that it affords us, so we need to bail out. */
 208         {
 209           DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
 210                    depth, opt.reclevel));
 211           --depth;
 212           return RECLEVELEXC;
 213         }
 214     }
 215
 216   /* Determine whether this_url is an FTP URL.  If it is, it means
 217      that the retrieval is done through proxy.  In that case, FTP
 218      links will be followed by default and recursion will not be
 219      turned off when following them.  */
 220   this_url_ftp = (urlproto (this_url) == URLFTP);
 221
 222   /* Get the URL-s from an HTML file: */
 223   url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
 224                             dash_p_leaf_HTML, &meta_disallow_follow);
 225
 226   if (opt.use_robots && meta_disallow_follow)
 227     {
 228       /* The META tag says we are not to follow this file.  Respect
 229          that.  */
 230       free_urlpos (url_list);
 231       url_list = NULL;
 232     }
 233
 234   /* Decide what to do with each of the URLs.  A URL will be loaded if
 235      it meets several requirements, discussed later.  */
 236   for (cur_url = url_list; cur_url; cur_url = cur_url->next)
 237     {
 238       /* If quota was exceeded earlier, bail out.  */
 239       if (downloaded_exceeds_quota ())
 240         break;
 241       /* Parse the URL for convenient use in other functions, as well
 242          as to get the optimized form.  It also checks URL integrity.  */
 243       u = newurl ();
 244       if (parseurl (cur_url->url, u, 0) != URLOK)
 245         {
 246           DEBUGP (("Yuck!  A bad URL.\n"));
 247           freeurl (u, 1);
 248           continue;
 249         }
 250       if (u->proto == URLFILE)
 251         {
 252           DEBUGP (("Nothing to do with file:// around here.\n"));
 253           freeurl (u, 1);
 254           continue;
 255         }
 256       assert (u->url != NULL);
 257       constr = xstrdup (u->url);
 258
 259       /* Several checkings whether a file is acceptable to load:
 260          1. check if URL is ftp, and we don't load it
 261          2. check for relative links (if relative_only is set)
 262          3. check for domain
 263          4. check for no-parent
 264          5. check for excludes && includes
 265          6. check for suffix
 266          7. check for same host (if spanhost is unset), with possible
 267          gethostbyname baggage
 268          8. check for robots.txt
 269
 270          Addendum: If the URL is FTP, and it is to be loaded, only the
 271          domain and suffix settings are "stronger".
 272
 273          Note that .html and (yuck) .htm will get loaded regardless of
 274          suffix rules (but that is remedied later with unlink) unless
 275          the depth equals the maximum depth.
 276
 277          More time- and memory- consuming tests should be put later on
 278          the list.  */
 279
 280       /* inl is set if the URL we are working on (constr) is stored in
 281          undesirable_urls.  Using it is crucial to avoid unnecessary
 282          repeated continuous hits to the hash table.  */
 283       inl = string_set_exists (undesirable_urls, constr);
 284
 285       /* If it is FTP, and FTP is not followed, chuck it out.  */
 286       if (!inl)
 287         if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
 288           {
 289             DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
 290             string_set_add (undesirable_urls, constr);
 291             inl = 1;
 292           }
 293       /* If it is absolute link and they are not followed, chuck it
 294          out.  */
 295       if (!inl && u->proto != URLFTP)
 296         if (opt.relative_only && !cur_url->link_relative_p)
 297           {
 298             DEBUGP (("It doesn't really look like a relative link.\n"));
 299             string_set_add (undesirable_urls, constr);
 300             inl = 1;
 301           }
 302       /* If its domain is not to be accepted/looked-up, chuck it out.  */
 303       if (!inl)
 304         if (!accept_domain (u))
 305           {
 306             DEBUGP (("I don't like the smell of that domain.\n"));
 307             string_set_add (undesirable_urls, constr);
 308             inl = 1;
 309           }
 310       /* Check for parent directory.  */
 311       if (!inl && opt.no_parent
 312           /* If the new URL is FTP and the old was not, ignore
 313              opt.no_parent.  */
 314           && !(!this_url_ftp && u->proto == URLFTP))
 315         {
 316           /* Check for base_dir first.  */
 317           if (!(base_dir && frontcmp (base_dir, u->dir)))
 318             {
 319               /* Failing that, check for parent dir.  */
 320               struct urlinfo *ut = newurl ();
 321               if (parseurl (this_url, ut, 0) != URLOK)
 322                 DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
 323               else if (!frontcmp (ut->dir, u->dir))
 324                 {
 325                   /* Failing that too, kill the URL.  */
 326                   DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
 327                   string_set_add (undesirable_urls, constr);
 328                   inl = 1;
 329                 }
 330               freeurl (ut, 1);
 331             }
 332         }
 333       /* If the file does not match the acceptance list, or is on the
 334          rejection list, chuck it out.  The same goes for the
 335          directory exclude- and include- lists.  */
 336       if (!inl && (opt.includes || opt.excludes))
 337         {
 338           if (!accdir (u->dir, ALLABS))
 339             {
 340               DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
 341               string_set_add (undesirable_urls, constr);
 342               inl = 1;
 343             }
 344         }
 345       if (!inl)
 346         {
 347           char *suf = NULL;
 348           /* We check for acceptance/rejection rules only for non-HTML
 349              documents.  Since we don't know whether they really are
 350              HTML, it will be deduced from (an OR-ed list):
 351
 352              1) u->file is "" (meaning it is a directory)
 353              2) suffix exists, AND:
 354              a) it is "html", OR
 355              b) it is "htm"
 356
 357              If the file *is* supposed to be HTML, it will *not* be
 358             subject to acc/rej rules, unless a finite maximum depth has
 359             been specified and the current depth is the maximum depth. */
 360           if (!
 361               (!*u->file
 362                || (((suf = suffix (constr)) != NULL)
 363                   && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
 364                       && ((opt.reclevel != INFINITE_RECURSION) &&
 365                           (depth != opt.reclevel))))))
 366             {
 367               if (!acceptable (u->file))
 368                 {
 369                   DEBUGP (("%s (%s) does not match acc/rej rules.\n",
 370                           constr, u->file));
 371                   string_set_add (undesirable_urls, constr);
 372                   inl = 1;
 373                 }
 374             }
 375           FREE_MAYBE (suf);
 376         }
 377       /* Optimize the URL (which includes possible DNS lookup) only
 378          after all other possibilities have been exhausted.  */
 379       if (!inl)
 380         {
 381           if (!opt.simple_check)
 382             opt_url (u);
 383           else
 384             {
 385               char *p;
 386               /* Just lowercase the hostname.  */
 387               for (p = u->host; *p; p++)
 388                 *p = TOLOWER (*p);
 389               xfree (u->url);
 390               u->url = str_url (u, 0);
 391             }
 392           xfree (constr);
 393           constr = xstrdup (u->url);
 394           string_set_add (undesirable_urls, constr);
 395           if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
 396             if (!opt.spanhost && this_url && !same_host (this_url, constr))
 397               {
 398                 DEBUGP (("This is not the same hostname as the parent's.\n"));
 399                 string_set_add (undesirable_urls, constr);
 400                 inl = 1;
 401               }
 402         }
 403       /* What about robots.txt?  */
 404       if (!inl && opt.use_robots && u->proto == URLHTTP)
 405         {
 406           /* Since Wget knows about only one set of robot rules at a
 407              time, /robots.txt must be reloaded whenever a new host is
 408              accessed.
 409
 410              robots_host holds the host the current `forbid' variable
 411              is assigned to.  */
 412           if (!robots_host || !same_host (robots_host, u->host))
 413             {
 414               FREE_MAYBE (robots_host);
 415               /* Now make robots_host the new host, no matter what the
 416                  result will be.  So if there is no /robots.txt on the
 417                  site, Wget will not retry getting robots all the
 418                  time.  */
 419               robots_host = xstrdup (u->host);
 420               free_vec (forbidden);
 421               forbidden = NULL;
 422               err = retrieve_robots (constr, ROBOTS_FILENAME);
 423               if (err == ROBOTSOK)
 424                 {
 425                   rurl = robots_url (constr, ROBOTS_FILENAME);
 426                   rfile = url_filename (rurl);
 427                   forbidden = parse_robots (rfile);
 428                   freeurl (rurl, 1);
 429                   xfree (rfile);
 430                 }
 431             }
 432
 433           /* Now that we have (or don't have) robots, we can check for
 434              them.  */
 435           if (!robots_match (u, forbidden))
 436             {
 437               DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
 438                        ROBOTS_FILENAME));
 439               string_set_add (undesirable_urls, constr);
 440               inl = 1;
 441             }
 442         }
 443
 444       filename = NULL;
 445       /* If it wasn't chucked out, do something with it.  */
 446       if (!inl)
 447         {
 448           DEBUGP (("I've decided to load it -> "));
 449           /* Add it to the list of already-loaded URL-s.  */
 450           string_set_add (undesirable_urls, constr);
 451           /* Automatically followed FTPs will *not* be downloaded
 452              recursively.  */
 453           if (u->proto == URLFTP)
 454             {
 455               /* Don't you adore side-effects?  */
 456               opt.recursive = 0;
 457             }
 458           /* Reset its type.  */
 459           dt = 0;
 460           /* Retrieve it.  */
 461           retrieve_url (constr, &filename, &newloc,
 462                        canon_this_url ? canon_this_url : this_url, &dt);
 463           if (u->proto == URLFTP)
 464             {
 465               /* Restore...  */
 466               opt.recursive = 1;
 467             }
 468           if (newloc)
 469             {
 470               xfree (constr);
 471               constr = newloc;
 472             }
 473           /* In case of convert_links: If there was no error, add it to
 474              the list of downloaded URLs.  We might need it for
 475              conversion.  */
 476           if (opt.convert_links && filename)
 477             {
 478               if (dt & RETROKF)
 479                 {
 480                   hash_table_put (dl_file_url_map,
 481                                   xstrdup (filename), xstrdup (constr));
 482                   hash_table_put (dl_url_file_map,
 483                                   xstrdup (constr), xstrdup (filename));
 484                   /* If the URL is HTML, note it.  */
 485                   if (dt & TEXTHTML)
 486                     urls_html = slist_prepend (urls_html, filename);
 487                 }
 488             }
 489           /* If there was no error, and the type is text/html, parse
 490              it recursively.  */
 491           if (dt & TEXTHTML)
 492             {
 493               if (dt & RETROKF)
 494                 recursive_retrieve (filename, constr);
 495             }
 496           else
 497             DEBUGP (("%s is not text/html so we don't chase.\n",
 498                      filename ? filename: "(null)"));
 499
 500           if (opt.delete_after || (filename && !acceptable (filename)))
 501             /* Either --delete-after was specified, or we loaded this otherwise
 502                rejected (e.g. by -R) HTML file just so we could harvest its
 503                hyperlinks -- in either case, delete the local file. */
 504             {
 505               DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
 506                        opt.delete_after ? "--delete-after" :
 507                        "recursive rejection criteria"));
 508               logprintf (LOG_VERBOSE,
 509                          (opt.delete_after ? _("Removing %s.\n")
 510                           : _("Removing %s since it should be rejected.\n")),
 511                          filename);
 512               if (unlink (filename))
 513                 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 514               dt &= ~RETROKF;
 515             }
 516
 517           /* If everything was OK, and links are to be converted, let's
 518              store the local filename.  */
 519           if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
 520             {
 521               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 522               cur_url->local_name = xstrdup (filename);
 523             }
 524         }
 525       else
 526         DEBUGP (("%s already in list, so we don't load.\n", constr));
 527       /* Free filename and constr.  */
 528       FREE_MAYBE (filename);
 529       FREE_MAYBE (constr);
 530       freeurl (u, 1);
 531       /* Increment the pbuf for the appropriate size.  */
 532     }
 533   if (opt.convert_links && !opt.delete_after)
 534     /* This is merely the first pass: the links that have been
 535        successfully downloaded are converted.  In the second pass,
 536        convert_all_links() will also convert those links that have NOT
 537        been downloaded to their canonical form.  */
 538     convert_links (file, url_list);
 539   /* Free the linked list of URL-s.  */
 540   free_urlpos (url_list);
 541   /* Free the canonical this_url.  */
 542   FREE_MAYBE (canon_this_url);
 543   /* Decrement the recursion depth.  */
 544   --depth;
 545   if (downloaded_exceeds_quota ())
 546     return QUOTEXC;
 547   else
 548     return RETROK;
 549 }
 550 \f
 551 /* convert_links() is called from recursive_retrieve() after we're
 552    done with an HTML file.  This call to convert_links is not complete
 553    because it converts only the downloaded files, and Wget cannot know
 554    which files will be downloaded afterwards.  So, if we have file
 555    fileone.html with:
 556
 557    <a href="/c/something.gif">
 558
 559    and /c/something.gif was not downloaded because it exceeded the
 560    recursion depth, the reference will *not* be changed.
 561
 562    However, later we can encounter /c/something.gif from an "upper"
 563    level HTML (let's call it filetwo.html), and it gets downloaded.
 564
 565    But now we have a problem because /c/something.gif will be
 566    correctly transformed in filetwo.html, but not in fileone.html,
 567    since Wget could not have known that /c/something.gif will be
 568    downloaded in the future.
 569
 570    This is why Wget must, after the whole retrieval, call
 571    convert_all_links to go once more through the entire list of
 572    retrieved HTMLs, and re-convert them.
 573
 574    All the downloaded HTMLs are kept in urls_html, and downloaded URLs
 575    in urls_downloaded.  From these two lists information is
 576    extracted.  */
 577 void
 578 convert_all_links (void)
 579 {
 580   slist *html;
 581
 582   /* Destructively reverse urls_html to get it in the right order.
 583      recursive_retrieve() used slist_prepend() consistently.  */
 584   urls_html = slist_nreverse (urls_html);
 585
 586   for (html = urls_html; html; html = html->next)
 587     {
 588       urlpos *urls, *cur_url;
 589       char *url;
 590
 591       DEBUGP (("Rescanning %s\n", html->string));
 592       /* Determine the URL of the HTML file.  get_urls_html will need
 593          it.  */
 594       url = hash_table_get (dl_file_url_map, html->string);
 595       if (url)
 596         DEBUGP (("It should correspond to %s.\n", url));
 597       else
 598         DEBUGP (("I cannot find the corresponding URL.\n"));
 599       /* Parse the HTML file...  */
 600       urls = get_urls_html (html->string, url, FALSE, NULL);
 601       /* We don't respect meta_disallow_follow here because, even if
 602          the file is not followed, we might still want to convert the
 603          links that have been followed from other files.  */
 604       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 605         {
 606           char *local_name;
 607
 608           /* The URL must be in canonical form to be compared.  */
 609           struct urlinfo *u = newurl ();
 610           uerr_t res = parseurl (cur_url->url, u, 0);
 611           if (res != URLOK)
 612             {
 613               freeurl (u, 1);
 614               continue;
 615             }
 616           /* We decide the direction of conversion according to whether
 617              a URL was downloaded.  Downloaded URLs will be converted
 618              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 619           local_name = hash_table_get (dl_url_file_map, u->url);
 620           if (local_name)
 621             DEBUGP (("%s marked for conversion, local %s\n",
 622                      u->url, local_name));
 623           /* Decide on the conversion direction.  */
 624           if (local_name)
 625             {
 626               /* We've downloaded this URL.  Convert it to relative
 627                  form.  We do this even if the URL already is in
 628                  relative form, because our directory structure may
 629                  not be identical to that on the server (think `-nd',
 630                  `--cut-dirs', etc.)  */
 631               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 632               cur_url->local_name = xstrdup (local_name);
 633             }
 634           else
 635             {
 636               /* We haven't downloaded this URL.  If it's not already
 637                  complete (including a full host name), convert it to
 638                  that form, so it can be reached while browsing this
 639                  HTML locally.  */
 640               if (!cur_url->link_complete_p)
 641                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 642               cur_url->local_name = NULL;
 643             }
 644           freeurl (u, 1);
 645         }
 646       /* Convert the links in the file.  */
 647       convert_links (html->string, urls);
 648       /* Free the data.  */
 649       free_urlpos (urls);
 650     }
 651 }
 652 \f
 653 /* Robots support.  */
 654
 655 /* Construct the robots URL.  */
 656 static struct urlinfo *
 657 robots_url (const char *url, const char *robots_filename)
 658 {
 659   struct urlinfo *u = newurl ();
 660   uerr_t err;
 661
 662   err = parseurl (url, u, 0);
 663   assert (err == URLOK && u->proto == URLHTTP);
 664   xfree (u->file);
 665   xfree (u->dir);
 666   xfree (u->url);
 667   u->dir = xstrdup ("");
 668   u->file = xstrdup (robots_filename);
 669   u->url = str_url (u, 0);
 670   return u;
 671 }
 672
 673 /* Retrieves the robots_filename from the root server directory, if
 674    possible.  Returns ROBOTSOK if robots were retrieved OK, and
 675    NOROBOTS if robots could not be retrieved for any reason.  */
 676 static uerr_t
 677 retrieve_robots (const char *url, const char *robots_filename)
 678 {
 679   int dt;
 680   uerr_t err;
 681   struct urlinfo *u;
 682
 683   u = robots_url (url, robots_filename);
 684   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
 685   err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
 686   freeurl (u, 1);
 687   if (err == RETROK)
 688     return ROBOTSOK;
 689   else
 690     return NOROBOTS;
 691 }
 692
 693 /* Parse the robots_filename and return the disallowed path components
 694    in a malloc-ed vector of character pointers.
 695
 696    It should be fully compliant with the syntax as described in the
 697    file norobots.txt, adopted by the robots mailing list
 698    (robots@webcrawler.com).  */
 699 static char **
 700 parse_robots (const char *robots_filename)
 701 {
 702   FILE *fp;
 703   char **entries;
 704   char *line, *cmd, *str, *p;
 705   char *base_version, *version;
 706   int len, num, i;
 707   int wget_matched;             /* is the part meant for Wget?  */
 708
 709   entries = NULL;
 710
 711   num = 0;
 712   fp = fopen (robots_filename, "rb");
 713   if (!fp)
 714     return NULL;
 715
 716   /* Kill version number.  */
 717     if (opt.useragent)
 718       {
 719         STRDUP_ALLOCA (base_version, opt.useragent);
 720         STRDUP_ALLOCA (version, opt.useragent);
 721       }
 722     else
 723       {
 724         int len = 10 + strlen (version_string);
 725         base_version = (char *)alloca (len);
 726         sprintf (base_version, "Wget/%s", version_string);
 727         version = (char *)alloca (len);
 728         sprintf (version, "Wget/%s", version_string);
 729       }
 730   for (p = version; *p; p++)
 731     *p = TOLOWER (*p);
 732   for (p = base_version; *p && *p != '/'; p++)
 733     *p = TOLOWER (*p);
 734   *p = '\0';
 735
 736   /* Setting this to 1 means that Wget considers itself under
 737      restrictions by default, even if the User-Agent field is not
 738      present.  However, if it finds the user-agent set to anything
 739      other than Wget, the rest will be ignored (up to the following
 740      User-Agent field).  Thus you may have something like:
 741
 742      Disallow: 1
 743      Disallow: 2
 744      User-Agent: stupid-robot
 745      Disallow: 3
 746      Disallow: 4
 747      User-Agent: Wget*
 748      Disallow: 5
 749      Disallow: 6
 750      User-Agent: *
 751      Disallow: 7
 752
 753      In this case the 1, 2, 5, 6 and 7 disallow lines will be
 754      stored.  */
 755   wget_matched = 1;
 756   while ((line = read_whole_line (fp)))
 757     {
 758       len = strlen (line);
 759       /* Destroy <CR><LF> if present.  */
 760       if (len && line[len - 1] == '\n')
 761         line[--len] = '\0';
 762       if (len && line[len - 1] == '\r')
 763         line[--len] = '\0';
 764       /* According to specifications, optional space may be at the
 765          end...  */
 766       DEBUGP (("Line: %s\n", line));
 767       /* Skip spaces.  */
 768       for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
 769       if (!*cmd)
 770         {
 771           xfree (line);
 772           DEBUGP (("(chucked out)\n"));
 773           continue;
 774         }
 775       /* Look for ':'.  */
 776       for (str = cmd; *str && *str != ':'; str++);
 777       if (!*str)
 778         {
 779           xfree (line);
 780           DEBUGP (("(chucked out)\n"));
 781           continue;
 782         }
 783       /* Zero-terminate the command.  */
 784       *str++ = '\0';
 785       /* Look for the string beginning...  */
 786       for (; *str && ISSPACE (*str); str++);
 787       /* Look for comments or trailing spaces and kill them off.  */
 788       for (p = str; *p; p++)
 789         if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
 790           {
 791             /* We have found either a shell-style comment `<sp>+#' or some
 792                trailing spaces.  Now rewind to the beginning of the spaces
 793                and place '\0' there.  */
 794             while (p > str && ISSPACE (*p))
 795               --p;
 796             if (p == str)
 797               *p = '\0';
 798             else
 799               *(p + 1) = '\0';
 800             break;
 801           }
 802       if (!strcasecmp (cmd, "User-agent"))
 803         {
 804           int match = 0;
 805           /* Lowercase the agent string.  */
 806           for (p = str; *p; p++)
 807             *p = TOLOWER (*p);
 808           /* If the string is `*', it matches.  */
 809           if (*str == '*' && !*(str + 1))
 810             match = 1;
 811           else
 812             {
 813               /* If the string contains wildcards, we'll run it through
 814                  fnmatch().  */
 815               if (has_wildcards_p (str))
 816                 {
 817                   /* If the string contains '/', compare with the full
 818                      version.  Else, compare it to base_version.  */
 819                   if (strchr (str, '/'))
 820                     match = !fnmatch (str, version, 0);
 821                   else
 822                     match = !fnmatch (str, base_version, 0);
 823                 }
 824               else                /* Substring search */
 825                 {
 826                   if (strstr (version, str))
 827                     match = 1;
 828                   else
 829                     match = 0;
 830                 }
 831             }
 832           /* If Wget is not matched, skip all the entries up to the
 833              next User-agent field.  */
 834           wget_matched = match;
 835         }
 836       else if (!wget_matched)
 837         {
 838           xfree (line);
 839           DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
 840           continue;
 841         }
 842       else if (!strcasecmp (cmd, "Disallow"))
 843         {
 844           /* If "Disallow" is empty, the robot is welcome.  */
 845           if (!*str)
 846             {
 847               free_vec (entries);
 848               entries = (char **)xmalloc (sizeof (char *));
 849               *entries = NULL;
 850               num = 0;
 851             }
 852           else
 853             {
 854               entries = (char **)xrealloc (entries, (num + 2)* sizeof (char *));
 855               entries[num] = xstrdup (str);
 856               entries[++num] = NULL;
 857               /* Strip trailing spaces, according to specifications.  */
 858               for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--)
 859                 if (ISSPACE (str[i]))
 860                   str[i] = '\0';
 861             }
 862         }
 863       else
 864         {
 865           /* unknown command */
 866           DEBUGP (("(chucked out)\n"));
 867         }
 868       xfree (line);
 869     }
 870   fclose (fp);
 871   return entries;
 872 }
 873
 874 /* May the URL url be loaded according to disallowing rules stored in
 875    forbidden?  */
 876 static int
 877 robots_match (struct urlinfo *u, char **forbidden)
 878 {
 879   int l;
 880
 881   if (!forbidden)
 882     return 1;
 883   DEBUGP (("Matching %s against: ", u->path));
 884   for (; *forbidden; forbidden++)
 885     {
 886       DEBUGP (("%s ", *forbidden));
 887       l = strlen (*forbidden);
 888       /* If dir is forbidden, we may not load the file.  */
 889       if (strncmp (u->path, *forbidden, l) == 0)
 890         {
 891           DEBUGP (("matched.\n"));
 892           return 0; /* Matches, i.e. does not load...  */
 893         }
 894     }
 895   DEBUGP (("not matched.\n"));
 896   return 1;
 897 }