1 /* Handling of recursive HTTP retrieving.
2 Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
28 #endif /* HAVE_STRING_H */
31 #endif /* HAVE_UNISTD_H */
34 #include <sys/types.h>
50 extern char *version_string;
52 #define ROBOTS_FILENAME "robots.txt"
54 static struct hash_table *dl_file_url_map;
55 static struct hash_table *dl_url_file_map;
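/* dl_file_url_map maps a local file name to the URL it was saved
   from, and dl_url_file_map maps a URL to the local file it was
   saved to; both are filled in by register_download() below and
   consulted by convert_all_links() when rewriting links after the
   retrieval is finished.  */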
57 /* List of HTML files downloaded in this Wget run. Used for link
58 conversion after Wget is done. */
59 static slist *downloaded_html_files;
61 /* List of undesirable-to-load URLs. */
62 static struct hash_table *undesirable_urls;
64 /* List of forbidden locations. */
65 static char **forbidden = NULL;
67 /* Current recursion depth. */
70 /* Base directory we're recursing from (used by no_parent). */
71 static char *base_dir;
73 /* The host name for which we last checked robots. */
74 static char *robots_host;
76 static int first_time = 1;
78 /* Construct the robots URL. */
79 static struct urlinfo *robots_url PARAMS ((const char *, const char *));
80 static uerr_t retrieve_robots PARAMS ((const char *, const char *));
81 static char **parse_robots PARAMS ((const char *));
82 static int robots_match PARAMS ((struct urlinfo *, char **));
/* Clean up the data structures associated with recursive retrieving
   (the variables above).  */
88 recursive_cleanup (void)
92 string_set_free (undesirable_urls);
93 undesirable_urls = NULL;
97 free_keys_and_values (dl_file_url_map);
98 hash_table_destroy (dl_file_url_map);
99 dl_file_url_map = NULL;
103 free_keys_and_values (dl_url_file_map);
104 hash_table_destroy (dl_url_file_map);
105 dl_url_file_map = NULL;
107 undesirable_urls = NULL;
108 free_vec (forbidden);
110 slist_free (downloaded_html_files);
111 downloaded_html_files = NULL;
112 FREE_MAYBE (base_dir);
113 FREE_MAYBE (robots_host);
117 /* Reset FIRST_TIME to 1, so that some action can be taken in
118 recursive_retrieve(). */
120 recursive_reset (void)
/* The core of recursive retrieving.  Endless recursion is avoided by
   keeping a record of already-seen URLs (the undesirable_urls string
   hash table), which is checked before loading any URL.  That way no
   URL can get loaded twice.
129 The function also supports specification of maximum recursion depth
130 and a number of other goodies. */
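/* In outline (a paraphrase of the code below):
   1. bail out if the download quota has been exceeded;
   2. on the first call, record the base directory and the canonical
      referer;
   3. bail out, or switch to the -p "leaf" mode, if the maximum
      recursion depth has been reached;
   4. extract the URLs from FILE with get_urls_html();
   5. run each URL through the acceptance checks (FTP, relative,
      domain, no-parent, include/exclude, suffix, host, robots);
   6. retrieve the acceptable ones, recursing into those that come
      back as text/html;
   7. convert the links in FILE if -k was given.  */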
132 recursive_retrieve (const char *file, const char *this_url)
134 char *constr, *filename, *newloc;
135 char *canon_this_url = NULL;
136 int dt, inl, dash_p_leaf_HTML = FALSE;
137 int meta_disallow_follow;
int this_url_ftp; /* See the explanation below. */
140 struct urlinfo *rurl;
141 urlpos *url_list, *cur_url;
142 char *rfile; /* For robots */
145 assert (this_url != NULL);
146 assert (file != NULL);
147 /* If quota was exceeded earlier, bail out. */
148 if (downloaded_exceeds_quota ())
150 /* Cache the current URL in the list. */
153 /* These three operations need to be done only once per Wget
154 run. They should probably be at a different location. */
155 if (!undesirable_urls)
156 undesirable_urls = make_string_hash_table (0);
158 hash_table_clear (undesirable_urls);
159 string_set_add (undesirable_urls, this_url);
/* Enter this_url into the hash table, in original and "enhanced" form.  */
162 err = parseurl (this_url, u, 0);
165 string_set_add (undesirable_urls, u->url);
167 base_dir = xstrdup (u->dir); /* Set the base dir. */
/* Set the canonical this_url to be sent as the referer.  This is
   needed only on the first run.  */
170 canon_this_url = xstrdup (u->url);
174 DEBUGP (("Double yuck! The *base* URL is broken.\n"));
186 if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
187 /* We've exceeded the maximum recursion depth specified by the user. */
189 if (opt.page_requisites && depth <= opt.reclevel + 1)
190 /* When -p is specified, we can do one more partial recursion from the
191 "leaf nodes" on the HTML document tree. The recursion is partial in
192 that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
193 except for <LINK REL="stylesheet">. */
194 dash_p_leaf_HTML = TRUE;
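/* Illustrative example: with `-r -l2 -p', a page found at depth 3
   still gets scanned here, but only its page requisites (inline
   images, <LINK REL="stylesheet">, etc.) are followed, not ordinary
   <A>/<AREA> links.  */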
196 /* Either -p wasn't specified or it was and we've already gone the one
197 extra (pseudo-)level that it affords us, so we need to bail out. */
199 DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
200 depth, opt.reclevel));
/* Determine whether this_url is an FTP URL.  If it is, it means
   that the retrieval is done through a proxy.  In that case, FTP
208 links will be followed by default and recursion will not be
209 turned off when following them. */
210 this_url_ftp = (urlproto (this_url) == URLFTP);
/* Get the URLs from an HTML file: */
213 url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
214 dash_p_leaf_HTML, &meta_disallow_follow);
216 if (opt.use_robots && meta_disallow_follow)
/* The META tag says we are not to follow this file.  Respect that.  */
220 free_urlpos (url_list);
224 /* Decide what to do with each of the URLs. A URL will be loaded if
225 it meets several requirements, discussed later. */
226 for (cur_url = url_list; cur_url; cur_url = cur_url->next)
228 /* If quota was exceeded earlier, bail out. */
229 if (downloaded_exceeds_quota ())
231 /* Parse the URL for convenient use in other functions, as well
232 as to get the optimized form. It also checks URL integrity. */
234 if (parseurl (cur_url->url, u, 0) != URLOK)
236 DEBUGP (("Yuck! A bad URL.\n"));
240 if (u->proto == URLFILE)
242 DEBUGP (("Nothing to do with file:// around here.\n"));
246 assert (u->url != NULL);
247 constr = xstrdup (u->url);
/* Several checks to decide whether a file is acceptable to load:
   1. check if URL is ftp, and we don't load it
   2. check for relative links (if relative_only is set)
   3. check for domain
   4. check for no-parent
   5. check for excludes && includes
   6. check for suffix (acceptance/rejection rules)
   7. check for same host (if spanhost is unset), with possible
   gethostbyname baggage
   8. check for robots.txt
260 Addendum: If the URL is FTP, and it is to be loaded, only the
261 domain and suffix settings are "stronger".
263 Note that .html and (yuck) .htm will get loaded regardless of
264 suffix rules (but that is remedied later with unlink) unless
265 the depth equals the maximum depth.
   More time- and memory-consuming tests should be put later on the
   list.  */
/* inl is set if the URL we are working on (constr) is stored in
   undesirable_urls.  Using it avoids repeated lookups in the hash
   table.  */
273 inl = string_set_contains (undesirable_urls, constr);
275 /* If it is FTP, and FTP is not followed, chuck it out. */
277 if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
279 DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
280 string_set_add (undesirable_urls, constr);
/* If it is an absolute link and absolute links are not followed,
   chuck it out.  */
if (!inl && u->proto != URLFTP)
286 if (opt.relative_only && !cur_url->link_relative_p)
288 DEBUGP (("It doesn't really look like a relative link.\n"));
289 string_set_add (undesirable_urls, constr);
292 /* If its domain is not to be accepted/looked-up, chuck it out. */
294 if (!accept_domain (u))
296 DEBUGP (("I don't like the smell of that domain.\n"));
297 string_set_add (undesirable_urls, constr);
300 /* Check for parent directory. */
301 if (!inl && opt.no_parent
/* If the new URL is FTP and the old was not, ignore
   opt.no_parent.  */
&& !(!this_url_ftp && u->proto == URLFTP))
306 /* Check for base_dir first. */
307 if (!(base_dir && frontcmp (base_dir, u->dir)))
309 /* Failing that, check for parent dir. */
310 struct urlinfo *ut = newurl ();
311 if (parseurl (this_url, ut, 0) != URLOK)
312 DEBUGP (("Double yuck! The *base* URL is broken.\n"));
313 else if (!frontcmp (ut->dir, u->dir))
315 /* Failing that too, kill the URL. */
316 DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
317 string_set_add (undesirable_urls, constr);
323 /* If the file does not match the acceptance list, or is on the
324 rejection list, chuck it out. The same goes for the
325 directory exclude- and include- lists. */
326 if (!inl && (opt.includes || opt.excludes))
328 if (!accdir (u->dir, ALLABS))
330 DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
331 string_set_add (undesirable_urls, constr);
338 /* We check for acceptance/rejection rules only for non-HTML
339 documents. Since we don't know whether they really are
340 HTML, it will be deduced from (an OR-ed list):
342 1) u->file is "" (meaning it is a directory)
343 2) suffix exists, AND:
347 If the file *is* supposed to be HTML, it will *not* be
348 subject to acc/rej rules, unless a finite maximum depth has
349 been specified and the current depth is the maximum depth. */
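/* Illustrative example: with `-r -A "*.jpg"', a page such as
   index.html does not match the accept list, but it is still
   retrieved here so its links can be harvested; the
   otherwise-rejected HTML file is then removed further below (see
   the opt.delete_after / !acceptable (filename) branch).  */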
352 || (((suf = suffix (constr)) != NULL)
353 && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
354 && ((opt.reclevel != INFINITE_RECURSION) &&
355 (depth != opt.reclevel))))))
357 if (!acceptable (u->file))
359 DEBUGP (("%s (%s) does not match acc/rej rules.\n",
361 string_set_add (undesirable_urls, constr);
367 /* Optimize the URL (which includes possible DNS lookup) only
368 after all other possibilities have been exhausted. */
371 if (!opt.simple_check)
376 /* Just lowercase the hostname. */
377 for (p = u->host; *p; p++)
380 u->url = str_url (u, 0);
383 constr = xstrdup (u->url);
384 /* After we have canonicalized the URL, check if we have it
385 on the black list. */
386 if (string_set_contains (undesirable_urls, constr))
388 /* This line is bogus. */
389 /*string_set_add (undesirable_urls, constr);*/
391 if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
392 if (!opt.spanhost && this_url && !same_host (this_url, constr))
394 DEBUGP (("This is not the same hostname as the parent's.\n"));
395 string_set_add (undesirable_urls, constr);
399 /* What about robots.txt? */
400 if (!inl && opt.use_robots && u->proto == URLHTTP)
/* Since Wget knows about only one set of robot rules at a
   time, /robots.txt must be reloaded whenever a new host is
   accessed.

   robots_host holds the host the current `forbidden' vector
   applies to.  */
if (!robots_host || !same_host (robots_host, u->host))
410 FREE_MAYBE (robots_host);
/* Now make robots_host the new host, no matter what the
   result will be.  So if there is no /robots.txt on the
   site, Wget will not retry getting robots all the time.  */
415 robots_host = xstrdup (u->host);
416 free_vec (forbidden);
418 err = retrieve_robots (constr, ROBOTS_FILENAME);
421 rurl = robots_url (constr, ROBOTS_FILENAME);
422 rfile = url_filename (rurl);
423 forbidden = parse_robots (rfile);
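/* At this point forbidden is a NULL-terminated vector of
   disallowed path prefixes, e.g. (illustrative values only):
   { "/cgi-bin/", "/private/", NULL }.  */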
/* Now that we have (or don't have) robots, we can check for
   them.  */
if (!robots_match (u, forbidden))
433 DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
435 string_set_add (undesirable_urls, constr);
441 /* If it wasn't chucked out, do something with it. */
444 DEBUGP (("I've decided to load it -> "));
/* Add it to the list of already-loaded URLs. */
446 string_set_add (undesirable_urls, constr);
/* Automatically followed FTPs will *not* be downloaded
   recursively.  */
if (u->proto == URLFTP)
451 /* Don't you adore side-effects? */
454 /* Reset its type. */
457 retrieve_url (constr, &filename, &newloc,
458 canon_this_url ? canon_this_url : this_url, &dt);
459 if (u->proto == URLFTP)
/* If there was no error, and the type is text/html, parse
   it recursively.  */
474 recursive_retrieve (filename, constr);
477 DEBUGP (("%s is not text/html so we don't chase.\n",
478 filename ? filename: "(null)"));
480 if (opt.delete_after || (filename && !acceptable (filename)))
481 /* Either --delete-after was specified, or we loaded this otherwise
482 rejected (e.g. by -R) HTML file just so we could harvest its
483 hyperlinks -- in either case, delete the local file. */
485 DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
486 opt.delete_after ? "--delete-after" :
487 "recursive rejection criteria"));
488 logprintf (LOG_VERBOSE,
489 (opt.delete_after ? _("Removing %s.\n")
490 : _("Removing %s since it should be rejected.\n")),
492 if (unlink (filename))
493 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
497 /* If everything was OK, and links are to be converted, let's
498 store the local filename. */
499 if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
501 cur_url->convert = CO_CONVERT_TO_RELATIVE;
502 cur_url->local_name = xstrdup (filename);
506 DEBUGP (("%s already in list, so we don't load.\n", constr));
507 /* Free filename and constr. */
508 FREE_MAYBE (filename);
511 /* Increment the pbuf for the appropriate size. */
513 if (opt.convert_links && !opt.delete_after)
514 /* This is merely the first pass: the links that have been
515 successfully downloaded are converted. In the second pass,
516 convert_all_links() will also convert those links that have NOT
517 been downloaded to their canonical form. */
518 convert_links (file, url_list);
/* Free the linked list of URLs. */
520 free_urlpos (url_list);
521 /* Free the canonical this_url. */
522 FREE_MAYBE (canon_this_url);
523 /* Decrement the recursion depth. */
525 if (downloaded_exceeds_quota ())
532 register_download (const char *url, const char *file)
534 if (!opt.convert_links)
536 if (!dl_file_url_map)
537 dl_file_url_map = make_string_hash_table (0);
538 hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
539 if (!dl_url_file_map)
540 dl_url_file_map = make_string_hash_table (0);
541 hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
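/* Illustrative usage (the file name layout is just an example):
   after
     register_download ("http://host/a/index.html", "host/a/index.html");
   convert_all_links() can look the mapping up in either direction:
     hash_table_get (dl_file_url_map, "host/a/index.html");
     hash_table_get (dl_url_file_map, "http://host/a/index.html");  */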
545 register_html (const char *url, const char *file)
547 if (!opt.convert_links)
549 downloaded_html_files = slist_prepend (downloaded_html_files, file);
552 /* convert_links() is called from recursive_retrieve() after we're
553 done with an HTML file. This call to convert_links is not complete
554 because it converts only the downloaded files, and Wget cannot know
   which files will be downloaded afterwards.  So, if we have file
   fileone.html containing:
558 <a href="/c/something.gif">
560 and /c/something.gif was not downloaded because it exceeded the
561 recursion depth, the reference will *not* be changed.
563 However, later we can encounter /c/something.gif from an "upper"
564 level HTML (let's call it filetwo.html), and it gets downloaded.
566 But now we have a problem because /c/something.gif will be
567 correctly transformed in filetwo.html, but not in fileone.html,
568 since Wget could not have known that /c/something.gif will be
569 downloaded in the future.
571 This is why Wget must, after the whole retrieval, call
572 convert_all_links to go once more through the entire list of
573 retrieved HTMLs, and re-convert them.
   All the downloaded HTMLs are kept in downloaded_html_files, and
   the downloaded URLs in dl_url_file_map.  From these two
   collections the needed information is extracted.  */
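/* In outline (a paraphrase of the code below): for each downloaded
   HTML file, in download order, look up the URL it came from,
   re-extract its links, and mark each link either
   CO_CONVERT_TO_RELATIVE (its target was downloaded, according to
   dl_url_file_map) or, if it was not downloaded and is not already
   a complete URL, CO_CONVERT_TO_COMPLETE; then rewrite the file
   with convert_links().  */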
579 convert_all_links (void)
583 /* Destructively reverse downloaded_html_files to get it in the right order.
584 recursive_retrieve() used slist_prepend() consistently. */
585 downloaded_html_files = slist_nreverse (downloaded_html_files);
587 for (html = downloaded_html_files; html; html = html->next)
589 urlpos *urls, *cur_url;
592 DEBUGP (("Rescanning %s\n", html->string));
/* Determine the URL of the HTML file.  get_urls_html will need
   it.  */
595 url = hash_table_get (dl_file_url_map, html->string);
597 DEBUGP (("It should correspond to %s.\n", url));
599 DEBUGP (("I cannot find the corresponding URL.\n"));
600 /* Parse the HTML file... */
601 urls = get_urls_html (html->string, url, FALSE, NULL);
602 /* We don't respect meta_disallow_follow here because, even if
603 the file is not followed, we might still want to convert the
604 links that have been followed from other files. */
605 for (cur_url = urls; cur_url; cur_url = cur_url->next)
609 /* The URL must be in canonical form to be compared. */
610 struct urlinfo *u = newurl ();
611 uerr_t res = parseurl (cur_url->url, u, 0);
617 /* We decide the direction of conversion according to whether
618 a URL was downloaded. Downloaded URLs will be converted
619 ABS2REL, whereas non-downloaded will be converted REL2ABS. */
620 local_name = hash_table_get (dl_url_file_map, u->url);
622 DEBUGP (("%s marked for conversion, local %s\n",
623 u->url, local_name));
624 /* Decide on the conversion direction. */
627 /* We've downloaded this URL. Convert it to relative
628 form. We do this even if the URL already is in
629 relative form, because our directory structure may
630 not be identical to that on the server (think `-nd',
631 `--cut-dirs', etc.) */
632 cur_url->convert = CO_CONVERT_TO_RELATIVE;
633 cur_url->local_name = xstrdup (local_name);
/* We haven't downloaded this URL.  If it's not already
   complete (including a full host name), convert it to
   that form, so it can be reached while browsing this
   HTML locally.  */
641 if (!cur_url->link_complete_p)
642 cur_url->convert = CO_CONVERT_TO_COMPLETE;
643 cur_url->local_name = NULL;
647 /* Convert the links in the file. */
648 convert_links (html->string, urls);
654 /* Robots support. */
656 /* Construct the robots URL. */
657 static struct urlinfo *
658 robots_url (const char *url, const char *robots_filename)
660 struct urlinfo *u = newurl ();
663 err = parseurl (url, u, 0);
664 assert (err == URLOK && u->proto == URLHTTP);
668 u->dir = xstrdup ("");
669 u->file = xstrdup (robots_filename);
670 u->url = str_url (u, 0);
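/* Illustrative example: for url "http://www.server.com/a/b.html"
   the resulting struct describes "http://www.server.com/robots.txt",
   i.e. the directory is emptied and the file is replaced with
   robots_filename.  */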
674 /* Retrieves the robots_filename from the root server directory, if
675 possible. Returns ROBOTSOK if robots were retrieved OK, and
676 NOROBOTS if robots could not be retrieved for any reason. */
678 retrieve_robots (const char *url, const char *robots_filename)
684 u = robots_url (url, robots_filename);
685 logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
686 err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
694 /* Parse the robots_filename and return the disallowed path components
695 in a malloc-ed vector of character pointers.
697 It should be fully compliant with the syntax as described in the
698 file norobots.txt, adopted by the robots mailing list
699 (robots@webcrawler.com). */
701 parse_robots (const char *robots_filename)
705 char *line, *cmd, *str, *p;
706 char *base_version, *version;
708 int wget_matched; /* is the part meant for Wget? */
713 fp = fopen (robots_filename, "rb");
717 /* Kill version number. */
720 STRDUP_ALLOCA (base_version, opt.useragent);
721 STRDUP_ALLOCA (version, opt.useragent);
725 int len = 10 + strlen (version_string);
726 base_version = (char *)alloca (len);
727 sprintf (base_version, "Wget/%s", version_string);
728 version = (char *)alloca (len);
729 sprintf (version, "Wget/%s", version_string);
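/* The loops below lowercase the two strings; base_version is
   additionally truncated at the first '/'.  Illustrative values,
   assuming the stock user agent: version = "wget/1.7",
   base_version = "wget".  User-agent lines from robots.txt are
   later compared against one or the other (see the matching code
   below).  */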
731 for (p = version; *p; p++)
733 for (p = base_version; *p && *p != '/'; p++)
737 /* Setting this to 1 means that Wget considers itself under
738 restrictions by default, even if the User-Agent field is not
739 present. However, if it finds the user-agent set to anything
740 other than Wget, the rest will be ignored (up to the following
741 User-Agent field). Thus you may have something like:
745 User-Agent: stupid-robot
   In this case the 1, 2, 5, 6 and 7 disallow lines will be
   honored.  */
757 while ((line = read_whole_line (fp)))
759 int len = strlen (line);
760 /* Destroy <CR><LF> if present. */
761 if (len && line[len - 1] == '\n')
763 if (len && line[len - 1] == '\r')
/* According to specifications, optional space may be at the
   end of the line.  */
767 DEBUGP (("Line: %s\n", line));
769 for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
773 DEBUGP (("(chucked out)\n"));
777 for (str = cmd; *str && *str != ':'; str++);
781 DEBUGP (("(chucked out)\n"));
784 /* Zero-terminate the command. */
786 /* Look for the string beginning... */
787 for (; *str && ISSPACE (*str); str++);
788 /* Look for comments or trailing spaces and kill them off. */
789 for (p = str; *p; p++)
790 if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
792 /* We have found either a shell-style comment `<sp>+#' or some
793 trailing spaces. Now rewind to the beginning of the spaces
794 and place '\0' there. */
795 while (p > str && ISSPACE (*p))
803 if (!strcasecmp (cmd, "User-agent"))
806 /* Lowercase the agent string. */
807 for (p = str; *p; p++)
809 /* If the string is `*', it matches. */
810 if (*str == '*' && !*(str + 1))
/* If the string contains wildcards, we'll run it through
   fnmatch().  */
816 if (has_wildcards_p (str))
818 /* If the string contains '/', compare with the full
819 version. Else, compare it to base_version. */
820 if (strchr (str, '/'))
821 match = !fnmatch (str, version, 0);
823 match = !fnmatch (str, base_version, 0);
825 else /* Substring search */
827 if (strstr (version, str))
833 /* If Wget is not matched, skip all the entries up to the
834 next User-agent field. */
835 wget_matched = match;
837 else if (!wget_matched)
840 DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
843 else if (!strcasecmp (cmd, "Disallow"))
845 /* If "Disallow" is empty, the robot is welcome. */
849 entries = (char **)xmalloc (sizeof (char *));
/* Strip trailing spaces from the path, according to
   specifications, before storing it.  */
for (i = strlen (str) - 1; i >= 0 && ISSPACE (str[i]); i--)
str[i] = '\0';
entries = (char **)xrealloc (entries, (num + 2) * sizeof (char *));
entries[num] = xstrdup (str);
entries[++num] = NULL;
866 /* unknown command */
867 DEBUGP (("(chucked out)\n"));
/* May the URL in U be loaded according to the disallowing rules
   stored in FB?  */
robots_match (struct urlinfo *u, char **fb)
884 DEBUGP (("Matching %s against: ", u->path));
887 DEBUGP (("%s ", *fb));
/* If u->path begins with *fb, we may not load the file.  */
890 if (strncmp (u->path, *fb, l) == 0)
892 DEBUGP (("matched.\n"));
893 return 0; /* Matches, i.e. does not load... */
896 DEBUGP (("not matched.\n"));