1 /* Handling of recursive HTTP retrieving.
2 Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
28 #endif /* HAVE_STRING_H */
31 #endif /* HAVE_UNISTD_H */
35 #include <sys/types.h>
47 extern char *version_string;
49 #define ROBOTS_FILENAME "robots.txt"
51 static struct hash_table *dl_file_url_map;
52 static struct hash_table *dl_url_file_map;
54 /* List of HTML URLs. */
55 static slist *urls_html;
57 /* List of undesirable-to-load URLs. */
58 static struct hash_table *undesirable_urls;
60 /* List of forbidden locations. */
61 static char **forbidden = NULL;
63 /* Current recursion depth. */
66 /* Base directory we're recursing from (used by no_parent). */
67 static char *base_dir;
69 /* The host name for which we last checked robots. */
70 static char *robots_host;
72 static int first_time = 1;
74 /* Construct the robots URL. */
75 static struct urlinfo *robots_url PARAMS ((const char *, const char *));
76 static uerr_t retrieve_robots PARAMS ((const char *, const char *));
77 static char **parse_robots PARAMS ((const char *));
78 static int robots_match PARAMS ((struct urlinfo *, char **));
81 /* Cleanup the data structures associated with recursive retrieving
82 (the variables above). */
84 recursive_cleanup (void)
88 string_set_free (undesirable_urls);
89 undesirable_urls = NULL;
93 free_keys_and_values (dl_file_url_map);
94 hash_table_destroy (dl_file_url_map);
95 dl_file_url_map = NULL;
99 free_keys_and_values (dl_url_file_map);
100 hash_table_destroy (dl_url_file_map);
101 dl_url_file_map = NULL;
103 undesirable_urls = NULL;
104 free_vec (forbidden);
106 slist_free (urls_html);
108 FREE_MAYBE (base_dir);
109 FREE_MAYBE (robots_host);
113 /* Reset FIRST_TIME to 1, so that some action can be taken in
114 recursive_retrieve(). */
116 recursive_reset (void)
121 /* The core of recursive retrieving. Endless recursion is avoided by
122 keeping a set of already-encountered URLs, which is checked
123 before loading any URL. That way no URL can get loaded twice.
125 The function also supports specification of maximum recursion depth
126 and a number of other goodies. */
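/* Illustration only (not part of the original file): a minimal sketch of
   the "already seen" check described above, built from the string-set
   helpers this file already uses (make_string_hash_table, string_set_exists,
   string_set_add).  The helper name is hypothetical and the block is kept
   out of compilation.  */
#if 0
static int
url_seen_before (struct hash_table *visited, const char *url)
{
  if (string_set_exists (visited, url))
    return 1;                     /* loaded once already -- skip it */
  string_set_add (visited, url);  /* remember it for later checks */
  return 0;
}
#endif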
128 recursive_retrieve (const char *file, const char *this_url)
130 char *constr, *filename, *newloc;
131 char *canon_this_url = NULL;
132 int dt, inl, dash_p_leaf_HTML = FALSE;
133 int meta_disallow_follow;
134 int this_url_ftp; /* See the explanation below. */
136 struct urlinfo *rurl;
137 urlpos *url_list, *cur_url;
138 char *rfile; /* For robots */
141 assert (this_url != NULL);
142 assert (file != NULL);
143 /* If quota was exceeded earlier, bail out. */
144 if (downloaded_exceeds_quota ())
146 /* Cache the current URL in the list. */
149 /* These three operations need to be done only once per Wget
150 run. They should probably be at a different location. */
151 if (!undesirable_urls)
152 undesirable_urls = make_string_hash_table (0);
153 if (!dl_file_url_map)
154 dl_file_url_map = make_string_hash_table (0);
155 if (!dl_url_file_map)
156 dl_url_file_map = make_string_hash_table (0);
158 hash_table_clear (undesirable_urls);
159 string_set_add (undesirable_urls, this_url);
160 hash_table_clear (dl_file_url_map);
161 hash_table_clear (dl_url_file_map);
163 /* Enter this_url into the hash table, in original and "enhanced" form. */
165 err = parseurl (this_url, u, 0);
168 string_set_add (undesirable_urls, u->url);
169 hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (u->url));
170 hash_table_put (dl_url_file_map, xstrdup (u->url), xstrdup (file));
171 urls_html = slist_prepend (urls_html, file);
173 base_dir = xstrdup (u->dir); /* Set the base dir. */
174 /* Set the canonical this_url to be sent as referer. This
175 problem exists only when running the first time. */
176 canon_this_url = xstrdup (u->url);
180 DEBUGP (("Double yuck! The *base* URL is broken.\n"));
192 if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
193 /* We've exceeded the maximum recursion depth specified by the user. */
195 if (opt.page_requisites && depth <= opt.reclevel + 1)
196 /* When -p is specified, we can do one more partial recursion from the
197 "leaf nodes" on the HTML document tree. The recursion is partial in
198 that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
199 except for <LINK REL="stylesheet">. */
200 dash_p_leaf_HTML = TRUE;
202 /* Either -p wasn't specified or it was and we've already gone the one
203 extra (pseudo-)level that it affords us, so we need to bail out. */
205 DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
206 depth, opt.reclevel));
212 /* Determine whether this_url is an FTP URL. If it is, it means
213 that the retrieval is done through proxy. In that case, FTP
214 links will be followed by default and recursion will not be
215 turned off when following them. */
216 this_url_ftp = (urlproto (this_url) == URLFTP);
218 /* Get the URL-s from an HTML file: */
219 url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
220 dash_p_leaf_HTML, &meta_disallow_follow);
222 if (opt.use_robots && meta_disallow_follow)
224 /* The META tag says we are not to follow this file. Respect
226 free_urlpos (url_list);
230 /* Decide what to do with each of the URLs. A URL will be loaded if
231 it meets several requirements, discussed later. */
232 for (cur_url = url_list; cur_url; cur_url = cur_url->next)
234 /* If quota was exceeded earlier, bail out. */
235 if (downloaded_exceeds_quota ())
237 /* Parse the URL for convenient use in other functions, as well
238 as to get the optimized form. It also checks URL integrity. */
240 if (parseurl (cur_url->url, u, 0) != URLOK)
242 DEBUGP (("Yuck! A bad URL.\n"));
246 if (u->proto == URLFILE)
248 DEBUGP (("Nothing to do with file:// around here.\n"));
252 assert (u->url != NULL);
253 constr = xstrdup (u->url);
255 /* Several checks to determine whether a file is acceptable to load:
256 1. check if URL is ftp, and we don't load it
257 2. check for relative links (if relative_only is set)
259 4. check for no-parent
260 5. check for excludes && includes
262 7. check for same host (if spanhost is unset), with possible
263 gethostbyname baggage
264 8. check for robots.txt
266 Addendum: If the URL is FTP, and it is to be loaded, only the
267 domain and suffix settings are "stronger".
269 Note that .html and (yuck) .htm will get loaded regardless of
270 suffix rules (but that is remedied later with unlink) unless
271 the depth equals the maximum depth.
273 More time- and memory-consuming tests should be put later on
276 /* inl is set if the URL we are working on (constr) is stored in
277 undesirable_urls. Using it is crucial to avoid unnecessary
278 repeated hits to the hash table. */
279 inl = string_set_exists (undesirable_urls, constr);
281 /* If it is FTP, and FTP is not followed, chuck it out. */
283 if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
285 DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
286 string_set_add (undesirable_urls, constr);
289 /* If it is an absolute link and absolute links are not followed, chuck it
291 if (!inl && u->proto != URLFTP)
292 if (opt.relative_only && !cur_url->link_relative_p)
294 DEBUGP (("It doesn't really look like a relative link.\n"));
295 string_set_add (undesirable_urls, constr);
298 /* If its domain is not to be accepted/looked-up, chuck it out. */
300 if (!accept_domain (u))
302 DEBUGP (("I don't like the smell of that domain.\n"));
303 string_set_add (undesirable_urls, constr);
306 /* Check for parent directory. */
307 if (!inl && opt.no_parent
308 /* If the new URL is FTP and the old was not, ignore
310 && !(!this_url_ftp && u->proto == URLFTP))
312 /* Check for base_dir first. */
313 if (!(base_dir && frontcmp (base_dir, u->dir)))
315 /* Failing that, check for parent dir. */
316 struct urlinfo *ut = newurl ();
317 if (parseurl (this_url, ut, 0) != URLOK)
318 DEBUGP (("Double yuck! The *base* URL is broken.\n"));
319 else if (!frontcmp (ut->dir, u->dir))
321 /* Failing that too, kill the URL. */
322 DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
323 string_set_add (undesirable_urls, constr);
329 /* If the file does not match the acceptance list, or is on the
330 rejection list, chuck it out. The same goes for the
331 directory exclude- and include-lists. */
332 if (!inl && (opt.includes || opt.excludes))
334 if (!accdir (u->dir, ALLABS))
336 DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
337 string_set_add (undesirable_urls, constr);
344 /* We check for acceptance/rejection rules only for non-HTML
345 documents. Since we don't know whether they really are
346 HTML, it will be deduced from (an OR-ed list):
348 1) u->file is "" (meaning it is a directory)
349 2) suffix exists, AND:
353 If the file *is* supposed to be HTML, it will *not* be
354 subject to acc/rej rules, unless a finite maximum depth has
355 been specified and the current depth is the maximum depth. */
358 || (((suf = suffix (constr)) != NULL)
359 && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
360 && ((opt.reclevel != INFINITE_RECURSION) &&
361 (depth != opt.reclevel))))))
363 if (!acceptable (u->file))
365 DEBUGP (("%s (%s) does not match acc/rej rules.\n",
367 string_set_add (undesirable_urls, constr);
373 /* Optimize the URL (which includes possible DNS lookup) only
374 after all other possibilities have been exhausted. */
377 if (!opt.simple_check)
382 /* Just lowercase the hostname. */
383 for (p = u->host; *p; p++)
386 u->url = str_url (u, 0);
389 constr = xstrdup (u->url);
390 string_set_add (undesirable_urls, constr);
391 if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
392 if (!opt.spanhost && this_url && !same_host (this_url, constr))
394 DEBUGP (("This is not the same hostname as the parent's.\n"));
395 string_set_add (undesirable_urls, constr);
399 /* What about robots.txt? */
400 if (!inl && opt.use_robots && u->proto == URLHTTP)
402 /* Since Wget knows about only one set of robot rules at a
403 time, /robots.txt must be reloaded whenever a new host is
406 robots_host holds the host the current `forbidden' variable
408 if (!robots_host || !same_host (robots_host, u->host))
410 FREE_MAYBE (robots_host);
411 /* Now make robots_host the new host, no matter what the
412 result will be. So if there is no /robots.txt on the
413 site, Wget will not retry getting robots all the
415 robots_host = xstrdup (u->host);
416 free_vec (forbidden);
418 err = retrieve_robots (constr, ROBOTS_FILENAME);
421 rurl = robots_url (constr, ROBOTS_FILENAME);
422 rfile = url_filename (rurl);
423 forbidden = parse_robots (rfile);
429 /* Now that we have (or don't have) robots, we can check for
431 if (!robots_match (u, forbidden))
433 DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
435 string_set_add (undesirable_urls, constr);
441 /* If it wasn't chucked out, do something with it. */
444 DEBUGP (("I've decided to load it -> "));
445 /* Add it to the list of already-loaded URL-s. */
446 string_set_add (undesirable_urls, constr);
447 /* Automatically followed FTPs will *not* be downloaded
449 if (u->proto == URLFTP)
451 /* Don't you adore side-effects? */
454 /* Reset its type. */
457 retrieve_url (constr, &filename, &newloc,
458 canon_this_url ? canon_this_url : this_url, &dt);
459 if (u->proto == URLFTP)
469 /* In case of convert_links: If there was no error, add it to
470 the list of downloaded URLs. We might need it for
472 if (opt.convert_links && filename)
476 hash_table_put (dl_file_url_map,
477 xstrdup (filename), xstrdup (constr));
478 hash_table_put (dl_url_file_map,
479 xstrdup (constr), xstrdup (filename));
480 /* If the URL is HTML, note it. */
482 urls_html = slist_prepend (urls_html, filename);
485 /* If there was no error, and the type is text/html, parse
490 recursive_retrieve (filename, constr);
493 DEBUGP (("%s is not text/html so we don't chase.\n",
494 filename ? filename: "(null)"));
496 if (opt.delete_after || (filename && !acceptable (filename)))
497 /* Either --delete-after was specified, or we loaded this otherwise
498 rejected (e.g. by -R) HTML file just so we could harvest its
499 hyperlinks -- in either case, delete the local file. */
501 DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
502 opt.delete_after ? "--delete-after" :
503 "recursive rejection criteria"));
504 logprintf (LOG_VERBOSE,
505 (opt.delete_after ? _("Removing %s.\n")
506 : _("Removing %s since it should be rejected.\n")),
508 if (unlink (filename))
509 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
513 /* If everything was OK, and links are to be converted, let's
514 store the local filename. */
515 if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
517 cur_url->convert = CO_CONVERT_TO_RELATIVE;
518 cur_url->local_name = xstrdup (filename);
522 DEBUGP (("%s already in list, so we don't load.\n", constr));
523 /* Free filename and constr. */
524 FREE_MAYBE (filename);
527 /* Increment the pbuf for the appropriate size. */
529 if (opt.convert_links && !opt.delete_after)
530 /* This is merely the first pass: the links that have been
531 successfully downloaded are converted. In the second pass,
532 convert_all_links() will also convert those links that have NOT
533 been downloaded to their canonical form. */
534 convert_links (file, url_list);
535 /* Free the linked list of URL-s. */
536 free_urlpos (url_list);
537 /* Free the canonical this_url. */
538 FREE_MAYBE (canon_this_url);
539 /* Decrement the recursion depth. */
541 if (downloaded_exceeds_quota ())
547 /* convert_links() is called from recursive_retrieve() after we're
548 done with an HTML file. This call to convert_links is not complete
549 because it converts only the downloaded files, and Wget cannot know
550 which files will be downloaded afterwards. So, if we have file
553 <a href="/c/something.gif">
555 and /c/something.gif was not downloaded because it exceeded the
556 recursion depth, the reference will *not* be changed.
558 However, later we can encounter /c/something.gif from an "upper"
559 level HTML (let's call it filetwo.html), and it gets downloaded.
561 But now we have a problem because /c/something.gif will be
562 correctly transformed in filetwo.html, but not in fileone.html,
563 since Wget could not have known that /c/something.gif will be
564 downloaded in the future.
566 This is why Wget must, after the whole retrieval, call
567 convert_all_links to go once more through the entire list of
568 retrieved HTMLs, and re-convert them.
570 All the downloaded HTMLs are kept in urls_html, and downloaded URLs
571 in urls_downloaded. From these two lists information is
574 convert_all_links (void)
578 /* Destructively reverse urls_html to get it in the right order.
579 recursive_retrieve() used slist_prepend() consistently. */
580 urls_html = slist_nreverse (urls_html);
582 for (html = urls_html; html; html = html->next)
584 urlpos *urls, *cur_url;
587 DEBUGP (("Rescanning %s\n", html->string));
588 /* Determine the URL of the HTML file. get_urls_html will need
590 url = hash_table_get (dl_file_url_map, html->string);
592 DEBUGP (("It should correspond to %s.\n", url));
594 DEBUGP (("I cannot find the corresponding URL.\n"));
595 /* Parse the HTML file... */
596 urls = get_urls_html (html->string, url, FALSE, NULL);
597 /* We don't respect meta_disallow_follow here because, even if
598 the file is not followed, we might still want to convert the
599 links that have been followed from other files. */
600 for (cur_url = urls; cur_url; cur_url = cur_url->next)
604 /* The URL must be in canonical form to be compared. */
605 struct urlinfo *u = newurl ();
606 uerr_t res = parseurl (cur_url->url, u, 0);
612 /* We decide the direction of conversion according to whether
613 a URL was downloaded. Downloaded URLs will be converted
614 ABS2REL, whereas non-downloaded will be converted REL2ABS. */
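/* For example (illustrative names only): a link to a page that was saved
   locally is rewritten to point at that local file by a relative name,
   while a link to a page that was never downloaded is rewritten to its
   full "http://..." form so it still works when the tree is browsed
   locally.  */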
615 local_name = hash_table_get (dl_url_file_map, u->url);
617 DEBUGP (("%s marked for conversion, local %s\n",
618 u->url, local_name));
619 /* Decide on the conversion direction. */
622 /* We've downloaded this URL. Convert it to relative
623 form. We do this even if the URL already is in
624 relative form, because our directory structure may
625 not be identical to that on the server (think `-nd',
626 `--cut-dirs', etc.) */
627 cur_url->convert = CO_CONVERT_TO_RELATIVE;
628 cur_url->local_name = xstrdup (local_name);
632 /* We haven't downloaded this URL. If it's not already
633 complete (including a full host name), convert it to
634 that form, so it can be reached while browsing this
636 if (!cur_url->link_complete_p)
637 cur_url->convert = CO_CONVERT_TO_COMPLETE;
638 cur_url->local_name = NULL;
642 /* Convert the links in the file. */
643 convert_links (html->string, urls);
649 /* Robots support. */
651 /* Construct the robots URL. */
652 static struct urlinfo *
653 robots_url (const char *url, const char *robots_filename)
655 struct urlinfo *u = newurl ();
658 err = parseurl (url, u, 0);
659 assert (err == URLOK && u->proto == URLHTTP);
663 u->dir = xstrdup ("");
664 u->file = xstrdup (robots_filename);
665 u->url = str_url (u, 0);
669 /* Retrieves the robots_filename from the root server directory, if
670 possible. Returns ROBOTSOK if robots were retrieved OK, and
671 NOROBOTS if robots could not be retrieved for any reason. */
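/* For illustration (hypothetical host): with url =
   "http://www.example.com/a/b.html" and robots_filename = "robots.txt",
   robots_url() above replaces the directory and file components, so the
   request goes to "http://www.example.com/robots.txt".  */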
673 retrieve_robots (const char *url, const char *robots_filename)
679 u = robots_url (url, robots_filename);
680 logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
681 err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
689 /* Parse the robots_filename and return the disallowed path components
690 in a malloc-ed vector of character pointers.
692 It should be fully compliant with the syntax as described in the
693 file norobots.txt, adopted by the robots mailing list
694 (robots@webcrawler.com). */
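/* Illustration only (example rules, not from the original file): given a
   robots.txt such as

     User-agent: *
     Disallow: /cgi-bin/
     Disallow: /tmp/

   this function returns a NULL-terminated vector equivalent to

     { "/cgi-bin/", "/tmp/", NULL }

   which robots_match() below then checks against u->path as a prefix.  */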
696 parse_robots (const char *robots_filename)
700 char *line, *cmd, *str, *p;
701 char *base_version, *version;
703 int wget_matched; /* is the part meant for Wget? */
708 fp = fopen (robots_filename, "rb");
712 /* Kill version number. */
715 STRDUP_ALLOCA (base_version, opt.useragent);
716 STRDUP_ALLOCA (version, opt.useragent);
720 int len = 10 + strlen (version_string);
721 base_version = (char *)alloca (len);
722 sprintf (base_version, "Wget/%s", version_string);
723 version = (char *)alloca (len);
724 sprintf (version, "Wget/%s", version_string);
726 for (p = version; *p; p++)
728 for (p = base_version; *p && *p != '/'; p++)
732 /* Setting this to 1 means that Wget considers itself under
733 restrictions by default, even if the User-Agent field is not
734 present. However, if it finds the user-agent set to anything
735 other than Wget, the rest will be ignored (up to the following
736 User-Agent field). Thus you may have something like:
740 User-Agent: stupid-robot
749 In this case the 1, 2, 5, 6 and 7 disallow lines will be
752 while ((line = read_whole_line (fp)))
755 /* Destroy <CR><LF> if present. */
756 if (len && line[len - 1] == '\n')
758 if (len && line[len - 1] == '\r')
760 /* According to specifications, optional space may be at the
762 DEBUGP (("Line: %s\n", line));
764 for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
768 DEBUGP (("(chucked out)\n"));
772 for (str = cmd; *str && *str != ':'; str++);
776 DEBUGP (("(chucked out)\n"));
779 /* Zero-terminate the command. */
781 /* Look for the string beginning... */
782 for (; *str && ISSPACE (*str); str++);
783 /* Look for comments or trailing spaces and kill them off. */
784 for (p = str; *p; p++)
785 if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
787 /* We have found either a shell-style comment `<sp>+#' or some
788 trailing spaces. Now rewind to the beginning of the spaces
789 and place '\0' there. */
790 while (p > str && ISSPACE (*p))
798 if (!strcasecmp (cmd, "User-agent"))
801 /* Lowercase the agent string. */
802 for (p = str; *p; p++)
804 /* If the string is `*', it matches. */
805 if (*str == '*' && !*(str + 1))
809 /* If the string contains wildcards, we'll run it through
811 if (has_wildcards_p (str))
813 /* If the string contains '/', compare with the full
814 version. Else, compare it to base_version. */
815 if (strchr (str, '/'))
816 match = !fnmatch (str, version, 0);
818 match = !fnmatch (str, base_version, 0);
820 else /* Substring search */
822 if (strstr (version, str))
828 /* If Wget is not matched, skip all the entries up to the
829 next User-agent field. */
830 wget_matched = match;
832 else if (!wget_matched)
835 DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
838 else if (!strcasecmp (cmd, "Disallow"))
840 /* If "Disallow" is empty, the robot is welcome. */
844 entries = (char **)xmalloc (sizeof (char *));
850 entries = (char **)xrealloc (entries, (num + 2) * sizeof (char *));
851 /* Strip trailing spaces, according to specifications, before storing. */
852 for (i = strlen (str) - 1; i >= 0 && ISSPACE (str[i]); i--)
853 str[i] = '\0';
854 entries[num] = xstrdup (str);
855 entries[++num] = NULL;
861 /* unknown command */
862 DEBUGP (("(chucked out)\n"));
870 /* May the URL url be loaded according to disallowing rules stored in
873 robots_match (struct urlinfo *u, char **forbidden)
879 DEBUGP (("Matching %s against: ", u->path));
880 for (; *forbidden; forbidden++)
882 DEBUGP (("%s ", *forbidden));
883 l = strlen (*forbidden);
884 /* If dir is forbidden, we may not load the file. */
885 if (strncmp (u->path, *forbidden, l) == 0)
887 DEBUGP (("matched.\n"));
888 return 0; /* Matches, i.e. does not load... */
891 DEBUGP (("not matched.\n"));