/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.

This file is part of Wget.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */

#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <sys/types.h>

extern char *version_string;

#define ROBOTS_FILENAME "robots.txt"

static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;
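/* The two tables above are mirror images of each other:
   dl_file_url_map maps a local file name to the URL it was retrieved
   from, and dl_url_file_map maps that URL back to the local file
   name.  As a purely illustrative (hypothetical) example, after
   downloading http://www.example.com/index.html the first table
   might map "www.example.com/index.html" to
   "http://www.example.com/index.html", and the second table holds
   the reverse mapping.  */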

/* List of HTML URLs.  */
static slist *urls_html;

/* List of undesirable-to-load URLs.  */
static struct hash_table *undesirable_urls;
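/* Note that this set does double duty: URLs rejected by the checks
   in recursive_retrieve() are added to it, and so is every URL that
   actually gets loaded, so a repeated occurrence of the same link is
   skipped with a single hash lookup.  */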

/* List of forbidden locations.  */
static char **forbidden = NULL;
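/* For instance, after parse_robots() has read a (hypothetical)
   robots.txt containing "Disallow: /cgi-bin/" and "Disallow: /tmp/",
   this vector would hold { "/cgi-bin/", "/tmp/", NULL }.  */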

/* Current recursion depth.  */
static int depth;

/* Base directory we're recursing from (used by no_parent).  */
static char *base_dir;

/* The host name for which we last checked robots.  */
static char *robots_host;

static int first_time = 1;

/* Construct the robots URL.  */
static struct urlinfo *robots_url PARAMS ((const char *, const char *));
static uerr_t retrieve_robots PARAMS ((const char *, const char *));
static char **parse_robots PARAMS ((const char *));
static int robots_match PARAMS ((struct urlinfo *, char **));

/* Cleanup the data structures associated with recursive retrieving
   (the variables above).  */
recursive_cleanup (void)
string_set_free (undesirable_urls);
undesirable_urls = NULL;
free_keys_and_values (dl_file_url_map);
hash_table_destroy (dl_file_url_map);
dl_file_url_map = NULL;
free_keys_and_values (dl_url_file_map);
hash_table_destroy (dl_url_file_map);
dl_url_file_map = NULL;
undesirable_urls = NULL;
free_vec (forbidden);
slist_free (urls_html);
FREE_MAYBE (base_dir);
FREE_MAYBE (robots_host);

/* Reset FIRST_TIME to 1, so that some action can be taken in
   recursive_retrieve().  */
recursive_reset (void)
first_time = 1;

/* The core of recursive retrieving.  Endless recursion is avoided by
   having all URLs stored to a linked list of URLs, which is checked
   before loading any URL.  That way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
recursive_retrieve (const char *file, const char *this_url)
char *constr, *filename, *newloc;
char *canon_this_url = NULL;
int dt, inl, dash_p_leaf_HTML = FALSE;
int meta_disallow_follow;
int this_url_ftp;            /* See the explanation below.  */
struct urlinfo *rurl;
urlpos *url_list, *cur_url;
char *rfile; /* For robots */

assert (this_url != NULL);
assert (file != NULL);
/* If quota was exceeded earlier, bail out.  */
if (downloaded_exceeds_quota ())
  return QUOTEXC;
/* Cache the current URL in the list.  */
/* These three operations need to be done only once per Wget
   run.  They should probably be at a different location.  */
if (!undesirable_urls)
  undesirable_urls = make_string_hash_table (0);
if (!dl_file_url_map)
  dl_file_url_map = make_string_hash_table (0);
if (!dl_url_file_map)
  dl_url_file_map = make_string_hash_table (0);
hash_table_clear (undesirable_urls);
string_set_add (undesirable_urls, this_url);
hash_table_clear (dl_file_url_map);
hash_table_clear (dl_url_file_map);
/* Enter this_url to the hash table, in original and "enhanced" form.  */
err = parseurl (this_url, u, 0);
string_set_add (undesirable_urls, u->url);
hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (u->url));
hash_table_put (dl_url_file_map, xstrdup (u->url), xstrdup (file));
urls_html = slist_prepend (urls_html, file);
base_dir = xstrdup (u->dir); /* Set the base dir.  */
/* Set the canonical this_url to be sent as referer.  This
   problem exists only when running the first time.  */
canon_this_url = xstrdup (u->url);
DEBUGP (("Double yuck!  The *base* URL is broken.\n"));

if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
/* We've exceeded the maximum recursion depth specified by the user. */
if (opt.page_requisites && depth <= opt.reclevel + 1)
/* When -p is specified, we can do one more partial recursion from the
   "leaf nodes" on the HTML document tree.  The recursion is partial in
   that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
   except for <LINK REL="stylesheet">.  */
dash_p_leaf_HTML = TRUE;
/* Either -p wasn't specified or it was and we've already gone the one
   extra (pseudo-)level that it affords us, so we need to bail out.  */
DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
         depth, opt.reclevel));

/* Determine whether this_url is an FTP URL.  If it is, it means
   that the retrieval is done through proxy.  In that case, FTP
   links will be followed by default and recursion will not be
   turned off when following them.  */
this_url_ftp = (urlproto (this_url) == URLFTP);

/* Get the URL-s from an HTML file: */
url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
                          dash_p_leaf_HTML, &meta_disallow_follow);

if (opt.use_robots && meta_disallow_follow)
/* The META tag says we are not to follow this file.  Respect
   that.  */
free_urlpos (url_list);

/* Decide what to do with each of the URLs.  A URL will be loaded if
   it meets several requirements, discussed later.  */
for (cur_url = url_list; cur_url; cur_url = cur_url->next)
/* If quota was exceeded earlier, bail out.  */
if (downloaded_exceeds_quota ())
  break;
/* Parse the URL for convenient use in other functions, as well
   as to get the optimized form.  It also checks URL integrity.  */
if (parseurl (cur_url->url, u, 0) != URLOK)
DEBUGP (("Yuck!  A bad URL.\n"));
if (u->proto == URLFILE)
DEBUGP (("Nothing to do with file:// around here.\n"));
assert (u->url != NULL);
constr = xstrdup (u->url);

/* Several checks to determine whether a file is acceptable to load:
   1. check if URL is ftp, and we don't load it
   2. check for relative links (if relative_only is set)
   3. check for domain
   4. check for no-parent
   5. check for excludes && includes
   6. check for suffix
   7. check for same host (if spanhost is unset), with possible
   gethostbyname baggage
   8. check for robots.txt

   Addendum: If the URL is FTP, and it is to be loaded, only the
   domain and suffix settings are "stronger".

   Note that .html and (yuck) .htm will get loaded regardless of
   suffix rules (but that is remedied later with unlink) unless
   the depth equals the maximum depth.

   More time- and memory-consuming tests should be put later on
   the list.  */

/* inl is set if the URL we are working on (constr) is stored in
   undesirable_urls.  Using it is crucial to avoid unnecessary
   repeated continuous hits to the hash table.  */
inl = string_set_exists (undesirable_urls, constr);

/* If it is FTP, and FTP is not followed, chuck it out.  */
if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
string_set_add (undesirable_urls, constr);
/* If it is an absolute link and such links are not followed, chuck it
   out.  */
if (!inl && u->proto != URLFTP)
if (opt.relative_only && !cur_url->link_relative_p)
DEBUGP (("It doesn't really look like a relative link.\n"));
string_set_add (undesirable_urls, constr);
/* If its domain is not to be accepted/looked-up, chuck it out.  */
if (!accept_domain (u))
DEBUGP (("I don't like the smell of that domain.\n"));
string_set_add (undesirable_urls, constr);
/* Check for parent directory.  */
if (!inl && opt.no_parent
    /* If the new URL is FTP and the old was not, ignore
       opt.no_parent.  */
    && !(!this_url_ftp && u->proto == URLFTP))
/* Check for base_dir first.  */
if (!(base_dir && frontcmp (base_dir, u->dir)))
/* Failing that, check for parent dir.  */
struct urlinfo *ut = newurl ();
if (parseurl (this_url, ut, 0) != URLOK)
DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
else if (!frontcmp (ut->dir, u->dir))
/* Failing that too, kill the URL.  */
DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
string_set_add (undesirable_urls, constr);
/* If the file does not match the acceptance list, or is on the
   rejection list, chuck it out.  The same goes for the
   directory exclude- and include- lists.  */
if (!inl && (opt.includes || opt.excludes))
if (!accdir (u->dir, ALLABS))
DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
string_set_add (undesirable_urls, constr);
/* We check for acceptance/rejection rules only for non-HTML
   documents.  Since we don't know whether they really are
   HTML, it will be deduced from (an OR-ed list):

   1) u->file is "" (meaning it is a directory)
   2) suffix exists, AND:
      a) it is "html", OR
      b) it is "htm"

   If the file *is* supposed to be HTML, it will *not* be
   subject to acc/rej rules, unless a finite maximum depth has
   been specified and the current depth is the maximum depth.  */
|| (((suf = suffix (constr)) != NULL)
    && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
        && ((opt.reclevel != INFINITE_RECURSION) &&
            (depth != opt.reclevel))))))
if (!acceptable (u->file))
DEBUGP (("%s (%s) does not match acc/rej rules.\n",
         constr, u->file));
string_set_add (undesirable_urls, constr);
/* Optimize the URL (which includes possible DNS lookup) only
   after all other possibilities have been exhausted.  */
if (!opt.simple_check)
  opt_url (u);
else
/* Just lowercase the hostname.  */
for (p = u->host; *p; p++)
  *p = TOLOWER (*p);
u->url = str_url (u, 0);
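/* In this branch a host written as, say, WWW.Example.COM (a
   hypothetical name) simply ends up stored as www.example.com, so
   the canonical URL kept in the hash tables uses a single spelling
   of the host.  */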
constr = xstrdup (u->url);
string_set_add (undesirable_urls, constr);
if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
if (!opt.spanhost && this_url && !same_host (this_url, constr))
DEBUGP (("This is not the same hostname as the parent's.\n"));
string_set_add (undesirable_urls, constr);

/* What about robots.txt?  */
if (!inl && opt.use_robots && u->proto == URLHTTP)
/* Since Wget knows about only one set of robot rules at a
   time, /robots.txt must be reloaded whenever a new host is
   accessed.

   robots_host holds the host the current `forbid' variable
   is assigned to.  */
if (!robots_host || !same_host (robots_host, u->host))
FREE_MAYBE (robots_host);
/* Now make robots_host the new host, no matter what the
   result will be.  So if there is no /robots.txt on the
   site, Wget will not retry getting robots all the
   time.  */
robots_host = xstrdup (u->host);
free_vec (forbidden);
err = retrieve_robots (constr, ROBOTS_FILENAME);
rurl = robots_url (constr, ROBOTS_FILENAME);
rfile = url_filename (rurl);
forbidden = parse_robots (rfile);

/* Now that we have (or don't have) robots, we can check for
   them.  */
if (!robots_match (u, forbidden))
DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
         ROBOTS_FILENAME));
string_set_add (undesirable_urls, constr);

/* If it wasn't chucked out, do something with it.  */
DEBUGP (("I've decided to load it -> "));
/* Add it to the list of already-loaded URL-s.  */
string_set_add (undesirable_urls, constr);
/* Automatically followed FTPs will *not* be downloaded
   recursively.  */
if (u->proto == URLFTP)
/* Don't you adore side-effects?  */
/* Reset its type.  */
retrieve_url (constr, &filename, &newloc,
              canon_this_url ? canon_this_url : this_url, &dt);
if (u->proto == URLFTP)
/* In case of convert_links: If there was no error, add it to
   the list of downloaded URLs.  We might need it for
   conversion.  */
if (opt.convert_links && filename)
hash_table_put (dl_file_url_map,
                xstrdup (filename), xstrdup (constr));
hash_table_put (dl_url_file_map,
                xstrdup (constr), xstrdup (filename));
/* If the URL is HTML, note it.  */
urls_html = slist_prepend (urls_html, filename);
/* If there was no error, and the type is text/html, parse
   it recursively.  */
recursive_retrieve (filename, constr);
DEBUGP (("%s is not text/html so we don't chase.\n",
         filename ? filename: "(null)"));

if (opt.delete_after || (filename && !acceptable (filename)))
/* Either --delete-after was specified, or we loaded this otherwise
   rejected (e.g. by -R) HTML file just so we could harvest its
   hyperlinks -- in either case, delete the local file.  */
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
         opt.delete_after ? "--delete-after" :
         "recursive rejection criteria"));
logprintf (LOG_VERBOSE,
           (opt.delete_after ? _("Removing %s.\n")
            : _("Removing %s since it should be rejected.\n")),
           filename);
if (unlink (filename))
logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));

/* If everything was OK, and links are to be converted, let's
   store the local filename.  */
if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
cur_url->convert = CO_CONVERT_TO_RELATIVE;
cur_url->local_name = xstrdup (filename);
525 DEBUGP (("%s already in list, so we don't load.\n", constr));
526 /* Free filename and constr. */
527 FREE_MAYBE (filename);
530 /* Increment the pbuf for the appropriate size. */
532 if (opt.convert_links && !opt.delete_after)
533 /* This is merely the first pass: the links that have been
534 successfully downloaded are converted. In the second pass,
535 convert_all_links() will also convert those links that have NOT
536 been downloaded to their canonical form. */
537 convert_links (file, url_list);
538 /* Free the linked list of URL-s. */
539 free_urlpos (url_list);
540 /* Free the canonical this_url. */
541 FREE_MAYBE (canon_this_url);
542 /* Decrement the recursion depth. */
544 if (downloaded_exceeds_quota ())

/* convert_links() is called from recursive_retrieve() after we're
   done with an HTML file.  This call to convert_links is not complete
   because it converts only the downloaded files, and Wget cannot know
   which files will be downloaded afterwards.  So, if we have file
   fileone.html with:

   <a href="/c/something.gif">

   and /c/something.gif was not downloaded because it exceeded the
   recursion depth, the reference will *not* be changed.

   However, later we can encounter /c/something.gif from an "upper"
   level HTML (let's call it filetwo.html), and it gets downloaded.

   But now we have a problem because /c/something.gif will be
   correctly transformed in filetwo.html, but not in fileone.html,
   since Wget could not have known that /c/something.gif will be
   downloaded in the future.

   This is why Wget must, after the whole retrieval, call
   convert_all_links to go once more through the entire list of
   retrieved HTMLs, and re-convert them.

   All the downloaded HTMLs are kept in urls_html, and downloaded URLs
   in urls_downloaded.  From these two lists information is
   extracted.  */
convert_all_links (void)
/* Destructively reverse urls_html to get it in the right order.
   recursive_retrieve() used slist_prepend() consistently.  */
urls_html = slist_nreverse (urls_html);

for (html = urls_html; html; html = html->next)
urlpos *urls, *cur_url;

DEBUGP (("Rescanning %s\n", html->string));
/* Determine the URL of the HTML file.  get_urls_html will need
   it.  */
url = hash_table_get (dl_file_url_map, html->string);
if (url)
  DEBUGP (("It should correspond to %s.\n", url));
else
  DEBUGP (("I cannot find the corresponding URL.\n"));
/* Parse the HTML file...  */
urls = get_urls_html (html->string, url, FALSE, NULL);
/* We don't respect meta_disallow_follow here because, even if
   the file is not followed, we might still want to convert the
   links that have been followed from other files.  */
for (cur_url = urls; cur_url; cur_url = cur_url->next)
/* The URL must be in canonical form to be compared.  */
struct urlinfo *u = newurl ();
uerr_t res = parseurl (cur_url->url, u, 0);
/* We decide the direction of conversion according to whether
   a URL was downloaded.  Downloaded URLs will be converted
   ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
local_name = hash_table_get (dl_url_file_map, u->url);
DEBUGP (("%s marked for conversion, local %s\n",
         u->url, local_name));
/* Decide on the conversion direction.  */
/* We've downloaded this URL.  Convert it to relative
   form.  We do this even if the URL already is in
   relative form, because our directory structure may
   not be identical to that on the server (think `-nd',
   `--cut-dirs', etc.)  */
cur_url->convert = CO_CONVERT_TO_RELATIVE;
cur_url->local_name = xstrdup (local_name);
/* We haven't downloaded this URL.  If it's not already
   complete (including a full host name), convert it to
   that form, so it can be reached while browsing this
   HTML locally.  */
if (!cur_url->link_complete_p)
cur_url->convert = CO_CONVERT_TO_COMPLETE;
cur_url->local_name = NULL;
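/* As an illustration (with made-up names): if /images/a.gif was
   downloaded to the local file images/a.gif, a reference to it in
   this document is rewritten as a relative link to that local file;
   if it was never downloaded, the reference is instead completed to
   a full URL such as http://www.example.com/images/a.gif, so it
   still works when the saved page is browsed locally.  */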
/* Convert the links in the file.  */
convert_links (html->string, urls);

/* Robots support.  */

/* Construct the robots URL.  */
static struct urlinfo *
robots_url (const char *url, const char *robots_filename)
struct urlinfo *u = newurl ();

err = parseurl (url, u, 0);
assert (err == URLOK && u->proto == URLHTTP);
u->dir = xstrdup ("");
u->file = xstrdup (robots_filename);
u->url = str_url (u, 0);
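/* For instance (hypothetical host): given
   http://www.example.com/some/dir/page.html, this yields a urlinfo
   whose url is http://www.example.com/robots.txt.  */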

/* Retrieves the robots_filename from the root server directory, if
   possible.  Returns ROBOTSOK if robots were retrieved OK, and
   NOROBOTS if robots could not be retrieved for any reason.  */
retrieve_robots (const char *url, const char *robots_filename)
u = robots_url (url, robots_filename);
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
err = retrieve_url (u->url, NULL, NULL, NULL, &dt);

/* Parse the robots_filename and return the disallowed path components
   in a malloc-ed vector of character pointers.

   It should be fully compliant with the syntax as described in the
   file norobots.txt, adopted by the robots mailing list
   (robots@webcrawler.com).  */
parse_robots (const char *robots_filename)
char *line, *cmd, *str, *p;
char *base_version, *version;
int wget_matched;		/* is the part meant for Wget?  */

fp = fopen (robots_filename, "rb");

/* Kill version number.  */
STRDUP_ALLOCA (base_version, opt.useragent);
STRDUP_ALLOCA (version, opt.useragent);
int len = 10 + strlen (version_string);
base_version = (char *)alloca (len);
sprintf (base_version, "Wget/%s", version_string);
version = (char *)alloca (len);
sprintf (version, "Wget/%s", version_string);
for (p = version; *p; p++)
  *p = TOLOWER (*p);
for (p = base_version; *p && *p != '/'; p++)
  *p = TOLOWER (*p);
*p = '\0';

/* Setting this to 1 means that Wget considers itself under
   restrictions by default, even if the User-Agent field is not
   present.  However, if it finds the user-agent set to anything
   other than Wget, the rest will be ignored (up to the following
   User-Agent field).  Thus you may have something like:

   Disallow: 1
   Disallow: 2
   User-Agent: stupid-robot
   Disallow: 3
   Disallow: 4
   User-Agent: Wget
   Disallow: 5
   Disallow: 6
   User-Agent: *
   Disallow: 7

   In this case the 1, 2, 5, 6 and 7 disallow lines will be
   stored.  */
wget_matched = 1;
while ((line = read_whole_line (fp)))
int len = strlen (line);
/* Destroy <CR><LF> if present.  */
if (len && line[len - 1] == '\n')
  line[--len] = '\0';
if (len && line[len - 1] == '\r')
  line[--len] = '\0';
/* According to specifications, optional space may be at the
   end...  */
DEBUGP (("Line: %s\n", line));
for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
DEBUGP (("(chucked out)\n"));
for (str = cmd; *str && *str != ':'; str++);
DEBUGP (("(chucked out)\n"));
/* Zero-terminate the command.  */
*str++ = '\0';
/* Look for the string beginning...  */
for (; *str && ISSPACE (*str); str++);
/* Look for comments or trailing spaces and kill them off.  */
for (p = str; *p; p++)
if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
/* We have found either a shell-style comment `<sp>+#' or some
   trailing spaces.  Now rewind to the beginning of the spaces
   and place '\0' there.  */
while (p > str && ISSPACE (*p))
  --p;
if (!strcasecmp (cmd, "User-agent"))
/* Lowercase the agent string.  */
for (p = str; *p; p++)
  *p = TOLOWER (*p);
/* If the string is `*', it matches.  */
if (*str == '*' && !*(str + 1))
  match = 1;
/* If the string contains wildcards, we'll run it through
   fnmatch().  */
if (has_wildcards_p (str))
/* If the string contains '/', compare with the full
   version.  Else, compare it to base_version.  */
if (strchr (str, '/'))
  match = !fnmatch (str, version, 0);
else
  match = !fnmatch (str, base_version, 0);
else		/* Substring search */
if (strstr (version, str))
  match = 1;
/* If Wget is not matched, skip all the entries up to the
   next User-agent field.  */
wget_matched = match;
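/* To make the rule above concrete (with a hypothetical version
   string "wget/1.7" and base_version "wget"): a field of
   "User-Agent: *" or "User-Agent: Wget" matches via the substring
   search, "User-Agent: wget*" is run through fnmatch() against
   "wget" and also matches, while something like
   "User-Agent: stupid-robot" does not match, so its Disallow lines
   are skipped.  */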
else if (!wget_matched)
DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
else if (!strcasecmp (cmd, "Disallow"))
/* If "Disallow" is empty, the robot is welcome.  */
entries = (char **)xmalloc (sizeof (char *));
entries = (char **)xrealloc (entries, (num + 2)* sizeof (char *));
entries[num] = xstrdup (str);
entries[++num] = NULL;
/* Strip trailing spaces, according to specifications.  */
for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--)
if (ISSPACE (str[i]))
  str[i] = '\0';
/* unknown command */
DEBUGP (("(chucked out)\n"));

/* May the URL url be loaded according to disallowing rules stored in
   forbidden?  */
robots_match (struct urlinfo *u, char **fb)
DEBUGP (("Matching %s against: ", u->path));
DEBUGP (("%s ", *fb));
l = strlen (*fb);
/* If dir is fb, we may not load the file.  */
if (strncmp (u->path, *fb, l) == 0)
DEBUGP (("matched.\n"));
return 0; /* Matches, i.e. does not load...  */
DEBUGP (("not matched.\n"));