/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.

   This file is part of Wget.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>
#include <sys/types.h>
extern char *version_string;

#define ROBOTS_FILENAME "robots.txt"
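/* Mappings used for link conversion: downloaded file name -> the URL
   it was retrieved from, and URL -> the local file it was saved
   under.  */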
static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;
/* List of HTML URLs.  */
static slist *urls_html;

/* List of undesirable-to-load URLs.  */
static struct hash_table *undesirable_urls;

/* List of forbidden locations.  */
static char **forbidden = NULL;
/* Current recursion depth.  */
static int depth;

/* Base directory we're recursing from (used by no_parent).  */
static char *base_dir;

/* The host name for which we last checked robots.  */
static char *robots_host;
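/* Nonzero until recursive_retrieve() has performed its per-run
   initialization; reset to 1 by recursive_reset().  */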
static int first_time = 1;
/* Construct the robots URL.  */
static struct urlinfo *robots_url PARAMS ((const char *, const char *));
static uerr_t retrieve_robots PARAMS ((const char *, const char *));
static char **parse_robots PARAMS ((const char *));
static int robots_match PARAMS ((struct urlinfo *, char **));
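/* Overview: recursive_retrieve() is called for each downloaded HTML
   document whose links are to be followed; it parses the document,
   filters each link through the checks below, and retrieves the
   acceptable ones, recursing into those that turn out to be HTML.
   After the whole retrieval, convert_all_links() rescans the
   downloaded HTML files and convert_links() rewrites their links
   (see the comment before convert_all_links() below).  */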
/* Cleanup the data structures associated with recursive retrieving
   (the variables above).  */
void
recursive_cleanup (void)
{
  if (undesirable_urls)
    {
      string_set_free (undesirable_urls);
      undesirable_urls = NULL;
    }
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  undesirable_urls = NULL;
  free_vec (forbidden);
  forbidden = NULL;
  slist_free (urls_html);
  urls_html = NULL;
  FREE_MAYBE (base_dir);
  FREE_MAYBE (robots_host);
  first_time = 1;
}
/* Reset FIRST_TIME to 1, so that some action can be taken in
   recursive_retrieve().  */
void
recursive_reset (void)
{
  first_time = 1;
}
/* The core of recursive retrieving.  Endless recursion is avoided by
   having all URLs stored to a linked list of URLs, which is checked
   before loading any URL.  That way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
uerr_t
recursive_retrieve (const char *file, const char *this_url)
{
  char *constr, *filename, *newloc;
  char *canon_this_url = NULL;
  int dt, inl, dash_p_leaf_HTML = FALSE;
  int meta_disallow_follow;
  int this_url_ftp;             /* See the explanation below. */
  uerr_t err;
  struct urlinfo *rurl;
  urlpos *url_list, *cur_url;
  char *rfile;                  /* For robots */
  char *suf;
  struct urlinfo *u;

  assert (this_url != NULL);
  assert (file != NULL);
  /* If quota was exceeded earlier, bail out.  */
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  /* Cache the current URL in the list.  */
  if (first_time)
    {
      /* These three operations need to be done only once per Wget
         run.  They should probably be at a different location.  */
      if (!undesirable_urls)
        undesirable_urls = make_string_hash_table (0);
      if (!dl_file_url_map)
        dl_file_url_map = make_string_hash_table (0);
      if (!dl_url_file_map)
        dl_url_file_map = make_string_hash_table (0);

      hash_table_clear (undesirable_urls);
      string_set_add (undesirable_urls, this_url);
      hash_table_clear (dl_file_url_map);
      hash_table_clear (dl_url_file_map);

      /* Enter this_url to the hash table, in original and "enhanced" form.  */
      u = newurl ();
      err = parseurl (this_url, u, 0);
      if (err == URLOK)
        {
          string_set_add (undesirable_urls, u->url);
          hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (u->url));
          hash_table_put (dl_url_file_map, xstrdup (u->url), xstrdup (file));
          urls_html = slist_prepend (urls_html, file);
          if (opt.no_parent)
            base_dir = xstrdup (u->dir); /* Set the base dir.  */
          /* Set the canonical this_url to be sent as referer.  This
             problem exists only when running the first time.  */
          canon_this_url = xstrdup (u->url);
        }
      else
        DEBUGP (("Double yuck! The *base* URL is broken.\n"));
      freeurl (u, 1);
      depth = 1;
      first_time = 0;
    }
  else
    ++depth;

  if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
    {
      /* We've exceeded the maximum recursion depth specified by the user. */
      if (opt.page_requisites && depth <= opt.reclevel + 1)
        /* When -p is specified, we can do one more partial recursion from the
           "leaf nodes" on the HTML document tree.  The recursion is partial in
           that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
           except for <LINK REL="stylesheet">.  */
        dash_p_leaf_HTML = TRUE;
      else
        {
          /* Either -p wasn't specified or it was and we've already gone the one
             extra (pseudo-)level that it affords us, so we need to bail out.  */
          DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
                   depth, opt.reclevel));
          --depth;
          return RECLEVELEXC;
        }
    }

  /* Determine whether this_url is an FTP URL.  If it is, it means
     that the retrieval is done through proxy.  In that case, FTP
     links will be followed by default and recursion will not be
     turned off when following them.  */
  this_url_ftp = (urlproto (this_url) == URLFTP);

  /* Get the URLs from an HTML file: */
  url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
                            dash_p_leaf_HTML, &meta_disallow_follow);

  if (opt.use_robots && meta_disallow_follow)
    {
      /* The META tag says we are not to follow this file.  Respect
         that.  */
      free_urlpos (url_list);
      url_list = NULL;
    }

  /* Decide what to do with each of the URLs.  A URL will be loaded if
     it meets several requirements, discussed later.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
    {
      /* If quota was exceeded earlier, bail out.  */
      if (downloaded_exceeds_quota ())
        break;
      /* Parse the URL for convenient use in other functions, as well
         as to get the optimized form.  It also checks URL integrity.  */
      u = newurl ();
      if (parseurl (cur_url->url, u, 0) != URLOK)
        {
          DEBUGP (("Yuck! A bad URL.\n"));
          freeurl (u, 1);
          continue;
        }
      if (u->proto == URLFILE)
        {
          DEBUGP (("Nothing to do with file:// around here.\n"));
          freeurl (u, 1);
          continue;
        }
      assert (u->url != NULL);
      constr = xstrdup (u->url);

      /* Several checkings whether a file is acceptable to load:
         1. check if URL is ftp, and we don't load it
         2. check for relative links (if relative_only is set)
         3. check for domain
         4. check for no-parent
         5. check for excludes && includes
         6. check for suffix
         7. check for same host (if spanhost is unset), with possible
            gethostbyname baggage
         8. check for robots.txt

         Addendum: If the URL is FTP, and it is to be loaded, only the
         domain and suffix settings are "stronger".

         Note that .html and (yuck) .htm will get loaded regardless of
         suffix rules (but that is remedied later with unlink) unless
         the depth equals the maximum depth.

         More time- and memory-consuming tests should be put later on
         the list.  */

      /* inl is set if the URL we are working on (constr) is stored in
         undesirable_urls.  Using it is crucial to avoid unnecessary
         repeated continuous hits to the hash table.  */
      inl = string_set_exists (undesirable_urls, constr);

      /* If it is FTP, and FTP is not followed, chuck it out.  */
      if (!inl)
        if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
          {
            DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
            string_set_add (undesirable_urls, constr);
            inl = 1;
          }
      /* If it is absolute link and they are not followed, chuck it
         out, too.  */
      if (!inl && u->proto != URLFTP)
        if (opt.relative_only && !cur_url->link_relative_p)
          {
            DEBUGP (("It doesn't really look like a relative link.\n"));
            string_set_add (undesirable_urls, constr);
            inl = 1;
          }
      /* If its domain is not to be accepted/looked-up, chuck it
         out.  */
      if (!inl)
        if (!accept_domain (u))
          {
            DEBUGP (("I don't like the smell of that domain.\n"));
            string_set_add (undesirable_urls, constr);
            inl = 1;
          }
      /* Check for parent directory.  */
      if (!inl && opt.no_parent
          /* If the new URL is FTP and the old was not, ignore
             opt.no_parent.  */
          && !(!this_url_ftp && u->proto == URLFTP))
        {
          /* Check for base_dir first.  */
          if (!(base_dir && frontcmp (base_dir, u->dir)))
            {
              /* Failing that, check for parent dir.  */
              struct urlinfo *ut = newurl ();
              if (parseurl (this_url, ut, 0) != URLOK)
                DEBUGP (("Double yuck! The *base* URL is broken.\n"));
              else if (!frontcmp (ut->dir, u->dir))
                {
                  /* Failing that too, kill the URL.  */
                  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
                  string_set_add (undesirable_urls, constr);
                  inl = 1;
                }
              freeurl (ut, 1);
            }
        }
      /* If the file does not match the acceptance list, or is on the
         rejection list, chuck it out.  The same goes for the
         directory exclude and include lists.  */
      if (!inl && (opt.includes || opt.excludes))
        {
          if (!accdir (u->dir, ALLABS))
            {
              DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
              string_set_add (undesirable_urls, constr);
              inl = 1;
            }
        }
      /* We check for acceptance/rejection rules only for non-HTML
         documents.  Since we don't know whether they really are
         HTML, it will be deduced from (an OR-ed list):

         1) u->file is "" (meaning it is a directory)
         2) suffix exists, AND:
            a) it is "html", or
            b) it is "htm"

         If the file *is* supposed to be HTML, it will *not* be
         subject to acc/rej rules, unless a finite maximum depth has
         been specified and the current depth is the maximum depth.  */
      if (!inl && !(!*u->file
                    || (((suf = suffix (constr)) != NULL)
                        && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
                            && ((opt.reclevel != INFINITE_RECURSION) &&
                                (depth != opt.reclevel))))))
        {
          if (!acceptable (u->file))
            {
              DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                       constr, u->file));
              string_set_add (undesirable_urls, constr);
              inl = 1;
            }
        }

      /* Optimize the URL (which includes possible DNS lookup) only
         after all other possibilities have been exhausted.  */
      if (!inl)
        {
          if (!opt.simple_check)
            opt_url (u);
          else
            {
              char *p;
              /* Just lowercase the hostname.  */
              for (p = u->host; *p; p++)
                *p = TOLOWER (*p);
              u->url = str_url (u, 0);
            }
        }
      constr = xstrdup (u->url);
      string_set_add (undesirable_urls, constr);
      if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
        if (!opt.spanhost && this_url && !same_host (this_url, constr))
          {
            DEBUGP (("This is not the same hostname as the parent's.\n"));
            string_set_add (undesirable_urls, constr);
            inl = 1;
          }
      /* What about robots.txt?  */
      if (!inl && opt.use_robots && u->proto == URLHTTP)
        {
          /* Since Wget knows about only one set of robot rules at a
             time, /robots.txt must be reloaded whenever a new host is
             accessed.

             robots_host holds the host the current `forbidden' variable
             applies to.  */
          if (!robots_host || !same_host (robots_host, u->host))
            {
              FREE_MAYBE (robots_host);
              /* Now make robots_host the new host, no matter what the
                 result will be.  So if there is no /robots.txt on the
                 site, Wget will not retry getting robots all the
                 time.  */
              robots_host = xstrdup (u->host);
              free_vec (forbidden);
              forbidden = NULL;
              err = retrieve_robots (constr, ROBOTS_FILENAME);
              if (err == ROBOTSOK)
                {
                  rurl = robots_url (constr, ROBOTS_FILENAME);
                  rfile = url_filename (rurl);
                  forbidden = parse_robots (rfile);
                  freeurl (rurl, 1);
                  xfree (rfile);
                }
            }

          /* Now that we have (or don't have) robots, we can check for
             them.  */
          if (!robots_match (u, forbidden))
            {
              DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
                       ROBOTS_FILENAME));
              string_set_add (undesirable_urls, constr);
              inl = 1;
            }
        }

      /* If it wasn't chucked out, do something with it.  */
      if (!inl)
        {
          DEBUGP (("I've decided to load it -> "));
          /* Add it to the list of already-loaded URLs.  */
          string_set_add (undesirable_urls, constr);
          /* Automatically followed FTPs will *not* be downloaded
             recursively.  */
          if (u->proto == URLFTP)
            /* Don't you adore side-effects?  */
            opt.recursive = 0;
          /* Reset its type.  */
          dt = 0;
          retrieve_url (constr, &filename, &newloc,
                        canon_this_url ? canon_this_url : this_url, &dt);
          if (u->proto == URLFTP)
            /* Restore the recursive flag.  */
            opt.recursive = 1;
          if (newloc)
            {
              xfree (constr);
              constr = newloc;
            }
          /* In case of convert_links: If there was no error, add it to
             the list of downloaded URLs.  We might need it for
             conversion.  */
          if (opt.convert_links && filename)
            {
              if (dt & RETROKF)
                {
                  hash_table_put (dl_file_url_map,
                                  xstrdup (filename), xstrdup (constr));
                  hash_table_put (dl_url_file_map,
                                  xstrdup (constr), xstrdup (filename));
                  /* If the URL is HTML, note it.  */
                  if (dt & TEXTHTML)
                    urls_html = slist_prepend (urls_html, filename);
                }
            }
          /* If there was no error, and the type is text/html, parse
             it recursively.  */
          if (dt & TEXTHTML)
            {
              if (dt & RETROKF)
                recursive_retrieve (filename, constr);
            }
          else
            DEBUGP (("%s is not text/html so we don't chase.\n",
                     filename ? filename : "(null)"));

          if (opt.delete_after || (filename && !acceptable (filename)))
            /* Either --delete-after was specified, or we loaded this otherwise
               rejected (e.g. by -R) HTML file just so we could harvest its
               hyperlinks -- in either case, delete the local file.  */
            {
              DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                       opt.delete_after ? "--delete-after" :
                       "recursive rejection criteria"));
              logprintf (LOG_VERBOSE,
                         (opt.delete_after ? _("Removing %s.\n")
                          : _("Removing %s since it should be rejected.\n")),
                         filename);
              if (unlink (filename))
                logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
            }

          /* If everything was OK, and links are to be converted, let's
             store the local filename.  */
          if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
            {
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (filename);
            }
        }
      else
        DEBUGP (("%s already in list, so we don't load.\n", constr));
      /* Free filename and constr.  */
      FREE_MAYBE (filename);
      FREE_MAYBE (constr);
      freeurl (u, 1);
      /* Increment the pbuf for the appropriate size.  */
    }
  if (opt.convert_links && !opt.delete_after)
    /* This is merely the first pass: the links that have been
       successfully downloaded are converted.  In the second pass,
       convert_all_links() will also convert those links that have NOT
       been downloaded to their canonical form.  */
    convert_links (file, url_list);
  /* Free the linked list of URLs.  */
  free_urlpos (url_list);
  /* Free the canonical this_url.  */
  FREE_MAYBE (canon_this_url);
  /* Decrement the recursion depth.  */
  --depth;
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else
    return RETROK;
}

/* convert_links() is called from recursive_retrieve() after we're
   done with an HTML file.  This call to convert_links is not complete
   because it converts only the downloaded files, and Wget cannot know
   which files will be downloaded afterwards.  So, if we have file
   fileone.html with:

   <a href="/c/something.gif">

   and /c/something.gif was not downloaded because it exceeded the
   recursion depth, the reference will *not* be changed.

   However, later we can encounter /c/something.gif from an "upper"
   level HTML (let's call it filetwo.html), and it gets downloaded.

   But now we have a problem because /c/something.gif will be
   correctly transformed in filetwo.html, but not in fileone.html,
   since Wget could not have known that /c/something.gif would be
   downloaded in the future.

   This is why Wget must, after the whole retrieval, call
   convert_all_links to go once more through the entire list of
   retrieved HTMLs, and re-convert them.

   All the downloaded HTMLs are kept in urls_html, and the downloaded
   URLs in the dl_url_file_map hash table.  From these two sources the
   needed information is extracted.  */
void
convert_all_links (void)
{
  slist *html;

  /* Destructively reverse urls_html to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently.  */
  urls_html = slist_nreverse (urls_html);

  for (html = urls_html; html; html = html->next)
    {
      urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));
      /* Determine the URL of the HTML file.  get_urls_html will need
         it.  */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
        DEBUGP (("It should correspond to %s.\n", url));
      else
        DEBUGP (("I cannot find the corresponding URL.\n"));
      /* Parse the HTML file...  */
      urls = get_urls_html (html->string, url, FALSE, NULL);
      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */
      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;

          /* The URL must be in canonical form to be compared.  */
          struct urlinfo *u = newurl ();
          uerr_t res = parseurl (cur_url->url, u, 0);
          if (res != URLOK)
            {
              freeurl (u, 1);
              continue;
            }
          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
          local_name = hash_table_get (dl_url_file_map, u->url);
          if (local_name)
            DEBUGP (("%s marked for conversion, local %s\n",
                     u->url, local_name));
          /* Decide on the conversion direction.  */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
            }
          freeurl (u, 1);
        }
      /* Convert the links in the file.  */
      convert_links (html->string, urls);
      /* Free the data.  */
      free_urlpos (urls);
    }
}

/* Robots support.  */

/* Construct the robots URL.  */
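/* For example (hypothetical host), for url "http://www.example.com/a/b.html"
   and the default ROBOTS_FILENAME this yields
   "http://www.example.com/robots.txt": the parsed URL's dir and file
   components are replaced and the URL string is rebuilt with str_url().  */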
static struct urlinfo *
robots_url (const char *url, const char *robots_filename)
{
  struct urlinfo *u = newurl ();
  uerr_t err;

  err = parseurl (url, u, 0);
  assert (err == URLOK && u->proto == URLHTTP);
  xfree (u->file);
  xfree (u->dir);
  xfree (u->url);
  u->dir = xstrdup ("");
  u->file = xstrdup (robots_filename);
  u->url = str_url (u, 0);
  return u;
}
/* Retrieves the robots_filename from the root server directory, if
   possible.  Returns ROBOTSOK if robots were retrieved OK, and
   NOROBOTS if robots could not be retrieved for any reason.  */
static uerr_t
retrieve_robots (const char *url, const char *robots_filename)
{
  int dt;
  uerr_t err;
  struct urlinfo *u;

  u = robots_url (url, robots_filename);
  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
  freeurl (u, 1);
  if (err == RETROK)
    return ROBOTSOK;
  else
    return NOROBOTS;
}
/* Parse the robots_filename and return the disallowed path components
   in a malloc-ed vector of character pointers.

   It should be fully compliant with the syntax as described in the
   file norobots.txt, adopted by the robots mailing list
   (robots@webcrawler.com).  */
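/* For example (hypothetical input), a robots.txt containing:

     User-Agent: *
     Disallow: /cgi-bin/
     Disallow: /tmp/

   yields the vector { "/cgi-bin/", "/tmp/", NULL }, which
   robots_match() then compares against u->path as a set of path
   prefixes.  */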
static char **
parse_robots (const char *robots_filename)
{
  FILE *fp;
  char **entries;
  char *line, *cmd, *str, *p;
  char *base_version, *version;
  int len, num, i;
  int wget_matched;             /* is the part meant for Wget? */

  entries = NULL;
  num = 0;

  fp = fopen (robots_filename, "rb");
  if (!fp)
    return NULL;

  /* Kill version number.  */
  if (opt.useragent)
    {
      STRDUP_ALLOCA (base_version, opt.useragent);
      STRDUP_ALLOCA (version, opt.useragent);
    }
  else
    {
      int len = 10 + strlen (version_string);
      base_version = (char *)alloca (len);
      sprintf (base_version, "Wget/%s", version_string);
      version = (char *)alloca (len);
      sprintf (version, "Wget/%s", version_string);
    }
  for (p = version; *p; p++)
    *p = TOLOWER (*p);
  for (p = base_version; *p && *p != '/'; p++)
    *p = TOLOWER (*p);
  *p = '\0';

  /* Setting this to 1 means that Wget considers itself under
     restrictions by default, even if the User-Agent field is not
     present.  However, if it finds the user-agent set to anything
     other than Wget, the rest will be ignored (up to the following
     User-Agent field).  Thus you may have something like:

     Disallow: 1
     Disallow: 2
     User-Agent: stupid-robot
     Disallow: 3
     Disallow: 4
     User-Agent: Wget
     Disallow: 5
     Disallow: 6
     User-Agent: *
     Disallow: 7

     In this case the 1, 2, 5, 6 and 7 disallow lines will be
     followed.  */
  wget_matched = 1;
  while ((line = read_whole_line (fp)))
    {
      len = strlen (line);
      /* Destroy <CR><LF> if present.  */
      if (len && line[len - 1] == '\n')
        line[--len] = '\0';
      if (len && line[len - 1] == '\r')
        line[--len] = '\0';
      /* According to specifications, optional space may be at the
         end of the line.  */
      DEBUGP (("Line: %s\n", line));
      /* Skip spaces up to the beginning of the command.  */
      for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
      if (!*cmd)
        {
          xfree (line);
          DEBUGP (("(chucked out)\n"));
          continue;
        }
      /* Look for a colon that terminates the command.  */
      for (str = cmd; *str && *str != ':'; str++);
      if (!*str)
        {
          xfree (line);
          DEBUGP (("(chucked out)\n"));
          continue;
        }
      /* Zero-terminate the command.  */
      *str++ = '\0';
      /* Look for the string beginning...  */
      for (; *str && ISSPACE (*str); str++);
      /* Look for comments or trailing spaces and kill them off.  */
      for (p = str; *p; p++)
        if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
          {
            /* We have found either a shell-style comment `<sp>+#' or some
               trailing spaces.  Now rewind to the beginning of the spaces
               and place '\0' there.  */
            while (p > str && ISSPACE (*p))
              --p;
            *(p + 1) = '\0';
            break;
          }
      if (!strcasecmp (cmd, "User-agent"))
        {
          int match = 0;
          /* Lowercase the agent string.  */
          for (p = str; *p; p++)
            *p = TOLOWER (*p);
          /* If the string is `*', it matches.  */
          if (*str == '*' && !*(str + 1))
            match = 1;
          else
            {
              /* If the string contains wildcards, we'll run it through
                 fnmatch().  */
              if (has_wildcards_p (str))
                {
                  /* If the string contains '/', compare with the full
                     version.  Else, compare it to base_version.  */
                  if (strchr (str, '/'))
                    match = !fnmatch (str, version, 0);
                  else
                    match = !fnmatch (str, base_version, 0);
                }
              else              /* Substring search */
                {
                  if (strstr (version, str))
                    match = 1;
                  else
                    match = 0;
                }
            }
          /* If Wget is not matched, skip all the entries up to the
             next User-agent field.  */
          wget_matched = match;
        }
      else if (!wget_matched)
        {
          xfree (line);
          DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
          continue;
        }
      else if (!strcasecmp (cmd, "Disallow"))
        {
          /* If "Disallow" is empty, the robot is welcome.  */
          if (!*str)
            {
              free_vec (entries);
              entries = (char **)xmalloc (sizeof (char *));
              *entries = NULL;
              num = 0;
            }
          else
            {
              entries = (char **)xrealloc (entries, (num + 2) * sizeof (char *));
              entries[num] = xstrdup (str);
              entries[++num] = NULL;
              /* Strip trailing spaces, according to specifications.  */
              for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--)
                if (ISSPACE (str[i]))
                  str[i] = '\0';
            }
        }
      else
        /* Unknown command.  */
        DEBUGP (("(chucked out)\n"));
      xfree (line);
    }
  fclose (fp);
  return entries;
}

/* May the URL url be loaded according to disallowing rules stored in
   forbidden?  */
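/* For instance (hypothetical values), with forbidden == { "/cgi-bin/",
   "/private/", NULL }, a URL whose path is "/cgi-bin/search" matches the
   first prefix and is refused (return 0), while "/index.html" matches
   nothing and may be loaded (return 1).  */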
static int
robots_match (struct urlinfo *u, char **forbidden)
{
  int l;

  if (!forbidden)
    return 1;
  DEBUGP (("Matching %s against: ", u->path));
  for (; *forbidden; forbidden++)
    {
      DEBUGP (("%s ", *forbidden));
      l = strlen (*forbidden);
      /* If dir is forbidden, we may not load the file.  */
      if (strncmp (u->path, *forbidden, l) == 0)
        {
          DEBUGP (("matched.\n"));
          return 0; /* Matches, i.e. does not load...  */
        }
    }
  DEBUGP (("not matched.\n"));
  return 1;
}