/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.

   This file is part of Wget.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <sys/types.h>

extern char *version_string;

#define ROBOTS_FILENAME "robots.txt"

static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  */
static slist *downloaded_html_files;

/* List of undesirable-to-load URLs.  */
static struct hash_table *undesirable_urls;

/* List of forbidden locations.  */
static char **forbidden = NULL;

/* Current recursion depth.  */
static int depth;

/* Base directory we're recursing from (used by no_parent).  */
static char *base_dir;

/* The host name for which we last checked robots.  */
static char *robots_host;

static int first_time = 1;

/* Construct the robots URL.  */
static struct urlinfo *robots_url PARAMS ((const char *, const char *));
static uerr_t retrieve_robots PARAMS ((const char *, const char *));
static char **parse_robots PARAMS ((const char *));
static int robots_match PARAMS ((struct urlinfo *, char **));
/* Cleanup the data structures associated with recursive retrieving
   (the variables above).  */
void
recursive_cleanup (void)
{
  if (undesirable_urls)
    {
      string_set_free (undesirable_urls);
      undesirable_urls = NULL;
    }
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  undesirable_urls = NULL;
  free_vec (forbidden);
  forbidden = NULL;
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
  FREE_MAYBE (base_dir);
  FREE_MAYBE (robots_host);
}

/* Reset FIRST_TIME to 1, so that some action can be taken in
   recursive_retrieve().  */
void
recursive_reset (void)
{
  first_time = 1;
}
/* The core of recursive retrieving.  Endless recursion is avoided by
   having all URLs stored to a linked list of URLs, which is checked
   before loading any URL.  That way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
uerr_t
recursive_retrieve (const char *file, const char *this_url)
{
  char *constr, *filename, *newloc;
  char *canon_this_url = NULL;
  int dt, inl, dash_p_leaf_HTML = FALSE;
  int meta_disallow_follow;
  int this_url_ftp;            /* See the explanation below.  */
  uerr_t err;
  char *suf;
  struct urlinfo *rurl, *u;
  urlpos *url_list, *cur_url;
  char *rfile; /* For robots */

  assert (this_url != NULL);
  assert (file != NULL);
  /* If quota was exceeded earlier, bail out.  */
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  /* Cache the current URL in the list.  */
  if (first_time)
    {
      /* These three operations need to be done only once per Wget
         run.  They should probably be at a different location.  */
      if (!undesirable_urls)
        undesirable_urls = make_string_hash_table (0);

      hash_table_clear (undesirable_urls);
      string_set_add (undesirable_urls, this_url);
      if (dl_file_url_map)
        hash_table_clear (dl_file_url_map);
      if (dl_url_file_map)
        hash_table_clear (dl_url_file_map);
      /* Enter this_url into the hash table, in original and "enhanced" form.  */
      u = newurl ();
      err = parseurl (this_url, u, 0);
      if (err == URLOK)
        {
          string_set_add (undesirable_urls, u->url);
          if (opt.no_parent)
            base_dir = xstrdup (u->dir); /* Set the base dir.  */
          /* Set the canonical this_url to be sent as referer.  This
             matters only on the first run.  */
          canon_this_url = xstrdup (u->url);
        }
      else
        {
          DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
          base_dir = NULL;
        }
      freeurl (u, 1);
      depth = 1;
      first_time = 0;
    }
  else
    ++depth;
  if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
    {
      /* We've exceeded the maximum recursion depth specified by the user.  */
      if (opt.page_requisites && depth <= opt.reclevel + 1)
        {
          /* When -p is specified, we can do one more partial recursion from the
             "leaf nodes" on the HTML document tree.  The recursion is partial in
             that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
             except for <LINK REL="stylesheet">.  */
          dash_p_leaf_HTML = TRUE;
        }
      else
        {
          /* Either -p wasn't specified or it was and we've already gone the one
             extra (pseudo-)level that it affords us, so we need to bail out.  */
          DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
                   depth, opt.reclevel));
          return RETROK;
        }
    }
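  /* To make the -p interplay concrete (illustrative note, not from
     the original source): with `-l 2 -p', pages at depth 1 and 2 are
     fully recursed into, while HTML reached at depth 3 gets the
     dash_p_leaf_HTML treatment -- its <IMG> and
     <LINK REL="stylesheet"> requisites are still fetched, but its
     <A> and <AREA> links are not followed, so recursion stops
     there.  */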
  /* Determine whether this_url is an FTP URL.  If it is, it means
     that the retrieval is done through proxy.  In that case, FTP
     links will be followed by default and recursion will not be
     turned off when following them.  */
  this_url_ftp = (urlproto (this_url) == URLFTP);

  /* Get the URLs from an HTML file: */
  url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
                            dash_p_leaf_HTML, &meta_disallow_follow);

  if (opt.use_robots && meta_disallow_follow)
    {
      /* The META tag says we are not to follow this file.  Respect
         that.  */
      free_urlpos (url_list);
      url_list = NULL;
    }
  /* Decide what to do with each of the URLs.  A URL will be loaded if
     it meets several requirements, discussed later.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
    {
      /* If quota was exceeded earlier, bail out.  */
      if (downloaded_exceeds_quota ())
        break;
      /* Parse the URL for convenient use in other functions, as well
         as to get the optimized form.  It also checks URL integrity.  */
      u = newurl ();
      if (parseurl (cur_url->url, u, 0) != URLOK)
        {
          DEBUGP (("Yuck!  A bad URL.\n"));
          freeurl (u, 1);
          continue;
        }
      if (u->proto == URLFILE)
        {
          DEBUGP (("Nothing to do with file:// around here.\n"));
          freeurl (u, 1);
          continue;
        }
      assert (u->url != NULL);
      constr = xstrdup (u->url);
      /* Several checks to see whether a file is acceptable to load:
         1. check if URL is ftp, and we don't load it
         2. check for relative links (if relative_only is set)
         3. check for domain
         4. check for no-parent
         5. check for excludes && includes
         6. check for suffix
         7. check for same host (if spanhost is unset), with possible
            gethostbyname baggage
         8. check for robots.txt

         Addendum: If the URL is FTP, and it is to be loaded, only the
         domain and suffix settings are "stronger".

         Note that .html and (yuck) .htm will get loaded regardless of
         suffix rules (but that is remedied later with unlink) unless
         the depth equals the maximum depth.

         More time- and memory-consuming tests should be put later on
         the list.  */
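      /* A note on that ordering (added for clarity, not in the
         original): the checks are roughly sorted by cost.  The
         string_set_exists() lookup below is a cheap hash probe, the
         same-host test may trigger a DNS lookup, and the robots.txt
         test may require a whole network fetch -- hence the cheap,
         purely local tests run first.  */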
      /* inl is set if the URL we are working on (constr) is stored in
         undesirable_urls.  Using it is crucial to avoid unnecessary
         repeated continuous hits to the hash table.  */
      inl = string_set_exists (undesirable_urls, constr);

      /* If it is FTP, and FTP is not followed, chuck it out.  */
      if (!inl)
        if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
          {
            DEBUGP (("Uh, it is FTP but I'm not in the mood to follow FTP.\n"));
            string_set_add (undesirable_urls, constr);
            inl = 1;
          }
      /* If it is an absolute link and absolute links are not to be
         followed, chuck it out.  */
      if (!inl && u->proto != URLFTP)
        if (opt.relative_only && !cur_url->link_relative_p)
          {
            DEBUGP (("It doesn't really look like a relative link.\n"));
            string_set_add (undesirable_urls, constr);
            inl = 1;
          }
      /* If its domain is not to be accepted/looked-up, chuck it out.  */
      if (!inl)
        if (!accept_domain (u))
          {
            DEBUGP (("I don't like the smell of that domain.\n"));
            string_set_add (undesirable_urls, constr);
            inl = 1;
          }
      /* Check for parent directory.  */
      if (!inl && opt.no_parent
          /* If the new URL is FTP and the old was not, ignore
             opt.no_parent.  */
          && !(!this_url_ftp && u->proto == URLFTP))
        {
          /* Check for base_dir first.  */
          if (!(base_dir && frontcmp (base_dir, u->dir)))
            {
              /* Failing that, check for parent dir.  */
              struct urlinfo *ut = newurl ();
              if (parseurl (this_url, ut, 0) != URLOK)
                DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
              else if (!frontcmp (ut->dir, u->dir))
                {
                  /* Failing that too, kill the URL.  */
                  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
                  string_set_add (undesirable_urls, constr);
                  inl = 1;
                }
              freeurl (ut, 1);
            }
        }
      /* If the file does not match the acceptance list, or is on the
         rejection list, chuck it out.  The same goes for the
         directory exclude and include lists.  */
      if (!inl && (opt.includes || opt.excludes))
        {
          if (!accdir (u->dir, ALLABS))
            {
              DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
              string_set_add (undesirable_urls, constr);
              inl = 1;
            }
        }
      /* We check for acceptance/rejection rules only for non-HTML
         documents.  Since we don't know whether they really are
         HTML, it will be deduced from (an OR-ed list):

         1) u->file is "" (meaning it is a directory)
         2) suffix exists, AND:
         3) suffix is neither .html nor .htm

         If the file *is* supposed to be HTML, it will *not* be
         subject to acc/rej rules, unless a finite maximum depth has
         been specified and the current depth is the maximum depth.  */
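      /* Worked example (illustrative, not from the original source):
         with `-R gif -l 3', a link to pic.gif is always matched
         against the rejection list and chucked out, while page.html
         is exempt at depths 1 and 2.  At depth 3 (the maximum),
         page.html would not be recursed into anyway, so it too is
         subject to the acc/rej rules.  */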
          || (((suf = suffix (constr)) != NULL)
              && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
                  && ((opt.reclevel != INFINITE_RECURSION) &&
                      (depth != opt.reclevel))))))
        {
          if (!acceptable (u->file))
            {
              DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                       constr, u->file));
              string_set_add (undesirable_urls, constr);
              inl = 1;
            }
        }
      /* Optimize the URL (which includes possible DNS lookup) only
         after all other possibilities have been exhausted.  */
      if (!inl)
        {
          if (!opt.simple_check)
            opt_url (u);
          else
            {
              char *p;

              /* Just lowercase the hostname.  */
              for (p = u->host; *p; p++)
                *p = TOLOWER (*p);
              xfree (u->url);
              u->url = str_url (u, 0);
            }
          xfree (constr);
          constr = xstrdup (u->url);
          string_set_add (undesirable_urls, constr);
          if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
            if (!opt.spanhost && this_url && !same_host (this_url, constr))
              {
                DEBUGP (("This is not the same hostname as the parent's.\n"));
                string_set_add (undesirable_urls, constr);
                inl = 1;
              }
        }
      /* What about robots.txt?  */
      if (!inl && opt.use_robots && u->proto == URLHTTP)
        {
          /* Since Wget knows about only one set of robot rules at a
             time, /robots.txt must be reloaded whenever a new host is
             accessed.

             robots_host holds the host the current `forbidden' list
             applies to.  */
          if (!robots_host || !same_host (robots_host, u->host))
            {
              FREE_MAYBE (robots_host);
              /* Now make robots_host the new host, no matter what the
                 result will be.  So if there is no /robots.txt on the
                 site, Wget will not retry getting robots all the
                 time.  */
              robots_host = xstrdup (u->host);
              free_vec (forbidden);
              forbidden = NULL;
              err = retrieve_robots (constr, ROBOTS_FILENAME);
              if (err == ROBOTSOK)
                {
                  rurl = robots_url (constr, ROBOTS_FILENAME);
                  rfile = url_filename (rurl);
                  forbidden = parse_robots (rfile);
                  freeurl (rurl, 1);
                  xfree (rfile);
                }
            }

          /* Now that we have (or don't have) robots, we can check for
             them.  */
          if (!robots_match (u, forbidden))
            {
              DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
                       ROBOTS_FILENAME));
              string_set_add (undesirable_urls, constr);
              inl = 1;
            }
        }
      /* If it wasn't chucked out, do something with it.  */
      if (!inl)
        {
          DEBUGP (("I've decided to load it -> "));
          /* Add it to the list of already-loaded URLs.  */
          string_set_add (undesirable_urls, constr);
          /* Automatically followed FTPs will *not* be downloaded
             recursively.  */
          if (u->proto == URLFTP)
            {
              /* Don't you adore side-effects?  */
              opt.recursive = 0;
            }
          /* Reset its type.  */
          dt = 0;
          /* Retrieve it.  */
          retrieve_url (constr, &filename, &newloc,
                        canon_this_url ? canon_this_url : this_url, &dt);
          if (u->proto == URLFTP)
            {
              /* Restore the recursive flag.  */
              opt.recursive = 1;
            }
          if (newloc)
            {
              xfree (constr);
              constr = newloc;
            }
          /* If there was no error, and the type is text/html, parse
             it recursively.  */
          if (dt & TEXTHTML)
            {
              if (dt & RETROKF)
                recursive_retrieve (filename, constr);
            }
          else
            DEBUGP (("%s is not text/html so we don't chase.\n",
                     filename ? filename : "(null)"));
          if (opt.delete_after || (filename && !acceptable (filename)))
            {
              /* Either --delete-after was specified, or we loaded this otherwise
                 rejected (e.g. by -R) HTML file just so we could harvest its
                 hyperlinks -- in either case, delete the local file.  */
              DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                       opt.delete_after ? "--delete-after" :
                       "recursive rejection criteria"));
              logprintf (LOG_VERBOSE,
                         (opt.delete_after ? _("Removing %s.\n")
                          : _("Removing %s since it should be rejected.\n")),
                         filename);
              if (unlink (filename))
                logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
              dt &= ~RETROKF;
            }
          /* If everything was OK, and links are to be converted, let's
             store the local filename.  */
          if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
            {
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (filename);
            }
        }
      else
        DEBUGP (("%s already in list, so we don't load.\n", constr));
      /* Free filename and constr.  */
      FREE_MAYBE (filename);
      FREE_MAYBE (constr);
      freeurl (u, 1);
      /* Increment the pbuf for the appropriate size.  */
    }
  if (opt.convert_links && !opt.delete_after)
    /* This is merely the first pass: the links that have been
       successfully downloaded are converted.  In the second pass,
       convert_all_links() will also convert those links that have NOT
       been downloaded to their canonical form.  */
    convert_links (file, url_list);
  /* Free the linked list of URLs.  */
  free_urlpos (url_list);
  /* Free the canonical this_url.  */
  FREE_MAYBE (canon_this_url);
  /* Decrement the recursion depth.  */
  --depth;
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else
    return RETROK;
}
/* Register that URL has been downloaded to FILE.  */
void
register_download (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  if (!dl_file_url_map)
    dl_file_url_map = make_string_hash_table (0);
  hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
  if (!dl_url_file_map)
    dl_url_file_map = make_string_hash_table (0);
  hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}

/* Register that FILE is an HTML file downloaded from URL.  */
void
register_html (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  downloaded_html_files = slist_prepend (downloaded_html_files, file);
}
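/* Illustrative sketch (not part of the original source): how the two
   maps fit together.  Given a local file name one can recover the URL
   it came from, and vice versa, using the same hash_table_get()
   accessor that convert_all_links() relies on below.  */
#if 0
static void
example_map_roundtrip (const char *file)
{
  char *url = hash_table_get (dl_file_url_map, file);      /* file -> URL */
  if (url)
    {
      char *back = hash_table_get (dl_url_file_map, url);  /* URL -> file */
      DEBUGP (("%s was saved from %s (local copy: %s)\n", file, url, back));
    }
}
#endif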
/* convert_links() is called from recursive_retrieve() after we're
   done with an HTML file.  This call to convert_links is not complete
   because it converts only the downloaded files: Wget cannot know
   which files will be downloaded afterwards.  So, if we have a file
   fileone.html with:

       <a href="/c/something.gif">

   and /c/something.gif was not downloaded because it exceeded the
   recursion depth, the reference will *not* be changed.

   However, later we can encounter /c/something.gif from an "upper"
   level HTML (let's call it filetwo.html), and it gets downloaded.

   But now we have a problem because /c/something.gif will be
   correctly transformed in filetwo.html, but not in fileone.html,
   since Wget could not have known that /c/something.gif would be
   downloaded in the future.

   This is why Wget must, after the whole retrieval, call
   convert_all_links() to go once more through the entire list of
   retrieved HTML files, and re-convert them.

   All the downloaded HTML files are kept in downloaded_html_files,
   and the downloaded URLs in dl_url_file_map.  From these two lists
   the information needed for conversion is extracted.  */
void
convert_all_links (void)
{
  slist *html;

  /* Destructively reverse downloaded_html_files to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently.  */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));
      /* Determine the URL of the HTML file.  get_urls_html() will need
         it.  */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
        DEBUGP (("It should correspond to %s.\n", url));
      else
        DEBUGP (("I cannot find the corresponding URL.\n"));
      /* Parse the HTML file...  */
      urls = get_urls_html (html->string, url, FALSE, NULL);
      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */
      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;

          /* The URL must be in canonical form to be compared.  */
          struct urlinfo *u = newurl ();
          uerr_t res = parseurl (cur_url->url, u, 0);
          if (res != URLOK)
            {
              freeurl (u, 1);
              continue;
            }
          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
          local_name = hash_table_get (dl_url_file_map, u->url);
          if (local_name)
            DEBUGP (("%s marked for conversion, local %s\n",
                     u->url, local_name));
          /* Decide on the conversion direction.  */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
            }
          freeurl (u, 1);
        }
      /* Convert the links in the file.  */
      convert_links (html->string, urls);
      free_urlpos (urls);
    }
}
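/* A concrete before/after picture of the two directions (illustrative
   comment, not from the original source; file names hypothetical).
   Suppose http://host/a/index.html was saved as a/index.html and
   contained:

       <a href="/a/next.html">    -- downloaded, saved as a/next.html
       <img src="/c/pic.gif">     -- not downloaded

   After convert_all_links() the local copy reads:

       <a href="next.html">                  (CO_CONVERT_TO_RELATIVE)
       <img src="http://host/c/pic.gif">     (CO_CONVERT_TO_COMPLETE)  */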
/* Robots support.  */

/* Construct the robots URL.  */
static struct urlinfo *
robots_url (const char *url, const char *robots_filename)
{
  struct urlinfo *u = newurl ();
  uerr_t err;

  err = parseurl (url, u, 0);
  assert (err == URLOK && u->proto == URLHTTP);
  xfree (u->file);
  xfree (u->dir);
  xfree (u->url);
  u->dir = xstrdup ("");
  u->file = xstrdup (robots_filename);
  u->url = str_url (u, 0);
  return u;
}
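/* For example (illustrative): robots_url
   ("http://www.example.com/dir/page.html", ROBOTS_FILENAME) yields a
   urlinfo whose ->url is "http://www.example.com/robots.txt" -- the
   directory is emptied and the file name replaced, so the robots file
   is always fetched from the server root.  */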
/* Retrieves the robots_filename from the root server directory, if
   possible.  Returns ROBOTSOK if robots were retrieved OK, and
   NOROBOTS if robots could not be retrieved for any reason.  */
static uerr_t
retrieve_robots (const char *url, const char *robots_filename)
{
  int dt;
  uerr_t err;
  struct urlinfo *u;

  u = robots_url (url, robots_filename);
  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
  freeurl (u, 1);
  if (err == RETROK)
    return ROBOTSOK;
  else
    return NOROBOTS;
}
/* Parse the robots_filename and return the disallowed path components
   in a malloc-ed vector of character pointers.

   It should be fully compliant with the syntax as described in the
   file norobots.txt, adopted by the robots mailing list
   (robots@webcrawler.com).  */
static char **
parse_robots (const char *robots_filename)
{
  FILE *fp;
  char **entries;
  char *line, *cmd, *str, *p;
  char *base_version, *version;
  int num, i;
  int wget_matched;             /* is the part meant for Wget?  */

  entries = NULL;
  num = 0;

  fp = fopen (robots_filename, "rb");
  if (!fp)
    return NULL;
  /* Kill version number.  */
  if (opt.useragent)
    {
      STRDUP_ALLOCA (base_version, opt.useragent);
      STRDUP_ALLOCA (version, opt.useragent);
    }
  else
    {
      int len = 10 + strlen (version_string);
      base_version = (char *)alloca (len);
      sprintf (base_version, "Wget/%s", version_string);
      version = (char *)alloca (len);
      sprintf (version, "Wget/%s", version_string);
    }
  for (p = version; *p; p++)
    *p = TOLOWER (*p);
  for (p = base_version; *p && *p != '/'; p++)
    *p = TOLOWER (*p);
  *p = '\0';
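  /* E.g. (illustrative): with version_string "1.7" and no custom
     user agent, version becomes "wget/1.7", while base_version is cut
     at the slash, leaving just "wget".  User-agent lines are matched
     against these lowercased forms below.  */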
  /* Setting this to 1 means that Wget considers itself under
     restrictions by default, even if the User-Agent field is not
     present.  However, if it finds the user-agent set to anything
     other than Wget, the rest will be ignored (up to the following
     User-Agent field).  Thus you may have something like:

     Disallow: 1
     Disallow: 2
     User-Agent: stupid-robot
     Disallow: 3
     Disallow: 4
     User-Agent: Wget
     Disallow: 5
     Disallow: 6
     User-Agent: *
     Disallow: 7

     In this case the 1, 2, 5, 6 and 7 disallow lines will be
     stored.  */
  wget_matched = 1;
  while ((line = read_whole_line (fp)))
    {
      int len = strlen (line);
      /* Destroy <CR><LF> if present.  */
      if (len && line[len - 1] == '\n')
        line[--len] = '\0';
      if (len && line[len - 1] == '\r')
        line[--len] = '\0';
      /* According to specifications, optional space may be at the
         end of a line.  */
      DEBUGP (("Line: %s\n", line));
      for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
      if (!*cmd)
        {
          xfree (line);
          DEBUGP (("(chucked out)\n"));
          continue;
        }
      /* Look for ':'.  */
      for (str = cmd; *str && *str != ':'; str++);
      if (!*str)
        {
          xfree (line);
          DEBUGP (("(chucked out)\n"));
          continue;
        }
      /* Zero-terminate the command.  */
      *str++ = '\0';
      /* Look for the string beginning...  */
      for (; *str && ISSPACE (*str); str++);
      /* Look for comments or trailing spaces and kill them off.  */
      for (p = str; *p; p++)
        if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
          {
            /* We have found either a shell-style comment `<sp>+#' or some
               trailing spaces.  Now rewind to the beginning of the spaces
               and place '\0' there.  */
            while (p > str && ISSPACE (*p))
              --p;
            *(p + 1) = '\0';
            break;
          }
      if (!strcasecmp (cmd, "User-agent"))
        {
          int match = 0;
          /* Lowercase the agent string.  */
          for (p = str; *p; p++)
            *p = TOLOWER (*p);
          /* If the string is `*', it matches.  */
          if (*str == '*' && !*(str + 1))
            match = 1;
          else
            {
              /* If the string contains wildcards, we'll run it through
                 fnmatch().  */
              if (has_wildcards_p (str))
                {
                  /* If the string contains '/', compare with the full
                     version.  Else, compare it to base_version.  */
                  if (strchr (str, '/'))
                    match = !fnmatch (str, version, 0);
                  else
                    match = !fnmatch (str, base_version, 0);
                }
              else              /* Substring search */
                {
                  if (strstr (version, str))
                    match = 1;
                  else
                    match = 0;
                }
            }
          /* If Wget is not matched, skip all the entries up to the
             next User-agent field.  */
          wget_matched = match;
        }
      else if (!wget_matched)
        {
          xfree (line);
          DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
          continue;
        }
      else if (!strcasecmp (cmd, "Disallow"))
        {
          /* If "Disallow" is empty, the robot is welcome.  */
          if (!*str)
            {
              free_vec (entries);
              entries = (char **)xmalloc (sizeof (char *));
              *entries = NULL;
              num = 0;
            }
          else
            {
              /* Strip trailing spaces first, according to
                 specifications, so that the stored copy is the
                 stripped one.  */
              for (i = strlen (str) - 1; i >= 0 && ISSPACE (str[i]); i--)
                str[i] = '\0';
              entries = (char **)xrealloc (entries, (num + 2) * sizeof (char *));
              entries[num] = xstrdup (str);
              entries[++num] = NULL;
            }
        }
      else
        {
          /* unknown command */
          xfree (line);
          DEBUGP (("(chucked out)\n"));
          continue;
        }
      xfree (line);
    }
  fclose (fp);
  return entries;
}
/* May the URL url be loaded according to disallowing rules stored in
   forbidden?  */
static int
robots_match (struct urlinfo *u, char **fb)
{
  int l;

  if (!fb)
    return 1;
  DEBUGP (("Matching %s against: ", u->path));
  for (; *fb; fb++)
    {
      DEBUGP (("%s ", *fb));
      l = strlen (*fb);
      /* If dir is fb, we may not load the file.  */
      if (strncmp (u->path, *fb, l) == 0)
        {
          DEBUGP (("matched.\n"));
          return 0; /* Matches, i.e. does not load...  */
        }
    }
  DEBUGP (("not matched.\n"));
  return 1;
}
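/* Example of the prefix semantics above (illustrative): with
   forbidden = { "/cgi-bin/", "/tmp", NULL },

       /cgi-bin/search   -> refused  ("/cgi-bin/" is a prefix)
       /tmp/scratch      -> refused  ("/tmp" is a prefix)
       /tmpest.html      -> refused  (plain prefix match, as the
                                      robots exclusion rules specify)
       /home.html        -> loaded   (no rule matches)  */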