1 /* Handling of recursive HTTP retrieving.
2 Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
28 #endif /* HAVE_STRING_H */
31 #endif /* HAVE_UNISTD_H */
35 #include <sys/types.h>
47 extern char *version_string;
49 #define ROBOTS_FILENAME "robots.txt"
51 static struct hash_table *dl_file_url_map;
52 static struct hash_table *dl_url_file_map;
54 /* List of HTML URLs. */
55 static slist *urls_html;
57 /* List of undesirable-to-load URLs. */
58 static struct hash_table *undesirable_urls;
60 /* List of forbidden locations. */
61 static char **forbidden = NULL;
63 /* Current recursion depth. */
66 /* Base directory we're recursing from (used by no_parent). */
67 static char *base_dir;
69 /* The host name for which we last checked robots. */
70 static char *robots_host;
72 static int first_time = 1;
74 /* Construct the robots URL. */
75 static struct urlinfo *robots_url PARAMS ((const char *, const char *));
76 static uerr_t retrieve_robots PARAMS ((const char *, const char *));
77 static char **parse_robots PARAMS ((const char *));
78 static int robots_match PARAMS ((struct urlinfo *, char **));
81 /* Cleanup the data structures associated with recursive retrieving
82 (the variables above). */
84 recursive_cleanup (void)
88 string_set_free (undesirable_urls);
89 undesirable_urls = NULL;
93 free_keys_and_values (dl_file_url_map);
94 hash_table_destroy (dl_file_url_map);
95 dl_file_url_map = NULL;
99 free_keys_and_values (dl_url_file_map);
100 hash_table_destroy (dl_url_file_map);
101 dl_url_file_map = NULL;
103 undesirable_urls = NULL;
104 free_vec (forbidden);
106 slist_free (urls_html);
108 FREE_MAYBE (base_dir);
109 FREE_MAYBE (robots_host);
113 /* Reset FIRST_TIME to 1, so that some action can be taken in
114 recursive_retrieve(). */
116 recursive_reset (void)
121 /* The core of recursive retrieving. Endless recursion is avoided by
122 keeping a set of already-encountered URLs, which is checked
123 before loading any URL. That way no URL can get loaded twice.
125 The function also supports specification of maximum recursion depth
126 and a number of other goodies. */
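/* Illustration only (not part of the original file): a minimal sketch of
   the "already seen" check described above, built from the string-set
   helpers this file already uses (make_string_hash_table, string_set_exists,
   string_set_add).  The helper name is hypothetical and the block is kept
   out of compilation.  */
#if 0
static int
url_seen_before (struct hash_table *visited, const char *url)
{
  if (string_set_exists (visited, url))
    return 1;                     /* loaded once already -- skip it */
  string_set_add (visited, url);  /* remember it for later checks */
  return 0;
}
#endif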
128 recursive_retrieve (const char *file, const char *this_url)
130 char *constr, *filename, *newloc;
131 char *canon_this_url = NULL;
132 int dt, inl, dash_p_leaf_HTML = FALSE;
133 int meta_disallow_follow;
134 int this_url_ftp; /* See the explanation below. */
136 struct urlinfo *rurl;
137 urlpos *url_list, *cur_url;
138 char *rfile; /* For robots */
141 assert (this_url != NULL);
142 assert (file != NULL);
143 /* If quota was exceeded earlier, bail out. */
144 if (downloaded_exceeds_quota ())
146 /* Cache the current URL in the list. */
149 /* These three operations need to be done only once per Wget
150 run. They should probably be at a different location. */
151 if (!undesirable_urls)
152 undesirable_urls = make_string_hash_table (0);
153 if (!dl_file_url_map)
154 dl_file_url_map = make_string_hash_table (0);
155 if (!dl_url_file_map)
156 dl_url_file_map = make_string_hash_table (0);
158 hash_table_clear (undesirable_urls);
159 string_set_add (undesirable_urls, this_url);
160 hash_table_clear (dl_file_url_map);
161 hash_table_clear (dl_url_file_map);
163 /* Enter this_url into the hash table, in original and "enhanced" form. */
165 err = parseurl (this_url, u, 0);
168 string_set_add (undesirable_urls, u->url);
169 hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (u->url));
170 hash_table_put (dl_url_file_map, xstrdup (u->url), xstrdup (file));
171 urls_html = slist_prepend (urls_html, file);
173 base_dir = xstrdup (u->dir); /* Set the base dir. */
174 /* Set the canonical this_url to be sent as referer. This
175 problem exists only when running the first time. */
176 canon_this_url = xstrdup (u->url);
180 DEBUGP (("Double yuck! The *base* URL is broken.\n"));
192 if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
193 /* We've exceeded the maximum recursion depth specified by the user. */
195 if (opt.page_requisites && depth <= opt.reclevel + 1)
196 /* When -p is specified, we can do one more partial recursion from the
197 "leaf nodes" on the HTML document tree. The recursion is partial in
198 that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
199 except for <LINK REL="stylesheet">. */
200 dash_p_leaf_HTML = TRUE;
202 /* Either -p wasn't specified or it was and we've already gone the one
203 extra (pseudo-)level that it affords us, so we need to bail out. */
205 DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
206 depth, opt.reclevel));
212 /* Determine whether this_url is an FTP URL. If it is, it means
213 that the retrieval is done through proxy. In that case, FTP
214 links will be followed by default and recursion will not be
215 turned off when following them. */
216 this_url_ftp = (urlproto (this_url) == URLFTP);
218 /* Get the URL-s from an HTML file: */
219 url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
220 dash_p_leaf_HTML, &meta_disallow_follow);
222 if (opt.use_robots && meta_disallow_follow)
224 /* The META tag says we are not to follow this file. Respect
226 free_urlpos (url_list);
230 /* Decide what to do with each of the URLs. A URL will be loaded if
231 it meets several requirements, discussed later. */
232 for (cur_url = url_list; cur_url; cur_url = cur_url->next)
234 /* If quota was exceeded earlier, bail out. */
235 if (downloaded_exceeds_quota ())
237 /* Parse the URL for convenient use in other functions, as well
238 as to get the optimized form. It also checks URL integrity. */
240 if (parseurl (cur_url->url, u, 0) != URLOK)
242 DEBUGP (("Yuck! A bad URL.\n"));
246 if (u->proto == URLFILE)
248 DEBUGP (("Nothing to do with file:// around here.\n"));
252 assert (u->url != NULL);
253 constr = xstrdup (u->url);
255 /* Several checks to determine whether a file is acceptable to load:
256 1. check if URL is ftp, and we don't load it
257 2. check for relative links (if relative_only is set)
259 4. check for no-parent
260 5. check for excludes && includes
262 7. check for same host (if spanhost is unset), with possible
263 gethostbyname baggage
264 8. check for robots.txt
266 Addendum: If the URL is FTP, and it is to be loaded, only the
267 domain and suffix settings are "stronger".
269 Note that .html and (yuck) .htm will get loaded regardless of
270 suffix rules (but that is remedied later with unlink) unless
271 the depth equals the maximum depth.
273 More time- and memory-consuming tests should be put later on
276 /* inl is set if the URL we are working on (constr) is stored in
277 undesirable_urls. Using it is crucial to avoid unnecessary
278 repeated hits to the hash table. */
279 inl = string_set_exists (undesirable_urls, constr);
281 /* If it is FTP, and FTP is not followed, chuck it out. */
283 if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
285 DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
286 string_set_add (undesirable_urls, constr);
289 /* If it is an absolute link and absolute links are not followed, chuck it
291 if (!inl && u->proto != URLFTP)
292 if (opt.relative_only && !cur_url->link_relative_p)
294 DEBUGP (("It doesn't really look like a relative link.\n"));
295 string_set_add (undesirable_urls, constr);
298 /* If its domain is not to be accepted/looked-up, chuck it out. */
300 if (!accept_domain (u))
302 DEBUGP (("I don't like the smell of that domain.\n"));
303 string_set_add (undesirable_urls, constr);
306 /* Check for parent directory. */
307 if (!inl && opt.no_parent
308 /* If the new URL is FTP and the old was not, ignore
310 && !(!this_url_ftp && u->proto == URLFTP))
312 /* Check for base_dir first. */
313 if (!(base_dir && frontcmp (base_dir, u->dir)))
315 /* Failing that, check for parent dir. */
316 struct urlinfo *ut = newurl ();
317 if (parseurl (this_url, ut, 0) != URLOK)
318 DEBUGP (("Double yuck! The *base* URL is broken.\n"));
319 else if (!frontcmp (ut->dir, u->dir))
321 /* Failing that too, kill the URL. */
322 DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
323 string_set_add (undesirable_urls, constr);
329 /* If the file does not match the acceptance list, or is on the
330 rejection list, chuck it out. The same goes for the
331 directory exclude- and include-lists. */
332 if (!inl && (opt.includes || opt.excludes))
334 if (!accdir (u->dir, ALLABS))
336 DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
337 string_set_add (undesirable_urls, constr);
344 /* We check for acceptance/rejection rules only for non-HTML
345 documents. Since we don't know whether they really are
346 HTML, it will be deduced from (an OR-ed list):
348 1) u->file is "" (meaning it is a directory)
349 2) suffix exists, AND:
353 If the file *is* supposed to be HTML, it will *not* be
354 subject to acc/rej rules, unless a finite maximum depth has
355 been specified and the current depth is the maximum depth. */
358 || (((suf = suffix (constr)) != NULL)
359 && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
360 && ((opt.reclevel != INFINITE_RECURSION) &&
361 (depth != opt.reclevel))))))
363 if (!acceptable (u->file))
365 DEBUGP (("%s (%s) does not match acc/rej rules.\n",
367 string_set_add (undesirable_urls, constr);
373 /* Optimize the URL (which includes possible DNS lookup) only
374 after all other possibilities have been exhausted. */
377 if (!opt.simple_check)
382 /* Just lowercase the hostname. */
383 for (p = u->host; *p; p++)
386 u->url = str_url (u, 0);
389 constr = xstrdup (u->url);
390 string_set_add (undesirable_urls, constr);
391 if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
392 if (!opt.spanhost && this_url && !same_host (this_url, constr))
394 DEBUGP (("This is not the same hostname as the parent's.\n"));
395 string_set_add (undesirable_urls, constr);
399 /* What about robots.txt? */
400 if (!inl && opt.use_robots && u->proto == URLHTTP)
402 /* Since Wget knows about only one set of robot rules at a
403 time, /robots.txt must be reloaded whenever a new host is
406 robots_host holds the host the current `forbidden' variable
408 if (!robots_host || !same_host (robots_host, u->host))
410 FREE_MAYBE (robots_host);
411 /* Now make robots_host the new host, no matter what the
412 result will be. So if there is no /robots.txt on the
413 site, Wget will not retry getting robots all the
415 robots_host = xstrdup (u->host);
416 free_vec (forbidden);
418 err = retrieve_robots (constr, ROBOTS_FILENAME);
421 rurl = robots_url (constr, ROBOTS_FILENAME);
422 rfile = url_filename (rurl);
423 forbidden = parse_robots (rfile);
429 /* Now that we have (or don't have) robots, we can check for
431 if (!robots_match (u, forbidden))
433 DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
435 string_set_add (undesirable_urls, constr);
441 /* If it wasn't chucked out, do something with it. */
444 DEBUGP (("I've decided to load it -> "));
445 /* Add it to the list of already-loaded URL-s. */
446 string_set_add (undesirable_urls, constr);
447 /* Automatically followed FTPs will *not* be downloaded
449 if (u->proto == URLFTP)
451 /* Don't you adore side-effects? */
454 /* Reset its type. */
457 retrieve_url (constr, &filename, &newloc,
458 canon_this_url ? canon_this_url : this_url, &dt);
459 if (u->proto == URLFTP)
469 /* In case of convert_links: If there was no error, add it to
470 the list of downloaded URLs. We might need it for
472 if (opt.convert_links && filename)
476 hash_table_put (dl_file_url_map,
477 xstrdup (filename), xstrdup (constr));
478 hash_table_put (dl_url_file_map,
479 xstrdup (constr), xstrdup (filename));
480 /* If the URL is HTML, note it. */
482 urls_html = slist_prepend (urls_html, filename);
485 /* If there was no error, and the type is text/html, parse
490 recursive_retrieve (filename, constr);
493 DEBUGP (("%s is not text/html so we don't chase.\n",
494 filename ? filename: "(null)"));
496 if (opt.delete_after || (filename && !acceptable (filename)))
497 /* Either --delete-after was specified, or we loaded this otherwise
498 rejected (e.g. by -R) HTML file just so we could harvest its
499 hyperlinks -- in either case, delete the local file. */
501 DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
502 opt.delete_after ? "--delete-after" :
503 "recursive rejection criteria"));
504 logprintf (LOG_VERBOSE,
505 (opt.delete_after ? _("Removing %s.\n")
506 : _("Removing %s since it should be rejected.\n")),
508 if (unlink (filename))
509 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
513 /* If everything was OK, and links are to be converted, let's
514 store the local filename. */
515 if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
517 cur_url->convert = CO_CONVERT_TO_RELATIVE;
518 cur_url->local_name = xstrdup (filename);
522 DEBUGP (("%s already in list, so we don't load.\n", constr));
523 /* Free filename and constr. */
524 FREE_MAYBE (filename);
527 /* Increment the pbuf for the appropriate size. */
529 if (opt.convert_links && !opt.delete_after)
530 /* This is merely the first pass: the links that have been
531 successfully downloaded are converted. In the second pass,
532 convert_all_links() will also convert those links that have NOT
533 been downloaded to their canonical form. */
534 convert_links (file, url_list);
535 /* Free the linked list of URL-s. */
536 free_urlpos (url_list);
537 /* Free the canonical this_url. */
538 FREE_MAYBE (canon_this_url);
539 /* Decrement the recursion depth. */
541 if (downloaded_exceeds_quota ())
547 /* convert_links() is called from recursive_retrieve() after we're
548 done with an HTML file. This call to convert_links is not complete
549 because it converts only the downloaded files, and Wget cannot know
550 which files will be downloaded afterwards. So, if we have file
553 <a href="/c/something.gif">
555 and /c/something.gif was not downloaded because it exceeded the
556 recursion depth, the reference will *not* be changed.
558 However, later we can encounter /c/something.gif from an "upper"
559 level HTML (let's call it filetwo.html), and it gets downloaded.
561 But now we have a problem because /c/something.gif will be
562 correctly transformed in filetwo.html, but not in fileone.html,
563 since Wget could not have known that /c/something.gif will be
564 downloaded in the future.
566 This is why Wget must, after the whole retrieval, call
567 convert_all_links to go once more through the entire list of
568 retrieved HTMLs, and re-convert them.
570 All the downloaded HTMLs are kept in urls_html, and downloaded URLs
571 in urls_downloaded. From these two lists information is
574 convert_all_links (void)
578 /* Destructively reverse urls_html to get it in the right order.
579 recursive_retrieve() used slist_prepend() consistently. */
580 urls_html = slist_nreverse (urls_html);
582 for (html = urls_html; html; html = html->next)
584 urlpos *urls, *cur_url;
587 DEBUGP (("Rescanning %s\n", html->string));
588 /* Determine the URL of the HTML file. get_urls_html will need
590 url = hash_table_get (dl_file_url_map, html->string);
592 DEBUGP (("It should correspond to %s.\n", url));
594 DEBUGP (("I cannot find the corresponding URL.\n"));
595 /* Parse the HTML file... */
596 urls = get_urls_html (html->string, url, FALSE, NULL);
597 /* We don't respect meta_disallow_follow here because, even if
598 the file is not followed, we might still want to convert the
599 links that have been followed from other files. */
600 for (cur_url = urls; cur_url; cur_url = cur_url->next)
604 /* The URL must be in canonical form to be compared. */
605 struct urlinfo *u = newurl ();
606 uerr_t res = parseurl (cur_url->url, u, 0);
612 /* We decide the direction of conversion according to whether
613 a URL was downloaded. Downloaded URLs will be converted
614 ABS2REL, whereas non-downloaded will be converted REL2ABS. */
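/* For example (illustrative names only): a link to a page that was saved
   locally is rewritten to point at that local file by a relative name,
   while a link to a page that was never downloaded is rewritten to its
   full "http://..." form so it still works when the tree is browsed
   locally.  */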
615 local_name = hash_table_get (dl_url_file_map, u->url);
617 DEBUGP (("%s marked for conversion, local %s\n",
618 u->url, local_name));
619 /* Decide on the conversion direction. */
622 /* We've downloaded this URL. Convert it to relative
623 form. We do this even if the URL already is in
624 relative form, because our directory structure may
625 not be identical to that on the server (think `-nd',
626 `--cut-dirs', etc.) */
627 cur_url->convert = CO_CONVERT_TO_RELATIVE;
628 cur_url->local_name = xstrdup (local_name);
632 /* We haven't downloaded this URL. If it's not already
633 complete (including a full host name), convert it to
634 that form, so it can be reached while browsing this
636 if (!cur_url->link_complete_p)
637 cur_url->convert = CO_CONVERT_TO_COMPLETE;
638 cur_url->local_name = NULL;
642 /* Convert the links in the file. */
643 convert_links (html->string, urls);
649 /* Robots support. */
651 /* Construct the robots URL. */
652 static struct urlinfo *
653 robots_url (const char *url, const char *robots_filename)
655 struct urlinfo *u = newurl ();
658 err = parseurl (url, u, 0);
659 assert (err == URLOK && u->proto == URLHTTP);
663 u->dir = xstrdup ("");
664 u->file = xstrdup (robots_filename);
665 u->url = str_url (u, 0);
669 /* Retrieves the robots_filename from the root server directory, if
670 possible. Returns ROBOTSOK if robots were retrieved OK, and
671 NOROBOTS if robots could not be retrieved for any reason. */
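/* For illustration (hypothetical host): with url =
   "http://www.example.com/a/b.html" and robots_filename = "robots.txt",
   robots_url() above replaces the directory and file components, so the
   request goes to "http://www.example.com/robots.txt".  */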
673 retrieve_robots (const char *url, const char *robots_filename)
679 u = robots_url (url, robots_filename);
680 logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
681 err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
689 /* Parse the robots_filename and return the disallowed path components
690 in a malloc-ed vector of character pointers.
692 It should be fully compliant with the syntax as described in the
693 file norobots.txt, adopted by the robots mailing list
694 (robots@webcrawler.com). */
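/* Illustration only (example rules, not from the original file): given a
   robots.txt such as

     User-agent: *
     Disallow: /cgi-bin/
     Disallow: /tmp/

   this function returns a NULL-terminated vector equivalent to

     { "/cgi-bin/", "/tmp/", NULL }

   which robots_match() below then checks against u->path as a prefix.  */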
696 parse_robots (const char *robots_filename)
700 char *line, *cmd, *str, *p;
701 char *base_version, *version;
703 int wget_matched; /* is the part meant for Wget? */
708 fp = fopen (robots_filename, "rb");
712 /* Kill version number. */
715 STRDUP_ALLOCA (base_version, opt.useragent);
716 STRDUP_ALLOCA (version, opt.useragent);
720 int len = 10 + strlen (version_string);
721 base_version = (char *)alloca (len);
722 sprintf (base_version, "Wget/%s", version_string);
723 version = (char *)alloca (len);
724 sprintf (version, "Wget/%s", version_string);
726 for (p = version; *p; p++)
728 for (p = base_version; *p && *p != '/'; p++)
732 /* Setting this to 1 means that Wget considers itself under
733 restrictions by default, even if the User-Agent field is not
734 present. However, if it finds the user-agent set to anything
735 other than Wget, the rest will be ignored (up to the following
736 User-Agent field). Thus you may have something like:
740 User-Agent: stupid-robot
749 In this case the 1, 2, 5, 6 and 7 disallow lines will be
752 while ((line = read_whole_line (fp)))
755 /* Destroy <CR><LF> if present. */
756 if (len && line[len - 1] == '\n')
758 if (len && line[len - 1] == '\r')
760 /* According to specifications, optional space may be at the
762 DEBUGP (("Line: %s\n", line));
764 for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
768 DEBUGP (("(chucked out)\n"));
772 for (str = cmd; *str && *str != ':'; str++);
776 DEBUGP (("(chucked out)\n"));
779 /* Zero-terminate the command. */
781 /* Look for the string beginning... */
782 for (; *str && ISSPACE (*str); str++);
783 /* Look for comments or trailing spaces and kill them off. */
784 for (p = str; *p; p++)
785 if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
787 /* We have found either a shell-style comment `<sp>+#' or some
788 trailing spaces. Now rewind to the beginning of the spaces
789 and place '\0' there. */
790 while (p > str && ISSPACE (*p))
798 if (!strcasecmp (cmd, "User-agent"))
801 /* Lowercase the agent string. */
802 for (p = str; *p; p++)
804 /* If the string is `*', it matches. */
805 if (*str == '*' && !*(str + 1))
809 /* If the string contains wildcards, we'll run it through
811 if (has_wildcards_p (str))
813 /* If the string contains '/', compare with the full
814 version. Else, compare it to base_version. */
815 if (strchr (str, '/'))
816 match = !fnmatch (str, version, 0);
818 match = !fnmatch (str, base_version, 0);
820 else /* Substring search */
822 if (strstr (version, str))
828 /* If Wget is not matched, skip all the entries up to the
829 next User-agent field. */
830 wget_matched = match;
832 else if (!wget_matched)
835 DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
838 else if (!strcasecmp (cmd, "Disallow"))
840 /* If "Disallow" is empty, the robot is welcome. */
844 entries = (char **)xmalloc (sizeof (char *));
850 entries = (char **)xrealloc (entries, (num + 2) * sizeof (char *));
851 /* Strip trailing spaces, according to specifications, before storing. */
852 for (i = strlen (str) - 1; i >= 0 && ISSPACE (str[i]); i--)
853 str[i] = '\0';
854 entries[num] = xstrdup (str);
855 entries[++num] = NULL;
861 /* unknown command */
862 DEBUGP (("(chucked out)\n"));
870 /* May the URL url be loaded according to disallowing rules stored in
873 robots_match (struct urlinfo *u, char **forbidden)
879 DEBUGP (("Matching %s against: ", u->path));
880 for (; *forbidden; forbidden++)
882 DEBUGP (("%s ", *forbidden));
883 l = strlen (*forbidden);
884 /* If dir is forbidden, we may not load the file. */
885 if (strncmp (u->path, *forbidden, l) == 0)
887 DEBUGP (("matched.\n"));
888 return 0; /* Matches, i.e. does not load... */
891 DEBUGP (("not matched.\n"));