1 /* Handling of recursive HTTP retrieving.
2 Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
28 #endif /* HAVE_STRING_H */
31 #endif /* HAVE_UNISTD_H */
34 #include <sys/types.h>
50 extern char *version_string;
52 #define ROBOTS_FILENAME "robots.txt"
54 static struct hash_table *dl_file_url_map;
55 static struct hash_table *dl_url_file_map;
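/* dl_file_url_map maps a local file name to the URL it was saved
   from, and dl_url_file_map maps a URL to the local file it was
   saved to; both are filled in by register_download() below and
   consulted by convert_all_links() when rewriting links after the
   retrieval is finished.  */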
57 /* List of HTML files downloaded in this Wget run. Used for link
58 conversion after Wget is done. */
59 static slist *downloaded_html_files;
61 /* List of undesirable-to-load URLs. */
62 static struct hash_table *undesirable_urls;
64 /* List of forbidden locations. */
65 static char **forbidden = NULL;
67 /* Current recursion depth. */
70 /* Base directory we're recursing from (used by no_parent). */
71 static char *base_dir;
73 /* The host name for which we last checked robots. */
74 static char *robots_host;
76 static int first_time = 1;
78 /* Construct the robots URL. */
79 static struct urlinfo *robots_url PARAMS ((const char *, const char *));
80 static uerr_t retrieve_robots PARAMS ((const char *, const char *));
81 static char **parse_robots PARAMS ((const char *));
82 static int robots_match PARAMS ((struct urlinfo *, char **));
/* Clean up the data structures associated with recursive retrieving
   (the variables above).  */
88 recursive_cleanup (void)
92 string_set_free (undesirable_urls);
93 undesirable_urls = NULL;
97 free_keys_and_values (dl_file_url_map);
98 hash_table_destroy (dl_file_url_map);
99 dl_file_url_map = NULL;
103 free_keys_and_values (dl_url_file_map);
104 hash_table_destroy (dl_url_file_map);
105 dl_url_file_map = NULL;
107 undesirable_urls = NULL;
108 free_vec (forbidden);
110 slist_free (downloaded_html_files);
111 downloaded_html_files = NULL;
112 FREE_MAYBE (base_dir);
113 FREE_MAYBE (robots_host);
117 /* Reset FIRST_TIME to 1, so that some action can be taken in
118 recursive_retrieve(). */
120 recursive_reset (void)
/* The core of recursive retrieving.  Endless recursion is avoided by
   keeping a record of already-seen URLs (the undesirable_urls string
   hash table), which is checked before loading any URL.  That way no
   URL can get loaded twice.
129 The function also supports specification of maximum recursion depth
130 and a number of other goodies. */
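/* In outline (a paraphrase of the code below):
   1. bail out if the download quota has been exceeded;
   2. on the first call, record the base directory and the canonical
      referer;
   3. bail out, or switch to the -p "leaf" mode, if the maximum
      recursion depth has been reached;
   4. extract the URLs from FILE with get_urls_html();
   5. run each URL through the acceptance checks (FTP, relative,
      domain, no-parent, include/exclude, suffix, host, robots);
   6. retrieve the acceptable ones, recursing into those that come
      back as text/html;
   7. convert the links in FILE if -k was given.  */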
132 recursive_retrieve (const char *file, const char *this_url)
134 char *constr, *filename, *newloc;
135 char *canon_this_url = NULL;
136 int dt, inl, dash_p_leaf_HTML = FALSE;
137 int meta_disallow_follow;
int this_url_ftp; /* See the explanation below. */
140 struct urlinfo *rurl;
141 urlpos *url_list, *cur_url;
142 char *rfile; /* For robots */
145 assert (this_url != NULL);
146 assert (file != NULL);
147 /* If quota was exceeded earlier, bail out. */
148 if (downloaded_exceeds_quota ())
150 /* Cache the current URL in the list. */
153 /* These three operations need to be done only once per Wget
154 run. They should probably be at a different location. */
155 if (!undesirable_urls)
156 undesirable_urls = make_string_hash_table (0);
158 hash_table_clear (undesirable_urls);
159 string_set_add (undesirable_urls, this_url);
/* Enter this_url into the hash table, in original and "enhanced" form.  */
162 err = parseurl (this_url, u, 0);
165 string_set_add (undesirable_urls, u->url);
167 base_dir = xstrdup (u->dir); /* Set the base dir. */
/* Set the canonical this_url to be sent as the referer.  This is
   needed only on the first run.  */
170 canon_this_url = xstrdup (u->url);
174 DEBUGP (("Double yuck! The *base* URL is broken.\n"));
186 if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
187 /* We've exceeded the maximum recursion depth specified by the user. */
189 if (opt.page_requisites && depth <= opt.reclevel + 1)
190 /* When -p is specified, we can do one more partial recursion from the
191 "leaf nodes" on the HTML document tree. The recursion is partial in
192 that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
193 except for <LINK REL="stylesheet">. */
194 dash_p_leaf_HTML = TRUE;
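/* Illustrative example: with `-r -l2 -p', a page found at depth 3
   still gets scanned here, but only its page requisites (inline
   images, <LINK REL="stylesheet">, etc.) are followed, not ordinary
   <A>/<AREA> links.  */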
196 /* Either -p wasn't specified or it was and we've already gone the one
197 extra (pseudo-)level that it affords us, so we need to bail out. */
199 DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
200 depth, opt.reclevel));
/* Determine whether this_url is an FTP URL.  If it is, it means
   that the retrieval is done through a proxy.  In that case, FTP
208 links will be followed by default and recursion will not be
209 turned off when following them. */
210 this_url_ftp = (urlproto (this_url) == URLFTP);
/* Get the URLs from an HTML file: */
213 url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
214 dash_p_leaf_HTML, &meta_disallow_follow);
216 if (opt.use_robots && meta_disallow_follow)
/* The META tag says we are not to follow this file.  Respect that.  */
220 free_urlpos (url_list);
224 /* Decide what to do with each of the URLs. A URL will be loaded if
225 it meets several requirements, discussed later. */
226 for (cur_url = url_list; cur_url; cur_url = cur_url->next)
228 /* If quota was exceeded earlier, bail out. */
229 if (downloaded_exceeds_quota ())
231 /* Parse the URL for convenient use in other functions, as well
232 as to get the optimized form. It also checks URL integrity. */
234 if (parseurl (cur_url->url, u, 0) != URLOK)
236 DEBUGP (("Yuck! A bad URL.\n"));
240 if (u->proto == URLFILE)
242 DEBUGP (("Nothing to do with file:// around here.\n"));
246 assert (u->url != NULL);
247 constr = xstrdup (u->url);
/* Several checks to decide whether a file is acceptable to load:
   1. check if URL is ftp, and we don't load it
   2. check for relative links (if relative_only is set)
   3. check for domain
   4. check for no-parent
   5. check for excludes && includes
   6. check for suffix (acceptance/rejection rules)
   7. check for same host (if spanhost is unset), with possible
   gethostbyname baggage
   8. check for robots.txt
260 Addendum: If the URL is FTP, and it is to be loaded, only the
261 domain and suffix settings are "stronger".
263 Note that .html and (yuck) .htm will get loaded regardless of
264 suffix rules (but that is remedied later with unlink) unless
265 the depth equals the maximum depth.
   More time- and memory-consuming tests should be put later on the
   list.  */
/* inl is set if the URL we are working on (constr) is stored in
   undesirable_urls.  Using it avoids repeated lookups in the hash
   table.  */
273 inl = string_set_contains (undesirable_urls, constr);
275 /* If it is FTP, and FTP is not followed, chuck it out. */
277 if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
279 DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
280 string_set_add (undesirable_urls, constr);
/* If it is an absolute link and absolute links are not followed,
   chuck it out.  */
if (!inl && u->proto != URLFTP)
286 if (opt.relative_only && !cur_url->link_relative_p)
288 DEBUGP (("It doesn't really look like a relative link.\n"));
289 string_set_add (undesirable_urls, constr);
292 /* If its domain is not to be accepted/looked-up, chuck it out. */
294 if (!accept_domain (u))
296 DEBUGP (("I don't like the smell of that domain.\n"));
297 string_set_add (undesirable_urls, constr);
300 /* Check for parent directory. */
301 if (!inl && opt.no_parent
/* If the new URL is FTP and the old was not, ignore
   opt.no_parent.  */
&& !(!this_url_ftp && u->proto == URLFTP))
306 /* Check for base_dir first. */
307 if (!(base_dir && frontcmp (base_dir, u->dir)))
309 /* Failing that, check for parent dir. */
310 struct urlinfo *ut = newurl ();
311 if (parseurl (this_url, ut, 0) != URLOK)
312 DEBUGP (("Double yuck! The *base* URL is broken.\n"));
313 else if (!frontcmp (ut->dir, u->dir))
315 /* Failing that too, kill the URL. */
316 DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
317 string_set_add (undesirable_urls, constr);
323 /* If the file does not match the acceptance list, or is on the
324 rejection list, chuck it out. The same goes for the
325 directory exclude- and include- lists. */
326 if (!inl && (opt.includes || opt.excludes))
328 if (!accdir (u->dir, ALLABS))
330 DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
331 string_set_add (undesirable_urls, constr);
338 /* We check for acceptance/rejection rules only for non-HTML
339 documents. Since we don't know whether they really are
340 HTML, it will be deduced from (an OR-ed list):
342 1) u->file is "" (meaning it is a directory)
343 2) suffix exists, AND:
347 If the file *is* supposed to be HTML, it will *not* be
348 subject to acc/rej rules, unless a finite maximum depth has
349 been specified and the current depth is the maximum depth. */
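/* Illustrative example: with `-r -A "*.jpg"', a page such as
   index.html does not match the accept list, but it is still
   retrieved here so its links can be harvested; the
   otherwise-rejected HTML file is then removed further below (see
   the opt.delete_after / !acceptable (filename) branch).  */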
352 || (((suf = suffix (constr)) != NULL)
353 && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
354 && ((opt.reclevel != INFINITE_RECURSION) &&
355 (depth != opt.reclevel))))))
357 if (!acceptable (u->file))
359 DEBUGP (("%s (%s) does not match acc/rej rules.\n",
361 string_set_add (undesirable_urls, constr);
367 /* Optimize the URL (which includes possible DNS lookup) only
368 after all other possibilities have been exhausted. */
371 if (!opt.simple_check)
376 /* Just lowercase the hostname. */
377 for (p = u->host; *p; p++)
380 u->url = str_url (u, 0);
383 constr = xstrdup (u->url);
384 /* After we have canonicalized the URL, check if we have it
385 on the black list. */
386 if (string_set_contains (undesirable_urls, constr))
388 /* This line is bogus. */
389 /*string_set_add (undesirable_urls, constr);*/
391 if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
392 if (!opt.spanhost && this_url && !same_host (this_url, constr))
394 DEBUGP (("This is not the same hostname as the parent's.\n"));
395 string_set_add (undesirable_urls, constr);
399 /* What about robots.txt? */
400 if (!inl && opt.use_robots && u->proto == URLHTTP)
/* Since Wget knows about only one set of robot rules at a
   time, /robots.txt must be reloaded whenever a new host is
   accessed.

   robots_host holds the host the current `forbidden' vector
   applies to.  */
if (!robots_host || !same_host (robots_host, u->host))
410 FREE_MAYBE (robots_host);
/* Now make robots_host the new host, no matter what the
   result will be.  So if there is no /robots.txt on the
   site, Wget will not retry getting robots all the time.  */
415 robots_host = xstrdup (u->host);
416 free_vec (forbidden);
418 err = retrieve_robots (constr, ROBOTS_FILENAME);
421 rurl = robots_url (constr, ROBOTS_FILENAME);
422 rfile = url_filename (rurl);
423 forbidden = parse_robots (rfile);
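/* At this point forbidden is a NULL-terminated vector of
   disallowed path prefixes, e.g. (illustrative values only):
   { "/cgi-bin/", "/private/", NULL }.  */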
/* Now that we have (or don't have) robots, we can check for
   them.  */
if (!robots_match (u, forbidden))
433 DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
435 string_set_add (undesirable_urls, constr);
441 /* If it wasn't chucked out, do something with it. */
444 DEBUGP (("I've decided to load it -> "));
/* Add it to the list of already-loaded URLs. */
446 string_set_add (undesirable_urls, constr);
/* Automatically followed FTPs will *not* be downloaded
   recursively.  */
if (u->proto == URLFTP)
451 /* Don't you adore side-effects? */
454 /* Reset its type. */
457 retrieve_url (constr, &filename, &newloc,
458 canon_this_url ? canon_this_url : this_url, &dt);
459 if (u->proto == URLFTP)
/* If there was no error, and the type is text/html, parse
   it recursively.  */
474 recursive_retrieve (filename, constr);
477 DEBUGP (("%s is not text/html so we don't chase.\n",
478 filename ? filename: "(null)"));
480 if (opt.delete_after || (filename && !acceptable (filename)))
481 /* Either --delete-after was specified, or we loaded this otherwise
482 rejected (e.g. by -R) HTML file just so we could harvest its
483 hyperlinks -- in either case, delete the local file. */
485 DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
486 opt.delete_after ? "--delete-after" :
487 "recursive rejection criteria"));
488 logprintf (LOG_VERBOSE,
489 (opt.delete_after ? _("Removing %s.\n")
490 : _("Removing %s since it should be rejected.\n")),
492 if (unlink (filename))
493 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
497 /* If everything was OK, and links are to be converted, let's
498 store the local filename. */
499 if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
501 cur_url->convert = CO_CONVERT_TO_RELATIVE;
502 cur_url->local_name = xstrdup (filename);
506 DEBUGP (("%s already in list, so we don't load.\n", constr));
507 /* Free filename and constr. */
508 FREE_MAYBE (filename);
511 /* Increment the pbuf for the appropriate size. */
513 if (opt.convert_links && !opt.delete_after)
514 /* This is merely the first pass: the links that have been
515 successfully downloaded are converted. In the second pass,
516 convert_all_links() will also convert those links that have NOT
517 been downloaded to their canonical form. */
518 convert_links (file, url_list);
/* Free the linked list of URLs. */
520 free_urlpos (url_list);
521 /* Free the canonical this_url. */
522 FREE_MAYBE (canon_this_url);
523 /* Decrement the recursion depth. */
525 if (downloaded_exceeds_quota ())
532 register_download (const char *url, const char *file)
534 if (!opt.convert_links)
536 if (!dl_file_url_map)
537 dl_file_url_map = make_string_hash_table (0);
538 hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
539 if (!dl_url_file_map)
540 dl_url_file_map = make_string_hash_table (0);
541 hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
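/* Illustrative usage (the file name layout is just an example):
   after
     register_download ("http://host/a/index.html", "host/a/index.html");
   convert_all_links() can look the mapping up in either direction:
     hash_table_get (dl_file_url_map, "host/a/index.html");
     hash_table_get (dl_url_file_map, "http://host/a/index.html");  */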
545 register_html (const char *url, const char *file)
547 if (!opt.convert_links)
549 downloaded_html_files = slist_prepend (downloaded_html_files, file);
552 /* convert_links() is called from recursive_retrieve() after we're
553 done with an HTML file. This call to convert_links is not complete
554 because it converts only the downloaded files, and Wget cannot know
   which files will be downloaded afterwards.  So, if we have file
   fileone.html containing:
558 <a href="/c/something.gif">
560 and /c/something.gif was not downloaded because it exceeded the
561 recursion depth, the reference will *not* be changed.
563 However, later we can encounter /c/something.gif from an "upper"
564 level HTML (let's call it filetwo.html), and it gets downloaded.
566 But now we have a problem because /c/something.gif will be
567 correctly transformed in filetwo.html, but not in fileone.html,
568 since Wget could not have known that /c/something.gif will be
569 downloaded in the future.
571 This is why Wget must, after the whole retrieval, call
572 convert_all_links to go once more through the entire list of
573 retrieved HTMLs, and re-convert them.
   All the downloaded HTMLs are kept in downloaded_html_files, and
   the downloaded URLs in dl_url_file_map.  From these two
   collections the needed information is extracted.  */
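/* In outline (a paraphrase of the code below): for each downloaded
   HTML file, in download order, look up the URL it came from,
   re-extract its links, and mark each link either
   CO_CONVERT_TO_RELATIVE (its target was downloaded, according to
   dl_url_file_map) or, if it was not downloaded and is not already
   a complete URL, CO_CONVERT_TO_COMPLETE; then rewrite the file
   with convert_links().  */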
579 convert_all_links (void)
583 /* Destructively reverse downloaded_html_files to get it in the right order.
584 recursive_retrieve() used slist_prepend() consistently. */
585 downloaded_html_files = slist_nreverse (downloaded_html_files);
587 for (html = downloaded_html_files; html; html = html->next)
589 urlpos *urls, *cur_url;
592 DEBUGP (("Rescanning %s\n", html->string));
/* Determine the URL of the HTML file.  get_urls_html will need
   it.  */
595 url = hash_table_get (dl_file_url_map, html->string);
597 DEBUGP (("It should correspond to %s.\n", url));
599 DEBUGP (("I cannot find the corresponding URL.\n"));
600 /* Parse the HTML file... */
601 urls = get_urls_html (html->string, url, FALSE, NULL);
602 /* We don't respect meta_disallow_follow here because, even if
603 the file is not followed, we might still want to convert the
604 links that have been followed from other files. */
605 for (cur_url = urls; cur_url; cur_url = cur_url->next)
609 /* The URL must be in canonical form to be compared. */
610 struct urlinfo *u = newurl ();
611 uerr_t res = parseurl (cur_url->url, u, 0);
617 /* We decide the direction of conversion according to whether
618 a URL was downloaded. Downloaded URLs will be converted
619 ABS2REL, whereas non-downloaded will be converted REL2ABS. */
620 local_name = hash_table_get (dl_url_file_map, u->url);
622 DEBUGP (("%s marked for conversion, local %s\n",
623 u->url, local_name));
624 /* Decide on the conversion direction. */
627 /* We've downloaded this URL. Convert it to relative
628 form. We do this even if the URL already is in
629 relative form, because our directory structure may
630 not be identical to that on the server (think `-nd',
631 `--cut-dirs', etc.) */
632 cur_url->convert = CO_CONVERT_TO_RELATIVE;
633 cur_url->local_name = xstrdup (local_name);
/* We haven't downloaded this URL.  If it's not already
   complete (including a full host name), convert it to
   that form, so it can be reached while browsing this
   HTML locally.  */
641 if (!cur_url->link_complete_p)
642 cur_url->convert = CO_CONVERT_TO_COMPLETE;
643 cur_url->local_name = NULL;
647 /* Convert the links in the file. */
648 convert_links (html->string, urls);
654 /* Robots support. */
656 /* Construct the robots URL. */
657 static struct urlinfo *
658 robots_url (const char *url, const char *robots_filename)
660 struct urlinfo *u = newurl ();
663 err = parseurl (url, u, 0);
664 assert (err == URLOK && u->proto == URLHTTP);
668 u->dir = xstrdup ("");
669 u->file = xstrdup (robots_filename);
670 u->url = str_url (u, 0);
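/* Illustrative example: for url "http://www.server.com/a/b.html"
   the resulting struct describes "http://www.server.com/robots.txt",
   i.e. the directory is emptied and the file is replaced with
   robots_filename.  */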
674 /* Retrieves the robots_filename from the root server directory, if
675 possible. Returns ROBOTSOK if robots were retrieved OK, and
676 NOROBOTS if robots could not be retrieved for any reason. */
678 retrieve_robots (const char *url, const char *robots_filename)
684 u = robots_url (url, robots_filename);
685 logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
686 err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
694 /* Parse the robots_filename and return the disallowed path components
695 in a malloc-ed vector of character pointers.
697 It should be fully compliant with the syntax as described in the
698 file norobots.txt, adopted by the robots mailing list
699 (robots@webcrawler.com). */
701 parse_robots (const char *robots_filename)
705 char *line, *cmd, *str, *p;
706 char *base_version, *version;
708 int wget_matched; /* is the part meant for Wget? */
713 fp = fopen (robots_filename, "rb");
717 /* Kill version number. */
720 STRDUP_ALLOCA (base_version, opt.useragent);
721 STRDUP_ALLOCA (version, opt.useragent);
725 int len = 10 + strlen (version_string);
726 base_version = (char *)alloca (len);
727 sprintf (base_version, "Wget/%s", version_string);
728 version = (char *)alloca (len);
729 sprintf (version, "Wget/%s", version_string);
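/* The loops below lowercase the two strings; base_version is
   additionally truncated at the first '/'.  Illustrative values,
   assuming the stock user agent: version = "wget/1.7",
   base_version = "wget".  User-agent lines from robots.txt are
   later compared against one or the other (see the matching code
   below).  */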
731 for (p = version; *p; p++)
733 for (p = base_version; *p && *p != '/'; p++)
737 /* Setting this to 1 means that Wget considers itself under
738 restrictions by default, even if the User-Agent field is not
739 present. However, if it finds the user-agent set to anything
740 other than Wget, the rest will be ignored (up to the following
741 User-Agent field). Thus you may have something like:
745 User-Agent: stupid-robot
   In this case the 1, 2, 5, 6 and 7 disallow lines will be
   honored.  */
757 while ((line = read_whole_line (fp)))
759 int len = strlen (line);
760 /* Destroy <CR><LF> if present. */
761 if (len && line[len - 1] == '\n')
763 if (len && line[len - 1] == '\r')
/* According to specifications, optional space may be at the
   end of the line.  */
767 DEBUGP (("Line: %s\n", line));
769 for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
773 DEBUGP (("(chucked out)\n"));
777 for (str = cmd; *str && *str != ':'; str++);
781 DEBUGP (("(chucked out)\n"));
784 /* Zero-terminate the command. */
786 /* Look for the string beginning... */
787 for (; *str && ISSPACE (*str); str++);
788 /* Look for comments or trailing spaces and kill them off. */
789 for (p = str; *p; p++)
790 if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
792 /* We have found either a shell-style comment `<sp>+#' or some
793 trailing spaces. Now rewind to the beginning of the spaces
794 and place '\0' there. */
795 while (p > str && ISSPACE (*p))
803 if (!strcasecmp (cmd, "User-agent"))
806 /* Lowercase the agent string. */
807 for (p = str; *p; p++)
809 /* If the string is `*', it matches. */
810 if (*str == '*' && !*(str + 1))
/* If the string contains wildcards, we'll run it through
   fnmatch().  */
816 if (has_wildcards_p (str))
818 /* If the string contains '/', compare with the full
819 version. Else, compare it to base_version. */
820 if (strchr (str, '/'))
821 match = !fnmatch (str, version, 0);
823 match = !fnmatch (str, base_version, 0);
825 else /* Substring search */
827 if (strstr (version, str))
833 /* If Wget is not matched, skip all the entries up to the
834 next User-agent field. */
835 wget_matched = match;
837 else if (!wget_matched)
840 DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
843 else if (!strcasecmp (cmd, "Disallow"))
845 /* If "Disallow" is empty, the robot is welcome. */
849 entries = (char **)xmalloc (sizeof (char *));
/* Strip trailing spaces from the path, according to
   specifications, before storing it.  */
for (i = strlen (str) - 1; i >= 0 && ISSPACE (str[i]); i--)
str[i] = '\0';
entries = (char **)xrealloc (entries, (num + 2) * sizeof (char *));
entries[num] = xstrdup (str);
entries[++num] = NULL;
866 /* unknown command */
867 DEBUGP (("(chucked out)\n"));
/* May the URL in U be loaded according to the disallowing rules
   stored in FB?  */
robots_match (struct urlinfo *u, char **fb)
884 DEBUGP (("Matching %s against: ", u->path));
887 DEBUGP (("%s ", *fb));
/* If u->path begins with *fb, we may not load the file.  */
890 if (strncmp (u->path, *fb, l) == 0)
892 DEBUGP (("matched.\n"));
893 return 0; /* Matches, i.e. does not load... */
896 DEBUGP (("not matched.\n"));