/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

   This file is part of GNU Wget.

   GNU Wget is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   GNU Wget is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with Wget; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <sys/types.h>
extern char *version_string;

static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  This list should only be traversed
   in order.  If you need to check whether a file has been downloaded,
   use a hash table, e.g. dl_file_url_map.  */
static slist *downloaded_html_files;

static void register_delete_file PARAMS ((const char *));
/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;
  const char *referer;
  int depth;
  struct queue_element *next;
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};
/* Create a URL queue.  */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xmalloc (sizeof (*queue));
  memset (queue, '\0', sizeof (*queue));
  return queue;
}

/* Delete a URL queue.  */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}
/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth)
{
  struct queue_element *qel = xmalloc (sizeof (*qel));
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}
/* Take a URL out of the queue.  Return 1 if this operation succeeded,
   or 0 if the queue is empty.  */

static int
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return 0;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return 1;
}
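
/* Illustrative sketch (not part of the original source; compiled only
   if the hypothetical RECUR_QUEUE_EXAMPLE guard is defined).  It shows
   the intended FIFO behavior of the queue API above: URLs come back
   out of url_dequeue in the order they were passed to url_enqueue.
   The example URLs are made up.  */
#ifdef RECUR_QUEUE_EXAMPLE
static void
queue_example (void)
{
  struct url_queue *q = url_queue_new ();
  const char *url, *referer;
  int depth;

  url_enqueue (q, xstrdup ("http://example.com/a"), NULL, 0);
  url_enqueue (q, xstrdup ("http://example.com/b"), NULL, 1);

  /* Dequeues "http://example.com/a" first, then "http://example.com/b". */
  while (url_dequeue (q, &url, &referer, &depth))
    {
      DEBUGP (("got %s at depth %d\n", url, depth));
      xfree ((char *) url);
      FREE_MAYBE ((char *) referer);
    }
  url_queue_delete (q);
}
#endif /* RECUR_QUEUE_EXAMPLE */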
static int download_child_p PARAMS ((const struct urlpos *, struct url *, int,
				     struct url *, struct hash_table *));
static int descend_redirect_p PARAMS ((const char *, const char *, int,
				       struct url *, struct hash_table *));
/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree on the
   other hand implements breadth-first traversal of the tree, which
   results in much nicer ordering of downloads; a short illustration of
   that ordering follows this comment.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue.  */
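
/* Illustration (added for exposition, not from the original source):
   suppose page A links to B and C, and B links to D.  Breadth-first
   traversal downloads A, B, C, D -- everything at depth 1 before
   anything at depth 2 -- whereas the old depth-first code would have
   fetched A, B, D, C, descending completely into B before ever
   reaching C.  */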
uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load.  */
  struct url_queue *queue = url_queue_new ();

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist = make_string_hash_table (0);

  /* We'll need various components of this, so better get it over with
     now.  */
  struct url *start_url_parsed = url_parse (start_url, NULL);

  url_enqueue (queue, xstrdup (start_url), NULL, 0);
  string_set_add (blacklist, start_url);
  while (1)
    {
      int descend = 0;
      char *url, *referer, *file = NULL;
      int depth;
      boolean dash_p_leaf_HTML = FALSE;

      if (downloaded_exceeds_quota ())
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue...  */

      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth))
        break;

      /* ...and download it.  Note that this download is in most cases
         unconditional, as download_child_p already makes sure a file
         doesn't get enqueued twice -- and yet this check is here, and
         not in download_child_p.  This is so that if you run `wget -r
         URL1 URL2', and a random URL is encountered once under URL1
         and again under URL2, but at a different (possibly smaller)
         depth, we want the URL's children to be taken into account
         the second time.  */
      if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
        {
          DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
                   url, (char *)hash_table_get (dl_url_file_map, url)));
        }
      else
        {
          int dt = 0;
          char *redirected = NULL;
          int oldrec = opt.recursive;

          opt.recursive = 0;
          status = retrieve_url (url, &file, &redirected, NULL, &dt);
          opt.recursive = oldrec;

          if (file && status == RETROK
              && (dt & RETROKF) && (dt & TEXTHTML))
            descend = 1;

          if (redirected)
            {
              /* We have been redirected, possibly to another host, or
                 different path, or wherever.  Check whether we really
                 want to follow it.  */
              if (!descend_redirect_p (redirected, url, depth,
                                       start_url_parsed, blacklist))
                descend = 0;
              else
                /* Make sure that the old pre-redirect form gets
                   blacklisted.  */
                string_set_add (blacklist, url);
            }
        }
      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
          if (opt.page_requisites
              && (depth == opt.reclevel || depth == opt.reclevel + 1))
            {
              /* When -p is specified, we are allowed to exceed the
                 maximum depth, but only for the "inline" links,
                 i.e. those that are needed to display the page.
                 Originally this could exceed the depth at most by
                 one, but we allow one more level so that the leaf
                 pages that contain frames can be loaded
                 correctly.  */
              dash_p_leaf_HTML = TRUE;
            }
          else
            {
              /* Either -p wasn't specified or it was and we've
                 already spent the two extra (pseudo-)levels that it
                 affords us, so we need to bail out.  */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
              descend = 0;
            }
        }
      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains.  */

      if (descend)
        {
          int meta_disallow_follow = 0;
          struct urlpos *children
            = get_urls_html (file, url, &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
              free_urlpos (children);
              children = NULL;
            }

          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL);
              assert (url_parsed != NULL);

              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (dash_p_leaf_HTML && !child->link_inline_p)
                    continue;
                  if (download_child_p (child, url_parsed, depth, start_url_parsed,
                                        blacklist))
                    {
                      url_enqueue (queue, xstrdup (child->url->url),
                                   xstrdup (url), depth + 1);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice.  */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              url_free (url_parsed);
              free_urlpos (children);
            }
        }
      if (opt.delete_after || (file && !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             otherwise rejected (e.g. by -R) HTML file just so we
             could harvest its hyperlinks -- in either case, delete
             the local file.  */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   "recursive rejection criteria"));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
          register_delete_file (file);
        }

      FREE_MAYBE (referer);
  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3;
    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
      {
        xfree (d1);
        FREE_MAYBE (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}
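
/* Illustrative sketch (not part of the original source; compiled only
   if the hypothetical RECUR_TREE_EXAMPLE guard is defined): the whole
   breadth-first machinery above is driven through this single entry
   point.  The start URL is made up.  */
#ifdef RECUR_TREE_EXAMPLE
static void
retrieve_tree_example (void)
{
  uerr_t status = retrieve_tree ("http://example.com/");
  if (status != RETROK)
    DEBUGP (("Recursive retrieval did not finish cleanly.\n"));
}
#endif /* RECUR_TREE_EXAMPLE */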
/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */
static int
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
                  struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      DEBUGP (("Already on the black list.\n"));
      goto out;
    }
  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
        gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory- consuming tests should be put later on
     the list.  */
  /* 1. Schemes other than HTTP are normally not recursed into.  */
  if (u->scheme != SCHEME_HTTP
      && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out.  */
  if (u->scheme == SCHEME_HTTP)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }
  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out.  */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for documents needed to display
     the parent page when in -p mode.  */
  if (opt.no_parent
      && u->scheme == start_url_parsed->scheme
      && 0 == strcasecmp (u->host, start_url_parsed->host)
      && u->port == start_url_parsed->port
      && !(opt.page_requisites && upos->link_inline_p))
    {
      if (!frontcmp (start_url_parsed->dir, u->dir))
        {
          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                   u->dir, start_url_parsed->dir));
          goto out;
        }
    }
  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists.  */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir, ALLABS))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }
  {
    char *suf;

    /* Check for acceptance/rejection rules.  We ignore these rules
       for HTML documents because they might lead to other files which
       need to be downloaded.  Of course, we don't know which
       documents are HTML before downloading them, so we guess.

       A file is subject to acceptance/rejection rules if:

       * u->file is not "" (i.e. it is not a directory)
       and either:
         + there is no file suffix,
         + or there is a suffix, but it is not "html" or "htm",
         + both:
           - recursion is not infinite,
           - and we are at its very end.  */

    if (u->file[0] != '\0'
        && ((suf = suffix (url)) == NULL
            || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
            || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
      {
        if (!acceptable (u->file))
          {
            DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                     url, u->file));
            goto out;
          }
      }
  }
  if (u->scheme == parent->scheme)
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }
  if (opt.use_robots && u->scheme == SCHEME_HTTP)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile))
            {
              specs = res_parse_from_file (rfile);
              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them.  */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say.  */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }
  /* The URL has passed all the tests.  It can be placed in the
     download queue.  */
  DEBUGP (("Decided to load it.\n"));

  return 1;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return 0;
}
/* This function determines whether we will consider downloading the
   children of a URL whose download resulted in a redirection,
   possibly to another host, etc.  It is needed very rarely, and thus
   it is merely a simple-minded wrapper around download_child_p.  */
static int
descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
  int success;

  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);

  upos = xmalloc (sizeof (struct urlpos));
  memset (upos, 0, sizeof (*upos));
  upos->url = new_parsed;

  success = download_child_p (upos, orig_parsed, depth,
                              start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);
  xfree (upos);

  if (!success)
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  return success;
}
#define ENSURE_TABLES_EXIST do {			\
  if (!dl_file_url_map)					\
    dl_file_url_map = make_string_hash_table (0);	\
  if (!dl_url_file_map)					\
    dl_url_file_map = make_string_hash_table (0);	\
} while (0)
/* Return 1 if S1 and S2 are the same, except for "/index.html".  The
   cases in which it returns 1 are (substitute any substring for "foo"):

   m("foo/index.html", "foo/")  ==> 1
   m("foo/", "foo/index.html")  ==> 1
   m("foo", "foo/index.html")   ==> 1
   m("foo", "foo/")             ==> 1
   m("foo", "foo")              ==> 1  */
static int
match_except_index (const char *s1, const char *s2)
{
  int i;
  const char *lng;

  /* Skip common substring. */
  for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
    ;
  if (i == 0)
    /* Strings differ at the very beginning -- bail out.  We need to
       check this explicitly to avoid `lng - 1' reading outside the
       array.  */
    return 0;

  if (!*s1 && !*s2)
    /* Both strings hit EOF -- strings are equal. */
    return 1;
  else if (*s1 && *s2)
    /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
    return 0;
  else if (*s1)
    /* S1 is the longer one. */
    lng = s1;
  else
    /* S2 is the longer one. */
    lng = s2;

  /* foo/index.html */ /* or */ /* foo/index.html */

  /* The right-hand case. */
  if (*lng == '/' && *(lng + 1) == '\0')
    return 1;

  /* The shorter string ended right after a '/', so LNG points just
     past it.  */
  if (*(lng - 1) == '/')
    return 0 == strcmp (lng, "index.html");

  return 0 == strcmp (lng, "/index.html");
}
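
/* Illustrative checks (not part of the original source; compiled only
   if the hypothetical RECUR_MATCH_EXAMPLE guard is defined), restating
   the behavior documented above match_except_index.  The URLs are made
   up.  */
#ifdef RECUR_MATCH_EXAMPLE
static void
match_except_index_example (void)
{
  /* Identical strings match. */
  assert (match_except_index ("foo", "foo"));
  /* Differing only by a trailing "/index.html" also matches. */
  assert (match_except_index ("http://host/dir/", "http://host/dir/index.html"));
  /* Genuinely different paths do not. */
  assert (!match_except_index ("http://host/a.html", "http://host/b.html"));
}
#endif /* RECUR_MATCH_EXAMPLE */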
static int
dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
{
  char *mapping_url = (char *)key;
  char *mapping_file = (char *)value;
  char *file = (char *)arg;

  if (0 == strcmp (mapping_file, file))
    {
      hash_table_remove (dl_url_file_map, mapping_url);
      xfree (mapping_url);
      xfree (mapping_file);
    }

  /* Continue mapping. */
  return 0;
}

/* Remove all associations from various URLs to FILE from dl_url_file_map.  */

static void
dissociate_urls_from_file (const char *file)
{
  hash_table_map (dl_url_file_map, dissociate_urls_from_file_mapper,
                  (void *)file);
}
/* Register that URL has been successfully downloaded to FILE.  This
   is used by the link conversion code to convert references to URLs
   to references to local files.  It is also used to check whether a
   URL has already been downloaded.  */
void
register_download (const char *url, const char *file)
{
  char *old_file, *old_url;

  ENSURE_TABLES_EXIST;

  /* With some forms of retrieval it is possible, although not likely
     or particularly desirable, for two different URLs to end up in the
     same local file.  If both are downloaded, the second
     download will override the first one.  When that happens,
     dissociate the old file name from the URL.  */

  if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
    {
      if (0 == strcmp (url, old_url))
        /* We have somehow managed to download the same URL twice.
           Nothing to do.  */
        return;

      if (match_except_index (url, old_url)
          && !hash_table_contains (dl_url_file_map, url))
        /* The two URLs differ only in the "index.html" ending.  For
           example, one is "http://www.server.com/", and the other is
           "http://www.server.com/index.html".  Don't remove the old
           one, just add the new one as a non-canonical entry.  */
        goto url_only;

      hash_table_remove (dl_file_url_map, file);

      /* Remove all the URLs that point to this file.  Yes, there can
         be more than one such URL, because we store redirections as
         multiple entries in dl_url_file_map.  For example, if URL1
         redirects to URL2 which gets downloaded to FILE, we map both
         URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
         only points to URL2.)  When another URL gets loaded to FILE,
         we want both URL1 and URL2 dissociated from it.

         This is a relatively expensive operation because it performs
         a linear search of the whole hash table, but it should be
         called very rarely, only when two URLs resolve to the same
         file name, *and* the "<file>.1" extensions are turned off.
         In other words, almost never.  */
      dissociate_urls_from_file (file);
    }

  hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 url_only:
  /* A URL->FILE mapping is not possible without a FILE->URL mapping.
     If the latter were present, it should have been removed by the
     above `if'.  So we could write:

       assert (!hash_table_contains (dl_url_file_map, url));

     The above is correct when running in recursive mode where the
     same URL always resolves to the same file.  But if you do
     something like:

       wget URL URL

     then the first URL will resolve to "FILE", and the other to
     "FILE.1".  In that case, FILE.1 will not be found in
     dl_file_url_map, but URL will still point to FILE in
     dl_url_file_map.  */
  if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
    {
      hash_table_remove (dl_url_file_map, url);
      xfree (old_url);
      xfree (old_file);
    }

  hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}
/* Register that FROM has been redirected to TO.  This assumes that TO
   is successfully downloaded and already registered using
   register_download() above.  */

void
register_redirection (const char *from, const char *to)
{
  char *file;

  ENSURE_TABLES_EXIST;

  file = hash_table_get (dl_url_file_map, to);
  assert (file != NULL);
  if (!hash_table_contains (dl_url_file_map, from))
    hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
}
/* Register that the file has been deleted.  */

static void
register_delete_file (const char *file)
{
  char *old_url, *old_file;

  ENSURE_TABLES_EXIST;

  if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
    return;

  hash_table_remove (dl_file_url_map, file);
  xfree (old_url);
  xfree (old_file);
  dissociate_urls_from_file (file);
}
/* Register that FILE is an HTML file that has been downloaded.  */

void
register_html (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  downloaded_html_files = slist_prepend (downloaded_html_files, file);
}
/* This function is called when the retrieval is done to convert the
   links that have been downloaded.  It has to be called at the end of
   the retrieval, because only then does Wget know conclusively which
   URLs have been downloaded, and which not, so it can tell which
   direction to convert to.

   The "direction" means that the URLs to the files that have been
   downloaded get converted to the relative URL which will point to
   that file, while the other URLs get converted to complete URLs
   pointing back to the remote server.
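
   (Illustration added for exposition: if http://host/dir/page.html was
   saved as host/dir/page.html, a link in it to http://host/dir/pic.png
   that was also downloaded becomes the relative "pic.png", while a
   link to a page that was not downloaded is rewritten as a complete
   absolute URL so it still works when the page is browsed locally.)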
   All the downloaded HTML files are kept in downloaded_html_files, and
   the downloaded URLs in the dl_file_url_map and dl_url_file_map hash
   tables.  All the information is extracted from those structures.  */
void
convert_all_links (void)
{
  slist *html;
  long msecs;
  int file_count = 0;

  struct wget_timer *timer = wtimer_new ();
  struct hash_table *seen = make_string_hash_table (0);

  /* Destructively reverse downloaded_html_files to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently.  */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      struct urlpos *urls, *cur_url;
      char *url;
      char *file = html->string;

      /* Guard against duplicates. */
      if (string_set_contains (seen, file))
        continue;
      string_set_add (seen, file);

      /* Determine the URL of the HTML file.  get_urls_html will need
         it.  */
      url = hash_table_get (dl_file_url_map, file);
      if (!url)
        {
          DEBUGP (("Apparently %s has been removed.\n", file));
          continue;
        }

      DEBUGP (("Scanning %s (from %s)\n", file, url));

      /* Parse the HTML file...  */
      urls = get_urls_html (file, url, NULL);
      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;
          struct url *u = cur_url->url;

          if (cur_url->link_base_p)
            {
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?)  */
              cur_url->convert = CO_NULLIFY_BASE;
              continue;
            }
          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
          local_name = hash_table_get (dl_url_file_map, u->url);

          /* Decide on the conversion type.  */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
              DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so that it can be reached while browsing
                 this HTML file locally.  */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
              DEBUGP (("will convert url %s to complete\n", u->url));
            }
        }
      /* Convert the links in the file.  */
      convert_links (file, urls);

      /* Free the data.  */
      free_urlpos (urls);

      ++file_count;
    }

  msecs = wtimer_elapsed (timer);
  wtimer_delete (timer);
  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
             file_count, (double)msecs / 1000);

  string_set_free (seen);
}
/* Clean up the data structures associated with recursive retrieving
   (the variables above).  */

void
recursive_cleanup (void)
{
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
}
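
/* Illustrative sketch (not part of the original source; compiled only
   if the hypothetical RECUR_REGISTER_EXAMPLE guard is defined).  It
   shows how the registration calls above cooperate: register_download
   records the URL<->file mappings, register_redirection adds an extra
   URL->file entry for the pre-redirect URL, and register_html queues
   the file for link conversion.  The URLs and file name are made up.  */
#ifdef RECUR_REGISTER_EXAMPLE
static void
register_example (void)
{
  register_download ("http://example.com/index.html", "example.com/index.html");
  /* "http://example.com/" redirected to the URL registered above.  */
  register_redirection ("http://example.com/", "http://example.com/index.html");
  /* Remember that the file is HTML so convert_all_links picks it up.  */
  register_html ("http://example.com/index.html", "example.com/index.html");

  /* After retrieval, rewrite the links in all registered HTML files
     and release the bookkeeping structures.  */
  convert_all_links ();
  recursive_cleanup ();
}
#endif /* RECUR_REGISTER_EXAMPLE */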