/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#include <config.h>

#include <stdio.h>
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>
#include <sys/types.h>

/* Project-local headers providing the interfaces used below (hash
   tables, URL parsing, retrieval, robots handling). */
#include "wget.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
#include "retr.h"
#include "hash.h"
#include "res.h"
extern char *version_string;
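
/* The maps below are used for link conversion: dl_file_url_map maps
   each downloaded file name to the URL it came from, and
   dl_url_file_map maps each downloaded URL to its local file.  Both
   are populated by register_download() and register_redirection()
   below. */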
static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  This list should only be traversed
   in order.  If you need to check whether a file has been downloaded,
   use a hash table, e.g. dl_file_url_map.  */
static slist *downloaded_html_files;
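
/* For instance, a membership test should go against the hash table,
   not the list -- a sketch, assuming FILE names a finished download:

     if (hash_table_contains (dl_file_url_map, file))
       ... FILE was downloaded in this run ...

   Walking downloaded_html_files for this would cost O(n) per
   lookup. */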

/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;		/* the URL to download */
  const char *referer;		/* the referring document */
  int depth;			/* the depth at which the URL was found */
  struct queue_element *next;	/* next element in the queue */
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;		/* current and peak queue length */
};

/* Create a URL queue. */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xmalloc (sizeof (*queue));
  memset (queue, '\0', sizeof (*queue));
  return queue;
}

/* Delete a URL queue. */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}

/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth)
{
  struct queue_element *qel = xmalloc (sizeof (*qel));
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}

/* Take a URL out of the queue.  Return 1 if this operation succeeded,
   or 0 if the queue is empty.  */

static int
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return 0;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return 1;
}
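
/* A minimal usage sketch of the two primitives above, compiled out by
   default (the URL_QUEUE_EXAMPLE guard and the URLs are made up for
   illustration).  Ownership of the URL strings passes to the queue
   and back to the caller, which is why the dequeued string is freed
   here. */
#ifdef URL_QUEUE_EXAMPLE
static void
url_queue_example (void)
{
  struct url_queue *q = url_queue_new ();
  const char *url, *referer;
  int depth;

  url_enqueue (q, xstrdup ("http://example.com/a.html"), NULL, 0);
  url_enqueue (q, xstrdup ("http://example.com/b.html"), NULL, 1);

  /* FIFO order: "a.html" comes out before "b.html".  The referers
     are NULL here, so only the URL strings need freeing. */
  while (url_dequeue (q, &url, &referer, &depth))
    xfree ((char *) url);

  url_queue_delete (q);
}
#endif /* URL_QUEUE_EXAMPLE */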

static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
				  struct url *, struct hash_table *));
static int descend_redirect_p PARAMS ((const char *, const char *, int,
				       struct url *, struct hash_table *));

/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree on the
   other hand implements breadth-first traversal of the tree, which
   results in much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

   3. get next URL from the queue.
   4. download it.
   5. if the URL is HTML and its depth does not exceed maximum depth,
      get the list of URLs embedded therein.
   6. for each of those URLs do the following:

   7. if the URL is not one of those downloaded before, and if it
      satisfies the criteria specified by the various command-line
      options, add it to the queue. */

uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue = url_queue_new ();

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist = make_string_hash_table (0);

  /* We'll need various components of this, so better get it over with
     now. */
  struct url *start_url_parsed = url_parse (start_url, NULL);

  url_enqueue (queue, xstrdup (start_url), NULL, 0);
  string_set_add (blacklist, start_url);

  while (1)
    {
      int descend = 0;
      char *url, *referer, *file = NULL;
      int depth;
      boolean dash_p_leaf_HTML = FALSE;

      if (downloaded_exceeds_quota ())
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue. */

      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth))
        break;

      /* And download it.  retrieve_url is called with recursion
         turned off, because retrieve_tree itself is in charge of
         descending into links. */

      {
        int dt = 0;
        char *redirected = NULL;
        int oldrec = opt.recursive;

        opt.recursive = 0;
        status = retrieve_url (url, &file, &redirected, NULL, &dt);
        opt.recursive = oldrec;

        if (file && status == RETROK
            && (dt & RETROKF) && (dt & TEXTHTML))
          descend = 1;

        if (redirected)
          {
            /* We have been redirected, possibly to another host, or
               different path, or wherever.  Check whether we really
               want to follow it. */
            if (descend)
              {
                if (!descend_redirect_p (redirected, url, depth,
                                         start_url_parsed, blacklist))
                  descend = 0;
                else
                  /* Make sure that the old pre-redirect form gets
                     blacklisted. */
                  string_set_add (blacklist, url);
              }

            xfree (url);
            url = redirected;
          }
      }

      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
          if (opt.page_requisites
              && (depth == opt.reclevel || depth == opt.reclevel + 1))
            {
              /* When -p is specified, we are allowed to exceed the
                 maximum depth, but only for the "inline" links,
                 i.e. those that are needed to display the page.
                 Originally this could exceed the depth at most by
                 one, but we allow one more level so that the leaf
                 pages that contain frames can be loaded
                 correctly. */
              dash_p_leaf_HTML = TRUE;
            }
          else
            {
              /* Either -p wasn't specified or it was and we've
                 already spent the two extra (pseudo-)levels that it
                 affords us, so we need to bail out. */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
              descend = 0;
            }
        }
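
      /* For example, with `-l 2 -p' a page found at depth 2 is a
         leaf, but its inline requisites (images, stylesheets) are
         still fetched at depth 3, and a frame page among them may
         fetch its own requisites at depth 4; only then does the
         cutoff above apply.  (The option names are real; the depth
         values are just an illustration.) */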

      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains. */

      if (descend)
        {
          int meta_disallow_follow = 0;
          struct urlpos *children
            = get_urls_html (file, url, &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
              /* The "robots" META tag forbids following this page's
                 links, so drop them. */
              free_urlpos (children);
              children = NULL;
            }

          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL);
              assert (url_parsed != NULL);

              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (dash_p_leaf_HTML && !child->link_inline_p)
                    continue;
                  if (descend_url_p (child, url_parsed, depth, start_url_parsed,
                                     blacklist))
                    {
                      url_enqueue (queue, xstrdup (child->url->url),
                                   xstrdup (url), depth + 1);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice. */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              url_free (url_parsed);
              free_urlpos (children);
            }
        }

      if (opt.delete_after || (file && !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             otherwise rejected (e.g. by -R) HTML file just so we
             could harvest its hyperlinks -- in either case, delete
             the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   "recursive rejection criteria"));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
        }

      xfree (url);
      FREE_MAYBE (referer);
      FREE_MAYBE (file);
    }

  /* If anything is left of the queue due to a premature exit, free it
     now. */
  {
    char *d1, *d2;
    int d3;
    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
      {
        xfree (d1);
        FREE_MAYBE (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}

/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */

static int
descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
               struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      DEBUGP (("Already on the black list.\n"));
      goto out;
    }

  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
        gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory- consuming tests should be put later on
     the list.  */

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (u->scheme != SCHEME_HTTP
      && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out. */
  if (u->scheme == SCHEME_HTTP)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out. */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for documents needed to display
     the parent page when in -p mode. */
  if (opt.no_parent
      && u->scheme == start_url_parsed->scheme
      && 0 == strcasecmp (u->host, start_url_parsed->host)
      && u->port == start_url_parsed->port
      && !(opt.page_requisites && upos->link_inline_p))
    {
      if (!frontcmp (start_url_parsed->dir, u->dir))
        {
          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                   u->dir, start_url_parsed->dir));
          goto out;
        }
    }
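
  /* E.g. with --no-parent and a start URL of
     http://example.com/a/b/index.html (a made-up address), a link to
     http://example.com/a/b/c/x.gif stays inside "a/b" and passes,
     while http://example.com/a/d/x.gif escapes it and is rejected. */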

  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists. */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir, ALLABS))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }

  /* 6. Check for acceptance/rejection rules.  We ignore these rules
     for HTML documents because they might lead to other files which
     need to be downloaded.  Of course, we don't know which
     documents are HTML before downloading them, so we guess.

     A file is subject to acceptance/rejection rules if:

     * u->file is not "" (i.e. it is not a directory)
     and either:
       + there is no file suffix,
       + or there is a suffix, but is not "html" or "htm",
       + both:
         - recursion is not infinite,
         - and we are at its very end. */
  {
    const char *suf;
    if (u->file[0] != '\0'
        && ((suf = suffix (url)) == NULL
            || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
            || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
      {
        if (!acceptable (u->file))
          {
            DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                     url, u->file));
            goto out;
          }
      }
  }
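
  /* E.g. with `-A "*.pdf"', a link to report.pdf passes the test and
     a link to notes.txt fails it, but chapter.html is still fetched
     (and deleted later if rejected) because it may contain further
     links; the file names here are invented for illustration. */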

  /* 7. If spanning hosts is disabled, reject URLs on a different
     host. */
  if (u->scheme == parent->scheme)
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }

  /* 8. Consult robots.txt, fetching and caching its specs per
     host/port on first use. */
  if (opt.use_robots && u->scheme == SCHEME_HTTP)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile))
            {
              specs = res_parse_from_file (rfile);
              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them. */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say. */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }

  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  return 1;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return 0;
}

/* This function determines whether we should descend the children of
   the URL whose download resulted in a redirection, possibly to
   another host, etc.  It is needed very rarely, and thus it is merely
   a simple-minded wrapper around descend_url_p.  */

static int
descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
  int success;

  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);

  upos = xmalloc (sizeof (struct urlpos));
  memset (upos, 0, sizeof (*upos));
  upos->url = new_parsed;

  success = descend_url_p (upos, orig_parsed, depth,
                           start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);
  xfree (upos);

  if (!success)
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  return success;
}

/* Register that URL has been successfully downloaded to FILE. */

void
register_download (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  if (!dl_file_url_map)
    dl_file_url_map = make_string_hash_table (0);
  if (!dl_url_file_map)
    dl_url_file_map = make_string_hash_table (0);

  if (!hash_table_contains (dl_file_url_map, file))
    hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
  if (!hash_table_contains (dl_url_file_map, url))
    hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}
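
/* E.g. after register_download ("http://example.com/", "index.html")
   (a made-up pair), the maps hold:

     dl_file_url_map:  "index.html" -> "http://example.com/"
     dl_url_file_map:  "http://example.com/" -> "index.html"  */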

/* Register that FROM has been redirected to TO.  This assumes that TO
   is successfully downloaded and already registered using
   register_download() above. */

void
register_redirection (const char *from, const char *to)
{
  char *file;

  if (!opt.convert_links)
    return;

  file = hash_table_get (dl_url_file_map, to);
  assert (file != NULL);
  if (!hash_table_contains (dl_url_file_map, from))
    hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
}
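
/* Continuing the example above: if http://example.com/old redirected
   to http://example.com/, both URLs now map to "index.html", so links
   written against either form get converted to the same local file. */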

/* Register that URL corresponds to the HTML file FILE. */

void
register_html (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  downloaded_html_files = slist_prepend (downloaded_html_files, file);
}

/* This function is called when the retrieval is done to convert the
   links that have been downloaded.  It has to be called at the end of
   the retrieval, because only then does Wget know conclusively which
   URLs have been downloaded, and which not, so it can tell which
   direction to convert to.

   The "direction" means that the URLs to the files that have been
   downloaded get converted to the relative URL which will point to
   that file.  And the other URLs get converted to the remote URL on
   the server.

   All the downloaded HTMLs are kept in downloaded_html_files, and
   downloaded URLs in urls_downloaded.  All the information is
   extracted from these two lists. */

void
convert_all_links (void)
{
  slist *html;
  struct wget_timer *timer;
  long msecs;
  int file_count = 0;

  timer = wtimer_new ();

  /* Destructively reverse downloaded_html_files to get it in the right order.
     register_html() above used slist_prepend() consistently. */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      struct urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));

      /* Determine the URL of the HTML file.  get_urls_html will need
         it. */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
        DEBUGP (("It should correspond to %s.\n", url));
      else
        DEBUGP (("I cannot find the corresponding URL.\n"));

      /* Parse the HTML file... */
      urls = get_urls_html (html->string, url, NULL);

      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files. */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;
          struct url *u = cur_url->url;

          if (cur_url->link_base_p)
            {
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?)  */
              cur_url->convert = CO_NULLIFY_BASE;
              continue;
            }

          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS. */
          local_name = hash_table_get (dl_url_file_map, u->url);
          if (local_name)
            DEBUGP (("%s marked for conversion, local %s\n",
                     u->url, local_name));

          /* Decide on the conversion type. */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally. */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
            }
        }
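
      /* E.g. a link to http://example.com/pics/a.gif (an invented
         URL) becomes something like "pics/a.gif" if that file was
         downloaded in this run, and is left as, or expanded to, the
         full absolute URL if it was not. */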

      /* Convert the links in the file. */
      convert_links (html->string, urls);
      ++file_count;

      free_urlpos (urls);
    }

  msecs = wtimer_elapsed (timer);
  wtimer_delete (timer);
  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
             file_count, (double)msecs / 1000);
}

/* Cleanup the data structures associated with recursive retrieving
   (the variables above). */

void
recursive_cleanup (void)
{
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
}