/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

   This file is part of GNU Wget.

   GNU Wget is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   GNU Wget is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with Wget; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <sys/types.h>
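/* Note (editorial): the surviving #include lines above are only a
   fragment of the original header block.  The code below also needs
   Wget's own declarations (options, hash tables, URL parsing, robots
   handling, retrieval codes).  The following set is an assumption
   based on the identifiers used in this file, not a verbatim copy:  */
#include "wget.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
#include "retr.h"
#include "host.h"
#include "hash.h"
#include "res.h"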
extern char *version_string;

static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  This list should only be traversed
   in order.  If you need to check whether a file has been downloaded,
   use a hash table, e.g. dl_file_url_map.  */
static slist *downloaded_html_files;
/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;
  const char *referer;
  int depth;
  struct queue_element *next;
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};

/* Create a URL queue.  */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xmalloc (sizeof (*queue));
  memset (queue, '\0', sizeof (*queue));
  return queue;
}

/* Delete a URL queue.  */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}
/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth)
{
  struct queue_element *qel = xmalloc (sizeof (*qel));
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}
/* Take a URL out of the queue.  Return 1 if this operation succeeded,
   or 0 if the queue is empty.  */

static int
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return 0;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return 1;
}
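/* Ownership note (editorial addition): the queue stores the caller's
   URL and referer pointers without copying them, and url_dequeue hands
   those same pointers back while freeing only the queue node itself.
   retrieve_tree below therefore enqueues xstrdup'ed strings and frees
   them once it is done with the dequeued item.  */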
static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
                                  struct url *, struct hash_table *));
static int descend_redirect_p PARAMS ((const char *, const char *, int,
                                       struct url *, struct hash_table *));
/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree, on the
   other hand, implements breadth-first traversal of the tree, which
   results in much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue.  */
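/* Editorial note: breadth-first ordering is what makes `--level=N'
   mean "at most N link hops away from the start URL" -- every URL at
   depth D is dequeued (and thus downloaded) before any URL at depth
   D+1, because children are always appended to the tail of the FIFO
   queue while items are taken from its head.  */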
uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue = url_queue_new ();

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist = make_string_hash_table (0);

  /* We'll need various components of this, so better get it over with
     now.  */
  struct url *start_url_parsed = url_parse (start_url, NULL);

  url_enqueue (queue, xstrdup (start_url), NULL, 0);
  string_set_add (blacklist, start_url);

  while (1)
    {
      int descend = 0;
      char *url, *referer, *file = NULL;
      int depth;
      boolean dash_p_leaf_HTML = FALSE;

      if (downloaded_exceeds_quota ())
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue. */
      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth))
        break;

      /* And download it. */
      {
        int dt = 0;
        char *redirected = NULL;
        int oldrec = opt.recursive;

        opt.recursive = 0;
        status = retrieve_url (url, &file, &redirected, NULL, &dt);
        opt.recursive = oldrec;

        if (file && status == RETROK
            && (dt & RETROKF) && (dt & TEXTHTML))
          descend = 1;

        if (redirected)
          {
            /* We have been redirected, possibly to another host, or
               different path, or wherever.  Check whether we really
               want to follow it.  */
            if (descend)
              {
                if (!descend_redirect_p (redirected, url, depth,
                                         start_url_parsed, blacklist))
                  descend = 0;
              }

            xfree (url);
            url = redirected;
          }
      }

      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
          if (opt.page_requisites && depth == opt.reclevel)
            /* When -p is specified, we can do one more partial
               recursion from the "leaf nodes" on the HTML document
               tree.  The recursion is partial in that we won't
               traverse any <A> or <AREA> tags, nor any <LINK> tags
               except for <LINK REL="stylesheet">. */
            dash_p_leaf_HTML = TRUE;
          else
            {
              /* Either -p wasn't specified or it was and we've
                 already gone the one extra (pseudo-)level that it
                 affords us, so we need to bail out. */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
              descend = 0;
            }
        }

      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains. */
      if (descend)
        {
          int meta_disallow_follow = 0;
          struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
                                                   &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
              free_urlpos (children);
              children = NULL;
            }

          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL);
              assert (url_parsed != NULL);

              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (descend_url_p (child, url_parsed, depth, start_url_parsed,
                                     blacklist))
                    {
                      url_enqueue (queue, xstrdup (child->url->url),
                                   xstrdup (url), depth + 1);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice.  */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              url_free (url_parsed);
              free_urlpos (children);
            }
        }

      if (opt.delete_after || (file && !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             otherwise rejected (e.g. by -R) HTML file just so we
             could harvest its hyperlinks -- in either case, delete
             the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   "recursive rejection criteria"));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
        }

      xfree (url);
      FREE_MAYBE (referer);
      FREE_MAYBE (file);
    }

  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3;
    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
      {
        xfree (d1);
        FREE_MAYBE (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}
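/* Editorial sketch (an assumption, not verbatim from Wget): the
   expected caller is main()'s URL loop, which hands each command-line
   URL to this function when recursive retrieval is requested, roughly:

     uerr_t status = retrieve_tree (url[i]);
     if (status == FWRITEERR || status == QUOTEXC)
       break;    stop processing further URLs

   The exact call site and its error handling are assumptions here.  */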
/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */
static int
descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
               struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      DEBUGP (("Already on the black list.\n"));
      goto out;
    }

  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
        gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory- consuming tests should be put later on
     the list.  */

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (u->scheme != SCHEME_HTTP
      && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out.  */
  if (u->scheme == SCHEME_HTTP)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out.  */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for -p leaf retrievals. */
  if (opt.no_parent
      && u->scheme == parent->scheme
      && 0 == strcasecmp (u->host, parent->host)
      && u->port == parent->port)
    {
      if (!frontcmp (parent->dir, u->dir))
        {
          DEBUGP (("Trying to escape the root directory with no_parent in effect.\n"));
          goto out;
        }
    }

  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists. */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir, ALLABS))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }

  /* 6. Check for acceptance/rejection rules.  We ignore these rules
     for HTML documents because they might lead to other files which
     need to be downloaded.  Of course, we don't know which
     documents are HTML before downloading them, so we guess.

     A file is subject to acceptance/rejection rules if:

     * u->file is not "" (i.e. it is not a directory)
     and either:
       + there is no file suffix,
       + or there is a suffix, but is not "html" or "htm",
       + both:
         - recursion is not infinite,
         - and we are at its very end. */
  {
    char *suf;

    if (u->file[0] != '\0'
        && ((suf = suffix (url)) == NULL
            || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
            || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
      {
        if (!acceptable (u->file))
          {
            DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                     url, u->file));
            goto out;
          }
      }
  }

  /* 7. Check for the same host, unless host spanning was requested. */
  if (u->scheme == parent->scheme)
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }

  /* 8. Consult robots.txt for HTTP URLs. */
  if (opt.use_robots && u->scheme == SCHEME_HTTP)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile))
            {
              specs = res_parse_from_file (rfile);
              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them. */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say. */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }

  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  return 1;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return 0;
}
/* This function determines whether we should descend the children of
   the URL whose download resulted in a redirection, possibly to
   another host, etc.  It is needed very rarely, and thus it is merely
   a simple-minded wrapper around descend_url_p.  */
static int
descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
  int success;

  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);

  upos = xmalloc (sizeof (struct urlpos));
  memset (upos, 0, sizeof (*upos));
  upos->url = new_parsed;

  success = descend_url_p (upos, orig_parsed, depth,
                           start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);
  xfree (upos);

  if (!success)
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  return success;
}
/* Register that URL has been successfully downloaded to FILE. */

void
register_download (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  if (!dl_file_url_map)
    dl_file_url_map = make_string_hash_table (0);
  if (!dl_url_file_map)
    dl_url_file_map = make_string_hash_table (0);

  if (!hash_table_contains (dl_file_url_map, file))
    hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
  if (!hash_table_contains (dl_url_file_map, url))
    hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}
/* Register that FROM has been redirected to TO.  This assumes that TO
   is successfully downloaded and already registered using
   register_download() above.  */

void
register_redirection (const char *from, const char *to)
{
  char *file;

  if (!opt.convert_links)
    return;

  file = hash_table_get (dl_url_file_map, to);
  assert (file != NULL);
  if (!hash_table_contains (dl_url_file_map, from))
    hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
}
/* Register that URL corresponds to the HTML file FILE. */

void
register_html (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  downloaded_html_files = slist_prepend (downloaded_html_files, file);
}
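/* Editorial note: how these hooks are driven is not shown in this
   file.  Presumably the retrieval code calls register_download() once
   a URL has been saved to a local file, adds register_html() when the
   document was identified as HTML, and calls register_redirection()
   for each redirected URL, so that every spelling of a URL maps to the
   same local file when the links are later converted.  */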
/* This function is called when the retrieval is done to convert the
   links that have been downloaded.  It has to be called at the end of
   the retrieval, because only then does Wget know conclusively which
   URLs have been downloaded, and which not, so it can tell which
   direction to convert to.

   The "direction" means that the URLs to the files that have been
   downloaded get converted to the relative URL which will point to
   that file.  And the other URLs get converted to the remote URL on
   the server.

   All the downloaded HTMLs are kept in downloaded_html_files, and
   the downloaded URLs (with their local files) in dl_url_file_map.
   All the information is extracted from these two lists.  */
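/* Illustrative example (editorial addition; the URL is hypothetical):
   after "wget -r -k http://example.com/", a link in the top-level
   index.html to http://example.com/a/b.html that was downloaded is
   rewritten as the relative reference "a/b.html"
   (CO_CONVERT_TO_RELATIVE), whereas a link to a page that was not
   downloaded is expanded to its full absolute URL
   (CO_CONVERT_TO_COMPLETE) so it still works when the mirror is
   browsed locally.  */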
void
convert_all_links (void)
{
  slist *html;
  struct wget_timer *timer;
  long msecs;
  int file_count = 0;

  timer = wtimer_new ();

  /* Destructively reverse downloaded_html_files to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently.  */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      struct urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));

      /* Determine the URL of the HTML file.  get_urls_html will need
         it.  */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
        DEBUGP (("It should correspond to %s.\n", url));
      else
        DEBUGP (("I cannot find the corresponding URL.\n"));

      /* Parse the HTML file... */
      urls = get_urls_html (html->string, url, FALSE, NULL);

      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;
          struct url *u = cur_url->url;

          if (cur_url->link_base_p)
            {
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?)  */
              cur_url->convert = CO_NULLIFY_BASE;
              continue;
            }

          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
          local_name = hash_table_get (dl_url_file_map, u->url);
          if (local_name)
            DEBUGP (("%s marked for conversion, local %s\n",
                     u->url, local_name));

          /* Decide on the conversion type.  */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
            }
        }

      /* Convert the links in the file.  */
      convert_links (html->string, urls);
      ++file_count;

      /* Free the data.  */
      free_urlpos (urls);
    }

  msecs = wtimer_elapsed (timer);
  wtimer_delete (timer);
  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
             file_count, (double)msecs / 1000);
}
/* Cleanup the data structures associated with recursive retrieving
   (the variables above).  */

void
recursive_cleanup (void)
{
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
}
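/* Editorial sketch of the overall life cycle (assumptions, not
   verbatim from Wget): a recursive run calls retrieve_tree() for each
   start URL, convert_all_links() once at the very end when `-k' was
   given, and recursive_cleanup() from the global cleanup path before
   exit, so the hash tables and the HTML list above live for the whole
   session.  */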