/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>
#include <sys/types.h>
extern char *version_string;

static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  This list should only be traversed
   in order.  If you need to check whether a file has been downloaded,
   use a hash table, e.g. dl_file_url_map.  */
static slist *downloaded_html_files;
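
/* For example (an illustrative note, not part of the original
   source): a membership test goes through the hash table in O(1),

     if (hash_table_contains (dl_file_url_map, file))
       ...FILE was downloaded in this run...

   whereas walking the downloaded_html_files slist would be O(n).  */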

/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;
  const char *referer;
  int depth;
  struct queue_element *next;
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};

/* Create a URL queue. */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xmalloc (sizeof (*queue));
  memset (queue, '\0', sizeof (*queue));
  return queue;
}

/* Delete a URL queue. */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}

/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth)
{
  struct queue_element *qel = xmalloc (sizeof (*qel));
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}

/* Take a URL out of the queue.  Return 1 if this operation succeeded,
   or 0 if the queue is empty.  */

static int
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return 0;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return 1;
}
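
/* A minimal usage sketch of the queue API above (illustrative only,
   not part of the original source).  The queue takes ownership of
   heap-allocated URL and referer strings, which the caller reclaims
   after dequeuing, just as retrieve_tree does below:

     struct url_queue *q = url_queue_new ();
     const char *url, *referer;
     int depth;

     url_enqueue (q, xstrdup ("http://example.com/"), NULL, 0);
     while (url_dequeue (q, &url, &referer, &depth))
       {
         ...process URL at DEPTH...
         xfree ((char *) url);
         FREE_MAYBE ((char *) referer);
       }
     url_queue_delete (q);  */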

static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
                                  struct url *, struct hash_table *));
static int descend_redirect_p PARAMS ((const char *, const char *, int,
                                       struct url *, struct hash_table *));

/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree, on
   the other hand, implements breadth-first traversal of the tree,
   which results in much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue. */
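
/* To illustrate the ordering claim (an example added for clarity, not
   from the original source): if page A links to B and C, and B links
   to D, breadth-first traversal downloads A, B, C, D -- all of A's
   direct links before anything deeper -- whereas the old depth-first
   code would have fetched A, B, D, C, descending into B's subtree
   before finishing A's own links.  */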

uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue = url_queue_new ();

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist = make_string_hash_table (0);

  /* We'll need various components of this, so better get it over with
     now.  */
  struct url *start_url_parsed = url_parse (start_url, NULL);

  url_enqueue (queue, xstrdup (start_url), NULL, 0);
  string_set_add (blacklist, start_url);

  while (1)
    {
      int descend = 0;
      char *url, *referer, *file = NULL;
      int depth;
      boolean dash_p_leaf_HTML = FALSE;

      if (downloaded_exceeds_quota ())
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue. */

      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth))
        break;

      /* And download it. */

      {
        int dt = 0;
        char *redirected = NULL;
        int oldrec = opt.recursive;

        opt.recursive = 0;
        status = retrieve_url (url, &file, &redirected, NULL, &dt);
        opt.recursive = oldrec;

        if (file && status == RETROK
            && (dt & RETROKF) && (dt & TEXTHTML))
          descend = 1;

        if (redirected)
          {
            /* We have been redirected, possibly to another host, or
               different path, or wherever.  Check whether we really
               want to follow it.  */
            if (descend)
              {
                if (!descend_redirect_p (redirected, url, depth,
                                         start_url_parsed, blacklist))
                  descend = 0;
                else
                  /* Make sure that the old pre-redirect form gets
                     blacklisted. */
                  string_set_add (blacklist, url);
              }

            xfree (url);
            url = redirected;
          }
      }

      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
          if (opt.page_requisites && depth == opt.reclevel)
            /* When -p is specified, we can do one more partial
               recursion from the "leaf nodes" on the HTML document
               tree.  The recursion is partial in that we won't
               traverse any <A> or <AREA> tags, nor any <LINK> tags
               except for <LINK REL="stylesheet">. */
            dash_p_leaf_HTML = TRUE;
          else
            {
              /* Either -p wasn't specified or it was and we've
                 already gone the one extra (pseudo-)level that it
                 affords us, so we need to bail out. */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
              descend = 0;
            }
        }

      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains. */

      if (descend)
        {
          int meta_disallow_follow = 0;
          struct urlpos *children
            = get_urls_html (file, url, &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
              free_urlpos (children);
              children = NULL;
            }

          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL);
              assert (url_parsed != NULL);

              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (dash_p_leaf_HTML && !child->link_inline_p)
                    continue;
                  if (descend_url_p (child, url_parsed, depth, start_url_parsed,
                                     blacklist))
                    {
                      url_enqueue (queue, xstrdup (child->url->url),
                                   xstrdup (url), depth + 1);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice.  */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              url_free (url_parsed);
              free_urlpos (children);
            }
        }

      if (opt.delete_after || (file && !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             otherwise rejected (e.g. by -R) HTML file just so we
             could harvest its hyperlinks -- in either case, delete
             the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   "recursive rejection criteria"));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
        }

      xfree (url);
      FREE_MAYBE (referer);
      FREE_MAYBE (file);
    }

  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3;
    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
      {
        xfree (d1);
        FREE_MAYBE (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}

/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */

static int
descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
               struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      DEBUGP (("Already on the black list.\n"));
      goto out;
    }

  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
        gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory- consuming tests should be put later on
     the list.  */

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (u->scheme != SCHEME_HTTP
      && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out.  */
  if (u->scheme == SCHEME_HTTP)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out.  */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for documents needed to display
     the parent page when in -p mode.  */
  if (opt.no_parent
      && u->scheme == start_url_parsed->scheme
      && 0 == strcasecmp (u->host, start_url_parsed->host)
      && u->port == start_url_parsed->port
      && !(opt.page_requisites && upos->link_inline_p))
    {
      if (!frontcmp (start_url_parsed->dir, u->dir))
        {
          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                   u->dir, start_url_parsed->dir));
          goto out;
        }
    }

  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists.  */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir, ALLABS))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }

  /* 6. */
  {
    char *suf;
    /* Check for acceptance/rejection rules.  We ignore these rules
       for HTML documents because they might lead to other files which
       need to be downloaded.  Of course, we don't know which
       documents are HTML before downloading them, so we guess.

       A file is subject to acceptance/rejection rules if:

       * u->file is not "" (i.e. it is not a directory)
       and either:
         + there is no file suffix,
         + or there is a suffix, but is not "html" or "htm",
         + both:
           - recursion is not infinite,
           - and we are at its very end. */

    if (u->file[0] != '\0'
        && ((suf = suffix (url)) == NULL
            || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
            || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
      {
        if (!acceptable (u->file))
          {
            DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                     url, u->file));
            goto out;
          }
      }
  }

  /* 7. */
  if (u->scheme == parent->scheme)
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }

  /* 8. */
  if (opt.use_robots && u->scheme == SCHEME_HTTP)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile))
            {
              specs = res_parse_from_file (rfile);
              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them.  */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say.  */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }

  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  return 1;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return 0;
}
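
/* A worked example of the acc/rej exemption in check 6 above
   (illustrative, not from the original source): with `-r -A jpg', a
   link to "photo.jpg" passes acceptable(), a link to "notes.txt" is
   rejected here, and a link to "chapter2.html" is still downloaded --
   it may lead to other acceptable files -- and is then deleted by the
   opt.delete_after/!acceptable() branch in retrieve_tree.  */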

/* This function determines whether we should descend the children of
   the URL whose download resulted in a redirection, possibly to
   another host, etc.  It is needed very rarely, and thus it is merely
   a simple-minded wrapper around descend_url_p.  */

static int
descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
  int success;

  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);

  upos = xmalloc (sizeof (struct urlpos));
  memset (upos, 0, sizeof (*upos));
  upos->url = new_parsed;

  success = descend_url_p (upos, orig_parsed, depth,
                           start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);
  xfree (upos);

  if (!success)
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  return success;
}

/* Register that URL has been successfully downloaded to FILE. */

void
register_download (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  if (!dl_file_url_map)
    dl_file_url_map = make_string_hash_table (0);
  if (!dl_url_file_map)
    dl_url_file_map = make_string_hash_table (0);

  if (!hash_table_contains (dl_file_url_map, file))
    hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
  if (!hash_table_contains (dl_url_file_map, url))
    hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}

/* Register that FROM has been redirected to TO.  This assumes that TO
   is successfully downloaded and already registered using
   register_download() above.  */

void
register_redirection (const char *from, const char *to)
{
  char *file;

  if (!opt.convert_links)
    return;

  file = hash_table_get (dl_url_file_map, to);
  assert (file != NULL);
  if (!hash_table_contains (dl_url_file_map, from))
    hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
}
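
/* Example of how the two maps evolve (illustrative; the URL and file
   names are hypothetical): after register_download
   ("http://host/a.html", "host/a.html"), dl_url_file_map maps the URL
   to the local file and dl_file_url_map maps the file back to the
   URL.  A later register_redirection ("http://host/old.html",
   "http://host/a.html") merely adds a second URL key pointing to the
   same local file, which is exactly what link conversion needs.  */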

/* Register that URL corresponds to the HTML file FILE. */

void
register_html (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  downloaded_html_files = slist_prepend (downloaded_html_files, file);
}

/* This function is called when the retrieval is done to convert the
   links that have been downloaded.  It has to be called at the end of
   the retrieval, because only then does Wget know conclusively which
   URLs have been downloaded, and which not, so it can tell which
   direction to convert to.

   The "direction" means that the URLs to the files that have been
   downloaded get converted to the relative URL which will point to
   that file.  And the other URLs get converted to the remote URL on
   the server.

   All the downloaded HTMLs are kept in downloaded_html_files, and the
   downloaded URLs in dl_url_file_map.  All the information is
   extracted from these two structures.  */
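
/* For instance (an illustrative example with hypothetical names, not
   from the original source): suppose http://host/docs/a.html was
   saved as docs/a.html and links to both "b.html" (downloaded) and
   "big.iso" (not downloaded).  After conversion, the link to b.html
   is made relative so it works when browsed locally, while the link
   to big.iso becomes the complete "http://host/docs/big.iso" so it
   still reaches the server.  */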

void
convert_all_links (void)
{
  slist *html;
  struct wget_timer *timer;
  long msecs;
  int file_count = 0;

  timer = wtimer_new ();

  /* Destructively reverse downloaded_html_files to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently.  */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      struct urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));

      /* Determine the URL of the HTML file.  get_urls_html will need
         it.  */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
        DEBUGP (("It should correspond to %s.\n", url));
      else
        DEBUGP (("I cannot find the corresponding URL.\n"));

      /* Parse the HTML file... */
      urls = get_urls_html (html->string, url, NULL);

      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;
          struct url *u = cur_url->url;

          if (cur_url->link_base_p)
            {
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?)  */
              cur_url->convert = CO_NULLIFY_BASE;
              continue;
            }

          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
          local_name = hash_table_get (dl_url_file_map, u->url);
          if (local_name)
            DEBUGP (("%s marked for conversion, local %s\n",
                     u->url, local_name));

          /* Decide on the conversion type.  */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
            }
        }

      /* Convert the links in the file.  */
      convert_links (html->string, urls);
      ++file_count;

      /* Free the data.  */
      free_urlpos (urls);
    }

  msecs = wtimer_elapsed (timer);
  wtimer_delete (timer);
  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
             file_count, (double)msecs / 1000);
}

/* Cleanup the data structures associated with recursive retrieving
   (the variables above).  */

void
recursive_cleanup (void)
{
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
}