/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>
#include <sys/types.h>

#include "wget.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
#include "retr.h"
#include "host.h"
#include "hash.h"
#include "res.h"

#ifndef errno
extern int errno;
#endif

extern char *version_string;
static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  This list should only be traversed
   in order.  If you need to check whether a file has been downloaded,
   use a hash table, e.g. dl_file_url_map.  */
static slist *downloaded_html_files;
/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;
  const char *referer;
  int depth;
  struct queue_element *next;
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};
/* Create a URL queue. */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xmalloc (sizeof (*queue));
  memset (queue, '\0', sizeof (*queue));
  return queue;
}
/* Delete a URL queue. */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}
/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth)
{
  struct queue_element *qel = xmalloc (sizeof (*qel));
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}
/* Take a URL out of the queue.  Return 1 if this operation succeeded,
   or 0 if the queue is empty.  */

static int
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return 0;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return 1;
}
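/* A minimal usage sketch of the queue API above (not part of Wget
   proper; compiled out with #if 0, and the URL is just a placeholder).
   The URL string is handed off to the queue by the caller and freed by
   the caller after dequeuing, exactly as retrieve_tree does below.  */
#if 0
static void
example_queue_usage (void)
{
  struct url_queue *queue = url_queue_new ();
  const char *url, *referer;
  int depth;

  url_enqueue (queue, xstrdup ("http://example.com/"), NULL, 0);
  while (url_dequeue (queue, &url, &referer, &depth))
    {
      /* ...download or otherwise process URL here... */
      xfree ((char *) url);
      FREE_MAYBE ((char *) referer);
    }
  url_queue_delete (queue);
}
#endif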
static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
                                  struct url *, struct hash_table *));
static int descend_redirect_p PARAMS ((const char *, const char *, int,
                                       struct url *, struct hash_table *));
/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree on the
   other hand implements breadth-first traversal of the tree, which
   results in much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue. */
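/* For example, on a hypothetical site where /index.html links to
   /a.html and /b.html, and /a.html links to /c.html, the breadth-first
   order is index.html, a.html, b.html, c.html -- every document at
   depth 1 is retrieved before any document at depth 2.  The old
   depth-first code would have descended into a.html and fetched c.html
   before ever looking at b.html.  */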
uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue = url_queue_new ();

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist = make_string_hash_table (0);

  /* We'll need various components of this, so better get it over with
     now. */
  struct url *start_url_parsed = url_parse (start_url, NULL);

  url_enqueue (queue, xstrdup (start_url), NULL, 0);
  string_set_add (blacklist, start_url);
  while (1)
    {
      int descend = 0;
      char *url, *referer, *file = NULL;
      int depth;
      boolean dash_p_leaf_HTML = FALSE;

      if (downloaded_exceeds_quota ())
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue. */

      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth))
        break;
      /* And download it. */

      {
        int dt = 0;
        char *redirected = NULL;
        int oldrec = opt.recursive;

        opt.recursive = 0;
        status = retrieve_url (url, &file, &redirected, NULL, &dt);
        opt.recursive = oldrec;

        if (file && status == RETROK
            && (dt & RETROKF) && (dt & TEXTHTML))
          descend = 1;

        if (redirected)
          {
            /* We have been redirected, possibly to another host, or
               different path, or wherever.  Check whether we really
               want to follow it.  */
            if (descend)
              {
                if (!descend_redirect_p (redirected, url, depth,
                                         start_url_parsed, blacklist))
                  descend = 0;
                else
                  /* Make sure that the old pre-redirect form gets
                     blacklisted. */
                  string_set_add (blacklist, url);
              }

            xfree (url);
            url = redirected;
          }
      }
      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
          if (opt.page_requisites && depth == opt.reclevel)
            /* When -p is specified, we can do one more partial
               recursion from the "leaf nodes" on the HTML document
               tree.  The recursion is partial in that we won't
               traverse any <A> or <AREA> tags, nor any <LINK> tags
               except for <LINK REL="stylesheet">. */
            dash_p_leaf_HTML = TRUE;
          else
            {
              /* Either -p wasn't specified or it was and we've
                 already gone the one extra (pseudo-)level that it
                 affords us, so we need to bail out. */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
              descend = 0;
            }
        }
      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains. */

      if (descend)
        {
          int meta_disallow_follow = 0;
          struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
                                                   &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
              free_urlpos (children);
              children = NULL;
            }

          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL);
              assert (url_parsed != NULL);

              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (descend_url_p (child, url_parsed, depth, start_url_parsed,
                                     blacklist))
                    {
                      url_enqueue (queue, xstrdup (child->url->url),
                                   xstrdup (url), depth + 1);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice.  */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              url_free (url_parsed);
              free_urlpos (children);
            }
        }
      if (opt.delete_after || (file && !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             otherwise rejected (e.g. by -R) HTML file just so we
             could harvest its hyperlinks -- in either case, delete
             the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   "recursive rejection criteria"));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
        }

      xfree (url);
      FREE_MAYBE (referer);
      FREE_MAYBE (file);
    }
  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3;
    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
      {
        xfree (d1);
        FREE_MAYBE (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}
/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */

static int
descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
               struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      DEBUGP (("Already on the black list.\n"));
      goto out;
    }
  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
     gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory- consuming tests should be put later on
     the list.  */
  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (u->scheme != SCHEME_HTTP
      && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out.  */
  if (u->scheme == SCHEME_HTTP)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out.  */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for -p leaf retrievals.  */
  if (opt.no_parent
      && u->scheme == parent->scheme
      && 0 == strcasecmp (u->host, parent->host)
      && u->port == parent->port)
    {
      if (!frontcmp (parent->dir, u->dir))
        {
          DEBUGP (("Trying to escape the root directory with no_parent in effect.\n"));
          goto out;
        }
    }
  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists.  */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir, ALLABS))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }
  /* 6. */
  {
    char *suf;

    /* Check for acceptance/rejection rules.  We ignore these rules
       for HTML documents because they might lead to other files which
       need to be downloaded.  Of course, we don't know which
       documents are HTML before downloading them, so we guess.

       A file is subject to acceptance/rejection rules if:

       * u->file is not "" (i.e. it is not a directory)
       and either:
         + there is no file suffix,
         + or there is a suffix, but is not "html" or "htm",
         + both:
           - recursion is not infinite,
           - and we are at its very end. */

    if (u->file[0] != '\0'
        && ((suf = suffix (url)) == NULL
            || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
            || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
      {
        if (!acceptable (u->file))
          {
            DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                     url, u->file));
            goto out;
          }
      }
  }
  /* 7. Check for the same host, unless host-spanning was requested. */
  if (u->scheme == parent->scheme)
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }
  /* 8. Consult robots.txt. */
  if (opt.use_robots && u->scheme == SCHEME_HTTP)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile))
            {
              specs = res_parse_from_file (rfile);
              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them.  */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say.  */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }
  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  return 1;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return 0;
}
/* This function determines whether we should descend the children of
   the URL whose download resulted in a redirection, possibly to
   another host, etc.  It is needed very rarely, and thus it is merely
   a simple-minded wrapper around descend_url_p.  */

static int
descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
  int success;

  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);

  upos = xmalloc (sizeof (struct urlpos));
  memset (upos, 0, sizeof (*upos));
  upos->url = new_parsed;

  success = descend_url_p (upos, orig_parsed, depth,
                           start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);
  xfree (upos);

  if (!success)
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  return success;
}
/* Register that URL has been successfully downloaded to FILE. */

void
register_download (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  if (!dl_file_url_map)
    dl_file_url_map = make_string_hash_table (0);
  if (!dl_url_file_map)
    dl_url_file_map = make_string_hash_table (0);

  if (!hash_table_contains (dl_file_url_map, file))
    hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
  if (!hash_table_contains (dl_url_file_map, url))
    hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}
/* Register that FROM has been redirected to TO.  This assumes that TO
   is successfully downloaded and already registered using
   register_download() above.  */

void
register_redirection (const char *from, const char *to)
{
  char *file;

  if (!opt.convert_links)
    return;

  file = hash_table_get (dl_url_file_map, to);
  assert (file != NULL);
  if (!hash_table_contains (dl_url_file_map, from))
    hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
}
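/* Illustration with hypothetical URLs: after

     register_download ("http://host/new.html", "host/new.html");
     register_redirection ("http://host/old.html", "http://host/new.html");

   both the old and the new URL map to "host/new.html" in
   dl_url_file_map, so a link written in either form can later be
   converted to the same local file.  */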
/* Register that URL corresponds to the HTML file FILE. */

void
register_html (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  downloaded_html_files = slist_prepend (downloaded_html_files, file);
}
/* This function is called when the retrieval is done to convert the
   links that have been downloaded.  It has to be called at the end of
   the retrieval, because only then does Wget know conclusively which
   URLs have been downloaded, and which not, so it can tell which
   direction to convert to.

   The "direction" means that the URLs to the files that have been
   downloaded get converted to the relative URL which will point to
   that file.  And the other URLs get converted to the remote URL on
   the server.

   All the downloaded HTML files are kept in downloaded_html_files, and
   the downloaded URLs in dl_url_file_map.  All the information is
   extracted from these two structures.  */
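/* For instance, with hypothetical documents: if http://host/a.html was
   saved locally and contains a link to http://host/b.gif, the link is
   rewritten to the relative "b.gif" if b.gif was downloaded as well,
   but is expanded to the full "http://host/b.gif" if it was not, so
   the saved page still works when browsed locally.  */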
void
convert_all_links (void)
{
  slist *html;
  long msecs;
  int file_count = 0;

  struct wget_timer *timer;
  timer = wtimer_new ();

  /* Destructively reverse downloaded_html_files to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently.  */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      struct urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));

      /* Determine the URL of the HTML file.  get_urls_html will need
         it.  */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
        DEBUGP (("It should correspond to %s.\n", url));
      else
        DEBUGP (("I cannot find the corresponding URL.\n"));

      /* Parse the HTML file... */
      urls = get_urls_html (html->string, url, FALSE, NULL);
      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;
          struct url *u = cur_url->url;

          if (cur_url->link_base_p)
            {
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?)  */
              cur_url->convert = CO_NULLIFY_BASE;
              continue;
            }
          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
          local_name = hash_table_get (dl_url_file_map, u->url);
          if (local_name)
            DEBUGP (("%s marked for conversion, local %s\n",
                     u->url, local_name));

          /* Decide on the conversion type.  */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
            }
        }
      /* Convert the links in the file.  */
      convert_links (html->string, urls);
      ++file_count;

      /* Free the data.  */
      free_urlpos (urls);
    }

  msecs = wtimer_elapsed (timer);
  wtimer_delete (timer);
  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
             file_count, (double)msecs / 1000);
}
/* Cleanup the data structures associated with recursive retrieving
   (the variables above).  */

void
recursive_cleanup (void)
{
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
}