/* Handling of recursive HTTP retrieving.
   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget.  If not, see <http://www.gnu.org/licenses/>.

Additional permission under GNU GPL version 3 section 7

If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work.  */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;              /* the URL to download */
  const char *referer;          /* the referring document */
  int depth;                    /* the depth */
  bool html_allowed;            /* whether the document is allowed to
                                   be treated as HTML. */
  char *remote_encoding;        /* charset in effect when the URL was
                                   enqueued; restored at dequeue time */
  struct queue_element *next;   /* next element in queue */
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};
/* Create a URL queue. */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xnew0 (struct url_queue);
  return queue;
}
/* Delete a URL queue. */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}
/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth, bool html_allowed)
{
  struct queue_element *qel = xnew (struct queue_element);
  char *charset = get_current_charset ();
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->html_allowed = html_allowed;
  qel->next = NULL;

  if (charset)
    qel->remote_encoding = xstrdup (charset);
  else
    qel->remote_encoding = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}
/* Take a URL out of the queue.  Return true if this operation
   succeeded, or false if the queue is empty.  */

static bool
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth,
             bool *html_allowed)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return false;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  set_remote_charset (qel->remote_encoding);
  if (qel->remote_encoding)
    xfree (qel->remote_encoding);

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;
  *html_allowed = qel->html_allowed;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return true;
}
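/* Illustrative only -- not part of the original file.  A minimal
   exercise of the FIFO queue above, showing that URLs come back out
   in the order they went in and that ownership of the strings passes
   to the caller of url_dequeue.  The example.com URLs are made up;
   the block is kept under #if 0 so it is never compiled. */
#if 0
static void
url_queue_self_test (void)
{
  struct url_queue *queue = url_queue_new ();
  const char *url, *referer;
  int depth;
  bool html_allowed;

  url_enqueue (queue, xstrdup ("http://example.com/"), NULL, 0, true);
  url_enqueue (queue, xstrdup ("http://example.com/a.html"),
               xstrdup ("http://example.com/"), 1, true);

  /* Dequeues in FIFO order: first "/", then "/a.html". */
  while (url_dequeue (queue, &url, &referer, &depth, &html_allowed))
    {
      DEBUGP (("got %s at depth %d\n", url, depth));
      xfree ((char *) url);
      xfree_null ((char *) referer);
    }

  url_queue_delete (queue);
}
#endif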
static bool download_child_p (const struct urlpos *, struct url *, int,
                              struct url *, struct hash_table *);
static bool descend_redirect_p (const char *, const char *, int,
                                struct url *, struct hash_table *);
/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree on the
   other hand implements breadth-first traversal of the tree, which
   results in much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue.

   A distilled sketch of this loop follows below.  */
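/* The loop above in miniature (illustrative only, never compiled):
   `fetch_to_file' and `extract_urls' are hypothetical stand-ins for
   retrieve_url and get_urls_html, and the real retrieve_tree below
   additionally handles redirects, robots.txt, -p leaf pages, and
   --delete-after. */
#if 0
extern char *fetch_to_file (const char *url);                     /* hypothetical */
extern struct urlpos *extract_urls (char *file, const char *url); /* hypothetical */

static void
crawl_sketch (const char *start_url)
{
  struct url_queue *queue = url_queue_new ();
  struct hash_table *blacklist = make_string_hash_table (0);
  const char *url, *referer;
  int depth;
  bool html_allowed;

  url_enqueue (queue, xstrdup (start_url), NULL, 0, true);           /* 1. */
  string_set_add (blacklist, start_url);

  while (url_dequeue (queue, &url, &referer, &depth, &html_allowed)) /* 2, 3. */
    {
      char *file = fetch_to_file (url);                              /* 4. */
      if (file && html_allowed && depth < opt.reclevel)              /* 5. */
        {
          struct urlpos *children = extract_urls (file, url);
          struct urlpos *child;
          for (child = children; child; child = child->next)         /* 6. */
            if (!string_set_contains (blacklist, child->url->url))   /* 7. */
              {
                url_enqueue (queue, xstrdup (child->url->url),
                             xstrdup (url), depth + 1,
                             child->link_expect_html);
                string_set_add (blacklist, child->url->url);
              }
          free_urlpos (children);
        }
      xfree ((char *) url);
      xfree_null ((char *) referer);
    }

  url_queue_delete (queue);
  string_set_free (blacklist);
}
#endif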
uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue;

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet. */
  struct hash_table *blacklist;

  int up_error_code;
  struct url *start_url_parsed;

  set_ugly_no_encode (true);
  start_url_parsed = url_parse (start_url, &up_error_code);
  set_ugly_no_encode (false);
  if (!start_url_parsed)
    {
      logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
                 url_error (up_error_code));
      return URLERROR;
    }

  queue = url_queue_new ();
  blacklist = make_string_hash_table (0);

  /* Enqueue the starting URL.  Use start_url_parsed->url rather than
     just URL so we enqueue the canonical form of the URL. */
  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
  string_set_add (blacklist, start_url_parsed->url);
  while (1)
    {
      bool descend = false;
      char *url, *referer, *file = NULL;
      int depth;
      int dt = 0;
      bool html_allowed;
      bool dash_p_leaf_HTML = false;

      if (opt.quota && total_downloaded_bytes > opt.quota)
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue... */

      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth, &html_allowed))
        break;
      /* ...and download it.  Note that this download is in most cases
         unconditional, as download_child_p already makes sure a file
         doesn't get enqueued twice -- and yet this check is here, and
         not in download_child_p.  This is so that if you run `wget -r
         URL1 URL2', and a random URL is encountered once under URL1
         and again under URL2, but at a different (possibly smaller)
         depth, we want the URL's children to be taken into account
         the second time.  */
      if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
        {
          file = xstrdup (hash_table_get (dl_url_file_map, url));

          DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
                   url, file));

          if (html_allowed
              && downloaded_html_set
              && string_set_contains (downloaded_html_set, file))
            descend = true;
        }
      else
        {
          char *redirected = NULL;

          status = retrieve_url (url, &file, &redirected, referer, &dt, false);

          if (html_allowed && file && status == RETROK
              && (dt & RETROKF) && (dt & TEXTHTML))
            descend = true;

          if (redirected)
            {
              /* We have been redirected, possibly to another host, or
                 different path, or wherever.  Check whether we really
                 want to follow it. */
              if (descend)
                {
                  if (!descend_redirect_p (redirected, url, depth,
                                           start_url_parsed, blacklist))
                    descend = false;
                  else
                    /* Make sure that the old pre-redirect form gets
                       blacklisted. */
                    string_set_add (blacklist, url);
                }

              xfree (url);
              url = redirected;
            }
        }

      if (opt.spider)
        {
          visited_url (url, referer);
        }
      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
          if (opt.page_requisites
              && (depth == opt.reclevel || depth == opt.reclevel + 1))
            {
              /* When -p is specified, we are allowed to exceed the
                 maximum depth, but only for the "inline" links,
                 i.e. those that are needed to display the page.
                 Originally this could exceed the depth at most by
                 one, but we allow one more level so that the leaf
                 pages that contain frames can be loaded
                 correctly.  */
              dash_p_leaf_HTML = true;
            }
          else
            {
              /* Either -p wasn't specified or it was and we've
                 already spent the two extra (pseudo-)levels that it
                 affords us, so we need to bail out. */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
              descend = false;
            }
        }
      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains. */

      if (descend)
        {
          bool meta_disallow_follow = false;
          struct urlpos *children
            = get_urls_html (file, url, &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
              free_urlpos (children);
              children = NULL;
            }

          if (children)
            {
              struct urlpos *child = children;
              set_ugly_no_encode (true);
              struct url *url_parsed = url_parse (url, NULL);
              set_ugly_no_encode (false);
              char *referer_url = url;
              bool strip_auth = (url_parsed != NULL
                                 && url_parsed->user != NULL);
              assert (url_parsed != NULL);

              /* Strip auth info if present */
              if (strip_auth)
                referer_url = url_string (url_parsed, URL_AUTH_HIDE);

              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (dash_p_leaf_HTML && !child->link_inline_p)
                    continue;
                  if (download_child_p (child, url_parsed, depth, start_url_parsed,
                                        blacklist))
                    {
                      url_enqueue (queue, xstrdup (child->url->url),
                                   xstrdup (referer_url), depth + 1,
                                   child->link_expect_html);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice. */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              if (strip_auth)
                xfree (referer_url);
              url_free (url_parsed);
              free_urlpos (children);
            }
        }
      if (file
          && (opt.delete_after
              || opt.spider /* opt.recursive is implicitly true */
              || !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             (otherwise unneeded because of --spider or rejected by -R)
             HTML file just to harvest its hyperlinks -- in either case,
             delete the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   (opt.spider ? "--spider" :
                    "recursive rejection criteria")));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after || opt.spider
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
          logputs (LOG_VERBOSE, "\n");
          register_delete_file (file);
        }

      xfree (url);
      xfree_null (referer);
      xfree_null (file);
    }
  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3;
    bool d4;
    while (url_dequeue (queue,
                        (const char **)&d1, (const char **)&d2, &d3, &d4))
      {
        xfree (d1);
        xfree_null (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (opt.quota && total_downloaded_bytes > opt.quota)
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}
/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */

static bool
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
                  struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;
  bool u_scheme_like_http;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      if (opt.spider)
        {
          char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
          DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
          visited_url (url, referrer);
          xfree (referrer);
        }

      DEBUGP (("Already on the black list.\n"));
      goto out;
    }
  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
        gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory- consuming tests should be put later on
     the list.  */
  /* Determine whether URL under consideration has an HTTP-like scheme. */
  u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out.  */
  if (u_scheme_like_http)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out.  */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }
  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for documents needed to display
     the parent page when in -p mode. */
  if (opt.no_parent
      && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
      && 0 == strcasecmp (u->host, start_url_parsed->host)
      && u->port == start_url_parsed->port
      && !(opt.page_requisites && upos->link_inline_p))
    {
      if (!subdir_p (start_url_parsed->dir, u->dir))
        {
          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                   u->dir, start_url_parsed->dir));
          goto out;
        }
    }

  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists. */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }
  /* 6. Check for acceptance/rejection rules.  We ignore these rules
     for directories (no file name to match) and for non-leaf HTMLs,
     which can lead to other files that do need to be downloaded.  (-p
     automatically implies non-leaf because with -p we can, if
     necessary, overstep the maximum depth to get the page requisites.)  */
  if (u->file[0] != '\0'
      && !(has_html_suffix_p (u->file)
           /* The exception only applies to non-leaf HTMLs (but -p
              always implies non-leaf because we can overstep the
              maximum depth to get the requisites): */
           && (/* non-leaf */
               opt.reclevel == INFINITE_RECURSION
               /* also non-leaf */
               || depth < opt.reclevel - 1
               /* -p, which implies non-leaf (see above) */
               || opt.page_requisites)))
    {
      if (!acceptable (u->file))
        {
          DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                   url, u->file));
          goto out;
        }
    }

  /* 7. */
  if (schemes_are_similar_p (u->scheme, parent->scheme))
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }
  /* 8. */
  if (opt.use_robots && u_scheme_like_http)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile))
            {
              specs = res_parse_from_file (rfile);

              /* Delete the robots.txt file if we chose to either delete the
                 files after downloading or we're just running a spider. */
              if (opt.delete_after || opt.spider)
                {
                  logprintf (LOG_VERBOSE, "Removing %s.\n", rfile);
                  if (unlink (rfile))
                    logprintf (LOG_NOTQUIET, "unlink: %s\n",
                               strerror (errno));
                }

              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them. */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say. */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }

  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  return true;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return false;
}
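/* An illustrative caller (a sketch, not part of wget; `maybe_enqueue'
   is a hypothetical name): enqueue CHILD only when download_child_p
   approves, then blacklist it, so that the next sighting of the same
   URL is rejected by the cheap hash lookup at the top of
   download_child_p instead of re-running the expensive checks.  Never
   compiled. */
#if 0
static void
maybe_enqueue (struct url_queue *queue, const struct urlpos *child,
               struct url *parent, int depth,
               struct url *start_url_parsed, struct hash_table *blacklist)
{
  if (download_child_p (child, parent, depth, start_url_parsed, blacklist))
    {
      url_enqueue (queue, xstrdup (child->url->url),
                   xstrdup (parent->url), depth + 1,
                   child->link_expect_html);
      string_set_add (blacklist, child->url->url);
    }
}
#endif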
/* This function determines whether we will consider downloading the
   children of a URL whose download resulted in a redirection,
   possibly to another host, etc.  It is needed very rarely, and thus
   it is merely a simple-minded wrapper around download_child_p.  */

static bool
descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
  bool success;

  set_ugly_no_encode (true);
  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);
  set_ugly_no_encode (false);

  upos = xnew0 (struct urlpos);
  upos->url = new_parsed;

  success = download_child_p (upos, orig_parsed, depth,
                              start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);
  xfree (upos);

  if (!success)
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  return success;
}
/* vim:set sts=2 sw=2 cino+={s: */