/* Handling of recursive HTTP retrieving.
   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   2004, 2005, 2006, 2007 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget.  If not, see <http://www.gnu.org/licenses/>.

Additional permission under GNU GPL version 3 section 7

If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work.  */
#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>

#include "wget.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
#include "retr.h"
#include "ftp.h"
#include "host.h"
#include "hash.h"
#include "res.h"
#include "convert.h"
#include "spider.h"
/* Functions for maintaining the URL queue.  */
struct queue_element {
  const char *url;              /* the URL to download */
  const char *referer;          /* the referring document */
  int depth;                    /* the depth */
  bool html_allowed;            /* whether the document is allowed to
                                   be treated as HTML. */

  struct queue_element *next;   /* next element in queue */
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};

/* Create a URL queue. */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xnew0 (struct url_queue);
  return queue;
}

/* Delete a URL queue. */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}

/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth, bool html_allowed)
{
  struct queue_element *qel = xnew (struct queue_element);
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->html_allowed = html_allowed;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}

/* Take a URL out of the queue.  Return true if this operation
   succeeded, or false if the queue is empty.  */

static bool
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth,
             bool *html_allowed)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return false;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;
  *html_allowed = qel->html_allowed;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return true;
}
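
/* Illustrative usage sketch (not part of the original sources): the
   queue takes ownership of heap-allocated strings and hands it back
   to whoever dequeues them, so a typical round trip looks like

     struct url_queue *q = url_queue_new ();
     url_enqueue (q, xstrdup ("http://example.com/"), NULL, 0, true);

     const char *url, *referer;
     int depth;
     bool html_allowed;
     while (url_dequeue (q, &url, &referer, &depth, &html_allowed))
       xfree ((char *) url);
     url_queue_delete (q);
*/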

static bool download_child_p (const struct urlpos *, struct url *, int,
                              struct url *, struct hash_table *);
static bool descend_redirect_p (const char *, const char *, int,
                                struct url *, struct hash_table *);

/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree on the
   other hand implements breadth-first traversal of the tree, which
   results in much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue. */
uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue;

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist;

  int up_error_code;
  struct url *start_url_parsed = url_parse (start_url, &up_error_code);

  if (!start_url_parsed)
    {
      logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
                 url_error (up_error_code));
      return URLERROR;
    }
  queue = url_queue_new ();
  blacklist = make_string_hash_table (0);

  /* Enqueue the starting URL.  Use start_url_parsed->url rather than
     just URL so we enqueue the canonical form of the URL.  */
  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
  string_set_add (blacklist, start_url_parsed->url);
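  /* (Illustrative note, not from the original comments: url_parse
     yields the canonical spelling -- e.g. an upper-case host name is
     lowercased and a path like "a/./b" is simplified -- and it is
     that spelling which must also go into the blacklist for the
     membership tests below to work.)  */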

  while (1)
    {
      bool descend = false;
      char *url, *referer, *file = NULL;
      int depth;
      bool html_allowed;
      bool dash_p_leaf_HTML = false;

      if (opt.quota && total_downloaded_bytes > opt.quota)
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue... */

      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth, &html_allowed))
        break;

      /* ...and download it.  Note that this download is in most cases
         unconditional, as download_child_p already makes sure a file
         doesn't get enqueued twice -- and yet this check is here, and
         not in download_child_p.  This is so that if you run `wget -r
         URL1 URL2', and a random URL is encountered once under URL1
         and again under URL2, but at a different (possibly smaller)
         depth, we want the URL's children to be taken into account
         the second time.  */
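      /* (Worked example, added for illustration: with `wget -r -l2
         URL1 URL2', a page X first reached under URL1 at depth 2 is a
         leaf there, so its links are not followed.  If X turns up
         again under URL2 at depth 1, this branch reuses the file
         already on disk and still lets us descend into X's
         children.)  */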
      if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
        {
          file = xstrdup (hash_table_get (dl_url_file_map, url));

          DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
                   url, file));

          if (html_allowed
              && downloaded_html_set
              && string_set_contains (downloaded_html_set, file))
            descend = true;
        }
      else
        {
          int dt = 0;
          char *redirected = NULL;

          status = retrieve_url (url, &file, &redirected, referer, &dt, false);
          if (html_allowed && file && status == RETROK
              && (dt & RETROKF) && (dt & TEXTHTML))
            descend = true;

          if (redirected)
            {
              /* We have been redirected, possibly to another host, or
                 different path, or wherever.  Check whether we really
                 want to follow it.  */
              if (descend)
                {
                  if (!descend_redirect_p (redirected, url, depth,
                                           start_url_parsed, blacklist))
                    descend = false;
                  else
                    /* Make sure that the old pre-redirect form gets
                       blacklisted. */
                    string_set_add (blacklist, url);
                }

              xfree (url);
              url = redirected;
            }
        }

      if (opt.spider)
        {
          visited_url (url, referer);
        }
285 && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
287 if (opt.page_requisites
288 && (depth == opt.reclevel || depth == opt.reclevel + 1))
290 /* When -p is specified, we are allowed to exceed the
291 maximum depth, but only for the "inline" links,
292 i.e. those that are needed to display the page.
293 Originally this could exceed the depth at most by
294 one, but we allow one more level so that the leaf
295 pages that contain frames can be loaded
297 dash_p_leaf_HTML = true;
301 /* Either -p wasn't specified or it was and we've
302 already spent the two extra (pseudo-)levels that it
303 affords us, so we need to bail out. */
304 DEBUGP (("Not descending further; at depth %d, max. %d.\n",
305 depth, opt.reclevel));
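      /* (Worked example, added for illustration: with `-l 2 -p', a
         page reached at depth 2 or 3 is still parsed, but only its
         inline requisites -- images, stylesheets, frame contents --
         are enqueued; its ordinary links are dropped.  The second
         extra level exists so that requisites of a frame page fetched
         at depth 3 can themselves be retrieved.)  */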

      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains. */

      if (descend)
        {
          bool meta_disallow_follow = false;
          struct urlpos *children
            = get_urls_html (file, url, &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
              free_urlpos (children);
              children = NULL;
            }
          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL);
              char *referer_url = url;
              bool strip_auth = (url_parsed != NULL
                                 && url_parsed->user != NULL);
              assert (url_parsed != NULL);

              /* Strip auth info if present */
              if (strip_auth)
                referer_url = url_string (url_parsed, URL_AUTH_HIDE);
              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (dash_p_leaf_HTML && !child->link_inline_p)
                    continue;
                  if (download_child_p (child, url_parsed, depth, start_url_parsed,
                                        blacklist))
                    {
                      url_enqueue (queue, xstrdup (child->url->url),
                                   xstrdup (referer_url), depth + 1,
                                   child->link_expect_html);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice.  */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              if (strip_auth)
                xfree (referer_url);
              url_free (url_parsed);
              free_urlpos (children);
            }
        }
      if (file
          && (opt.delete_after
              || opt.spider /* opt.recursive is implicitly true */
              || !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             (otherwise unneeded because of --spider or rejected by -R)
             HTML file just to harvest its hyperlinks -- in either case,
             delete the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   (opt.spider ? "--spider" :
                    "recursive rejection criteria")));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after || opt.spider
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
          logputs (LOG_VERBOSE, "\n");
          register_delete_file (file);
        }
      xfree (url);
      xfree_null (referer);
      xfree_null (file);
    }

  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3;
    bool d4;
    while (url_dequeue (queue,
                        (const char **)&d1, (const char **)&d2, &d3, &d4))
      {
        xfree (d1);
        xfree_null (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (opt.quota && total_downloaded_bytes > opt.quota)
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}
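
/* (Usage sketch, added for illustration rather than taken from the
   original sources: main() is expected to invoke retrieve_tree once
   per command-line URL when recursion is in effect, roughly

     if (opt.recursive || opt.page_requisites)
       status = retrieve_tree (url);

   with the global opt structure already filled in by the option
   parser.)  */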

/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */
static bool
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
                  struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;
  bool u_scheme_like_http;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      if (opt.spider)
        {
          char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
          DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url));
          visited_url (url, referrer);
          xfree (referrer);
        }
      DEBUGP (("Already on the black list.\n"));
      goto out;
    }

  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
     gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory-consuming tests should be put later on
     the list.  (See the note below.)  */
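
  /* (Illustrative note on the ordering: it is a cost gradient.
     Check 2 only tests a flag already in memory, check 7 costs at
     most a strcasecmp of two host names, while check 8 may require
     fetching and parsing a robots.txt file over the network -- hence
     it runs last, and only for URLs that survived everything
     cheaper.)  */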

  /* Determine whether URL under consideration has an HTTP-like scheme. */
  u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out.  */
  if (u_scheme_like_http)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out.  */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for documents needed to display
     the parent page when in -p mode.  */
  if (opt.no_parent
      && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
      && 0 == strcasecmp (u->host, start_url_parsed->host)
      && u->port == start_url_parsed->port
      && !(opt.page_requisites && upos->link_inline_p))
    {
      if (!subdir_p (start_url_parsed->dir, u->dir))
        {
          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                   u->dir, start_url_parsed->dir));
          goto out;
        }
    }

  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists.  */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }

  /* 6. Check for acceptance/rejection rules.  We ignore these rules
     for directories (no file name to match) and for non-leaf HTMLs,
     which can lead to other files that do need to be downloaded.  (-p
     automatically implies non-leaf because with -p we can, if
     necessary, overstep the maximum depth to get the page requisites.)  */
  if (u->file[0] != '\0'
      && !(has_html_suffix_p (u->file)
           /* The exception only applies to non-leaf HTMLs (but -p
              always implies non-leaf because we can overstep the
              maximum depth to get the requisites): */
           && (/* non-leaf */
               opt.reclevel == INFINITE_RECURSION
               /* also non-leaf */
               || depth < opt.reclevel - 1
               /* -p, which implies non-leaf (see above) */
               || opt.page_requisites)))
    {
      if (!acceptable (u->file))
        {
          DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                   url, u->file));
          goto out;
        }
    }

  /* 7. */
  if (schemes_are_similar_p (u->scheme, parent->scheme))
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }

  /* 8. */
  if (opt.use_robots && u_scheme_like_http)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile))
            {
              specs = res_parse_from_file (rfile);
              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them.  */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say.  */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }

  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  return true;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return false;
}

/* This function determines whether we will consider downloading the
   children of a URL whose download resulted in a redirection,
   possibly to another host, etc.  It is needed very rarely, and thus
   it is merely a simple-minded wrapper around download_child_p.  */
static bool
descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
  bool success;

  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);

  upos = xnew0 (struct urlpos);
  upos->url = new_parsed;

  success = download_child_p (upos, orig_parsed, depth,
                              start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);
  xfree (upos);

  if (!success)
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  return success;
}

/* vim:set sts=2 sw=2 cino+={s: */