/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <sys/types.h>
extern char *version_string;

static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;
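/* The two hash tables above are used only when --convert-links is in
   effect: dl_file_url_map maps a downloaded local file name to the URL
   it came from, and dl_url_file_map maps a URL to the local file it
   was saved as (see register_download below).  */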
/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  This list should only be traversed
   in order.  If you need to check whether a file has been downloaded,
   use a hash table, e.g. dl_file_url_map.  */
static slist *downloaded_html_files;
/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;              /* the URL to retrieve */
  const char *referer;          /* the referring document */
  int depth;                    /* the depth at which the URL was found */
  struct queue_element *next;   /* next element in the FIFO chain */
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;          /* current and maximum queue length */
};
/* Create a URL queue. */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xmalloc (sizeof (*queue));
  memset (queue, '\0', sizeof (*queue));
  return queue;
}
/* Delete a URL queue. */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}
/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth)
{
  struct queue_element *qel = xmalloc (sizeof (*qel));
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}
/* Take a URL out of the queue.  Return 1 if this operation succeeded,
   or 0 if the queue is empty.  */

static int
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return 0;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return 1;
}
static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
                                  struct url *, struct hash_table *));
/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree, on the
   other hand, implements breadth-first traversal of the tree, which
   results in a much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue. */
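/* Two properties follow from this scheme: the queue is drained in FIFO
   order, so pages are fetched breadth-first, and the blacklist below
   guarantees that a given URL is enqueued (and therefore downloaded)
   at most once per run.  */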
uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;
  /* The queue of URLs we need to load. */
  struct url_queue *queue = url_queue_new ();

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist = make_string_hash_table (0);

  /* We'll need various components of this, so better get it over with
     now.  */
  struct url *start_url_parsed = url_parse (start_url, NULL);

  url_enqueue (queue, xstrdup (start_url), NULL, 0);
  string_set_add (blacklist, start_url);
  while (1)
    {
      int descend = 0;
      char *url, *referer, *file = NULL;
      int depth;
      boolean dash_p_leaf_HTML = FALSE;

      if (downloaded_exceeds_quota ())
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue. */

      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth))
        break;
      /* And download it. */

      {
        int dt = 0;
        char *redirected = NULL;
        int oldrec = opt.recursive;

        opt.recursive = 0;
        status = retrieve_url (url, &file, &redirected, NULL, &dt);
        opt.recursive = oldrec;

        if (file && status == RETROK
            && (dt & RETROKF) && (dt & TEXTHTML))
          descend = 1;

        if (redirected)
          {
            /* We have been redirected; from now on treat the
               redirected URL as the one we are working with.  */
            xfree (url);
            url = redirected;
          }
      }
238 && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
240 if (opt.page_requisites && depth == opt.reclevel)
241 /* When -p is specified, we can do one more partial
242 recursion from the "leaf nodes" on the HTML document
243 tree. The recursion is partial in that we won't
244 traverse any <A> or <AREA> tags, nor any <LINK> tags
245 except for <LINK REL="stylesheet">. */
246 dash_p_leaf_HTML = TRUE;
249 /* Either -p wasn't specified or it was and we've
250 already gone the one extra (pseudo-)level that it
251 affords us, so we need to bail out. */
252 DEBUGP (("Not descending further; at depth %d, max. %d.\n",
253 depth, opt.reclevel));
      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains. */

      if (descend)
        {
          int meta_disallow_follow = 0;
          struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
                                                   &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
              free_urlpos (children);
              children = NULL;
            }
          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL);
              assert (url_parsed != NULL);

              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (descend_url_p (child, url_parsed, depth, start_url_parsed,
                                     blacklist))
                    {
                      url_enqueue (queue, xstrdup (child->url->url),
                                   xstrdup (url), depth + 1);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice. */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              url_free (url_parsed);
              free_urlpos (children);
            }
        }
      if (opt.delete_after || (file && !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             otherwise rejected (e.g. by -R) HTML file just so we
             could harvest its hyperlinks -- in either case, delete
             the local file. */
          DEBUGP (("Removing file due to %s in retrieve_tree():\n",
                   opt.delete_after ? "--delete-after" :
                   "recursive rejection criteria"));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
        }

      xfree (url);
      FREE_MAYBE (referer);
      FREE_MAYBE (file);
    }
  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3;
    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
      {
        xfree (d1);
        FREE_MAYBE (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}
/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */
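/* For example, a URL that robots.txt forbids is added to BLACKLIST in
   the robots check below, so a second encounter of the same URL is
   rejected by the cheap string_set_contains lookup at the top of this
   function instead of repeating the robots matching.  */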
static int
descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
               struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;
  char *suf;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      DEBUGP (("Already on the black list.\n"));
      goto out;
    }
  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
        gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory-consuming tests should be put later on
     the list.  */
  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (u->scheme != SCHEME_HTTP
      && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out.  */
  if (u->scheme == SCHEME_HTTP)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }
  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out.  */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for -p leaf retrievals. */
  if (opt.no_parent
      && u->scheme == parent->scheme
      && 0 == strcasecmp (u->host, parent->host)
      && u->port == parent->port)
    {
      if (!frontcmp (parent->dir, u->dir))
        {
          DEBUGP (("Trying to escape the root directory with no_parent in effect.\n"));
          goto out;
        }
    }
  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists. */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir, ALLABS))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }
  /* 6. Check for acceptance/rejection rules.  We ignore these rules
     for HTML documents because they might lead to other files which
     need to be downloaded.  Of course, we don't know which
     documents are HTML before downloading them, so we guess.

     A file is subject to acceptance/rejection rules if:

     * u->file is not "" (i.e. it is not a directory)
     and either:
       + there is no file suffix,
       + or there is a suffix, but is not "html" or "htm",
       + both:
         - recursion is not infinite,
         - and we are at its very end. */

  if (u->file[0] != '\0'
      && ((suf = suffix (url)) == NULL
          || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
          || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
    {
      if (!acceptable (u->file))
        {
          DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                   url, u->file));
          goto out;
        }
    }
  if (u->scheme == parent->scheme)
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }
  if (opt.use_robots && u->scheme == SCHEME_HTTP)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile))
            {
              specs = res_parse_from_file (rfile);
              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them. */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say. */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }
  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  return 1;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return 0;
}
/* Register that URL has been successfully downloaded to FILE. */
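/* For example (hypothetical names): after "http://server/dir/foo.gif"
   has been saved as the local file "server/dir/foo.gif",
   dl_file_url_map maps that file name back to the URL, and
   dl_url_file_map maps the URL to the file name.  The actual local
   name depends on options such as -nd, -P and --cut-dirs.  */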
void
register_download (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;

  if (!dl_file_url_map)
    dl_file_url_map = make_string_hash_table (0);
  if (!dl_url_file_map)
    dl_url_file_map = make_string_hash_table (0);

  if (!hash_table_contains (dl_file_url_map, file))
    hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
  if (!hash_table_contains (dl_url_file_map, url))
    hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}
/* Register that FROM has been redirected to TO.  This assumes that TO
   is successfully downloaded and already registered using
   register_download() above. */
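/* For instance (sketch): if "http://host/dir" was redirected to
   "http://host/dir/" and the latter was downloaded and registered,
   calling register_redirection with those two URLs makes both of them
   map to the same local file in dl_url_file_map, so links written in
   either form can be converted.  */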
void
register_redirection (const char *from, const char *to)
{
  char *file;

  if (!opt.convert_links)
    return;

  file = hash_table_get (dl_url_file_map, to);
  assert (file != NULL);
  if (!hash_table_contains (dl_url_file_map, from))
    hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
}
/* Register that URL corresponds to the HTML file FILE. */

void
register_html (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  downloaded_html_files = slist_prepend (downloaded_html_files, file);
}
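/* Note that slist_prepend builds the list in reverse order of
   download; convert_all_links below puts it back with slist_nreverse
   before walking it, so files are reprocessed in the order they were
   retrieved.  */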
/* convert_links() is called from recursive_retrieve() after we're
   done with an HTML file.  This call to convert_links is not complete
   because it converts only the downloaded files, and Wget cannot know
   which files will be downloaded afterwards.  So, if we have file
   fileone.html with:

   <a href="/c/something.gif">

   and /c/something.gif was not downloaded because it exceeded the
   recursion depth, the reference will *not* be changed.

   However, later we can encounter /c/something.gif from an "upper"
   level HTML (let's call it filetwo.html), and it gets downloaded.

   But now we have a problem because /c/something.gif will be
   correctly transformed in filetwo.html, but not in fileone.html,
   since Wget could not have known that /c/something.gif will be
   downloaded in the future.

   This is why Wget must, after the whole retrieval, call
   convert_all_links to go once more through the entire list of
   retrieved HTMLs, and re-convert them.

   All the downloaded HTMLs are kept in downloaded_html_files, and the
   downloaded URLs in dl_url_file_map.  From these two sources the
   information needed for conversion is extracted.  */
void
convert_all_links (void)
{
  slist *html;
  struct wget_timer *timer;
  long msecs;
  int file_count = 0;

  timer = wtimer_new ();

  /* Destructively reverse downloaded_html_files to get it in the right order.
     register_html() used slist_prepend() consistently. */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      struct urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));

      /* Determine the URL of the HTML file.  get_urls_html will need
         it.  */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
        DEBUGP (("It should correspond to %s.\n", url));
      else
        DEBUGP (("I cannot find the corresponding URL.\n"));
      /* Parse the HTML file... */
      urls = get_urls_html (html->string, url, FALSE, NULL);

      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files. */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;
          struct url *u = cur_url->url;

          if (cur_url->link_base_p)
            {
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?) */
              cur_url->convert = CO_NULLIFY_BASE;
              continue;
            }

          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS. */
          local_name = hash_table_get (dl_url_file_map, u->url);
          if (local_name)
            DEBUGP (("%s marked for conversion, local %s\n",
                     u->url, local_name));

          /* Decide on the conversion type. */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.) */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally. */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
            }
        }
      /* Convert the links in the file. */
      convert_links (html->string, urls);
      ++file_count;

      free_urlpos (urls);
    }

  msecs = wtimer_elapsed (timer);
  wtimer_delete (timer);
  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
             file_count, (double)msecs / 1000);
}
/* Cleanup the data structures associated with recursive retrieving
   (the variables above). */

void
recursive_cleanup (void)
{
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
}