/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <sys/types.h>
extern char *version_string;

static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  This list should only be traversed
   in order.  If you need to check whether a file has been downloaded,
   use a hash table, e.g. dl_file_url_map.  */
static slist *downloaded_html_files;
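
/* A membership test against one of these hash tables is a plain O(1)
   lookup.  A minimal sketch (FILE here stands for any local file name;
   this is illustrative and not part of the original code):

     if (dl_file_url_map && hash_table_contains (dl_file_url_map, FILE))
       ... FILE was downloaded during this run ...  */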

/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;
  const char *referer;
  int depth;
  struct queue_element *next;
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};

/* Create a URL queue. */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xmalloc (sizeof (*queue));
  memset (queue, '\0', sizeof (*queue));
  return queue;
}

/* Delete a URL queue. */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}

/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth)
{
  struct queue_element *qel = xmalloc (sizeof (*qel));
  qel->referer = referer;

  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}

/* Take a URL out of the queue.  Return 1 if this operation succeeded,
   or 0 if the queue is empty.  */

static int
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return 0;

  queue->head = queue->head->next;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  return 1;
}

static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
                                  struct url *, struct hash_table *));

/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree on the
   other hand implements breadth-first traversal of the tree, which
   results in a much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue. */
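
/* For example, with `wget -r -l 2 URL', every link found directly on
   START_URL (depth 1) is dequeued and downloaded before any link
   discovered inside those depth-1 pages (depth 2), because children
   are always appended to the tail of the FIFO queue with depth + 1.  */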

uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue = url_queue_new ();

  /* The URLs we decided we don't want to load. */
  struct hash_table *blacklist = make_string_hash_table (0);

  /* We'll need various components of this, so better get it over with
     now. */
  struct url *start_url_parsed = url_parse (start_url, NULL);

  url_enqueue (queue, xstrdup (start_url), NULL, 0);
  string_set_add (blacklist, start_url);

  while (1)
    {
      char *url, *referer, *file = NULL;
      int depth;
      boolean dash_p_leaf_HTML = FALSE;

      if (downloaded_exceeds_quota ())
        break;

      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue. */

      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth))
        break;

      /* And download it. */

        char *redirected = NULL;
        int oldrec = opt.recursive;

        opt.recursive = 0;
        status = retrieve_url (url, &file, &redirected, NULL, &dt);
        opt.recursive = oldrec;

      if (file && status == RETROK
          && (dt & RETROKF) && (dt & TEXTHTML))

          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        if (opt.page_requisites && depth == opt.reclevel)
          /* When -p is specified, we can do one more partial
             recursion from the "leaf nodes" on the HTML document
             tree.  The recursion is partial in that we won't
             traverse any <A> or <AREA> tags, nor any <LINK> tags
             except for <LINK REL="stylesheet">. */
          /* #### This would be the place to implement the TODO
             entry saying that -p should do two more hops on
             <FRAMESET> pages. */
          dash_p_leaf_HTML = TRUE;
        else
          /* Either -p wasn't specified or it was and we've
             already gone the one extra (pseudo-)level that it
             affords us, so we need to bail out. */
          DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                   depth, opt.reclevel));

      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains. */

          int meta_disallow_follow = 0;
          struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
                                                   &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            free_urlpos (children);

          struct urlpos *child = children;
          struct url *url_parsed = url_parse (url, NULL);
          assert (url_parsed != NULL);

          for (; child; child = child->next)
            {
              if (child->ignore_when_downloading)
                continue;
              if (descend_url_p (child, url_parsed, depth, start_url_parsed,
                                 blacklist))
                {
                  url_enqueue (queue, xstrdup (child->url->url),
                               xstrdup (url), depth + 1);
                  /* We blacklist the URL we have enqueued, because we
                     don't want to enqueue (and hence download) the
                     same URL twice. */
                  string_set_add (blacklist, child->url->url);
                }
            }

          url_free (url_parsed);
          free_urlpos (children);

      if (opt.delete_after || (file && !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             otherwise rejected (e.g. by -R) HTML file just so we
             could harvest its hyperlinks -- in either case, delete
             the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   "recursive rejection criteria"));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
        }

      FREE_MAYBE (referer);

  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3;
    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
      {
        xfree (d1);
        FREE_MAYBE (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}

/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.  */

static int
descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
               struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    DEBUGP (("Already on the black list.\n"));

  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
        gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory-consuming tests should be put later on
     the list.  */

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (u->scheme != SCHEME_HTTP
      && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    DEBUGP (("Not following non-HTTP schemes.\n"));

  /* 2. If it is an absolute link and they are not followed, throw it
     out. */
  if (u->scheme == SCHEME_HTTP)
    if (opt.relative_only && !upos->link_relative_p)
      DEBUGP (("It doesn't really look like a relative link.\n"));

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out. */
  if (!accept_domain (u))
    DEBUGP (("The domain was not accepted.\n"));

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for -p leaf retrievals. */
  if (opt.no_parent
      && u->scheme == parent->scheme
      && 0 == strcasecmp (u->host, parent->host)
      && u->port == parent->port)
    if (!frontcmp (parent->dir, u->dir))
      DEBUGP (("Trying to escape the root directory with no_parent in effect.\n"));

  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists. */
  if (opt.includes || opt.excludes)
    if (!accdir (u->dir, ALLABS))
      DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));

    /* Check for acceptance/rejection rules.  We ignore these rules
       for HTML documents because they might lead to other files which
       need to be downloaded.  Of course, we don't know which
       documents are HTML before downloading them, so we guess.

       A file is subject to acceptance/rejection rules if:

       * u->file is not "" (i.e. it is not a directory)
       and either:
         + there is no file suffix,
         + or there is a suffix, but it is not "html" or "htm",
         + both:
           - recursion is not infinite,
           - and we are at its very end. */
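
    /* A concrete (hypothetical) illustration: with `-A "*.jpg"', a link
       to photo.jpg is tested against the accept list right here, whereas
       page.html is still downloaded -- it may lead to acceptable files --
       and is only removed afterwards by the rejection branch in
       retrieve_tree.  */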

    if (u->file[0] != '\0'
        && ((suf = suffix (url)) == NULL
            || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
            || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
      if (!acceptable (u->file))
        DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                 url, u->file));

  if (u->scheme == parent->scheme)
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
               u->host, parent->host));

  if (opt.use_robots && u->scheme == SCHEME_HTTP)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);

      if (res_retrieve_file (url, &rfile))
        specs = res_parse_from_file (rfile);
      else
        /* If we cannot get real specs, at least produce
           dummy ones so that we can register them and stop
           trying to retrieve them. */
        specs = res_parse ("", 0);

      res_register_specs (u->host, u->port, specs);

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say. */
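      /* For instance (a hypothetical server, purely for illustration),
         a robots.txt saying

             User-agent: *
             Disallow: /private/

         makes res_match_path fail for any u->path under /private/, so
         such URLs are rejected here.  */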
      if (!res_match_path (specs, u->path))
        DEBUGP (("Not following %s because robots.txt forbids it.\n", url));

  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  string_set_add (blacklist, url);

  DEBUGP (("Decided NOT to load it.\n"));

/* Register that URL has been successfully downloaded to FILE. */

void
register_download (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  if (!dl_file_url_map)
    dl_file_url_map = make_string_hash_table (0);
  if (!dl_url_file_map)
    dl_url_file_map = make_string_hash_table (0);

  if (!hash_table_contains (dl_file_url_map, file))
    hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
  if (!hash_table_contains (dl_url_file_map, url))
    hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}
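
/* For illustration (hypothetical names, not from the original code):
   after downloading http://example.com/index.html into the local file
   "example.com/index.html", the two maps hold mirror entries:

     dl_file_url_map: "example.com/index.html" -> "http://example.com/index.html"
     dl_url_file_map: "http://example.com/index.html" -> "example.com/index.html"  */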

/* Register that FROM has been redirected to TO.  This assumes that TO
   has been successfully downloaded and already registered using
   register_download() above. */

void
register_redirection (const char *from, const char *to)
{
  char *file;

  if (!opt.convert_links)
    return;

  file = hash_table_get (dl_url_file_map, to);
  assert (file != NULL);
  if (!hash_table_contains (dl_url_file_map, from))
    hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
}
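
/* E.g. (hypothetical URLs): if http://host/old redirects to
   http://host/new, which was saved as FILE, then both URLs now map to
   FILE in dl_url_file_map, so links pointing at either spelling are
   later converted to the same local file.  */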

/* Register that URL corresponds to the HTML file FILE. */

void
register_html (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  downloaded_html_files = slist_prepend (downloaded_html_files, file);
}

/* convert_links() is called from recursive_retrieve() after we're
   done with an HTML file.  This call to convert_links is not complete
   because it converts only the downloaded files, and Wget cannot know
   which files will be downloaded afterwards.  So, if we have a file
   fileone.html containing:

       <a href="/c/something.gif">

   and /c/something.gif was not downloaded because it exceeded the
   recursion depth, the reference will *not* be changed.

   However, later we can encounter /c/something.gif from an "upper"
   level HTML (let's call it filetwo.html), and it gets downloaded.

   But now we have a problem because /c/something.gif will be
   correctly transformed in filetwo.html, but not in fileone.html,
   since Wget could not have known that /c/something.gif would be
   downloaded in the future.

   This is why Wget must, after the whole retrieval, call
   convert_all_links to go once more through the entire list of
   retrieved HTMLs, and re-convert them.

   All the downloaded HTMLs are kept in downloaded_html_files, and
   downloaded URLs in urls_downloaded.  From these two lists the
   information is extracted.  */

void
convert_all_links (void)
{
  slist *html;

  /* Destructively reverse downloaded_html_files to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently. */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      struct urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));

      /* Determine the URL of the HTML file.  get_urls_html will need
         it. */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
        DEBUGP (("It should correspond to %s.\n", url));
      else
        DEBUGP (("I cannot find the corresponding URL.\n"));

      /* Parse the HTML file... */
      urls = get_urls_html (html->string, url, FALSE, NULL);

      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files. */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;
          struct url *u = cur_url->url;

          if (cur_url->link_base_p)
            {
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?) */
              cur_url->convert = CO_NULLIFY_BASE;
              continue;
            }

          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS. */
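          /* A hypothetical example: a reference to
             http://host/img/logo.png becomes a relative link such as
             "../img/logo.png" if that file was saved locally
             (CO_CONVERT_TO_RELATIVE), while a bare "logo.png" that was
             not downloaded is expanded to the full
             http://host/img/logo.png (CO_CONVERT_TO_COMPLETE).  */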
          local_name = hash_table_get (dl_url_file_map, u->url);
          if (local_name)
            DEBUGP (("%s marked for conversion, local %s\n",
                     u->url, local_name));

          /* Decide on the conversion type. */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.) */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally. */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
            }

      /* Convert the links in the file. */
      convert_links (html->string, urls);

/* Cleanup the data structures associated with recursive retrieving
   (the variables above). */

void
recursive_cleanup (void)
{
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
}