X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Frecur.c;h=741ca823094a75c0fd760f44cd126dba88e81216;hb=766df9d4e9392045a4e5c730ed81e599b509557a;hp=11c30a2157e00ba2f433cba0e98045b3bb9bb2a4;hpb=d5be8ecca466601bda9b81c28a79077fbda6ccde;p=wget diff --git a/src/recur.c b/src/recur.c index 11c30a21..741ca823 100644 --- a/src/recur.c +++ b/src/recur.c @@ -1,12 +1,13 @@ /* Handling of recursive HTTP retrieving. - Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc. + Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. This file is part of GNU Wget. GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. +the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. GNU Wget is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -14,607 +15,674 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with Wget; if not, write to the Free Software -Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +along with Wget. If not, see . -#include +Additional permission under GNU GPL version 3 section 7 + +If you modify this program, or any covered work, by linking or +combining it with the OpenSSL project's OpenSSL library (or a +modified version of that library), containing parts covered by the +terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +grants you additional permission to convey the resulting work. +Corresponding Source for a non-source form of such a combination +shall include the source code for the parts of OpenSSL used as well +as that of the covered work. */ + +#include "wget.h" #include #include -#ifdef HAVE_STRING_H -# include -#else -# include -#endif /* HAVE_STRING_H */ +#include #ifdef HAVE_UNISTD_H # include #endif /* HAVE_UNISTD_H */ #include #include -#include -#include "wget.h" #include "url.h" #include "recur.h" #include "utils.h" #include "retr.h" #include "ftp.h" -#include "fnmatch.h" #include "host.h" #include "hash.h" #include "res.h" +#include "convert.h" +#include "html-url.h" +#include "css-url.h" +#include "spider.h" + +/* Functions for maintaining the URL queue. */ + +struct queue_element { + const char *url; /* the URL to download */ + const char *referer; /* the referring document */ + int depth; /* the depth */ + bool html_allowed; /* whether the document is allowed to + be treated as HTML. */ + bool css_allowed; /* whether the document is allowed to + be treated as CSS. */ + struct queue_element *next; /* next element in queue */ +}; + +struct url_queue { + struct queue_element *head; + struct queue_element *tail; + int count, maxcount; +}; + +/* Create a URL queue. */ + +static struct url_queue * +url_queue_new (void) +{ + struct url_queue *queue = xnew0 (struct url_queue); + return queue; +} + +/* Delete a URL queue. */ + +static void +url_queue_delete (struct url_queue *queue) +{ + xfree (queue); +} -#ifndef errno -extern int errno; -#endif +/* Enqueue a URL in the queue. The queue is FIFO: the items will be + retrieved ("dequeued") from the queue in the order they were placed + into it. */ -extern char *version_string; +static void +url_enqueue (struct url_queue *queue, + const char *url, const char *referer, int depth, + bool html_allowed, bool css_allowed) +{ + struct queue_element *qel = xnew (struct queue_element); + qel->url = url; + qel->referer = referer; + qel->depth = depth; + qel->html_allowed = html_allowed; + qel->css_allowed = css_allowed; + qel->next = NULL; + + ++queue->count; + if (queue->count > queue->maxcount) + queue->maxcount = queue->count; + + DEBUGP (("Enqueuing %s at depth %d\n", url, depth)); + DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); + + if (queue->tail) + queue->tail->next = qel; + queue->tail = qel; + + if (!queue->head) + queue->head = queue->tail; +} + +/* Take a URL out of the queue. Return true if this operation + succeeded, or false if the queue is empty. */ + +static bool +url_dequeue (struct url_queue *queue, + const char **url, const char **referer, int *depth, + bool *html_allowed, bool *css_allowed) +{ + struct queue_element *qel = queue->head; -static struct hash_table *dl_file_url_map; -static struct hash_table *dl_url_file_map; + if (!qel) + return false; -/* List of HTML files downloaded in this Wget run. Used for link - conversion after Wget is done. */ -static slist *downloaded_html_files; + queue->head = queue->head->next; + if (!queue->head) + queue->tail = NULL; -/* List of undesirable-to-load URLs. */ -static struct hash_table *undesirable_urls; + *url = qel->url; + *referer = qel->referer; + *depth = qel->depth; + *html_allowed = qel->html_allowed; + *css_allowed = qel->css_allowed; + + --queue->count; + + DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth)); + DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); + + xfree (qel); + return true; +} + +static bool download_child_p (const struct urlpos *, struct url *, int, + struct url *, struct hash_table *); +static bool descend_redirect_p (const char *, const char *, int, + struct url *, struct hash_table *); -/* Current recursion depth. */ -static int depth; -/* Base directory we're recursing from (used by no_parent). */ -static char *base_dir; +/* Retrieve a part of the web beginning with START_URL. This used to + be called "recursive retrieval", because the old function was + recursive and implemented depth-first search. retrieve_tree on the + other hand implements breadth-search traversal of the tree, which + results in much nicer ordering of downloads. -static int first_time = 1; + The algorithm this function uses is simple: + 1. put START_URL in the queue. + 2. while there are URLs in the queue: -/* Cleanup the data structures associated with recursive retrieving - (the variables above). */ -void -recursive_cleanup (void) + 3. get next URL from the queue. + 4. download it. + 5. if the URL is HTML and its depth does not exceed maximum depth, + get the list of URLs embedded therein. + 6. for each of those URLs do the following: + + 7. if the URL is not one of those downloaded before, and if it + satisfies the criteria specified by the various command-line + options, add it to the queue. */ + +uerr_t +retrieve_tree (const char *start_url) { - if (undesirable_urls) - { - string_set_free (undesirable_urls); - undesirable_urls = NULL; - } - if (dl_file_url_map) + uerr_t status = RETROK; + + /* The queue of URLs we need to load. */ + struct url_queue *queue; + + /* The URLs we do not wish to enqueue, because they are already in + the queue, but haven't been downloaded yet. */ + struct hash_table *blacklist; + + int up_error_code; + struct url *start_url_parsed = url_parse (start_url, &up_error_code); + + if (!start_url_parsed) { - free_keys_and_values (dl_file_url_map); - hash_table_destroy (dl_file_url_map); - dl_file_url_map = NULL; + char *error = url_error (start_url, up_error_code); + logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, error); + xfree (error); + return URLERROR; } - if (dl_url_file_map) + + queue = url_queue_new (); + blacklist = make_string_hash_table (0); + + /* Enqueue the starting URL. Use start_url_parsed->url rather than + just URL so we enqueue the canonical form of the URL. */ + url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false); + string_set_add (blacklist, start_url_parsed->url); + + while (1) { - free_keys_and_values (dl_url_file_map); - hash_table_destroy (dl_url_file_map); - dl_url_file_map = NULL; + bool descend = false; + char *url, *referer, *file = NULL; + int depth; + bool html_allowed, css_allowed; + bool is_css = false; + bool dash_p_leaf_HTML = false; + + if (opt.quota && total_downloaded_bytes > opt.quota) + break; + if (status == FWRITEERR) + break; + + /* Get the next URL from the queue... */ + + if (!url_dequeue (queue, + (const char **)&url, (const char **)&referer, + &depth, &html_allowed, &css_allowed)) + break; + + /* ...and download it. Note that this download is in most cases + unconditional, as download_child_p already makes sure a file + doesn't get enqueued twice -- and yet this check is here, and + not in download_child_p. This is so that if you run `wget -r + URL1 URL2', and a random URL is encountered once under URL1 + and again under URL2, but at a different (possibly smaller) + depth, we want the URL's children to be taken into account + the second time. */ + if (dl_url_file_map && hash_table_contains (dl_url_file_map, url)) + { + file = xstrdup (hash_table_get (dl_url_file_map, url)); + + DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n", + url, file)); + + /* this sucks, needs to be combined! */ + if (html_allowed + && downloaded_html_set + && string_set_contains (downloaded_html_set, file)) + { + descend = true; + is_css = false; + } + if (css_allowed + && downloaded_css_set + && string_set_contains (downloaded_css_set, file)) + { + descend = true; + is_css = true; + } + } + else + { + int dt = 0; + char *redirected = NULL; + + status = retrieve_url (url, &file, &redirected, referer, &dt, false); + + if (html_allowed && file && status == RETROK + && (dt & RETROKF) && (dt & TEXTHTML)) + { + descend = true; + is_css = false; + } + + /* a little different, css_allowed can override content type + lots of web servers serve css with an incorrect content type + */ + if (file && status == RETROK + && (dt & RETROKF) && + ((dt & TEXTCSS) || css_allowed)) + { + descend = true; + is_css = true; + } + + if (redirected) + { + /* We have been redirected, possibly to another host, or + different path, or wherever. Check whether we really + want to follow it. */ + if (descend) + { + if (!descend_redirect_p (redirected, url, depth, + start_url_parsed, blacklist)) + descend = false; + else + /* Make sure that the old pre-redirect form gets + blacklisted. */ + string_set_add (blacklist, url); + } + + xfree (url); + url = redirected; + } + } + + if (opt.spider) + { + visited_url (url, referer); + } + + if (descend + && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION) + { + if (opt.page_requisites + && (depth == opt.reclevel || depth == opt.reclevel + 1)) + { + /* When -p is specified, we are allowed to exceed the + maximum depth, but only for the "inline" links, + i.e. those that are needed to display the page. + Originally this could exceed the depth at most by + one, but we allow one more level so that the leaf + pages that contain frames can be loaded + correctly. */ + dash_p_leaf_HTML = true; + } + else + { + /* Either -p wasn't specified or it was and we've + already spent the two extra (pseudo-)levels that it + affords us, so we need to bail out. */ + DEBUGP (("Not descending further; at depth %d, max. %d.\n", + depth, opt.reclevel)); + descend = false; + } + } + + /* If the downloaded document was HTML or CSS, parse it and enqueue the + links it contains. */ + + if (descend) + { + bool meta_disallow_follow = false; + struct urlpos *children + = is_css ? get_urls_css_file (file, url) : + get_urls_html (file, url, &meta_disallow_follow); + + if (opt.use_robots && meta_disallow_follow) + { + free_urlpos (children); + children = NULL; + } + + if (children) + { + struct urlpos *child = children; + struct url *url_parsed = url_parsed = url_parse (url, NULL); + char *referer_url = url; + bool strip_auth = (url_parsed != NULL + && url_parsed->user != NULL); + assert (url_parsed != NULL); + + /* Strip auth info if present */ + if (strip_auth) + referer_url = url_string (url_parsed, URL_AUTH_HIDE); + + for (; child; child = child->next) + { + if (child->ignore_when_downloading) + continue; + if (dash_p_leaf_HTML && !child->link_inline_p) + continue; + if (download_child_p (child, url_parsed, depth, start_url_parsed, + blacklist)) + { + url_enqueue (queue, xstrdup (child->url->url), + xstrdup (referer_url), depth + 1, + child->link_expect_html, + child->link_expect_css); + /* We blacklist the URL we have enqueued, because we + don't want to enqueue (and hence download) the + same URL twice. */ + string_set_add (blacklist, child->url->url); + } + } + + if (strip_auth) + xfree (referer_url); + url_free (url_parsed); + free_urlpos (children); + } + } + + if (file + && (opt.delete_after + || opt.spider /* opt.recursive is implicitely true */ + || !acceptable (file))) + { + /* Either --delete-after was specified, or we loaded this + (otherwise unneeded because of --spider or rejected by -R) + HTML file just to harvest its hyperlinks -- in either case, + delete the local file. */ + DEBUGP (("Removing file due to %s in recursive_retrieve():\n", + opt.delete_after ? "--delete-after" : + (opt.spider ? "--spider" : + "recursive rejection criteria"))); + logprintf (LOG_VERBOSE, + (opt.delete_after || opt.spider + ? _("Removing %s.\n") + : _("Removing %s since it should be rejected.\n")), + file); + if (unlink (file)) + logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno)); + logputs (LOG_VERBOSE, "\n"); + register_delete_file (file); + } + + xfree (url); + xfree_null (referer); + xfree_null (file); } - undesirable_urls = NULL; - slist_free (downloaded_html_files); - downloaded_html_files = NULL; - FREE_MAYBE (base_dir); - first_time = 1; -} -/* Reset FIRST_TIME to 1, so that some action can be taken in - recursive_retrieve(). */ -void -recursive_reset (void) -{ - first_time = 1; + /* If anything is left of the queue due to a premature exit, free it + now. */ + { + char *d1, *d2; + int d3; + bool d4, d5; + while (url_dequeue (queue, + (const char **)&d1, (const char **)&d2, &d3, &d4, &d5)) + { + xfree (d1); + xfree_null (d2); + } + } + url_queue_delete (queue); + + if (start_url_parsed) + url_free (start_url_parsed); + string_set_free (blacklist); + + if (opt.quota && total_downloaded_bytes > opt.quota) + return QUOTEXC; + else if (status == FWRITEERR) + return FWRITEERR; + else + return RETROK; } -/* The core of recursive retrieving. Endless recursion is avoided by - having all URLs stored to a linked list of URLs, which is checked - before loading any URL. That way no URL can get loaded twice. +/* Based on the context provided by retrieve_tree, decide whether a + URL is to be descended to. This is only ever called from + retrieve_tree, but is in a separate function for clarity. - The function also supports specification of maximum recursion depth - and a number of other goodies. */ -uerr_t -recursive_retrieve (const char *file, const char *this_url) + The most expensive checks (such as those for robots) are memoized + by storing these URLs to BLACKLIST. This may or may not help. It + will help if those URLs are encountered many times. */ + +static bool +download_child_p (const struct urlpos *upos, struct url *parent, int depth, + struct url *start_url_parsed, struct hash_table *blacklist) { - char *constr, *filename, *newloc; - char *canon_this_url = NULL; - int dt, inl, dash_p_leaf_HTML = FALSE; - int meta_disallow_follow; - int this_url_ftp; /* See below the explanation */ - urlpos *url_list, *cur_url; - struct url *u; - - assert (this_url != NULL); - assert (file != NULL); - /* If quota was exceeded earlier, bail out. */ - if (downloaded_exceeds_quota ()) - return QUOTEXC; - /* Cache the current URL in the list. */ - if (first_time) + struct url *u = upos->url; + const char *url = u->url; + bool u_scheme_like_http; + + DEBUGP (("Deciding whether to enqueue \"%s\".\n", url)); + + if (string_set_contains (blacklist, url)) { - /* These three operations need to be done only once per Wget - run. They should probably be at a different location. */ - if (!undesirable_urls) - undesirable_urls = make_string_hash_table (0); - - hash_table_clear (undesirable_urls); - string_set_add (undesirable_urls, this_url); - /* Enter this_url to the hash table, in original and "enhanced" form. */ - u = url_parse (this_url, NULL); - if (u) - { - string_set_add (undesirable_urls, u->url); - if (opt.no_parent) - base_dir = xstrdup (u->dir); /* Set the base dir. */ - /* Set the canonical this_url to be sent as referer. This - problem exists only when running the first time. */ - canon_this_url = xstrdup (u->url); - } - else - { - DEBUGP (("Double yuck! The *base* URL is broken.\n")); - base_dir = NULL; - } - url_free (u); - depth = 1; - first_time = 0; + if (opt.spider) + { + char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD); + DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url))); + visited_url (url, referrer); + xfree (referrer); + } + DEBUGP (("Already on the black list.\n")); + goto out; } - else - ++depth; - if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel) - /* We've exceeded the maximum recursion depth specified by the user. */ + /* Several things to check for: + 1. if scheme is not http, and we don't load it + 2. check for relative links (if relative_only is set) + 3. check for domain + 4. check for no-parent + 5. check for excludes && includes + 6. check for suffix + 7. check for same host (if spanhost is unset), with possible + gethostbyname baggage + 8. check for robots.txt + + Addendum: If the URL is FTP, and it is to be loaded, only the + domain and suffix settings are "stronger". + + Note that .html files will get loaded regardless of suffix rules + (but that is remedied later with unlink) unless the depth equals + the maximum depth. + + More time- and memory- consuming tests should be put later on + the list. */ + + /* Determine whether URL under consideration has a HTTP-like scheme. */ + u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP); + + /* 1. Schemes other than HTTP are normally not recursed into. */ + if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp)) { - if (opt.page_requisites && depth <= opt.reclevel + 1) - /* When -p is specified, we can do one more partial recursion from the - "leaf nodes" on the HTML document tree. The recursion is partial in - that we won't traverse any or tags, nor any tags - except for . */ - dash_p_leaf_HTML = TRUE; - else - /* Either -p wasn't specified or it was and we've already gone the one - extra (pseudo-)level that it affords us, so we need to bail out. */ - { - DEBUGP (("Recursion depth %d exceeded max. depth %d.\n", - depth, opt.reclevel)); - --depth; - return RECLEVELEXC; - } + DEBUGP (("Not following non-HTTP schemes.\n")); + goto out; } - /* Determine whether this_url is an FTP URL. If it is, it means - that the retrieval is done through proxy. In that case, FTP - links will be followed by default and recursion will not be - turned off when following them. */ - this_url_ftp = (url_scheme (this_url) == SCHEME_FTP); + /* 2. If it is an absolute link and they are not followed, throw it + out. */ + if (u_scheme_like_http) + if (opt.relative_only && !upos->link_relative_p) + { + DEBUGP (("It doesn't really look like a relative link.\n")); + goto out; + } + + /* 3. If its domain is not to be accepted/looked-up, chuck it + out. */ + if (!accept_domain (u)) + { + DEBUGP (("The domain was not accepted.\n")); + goto out; + } - /* Get the URL-s from an HTML file: */ - url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url, - dash_p_leaf_HTML, &meta_disallow_follow); + /* 4. Check for parent directory. - if (opt.use_robots && meta_disallow_follow) + If we descended to a different host or changed the scheme, ignore + opt.no_parent. Also ignore it for documents needed to display + the parent page when in -p mode. */ + if (opt.no_parent + && schemes_are_similar_p (u->scheme, start_url_parsed->scheme) + && 0 == strcasecmp (u->host, start_url_parsed->host) + && u->port == start_url_parsed->port + && !(opt.page_requisites && upos->link_inline_p)) { - /* The META tag says we are not to follow this file. Respect - that. */ - free_urlpos (url_list); - url_list = NULL; + if (!subdir_p (start_url_parsed->dir, u->dir)) + { + DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n", + u->dir, start_url_parsed->dir)); + goto out; + } } - /* Decide what to do with each of the URLs. A URL will be loaded if - it meets several requirements, discussed later. */ - for (cur_url = url_list; cur_url; cur_url = cur_url->next) + /* 5. If the file does not match the acceptance list, or is on the + rejection list, chuck it out. The same goes for the directory + exclusion and inclusion lists. */ + if (opt.includes || opt.excludes) { - /* If quota was exceeded earlier, bail out. */ - if (downloaded_exceeds_quota ()) - break; - /* Parse the URL for convenient use in other functions, as well - as to get the optimized form. It also checks URL integrity. */ - u = url_parse (cur_url->url, NULL); - if (!u) - { - DEBUGP (("Yuck! A bad URL.\n")); - continue; - } - assert (u->url != NULL); - constr = xstrdup (u->url); - - /* Several checkings whether a file is acceptable to load: - 1. check if URL is ftp, and we don't load it - 2. check for relative links (if relative_only is set) - 3. check for domain - 4. check for no-parent - 5. check for excludes && includes - 6. check for suffix - 7. check for same host (if spanhost is unset), with possible - gethostbyname baggage - 8. check for robots.txt - - Addendum: If the URL is FTP, and it is to be loaded, only the - domain and suffix settings are "stronger". - - Note that .html and (yuck) .htm will get loaded regardless of - suffix rules (but that is remedied later with unlink) unless - the depth equals the maximum depth. - - More time- and memory- consuming tests should be put later on - the list. */ - - /* inl is set if the URL we are working on (constr) is stored in - undesirable_urls. Using it is crucial to avoid unnecessary - repeated continuous hits to the hash table. */ - inl = string_set_contains (undesirable_urls, constr); - - /* If it is FTP, and FTP is not followed, chuck it out. */ - if (!inl) - if (u->scheme == SCHEME_FTP && !opt.follow_ftp && !this_url_ftp) - { - DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n")); - string_set_add (undesirable_urls, constr); - inl = 1; - } - /* If it is absolute link and they are not followed, chuck it - out. */ - if (!inl && u->scheme != SCHEME_FTP) - if (opt.relative_only && !cur_url->link_relative_p) - { - DEBUGP (("It doesn't really look like a relative link.\n")); - string_set_add (undesirable_urls, constr); - inl = 1; - } - /* If its domain is not to be accepted/looked-up, chuck it out. */ - if (!inl) - if (!accept_domain (u)) - { - DEBUGP (("I don't like the smell of that domain.\n")); - string_set_add (undesirable_urls, constr); - inl = 1; - } - /* Check for parent directory. */ - if (!inl && opt.no_parent - /* If the new URL is FTP and the old was not, ignore - opt.no_parent. */ - && !(!this_url_ftp && u->scheme == SCHEME_FTP)) - { - /* Check for base_dir first. */ - if (!(base_dir && frontcmp (base_dir, u->dir))) - { - /* Failing that, check for parent dir. */ - struct url *ut = url_parse (this_url, NULL); - if (!ut) - DEBUGP (("Double yuck! The *base* URL is broken.\n")); - else if (!frontcmp (ut->dir, u->dir)) - { - /* Failing that too, kill the URL. */ - DEBUGP (("Trying to escape parental guidance with no_parent on.\n")); - string_set_add (undesirable_urls, constr); - inl = 1; - } - url_free (ut); - } - } - /* If the file does not match the acceptance list, or is on the - rejection list, chuck it out. The same goes for the - directory exclude- and include- lists. */ - if (!inl && (opt.includes || opt.excludes)) - { - if (!accdir (u->dir, ALLABS)) - { - DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir)); - string_set_add (undesirable_urls, constr); - inl = 1; - } - } - if (!inl) - { - char *suf = NULL; - /* We check for acceptance/rejection rules only for non-HTML - documents. Since we don't know whether they really are - HTML, it will be deduced from (an OR-ed list): - - 1) u->file is "" (meaning it is a directory) - 2) suffix exists, AND: - a) it is "html", OR - b) it is "htm" - - If the file *is* supposed to be HTML, it will *not* be - subject to acc/rej rules, unless a finite maximum depth has - been specified and the current depth is the maximum depth. */ - if (! - (!*u->file - || (((suf = suffix (constr)) != NULL) - && ((!strcmp (suf, "html") || !strcmp (suf, "htm")) - && ((opt.reclevel != INFINITE_RECURSION) && - (depth != opt.reclevel)))))) - { - if (!acceptable (u->file)) - { - DEBUGP (("%s (%s) does not match acc/rej rules.\n", - constr, u->file)); - string_set_add (undesirable_urls, constr); - inl = 1; - } - } - FREE_MAYBE (suf); - } - /* Optimize the URL (which includes possible DNS lookup) only - after all other possibilities have been exhausted. */ - if (!inl) - { - if (!opt.simple_check) - { - /* Find the "true" host. */ - char *host = realhost (u->host); - xfree (u->host); - u->host = host; - - /* Refresh the printed representation of the URL. */ - xfree (u->url); - u->url = url_string (u, 0); - } - else - { - char *p; - /* Just lowercase the hostname. */ - for (p = u->host; *p; p++) - *p = TOLOWER (*p); - xfree (u->url); - u->url = url_string (u, 0); - } - xfree (constr); - constr = xstrdup (u->url); - /* After we have canonicalized the URL, check if we have it - on the black list. */ - if (string_set_contains (undesirable_urls, constr)) - inl = 1; - /* This line is bogus. */ - /*string_set_add (undesirable_urls, constr);*/ - - if (!inl && !((u->scheme == SCHEME_FTP) && !this_url_ftp)) - if (!opt.spanhost && this_url && !same_host (this_url, constr)) - { - DEBUGP (("This is not the same hostname as the parent's.\n")); - string_set_add (undesirable_urls, constr); - inl = 1; - } - } - /* What about robots.txt? */ - if (!inl && opt.use_robots && u->scheme == SCHEME_FTP) - { - struct robot_specs *specs = res_get_specs (u->host, u->port); - if (!specs) - { - char *rfile; - if (res_retrieve_file (constr, &rfile)) - { - specs = res_parse_from_file (rfile); - xfree (rfile); - } - else - { - /* If we cannot get real specs, at least produce - dummy ones so that we can register them and stop - trying to retrieve them. */ - specs = res_parse ("", 0); - } - res_register_specs (u->host, u->port, specs); - } - - /* Now that we have (or don't have) robots.txt specs, we can - check what they say. */ - if (!res_match_path (specs, u->path)) - { - DEBUGP (("Not following %s because robots.txt forbids it.\n", - constr)); - string_set_add (undesirable_urls, constr); - inl = 1; - } - } - - filename = NULL; - /* If it wasn't chucked out, do something with it. */ - if (!inl) - { - DEBUGP (("I've decided to load it -> ")); - /* Add it to the list of already-loaded URL-s. */ - string_set_add (undesirable_urls, constr); - /* Automatically followed FTPs will *not* be downloaded - recursively. */ - if (u->scheme == SCHEME_FTP) - { - /* Don't you adore side-effects? */ - opt.recursive = 0; - } - /* Reset its type. */ - dt = 0; - /* Retrieve it. */ - retrieve_url (constr, &filename, &newloc, - canon_this_url ? canon_this_url : this_url, &dt); - if (u->scheme == SCHEME_FTP) - { - /* Restore... */ - opt.recursive = 1; - } - if (newloc) - { - xfree (constr); - constr = newloc; - } - /* If there was no error, and the type is text/html, parse - it recursively. */ - if (dt & TEXTHTML) - { - if (dt & RETROKF) - recursive_retrieve (filename, constr); - } - else - DEBUGP (("%s is not text/html so we don't chase.\n", - filename ? filename: "(null)")); - - if (opt.delete_after || (filename && !acceptable (filename))) - /* Either --delete-after was specified, or we loaded this otherwise - rejected (e.g. by -R) HTML file just so we could harvest its - hyperlinks -- in either case, delete the local file. */ - { - DEBUGP (("Removing file due to %s in recursive_retrieve():\n", - opt.delete_after ? "--delete-after" : - "recursive rejection criteria")); - logprintf (LOG_VERBOSE, - (opt.delete_after ? _("Removing %s.\n") - : _("Removing %s since it should be rejected.\n")), - filename); - if (unlink (filename)) - logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno)); - dt &= ~RETROKF; - } - - /* If everything was OK, and links are to be converted, let's - store the local filename. */ - if (opt.convert_links && (dt & RETROKF) && (filename != NULL)) - { - cur_url->convert = CO_CONVERT_TO_RELATIVE; - cur_url->local_name = xstrdup (filename); - } - } - else - DEBUGP (("%s already in list, so we don't load.\n", constr)); - /* Free filename and constr. */ - FREE_MAYBE (filename); - FREE_MAYBE (constr); - url_free (u); - /* Increment the pbuf for the appropriate size. */ + if (!accdir (u->dir)) + { + DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir)); + goto out; + } } - if (opt.convert_links && !opt.delete_after) - /* This is merely the first pass: the links that have been - successfully downloaded are converted. In the second pass, - convert_all_links() will also convert those links that have NOT - been downloaded to their canonical form. */ - convert_links (file, url_list); - /* Free the linked list of URL-s. */ - free_urlpos (url_list); - /* Free the canonical this_url. */ - FREE_MAYBE (canon_this_url); - /* Decrement the recursion depth. */ - --depth; - if (downloaded_exceeds_quota ()) - return QUOTEXC; - else - return RETROK; -} - -void -register_download (const char *url, const char *file) -{ - if (!opt.convert_links) - return; - if (!dl_file_url_map) - dl_file_url_map = make_string_hash_table (0); - hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url)); - if (!dl_url_file_map) - dl_url_file_map = make_string_hash_table (0); - hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file)); -} -void -register_html (const char *url, const char *file) -{ - if (!opt.convert_links) - return; - downloaded_html_files = slist_prepend (downloaded_html_files, file); -} + /* 6. Check for acceptance/rejection rules. We ignore these rules + for directories (no file name to match) and for non-leaf HTMLs, + which can lead to other files that do need to be downloaded. (-p + automatically implies non-leaf because with -p we can, if + necesary, overstep the maximum depth to get the page requisites.) */ + if (u->file[0] != '\0' + && !(has_html_suffix_p (u->file) + /* The exception only applies to non-leaf HTMLs (but -p + always implies non-leaf because we can overstep the + maximum depth to get the requisites): */ + && (/* non-leaf */ + opt.reclevel == INFINITE_RECURSION + /* also non-leaf */ + || depth < opt.reclevel - 1 + /* -p, which implies non-leaf (see above) */ + || opt.page_requisites))) + { + if (!acceptable (u->file)) + { + DEBUGP (("%s (%s) does not match acc/rej rules.\n", + url, u->file)); + goto out; + } + } -/* convert_links() is called from recursive_retrieve() after we're - done with an HTML file. This call to convert_links is not complete - because it converts only the downloaded files, and Wget cannot know - which files will be downloaded afterwards. So, if we have file - fileone.html with: + /* 7. */ + if (schemes_are_similar_p (u->scheme, parent->scheme)) + if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host)) + { + DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n", + u->host, parent->host)); + goto out; + } + + /* 8. */ + if (opt.use_robots && u_scheme_like_http) + { + struct robot_specs *specs = res_get_specs (u->host, u->port); + if (!specs) + { + char *rfile; + if (res_retrieve_file (url, &rfile)) + { + specs = res_parse_from_file (rfile); + + /* Delete the robots.txt file if we chose to either delete the + files after downloading or we're just running a spider. */ + if (opt.delete_after || opt.spider) + { + logprintf (LOG_VERBOSE, "Removing %s.\n", rfile); + if (unlink (rfile)) + logprintf (LOG_NOTQUIET, "unlink: %s\n", + strerror (errno)); + } + + xfree (rfile); + } + else + { + /* If we cannot get real specs, at least produce + dummy ones so that we can register them and stop + trying to retrieve them. */ + specs = res_parse ("", 0); + } + res_register_specs (u->host, u->port, specs); + } + + /* Now that we have (or don't have) robots.txt specs, we can + check what they say. */ + if (!res_match_path (specs, u->path)) + { + DEBUGP (("Not following %s because robots.txt forbids it.\n", url)); + string_set_add (blacklist, url); + goto out; + } + } - + /* The URL has passed all the tests. It can be placed in the + download queue. */ + DEBUGP (("Decided to load it.\n")); - and /c/something.gif was not downloaded because it exceeded the - recursion depth, the reference will *not* be changed. + return true; - However, later we can encounter /c/something.gif from an "upper" - level HTML (let's call it filetwo.html), and it gets downloaded. + out: + DEBUGP (("Decided NOT to load it.\n")); - But now we have a problem because /c/something.gif will be - correctly transformed in filetwo.html, but not in fileone.html, - since Wget could not have known that /c/something.gif will be - downloaded in the future. + return false; +} - This is why Wget must, after the whole retrieval, call - convert_all_links to go once more through the entire list of - retrieved HTMLs, and re-convert them. +/* This function determines whether we will consider downloading the + children of a URL whose download resulted in a redirection, + possibly to another host, etc. It is needed very rarely, and thus + it is merely a simple-minded wrapper around download_child_p. */ - All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs - in urls_downloaded. From these two lists information is - extracted. */ -void -convert_all_links (void) +static bool +descend_redirect_p (const char *redirected, const char *original, int depth, + struct url *start_url_parsed, struct hash_table *blacklist) { - slist *html; + struct url *orig_parsed, *new_parsed; + struct urlpos *upos; + bool success; - /* Destructively reverse downloaded_html_files to get it in the right order. - recursive_retrieve() used slist_prepend() consistently. */ - downloaded_html_files = slist_nreverse (downloaded_html_files); + orig_parsed = url_parse (original, NULL); + assert (orig_parsed != NULL); - for (html = downloaded_html_files; html; html = html->next) - { - urlpos *urls, *cur_url; - char *url; - - DEBUGP (("Rescanning %s\n", html->string)); - /* Determine the URL of the HTML file. get_urls_html will need - it. */ - url = hash_table_get (dl_file_url_map, html->string); - if (url) - DEBUGP (("It should correspond to %s.\n", url)); - else - DEBUGP (("I cannot find the corresponding URL.\n")); - /* Parse the HTML file... */ - urls = get_urls_html (html->string, url, FALSE, NULL); - /* We don't respect meta_disallow_follow here because, even if - the file is not followed, we might still want to convert the - links that have been followed from other files. */ - for (cur_url = urls; cur_url; cur_url = cur_url->next) - { - char *local_name; - - /* The URL must be in canonical form to be compared. */ - struct url *u = url_parse (cur_url->url, NULL); - if (!u) - continue; - /* We decide the direction of conversion according to whether - a URL was downloaded. Downloaded URLs will be converted - ABS2REL, whereas non-downloaded will be converted REL2ABS. */ - local_name = hash_table_get (dl_url_file_map, u->url); - if (local_name) - DEBUGP (("%s marked for conversion, local %s\n", - u->url, local_name)); - /* Decide on the conversion direction. */ - if (local_name) - { - /* We've downloaded this URL. Convert it to relative - form. We do this even if the URL already is in - relative form, because our directory structure may - not be identical to that on the server (think `-nd', - `--cut-dirs', etc.) */ - cur_url->convert = CO_CONVERT_TO_RELATIVE; - cur_url->local_name = xstrdup (local_name); - } - else - { - /* We haven't downloaded this URL. If it's not already - complete (including a full host name), convert it to - that form, so it can be reached while browsing this - HTML locally. */ - if (!cur_url->link_complete_p) - cur_url->convert = CO_CONVERT_TO_COMPLETE; - cur_url->local_name = NULL; - } - url_free (u); - } - /* Convert the links in the file. */ - convert_links (html->string, urls); - /* Free the data. */ - free_urlpos (urls); - } + new_parsed = url_parse (redirected, NULL); + assert (new_parsed != NULL); + + upos = xnew0 (struct urlpos); + upos->url = new_parsed; + + success = download_child_p (upos, orig_parsed, depth, + start_url_parsed, blacklist); + + url_free (orig_parsed); + url_free (new_parsed); + xfree (upos); + + if (!success) + DEBUGP (("Redirection \"%s\" failed the test.\n", redirected)); + + return success; } + +/* vim:set sts=2 sw=2 cino+={s: */