X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Frecur.c;h=4cf1d988943d79779f0e73547b4c1a836b782593;hb=5f0a2b3f0846dd4c2f72fc62e7171200d1fd6e06;hp=2c26157933646915eba8801cea8dd8f2d84da284;hpb=222e9465b7cef12a75e5ce0d6cc7df60c934566f;p=wget

diff --git a/src/recur.c b/src/recur.c
index 2c261579..4cf1d988 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -6,7 +6,7 @@ This file is part of GNU Wget.
 GNU Wget is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+ (at your option) any later version.
 
 GNU Wget is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -15,7 +15,17 @@ GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License
 along with Wget; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables.  You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL".  If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so.  If you do not wish to do
+so, delete this exception statement from your version.  */
 
 #include <config.h>
 
@@ -39,33 +49,31 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 #include "utils.h"
 #include "retr.h"
 #include "ftp.h"
-#include "fnmatch.h"
 #include "host.h"
 #include "hash.h"
 #include "res.h"
+#include "convert.h"
 
 #ifndef errno
 extern int errno;
 #endif
 
 extern char *version_string;
+extern LARGE_INT total_downloaded_bytes;
 
-static struct hash_table *dl_file_url_map;
-static struct hash_table *dl_url_file_map;
-
-/* List of HTML files downloaded in this Wget run.  Used for link
-   conversion after Wget is done.  This list should only be traversed
-   in order.  If you need to check whether a file has been downloaded,
-   use a hash table, e.g. dl_file_url_map.  */
-static slist *downloaded_html_files;
+extern struct hash_table *dl_url_file_map;
+extern struct hash_table *downloaded_html_set;
 
 /* Functions for maintaining the URL queue.  */
 
 struct queue_element {
-  const char *url;
-  const char *referer;
-  int depth;
-  struct queue_element *next;
+  const char *url;              /* the URL to download */
+  const char *referer;          /* the referring document */
+  int depth;                    /* the depth */
+  unsigned int html_allowed :1; /* whether the document is allowed to
+                                   be treated as HTML.  */
+
+  struct queue_element *next;   /* next element in queue */
 };
 
 struct url_queue {
@@ -79,8 +87,7 @@ struct url_queue {
 static struct url_queue *
 url_queue_new (void)
 {
-  struct url_queue *queue = xmalloc (sizeof (*queue));
-  memset (queue, '\0', sizeof (*queue));
+  struct url_queue *queue = xnew0 (struct url_queue);
   return queue;
 }
 
@@ -98,12 +105,13 @@ url_queue_delete (struct url_queue *queue)
 
 static void
 url_enqueue (struct url_queue *queue,
-             const char *url, const char *referer, int depth)
+             const char *url, const char *referer, int depth, int html_allowed)
 {
-  struct queue_element *qel = xmalloc (sizeof (*qel));
+  struct queue_element *qel = xnew (struct queue_element);
   qel->url = url;
   qel->referer = referer;
   qel->depth = depth;
+  qel->html_allowed = html_allowed;
   qel->next = NULL;
 
   ++queue->count;
@@ -126,7 +134,8 @@ url_enqueue (struct url_queue *queue,
 
 static int
 url_dequeue (struct url_queue *queue,
-             const char **url, const char **referer, int *depth)
+             const char **url, const char **referer, int *depth,
+             int *html_allowed)
 {
   struct queue_element *qel = queue->head;
 
@@ -140,6 +149,7 @@ url_dequeue (struct url_queue *queue,
   *url = qel->url;
   *referer = qel->referer;
   *depth = qel->depth;
+  *html_allowed = qel->html_allowed;
 
   --queue->count;
 
@@ -149,9 +159,12 @@ url_dequeue (struct url_queue *queue,
   xfree (qel);
   return 1;
 }
+
+static int download_child_p PARAMS ((const struct urlpos *, struct url *, int,
+                                     struct url *, struct hash_table *));
+static int descend_redirect_p PARAMS ((const char *, const char *, int,
+                                       struct url *, struct hash_table *));
 
-static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
-                                  struct url *, struct hash_table *));
 
 /* Retrieve a part of the web beginning with START_URL.  This used to
    be called "recursive retrieval", because the old function was
@@ -180,76 +193,123 @@ retrieve_tree (const char *start_url)
   uerr_t status = RETROK;
 
   /* The queue of URLs we need to load. */
-  struct url_queue *queue = url_queue_new ();
+  struct url_queue *queue;
+
+  /* The URLs we do not wish to enqueue, because they are already in
+     the queue, but haven't been downloaded yet.  */
+  struct hash_table *blacklist;
+
+  int up_error_code;
+  struct url *start_url_parsed = url_parse (start_url, &up_error_code);
 
-  /* The URLs we decided we don't want to load. */
-  struct hash_table *blacklist = make_string_hash_table (0);
+  if (!start_url_parsed)
+    {
+      logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
+                 url_error (up_error_code));
+      return URLERROR;
+    }
 
-  /* We'll need various components of this, so better get it over with
-     now. */
-  struct url *start_url_parsed = url_parse (start_url, NULL);
+  queue = url_queue_new ();
+  blacklist = make_string_hash_table (0);
 
-  url_enqueue (queue, xstrdup (start_url), NULL, 0);
-  string_set_add (blacklist, start_url);
+  /* Enqueue the starting URL.  Use start_url_parsed->url rather than
+     just URL so we enqueue the canonical form of the URL.  */
+  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1);
+  string_set_add (blacklist, start_url_parsed->url);
 
   while (1)
     {
      int descend = 0;
      char *url, *referer, *file = NULL;
-      int depth;
+      int depth, html_allowed;
      boolean dash_p_leaf_HTML = FALSE;
 
-      if (downloaded_exceeds_quota ())
+      if (opt.quota && total_downloaded_bytes > opt.quota)
        break;
-
      if (status == FWRITEERR)
        break;
 
-      /* Get the next URL from the queue. */
+      /* Get the next URL from the queue... */
 
      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
-                        &depth))
+                        &depth, &html_allowed))
        break;
 
-      /* And download it. */
+      /* ...and download it.  Note that this download is in most cases
+        unconditional, as download_child_p already makes sure a file
+        doesn't get enqueued twice -- and yet this check is here, and
+        not in download_child_p.  This is so that if you run `wget -r
+        URL1 URL2', and a random URL is encountered once under URL1
+        and again under URL2, but at a different (possibly smaller)
+        depth, we want the URL's children to be taken into account
+        the second time.  */
+      if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
+       {
+         file = xstrdup (hash_table_get (dl_url_file_map, url));
 
-      {
-       int dt = 0;
-       char *redirected = NULL;
-       int oldrec = opt.recursive;
-
-       opt.recursive = 0;
-       status = retrieve_url (url, &file, &redirected, NULL, &dt);
-       opt.recursive = oldrec;
-
-       if (redirected)
-         {
-           xfree (url);
-           url = redirected;
-         }
-       if (file && status == RETROK
-           && (dt & RETROKF) && (dt & TEXTHTML))
-         descend = 1;
-      }
+         DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
+                  url, file));
+
+         if (html_allowed
+             && downloaded_html_set
+             && string_set_contains (downloaded_html_set, file))
+           descend = 1;
+       }
+      else
+       {
+         int dt = 0;
+         char *redirected = NULL;
+         int oldrec = opt.recursive;
+
+         opt.recursive = 0;
+         status = retrieve_url (url, &file, &redirected, referer, &dt);
+         opt.recursive = oldrec;
+
+         if (html_allowed && file && status == RETROK
+             && (dt & RETROKF) && (dt & TEXTHTML))
+           descend = 1;
+
+         if (redirected)
+           {
+             /* We have been redirected, possibly to another host, or
+                different path, or wherever.  Check whether we really
+                want to follow it.  */
+             if (descend)
+               {
+                 if (!descend_redirect_p (redirected, url, depth,
+                                          start_url_parsed, blacklist))
+                   descend = 0;
+                 else
+                   /* Make sure that the old pre-redirect form gets
+                      blacklisted. */
+                   string_set_add (blacklist, url);
+               }
+
+             xfree (url);
+             url = redirected;
+           }
+       }
 
      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
-         if (opt.page_requisites && depth == opt.reclevel)
-           /* When -p is specified, we can do one more partial
-              recursion from the "leaf nodes" on the HTML document
-              tree.  The recursion is partial in that we won't
-              traverse any <A> or <AREA> tags, nor any <LINK> tags
-              except for <LINK REL="stylesheet">.  */
-           /* #### This would be the place to implement the TODO
-              entry saying that -p should do two more hops on
-              framesets.  */
-           dash_p_leaf_HTML = TRUE;
+         if (opt.page_requisites
+             && (depth == opt.reclevel || depth == opt.reclevel + 1))
+           {
+             /* When -p is specified, we are allowed to exceed the
+                maximum depth, but only for the "inline" links,
+                i.e. those that are needed to display the page.
+                Originally this could exceed the depth at most by
+                one, but we allow one more level so that the leaf
+                pages that contain frames can be loaded
+                correctly. */
+             dash_p_leaf_HTML = TRUE;
+           }
          else
           {
             /* Either -p wasn't specified or it was and we've
-                already gone the one extra (pseudo-)level that it
+                already spent the two extra (pseudo-)levels that it
                affords us, so we need to bail out. */
             DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                      depth, opt.reclevel));
@@ -263,8 +323,8 @@ retrieve_tree (const char *start_url)
      if (descend)
        {
         int meta_disallow_follow = 0;
-         struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
-                                                  &meta_disallow_follow);
+         struct urlpos *children
+           = get_urls_html (file, url, &meta_disallow_follow);
 
         if (opt.use_robots && meta_disallow_follow)
           {
@@ -280,11 +340,16 @@
          for (; child; child = child->next)
            {
-             if (descend_url_p (child, url_parsed, depth, start_url_parsed,
-                                blacklist))
+             if (child->ignore_when_downloading)
+               continue;
+             if (dash_p_leaf_HTML && !child->link_inline_p)
+               continue;
+             if (download_child_p (child, url_parsed, depth, start_url_parsed,
+                                   blacklist))
               {
                 url_enqueue (queue, xstrdup (child->url->url),
-                              xstrdup (url), depth + 1);
+                              xstrdup (url), depth + 1,
+                              child->link_expect_html);
                 /* We blacklist the URL we have enqueued, because we
                    don't want to enqueue (and hence download) the same
                    URL twice. */
@@ -307,11 +372,13 @@
                  opt.delete_after ? "--delete-after" :
                  "recursive rejection criteria"));
         logprintf (LOG_VERBOSE,
-                    (opt.delete_after ? _("Removing %s.\n")
+                    (opt.delete_after
+                     ? _("Removing %s.\n")
                     : _("Removing %s since it should be rejected.\n")),
                    file);
         if (unlink (file))
           logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
+         register_delete_file (file);
        }
 
      xfree (url);
@@ -323,8 +390,9 @@
     now.  */
  {
    char *d1, *d2;
-    int d3;
-    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
+    int d3, d4;
+    while (url_dequeue (queue,
+                        (const char **)&d1, (const char **)&d2, &d3, &d4))
      {
       xfree (d1);
       FREE_MAYBE (d2);
@@ -336,7 +404,7 @@
  url_free (start_url_parsed);
  string_set_free (blacklist);
 
-  if (downloaded_exceeds_quota ())
+  if (opt.quota && total_downloaded_bytes > opt.quota)
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
@@ -346,14 +414,19 @@
 
 /* Based on the context provided by retrieve_tree, decide whether a
    URL is to be descended to.  This is only ever called from
-   retrieve_tree, but is in a separate function for clarity.  */
+   retrieve_tree, but is in a separate function for clarity.
+
+   The most expensive checks (such as those for robots) are memoized
+   by storing these URLs to BLACKLIST.  This may or may not help.  It
+   will help if those URLs are encountered many times.  */
 
 static int
-descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
-              struct url *start_url_parsed, struct hash_table *blacklist)
+download_child_p (const struct urlpos *upos, struct url *parent, int depth,
+                 struct url *start_url_parsed, struct hash_table *blacklist)
 {
  struct url *u = upos->url;
  const char *url = u->url;
+  int u_scheme_like_http;
 
  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
 
@@ -384,21 +457,23 @@
     More time- and memory- consuming tests should be put later on
     the list.  */
 
+  /* Determine whether URL under consideration has a HTTP-like scheme. */
+  u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);
+
  /* 1. Schemes other than HTTP are normally not recursed into. */
-  if (u->scheme != SCHEME_HTTP
-      && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
+  if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
-      goto blacklist;
+      goto out;
    }
 
  /* 2. If it is an absolute link and they are not followed, throw it
     out. */
-  if (u->scheme == SCHEME_HTTP)
+  if (u_scheme_like_http)
    if (opt.relative_only && !upos->link_relative_p)
      {
       DEBUGP (("It doesn't really look like a relative link.\n"));
-       goto blacklist;
+       goto out;
      }
 
  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out. */
@@ -406,22 +481,25 @@
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
-      goto blacklist;
+      goto out;
    }
 
  /* 4. Check for parent directory.
 
     If we descended to a different host or changed the scheme, ignore
-     opt.no_parent.  Also ignore it for -p leaf retrievals. */
+     opt.no_parent.  Also ignore it for documents needed to display
+     the parent page when in -p mode. */
  if (opt.no_parent
-      && u->scheme == parent->scheme
-      && 0 == strcasecmp (u->host, parent->host)
-      && u->port == parent->port)
+      && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
+      && 0 == strcasecmp (u->host, start_url_parsed->host)
+      && u->port == start_url_parsed->port
+      && !(opt.page_requisites && upos->link_inline_p))
    {
-      if (!frontcmp (parent->dir, u->dir))
+      if (!frontcmp (start_url_parsed->dir, u->dir))
       {
-         DEBUGP (("Trying to escape the root directory with no_parent in effect.\n"));
-         goto blacklist;
+         DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
+                  u->dir, start_url_parsed->dir));
+         goto out;
       }
    }
 
@@ -433,55 +511,38 @@
      if (!accdir (u->dir, ALLABS))
       {
         DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
-         goto blacklist;
+         goto out;
       }
    }
 
-  /* 6. */
-  {
-    char *suf = NULL;
-    /* Check for acceptance/rejection rules.  We ignore these rules
-       for HTML documents because they might lead to other files which
-       need to be downloaded.  Of course, we don't know which
-       documents are HTML before downloading them, so we guess.
-
-       A file is subject to acceptance/rejection rules if:
-
-       * u->file is not "" (i.e. it is not a directory)
-      and either:
-        + there is no file suffix,
-        + or there is a suffix, but is not "html" or "htm",
-        + both:
-          - recursion is not infinite,
-          - and we are at its very end. */
-
-    if (u->file[0] != '\0'
-       && ((suf = suffix (url)) == NULL
-           || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
-           || (opt.reclevel == INFINITE_RECURSION && depth >= opt.reclevel)))
-      {
-       if (!acceptable (u->file))
-         {
-           DEBUGP (("%s (%s) does not match acc/rej rules.\n",
-                    url, u->file));
-           FREE_MAYBE (suf);
-           goto blacklist;
-         }
-      }
-    FREE_MAYBE (suf);
-  }
+  /* 6. Check for acceptance/rejection rules.  We ignore these rules
+     for directories (no file name to match) and for HTML documents,
+     which might lead to other files that do need to be downloaded.
+     That is, unless we've exhausted the recursion depth anyway.  */
+  if (u->file[0] != '\0'
+      && !(has_html_suffix_p (u->file)
+          && depth != INFINITE_RECURSION
+          && depth < opt.reclevel - 1))
+    {
+      if (!acceptable (u->file))
+       {
+         DEBUGP (("%s (%s) does not match acc/rej rules.\n",
+                  url, u->file));
+         goto out;
+       }
+    }
 
  /* 7. */
-  if (u->scheme == parent->scheme)
+  if (schemes_are_similar_p (u->scheme, parent->scheme))
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
       DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                u->host, parent->host));
-       goto blacklist;
+       goto out;
      }
 
  /* 8. */
-  if (opt.use_robots && u->scheme == SCHEME_HTTP)
+  if (opt.use_robots && u_scheme_like_http)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
@@ -507,7 +568,8 @@
      if (!res_match_path (specs, u->path))
       {
         DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
-         goto blacklist;
+         string_set_add (blacklist, url);
+         goto out;
       }
    }
 
@@ -517,177 +579,43 @@
 
  return 1;
 
- blacklist:
-  string_set_add (blacklist, url);
-
 out:
  DEBUGP (("Decided NOT to load it.\n"));
 
  return 0;
 }
-
-/* Register that URL has been successfully downloaded to FILE. */
-
-void
-register_download (const char *url, const char *file)
-{
-  if (!opt.convert_links)
-    return;
-  if (!dl_file_url_map)
-    dl_file_url_map = make_string_hash_table (0);
-  if (!dl_url_file_map)
-    dl_url_file_map = make_string_hash_table (0);
-
-  if (!hash_table_contains (dl_file_url_map, file))
-    hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
-  if (!hash_table_contains (dl_url_file_map, url))
-    hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
-}
-
-/* Register that FROM has been redirected to TO.  This assumes that TO
-   is successfully downloaded and already registered using
-   register_download() above.  */
-
-void
-register_redirection (const char *from, const char *to)
-{
-  char *file;
-
-  if (!opt.convert_links)
-    return;
-
-  file = hash_table_get (dl_url_file_map, to);
-  assert (file != NULL);
-  if (!hash_table_contains (dl_url_file_map, from))
-    hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
-}
-
-/* Register that URL corresponds to the HTML file FILE. */
-
-void
-register_html (const char *url, const char *file)
-{
-  if (!opt.convert_links)
-    return;
-  downloaded_html_files = slist_prepend (downloaded_html_files, file);
-}
-
-/* convert_links() is called from recursive_retrieve() after we're
-   done with an HTML file.  This call to convert_links is not complete
-   because it converts only the downloaded files, and Wget cannot know
-   which files will be downloaded afterwards.  So, if we have file
-   fileone.html with:
-
-       <a href="/c/something.gif">
-
-   and /c/something.gif was not downloaded because it exceeded the
-   recursion depth, the reference will *not* be changed.
-
-   However, later we can encounter /c/something.gif from an "upper"
-   level HTML (let's call it filetwo.html), and it gets downloaded.
-   But now we have a problem because /c/something.gif will be
-   correctly transformed in filetwo.html, but not in fileone.html,
-   since Wget could not have known that /c/something.gif will be
-   downloaded in the future.
+/* This function determines whether we will consider downloading the
+   children of a URL whose download resulted in a redirection,
+   possibly to another host, etc.  It is needed very rarely, and thus
+   it is merely a simple-minded wrapper around download_child_p.  */
 
-   This is why Wget must, after the whole retrieval, call
-   convert_all_links to go once more through the entire list of
-   retrieved HTMLs, and re-convert them.
-
-   All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs
-   in urls_downloaded.  From these two lists information is
-   extracted.
*/ -void -convert_all_links (void) +static int +descend_redirect_p (const char *redirected, const char *original, int depth, + struct url *start_url_parsed, struct hash_table *blacklist) { - slist *html; + struct url *orig_parsed, *new_parsed; + struct urlpos *upos; + int success; - /* Destructively reverse downloaded_html_files to get it in the right order. - recursive_retrieve() used slist_prepend() consistently. */ - downloaded_html_files = slist_nreverse (downloaded_html_files); + orig_parsed = url_parse (original, NULL); + assert (orig_parsed != NULL); - for (html = downloaded_html_files; html; html = html->next) - { - struct urlpos *urls, *cur_url; - char *url; + new_parsed = url_parse (redirected, NULL); + assert (new_parsed != NULL); - DEBUGP (("Rescanning %s\n", html->string)); + upos = xnew0 (struct urlpos); + upos->url = new_parsed; - /* Determine the URL of the HTML file. get_urls_html will need - it. */ - url = hash_table_get (dl_file_url_map, html->string); - if (url) - DEBUGP (("It should correspond to %s.\n", url)); - else - DEBUGP (("I cannot find the corresponding URL.\n")); + success = download_child_p (upos, orig_parsed, depth, + start_url_parsed, blacklist); - /* Parse the HTML file... */ - urls = get_urls_html (html->string, url, FALSE, NULL); + url_free (orig_parsed); + url_free (new_parsed); + xfree (upos); - /* We don't respect meta_disallow_follow here because, even if - the file is not followed, we might still want to convert the - links that have been followed from other files. */ + if (!success) + DEBUGP (("Redirection \"%s\" failed the test.\n", redirected)); - for (cur_url = urls; cur_url; cur_url = cur_url->next) - { - char *local_name; - struct url *u = cur_url->url; - - /* We decide the direction of conversion according to whether - a URL was downloaded. Downloaded URLs will be converted - ABS2REL, whereas non-downloaded will be converted REL2ABS. */ - local_name = hash_table_get (dl_url_file_map, u->url); - if (local_name) - DEBUGP (("%s marked for conversion, local %s\n", - u->url, local_name)); - - /* Decide on the conversion direction. */ - if (local_name) - { - /* We've downloaded this URL. Convert it to relative - form. We do this even if the URL already is in - relative form, because our directory structure may - not be identical to that on the server (think `-nd', - `--cut-dirs', etc.) */ - cur_url->convert = CO_CONVERT_TO_RELATIVE; - cur_url->local_name = xstrdup (local_name); - } - else - { - /* We haven't downloaded this URL. If it's not already - complete (including a full host name), convert it to - that form, so it can be reached while browsing this - HTML locally. */ - if (!cur_url->link_complete_p) - cur_url->convert = CO_CONVERT_TO_COMPLETE; - cur_url->local_name = NULL; - } - } - /* Convert the links in the file. */ - convert_links (html->string, urls); - /* Free the data. */ - free_urlpos (urls); - } -} - -/* Cleanup the data structures associated with recursive retrieving - (the variables above). */ -void -recursive_cleanup (void) -{ - if (dl_file_url_map) - { - free_keys_and_values (dl_file_url_map); - hash_table_destroy (dl_file_url_map); - dl_file_url_map = NULL; - } - if (dl_url_file_map) - { - free_keys_and_values (dl_url_file_map); - hash_table_destroy (dl_url_file_map); - dl_url_file_map = NULL; - } - slist_free (downloaded_html_files); - downloaded_html_files = NULL; + return success; }
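
For readers skimming the patch, the central data structure it extends is the breadth-first URL queue: each entry now carries, besides the URL, referer and depth, an html_allowed bit that says whether the downloaded result may be parsed for further links. Below is a minimal, self-contained sketch of that queue discipline, not part of the patch: the struct fields and the url_enqueue/url_dequeue names mirror recur.c, but the main() driver, the example URL and the plain malloc/strdup calls are illustrative assumptions (wget itself uses its xnew/xmalloc wrappers, a referer field and a hash-table blacklist).

/* Illustrative sketch only -- not part of the patch above.  It mimics
   the queue_element/url_queue idea from recur.c: a FIFO of URLs, each
   carrying a recursion depth and an html_allowed flag.  */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct queue_element {
  const char *url;              /* the URL to download */
  int depth;                    /* recursion depth of this URL */
  unsigned int html_allowed :1; /* may the result be treated as HTML? */
  struct queue_element *next;   /* next element in queue */
};

struct url_queue {
  struct queue_element *head, *tail;
};

/* Append URL at the tail (FIFO order, i.e. breadth-first traversal). */
static void
url_enqueue (struct url_queue *q, const char *url, int depth, int html_allowed)
{
  struct queue_element *qel = calloc (1, sizeof *qel);
  qel->url = strdup (url);
  qel->depth = depth;
  qel->html_allowed = html_allowed != 0;
  if (q->tail)
    q->tail->next = qel;
  q->tail = qel;
  if (!q->head)
    q->head = qel;
}

/* Remove the head element; return 0 when the queue is empty.  The
   caller takes ownership of *URL and must free it.  */
static int
url_dequeue (struct url_queue *q, const char **url, int *depth,
             int *html_allowed)
{
  struct queue_element *qel = q->head;
  if (!qel)
    return 0;
  q->head = qel->next;
  if (!q->head)
    q->tail = NULL;
  *url = qel->url;
  *depth = qel->depth;
  *html_allowed = qel->html_allowed;
  free (qel);
  return 1;
}

int
main (void)
{
  struct url_queue q = { NULL, NULL };
  const char *url;
  int depth, html_allowed;

  /* Seed the queue the way retrieve_tree seeds it with the start URL:
     depth 0, html_allowed = 1.  (Example URL, not from the patch.)  */
  url_enqueue (&q, "http://example.com/", 0, 1);

  while (url_dequeue (&q, &url, &depth, &html_allowed))
    {
      printf ("dequeued %s (depth %d, html_allowed %d)\n",
              url, depth, html_allowed);
      /* A real crawler would download URL here and, if the result is
         HTML and download_child_p-style checks pass, enqueue its
         children at depth + 1, consulting a blacklist so the same URL
         is never enqueued twice.  */
      free ((void *) url);
    }
  return 0;
}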