X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Frecur.c;h=4cf1d988943d79779f0e73547b4c1a836b782593;hb=5f0a2b3f0846dd4c2f72fc62e7171200d1fd6e06;hp=7f2b55497f13a31868745559625524651fa4ae83;hpb=d4b0486cc42647e24e6dc740de1f9de6770a854c;p=wget diff --git a/src/recur.c b/src/recur.c index 7f2b5549..4cf1d988 100644 --- a/src/recur.c +++ b/src/recur.c @@ -6,7 +6,7 @@ This file is part of GNU Wget. GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. + (at your option) any later version. GNU Wget is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -15,7 +15,17 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Wget; if not, write to the Free Software -Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +In addition, as a special exception, the Free Software Foundation +gives permission to link the code of its release of Wget with the +OpenSSL project's "OpenSSL" library (or with modified versions of it +that use the same license as the "OpenSSL" library), and distribute +the linked executables. You must obey the GNU General Public License +in all respects for all of the code used other than "OpenSSL". If you +modify this file, you may extend this exception to your version of the +file, but you are not obligated to do so. If you do not wish to do +so, delete this exception statement from your version. */ #include @@ -39,36 +49,31 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "utils.h" #include "retr.h" #include "ftp.h" -#include "fnmatch.h" #include "host.h" #include "hash.h" #include "res.h" +#include "convert.h" #ifndef errno extern int errno; #endif extern char *version_string; +extern LARGE_INT total_downloaded_bytes; -static struct hash_table *dl_file_url_map; -static struct hash_table *dl_url_file_map; - -/* List of HTML files downloaded in this Wget run, used for link - conversion after Wget is done. The list and the set contain the - same information, except the list maintains the order. Perhaps I - should get rid of the list, it's there for historical reasons. */ -static slist *downloaded_html_list; -static struct hash_table *downloaded_html_set; - -static void register_delete_file PARAMS ((const char *)); +extern struct hash_table *dl_url_file_map; +extern struct hash_table *downloaded_html_set; /* Functions for maintaining the URL queue. */ struct queue_element { - const char *url; - const char *referer; - int depth; - struct queue_element *next; + const char *url; /* the URL to download */ + const char *referer; /* the referring document */ + int depth; /* the depth */ + unsigned int html_allowed :1; /* whether the document is allowed to + be treated as HTML. 
*/ + + struct queue_element *next; /* next element in queue */ }; struct url_queue { @@ -82,8 +87,7 @@ struct url_queue { static struct url_queue * url_queue_new (void) { - struct url_queue *queue = xmalloc (sizeof (*queue)); - memset (queue, '\0', sizeof (*queue)); + struct url_queue *queue = xnew0 (struct url_queue); return queue; } @@ -101,12 +105,13 @@ url_queue_delete (struct url_queue *queue) static void url_enqueue (struct url_queue *queue, - const char *url, const char *referer, int depth) + const char *url, const char *referer, int depth, int html_allowed) { - struct queue_element *qel = xmalloc (sizeof (*qel)); + struct queue_element *qel = xnew (struct queue_element); qel->url = url; qel->referer = referer; qel->depth = depth; + qel->html_allowed = html_allowed; qel->next = NULL; ++queue->count; @@ -129,7 +134,8 @@ url_enqueue (struct url_queue *queue, static int url_dequeue (struct url_queue *queue, - const char **url, const char **referer, int *depth) + const char **url, const char **referer, int *depth, + int *html_allowed) { struct queue_element *qel = queue->head; @@ -143,6 +149,7 @@ url_dequeue (struct url_queue *queue, *url = qel->url; *referer = qel->referer; *depth = qel->depth; + *html_allowed = qel->html_allowed; --queue->count; @@ -207,17 +214,17 @@ retrieve_tree (const char *start_url) /* Enqueue the starting URL. Use start_url_parsed->url rather than just URL so we enqueue the canonical form of the URL. */ - url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0); + url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1); string_set_add (blacklist, start_url_parsed->url); while (1) { int descend = 0; char *url, *referer, *file = NULL; - int depth; + int depth, html_allowed; boolean dash_p_leaf_HTML = FALSE; - if (downloaded_exceeds_quota ()) + if (opt.quota && total_downloaded_bytes > opt.quota) break; if (status == FWRITEERR) break; @@ -226,7 +233,7 @@ retrieve_tree (const char *start_url) if (!url_dequeue (queue, (const char **)&url, (const char **)&referer, - &depth)) + &depth, &html_allowed)) break; /* ...and download it. Note that this download is in most cases @@ -244,7 +251,9 @@ retrieve_tree (const char *start_url) DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n", url, file)); - if (string_set_contains (downloaded_html_set, file)) + if (html_allowed + && downloaded_html_set + && string_set_contains (downloaded_html_set, file)) descend = 1; } else @@ -257,7 +266,7 @@ retrieve_tree (const char *start_url) status = retrieve_url (url, &file, &redirected, referer, &dt); opt.recursive = oldrec; - if (file && status == RETROK + if (html_allowed && file && status == RETROK && (dt & RETROKF) && (dt & TEXTHTML)) descend = 1; @@ -339,7 +348,8 @@ retrieve_tree (const char *start_url) blacklist)) { url_enqueue (queue, xstrdup (child->url->url), - xstrdup (url), depth + 1); + xstrdup (url), depth + 1, + child->link_expect_html); /* We blacklist the URL we have enqueued, because we don't want to enqueue (and hence download) the same URL twice. */ @@ -380,8 +390,9 @@ retrieve_tree (const char *start_url) now. 
*/ { char *d1, *d2; - int d3; - while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3)) + int d3, d4; + while (url_dequeue (queue, + (const char **)&d1, (const char **)&d2, &d3, &d4)) { xfree (d1); FREE_MAYBE (d2); @@ -393,7 +404,7 @@ retrieve_tree (const char *start_url) url_free (start_url_parsed); string_set_free (blacklist); - if (downloaded_exceeds_quota ()) + if (opt.quota && total_downloaded_bytes > opt.quota) return QUOTEXC; else if (status == FWRITEERR) return FWRITEERR; @@ -415,6 +426,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, { struct url *u = upos->url; const char *url = u->url; + int u_scheme_like_http; DEBUGP (("Deciding whether to enqueue \"%s\".\n", url)); @@ -445,12 +457,11 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, More time- and memory- consuming tests should be put later on the list. */ + /* Determine whether URL under consideration has a HTTP-like scheme. */ + u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP); + /* 1. Schemes other than HTTP are normally not recursed into. */ - if (u->scheme != SCHEME_HTTP -#ifdef HAVE_SSL - && u->scheme != SCHEME_HTTPS -#endif - && !(u->scheme == SCHEME_FTP && opt.follow_ftp)) + if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp)) { DEBUGP (("Not following non-HTTP schemes.\n")); goto out; @@ -458,11 +469,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, /* 2. If it is an absolute link and they are not followed, throw it out. */ - if (u->scheme == SCHEME_HTTP -#ifdef HAVE_SSL - || u->scheme == SCHEME_HTTPS -#endif - ) + if (u_scheme_like_http) if (opt.relative_only && !upos->link_relative_p) { DEBUGP (("It doesn't really look like a relative link.\n")); @@ -483,7 +490,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, opt.no_parent. Also ignore it for documents needed to display the parent page when in -p mode. */ if (opt.no_parent - && u->scheme == start_url_parsed->scheme + && schemes_are_similar_p (u->scheme, start_url_parsed->scheme) && 0 == strcasecmp (u->host, start_url_parsed->host) && u->port == start_url_parsed->port && !(opt.page_requisites && upos->link_inline_p)) @@ -514,8 +521,8 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, That is, unless we've exhausted the recursion depth anyway. */ if (u->file[0] != '\0' && !(has_html_suffix_p (u->file) - && depth < opt.reclevel - 1 - && depth != INFINITE_RECURSION)) + && depth != INFINITE_RECURSION + && depth < opt.reclevel - 1)) { if (!acceptable (u->file)) { @@ -526,7 +533,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, } /* 7. */ - if (u->scheme == parent->scheme) + if (schemes_are_similar_p (u->scheme, parent->scheme)) if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host)) { DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n", @@ -535,12 +542,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, } /* 8. 
*/ - if (opt.use_robots && (u->scheme == SCHEME_HTTP -#ifdef HAVE_SSL - || u->scheme == SCHEME_HTTPS -#endif - ) - ) + if (opt.use_robots && u_scheme_like_http) { struct robot_specs *specs = res_get_specs (u->host, u->port); if (!specs) @@ -602,8 +604,7 @@ descend_redirect_p (const char *redirected, const char *original, int depth, new_parsed = url_parse (redirected, NULL); assert (new_parsed != NULL); - upos = xmalloc (sizeof (struct urlpos)); - memset (upos, 0, sizeof (*upos)); + upos = xnew0 (struct urlpos); upos->url = new_parsed; success = download_child_p (upos, orig_parsed, depth, @@ -618,358 +619,3 @@ descend_redirect_p (const char *redirected, const char *original, int depth, return success; } - - -#define ENSURE_TABLES_EXIST do { \ - if (!dl_file_url_map) \ - dl_file_url_map = make_string_hash_table (0); \ - if (!dl_url_file_map) \ - dl_url_file_map = make_string_hash_table (0); \ -} while (0) - -/* Return 1 if S1 and S2 are the same, except for "/index.html". The - three cases in which it returns one are (substitute any substring - for "foo"): - - m("foo/index.html", "foo/") ==> 1 - m("foo/", "foo/index.html") ==> 1 - m("foo", "foo/index.html") ==> 1 - m("foo", "foo/" ==> 1 - m("foo", "foo") ==> 1 */ - -static int -match_except_index (const char *s1, const char *s2) -{ - int i; - const char *lng; - - /* Skip common substring. */ - for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++) - ; - if (i == 0) - /* Strings differ at the very beginning -- bail out. We need to - check this explicitly to avoid `lng - 1' reading outside the - array. */ - return 0; - - if (!*s1 && !*s2) - /* Both strings hit EOF -- strings are equal. */ - return 1; - else if (*s1 && *s2) - /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */ - return 0; - else if (*s1) - /* S1 is the longer one. */ - lng = s1; - else - /* S2 is the longer one. */ - lng = s2; - - /* foo */ /* foo/ */ - /* foo/index.html */ /* or */ /* foo/index.html */ - /* ^ */ /* ^ */ - - if (*lng != '/') - /* The right-hand case. */ - --lng; - - if (*lng == '/' && *(lng + 1) == '\0') - /* foo */ - /* foo/ */ - return 1; - - return 0 == strcmp (lng, "/index.html"); -} - -static int -dissociate_urls_from_file_mapper (void *key, void *value, void *arg) -{ - char *mapping_url = (char *)key; - char *mapping_file = (char *)value; - char *file = (char *)arg; - - if (0 == strcmp (mapping_file, file)) - { - hash_table_remove (dl_url_file_map, mapping_url); - xfree (mapping_url); - xfree (mapping_file); - } - - /* Continue mapping. */ - return 0; -} - -/* Remove all associations from various URLs to FILE from dl_url_file_map. */ - -static void -dissociate_urls_from_file (const char *file) -{ - hash_table_map (dl_url_file_map, dissociate_urls_from_file_mapper, - (char *)file); -} - -/* Register that URL has been successfully downloaded to FILE. This - is used by the link conversion code to convert references to URLs - to references to local files. It is also being used to check if a - URL has already been downloaded. */ - -void -register_download (const char *url, const char *file) -{ - char *old_file, *old_url; - - ENSURE_TABLES_EXIST; - - /* With some forms of retrieval, it is possible, although not likely - or particularly desirable. If both are downloaded, the second - download will override the first one. When that happens, - dissociate the old file name from the URL. 
*/ - - if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url)) - { - if (0 == strcmp (url, old_url)) - /* We have somehow managed to download the same URL twice. - Nothing to do. */ - return; - - if (match_except_index (url, old_url) - && !hash_table_contains (dl_url_file_map, url)) - /* The two URLs differ only in the "index.html" ending. For - example, one is "http://www.server.com/", and the other is - "http://www.server.com/index.html". Don't remove the old - one, just add the new one as a non-canonical entry. */ - goto url_only; - - hash_table_remove (dl_file_url_map, file); - xfree (old_file); - xfree (old_url); - - /* Remove all the URLs that point to this file. Yes, there can - be more than one such URL, because we store redirections as - multiple entries in dl_url_file_map. For example, if URL1 - redirects to URL2 which gets downloaded to FILE, we map both - URL1 and URL2 to FILE in dl_url_file_map. (dl_file_url_map - only points to URL2.) When another URL gets loaded to FILE, - we want both URL1 and URL2 dissociated from it. - - This is a relatively expensive operation because it performs - a linear search of the whole hash table, but it should be - called very rarely, only when two URLs resolve to the same - file name, *and* the ".1" extensions are turned off. - In other words, almost never. */ - dissociate_urls_from_file (file); - } - - hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url)); - - url_only: - /* A URL->FILE mapping is not possible without a FILE->URL mapping. - If the latter were present, it should have been removed by the - above `if'. So we could write: - - assert (!hash_table_contains (dl_url_file_map, url)); - - The above is correct when running in recursive mode where the - same URL always resolves to the same file. But if you do - something like: - - wget URL URL - - then the first URL will resolve to "FILE", and the other to - "FILE.1". In that case, FILE.1 will not be found in - dl_file_url_map, but URL will still point to FILE in - dl_url_file_map. */ - if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file)) - { - hash_table_remove (dl_url_file_map, url); - xfree (old_url); - xfree (old_file); - } - - hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file)); -} - -/* Register that FROM has been redirected to TO. This assumes that TO - is successfully downloaded and already registered using - register_download() above. */ - -void -register_redirection (const char *from, const char *to) -{ - char *file; - - ENSURE_TABLES_EXIST; - - file = hash_table_get (dl_url_file_map, to); - assert (file != NULL); - if (!hash_table_contains (dl_url_file_map, from)) - hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file)); -} - -/* Register that the file has been deleted. */ - -static void -register_delete_file (const char *file) -{ - char *old_url, *old_file; - - ENSURE_TABLES_EXIST; - - if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url)) - return; - - hash_table_remove (dl_file_url_map, file); - xfree (old_file); - xfree (old_url); - dissociate_urls_from_file (file); -} - -/* Register that FILE is an HTML file that has been downloaded. */ - -void -register_html (const char *url, const char *file) -{ - if (!downloaded_html_set) - downloaded_html_set = make_string_hash_table (0); - else if (hash_table_contains (downloaded_html_set, file)) - return; - - /* The set and the list should use the same copy of FILE, but the - slist interface insists on strduping the string it gets. Oh - well. 
*/ - string_set_add (downloaded_html_set, file); - downloaded_html_list = slist_prepend (downloaded_html_list, file); -} - -/* This function is called when the retrieval is done to convert the - links that have been downloaded. It has to be called at the end of - the retrieval, because only then does Wget know conclusively which - URLs have been downloaded, and which not, so it can tell which - direction to convert to. - - The "direction" means that the URLs to the files that have been - downloaded get converted to the relative URL which will point to - that file. And the other URLs get converted to the remote URL on - the server. - - All the downloaded HTMLs are kept in downloaded_html_files, and - downloaded URLs in urls_downloaded. All the information is - extracted from these two lists. */ - -void -convert_all_links (void) -{ - slist *html; - long msecs; - int file_count = 0; - - struct wget_timer *timer = wtimer_new (); - - /* Destructively reverse downloaded_html_files to get it in the right order. - recursive_retrieve() used slist_prepend() consistently. */ - downloaded_html_list = slist_nreverse (downloaded_html_list); - - for (html = downloaded_html_list; html; html = html->next) - { - struct urlpos *urls, *cur_url; - char *url; - char *file = html->string; - - /* Determine the URL of the HTML file. get_urls_html will need - it. */ - url = hash_table_get (dl_file_url_map, file); - if (!url) - { - DEBUGP (("Apparently %s has been removed.\n", file)); - continue; - } - - DEBUGP (("Scanning %s (from %s)\n", file, url)); - - /* Parse the HTML file... */ - urls = get_urls_html (file, url, NULL); - - /* We don't respect meta_disallow_follow here because, even if - the file is not followed, we might still want to convert the - links that have been followed from other files. */ - - for (cur_url = urls; cur_url; cur_url = cur_url->next) - { - char *local_name; - struct url *u = cur_url->url; - - if (cur_url->link_base_p) - { - /* Base references have been resolved by our parser, so - we turn the base URL into an empty string. (Perhaps - we should remove the tag entirely?) */ - cur_url->convert = CO_NULLIFY_BASE; - continue; - } - - /* We decide the direction of conversion according to whether - a URL was downloaded. Downloaded URLs will be converted - ABS2REL, whereas non-downloaded will be converted REL2ABS. */ - local_name = hash_table_get (dl_url_file_map, u->url); - - /* Decide on the conversion type. */ - if (local_name) - { - /* We've downloaded this URL. Convert it to relative - form. We do this even if the URL already is in - relative form, because our directory structure may - not be identical to that on the server (think `-nd', - `--cut-dirs', etc.) */ - cur_url->convert = CO_CONVERT_TO_RELATIVE; - cur_url->local_name = xstrdup (local_name); - DEBUGP (("will convert url %s to local %s\n", u->url, local_name)); - } - else - { - /* We haven't downloaded this URL. If it's not already - complete (including a full host name), convert it to - that form, so it can be reached while browsing this - HTML locally. */ - if (!cur_url->link_complete_p) - cur_url->convert = CO_CONVERT_TO_COMPLETE; - cur_url->local_name = NULL; - DEBUGP (("will convert url %s to complete\n", u->url)); - } - } - - /* Convert the links in the file. */ - convert_links (file, urls); - ++file_count; - - /* Free the data. 
*/ - free_urlpos (urls); - } - - msecs = wtimer_elapsed (timer); - wtimer_delete (timer); - logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"), - file_count, (double)msecs / 1000); -} - -/* Cleanup the data structures associated with recursive retrieving - (the variables above). */ -void -recursive_cleanup (void) -{ - if (dl_file_url_map) - { - free_keys_and_values (dl_file_url_map); - hash_table_destroy (dl_file_url_map); - dl_file_url_map = NULL; - } - if (dl_url_file_map) - { - free_keys_and_values (dl_url_file_map); - hash_table_destroy (dl_url_file_map); - dl_url_file_map = NULL; - } - if (downloaded_html_set) - string_set_free (downloaded_html_set); - slist_free (downloaded_html_list); - downloaded_html_list = NULL; -}
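
For readers following the patch rather than building wget, here is a minimal standalone sketch of the FIFO url_queue that this change extends with an html_allowed bit (set to 1 for the start URL and taken from child->link_expect_html for enqueued children). It is illustrative only: the names mirror recur.c, but memory handling uses plain calloc/free instead of wget's xnew/xnew0 and string-ownership wrappers, and the main() driver is invented for the example.

#include <stdio.h>
#include <stdlib.h>

struct queue_element {
  const char *url;              /* the URL to download */
  const char *referer;          /* the referring document */
  int depth;                    /* recursion depth of this URL */
  unsigned int html_allowed :1; /* may the download be parsed as HTML? */
  struct queue_element *next;   /* next element in the queue */
};

struct url_queue {
  struct queue_element *head;   /* dequeue end */
  struct queue_element *tail;   /* enqueue end */
  int count;                    /* number of elements in the queue */
};

/* Append an element at the tail, so retrieval stays breadth-first. */
static void
url_enqueue (struct url_queue *queue, const char *url, const char *referer,
             int depth, int html_allowed)
{
  struct queue_element *qel = calloc (1, sizeof *qel);
  if (!qel)
    abort ();
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->html_allowed = html_allowed != 0;

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;
  if (!queue->head)
    queue->head = qel;
  ++queue->count;
}

/* Pop the head element; return 0 when the queue is empty.  The caller
   receives the stored pointers (in wget it also owns and frees them). */
static int
url_dequeue (struct url_queue *queue, const char **url, const char **referer,
             int *depth, int *html_allowed)
{
  struct queue_element *qel = queue->head;
  if (!qel)
    return 0;

  queue->head = qel->next;
  if (!queue->head)
    queue->tail = NULL;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;
  *html_allowed = qel->html_allowed;
  --queue->count;

  free (qel);
  return 1;
}

int
main (void)
{
  struct url_queue queue = { NULL, NULL, 0 };
  const char *url, *referer;
  int depth, html_allowed;

  /* The start URL is always allowed to be treated as HTML; children
     inherit the flag from link_expect_html in the real code. */
  url_enqueue (&queue, "http://example.com/", NULL, 0, 1);
  url_enqueue (&queue, "http://example.com/logo.png",
               "http://example.com/", 1, 0);

  while (url_dequeue (&queue, &url, &referer, &depth, &html_allowed))
    printf ("depth %d  html_allowed %d  referer %s  %s\n",
            depth, html_allowed, referer ? referer : "(none)", url);
  return 0;
}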
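
The other recurring change in download_child_p is the replacement of the #ifdef HAVE_SSL scheme checks with a single u_scheme_like_http flag computed via schemes_are_similar_p. That helper lives outside this file (in url.c) and is not shown in the diff; the sketch below only illustrates the assumed semantics, namely that HTTP and HTTPS count as the same kind of scheme for recursion, hostname-spanning, and robots.txt purposes.

#include <stdio.h>

/* Assumed semantics of schemes_are_similar_p(); not the actual url.c code. */
enum url_scheme { SCHEME_HTTP, SCHEME_HTTPS, SCHEME_FTP, SCHEME_INVALID };

static int
schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
{
  if (a == b)
    return 1;
  if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
      || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
    return 1;
  return 0;
}

int
main (void)
{
  printf ("http vs https: %d\n",
          schemes_are_similar_p (SCHEME_HTTP, SCHEME_HTTPS));  /* prints 1 */
  printf ("http vs ftp:   %d\n",
          schemes_are_similar_p (SCHEME_HTTP, SCHEME_FTP));    /* prints 0 */
  return 0;
}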