X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Frecur.c;h=4cf1d988943d79779f0e73547b4c1a836b782593;hb=5f0a2b3f0846dd4c2f72fc62e7171200d1fd6e06;hp=a159f119430948995f0f17b108af214aa8803247;hpb=f178e6c61367309bef8ba5789a025d7c2aa05775;p=wget diff --git a/src/recur.c b/src/recur.c index a159f119..4cf1d988 100644 --- a/src/recur.c +++ b/src/recur.c @@ -1,12 +1,12 @@ /* Handling of recursive HTTP retrieving. - Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc. + Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc. This file is part of GNU Wget. GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. + (at your option) any later version. GNU Wget is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -15,7 +15,17 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Wget; if not, write to the Free Software -Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +In addition, as a special exception, the Free Software Foundation +gives permission to link the code of its release of Wget with the +OpenSSL project's "OpenSSL" library (or with modified versions of it +that use the same license as the "OpenSSL" library), and distribute +the linked executables. You must obey the GNU General Public License +in all respects for all of the code used other than "OpenSSL". If you +modify this file, you may extend this exception to your version of the +file, but you are not obligated to do so. If you do not wish to do +so, delete this exception statement from your version. */ #include @@ -39,580 +49,573 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "utils.h" #include "retr.h" #include "ftp.h" -#include "fnmatch.h" #include "host.h" #include "hash.h" #include "res.h" +#include "convert.h" #ifndef errno extern int errno; #endif extern char *version_string; +extern LARGE_INT total_downloaded_bytes; -static struct hash_table *dl_file_url_map; -static struct hash_table *dl_url_file_map; +extern struct hash_table *dl_url_file_map; +extern struct hash_table *downloaded_html_set; + +/* Functions for maintaining the URL queue. */ -/* List of HTML files downloaded in this Wget run. Used for link - conversion after Wget is done. */ -static slist *downloaded_html_files; +struct queue_element { + const char *url; /* the URL to download */ + const char *referer; /* the referring document */ + int depth; /* the depth */ + unsigned int html_allowed :1; /* whether the document is allowed to + be treated as HTML. */ -/* List of undesirable-to-load URLs. */ -static struct hash_table *undesirable_urls; + struct queue_element *next; /* next element in queue */ +}; -/* Current recursion depth. */ -static int depth; +struct url_queue { + struct queue_element *head; + struct queue_element *tail; + int count, maxcount; +}; -/* Base directory we're recursing from (used by no_parent). */ -static char *base_dir; +/* Create a URL queue. */ -static int first_time = 1; +static struct url_queue * +url_queue_new (void) +{ + struct url_queue *queue = xnew0 (struct url_queue); + return queue; +} +/* Delete a URL queue. */ -/* Cleanup the data structures associated with recursive retrieving - (the variables above). */ -void -recursive_cleanup (void) +static void +url_queue_delete (struct url_queue *queue) { - if (undesirable_urls) - { - string_set_free (undesirable_urls); - undesirable_urls = NULL; - } - if (dl_file_url_map) - { - free_keys_and_values (dl_file_url_map); - hash_table_destroy (dl_file_url_map); - dl_file_url_map = NULL; - } - if (dl_url_file_map) - { - free_keys_and_values (dl_url_file_map); - hash_table_destroy (dl_url_file_map); - dl_url_file_map = NULL; - } - undesirable_urls = NULL; - slist_free (downloaded_html_files); - downloaded_html_files = NULL; - FREE_MAYBE (base_dir); - first_time = 1; + xfree (queue); } -/* Reset FIRST_TIME to 1, so that some action can be taken in - recursive_retrieve(). */ -void -recursive_reset (void) +/* Enqueue a URL in the queue. The queue is FIFO: the items will be + retrieved ("dequeued") from the queue in the order they were placed + into it. */ + +static void +url_enqueue (struct url_queue *queue, + const char *url, const char *referer, int depth, int html_allowed) +{ + struct queue_element *qel = xnew (struct queue_element); + qel->url = url; + qel->referer = referer; + qel->depth = depth; + qel->html_allowed = html_allowed; + qel->next = NULL; + + ++queue->count; + if (queue->count > queue->maxcount) + queue->maxcount = queue->count; + + DEBUGP (("Enqueuing %s at depth %d\n", url, depth)); + DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); + + if (queue->tail) + queue->tail->next = qel; + queue->tail = qel; + + if (!queue->head) + queue->head = queue->tail; +} + +/* Take a URL out of the queue. Return 1 if this operation succeeded, + or 0 if the queue is empty. */ + +static int +url_dequeue (struct url_queue *queue, + const char **url, const char **referer, int *depth, + int *html_allowed) { - first_time = 1; + struct queue_element *qel = queue->head; + + if (!qel) + return 0; + + queue->head = queue->head->next; + if (!queue->head) + queue->tail = NULL; + + *url = qel->url; + *referer = qel->referer; + *depth = qel->depth; + *html_allowed = qel->html_allowed; + + --queue->count; + + DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth)); + DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); + + xfree (qel); + return 1; } + +static int download_child_p PARAMS ((const struct urlpos *, struct url *, int, + struct url *, struct hash_table *)); +static int descend_redirect_p PARAMS ((const char *, const char *, int, + struct url *, struct hash_table *)); + + +/* Retrieve a part of the web beginning with START_URL. This used to + be called "recursive retrieval", because the old function was + recursive and implemented depth-first search. retrieve_tree on the + other hand implements breadth-search traversal of the tree, which + results in much nicer ordering of downloads. -/* The core of recursive retrieving. Endless recursion is avoided by - having all URLs stored to a linked list of URLs, which is checked - before loading any URL. That way no URL can get loaded twice. + The algorithm this function uses is simple: + + 1. put START_URL in the queue. + 2. while there are URLs in the queue: + + 3. get next URL from the queue. + 4. download it. + 5. if the URL is HTML and its depth does not exceed maximum depth, + get the list of URLs embedded therein. + 6. for each of those URLs do the following: + + 7. if the URL is not one of those downloaded before, and if it + satisfies the criteria specified by the various command-line + options, add it to the queue. */ - The function also supports specification of maximum recursion depth - and a number of other goodies. */ uerr_t -recursive_retrieve (const char *file, const char *this_url) +retrieve_tree (const char *start_url) { - char *constr, *filename, *newloc; - char *canon_this_url = NULL; - int dt, inl, dash_p_leaf_HTML = FALSE; - int meta_disallow_follow; - int this_url_ftp; /* See below the explanation */ - uerr_t err; - urlpos *url_list, *cur_url; - struct urlinfo *u; - - assert (this_url != NULL); - assert (file != NULL); - /* If quota was exceeded earlier, bail out. */ - if (downloaded_exceeds_quota ()) - return QUOTEXC; - /* Cache the current URL in the list. */ - if (first_time) - { - /* These three operations need to be done only once per Wget - run. They should probably be at a different location. */ - if (!undesirable_urls) - undesirable_urls = make_string_hash_table (0); - - hash_table_clear (undesirable_urls); - string_set_add (undesirable_urls, this_url); - /* Enter this_url to the hash table, in original and "enhanced" form. */ - u = newurl (); - err = parseurl (this_url, u, 0); - if (err == URLOK) - { - string_set_add (undesirable_urls, u->url); - if (opt.no_parent) - base_dir = xstrdup (u->dir); /* Set the base dir. */ - /* Set the canonical this_url to be sent as referer. This - problem exists only when running the first time. */ - canon_this_url = xstrdup (u->url); - } - else - { - DEBUGP (("Double yuck! The *base* URL is broken.\n")); - base_dir = NULL; - } - freeurl (u, 1); - depth = 1; - first_time = 0; - } - else - ++depth; + uerr_t status = RETROK; - if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel) - /* We've exceeded the maximum recursion depth specified by the user. */ - { - if (opt.page_requisites && depth <= opt.reclevel + 1) - /* When -p is specified, we can do one more partial recursion from the - "leaf nodes" on the HTML document tree. The recursion is partial in - that we won't traverse any or tags, nor any tags - except for . */ - dash_p_leaf_HTML = TRUE; - else - /* Either -p wasn't specified or it was and we've already gone the one - extra (pseudo-)level that it affords us, so we need to bail out. */ - { - DEBUGP (("Recursion depth %d exceeded max. depth %d.\n", - depth, opt.reclevel)); - --depth; - return RECLEVELEXC; - } - } + /* The queue of URLs we need to load. */ + struct url_queue *queue; - /* Determine whether this_url is an FTP URL. If it is, it means - that the retrieval is done through proxy. In that case, FTP - links will be followed by default and recursion will not be - turned off when following them. */ - this_url_ftp = (url_scheme (this_url) == SCHEME_FTP); + /* The URLs we do not wish to enqueue, because they are already in + the queue, but haven't been downloaded yet. */ + struct hash_table *blacklist; - /* Get the URL-s from an HTML file: */ - url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url, - dash_p_leaf_HTML, &meta_disallow_follow); + int up_error_code; + struct url *start_url_parsed = url_parse (start_url, &up_error_code); - if (opt.use_robots && meta_disallow_follow) + if (!start_url_parsed) { - /* The META tag says we are not to follow this file. Respect - that. */ - free_urlpos (url_list); - url_list = NULL; + logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, + url_error (up_error_code)); + return URLERROR; } - /* Decide what to do with each of the URLs. A URL will be loaded if - it meets several requirements, discussed later. */ - for (cur_url = url_list; cur_url; cur_url = cur_url->next) + queue = url_queue_new (); + blacklist = make_string_hash_table (0); + + /* Enqueue the starting URL. Use start_url_parsed->url rather than + just URL so we enqueue the canonical form of the URL. */ + url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1); + string_set_add (blacklist, start_url_parsed->url); + + while (1) { - /* If quota was exceeded earlier, bail out. */ - if (downloaded_exceeds_quota ()) + int descend = 0; + char *url, *referer, *file = NULL; + int depth, html_allowed; + boolean dash_p_leaf_HTML = FALSE; + + if (opt.quota && total_downloaded_bytes > opt.quota) + break; + if (status == FWRITEERR) + break; + + /* Get the next URL from the queue... */ + + if (!url_dequeue (queue, + (const char **)&url, (const char **)&referer, + &depth, &html_allowed)) break; - /* Parse the URL for convenient use in other functions, as well - as to get the optimized form. It also checks URL integrity. */ - u = newurl (); - if (parseurl (cur_url->url, u, 0) != URLOK) + + /* ...and download it. Note that this download is in most cases + unconditional, as download_child_p already makes sure a file + doesn't get enqueued twice -- and yet this check is here, and + not in download_child_p. This is so that if you run `wget -r + URL1 URL2', and a random URL is encountered once under URL1 + and again under URL2, but at a different (possibly smaller) + depth, we want the URL's children to be taken into account + the second time. */ + if (dl_url_file_map && hash_table_contains (dl_url_file_map, url)) { - DEBUGP (("Yuck! A bad URL.\n")); - freeurl (u, 1); - continue; + file = xstrdup (hash_table_get (dl_url_file_map, url)); + + DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n", + url, file)); + + if (html_allowed + && downloaded_html_set + && string_set_contains (downloaded_html_set, file)) + descend = 1; } - assert (u->url != NULL); - constr = xstrdup (u->url); - - /* Several checkings whether a file is acceptable to load: - 1. check if URL is ftp, and we don't load it - 2. check for relative links (if relative_only is set) - 3. check for domain - 4. check for no-parent - 5. check for excludes && includes - 6. check for suffix - 7. check for same host (if spanhost is unset), with possible - gethostbyname baggage - 8. check for robots.txt - - Addendum: If the URL is FTP, and it is to be loaded, only the - domain and suffix settings are "stronger". - - Note that .html and (yuck) .htm will get loaded regardless of - suffix rules (but that is remedied later with unlink) unless - the depth equals the maximum depth. - - More time- and memory- consuming tests should be put later on - the list. */ - - /* inl is set if the URL we are working on (constr) is stored in - undesirable_urls. Using it is crucial to avoid unnecessary - repeated continuous hits to the hash table. */ - inl = string_set_contains (undesirable_urls, constr); - - /* If it is FTP, and FTP is not followed, chuck it out. */ - if (!inl) - if (u->scheme == SCHEME_FTP && !opt.follow_ftp && !this_url_ftp) - { - DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n")); - string_set_add (undesirable_urls, constr); - inl = 1; - } - /* If it is absolute link and they are not followed, chuck it - out. */ - if (!inl && u->scheme != SCHEME_FTP) - if (opt.relative_only && !cur_url->link_relative_p) - { - DEBUGP (("It doesn't really look like a relative link.\n")); - string_set_add (undesirable_urls, constr); - inl = 1; - } - /* If its domain is not to be accepted/looked-up, chuck it out. */ - if (!inl) - if (!accept_domain (u)) - { - DEBUGP (("I don't like the smell of that domain.\n")); - string_set_add (undesirable_urls, constr); - inl = 1; - } - /* Check for parent directory. */ - if (!inl && opt.no_parent - /* If the new URL is FTP and the old was not, ignore - opt.no_parent. */ - && !(!this_url_ftp && u->scheme == SCHEME_FTP)) + else { - /* Check for base_dir first. */ - if (!(base_dir && frontcmp (base_dir, u->dir))) + int dt = 0; + char *redirected = NULL; + int oldrec = opt.recursive; + + opt.recursive = 0; + status = retrieve_url (url, &file, &redirected, referer, &dt); + opt.recursive = oldrec; + + if (html_allowed && file && status == RETROK + && (dt & RETROKF) && (dt & TEXTHTML)) + descend = 1; + + if (redirected) { - /* Failing that, check for parent dir. */ - struct urlinfo *ut = newurl (); - if (parseurl (this_url, ut, 0) != URLOK) - DEBUGP (("Double yuck! The *base* URL is broken.\n")); - else if (!frontcmp (ut->dir, u->dir)) + /* We have been redirected, possibly to another host, or + different path, or wherever. Check whether we really + want to follow it. */ + if (descend) { - /* Failing that too, kill the URL. */ - DEBUGP (("Trying to escape parental guidance with no_parent on.\n")); - string_set_add (undesirable_urls, constr); - inl = 1; + if (!descend_redirect_p (redirected, url, depth, + start_url_parsed, blacklist)) + descend = 0; + else + /* Make sure that the old pre-redirect form gets + blacklisted. */ + string_set_add (blacklist, url); } - freeurl (ut, 1); + + xfree (url); + url = redirected; } } - /* If the file does not match the acceptance list, or is on the - rejection list, chuck it out. The same goes for the - directory exclude- and include- lists. */ - if (!inl && (opt.includes || opt.excludes)) + + if (descend + && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION) { - if (!accdir (u->dir, ALLABS)) + if (opt.page_requisites + && (depth == opt.reclevel || depth == opt.reclevel + 1)) { - DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir)); - string_set_add (undesirable_urls, constr); - inl = 1; + /* When -p is specified, we are allowed to exceed the + maximum depth, but only for the "inline" links, + i.e. those that are needed to display the page. + Originally this could exceed the depth at most by + one, but we allow one more level so that the leaf + pages that contain frames can be loaded + correctly. */ + dash_p_leaf_HTML = TRUE; } - } - if (!inl) - { - char *suf = NULL; - /* We check for acceptance/rejection rules only for non-HTML - documents. Since we don't know whether they really are - HTML, it will be deduced from (an OR-ed list): - - 1) u->file is "" (meaning it is a directory) - 2) suffix exists, AND: - a) it is "html", OR - b) it is "htm" - - If the file *is* supposed to be HTML, it will *not* be - subject to acc/rej rules, unless a finite maximum depth has - been specified and the current depth is the maximum depth. */ - if (! - (!*u->file - || (((suf = suffix (constr)) != NULL) - && ((!strcmp (suf, "html") || !strcmp (suf, "htm")) - && ((opt.reclevel != INFINITE_RECURSION) && - (depth != opt.reclevel)))))) + else { - if (!acceptable (u->file)) - { - DEBUGP (("%s (%s) does not match acc/rej rules.\n", - constr, u->file)); - string_set_add (undesirable_urls, constr); - inl = 1; - } + /* Either -p wasn't specified or it was and we've + already spent the two extra (pseudo-)levels that it + affords us, so we need to bail out. */ + DEBUGP (("Not descending further; at depth %d, max. %d.\n", + depth, opt.reclevel)); + descend = 0; } - FREE_MAYBE (suf); } - /* Optimize the URL (which includes possible DNS lookup) only - after all other possibilities have been exhausted. */ - if (!inl) + + /* If the downloaded document was HTML, parse it and enqueue the + links it contains. */ + + if (descend) { - if (!opt.simple_check) - opt_url (u); - else + int meta_disallow_follow = 0; + struct urlpos *children + = get_urls_html (file, url, &meta_disallow_follow); + + if (opt.use_robots && meta_disallow_follow) { - char *p; - /* Just lowercase the hostname. */ - for (p = u->host; *p; p++) - *p = TOLOWER (*p); - xfree (u->url); - u->url = str_url (u, 0); + free_urlpos (children); + children = NULL; } - xfree (constr); - constr = xstrdup (u->url); - /* After we have canonicalized the URL, check if we have it - on the black list. */ - if (string_set_contains (undesirable_urls, constr)) - inl = 1; - /* This line is bogus. */ - /*string_set_add (undesirable_urls, constr);*/ - - if (!inl && !((u->scheme == SCHEME_FTP) && !this_url_ftp)) - if (!opt.spanhost && this_url && !same_host (this_url, constr)) - { - DEBUGP (("This is not the same hostname as the parent's.\n")); - string_set_add (undesirable_urls, constr); - inl = 1; - } - } - /* What about robots.txt? */ - if (!inl && opt.use_robots && u->scheme == SCHEME_FTP) - { - struct robot_specs *specs = res_get_specs (u->host, u->port); - if (!specs) + + if (children) { - char *rfile; - if (res_retrieve_file (constr, &rfile)) - { - specs = res_parse_from_file (rfile); - xfree (rfile); - } - else + struct urlpos *child = children; + struct url *url_parsed = url_parsed = url_parse (url, NULL); + assert (url_parsed != NULL); + + for (; child; child = child->next) { - /* If we cannot get real specs, at least produce - dummy ones so that we can register them and stop - trying to retrieve them. */ - specs = res_parse ("", 0); + if (child->ignore_when_downloading) + continue; + if (dash_p_leaf_HTML && !child->link_inline_p) + continue; + if (download_child_p (child, url_parsed, depth, start_url_parsed, + blacklist)) + { + url_enqueue (queue, xstrdup (child->url->url), + xstrdup (url), depth + 1, + child->link_expect_html); + /* We blacklist the URL we have enqueued, because we + don't want to enqueue (and hence download) the + same URL twice. */ + string_set_add (blacklist, child->url->url); + } } - res_register_specs (u->host, u->port, specs); - } - /* Now that we have (or don't have) robots.txt specs, we can - check what they say. */ - if (!res_match_path (specs, u->path)) - { - DEBUGP (("Not following %s because robots.txt forbids it.\n", - constr)); - string_set_add (undesirable_urls, constr); - inl = 1; + url_free (url_parsed); + free_urlpos (children); } } - filename = NULL; - /* If it wasn't chucked out, do something with it. */ - if (!inl) + if (opt.delete_after || (file && !acceptable (file))) { - DEBUGP (("I've decided to load it -> ")); - /* Add it to the list of already-loaded URL-s. */ - string_set_add (undesirable_urls, constr); - /* Automatically followed FTPs will *not* be downloaded - recursively. */ - if (u->scheme == SCHEME_FTP) - { - /* Don't you adore side-effects? */ - opt.recursive = 0; - } - /* Reset its type. */ - dt = 0; - /* Retrieve it. */ - retrieve_url (constr, &filename, &newloc, - canon_this_url ? canon_this_url : this_url, &dt); - if (u->scheme == SCHEME_FTP) - { - /* Restore... */ - opt.recursive = 1; - } - if (newloc) - { - xfree (constr); - constr = newloc; - } - /* If there was no error, and the type is text/html, parse - it recursively. */ - if (dt & TEXTHTML) - { - if (dt & RETROKF) - recursive_retrieve (filename, constr); - } - else - DEBUGP (("%s is not text/html so we don't chase.\n", - filename ? filename: "(null)")); - - if (opt.delete_after || (filename && !acceptable (filename))) - /* Either --delete-after was specified, or we loaded this otherwise - rejected (e.g. by -R) HTML file just so we could harvest its - hyperlinks -- in either case, delete the local file. */ - { - DEBUGP (("Removing file due to %s in recursive_retrieve():\n", - opt.delete_after ? "--delete-after" : - "recursive rejection criteria")); - logprintf (LOG_VERBOSE, - (opt.delete_after ? _("Removing %s.\n") - : _("Removing %s since it should be rejected.\n")), - filename); - if (unlink (filename)) - logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno)); - dt &= ~RETROKF; - } - - /* If everything was OK, and links are to be converted, let's - store the local filename. */ - if (opt.convert_links && (dt & RETROKF) && (filename != NULL)) - { - cur_url->convert = CO_CONVERT_TO_RELATIVE; - cur_url->local_name = xstrdup (filename); - } + /* Either --delete-after was specified, or we loaded this + otherwise rejected (e.g. by -R) HTML file just so we + could harvest its hyperlinks -- in either case, delete + the local file. */ + DEBUGP (("Removing file due to %s in recursive_retrieve():\n", + opt.delete_after ? "--delete-after" : + "recursive rejection criteria")); + logprintf (LOG_VERBOSE, + (opt.delete_after + ? _("Removing %s.\n") + : _("Removing %s since it should be rejected.\n")), + file); + if (unlink (file)) + logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno)); + register_delete_file (file); } - else - DEBUGP (("%s already in list, so we don't load.\n", constr)); - /* Free filename and constr. */ - FREE_MAYBE (filename); - FREE_MAYBE (constr); - freeurl (u, 1); - /* Increment the pbuf for the appropriate size. */ + + xfree (url); + FREE_MAYBE (referer); + FREE_MAYBE (file); } - if (opt.convert_links && !opt.delete_after) - /* This is merely the first pass: the links that have been - successfully downloaded are converted. In the second pass, - convert_all_links() will also convert those links that have NOT - been downloaded to their canonical form. */ - convert_links (file, url_list); - /* Free the linked list of URL-s. */ - free_urlpos (url_list); - /* Free the canonical this_url. */ - FREE_MAYBE (canon_this_url); - /* Decrement the recursion depth. */ - --depth; - if (downloaded_exceeds_quota ()) + + /* If anything is left of the queue due to a premature exit, free it + now. */ + { + char *d1, *d2; + int d3, d4; + while (url_dequeue (queue, + (const char **)&d1, (const char **)&d2, &d3, &d4)) + { + xfree (d1); + FREE_MAYBE (d2); + } + } + url_queue_delete (queue); + + if (start_url_parsed) + url_free (start_url_parsed); + string_set_free (blacklist); + + if (opt.quota && total_downloaded_bytes > opt.quota) return QUOTEXC; + else if (status == FWRITEERR) + return FWRITEERR; else return RETROK; } - -void -register_download (const char *url, const char *file) -{ - if (!opt.convert_links) - return; - if (!dl_file_url_map) - dl_file_url_map = make_string_hash_table (0); - hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url)); - if (!dl_url_file_map) - dl_url_file_map = make_string_hash_table (0); - hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file)); -} -void -register_html (const char *url, const char *file) +/* Based on the context provided by retrieve_tree, decide whether a + URL is to be descended to. This is only ever called from + retrieve_tree, but is in a separate function for clarity. + + The most expensive checks (such as those for robots) are memoized + by storing these URLs to BLACKLIST. This may or may not help. It + will help if those URLs are encountered many times. */ + +static int +download_child_p (const struct urlpos *upos, struct url *parent, int depth, + struct url *start_url_parsed, struct hash_table *blacklist) { - if (!opt.convert_links) - return; - downloaded_html_files = slist_prepend (downloaded_html_files, file); -} + struct url *u = upos->url; + const char *url = u->url; + int u_scheme_like_http; -/* convert_links() is called from recursive_retrieve() after we're - done with an HTML file. This call to convert_links is not complete - because it converts only the downloaded files, and Wget cannot know - which files will be downloaded afterwards. So, if we have file - fileone.html with: + DEBUGP (("Deciding whether to enqueue \"%s\".\n", url)); - + if (string_set_contains (blacklist, url)) + { + DEBUGP (("Already on the black list.\n")); + goto out; + } - and /c/something.gif was not downloaded because it exceeded the - recursion depth, the reference will *not* be changed. + /* Several things to check for: + 1. if scheme is not http, and we don't load it + 2. check for relative links (if relative_only is set) + 3. check for domain + 4. check for no-parent + 5. check for excludes && includes + 6. check for suffix + 7. check for same host (if spanhost is unset), with possible + gethostbyname baggage + 8. check for robots.txt - However, later we can encounter /c/something.gif from an "upper" - level HTML (let's call it filetwo.html), and it gets downloaded. + Addendum: If the URL is FTP, and it is to be loaded, only the + domain and suffix settings are "stronger". - But now we have a problem because /c/something.gif will be - correctly transformed in filetwo.html, but not in fileone.html, - since Wget could not have known that /c/something.gif will be - downloaded in the future. + Note that .html files will get loaded regardless of suffix rules + (but that is remedied later with unlink) unless the depth equals + the maximum depth. - This is why Wget must, after the whole retrieval, call - convert_all_links to go once more through the entire list of - retrieved HTMLs, and re-convert them. + More time- and memory- consuming tests should be put later on + the list. */ - All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs - in urls_downloaded. From these two lists information is - extracted. */ -void -convert_all_links (void) -{ - slist *html; + /* Determine whether URL under consideration has a HTTP-like scheme. */ + u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP); - /* Destructively reverse downloaded_html_files to get it in the right order. - recursive_retrieve() used slist_prepend() consistently. */ - downloaded_html_files = slist_nreverse (downloaded_html_files); + /* 1. Schemes other than HTTP are normally not recursed into. */ + if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp)) + { + DEBUGP (("Not following non-HTTP schemes.\n")); + goto out; + } - for (html = downloaded_html_files; html; html = html->next) + /* 2. If it is an absolute link and they are not followed, throw it + out. */ + if (u_scheme_like_http) + if (opt.relative_only && !upos->link_relative_p) + { + DEBUGP (("It doesn't really look like a relative link.\n")); + goto out; + } + + /* 3. If its domain is not to be accepted/looked-up, chuck it + out. */ + if (!accept_domain (u)) { - urlpos *urls, *cur_url; - char *url; - - DEBUGP (("Rescanning %s\n", html->string)); - /* Determine the URL of the HTML file. get_urls_html will need - it. */ - url = hash_table_get (dl_file_url_map, html->string); - if (url) - DEBUGP (("It should correspond to %s.\n", url)); - else - DEBUGP (("I cannot find the corresponding URL.\n")); - /* Parse the HTML file... */ - urls = get_urls_html (html->string, url, FALSE, NULL); - /* We don't respect meta_disallow_follow here because, even if - the file is not followed, we might still want to convert the - links that have been followed from other files. */ - for (cur_url = urls; cur_url; cur_url = cur_url->next) + DEBUGP (("The domain was not accepted.\n")); + goto out; + } + + /* 4. Check for parent directory. + + If we descended to a different host or changed the scheme, ignore + opt.no_parent. Also ignore it for documents needed to display + the parent page when in -p mode. */ + if (opt.no_parent + && schemes_are_similar_p (u->scheme, start_url_parsed->scheme) + && 0 == strcasecmp (u->host, start_url_parsed->host) + && u->port == start_url_parsed->port + && !(opt.page_requisites && upos->link_inline_p)) + { + if (!frontcmp (start_url_parsed->dir, u->dir)) { - char *local_name; + DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n", + u->dir, start_url_parsed->dir)); + goto out; + } + } - /* The URL must be in canonical form to be compared. */ - struct urlinfo *u = newurl (); - uerr_t res = parseurl (cur_url->url, u, 0); - if (res != URLOK) - { - freeurl (u, 1); - continue; - } - /* We decide the direction of conversion according to whether - a URL was downloaded. Downloaded URLs will be converted - ABS2REL, whereas non-downloaded will be converted REL2ABS. */ - local_name = hash_table_get (dl_url_file_map, u->url); - if (local_name) - DEBUGP (("%s marked for conversion, local %s\n", - u->url, local_name)); - /* Decide on the conversion direction. */ - if (local_name) + /* 5. If the file does not match the acceptance list, or is on the + rejection list, chuck it out. The same goes for the directory + exclusion and inclusion lists. */ + if (opt.includes || opt.excludes) + { + if (!accdir (u->dir, ALLABS)) + { + DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir)); + goto out; + } + } + + /* 6. Check for acceptance/rejection rules. We ignore these rules + for directories (no file name to match) and for HTML documents, + which might lead to other files that do need to be downloaded. + That is, unless we've exhausted the recursion depth anyway. */ + if (u->file[0] != '\0' + && !(has_html_suffix_p (u->file) + && depth != INFINITE_RECURSION + && depth < opt.reclevel - 1)) + { + if (!acceptable (u->file)) + { + DEBUGP (("%s (%s) does not match acc/rej rules.\n", + url, u->file)); + goto out; + } + } + + /* 7. */ + if (schemes_are_similar_p (u->scheme, parent->scheme)) + if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host)) + { + DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n", + u->host, parent->host)); + goto out; + } + + /* 8. */ + if (opt.use_robots && u_scheme_like_http) + { + struct robot_specs *specs = res_get_specs (u->host, u->port); + if (!specs) + { + char *rfile; + if (res_retrieve_file (url, &rfile)) { - /* We've downloaded this URL. Convert it to relative - form. We do this even if the URL already is in - relative form, because our directory structure may - not be identical to that on the server (think `-nd', - `--cut-dirs', etc.) */ - cur_url->convert = CO_CONVERT_TO_RELATIVE; - cur_url->local_name = xstrdup (local_name); + specs = res_parse_from_file (rfile); + xfree (rfile); } else { - /* We haven't downloaded this URL. If it's not already - complete (including a full host name), convert it to - that form, so it can be reached while browsing this - HTML locally. */ - if (!cur_url->link_complete_p) - cur_url->convert = CO_CONVERT_TO_COMPLETE; - cur_url->local_name = NULL; + /* If we cannot get real specs, at least produce + dummy ones so that we can register them and stop + trying to retrieve them. */ + specs = res_parse ("", 0); } - freeurl (u, 1); + res_register_specs (u->host, u->port, specs); + } + + /* Now that we have (or don't have) robots.txt specs, we can + check what they say. */ + if (!res_match_path (specs, u->path)) + { + DEBUGP (("Not following %s because robots.txt forbids it.\n", url)); + string_set_add (blacklist, url); + goto out; } - /* Convert the links in the file. */ - convert_links (html->string, urls); - /* Free the data. */ - free_urlpos (urls); } + + /* The URL has passed all the tests. It can be placed in the + download queue. */ + DEBUGP (("Decided to load it.\n")); + + return 1; + + out: + DEBUGP (("Decided NOT to load it.\n")); + + return 0; +} + +/* This function determines whether we will consider downloading the + children of a URL whose download resulted in a redirection, + possibly to another host, etc. It is needed very rarely, and thus + it is merely a simple-minded wrapper around download_child_p. */ + +static int +descend_redirect_p (const char *redirected, const char *original, int depth, + struct url *start_url_parsed, struct hash_table *blacklist) +{ + struct url *orig_parsed, *new_parsed; + struct urlpos *upos; + int success; + + orig_parsed = url_parse (original, NULL); + assert (orig_parsed != NULL); + + new_parsed = url_parse (redirected, NULL); + assert (new_parsed != NULL); + + upos = xnew0 (struct urlpos); + upos->url = new_parsed; + + success = download_child_p (upos, orig_parsed, depth, + start_url_parsed, blacklist); + + url_free (orig_parsed); + url_free (new_parsed); + xfree (upos); + + if (!success) + DEBUGP (("Redirection \"%s\" failed the test.\n", redirected)); + + return success; }