X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Frecur.c;h=71fbe7bf1f73767b47b105e44042cb6e81cb8c88;hp=98c5597035f9d6d29fa0c8f4bf51495db68b2b35;hb=84395897ad2d1c107be470946daba744b2e7ebe8;hpb=2ffb47eabf9fe89d513dc79bdc535e4092e1d6ee diff --git a/src/recur.c b/src/recur.c index 98c55970..71fbe7bf 100644 --- a/src/recur.c +++ b/src/recur.c @@ -1,893 +1,707 @@ /* Handling of recursive HTTP retrieving. - Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc. + Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. -This file is part of Wget. +This file is part of GNU Wget. -This program is free software; you can redistribute it and/or modify +GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. +the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. -This program is distributed in the hope that it will be useful, +GNU Wget is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +along with Wget. If not, see . -#include +Additional permission under GNU GPL version 3 section 7 + +If you modify this program, or any covered work, by linking or +combining it with the OpenSSL project's OpenSSL library (or a +modified version of that library), containing parts covered by the +terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +grants you additional permission to convey the resulting work. +Corresponding Source for a non-source form of such a combination +shall include the source code for the parts of OpenSSL used as well +as that of the covered work. */ + +#include "wget.h" #include #include -#ifdef HAVE_STRING_H -# include -#else -# include -#endif /* HAVE_STRING_H */ +#include #ifdef HAVE_UNISTD_H # include #endif /* HAVE_UNISTD_H */ #include #include -#include -#include -#include "wget.h" #include "url.h" #include "recur.h" #include "utils.h" #include "retr.h" #include "ftp.h" -#include "fnmatch.h" #include "host.h" #include "hash.h" +#include "res.h" +#include "convert.h" +#include "html-url.h" +#include "css-url.h" +#include "spider.h" + +/* Functions for maintaining the URL queue. */ + +struct queue_element { + const char *url; /* the URL to download */ + const char *referer; /* the referring document */ + int depth; /* the depth */ + bool html_allowed; /* whether the document is allowed to + be treated as HTML. */ + struct iri *iri; /* sXXXav */ + bool css_allowed; /* whether the document is allowed to + be treated as CSS. */ + struct queue_element *next; /* next element in queue */ +}; + +struct url_queue { + struct queue_element *head; + struct queue_element *tail; + int count, maxcount; +}; + +/* Create a URL queue. */ + +static struct url_queue * +url_queue_new (void) +{ + struct url_queue *queue = xnew0 (struct url_queue); + return queue; +} -extern char *version_string; - -#define ROBOTS_FILENAME "robots.txt" +/* Delete a URL queue. 
*/ -static struct hash_table *dl_file_url_map; -static struct hash_table *dl_url_file_map; +static void +url_queue_delete (struct url_queue *queue) +{ + xfree (queue); +} -/* List of HTML URLs. */ -static slist *urls_html; +/* Enqueue a URL in the queue. The queue is FIFO: the items will be + retrieved ("dequeued") from the queue in the order they were placed + into it. */ -/* List of undesirable-to-load URLs. */ -static struct hash_table *undesirable_urls; +static void +url_enqueue (struct url_queue *queue, struct iri *i, + const char *url, const char *referer, int depth, + bool html_allowed, bool css_allowed) +{ + struct queue_element *qel = xnew (struct queue_element); + qel->iri = i; + qel->url = url; + qel->referer = referer; + qel->depth = depth; + qel->html_allowed = html_allowed; + qel->css_allowed = css_allowed; + qel->next = NULL; + + ++queue->count; + if (queue->count > queue->maxcount) + queue->maxcount = queue->count; + + DEBUGP (("Enqueuing %s at depth %d\n", url, depth)); + DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); + + if (i) + DEBUGP (("[IRI Enqueuing %s with %s\n", quote (url), + i->uri_encoding ? quote (i->uri_encoding) : "None")); + + if (queue->tail) + queue->tail->next = qel; + queue->tail = qel; + + if (!queue->head) + queue->head = queue->tail; +} -/* List of forbidden locations. */ -static char **forbidden = NULL; +/* Take a URL out of the queue. Return true if this operation + succeeded, or false if the queue is empty. */ -/* Current recursion depth. */ -static int depth; +static bool +url_dequeue (struct url_queue *queue, struct iri **i, + const char **url, const char **referer, int *depth, + bool *html_allowed, bool *css_allowed) +{ + struct queue_element *qel = queue->head; -/* Base directory we're recursing from (used by no_parent). */ -static char *base_dir; + if (!qel) + return false; -/* The host name for which we last checked robots. */ -static char *robots_host; + queue->head = queue->head->next; + if (!queue->head) + queue->tail = NULL; -static int first_time = 1; + *i = qel->iri; + *url = qel->url; + *referer = qel->referer; + *depth = qel->depth; + *html_allowed = qel->html_allowed; + *css_allowed = qel->css_allowed; -/* Construct the robots URL. */ -static struct urlinfo *robots_url PARAMS ((const char *, const char *)); -static uerr_t retrieve_robots PARAMS ((const char *, const char *)); -static char **parse_robots PARAMS ((const char *)); -static int robots_match PARAMS ((struct urlinfo *, char **)); + --queue->count; + DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth)); + DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); -/* Cleanup the data structures associated with recursive retrieving - (the variables above). 
*/ -void -recursive_cleanup (void) -{ - if (undesirable_urls) - { - string_set_free (undesirable_urls); - undesirable_urls = NULL; - } - if (dl_file_url_map) - { - free_keys_and_values (dl_file_url_map); - hash_table_destroy (dl_file_url_map); - dl_file_url_map = NULL; - } - if (dl_url_file_map) - { - free_keys_and_values (dl_url_file_map); - hash_table_destroy (dl_url_file_map); - dl_url_file_map = NULL; - } - undesirable_urls = NULL; - free_vec (forbidden); - forbidden = NULL; - slist_free (urls_html); - urls_html = NULL; - FREE_MAYBE (base_dir); - FREE_MAYBE (robots_host); - first_time = 1; + xfree (qel); + return true; } + +static bool download_child_p (const struct urlpos *, struct url *, int, + struct url *, struct hash_table *, struct iri *); +static bool descend_redirect_p (const char *, const char *, int, + struct url *, struct hash_table *, struct iri *); -/* Reset FIRST_TIME to 1, so that some action can be taken in - recursive_retrieve(). */ -void -recursive_reset (void) -{ - first_time = 1; -} -/* The core of recursive retrieving. Endless recursion is avoided by - having all URLs stored to a linked list of URLs, which is checked - before loading any URL. That way no URL can get loaded twice. +/* Retrieve a part of the web beginning with START_URL. This used to + be called "recursive retrieval", because the old function was + recursive and implemented depth-first search. retrieve_tree on the + other hand implements breadth-search traversal of the tree, which + results in much nicer ordering of downloads. + + The algorithm this function uses is simple: + + 1. put START_URL in the queue. + 2. while there are URLs in the queue: + + 3. get next URL from the queue. + 4. download it. + 5. if the URL is HTML and its depth does not exceed maximum depth, + get the list of URLs embedded therein. + 6. for each of those URLs do the following: + + 7. if the URL is not one of those downloaded before, and if it + satisfies the criteria specified by the various command-line + options, add it to the queue. */ - The function also supports specification of maximum recursion depth - and a number of other goodies. */ uerr_t -recursive_retrieve (const char *file, const char *this_url) +retrieve_tree (const char *start_url) { - char *constr, *filename, *newloc; - char *canon_this_url = NULL; - int dt, inl, dash_p_leaf_HTML = FALSE; - int meta_disallow_follow; - int this_url_ftp; /* See below the explanation */ - uerr_t err; - struct urlinfo *rurl; - urlpos *url_list, *cur_url; - char *rfile; /* For robots */ - struct urlinfo *u; - - assert (this_url != NULL); - assert (file != NULL); - /* If quota was exceeded earlier, bail out. */ - if (downloaded_exceeds_quota ()) - return QUOTEXC; - /* Cache the current URL in the list. */ - if (first_time) - { - /* These three operations need to be done only once per Wget - run. They should probably be at a different location. */ - if (!undesirable_urls) - undesirable_urls = make_string_hash_table (0); - if (!dl_file_url_map) - dl_file_url_map = make_string_hash_table (0); - if (!dl_url_file_map) - dl_url_file_map = make_string_hash_table (0); - - hash_table_clear (undesirable_urls); - string_set_add (undesirable_urls, this_url); - hash_table_clear (dl_file_url_map); - hash_table_clear (dl_url_file_map); - urls_html = NULL; - /* Enter this_url to the hash table, in original and "enhanced" form. 
*/ - u = newurl (); - err = parseurl (this_url, u, 0); - if (err == URLOK) - { - string_set_add (undesirable_urls, u->url); - hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (u->url)); - hash_table_put (dl_url_file_map, xstrdup (u->url), xstrdup (file)); - urls_html = slist_prepend (urls_html, file); - if (opt.no_parent) - base_dir = xstrdup (u->dir); /* Set the base dir. */ - /* Set the canonical this_url to be sent as referer. This - problem exists only when running the first time. */ - canon_this_url = xstrdup (u->url); - } - else - { - DEBUGP (("Double yuck! The *base* URL is broken.\n")); - base_dir = NULL; - } - freeurl (u, 1); - depth = 1; - robots_host = NULL; - forbidden = NULL; - first_time = 0; - } - else - ++depth; + uerr_t status = RETROK; - if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel) - /* We've exceeded the maximum recursion depth specified by the user. */ - { - if (opt.page_requisites && depth <= opt.reclevel + 1) - /* When -p is specified, we can do one more partial recursion from the - "leaf nodes" on the HTML document tree. The recursion is partial in - that we won't traverse any or tags, nor any tags - except for . */ - dash_p_leaf_HTML = TRUE; - else - /* Either -p wasn't specified or it was and we've already gone the one - extra (pseudo-)level that it affords us, so we need to bail out. */ - { - DEBUGP (("Recursion depth %d exceeded max. depth %d.\n", - depth, opt.reclevel)); - --depth; - return RECLEVELEXC; - } - } + /* The queue of URLs we need to load. */ + struct url_queue *queue; - /* Determine whether this_url is an FTP URL. If it is, it means - that the retrieval is done through proxy. In that case, FTP - links will be followed by default and recursion will not be - turned off when following them. */ - this_url_ftp = (urlproto (this_url) == URLFTP); + /* The URLs we do not wish to enqueue, because they are already in + the queue, but haven't been downloaded yet. */ + struct hash_table *blacklist; - /* Get the URL-s from an HTML file: */ - url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url, - dash_p_leaf_HTML, &meta_disallow_follow); + int up_error_code; + struct url *start_url_parsed; + struct iri *i = iri_new (); + set_uri_encoding (i, opt.locale, true); - if (opt.use_robots && meta_disallow_follow) + start_url_parsed = url_parse (start_url, &up_error_code, i); + if (!start_url_parsed) { - /* The META tag says we are not to follow this file. Respect - that. */ - free_urlpos (url_list); - url_list = NULL; + logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, + url_error (up_error_code)); + return URLERROR; } - /* Decide what to do with each of the URLs. A URL will be loaded if - it meets several requirements, discussed later. */ - for (cur_url = url_list; cur_url; cur_url = cur_url->next) + queue = url_queue_new (); + blacklist = make_string_hash_table (0); + + /* Enqueue the starting URL. Use start_url_parsed->url rather than + just URL so we enqueue the canonical form of the URL. */ + url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true, + false); + string_set_add (blacklist, start_url_parsed->url); + + while (1) { - /* If quota was exceeded earlier, bail out. */ - if (downloaded_exceeds_quota ()) - break; - /* Parse the URL for convenient use in other functions, as well - as to get the optimized form. It also checks URL integrity. */ - u = newurl (); - if (parseurl (cur_url->url, u, 0) != URLOK) - { - DEBUGP (("Yuck! 
A bad URL.\n")); - freeurl (u, 1); - continue; - } - if (u->proto == URLFILE) - { - DEBUGP (("Nothing to do with file:// around here.\n")); - freeurl (u, 1); - continue; - } - assert (u->url != NULL); - constr = xstrdup (u->url); - - /* Several checkings whether a file is acceptable to load: - 1. check if URL is ftp, and we don't load it - 2. check for relative links (if relative_only is set) - 3. check for domain - 4. check for no-parent - 5. check for excludes && includes - 6. check for suffix - 7. check for same host (if spanhost is unset), with possible - gethostbyname baggage - 8. check for robots.txt - - Addendum: If the URL is FTP, and it is to be loaded, only the - domain and suffix settings are "stronger". - - Note that .html and (yuck) .htm will get loaded regardless of - suffix rules (but that is remedied later with unlink) unless - the depth equals the maximum depth. - - More time- and memory- consuming tests should be put later on - the list. */ - - /* inl is set if the URL we are working on (constr) is stored in - undesirable_urls. Using it is crucial to avoid unnecessary - repeated continuous hits to the hash table. */ - inl = string_set_exists (undesirable_urls, constr); - - /* If it is FTP, and FTP is not followed, chuck it out. */ - if (!inl) - if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp) - { - DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n")); - string_set_add (undesirable_urls, constr); - inl = 1; - } - /* If it is absolute link and they are not followed, chuck it - out. */ - if (!inl && u->proto != URLFTP) - if (opt.relative_only && !cur_url->link_relative_p) - { - DEBUGP (("It doesn't really look like a relative link.\n")); - string_set_add (undesirable_urls, constr); - inl = 1; - } - /* If its domain is not to be accepted/looked-up, chuck it out. */ - if (!inl) - if (!accept_domain (u)) - { - DEBUGP (("I don't like the smell of that domain.\n")); - string_set_add (undesirable_urls, constr); - inl = 1; - } - /* Check for parent directory. */ - if (!inl && opt.no_parent - /* If the new URL is FTP and the old was not, ignore - opt.no_parent. */ - && !(!this_url_ftp && u->proto == URLFTP)) - { - /* Check for base_dir first. */ - if (!(base_dir && frontcmp (base_dir, u->dir))) - { - /* Failing that, check for parent dir. */ - struct urlinfo *ut = newurl (); - if (parseurl (this_url, ut, 0) != URLOK) - DEBUGP (("Double yuck! The *base* URL is broken.\n")); - else if (!frontcmp (ut->dir, u->dir)) - { - /* Failing that too, kill the URL. */ - DEBUGP (("Trying to escape parental guidance with no_parent on.\n")); - string_set_add (undesirable_urls, constr); - inl = 1; - } - freeurl (ut, 1); - } - } - /* If the file does not match the acceptance list, or is on the - rejection list, chuck it out. The same goes for the - directory exclude- and include- lists. */ - if (!inl && (opt.includes || opt.excludes)) - { - if (!accdir (u->dir, ALLABS)) - { - DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir)); - string_set_add (undesirable_urls, constr); - inl = 1; - } - } - if (!inl) - { - char *suf = NULL; - /* We check for acceptance/rejection rules only for non-HTML - documents. 
Since we don't know whether they really are - HTML, it will be deduced from (an OR-ed list): - - 1) u->file is "" (meaning it is a directory) - 2) suffix exists, AND: - a) it is "html", OR - b) it is "htm" - - If the file *is* supposed to be HTML, it will *not* be - subject to acc/rej rules, unless a finite maximum depth has - been specified and the current depth is the maximum depth. */ - if (! - (!*u->file - || (((suf = suffix (constr)) != NULL) - && ((!strcmp (suf, "html") || !strcmp (suf, "htm")) - && ((opt.reclevel != INFINITE_RECURSION) && - (depth != opt.reclevel)))))) - { - if (!acceptable (u->file)) - { - DEBUGP (("%s (%s) does not match acc/rej rules.\n", - constr, u->file)); - string_set_add (undesirable_urls, constr); - inl = 1; - } - } - FREE_MAYBE (suf); - } - /* Optimize the URL (which includes possible DNS lookup) only - after all other possibilities have been exhausted. */ - if (!inl) - { - if (!opt.simple_check) - opt_url (u); - else - { - char *p; - /* Just lowercase the hostname. */ - for (p = u->host; *p; p++) - *p = TOLOWER (*p); - xfree (u->url); - u->url = str_url (u, 0); - } - xfree (constr); - constr = xstrdup (u->url); - string_set_add (undesirable_urls, constr); - if (!inl && !((u->proto == URLFTP) && !this_url_ftp)) - if (!opt.spanhost && this_url && !same_host (this_url, constr)) - { - DEBUGP (("This is not the same hostname as the parent's.\n")); - string_set_add (undesirable_urls, constr); - inl = 1; - } - } - /* What about robots.txt? */ - if (!inl && opt.use_robots && u->proto == URLHTTP) - { - /* Since Wget knows about only one set of robot rules at a - time, /robots.txt must be reloaded whenever a new host is - accessed. - - robots_host holds the host the current `forbid' variable - is assigned to. */ - if (!robots_host || !same_host (robots_host, u->host)) - { - FREE_MAYBE (robots_host); - /* Now make robots_host the new host, no matter what the - result will be. So if there is no /robots.txt on the - site, Wget will not retry getting robots all the - time. */ - robots_host = xstrdup (u->host); - free_vec (forbidden); - forbidden = NULL; - err = retrieve_robots (constr, ROBOTS_FILENAME); - if (err == ROBOTSOK) - { - rurl = robots_url (constr, ROBOTS_FILENAME); - rfile = url_filename (rurl); - forbidden = parse_robots (rfile); - freeurl (rurl, 1); - xfree (rfile); - } - } - - /* Now that we have (or don't have) robots, we can check for - them. */ - if (!robots_match (u, forbidden)) - { - DEBUGP (("Stuffing %s because %s forbids it.\n", this_url, - ROBOTS_FILENAME)); - string_set_add (undesirable_urls, constr); - inl = 1; - } - } - - filename = NULL; - /* If it wasn't chucked out, do something with it. */ - if (!inl) - { - DEBUGP (("I've decided to load it -> ")); - /* Add it to the list of already-loaded URL-s. */ - string_set_add (undesirable_urls, constr); - /* Automatically followed FTPs will *not* be downloaded - recursively. */ - if (u->proto == URLFTP) - { - /* Don't you adore side-effects? */ - opt.recursive = 0; - } - /* Reset its type. */ - dt = 0; - /* Retrieve it. */ - retrieve_url (constr, &filename, &newloc, - canon_this_url ? canon_this_url : this_url, &dt); - if (u->proto == URLFTP) - { - /* Restore... */ - opt.recursive = 1; - } - if (newloc) - { - xfree (constr); - constr = newloc; - } - /* In case of convert_links: If there was no error, add it to - the list of downloaded URLs. We might need it for - conversion. 
*/ - if (opt.convert_links && filename) - { - if (dt & RETROKF) - { - hash_table_put (dl_file_url_map, - xstrdup (filename), xstrdup (constr)); - hash_table_put (dl_url_file_map, - xstrdup (constr), xstrdup (filename)); - /* If the URL is HTML, note it. */ - if (dt & TEXTHTML) - urls_html = slist_prepend (urls_html, filename); - } - } - /* If there was no error, and the type is text/html, parse - it recursively. */ - if (dt & TEXTHTML) - { - if (dt & RETROKF) - recursive_retrieve (filename, constr); - } - else - DEBUGP (("%s is not text/html so we don't chase.\n", - filename ? filename: "(null)")); - - if (opt.delete_after || (filename && !acceptable (filename))) - /* Either --delete-after was specified, or we loaded this otherwise - rejected (e.g. by -R) HTML file just so we could harvest its - hyperlinks -- in either case, delete the local file. */ - { - DEBUGP (("Removing file due to %s in recursive_retrieve():\n", - opt.delete_after ? "--delete-after" : - "recursive rejection criteria")); - logprintf (LOG_VERBOSE, - (opt.delete_after ? _("Removing %s.\n") - : _("Removing %s since it should be rejected.\n")), - filename); - if (unlink (filename)) - logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno)); - dt &= ~RETROKF; - } - - /* If everything was OK, and links are to be converted, let's - store the local filename. */ - if (opt.convert_links && (dt & RETROKF) && (filename != NULL)) - { - cur_url->convert = CO_CONVERT_TO_RELATIVE; - cur_url->local_name = xstrdup (filename); - } - } + bool descend = false; + char *url, *referer, *file = NULL; + int depth; + bool html_allowed, css_allowed; + bool is_css = false; + bool dash_p_leaf_HTML = false; + + if (opt.quota && total_downloaded_bytes > opt.quota) + break; + if (status == FWRITEERR) + break; + + /* Get the next URL from the queue... */ + + if (!url_dequeue (queue, (struct iri **) &i, + (const char **)&url, (const char **)&referer, + &depth, &html_allowed, &css_allowed)) + break; + + /* ...and download it. Note that this download is in most cases + unconditional, as download_child_p already makes sure a file + doesn't get enqueued twice -- and yet this check is here, and + not in download_child_p. This is so that if you run `wget -r + URL1 URL2', and a random URL is encountered once under URL1 + and again under URL2, but at a different (possibly smaller) + depth, we want the URL's children to be taken into account + the second time. */ + if (dl_url_file_map && hash_table_contains (dl_url_file_map, url)) + { + file = xstrdup (hash_table_get (dl_url_file_map, url)); + + DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n", + url, file)); + + /* this sucks, needs to be combined! */ + if (html_allowed + && downloaded_html_set + && string_set_contains (downloaded_html_set, file)) + { + descend = true; + is_css = false; + } + if (css_allowed + && downloaded_css_set + && string_set_contains (downloaded_css_set, file)) + { + descend = true; + is_css = true; + } + } else - DEBUGP (("%s already in list, so we don't load.\n", constr)); - /* Free filename and constr. */ - FREE_MAYBE (filename); - FREE_MAYBE (constr); - freeurl (u, 1); - /* Increment the pbuf for the appropriate size. 
*/ + { + int dt = 0; + char *redirected = NULL; + + status = retrieve_url (url, &file, &redirected, referer, &dt, + false, i); + + if (html_allowed && file && status == RETROK + && (dt & RETROKF) && (dt & TEXTHTML)) + { + descend = true; + is_css = false; + } + + /* a little different, css_allowed can override content type + lots of web servers serve css with an incorrect content type + */ + if (file && status == RETROK + && (dt & RETROKF) && + ((dt & TEXTCSS) || css_allowed)) + { + descend = true; + is_css = true; + } + + if (redirected) + { + /* We have been redirected, possibly to another host, or + different path, or wherever. Check whether we really + want to follow it. */ + if (descend) + { + if (!descend_redirect_p (redirected, url, depth, + start_url_parsed, blacklist, i)) + descend = false; + else + /* Make sure that the old pre-redirect form gets + blacklisted. */ + string_set_add (blacklist, url); + } + + xfree (url); + url = redirected; + } + } + + if (opt.spider) + { + visited_url (url, referer); + } + + if (descend + && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION) + { + if (opt.page_requisites + && (depth == opt.reclevel || depth == opt.reclevel + 1)) + { + /* When -p is specified, we are allowed to exceed the + maximum depth, but only for the "inline" links, + i.e. those that are needed to display the page. + Originally this could exceed the depth at most by + one, but we allow one more level so that the leaf + pages that contain frames can be loaded + correctly. */ + dash_p_leaf_HTML = true; + } + else + { + /* Either -p wasn't specified or it was and we've + already spent the two extra (pseudo-)levels that it + affords us, so we need to bail out. */ + DEBUGP (("Not descending further; at depth %d, max. %d.\n", + depth, opt.reclevel)); + descend = false; + } + } + + /* If the downloaded document was HTML or CSS, parse it and enqueue the + links it contains. */ + + if (descend) + { + bool meta_disallow_follow = false; + struct urlpos *children + = is_css ? get_urls_css_file (file, url) : + get_urls_html (file, url, &meta_disallow_follow, i); + + if (opt.use_robots && meta_disallow_follow) + { + free_urlpos (children); + children = NULL; + } + + if (children) + { + struct urlpos *child = children; + struct url *url_parsed = url_parse (url, NULL, i); + struct iri *ci; + char *referer_url = url; + bool strip_auth = (url_parsed != NULL + && url_parsed->user != NULL); + assert (url_parsed != NULL); + + /* Strip auth info if present */ + if (strip_auth) + referer_url = url_string (url_parsed, URL_AUTH_HIDE); + + for (; child; child = child->next) + { + if (child->ignore_when_downloading) + continue; + if (dash_p_leaf_HTML && !child->link_inline_p) + continue; + if (download_child_p (child, url_parsed, depth, start_url_parsed, + blacklist, i)) + { + ci = iri_new (); + set_uri_encoding (ci, i->content_encoding, false); + url_enqueue (queue, ci, xstrdup (child->url->url), + xstrdup (referer_url), depth + 1, + child->link_expect_html, + child->link_expect_css); + /* We blacklist the URL we have enqueued, because we + don't want to enqueue (and hence download) the + same URL twice. 
*/ + string_set_add (blacklist, child->url->url); + } + } + + if (strip_auth) + xfree (referer_url); + url_free (url_parsed); + free_urlpos (children); + } + } + + if (file + && (opt.delete_after + || opt.spider /* opt.recursive is implicitely true */ + || !acceptable (file))) + { + /* Either --delete-after was specified, or we loaded this + (otherwise unneeded because of --spider or rejected by -R) + HTML file just to harvest its hyperlinks -- in either case, + delete the local file. */ + DEBUGP (("Removing file due to %s in recursive_retrieve():\n", + opt.delete_after ? "--delete-after" : + (opt.spider ? "--spider" : + "recursive rejection criteria"))); + logprintf (LOG_VERBOSE, + (opt.delete_after || opt.spider + ? _("Removing %s.\n") + : _("Removing %s since it should be rejected.\n")), + file); + if (unlink (file)) + logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno)); + logputs (LOG_VERBOSE, "\n"); + register_delete_file (file); + } + + xfree (url); + xfree_null (referer); + xfree_null (file); + iri_free (i); } - if (opt.convert_links && !opt.delete_after) - /* This is merely the first pass: the links that have been - successfully downloaded are converted. In the second pass, - convert_all_links() will also convert those links that have NOT - been downloaded to their canonical form. */ - convert_links (file, url_list); - /* Free the linked list of URL-s. */ - free_urlpos (url_list); - /* Free the canonical this_url. */ - FREE_MAYBE (canon_this_url); - /* Decrement the recursion depth. */ - --depth; - if (downloaded_exceeds_quota ()) + + /* If anything is left of the queue due to a premature exit, free it + now. */ + { + char *d1, *d2; + int d3; + bool d4, d5; + struct iri *d6; + while (url_dequeue (queue, (struct iri **)&d6, + (const char **)&d1, (const char **)&d2, &d3, &d4, &d5)) + { + iri_free (d6); + xfree (d1); + xfree_null (d2); + } + } + url_queue_delete (queue); + + if (start_url_parsed) + url_free (start_url_parsed); + string_set_free (blacklist); + + if (opt.quota && total_downloaded_bytes > opt.quota) return QUOTEXC; + else if (status == FWRITEERR) + return FWRITEERR; else return RETROK; } - -/* convert_links() is called from recursive_retrieve() after we're - done with an HTML file. This call to convert_links is not complete - because it converts only the downloaded files, and Wget cannot know - which files will be downloaded afterwards. So, if we have file - fileone.html with: - - - - and /c/something.gif was not downloaded because it exceeded the - recursion depth, the reference will *not* be changed. - - However, later we can encounter /c/something.gif from an "upper" - level HTML (let's call it filetwo.html), and it gets downloaded. - - But now we have a problem because /c/something.gif will be - correctly transformed in filetwo.html, but not in fileone.html, - since Wget could not have known that /c/something.gif will be - downloaded in the future. - - This is why Wget must, after the whole retrieval, call - convert_all_links to go once more through the entire list of - retrieved HTMLs, and re-convert them. - - All the downloaded HTMLs are kept in urls_html, and downloaded URLs - in urls_downloaded. From these two lists information is - extracted. */ -void -convert_all_links (void) + +/* Based on the context provided by retrieve_tree, decide whether a + URL is to be descended to. This is only ever called from + retrieve_tree, but is in a separate function for clarity. 
+ + The most expensive checks (such as those for robots) are memoized + by storing these URLs to BLACKLIST. This may or may not help. It + will help if those URLs are encountered many times. */ + +static bool +download_child_p (const struct urlpos *upos, struct url *parent, int depth, + struct url *start_url_parsed, struct hash_table *blacklist, + struct iri *iri) { - slist *html; + struct url *u = upos->url; + const char *url = u->url; + bool u_scheme_like_http; - /* Destructively reverse urls_html to get it in the right order. - recursive_retrieve() used slist_prepend() consistently. */ - urls_html = slist_nreverse (urls_html); + DEBUGP (("Deciding whether to enqueue \"%s\".\n", url)); - for (html = urls_html; html; html = html->next) + if (string_set_contains (blacklist, url)) { - urlpos *urls, *cur_url; - char *url; - - DEBUGP (("Rescanning %s\n", html->string)); - /* Determine the URL of the HTML file. get_urls_html will need - it. */ - url = hash_table_get (dl_file_url_map, html->string); - if (url) - DEBUGP (("It should correspond to %s.\n", url)); - else - DEBUGP (("I cannot find the corresponding URL.\n")); - /* Parse the HTML file... */ - urls = get_urls_html (html->string, url, FALSE, NULL); - /* We don't respect meta_disallow_follow here because, even if - the file is not followed, we might still want to convert the - links that have been followed from other files. */ - for (cur_url = urls; cur_url; cur_url = cur_url->next) - { - char *local_name; - - /* The URL must be in canonical form to be compared. */ - struct urlinfo *u = newurl (); - uerr_t res = parseurl (cur_url->url, u, 0); - if (res != URLOK) - { - freeurl (u, 1); - continue; - } - /* We decide the direction of conversion according to whether - a URL was downloaded. Downloaded URLs will be converted - ABS2REL, whereas non-downloaded will be converted REL2ABS. */ - local_name = hash_table_get (dl_url_file_map, u->url); - if (local_name) - DEBUGP (("%s marked for conversion, local %s\n", - u->url, local_name)); - /* Decide on the conversion direction. */ - if (local_name) - { - /* We've downloaded this URL. Convert it to relative - form. We do this even if the URL already is in - relative form, because our directory structure may - not be identical to that on the server (think `-nd', - `--cut-dirs', etc.) */ - cur_url->convert = CO_CONVERT_TO_RELATIVE; - cur_url->local_name = xstrdup (local_name); - } - else - { - /* We haven't downloaded this URL. If it's not already - complete (including a full host name), convert it to - that form, so it can be reached while browsing this - HTML locally. */ - if (!cur_url->link_complete_p) - cur_url->convert = CO_CONVERT_TO_COMPLETE; - cur_url->local_name = NULL; - } - freeurl (u, 1); - } - /* Convert the links in the file. */ - convert_links (html->string, urls); - /* Free the data. */ - free_urlpos (urls); + if (opt.spider) + { + char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD); + DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url))); + visited_url (url, referrer); + xfree (referrer); + } + DEBUGP (("Already on the black list.\n")); + goto out; } -} - -/* Robots support. */ -/* Construct the robots URL. 
*/ -static struct urlinfo * -robots_url (const char *url, const char *robots_filename) -{ - struct urlinfo *u = newurl (); - uerr_t err; - - err = parseurl (url, u, 0); - assert (err == URLOK && u->proto == URLHTTP); - xfree (u->file); - xfree (u->dir); - xfree (u->url); - u->dir = xstrdup (""); - u->file = xstrdup (robots_filename); - u->url = str_url (u, 0); - return u; -} + /* Several things to check for: + 1. if scheme is not http, and we don't load it + 2. check for relative links (if relative_only is set) + 3. check for domain + 4. check for no-parent + 5. check for excludes && includes + 6. check for suffix + 7. check for same host (if spanhost is unset), with possible + gethostbyname baggage + 8. check for robots.txt -/* Retrieves the robots_filename from the root server directory, if - possible. Returns ROBOTSOK if robots were retrieved OK, and - NOROBOTS if robots could not be retrieved for any reason. */ -static uerr_t -retrieve_robots (const char *url, const char *robots_filename) -{ - int dt; - uerr_t err; - struct urlinfo *u; - - u = robots_url (url, robots_filename); - logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n")); - err = retrieve_url (u->url, NULL, NULL, NULL, &dt); - freeurl (u, 1); - if (err == RETROK) - return ROBOTSOK; - else - return NOROBOTS; -} + Addendum: If the URL is FTP, and it is to be loaded, only the + domain and suffix settings are "stronger". -/* Parse the robots_filename and return the disallowed path components - in a malloc-ed vector of character pointers. + Note that .html files will get loaded regardless of suffix rules + (but that is remedied later with unlink) unless the depth equals + the maximum depth. - It should be fully compliant with the syntax as described in the - file norobots.txt, adopted by the robots mailing list - (robots@webcrawler.com). */ -static char ** -parse_robots (const char *robots_filename) -{ - FILE *fp; - char **entries; - char *line, *cmd, *str, *p; - char *base_version, *version; - int len, num, i; - int wget_matched; /* is the part meant for Wget? */ - - entries = NULL; - - num = 0; - fp = fopen (robots_filename, "rb"); - if (!fp) - return NULL; - - /* Kill version number. */ - if (opt.useragent) + More time- and memory- consuming tests should be put later on + the list. */ + + /* Determine whether URL under consideration has a HTTP-like scheme. */ + u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP); + + /* 1. Schemes other than HTTP are normally not recursed into. */ + if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp)) + { + DEBUGP (("Not following non-HTTP schemes.\n")); + goto out; + } + + /* 2. If it is an absolute link and they are not followed, throw it + out. */ + if (u_scheme_like_http) + if (opt.relative_only && !upos->link_relative_p) { - STRDUP_ALLOCA (base_version, opt.useragent); - STRDUP_ALLOCA (version, opt.useragent); + DEBUGP (("It doesn't really look like a relative link.\n")); + goto out; } - else + + /* 3. If its domain is not to be accepted/looked-up, chuck it + out. */ + if (!accept_domain (u)) + { + DEBUGP (("The domain was not accepted.\n")); + goto out; + } + + /* 4. Check for parent directory. + + If we descended to a different host or changed the scheme, ignore + opt.no_parent. Also ignore it for documents needed to display + the parent page when in -p mode. 
*/ + if (opt.no_parent + && schemes_are_similar_p (u->scheme, start_url_parsed->scheme) + && 0 == strcasecmp (u->host, start_url_parsed->host) + && u->port == start_url_parsed->port + && !(opt.page_requisites && upos->link_inline_p)) + { + if (!subdir_p (start_url_parsed->dir, u->dir)) + { + DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n", + u->dir, start_url_parsed->dir)); + goto out; + } + } + + /* 5. If the file does not match the acceptance list, or is on the + rejection list, chuck it out. The same goes for the directory + exclusion and inclusion lists. */ + if (opt.includes || opt.excludes) + { + if (!accdir (u->dir)) + { + DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir)); + goto out; + } + } + + /* 6. Check for acceptance/rejection rules. We ignore these rules + for directories (no file name to match) and for non-leaf HTMLs, + which can lead to other files that do need to be downloaded. (-p + automatically implies non-leaf because with -p we can, if + necesary, overstep the maximum depth to get the page requisites.) */ + if (u->file[0] != '\0' + && !(has_html_suffix_p (u->file) + /* The exception only applies to non-leaf HTMLs (but -p + always implies non-leaf because we can overstep the + maximum depth to get the requisites): */ + && (/* non-leaf */ + opt.reclevel == INFINITE_RECURSION + /* also non-leaf */ + || depth < opt.reclevel - 1 + /* -p, which implies non-leaf (see above) */ + || opt.page_requisites))) + { + if (!acceptable (u->file)) + { + DEBUGP (("%s (%s) does not match acc/rej rules.\n", + url, u->file)); + goto out; + } + } + + /* 7. */ + if (schemes_are_similar_p (u->scheme, parent->scheme)) + if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host)) { - int len = 10 + strlen (version_string); - base_version = (char *)alloca (len); - sprintf (base_version, "Wget/%s", version_string); - version = (char *)alloca (len); - sprintf (version, "Wget/%s", version_string); + DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n", + u->host, parent->host)); + goto out; } - for (p = version; *p; p++) - *p = TOLOWER (*p); - for (p = base_version; *p && *p != '/'; p++) - *p = TOLOWER (*p); - *p = '\0'; - - /* Setting this to 1 means that Wget considers itself under - restrictions by default, even if the User-Agent field is not - present. However, if it finds the user-agent set to anything - other than Wget, the rest will be ignored (up to the following - User-Agent field). Thus you may have something like: - - Disallow: 1 - Disallow: 2 - User-Agent: stupid-robot - Disallow: 3 - Disallow: 4 - User-Agent: Wget* - Disallow: 5 - Disallow: 6 - User-Agent: * - Disallow: 7 - - In this case the 1, 2, 5, 6 and 7 disallow lines will be - stored. */ - wget_matched = 1; - while ((line = read_whole_line (fp))) + + /* 8. */ + if (opt.use_robots && u_scheme_like_http) { - len = strlen (line); - /* Destroy if present. */ - if (len && line[len - 1] == '\n') - line[--len] = '\0'; - if (len && line[len - 1] == '\r') - line[--len] = '\0'; - /* According to specifications, optional space may be at the - end... */ - DEBUGP (("Line: %s\n", line)); - /* Skip spaces. */ - for (cmd = line; *cmd && ISSPACE (*cmd); cmd++); - if (!*cmd) - { - xfree (line); - DEBUGP (("(chucked out)\n")); - continue; - } - /* Look for ':'. */ - for (str = cmd; *str && *str != ':'; str++); - if (!*str) - { - xfree (line); - DEBUGP (("(chucked out)\n")); - continue; - } - /* Zero-terminate the command. */ - *str++ = '\0'; - /* Look for the string beginning... 
*/ - for (; *str && ISSPACE (*str); str++); - /* Look for comments or trailing spaces and kill them off. */ - for (p = str; *p; p++) - if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0'))) - { - /* We have found either a shell-style comment `+#' or some - trailing spaces. Now rewind to the beginning of the spaces - and place '\0' there. */ - while (p > str && ISSPACE (*p)) - --p; - if (p == str) - *p = '\0'; - else - *(p + 1) = '\0'; - break; - } - if (!strcasecmp (cmd, "User-agent")) - { - int match = 0; - /* Lowercase the agent string. */ - for (p = str; *p; p++) - *p = TOLOWER (*p); - /* If the string is `*', it matches. */ - if (*str == '*' && !*(str + 1)) - match = 1; - else - { - /* If the string contains wildcards, we'll run it through - fnmatch(). */ - if (has_wildcards_p (str)) - { - /* If the string contains '/', compare with the full - version. Else, compare it to base_version. */ - if (strchr (str, '/')) - match = !fnmatch (str, version, 0); - else - match = !fnmatch (str, base_version, 0); - } - else /* Substring search */ - { - if (strstr (version, str)) - match = 1; - else - match = 0; - } - } - /* If Wget is not matched, skip all the entries up to the - next User-agent field. */ - wget_matched = match; - } - else if (!wget_matched) - { - xfree (line); - DEBUGP (("(chucking out since it is not applicable for Wget)\n")); - continue; - } - else if (!strcasecmp (cmd, "Disallow")) - { - /* If "Disallow" is empty, the robot is welcome. */ - if (!*str) - { - free_vec (entries); - entries = (char **)xmalloc (sizeof (char *)); - *entries = NULL; - num = 0; - } - else - { - entries = (char **)xrealloc (entries, (num + 2)* sizeof (char *)); - entries[num] = xstrdup (str); - entries[++num] = NULL; - /* Strip trailing spaces, according to specifications. */ - for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--) - if (ISSPACE (str[i])) - str[i] = '\0'; - } - } - else - { - /* unknown command */ - DEBUGP (("(chucked out)\n")); - } - xfree (line); + struct robot_specs *specs = res_get_specs (u->host, u->port); + if (!specs) + { + char *rfile; + if (res_retrieve_file (url, &rfile, iri)) + { + specs = res_parse_from_file (rfile); + + /* Delete the robots.txt file if we chose to either delete the + files after downloading or we're just running a spider. */ + if (opt.delete_after || opt.spider) + { + logprintf (LOG_VERBOSE, "Removing %s.\n", rfile); + if (unlink (rfile)) + logprintf (LOG_NOTQUIET, "unlink: %s\n", + strerror (errno)); + } + + xfree (rfile); + } + else + { + /* If we cannot get real specs, at least produce + dummy ones so that we can register them and stop + trying to retrieve them. */ + specs = res_parse ("", 0); + } + res_register_specs (u->host, u->port, specs); + } + + /* Now that we have (or don't have) robots.txt specs, we can + check what they say. */ + if (!res_match_path (specs, u->path)) + { + DEBUGP (("Not following %s because robots.txt forbids it.\n", url)); + string_set_add (blacklist, url); + goto out; + } } - fclose (fp); - return entries; + + /* The URL has passed all the tests. It can be placed in the + download queue. */ + DEBUGP (("Decided to load it.\n")); + + return true; + + out: + DEBUGP (("Decided NOT to load it.\n")); + + return false; } -/* May the URL url be loaded according to disallowing rules stored in - forbidden? 
*/ -static int -robots_match (struct urlinfo *u, char **forbidden) +/* This function determines whether we will consider downloading the + children of a URL whose download resulted in a redirection, + possibly to another host, etc. It is needed very rarely, and thus + it is merely a simple-minded wrapper around download_child_p. */ + +static bool +descend_redirect_p (const char *redirected, const char *original, int depth, + struct url *start_url_parsed, struct hash_table *blacklist, + struct iri *iri) { - int l; + struct url *orig_parsed, *new_parsed; + struct urlpos *upos; + bool success; - if (!forbidden) - return 1; - DEBUGP (("Matching %s against: ", u->path)); - for (; *forbidden; forbidden++) - { - DEBUGP (("%s ", *forbidden)); - l = strlen (*forbidden); - /* If dir is forbidden, we may not load the file. */ - if (strncmp (u->path, *forbidden, l) == 0) - { - DEBUGP (("matched.\n")); - return 0; /* Matches, i.e. does not load... */ - } - } - DEBUGP (("not matched.\n")); - return 1; + orig_parsed = url_parse (original, NULL, NULL); + assert (orig_parsed != NULL); + + new_parsed = url_parse (redirected, NULL, NULL); + assert (new_parsed != NULL); + + upos = xnew0 (struct urlpos); + upos->url = new_parsed; + + success = download_child_p (upos, orig_parsed, depth, + start_url_parsed, blacklist, iri); + + url_free (orig_parsed); + url_free (new_parsed); + xfree (upos); + + if (!success) + DEBUGP (("Redirection \"%s\" failed the test.\n", redirected)); + + return success; } + +/* vim:set sts=2 sw=2 cino+={s: */
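
The new retrieve_tree() above replaces the old depth-first recursion with a breadth-first walk over a FIFO URL queue, as outlined in the numbered comment (steps 1-7) preceding it. As a rough, self-contained illustration of that pattern only — not the Wget implementation, whose queue elements also carry referer, IRI, and HTML/CSS state and whose blacklist is a string hash table — a minimal sketch might look like the following. The names fetch_stub(), extract_links_stub(), and the fixed-size "seen" list are hypothetical stand-ins for retrieve_url(), get_urls_html()/get_urls_css_file(), and the blacklist.

```c
/* Minimal sketch of the breadth-first queue pattern used by retrieve_tree().
   NOT the Wget code: fetch_stub() and extract_links_stub() are hypothetical
   placeholders, and the linear "seen" list stands in for the hash-table
   blacklist. */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

struct qel {                    /* one queued URL */
  char *url;
  int depth;
  struct qel *next;
};

struct fifo {                   /* FIFO: enqueue at tail, dequeue at head */
  struct qel *head, *tail;
};

static void
enqueue (struct fifo *q, const char *url, int depth)
{
  struct qel *e = malloc (sizeof *e);
  e->url = strdup (url);
  e->depth = depth;
  e->next = NULL;
  if (q->tail)
    q->tail->next = e;
  else
    q->head = e;
  q->tail = e;
}

static struct qel *
dequeue (struct fifo *q)
{
  struct qel *e = q->head;
  if (!e)
    return NULL;
  q->head = e->next;
  if (!q->head)
    q->tail = NULL;
  return e;
}

/* Toy blacklist so the same URL is never enqueued twice. */
#define MAX_SEEN 1024
static char *seen[MAX_SEEN];
static int nseen;

static bool
seen_before (const char *url)
{
  for (int i = 0; i < nseen; i++)
    if (strcmp (seen[i], url) == 0)
      return true;
  return false;
}

static void
mark_seen (const char *url)
{
  if (nseen < MAX_SEEN)
    seen[nseen++] = strdup (url);
}

/* Hypothetical stand-ins for retrieve_url() and the link extractors. */
static bool
fetch_stub (const char *url)
{
  printf ("GET %s\n", url);
  return true;
}

static int
extract_links_stub (const char *url, char **out, int max)
{
  (void) url; (void) out; (void) max;
  return 0;                     /* a real extractor would fill OUT */
}

/* Breadth-first traversal: steps 1-7 of the retrieve_tree() comment. */
static void
crawl (const char *start_url, int max_depth)
{
  struct fifo q = { NULL, NULL };
  struct qel *e;

  enqueue (&q, start_url, 0);   /* 1. put START_URL in the queue */
  mark_seen (start_url);

  while ((e = dequeue (&q)) != NULL)     /* 2-3. next URL from the queue */
    {
      if (fetch_stub (e->url)            /* 4. download it */
          && e->depth < max_depth)       /* 5. descend only below max depth */
        {
          char *links[64];
          int n = extract_links_stub (e->url, links, 64);
          for (int i = 0; i < n; i++)    /* 6-7. enqueue unseen children */
            if (!seen_before (links[i]))
              {
                mark_seen (links[i]);
                enqueue (&q, links[i], e->depth + 1);
              }
        }
      free (e->url);
      free (e);
    }
}

int
main (void)
{
  crawl ("http://example.com/", 5);
  return 0;
}
```

On top of this skeleton, the real function layers the per-URL referer and IRI state, the -p "leaf HTML" exception that lets page requisites exceed the depth limit, and the download_child_p() filter (scheme, domain, --no-parent, include/exclude and accept/reject lists, same-host check, robots.txt) that is applied before a child is enqueued.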