X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Frecur.c;h=a8fc71af5fd000ef22acc849dbb620f6f274dde2;hp=9ecb7d8efb8bfd8bfe33271e0a1d2a8e7c8b283c;hb=7d2066b2213bd8ee5705dfdf6ed4297e91d694d7;hpb=2fe72be505d2d91fc0bbbd22cc19f3d288813671

diff --git a/src/recur.c b/src/recur.c
index 9ecb7d8e..a8fc71af 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -52,13 +52,13 @@ so, delete this exception statement from your version. */

 /* Functions for maintaining the URL queue. */

 struct queue_element {
-  const char *url;              /* the URL to download */
-  const char *referer;          /* the referring document */
-  int depth;                    /* the depth */
-  bool html_allowed;            /* whether the document is allowed to
-                                   be treated as HTML. */
+  const char *url;              /* the URL to download */
+  const char *referer;          /* the referring document */
+  int depth;                    /* the depth */
+  bool html_allowed;            /* whether the document is allowed to
+                                   be treated as HTML. */

-  struct queue_element *next;   /* next element in queue */
+  struct queue_element *next;   /* next element in queue */
 };

 struct url_queue {
@@ -90,7 +90,7 @@ url_queue_delete (struct url_queue *queue)

 static void
 url_enqueue (struct url_queue *queue,
-             const char *url, const char *referer, int depth, bool html_allowed)
+             const char *url, const char *referer, int depth, bool html_allowed)
 {
   struct queue_element *qel = xnew (struct queue_element);
   qel->url = url;
@@ -119,8 +119,8 @@ url_enqueue (struct url_queue *queue,

 static bool
 url_dequeue (struct url_queue *queue,
-             const char **url, const char **referer, int *depth,
-             bool *html_allowed)
+             const char **url, const char **referer, int *depth,
+             bool *html_allowed)
 {
   struct queue_element *qel = queue->head;

@@ -146,9 +146,9 @@ url_dequeue (struct url_queue *queue,
 }

 static bool download_child_p (const struct urlpos *, struct url *, int,
-                              struct url *, struct hash_table *);
+                              struct url *, struct hash_table *);
 static bool descend_redirect_p (const char *, const char *, int,
-                                struct url *, struct hash_table *);
+                                struct url *, struct hash_table *);


 /* Retrieve a part of the web beginning with START_URL.  This used to
@@ -170,7 +170,7 @@ static bool descend_redirect_p (const char *, const char *, int,

    7. if the URL is not one of those downloaded before, and if it
    satisfies the criteria specified by the various command-line
-   options, add it to the queue. */
+   options, add it to the queue. */

 uerr_t
 retrieve_tree (const char *start_url)
@@ -190,7 +190,7 @@ retrieve_tree (const char *start_url)
   if (!start_url_parsed)
     {
       logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
-                 url_error (up_error_code));
+                 url_error (up_error_code));
       return URLERROR;
     }

@@ -211,176 +211,176 @@ retrieve_tree (const char *start_url)
       bool dash_p_leaf_HTML = false;

       if (opt.quota && total_downloaded_bytes > opt.quota)
-        break;
+        break;
       if (status == FWRITEERR)
-        break;
+        break;

       /* Get the next URL from the queue... */

       if (!url_dequeue (queue,
-                        (const char **)&url, (const char **)&referer,
-                        &depth, &html_allowed))
-        break;
+                        (const char **)&url, (const char **)&referer,
+                        &depth, &html_allowed))
+        break;

       /* ...and download it.  Note that this download is in most cases
-         unconditional, as download_child_p already makes sure a file
-         doesn't get enqueued twice -- and yet this check is here, and
-         not in download_child_p.  This is so that if you run `wget -r
-         URL1 URL2', and a random URL is encountered once under URL1
-         and again under URL2, but at a different (possibly smaller)
-         depth, we want the URL's children to be taken into account
-         the second time. */
+         unconditional, as download_child_p already makes sure a file
+         doesn't get enqueued twice -- and yet this check is here, and
+         not in download_child_p.  This is so that if you run `wget -r
+         URL1 URL2', and a random URL is encountered once under URL1
+         and again under URL2, but at a different (possibly smaller)
+         depth, we want the URL's children to be taken into account
+         the second time. */

       if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
-        {
-          file = xstrdup (hash_table_get (dl_url_file_map, url));
+        {
+          file = xstrdup (hash_table_get (dl_url_file_map, url));

-          DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
-                   url, file));
+          DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
+                   url, file));

-          if (html_allowed
-              && downloaded_html_set
-              && string_set_contains (downloaded_html_set, file))
-            descend = true;
-        }
+          if (html_allowed
+              && downloaded_html_set
+              && string_set_contains (downloaded_html_set, file))
+            descend = true;
+        }
       else
-        {
-          int dt = 0;
-          char *redirected = NULL;
-
-          status = retrieve_url (url, &file, &redirected, referer, &dt, false);
-
-          if (html_allowed && file && status == RETROK
-              && (dt & RETROKF) && (dt & TEXTHTML))
-            descend = true;
-
-          if (redirected)
-            {
-              /* We have been redirected, possibly to another host, or
-                 different path, or wherever.  Check whether we really
-                 want to follow it. */
-              if (descend)
-                {
-                  if (!descend_redirect_p (redirected, url, depth,
-                                           start_url_parsed, blacklist))
-                    descend = false;
-                  else
-                    /* Make sure that the old pre-redirect form gets
-                       blacklisted. */
-                    string_set_add (blacklist, url);
-                }
-
-              xfree (url);
-              url = redirected;
-            }
-        }
+        {
+          int dt = 0;
+          char *redirected = NULL;
+
+          status = retrieve_url (url, &file, &redirected, referer, &dt, false);
+
+          if (html_allowed && file && status == RETROK
+              && (dt & RETROKF) && (dt & TEXTHTML))
+            descend = true;
+
+          if (redirected)
+            {
+              /* We have been redirected, possibly to another host, or
+                 different path, or wherever.  Check whether we really
+                 want to follow it. */
+              if (descend)
+                {
+                  if (!descend_redirect_p (redirected, url, depth,
+                                           start_url_parsed, blacklist))
+                    descend = false;
+                  else
+                    /* Make sure that the old pre-redirect form gets
+                       blacklisted. */
+                    string_set_add (blacklist, url);
+                }
+
+              xfree (url);
+              url = redirected;
+            }
+        }

       if (opt.spider)
-        {
+        {
           visited_url (url, referer);
-        }
+        }

       if (descend
-          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
-        {
-          if (opt.page_requisites
-              && (depth == opt.reclevel || depth == opt.reclevel + 1))
-            {
-              /* When -p is specified, we are allowed to exceed the
-                 maximum depth, but only for the "inline" links,
-                 i.e. those that are needed to display the page.
-                 Originally this could exceed the depth at most by
-                 one, but we allow one more level so that the leaf
-                 pages that contain frames can be loaded
-                 correctly. */
-              dash_p_leaf_HTML = true;
-            }
-          else
-            {
-              /* Either -p wasn't specified or it was and we've
-                 already spent the two extra (pseudo-)levels that it
-                 affords us, so we need to bail out. */
-              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
-                       depth, opt.reclevel));
-              descend = false;
-            }
-        }
+          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
+        {
+          if (opt.page_requisites
+              && (depth == opt.reclevel || depth == opt.reclevel + 1))
+            {
+              /* When -p is specified, we are allowed to exceed the
+                 maximum depth, but only for the "inline" links,
+                 i.e. those that are needed to display the page.
+                 Originally this could exceed the depth at most by
+                 one, but we allow one more level so that the leaf
+                 pages that contain frames can be loaded
+                 correctly. */
+              dash_p_leaf_HTML = true;
+            }
+          else
+            {
+              /* Either -p wasn't specified or it was and we've
+                 already spent the two extra (pseudo-)levels that it
+                 affords us, so we need to bail out. */
+              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
+                       depth, opt.reclevel));
+              descend = false;
+            }
+        }

       /* If the downloaded document was HTML, parse it and enqueue the
-         links it contains. */
+         links it contains. */

       if (descend)
-        {
-          bool meta_disallow_follow = false;
-          struct urlpos *children
-            = get_urls_html (file, url, &meta_disallow_follow);
-
-          if (opt.use_robots && meta_disallow_follow)
-            {
-              free_urlpos (children);
-              children = NULL;
-            }
-
-          if (children)
-            {
-              struct urlpos *child = children;
-              struct url *url_parsed = url_parsed = url_parse (url, NULL);
+        {
+          bool meta_disallow_follow = false;
+          struct urlpos *children
+            = get_urls_html (file, url, &meta_disallow_follow);
+
+          if (opt.use_robots && meta_disallow_follow)
+            {
+              free_urlpos (children);
+              children = NULL;
+            }
+
+          if (children)
+            {
+              struct urlpos *child = children;
+              struct url *url_parsed = url_parsed = url_parse (url, NULL);
               char *referer_url = url;
               bool strip_auth = url_parsed->user;
-              assert (url_parsed != NULL);
+              assert (url_parsed != NULL);

               /* Strip auth info if present */
               if (strip_auth)
                 referer_url = url_string (url_parsed, URL_AUTH_HIDE);

-              for (; child; child = child->next)
-                {
-                  if (child->ignore_when_downloading)
-                    continue;
-                  if (dash_p_leaf_HTML && !child->link_inline_p)
-                    continue;
-                  if (download_child_p (child, url_parsed, depth, start_url_parsed,
-                                        blacklist))
-                    {
-                      url_enqueue (queue, xstrdup (child->url->url),
-                                   xstrdup (referer_url), depth + 1,
-                                   child->link_expect_html);
-                      /* We blacklist the URL we have enqueued, because we
-                         don't want to enqueue (and hence download) the
-                         same URL twice. */
-                      string_set_add (blacklist, child->url->url);
-                    }
-                }
+              for (; child; child = child->next)
+                {
+                  if (child->ignore_when_downloading)
+                    continue;
+                  if (dash_p_leaf_HTML && !child->link_inline_p)
+                    continue;
+                  if (download_child_p (child, url_parsed, depth, start_url_parsed,
+                                        blacklist))
+                    {
+                      url_enqueue (queue, xstrdup (child->url->url),
+                                   xstrdup (referer_url), depth + 1,
+                                   child->link_expect_html);
+                      /* We blacklist the URL we have enqueued, because we
+                         don't want to enqueue (and hence download) the
+                         same URL twice. */
+                      string_set_add (blacklist, child->url->url);
+                    }
+                }

               if (strip_auth)
                 xfree (referer_url);
-              url_free (url_parsed);
-              free_urlpos (children);
-            }
-        }
+              url_free (url_parsed);
+              free_urlpos (children);
+            }
+        }

       if (file
           && (opt.delete_after
               || opt.spider /* opt.recursive is implicitly true */
               || !acceptable (file)))
-        {
-          /* Either --delete-after was specified, or we loaded this
-             (otherwise unneeded because of --spider or rejected by -R)
-             HTML file just to harvest its hyperlinks -- in either case,
-             delete the local file. */
-          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
-                   opt.delete_after ? "--delete-after" :
-                   (opt.spider ? "--spider" :
-                    "recursive rejection criteria")));
-          logprintf (LOG_VERBOSE,
-                     (opt.delete_after || opt.spider
-                      ? _("Removing %s.\n")
-                      : _("Removing %s since it should be rejected.\n")),
-                     file);
-          if (unlink (file))
-            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
-          logputs (LOG_VERBOSE, "\n");
-          register_delete_file (file);
-        }
+        {
+          /* Either --delete-after was specified, or we loaded this
+             (otherwise unneeded because of --spider or rejected by -R)
+             HTML file just to harvest its hyperlinks -- in either case,
+             delete the local file. */
+          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
+                   opt.delete_after ? "--delete-after" :
+                   (opt.spider ? "--spider" :
+                    "recursive rejection criteria")));
+          logprintf (LOG_VERBOSE,
+                     (opt.delete_after || opt.spider
+                      ? _("Removing %s.\n")
+                      : _("Removing %s since it should be rejected.\n")),
+                     file);
+          if (unlink (file))
+            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
+          logputs (LOG_VERBOSE, "\n");
+          register_delete_file (file);
+        }

       xfree (url);
       xfree_null (referer);
@@ -394,10 +394,10 @@ retrieve_tree (const char *start_url)
       int d3;
       bool d4;
       while (url_dequeue (queue,
-                          (const char **)&d1, (const char **)&d2, &d3, &d4))
+                          (const char **)&d1, (const char **)&d2, &d3, &d4))
         {
-          xfree (d1);
-          xfree_null (d2);
+          xfree (d1);
+          xfree_null (d2);
         }
     }
   url_queue_delete (queue);
@@ -424,7 +424,7 @@ retrieve_tree (const char *start_url)

 static bool
 download_child_p (const struct urlpos *upos, struct url *parent, int depth,
-                  struct url *start_url_parsed, struct hash_table *blacklist)
+                  struct url *start_url_parsed, struct hash_table *blacklist)
 {
   struct url *u = upos->url;
   const char *url = u->url;
@@ -435,12 +435,12 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
   if (string_set_contains (blacklist, url))
     {
       if (opt.spider)
-        {
+        {
          char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
          DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url));
          visited_url (url, referrer);
-          xfree (referrer);
-        }
+          xfree (referrer);
+        }
       DEBUGP (("Already on the black list.\n"));
       goto out;
     }
@@ -481,8 +481,8 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
   if (u_scheme_like_http)
     if (opt.relative_only && !upos->link_relative_p)
       {
-        DEBUGP (("It doesn't really look like a relative link.\n"));
-        goto out;
+        DEBUGP (("It doesn't really look like a relative link.\n"));
+        goto out;
       }

   /* 3. If its domain is not to be accepted/looked-up, chuck it
@@ -505,11 +505,11 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
       && !(opt.page_requisites && upos->link_inline_p))
     {
       if (!subdir_p (start_url_parsed->dir, u->dir))
-        {
-          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
-                   u->dir, start_url_parsed->dir));
-          goto out;
-        }
+        {
+          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
+                   u->dir, start_url_parsed->dir));
+          goto out;
+        }
     }

   /* 5. If the file does not match the acceptance list, or is on the
@@ -518,10 +518,10 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
   if (opt.includes || opt.excludes)
     {
       if (!accdir (u->dir))
-        {
-          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
-          goto out;
-        }
+        {
+          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
+          goto out;
+        }
     }

   /* 6. Check for acceptance/rejection rules.  We ignore these rules
@@ -531,31 +531,31 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
      necessary, overstep the maximum depth to get the page
      requisites.)
*/ if (u->file[0] != '\0' && !(has_html_suffix_p (u->file) - /* The exception only applies to non-leaf HTMLs (but -p - always implies non-leaf because we can overstep the - maximum depth to get the requisites): */ - && (/* non-leaf */ - opt.reclevel == INFINITE_RECURSION - /* also non-leaf */ - || depth < opt.reclevel - 1 - /* -p, which implies non-leaf (see above) */ - || opt.page_requisites))) + /* The exception only applies to non-leaf HTMLs (but -p + always implies non-leaf because we can overstep the + maximum depth to get the requisites): */ + && (/* non-leaf */ + opt.reclevel == INFINITE_RECURSION + /* also non-leaf */ + || depth < opt.reclevel - 1 + /* -p, which implies non-leaf (see above) */ + || opt.page_requisites))) { if (!acceptable (u->file)) - { - DEBUGP (("%s (%s) does not match acc/rej rules.\n", - url, u->file)); - goto out; - } + { + DEBUGP (("%s (%s) does not match acc/rej rules.\n", + url, u->file)); + goto out; + } } /* 7. */ if (schemes_are_similar_p (u->scheme, parent->scheme)) if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host)) { - DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n", - u->host, parent->host)); - goto out; + DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n", + u->host, parent->host)); + goto out; } /* 8. */ @@ -563,31 +563,31 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, { struct robot_specs *specs = res_get_specs (u->host, u->port); if (!specs) - { - char *rfile; - if (res_retrieve_file (url, &rfile)) - { - specs = res_parse_from_file (rfile); - xfree (rfile); - } - else - { - /* If we cannot get real specs, at least produce - dummy ones so that we can register them and stop - trying to retrieve them. */ - specs = res_parse ("", 0); - } - res_register_specs (u->host, u->port, specs); - } + { + char *rfile; + if (res_retrieve_file (url, &rfile)) + { + specs = res_parse_from_file (rfile); + xfree (rfile); + } + else + { + /* If we cannot get real specs, at least produce + dummy ones so that we can register them and stop + trying to retrieve them. */ + specs = res_parse ("", 0); + } + res_register_specs (u->host, u->port, specs); + } /* Now that we have (or don't have) robots.txt specs, we can - check what they say. */ + check what they say. */ if (!res_match_path (specs, u->path)) - { - DEBUGP (("Not following %s because robots.txt forbids it.\n", url)); - string_set_add (blacklist, url); - goto out; - } + { + DEBUGP (("Not following %s because robots.txt forbids it.\n", url)); + string_set_add (blacklist, url); + goto out; + } } /* The URL has passed all the tests. It can be placed in the @@ -609,7 +609,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, static bool descend_redirect_p (const char *redirected, const char *original, int depth, - struct url *start_url_parsed, struct hash_table *blacklist) + struct url *start_url_parsed, struct hash_table *blacklist) { struct url *orig_parsed, *new_parsed; struct urlpos *upos; @@ -625,7 +625,7 @@ descend_redirect_p (const char *redirected, const char *original, int depth, upos->url = new_parsed; success = download_child_p (upos, orig_parsed, depth, - start_url_parsed, blacklist); + start_url_parsed, blacklist); url_free (orig_parsed); url_free (new_parsed);
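
Nearly every hunk in this diff re-indents recur.c without changing behavior: the paired -/+ lines differ only in whitespace. The logic being reflowed is still worth restating. retrieve_tree is a breadth-first traversal: url_dequeue pulls the next URL off a FIFO, the document is fetched, and every link that survives download_child_p is re-queued with url_enqueue at depth + 1 and added to the blacklist at enqueue time, so no URL is ever queued twice. Below is a minimal self-contained sketch of that shape, not wget's code: fetch_links and the linear seen list are hypothetical stand-ins for retrieve_url/get_urls_html and wget's hash_table.

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

struct elem { char *url; int depth; struct elem *next; };
struct fifo { struct elem *head, *tail; };
struct seen { char *url; struct seen *next; };

/* Hypothetical stand-in for retrieve_url + get_urls_html: download
   URL and return a NULL-terminated vector of links found in it.  */
static char **fetch_links (const char *url) { (void) url; return NULL; }

static bool
seen_contains (const struct seen *s, const char *url)
{
  for (; s; s = s->next)
    if (strcmp (s->url, url) == 0)
      return true;
  return false;
}

static void
enqueue (struct fifo *q, struct seen **seen, const char *url, int depth)
{
  struct elem *e = malloc (sizeof *e);
  e->url = strdup (url);
  e->depth = depth;
  e->next = NULL;
  if (q->tail) q->tail->next = e; else q->head = e;
  q->tail = e;

  /* Blacklist at enqueue time, mirroring string_set_add in the loop
     above, so a URL reachable over two paths is queued only once.  */
  struct seen *n = malloc (sizeof *n);
  n->url = strdup (url);
  n->next = *seen;
  *seen = n;
}

static void
crawl (const char *start, int reclevel)
{
  struct fifo q = { NULL, NULL };
  struct seen *blacklist = NULL;

  enqueue (&q, &blacklist, start, 0);
  while (q.head)
    {
      struct elem *e = q.head;          /* dequeue, like url_dequeue */
      q.head = e->next;
      if (!q.head)
        q.tail = NULL;

      char **links = fetch_links (e->url);      /* download + parse */
      if (links && e->depth < reclevel)
        for (char **l = links; *l; l++)
          if (!seen_contains (blacklist, *l))   /* download_child_p, roughly */
            enqueue (&q, &blacklist, *l, e->depth + 1);

      free (e->url);
      free (e);
    }
}

int main (void) { crawl ("http://example.com/", 5); return 0; }

The separate dl_url_file_map check at dequeue time is what the long comment inside the loop explains: a URL rediscovered under a second start URL at a smaller depth is not downloaded again, but its children are reconsidered.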
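A detail that is easy to miss among the context lines: when the page's own URL carries credentials (url_parsed->user is set), the loop rebuilds the referer with url_string (url_parsed, URL_AUTH_HIDE) before handing it to url_enqueue, so user:password@ never propagates into a Referer header; download_child_p does the same for its spider-mode logging with URL_AUTH_HIDE_PASSWD. A rough standalone equivalent of the stripping step follows; strip_userinfo is a hypothetical helper written for illustration, not wget's API.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Return a copy of URL with any userinfo ("user:pass@") removed from
   the authority part -- roughly what url_string (u, URL_AUTH_HIDE)
   yields when u->user is set.  */
static char *
strip_userinfo (const char *url)
{
  const char *scheme_end = strstr (url, "://");
  if (scheme_end)
    {
      const char *auth = scheme_end + 3;
      size_t alen = strcspn (auth, "/?#");   /* authority ends here */
      const char *at = memchr (auth, '@', alen);
      if (at)
        {
          size_t keep = (size_t) (auth - url);                  /* "scheme://" */
          char *out = malloc (strlen (url) - (size_t) (at + 1 - auth) + 1);
          memcpy (out, url, keep);                /* copy scheme + "://" */
          strcpy (out + keep, at + 1);            /* skip past userinfo */
          return out;
        }
    }
  return strdup (url);
}

int
main (void)
{
  char *clean = strip_userinfo ("http://user:secret@example.com/a/b.html");
  puts (clean);   /* prints http://example.com/a/b.html */
  free (clean);
  return 0;
}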
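The robots.txt handling at rule 8 (the hunk at line 563) uses a cache-the-failure idiom: specs are looked up once per host and port with res_get_specs, and when robots.txt cannot be retrieved, dummy specs from res_parse ("", 0) are registered anyway so the fetch is never retried. The same shape in miniature, with fetch_robots and a linked-list cache as hypothetical stand-ins for wget's res_* API and hash table (port omitted for brevity):

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

struct specs { bool deny_all; };     /* stand-in for struct robot_specs */
struct cache_ent { char *host; struct specs *s; struct cache_ent *next; };
static struct cache_ent *cache;

/* Hypothetical stand-in for res_retrieve_file + res_parse_from_file:
   returns NULL when robots.txt cannot be fetched or parsed.  */
static struct specs *fetch_robots (const char *host) { (void) host; return NULL; }

static struct specs *
get_specs (const char *host)
{
  for (struct cache_ent *e = cache; e; e = e->next)
    if (strcmp (e->host, host) == 0)
      return e->s;                   /* cached, even if it is a dummy */

  struct specs *s = fetch_robots (host);
  if (!s)
    /* Fetch failed: register permissive dummy specs, like
       res_parse ("", 0), so the download is never retried.  */
    s = calloc (1, sizeof *s);

  struct cache_ent *e = malloc (sizeof *e);
  e->host = strdup (host);
  e->s = s;
  e->next = cache;
  cache = e;
  return s;
}

static bool
robots_allowed (const char *host, const char *path)
{
  (void) path;                  /* res_match_path would inspect this */
  return !get_specs (host)->deny_all;
}

int main (void) { return robots_allowed ("example.com", "/index.html") ? 0 : 1; }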