X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Frecur.c;h=53cd39cdc618311ecba790588478d1e3b4963fb7;hp=007354b76770af936ed3b3588e46a4d2e1d856d8;hb=4d7c5e087b2bc82c9f503dff003916d1047903ce;hpb=097923f7b19a1f40313ecce0e743e42df5c75673

diff --git a/src/recur.c b/src/recur.c
index 007354b7..53cd39cd 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -1,12 +1,12 @@
 /* Handling of recursive HTTP retrieving.
-   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1996-2006 Free Software Foundation, Inc.
 
 This file is part of GNU Wget.
 
 GNU Wget is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
 
 GNU Wget is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -14,8 +14,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+along with Wget.  If not, see <http://www.gnu.org/licenses/>.
 
 In addition, as a special exception, the Free Software Foundation
 gives permission to link the code of its release of Wget with the
@@ -31,17 +30,12 @@ so, delete this exception statement from your version.  */
 
 #include <stdio.h>
 #include <stdlib.h>
-#ifdef HAVE_STRING_H
-# include <string.h>
-#else
-# include <strings.h>
-#endif /* HAVE_STRING_H */
+#include <string.h>
 #ifdef HAVE_UNISTD_H
 # include <unistd.h>
 #endif /* HAVE_UNISTD_H */
 #include <errno.h>
 #include <assert.h>
-#include <sys/types.h>
 
 #include "wget.h"
 #include "url.h"
@@ -53,23 +47,18 @@ so, delete this exception statement from your version.  */
 #include "hash.h"
 #include "res.h"
 #include "convert.h"
-
-#ifndef errno
-extern int errno;
-#endif
-
-extern char *version_string;
-
-extern struct hash_table *dl_url_file_map;
-extern struct hash_table *downloaded_html_set;
+#include "spider.h"
 
 /* Functions for maintaining the URL queue.  */
 
 struct queue_element {
-  const char *url;
-  const char *referer;
-  int depth;
-  struct queue_element *next;
+  const char *url;              /* the URL to download */
+  const char *referer;          /* the referring document */
+  int depth;                    /* the depth */
+  bool html_allowed;            /* whether the document is allowed to
+                                   be treated as HTML. */
+
+  struct queue_element *next;   /* next element in queue */
 };
 
 struct url_queue {
@@ -83,8 +72,7 @@ struct url_queue {
 static struct url_queue *
 url_queue_new (void)
 {
-  struct url_queue *queue = xmalloc (sizeof (*queue));
-  memset (queue, '\0', sizeof (*queue));
+  struct url_queue *queue = xnew0 (struct url_queue);
   return queue;
 }
 
@@ -102,12 +90,13 @@ url_queue_delete (struct url_queue *queue)
 
 static void
 url_enqueue (struct url_queue *queue,
-             const char *url, const char *referer, int depth)
+             const char *url, const char *referer, int depth, bool html_allowed)
 {
-  struct queue_element *qel = xmalloc (sizeof (*qel));
+  struct queue_element *qel = xnew (struct queue_element);
   qel->url = url;
   qel->referer = referer;
   qel->depth = depth;
+  qel->html_allowed = html_allowed;
   qel->next = NULL;
 
   ++queue->count;
@@ -125,17 +114,18 @@ url_enqueue (struct url_queue *queue,
     queue->head = queue->tail;
 }
 
-/* Take a URL out of the queue.  Return 1 if this operation succeeded,
-   or 0 if the queue is empty.  */
+/* Take a URL out of the queue.  Return true if this operation
+   succeeded, or false if the queue is empty.  */
 
-static int
+static bool
 url_dequeue (struct url_queue *queue,
-             const char **url, const char **referer, int *depth)
+             const char **url, const char **referer, int *depth,
+             bool *html_allowed)
 {
   struct queue_element *qel = queue->head;
 
   if (!qel)
-    return 0;
+    return false;
 
   queue->head = queue->head->next;
   if (!queue->head)
@@ -144,6 +134,7 @@ url_dequeue (struct url_queue *queue,
   *url = qel->url;
   *referer = qel->referer;
   *depth = qel->depth;
+  *html_allowed = qel->html_allowed;
 
   --queue->count;
 
@@ -151,13 +142,13 @@ url_dequeue (struct url_queue *queue,
   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 
   xfree (qel);
-  return 1;
+  return true;
 }
 
-static int download_child_p PARAMS ((const struct urlpos *, struct url *, int,
-                                     struct url *, struct hash_table *));
-static int descend_redirect_p PARAMS ((const char *, const char *, int,
-                                       struct url *, struct hash_table *));
+static bool download_child_p (const struct urlpos *, struct url *, int,
+                              struct url *, struct hash_table *);
+static bool descend_redirect_p (const char *, const char *, int,
+                                struct url *, struct hash_table *);
 
 
 /* Retrieve a part of the web beginning with START_URL.  This used to
@@ -208,17 +199,18 @@ retrieve_tree (const char *start_url)
 
   /* Enqueue the starting URL.  Use start_url_parsed->url rather than
      just URL so we enqueue the canonical form of the URL.  */
-  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0);
+  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
   string_set_add (blacklist, start_url_parsed->url);
 
   while (1)
     {
-      int descend = 0;
+      bool descend = false;
      char *url, *referer, *file = NULL;
      int depth;
-      boolean dash_p_leaf_HTML = FALSE;
+      bool html_allowed;
+      bool dash_p_leaf_HTML = false;
 
-      if (downloaded_exceeds_quota ())
+      if (opt.quota && total_downloaded_bytes > opt.quota)
        break;
      if (status == FWRITEERR)
        break;
@@ -227,7 +219,7 @@ retrieve_tree (const char *start_url)
 
      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
-                        &depth))
+                        &depth, &html_allowed))
        break;
 
      /* ...and download it.  Note that this download is in most cases
@@ -245,23 +237,21 @@ retrieve_tree (const char *start_url)
          DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
                   url, file));
 
-          if (downloaded_html_set
+          if (html_allowed
+              && downloaded_html_set
              && string_set_contains (downloaded_html_set, file))
-            descend = 1;
+            descend = true;
        }
      else
        {
          int dt = 0;
          char *redirected = NULL;
 
-          int oldrec = opt.recursive;
-          opt.recursive = 0;
-          status = retrieve_url (url, &file, &redirected, referer, &dt);
-          opt.recursive = oldrec;
+          status = retrieve_url (url, &file, &redirected, referer, &dt, false);
 
-          if (file && status == RETROK
+          if (html_allowed && file && status == RETROK
              && (dt & RETROKF) && (dt & TEXTHTML))
-            descend = 1;
+            descend = true;
 
          if (redirected)
            {
@@ -272,7 +262,7 @@ retrieve_tree (const char *start_url)
                {
                  if (!descend_redirect_p (redirected, url, depth,
                                           start_url_parsed, blacklist))
-                    descend = 0;
+                    descend = false;
                  else
                    /* Make sure that the old pre-redirect form gets
                       blacklisted. */
@@ -284,6 +274,11 @@ retrieve_tree (const char *start_url)
            }
        }
 
+      if (opt.spider)
+        {
+          visited_url (url, referer);
+        }
+
      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
@@ -297,7 +292,7 @@ retrieve_tree (const char *start_url)
                 one, but we allow one more level so that the leaf
                 pages that contain frames can be loaded
                 correctly.  */
-              dash_p_leaf_HTML = TRUE;
+              dash_p_leaf_HTML = true;
            }
          else
            {
@@ -306,7 +301,7 @@ retrieve_tree (const char *start_url)
                 affords us, so we need to bail out. */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
-              descend = 0;
+              descend = false;
            }
        }
 
@@ -315,7 +310,7 @@ retrieve_tree (const char *start_url)
 
      if (descend)
        {
-          int meta_disallow_follow = 0;
+          bool meta_disallow_follow = false;
          struct urlpos *children
            = get_urls_html (file, url, &meta_disallow_follow);
 
@@ -341,7 +336,8 @@ retrieve_tree (const char *start_url)
                                        blacklist))
                {
                  url_enqueue (queue, xstrdup (child->url->url),
-                               xstrdup (url), depth + 1);
+                               xstrdup (url), depth + 1,
+                               child->link_expect_html);
                  /* We blacklist the URL we have enqueued, because we
                     don't want to enqueue (and hence download) the same
                     URL twice.  */
@@ -354,28 +350,33 @@ retrieve_tree (const char *start_url)
            }
        }
 
-      if (opt.delete_after || (file && !acceptable (file)))
+      if (file
+          && (opt.delete_after
+              || opt.spider /* opt.recursive is implicitely true */
+              || !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
-             otherwise rejected (e.g. by -R) HTML file just so we
-             could harvest its hyperlinks -- in either case, delete
-             the local file. */
+             (otherwise unneeded because of --spider or rejected by -R)
+             HTML file just to harvest its hyperlinks -- in either case,
+             delete the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
-                   "recursive rejection criteria"));
+                   (opt.spider ? "--spider" :
+                    "recursive rejection criteria")));
          logprintf (LOG_VERBOSE,
-                     (opt.delete_after
+                     (opt.delete_after || opt.spider
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
+          logputs (LOG_VERBOSE, "\n");
          register_delete_file (file);
        }
 
      xfree (url);
-      FREE_MAYBE (referer);
-      FREE_MAYBE (file);
+      xfree_null (referer);
+      xfree_null (file);
    }
 
  /* If anything is left of the queue due to a premature exit, free it
@@ -383,10 +384,12 @@ retrieve_tree (const char *start_url)
  {
    char *d1, *d2;
    int d3;
-    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
+    bool d4;
+    while (url_dequeue (queue,
+                        (const char **)&d1, (const char **)&d2, &d3, &d4))
      {
        xfree (d1);
-        FREE_MAYBE (d2);
+        xfree_null (d2);
      }
  }
  url_queue_delete (queue);
@@ -395,7 +398,7 @@ retrieve_tree (const char *start_url)
  url_free (start_url_parsed);
  string_set_free (blacklist);
 
-  if (downloaded_exceeds_quota ())
+  if (opt.quota && total_downloaded_bytes > opt.quota)
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
@@ -411,18 +414,25 @@ retrieve_tree (const char *start_url)
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */
 
-static int
+static bool
 download_child_p (const struct urlpos *upos, struct url *parent, int depth,
                   struct url *start_url_parsed, struct hash_table *blacklist)
 {
  struct url *u = upos->url;
  const char *url = u->url;
-  int u_scheme_like_http;
+  bool u_scheme_like_http;
 
  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
 
  if (string_set_contains (blacklist, url))
    {
+      if (opt.spider)
+        {
+          char *referrer = url_string (parent, true);
+          DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url));
+          visited_url (url, referrer);
+          xfree (referrer);
+        }
      DEBUGP (("Already on the black list.\n"));
      goto out;
    }
@@ -486,7 +496,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
      && u->port == start_url_parsed->port
      && !(opt.page_requisites && upos->link_inline_p))
    {
-      if (!frontcmp (start_url_parsed->dir, u->dir))
+      if (!subdir_p (start_url_parsed->dir, u->dir))
        {
          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                   u->dir, start_url_parsed->dir));
@@ -499,7 +509,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
     exclusion and inclusion lists.  */
  if (opt.includes || opt.excludes)
    {
-      if (!accdir (u->dir, ALLABS))
+      if (!accdir (u->dir))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
@@ -507,13 +517,21 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
    }
 
  /* 6. Check for acceptance/rejection rules.  We ignore these rules
-     for directories (no file name to match) and for HTML documents,
-     which might lead to other files that do need to be downloaded.
-     That is, unless we've exhausted the recursion depth anyway.  */
+     for directories (no file name to match) and for non-leaf HTMLs,
+     which can lead to other files that do need to be downloaded.  (-p
+     automatically implies non-leaf because with -p we can, if
+     necesary, overstep the maximum depth to get the page requisites.)  */
  if (u->file[0] != '\0'
      && !(has_html_suffix_p (u->file)
-           && depth != INFINITE_RECURSION
-           && depth < opt.reclevel - 1))
+           /* The exception only applies to non-leaf HTMLs (but -p
+              always implies non-leaf because we can overstep the
+              maximum depth to get the requisites): */
+           && (/* non-leaf */
+               opt.reclevel == INFINITE_RECURSION
+               /* also non-leaf */
+               || depth < opt.reclevel - 1
+               /* -p, which implies non-leaf (see above) */
+               || opt.page_requisites)))
    {
      if (!acceptable (u->file))
        {
@@ -568,12 +586,12 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
     download queue. */
  DEBUGP (("Decided to load it.\n"));
 
-  return 1;
+  return true;
 
 out:
  DEBUGP (("Decided NOT to load it.\n"));
 
-  return 0;
+  return false;
}
 
/* This function determines whether we will consider downloading the
@@ -581,13 +599,13 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
   possibly to another host, etc.  It is needed very rarely, and thus
   it is merely a simple-minded wrapper around download_child_p.  */
 
-static int
+static bool
 descend_redirect_p (const char *redirected, const char *original, int depth,
                     struct url *start_url_parsed, struct hash_table *blacklist)
 {
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
-  int success;
+  bool success;
 
  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);
@@ -595,8 +613,7 @@ descend_redirect_p (const char *redirected, const char *original, int depth,
  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);
 
-  upos = xmalloc (sizeof (struct urlpos));
-  memset (upos, 0, sizeof (*upos));
+  upos = xnew0 (struct urlpos);
  upos->url = new_parsed;
 
  success = download_child_p (upos, orig_parsed, depth,