+2006-08-24 Mauro Tortonesi <mauro@ferrara.linux.it>
+
+ * Makefile.in: Added spider.c to the list of files to compile and
+ spider.h to the list of header files. Updated copyright information.
+
+ * http.c: Major changes to recursive spider mode. Now, for every
+ resource we are supposed to check, we send a HEAD request to find out
+ whether it exists. If the resource is an HTML file, we retrieve it and
+ parse it to discover links to other resources (a sample run appears
+ after this entry).
+
+ * recur.c: Ditto.
+
+ * res.c (res_retrieve_file): Disable opt.timestamping and opt.spider
+ when retrieving robots.txt. Updated copyright information.
+
+ * convert.c: Moved code tracking broken links to spider.c.
+
+ * convert.h: Ditto.
+
+ * spider.c: Created new file to keep track of visited URLs in spider
+ mode.
+
+ * spider.h: Ditto.
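+
+ As a rough illustration (the URL is a placeholder and the output is
+ abbreviated), a recursive spider run now proceeds like this:
+
+     $ wget --spider -r http://example.com/
+     Spider mode enabled. Check if remote file exists.
+     ...
+     Remote file does not exist -- broken link!!!
+     ...
+     Found 1 broken link.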
+
2006-08-21 Mauro Tortonesi <mauro@ferrara.linux.it>

* http.c: Fixed timestamping-related bug.
# Makefile for `wget' utility
-# Copyright (C) 1995-2005 Free Software Foundation, Inc.
+# Copyright (C) 1995-2006 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
ftp.o ftp-basic.o ftp-ls.o $(OPIE_OBJ) $(GETOPT_OBJ) hash.o \
host.o html-parse.o html-url.o http.o $(NTLM_OBJ) init.o \
log.o main.o $(MD5_OBJ) netrc.o progress.o ptimer.o recur.o \
- res.o retr.o safe-ctype.o snprintf.o $(SSL_OBJ) url.o \
- utils.o version.o xmalloc.o
+ res.o retr.o safe-ctype.o snprintf.o spider.o $(SSL_OBJ) \
+ url.o utils.o version.o xmalloc.o
.SUFFIXES:
.SUFFIXES: .c .o
# time, and it's a lot safer than attempting to get all the
# dependencies right.
-$(OBJ): config-post.h config.h connect.h convert.h cookies.h ftp.h \
- gen-md5.h getopt.h gnu-md5.h hash.h host.h html-parse.h \
- http-ntlm.h init.h log.h mswindows.h netrc.h options.h \
- progress.h ptimer.h recur.h res.h retr.h safe-ctype.h ssl.h \
- sysdep.h url.h utils.h wget.h xmalloc.h
+$(OBJ): config-post.h config.h connect.h convert.h cookies.h ftp.h \
+ gen-md5.h getopt.h gnu-md5.h hash.h host.h html-parse.h \
+ http-ntlm.h init.h log.h mswindows.h netrc.h options.h \
+ progress.h ptimer.h recur.h res.h retr.h safe-ctype.h \
+ spider.h ssl.h sysdep.h url.h utils.h wget.h xmalloc.h
#
# Dependencies for installing
conversion after Wget is done. */
struct hash_table *downloaded_html_set;
-static struct hash_table *nonexisting_urls_hash;
-
static void convert_links (const char *, struct urlpos *);
/* This function is called when the retrieval is done to convert the
}
static void downloaded_files_free (void);
-static void nonexisting_urls_free (void);
/* Cleanup the data structures associated with this file. */
if (downloaded_html_set)
string_set_free (downloaded_html_set);
downloaded_files_free ();
- nonexisting_urls_free ();
if (converted_files)
string_set_free (converted_files);
}
downloaded_files_hash = NULL;
}
}
-\f
-/* Remembers broken links. */
-
-struct broken_urls_list
-{
- char *url;
- struct broken_urls_list *next;
-};
-
-static bool
-in_list (const struct broken_urls_list *list, const char *url)
-{
- const struct broken_urls_list *ptr;
-
- for (ptr = list; ptr; ptr = ptr->next)
- {
- /* str[case]cmp is inadequate for URL comparison */
- if (are_urls_equal (url, ptr->url) == 0) return true;
- }
-
- return false;
-}
-
-void
-nonexisting_url (const char *url, const char *referrer)
-{
- struct broken_urls_list *list;
-
- /* Ignore robots.txt URLs */
- if (is_robots_txt_url (url))
- return;
-
- if (!nonexisting_urls_hash)
- nonexisting_urls_hash = make_string_hash_table (0);
-
- list = hash_table_get (nonexisting_urls_hash, url);
- if (!list)
- {
- list = (struct broken_urls_list *) xnew0 (struct broken_urls_list);
- list->url = referrer ? xstrdup (referrer) : NULL;
- hash_table_put (nonexisting_urls_hash, xstrdup (url), list);
- }
- else if (list && !in_list (list, referrer))
- {
- /* Append referrer at the end of the list */
- struct broken_urls_list *newnode;
-
- while (list->next) list = list->next;
-
- newnode = xnew0 (struct broken_urls_list);
- newnode->url = xstrdup (referrer);
- list->next = newnode;
- }
-}
-
-static void
-nonexisting_urls_free (void)
-{
- if (nonexisting_urls_hash)
- {
- hash_table_iterator iter;
- for (hash_table_iterate (nonexisting_urls_hash, &iter);
- hash_table_iter_next (&iter);
- )
- {
- xfree (iter.key);
- xfree (iter.value);
- }
- hash_table_destroy (nonexisting_urls_hash);
- nonexisting_urls_hash = NULL;
- }
-}
-
-void
-print_broken_links (void)
-{
- hash_table_iterator iter;
- int num_elems;
-
- if (!nonexisting_urls_hash)
- {
- logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
- return;
- }
-
- num_elems = hash_table_count (nonexisting_urls_hash);
- assert (num_elems > 0);
-
- if (num_elems > 1)
- {
- logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
- num_elems);
- }
- else
- {
- logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
- }
-
- for (hash_table_iterate (nonexisting_urls_hash, &iter);
- hash_table_iter_next (&iter);
- )
- {
- struct broken_urls_list *list;
-
- logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key);
-
- for (list = (struct broken_urls_list *) iter.value;
- list;
- list = list->next)
- {
- logprintf (LOG_NOTQUIET, _(" %s\n"), list->url);
- }
- }
- logputs (LOG_NOTQUIET, "\n");
-}
-
\f
/* The function returns the pointer to the malloc-ed quoted version of
string s. It will recognize and quote numeric and special graphic
char *html_quote_string (const char *);
-void nonexisting_url (const char *, const char *);
-void print_broken_links (void);
-
#endif /* CONVERT_H */
/* Get the current time string. */
tms = time_str (time (NULL));
+ if (opt.spider && !got_head)
+ logprintf (LOG_VERBOSE, _("\
+Spider mode enabled. Check if remote file exists.\n"));
+
/* Print fetch message, if opt.verbose. */
if (opt.verbose)
{
/* Default document type is empty. However, if spider mode is
on or time-stamping is employed, the HEAD_ONLY command is
encoded within *dt. */
- if ((opt.spider && !opt.recursive)
- || (opt.timestamping && !got_head)
+ if (((opt.spider || opt.timestamping) && !got_head)
|| (opt.always_rest && !got_name))
*dt |= HEAD_ONLY;
else
hurl = url_string (u, true);
logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
}
- if (opt.spider && opt.recursive)
+ /* Maybe we should always keep track of broken links, not just in
+ * spider mode. */
+ if (opt.spider)
+ {
+ /* #### Again: ugly ugly ugly! */
+ if (!hurl)
+ hurl = url_string (u, true);
+ nonexisting_url (hurl);
+ logprintf (LOG_NOTQUIET, _("\
+Remote file does not exist -- broken link!!!\n"));
+ }
+ else
{
- if (!hurl) hurl = url_string (u, true);
- nonexisting_url (hurl, referer);
+ logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
+ tms, hstat.statcode, escnonprint (hstat.error));
}
- logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
- tms, hstat.statcode, escnonprint (hstat.error));
logputs (LOG_VERBOSE, "\n");
ret = WRONGCODE;
xfree_null (hurl);
/* The time-stamping section. */
if (opt.timestamping)
{
- if (hstat.orig_file_name) /* Perform this check only if the file we're
- supposed to download already exists. */
+ if (hstat.orig_file_name) /* Perform the following checks only
+ if the file we're supposed to
+ download already exists. */
{
- if (hstat.remote_time && tmr != (time_t) (-1))
+ if (hstat.remote_time
+     && tmr != (time_t) (-1))
{
/* Now time-stamping can be used validly. Time-stamping
means that if the sizes of the local and remote file
download procedure is resumed. */
if (hstat.orig_file_tstamp >= tmr)
{
- if (hstat.contlen == -1 || hstat.orig_file_size == hstat.contlen)
+ if (hstat.contlen == -1
+ || hstat.orig_file_size == hstat.contlen)
{
logprintf (LOG_VERBOSE, _("\
Server file no newer than local file `%s' -- not retrieving.\n\n"),
got_name = true;
restart_loop = true;
}
+
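+ /* In spider mode, a successful HEAD means the remote resource
+    exists; fetch the body only when it is HTML and recursion is
+    enabled, so that it can be parsed for links to further
+    resources.  */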
+ if (opt.spider)
+ {
+ if (opt.recursive)
+ {
+ if (*dt & TEXTHTML)
+ {
+ logputs (LOG_VERBOSE, _("\
+Remote file exists and could contain links to other resources -- retrieving.\n\n"));
+ restart_loop = true;
+ }
+ else
+ {
+ logprintf (LOG_VERBOSE, _("\
+Remote file exists but does not contain any link -- not retrieving.\n\n"));
+ ret = RETRUNNEEDED;
+ goto exit;
+ }
+ }
+ else
+ {
+ logprintf (LOG_VERBOSE, _("\
+Remote file exists but recursion is disabled -- not retrieving.\n\n"));
+ ret = RETRUNNEEDED;
+ goto exit;
+ }
+ }
got_head = true; /* no more time-stamping */
*dt &= ~HEAD_ONLY;
}
if ((tmr != (time_t) (-1))
- && (!opt.spider || opt.recursive)
&& ((hstat.len == hstat.contlen) ||
((hstat.res == 0) && (hstat.contlen == -1))))
{
}
/* End of time-stamping section. */
- if (opt.spider && !opt.recursive)
- {
- logprintf (LOG_NOTQUIET, "%d %s\n\n", hstat.statcode,
- escnonprint (hstat.error));
- ret = RETROK;
- goto exit;
- }
-
tmrate = retr_rate (hstat.rd_size, hstat.dltime);
total_download_time += hstat.dltime;
}
}
+ if (opt.spider)
+ {
+ visited_url (url, referer);
+ }
+
if (descend
&& depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
{
file);
if (unlink (file))
logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
+ logputs (LOG_VERBOSE, "\n");
register_delete_file (file);
}
if (string_set_contains (blacklist, url))
{
+ if (opt.spider)
+ {
+ char *referrer = url_string (parent, true);
+ DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url));
+ visited_url (url, referrer);
+ xfree (referrer);
+ }
DEBUGP (("Already on the black list.\n"));
goto out;
}
/* Support for Robot Exclusion Standard (RES).
- Copyright (C) 2001 Free Software Foundation, Inc.
+ Copyright (C) 2001, 2006 Free Software Foundation, Inc.
This file is part of Wget.
{
uerr_t err;
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
+ int saved_ts_val = opt.timestamping;
+ int saved_sp_val = opt.spider;
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
*file = NULL;
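+ /* Disable time-stamping and spider mode while retrieving robots.txt,
+    so that it is always fetched with a normal GET rather than a
+    HEAD-only probe.  */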
+ opt.timestamping = false;
+ opt.spider = false;
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
+ opt.timestamping = saved_ts_val;
+ opt.spider = saved_sp_val;
xfree (robots_url);
if (err != RETROK && *file != NULL)
--- /dev/null
+/* Keep track of visited URLs in spider mode.
+ Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables. You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL". If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so. If you do not wish to do
+so, delete this exception statement from your version. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "wget.h"
+#include "spider.h"
+#include "url.h"
+#include "utils.h"
+#include "hash.h"
+#include "res.h"
+
+
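+/* visited_urls_hash maps each visited URL to a url_list of the URLs
+   that referred to it; nonexisting_urls_set is the set of URLs that
+   turned out to be broken.  */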
+static struct hash_table *visited_urls_hash;
+static struct hash_table *nonexisting_urls_set;
+
+/* Clean up the data structures associated with this file. */
+
+void
+spider_cleanup (void)
+{
+ if (visited_urls_hash)
+ {
+ free_keys_and_values (visited_urls_hash);
+ hash_table_destroy (visited_urls_hash);
+ visited_urls_hash = NULL;
+ }
+ if (nonexisting_urls_set)
+ string_set_free (nonexisting_urls_set);
+}
+\f
+/* Remembers visited URLs and the referrers that led to them. */
+
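+/* A singly linked list of URLs, used here to hold the referrers of a
+   visited URL.  */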
+struct url_list
+{
+ char *url;
+ struct url_list *next;
+};
+
+static bool
+in_url_list_p (const struct url_list *list, const char *url)
+{
+ const struct url_list *ptr;
+
+ for (ptr = list; ptr; ptr = ptr->next)
+ {
+ /* str[case]cmp is inadequate for URL comparison */
+ if (are_urls_equal (url, ptr->url))
+ return true;
+ }
+
+ return false;
+}
+
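+/* Record that URL has been visited in spider mode, and append
+   REFERRER (if any, and if not already present) to the list of URLs
+   that link to it.  */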
+void
+visited_url (const char *url, const char *referrer)
+{
+ struct url_list *list;
+
+ /* Ignore robots.txt URLs */
+ if (is_robots_txt_url (url))
+ return;
+
+ if (!visited_urls_hash)
+ visited_urls_hash = make_string_hash_table (0);
+
+ list = hash_table_get (visited_urls_hash, url);
+ if (!list)
+ {
+ list = (struct url_list *) xnew0 (struct url_list);
+ list->url = referrer ? xstrdup (referrer) : NULL;
+ hash_table_put (visited_urls_hash, xstrdup (url), list);
+ }
+ else if (referrer && !in_url_list_p (list, referrer))
+ {
+ /* Append referrer at the end of the list */
+ struct url_list *newnode;
+
+ while (list->next)
+ list = list->next;
+
+ newnode = (struct url_list *) xnew0 (struct url_list);
+ newnode->url = xstrdup (referrer);
+ list->next = newnode;
+ }
+}
+\f
+/* Remembers broken links. */
+void
+nonexisting_url (const char *url)
+{
+ /* Ignore robots.txt URLs */
+ if (is_robots_txt_url (url))
+ return;
+ if (!nonexisting_urls_set)
+ nonexisting_urls_set = make_string_hash_table (0);
+ string_set_add (nonexisting_urls_set, url);
+}
+
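+/* Print the URLs found broken, each followed by the list of URLs
+   that refer to it (as recorded by visited_url).  */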
+void
+print_broken_links (void)
+{
+ hash_table_iterator iter;
+ int num_elems;
+
+ if (!nonexisting_urls_set)
+ {
+ logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
+ return;
+ }
+
+ num_elems = hash_table_count (nonexisting_urls_set);
+ assert (num_elems > 0);
+
+ if (num_elems > 1)
+ {
+ logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
+ num_elems);
+ }
+ else
+ {
+ logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
+ }
+
+ for (hash_table_iterate (nonexisting_urls_set, &iter);
+ hash_table_iter_next (&iter); )
+ {
+ struct url_list *list;
+ const char *url = (const char *) iter.key;
+
+ logprintf (LOG_NOTQUIET, _("%s referred by:\n"), url);
+
+ for (list = visited_urls_hash
+        ? (struct url_list *) hash_table_get (visited_urls_hash, url)
+        : NULL;
+      list; list = list->next)
+ {
+ logprintf (LOG_NOTQUIET, _(" %s\n"), list->url);
+ }
+ }
+ logputs (LOG_NOTQUIET, "\n");
+}
+
+/*
+ * vim: et ts=2 sw=2
+ */
+
--- /dev/null
+/* Declarations for spider.c
+ Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables. You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL". If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so. If you do not wish to do
+so, delete this exception statement from your version. */
+
+#ifndef SPIDER_H
+#define SPIDER_H
+
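+/* See spider.c: visited_url records a URL and the referrer that led
+   to it, nonexisting_url marks a URL as broken, and
+   print_broken_links reports all broken URLs with their referrers.  */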
+void visited_url (const char *, const char *);
+void nonexisting_url (const char *);
+void print_broken_links (void);
+
+#endif /* SPIDER_H */