+2006-05-25 Mauro Tortonesi <mauro@ferrara.linux.it>
+
+ * convert.c: Added mechanisms to keep track of broken links.
+
+ * convert.h: Ditto.
+
+ * wget.h: Reordered and enumerated uerr_t constants.
+
+ * recur.c: Fixes to support recursive spider mode.
+
+ * http.c: Ditto.
+
+ * main.c: Print broken links when in recursive spider mode.
+
+ * retr.c: Changed interface of retrieve_url.
+
+ * retr.h: Ditto.
+
+ * ftp.c: Changed interface of ftp_loop.
+
+ * ftp.h: Ditto.
+
+ * res.c: Minor change to reflect the changed interface of retrieve_url.
+
2006-05-18 Lawrence Jones <lawrence.jones@ugs.com>

 * ftp-ls.c (ftp_parse_unix_ls): Correct size parsing, add size

/* Set of HTML files downloaded in this Wget run, used for link
   conversion after Wget is done. */
struct hash_table *downloaded_html_set;
+static struct hash_table *nonexisting_urls_hash;
+
static void convert_links (const char *, struct urlpos *);
/* This function is called when the retrieval is done to convert the
}
static void downloaded_files_free (void);
+static void nonexisting_urls_free (void);
/* Clean up the data structures associated with this file. */
if (downloaded_html_set)
string_set_free (downloaded_html_set);
downloaded_files_free ();
+ nonexisting_urls_free ();
if (converted_files)
string_set_free (converted_files);
}
downloaded_files_hash = NULL;
}
}
+\f
+/* Remembers broken links. */
+
+struct broken_urls_list
+{
+ char *url;
+ struct broken_urls_list *next;
+};
+
+static bool
+in_list (const struct broken_urls_list *list, const char *url)
+{
+ const struct broken_urls_list *ptr;
+
+ for (ptr = list; ptr; ptr = ptr->next)
+ {
+      /* TODO: strcasecmp may not be appropriate to compare URLs */
+      if (ptr->url && strcasecmp (url, ptr->url) == 0)
+        return true;
+ }
+
+ return false;
+}
+
+void
+nonexisting_url (const char *url, const char *referrer)
+{
+ struct broken_urls_list *list;
+
+ if (!nonexisting_urls_hash)
+ nonexisting_urls_hash = make_string_hash_table (0);
+
+ list = hash_table_get (nonexisting_urls_hash, url);
+ if (!list)
+ {
+      list = xnew0 (struct broken_urls_list);
+      /* The "url" member actually stores a referrer; it is NULL when
+         the broken URL was reached without one.  */
+      list->url = referrer ? xstrdup (referrer) : NULL;
+ hash_table_put (nonexisting_urls_hash, xstrdup (url), list);
+ }
+  else if (referrer && !in_list (list, referrer))
+    {
+      /* Append the new referrer at the end of the list.  */
+      struct broken_urls_list *newnode;
+
+      while (list->next)
+        list = list->next;
+
+ newnode = xnew0 (struct broken_urls_list);
+ newnode->url = xstrdup (referrer);
+ list->next = newnode;
+ }
+}
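+
+/* Usage sketch (hypothetical URLs): reporting the same broken URL
+   from two different referrers,
+
+     nonexisting_url ("http://host/missing.html", "http://host/a.html");
+     nonexisting_url ("http://host/missing.html", "http://host/b.html");
+
+   leaves a single hash key, "http://host/missing.html", whose value
+   is the referrer list ("http://host/a.html", "http://host/b.html").  */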
+
+static void
+nonexisting_urls_free (void)
+{
+  if (nonexisting_urls_hash)
+    {
+      hash_table_iterator iter;
+      for (hash_table_iterate (nonexisting_urls_hash, &iter);
+           hash_table_iter_next (&iter);
+           )
+        {
+          struct broken_urls_list *list, *next;
+
+          /* Free every node of the referrer list, along with the
+             strings it owns; freeing only the head node would leak
+             the rest of the chain.  */
+          for (list = iter.value; list; list = next)
+            {
+              next = list->next;
+              xfree_null (list->url);
+              xfree (list);
+            }
+          xfree (iter.key);
+        }
+      hash_table_destroy (nonexisting_urls_hash);
+      nonexisting_urls_hash = NULL;
+    }
+}
+
+void
+print_broken_links (void)
+{
+ hash_table_iterator iter;
+ int num_elems;
+
+ if (!nonexisting_urls_hash)
+ {
+ logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
+ return;
+ }
+
+ num_elems = hash_table_count (nonexisting_urls_hash);
+ assert (num_elems > 0);
+
+  if (num_elems > 1)
+    logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"), num_elems);
+  else
+    logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
+
+ for (hash_table_iterate (nonexisting_urls_hash, &iter);
+ hash_table_iter_next (&iter);
+ )
+ {
+ struct broken_urls_list *list;
+
+ logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key);
+
+      for (list = (struct broken_urls_list *) iter.value;
+           list;
+           list = list->next)
+        {
+          /* The head node's url is NULL when the broken URL had no
+             referrer; never pass NULL to the %s directive.  */
+          if (list->url)
+            logprintf (LOG_NOTQUIET, _(" %s\n"), list->url);
+        }
+ }
+ logputs (LOG_NOTQUIET, "\n");
+}
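+
+/* Sample report (hypothetical URLs):
+
+     Found 1 broken link.
+
+     http://host/missing.html referred by:
+      http://host/a.html
+      http://host/b.html
+*/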
+
\f
/* The function returns the pointer to the malloc-ed quoted version of
string s. It will recognize and quote numeric and special graphic
char *html_quote_string (const char *);
+void nonexisting_url (const char *, const char *);
+void print_broken_links (void);
+
#endif /* CONVERT_H */
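As the ChangeLog above notes, nonexisting_url is called from http.c when a
fetch fails during a recursive spider run, and print_broken_links from
main.c once all downloads are finished; both hunks appear further below.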
of URL. Inherently, its capabilities are limited to what can be
encoded into a URL. */
uerr_t
-ftp_loop (struct url *u, int *dt, struct url *proxy)
+ftp_loop (struct url *u, int *dt, struct url *proxy, bool recursive, bool glob)
{
ccon con; /* FTP connection */
uerr_t res;
/* If the file name is empty, the user probably wants a directory
index. We'll provide one, properly HTML-ized. Unless
opt.htmlify is 0, of course. :-) */
- if (!*u->file && !opt.recursive)
+ if (!*u->file && !recursive)
{
struct fileinfo *f;
res = ftp_get_listing (u, &con, &f);
else
{
bool ispattern = false;
- if (opt.ftp_glob)
+ if (glob)
{
/* Treat the URL as a pattern if the file name part of the
URL path contains wildcards. (Don't check for u->file
file_part = u->path;
ispattern = has_wildcards_p (file_part);
}
- if (ispattern || opt.recursive || opt.timestamping)
+ if (ispattern || recursive || opt.timestamping)
{
/* ftp_retrieve_glob is a catch-all function that gets called
if we need globbing, time-stamping or recursion. Its
};
struct fileinfo *ftp_parse_ls (const char *, const enum stype);
-uerr_t ftp_loop (struct url *, int *, struct url *);
+uerr_t ftp_loop (struct url *, int *, struct url *, bool, bool);
uerr_t ftp_index (const char *, struct url *, struct fileinfo *);
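The two added booleans replace ftp_loop's direct reads of opt.recursive and
opt.ftp_glob, so each caller now decides per invocation. A minimal sketch of
the updated call, mirroring the retr.c hunk further below:

    result = ftp_loop (u, dt, proxy_url, recursive, glob);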
/* Default document type is empty. However, if spider mode is
   on or time-stamping is employed, a HEAD_ONLY request is
   encoded within *dt. */
- if (opt.spider || (opt.timestamping && !got_head))
+ if ((opt.spider && !opt.recursive) || (opt.timestamping && !got_head))
*dt |= HEAD_ONLY;
else
*dt &= ~HEAD_ONLY;
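In recursive spider mode the body is still downloaded, since it must be
parsed for links to follow; the recur.c hunk further below then deletes the
local file once its hyperlinks have been harvested.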
/* All possibilities should have been exhausted. */
abort ();
}
-
+
if (!(*dt & RETROKF))
{
+ char *hurl = NULL;
if (!opt.verbose)
{
/* #### Ugly ugly ugly! */
- char *hurl = url_string (u, true);
+ hurl = url_string (u, true);
logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
- xfree (hurl);
+ }
+ if (opt.spider && opt.recursive)
+ {
+          if (!hurl)
+            hurl = url_string (u, true);
+ nonexisting_url (hurl, referer);
}
logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
tms, hstat.statcode, escnonprint (hstat.error));
logputs (LOG_VERBOSE, "\n");
ret = WRONGCODE;
+ xfree_null (hurl);
goto exit;
}
}
if ((tmr != (time_t) (-1))
- && !opt.spider
+ && (!opt.spider || opt.recursive)
&& ((hstat.len == hstat.contlen) ||
((hstat.res == 0) && (hstat.contlen == -1))))
{
}
/* End of time-stamping section. */
- if (opt.spider)
+ if (opt.spider && !opt.recursive)
{
logprintf (LOG_NOTQUIET, "%d %s\n\n", hstat.statcode,
escnonprint (hstat.error));
&& url_scheme (*t) != SCHEME_FTP)
status = retrieve_tree (*t);
else
- status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt);
+ status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
if (opt.delete_after && file_exists_p(filename))
{
logprintf (LOG_NOTQUIET, _("No URLs found in %s.\n"),
opt.input_filename);
}
+
+  /* In recursive spider mode (e.g. "wget -r --spider URL"), report
+     the broken links found while traversing.  */
+  if (opt.recursive && opt.spider)
+    print_broken_links ();
+
/* Print the downloaded sum. */
if ((opt.recursive || opt.page_requisites
|| nurl > 1
{
int dt = 0;
char *redirected = NULL;
- bool oldrec = opt.recursive;
- opt.recursive = false;
- status = retrieve_url (url, &file, &redirected, referer, &dt);
- opt.recursive = oldrec;
+ status = retrieve_url (url, &file, &redirected, referer, &dt, false);
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
}
}
- if (opt.delete_after || (file && !acceptable (file)))
+ if (file
+ && (opt.delete_after
+      || opt.spider /* opt.recursive is implicitly true */
+ || !acceptable (file)))
{
/* Either --delete-after was specified, or we loaded this
- otherwise rejected (e.g. by -R) HTML file just so we
- could harvest its hyperlinks -- in either case, delete
- the local file. */
+ (otherwise unneeded because of --spider or rejected by -R)
+ HTML file just to harvest its hyperlinks -- in either case,
+ delete the local file. */
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
opt.delete_after ? "--delete-after" :
- "recursive rejection criteria"));
+ (opt.spider ? "--spider" :
+ "recursive rejection criteria")));
logprintf (LOG_VERBOSE,
- (opt.delete_after
+ (opt.delete_after || opt.spider
? _("Removing %s.\n")
: _("Removing %s since it should be rejected.\n")),
file);
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
*file = NULL;
- err = retrieve_url (robots_url, file, NULL, NULL, NULL);
+ err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
xfree (robots_url);
if (err != RETROK && *file != NULL)
uerr_t
retrieve_url (const char *origurl, char **file, char **newloc,
- const char *refurl, int *dt)
+ const char *refurl, int *dt, bool recursive)
{
uerr_t result;
char *url;
/* If this is a redirection, turn off recursion and FTP globbing,
   both being undesirable when following redirects. */
-  bool oldrec = opt.recursive, oldglob = opt.ftp_glob;
+  bool glob = opt.ftp_glob;
   if (redirection_count)
-    opt.recursive = opt.ftp_glob = false;
+    recursive = glob = false;
-  result = ftp_loop (u, dt, proxy_url);
-  opt.recursive = oldrec;
-  opt.ftp_glob = oldglob;
+  /* RECURSIVE is passed by value, so clearing it here needs no
+     restore afterwards.  */
+  result = ftp_loop (u, dt, proxy_url, recursive, glob);
/* There is a possibility of having HTTP being redirected to
FTP. In these cases we must decide whether the text is HTML
&& cur_url->url->scheme != SCHEME_FTP)
status = retrieve_tree (cur_url->url->url);
else
- status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
+ status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
if (filename && opt.delete_after && file_exists_p (filename))
{
char *fd_read_hunk (int, hunk_terminator_t, long, long);
char *fd_read_line (int);
-uerr_t retrieve_url (const char *, char **, char **, const char *, int *);
+uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool);
uerr_t retrieve_from_file (const char *, bool, int *);
const char *retr_rate (wgint, double);
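The trailing bool tells retrieve_url whether the fetch is part of a
recursive retrieval: call sites that previously saved and cleared
opt.recursive around the call now simply pass false, while the top-level
call sites pass opt.recursive.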
simplified. */
typedef enum
{
+ /* 0 */
NOCONERROR, HOSTERR, CONSOCKERR, CONERROR, CONSSLERR,
- CONIMPOSSIBLE, NEWLOCATION, NOTENOUGHMEM, CONPORTERR,
- CONCLOSED, FTPOK, FTPLOGINC, FTPLOGREFUSED, FTPPORTERR, FTPSYSERR,
- FTPNSFOD, FTPRETROK, FTPUNKNOWNTYPE, FTPRERR,
- FTPREXC, FTPSRVERR, FTPRETRINT, FTPRESTFAIL, URLERROR,
- FOPENERR, FOPEN_EXCL_ERR, FWRITEERR, HOK, HLEXC, HEOF,
+ CONIMPOSSIBLE, NEWLOCATION, NOTENOUGHMEM, CONPORTERR, CONCLOSED,
+ /* 10 */
+ FTPOK, FTPLOGINC, FTPLOGREFUSED, FTPPORTERR, FTPSYSERR,
+ FTPNSFOD, FTPRETROK, FTPUNKNOWNTYPE, FTPRERR, FTPREXC,
+ /* 20 */
+ FTPSRVERR, FTPRETRINT, FTPRESTFAIL, URLERROR, FOPENERR,
+ FOPEN_EXCL_ERR, FWRITEERR, HOK, HLEXC, HEOF,
+ /* 30 */
HERR, RETROK, RECLEVELEXC, FTPACCDENIED, WRONGCODE,
- FTPINVPASV, FTPNOPASV,
- CONTNOTSUPPORTED, RETRUNNEEDED, RETRFINISHED, READERR, TRYLIMEXC,
- URLBADPATTERN, FILEBADFILE, RANGEERR, RETRBADPATTERN,
- RETNOTSUP, ROBOTSOK, NOROBOTS, PROXERR, AUTHFAILED,
- QUOTEXC, WRITEFAILED, SSLINITFAILED
+ FTPINVPASV, FTPNOPASV, CONTNOTSUPPORTED, RETRUNNEEDED, RETRFINISHED,
+ /* 40 */
+ READERR, TRYLIMEXC, URLBADPATTERN, FILEBADFILE, RANGEERR,
+ RETRBADPATTERN, RETNOTSUP, ROBOTSOK, NOROBOTS, PROXERR,
+ /* 50 */
+ AUTHFAILED, QUOTEXC, WRITEFAILED, SSLINITFAILED
} uerr_t;
#endif /* WGET_H */
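With this layout the decade markers map a numeric uerr_t straight to its
name: FTPOK is 10, FTPSRVERR is 20, HERR is 30, READERR is 40, and
AUTHFAILED is 50.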