/* Handling of recursive HTTP retrieving.
Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
- 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
+ 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
-#include <config.h>
+#include "wget.h"
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
-#include "wget.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
#include "hash.h"
#include "res.h"
#include "convert.h"
+#include "html-url.h"
+#include "css-url.h"
#include "spider.h"
-\f
+
/* Functions for maintaining the URL queue. */
struct queue_element {
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
-
+ bool css_allowed; /* whether the document is allowed to
+ be treated as CSS. */
struct queue_element *next; /* next element in queue */
};
static void
url_enqueue (struct url_queue *queue,
- const char *url, const char *referer, int depth, bool html_allowed)
+ const char *url, const char *referer, int depth,
+ bool html_allowed, bool css_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
qel->url = url;
qel->referer = referer;
qel->depth = depth;
qel->html_allowed = html_allowed;
+ qel->css_allowed = css_allowed;
qel->next = NULL;
++queue->count;
static bool
url_dequeue (struct url_queue *queue,
const char **url, const char **referer, int *depth,
- bool *html_allowed)
+ bool *html_allowed, bool *css_allowed)
{
struct queue_element *qel = queue->head;
*referer = qel->referer;
*depth = qel->depth;
*html_allowed = qel->html_allowed;
+ *css_allowed = qel->css_allowed;
--queue->count;
\f
static bool download_child_p (const struct urlpos *, struct url *, int,
struct url *, struct hash_table *);
-static bool descend_redirect_p (const char *, const char *, int,
+static bool descend_redirect_p (const char *, struct url *, int,
struct url *, struct hash_table *);
options, add it to the queue. */
uerr_t
-retrieve_tree (const char *start_url)
+retrieve_tree (struct url *start_url_parsed)
{
uerr_t status = RETROK;
the queue, but haven't been downloaded yet. */
struct hash_table *blacklist;
- int up_error_code;
- struct url *start_url_parsed = url_parse (start_url, &up_error_code);
-
- if (!start_url_parsed)
- {
- logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
- url_error (up_error_code));
- return URLERROR;
- }
-
queue = url_queue_new ();
blacklist = make_string_hash_table (0);
/* Enqueue the starting URL. Use start_url_parsed->url rather than
just URL so we enqueue the canonical form of the URL. */
- url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
+ url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
string_set_add (blacklist, start_url_parsed->url);
while (1)
bool descend = false;
char *url, *referer, *file = NULL;
int depth;
- bool html_allowed;
+ bool html_allowed, css_allowed;
+ bool is_css = false;
bool dash_p_leaf_HTML = false;
if (opt.quota && total_downloaded_bytes > opt.quota)
if (!url_dequeue (queue,
(const char **)&url, (const char **)&referer,
- &depth, &html_allowed))
+ &depth, &html_allowed, &css_allowed))
break;
/* ...and download it. Note that this download is in most cases
DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
url, file));
+ /* FIXME: the HTML check and the CSS check below are near-duplicates and should be combined. */
if (html_allowed
&& downloaded_html_set
&& string_set_contains (downloaded_html_set, file))
- descend = true;
+ {
+ descend = true;
+ is_css = false;
+ }
+ if (css_allowed
+ && downloaded_css_set
+ && string_set_contains (downloaded_css_set, file))
+ {
+ descend = true;
+ is_css = true;
+ }
}
else
{
- int dt = 0;
+ int dt = 0, url_err;
char *redirected = NULL;
+ struct url *url_parsed = url_parse (url, &url_err);
- status = retrieve_url (url, &file, &redirected, referer, &dt, false);
+ if (!url_parsed)
+ {
+ char *error = url_error (url, url_err);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
+ xfree (error);
+ status = URLERROR;
+ }
+ else
+ {
+ status = retrieve_url (url_parsed, url, &file, &redirected,
+ referer, &dt, false);
+ }
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
- descend = true;
+ {
+ descend = true;
+ is_css = false;
+ }
+
+ /* The CSS case is a little different: css_allowed can override the
+ content type, because many web servers serve CSS with an incorrect one.
+ */
+ if (file && status == RETROK
+ && (dt & RETROKF) &&
+ ((dt & TEXTCSS) || css_allowed))
+ {
+ descend = true;
+ is_css = true;
+ }
if (redirected)
{
want to follow it. */
if (descend)
{
- if (!descend_redirect_p (redirected, url, depth,
+ if (!descend_redirect_p (redirected, url_parsed, depth,
start_url_parsed, blacklist))
descend = false;
else
xfree (url);
url = redirected;
}
+ url_free(url_parsed);
}
if (opt.spider)
}
}
- /* If the downloaded document was HTML, parse it and enqueue the
+ /* If the downloaded document was HTML or CSS, parse it and enqueue the
links it contains. */
if (descend)
{
bool meta_disallow_follow = false;
struct urlpos *children
- = get_urls_html (file, url, &meta_disallow_follow);
+ = is_css ? get_urls_css_file (file, url) :
+ get_urls_html (file, url, &meta_disallow_follow);
if (opt.use_robots && meta_disallow_follow)
{
{
url_enqueue (queue, xstrdup (child->url->url),
xstrdup (referer_url), depth + 1,
- child->link_expect_html);
+ child->link_expect_html,
+ child->link_expect_css);
/* We blacklist the URL we have enqueued, because we
don't want to enqueue (and hence download) the
same URL twice. */
{
char *d1, *d2;
int d3;
- bool d4;
+ bool d4, d5;
while (url_dequeue (queue,
- (const char **)&d1, (const char **)&d2, &d3, &d4))
+ (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
{
xfree (d1);
xfree_null (d2);
}
url_queue_delete (queue);
- if (start_url_parsed)
- url_free (start_url_parsed);
string_set_free (blacklist);
if (opt.quota && total_downloaded_bytes > opt.quota)
if (opt.spider)
{
char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
- DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url));
+ DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
visited_url (url, referrer);
xfree (referrer);
}
if (res_retrieve_file (url, &rfile))
{
specs = res_parse_from_file (rfile);
+
+ /* Delete the robots.txt file if we were asked to delete files
+ after downloading, or if we're just running a spider. */
+ if (opt.delete_after || opt.spider)
+ {
+ logprintf (LOG_VERBOSE, "Removing %s.\n", rfile);
+ if (unlink (rfile))
+ logprintf (LOG_NOTQUIET, "unlink: %s\n",
+ strerror (errno));
+ }
+
xfree (rfile);
}
else
it is merely a simple-minded wrapper around download_child_p. */
static bool
-descend_redirect_p (const char *redirected, const char *original, int depth,
+descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
struct url *start_url_parsed, struct hash_table *blacklist)
{
- struct url *orig_parsed, *new_parsed;
+ struct url *new_parsed;
struct urlpos *upos;
bool success;
- orig_parsed = url_parse (original, NULL);
assert (orig_parsed != NULL);
new_parsed = url_parse (redirected, NULL);
success = download_child_p (upos, orig_parsed, depth,
start_url_parsed, blacklist);
- url_free (orig_parsed);
url_free (new_parsed);
xfree (upos);