/* Handling of recursive HTTP retrieving.
Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
- 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
+ 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
-#include <config.h>
+#include "wget.h"
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
-#include "wget.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
#include "hash.h"
#include "res.h"
#include "convert.h"
+#include "html-url.h"
+#include "css-url.h"
#include "spider.h"
+#include "iri.h"
\f
/* Functions for maintaining the URL queue. */
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
-
+ char *remote_encoding;
+ bool css_allowed; /* whether the document is allowed to
+ be treated as CSS. */
struct queue_element *next; /* next element in queue */
};
static void
url_enqueue (struct url_queue *queue,
- const char *url, const char *referer, int depth, bool html_allowed)
+ const char *url, const char *referer, int depth,
+ bool html_allowed, bool css_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
+ char *charset = get_current_charset ();
qel->url = url;
qel->referer = referer;
qel->depth = depth;
qel->html_allowed = html_allowed;
+ qel->css_allowed = css_allowed;
qel->next = NULL;
+ if (charset)
+ qel->remote_encoding = xstrdup (charset);
+ else
+ qel->remote_encoding = NULL;
+
++queue->count;
if (queue->count > queue->maxcount)
queue->maxcount = queue->count;
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
+ /*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/
+
if (queue->tail)
queue->tail->next = qel;
queue->tail = qel;
static bool
url_dequeue (struct url_queue *queue,
const char **url, const char **referer, int *depth,
- bool *html_allowed)
+ bool *html_allowed, bool *css_allowed)
{
struct queue_element *qel = queue->head;
if (!queue->head)
queue->tail = NULL;
+ set_remote_charset (qel->remote_encoding);
+ if (qel->remote_encoding)
+ xfree (qel->remote_encoding);
+
*url = qel->url;
*referer = qel->referer;
*depth = qel->depth;
*html_allowed = qel->html_allowed;
+ *css_allowed = qel->css_allowed;
--queue->count;
struct hash_table *blacklist;
int up_error_code;
- struct url *start_url_parsed = url_parse (start_url, &up_error_code);
+ struct url *start_url_parsed;
+ set_ugly_no_encode (true);
+ start_url_parsed= url_parse (start_url, &up_error_code);
+ set_ugly_no_encode (false);
if (!start_url_parsed)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
/* Enqueue the starting URL. Use start_url_parsed->url rather than
just URL so we enqueue the canonical form of the URL. */
- url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
+ url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
string_set_add (blacklist, start_url_parsed->url);
while (1)
bool descend = false;
char *url, *referer, *file = NULL;
int depth;
- bool html_allowed;
+ bool html_allowed, css_allowed;
+ bool is_css = false;
bool dash_p_leaf_HTML = false;
if (opt.quota && total_downloaded_bytes > opt.quota)
if (!url_dequeue (queue,
(const char **)&url, (const char **)&referer,
- &depth, &html_allowed))
+ &depth, &html_allowed, &css_allowed))
break;
/* ...and download it. Note that this download is in most cases
DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
url, file));
+ /* this sucks, needs to be combined! */
if (html_allowed
&& downloaded_html_set
&& string_set_contains (downloaded_html_set, file))
- descend = true;
+ {
+ descend = true;
+ is_css = false;
+ }
+ if (css_allowed
+ && downloaded_css_set
+ && string_set_contains (downloaded_css_set, file))
+ {
+ descend = true;
+ is_css = true;
+ }
}
else
{
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
- descend = true;
+ {
+ descend = true;
+ is_css = false;
+ }
+
+ /* a little different, css_allowed can override content type
+ lots of web servers serve css with an incorrect content type
+ */
+ if (file && status == RETROK
+ && (dt & RETROKF) &&
+ ((dt & TEXTCSS) || css_allowed))
+ {
+ descend = true;
+ is_css = true;
+ }
if (redirected)
{
}
}
- /* If the downloaded document was HTML, parse it and enqueue the
+ /* If the downloaded document was HTML or CSS, parse it and enqueue the
links it contains. */
if (descend)
{
bool meta_disallow_follow = false;
struct urlpos *children
- = get_urls_html (file, url, &meta_disallow_follow);
+ = is_css ? get_urls_css_file (file, url) :
+ get_urls_html (file, url, &meta_disallow_follow);
if (opt.use_robots && meta_disallow_follow)
{
if (children)
{
struct urlpos *child = children;
- struct url *url_parsed = url_parsed = url_parse (url, NULL);
+ set_ugly_no_encode (true);
+ struct url *url_parsed = url_parse (url, NULL);
+ set_ugly_no_encode (false);
char *referer_url = url;
bool strip_auth = (url_parsed != NULL
&& url_parsed->user != NULL);
{
url_enqueue (queue, xstrdup (child->url->url),
xstrdup (referer_url), depth + 1,
- child->link_expect_html);
+ child->link_expect_html,
+ child->link_expect_css);
/* We blacklist the URL we have enqueued, because we
don't want to enqueue (and hence download) the
same URL twice. */
}
}
- if (file
- && (opt.delete_after
+ if (file
+ && (opt.delete_after
|| opt.spider /* opt.recursive is implicitely true */
|| !acceptable (file)))
{
/* Either --delete-after was specified, or we loaded this
- (otherwise unneeded because of --spider or rejected by -R)
- HTML file just to harvest its hyperlinks -- in either case,
+ (otherwise unneeded because of --spider or rejected by -R)
+ HTML file just to harvest its hyperlinks -- in either case,
delete the local file. */
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
opt.delete_after ? "--delete-after" :
- (opt.spider ? "--spider" :
+ (opt.spider ? "--spider" :
"recursive rejection criteria")));
logprintf (LOG_VERBOSE,
(opt.delete_after || opt.spider
{
char *d1, *d2;
int d3;
- bool d4;
+ bool d4, d5;
while (url_dequeue (queue,
- (const char **)&d1, (const char **)&d2, &d3, &d4))
+ (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
{
xfree (d1);
xfree_null (d2);
if (string_set_contains (blacklist, url))
{
- if (opt.spider)
+ if (opt.spider)
{
char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
- DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url));
+ DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
visited_url (url, referrer);
xfree (referrer);
}
if (res_retrieve_file (url, &rfile))
{
specs = res_parse_from_file (rfile);
+
+ /* Delete the robots.txt file if we chose to either delete the
+ files after downloading or we're just running a spider. */
+ if (opt.delete_after || opt.spider)
+ {
+ logprintf (LOG_VERBOSE, "Removing %s.\n", rfile);
+ if (unlink (rfile))
+ logprintf (LOG_NOTQUIET, "unlink: %s\n",
+ strerror (errno));
+ }
+
xfree (rfile);
}
else
struct urlpos *upos;
bool success;
+ set_ugly_no_encode (true);
orig_parsed = url_parse (original, NULL);
assert (orig_parsed != NULL);
new_parsed = url_parse (redirected, NULL);
assert (new_parsed != NULL);
+ set_ugly_no_encode (false);
upos = xnew0 (struct urlpos);
upos->url = new_parsed;