X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Frecur.c;h=741ca823094a75c0fd760f44cd126dba88e81216;hb=123f5c39669abc055987d69a311785c861494c87;hp=87440b41c8c7351a89f364673c950d85b3f620d5;hpb=68740f10dd55cb272bcad0bd0c5199bbdef5b26e;p=wget
diff --git a/src/recur.c b/src/recur.c
index 87440b41..741ca823 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -1,6 +1,6 @@
/* Handling of recursive HTTP retrieving.
Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
- 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
+ 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
@@ -17,17 +17,18 @@ GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Wget. If not, see <http://www.gnu.org/licenses/>.
-In addition, as a special exception, the Free Software Foundation
-gives permission to link the code of its release of Wget with the
-OpenSSL project's "OpenSSL" library (or with modified versions of it
-that use the same license as the "OpenSSL" library), and distribute
-the linked executables. You must obey the GNU General Public License
-in all respects for all of the code used other than "OpenSSL". If you
-modify this file, you may extend this exception to your version of the
-file, but you are not obligated to do so. If you do not wish to do
-so, delete this exception statement from your version. */
+Additional permission under GNU GPL version 3 section 7
-#include <config.h>
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
+
+#include "wget.h"
#include <stdio.h>
#include <stdlib.h>
@@ -38,7 +39,6 @@ so, delete this exception statement from your version. */
#include <errno.h>
#include <assert.h>
-#include "wget.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
@@ -48,8 +48,10 @@ so, delete this exception statement from your version. */
#include "hash.h"
#include "res.h"
#include "convert.h"
+#include "html-url.h"
+#include "css-url.h"
#include "spider.h"
-
+
/* Functions for maintaining the URL queue. */
struct queue_element {
@@ -58,7 +60,8 @@ struct queue_element {
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
-
+ bool css_allowed; /* whether the document is allowed to
+ be treated as CSS. */
struct queue_element *next; /* next element in queue */
};
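
A minimal self-contained sketch of the queue node after this hunk (simplified stand-in types, not wget's real ones), showing how a stylesheet link travels with css_allowed set while an ordinary page carries html_allowed:

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified stand-in for struct queue_element above; the field
       order mirrors the patched struct. */
    struct qel {
      const char *url;
      const char *referer;
      int depth;
      bool html_allowed;
      bool css_allowed;
      struct qel *next;
    };

    int main (void)
    {
      /* A stylesheet link: not HTML, but allowed to be parsed as CSS. */
      struct qel css = { "http://example.com/a.css", "http://example.com/",
                         1, false, true, NULL };
      struct qel page = { "http://example.com/p.html", "http://example.com/",
                          1, true, false, &css };
      for (struct qel *q = &page; q; q = q->next)
        printf ("%s html=%d css=%d\n", q->url, q->html_allowed, q->css_allowed);
      return 0;
    }
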
@@ -91,13 +94,15 @@ url_queue_delete (struct url_queue *queue)
static void
url_enqueue (struct url_queue *queue,
- const char *url, const char *referer, int depth, bool html_allowed)
+ const char *url, const char *referer, int depth,
+ bool html_allowed, bool css_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
qel->url = url;
qel->referer = referer;
qel->depth = depth;
qel->html_allowed = html_allowed;
+ qel->css_allowed = css_allowed;
qel->next = NULL;
++queue->count;
@@ -121,7 +126,7 @@ url_enqueue (struct url_queue *queue,
static bool
url_dequeue (struct url_queue *queue,
const char **url, const char **referer, int *depth,
- bool *html_allowed)
+ bool *html_allowed, bool *css_allowed)
{
struct queue_element *qel = queue->head;
@@ -136,6 +141,7 @@ url_dequeue (struct url_queue *queue,
*referer = qel->referer;
*depth = qel->depth;
*html_allowed = qel->html_allowed;
+ *css_allowed = qel->css_allowed;
--queue->count;
@@ -190,8 +196,9 @@ retrieve_tree (const char *start_url)
if (!start_url_parsed)
{
- logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
- url_error (up_error_code));
+ char *error = url_error (start_url, up_error_code);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, error);
+ xfree (error);
return URLERROR;
}
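
The hunk above reflects url_error() now returning heap-allocated text that the caller must free. A standalone sketch of the same ownership pattern (hypothetical function, plain malloc/free standing in for wget's allocator):

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical stand-in for the new url_error(): the caller owns
       the returned string. */
    static char *
    url_error_sketch (int up_error_code)
    {
      char *msg = malloc (64);
      if (msg)
        snprintf (msg, 64, "parse failure (code %d)", up_error_code);
      return msg;
    }

    int main (void)
    {
      char *error = url_error_sketch (2);
      fprintf (stderr, "%s: %s.\n", "http://bad url/", error ? error : "?");
      free (error);  /* matches the xfree (error) added above */
      return 0;
    }
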
@@ -200,7 +207,7 @@ retrieve_tree (const char *start_url)
/* Enqueue the starting URL. Use start_url_parsed->url rather than
just URL so we enqueue the canonical form of the URL. */
- url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
+ url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
string_set_add (blacklist, start_url_parsed->url);
while (1)
@@ -208,7 +215,8 @@ retrieve_tree (const char *start_url)
bool descend = false;
char *url, *referer, *file = NULL;
int depth;
- bool html_allowed;
+ bool html_allowed, css_allowed;
+ bool is_css = false;
bool dash_p_leaf_HTML = false;
if (opt.quota && total_downloaded_bytes > opt.quota)
@@ -220,7 +228,7 @@ retrieve_tree (const char *start_url)
if (!url_dequeue (queue,
(const char **)&url, (const char **)&referer,
- &depth, &html_allowed))
+ &depth, &html_allowed, &css_allowed))
break;
/* ...and download it. Note that this download is in most cases
@@ -238,10 +246,21 @@ retrieve_tree (const char *start_url)
DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
url, file));
+ /* FIXME: this duplicates the logic below; the two should be combined. */
if (html_allowed
&& downloaded_html_set
&& string_set_contains (downloaded_html_set, file))
- descend = true;
+ {
+ descend = true;
+ is_css = false;
+ }
+ if (css_allowed
+ && downloaded_css_set
+ && string_set_contains (downloaded_css_set, file))
+ {
+ descend = true;
+ is_css = true;
+ }
}
else
{
@@ -252,7 +271,21 @@ retrieve_tree (const char *start_url)
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
- descend = true;
+ {
+ descend = true;
+ is_css = false;
+ }
+
+ /* This case is a little different: css_allowed can override the
+ content type, because lots of web servers serve CSS with an
+ incorrect content type. */
+ if (file && status == RETROK
+ && (dt & RETROKF) &&
+ ((dt & TEXTCSS) || css_allowed))
+ {
+ descend = true;
+ is_css = true;
+ }
if (redirected)
{
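
A standalone sketch of the descend/is_css decision these two hunks implement (the flag bits are illustrative, not wget's actual RETROKF/TEXTHTML/TEXTCSS values). It shows the point of the override: when css_allowed is set, a stylesheet is descended into even if the server mislabels its content type:

    #include <stdbool.h>
    #include <stdio.h>

    enum { RETROKF = 1, TEXTHTML = 2, TEXTCSS = 4 };  /* illustrative bits */

    int main (void)
    {
      int dt = RETROKF;          /* retrieval OK, but mislabeled content type */
      bool css_allowed = true;   /* the referring link expected a stylesheet */
      bool descend = false, is_css = false;

      if ((dt & RETROKF) && (dt & TEXTHTML))
        { descend = true; is_css = false; }

      /* css_allowed overrides the content type, as in the hunk above. */
      if ((dt & RETROKF) && ((dt & TEXTCSS) || css_allowed))
        { descend = true; is_css = true; }

      printf ("descend=%d is_css=%d\n", descend, is_css);
      return 0;
    }
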
@@ -306,14 +339,15 @@ retrieve_tree (const char *start_url)
}
}
- /* If the downloaded document was HTML, parse it and enqueue the
+ /* If the downloaded document was HTML or CSS, parse it and enqueue the
links it contains. */
if (descend)
{
bool meta_disallow_follow = false;
struct urlpos *children
- = get_urls_html (file, url, &meta_disallow_follow);
+ = is_css ? get_urls_css_file (file, url) :
+ get_urls_html (file, url, &meta_disallow_follow);
if (opt.use_robots && meta_disallow_follow)
{
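
A standalone model of the parser dispatch added above, with stub extractors standing in for get_urls_css_file() and get_urls_html(). One consequence visible here: meta_disallow_follow can only be set on the HTML path, so the meta-robots check never triggers for CSS documents:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct urlpos { const char *url; struct urlpos *next; };

    /* Stub extractors; the real functions parse the downloaded file. */
    static struct urlpos *
    css_links (const char *file, const char *url)
    { (void) file; (void) url; return NULL; }

    static struct urlpos *
    html_links (const char *file, const char *url, bool *meta_disallow_follow)
    { (void) file; (void) url; *meta_disallow_follow = false; return NULL; }

    int main (void)
    {
      bool is_css = true, meta_disallow_follow = false;
      struct urlpos *children
        = is_css ? css_links ("a.css", "http://example.com/a.css")
                 : html_links ("p.html", "http://example.com/", &meta_disallow_follow);
      printf ("children=%p meta_disallow_follow=%d\n",
              (void *) children, meta_disallow_follow);
      return 0;
    }
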
@@ -345,7 +379,8 @@ retrieve_tree (const char *start_url)
{
url_enqueue (queue, xstrdup (child->url->url),
xstrdup (referer_url), depth + 1,
- child->link_expect_html);
+ child->link_expect_html,
+ child->link_expect_css);
/* We blacklist the URL we have enqueued, because we
don't want to enqueue (and hence download) the
same URL twice. */
@@ -394,9 +429,9 @@ retrieve_tree (const char *start_url)
{
char *d1, *d2;
int d3;
- bool d4;
+ bool d4, d5;
while (url_dequeue (queue,
- (const char **)&d1, (const char **)&d2, &d3, &d4))
+ (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
{
xfree (d1);
xfree_null (d2);
@@ -439,7 +474,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (opt.spider)
{
char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
- DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url));
+ DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
visited_url (url, referrer);
xfree (referrer);
}