#include "html-url.h"
#include "css-url.h"
#include "spider.h"
-
+#include "iri.h"
+\f
/* Functions for maintaining the URL queue. */
struct queue_element {
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
+ char *remote_encoding;
bool css_allowed; /* whether the document is allowed to
be treated as CSS. */
struct queue_element *next; /* next element in queue */
bool html_allowed, bool css_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
+ char *charset = get_current_charset ();
qel->url = url;
qel->referer = referer;
qel->depth = depth;
qel->css_allowed = css_allowed;
qel->next = NULL;
+ if (charset)
+ qel->remote_encoding = xstrdup (charset);
+ else
+ qel->remote_encoding = NULL;
+
++queue->count;
if (queue->count > queue->maxcount)
queue->maxcount = queue->count;
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
+ /*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/
+
if (queue->tail)
queue->tail->next = qel;
queue->tail = qel;
if (!queue->head)
queue->tail = NULL;
+ set_remote_charset (qel->remote_encoding);
+ if (qel->remote_encoding)
+ xfree (qel->remote_encoding);
+
*url = qel->url;
*referer = qel->referer;
*depth = qel->depth;
struct hash_table *blacklist;
int up_error_code;
- struct url *start_url_parsed = url_parse (start_url, &up_error_code);
+ struct url *start_url_parsed;
+ set_ugly_no_encode (true);
+ start_url_parsed= url_parse (start_url, &up_error_code);
+ set_ugly_no_encode (false);
if (!start_url_parsed)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
if (children)
{
struct urlpos *child = children;
- struct url *url_parsed = url_parsed = url_parse (url, NULL);
+ set_ugly_no_encode (true);
+ struct url *url_parsed = url_parse (url, NULL);
+ set_ugly_no_encode (false);
char *referer_url = url;
bool strip_auth = (url_parsed != NULL
&& url_parsed->user != NULL);
}
}
- if (file
- && (opt.delete_after
+ if (file
+ && (opt.delete_after
|| opt.spider /* opt.recursive is implicitly true */
|| !acceptable (file)))
{
/* Either --delete-after was specified, or we loaded this
- (otherwise unneeded because of --spider or rejected by -R)
- HTML file just to harvest its hyperlinks -- in either case,
+ (otherwise unneeded because of --spider or rejected by -R)
+ HTML file just to harvest its hyperlinks -- in either case,
delete the local file. */
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
opt.delete_after ? "--delete-after" :
- (opt.spider ? "--spider" :
+ (opt.spider ? "--spider" :
"recursive rejection criteria")));
logprintf (LOG_VERBOSE,
(opt.delete_after || opt.spider
if (string_set_contains (blacklist, url))
{
- if (opt.spider)
+ if (opt.spider)
{
char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
struct urlpos *upos;
bool success;
+ set_ugly_no_encode (true);
orig_parsed = url_parse (original, NULL);
assert (orig_parsed != NULL);
new_parsed = url_parse (redirected, NULL);
assert (new_parsed != NULL);
+ set_ugly_no_encode (false);
upos = xnew0 (struct urlpos);
upos->url = new_parsed;