X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fhtml-url.c;h=218659d454d4a23b34c0288f1b848585a14b1e4b;hp=ef93a7e49c5c61ded555c2b6c022be3346bbec2c;hb=d763f8bf6d6e13ce006ffab616cc8a77e747a633;hpb=b30a0dd817886f77a64be9218c5e5399bcbc2e67 diff --git a/src/html-url.c b/src/html-url.c index ef93a7e4..218659d4 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -44,7 +44,6 @@ as that of the covered work. */ #include "recur.h" #include "html-url.h" #include "css-url.h" -#include "iri.h" typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *); @@ -175,6 +174,10 @@ static const char *additional_attributes[] = { static struct hash_table *interesting_tags; static struct hash_table *interesting_attributes; +/* Will contains the (last) charset found in 'http-equiv=content-type' + meta tags */ +static char *meta_charset; + static void init_interesting (void) { @@ -285,9 +288,7 @@ append_url (const char *link_uri, int position, int size, return NULL; } - set_ugly_no_encode (true); - url = url_parse (link_uri, NULL); - set_ugly_no_encode (false); + url = url_parse (link_uri, NULL, NULL, false); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", @@ -303,12 +304,13 @@ append_url (const char *link_uri, int position, int size, char *complete_uri = uri_merge (base, link_uri); - DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", - ctx->document_file, base, link_uri, complete_uri)); + DEBUGP (("%s: merge(%s, %s) -> %s\n", + quotearg_n_style (0, escape_quoting_style, ctx->document_file), + quote_n (1, base), + quote_n (2, link_uri), + quotearg_n_style (3, escape_quoting_style, complete_uri))); - set_ugly_no_encode (true); - url = url_parse (complete_uri, NULL); - set_ugly_no_encode (false); + url = url_parse (complete_uri, NULL, NULL, false); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", @@ -319,7 +321,7 @@ append_url (const char *link_uri, int position, int size, xfree (complete_uri); } - DEBUGP (("appending \"%s\" to urlpos.\n", url->url)); + DEBUGP (("appending %s to urlpos.\n", quote (url->url))); newel = xnew0 (struct urlpos); newel->url = url; @@ -572,10 +574,8 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) if (!mcharset) return; - /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/ - - set_current_charset (mcharset); - xfree (mcharset); + xfree_null (meta_charset); + meta_charset = mcharset; } else if (name && 0 == strcasecmp (name, "robots")) { @@ -590,15 +590,25 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) { while (*content) { - /* Find the next occurrence of ',' or the end of - the string. */ - char *end = strchr (content, ','); - if (end) - ++end; - else - end = content + strlen (content); + char *end; + /* Skip any initial whitespace. */ + content += strspn (content, " \f\n\r\t\v"); + /* Find the next occurrence of ',' or whitespace, + * or the end of the string. */ + end = content + strcspn (content, ", \f\n\r\t\v"); if (!strncasecmp (content, "nofollow", end - content)) ctx->nofollow = true; + /* Skip past the next comma, if any. */ + if (*end == ',') + ++end; + else + { + end = strchr (end, ','); + if (end) + ++end; + else + end = content + strlen (content); + } content = end; } } @@ -641,7 +651,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg) and does the right thing. */ struct urlpos * -get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) +get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, + struct iri *iri) { struct file_memory *fm; struct map_context ctx; @@ -681,6 +692,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, NULL, interesting_attributes); + /* If meta charset isn't null, override content encoding */ + if (iri && meta_charset) + set_content_encoding (iri, meta_charset); + DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); if (meta_disallow_follow) *meta_disallow_follow = ctx.nofollow; @@ -750,14 +765,14 @@ get_urls_file (const char *file) url_text = merged; } - set_ugly_no_encode (true); - url = url_parse (url_text, &up_error_code); - set_ugly_no_encode (false); + url = url_parse (url_text, &up_error_code, NULL, false); if (!url) { + char *error = url_error (url_text, up_error_code); logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), - file, url_text, url_error (up_error_code)); + file, url_text, error); xfree (url_text); + xfree (error); continue; } xfree (url_text);