X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-url.c;h=c954cb97191b83f87b27a149696c1bcb66982001;hb=26a3eea8e2f42c621ce6c40a93acf5ff1cd12220;hp=95df8bf98e35b5d93b21da4003326fa5539b45f7;hpb=0fae9cb3887905279e22acd65ce639a1e5f56cc6;p=wget diff --git a/src/html-url.c b/src/html-url.c index 95df8bf9..c954cb97 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -174,6 +174,10 @@ static const char *additional_attributes[] = { static struct hash_table *interesting_tags; static struct hash_table *interesting_attributes; +/* Will contain the (last) charset found in 'http-equiv=content-type' + meta tags */ +static char *meta_charset; + static void init_interesting (void) { @@ -284,7 +288,7 @@ append_url (const char *link_uri, int position, int size, return NULL; } - url = url_parse (link_uri, NULL); + url = url_parse (link_uri, NULL, NULL); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", @@ -303,7 +307,7 @@ append_url (const char *link_uri, int position, int size, DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", ctx->document_file, base, link_uri, complete_uri)); - url = url_parse (complete_uri, NULL); + url = url_parse (complete_uri, NULL, NULL); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", @@ -553,6 +557,23 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) entry->link_expect_html = 1; } } + else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type")) + { + /* Handle stuff like: + <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */ + + char *mcharset; + char *content = find_attr (tag, "content", NULL); + if (!content) + return; + + mcharset = parse_charset (content); + if (!mcharset) + return; + + xfree_null (meta_charset); + meta_charset = mcharset; + } else if (name && 0 == strcasecmp (name, "robots")) { /* Handle stuff like: @@ -617,7 +638,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg) and does the right thing. 
*/ struct urlpos * -get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) +get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, + struct iri *iri) { struct file_memory *fm; struct map_context ctx; @@ -657,6 +679,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, NULL, interesting_attributes); + /* If meta charset isn't null, override content encoding */ + if (iri && meta_charset) + set_content_encoding (iri, meta_charset); + DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); if (meta_disallow_follow) *meta_disallow_follow = ctx.nofollow; @@ -726,7 +752,7 @@ get_urls_file (const char *file) url_text = merged; } - url = url_parse (url_text, &up_error_code); + url = url_parse (url_text, &up_error_code, NULL); if (!url) { char *error = url_error (url_text, up_error_code);