X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-url.c;h=c954cb97191b83f87b27a149696c1bcb66982001;hb=26a3eea8e2f42c621ce6c40a93acf5ff1cd12220;hp=75bec7d97e7c56709fea35044385480466ea8cfb;hpb=2e2ac6ad2fc90eaf46ae5fee0bc4f61dd97b4284;p=wget
diff --git a/src/html-url.c b/src/html-url.c
index 75bec7d9..c954cb97 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -174,6 +174,10 @@ static const char *additional_attributes[] = {
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;
+/* Will contains the (last) charset found in 'http-equiv=content-type'
+ meta tags */
+static char *meta_charset;
+
static void
init_interesting (void)
{
@@ -284,7 +288,7 @@ append_url (const char *link_uri, int position, int size,
return NULL;
}
- url = url_parse (link_uri, NULL);
+ url = url_parse (link_uri, NULL, NULL);
if (!url)
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@@ -303,7 +307,7 @@ append_url (const char *link_uri, int position, int size,
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
ctx->document_file, base, link_uri, complete_uri));
- url = url_parse (complete_uri, NULL);
+ url = url_parse (complete_uri, NULL, NULL);
if (!url)
{
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@@ -553,6 +557,23 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
entry->link_expect_html = 1;
}
}
+ else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
+ {
+ /* Handle stuff like:
+ */
+
+ char *mcharset;
+ char *content = find_attr (tag, "content", NULL);
+ if (!content)
+ return;
+
+ mcharset = parse_charset (content);
+ if (!mcharset)
+ return;
+
+ xfree_null (meta_charset);
+ meta_charset = mcharset;
+ }
else if (name && 0 == strcasecmp (name, "robots"))
{
/* Handle stuff like:
@@ -617,7 +638,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
and does the right thing. */
struct urlpos *
-get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
+get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
+ struct iri *iri)
{
struct file_memory *fm;
struct map_context ctx;
@@ -657,6 +679,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
NULL, interesting_attributes);
+ /* If meta charset isn't null, override content encoding */
+ if (iri && meta_charset)
+ set_content_encoding (iri, meta_charset);
+
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
if (meta_disallow_follow)
*meta_disallow_follow = ctx.nofollow;
@@ -726,12 +752,14 @@ get_urls_file (const char *file)
url_text = merged;
}
- url = url_parse (url_text, &up_error_code);
+ url = url_parse (url_text, &up_error_code, NULL);
if (!url)
{
+ char *error = url_error (url_text, up_error_code);
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
- file, url_text, url_error (up_error_code));
+ file, url_text, error);
xfree (url_text);
+ xfree (error);
continue;
}
xfree (url_text);