X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-url.c;h=523f5e0dab1aa82f1431f7752be20742c356f4ec;hb=e095cc064eb72ca0cee6d41622e39e7ea3f211a6;hp=c9cf28f6df39fd470966b236b13d16c7e0f184b0;hpb=caae3b70f46bd519857b595f7f06ea0179551336;p=wget
diff --git a/src/html-url.c b/src/html-url.c
index c9cf28f6..523f5e0d 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -1,6 +1,6 @@
/* Collect URLs from HTML source.
Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
- 2007, 2008 Free Software Foundation, Inc.
+ 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
This file is part of GNU Wget.
@@ -174,6 +174,10 @@ static const char *additional_attributes[] = {
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;
+/* Will contain the (last) charset found in 'http-equiv=content-type'
+ meta tags */
+static char *meta_charset;
+
static void
init_interesting (void)
{
@@ -186,7 +190,7 @@ init_interesting (void)
matches the user's preferences as specified through --ignore-tags
and --follow-tags. */
- int i;
+ size_t i;
interesting_tags = make_nocase_string_hash_table (countof (known_tags));
/* First, add all the tags we know hot to handle, mapped to their
@@ -284,7 +288,7 @@ append_url (const char *link_uri, int position, int size,
return NULL;
}
- url = url_parse (link_uri, NULL);
+ url = url_parse (link_uri, NULL, NULL, false);
if (!url)
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@@ -300,10 +304,13 @@ append_url (const char *link_uri, int position, int size,
char *complete_uri = uri_merge (base, link_uri);
- DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
- ctx->document_file, base, link_uri, complete_uri));
+ DEBUGP (("%s: merge(%s, %s) -> %s\n",
+ quotearg_n_style (0, escape_quoting_style, ctx->document_file),
+ quote_n (1, base),
+ quote_n (2, link_uri),
+ quotearg_n_style (3, escape_quoting_style, complete_uri)));
- url = url_parse (complete_uri, NULL);
+ url = url_parse (complete_uri, NULL, NULL, false);
if (!url)
{
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@@ -314,7 +321,7 @@ append_url (const char *link_uri, int position, int size,
xfree (complete_uri);
}
- DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
+ DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
newel = xnew0 (struct urlpos);
newel->url = url;
@@ -328,13 +335,27 @@ append_url (const char *link_uri, int position, int size,
else if (link_has_scheme)
newel->link_complete_p = 1;
- if (ctx->tail)
+ /* Append the new URL maintaining the order by position. */
+ if (ctx->head == NULL)
+ ctx->head = newel;
+ else
{
- ctx->tail->next = newel;
- ctx->tail = newel;
+ struct urlpos *it, *prev = NULL;
+
+ it = ctx->head;
+ while (it && position > it->pos)
+ {
+ prev = it;
+ it = it->next;
+ }
+
+ newel->next = it;
+
+ if (prev)
+ prev->next = newel;
+ else
+ ctx->head = newel;
}
- else
- ctx->tail = ctx->head = newel;
return newel;
}
@@ -360,7 +381,8 @@ check_style_attr (struct taginfo *tag, struct map_context *ctx)
static void
tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
{
- int i, attrind;
+ size_t i;
+ int attrind;
int first = -1;
for (i = 0; i < countof (tag_url_attributes); i++)
@@ -387,7 +409,7 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
/* Find whether TAG/ATTRIND is a combination that contains a
URL. */
char *link = tag->attrs[attrind].value;
- const int size = countof (tag_url_attributes);
+ const size_t size = countof (tag_url_attributes);
/* If you're cringing at the inefficiency of the nested loops,
remember that they both iterate over a very small number of
@@ -552,6 +574,23 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
entry->link_expect_html = 1;
}
}
+ else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
+ {
+ /* Handle stuff like:
+ <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
+
+ char *mcharset;
+ char *content = find_attr (tag, "content", NULL);
+ if (!content)
+ return;
+
+ mcharset = parse_charset (content);
+ if (!mcharset)
+ return;
+
+ xfree_null (meta_charset);
+ meta_charset = mcharset;
+ }
else if (name && 0 == strcasecmp (name, "robots"))
{
/* Handle stuff like:
@@ -565,15 +604,25 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
{
while (*content)
{
- /* Find the next occurrence of ',' or the end of
- the string. */
- char *end = strchr (content, ',');
- if (end)
- ++end;
- else
- end = content + strlen (content);
+ char *end;
+ /* Skip any initial whitespace. */
+ content += strspn (content, " \f\n\r\t\v");
+ /* Find the next occurrence of ',' or whitespace,
+ * or the end of the string. */
+ end = content + strcspn (content, ", \f\n\r\t\v");
if (!strncasecmp (content, "nofollow", end - content))
ctx->nofollow = true;
+ /* Skip past the next comma, if any. */
+ if (*end == ',')
+ ++end;
+ else
+ {
+ end = strchr (end, ',');
+ if (end)
+ ++end;
+ else
+ end = content + strlen (content);
+ }
content = end;
}
}
@@ -590,7 +639,7 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
/* Find the tag in our table of tags. This must not fail because
map_html_tags only returns tags found in interesting_tags.
-
+
I've changed this for now, I'm passing NULL as interesting_tags
to map_html_tags. This way we can check all tags for a style
attribute.
@@ -616,14 +665,15 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
and does the right thing. */
struct urlpos *
-get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
+get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
+ struct iri *iri)
{
struct file_memory *fm;
struct map_context ctx;
int flags;
/* Load the file. */
- fm = read_file (file);
+ fm = wget_read_file (file);
if (!fm)
{
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
@@ -632,7 +682,7 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
ctx.text = fm->content;
- ctx.head = ctx.tail = NULL;
+ ctx.head = NULL;
ctx.base = NULL;
ctx.parent_base = url ? url : opt.base_href;
ctx.document_file = file;
@@ -656,12 +706,16 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
NULL, interesting_attributes);
+ /* If meta charset isn't null, override content encoding */
+ if (iri && meta_charset)
+ set_content_encoding (iri, meta_charset);
+
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
if (meta_disallow_follow)
*meta_disallow_follow = ctx.nofollow;
xfree_null (ctx.base);
- read_file_free (fm);
+ wget_read_file_free (fm);
return ctx.head;
}
@@ -676,7 +730,7 @@ get_urls_file (const char *file)
const char *text, *text_end;
/* Load the file. */
- fm = read_file (file);
+ fm = wget_read_file (file);
if (!fm)
{
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
@@ -725,12 +779,14 @@ get_urls_file (const char *file)
url_text = merged;
}
- url = url_parse (url_text, &up_error_code);
+ url = url_parse (url_text, &up_error_code, NULL, false);
if (!url)
{
+ char *error = url_error (url_text, up_error_code);
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
- file, url_text, url_error (up_error_code));
+ file, url_text, error);
xfree (url_text);
+ xfree (error);
continue;
}
xfree (url_text);
@@ -744,7 +800,7 @@ get_urls_file (const char *file)
tail->next = entry;
tail = entry;
}
- read_file_free (fm);
+ wget_read_file_free (fm);
return head;
}