/* Collect URLs from HTML source.
Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
- 2007, 2008 Free Software Foundation, Inc.
+ 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
This file is part of GNU Wget.
#include <errno.h>
#include <assert.h>
+#include "exits.h"
#include "html-parse.h"
#include "url.h"
#include "utils.h"
TAG_SCRIPT,
TAG_TABLE,
TAG_TD,
- TAG_TH
+ TAG_TH,
+ TAG_VIDEO,
+ TAG_AUDIO,
+ TAG_SOURCE
};
/* The list of known tags and functions used for handling them. Most
{ TAG_SCRIPT, "script", tag_find_urls },
{ TAG_TABLE, "table", tag_find_urls },
{ TAG_TD, "td", tag_find_urls },
- { TAG_TH, "th", tag_find_urls }
+ { TAG_TH, "th", tag_find_urls },
+ { TAG_VIDEO, "video", tag_find_urls },
+ { TAG_AUDIO, "audio", tag_find_urls },
+ { TAG_SOURCE, "source", tag_find_urls }
};
/* tag_url_attributes documents which attributes of which tags contain
{ TAG_SCRIPT, "src", ATTR_INLINE },
{ TAG_TABLE, "background", ATTR_INLINE },
{ TAG_TD, "background", ATTR_INLINE },
- { TAG_TH, "background", ATTR_INLINE }
+ { TAG_TH, "background", ATTR_INLINE },
+ { TAG_VIDEO, "src", ATTR_INLINE },
+ { TAG_VIDEO, "poster", ATTR_INLINE },
+ { TAG_AUDIO, "src", ATTR_INLINE },
+ { TAG_AUDIO, "poster", ATTR_INLINE },
+ { TAG_SOURCE, "src", ATTR_INLINE }
};
/* The lists of interesting tags and attributes are built dynamically,
to the attributes not mentioned here. We add them manually. */
static const char *additional_attributes[] = {
"rel", /* used by tag_handle_link */
+ "type", /* used by tag_handle_link */
"http-equiv", /* used by tag_handle_meta */
"name", /* used by tag_handle_meta */
"content", /* used by tag_handle_meta */
const char *base = ctx->base ? ctx->base : ctx->parent_base;
struct url *url;
+ struct iri *iri = iri_new ();
+ set_uri_encoding (iri, opt.locale, true);
+ iri->utf8_encode = true;
+
if (!base)
{
DEBUGP (("%s: no base, merge will use \"%s\".\n",
return NULL;
}
- url = url_parse (link_uri, NULL, NULL);
+ url = url_parse (link_uri, NULL, iri, false);
if (!url)
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
char *complete_uri = uri_merge (base, link_uri);
- DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
- ctx->document_file, base, link_uri, complete_uri));
+ DEBUGP (("%s: merge(%s, %s) -> %s\n",
+ quotearg_n_style (0, escape_quoting_style, ctx->document_file),
+ quote_n (1, base),
+ quote_n (2, link_uri),
+ quotearg_n_style (3, escape_quoting_style, complete_uri)));
- url = url_parse (complete_uri, NULL, NULL);
+ url = url_parse (complete_uri, NULL, iri, false);
if (!url)
{
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
xfree (complete_uri);
}
- DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
+ iri_free (iri);
+
+ DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
newel = xnew0 (struct urlpos);
newel->url = url;
else if (link_has_scheme)
newel->link_complete_p = 1;
- if (ctx->tail)
+ /* Append the new URL maintaining the order by position. */
+ if (ctx->head == NULL)
+ ctx->head = newel;
+ else
{
- ctx->tail->next = newel;
- ctx->tail = newel;
+ struct urlpos *it, *prev = NULL;
+
+ it = ctx->head;
+ while (it && position > it->pos)
+ {
+ prev = it;
+ it = it->next;
+ }
+
+ newel->next = it;
+
+ if (prev)
+ prev->next = newel;
+ else
+ ctx->head = newel;
}
- else
- ctx->tail = ctx->head = newel;
return newel;
}
check_style_attr (struct taginfo *tag, struct map_context *ctx)
{
int attrind;
+ int raw_start;
+ int raw_len;
char *style = find_attr (tag, "style", &attrind);
if (!style)
return;
- /* raw pos and raw size include the quotes, hence the +1 -2 */
- get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2);
+ /* raw pos and raw size include the quotes, skip them when they are
+ present. */
+ raw_start = ATTR_POS (tag, attrind, ctx);
+ raw_len = ATTR_SIZE (tag, attrind);
+ if( *(char *)(ctx->text + raw_start) == '\''
+ || *(char *)(ctx->text + raw_start) == '"')
+ {
+ raw_start += 1;
+ raw_len -= 2;
+ }
+
+ if(raw_len <= 0)
+ return;
+
+ get_urls_css (ctx, raw_start, raw_len);
}
/* All the tag_* functions are called from collect_tags_mapper, as
/* Handle the BASE tag, for <base href=...>. */
static void
-tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
+tag_handle_base (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
{
struct urlpos *base_urlpos;
int attrind;
/* Mark the URL found in <form action=...> for conversion. */
static void
-tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
+tag_handle_form (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
{
int attrind;
char *action = find_attr (tag, "action", &attrind);
links will be followed in -p mode depends on the REL attribute. */
static void
-tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
+tag_handle_link (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
{
int attrind;
char *href = find_attr (tag, "href", &attrind);
/* All <link href="..."> link references are external, except those
known not to be, such as style sheet and shortcut icon:
- <link rel="stylesheet" href="...">
- <link rel="shortcut icon" href="...">
+ <link rel="stylesheet" href="...">
+ <link rel="shortcut icon" href="...">
*/
if (href)
{
{
up->link_inline_p = 1;
}
+ else
+ {
+ /* The external ones usually point to HTML pages, such as
+ <link rel="next" href="...">
+ except when the type attribute says otherwise:
+ <link rel="alternate" type="application/rss+xml" href=".../?feed=rss2" />
+ */
+ char *type = find_attr (tag, "type", NULL);
+ if (!type || strcasecmp (type, "text/html") == 0)
+ up->link_expect_html = 1;
+ }
}
- else
- /* The external ones usually point to HTML pages, such as
- <link rel="next" href="..."> */
- up->link_expect_html = 1;
}
}
}
refresh feature and because of robot exclusion. */
static void
-tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
+tag_handle_meta (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
{
char *name = find_attr (tag, "name", NULL);
char *http_equiv = find_attr (tag, "http-equiv", NULL);
{
while (*content)
{
- /* Find the next occurrence of ',' or the end of
- the string. */
- char *end = strchr (content, ',');
- if (end)
- ++end;
- else
- end = content + strlen (content);
+ char *end;
+ /* Skip any initial whitespace. */
+ content += strspn (content, " \f\n\r\t\v");
+ /* Find the next occurrence of ',' or whitespace,
+ * or the end of the string. */
+ end = content + strcspn (content, ", \f\n\r\t\v");
if (!strncasecmp (content, "nofollow", end - content))
ctx->nofollow = true;
+ /* Skip past the next comma, if any. */
+ if (*end == ',')
+ ++end;
+ else
+ {
+ end = strchr (end, ',');
+ if (end)
+ ++end;
+ else
+ end = content + strlen (content);
+ }
content = end;
}
}
/* Find the tag in our table of tags. This must not fail because
map_html_tags only returns tags found in interesting_tags.
-
+
I've changed this for now, I'm passing NULL as interesting_tags
to map_html_tags. This way we can check all tags for a style
attribute.
check_style_attr (tag, ctx);
- if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
- tag->contents_begin && tag->contents_end)
+ if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style"))
+ && tag->contents_begin && tag->contents_end
+ && tag->contents_begin <= tag->contents_end)
{
/* parse contents */
get_urls_css (ctx, tag->contents_begin - ctx->text,
int flags;
/* Load the file. */
- fm = read_file (file);
+ fm = wget_read_file (file);
if (!fm)
{
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
ctx.text = fm->content;
- ctx.head = ctx.tail = NULL;
+ ctx.head = NULL;
ctx.base = NULL;
ctx.parent_base = url ? url : opt.base_href;
ctx.document_file = file;
*meta_disallow_follow = ctx.nofollow;
xfree_null (ctx.base);
- read_file_free (fm);
+ wget_read_file_free (fm);
return ctx.head;
}
const char *text, *text_end;
/* Load the file. */
- fm = read_file (file);
+ fm = wget_read_file (file);
if (!fm)
{
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
url_text = merged;
}
- url = url_parse (url_text, &up_error_code, NULL);
+ char *new_url = rewrite_shorthand_url (url_text);
+ if (new_url)
+ {
+ xfree (url_text);
+ url_text = new_url;
+ }
+
+ url = url_parse (url_text, &up_error_code, NULL, false);
if (!url)
{
char *error = url_error (url_text, up_error_code);
file, url_text, error);
xfree (url_text);
xfree (error);
+ inform_exit_status (URLERROR);
continue;
}
xfree (url_text);
tail->next = entry;
tail = entry;
}
- read_file_free (fm);
+ wget_read_file_free (fm);
return head;
}