X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fhtml-url.c;h=3c6c9b924c9a3c9a22648ce0e4a59880cde1c824;hp=f938c80c2b81685c8ef2cac374d3c61cfae53c41;hb=38a7829dcb4eb5dba28dbf0f05c6a80fea9217f8;hpb=4a08094db88011153adadbf995103770b20d2a31 diff --git a/src/html-url.c b/src/html-url.c index f938c80c..3c6c9b92 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -1,6 +1,6 @@ /* Collect URLs from HTML source. Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, - 2007, 2008, 2009 Free Software Foundation, Inc. + 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -36,6 +36,7 @@ as that of the covered work. */ #include #include +#include "exits.h" #include "html-parse.h" #include "url.h" #include "utils.h" @@ -78,7 +79,10 @@ enum { TAG_SCRIPT, TAG_TABLE, TAG_TD, - TAG_TH + TAG_TH, + TAG_VIDEO, + TAG_AUDIO, + TAG_SOURCE }; /* The list of known tags and functions used for handling them. Most @@ -109,7 +113,10 @@ static struct known_tag { { TAG_SCRIPT, "script", tag_find_urls }, { TAG_TABLE, "table", tag_find_urls }, { TAG_TD, "td", tag_find_urls }, - { TAG_TH, "th", tag_find_urls } + { TAG_TH, "th", tag_find_urls }, + { TAG_VIDEO, "video", tag_find_urls }, + { TAG_AUDIO, "audio", tag_find_urls }, + { TAG_SOURCE, "source", tag_find_urls } }; /* tag_url_attributes documents which attributes of which tags contain @@ -156,7 +163,12 @@ static struct { { TAG_SCRIPT, "src", ATTR_INLINE }, { TAG_TABLE, "background", ATTR_INLINE }, { TAG_TD, "background", ATTR_INLINE }, - { TAG_TH, "background", ATTR_INLINE } + { TAG_TH, "background", ATTR_INLINE }, + { TAG_VIDEO, "src", ATTR_INLINE }, + { TAG_VIDEO, "poster", ATTR_INLINE }, + { TAG_AUDIO, "src", ATTR_INLINE }, + { TAG_AUDIO, "poster", ATTR_INLINE }, + { TAG_SOURCE, "src", ATTR_INLINE } }; /* The lists of interesting tags and attributes are built dynamically, @@ -164,6 +176,7 @@ static struct { to the attributes not mentioned here. We add them manually. */ static const char *additional_attributes[] = { "rel", /* used by tag_handle_link */ + "type", /* used by tag_handle_link */ "http-equiv", /* used by tag_handle_meta */ "name", /* used by tag_handle_meta */ "content", /* used by tag_handle_meta */ @@ -271,6 +284,10 @@ append_url (const char *link_uri, int position, int size, const char *base = ctx->base ? ctx->base : ctx->parent_base; struct url *url; + struct iri *iri = iri_new (); + set_uri_encoding (iri, opt.locale, true); + iri->utf8_encode = true; + if (!base) { DEBUGP (("%s: no base, merge will use \"%s\".\n", @@ -288,7 +305,7 @@ append_url (const char *link_uri, int position, int size, return NULL; } - url = url_parse (link_uri, NULL, NULL, false); + url = url_parse (link_uri, NULL, iri, false); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", @@ -310,7 +327,7 @@ append_url (const char *link_uri, int position, int size, quote_n (2, link_uri), quotearg_n_style (3, escape_quoting_style, complete_uri))); - url = url_parse (complete_uri, NULL, NULL, false); + url = url_parse (complete_uri, NULL, iri, false); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", @@ -321,6 +338,8 @@ append_url (const char *link_uri, int position, int size, xfree (complete_uri); } + iri_free (iri); + DEBUGP (("appending %s to urlpos.\n", quote (url->url))); newel = xnew0 (struct urlpos); @@ -335,13 +354,27 @@ append_url (const char *link_uri, int position, int size, else if (link_has_scheme) newel->link_complete_p = 1; - if (ctx->tail) + /* Append the new URL maintaining the order by position. */ + if (ctx->head == NULL) + ctx->head = newel; + else { - ctx->tail->next = newel; - ctx->tail = newel; + struct urlpos *it, *prev = NULL; + + it = ctx->head; + while (it && position > it->pos) + { + prev = it; + it = it->next; + } + + newel->next = it; + + if (prev) + prev->next = newel; + else + ctx->head = newel; } - else - ctx->tail = ctx->head = newel; return newel; } @@ -350,12 +383,27 @@ static void check_style_attr (struct taginfo *tag, struct map_context *ctx) { int attrind; + int raw_start; + int raw_len; char *style = find_attr (tag, "style", &attrind); if (!style) return; - /* raw pos and raw size include the quotes, hence the +1 -2 */ - get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2); + /* raw pos and raw size include the quotes, skip them when they are + present. */ + raw_start = ATTR_POS (tag, attrind, ctx); + raw_len = ATTR_SIZE (tag, attrind); + if( *(char *)(ctx->text + raw_start) == '\'' + || *(char *)(ctx->text + raw_start) == '"') + { + raw_start += 1; + raw_len -= 2; + } + + if(raw_len <= 0) + return; + + get_urls_css (ctx, raw_start, raw_len); } /* All the tag_* functions are called from collect_tags_mapper, as @@ -424,7 +472,7 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) /* Handle the BASE tag, for . */ static void -tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx) +tag_handle_base (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx) { struct urlpos *base_urlpos; int attrind; @@ -450,7 +498,7 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx) /* Mark the URL found in
for conversion. */ static void -tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx) +tag_handle_form (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx) { int attrind; char *action = find_attr (tag, "action", &attrind); @@ -468,7 +516,7 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx) links will be followed in -p mode depends on the REL attribute. */ static void -tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) +tag_handle_link (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx) { int attrind; char *href = find_attr (tag, "href", &attrind); @@ -476,8 +524,8 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) /* All link references are external, except those known not to be, such as style sheet and shortcut icon: - - + + */ if (href) { @@ -497,11 +545,18 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) { up->link_inline_p = 1; } + else + { + /* The external ones usually point to HTML pages, such as + + except when the type attribute says otherwise: + + */ + char *type = find_attr (tag, "type", NULL); + if (!type || strcasecmp (type, "text/html") == 0) + up->link_expect_html = 1; + } } - else - /* The external ones usually point to HTML pages, such as - */ - up->link_expect_html = 1; } } } @@ -510,7 +565,7 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) refresh feature and because of robot exclusion. */ static void -tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) +tag_handle_meta (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx) { char *name = find_attr (tag, "name", NULL); char *http_equiv = find_attr (tag, "http-equiv", NULL); @@ -637,8 +692,9 @@ collect_tags_mapper (struct taginfo *tag, void *arg) check_style_attr (tag, ctx); - if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) && - tag->contents_begin && tag->contents_end) + if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) + && tag->contents_begin && tag->contents_end + && tag->contents_begin <= tag->contents_end) { /* parse contents */ get_urls_css (ctx, tag->contents_begin - ctx->text, @@ -659,7 +715,7 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, int flags; /* Load the file. */ - fm = read_file (file); + fm = wget_read_file (file); if (!fm) { logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); @@ -668,7 +724,7 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); ctx.text = fm->content; - ctx.head = ctx.tail = NULL; + ctx.head = NULL; ctx.base = NULL; ctx.parent_base = url ? url : opt.base_href; ctx.document_file = file; @@ -701,7 +757,7 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, *meta_disallow_follow = ctx.nofollow; xfree_null (ctx.base); - read_file_free (fm); + wget_read_file_free (fm); return ctx.head; } @@ -716,7 +772,7 @@ get_urls_file (const char *file) const char *text, *text_end; /* Load the file. */ - fm = read_file (file); + fm = wget_read_file (file); if (!fm) { logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); @@ -765,6 +821,13 @@ get_urls_file (const char *file) url_text = merged; } + char *new_url = rewrite_shorthand_url (url_text); + if (new_url) + { + xfree (url_text); + url_text = new_url; + } + url = url_parse (url_text, &up_error_code, NULL, false); if (!url) { @@ -773,6 +836,7 @@ get_urls_file (const char *file) file, url_text, error); xfree (url_text); xfree (error); + inform_exit_status (URLERROR); continue; } xfree (url_text); @@ -786,7 +850,7 @@ get_urls_file (const char *file) tail->next = entry; tail = entry; } - read_file_free (fm); + wget_read_file_free (fm); return head; }