X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fhtml-url.c;h=287b2f548414fbbcb6e8fa1d62040ec43910e80e;hp=02092e5a31023a1322d8b05f787d0028ff4bc964;hb=2f6aa1d7417df1dfc58597777686fbd77179b9fd;hpb=cf3c678c8246fc326b69ae64b4e2766a69df5704 diff --git a/src/html-url.c b/src/html-url.c index 02092e5a..287b2f54 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -1,6 +1,6 @@ /* Collect URLs from HTML source. Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, - 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -164,6 +164,7 @@ static struct { to the attributes not mentioned here. We add them manually. */ static const char *additional_attributes[] = { "rel", /* used by tag_handle_link */ + "type", /* used by tag_handle_link */ "http-equiv", /* used by tag_handle_meta */ "name", /* used by tag_handle_meta */ "content", /* used by tag_handle_meta */ @@ -335,13 +336,27 @@ append_url (const char *link_uri, int position, int size, else if (link_has_scheme) newel->link_complete_p = 1; - if (ctx->tail) + /* Append the new URL maintaining the order by position. */ + if (ctx->head == NULL) + ctx->head = newel; + else { - ctx->tail->next = newel; - ctx->tail = newel; + struct urlpos *it, *prev = NULL; + + it = ctx->head; + while (it && position > it->pos) + { + prev = it; + it = it->next; + } + + newel->next = it; + + if (prev) + prev->next = newel; + else + ctx->head = newel; } - else - ctx->tail = ctx->head = newel; return newel; } @@ -350,12 +365,27 @@ static void check_style_attr (struct taginfo *tag, struct map_context *ctx) { int attrind; + int raw_start; + int raw_len; char *style = find_attr (tag, "style", &attrind); if (!style) return; - /* raw pos and raw size include the quotes, hence the +1 -2 */ - get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2); + /* raw pos and raw size include the quotes, skip them when they are + present. */ + raw_start = ATTR_POS (tag, attrind, ctx); + raw_len = ATTR_SIZE (tag, attrind); + if( *(char *)(ctx->text + raw_start) == '\'' + || *(char *)(ctx->text + raw_start) == '"') + { + raw_start += 1; + raw_len -= 2; + } + + if(raw_len <= 0) + return; + + get_urls_css (ctx, raw_start, raw_len); } /* All the tag_* functions are called from collect_tags_mapper, as @@ -476,8 +506,8 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) /* All link references are external, except those known not to be, such as style sheet and shortcut icon: - - + + */ if (href) { @@ -497,11 +527,18 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) { up->link_inline_p = 1; } + else + { + /* The external ones usually point to HTML pages, such as + + except when the type attribute says otherwise: + + */ + char *type = find_attr (tag, "type", NULL); + if (!type || strcasecmp (type, "text/html") == 0) + up->link_expect_html = 1; + } } - else - /* The external ones usually point to HTML pages, such as - */ - up->link_expect_html = 1; } } } @@ -668,7 +705,7 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); ctx.text = fm->content; - ctx.head = ctx.tail = NULL; + ctx.head = NULL; ctx.base = NULL; ctx.parent_base = url ? url : opt.base_href; ctx.document_file = file;