X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fhtml-url.c;h=287b2f548414fbbcb6e8fa1d62040ec43910e80e;hp=97443ea0e20cc9ac7f86bf9007ab173da8cb57c2;hb=2f6aa1d7417df1dfc58597777686fbd77179b9fd;hpb=53d0aff795316dc1a4b785632f0d4d93c861e9cb diff --git a/src/html-url.c b/src/html-url.c index 97443ea0..287b2f54 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -1,6 +1,6 @@ /* Collect URLs from HTML source. Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, - 2007 Free Software Foundation, Inc. + 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -17,17 +17,18 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Wget. If not, see . -In addition, as a special exception, the Free Software Foundation -gives permission to link the code of its release of Wget with the -OpenSSL project's "OpenSSL" library (or with modified versions of it -that use the same license as the "OpenSSL" library), and distribute -the linked executables. You must obey the GNU General Public License -in all respects for all of the code used other than "OpenSSL". If you -modify this file, you may extend this exception to your version of the -file, but you are not obligated to do so. If you do not wish to do -so, delete this exception statement from your version. */ +Additional permission under GNU GPL version 3 section 7 -#include +If you modify this program, or any covered work, by linking or +combining it with the OpenSSL project's OpenSSL library (or a +modified version of that library), containing parts covered by the +terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +grants you additional permission to convey the resulting work. +Corresponding Source for a non-source form of such a combination +shall include the source code for the parts of OpenSSL used as well +as that of the covered work. */ + +#include "wget.h" #include #include @@ -35,15 +36,14 @@ so, delete this exception statement from your version. */ #include #include -#include "wget.h" #include "html-parse.h" #include "url.h" #include "utils.h" #include "hash.h" #include "convert.h" -#include "recur.h" /* declaration of get_urls_html */ - -struct map_context; +#include "recur.h" +#include "html-url.h" +#include "css-url.h" typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *); @@ -163,16 +163,22 @@ static struct { from the information above. However, some places in the code refer to the attributes not mentioned here. We add them manually. */ static const char *additional_attributes[] = { - "rel", /* used by tag_handle_link */ - "http-equiv", /* used by tag_handle_meta */ - "name", /* used by tag_handle_meta */ - "content", /* used by tag_handle_meta */ - "action" /* used by tag_handle_form */ + "rel", /* used by tag_handle_link */ + "type", /* used by tag_handle_link */ + "http-equiv", /* used by tag_handle_meta */ + "name", /* used by tag_handle_meta */ + "content", /* used by tag_handle_meta */ + "action", /* used by tag_handle_form */ + "style" /* used by check_style_attr */ }; static struct hash_table *interesting_tags; static struct hash_table *interesting_attributes; +/* Will contains the (last) charset found in 'http-equiv=content-type' + meta tags */ +static char *meta_charset; + static void init_interesting (void) { @@ -185,7 +191,7 @@ init_interesting (void) matches the user's preferences as specified through --ignore-tags and --follow-tags. */ - int i; + size_t i; interesting_tags = make_nocase_string_hash_table (countof (known_tags)); /* First, add all the tags we know hot to handle, mapped to their @@ -246,28 +252,20 @@ find_attr (struct taginfo *tag, const char *name, int *attrind) return NULL; } -struct map_context { - char *text; /* HTML text. */ - char *base; /* Base URI of the document, possibly - changed through . */ - const char *parent_base; /* Base of the current document. */ - const char *document_file; /* File name of this document. */ - bool nofollow; /* whether NOFOLLOW was specified in a - tag. */ - - struct urlpos *head, *tail; /* List of URLs that is being - built. */ -}; +/* used for calls to append_url */ +#define ATTR_POS(tag, attrind, ctx) \ + (tag->attrs[attrind].value_raw_beginning - ctx->text) +#define ATTR_SIZE(tag, attrind) \ + (tag->attrs[attrind].value_raw_size) /* Append LINK_URI to the urlpos structure that is being built. - LINK_URI will be merged with the current document base. TAG and - ATTRIND are the necessary context to store the position and - size. */ + LINK_URI will be merged with the current document base. +*/ -static struct urlpos * -append_url (const char *link_uri, - struct taginfo *tag, int attrind, struct map_context *ctx) +struct urlpos * +append_url (const char *link_uri, int position, int size, + struct map_context *ctx) { int link_has_scheme = url_has_scheme (link_uri); struct urlpos *newel; @@ -291,7 +289,7 @@ append_url (const char *link_uri, return NULL; } - url = url_parse (link_uri, NULL); + url = url_parse (link_uri, NULL, NULL, false); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", @@ -307,10 +305,13 @@ append_url (const char *link_uri, char *complete_uri = uri_merge (base, link_uri); - DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", - ctx->document_file, base, link_uri, complete_uri)); + DEBUGP (("%s: merge(%s, %s) -> %s\n", + quotearg_n_style (0, escape_quoting_style, ctx->document_file), + quote_n (1, base), + quote_n (2, link_uri), + quotearg_n_style (3, escape_quoting_style, complete_uri))); - url = url_parse (complete_uri, NULL); + url = url_parse (complete_uri, NULL, NULL, false); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", @@ -321,12 +322,12 @@ append_url (const char *link_uri, xfree (complete_uri); } - DEBUGP (("appending \"%s\" to urlpos.\n", url->url)); + DEBUGP (("appending %s to urlpos.\n", quote (url->url))); newel = xnew0 (struct urlpos); newel->url = url; - newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text; - newel->size = tag->attrs[attrind].value_raw_size; + newel->pos = position; + newel->size = size; /* A URL is relative if the host is not named, and the name does not start with `/'. */ @@ -335,17 +336,58 @@ append_url (const char *link_uri, else if (link_has_scheme) newel->link_complete_p = 1; - if (ctx->tail) + /* Append the new URL maintaining the order by position. */ + if (ctx->head == NULL) + ctx->head = newel; + else { - ctx->tail->next = newel; - ctx->tail = newel; + struct urlpos *it, *prev = NULL; + + it = ctx->head; + while (it && position > it->pos) + { + prev = it; + it = it->next; + } + + newel->next = it; + + if (prev) + prev->next = newel; + else + ctx->head = newel; } - else - ctx->tail = ctx->head = newel; return newel; } +static void +check_style_attr (struct taginfo *tag, struct map_context *ctx) +{ + int attrind; + int raw_start; + int raw_len; + char *style = find_attr (tag, "style", &attrind); + if (!style) + return; + + /* raw pos and raw size include the quotes, skip them when they are + present. */ + raw_start = ATTR_POS (tag, attrind, ctx); + raw_len = ATTR_SIZE (tag, attrind); + if( *(char *)(ctx->text + raw_start) == '\'' + || *(char *)(ctx->text + raw_start) == '"') + { + raw_start += 1; + raw_len -= 2; + } + + if(raw_len <= 0) + return; + + get_urls_css (ctx, raw_start, raw_len); +} + /* All the tag_* functions are called from collect_tags_mapper, as specified by KNOWN_TAGS. */ @@ -355,7 +397,8 @@ append_url (const char *link_uri, static void tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) { - int i, attrind; + size_t i; + int attrind; int first = -1; for (i = 0; i < countof (tag_url_attributes); i++) @@ -382,7 +425,7 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) /* Find whether TAG/ATTRIND is a combination that contains a URL. */ char *link = tag->attrs[attrind].value; - const int size = countof (tag_url_attributes); + const size_t size = countof (tag_url_attributes); /* If you're cringing at the inefficiency of the nested loops, remember that they both iterate over a very small number of @@ -393,7 +436,8 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) if (0 == strcasecmp (tag->attrs[attrind].name, tag_url_attributes[i].attr_name)) { - struct urlpos *up = append_url (link, tag, attrind, ctx); + struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx), + ATTR_SIZE(tag,attrind), ctx); if (up) { int flags = tag_url_attributes[i].flags; @@ -418,7 +462,8 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx) if (!newbase) return; - base_urlpos = append_url (newbase, tag, attrind, ctx); + base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx), + ATTR_SIZE(tag,attrind), ctx); if (!base_urlpos) return; base_urlpos->ignore_when_downloading = 1; @@ -439,9 +484,11 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx) { int attrind; char *action = find_attr (tag, "action", &attrind); + if (action) { - struct urlpos *up = append_url (action, tag, attrind, ctx); + struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx), + ATTR_SIZE(tag,attrind), ctx); if (up) up->ignore_when_downloading = 1; } @@ -459,23 +506,39 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) /* All link references are external, except those known not to be, such as style sheet and shortcut icon: - - + + */ if (href) { - struct urlpos *up = append_url (href, tag, attrind, ctx); + struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx), + ATTR_SIZE(tag,attrind), ctx); if (up) { char *rel = find_attr (tag, "rel", NULL); - if (rel - && (0 == strcasecmp (rel, "stylesheet") - || 0 == strcasecmp (rel, "shortcut icon"))) - up->link_inline_p = 1; - else - /* The external ones usually point to HTML pages, such as - */ - up->link_expect_html = 1; + if (rel) + { + if (0 == strcasecmp (rel, "stylesheet")) + { + up->link_inline_p = 1; + up->link_expect_css = 1; + } + else if (0 == strcasecmp (rel, "shortcut icon")) + { + up->link_inline_p = 1; + } + else + { + /* The external ones usually point to HTML pages, such as + + except when the type attribute says otherwise: + + */ + char *type = find_attr (tag, "type", NULL); + if (!type || strcasecmp (type, "text/html") == 0) + up->link_expect_html = 1; + } + } } } } @@ -509,23 +572,24 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) if (!refresh) return; - for (p = refresh; ISDIGIT (*p); p++) + for (p = refresh; c_isdigit (*p); p++) timeout = 10 * timeout + *p - '0'; if (*p++ != ';') return; - while (ISSPACE (*p)) + while (c_isspace (*p)) ++p; - if (!( TOUPPER (*p) == 'U' - && TOUPPER (*(p + 1)) == 'R' - && TOUPPER (*(p + 2)) == 'L' + if (!( c_toupper (*p) == 'U' + && c_toupper (*(p + 1)) == 'R' + && c_toupper (*(p + 2)) == 'L' && *(p + 3) == '=')) return; p += 4; - while (ISSPACE (*p)) + while (c_isspace (*p)) ++p; - entry = append_url (p, tag, attrind, ctx); + entry = append_url (p, ATTR_POS(tag,attrind,ctx), + ATTR_SIZE(tag,attrind), ctx); if (entry) { entry->link_refresh_p = 1; @@ -533,6 +597,23 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) entry->link_expect_html = 1; } } + else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type")) + { + /* Handle stuff like: + */ + + char *mcharset; + char *content = find_attr (tag, "content", NULL); + if (!content) + return; + + mcharset = parse_charset (content); + if (!mcharset) + return; + + xfree_null (meta_charset); + meta_charset = mcharset; + } else if (name && 0 == strcasecmp (name, "robots")) { /* Handle stuff like: @@ -546,15 +627,25 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) { while (*content) { - /* Find the next occurrence of ',' or the end of - the string. */ - char *end = strchr (content, ','); - if (end) - ++end; - else - end = content + strlen (content); + char *end; + /* Skip any initial whitespace. */ + content += strspn (content, " \f\n\r\t\v"); + /* Find the next occurrence of ',' or whitespace, + * or the end of the string. */ + end = content + strcspn (content, ", \f\n\r\t\v"); if (!strncasecmp (content, "nofollow", end - content)) ctx->nofollow = true; + /* Skip past the next comma, if any. */ + if (*end == ',') + ++end; + else + { + end = strchr (end, ','); + if (end) + ++end; + else + end = content + strlen (content); + } content = end; } } @@ -570,11 +661,26 @@ collect_tags_mapper (struct taginfo *tag, void *arg) struct map_context *ctx = (struct map_context *)arg; /* Find the tag in our table of tags. This must not fail because - map_html_tags only returns tags found in interesting_tags. */ + map_html_tags only returns tags found in interesting_tags. + + I've changed this for now, I'm passing NULL as interesting_tags + to map_html_tags. This way we can check all tags for a style + attribute. + */ struct known_tag *t = hash_table_get (interesting_tags, tag->name); - assert (t != NULL); - t->handler (t->tagid, tag, ctx); + if (t != NULL) + t->handler (t->tagid, tag, ctx); + + check_style_attr (tag, ctx); + + if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) && + tag->contents_begin && tag->contents_end) + { + /* parse contents */ + get_urls_css (ctx, tag->contents_begin - ctx->text, + tag->contents_end - tag->contents_begin); + } } /* Analyze HTML tags FILE and construct a list of URLs referenced from @@ -582,14 +688,15 @@ collect_tags_mapper (struct taginfo *tag, void *arg) and does the right thing. */ struct urlpos * -get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) +get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, + struct iri *iri) { struct file_memory *fm; struct map_context ctx; int flags; /* Load the file. */ - fm = read_file (file); + fm = wget_read_file (file); if (!fm) { logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); @@ -598,7 +705,7 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); ctx.text = fm->content; - ctx.head = ctx.tail = NULL; + ctx.head = NULL; ctx.base = NULL; ctx.parent_base = url ? url : opt.base_href; ctx.document_file = file; @@ -618,15 +725,20 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) if (opt.strict_comments) flags |= MHT_STRICT_COMMENTS; + /* the NULL here used to be interesting_tags */ map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, - interesting_tags, interesting_attributes); + NULL, interesting_attributes); + + /* If meta charset isn't null, override content encoding */ + if (iri && meta_charset) + set_content_encoding (iri, meta_charset); DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); if (meta_disallow_follow) *meta_disallow_follow = ctx.nofollow; xfree_null (ctx.base); - read_file_free (fm); + wget_read_file_free (fm); return ctx.head; } @@ -641,7 +753,7 @@ get_urls_file (const char *file) const char *text, *text_end; /* Load the file. */ - fm = read_file (file); + fm = wget_read_file (file); if (!fm) { logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); @@ -668,9 +780,9 @@ get_urls_file (const char *file) text = line_end; /* Strip whitespace from the beginning and end of line. */ - while (line_beg < line_end && ISSPACE (*line_beg)) + while (line_beg < line_end && c_isspace (*line_beg)) ++line_beg; - while (line_end > line_beg && ISSPACE (*(line_end - 1))) + while (line_end > line_beg && c_isspace (*(line_end - 1))) --line_end; if (line_beg == line_end) @@ -690,12 +802,14 @@ get_urls_file (const char *file) url_text = merged; } - url = url_parse (url_text, &up_error_code); + url = url_parse (url_text, &up_error_code, NULL, false); if (!url) { + char *error = url_error (url_text, up_error_code); logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), - file, url_text, url_error (up_error_code)); + file, url_text, error); xfree (url_text); + xfree (error); continue; } xfree (url_text); @@ -709,7 +823,7 @@ get_urls_file (const char *file) tail->next = entry; tail = entry; } - read_file_free (fm); + wget_read_file_free (fm); return head; }