X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-url.c;h=cbaffb25cb4550e3040fb2644663f566e40f4211;hb=a459f0aac9eda068f12bc1430d6e59e07a224ddf;hp=9b515432a8122550b9beaca988bce6e99d1c16dd;hpb=13fec855660ee55c43f64fe47fbc284f35ca6e6e;p=wget diff --git a/src/html-url.c b/src/html-url.c index 9b515432..cbaffb25 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -41,10 +41,9 @@ as that of the covered work. */ #include "utils.h" #include "hash.h" #include "convert.h" -#include "recur.h" /* declaration of get_urls_html */ -#include "iri.h" - -struct map_context; +#include "recur.h" +#include "html-url.h" +#include "css-url.h" typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *); @@ -164,16 +163,21 @@ static struct { from the information above. However, some places in the code refer to the attributes not mentioned here. We add them manually. */ static const char *additional_attributes[] = { - "rel", /* used by tag_handle_link */ - "http-equiv", /* used by tag_handle_meta */ - "name", /* used by tag_handle_meta */ - "content", /* used by tag_handle_meta */ - "action" /* used by tag_handle_form */ + "rel", /* used by tag_handle_link */ + "http-equiv", /* used by tag_handle_meta */ + "name", /* used by tag_handle_meta */ + "content", /* used by tag_handle_meta */ + "action", /* used by tag_handle_form */ + "style" /* used by check_style_attr */ }; static struct hash_table *interesting_tags; static struct hash_table *interesting_attributes; +/* Will contains the (last) charset found in 'http-equiv=content-type' + meta tags */ +static char *meta_charset; + static void init_interesting (void) { @@ -247,28 +251,20 @@ find_attr (struct taginfo *tag, const char *name, int *attrind) return NULL; } -struct map_context { - char *text; /* HTML text. */ - char *base; /* Base URI of the document, possibly - changed through . */ - const char *parent_base; /* Base of the current document. */ - const char *document_file; /* File name of this document. */ - bool nofollow; /* whether NOFOLLOW was specified in a - tag. */ - - struct urlpos *head, *tail; /* List of URLs that is being - built. */ -}; +/* used for calls to append_url */ +#define ATTR_POS(tag, attrind, ctx) \ + (tag->attrs[attrind].value_raw_beginning - ctx->text) +#define ATTR_SIZE(tag, attrind) \ + (tag->attrs[attrind].value_raw_size) /* Append LINK_URI to the urlpos structure that is being built. - LINK_URI will be merged with the current document base. TAG and - ATTRIND are the necessary context to store the position and - size. */ + LINK_URI will be merged with the current document base. +*/ -static struct urlpos * -append_url (const char *link_uri, - struct taginfo *tag, int attrind, struct map_context *ctx) +struct urlpos * +append_url (const char *link_uri, int position, int size, + struct map_context *ctx) { int link_has_scheme = url_has_scheme (link_uri); struct urlpos *newel; @@ -292,7 +288,7 @@ append_url (const char *link_uri, return NULL; } - url = url_parse (link_uri, NULL); + url = url_parse (link_uri, NULL, NULL); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", @@ -311,7 +307,7 @@ append_url (const char *link_uri, DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", ctx->document_file, base, link_uri, complete_uri)); - url = url_parse (complete_uri, NULL); + url = url_parse (complete_uri, NULL, NULL); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", @@ -326,8 +322,8 @@ append_url (const char *link_uri, newel = xnew0 (struct urlpos); newel->url = url; - newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text; - newel->size = tag->attrs[attrind].value_raw_size; + newel->pos = position; + newel->size = size; /* A URL is relative if the host is not named, and the name does not start with `/'. */ @@ -347,6 +343,18 @@ append_url (const char *link_uri, return newel; } +static void +check_style_attr (struct taginfo *tag, struct map_context *ctx) +{ + int attrind; + char *style = find_attr (tag, "style", &attrind); + if (!style) + return; + + /* raw pos and raw size include the quotes, hence the +1 -2 */ + get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2); +} + /* All the tag_* functions are called from collect_tags_mapper, as specified by KNOWN_TAGS. */ @@ -395,7 +403,8 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) if (0 == strcasecmp (tag->attrs[attrind].name, tag_url_attributes[i].attr_name)) { - struct urlpos *up = append_url (link, tag, attrind, ctx); + struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx), + ATTR_SIZE(tag,attrind), ctx); if (up) { int flags = tag_url_attributes[i].flags; @@ -420,7 +429,8 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx) if (!newbase) return; - base_urlpos = append_url (newbase, tag, attrind, ctx); + base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx), + ATTR_SIZE(tag,attrind), ctx); if (!base_urlpos) return; base_urlpos->ignore_when_downloading = 1; @@ -441,9 +451,11 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx) { int attrind; char *action = find_attr (tag, "action", &attrind); + if (action) { - struct urlpos *up = append_url (action, tag, attrind, ctx); + struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx), + ATTR_SIZE(tag,attrind), ctx); if (up) up->ignore_when_downloading = 1; } @@ -466,14 +478,23 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) */ if (href) { - struct urlpos *up = append_url (href, tag, attrind, ctx); + struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx), + ATTR_SIZE(tag,attrind), ctx); if (up) { char *rel = find_attr (tag, "rel", NULL); - if (rel - && (0 == strcasecmp (rel, "stylesheet") - || 0 == strcasecmp (rel, "shortcut icon"))) - up->link_inline_p = 1; + if (rel) + { + if (0 == strcasecmp (rel, "stylesheet")) + { + up->link_inline_p = 1; + up->link_expect_css = 1; + } + else if (0 == strcasecmp (rel, "shortcut icon")) + { + up->link_inline_p = 1; + } + } else /* The external ones usually point to HTML pages, such as */ @@ -527,7 +548,8 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) while (c_isspace (*p)) ++p; - entry = append_url (p, tag, attrind, ctx); + entry = append_url (p, ATTR_POS(tag,attrind,ctx), + ATTR_SIZE(tag,attrind), ctx); if (entry) { entry->link_refresh_p = 1; @@ -549,10 +571,9 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) if (!mcharset) return; - logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset)); - - /* sXXXav: Not used yet */ - xfree (mcharset); + /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/ + xfree_null (meta_charset); + meta_charset = mcharset; } else if (name && 0 == strcasecmp (name, "robots")) { @@ -591,11 +612,26 @@ collect_tags_mapper (struct taginfo *tag, void *arg) struct map_context *ctx = (struct map_context *)arg; /* Find the tag in our table of tags. This must not fail because - map_html_tags only returns tags found in interesting_tags. */ + map_html_tags only returns tags found in interesting_tags. + + I've changed this for now, I'm passing NULL as interesting_tags + to map_html_tags. This way we can check all tags for a style + attribute. + */ struct known_tag *t = hash_table_get (interesting_tags, tag->name); - assert (t != NULL); - t->handler (t->tagid, tag, ctx); + if (t != NULL) + t->handler (t->tagid, tag, ctx); + + check_style_attr (tag, ctx); + + if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) && + tag->contents_begin && tag->contents_end) + { + /* parse contents */ + get_urls_css (ctx, tag->contents_begin - ctx->text, + tag->contents_end - tag->contents_begin); + } } /* Analyze HTML tags FILE and construct a list of URLs referenced from @@ -603,7 +639,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg) and does the right thing. */ struct urlpos * -get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) +get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, + struct iri *iri) { struct file_memory *fm; struct map_context ctx; @@ -639,8 +676,13 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) if (opt.strict_comments) flags |= MHT_STRICT_COMMENTS; + /* the NULL here used to be interesting_tags */ map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, - interesting_tags, interesting_attributes); + NULL, interesting_attributes); + + /* If meta charset isn't null, override content encoding */ + if (iri && meta_charset) + set_content_encoding (iri, meta_charset); DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); if (meta_disallow_follow) @@ -711,12 +753,14 @@ get_urls_file (const char *file) url_text = merged; } - url = url_parse (url_text, &up_error_code); + url = url_parse (url_text, &up_error_code, NULL); if (!url) { + char *error = url_error (url_text, up_error_code); logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), - file, url_text, url_error (up_error_code)); + file, url_text, error); xfree (url_text); + xfree (error); continue; } xfree (url_text);