X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fhtml-url.c;h=b5da0781abd47c9ae89e90d1ff58aa410c558283;hp=3c7c409e82e4c9f48e490abee7f4131a9113da0f;hb=7d2066b2213bd8ee5705dfdf6ed4297e91d694d7;hpb=2fe72be505d2d91fc0bbbd22cc19f3d288813671 diff --git a/src/html-url.c b/src/html-url.c index 3c7c409e..b5da0781 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -40,13 +40,13 @@ so, delete this exception statement from your version. */ #include "utils.h" #include "hash.h" #include "convert.h" -#include "recur.h" /* declaration of get_urls_html */ +#include "recur.h" /* declaration of get_urls_html */ struct map_context; typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *); -#define DECLARE_TAG_HANDLER(fun) \ +#define DECLARE_TAG_HANDLER(fun) \ static void fun (int, struct taginfo *, struct map_context *) DECLARE_TAG_HANDLER (tag_find_urls); @@ -87,28 +87,28 @@ static struct known_tag { const char *name; tag_handler_t handler; } known_tags[] = { - { TAG_A, "a", tag_find_urls }, - { TAG_APPLET, "applet", tag_find_urls }, - { TAG_AREA, "area", tag_find_urls }, - { TAG_BASE, "base", tag_handle_base }, - { TAG_BGSOUND, "bgsound", tag_find_urls }, - { TAG_BODY, "body", tag_find_urls }, - { TAG_EMBED, "embed", tag_find_urls }, - { TAG_FIG, "fig", tag_find_urls }, - { TAG_FORM, "form", tag_handle_form }, - { TAG_FRAME, "frame", tag_find_urls }, - { TAG_IFRAME, "iframe", tag_find_urls }, - { TAG_IMG, "img", tag_find_urls }, - { TAG_INPUT, "input", tag_find_urls }, - { TAG_LAYER, "layer", tag_find_urls }, - { TAG_LINK, "link", tag_handle_link }, - { TAG_META, "meta", tag_handle_meta }, - { TAG_OBJECT, "object", tag_find_urls }, - { TAG_OVERLAY, "overlay", tag_find_urls }, - { TAG_SCRIPT, "script", tag_find_urls }, - { TAG_TABLE, "table", tag_find_urls }, - { TAG_TD, "td", tag_find_urls }, - { TAG_TH, "th", tag_find_urls } + { TAG_A, "a", tag_find_urls }, + { TAG_APPLET, "applet", tag_find_urls }, + { TAG_AREA, "area", tag_find_urls }, + { TAG_BASE, "base", tag_handle_base }, + { TAG_BGSOUND, "bgsound", tag_find_urls }, + { TAG_BODY, "body", tag_find_urls }, + { TAG_EMBED, "embed", tag_find_urls }, + { TAG_FIG, "fig", tag_find_urls }, + { TAG_FORM, "form", tag_handle_form }, + { TAG_FRAME, "frame", tag_find_urls }, + { TAG_IFRAME, "iframe", tag_find_urls }, + { TAG_IMG, "img", tag_find_urls }, + { TAG_INPUT, "input", tag_find_urls }, + { TAG_LAYER, "layer", tag_find_urls }, + { TAG_LINK, "link", tag_handle_link }, + { TAG_META, "meta", tag_handle_meta }, + { TAG_OBJECT, "object", tag_find_urls }, + { TAG_OVERLAY, "overlay", tag_find_urls }, + { TAG_SCRIPT, "script", tag_find_urls }, + { TAG_TABLE, "table", tag_find_urls }, + { TAG_TD, "td", tag_find_urls }, + { TAG_TH, "th", tag_find_urls } }; /* tag_url_attributes documents which attributes of which tags contain @@ -119,14 +119,14 @@ static struct known_tag { /* The link is "inline", i.e. needs to be retrieved for this document to be correctly rendered. Inline links include inlined images, stylesheets, children frames, etc. */ -#define ATTR_INLINE 1 +#define ATTR_INLINE 1 /* The link is expected to yield HTML contents. It's important not to try to follow HTML obtained by following e.g. regardless of content-type. Doing this causes infinite loops for "images" that return non-404 error pages with links to the same image. */ -#define ATTR_HTML 2 +#define ATTR_HTML 2 /* For tags handled by tag_find_urls: attributes that contain URLs to download. */ @@ -135,38 +135,38 @@ static struct { const char *attr_name; int flags; } tag_url_attributes[] = { - { TAG_A, "href", ATTR_HTML }, - { TAG_APPLET, "code", ATTR_INLINE }, - { TAG_AREA, "href", ATTR_HTML }, - { TAG_BGSOUND, "src", ATTR_INLINE }, - { TAG_BODY, "background", ATTR_INLINE }, - { TAG_EMBED, "href", ATTR_HTML }, - { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML }, - { TAG_FIG, "src", ATTR_INLINE }, - { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML }, - { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML }, - { TAG_IMG, "href", ATTR_INLINE }, - { TAG_IMG, "lowsrc", ATTR_INLINE }, - { TAG_IMG, "src", ATTR_INLINE }, - { TAG_INPUT, "src", ATTR_INLINE }, - { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML }, - { TAG_OBJECT, "data", ATTR_INLINE }, - { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML }, - { TAG_SCRIPT, "src", ATTR_INLINE }, - { TAG_TABLE, "background", ATTR_INLINE }, - { TAG_TD, "background", ATTR_INLINE }, - { TAG_TH, "background", ATTR_INLINE } + { TAG_A, "href", ATTR_HTML }, + { TAG_APPLET, "code", ATTR_INLINE }, + { TAG_AREA, "href", ATTR_HTML }, + { TAG_BGSOUND, "src", ATTR_INLINE }, + { TAG_BODY, "background", ATTR_INLINE }, + { TAG_EMBED, "href", ATTR_HTML }, + { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_FIG, "src", ATTR_INLINE }, + { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_IMG, "href", ATTR_INLINE }, + { TAG_IMG, "lowsrc", ATTR_INLINE }, + { TAG_IMG, "src", ATTR_INLINE }, + { TAG_INPUT, "src", ATTR_INLINE }, + { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_OBJECT, "data", ATTR_INLINE }, + { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_SCRIPT, "src", ATTR_INLINE }, + { TAG_TABLE, "background", ATTR_INLINE }, + { TAG_TD, "background", ATTR_INLINE }, + { TAG_TH, "background", ATTR_INLINE } }; /* The lists of interesting tags and attributes are built dynamically, from the information above. However, some places in the code refer to the attributes not mentioned here. We add them manually. */ static const char *additional_attributes[] = { - "rel", /* used by tag_handle_link */ - "http-equiv", /* used by tag_handle_meta */ - "name", /* used by tag_handle_meta */ - "content", /* used by tag_handle_meta */ - "action" /* used by tag_handle_form */ + "rel", /* used by tag_handle_link */ + "http-equiv", /* used by tag_handle_meta */ + "name", /* used by tag_handle_meta */ + "content", /* used by tag_handle_meta */ + "action" /* used by tag_handle_form */ }; static struct hash_table *interesting_tags; @@ -197,23 +197,23 @@ init_interesting (void) { char **ignored; for (ignored = opt.ignore_tags; *ignored; ignored++) - hash_table_remove (interesting_tags, *ignored); + hash_table_remove (interesting_tags, *ignored); } /* If --follow-tags is specified, use only those tags. */ if (opt.follow_tags) { /* Create a new table intersecting --follow-tags and known_tags, - and use it as interesting_tags. */ + and use it as interesting_tags. */ struct hash_table *intersect = make_nocase_string_hash_table (0); char **followed; for (followed = opt.follow_tags; *followed; followed++) - { - struct known_tag *t = hash_table_get (interesting_tags, *followed); - if (!t) - continue; /* ignore unknown --follow-tags entries. */ - hash_table_put (intersect, *followed, t); - } + { + struct known_tag *t = hash_table_get (interesting_tags, *followed); + if (!t) + continue; /* ignore unknown --follow-tags entries. */ + hash_table_put (intersect, *followed, t); + } hash_table_destroy (interesting_tags); interesting_tags = intersect; } @@ -224,7 +224,7 @@ init_interesting (void) hash_table_put (interesting_attributes, additional_attributes[i], "1"); for (i = 0; i < countof (tag_url_attributes); i++) hash_table_put (interesting_attributes, - tag_url_attributes[i].attr_name, "1"); + tag_url_attributes[i].attr_name, "1"); } /* Find the value of attribute named NAME in the taginfo TAG. If the @@ -238,24 +238,24 @@ find_attr (struct taginfo *tag, const char *name, int *attrind) for (i = 0; i < tag->nattrs; i++) if (!strcasecmp (tag->attrs[i].name, name)) { - if (attrind) - *attrind = i; - return tag->attrs[i].value; + if (attrind) + *attrind = i; + return tag->attrs[i].value; } return NULL; } struct map_context { - char *text; /* HTML text. */ - char *base; /* Base URI of the document, possibly - changed through . */ - const char *parent_base; /* Base of the current document. */ - const char *document_file; /* File name of this document. */ - bool nofollow; /* whether NOFOLLOW was specified in a + char *text; /* HTML text. */ + char *base; /* Base URI of the document, possibly + changed through . */ + const char *parent_base; /* Base of the current document. */ + const char *document_file; /* File name of this document. */ + bool nofollow; /* whether NOFOLLOW was specified in a tag. */ - struct urlpos *head, *tail; /* List of URLs that is being - built. */ + struct urlpos *head, *tail; /* List of URLs that is being + built. */ }; /* Append LINK_URI to the urlpos structure that is being built. @@ -266,7 +266,7 @@ struct map_context { static struct urlpos * append_url (const char *link_uri, - struct taginfo *tag, int attrind, struct map_context *ctx) + struct taginfo *tag, int attrind, struct map_context *ctx) { int link_has_scheme = url_has_scheme (link_uri); struct urlpos *newel; @@ -276,47 +276,47 @@ append_url (const char *link_uri, if (!base) { DEBUGP (("%s: no base, merge will use \"%s\".\n", - ctx->document_file, link_uri)); + ctx->document_file, link_uri)); if (!link_has_scheme) - { - /* Base URL is unavailable, and the link does not have a - location attached to it -- we have to give up. Since - this can only happen when using `--force-html -i', print - a warning. */ - logprintf (LOG_NOTQUIET, - _("%s: Cannot resolve incomplete link %s.\n"), - ctx->document_file, link_uri); - return NULL; - } + { + /* Base URL is unavailable, and the link does not have a + location attached to it -- we have to give up. Since + this can only happen when using `--force-html -i', print + a warning. */ + logprintf (LOG_NOTQUIET, + _("%s: Cannot resolve incomplete link %s.\n"), + ctx->document_file, link_uri); + return NULL; + } url = url_parse (link_uri, NULL); if (!url) - { - DEBUGP (("%s: link \"%s\" doesn't parse.\n", - ctx->document_file, link_uri)); - return NULL; - } + { + DEBUGP (("%s: link \"%s\" doesn't parse.\n", + ctx->document_file, link_uri)); + return NULL; + } } else { /* Merge BASE with LINK_URI, but also make sure the result is - canonicalized, i.e. that "../" have been resolved. - (parse_url will do that for us.) */ + canonicalized, i.e. that "../" have been resolved. + (parse_url will do that for us.) */ char *complete_uri = uri_merge (base, link_uri); DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", - ctx->document_file, base, link_uri, complete_uri)); + ctx->document_file, base, link_uri, complete_uri)); url = url_parse (complete_uri, NULL); if (!url) - { - DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", - ctx->document_file, complete_uri)); - xfree (complete_uri); - return NULL; - } + { + DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", + ctx->document_file, complete_uri)); + xfree (complete_uri); + return NULL; + } xfree (complete_uri); } @@ -360,10 +360,10 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) for (i = 0; i < countof (tag_url_attributes); i++) if (tag_url_attributes[i].tagid == tagid) { - /* We've found the index of tag_url_attributes where the - attributes of our tag begin. */ - first = i; - break; + /* We've found the index of tag_url_attributes where the + attributes of our tag begin. */ + first = i; + break; } assert (first != -1); @@ -379,30 +379,30 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) for (attrind = 0; attrind < tag->nattrs; attrind++) { /* Find whether TAG/ATTRIND is a combination that contains a - URL. */ + URL. */ char *link = tag->attrs[attrind].value; const int size = countof (tag_url_attributes); /* If you're cringing at the inefficiency of the nested loops, - remember that they both iterate over a very small number of - items. The worst-case inner loop is for the IMG tag, which - has three attributes. */ + remember that they both iterate over a very small number of + items. The worst-case inner loop is for the IMG tag, which + has three attributes. */ for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++) - { - if (0 == strcasecmp (tag->attrs[attrind].name, - tag_url_attributes[i].attr_name)) - { - struct urlpos *up = append_url (link, tag, attrind, ctx); - if (up) - { - int flags = tag_url_attributes[i].flags; - if (flags & ATTR_INLINE) - up->link_inline_p = 1; - if (flags & ATTR_HTML) - up->link_expect_html = 1; - } - } - } + { + if (0 == strcasecmp (tag->attrs[attrind].name, + tag_url_attributes[i].attr_name)) + { + struct urlpos *up = append_url (link, tag, attrind, ctx); + if (up) + { + int flags = tag_url_attributes[i].flags; + if (flags & ATTR_INLINE) + up->link_inline_p = 1; + if (flags & ATTR_HTML) + up->link_expect_html = 1; + } + } + } } } @@ -442,7 +442,7 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx) { struct urlpos *up = append_url (action, tag, attrind, ctx); if (up) - up->ignore_when_downloading = 1; + up->ignore_when_downloading = 1; } } @@ -465,17 +465,17 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) { struct urlpos *up = append_url (href, tag, attrind, ctx); if (up) - { - char *rel = find_attr (tag, "rel", NULL); - if (rel - && (0 == strcasecmp (rel, "stylesheet") - || 0 == strcasecmp (rel, "shortcut icon"))) - up->link_inline_p = 1; - else - /* The external ones usually point to HTML pages, such as - */ - up->link_expect_html = 1; - } + { + char *rel = find_attr (tag, "rel", NULL); + if (rel + && (0 == strcasecmp (rel, "stylesheet") + || 0 == strcasecmp (rel, "shortcut icon"))) + up->link_inline_p = 1; + else + /* The external ones usually point to HTML pages, such as + */ + up->link_expect_html = 1; + } } } @@ -491,13 +491,13 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) if (http_equiv && 0 == strcasecmp (http_equiv, "refresh")) { /* Some pages use a META tag to specify that the page be - refreshed by a new page after a given number of seconds. The - general format for this is: + refreshed by a new page after a given number of seconds. The + general format for this is: - + - So we just need to skip past the "NUMBER; URL=" garbage to - get to the URL. */ + So we just need to skip past the "NUMBER; URL=" garbage to + get to the URL. */ struct urlpos *entry; int attrind; @@ -506,57 +506,57 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) char *refresh = find_attr (tag, "content", &attrind); if (!refresh) - return; + return; for (p = refresh; ISDIGIT (*p); p++) - timeout = 10 * timeout + *p - '0'; + timeout = 10 * timeout + *p - '0'; if (*p++ != ';') - return; + return; while (ISSPACE (*p)) - ++p; + ++p; if (!( TOUPPER (*p) == 'U' - && TOUPPER (*(p + 1)) == 'R' - && TOUPPER (*(p + 2)) == 'L' - && *(p + 3) == '=')) - return; + && TOUPPER (*(p + 1)) == 'R' + && TOUPPER (*(p + 2)) == 'L' + && *(p + 3) == '=')) + return; p += 4; while (ISSPACE (*p)) - ++p; + ++p; entry = append_url (p, tag, attrind, ctx); if (entry) - { - entry->link_refresh_p = 1; - entry->refresh_timeout = timeout; - entry->link_expect_html = 1; - } + { + entry->link_refresh_p = 1; + entry->refresh_timeout = timeout; + entry->link_expect_html = 1; + } } else if (name && 0 == strcasecmp (name, "robots")) { /* Handle stuff like: - */ + */ char *content = find_attr (tag, "content", NULL); if (!content) - return; + return; if (!strcasecmp (content, "none")) - ctx->nofollow = true; + ctx->nofollow = true; else - { - while (*content) - { - /* Find the next occurrence of ',' or the end of - the string. */ - char *end = strchr (content, ','); - if (end) - ++end; - else - end = content + strlen (content); - if (!strncasecmp (content, "nofollow", end - content)) - ctx->nofollow = true; - content = end; - } - } + { + while (*content) + { + /* Find the next occurrence of ',' or the end of + the string. */ + char *end = strchr (content, ','); + if (end) + ++end; + else + end = content + strlen (content); + if (!strncasecmp (content, "nofollow", end - content)) + ctx->nofollow = true; + content = end; + } + } } } @@ -618,7 +618,7 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) flags |= MHT_STRICT_COMMENTS; map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, - interesting_tags, interesting_attributes); + interesting_tags, interesting_attributes); DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); if (meta_disallow_follow) @@ -661,51 +661,51 @@ get_urls_file (const char *file) const char *line_beg = text; const char *line_end = memchr (text, '\n', text_end - text); if (!line_end) - line_end = text_end; + line_end = text_end; else - ++line_end; + ++line_end; text = line_end; /* Strip whitespace from the beginning and end of line. */ while (line_beg < line_end && ISSPACE (*line_beg)) - ++line_beg; + ++line_beg; while (line_end > line_beg && ISSPACE (*(line_end - 1))) - --line_end; + --line_end; if (line_beg == line_end) - continue; + continue; /* The URL is in the [line_beg, line_end) region. */ /* We must copy the URL to a zero-terminated string, and we - can't use alloca because we're in a loop. *sigh*. */ + can't use alloca because we're in a loop. *sigh*. */ url_text = strdupdelim (line_beg, line_end); if (opt.base_href) - { - /* Merge opt.base_href with URL. */ - char *merged = uri_merge (opt.base_href, url_text); - xfree (url_text); - url_text = merged; - } + { + /* Merge opt.base_href with URL. */ + char *merged = uri_merge (opt.base_href, url_text); + xfree (url_text); + url_text = merged; + } url = url_parse (url_text, &up_error_code); if (!url) - { - logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), - file, url_text, url_error (up_error_code)); - xfree (url_text); - continue; - } + { + logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), + file, url_text, url_error (up_error_code)); + xfree (url_text); + continue; + } xfree (url_text); entry = xnew0 (struct urlpos); entry->url = url; if (!head) - head = entry; + head = entry; else - tail->next = entry; + tail->next = entry; tail = entry; } read_file_free (fm);