X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-url.c;h=3c7c409e82e4c9f48e490abee7f4131a9113da0f;hb=4d7c5e087b2bc82c9f503dff003916d1047903ce;hp=80f5b96c19ffcec006a0f9d3565868ddd11be95a;hpb=ffc2d0f653db2f49c8d3d4b26aa517ed90eadd30;p=wget diff --git a/src/html-url.c b/src/html-url.c index 80f5b96c..3c7c409e 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -1,11 +1,11 @@ /* Collect URLs from HTML source. - Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc. + Copyright (C) 1998-2006 Free Software Foundation, Inc. This file is part of GNU Wget. GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or +the Free Software Foundation; either version 3 of the License, or (at your option) any later version. GNU Wget is distributed in the hope that it will be useful, @@ -14,8 +14,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with Wget; if not, write to the Free Software -Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +along with Wget. If not, see <http://www.gnu.org/licenses/>. In addition, as a special exception, the Free Software Foundation gives permission to link the code of its release of Wget with the @@ -30,11 +29,7 @@ so, delete this exception statement from your version. */ #include <config.h> #include <stdio.h> -#ifdef HAVE_STRING_H -# include <string.h> -#else -# include <strings.h> -#endif +#include <string.h> #include <stdlib.h> #include <errno.h> #include <assert.h> @@ -45,18 +40,14 @@ so, delete this exception statement from your version. 
*/ #include "utils.h" #include "hash.h" #include "convert.h" - -#ifndef errno -extern int errno; -#endif +#include "recur.h" /* declaration of get_urls_html */ struct map_context; -typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *, - struct map_context *)); +typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *); -#define DECLARE_TAG_HANDLER(fun) \ - static void fun PARAMS ((int, struct taginfo *, struct map_context *)) +#define DECLARE_TAG_HANDLER(fun) \ + static void fun (int, struct taginfo *, struct map_context *) DECLARE_TAG_HANDLER (tag_find_urls); DECLARE_TAG_HANDLER (tag_handle_base); @@ -81,6 +72,7 @@ enum { TAG_LAYER, TAG_LINK, TAG_META, + TAG_OBJECT, TAG_OVERLAY, TAG_SCRIPT, TAG_TABLE, @@ -111,6 +103,7 @@ static struct known_tag { { TAG_LAYER, "layer", tag_find_urls }, { TAG_LINK, "link", tag_handle_link }, { TAG_META, "meta", tag_handle_meta }, + { TAG_OBJECT, "object", tag_find_urls }, { TAG_OVERLAY, "overlay", tag_find_urls }, { TAG_SCRIPT, "script", tag_find_urls }, { TAG_TABLE, "table", tag_find_urls }, @@ -121,11 +114,19 @@ static struct known_tag { /* tag_url_attributes documents which attributes of which tags contain URLs to harvest. It is used by tag_find_urls. */ -/* Defines for the FLAGS field; currently only one flag is defined. */ +/* Defines for the FLAGS. */ + +/* The link is "inline", i.e. needs to be retrieved for this document + to be correctly rendered. Inline links include inlined images, + stylesheets, children frames, etc. */ +#define ATTR_INLINE 1 -/* This tag points to an external document not necessary for rendering this - document (i.e. it's not an inlined image, stylesheet, etc.). */ -#define TUA_EXTERNAL 1 +/* The link is expected to yield HTML contents. It's important not to + try to follow HTML obtained by following e.g. <img src="..."> + regardless of content-type. Doing this causes infinite loops for + "images" that return non-404 error pages with links to the same + image. 
*/ +#define ATTR_HTML 2 /* For tags handled by tag_find_urls: attributes that contain URLs to download. */ @@ -134,26 +135,27 @@ static struct { const char *attr_name; int flags; } tag_url_attributes[] = { - { TAG_A, "href", TUA_EXTERNAL }, - { TAG_APPLET, "code", 0 }, - { TAG_AREA, "href", TUA_EXTERNAL }, - { TAG_BGSOUND, "src", 0 }, - { TAG_BODY, "background", 0 }, - { TAG_EMBED, "href", TUA_EXTERNAL }, - { TAG_EMBED, "src", 0 }, - { TAG_FIG, "src", 0 }, - { TAG_FRAME, "src", 0 }, - { TAG_IFRAME, "src", 0 }, - { TAG_IMG, "href", 0 }, - { TAG_IMG, "lowsrc", 0 }, - { TAG_IMG, "src", 0 }, - { TAG_INPUT, "src", 0 }, - { TAG_LAYER, "src", 0 }, - { TAG_OVERLAY, "src", 0 }, - { TAG_SCRIPT, "src", 0 }, - { TAG_TABLE, "background", 0 }, - { TAG_TD, "background", 0 }, - { TAG_TH, "background", 0 } + { TAG_A, "href", ATTR_HTML }, + { TAG_APPLET, "code", ATTR_INLINE }, + { TAG_AREA, "href", ATTR_HTML }, + { TAG_BGSOUND, "src", ATTR_INLINE }, + { TAG_BODY, "background", ATTR_INLINE }, + { TAG_EMBED, "href", ATTR_HTML }, + { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_FIG, "src", ATTR_INLINE }, + { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_IMG, "href", ATTR_INLINE }, + { TAG_IMG, "lowsrc", ATTR_INLINE }, + { TAG_IMG, "src", ATTR_INLINE }, + { TAG_INPUT, "src", ATTR_INLINE }, + { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_OBJECT, "data", ATTR_INLINE }, + { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_SCRIPT, "src", ATTR_INLINE }, + { TAG_TABLE, "background", ATTR_INLINE }, + { TAG_TD, "background", ATTR_INLINE }, + { TAG_TH, "background", ATTR_INLINE } }; /* The lists of interesting tags and attributes are built dynamically, @@ -167,8 +169,8 @@ static const char *additional_attributes[] = { "action" /* used by tag_handle_form */ }; -struct hash_table *interesting_tags; -struct hash_table *interesting_attributes; +static struct hash_table *interesting_tags; +static struct hash_table 
*interesting_attributes; static void init_interesting (void) @@ -219,9 +221,10 @@ init_interesting (void) /* Add the attributes we care about. */ interesting_attributes = make_nocase_string_hash_table (10); for (i = 0; i < countof (additional_attributes); i++) - string_set_add (interesting_attributes, additional_attributes[i]); + hash_table_put (interesting_attributes, additional_attributes[i], "1"); for (i = 0; i < countof (tag_url_attributes); i++) - string_set_add (interesting_attributes, tag_url_attributes[i].attr_name); + hash_table_put (interesting_attributes, + tag_url_attributes[i].attr_name, "1"); } /* Find the value of attribute named NAME in the taginfo TAG. If the @@ -248,7 +251,7 @@ struct map_context { changed through <base href=...>. */ const char *parent_base; /* Base of the current document. */ const char *document_file; /* File name of this document. */ - int nofollow; /* whether NOFOLLOW was specified in a + bool nofollow; /* whether NOFOLLOW was specified in a <meta name=robots> tag. */ struct urlpos *head, *tail; /* List of URLs that is being @@ -262,8 +265,8 @@ struct map_context { size. */ static struct urlpos * -append_one_url (const char *link_uri, int inlinep, - struct taginfo *tag, int attrind, struct map_context *ctx) +append_url (const char *link_uri, + struct taginfo *tag, int attrind, struct map_context *ctx) { int link_has_scheme = url_has_scheme (link_uri); struct urlpos *newel; @@ -319,14 +322,10 @@ append_one_url (const char *link_uri, int inlinep, DEBUGP (("appending \"%s\" to urlpos.\n", url->url)); - newel = (struct urlpos *)xmalloc (sizeof (struct urlpos)); - memset (newel, 0, sizeof (*newel)); - - newel->next = NULL; + newel = xnew0 (struct urlpos); newel->url = url; newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text; newel->size = tag->attrs[attrind].value_raw_size; - newel->link_inline_p = inlinep; /* A URL is relative if the host is not named, and the name does not start with `/'. 
*/ @@ -393,8 +392,15 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) if (0 == strcasecmp (tag->attrs[attrind].name, tag_url_attributes[i].attr_name)) { - int flags = tag_url_attributes[i].flags; - append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx); + struct urlpos *up = append_url (link, tag, attrind, ctx); + if (up) + { + int flags = tag_url_attributes[i].flags; + if (flags & ATTR_INLINE) + up->link_inline_p = 1; + if (flags & ATTR_HTML) + up->link_expect_html = 1; + } } } } @@ -411,7 +417,7 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx) if (!newbase) return; - base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx); + base_urlpos = append_url (newbase, tag, attrind, ctx); if (!base_urlpos) return; base_urlpos->ignore_when_downloading = 1; @@ -434,10 +440,9 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx) char *action = find_attr (tag, "action", &attrind); if (action) { - struct urlpos *action_urlpos = append_one_url (action, 0, tag, - attrind, ctx); - if (action_urlpos) - action_urlpos->ignore_when_downloading = 1; + struct urlpos *up = append_url (action, tag, attrind, ctx); + if (up) + up->ignore_when_downloading = 1; } } @@ -458,11 +463,19 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) */ if (href) { - char *rel = find_attr (tag, "rel", NULL); - int inlinep = (rel - && (0 == strcasecmp (rel, "stylesheet") - || 0 == strcasecmp (rel, "shortcut icon"))); - append_one_url (href, inlinep, tag, attrind, ctx); + struct urlpos *up = append_url (href, tag, attrind, ctx); + if (up) + { + char *rel = find_attr (tag, "rel", NULL); + if (rel + && (0 == strcasecmp (rel, "stylesheet") + || 0 == strcasecmp (rel, "shortcut icon"))) + up->link_inline_p = 1; + else + /* The external ones usually point to HTML pages, such as + <link rel="next" href="..."> */ + up->link_expect_html = 1; + } } } @@ -511,11 +524,12 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct 
map_context *ctx) while (ISSPACE (*p)) ++p; - entry = append_one_url (p, 0, tag, attrind, ctx); + entry = append_url (p, tag, attrind, ctx); if (entry) { entry->link_refresh_p = 1; entry->refresh_timeout = timeout; + entry->link_expect_html = 1; } } else if (name && 0 == strcasecmp (name, "robots")) @@ -526,7 +540,7 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) if (!content) return; if (!strcasecmp (content, "none")) - ctx->nofollow = 1; + ctx->nofollow = true; else { while (*content) @@ -539,7 +553,7 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) else end = content + strlen (content); if (!strncasecmp (content, "nofollow", end - content)) - ctx->nofollow = 1; + ctx->nofollow = true; content = end; } } @@ -567,7 +581,7 @@ collect_tags_mapper (struct taginfo *tag, void *arg) and does the right thing. */ struct urlpos * -get_urls_html (const char *file, const char *url, int *meta_disallow_follow) +get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) { struct file_memory *fm; struct map_context ctx; @@ -580,22 +594,25 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow) logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } - DEBUGP (("Loaded %s (size %ld).\n", file, fm->length)); + DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); ctx.text = fm->content; ctx.head = ctx.tail = NULL; ctx.base = NULL; ctx.parent_base = url ? url : opt.base_href; ctx.document_file = file; - ctx.nofollow = 0; + ctx.nofollow = false; if (!interesting_tags) init_interesting (); /* Specify MHT_TRIM_VALUES because of buggy HTML generators that - generate <a href=" foo"> instead of <a href="foo"> (Netscape - ignores spaces as well.) If you really mean space, use &32; or - %20. */ + generate <a href=" foo"> instead of <a href="foo"> (browsers + ignore spaces as well.) If you really mean space, use &32; or + %20. MHT_TRIM_VALUES also causes squashing of embedded newlines, + e.g. in <img src="foo.[newline]html">. 
Such newlines are also + ignored by IE and Mozilla and are presumably introduced by + writing HTML with editors that force word wrap. */ flags = MHT_TRIM_VALUES; if (opt.strict_comments) flags |= MHT_STRICT_COMMENTS; @@ -607,7 +624,7 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow) if (meta_disallow_follow) *meta_disallow_follow = ctx.nofollow; - FREE_MAYBE (ctx.base); + xfree_null (ctx.base); read_file_free (fm); return ctx.head; } @@ -629,7 +646,7 @@ get_urls_file (const char *file) logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } - DEBUGP (("Loaded %s (size %ld).\n", file, fm->length)); + DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); head = tail = NULL; text = fm->content; @@ -675,16 +692,14 @@ get_urls_file (const char *file) url = url_parse (url_text, &up_error_code); if (!url) { - logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n", + logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), file, url_text, url_error (up_error_code)); xfree (url_text); continue; } xfree (url_text); - entry = (struct urlpos *)xmalloc (sizeof (struct urlpos)); - memset (entry, 0, sizeof (*entry)); - entry->next = NULL; + entry = xnew0 (struct urlpos); entry->url = url; if (!head) @@ -700,6 +715,10 @@ get_urls_file (const char *file) void cleanup_html_url (void) { - FREE_MAYBE (interesting_tags); - FREE_MAYBE (interesting_attributes); + /* Destroy the hash tables. The hash table keys and values are not + allocated by this code, so we don't need to free them here. */ + if (interesting_tags) + hash_table_destroy (interesting_tags); + if (interesting_attributes) + hash_table_destroy (interesting_attributes); }