X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-url.c;h=3c7c409e82e4c9f48e490abee7f4131a9113da0f;hb=4d7c5e087b2bc82c9f503dff003916d1047903ce;hp=57ad8b5b21578b46ed06d3f2380372c8958fcc21;hpb=233ebb78de296361d5a25d0856e0957de1058f15;p=wget diff --git a/src/html-url.c b/src/html-url.c index 57ad8b5b..3c7c409e 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -1,11 +1,11 @@ /* Collect URLs from HTML source. - Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc. + Copyright (C) 1998-2006 Free Software Foundation, Inc. This file is part of GNU Wget. GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or +the Free Software Foundation; either version 3 of the License, or (at your option) any later version. GNU Wget is distributed in the hope that it will be useful, @@ -14,8 +14,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with Wget; if not, write to the Free Software -Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +along with Wget. If not, see <http://www.gnu.org/licenses/>. In addition, as a special exception, the Free Software Foundation gives permission to link the code of its release of Wget with the @@ -30,11 +29,7 @@ so, delete this exception statement from your version. */ #include <config.h> #include <stdio.h> -#ifdef HAVE_STRING_H -# include <string.h> -#else -# include <strings.h> -#endif +#include <string.h> #include <stdlib.h> #include <errno.h> #include <assert.h> @@ -45,18 +40,14 @@ so, delete this exception statement from your version. 
*/ #include "utils.h" #include "hash.h" #include "convert.h" - -#ifndef errno -extern int errno; -#endif +#include "recur.h" /* declaration of get_urls_html */ struct map_context; -typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *, - struct map_context *)); +typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *); -#define DECLARE_TAG_HANDLER(fun) \ - static void fun PARAMS ((int, struct taginfo *, struct map_context *)) +#define DECLARE_TAG_HANDLER(fun) \ + static void fun (int, struct taginfo *, struct map_context *) DECLARE_TAG_HANDLER (tag_find_urls); DECLARE_TAG_HANDLER (tag_handle_base); @@ -81,6 +72,7 @@ enum { TAG_LAYER, TAG_LINK, TAG_META, + TAG_OBJECT, TAG_OVERLAY, TAG_SCRIPT, TAG_TABLE, @@ -111,6 +103,7 @@ static struct known_tag { { TAG_LAYER, "layer", tag_find_urls }, { TAG_LINK, "link", tag_handle_link }, { TAG_META, "meta", tag_handle_meta }, + { TAG_OBJECT, "object", tag_find_urls }, { TAG_OVERLAY, "overlay", tag_find_urls }, { TAG_SCRIPT, "script", tag_find_urls }, { TAG_TABLE, "table", tag_find_urls }, @@ -121,11 +114,19 @@ static struct known_tag { /* tag_url_attributes documents which attributes of which tags contain URLs to harvest. It is used by tag_find_urls. */ -/* Defines for the FLAGS field; currently only one flag is defined. */ +/* Defines for the FLAGS. */ + +/* The link is "inline", i.e. needs to be retrieved for this document + to be correctly rendered. Inline links include inlined images, + stylesheets, children frames, etc. */ +#define ATTR_INLINE 1 -/* This tag points to an external document not necessary for rendering this - document (i.e. it's not an inlined image, stylesheet, etc.). */ -#define TUA_EXTERNAL 1 +/* The link is expected to yield HTML contents. It's important not to + try to follow HTML obtained by following e.g. <img src="..."> + regardless of content-type. Doing this causes infinite loops for + "images" that return non-404 error pages with links to the same + image. 
*/ +#define ATTR_HTML 2 /* For tags handled by tag_find_urls: attributes that contain URLs to download. */ @@ -134,26 +135,27 @@ static struct { const char *attr_name; int flags; } tag_url_attributes[] = { - { TAG_A, "href", TUA_EXTERNAL }, - { TAG_APPLET, "code", 0 }, - { TAG_AREA, "href", TUA_EXTERNAL }, - { TAG_BGSOUND, "src", 0 }, - { TAG_BODY, "background", 0 }, - { TAG_EMBED, "href", TUA_EXTERNAL }, - { TAG_EMBED, "src", 0 }, - { TAG_FIG, "src", 0 }, - { TAG_FRAME, "src", 0 }, - { TAG_IFRAME, "src", 0 }, - { TAG_IMG, "href", 0 }, - { TAG_IMG, "lowsrc", 0 }, - { TAG_IMG, "src", 0 }, - { TAG_INPUT, "src", 0 }, - { TAG_LAYER, "src", 0 }, - { TAG_OVERLAY, "src", 0 }, - { TAG_SCRIPT, "src", 0 }, - { TAG_TABLE, "background", 0 }, - { TAG_TD, "background", 0 }, - { TAG_TH, "background", 0 } + { TAG_A, "href", ATTR_HTML }, + { TAG_APPLET, "code", ATTR_INLINE }, + { TAG_AREA, "href", ATTR_HTML }, + { TAG_BGSOUND, "src", ATTR_INLINE }, + { TAG_BODY, "background", ATTR_INLINE }, + { TAG_EMBED, "href", ATTR_HTML }, + { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_FIG, "src", ATTR_INLINE }, + { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_IMG, "href", ATTR_INLINE }, + { TAG_IMG, "lowsrc", ATTR_INLINE }, + { TAG_IMG, "src", ATTR_INLINE }, + { TAG_INPUT, "src", ATTR_INLINE }, + { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_OBJECT, "data", ATTR_INLINE }, + { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_SCRIPT, "src", ATTR_INLINE }, + { TAG_TABLE, "background", ATTR_INLINE }, + { TAG_TD, "background", ATTR_INLINE }, + { TAG_TH, "background", ATTR_INLINE } }; /* The lists of interesting tags and attributes are built dynamically, @@ -167,8 +169,8 @@ static const char *additional_attributes[] = { "action" /* used by tag_handle_form */ }; -struct hash_table *interesting_tags; -struct hash_table *interesting_attributes; +static struct hash_table *interesting_tags; +static struct hash_table 
*interesting_attributes; static void init_interesting (void) @@ -201,16 +203,15 @@ init_interesting (void) /* If --follow-tags is specified, use only those tags. */ if (opt.follow_tags) { - /* Create a new hash table with the intersection of tags in - --follow-tags and known_tags, and use that as - interesting_tags. */ + /* Create a new table intersecting --follow-tags and known_tags, + and use it as interesting_tags. */ struct hash_table *intersect = make_nocase_string_hash_table (0); char **followed; for (followed = opt.follow_tags; *followed; followed++) { struct known_tag *t = hash_table_get (interesting_tags, *followed); if (!t) - continue; /* ignore unknown tags in --follow-tags. */ + continue; /* ignore unknown --follow-tags entries. */ hash_table_put (intersect, *followed, t); } hash_table_destroy (interesting_tags); @@ -218,11 +219,12 @@ init_interesting (void) } /* Add the attributes we care about. */ - interesting_attributes = make_nocase_string_hash_table (17); + interesting_attributes = make_nocase_string_hash_table (10); for (i = 0; i < countof (additional_attributes); i++) - string_set_add (interesting_attributes, additional_attributes[i]); + hash_table_put (interesting_attributes, additional_attributes[i], "1"); for (i = 0; i < countof (tag_url_attributes); i++) - string_set_add (interesting_attributes, tag_url_attributes[i].attr_name); + hash_table_put (interesting_attributes, + tag_url_attributes[i].attr_name, "1"); } /* Find the value of attribute named NAME in the taginfo TAG. If the @@ -249,7 +251,7 @@ struct map_context { changed through <base href=...>. */ const char *parent_base; /* Base of the current document. */ const char *document_file; /* File name of this document. */ - int nofollow; /* whether NOFOLLOW was specified in a + bool nofollow; /* whether NOFOLLOW was specified in a <meta name=robots> tag. */ struct urlpos *head, *tail; /* List of URLs that is being @@ -263,8 +265,8 @@ struct map_context { size. 
*/ static struct urlpos * -append_one_url (const char *link_uri, int inlinep, - struct taginfo *tag, int attrind, struct map_context *ctx) +append_url (const char *link_uri, + struct taginfo *tag, int attrind, struct map_context *ctx) { int link_has_scheme = url_has_scheme (link_uri); struct urlpos *newel; @@ -320,14 +322,10 @@ append_one_url (const char *link_uri, int inlinep, DEBUGP (("appending \"%s\" to urlpos.\n", url->url)); - newel = (struct urlpos *)xmalloc (sizeof (struct urlpos)); - memset (newel, 0, sizeof (*newel)); - - newel->next = NULL; + newel = xnew0 (struct urlpos); newel->url = url; newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text; newel->size = tag->attrs[attrind].value_raw_size; - newel->link_inline_p = inlinep; /* A URL is relative if the host is not named, and the name does not start with `/'. */ @@ -394,8 +392,15 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) if (0 == strcasecmp (tag->attrs[attrind].name, tag_url_attributes[i].attr_name)) { - int flags = tag_url_attributes[i].flags; - append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx); + struct urlpos *up = append_url (link, tag, attrind, ctx); + if (up) + { + int flags = tag_url_attributes[i].flags; + if (flags & ATTR_INLINE) + up->link_inline_p = 1; + if (flags & ATTR_HTML) + up->link_expect_html = 1; + } } } } @@ -412,7 +417,7 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx) if (!newbase) return; - base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx); + base_urlpos = append_url (newbase, tag, attrind, ctx); if (!base_urlpos) return; base_urlpos->ignore_when_downloading = 1; @@ -435,10 +440,9 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx) char *action = find_attr (tag, "action", &attrind); if (action) { - struct urlpos *action_urlpos = append_one_url (action, 0, tag, - attrind, ctx); - if (action_urlpos) - action_urlpos->ignore_when_downloading = 1; + struct urlpos 
*up = append_url (action, tag, attrind, ctx); + if (up) + up->ignore_when_downloading = 1; } } @@ -459,11 +463,19 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) */ if (href) { - char *rel = find_attr (tag, "rel", NULL); - int inlinep = (rel - && (0 == strcasecmp (rel, "stylesheet") - || 0 == strcasecmp (rel, "shortcut icon"))); - append_one_url (href, inlinep, tag, attrind, ctx); + struct urlpos *up = append_url (href, tag, attrind, ctx); + if (up) + { + char *rel = find_attr (tag, "rel", NULL); + if (rel + && (0 == strcasecmp (rel, "stylesheet") + || 0 == strcasecmp (rel, "shortcut icon"))) + up->link_inline_p = 1; + else + /* The external ones usually point to HTML pages, such as + <link rel="next" href="..."> */ + up->link_expect_html = 1; + } } } @@ -512,11 +524,12 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) while (ISSPACE (*p)) ++p; - entry = append_one_url (p, 0, tag, attrind, ctx); + entry = append_url (p, tag, attrind, ctx); if (entry) { entry->link_refresh_p = 1; entry->refresh_timeout = timeout; + entry->link_expect_html = 1; } } else if (name && 0 == strcasecmp (name, "robots")) @@ -527,7 +540,7 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) if (!content) return; if (!strcasecmp (content, "none")) - ctx->nofollow = 1; + ctx->nofollow = true; else { while (*content) @@ -540,7 +553,7 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) else end = content + strlen (content); if (!strncasecmp (content, "nofollow", end - content)) - ctx->nofollow = 1; + ctx->nofollow = true; content = end; } } @@ -568,7 +581,7 @@ collect_tags_mapper (struct taginfo *tag, void *arg) and does the right thing. 
*/ struct urlpos * -get_urls_html (const char *file, const char *url, int *meta_disallow_follow) +get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) { struct file_memory *fm; struct map_context ctx; @@ -581,22 +594,25 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow) logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } - DEBUGP (("Loaded %s (size %ld).\n", file, fm->length)); + DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); ctx.text = fm->content; ctx.head = ctx.tail = NULL; ctx.base = NULL; ctx.parent_base = url ? url : opt.base_href; ctx.document_file = file; - ctx.nofollow = 0; + ctx.nofollow = false; if (!interesting_tags) init_interesting (); /* Specify MHT_TRIM_VALUES because of buggy HTML generators that - generate <a href=" foo"> instead of <a href="foo"> (Netscape - ignores spaces as well.) If you really mean space, use &32; or - %20. */ + generate <a href=" foo"> instead of <a href="foo"> (browsers + ignore spaces as well.) If you really mean space, use &32; or + %20. MHT_TRIM_VALUES also causes squashing of embedded newlines, + e.g. in <img src="foo.[newline]html">. Such newlines are also + ignored by IE and Mozilla and are presumably introduced by + writing HTML with editors that force word wrap. 
*/ flags = MHT_TRIM_VALUES; if (opt.strict_comments) flags |= MHT_STRICT_COMMENTS; @@ -608,7 +624,7 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow) if (meta_disallow_follow) *meta_disallow_follow = ctx.nofollow; - FREE_MAYBE (ctx.base); + xfree_null (ctx.base); read_file_free (fm); return ctx.head; } @@ -630,7 +646,7 @@ get_urls_file (const char *file) logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } - DEBUGP (("Loaded %s (size %ld).\n", file, fm->length)); + DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); head = tail = NULL; text = fm->content; @@ -676,16 +692,14 @@ get_urls_file (const char *file) url = url_parse (url_text, &up_error_code); if (!url) { - logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n", + logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), file, url_text, url_error (up_error_code)); xfree (url_text); continue; } xfree (url_text); - entry = (struct urlpos *)xmalloc (sizeof (struct urlpos)); - memset (entry, 0, sizeof (*entry)); - entry->next = NULL; + entry = xnew0 (struct urlpos); entry->url = url; if (!head) @@ -701,6 +715,10 @@ get_urls_file (const char *file) void cleanup_html_url (void) { - FREE_MAYBE (interesting_tags); - FREE_MAYBE (interesting_attributes); + /* Destroy the hash tables. The hash table keys and values are not + allocated by this code, so we don't need to free them here. */ + if (interesting_tags) + hash_table_destroy (interesting_tags); + if (interesting_attributes) + hash_table_destroy (interesting_attributes); }