X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-url.c;h=2ce9172140fb774f46154906575d572c127f0818;hb=2447fb9a9b85083c1e6fa54d0a18bdf3962fab1f;hp=09962eddc9286a0f6a6569c697660caf9f2e96a8;hpb=ae1d264fcc190f9c74cb490aa6da0240b0b77b1e;p=wget diff --git a/src/html-url.c b/src/html-url.c index 09962edd..2ce91721 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -1,12 +1,12 @@ /* Collect URLs from HTML source. - Copyright (C) 1998, 2000, 2001, 2002 Free Software Foundation, Inc. + Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc. This file is part of GNU Wget. GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. + (at your option) any later version. GNU Wget is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -30,11 +30,7 @@ so, delete this exception statement from your version. */ #include #include -#ifdef HAVE_STRING_H -# include -#else -# include -#endif +#include #include #include #include @@ -43,19 +39,16 @@ so, delete this exception statement from your version. */ #include "html-parse.h" #include "url.h" #include "utils.h" +#include "hash.h" #include "convert.h" - -#ifndef errno -extern int errno; -#endif +#include "recur.h" /* declaration of get_urls_html */ struct map_context; -typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *, - struct map_context *)); +typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *); -#define DECLARE_TAG_HANDLER(fun) \ - static void fun PARAMS ((int, struct taginfo *, struct map_context *)) +#define DECLARE_TAG_HANDLER(fun) \ + static void fun (int, struct taginfo *, struct map_context *) DECLARE_TAG_HANDLER (tag_find_urls); DECLARE_TAG_HANDLER (tag_handle_base); @@ -63,64 +56,78 @@ DECLARE_TAG_HANDLER (tag_handle_form); DECLARE_TAG_HANDLER (tag_handle_link); DECLARE_TAG_HANDLER (tag_handle_meta); +enum { + TAG_A, + TAG_APPLET, + TAG_AREA, + TAG_BASE, + TAG_BGSOUND, + TAG_BODY, + TAG_EMBED, + TAG_FIG, + TAG_FORM, + TAG_FRAME, + TAG_IFRAME, + TAG_IMG, + TAG_INPUT, + TAG_LAYER, + TAG_LINK, + TAG_META, + TAG_OBJECT, + TAG_OVERLAY, + TAG_SCRIPT, + TAG_TABLE, + TAG_TD, + TAG_TH +}; + /* The list of known tags and functions used for handling them. Most tags are simply harvested for URLs. */ -static struct { +static struct known_tag { + int tagid; const char *name; tag_handler_t handler; } known_tags[] = { -#define TAG_A 0 - { "a", tag_find_urls }, -#define TAG_APPLET 1 - { "applet", tag_find_urls }, -#define TAG_AREA 2 - { "area", tag_find_urls }, -#define TAG_BASE 3 - { "base", tag_handle_base }, -#define TAG_BGSOUND 4 - { "bgsound", tag_find_urls }, -#define TAG_BODY 5 - { "body", tag_find_urls }, -#define TAG_EMBED 6 - { "embed", tag_find_urls }, -#define TAG_FIG 7 - { "fig", tag_find_urls }, -#define TAG_FORM 8 - { "form", tag_handle_form }, -#define TAG_FRAME 9 - { "frame", tag_find_urls }, -#define TAG_IFRAME 10 - { "iframe", tag_find_urls }, -#define TAG_IMG 11 - { "img", tag_find_urls }, -#define TAG_INPUT 12 - { "input", tag_find_urls }, -#define TAG_LAYER 13 - { "layer", tag_find_urls }, -#define TAG_LINK 14 - { "link", tag_handle_link }, -#define TAG_META 15 - { "meta", tag_handle_meta }, -#define TAG_OVERLAY 16 - { "overlay", tag_find_urls }, -#define TAG_SCRIPT 17 - { "script", tag_find_urls }, -#define TAG_TABLE 18 - { "table", tag_find_urls }, -#define TAG_TD 19 - { "td", tag_find_urls }, -#define TAG_TH 20 - { "th", tag_find_urls } + { TAG_A, "a", tag_find_urls }, + { TAG_APPLET, "applet", tag_find_urls }, + { TAG_AREA, "area", tag_find_urls }, + { TAG_BASE, "base", tag_handle_base }, + { TAG_BGSOUND, "bgsound", tag_find_urls }, + { TAG_BODY, "body", tag_find_urls }, + { TAG_EMBED, "embed", tag_find_urls }, + { TAG_FIG, "fig", tag_find_urls }, + { TAG_FORM, "form", tag_handle_form }, + { TAG_FRAME, "frame", tag_find_urls }, + { TAG_IFRAME, "iframe", tag_find_urls }, + { TAG_IMG, "img", tag_find_urls }, + { TAG_INPUT, "input", tag_find_urls }, + { TAG_LAYER, "layer", tag_find_urls }, + { TAG_LINK, "link", tag_handle_link }, + { TAG_META, "meta", tag_handle_meta }, + { TAG_OBJECT, "object", tag_find_urls }, + { TAG_OVERLAY, "overlay", tag_find_urls }, + { TAG_SCRIPT, "script", tag_find_urls }, + { TAG_TABLE, "table", tag_find_urls }, + { TAG_TD, "td", tag_find_urls }, + { TAG_TH, "th", tag_find_urls } }; /* tag_url_attributes documents which attributes of which tags contain URLs to harvest. It is used by tag_find_urls. */ -/* Defines for the FLAGS field; currently only one flag is defined. */ +/* Defines for the FLAGS. */ -/* This tag points to an external document not necessary for rendering this - document (i.e. it's not an inlined image, stylesheet, etc.). */ -#define TUA_EXTERNAL 1 +/* The link is "inline", i.e. needs to be retrieved for this document + to be correctly rendered. Inline links include inlined images, + stylesheets, children frames, etc. */ +#define ATTR_INLINE 1 + +/* The link is expected to yield HTML contents. It's important not to + try to follow HTML obtained by following e.g. + regardless of content-type. Doing this causes infinite loops for + "images" that return non-404 error pages with links to the same + image. */ +#define ATTR_HTML 2 /* For tags handled by tag_find_urls: attributes that contain URLs to download. */ @@ -129,26 +136,27 @@ static struct { const char *attr_name; int flags; } tag_url_attributes[] = { - { TAG_A, "href", TUA_EXTERNAL }, - { TAG_APPLET, "code", 0 }, - { TAG_AREA, "href", TUA_EXTERNAL }, - { TAG_BGSOUND, "src", 0 }, - { TAG_BODY, "background", 0 }, - { TAG_EMBED, "href", TUA_EXTERNAL }, - { TAG_EMBED, "src", 0 }, - { TAG_FIG, "src", 0 }, - { TAG_FRAME, "src", 0 }, - { TAG_IFRAME, "src", 0 }, - { TAG_IMG, "href", 0 }, - { TAG_IMG, "lowsrc", 0 }, - { TAG_IMG, "src", 0 }, - { TAG_INPUT, "src", 0 }, - { TAG_LAYER, "src", 0 }, - { TAG_OVERLAY, "src", 0 }, - { TAG_SCRIPT, "src", 0 }, - { TAG_TABLE, "background", 0 }, - { TAG_TD, "background", 0 }, - { TAG_TH, "background", 0 } + { TAG_A, "href", ATTR_HTML }, + { TAG_APPLET, "code", ATTR_INLINE }, + { TAG_AREA, "href", ATTR_HTML }, + { TAG_BGSOUND, "src", ATTR_INLINE }, + { TAG_BODY, "background", ATTR_INLINE }, + { TAG_EMBED, "href", ATTR_HTML }, + { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_FIG, "src", ATTR_INLINE }, + { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_IMG, "href", ATTR_INLINE }, + { TAG_IMG, "lowsrc", ATTR_INLINE }, + { TAG_IMG, "src", ATTR_INLINE }, + { TAG_INPUT, "src", ATTR_INLINE }, + { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_OBJECT, "data", ATTR_INLINE }, + { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_SCRIPT, "src", ATTR_INLINE }, + { TAG_TABLE, "background", ATTR_INLINE }, + { TAG_TD, "background", ATTR_INLINE }, + { TAG_TH, "background", ATTR_INLINE } }; /* The lists of interesting tags and attributes are built dynamically, @@ -162,8 +170,8 @@ static const char *additional_attributes[] = { "action" /* used by tag_handle_form */ }; -static const char **interesting_tags; -static const char **interesting_attributes; +static struct hash_table *interesting_tags; +static struct hash_table *interesting_attributes; static void init_interesting (void) @@ -175,125 +183,49 @@ init_interesting (void) Here we also make sure that what we put in interesting_tags matches the user's preferences as specified through --ignore-tags - and --follow-tags. - - This function is as large as this only because of the glorious - expressivity of the C programming language. */ - - { - int i, ind = 0; - int size = countof (known_tags); - interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *)); - - for (i = 0; i < size; i++) - { - const char *name = known_tags[i].name; - - /* Normally here we could say: - interesting_tags[i] = name; - But we need to respect the settings of --ignore-tags and - --follow-tags, so the code gets a bit hairier. */ - - if (opt.ignore_tags) - { - /* --ignore-tags was specified. Do not match these - specific tags. --ignore-tags takes precedence over - --follow-tags, so we process --ignore first and fall - through if there's no match. */ - int j, lose = 0; - for (j = 0; opt.ignore_tags[j] != NULL; j++) - /* Loop through all the tags this user doesn't care about. */ - if (strcasecmp(opt.ignore_tags[j], name) == EQ) - { - lose = 1; - break; - } - if (lose) - continue; - } - - if (opt.follow_tags) - { - /* --follow-tags was specified. Only match these specific tags, so - continue back to top of for if we don't match one of them. */ - int j, win = 0; - for (j = 0; opt.follow_tags[j] != NULL; j++) - /* Loop through all the tags this user cares about. */ - if (strcasecmp(opt.follow_tags[j], name) == EQ) - { - win = 1; - break; - } - if (!win) - continue; /* wasn't one of the explicitly desired tags */ - } - - /* If we get to here, --follow-tags isn't being used or the - tag is among the ones that are followed, and --ignore-tags, - if specified, didn't include this tag, so it's an - "interesting" one. */ - interesting_tags[ind++] = name; - } - interesting_tags[ind] = NULL; - } - - /* The same for attributes, except we loop through tag_url_attributes. - Here we also need to make sure that the list of attributes is - unique, and to include the attributes from additional_attributes. */ - { - int i, ind; - const char **att = xmalloc ((countof (additional_attributes) + 1) - * sizeof (char *)); - /* First copy the "additional" attributes. */ - for (i = 0; i < countof (additional_attributes); i++) - att[i] = additional_attributes[i]; - ind = i; - att[ind] = NULL; - for (i = 0; i < countof (tag_url_attributes); i++) - { - int j, seen = 0; - const char *look_for = tag_url_attributes[i].attr_name; - for (j = 0; j < ind - 1; j++) - if (!strcmp (att[j], look_for)) - { - seen = 1; - break; - } - if (!seen) - { - att = xrealloc (att, (ind + 2) * sizeof (*att)); - att[ind++] = look_for; - att[ind] = NULL; - } - } - interesting_attributes = att; - } -} + and --follow-tags. */ -/* Find tag with name TAG_NAME in KNOWN_TAGS and return its index. */ + int i; + interesting_tags = make_nocase_string_hash_table (countof (known_tags)); -static int -find_tag (const char *tag_name) -{ - /* Originally implemented as linear search. In Wget 1.9 known_tags - contains 21 elements, for which binary search requires max. 5 - comparisons, whereas linear search performs 10 on average. */ + /* First, add all the tags we know hot to handle, mapped to their + respective entries in known_tags. */ + for (i = 0; i < countof (known_tags); i++) + hash_table_put (interesting_tags, known_tags[i].name, known_tags + i); - int lo = 0, hi = countof (known_tags) - 1; + /* Then remove the tags ignored through --ignore-tags. */ + if (opt.ignore_tags) + { + char **ignored; + for (ignored = opt.ignore_tags; *ignored; ignored++) + hash_table_remove (interesting_tags, *ignored); + } - while (lo <= hi) + /* If --follow-tags is specified, use only those tags. */ + if (opt.follow_tags) { - int mid = (lo + hi) >> 1; - int cmp = strcasecmp (tag_name, known_tags[mid].name); - if (cmp < 0) - hi = mid - 1; - else if (cmp > 0) - lo = mid + 1; - else - return mid; + /* Create a new table intersecting --follow-tags and known_tags, + and use it as interesting_tags. */ + struct hash_table *intersect = make_nocase_string_hash_table (0); + char **followed; + for (followed = opt.follow_tags; *followed; followed++) + { + struct known_tag *t = hash_table_get (interesting_tags, *followed); + if (!t) + continue; /* ignore unknown --follow-tags entries. */ + hash_table_put (intersect, *followed, t); + } + hash_table_destroy (interesting_tags); + interesting_tags = intersect; } - return -1; + /* Add the attributes we care about. */ + interesting_attributes = make_nocase_string_hash_table (10); + for (i = 0; i < countof (additional_attributes); i++) + hash_table_put (interesting_attributes, additional_attributes[i], "1"); + for (i = 0; i < countof (tag_url_attributes); i++) + hash_table_put (interesting_attributes, + tag_url_attributes[i].attr_name, "1"); } /* Find the value of attribute named NAME in the taginfo TAG. If the @@ -320,7 +252,7 @@ struct map_context { changed through . */ const char *parent_base; /* Base of the current document. */ const char *document_file; /* File name of this document. */ - int nofollow; /* whether NOFOLLOW was specified in a + bool nofollow; /* whether NOFOLLOW was specified in a tag. */ struct urlpos *head, *tail; /* List of URLs that is being @@ -334,8 +266,8 @@ struct map_context { size. */ static struct urlpos * -append_one_url (const char *link_uri, int inlinep, - struct taginfo *tag, int attrind, struct map_context *ctx) +append_url (const char *link_uri, + struct taginfo *tag, int attrind, struct map_context *ctx) { int link_has_scheme = url_has_scheme (link_uri); struct urlpos *newel; @@ -391,14 +323,10 @@ append_one_url (const char *link_uri, int inlinep, DEBUGP (("appending \"%s\" to urlpos.\n", url->url)); - newel = (struct urlpos *)xmalloc (sizeof (struct urlpos)); - memset (newel, 0, sizeof (*newel)); - - newel->next = NULL; + newel = xnew0 (struct urlpos); newel->url = url; newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text; newel->size = tag->attrs[attrind].value_raw_size; - newel->link_inline_p = inlinep; /* A URL is relative if the host is not named, and the name does not start with `/'. */ @@ -427,10 +355,10 @@ append_one_url (const char *link_uri, int inlinep, static void tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) { - int i, attrind, first = -1; - int size = countof (tag_url_attributes); + int i, attrind; + int first = -1; - for (i = 0; i < size; i++) + for (i = 0; i < countof (tag_url_attributes); i++) if (tag_url_attributes[i].tagid == tagid) { /* We've found the index of tag_url_attributes where the @@ -454,18 +382,26 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) /* Find whether TAG/ATTRIND is a combination that contains a URL. */ char *link = tag->attrs[attrind].value; + const int size = countof (tag_url_attributes); /* If you're cringing at the inefficiency of the nested loops, - remember that they both iterate over a laughably small - quantity of items. The worst-case inner loop is for the IMG - tag, which has three attributes. */ + remember that they both iterate over a very small number of + items. The worst-case inner loop is for the IMG tag, which + has three attributes. */ for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++) { if (0 == strcasecmp (tag->attrs[attrind].name, tag_url_attributes[i].attr_name)) { - int flags = tag_url_attributes[i].flags; - append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx); + struct urlpos *up = append_url (link, tag, attrind, ctx); + if (up) + { + int flags = tag_url_attributes[i].flags; + if (flags & ATTR_INLINE) + up->link_inline_p = 1; + if (flags & ATTR_HTML) + up->link_expect_html = 1; + } } } } @@ -482,7 +418,7 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx) if (!newbase) return; - base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx); + base_urlpos = append_url (newbase, tag, attrind, ctx); if (!base_urlpos) return; base_urlpos->ignore_when_downloading = 1; @@ -505,10 +441,9 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx) char *action = find_attr (tag, "action", &attrind); if (action) { - struct urlpos *action_urlpos = append_one_url (action, 0, tag, - attrind, ctx); - if (action_urlpos) - action_urlpos->ignore_when_downloading = 1; + struct urlpos *up = append_url (action, tag, attrind, ctx); + if (up) + up->ignore_when_downloading = 1; } } @@ -529,11 +464,19 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) */ if (href) { - char *rel = find_attr (tag, "rel", NULL); - int inlinep = (rel - && (0 == strcasecmp (rel, "stylesheet") - || 0 == strcasecmp (rel, "shortcut icon"))); - append_one_url (href, inlinep, tag, attrind, ctx); + struct urlpos *up = append_url (href, tag, attrind, ctx); + if (up) + { + char *rel = find_attr (tag, "rel", NULL); + if (rel + && (0 == strcasecmp (rel, "stylesheet") + || 0 == strcasecmp (rel, "shortcut icon"))) + up->link_inline_p = 1; + else + /* The external ones usually point to HTML pages, such as + */ + up->link_expect_html = 1; + } } } @@ -582,11 +525,12 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) while (ISSPACE (*p)) ++p; - entry = append_one_url (p, 0, tag, attrind, ctx); + entry = append_url (p, tag, attrind, ctx); if (entry) { entry->link_refresh_p = 1; entry->refresh_timeout = timeout; + entry->link_expect_html = 1; } } else if (name && 0 == strcasecmp (name, "robots")) @@ -597,7 +541,7 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) if (!content) return; if (!strcasecmp (content, "none")) - ctx->nofollow = 1; + ctx->nofollow = true; else { while (*content) @@ -610,28 +554,27 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) else end = content + strlen (content); if (!strncasecmp (content, "nofollow", end - content)) - ctx->nofollow = 1; + ctx->nofollow = true; content = end; } } } } -/* Examine name and attributes of TAG and take appropriate action - according to the tag. */ +/* Dispatch the tag handler appropriate for the tag we're mapping + over. See known_tags[] for definition of tag handlers. */ static void collect_tags_mapper (struct taginfo *tag, void *arg) { struct map_context *ctx = (struct map_context *)arg; - int tagid; - tag_handler_t handler; - tagid = find_tag (tag->name); - assert (tagid != -1); - handler = known_tags[tagid].handler; + /* Find the tag in our table of tags. This must not fail because + map_html_tags only returns tags found in interesting_tags. */ + struct known_tag *t = hash_table_get (interesting_tags, tag->name); + assert (t != NULL); - handler (tagid, tag, ctx); + t->handler (t->tagid, tag, ctx); } /* Analyze HTML tags FILE and construct a list of URLs referenced from @@ -639,7 +582,7 @@ collect_tags_mapper (struct taginfo *tag, void *arg) and does the right thing. */ struct urlpos * -get_urls_html (const char *file, const char *url, int *meta_disallow_follow) +get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) { struct file_memory *fm; struct map_context ctx; @@ -652,22 +595,25 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow) logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } - DEBUGP (("Loaded %s (size %ld).\n", file, fm->length)); + DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); ctx.text = fm->content; ctx.head = ctx.tail = NULL; ctx.base = NULL; ctx.parent_base = url ? url : opt.base_href; ctx.document_file = file; - ctx.nofollow = 0; + ctx.nofollow = false; if (!interesting_tags) init_interesting (); /* Specify MHT_TRIM_VALUES because of buggy HTML generators that - generate instead of (Netscape - ignores spaces as well.) If you really mean space, use &32; or - %20. */ + generate instead of (browsers + ignore spaces as well.) If you really mean space, use &32; or + %20. MHT_TRIM_VALUES also causes squashing of embedded newlines, + e.g. in . Such newlines are also + ignored by IE and Mozilla and are presumably introduced by + writing HTML with editors that force word wrap. */ flags = MHT_TRIM_VALUES; if (opt.strict_comments) flags |= MHT_STRICT_COMMENTS; @@ -679,7 +625,7 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow) if (meta_disallow_follow) *meta_disallow_follow = ctx.nofollow; - FREE_MAYBE (ctx.base); + xfree_null (ctx.base); read_file_free (fm); return ctx.head; } @@ -701,7 +647,7 @@ get_urls_file (const char *file) logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } - DEBUGP (("Loaded %s (size %ld).\n", file, fm->length)); + DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); head = tail = NULL; text = fm->content; @@ -747,16 +693,14 @@ get_urls_file (const char *file) url = url_parse (url_text, &up_error_code); if (!url) { - logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n", + logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), file, url_text, url_error (up_error_code)); xfree (url_text); continue; } xfree (url_text); - entry = (struct urlpos *)xmalloc (sizeof (struct urlpos)); - memset (entry, 0, sizeof (*entry)); - entry->next = NULL; + entry = xnew0 (struct urlpos); entry->url = url; if (!head) @@ -772,6 +716,10 @@ get_urls_file (const char *file) void cleanup_html_url (void) { - FREE_MAYBE (interesting_tags); - FREE_MAYBE (interesting_attributes); + /* Destroy the hash tables. The hash table keys and values are not + allocated by this code, so we don't need to free them here. */ + if (interesting_tags) + hash_table_destroy (interesting_tags); + if (interesting_attributes) + hash_table_destroy (interesting_attributes); }