1 /* Collect URLs from HTML source.
2 Copyright (C) 1998-2006 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget. If not, see <http://www.gnu.org/licenses/>.
19 In addition, as a special exception, the Free Software Foundation
20 gives permission to link the code of its release of Wget with the
21 OpenSSL project's "OpenSSL" library (or with modified versions of it
22 that use the same license as the "OpenSSL" library), and distribute
23 the linked executables. You must obey the GNU General Public License
24 in all respects for all of the code used other than "OpenSSL". If you
25 modify this file, you may extend this exception to your version of the
26 file, but you are not obligated to do so. If you do not wish to do
27 so, delete this exception statement from your version. */
38 #include "html-parse.h"
43 #include "recur.h" /* declaration of get_urls_html */
47 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
49 #define DECLARE_TAG_HANDLER(fun) \
50 static void fun (int, struct taginfo *, struct map_context *)
52 DECLARE_TAG_HANDLER (tag_find_urls);
53 DECLARE_TAG_HANDLER (tag_handle_base);
54 DECLARE_TAG_HANDLER (tag_handle_form);
55 DECLARE_TAG_HANDLER (tag_handle_link);
56 DECLARE_TAG_HANDLER (tag_handle_meta);
83 /* The list of known tags and functions used for handling them. Most
84 tags are simply harvested for URLs. */
85 static struct known_tag {
88 tag_handler_t handler;
90 { TAG_A, "a", tag_find_urls },
91 { TAG_APPLET, "applet", tag_find_urls },
92 { TAG_AREA, "area", tag_find_urls },
93 { TAG_BASE, "base", tag_handle_base },
94 { TAG_BGSOUND, "bgsound", tag_find_urls },
95 { TAG_BODY, "body", tag_find_urls },
96 { TAG_EMBED, "embed", tag_find_urls },
97 { TAG_FIG, "fig", tag_find_urls },
98 { TAG_FORM, "form", tag_handle_form },
99 { TAG_FRAME, "frame", tag_find_urls },
100 { TAG_IFRAME, "iframe", tag_find_urls },
101 { TAG_IMG, "img", tag_find_urls },
102 { TAG_INPUT, "input", tag_find_urls },
103 { TAG_LAYER, "layer", tag_find_urls },
104 { TAG_LINK, "link", tag_handle_link },
105 { TAG_META, "meta", tag_handle_meta },
106 { TAG_OBJECT, "object", tag_find_urls },
107 { TAG_OVERLAY, "overlay", tag_find_urls },
108 { TAG_SCRIPT, "script", tag_find_urls },
109 { TAG_TABLE, "table", tag_find_urls },
110 { TAG_TD, "td", tag_find_urls },
111 { TAG_TH, "th", tag_find_urls }
114 /* tag_url_attributes documents which attributes of which tags contain
115 URLs to harvest. It is used by tag_find_urls. */
117 /* Defines for the FLAGS. */
119 /* The link is "inline", i.e. needs to be retrieved for this document
120 to be correctly rendered. Inline links include inlined images,
121 stylesheets, child frames, etc. */
122 #define ATTR_INLINE 1
124 /* The link is expected to yield HTML contents. It's important not to
125 try to follow HTML obtained by following e.g. <img src="...">
126 regardless of content-type. Doing this causes infinite loops for
127 "images" that return non-404 error pages with links to the same
131 /* For tags handled by tag_find_urls: attributes that contain URLs to
135 const char *attr_name;
137 } tag_url_attributes[] = {
138 { TAG_A, "href", ATTR_HTML },
139 { TAG_APPLET, "code", ATTR_INLINE },
140 { TAG_AREA, "href", ATTR_HTML },
141 { TAG_BGSOUND, "src", ATTR_INLINE },
142 { TAG_BODY, "background", ATTR_INLINE },
143 { TAG_EMBED, "href", ATTR_HTML },
144 { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },
145 { TAG_FIG, "src", ATTR_INLINE },
146 { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },
147 { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },
148 { TAG_IMG, "href", ATTR_INLINE },
149 { TAG_IMG, "lowsrc", ATTR_INLINE },
150 { TAG_IMG, "src", ATTR_INLINE },
151 { TAG_INPUT, "src", ATTR_INLINE },
152 { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
153 { TAG_OBJECT, "data", ATTR_INLINE },
154 { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
155 { TAG_SCRIPT, "src", ATTR_INLINE },
156 { TAG_TABLE, "background", ATTR_INLINE },
157 { TAG_TD, "background", ATTR_INLINE },
158 { TAG_TH, "background", ATTR_INLINE }
161 /* The lists of interesting tags and attributes are built dynamically,
162 from the information above. However, some places in the code refer
163 to the attributes not mentioned here. We add them manually. */
164 static const char *additional_attributes[] = {
165 "rel", /* used by tag_handle_link */
166 "http-equiv", /* used by tag_handle_meta */
167 "name", /* used by tag_handle_meta */
168 "content", /* used by tag_handle_meta */
169 "action" /* used by tag_handle_form */
172 static struct hash_table *interesting_tags;
173 static struct hash_table *interesting_attributes;
176 init_interesting (void)
178 /* Init the variables interesting_tags and interesting_attributes
179 that are used by the HTML parser to know which tags and
180 attributes we're interested in. We initialize this only once,
181 for performance reasons.
183 Here we also make sure that what we put in interesting_tags
184 matches the user's preferences as specified through --ignore-tags
185 and --follow-tags. */
188 interesting_tags = make_nocase_string_hash_table (countof (known_tags));
190 /* First, add all the tags we know how to handle, mapped to their
191 respective entries in known_tags. */
192 for (i = 0; i < countof (known_tags); i++)
193 hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
195 /* Then remove the tags ignored through --ignore-tags. */
199 for (ignored = opt.ignore_tags; *ignored; ignored++)
200 hash_table_remove (interesting_tags, *ignored);
203 /* If --follow-tags is specified, use only those tags. */
206 /* Create a new table intersecting --follow-tags and known_tags,
207 and use it as interesting_tags. */
208 struct hash_table *intersect = make_nocase_string_hash_table (0);
210 for (followed = opt.follow_tags; *followed; followed++)
212 struct known_tag *t = hash_table_get (interesting_tags, *followed);
214 continue; /* ignore unknown --follow-tags entries. */
215 hash_table_put (intersect, *followed, t);
217 hash_table_destroy (interesting_tags);
218 interesting_tags = intersect;
221 /* Add the attributes we care about. */
222 interesting_attributes = make_nocase_string_hash_table (10);
223 for (i = 0; i < countof (additional_attributes); i++)
224 hash_table_put (interesting_attributes, additional_attributes[i], "1");
225 for (i = 0; i < countof (tag_url_attributes); i++)
226 hash_table_put (interesting_attributes,
227 tag_url_attributes[i].attr_name, "1");
230 /* Find the value of attribute named NAME in the taginfo TAG. If the
231 attribute is not present, return NULL. If ATTRIND is non-NULL, the
232 index of the attribute in TAG will be stored there. */
235 find_attr (struct taginfo *tag, const char *name, int *attrind)
238 for (i = 0; i < tag->nattrs; i++)
239 if (!strcasecmp (tag->attrs[i].name, name))
243 return tag->attrs[i].value;
249 char *text; /* HTML text. */
250 char *base; /* Base URI of the document, possibly
251 changed through <base href=...>. */
252 const char *parent_base; /* Base of the current document. */
253 const char *document_file; /* File name of this document. */
254 bool nofollow; /* whether NOFOLLOW was specified in a
255 <meta name=robots> tag. */
257 struct urlpos *head, *tail; /* List of URLs that is being
261 /* Append LINK_URI to the urlpos structure that is being built.
263 LINK_URI will be merged with the current document base. TAG and
264 ATTRIND are the necessary context to store the position and
267 static struct urlpos *
268 append_url (const char *link_uri,
269 struct taginfo *tag, int attrind, struct map_context *ctx)
271 int link_has_scheme = url_has_scheme (link_uri);
272 struct urlpos *newel;
273 const char *base = ctx->base ? ctx->base : ctx->parent_base;
278 DEBUGP (("%s: no base, merge will use \"%s\".\n",
279 ctx->document_file, link_uri));
281 if (!link_has_scheme)
283 /* Base URL is unavailable, and the link does not have a
284 location attached to it -- we have to give up. Since
285 this can only happen when using `--force-html -i', print
287 logprintf (LOG_NOTQUIET,
288 _("%s: Cannot resolve incomplete link %s.\n"),
289 ctx->document_file, link_uri);
293 url = url_parse (link_uri, NULL);
296 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
297 ctx->document_file, link_uri));
303 /* Merge BASE with LINK_URI, but also make sure the result is
304 canonicalized, i.e. that "../" have been resolved.
305 (url_parse will do that for us.) */
307 char *complete_uri = uri_merge (base, link_uri);
309 DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
310 ctx->document_file, base, link_uri, complete_uri));
312 url = url_parse (complete_uri, NULL);
315 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
316 ctx->document_file, complete_uri));
317 xfree (complete_uri);
320 xfree (complete_uri);
323 DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
325 newel = xnew0 (struct urlpos);
327 newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
328 newel->size = tag->attrs[attrind].value_raw_size;
330 /* A URL is relative if the host is not named, and the name does not
332 if (!link_has_scheme && *link_uri != '/')
333 newel->link_relative_p = 1;
334 else if (link_has_scheme)
335 newel->link_complete_p = 1;
339 ctx->tail->next = newel;
343 ctx->tail = ctx->head = newel;
348 /* All the tag_* functions are called from collect_tags_mapper, as
349 specified by KNOWN_TAGS. */
351 /* Default tag handler: collect URLs from attributes specified for
352 this tag by tag_url_attributes. */
355 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
360 for (i = 0; i < countof (tag_url_attributes); i++)
361 if (tag_url_attributes[i].tagid == tagid)
363 /* We've found the index of tag_url_attributes where the
364 attributes of our tag begin. */
368 assert (first != -1);
370 /* Loop over the "interesting" attributes of this tag. In this
371 example, it will loop over "src" and "lowsrc".
373 <img src="foo.png" lowsrc="bar.png">
375 This has to be done in the outer loop so that the attributes are
376 processed in the same order in which they appear in the page.
377 This is required when converting links. */
379 for (attrind = 0; attrind < tag->nattrs; attrind++)
381 /* Find whether TAG/ATTRIND is a combination that contains a
383 char *link = tag->attrs[attrind].value;
384 const int size = countof (tag_url_attributes);
386 /* If you're cringing at the inefficiency of the nested loops,
387 remember that they both iterate over a very small number of
388 items. The worst-case inner loop is for the IMG tag, which
389 has three attributes. */
390 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
392 if (0 == strcasecmp (tag->attrs[attrind].name,
393 tag_url_attributes[i].attr_name))
395 struct urlpos *up = append_url (link, tag, attrind, ctx);
398 int flags = tag_url_attributes[i].flags;
399 if (flags & ATTR_INLINE)
400 up->link_inline_p = 1;
401 if (flags & ATTR_HTML)
402 up->link_expect_html = 1;
409 /* Handle the BASE tag, for <base href=...>. */
412 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
414 struct urlpos *base_urlpos;
416 char *newbase = find_attr (tag, "href", &attrind);
420 base_urlpos = append_url (newbase, tag, attrind, ctx);
423 base_urlpos->ignore_when_downloading = 1;
424 base_urlpos->link_base_p = 1;
428 if (ctx->parent_base)
429 ctx->base = uri_merge (ctx->parent_base, newbase);
431 ctx->base = xstrdup (newbase);
434 /* Mark the URL found in <form action=...> for conversion. */
437 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
440 char *action = find_attr (tag, "action", &attrind);
443 struct urlpos *up = append_url (action, tag, attrind, ctx);
445 up->ignore_when_downloading = 1;
449 /* Handle the LINK tag. It requires special handling because how its
450 links will be followed in -p mode depends on the REL attribute. */
453 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
456 char *href = find_attr (tag, "href", &attrind);
458 /* All <link href="..."> link references are external, except those
459 known not to be, such as style sheet and shortcut icon:
461 <link rel="stylesheet" href="...">
462 <link rel="shortcut icon" href="...">
466 struct urlpos *up = append_url (href, tag, attrind, ctx);
469 char *rel = find_attr (tag, "rel", NULL);
471 && (0 == strcasecmp (rel, "stylesheet")
472 || 0 == strcasecmp (rel, "shortcut icon")))
473 up->link_inline_p = 1;
475 /* The external ones usually point to HTML pages, such as
476 <link rel="next" href="..."> */
477 up->link_expect_html = 1;
482 /* Handle the META tag. This requires special handling because of the
483 refresh feature and because of robot exclusion. */
486 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
488 char *name = find_attr (tag, "name", NULL);
489 char *http_equiv = find_attr (tag, "http-equiv", NULL);
491 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
493 /* Some pages use a META tag to specify that the page be
494 refreshed by a new page after a given number of seconds. The
495 general format for this is:
497 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
499 So we just need to skip past the "NUMBER; URL=" garbage to
502 struct urlpos *entry;
507 char *refresh = find_attr (tag, "content", &attrind);
511 for (p = refresh; ISDIGIT (*p); p++)
512 timeout = 10 * timeout + *p - '0';
518 if (!( TOUPPER (*p) == 'U'
519 && TOUPPER (*(p + 1)) == 'R'
520 && TOUPPER (*(p + 2)) == 'L'
527 entry = append_url (p, tag, attrind, ctx);
530 entry->link_refresh_p = 1;
531 entry->refresh_timeout = timeout;
532 entry->link_expect_html = 1;
535 else if (name && 0 == strcasecmp (name, "robots"))
537 /* Handle stuff like:
538 <meta name="robots" content="index,nofollow"> */
539 char *content = find_attr (tag, "content", NULL);
542 if (!strcasecmp (content, "none"))
543 ctx->nofollow = true;
548 /* Find the next occurrence of ',' or the end of
550 char *end = strchr (content, ',');
554 end = content + strlen (content);
555 if (!strncasecmp (content, "nofollow", end - content))
556 ctx->nofollow = true;
563 /* Dispatch the tag handler appropriate for the tag we're mapping
564 over. See known_tags[] for definition of tag handlers. */
567 collect_tags_mapper (struct taginfo *tag, void *arg)
569 struct map_context *ctx = (struct map_context *)arg;
571 /* Find the tag in our table of tags. This must not fail because
572 map_html_tags only returns tags found in interesting_tags. */
573 struct known_tag *t = hash_table_get (interesting_tags, tag->name);
576 t->handler (t->tagid, tag, ctx);
579 /* Analyze the HTML tags in FILE and construct a list of URLs
580 referenced from it. Relative links in FILE are merged with URL. The
581 parser is aware of <base href=...> and does the right thing. */
584 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
586 struct file_memory *fm;
587 struct map_context ctx;
591 fm = read_file (file);
594 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
597 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
599 ctx.text = fm->content;
600 ctx.head = ctx.tail = NULL;
602 ctx.parent_base = url ? url : opt.base_href;
603 ctx.document_file = file;
604 ctx.nofollow = false;
606 if (!interesting_tags)
609 /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
610 generate <a href=" foo"> instead of <a href="foo"> (browsers
611 ignore spaces as well.) If you really mean space, use &#32; or
612 %20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
613 e.g. in <img src="foo.[newline]html">. Such newlines are also
614 ignored by IE and Mozilla and are presumably introduced by
615 writing HTML with editors that force word wrap. */
616 flags = MHT_TRIM_VALUES;
617 if (opt.strict_comments)
618 flags |= MHT_STRICT_COMMENTS;
620 map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
621 interesting_tags, interesting_attributes);
623 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
624 if (meta_disallow_follow)
625 *meta_disallow_follow = ctx.nofollow;
627 xfree_null (ctx.base);
632 /* This doesn't really have anything to do with HTML, but it's similar
633 to get_urls_html, so we put it here. */
636 get_urls_file (const char *file)
638 struct file_memory *fm;
639 struct urlpos *head, *tail;
640 const char *text, *text_end;
643 fm = read_file (file);
646 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
649 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
653 text_end = fm->content + fm->length;
654 while (text < text_end)
658 struct urlpos *entry;
661 const char *line_beg = text;
662 const char *line_end = memchr (text, '\n', text_end - text);
669 /* Strip whitespace from the beginning and end of line. */
670 while (line_beg < line_end && ISSPACE (*line_beg))
672 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
675 if (line_beg == line_end)
678 /* The URL is in the [line_beg, line_end) region. */
680 /* We must copy the URL to a zero-terminated string, and we
681 can't use alloca because we're in a loop. *sigh*. */
682 url_text = strdupdelim (line_beg, line_end);
686 /* Merge opt.base_href with URL. */
687 char *merged = uri_merge (opt.base_href, url_text);
692 url = url_parse (url_text, &up_error_code);
695 logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
696 file, url_text, url_error (up_error_code));
702 entry = xnew0 (struct urlpos);
716 cleanup_html_url (void)
718 /* Destroy the hash tables. The hash table keys and values are not
719 allocated by this code, so we don't need to free them here. */
720 if (interesting_tags)
721 hash_table_destroy (interesting_tags);
722 if (interesting_attributes)
723 hash_table_destroy (interesting_attributes);