1 /* Collect URLs from HTML source.
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
3 2007, 2008 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget. If not, see <http://www.gnu.org/licenses/>.
20 Additional permission under GNU GPL version 3 section 7
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work. */
31 #define USE_GNULIB_ALLOC
41 #include "html-parse.h"
46 #include "recur.h" /* declaration of get_urls_html */
/* Signature shared by every tag handler. collect_tags_mapper calls a
   handler with the tagid of the matching known_tags entry, the parsed
   tag, and the map context. */
50 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
/* Forward-declare a static tag handler FUN with that signature. */
52 #define DECLARE_TAG_HANDLER(fun) \
53 static void fun (int, struct taginfo *, struct map_context *)
55 DECLARE_TAG_HANDLER (tag_find_urls);
56 DECLARE_TAG_HANDLER (tag_handle_base);
57 DECLARE_TAG_HANDLER (tag_handle_form);
58 DECLARE_TAG_HANDLER (tag_handle_link);
59 DECLARE_TAG_HANDLER (tag_handle_meta);
86 /* The list of known tags and functions used for handling them. Most
87 tags are simply harvested for URLs by tag_find_urls; BASE, FORM, LINK
and META have dedicated handlers (see the entries below). */
88 static struct known_tag {
91 tag_handler_t handler;
93 { TAG_A, "a", tag_find_urls },
94 { TAG_APPLET, "applet", tag_find_urls },
95 { TAG_AREA, "area", tag_find_urls },
96 { TAG_BASE, "base", tag_handle_base },
97 { TAG_BGSOUND, "bgsound", tag_find_urls },
98 { TAG_BODY, "body", tag_find_urls },
99 { TAG_EMBED, "embed", tag_find_urls },
100 { TAG_FIG, "fig", tag_find_urls },
101 { TAG_FORM, "form", tag_handle_form },
102 { TAG_FRAME, "frame", tag_find_urls },
103 { TAG_IFRAME, "iframe", tag_find_urls },
104 { TAG_IMG, "img", tag_find_urls },
105 { TAG_INPUT, "input", tag_find_urls },
106 { TAG_LAYER, "layer", tag_find_urls },
107 { TAG_LINK, "link", tag_handle_link },
108 { TAG_META, "meta", tag_handle_meta },
109 { TAG_OBJECT, "object", tag_find_urls },
110 { TAG_OVERLAY, "overlay", tag_find_urls },
111 { TAG_SCRIPT, "script", tag_find_urls },
112 { TAG_TABLE, "table", tag_find_urls },
113 { TAG_TD, "td", tag_find_urls },
114 { TAG_TH, "th", tag_find_urls }
117 /* tag_url_attributes documents which attributes of which tags contain
118 URLs to harvest. It is used by tag_find_urls, which scans forward from
the first entry whose tagid matches for as long as the tagid stays the
same, so all entries for one tag must be kept adjacent. */
120 /* Defines for the FLAGS. */
122 /* The link is "inline", i.e. needs to be retrieved for this document
123 to be correctly rendered. Inline links include inlined images,
124 stylesheets, children frames, etc. */
125 #define ATTR_INLINE 1
127 /* The link is expected to yield HTML contents. It's important not to
128 try to follow HTML obtained by following e.g. <img src="...">
129 regardless of content-type. Doing this causes infinite loops for
130 "images" that return non-404 error pages with links to the same
134 /* For tags handled by tag_find_urls: attributes that contain URLs to
138 const char *attr_name;
140 } tag_url_attributes[] = {
141 { TAG_A, "href", ATTR_HTML },
142 { TAG_APPLET, "code", ATTR_INLINE },
143 { TAG_AREA, "href", ATTR_HTML },
144 { TAG_BGSOUND, "src", ATTR_INLINE },
145 { TAG_BODY, "background", ATTR_INLINE },
146 { TAG_EMBED, "href", ATTR_HTML },
147 { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },
148 { TAG_FIG, "src", ATTR_INLINE },
149 { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },
150 { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },
151 { TAG_IMG, "href", ATTR_INLINE },
152 { TAG_IMG, "lowsrc", ATTR_INLINE },
153 { TAG_IMG, "src", ATTR_INLINE },
154 { TAG_INPUT, "src", ATTR_INLINE },
155 { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
156 { TAG_OBJECT, "data", ATTR_INLINE },
157 { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
158 { TAG_SCRIPT, "src", ATTR_INLINE },
159 { TAG_TABLE, "background", ATTR_INLINE },
160 { TAG_TD, "background", ATTR_INLINE },
161 { TAG_TH, "background", ATTR_INLINE }
164 /* The lists of interesting tags and attributes are built dynamically,
165 from the information above. However, some handlers look up attributes
166 that are not listed in tag_url_attributes. We add them manually. */
167 static const char *additional_attributes[] = {
168 "rel", /* used by tag_handle_link */
169 "http-equiv", /* used by tag_handle_meta */
170 "name", /* used by tag_handle_meta */
171 "content", /* used by tag_handle_meta */
172 "action" /* used by tag_handle_form */
/* Tables passed to map_html_tags to tell it which tags and attributes
   we care about. Built by init_interesting, destroyed by
   cleanup_html_url. */
175 static struct hash_table *interesting_tags;
176 static struct hash_table *interesting_attributes;
179 init_interesting (void)
181 /* Init the variables interesting_tags and interesting_attributes
182 that are used by the HTML parser to know which tags and
183 attributes we're interested in. We initialize this only once,
184 for performance reasons.
186 Here we also make sure that what we put in interesting_tags
187 matches the user's preferences as specified through --ignore-tags
188 and --follow-tags. */
191 interesting_tags = make_nocase_string_hash_table (countof (known_tags));
193 /* First, add all the tags we know how to handle, mapped to their
194 respective entries in known_tags. */
195 for (i = 0; i < countof (known_tags); i++)
196 hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
198 /* Then remove the tags ignored through --ignore-tags. */
202 for (ignored = opt.ignore_tags; *ignored; ignored++)
203 hash_table_remove (interesting_tags, *ignored);
206 /* If --follow-tags is specified, use only those tags. */
209 /* Create a new table intersecting --follow-tags and the tags
210 currently in interesting_tags, and use it as interesting_tags. */
211 struct hash_table *intersect = make_nocase_string_hash_table (0);
213 for (followed = opt.follow_tags; *followed; followed++)
215 struct known_tag *t = hash_table_get (interesting_tags, *followed);
217 continue; /* ignore unknown --follow-tags entries. */
218 hash_table_put (intersect, *followed, t);
220 hash_table_destroy (interesting_tags);
221 interesting_tags = intersect;
224 /* Add the attributes we care about. */
225 interesting_attributes = make_nocase_string_hash_table (10);
226 for (i = 0; i < countof (additional_attributes); i++)
227 hash_table_put (interesting_attributes, additional_attributes[i], "1");
228 for (i = 0; i < countof (tag_url_attributes); i++)
229 hash_table_put (interesting_attributes,
230 tag_url_attributes[i].attr_name, "1");
233 /* Find the value of attribute named NAME in the taginfo TAG. Names
are compared case-insensitively. If the
234 attribute is not present, return NULL. If ATTRIND is non-NULL, the
235 index of the attribute in TAG will be stored there. */
238 find_attr (struct taginfo *tag, const char *name, int *attrind)
241 for (i = 0; i < tag->nattrs; i++)
242 if (!strcasecmp (tag->attrs[i].name, name))
246 return tag->attrs[i].value;
252 char *text; /* HTML text. */
253 char *base; /* Base URI of the document, possibly
254 changed through <base href=...>. */
255 const char *parent_base; /* Base of the current document; append_url
falls back to it when BASE is NULL. */
256 const char *document_file; /* File name of this document. */
257 bool nofollow; /* whether NOFOLLOW was specified in a
258 <meta name=robots> tag. */
260 struct urlpos *head, *tail; /* List of URLs that is being
264 /* Append LINK_URI to the urlpos structure that is being built.
266 LINK_URI will be merged with the current document base. TAG and
267 ATTRIND are the necessary context to store the position and
270 static struct urlpos *
271 append_url (const char *link_uri,
272 struct taginfo *tag, int attrind, struct map_context *ctx)
274 int link_has_scheme = url_has_scheme (link_uri);
275 struct urlpos *newel;
276 const char *base = ctx->base ? ctx->base : ctx->parent_base;
281 DEBUGP (("%s: no base, merge will use \"%s\".\n",
282 ctx->document_file, link_uri));
284 if (!link_has_scheme)
286 /* Base URL is unavailable, and the link does not have a
287 location attached to it -- we have to give up. Since
288 this can only happen when using `--force-html -i', print
290 logprintf (LOG_NOTQUIET,
291 _("%s: Cannot resolve incomplete link %s.\n"),
292 ctx->document_file, link_uri);
296 url = url_parse (link_uri, NULL);
299 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
300 ctx->document_file, link_uri));
306 /* Merge BASE with LINK_URI, but also make sure the result is
307 canonicalized, i.e. that "../" have been resolved.
308 (url_parse will do that for us.) */
310 char *complete_uri = uri_merge (base, link_uri);
312 DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
313 ctx->document_file, base, link_uri, complete_uri));
315 url = url_parse (complete_uri, NULL);
318 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
319 ctx->document_file, complete_uri));
320 xfree (complete_uri);
323 xfree (complete_uri);
326 DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
328 newel = xnew0 (struct urlpos);
330 newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
331 newel->size = tag->attrs[attrind].value_raw_size;
333 /* A URL is relative if the host is not named, and the name does not
335 if (!link_has_scheme && *link_uri != '/')
336 newel->link_relative_p = 1;
337 else if (link_has_scheme)
338 newel->link_complete_p = 1;
342 ctx->tail->next = newel;
346 ctx->tail = ctx->head = newel;
351 /* All the tag_* functions are called from collect_tags_mapper, as
352 specified by KNOWN_TAGS. */
354 /* Default tag handler: collect URLs from attributes specified for
355 this tag by tag_url_attributes. The ATTR_INLINE and ATTR_HTML flags
of the matching entry are copied to the new urlpos as link_inline_p
and link_expect_html. */
358 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
363 for (i = 0; i < countof (tag_url_attributes); i++)
364 if (tag_url_attributes[i].tagid == tagid)
366 /* We've found the index of tag_url_attributes where the
367 attributes of our tag begin. */
371 assert (first != -1);
373 /* Loop over the "interesting" attributes of this tag. In this
374 example, it will loop over "src" and "lowsrc".
376 <img src="foo.png" lowsrc="bar.png">
378 This has to be done in the outer loop so that the attributes are
379 processed in the same order in which they appear in the page.
380 This is required when converting links. */
382 for (attrind = 0; attrind < tag->nattrs; attrind++)
384 /* Find whether TAG/ATTRIND is a combination that contains a
386 char *link = tag->attrs[attrind].value;
387 const int size = countof (tag_url_attributes);
389 /* If you're cringing at the inefficiency of the nested loops,
390 remember that they both iterate over a very small number of
391 items. The worst-case inner loop is for the IMG tag, which
392 has three attributes. */
393 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
395 if (0 == strcasecmp (tag->attrs[attrind].name,
396 tag_url_attributes[i].attr_name))
398 struct urlpos *up = append_url (link, tag, attrind, ctx);
401 int flags = tag_url_attributes[i].flags;
402 if (flags & ATTR_INLINE)
403 up->link_inline_p = 1;
404 if (flags & ATTR_HTML)
405 up->link_expect_html = 1;
412 /* Handle the BASE tag, for <base href=...>. The href is appended as
a urlpos marked ignore_when_downloading and link_base_p, and it becomes
ctx->base: merged with ctx->parent_base when that is set, otherwise a
plain copy of the attribute value. */
415 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
417 struct urlpos *base_urlpos;
419 char *newbase = find_attr (tag, "href", &attrind);
423 base_urlpos = append_url (newbase, tag, attrind, ctx);
426 base_urlpos->ignore_when_downloading = 1;
427 base_urlpos->link_base_p = 1;
431 if (ctx->parent_base)
432 ctx->base = uri_merge (ctx->parent_base, newbase);
434 ctx->base = xstrdup (newbase);
437 /* Mark the URL found in <form action=...> for conversion. The
resulting urlpos is flagged ignore_when_downloading. */
440 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
443 char *action = find_attr (tag, "action", &attrind);
446 struct urlpos *up = append_url (action, tag, attrind, ctx);
448 up->ignore_when_downloading = 1;
452 /* Handle the LINK tag. It requires special handling because how its
453 links will be followed in -p mode depends on the REL attribute. The
REL value is compared case-insensitively. */
456 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
459 char *href = find_attr (tag, "href", &attrind);
461 /* All <link href="..."> link references are external, except those
462 known not to be, such as style sheet and shortcut icon:
464 <link rel="stylesheet" href="...">
465 <link rel="shortcut icon" href="...">
469 struct urlpos *up = append_url (href, tag, attrind, ctx);
472 char *rel = find_attr (tag, "rel", NULL);
474 && (0 == strcasecmp (rel, "stylesheet")
475 || 0 == strcasecmp (rel, "shortcut icon")))
476 up->link_inline_p = 1;
478 /* The external ones usually point to HTML pages, such as
479 <link rel="next" href="..."> */
480 up->link_expect_html = 1;
485 /* Handle the META tag. This requires special handling because of the
486 refresh feature and because of robot exclusion.
NOTE(review): the robots check calls strncasecmp with length
end - content, so a token that is only a prefix of "nofollow" could
compare equal -- confirm against the lines not visible here. */
489 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
491 char *name = find_attr (tag, "name", NULL);
492 char *http_equiv = find_attr (tag, "http-equiv", NULL);
494 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
496 /* Some pages use a META tag to specify that the page be
497 refreshed by a new page after a given number of seconds. The
498 general format for this is:
500 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
502 So we just need to skip past the "NUMBER; URL=" garbage to
505 struct urlpos *entry;
510 char *refresh = find_attr (tag, "content", &attrind);
514 for (p = refresh; c_isdigit (*p); p++)
515 timeout = 10 * timeout + *p - '0';
519 while (c_isspace (*p))
521 if (!( c_toupper (*p) == 'U'
522 && c_toupper (*(p + 1)) == 'R'
523 && c_toupper (*(p + 2)) == 'L'
527 while (c_isspace (*p))
530 entry = append_url (p, tag, attrind, ctx);
533 entry->link_refresh_p = 1;
534 entry->refresh_timeout = timeout;
535 entry->link_expect_html = 1;
538 else if (name && 0 == strcasecmp (name, "robots"))
540 /* Handle stuff like:
541 <meta name="robots" content="index,nofollow"> */
542 char *content = find_attr (tag, "content", NULL);
545 if (!strcasecmp (content, "none"))
546 ctx->nofollow = true;
551 /* Find the next occurrence of ',' or the end of
553 char *end = strchr (content, ',');
557 end = content + strlen (content);
558 if (!strncasecmp (content, "nofollow", end - content))
559 ctx->nofollow = true;
566 /* Dispatch the tag handler appropriate for the tag we're mapping
567 over. See known_tags[] for definition of tag handlers. ARG is the
struct map_context that is passed on to the handler. */
570 collect_tags_mapper (struct taginfo *tag, void *arg)
572 struct map_context *ctx = (struct map_context *)arg;
574 /* Find the tag in our table of tags. This must not fail because
575 map_html_tags only returns tags found in interesting_tags. */
576 struct known_tag *t = hash_table_get (interesting_tags, tag->name);
579 t->handler (t->tagid, tag, ctx);
582 /* Analyze HTML tags in FILE and construct a list of URLs referenced
583 from it. It merges relative links in FILE with URL, or with
opt.base_href when URL is NULL. It is aware of
584 <base href=...> and does the right thing. If META_DISALLOW_FOLLOW is
non-NULL, it receives the nofollow state collected from the document's
<meta name="robots"> tag. */
587 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
589 struct file_memory *fm;
590 struct map_context ctx;
594 fm = read_file (file);
597 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
600 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
602 ctx.text = fm->content;
603 ctx.head = ctx.tail = NULL;
605 ctx.parent_base = url ? url : opt.base_href;
606 ctx.document_file = file;
607 ctx.nofollow = false;
609 if (!interesting_tags)
612 /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
613 generate <a href=" foo"> instead of <a href="foo"> (browsers
614 ignore spaces as well.) If you really mean space, use &#32; or
615 %20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
616 e.g. in <img src="foo.[newline]html">. Such newlines are also
617 ignored by IE and Mozilla and are presumably introduced by
618 writing HTML with editors that force word wrap. */
619 flags = MHT_TRIM_VALUES;
620 if (opt.strict_comments)
621 flags |= MHT_STRICT_COMMENTS;
623 map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
624 interesting_tags, interesting_attributes);
626 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
627 if (meta_disallow_follow)
628 *meta_disallow_follow = ctx.nofollow;
630 xfree_null (ctx.base);
635 /* This doesn't really have anything to do with HTML, but it's similar
636 to get_urls_html, so we put it here. FILE is processed one line at a
time: surrounding whitespace is stripped and the remaining text is
parsed with url_parse (it may first be merged with opt.base_href).
Text that fails to parse is reported through logprintf. */
639 get_urls_file (const char *file)
641 struct file_memory *fm;
642 struct urlpos *head, *tail;
643 const char *text, *text_end;
646 fm = read_file (file);
649 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
652 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
656 text_end = fm->content + fm->length;
657 while (text < text_end)
661 struct urlpos *entry;
664 const char *line_beg = text;
665 const char *line_end = memchr (text, '\n', text_end - text);
672 /* Strip whitespace from the beginning and end of line. */
673 while (line_beg < line_end && c_isspace (*line_beg))
675 while (line_end > line_beg && c_isspace (*(line_end - 1)))
678 if (line_beg == line_end)
681 /* The URL is in the [line_beg, line_end) region. */
683 /* We must copy the URL to a zero-terminated string, and we
684 can't use alloca because we're in a loop. *sigh*. */
685 url_text = strdupdelim (line_beg, line_end);
689 /* Merge opt.base_href with URL. */
690 char *merged = uri_merge (opt.base_href, url_text);
695 url = url_parse (url_text, &up_error_code);
698 logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
699 file, url_text, url_error (up_error_code));
705 entry = xnew0 (struct urlpos);
719 cleanup_html_url (void)
721 /* Destroy the hash tables. The hash table keys and values are not
722 allocated by this code, so we don't need to free them here. Either
table is still NULL if init_interesting never ran (get_urls_html only
calls it on first use), hence the checks. */
723 if (interesting_tags)
724 hash_table_destroy (interesting_tags);
725 if (interesting_attributes)
726 hash_table_destroy (interesting_attributes);