1 /* Collect URLs from HTML source.
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
3 2007, 2008 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget. If not, see <http://www.gnu.org/licenses/>.
20 Additional permission under GNU GPL version 3 section 7
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work. */
39 #include "html-parse.h"
/* Common signature of the per-tag handlers: the tag id, the tag being
   processed, and the context to which collected URLs are appended. */
49 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
/* Forward-declare FUN as a static function with the tag_handler_t
   signature. */
51 #define DECLARE_TAG_HANDLER(fun) \
52 static void fun (int, struct taginfo *, struct map_context *)
/* The handlers are declared up front because the known_tags table
   refers to them before their definitions appear. */
54 DECLARE_TAG_HANDLER (tag_find_urls);
55 DECLARE_TAG_HANDLER (tag_handle_base);
56 DECLARE_TAG_HANDLER (tag_handle_form);
57 DECLARE_TAG_HANDLER (tag_handle_link);
58 DECLARE_TAG_HANDLER (tag_handle_meta);
85 /* The list of known tags and functions used for handling them. Most
86 tags are simply harvested for URLs. */
/* Each initializer below is { tag id, tag name, handler }.
   init_interesting registers the entries in interesting_tags keyed by
   the tag name, and collect_tags_mapper calls the handler with the tag
   id.  Only base, form, link and meta have a dedicated handler; every
   other tag goes through tag_find_urls. */
87 static struct known_tag {
90 tag_handler_t handler; /* invoked as handler (tagid, tag, ctx) */
92 { TAG_A, "a", tag_find_urls },
93 { TAG_APPLET, "applet", tag_find_urls },
94 { TAG_AREA, "area", tag_find_urls },
95 { TAG_BASE, "base", tag_handle_base },
96 { TAG_BGSOUND, "bgsound", tag_find_urls },
97 { TAG_BODY, "body", tag_find_urls },
98 { TAG_EMBED, "embed", tag_find_urls },
99 { TAG_FIG, "fig", tag_find_urls },
100 { TAG_FORM, "form", tag_handle_form },
101 { TAG_FRAME, "frame", tag_find_urls },
102 { TAG_IFRAME, "iframe", tag_find_urls },
103 { TAG_IMG, "img", tag_find_urls },
104 { TAG_INPUT, "input", tag_find_urls },
105 { TAG_LAYER, "layer", tag_find_urls },
106 { TAG_LINK, "link", tag_handle_link },
107 { TAG_META, "meta", tag_handle_meta },
108 { TAG_OBJECT, "object", tag_find_urls },
109 { TAG_OVERLAY, "overlay", tag_find_urls },
110 { TAG_SCRIPT, "script", tag_find_urls },
111 { TAG_TABLE, "table", tag_find_urls },
112 { TAG_TD, "td", tag_find_urls },
113 { TAG_TH, "th", tag_find_urls }
116 /* tag_url_attributes documents which attributes of which tags contain
117 URLs to harvest. It is used by tag_find_urls. */
/* Entries for the same tag must stay adjacent: tag_find_urls locates the
   first entry for a tag id and scans forward only while the id still
   matches.  init_interesting also adds every attribute name listed here
   to interesting_attributes. */
119 /* Defines for the FLAGS. */
121 /* The link is "inline", i.e. needs to be retrieved for this document
122 to be correctly rendered. Inline links include inlined images,
123 stylesheets, children frames, etc. */
124 #define ATTR_INLINE 1
/* tag_find_urls sets link_inline_p on entries flagged ATTR_INLINE and
   link_expect_html on entries flagged ATTR_HTML. */
126 /* The link is expected to yield HTML contents. It's important not to
127 try to follow HTML obtained by following e.g. <img src="...">
128 regardless of content-type. Doing this causes infinite loops for
129 "images" that return non-404 error pages with links to the same
133 /* For tags handled by tag_find_urls: attributes that contain URLs to
137 const char *attr_name;
139 } tag_url_attributes[] = {
140 { TAG_A, "href", ATTR_HTML },
141 { TAG_APPLET, "code", ATTR_INLINE },
142 { TAG_AREA, "href", ATTR_HTML },
143 { TAG_BGSOUND, "src", ATTR_INLINE },
144 { TAG_BODY, "background", ATTR_INLINE },
145 { TAG_EMBED, "href", ATTR_HTML },
146 { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },
147 { TAG_FIG, "src", ATTR_INLINE },
148 { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },
149 { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },
150 { TAG_IMG, "href", ATTR_INLINE },
151 { TAG_IMG, "lowsrc", ATTR_INLINE },
152 { TAG_IMG, "src", ATTR_INLINE },
153 { TAG_INPUT, "src", ATTR_INLINE },
154 { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
155 { TAG_OBJECT, "data", ATTR_INLINE },
156 { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
157 { TAG_SCRIPT, "src", ATTR_INLINE },
158 { TAG_TABLE, "background", ATTR_INLINE },
159 { TAG_TD, "background", ATTR_INLINE },
160 { TAG_TH, "background", ATTR_INLINE }
163 /* The lists of interesting tags and attributes are built dynamically,
164 from the information above. However, some places in the code refer
165 to the attributes not mentioned here. We add them manually. */
/* init_interesting inserts every name below into interesting_attributes.
   "href", which tag_handle_base and tag_handle_link look up, needs no
   entry here because tag_url_attributes already lists it. */
166 static const char *additional_attributes[] = {
167 "rel", /* used by tag_handle_link */
168 "http-equiv", /* used by tag_handle_meta */
169 "name", /* used by tag_handle_meta */
170 "content", /* used by tag_handle_meta */
171 "action", /* used by tag_handle_form */
172 "style" /* used by check_style_attr */
/* Case-insensitive string hash tables created by init_interesting and
   destroyed by cleanup_html_url.  interesting_tags maps a tag name to
   its known_tags entry; interesting_attributes holds the attribute
   names that get_urls_html hands to map_html_tags. */
175 static struct hash_table *interesting_tags;
176 static struct hash_table *interesting_attributes;
/* Build interesting_tags from known_tags (filtered by opt.ignore_tags
   and opt.follow_tags) and interesting_attributes from
   additional_attributes plus tag_url_attributes. */
179 init_interesting (void)
181 /* Init the variables interesting_tags and interesting_attributes
182 that are used by the HTML parser to know which tags and
183 attributes we're interested in. We initialize this only once,
184 for performance reasons.
186 Here we also make sure that what we put in interesting_tags
187 matches the user's preferences as specified through --ignore-tags
188 and --follow-tags. */
191 interesting_tags = make_nocase_string_hash_table (countof (known_tags));
193 /* First, add all the tags we know how to handle, mapped to their
194 respective entries in known_tags. */
195 for (i = 0; i < countof (known_tags); i++)
196 hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
198 /* Then remove the tags ignored through --ignore-tags. */
202 for (ignored = opt.ignore_tags; *ignored; ignored++)
203 hash_table_remove (interesting_tags, *ignored);
206 /* If --follow-tags is specified, use only those tags. */
209 /* Create a new table intersecting --follow-tags and known_tags,
210 and use it as interesting_tags. */
211 struct hash_table *intersect = make_nocase_string_hash_table (0);
213 for (followed = opt.follow_tags; *followed; followed++)
/* The lookup goes through the table that --ignore-tags has already
   pruned, so a tag named in both options is left out of the result. */
215 struct known_tag *t = hash_table_get (interesting_tags, *followed);
217 continue; /* ignore unknown --follow-tags entries. */
218 hash_table_put (intersect, *followed, t);
220 hash_table_destroy (interesting_tags);
221 interesting_tags = intersect;
224 /* Add the attributes we care about. */
225 interesting_attributes = make_nocase_string_hash_table (10);
/* NOTE(review): every key is stored with the same constant value "1";
   presumably only key membership matters to map_html_tags -- confirm. */
226 for (i = 0; i < countof (additional_attributes); i++)
227 hash_table_put (interesting_attributes, additional_attributes[i], "1");
228 for (i = 0; i < countof (tag_url_attributes); i++)
229 hash_table_put (interesting_attributes,
230 tag_url_attributes[i].attr_name, "1");
233 /* Find the value of attribute named NAME in the taginfo TAG. If the
234 attribute is not present, return NULL. If ATTRIND is non-NULL, the
235 index of the attribute in TAG will be stored there. */
238 find_attr (struct taginfo *tag, const char *name, int *attrind)
/* Linear scan over the tag's attributes; names are compared without
   regard to case and the first match is the one returned. */
241 for (i = 0; i < tag->nattrs; i++)
242 if (!strcasecmp (tag->attrs[i].name, name))
246 return tag->attrs[i].value;
251 /* used for calls to append_url */
/* ATTR_POS yields the offset of attribute ATTRIND's raw value from the
   start of the document text (ctx->text); ATTR_SIZE yields the size of
   that raw value.  NOTE(review): the macro arguments are not
   parenthesized in the expansions, so pass plain identifiers only. */
252 #define ATTR_POS(tag, attrind, ctx) \
253 (tag->attrs[attrind].value_raw_beginning - ctx->text)
254 #define ATTR_SIZE(tag, attrind) \
255 (tag->attrs[attrind].value_raw_size)
/* append_url parses LINK_URI -- merged with the document base when one
   is known -- and links a new urlpos entry for it onto the list kept in
   CTX (ctx->head / ctx->tail).  A base set by tag_handle_base
   (ctx->base) takes precedence over ctx->parent_base.  POSITION is
   stored in the new entry; callers pass ATTR_POS and ATTR_SIZE of the
   attribute the link came from.  Callers store the result in a
   struct urlpos pointer. */
257 /* Append LINK_URI to the urlpos structure that is being built.
259 LINK_URI will be merged with the current document base.
263 append_url (const char *link_uri, int position, int size,
264 struct map_context *ctx)
266 int link_has_scheme = url_has_scheme (link_uri);
267 struct urlpos *newel;
268 const char *base = ctx->base ? ctx->base : ctx->parent_base;
273 DEBUGP (("%s: no base, merge will use \"%s\".\n",
274 ctx->document_file, link_uri));
276 if (!link_has_scheme)
278 /* Base URL is unavailable, and the link does not have a
279 location attached to it -- we have to give up. Since
280 this can only happen when using `--force-html -i', print
282 logprintf (LOG_NOTQUIET,
283 _("%s: Cannot resolve incomplete link %s.\n"),
284 ctx->document_file, link_uri);
288 set_ugly_no_encode (true);
289 url = url_parse (link_uri, NULL);
290 set_ugly_no_encode (false);
293 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
294 ctx->document_file, link_uri));
300 /* Merge BASE with LINK_URI, but also make sure the result is
301 canonicalized, i.e. that "../" have been resolved.
302 (url_parse will do that for us.) */
304 char *complete_uri = uri_merge (base, link_uri);
306 DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
307 ctx->document_file, base, link_uri, complete_uri));
/* NOTE(review): set_ugly_no_encode is switched on only for the duration
   of the url_parse call; what it changes is not visible in this file. */
309 set_ugly_no_encode (true);
310 url = url_parse (complete_uri, NULL);
311 set_ugly_no_encode (false);
314 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
315 ctx->document_file, complete_uri));
/* The string returned by uri_merge is released both when parsing
   failed and once parsing has succeeded. */
316 xfree (complete_uri);
319 xfree (complete_uri);
322 DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
324 newel = xnew0 (struct urlpos);
/* Remember where in the document text the link was found. */
326 newel->pos = position;
329 /* A URL is relative if the host is not named, and the name does not
331 if (!link_has_scheme && *link_uri != '/')
332 newel->link_relative_p = 1;
333 else if (link_has_scheme)
334 newel->link_complete_p = 1;
338 ctx->tail->next = newel;
342 ctx->tail = ctx->head = newel;
348 check_style_attr (struct taginfo *tag, struct map_context *ctx)
351 char *style = find_attr (tag, "style", &attrind);
355 /* raw pos and raw size include the quotes, hence the +1 -2 */
/* Hand the text of TAG's style attribute to get_urls_css as an offset
   into the document plus a length, so URLs inside inline CSS are
   collected too.  NOTE(review): the +1/-2 adjustment assumes the value
   is surrounded by quotes; confirm what the raw position and size are
   for an unquoted attribute value. */
356 get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2);
359 /* All the tag_* functions are called from collect_tags_mapper, as
360 specified by known_tags. */
362 /* Default tag handler: collect URLs from attributes specified for
363 this tag by tag_url_attributes. */
/* TAGID is the known_tags id of TAG.  Each matching attribute value is
   passed to append_url together with its position and size in the
   document, and the flags from tag_url_attributes are copied onto the
   resulting entry. */
366 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
/* Locate the first tag_url_attributes entry that belongs to TAGID. */
372 for (i = 0; i < countof (tag_url_attributes); i++)
373 if (tag_url_attributes[i].tagid == tagid)
375 /* We've found the index of tag_url_attributes where the
376 attributes of our tag begin. */
/* Every tag routed to this handler is expected to have at least one
   entry in tag_url_attributes. */
380 assert (first != -1);
382 /* Loop over the "interesting" attributes of this tag. In this
383 example, it will loop over "src" and "lowsrc".
385 <img src="foo.png" lowsrc="bar.png">
387 This has to be done in the outer loop so that the attributes are
388 processed in the same order in which they appear in the page.
389 This is required when converting links. */
391 for (attrind = 0; attrind < tag->nattrs; attrind++)
393 /* Find whether TAG/ATTRIND is a combination that contains a
395 char *link = tag->attrs[attrind].value;
396 const size_t size = countof (tag_url_attributes);
398 /* If you're cringing at the inefficiency of the nested loops,
399 remember that they both iterate over a very small number of
400 items. The worst-case inner loop is for the IMG tag, which
401 has three attributes. */
/* Scan only the run of table entries that share TAGID; this is why
   entries for one tag must be adjacent in tag_url_attributes. */
402 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
404 if (0 == strcasecmp (tag->attrs[attrind].name,
405 tag_url_attributes[i].attr_name))
407 struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
408 ATTR_SIZE(tag,attrind), ctx);
/* Carry the table's flags over to the new entry. */
411 int flags = tag_url_attributes[i].flags;
412 if (flags & ATTR_INLINE)
413 up->link_inline_p = 1;
414 if (flags & ATTR_HTML)
415 up->link_expect_html = 1;
422 /* Handle the BASE tag, for <base href=...>. */
/* The href is recorded as a urlpos entry flagged link_base_p and
   ignore_when_downloading, and it becomes ctx->base, which append_url
   prefers over ctx->parent_base when resolving later links. */
425 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
427 struct urlpos *base_urlpos;
429 char *newbase = find_attr (tag, "href", &attrind);
433 base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
434 ATTR_SIZE(tag,attrind), ctx);
437 base_urlpos->ignore_when_downloading = 1;
438 base_urlpos->link_base_p = 1;
/* Resolve the new base against the parent base when there is one;
   otherwise keep a private copy of it as written.  Either way
   ctx->base ends up owning a string (get_urls_html releases it). */
442 if (ctx->parent_base)
443 ctx->base = uri_merge (ctx->parent_base, newbase);
445 ctx->base = xstrdup (newbase);
448 /* Mark the URL found in <form action=...> for conversion. */
/* The action URL is appended to the list like any other link, but is
   flagged so that it is not downloaded. */
451 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
454 char *action = find_attr (tag, "action", &attrind);
458 struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
459 ATTR_SIZE(tag,attrind), ctx);
461 up->ignore_when_downloading = 1;
465 /* Handle the LINK tag. It requires special handling because how its
466 links will be followed in -p mode depends on the REL attribute. */
/* A rel of "stylesheet" marks the entry inline and expected to be CSS;
   "shortcut icon" marks it inline; link_expect_html is used for the
   external kind.  NOTE(review): rel is compared as a whole string
   (ignoring case), so a multi-token value such as
   "alternate stylesheet" matches neither special case. */
469 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
472 char *href = find_attr (tag, "href", &attrind);
474 /* All <link href="..."> link references are external, except those
475 known not to be, such as style sheet and shortcut icon:
477 <link rel="stylesheet" href="...">
478 <link rel="shortcut icon" href="...">
482 struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
483 ATTR_SIZE(tag,attrind), ctx);
486 char *rel = find_attr (tag, "rel", NULL);
489 if (0 == strcasecmp (rel, "stylesheet"))
491 up->link_inline_p = 1;
492 up->link_expect_css = 1;
494 else if (0 == strcasecmp (rel, "shortcut icon"))
496 up->link_inline_p = 1;
500 /* The external ones usually point to HTML pages, such as
501 <link rel="next" href="..."> */
502 up->link_expect_html = 1;
507 /* Handle the META tag. This requires special handling because of the
508 refresh feature and because of robot exclusion. */
/* Three kinds of META tag are recognized, all matched without regard to
   case:
     http-equiv="refresh"       the URL after the delay is appended with
                                link_refresh_p, the parsed timeout and
                                link_expect_html; the position and size
                                recorded are those of the whole content
                                attribute, not of the URL alone;
     http-equiv="content-type"  the result of parse_charset on the
                                content attribute is passed to
                                set_current_charset;
     name="robots"              ctx->nofollow is set for "none" or for a
                                "nofollow" item.
   NOTE(review): the refresh timeout is accumulated digit by digit with
   no visible overflow guard. */
511 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
513 char *name = find_attr (tag, "name", NULL);
514 char *http_equiv = find_attr (tag, "http-equiv", NULL);
516 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
518 /* Some pages use a META tag to specify that the page be
519 refreshed by a new page after a given number of seconds. The
520 general format for this is:
522 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
524 So we just need to skip past the "NUMBER; URL=" garbage to
527 struct urlpos *entry;
532 char *refresh = find_attr (tag, "content", &attrind);
536 for (p = refresh; c_isdigit (*p); p++)
537 timeout = 10 * timeout + *p - '0';
541 while (c_isspace (*p))
543 if (!( c_toupper (*p) == 'U'
544 && c_toupper (*(p + 1)) == 'R'
545 && c_toupper (*(p + 2)) == 'L'
549 while (c_isspace (*p))
552 entry = append_url (p, ATTR_POS(tag,attrind,ctx),
553 ATTR_SIZE(tag,attrind), ctx);
556 entry->link_refresh_p = 1;
557 entry->refresh_timeout = timeout;
558 entry->link_expect_html = 1;
561 else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
563 /* Handle stuff like:
564 <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
567 char *content = find_attr (tag, "content", NULL);
571 mcharset = parse_charset (content);
575 /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
577 set_current_charset (mcharset);
580 else if (name && 0 == strcasecmp (name, "robots"))
582 /* Handle stuff like:
583 <meta name="robots" content="index,nofollow"> */
584 char *content = find_attr (tag, "content", NULL);
/* "none" is treated the same as an explicit "nofollow".
   NOTE(review): the per-item test further down compares only
   end - content characters against "nofollow", so an empty item or any
   prefix of the word (for example "no") also counts as a match. */
587 if (!strcasecmp (content, "none"))
588 ctx->nofollow = true;
593 /* Find the next occurrence of ',' or the end of
595 char *end = strchr (content, ',');
599 end = content + strlen (content);
600 if (!strncasecmp (content, "nofollow", end - content))
601 ctx->nofollow = true;
608 /* Dispatch the tag handler appropriate for the tag we're mapping
609 over. See known_tags[] for definition of tag handlers. */
/* Callback handed to map_html_tags by get_urls_html; ARG is the
   struct map_context for the document.  Besides calling the handler
   registered for the tag, it runs check_style_attr on the tag and, for
   a closing "style" tag whose contents are delimited, passes those
   contents to get_urls_css. */
612 collect_tags_mapper (struct taginfo *tag, void *arg)
614 struct map_context *ctx = (struct map_context *)arg;
616 /* Find the tag in our table of tags. This must not fail because
617 map_html_tags only returns tags found in interesting_tags.
619 I've changed this for now, I'm passing NULL as interesting_tags
620 to map_html_tags. This way we can check all tags for a style
623 struct known_tag *t = hash_table_get (interesting_tags, tag->name);
626 t->handler (t->tagid, tag, ctx);
628 check_style_attr (tag, ctx);
630 if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
631 tag->contents_begin && tag->contents_end)
634 get_urls_css (ctx, tag->contents_begin - ctx->text,
635 tag->contents_end - tag->contents_begin);
639 /* Analyze HTML tags in FILE and construct a list of URLs referenced from
640 it. It merges relative links in FILE with URL. It is aware of
641 <base href=...> and does the right thing. */
/* URL, when non-NULL, is the base for relative links; otherwise
   opt.base_href is used.  If META_DISALLOW_FOLLOW is non-NULL it
   receives the nofollow state that the META robots handling left in
   the context. */
644 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
646 struct file_memory *fm;
647 struct map_context ctx;
651 fm = read_file (file);
654 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
657 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
/* Start with an empty URL list for this document. */
659 ctx.text = fm->content;
660 ctx.head = ctx.tail = NULL;
662 ctx.parent_base = url ? url : opt.base_href;
663 ctx.document_file = file;
664 ctx.nofollow = false;
/* The lookup tables are shared across calls; a NULL interesting_tags
   means they have not been built yet. */
666 if (!interesting_tags)
669 /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
670 generate <a href=" foo"> instead of <a href="foo"> (browsers
671 ignore spaces as well.) If you really mean space, use &#32; or
672 %20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
673 e.g. in <img src="foo.[newline]html">. Such newlines are also
674 ignored by IE and Mozilla and are presumably introduced by
675 writing HTML with editors that force word wrap. */
676 flags = MHT_TRIM_VALUES;
677 if (opt.strict_comments)
678 flags |= MHT_STRICT_COMMENTS;
680 /* the NULL here used to be interesting_tags */
681 map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
682 NULL, interesting_attributes);
684 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
685 if (meta_disallow_follow)
686 *meta_disallow_follow = ctx.nofollow;
/* ctx.base is assigned only by tag_handle_base, which stores an
   allocated string there. */
688 xfree_null (ctx.base);
693 /* This doesn't really have anything to do with HTML, but it's similar
694 to get_urls_html, so we put it here. */
/* Read FILE as a plain list of URLs, one per line.  Each line is
   trimmed of surrounding whitespace, empty lines are skipped, and the
   remaining text is parsed with url_parse; a line that does not parse
   is reported together with the parser's error text.  opt.base_href is
   merged with the line's text before parsing. */
697 get_urls_file (const char *file)
699 struct file_memory *fm;
700 struct urlpos *head, *tail;
701 const char *text, *text_end;
704 fm = read_file (file);
707 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
710 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
714 text_end = fm->content + fm->length;
715 while (text < text_end)
719 struct urlpos *entry;
/* Look for the newline that ends the current line; the search is
   bounded by the end of the buffer. */
722 const char *line_beg = text;
723 const char *line_end = memchr (text, '\n', text_end - text);
730 /* Strip whitespace from the beginning and end of line. */
731 while (line_beg < line_end && c_isspace (*line_beg))
733 while (line_end > line_beg && c_isspace (*(line_end - 1)))
736 if (line_beg == line_end)
739 /* The URL is in the [line_beg, line_end) region. */
741 /* We must copy the URL to a zero-terminated string, and we
742 can't use alloca because we're in a loop. *sigh*. */
743 url_text = strdupdelim (line_beg, line_end);
747 /* Merge opt.base_href with URL. */
748 char *merged = uri_merge (opt.base_href, url_text);
753 set_ugly_no_encode (true);
754 url = url_parse (url_text, &up_error_code);
755 set_ugly_no_encode (false);
758 logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
759 file, url_text, url_error (up_error_code));
765 entry = xnew0 (struct urlpos);
/* Release the lookup tables built by init_interesting.  Either table
   may still be NULL if it was never built, hence the checks. */
779 cleanup_html_url (void)
781 /* Destroy the hash tables. The hash table keys and values are not
782 allocated by this code, so we don't need to free them here. */
783 if (interesting_tags)
784 hash_table_destroy (interesting_tags);
785 if (interesting_attributes)
786 hash_table_destroy (interesting_attributes);