1 /* Collect URLs from HTML source.
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
3 2007, 2008 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget. If not, see <http://www.gnu.org/licenses/>.
20 Additional permission under GNU GPL version 3 section 7
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work. */
39 #include "html-parse.h"
/* Signature shared by all tag handlers.  A handler is invoked with the
   numeric id of the tag, the parsed tag itself, and the mapping
   context in which collected URLs are accumulated.  */
typedef void (*tag_handler_t) (int tagid, struct taginfo *tag,
                               struct map_context *ctx);

/* Forward-declare a file-local tag handler called HANDLER_NAME whose
   prototype matches tag_handler_t.  */
#define DECLARE_TAG_HANDLER(handler_name)                               \
  static void handler_name (int tagid, struct taginfo *tag,             \
                            struct map_context *ctx)

DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
84 /* The list of known tags and functions used for handling them. Most
85 tags are simply harvested for URLs. */
86 static struct known_tag {
89 tag_handler_t handler;
91 { TAG_A, "a", tag_find_urls },
92 { TAG_APPLET, "applet", tag_find_urls },
93 { TAG_AREA, "area", tag_find_urls },
94 { TAG_BASE, "base", tag_handle_base },
95 { TAG_BGSOUND, "bgsound", tag_find_urls },
96 { TAG_BODY, "body", tag_find_urls },
97 { TAG_EMBED, "embed", tag_find_urls },
98 { TAG_FIG, "fig", tag_find_urls },
99 { TAG_FORM, "form", tag_handle_form },
100 { TAG_FRAME, "frame", tag_find_urls },
101 { TAG_IFRAME, "iframe", tag_find_urls },
102 { TAG_IMG, "img", tag_find_urls },
103 { TAG_INPUT, "input", tag_find_urls },
104 { TAG_LAYER, "layer", tag_find_urls },
105 { TAG_LINK, "link", tag_handle_link },
106 { TAG_META, "meta", tag_handle_meta },
107 { TAG_OBJECT, "object", tag_find_urls },
108 { TAG_OVERLAY, "overlay", tag_find_urls },
109 { TAG_SCRIPT, "script", tag_find_urls },
110 { TAG_TABLE, "table", tag_find_urls },
111 { TAG_TD, "td", tag_find_urls },
112 { TAG_TH, "th", tag_find_urls }
115 /* tag_url_attributes documents which attributes of which tags contain
116 URLs to harvest. It is used by tag_find_urls. */
118 /* Defines for the FLAGS. */
120 /* The link is "inline", i.e. needs to be retrieved for this document
121 to be correctly rendered. Inline links include inlined images,
122 stylesheets, children frames, etc. */
123 #define ATTR_INLINE 1
125 /* The link is expected to yield HTML contents. It's important not to
126 try to follow HTML obtained by following e.g. <img src="...">
127 regardless of content-type. Doing this causes infinite loops for
128 "images" that return non-404 error pages with links to the same
132 /* For tags handled by tag_find_urls: attributes that contain URLs to
136 const char *attr_name;
138 } tag_url_attributes[] = {
139 { TAG_A, "href", ATTR_HTML },
140 { TAG_APPLET, "code", ATTR_INLINE },
141 { TAG_AREA, "href", ATTR_HTML },
142 { TAG_BGSOUND, "src", ATTR_INLINE },
143 { TAG_BODY, "background", ATTR_INLINE },
144 { TAG_EMBED, "href", ATTR_HTML },
145 { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },
146 { TAG_FIG, "src", ATTR_INLINE },
147 { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },
148 { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },
149 { TAG_IMG, "href", ATTR_INLINE },
150 { TAG_IMG, "lowsrc", ATTR_INLINE },
151 { TAG_IMG, "src", ATTR_INLINE },
152 { TAG_INPUT, "src", ATTR_INLINE },
153 { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
154 { TAG_OBJECT, "data", ATTR_INLINE },
155 { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
156 { TAG_SCRIPT, "src", ATTR_INLINE },
157 { TAG_TABLE, "background", ATTR_INLINE },
158 { TAG_TD, "background", ATTR_INLINE },
159 { TAG_TH, "background", ATTR_INLINE }
162 /* The lists of interesting tags and attributes are built dynamically,
163 from the information above. However, some places in the code refer
164 to the attributes not mentioned here. We add them manually. */
165 static const char *additional_attributes[] = {
166 "rel", /* used by tag_handle_link */
167 "http-equiv", /* used by tag_handle_meta */
168 "name", /* used by tag_handle_meta */
169 "content", /* used by tag_handle_meta */
170 "action", /* used by tag_handle_form */
171 "style" /* used by check_style_attr */
174 static struct hash_table *interesting_tags;
175 static struct hash_table *interesting_attributes;
177 /* Will contain the (last) charset found in 'http-equiv=content-type'
179 static char *meta_charset;
182 init_interesting (void)
184 /* Init the variables interesting_tags and interesting_attributes
185 that are used by the HTML parser to know which tags and
186 attributes we're interested in. We initialize this only once,
187 for performance reasons.
189 Here we also make sure that what we put in interesting_tags
190 matches the user's preferences as specified through --ignore-tags
191 and --follow-tags. */
194 interesting_tags = make_nocase_string_hash_table (countof (known_tags));
196 /* First, add all the tags we know how to handle, mapped to their
197 respective entries in known_tags. */
198 for (i = 0; i < countof (known_tags); i++)
199 hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
201 /* Then remove the tags ignored through --ignore-tags. */
205 for (ignored = opt.ignore_tags; *ignored; ignored++)
206 hash_table_remove (interesting_tags, *ignored);
209 /* If --follow-tags is specified, use only those tags. */
212 /* Create a new table intersecting --follow-tags and known_tags,
213 and use it as interesting_tags. */
214 struct hash_table *intersect = make_nocase_string_hash_table (0);
216 for (followed = opt.follow_tags; *followed; followed++)
218 struct known_tag *t = hash_table_get (interesting_tags, *followed);
220 continue; /* ignore unknown --follow-tags entries. */
221 hash_table_put (intersect, *followed, t);
223 hash_table_destroy (interesting_tags);
224 interesting_tags = intersect;
227 /* Add the attributes we care about. */
228 interesting_attributes = make_nocase_string_hash_table (10);
229 for (i = 0; i < countof (additional_attributes); i++)
230 hash_table_put (interesting_attributes, additional_attributes[i], "1");
231 for (i = 0; i < countof (tag_url_attributes); i++)
232 hash_table_put (interesting_attributes,
233 tag_url_attributes[i].attr_name, "1");
236 /* Find the value of attribute named NAME in the taginfo TAG. If the
237 attribute is not present, return NULL. If ATTRIND is non-NULL, the
238 index of the attribute in TAG will be stored there. */
241 find_attr (struct taginfo *tag, const char *name, int *attrind)
244 for (i = 0; i < tag->nattrs; i++)
245 if (!strcasecmp (tag->attrs[i].name, name))
249 return tag->attrs[i].value;
/* Helpers that compute the position and size arguments for calls to
   append_url.

   ATTR_POS yields the offset of the raw value of attribute ATTRIND of
   TAG, measured from the start of the document text (CTX->text).
   ATTR_SIZE yields the size of that raw value.

   Every macro argument is parenthesized so that arbitrary expressions
   (for example `&tag' or `&ctx') expand with the intended
   precedence.  */
#define ATTR_POS(tag, attrind, ctx)                                     \
  ((tag)->attrs[(attrind)].value_raw_beginning - (ctx)->text)
#define ATTR_SIZE(tag, attrind)                                         \
  ((tag)->attrs[(attrind)].value_raw_size)
260 /* Append LINK_URI to the urlpos structure that is being built.
262 LINK_URI will be merged with the current document base.
266 append_url (const char *link_uri, int position, int size,
267 struct map_context *ctx)
269 int link_has_scheme = url_has_scheme (link_uri);
270 struct urlpos *newel;
271 const char *base = ctx->base ? ctx->base : ctx->parent_base;
276 DEBUGP (("%s: no base, merge will use \"%s\".\n",
277 ctx->document_file, link_uri));
279 if (!link_has_scheme)
281 /* Base URL is unavailable, and the link does not have a
282 location attached to it -- we have to give up. Since
283 this can only happen when using `--force-html -i', print
285 logprintf (LOG_NOTQUIET,
286 _("%s: Cannot resolve incomplete link %s.\n"),
287 ctx->document_file, link_uri);
291 url = url_parse (link_uri, NULL, NULL, false);
294 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
295 ctx->document_file, link_uri));
301 /* Merge BASE with LINK_URI, but also make sure the result is
302 canonicalized, i.e. that "../" have been resolved.
303 (url_parse will do that for us.) */
305 char *complete_uri = uri_merge (base, link_uri);
307 DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
308 ctx->document_file, base, link_uri, complete_uri));
310 url = url_parse (complete_uri, NULL, NULL, false);
313 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
314 ctx->document_file, complete_uri));
315 xfree (complete_uri);
318 xfree (complete_uri);
321 DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
323 newel = xnew0 (struct urlpos);
325 newel->pos = position;
328 /* A URL is relative if the host is not named, and the name does not
330 if (!link_has_scheme && *link_uri != '/')
331 newel->link_relative_p = 1;
332 else if (link_has_scheme)
333 newel->link_complete_p = 1;
337 ctx->tail->next = newel;
341 ctx->tail = ctx->head = newel;
347 check_style_attr (struct taginfo *tag, struct map_context *ctx)
350 char *style = find_attr (tag, "style", &attrind);
354 /* raw pos and raw size include the quotes, hence the +1 -2 */
355 get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2);
358 /* All the tag_* functions are called from collect_tags_mapper, as
359 specified by KNOWN_TAGS. */
361 /* Default tag handler: collect URLs from attributes specified for
362 this tag by tag_url_attributes. */
365 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
371 for (i = 0; i < countof (tag_url_attributes); i++)
372 if (tag_url_attributes[i].tagid == tagid)
374 /* We've found the index of tag_url_attributes where the
375 attributes of our tag begin. */
379 assert (first != -1);
381 /* Loop over the "interesting" attributes of this tag. In this
382 example, it will loop over "src" and "lowsrc".
384 <img src="foo.png" lowsrc="bar.png">
386 This has to be done in the outer loop so that the attributes are
387 processed in the same order in which they appear in the page.
388 This is required when converting links. */
390 for (attrind = 0; attrind < tag->nattrs; attrind++)
392 /* Find whether TAG/ATTRIND is a combination that contains a
394 char *link = tag->attrs[attrind].value;
395 const size_t size = countof (tag_url_attributes);
397 /* If you're cringing at the inefficiency of the nested loops,
398 remember that they both iterate over a very small number of
399 items. The worst-case inner loop is for the IMG tag, which
400 has three attributes. */
401 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
403 if (0 == strcasecmp (tag->attrs[attrind].name,
404 tag_url_attributes[i].attr_name))
406 struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
407 ATTR_SIZE(tag,attrind), ctx);
410 int flags = tag_url_attributes[i].flags;
411 if (flags & ATTR_INLINE)
412 up->link_inline_p = 1;
413 if (flags & ATTR_HTML)
414 up->link_expect_html = 1;
421 /* Handle the BASE tag, for <base href=...>. */
424 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
426 struct urlpos *base_urlpos;
428 char *newbase = find_attr (tag, "href", &attrind);
432 base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
433 ATTR_SIZE(tag,attrind), ctx);
436 base_urlpos->ignore_when_downloading = 1;
437 base_urlpos->link_base_p = 1;
441 if (ctx->parent_base)
442 ctx->base = uri_merge (ctx->parent_base, newbase);
444 ctx->base = xstrdup (newbase);
447 /* Mark the URL found in <form action=...> for conversion. */
450 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
453 char *action = find_attr (tag, "action", &attrind);
457 struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
458 ATTR_SIZE(tag,attrind), ctx);
460 up->ignore_when_downloading = 1;
464 /* Handle the LINK tag. It requires special handling because the way its
465 links are followed in -p mode depends on the REL attribute. */
468 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
471 char *href = find_attr (tag, "href", &attrind);
473 /* All <link href="..."> link references are external, except those
474 known not to be, such as style sheet and shortcut icon:
476 <link rel="stylesheet" href="...">
477 <link rel="shortcut icon" href="...">
481 struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
482 ATTR_SIZE(tag,attrind), ctx);
485 char *rel = find_attr (tag, "rel", NULL);
488 if (0 == strcasecmp (rel, "stylesheet"))
490 up->link_inline_p = 1;
491 up->link_expect_css = 1;
493 else if (0 == strcasecmp (rel, "shortcut icon"))
495 up->link_inline_p = 1;
499 /* The external ones usually point to HTML pages, such as
500 <link rel="next" href="..."> */
501 up->link_expect_html = 1;
506 /* Handle the META tag. This requires special handling because of the
507 refresh feature and because of robot exclusion. */
510 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
512 char *name = find_attr (tag, "name", NULL);
513 char *http_equiv = find_attr (tag, "http-equiv", NULL);
515 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
517 /* Some pages use a META tag to specify that the page be
518 refreshed by a new page after a given number of seconds. The
519 general format for this is:
521 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
523 So we just need to skip past the "NUMBER; URL=" garbage to
526 struct urlpos *entry;
531 char *refresh = find_attr (tag, "content", &attrind);
535 for (p = refresh; c_isdigit (*p); p++)
536 timeout = 10 * timeout + *p - '0';
540 while (c_isspace (*p))
542 if (!( c_toupper (*p) == 'U'
543 && c_toupper (*(p + 1)) == 'R'
544 && c_toupper (*(p + 2)) == 'L'
548 while (c_isspace (*p))
551 entry = append_url (p, ATTR_POS(tag,attrind,ctx),
552 ATTR_SIZE(tag,attrind), ctx);
555 entry->link_refresh_p = 1;
556 entry->refresh_timeout = timeout;
557 entry->link_expect_html = 1;
560 else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
562 /* Handle stuff like:
563 <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
566 char *content = find_attr (tag, "content", NULL);
570 mcharset = parse_charset (content);
574 xfree_null (meta_charset);
575 meta_charset = mcharset;
577 else if (name && 0 == strcasecmp (name, "robots"))
579 /* Handle stuff like:
580 <meta name="robots" content="index,nofollow"> */
581 char *content = find_attr (tag, "content", NULL);
584 if (!strcasecmp (content, "none"))
585 ctx->nofollow = true;
590 /* Find the next occurrence of ',' or the end of
592 char *end = strchr (content, ',');
596 end = content + strlen (content);
597 if (!strncasecmp (content, "nofollow", end - content))
598 ctx->nofollow = true;
605 /* Dispatch the tag handler appropriate for the tag we're mapping
606 over. See known_tags[] for definition of tag handlers. */
609 collect_tags_mapper (struct taginfo *tag, void *arg)
611 struct map_context *ctx = (struct map_context *)arg;
613 /* Find the tag in our table of tags. This must not fail because
614 map_html_tags only returns tags found in interesting_tags.
616 I've changed this for now, I'm passing NULL as interesting_tags
617 to map_html_tags. This way we can check all tags for a style
620 struct known_tag *t = hash_table_get (interesting_tags, tag->name);
623 t->handler (t->tagid, tag, ctx);
625 check_style_attr (tag, ctx);
627 if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
628 tag->contents_begin && tag->contents_end)
631 get_urls_css (ctx, tag->contents_begin - ctx->text,
632 tag->contents_end - tag->contents_begin);
636 /* Analyze the HTML tags in FILE and construct a list of URLs referenced from
637 it. It merges relative links in FILE with URL. It is aware of
638 <base href=...> and does the right thing. */
641 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
644 struct file_memory *fm;
645 struct map_context ctx;
649 fm = read_file (file);
652 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
655 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
657 ctx.text = fm->content;
658 ctx.head = ctx.tail = NULL;
660 ctx.parent_base = url ? url : opt.base_href;
661 ctx.document_file = file;
662 ctx.nofollow = false;
664 if (!interesting_tags)
667 /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
668 generate <a href=" foo"> instead of <a href="foo"> (browsers
669 ignore spaces as well.) If you really mean space, use &#32; or
670 %20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
671 e.g. in <img src="foo.[newline]html">. Such newlines are also
672 ignored by IE and Mozilla and are presumably introduced by
673 writing HTML with editors that force word wrap. */
674 flags = MHT_TRIM_VALUES;
675 if (opt.strict_comments)
676 flags |= MHT_STRICT_COMMENTS;
678 /* the NULL here used to be interesting_tags */
679 map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
680 NULL, interesting_attributes);
682 /* If meta charset isn't null, override content encoding */
683 if (iri && meta_charset)
684 set_content_encoding (iri, meta_charset);
686 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
687 if (meta_disallow_follow)
688 *meta_disallow_follow = ctx.nofollow;
690 xfree_null (ctx.base);
695 /* This doesn't really have anything to do with HTML, but it's similar
696 to get_urls_html, so we put it here. */
699 get_urls_file (const char *file)
701 struct file_memory *fm;
702 struct urlpos *head, *tail;
703 const char *text, *text_end;
706 fm = read_file (file);
709 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
712 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
716 text_end = fm->content + fm->length;
717 while (text < text_end)
721 struct urlpos *entry;
724 const char *line_beg = text;
725 const char *line_end = memchr (text, '\n', text_end - text);
732 /* Strip whitespace from the beginning and end of line. */
733 while (line_beg < line_end && c_isspace (*line_beg))
735 while (line_end > line_beg && c_isspace (*(line_end - 1)))
738 if (line_beg == line_end)
741 /* The URL is in the [line_beg, line_end) region. */
743 /* We must copy the URL to a zero-terminated string, and we
744 can't use alloca because we're in a loop. *sigh*. */
745 url_text = strdupdelim (line_beg, line_end);
749 /* Merge opt.base_href with URL. */
750 char *merged = uri_merge (opt.base_href, url_text);
755 url = url_parse (url_text, &up_error_code, NULL, false);
758 char *error = url_error (url_text, up_error_code);
759 logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
760 file, url_text, error);
767 entry = xnew0 (struct urlpos);
781 cleanup_html_url (void)
783 /* Destroy the hash tables. The hash table keys and values are not
784 allocated by this code, so we don't need to free them here. */
785 if (interesting_tags)
786 hash_table_destroy (interesting_tags);
787 if (interesting_attributes)
788 hash_table_destroy (interesting_attributes);