1 /* Collect URLs from HTML source.
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
3 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget. If not, see <http://www.gnu.org/licenses/>.
20 Additional permission under GNU GPL version 3 section 7
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work. */
40 #include "html-parse.h"
49 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
51 #define DECLARE_TAG_HANDLER(fun) \
52 static void fun (int, struct taginfo *, struct map_context *)
54 DECLARE_TAG_HANDLER (tag_find_urls);
55 DECLARE_TAG_HANDLER (tag_handle_base);
56 DECLARE_TAG_HANDLER (tag_handle_form);
57 DECLARE_TAG_HANDLER (tag_handle_link);
58 DECLARE_TAG_HANDLER (tag_handle_meta);
85 /* The list of known tags and functions used for handling them. Most
86 tags are simply harvested for URLs. */
87 static struct known_tag {
90 tag_handler_t handler;
92 { TAG_A, "a", tag_find_urls },
93 { TAG_APPLET, "applet", tag_find_urls },
94 { TAG_AREA, "area", tag_find_urls },
95 { TAG_BASE, "base", tag_handle_base },
96 { TAG_BGSOUND, "bgsound", tag_find_urls },
97 { TAG_BODY, "body", tag_find_urls },
98 { TAG_EMBED, "embed", tag_find_urls },
99 { TAG_FIG, "fig", tag_find_urls },
100 { TAG_FORM, "form", tag_handle_form },
101 { TAG_FRAME, "frame", tag_find_urls },
102 { TAG_IFRAME, "iframe", tag_find_urls },
103 { TAG_IMG, "img", tag_find_urls },
104 { TAG_INPUT, "input", tag_find_urls },
105 { TAG_LAYER, "layer", tag_find_urls },
106 { TAG_LINK, "link", tag_handle_link },
107 { TAG_META, "meta", tag_handle_meta },
108 { TAG_OBJECT, "object", tag_find_urls },
109 { TAG_OVERLAY, "overlay", tag_find_urls },
110 { TAG_SCRIPT, "script", tag_find_urls },
111 { TAG_TABLE, "table", tag_find_urls },
112 { TAG_TD, "td", tag_find_urls },
113 { TAG_TH, "th", tag_find_urls }
116 /* tag_url_attributes documents which attributes of which tags contain
117 URLs to harvest. It is used by tag_find_urls. */
119 /* Defines for the FLAGS. */
121 /* The link is "inline", i.e. needs to be retrieved for this document
122 to be correctly rendered. Inline links include inlined images,
123 stylesheets, children frames, etc. */
124 #define ATTR_INLINE 1
126 /* The link is expected to yield HTML contents. It's important not to
127 try to follow HTML obtained by following e.g. <img src="...">
128 regardless of content-type. Doing this causes infinite loops for
129 "images" that return non-404 error pages with links to the same
133 /* For tags handled by tag_find_urls: attributes that contain URLs to
137 const char *attr_name;
139 } tag_url_attributes[] = {
140 { TAG_A, "href", ATTR_HTML },
141 { TAG_APPLET, "code", ATTR_INLINE },
142 { TAG_AREA, "href", ATTR_HTML },
143 { TAG_BGSOUND, "src", ATTR_INLINE },
144 { TAG_BODY, "background", ATTR_INLINE },
145 { TAG_EMBED, "href", ATTR_HTML },
146 { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },
147 { TAG_FIG, "src", ATTR_INLINE },
148 { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },
149 { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },
150 { TAG_IMG, "href", ATTR_INLINE },
151 { TAG_IMG, "lowsrc", ATTR_INLINE },
152 { TAG_IMG, "src", ATTR_INLINE },
153 { TAG_INPUT, "src", ATTR_INLINE },
154 { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
155 { TAG_OBJECT, "data", ATTR_INLINE },
156 { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
157 { TAG_SCRIPT, "src", ATTR_INLINE },
158 { TAG_TABLE, "background", ATTR_INLINE },
159 { TAG_TD, "background", ATTR_INLINE },
160 { TAG_TH, "background", ATTR_INLINE }
163 /* The lists of interesting tags and attributes are built dynamically,
164 from the information above. However, some places in the code refer
165 to the attributes not mentioned here. We add them manually. */
166 static const char *additional_attributes[] = {
167 "rel", /* used by tag_handle_link */
168 "type", /* used by tag_handle_link */
169 "http-equiv", /* used by tag_handle_meta */
170 "name", /* used by tag_handle_meta */
171 "content", /* used by tag_handle_meta */
172 "action", /* used by tag_handle_form */
173 "style" /* used by check_style_attr */
176 static struct hash_table *interesting_tags;
177 static struct hash_table *interesting_attributes;
179 /* Will contain the (last) charset found in 'http-equiv=content-type'
181 static char *meta_charset;
184 init_interesting (void)
186 /* Init the variables interesting_tags and interesting_attributes
187 that are used by the HTML parser to know which tags and
188 attributes we're interested in. We initialize this only once,
189 for performance reasons.
191 Here we also make sure that what we put in interesting_tags
192 matches the user's preferences as specified through --ignore-tags
193 and --follow-tags. */
196 interesting_tags = make_nocase_string_hash_table (countof (known_tags));
198 /* First, add all the tags we know how to handle, mapped to their
199 respective entries in known_tags. */
200 for (i = 0; i < countof (known_tags); i++)
201 hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
203 /* Then remove the tags ignored through --ignore-tags. */
207 for (ignored = opt.ignore_tags; *ignored; ignored++)
208 hash_table_remove (interesting_tags, *ignored);
211 /* If --follow-tags is specified, use only those tags. */
214 /* Create a new table intersecting --follow-tags and known_tags,
215 and use it as interesting_tags. */
216 struct hash_table *intersect = make_nocase_string_hash_table (0);
218 for (followed = opt.follow_tags; *followed; followed++)
220 struct known_tag *t = hash_table_get (interesting_tags, *followed);
222 continue; /* ignore unknown --follow-tags entries. */
223 hash_table_put (intersect, *followed, t);
225 hash_table_destroy (interesting_tags);
226 interesting_tags = intersect;
229 /* Add the attributes we care about. */
230 interesting_attributes = make_nocase_string_hash_table (10);
231 for (i = 0; i < countof (additional_attributes); i++)
232 hash_table_put (interesting_attributes, additional_attributes[i], "1");
233 for (i = 0; i < countof (tag_url_attributes); i++)
234 hash_table_put (interesting_attributes,
235 tag_url_attributes[i].attr_name, "1");
238 /* Find the value of attribute named NAME in the taginfo TAG. If the
239 attribute is not present, return NULL. If ATTRIND is non-NULL, the
240 index of the attribute in TAG will be stored there. */
243 find_attr (struct taginfo *tag, const char *name, int *attrind)
246 for (i = 0; i < tag->nattrs; i++)
247 if (!strcasecmp (tag->attrs[i].name, name))
251 return tag->attrs[i].value;
256 /* Used for calls to append_url. ATTR_POS is the offset of attribute ATTRIND's raw value from the start of ctx->text; ATTR_SIZE is the size of that raw value. Both refer to the raw text, which may include the surrounding quotes. */
257 #define ATTR_POS(tag, attrind, ctx) \
258 (tag->attrs[attrind].value_raw_beginning - ctx->text)
259 #define ATTR_SIZE(tag, attrind) \
260 (tag->attrs[attrind].value_raw_size)
262 /* Append LINK_URI to the urlpos structure that is being built.
264 LINK_URI will be merged with the current document base.
268 append_url (const char *link_uri, int position, int size,
269 struct map_context *ctx)
271 int link_has_scheme = url_has_scheme (link_uri);
272 struct urlpos *newel;
273 const char *base = ctx->base ? ctx->base : ctx->parent_base;
278 DEBUGP (("%s: no base, merge will use \"%s\".\n",
279 ctx->document_file, link_uri));
281 if (!link_has_scheme)
283 /* Base URL is unavailable, and the link does not have a
284 location attached to it -- we have to give up. Since
285 this can only happen when using `--force-html -i', print
287 logprintf (LOG_NOTQUIET,
288 _("%s: Cannot resolve incomplete link %s.\n"),
289 ctx->document_file, link_uri);
293 url = url_parse (link_uri, NULL, NULL, false);
296 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
297 ctx->document_file, link_uri));
303 /* Merge BASE with LINK_URI, but also make sure the result is
304 canonicalized, i.e. that "../" have been resolved.
305 (url_parse will do that for us.) */
307 char *complete_uri = uri_merge (base, link_uri);
309 DEBUGP (("%s: merge(%s, %s) -> %s\n",
310 quotearg_n_style (0, escape_quoting_style, ctx->document_file),
312 quote_n (2, link_uri),
313 quotearg_n_style (3, escape_quoting_style, complete_uri)));
315 url = url_parse (complete_uri, NULL, NULL, false);
318 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
319 ctx->document_file, complete_uri));
320 xfree (complete_uri);
323 xfree (complete_uri);
326 DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
328 newel = xnew0 (struct urlpos);
330 newel->pos = position;
333 /* A URL is relative if the host is not named, and the name does not
335 if (!link_has_scheme && *link_uri != '/')
336 newel->link_relative_p = 1;
337 else if (link_has_scheme)
338 newel->link_complete_p = 1;
340 /* Append the new URL maintaining the order by position. */
341 if (ctx->head == NULL)
345 struct urlpos *it, *prev = NULL;
348 while (it && position > it->pos)
366 check_style_attr (struct taginfo *tag, struct map_context *ctx)
371 char *style = find_attr (tag, "style", &attrind);
375 /* raw pos and raw size include the quotes, skip them when they are
377 raw_start = ATTR_POS (tag, attrind, ctx);
378 raw_len = ATTR_SIZE (tag, attrind);
379 if( *(char *)(ctx->text + raw_start) == '\''
380 || *(char *)(ctx->text + raw_start) == '"')
389 get_urls_css (ctx, raw_start, raw_len);
392 /* All the tag_* functions are called from collect_tags_mapper, as
393 specified by KNOWN_TAGS. */
395 /* Default tag handler: collect URLs from attributes specified for
396 this tag by tag_url_attributes. */
399 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
405 for (i = 0; i < countof (tag_url_attributes); i++)
406 if (tag_url_attributes[i].tagid == tagid)
408 /* We've found the index of tag_url_attributes where the
409 attributes of our tag begin. */
413 assert (first != -1);
415 /* Loop over the "interesting" attributes of this tag. In this
416 example, it will loop over "src" and "lowsrc".
418 <img src="foo.png" lowsrc="bar.png">
420 This has to be done in the outer loop so that the attributes are
421 processed in the same order in which they appear in the page.
422 This is required when converting links. */
424 for (attrind = 0; attrind < tag->nattrs; attrind++)
426 /* Find whether TAG/ATTRIND is a combination that contains a
428 char *link = tag->attrs[attrind].value;
429 const size_t size = countof (tag_url_attributes);
431 /* If you're cringing at the inefficiency of the nested loops,
432 remember that they both iterate over a very small number of
433 items. The worst-case inner loop is for the IMG tag, which
434 has three attributes. */
435 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
437 if (0 == strcasecmp (tag->attrs[attrind].name,
438 tag_url_attributes[i].attr_name))
440 struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
441 ATTR_SIZE(tag,attrind), ctx);
444 int flags = tag_url_attributes[i].flags;
445 if (flags & ATTR_INLINE)
446 up->link_inline_p = 1;
447 if (flags & ATTR_HTML)
448 up->link_expect_html = 1;
455 /* Handle the BASE tag, for <base href=...>. */
458 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
460 struct urlpos *base_urlpos;
462 char *newbase = find_attr (tag, "href", &attrind);
466 base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
467 ATTR_SIZE(tag,attrind), ctx);
470 base_urlpos->ignore_when_downloading = 1;
471 base_urlpos->link_base_p = 1;
475 if (ctx->parent_base)
476 ctx->base = uri_merge (ctx->parent_base, newbase);
478 ctx->base = xstrdup (newbase);
481 /* Mark the URL found in <form action=...> for conversion. */
484 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
487 char *action = find_attr (tag, "action", &attrind);
491 struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
492 ATTR_SIZE(tag,attrind), ctx);
494 up->ignore_when_downloading = 1;
498 /* Handle the LINK tag. It requires special handling because how its
499 links will be followed in -p mode depends on the REL attribute. */
502 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
505 char *href = find_attr (tag, "href", &attrind);
507 /* All <link href="..."> link references are external, except those
508 known not to be, such as style sheet and shortcut icon:
510 <link rel="stylesheet" href="...">
511 <link rel="shortcut icon" href="...">
515 struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
516 ATTR_SIZE(tag,attrind), ctx);
519 char *rel = find_attr (tag, "rel", NULL);
522 if (0 == strcasecmp (rel, "stylesheet"))
524 up->link_inline_p = 1;
525 up->link_expect_css = 1;
527 else if (0 == strcasecmp (rel, "shortcut icon"))
529 up->link_inline_p = 1;
533 /* The external ones usually point to HTML pages, such as
534 <link rel="next" href="...">
535 except when the type attribute says otherwise:
536 <link rel="alternate" type="application/rss+xml" href=".../?feed=rss2" />
538 char *type = find_attr (tag, "type", NULL);
539 if (!type || strcasecmp (type, "text/html") == 0)
540 up->link_expect_html = 1;
547 /* Handle the META tag. This requires special handling because of the
548 refresh feature and because of robot exclusion. */
551 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
553 char *name = find_attr (tag, "name", NULL);
554 char *http_equiv = find_attr (tag, "http-equiv", NULL);
556 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
558 /* Some pages use a META tag to specify that the page be
559 refreshed by a new page after a given number of seconds. The
560 general format for this is:
562 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
564 So we just need to skip past the "NUMBER; URL=" garbage to
567 struct urlpos *entry;
572 char *refresh = find_attr (tag, "content", &attrind);
576 for (p = refresh; c_isdigit (*p); p++)
577 timeout = 10 * timeout + *p - '0';
581 while (c_isspace (*p))
583 if (!( c_toupper (*p) == 'U'
584 && c_toupper (*(p + 1)) == 'R'
585 && c_toupper (*(p + 2)) == 'L'
589 while (c_isspace (*p))
592 entry = append_url (p, ATTR_POS(tag,attrind,ctx),
593 ATTR_SIZE(tag,attrind), ctx);
596 entry->link_refresh_p = 1;
597 entry->refresh_timeout = timeout;
598 entry->link_expect_html = 1;
601 else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
603 /* Handle stuff like:
604 <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
607 char *content = find_attr (tag, "content", NULL);
611 mcharset = parse_charset (content);
615 xfree_null (meta_charset);
616 meta_charset = mcharset;
618 else if (name && 0 == strcasecmp (name, "robots"))
620 /* Handle stuff like:
621 <meta name="robots" content="index,nofollow"> */
622 char *content = find_attr (tag, "content", NULL);
625 if (!strcasecmp (content, "none"))
626 ctx->nofollow = true;
632 /* Skip any initial whitespace. */
633 content += strspn (content, " \f\n\r\t\v");
634 /* Find the next occurrence of ',' or whitespace,
635 * or the end of the string. */
636 end = content + strcspn (content, ", \f\n\r\t\v");
637 if (!strncasecmp (content, "nofollow", end - content))
638 ctx->nofollow = true;
639 /* Skip past the next comma, if any. */
644 end = strchr (end, ',');
648 end = content + strlen (content);
656 /* Dispatch the tag handler appropriate for the tag we're mapping
657 over. See known_tags[] for definition of tag handlers. */
660 collect_tags_mapper (struct taginfo *tag, void *arg)
662 struct map_context *ctx = (struct map_context *)arg;
664 /* Find the tag in our table of tags. This must not fail because
665 map_html_tags only returns tags found in interesting_tags.
667 I've changed this for now, I'm passing NULL as interesting_tags
668 to map_html_tags. This way we can check all tags for a style
671 struct known_tag *t = hash_table_get (interesting_tags, tag->name);
674 t->handler (t->tagid, tag, ctx);
676 check_style_attr (tag, ctx);
678 if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
679 tag->contents_begin && tag->contents_end)
682 get_urls_css (ctx, tag->contents_begin - ctx->text,
683 tag->contents_end - tag->contents_begin);
687 /* Analyze the HTML tags in FILE and construct a list of URLs referenced from
688 it. It merges relative links in FILE with URL. It is aware of
689 <base href=...> and does the right thing. */
692 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
695 struct file_memory *fm;
696 struct map_context ctx;
700 fm = wget_read_file (file);
703 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
706 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
708 ctx.text = fm->content;
711 ctx.parent_base = url ? url : opt.base_href;
712 ctx.document_file = file;
713 ctx.nofollow = false;
715 if (!interesting_tags)
718 /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
719 generate <a href=" foo"> instead of <a href="foo"> (browsers
720 ignore spaces as well.) If you really mean space, use &#32; or
721 %20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
722 e.g. in <img src="foo.[newline]html">. Such newlines are also
723 ignored by IE and Mozilla and are presumably introduced by
724 writing HTML with editors that force word wrap. */
725 flags = MHT_TRIM_VALUES;
726 if (opt.strict_comments)
727 flags |= MHT_STRICT_COMMENTS;
729 /* the NULL here used to be interesting_tags */
730 map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
731 NULL, interesting_attributes);
733 /* If meta charset isn't null, override content encoding */
734 if (iri && meta_charset)
735 set_content_encoding (iri, meta_charset);
737 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
738 if (meta_disallow_follow)
739 *meta_disallow_follow = ctx.nofollow;
741 xfree_null (ctx.base);
742 wget_read_file_free (fm);
746 /* This doesn't really have anything to do with HTML, but it's similar
747 to get_urls_html, so we put it here. */
750 get_urls_file (const char *file)
752 struct file_memory *fm;
753 struct urlpos *head, *tail;
754 const char *text, *text_end;
757 fm = wget_read_file (file);
760 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
763 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
767 text_end = fm->content + fm->length;
768 while (text < text_end)
772 struct urlpos *entry;
775 const char *line_beg = text;
776 const char *line_end = memchr (text, '\n', text_end - text);
783 /* Strip whitespace from the beginning and end of line. */
784 while (line_beg < line_end && c_isspace (*line_beg))
786 while (line_end > line_beg && c_isspace (*(line_end - 1)))
789 if (line_beg == line_end)
792 /* The URL is in the [line_beg, line_end) region. */
794 /* We must copy the URL to a zero-terminated string, and we
795 can't use alloca because we're in a loop. *sigh*. */
796 url_text = strdupdelim (line_beg, line_end);
800 /* Merge opt.base_href with URL. */
801 char *merged = uri_merge (opt.base_href, url_text);
806 url = url_parse (url_text, &up_error_code, NULL, false);
809 char *error = url_error (url_text, up_error_code);
810 logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
811 file, url_text, error);
814 inform_exit_status (URLERROR);
819 entry = xnew0 (struct urlpos);
828 wget_read_file_free (fm);
833 cleanup_html_url (void)
835 /* Destroy the hash tables. The hash table keys and values are not
836 allocated by this code, so we don't need to free them here. */
837 if (interesting_tags)
838 hash_table_destroy (interesting_tags);
839 if (interesting_attributes)
840 hash_table_destroy (interesting_attributes);