1 /* Collect URLs from HTML source.
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
3 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget. If not, see <http://www.gnu.org/licenses/>.
20 Additional permission under GNU GPL version 3 section 7
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work. */
40 #include "html-parse.h"
/* Signature shared by all tag handlers.  collect_tags_mapper invokes a
   handler as handler (tagid, tag, ctx): the tag's id as recorded in
   known_tags, the tag being processed, and the mapping context. */
49 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
/* Forward-declare the static tag handler FUN with the tag_handler_t
   signature, so that it can be named in the known_tags table before
   its definition appears. */
51 #define DECLARE_TAG_HANDLER(fun) \
52 static void fun (int, struct taginfo *, struct map_context *)
/* The handlers referenced from known_tags. */
54 DECLARE_TAG_HANDLER (tag_find_urls);
55 DECLARE_TAG_HANDLER (tag_handle_base);
56 DECLARE_TAG_HANDLER (tag_handle_form);
57 DECLARE_TAG_HANDLER (tag_handle_link);
58 DECLARE_TAG_HANDLER (tag_handle_meta);
88 /* The list of known tags and functions used for handling them. Most
89 tags are simply harvested for URLs. */
90 static struct known_tag {
93 tag_handler_t handler;
95 { TAG_A, "a", tag_find_urls },
96 { TAG_APPLET, "applet", tag_find_urls },
97 { TAG_AREA, "area", tag_find_urls },
98 { TAG_BASE, "base", tag_handle_base },
99 { TAG_BGSOUND, "bgsound", tag_find_urls },
100 { TAG_BODY, "body", tag_find_urls },
101 { TAG_EMBED, "embed", tag_find_urls },
102 { TAG_FIG, "fig", tag_find_urls },
103 { TAG_FORM, "form", tag_handle_form },
104 { TAG_FRAME, "frame", tag_find_urls },
105 { TAG_IFRAME, "iframe", tag_find_urls },
106 { TAG_IMG, "img", tag_find_urls },
107 { TAG_INPUT, "input", tag_find_urls },
108 { TAG_LAYER, "layer", tag_find_urls },
109 { TAG_LINK, "link", tag_handle_link },
110 { TAG_META, "meta", tag_handle_meta },
111 { TAG_OBJECT, "object", tag_find_urls },
112 { TAG_OVERLAY, "overlay", tag_find_urls },
113 { TAG_SCRIPT, "script", tag_find_urls },
114 { TAG_TABLE, "table", tag_find_urls },
115 { TAG_TD, "td", tag_find_urls },
116 { TAG_TH, "th", tag_find_urls },
117 { TAG_VIDEO, "video", tag_find_urls },
118 { TAG_AUDIO, "audio", tag_find_urls },
119 { TAG_SOURCE, "source", tag_find_urls }
122 /* tag_url_attributes documents which attributes of which tags contain
123 URLs to harvest. It is used by tag_find_urls. */
125 /* Defines for the FLAGS. */
127 /* The link is "inline", i.e. needs to be retrieved for this document
128 to be correctly rendered. Inline links include inlined images,
129 stylesheets, children frames, etc. */
130 #define ATTR_INLINE 1
132 /* The link is expected to yield HTML contents. It's important not to
133 try to follow HTML obtained by following e.g. <img src="...">
134 regardless of content-type. Doing this causes infinite loops for
135 "images" that return non-404 error pages with links to the same
139 /* For tags handled by tag_find_urls: attributes that contain URLs to
143 const char *attr_name;
145 } tag_url_attributes[] = {
146 { TAG_A, "href", ATTR_HTML },
147 { TAG_APPLET, "code", ATTR_INLINE },
148 { TAG_AREA, "href", ATTR_HTML },
149 { TAG_BGSOUND, "src", ATTR_INLINE },
150 { TAG_BODY, "background", ATTR_INLINE },
151 { TAG_EMBED, "href", ATTR_HTML },
152 { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },
153 { TAG_FIG, "src", ATTR_INLINE },
154 { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },
155 { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },
156 { TAG_IMG, "href", ATTR_INLINE },
157 { TAG_IMG, "lowsrc", ATTR_INLINE },
158 { TAG_IMG, "src", ATTR_INLINE },
159 { TAG_INPUT, "src", ATTR_INLINE },
160 { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
161 { TAG_OBJECT, "data", ATTR_INLINE },
162 { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
163 { TAG_SCRIPT, "src", ATTR_INLINE },
164 { TAG_TABLE, "background", ATTR_INLINE },
165 { TAG_TD, "background", ATTR_INLINE },
166 { TAG_TH, "background", ATTR_INLINE },
167 { TAG_VIDEO, "src", ATTR_INLINE },
168 { TAG_VIDEO, "poster", ATTR_INLINE },
169 { TAG_AUDIO, "src", ATTR_INLINE },
170 { TAG_AUDIO, "poster", ATTR_INLINE },
171 { TAG_SOURCE, "src", ATTR_INLINE }
174 /* The lists of interesting tags and attributes are built dynamically,
175 from the information above. However, some places in the code refer
176 to the attributes not mentioned here. We add them manually. */
177 static const char *additional_attributes[] = {
178 "rel", /* used by tag_handle_link */
179 "type", /* used by tag_handle_link */
180 "http-equiv", /* used by tag_handle_meta */
181 "name", /* used by tag_handle_meta */
182 "content", /* used by tag_handle_meta */
183 "action", /* used by tag_handle_form */
184 "style" /* used by check_style_attr */
187 static struct hash_table *interesting_tags;
188 static struct hash_table *interesting_attributes;
190 /* Will contain the (last) charset found in 'http-equiv=content-type'
192 static char *meta_charset;
195 init_interesting (void)
197 /* Init the variables interesting_tags and interesting_attributes
198 that are used by the HTML parser to know which tags and
199 attributes we're interested in. We initialize this only once,
200 for performance reasons.
202 Here we also make sure that what we put in interesting_tags
203 matches the user's preferences as specified through --ignore-tags
204 and --follow-tags. */
207 interesting_tags = make_nocase_string_hash_table (countof (known_tags));
209 /* First, add all the tags we know how to handle, mapped to their
210 respective entries in known_tags. */
211 for (i = 0; i < countof (known_tags); i++)
212 hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
214 /* Then remove the tags ignored through --ignore-tags. */
218 for (ignored = opt.ignore_tags; *ignored; ignored++)
219 hash_table_remove (interesting_tags, *ignored);
222 /* If --follow-tags is specified, use only those tags. */
225 /* Create a new table intersecting --follow-tags and known_tags,
226 and use it as interesting_tags. */
227 struct hash_table *intersect = make_nocase_string_hash_table (0);
229 for (followed = opt.follow_tags; *followed; followed++)
231 struct known_tag *t = hash_table_get (interesting_tags, *followed);
233 continue; /* ignore unknown --follow-tags entries. */
234 hash_table_put (intersect, *followed, t);
236 hash_table_destroy (interesting_tags);
237 interesting_tags = intersect;
240 /* Add the attributes we care about. */
241 interesting_attributes = make_nocase_string_hash_table (10);
242 for (i = 0; i < countof (additional_attributes); i++)
243 hash_table_put (interesting_attributes, additional_attributes[i], "1");
244 for (i = 0; i < countof (tag_url_attributes); i++)
245 hash_table_put (interesting_attributes,
246 tag_url_attributes[i].attr_name, "1");
249 /* Find the value of attribute named NAME in the taginfo TAG. If the
250 attribute is not present, return NULL. If ATTRIND is non-NULL, the
251 index of the attribute in TAG will be stored there. */
254 find_attr (struct taginfo *tag, const char *name, int *attrind)
257 for (i = 0; i < tag->nattrs; i++)
258 if (!strcasecmp (tag->attrs[i].name, name))
262 return tag->attrs[i].value;
267 /* Used for calls to append_url.  ATTR_POS is the offset of the raw
   value of attribute number ATTRIND of TAG, measured from the start
   of CTX->text; ATTR_SIZE is the size of that raw value.  According
   to the note in check_style_attr, the raw position and size include
   the quotes around the value.  The macro arguments are used
   unparenthesized, so pass plain identifiers only. */
268 #define ATTR_POS(tag, attrind, ctx) \
269 (tag->attrs[attrind].value_raw_beginning - ctx->text)
270 #define ATTR_SIZE(tag, attrind) \
271 (tag->attrs[attrind].value_raw_size)
273 /* Append LINK_URI to the urlpos structure that is being built.
275 LINK_URI will be merged with the current document base.
279 append_url (const char *link_uri, int position, int size,
280 struct map_context *ctx)
282 int link_has_scheme = url_has_scheme (link_uri);
283 struct urlpos *newel;
284 const char *base = ctx->base ? ctx->base : ctx->parent_base;
287 struct iri *iri = iri_new ();
288 set_uri_encoding (iri, opt.locale, true);
289 iri->utf8_encode = true;
293 DEBUGP (("%s: no base, merge will use \"%s\".\n",
294 ctx->document_file, link_uri));
296 if (!link_has_scheme)
298 /* Base URL is unavailable, and the link does not have a
299 location attached to it -- we have to give up. Since
300 this can only happen when using `--force-html -i', print
302 logprintf (LOG_NOTQUIET,
303 _("%s: Cannot resolve incomplete link %s.\n"),
304 ctx->document_file, link_uri);
308 url = url_parse (link_uri, NULL, iri, false);
311 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
312 ctx->document_file, link_uri));
318 /* Merge BASE with LINK_URI, but also make sure the result is
319 canonicalized, i.e. that "../" have been resolved.
320 (url_parse will do that for us.) */
322 char *complete_uri = uri_merge (base, link_uri);
324 DEBUGP (("%s: merge(%s, %s) -> %s\n",
325 quotearg_n_style (0, escape_quoting_style, ctx->document_file),
327 quote_n (2, link_uri),
328 quotearg_n_style (3, escape_quoting_style, complete_uri)));
330 url = url_parse (complete_uri, NULL, iri, false);
333 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
334 ctx->document_file, complete_uri));
335 xfree (complete_uri);
338 xfree (complete_uri);
343 DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
345 newel = xnew0 (struct urlpos);
347 newel->pos = position;
350 /* A URL is relative if the host is not named, and the name does not
352 if (!link_has_scheme && *link_uri != '/')
353 newel->link_relative_p = 1;
354 else if (link_has_scheme)
355 newel->link_complete_p = 1;
357 /* Append the new URL maintaining the order by position. */
358 if (ctx->head == NULL)
362 struct urlpos *it, *prev = NULL;
365 while (it && position > it->pos)
383 check_style_attr (struct taginfo *tag, struct map_context *ctx)
388 char *style = find_attr (tag, "style", &attrind);
392 /* raw pos and raw size include the quotes, skip them when they are
394 raw_start = ATTR_POS (tag, attrind, ctx);
395 raw_len = ATTR_SIZE (tag, attrind);
396 if( *(char *)(ctx->text + raw_start) == '\''
397 || *(char *)(ctx->text + raw_start) == '"')
406 get_urls_css (ctx, raw_start, raw_len);
409 /* All the tag_* functions are called from collect_tags_mapper, as
410 specified by KNOWN_TAGS. */
412 /* Default tag handler: collect URLs from attributes specified for
413 this tag by tag_url_attributes. */
416 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
422 for (i = 0; i < countof (tag_url_attributes); i++)
423 if (tag_url_attributes[i].tagid == tagid)
425 /* We've found the index of tag_url_attributes where the
426 attributes of our tag begin. */
430 assert (first != -1);
432 /* Loop over the "interesting" attributes of this tag. In this
433 example, it will loop over "src" and "lowsrc".
435 <img src="foo.png" lowsrc="bar.png">
437 This has to be done in the outer loop so that the attributes are
438 processed in the same order in which they appear in the page.
439 This is required when converting links. */
441 for (attrind = 0; attrind < tag->nattrs; attrind++)
443 /* Find whether TAG/ATTRIND is a combination that contains a
445 char *link = tag->attrs[attrind].value;
446 const size_t size = countof (tag_url_attributes);
448 /* If you're cringing at the inefficiency of the nested loops,
449 remember that they both iterate over a very small number of
450 items. The worst-case inner loop is for the IMG tag, which
451 has three attributes. */
452 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
454 if (0 == strcasecmp (tag->attrs[attrind].name,
455 tag_url_attributes[i].attr_name))
457 struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
458 ATTR_SIZE(tag,attrind), ctx);
461 int flags = tag_url_attributes[i].flags;
462 if (flags & ATTR_INLINE)
463 up->link_inline_p = 1;
464 if (flags & ATTR_HTML)
465 up->link_expect_html = 1;
472 /* Handle the BASE tag, for <base href=...>. */
475 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
477 struct urlpos *base_urlpos;
479 char *newbase = find_attr (tag, "href", &attrind);
483 base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
484 ATTR_SIZE(tag,attrind), ctx);
487 base_urlpos->ignore_when_downloading = 1;
488 base_urlpos->link_base_p = 1;
492 if (ctx->parent_base)
493 ctx->base = uri_merge (ctx->parent_base, newbase);
495 ctx->base = xstrdup (newbase);
498 /* Mark the URL found in <form action=...> for conversion. */
501 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
504 char *action = find_attr (tag, "action", &attrind);
508 struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
509 ATTR_SIZE(tag,attrind), ctx);
511 up->ignore_when_downloading = 1;
515 /* Handle the LINK tag. It requires special handling because how its
516 links will be followed in -p mode depends on the REL attribute. */
519 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
522 char *href = find_attr (tag, "href", &attrind);
524 /* All <link href="..."> link references are external, except those
525 known not to be, such as style sheet and shortcut icon:
527 <link rel="stylesheet" href="...">
528 <link rel="shortcut icon" href="...">
532 struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
533 ATTR_SIZE(tag,attrind), ctx);
536 char *rel = find_attr (tag, "rel", NULL);
539 if (0 == strcasecmp (rel, "stylesheet"))
541 up->link_inline_p = 1;
542 up->link_expect_css = 1;
544 else if (0 == strcasecmp (rel, "shortcut icon"))
546 up->link_inline_p = 1;
550 /* The external ones usually point to HTML pages, such as
551 <link rel="next" href="...">
552 except when the type attribute says otherwise:
553 <link rel="alternate" type="application/rss+xml" href=".../?feed=rss2" />
555 char *type = find_attr (tag, "type", NULL);
556 if (!type || strcasecmp (type, "text/html") == 0)
557 up->link_expect_html = 1;
564 /* Handle the META tag. This requires special handling because of the
565 refresh feature and because of robot exclusion. */
568 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
570 char *name = find_attr (tag, "name", NULL);
571 char *http_equiv = find_attr (tag, "http-equiv", NULL);
573 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
575 /* Some pages use a META tag to specify that the page be
576 refreshed by a new page after a given number of seconds. The
577 general format for this is:
579 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
581 So we just need to skip past the "NUMBER; URL=" garbage to
584 struct urlpos *entry;
589 char *refresh = find_attr (tag, "content", &attrind);
593 for (p = refresh; c_isdigit (*p); p++)
594 timeout = 10 * timeout + *p - '0';
598 while (c_isspace (*p))
600 if (!( c_toupper (*p) == 'U'
601 && c_toupper (*(p + 1)) == 'R'
602 && c_toupper (*(p + 2)) == 'L'
606 while (c_isspace (*p))
609 entry = append_url (p, ATTR_POS(tag,attrind,ctx),
610 ATTR_SIZE(tag,attrind), ctx);
613 entry->link_refresh_p = 1;
614 entry->refresh_timeout = timeout;
615 entry->link_expect_html = 1;
618 else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
620 /* Handle stuff like:
621 <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
624 char *content = find_attr (tag, "content", NULL);
628 mcharset = parse_charset (content);
632 xfree_null (meta_charset);
633 meta_charset = mcharset;
635 else if (name && 0 == strcasecmp (name, "robots"))
637 /* Handle stuff like:
638 <meta name="robots" content="index,nofollow"> */
639 char *content = find_attr (tag, "content", NULL);
642 if (!strcasecmp (content, "none"))
643 ctx->nofollow = true;
649 /* Skip any initial whitespace. */
650 content += strspn (content, " \f\n\r\t\v");
651 /* Find the next occurrence of ',' or whitespace,
652 * or the end of the string. */
653 end = content + strcspn (content, ", \f\n\r\t\v");
654 if (!strncasecmp (content, "nofollow", end - content))
655 ctx->nofollow = true;
656 /* Skip past the next comma, if any. */
661 end = strchr (end, ',');
665 end = content + strlen (content);
673 /* Dispatch the tag handler appropriate for the tag we're mapping
674 over. See known_tags[] for definition of tag handlers. */
677 collect_tags_mapper (struct taginfo *tag, void *arg)
679 struct map_context *ctx = (struct map_context *)arg;
681 /* Find the tag in our table of tags. This must not fail because
682 map_html_tags only returns tags found in interesting_tags.
684 I've changed this for now, I'm passing NULL as interesting_tags
685 to map_html_tags. This way we can check all tags for a style
688 struct known_tag *t = hash_table_get (interesting_tags, tag->name);
691 t->handler (t->tagid, tag, ctx);
693 check_style_attr (tag, ctx);
695 if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style"))
696 && tag->contents_begin && tag->contents_end
697 && tag->contents_begin <= tag->contents_end)
700 get_urls_css (ctx, tag->contents_begin - ctx->text,
701 tag->contents_end - tag->contents_begin);
705 /* Analyze the HTML tags in FILE and construct a list of URLs referenced from
706 it. It merges relative links in FILE with URL. It is aware of
707 <base href=...> and does the right thing. */
710 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
713 struct file_memory *fm;
714 struct map_context ctx;
718 fm = wget_read_file (file);
721 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
724 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
726 ctx.text = fm->content;
729 ctx.parent_base = url ? url : opt.base_href;
730 ctx.document_file = file;
731 ctx.nofollow = false;
733 if (!interesting_tags)
736 /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
737 generate <a href=" foo"> instead of <a href="foo"> (browsers
738 ignore spaces as well.) If you really mean space, use &32; or
739 %20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
740 e.g. in <img src="foo.[newline]html">. Such newlines are also
741 ignored by IE and Mozilla and are presumably introduced by
742 writing HTML with editors that force word wrap. */
743 flags = MHT_TRIM_VALUES;
744 if (opt.strict_comments)
745 flags |= MHT_STRICT_COMMENTS;
747 /* the NULL here used to be interesting_tags */
748 map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
749 NULL, interesting_attributes);
751 /* If meta charset isn't null, override content encoding */
752 if (iri && meta_charset)
753 set_content_encoding (iri, meta_charset);
755 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
756 if (meta_disallow_follow)
757 *meta_disallow_follow = ctx.nofollow;
759 xfree_null (ctx.base);
760 wget_read_file_free (fm);
764 /* This doesn't really have anything to do with HTML, but it's similar
765 to get_urls_html, so we put it here. */
768 get_urls_file (const char *file)
770 struct file_memory *fm;
771 struct urlpos *head, *tail;
772 const char *text, *text_end;
775 fm = wget_read_file (file);
778 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
781 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
785 text_end = fm->content + fm->length;
786 while (text < text_end)
790 struct urlpos *entry;
793 const char *line_beg = text;
794 const char *line_end = memchr (text, '\n', text_end - text);
801 /* Strip whitespace from the beginning and end of line. */
802 while (line_beg < line_end && c_isspace (*line_beg))
804 while (line_end > line_beg && c_isspace (*(line_end - 1)))
807 if (line_beg == line_end)
810 /* The URL is in the [line_beg, line_end) region. */
812 /* We must copy the URL to a zero-terminated string, and we
813 can't use alloca because we're in a loop. *sigh*. */
814 url_text = strdupdelim (line_beg, line_end);
818 /* Merge opt.base_href with URL. */
819 char *merged = uri_merge (opt.base_href, url_text);
824 char *new_url = rewrite_shorthand_url (url_text);
831 url = url_parse (url_text, &up_error_code, NULL, false);
834 char *error = url_error (url_text, up_error_code);
835 logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
836 file, url_text, error);
839 inform_exit_status (URLERROR);
844 entry = xnew0 (struct urlpos);
853 wget_read_file_free (fm);
858 cleanup_html_url (void)
860 /* Destroy the hash tables. The hash table keys and values are not
861 allocated by this code, so we don't need to free them here. */
862 if (interesting_tags)
863 hash_table_destroy (interesting_tags);
864 if (interesting_attributes)
865 hash_table_destroy (interesting_attributes);