1 /* Collect URLs from HTML source.
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
3 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget. If not, see <http://www.gnu.org/licenses/>.
20 Additional permission under GNU GPL version 3 section 7
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work. */
39 #include "html-parse.h"
48 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
50 #define DECLARE_TAG_HANDLER(fun) \
51 static void fun (int, struct taginfo *, struct map_context *)
53 DECLARE_TAG_HANDLER (tag_find_urls);
54 DECLARE_TAG_HANDLER (tag_handle_base);
55 DECLARE_TAG_HANDLER (tag_handle_form);
56 DECLARE_TAG_HANDLER (tag_handle_link);
57 DECLARE_TAG_HANDLER (tag_handle_meta);
84 /* The list of known tags and functions used for handling them. Most
85 tags are simply harvested for URLs. */
86 static struct known_tag {
89 tag_handler_t handler;
91 { TAG_A, "a", tag_find_urls },
92 { TAG_APPLET, "applet", tag_find_urls },
93 { TAG_AREA, "area", tag_find_urls },
94 { TAG_BASE, "base", tag_handle_base },
95 { TAG_BGSOUND, "bgsound", tag_find_urls },
96 { TAG_BODY, "body", tag_find_urls },
97 { TAG_EMBED, "embed", tag_find_urls },
98 { TAG_FIG, "fig", tag_find_urls },
99 { TAG_FORM, "form", tag_handle_form },
100 { TAG_FRAME, "frame", tag_find_urls },
101 { TAG_IFRAME, "iframe", tag_find_urls },
102 { TAG_IMG, "img", tag_find_urls },
103 { TAG_INPUT, "input", tag_find_urls },
104 { TAG_LAYER, "layer", tag_find_urls },
105 { TAG_LINK, "link", tag_handle_link },
106 { TAG_META, "meta", tag_handle_meta },
107 { TAG_OBJECT, "object", tag_find_urls },
108 { TAG_OVERLAY, "overlay", tag_find_urls },
109 { TAG_SCRIPT, "script", tag_find_urls },
110 { TAG_TABLE, "table", tag_find_urls },
111 { TAG_TD, "td", tag_find_urls },
112 { TAG_TH, "th", tag_find_urls }
115 /* tag_url_attributes documents which attributes of which tags contain
116 URLs to harvest. It is used by tag_find_urls. */
118 /* Defines for the FLAGS. */
120 /* The link is "inline", i.e. needs to be retrieved for this document
121 to be correctly rendered. Inline links include inlined images,
122 stylesheets, children frames, etc. */
123 #define ATTR_INLINE 1
125 /* The link is expected to yield HTML contents. It's important not to
126 try to follow HTML obtained by following e.g. <img src="...">
127 regardless of content-type. Doing this causes infinite loops for
128 "images" that return non-404 error pages with links to the same
132 /* For tags handled by tag_find_urls: attributes that contain URLs to
136 const char *attr_name;
138 } tag_url_attributes[] = {
139 { TAG_A, "href", ATTR_HTML },
140 { TAG_APPLET, "code", ATTR_INLINE },
141 { TAG_AREA, "href", ATTR_HTML },
142 { TAG_BGSOUND, "src", ATTR_INLINE },
143 { TAG_BODY, "background", ATTR_INLINE },
144 { TAG_EMBED, "href", ATTR_HTML },
145 { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },
146 { TAG_FIG, "src", ATTR_INLINE },
147 { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },
148 { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },
149 { TAG_IMG, "href", ATTR_INLINE },
150 { TAG_IMG, "lowsrc", ATTR_INLINE },
151 { TAG_IMG, "src", ATTR_INLINE },
152 { TAG_INPUT, "src", ATTR_INLINE },
153 { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
154 { TAG_OBJECT, "data", ATTR_INLINE },
155 { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
156 { TAG_SCRIPT, "src", ATTR_INLINE },
157 { TAG_TABLE, "background", ATTR_INLINE },
158 { TAG_TD, "background", ATTR_INLINE },
159 { TAG_TH, "background", ATTR_INLINE }
162 /* The lists of interesting tags and attributes are built dynamically,
163 from the information above. However, some places in the code refer
164 to the attributes not mentioned here. We add them manually. */
165 static const char *additional_attributes[] = {
166 "rel", /* used by tag_handle_link */
167 "type", /* used by tag_handle_link */
168 "http-equiv", /* used by tag_handle_meta */
169 "name", /* used by tag_handle_meta */
170 "content", /* used by tag_handle_meta */
171 "action", /* used by tag_handle_form */
172 "style" /* used by check_style_attr */
175 static struct hash_table *interesting_tags;
176 static struct hash_table *interesting_attributes;
178 /* Will contain the (last) charset found in 'http-equiv=content-type'
180 static char *meta_charset;
183 init_interesting (void)
185 /* Init the variables interesting_tags and interesting_attributes
186 that are used by the HTML parser to know which tags and
187 attributes we're interested in. We initialize this only once,
188 for performance reasons.
190 Here we also make sure that what we put in interesting_tags
191 matches the user's preferences as specified through --ignore-tags
192 and --follow-tags. */
195 interesting_tags = make_nocase_string_hash_table (countof (known_tags));
197 /* First, add all the tags we know how to handle, mapped to their
198 respective entries in known_tags. */
199 for (i = 0; i < countof (known_tags); i++)
200 hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
202 /* Then remove the tags ignored through --ignore-tags. */
206 for (ignored = opt.ignore_tags; *ignored; ignored++)
207 hash_table_remove (interesting_tags, *ignored);
210 /* If --follow-tags is specified, use only those tags. */
213 /* Create a new table intersecting --follow-tags and known_tags,
214 and use it as interesting_tags. */
215 struct hash_table *intersect = make_nocase_string_hash_table (0);
217 for (followed = opt.follow_tags; *followed; followed++)
219 struct known_tag *t = hash_table_get (interesting_tags, *followed);
221 continue; /* ignore unknown --follow-tags entries. */
222 hash_table_put (intersect, *followed, t);
224 hash_table_destroy (interesting_tags);
225 interesting_tags = intersect;
228 /* Add the attributes we care about. */
229 interesting_attributes = make_nocase_string_hash_table (10);
230 for (i = 0; i < countof (additional_attributes); i++)
231 hash_table_put (interesting_attributes, additional_attributes[i], "1");
232 for (i = 0; i < countof (tag_url_attributes); i++)
233 hash_table_put (interesting_attributes,
234 tag_url_attributes[i].attr_name, "1");
237 /* Find the value of attribute named NAME in the taginfo TAG. If the
238 attribute is not present, return NULL. If ATTRIND is non-NULL, the
239 index of the attribute in TAG will be stored there. */
242 find_attr (struct taginfo *tag, const char *name, int *attrind)
245 for (i = 0; i < tag->nattrs; i++)
246 if (!strcasecmp (tag->attrs[i].name, name))
250 return tag->attrs[i].value;
255 /* used for calls to append_url */
256 #define ATTR_POS(tag, attrind, ctx) \
257 (tag->attrs[attrind].value_raw_beginning - ctx->text)
258 #define ATTR_SIZE(tag, attrind) \
259 (tag->attrs[attrind].value_raw_size)
261 /* Append LINK_URI to the urlpos structure that is being built.
263 LINK_URI will be merged with the current document base.
267 append_url (const char *link_uri, int position, int size,
268 struct map_context *ctx)
270 int link_has_scheme = url_has_scheme (link_uri);
271 struct urlpos *newel;
272 const char *base = ctx->base ? ctx->base : ctx->parent_base;
277 DEBUGP (("%s: no base, merge will use \"%s\".\n",
278 ctx->document_file, link_uri));
280 if (!link_has_scheme)
282 /* Base URL is unavailable, and the link does not have a
283 location attached to it -- we have to give up. Since
284 this can only happen when using `--force-html -i', print
286 logprintf (LOG_NOTQUIET,
287 _("%s: Cannot resolve incomplete link %s.\n"),
288 ctx->document_file, link_uri);
292 url = url_parse (link_uri, NULL, NULL, false);
295 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
296 ctx->document_file, link_uri));
302 /* Merge BASE with LINK_URI, but also make sure the result is
303 canonicalized, i.e. that "../" have been resolved.
304 (url_parse will do that for us.) */
306 char *complete_uri = uri_merge (base, link_uri);
308 DEBUGP (("%s: merge(%s, %s) -> %s\n",
309 quotearg_n_style (0, escape_quoting_style, ctx->document_file),
311 quote_n (2, link_uri),
312 quotearg_n_style (3, escape_quoting_style, complete_uri)));
314 url = url_parse (complete_uri, NULL, NULL, false);
317 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
318 ctx->document_file, complete_uri));
319 xfree (complete_uri);
322 xfree (complete_uri);
325 DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
327 newel = xnew0 (struct urlpos);
329 newel->pos = position;
332 /* A URL is relative if the host is not named, and the name does not
334 if (!link_has_scheme && *link_uri != '/')
335 newel->link_relative_p = 1;
336 else if (link_has_scheme)
337 newel->link_complete_p = 1;
339 /* Append the new URL maintaining the order by position. */
340 if (ctx->head == NULL)
344 struct urlpos *it, *prev = NULL;
347 while (it && position > it->pos)
365 check_style_attr (struct taginfo *tag, struct map_context *ctx)
370 char *style = find_attr (tag, "style", &attrind);
374 /* raw pos and raw size include the quotes, skip them when they are
376 raw_start = ATTR_POS (tag, attrind, ctx);
377 raw_len = ATTR_SIZE (tag, attrind);
378 if( *(char *)(ctx->text + raw_start) == '\''
379 || *(char *)(ctx->text + raw_start) == '"')
388 get_urls_css (ctx, raw_start, raw_len);
391 /* All the tag_* functions are called from collect_tags_mapper, as
392 specified by KNOWN_TAGS. */
394 /* Default tag handler: collect URLs from attributes specified for
395 this tag by tag_url_attributes. */
398 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
404 for (i = 0; i < countof (tag_url_attributes); i++)
405 if (tag_url_attributes[i].tagid == tagid)
407 /* We've found the index of tag_url_attributes where the
408 attributes of our tag begin. */
412 assert (first != -1);
414 /* Loop over the "interesting" attributes of this tag. In this
415 example, it will loop over "src" and "lowsrc".
417 <img src="foo.png" lowsrc="bar.png">
419 This has to be done in the outer loop so that the attributes are
420 processed in the same order in which they appear in the page.
421 This is required when converting links. */
423 for (attrind = 0; attrind < tag->nattrs; attrind++)
425 /* Find whether TAG/ATTRIND is a combination that contains a
427 char *link = tag->attrs[attrind].value;
428 const size_t size = countof (tag_url_attributes);
430 /* If you're cringing at the inefficiency of the nested loops,
431 remember that they both iterate over a very small number of
432 items. The worst-case inner loop is for the IMG tag, which
433 has three attributes. */
434 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
436 if (0 == strcasecmp (tag->attrs[attrind].name,
437 tag_url_attributes[i].attr_name))
439 struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
440 ATTR_SIZE(tag,attrind), ctx);
443 int flags = tag_url_attributes[i].flags;
444 if (flags & ATTR_INLINE)
445 up->link_inline_p = 1;
446 if (flags & ATTR_HTML)
447 up->link_expect_html = 1;
454 /* Handle the BASE tag, for <base href=...>. */
457 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
459 struct urlpos *base_urlpos;
461 char *newbase = find_attr (tag, "href", &attrind);
465 base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
466 ATTR_SIZE(tag,attrind), ctx);
469 base_urlpos->ignore_when_downloading = 1;
470 base_urlpos->link_base_p = 1;
474 if (ctx->parent_base)
475 ctx->base = uri_merge (ctx->parent_base, newbase);
477 ctx->base = xstrdup (newbase);
480 /* Mark the URL found in <form action=...> for conversion. */
483 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
486 char *action = find_attr (tag, "action", &attrind);
490 struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
491 ATTR_SIZE(tag,attrind), ctx);
493 up->ignore_when_downloading = 1;
497 /* Handle the LINK tag. It requires special handling because how its
498 links will be followed in -p mode depends on the REL attribute. */
501 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
504 char *href = find_attr (tag, "href", &attrind);
506 /* All <link href="..."> link references are external, except those
507 known not to be, such as style sheet and shortcut icon:
509 <link rel="stylesheet" href="...">
510 <link rel="shortcut icon" href="...">
514 struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
515 ATTR_SIZE(tag,attrind), ctx);
518 char *rel = find_attr (tag, "rel", NULL);
521 if (0 == strcasecmp (rel, "stylesheet"))
523 up->link_inline_p = 1;
524 up->link_expect_css = 1;
526 else if (0 == strcasecmp (rel, "shortcut icon"))
528 up->link_inline_p = 1;
532 /* The external ones usually point to HTML pages, such as
533 <link rel="next" href="...">
534 except when the type attribute says otherwise:
535 <link rel="alternate" type="application/rss+xml" href=".../?feed=rss2" />
537 char *type = find_attr (tag, "type", NULL);
538 if (!type || strcasecmp (type, "text/html") == 0)
539 up->link_expect_html = 1;
546 /* Handle the META tag. This requires special handling because of the
547 refresh feature and because of robot exclusion. */
550 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
552 char *name = find_attr (tag, "name", NULL);
553 char *http_equiv = find_attr (tag, "http-equiv", NULL);
555 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
557 /* Some pages use a META tag to specify that the page be
558 refreshed by a new page after a given number of seconds. The
559 general format for this is:
561 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
563 So we just need to skip past the "NUMBER; URL=" garbage to
566 struct urlpos *entry;
571 char *refresh = find_attr (tag, "content", &attrind);
575 for (p = refresh; c_isdigit (*p); p++)
576 timeout = 10 * timeout + *p - '0';
580 while (c_isspace (*p))
582 if (!( c_toupper (*p) == 'U'
583 && c_toupper (*(p + 1)) == 'R'
584 && c_toupper (*(p + 2)) == 'L'
588 while (c_isspace (*p))
591 entry = append_url (p, ATTR_POS(tag,attrind,ctx),
592 ATTR_SIZE(tag,attrind), ctx);
595 entry->link_refresh_p = 1;
596 entry->refresh_timeout = timeout;
597 entry->link_expect_html = 1;
600 else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
602 /* Handle stuff like:
603 <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
606 char *content = find_attr (tag, "content", NULL);
610 mcharset = parse_charset (content);
614 xfree_null (meta_charset);
615 meta_charset = mcharset;
617 else if (name && 0 == strcasecmp (name, "robots"))
619 /* Handle stuff like:
620 <meta name="robots" content="index,nofollow"> */
621 char *content = find_attr (tag, "content", NULL);
624 if (!strcasecmp (content, "none"))
625 ctx->nofollow = true;
631 /* Skip any initial whitespace. */
632 content += strspn (content, " \f\n\r\t\v");
633 /* Find the next occurrence of ',' or whitespace,
634 * or the end of the string. */
635 end = content + strcspn (content, ", \f\n\r\t\v");
636 if (!strncasecmp (content, "nofollow", end - content))
637 ctx->nofollow = true;
638 /* Skip past the next comma, if any. */
643 end = strchr (end, ',');
647 end = content + strlen (content);
655 /* Dispatch the tag handler appropriate for the tag we're mapping
656 over. See known_tags[] for definition of tag handlers. */
659 collect_tags_mapper (struct taginfo *tag, void *arg)
661 struct map_context *ctx = (struct map_context *)arg;
663 /* Find the tag in our table of tags. This must not fail because
664 map_html_tags only returns tags found in interesting_tags.
666 I've changed this for now, I'm passing NULL as interesting_tags
667 to map_html_tags. This way we can check all tags for a style
670 struct known_tag *t = hash_table_get (interesting_tags, tag->name);
673 t->handler (t->tagid, tag, ctx);
675 check_style_attr (tag, ctx);
677 if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
678 tag->contents_begin && tag->contents_end)
681 get_urls_css (ctx, tag->contents_begin - ctx->text,
682 tag->contents_end - tag->contents_begin);
686 /* Analyze HTML tags FILE and construct a list of URLs referenced from
687 it. It merges relative links in FILE with URL. It is aware of
688 <base href=...> and does the right thing. */
691 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
694 struct file_memory *fm;
695 struct map_context ctx;
699 fm = wget_read_file (file);
702 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
705 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
707 ctx.text = fm->content;
710 ctx.parent_base = url ? url : opt.base_href;
711 ctx.document_file = file;
712 ctx.nofollow = false;
714 if (!interesting_tags)
717 /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
718 generate <a href=" foo"> instead of <a href="foo"> (browsers
719 ignore spaces as well.) If you really mean space, use &#32; or
720 %20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
721 e.g. in <img src="foo.[newline]html">. Such newlines are also
722 ignored by IE and Mozilla and are presumably introduced by
723 writing HTML with editors that force word wrap. */
724 flags = MHT_TRIM_VALUES;
725 if (opt.strict_comments)
726 flags |= MHT_STRICT_COMMENTS;
728 /* the NULL here used to be interesting_tags */
729 map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
730 NULL, interesting_attributes);
732 /* If meta charset isn't null, override content encoding */
733 if (iri && meta_charset)
734 set_content_encoding (iri, meta_charset);
736 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
737 if (meta_disallow_follow)
738 *meta_disallow_follow = ctx.nofollow;
740 xfree_null (ctx.base);
741 wget_read_file_free (fm);
745 /* This doesn't really have anything to do with HTML, but it's similar
746 to get_urls_html, so we put it here. */
749 get_urls_file (const char *file)
751 struct file_memory *fm;
752 struct urlpos *head, *tail;
753 const char *text, *text_end;
756 fm = wget_read_file (file);
759 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
762 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
766 text_end = fm->content + fm->length;
767 while (text < text_end)
771 struct urlpos *entry;
774 const char *line_beg = text;
775 const char *line_end = memchr (text, '\n', text_end - text);
782 /* Strip whitespace from the beginning and end of line. */
783 while (line_beg < line_end && c_isspace (*line_beg))
785 while (line_end > line_beg && c_isspace (*(line_end - 1)))
788 if (line_beg == line_end)
791 /* The URL is in the [line_beg, line_end) region. */
793 /* We must copy the URL to a zero-terminated string, and we
794 can't use alloca because we're in a loop. *sigh*. */
795 url_text = strdupdelim (line_beg, line_end);
799 /* Merge opt.base_href with URL. */
800 char *merged = uri_merge (opt.base_href, url_text);
805 url = url_parse (url_text, &up_error_code, NULL, false);
808 char *error = url_error (url_text, up_error_code);
809 logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
810 file, url_text, error);
817 entry = xnew0 (struct urlpos);
826 wget_read_file_free (fm);
831 cleanup_html_url (void)
833 /* Destroy the hash tables. The hash table keys and values are not
834 allocated by this code, so we don't need to free them here. */
835 if (interesting_tags)
836 hash_table_destroy (interesting_tags);
837 if (interesting_attributes)
838 hash_table_destroy (interesting_attributes);