1 /* Collect URLs from HTML source.
2 Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
43 #include "html-parse.h"
55 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
56 struct map_context *));
58 #define DECLARE_TAG_HANDLER(fun) \
59 static void fun PARAMS ((int, struct taginfo *, struct map_context *))
61 DECLARE_TAG_HANDLER (tag_find_urls);
62 DECLARE_TAG_HANDLER (tag_handle_base);
63 DECLARE_TAG_HANDLER (tag_handle_form);
64 DECLARE_TAG_HANDLER (tag_handle_link);
65 DECLARE_TAG_HANDLER (tag_handle_meta);
91 /* The list of known tags and functions used for handling them. Most
92 tags are simply harvested for URLs. */
93 static struct known_tag {
96 tag_handler_t handler;
98 { TAG_A, "a", tag_find_urls },
99 { TAG_APPLET, "applet", tag_find_urls },
100 { TAG_AREA, "area", tag_find_urls },
101 { TAG_BASE, "base", tag_handle_base },
102 { TAG_BGSOUND, "bgsound", tag_find_urls },
103 { TAG_BODY, "body", tag_find_urls },
104 { TAG_EMBED, "embed", tag_find_urls },
105 { TAG_FIG, "fig", tag_find_urls },
106 { TAG_FORM, "form", tag_handle_form },
107 { TAG_FRAME, "frame", tag_find_urls },
108 { TAG_IFRAME, "iframe", tag_find_urls },
109 { TAG_IMG, "img", tag_find_urls },
110 { TAG_INPUT, "input", tag_find_urls },
111 { TAG_LAYER, "layer", tag_find_urls },
112 { TAG_LINK, "link", tag_handle_link },
113 { TAG_META, "meta", tag_handle_meta },
114 { TAG_OVERLAY, "overlay", tag_find_urls },
115 { TAG_SCRIPT, "script", tag_find_urls },
116 { TAG_TABLE, "table", tag_find_urls },
117 { TAG_TD, "td", tag_find_urls },
118 { TAG_TH, "th", tag_find_urls }
121 /* tag_url_attributes documents which attributes of which tags contain
122 URLs to harvest. It is used by tag_find_urls. */
124 /* Defines for the FLAGS field; currently only one flag is defined. */
126 /* This tag points to an external document not necessary for rendering this
127 document (i.e. it's not an inlined image, stylesheet, etc.). */
128 #define TUA_EXTERNAL 1
130 /* For tags handled by tag_find_urls: attributes that contain URLs to
134 const char *attr_name;
136 } tag_url_attributes[] = {
137 { TAG_A, "href", TUA_EXTERNAL },
138 { TAG_APPLET, "code", 0 },
139 { TAG_AREA, "href", TUA_EXTERNAL },
140 { TAG_BGSOUND, "src", 0 },
141 { TAG_BODY, "background", 0 },
142 { TAG_EMBED, "href", TUA_EXTERNAL },
143 { TAG_EMBED, "src", 0 },
144 { TAG_FIG, "src", 0 },
145 { TAG_FRAME, "src", 0 },
146 { TAG_IFRAME, "src", 0 },
147 { TAG_IMG, "href", 0 },
148 { TAG_IMG, "lowsrc", 0 },
149 { TAG_IMG, "src", 0 },
150 { TAG_INPUT, "src", 0 },
151 { TAG_LAYER, "src", 0 },
152 { TAG_OVERLAY, "src", 0 },
153 { TAG_SCRIPT, "src", 0 },
154 { TAG_TABLE, "background", 0 },
155 { TAG_TD, "background", 0 },
156 { TAG_TH, "background", 0 }
/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, some places in the code refer
   to the attributes not mentioned here.  We add them manually.  */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link */
  "http-equiv",                 /* used by tag_handle_meta */
  "name",                       /* used by tag_handle_meta */
  "content",                    /* used by tag_handle_meta */
  "action"                      /* used by tag_handle_form */
};
170 struct hash_table *interesting_tags;
171 struct hash_table *interesting_attributes;
174 init_interesting (void)
176 /* Init the variables interesting_tags and interesting_attributes
177 that are used by the HTML parser to know which tags and
178 attributes we're interested in. We initialize this only once,
179 for performance reasons.
181 Here we also make sure that what we put in interesting_tags
182 matches the user's preferences as specified through --ignore-tags
183 and --follow-tags. */
186 interesting_tags = make_nocase_string_hash_table (countof (known_tags));
188 /* First, add all the tags we know hot to handle, mapped to their
189 respective entries in known_tags. */
190 for (i = 0; i < countof (known_tags); i++)
191 hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
193 /* Then remove the tags ignored through --ignore-tags. */
197 for (ignored = opt.ignore_tags; *ignored; ignored++)
198 hash_table_remove (interesting_tags, *ignored);
201 /* If --follow-tags is specified, use only those tags. */
204 /* Create a new hash table with the intersection of tags in
205 --follow-tags and known_tags, and use that as
207 struct hash_table *intersect = make_nocase_string_hash_table (0);
209 for (followed = opt.follow_tags; *followed; followed++)
211 struct known_tag *t = hash_table_get (interesting_tags, *followed);
213 continue; /* ignore unknown tags in --follow-tags. */
214 hash_table_put (intersect, *followed, t);
216 hash_table_destroy (interesting_tags);
217 interesting_tags = intersect;
220 /* Add the attributes we care about. */
221 interesting_attributes = make_nocase_string_hash_table (17);
222 for (i = 0; i < countof (additional_attributes); i++)
223 string_set_add (interesting_attributes, additional_attributes[i]);
224 for (i = 0; i < countof (tag_url_attributes); i++)
225 string_set_add (interesting_attributes, tag_url_attributes[i].attr_name);
228 /* Find the value of attribute named NAME in the taginfo TAG. If the
229 attribute is not present, return NULL. If ATTRIND is non-NULL, the
230 index of the attribute in TAG will be stored there. */
233 find_attr (struct taginfo *tag, const char *name, int *attrind)
236 for (i = 0; i < tag->nattrs; i++)
237 if (!strcasecmp (tag->attrs[i].name, name))
241 return tag->attrs[i].value;
/* State shared by the tag handlers while one document is being
   mapped.  */
struct map_context {
  char *text;                   /* HTML text. */
  char *base;                   /* Base URI of the document, possibly
                                   changed through <base href=...>. */
  const char *parent_base;      /* Base of the current document. */
  const char *document_file;    /* File name of this document. */
  int nofollow;                 /* whether NOFOLLOW was specified in a
                                   <meta name=robots> tag. */

  struct urlpos *head, *tail;   /* List of URLs that is being
                                   built. */
};
/* NOTE(review): this listing omits source lines (the embedded numbering
   jumps, e.g. 271->276, 287->291, 295->301, 324->328, 337->341).  The
   declaration of `url', the branch conditions around the visible
   statements, the statements that attach `url' to the new element, and
   every return statement are not shown -- confirm against the complete
   file before changing this function.  */
259 /* Append LINK_URI to the urlpos structure that is being built.
261 LINK_URI will be merged with the current document base. TAG and
262 ATTRIND are the necessary context to store the position and
    size.  INLINEP is stored in the new element's link_inline_p field.
    The declared return type is a pointer to the urlpos element;
    presumably NULL is returned when the link cannot be resolved or
    parsed (those return paths are not visible here).  */
265 static struct urlpos *
266 append_one_url (const char *link_uri, int inlinep,
267 struct taginfo *tag, int attrind, struct map_context *ctx)
269 int link_has_scheme = url_has_scheme (link_uri);
270 struct urlpos *newel;
/* A base set through <base href=...> takes precedence over the base
   of the document itself.  */
271 const char *base = ctx->base ? ctx->base : ctx->parent_base;
276 DEBUGP (("%s: no base, merge will use \"%s\".\n",
277 ctx->document_file, link_uri));
279 if (!link_has_scheme)
281 /* Base URL is unavailable, and the link does not have a
282 location attached to it -- we have to give up. Since
283 this can only happen when using `--force-html -i', print
    a warning.  */
285 logprintf (LOG_NOTQUIET,
286 _("%s: Cannot resolve incomplete link %s.\n"),
287 ctx->document_file, link_uri);
291 url = url_parse (link_uri, NULL);
294 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
295 ctx->document_file, link_uri));
301 /* Merge BASE with LINK_URI, but also make sure the result is
302 canonicalized, i.e. that "../" have been resolved.
303 (url_parse will do that for us.) */
305 char *complete_uri = uri_merge (base, link_uri);
307 DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
308 ctx->document_file, base, link_uri, complete_uri));
310 url = url_parse (complete_uri, NULL);
313 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
314 ctx->document_file, complete_uri));
/* complete_uri is released on both of the visible paths below.  */
315 xfree (complete_uri);
318 xfree (complete_uri);
321 DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
/* Allocate the list element and start from an all-zero state.  */
323 newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
324 memset (newel, 0, sizeof (*newel));
/* pos is the offset of the raw attribute value from the start of the
   document text; size is the length of that raw value.  */
328 newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
329 newel->size = tag->attrs[attrind].value_raw_size;
330 newel->link_inline_p = inlinep;
332 /* A URL is relative if the host is not named, and the name does not
    start with `/'.  A link that carries a scheme is marked complete
    instead; a scheme-less link starting with `/' gets neither flag.  */
334 if (!link_has_scheme && *link_uri != '/')
335 newel->link_relative_p = 1;
336 else if (link_has_scheme)
337 newel->link_complete_p = 1;
/* Visible list updates: chain the element after the current tail, or
   make it both head and tail.  The condition choosing between the two
   is not shown in this listing.  */
341 ctx->tail->next = newel;
345 ctx->tail = ctx->head = newel;
350 /* All the tag_* functions are called from collect_tags_mapper, as
351 specified by KNOWN_TAGS. */
353 /* Default tag handler: collect URLs from attributes specified for
354 this tag by tag_url_attributes. */
357 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
362 for (i = 0; i < countof (tag_url_attributes); i++)
363 if (tag_url_attributes[i].tagid == tagid)
365 /* We've found the index of tag_url_attributes where the
366 attributes of our tag begin. */
370 assert (first != -1);
372 /* Loop over the "interesting" attributes of this tag. In this
373 example, it will loop over "src" and "lowsrc".
375 <img src="foo.png" lowsrc="bar.png">
377 This has to be done in the outer loop so that the attributes are
378 processed in the same order in which they appear in the page.
379 This is required when converting links. */
381 for (attrind = 0; attrind < tag->nattrs; attrind++)
383 /* Find whether TAG/ATTRIND is a combination that contains a
385 char *link = tag->attrs[attrind].value;
386 const int size = countof (tag_url_attributes);
388 /* If you're cringing at the inefficiency of the nested loops,
389 remember that they both iterate over a very small number of
390 items. The worst-case inner loop is for the IMG tag, which
391 has three attributes. */
392 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
394 if (0 == strcasecmp (tag->attrs[attrind].name,
395 tag_url_attributes[i].attr_name))
397 int flags = tag_url_attributes[i].flags;
398 append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
404 /* Handle the BASE tag, for <base href=...>. */
407 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
409 struct urlpos *base_urlpos;
411 char *newbase = find_attr (tag, "href", &attrind);
415 base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
418 base_urlpos->ignore_when_downloading = 1;
419 base_urlpos->link_base_p = 1;
423 if (ctx->parent_base)
424 ctx->base = uri_merge (ctx->parent_base, newbase);
426 ctx->base = xstrdup (newbase);
429 /* Mark the URL found in <form action=...> for conversion. */
432 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
435 char *action = find_attr (tag, "action", &attrind);
438 struct urlpos *action_urlpos = append_one_url (action, 0, tag,
441 action_urlpos->ignore_when_downloading = 1;
/* Handle the LINK tag.  It requires special handling because how its
   links will be followed in -p mode depends on the REL attribute.

   NOTE(review): the `if (href)' guard and the beginning of the
   `inlinep' initializer were restored from the visible continuation
   lines; confirm against the complete file.  */
static void
tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
{
  int attrind;
  char *href = find_attr (tag, "href", &attrind);

  /* All <link href="..."> link references are external, except those
     known not to be, such as style sheet and shortcut icon:

       <link rel="stylesheet" href="...">
       <link rel="shortcut icon" href="...">
  */
  if (href)
    {
      char *rel = find_attr (tag, "rel", NULL);
      /* REL is matched as a whole, case-insensitively; a missing REL
         means the link is external.  */
      int inlinep = (rel
                     && (0 == strcasecmp (rel, "stylesheet")
                         || 0 == strcasecmp (rel, "shortcut icon")));
      append_one_url (href, inlinep, tag, attrind, ctx);
    }
}
/* NOTE(review): this listing omits many source lines of this function
   (the embedded numbering jumps, e.g. 490->495, 500->506, 508->515,
   529->535, 542->550).  The declarations of `p', `timeout' and
   `attrind', the NULL guards, the code that skips the separator and
   "URL=", and the statements that set ctx->nofollow are not shown --
   confirm against the complete file before changing this function.  */
470 /* Handle the META tag. This requires special handling because of the
471 refresh feature and because of robot exclusion. */
474 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
476 char *name = find_attr (tag, "name", NULL);
477 char *http_equiv = find_attr (tag, "http-equiv", NULL);
/* http-equiv=refresh takes precedence; name=robots is examined only
   when the tag is not a refresh.  Both values are matched
   case-insensitively.  */
479 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
481 /* Some pages use a META tag to specify that the page be
482 refreshed by a new page after a given number of seconds. The
483 general format for this is:
485 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
487 So we just need to skip past the "NUMBER; URL=" garbage to
    get to the URL.  */
490 struct urlpos *entry;
/* NOTE(review): find_attr returns NULL when the attribute is absent;
   the guard for `refresh' is not visible in this listing -- confirm.  */
495 char *refresh = find_attr (tag, "content", &attrind);
/* Accumulate the leading decimal digits as the refresh timeout.
   NOTE(review): no upper bound is visible, so a very long digit run
   could overflow `timeout' -- confirm its type and initial value.  */
499 for (p = refresh; ISDIGIT (*p); p++)
500 timeout = 10 * timeout + *p - '0';
/* Case-insensitive check for the letters "URL" at P.  */
506 if (!( TOUPPER (*p) == 'U'
507 && TOUPPER (*(p + 1)) == 'R'
508 && TOUPPER (*(p + 2)) == 'L'
/* The remainder of the content value is recorded as a link and tagged
   as a refresh target together with its timeout.  */
515 entry = append_one_url (p, 0, tag, attrind, ctx);
518 entry->link_refresh_p = 1;
519 entry->refresh_timeout = timeout;
522 else if (name && 0 == strcasecmp (name, "robots"))
524 /* Handle stuff like:
525 <meta name="robots" content="index,nofollow"> */
/* NOTE(review): as above, the NULL guard for `content' is not visible
   in this listing -- confirm.  */
526 char *content = find_attr (tag, "content", NULL);
529 if (!strcasecmp (content, "none"))
535 /* Find the next occurrence of ',' or the end of
    the string.  */
537 char *end = strchr (content, ',');
541 end = content + strlen (content);
/* NOTE(review): the comparison length is the token length, so any
   token that is a prefix of "nofollow" (including an empty token)
   compares equal, and surrounding whitespace is not stripped here --
   confirm whether an exact-length match was intended.  */
542 if (!strncasecmp (content, "nofollow", end - content))
550 /* Dispatch the tag handler appropriate for the tag we're mapping
551 over. See known_tags[] for definition of tag handlers. */
554 collect_tags_mapper (struct taginfo *tag, void *arg)
556 struct map_context *ctx = (struct map_context *)arg;
558 /* Find the tag in our table of tags. This must not fail because
559 map_html_tags only returns tags found in interesting_tags. */
560 struct known_tag *t = hash_table_get (interesting_tags, tag->name);
563 t->handler (t->tagid, tag, ctx);
/* NOTE(review): this listing omits source lines of this function (the
   embedded numbering jumps, e.g. 574->578, 581->584, 587->589,
   590->593, 593->596, 611->616).  The return type, the declaration of
   `flags', the test guarding the read_file error message, the
   initialization of ctx.base and ctx.nofollow, the statement under
   `if (!interesting_tags)', the release of `fm' and the return
   statement are not shown -- confirm against the complete file.  */
566 /* Analyze the HTML tags in FILE and construct a list of URLs referenced from
567 it. It merges relative links in FILE with URL. It is aware of
568 <base href=...> and does the right thing.
    If URL is NULL, opt.base_href is used as the base instead.  If
    META_DISALLOW_FOLLOW is non-NULL, the nofollow flag collected while
    mapping the tags is stored there.  */
571 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
573 struct file_memory *fm;
574 struct map_context ctx;
578 fm = read_file (file);
581 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
584 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
/* Set up the context handed to every tag handler.  */
586 ctx.text = fm->content;
587 ctx.head = ctx.tail = NULL;
589 ctx.parent_base = url ? url : opt.base_href;
590 ctx.document_file = file;
/* The tag/attribute tables are global and built on first use;
   presumably init_interesting is called here (line not shown).  */
593 if (!interesting_tags)
596 /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
597 generate <a href=" foo"> instead of <a href="foo"> (Netscape
598 ignores spaces as well.) If you really mean space, use &#32; or
    %20.  */
600 flags = MHT_TRIM_VALUES;
601 if (opt.strict_comments)
602 flags |= MHT_STRICT_COMMENTS;
/* collect_tags_mapper is invoked with &ctx for the tags found.  */
604 map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
605 interesting_tags, interesting_attributes);
607 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
608 if (meta_disallow_follow)
609 *meta_disallow_follow = ctx.nofollow;
/* ctx.base is only set by tag_handle_base, so it may still be NULL.  */
611 FREE_MAYBE (ctx.base);
/* NOTE(review): this listing omits many source lines of this function
   (the embedded numbering jumps, e.g. 624->627, 633->637, 646->653,
   666->670, 671->676, 687->702).  The return type, the
   initialization of `text', `head' and `tail', the handling of a line
   without a trailing newline, the bodies of the whitespace-stripping
   loops, the condition around the uri_merge call, all cleanup, the
   list linking and the return statement are not shown -- confirm
   against the complete file.  */
616 /* This doesn't really have anything to do with HTML, but it's similar
617 to get_urls_html, so we put it here.
    FILE is read into memory and scanned line by line; each non-empty
    line, with surrounding whitespace stripped, is treated as one
    URL.  */
620 get_urls_file (const char *file)
622 struct file_memory *fm;
623 struct urlpos *head, *tail;
624 const char *text, *text_end;
627 fm = read_file (file);
630 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
633 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
/* Walk the buffer one line at a time.  */
637 text_end = fm->content + fm->length;
638 while (text < text_end)
642 struct urlpos *entry;
645 const char *line_beg = text;
/* The buffer is delimited by length, hence memchr rather than
   strchr.  */
646 const char *line_end = memchr (text, '\n', text_end - text);
653 /* Strip whitespace from the beginning and end of line. */
654 while (line_beg < line_end && ISSPACE (*line_beg))
656 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
/* An empty region means the line held only whitespace.  */
659 if (line_beg == line_end)
662 /* The URL is in the [line_beg, line_end) region. */
664 /* We must copy the URL to a zero-terminated string, and we
665 can't use alloca because we're in a loop. *sigh*. */
666 url_text = strdupdelim (line_beg, line_end);
670 /* Merge opt.base_href with URL. */
671 char *merged = uri_merge (opt.base_href, url_text);
676 url = url_parse (url_text, &up_error_code);
679 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
680 file, url_text, url_error (up_error_code));
/* Allocate the list element and start from an all-zero state.  */
686 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
687 memset (entry, 0, sizeof (*entry));
702 cleanup_html_url (void)
704 FREE_MAYBE (interesting_tags);
705 FREE_MAYBE (interesting_attributes);