/* Collect URLs from HTML source.
   Copyright (C) 1998, 2000, 2001, 2002 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
OpenSSL project's "OpenSSL" library (or with modified versions of it
that use the same license as the "OpenSSL" library), and distribute
the linked executables.  You must obey the GNU General Public License
in all respects for all of the code used other than "OpenSSL".  If you
modify this file, you may extend this exception to your version of the
file, but you are not obligated to do so.  If you do not wish to do
so, delete this exception statement from your version.  */
43 #include "html-parse.h"
53 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
54 struct map_context *));
56 #define DECLARE_TAG_HANDLER(fun) \
57 static void fun PARAMS ((int, struct taginfo *, struct map_context *))
59 DECLARE_TAG_HANDLER (tag_find_urls);
60 DECLARE_TAG_HANDLER (tag_handle_base);
61 DECLARE_TAG_HANDLER (tag_handle_form);
62 DECLARE_TAG_HANDLER (tag_handle_link);
63 DECLARE_TAG_HANDLER (tag_handle_meta);
65 /* The list of known tags and functions used for handling them. Most
66 tags are simply harvested for URLs. */
69 tag_handler_t handler;
72 { "a", tag_find_urls },
74 { "applet", tag_find_urls },
76 { "area", tag_find_urls },
78 { "base", tag_handle_base },
80 { "bgsound", tag_find_urls },
82 { "body", tag_find_urls },
84 { "embed", tag_find_urls },
86 { "fig", tag_find_urls },
88 { "form", tag_handle_form },
90 { "frame", tag_find_urls },
92 { "iframe", tag_find_urls },
94 { "img", tag_find_urls },
96 { "input", tag_find_urls },
98 { "layer", tag_find_urls },
100 { "link", tag_handle_link },
102 { "meta", tag_handle_meta },
103 #define TAG_OVERLAY 16
104 { "overlay", tag_find_urls },
105 #define TAG_SCRIPT 17
106 { "script", tag_find_urls },
108 { "table", tag_find_urls },
110 { "td", tag_find_urls },
112 { "th", tag_find_urls }
115 /* tag_url_attributes documents which attributes of which tags contain
116 URLs to harvest. It is used by tag_find_urls. */
118 /* Defines for the FLAGS field; currently only one flag is defined. */
120 /* This tag points to an external document not necessary for rendering this
121 document (i.e. it's not an inlined image, stylesheet, etc.). */
122 #define TUA_EXTERNAL 1
124 /* For tags handled by tag_find_urls: attributes that contain URLs to
128 const char *attr_name;
130 } tag_url_attributes[] = {
131 { TAG_A, "href", TUA_EXTERNAL },
132 { TAG_APPLET, "code", 0 },
133 { TAG_AREA, "href", TUA_EXTERNAL },
134 { TAG_BGSOUND, "src", 0 },
135 { TAG_BODY, "background", 0 },
136 { TAG_EMBED, "href", TUA_EXTERNAL },
137 { TAG_EMBED, "src", 0 },
138 { TAG_FIG, "src", 0 },
139 { TAG_FRAME, "src", 0 },
140 { TAG_IFRAME, "src", 0 },
141 { TAG_IMG, "href", 0 },
142 { TAG_IMG, "lowsrc", 0 },
143 { TAG_IMG, "src", 0 },
144 { TAG_INPUT, "src", 0 },
145 { TAG_LAYER, "src", 0 },
146 { TAG_OVERLAY, "src", 0 },
147 { TAG_SCRIPT, "src", 0 },
148 { TAG_TABLE, "background", 0 },
149 { TAG_TD, "background", 0 },
150 { TAG_TH, "background", 0 }
/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, some places in the code refer
   to the attributes not mentioned here.  We add them manually.  */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link */
  "http-equiv",                 /* used by tag_handle_meta */
  "name",                       /* used by tag_handle_meta */
  "content",                    /* used by tag_handle_meta */
  "action"                      /* used by tag_handle_form */
};

/* NULL-terminated vectors handed to the HTML parser.  Both are built
   by init_interesting and released by cleanup_html_url.  */
static const char **interesting_tags;
static const char **interesting_attributes;
168 init_interesting (void)
170 /* Init the variables interesting_tags and interesting_attributes
171 that are used by the HTML parser to know which tags and
172 attributes we're interested in. We initialize this only once,
173 for performance reasons.
175 Here we also make sure that what we put in interesting_tags
176 matches the user's preferences as specified through --ignore-tags
179 This function is as large as this only because of the glorious
180 expressivity of the C programming language. */
184 int size = ARRAY_SIZE (known_tags);
185 interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));
187 for (i = 0; i < size; i++)
189 const char *name = known_tags[i].name;
191 /* Normally here we could say:
192 interesting_tags[i] = name;
193 But we need to respect the settings of --ignore-tags and
194 --follow-tags, so the code gets a bit hairier. */
198 /* --ignore-tags was specified. Do not match these
199 specific tags. --ignore-tags takes precedence over
200 --follow-tags, so we process --ignore first and fall
201 through if there's no match. */
203 for (j = 0; opt.ignore_tags[j] != NULL; j++)
204 /* Loop through all the tags this user doesn't care about. */
205 if (strcasecmp(opt.ignore_tags[j], name) == EQ)
216 /* --follow-tags was specified. Only match these specific tags, so
217 continue back to top of for if we don't match one of them. */
219 for (j = 0; opt.follow_tags[j] != NULL; j++)
220 /* Loop through all the tags this user cares about. */
221 if (strcasecmp(opt.follow_tags[j], name) == EQ)
227 continue; /* wasn't one of the explicitly desired tags */
230 /* If we get to here, --follow-tags isn't being used or the
231 tag is among the ones that are followed, and --ignore-tags,
232 if specified, didn't include this tag, so it's an
233 "interesting" one. */
234 interesting_tags[ind++] = name;
236 interesting_tags[ind] = NULL;
239 /* The same for attributes, except we loop through tag_url_attributes.
240 Here we also need to make sure that the list of attributes is
241 unique, and to include the attributes from additional_attributes. */
244 const char **att = xmalloc ((ARRAY_SIZE (additional_attributes) + 1)
246 /* First copy the "additional" attributes. */
247 for (i = 0; i < ARRAY_SIZE (additional_attributes); i++)
248 att[i] = additional_attributes[i];
251 for (i = 0; i < ARRAY_SIZE (tag_url_attributes); i++)
254 const char *look_for = tag_url_attributes[i].attr_name;
255 for (j = 0; j < ind - 1; j++)
256 if (!strcmp (att[j], look_for))
263 att = xrealloc (att, (ind + 2) * sizeof (*att));
264 att[ind++] = look_for;
268 interesting_attributes = att;
273 find_tag (const char *tag_name)
277 /* This is linear search; if the number of tags grow, we can switch
280 for (i = 0; i < ARRAY_SIZE (known_tags); i++)
282 int cmp = strcasecmp (known_tags[i].name, tag_name);
283 /* known_tags are sorted alphabetically, so we can
293 /* Find the value of attribute named NAME in the taginfo TAG. If the
294 attribute is not present, return NULL. If ATTRIND is non-NULL, the
295 index of the attribute in TAG will be stored there. */
297 find_attr (struct taginfo *tag, const char *name, int *attrind)
300 for (i = 0; i < tag->nattrs; i++)
301 if (!strcasecmp (tag->attrs[i].name, name))
305 return tag->attrs[i].value;
/* State threaded through the tag handlers while one document is being
   scanned.  */
struct map_context {
  char *text;                   /* HTML text. */
  char *base;                   /* Base URI of the document, possibly
                                   changed through <base href=...>. */
  const char *parent_base;      /* Base of the current document. */
  const char *document_file;    /* File name of this document. */
  int nofollow;                 /* whether NOFOLLOW was specified in a
                                   <meta name=robots> tag. */

  struct urlpos *head, *tail;   /* List of URLs that is being
                                   built. */
};
323 /* Append LINK_URI to the urlpos structure that is being built.
325 LINK_URI will be merged with the current document base. TAG and
326 ATTRIND are the necessary context to store the position and
329 static struct urlpos *
330 append_one_url (const char *link_uri, int inlinep,
331 struct taginfo *tag, int attrind, struct map_context *ctx)
333 int link_has_scheme = url_has_scheme (link_uri);
334 struct urlpos *newel;
335 const char *base = ctx->base ? ctx->base : ctx->parent_base;
340 DEBUGP (("%s: no base, merge will use \"%s\".\n",
341 ctx->document_file, link_uri));
343 if (!link_has_scheme)
345 /* Base URL is unavailable, and the link does not have a
346 location attached to it -- we have to give up. Since
347 this can only happen when using `--force-html -i', print
349 logprintf (LOG_NOTQUIET,
350 _("%s: Cannot resolve incomplete link %s.\n"),
351 ctx->document_file, link_uri);
355 url = url_parse (link_uri, NULL);
358 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
359 ctx->document_file, link_uri));
365 /* Merge BASE with LINK_URI, but also make sure the result is
366 canonicalized, i.e. that "../" have been resolved.
367 (parse_url will do that for us.) */
369 char *complete_uri = uri_merge (base, link_uri);
371 DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
372 ctx->document_file, base, link_uri, complete_uri));
374 url = url_parse (complete_uri, NULL);
377 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
378 ctx->document_file, complete_uri));
379 xfree (complete_uri);
382 xfree (complete_uri);
385 DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
387 newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
388 memset (newel, 0, sizeof (*newel));
392 newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
393 newel->size = tag->attrs[attrind].value_raw_size;
394 newel->link_inline_p = inlinep;
396 /* A URL is relative if the host is not named, and the name does not
398 if (!link_has_scheme && *link_uri != '/')
399 newel->link_relative_p = 1;
400 else if (link_has_scheme)
401 newel->link_complete_p = 1;
405 ctx->tail->next = newel;
409 ctx->tail = ctx->head = newel;
414 /* All the tag_* functions are called from collect_tags_mapper, as
415 specified by KNOWN_TAGS. */
417 /* Default tag handler: collect URLs from attributes specified for
418 this tag by tag_url_attributes. */
421 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
423 int i, attrind, first = -1;
424 int size = ARRAY_SIZE (tag_url_attributes);
426 for (i = 0; i < size; i++)
427 if (tag_url_attributes[i].tagid == tagid)
429 /* We've found the index of tag_url_attributes where the
430 attributes of our tag begin. */
434 assert (first != -1);
436 /* Loop over the "interesting" attributes of this tag. In this
437 example, it will loop over "src" and "lowsrc".
439 <img src="foo.png" lowsrc="bar.png">
441 This has to be done in the outer loop so that the attributes are
442 processed in the same order in which they appear in the page.
443 This is required when converting links. */
445 for (attrind = 0; attrind < tag->nattrs; attrind++)
447 /* Find whether TAG/ATTRIND is a combination that contains a
449 char *link = tag->attrs[attrind].value;
451 /* If you're cringing at the inefficiency of the nested loops,
452 remember that they both iterate over a laughably small
453 quantity of items. The worst-case inner loop is for the IMG
454 tag, which has three attributes. */
455 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
457 if (0 == strcasecmp (tag->attrs[attrind].name,
458 tag_url_attributes[i].attr_name))
460 int flags = tag_url_attributes[i].flags;
461 append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
467 /* Handle the BASE tag, for <base href=...>. */
470 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
472 struct urlpos *base_urlpos;
474 char *newbase = find_attr (tag, "href", &attrind);
478 base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
481 base_urlpos->ignore_when_downloading = 1;
482 base_urlpos->link_base_p = 1;
486 if (ctx->parent_base)
487 ctx->base = uri_merge (ctx->parent_base, newbase);
489 ctx->base = xstrdup (newbase);
492 /* Mark the URL found in <form action=...> for conversion. */
495 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
498 char *action = find_attr (tag, "action", &attrind);
501 struct urlpos *action_urlpos = append_one_url (action, 0, tag,
504 action_urlpos->ignore_when_downloading = 1;
/* Handle the LINK tag.  It requires special handling because how its
   links will be followed in -p mode depends on the REL attribute.  */

static void
tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
{
  int attrind;
  char *href = find_attr (tag, "href", &attrind);

  /* All <link href="..."> link references are external, except those
     known not to be, such as style sheet and shortcut icon:

       <link rel="stylesheet" href="...">
       <link rel="shortcut icon" href="...">
  */
  if (href)
    {
      char *rel = find_attr (tag, "rel", NULL);
      /* A missing REL leaves the link external.  */
      int inlinep = (rel
                     && (0 == strcasecmp (rel, "stylesheet")
                         || 0 == strcasecmp (rel, "shortcut icon")));
      append_one_url (href, inlinep, tag, attrind, ctx);
    }
}
533 /* Handle the META tag. This requires special handling because of the
534 refresh feature and because of robot exclusion. */
537 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
539 char *name = find_attr (tag, "name", NULL);
540 char *http_equiv = find_attr (tag, "http-equiv", NULL);
542 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
544 /* Some pages use a META tag to specify that the page be
545 refreshed by a new page after a given number of seconds. The
546 general format for this is:
548 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
550 So we just need to skip past the "NUMBER; URL=" garbage to
553 struct urlpos *entry;
558 char *refresh = find_attr (tag, "content", &attrind);
562 for (p = refresh; ISDIGIT (*p); p++)
563 timeout = 10 * timeout + *p - '0';
569 if (!( TOUPPER (*p) == 'U'
570 && TOUPPER (*(p + 1)) == 'R'
571 && TOUPPER (*(p + 2)) == 'L'
578 entry = append_one_url (p, 0, tag, attrind, ctx);
581 entry->link_refresh_p = 1;
582 entry->refresh_timeout = timeout;
585 else if (name && 0 == strcasecmp (name, "robots"))
587 /* Handle stuff like:
588 <meta name="robots" content="index,nofollow"> */
589 char *content = find_attr (tag, "content", NULL);
592 if (!strcasecmp (content, "none"))
598 /* Find the next occurrence of ',' or the end of
600 char *end = strchr (content, ',');
604 end = content + strlen (content);
605 if (!strncasecmp (content, "nofollow", end - content))
613 /* Examine name and attributes of TAG and take appropriate action
614 according to the tag. */
617 collect_tags_mapper (struct taginfo *tag, void *arg)
619 struct map_context *ctx = (struct map_context *)arg;
621 tag_handler_t handler;
623 tagid = find_tag (tag->name);
624 assert (tagid != -1);
625 handler = known_tags[tagid].handler;
627 handler (tagid, tag, ctx);
630 /* Analyze HTML tags FILE and construct a list of URLs referenced from
631 it. It merges relative links in FILE with URL. It is aware of
632 <base href=...> and does the right thing. */
634 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
636 struct file_memory *fm;
637 struct map_context ctx;
640 fm = read_file (file);
643 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
646 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
648 ctx.text = fm->content;
649 ctx.head = ctx.tail = NULL;
651 ctx.parent_base = url ? url : opt.base_href;
652 ctx.document_file = file;
655 if (!interesting_tags)
658 map_html_tags (fm->content, fm->length, interesting_tags,
659 interesting_attributes, collect_tags_mapper, &ctx);
661 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
662 if (meta_disallow_follow)
663 *meta_disallow_follow = ctx.nofollow;
665 FREE_MAYBE (ctx.base);
671 cleanup_html_url (void)
673 FREE_MAYBE (interesting_tags);
674 FREE_MAYBE (interesting_attributes);