1 /* Collect URLs from HTML source.
2 Copyright (C) 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
43 #include "html-parse.h"
/* Signature shared by all tag handlers: the tag id (index into
   known_tags), the parsed tag, and the per-document parsing
   context.  */
54 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
55 struct map_context *));
/* Forward-declare a static tag handler with the standard signature,
   so the known_tags table below can reference it.  */
57 #define DECLARE_TAG_HANDLER(fun) \
58 static void fun PARAMS ((int, struct taginfo *, struct map_context *))
60 DECLARE_TAG_HANDLER (tag_find_urls);
61 DECLARE_TAG_HANDLER (tag_handle_base);
62 DECLARE_TAG_HANDLER (tag_handle_form);
63 DECLARE_TAG_HANDLER (tag_handle_link);
64 DECLARE_TAG_HANDLER (tag_handle_meta);
66 /* The list of known tags and functions used for handling them. Most
67 tags are simply harvested for URLs. */
/* Each entry maps a tag name to its handler; the entry's index is the
   tag id handed to the handler (cf. TAG_OVERLAY/TAG_SCRIPT below).
   The table is kept sorted by name -- find_tag's comment notes it
   relies on alphabetical order.  NOTE(review): the struct declaration
   opening this table is elided in this excerpt.  */
70 tag_handler_t handler;
73 { "a", tag_find_urls },
75 { "applet", tag_find_urls },
77 { "area", tag_find_urls },
79 { "base", tag_handle_base },
81 { "bgsound", tag_find_urls },
83 { "body", tag_find_urls },
85 { "embed", tag_find_urls },
87 { "fig", tag_find_urls },
89 { "form", tag_handle_form },
91 { "frame", tag_find_urls },
93 { "iframe", tag_find_urls },
95 { "img", tag_find_urls },
97 { "input", tag_find_urls },
99 { "layer", tag_find_urls },
101 { "link", tag_handle_link },
103 { "meta", tag_handle_meta },
/* "overlay" is the 17th entry, hence tag id 16.  */
104 #define TAG_OVERLAY 16
105 { "overlay", tag_find_urls },
106 #define TAG_SCRIPT 17
107 { "script", tag_find_urls },
109 { "table", tag_find_urls },
111 { "td", tag_find_urls },
113 { "th", tag_find_urls }
116 /* tag_url_attributes documents which attributes of which tags contain
117 URLs to harvest. It is used by tag_find_urls. */
119 /* Defines for the FLAGS field; currently only one flag is defined. */
121 /* This tag points to an external document not necessary for rendering this
122 document (i.e. it's not an inlined image, stylesheet, etc.). */
123 #define TUA_EXTERNAL 1
125 /* For tags handled by tag_find_urls: attributes that contain URLs to
/* Entries sharing a tagid are contiguous -- tag_find_urls locates the
   first matching entry and scans forward while tagid matches.
   NOTE(review): the TAG_A..TAG_TH id defines referenced here are
   elided in this excerpt; presumably they index known_tags above --
   verify against the full file.  */
129 const char *attr_name;
131 } tag_url_attributes[] = {
132 { TAG_A, "href", TUA_EXTERNAL },
133 { TAG_APPLET, "code", 0 },
134 { TAG_AREA, "href", TUA_EXTERNAL },
135 { TAG_BGSOUND, "src", 0 },
136 { TAG_BODY, "background", 0 },
137 { TAG_EMBED, "href", TUA_EXTERNAL },
138 { TAG_EMBED, "src", 0 },
139 { TAG_FIG, "src", 0 },
140 { TAG_FRAME, "src", 0 },
141 { TAG_IFRAME, "src", 0 },
142 { TAG_IMG, "href", 0 },
143 { TAG_IMG, "lowsrc", 0 },
144 { TAG_IMG, "src", 0 },
145 { TAG_INPUT, "src", 0 },
146 { TAG_LAYER, "src", 0 },
147 { TAG_OVERLAY, "src", 0 },
148 { TAG_SCRIPT, "src", 0 },
149 { TAG_TABLE, "background", 0 },
150 { TAG_TD, "background", 0 },
151 { TAG_TH, "background", 0 }
154 /* The lists of interesting tags and attributes are built dynamically,
155 from the information above. However, some places in the code refer
156 to the attributes not mentioned here. We add them manually. */
157 static const char *additional_attributes[] = {
158 "rel", /* used by tag_handle_link */
159 "http-equiv", /* used by tag_handle_meta */
160 "name", /* used by tag_handle_meta */
161 "content", /* used by tag_handle_meta */
162 "action" /* used by tag_handle_form */
/* NULL-terminated lists handed to map_html_tags by get_urls_html;
   built once by init_interesting, freed by cleanup_html_url.  */
165 static const char **interesting_tags;
166 static const char **interesting_attributes;
/* Build interesting_tags and interesting_attributes from known_tags,
   tag_url_attributes and additional_attributes, honoring the user's
   --follow-tags/--ignore-tags settings.  NOTE(review): the return
   type line and several statement lines are elided in this
   excerpt.  */
169 init_interesting (void)
171 /* Init the variables interesting_tags and interesting_attributes
172 that are used by the HTML parser to know which tags and
173 attributes we're interested in. We initialize this only once,
174 for performance reasons.
176 Here we also make sure that what we put in interesting_tags
177 matches the user's preferences as specified through --ignore-tags
180 This function is as large as this only because of the glorious
181 expressivity of the C programming language. */
185 int size = countof (known_tags);
/* +1 leaves room for the terminating NULL stored after the loop.  */
186 interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));
188 for (i = 0; i < size; i++)
190 const char *name = known_tags[i].name;
192 /* Normally here we could say:
193 interesting_tags[i] = name;
194 But we need to respect the settings of --ignore-tags and
195 --follow-tags, so the code gets a bit hairier. */
199 /* --ignore-tags was specified. Do not match these
200 specific tags. --ignore-tags takes precedence over
201 --follow-tags, so we process --ignore first and fall
202 through if there's no match. */
204 for (j = 0; opt.ignore_tags[j] != NULL; j++)
205 /* Loop through all the tags this user doesn't care about. */
206 if (strcasecmp(opt.ignore_tags[j], name) == EQ)
217 /* --follow-tags was specified. Only match these specific tags, so
218 continue back to top of for if we don't match one of them. */
220 for (j = 0; opt.follow_tags[j] != NULL; j++)
221 /* Loop through all the tags this user cares about. */
222 if (strcasecmp(opt.follow_tags[j], name) == EQ)
228 continue; /* wasn't one of the explicitly desired tags */
231 /* If we get to here, --follow-tags isn't being used or the
232 tag is among the ones that are followed, and --ignore-tags,
233 if specified, didn't include this tag, so it's an
234 "interesting" one. */
235 interesting_tags[ind++] = name;
/* NULL-terminate the tag list for map_html_tags.  */
237 interesting_tags[ind] = NULL;
240 /* The same for attributes, except we loop through tag_url_attributes.
241 Here we also need to make sure that the list of attributes is
242 unique, and to include the attributes from additional_attributes. */
245 const char **att = xmalloc ((countof (additional_attributes) + 1)
247 /* First copy the "additional" attributes. */
248 for (i = 0; i < countof (additional_attributes); i++)
249 att[i] = additional_attributes[i];
/* Then append each attr_name from tag_url_attributes, skipping
   names already present in ATT (dedup via the inner strcmp loop).  */
252 for (i = 0; i < countof (tag_url_attributes); i++)
255 const char *look_for = tag_url_attributes[i].attr_name;
256 for (j = 0; j < ind - 1; j++)
257 if (!strcmp (att[j], look_for))
/* Not a duplicate: grow ATT (+2 = new entry plus NULL terminator)
   and record the attribute name.  */
264 att = xrealloc (att, (ind + 2) * sizeof (*att));
265 att[ind++] = look_for;
269 interesting_attributes = att;
/* Look up TAG_NAME in known_tags and return its index (the tag id
   passed to handlers).  NOTE(review): most of this function is elided
   in this excerpt; it presumably returns -1 for an unknown tag, since
   collect_tags_mapper asserts tagid != -1 -- verify against the full
   file.  */
274 find_tag (const char *tag_name)
278 /* This is linear search; if the number of tags grow, we can switch
281 for (i = 0; i < countof (known_tags); i++)
283 int cmp = strcasecmp (known_tags[i].name, tag_name);
284 /* known_tags are sorted alphabetically, so we can
294 /* Find the value of attribute named NAME in the taginfo TAG. If the
295 attribute is not present, return NULL. If ATTRIND is non-NULL, the
296 index of the attribute in TAG will be stored there. */
298 find_attr (struct taginfo *tag, const char *name, int *attrind)
/* Linear scan; attribute names compare case-insensitively, as HTML
   attribute names are case-insensitive.  */
301 for (i = 0; i < tag->nattrs; i++)
302 if (!strcasecmp (tag->attrs[i].name, name))
306 return tag->attrs[i].value;
/* Per-document state threaded through every tag handler.
   NOTE(review): the "struct map_context {" line itself is elided in
   this excerpt.  */
312 char *text; /* HTML text. */
313 char *base; /* Base URI of the document, possibly
314 changed through <base href=...>. */
315 const char *parent_base; /* Base of the current document. */
316 const char *document_file; /* File name of this document. */
317 int nofollow; /* whether NOFOLLOW was specified in a
318 <meta name=robots> tag. */
320 struct urlpos *head, *tail; /* List of URLs that is being
324 /* Append LINK_URI to the urlpos structure that is being built.
326 LINK_URI will be merged with the current document base. TAG and
327 ATTRIND are the necessary context to store the position and
/* INLINEP marks the link as an inlined resource (image, frame, ...)
   as opposed to an external document; see tag_find_urls and
   TUA_EXTERNAL.  Returns the new list entry, used by callers such as
   tag_handle_base to set extra flags.  NOTE(review): several
   statement lines (branch openings, returns) are elided in this
   excerpt.  */
330 static struct urlpos *
331 append_one_url (const char *link_uri, int inlinep,
332 struct taginfo *tag, int attrind, struct map_context *ctx)
334 int link_has_scheme = url_has_scheme (link_uri);
335 struct urlpos *newel;
/* A <base href=...> seen earlier (ctx->base) overrides the document's
   own base URL (ctx->parent_base).  */
336 const char *base = ctx->base ? ctx->base : ctx->parent_base;
341 DEBUGP (("%s: no base, merge will use \"%s\".\n",
342 ctx->document_file, link_uri));
344 if (!link_has_scheme)
346 /* Base URL is unavailable, and the link does not have a
347 location attached to it -- we have to give up. Since
348 this can only happen when using `--force-html -i', print
350 logprintf (LOG_NOTQUIET,
351 _("%s: Cannot resolve incomplete link %s.\n"),
352 ctx->document_file, link_uri);
/* No base, but the link is absolute: parse it as-is.  */
356 url = url_parse (link_uri, NULL);
359 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
360 ctx->document_file, link_uri));
366 /* Merge BASE with LINK_URI, but also make sure the result is
367 canonicalized, i.e. that "../" have been resolved.
368 (parse_url will do that for us.) */
370 char *complete_uri = uri_merge (base, link_uri);
372 DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
373 ctx->document_file, base, link_uri, complete_uri));
375 url = url_parse (complete_uri, NULL);
378 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
379 ctx->document_file, complete_uri));
/* complete_uri was only needed for parsing; free it on both the
   failure and success paths.  */
380 xfree (complete_uri);
383 xfree (complete_uri);
386 DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
388 newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
389 memset (newel, 0, sizeof (*newel));
/* Record where the raw attribute value sits in ctx->text, so link
   conversion can later rewrite it in place.  */
393 newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
394 newel->size = tag->attrs[attrind].value_raw_size;
395 newel->link_inline_p = inlinep;
397 /* A URL is relative if the host is not named, and the name does not
399 if (!link_has_scheme && *link_uri != '/')
400 newel->link_relative_p = 1;
401 else if (link_has_scheme)
402 newel->link_complete_p = 1;
/* Append NEWEL to the ctx->head/ctx->tail list (first entry sets
   both pointers).  */
406 ctx->tail->next = newel;
410 ctx->tail = ctx->head = newel;
415 /* All the tag_* functions are called from collect_tags_mapper, as
416 specified by KNOWN_TAGS. */
418 /* Default tag handler: collect URLs from attributes specified for
419 this tag by tag_url_attributes. */
422 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
424 int i, attrind, first = -1;
425 int size = countof (tag_url_attributes);
/* Locate the first tag_url_attributes entry for TAGID; entries for
   one tag are contiguous in that table.  */
427 for (i = 0; i < size; i++)
428 if (tag_url_attributes[i].tagid == tagid)
430 /* We've found the index of tag_url_attributes where the
431 attributes of our tag begin. */
/* Every tag dispatched here must appear in tag_url_attributes.  */
435 assert (first != -1);
437 /* Loop over the "interesting" attributes of this tag. In this
438 example, it will loop over "src" and "lowsrc".
440 <img src="foo.png" lowsrc="bar.png">
442 This has to be done in the outer loop so that the attributes are
443 processed in the same order in which they appear in the page.
444 This is required when converting links. */
446 for (attrind = 0; attrind < tag->nattrs; attrind++)
448 /* Find whether TAG/ATTRIND is a combination that contains a
450 char *link = tag->attrs[attrind].value;
452 /* If you're cringing at the inefficiency of the nested loops,
453 remember that they both iterate over a laughably small
454 quantity of items. The worst-case inner loop is for the IMG
455 tag, which has three attributes. */
456 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
458 if (0 == strcasecmp (tag->attrs[attrind].name,
459 tag_url_attributes[i].attr_name))
461 int flags = tag_url_attributes[i].flags;
/* TUA_EXTERNAL links are not inlined resources, hence the
   negation for the INLINEP argument.  */
462 append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
468 /* Handle the BASE tag, for <base href=...>. */
471 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
473 struct urlpos *base_urlpos;
475 char *newbase = find_attr (tag, "href", &attrind);
/* Record the base URL in the urlpos list so link conversion can
   rewrite it, but never download it.  */
479 base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
482 base_urlpos->ignore_when_downloading = 1;
483 base_urlpos->link_base_p = 1;
/* A possibly-relative <base href> is resolved against the document's
   own base; the result overrides ctx->base for subsequent links.  */
487 if (ctx->parent_base)
488 ctx->base = uri_merge (ctx->parent_base, newbase);
490 ctx->base = xstrdup (newbase);
493 /* Mark the URL found in <form action=...> for conversion. */
496 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
499 char *action = find_attr (tag, "action", &attrind);
/* The action URL is recorded for link conversion only, never
   downloaded.  */
502 struct urlpos *action_urlpos = append_one_url (action, 0, tag,
505 action_urlpos->ignore_when_downloading = 1;
509 /* Handle the LINK tag. It requires special handling because how its
510 links will be followed in -p mode depends on the REL attribute. */
513 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
516 char *href = find_attr (tag, "href", &attrind);
518 /* All <link href="..."> link references are external, except those
519 known not to be, such as style sheet and shortcut icon:
521 <link rel="stylesheet" href="...">
522 <link rel="shortcut icon" href="...">
/* REL decides whether the link counts as an inlined resource
   (stylesheet / shortcut icon) or an external document.  */
526 char *rel = find_attr (tag, "rel", NULL);
528 && (0 == strcasecmp (rel, "stylesheet")
529 || 0 == strcasecmp (rel, "shortcut icon")));
530 append_one_url (href, inlinep, tag, attrind, ctx);
534 /* Handle the META tag. This requires special handling because of the
535 refresh feature and because of robot exclusion. */
538 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
540 char *name = find_attr (tag, "name", NULL);
541 char *http_equiv = find_attr (tag, "http-equiv", NULL);
543 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
545 /* Some pages use a META tag to specify that the page be
546 refreshed by a new page after a given number of seconds. The
547 general format for this is:
549 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
551 So we just need to skip past the "NUMBER; URL=" garbage to
554 struct urlpos *entry;
559 char *refresh = find_attr (tag, "content", &attrind);
/* Parse the leading decimal NUMBER as the refresh timeout.  */
563 for (p = refresh; ISDIGIT (*p); p++)
564 timeout = 10 * timeout + *p - '0';
/* Require a literal (case-insensitive) "URL" before the target;
   bail out of the handler otherwise.  */
570 if (!( TOUPPER (*p) == 'U'
571 && TOUPPER (*(p + 1)) == 'R'
572 && TOUPPER (*(p + 2)) == 'L'
/* P now points at the refresh target URL.  */
579 entry = append_one_url (p, 0, tag, attrind, ctx);
582 entry->link_refresh_p = 1;
583 entry->refresh_timeout = timeout;
586 else if (name && 0 == strcasecmp (name, "robots"))
588 /* Handle stuff like:
589 <meta name="robots" content="index,nofollow"> */
590 char *content = find_attr (tag, "content", NULL);
593 if (!strcasecmp (content, "none"))
599 /* Find the next occurrence of ',' or the end of
601 char *end = strchr (content, ',');
605 end = content + strlen (content);
/* Token-wise comparison: match "nofollow" up to the comma or end
   of string.  */
606 if (!strncasecmp (content, "nofollow", end - content))
614 /* Examine name and attributes of TAG and take appropriate action
615 according to the tag. */
618 collect_tags_mapper (struct taginfo *tag, void *arg)
620 struct map_context *ctx = (struct map_context *)arg;
622 tag_handler_t handler;
624 tagid = find_tag (tag->name);
/* map_html_tags only reports tags from interesting_tags, which is
   built from known_tags, so the lookup must succeed.  */
625 assert (tagid != -1);
626 handler = known_tags[tagid].handler;
628 handler (tagid, tag, ctx);
631 /* Analyze HTML tags FILE and construct a list of URLs referenced from
632 it. It merges relative links in FILE with URL. It is aware of
633 <base href=...> and does the right thing. */
/* If META_DISALLOW_FOLLOW is non-NULL, it receives the nofollow flag
   set by a <meta name=robots> tag.  NOTE(review): the return
   statement(s) and some initialization lines are elided in this
   excerpt.  */
636 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
638 struct file_memory *fm;
639 struct map_context ctx;
642 fm = read_file (file);
645 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
648 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
650 ctx.text = fm->content;
651 ctx.head = ctx.tail = NULL;
/* URL takes precedence; --base-href is the fallback document base.  */
653 ctx.parent_base = url ? url : opt.base_href;
654 ctx.document_file = file;
/* Lazily build the tag/attribute filter lists on first use.  */
657 if (!interesting_tags)
660 map_html_tags (fm->content, fm->length, interesting_tags,
661 interesting_attributes, collect_tags_mapper, &ctx);
663 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
664 if (meta_disallow_follow)
665 *meta_disallow_follow = ctx.nofollow;
667 FREE_MAYBE (ctx.base);
672 /* This doesn't really have anything to do with HTML, but it's similar
673 to get_urls_html, so we put it here. */
/* Read FILE as a plain list of URLs, one per line, and build a urlpos
   list from it.  Blank lines are skipped; surrounding whitespace is
   trimmed.  NOTE(review): several lines (return statements, list
   linking) are elided in this excerpt.  */
676 get_urls_file (const char *file)
678 struct file_memory *fm;
679 struct urlpos *head, *tail;
680 const char *text, *text_end;
683 fm = read_file (file);
686 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
689 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
693 text_end = fm->content + fm->length;
/* Process the buffer line by line.  */
694 while (text < text_end)
698 struct urlpos *entry;
701 const char *line_beg = text;
702 const char *line_end = memchr (text, '\n', text_end - text);
709 /* Strip whitespace from the beginning and end of line. */
710 while (line_beg < line_end && ISSPACE (*line_beg))
712 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
/* Skip lines that are empty after trimming.  */
715 if (line_beg == line_end)
718 /* The URL is in the [line_beg, line_end) region. */
720 /* We must copy the URL to a zero-terminated string, and we
721 can't use alloca because we're in a loop. *sigh*. */
722 url_text = strdupdelim (line_beg, line_end);
726 /* Merge opt.base_href with URL. */
727 char *merged = uri_merge (opt.base_href, url_text);
732 url = url_parse (url_text, &up_error_code);
/* Report unparsable URLs and continue with the next line.  */
735 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
736 file, url_text, url_error (up_error_code));
742 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
743 memset (entry, 0, sizeof (*entry));
/* Release the lazily-built tag/attribute filter lists (see
   init_interesting).  NOTE(review): the function's closing lines are
   not visible in this excerpt.  */
758 cleanup_html_url (void)
760 FREE_MAYBE (interesting_tags);
761 FREE_MAYBE (interesting_attributes);