1 /* Collect URLs from HTML source.
2 Copyright (C) 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
43 #include "html-parse.h"
/* Signature shared by every per-tag handler: (index of the tag in
   known_tags, parsed tag info, parser context).  PARAMS is Wget's
   K&R/ANSI prototype-compatibility macro.  */
54 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
55 struct map_context *));
/* Shorthand for forward-declaring a handler with the signature above.  */
57 #define DECLARE_TAG_HANDLER(fun) \
58 static void fun PARAMS ((int, struct taginfo *, struct map_context *))
/* Forward declarations of the handlers referenced by known_tags below.  */
60 DECLARE_TAG_HANDLER (tag_find_urls);
61 DECLARE_TAG_HANDLER (tag_handle_base);
62 DECLARE_TAG_HANDLER (tag_handle_form);
63 DECLARE_TAG_HANDLER (tag_handle_link);
64 DECLARE_TAG_HANDLER (tag_handle_meta);
66 /* The list of known tags and functions used for handling them. Most
67 tags are simply harvested for URLs. */
/* NOTE(review): this numbered listing elides lines here -- the struct
   declaration that opens this table and most of the #define TAG_* index
   macros (only TAG_OVERLAY and TAG_SCRIPT are visible below) are missing.
   The table is consulted by find_tag() via binary search (see its comment),
   so entries must remain sorted by tag name, and the TAG_* constants must
   match each entry's position.  */
70 tag_handler_t handler;
73 { "a", tag_find_urls },
75 { "applet", tag_find_urls },
77 { "area", tag_find_urls },
79 { "base", tag_handle_base },
81 { "bgsound", tag_find_urls },
83 { "body", tag_find_urls },
85 { "embed", tag_find_urls },
87 { "fig", tag_find_urls },
89 { "form", tag_handle_form },
91 { "frame", tag_find_urls },
93 { "iframe", tag_find_urls },
95 { "img", tag_find_urls },
97 { "input", tag_find_urls },
99 { "layer", tag_find_urls },
101 { "link", tag_handle_link },
103 { "meta", tag_handle_meta },
104 #define TAG_OVERLAY 16
105 { "overlay", tag_find_urls },
106 #define TAG_SCRIPT 17
107 { "script", tag_find_urls },
109 { "table", tag_find_urls },
111 { "td", tag_find_urls },
113 { "th", tag_find_urls }
116 /* tag_url_attributes documents which attributes of which tags contain
117 URLs to harvest. It is used by tag_find_urls. */
119 /* Defines for the FLAGS field; currently only one flag is defined. */
121 /* This tag points to an external document not necessary for rendering this
122 document (i.e. it's not an inlined image, stylesheet, etc.). */
123 #define TUA_EXTERNAL 1
125 /* For tags handled by tag_find_urls: attributes that contain URLs to
129 const char *attr_name;
/* NOTE(review): entries must stay grouped by tagid -- tag_find_urls finds
   the first entry for a tag and scans forward until the tagid changes.  */
131 } tag_url_attributes[] = {
132 { TAG_A, "href", TUA_EXTERNAL },
133 { TAG_APPLET, "code", 0 },
134 { TAG_AREA, "href", TUA_EXTERNAL },
135 { TAG_BGSOUND, "src", 0 },
136 { TAG_BODY, "background", 0 },
137 { TAG_EMBED, "href", TUA_EXTERNAL },
138 { TAG_EMBED, "src", 0 },
139 { TAG_FIG, "src", 0 },
140 { TAG_FRAME, "src", 0 },
141 { TAG_IFRAME, "src", 0 },
142 { TAG_IMG, "href", 0 },
143 { TAG_IMG, "lowsrc", 0 },
144 { TAG_IMG, "src", 0 },
145 { TAG_INPUT, "src", 0 },
146 { TAG_LAYER, "src", 0 },
147 { TAG_OVERLAY, "src", 0 },
148 { TAG_SCRIPT, "src", 0 },
149 { TAG_TABLE, "background", 0 },
150 { TAG_TD, "background", 0 },
151 { TAG_TH, "background", 0 }
154 /* The lists of interesting tags and attributes are built dynamically,
155 from the information above. However, some places in the code refer
156 to the attributes not mentioned here. We add them manually. */
157 static const char *additional_attributes[] = {
158 "rel", /* used by tag_handle_link */
159 "http-equiv", /* used by tag_handle_meta */
160 "name", /* used by tag_handle_meta */
161 "content", /* used by tag_handle_meta */
162 "action" /* used by tag_handle_form */
/* Built once by init_interesting() from the tables above and handed to
   map_html_tags() in get_urls_html(); released by cleanup_html_url().  */
165 static const char **interesting_tags;
166 static const char **interesting_attributes;
/* Populate interesting_tags and interesting_attributes from known_tags,
   tag_url_attributes and additional_attributes, honoring the user's
   --follow-tags / --ignore-tags settings.  NOTE(review): the numbered
   listing elides lines in this function (return type, braces, and the
   declarations of i, j and ind among them) -- do not infer control flow
   across the gaps.  */
169 init_interesting (void)
171 /* Init the variables interesting_tags and interesting_attributes
172 that are used by the HTML parser to know which tags and
173 attributes we're interested in. We initialize this only once,
174 for performance reasons.
176 Here we also make sure that what we put in interesting_tags
177 matches the user's preferences as specified through --ignore-tags
180 This function is as large as this only because of the glorious
181 expressivity of the C programming language. */
185 int size = countof (known_tags);
/* One slot per known tag plus a NULL terminator (set below).  */
186 interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));
188 for (i = 0; i < size; i++)
190 const char *name = known_tags[i].name;
192 /* Normally here we could say:
193 interesting_tags[i] = name;
194 But we need to respect the settings of --ignore-tags and
195 --follow-tags, so the code gets a bit hairier. */
199 /* --ignore-tags was specified. Do not match these
200 specific tags. --ignore-tags takes precedence over
201 --follow-tags, so we process --ignore first and fall
202 through if there's no match. */
204 for (j = 0; opt.ignore_tags[j] != NULL; j++)
205 /* Loop through all the tags this user doesn't care about. */
206 if (strcasecmp(opt.ignore_tags[j], name) == EQ)
217 /* --follow-tags was specified. Only match these specific tags, so
218 continue back to top of for if we don't match one of them. */
220 for (j = 0; opt.follow_tags[j] != NULL; j++)
221 /* Loop through all the tags this user cares about. */
222 if (strcasecmp(opt.follow_tags[j], name) == EQ)
228 continue; /* wasn't one of the explicitly desired tags */
231 /* If we get to here, --follow-tags isn't being used or the
232 tag is among the ones that are followed, and --ignore-tags,
233 if specified, didn't include this tag, so it's an
234 "interesting" one. */
235 interesting_tags[ind++] = name;
/* NULL-terminate the list for map_html_tags.  */
237 interesting_tags[ind] = NULL;
240 /* The same for attributes, except we loop through tag_url_attributes.
241 Here we also need to make sure that the list of attributes is
242 unique, and to include the attributes from additional_attributes. */
/* Start with room for the hand-listed attributes; grown with xrealloc
   below as unique tag_url_attributes names are appended.  */
245 const char **att = xmalloc ((countof (additional_attributes) + 1)
247 /* First copy the "additional" attributes. */
248 for (i = 0; i < countof (additional_attributes); i++)
249 att[i] = additional_attributes[i];
252 for (i = 0; i < countof (tag_url_attributes); i++)
255 const char *look_for = tag_url_attributes[i].attr_name;
/* Skip LOOK_FOR if it is already in the list -- the attribute list
   must contain no duplicates.  */
256 for (j = 0; j < ind - 1; j++)
257 if (!strcmp (att[j], look_for))
264 att = xrealloc (att, (ind + 2) * sizeof (*att));
265 att[ind++] = look_for;
269 interesting_attributes = att;
273 /* Find tag with name TAG_NAME in KNOWN_TAGS and return its index. */
/* NOTE(review): the listing elides this function's return statements;
   collect_tags_mapper asserts the result is != -1, so -1 presumably
   signals "not found" -- confirm against the full source.  */
276 find_tag (const char *tag_name)
278 /* Originally implemented as linear search. In Wget 1.9 known_tags
279 contains 21 elements, for which binary search requires max. 5
280 comparisons, whereas linear search performs 10 on average. */
/* Binary search over the name-sorted known_tags table; tag names are
   compared case-insensitively.  */
282 int lo = 0, hi = countof (known_tags) - 1;
286 int mid = (lo + hi) >> 1;
287 int cmp = strcasecmp (tag_name, known_tags[mid].name);
299 /* Find the value of attribute named NAME in the taginfo TAG. If the
300 attribute is not present, return NULL. If ATTRIND is non-NULL, the
301 index of the attribute in TAG will be stored there. */
304 find_attr (struct taginfo *tag, const char *name, int *attrind)
/* Linear scan over the tag's attributes; names are matched
   case-insensitively.  */
307 for (i = 0; i < tag->nattrs; i++)
308 if (!strcasecmp (tag->attrs[i].name, name))
312 return tag->attrs[i].value;
/* Parser state threaded through every tag handler (passed as the void *
   argument of map_html_tags; see collect_tags_mapper).  NOTE(review):
   the struct's opening declaration line is elided from this listing.  */
318 char *text; /* HTML text. */
319 char *base; /* Base URI of the document, possibly
320 changed through <base href=...>. */
321 const char *parent_base; /* Base of the current document. */
322 const char *document_file; /* File name of this document. */
323 int nofollow; /* whether NOFOLLOW was specified in a
324 <meta name=robots> tag. */
326 struct urlpos *head, *tail; /* List of URLs that is being
330 /* Append LINK_URI to the urlpos structure that is being built.
332 LINK_URI will be merged with the current document base. TAG and
333 ATTRIND are the necessary context to store the position and
/* Returns the new urlpos node so callers (e.g. tag_handle_base) can set
   extra flags on it; NOTE(review): the elided lines appear to return NULL
   on failure -- callers should check, confirm against full source.
   INLINEP marks the link as inlined content (image, stylesheet) rather
   than an external document.  */
336 static struct urlpos *
337 append_one_url (const char *link_uri, int inlinep,
338 struct taginfo *tag, int attrind, struct map_context *ctx)
340 int link_has_scheme = url_has_scheme (link_uri);
341 struct urlpos *newel;
/* A <base href=...> seen earlier (ctx->base) overrides the document's
   own base URI.  */
342 const char *base = ctx->base ? ctx->base : ctx->parent_base;
347 DEBUGP (("%s: no base, merge will use \"%s\".\n",
348 ctx->document_file, link_uri));
350 if (!link_has_scheme)
352 /* Base URL is unavailable, and the link does not have a
353 location attached to it -- we have to give up. Since
354 this can only happen when using `--force-html -i', print
356 logprintf (LOG_NOTQUIET,
357 _("%s: Cannot resolve incomplete link %s.\n"),
358 ctx->document_file, link_uri);
/* No base, but the link is absolute: parse it as-is.  */
362 url = url_parse (link_uri, NULL);
365 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
366 ctx->document_file, link_uri));
372 /* Merge BASE with LINK_URI, but also make sure the result is
373 canonicalized, i.e. that "../" have been resolved.
374 (parse_url will do that for us.) */
376 char *complete_uri = uri_merge (base, link_uri);
378 DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
379 ctx->document_file, base, link_uri, complete_uri));
381 url = url_parse (complete_uri, NULL);
384 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
385 ctx->document_file, complete_uri));
/* complete_uri is a temporary; url_parse keeps its own copy, so free
   it on both the failure and success paths.  */
386 xfree (complete_uri);
389 xfree (complete_uri);
392 DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
/* All checks passed -- allocate and zero a new list node.  */
394 newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
395 memset (newel, 0, sizeof (*newel));
/* Record where the raw attribute value sits inside ctx->text; used when
   rewriting links in the downloaded document.  */
399 newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
400 newel->size = tag->attrs[attrind].value_raw_size;
401 newel->link_inline_p = inlinep;
403 /* A URL is relative if the host is not named, and the name does not
405 if (!link_has_scheme && *link_uri != '/')
406 newel->link_relative_p = 1;
407 else if (link_has_scheme)
408 newel->link_complete_p = 1;
/* Append to the list kept in CTX (head == tail == NULL initially; see
   get_urls_html).  */
412 ctx->tail->next = newel;
416 ctx->tail = ctx->head = newel;
421 /* All the tag_* functions are called from collect_tags_mapper, as
422 specified by KNOWN_TAGS. */
424 /* Default tag handler: collect URLs from attributes specified for
425 this tag by tag_url_attributes. */
428 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
430 int i, attrind, first = -1;
431 int size = countof (tag_url_attributes);
/* Locate the first tag_url_attributes entry belonging to TAGID; the
   table is grouped by tagid, so the tag's entries are contiguous.  */
433 for (i = 0; i < size; i++)
434 if (tag_url_attributes[i].tagid == tagid)
436 /* We've found the index of tag_url_attributes where the
437 attributes of our tag begin. */
/* Every tag routed here by known_tags must have at least one entry.  */
441 assert (first != -1);
443 /* Loop over the "interesting" attributes of this tag. In this
444 example, it will loop over "src" and "lowsrc".
446 <img src="foo.png" lowsrc="bar.png">
448 This has to be done in the outer loop so that the attributes are
449 processed in the same order in which they appear in the page.
450 This is required when converting links. */
452 for (attrind = 0; attrind < tag->nattrs; attrind++)
454 /* Find whether TAG/ATTRIND is a combination that contains a
456 char *link = tag->attrs[attrind].value;
458 /* If you're cringing at the inefficiency of the nested loops,
459 remember that they both iterate over a laughably small
460 quantity of items. The worst-case inner loop is for the IMG
461 tag, which has three attributes. */
462 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
464 if (0 == strcasecmp (tag->attrs[attrind].name,
465 tag_url_attributes[i].attr_name))
467 int flags = tag_url_attributes[i].flags;
/* A TUA_EXTERNAL attribute references a separate document, so the
   link is NOT inlined content -- hence the negation.  */
468 append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
474 /* Handle the BASE tag, for <base href=...>. */
477 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
479 struct urlpos *base_urlpos;
481 char *newbase = find_attr (tag, "href", &attrind);
/* Record the base URL itself in the list, but flag it so it is never
   downloaded -- it only influences how later links are merged.  */
485 base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
488 base_urlpos->ignore_when_downloading = 1;
489 base_urlpos->link_base_p = 1;
/* Resolve a relative <base href> against the document's own base; an
   absolute one is taken verbatim.  Subsequent links merge against
   ctx->base (see append_one_url).  */
493 if (ctx->parent_base)
494 ctx->base = uri_merge (ctx->parent_base, newbase)
496 ctx->base = xstrdup (newbase);
499 /* Mark the URL found in <form action=...> for conversion. */
502 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
505 char *action = find_attr (tag, "action", &attrind);
/* The action URL is recorded for link conversion only -- never
   downloaded.  */
508 struct urlpos *action_urlpos = append_one_url (action, 0, tag,
511 action_urlpos->ignore_when_downloading = 1;
515 /* Handle the LINK tag. It requires special handling because how its
516 links will be followed in -p mode depends on the REL attribute. */
519 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
522 char *href = find_attr (tag, "href", &attrind);
524 /* All <link href="..."> link references are external, except those
525 known not to be, such as style sheet and shortcut icon:
527 <link rel="stylesheet" href="...">
528 <link rel="shortcut icon" href="...">
/* rel comparisons are case-insensitive; a stylesheet/icon counts as
   inlined content, anything else as an external document.  */
532 char *rel = find_attr (tag, "rel", NULL);
534 && (0 == strcasecmp (rel, "stylesheet")
535 || 0 == strcasecmp (rel, "shortcut icon")));
536 append_one_url (href, inlinep, tag, attrind, ctx);
540 /* Handle the META tag. This requires special handling because of the
541 refresh feature and because of robot exclusion. */
544 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
546 char *name = find_attr (tag, "name", NULL);
547 char *http_equiv = find_attr (tag, "http-equiv", NULL);
549 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
551 /* Some pages use a META tag to specify that the page be
552 refreshed by a new page after a given number of seconds. The
553 general format for this is:
555 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
557 So we just need to skip past the "NUMBER; URL=" garbage to
560 struct urlpos *entry;
565 char *refresh = find_attr (tag, "content", &attrind);
/* Parse the leading decimal NUMBER as the refresh timeout in seconds.  */
569 for (p = refresh; ISDIGIT (*p); p++)
570 timeout = 10 * timeout + *p - '0';
/* Verify the "URL" keyword case-insensitively; bail out of the refresh
   handling if it is absent.  */
576 if (!( TOUPPER (*p) == 'U'
577 && TOUPPER (*(p + 1)) == 'R'
578 && TOUPPER (*(p + 2)) == 'L'
/* P now points at the refresh target URL.  */
585 entry = append_one_url (p, 0, tag, attrind, ctx);
588 entry->link_refresh_p = 1;
589 entry->refresh_timeout = timeout;
592 else if (name && 0 == strcasecmp (name, "robots"))
594 /* Handle stuff like:
595 <meta name="robots" content="index,nofollow"> */
596 char *content = find_attr (tag, "content", NULL);
599 if (!strcasecmp (content, "none"))
/* Scan the comma-separated content tokens for "nofollow"; the elided
   lines presumably set ctx->nofollow (reported by get_urls_html).  */
605 /* Find the next occurrence of ',' or the end of
607 char *end = strchr (content, ',');
611 end = content + strlen (content);
612 if (!strncasecmp (content, "nofollow", end - content))
620 /* Examine name and attributes of TAG and take appropriate action
621 according to the tag. */
/* Callback invoked by map_html_tags for every interesting tag; ARG is
   the struct map_context supplied by get_urls_html.  */
624 collect_tags_mapper (struct taginfo *tag, void *arg)
626 struct map_context *ctx = (struct map_context *)arg;
628 tag_handler_t handler;
/* The parser was given interesting_tags, so every tag delivered here
   must exist in known_tags.  */
630 tagid = find_tag (tag->name);
631 assert (tagid != -1);
632 handler = known_tags[tagid].handler;
634 handler (tagid, tag, ctx);
637 /* Analyze HTML tags FILE and construct a list of URLs referenced from
638 it. It merges relative links in FILE with URL. It is aware of
639 <base href=...> and does the right thing. */
/* On success the harvested list is in ctx.head (the elided tail of the
   function presumably returns it); *META_DISALLOW_FOLLOW, if non-NULL,
   receives the <meta name=robots> nofollow flag.  */
642 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
644 struct file_memory *fm;
645 struct map_context ctx;
/* Read the whole document into memory; bail out on I/O error.  */
649 fm = read_file (file);
652 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
655 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
657 ctx.text = fm->content;
658 ctx.head = ctx.tail = NULL;
/* Relative links resolve against URL, or --base if no URL was given.  */
660 ctx.parent_base = url ? url : opt.base_href;
661 ctx.document_file = file;
/* Lazily build the tag/attribute filter lists on first use.  */
664 if (!interesting_tags)
667 /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
668 generate <a href=" foo"> instead of <a href="foo"> (Netscape
669 ignores spaces as well.) If you really mean space, use &#32; or
671 flags = MHT_TRIM_VALUES;
672 if (opt.strict_comments)
673 flags |= MHT_STRICT_COMMENTS;
675 map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
676 interesting_tags, interesting_attributes);
678 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
679 if (meta_disallow_follow)
680 *meta_disallow_follow = ctx.nofollow;
682 FREE_MAYBE (ctx.base);
687 /* This doesn't really have anything to do with HTML, but it's similar
688 to get_urls_html, so we put it here. */
/* Read FILE as a plain list of URLs, one per line, and build a urlpos
   list from them.  NOTE(review): the end of this function is elided
   from the listing.  */
691 get_urls_file (const char *file)
693 struct file_memory *fm;
694 struct urlpos *head, *tail;
695 const char *text, *text_end;
/* Read the whole file into memory; bail out on I/O error.  */
698 fm = read_file (file);
701 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
704 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
708 text_end = fm->content + fm->length;
/* Process one line per iteration.  */
709 while (text < text_end)
713 struct urlpos *entry;
716 const char *line_beg = text;
717 const char *line_end = memchr (text, '\n', text_end - text);
724 /* Strip whitespace from the beginning and end of line. */
725 while (line_beg < line_end && ISSPACE (*line_beg))
727 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
/* Skip lines that are empty after trimming.  */
730 if (line_beg == line_end)
733 /* The URL is in the [line_beg, line_end) region. */
735 /* We must copy the URL to a zero-terminated string, and we
736 can't use alloca because we're in a loop. *sigh*. */
737 url_text = strdupdelim (line_beg, line_end);
741 /* Merge opt.base_href with URL. */
742 char *merged = uri_merge (opt.base_href, url_text);
/* Report unparsable URLs but keep processing the remaining lines.  */
747 url = url_parse (url_text, &up_error_code);
750 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
751 file, url_text, url_error (up_error_code));
757 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
758 memset (entry, 0, sizeof (*entry));
/* Release the interesting_tags / interesting_attributes lists built by
   init_interesting (FREE_MAYBE tolerates them being NULL if
   init_interesting never ran).  */
773 cleanup_html_url (void)
775 FREE_MAYBE (interesting_tags);
776 FREE_MAYBE (interesting_attributes);