1 /* Collect URLs from HTML source.
2 Copyright (C) 1998, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
33 #include "html-parse.h"
/* Signature shared by every tag handler: it returns nothing and takes
   an int, the parsed tag, and the mapping context.  The handler
   definitions in this file name the int parameter `tagid'. */
43 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
44 struct map_context *));
/* Declare a static handler named FUN with the same parameter list as
   tag_handler_t. */
46 #define DECLARE_TAG_HANDLER(fun) \
47 static void fun PARAMS ((int, struct taginfo *, struct map_context *))
/* Forward declarations of the handlers referenced by the known_tags
   table. */
49 DECLARE_TAG_HANDLER (tag_find_urls);
50 DECLARE_TAG_HANDLER (tag_handle_base);
51 DECLARE_TAG_HANDLER (tag_handle_link);
52 DECLARE_TAG_HANDLER (tag_handle_meta);
54 /* The list of known tags and functions used for handling them. Most
55 tags are simply harvested for URLs. */
/* Each entry pairs a tag name with its handler; collect_tags_mapper
   fetches the handler as known_tags[tagid].handler.  The entries are
   listed in alphabetical order, and TAG_OVERLAY (15) equals the
   zero-based position of "overlay" in the list.
   NOTE(review): the head of this declaration and the other TAG_*
   index macros are not visible in this excerpt -- confirm they follow
   the same position-based numbering. */
58 tag_handler_t handler;
61 { "a", tag_find_urls },
63 { "applet", tag_find_urls },
65 { "area", tag_find_urls },
67 { "base", tag_handle_base },
69 { "bgsound", tag_find_urls },
71 { "body", tag_find_urls },
73 { "embed", tag_find_urls },
75 { "fig", tag_find_urls },
77 { "frame", tag_find_urls },
79 { "iframe", tag_find_urls },
81 { "img", tag_find_urls },
83 { "input", tag_find_urls },
85 { "layer", tag_find_urls },
87 { "link", tag_handle_link },
89 { "meta", tag_handle_meta },
90 #define TAG_OVERLAY 15
91 { "overlay", tag_find_urls },
93 { "script", tag_find_urls },
95 { "table", tag_find_urls },
97 { "td", tag_find_urls },
99 { "th", tag_find_urls }
102 /* tag_url_attributes documents which attributes of which tags contain
103 URLs to harvest. It is used by tag_find_urls. */
105 /* Defines for the FLAGS field; currently only one flag is defined. */
107 /* This tag points to an external document not necessary for rendering this
108 document (i.e. it's not an inlined image, stylesheet, etc.). */
109 #define TUA_EXTERNAL 1
/* Each row of the table below holds a tag index, an attribute name,
   and a flags value (0 or TUA_EXTERNAL).  Rows belonging to one tag
   are adjacent; tag_find_urls depends on that, because it scans
   forward from the first row whose tagid matches and stops at the
   first row whose tagid differs.
   NOTE(review): the head of the struct declaration is not visible in
   this excerpt. */
111 /* For tags handled by tag_find_urls: attributes that contain URLs to
115 const char *attr_name;
117 } tag_url_attributes[] = {
118 { TAG_A, "href", TUA_EXTERNAL },
119 { TAG_APPLET, "code", 0 },
120 { TAG_AREA, "href", TUA_EXTERNAL },
121 { TAG_BGSOUND, "src", 0 },
122 { TAG_BODY, "background", 0 },
123 { TAG_EMBED, "href", TUA_EXTERNAL },
124 { TAG_EMBED, "src", 0 },
125 { TAG_FIG, "src", 0 },
126 { TAG_FRAME, "src", 0 },
127 { TAG_IFRAME, "src", 0 },
128 { TAG_IMG, "href", 0 },
129 { TAG_IMG, "lowsrc", 0 },
130 { TAG_IMG, "src", 0 },
131 { TAG_INPUT, "src", 0 },
132 { TAG_LAYER, "src", 0 },
133 { TAG_OVERLAY, "src", 0 },
134 { TAG_SCRIPT, "src", 0 },
135 { TAG_TABLE, "background", 0 },
136 { TAG_TD, "background", 0 },
137 { TAG_TH, "background", 0 }
140 /* The lists of interesting tags and attributes are built dynamically,
141 from the information above. However, some places in the code refer
142 to the attributes not mentioned here. We add them manually. */
143 static const char *additional_attributes[] = {
144 "rel", /* for TAG_LINK */
145 "http-equiv", /* for TAG_META */
146 "name", /* for TAG_META */
147 "content" /* for TAG_META */
/* Lists passed to map_html_tags by get_urls_html.  init_interesting
   allocates both and terminates interesting_tags with NULL;
   cleanup_html_url releases them.  Both start out as null pointers
   (static storage), which get_urls_html tests before use. */
150 static const char **interesting_tags;
151 static const char **interesting_attributes;
/* Build interesting_tags from known_tags, filtered through
   opt.ignore_tags and opt.follow_tags, and build
   interesting_attributes from additional_attributes plus the
   attribute names found in tag_url_attributes.
   NOTE(review): several lines of this function (braces, local
   declarations, and the bodies of the inner checks) are not visible
   in this excerpt. */
154 init_interesting (void)
156 /* Init the variables interesting_tags and interesting_attributes
157 that are used by the HTML parser to know which tags and
158 attributes we're interested in. We initialize this only once,
159 for performance reasons.
161 Here we also make sure that what we put in interesting_tags
162 matches the user's preferences as specified through --ignore-tags
165 This function is as large as this only because of the glorious
166 expressivity of the C programming language. */
170 int size = ARRAY_SIZE (known_tags);
171 interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));
173 for (i = 0; i < size; i++)
175 const char *name = known_tags[i].name;
177 /* Normally here we could say:
178 interesting_tags[i] = name;
179 But we need to respect the settings of --ignore-tags and
180 --follow-tags, so the code gets a bit hairier. */
184 /* --ignore-tags was specified. Do not match these
185 specific tags. --ignore-tags takes precedence over
186 --follow-tags, so we process --ignore first and fall
187 through if there's no match. */
189 for (j = 0; opt.ignore_tags[j] != NULL; j++)
190 /* Loop through all the tags this user doesn't care about. */
191 if (strcasecmp(opt.ignore_tags[j], name) == EQ)
202 /* --follow-tags was specified. Only match these specific tags, so
203 continue back to top of for if we don't match one of them. */
205 for (j = 0; opt.follow_tags[j] != NULL; j++)
206 /* Loop through all the tags this user cares about. */
207 if (strcasecmp(opt.follow_tags[j], name) == EQ)
213 continue; /* wasn't one of the explicitly desired tags */
216 /* If we get to here, --follow-tags isn't being used or the
217 tag is among the ones that are followed, and --ignore-tags,
218 if specified, didn't include this tag, so it's an
219 "interesting" one. */
220 interesting_tags[ind++] = name;
222 interesting_tags[ind] = NULL;
225 /* The same for attributes, except we loop through tag_url_attributes.
226 Here we also need to make sure that the list of attributes is
227 unique, and to include the attributes from additional_attributes. */
230 const char **att = xmalloc ((ARRAY_SIZE (additional_attributes) + 1)
232 /* First copy the "additional" attributes. */
233 for (i = 0; i < ARRAY_SIZE (additional_attributes); i++)
234 att[i] = additional_attributes[i];
237 for (i = 0; i < ARRAY_SIZE (tag_url_attributes); i++)
240 const char *look_for = tag_url_attributes[i].attr_name;
/* NOTE(review): assuming ind counts the entries currently stored in
   att, the bound `ind - 1' leaves the most recently stored entry
   unchecked, so a name equal to that entry (e.g. two adjacent "src"
   rows) would be appended a second time.  Confirm whether `j < ind'
   was intended. */
241 for (j = 0; j < ind - 1; j++)
242 if (!strcmp (att[j], look_for))
/* Grow by one slot for the new name plus one spare slot. */
249 att = xrealloc (att, (ind + 2) * sizeof (*att));
250 att[ind++] = look_for;
254 interesting_attributes = att;
/* Search known_tags for TAG_NAME, comparing names with strcasecmp
   (case-insensitively).
   NOTE(review): the return statements are not visible in this
   excerpt.  collect_tags_mapper asserts that the result is not -1 and
   then uses it as an index into known_tags. */
259 find_tag (const char *tag_name)
263 /* This is linear search; if the number of tags grows, we can switch
266 for (i = 0; i < ARRAY_SIZE (known_tags); i++)
268 int cmp = strcasecmp (known_tags[i].name, tag_name);
269 /* known_tags are sorted alphabetically, so we can
279 /* Find the value of attribute named NAME in the taginfo TAG. If the
280 attribute is not present, return NULL. If ATTRIND is non-NULL, the
281 index of the attribute in TAG will be stored there. */
/* Attribute names are compared with strcasecmp, so the match is
   case-insensitive; the first matching attribute wins. */
283 find_attr (struct taginfo *tag, const char *name, int *attrind)
286 for (i = 0; i < tag->nattrs; i++)
287 if (!strcasecmp (tag->attrs[i].name, name))
291 return tag->attrs[i].value;
/* Fields of the per-document state that get_urls_html fills in and
   that every tag handler receives as `struct map_context *ctx'.
   NOTE(review): the opening line of the struct declaration is not
   visible in this excerpt. */
297 char *text; /* HTML text. */
298 char *base; /* Base URI of the document, possibly
299 changed through <base href=...>. */
300 const char *parent_base; /* Base of the current document. */
301 const char *document_file; /* File name of this document. */
302 int nofollow; /* whether NOFOLLOW was specified in a
303 <meta name=robots> tag. */
305 struct urlpos *head, *tail; /* List of URLs that is being
309 /* Append LINK_URI to the urlpos structure that is being built.
311 LINK_URI will be merged with the current document base. TAG and
312 ATTRIND are the necessary context to store the position and
315 static struct urlpos *
316 append_one_url (const char *link_uri, int inlinep,
317 struct taginfo *tag, int attrind, struct map_context *ctx)
319 int link_has_scheme = url_has_scheme (link_uri);
320 struct urlpos *newel;
321 const char *base = ctx->base ? ctx->base : ctx->parent_base;
326 DEBUGP (("%s: no base, merge will use \"%s\".\n",
327 ctx->document_file, link_uri));
329 if (!link_has_scheme)
331 /* Base URL is unavailable, and the link does not have a
332 location attached to it -- we have to give up. Since
333 this can only happen when using `--force-html -i', print
335 logprintf (LOG_NOTQUIET,
336 _("%s: Cannot resolve incomplete link %s.\n"),
337 ctx->document_file, link_uri);
341 url = url_parse (link_uri, NULL);
344 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
345 ctx->document_file, link_uri));
351 /* Merge BASE with LINK_URI, but also make sure the result is
352 canonicalized, i.e. that "../" have been resolved.
353 (url_parse will do that for us.) */
355 char *complete_uri = uri_merge (base, link_uri);
357 DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
358 ctx->document_file, base, link_uri, complete_uri));
360 url = url_parse (complete_uri, NULL);
363 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
364 ctx->document_file, complete_uri));
365 xfree (complete_uri);
368 xfree (complete_uri);
371 DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
/* Allocate the new list element and zero every field. */
373 newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
374 memset (newel, 0, sizeof (*newel));
/* pos is the offset of the attribute's raw value from the start of
   the document text (ctx->text); size is copied from the attribute's
   value_raw_size. */
378 newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
379 newel->size = tag->attrs[attrind].value_raw_size;
380 newel->link_inline_p = inlinep;
382 /* A URL is relative if the host is not named, and the name does not
384 if (!link_has_scheme && *link_uri != '/')
385 newel->link_relative_p = 1;
386 else if (link_has_scheme)
387 newel->link_complete_p = 1;
391 ctx->tail->next = newel;
395 ctx->tail = ctx->head = newel;
400 /* All the tag_* functions are called from collect_tags_mapper, as
401 specified by KNOWN_TAGS. */
403 /* Default tag handler: collect URLs from attributes specified for
404 this tag by tag_url_attributes. */
407 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
409 int i, attrind, first = -1;
410 int size = ARRAY_SIZE (tag_url_attributes);
412 for (i = 0; i < size; i++)
413 if (tag_url_attributes[i].tagid == tagid)
415 /* We've found the index of tag_url_attributes where the
416 attributes of our tag begin. */
/* Every tag routed to this handler must have at least one row in
   tag_url_attributes. */
420 assert (first != -1);
422 /* Loop over the "interesting" attributes of this tag. In this
423 example, it will loop over "src" and "lowsrc".
425 <img src="foo.png" lowsrc="bar.png">
427 This has to be done in the outer loop so that the attributes are
428 processed in the same order in which they appear in the page.
429 This is required when converting links. */
431 for (attrind = 0; attrind < tag->nattrs; attrind++)
433 /* Find whether TAG/ATTRIND is a combination that contains a
435 char *link = tag->attrs[attrind].value;
437 /* If you're cringing at the inefficiency of the nested loops,
438 remember that they both iterate over a laughably small
439 quantity of items. The worst-case inner loop is for the IMG
440 tag, which has three attributes. */
441 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
443 if (0 == strcasecmp (tag->attrs[attrind].name,
444 tag_url_attributes[i].attr_name))
/* The link counts as inline unless the matching row carries
   TUA_EXTERNAL. */
446 int flags = tag_url_attributes[i].flags;
447 append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
453 /* Handle the BASE tag, for <base href=...>. */
456 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
458 struct urlpos *base_urlpos;
460 char *newbase = find_attr (tag, "href", &attrind);
/* Record the href as a non-inline URL entry, then mark that entry as
   the document base so it is ignored when downloading.
   NOTE(review): the checks guarding a missing href and a NULL result
   from append_one_url are not visible in this excerpt. */
464 base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
467 base_urlpos->ignore_when_downloading = 1;
468 base_urlpos->link_base_p = 1;
/* The new base is NEWBASE merged with the parent base when one
   exists, otherwise a private copy of NEWBASE.  Either way ctx->base
   ends up owning a fresh string.
   NOTE(review): releasing a previous ctx->base (a second <base> tag
   in the same document) is not visible in this excerpt -- confirm it
   happens before these assignments. */
472 if (ctx->parent_base)
473 ctx->base = uri_merge (ctx->parent_base, newbase);
475 ctx->base = xstrdup (newbase);
478 /* Handle the LINK tag. It requires special handling because how its
479 links will be followed in -p mode depends on the REL attribute. */
482 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
485 char *href = find_attr (tag, "href", &attrind);
487 /* All <link href="..."> link references are external,
488 except for <link rel="stylesheet" href="...">. */
/* REL is compared case-insensitively and must equal "stylesheet"
   exactly for the link to count as inline; a missing REL means
   external. */
491 char *rel = find_attr (tag, "rel", NULL);
492 int inlinep = (rel && 0 == strcasecmp (rel, "stylesheet"));
493 append_one_url (href, inlinep, tag, attrind, ctx);
497 /* Handle the META tag. This requires special handling because of the
498 refresh feature and because of robot exclusion. */
/* Two cases are handled: http-equiv="refresh", whose CONTENT yields a
   timeout and a URL that is appended as a refresh link, and
   name="robots", whose CONTENT is scanned for "none" and "nofollow".
   Both attribute values are compared case-insensitively.
   NOTE(review): in the robots branch, strncasecmp is limited to the
   token length (end - content), so, as far as the visible lines show,
   a token that is only a prefix of "nofollow" -- or an empty token --
   also compares equal.  Confirm whether the token length should be
   checked as well. */
501 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
503 char *name = find_attr (tag, "name", NULL);
504 char *http_equiv = find_attr (tag, "http-equiv", NULL);
506 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
508 /* Some pages use a META tag to specify that the page be
509 refreshed by a new page after a given number of seconds. The
510 general format for this is:
512 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
514 So we just need to skip past the "NUMBER; URL=" garbage to
517 struct urlpos *entry;
520 char *p, *refresh = find_attr (tag, "content", &attrind);
523 for (p = refresh; ISDIGIT (*p); p++)
524 timeout = 10 * timeout + *p - '0';
530 if (!( TOUPPER (*p) == 'U'
531 && TOUPPER (*(p + 1)) == 'R'
532 && TOUPPER (*(p + 2)) == 'L'
539 entry = append_one_url (p, 0, tag, attrind, ctx);
542 entry->link_refresh_p = 1;
543 entry->refresh_timeout = timeout;
546 else if (name && 0 == strcasecmp (name, "robots"))
548 /* Handle stuff like:
549 <meta name="robots" content="index,nofollow"> */
550 char *content = find_attr (tag, "content", NULL);
553 if (!strcasecmp (content, "none"))
559 /* Find the next occurrence of ',' or the end of
561 char *end = strchr (content, ',');
565 end = content + strlen (content);
566 if (!strncasecmp (content, "nofollow", end - content))
574 /* Examine name and attributes of TAG and take appropriate action
575 according to the tag. */
/* ARG is the struct map_context * that get_urls_html passes to
   map_html_tags.  The tag name must be present in known_tags;
   otherwise the assertion below fails. */
578 collect_tags_mapper (struct taginfo *tag, void *arg)
580 struct map_context *ctx = (struct map_context *)arg;
582 tag_handler_t handler;
584 tagid = find_tag (tag->name);
585 assert (tagid != -1);
586 handler = known_tags[tagid].handler;
588 handler (tagid, tag, ctx);
591 /* Analyze the HTML tags in FILE and construct a list of URLs referenced
592 from it. It merges relative links in FILE with URL. It is aware of
593 <base href=...> and does the right thing. */
/* If META_DISALLOW_FOLLOW is non-NULL, it receives the nofollow flag
   gathered from the document's <meta name=robots> tags.
   NOTE(review): the return statements and the check on the result of
   read_file are not visible in this excerpt. */
595 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
597 struct file_memory *fm;
598 struct map_context ctx;
601 fm = read_file (file);
604 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
607 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
609 ctx.text = fm->content;
610 ctx.head = ctx.tail = NULL;
/* Relative links resolve against URL when it is given, otherwise
   against opt.base_href. */
612 ctx.parent_base = url ? url : opt.base_href;
613 ctx.document_file = file;
/* The tag and attribute lists are shared by all calls; this tests
   whether they have been set up yet. */
616 if (!interesting_tags)
/* Run collect_tags_mapper, with &ctx as its argument, over the tags
   of the loaded document. */
619 map_html_tags (fm->content, fm->length, interesting_tags,
620 interesting_attributes, collect_tags_mapper, &ctx);
622 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
623 if (meta_disallow_follow)
624 *meta_disallow_follow = ctx.nofollow;
/* ctx.base is only set when tag_handle_base saw a <base href=...>. */
626 FREE_MAYBE (ctx.base);
/* Release the tag and attribute lists allocated by init_interesting.
   Only the two arrays are passed to FREE_MAYBE; the strings they
   point to come from the static tables in this file. */
632 cleanup_html_url (void)
634 FREE_MAYBE (interesting_tags);
635 FREE_MAYBE (interesting_attributes);