1 /* Collect URLs from HTML source.
2 Copyright (C) 1998, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
33 #include "html-parse.h"
/* Type of a per-tag handler: receives the tag id (the tag's index in
   known_tags), the parsed tag, and the accumulation context.  */
43 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
44 struct map_context *));
/* Shorthand for forward-declaring a static tag handler with the
   tag_handler_t signature.  */
46 #define DECLARE_TAG_HANDLER(fun) \
47 static void fun PARAMS ((int, struct taginfo *, struct map_context *))
/* The four handlers referenced by the known_tags table below.  */
49 DECLARE_TAG_HANDLER (tag_find_urls);
50 DECLARE_TAG_HANDLER (tag_handle_base);
51 DECLARE_TAG_HANDLER (tag_handle_link);
52 DECLARE_TAG_HANDLER (tag_handle_meta);
54 /* The list of known tags and functions used for handling them.  Most
55 tags are simply harvested for URLs. */
/* NOTE(review): the struct definition and the array declaration lines
   for this table are missing from this excerpt — only one member
   (`handler') and the initializers are visible.  An entry's position is
   its tag id (e.g. "overlay" is entry 15, matching TAG_OVERLAY below),
   and find_tag's comment says the names are kept sorted alphabetically.  */
58 tag_handler_t handler;
61 { "a", tag_find_urls },
63 { "applet", tag_find_urls },
65 { "area", tag_find_urls },
67 { "base", tag_handle_base },
69 { "bgsound", tag_find_urls },
71 { "body", tag_find_urls },
73 { "embed", tag_find_urls },
75 { "fig", tag_find_urls },
77 { "frame", tag_find_urls },
79 { "iframe", tag_find_urls },
81 { "img", tag_find_urls },
83 { "input", tag_find_urls },
85 { "layer", tag_find_urls },
87 { "link", tag_handle_link },
89 { "meta", tag_handle_meta },
/* TAG_OVERLAY == 15 == index of the "overlay" entry just below;
   presumably the other TAG_* ids are #defined on the lines elided
   from this excerpt — TODO confirm against the full file.  */
90 #define TAG_OVERLAY 15
91 { "overlay", tag_find_urls },
93 { "script", tag_find_urls },
95 { "table", tag_find_urls },
97 { "td", tag_find_urls },
99 { "th", tag_find_urls }
102 /* tag_url_attributes documents which attributes of which tags contain
103 URLs to harvest. It is used by tag_find_urls. */
105 /* Defines for the FLAGS field; currently only one flag is defined. */
107 /* This tag points to an external document not necessary for rendering this
108 document (i.e. it's not an inlined image, stylesheet, etc.). */
109 #define TUA_EXTERNAL 1
111 /* For tags handled by tag_find_urls: attributes that contain URLs to
/* NOTE(review): the struct header and the `tagid'/`flags' member lines
   are missing from this excerpt.  Entries for the same tag must be
   contiguous: tag_find_urls scans forward from the first match while
   tagid stays equal, so grouping (and ordering by tag id) is load-bearing.  */
115 const char *attr_name;
117 } tag_url_attributes[] = {
118 { TAG_A, "href", TUA_EXTERNAL },
119 { TAG_APPLET, "code", 0 },
120 { TAG_AREA, "href", TUA_EXTERNAL },
121 { TAG_BGSOUND, "src", 0 },
122 { TAG_BODY, "background", 0 },
123 { TAG_EMBED, "href", 0 },
124 { TAG_EMBED, "src", 0 },
125 { TAG_FIG, "src", 0 },
126 { TAG_FRAME, "src", 0 },
127 { TAG_IFRAME, "src", 0 },
128 { TAG_IMG, "href", 0 },
129 { TAG_IMG, "lowsrc", 0 },
130 { TAG_IMG, "src", 0 },
131 { TAG_INPUT, "src", 0 },
132 { TAG_LAYER, "src", 0 },
133 { TAG_OVERLAY, "src", 0 },
134 { TAG_SCRIPT, "src", 0 },
135 { TAG_TABLE, "background", 0 },
136 { TAG_TD, "background", 0 },
137 { TAG_TH, "background", 0 }
140 /* The lists of interesting tags and attributes are built dynamically,
141 from the information above. However, some places in the code refer
142 to the attributes not mentioned here. We add them manually. */
/* Attributes needed by tag_handle_link/tag_handle_meta that do not
   themselves carry URLs, so they never appear in tag_url_attributes.
   (The closing brace of the initializer is elided from this excerpt.)  */
143 static const char *additional_attributes[] = {
144 "rel", /* for TAG_LINK */
145 "http-equiv", /* for TAG_META */
146 "name", /* for TAG_META */
147 "content" /* for TAG_META */
/* NULL-terminated arrays built once by init_interesting and released
   by cleanup_html_url; passed to map_html_tags in get_urls_html.  */
150 static const char **interesting_tags;
151 static const char **interesting_attributes;
/* Build interesting_tags and interesting_attributes from known_tags,
   tag_url_attributes, and additional_attributes, honoring the user's
   --follow-tags / --ignore-tags settings.  Called once, lazily, from
   get_urls_html.  NOTE(review): several lines of this function
   (declarations of i/j/ind, the `continue'/break statements inside the
   option loops, closing braces) are elided from this excerpt.  */
154 init_interesting (void)
156 /* Init the variables interesting_tags and interesting_attributes
157 that are used by the HTML parser to know which tags and
158 attributes we're interested in. We initialize this only once,
159 for performance reasons.
161 Here we also make sure that what we put in interesting_tags
162 matches the user's preferences as specified through --ignore-tags
165 This function is as large as this only because of the glorious
166 expressivity of the C programming language. */
/* +1 leaves room for the terminating NULL stored after the loop.  */
170 int size = ARRAY_SIZE (known_tags);
171 interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));
173 for (i = 0; i < size; i++)
175 const char *name = known_tags[i].name;
177 /* Normally here we could say:
178 interesting_tags[i] = name;
179 But we need to respect the settings of --ignore-tags and
180 --follow-tags, so the code gets a bit hairier. */
184 /* --ignore-tags was specified. Do not match these
185 specific tags. --ignore-tags takes precedence over
186 --follow-tags, so we process --ignore first and fall
187 through if there's no match. */
189 for (j = 0; opt.ignore_tags[j] != NULL; j++)
190 /* Loop through all the tags this user doesn't care about. */
191 if (strcasecmp(opt.ignore_tags[j], name) == EQ)
202 /* --follow-tags was specified. Only match these specific tags, so
203 continue back to top of for if we don't match one of them. */
205 for (j = 0; opt.follow_tags[j] != NULL; j++)
206 /* Loop through all the tags this user cares about. */
207 if (strcasecmp(opt.follow_tags[j], name) == EQ)
213 continue; /* wasn't one of the explicitly desired tags */
216 /* If we get to here, --follow-tags isn't being used or the
217 tag is among the ones that are followed, and --ignore-tags,
218 if specified, didn't include this tag, so it's an
219 "interesting" one. */
220 interesting_tags[ind++] = name;
/* NULL-terminate; map_html_tags presumably walks until NULL — confirm
   against html-parse.c.  */
222 interesting_tags[ind] = NULL;
225 /* The same for attributes, except we loop through tag_url_attributes.
226 Here we also need to make sure that the list of attributes is
227 unique, and to include the attributes from additional_attributes. */
230 const char **att = xmalloc ((ARRAY_SIZE (additional_attributes) + 1)
232 /* First copy the "additional" attributes. */
233 for (i = 0; i < ARRAY_SIZE (additional_attributes); i++)
234 att[i] = additional_attributes[i];
/* Then append each URL-bearing attribute name unless already present
   (the inner loop below is a linear duplicate check).  */
237 for (i = 0; i < ARRAY_SIZE (tag_url_attributes); i++)
240 const char *look_for = tag_url_attributes[i].attr_name;
241 for (j = 0; j < ind - 1; j++)
242 if (!strcmp (att[j], look_for))
/* Grow one slot at a time; xrealloc aborts on OOM, so overwriting
   `att' directly cannot leak here.  */
249 att = xrealloc (att, (ind + 2) * sizeof (*att));
250 att[ind++] = look_for;
254 interesting_attributes = att;
/* Look up TAG_NAME in known_tags and return its index (the tag id).
   NOTE(review): the tail of this function is elided from this excerpt;
   judging by the assert in collect_tags_mapper it apparently returns -1
   when the tag is unknown — confirm against the full file.  */
259 find_tag (const char *tag_name)
263 /* This is linear search; if the number of tags grow, we can switch
266 for (i = 0; i < ARRAY_SIZE (known_tags); i++)
268 int cmp = strcasecmp (known_tags[i].name, tag_name);
269 /* known_tags are sorted alphabetically, so we can
279 /* Find the value of attribute named NAME in the taginfo TAG. If the
280 attribute is not present, return NULL. If ATTRIND is non-NULL, the
281 index of the attribute in TAG will be stored there. */
/* NOTE(review): the declaration of `i', the store through ATTRIND, and
   the trailing `return NULL' are elided from this excerpt.  Attribute
   names are matched case-insensitively.  */
283 find_attr (struct taginfo *tag, const char *name, int *attrind)
286 for (i = 0; i < tag->nattrs; i++)
287 if (!strcasecmp (tag->attrs[i].name, name))
291 return tag->attrs[i].value;
/* Per-document state threaded through the tag handlers via the `void *'
   argument of map_html_tags.  (The `struct map_context {' line is
   elided from this excerpt.)  */
297 char *text; /* HTML text. */
298 char *base; /* Base URI of the document, possibly
299 changed through <base href=...>. */
300 const char *parent_base; /* Base of the current document. */
301 const char *document_file; /* File name of this document. */
302 int nofollow; /* whether NOFOLLOW was specified in a
303 <meta name=robots> tag. */
305 struct urlpos *head, *tail; /* List of URLs that is being
309 /* Append LINK_URI to the urlpos structure that is being built.
311 LINK_URI will be merged with the current document base. TAG and
312 ATTRIND are the necessary context to store the position and
/* Returns the freshly appended urlpos so callers (tag_handle_base,
   tag_handle_meta) can set extra flags on it.  NOTE(review): the
   early-return paths for unparsable links and the final `return newel'
   are elided from this excerpt — apparently NULL is returned on
   failure, since callers test the result; confirm against the full file.  */
315 static struct urlpos *
316 append_one_url (const char *link_uri, int inlinep,
317 struct taginfo *tag, int attrind, struct map_context *ctx)
319 int link_has_scheme = url_has_scheme (link_uri);
320 struct urlpos *newel;
/* A <base href> seen earlier in this document overrides the base
   inherited from the enclosing document/--base option.  */
321 const char *base = ctx->base ? ctx->base : ctx->parent_base;
326 DEBUGP (("%s: no base, merge will use \"%s\".\n",
327 ctx->document_file, link_uri));
329 if (!link_has_scheme)
331 /* We have no base, and the link does not have a host
332 attached to it. Nothing we can do. */
333 /* #### Should we print a warning here? Wget 1.5.x used to. */
/* No base available: the link must stand on its own.  */
337 url = url_parse (link_uri, NULL);
340 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
341 ctx->document_file, link_uri));
347 /* Merge BASE with LINK_URI, but also make sure the result is
348 canonicalized, i.e. that "../" have been resolved.
349 (parse_url will do that for us.) */
351 char *complete_uri = uri_merge (base, link_uri);
353 DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
354 ctx->document_file, base, link_uri, complete_uri));
356 url = url_parse (complete_uri, NULL);
359 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
360 ctx->document_file, complete_uri));
/* uri_merge allocates; free on both the failure and success paths.  */
361 xfree (complete_uri);
364 xfree (complete_uri);
367 newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
368 memset (newel, 0, sizeof (*newel));
/* Record where in ctx->text the raw attribute value sat, so link
   conversion can later rewrite it in place.  */
372 newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
373 newel->size = tag->attrs[attrind].value_raw_size;
374 newel->link_inline_p = inlinep;
376 /* A URL is relative if the host is not named, and the name does not
378 if (!link_has_scheme && *link_uri != '/')
379 newel->link_relative_p = 1;
380 else if (link_has_scheme)
381 newel->link_complete_p = 1;
/* Append to the tail of ctx's list, or start the list if empty.  */
385 ctx->tail->next = newel;
389 ctx->tail = ctx->head = newel;
394 /* All the tag_* functions are called from collect_tags_mapper, as
395 specified by KNOWN_TAGS. */
397 /* For most tags, all we want to do is harvest URLs from their
/* Generic handler: for each attribute of TAG that tag_url_attributes
   marks as URL-bearing, append its value to ctx's URL list.  */
401 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
403 int i, attrind, first = -1;
404 int size = ARRAY_SIZE (tag_url_attributes);
/* Locate the first tag_url_attributes entry for this tag; entries for
   one tag are contiguous, so FIRST anchors the inner scan below.  */
406 for (i = 0; i < size; i++)
407 if (tag_url_attributes[i].tagid == tagid)
409 /* We've found the index of tag_url_attributes where the
410 attributes of our tags begin. */
/* Every tag routed here by known_tags must have at least one entry.  */
414 assert (first != -1);
416 /* Loop over the "interesting" attributes of this tag. In this
417 example, it will loop over "src" and "lowsrc".
419 <img src="foo.png" lowsrc="bar.png">
421 This has to be done in the outer loop so that the attributes are
422 processed in the same order in which they appear in the page.
423 This is required when converting links. */
425 for (attrind = 0; attrind < tag->nattrs; attrind++)
427 /* Find whether TAG/ATTRIND is a combination that contains a
429 char *attrvalue = tag->attrs[attrind].value;
431 /* If you're cringing at the inefficiency of the nested loops,
432 remember that the number of attributes the inner loop
433 iterates over is laughably small -- three in the worst case
435 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
437 if (0 == strcasecmp (tag->attrs[attrind].name,
438 tag_url_attributes[i].attr_name))
/* TUA_EXTERNAL set means "external document", so inlinep is its
   negation.  */
440 int flags = tag_url_attributes[i].flags;
441 append_one_url (attrvalue, !(flags & TUA_EXTERNAL),
/* Handle <base href=...>: record the URL like any other (but flagged so
   it is never downloaded) and install it as ctx->base for resolving the
   document's subsequent relative links.  */
449 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
451 struct urlpos *base_urlpos;
453 char *newbase = find_attr (tag, "href", &attrind);
457 base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
/* The base entry participates in link conversion but must not be
   fetched.  */
460 base_urlpos->ignore_when_downloading = 1;
461 base_urlpos->link_base_p = 1;
/* A relative <base href> is itself resolved against the parent base.  */
465 if (ctx->parent_base)
466 ctx->base = uri_merge (ctx->parent_base, newbase);
468 ctx->base = xstrdup (newbase);
/* Handle <link href=...>.  Only rel="stylesheet" links count as inline
   (needed to render the page); everything else is external.  */
472 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
475 char *href = find_attr (tag, "href", &attrind);
477 /* All <link href="..."> link references are external,
478 except for <link rel="stylesheet" href="...">. */
481 char *rel = find_attr (tag, "rel", NULL);
482 int inlinep = (rel && 0 == strcasecmp (rel, "stylesheet"));
483 append_one_url (href, inlinep, tag, attrind, ctx);
487 /* Some pages use a META tag to specify that the page be refreshed by
488 a new page after a given number of seconds. The general format for
491 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
493 So we just need to skip past the "NUMBER; URL=" garbage to get to
/* Also handles <meta name=robots ...>: a "nofollow" or "none" token in
   CONTENT sets ctx->nofollow.  NOTE(review): several parsing lines
   (skipping the "; URL=" separator, the p += 4 advance, early returns)
   are elided from this excerpt.  */
497 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
499 char *name = find_attr (tag, "name", NULL);
500 char *http_equiv = find_attr (tag, "http-equiv", NULL);
502 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
504 struct urlpos *entry;
507 char *p, *refresh = find_attr (tag, "content", &attrind);
/* Parse the leading decimal NUMBER by hand.  */
510 for (p = refresh; ISDIGIT (*p); p++)
511 timeout = 10 * timeout + *p - '0';
/* Require a (case-insensitive) "URL" token before the target.  */
517 if (!( TOUPPER (*p) == 'U'
518 && TOUPPER (*(p + 1)) == 'R'
519 && TOUPPER (*(p + 2)) == 'L'
526 entry = append_one_url (p, 0, tag, attrind, ctx);
529 entry->link_refresh_p = 1;
530 entry->refresh_timeout = timeout;
533 else if (name && 0 == strcasecmp (name, "robots"))
535 /* Handle stuff like:
536 <meta name="robots" content="index,nofollow"> */
537 char *content = find_attr (tag, "content", NULL);
/* "none" implies noindex,nofollow.  */
540 if (!strcasecmp (content, "none"))
546 /* Find the next occurrence of ',' or the end of
548 char *end = strchr (content, ',');
552 end = content + strlen (content);
/* Compare only the current comma-delimited token.  */
553 if (!strncasecmp (content, "nofollow", end - content))
561 /* Examine name and attributes of TAG and take appropriate action
562 according to the tag. */
/* Callback passed to map_html_tags; ARG is the struct map_context.
   Dispatches to the handler recorded in known_tags.  */
565 collect_tags_mapper (struct taginfo *tag, void *arg)
567 struct map_context *ctx = (struct map_context *)arg;
569 tag_handler_t handler;
/* Only tags listed in interesting_tags reach this callback, so the
   lookup must succeed.  */
571 tagid = find_tag (tag->name);
572 assert (tagid != -1);
573 handler = known_tags[tagid].handler;
575 handler (tagid, tag, ctx);
578 /* Analyze HTML tags FILE and construct a list of URLs referenced from
579 it. It merges relative links in FILE with URL. It is aware of
580 <base href=...> and does the right thing. */
/* Returns the harvested urlpos list (ctx.head — the `return' itself is
   elided from this excerpt).  If META_DISALLOW_FOLLOW is non-NULL it
   receives the <meta name=robots> nofollow verdict.  NOTE(review): the
   `return NULL' on read failure, ctx.base/nofollow initialization, and
   the read_file cleanup are among the lines missing here.  */
582 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
584 struct file_memory *fm;
585 struct map_context ctx;
588 fm = read_file (file);
591 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
594 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
596 ctx.text = fm->content;
597 ctx.head = ctx.tail = NULL;
/* URL takes precedence; --base sets opt.base_href as a fallback.  */
599 ctx.parent_base = url ? url : opt.base_href;
600 ctx.document_file = file;
/* Lazily build the tag/attribute filters on first use.  */
603 if (!interesting_tags)
606 map_html_tags (fm->content, fm->length, interesting_tags,
607 interesting_attributes, collect_tags_mapper, &ctx);
609 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
610 if (meta_disallow_follow)
611 *meta_disallow_follow = ctx.nofollow;
/* ctx.base is heap-allocated by tag_handle_base when a <base> was seen.  */
613 FREE_MAYBE (ctx.base);
/* Release the lazily-built filter arrays from init_interesting.
   (The array *contents* point into static tables and are not freed.)  */
619 cleanup_html_url (void)
621 FREE_MAYBE (interesting_tags);
622 FREE_MAYBE (interesting_attributes);