1 /* Collect URLs from HTML source.
2 Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
43 #include "html-parse.h"
55 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
56 struct map_context *));
58 #define DECLARE_TAG_HANDLER(fun) \
59 static void fun PARAMS ((int, struct taginfo *, struct map_context *))
61 DECLARE_TAG_HANDLER (tag_find_urls);
62 DECLARE_TAG_HANDLER (tag_handle_base);
63 DECLARE_TAG_HANDLER (tag_handle_form);
64 DECLARE_TAG_HANDLER (tag_handle_link);
65 DECLARE_TAG_HANDLER (tag_handle_meta);
91 /* The list of known tags and functions used for handling them. Most
92 tags are simply harvested for URLs. */
93 static struct known_tag {
96 tag_handler_t handler;
98 { TAG_A, "a", tag_find_urls },
99 { TAG_APPLET, "applet", tag_find_urls },
100 { TAG_AREA, "area", tag_find_urls },
101 { TAG_BASE, "base", tag_handle_base },
102 { TAG_BGSOUND, "bgsound", tag_find_urls },
103 { TAG_BODY, "body", tag_find_urls },
104 { TAG_EMBED, "embed", tag_find_urls },
105 { TAG_FIG, "fig", tag_find_urls },
106 { TAG_FORM, "form", tag_handle_form },
107 { TAG_FRAME, "frame", tag_find_urls },
108 { TAG_IFRAME, "iframe", tag_find_urls },
109 { TAG_IMG, "img", tag_find_urls },
110 { TAG_INPUT, "input", tag_find_urls },
111 { TAG_LAYER, "layer", tag_find_urls },
112 { TAG_LINK, "link", tag_handle_link },
113 { TAG_META, "meta", tag_handle_meta },
114 { TAG_OVERLAY, "overlay", tag_find_urls },
115 { TAG_SCRIPT, "script", tag_find_urls },
116 { TAG_TABLE, "table", tag_find_urls },
117 { TAG_TD, "td", tag_find_urls },
118 { TAG_TH, "th", tag_find_urls }
121 /* tag_url_attributes documents which attributes of which tags contain
122 URLs to harvest. It is used by tag_find_urls. */
124 /* Defines for the FLAGS field; currently only one flag is defined. */
126 /* This tag points to an external document not necessary for rendering this
127 document (i.e. it's not an inlined image, stylesheet, etc.). */
128 #define TUA_EXTERNAL 1
130 /* For tags handled by tag_find_urls: attributes that contain URLs to
134 const char *attr_name;
136 } tag_url_attributes[] = {
137 { TAG_A, "href", TUA_EXTERNAL },
138 { TAG_APPLET, "code", 0 },
139 { TAG_AREA, "href", TUA_EXTERNAL },
140 { TAG_BGSOUND, "src", 0 },
141 { TAG_BODY, "background", 0 },
142 { TAG_EMBED, "href", TUA_EXTERNAL },
143 { TAG_EMBED, "src", 0 },
144 { TAG_FIG, "src", 0 },
145 { TAG_FRAME, "src", 0 },
146 { TAG_IFRAME, "src", 0 },
147 { TAG_IMG, "href", 0 },
148 { TAG_IMG, "lowsrc", 0 },
149 { TAG_IMG, "src", 0 },
150 { TAG_INPUT, "src", 0 },
151 { TAG_LAYER, "src", 0 },
152 { TAG_OVERLAY, "src", 0 },
153 { TAG_SCRIPT, "src", 0 },
154 { TAG_TABLE, "background", 0 },
155 { TAG_TD, "background", 0 },
156 { TAG_TH, "background", 0 }
159 /* The lists of interesting tags and attributes are built dynamically,
160 from the information above. However, some places in the code refer
161 to the attributes not mentioned here. We add them manually. */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link */
  "http-equiv",                 /* used by tag_handle_meta */
  "name",                       /* used by tag_handle_meta */
  "content",                    /* used by tag_handle_meta */
  "action"                      /* used by tag_handle_form */
};
170 struct hash_table *interesting_tags;
171 struct hash_table *interesting_attributes;
174 init_interesting (void)
176 /* Init the variables interesting_tags and interesting_attributes
177 that are used by the HTML parser to know which tags and
178 attributes we're interested in. We initialize this only once,
179 for performance reasons.
181 Here we also make sure that what we put in interesting_tags
182 matches the user's preferences as specified through --ignore-tags
183 and --follow-tags. */
186 interesting_tags = make_nocase_string_hash_table (countof (known_tags));
188 /* First, add all the tags we know hot to handle, mapped to their
189 respective entries in known_tags. */
190 for (i = 0; i < countof (known_tags); i++)
191 hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
193 /* Then remove the tags ignored through --ignore-tags. */
197 for (ignored = opt.ignore_tags; *ignored; ignored++)
198 hash_table_remove (interesting_tags, *ignored);
201 /* If --follow-tags is specified, use only those tags. */
204 /* Create a new table intersecting --follow-tags and known_tags,
205 and use it as interesting_tags. */
206 struct hash_table *intersect = make_nocase_string_hash_table (0);
208 for (followed = opt.follow_tags; *followed; followed++)
210 struct known_tag *t = hash_table_get (interesting_tags, *followed);
212 continue; /* ignore unknown --follow-tags entries. */
213 hash_table_put (intersect, *followed, t);
215 hash_table_destroy (interesting_tags);
216 interesting_tags = intersect;
219 /* Add the attributes we care about. */
220 interesting_attributes = make_nocase_string_hash_table (10);
221 for (i = 0; i < countof (additional_attributes); i++)
222 string_set_add (interesting_attributes, additional_attributes[i]);
223 for (i = 0; i < countof (tag_url_attributes); i++)
224 string_set_add (interesting_attributes, tag_url_attributes[i].attr_name);
227 /* Find the value of attribute named NAME in the taginfo TAG. If the
228 attribute is not present, return NULL. If ATTRIND is non-NULL, the
229 index of the attribute in TAG will be stored there. */
232 find_attr (struct taginfo *tag, const char *name, int *attrind)
235 for (i = 0; i < tag->nattrs; i++)
236 if (!strcasecmp (tag->attrs[i].name, name))
240 return tag->attrs[i].value;
/* State shared by the tag handlers while the tags of one document
   are being mapped over.  */
struct map_context {
  char *text;                   /* HTML text. */
  char *base;                   /* Base URI of the document, possibly
                                   changed through <base href=...>. */
  const char *parent_base;      /* Base of the current document. */
  const char *document_file;    /* File name of this document. */
  int nofollow;                 /* whether NOFOLLOW was specified in a
                                   <meta name=robots> tag. */

  struct urlpos *head, *tail;   /* List of URLs that is being
                                   built. */
};
258 /* Append LINK_URI to the urlpos structure that is being built.
260 LINK_URI will be merged with the current document base. TAG and
261 ATTRIND are the necessary context to store the position and
264 static struct urlpos *
265 append_one_url (const char *link_uri, int inlinep,
266 struct taginfo *tag, int attrind, struct map_context *ctx)
268 int link_has_scheme = url_has_scheme (link_uri);
269 struct urlpos *newel;
270 const char *base = ctx->base ? ctx->base : ctx->parent_base;
275 DEBUGP (("%s: no base, merge will use \"%s\".\n",
276 ctx->document_file, link_uri));
278 if (!link_has_scheme)
280 /* Base URL is unavailable, and the link does not have a
281 location attached to it -- we have to give up. Since
282 this can only happen when using `--force-html -i', print
284 logprintf (LOG_NOTQUIET,
285 _("%s: Cannot resolve incomplete link %s.\n"),
286 ctx->document_file, link_uri);
290 url = url_parse (link_uri, NULL);
293 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
294 ctx->document_file, link_uri));
300 /* Merge BASE with LINK_URI, but also make sure the result is
301 canonicalized, i.e. that "../" have been resolved.
302 (parse_url will do that for us.) */
304 char *complete_uri = uri_merge (base, link_uri);
306 DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
307 ctx->document_file, base, link_uri, complete_uri));
309 url = url_parse (complete_uri, NULL);
312 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
313 ctx->document_file, complete_uri));
314 xfree (complete_uri);
317 xfree (complete_uri);
320 DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
322 newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
323 memset (newel, 0, sizeof (*newel));
327 newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
328 newel->size = tag->attrs[attrind].value_raw_size;
329 newel->link_inline_p = inlinep;
331 /* A URL is relative if the host is not named, and the name does not
333 if (!link_has_scheme && *link_uri != '/')
334 newel->link_relative_p = 1;
335 else if (link_has_scheme)
336 newel->link_complete_p = 1;
340 ctx->tail->next = newel;
344 ctx->tail = ctx->head = newel;
349 /* All the tag_* functions are called from collect_tags_mapper, as
350 specified by KNOWN_TAGS. */
352 /* Default tag handler: collect URLs from attributes specified for
353 this tag by tag_url_attributes. */
356 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
361 for (i = 0; i < countof (tag_url_attributes); i++)
362 if (tag_url_attributes[i].tagid == tagid)
364 /* We've found the index of tag_url_attributes where the
365 attributes of our tag begin. */
369 assert (first != -1);
371 /* Loop over the "interesting" attributes of this tag. In this
372 example, it will loop over "src" and "lowsrc".
374 <img src="foo.png" lowsrc="bar.png">
376 This has to be done in the outer loop so that the attributes are
377 processed in the same order in which they appear in the page.
378 This is required when converting links. */
380 for (attrind = 0; attrind < tag->nattrs; attrind++)
382 /* Find whether TAG/ATTRIND is a combination that contains a
384 char *link = tag->attrs[attrind].value;
385 const int size = countof (tag_url_attributes);
387 /* If you're cringing at the inefficiency of the nested loops,
388 remember that they both iterate over a very small number of
389 items. The worst-case inner loop is for the IMG tag, which
390 has three attributes. */
391 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
393 if (0 == strcasecmp (tag->attrs[attrind].name,
394 tag_url_attributes[i].attr_name))
396 int flags = tag_url_attributes[i].flags;
397 append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
403 /* Handle the BASE tag, for <base href=...>. */
406 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
408 struct urlpos *base_urlpos;
410 char *newbase = find_attr (tag, "href", &attrind);
414 base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
417 base_urlpos->ignore_when_downloading = 1;
418 base_urlpos->link_base_p = 1;
422 if (ctx->parent_base)
423 ctx->base = uri_merge (ctx->parent_base, newbase);
425 ctx->base = xstrdup (newbase);
428 /* Mark the URL found in <form action=...> for conversion. */
431 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
434 char *action = find_attr (tag, "action", &attrind);
437 struct urlpos *action_urlpos = append_one_url (action, 0, tag,
440 action_urlpos->ignore_when_downloading = 1;
/* Handle the LINK tag.  It requires special handling because how its
   links will be followed in -p mode depends on the REL attribute.  */

static void
tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
{
  int attrind;
  char *href = find_attr (tag, "href", &attrind);

  /* All <link href="..."> link references are external, except those
     known not to be, such as style sheet and shortcut icon:

       <link rel="stylesheet" href="...">
       <link rel="shortcut icon" href="...">
  */
  if (href)
    {
      char *rel = find_attr (tag, "rel", NULL);
      int inlinep = (rel
                     && (0 == strcasecmp (rel, "stylesheet")
                         || 0 == strcasecmp (rel, "shortcut icon")));
      append_one_url (href, inlinep, tag, attrind, ctx);
    }
}
469 /* Handle the META tag. This requires special handling because of the
470 refresh feature and because of robot exclusion. */
473 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
475 char *name = find_attr (tag, "name", NULL);
476 char *http_equiv = find_attr (tag, "http-equiv", NULL);
478 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
480 /* Some pages use a META tag to specify that the page be
481 refreshed by a new page after a given number of seconds. The
482 general format for this is:
484 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
486 So we just need to skip past the "NUMBER; URL=" garbage to
489 struct urlpos *entry;
494 char *refresh = find_attr (tag, "content", &attrind);
498 for (p = refresh; ISDIGIT (*p); p++)
499 timeout = 10 * timeout + *p - '0';
505 if (!( TOUPPER (*p) == 'U'
506 && TOUPPER (*(p + 1)) == 'R'
507 && TOUPPER (*(p + 2)) == 'L'
514 entry = append_one_url (p, 0, tag, attrind, ctx);
517 entry->link_refresh_p = 1;
518 entry->refresh_timeout = timeout;
521 else if (name && 0 == strcasecmp (name, "robots"))
523 /* Handle stuff like:
524 <meta name="robots" content="index,nofollow"> */
525 char *content = find_attr (tag, "content", NULL);
528 if (!strcasecmp (content, "none"))
534 /* Find the next occurrence of ',' or the end of
536 char *end = strchr (content, ',');
540 end = content + strlen (content);
541 if (!strncasecmp (content, "nofollow", end - content))
549 /* Dispatch the tag handler appropriate for the tag we're mapping
550 over. See known_tags[] for definition of tag handlers. */
553 collect_tags_mapper (struct taginfo *tag, void *arg)
555 struct map_context *ctx = (struct map_context *)arg;
557 /* Find the tag in our table of tags. This must not fail because
558 map_html_tags only returns tags found in interesting_tags. */
559 struct known_tag *t = hash_table_get (interesting_tags, tag->name);
562 t->handler (t->tagid, tag, ctx);
565 /* Analyze HTML tags FILE and construct a list of URLs referenced from
566 it. It merges relative links in FILE with URL. It is aware of
567 <base href=...> and does the right thing. */
570 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
572 struct file_memory *fm;
573 struct map_context ctx;
577 fm = read_file (file);
580 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
583 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
585 ctx.text = fm->content;
586 ctx.head = ctx.tail = NULL;
588 ctx.parent_base = url ? url : opt.base_href;
589 ctx.document_file = file;
592 if (!interesting_tags)
595 /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
596 generate <a href=" foo"> instead of <a href="foo"> (Netscape
597 ignores spaces as well.) If you really mean space, use &32; or
599 flags = MHT_TRIM_VALUES;
600 if (opt.strict_comments)
601 flags |= MHT_STRICT_COMMENTS;
603 map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
604 interesting_tags, interesting_attributes);
606 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
607 if (meta_disallow_follow)
608 *meta_disallow_follow = ctx.nofollow;
610 FREE_MAYBE (ctx.base);
615 /* This doesn't really have anything to do with HTML, but it's similar
616 to get_urls_html, so we put it here. */
619 get_urls_file (const char *file)
621 struct file_memory *fm;
622 struct urlpos *head, *tail;
623 const char *text, *text_end;
626 fm = read_file (file);
629 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
632 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
636 text_end = fm->content + fm->length;
637 while (text < text_end)
641 struct urlpos *entry;
644 const char *line_beg = text;
645 const char *line_end = memchr (text, '\n', text_end - text);
652 /* Strip whitespace from the beginning and end of line. */
653 while (line_beg < line_end && ISSPACE (*line_beg))
655 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
658 if (line_beg == line_end)
661 /* The URL is in the [line_beg, line_end) region. */
663 /* We must copy the URL to a zero-terminated string, and we
664 can't use alloca because we're in a loop. *sigh*. */
665 url_text = strdupdelim (line_beg, line_end);
669 /* Merge opt.base_href with URL. */
670 char *merged = uri_merge (opt.base_href, url_text);
675 url = url_parse (url_text, &up_error_code);
678 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
679 file, url_text, url_error (up_error_code));
685 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
686 memset (entry, 0, sizeof (*entry));
701 cleanup_html_url (void)
703 FREE_MAYBE (interesting_tags);
704 FREE_MAYBE (interesting_attributes);