1 /* A simple HTML parser.
2 Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
43 static state_t global_state;
51 /* Match the TAG/ATTR pair, ignoring case, against TAGS, a list terminated by an entry whose tag is NULL. */
53 idmatch (struct tag_attr *tags, const char *tag, const char *attr)
57 if (tag == NULL || attr == NULL)
60 for (i = 0; tags[i].tag; i++)
61 /* Loop through all the tags wget ever cares about. */
62 if (!strcasecmp (tags[i].tag, tag) && !strcasecmp (tags[i].attr, attr))
63 /* The tag and attribute matched one of the ones wget cares about. */
66 /* --ignore-tags was specified. Do not match these specific tags.
67 --ignore-tags takes precedence over --follow-tags, so we process
68 --ignore first and fall through if there's no match. */
69 for (j = 0; opt.ignore_tags[j] != NULL; j++)
70 /* Loop through all the tags this user doesn't care about. */
71 if (strcasecmp(opt.ignore_tags[j], tag) == EQ)
75 /* --follow-tags was specified. Only match these specific tags, so
76 return FALSE if we don't match one of them. */
78 for (j = 0; opt.follow_tags[j] != NULL; j++)
79 /* Loop through all the tags this user cares about. */
80 if (strcasecmp(opt.follow_tags[j], tag) == EQ)
83 return FALSE; /* wasn't one of the explicitly desired tags */
86 /* If we get to here, --follow-tags isn't being used, and --ignore-tags,
87 if specified, didn't include this tag, so it's okay to follow. */
91 return FALSE; /* not one of the tag/attribute pairs wget ever cares about */
95 /* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
96 describing URLs to follow. When a tag is encountered, extract its
97 components (as described by html_allow[] array), and return the
98 address and the length of the string. Return NULL if no URL is found. */
101 htmlfindurl (const char *buf, int bufsize, int *size, int init,
102 int dash_p_leaf_HTML)
105 state_t *s = &global_state;
107 /* NULL-terminated list of tags and modifiers someone would want to
108 follow -- feel free to edit to suit your needs: */
109 static struct tag_attr html_allow[] = {
113 { "body", "background" },
117 { "overlay", "src" },
118 { "applet", "code" },
121 { "bgsound", "src" },
125 { "table", "background"},
126 { "th", "background"},
127 { "td", "background"},
128 /* Tags below this line are treated specially. */
134 { "meta", "content" },
140 DEBUGP (("Resetting a parser state.\n"));
141 memset (s, 0, sizeof (*s));
146 const char* link_href = NULL;
147 const char* link_rel = NULL;
148 int link_href_saved_size;
152 /* Let's look for a tag, if we are not already in one. */
157 for (; bufsize && *buf != '<'; ++buf, --bufsize);
161 for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
166 /* Find the tag end. */
167 for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
173 /* <tag=something> is illegal. Just skip it. */
183 s->tag = strdupdelim (p, buf);
192 else /* s->at_value */
194 /* Reset AT_VALUE. */
196 /* If in quotes, just skip out of them and continue living. */
200 for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
210 FREE_MAYBE (s->attr);
211 s->tag = s->attr = NULL;
215 /* Find the attributes. */
218 FREE_MAYBE (s->attr);
222 /* Skip the spaces if we have them. We don't have them at
223 places like <img alt="something"src="something-else">.
226 for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
228 if (!bufsize || *buf == '>')
232 /* This is the case of <tag = something>, which is
233 illegal. Just skip it. */
238 /* Find the attribute end. */
239 for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
241 if (!bufsize || *buf == '>')
243 /* Construct the attribute. */
244 s->attr = strdupdelim (p, buf);
245 /* Now we must skip the spaces to find '='. */
248 for (; bufsize && ISSPACE (*buf) && *buf != '>';
250 if (!bufsize || *buf == '>')
253 /* If we still don't have '=', something is amiss. */
256 /* Find the beginning of attribute value by skipping the
259 for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
260 if (!bufsize || *buf == '>')
263 /* The value of an attribute can, but does not have to be
265 if (*buf == '\"' || *buf == '\'')
268 s->quote_char = *buf;
270 for (++buf, --bufsize;
271 bufsize && *buf != s->quote_char && *buf != '\n';
273 if (ph && *buf == '#')
282 /* #### Is the following logic good?
284 Obviously no longer in quote. It might be well
285 to check whether '>' was encountered, but that
286 would be encouraging writers of invalid HTMLs,
287 and we don't want that, now do we? */
295 for (; bufsize && !ISSPACE (*buf) && *buf != '>';
297 if (ph && *buf == '#')
302 /* If '#' was found unprotected in a URI, it is probably an
303 HTML marker, or color spec. */
304 *size = (ph ? ph : buf) - p;
305 /* The URI is liable to be returned if:
307 2) its tag and attribute are found in html_allow. */
308 if (*size && idmatch (html_allow, s->tag, s->attr))
310 if (strcasecmp(s->tag, "a") == EQ ||
311 strcasecmp(s->tag, "area") == EQ)
313 /* Only follow these if we're not at a -p leaf node, as they
314 always link to external documents. */
315 if (!dash_p_leaf_HTML)
321 else if (!strcasecmp (s->tag, "base") &&
322 !strcasecmp (s->attr, "href"))
324 FREE_MAYBE (s->base);
325 s->base = strdupdelim (p, buf);
327 else if (strcasecmp(s->tag, "link") == EQ)
329 if (strcasecmp(s->attr, "href") == EQ)
332 link_href_saved_size = *size; /* for restoration below */
334 else if (strcasecmp(s->attr, "rel") == EQ)
337 if (link_href != NULL && link_rel != NULL)
338 /* Okay, we've now seen this <LINK> tag's HREF and REL
339 attributes (they may be in either order), so it's now
340 possible to decide if we want to traverse it. */
341 if (!dash_p_leaf_HTML ||
342 strncasecmp(link_rel, "stylesheet",
343 sizeof("stylesheet") - 1) == EQ)
344 /* In the normal case, all <LINK> tags are fair game.
346 In the special case of when -p is active, however, and
347 we're at a leaf node (relative to the -l max. depth) in
348 the HTML document tree, the only <LINK> tag we'll
349 follow is a <LINK REL="stylesheet">, as it's necessary
350 for displaying this document properly. We won't follow
351 other <LINK> tags, like <LINK REL="home">, for
352 instance, as they refer to external documents.
354 Note that the above strncasecmp() will incorrectly
355 consider something like '<LINK REL="stylesheet.old"' as
356 equivalent to '<LINK REL="stylesheet"'. Not really
357 worth the trouble to explicitly check for such cases --
358 if time is spent, it should be spent ripping out wget's
359 somewhat kludgy HTML parser and hooking in a real,
360 componentized one. */
362 /* When we return, the 'size' IN/OUT parameter
363 determines where in the buffer the end of the current
364 attribute value is. If REL came after HREF in this
365 <LINK> tag, size is currently set to the size for
366 REL's value -- set it to what it was when we were
367 looking at HREF's value. */
368 *size = link_href_saved_size;
374 else if (!strcasecmp (s->tag, "meta") &&
375 !strcasecmp (s->attr, "content"))
377 /* Some pages use a META tag to specify that the page
378 be refreshed by a new page after a given number of
379 seconds. We need to attempt to extract a URL for
380 the new page from the other garbage present. The
381 general format for this is:
382 <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">
384 So we just need to skip past the "0; URL="
385 garbage to get to the URL. META tags are also
386 used for specifying random things like the page
387 author's name and what editor was used to create
388 it. So we need to be careful to ignore them and
389 not assume that a URL will be present at all. */
390 for (; *size && ISDIGIT (*p); p++, *size -= 1);
393 for (p++, *size -= 1;
394 *size && ISSPACE (*p);
396 if (!strncasecmp (p, "URL=", 4))
410 /* Exit from quote. */
411 if (*buf == s->quote_char)
416 } while (*buf != '>');
418 FREE_MAYBE (s->attr);
419 s->tag = s->attr = NULL;
425 FREE_MAYBE (s->attr);
426 FREE_MAYBE (s->base);
427 memset (s, 0, sizeof (*s)); /* just to be sure */
428 DEBUGP (("HTML parser ends here (state destroyed).\n"));
432 /* Return the base reference saved in the global parser state from a
433 <base href=...> tag, or NULL if no such tag has been seen. */
437 return global_state.base;
440 /* The function returns the pointer to the malloc-ed quoted version of
441 string s. It will recognize and quote numeric and special graphic
442 entities, as per RFC1866:
449 No other entities are recognized or replaced. */
451 html_quote_string (const char *s)
457 /* Pass through the string, and count the new size. */
458 for (i = 0; *s; s++, i++)
462 else if (*s == '<' || *s == '>')
463 i += 3; /* `lt;' and `gt;' */
465 i += 5; /* `quot;' */
467 res = (char *)xmalloc (i + 1);
469 for (p = res; *s; s++)
482 *p++ = (*s == '<' ? 'l' : 'g');
502 /* The function creates an HTML index containing references to given
503 directories and files on the appropriate host. The references are
506 ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
510 char *htclfile; /* HTML-clean file name */
514 fp = fopen (file, "wb");
517 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
525 char *tmpu, *tmpp; /* temporary, clean user and passwd */
527 tmpu = CLEANDUP (u->user);
528 tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
529 upwd = (char *)xmalloc (strlen (tmpu)
530 + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
531 sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
537 fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
538 fprintf (fp, "<html>\n<head>\n<title>");
539 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
540 fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
541 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
542 fprintf (fp, "</h1>\n<hr>\n<pre>\n");
548 /* #### Should we translate the months? */
549 static char *months[] = {
550 "Jan", "Feb", "Mar", "Apr", "May", "Jun",
551 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
553 struct tm *ptm = localtime ((time_t *)&f->tstamp);
555 fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
558 fprintf (fp, "%02d:%02d ", ptm->tm_hour, ptm->tm_min);
563 fprintf (fp, _("time unknown "));
567 fprintf (fp, _("File "));
570 fprintf (fp, _("Directory "));
573 fprintf (fp, _("Link "));
576 fprintf (fp, _("Not sure "));
579 htclfile = html_quote_string (f->name);
580 fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
583 fprintf (fp, "%s", u->dir);
586 fprintf (fp, "%s", htclfile);
587 if (f->type == FT_DIRECTORY)
589 fprintf (fp, "\">%s", htclfile);
590 if (f->type == FT_DIRECTORY)
592 fprintf (fp, "</a> ");
593 if (f->type == FT_PLAINFILE)
594 fprintf (fp, _(" (%s bytes)"), legible (f->size));
595 else if (f->type == FT_SYMLINK)
596 fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
601 fprintf (fp, "</pre>\n</body>\n</html>\n");