1 /* A simple HTML parser.
2 Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
/* Parser state kept at file scope.  Its `base' member is what the
   base-reference accessor later in this file hands back to callers.  */
43 static state_t global_state;
51 /* Decide whether the TAG/ATTR pair should be followed.  The pair must
   appear in TAGS, which is scanned until an entry whose `tag' member
   is NULL; both names are compared case-insensitively.  A pair that is
   listed is then filtered through the --ignore-tags and --follow-tags
   options, as described by the comments below. */
53 idmatch (struct tag_attr *tags, const char *tag, const char *attr)
/* TAG and ATTR are tested for NULL before either one is handed to
   strcasecmp. */
57 if (tag == NULL || attr == NULL)
60 for (i = 0; tags[i].tag; i++)
61 /* Loop through all the tags wget ever cares about. */
62 if (!strcasecmp (tags[i].tag, tag) && !strcasecmp (tags[i].attr, attr))
63 /* The tag and attribute matched one of the ones wget cares about. */
66 /* --ignore-tags was specified. Do not match these specific tags.
67 --ignore-tags takes precedence over --follow-tags, so we process
68 --ignore first and fall through if there's no match. */
/* Both option lists are walked until a NULL entry and are compared
   against TAG only; the attribute plays no part in this filtering. */
69 for (j = 0; opt.ignore_tags[j] != NULL; j++)
70 /* Loop through all the tags this user doesn't care about. */
/* NOTE(review): EQ is defined outside this block; presumably 0, the
   value strcasecmp yields for equal strings -- confirm.  The table
   lookup above spells the same test as !strcasecmp (...). */
71 if (strcasecmp(opt.ignore_tags[j], tag) == EQ)
75 /* --follow-tags was specified. Only match these specific tags, so
76 return FALSE if we don't match one of them. */
78 for (j = 0; opt.follow_tags[j] != NULL; j++)
79 /* Loop through all the tags this user cares about. */
80 if (strcasecmp(opt.follow_tags[j], tag) == EQ)
83 return FALSE; /* wasn't one of the explicitly desired tags */
86 /* If we get to here, --follow-tags isn't being used, and --ignore-tags,
87 if specified, didn't include this tag, so it's okay to follow. */
91 return FALSE; /* not one of the tag/attribute pairs wget ever cares about */
/* Notes on the arguments of htmlfindurl, as far as its code shows:
   - BUF and BUFSIZE are walked together: the scanning loops advance
     BUF and decrement BUFSIZE in step, and stop once BUFSIZE is zero.
   - *SIZE receives the length of the attribute value that was found,
     cut short at an unprotected '#'.
   - NOTE(review): INIT presumably selects the state reset that logs
     "Resetting a parser state." -- confirm against the callers. */
95 /* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
96 describing URLs to follow. When a tag is encountered, extract its
97 components (as described by html_allow[] array), and return the
98 address and the length of the string. Return NULL if no URL is
101 htmlfindurl (const char *buf, int bufsize, int *size, int init)
105 /* NULL-terminated list of tags and modifiers someone would want to
106 follow -- feel free to edit to suit your needs: */
107 static struct tag_attr html_allow[] = {
113 { "body", "background" },
117 { "overlay", "src" },
118 { "applet", "code" },
121 { "bgsound", "src" },
126 { "table", "background"},
127 { "th", "background"},
128 { "td", "background"},
129 /* Tags below this line are treated specially. */
/* The special handling found further down in this function is for
   base/href, whose value is stored as the document base, and for
   meta/content, whose value is searched for a "URL=" target. */
131 { "meta", "content" },
139 DEBUGP (("Resetting a parser state.\n"));
140 memset (s, 0, sizeof (*s));
147 /* Let's look for a tag, if we are not already in one. */
/* Empty-bodied loop: move BUF to the next '<', or to the end of the
   buffer if there is none. */
152 for (; bufsize && *buf != '<'; ++buf, --bufsize);
156 for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
161 /* Find the tag end. */
162 for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
168 /* <tag=something> is illegal. Just skip it. */
178 s->tag = strdupdelim (p, buf);
187 else /* s->at_value */
189 /* Reset AT_VALUE. */
191 /* If in quotes, just skip out of them and continue living. */
/* Empty-bodied loop: run forward to the quote character remembered
   in the state, or to the end of the buffer. */
195 for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
205 FREE_MAYBE (s->attr);
206 s->tag = s->attr = NULL;
210 /* Find the attributes. */
213 FREE_MAYBE (s->attr);
217 /* Skip the spaces if we have them. We don't have them at
218 places like <img alt="something"src="something-else">.
221 for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
223 if (!bufsize || *buf == '>')
227 /* This is the case of <tag = something>, which is
228 illegal. Just skip it. */
233 /* Find the attribute end. */
234 for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
236 if (!bufsize || *buf == '>')
238 /* Construct the attribute. */
239 s->attr = strdupdelim (p, buf);
240 /* Now we must skip the spaces to find '='. */
243 for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
244 if (!bufsize || *buf == '>')
247 /* If we still don't have '=', something is amiss. */
250 /* Find the beginning of attribute value by skipping the
253 for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
254 if (!bufsize || *buf == '>')
257 /* The value of an attribute can, but does not have to be
259 if (*buf == '\"' || *buf == '\'')
262 s->quote_char = *buf;
264 for (++buf, --bufsize;
265 bufsize && *buf != s->quote_char && *buf != '\n';
276 /* #### Is the following logic good?
278 Obviously no longer in quote. It might be well
279 to check whether '>' was encountered, but that
280 would be encouraging writers of invalid HTMLs,
281 and we don't want that, now do we? */
289 for (; bufsize && !ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize)
295 /* If '#' was found unprotected in a URI, it is probably an
296 HTML marker, or color spec. */
297 *size = (ph ? ph : buf) - p;
298 /* The URI is liable to be returned if:
300 2) its tag and attribute are found in html_allow. */
301 if (*size && idmatch (html_allow, s->tag, s->attr))
303 if (!strcasecmp (s->tag, "base") && !strcasecmp (s->attr, "href"))
/* The previous base, if any, is released before the new one is
   stored.  The copy runs from P up to BUF, so unlike *SIZE above it
   is not cut at PH and keeps any '#' part of the value. */
305 FREE_MAYBE (s->base);
306 s->base = strdupdelim (p, buf);
308 else if (!strcasecmp (s->tag, "meta") && !strcasecmp (s->attr, "content"))
310 /* Some pages use a META tag to specify that the page
311 be refreshed by a new page after a given number of
312 seconds. We need to attempt to extract an URL for
313 the new page from the other garbage present. The
314 general format for this is:
315 <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">
317 So we just need to skip past the "0; URL="
318 garbage to get to the URL. META tags are also
319 used for specifying random things like the page
320 author's name and what editor was used to create
321 it. So we need to be careful to ignore them and
322 not assume that an URL will be present at all. */
/* Step over the leading digits (the delay in the example above),
   shrinking *SIZE by one for every character consumed. */
323 for (; *size && ISDIGIT (*p); p++, *size -= 1);
/* One more character is stepped over unconditionally by the loop
   initialiser (presumably the ';' -- confirm), then any white space
   that follows it. */
326 for (p++, *size -= 1; *size && ISSPACE (*p); p++, *size -= 1) ;
/* NOTE(review): this compares up to 4 bytes at P without consulting
   *SIZE; whether the buffer is guaranteed to reach that far is not
   established here -- confirm. */
327 if (!strncasecmp (p, "URL=", 4))
341 /* Exit from quote. */
342 if (*buf == s->quote_char)
347 } while (*buf != '>');
349 FREE_MAYBE (s->attr);
350 s->tag = s->attr = NULL;
/* Final cleanup: the strings owned by the state are released and the
   whole structure is zeroed, so no stale pointer is left in it. */
356 FREE_MAYBE (s->attr);
357 FREE_MAYBE (s->base);
358 memset (s, 0, sizeof (*s)); /* just to be sure */
359 DEBUGP (("HTML parser ends here (state destroyed).\n"));
363 /* Return the base reference held in the file-scope parser state, or
364 NULL if one wasn't defined.  The value is read from the single
   shared state object (no buffer id takes part in the lookup), and
   the pointer handed back is the stored member itself, not a copy. */
368 return global_state.base;
371 /* The function returns the pointer to the malloc-ed quoted version of
372 string s. It will recognize and quote numeric and special graphic
373 entities, as per RFC1866:
380 No other entities are recognized or replaced. */
382 html_quote_string (const char *s)
388 /* Pass through the string, and count the new size. */
/* The loop header's own i++ accounts for one output character per
   input character; the additions below cover only the extra
   characters that an entity needs beyond that first one. */
389 for (i = 0; *s; s++, i++)
393 else if (*s == '<' || *s == '>')
394 i += 3; /* `lt;' and `gt;' -- the `&' was counted by the loop's i++ */
396 i += 5; /* `quot;' -- the `&' was counted by the loop's i++ */
/* I is now the length of the quoted text; the extra byte leaves room
   for a terminating NUL. */
398 res = (char *)xmalloc (i + 1);
/* Second pass: copy into RES through P, expanding entities.  The
   counting loop left S at the terminator, so S has to be rewound to
   the start of the string before this loop (presumably done just
   above -- confirm). */
400 for (p = res; *s; s++)
/* First letter of the entity name: 'l' for "lt", 'g' for "gt". */
413 *p++ = (*s == '<' ? 'l' : 'g');
433 /* The function creates an HTML index containing references to given
434 directories and files on the appropriate host. The references are
437 ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
441 char *htclfile; /* HTML-clean file name */
/* The index is written to FILE, opened for writing in binary mode. */
445 fp = fopen (file, "wb");
/* Report the file name together with the system error text. */
448 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
456 char *tmpu, *tmpp; /* temporary, clean user and passwd */
458 tmpu = CLEANDUP (u->user);
459 tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
/* UPWD becomes "user@" or "user:passwd@".  The size is the user
   name, plus ':' and the password when there is one, plus 2 for the
   trailing '@' and the terminating NUL -- exactly what the sprintf
   below writes. */
460 upwd = (char *)xmalloc (strlen (tmpu)
461 + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
462 sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
468 fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
469 fprintf (fp, "<html>\n<head>\n<title>");
/* The same heading is written twice, once as the title and once as
   the <h1>.  NOTE(review): u->dir and u->host are written into the
   page as they are; only the file name is passed through
   html_quote_string further down -- confirm that this is intended. */
470 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
471 fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
472 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
473 fprintf (fp, "</h1>\n<hr>\n<pre>\n");
479 /* #### Should we translate the months? */
480 static char *months[] = {
481 "Jan", "Feb", "Mar", "Apr", "May", "Jun",
482 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
/* NOTE(review): the cast treats f->tstamp as if it were stored as a
   time_t; its declared type is not visible here -- confirm.  PTM is
   dereferenced below; localtime may return a null pointer. */
484 struct tm *ptm = localtime ((time_t *)&f->tstamp);
/* tm_year counts from 1900 and tm_mon from 0, hence the + 1900 and
   the direct index into MONTHS. */
486 fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
489 fprintf (fp, "%02d:%02d ", ptm->tm_hour, ptm->tm_min);
/* In this call and the four that follow it, the translated string
   itself is the format string, so a translation must not contain
   '%'. */
494 fprintf (fp, _("time unknown "));
498 fprintf (fp, _("File "));
501 fprintf (fp, _("Directory "));
504 fprintf (fp, _("Link "));
507 fprintf (fp, _("Not sure "));
/* HTCLFILE is the quoted form of the entry name; it is used both in
   the href and as the visible link text. */
510 htclfile = html_quote_string (f->name);
/* NOTE(review): the port is printed with %hu here but with %d in the
   headings above; the declared type of u->port is not visible here
   -- confirm which conversion matches it. */
511 fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
514 fprintf (fp, "%s", u->dir);
517 fprintf (fp, "%s", htclfile);
518 if (f->type == FT_DIRECTORY)
520 fprintf (fp, "\">%s", htclfile);
521 if (f->type == FT_DIRECTORY)
523 fprintf (fp, "</a> ");
/* Trailer after the link: a size for plain files, a target for
   symbolic links. */
524 if (f->type == FT_PLAINFILE)
525 fprintf (fp, _(" (%s bytes)"), legible (f->size));
526 else if (f->type == FT_SYMLINK)
/* A link without a recorded target is shown as "(nil)". */
527 fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
532 fprintf (fp, "</pre>\n</body>\n</html>\n");