1 /* A simple HTML parser.
2 Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
43 static state_t global_state;
51 /* Match the TAG/ATTR pair, ignoring case, against TAGS, a list ended by an entry whose tag is NULL. */
53 idmatch (struct tag_attr *tags, const char *tag, const char *attr)
60 for (i = 0; tags[i].tag; i++)
61 if (!strcasecmp (tags[i].tag, tag) && !strcasecmp (tags[i].attr, attr))
66 /* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
67 describing URLs to follow. When a tag is encountered, extract its
68 components (as described by html_allow[] array), and return the
69 address and the length of the string. Return NULL if no URL is
72 htmlfindurl (const char *buf, int bufsize, int *size, int init)
76 /* NULL-terminated list of tags and modifiers someone would want to
77 follow -- feel free to edit to suit your needs: */
78 static struct tag_attr html_allow[] = {
82 { "body", "background" },
95 { "table", "background"},
96 { "th", "background"},
97 { "td", "background"},
98 /* Tags below this line are treated specially. */
100 { "meta", "content" },
108 DEBUGP (("Resetting a parser state.\n"));
109 memset (s, 0, sizeof (*s));
116 /* Let's look for a tag, if we are not already in one. */
121 for (; bufsize && *buf != '<'; ++buf, --bufsize);
125 for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
130 /* Find the tag end. */
131 for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
137 /* <tag=something> is illegal. Just skip it. */
147 s->tag = strdupdelim (p, buf);
156 else /* s->at_value */
158 /* Reset AT_VALUE. */
160 /* If in quotes, just skip out of them and continue living. */
164 for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
174 FREE_MAYBE (s->attr);
175 s->tag = s->attr = NULL;
179 /* Find the attributes. */
182 FREE_MAYBE (s->attr);
186 /* Skip the spaces if we have them. We don't have them at
187 places like <img alt="something"src="something-else">.
190 for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
192 if (!bufsize || *buf == '>')
196 /* This is the case of <tag = something>, which is
197 illegal. Just skip it. */
202 /* Find the attribute end. */
203 for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
205 if (!bufsize || *buf == '>')
207 /* Construct the attribute. */
208 s->attr = strdupdelim (p, buf);
209 /* Now we must skip the spaces to find '='. */
212 for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
213 if (!bufsize || *buf == '>')
216 /* If we still don't have '=', something is amiss. */
219 /* Find the beginning of attribute value by skipping the
222 for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
223 if (!bufsize || *buf == '>')
226 /* The value of an attribute can, but does not have to be
228 if (*buf == '\"' || *buf == '\'')
231 s->quote_char = *buf;
233 for (++buf, --bufsize;
234 bufsize && *buf != s->quote_char && *buf != '\n';
245 /* #### Is the following logic good?
247 Obviously no longer in quote. It might be well
248 to check whether '>' was encountered, but that
249 would be encouraging writers of invalid HTML,
250 and we don't want that, now do we? */
258 for (; bufsize && !ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize)
264 /* If '#' was found unprotected in a URI, it is probably an
265 HTML marker, or color spec. */
266 *size = (ph ? ph : buf) - p;
267 /* The URI is liable to be returned if:
269 2) its tag and attribute are found in html_allow. */
270 if (*size && idmatch (html_allow, s->tag, s->attr))
272 if (!strcasecmp (s->tag, "base") && !strcasecmp (s->attr, "href"))
274 FREE_MAYBE (s->base);
275 s->base = strdupdelim (p, buf);
277 else if (!strcasecmp (s->tag, "meta") && !strcasecmp (s->attr, "content"))
279 /* Some pages use a META tag to specify that the page
280 be refreshed by a new page after a given number of
281 seconds. We need to attempt to extract a URL for
282 the new page from the other garbage present. The
283 general format for this is:
284 <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">
286 So we just need to skip past the "0; URL="
287 garbage to get to the URL. META tags are also
288 used for specifying random things like the page
289 author's name and what editor was used to create
290 it. So we need to be careful to ignore them and
291 not assume that a URL will be present at all. */
292 for (; *size && ISDIGIT (*p); p++, *size -= 1);
295 for (p++, *size -= 1; *size && ISSPACE (*p); p++, *size -= 1) ;
296 if (!strncasecmp (p, "URL=", 4))
310 /* Exit from quote. */
311 if (*buf == s->quote_char)
316 } while (*buf != '>');
318 FREE_MAYBE (s->attr);
319 s->tag = s->attr = NULL;
325 FREE_MAYBE (s->attr);
326 FREE_MAYBE (s->base);
327 memset (s, 0, sizeof (*s)); /* just to be sure */
328 DEBUGP (("HTML parser ends here (state destroyed).\n"));
332 /* The function returns the base reference of HTML buffer id, or NULL
333 if one wasn't defined for that buffer. */
337 return global_state.base;
340 /* The function returns the pointer to the malloc-ed quoted version of
341 string s. It will recognize and quote numeric and special graphic
342 entities, as per RFC1866:
349 No other entities are recognized or replaced. */
351 html_quote_string (const char *s)
357 /* Pass through the string, and count the new size. */
358 for (i = 0; *s; s++, i++)
362 else if (*s == '<' || *s == '>')
363 i += 3; /* `lt;' and `gt;' */
365 i += 5; /* `quot;' */
367 res = (char *)xmalloc (i + 1);
369 for (p = res; *s; s++)
382 *p++ = (*s == '<' ? 'l' : 'g');
402 /* The function creates an HTML index containing references to given
403 directories and files on the appropriate host. The references are
406 ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
410 char *htclfile; /* HTML-clean file name */
414 fp = fopen (file, "wb");
417 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
425 char *tmpu, *tmpp; /* temporary, clean user and passwd */
427 tmpu = CLEANDUP (u->user);
428 tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
429 upwd = (char *)xmalloc (strlen (tmpu)
430 + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
431 sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
437 fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
438 fprintf (fp, "<html>\n<head>\n<title>");
439 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
440 fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
441 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
442 fprintf (fp, "</h1>\n<hr>\n<pre>\n");
448 /* #### Should we translate the months? */
449 static char *months[] = {
450 "Jan", "Feb", "Mar", "Apr", "May", "Jun",
451 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
453 struct tm *ptm = localtime ((time_t *)&f->tstamp);
455 fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
458 fprintf (fp, "%02d:%02d ", ptm->tm_hour, ptm->tm_min);
463 fprintf (fp, _("time unknown "));
467 fprintf (fp, _("File "));
470 fprintf (fp, _("Directory "));
473 fprintf (fp, _("Link "));
476 fprintf (fp, _("Not sure "));
479 htclfile = html_quote_string (f->name);
480 fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
483 fprintf (fp, "%s", u->dir);
486 fprintf (fp, "%s", htclfile);
487 if (f->type == FT_DIRECTORY)
489 fprintf (fp, "\">%s", htclfile);
490 if (f->type == FT_DIRECTORY)
492 fprintf (fp, "</a> ");
493 if (f->type == FT_PLAINFILE)
494 fprintf (fp, _(" (%s bytes)"), legible (f->size));
495 else if (f->type == FT_SYMLINK)
496 fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
501 fprintf (fp, "</pre>\n</body>\n</html>\n");