1 /* A simple HTML parser.
2 Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
/* File-scope parser state.  Its base member is read directly by the
   base-reference accessor further down in this file.
   NOTE(review): the parser presumably reaches the other members through
   a pointer to this object -- confirm.  */
43 static state_t global_state;
51 /* Check whether the pair (TAG, ATTR) occurs in TAGS.  TAGS is scanned
   up to the first entry whose tag member is null, and both the tag and
   the attribute are compared without regard to case (strcasecmp).
   NOTE(review): the caller only tests the result for truth; confirm the
   exact return values.  */
53 idmatch (struct tag_attr *tags, const char *tag, const char *attr)
60 for (i = 0; tags[i].tag; i++)
61 if (!strcasecmp (tags[i].tag, tag) && !strcasecmp (tags[i].attr, attr))
/* Interface notes for the parser below, taken from the code itself: the
   length of the extracted value is stored through SIZE, and a value is
   only considered when it is non-empty and its tag/attribute pair is
   listed in html_allow.
   NOTE(review): INIT presumably requests the state reset -- confirm.  */
66 /* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
67 describing URLs to follow. When a tag is encountered, extract its
68 components (as described by html_allow[] array), and return the
69 address and the length of the string. Return NULL if no URL is
72 htmlfindurl (const char *buf, int bufsize, int *size, int init)
76 /* NULL-terminated list of tags and modifiers someone would want to
77 follow -- feel free to edit to suit your needs: */
78 static struct tag_attr html_allow[] = {
84 { "body", "background" },
97 { "table", "background"},
98 { "th", "background"},
99 { "td", "background"},
100 /* Tags below this line are treated specially (see the dedicated
   base/href and meta/content branches further down). */
102 { "meta", "content" },
/* State reset: log the event and zero the whole state structure. */
110 DEBUGP (("Resetting a parser state.\n"));
111 memset (s, 0, sizeof (*s));
118 /* Let's look for a tag, if we are not already in one. */
/* Advance to the next '<', or to the end of the buffer. */
123 for (; bufsize && *buf != '<'; ++buf, --bufsize);
/* Step past the '<' and over any whitespace that follows it. */
127 for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
132 /* Find the tag end. */
133 for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
139 /* <tag=something> is illegal. Just skip it. */
/* Remember the tag name.  NOTE(review): strdupdelim presumably copies
   the text from p up to, but not including, buf -- confirm.  */
149 s->tag = strdupdelim (p, buf);
158 else /* s->at_value */
160 /* Reset AT_VALUE. */
162 /* If in quotes, just skip out of them and continue living. */
/* Scan forward to the quote character remembered in the state. */
166 for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
/* Drop the attribute string and clear both the tag and attr pointers.
   NOTE(review): only the attr release is visible next to this
   assignment; confirm that the tag string is released as well.  */
176 FREE_MAYBE (s->attr);
177 s->tag = s->attr = NULL;
181 /* Find the attributes. */
184 FREE_MAYBE (s->attr);
188 /* Skip the spaces if we have them. We don't have them at
189 places like <img alt="something"src="something-else">.
192 for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
194 if (!bufsize || *buf == '>')
198 /* This is the case of <tag = something>, which is
199 illegal. Just skip it. */
204 /* Find the attribute end. */
205 for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
207 if (!bufsize || *buf == '>')
209 /* Construct the attribute. */
210 s->attr = strdupdelim (p, buf);
211 /* Now we must skip the spaces to find '='. */
214 for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
215 if (!bufsize || *buf == '>')
218 /* If we still don't have '=', something is amiss. */
221 /* Find the beginning of attribute value by skipping the
224 for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
225 if (!bufsize || *buf == '>')
228 /* The value of an attribute can, but does not have to be
230 if (*buf == '\"' || *buf == '\'')
233 s->quote_char = *buf;
235 for (++buf, --bufsize;
236 bufsize && *buf != s->quote_char && *buf != '\n';
247 /* #### Is the following logic good?
249 Obviously no longer in quote. It might be well
250 to check whether '>' was encountered, but that
251 would be encouraging writers of invalid HTMLs,
252 and we don't want that, now do we? */
/* Scan up to the next whitespace or '>' (presumably the unquoted-value
   case -- confirm).  */
260 for (; bufsize && !ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize)
266 /* If '#' was found unprotected in a URI, it is probably an
267 HTML marker, or color spec.  When PH is set, the length stored
   through SIZE stops at PH instead of at BUF. */
268 *size = (ph ? ph : buf) - p;
269 /* The URI is liable to be returned if:
   1) its length (*SIZE) is non-zero, and
271 2) its tag and attribute are found in html_allow. */
272 if (*size && idmatch (html_allow, s->tag, s->attr))
274 if (!strcasecmp (s->tag, "base") && !strcasecmp (s->attr, "href"))
/* Replace any previously stored base with a copy of this value.  Note
   that the copy runs up to BUF, not PH.  */
276 FREE_MAYBE (s->base);
277 s->base = strdupdelim (p, buf);
279 else if (!strcasecmp (s->tag, "meta") && !strcasecmp (s->attr, "content"))
281 /* Some pages use a META tag to specify that the page
282 be refreshed by a new page after a given number of
283 seconds. We need to attempt to extract a URL for
284 the new page from the other garbage present. The
285 general format for this is:
286 <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">
288 So we just need to skip past the "0; URL="
289 garbage to get to the URL. META tags are also
290 used for specifying random things like the page
291 author's name and what editor was used to create
292 it. So we need to be careful to ignore them and
293 not assume that a URL will be present at all. */
/* Skip the leading digits (the delay in the format shown above). */
294 for (; *size && ISDIGIT (*p); p++, *size -= 1);
/* Step over one more character, then any whitespace after it.
   NOTE(review): that character is presumably the ';' -- confirm.  */
297 for (p++, *size -= 1; *size && ISSPACE (*p); p++, *size -= 1) ;
/* Check whether the rest starts with URL= (compared without regard to
   case).  */
298 if (!strncasecmp (p, "URL=", 4))
312 /* Exit from quote. */
313 if (*buf == s->quote_char)
/* The enclosing loop ends once BUF points at the closing '>'. */
318 } while (*buf != '>');
320 FREE_MAYBE (s->attr);
321 s->tag = s->attr = NULL;
/* Teardown: release the stored strings, zero the state and log it. */
327 FREE_MAYBE (s->attr);
328 FREE_MAYBE (s->base);
329 memset (s, 0, sizeof (*s)); /* just to be sure */
330 DEBUGP (("HTML parser ends here (state destroyed).\n"));
334 /* Return the base reference held in the file-scope parser state, or
335 NULL if none has been stored.  The pointer is handed out as is, not
   copied. */
339 return global_state.base;
342 /* The function returns a pointer to a newly allocated (xmalloc-ed)
343 quoted version of string S. It will recognize and quote numeric and
344 special graphic entities, as per RFC1866:
   (the code visibly expands `<' and `>' to the lt and gt entities and
   reserves room for the quot entity)
351 No other entities are recognized or replaced.
   NOTE(review): the caller presumably owns the returned buffer --
   confirm. */
353 html_quote_string (const char *s)
359 /* First pass over the string: compute the size of the quoted result.
   Every character counts once through the loop increment; the extra
   amounts below cover the rest of each entity. */
360 for (i = 0; *s; s++, i++)
364 else if (*s == '<' || *s == '>')
365 i += 3; /* `lt;' and `gt;' */
367 i += 5; /* `quot;' */
/* Room for the counted characters plus one terminating byte. */
369 res = (char *)xmalloc (i + 1);
/* Second pass: write the quoted text through P, starting at RES. */
371 for (p = res; *s; s++)
/* Entity name letter: l for `<', g for `>'. */
384 *p++ = (*s == '<' ? 'l' : 'g');
/* Interface notes taken from the code below: the index is written to
   FILE, which is opened with fopen in wb mode.  From U the code reads
   user, passwd, host, port and dir; from F it reads tstamp, type, name,
   size and linkto.  */
404 /* The function creates an HTML index containing references to given
405 directories and files on the appropriate host. The references are
408 ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
412 char *htclfile; /* HTML-clean file name */
/* Create (or truncate) the output file; binary mode. */
416 fp = fopen (file, "wb");
/* Failure report: file name followed by the system error text. */
419 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
427 char *tmpu, *tmpp; /* temporary, clean user and passwd */
/* Build the user[:passwd]@ prefix used in the generated links.  The
   password part, with its colon, is included only when a password is
   set.  */
429 tmpu = CLEANDUP (u->user);
430 tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
/* Size: user name, optional colon and password, plus 2 bytes for the
   trailing @ and the terminator.  */
431 upwd = (char *)xmalloc (strlen (tmpu)
432 + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
433 sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
/* Document prologue.  The same translated heading is written twice,
   once as the title and once as the h1 element.  */
439 fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
440 fprintf (fp, "<html>\n<head>\n<title>");
441 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
442 fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
443 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
444 fprintf (fp, "</h1>\n<hr>\n<pre>\n");
450 /* #### Should we translate the months? */
451 static char *months[] = {
452 "Jan", "Feb", "Mar", "Apr", "May", "Jun",
453 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
/* NOTE(review): tstamp is passed to localtime through a time_t * cast;
   this assumes the field has the same representation as time_t --
   confirm against its declaration.  */
455 struct tm *ptm = localtime ((time_t *)&f->tstamp);
/* tm_year counts years from 1900, hence the offset. */
457 fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
460 fprintf (fp, "%02d:%02d ", ptm->tm_hour, ptm->tm_min);
/* Placeholder written in place of the date and time columns
   (presumably when no usable timestamp is available -- confirm).  */
465 fprintf (fp, _("time unknown "));
/* Entry type label; one of the four strings below is written,
   presumably selected by f->type -- confirm.  */
469 fprintf (fp, _("File "));
472 fprintf (fp, _("Directory "));
475 fprintf (fp, _("Link "));
478 fprintf (fp, _("Not sure "));
/* Quote the entry name for use inside the markup. */
481 htclfile = html_quote_string (f->name);
/* NOTE(review): u->port is printed with %hu here but with %d in the
   heading above; confirm which one matches its declared type.  */
482 fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
485 fprintf (fp, "%s", u->dir);
488 fprintf (fp, "%s", htclfile);
489 if (f->type == FT_DIRECTORY)
491 fprintf (fp, "\">%s", htclfile);
492 if (f->type == FT_DIRECTORY)
494 fprintf (fp, "</a> ");
/* Trailing detail: size for plain files, target for symbolic links. */
495 if (f->type == FT_PLAINFILE)
496 fprintf (fp, _(" (%s bytes)"), legible (f->size));
497 else if (f->type == FT_SYMLINK)
498 fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
/* Close the elements opened in the prologue. */
503 fprintf (fp, "</pre>\n</body>\n</html>\n");