1 /* A simple HTML parser.
2 Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
/* Parser state kept between calls.  htmlfindurl operates on it through
   a pointer to this object, and the stored base reference is read back
   from its `base' member. */
43 static state_t global_state;
51 /* Check the TAG/ATTR pair against TAGS, a list that ends at the first
   entry whose `tag' member is NULL.  Both names are compared with
   strcasecmp, i.e. without regard to case.  A NULL TAG or ATTR is
   tested for before the list is walked.  For a pair that is present in
   TAGS, opt.ignore_tags and opt.follow_tags are consulted as described
   in the comments below; a pair that is absent from TAGS yields FALSE. */
53 idmatch (struct tag_attr *tags, const char *tag, const char *attr)
57 if (tag == NULL || attr == NULL)
60 for (i = 0; tags[i].tag; i++)
61 /* Loop through all the tags wget ever cares about. */
62 if (!strcasecmp (tags[i].tag, tag) && !strcasecmp (tags[i].attr, attr))
63 /* The tag and attribute matched one of the ones wget cares about. */
66 /* --ignore-tags was specified. Do not match these specific tags.
67 --ignore-tags takes precedence over --follow-tags, so we process
68 --ignore first and fall through if there's no match. */
69 for (j = 0; opt.ignore_tags[j] != NULL; j++)
70 /* Loop through all the tags this user doesn't care about. */
71 if (strcasecmp(opt.ignore_tags[j], tag) == EQ)
75 /* --follow-tags was specified. Only match these specific tags, so
76 return FALSE if we don't match one of them. */
78 for (j = 0; opt.follow_tags[j] != NULL; j++)
79 /* Loop through all the tags this user cares about. */
80 if (strcasecmp(opt.follow_tags[j], tag) == EQ)
83 return FALSE; /* wasn't one of the explicitly desired tags */
86 /* If we get to here, --follow-tags isn't being used, and --ignore-tags,
87 if specified, didn't include this tag, so it's okay to follow. */
91 return FALSE; /* not one of the tag/attribute pairs wget ever cares about */
94 /* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
95 describing URLs to follow. When a tag is encountered, extract its
96 components (as described by html_allow[] array), and return the
97 address and the length of the string. Return NULL if no URL is
100 htmlfindurl (const char *buf, int bufsize, int *size, int init,
101 int dash_p_leaf_HTML)
104 state_t *s = &global_state;
106 /* NULL-terminated list of tags and modifiers someone would want to
107 follow -- feel free to edit to suit your needs: */
108 static struct tag_attr html_allow[] = {
112 { "body", "background" },
116 { "overlay", "src" },
117 { "applet", "code" },
120 { "bgsound", "src" },
124 { "table", "background"},
125 { "th", "background"},
126 { "td", "background"},
127 /* Tags below this line are treated specially. */
133 { "meta", "content" },
/* Start from a clean slate: every byte of the shared state is zeroed. */
139 DEBUGP (("Resetting a parser state.\n"));
140 memset (s, 0, sizeof (*s));
/* Per-call bookkeeping for <LINK>: the HREF and REL values seen so far. */
145 const char* link_href = NULL;
146 const char* link_rel = NULL;
147 int link_href_saved_size = 0; /* initialized only to silence a compiler warning */
151 /* Let's look for a tag, if we are not already in one. */
/* Empty-bodied loop: stops on '<' or when the buffer is exhausted. */
156 for (; bufsize && *buf != '<'; ++buf, --bufsize);
160 for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
165 /* Find the tag end. */
166 for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
172 /* <tag=something> is illegal. Just skip it. */
/* The tag name is the text between P and BUF. */
182 s->tag = strdupdelim (p, buf);
191 else /* s->at_value */
193 /* Reset AT_VALUE. */
195 /* If in quotes, just skip out of them and continue living. */
/* Empty-bodied loop: stops on the remembered quote character or at
   the end of the buffer. */
199 for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
209 FREE_MAYBE (s->attr);
210 s->tag = s->attr = NULL;
214 /* Find the attributes. */
217 FREE_MAYBE (s->attr);
221 /* Skip the spaces if we have them. We don't have them at
222 places like <img alt="something"src="something-else">.
225 for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
227 if (!bufsize || *buf == '>')
231 /* This is the case of <tag = something>, which is
232 illegal. Just skip it. */
237 /* Find the attribute end. */
238 for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
240 if (!bufsize || *buf == '>')
242 /* Construct the attribute. */
243 s->attr = strdupdelim (p, buf);
244 /* Now we must skip the spaces to find '='. */
247 for (; bufsize && ISSPACE (*buf) && *buf != '>';
249 if (!bufsize || *buf == '>')
252 /* If we still don't have '=', something is amiss. */
255 /* Find the beginning of attribute value by skipping the
258 for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
259 if (!bufsize || *buf == '>')
262 /* The value of an attribute can, but does not have to be
264 if (*buf == '\"' || *buf == '\'')
267 s->quote_char = *buf;
269 for (++buf, --bufsize;
270 bufsize && *buf != s->quote_char && *buf != '\n';
272 if (!ph && *buf == '#' && *(buf - 1) != '&')
281 /* #### Is the following logic good?
283 Obviously no longer in quote. It might be well
284 to check whether '>' was encountered, but that
285 would be encouraging writers of invalid HTMLs,
286 and we don't want that, now do we? */
/* Unquoted value: it runs up to the next whitespace or '>'. */
294 for (; bufsize && !ISSPACE (*buf) && *buf != '>';
/* Only the first '#' is of interest (PH still unset), and one that
   directly follows '&' is left alone. */
296 if (!ph && *buf == '#' && *(buf - 1) != '&')
301 /* If '#' was found unprotected in a URI, it is probably an
302 HTML marker, or color spec. */
/* When PH is set the reported length stops at PH rather than at BUF. */
303 *size = (ph ? ph : buf) - p;
304 /* The URI is liable to be returned if:
306 2) its tag and attribute are found in html_allow. */
307 if (*size && idmatch (html_allow, s->tag, s->attr))
309 if (strcasecmp(s->tag, "a") == EQ ||
310 strcasecmp(s->tag, "area") == EQ)
312 /* Only follow these if we're not at a -p leaf node, as they
313 always link to external documents. */
314 if (!dash_p_leaf_HTML)
320 else if (!strcasecmp (s->tag, "base") &&
321 !strcasecmp (s->attr, "href"))
/* Drop the previously stored base, if any, and keep the value that
   strdupdelim builds from P and BUF in the parser state. */
323 FREE_MAYBE (s->base);
324 s->base = strdupdelim (p, buf);
326 else if (strcasecmp(s->tag, "link") == EQ)
328 if (strcasecmp(s->attr, "href") == EQ)
331 link_href_saved_size = *size; /* for restoration below */
333 else if (strcasecmp(s->attr, "rel") == EQ)
336 if (link_href != NULL && link_rel != NULL)
337 /* Okay, we've now seen this <LINK> tag's HREF and REL
338 attributes (they may be in either order), so it's now
339 possible to decide if we want to traverse it. */
340 if (!dash_p_leaf_HTML ||
341 strncasecmp(link_rel, "stylesheet",
342 sizeof("stylesheet") - 1) == EQ)
343 /* In the normal case, all <LINK> tags are fair game.
345 In the special case of when -p is active, however, and
346 we're at a leaf node (relative to the -l max. depth) in
347 the HTML document tree, the only <LINK> tag we'll
348 follow is a <LINK REL="stylesheet">, as it's necessary
349 for displaying this document properly. We won't follow
350 other <LINK> tags, like <LINK REL="home">, for
351 instance, as they refer to external documents.
353 Note that the above strncasecmp() will incorrectly
354 consider something like '<LINK REL="stylesheet.old"' as
355 equivalent to '<LINK REL="stylesheet"'. Not really
356 worth the trouble to explicitly check for such cases --
357 if time is spent, it should be spent ripping out wget's
358 somewhat kludgy HTML parser and hooking in a real,
359 componentized one. */
361 /* When we return, the 'size' IN/OUT parameter
362 determines where in the buffer the end of the current
363 attribute value is. If REL came after HREF in this
364 <LINK> tag, size is currently set to the size for
365 REL's value -- set it to what it was when we were
366 looking at HREF's value. */
367 *size = link_href_saved_size;
373 else if (!strcasecmp (s->tag, "meta") &&
374 !strcasecmp (s->attr, "content"))
376 /* Some pages use a META tag to specify that the page
377 be refreshed by a new page after a given number of
378 seconds. We need to attempt to extract a URL for
379 the new page from the other garbage present. The
380 general format for this is:
381 <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">
383 So we just need to skip past the "0; URL="
384 garbage to get to the URL. META tags are also
385 used for specifying random things like the page
386 author's name and what editor was used to create
387 it. So we need to be careful to ignore them and
388 not assume that a URL will be present at all. */
/* Step over the leading digits, shrinking *SIZE in lock-step with P. */
389 for (; *size && ISDIGIT (*p); p++, *size -= 1);
/* Advance one character, then skip any whitespace that follows. */
392 for (p++, *size -= 1;
393 *size && ISSPACE (*p);
/* Case-insensitive test for the four-character "URL=" prefix. */
395 if (!strncasecmp (p, "URL=", 4))
409 /* Exit from quote. */
410 if (*buf == s->quote_char)
415 } while (*buf != '>');
417 FREE_MAYBE (s->attr);
418 s->tag = s->attr = NULL;
/* Tear-down: release what the state still holds, then wipe it. */
424 FREE_MAYBE (s->attr);
425 FREE_MAYBE (s->base);
426 memset (s, 0, sizeof (*s)); /* just to be sure */
427 DEBUGP (("HTML parser ends here (state destroyed).\n"));
431 /* Return the base reference kept in the shared parser state
432 (global_state.base), or NULL if one wasn't defined for that buffer.
   htmlfindurl stores the value when it meets a <base href=...>
   attribute and releases it itself with FREE_MAYBE, so the caller
   presumably must not free the returned pointer -- TODO confirm
   against the callers. */
436 return global_state.base;
439 /* Create a malloc'ed copy of text in the range [beg, end), but with
440 the HTML entities processed. Recognized entities are &lt;, &gt;,
441 &amp;, &quot;, &nbsp; and the numerical entities. */
444 html_decode_entities (const char *beg, const char *end)
446 char *newstr = (char *)xmalloc (end - beg + 1); /* assume worst-case. */
447 const char *from = beg;
/* SAVE remembers where the candidate entity starts (see the
   back-out comment further down). */
456 const char *save = from;
459 if (++from == end) goto lose;
466 if (from == end || !ISDIGIT (*from)) goto lose;
/* Accumulate the decimal digits.  NOTE(review): no limit on the number
   of digits is visible here, so NUMERIC could overflow on long input --
   confirm how the value is used afterwards. */
467 for (numeric = 0; from < end && ISDIGIT (*from); from++)
468 numeric = 10 * numeric + (*from) - '0';
/* A letter directly after the digits disqualifies the entity. */
469 if (from < end && ISALPHA (*from)) goto lose;
/* FROB (LITERAL) holds when at least sizeof (LITERAL) - 1 characters
   remain, they equal LITERAL, and LITERAL is followed by `;', by the
   end of the remaining text, or by a non-alphanumeric character.
   NOTE(review): the `;' test dereferences FROM + length before the
   REMAIN == length test is made, so if REMAIN counts the characters up
   to END, the byte at END itself is read when the name ends exactly
   there -- confirm END is always dereferenceable. */
473 #define FROB(literal) (remain >= (sizeof (literal) - 1) \
474 && !memcmp (from, literal, sizeof (literal) - 1) \
475 && (*(from + sizeof (literal) - 1) == ';' \
476 || remain == sizeof (literal) - 1 \
477 || !ISALNUM (*(from + sizeof (literal) - 1))))
478 else if (FROB ("lt"))
479 *to++ = '<', from += 2;
480 else if (FROB ("gt"))
481 *to++ = '>', from += 2;
482 else if (FROB ("amp"))
483 *to++ = '&', from += 3;
484 else if (FROB ("quot"))
485 *to++ = '\"', from += 4;
486 /* We don't implement the "Added Latin 1" entities proposed
487 by rfc1866 (except for nbsp), because it is unnecessary
488 in the context of Wget, and would require hashing to work
490 else if (FROB ("nbsp"))
491 *to++ = 160, from += 4;
495 /* If the entity was followed by `;', we step over the `;'.
496 Otherwise, it was followed by either a non-alphanumeric
497 or EOB, in which case we do nothing. */
498 if (from < end && *from == ';')
503 /* This was not an entity after all. Back out. */
509 /* #### Should we try to do this: */
511 newstr = xrealloc (newstr, to - newstr);
516 /* The function returns the pointer to the malloc-ed quoted version of
517 string s. It will recognize and quote numeric and special graphic
518 entities, as per RFC1866:
525 No other entities are recognized or replaced. */
527 html_quote_string (const char *s)
533 /* Pass through the string, and count the new size. */
/* Each input character counts once through the loop's own i++; the
   branches below add only the extra characters an expansion needs. */
534 for (i = 0; *s; s++, i++)
538 else if (*s == '<' || *s == '>')
539 i += 3; /* `&lt;' / `&gt;': four characters replace one, so three extra */
541 i += 5; /* `&quot;': six characters replace one, so five extra */
/* One byte more than the counted size (presumably for the
   terminating NUL). */
543 res = (char *)xmalloc (i + 1);
/* Second pass: write the quoted text through P, starting at RES. */
545 for (p = res; *s; s++)
/* Second letter of the entity name: `l' for `<', `g' for `>'. */
558 *p++ = (*s == '<' ? 'l' : 'g');
578 /* The function creates an HTML index containing references to given
579 directories and files on the appropriate host. The references are
582 ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
586 char *htclfile; /* HTML-clean file name */
/* Open FILE for writing in binary mode; an existing file is truncated. */
590 fp = fopen (file, "wb");
/* Failure report: the file name followed by the strerror text for errno. */
593 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
601 char *tmpu, *tmpp; /* temporary, clean user and passwd */
603 tmpu = CLEANDUP (u->user);
604 tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
/* Size of "user[:passwd]@": the user name, an optional `:' plus
   password, and a final 2 for the `@' and the terminating NUL written
   by the sprintf below. */
605 upwd = (char *)xmalloc (strlen (tmpu)
606 + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
607 sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
/* Page prologue.  NOTE(review): u->dir and u->host are written into
   the title and heading as-is, without html_quote_string -- confirm
   that is intended. */
613 fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
614 fprintf (fp, "<html>\n<head>\n<title>");
615 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
616 fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
617 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
618 fprintf (fp, "</h1>\n<hr>\n<pre>\n");
624 /* #### Should we translate the months? */
625 static char *months[] = {
626 "Jan", "Feb", "Mar", "Apr", "May", "Jun",
627 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
/* NOTE(review): the address of f->tstamp is cast to time_t *, which is
   only sound if tstamp is stored as a time_t -- confirm its
   declaration.  localtime may also return NULL; no check is visible
   before PTM is dereferenced below. */
629 struct tm *ptm = localtime ((time_t *)&f->tstamp);
/* tm_year counts from 1900 and tm_mon is 0-based, hence the offset
   and the table lookup. */
631 fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
634 fprintf (fp, "%02d:%02d ", ptm->tm_hour, ptm->tm_min);
639 fprintf (fp, _("time unknown "));
643 fprintf (fp, _("File "));
646 fprintf (fp, _("Directory "));
649 fprintf (fp, _("Link "));
652 fprintf (fp, _("Not sure "));
/* The entry name is HTML-quoted once and used both inside the href
   and as the visible link text. */
655 htclfile = html_quote_string (f->name);
/* NOTE(review): u->port is printed with %hu here but with %d in the
   title and heading above -- confirm which matches its declared type. */
656 fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
659 fprintf (fp, "%s", u->dir);
662 fprintf (fp, "%s", htclfile);
663 if (f->type == FT_DIRECTORY)
665 fprintf (fp, "\">%s", htclfile);
666 if (f->type == FT_DIRECTORY)
668 fprintf (fp, "</a> ");
669 if (f->type == FT_PLAINFILE)
670 fprintf (fp, _(" (%s bytes)"), legible (f->size));
671 else if (f->type == FT_SYMLINK)
/* A symlink without a recorded target is shown as "(nil)". */
672 fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
/* Page epilogue: closes the <pre>, <body> and <html> opened above. */
677 fprintf (fp, "</pre>\n</body>\n</html>\n");