sjero.net Git - wget/blob - src/html.c

   1 /* A simple HTML parser.
   2    Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <ctype.h>
  23 #ifdef HAVE_STRING_H
  24 # include <string.h>
  25 #else
  26 # include <strings.h>
  27 #endif
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <sys/types.h>
  31 #include <errno.h>
  32
  33 #include "wget.h"
  34 #include "url.h"
  35 #include "utils.h"
  36 #include "ftp.h"
  37 #include "html.h"
  38
  39 #ifndef errno
  40 extern int errno;
  41 #endif
  42
  43 static state_t global_state;
  44
  45 struct tag_attr {
  46   char *tag;
  47   char *attr;
  48 };
  49
  50
  51 /* Match a string against a null-terminated list of identifiers.  */
  52 static int
  53 idmatch (struct tag_attr *tags, const char *tag, const char *attr)
  54 {
  55   int i;
  56
  57   if (!tag || !attr)
  58     return 0;
  59
  60   for (i = 0; tags[i].tag; i++)
  61     if (!strcasecmp (tags[i].tag, tag) && !strcasecmp (tags[i].attr, attr))
  62       return 1;
  63   return 0;
  64 }
  65
  66 /* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
  67    describing URLs to follow.  When a tag is encountered, extract its
  68    components (as described by html_allow[] array), and return the
  69    address and the length of the string.  Return NULL if no URL is
  70    found.  */
  71 const char *
  72 htmlfindurl (const char *buf, int bufsize, int *size, int init)
  73 {
  74   const char *p, *ph;
  75   state_t *s;
  76   /* NULL-terminated list of tags and modifiers someone would want to
  77      follow -- feel free to edit to suit your needs: */
  78   static struct tag_attr html_allow[] = {
  79     { "a", "href" },
  80     { "img", "src" },
  81     { "img", "href" },
  82     { "body", "background" },
  83     { "frame", "src" },
  84     { "iframe", "src" },
  85     { "fig", "src" },
  86     { "overlay", "src" },
  87     { "applet", "code" },
  88     { "script", "src" },
  89     { "embed", "src" },
  90     { "bgsound", "src" },
  91     { "area", "href" },
  92     { "img", "lowsrc" },
  93     { "input", "src" },
  94     { "layer", "src" },
  95     { "table", "background"},
  96     { "th", "background"},
  97     { "td", "background"},
  98     /* Tags below this line are treated specially.  */
  99     { "base", "href" },
 100     { "meta", "content" },
 101     { NULL, NULL }
 102   };
 103
 104   s = &global_state;
 105
 106   if (init)
 107     {
 108       DEBUGP (("Resetting a parser state.\n"));
 109       memset (s, 0, sizeof (*s));
 110     }
 111
 112   while (1)
 113     {
 114       if (!bufsize)
 115         break;
 116       /* Let's look for a tag, if we are not already in one.  */
 117       if (!s->at_value)
 118         {
 119           /* Find '<'.  */
 120           if (*buf != '<')
 121             for (; bufsize && *buf != '<'; ++buf, --bufsize);
 122           if (!bufsize)
 123             break;
 124           /* Skip spaces.  */
 125           for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
 126                ++buf, --bufsize);
 127           if (!bufsize)
 128             break;
 129           p = buf;
 130           /* Find the tag end.  */
 131           for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
 132                ++buf, --bufsize);
 133           if (!bufsize)
 134             break;
 135           if (*buf == '=')
 136             {
 137               /* <tag=something> is illegal.  Just skip it.  */
 138               ++buf, --bufsize;
 139               continue;
 140             }
 141           if (p == buf)
 142             {
 143               /* *buf == '>'.  */
 144               ++buf, --bufsize;
 145               continue;
 146             }
 147           s->tag = strdupdelim (p, buf);
 148           if (*buf == '>')
 149             {
 150               free (s->tag);
 151               s->tag = NULL;
 152               ++buf, --bufsize;
 153               continue;
 154             }
 155         }
 156       else                      /* s->at_value */
 157         {
 158           /* Reset AT_VALUE.  */
 159           s->at_value = 0;
 160           /* If in quotes, just skip out of them and continue living.  */
 161           if (s->in_quote)
 162             {
 163               s->in_quote = 0;
 164               for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
 165               if (!bufsize)
 166                 break;
 167               ++buf, --bufsize;
 168             }
 169           if (!bufsize)
 170             break;
 171           if (*buf == '>')
 172             {
 173               FREE_MAYBE (s->tag);
 174               FREE_MAYBE (s->attr);
 175               s->tag = s->attr = NULL;
 176               continue;
 177             }
 178         }
 179       /* Find the attributes.  */
 180       do
 181         {
 182           FREE_MAYBE (s->attr);
 183           s->attr = NULL;
 184           if (!bufsize)
 185             break;
 186           /* Skip the spaces if we have them.  We don't have them at
 187              places like <img alt="something"src="something-else">.
 188                                              ^ no spaces here */
 189           if (ISSPACE (*buf))
 190             for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
 191                  ++buf, --bufsize);
 192           if (!bufsize || *buf == '>')
 193             break;
 194           if (*buf == '=')
 195             {
 196               /* This is the case of <tag = something>, which is
 197                  illegal.  Just skip it.  */
 198               ++buf, --bufsize;
 199               continue;
 200             }
 201           p = buf;
 202           /* Find the attribute end.  */
 203           for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
 204                ++buf, --bufsize);
 205           if (!bufsize || *buf == '>')
 206             break;
 207           /* Construct the attribute.  */
 208           s->attr = strdupdelim (p, buf);
 209           /* Now we must skip the spaces to find '='.  */
 210           if (*buf != '=')
 211             {
 212               for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
 213               if (!bufsize || *buf == '>')
 214                 break;
 215             }
 216           /* If we still don't have '=', something is amiss.  */
 217           if (*buf != '=')
 218             continue;
 219           /* Find the beginning of attribute value by skipping the
 220              spaces.  */
 221           ++buf, --bufsize;
 222           for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
 223           if (!bufsize || *buf == '>')
 224             break;
 225           ph = NULL;
 226           /* The value of an attribute can, but does not have to be
 227              quoted.  */
 228           if (*buf == '\"' || *buf == '\'')
 229             {
 230               s->in_quote = 1;
 231               s->quote_char = *buf;
 232               p = buf + 1;
 233               for (++buf, --bufsize;
 234                    bufsize && *buf != s->quote_char && *buf != '\n';
 235                    ++buf, --bufsize)
 236                 if (*buf == '#')
 237                   ph = buf;
 238               if (!bufsize)
 239                 {
 240                   s->in_quote = 0;
 241                   break;
 242                 }
 243               if (*buf == '\n')
 244                 {
 245                   /* #### Is the following logic good?
 246
 247                      Obviously no longer in quote.  It might be well
 248                      to check whether '>' was encountered, but that
 249                      would be encouraging writers of invalid HTMLs,
 250                      and we don't want that, now do we?  */
 251                   s->in_quote = 0;
 252                   continue;
 253                 }
 254             }
 255           else
 256             {
 257               p = buf;
 258               for (; bufsize && !ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize)
 259                 if (*buf == '#')
 260                   ph = buf;
 261               if (!bufsize)
 262                 break;
 263             }
 264           /* If '#' was found unprotected in a URI, it is probably an
 265              HTML marker, or color spec.  */
 266           *size = (ph ? ph : buf) - p;
 267           /* The URI is liable to be returned if:
 268              1) *size != 0;
 269              2) its tag and attribute are found in html_allow.  */
 270           if (*size && idmatch (html_allow, s->tag, s->attr))
 271             {
 272               if (!strcasecmp (s->tag, "base") && !strcasecmp (s->attr, "href"))
 273                 {
 274                   FREE_MAYBE (s->base);
 275                   s->base = strdupdelim (p, buf);
 276                 }
 277               else if (!strcasecmp (s->tag, "meta") && !strcasecmp (s->attr, "content"))
 278                 {
 279                   /* Some pages use a META tag to specify that the page
 280                      be refreshed by a new page after a given number of
 281                      seconds.  We need to attempt to extract an URL for
 282                      the new page from the other garbage present.  The
 283                      general format for this is:
 284                      <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">
 285
 286                      So we just need to skip past the "0; URL="
 287                      garbage to get to the URL.  META tags are also
 288                      used for specifying random things like the page
 289                      author's name and what editor was used to create
 290                      it.  So we need to be careful to ignore them and
 291                      not assume that an URL will be present at all.  */
 292                   for (; *size && ISDIGIT (*p); p++, *size -= 1);
 293                   if (*p == ';')
 294                     {
 295                       for (p++, *size -= 1; *size && ISSPACE (*p); p++, *size -= 1) ;
 296                       if (!strncasecmp (p, "URL=", 4))
 297                         {
 298                           p += 4, *size -= 4;
 299                           s->at_value = 1;
 300                           return p;
 301                         }
 302                     }
 303                 }
 304               else
 305                 {
 306                   s->at_value = 1;
 307                   return p;
 308                 }
 309             }
 310           /* Exit from quote.  */
 311           if (*buf == s->quote_char)
 312             {
 313               s->in_quote = 0;
 314               ++buf, --bufsize;
 315             }
 316         } while (*buf != '>');
 317       FREE_MAYBE (s->tag);
 318       FREE_MAYBE (s->attr);
 319       s->tag = s->attr = NULL;
 320       if (!bufsize)
 321         break;
 322     }
 323
 324   FREE_MAYBE (s->tag);
 325   FREE_MAYBE (s->attr);
 326   FREE_MAYBE (s->base);
 327   memset (s, 0, sizeof (*s));   /* just to be sure */
 328   DEBUGP (("HTML parser ends here (state destroyed).\n"));
 329   return NULL;
 330 }
 331
 332 /* The function returns the base reference of HTML buffer id, or NULL
 333    if one wasn't defined for that buffer.  */
 334 const char *
 335 html_base (void)
 336 {
 337   return global_state.base;
 338 }
 339
 340 /* The function returns the pointer to the malloc-ed quoted version of
 341    string s.  It will recognize and quote numeric and special graphic
 342    entities, as per RFC1866:
 343
 344    `&' -> `&amp;'
 345    `<' -> `&lt;'
 346    `>' -> `&gt;'
 347    `"' -> `&quot;'
 348
 349    No other entities are recognized or replaced.  */
 350 static char *
 351 html_quote_string (const char *s)
 352 {
 353   const char *b = s;
 354   char *p, *res;
 355   int i;
 356
 357   /* Pass through the string, and count the new size.  */
 358   for (i = 0; *s; s++, i++)
 359     {
 360       if (*s == '&')
 361         i += 4;                /* `amp;' */
 362       else if (*s == '<' || *s == '>')
 363         i += 3;                /* `lt;' and `gt;' */
 364       else if (*s == '\"')
 365         i += 5;                /* `quot;' */
 366     }
 367   res = (char *)xmalloc (i + 1);
 368   s = b;
 369   for (p = res; *s; s++)
 370     {
 371       switch (*s)
 372         {
 373         case '&':
 374           *p++ = '&';
 375           *p++ = 'a';
 376           *p++ = 'm';
 377           *p++ = 'p';
 378           *p++ = ';';
 379           break;
 380         case '<': case '>':
 381           *p++ = '&';
 382           *p++ = (*s == '<' ? 'l' : 'g');
 383           *p++ = 't';
 384           *p++ = ';';
 385           break;
 386         case '\"':
 387           *p++ = '&';
 388           *p++ = 'q';
 389           *p++ = 'u';
 390           *p++ = 'o';
 391           *p++ = 't';
 392           *p++ = ';';
 393           break;
 394         default:
 395           *p++ = *s;
 396         }
 397     }
 398   *p = '\0';
 399   return res;
 400 }
 401
 402 /* The function creates an HTML index containing references to given
 403    directories and files on the appropriate host.  The references are
 404    FTP.  */
 405 uerr_t
 406 ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
 407 {
 408   FILE *fp;
 409   char *upwd;
 410   char *htclfile;               /* HTML-clean file name */
 411
 412   if (!opt.dfp)
 413     {
 414       fp = fopen (file, "wb");
 415       if (!fp)
 416         {
 417           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 418           return FOPENERR;
 419         }
 420     }
 421   else
 422     fp = opt.dfp;
 423   if (u->user)
 424     {
 425       char *tmpu, *tmpp;        /* temporary, clean user and passwd */
 426
 427       tmpu = CLEANDUP (u->user);
 428       tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
 429       upwd = (char *)xmalloc (strlen (tmpu)
 430                              + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
 431       sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
 432       free (tmpu);
 433       FREE_MAYBE (tmpp);
 434     }
 435   else
 436     upwd = xstrdup ("");
 437   fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
 438   fprintf (fp, "<html>\n<head>\n<title>");
 439   fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
 440   fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
 441   fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
 442   fprintf (fp, "</h1>\n<hr>\n<pre>\n");
 443   while (f)
 444     {
 445       fprintf (fp, "  ");
 446       if (f->tstamp != -1)
 447         {
 448           /* #### Should we translate the months? */
 449           static char *months[] = {
 450             "Jan", "Feb", "Mar", "Apr", "May", "Jun",
 451             "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
 452           };
 453           struct tm *ptm = localtime ((time_t *)&f->tstamp);
 454
 455           fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
 456                   ptm->tm_mday);
 457           if (ptm->tm_hour)
 458             fprintf (fp, "%02d:%02d  ", ptm->tm_hour, ptm->tm_min);
 459           else
 460             fprintf (fp, "       ");
 461         }
 462       else
 463         fprintf (fp, _("time unknown       "));
 464       switch (f->type)
 465         {
 466         case FT_PLAINFILE:
 467           fprintf (fp, _("File        "));
 468           break;
 469         case FT_DIRECTORY:
 470           fprintf (fp, _("Directory   "));
 471           break;
 472         case FT_SYMLINK:
 473           fprintf (fp, _("Link        "));
 474           break;
 475         default:
 476           fprintf (fp, _("Not sure    "));
 477           break;
 478         }
 479       htclfile = html_quote_string (f->name);
 480       fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
 481       if (*u->dir != '/')
 482         putc ('/', fp);
 483       fprintf (fp, "%s", u->dir);
 484       if (*u->dir)
 485         putc ('/', fp);
 486       fprintf (fp, "%s", htclfile);
 487       if (f->type == FT_DIRECTORY)
 488         putc ('/', fp);
 489       fprintf (fp, "\">%s", htclfile);
 490       if (f->type == FT_DIRECTORY)
 491         putc ('/', fp);
 492       fprintf (fp, "</a> ");
 493       if (f->type == FT_PLAINFILE)
 494         fprintf (fp, _(" (%s bytes)"), legible (f->size));
 495       else if (f->type == FT_SYMLINK)
 496         fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
 497       putc ('\n', fp);
 498       free (htclfile);
 499       f = f->next;
 500     }
 501   fprintf (fp, "</pre>\n</body>\n</html>\n");
 502   free (upwd);
 503   if (!opt.dfp)
 504     fclose (fp);
 505   else
 506     fflush (fp);
 507   return FTPOK;
 508 }