sjero.net Git - wget/blob - src/html.c

   1 /* A simple HTML parser.
   2    Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <ctype.h>
  23 #ifdef HAVE_STRING_H
  24 # include <string.h>
  25 #else
  26 # include <strings.h>
  27 #endif
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <sys/types.h>
  31 #include <errno.h>
  32
  33 #include "wget.h"
  34 #include "url.h"
  35 #include "utils.h"
  36 #include "ftp.h"
  37 #include "html.h"
  38
/* If errno is not provided as a macro, declare it as an ordinary
   external variable.  */
#ifndef errno
extern int errno;
#endif

/* Parser state used by htmlfindurl().  It lives at file scope, so it
   persists between calls; its `base' member is what html_base()
   returns.  */
static state_t global_state;
  44
  45 struct tag_attr {
  46   char *tag;
  47   char *attr;
  48 };
  49
  50
  51 /* Match a string against a null-terminated list of identifiers.  */
  52 static int
  53 idmatch (struct tag_attr *tags, const char *tag, const char *attr)
  54 {
  55   int i;
  56
  57   if (!tag || !attr)
  58     return 0;
  59
  60   for (i = 0; tags[i].tag; i++)
  61     if (!strcasecmp (tags[i].tag, tag) && !strcasecmp (tags[i].attr, attr))
  62       return 1;
  63   return 0;
  64 }
  65
  66 /* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
  67    describing URLs to follow.  When a tag is encountered, extract its
  68    components (as described by html_allow[] array), and return the
  69    address and the length of the string.  Return NULL if no URL is
  70    found.  */
  71 const char *
  72 htmlfindurl (const char *buf, int bufsize, int *size, int init)
  73 {
  74   const char *p, *ph;
  75   state_t *s;
  76   /* NULL-terminated list of tags and modifiers someone would want to
  77      follow -- feel free to edit to suit your needs: */
  78   static struct tag_attr html_allow[] = {
  79     { "a", "href" },
  80     { "link", "href" },
  81     { "script", "src" },
  82     { "img", "src" },
  83     { "img", "href" },
  84     { "body", "background" },
  85     { "frame", "src" },
  86     { "iframe", "src" },
  87     { "fig", "src" },
  88     { "overlay", "src" },
  89     { "applet", "code" },
  90     { "script", "src" },
  91     { "embed", "src" },
  92     { "bgsound", "src" },
  93     { "area", "href" },
  94     { "img", "lowsrc" },
  95     { "input", "src" },
  96     { "layer", "src" },
  97     { "table", "background"},
  98     { "th", "background"},
  99     { "td", "background"},
 100     /* Tags below this line are treated specially.  */
 101     { "base", "href" },
 102     { "meta", "content" },
 103     { NULL, NULL }
 104   };
 105
 106   s = &global_state;
 107
 108   if (init)
 109     {
 110       DEBUGP (("Resetting a parser state.\n"));
 111       memset (s, 0, sizeof (*s));
 112     }
 113
 114   while (1)
 115     {
 116       if (!bufsize)
 117         break;
 118       /* Let's look for a tag, if we are not already in one.  */
 119       if (!s->at_value)
 120         {
 121           /* Find '<'.  */
 122           if (*buf != '<')
 123             for (; bufsize && *buf != '<'; ++buf, --bufsize);
 124           if (!bufsize)
 125             break;
 126           /* Skip spaces.  */
 127           for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
 128                ++buf, --bufsize);
 129           if (!bufsize)
 130             break;
 131           p = buf;
 132           /* Find the tag end.  */
 133           for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
 134                ++buf, --bufsize);
 135           if (!bufsize)
 136             break;
 137           if (*buf == '=')
 138             {
 139               /* <tag=something> is illegal.  Just skip it.  */
 140               ++buf, --bufsize;
 141               continue;
 142             }
 143           if (p == buf)
 144             {
 145               /* *buf == '>'.  */
 146               ++buf, --bufsize;
 147               continue;
 148             }
 149           s->tag = strdupdelim (p, buf);
 150           if (*buf == '>')
 151             {
 152               free (s->tag);
 153               s->tag = NULL;
 154               ++buf, --bufsize;
 155               continue;
 156             }
 157         }
 158       else                      /* s->at_value */
 159         {
 160           /* Reset AT_VALUE.  */
 161           s->at_value = 0;
 162           /* If in quotes, just skip out of them and continue living.  */
 163           if (s->in_quote)
 164             {
 165               s->in_quote = 0;
 166               for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
 167               if (!bufsize)
 168                 break;
 169               ++buf, --bufsize;
 170             }
 171           if (!bufsize)
 172             break;
 173           if (*buf == '>')
 174             {
 175               FREE_MAYBE (s->tag);
 176               FREE_MAYBE (s->attr);
 177               s->tag = s->attr = NULL;
 178               continue;
 179             }
 180         }
 181       /* Find the attributes.  */
 182       do
 183         {
 184           FREE_MAYBE (s->attr);
 185           s->attr = NULL;
 186           if (!bufsize)
 187             break;
 188           /* Skip the spaces if we have them.  We don't have them at
 189              places like <img alt="something"src="something-else">.
 190                                              ^ no spaces here */
 191           if (ISSPACE (*buf))
 192             for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
 193                  ++buf, --bufsize);
 194           if (!bufsize || *buf == '>')
 195             break;
 196           if (*buf == '=')
 197             {
 198               /* This is the case of <tag = something>, which is
 199                  illegal.  Just skip it.  */
 200               ++buf, --bufsize;
 201               continue;
 202             }
 203           p = buf;
 204           /* Find the attribute end.  */
 205           for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
 206                ++buf, --bufsize);
 207           if (!bufsize || *buf == '>')
 208             break;
 209           /* Construct the attribute.  */
 210           s->attr = strdupdelim (p, buf);
 211           /* Now we must skip the spaces to find '='.  */
 212           if (*buf != '=')
 213             {
 214               for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
 215               if (!bufsize || *buf == '>')
 216                 break;
 217             }
 218           /* If we still don't have '=', something is amiss.  */
 219           if (*buf != '=')
 220             continue;
 221           /* Find the beginning of attribute value by skipping the
 222              spaces.  */
 223           ++buf, --bufsize;
 224           for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
 225           if (!bufsize || *buf == '>')
 226             break;
 227           ph = NULL;
 228           /* The value of an attribute can, but does not have to be
 229              quoted.  */
 230           if (*buf == '\"' || *buf == '\'')
 231             {
 232               s->in_quote = 1;
 233               s->quote_char = *buf;
 234               p = buf + 1;
 235               for (++buf, --bufsize;
 236                    bufsize && *buf != s->quote_char && *buf != '\n';
 237                    ++buf, --bufsize)
 238                 if (*buf == '#')
 239                   ph = buf;
 240               if (!bufsize)
 241                 {
 242                   s->in_quote = 0;
 243                   break;
 244                 }
 245               if (*buf == '\n')
 246                 {
 247                   /* #### Is the following logic good?
 248
 249                      Obviously no longer in quote.  It might be well
 250                      to check whether '>' was encountered, but that
 251                      would be encouraging writers of invalid HTMLs,
 252                      and we don't want that, now do we?  */
 253                   s->in_quote = 0;
 254                   continue;
 255                 }
 256             }
 257           else
 258             {
 259               p = buf;
 260               for (; bufsize && !ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize)
 261                 if (*buf == '#')
 262                   ph = buf;
 263               if (!bufsize)
 264                 break;
 265             }
 266           /* If '#' was found unprotected in a URI, it is probably an
 267              HTML marker, or color spec.  */
 268           *size = (ph ? ph : buf) - p;
 269           /* The URI is liable to be returned if:
 270              1) *size != 0;
 271              2) its tag and attribute are found in html_allow.  */
 272           if (*size && idmatch (html_allow, s->tag, s->attr))
 273             {
 274               if (!strcasecmp (s->tag, "base") && !strcasecmp (s->attr, "href"))
 275                 {
 276                   FREE_MAYBE (s->base);
 277                   s->base = strdupdelim (p, buf);
 278                 }
 279               else if (!strcasecmp (s->tag, "meta") && !strcasecmp (s->attr, "content"))
 280                 {
 281                   /* Some pages use a META tag to specify that the page
 282                      be refreshed by a new page after a given number of
 283                      seconds.  We need to attempt to extract an URL for
 284                      the new page from the other garbage present.  The
 285                      general format for this is:
 286                      <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">
 287
 288                      So we just need to skip past the "0; URL="
 289                      garbage to get to the URL.  META tags are also
 290                      used for specifying random things like the page
 291                      author's name and what editor was used to create
 292                      it.  So we need to be careful to ignore them and
 293                      not assume that an URL will be present at all.  */
 294                   for (; *size && ISDIGIT (*p); p++, *size -= 1);
 295                   if (*p == ';')
 296                     {
 297                       for (p++, *size -= 1; *size && ISSPACE (*p); p++, *size -= 1) ;
 298                       if (!strncasecmp (p, "URL=", 4))
 299                         {
 300                           p += 4, *size -= 4;
 301                           s->at_value = 1;
 302                           return p;
 303                         }
 304                     }
 305                 }
 306               else
 307                 {
 308                   s->at_value = 1;
 309                   return p;
 310                 }
 311             }
 312           /* Exit from quote.  */
 313           if (*buf == s->quote_char)
 314             {
 315               s->in_quote = 0;
 316               ++buf, --bufsize;
 317             }
 318         } while (*buf != '>');
 319       FREE_MAYBE (s->tag);
 320       FREE_MAYBE (s->attr);
 321       s->tag = s->attr = NULL;
 322       if (!bufsize)
 323         break;
 324     }
 325
 326   FREE_MAYBE (s->tag);
 327   FREE_MAYBE (s->attr);
 328   FREE_MAYBE (s->base);
 329   memset (s, 0, sizeof (*s));   /* just to be sure */
 330   DEBUGP (("HTML parser ends here (state destroyed).\n"));
 331   return NULL;
 332 }
 333
 334 /* The function returns the base reference of HTML buffer id, or NULL
 335    if one wasn't defined for that buffer.  */
 336 const char *
 337 html_base (void)
 338 {
 339   return global_state.base;
 340 }
 341
 342 /* The function returns the pointer to the malloc-ed quoted version of
 343    string s.  It will recognize and quote numeric and special graphic
 344    entities, as per RFC1866:
 345
 346    `&' -> `&amp;'
 347    `<' -> `&lt;'
 348    `>' -> `&gt;'
 349    `"' -> `&quot;'
 350
 351    No other entities are recognized or replaced.  */
 352 static char *
 353 html_quote_string (const char *s)
 354 {
 355   const char *b = s;
 356   char *p, *res;
 357   int i;
 358
 359   /* Pass through the string, and count the new size.  */
 360   for (i = 0; *s; s++, i++)
 361     {
 362       if (*s == '&')
 363         i += 4;                /* `amp;' */
 364       else if (*s == '<' || *s == '>')
 365         i += 3;                /* `lt;' and `gt;' */
 366       else if (*s == '\"')
 367         i += 5;                /* `quot;' */
 368     }
 369   res = (char *)xmalloc (i + 1);
 370   s = b;
 371   for (p = res; *s; s++)
 372     {
 373       switch (*s)
 374         {
 375         case '&':
 376           *p++ = '&';
 377           *p++ = 'a';
 378           *p++ = 'm';
 379           *p++ = 'p';
 380           *p++ = ';';
 381           break;
 382         case '<': case '>':
 383           *p++ = '&';
 384           *p++ = (*s == '<' ? 'l' : 'g');
 385           *p++ = 't';
 386           *p++ = ';';
 387           break;
 388         case '\"':
 389           *p++ = '&';
 390           *p++ = 'q';
 391           *p++ = 'u';
 392           *p++ = 'o';
 393           *p++ = 't';
 394           *p++ = ';';
 395           break;
 396         default:
 397           *p++ = *s;
 398         }
 399     }
 400   *p = '\0';
 401   return res;
 402 }
 403
 404 /* The function creates an HTML index containing references to given
 405    directories and files on the appropriate host.  The references are
 406    FTP.  */
 407 uerr_t
 408 ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
 409 {
 410   FILE *fp;
 411   char *upwd;
 412   char *htclfile;               /* HTML-clean file name */
 413
 414   if (!opt.dfp)
 415     {
 416       fp = fopen (file, "wb");
 417       if (!fp)
 418         {
 419           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 420           return FOPENERR;
 421         }
 422     }
 423   else
 424     fp = opt.dfp;
 425   if (u->user)
 426     {
 427       char *tmpu, *tmpp;        /* temporary, clean user and passwd */
 428
 429       tmpu = CLEANDUP (u->user);
 430       tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
 431       upwd = (char *)xmalloc (strlen (tmpu)
 432                              + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
 433       sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
 434       free (tmpu);
 435       FREE_MAYBE (tmpp);
 436     }
 437   else
 438     upwd = xstrdup ("");
 439   fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
 440   fprintf (fp, "<html>\n<head>\n<title>");
 441   fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
 442   fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
 443   fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
 444   fprintf (fp, "</h1>\n<hr>\n<pre>\n");
 445   while (f)
 446     {
 447       fprintf (fp, "  ");
 448       if (f->tstamp != -1)
 449         {
 450           /* #### Should we translate the months? */
 451           static char *months[] = {
 452             "Jan", "Feb", "Mar", "Apr", "May", "Jun",
 453             "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
 454           };
 455           struct tm *ptm = localtime ((time_t *)&f->tstamp);
 456
 457           fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
 458                   ptm->tm_mday);
 459           if (ptm->tm_hour)
 460             fprintf (fp, "%02d:%02d  ", ptm->tm_hour, ptm->tm_min);
 461           else
 462             fprintf (fp, "       ");
 463         }
 464       else
 465         fprintf (fp, _("time unknown       "));
 466       switch (f->type)
 467         {
 468         case FT_PLAINFILE:
 469           fprintf (fp, _("File        "));
 470           break;
 471         case FT_DIRECTORY:
 472           fprintf (fp, _("Directory   "));
 473           break;
 474         case FT_SYMLINK:
 475           fprintf (fp, _("Link        "));
 476           break;
 477         default:
 478           fprintf (fp, _("Not sure    "));
 479           break;
 480         }
 481       htclfile = html_quote_string (f->name);
 482       fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
 483       if (*u->dir != '/')
 484         putc ('/', fp);
 485       fprintf (fp, "%s", u->dir);
 486       if (*u->dir)
 487         putc ('/', fp);
 488       fprintf (fp, "%s", htclfile);
 489       if (f->type == FT_DIRECTORY)
 490         putc ('/', fp);
 491       fprintf (fp, "\">%s", htclfile);
 492       if (f->type == FT_DIRECTORY)
 493         putc ('/', fp);
 494       fprintf (fp, "</a> ");
 495       if (f->type == FT_PLAINFILE)
 496         fprintf (fp, _(" (%s bytes)"), legible (f->size));
 497       else if (f->type == FT_SYMLINK)
 498         fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
 499       putc ('\n', fp);
 500       free (htclfile);
 501       f = f->next;
 502     }
 503   fprintf (fp, "</pre>\n</body>\n</html>\n");
 504   free (upwd);
 505   if (!opt.dfp)
 506     fclose (fp);
 507   else
 508     fflush (fp);
 509   return FTPOK;
 510 }