sjero.net Git - wget/blob - src/html.c

   1 /* A simple HTML parser.
   2    Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <ctype.h>
  23 #ifdef HAVE_STRING_H
  24 # include <string.h>
  25 #else
  26 # include <strings.h>
  27 #endif
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <sys/types.h>
  31 #include <errno.h>
  32
  33 #include "wget.h"
  34 #include "url.h"
  35 #include "utils.h"
  36 #include "ftp.h"
  37 #include "html.h"
  38
  39 #ifndef errno
  40 extern int errno;
  41 #endif
  42
  43 static state_t global_state;
  44
  45 struct tag_attr {
  46   char *tag;
  47   char *attr;
  48 };
  49
  50
  51 /* Match a string against a null-terminated list of identifiers.  */
  52 static int
  53 idmatch (struct tag_attr *tags, const char *tag, const char *attr)
  54 {
  55   int  i, j;
  56
  57   if (tag == NULL || attr == NULL)
  58     return FALSE;
  59
  60   for (i = 0; tags[i].tag; i++)
  61     /* Loop through all the tags wget ever cares about. */
  62     if (!strcasecmp (tags[i].tag, tag) && !strcasecmp (tags[i].attr, attr))
  63       /* The tag and attribute matched one of the ones wget cares about. */
  64       {
  65         if (opt.ignore_tags)
  66           /* --ignore-tags was specified.  Do not match these specific tags.
  67              --ignore-tags takes precedence over --follow-tags, so we process
  68              --ignore first and fall through if there's no match. */
  69           for (j = 0; opt.ignore_tags[j] != NULL; j++)
  70             /* Loop through all the tags this user doesn't care about. */
  71             if (strcasecmp(opt.ignore_tags[j], tag) == EQ)
  72               return FALSE;
  73
  74         if (opt.follow_tags)
  75           /* --follow-tags was specified.  Only match these specific tags, so
  76              return FALSE if we don't match one of them. */
  77           {
  78             for (j = 0; opt.follow_tags[j] != NULL; j++)
  79               /* Loop through all the tags this user cares about. */
  80               if (strcasecmp(opt.follow_tags[j], tag) == EQ)
  81                 return TRUE;
  82
  83             return FALSE;  /* wasn't one of the explicitly desired tags */
  84           }
  85
  86         /* If we get to here, --follow-tags isn't being used, and --ignore-tags,
  87            if specified, didn't include this tag, so it's okay to follow. */
  88         return TRUE;
  89       }
  90
  91   return FALSE;  /* not one of the tag/attribute pairs wget ever cares about */
  92 }
  93
  94
  95 /* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
  96    describing URLs to follow.  When a tag is encountered, extract its
  97    components (as described by html_allow[] array), and return the
  98    address and the length of the string.  Return NULL if no URL is
  99    found.  */
 100 const char *
 101 htmlfindurl (const char *buf, int bufsize, int *size, int init)
 102 {
 103   const char *p, *ph;
 104   state_t *s;
 105   /* NULL-terminated list of tags and modifiers someone would want to
 106      follow -- feel free to edit to suit your needs: */
 107   static struct tag_attr html_allow[] = {
 108     { "a", "href" },
 109     { "link", "href" },
 110     { "script", "src" },
 111     { "img", "src" },
 112     { "img", "href" },
 113     { "body", "background" },
 114     { "frame", "src" },
 115     { "iframe", "src" },
 116     { "fig", "src" },
 117     { "overlay", "src" },
 118     { "applet", "code" },
 119     { "script", "src" },
 120     { "embed", "src" },
 121     { "bgsound", "src" },
 122     { "area", "href" },
 123     { "img", "lowsrc" },
 124     { "input", "src" },
 125     { "layer", "src" },
 126     { "table", "background"},
 127     { "th", "background"},
 128     { "td", "background"},
 129     /* Tags below this line are treated specially.  */
 130     { "base", "href" },
 131     { "meta", "content" },
 132     { NULL, NULL }
 133   };
 134
 135   s = &global_state;
 136
 137   if (init)
 138     {
 139       DEBUGP (("Resetting a parser state.\n"));
 140       memset (s, 0, sizeof (*s));
 141     }
 142
 143   while (1)
 144     {
 145       if (!bufsize)
 146         break;
 147       /* Let's look for a tag, if we are not already in one.  */
 148       if (!s->at_value)
 149         {
 150           /* Find '<'.  */
 151           if (*buf != '<')
 152             for (; bufsize && *buf != '<'; ++buf, --bufsize);
 153           if (!bufsize)
 154             break;
 155           /* Skip spaces.  */
 156           for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
 157                ++buf, --bufsize);
 158           if (!bufsize)
 159             break;
 160           p = buf;
 161           /* Find the tag end.  */
 162           for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
 163                ++buf, --bufsize);
 164           if (!bufsize)
 165             break;
 166           if (*buf == '=')
 167             {
 168               /* <tag=something> is illegal.  Just skip it.  */
 169               ++buf, --bufsize;
 170               continue;
 171             }
 172           if (p == buf)
 173             {
 174               /* *buf == '>'.  */
 175               ++buf, --bufsize;
 176               continue;
 177             }
 178           s->tag = strdupdelim (p, buf);
 179           if (*buf == '>')
 180             {
 181               free (s->tag);
 182               s->tag = NULL;
 183               ++buf, --bufsize;
 184               continue;
 185             }
 186         }
 187       else                      /* s->at_value */
 188         {
 189           /* Reset AT_VALUE.  */
 190           s->at_value = 0;
 191           /* If in quotes, just skip out of them and continue living.  */
 192           if (s->in_quote)
 193             {
 194               s->in_quote = 0;
 195               for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
 196               if (!bufsize)
 197                 break;
 198               ++buf, --bufsize;
 199             }
 200           if (!bufsize)
 201             break;
 202           if (*buf == '>')
 203             {
 204               FREE_MAYBE (s->tag);
 205               FREE_MAYBE (s->attr);
 206               s->tag = s->attr = NULL;
 207               continue;
 208             }
 209         }
 210       /* Find the attributes.  */
 211       do
 212         {
 213           FREE_MAYBE (s->attr);
 214           s->attr = NULL;
 215           if (!bufsize)
 216             break;
 217           /* Skip the spaces if we have them.  We don't have them at
 218              places like <img alt="something"src="something-else">.
 219                                              ^ no spaces here */
 220           if (ISSPACE (*buf))
 221             for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
 222                  ++buf, --bufsize);
 223           if (!bufsize || *buf == '>')
 224             break;
 225           if (*buf == '=')
 226             {
 227               /* This is the case of <tag = something>, which is
 228                  illegal.  Just skip it.  */
 229               ++buf, --bufsize;
 230               continue;
 231             }
 232           p = buf;
 233           /* Find the attribute end.  */
 234           for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
 235                ++buf, --bufsize);
 236           if (!bufsize || *buf == '>')
 237             break;
 238           /* Construct the attribute.  */
 239           s->attr = strdupdelim (p, buf);
 240           /* Now we must skip the spaces to find '='.  */
 241           if (*buf != '=')
 242             {
 243               for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
 244               if (!bufsize || *buf == '>')
 245                 break;
 246             }
 247           /* If we still don't have '=', something is amiss.  */
 248           if (*buf != '=')
 249             continue;
 250           /* Find the beginning of attribute value by skipping the
 251              spaces.  */
 252           ++buf, --bufsize;
 253           for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
 254           if (!bufsize || *buf == '>')
 255             break;
 256           ph = NULL;
 257           /* The value of an attribute can, but does not have to be
 258              quoted.  */
 259           if (*buf == '\"' || *buf == '\'')
 260             {
 261               s->in_quote = 1;
 262               s->quote_char = *buf;
 263               p = buf + 1;
 264               for (++buf, --bufsize;
 265                    bufsize && *buf != s->quote_char && *buf != '\n';
 266                    ++buf, --bufsize)
 267                 if (*buf == '#')
 268                   ph = buf;
 269               if (!bufsize)
 270                 {
 271                   s->in_quote = 0;
 272                   break;
 273                 }
 274               if (*buf == '\n')
 275                 {
 276                   /* #### Is the following logic good?
 277
 278                      Obviously no longer in quote.  It might be well
 279                      to check whether '>' was encountered, but that
 280                      would be encouraging writers of invalid HTMLs,
 281                      and we don't want that, now do we?  */
 282                   s->in_quote = 0;
 283                   continue;
 284                 }
 285             }
 286           else
 287             {
 288               p = buf;
 289               for (; bufsize && !ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize)
 290                 if (*buf == '#')
 291                   ph = buf;
 292               if (!bufsize)
 293                 break;
 294             }
 295           /* If '#' was found unprotected in a URI, it is probably an
 296              HTML marker, or color spec.  */
 297           *size = (ph ? ph : buf) - p;
 298           /* The URI is liable to be returned if:
 299              1) *size != 0;
 300              2) its tag and attribute are found in html_allow.  */
 301           if (*size && idmatch (html_allow, s->tag, s->attr))
 302             {
 303               if (!strcasecmp (s->tag, "base") && !strcasecmp (s->attr, "href"))
 304                 {
 305                   FREE_MAYBE (s->base);
 306                   s->base = strdupdelim (p, buf);
 307                 }
 308               else if (!strcasecmp (s->tag, "meta") && !strcasecmp (s->attr, "content"))
 309                 {
 310                   /* Some pages use a META tag to specify that the page
 311                      be refreshed by a new page after a given number of
 312                      seconds.  We need to attempt to extract an URL for
 313                      the new page from the other garbage present.  The
 314                      general format for this is:
 315                      <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">
 316
 317                      So we just need to skip past the "0; URL="
 318                      garbage to get to the URL.  META tags are also
 319                      used for specifying random things like the page
 320                      author's name and what editor was used to create
 321                      it.  So we need to be careful to ignore them and
 322                      not assume that an URL will be present at all.  */
 323                   for (; *size && ISDIGIT (*p); p++, *size -= 1);
 324                   if (*p == ';')
 325                     {
 326                       for (p++, *size -= 1; *size && ISSPACE (*p); p++, *size -= 1) ;
 327                       if (!strncasecmp (p, "URL=", 4))
 328                         {
 329                           p += 4, *size -= 4;
 330                           s->at_value = 1;
 331                           return p;
 332                         }
 333                     }
 334                 }
 335               else
 336                 {
 337                   s->at_value = 1;
 338                   return p;
 339                 }
 340             }
 341           /* Exit from quote.  */
 342           if (*buf == s->quote_char)
 343             {
 344               s->in_quote = 0;
 345               ++buf, --bufsize;
 346             }
 347         } while (*buf != '>');
 348       FREE_MAYBE (s->tag);
 349       FREE_MAYBE (s->attr);
 350       s->tag = s->attr = NULL;
 351       if (!bufsize)
 352         break;
 353     }
 354
 355   FREE_MAYBE (s->tag);
 356   FREE_MAYBE (s->attr);
 357   FREE_MAYBE (s->base);
 358   memset (s, 0, sizeof (*s));   /* just to be sure */
 359   DEBUGP (("HTML parser ends here (state destroyed).\n"));
 360   return NULL;
 361 }
 362
 363 /* The function returns the base reference of HTML buffer id, or NULL
 364    if one wasn't defined for that buffer.  */
 365 const char *
 366 html_base (void)
 367 {
 368   return global_state.base;
 369 }
 370
 371 /* The function returns the pointer to the malloc-ed quoted version of
 372    string s.  It will recognize and quote numeric and special graphic
 373    entities, as per RFC1866:
 374
 375    `&' -> `&amp;'
 376    `<' -> `&lt;'
 377    `>' -> `&gt;'
 378    `"' -> `&quot;'
 379
 380    No other entities are recognized or replaced.  */
 381 static char *
 382 html_quote_string (const char *s)
 383 {
 384   const char *b = s;
 385   char *p, *res;
 386   int i;
 387
 388   /* Pass through the string, and count the new size.  */
 389   for (i = 0; *s; s++, i++)
 390     {
 391       if (*s == '&')
 392         i += 4;                /* `amp;' */
 393       else if (*s == '<' || *s == '>')
 394         i += 3;                /* `lt;' and `gt;' */
 395       else if (*s == '\"')
 396         i += 5;                /* `quot;' */
 397     }
 398   res = (char *)xmalloc (i + 1);
 399   s = b;
 400   for (p = res; *s; s++)
 401     {
 402       switch (*s)
 403         {
 404         case '&':
 405           *p++ = '&';
 406           *p++ = 'a';
 407           *p++ = 'm';
 408           *p++ = 'p';
 409           *p++ = ';';
 410           break;
 411         case '<': case '>':
 412           *p++ = '&';
 413           *p++ = (*s == '<' ? 'l' : 'g');
 414           *p++ = 't';
 415           *p++ = ';';
 416           break;
 417         case '\"':
 418           *p++ = '&';
 419           *p++ = 'q';
 420           *p++ = 'u';
 421           *p++ = 'o';
 422           *p++ = 't';
 423           *p++ = ';';
 424           break;
 425         default:
 426           *p++ = *s;
 427         }
 428     }
 429   *p = '\0';
 430   return res;
 431 }
 432
 433 /* The function creates an HTML index containing references to given
 434    directories and files on the appropriate host.  The references are
 435    FTP.  */
 436 uerr_t
 437 ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
 438 {
 439   FILE *fp;
 440   char *upwd;
 441   char *htclfile;               /* HTML-clean file name */
 442
 443   if (!opt.dfp)
 444     {
 445       fp = fopen (file, "wb");
 446       if (!fp)
 447         {
 448           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 449           return FOPENERR;
 450         }
 451     }
 452   else
 453     fp = opt.dfp;
 454   if (u->user)
 455     {
 456       char *tmpu, *tmpp;        /* temporary, clean user and passwd */
 457
 458       tmpu = CLEANDUP (u->user);
 459       tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
 460       upwd = (char *)xmalloc (strlen (tmpu)
 461                              + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
 462       sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
 463       free (tmpu);
 464       FREE_MAYBE (tmpp);
 465     }
 466   else
 467     upwd = xstrdup ("");
 468   fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
 469   fprintf (fp, "<html>\n<head>\n<title>");
 470   fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
 471   fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
 472   fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
 473   fprintf (fp, "</h1>\n<hr>\n<pre>\n");
 474   while (f)
 475     {
 476       fprintf (fp, "  ");
 477       if (f->tstamp != -1)
 478         {
 479           /* #### Should we translate the months? */
 480           static char *months[] = {
 481             "Jan", "Feb", "Mar", "Apr", "May", "Jun",
 482             "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
 483           };
 484           struct tm *ptm = localtime ((time_t *)&f->tstamp);
 485
 486           fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
 487                   ptm->tm_mday);
 488           if (ptm->tm_hour)
 489             fprintf (fp, "%02d:%02d  ", ptm->tm_hour, ptm->tm_min);
 490           else
 491             fprintf (fp, "       ");
 492         }
 493       else
 494         fprintf (fp, _("time unknown       "));
 495       switch (f->type)
 496         {
 497         case FT_PLAINFILE:
 498           fprintf (fp, _("File        "));
 499           break;
 500         case FT_DIRECTORY:
 501           fprintf (fp, _("Directory   "));
 502           break;
 503         case FT_SYMLINK:
 504           fprintf (fp, _("Link        "));
 505           break;
 506         default:
 507           fprintf (fp, _("Not sure    "));
 508           break;
 509         }
 510       htclfile = html_quote_string (f->name);
 511       fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
 512       if (*u->dir != '/')
 513         putc ('/', fp);
 514       fprintf (fp, "%s", u->dir);
 515       if (*u->dir)
 516         putc ('/', fp);
 517       fprintf (fp, "%s", htclfile);
 518       if (f->type == FT_DIRECTORY)
 519         putc ('/', fp);
 520       fprintf (fp, "\">%s", htclfile);
 521       if (f->type == FT_DIRECTORY)
 522         putc ('/', fp);
 523       fprintf (fp, "</a> ");
 524       if (f->type == FT_PLAINFILE)
 525         fprintf (fp, _(" (%s bytes)"), legible (f->size));
 526       else if (f->type == FT_SYMLINK)
 527         fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
 528       putc ('\n', fp);
 529       free (htclfile);
 530       f = f->next;
 531     }
 532   fprintf (fp, "</pre>\n</body>\n</html>\n");
 533   free (upwd);
 534   if (!opt.dfp)
 535     fclose (fp);
 536   else
 537     fflush (fp);
 538   return FTPOK;
 539 }