sjero.net Git - wget/blob - src/html.c

   1 /* A simple HTML parser.
   2    Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <ctype.h>
  23 #ifdef HAVE_STRING_H
  24 # include <string.h>
  25 #else
  26 # include <strings.h>
  27 #endif
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <sys/types.h>
  31 #include <errno.h>
  32
  33 #include "wget.h"
  34 #include "url.h"
  35 #include "utils.h"
  36 #include "ftp.h"
  37 #include "html.h"
  38
  39 #ifndef errno
  40 extern int errno;
  41 #endif
  42 /* Parser state kept between htmlfindurl() calls; html_base() reads its `base' field.  */
  43 static state_t global_state;
  44
  45 struct tag_attr {
  46   char *tag;
  47   char *attr;
  48 };
  49
  50
  51 /* Match a string against a null-terminated list of identifiers.  */
  52 static int
  53 idmatch (struct tag_attr *tags, const char *tag, const char *attr)
  54 {
  55   int  i, j;
  56
  57   if (tag == NULL || attr == NULL)
  58     return FALSE;
  59
  60   for (i = 0; tags[i].tag; i++)
  61     /* Loop through all the tags wget ever cares about. */
  62     if (!strcasecmp (tags[i].tag, tag) && !strcasecmp (tags[i].attr, attr))
  63       /* The tag and attribute matched one of the ones wget cares about. */
  64       {
  65         if (opt.ignore_tags)
  66           /* --ignore-tags was specified.  Do not match these specific tags.
  67              --ignore-tags takes precedence over --follow-tags, so we process
  68              --ignore first and fall through if there's no match. */
  69           for (j = 0; opt.ignore_tags[j] != NULL; j++)
  70             /* Loop through all the tags this user doesn't care about. */
  71             if (strcasecmp(opt.ignore_tags[j], tag) == EQ)
  72               return FALSE;
  73
  74         if (opt.follow_tags)
  75           /* --follow-tags was specified.  Only match these specific tags, so
  76              return FALSE if we don't match one of them. */
  77           {
  78             for (j = 0; opt.follow_tags[j] != NULL; j++)
  79               /* Loop through all the tags this user cares about. */
  80               if (strcasecmp(opt.follow_tags[j], tag) == EQ)
  81                 return TRUE;
  82
  83             return FALSE;  /* wasn't one of the explicitly desired tags */
  84           }
  85
  86         /* If we get to here, --follow-tags isn't being used, and --ignore-tags,
  87            if specified, didn't include this tag, so it's okay to follow. */
  88         return TRUE;
  89       }
  90
  91   return FALSE;  /* not one of the tag/attribute pairs wget ever cares about */
  92 }
  93
  94 /* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
  95    describing URLs to follow.  When a tag is encountered, extract its
  96    components (as described by html_allow[] array), and return the
  97    address and the length of the string.  Return NULL if no URL is
  98    found.  */
  99 const char *
 100 htmlfindurl (const char *buf, int bufsize, int *size, int init,
 101              int dash_p_leaf_HTML)
 102 {
 103   const char *p, *ph;
 104   state_t    *s = &global_state;
 105
 106   /* NULL-terminated list of tags and modifiers someone would want to
 107      follow -- feel free to edit to suit your needs: */
 108   static struct tag_attr html_allow[] = {
 109     { "script", "src" },
 110     { "img", "src" },
 111     { "img", "href" },
 112     { "body", "background" },
 113     { "frame", "src" },
 114     { "iframe", "src" },
 115     { "fig", "src" },
 116     { "overlay", "src" },
 117     { "applet", "code" },
 118     { "script", "src" },
 119     { "embed", "src" },
 120     { "bgsound", "src" },
 121     { "img", "lowsrc" },
 122     { "input", "src" },
 123     { "layer", "src" },
 124     { "table", "background"},
 125     { "th", "background"},
 126     { "td", "background"},
 127     /* Tags below this line are treated specially.  */
 128     { "a", "href" },
 129     { "area", "href" },
 130     { "base", "href" },
 131     { "link", "href" },
 132     { "link", "rel" },
 133     { "meta", "content" },
 134     { NULL, NULL }
 135   };
 136
 137   if (init)
 138     {
 139       DEBUGP (("Resetting a parser state.\n"));
 140       memset (s, 0, sizeof (*s));
 141     }
 142
 143   while (1)
 144     {
 145       const char*  link_href = NULL;
 146       const char*  link_rel = NULL;
 147       int          link_href_saved_size = 0; /* init. just to shut up warning */
 148
 149       if (!bufsize)
 150         break;
 151       /* Let's look for a tag, if we are not already in one.  */
 152       if (!s->at_value)
 153         {
 154           /* Find '<'.  */
 155           if (*buf != '<')
 156             for (; bufsize && *buf != '<'; ++buf, --bufsize);
 157           if (!bufsize)
 158             break;
 159           /* Skip spaces.  */
 160           for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
 161                ++buf, --bufsize);
 162           if (!bufsize)
 163             break;
 164           p = buf;
 165           /* Find the tag end.  */
 166           for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
 167                ++buf, --bufsize);
 168           if (!bufsize)
 169             break;
 170           if (*buf == '=')
 171             {
 172               /* <tag=something> is illegal.  Just skip it.  */
 173               ++buf, --bufsize;
 174               continue;
 175             }
 176           if (p == buf)
 177             {
 178               /* *buf == '>'.  */
 179               ++buf, --bufsize;
 180               continue;
 181             }
 182           s->tag = strdupdelim (p, buf);
 183           if (*buf == '>')
 184             {
 185               xfree (s->tag);
 186               s->tag = NULL;
 187               ++buf, --bufsize;
 188               continue;
 189             }
 190         }
 191       else                      /* s->at_value */
 192         {
 193           /* Reset AT_VALUE.  */
 194           s->at_value = 0;
 195           /* If in quotes, just skip out of them and continue living.  */
 196           if (s->in_quote)
 197             {
 198               s->in_quote = 0;
 199               for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
 200               if (!bufsize)
 201                 break;
 202               ++buf, --bufsize;
 203             }
 204           if (!bufsize)
 205             break;
 206           if (*buf == '>')
 207             {
 208               FREE_MAYBE (s->tag);
 209               FREE_MAYBE (s->attr);
 210               s->tag = s->attr = NULL;
 211               continue;
 212             }
 213         }
 214       /* Find the attributes.  */
 215       do
 216         {
 217           FREE_MAYBE (s->attr);
 218           s->attr = NULL;
 219           if (!bufsize)
 220             break;
 221           /* Skip the spaces if we have them.  We don't have them at
 222              places like <img alt="something"src="something-else">.
 223                                              ^ no spaces here */
 224           if (ISSPACE (*buf))
 225             for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
 226                  ++buf, --bufsize);
 227           if (!bufsize || *buf == '>')
 228             break;
 229           if (*buf == '=')
 230             {
 231               /* This is the case of <tag = something>, which is
 232                  illegal.  Just skip it.  */
 233               ++buf, --bufsize;
 234               continue;
 235             }
 236           p = buf;
 237           /* Find the attribute end.  */
 238           for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
 239                ++buf, --bufsize);
 240           if (!bufsize || *buf == '>')
 241             break;
 242           /* Construct the attribute.  */
 243           s->attr = strdupdelim (p, buf);
 244           /* Now we must skip the spaces to find '='.  */
 245           if (*buf != '=')
 246             {
 247               for (; bufsize && ISSPACE (*buf) && *buf != '>';
 248                    ++buf, --bufsize);
 249               if (!bufsize || *buf == '>')
 250                 break;
 251             }
 252           /* If we still don't have '=', something is amiss.  */
 253           if (*buf != '=')
 254             continue;
 255           /* Find the beginning of attribute value by skipping the
 256              spaces.  */
 257           ++buf, --bufsize;
 258           for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
 259           if (!bufsize || *buf == '>')
 260             break;
 261           ph = NULL;
 262           /* The value of an attribute can, but does not have to be
 263              quoted.  */
 264           if (*buf == '\"' || *buf == '\'')
 265             {
 266               s->in_quote = 1;
 267               s->quote_char = *buf;
 268               p = buf + 1;
 269               for (++buf, --bufsize;
 270                    bufsize && *buf != s->quote_char && *buf != '\n';
 271                    ++buf, --bufsize)
 272                 if (!ph && *buf == '#' && *(buf - 1) != '&')
 273                   ph = buf;
 274               if (!bufsize)
 275                 {
 276                   s->in_quote = 0;
 277                   break;
 278                 }
 279               if (*buf == '\n')
 280                 {
 281                   /* #### Is the following logic good?
 282
 283                      Obviously no longer in quote.  It might be well
 284                      to check whether '>' was encountered, but that
 285                      would be encouraging writers of invalid HTMLs,
 286                      and we don't want that, now do we?  */
 287                   s->in_quote = 0;
 288                   continue;
 289                 }
 290             }
 291           else
 292             {
 293               p = buf;
 294               for (; bufsize && !ISSPACE (*buf) && *buf != '>';
 295                    ++buf, --bufsize)
 296                 if (!ph && *buf == '#' && *(buf - 1) != '&')
 297                   ph = buf;
 298               if (!bufsize)
 299                 break;
 300             }
 301           /* If '#' was found unprotected in a URI, it is probably an
 302              HTML marker, or color spec.  */
 303           *size = (ph ? ph : buf) - p;
 304           /* The URI is liable to be returned if:
 305              1) *size != 0;
 306              2) its tag and attribute are found in html_allow.  */
 307           if (*size && idmatch (html_allow, s->tag, s->attr))
 308             {
 309               if (strcasecmp(s->tag, "a") == EQ ||
 310                   strcasecmp(s->tag, "area") == EQ)
 311                 {
 312                   /* Only follow these if we're not at a -p leaf node, as they
 313                      always link to external documents. */
 314                   if (!dash_p_leaf_HTML)
 315                     {
 316                       s->at_value = 1;
 317                       return p;
 318                     }
 319                 }
 320               else if (!strcasecmp (s->tag, "base") &&
 321                        !strcasecmp (s->attr, "href"))
 322                 {
 323                   FREE_MAYBE (s->base);
 324                   s->base = strdupdelim (p, buf);
 325                 }
 326               else if (strcasecmp(s->tag, "link") == EQ)
 327                 {
 328                   if (strcasecmp(s->attr, "href") == EQ)
 329                     {
 330                       link_href = p;
 331                       link_href_saved_size = *size;  /* for restoration below */
 332                     }
 333                   else if (strcasecmp(s->attr, "rel") == EQ)
 334                     link_rel = p;
 335
 336                   if (link_href != NULL && link_rel != NULL)
 337                     /* Okay, we've now seen this <LINK> tag's HREF and REL
 338                        attributes (they may be in either order), so it's now
 339                        possible to decide if we want to traverse it. */
 340                     if (!dash_p_leaf_HTML ||
 341                         strncasecmp(link_rel, "stylesheet",
 342                                     sizeof("stylesheet") - 1) == EQ)
 343                       /* In the normal case, all <LINK> tags are fair game.
 344
 345                          In the special case of when -p is active, however, and
 346                          we're at a leaf node (relative to the -l max. depth) in
 347                          the HTML document tree, the only <LINK> tag we'll
 348                          follow is a <LINK REL="stylesheet">, as it's necessary
 349                          for displaying this document properly.  We won't follow
 350                          other <LINK> tags, like <LINK REL="home">, for
 351                          instance, as they refer to external documents.
 352
 353                          Note that the above strncasecmp() will incorrectly
 354                          consider something like '<LINK REL="stylesheet.old"' as
 355                          equivalent to '<LINK REL="stylesheet"'.  Not really
 356                          worth the trouble to explicitly check for such cases --
 357                          if time is spent, it should be spent ripping out wget's
 358                          somewhat kludgy HTML parser and hooking in a real,
 359                          componentized one. */
 360                       {
 361                         /* When we return, the 'size' IN/OUT parameter
 362                            determines where in the buffer the end of the current
 363                            attribute value is.  If REL came after HREF in this
 364                            <LINK> tag, size is currently set to the size for
 365                            REL's value -- set it to what it was when we were
 366                            looking at HREF's value. */
 367                         *size = link_href_saved_size;
 368
 369                         s->at_value = 1;
 370                         return link_href;
 371                       }
 372                 }
 373               else if (!strcasecmp (s->tag, "meta") &&
 374                        !strcasecmp (s->attr, "content"))
 375                 {
 376                   /* Some pages use a META tag to specify that the page
 377                      be refreshed by a new page after a given number of
 378                      seconds.  We need to attempt to extract an URL for
 379                      the new page from the other garbage present.  The
 380                      general format for this is:
 381                      <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">
 382
 383                      So we just need to skip past the "0; URL="
 384                      garbage to get to the URL.  META tags are also
 385                      used for specifying random things like the page
 386                      author's name and what editor was used to create
 387                      it.  So we need to be careful to ignore them and
 388                      not assume that an URL will be present at all.  */
 389                   for (; *size && ISDIGIT (*p); p++, *size -= 1);
 390                   if (*p == ';')
 391                     {
 392                       for (p++, *size -= 1;
 393                            *size && ISSPACE (*p);
 394                            p++, *size -= 1) ;
 395                       if (!strncasecmp (p, "URL=", 4))
 396                         {
 397                           p += 4, *size -= 4;
 398                           s->at_value = 1;
 399                           return p;
 400                         }
 401                     }
 402                 }
 403               else
 404                 {
 405                   s->at_value = 1;
 406                   return p;
 407                 }
 408             }
 409           /* Exit from quote.  */
 410           if (*buf == s->quote_char)
 411             {
 412               s->in_quote = 0;
 413               ++buf, --bufsize;
 414             }
 415         } while (*buf != '>');
 416       FREE_MAYBE (s->tag);
 417       FREE_MAYBE (s->attr);
 418       s->tag = s->attr = NULL;
 419       if (!bufsize)
 420         break;
 421     }
 422
 423   FREE_MAYBE (s->tag);
 424   FREE_MAYBE (s->attr);
 425   FREE_MAYBE (s->base);
 426   memset (s, 0, sizeof (*s));   /* just to be sure */
 427   DEBUGP (("HTML parser ends here (state destroyed).\n"));
 428   return NULL;
 429 }
 430
 431 /* The function returns the base reference of HTML buffer id, or NULL
 432    if one wasn't defined for that buffer.  */
 433 const char *
 434 html_base (void)
 435 {
 436   return global_state.base;
 437 }
 438
 439 /* Create a malloc'ed copy of text in the range [beg, end), but with
 440    the HTML entities processed.  Recognized entities are &lt, &gt,
 441    &amp, &quot, &nbsp and the numerical entities.  */
 442
 443 char *
 444 html_decode_entities (const char *beg, const char *end)
 445 {
 446   char *newstr = (char *)xmalloc (end - beg + 1); /* assume worst-case. */
 447   const char *from = beg;
 448   char *to = newstr;
 449
 450   while (from < end)
 451     {
 452       if (*from != '&')
 453         *to++ = *from++;
 454       else
 455         {
 456           const char *save = from;
 457           int remain;
 458
 459           if (++from == end) goto lose;
 460           remain = end - from;
 461
 462           if (*from == '#')
 463             {
 464               int numeric;
 465               ++from;
 466               if (from == end || !ISDIGIT (*from)) goto lose;
 467               for (numeric = 0; from < end && ISDIGIT (*from); from++)
 468                 numeric = 10 * numeric + (*from) - '0';
 469               if (from < end && ISALPHA (*from)) goto lose;
 470               numeric &= 0xff;
 471               *to++ = numeric;
 472             }
 473 #define FROB(literal) (remain >= (sizeof (literal) - 1)                 \
 474                  && !memcmp (from, literal, sizeof (literal) - 1)       \
 475                  && (*(from + sizeof (literal) - 1) == ';'              \
 476                      || remain == sizeof (literal) - 1                  \
 477                      || !ISALNUM (*(from + sizeof (literal) - 1))))
 478           else if (FROB ("lt"))
 479             *to++ = '<', from += 2;
 480           else if (FROB ("gt"))
 481             *to++ = '>', from += 2;
 482           else if (FROB ("amp"))
 483             *to++ = '&', from += 3;
 484           else if (FROB ("quot"))
 485             *to++ = '\"', from += 4;
 486           /* We don't implement the "Added Latin 1" entities proposed
 487              by rfc1866 (except for nbsp), because it is unnecessary
 488              in the context of Wget, and would require hashing to work
 489              efficiently.  */
 490           else if (FROB ("nbsp"))
 491             *to++ = 160, from += 4;
 492           else
 493             goto lose;
 494 #undef FROB
 495           /* If the entity was followed by `;', we step over the `;'.
 496              Otherwise, it was followed by either a non-alphanumeric
 497              or EOB, in which case we do nothing.  */
 498           if (from < end && *from == ';')
 499             ++from;
 500           continue;
 501
 502         lose:
 503           /* This was not an entity after all.  Back out.  */
 504           from = save;
 505           *to++ = *from++;
 506         }
 507     }
 508   *to++ = '\0';
 509   /* #### Should we try to do this: */
 510 #if 0
 511   newstr = xrealloc (newstr, to - newstr);
 512 #endif
 513   return newstr;
 514 }
 515
 516 /* The function returns the pointer to the malloc-ed quoted version of
 517    string s.  It will recognize and quote numeric and special graphic
 518    entities, as per RFC1866:
 519
 520    `&' -> `&amp;'
 521    `<' -> `&lt;'
 522    `>' -> `&gt;'
 523    `"' -> `&quot;'
 524
 525    No other entities are recognized or replaced.  */
 526 static char *
 527 html_quote_string (const char *s)
 528 {
 529   const char *b = s;
 530   char *p, *res;
 531   int i;
 532
 533   /* Pass through the string, and count the new size.  */
 534   for (i = 0; *s; s++, i++)
 535     {
 536       if (*s == '&')
 537         i += 4;                /* `amp;' */
 538       else if (*s == '<' || *s == '>')
 539         i += 3;                /* `lt;' and `gt;' */
 540       else if (*s == '\"')
 541         i += 5;                /* `quot;' */
 542     }
 543   res = (char *)xmalloc (i + 1);
 544   s = b;
 545   for (p = res; *s; s++)
 546     {
 547       switch (*s)
 548         {
 549         case '&':
 550           *p++ = '&';
 551           *p++ = 'a';
 552           *p++ = 'm';
 553           *p++ = 'p';
 554           *p++ = ';';
 555           break;
 556         case '<': case '>':
 557           *p++ = '&';
 558           *p++ = (*s == '<' ? 'l' : 'g');
 559           *p++ = 't';
 560           *p++ = ';';
 561           break;
 562         case '\"':
 563           *p++ = '&';
 564           *p++ = 'q';
 565           *p++ = 'u';
 566           *p++ = 'o';
 567           *p++ = 't';
 568           *p++ = ';';
 569           break;
 570         default:
 571           *p++ = *s;
 572         }
 573     }
 574   *p = '\0';
 575   return res;
 576 }
 577
 578 /* The function creates an HTML index containing references to given
 579    directories and files on the appropriate host.  The references are
 580    FTP.  */
 581 uerr_t
 582 ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
 583 {
 584   FILE *fp;
 585   char *upwd;
 586   char *htclfile;               /* HTML-clean file name */
 587
 588   if (!opt.dfp)
 589     {
 590       fp = fopen (file, "wb");
 591       if (!fp)
 592         {
 593           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 594           return FOPENERR;
 595         }
 596     }
 597   else
 598     fp = opt.dfp;
 599   if (u->user)
 600     {
 601       char *tmpu, *tmpp;        /* temporary, clean user and passwd */
 602
 603       tmpu = CLEANDUP (u->user);
 604       tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
 605       upwd = (char *)xmalloc (strlen (tmpu)
 606                              + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
 607       sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
 608       xfree (tmpu);
 609       FREE_MAYBE (tmpp);
 610     }
 611   else
 612     upwd = xstrdup ("");
 613   fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
 614   fprintf (fp, "<html>\n<head>\n<title>");
 615   fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
 616   fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
 617   fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
 618   fprintf (fp, "</h1>\n<hr>\n<pre>\n");
 619   while (f)
 620     {
 621       fprintf (fp, "  ");
 622       if (f->tstamp != -1)
 623         {
 624           /* #### Should we translate the months? */
 625           static char *months[] = {
 626             "Jan", "Feb", "Mar", "Apr", "May", "Jun",
 627             "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
 628           };
 629           struct tm *ptm = localtime ((time_t *)&f->tstamp);
 630
 631           fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
 632                   ptm->tm_mday);
 633           if (ptm->tm_hour)
 634             fprintf (fp, "%02d:%02d  ", ptm->tm_hour, ptm->tm_min);
 635           else
 636             fprintf (fp, "       ");
 637         }
 638       else
 639         fprintf (fp, _("time unknown       "));
 640       switch (f->type)
 641         {
 642         case FT_PLAINFILE:
 643           fprintf (fp, _("File        "));
 644           break;
 645         case FT_DIRECTORY:
 646           fprintf (fp, _("Directory   "));
 647           break;
 648         case FT_SYMLINK:
 649           fprintf (fp, _("Link        "));
 650           break;
 651         default:
 652           fprintf (fp, _("Not sure    "));
 653           break;
 654         }
 655       htclfile = html_quote_string (f->name);
 656       fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
 657       if (*u->dir != '/')
 658         putc ('/', fp);
 659       fprintf (fp, "%s", u->dir);
 660       if (*u->dir)
 661         putc ('/', fp);
 662       fprintf (fp, "%s", htclfile);
 663       if (f->type == FT_DIRECTORY)
 664         putc ('/', fp);
 665       fprintf (fp, "\">%s", htclfile);
 666       if (f->type == FT_DIRECTORY)
 667         putc ('/', fp);
 668       fprintf (fp, "</a> ");
 669       if (f->type == FT_PLAINFILE)
 670         fprintf (fp, _(" (%s bytes)"), legible (f->size));
 671       else if (f->type == FT_SYMLINK)
 672         fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
 673       putc ('\n', fp);
 674       xfree (htclfile);
 675       f = f->next;
 676     }
 677   fprintf (fp, "</pre>\n</body>\n</html>\n");
 678   xfree (upwd);
 679   if (!opt.dfp)
 680     fclose (fp);
 681   else
 682     fflush (fp);
 683   return FTPOK;
 684 }