sjero.net Git - wget/blob - src/html.c

   1 /* A simple HTML parser.
   2    Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <ctype.h>
  23 #ifdef HAVE_STRING_H
  24 # include <string.h>
  25 #else
  26 # include <strings.h>
  27 #endif
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <sys/types.h>
  31 #include <errno.h>
  32
  33 #include "wget.h"
  34 #include "url.h"
  35 #include "utils.h"
  36 #include "ftp.h"
  37 #include "html.h"
  38
  39 #ifndef errno
  40 extern int errno;
  41 #endif
  42
/* Parser state used by htmlfindurl() (which reads and updates it on
   every call) and by html_base() (which reads its `base' member).
   Being a file-scope static object, it persists between calls.  */
static state_t global_state;
  44
  45 struct tag_attr {
  46   char *tag;
  47   char *attr;
  48 };
  49
  50
  51 /* Match a string against a null-terminated list of identifiers.  */
  52 static int
  53 idmatch (struct tag_attr *tags, const char *tag, const char *attr)
  54 {
  55   int  i, j;
  56
  57   if (tag == NULL || attr == NULL)
  58     return FALSE;
  59
  60   for (i = 0; tags[i].tag; i++)
  61     /* Loop through all the tags wget ever cares about. */
  62     if (!strcasecmp (tags[i].tag, tag) && !strcasecmp (tags[i].attr, attr))
  63       /* The tag and attribute matched one of the ones wget cares about. */
  64       {
  65         if (opt.ignore_tags)
  66           /* --ignore-tags was specified.  Do not match these specific tags.
  67              --ignore-tags takes precedence over --follow-tags, so we process
  68              --ignore first and fall through if there's no match. */
  69           for (j = 0; opt.ignore_tags[j] != NULL; j++)
  70             /* Loop through all the tags this user doesn't care about. */
  71             if (strcasecmp(opt.ignore_tags[j], tag) == EQ)
  72               return FALSE;
  73
  74         if (opt.follow_tags)
  75           /* --follow-tags was specified.  Only match these specific tags, so
  76              return FALSE if we don't match one of them. */
  77           {
  78             for (j = 0; opt.follow_tags[j] != NULL; j++)
  79               /* Loop through all the tags this user cares about. */
  80               if (strcasecmp(opt.follow_tags[j], tag) == EQ)
  81                 return TRUE;
  82
  83             return FALSE;  /* wasn't one of the explicitly desired tags */
  84           }
  85
  86         /* If we get to here, --follow-tags isn't being used, and --ignore-tags,
  87            if specified, didn't include this tag, so it's okay to follow. */
  88         return TRUE;
  89       }
  90
  91   return FALSE;  /* not one of the tag/attribute pairs wget ever cares about */
  92 }
  93
  94
  95 /* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
  96    describing URLs to follow.  When a tag is encountered, extract its
  97    components (as described by html_allow[] array), and return the
  98    address and the length of the string.  Return NULL if no URL is
  99    found.  */
 100 const char *
 101 htmlfindurl (const char *buf, int bufsize, int *size, int init,
 102              int dash_p_leaf_HTML)
 103 {
 104   const char *p, *ph;
 105   state_t    *s = &global_state;
 106
 107   /* NULL-terminated list of tags and modifiers someone would want to
 108      follow -- feel free to edit to suit your needs: */
 109   static struct tag_attr html_allow[] = {
 110     { "script", "src" },
 111     { "img", "src" },
 112     { "img", "href" },
 113     { "body", "background" },
 114     { "frame", "src" },
 115     { "iframe", "src" },
 116     { "fig", "src" },
 117     { "overlay", "src" },
 118     { "applet", "code" },
 119     { "script", "src" },
 120     { "embed", "src" },
 121     { "bgsound", "src" },
 122     { "img", "lowsrc" },
 123     { "input", "src" },
 124     { "layer", "src" },
 125     { "table", "background"},
 126     { "th", "background"},
 127     { "td", "background"},
 128     /* Tags below this line are treated specially.  */
 129     { "a", "href" },
 130     { "area", "href" },
 131     { "base", "href" },
 132     { "link", "href" },
 133     { "link", "rel" },
 134     { "meta", "content" },
 135     { NULL, NULL }
 136   };
 137
 138   if (init)
 139     {
 140       DEBUGP (("Resetting a parser state.\n"));
 141       memset (s, 0, sizeof (*s));
 142     }
 143
 144   while (1)
 145     {
 146       const char*  link_href = NULL;
 147       const char*  link_rel = NULL;
 148       int          link_href_saved_size;
 149
 150       if (!bufsize)
 151         break;
 152       /* Let's look for a tag, if we are not already in one.  */
 153       if (!s->at_value)
 154         {
 155           /* Find '<'.  */
 156           if (*buf != '<')
 157             for (; bufsize && *buf != '<'; ++buf, --bufsize);
 158           if (!bufsize)
 159             break;
 160           /* Skip spaces.  */
 161           for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
 162                ++buf, --bufsize);
 163           if (!bufsize)
 164             break;
 165           p = buf;
 166           /* Find the tag end.  */
 167           for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
 168                ++buf, --bufsize);
 169           if (!bufsize)
 170             break;
 171           if (*buf == '=')
 172             {
 173               /* <tag=something> is illegal.  Just skip it.  */
 174               ++buf, --bufsize;
 175               continue;
 176             }
 177           if (p == buf)
 178             {
 179               /* *buf == '>'.  */
 180               ++buf, --bufsize;
 181               continue;
 182             }
 183           s->tag = strdupdelim (p, buf);
 184           if (*buf == '>')
 185             {
 186               free (s->tag);
 187               s->tag = NULL;
 188               ++buf, --bufsize;
 189               continue;
 190             }
 191         }
 192       else                      /* s->at_value */
 193         {
 194           /* Reset AT_VALUE.  */
 195           s->at_value = 0;
 196           /* If in quotes, just skip out of them and continue living.  */
 197           if (s->in_quote)
 198             {
 199               s->in_quote = 0;
 200               for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
 201               if (!bufsize)
 202                 break;
 203               ++buf, --bufsize;
 204             }
 205           if (!bufsize)
 206             break;
 207           if (*buf == '>')
 208             {
 209               FREE_MAYBE (s->tag);
 210               FREE_MAYBE (s->attr);
 211               s->tag = s->attr = NULL;
 212               continue;
 213             }
 214         }
 215       /* Find the attributes.  */
 216       do
 217         {
 218           FREE_MAYBE (s->attr);
 219           s->attr = NULL;
 220           if (!bufsize)
 221             break;
 222           /* Skip the spaces if we have them.  We don't have them at
 223              places like <img alt="something"src="something-else">.
 224                                              ^ no spaces here */
 225           if (ISSPACE (*buf))
 226             for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
 227                  ++buf, --bufsize);
 228           if (!bufsize || *buf == '>')
 229             break;
 230           if (*buf == '=')
 231             {
 232               /* This is the case of <tag = something>, which is
 233                  illegal.  Just skip it.  */
 234               ++buf, --bufsize;
 235               continue;
 236             }
 237           p = buf;
 238           /* Find the attribute end.  */
 239           for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
 240                ++buf, --bufsize);
 241           if (!bufsize || *buf == '>')
 242             break;
 243           /* Construct the attribute.  */
 244           s->attr = strdupdelim (p, buf);
 245           /* Now we must skip the spaces to find '='.  */
 246           if (*buf != '=')
 247             {
 248               for (; bufsize && ISSPACE (*buf) && *buf != '>';
 249                    ++buf, --bufsize);
 250               if (!bufsize || *buf == '>')
 251                 break;
 252             }
 253           /* If we still don't have '=', something is amiss.  */
 254           if (*buf != '=')
 255             continue;
 256           /* Find the beginning of attribute value by skipping the
 257              spaces.  */
 258           ++buf, --bufsize;
 259           for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
 260           if (!bufsize || *buf == '>')
 261             break;
 262           ph = NULL;
 263           /* The value of an attribute can, but does not have to be
 264              quoted.  */
 265           if (*buf == '\"' || *buf == '\'')
 266             {
 267               s->in_quote = 1;
 268               s->quote_char = *buf;
 269               p = buf + 1;
 270               for (++buf, --bufsize;
 271                    bufsize && *buf != s->quote_char && *buf != '\n';
 272                    ++buf, --bufsize)
 273                 if (ph && *buf == '#')
 274                   ph = buf;
 275               if (!bufsize)
 276                 {
 277                   s->in_quote = 0;
 278                   break;
 279                 }
 280               if (*buf == '\n')
 281                 {
 282                   /* #### Is the following logic good?
 283
 284                      Obviously no longer in quote.  It might be well
 285                      to check whether '>' was encountered, but that
 286                      would be encouraging writers of invalid HTMLs,
 287                      and we don't want that, now do we?  */
 288                   s->in_quote = 0;
 289                   continue;
 290                 }
 291             }
 292           else
 293             {
 294               p = buf;
 295               for (; bufsize && !ISSPACE (*buf) && *buf != '>';
 296                    ++buf, --bufsize)
 297                 if (ph && *buf == '#')
 298                   ph = buf;
 299               if (!bufsize)
 300                 break;
 301             }
 302           /* If '#' was found unprotected in a URI, it is probably an
 303              HTML marker, or color spec.  */
 304           *size = (ph ? ph : buf) - p;
 305           /* The URI is liable to be returned if:
 306              1) *size != 0;
 307              2) its tag and attribute are found in html_allow.  */
 308           if (*size && idmatch (html_allow, s->tag, s->attr))
 309             {
 310               if (strcasecmp(s->tag, "a") == EQ ||
 311                   strcasecmp(s->tag, "area") == EQ)
 312                 {
 313                   /* Only follow these if we're not at a -p leaf node, as they
 314                      always link to external documents. */
 315                   if (!dash_p_leaf_HTML)
 316                     {
 317                       s->at_value = 1;
 318                       return p;
 319                     }
 320                 }
 321               else if (!strcasecmp (s->tag, "base") &&
 322                        !strcasecmp (s->attr, "href"))
 323                 {
 324                   FREE_MAYBE (s->base);
 325                   s->base = strdupdelim (p, buf);
 326                 }
 327               else if (strcasecmp(s->tag, "link") == EQ)
 328                 {
 329                   if (strcasecmp(s->attr, "href") == EQ)
 330                     {
 331                       link_href = p;
 332                       link_href_saved_size = *size;  /* for restoration below */
 333                     }
 334                   else if (strcasecmp(s->attr, "rel") == EQ)
 335                     link_rel = p;
 336
 337                   if (link_href != NULL && link_rel != NULL)
 338                     /* Okay, we've now seen this <LINK> tag's HREF and REL
 339                        attributes (they may be in either order), so it's now
 340                        possible to decide if we want to traverse it. */
 341                     if (!dash_p_leaf_HTML ||
 342                         strncasecmp(link_rel, "stylesheet",
 343                                     sizeof("stylesheet") - 1) == EQ)
 344                       /* In the normal case, all <LINK> tags are fair game.
 345
 346                          In the special case of when -p is active, however, and
 347                          we're at a leaf node (relative to the -l max. depth) in
 348                          the HTML document tree, the only <LINK> tag we'll
 349                          follow is a <LINK REL="stylesheet">, as it's necessary
 350                          for displaying this document properly.  We won't follow
 351                          other <LINK> tags, like <LINK REL="home">, for
 352                          instance, as they refer to external documents.
 353
 354                          Note that the above strncasecmp() will incorrectly
 355                          consider something like '<LINK REL="stylesheet.old"' as
 356                          equivalent to '<LINK REL="stylesheet"'.  Not really
 357                          worth the trouble to explicitly check for such cases --
 358                          if time is spent, it should be spent ripping out wget's
 359                          somewhat kludgy HTML parser and hooking in a real,
 360                          componentized one. */
 361                       {
 362                         /* When we return, the 'size' IN/OUT parameter
 363                            determines where in the buffer the end of the current
 364                            attribute value is.  If REL came after HREF in this
 365                            <LINK> tag, size is currently set to the size for
 366                            REL's value -- set it to what it was when we were
 367                            looking at HREF's value. */
 368                         *size = link_href_saved_size;
 369
 370                         s->at_value = 1;
 371                         return link_href;
 372                       }
 373                 }
 374               else if (!strcasecmp (s->tag, "meta") &&
 375                        !strcasecmp (s->attr, "content"))
 376                 {
 377                   /* Some pages use a META tag to specify that the page
 378                      be refreshed by a new page after a given number of
 379                      seconds.  We need to attempt to extract an URL for
 380                      the new page from the other garbage present.  The
 381                      general format for this is:
 382                      <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">
 383
 384                      So we just need to skip past the "0; URL="
 385                      garbage to get to the URL.  META tags are also
 386                      used for specifying random things like the page
 387                      author's name and what editor was used to create
 388                      it.  So we need to be careful to ignore them and
 389                      not assume that an URL will be present at all.  */
 390                   for (; *size && ISDIGIT (*p); p++, *size -= 1);
 391                   if (*p == ';')
 392                     {
 393                       for (p++, *size -= 1;
 394                            *size && ISSPACE (*p);
 395                            p++, *size -= 1) ;
 396                       if (!strncasecmp (p, "URL=", 4))
 397                         {
 398                           p += 4, *size -= 4;
 399                           s->at_value = 1;
 400                           return p;
 401                         }
 402                     }
 403                 }
 404               else
 405                 {
 406                   s->at_value = 1;
 407                   return p;
 408                 }
 409             }
 410           /* Exit from quote.  */
 411           if (*buf == s->quote_char)
 412             {
 413               s->in_quote = 0;
 414               ++buf, --bufsize;
 415             }
 416         } while (*buf != '>');
 417       FREE_MAYBE (s->tag);
 418       FREE_MAYBE (s->attr);
 419       s->tag = s->attr = NULL;
 420       if (!bufsize)
 421         break;
 422     }
 423
 424   FREE_MAYBE (s->tag);
 425   FREE_MAYBE (s->attr);
 426   FREE_MAYBE (s->base);
 427   memset (s, 0, sizeof (*s));   /* just to be sure */
 428   DEBUGP (("HTML parser ends here (state destroyed).\n"));
 429   return NULL;
 430 }
 431
 432 /* The function returns the base reference of HTML buffer id, or NULL
 433    if one wasn't defined for that buffer.  */
 434 const char *
 435 html_base (void)
 436 {
 437   return global_state.base;
 438 }
 439
 440 /* The function returns the pointer to the malloc-ed quoted version of
 441    string s.  It will recognize and quote numeric and special graphic
 442    entities, as per RFC1866:
 443
 444    `&' -> `&amp;'
 445    `<' -> `&lt;'
 446    `>' -> `&gt;'
 447    `"' -> `&quot;'
 448
 449    No other entities are recognized or replaced.  */
 450 static char *
 451 html_quote_string (const char *s)
 452 {
 453   const char *b = s;
 454   char *p, *res;
 455   int i;
 456
 457   /* Pass through the string, and count the new size.  */
 458   for (i = 0; *s; s++, i++)
 459     {
 460       if (*s == '&')
 461         i += 4;                /* `amp;' */
 462       else if (*s == '<' || *s == '>')
 463         i += 3;                /* `lt;' and `gt;' */
 464       else if (*s == '\"')
 465         i += 5;                /* `quot;' */
 466     }
 467   res = (char *)xmalloc (i + 1);
 468   s = b;
 469   for (p = res; *s; s++)
 470     {
 471       switch (*s)
 472         {
 473         case '&':
 474           *p++ = '&';
 475           *p++ = 'a';
 476           *p++ = 'm';
 477           *p++ = 'p';
 478           *p++ = ';';
 479           break;
 480         case '<': case '>':
 481           *p++ = '&';
 482           *p++ = (*s == '<' ? 'l' : 'g');
 483           *p++ = 't';
 484           *p++ = ';';
 485           break;
 486         case '\"':
 487           *p++ = '&';
 488           *p++ = 'q';
 489           *p++ = 'u';
 490           *p++ = 'o';
 491           *p++ = 't';
 492           *p++ = ';';
 493           break;
 494         default:
 495           *p++ = *s;
 496         }
 497     }
 498   *p = '\0';
 499   return res;
 500 }
 501
 502 /* The function creates an HTML index containing references to given
 503    directories and files on the appropriate host.  The references are
 504    FTP.  */
 505 uerr_t
 506 ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
 507 {
 508   FILE *fp;
 509   char *upwd;
 510   char *htclfile;               /* HTML-clean file name */
 511
 512   if (!opt.dfp)
 513     {
 514       fp = fopen (file, "wb");
 515       if (!fp)
 516         {
 517           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 518           return FOPENERR;
 519         }
 520     }
 521   else
 522     fp = opt.dfp;
 523   if (u->user)
 524     {
 525       char *tmpu, *tmpp;        /* temporary, clean user and passwd */
 526
 527       tmpu = CLEANDUP (u->user);
 528       tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
 529       upwd = (char *)xmalloc (strlen (tmpu)
 530                              + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
 531       sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
 532       free (tmpu);
 533       FREE_MAYBE (tmpp);
 534     }
 535   else
 536     upwd = xstrdup ("");
 537   fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
 538   fprintf (fp, "<html>\n<head>\n<title>");
 539   fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
 540   fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
 541   fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
 542   fprintf (fp, "</h1>\n<hr>\n<pre>\n");
 543   while (f)
 544     {
 545       fprintf (fp, "  ");
 546       if (f->tstamp != -1)
 547         {
 548           /* #### Should we translate the months? */
 549           static char *months[] = {
 550             "Jan", "Feb", "Mar", "Apr", "May", "Jun",
 551             "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
 552           };
 553           struct tm *ptm = localtime ((time_t *)&f->tstamp);
 554
 555           fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
 556                   ptm->tm_mday);
 557           if (ptm->tm_hour)
 558             fprintf (fp, "%02d:%02d  ", ptm->tm_hour, ptm->tm_min);
 559           else
 560             fprintf (fp, "       ");
 561         }
 562       else
 563         fprintf (fp, _("time unknown       "));
 564       switch (f->type)
 565         {
 566         case FT_PLAINFILE:
 567           fprintf (fp, _("File        "));
 568           break;
 569         case FT_DIRECTORY:
 570           fprintf (fp, _("Directory   "));
 571           break;
 572         case FT_SYMLINK:
 573           fprintf (fp, _("Link        "));
 574           break;
 575         default:
 576           fprintf (fp, _("Not sure    "));
 577           break;
 578         }
 579       htclfile = html_quote_string (f->name);
 580       fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
 581       if (*u->dir != '/')
 582         putc ('/', fp);
 583       fprintf (fp, "%s", u->dir);
 584       if (*u->dir)
 585         putc ('/', fp);
 586       fprintf (fp, "%s", htclfile);
 587       if (f->type == FT_DIRECTORY)
 588         putc ('/', fp);
 589       fprintf (fp, "\">%s", htclfile);
 590       if (f->type == FT_DIRECTORY)
 591         putc ('/', fp);
 592       fprintf (fp, "</a> ");
 593       if (f->type == FT_PLAINFILE)
 594         fprintf (fp, _(" (%s bytes)"), legible (f->size));
 595       else if (f->type == FT_SYMLINK)
 596         fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
 597       putc ('\n', fp);
 598       free (htclfile);
 599       f = f->next;
 600     }
 601   fprintf (fp, "</pre>\n</body>\n</html>\n");
 602   free (upwd);
 603   if (!opt.dfp)
 604     fclose (fp);
 605   else
 606     fflush (fp);
 607   return FTPOK;
 608 }