sjero.net Git - wget/blob - src/html-parse.c

   1 /* HTML parser for Wget.
   2    Copyright (C) 1998-2006 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  18
  19 In addition, as a special exception, the Free Software Foundation
  20 gives permission to link the code of its release of Wget with the
  21 OpenSSL project's "OpenSSL" library (or with modified versions of it
  22 that use the same license as the "OpenSSL" library), and distribute
  23 the linked executables.  You must obey the GNU General Public License
  24 in all respects for all of the code used other than "OpenSSL".  If you
  25 modify this file, you may extend this exception to your version of the
  26 file, but you are not obligated to do so.  If you do not wish to do
  27 so, delete this exception statement from your version.  */
  28
  29 /* The only entry point to this module is map_html_tags(), which see.  */
  30
  31 /* TODO:
  32
  33    - Allow hooks for callers to process contents outside tags.  This
  34      is needed to implement handling <style> and <script>.  The
  35      taginfo structure already carries the information about where the
  36      tags are, but this is not enough, because one would also want to
  37      skip the comments.  (The funny thing is that for <style> and
  38      <script> you *don't* want to skip comments!)
  39
  40    - Create a test suite for regression testing. */
  41
  42 /* HISTORY:
  43
  44    This is the third HTML parser written for Wget.  The first one was
  45    written some time during the Geturl 1.0 beta cycle, and was very
  46    inefficient and buggy.  It also contained some very complex code to
  47    remember a list of parser states, because it was supposed to be
  48    reentrant.
  49
  50    The second HTML parser was written for Wget 1.4 (the first version
  51    by the name `Wget'), and was a complete rewrite.  Although the new
  52    parser behaved much better and made no claims of reentrancy, it
  53    still shared many of the fundamental flaws of the old version -- it
  54    only regarded HTML in terms tag-attribute pairs, where the
  55    attribute's value was a URL to be returned.  Any other property of
  56    HTML, such as <base href=...>, or strange way to specify a URL,
  57    such as <meta http-equiv=Refresh content="0; URL=..."> had to be
  58    crudely hacked in -- and the caller had to be aware of these hacks.
  59    Like its predecessor, this parser did not support HTML comments.
  60
  61    After Wget 1.5.1 was released, I set out to write a third HTML
  62    parser.  The objectives of the new parser were to: (1) provide a
  63    clean way to analyze HTML lexically, (2) separate interpretation of
  64    the markup from the parsing process, (3) be as correct as possible,
  65    e.g. correctly skipping comments and other SGML declarations, (4)
  66    understand the most common errors in markup and skip them or be
  67    relaxed towrds them, and (5) be reasonably efficient (no regexps,
  68    minimum copying and minimum or no heap allocation).
  69
  70    I believe this parser meets all of the above goals.  It is
  71    reasonably well structured, and could be relatively easily
  72    separated from Wget and used elsewhere.  While some of its
  73    intrinsic properties limit its value as a general-purpose HTML
  74    parser, I believe that, with minimum modifications, it could serve
  75    as a backend for one.
  76
  77    Due to time and other constraints, this parser was not integrated
  78    into Wget until the version 1.7. */
  79
  80 /* DESCRIPTION:
  81
  82    The single entry point of this parser is map_html_tags(), which
  83    works by calling a function you specify for each tag.  The function
  84    gets called with the pointer to a structure describing the tag and
  85    its attributes.  */
  86
  87 /* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
  88    still need Wget headers to compile.  */
  89
  90 #include <config.h>
  91
  92 #ifdef STANDALONE
  93 # define I_REALLY_WANT_CTYPE_MACROS
  94 #endif
  95
  96 #include <stdio.h>
  97 #include <stdlib.h>
  98 #include <string.h>
  99 #include <assert.h>
 100
 101 #include "wget.h"
 102 #include "html-parse.h"
 103
 104 #ifdef STANDALONE
 105 # undef xmalloc
 106 # undef xrealloc
 107 # undef xfree
 108 # define xmalloc malloc
 109 # define xrealloc realloc
 110 # define xfree free
 111
 112 # undef ISSPACE
 113 # undef ISDIGIT
 114 # undef ISXDIGIT
 115 # undef ISALPHA
 116 # undef ISALNUM
 117 # undef TOLOWER
 118 # undef TOUPPER
 119
 120 # define ISSPACE(x) isspace (x)
 121 # define ISDIGIT(x) isdigit (x)
 122 # define ISXDIGIT(x) isxdigit (x)
 123 # define ISALPHA(x) isalpha (x)
 124 # define ISALNUM(x) isalnum (x)
 125 # define TOLOWER(x) tolower (x)
 126 # define TOUPPER(x) toupper (x)
 127
 128 struct hash_table {
 129   int dummy;
 130 };
 131 static void *
 132 hash_table_get (const struct hash_table *ht, void *ptr)
 133 {
 134   return ptr;
 135 }
 136 #else  /* not STANDALONE */
 137 # include "hash.h"
 138 #endif
 139
 140 /* Pool support.  A pool is a resizable chunk of memory.  It is first
 141    allocated on the stack, and moved to the heap if it needs to be
 142    larger than originally expected.  map_html_tags() uses it to store
 143    the zero-terminated names and values of tags and attributes.
 144
 145    Thus taginfo->name, and attr->name and attr->value for each
 146    attribute, do not point into separately allocated areas, but into
 147    different parts of the pool, separated only by terminating zeros.
 148    This ensures minimum amount of allocation and, for most tags, no
 149    allocation because the entire pool is kept on the stack.  */
 150
 151 struct pool {
 152   char *contents;               /* pointer to the contents. */
 153   int size;                     /* size of the pool. */
 154   int tail;                     /* next available position index. */
 155   bool resized;                 /* whether the pool has been resized
 156                                    using malloc. */
 157
 158   char *orig_contents;          /* original pool contents, usually
 159                                    stack-allocated.  used by POOL_FREE
 160                                    to restore the pool to the initial
 161                                    state. */
 162   int orig_size;
 163 };
 164
 165 /* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
 166
 167 #define POOL_INIT(p, initial_storage, initial_size) do {        \
 168   struct pool *P = (p);                                         \
 169   P->contents = (initial_storage);                              \
 170   P->size = (initial_size);                                     \
 171   P->tail = 0;                                                  \
 172   P->resized = false;                                           \
 173   P->orig_contents = P->contents;                               \
 174   P->orig_size = P->size;                                       \
 175 } while (0)
 176
 177 /* Grow the pool to accomodate at least SIZE new bytes.  If the pool
 178    already has room to accomodate SIZE bytes of data, this is a no-op.  */
 179
 180 #define POOL_GROW(p, increase)                                  \
 181   GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + (increase), \
 182               (p)->resized, char)
 183
 184 /* Append text in the range [beg, end) to POOL.  No zero-termination
 185    is done.  */
 186
 187 #define POOL_APPEND(p, beg, end) do {                   \
 188   const char *PA_beg = (beg);                           \
 189   int PA_size = (end) - PA_beg;                         \
 190   POOL_GROW (p, PA_size);                               \
 191   memcpy ((p)->contents + (p)->tail, PA_beg, PA_size);  \
 192   (p)->tail += PA_size;                                 \
 193 } while (0)
 194
 195 /* Append one character to the pool.  Can be used to zero-terminate
 196    pool strings.  */
 197
 198 #define POOL_APPEND_CHR(p, ch) do {             \
 199   char PAC_char = (ch);                         \
 200   POOL_GROW (p, 1);                             \
 201   (p)->contents[(p)->tail++] = PAC_char;        \
 202 } while (0)
 203
 204 /* Forget old pool contents.  The allocated memory is not freed. */
 205 #define POOL_REWIND(p) (p)->tail = 0
 206
 207 /* Free heap-allocated memory for contents of POOL.  This calls
 208    xfree() if the memory was allocated through malloc.  It also
 209    restores `contents' and `size' to their original, pre-malloc
 210    values.  That way after POOL_FREE, the pool is fully usable, just
 211    as if it were freshly initialized with POOL_INIT.  */
 212
 213 #define POOL_FREE(p) do {                       \
 214   struct pool *P = p;                           \
 215   if (P->resized)                               \
 216     xfree (P->contents);                        \
 217   P->contents = P->orig_contents;               \
 218   P->size = P->orig_size;                       \
 219   P->tail = 0;                                  \
 220   P->resized = false;                           \
 221 } while (0)
 222
 223 /* Used for small stack-allocated memory chunks that might grow.  Like
 224    DO_REALLOC, this macro grows BASEVAR as necessary to take
 225    NEEDED_SIZE items of TYPE.
 226
 227    The difference is that on the first resize, it will use
 228    malloc+memcpy rather than realloc.  That way you can stack-allocate
 229    the initial chunk, and only resort to heap allocation if you
 230    stumble upon large data.
 231
 232    After the first resize, subsequent ones are performed with realloc,
 233    just like DO_REALLOC.  */
 234
 235 #define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do {           \
 236   long ga_needed_size = (needed_size);                                          \
 237   long ga_newsize = (sizevar);                                                  \
 238   while (ga_newsize < ga_needed_size)                                           \
 239     ga_newsize <<= 1;                                                           \
 240   if (ga_newsize != (sizevar))                                                  \
 241     {                                                                           \
 242       if (resized)                                                              \
 243         basevar = xrealloc (basevar, ga_newsize * sizeof (type));               \
 244       else                                                                      \
 245         {                                                                       \
 246           void *ga_new = xmalloc (ga_newsize * sizeof (type));                  \
 247           memcpy (ga_new, basevar, (sizevar) * sizeof (type));                  \
 248           (basevar) = ga_new;                                                   \
 249           resized = true;                                                       \
 250         }                                                                       \
 251       (sizevar) = ga_newsize;                                                   \
 252     }                                                                           \
 253 } while (0)
 254 \f
 255 /* Test whether n+1-sized entity name fits in P.  We don't support
 256    IE-style non-terminated entities, e.g. "&ltfoo" -> "<foo".
 257    However, "&lt;foo" will work, as will "&lt!foo", "&lt", etc.  In
 258    other words an entity needs to be terminated by either a
 259    non-alphanumeric or the end of string.  */
 260 #define FITS(p, n) (p + n == end || (p + n < end && !ISALNUM (p[n])))
 261
 262 /* Macros that test entity names by returning true if P is followed by
 263    the specified characters.  */
 264 #define ENT1(p, c0) (FITS (p, 1) && p[0] == c0)
 265 #define ENT2(p, c0, c1) (FITS (p, 2) && p[0] == c0 && p[1] == c1)
 266 #define ENT3(p, c0, c1, c2) (FITS (p, 3) && p[0]==c0 && p[1]==c1 && p[2]==c2)
 267
 268 /* Increment P by INC chars.  If P lands at a semicolon, increment it
 269    past the semicolon.  This ensures that e.g. "&lt;foo" is converted
 270    to "<foo", but "&lt,foo" to "<,foo".  */
 271 #define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)
 272
 273 /* Decode the HTML character entity at *PTR, considering END to be end
 274    of buffer.  It is assumed that the "&" character that marks the
 275    beginning of the entity has been seen at *PTR-1.  If a recognized
 276    ASCII entity is seen, it is returned, and *PTR is moved to the end
 277    of the entity.  Otherwise, -1 is returned and *PTR left unmodified.
 278
 279    The recognized entities are: &lt, &gt, &amp, &apos, and &quot.  */
 280
 281 static int
 282 decode_entity (const char **ptr, const char *end)
 283 {
 284   const char *p = *ptr;
 285   int value = -1;
 286
 287   if (++p == end)
 288     return -1;
 289
 290   switch (*p++)
 291     {
 292     case '#':
 293       /* Process numeric entities "&#DDD;" and "&#xHH;".  */
 294       {
 295         int digits = 0;
 296         value = 0;
 297         if (*p == 'x')
 298           for (++p; value < 256 && p < end && ISXDIGIT (*p); p++, digits++)
 299             value = (value << 4) + XDIGIT_TO_NUM (*p);
 300         else
 301           for (; value < 256 && p < end && ISDIGIT (*p); p++, digits++)
 302             value = (value * 10) + (*p - '0');
 303         if (!digits)
 304           return -1;
 305         /* Don't interpret 128+ codes and NUL because we cannot
 306            portably reinserted them into HTML.  */
 307         if (!value || (value & ~0x7f))
 308           return -1;
 309         *ptr = SKIP_SEMI (p, 0);
 310         return value;
 311       }
 312     /* Process named ASCII entities.  */
 313     case 'g':
 314       if (ENT1 (p, 't'))
 315         value = '>', *ptr = SKIP_SEMI (p, 1);
 316       break;
 317     case 'l':
 318       if (ENT1 (p, 't'))
 319         value = '<', *ptr = SKIP_SEMI (p, 1);
 320       break;
 321     case 'a':
 322       if (ENT2 (p, 'm', 'p'))
 323         value = '&', *ptr = SKIP_SEMI (p, 2);
 324       else if (ENT3 (p, 'p', 'o', 's'))
 325         /* handle &apos for the sake of the XML/XHTML crowd. */
 326         value = '\'', *ptr = SKIP_SEMI (p, 3);
 327       break;
 328     case 'q':
 329       if (ENT3 (p, 'u', 'o', 't'))
 330         value = '\"', *ptr = SKIP_SEMI (p, 3);
 331       break;
 332     }
 333   return value;
 334 }
 335 #undef ENT1
 336 #undef ENT2
 337 #undef ENT3
 338 #undef FITS
 339 #undef SKIP_SEMI
 340
 341 enum {
 342   AP_DOWNCASE           = 1,
 343   AP_DECODE_ENTITIES    = 2,
 344   AP_TRIM_BLANKS        = 4
 345 };
 346
 347 /* Copy the text in the range [BEG, END) to POOL, optionally
 348    performing operations specified by FLAGS.  FLAGS may be any
 349    combination of AP_DOWNCASE, AP_DECODE_ENTITIES and AP_TRIM_BLANKS
 350    with the following meaning:
 351
 352    * AP_DOWNCASE -- downcase all the letters;
 353
 354    * AP_DECODE_ENTITIES -- decode the named and numeric entities in
 355      the ASCII range when copying the string.
 356
 357    * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
 358      of text, as well as embedded newlines.  */
 359
 360 static void
 361 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
 362 {
 363   int old_tail = pool->tail;
 364
 365   /* Skip blanks if required.  We must do this before entities are
 366      processed, so that blanks can still be inserted as, for instance,
 367      `&#32;'.  */
 368   if (flags & AP_TRIM_BLANKS)
 369     {
 370       while (beg < end && ISSPACE (*beg))
 371         ++beg;
 372       while (end > beg && ISSPACE (end[-1]))
 373         --end;
 374     }
 375
 376   if (flags & AP_DECODE_ENTITIES)
 377     {
 378       /* Grow the pool, then copy the text to the pool character by
 379          character, processing the encountered entities as we go
 380          along.
 381
 382          It's safe (and necessary) to grow the pool in advance because
 383          processing the entities can only *shorten* the string, it can
 384          never lengthen it.  */
 385       const char *from = beg;
 386       char *to;
 387       bool squash_newlines = !!(flags & AP_TRIM_BLANKS);
 388
 389       POOL_GROW (pool, end - beg);
 390       to = pool->contents + pool->tail;
 391
 392       while (from < end)
 393         {
 394           if (*from == '&')
 395             {
 396               int entity = decode_entity (&from, end);
 397               if (entity != -1)
 398                 *to++ = entity;
 399               else
 400                 *to++ = *from++;
 401             }
 402           else if ((*from == '\n' || *from == '\r') && squash_newlines)
 403             ++from;
 404           else
 405             *to++ = *from++;
 406         }
 407       /* Verify that we haven't exceeded the original size.  (It
 408          shouldn't happen, hence the assert.)  */
 409       assert (to - (pool->contents + pool->tail) <= end - beg);
 410
 411       /* Make POOL's tail point to the position following the string
 412          we've written.  */
 413       pool->tail = to - pool->contents;
 414       POOL_APPEND_CHR (pool, '\0');
 415     }
 416   else
 417     {
 418       /* Just copy the text to the pool.  */
 419       POOL_APPEND (pool, beg, end);
 420       POOL_APPEND_CHR (pool, '\0');
 421     }
 422
 423   if (flags & AP_DOWNCASE)
 424     {
 425       char *p = pool->contents + old_tail;
 426       for (; *p; p++)
 427         *p = TOLOWER (*p);
 428     }
 429 }
 430 \f
 431 /* Originally we used to adhere to rfc 1866 here, and allowed only
 432    letters, digits, periods, and hyphens as names (of tags or
 433    attributes).  However, this broke too many pages which used
 434    proprietary or strange attributes, e.g. <img src="a.gif"
 435    v:shapes="whatever">.
 436
 437    So now we allow any character except:
 438      * whitespace
 439      * 8-bit and control chars
 440      * characters that clearly cannot be part of name:
 441        '=', '>', '/'.
 442
 443    This only affects attribute and tag names; attribute values allow
 444    an even greater variety of characters.  */
 445
 446 #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
 447                         && (x) != '=' && (x) != '>' && (x) != '/')
 448
 449 #ifdef STANDALONE
 450 static int comment_backout_count;
 451 #endif
 452
 453 /* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
 454    strict comments mode, this is used for skipping over comments as
 455    well.
 456
 457    To recap: any SGML declaration may have comments associated with
 458    it, e.g.
 459        <!MY-DECL -- isn't this fun? -- foo bar>
 460
 461    An HTML comment is merely an empty declaration (<!>) with a comment
 462    attached, like this:
 463        <!-- some stuff here -->
 464
 465    Several comments may be embedded in one comment declaration:
 466        <!-- have -- -- fun -->
 467
 468    Whitespace is allowed between and after the comments, but not
 469    before the first comment.  Additionally, this function attempts to
 470    handle double quotes in SGML declarations correctly.  */
 471
 472 static const char *
 473 advance_declaration (const char *beg, const char *end)
 474 {
 475   const char *p = beg;
 476   char quote_char = '\0';       /* shut up, gcc! */
 477   char ch;
 478
 479   enum {
 480     AC_S_DONE,
 481     AC_S_BACKOUT,
 482     AC_S_BANG,
 483     AC_S_DEFAULT,
 484     AC_S_DCLNAME,
 485     AC_S_DASH1,
 486     AC_S_DASH2,
 487     AC_S_COMMENT,
 488     AC_S_DASH3,
 489     AC_S_DASH4,
 490     AC_S_QUOTE1,
 491     AC_S_IN_QUOTE,
 492     AC_S_QUOTE2
 493   } state = AC_S_BANG;
 494
 495   if (beg == end)
 496     return beg;
 497   ch = *p++;
 498
 499   /* It looked like a good idea to write this as a state machine, but
 500      now I wonder...  */
 501
 502   while (state != AC_S_DONE && state != AC_S_BACKOUT)
 503     {
 504       if (p == end)
 505         state = AC_S_BACKOUT;
 506       switch (state)
 507         {
 508         case AC_S_DONE:
 509         case AC_S_BACKOUT:
 510           break;
 511         case AC_S_BANG:
 512           if (ch == '!')
 513             {
 514               ch = *p++;
 515               state = AC_S_DEFAULT;
 516             }
 517           else
 518             state = AC_S_BACKOUT;
 519           break;
 520         case AC_S_DEFAULT:
 521           switch (ch)
 522             {
 523             case '-':
 524               state = AC_S_DASH1;
 525               break;
 526             case ' ':
 527             case '\t':
 528             case '\r':
 529             case '\n':
 530               ch = *p++;
 531               break;
 532             case '>':
 533               state = AC_S_DONE;
 534               break;
 535             case '\'':
 536             case '\"':
 537               state = AC_S_QUOTE1;
 538               break;
 539             default:
 540               if (NAME_CHAR_P (ch))
 541                 state = AC_S_DCLNAME;
 542               else
 543                 state = AC_S_BACKOUT;
 544               break;
 545             }
 546           break;
 547         case AC_S_DCLNAME:
 548           if (ch == '-')
 549             state = AC_S_DASH1;
 550           else if (NAME_CHAR_P (ch))
 551             ch = *p++;
 552           else
 553             state = AC_S_DEFAULT;
 554           break;
 555         case AC_S_QUOTE1:
 556           /* We must use 0x22 because broken assert macros choke on
 557              '"' and '\"'.  */
 558           assert (ch == '\'' || ch == 0x22);
 559           quote_char = ch;      /* cheating -- I really don't feel like
 560                                    introducing more different states for
 561                                    different quote characters. */
 562           ch = *p++;
 563           state = AC_S_IN_QUOTE;
 564           break;
 565         case AC_S_IN_QUOTE:
 566           if (ch == quote_char)
 567             state = AC_S_QUOTE2;
 568           else
 569             ch = *p++;
 570           break;
 571         case AC_S_QUOTE2:
 572           assert (ch == quote_char);
 573           ch = *p++;
 574           state = AC_S_DEFAULT;
 575           break;
 576         case AC_S_DASH1:
 577           assert (ch == '-');
 578           ch = *p++;
 579           state = AC_S_DASH2;
 580           break;
 581         case AC_S_DASH2:
 582           switch (ch)
 583             {
 584             case '-':
 585               ch = *p++;
 586               state = AC_S_COMMENT;
 587               break;
 588             default:
 589               state = AC_S_BACKOUT;
 590             }
 591           break;
 592         case AC_S_COMMENT:
 593           switch (ch)
 594             {
 595             case '-':
 596               state = AC_S_DASH3;
 597               break;
 598             default:
 599               ch = *p++;
 600               break;
 601             }
 602           break;
 603         case AC_S_DASH3:
 604           assert (ch == '-');
 605           ch = *p++;
 606           state = AC_S_DASH4;
 607           break;
 608         case AC_S_DASH4:
 609           switch (ch)
 610             {
 611             case '-':
 612               ch = *p++;
 613               state = AC_S_DEFAULT;
 614               break;
 615             default:
 616               state = AC_S_COMMENT;
 617               break;
 618             }
 619           break;
 620         }
 621     }
 622
 623   if (state == AC_S_BACKOUT)
 624     {
 625 #ifdef STANDALONE
 626       ++comment_backout_count;
 627 #endif
 628       return beg + 1;
 629     }
 630   return p;
 631 }
 632
 633 /* Find the first occurrence of the substring "-->" in [BEG, END) and
 634    return the pointer to the character after the substring.  If the
 635    substring is not found, return NULL.  */
 636
 637 static const char *
 638 find_comment_end (const char *beg, const char *end)
 639 {
 640   /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
 641      if it's not '>' or '-', advance by three characters.  Otherwise,
 642      look at the preceding characters and try to find a match.  */
 643
 644   const char *p = beg - 1;
 645
 646   while ((p += 3) < end)
 647     switch (p[0])
 648       {
 649       case '>':
 650         if (p[-1] == '-' && p[-2] == '-')
 651           return p + 1;
 652         break;
 653       case '-':
 654       at_dash:
 655         if (p[-1] == '-')
 656           {
 657           at_dash_dash:
 658             if (++p == end) return NULL;
 659             switch (p[0])
 660               {
 661               case '>': return p + 1;
 662               case '-': goto at_dash_dash;
 663               }
 664           }
 665         else
 666           {
 667             if ((p += 2) >= end) return NULL;
 668             switch (p[0])
 669               {
 670               case '>':
 671                 if (p[-1] == '-')
 672                   return p + 1;
 673                 break;
 674               case '-':
 675                 goto at_dash;
 676               }
 677           }
 678       }
 679   return NULL;
 680 }
 681 \f
 682 /* Return true if the string containing of characters inside [b, e) is
 683    present in hash table HT.  */
 684
 685 static bool
 686 name_allowed (const struct hash_table *ht, const char *b, const char *e)
 687 {
 688   char *copy;
 689   if (!ht)
 690     return true;
 691   BOUNDED_TO_ALLOCA (b, e, copy);
 692   return hash_table_get (ht, copy) != NULL;
 693 }
 694
 695 /* Advance P (a char pointer), with the explicit intent of being able
 696    to read the next character.  If this is not possible, go to finish.  */
 697
 698 #define ADVANCE(p) do {                         \
 699   ++p;                                          \
 700   if (p >= end)                                 \
 701     goto finish;                                \
 702 } while (0)
 703
 704 /* Skip whitespace, if any. */
 705
 706 #define SKIP_WS(p) do {                         \
 707   while (ISSPACE (*p)) {                        \
 708     ADVANCE (p);                                \
 709   }                                             \
 710 } while (0)
 711
 712 /* Skip non-whitespace, if any. */
 713
 714 #define SKIP_NON_WS(p) do {                     \
 715   while (!ISSPACE (*p)) {                       \
 716     ADVANCE (p);                                \
 717   }                                             \
 718 } while (0)
 719
 720 #ifdef STANDALONE
 721 static int tag_backout_count;
 722 #endif
 723
 724 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
 725    MAPFUN will be called with two arguments: pointer to an initialized
 726    struct taginfo, and MAPARG.
 727
 728    ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of
 729    which are the tags and attribute names that this function should
 730    use.  If ALLOWED_TAGS is NULL, all tags are processed; if
 731    ALLOWED_ATTRIBUTES is NULL, all attributes are returned.
 732
 733    (Obviously, the caller can filter out unwanted tags and attributes
 734    just as well, but this is just an optimization designed to avoid
 735    unnecessary copying of tags/attributes which the caller doesn't
 736    care about.)  */
 737
 738 void
 739 map_html_tags (const char *text, int size,
 740                void (*mapfun) (struct taginfo *, void *), void *maparg,
 741                int flags,
 742                const struct hash_table *allowed_tags,
 743                const struct hash_table *allowed_attributes)
 744 {
 745   /* storage for strings passed to MAPFUN callback; if 256 bytes is
 746      too little, POOL_APPEND allocates more with malloc. */
 747   char pool_initial_storage[256];
 748   struct pool pool;
 749
 750   const char *p = text;
 751   const char *end = text + size;
 752
 753   struct attr_pair attr_pair_initial_storage[8];
 754   int attr_pair_size = countof (attr_pair_initial_storage);
 755   bool attr_pair_resized = false;
 756   struct attr_pair *pairs = attr_pair_initial_storage;
 757
 758   if (!size)
 759     return;
 760
 761   POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));
 762
 763   {
 764     int nattrs, end_tag;
 765     const char *tag_name_begin, *tag_name_end;
 766     const char *tag_start_position;
 767     bool uninteresting_tag;
 768
 769   look_for_tag:
 770     POOL_REWIND (&pool);
 771
 772     nattrs = 0;
 773     end_tag = 0;
 774
 775     /* Find beginning of tag.  We use memchr() instead of the usual
 776        looping with ADVANCE() for speed. */
 777     p = memchr (p, '<', end - p);
 778     if (!p)
 779       goto finish;
 780
 781     tag_start_position = p;
 782     ADVANCE (p);
 783
 784     /* Establish the type of the tag (start-tag, end-tag or
 785        declaration).  */
 786     if (*p == '!')
 787       {
 788         if (!(flags & MHT_STRICT_COMMENTS)
 789             && p < end + 3 && p[1] == '-' && p[2] == '-')
 790           {
 791             /* If strict comments are not enforced and if we know
 792                we're looking at a comment, simply look for the
 793                terminating "-->".  Non-strict is the default because
 794                it works in other browsers and most HTML writers can't
 795                be bothered with getting the comments right.  */
 796             const char *comment_end = find_comment_end (p + 3, end);
 797             if (comment_end)
 798               p = comment_end;
 799           }
 800         else
 801           {
 802             /* Either in strict comment mode or looking at a non-empty
 803                declaration.  Real declarations are much less likely to
 804                be misused the way comments are, so advance over them
 805                properly regardless of strictness.  */
 806             p = advance_declaration (p, end);
 807           }
 808         if (p == end)
 809           goto finish;
 810         goto look_for_tag;
 811       }
 812     else if (*p == '/')
 813       {
 814         end_tag = 1;
 815         ADVANCE (p);
 816       }
 817     tag_name_begin = p;
 818     while (NAME_CHAR_P (*p))
 819       ADVANCE (p);
 820     if (p == tag_name_begin)
 821       goto look_for_tag;
 822     tag_name_end = p;
 823     SKIP_WS (p);
 824     if (end_tag && *p != '>')
 825       goto backout_tag;
 826
 827     if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
 828       /* We can't just say "goto look_for_tag" here because we need
 829          the loop below to properly advance over the tag's attributes.  */
 830       uninteresting_tag = true;
 831     else
 832       {
 833         uninteresting_tag = false;
 834         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
 835       }
 836
 837     /* Find the attributes. */
 838     while (1)
 839       {
 840         const char *attr_name_begin, *attr_name_end;
 841         const char *attr_value_begin, *attr_value_end;
 842         const char *attr_raw_value_begin, *attr_raw_value_end;
 843         int operation = AP_DOWNCASE; /* stupid compiler. */
 844
 845         SKIP_WS (p);
 846
 847         if (*p == '/')
 848           {
 849             /* A slash at this point means the tag is about to be
 850                closed.  This is legal in XML and has been popularized
 851                in HTML via XHTML.  */
 852             /* <foo a=b c=d /> */
 853             /*              ^  */
 854             ADVANCE (p);
 855             SKIP_WS (p);
 856             if (*p != '>')
 857               goto backout_tag;
 858           }
 859
 860         /* Check for end of tag definition. */
 861         if (*p == '>')
 862           break;
 863
 864         /* Establish bounds of attribute name. */
 865         attr_name_begin = p;    /* <foo bar ...> */
 866                                 /*      ^        */
 867         while (NAME_CHAR_P (*p))
 868           ADVANCE (p);
 869         attr_name_end = p;      /* <foo bar ...> */
 870                                 /*         ^     */
 871         if (attr_name_begin == attr_name_end)
 872           goto backout_tag;
 873
 874         /* Establish bounds of attribute value. */
 875         SKIP_WS (p);
 876         if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
 877           {
 878             /* Minimized attribute syntax allows `=' to be omitted.
 879                For example, <UL COMPACT> is a valid shorthand for <UL
 880                COMPACT="compact">.  Even if such attributes are not
 881                useful to Wget, we need to support them, so that the
 882                tags containing them can be parsed correctly. */
 883             attr_raw_value_begin = attr_value_begin = attr_name_begin;
 884             attr_raw_value_end = attr_value_end = attr_name_end;
 885           }
 886         else if (*p == '=')
 887           {
 888             ADVANCE (p);
 889             SKIP_WS (p);
 890             if (*p == '\"' || *p == '\'')
 891               {
 892                 bool newline_seen = false;
 893                 char quote_char = *p;
 894                 attr_raw_value_begin = p;
 895                 ADVANCE (p);
 896                 attr_value_begin = p; /* <foo bar="baz"> */
 897                                       /*           ^     */
 898                 while (*p != quote_char)
 899                   {
 900                     if (!newline_seen && *p == '\n')
 901                       {
 902                         /* If a newline is seen within the quotes, it
 903                            is most likely that someone forgot to close
 904                            the quote.  In that case, we back out to
 905                            the value beginning, and terminate the tag
 906                            at either `>' or the delimiter, whichever
 907                            comes first.  Such a tag terminated at `>'
 908                            is discarded.  */
 909                         p = attr_value_begin;
 910                         newline_seen = true;
 911                         continue;
 912                       }
 913                     else if (newline_seen && *p == '>')
 914                       break;
 915                     ADVANCE (p);
 916                   }
 917                 attr_value_end = p; /* <foo bar="baz"> */
 918                                     /*              ^  */
 919                 if (*p == quote_char)
 920                   ADVANCE (p);
 921                 else
 922                   goto look_for_tag;
 923                 attr_raw_value_end = p; /* <foo bar="baz"> */
 924                                         /*               ^ */
 925                 operation = AP_DECODE_ENTITIES;
 926                 if (flags & MHT_TRIM_VALUES)
 927                   operation |= AP_TRIM_BLANKS;
 928               }
 929             else
 930               {
 931                 attr_value_begin = p; /* <foo bar=baz> */
 932                                       /*          ^    */
 933                 /* According to SGML, a name token should consist only
 934                    of alphanumerics, . and -.  However, this is often
 935                    violated by, for instance, `%' in `width=75%'.
 936                    We'll be liberal and allow just about anything as
 937                    an attribute value.  */
 938                 while (!ISSPACE (*p) && *p != '>')
 939                   ADVANCE (p);
 940                 attr_value_end = p; /* <foo bar=baz qux=quix> */
 941                                     /*             ^          */
 942                 if (attr_value_begin == attr_value_end)
 943                   /* <foo bar=> */
 944                   /*          ^ */
 945                   goto backout_tag;
 946                 attr_raw_value_begin = attr_value_begin;
 947                 attr_raw_value_end = attr_value_end;
 948                 operation = AP_DECODE_ENTITIES;
 949               }
 950           }
 951         else
 952           {
 953             /* We skipped the whitespace and found something that is
 954                neither `=' nor the beginning of the next attribute's
 955                name.  Back out.  */
 956             goto backout_tag;   /* <foo bar [... */
 957                                 /*          ^    */
 958           }
 959
 960         /* If we're not interested in the tag, don't bother with any
 961            of the attributes.  */
 962         if (uninteresting_tag)
 963           continue;
 964
 965         /* If we aren't interested in the attribute, skip it.  We
 966            cannot do this test any sooner, because our text pointer
 967            needs to correctly advance over the attribute.  */
 968         if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
 969           continue;
 970
 971         GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
 972                     struct attr_pair);
 973
 974         pairs[nattrs].name_pool_index = pool.tail;
 975         convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
 976
 977         pairs[nattrs].value_pool_index = pool.tail;
 978         convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
 979         pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
 980         pairs[nattrs].value_raw_size = (attr_raw_value_end
 981                                         - attr_raw_value_begin);
 982         ++nattrs;
 983       }
 984
 985     if (uninteresting_tag)
 986       {
 987         ADVANCE (p);
 988         goto look_for_tag;
 989       }
 990
 991     /* By now, we have a valid tag with a name and zero or more
 992        attributes.  Fill in the data and call the mapper function.  */
 993     {
 994       int i;
 995       struct taginfo taginfo;
 996
 997       taginfo.name      = pool.contents;
 998       taginfo.end_tag_p = end_tag;
 999       taginfo.nattrs    = nattrs;
1000       /* We fill in the char pointers only now, when pool can no
1001          longer get realloc'ed.  If we did that above, we could get
1002          hosed by reallocation.  Obviously, after this point, the pool
1003          may no longer be grown.  */
1004       for (i = 0; i < nattrs; i++)
1005         {
1006           pairs[i].name = pool.contents + pairs[i].name_pool_index;
1007           pairs[i].value = pool.contents + pairs[i].value_pool_index;
1008         }
1009       taginfo.attrs = pairs;
1010       taginfo.start_position = tag_start_position;
1011       taginfo.end_position   = p + 1;
1012       mapfun (&taginfo, maparg);
1013       ADVANCE (p);
1014     }
1015     goto look_for_tag;
1016
1017   backout_tag:
1018 #ifdef STANDALONE
1019     ++tag_backout_count;
1020 #endif
1021     /* The tag wasn't really a tag.  Treat its contents as ordinary
1022        data characters. */
1023     p = tag_start_position + 1;
1024     goto look_for_tag;
1025   }
1026
1027  finish:
1028   POOL_FREE (&pool);
1029   if (attr_pair_resized)
1030     xfree (pairs);
1031 }
1032
1033 #undef ADVANCE
1034 #undef SKIP_WS
1035 #undef SKIP_NON_WS
1036 \f
1037 #ifdef STANDALONE
1038 static void
1039 test_mapper (struct taginfo *taginfo, void *arg)
1040 {
1041   int i;
1042
1043   printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
1044   for (i = 0; i < taginfo->nattrs; i++)
1045     printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
1046   putchar ('\n');
1047   ++*(int *)arg;
1048 }
1049
1050 int main ()
1051 {
1052   int size = 256;
1053   char *x = xmalloc (size);
1054   int length = 0;
1055   int read_count;
1056   int tag_counter = 0;
1057
1058   while ((read_count = fread (x + length, 1, size - length, stdin)))
1059     {
1060       length += read_count;
1061       size <<= 1;
1062       x = xrealloc (x, size);
1063     }
1064
1065   map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);
1066   printf ("TAGS: %d\n", tag_counter);
1067   printf ("Tag backouts:     %d\n", tag_backout_count);
1068   printf ("Comment backouts: %d\n", comment_backout_count);
1069   return 0;
1070 }
1071 #endif /* STANDALONE */