sjero.net Git - wget/blob - src/html-parse.c

   1 /* HTML parser for Wget.
   2    Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 /* The only entry point to this module is map_html_tags(), which see.  */
  31
  32 /* TODO:
  33
  34    - Allow hooks for callers to process contents outside tags.  This
  35      is needed to implement handling <style> and <script>.  The
  36      taginfo structure already carries the information about where the
  37      tags are, but this is not enough, because one would also want to
  38      skip the comments.  (The funny thing is that for <style> and
  39      <script> you *don't* want to skip comments!)
  40
  41    - Create a test suite for regression testing. */
  42
  43 /* HISTORY:
  44
  45    This is the third HTML parser written for Wget.  The first one was
  46    written some time during the Geturl 1.0 beta cycle, and was very
  47    inefficient and buggy.  It also contained some very complex code to
  48    remember a list of parser states, because it was supposed to be
  49    reentrant.
  50
  51    The second HTML parser was written for Wget 1.4 (the first version
  52    by the name `Wget'), and was a complete rewrite.  Although the new
  53    parser behaved much better and made no claims of reentrancy, it
  54    still shared many of the fundamental flaws of the old version -- it
  55    only regarded HTML in terms tag-attribute pairs, where the
  56    attribute's value was a URL to be returned.  Any other property of
  57    HTML, such as <base href=...>, or strange way to specify a URL,
  58    such as <meta http-equiv=Refresh content="0; URL=..."> had to be
  59    crudely hacked in -- and the caller had to be aware of these hacks.
  60    Like its predecessor, this parser did not support HTML comments.
  61
  62    After Wget 1.5.1 was released, I set out to write a third HTML
  63    parser.  The objectives of the new parser were to: (1) provide a
  64    clean way to analyze HTML lexically, (2) separate interpretation of
  65    the markup from the parsing process, (3) be as correct as possible,
  66    e.g. correctly skipping comments and other SGML declarations, (4)
  67    understand the most common errors in markup and skip them or be
  68    relaxed towards them, and (5) be reasonably efficient (no regexps,
  69    minimum copying and minimum or no heap allocation).
  70
  71    I believe this parser meets all of the above goals.  It is
  72    reasonably well structured, and could be relatively easily
  73    separated from Wget and used elsewhere.  While some of its
  74    intrinsic properties limit its value as a general-purpose HTML
  75    parser, I believe that, with minimum modifications, it could serve
  76    as a backend for one.
  77
  78    Due to time and other constraints, this parser was not integrated
  79    into Wget until the version 1.7. */
  80
  81 /* DESCRIPTION:
  82
  83    The single entry point of this parser is map_html_tags(), which
  84    works by calling a function you specify for each tag.  The function
  85    gets called with the pointer to a structure describing the tag and
  86    its attributes.  */
  87
  88 /* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
  89    still need Wget headers to compile.  */
  90
  91 #include <config.h>
  92
  93 #ifdef STANDALONE
  94 # define I_REALLY_WANT_CTYPE_MACROS
  95 #endif
  96
  97 #include <stdio.h>
  98 #include <stdlib.h>
  99 #ifdef HAVE_STRING_H
 100 # include <string.h>
 101 #else
 102 # include <strings.h>
 103 #endif
 104 #include <assert.h>
 105
 106 #include "wget.h"
 107 #include "html-parse.h"
 108
 109 #ifdef STANDALONE
 110 # undef xmalloc
 111 # undef xrealloc
 112 # undef xfree
 113 # define xmalloc malloc
 114 # define xrealloc realloc
 115 # define xfree free
 116
 117 # undef ISSPACE
 118 # undef ISDIGIT
 119 # undef ISXDIGIT
 120 # undef ISALPHA
 121 # undef ISALNUM
 122 # undef TOLOWER
 123 # undef TOUPPER
 124
 125 # define ISSPACE(x) isspace (x)
 126 # define ISDIGIT(x) isdigit (x)
 127 # define ISXDIGIT(x) isxdigit (x)
 128 # define ISALPHA(x) isalpha (x)
 129 # define ISALNUM(x) isalnum (x)
 130 # define TOLOWER(x) tolower (x)
 131 # define TOUPPER(x) toupper (x)
 132
 133 struct hash_table {
 134   int dummy;
 135 };
 136 static void *
 137 hash_table_get (const struct hash_table *ht, void *ptr)
 138 {
 139   return ptr;
 140 }
 141 #else  /* not STANDALONE */
 142 # include "hash.h"
 143 #endif
 144
 145 /* Pool support.  A pool is a resizable chunk of memory.  It is first
 146    allocated on the stack, and moved to the heap if it needs to be
 147    larger than originally expected.  map_html_tags() uses it to store
 148    the zero-terminated names and values of tags and attributes.
 149
 150    Thus taginfo->name, and attr->name and attr->value for each
 151    attribute, do not point into separately allocated areas, but into
 152    different parts of the pool, separated only by terminating zeros.
 153    This ensures minimum amount of allocation and, for most tags, no
 154    allocation because the entire pool is kept on the stack.  */
 155
 156 struct pool {
 157   char *contents;               /* pointer to the contents. */
 158   int size;                     /* size of the pool. */
 159   int tail;                     /* next available position index. */
 160   int resized;                  /* whether the pool has been resized
 161                                    using malloc. */
 162
 163   char *orig_contents;          /* original pool contents, usually
 164                                    stack-allocated.  used by POOL_FREE
 165                                    to restore the pool to the initial
 166                                    state. */
 167   int orig_size;
 168 };
 169
 170 /* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
 171
 172 #define POOL_INIT(p, initial_storage, initial_size) do {        \
 173   struct pool *P = (p);                                         \
 174   P->contents = (initial_storage);                              \
 175   P->size = (initial_size);                                     \
 176   P->tail = 0;                                                  \
 177   P->resized = 0;                                               \
 178   P->orig_contents = P->contents;                               \
 179   P->orig_size = P->size;                                       \
 180 } while (0)
 181
 182 /* Grow the pool to accomodate at least SIZE new bytes.  If the pool
 183    already has room to accomodate SIZE bytes of data, this is a no-op.  */
 184
 185 #define POOL_GROW(p, increase)                                  \
 186   GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + (increase), \
 187               (p)->resized, char)
 188
 189 /* Append text in the range [beg, end) to POOL.  No zero-termination
 190    is done.  */
 191
 192 #define POOL_APPEND(p, beg, end) do {                   \
 193   const char *PA_beg = (beg);                           \
 194   int PA_size = (end) - PA_beg;                         \
 195   POOL_GROW (p, PA_size);                               \
 196   memcpy ((p)->contents + (p)->tail, PA_beg, PA_size);  \
 197   (p)->tail += PA_size;                                 \
 198 } while (0)
 199
 200 /* Append one character to the pool.  Can be used to zero-terminate
 201    pool strings.  */
 202
 203 #define POOL_APPEND_CHR(p, ch) do {             \
 204   char PAC_char = (ch);                         \
 205   POOL_GROW (p, 1);                             \
 206   (p)->contents[(p)->tail++] = PAC_char;        \
 207 } while (0)
 208
 209 /* Forget old pool contents.  The allocated memory is not freed. */
 210 #define POOL_REWIND(p) (p)->tail = 0
 211
 212 /* Free heap-allocated memory for contents of POOL.  This calls
 213    xfree() if the memory was allocated through malloc.  It also
 214    restores `contents' and `size' to their original, pre-malloc
 215    values.  That way after POOL_FREE, the pool is fully usable, just
 216    as if it were freshly initialized with POOL_INIT.  */
 217
 218 #define POOL_FREE(p) do {                       \
 219   struct pool *P = p;                           \
 220   if (P->resized)                               \
 221     xfree (P->contents);                        \
 222   P->contents = P->orig_contents;               \
 223   P->size = P->orig_size;                       \
 224   P->tail = 0;                                  \
 225   P->resized = 0;                               \
 226 } while (0)
 227
 228 /* Used for small stack-allocated memory chunks that might grow.  Like
 229    DO_REALLOC, this macro grows BASEVAR as necessary to take
 230    NEEDED_SIZE items of TYPE.
 231
 232    The difference is that on the first resize, it will use
 233    malloc+memcpy rather than realloc.  That way you can stack-allocate
 234    the initial chunk, and only resort to heap allocation if you
 235    stumble upon large data.
 236
 237    After the first resize, subsequent ones are performed with realloc,
 238    just like DO_REALLOC.  */
 239
 240 #define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do {           \
 241   long ga_needed_size = (needed_size);                                          \
 242   long ga_newsize = (sizevar);                                                  \
 243   while (ga_newsize < ga_needed_size)                                           \
 244     ga_newsize <<= 1;                                                           \
 245   if (ga_newsize != (sizevar))                                                  \
 246     {                                                                           \
 247       if (resized)                                                              \
 248         basevar = (type *)xrealloc (basevar, ga_newsize * sizeof (type));       \
 249       else                                                                      \
 250         {                                                                       \
 251           void *ga_new = xmalloc (ga_newsize * sizeof (type));                  \
 252           memcpy (ga_new, basevar, (sizevar) * sizeof (type));                  \
 253           (basevar) = ga_new;                                                   \
 254           resized = 1;                                                          \
 255         }                                                                       \
 256       (sizevar) = ga_newsize;                                                   \
 257     }                                                                           \
 258 } while (0)
 259 \f
 260 #define AP_DOWNCASE             1
 261 #define AP_PROCESS_ENTITIES     2
 262 #define AP_TRIM_BLANKS          4
 263
 264 /* Copy the text in the range [BEG, END) to POOL, optionally
 265    performing operations specified by FLAGS.  FLAGS may be any
 266    combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_TRIM_BLANKS
 267    with the following meaning:
 268
 269    * AP_DOWNCASE -- downcase all the letters;
 270
 271    * AP_PROCESS_ENTITIES -- process the SGML entities and write out
 272    the decoded string.  Recognized entities are &lt, &gt, &amp, &quot,
 273    &nbsp and the numerical entities.
 274
 275    * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
 276    of text.  */
 277
 278 static void
 279 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
 280 {
 281   int old_tail = pool->tail;
 282   int size;
 283
 284   /* First, skip blanks if required.  We must do this before entities
 285      are processed, so that blanks can still be inserted as, for
 286      instance, `&#32;'.  */
 287   if (flags & AP_TRIM_BLANKS)
 288     {
 289       while (beg < end && ISSPACE (*beg))
 290         ++beg;
 291       while (end > beg && ISSPACE (end[-1]))
 292         --end;
 293     }
 294   size = end - beg;
 295
 296   if (flags & AP_PROCESS_ENTITIES)
 297     {
 298       /* Grow the pool, then copy the text to the pool character by
 299          character, processing the encountered entities as we go
 300          along.
 301
 302          It's safe (and necessary) to grow the pool in advance because
 303          processing the entities can only *shorten* the string, it can
 304          never lengthen it.  */
 305       const char *from = beg;
 306       char *to;
 307
 308       POOL_GROW (pool, end - beg);
 309       to = pool->contents + pool->tail;
 310
 311       while (from < end)
 312         {
 313           if (*from != '&')
 314             *to++ = *from++;
 315           else
 316             {
 317               const char *save = from;
 318               int remain;
 319
 320               if (++from == end)
 321                 goto lose;
 322               remain = end - from;
 323
 324               /* Process numeric entities "&#DDD;" and "&#xHH;".  */
 325               if (*from == '#')
 326                 {
 327                   int numeric = 0, digits = 0;
 328                   ++from;
 329                   if (*from == 'x')
 330                     {
 331                       ++from;
 332                       for (; from < end && ISXDIGIT (*from); from++, digits++)
 333                         numeric = (numeric << 4) + XDIGIT_TO_NUM (*from);
 334                     }
 335                   else
 336                     {
 337                       for (; from < end && ISDIGIT (*from); from++, digits++)
 338                         numeric = (numeric * 10) + (*from - '0');
 339                     }
 340                   if (!digits)
 341                     goto lose;
 342                   numeric &= 0xff;
 343                   *to++ = numeric;
 344                 }
 345 #define FROB(x) (remain >= (sizeof (x) - 1)                     \
 346                  && 0 == memcmp (from, x, sizeof (x) - 1)       \
 347                  && (*(from + sizeof (x) - 1) == ';'            \
 348                      || remain == sizeof (x) - 1                \
 349                      || !ISALNUM (*(from + sizeof (x) - 1))))
 350               else if (FROB ("lt"))
 351                 *to++ = '<', from += 2;
 352               else if (FROB ("gt"))
 353                 *to++ = '>', from += 2;
 354               else if (FROB ("amp"))
 355                 *to++ = '&', from += 3;
 356               else if (FROB ("quot"))
 357                 *to++ = '\"', from += 4;
 358               /* We don't implement the proposed "Added Latin 1"
 359                  entities (except for nbsp), because it is unnecessary
 360                  in the context of Wget, and would require hashing to
 361                  work efficiently.  */
 362               else if (FROB ("nbsp"))
 363                 *to++ = 160, from += 4;
 364               else
 365                 goto lose;
 366 #undef FROB
 367               /* If the entity was followed by `;', we step over the
 368                  `;'.  Otherwise, it was followed by either a
 369                  non-alphanumeric or EOB, in which case we do nothing.  */
 370               if (from < end && *from == ';')
 371                 ++from;
 372               continue;
 373
 374             lose:
 375               /* This was not an entity after all.  Back out.  */
 376               from = save;
 377               *to++ = *from++;
 378             }
 379         }
 380       /* Verify that we haven't exceeded the original size.  (It
 381          shouldn't happen, hence the assert.)  */
 382       assert (to - (pool->contents + pool->tail) <= end - beg);
 383
 384       /* Make POOL's tail point to the position following the string
 385          we've written.  */
 386       pool->tail = to - pool->contents;
 387       POOL_APPEND_CHR (pool, '\0');
 388     }
 389   else
 390     {
 391       /* Just copy the text to the pool.  */
 392       POOL_APPEND (pool, beg, end);
 393       POOL_APPEND_CHR (pool, '\0');
 394     }
 395
 396   if (flags & AP_DOWNCASE)
 397     {
 398       char *p = pool->contents + old_tail;
 399       for (; *p; p++)
 400         *p = TOLOWER (*p);
 401     }
 402 }
 403 \f
 404 /* Originally we used to adhere to rfc 1866 here, and allowed only
 405    letters, digits, periods, and hyphens as names (of tags or
 406    attributes).  However, this broke too many pages which used
 407    proprietary or strange attributes, e.g. <img src="a.gif"
 408    v:shapes="whatever">.
 409
 410    So now we allow any character except:
 411      * whitespace
 412      * 8-bit and control chars
 413      * characters that clearly cannot be part of name:
 414        '=', '>', '/'.
 415
 416    This only affects attribute and tag names; attribute values allow
 417    an even greater variety of characters.  */
 418
 419 #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
 420                         && (x) != '=' && (x) != '>' && (x) != '/')
 421
 422 #ifdef STANDALONE
 423 static int comment_backout_count;
 424 #endif
 425
 426 /* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
 427    strict comments mode, this is used for skipping over comments as
 428    well.
 429
 430    To recap: any SGML declaration may have comments associated with
 431    it, e.g.
 432        <!MY-DECL -- isn't this fun? -- foo bar>
 433
 434    An HTML comment is merely an empty declaration (<!>) with a comment
 435    attached, like this:
 436        <!-- some stuff here -->
 437
 438    Several comments may be embedded in one comment declaration:
 439        <!-- have -- -- fun -->
 440
 441    Whitespace is allowed between and after the comments, but not
 442    before the first comment.  Additionally, this function attempts to
 443    handle double quotes in SGML declarations correctly.  */
 444
 445 static const char *
 446 advance_declaration (const char *beg, const char *end)
 447 {
 448   const char *p = beg;
 449   char quote_char = '\0';       /* shut up, gcc! */
 450   char ch;
 451
 452   enum {
 453     AC_S_DONE,
 454     AC_S_BACKOUT,
 455     AC_S_BANG,
 456     AC_S_DEFAULT,
 457     AC_S_DCLNAME,
 458     AC_S_DASH1,
 459     AC_S_DASH2,
 460     AC_S_COMMENT,
 461     AC_S_DASH3,
 462     AC_S_DASH4,
 463     AC_S_QUOTE1,
 464     AC_S_IN_QUOTE,
 465     AC_S_QUOTE2
 466   } state = AC_S_BANG;
 467
 468   if (beg == end)
 469     return beg;
 470   ch = *p++;
 471
 472   /* It looked like a good idea to write this as a state machine, but
 473      now I wonder...  */
 474
 475   while (state != AC_S_DONE && state != AC_S_BACKOUT)
 476     {
 477       if (p == end)
 478         state = AC_S_BACKOUT;
 479       switch (state)
 480         {
 481         case AC_S_DONE:
 482         case AC_S_BACKOUT:
 483           break;
 484         case AC_S_BANG:
 485           if (ch == '!')
 486             {
 487               ch = *p++;
 488               state = AC_S_DEFAULT;
 489             }
 490           else
 491             state = AC_S_BACKOUT;
 492           break;
 493         case AC_S_DEFAULT:
 494           switch (ch)
 495             {
 496             case '-':
 497               state = AC_S_DASH1;
 498               break;
 499             case ' ':
 500             case '\t':
 501             case '\r':
 502             case '\n':
 503               ch = *p++;
 504               break;
 505             case '>':
 506               state = AC_S_DONE;
 507               break;
 508             case '\'':
 509             case '\"':
 510               state = AC_S_QUOTE1;
 511               break;
 512             default:
 513               if (NAME_CHAR_P (ch))
 514                 state = AC_S_DCLNAME;
 515               else
 516                 state = AC_S_BACKOUT;
 517               break;
 518             }
 519           break;
 520         case AC_S_DCLNAME:
 521           if (ch == '-')
 522             state = AC_S_DASH1;
 523           else if (NAME_CHAR_P (ch))
 524             ch = *p++;
 525           else
 526             state = AC_S_DEFAULT;
 527           break;
 528         case AC_S_QUOTE1:
 529           /* We must use 0x22 because broken assert macros choke on
 530              '"' and '\"'.  */
 531           assert (ch == '\'' || ch == 0x22);
 532           quote_char = ch;      /* cheating -- I really don't feel like
 533                                    introducing more different states for
 534                                    different quote characters. */
 535           ch = *p++;
 536           state = AC_S_IN_QUOTE;
 537           break;
 538         case AC_S_IN_QUOTE:
 539           if (ch == quote_char)
 540             state = AC_S_QUOTE2;
 541           else
 542             ch = *p++;
 543           break;
 544         case AC_S_QUOTE2:
 545           assert (ch == quote_char);
 546           ch = *p++;
 547           state = AC_S_DEFAULT;
 548           break;
 549         case AC_S_DASH1:
 550           assert (ch == '-');
 551           ch = *p++;
 552           state = AC_S_DASH2;
 553           break;
 554         case AC_S_DASH2:
 555           switch (ch)
 556             {
 557             case '-':
 558               ch = *p++;
 559               state = AC_S_COMMENT;
 560               break;
 561             default:
 562               state = AC_S_BACKOUT;
 563             }
 564           break;
 565         case AC_S_COMMENT:
 566           switch (ch)
 567             {
 568             case '-':
 569               state = AC_S_DASH3;
 570               break;
 571             default:
 572               ch = *p++;
 573               break;
 574             }
 575           break;
 576         case AC_S_DASH3:
 577           assert (ch == '-');
 578           ch = *p++;
 579           state = AC_S_DASH4;
 580           break;
 581         case AC_S_DASH4:
 582           switch (ch)
 583             {
 584             case '-':
 585               ch = *p++;
 586               state = AC_S_DEFAULT;
 587               break;
 588             default:
 589               state = AC_S_COMMENT;
 590               break;
 591             }
 592           break;
 593         }
 594     }
 595
 596   if (state == AC_S_BACKOUT)
 597     {
 598 #ifdef STANDALONE
 599       ++comment_backout_count;
 600 #endif
 601       return beg + 1;
 602     }
 603   return p;
 604 }
 605
 606 /* Find the first occurrence of the substring "-->" in [BEG, END) and
 607    return the pointer to the character after the substring.  If the
 608    substring is not found, return NULL.  */
 609
 610 static const char *
 611 find_comment_end (const char *beg, const char *end)
 612 {
 613   /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
 614      if it's not '>' or '-', advance by three characters.  Otherwise,
 615      look at the preceding characters and try to find a match.  */
 616
 617   const char *p = beg - 1;
 618
 619   while ((p += 3) < end)
 620     switch (p[0])
 621       {
 622       case '>':
 623         if (p[-1] == '-' && p[-2] == '-')
 624           return p + 1;
 625         break;
 626       case '-':
 627       at_dash:
 628         if (p[-1] == '-')
 629           {
 630           at_dash_dash:
 631             if (++p == end) return NULL;
 632             switch (p[0])
 633               {
 634               case '>': return p + 1;
 635               case '-': goto at_dash_dash;
 636               }
 637           }
 638         else
 639           {
 640             if ((p += 2) >= end) return NULL;
 641             switch (p[0])
 642               {
 643               case '>':
 644                 if (p[-1] == '-')
 645                   return p + 1;
 646                 break;
 647               case '-':
 648                 goto at_dash;
 649               }
 650           }
 651       }
 652   return NULL;
 653 }
 654 \f
 655 /* Return non-zero of the string inside [b, e) are present in hash
 656    table HT.  */
 657
 658 static int
 659 name_allowed (const struct hash_table *ht, const char *b, const char *e)
 660 {
 661   char *copy;
 662   if (!ht)
 663     return 1;
 664   BOUNDED_TO_ALLOCA (b, e, copy);
 665   return hash_table_get (ht, copy) != NULL;
 666 }
 667
 668 /* Advance P (a char pointer), with the explicit intent of being able
 669    to read the next character.  If this is not possible, go to finish.  */
 670
 671 #define ADVANCE(p) do {                         \
 672   ++p;                                          \
 673   if (p >= end)                                 \
 674     goto finish;                                \
 675 } while (0)
 676
 677 /* Skip whitespace, if any. */
 678
 679 #define SKIP_WS(p) do {                         \
 680   while (ISSPACE (*p)) {                        \
 681     ADVANCE (p);                                \
 682   }                                             \
 683 } while (0)
 684
 685 /* Skip non-whitespace, if any. */
 686
 687 #define SKIP_NON_WS(p) do {                     \
 688   while (!ISSPACE (*p)) {                       \
 689     ADVANCE (p);                                \
 690   }                                             \
 691 } while (0)
 692
 693 #ifdef STANDALONE
 694 static int tag_backout_count;
 695 #endif
 696
 697 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
 698    MAPFUN will be called with two arguments: pointer to an initialized
 699    struct taginfo, and MAPARG.
 700
 701    ALLOWED_TAGS should be a hash table whose keys are the names of
 702    the tags to be processed by this function.  If it is NULL, all the
 703    tags are allowed.  The same goes for attributes and
 704    ALLOWED_ATTRIBUTES.
 705
 706    (Obviously, the caller can filter out unwanted tags and attributes
 707    just as well, but this is just an optimization designed to avoid
 708    unnecessary copying for tags/attributes which the caller doesn't
 709    want to know about.  Each candidate name is checked with a single
 710    hash_table_get() lookup.)  */
 712
 713 void
 714 map_html_tags (const char *text, int size,
 715                void (*mapfun) (struct taginfo *, void *), void *maparg,
 716                int flags,
 717                const struct hash_table *allowed_tags,
 718                const struct hash_table *allowed_attributes)
 719 {
 720   /* storage for strings passed to MAPFUN callback; if 256 bytes is
 721      too little, POOL_APPEND allocates more with malloc. */
 722   char pool_initial_storage[256];
 723   struct pool pool;
 724
 725   const char *p = text;
 726   const char *end = text + size;
 727
 728   struct attr_pair attr_pair_initial_storage[8];
 729   int attr_pair_size = countof (attr_pair_initial_storage);
 730   int attr_pair_resized = 0;
 731   struct attr_pair *pairs = attr_pair_initial_storage;
 732
 733   if (!size)
 734     return;
 735
 736   POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));
 737
 738   {
 739     int nattrs, end_tag;
 740     const char *tag_name_begin, *tag_name_end;
 741     const char *tag_start_position;
 742     int uninteresting_tag;
 743
 744   look_for_tag:
 745     POOL_REWIND (&pool);
 746
 747     nattrs = 0;
 748     end_tag = 0;
 749
 750     /* Find beginning of tag.  We use memchr() instead of the usual
 751        looping with ADVANCE() for speed. */
 752     p = memchr (p, '<', end - p);
 753     if (!p)
 754       goto finish;
 755
 756     tag_start_position = p;
 757     ADVANCE (p);
 758
 759     /* Establish the type of the tag (start-tag, end-tag or
 760        declaration).  */
 761     if (*p == '!')
 762       {
 763         if (!(flags & MHT_STRICT_COMMENTS)
 764             && p < end + 3 && p[1] == '-' && p[2] == '-')
 765           {
 766             /* If strict comments are not enforced and if we know
 767                we're looking at a comment, simply look for the
 768                terminating "-->".  Non-strict is the default because
 769                it works in other browsers and most HTML writers can't
 770                be bothered with getting the comments right.  */
 771             const char *comment_end = find_comment_end (p + 3, end);
 772             if (comment_end)
 773               p = comment_end;
 774           }
 775         else
 776           {
 777             /* Either in strict comment mode or looking at a non-empty
 778                declaration.  Real declarations are much less likely to
 779                be misused the way comments are, so advance over them
 780                properly regardless of strictness.  */
 781             p = advance_declaration (p, end);
 782           }
 783         if (p == end)
 784           goto finish;
 785         goto look_for_tag;
 786       }
 787     else if (*p == '/')
 788       {
 789         end_tag = 1;
 790         ADVANCE (p);
 791       }
 792     tag_name_begin = p;
 793     while (NAME_CHAR_P (*p))
 794       ADVANCE (p);
 795     if (p == tag_name_begin)
 796       goto look_for_tag;
 797     tag_name_end = p;
 798     SKIP_WS (p);
 799     if (end_tag && *p != '>')
 800       goto backout_tag;
 801
 802     if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
 803       /* We can't just say "goto look_for_tag" here because we need
 804          the loop below to properly advance over the tag's attributes.  */
 805       uninteresting_tag = 1;
 806     else
 807       {
 808         uninteresting_tag = 0;
 809         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
 810       }
 811
 812     /* Find the attributes. */
 813     while (1)
 814       {
 815         const char *attr_name_begin, *attr_name_end;
 816         const char *attr_value_begin, *attr_value_end;
 817         const char *attr_raw_value_begin, *attr_raw_value_end;
 818         int operation = AP_DOWNCASE; /* stupid compiler. */
 819
 820         SKIP_WS (p);
 821
 822         if (*p == '/')
 823           {
 824             /* A slash at this point means the tag is about to be
 825                closed.  This is legal in XML and has been popularized
 826                in HTML via XHTML.  */
 827             /* <foo a=b c=d /> */
 828             /*              ^  */
 829             ADVANCE (p);
 830             SKIP_WS (p);
 831             if (*p != '>')
 832               goto backout_tag;
 833           }
 834
 835         /* Check for end of tag definition. */
 836         if (*p == '>')
 837           break;
 838
 839         /* Establish bounds of attribute name. */
 840         attr_name_begin = p;    /* <foo bar ...> */
 841                                 /*      ^        */
 842         while (NAME_CHAR_P (*p))
 843           ADVANCE (p);
 844         attr_name_end = p;      /* <foo bar ...> */
 845                                 /*         ^     */
 846         if (attr_name_begin == attr_name_end)
 847           goto backout_tag;
 848
 849         /* Establish bounds of attribute value. */
 850         SKIP_WS (p);
 851         if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
 852           {
 853             /* Minimized attribute syntax allows `=' to be omitted.
 854                For example, <UL COMPACT> is a valid shorthand for <UL
 855                COMPACT="compact">.  Even if such attributes are not
 856                useful to Wget, we need to support them, so that the
 857                tags containing them can be parsed correctly. */
 858             attr_raw_value_begin = attr_value_begin = attr_name_begin;
 859             attr_raw_value_end = attr_value_end = attr_name_end;
 860           }
 861         else if (*p == '=')
 862           {
 863             ADVANCE (p);
 864             SKIP_WS (p);
 865             if (*p == '\"' || *p == '\'')
 866               {
 867                 int newline_seen = 0;
 868                 char quote_char = *p;
 869                 attr_raw_value_begin = p;
 870                 ADVANCE (p);
 871                 attr_value_begin = p; /* <foo bar="baz"> */
 872                                       /*           ^     */
 873                 while (*p != quote_char)
 874                   {
 875                     if (!newline_seen && *p == '\n')
 876                       {
 877                         /* If a newline is seen within the quotes, it
 878                            is most likely that someone forgot to close
 879                            the quote.  In that case, we back out to
 880                            the value beginning, and terminate the tag
 881                            at either `>' or the delimiter, whichever
 882                            comes first.  Such a tag terminated at `>'
 883                            is discarded.  */
 884                         p = attr_value_begin;
 885                         newline_seen = 1;
 886                         continue;
 887                       }
 888                     else if (newline_seen && *p == '>')
 889                       break;
 890                     ADVANCE (p);
 891                   }
 892                 attr_value_end = p; /* <foo bar="baz"> */
 893                                     /*              ^  */
 894                 if (*p == quote_char)
 895                   ADVANCE (p);
 896                 else
 897                   goto look_for_tag;
 898                 attr_raw_value_end = p; /* <foo bar="baz"> */
 899                                         /*               ^ */
 900                 operation = AP_PROCESS_ENTITIES;
 901                 if (flags & MHT_TRIM_VALUES)
 902                   operation |= AP_TRIM_BLANKS;
 903               }
 904             else
 905               {
 906                 attr_value_begin = p; /* <foo bar=baz> */
 907                                       /*          ^    */
 908                 /* According to SGML, a name token should consist only
 909                    of alphanumerics, . and -.  However, this is often
 910                    violated by, for instance, `%' in `width=75%'.
 911                    We'll be liberal and allow just about anything as
 912                    an attribute value.  */
 913                 while (!ISSPACE (*p) && *p != '>')
 914                   ADVANCE (p);
 915                 attr_value_end = p; /* <foo bar=baz qux=quix> */
 916                                     /*             ^          */
 917                 if (attr_value_begin == attr_value_end)
 918                   /* <foo bar=> */
 919                   /*          ^ */
 920                   goto backout_tag;
 921                 attr_raw_value_begin = attr_value_begin;
 922                 attr_raw_value_end = attr_value_end;
 923                 operation = AP_PROCESS_ENTITIES;
 924               }
 925           }
 926         else
 927           {
 928             /* We skipped the whitespace and found something that is
 929                neither `=' nor the beginning of the next attribute's
 930                name.  Back out.  */
 931             goto backout_tag;   /* <foo bar [... */
 932                                 /*          ^    */
 933           }
 934
 935         /* If we're not interested in the tag, don't bother with any
 936            of the attributes.  */
 937         if (uninteresting_tag)
 938           continue;
 939
 940         /* If we aren't interested in the attribute, skip it.  We
 941            cannot do this test any sooner, because our text pointer
 942            needs to correctly advance over the attribute.  */
 943         if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
 944           continue;
 945
 946         GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
 947                     struct attr_pair);
 948
 949         pairs[nattrs].name_pool_index = pool.tail;
 950         convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
 951
 952         pairs[nattrs].value_pool_index = pool.tail;
 953         convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
 954         pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
 955         pairs[nattrs].value_raw_size = (attr_raw_value_end
 956                                         - attr_raw_value_begin);
 957         ++nattrs;
 958       }
 959
 960     if (uninteresting_tag)
 961       {
 962         ADVANCE (p);
 963         goto look_for_tag;
 964       }
 965
 966     /* By now, we have a valid tag with a name and zero or more
 967        attributes.  Fill in the data and call the mapper function.  */
 968     {
 969       int i;
 970       struct taginfo taginfo;
 971
 972       taginfo.name      = pool.contents;
 973       taginfo.end_tag_p = end_tag;
 974       taginfo.nattrs    = nattrs;
 975       /* We fill in the char pointers only now, when pool can no
 976          longer get realloc'ed.  If we did that above, we could get
 977          hosed by reallocation.  Obviously, after this point, the pool
 978          may no longer be grown.  */
 979       for (i = 0; i < nattrs; i++)
 980         {
 981           pairs[i].name = pool.contents + pairs[i].name_pool_index;
 982           pairs[i].value = pool.contents + pairs[i].value_pool_index;
 983         }
 984       taginfo.attrs = pairs;
 985       taginfo.start_position = tag_start_position;
 986       taginfo.end_position   = p + 1;
 987       /* Ta-dam! */
 988       (*mapfun) (&taginfo, maparg);
 989       ADVANCE (p);
 990     }
 991     goto look_for_tag;
 992
 993   backout_tag:
 994 #ifdef STANDALONE
 995     ++tag_backout_count;
 996 #endif
 997     /* The tag wasn't really a tag.  Treat its contents as ordinary
 998        data characters. */
 999     p = tag_start_position + 1;
1000     goto look_for_tag;
1001   }
1002
1003  finish:
1004   POOL_FREE (&pool);
1005   if (attr_pair_resized)
1006     xfree (pairs);
1007 }
1008
1009 #undef ADVANCE
1010 #undef SKIP_WS
1011 #undef SKIP_NON_WS
1012 \f
1013 #ifdef STANDALONE
1014 static void
1015 test_mapper (struct taginfo *taginfo, void *arg)
1016 {
1017   int i;
1018
1019   printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
1020   for (i = 0; i < taginfo->nattrs; i++)
1021     printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
1022   putchar ('\n');
1023   ++*(int *)arg;
1024 }
1025
1026 int main ()
1027 {
1028   int size = 256;
1029   char *x = (char *)xmalloc (size);
1030   int length = 0;
1031   int read_count;
1032   int tag_counter = 0;
1033
1034   while ((read_count = fread (x + length, 1, size - length, stdin)))
1035     {
1036       length += read_count;
1037       size <<= 1;
1038       x = (char *)xrealloc (x, size);
1039     }
1040
1041   map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);
1042   printf ("TAGS: %d\n", tag_counter);
1043   printf ("Tag backouts:     %d\n", tag_backout_count);
1044   printf ("Comment backouts: %d\n", comment_backout_count);
1045   return 0;
1046 }
1047 #endif /* STANDALONE */