sjero.net Git - wget/blob - src/html-parse.c

   1 /* HTML parser for Wget.
   2    Copyright (C) 1998-2006 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software Foundation, Inc.,
  18 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 /* The only entry point to this module is map_html_tags(), which see.  */
  31
  32 /* TODO:
  33
  34    - Allow hooks for callers to process contents outside tags.  This
  35      is needed to implement handling <style> and <script>.  The
  36      taginfo structure already carries the information about where the
  37      tags are, but this is not enough, because one would also want to
  38      skip the comments.  (The funny thing is that for <style> and
  39      <script> you *don't* want to skip comments!)
  40
  41    - Create a test suite for regression testing. */
  42
  43 /* HISTORY:
  44
  45    This is the third HTML parser written for Wget.  The first one was
  46    written some time during the Geturl 1.0 beta cycle, and was very
  47    inefficient and buggy.  It also contained some very complex code to
  48    remember a list of parser states, because it was supposed to be
  49    reentrant.
  50
  51    The second HTML parser was written for Wget 1.4 (the first version
  52    by the name `Wget'), and was a complete rewrite.  Although the new
  53    parser behaved much better and made no claims of reentrancy, it
  54    still shared many of the fundamental flaws of the old version -- it
  55    only regarded HTML in terms tag-attribute pairs, where the
  56    attribute's value was a URL to be returned.  Any other property of
  57    HTML, such as <base href=...>, or strange way to specify a URL,
  58    such as <meta http-equiv=Refresh content="0; URL=..."> had to be
  59    crudely hacked in -- and the caller had to be aware of these hacks.
  60    Like its predecessor, this parser did not support HTML comments.
  61
  62    After Wget 1.5.1 was released, I set out to write a third HTML
  63    parser.  The objectives of the new parser were to: (1) provide a
  64    clean way to analyze HTML lexically, (2) separate interpretation of
  65    the markup from the parsing process, (3) be as correct as possible,
  66    e.g. correctly skipping comments and other SGML declarations, (4)
  67    understand the most common errors in markup and skip them or be
  68    relaxed towrds them, and (5) be reasonably efficient (no regexps,
  69    minimum copying and minimum or no heap allocation).
  70
  71    I believe this parser meets all of the above goals.  It is
  72    reasonably well structured, and could be relatively easily
  73    separated from Wget and used elsewhere.  While some of its
  74    intrinsic properties limit its value as a general-purpose HTML
  75    parser, I believe that, with minimum modifications, it could serve
  76    as a backend for one.
  77
  78    Due to time and other constraints, this parser was not integrated
  79    into Wget until the version 1.7. */
  80
  81 /* DESCRIPTION:
  82
  83    The single entry point of this parser is map_html_tags(), which
  84    works by calling a function you specify for each tag.  The function
  85    gets called with the pointer to a structure describing the tag and
  86    its attributes.  */
  87
  88 /* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
  89    still need Wget headers to compile.  */
  90
  91 #include <config.h>
  92
  93 #ifdef STANDALONE
  94 # define I_REALLY_WANT_CTYPE_MACROS
  95 #endif
  96
  97 #include <stdio.h>
  98 #include <stdlib.h>
  99 #include <string.h>
 100 #include <assert.h>
 101
 102 #include "wget.h"
 103 #include "html-parse.h"
 104
 105 #ifdef STANDALONE
 106 # undef xmalloc
 107 # undef xrealloc
 108 # undef xfree
 109 # define xmalloc malloc
 110 # define xrealloc realloc
 111 # define xfree free
 112
 113 # undef ISSPACE
 114 # undef ISDIGIT
 115 # undef ISXDIGIT
 116 # undef ISALPHA
 117 # undef ISALNUM
 118 # undef TOLOWER
 119 # undef TOUPPER
 120
 121 # define ISSPACE(x) isspace (x)
 122 # define ISDIGIT(x) isdigit (x)
 123 # define ISXDIGIT(x) isxdigit (x)
 124 # define ISALPHA(x) isalpha (x)
 125 # define ISALNUM(x) isalnum (x)
 126 # define TOLOWER(x) tolower (x)
 127 # define TOUPPER(x) toupper (x)
 128
 129 struct hash_table {
 130   int dummy;
 131 };
 132 static void *
 133 hash_table_get (const struct hash_table *ht, void *ptr)
 134 {
 135   return ptr;
 136 }
 137 #else  /* not STANDALONE */
 138 # include "hash.h"
 139 #endif
 140
 141 /* Pool support.  A pool is a resizable chunk of memory.  It is first
 142    allocated on the stack, and moved to the heap if it needs to be
 143    larger than originally expected.  map_html_tags() uses it to store
 144    the zero-terminated names and values of tags and attributes.
 145
 146    Thus taginfo->name, and attr->name and attr->value for each
 147    attribute, do not point into separately allocated areas, but into
 148    different parts of the pool, separated only by terminating zeros.
 149    This ensures minimum amount of allocation and, for most tags, no
 150    allocation because the entire pool is kept on the stack.  */
 151
 152 struct pool {
 153   char *contents;               /* pointer to the contents. */
 154   int size;                     /* size of the pool. */
 155   int tail;                     /* next available position index. */
 156   bool resized;                 /* whether the pool has been resized
 157                                    using malloc. */
 158
 159   char *orig_contents;          /* original pool contents, usually
 160                                    stack-allocated.  used by POOL_FREE
 161                                    to restore the pool to the initial
 162                                    state. */
 163   int orig_size;
 164 };
 165
 166 /* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
 167
 168 #define POOL_INIT(p, initial_storage, initial_size) do {        \
 169   struct pool *P = (p);                                         \
 170   P->contents = (initial_storage);                              \
 171   P->size = (initial_size);                                     \
 172   P->tail = 0;                                                  \
 173   P->resized = false;                                           \
 174   P->orig_contents = P->contents;                               \
 175   P->orig_size = P->size;                                       \
 176 } while (0)
 177
 178 /* Grow the pool to accomodate at least SIZE new bytes.  If the pool
 179    already has room to accomodate SIZE bytes of data, this is a no-op.  */
 180
 181 #define POOL_GROW(p, increase)                                  \
 182   GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + (increase), \
 183               (p)->resized, char)
 184
 185 /* Append text in the range [beg, end) to POOL.  No zero-termination
 186    is done.  */
 187
 188 #define POOL_APPEND(p, beg, end) do {                   \
 189   const char *PA_beg = (beg);                           \
 190   int PA_size = (end) - PA_beg;                         \
 191   POOL_GROW (p, PA_size);                               \
 192   memcpy ((p)->contents + (p)->tail, PA_beg, PA_size);  \
 193   (p)->tail += PA_size;                                 \
 194 } while (0)
 195
 196 /* Append one character to the pool.  Can be used to zero-terminate
 197    pool strings.  */
 198
 199 #define POOL_APPEND_CHR(p, ch) do {             \
 200   char PAC_char = (ch);                         \
 201   POOL_GROW (p, 1);                             \
 202   (p)->contents[(p)->tail++] = PAC_char;        \
 203 } while (0)
 204
 205 /* Forget old pool contents.  The allocated memory is not freed. */
 206 #define POOL_REWIND(p) (p)->tail = 0
 207
 208 /* Free heap-allocated memory for contents of POOL.  This calls
 209    xfree() if the memory was allocated through malloc.  It also
 210    restores `contents' and `size' to their original, pre-malloc
 211    values.  That way after POOL_FREE, the pool is fully usable, just
 212    as if it were freshly initialized with POOL_INIT.  */
 213
 214 #define POOL_FREE(p) do {                       \
 215   struct pool *P = p;                           \
 216   if (P->resized)                               \
 217     xfree (P->contents);                        \
 218   P->contents = P->orig_contents;               \
 219   P->size = P->orig_size;                       \
 220   P->tail = 0;                                  \
 221   P->resized = false;                           \
 222 } while (0)
 223
 224 /* Used for small stack-allocated memory chunks that might grow.  Like
 225    DO_REALLOC, this macro grows BASEVAR as necessary to take
 226    NEEDED_SIZE items of TYPE.
 227
 228    The difference is that on the first resize, it will use
 229    malloc+memcpy rather than realloc.  That way you can stack-allocate
 230    the initial chunk, and only resort to heap allocation if you
 231    stumble upon large data.
 232
 233    After the first resize, subsequent ones are performed with realloc,
 234    just like DO_REALLOC.  */
 235
 236 #define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do {           \
 237   long ga_needed_size = (needed_size);                                          \
 238   long ga_newsize = (sizevar);                                                  \
 239   while (ga_newsize < ga_needed_size)                                           \
 240     ga_newsize <<= 1;                                                           \
 241   if (ga_newsize != (sizevar))                                                  \
 242     {                                                                           \
 243       if (resized)                                                              \
 244         basevar = xrealloc (basevar, ga_newsize * sizeof (type));               \
 245       else                                                                      \
 246         {                                                                       \
 247           void *ga_new = xmalloc (ga_newsize * sizeof (type));                  \
 248           memcpy (ga_new, basevar, (sizevar) * sizeof (type));                  \
 249           (basevar) = ga_new;                                                   \
 250           resized = true;                                                       \
 251         }                                                                       \
 252       (sizevar) = ga_newsize;                                                   \
 253     }                                                                           \
 254 } while (0)
 255 \f
 256 /* Test whether n+1-sized entity name fits in P.  We don't support
 257    IE-style non-terminated entities, e.g. "&ltfoo" -> "<foo".
 258    However, "&lt;foo" will work, as will "&lt!foo", "&lt", etc.  In
 259    other words an entity needs to be terminated by either a
 260    non-alphanumeric or the end of string.  */
 261 #define FITS(p, n) (p + n == end || (p + n < end && !ISALNUM (p[n])))
 262
 263 /* Macros that test entity names by returning true if P is followed by
 264    the specified characters.  */
 265 #define ENT1(p, c0) (FITS (p, 1) && p[0] == c0)
 266 #define ENT2(p, c0, c1) (FITS (p, 2) && p[0] == c0 && p[1] == c1)
 267 #define ENT3(p, c0, c1, c2) (FITS (p, 3) && p[0]==c0 && p[1]==c1 && p[2]==c2)
 268
 269 /* Increment P by INC chars.  If P lands at a semicolon, increment it
 270    past the semicolon.  This ensures that e.g. "&lt;foo" is converted
 271    to "<foo", but "&lt,foo" to "<,foo".  */
 272 #define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)
 273
 274 struct tagstack_item {
 275   const char *tagname_begin;
 276   const char *tagname_end;
 277   const char *contents_begin;
 278   struct tagstack_item *prev;
 279   struct tagstack_item *next;
 280 };
 281
 282 struct tagstack_item *
 283 tagstack_push (struct tagstack_item **head, struct tagstack_item **tail)
 284 {
 285   struct tagstack_item *ts = xmalloc(sizeof(struct tagstack_item));
 286   if (*head == NULL)
 287     {
 288       *head = *tail = ts;
 289       ts->prev = ts->next = NULL;
 290     }
 291   else
 292     {
 293       (*tail)->next = ts;
 294       ts->prev = *tail;
 295       *tail = ts;
 296       ts->next = NULL;
 297     }
 298
 299   return ts;
 300 }
 301
 302 /* remove ts and everything after it from the stack */
 303 void
 304 tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail,
 305               struct tagstack_item *ts)
 306 {
 307   if (*head == NULL)
 308     return;
 309
 310   if (ts == *tail)
 311     {
 312       if (ts == *head)
 313         {
 314           xfree (ts);
 315           *head = *tail = NULL;
 316         }
 317       else
 318         {
 319           ts->prev->next = NULL;
 320           *tail = ts->prev;
 321           xfree (ts);
 322         }
 323     }
 324   else
 325     {
 326       if (ts == *head)
 327         {
 328           *head = NULL;
 329         }
 330       *tail = ts->prev;
 331
 332       if (ts->prev)
 333         {
 334           ts->prev->next = NULL;
 335         }
 336       while (ts)
 337         {
 338           struct tagstack_item *p = ts->next;
 339           xfree (ts);
 340           ts = p;
 341         }
 342     }
 343 }
 344
 345 struct tagstack_item *
 346 tagstack_find (struct tagstack_item *tail, const char *tagname_begin,
 347                const char *tagname_end)
 348 {
 349   int len = tagname_end - tagname_begin;
 350   while (tail)
 351     {
 352       if (len == (tail->tagname_end - tail->tagname_begin))
 353         {
 354           if (0 == strncasecmp (tail->tagname_begin, tagname_begin, len))
 355             return tail;
 356         }
 357       tail = tail->prev;
 358     }
 359   return NULL;
 360 }
 361
 362 /* Decode the HTML character entity at *PTR, considering END to be end
 363    of buffer.  It is assumed that the "&" character that marks the
 364    beginning of the entity has been seen at *PTR-1.  If a recognized
 365    ASCII entity is seen, it is returned, and *PTR is moved to the end
 366    of the entity.  Otherwise, -1 is returned and *PTR left unmodified.
 367
 368    The recognized entities are: &lt, &gt, &amp, &apos, and &quot.  */
 369
 370 static int
 371 decode_entity (const char **ptr, const char *end)
 372 {
 373   const char *p = *ptr;
 374   int value = -1;
 375
 376   if (++p == end)
 377     return -1;
 378
 379   switch (*p++)
 380     {
 381     case '#':
 382       /* Process numeric entities "&#DDD;" and "&#xHH;".  */
 383       {
 384         int digits = 0;
 385         value = 0;
 386         if (*p == 'x')
 387           for (++p; value < 256 && p < end && ISXDIGIT (*p); p++, digits++)
 388             value = (value << 4) + XDIGIT_TO_NUM (*p);
 389         else
 390           for (; value < 256 && p < end && ISDIGIT (*p); p++, digits++)
 391             value = (value * 10) + (*p - '0');
 392         if (!digits)
 393           return -1;
 394         /* Don't interpret 128+ codes and NUL because we cannot
 395            portably reinserted them into HTML.  */
 396         if (!value || (value & ~0x7f))
 397           return -1;
 398         *ptr = SKIP_SEMI (p, 0);
 399         return value;
 400       }
 401     /* Process named ASCII entities.  */
 402     case 'g':
 403       if (ENT1 (p, 't'))
 404         value = '>', *ptr = SKIP_SEMI (p, 1);
 405       break;
 406     case 'l':
 407       if (ENT1 (p, 't'))
 408         value = '<', *ptr = SKIP_SEMI (p, 1);
 409       break;
 410     case 'a':
 411       if (ENT2 (p, 'm', 'p'))
 412         value = '&', *ptr = SKIP_SEMI (p, 2);
 413       else if (ENT3 (p, 'p', 'o', 's'))
 414         /* handle &apos for the sake of the XML/XHTML crowd. */
 415         value = '\'', *ptr = SKIP_SEMI (p, 3);
 416       break;
 417     case 'q':
 418       if (ENT3 (p, 'u', 'o', 't'))
 419         value = '\"', *ptr = SKIP_SEMI (p, 3);
 420       break;
 421     }
 422   return value;
 423 }
 424 #undef ENT1
 425 #undef ENT2
 426 #undef ENT3
 427 #undef FITS
 428 #undef SKIP_SEMI
 429
 430 enum {
 431   AP_DOWNCASE           = 1,
 432   AP_DECODE_ENTITIES    = 2,
 433   AP_TRIM_BLANKS        = 4
 434 };
 435
 436 /* Copy the text in the range [BEG, END) to POOL, optionally
 437    performing operations specified by FLAGS.  FLAGS may be any
 438    combination of AP_DOWNCASE, AP_DECODE_ENTITIES and AP_TRIM_BLANKS
 439    with the following meaning:
 440
 441    * AP_DOWNCASE -- downcase all the letters;
 442
 443    * AP_DECODE_ENTITIES -- decode the named and numeric entities in
 444      the ASCII range when copying the string.
 445
 446    * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
 447      of text, as well as embedded newlines.  */
 448
 449 static void
 450 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
 451 {
 452   int old_tail = pool->tail;
 453
 454   /* Skip blanks if required.  We must do this before entities are
 455      processed, so that blanks can still be inserted as, for instance,
 456      `&#32;'.  */
 457   if (flags & AP_TRIM_BLANKS)
 458     {
 459       while (beg < end && ISSPACE (*beg))
 460         ++beg;
 461       while (end > beg && ISSPACE (end[-1]))
 462         --end;
 463     }
 464
 465   if (flags & AP_DECODE_ENTITIES)
 466     {
 467       /* Grow the pool, then copy the text to the pool character by
 468          character, processing the encountered entities as we go
 469          along.
 470
 471          It's safe (and necessary) to grow the pool in advance because
 472          processing the entities can only *shorten* the string, it can
 473          never lengthen it.  */
 474       const char *from = beg;
 475       char *to;
 476       bool squash_newlines = !!(flags & AP_TRIM_BLANKS);
 477
 478       POOL_GROW (pool, end - beg);
 479       to = pool->contents + pool->tail;
 480
 481       while (from < end)
 482         {
 483           if (*from == '&')
 484             {
 485               int entity = decode_entity (&from, end);
 486               if (entity != -1)
 487                 *to++ = entity;
 488               else
 489                 *to++ = *from++;
 490             }
 491           else if ((*from == '\n' || *from == '\r') && squash_newlines)
 492             ++from;
 493           else
 494             *to++ = *from++;
 495         }
 496       /* Verify that we haven't exceeded the original size.  (It
 497          shouldn't happen, hence the assert.)  */
 498       assert (to - (pool->contents + pool->tail) <= end - beg);
 499
 500       /* Make POOL's tail point to the position following the string
 501          we've written.  */
 502       pool->tail = to - pool->contents;
 503       POOL_APPEND_CHR (pool, '\0');
 504     }
 505   else
 506     {
 507       /* Just copy the text to the pool.  */
 508       POOL_APPEND (pool, beg, end);
 509       POOL_APPEND_CHR (pool, '\0');
 510     }
 511
 512   if (flags & AP_DOWNCASE)
 513     {
 514       char *p = pool->contents + old_tail;
 515       for (; *p; p++)
 516         *p = TOLOWER (*p);
 517     }
 518 }
 519 \f
 520 /* Originally we used to adhere to rfc 1866 here, and allowed only
 521    letters, digits, periods, and hyphens as names (of tags or
 522    attributes).  However, this broke too many pages which used
 523    proprietary or strange attributes, e.g. <img src="a.gif"
 524    v:shapes="whatever">.
 525
 526    So now we allow any character except:
 527      * whitespace
 528      * 8-bit and control chars
 529      * characters that clearly cannot be part of name:
 530        '=', '>', '/'.
 531
 532    This only affects attribute and tag names; attribute values allow
 533    an even greater variety of characters.  */
 534
 535 #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
 536                         && (x) != '=' && (x) != '>' && (x) != '/')
 537
 538 #ifdef STANDALONE
 539 static int comment_backout_count;
 540 #endif
 541
 542 /* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
 543    strict comments mode, this is used for skipping over comments as
 544    well.
 545
 546    To recap: any SGML declaration may have comments associated with
 547    it, e.g.
 548        <!MY-DECL -- isn't this fun? -- foo bar>
 549
 550    An HTML comment is merely an empty declaration (<!>) with a comment
 551    attached, like this:
 552        <!-- some stuff here -->
 553
 554    Several comments may be embedded in one comment declaration:
 555        <!-- have -- -- fun -->
 556
 557    Whitespace is allowed between and after the comments, but not
 558    before the first comment.  Additionally, this function attempts to
 559    handle double quotes in SGML declarations correctly.  */
 560
 561 static const char *
 562 advance_declaration (const char *beg, const char *end)
 563 {
 564   const char *p = beg;
 565   char quote_char = '\0';       /* shut up, gcc! */
 566   char ch;
 567
 568   enum {
 569     AC_S_DONE,
 570     AC_S_BACKOUT,
 571     AC_S_BANG,
 572     AC_S_DEFAULT,
 573     AC_S_DCLNAME,
 574     AC_S_DASH1,
 575     AC_S_DASH2,
 576     AC_S_COMMENT,
 577     AC_S_DASH3,
 578     AC_S_DASH4,
 579     AC_S_QUOTE1,
 580     AC_S_IN_QUOTE,
 581     AC_S_QUOTE2
 582   } state = AC_S_BANG;
 583
 584   if (beg == end)
 585     return beg;
 586   ch = *p++;
 587
 588   /* It looked like a good idea to write this as a state machine, but
 589      now I wonder...  */
 590
 591   while (state != AC_S_DONE && state != AC_S_BACKOUT)
 592     {
 593       if (p == end)
 594         state = AC_S_BACKOUT;
 595       switch (state)
 596         {
 597         case AC_S_DONE:
 598         case AC_S_BACKOUT:
 599           break;
 600         case AC_S_BANG:
 601           if (ch == '!')
 602             {
 603               ch = *p++;
 604               state = AC_S_DEFAULT;
 605             }
 606           else
 607             state = AC_S_BACKOUT;
 608           break;
 609         case AC_S_DEFAULT:
 610           switch (ch)
 611             {
 612             case '-':
 613               state = AC_S_DASH1;
 614               break;
 615             case ' ':
 616             case '\t':
 617             case '\r':
 618             case '\n':
 619               ch = *p++;
 620               break;
 621             case '>':
 622               state = AC_S_DONE;
 623               break;
 624             case '\'':
 625             case '\"':
 626               state = AC_S_QUOTE1;
 627               break;
 628             default:
 629               if (NAME_CHAR_P (ch))
 630                 state = AC_S_DCLNAME;
 631               else
 632                 state = AC_S_BACKOUT;
 633               break;
 634             }
 635           break;
 636         case AC_S_DCLNAME:
 637           if (ch == '-')
 638             state = AC_S_DASH1;
 639           else if (NAME_CHAR_P (ch))
 640             ch = *p++;
 641           else
 642             state = AC_S_DEFAULT;
 643           break;
 644         case AC_S_QUOTE1:
 645           /* We must use 0x22 because broken assert macros choke on
 646              '"' and '\"'.  */
 647           assert (ch == '\'' || ch == 0x22);
 648           quote_char = ch;      /* cheating -- I really don't feel like
 649                                    introducing more different states for
 650                                    different quote characters. */
 651           ch = *p++;
 652           state = AC_S_IN_QUOTE;
 653           break;
 654         case AC_S_IN_QUOTE:
 655           if (ch == quote_char)
 656             state = AC_S_QUOTE2;
 657           else
 658             ch = *p++;
 659           break;
 660         case AC_S_QUOTE2:
 661           assert (ch == quote_char);
 662           ch = *p++;
 663           state = AC_S_DEFAULT;
 664           break;
 665         case AC_S_DASH1:
 666           assert (ch == '-');
 667           ch = *p++;
 668           state = AC_S_DASH2;
 669           break;
 670         case AC_S_DASH2:
 671           switch (ch)
 672             {
 673             case '-':
 674               ch = *p++;
 675               state = AC_S_COMMENT;
 676               break;
 677             default:
 678               state = AC_S_BACKOUT;
 679             }
 680           break;
 681         case AC_S_COMMENT:
 682           switch (ch)
 683             {
 684             case '-':
 685               state = AC_S_DASH3;
 686               break;
 687             default:
 688               ch = *p++;
 689               break;
 690             }
 691           break;
 692         case AC_S_DASH3:
 693           assert (ch == '-');
 694           ch = *p++;
 695           state = AC_S_DASH4;
 696           break;
 697         case AC_S_DASH4:
 698           switch (ch)
 699             {
 700             case '-':
 701               ch = *p++;
 702               state = AC_S_DEFAULT;
 703               break;
 704             default:
 705               state = AC_S_COMMENT;
 706               break;
 707             }
 708           break;
 709         }
 710     }
 711
 712   if (state == AC_S_BACKOUT)
 713     {
 714 #ifdef STANDALONE
 715       ++comment_backout_count;
 716 #endif
 717       return beg + 1;
 718     }
 719   return p;
 720 }
 721
 722 /* Find the first occurrence of the substring "-->" in [BEG, END) and
 723    return the pointer to the character after the substring.  If the
 724    substring is not found, return NULL.  */
 725
 726 static const char *
 727 find_comment_end (const char *beg, const char *end)
 728 {
 729   /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
 730      if it's not '>' or '-', advance by three characters.  Otherwise,
 731      look at the preceding characters and try to find a match.  */
 732
 733   const char *p = beg - 1;
 734
 735   while ((p += 3) < end)
 736     switch (p[0])
 737       {
 738       case '>':
 739         if (p[-1] == '-' && p[-2] == '-')
 740           return p + 1;
 741         break;
 742       case '-':
 743       at_dash:
 744         if (p[-1] == '-')
 745           {
 746           at_dash_dash:
 747             if (++p == end) return NULL;
 748             switch (p[0])
 749               {
 750               case '>': return p + 1;
 751               case '-': goto at_dash_dash;
 752               }
 753           }
 754         else
 755           {
 756             if ((p += 2) >= end) return NULL;
 757             switch (p[0])
 758               {
 759               case '>':
 760                 if (p[-1] == '-')
 761                   return p + 1;
 762                 break;
 763               case '-':
 764                 goto at_dash;
 765               }
 766           }
 767       }
 768   return NULL;
 769 }
 770 \f
 771 /* Return true if the string containing of characters inside [b, e) is
 772    present in hash table HT.  */
 773
 774 static bool
 775 name_allowed (const struct hash_table *ht, const char *b, const char *e)
 776 {
 777   char *copy;
 778   if (!ht)
 779     return true;
 780   BOUNDED_TO_ALLOCA (b, e, copy);
 781   return hash_table_get (ht, copy) != NULL;
 782 }
 783
 784 /* Advance P (a char pointer), with the explicit intent of being able
 785    to read the next character.  If this is not possible, go to finish.  */
 786
 787 #define ADVANCE(p) do {                         \
 788   ++p;                                          \
 789   if (p >= end)                                 \
 790     goto finish;                                \
 791 } while (0)
 792
 793 /* Skip whitespace, if any. */
 794
 795 #define SKIP_WS(p) do {                         \
 796   while (ISSPACE (*p)) {                        \
 797     ADVANCE (p);                                \
 798   }                                             \
 799 } while (0)
 800
 801 /* Skip non-whitespace, if any. */
 802
 803 #define SKIP_NON_WS(p) do {                     \
 804   while (!ISSPACE (*p)) {                       \
 805     ADVANCE (p);                                \
 806   }                                             \
 807 } while (0)
 808
 809 #ifdef STANDALONE
 810 static int tag_backout_count;
 811 #endif
 812
 813 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
 814    MAPFUN will be called with two arguments: pointer to an initialized
 815    struct taginfo, and MAPARG.
 816
 817    ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of
 818    which are the tags and attribute names that this function should
 819    use.  If ALLOWED_TAGS is NULL, all tags are processed; if
 820    ALLOWED_ATTRIBUTES is NULL, all attributes are returned.
 821
 822    (Obviously, the caller can filter out unwanted tags and attributes
 823    just as well, but this is just an optimization designed to avoid
 824    unnecessary copying of tags/attributes which the caller doesn't
 825    care about.)  */
 826
 827 void
 828 map_html_tags (const char *text, int size,
 829                void (*mapfun) (struct taginfo *, void *), void *maparg,
 830                int flags,
 831                const struct hash_table *allowed_tags,
 832                const struct hash_table *allowed_attributes)
 833 {
 834   /* storage for strings passed to MAPFUN callback; if 256 bytes is
 835      too little, POOL_APPEND allocates more with malloc. */
 836   char pool_initial_storage[256];
 837   struct pool pool;
 838
 839   const char *p = text;
 840   const char *end = text + size;
 841
 842   struct attr_pair attr_pair_initial_storage[8];
 843   int attr_pair_size = countof (attr_pair_initial_storage);
 844   bool attr_pair_resized = false;
 845   struct attr_pair *pairs = attr_pair_initial_storage;
 846
 847   struct tagstack_item *head = NULL;
 848   struct tagstack_item *tail = NULL;
 849
 850   if (!size)
 851     return;
 852
 853   POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));
 854
 855   {
 856     int nattrs, end_tag;
 857     const char *tag_name_begin, *tag_name_end;
 858     const char *tag_start_position;
 859     bool uninteresting_tag;
 860
 861   look_for_tag:
 862     POOL_REWIND (&pool);
 863
 864     nattrs = 0;
 865     end_tag = 0;
 866
 867     /* Find beginning of tag.  We use memchr() instead of the usual
 868        looping with ADVANCE() for speed. */
 869     p = memchr (p, '<', end - p);
 870     if (!p)
 871       goto finish;
 872
 873     tag_start_position = p;
 874     ADVANCE (p);
 875
 876     /* Establish the type of the tag (start-tag, end-tag or
 877        declaration).  */
 878     if (*p == '!')
 879       {
 880         if (!(flags & MHT_STRICT_COMMENTS)
 881             && p < end + 3 && p[1] == '-' && p[2] == '-')
 882           {
 883             /* If strict comments are not enforced and if we know
 884                we're looking at a comment, simply look for the
 885                terminating "-->".  Non-strict is the default because
 886                it works in other browsers and most HTML writers can't
 887                be bothered with getting the comments right.  */
 888             const char *comment_end = find_comment_end (p + 3, end);
 889             if (comment_end)
 890               p = comment_end;
 891           }
 892         else
 893           {
 894             /* Either in strict comment mode or looking at a non-empty
 895                declaration.  Real declarations are much less likely to
 896                be misused the way comments are, so advance over them
 897                properly regardless of strictness.  */
 898             p = advance_declaration (p, end);
 899           }
 900         if (p == end)
 901           goto finish;
 902         goto look_for_tag;
 903       }
 904     else if (*p == '/')
 905       {
 906         end_tag = 1;
 907         ADVANCE (p);
 908       }
 909     tag_name_begin = p;
 910     while (NAME_CHAR_P (*p))
 911       ADVANCE (p);
 912     if (p == tag_name_begin)
 913       goto look_for_tag;
 914     tag_name_end = p;
 915     SKIP_WS (p);
 916
 917     if (!end_tag)
 918       {
 919         struct tagstack_item *ts = tagstack_push (&head, &tail);
 920         if (ts)
 921           {
 922             ts->tagname_begin  = tag_name_begin;
 923             ts->tagname_end    = tag_name_end;
 924             ts->contents_begin = NULL;
 925           }
 926       }
 927
 928     if (end_tag && *p != '>')
 929       goto backout_tag;
 930
 931     if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
 932       /* We can't just say "goto look_for_tag" here because we need
 933          the loop below to properly advance over the tag's attributes.  */
 934       uninteresting_tag = true;
 935     else
 936       {
 937         uninteresting_tag = false;
 938         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
 939       }
 940
 941     /* Find the attributes. */
 942     while (1)
 943       {
 944         const char *attr_name_begin, *attr_name_end;
 945         const char *attr_value_begin, *attr_value_end;
 946         const char *attr_raw_value_begin, *attr_raw_value_end;
 947         int operation = AP_DOWNCASE; /* stupid compiler. */
 948
 949         SKIP_WS (p);
 950
 951         if (*p == '/')
 952           {
 953             /* A slash at this point means the tag is about to be
 954                closed.  This is legal in XML and has been popularized
 955                in HTML via XHTML.  */
 956             /* <foo a=b c=d /> */
 957             /*              ^  */
 958             ADVANCE (p);
 959             SKIP_WS (p);
 960             if (*p != '>')
 961               goto backout_tag;
 962           }
 963
 964         /* Check for end of tag definition. */
 965         if (*p == '>')
 966           break;
 967
 968         /* Establish bounds of attribute name. */
 969         attr_name_begin = p;    /* <foo bar ...> */
 970                                 /*      ^        */
 971         while (NAME_CHAR_P (*p))
 972           ADVANCE (p);
 973         attr_name_end = p;      /* <foo bar ...> */
 974                                 /*         ^     */
 975         if (attr_name_begin == attr_name_end)
 976           goto backout_tag;
 977
 978         /* Establish bounds of attribute value. */
 979         SKIP_WS (p);
 980         if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
 981           {
 982             /* Minimized attribute syntax allows `=' to be omitted.
 983                For example, <UL COMPACT> is a valid shorthand for <UL
 984                COMPACT="compact">.  Even if such attributes are not
 985                useful to Wget, we need to support them, so that the
 986                tags containing them can be parsed correctly. */
 987             attr_raw_value_begin = attr_value_begin = attr_name_begin;
 988             attr_raw_value_end = attr_value_end = attr_name_end;
 989           }
 990         else if (*p == '=')
 991           {
 992             ADVANCE (p);
 993             SKIP_WS (p);
 994             if (*p == '\"' || *p == '\'')
 995               {
 996                 bool newline_seen = false;
 997                 char quote_char = *p;
 998                 attr_raw_value_begin = p;
 999                 ADVANCE (p);
1000                 attr_value_begin = p; /* <foo bar="baz"> */
1001                                       /*           ^     */
1002                 while (*p != quote_char)
1003                   {
1004                     if (!newline_seen && *p == '\n')
1005                       {
1006                         /* If a newline is seen within the quotes, it
1007                            is most likely that someone forgot to close
1008                            the quote.  In that case, we back out to
1009                            the value beginning, and terminate the tag
1010                            at either `>' or the delimiter, whichever
1011                            comes first.  Such a tag terminated at `>'
1012                            is discarded.  */
1013                         p = attr_value_begin;
1014                         newline_seen = true;
1015                         continue;
1016                       }
1017                     else if (newline_seen && *p == '>')
1018                       break;
1019                     ADVANCE (p);
1020                   }
1021                 attr_value_end = p; /* <foo bar="baz"> */
1022                                     /*              ^  */
1023                 if (*p == quote_char)
1024                   ADVANCE (p);
1025                 else
1026                   goto look_for_tag;
1027                 attr_raw_value_end = p; /* <foo bar="baz"> */
1028                                         /*               ^ */
1029                 operation = AP_DECODE_ENTITIES;
1030                 if (flags & MHT_TRIM_VALUES)
1031                   operation |= AP_TRIM_BLANKS;
1032               }
1033             else
1034               {
1035                 attr_value_begin = p; /* <foo bar=baz> */
1036                                       /*          ^    */
1037                 /* According to SGML, a name token should consist only
1038                    of alphanumerics, . and -.  However, this is often
1039                    violated by, for instance, `%' in `width=75%'.
1040                    We'll be liberal and allow just about anything as
1041                    an attribute value.  */
1042                 while (!ISSPACE (*p) && *p != '>')
1043                   ADVANCE (p);
1044                 attr_value_end = p; /* <foo bar=baz qux=quix> */
1045                                     /*             ^          */
1046                 if (attr_value_begin == attr_value_end)
1047                   /* <foo bar=> */
1048                   /*          ^ */
1049                   goto backout_tag;
1050                 attr_raw_value_begin = attr_value_begin;
1051                 attr_raw_value_end = attr_value_end;
1052                 operation = AP_DECODE_ENTITIES;
1053               }
1054           }
1055         else
1056           {
1057             /* We skipped the whitespace and found something that is
1058                neither `=' nor the beginning of the next attribute's
1059                name.  Back out.  */
1060             goto backout_tag;   /* <foo bar [... */
1061                                 /*          ^    */
1062           }
1063
1064         /* If we're not interested in the tag, don't bother with any
1065            of the attributes.  */
1066         if (uninteresting_tag)
1067           continue;
1068
1069         /* If we aren't interested in the attribute, skip it.  We
1070            cannot do this test any sooner, because our text pointer
1071            needs to correctly advance over the attribute.  */
1072         if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
1073           continue;
1074
1075         GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
1076                     struct attr_pair);
1077
1078         pairs[nattrs].name_pool_index = pool.tail;
1079         convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
1080
1081         pairs[nattrs].value_pool_index = pool.tail;
1082         convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
1083         pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
1084         pairs[nattrs].value_raw_size = (attr_raw_value_end
1085                                         - attr_raw_value_begin);
1086         ++nattrs;
1087       }
1088
1089     if (!end_tag && tail && (tail->tagname_begin == tag_name_begin))
1090       {
1091         tail->contents_begin = p+1;
1092       }
1093
1094     if (uninteresting_tag)
1095       {
1096         ADVANCE (p);
1097         goto look_for_tag;
1098       }
1099
1100     /* By now, we have a valid tag with a name and zero or more
1101        attributes.  Fill in the data and call the mapper function.  */
1102     {
1103       int i;
1104       struct taginfo taginfo;
1105       struct tagstack_item *ts = NULL;
1106
1107       taginfo.name      = pool.contents;
1108       taginfo.end_tag_p = end_tag;
1109       taginfo.nattrs    = nattrs;
1110       /* We fill in the char pointers only now, when pool can no
1111          longer get realloc'ed.  If we did that above, we could get
1112          hosed by reallocation.  Obviously, after this point, the pool
1113          may no longer be grown.  */
1114       for (i = 0; i < nattrs; i++)
1115         {
1116           pairs[i].name = pool.contents + pairs[i].name_pool_index;
1117           pairs[i].value = pool.contents + pairs[i].value_pool_index;
1118         }
1119       taginfo.attrs = pairs;
1120       taginfo.start_position = tag_start_position;
1121       taginfo.end_position   = p + 1;
1122       taginfo.contents_begin = NULL;
1123       taginfo.contents_end = NULL;
1124
1125       if (end_tag)
1126         {
1127           ts = tagstack_find (tail, tag_name_begin, tag_name_end);
1128           if (ts)
1129             {
1130               if (ts->contents_begin)
1131                 {
1132                   taginfo.contents_begin = ts->contents_begin;
1133                   taginfo.contents_end   = tag_start_position;
1134                 }
1135               tagstack_pop (&head, &tail, ts);
1136             }
1137         }
1138
1139       mapfun (&taginfo, maparg);
1140       ADVANCE (p);
1141     }
1142     goto look_for_tag;
1143
1144   backout_tag:
1145 #ifdef STANDALONE
1146     ++tag_backout_count;
1147 #endif
1148     /* The tag wasn't really a tag.  Treat its contents as ordinary
1149        data characters. */
1150     p = tag_start_position + 1;
1151     goto look_for_tag;
1152   }
1153
1154  finish:
1155   POOL_FREE (&pool);
1156   if (attr_pair_resized)
1157     xfree (pairs);
1158   /* pop any tag stack that's left */
1159   tagstack_pop (&head, &tail, head);
1160 }
1161
1162 #undef ADVANCE
1163 #undef SKIP_WS
1164 #undef SKIP_NON_WS
1165 \f
1166 #ifdef STANDALONE
1167 static void
1168 test_mapper (struct taginfo *taginfo, void *arg)
1169 {
1170   int i;
1171
1172   printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
1173   for (i = 0; i < taginfo->nattrs; i++)
1174     printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
1175   putchar ('\n');
1176   ++*(int *)arg;
1177 }
1178
1179 int main ()
1180 {
1181   int size = 256;
1182   char *x = xmalloc (size);
1183   int length = 0;
1184   int read_count;
1185   int tag_counter = 0;
1186
1187   while ((read_count = fread (x + length, 1, size - length, stdin)))
1188     {
1189       length += read_count;
1190       size <<= 1;
1191       x = xrealloc (x, size);
1192     }
1193
1194   map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);
1195   printf ("TAGS: %d\n", tag_counter);
1196   printf ("Tag backouts:     %d\n", tag_backout_count);
1197   printf ("Comment backouts: %d\n", comment_backout_count);
1198   return 0;
1199 }
1200 #endif /* STANDALONE */