sjero.net Git - wget/blob - src/html-parse.c

   1 /* HTML parser for Wget.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 /* The only entry point to this module is map_html_tags(), which see.  */
  32
  33 /* TODO:
  34
  35    - Allow hooks for callers to process contents outside tags.  This
  36      is needed to implement handling <style> and <script>.  The
  37      taginfo structure already carries the information about where the
  38      tags are, but this is not enough, because one would also want to
  39      skip the comments.  (The funny thing is that for <style> and
  40      <script> you *don't* want to skip comments!)
  41
  42    - Create a test suite for regression testing. */
  43
  44 /* HISTORY:
  45
  46    This is the third HTML parser written for Wget.  The first one was
  47    written some time during the Geturl 1.0 beta cycle, and was very
  48    inefficient and buggy.  It also contained some very complex code to
  49    remember a list of parser states, because it was supposed to be
  50    reentrant.
  51
  52    The second HTML parser was written for Wget 1.4 (the first version
  53    by the name `Wget'), and was a complete rewrite.  Although the new
  54    parser behaved much better and made no claims of reentrancy, it
  55    still shared many of the fundamental flaws of the old version -- it
  56    only regarded HTML in terms tag-attribute pairs, where the
  57    attribute's value was a URL to be returned.  Any other property of
  58    HTML, such as <base href=...>, or strange way to specify a URL,
  59    such as <meta http-equiv=Refresh content="0; URL=..."> had to be
  60    crudely hacked in -- and the caller had to be aware of these hacks.
  61    Like its predecessor, this parser did not support HTML comments.
  62
  63    After Wget 1.5.1 was released, I set out to write a third HTML
  64    parser.  The objectives of the new parser were to: (1) provide a
  65    clean way to analyze HTML lexically, (2) separate interpretation of
  66    the markup from the parsing process, (3) be as correct as possible,
  67    e.g. correctly skipping comments and other SGML declarations, (4)
  68    understand the most common errors in markup and skip them or be
  69    relaxed towrds them, and (5) be reasonably efficient (no regexps,
  70    minimum copying and minimum or no heap allocation).
  71
  72    I believe this parser meets all of the above goals.  It is
  73    reasonably well structured, and could be relatively easily
  74    separated from Wget and used elsewhere.  While some of its
  75    intrinsic properties limit its value as a general-purpose HTML
  76    parser, I believe that, with minimum modifications, it could serve
  77    as a backend for one.
  78
  79    Due to time and other constraints, this parser was not integrated
  80    into Wget until the version 1.7. */
  81
  82 /* DESCRIPTION:
  83
  84    The single entry point of this parser is map_html_tags(), which
  85    works by calling a function you specify for each tag.  The function
  86    gets called with the pointer to a structure describing the tag and
  87    its attributes.  */
  88
  89 /* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
  90    still need Wget headers to compile.  */
  91
  92 #include "wget.h"
  93
  94 #ifdef STANDALONE
  95 # define I_REALLY_WANT_CTYPE_MACROS
  96 #endif
  97
  98 #include <stdio.h>
  99 #include <stdlib.h>
 100 #include <string.h>
 101 #include <assert.h>
 102
 103 #include "utils.h"
 104 #include "html-parse.h"
 105
 106 #ifdef STANDALONE
 107 # undef xmalloc
 108 # undef xrealloc
 109 # undef xfree
 110 # define xmalloc malloc
 111 # define xrealloc realloc
 112 # define xfree free
 113
 114 # undef c_isspace
 115 # undef c_isdigit
 116 # undef c_isxdigit
 117 # undef c_isalpha
 118 # undef c_isalnum
 119 # undef c_tolower
 120 # undef c_toupper
 121
 122 # define c_isspace(x) isspace (x)
 123 # define c_isdigit(x) isdigit (x)
 124 # define c_isxdigit(x) isxdigit (x)
 125 # define c_isalpha(x) isalpha (x)
 126 # define c_isalnum(x) isalnum (x)
 127 # define c_tolower(x) tolower (x)
 128 # define c_toupper(x) toupper (x)
 129
 130 struct hash_table {
 131   int dummy;
 132 };
 133 static void *
 134 hash_table_get (const struct hash_table *ht, void *ptr)
 135 {
 136   return ptr;
 137 }
 138 #else  /* not STANDALONE */
 139 # include "hash.h"
 140 #endif
 141
 142 /* Pool support.  A pool is a resizable chunk of memory.  It is first
 143    allocated on the stack, and moved to the heap if it needs to be
 144    larger than originally expected.  map_html_tags() uses it to store
 145    the zero-terminated names and values of tags and attributes.
 146
 147    Thus taginfo->name, and attr->name and attr->value for each
 148    attribute, do not point into separately allocated areas, but into
 149    different parts of the pool, separated only by terminating zeros.
 150    This ensures minimum amount of allocation and, for most tags, no
 151    allocation because the entire pool is kept on the stack.  */
 152
 153 struct pool {
 154   char *contents;               /* pointer to the contents. */
 155   int size;                     /* size of the pool. */
 156   int tail;                     /* next available position index. */
 157   bool resized;                 /* whether the pool has been resized
 158                                    using malloc. */
 159
 160   char *orig_contents;          /* original pool contents, usually
 161                                    stack-allocated.  used by POOL_FREE
 162                                    to restore the pool to the initial
 163                                    state. */
 164   int orig_size;
 165 };
 166
 167 /* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
 168
 169 #define POOL_INIT(p, initial_storage, initial_size) do {        \
 170   struct pool *P = (p);                                         \
 171   P->contents = (initial_storage);                              \
 172   P->size = (initial_size);                                     \
 173   P->tail = 0;                                                  \
 174   P->resized = false;                                           \
 175   P->orig_contents = P->contents;                               \
 176   P->orig_size = P->size;                                       \
 177 } while (0)
 178
 179 /* Grow the pool to accomodate at least SIZE new bytes.  If the pool
 180    already has room to accomodate SIZE bytes of data, this is a no-op.  */
 181
 182 #define POOL_GROW(p, increase)                                  \
 183   GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + (increase), \
 184               (p)->resized, char)
 185
 186 /* Append text in the range [beg, end) to POOL.  No zero-termination
 187    is done.  */
 188
 189 #define POOL_APPEND(p, beg, end) do {                   \
 190   const char *PA_beg = (beg);                           \
 191   int PA_size = (end) - PA_beg;                         \
 192   POOL_GROW (p, PA_size);                               \
 193   memcpy ((p)->contents + (p)->tail, PA_beg, PA_size);  \
 194   (p)->tail += PA_size;                                 \
 195 } while (0)
 196
 197 /* Append one character to the pool.  Can be used to zero-terminate
 198    pool strings.  */
 199
 200 #define POOL_APPEND_CHR(p, ch) do {             \
 201   char PAC_char = (ch);                         \
 202   POOL_GROW (p, 1);                             \
 203   (p)->contents[(p)->tail++] = PAC_char;        \
 204 } while (0)
 205
 206 /* Forget old pool contents.  The allocated memory is not freed. */
 207 #define POOL_REWIND(p) (p)->tail = 0
 208
 209 /* Free heap-allocated memory for contents of POOL.  This calls
 210    xfree() if the memory was allocated through malloc.  It also
 211    restores `contents' and `size' to their original, pre-malloc
 212    values.  That way after POOL_FREE, the pool is fully usable, just
 213    as if it were freshly initialized with POOL_INIT.  */
 214
 215 #define POOL_FREE(p) do {                       \
 216   struct pool *P = p;                           \
 217   if (P->resized)                               \
 218     xfree (P->contents);                        \
 219   P->contents = P->orig_contents;               \
 220   P->size = P->orig_size;                       \
 221   P->tail = 0;                                  \
 222   P->resized = false;                           \
 223 } while (0)
 224
 225 /* Used for small stack-allocated memory chunks that might grow.  Like
 226    DO_REALLOC, this macro grows BASEVAR as necessary to take
 227    NEEDED_SIZE items of TYPE.
 228
 229    The difference is that on the first resize, it will use
 230    malloc+memcpy rather than realloc.  That way you can stack-allocate
 231    the initial chunk, and only resort to heap allocation if you
 232    stumble upon large data.
 233
 234    After the first resize, subsequent ones are performed with realloc,
 235    just like DO_REALLOC.  */
 236
 237 #define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do {           \
 238   long ga_needed_size = (needed_size);                                          \
 239   long ga_newsize = (sizevar);                                                  \
 240   while (ga_newsize < ga_needed_size)                                           \
 241     ga_newsize <<= 1;                                                           \
 242   if (ga_newsize != (sizevar))                                                  \
 243     {                                                                           \
 244       if (resized)                                                              \
 245         basevar = xrealloc (basevar, ga_newsize * sizeof (type));               \
 246       else                                                                      \
 247         {                                                                       \
 248           void *ga_new = xmalloc (ga_newsize * sizeof (type));                  \
 249           memcpy (ga_new, basevar, (sizevar) * sizeof (type));                  \
 250           (basevar) = ga_new;                                                   \
 251           resized = true;                                                       \
 252         }                                                                       \
 253       (sizevar) = ga_newsize;                                                   \
 254     }                                                                           \
 255 } while (0)
 256 \f
 257 /* Test whether n+1-sized entity name fits in P.  We don't support
 258    IE-style non-terminated entities, e.g. "&ltfoo" -> "<foo".
 259    However, "&lt;foo" will work, as will "&lt!foo", "&lt", etc.  In
 260    other words an entity needs to be terminated by either a
 261    non-alphanumeric or the end of string.  */
 262 #define FITS(p, n) (p + n == end || (p + n < end && !c_isalnum (p[n])))
 263
 264 /* Macros that test entity names by returning true if P is followed by
 265    the specified characters.  */
 266 #define ENT1(p, c0) (FITS (p, 1) && p[0] == c0)
 267 #define ENT2(p, c0, c1) (FITS (p, 2) && p[0] == c0 && p[1] == c1)
 268 #define ENT3(p, c0, c1, c2) (FITS (p, 3) && p[0]==c0 && p[1]==c1 && p[2]==c2)
 269
 270 /* Increment P by INC chars.  If P lands at a semicolon, increment it
 271    past the semicolon.  This ensures that e.g. "&lt;foo" is converted
 272    to "<foo", but "&lt,foo" to "<,foo".  */
 273 #define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)
 274
 275 struct tagstack_item {
 276   const char *tagname_begin;
 277   const char *tagname_end;
 278   const char *contents_begin;
 279   struct tagstack_item *prev;
 280   struct tagstack_item *next;
 281 };
 282
 283 static struct tagstack_item *
 284 tagstack_push (struct tagstack_item **head, struct tagstack_item **tail)
 285 {
 286   struct tagstack_item *ts = xmalloc(sizeof(struct tagstack_item));
 287   if (*head == NULL)
 288     {
 289       *head = *tail = ts;
 290       ts->prev = ts->next = NULL;
 291     }
 292   else
 293     {
 294       (*tail)->next = ts;
 295       ts->prev = *tail;
 296       *tail = ts;
 297       ts->next = NULL;
 298     }
 299
 300   return ts;
 301 }
 302
 303 /* remove ts and everything after it from the stack */
 304 static void
 305 tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail,
 306               struct tagstack_item *ts)
 307 {
 308   if (*head == NULL)
 309     return;
 310
 311   if (ts == *tail)
 312     {
 313       if (ts == *head)
 314         {
 315           xfree (ts);
 316           *head = *tail = NULL;
 317         }
 318       else
 319         {
 320           ts->prev->next = NULL;
 321           *tail = ts->prev;
 322           xfree (ts);
 323         }
 324     }
 325   else
 326     {
 327       if (ts == *head)
 328         {
 329           *head = NULL;
 330         }
 331       *tail = ts->prev;
 332
 333       if (ts->prev)
 334         {
 335           ts->prev->next = NULL;
 336         }
 337       while (ts)
 338         {
 339           struct tagstack_item *p = ts->next;
 340           xfree (ts);
 341           ts = p;
 342         }
 343     }
 344 }
 345
 346 static struct tagstack_item *
 347 tagstack_find (struct tagstack_item *tail, const char *tagname_begin,
 348                const char *tagname_end)
 349 {
 350   int len = tagname_end - tagname_begin;
 351   while (tail)
 352     {
 353       if (len == (tail->tagname_end - tail->tagname_begin))
 354         {
 355           if (0 == strncasecmp (tail->tagname_begin, tagname_begin, len))
 356             return tail;
 357         }
 358       tail = tail->prev;
 359     }
 360   return NULL;
 361 }
 362
 363 /* Decode the HTML character entity at *PTR, considering END to be end
 364    of buffer.  It is assumed that the "&" character that marks the
 365    beginning of the entity has been seen at *PTR-1.  If a recognized
 366    ASCII entity is seen, it is returned, and *PTR is moved to the end
 367    of the entity.  Otherwise, -1 is returned and *PTR left unmodified.
 368
 369    The recognized entities are: &lt, &gt, &amp, &apos, and &quot.  */
 370
 371 static int
 372 decode_entity (const char **ptr, const char *end)
 373 {
 374   const char *p = *ptr;
 375   int value = -1;
 376
 377   if (++p == end)
 378     return -1;
 379
 380   switch (*p++)
 381     {
 382     case '#':
 383       /* Process numeric entities "&#DDD;" and "&#xHH;".  */
 384       {
 385         int digits = 0;
 386         value = 0;
 387         if (*p == 'x')
 388           for (++p; value < 256 && p < end && c_isxdigit (*p); p++, digits++)
 389             value = (value << 4) + XDIGIT_TO_NUM (*p);
 390         else
 391           for (; value < 256 && p < end && c_isdigit (*p); p++, digits++)
 392             value = (value * 10) + (*p - '0');
 393         if (!digits)
 394           return -1;
 395         /* Don't interpret 128+ codes and NUL because we cannot
 396            portably reinserted them into HTML.  */
 397         if (!value || (value & ~0x7f))
 398           return -1;
 399         *ptr = SKIP_SEMI (p, 0);
 400         return value;
 401       }
 402     /* Process named ASCII entities.  */
 403     case 'g':
 404       if (ENT1 (p, 't'))
 405         value = '>', *ptr = SKIP_SEMI (p, 1);
 406       break;
 407     case 'l':
 408       if (ENT1 (p, 't'))
 409         value = '<', *ptr = SKIP_SEMI (p, 1);
 410       break;
 411     case 'a':
 412       if (ENT2 (p, 'm', 'p'))
 413         value = '&', *ptr = SKIP_SEMI (p, 2);
 414       else if (ENT3 (p, 'p', 'o', 's'))
 415         /* handle &apos for the sake of the XML/XHTML crowd. */
 416         value = '\'', *ptr = SKIP_SEMI (p, 3);
 417       break;
 418     case 'q':
 419       if (ENT3 (p, 'u', 'o', 't'))
 420         value = '\"', *ptr = SKIP_SEMI (p, 3);
 421       break;
 422     }
 423   return value;
 424 }
 425 #undef ENT1
 426 #undef ENT2
 427 #undef ENT3
 428 #undef FITS
 429 #undef SKIP_SEMI
 430
 431 enum {
 432   AP_DOWNCASE           = 1,
 433   AP_DECODE_ENTITIES    = 2,
 434   AP_TRIM_BLANKS        = 4
 435 };
 436
 437 /* Copy the text in the range [BEG, END) to POOL, optionally
 438    performing operations specified by FLAGS.  FLAGS may be any
 439    combination of AP_DOWNCASE, AP_DECODE_ENTITIES and AP_TRIM_BLANKS
 440    with the following meaning:
 441
 442    * AP_DOWNCASE -- downcase all the letters;
 443
 444    * AP_DECODE_ENTITIES -- decode the named and numeric entities in
 445      the ASCII range when copying the string.
 446
 447    * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
 448      of text, as well as embedded newlines.  */
 449
 450 static void
 451 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
 452 {
 453   int old_tail = pool->tail;
 454
 455   /* Skip blanks if required.  We must do this before entities are
 456      processed, so that blanks can still be inserted as, for instance,
 457      `&#32;'.  */
 458   if (flags & AP_TRIM_BLANKS)
 459     {
 460       while (beg < end && c_isspace (*beg))
 461         ++beg;
 462       while (end > beg && c_isspace (end[-1]))
 463         --end;
 464     }
 465
 466   if (flags & AP_DECODE_ENTITIES)
 467     {
 468       /* Grow the pool, then copy the text to the pool character by
 469          character, processing the encountered entities as we go
 470          along.
 471
 472          It's safe (and necessary) to grow the pool in advance because
 473          processing the entities can only *shorten* the string, it can
 474          never lengthen it.  */
 475       const char *from = beg;
 476       char *to;
 477       bool squash_newlines = !!(flags & AP_TRIM_BLANKS);
 478
 479       POOL_GROW (pool, end - beg);
 480       to = pool->contents + pool->tail;
 481
 482       while (from < end)
 483         {
 484           if (*from == '&')
 485             {
 486               int entity = decode_entity (&from, end);
 487               if (entity != -1)
 488                 *to++ = entity;
 489               else
 490                 *to++ = *from++;
 491             }
 492           else if ((*from == '\n' || *from == '\r') && squash_newlines)
 493             ++from;
 494           else
 495             *to++ = *from++;
 496         }
 497       /* Verify that we haven't exceeded the original size.  (It
 498          shouldn't happen, hence the assert.)  */
 499       assert (to - (pool->contents + pool->tail) <= end - beg);
 500
 501       /* Make POOL's tail point to the position following the string
 502          we've written.  */
 503       pool->tail = to - pool->contents;
 504       POOL_APPEND_CHR (pool, '\0');
 505     }
 506   else
 507     {
 508       /* Just copy the text to the pool.  */
 509       POOL_APPEND (pool, beg, end);
 510       POOL_APPEND_CHR (pool, '\0');
 511     }
 512
 513   if (flags & AP_DOWNCASE)
 514     {
 515       char *p = pool->contents + old_tail;
 516       for (; *p; p++)
 517         *p = c_tolower (*p);
 518     }
 519 }
 520 \f
 521 /* Originally we used to adhere to rfc 1866 here, and allowed only
 522    letters, digits, periods, and hyphens as names (of tags or
 523    attributes).  However, this broke too many pages which used
 524    proprietary or strange attributes, e.g. <img src="a.gif"
 525    v:shapes="whatever">.
 526
 527    So now we allow any character except:
 528      * whitespace
 529      * 8-bit and control chars
 530      * characters that clearly cannot be part of name:
 531        '=', '<', '>', '/'.
 532
 533    This only affects attribute and tag names; attribute values allow
 534    an even greater variety of characters.  */
 535
 536 #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
 537                         && (x) != '=' && (x) != '<' && (x) != '>'       \
 538                         && (x) != '/')
 539
 540 #ifdef STANDALONE
 541 static int comment_backout_count;
 542 #endif
 543
 544 /* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
 545    strict comments mode, this is used for skipping over comments as
 546    well.
 547
 548    To recap: any SGML declaration may have comments associated with
 549    it, e.g.
 550        <!MY-DECL -- isn't this fun? -- foo bar>
 551
 552    An HTML comment is merely an empty declaration (<!>) with a comment
 553    attached, like this:
 554        <!-- some stuff here -->
 555
 556    Several comments may be embedded in one comment declaration:
 557        <!-- have -- -- fun -->
 558
 559    Whitespace is allowed between and after the comments, but not
 560    before the first comment.  Additionally, this function attempts to
 561    handle double quotes in SGML declarations correctly.  */
 562
 563 static const char *
 564 advance_declaration (const char *beg, const char *end)
 565 {
 566   const char *p = beg;
 567   char quote_char = '\0';       /* shut up, gcc! */
 568   char ch;
 569
 570   enum {
 571     AC_S_DONE,
 572     AC_S_BACKOUT,
 573     AC_S_BANG,
 574     AC_S_DEFAULT,
 575     AC_S_DCLNAME,
 576     AC_S_DASH1,
 577     AC_S_DASH2,
 578     AC_S_COMMENT,
 579     AC_S_DASH3,
 580     AC_S_DASH4,
 581     AC_S_QUOTE1,
 582     AC_S_IN_QUOTE,
 583     AC_S_QUOTE2
 584   } state = AC_S_BANG;
 585
 586   if (beg == end)
 587     return beg;
 588   ch = *p++;
 589
 590   /* It looked like a good idea to write this as a state machine, but
 591      now I wonder...  */
 592
 593   while (state != AC_S_DONE && state != AC_S_BACKOUT)
 594     {
 595       if (p == end)
 596         state = AC_S_BACKOUT;
 597       switch (state)
 598         {
 599         case AC_S_DONE:
 600         case AC_S_BACKOUT:
 601           break;
 602         case AC_S_BANG:
 603           if (ch == '!')
 604             {
 605               ch = *p++;
 606               state = AC_S_DEFAULT;
 607             }
 608           else
 609             state = AC_S_BACKOUT;
 610           break;
 611         case AC_S_DEFAULT:
 612           switch (ch)
 613             {
 614             case '-':
 615               state = AC_S_DASH1;
 616               break;
 617             case ' ':
 618             case '\t':
 619             case '\r':
 620             case '\n':
 621               ch = *p++;
 622               break;
 623             case '<':
 624             case '>':
 625               state = AC_S_DONE;
 626               break;
 627             case '\'':
 628             case '\"':
 629               state = AC_S_QUOTE1;
 630               break;
 631             default:
 632               if (NAME_CHAR_P (ch))
 633                 state = AC_S_DCLNAME;
 634               else
 635                 state = AC_S_BACKOUT;
 636               break;
 637             }
 638           break;
 639         case AC_S_DCLNAME:
 640           if (ch == '-')
 641             state = AC_S_DASH1;
 642           else if (NAME_CHAR_P (ch))
 643             ch = *p++;
 644           else
 645             state = AC_S_DEFAULT;
 646           break;
 647         case AC_S_QUOTE1:
 648           /* We must use 0x22 because broken assert macros choke on
 649              '"' and '\"'.  */
 650           assert (ch == '\'' || ch == 0x22);
 651           quote_char = ch;      /* cheating -- I really don't feel like
 652                                    introducing more different states for
 653                                    different quote characters. */
 654           ch = *p++;
 655           state = AC_S_IN_QUOTE;
 656           break;
 657         case AC_S_IN_QUOTE:
 658           if (ch == quote_char)
 659             state = AC_S_QUOTE2;
 660           else
 661             ch = *p++;
 662           break;
 663         case AC_S_QUOTE2:
 664           assert (ch == quote_char);
 665           ch = *p++;
 666           state = AC_S_DEFAULT;
 667           break;
 668         case AC_S_DASH1:
 669           assert (ch == '-');
 670           ch = *p++;
 671           state = AC_S_DASH2;
 672           break;
 673         case AC_S_DASH2:
 674           switch (ch)
 675             {
 676             case '-':
 677               ch = *p++;
 678               state = AC_S_COMMENT;
 679               break;
 680             default:
 681               state = AC_S_BACKOUT;
 682             }
 683           break;
 684         case AC_S_COMMENT:
 685           switch (ch)
 686             {
 687             case '-':
 688               state = AC_S_DASH3;
 689               break;
 690             default:
 691               ch = *p++;
 692               break;
 693             }
 694           break;
 695         case AC_S_DASH3:
 696           assert (ch == '-');
 697           ch = *p++;
 698           state = AC_S_DASH4;
 699           break;
 700         case AC_S_DASH4:
 701           switch (ch)
 702             {
 703             case '-':
 704               ch = *p++;
 705               state = AC_S_DEFAULT;
 706               break;
 707             default:
 708               state = AC_S_COMMENT;
 709               break;
 710             }
 711           break;
 712         }
 713     }
 714
 715   if (state == AC_S_BACKOUT)
 716     {
 717 #ifdef STANDALONE
 718       ++comment_backout_count;
 719 #endif
 720       return beg + 1;
 721     }
 722   return p;
 723 }
 724
 725 /* Find the first occurrence of the substring "-->" in [BEG, END) and
 726    return the pointer to the character after the substring.  If the
 727    substring is not found, return NULL.  */
 728
 729 static const char *
 730 find_comment_end (const char *beg, const char *end)
 731 {
 732   /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
 733      if it's not '>' or '-', advance by three characters.  Otherwise,
 734      look at the preceding characters and try to find a match.  */
 735
 736   const char *p = beg - 1;
 737
 738   while ((p += 3) < end)
 739     switch (p[0])
 740       {
 741       case '>':
 742         if (p[-1] == '-' && p[-2] == '-')
 743           return p + 1;
 744         break;
 745       case '-':
 746       at_dash:
 747         if (p[-1] == '-')
 748           {
 749           at_dash_dash:
 750             if (++p == end) return NULL;
 751             switch (p[0])
 752               {
 753               case '>': return p + 1;
 754               case '-': goto at_dash_dash;
 755               }
 756           }
 757         else
 758           {
 759             if ((p += 2) >= end) return NULL;
 760             switch (p[0])
 761               {
 762               case '>':
 763                 if (p[-1] == '-')
 764                   return p + 1;
 765                 break;
 766               case '-':
 767                 goto at_dash;
 768               }
 769           }
 770       }
 771   return NULL;
 772 }
 773 \f
 774 /* Return true if the string containing of characters inside [b, e) is
 775    present in hash table HT.  */
 776
 777 static bool
 778 name_allowed (const struct hash_table *ht, const char *b, const char *e)
 779 {
 780   char *copy;
 781   if (!ht)
 782     return true;
 783   BOUNDED_TO_ALLOCA (b, e, copy);
 784   return hash_table_get (ht, copy) != NULL;
 785 }
 786
 787 /* Advance P (a char pointer), with the explicit intent of being able
 788    to read the next character.  If this is not possible, go to finish.  */
 789
 790 #define ADVANCE(p) do {                         \
 791   ++p;                                          \
 792   if (p >= end)                                 \
 793     goto finish;                                \
 794 } while (0)
 795
 796 /* Skip whitespace, if any. */
 797
 798 #define SKIP_WS(p) do {                         \
 799   while (c_isspace (*p)) {                        \
 800     ADVANCE (p);                                \
 801   }                                             \
 802 } while (0)
 803
 804 /* Skip non-whitespace, if any. */
 805
 806 #define SKIP_NON_WS(p) do {                     \
 807   while (!c_isspace (*p)) {                       \
 808     ADVANCE (p);                                \
 809   }                                             \
 810 } while (0)
 811
 812 #ifdef STANDALONE
 813 static int tag_backout_count;
 814 #endif
 815
 816 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
 817    MAPFUN will be called with two arguments: pointer to an initialized
 818    struct taginfo, and MAPARG.
 819
 820    ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of
 821    which are the tags and attribute names that this function should
 822    use.  If ALLOWED_TAGS is NULL, all tags are processed; if
 823    ALLOWED_ATTRIBUTES is NULL, all attributes are returned.
 824
 825    (Obviously, the caller can filter out unwanted tags and attributes
 826    just as well, but this is just an optimization designed to avoid
 827    unnecessary copying of tags/attributes which the caller doesn't
 828    care about.)  */
 829
 830 void
 831 map_html_tags (const char *text, int size,
 832                void (*mapfun) (struct taginfo *, void *), void *maparg,
 833                int flags,
 834                const struct hash_table *allowed_tags,
 835                const struct hash_table *allowed_attributes)
 836 {
 837   /* storage for strings passed to MAPFUN callback; if 256 bytes is
 838      too little, POOL_APPEND allocates more with malloc. */
 839   char pool_initial_storage[256];
 840   struct pool pool;
 841
 842   const char *p = text;
 843   const char *end = text + size;
 844
 845   struct attr_pair attr_pair_initial_storage[8];
 846   int attr_pair_size = countof (attr_pair_initial_storage);
 847   bool attr_pair_resized = false;
 848   struct attr_pair *pairs = attr_pair_initial_storage;
 849
 850   struct tagstack_item *head = NULL;
 851   struct tagstack_item *tail = NULL;
 852
 853   if (!size)
 854     return;
 855
 856   POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));
 857
 858   {
 859     int nattrs, end_tag;
 860     const char *tag_name_begin, *tag_name_end;
 861     const char *tag_start_position;
 862     bool uninteresting_tag;
 863
 864   look_for_tag:
 865     POOL_REWIND (&pool);
 866
 867     nattrs = 0;
 868     end_tag = 0;
 869
 870     /* Find beginning of tag.  We use memchr() instead of the usual
 871        looping with ADVANCE() for speed. */
 872     p = memchr (p, '<', end - p);
 873     if (!p)
 874       goto finish;
 875
 876     tag_start_position = p;
 877     ADVANCE (p);
 878
 879     /* Establish the type of the tag (start-tag, end-tag or
 880        declaration).  */
 881     if (*p == '!')
 882       {
 883         if (!(flags & MHT_STRICT_COMMENTS)
 884             && p < end + 3 && p[1] == '-' && p[2] == '-')
 885           {
 886             /* If strict comments are not enforced and if we know
 887                we're looking at a comment, simply look for the
 888                terminating "-->".  Non-strict is the default because
 889                it works in other browsers and most HTML writers can't
 890                be bothered with getting the comments right.  */
 891             const char *comment_end = find_comment_end (p + 3, end);
 892             if (comment_end)
 893               p = comment_end;
 894           }
 895         else
 896           {
 897             /* Either in strict comment mode or looking at a non-empty
 898                declaration.  Real declarations are much less likely to
 899                be misused the way comments are, so advance over them
 900                properly regardless of strictness.  */
 901             p = advance_declaration (p, end);
 902           }
 903         if (p == end)
 904           goto finish;
 905         goto look_for_tag;
 906       }
 907     else if (*p == '/')
 908       {
 909         end_tag = 1;
 910         ADVANCE (p);
 911       }
 912     tag_name_begin = p;
 913     while (NAME_CHAR_P (*p))
 914       ADVANCE (p);
 915     if (p == tag_name_begin)
 916       goto look_for_tag;
 917     tag_name_end = p;
 918     SKIP_WS (p);
 919
 920     if (!end_tag)
 921       {
 922         struct tagstack_item *ts = tagstack_push (&head, &tail);
 923         if (ts)
 924           {
 925             ts->tagname_begin  = tag_name_begin;
 926             ts->tagname_end    = tag_name_end;
 927             ts->contents_begin = NULL;
 928           }
 929       }
 930
 931     if (end_tag && *p != '>' && *p != '<')
 932       goto backout_tag;
 933
 934     if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
 935       /* We can't just say "goto look_for_tag" here because we need
 936          the loop below to properly advance over the tag's attributes.  */
 937       uninteresting_tag = true;
 938     else
 939       {
 940         uninteresting_tag = false;
 941         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
 942       }
 943
 944     /* Find the attributes. */
 945     while (1)
 946       {
 947         const char *attr_name_begin, *attr_name_end;
 948         const char *attr_value_begin, *attr_value_end;
 949         const char *attr_raw_value_begin, *attr_raw_value_end;
 950         int operation = AP_DOWNCASE; /* stupid compiler. */
 951
 952         SKIP_WS (p);
 953
 954         if (*p == '/')
 955           {
 956             /* A slash at this point means the tag is about to be
 957                closed.  This is legal in XML and has been popularized
 958                in HTML via XHTML.  */
 959             /* <foo a=b c=d /> */
 960             /*              ^  */
 961             ADVANCE (p);
 962             SKIP_WS (p);
 963             if (*p != '<' && *p != '>')
 964               goto backout_tag;
 965           }
 966
 967         /* Check for end of tag definition. */
 968         if (*p == '<' || *p == '>')
 969           break;
 970
 971         /* Establish bounds of attribute name. */
 972         attr_name_begin = p;    /* <foo bar ...> */
 973                                 /*      ^        */
 974         while (NAME_CHAR_P (*p))
 975           ADVANCE (p);
 976         attr_name_end = p;      /* <foo bar ...> */
 977                                 /*         ^     */
 978         if (attr_name_begin == attr_name_end)
 979           goto backout_tag;
 980
 981         /* Establish bounds of attribute value. */
 982         SKIP_WS (p);
 983
 984         if (NAME_CHAR_P (*p) || *p == '/' || *p == '<' || *p == '>')
 985           {
 986             /* Minimized attribute syntax allows `=' to be omitted.
 987                For example, <UL COMPACT> is a valid shorthand for <UL
 988                COMPACT="compact">.  Even if such attributes are not
 989                useful to Wget, we need to support them, so that the
 990                tags containing them can be parsed correctly. */
 991             attr_raw_value_begin = attr_value_begin = attr_name_begin;
 992             attr_raw_value_end = attr_value_end = attr_name_end;
 993           }
 994         else if (*p == '=')
 995           {
 996             ADVANCE (p);
 997             SKIP_WS (p);
 998             if (*p == '\"' || *p == '\'')
 999               {
1000                 bool newline_seen = false;
1001                 char quote_char = *p;
1002                 attr_raw_value_begin = p;
1003                 ADVANCE (p);
1004                 attr_value_begin = p; /* <foo bar="baz"> */
1005                                       /*           ^     */
1006                 while (*p != quote_char)
1007                   {
1008                     if (!newline_seen && *p == '\n')
1009                       {
1010                         /* If a newline is seen within the quotes, it
1011                            is most likely that someone forgot to close
1012                            the quote.  In that case, we back out to
1013                            the value beginning, and terminate the tag
1014                            at either `>' or the delimiter, whichever
1015                            comes first.  Such a tag terminated at `>'
1016                            is discarded.  */
1017                         p = attr_value_begin;
1018                         newline_seen = true;
1019                         continue;
1020                       }
1021                     else if (newline_seen && (*p == '<' || *p == '>'))
1022                       break;
1023                     ADVANCE (p);
1024                   }
1025                 attr_value_end = p; /* <foo bar="baz"> */
1026                                     /*              ^  */
1027                 if (*p == quote_char)
1028                   ADVANCE (p);
1029                 else
1030                   goto look_for_tag;
1031                 attr_raw_value_end = p; /* <foo bar="baz"> */
1032                                         /*               ^ */
1033                 operation = AP_DECODE_ENTITIES;
1034                 if (flags & MHT_TRIM_VALUES)
1035                   operation |= AP_TRIM_BLANKS;
1036               }
1037             else
1038               {
1039                 attr_value_begin = p; /* <foo bar=baz> */
1040                                       /*          ^    */
1041                 /* According to SGML, a name token should consist only
1042                    of alphanumerics, . and -.  However, this is often
1043                    violated by, for instance, `%' in `width=75%'.
1044                    We'll be liberal and allow just about anything as
1045                    an attribute value.  */
1046                 while (!c_isspace (*p) && *p != '<' && *p != '>')
1047                   ADVANCE (p);
1048                 attr_value_end = p; /* <foo bar=baz qux=quix> */
1049                                     /*             ^          */
1050                 if (attr_value_begin == attr_value_end)
1051                   /* <foo bar=> */
1052                   /*          ^ */
1053                   goto backout_tag;
1054                 attr_raw_value_begin = attr_value_begin;
1055                 attr_raw_value_end = attr_value_end;
1056                 operation = AP_DECODE_ENTITIES;
1057               }
1058           }
1059         else
1060           {
1061             /* We skipped the whitespace and found something that is
1062                neither `=' nor the beginning of the next attribute's
1063                name.  Back out.  */
1064             goto backout_tag;   /* <foo bar [... */
1065                                 /*          ^    */
1066           }
1067
1068         /* If we're not interested in the tag, don't bother with any
1069            of the attributes.  */
1070         if (uninteresting_tag)
1071           continue;
1072
1073         /* If we aren't interested in the attribute, skip it.  We
1074            cannot do this test any sooner, because our text pointer
1075            needs to correctly advance over the attribute.  */
1076         if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
1077           continue;
1078
1079         GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
1080                     struct attr_pair);
1081
1082         pairs[nattrs].name_pool_index = pool.tail;
1083         convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
1084
1085         pairs[nattrs].value_pool_index = pool.tail;
1086         convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
1087         pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
1088         pairs[nattrs].value_raw_size = (attr_raw_value_end
1089                                         - attr_raw_value_begin);
1090         ++nattrs;
1091       }
1092
1093     if (!end_tag && tail && (tail->tagname_begin == tag_name_begin))
1094       {
1095         tail->contents_begin = p+1;
1096       }
1097
1098     if (uninteresting_tag)
1099       {
1100         ADVANCE (p);
1101         goto look_for_tag;
1102       }
1103
1104     /* By now, we have a valid tag with a name and zero or more
1105        attributes.  Fill in the data and call the mapper function.  */
1106     {
1107       int i;
1108       struct taginfo taginfo;
1109       struct tagstack_item *ts = NULL;
1110
1111       taginfo.name      = pool.contents;
1112       taginfo.end_tag_p = end_tag;
1113       taginfo.nattrs    = nattrs;
1114       /* We fill in the char pointers only now, when pool can no
1115          longer get realloc'ed.  If we did that above, we could get
1116          hosed by reallocation.  Obviously, after this point, the pool
1117          may no longer be grown.  */
1118       for (i = 0; i < nattrs; i++)
1119         {
1120           pairs[i].name = pool.contents + pairs[i].name_pool_index;
1121           pairs[i].value = pool.contents + pairs[i].value_pool_index;
1122         }
1123       taginfo.attrs = pairs;
1124       taginfo.start_position = tag_start_position;
1125       taginfo.end_position   = p + 1;
1126       taginfo.contents_begin = NULL;
1127       taginfo.contents_end = NULL;
1128
1129       if (end_tag)
1130         {
1131           ts = tagstack_find (tail, tag_name_begin, tag_name_end);
1132           if (ts)
1133             {
1134               if (ts->contents_begin)
1135                 {
1136                   taginfo.contents_begin = ts->contents_begin;
1137                   taginfo.contents_end   = tag_start_position;
1138                 }
1139               tagstack_pop (&head, &tail, ts);
1140             }
1141         }
1142
1143       mapfun (&taginfo, maparg);
1144       if (*p != '<')
1145         ADVANCE (p);
1146     }
1147     goto look_for_tag;
1148
1149   backout_tag:
1150 #ifdef STANDALONE
1151     ++tag_backout_count;
1152 #endif
1153     /* The tag wasn't really a tag.  Treat its contents as ordinary
1154        data characters. */
1155     p = tag_start_position + 1;
1156     goto look_for_tag;
1157   }
1158
1159  finish:
1160   POOL_FREE (&pool);
1161   if (attr_pair_resized)
1162     xfree (pairs);
1163   /* pop any tag stack that's left */
1164   tagstack_pop (&head, &tail, head);
1165 }
1166
1167 #undef ADVANCE
1168 #undef SKIP_WS
1169 #undef SKIP_NON_WS
1170 \f
1171 #ifdef STANDALONE
1172 static void
1173 test_mapper (struct taginfo *taginfo, void *arg)
1174 {
1175   int i;
1176
1177   printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
1178   for (i = 0; i < taginfo->nattrs; i++)
1179     printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
1180   putchar ('\n');
1181   ++*(int *)arg;
1182 }
1183
1184 int main ()
1185 {
1186   int size = 256;
1187   char *x = xmalloc (size);
1188   int length = 0;
1189   int read_count;
1190   int tag_counter = 0;
1191
1192   while ((read_count = fread (x + length, 1, size - length, stdin)))
1193     {
1194       length += read_count;
1195       size <<= 1;
1196       x = xrealloc (x, size);
1197     }
1198
1199   map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);
1200   printf ("TAGS: %d\n", tag_counter);
1201   printf ("Tag backouts:     %d\n", tag_backout_count);
1202   printf ("Comment backouts: %d\n", comment_backout_count);
1203   return 0;
1204 }
1205 #endif /* STANDALONE */