sjero.net Git - wget/blob - src/html-parse.c

   1 /* HTML parser for Wget.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 /* The only entry point to this module is map_html_tags(), which see.  */
  32
  33 /* TODO:
  34
  35    - Allow hooks for callers to process contents outside tags.  This
  36      is needed to implement handling <style> and <script>.  The
  37      taginfo structure already carries the information about where the
  38      tags are, but this is not enough, because one would also want to
  39      skip the comments.  (The funny thing is that for <style> and
  40      <script> you *don't* want to skip comments!)
  41
  42    - Create a test suite for regression testing. */
  43
  44 /* HISTORY:
  45
  46    This is the third HTML parser written for Wget.  The first one was
  47    written some time during the Geturl 1.0 beta cycle, and was very
  48    inefficient and buggy.  It also contained some very complex code to
  49    remember a list of parser states, because it was supposed to be
  50    reentrant.
  51
  52    The second HTML parser was written for Wget 1.4 (the first version
  53    by the name `Wget'), and was a complete rewrite.  Although the new
  54    parser behaved much better and made no claims of reentrancy, it
  55    still shared many of the fundamental flaws of the old version -- it
  56    only regarded HTML in terms tag-attribute pairs, where the
  57    attribute's value was a URL to be returned.  Any other property of
  58    HTML, such as <base href=...>, or strange way to specify a URL,
  59    such as <meta http-equiv=Refresh content="0; URL=..."> had to be
  60    crudely hacked in -- and the caller had to be aware of these hacks.
  61    Like its predecessor, this parser did not support HTML comments.
  62
  63    After Wget 1.5.1 was released, I set out to write a third HTML
  64    parser.  The objectives of the new parser were to: (1) provide a
  65    clean way to analyze HTML lexically, (2) separate interpretation of
  66    the markup from the parsing process, (3) be as correct as possible,
  67    e.g. correctly skipping comments and other SGML declarations, (4)
  68    understand the most common errors in markup and skip them or be
  69    relaxed towrds them, and (5) be reasonably efficient (no regexps,
  70    minimum copying and minimum or no heap allocation).
  71
  72    I believe this parser meets all of the above goals.  It is
  73    reasonably well structured, and could be relatively easily
  74    separated from Wget and used elsewhere.  While some of its
  75    intrinsic properties limit its value as a general-purpose HTML
  76    parser, I believe that, with minimum modifications, it could serve
  77    as a backend for one.
  78
  79    Due to time and other constraints, this parser was not integrated
  80    into Wget until the version 1.7. */
  81
  82 /* DESCRIPTION:
  83
  84    The single entry point of this parser is map_html_tags(), which
  85    works by calling a function you specify for each tag.  The function
  86    gets called with the pointer to a structure describing the tag and
  87    its attributes.  */
  88
  89 /* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
  90    still need Wget headers to compile.  */
  91
  92 #include "wget.h"
  93
  94 #ifdef STANDALONE
  95 # define I_REALLY_WANT_CTYPE_MACROS
  96 #endif
  97
  98 #include <stdio.h>
  99 #include <stdlib.h>
 100 #include <string.h>
 101 #include <assert.h>
 102
 103 #include "utils.h"
 104 #include "html-parse.h"
 105
 106 #ifdef STANDALONE
 107 # undef xmalloc
 108 # undef xrealloc
 109 # undef xfree
 110 # define xmalloc malloc
 111 # define xrealloc realloc
 112 # define xfree free
 113
 114 # undef c_isspace
 115 # undef c_isdigit
 116 # undef c_isxdigit
 117 # undef c_isalpha
 118 # undef c_isalnum
 119 # undef c_tolower
 120 # undef c_toupper
 121
 122 # define c_isspace(x) isspace (x)
 123 # define c_isdigit(x) isdigit (x)
 124 # define c_isxdigit(x) isxdigit (x)
 125 # define c_isalpha(x) isalpha (x)
 126 # define c_isalnum(x) isalnum (x)
 127 # define c_tolower(x) tolower (x)
 128 # define c_toupper(x) toupper (x)
 129
 130 struct hash_table {
 131   int dummy;
 132 };
 133 static void *
 134 hash_table_get (const struct hash_table *ht, void *ptr)
 135 {
 136   return ptr;
 137 }
 138 #else  /* not STANDALONE */
 139 # include "hash.h"
 140 #endif
 141
 142 /* Pool support.  A pool is a resizable chunk of memory.  It is first
 143    allocated on the stack, and moved to the heap if it needs to be
 144    larger than originally expected.  map_html_tags() uses it to store
 145    the zero-terminated names and values of tags and attributes.
 146
 147    Thus taginfo->name, and attr->name and attr->value for each
 148    attribute, do not point into separately allocated areas, but into
 149    different parts of the pool, separated only by terminating zeros.
 150    This ensures minimum amount of allocation and, for most tags, no
 151    allocation because the entire pool is kept on the stack.  */
 152
 153 struct pool {
 154   char *contents;               /* pointer to the contents. */
 155   int size;                     /* size of the pool. */
 156   int tail;                     /* next available position index. */
 157   bool resized;                 /* whether the pool has been resized
 158                                    using malloc. */
 159
 160   char *orig_contents;          /* original pool contents, usually
 161                                    stack-allocated.  used by POOL_FREE
 162                                    to restore the pool to the initial
 163                                    state. */
 164   int orig_size;
 165 };
 166
 167 /* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
 168
 169 #define POOL_INIT(p, initial_storage, initial_size) do {        \
 170   struct pool *P = (p);                                         \
 171   P->contents = (initial_storage);                              \
 172   P->size = (initial_size);                                     \
 173   P->tail = 0;                                                  \
 174   P->resized = false;                                           \
 175   P->orig_contents = P->contents;                               \
 176   P->orig_size = P->size;                                       \
 177 } while (0)
 178
 179 /* Grow the pool to accomodate at least SIZE new bytes.  If the pool
 180    already has room to accomodate SIZE bytes of data, this is a no-op.  */
 181
 182 #define POOL_GROW(p, increase)                                  \
 183   GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + (increase), \
 184               (p)->resized, char)
 185
 186 /* Append text in the range [beg, end) to POOL.  No zero-termination
 187    is done.  */
 188
 189 #define POOL_APPEND(p, beg, end) do {                   \
 190   const char *PA_beg = (beg);                           \
 191   int PA_size = (end) - PA_beg;                         \
 192   POOL_GROW (p, PA_size);                               \
 193   memcpy ((p)->contents + (p)->tail, PA_beg, PA_size);  \
 194   (p)->tail += PA_size;                                 \
 195 } while (0)
 196
 197 /* Append one character to the pool.  Can be used to zero-terminate
 198    pool strings.  */
 199
 200 #define POOL_APPEND_CHR(p, ch) do {             \
 201   char PAC_char = (ch);                         \
 202   POOL_GROW (p, 1);                             \
 203   (p)->contents[(p)->tail++] = PAC_char;        \
 204 } while (0)
 205
 206 /* Forget old pool contents.  The allocated memory is not freed. */
 207 #define POOL_REWIND(p) (p)->tail = 0
 208
 209 /* Free heap-allocated memory for contents of POOL.  This calls
 210    xfree() if the memory was allocated through malloc.  It also
 211    restores `contents' and `size' to their original, pre-malloc
 212    values.  That way after POOL_FREE, the pool is fully usable, just
 213    as if it were freshly initialized with POOL_INIT.  */
 214
 215 #define POOL_FREE(p) do {                       \
 216   struct pool *P = p;                           \
 217   if (P->resized)                               \
 218     xfree (P->contents);                        \
 219   P->contents = P->orig_contents;               \
 220   P->size = P->orig_size;                       \
 221   P->tail = 0;                                  \
 222   P->resized = false;                           \
 223 } while (0)
 224
 225 /* Used for small stack-allocated memory chunks that might grow.  Like
 226    DO_REALLOC, this macro grows BASEVAR as necessary to take
 227    NEEDED_SIZE items of TYPE.
 228
 229    The difference is that on the first resize, it will use
 230    malloc+memcpy rather than realloc.  That way you can stack-allocate
 231    the initial chunk, and only resort to heap allocation if you
 232    stumble upon large data.
 233
 234    After the first resize, subsequent ones are performed with realloc,
 235    just like DO_REALLOC.  */
 236
 237 #define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do {           \
 238   long ga_needed_size = (needed_size);                                          \
 239   long ga_newsize = (sizevar);                                                  \
 240   while (ga_newsize < ga_needed_size)                                           \
 241     ga_newsize <<= 1;                                                           \
 242   if (ga_newsize != (sizevar))                                                  \
 243     {                                                                           \
 244       if (resized)                                                              \
 245         basevar = xrealloc (basevar, ga_newsize * sizeof (type));               \
 246       else                                                                      \
 247         {                                                                       \
 248           void *ga_new = xmalloc (ga_newsize * sizeof (type));                  \
 249           memcpy (ga_new, basevar, (sizevar) * sizeof (type));                  \
 250           (basevar) = ga_new;                                                   \
 251           resized = true;                                                       \
 252         }                                                                       \
 253       (sizevar) = ga_newsize;                                                   \
 254     }                                                                           \
 255 } while (0)
 256 \f
 257 /* Test whether n+1-sized entity name fits in P.  We don't support
 258    IE-style non-terminated entities, e.g. "&ltfoo" -> "<foo".
 259    However, "&lt;foo" will work, as will "&lt!foo", "&lt", etc.  In
 260    other words an entity needs to be terminated by either a
 261    non-alphanumeric or the end of string.  */
 262 #define FITS(p, n) (p + n == end || (p + n < end && !c_isalnum (p[n])))
 263
 264 /* Macros that test entity names by returning true if P is followed by
 265    the specified characters.  */
 266 #define ENT1(p, c0) (FITS (p, 1) && p[0] == c0)
 267 #define ENT2(p, c0, c1) (FITS (p, 2) && p[0] == c0 && p[1] == c1)
 268 #define ENT3(p, c0, c1, c2) (FITS (p, 3) && p[0]==c0 && p[1]==c1 && p[2]==c2)
 269
 270 /* Increment P by INC chars.  If P lands at a semicolon, increment it
 271    past the semicolon.  This ensures that e.g. "&lt;foo" is converted
 272    to "<foo", but "&lt,foo" to "<,foo".  */
 273 #define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)
 274
 275 /* Decode the HTML character entity at *PTR, considering END to be end
 276    of buffer.  It is assumed that the "&" character that marks the
 277    beginning of the entity has been seen at *PTR-1.  If a recognized
 278    ASCII entity is seen, it is returned, and *PTR is moved to the end
 279    of the entity.  Otherwise, -1 is returned and *PTR left unmodified.
 280
 281    The recognized entities are: &lt, &gt, &amp, &apos, and &quot.  */
 282
 283 static int
 284 decode_entity (const char **ptr, const char *end)
 285 {
 286   const char *p = *ptr;
 287   int value = -1;
 288
 289   if (++p == end)
 290     return -1;
 291
 292   switch (*p++)
 293     {
 294     case '#':
 295       /* Process numeric entities "&#DDD;" and "&#xHH;".  */
 296       {
 297         int digits = 0;
 298         value = 0;
 299         if (*p == 'x')
 300           for (++p; value < 256 && p < end && c_isxdigit (*p); p++, digits++)
 301             value = (value << 4) + XDIGIT_TO_NUM (*p);
 302         else
 303           for (; value < 256 && p < end && c_isdigit (*p); p++, digits++)
 304             value = (value * 10) + (*p - '0');
 305         if (!digits)
 306           return -1;
 307         /* Don't interpret 128+ codes and NUL because we cannot
 308            portably reinserted them into HTML.  */
 309         if (!value || (value & ~0x7f))
 310           return -1;
 311         *ptr = SKIP_SEMI (p, 0);
 312         return value;
 313       }
 314     /* Process named ASCII entities.  */
 315     case 'g':
 316       if (ENT1 (p, 't'))
 317         value = '>', *ptr = SKIP_SEMI (p, 1);
 318       break;
 319     case 'l':
 320       if (ENT1 (p, 't'))
 321         value = '<', *ptr = SKIP_SEMI (p, 1);
 322       break;
 323     case 'a':
 324       if (ENT2 (p, 'm', 'p'))
 325         value = '&', *ptr = SKIP_SEMI (p, 2);
 326       else if (ENT3 (p, 'p', 'o', 's'))
 327         /* handle &apos for the sake of the XML/XHTML crowd. */
 328         value = '\'', *ptr = SKIP_SEMI (p, 3);
 329       break;
 330     case 'q':
 331       if (ENT3 (p, 'u', 'o', 't'))
 332         value = '\"', *ptr = SKIP_SEMI (p, 3);
 333       break;
 334     }
 335   return value;
 336 }
 337 #undef ENT1
 338 #undef ENT2
 339 #undef ENT3
 340 #undef FITS
 341 #undef SKIP_SEMI
 342
 343 enum {
 344   AP_DOWNCASE           = 1,
 345   AP_DECODE_ENTITIES    = 2,
 346   AP_TRIM_BLANKS        = 4
 347 };
 348
 349 /* Copy the text in the range [BEG, END) to POOL, optionally
 350    performing operations specified by FLAGS.  FLAGS may be any
 351    combination of AP_DOWNCASE, AP_DECODE_ENTITIES and AP_TRIM_BLANKS
 352    with the following meaning:
 353
 354    * AP_DOWNCASE -- downcase all the letters;
 355
 356    * AP_DECODE_ENTITIES -- decode the named and numeric entities in
 357      the ASCII range when copying the string.
 358
 359    * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
 360      of text, as well as embedded newlines.  */
 361
 362 static void
 363 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
 364 {
 365   int old_tail = pool->tail;
 366
 367   /* Skip blanks if required.  We must do this before entities are
 368      processed, so that blanks can still be inserted as, for instance,
 369      `&#32;'.  */
 370   if (flags & AP_TRIM_BLANKS)
 371     {
 372       while (beg < end && c_isspace (*beg))
 373         ++beg;
 374       while (end > beg && c_isspace (end[-1]))
 375         --end;
 376     }
 377
 378   if (flags & AP_DECODE_ENTITIES)
 379     {
 380       /* Grow the pool, then copy the text to the pool character by
 381          character, processing the encountered entities as we go
 382          along.
 383
 384          It's safe (and necessary) to grow the pool in advance because
 385          processing the entities can only *shorten* the string, it can
 386          never lengthen it.  */
 387       const char *from = beg;
 388       char *to;
 389       bool squash_newlines = !!(flags & AP_TRIM_BLANKS);
 390
 391       POOL_GROW (pool, end - beg);
 392       to = pool->contents + pool->tail;
 393
 394       while (from < end)
 395         {
 396           if (*from == '&')
 397             {
 398               int entity = decode_entity (&from, end);
 399               if (entity != -1)
 400                 *to++ = entity;
 401               else
 402                 *to++ = *from++;
 403             }
 404           else if ((*from == '\n' || *from == '\r') && squash_newlines)
 405             ++from;
 406           else
 407             *to++ = *from++;
 408         }
 409       /* Verify that we haven't exceeded the original size.  (It
 410          shouldn't happen, hence the assert.)  */
 411       assert (to - (pool->contents + pool->tail) <= end - beg);
 412
 413       /* Make POOL's tail point to the position following the string
 414          we've written.  */
 415       pool->tail = to - pool->contents;
 416       POOL_APPEND_CHR (pool, '\0');
 417     }
 418   else
 419     {
 420       /* Just copy the text to the pool.  */
 421       POOL_APPEND (pool, beg, end);
 422       POOL_APPEND_CHR (pool, '\0');
 423     }
 424
 425   if (flags & AP_DOWNCASE)
 426     {
 427       char *p = pool->contents + old_tail;
 428       for (; *p; p++)
 429         *p = c_tolower (*p);
 430     }
 431 }
 432 \f
 433 /* Originally we used to adhere to rfc 1866 here, and allowed only
 434    letters, digits, periods, and hyphens as names (of tags or
 435    attributes).  However, this broke too many pages which used
 436    proprietary or strange attributes, e.g. <img src="a.gif"
 437    v:shapes="whatever">.
 438
 439    So now we allow any character except:
 440      * whitespace
 441      * 8-bit and control chars
 442      * characters that clearly cannot be part of name:
 443        '=', '>', '/'.
 444
 445    This only affects attribute and tag names; attribute values allow
 446    an even greater variety of characters.  */
 447
 448 #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
 449                         && (x) != '=' && (x) != '>' && (x) != '/')
 450
 451 #ifdef STANDALONE
 452 static int comment_backout_count;
 453 #endif
 454
 455 /* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
 456    strict comments mode, this is used for skipping over comments as
 457    well.
 458
 459    To recap: any SGML declaration may have comments associated with
 460    it, e.g.
 461        <!MY-DECL -- isn't this fun? -- foo bar>
 462
 463    An HTML comment is merely an empty declaration (<!>) with a comment
 464    attached, like this:
 465        <!-- some stuff here -->
 466
 467    Several comments may be embedded in one comment declaration:
 468        <!-- have -- -- fun -->
 469
 470    Whitespace is allowed between and after the comments, but not
 471    before the first comment.  Additionally, this function attempts to
 472    handle double quotes in SGML declarations correctly.  */
 473
 474 static const char *
 475 advance_declaration (const char *beg, const char *end)
 476 {
 477   const char *p = beg;
 478   char quote_char = '\0';       /* shut up, gcc! */
 479   char ch;
 480
 481   enum {
 482     AC_S_DONE,
 483     AC_S_BACKOUT,
 484     AC_S_BANG,
 485     AC_S_DEFAULT,
 486     AC_S_DCLNAME,
 487     AC_S_DASH1,
 488     AC_S_DASH2,
 489     AC_S_COMMENT,
 490     AC_S_DASH3,
 491     AC_S_DASH4,
 492     AC_S_QUOTE1,
 493     AC_S_IN_QUOTE,
 494     AC_S_QUOTE2
 495   } state = AC_S_BANG;
 496
 497   if (beg == end)
 498     return beg;
 499   ch = *p++;
 500
 501   /* It looked like a good idea to write this as a state machine, but
 502      now I wonder...  */
 503
 504   while (state != AC_S_DONE && state != AC_S_BACKOUT)
 505     {
 506       if (p == end)
 507         state = AC_S_BACKOUT;
 508       switch (state)
 509         {
 510         case AC_S_DONE:
 511         case AC_S_BACKOUT:
 512           break;
 513         case AC_S_BANG:
 514           if (ch == '!')
 515             {
 516               ch = *p++;
 517               state = AC_S_DEFAULT;
 518             }
 519           else
 520             state = AC_S_BACKOUT;
 521           break;
 522         case AC_S_DEFAULT:
 523           switch (ch)
 524             {
 525             case '-':
 526               state = AC_S_DASH1;
 527               break;
 528             case ' ':
 529             case '\t':
 530             case '\r':
 531             case '\n':
 532               ch = *p++;
 533               break;
 534             case '>':
 535               state = AC_S_DONE;
 536               break;
 537             case '\'':
 538             case '\"':
 539               state = AC_S_QUOTE1;
 540               break;
 541             default:
 542               if (NAME_CHAR_P (ch))
 543                 state = AC_S_DCLNAME;
 544               else
 545                 state = AC_S_BACKOUT;
 546               break;
 547             }
 548           break;
 549         case AC_S_DCLNAME:
 550           if (ch == '-')
 551             state = AC_S_DASH1;
 552           else if (NAME_CHAR_P (ch))
 553             ch = *p++;
 554           else
 555             state = AC_S_DEFAULT;
 556           break;
 557         case AC_S_QUOTE1:
 558           /* We must use 0x22 because broken assert macros choke on
 559              '"' and '\"'.  */
 560           assert (ch == '\'' || ch == 0x22);
 561           quote_char = ch;      /* cheating -- I really don't feel like
 562                                    introducing more different states for
 563                                    different quote characters. */
 564           ch = *p++;
 565           state = AC_S_IN_QUOTE;
 566           break;
 567         case AC_S_IN_QUOTE:
 568           if (ch == quote_char)
 569             state = AC_S_QUOTE2;
 570           else
 571             ch = *p++;
 572           break;
 573         case AC_S_QUOTE2:
 574           assert (ch == quote_char);
 575           ch = *p++;
 576           state = AC_S_DEFAULT;
 577           break;
 578         case AC_S_DASH1:
 579           assert (ch == '-');
 580           ch = *p++;
 581           state = AC_S_DASH2;
 582           break;
 583         case AC_S_DASH2:
 584           switch (ch)
 585             {
 586             case '-':
 587               ch = *p++;
 588               state = AC_S_COMMENT;
 589               break;
 590             default:
 591               state = AC_S_BACKOUT;
 592             }
 593           break;
 594         case AC_S_COMMENT:
 595           switch (ch)
 596             {
 597             case '-':
 598               state = AC_S_DASH3;
 599               break;
 600             default:
 601               ch = *p++;
 602               break;
 603             }
 604           break;
 605         case AC_S_DASH3:
 606           assert (ch == '-');
 607           ch = *p++;
 608           state = AC_S_DASH4;
 609           break;
 610         case AC_S_DASH4:
 611           switch (ch)
 612             {
 613             case '-':
 614               ch = *p++;
 615               state = AC_S_DEFAULT;
 616               break;
 617             default:
 618               state = AC_S_COMMENT;
 619               break;
 620             }
 621           break;
 622         }
 623     }
 624
 625   if (state == AC_S_BACKOUT)
 626     {
 627 #ifdef STANDALONE
 628       ++comment_backout_count;
 629 #endif
 630       return beg + 1;
 631     }
 632   return p;
 633 }
 634
 635 /* Find the first occurrence of the substring "-->" in [BEG, END) and
 636    return the pointer to the character after the substring.  If the
 637    substring is not found, return NULL.  */
 638
 639 static const char *
 640 find_comment_end (const char *beg, const char *end)
 641 {
 642   /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
 643      if it's not '>' or '-', advance by three characters.  Otherwise,
 644      look at the preceding characters and try to find a match.  */
 645
 646   const char *p = beg - 1;
 647
 648   while ((p += 3) < end)
 649     switch (p[0])
 650       {
 651       case '>':
 652         if (p[-1] == '-' && p[-2] == '-')
 653           return p + 1;
 654         break;
 655       case '-':
 656       at_dash:
 657         if (p[-1] == '-')
 658           {
 659           at_dash_dash:
 660             if (++p == end) return NULL;
 661             switch (p[0])
 662               {
 663               case '>': return p + 1;
 664               case '-': goto at_dash_dash;
 665               }
 666           }
 667         else
 668           {
 669             if ((p += 2) >= end) return NULL;
 670             switch (p[0])
 671               {
 672               case '>':
 673                 if (p[-1] == '-')
 674                   return p + 1;
 675                 break;
 676               case '-':
 677                 goto at_dash;
 678               }
 679           }
 680       }
 681   return NULL;
 682 }
 683 \f
 684 /* Return true if the string containing of characters inside [b, e) is
 685    present in hash table HT.  */
 686
 687 static bool
 688 name_allowed (const struct hash_table *ht, const char *b, const char *e)
 689 {
 690   char *copy;
 691   if (!ht)
 692     return true;
 693   BOUNDED_TO_ALLOCA (b, e, copy);
 694   return hash_table_get (ht, copy) != NULL;
 695 }
 696
 697 /* Advance P (a char pointer), with the explicit intent of being able
 698    to read the next character.  If this is not possible, go to finish.  */
 699
 700 #define ADVANCE(p) do {                         \
 701   ++p;                                          \
 702   if (p >= end)                                 \
 703     goto finish;                                \
 704 } while (0)
 705
 706 /* Skip whitespace, if any. */
 707
 708 #define SKIP_WS(p) do {                         \
 709   while (c_isspace (*p)) {                        \
 710     ADVANCE (p);                                \
 711   }                                             \
 712 } while (0)
 713
 714 /* Skip non-whitespace, if any. */
 715
 716 #define SKIP_NON_WS(p) do {                     \
 717   while (!c_isspace (*p)) {                       \
 718     ADVANCE (p);                                \
 719   }                                             \
 720 } while (0)
 721
 722 #ifdef STANDALONE
 723 static int tag_backout_count;
 724 #endif
 725
 726 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
 727    MAPFUN will be called with two arguments: pointer to an initialized
 728    struct taginfo, and MAPARG.
 729
 730    ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of
 731    which are the tags and attribute names that this function should
 732    use.  If ALLOWED_TAGS is NULL, all tags are processed; if
 733    ALLOWED_ATTRIBUTES is NULL, all attributes are returned.
 734
 735    (Obviously, the caller can filter out unwanted tags and attributes
 736    just as well, but this is just an optimization designed to avoid
 737    unnecessary copying of tags/attributes which the caller doesn't
 738    care about.)  */
 739
 740 void
 741 map_html_tags (const char *text, int size,
 742                void (*mapfun) (struct taginfo *, void *), void *maparg,
 743                int flags,
 744                const struct hash_table *allowed_tags,
 745                const struct hash_table *allowed_attributes)
 746 {
 747   /* storage for strings passed to MAPFUN callback; if 256 bytes is
 748      too little, POOL_APPEND allocates more with malloc. */
 749   char pool_initial_storage[256];
 750   struct pool pool;
 751
 752   const char *p = text;
 753   const char *end = text + size;
 754
 755   struct attr_pair attr_pair_initial_storage[8];
 756   int attr_pair_size = countof (attr_pair_initial_storage);
 757   bool attr_pair_resized = false;
 758   struct attr_pair *pairs = attr_pair_initial_storage;
 759
 760   if (!size)
 761     return;
 762
 763   POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));
 764
 765   {
 766     int nattrs, end_tag;
 767     const char *tag_name_begin, *tag_name_end;
 768     const char *tag_start_position;
 769     bool uninteresting_tag;
 770
 771   look_for_tag:
 772     POOL_REWIND (&pool);
 773
 774     nattrs = 0;
 775     end_tag = 0;
 776
 777     /* Find beginning of tag.  We use memchr() instead of the usual
 778        looping with ADVANCE() for speed. */
 779     p = memchr (p, '<', end - p);
 780     if (!p)
 781       goto finish;
 782
 783     tag_start_position = p;
 784     ADVANCE (p);
 785
 786     /* Establish the type of the tag (start-tag, end-tag or
 787        declaration).  */
 788     if (*p == '!')
 789       {
 790         if (!(flags & MHT_STRICT_COMMENTS)
 791             && p < end + 3 && p[1] == '-' && p[2] == '-')
 792           {
 793             /* If strict comments are not enforced and if we know
 794                we're looking at a comment, simply look for the
 795                terminating "-->".  Non-strict is the default because
 796                it works in other browsers and most HTML writers can't
 797                be bothered with getting the comments right.  */
 798             const char *comment_end = find_comment_end (p + 3, end);
 799             if (comment_end)
 800               p = comment_end;
 801           }
 802         else
 803           {
 804             /* Either in strict comment mode or looking at a non-empty
 805                declaration.  Real declarations are much less likely to
 806                be misused the way comments are, so advance over them
 807                properly regardless of strictness.  */
 808             p = advance_declaration (p, end);
 809           }
 810         if (p == end)
 811           goto finish;
 812         goto look_for_tag;
 813       }
 814     else if (*p == '/')
 815       {
 816         end_tag = 1;
 817         ADVANCE (p);
 818       }
 819     tag_name_begin = p;
 820     while (NAME_CHAR_P (*p))
 821       ADVANCE (p);
 822     if (p == tag_name_begin)
 823       goto look_for_tag;
 824     tag_name_end = p;
 825     SKIP_WS (p);
 826     if (end_tag && *p != '>')
 827       goto backout_tag;
 828
 829     if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
 830       /* We can't just say "goto look_for_tag" here because we need
 831          the loop below to properly advance over the tag's attributes.  */
 832       uninteresting_tag = true;
 833     else
 834       {
 835         uninteresting_tag = false;
 836         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
 837       }
 838
 839     /* Find the attributes. */
 840     while (1)
 841       {
 842         const char *attr_name_begin, *attr_name_end;
 843         const char *attr_value_begin, *attr_value_end;
 844         const char *attr_raw_value_begin, *attr_raw_value_end;
 845         int operation = AP_DOWNCASE; /* stupid compiler. */
 846
 847         SKIP_WS (p);
 848
 849         if (*p == '/')
 850           {
 851             /* A slash at this point means the tag is about to be
 852                closed.  This is legal in XML and has been popularized
 853                in HTML via XHTML.  */
 854             /* <foo a=b c=d /> */
 855             /*              ^  */
 856             ADVANCE (p);
 857             SKIP_WS (p);
 858             if (*p != '>')
 859               goto backout_tag;
 860           }
 861
 862         /* Check for end of tag definition. */
 863         if (*p == '>')
 864           break;
 865
 866         /* Establish bounds of attribute name. */
 867         attr_name_begin = p;    /* <foo bar ...> */
 868                                 /*      ^        */
 869         while (NAME_CHAR_P (*p))
 870           ADVANCE (p);
 871         attr_name_end = p;      /* <foo bar ...> */
 872                                 /*         ^     */
 873         if (attr_name_begin == attr_name_end)
 874           goto backout_tag;
 875
 876         /* Establish bounds of attribute value. */
 877         SKIP_WS (p);
 878         if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
 879           {
 880             /* Minimized attribute syntax allows `=' to be omitted.
 881                For example, <UL COMPACT> is a valid shorthand for <UL
 882                COMPACT="compact">.  Even if such attributes are not
 883                useful to Wget, we need to support them, so that the
 884                tags containing them can be parsed correctly. */
 885             attr_raw_value_begin = attr_value_begin = attr_name_begin;
 886             attr_raw_value_end = attr_value_end = attr_name_end;
 887           }
 888         else if (*p == '=')
 889           {
 890             ADVANCE (p);
 891             SKIP_WS (p);
 892             if (*p == '\"' || *p == '\'')
 893               {
 894                 bool newline_seen = false;
 895                 char quote_char = *p;
 896                 attr_raw_value_begin = p;
 897                 ADVANCE (p);
 898                 attr_value_begin = p; /* <foo bar="baz"> */
 899                                       /*           ^     */
 900                 while (*p != quote_char)
 901                   {
 902                     if (!newline_seen && *p == '\n')
 903                       {
 904                         /* If a newline is seen within the quotes, it
 905                            is most likely that someone forgot to close
 906                            the quote.  In that case, we back out to
 907                            the value beginning, and terminate the tag
 908                            at either `>' or the delimiter, whichever
 909                            comes first.  Such a tag terminated at `>'
 910                            is discarded.  */
 911                         p = attr_value_begin;
 912                         newline_seen = true;
 913                         continue;
 914                       }
 915                     else if (newline_seen && *p == '>')
 916                       break;
 917                     ADVANCE (p);
 918                   }
 919                 attr_value_end = p; /* <foo bar="baz"> */
 920                                     /*              ^  */
 921                 if (*p == quote_char)
 922                   ADVANCE (p);
 923                 else
 924                   goto look_for_tag;
 925                 attr_raw_value_end = p; /* <foo bar="baz"> */
 926                                         /*               ^ */
 927                 operation = AP_DECODE_ENTITIES;
 928                 if (flags & MHT_TRIM_VALUES)
 929                   operation |= AP_TRIM_BLANKS;
 930               }
 931             else
 932               {
 933                 attr_value_begin = p; /* <foo bar=baz> */
 934                                       /*          ^    */
 935                 /* According to SGML, a name token should consist only
 936                    of alphanumerics, . and -.  However, this is often
 937                    violated by, for instance, `%' in `width=75%'.
 938                    We'll be liberal and allow just about anything as
 939                    an attribute value.  */
 940                 while (!c_isspace (*p) && *p != '>')
 941                   ADVANCE (p);
 942                 attr_value_end = p; /* <foo bar=baz qux=quix> */
 943                                     /*             ^          */
 944                 if (attr_value_begin == attr_value_end)
 945                   /* <foo bar=> */
 946                   /*          ^ */
 947                   goto backout_tag;
 948                 attr_raw_value_begin = attr_value_begin;
 949                 attr_raw_value_end = attr_value_end;
 950                 operation = AP_DECODE_ENTITIES;
 951               }
 952           }
 953         else
 954           {
 955             /* We skipped the whitespace and found something that is
 956                neither `=' nor the beginning of the next attribute's
 957                name.  Back out.  */
 958             goto backout_tag;   /* <foo bar [... */
 959                                 /*          ^    */
 960           }
 961
 962         /* If we're not interested in the tag, don't bother with any
 963            of the attributes.  */
 964         if (uninteresting_tag)
 965           continue;
 966
 967         /* If we aren't interested in the attribute, skip it.  We
 968            cannot do this test any sooner, because our text pointer
 969            needs to correctly advance over the attribute.  */
 970         if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
 971           continue;
 972
 973         GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
 974                     struct attr_pair);
 975
 976         pairs[nattrs].name_pool_index = pool.tail;
 977         convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
 978
 979         pairs[nattrs].value_pool_index = pool.tail;
 980         convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
 981         pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
 982         pairs[nattrs].value_raw_size = (attr_raw_value_end
 983                                         - attr_raw_value_begin);
 984         ++nattrs;
 985       }
 986
 987     if (uninteresting_tag)
 988       {
 989         ADVANCE (p);
 990         goto look_for_tag;
 991       }
 992
 993     /* By now, we have a valid tag with a name and zero or more
 994        attributes.  Fill in the data and call the mapper function.  */
 995     {
 996       int i;
 997       struct taginfo taginfo;
 998
 999       taginfo.name      = pool.contents;
1000       taginfo.end_tag_p = end_tag;
1001       taginfo.nattrs    = nattrs;
1002       /* We fill in the char pointers only now, when pool can no
1003          longer get realloc'ed.  If we did that above, we could get
1004          hosed by reallocation.  Obviously, after this point, the pool
1005          may no longer be grown.  */
1006       for (i = 0; i < nattrs; i++)
1007         {
1008           pairs[i].name = pool.contents + pairs[i].name_pool_index;
1009           pairs[i].value = pool.contents + pairs[i].value_pool_index;
1010         }
1011       taginfo.attrs = pairs;
1012       taginfo.start_position = tag_start_position;
1013       taginfo.end_position   = p + 1;
1014       mapfun (&taginfo, maparg);
1015       ADVANCE (p);
1016     }
1017     goto look_for_tag;
1018
1019   backout_tag:
1020 #ifdef STANDALONE
1021     ++tag_backout_count;
1022 #endif
1023     /* The tag wasn't really a tag.  Treat its contents as ordinary
1024        data characters. */
1025     p = tag_start_position + 1;
1026     goto look_for_tag;
1027   }
1028
1029  finish:
1030   POOL_FREE (&pool);
1031   if (attr_pair_resized)
1032     xfree (pairs);
1033 }
1034
1035 #undef ADVANCE
1036 #undef SKIP_WS
1037 #undef SKIP_NON_WS
1038 \f
1039 #ifdef STANDALONE
1040 static void
1041 test_mapper (struct taginfo *taginfo, void *arg)
1042 {
1043   int i;
1044
1045   printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
1046   for (i = 0; i < taginfo->nattrs; i++)
1047     printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
1048   putchar ('\n');
1049   ++*(int *)arg;
1050 }
1051
1052 int main ()
1053 {
1054   int size = 256;
1055   char *x = xmalloc (size);
1056   int length = 0;
1057   int read_count;
1058   int tag_counter = 0;
1059
1060   while ((read_count = fread (x + length, 1, size - length, stdin)))
1061     {
1062       length += read_count;
1063       size <<= 1;
1064       x = xrealloc (x, size);
1065     }
1066
1067   map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);
1068   printf ("TAGS: %d\n", tag_counter);
1069   printf ("Tag backouts:     %d\n", tag_backout_count);
1070   printf ("Comment backouts: %d\n", comment_backout_count);
1071   return 0;
1072 }
1073 #endif /* STANDALONE */