sjero.net Git - wget/blob - src/html-parse.c

   1 /* HTML parser for Wget.
   2    Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 /* The only entry point to this module is map_html_tags(), which see.  */
  31
  32 /* TODO:
  33
  34    - Allow hooks for callers to process contents outside tags.  This
  35      is needed to implement handling <style> and <script>.  The
  36      taginfo structure already carries the information about where the
  37      tags are, but this is not enough, because one would also want to
  38      skip the comments.  (The funny thing is that for <style> and
  39      <script> you *don't* want to skip comments!)
  40
  41    - Create a test suite for regression testing. */
  42
  43 /* HISTORY:
  44
  45    This is the third HTML parser written for Wget.  The first one was
  46    written some time during the Geturl 1.0 beta cycle, and was very
  47    inefficient and buggy.  It also contained some very complex code to
  48    remember a list of parser states, because it was supposed to be
  49    reentrant.
  50
  51    The second HTML parser was written for Wget 1.4 (the first version
  52    by the name `Wget'), and was a complete rewrite.  Although the new
  53    parser behaved much better and made no claims of reentrancy, it
  54    still shared many of the fundamental flaws of the old version -- it
  55    only regarded HTML in terms tag-attribute pairs, where the
  56    attribute's value was a URL to be returned.  Any other property of
  57    HTML, such as <base href=...>, or strange way to specify a URL,
  58    such as <meta http-equiv=Refresh content="0; URL=..."> had to be
  59    crudely hacked in -- and the caller had to be aware of these hacks.
  60    Like its predecessor, this parser did not support HTML comments.
  61
  62    After Wget 1.5.1 was released, I set out to write a third HTML
  63    parser.  The objectives of the new parser were to: (1) provide a
  64    clean way to analyze HTML lexically, (2) separate interpretation of
  65    the markup from the parsing process, (3) be as correct as possible,
  66    e.g. correctly skipping comments and other SGML declarations, (4)
  67    understand the most common errors in markup and skip them or be
  68    relaxed towrds them, and (5) be reasonably efficient (no regexps,
  69    minimum copying and minimum or no heap allocation).
  70
  71    I believe this parser meets all of the above goals.  It is
  72    reasonably well structured, and could be relatively easily
  73    separated from Wget and used elsewhere.  While some of its
  74    intrinsic properties limit its value as a general-purpose HTML
  75    parser, I believe that, with minimum modifications, it could serve
  76    as a backend for one.
  77
  78    Due to time and other constraints, this parser was not integrated
  79    into Wget until the version 1.7. */
  80
  81 /* DESCRIPTION:
  82
  83    The single entry point of this parser is map_html_tags(), which
  84    works by calling a function you specify for each tag.  The function
  85    gets called with the pointer to a structure describing the tag and
  86    its attributes.  */
  87
  88 /* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
  89    still need Wget headers to compile.  */
  90
  91 #include <config.h>
  92
  93 #ifdef STANDALONE
  94 # define I_REALLY_WANT_CTYPE_MACROS
  95 #endif
  96
  97 #include <stdio.h>
  98 #include <stdlib.h>
  99 #ifdef HAVE_STRING_H
 100 # include <string.h>
 101 #else
 102 # include <strings.h>
 103 #endif
 104 #include <assert.h>
 105
 106 #include "wget.h"
 107 #include "html-parse.h"
 108
 109 #ifdef STANDALONE
 110 # undef xmalloc
 111 # undef xrealloc
 112 # undef xfree
 113 # define xmalloc malloc
 114 # define xrealloc realloc
 115 # define xfree free
 116
 117 # undef ISSPACE
 118 # undef ISDIGIT
 119 # undef ISXDIGIT
 120 # undef ISALPHA
 121 # undef ISALNUM
 122 # undef TOLOWER
 123 # undef TOUPPER
 124
 125 # define ISSPACE(x) isspace (x)
 126 # define ISDIGIT(x) isdigit (x)
 127 # define ISXDIGIT(x) isxdigit (x)
 128 # define ISALPHA(x) isalpha (x)
 129 # define ISALNUM(x) isalnum (x)
 130 # define TOLOWER(x) tolower (x)
 131 # define TOUPPER(x) toupper (x)
 132
 133 struct hash_table {
 134   int dummy;
 135 };
 136 static void *
 137 hash_table_get (const struct hash_table *ht, void *ptr)
 138 {
 139   return ptr;
 140 }
 141 #else  /* not STANDALONE */
 142 # include "hash.h"
 143 #endif
 144
 145 /* Pool support.  A pool is a resizable chunk of memory.  It is first
 146    allocated on the stack, and moved to the heap if it needs to be
 147    larger than originally expected.  map_html_tags() uses it to store
 148    the zero-terminated names and values of tags and attributes.
 149
 150    Thus taginfo->name, and attr->name and attr->value for each
 151    attribute, do not point into separately allocated areas, but into
 152    different parts of the pool, separated only by terminating zeros.
 153    This ensures minimum amount of allocation and, for most tags, no
 154    allocation because the entire pool is kept on the stack.  */
 155
 156 struct pool {
 157   char *contents;               /* pointer to the contents. */
 158   int size;                     /* size of the pool. */
 159   int tail;                     /* next available position index. */
 160   int resized;                  /* whether the pool has been resized
 161                                    using malloc. */
 162
 163   char *orig_contents;          /* original pool contents, usually
 164                                    stack-allocated.  used by POOL_FREE
 165                                    to restore the pool to the initial
 166                                    state. */
 167   int orig_size;
 168 };
 169
 170 /* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
 171
 172 #define POOL_INIT(p, initial_storage, initial_size) do {        \
 173   struct pool *P = (p);                                         \
 174   P->contents = (initial_storage);                              \
 175   P->size = (initial_size);                                     \
 176   P->tail = 0;                                                  \
 177   P->resized = 0;                                               \
 178   P->orig_contents = P->contents;                               \
 179   P->orig_size = P->size;                                       \
 180 } while (0)
 181
 182 /* Grow the pool to accomodate at least SIZE new bytes.  If the pool
 183    already has room to accomodate SIZE bytes of data, this is a no-op.  */
 184
 185 #define POOL_GROW(p, increase)                                  \
 186   GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + (increase), \
 187               (p)->resized, char)
 188
 189 /* Append text in the range [beg, end) to POOL.  No zero-termination
 190    is done.  */
 191
 192 #define POOL_APPEND(p, beg, end) do {                   \
 193   const char *PA_beg = (beg);                           \
 194   int PA_size = (end) - PA_beg;                         \
 195   POOL_GROW (p, PA_size);                               \
 196   memcpy ((p)->contents + (p)->tail, PA_beg, PA_size);  \
 197   (p)->tail += PA_size;                                 \
 198 } while (0)
 199
 200 /* Append one character to the pool.  Can be used to zero-terminate
 201    pool strings.  */
 202
 203 #define POOL_APPEND_CHR(p, ch) do {             \
 204   char PAC_char = (ch);                         \
 205   POOL_GROW (p, 1);                             \
 206   (p)->contents[(p)->tail++] = PAC_char;        \
 207 } while (0)
 208
 209 /* Forget old pool contents.  The allocated memory is not freed. */
 210 #define POOL_REWIND(p) (p)->tail = 0
 211
 212 /* Free heap-allocated memory for contents of POOL.  This calls
 213    xfree() if the memory was allocated through malloc.  It also
 214    restores `contents' and `size' to their original, pre-malloc
 215    values.  That way after POOL_FREE, the pool is fully usable, just
 216    as if it were freshly initialized with POOL_INIT.  */
 217
 218 #define POOL_FREE(p) do {                       \
 219   struct pool *P = p;                           \
 220   if (P->resized)                               \
 221     xfree (P->contents);                        \
 222   P->contents = P->orig_contents;               \
 223   P->size = P->orig_size;                       \
 224   P->tail = 0;                                  \
 225   P->resized = 0;                               \
 226 } while (0)
 227
 228 /* Used for small stack-allocated memory chunks that might grow.  Like
 229    DO_REALLOC, this macro grows BASEVAR as necessary to take
 230    NEEDED_SIZE items of TYPE.
 231
 232    The difference is that on the first resize, it will use
 233    malloc+memcpy rather than realloc.  That way you can stack-allocate
 234    the initial chunk, and only resort to heap allocation if you
 235    stumble upon large data.
 236
 237    After the first resize, subsequent ones are performed with realloc,
 238    just like DO_REALLOC.  */
 239
 240 #define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do {           \
 241   long ga_needed_size = (needed_size);                                          \
 242   long ga_newsize = (sizevar);                                                  \
 243   while (ga_newsize < ga_needed_size)                                           \
 244     ga_newsize <<= 1;                                                           \
 245   if (ga_newsize != (sizevar))                                                  \
 246     {                                                                           \
 247       if (resized)                                                              \
 248         basevar = (type *)xrealloc (basevar, ga_newsize * sizeof (type));       \
 249       else                                                                      \
 250         {                                                                       \
 251           void *ga_new = xmalloc (ga_newsize * sizeof (type));                  \
 252           memcpy (ga_new, basevar, (sizevar) * sizeof (type));                  \
 253           (basevar) = ga_new;                                                   \
 254           resized = 1;                                                          \
 255         }                                                                       \
 256       (sizevar) = ga_newsize;                                                   \
 257     }                                                                           \
 258 } while (0)
 259 \f
 260 /* Test whether n+1-sized entity name fits in P.  We don't support
 261    IE-style non-terminated entities, e.g. "&ltfoo" -> "<foo".
 262    However, "&lt;foo" will work, as will "&lt!foo", "&lt", etc.  In
 263    other words an entity needs to be terminated by either a
 264    non-alphanumeric or the end of string.  */
 265 #define FITS(p, n) (p + n == end || (p + n < end && !ISALNUM (p[n])))
 266
 267 /* Macros that test entity names by returning true if P is followed by
 268    the specified characters.  */
 269 #define ENT1(p, c0) (FITS (p, 1) && p[0] == c0)
 270 #define ENT2(p, c0, c1) (FITS (p, 2) && p[0] == c0 && p[1] == c1)
 271 #define ENT3(p, c0, c1, c2) (FITS (p, 3) && p[0]==c0 && p[1]==c1 && p[2]==c2)
 272
 273 /* Increment P by INC chars.  If P lands at a semicolon, increment it
 274    past the semicolon.  This ensures that e.g. "&lt;foo" is converted
 275    to "<foo", but "&lt,foo" to "<,foo".  */
 276 #define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)
 277
 278 /* Decode the HTML character entity at *PTR, considering END to be end
 279    of buffer.  It is assumed that the "&" character that marks the
 280    beginning of the entity has been seen at *PTR-1.  If a recognized
 281    ASCII entity is seen, it is returned, and *PTR is moved to the end
 282    of the entity.  Otherwise, -1 is returned and *PTR left unmodified.
 283
 284    The recognized entities are: &lt, &gt, &amp, &apos, and &quot.  */
 285
 286 static int
 287 decode_entity (const char **ptr, const char *end)
 288 {
 289   const char *p = *ptr;
 290   int value = -1;
 291
 292   if (++p == end)
 293     return -1;
 294
 295   switch (*p++)
 296     {
 297     case '#':
 298       /* Process numeric entities "&#DDD;" and "&#xHH;".  */
 299       {
 300         int digits = 0;
 301         value = 0;
 302         if (*p == 'x')
 303           for (++p; value < 256 && p < end && ISXDIGIT (*p); p++, digits++)
 304             value = (value << 4) + XDIGIT_TO_NUM (*p);
 305         else
 306           for (; value < 256 && p < end && ISDIGIT (*p); p++, digits++)
 307             value = (value * 10) + (*p - '0');
 308         if (!digits)
 309           return -1;
 310         /* Don't interpret 128+ codes and NUL because we cannot
 311            portably reinserted them into HTML.  */
 312         if (!value || (value & ~0x7f))
 313           return -1;
 314         *ptr = SKIP_SEMI (p, 0);
 315         return value;
 316       }
 317     /* Process named ASCII entities.  */
 318     case 'g':
 319       if (ENT1 (p, 't'))
 320         value = '>', *ptr = SKIP_SEMI (p, 1);
 321       break;
 322     case 'l':
 323       if (ENT1 (p, 't'))
 324         value = '<', *ptr = SKIP_SEMI (p, 1);
 325       break;
 326     case 'a':
 327       if (ENT2 (p, 'm', 'p'))
 328         value = '&', *ptr = SKIP_SEMI (p, 2);
 329       else if (ENT3 (p, 'p', 'o', 's'))
 330         /* handle &apos for the sake of the XML/XHTML crowd. */
 331         value = '\'', *ptr = SKIP_SEMI (p, 3);
 332       break;
 333     case 'q':
 334       if (ENT3 (p, 'u', 'o', 't'))
 335         value = '\"', *ptr = SKIP_SEMI (p, 3);
 336       break;
 337     }
 338   return value;
 339 }
 340 #undef ENT1
 341 #undef ENT2
 342 #undef ENT3
 343
 344 enum {
 345   AP_DOWNCASE           = 1,
 346   AP_DECODE_ENTITIES    = 2,
 347   AP_TRIM_BLANKS        = 4
 348 };
 349
 350 /* Copy the text in the range [BEG, END) to POOL, optionally
 351    performing operations specified by FLAGS.  FLAGS may be any
 352    combination of AP_DOWNCASE, AP_DECODE_ENTITIES and AP_TRIM_BLANKS
 353    with the following meaning:
 354
 355    * AP_DOWNCASE -- downcase all the letters;
 356
 357    * AP_DECODE_ENTITIES -- decode the named and numeric entities in
 358      the ASCII range when copying the string.
 359
 360    * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
 361      of text.  */
 362
 363 static void
 364 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
 365 {
 366   int old_tail = pool->tail;
 367   int size;
 368
 369   /* First, skip blanks if required.  We must do this before entities
 370      are processed, so that blanks can still be inserted as, for
 371      instance, `&#32;'.  */
 372   if (flags & AP_TRIM_BLANKS)
 373     {
 374       while (beg < end && ISSPACE (*beg))
 375         ++beg;
 376       while (end > beg && ISSPACE (end[-1]))
 377         --end;
 378     }
 379   size = end - beg;
 380
 381   if (flags & AP_DECODE_ENTITIES)
 382     {
 383       /* Grow the pool, then copy the text to the pool character by
 384          character, processing the encountered entities as we go
 385          along.
 386
 387          It's safe (and necessary) to grow the pool in advance because
 388          processing the entities can only *shorten* the string, it can
 389          never lengthen it.  */
 390       const char *from = beg;
 391       char *to;
 392
 393       POOL_GROW (pool, end - beg);
 394       to = pool->contents + pool->tail;
 395
 396       while (from < end)
 397         {
 398           if (*from != '&')
 399             *to++ = *from++;
 400           else
 401             {
 402               int entity = decode_entity (&from, end);
 403               if (entity != -1)
 404                 *to++ = entity;
 405               else
 406                 *to++ = *from++;
 407             }
 408         }
 409       /* Verify that we haven't exceeded the original size.  (It
 410          shouldn't happen, hence the assert.)  */
 411       assert (to - (pool->contents + pool->tail) <= end - beg);
 412
 413       /* Make POOL's tail point to the position following the string
 414          we've written.  */
 415       pool->tail = to - pool->contents;
 416       POOL_APPEND_CHR (pool, '\0');
 417     }
 418   else
 419     {
 420       /* Just copy the text to the pool.  */
 421       POOL_APPEND (pool, beg, end);
 422       POOL_APPEND_CHR (pool, '\0');
 423     }
 424
 425   if (flags & AP_DOWNCASE)
 426     {
 427       char *p = pool->contents + old_tail;
 428       for (; *p; p++)
 429         *p = TOLOWER (*p);
 430     }
 431 }
 432 \f
 433 /* Originally we used to adhere to rfc 1866 here, and allowed only
 434    letters, digits, periods, and hyphens as names (of tags or
 435    attributes).  However, this broke too many pages which used
 436    proprietary or strange attributes, e.g. <img src="a.gif"
 437    v:shapes="whatever">.
 438
 439    So now we allow any character except:
 440      * whitespace
 441      * 8-bit and control chars
 442      * characters that clearly cannot be part of name:
 443        '=', '>', '/'.
 444
 445    This only affects attribute and tag names; attribute values allow
 446    an even greater variety of characters.  */
 447
 448 #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
 449                         && (x) != '=' && (x) != '>' && (x) != '/')
 450
 451 #ifdef STANDALONE
 452 static int comment_backout_count;
 453 #endif
 454
 455 /* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
 456    strict comments mode, this is used for skipping over comments as
 457    well.
 458
 459    To recap: any SGML declaration may have comments associated with
 460    it, e.g.
 461        <!MY-DECL -- isn't this fun? -- foo bar>
 462
 463    An HTML comment is merely an empty declaration (<!>) with a comment
 464    attached, like this:
 465        <!-- some stuff here -->
 466
 467    Several comments may be embedded in one comment declaration:
 468        <!-- have -- -- fun -->
 469
 470    Whitespace is allowed between and after the comments, but not
 471    before the first comment.  Additionally, this function attempts to
 472    handle double quotes in SGML declarations correctly.  */
 473
 474 static const char *
 475 advance_declaration (const char *beg, const char *end)
 476 {
 477   const char *p = beg;
 478   char quote_char = '\0';       /* shut up, gcc! */
 479   char ch;
 480
 481   enum {
 482     AC_S_DONE,
 483     AC_S_BACKOUT,
 484     AC_S_BANG,
 485     AC_S_DEFAULT,
 486     AC_S_DCLNAME,
 487     AC_S_DASH1,
 488     AC_S_DASH2,
 489     AC_S_COMMENT,
 490     AC_S_DASH3,
 491     AC_S_DASH4,
 492     AC_S_QUOTE1,
 493     AC_S_IN_QUOTE,
 494     AC_S_QUOTE2
 495   } state = AC_S_BANG;
 496
 497   if (beg == end)
 498     return beg;
 499   ch = *p++;
 500
 501   /* It looked like a good idea to write this as a state machine, but
 502      now I wonder...  */
 503
 504   while (state != AC_S_DONE && state != AC_S_BACKOUT)
 505     {
 506       if (p == end)
 507         state = AC_S_BACKOUT;
 508       switch (state)
 509         {
 510         case AC_S_DONE:
 511         case AC_S_BACKOUT:
 512           break;
 513         case AC_S_BANG:
 514           if (ch == '!')
 515             {
 516               ch = *p++;
 517               state = AC_S_DEFAULT;
 518             }
 519           else
 520             state = AC_S_BACKOUT;
 521           break;
 522         case AC_S_DEFAULT:
 523           switch (ch)
 524             {
 525             case '-':
 526               state = AC_S_DASH1;
 527               break;
 528             case ' ':
 529             case '\t':
 530             case '\r':
 531             case '\n':
 532               ch = *p++;
 533               break;
 534             case '>':
 535               state = AC_S_DONE;
 536               break;
 537             case '\'':
 538             case '\"':
 539               state = AC_S_QUOTE1;
 540               break;
 541             default:
 542               if (NAME_CHAR_P (ch))
 543                 state = AC_S_DCLNAME;
 544               else
 545                 state = AC_S_BACKOUT;
 546               break;
 547             }
 548           break;
 549         case AC_S_DCLNAME:
 550           if (ch == '-')
 551             state = AC_S_DASH1;
 552           else if (NAME_CHAR_P (ch))
 553             ch = *p++;
 554           else
 555             state = AC_S_DEFAULT;
 556           break;
 557         case AC_S_QUOTE1:
 558           /* We must use 0x22 because broken assert macros choke on
 559              '"' and '\"'.  */
 560           assert (ch == '\'' || ch == 0x22);
 561           quote_char = ch;      /* cheating -- I really don't feel like
 562                                    introducing more different states for
 563                                    different quote characters. */
 564           ch = *p++;
 565           state = AC_S_IN_QUOTE;
 566           break;
 567         case AC_S_IN_QUOTE:
 568           if (ch == quote_char)
 569             state = AC_S_QUOTE2;
 570           else
 571             ch = *p++;
 572           break;
 573         case AC_S_QUOTE2:
 574           assert (ch == quote_char);
 575           ch = *p++;
 576           state = AC_S_DEFAULT;
 577           break;
 578         case AC_S_DASH1:
 579           assert (ch == '-');
 580           ch = *p++;
 581           state = AC_S_DASH2;
 582           break;
 583         case AC_S_DASH2:
 584           switch (ch)
 585             {
 586             case '-':
 587               ch = *p++;
 588               state = AC_S_COMMENT;
 589               break;
 590             default:
 591               state = AC_S_BACKOUT;
 592             }
 593           break;
 594         case AC_S_COMMENT:
 595           switch (ch)
 596             {
 597             case '-':
 598               state = AC_S_DASH3;
 599               break;
 600             default:
 601               ch = *p++;
 602               break;
 603             }
 604           break;
 605         case AC_S_DASH3:
 606           assert (ch == '-');
 607           ch = *p++;
 608           state = AC_S_DASH4;
 609           break;
 610         case AC_S_DASH4:
 611           switch (ch)
 612             {
 613             case '-':
 614               ch = *p++;
 615               state = AC_S_DEFAULT;
 616               break;
 617             default:
 618               state = AC_S_COMMENT;
 619               break;
 620             }
 621           break;
 622         }
 623     }
 624
 625   if (state == AC_S_BACKOUT)
 626     {
 627 #ifdef STANDALONE
 628       ++comment_backout_count;
 629 #endif
 630       return beg + 1;
 631     }
 632   return p;
 633 }
 634
 635 /* Find the first occurrence of the substring "-->" in [BEG, END) and
 636    return the pointer to the character after the substring.  If the
 637    substring is not found, return NULL.  */
 638
 639 static const char *
 640 find_comment_end (const char *beg, const char *end)
 641 {
 642   /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
 643      if it's not '>' or '-', advance by three characters.  Otherwise,
 644      look at the preceding characters and try to find a match.  */
 645
 646   const char *p = beg - 1;
 647
 648   while ((p += 3) < end)
 649     switch (p[0])
 650       {
 651       case '>':
 652         if (p[-1] == '-' && p[-2] == '-')
 653           return p + 1;
 654         break;
 655       case '-':
 656       at_dash:
 657         if (p[-1] == '-')
 658           {
 659           at_dash_dash:
 660             if (++p == end) return NULL;
 661             switch (p[0])
 662               {
 663               case '>': return p + 1;
 664               case '-': goto at_dash_dash;
 665               }
 666           }
 667         else
 668           {
 669             if ((p += 2) >= end) return NULL;
 670             switch (p[0])
 671               {
 672               case '>':
 673                 if (p[-1] == '-')
 674                   return p + 1;
 675                 break;
 676               case '-':
 677                 goto at_dash;
 678               }
 679           }
 680       }
 681   return NULL;
 682 }
 683 \f
 684 /* Return non-zero of the string inside [b, e) are present in hash
 685    table HT.  */
 686
 687 static int
 688 name_allowed (const struct hash_table *ht, const char *b, const char *e)
 689 {
 690   char *copy;
 691   if (!ht)
 692     return 1;
 693   BOUNDED_TO_ALLOCA (b, e, copy);
 694   return hash_table_get (ht, copy) != NULL;
 695 }
 696
 697 /* Advance P (a char pointer), with the explicit intent of being able
 698    to read the next character.  If this is not possible, go to finish.  */
 699
 700 #define ADVANCE(p) do {                         \
 701   ++p;                                          \
 702   if (p >= end)                                 \
 703     goto finish;                                \
 704 } while (0)
 705
 706 /* Skip whitespace, if any. */
 707
 708 #define SKIP_WS(p) do {                         \
 709   while (ISSPACE (*p)) {                        \
 710     ADVANCE (p);                                \
 711   }                                             \
 712 } while (0)
 713
 714 /* Skip non-whitespace, if any. */
 715
 716 #define SKIP_NON_WS(p) do {                     \
 717   while (!ISSPACE (*p)) {                       \
 718     ADVANCE (p);                                \
 719   }                                             \
 720 } while (0)
 721
 722 #ifdef STANDALONE
 723 static int tag_backout_count;
 724 #endif
 725
 726 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
 727    MAPFUN will be called with two arguments: pointer to an initialized
 728    struct taginfo, and MAPARG.
 729
 730    ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to
 731    be processed by this function.  If it is NULL, all the tags are
 732    allowed.  The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES.
 733
 734    (Obviously, the caller can filter out unwanted tags and attributes
 735    just as well, but this is just an optimization designed to avoid
 736    unnecessary copying for tags/attributes which the caller doesn't
 737    want to know about.  These lists are searched linearly; therefore,
 738    if you're interested in a large number of tags or attributes, you'd
 739    better set these to NULL and filter them out yourself with a
 740    hashing process most appropriate for your application.)  */
 741
 742 void
 743 map_html_tags (const char *text, int size,
 744                void (*mapfun) (struct taginfo *, void *), void *maparg,
 745                int flags,
 746                const struct hash_table *allowed_tags,
 747                const struct hash_table *allowed_attributes)
 748 {
 749   /* storage for strings passed to MAPFUN callback; if 256 bytes is
 750      too little, POOL_APPEND allocates more with malloc. */
 751   char pool_initial_storage[256];
 752   struct pool pool;
 753
 754   const char *p = text;
 755   const char *end = text + size;
 756
 757   struct attr_pair attr_pair_initial_storage[8];
 758   int attr_pair_size = countof (attr_pair_initial_storage);
 759   int attr_pair_resized = 0;
 760   struct attr_pair *pairs = attr_pair_initial_storage;
 761
 762   if (!size)
 763     return;
 764
 765   POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));
 766
 767   {
 768     int nattrs, end_tag;
 769     const char *tag_name_begin, *tag_name_end;
 770     const char *tag_start_position;
 771     int uninteresting_tag;
 772
 773   look_for_tag:
 774     POOL_REWIND (&pool);
 775
 776     nattrs = 0;
 777     end_tag = 0;
 778
 779     /* Find beginning of tag.  We use memchr() instead of the usual
 780        looping with ADVANCE() for speed. */
 781     p = memchr (p, '<', end - p);
 782     if (!p)
 783       goto finish;
 784
 785     tag_start_position = p;
 786     ADVANCE (p);
 787
 788     /* Establish the type of the tag (start-tag, end-tag or
 789        declaration).  */
 790     if (*p == '!')
 791       {
 792         if (!(flags & MHT_STRICT_COMMENTS)
 793             && p < end + 3 && p[1] == '-' && p[2] == '-')
 794           {
 795             /* If strict comments are not enforced and if we know
 796                we're looking at a comment, simply look for the
 797                terminating "-->".  Non-strict is the default because
 798                it works in other browsers and most HTML writers can't
 799                be bothered with getting the comments right.  */
 800             const char *comment_end = find_comment_end (p + 3, end);
 801             if (comment_end)
 802               p = comment_end;
 803           }
 804         else
 805           {
 806             /* Either in strict comment mode or looking at a non-empty
 807                declaration.  Real declarations are much less likely to
 808                be misused the way comments are, so advance over them
 809                properly regardless of strictness.  */
 810             p = advance_declaration (p, end);
 811           }
 812         if (p == end)
 813           goto finish;
 814         goto look_for_tag;
 815       }
 816     else if (*p == '/')
 817       {
 818         end_tag = 1;
 819         ADVANCE (p);
 820       }
 821     tag_name_begin = p;
 822     while (NAME_CHAR_P (*p))
 823       ADVANCE (p);
 824     if (p == tag_name_begin)
 825       goto look_for_tag;
 826     tag_name_end = p;
 827     SKIP_WS (p);
 828     if (end_tag && *p != '>')
 829       goto backout_tag;
 830
 831     if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
 832       /* We can't just say "goto look_for_tag" here because we need
 833          the loop below to properly advance over the tag's attributes.  */
 834       uninteresting_tag = 1;
 835     else
 836       {
 837         uninteresting_tag = 0;
 838         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
 839       }
 840
 841     /* Find the attributes. */
 842     while (1)
 843       {
 844         const char *attr_name_begin, *attr_name_end;
 845         const char *attr_value_begin, *attr_value_end;
 846         const char *attr_raw_value_begin, *attr_raw_value_end;
 847         int operation = AP_DOWNCASE; /* stupid compiler. */
 848
 849         SKIP_WS (p);
 850
 851         if (*p == '/')
 852           {
 853             /* A slash at this point means the tag is about to be
 854                closed.  This is legal in XML and has been popularized
 855                in HTML via XHTML.  */
 856             /* <foo a=b c=d /> */
 857             /*              ^  */
 858             ADVANCE (p);
 859             SKIP_WS (p);
 860             if (*p != '>')
 861               goto backout_tag;
 862           }
 863
 864         /* Check for end of tag definition. */
 865         if (*p == '>')
 866           break;
 867
 868         /* Establish bounds of attribute name. */
 869         attr_name_begin = p;    /* <foo bar ...> */
 870                                 /*      ^        */
 871         while (NAME_CHAR_P (*p))
 872           ADVANCE (p);
 873         attr_name_end = p;      /* <foo bar ...> */
 874                                 /*         ^     */
 875         if (attr_name_begin == attr_name_end)
 876           goto backout_tag;
 877
 878         /* Establish bounds of attribute value. */
 879         SKIP_WS (p);
 880         if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
 881           {
 882             /* Minimized attribute syntax allows `=' to be omitted.
 883                For example, <UL COMPACT> is a valid shorthand for <UL
 884                COMPACT="compact">.  Even if such attributes are not
 885                useful to Wget, we need to support them, so that the
 886                tags containing them can be parsed correctly. */
 887             attr_raw_value_begin = attr_value_begin = attr_name_begin;
 888             attr_raw_value_end = attr_value_end = attr_name_end;
 889           }
 890         else if (*p == '=')
 891           {
 892             ADVANCE (p);
 893             SKIP_WS (p);
 894             if (*p == '\"' || *p == '\'')
 895               {
 896                 int newline_seen = 0;
 897                 char quote_char = *p;
 898                 attr_raw_value_begin = p;
 899                 ADVANCE (p);
 900                 attr_value_begin = p; /* <foo bar="baz"> */
 901                                       /*           ^     */
 902                 while (*p != quote_char)
 903                   {
 904                     if (!newline_seen && *p == '\n')
 905                       {
 906                         /* If a newline is seen within the quotes, it
 907                            is most likely that someone forgot to close
 908                            the quote.  In that case, we back out to
 909                            the value beginning, and terminate the tag
 910                            at either `>' or the delimiter, whichever
 911                            comes first.  Such a tag terminated at `>'
 912                            is discarded.  */
 913                         p = attr_value_begin;
 914                         newline_seen = 1;
 915                         continue;
 916                       }
 917                     else if (newline_seen && *p == '>')
 918                       break;
 919                     ADVANCE (p);
 920                   }
 921                 attr_value_end = p; /* <foo bar="baz"> */
 922                                     /*              ^  */
 923                 if (*p == quote_char)
 924                   ADVANCE (p);
 925                 else
 926                   goto look_for_tag;
 927                 attr_raw_value_end = p; /* <foo bar="baz"> */
 928                                         /*               ^ */
 929                 operation = AP_DECODE_ENTITIES;
 930                 if (flags & MHT_TRIM_VALUES)
 931                   operation |= AP_TRIM_BLANKS;
 932               }
 933             else
 934               {
 935                 attr_value_begin = p; /* <foo bar=baz> */
 936                                       /*          ^    */
 937                 /* According to SGML, a name token should consist only
 938                    of alphanumerics, . and -.  However, this is often
 939                    violated by, for instance, `%' in `width=75%'.
 940                    We'll be liberal and allow just about anything as
 941                    an attribute value.  */
 942                 while (!ISSPACE (*p) && *p != '>')
 943                   ADVANCE (p);
 944                 attr_value_end = p; /* <foo bar=baz qux=quix> */
 945                                     /*             ^          */
 946                 if (attr_value_begin == attr_value_end)
 947                   /* <foo bar=> */
 948                   /*          ^ */
 949                   goto backout_tag;
 950                 attr_raw_value_begin = attr_value_begin;
 951                 attr_raw_value_end = attr_value_end;
 952                 operation = AP_DECODE_ENTITIES;
 953               }
 954           }
 955         else
 956           {
 957             /* We skipped the whitespace and found something that is
 958                neither `=' nor the beginning of the next attribute's
 959                name.  Back out.  */
 960             goto backout_tag;   /* <foo bar [... */
 961                                 /*          ^    */
 962           }
 963
 964         /* If we're not interested in the tag, don't bother with any
 965            of the attributes.  */
 966         if (uninteresting_tag)
 967           continue;
 968
 969         /* If we aren't interested in the attribute, skip it.  We
 970            cannot do this test any sooner, because our text pointer
 971            needs to correctly advance over the attribute.  */
 972         if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
 973           continue;
 974
 975         GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
 976                     struct attr_pair);
 977
 978         pairs[nattrs].name_pool_index = pool.tail;
 979         convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
 980
 981         pairs[nattrs].value_pool_index = pool.tail;
 982         convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
 983         pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
 984         pairs[nattrs].value_raw_size = (attr_raw_value_end
 985                                         - attr_raw_value_begin);
 986         ++nattrs;
 987       }
 988
 989     if (uninteresting_tag)
 990       {
 991         ADVANCE (p);
 992         goto look_for_tag;
 993       }
 994
 995     /* By now, we have a valid tag with a name and zero or more
 996        attributes.  Fill in the data and call the mapper function.  */
 997     {
 998       int i;
 999       struct taginfo taginfo;
1000
1001       taginfo.name      = pool.contents;
1002       taginfo.end_tag_p = end_tag;
1003       taginfo.nattrs    = nattrs;
1004       /* We fill in the char pointers only now, when pool can no
1005          longer get realloc'ed.  If we did that above, we could get
1006          hosed by reallocation.  Obviously, after this point, the pool
1007          may no longer be grown.  */
1008       for (i = 0; i < nattrs; i++)
1009         {
1010           pairs[i].name = pool.contents + pairs[i].name_pool_index;
1011           pairs[i].value = pool.contents + pairs[i].value_pool_index;
1012         }
1013       taginfo.attrs = pairs;
1014       taginfo.start_position = tag_start_position;
1015       taginfo.end_position   = p + 1;
1016       /* Ta-dam! */
1017       (*mapfun) (&taginfo, maparg);
1018       ADVANCE (p);
1019     }
1020     goto look_for_tag;
1021
1022   backout_tag:
1023 #ifdef STANDALONE
1024     ++tag_backout_count;
1025 #endif
1026     /* The tag wasn't really a tag.  Treat its contents as ordinary
1027        data characters. */
1028     p = tag_start_position + 1;
1029     goto look_for_tag;
1030   }
1031
1032  finish:
1033   POOL_FREE (&pool);
1034   if (attr_pair_resized)
1035     xfree (pairs);
1036 }
1037
1038 #undef ADVANCE
1039 #undef SKIP_WS
1040 #undef SKIP_NON_WS
1041 \f
1042 #ifdef STANDALONE
1043 static void
1044 test_mapper (struct taginfo *taginfo, void *arg)
1045 {
1046   int i;
1047
1048   printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
1049   for (i = 0; i < taginfo->nattrs; i++)
1050     printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
1051   putchar ('\n');
1052   ++*(int *)arg;
1053 }
1054
1055 int main ()
1056 {
1057   int size = 256;
1058   char *x = (char *)xmalloc (size);
1059   int length = 0;
1060   int read_count;
1061   int tag_counter = 0;
1062
1063   while ((read_count = fread (x + length, 1, size - length, stdin)))
1064     {
1065       length += read_count;
1066       size <<= 1;
1067       x = (char *)xrealloc (x, size);
1068     }
1069
1070   map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);
1071   printf ("TAGS: %d\n", tag_counter);
1072   printf ("Tag backouts:     %d\n", tag_backout_count);
1073   printf ("Comment backouts: %d\n", comment_backout_count);
1074   return 0;
1075 }
1076 #endif /* STANDALONE */