sjero.net Git - wget/blob - src/html-parse.c

   1 /* HTML parser for Wget.
   2    Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 /* The only entry point to this module is map_html_tags(), which see.  */
  31
  32 /* TODO:
  33
  34    - Allow hooks for callers to process contents outside tags.  This
  35      is needed to implement handling <style> and <script>.  The
  36      taginfo structure already carries the information about where the
  37      tags are, but this is not enough, because one would also want to
  38      skip the comments.  (The funny thing is that for <style> and
  39      <script> you *don't* want to skip comments!)
  40
  41    - Create a test suite for regression testing. */
  42
  43 /* HISTORY:
  44
  45    This is the third HTML parser written for Wget.  The first one was
  46    written some time during the Geturl 1.0 beta cycle, and was very
  47    inefficient and buggy.  It also contained some very complex code to
  48    remember a list of parser states, because it was supposed to be
  49    reentrant.  The idea was that several parsers would be running
  50    concurrently, and you'd have pass the function a unique ID string
  51    (for example, the URL) by which it found the relevant parser state
  52    and returned the next URL.  Over-engineering at its best.
  53
  54    The second HTML parser was written for Wget 1.4 (the first version
  55    by the name `Wget'), and was a complete rewrite.  Although the new
  56    parser behaved much better and made no claims of reentrancy, it
  57    still shared many of the fundamental flaws of the old version -- it
  58    only regarded HTML in terms tag-attribute pairs, where the
  59    attribute's value was a URL to be returned.  Any other property of
  60    HTML, such as <base href=...>, or strange way to specify a URL,
  61    such as <meta http-equiv=Refresh content="0; URL=..."> had to be
  62    crudely hacked in -- and the caller had to be aware of these hacks.
  63    Like its predecessor, this parser did not support HTML comments.
  64
  65    After Wget 1.5.1 was released, I set out to write a third HTML
  66    parser.  The objectives of the new parser were to: (1) provide a
  67    clean way to analyze HTML lexically, (2) separate interpretation of
  68    the markup from the parsing process, (3) be as correct as possible,
  69    e.g. correctly skipping comments and other SGML declarations, (4)
  70    understand the most common errors in markup and skip them or be
  71    relaxed towrds them, and (5) be reasonably efficient (no regexps,
  72    minimum copying and minimum or no heap allocation).
  73
  74    I believe this parser meets all of the above goals.  It is
  75    reasonably well structured, and could be relatively easily
  76    separated from Wget and used elsewhere.  While some of its
  77    intrinsic properties limit its value as a general-purpose HTML
  78    parser, I believe that, with minimum modifications, it could serve
  79    as a backend for one.
  80
  81    Due to time and other constraints, this parser was not integrated
  82    into Wget until the version 1.7. */
  83
  84 /* DESCRIPTION:
  85
  86    The single entry point of this parser is map_html_tags(), which
  87    works by calling a function you specify for each tag.  The function
  88    gets called with the pointer to a structure describing the tag and
  89    its attributes.  */
  90
  91 /* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
  92    still need Wget headers to compile.  */
  93
  94 #include <config.h>
  95
  96 #ifdef STANDALONE
  97 # define I_REALLY_WANT_CTYPE_MACROS
  98 #endif
  99
 100 #include <stdio.h>
 101 #include <stdlib.h>
 102 #ifdef HAVE_STRING_H
 103 # include <string.h>
 104 #else
 105 # include <strings.h>
 106 #endif
 107 #include <assert.h>
 108
 109 #include "wget.h"
 110 #include "html-parse.h"
 111
 112 #ifdef STANDALONE
 113 # define xmalloc malloc
 114 # define xrealloc realloc
 115 # define xfree free
 116
 117 # define ISSPACE(x) isspace (x)
 118 # define ISDIGIT(x) isdigit (x)
 119 # define ISALPHA(x) isalpha (x)
 120 # define ISALNUM(x) isalnum (x)
 121 # define TOLOWER(x) tolower (x)
 122 #endif /* STANDALONE */
 123
 124 /* Pool support.  A pool is a resizable chunk of memory.  It is first
 125    allocated on the stack, and moved to the heap if it needs to be
 126    larger than originally expected.  map_html_tags() uses it to store
 127    the zero-terminated names and values of tags and attributes.
 128
 129    Thus taginfo->name, and attr->name and attr->value for each
 130    attribute, do not point into separately allocated areas, but into
 131    different parts of the pool, separated only by terminating zeros.
 132    This ensures minimum amount of allocation and, for most tags, no
 133    allocation because the entire pool is kept on the stack.  */
 134
 135 struct pool {
 136   char *contents;               /* pointer to the contents. */
 137   int size;                     /* size of the pool. */
 138   int index;                    /* next unoccupied position in
 139                                    contents. */
 140
 141   int alloca_p;                 /* whether contents was allocated
 142                                    using alloca(). */
 143   char *orig_contents;          /* orig_contents, allocated by
 144                                    alloca().  this is used by
 145                                    POOL_FREE to restore the pool to
 146                                    the "initial" state. */
 147   int orig_size;
 148 };
 149
 150 /* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
 151
 152 #define POOL_INIT(pool, initial_size) do {              \
 153   (pool).size = (initial_size);                         \
 154   (pool).contents = ALLOCA_ARRAY (char, (pool).size);   \
 155   (pool).index = 0;                                     \
 156   (pool).alloca_p = 1;                                  \
 157   (pool).orig_contents = (pool).contents;               \
 158   (pool).orig_size = (pool).size;                       \
 159 } while (0)
 160
 161 /* Grow the pool to accomodate at least SIZE new bytes.  If the pool
 162    already has room to accomodate SIZE bytes of data, this is a no-op.  */
 163
 164 #define POOL_GROW(pool, increase) do {                                  \
 165   int PG_newsize = (pool).index + increase;                             \
 166   DO_REALLOC_FROM_ALLOCA ((pool).contents, (pool).size, PG_newsize,     \
 167                           (pool).alloca_p, char);                       \
 168 } while (0)
 169
 170 /* Append text in the range [beg, end) to POOL.  No zero-termination
 171    is done.  */
 172
 173 #define POOL_APPEND(pool, beg, end) do {                        \
 174   const char *PA_beg = beg;                                     \
 175   int PA_size = end - PA_beg;                                   \
 176   POOL_GROW (pool, PA_size);                                    \
 177   memcpy ((pool).contents + (pool).index, PA_beg, PA_size);     \
 178   (pool).index += PA_size;                                      \
 179 } while (0)
 180
 181 /* The same as the above, but with zero termination. */
 182
 183 #define POOL_APPEND_ZT(pool, beg, end) do {                     \
 184   const char *PA_beg = beg;                                     \
 185   int PA_size = end - PA_beg;                                   \
 186   POOL_GROW (pool, PA_size + 1);                                \
 187   memcpy ((pool).contents + (pool).index, PA_beg, PA_size);     \
 188   (pool).contents[(pool).index + PA_size] = '\0';               \
 189   (pool).index += PA_size + 1;                                  \
 190 } while (0)
 191
 192 /* Forget old pool contents.  The allocated memory is not freed. */
 193 #define POOL_REWIND(pool) pool.index = 0
 194
 195 /* Free heap-allocated memory for contents of POOL.  This calls
 196    xfree() if the memory was allocated through malloc.  It also
 197    restores `contents' and `size' to their original, pre-malloc
 198    values.  That way after POOL_FREE, the pool is fully usable, just
 199    as if it were freshly initialized with POOL_INIT.  */
 200
 201 #define POOL_FREE(pool) do {                    \
 202   if (!(pool).alloca_p)                         \
 203     xfree ((pool).contents);                    \
 204   (pool).contents = (pool).orig_contents;       \
 205   (pool).size = (pool).orig_size;               \
 206   (pool).index = 0;                             \
 207   (pool).alloca_p = 1;                          \
 208 } while (0)
 209
 210 \f
 211 #define AP_DOWNCASE             1
 212 #define AP_PROCESS_ENTITIES     2
 213 #define AP_SKIP_BLANKS          4
 214
 215 /* Copy the text in the range [BEG, END) to POOL, optionally
 216    performing operations specified by FLAGS.  FLAGS may be any
 217    combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_SKIP_BLANKS
 218    with the following meaning:
 219
 220    * AP_DOWNCASE -- downcase all the letters;
 221
 222    * AP_PROCESS_ENTITIES -- process the SGML entities and write out
 223    the decoded string.  Recognized entities are &lt, &gt, &amp, &quot,
 224    &nbsp and the numerical entities.
 225
 226    * AP_SKIP_BLANKS -- ignore blanks at the beginning and at the end
 227    of text.  */
 228 static void
 229 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
 230 {
 231   int old_index = pool->index;
 232   int size;
 233
 234   /* First, skip blanks if required.  We must do this before entities
 235      are processed, so that blanks can still be inserted as, for
 236      instance, `&#32;'.  */
 237   if (flags & AP_SKIP_BLANKS)
 238     {
 239       while (beg < end && ISSPACE (*beg))
 240         ++beg;
 241       while (end > beg && ISSPACE (end[-1]))
 242         --end;
 243     }
 244   size = end - beg;
 245
 246   if (flags & AP_PROCESS_ENTITIES)
 247     {
 248       /* Stack-allocate a copy of text, process entities and copy it
 249          to the pool.  */
 250       char *local_copy = (char *)alloca (size + 1);
 251       const char *from = beg;
 252       char *to = local_copy;
 253
 254       while (from < end)
 255         {
 256           if (*from != '&')
 257             *to++ = *from++;
 258           else
 259             {
 260               const char *save = from;
 261               int remain;
 262
 263               if (++from == end) goto lose;
 264               remain = end - from;
 265
 266               if (*from == '#')
 267                 {
 268                   int numeric;
 269                   ++from;
 270                   if (from == end || !ISDIGIT (*from)) goto lose;
 271                   for (numeric = 0; from < end && ISDIGIT (*from); from++)
 272                     numeric = 10 * numeric + (*from) - '0';
 273                   if (from < end && ISALPHA (*from)) goto lose;
 274                   numeric &= 0xff;
 275                   *to++ = numeric;
 276                 }
 277 #define FROB(x) (remain >= (sizeof (x) - 1)                     \
 278                  && !memcmp (from, x, sizeof (x) - 1)           \
 279                  && (*(from + sizeof (x) - 1) == ';'            \
 280                      || remain == sizeof (x) - 1                \
 281                      || !ISALNUM (*(from + sizeof (x) - 1))))
 282               else if (FROB ("lt"))
 283                 *to++ = '<', from += 2;
 284               else if (FROB ("gt"))
 285                 *to++ = '>', from += 2;
 286               else if (FROB ("amp"))
 287                 *to++ = '&', from += 3;
 288               else if (FROB ("quot"))
 289                 *to++ = '\"', from += 4;
 290               /* We don't implement the proposed "Added Latin 1"
 291                  entities (except for nbsp), because it is unnecessary
 292                  in the context of Wget, and would require hashing to
 293                  work efficiently.  */
 294               else if (FROB ("nbsp"))
 295                 *to++ = 160, from += 4;
 296               else
 297                 goto lose;
 298 #undef FROB
 299               /* If the entity was followed by `;', we step over the
 300                  `;'.  Otherwise, it was followed by either a
 301                  non-alphanumeric or EOB, in which case we do nothing.  */
 302               if (from < end && *from == ';')
 303                 ++from;
 304               continue;
 305
 306             lose:
 307               /* This was not an entity after all.  Back out.  */
 308               from = save;
 309               *to++ = *from++;
 310             }
 311         }
 312       *to++ = '\0';
 313       POOL_APPEND (*pool, local_copy, to);
 314     }
 315   else
 316     {
 317       /* Just copy the text to the pool.  */
 318       POOL_APPEND_ZT (*pool, beg, end);
 319     }
 320
 321   if (flags & AP_DOWNCASE)
 322     {
 323       char *p = pool->contents + old_index;
 324       for (; *p; p++)
 325         *p = TOLOWER (*p);
 326     }
 327 }
 328 \f
 329 /* Check whether the contents of [POS, POS+LENGTH) match any of the
 330    strings in the ARRAY.  */
 331 static int
 332 array_allowed (const char **array, const char *beg, const char *end)
 333 {
 334   int length = end - beg;
 335   if (array)
 336     {
 337       for (; *array; array++)
 338         if (length >= strlen (*array)
 339             && !strncasecmp (*array, beg, length))
 340           break;
 341       if (!*array)
 342         return 0;
 343     }
 344   return 1;
 345 }
 346 \f
 347 /* Originally we used to adhere to rfc 1866 here, and allowed only
 348    letters, digits, periods, and hyphens as names (of tags or
 349    attributes).  However, this broke too many pages which used
 350    proprietary or strange attributes, e.g. <img src="a.gif"
 351    v:shapes="whatever">.
 352
 353    So now we allow any character except:
 354      * whitespace
 355      * 8-bit and control chars
 356      * characters that clearly cannot be part of name:
 357        '=', '>', '/'.
 358
 359    This only affects attribute and tag names; attribute values allow
 360    an even greater variety of characters.  */
 361
 362 #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
 363                         && (x) != '=' && (x) != '>' && (x) != '/')
 364
 365 #ifdef STANDALONE
 366 static int comment_backout_count;
 367 #endif
 368
 369 /* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
 370    strict comments mode, this is used for skipping over comments as
 371    well.
 372
 373    To recap: any SGML declaration may have comments associated with
 374    it, e.g.
 375        <!MY-DECL -- isn't this fun? -- foo bar>
 376
 377    An HTML comment is merely an empty declaration (<!>) with a comment
 378    attached, like this:
 379        <!-- some stuff here -->
 380
 381    Several comments may be embedded in one comment declaration:
 382        <!-- have -- -- fun -->
 383
 384    Whitespace is allowed between and after the comments, but not
 385    before the first comment.  Additionally, this function attempts to
 386    handle double quotes in SGML declarations correctly.  */
 387
 388 static const char *
 389 advance_declaration (const char *beg, const char *end)
 390 {
 391   const char *p = beg;
 392   char quote_char = '\0';       /* shut up, gcc! */
 393   char ch;
 394
 395   enum {
 396     AC_S_DONE,
 397     AC_S_BACKOUT,
 398     AC_S_BANG,
 399     AC_S_DEFAULT,
 400     AC_S_DCLNAME,
 401     AC_S_DASH1,
 402     AC_S_DASH2,
 403     AC_S_COMMENT,
 404     AC_S_DASH3,
 405     AC_S_DASH4,
 406     AC_S_QUOTE1,
 407     AC_S_IN_QUOTE,
 408     AC_S_QUOTE2,
 409   } state = AC_S_BANG;
 410
 411   if (beg == end)
 412     return beg;
 413   ch = *p++;
 414
 415   /* It looked like a good idea to write this as a state machine, but
 416      now I wonder...  */
 417
 418   while (state != AC_S_DONE && state != AC_S_BACKOUT)
 419     {
 420       if (p == end)
 421         state = AC_S_BACKOUT;
 422       switch (state)
 423         {
 424         case AC_S_DONE:
 425         case AC_S_BACKOUT:
 426           break;
 427         case AC_S_BANG:
 428           if (ch == '!')
 429             {
 430               ch = *p++;
 431               state = AC_S_DEFAULT;
 432             }
 433           else
 434             state = AC_S_BACKOUT;
 435           break;
 436         case AC_S_DEFAULT:
 437           switch (ch)
 438             {
 439             case '-':
 440               state = AC_S_DASH1;
 441               break;
 442             case ' ':
 443             case '\t':
 444             case '\r':
 445             case '\n':
 446               ch = *p++;
 447               break;
 448             case '>':
 449               state = AC_S_DONE;
 450               break;
 451             case '\'':
 452             case '\"':
 453               state = AC_S_QUOTE1;
 454               break;
 455             default:
 456               if (NAME_CHAR_P (ch))
 457                 state = AC_S_DCLNAME;
 458               else
 459                 state = AC_S_BACKOUT;
 460               break;
 461             }
 462           break;
 463         case AC_S_DCLNAME:
 464           if (ch == '-')
 465             state = AC_S_DASH1;
 466           else if (NAME_CHAR_P (ch))
 467             ch = *p++;
 468           else
 469             state = AC_S_DEFAULT;
 470           break;
 471         case AC_S_QUOTE1:
 472           /* We must use 0x22 because broken assert macros choke on
 473              '"' and '\"'.  */
 474           assert (ch == '\'' || ch == 0x22);
 475           quote_char = ch;      /* cheating -- I really don't feel like
 476                                    introducing more different states for
 477                                    different quote characters. */
 478           ch = *p++;
 479           state = AC_S_IN_QUOTE;
 480           break;
 481         case AC_S_IN_QUOTE:
 482           if (ch == quote_char)
 483             state = AC_S_QUOTE2;
 484           else
 485             ch = *p++;
 486           break;
 487         case AC_S_QUOTE2:
 488           assert (ch == quote_char);
 489           ch = *p++;
 490           state = AC_S_DEFAULT;
 491           break;
 492         case AC_S_DASH1:
 493           assert (ch == '-');
 494           ch = *p++;
 495           state = AC_S_DASH2;
 496           break;
 497         case AC_S_DASH2:
 498           switch (ch)
 499             {
 500             case '-':
 501               ch = *p++;
 502               state = AC_S_COMMENT;
 503               break;
 504             default:
 505               state = AC_S_BACKOUT;
 506             }
 507           break;
 508         case AC_S_COMMENT:
 509           switch (ch)
 510             {
 511             case '-':
 512               state = AC_S_DASH3;
 513               break;
 514             default:
 515               ch = *p++;
 516               break;
 517             }
 518           break;
 519         case AC_S_DASH3:
 520           assert (ch == '-');
 521           ch = *p++;
 522           state = AC_S_DASH4;
 523           break;
 524         case AC_S_DASH4:
 525           switch (ch)
 526             {
 527             case '-':
 528               ch = *p++;
 529               state = AC_S_DEFAULT;
 530               break;
 531             default:
 532               state = AC_S_COMMENT;
 533               break;
 534             }
 535           break;
 536         }
 537     }
 538
 539   if (state == AC_S_BACKOUT)
 540     {
 541 #ifdef STANDALONE
 542       ++comment_backout_count;
 543 #endif
 544       return beg + 1;
 545     }
 546   return p;
 547 }
 548
 549 /* Find the first occurrence of the substring "-->" in [BEG, END) and
 550    return the pointer to the character after the substring.  If the
 551    substring is not found, return NULL.  */
 552
 553 static const char *
 554 find_comment_end (const char *beg, const char *end)
 555 {
 556   /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
 557      if it's not '>' or '-', advance by three characters.  Otherwise,
 558      look at the preceding characters and try to find a match.  */
 559
 560   const char *p = beg - 1;
 561
 562   while ((p += 3) < end)
 563     switch (p[0])
 564       {
 565       case '>':
 566         if (p[-1] == '-' && p[-2] == '-')
 567           return p + 1;
 568         break;
 569       case '-':
 570       at_dash:
 571         if (p[-1] == '-')
 572           {
 573           at_dash_dash:
 574             if (++p == end) return NULL;
 575             switch (p[0])
 576               {
 577               case '>': return p + 1;
 578               case '-': goto at_dash_dash;
 579               }
 580           }
 581         else
 582           {
 583             if ((p += 2) >= end) return NULL;
 584             switch (p[0])
 585               {
 586               case '>':
 587                 if (p[-1] == '-')
 588                   return p + 1;
 589                 break;
 590               case '-':
 591                 goto at_dash;
 592               }
 593           }
 594       }
 595   return NULL;
 596 }
 597 \f
 598 /* Advance P (a char pointer), with the explicit intent of being able
 599    to read the next character.  If this is not possible, go to finish.  */
 600
 601 #define ADVANCE(p) do {                         \
 602   ++p;                                          \
 603   if (p >= end)                                 \
 604     goto finish;                                \
 605 } while (0)
 606
 607 /* Skip whitespace, if any. */
 608
 609 #define SKIP_WS(p) do {                         \
 610   while (ISSPACE (*p)) {                        \
 611     ADVANCE (p);                                \
 612   }                                             \
 613 } while (0)
 614
 615 /* Skip non-whitespace, if any. */
 616
 617 #define SKIP_NON_WS(p) do {                     \
 618   while (!ISSPACE (*p)) {                       \
 619     ADVANCE (p);                                \
 620   }                                             \
 621 } while (0)
 622
 623 #ifdef STANDALONE
 624 static int tag_backout_count;
 625 #endif
 626
 627 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
 628    MAPFUN will be called with two arguments: pointer to an initialized
 629    struct taginfo, and CLOSURE.
 630
 631    ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to
 632    be processed by this function.  If it is NULL, all the tags are
 633    allowed.  The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES.
 634
 635    (Obviously, the caller can filter out unwanted tags and attributes
 636    just as well, but this is just an optimization designed to avoid
 637    unnecessary copying for tags/attributes which the caller doesn't
 638    want to know about.  These lists are searched linearly; therefore,
 639    if you're interested in a large number of tags or attributes, you'd
 640    better set these to NULL and filter them out yourself with a
 641    hashing process most appropriate for your application.)  */
 642
 643 void
 644 map_html_tags (const char *text, int size,
 645                const char **allowed_tag_names,
 646                const char **allowed_attribute_names,
 647                void (*mapfun) (struct taginfo *, void *),
 648                void *closure)
 649 {
 650   const char *p = text;
 651   const char *end = text + size;
 652
 653   int attr_pair_count = 8;
 654   int attr_pair_alloca_p = 1;
 655   struct attr_pair *pairs = ALLOCA_ARRAY (struct attr_pair, attr_pair_count);
 656   struct pool pool;
 657
 658   if (!size)
 659     return;
 660
 661   POOL_INIT (pool, 256);
 662
 663   {
 664     int nattrs, end_tag;
 665     const char *tag_name_begin, *tag_name_end;
 666     const char *tag_start_position;
 667     int uninteresting_tag;
 668
 669   look_for_tag:
 670     POOL_REWIND (pool);
 671
 672     nattrs = 0;
 673     end_tag = 0;
 674
 675     /* Find beginning of tag.  We use memchr() instead of the usual
 676        looping with ADVANCE() for speed. */
 677     p = memchr (p, '<', end - p);
 678     if (!p)
 679       goto finish;
 680
 681     tag_start_position = p;
 682     ADVANCE (p);
 683
 684     /* Establish the type of the tag (start-tag, end-tag or
 685        declaration).  */
 686     if (*p == '!')
 687       {
 688         if (!opt.strict_comments
 689             && p < end + 3 && p[1] == '-' && p[2] == '-')
 690           {
 691             /* If strict comments are not enforced and if we know
 692                we're looking at a comment, simply look for the
 693                terminating "-->".  Non-strict is the default because
 694                it works in other browsers and most HTML writers can't
 695                be bothered with getting the comments right.  */
 696             const char *comment_end = find_comment_end (p + 3, end);
 697             if (comment_end)
 698               p = comment_end;
 699           }
 700         else
 701           {
 702             /* Either in strict comment mode or looking at a non-empty
 703                declaration.  Real declarations are much less likely to
 704                be misused the way comments are, so advance over them
 705                properly regardless of strictness.  */
 706             p = advance_declaration (p, end);
 707           }
 708         if (p == end)
 709           goto finish;
 710         goto look_for_tag;
 711       }
 712     else if (*p == '/')
 713       {
 714         end_tag = 1;
 715         ADVANCE (p);
 716       }
 717     tag_name_begin = p;
 718     while (NAME_CHAR_P (*p))
 719       ADVANCE (p);
 720     if (p == tag_name_begin)
 721       goto look_for_tag;
 722     tag_name_end = p;
 723     SKIP_WS (p);
 724     if (end_tag && *p != '>')
 725       goto backout_tag;
 726
 727     if (!array_allowed (allowed_tag_names, tag_name_begin, tag_name_end))
 728       /* We can't just say "goto look_for_tag" here because we need
 729          the loop below to properly advance over the tag's attributes.  */
 730       uninteresting_tag = 1;
 731     else
 732       {
 733         uninteresting_tag = 0;
 734         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
 735       }
 736
 737     /* Find the attributes. */
 738     while (1)
 739       {
 740         const char *attr_name_begin, *attr_name_end;
 741         const char *attr_value_begin, *attr_value_end;
 742         const char *attr_raw_value_begin, *attr_raw_value_end;
 743         int operation = AP_DOWNCASE; /* stupid compiler. */
 744
 745         SKIP_WS (p);
 746
 747         if (*p == '/')
 748           {
 749             /* A slash at this point means the tag is about to be
 750                closed.  This is legal in XML and has been popularized
 751                in HTML via XHTML.  */
 752             /* <foo a=b c=d /> */
 753             /*              ^  */
 754             ADVANCE (p);
 755             SKIP_WS (p);
 756             if (*p != '>')
 757               goto backout_tag;
 758           }
 759
 760         /* Check for end of tag definition. */
 761         if (*p == '>')
 762           break;
 763
 764         /* Establish bounds of attribute name. */
 765         attr_name_begin = p;    /* <foo bar ...> */
 766                                 /*      ^        */
 767         while (NAME_CHAR_P (*p))
 768           ADVANCE (p);
 769         attr_name_end = p;      /* <foo bar ...> */
 770                                 /*         ^     */
 771         if (attr_name_begin == attr_name_end)
 772           goto backout_tag;
 773
 774         /* Establish bounds of attribute value. */
 775         SKIP_WS (p);
 776         if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
 777           {
 778             /* Minimized attribute syntax allows `=' to be omitted.
 779                For example, <UL COMPACT> is a valid shorthand for <UL
 780                COMPACT="compact">.  Even if such attributes are not
 781                useful to Wget, we need to support them, so that the
 782                tags containing them can be parsed correctly. */
 783             attr_raw_value_begin = attr_value_begin = attr_name_begin;
 784             attr_raw_value_end = attr_value_end = attr_name_end;
 785           }
 786         else if (*p == '=')
 787           {
 788             ADVANCE (p);
 789             SKIP_WS (p);
 790             if (*p == '\"' || *p == '\'')
 791               {
 792                 int newline_seen = 0;
 793                 char quote_char = *p;
 794                 attr_raw_value_begin = p;
 795                 ADVANCE (p);
 796                 attr_value_begin = p; /* <foo bar="baz"> */
 797                                       /*           ^     */
 798                 while (*p != quote_char)
 799                   {
 800                     if (!newline_seen && *p == '\n')
 801                       {
 802                         /* If a newline is seen within the quotes, it
 803                            is most likely that someone forgot to close
 804                            the quote.  In that case, we back out to
 805                            the value beginning, and terminate the tag
 806                            at either `>' or the delimiter, whichever
 807                            comes first.  Such a tag terminated at `>'
 808                            is discarded.  */
 809                         p = attr_value_begin;
 810                         newline_seen = 1;
 811                         continue;
 812                       }
 813                     else if (newline_seen && *p == '>')
 814                       break;
 815                     ADVANCE (p);
 816                   }
 817                 attr_value_end = p; /* <foo bar="baz"> */
 818                                     /*              ^  */
 819                 if (*p == quote_char)
 820                   ADVANCE (p);
 821                 else
 822                   goto look_for_tag;
 823                 attr_raw_value_end = p; /* <foo bar="baz"> */
 824                                         /*               ^ */
 825                 /* The AP_SKIP_BLANKS part is not entirely correct,
 826                    because we don't want to skip blanks for all the
 827                    attribute values.  */
 828                 operation = AP_PROCESS_ENTITIES | AP_SKIP_BLANKS;
 829               }
 830             else
 831               {
 832                 attr_value_begin = p; /* <foo bar=baz> */
 833                                       /*          ^    */
 834                 /* According to SGML, a name token should consist only
 835                    of alphanumerics, . and -.  However, this is often
 836                    violated by, for instance, `%' in `width=75%'.
 837                    We'll be liberal and allow just about anything as
 838                    an attribute value.  */
 839                 while (!ISSPACE (*p) && *p != '>')
 840                   ADVANCE (p);
 841                 attr_value_end = p; /* <foo bar=baz qux=quix> */
 842                                     /*             ^          */
 843                 if (attr_value_begin == attr_value_end)
 844                   /* <foo bar=> */
 845                   /*          ^ */
 846                   goto backout_tag;
 847                 attr_raw_value_begin = attr_value_begin;
 848                 attr_raw_value_end = attr_value_end;
 849                 operation = AP_PROCESS_ENTITIES;
 850               }
 851           }
 852         else
 853           {
 854             /* We skipped the whitespace and found something that is
 855                neither `=' nor the beginning of the next attribute's
 856                name.  Back out.  */
 857             goto backout_tag;   /* <foo bar [... */
 858                                 /*          ^    */
 859           }
 860
 861         /* If we're not interested in the tag, don't bother with any
 862            of the attributes.  */
 863         if (uninteresting_tag)
 864           continue;
 865
 866         /* If we aren't interested in the attribute, skip it.  We
 867            cannot do this test any sooner, because our text pointer
 868            needs to correctly advance over the attribute.  */
 869         if (allowed_attribute_names
 870             && !array_allowed (allowed_attribute_names, attr_name_begin,
 871                                attr_name_end))
 872           continue;
 873
 874         DO_REALLOC_FROM_ALLOCA (pairs, attr_pair_count, nattrs + 1,
 875                                 attr_pair_alloca_p, struct attr_pair);
 876
 877         pairs[nattrs].name_pool_index = pool.index;
 878         convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
 879
 880         pairs[nattrs].value_pool_index = pool.index;
 881         convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
 882         pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
 883         pairs[nattrs].value_raw_size = (attr_raw_value_end
 884                                         - attr_raw_value_begin);
 885         ++nattrs;
 886       }
 887
 888     if (uninteresting_tag)
 889       {
 890         ADVANCE (p);
 891         goto look_for_tag;
 892       }
 893
 894     /* By now, we have a valid tag with a name and zero or more
 895        attributes.  Fill in the data and call the mapper function.  */
 896     {
 897       int i;
 898       struct taginfo taginfo;
 899
 900       taginfo.name      = pool.contents;
 901       taginfo.end_tag_p = end_tag;
 902       taginfo.nattrs    = nattrs;
 903       /* We fill in the char pointers only now, when pool can no
 904          longer get realloc'ed.  If we did that above, we could get
 905          hosed by reallocation.  Obviously, after this point, the pool
 906          may no longer be grown.  */
 907       for (i = 0; i < nattrs; i++)
 908         {
 909           pairs[i].name = pool.contents + pairs[i].name_pool_index;
 910           pairs[i].value = pool.contents + pairs[i].value_pool_index;
 911         }
 912       taginfo.attrs = pairs;
 913       taginfo.start_position = tag_start_position;
 914       taginfo.end_position   = p + 1;
 915       /* Ta-dam! */
 916       (*mapfun) (&taginfo, closure);
 917       ADVANCE (p);
 918     }
 919     goto look_for_tag;
 920
 921   backout_tag:
 922 #ifdef STANDALONE
 923     ++tag_backout_count;
 924 #endif
 925     /* The tag wasn't really a tag.  Treat its contents as ordinary
 926        data characters. */
 927     p = tag_start_position + 1;
 928     goto look_for_tag;
 929   }
 930
 931  finish:
 932   POOL_FREE (pool);
 933   if (!attr_pair_alloca_p)
 934     xfree (pairs);
 935 }
 936
 937 #undef ADVANCE
 938 #undef SKIP_WS
 939 #undef SKIP_NON_WS
 940 \f
 941 #ifdef STANDALONE
 942 static void
 943 test_mapper (struct taginfo *taginfo, void *arg)
 944 {
 945   int i;
 946
 947   printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
 948   for (i = 0; i < taginfo->nattrs; i++)
 949     printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
 950   putchar ('\n');
 951   ++*(int *)arg;
 952 }
 953
 954 int main ()
 955 {
 956   int size = 256;
 957   char *x = (char *)xmalloc (size);
 958   int length = 0;
 959   int read_count;
 960   int tag_counter = 0;
 961
 962   while ((read_count = fread (x + length, 1, size - length, stdin)))
 963     {
 964       length += read_count;
 965       size <<= 1;
 966       x = (char *)xrealloc (x, size);
 967     }
 968
 969   map_html_tags (x, length, NULL, NULL, test_mapper, &tag_counter);
 970   printf ("TAGS: %d\n", tag_counter);
 971   printf ("Tag backouts:     %d\n", tag_backout_count);
 972   printf ("Comment backouts: %d\n", comment_backout_count);
 973   return 0;
 974 }
 975 #endif /* STANDALONE */