sjero.net Git - wget/blob - src/html-parse.c

   1 /* HTML parser for Wget.
   2    Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 /* The only entry point to this module is map_html_tags(), which see.  */
  31
  32 /* TODO:
  33
  34    - Allow hooks for callers to process contents outside tags.  This
  35      is needed to implement handling <style> and <script>.  The
  36      taginfo structure already carries the information about where the
  37      tags are, but this is not enough, because one would also want to
  38      skip the comments.  (The funny thing is that for <style> and
  39      <script> you *don't* want to skip comments!)
  40
  41    - Create a test suite for regression testing. */
  42
  43 /* HISTORY:
  44
  45    This is the third HTML parser written for Wget.  The first one was
  46    written some time during the Geturl 1.0 beta cycle, and was very
  47    inefficient and buggy.  It also contained some very complex code to
  48    remember a list of parser states, because it was supposed to be
  49    reentrant.
  50
  51    The second HTML parser was written for Wget 1.4 (the first version
  52    by the name `Wget'), and was a complete rewrite.  Although the new
  53    parser behaved much better and made no claims of reentrancy, it
  54    still shared many of the fundamental flaws of the old version -- it
   only regarded HTML in terms of tag-attribute pairs, where the
  56    attribute's value was a URL to be returned.  Any other property of
  57    HTML, such as <base href=...>, or strange way to specify a URL,
  58    such as <meta http-equiv=Refresh content="0; URL=..."> had to be
  59    crudely hacked in -- and the caller had to be aware of these hacks.
  60    Like its predecessor, this parser did not support HTML comments.
  61
  62    After Wget 1.5.1 was released, I set out to write a third HTML
  63    parser.  The objectives of the new parser were to: (1) provide a
  64    clean way to analyze HTML lexically, (2) separate interpretation of
  65    the markup from the parsing process, (3) be as correct as possible,
  66    e.g. correctly skipping comments and other SGML declarations, (4)
  67    understand the most common errors in markup and skip them or be
   relaxed towards them, and (5) be reasonably efficient (no regexps,
  69    minimum copying and minimum or no heap allocation).
  70
  71    I believe this parser meets all of the above goals.  It is
  72    reasonably well structured, and could be relatively easily
  73    separated from Wget and used elsewhere.  While some of its
  74    intrinsic properties limit its value as a general-purpose HTML
  75    parser, I believe that, with minimum modifications, it could serve
  76    as a backend for one.
  77
  78    Due to time and other constraints, this parser was not integrated
  79    into Wget until the version 1.7. */
  80
  81 /* DESCRIPTION:
  82
  83    The single entry point of this parser is map_html_tags(), which
  84    works by calling a function you specify for each tag.  The function
  85    gets called with the pointer to a structure describing the tag and
  86    its attributes.  */
  87
  88 /* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
  89    still need Wget headers to compile.  */
  90
  91 #include <config.h>
  92
  93 #ifdef STANDALONE
  94 # define I_REALLY_WANT_CTYPE_MACROS
  95 #endif
  96
  97 #include <stdio.h>
  98 #include <stdlib.h>
  99 #ifdef HAVE_STRING_H
 100 # include <string.h>
 101 #else
 102 # include <strings.h>
 103 #endif
 104 #include <assert.h>
 105
 106 #include "wget.h"
 107 #include "html-parse.h"
 108
 109 #ifdef STANDALONE
 110 # undef xmalloc
 111 # undef xrealloc
 112 # undef xfree
 113 # define xmalloc malloc
 114 # define xrealloc realloc
 115 # define xfree free
 116
 117 # undef ISSPACE
 118 # undef ISDIGIT
 119 # undef ISXDIGIT
 120 # undef ISALPHA
 121 # undef ISALNUM
 122 # undef TOLOWER
 123 # undef TOUPPER
 124
 125 # define ISSPACE(x) isspace (x)
 126 # define ISDIGIT(x) isdigit (x)
 127 # define ISXDIGIT(x) isxdigit (x)
 128 # define ISALPHA(x) isalpha (x)
 129 # define ISALNUM(x) isalnum (x)
 130 # define TOLOWER(x) tolower (x)
 131 # define TOUPPER(x) toupper (x)
 132
 133 static struct options opt;
 134 #endif /* STANDALONE */
 135
 136 /* Pool support.  A pool is a resizable chunk of memory.  It is first
 137    allocated on the stack, and moved to the heap if it needs to be
 138    larger than originally expected.  map_html_tags() uses it to store
 139    the zero-terminated names and values of tags and attributes.
 140
 141    Thus taginfo->name, and attr->name and attr->value for each
 142    attribute, do not point into separately allocated areas, but into
 143    different parts of the pool, separated only by terminating zeros.
 144    This ensures minimum amount of allocation and, for most tags, no
 145    allocation because the entire pool is kept on the stack.  */
 146
/* Bookkeeping for one pool.  POOL_INIT fills in every field; the
   orig_* members keep the initial storage so that POOL_FREE can put
   the pool back into its freshly initialized state.  */
struct pool {
  char *contents;               /* pointer to the contents. */
  int size;                     /* size of the pool. */
  int index;                    /* next unoccupied position in
                                   contents. */

  int alloca_p;                 /* whether contents was allocated
                                   using alloca(). */
  char *orig_contents;          /* orig_contents, allocated by
                                   alloca().  this is used by
                                   POOL_FREE to restore the pool to
                                   the "initial" state. */
  int orig_size;                /* size recorded by POOL_INIT;
                                   POOL_FREE copies it back into
                                   `size'. */
};
 161
 162 /* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
 163
 164 #define POOL_INIT(pool, initial_size) do {              \
 165   (pool).size = (initial_size);                         \
 166   (pool).contents = ALLOCA_ARRAY (char, (pool).size);   \
 167   (pool).index = 0;                                     \
 168   (pool).alloca_p = 1;                                  \
 169   (pool).orig_contents = (pool).contents;               \
 170   (pool).orig_size = (pool).size;                       \
 171 } while (0)
 172
 173 /* Grow the pool to accomodate at least SIZE new bytes.  If the pool
 174    already has room to accomodate SIZE bytes of data, this is a no-op.  */
 175
 176 #define POOL_GROW(pool, increase) do {                                  \
 177   int PG_newsize = (pool).index + increase;                             \
 178   DO_REALLOC_FROM_ALLOCA ((pool).contents, (pool).size, PG_newsize,     \
 179                           (pool).alloca_p, char);                       \
 180 } while (0)
 181
 182 /* Append text in the range [beg, end) to POOL.  No zero-termination
 183    is done.  */
 184
 185 #define POOL_APPEND(pool, beg, end) do {                        \
 186   const char *PA_beg = (beg);                                   \
 187   int PA_size = (end) - PA_beg;                                 \
 188   POOL_GROW (pool, PA_size);                                    \
 189   memcpy ((pool).contents + (pool).index, PA_beg, PA_size);     \
 190   (pool).index += PA_size;                                      \
 191 } while (0)
 192
 193 /* Append one character to the pool.  Can be used to zero-terminate
 194    pool strings.  */
 195
 196 #define POOL_APPEND_CHR(pool, ch) do {          \
 197   char PAC_char = (ch);                         \
 198   POOL_GROW (pool, 1);                          \
 199   (pool).contents[(pool).index++] = PAC_char;   \
 200 } while (0)
 201
 202 /* Forget old pool contents.  The allocated memory is not freed. */
 203 #define POOL_REWIND(pool) pool.index = 0
 204
 205 /* Free heap-allocated memory for contents of POOL.  This calls
 206    xfree() if the memory was allocated through malloc.  It also
 207    restores `contents' and `size' to their original, pre-malloc
 208    values.  That way after POOL_FREE, the pool is fully usable, just
 209    as if it were freshly initialized with POOL_INIT.  */
 210
 211 #define POOL_FREE(pool) do {                    \
 212   if (!(pool).alloca_p)                         \
 213     xfree ((pool).contents);                    \
 214   (pool).contents = (pool).orig_contents;       \
 215   (pool).size = (pool).orig_size;               \
 216   (pool).index = 0;                             \
 217   (pool).alloca_p = 1;                          \
 218 } while (0)
 219
 220 \f
/* Bit flags for the FLAGS argument of convert_and_copy(); they may be
   OR-ed together.  */
#define AP_DOWNCASE             1       /* downcase all the letters */
#define AP_PROCESS_ENTITIES     2       /* decode SGML entities */
#define AP_TRIM_BLANKS          4       /* drop leading/trailing blanks */
 224
 225 /* Copy the text in the range [BEG, END) to POOL, optionally
 226    performing operations specified by FLAGS.  FLAGS may be any
 227    combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_TRIM_BLANKS
 228    with the following meaning:
 229
 230    * AP_DOWNCASE -- downcase all the letters;
 231
 232    * AP_PROCESS_ENTITIES -- process the SGML entities and write out
 233    the decoded string.  Recognized entities are &lt, &gt, &amp, &quot,
 234    &nbsp and the numerical entities.
 235
 236    * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
 237    of text.  */
 238
 239 static void
 240 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
 241 {
 242   int old_index = pool->index;
 243   int size;
 244
 245   /* First, skip blanks if required.  We must do this before entities
 246      are processed, so that blanks can still be inserted as, for
 247      instance, `&#32;'.  */
 248   if (flags & AP_TRIM_BLANKS)
 249     {
 250       while (beg < end && ISSPACE (*beg))
 251         ++beg;
 252       while (end > beg && ISSPACE (end[-1]))
 253         --end;
 254     }
 255   size = end - beg;
 256
 257   if (flags & AP_PROCESS_ENTITIES)
 258     {
 259       /* Grow the pool, then copy the text to the pool character by
 260          character, processing the encountered entities as we go
 261          along.
 262
 263          It's safe (and necessary) to grow the pool in advance because
 264          processing the entities can only *shorten* the string, it can
 265          never lengthen it.  */
 266       POOL_GROW (*pool, end - beg);
 267       const char *from = beg;
 268       char *to = pool->contents + pool->index;
 269
 270       while (from < end)
 271         {
 272           if (*from != '&')
 273             *to++ = *from++;
 274           else
 275             {
 276               const char *save = from;
 277               int remain;
 278
 279               if (++from == end)
 280                 goto lose;
 281               remain = end - from;
 282
 283               /* Process numeric entities "&#DDD;" and "&#xHH;".  */
 284               if (*from == '#')
 285                 {
 286                   int numeric = 0, digits = 0;
 287                   ++from;
 288                   if (*from == 'x')
 289                     {
 290                       ++from;
 291                       for (; from < end && ISXDIGIT (*from); from++, digits++)
 292                         numeric = (numeric << 4) + XDIGIT_TO_NUM (*from);
 293                     }
 294                   else
 295                     {
 296                       for (; from < end && ISDIGIT (*from); from++, digits++)
 297                         numeric = (numeric * 10) + (*from - '0');
 298                     }
 299                   if (!digits)
 300                     goto lose;
 301                   numeric &= 0xff;
 302                   *to++ = numeric;
 303                 }
 304 #define FROB(x) (remain >= (sizeof (x) - 1)                     \
 305                  && 0 == memcmp (from, x, sizeof (x) - 1)       \
 306                  && (*(from + sizeof (x) - 1) == ';'            \
 307                      || remain == sizeof (x) - 1                \
 308                      || !ISALNUM (*(from + sizeof (x) - 1))))
 309               else if (FROB ("lt"))
 310                 *to++ = '<', from += 2;
 311               else if (FROB ("gt"))
 312                 *to++ = '>', from += 2;
 313               else if (FROB ("amp"))
 314                 *to++ = '&', from += 3;
 315               else if (FROB ("quot"))
 316                 *to++ = '\"', from += 4;
 317               /* We don't implement the proposed "Added Latin 1"
 318                  entities (except for nbsp), because it is unnecessary
 319                  in the context of Wget, and would require hashing to
 320                  work efficiently.  */
 321               else if (FROB ("nbsp"))
 322                 *to++ = 160, from += 4;
 323               else
 324                 goto lose;
 325 #undef FROB
 326               /* If the entity was followed by `;', we step over the
 327                  `;'.  Otherwise, it was followed by either a
 328                  non-alphanumeric or EOB, in which case we do nothing.  */
 329               if (from < end && *from == ';')
 330                 ++from;
 331               continue;
 332
 333             lose:
 334               /* This was not an entity after all.  Back out.  */
 335               from = save;
 336               *to++ = *from++;
 337             }
 338         }
 339       /* Verify that we haven't exceeded the original size.  (It
 340          shouldn't happen, hence the assert.)  */
 341       assert (to - (pool->contents + pool->index) <= end - beg);
 342
 343       /* Make POOL's tail point to the position following the string
 344          we've written.  */
 345       pool->index = to - pool->contents;
 346       POOL_APPEND_CHR (*pool, '\0');
 347     }
 348   else
 349     {
 350       /* Just copy the text to the pool.  */
 351       POOL_APPEND (*pool, beg, end);
 352       POOL_APPEND_CHR (*pool, '\0');
 353     }
 354
 355   if (flags & AP_DOWNCASE)
 356     {
 357       char *p = pool->contents + old_index;
 358       for (; *p; p++)
 359         *p = TOLOWER (*p);
 360     }
 361 }
 362 \f
 363 /* Check whether the contents of [POS, POS+LENGTH) match any of the
 364    strings in the ARRAY.  */
 365 static int
 366 array_allowed (const char **array, const char *beg, const char *end)
 367 {
 368   int length = end - beg;
 369   if (array)
 370     {
 371       for (; *array; array++)
 372         if (length >= strlen (*array)
 373             && !strncasecmp (*array, beg, length))
 374           break;
 375       if (!*array)
 376         return 0;
 377     }
 378   return 1;
 379 }
 380 \f
 381 /* Originally we used to adhere to rfc 1866 here, and allowed only
 382    letters, digits, periods, and hyphens as names (of tags or
 383    attributes).  However, this broke too many pages which used
 384    proprietary or strange attributes, e.g. <img src="a.gif"
 385    v:shapes="whatever">.
 386
 387    So now we allow any character except:
 388      * whitespace
 389      * 8-bit and control chars
 390      * characters that clearly cannot be part of name:
 391        '=', '>', '/'.
 392
 393    This only affects attribute and tag names; attribute values allow
 394    an even greater variety of characters.  */
 395
 396 #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
 397                         && (x) != '=' && (x) != '>' && (x) != '/')
 398
#ifdef STANDALONE
/* Number of times advance_declaration() finished in its back-out
   state.  Kept only in STANDALONE builds.  */
static int comment_backout_count;
#endif
 402
 403 /* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
 404    strict comments mode, this is used for skipping over comments as
 405    well.
 406
 407    To recap: any SGML declaration may have comments associated with
 408    it, e.g.
 409        <!MY-DECL -- isn't this fun? -- foo bar>
 410
 411    An HTML comment is merely an empty declaration (<!>) with a comment
 412    attached, like this:
 413        <!-- some stuff here -->
 414
 415    Several comments may be embedded in one comment declaration:
 416        <!-- have -- -- fun -->
 417
 418    Whitespace is allowed between and after the comments, but not
 419    before the first comment.  Additionally, this function attempts to
 420    handle double quotes in SGML declarations correctly.  */
 421
 422 static const char *
 423 advance_declaration (const char *beg, const char *end)
 424 {
 425   const char *p = beg;
 426   char quote_char = '\0';       /* shut up, gcc! */
 427   char ch;
 428
 429   enum {
 430     AC_S_DONE,
 431     AC_S_BACKOUT,
 432     AC_S_BANG,
 433     AC_S_DEFAULT,
 434     AC_S_DCLNAME,
 435     AC_S_DASH1,
 436     AC_S_DASH2,
 437     AC_S_COMMENT,
 438     AC_S_DASH3,
 439     AC_S_DASH4,
 440     AC_S_QUOTE1,
 441     AC_S_IN_QUOTE,
 442     AC_S_QUOTE2,
 443   } state = AC_S_BANG;
 444
 445   if (beg == end)
 446     return beg;
 447   ch = *p++;
 448
 449   /* It looked like a good idea to write this as a state machine, but
 450      now I wonder...  */
 451
 452   while (state != AC_S_DONE && state != AC_S_BACKOUT)
 453     {
 454       if (p == end)
 455         state = AC_S_BACKOUT;
 456       switch (state)
 457         {
 458         case AC_S_DONE:
 459         case AC_S_BACKOUT:
 460           break;
 461         case AC_S_BANG:
 462           if (ch == '!')
 463             {
 464               ch = *p++;
 465               state = AC_S_DEFAULT;
 466             }
 467           else
 468             state = AC_S_BACKOUT;
 469           break;
 470         case AC_S_DEFAULT:
 471           switch (ch)
 472             {
 473             case '-':
 474               state = AC_S_DASH1;
 475               break;
 476             case ' ':
 477             case '\t':
 478             case '\r':
 479             case '\n':
 480               ch = *p++;
 481               break;
 482             case '>':
 483               state = AC_S_DONE;
 484               break;
 485             case '\'':
 486             case '\"':
 487               state = AC_S_QUOTE1;
 488               break;
 489             default:
 490               if (NAME_CHAR_P (ch))
 491                 state = AC_S_DCLNAME;
 492               else
 493                 state = AC_S_BACKOUT;
 494               break;
 495             }
 496           break;
 497         case AC_S_DCLNAME:
 498           if (ch == '-')
 499             state = AC_S_DASH1;
 500           else if (NAME_CHAR_P (ch))
 501             ch = *p++;
 502           else
 503             state = AC_S_DEFAULT;
 504           break;
 505         case AC_S_QUOTE1:
 506           /* We must use 0x22 because broken assert macros choke on
 507              '"' and '\"'.  */
 508           assert (ch == '\'' || ch == 0x22);
 509           quote_char = ch;      /* cheating -- I really don't feel like
 510                                    introducing more different states for
 511                                    different quote characters. */
 512           ch = *p++;
 513           state = AC_S_IN_QUOTE;
 514           break;
 515         case AC_S_IN_QUOTE:
 516           if (ch == quote_char)
 517             state = AC_S_QUOTE2;
 518           else
 519             ch = *p++;
 520           break;
 521         case AC_S_QUOTE2:
 522           assert (ch == quote_char);
 523           ch = *p++;
 524           state = AC_S_DEFAULT;
 525           break;
 526         case AC_S_DASH1:
 527           assert (ch == '-');
 528           ch = *p++;
 529           state = AC_S_DASH2;
 530           break;
 531         case AC_S_DASH2:
 532           switch (ch)
 533             {
 534             case '-':
 535               ch = *p++;
 536               state = AC_S_COMMENT;
 537               break;
 538             default:
 539               state = AC_S_BACKOUT;
 540             }
 541           break;
 542         case AC_S_COMMENT:
 543           switch (ch)
 544             {
 545             case '-':
 546               state = AC_S_DASH3;
 547               break;
 548             default:
 549               ch = *p++;
 550               break;
 551             }
 552           break;
 553         case AC_S_DASH3:
 554           assert (ch == '-');
 555           ch = *p++;
 556           state = AC_S_DASH4;
 557           break;
 558         case AC_S_DASH4:
 559           switch (ch)
 560             {
 561             case '-':
 562               ch = *p++;
 563               state = AC_S_DEFAULT;
 564               break;
 565             default:
 566               state = AC_S_COMMENT;
 567               break;
 568             }
 569           break;
 570         }
 571     }
 572
 573   if (state == AC_S_BACKOUT)
 574     {
 575 #ifdef STANDALONE
 576       ++comment_backout_count;
 577 #endif
 578       return beg + 1;
 579     }
 580   return p;
 581 }
 582
 583 /* Find the first occurrence of the substring "-->" in [BEG, END) and
 584    return the pointer to the character after the substring.  If the
 585    substring is not found, return NULL.  */
 586
 587 static const char *
 588 find_comment_end (const char *beg, const char *end)
 589 {
 590   /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
 591      if it's not '>' or '-', advance by three characters.  Otherwise,
 592      look at the preceding characters and try to find a match.  */
 593
 594   const char *p = beg - 1;
 595
 596   while ((p += 3) < end)
 597     switch (p[0])
 598       {
 599       case '>':
 600         if (p[-1] == '-' && p[-2] == '-')
 601           return p + 1;
 602         break;
 603       case '-':
 604       at_dash:
 605         if (p[-1] == '-')
 606           {
 607           at_dash_dash:
 608             if (++p == end) return NULL;
 609             switch (p[0])
 610               {
 611               case '>': return p + 1;
 612               case '-': goto at_dash_dash;
 613               }
 614           }
 615         else
 616           {
 617             if ((p += 2) >= end) return NULL;
 618             switch (p[0])
 619               {
 620               case '>':
 621                 if (p[-1] == '-')
 622                   return p + 1;
 623                 break;
 624               case '-':
 625                 goto at_dash;
 626               }
 627           }
 628       }
 629   return NULL;
 630 }
 631 \f
 632 /* Advance P (a char pointer), with the explicit intent of being able
 633    to read the next character.  If this is not possible, go to finish.  */
 634
 635 #define ADVANCE(p) do {                         \
 636   ++p;                                          \
 637   if (p >= end)                                 \
 638     goto finish;                                \
 639 } while (0)
 640
 641 /* Skip whitespace, if any. */
 642
 643 #define SKIP_WS(p) do {                         \
 644   while (ISSPACE (*p)) {                        \
 645     ADVANCE (p);                                \
 646   }                                             \
 647 } while (0)
 648
 649 /* Skip non-whitespace, if any. */
 650
 651 #define SKIP_NON_WS(p) do {                     \
 652   while (!ISSPACE (*p)) {                       \
 653     ADVANCE (p);                                \
 654   }                                             \
 655 } while (0)
 656
#ifdef STANDALONE
/* Counter kept only in STANDALONE builds.  NOTE(review): nothing in
   the part of the file reviewed here updates it; presumably
   map_html_tags() bumps it when it backs out of a tag -- confirm.  */
static int tag_backout_count;
#endif
 660
 661 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
 662    MAPFUN will be called with two arguments: pointer to an initialized
 663    struct taginfo, and CLOSURE.
 664
 665    ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to
 666    be processed by this function.  If it is NULL, all the tags are
 667    allowed.  The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES.
 668
 669    (Obviously, the caller can filter out unwanted tags and attributes
 670    just as well, but this is just an optimization designed to avoid
 671    unnecessary copying for tags/attributes which the caller doesn't
 672    want to know about.  These lists are searched linearly; therefore,
 673    if you're interested in a large number of tags or attributes, you'd
 674    better set these to NULL and filter them out yourself with a
 675    hashing process most appropriate for your application.)  */
 676
 677 void
 678 map_html_tags (const char *text, int size,
 679                const char **allowed_tag_names,
 680                const char **allowed_attribute_names,
 681                void (*mapfun) (struct taginfo *, void *),
 682                void *closure)
 683 {
 684   const char *p = text;
 685   const char *end = text + size;
 686
 687   int attr_pair_count = 8;
 688   int attr_pair_alloca_p = 1;
 689   struct attr_pair *pairs = ALLOCA_ARRAY (struct attr_pair, attr_pair_count);
 690   struct pool pool;
 691
 692   if (!size)
 693     return;
 694
 695   POOL_INIT (pool, 256);
 696
 697   {
 698     int nattrs, end_tag;
 699     const char *tag_name_begin, *tag_name_end;
 700     const char *tag_start_position;
 701     int uninteresting_tag;
 702
 703   look_for_tag:
 704     POOL_REWIND (pool);
 705
 706     nattrs = 0;
 707     end_tag = 0;
 708
 709     /* Find beginning of tag.  We use memchr() instead of the usual
 710        looping with ADVANCE() for speed. */
 711     p = memchr (p, '<', end - p);
 712     if (!p)
 713       goto finish;
 714
 715     tag_start_position = p;
 716     ADVANCE (p);
 717
 718     /* Establish the type of the tag (start-tag, end-tag or
 719        declaration).  */
 720     if (*p == '!')
 721       {
 722         if (!opt.strict_comments
 723             && p < end + 3 && p[1] == '-' && p[2] == '-')
 724           {
 725             /* If strict comments are not enforced and if we know
 726                we're looking at a comment, simply look for the
 727                terminating "-->".  Non-strict is the default because
 728                it works in other browsers and most HTML writers can't
 729                be bothered with getting the comments right.  */
 730             const char *comment_end = find_comment_end (p + 3, end);
 731             if (comment_end)
 732               p = comment_end;
 733           }
 734         else
 735           {
 736             /* Either in strict comment mode or looking at a non-empty
 737                declaration.  Real declarations are much less likely to
 738                be misused the way comments are, so advance over them
 739                properly regardless of strictness.  */
 740             p = advance_declaration (p, end);
 741           }
 742         if (p == end)
 743           goto finish;
 744         goto look_for_tag;
 745       }
 746     else if (*p == '/')
 747       {
 748         end_tag = 1;
 749         ADVANCE (p);
 750       }
 751     tag_name_begin = p;
 752     while (NAME_CHAR_P (*p))
 753       ADVANCE (p);
 754     if (p == tag_name_begin)
 755       goto look_for_tag;
 756     tag_name_end = p;
 757     SKIP_WS (p);
 758     if (end_tag && *p != '>')
 759       goto backout_tag;
 760
 761     if (!array_allowed (allowed_tag_names, tag_name_begin, tag_name_end))
 762       /* We can't just say "goto look_for_tag" here because we need
 763          the loop below to properly advance over the tag's attributes.  */
 764       uninteresting_tag = 1;
 765     else
 766       {
 767         uninteresting_tag = 0;
 768         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
 769       }
 770
 771     /* Find the attributes. */
 772     while (1)
 773       {
 774         const char *attr_name_begin, *attr_name_end;
 775         const char *attr_value_begin, *attr_value_end;
 776         const char *attr_raw_value_begin, *attr_raw_value_end;
 777         int operation = AP_DOWNCASE; /* stupid compiler. */
 778
 779         SKIP_WS (p);
 780
 781         if (*p == '/')
 782           {
 783             /* A slash at this point means the tag is about to be
 784                closed.  This is legal in XML and has been popularized
 785                in HTML via XHTML.  */
 786             /* <foo a=b c=d /> */
 787             /*              ^  */
 788             ADVANCE (p);
 789             SKIP_WS (p);
 790             if (*p != '>')
 791               goto backout_tag;
 792           }
 793
 794         /* Check for end of tag definition. */
 795         if (*p == '>')
 796           break;
 797
 798         /* Establish bounds of attribute name. */
 799         attr_name_begin = p;    /* <foo bar ...> */
 800                                 /*      ^        */
 801         while (NAME_CHAR_P (*p))
 802           ADVANCE (p);
 803         attr_name_end = p;      /* <foo bar ...> */
 804                                 /*         ^     */
 805         if (attr_name_begin == attr_name_end)
 806           goto backout_tag;
 807
 808         /* Establish bounds of attribute value. */
 809         SKIP_WS (p);
 810         if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
 811           {
 812             /* Minimized attribute syntax allows `=' to be omitted.
 813                For example, <UL COMPACT> is a valid shorthand for <UL
 814                COMPACT="compact">.  Even if such attributes are not
 815                useful to Wget, we need to support them, so that the
 816                tags containing them can be parsed correctly. */
 817             attr_raw_value_begin = attr_value_begin = attr_name_begin;
 818             attr_raw_value_end = attr_value_end = attr_name_end;
 819           }
 820         else if (*p == '=')
 821           {
 822             ADVANCE (p);
 823             SKIP_WS (p);
 824             if (*p == '\"' || *p == '\'')
 825               {
 826                 int newline_seen = 0;
 827                 char quote_char = *p;
 828                 attr_raw_value_begin = p;
 829                 ADVANCE (p);
 830                 attr_value_begin = p; /* <foo bar="baz"> */
 831                                       /*           ^     */
 832                 while (*p != quote_char)
 833                   {
 834                     if (!newline_seen && *p == '\n')
 835                       {
 836                         /* If a newline is seen within the quotes, it
 837                            is most likely that someone forgot to close
 838                            the quote.  In that case, we back out to
 839                            the value beginning, and terminate the tag
 840                            at either `>' or the delimiter, whichever
 841                            comes first.  Such a tag terminated at `>'
 842                            is discarded.  */
 843                         p = attr_value_begin;
 844                         newline_seen = 1;
 845                         continue;
 846                       }
 847                     else if (newline_seen && *p == '>')
 848                       break;
 849                     ADVANCE (p);
 850                   }
 851                 attr_value_end = p; /* <foo bar="baz"> */
 852                                     /*              ^  */
 853                 if (*p == quote_char)
 854                   ADVANCE (p);
 855                 else
 856                   goto look_for_tag;
 857                 attr_raw_value_end = p; /* <foo bar="baz"> */
 858                                         /*               ^ */
 859                 /* The AP_TRIM_BLANKS is there for buggy HTML
 860                    generators that generate <a href=" foo"> instead of
 861                    <a href="foo"> (Netscape ignores spaces as well.)
 862                    If you really mean space, use &32; or %20.  */
 863                 operation = AP_PROCESS_ENTITIES | AP_TRIM_BLANKS;
 864               }
 865             else
 866               {
 867                 attr_value_begin = p; /* <foo bar=baz> */
 868                                       /*          ^    */
 869                 /* According to SGML, a name token should consist only
 870                    of alphanumerics, . and -.  However, this is often
 871                    violated by, for instance, `%' in `width=75%'.
 872                    We'll be liberal and allow just about anything as
 873                    an attribute value.  */
 874                 while (!ISSPACE (*p) && *p != '>')
 875                   ADVANCE (p);
 876                 attr_value_end = p; /* <foo bar=baz qux=quix> */
 877                                     /*             ^          */
 878                 if (attr_value_begin == attr_value_end)
 879                   /* <foo bar=> */
 880                   /*          ^ */
 881                   goto backout_tag;
 882                 attr_raw_value_begin = attr_value_begin;
 883                 attr_raw_value_end = attr_value_end;
 884                 operation = AP_PROCESS_ENTITIES;
 885               }
 886           }
 887         else
 888           {
 889             /* We skipped the whitespace and found something that is
 890                neither `=' nor the beginning of the next attribute's
 891                name.  Back out.  */
 892             goto backout_tag;   /* <foo bar [... */
 893                                 /*          ^    */
 894           }
 895
 896         /* If we're not interested in the tag, don't bother with any
 897            of the attributes.  */
 898         if (uninteresting_tag)
 899           continue;
 900
 901         /* If we aren't interested in the attribute, skip it.  We
 902            cannot do this test any sooner, because our text pointer
 903            needs to correctly advance over the attribute.  */
 904         if (allowed_attribute_names
 905             && !array_allowed (allowed_attribute_names, attr_name_begin,
 906                                attr_name_end))
 907           continue;
 908
 909         DO_REALLOC_FROM_ALLOCA (pairs, attr_pair_count, nattrs + 1,
 910                                 attr_pair_alloca_p, struct attr_pair);
 911
 912         pairs[nattrs].name_pool_index = pool.index;
 913         convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
 914
 915         pairs[nattrs].value_pool_index = pool.index;
 916         convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
 917         pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
 918         pairs[nattrs].value_raw_size = (attr_raw_value_end
 919                                         - attr_raw_value_begin);
 920         ++nattrs;
 921       }
 922
 923     if (uninteresting_tag)
 924       {
 925         ADVANCE (p);
 926         goto look_for_tag;
 927       }
 928
 929     /* By now, we have a valid tag with a name and zero or more
 930        attributes.  Fill in the data and call the mapper function.  */
 931     {
 932       int i;
 933       struct taginfo taginfo;
 934
 935       taginfo.name      = pool.contents;
 936       taginfo.end_tag_p = end_tag;
 937       taginfo.nattrs    = nattrs;
 938       /* We fill in the char pointers only now, when pool can no
 939          longer get realloc'ed.  If we did that above, we could get
 940          hosed by reallocation.  Obviously, after this point, the pool
 941          may no longer be grown.  */
 942       for (i = 0; i < nattrs; i++)
 943         {
 944           pairs[i].name = pool.contents + pairs[i].name_pool_index;
 945           pairs[i].value = pool.contents + pairs[i].value_pool_index;
 946         }
 947       taginfo.attrs = pairs;
 948       taginfo.start_position = tag_start_position;
 949       taginfo.end_position   = p + 1;
 950       /* Ta-dam! */
 951       (*mapfun) (&taginfo, closure);
 952       ADVANCE (p);
 953     }
 954     goto look_for_tag;
 955
 956   backout_tag:
 957 #ifdef STANDALONE
 958     ++tag_backout_count;
 959 #endif
 960     /* The tag wasn't really a tag.  Treat its contents as ordinary
 961        data characters. */
 962     p = tag_start_position + 1;
 963     goto look_for_tag;
 964   }
 965
 966  finish:
 967   POOL_FREE (pool);
 968   if (!attr_pair_alloca_p)
 969     xfree (pairs);
 970 }
 971
 972 #undef ADVANCE
 973 #undef SKIP_WS
 974 #undef SKIP_NON_WS
 975 \f
 976 #ifdef STANDALONE
 977 static void
 978 test_mapper (struct taginfo *taginfo, void *arg)
 979 {
 980   int i;
 981
 982   printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
 983   for (i = 0; i < taginfo->nattrs; i++)
 984     printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
 985   putchar ('\n');
 986   ++*(int *)arg;
 987 }
 988
 989 int main ()
 990 {
 991   int size = 256;
 992   char *x = (char *)xmalloc (size);
 993   int length = 0;
 994   int read_count;
 995   int tag_counter = 0;
 996
 997   while ((read_count = fread (x + length, 1, size - length, stdin)))
 998     {
 999       length += read_count;
1000       size <<= 1;
1001       x = (char *)xrealloc (x, size);
1002     }
1003
1004   map_html_tags (x, length, NULL, NULL, test_mapper, &tag_counter);
1005   printf ("TAGS: %d\n", tag_counter);
1006   printf ("Tag backouts:     %d\n", tag_backout_count);
1007   printf ("Comment backouts: %d\n", comment_backout_count);
1008   return 0;
1009 }
1010 #endif /* STANDALONE */