sjero.net Git - wget/blob - src/html-parse.c

   1 /* HTML parser for Wget.
   2    Copyright (C) 1998, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 /* The only entry point to this module is map_html_tags(), which see.  */
  31
  32 /* TODO:
  33
  34    - Allow hooks for callers to process contents outside tags.  This
  35      is needed to implement handling <style> and <script>.  The
  36      taginfo structure already carries the information about where the
  37      tags are, but this is not enough, because one would also want to
  38      skip the comments.  (The funny thing is that for <style> and
  39      <script> you *don't* want to skip comments!)
  40
  41    - Create a test suite for regression testing. */
  42
  43 /* HISTORY:
  44
  45    This is the third HTML parser written for Wget.  The first one was
  46    written some time during the Geturl 1.0 beta cycle, and was very
  47    inefficient and buggy.  It also contained some very complex code to
  48    remember a list of parser states, because it was supposed to be
  49    reentrant.  The idea was that several parsers would be running
  50    concurrently, and you'd have to pass the function a unique ID string
  51    (for example, the URL) by which it found the relevant parser state
  52    and returned the next URL.  Over-engineering at its best.
  53
  54    The second HTML parser was written for Wget 1.4 (the first version
  55    by the name `Wget'), and was a complete rewrite.  Although the new
  56    parser behaved much better and made no claims of reentrancy, it
  57    still shared many of the fundamental flaws of the old version -- it
  58    only regarded HTML in terms of tag-attribute pairs, where the
  59    attribute's value was a URL to be returned.  Any other property of
  60    HTML, such as <base href=...>, or strange way to specify a URL,
  61    such as <meta http-equiv=Refresh content="0; URL=..."> had to be
  62    crudely hacked in -- and the caller had to be aware of these hacks.
  63    Like its predecessor, this parser did not support HTML comments.
  64
  65    After Wget 1.5.1 was released, I set out to write a third HTML
  66    parser.  The objectives of the new parser were to: (1) provide a
  67    clean way to analyze HTML lexically, (2) separate interpretation of
  68    the markup from the parsing process, (3) be as correct as possible,
  69    e.g. correctly skipping comments and other SGML declarations, (4)
  70    understand the most common errors in markup and skip them or be
  71    relaxed towards them, and (5) be reasonably efficient (no regexps,
  72    minimum copying and minimum or no heap allocation).
  73
  74    I believe this parser meets all of the above goals.  It is
  75    reasonably well structured, and could be relatively easily
  76    separated from Wget and used elsewhere.  While some of its
  77    intrinsic properties limit its value as a general-purpose HTML
  78    parser, I believe that, with minimum modifications, it could serve
  79    as a backend for one.
  80
  81    Due to time and other constraints, this parser was not integrated
  82    into Wget until the version 1.7. */
  83
  84 /* DESCRIPTION:
  85
  86    The single entry point of this parser is map_html_tags(), which
  87    works by calling a function you specify for each tag.  The function
  88    gets called with the pointer to a structure describing the tag and
  89    its attributes.  */
  90
  91 /* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
  92    still need Wget headers to compile.  */
  93
  94 #include <config.h>
  95
  96 #ifdef STANDALONE
  97 # define I_REALLY_WANT_CTYPE_MACROS
  98 #endif
  99
 100 #include <stdio.h>
 101 #include <stdlib.h>
 102 #ifdef HAVE_STRING_H
 103 # include <string.h>
 104 #else
 105 # include <strings.h>
 106 #endif
 107 #include <assert.h>
 108
 109 #include "wget.h"
 110 #include "html-parse.h"
 111
 112 #ifdef STANDALONE
 113 # define xmalloc malloc
 114 # define xrealloc realloc
 115 # define xfree free
 116
 117 # define ISSPACE(x) isspace (x)
 118 # define ISDIGIT(x) isdigit (x)
 119 # define ISALPHA(x) isalpha (x)
 120 # define ISALNUM(x) isalnum (x)
 121 # define TOLOWER(x) tolower (x)
 122 #endif /* STANDALONE */
 123
 124 /* Pool support.  A pool is a resizable chunk of memory.  It is first
 125    allocated on the stack, and moved to the heap if it needs to be
 126    larger than originally expected.  map_html_tags() uses it to store
 127    the zero-terminated names and values of tags and attributes.
 128
 129    Thus taginfo->name, and attr->name and attr->value for each
 130    attribute, do not point into separately allocated areas, but into
 131    different parts of the pool, separated only by terminating zeros.
 132    This ensures minimum amount of allocation and, for most tags, no
 133    allocation because the entire pool is kept on the stack.  */
 134
 /* Bookkeeping for one pool: a byte buffer plus the information
    needed to grow it and, later, to fall back to the buffer it
    started with.  */
 struct pool {
   char *contents;       /* buffer currently in use */
   int size;             /* capacity of CONTENTS, in bytes */
   int index;            /* offset of the first unused byte in CONTENTS */

   int alloca_p;         /* non-zero while CONTENTS is the alloca()'ed
                            buffer rather than a heap block */
   char *orig_contents;  /* the buffer obtained from alloca(); kept so
                            that POOL_FREE can put the pool back into
                            its initial state */
   int orig_size;        /* capacity of ORIG_CONTENTS */
 };
 149
 /* Prepare POOL for use with INITIAL_SIZE bytes of storage obtained
    through ALLOCA_ARRAY.  That buffer and its size are also recorded
    in orig_contents/orig_size so that POOL_FREE can reinstate them.  */

 #define POOL_INIT(pool, initial_size) do {              \
   (pool).orig_size = (pool).size = (initial_size);      \
   (pool).orig_contents = (pool).contents                \
     = ALLOCA_ARRAY (char, (pool).size);                 \
   (pool).alloca_p = 1;                                  \
   (pool).index = 0;                                     \
 } while (0)
 160
 /* Make sure POOL can hold INCREASE more bytes beyond its current
    index.  The requested total (index + INCREASE) is handed to
    DO_REALLOC_FROM_ALLOCA together with the pool's alloca_p flag; that
    macro is defined elsewhere and is presumably a no-op when the pool
    is already large enough.  INCREASE is parenthesized so that an
    expression argument such as `a ? b : c' is added as a whole.  */

 #define POOL_GROW(pool, increase) do {                                  \
   int PG_newsize = (pool).index + (increase);                           \
   DO_REALLOC_FROM_ALLOCA ((pool).contents, (pool).size, PG_newsize,     \
                           (pool).alloca_p, char);                       \
 } while (0)
 169
 /* Append text in the range [beg, end) to POOL.  No zero-termination
    is done.  BEG and END are each evaluated once and are parenthesized
    so that arbitrary pointer expressions may be passed.  */

 #define POOL_APPEND(pool, beg, end) do {                        \
   const char *PA_beg = (beg);                                   \
   int PA_size = (end) - PA_beg;                                 \
   POOL_GROW (pool, PA_size);                                    \
   memcpy ((pool).contents + (pool).index, PA_beg, PA_size);     \
   (pool).index += PA_size;                                      \
 } while (0)
 180
 /* Append text in the range [beg, end) to POOL and follow it with a
    terminating '\0'.  The pool index ends up just past the terminator.
    BEG and END are each evaluated once and are parenthesized so that
    arbitrary pointer expressions may be passed.  */

 #define POOL_APPEND_ZT(pool, beg, end) do {                     \
   const char *PA_beg = (beg);                                   \
   int PA_size = (end) - PA_beg;                                 \
   POOL_GROW (pool, PA_size + 1);                                \
   memcpy ((pool).contents + (pool).index, PA_beg, PA_size);     \
   (pool).contents[(pool).index + PA_size] = '\0';               \
   (pool).index += PA_size + 1;                                  \
 } while (0)
 191
 /* Forget old pool contents.  The allocated memory is not freed.
    Both the argument and the whole expansion are parenthesized, so
    the macro also works on expressions such as `*pool_ptr'.  */
 #define POOL_REWIND(pool) ((pool).index = 0)
 194
 /* Release whatever POOL holds on the heap and return it to the state
    POOL_INIT left it in.  xfree() is called only when the contents are
    no longer the alloca()'ed buffer; afterwards `contents' and `size'
    refer to the original buffer again, the index is zero, and the
    pool can be reused as if freshly initialized.  */

 #define POOL_FREE(pool) do {                    \
   if ((pool).alloca_p == 0)                     \
     xfree ((pool).contents);                    \
   (pool).alloca_p = 1;                          \
   (pool).index = 0;                             \
   (pool).size = (pool).orig_size;               \
   (pool).contents = (pool).orig_contents;       \
 } while (0)
 209
 210 \f
 /* Bit flags for convert_and_copy(); they may be OR-ed together.  */
 #define AP_DOWNCASE             (1 << 0)
 #define AP_PROCESS_ENTITIES     (1 << 1)
 #define AP_SKIP_BLANKS          (1 << 2)
 214
 215 /* Copy the text in the range [BEG, END) to POOL, optionally
 216    performing operations specified by FLAGS.  FLAGS may be any
 217    combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_SKIP_BLANKS
 218    with the following meaning:
 219
 220    * AP_DOWNCASE -- downcase all the letters;
 221
 222    * AP_PROCESS_ENTITIES -- process the SGML entities and write out
 223    the decoded string.  Recognized entities are &lt, &gt, &amp, &quot,
 224    &nbsp and the numerical entities.
 225
 226    * AP_SKIP_BLANKS -- ignore blanks at the beginning and at the end
 227    of text.  */
 228 static void
 229 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
 230 {
 231   int old_index = pool->index;
 232   int size;
 233
 234   /* First, skip blanks if required.  We must do this before entities
 235      are processed, so that blanks can still be inserted as, for
 236      instance, `&#32;'.  */
 237   if (flags & AP_SKIP_BLANKS)
 238     {
 239       while (beg < end && ISSPACE (*beg))
 240         ++beg;
 241       while (end > beg && ISSPACE (end[-1]))
 242         --end;
 243     }
 244   size = end - beg;
 245
 246   if (flags & AP_PROCESS_ENTITIES)
 247     {
 248       /* Stack-allocate a copy of text, process entities and copy it
 249          to the pool.  */
 250       char *local_copy = (char *)alloca (size + 1);
 251       const char *from = beg;
 252       char *to = local_copy;
 253
 254       while (from < end)
 255         {
 256           if (*from != '&')
 257             *to++ = *from++;
 258           else
 259             {
 260               const char *save = from;
 261               int remain;
 262
 263               if (++from == end) goto lose;
 264               remain = end - from;
 265
 266               if (*from == '#')
 267                 {
 268                   int numeric;
 269                   ++from;
 270                   if (from == end || !ISDIGIT (*from)) goto lose;
 271                   for (numeric = 0; from < end && ISDIGIT (*from); from++)
 272                     numeric = 10 * numeric + (*from) - '0';
 273                   if (from < end && ISALPHA (*from)) goto lose;
 274                   numeric &= 0xff;
 275                   *to++ = numeric;
 276                 }
 277 #define FROB(x) (remain >= (sizeof (x) - 1)                     \
 278                  && !memcmp (from, x, sizeof (x) - 1)           \
 279                  && (*(from + sizeof (x) - 1) == ';'            \
 280                      || remain == sizeof (x) - 1                \
 281                      || !ISALNUM (*(from + sizeof (x) - 1))))
 282               else if (FROB ("lt"))
 283                 *to++ = '<', from += 2;
 284               else if (FROB ("gt"))
 285                 *to++ = '>', from += 2;
 286               else if (FROB ("amp"))
 287                 *to++ = '&', from += 3;
 288               else if (FROB ("quot"))
 289                 *to++ = '\"', from += 4;
 290               /* We don't implement the proposed "Added Latin 1"
 291                  entities (except for nbsp), because it is unnecessary
 292                  in the context of Wget, and would require hashing to
 293                  work efficiently.  */
 294               else if (FROB ("nbsp"))
 295                 *to++ = 160, from += 4;
 296               else
 297                 goto lose;
 298 #undef FROB
 299               /* If the entity was followed by `;', we step over the
 300                  `;'.  Otherwise, it was followed by either a
 301                  non-alphanumeric or EOB, in which case we do nothing.  */
 302               if (from < end && *from == ';')
 303                 ++from;
 304               continue;
 305
 306             lose:
 307               /* This was not an entity after all.  Back out.  */
 308               from = save;
 309               *to++ = *from++;
 310             }
 311         }
 312       *to++ = '\0';
 313       POOL_APPEND (*pool, local_copy, to);
 314     }
 315   else
 316     {
 317       /* Just copy the text to the pool.  */
 318       POOL_APPEND_ZT (*pool, beg, end);
 319     }
 320
 321   if (flags & AP_DOWNCASE)
 322     {
 323       char *p = pool->contents + old_index;
 324       for (; *p; p++)
 325         *p = TOLOWER (*p);
 326     }
 327 }
 328 \f
 /* Check whether the text in the range [BEG, END) equals, ignoring
    case, one of the strings in ARRAY.  ARRAY is a NULL-terminated
    array of strings.  A NULL ARRAY means "everything is allowed".

    Returns 1 if the text is allowed, 0 otherwise.  An empty or
    inverted range matches nothing in a non-NULL ARRAY.  */
 static int
 array_allowed (const char **array, const char *beg, const char *end)
 {
   size_t length;

   if (!array)
     return 1;
   if (end < beg)
     return 0;

   length = (size_t) (end - beg);
   for (; *array; array++)
     /* The lengths must agree before the characters are compared;
        otherwise a mere prefix would be accepted.  */
     if (strlen (*array) == length
         && !strncasecmp (*array, beg, length))
       return 1;
   return 0;
 }
 346 \f
 347 /* Originally we used to adhere to RFC1866 here, and allowed only
 348    letters, digits, periods, and hyphens as names (of tags or
 349    attributes).  However, this broke too many pages which used
 350    proprietary or strange attributes, e.g.  <img src="a.gif"
 351    v:shapes="whatever">.
 352
 353    So now we allow any character except:
 354      * whitespace
 355      * 8-bit and control chars
 356      * characters that clearly cannot be part of name:
 357        '=', '>', '/'.
 358
 359    This only affects attribute and tag names; attribute values allow
 360    an even greater variety of characters.  */
 361
 #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
                         && (x) != '=' && (x) != '>' && (x) != '/')

 /* States while advancing through comments. */
 enum {
   AC_S_DONE,                    /* declaration fully consumed */
   AC_S_BACKOUT,                 /* not a well-formed declaration */
   AC_S_BANG,                    /* expecting the initial `!' */
   AC_S_DEFAULT,                 /* between tokens of the declaration */
   AC_S_DCLNAME,                 /* inside a name token */
   AC_S_DASH1,                   /* at the first `-' opening a comment */
   AC_S_DASH2,                   /* expecting the second opening `-' */
   AC_S_COMMENT,                 /* inside comment text */
   AC_S_DASH3,                   /* at a `-' that may close the comment */
   AC_S_DASH4,                   /* expecting the second closing `-' */
   AC_S_QUOTE1,                  /* at an opening quote */
   AC_S_IN_QUOTE,                /* inside a quoted string */
   AC_S_QUOTE2                   /* at the closing quote */
 };

 #ifdef STANDALONE
 /* Number of declarations advance_declaration() gave up on.  */
 static int comment_backout_count;
 #endif

 /* Advance over an SGML declaration (the <!...> forms you find in HTML
    documents).  The function returns the location after the
    declaration.  The reason we need this is that HTML comments are
    expressed as comments in so-called "empty declarations".

    To recap: any SGML declaration may have comments associated with
    it, e.g.
        <!MY-DECL -- isn't this fun? -- foo bar>

    An HTML comment is merely an empty declaration (<!>) with a comment
    attached, like this:
        <!-- some stuff here -->

    Several comments may be embedded in one comment declaration:
        <!-- have -- -- fun -->

    Whitespace is allowed between and after the comments, but not
    before the first comment.

    Additionally, this function attempts to handle double quotes in
    SGML declarations correctly.

    BEG must point at the `!' that follows `<'; END is one past the
    last readable character.  If the text is not a well-formed
    declaration, or the buffer ends before its closing `>', the
    function backs out and returns BEG + 1.  A declaration whose `>'
    is the very last character of the buffer is accepted, in which
    case the return value equals END.  If BEG equals END, BEG is
    returned.  */
 static const char *
 advance_declaration (const char *beg, const char *end)
 {
   const char *p = beg;
   char quote_char = '\0';       /* shut up, gcc! */
   char ch;
   int state = AC_S_BANG;

   if (beg == end)
     return beg;
   ch = *p++;

   /* It looked like a good idea to write this as a state machine, but
      now I wonder...

      CH is always the character under examination and P points just
      past it.  A state that is done with CH sets CONSUME; the next
      character is then fetched at the bottom of the loop, and running
      out of input at that point -- and only at that point -- is a
      backout.  Transitions that do not need a new character therefore
      still work when CH is the last character of the buffer.  */

   while (state != AC_S_DONE && state != AC_S_BACKOUT)
     {
       int consume = 0;

       switch (state)
         {
         case AC_S_DONE:
         case AC_S_BACKOUT:
           break;
         case AC_S_BANG:
           if (ch == '!')
             {
               consume = 1;
               state = AC_S_DEFAULT;
             }
           else
             state = AC_S_BACKOUT;
           break;
         case AC_S_DEFAULT:
           switch (ch)
             {
             case '-':
               state = AC_S_DASH1;
               break;
             case ' ':
             case '\t':
             case '\r':
             case '\n':
               consume = 1;
               break;
             case '>':
               state = AC_S_DONE;
               break;
             case '\'':
             case '\"':
               state = AC_S_QUOTE1;
               break;
             default:
               if (NAME_CHAR_P (ch))
                 state = AC_S_DCLNAME;
               else
                 state = AC_S_BACKOUT;
               break;
             }
           break;
         case AC_S_DCLNAME:
           if (ch == '-')
             state = AC_S_DASH1;
           else if (NAME_CHAR_P (ch))
             consume = 1;
           else
             state = AC_S_DEFAULT;
           break;
         case AC_S_QUOTE1:
           /* We must use 0x22 because broken assert macros choke on
              '"' and '\"'.  */
           assert (ch == '\'' || ch == 0x22);
           quote_char = ch;      /* cheating -- I really don't feel like
                                    introducing more different states for
                                    different quote characters. */
           consume = 1;
           state = AC_S_IN_QUOTE;
           break;
         case AC_S_IN_QUOTE:
           if (ch == quote_char)
             state = AC_S_QUOTE2;
           else
             consume = 1;
           break;
         case AC_S_QUOTE2:
           assert (ch == quote_char);
           consume = 1;
           state = AC_S_DEFAULT;
           break;
         case AC_S_DASH1:
           assert (ch == '-');
           consume = 1;
           state = AC_S_DASH2;
           break;
         case AC_S_DASH2:
           if (ch == '-')
             {
               consume = 1;
               state = AC_S_COMMENT;
             }
           else
             state = AC_S_BACKOUT;
           break;
         case AC_S_COMMENT:
           if (ch == '-')
             state = AC_S_DASH3;
           else
             consume = 1;
           break;
         case AC_S_DASH3:
           assert (ch == '-');
           consume = 1;
           state = AC_S_DASH4;
           break;
         case AC_S_DASH4:
           if (ch == '-')
             {
               consume = 1;
               state = AC_S_DEFAULT;
             }
           else
             state = AC_S_COMMENT;
           break;
         }

       if (consume)
         {
           if (p == end)
             state = AC_S_BACKOUT;   /* input ended inside the declaration */
           else
             ch = *p++;
         }
     }

   if (state == AC_S_BACKOUT)
     {
 #ifdef STANDALONE
       ++comment_backout_count;
 #endif
       return beg + 1;
     }
   return p;
 }
 550 \f
 /* Step P (a char pointer) forward by one character so that the next
    character can be read.  If that moves P to or past `end', jump to
    the `finish' label; both names must be in scope where the macro is
    expanded.  */

 #define ADVANCE(p) do {                         \
   if (++(p) >= end)                             \
     goto finish;                                \
 } while (0)
 559
 /* Move P past any run of whitespace, leaving it on the first
    character that is not whitespace.  Uses ADVANCE, so running out
    of input jumps to `finish'.  */

 #define SKIP_WS(p) do {                         \
   for (; ISSPACE (*p); )                        \
     ADVANCE (p);                                \
 } while (0)
 567
 /* Move P past any run of non-whitespace, leaving it on the first
    whitespace character.  Uses ADVANCE, so running out of input jumps
    to `finish'.  */

 #define SKIP_NON_WS(p) do {                     \
   for (; !ISSPACE (*p); )                       \
     ADVANCE (p);                                \
 } while (0)
 575
 576 #ifdef STANDALONE
     /* Number of times map_html_tags() took its backout_tag path.  Kept
        only in the standalone test build, whose main() prints it.  */
 577 static int tag_backout_count;
 578 #endif
 579
 580 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
 581    MAPFUN will be called with two arguments: pointer to an initialized
 582    struct taginfo, and CLOSURE.
 583
 584    ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to
 585    be processed by this function.  If it is NULL, all the tags are
 586    allowed.  The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES.
 587
 588    (Obviously, the caller can filter out unwanted tags and attributes
 589    just as well, but this is just an optimization designed to avoid
 590    unnecessary copying for tags/attributes which the caller doesn't
 591    want to know about.  These lists are searched linearly; therefore,
 592    if you're interested in a large number of tags or attributes, you'd
 593    better set these to NULL and filter them out yourself with a
 594    hashing process most appropriate for your application.)  */
     /* Additional notes: if SIZE is zero the function returns at once
        without calling MAPFUN.  The taginfo handed to MAPFUN is a local
        object, and the tag name and attribute name/value strings live in
        a pool that is rewound before the next tag and released before
        returning, so MAPFUN must copy whatever it wants to keep.
        start_position, end_position and each attribute's
        value_raw_beginning point into TEXT itself.  */
 595
 596 void
 597 map_html_tags (const char *text, int size,
 598                const char **allowed_tag_names,
 599                const char **allowed_attribute_names,
 600                void (*mapfun) (struct taginfo *, void *),
 601                void *closure)
 602 {
 603   const char *p = text;
 604   const char *end = text + size;
 605
       /* PAIRS starts with room for eight attributes and is enlarged
          on demand by DO_REALLOC_FROM_ALLOCA below; attr_pair_alloca_p
          is cleared once it has been moved to the heap (see `finish').  */
 606   int attr_pair_count = 8;
 607   int attr_pair_alloca_p = 1;
 608   struct attr_pair *pairs = ALLOCA_ARRAY (struct attr_pair, attr_pair_count);
 609   struct pool pool;
 610
 611   if (!size)
 612     return;
 613
 614   POOL_INIT (pool, 256);
 615
 616   {
 617     int nattrs, end_tag;
 618     const char *tag_name_begin, *tag_name_end;
 619     const char *tag_start_position;
 620     int uninteresting_tag;
 621
 622   look_for_tag:
         /* Every tag starts with an empty pool; the strings collected
            for the previous tag are discarded here.  */
 623     POOL_REWIND (pool);
 624
 625     nattrs = 0;
 626     end_tag = 0;
 627
 628     /* Find beginning of tag.  We use memchr() instead of the usual
 629        looping with ADVANCE() for speed. */
 630     p = memchr (p, '<', end - p);
 631     if (!p)
 632       goto finish;
 633
 634     tag_start_position = p;
 635     ADVANCE (p);
 636
 637     /* Establish the type of the tag (start-tag, end-tag or
 638        declaration).  */
 639     if (*p == '!')
 640       {
 641         /* This is an SGML declaration -- just skip it.  */
 642         p = advance_declaration (p, end);
 643         if (p == end)
 644           goto finish;
 645         goto look_for_tag;
 646       }
 647     else if (*p == '/')
 648       {
 649         end_tag = 1;
 650         ADVANCE (p);
 651       }
 652     tag_name_begin = p;
 653     while (NAME_CHAR_P (*p))
 654       ADVANCE (p);
         /* No name after `<' (or `</'): this is not a tag, so resume the
            search from the current position.  */
 655     if (p == tag_name_begin)
 656       goto look_for_tag;
 657     tag_name_end = p;
 658     SKIP_WS (p);
         /* Anything other than `>' after the name of an end tag is
            treated as a malformed tag.  */
 659     if (end_tag && *p != '>')
 660       goto backout_tag;
 661
 662     if (!array_allowed (allowed_tag_names, tag_name_begin, tag_name_end))
 663       /* We can't just say "goto look_for_tag" here because we need
 664          the loop below to properly advance over the tag's attributes.  */
 665       uninteresting_tag = 1;
 666     else
 667       {
 668         uninteresting_tag = 0;
             /* The downcased tag name becomes the first string in the
                pool; taginfo.name is later set to pool.contents.  */
 669         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
 670       }
 671
 672     /* Find the attributes. */
 673     while (1)
 674       {
 675         const char *attr_name_begin, *attr_name_end;
 676         const char *attr_value_begin, *attr_value_end;
 677         const char *attr_raw_value_begin, *attr_raw_value_end;
 678         int operation = AP_DOWNCASE; /* stupid compiler. */
 679
 680         SKIP_WS (p);
 681
 682         if (*p == '/')
 683           {
 684             /* A slash at this point means the tag is about to be
 685                closed.  This is legal in XML and has been popularized
 686                in HTML via XHTML.  */
 687             /* <foo a=b c=d /> */
 688             /*              ^  */
 689             ADVANCE (p);
 690             SKIP_WS (p);
 691             if (*p != '>')
 692               goto backout_tag;
 693           }
 694
 695         /* Check for end of tag definition. */
 696         if (*p == '>')
 697           break;
 698
 699         /* Establish bounds of attribute name. */
 700         attr_name_begin = p;    /* <foo bar ...> */
 701                                 /*      ^        */
 702         while (NAME_CHAR_P (*p))
 703           ADVANCE (p);
 704         attr_name_end = p;      /* <foo bar ...> */
 705                                 /*         ^     */
 706         if (attr_name_begin == attr_name_end)
 707           goto backout_tag;
 708
 709         /* Establish bounds of attribute value. */
 710         SKIP_WS (p);
 711         if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
 712           {
 713             /* Minimized attribute syntax allows `=' to be omitted.
 714                For example, <UL COMPACT> is a valid shorthand for <UL
 715                COMPACT="compact">.  Even if such attributes are not
 716                useful to Wget, we need to support them, so that the
 717                tags containing them can be parsed correctly. */
                 /* The attribute's name doubles as its value; OPERATION
                    keeps its initial AP_DOWNCASE, so the value is stored
                    downcased.  */
 718             attr_raw_value_begin = attr_value_begin = attr_name_begin;
 719             attr_raw_value_end = attr_value_end = attr_name_end;
 720           }
 721         else if (*p == '=')
 722           {
 723             ADVANCE (p);
 724             SKIP_WS (p);
 725             if (*p == '\"' || *p == '\'')
 726               {
 727                 int newline_seen = 0;
 728                 char quote_char = *p;
 729                 attr_raw_value_begin = p;
 730                 ADVANCE (p);
 731                 attr_value_begin = p; /* <foo bar="baz"> */
 732                                       /*           ^     */
 733                 while (*p != quote_char)
 734                   {
 735                     if (!newline_seen && *p == '\n')
 736                       {
 737                         /* If a newline is seen within the quotes, it
 738                            is most likely that someone forgot to close
 739                            the quote.  In that case, we back out to
 740                            the value beginning, and terminate the tag
 741                            at either `>' or the delimiter, whichever
 742                            comes first.  Such a tag terminated at `>'
 743                            is discarded.  */
 744                         p = attr_value_begin;
 745                         newline_seen = 1;
 746                         continue;
 747                       }
 748                     else if (newline_seen && *p == '>')
 749                       break;
 750                     ADVANCE (p);
 751                   }
 752                 attr_value_end = p; /* <foo bar="baz"> */
 753                                     /*              ^  */
 754                 if (*p == quote_char)
 755                   ADVANCE (p);
 756                 else
                       /* The loop stopped at `>' after a newline: drop
                          the whole tag and resume the search there.  */
 757                   goto look_for_tag;
 758                 attr_raw_value_end = p; /* <foo bar="baz"> */
 759                                         /*               ^ */
 760                 /* The AP_SKIP_BLANKS part is not entirely correct,
 761                    because we don't want to skip blanks for all the
 762                    attribute values.  */
 763                 operation = AP_PROCESS_ENTITIES | AP_SKIP_BLANKS;
 764               }
 765             else
 766               {
 767                 attr_value_begin = p; /* <foo bar=baz> */
 768                                       /*          ^    */
 769                 /* According to SGML, a name token should consist only
 770                    of alphanumerics, . and -.  However, this is often
 771                    violated by, for instance, `%' in `width=75%'.
 772                    We'll be liberal and allow just about anything as
 773                    an attribute value.  */
 774                 while (!ISSPACE (*p) && *p != '>')
 775                   ADVANCE (p);
 776                 attr_value_end = p; /* <foo bar=baz qux=quix> */
 777                                     /*             ^          */
 778                 if (attr_value_begin == attr_value_end)
 779                   /* <foo bar=> */
 780                   /*          ^ */
 781                   goto backout_tag;
 782                 attr_raw_value_begin = attr_value_begin;
 783                 attr_raw_value_end = attr_value_end;
 784                 operation = AP_PROCESS_ENTITIES;
 785               }
 786           }
 787         else
 788           {
 789             /* We skipped the whitespace and found something that is
 790                neither `=' nor the beginning of the next attribute's
 791                name.  Back out.  */
 792             goto backout_tag;   /* <foo bar [... */
 793                                 /*          ^    */
 794           }
 795
 796         /* If we're not interested in the tag, don't bother with any
 797            of the attributes.  */
 798         if (uninteresting_tag)
 799           continue;
 800
 801         /* If we aren't interested in the attribute, skip it.  We
 802            cannot do this test any sooner, because our text pointer
 803            needs to correctly advance over the attribute.  */
 804         if (allowed_attribute_names
 805             && !array_allowed (allowed_attribute_names, attr_name_begin,
 806                                attr_name_end))
 807           continue;
 808
             /* Make room for one more attribute.  */
 809         DO_REALLOC_FROM_ALLOCA (pairs, attr_pair_count, nattrs + 1,
 810                                 attr_pair_alloca_p, struct attr_pair);
 811
             /* Record pool offsets rather than pointers, because the
                pool may still be reallocated while later attributes are
                copied; the pointers are filled in after the loop.  */
 812         pairs[nattrs].name_pool_index = pool.index;
 813         convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
 814
 815         pairs[nattrs].value_pool_index = pool.index;
 816         convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
 817         pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
 818         pairs[nattrs].value_raw_size = (attr_raw_value_end
 819                                         - attr_raw_value_begin);
 820         ++nattrs;
 821       }
 822
         /* P is at the closing `>'.  For a tag the caller did not ask
            for, just step over it and look for the next one.  */
 823     if (uninteresting_tag)
 824       {
 825         ADVANCE (p);
 826         goto look_for_tag;
 827       }
 828
 829     /* By now, we have a valid tag with a name and zero or more
 830        attributes.  Fill in the data and call the mapper function.  */
 831     {
 832       int i;
 833       struct taginfo taginfo;
 834
 835       taginfo.name      = pool.contents;
 836       taginfo.end_tag_p = end_tag;
 837       taginfo.nattrs    = nattrs;
 838       /* We fill in the char pointers only now, when pool can no
 839          longer get realloc'ed.  If we did that above, we could get
 840          hosed by reallocation.  Obviously, after this point, the pool
 841          may no longer be grown.  */
 842       for (i = 0; i < nattrs; i++)
 843         {
 844           pairs[i].name = pool.contents + pairs[i].name_pool_index;
 845           pairs[i].value = pool.contents + pairs[i].value_pool_index;
 846         }
 847       taginfo.attrs = pairs;
 848       taginfo.start_position = tag_start_position;
           /* P is still at `>', so the tag ends one character later.  */
 849       taginfo.end_position   = p + 1;
 850       /* Ta-dam! */
 851       (*mapfun) (&taginfo, closure);
 852       ADVANCE (p);
 853     }
 854     goto look_for_tag;
 855
 856   backout_tag:
 857 #ifdef STANDALONE
 858     ++tag_backout_count;
 859 #endif
 860     /* The tag wasn't really a tag.  Treat its contents as ordinary
 861        data characters. */
 862     p = tag_start_position + 1;
 863     goto look_for_tag;
 864   }
 865
 866  finish:
       /* Reached when the input is exhausted, from ADVANCE or from the
          explicit gotos above.  Release heap storage, if the pool or
          PAIRS were moved off their initial buffers.  */
 867   POOL_FREE (pool);
 868   if (!attr_pair_alloca_p)
 869     xfree (pairs);
 870 }
 871
 872 #undef ADVANCE
 873 #undef SKIP_WS
 874 #undef SKIP_NON_WS
 875 \f
 876 #ifdef STANDALONE
 877 static void
 878 test_mapper (struct taginfo *taginfo, void *arg)
 879 {
 880   int i;
 881
 882   printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
 883   for (i = 0; i < taginfo->nattrs; i++)
 884     printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
 885   putchar ('\n');
 886   ++*(int *)arg;
 887 }
 888
 889 int main ()
 890 {
 891   int size = 256;
 892   char *x = (char *)xmalloc (size);
 893   int length = 0;
 894   int read_count;
 895   int tag_counter = 0;
 896
 897   while ((read_count = fread (x + length, 1, size - length, stdin)))
 898     {
 899       length += read_count;
 900       size <<= 1;
 901       x = (char *)xrealloc (x, size);
 902     }
 903
 904   map_html_tags (x, length, NULL, NULL, test_mapper, &tag_counter);
 905   printf ("TAGS: %d\n", tag_counter);
 906   printf ("Tag backouts:     %d\n", tag_backout_count);
 907   printf ("Comment backouts: %d\n", comment_backout_count);
 908   return 0;
 909 }
 910 #endif /* STANDALONE */