sjero.net Git - wget/blob - src/html-parse.c

   1 /* HTML parser for Wget.
   2    Copyright (C) 1998, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 /* The only entry point to this module is map_html_tags(), which see.  */
  21
  22 /* TODO:
  23
  24    - Allow hooks for callers to process contents outside tags.  This
  25      is needed to implement handling <style> and <script>.  The
  26      taginfo structure already carries the information about where the
  27      tags are, but this is not enough, because one would also want to
  28      skip the comments.  (The funny thing is that for <style> and
  29      <script> you *don't* want to skip comments!)
  30
  31    - Create a test suite for regression testing. */
  32
  33 /* HISTORY:
  34
  35    This is the third HTML parser written for Wget.  The first one was
  36    written some time during the Geturl 1.0 beta cycle, and was very
  37    inefficient and buggy.  It also contained some very complex code to
  38    remember a list of parser states, because it was supposed to be
  39    reentrant.  The idea was that several parsers would be running
  40    concurrently, and you'd have pass the function a unique ID string
  41    (for example, the URL) by which it found the relevant parser state
  42    and returned the next URL.  Over-engineering at its best.
  43
  44    The second HTML parser was written for Wget 1.4 (the first version
  45    by the name `Wget'), and was a complete rewrite.  Although the new
  46    parser behaved much better and made no claims of reentrancy, it
  47    still shared many of the fundamental flaws of the old version -- it
  48    only regarded HTML in terms tag-attribute pairs, where the
  49    attribute's value was a URL to be returned.  Any other property of
  50    HTML, such as <base href=...>, or strange way to specify a URL,
  51    such as <meta http-equiv=Refresh content="0; URL=..."> had to be
  52    crudely hacked in -- and the caller had to be aware of these hacks.
  53    Like its predecessor, this parser did not support HTML comments.
  54
  55    After Wget 1.5.1 was released, I set out to write a third HTML
  56    parser.  The objectives of the new parser were to: (1) provide a
  57    clean way to analyze HTML lexically, (2) separate interpretation of
  58    the markup from the parsing process, (3) be as correct as possible,
  59    e.g. correctly skipping comments and other SGML declarations, (4)
  60    understand the most common errors in markup and skip them or be
  61    relaxed towrds them, and (5) be reasonably efficient (no regexps,
  62    minimum copying and minimum or no heap allocation).
  63
  64    I believe this parser meets all of the above goals.  It is
  65    reasonably well structured, and could be relatively easily
  66    separated from Wget and used elsewhere.  While some of its
  67    intrinsic properties limit its value as a general-purpose HTML
  68    parser, I believe that, with minimum modifications, it could serve
  69    as a backend for one.
  70
  71    Due to time and other constraints, this parser was not integrated
  72    into Wget until the version 1.7. */
  73
  74 /* DESCRIPTION:
  75
  76    The single entry point of this parser is map_html_tags(), which
  77    works by calling a function you specify for each tag.  The function
  78    gets called with the pointer to a structure describing the tag and
  79    its attributes.  */
  80
  81 /* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
  82    still need Wget headers to compile.  */
  83
  84 #include <config.h>
  85
  86 #ifdef STANDALONE
  87 # define I_REALLY_WANT_CTYPE_MACROS
  88 #endif
  89
  90 #include <stdio.h>
  91 #include <stdlib.h>
  92 #ifdef HAVE_STRING_H
  93 # include <string.h>
  94 #else
  95 # include <strings.h>
  96 #endif
  97 #include <assert.h>
  98
  99 #include "wget.h"
 100 #include "html-parse.h"
 101
 102 #ifdef STANDALONE
 103 # define xmalloc malloc
 104 # define xrealloc realloc
 105 # define xfree free
 106
 107 # define ISSPACE(x) isspace (x)
 108 # define ISDIGIT(x) isdigit (x)
 109 # define ISALPHA(x) isalpha (x)
 110 # define ISALNUM(x) isalnum (x)
 111 # define TOLOWER(x) tolower (x)
 112 #endif /* STANDALONE */
 113
 114 /* Pool support.  A pool is a resizable chunk of memory.  It is first
 115    allocated on the stack, and moved to the heap if it needs to be
 116    larger than originally expected.  map_html_tags() uses it to store
 117    the zero-terminated names and values of tags and attributes.
 118
 119    Thus taginfo->name, and attr->name and attr->value for each
 120    attribute, do not point into separately allocated areas, but into
 121    different parts of the pool, separated only by terminating zeros.
 122    This ensures minimum amount of allocation and, for most tags, no
 123    allocation because the entire pool is kept on the stack.  */
 124
 125 struct pool {
 126   char *contents;               /* pointer to the contents. */
 127   int size;                     /* size of the pool. */
 128   int index;                    /* next unoccupied position in
 129                                    contents. */
 130
 131   int alloca_p;                 /* whether contents was allocated
 132                                    using alloca(). */
 133   char *orig_contents;          /* orig_contents, allocated by
 134                                    alloca().  this is used by
 135                                    POOL_FREE to restore the pool to
 136                                    the "initial" state. */
 137   int orig_size;
 138 };
 139
 140 /* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
 141
 142 #define POOL_INIT(pool, initial_size) do {              \
 143   (pool).size = (initial_size);                         \
 144   (pool).contents = ALLOCA_ARRAY (char, (pool).size);   \
 145   (pool).index = 0;                                     \
 146   (pool).alloca_p = 1;                                  \
 147   (pool).orig_contents = (pool).contents;               \
 148   (pool).orig_size = (pool).size;                       \
 149 } while (0)
 150
 151 /* Grow the pool to accomodate at least SIZE new bytes.  If the pool
 152    already has room to accomodate SIZE bytes of data, this is a no-op.  */
 153
 154 #define POOL_GROW(pool, increase) do {                                  \
 155   int PG_newsize = (pool).index + increase;                             \
 156   DO_REALLOC_FROM_ALLOCA ((pool).contents, (pool).size, PG_newsize,     \
 157                           (pool).alloca_p, char);                       \
 158 } while (0)
 159
 160 /* Append text in the range [beg, end) to POOL.  No zero-termination
 161    is done.  */
 162
 163 #define POOL_APPEND(pool, beg, end) do {                        \
 164   const char *PA_beg = beg;                                     \
 165   int PA_size = end - PA_beg;                                   \
 166   POOL_GROW (pool, PA_size);                                    \
 167   memcpy ((pool).contents + (pool).index, PA_beg, PA_size);     \
 168   (pool).index += PA_size;                                      \
 169 } while (0)
 170
 171 /* The same as the above, but with zero termination. */
 172
 173 #define POOL_APPEND_ZT(pool, beg, end) do {                     \
 174   const char *PA_beg = beg;                                     \
 175   int PA_size = end - PA_beg;                                   \
 176   POOL_GROW (pool, PA_size + 1);                                \
 177   memcpy ((pool).contents + (pool).index, PA_beg, PA_size);     \
 178   (pool).contents[(pool).index + PA_size] = '\0';               \
 179   (pool).index += PA_size + 1;                                  \
 180 } while (0)
 181
 182 /* Forget old pool contents.  The allocated memory is not freed. */
 183 #define POOL_REWIND(pool) pool.index = 0
 184
 185 /* Free heap-allocated memory for contents of POOL.  This calls
 186    xfree() if the memory was allocated through malloc.  It also
 187    restores `contents' and `size' to their original, pre-malloc
 188    values.  That way after POOL_FREE, the pool is fully usable, just
 189    as if it were freshly initialized with POOL_INIT.  */
 190
 191 #define POOL_FREE(pool) do {                    \
 192   if (!(pool).alloca_p)                         \
 193     xfree ((pool).contents);                    \
 194   (pool).contents = (pool).orig_contents;       \
 195   (pool).size = (pool).orig_size;               \
 196   (pool).index = 0;                             \
 197   (pool).alloca_p = 1;                          \
 198 } while (0)
 199
 200 \f
 201 #define AP_DOWNCASE             1
 202 #define AP_PROCESS_ENTITIES     2
 203 #define AP_SKIP_BLANKS          4
 204
 205 /* Copy the text in the range [BEG, END) to POOL, optionally
 206    performing operations specified by FLAGS.  FLAGS may be any
 207    combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_SKIP_BLANKS
 208    with the following meaning:
 209
 210    * AP_DOWNCASE -- downcase all the letters;
 211
 212    * AP_PROCESS_ENTITIES -- process the SGML entities and write out
 213    the decoded string.  Recognized entities are &lt, &gt, &amp, &quot,
 214    &nbsp and the numerical entities.
 215
 216    * AP_SKIP_BLANKS -- ignore blanks at the beginning and at the end
 217    of text.  */
 218 static void
 219 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
 220 {
 221   int old_index = pool->index;
 222   int size;
 223
 224   /* First, skip blanks if required.  We must do this before entities
 225      are processed, so that blanks can still be inserted as, for
 226      instance, `&#32;'.  */
 227   if (flags & AP_SKIP_BLANKS)
 228     {
 229       while (beg < end && ISSPACE (*beg))
 230         ++beg;
 231       while (end > beg && ISSPACE (end[-1]))
 232         --end;
 233     }
 234   size = end - beg;
 235
 236   if (flags & AP_PROCESS_ENTITIES)
 237     {
 238       /* Stack-allocate a copy of text, process entities and copy it
 239          to the pool.  */
 240       char *local_copy = (char *)alloca (size + 1);
 241       const char *from = beg;
 242       char *to = local_copy;
 243
 244       while (from < end)
 245         {
 246           if (*from != '&')
 247             *to++ = *from++;
 248           else
 249             {
 250               const char *save = from;
 251               int remain;
 252
 253               if (++from == end) goto lose;
 254               remain = end - from;
 255
 256               if (*from == '#')
 257                 {
 258                   int numeric;
 259                   ++from;
 260                   if (from == end || !ISDIGIT (*from)) goto lose;
 261                   for (numeric = 0; from < end && ISDIGIT (*from); from++)
 262                     numeric = 10 * numeric + (*from) - '0';
 263                   if (from < end && ISALPHA (*from)) goto lose;
 264                   numeric &= 0xff;
 265                   *to++ = numeric;
 266                 }
 267 #define FROB(x) (remain >= (sizeof (x) - 1)                     \
 268                  && !memcmp (from, x, sizeof (x) - 1)           \
 269                  && (*(from + sizeof (x) - 1) == ';'            \
 270                      || remain == sizeof (x) - 1                \
 271                      || !ISALNUM (*(from + sizeof (x) - 1))))
 272               else if (FROB ("lt"))
 273                 *to++ = '<', from += 2;
 274               else if (FROB ("gt"))
 275                 *to++ = '>', from += 2;
 276               else if (FROB ("amp"))
 277                 *to++ = '&', from += 3;
 278               else if (FROB ("quot"))
 279                 *to++ = '\"', from += 4;
 280               /* We don't implement the proposed "Added Latin 1"
 281                  entities (except for nbsp), because it is unnecessary
 282                  in the context of Wget, and would require hashing to
 283                  work efficiently.  */
 284               else if (FROB ("nbsp"))
 285                 *to++ = 160, from += 4;
 286               else
 287                 goto lose;
 288 #undef FROB
 289               /* If the entity was followed by `;', we step over the
 290                  `;'.  Otherwise, it was followed by either a
 291                  non-alphanumeric or EOB, in which case we do nothing.  */
 292               if (from < end && *from == ';')
 293                 ++from;
 294               continue;
 295
 296             lose:
 297               /* This was not an entity after all.  Back out.  */
 298               from = save;
 299               *to++ = *from++;
 300             }
 301         }
 302       *to++ = '\0';
 303       POOL_APPEND (*pool, local_copy, to);
 304     }
 305   else
 306     {
 307       /* Just copy the text to the pool.  */
 308       POOL_APPEND_ZT (*pool, beg, end);
 309     }
 310
 311   if (flags & AP_DOWNCASE)
 312     {
 313       char *p = pool->contents + old_index;
 314       for (; *p; p++)
 315         *p = TOLOWER (*p);
 316     }
 317 }
 318 \f
 319 /* Check whether the contents of [POS, POS+LENGTH) match any of the
 320    strings in the ARRAY.  */
 321 static int
 322 array_allowed (const char **array, const char *beg, const char *end)
 323 {
 324   int length = end - beg;
 325   if (array)
 326     {
 327       for (; *array; array++)
 328         if (length >= strlen (*array)
 329             && !strncasecmp (*array, beg, length))
 330           break;
 331       if (!*array)
 332         return 0;
 333     }
 334   return 1;
 335 }
 336 \f
 337 /* RFC1866: name [of attribute or tag] consists of letters, digits,
 338    periods, or hyphens.  We also allow _, for compatibility with
 339    brain-damaged generators.  */
 340 #define NAME_CHAR_P(x) (ISALNUM (x) || (x) == '.' || (x) == '-' || (x) == '_')
 341
 342 /* States while advancing through comments. */
 343 #define AC_S_DONE       0
 344 #define AC_S_BACKOUT    1
 345 #define AC_S_BANG       2
 346 #define AC_S_DEFAULT    3
 347 #define AC_S_DCLNAME    4
 348 #define AC_S_DASH1      5
 349 #define AC_S_DASH2      6
 350 #define AC_S_COMMENT    7
 351 #define AC_S_DASH3      8
 352 #define AC_S_DASH4      9
 353 #define AC_S_QUOTE1     10
 354 #define AC_S_IN_QUOTE   11
 355 #define AC_S_QUOTE2     12
 356
 357 #ifdef STANDALONE
 358 static int comment_backout_count;
 359 #endif
 360
 361 /* Advance over an SGML declaration (the <!...> forms you find in HTML
 362    documents).  The function returns the location after the
 363    declaration.  The reason we need this is that HTML comments are
 364    expressed as comments in so-called "empty declarations".
 365
 366    To recap: any SGML declaration may have comments associated with
 367    it, e.g.
 368        <!MY-DECL -- isn't this fun? -- foo bar>
 369
 370    An HTML comment is merely an empty declaration (<!>) with a comment
 371    attached, like this:
 372        <!-- some stuff here -->
 373
 374    Several comments may be embedded in one comment declaration:
 375        <!-- have -- -- fun -->
 376
 377    Whitespace is allowed between and after the comments, but not
 378    before the first comment.
 379
 380    Additionally, this function attempts to handle double quotes in
 381    SGML declarations correctly.  */
 382 static const char *
 383 advance_declaration (const char *beg, const char *end)
 384 {
 385   const char *p = beg;
 386   char quote_char = '\0';       /* shut up, gcc! */
 387   char ch;
 388   int state = AC_S_BANG;
 389
 390   if (beg == end)
 391     return beg;
 392   ch = *p++;
 393
 394   /* It looked like a good idea to write this as a state machine, but
 395      now I wonder...  */
 396
 397   while (state != AC_S_DONE && state != AC_S_BACKOUT)
 398     {
 399       if (p == end)
 400         state = AC_S_BACKOUT;
 401       switch (state)
 402         {
 403         case AC_S_DONE:
 404         case AC_S_BACKOUT:
 405           break;
 406         case AC_S_BANG:
 407           if (ch == '!')
 408             {
 409               ch = *p++;
 410               state = AC_S_DEFAULT;
 411             }
 412           else
 413             state = AC_S_BACKOUT;
 414           break;
 415         case AC_S_DEFAULT:
 416           switch (ch)
 417             {
 418             case '-':
 419               state = AC_S_DASH1;
 420               break;
 421             case ' ':
 422             case '\t':
 423             case '\r':
 424             case '\n':
 425               ch = *p++;
 426               break;
 427             case '>':
 428               state = AC_S_DONE;
 429               break;
 430             case '\'':
 431             case '\"':
 432               state = AC_S_QUOTE1;
 433               break;
 434             default:
 435               if (NAME_CHAR_P (ch))
 436                 state = AC_S_DCLNAME;
 437               else
 438                 state = AC_S_BACKOUT;
 439               break;
 440             }
 441           break;
 442         case AC_S_DCLNAME:
 443           if (NAME_CHAR_P (ch))
 444             ch = *p++;
 445           else if (ch == '-')
 446             state = AC_S_DASH1;
 447           else
 448             state = AC_S_DEFAULT;
 449           break;
 450         case AC_S_QUOTE1:
 451           /* We must use 0x22 because broken assert macros choke on
 452              '"' and '\"'.  */
 453           assert (ch == '\'' || ch == 0x22);
 454           quote_char = ch;      /* cheating -- I really don't feel like
 455                                    introducing more different states for
 456                                    different quote characters. */
 457           ch = *p++;
 458           state = AC_S_IN_QUOTE;
 459           break;
 460         case AC_S_IN_QUOTE:
 461           if (ch == quote_char)
 462             state = AC_S_QUOTE2;
 463           else
 464             ch = *p++;
 465           break;
 466         case AC_S_QUOTE2:
 467           assert (ch == quote_char);
 468           ch = *p++;
 469           state = AC_S_DEFAULT;
 470           break;
 471         case AC_S_DASH1:
 472           assert (ch == '-');
 473           ch = *p++;
 474           state = AC_S_DASH2;
 475           break;
 476         case AC_S_DASH2:
 477           switch (ch)
 478             {
 479             case '-':
 480               ch = *p++;
 481               state = AC_S_COMMENT;
 482               break;
 483             default:
 484               state = AC_S_BACKOUT;
 485             }
 486           break;
 487         case AC_S_COMMENT:
 488           switch (ch)
 489             {
 490             case '-':
 491               state = AC_S_DASH3;
 492               break;
 493             default:
 494               ch = *p++;
 495               break;
 496             }
 497           break;
 498         case AC_S_DASH3:
 499           assert (ch == '-');
 500           ch = *p++;
 501           state = AC_S_DASH4;
 502           break;
 503         case AC_S_DASH4:
 504           switch (ch)
 505             {
 506             case '-':
 507               ch = *p++;
 508               state = AC_S_DEFAULT;
 509               break;
 510             default:
 511               state = AC_S_COMMENT;
 512               break;
 513             }
 514           break;
 515         }
 516     }
 517
 518   if (state == AC_S_BACKOUT)
 519     {
 520 #ifdef STANDALONE
 521       ++comment_backout_count;
 522 #endif
 523       return beg + 1;
 524     }
 525   return p;
 526 }
 527 \f
 528 /* Advance P (a char pointer), with the explicit intent of being able
 529    to read the next character.  If this is not possible, go to finish.  */
 530
 531 #define ADVANCE(p) do {                         \
 532   ++p;                                          \
 533   if (p >= end)                                 \
 534     goto finish;                                \
 535 } while (0)
 536
 537 /* Skip whitespace, if any. */
 538
 539 #define SKIP_WS(p) do {                         \
 540   while (ISSPACE (*p)) {                        \
 541     ADVANCE (p);                                \
 542   }                                             \
 543 } while (0)
 544
 545 /* Skip non-whitespace, if any. */
 546
 547 #define SKIP_NON_WS(p) do {                     \
 548   while (!ISSPACE (*p)) {                       \
 549     ADVANCE (p);                                \
 550   }                                             \
 551 } while (0)
 552
 553 #ifdef STANDALONE
 554 static int tag_backout_count;
 555 #endif
 556
 557 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
 558    MAPFUN will be called with two arguments: pointer to an initialized
 559    struct taginfo, and CLOSURE.
 560
 561    ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to
 562    be processed by this function.  If it is NULL, all the tags are
 563    allowed.  The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES.
 564
 565    (Obviously, the caller can filter out unwanted tags and attributes
 566    just as well, but this is just an optimization designed to avoid
 567    unnecessary copying for tags/attributes which the caller doesn't
 568    want to know about.  These lists are searched linearly; therefore,
 569    if you're interested in a large number of tags or attributes, you'd
 570    better set these to NULL and filter them out yourself with a
 571    hashing process most appropriate for your application.)  */
 572
 573 void
 574 map_html_tags (const char *text, int size,
 575                const char **allowed_tag_names,
 576                const char **allowed_attribute_names,
 577                void (*mapfun) (struct taginfo *, void *),
 578                void *closure)
 579 {
 580   const char *p = text;
 581   const char *end = text + size;
 582
 583   int attr_pair_count = 8;
 584   int attr_pair_alloca_p = 1;
 585   struct attr_pair *pairs = ALLOCA_ARRAY (struct attr_pair, attr_pair_count);
 586   struct pool pool;
 587
 588   if (!size)
 589     return;
 590
 591   POOL_INIT (pool, 256);
 592
 593   {
 594     int nattrs, end_tag;
 595     const char *tag_name_begin, *tag_name_end;
 596     const char *tag_start_position;
 597     int uninteresting_tag;
 598
 599   look_for_tag:
 600     POOL_REWIND (pool);
 601
 602     nattrs = 0;
 603     end_tag = 0;
 604
 605     /* Find beginning of tag.  We use memchr() instead of the usual
 606        looping with ADVANCE() for speed. */
 607     p = memchr (p, '<', end - p);
 608     if (!p)
 609       goto finish;
 610
 611     tag_start_position = p;
 612     ADVANCE (p);
 613
 614     /* Establish the type of the tag (start-tag, end-tag or
 615        declaration).  */
 616     if (*p == '!')
 617       {
 618         /* This is an SGML declaration -- just skip it.  */
 619         p = advance_declaration (p, end);
 620         if (p == end)
 621           goto finish;
 622         goto look_for_tag;
 623       }
 624     else if (*p == '/')
 625       {
 626         end_tag = 1;
 627         ADVANCE (p);
 628       }
 629     tag_name_begin = p;
 630     while (NAME_CHAR_P (*p))
 631       ADVANCE (p);
 632     if (p == tag_name_begin)
 633       goto look_for_tag;
 634     tag_name_end = p;
 635     SKIP_WS (p);
 636     if (end_tag && *p != '>')
 637       goto backout_tag;
 638
 639     if (!array_allowed (allowed_tag_names, tag_name_begin, tag_name_end))
 640       /* We can't just say "goto look_for_tag" here because we need
 641          the loop below to properly advance over the tag's attributes.  */
 642       uninteresting_tag = 1;
 643     else
 644       {
 645         uninteresting_tag = 0;
 646         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
 647       }
 648
 649     /* Find the attributes. */
 650     while (1)
 651       {
 652         const char *attr_name_begin, *attr_name_end;
 653         const char *attr_value_begin, *attr_value_end;
 654         const char *attr_raw_value_begin, *attr_raw_value_end;
 655         int operation = AP_DOWNCASE; /* stupid compiler. */
 656
 657         SKIP_WS (p);
 658
 659         if (*p == '/')
 660           {
 661             /* A slash at this point means the tag is about to be
 662                closed.  This is legal in XML and has been popularized
 663                in HTML via XHTML.  */
 664             /* <foo a=b c=d /> */
 665             /*              ^  */
 666             ADVANCE (p);
 667             SKIP_WS (p);
 668             if (*p != '>')
 669               goto backout_tag;
 670           }
 671
 672         /* Check for end of tag definition. */
 673         if (*p == '>')
 674           break;
 675
 676         /* Establish bounds of attribute name. */
 677         attr_name_begin = p;    /* <foo bar ...> */
 678                                 /*      ^        */
 679         while (NAME_CHAR_P (*p))
 680           ADVANCE (p);
 681         attr_name_end = p;      /* <foo bar ...> */
 682                                 /*         ^     */
 683         if (attr_name_begin == attr_name_end)
 684           goto backout_tag;
 685
 686         /* Establish bounds of attribute value. */
 687         SKIP_WS (p);
 688         if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
 689           {
 690             /* Minimized attribute syntax allows `=' to be omitted.
 691                For example, <UL COMPACT> is a valid shorthand for <UL
 692                COMPACT="compact">.  Even if such attributes are not
 693                useful to Wget, we need to support them, so that the
 694                tags containing them can be parsed correctly. */
 695             attr_raw_value_begin = attr_value_begin = attr_name_begin;
 696             attr_raw_value_end = attr_value_end = attr_name_end;
 697           }
 698         else if (*p == '=')
 699           {
 700             ADVANCE (p);
 701             SKIP_WS (p);
 702             if (*p == '\"' || *p == '\'')
 703               {
 704                 int newline_seen = 0;
 705                 char quote_char = *p;
 706                 attr_raw_value_begin = p;
 707                 ADVANCE (p);
 708                 attr_value_begin = p; /* <foo bar="baz"> */
 709                                       /*           ^     */
 710                 while (*p != quote_char)
 711                   {
 712                     if (!newline_seen && *p == '\n')
 713                       {
 714                         /* If a newline is seen within the quotes, it
 715                            is most likely that someone forgot to close
 716                            the quote.  In that case, we back out to
 717                            the value beginning, and terminate the tag
 718                            at either `>' or the delimiter, whichever
 719                            comes first.  Such a tag terminated at `>'
 720                            is discarded.  */
 721                         p = attr_value_begin;
 722                         newline_seen = 1;
 723                         continue;
 724                       }
 725                     else if (newline_seen && *p == '>')
 726                       break;
 727                     ADVANCE (p);
 728                   }
 729                 attr_value_end = p; /* <foo bar="baz"> */
 730                                     /*              ^  */
 731                 if (*p == quote_char)
 732                   ADVANCE (p);
 733                 else
 734                   goto look_for_tag;
 735                 attr_raw_value_end = p; /* <foo bar="baz"> */
 736                                         /*               ^ */
 737                 /* The AP_SKIP_BLANKS part is not entirely correct,
 738                    because we don't want to skip blanks for all the
 739                    attribute values.  */
 740                 operation = AP_PROCESS_ENTITIES | AP_SKIP_BLANKS;
 741               }
 742             else
 743               {
 744                 attr_value_begin = p; /* <foo bar=baz> */
 745                                       /*          ^    */
 746                 /* According to SGML, a name token should consist only
 747                    of alphanumerics, . and -.  However, this is often
 748                    violated by, for instance, `%' in `width=75%'.
 749                    We'll be liberal and allow just about anything as
 750                    an attribute value.  */
 751                 while (!ISSPACE (*p) && *p != '>')
 752                   ADVANCE (p);
 753                 attr_value_end = p; /* <foo bar=baz qux=quix> */
 754                                     /*             ^          */
 755                 if (attr_value_begin == attr_value_end)
 756                   /* <foo bar=> */
 757                   /*          ^ */
 758                   goto backout_tag;
 759                 attr_raw_value_begin = attr_value_begin;
 760                 attr_raw_value_end = attr_value_end;
 761                 operation = AP_PROCESS_ENTITIES;
 762               }
 763           }
 764         else
 765           {
 766             /* We skipped the whitespace and found something that is
 767                neither `=' nor the beginning of the next attribute's
 768                name.  Back out.  */
 769             goto backout_tag;   /* <foo bar [... */
 770                                 /*          ^    */
 771           }
 772
 773         /* If we're not interested in the tag, don't bother with any
 774            of the attributes.  */
 775         if (uninteresting_tag)
 776           continue;
 777
 778         /* If we aren't interested in the attribute, skip it.  We
 779            cannot do this test any sooner, because our text pointer
 780            needs to correctly advance over the attribute.  */
 781         if (allowed_attribute_names
 782             && !array_allowed (allowed_attribute_names, attr_name_begin,
 783                                attr_name_end))
 784           continue;
 785
 786         DO_REALLOC_FROM_ALLOCA (pairs, attr_pair_count, nattrs + 1,
 787                                 attr_pair_alloca_p, struct attr_pair);
 788
 789         pairs[nattrs].name_pool_index = pool.index;
 790         convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
 791
 792         pairs[nattrs].value_pool_index = pool.index;
 793         convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
 794         pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
 795         pairs[nattrs].value_raw_size = (attr_raw_value_end
 796                                         - attr_raw_value_begin);
 797         ++nattrs;
 798       }
 799
 800     if (uninteresting_tag)
 801       {
 802         ADVANCE (p);
 803         goto look_for_tag;
 804       }
 805
 806     /* By now, we have a valid tag with a name and zero or more
 807        attributes.  Fill in the data and call the mapper function.  */
 808     {
 809       int i;
 810       struct taginfo taginfo;
 811
 812       taginfo.name      = pool.contents;
 813       taginfo.end_tag_p = end_tag;
 814       taginfo.nattrs    = nattrs;
 815       /* We fill in the char pointers only now, when pool can no
 816          longer get realloc'ed.  If we did that above, we could get
 817          hosed by reallocation.  Obviously, after this point, the pool
 818          may no longer be grown.  */
 819       for (i = 0; i < nattrs; i++)
 820         {
 821           pairs[i].name = pool.contents + pairs[i].name_pool_index;
 822           pairs[i].value = pool.contents + pairs[i].value_pool_index;
 823         }
 824       taginfo.attrs = pairs;
 825       taginfo.start_position = tag_start_position;
 826       taginfo.end_position   = p + 1;
 827       /* Ta-dam! */
 828       (*mapfun) (&taginfo, closure);
 829       ADVANCE (p);
 830     }
 831     goto look_for_tag;
 832
 833   backout_tag:
 834 #ifdef STANDALONE
 835     ++tag_backout_count;
 836 #endif
 837     /* The tag wasn't really a tag.  Treat its contents as ordinary
 838        data characters. */
 839     p = tag_start_position + 1;
 840     goto look_for_tag;
 841   }
 842
 843  finish:
 844   POOL_FREE (pool);
 845   if (!attr_pair_alloca_p)
 846     xfree (pairs);
 847 }
 848
 849 #undef ADVANCE
 850 #undef SKIP_WS
 851 #undef SKIP_NON_WS
 852 \f
 853 #ifdef STANDALONE
 854 static void
 855 test_mapper (struct taginfo *taginfo, void *arg)
 856 {
 857   int i;
 858
 859   printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
 860   for (i = 0; i < taginfo->nattrs; i++)
 861     printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
 862   putchar ('\n');
 863   ++*(int *)arg;
 864 }
 865
 866 int main ()
 867 {
 868   int size = 256;
 869   char *x = (char *)xmalloc (size);
 870   int length = 0;
 871   int read_count;
 872   int tag_counter = 0;
 873
 874   while ((read_count = fread (x + length, 1, size - length, stdin)))
 875     {
 876       length += read_count;
 877       size <<= 1;
 878       x = (char *)xrealloc (x, size);
 879     }
 880
 881   map_html_tags (x, length, NULL, NULL, test_mapper, &tag_counter);
 882   printf ("TAGS: %d\n", tag_counter);
 883   printf ("Tag backouts:     %d\n", tag_backout_count);
 884   printf ("Comment backouts: %d\n", comment_backout_count);
 885   return 0;
 886 }
 887 #endif /* STANDALONE */