sjero.net Git - wget/blob - src/html-parse.c

   1 /* HTML parser for Wget.
   2    Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 /* The only entry point to this module is map_html_tags(), which see.  */
  31
  32 /* TODO:
  33
  34    - Allow hooks for callers to process contents outside tags.  This
  35      is needed to implement handling <style> and <script>.  The
  36      taginfo structure already carries the information about where the
  37      tags are, but this is not enough, because one would also want to
  38      skip the comments.  (The funny thing is that for <style> and
  39      <script> you *don't* want to skip comments!)
  40
  41    - Create a test suite for regression testing. */
  42
  43 /* HISTORY:
  44
  45    This is the third HTML parser written for Wget.  The first one was
  46    written some time during the Geturl 1.0 beta cycle, and was very
  47    inefficient and buggy.  It also contained some very complex code to
  48    remember a list of parser states, because it was supposed to be
  49    reentrant.
  50
  51    The second HTML parser was written for Wget 1.4 (the first version
  52    by the name `Wget'), and was a complete rewrite.  Although the new
  53    parser behaved much better and made no claims of reentrancy, it
  54    still shared many of the fundamental flaws of the old version -- it
  55    only regarded HTML in terms of tag-attribute pairs, where the
  56    attribute's value was a URL to be returned.  Any other property of
  57    HTML, such as <base href=...>, or strange way to specify a URL,
  58    such as <meta http-equiv=Refresh content="0; URL=..."> had to be
  59    crudely hacked in -- and the caller had to be aware of these hacks.
  60    Like its predecessor, this parser did not support HTML comments.
  61
  62    After Wget 1.5.1 was released, I set out to write a third HTML
  63    parser.  The objectives of the new parser were to: (1) provide a
  64    clean way to analyze HTML lexically, (2) separate interpretation of
  65    the markup from the parsing process, (3) be as correct as possible,
  66    e.g. correctly skipping comments and other SGML declarations, (4)
  67    understand the most common errors in markup and skip them or be
  68    relaxed towards them, and (5) be reasonably efficient (no regexps,
  69    minimum copying and minimum or no heap allocation).
  70
  71    I believe this parser meets all of the above goals.  It is
  72    reasonably well structured, and could be relatively easily
  73    separated from Wget and used elsewhere.  While some of its
  74    intrinsic properties limit its value as a general-purpose HTML
  75    parser, I believe that, with minimum modifications, it could serve
  76    as a backend for one.
  77
  78    Due to time and other constraints, this parser was not integrated
  79    into Wget until the version 1.7. */
  80
  81 /* DESCRIPTION:
  82
  83    The single entry point of this parser is map_html_tags(), which
  84    works by calling a function you specify for each tag.  The function
  85    gets called with the pointer to a structure describing the tag and
  86    its attributes.  */
  87
  88 /* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
  89    still need Wget headers to compile.  */
  90
  91 #include <config.h>
  92
  93 #ifdef STANDALONE
  94 # define I_REALLY_WANT_CTYPE_MACROS
  95 #endif
  96
  97 #include <stdio.h>
  98 #include <stdlib.h>
  99 #ifdef HAVE_STRING_H
 100 # include <string.h>
 101 #else
 102 # include <strings.h>
 103 #endif
 104 #include <assert.h>
 105
 106 #include "wget.h"
 107 #include "html-parse.h"
 108
 109 #ifdef STANDALONE
 110 # undef xmalloc
 111 # undef xrealloc
 112 # undef xfree
 113 # define xmalloc malloc
 114 # define xrealloc realloc
 115 # define xfree free
 116
 117 # undef ISSPACE
 118 # undef ISDIGIT
 119 # undef ISXDIGIT
 120 # undef ISALPHA
 121 # undef ISALNUM
 122 # undef TOLOWER
 123 # undef TOUPPER
 124
 125 # define ISSPACE(x) isspace (x)
 126 # define ISDIGIT(x) isdigit (x)
 127 # define ISXDIGIT(x) isxdigit (x)
 128 # define ISALPHA(x) isalpha (x)
 129 # define ISALNUM(x) isalnum (x)
 130 # define TOLOWER(x) tolower (x)
 131 # define TOUPPER(x) toupper (x)
 132
 133 static struct options opt;
 134 #endif /* STANDALONE */
 135
 136 /* Pool support.  A pool is a resizable chunk of memory.  It is first
 137    allocated on the stack, and moved to the heap if it needs to be
 138    larger than originally expected.  map_html_tags() uses it to store
 139    the zero-terminated names and values of tags and attributes.
 140
 141    Thus taginfo->name, and attr->name and attr->value for each
 142    attribute, do not point into separately allocated areas, but into
 143    different parts of the pool, separated only by terminating zeros.
 144    This ensures minimum amount of allocation and, for most tags, no
 145    allocation because the entire pool is kept on the stack.  */
 146
 147 struct pool {
 148   char *contents;               /* pointer to the contents. */
 149   int size;                     /* size of the pool. */
 150   int tail;                     /* next available position index. */
 151   int resized;                  /* whether the pool has been resized
 152                                    using malloc. */
 153
 154   char *orig_contents;          /* original pool contents, usually
 155                                    stack-allocated.  used by POOL_FREE
 156                                    to restore the pool to the initial
 157                                    state. */
 158   int orig_size;
 159 };
 160
 161 /* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
 162
 163 #define POOL_INIT(p, initial_storage, initial_size) do {        \
 164   struct pool *P = (p);                                         \
 165   P->contents = (initial_storage);                              \
 166   P->size = (initial_size);                                     \
 167   P->tail = 0;                                                  \
 168   P->resized = 0;                                               \
 169   P->orig_contents = P->contents;                               \
 170   P->orig_size = P->size;                                       \
 171 } while (0)
 172
 173 /* Grow the pool to accomodate at least SIZE new bytes.  If the pool
 174    already has room to accomodate SIZE bytes of data, this is a no-op.  */
 175
 176 #define POOL_GROW(p, increase)                                  \
 177   GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + (increase), \
 178               (p)->resized, char)
 179
 180 /* Append text in the range [beg, end) to POOL.  No zero-termination
 181    is done.  */
 182
 183 #define POOL_APPEND(p, beg, end) do {                   \
 184   const char *PA_beg = (beg);                           \
 185   int PA_size = (end) - PA_beg;                         \
 186   POOL_GROW (p, PA_size);                               \
 187   memcpy ((p)->contents + (p)->tail, PA_beg, PA_size);  \
 188   (p)->tail += PA_size;                                 \
 189 } while (0)
 190
 191 /* Append one character to the pool.  Can be used to zero-terminate
 192    pool strings.  */
 193
 194 #define POOL_APPEND_CHR(p, ch) do {             \
 195   char PAC_char = (ch);                         \
 196   POOL_GROW (p, 1);                             \
 197   (p)->contents[(p)->tail++] = PAC_char;        \
 198 } while (0)
 199
 200 /* Forget old pool contents.  The allocated memory is not freed. */
 201 #define POOL_REWIND(p) (p)->tail = 0
 202
 203 /* Free heap-allocated memory for contents of POOL.  This calls
 204    xfree() if the memory was allocated through malloc.  It also
 205    restores `contents' and `size' to their original, pre-malloc
 206    values.  That way after POOL_FREE, the pool is fully usable, just
 207    as if it were freshly initialized with POOL_INIT.  */
 208
 209 #define POOL_FREE(p) do {                       \
 210   struct pool *P = p;                           \
 211   if (P->resized)                               \
 212     xfree (P->contents);                        \
 213   P->contents = P->orig_contents;               \
 214   P->size = P->orig_size;                       \
 215   P->tail = 0;                                  \
 216   P->resized = 0;                               \
 217 } while (0)
 218
 219 /* Used for small stack-allocated memory chunks that might grow.  Like
 220    DO_REALLOC, this macro grows BASEVAR as necessary to take
 221    NEEDED_SIZE items of TYPE.
 222
 223    The difference is that on the first resize, it will use
 224    malloc+memcpy rather than realloc.  That way you can stack-allocate
 225    the initial chunk, and only resort to heap allocation if you
 226    stumble upon large data.
 227
 228    After the first resize, subsequent ones are performed with realloc,
 229    just like DO_REALLOC.  */
 230
 231 #define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do {           \
 232   long ga_needed_size = (needed_size);                                          \
 233   long ga_newsize = (sizevar);                                                  \
 234   while (ga_newsize < ga_needed_size)                                           \
 235     ga_newsize <<= 1;                                                           \
 236   if (ga_newsize != (sizevar))                                                  \
 237     {                                                                           \
 238       if (resized)                                                              \
 239         basevar = (type *)xrealloc (basevar, ga_newsize * sizeof (type));       \
 240       else                                                                      \
 241         {                                                                       \
 242           void *ga_new = xmalloc (ga_newsize * sizeof (type));                  \
 243           memcpy (ga_new, basevar, (sizevar) * sizeof (type));                  \
 244           (basevar) = ga_new;                                                   \
 245           resized = 1;                                                          \
 246         }                                                                       \
 247       (sizevar) = ga_newsize;                                                   \
 248     }                                                                           \
 249 } while (0)
 250 \f
 251 #define AP_DOWNCASE             1
 252 #define AP_PROCESS_ENTITIES     2
 253 #define AP_TRIM_BLANKS          4
 254
 255 /* Copy the text in the range [BEG, END) to POOL, optionally
 256    performing operations specified by FLAGS.  FLAGS may be any
 257    combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_TRIM_BLANKS
 258    with the following meaning:
 259
 260    * AP_DOWNCASE -- downcase all the letters;
 261
 262    * AP_PROCESS_ENTITIES -- process the SGML entities and write out
 263    the decoded string.  Recognized entities are &lt, &gt, &amp, &quot,
 264    &nbsp and the numerical entities.
 265
 266    * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
 267    of text.  */
 268
 269 static void
 270 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
 271 {
 272   int old_tail = pool->tail;
 273   int size;
 274
 275   /* First, skip blanks if required.  We must do this before entities
 276      are processed, so that blanks can still be inserted as, for
 277      instance, `&#32;'.  */
 278   if (flags & AP_TRIM_BLANKS)
 279     {
 280       while (beg < end && ISSPACE (*beg))
 281         ++beg;
 282       while (end > beg && ISSPACE (end[-1]))
 283         --end;
 284     }
 285   size = end - beg;
 286
 287   if (flags & AP_PROCESS_ENTITIES)
 288     {
 289       /* Grow the pool, then copy the text to the pool character by
 290          character, processing the encountered entities as we go
 291          along.
 292
 293          It's safe (and necessary) to grow the pool in advance because
 294          processing the entities can only *shorten* the string, it can
 295          never lengthen it.  */
 296       const char *from = beg;
 297       char *to;
 298
 299       POOL_GROW (pool, end - beg);
 300       to = pool->contents + pool->tail;
 301
 302       while (from < end)
 303         {
 304           if (*from != '&')
 305             *to++ = *from++;
 306           else
 307             {
 308               const char *save = from;
 309               int remain;
 310
 311               if (++from == end)
 312                 goto lose;
 313               remain = end - from;
 314
 315               /* Process numeric entities "&#DDD;" and "&#xHH;".  */
 316               if (*from == '#')
 317                 {
 318                   int numeric = 0, digits = 0;
 319                   ++from;
 320                   if (*from == 'x')
 321                     {
 322                       ++from;
 323                       for (; from < end && ISXDIGIT (*from); from++, digits++)
 324                         numeric = (numeric << 4) + XDIGIT_TO_NUM (*from);
 325                     }
 326                   else
 327                     {
 328                       for (; from < end && ISDIGIT (*from); from++, digits++)
 329                         numeric = (numeric * 10) + (*from - '0');
 330                     }
 331                   if (!digits)
 332                     goto lose;
 333                   numeric &= 0xff;
 334                   *to++ = numeric;
 335                 }
 336 #define FROB(x) (remain >= (sizeof (x) - 1)                     \
 337                  && 0 == memcmp (from, x, sizeof (x) - 1)       \
 338                  && (*(from + sizeof (x) - 1) == ';'            \
 339                      || remain == sizeof (x) - 1                \
 340                      || !ISALNUM (*(from + sizeof (x) - 1))))
 341               else if (FROB ("lt"))
 342                 *to++ = '<', from += 2;
 343               else if (FROB ("gt"))
 344                 *to++ = '>', from += 2;
 345               else if (FROB ("amp"))
 346                 *to++ = '&', from += 3;
 347               else if (FROB ("quot"))
 348                 *to++ = '\"', from += 4;
 349               /* We don't implement the proposed "Added Latin 1"
 350                  entities (except for nbsp), because it is unnecessary
 351                  in the context of Wget, and would require hashing to
 352                  work efficiently.  */
 353               else if (FROB ("nbsp"))
 354                 *to++ = 160, from += 4;
 355               else
 356                 goto lose;
 357 #undef FROB
 358               /* If the entity was followed by `;', we step over the
 359                  `;'.  Otherwise, it was followed by either a
 360                  non-alphanumeric or EOB, in which case we do nothing.  */
 361               if (from < end && *from == ';')
 362                 ++from;
 363               continue;
 364
 365             lose:
 366               /* This was not an entity after all.  Back out.  */
 367               from = save;
 368               *to++ = *from++;
 369             }
 370         }
 371       /* Verify that we haven't exceeded the original size.  (It
 372          shouldn't happen, hence the assert.)  */
 373       assert (to - (pool->contents + pool->tail) <= end - beg);
 374
 375       /* Make POOL's tail point to the position following the string
 376          we've written.  */
 377       pool->tail = to - pool->contents;
 378       POOL_APPEND_CHR (pool, '\0');
 379     }
 380   else
 381     {
 382       /* Just copy the text to the pool.  */
 383       POOL_APPEND (pool, beg, end);
 384       POOL_APPEND_CHR (pool, '\0');
 385     }
 386
 387   if (flags & AP_DOWNCASE)
 388     {
 389       char *p = pool->contents + old_tail;
 390       for (; *p; p++)
 391         *p = TOLOWER (*p);
 392     }
 393 }
 394 \f
 395 /* Check whether the contents of [POS, POS+LENGTH) match any of the
 396    strings in the ARRAY.  */
 397 static int
 398 array_allowed (const char **array, const char *beg, const char *end)
 399 {
 400   int length = end - beg;
 401   if (array)
 402     {
 403       for (; *array; array++)
 404         if (length >= strlen (*array)
 405             && !strncasecmp (*array, beg, length))
 406           break;
 407       if (!*array)
 408         return 0;
 409     }
 410   return 1;
 411 }
 412 \f
 413 /* Originally we used to adhere to rfc 1866 here, and allowed only
 414    letters, digits, periods, and hyphens as names (of tags or
 415    attributes).  However, this broke too many pages which used
 416    proprietary or strange attributes, e.g. <img src="a.gif"
 417    v:shapes="whatever">.
 418
 419    So now we allow any character except:
 420      * whitespace
 421      * 8-bit and control chars
 422      * characters that clearly cannot be part of name:
 423        '=', '>', '/'.
 424
 425    This only affects attribute and tag names; attribute values allow
 426    an even greater variety of characters.  */
 427
 428 #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
 429                         && (x) != '=' && (x) != '>' && (x) != '/')
 430
 431 #ifdef STANDALONE
 432 static int comment_backout_count;
 433 #endif
 434
 435 /* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
 436    strict comments mode, this is used for skipping over comments as
 437    well.
 438
 439    To recap: any SGML declaration may have comments associated with
 440    it, e.g.
 441        <!MY-DECL -- isn't this fun? -- foo bar>
 442
 443    An HTML comment is merely an empty declaration (<!>) with a comment
 444    attached, like this:
 445        <!-- some stuff here -->
 446
 447    Several comments may be embedded in one comment declaration:
 448        <!-- have -- -- fun -->
 449
 450    Whitespace is allowed between and after the comments, but not
 451    before the first comment.  Additionally, this function attempts to
 452    handle double quotes in SGML declarations correctly.  */
 453
 454 static const char *
 455 advance_declaration (const char *beg, const char *end)
 456 {
 457   const char *p = beg;
 458   char quote_char = '\0';       /* shut up, gcc! */
 459   char ch;
 460
 461   enum {
 462     AC_S_DONE,
 463     AC_S_BACKOUT,
 464     AC_S_BANG,
 465     AC_S_DEFAULT,
 466     AC_S_DCLNAME,
 467     AC_S_DASH1,
 468     AC_S_DASH2,
 469     AC_S_COMMENT,
 470     AC_S_DASH3,
 471     AC_S_DASH4,
 472     AC_S_QUOTE1,
 473     AC_S_IN_QUOTE,
 474     AC_S_QUOTE2,
 475   } state = AC_S_BANG;
 476
 477   if (beg == end)
 478     return beg;
 479   ch = *p++;
 480
 481   /* It looked like a good idea to write this as a state machine, but
 482      now I wonder...  */
 483
 484   while (state != AC_S_DONE && state != AC_S_BACKOUT)
 485     {
 486       if (p == end)
 487         state = AC_S_BACKOUT;
 488       switch (state)
 489         {
 490         case AC_S_DONE:
 491         case AC_S_BACKOUT:
 492           break;
 493         case AC_S_BANG:
 494           if (ch == '!')
 495             {
 496               ch = *p++;
 497               state = AC_S_DEFAULT;
 498             }
 499           else
 500             state = AC_S_BACKOUT;
 501           break;
 502         case AC_S_DEFAULT:
 503           switch (ch)
 504             {
 505             case '-':
 506               state = AC_S_DASH1;
 507               break;
 508             case ' ':
 509             case '\t':
 510             case '\r':
 511             case '\n':
 512               ch = *p++;
 513               break;
 514             case '>':
 515               state = AC_S_DONE;
 516               break;
 517             case '\'':
 518             case '\"':
 519               state = AC_S_QUOTE1;
 520               break;
 521             default:
 522               if (NAME_CHAR_P (ch))
 523                 state = AC_S_DCLNAME;
 524               else
 525                 state = AC_S_BACKOUT;
 526               break;
 527             }
 528           break;
 529         case AC_S_DCLNAME:
 530           if (ch == '-')
 531             state = AC_S_DASH1;
 532           else if (NAME_CHAR_P (ch))
 533             ch = *p++;
 534           else
 535             state = AC_S_DEFAULT;
 536           break;
 537         case AC_S_QUOTE1:
 538           /* We must use 0x22 because broken assert macros choke on
 539              '"' and '\"'.  */
 540           assert (ch == '\'' || ch == 0x22);
 541           quote_char = ch;      /* cheating -- I really don't feel like
 542                                    introducing more different states for
 543                                    different quote characters. */
 544           ch = *p++;
 545           state = AC_S_IN_QUOTE;
 546           break;
 547         case AC_S_IN_QUOTE:
 548           if (ch == quote_char)
 549             state = AC_S_QUOTE2;
 550           else
 551             ch = *p++;
 552           break;
 553         case AC_S_QUOTE2:
 554           assert (ch == quote_char);
 555           ch = *p++;
 556           state = AC_S_DEFAULT;
 557           break;
 558         case AC_S_DASH1:
 559           assert (ch == '-');
 560           ch = *p++;
 561           state = AC_S_DASH2;
 562           break;
 563         case AC_S_DASH2:
 564           switch (ch)
 565             {
 566             case '-':
 567               ch = *p++;
 568               state = AC_S_COMMENT;
 569               break;
 570             default:
 571               state = AC_S_BACKOUT;
 572             }
 573           break;
 574         case AC_S_COMMENT:
 575           switch (ch)
 576             {
 577             case '-':
 578               state = AC_S_DASH3;
 579               break;
 580             default:
 581               ch = *p++;
 582               break;
 583             }
 584           break;
 585         case AC_S_DASH3:
 586           assert (ch == '-');
 587           ch = *p++;
 588           state = AC_S_DASH4;
 589           break;
 590         case AC_S_DASH4:
 591           switch (ch)
 592             {
 593             case '-':
 594               ch = *p++;
 595               state = AC_S_DEFAULT;
 596               break;
 597             default:
 598               state = AC_S_COMMENT;
 599               break;
 600             }
 601           break;
 602         }
 603     }
 604
 605   if (state == AC_S_BACKOUT)
 606     {
 607 #ifdef STANDALONE
 608       ++comment_backout_count;
 609 #endif
 610       return beg + 1;
 611     }
 612   return p;
 613 }
 614
 615 /* Find the first occurrence of the substring "-->" in [BEG, END) and
 616    return the pointer to the character after the substring.  If the
 617    substring is not found, return NULL.  */
 618
 619 static const char *
 620 find_comment_end (const char *beg, const char *end)
 621 {
 622   /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
 623      if it's not '>' or '-', advance by three characters.  Otherwise,
 624      look at the preceding characters and try to find a match.  */
 625
 626   const char *p = beg - 1;
 627
 628   while ((p += 3) < end)
 629     switch (p[0])
 630       {
 631       case '>':
 632         if (p[-1] == '-' && p[-2] == '-')
 633           return p + 1;
 634         break;
 635       case '-':
 636       at_dash:
 637         if (p[-1] == '-')
 638           {
 639           at_dash_dash:
 640             if (++p == end) return NULL;
 641             switch (p[0])
 642               {
 643               case '>': return p + 1;
 644               case '-': goto at_dash_dash;
 645               }
 646           }
 647         else
 648           {
 649             if ((p += 2) >= end) return NULL;
 650             switch (p[0])
 651               {
 652               case '>':
 653                 if (p[-1] == '-')
 654                   return p + 1;
 655                 break;
 656               case '-':
 657                 goto at_dash;
 658               }
 659           }
 660       }
 661   return NULL;
 662 }
 663 \f
 664 /* Advance P (a char pointer), with the explicit intent of being able
 665    to read the next character.  If this is not possible, go to finish.  */
 666
 667 #define ADVANCE(p) do {                         \
 668   ++p;                                          \
 669   if (p >= end)                                 \
 670     goto finish;                                \
 671 } while (0)
 672
 673 /* Skip whitespace, if any. */
 674
 675 #define SKIP_WS(p) do {                         \
 676   while (ISSPACE (*p)) {                        \
 677     ADVANCE (p);                                \
 678   }                                             \
 679 } while (0)
 680
 681 /* Skip non-whitespace, if any. */
 682
 683 #define SKIP_NON_WS(p) do {                     \
 684   while (!ISSPACE (*p)) {                       \
 685     ADVANCE (p);                                \
 686   }                                             \
 687 } while (0)
 688
 689 #ifdef STANDALONE
 690 static int tag_backout_count;
 691 #endif
 692
 693 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
 694    MAPFUN will be called with two arguments: pointer to an initialized
 695    struct taginfo, and CLOSURE.
 696
 697    ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to
 698    be processed by this function.  If it is NULL, all the tags are
 699    allowed.  The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES.
 700
 701    (Obviously, the caller can filter out unwanted tags and attributes
 702    just as well, but this is just an optimization designed to avoid
 703    unnecessary copying for tags/attributes which the caller doesn't
 704    want to know about.  These lists are searched linearly; therefore,
 705    if you're interested in a large number of tags or attributes, you'd
 706    better set these to NULL and filter them out yourself with a
 707    hashing process most appropriate for your application.)  */
 708
 709 void
 710 map_html_tags (const char *text, int size,
 711                const char **allowed_tag_names,
 712                const char **allowed_attribute_names,
 713                void (*mapfun) (struct taginfo *, void *),
 714                void *closure)
 715 {
 716   /* storage for strings passed to MAPFUN callback; if 256 bytes is
 717      too little, POOL_APPEND allocates more with malloc. */
 718   char pool_initial_storage[256];
 719   struct pool pool;
 720
 721   const char *p = text;
 722   const char *end = text + size;
 723
 724   struct attr_pair attr_pair_initial_storage[8];
 725   int attr_pair_size = countof (attr_pair_initial_storage);
 726   int attr_pair_resized = 0;
 727   struct attr_pair *pairs = attr_pair_initial_storage;
 728
 729   if (!size)
 730     return;
 731
 732   POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));
 733
 734   {
 735     int nattrs, end_tag;
 736     const char *tag_name_begin, *tag_name_end;
 737     const char *tag_start_position;
 738     int uninteresting_tag;
 739
 740   look_for_tag:
 741     POOL_REWIND (&pool);
 742
 743     nattrs = 0;
 744     end_tag = 0;
 745
 746     /* Find beginning of tag.  We use memchr() instead of the usual
 747        looping with ADVANCE() for speed. */
 748     p = memchr (p, '<', end - p);
 749     if (!p)
 750       goto finish;
 751
 752     tag_start_position = p;
 753     ADVANCE (p);
 754
 755     /* Establish the type of the tag (start-tag, end-tag or
 756        declaration).  */
 757     if (*p == '!')
 758       {
 759         if (!opt.strict_comments
 760             && p < end + 3 && p[1] == '-' && p[2] == '-')
 761           {
 762             /* If strict comments are not enforced and if we know
 763                we're looking at a comment, simply look for the
 764                terminating "-->".  Non-strict is the default because
 765                it works in other browsers and most HTML writers can't
 766                be bothered with getting the comments right.  */
 767             const char *comment_end = find_comment_end (p + 3, end);
 768             if (comment_end)
 769               p = comment_end;
 770           }
 771         else
 772           {
 773             /* Either in strict comment mode or looking at a non-empty
 774                declaration.  Real declarations are much less likely to
 775                be misused the way comments are, so advance over them
 776                properly regardless of strictness.  */
 777             p = advance_declaration (p, end);
 778           }
 779         if (p == end)
 780           goto finish;
 781         goto look_for_tag;
 782       }
 783     else if (*p == '/')
 784       {
 785         end_tag = 1;
 786         ADVANCE (p);
 787       }
 788     tag_name_begin = p;
 789     while (NAME_CHAR_P (*p))
 790       ADVANCE (p);
 791     if (p == tag_name_begin)
 792       goto look_for_tag;
 793     tag_name_end = p;
 794     SKIP_WS (p);
 795     if (end_tag && *p != '>')
 796       goto backout_tag;
 797
 798     if (!array_allowed (allowed_tag_names, tag_name_begin, tag_name_end))
 799       /* We can't just say "goto look_for_tag" here because we need
 800          the loop below to properly advance over the tag's attributes.  */
 801       uninteresting_tag = 1;
 802     else
 803       {
 804         uninteresting_tag = 0;
 805         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
 806       }
 807
 808     /* Find the attributes. */
 809     while (1)
 810       {
 811         const char *attr_name_begin, *attr_name_end;
 812         const char *attr_value_begin, *attr_value_end;
 813         const char *attr_raw_value_begin, *attr_raw_value_end;
 814         int operation = AP_DOWNCASE; /* stupid compiler. */
 815
 816         SKIP_WS (p);
 817
 818         if (*p == '/')
 819           {
 820             /* A slash at this point means the tag is about to be
 821                closed.  This is legal in XML and has been popularized
 822                in HTML via XHTML.  */
 823             /* <foo a=b c=d /> */
 824             /*              ^  */
 825             ADVANCE (p);
 826             SKIP_WS (p);
 827             if (*p != '>')
 828               goto backout_tag;
 829           }
 830
 831         /* Check for end of tag definition. */
 832         if (*p == '>')
 833           break;
 834
 835         /* Establish bounds of attribute name. */
 836         attr_name_begin = p;    /* <foo bar ...> */
 837                                 /*      ^        */
 838         while (NAME_CHAR_P (*p))
 839           ADVANCE (p);
 840         attr_name_end = p;      /* <foo bar ...> */
 841                                 /*         ^     */
 842         if (attr_name_begin == attr_name_end)
 843           goto backout_tag;
 844
 845         /* Establish bounds of attribute value. */
 846         SKIP_WS (p);
 847         if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
 848           {
 849             /* Minimized attribute syntax allows `=' to be omitted.
 850                For example, <UL COMPACT> is a valid shorthand for <UL
 851                COMPACT="compact">.  Even if such attributes are not
 852                useful to Wget, we need to support them, so that the
 853                tags containing them can be parsed correctly. */
 854             attr_raw_value_begin = attr_value_begin = attr_name_begin;
 855             attr_raw_value_end = attr_value_end = attr_name_end;
 856           }
 857         else if (*p == '=')
 858           {
 859             ADVANCE (p);
 860             SKIP_WS (p);
 861             if (*p == '\"' || *p == '\'')
 862               {
 863                 int newline_seen = 0;
 864                 char quote_char = *p;
 865                 attr_raw_value_begin = p;
 866                 ADVANCE (p);
 867                 attr_value_begin = p; /* <foo bar="baz"> */
 868                                       /*           ^     */
 869                 while (*p != quote_char)
 870                   {
 871                     if (!newline_seen && *p == '\n')
 872                       {
 873                         /* If a newline is seen within the quotes, it
 874                            is most likely that someone forgot to close
 875                            the quote.  In that case, we back out to
 876                            the value beginning, and terminate the tag
 877                            at either `>' or the delimiter, whichever
 878                            comes first.  Such a tag terminated at `>'
 879                            is discarded.  */
 880                         p = attr_value_begin;
 881                         newline_seen = 1;
 882                         continue;
 883                       }
 884                     else if (newline_seen && *p == '>')
 885                       break;
 886                     ADVANCE (p);
 887                   }
 888                 attr_value_end = p; /* <foo bar="baz"> */
 889                                     /*              ^  */
 890                 if (*p == quote_char)
 891                   ADVANCE (p);
 892                 else
 893                   goto look_for_tag;
 894                 attr_raw_value_end = p; /* <foo bar="baz"> */
 895                                         /*               ^ */
 896                 /* The AP_TRIM_BLANKS is there for buggy HTML
 897                    generators that generate <a href=" foo"> instead of
 898                    <a href="foo"> (Netscape ignores spaces as well.)
 899                    If you really mean space, use &32; or %20.  */
 900                 operation = AP_PROCESS_ENTITIES | AP_TRIM_BLANKS;
 901               }
 902             else
 903               {
 904                 attr_value_begin = p; /* <foo bar=baz> */
 905                                       /*          ^    */
 906                 /* According to SGML, a name token should consist only
 907                    of alphanumerics, . and -.  However, this is often
 908                    violated by, for instance, `%' in `width=75%'.
 909                    We'll be liberal and allow just about anything as
 910                    an attribute value.  */
 911                 while (!ISSPACE (*p) && *p != '>')
 912                   ADVANCE (p);
 913                 attr_value_end = p; /* <foo bar=baz qux=quix> */
 914                                     /*             ^          */
 915                 if (attr_value_begin == attr_value_end)
 916                   /* <foo bar=> */
 917                   /*          ^ */
 918                   goto backout_tag;
 919                 attr_raw_value_begin = attr_value_begin;
 920                 attr_raw_value_end = attr_value_end;
 921                 operation = AP_PROCESS_ENTITIES;
 922               }
 923           }
 924         else
 925           {
 926             /* We skipped the whitespace and found something that is
 927                neither `=' nor the beginning of the next attribute's
 928                name.  Back out.  */
 929             goto backout_tag;   /* <foo bar [... */
 930                                 /*          ^    */
 931           }
 932
 933         /* If we're not interested in the tag, don't bother with any
 934            of the attributes.  */
 935         if (uninteresting_tag)
 936           continue;
 937
 938         /* If we aren't interested in the attribute, skip it.  We
 939            cannot do this test any sooner, because our text pointer
 940            needs to correctly advance over the attribute.  */
 941         if (allowed_attribute_names
 942             && !array_allowed (allowed_attribute_names, attr_name_begin,
 943                                attr_name_end))
 944           continue;
 945
 946         GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
 947                     struct attr_pair);
 948
 949         pairs[nattrs].name_pool_index = pool.tail;
 950         convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
 951
 952         pairs[nattrs].value_pool_index = pool.tail;
 953         convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
 954         pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
 955         pairs[nattrs].value_raw_size = (attr_raw_value_end
 956                                         - attr_raw_value_begin);
 957         ++nattrs;
 958       }
 959
 960     if (uninteresting_tag)
 961       {
 962         ADVANCE (p);
 963         goto look_for_tag;
 964       }
 965
 966     /* By now, we have a valid tag with a name and zero or more
 967        attributes.  Fill in the data and call the mapper function.  */
 968     {
 969       int i;
 970       struct taginfo taginfo;
 971
 972       taginfo.name      = pool.contents;
 973       taginfo.end_tag_p = end_tag;
 974       taginfo.nattrs    = nattrs;
 975       /* We fill in the char pointers only now, when pool can no
 976          longer get realloc'ed.  If we did that above, we could get
 977          hosed by reallocation.  Obviously, after this point, the pool
 978          may no longer be grown.  */
 979       for (i = 0; i < nattrs; i++)
 980         {
 981           pairs[i].name = pool.contents + pairs[i].name_pool_index;
 982           pairs[i].value = pool.contents + pairs[i].value_pool_index;
 983         }
 984       taginfo.attrs = pairs;
 985       taginfo.start_position = tag_start_position;
 986       taginfo.end_position   = p + 1;
 987       /* Ta-dam! */
 988       (*mapfun) (&taginfo, closure);
 989       ADVANCE (p);
 990     }
 991     goto look_for_tag;
 992
 993   backout_tag:
 994 #ifdef STANDALONE
 995     ++tag_backout_count;
 996 #endif
 997     /* The tag wasn't really a tag.  Treat its contents as ordinary
 998        data characters. */
 999     p = tag_start_position + 1;
1000     goto look_for_tag;
1001   }
1002
1003  finish:
1004   POOL_FREE (&pool);
1005   if (attr_pair_resized)
1006     xfree (pairs);
1007 }
1008
1009 #undef ADVANCE
1010 #undef SKIP_WS
1011 #undef SKIP_NON_WS
1012 \f
1013 #ifdef STANDALONE
1014 static void
1015 test_mapper (struct taginfo *taginfo, void *arg)
1016 {
1017   int i;
1018
1019   printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
1020   for (i = 0; i < taginfo->nattrs; i++)
1021     printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
1022   putchar ('\n');
1023   ++*(int *)arg;
1024 }
1025
1026 int main ()
1027 {
1028   int size = 256;
1029   char *x = (char *)xmalloc (size);
1030   int length = 0;
1031   int read_count;
1032   int tag_counter = 0;
1033
1034   while ((read_count = fread (x + length, 1, size - length, stdin)))
1035     {
1036       length += read_count;
1037       size <<= 1;
1038       x = (char *)xrealloc (x, size);
1039     }
1040
1041   map_html_tags (x, length, NULL, NULL, test_mapper, &tag_counter);
1042   printf ("TAGS: %d\n", tag_counter);
1043   printf ("Tag backouts:     %d\n", tag_backout_count);
1044   printf ("Comment backouts: %d\n", comment_backout_count);
1045   return 0;
1046 }
1047 #endif /* STANDALONE */