sjero.net Git - wget/blob - src/html-parse.c

   1 /* HTML parser for Wget.
   2    Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 /* The only entry point to this module is map_html_tags(), which see.  */
  31
  32 /* TODO:
  33
  34    - Allow hooks for callers to process contents outside tags.  This
  35      is needed to implement handling <style> and <script>.  The
  36      taginfo structure already carries the information about where the
  37      tags are, but this is not enough, because one would also want to
  38      skip the comments.  (The funny thing is that for <style> and
  39      <script> you *don't* want to skip comments!)
  40
  41    - Create a test suite for regression testing. */
  42
  43 /* HISTORY:
  44
  45    This is the third HTML parser written for Wget.  The first one was
  46    written some time during the Geturl 1.0 beta cycle, and was very
  47    inefficient and buggy.  It also contained some very complex code to
  48    remember a list of parser states, because it was supposed to be
  49    reentrant.
  50
  51    The second HTML parser was written for Wget 1.4 (the first version
  52    by the name `Wget'), and was a complete rewrite.  Although the new
  53    parser behaved much better and made no claims of reentrancy, it
  54    still shared many of the fundamental flaws of the old version -- it
  55    only regarded HTML in terms of tag-attribute pairs, where the
  56    attribute's value was a URL to be returned.  Any other property of
  57    HTML, such as <base href=...>, or strange way to specify a URL,
  58    such as <meta http-equiv=Refresh content="0; URL=..."> had to be
  59    crudely hacked in -- and the caller had to be aware of these hacks.
  60    Like its predecessor, this parser did not support HTML comments.
  61
  62    After Wget 1.5.1 was released, I set out to write a third HTML
  63    parser.  The objectives of the new parser were to: (1) provide a
  64    clean way to analyze HTML lexically, (2) separate interpretation of
  65    the markup from the parsing process, (3) be as correct as possible,
  66    e.g. correctly skipping comments and other SGML declarations, (4)
  67    understand the most common errors in markup and skip them or be
  68    relaxed towards them, and (5) be reasonably efficient (no regexps,
  69    minimum copying and minimum or no heap allocation).
  70
  71    I believe this parser meets all of the above goals.  It is
  72    reasonably well structured, and could be relatively easily
  73    separated from Wget and used elsewhere.  While some of its
  74    intrinsic properties limit its value as a general-purpose HTML
  75    parser, I believe that, with minimum modifications, it could serve
  76    as a backend for one.
  77
  78    Due to time and other constraints, this parser was not integrated
  79    into Wget until the version 1.7. */
  80
  81 /* DESCRIPTION:
  82
  83    The single entry point of this parser is map_html_tags(), which
  84    works by calling a function you specify for each tag.  The function
  85    gets called with the pointer to a structure describing the tag and
  86    its attributes.  */
  87
  88 /* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
  89    still need Wget headers to compile.  */
  90
  91 #include <config.h>
  92
  93 #ifdef STANDALONE
  94 # define I_REALLY_WANT_CTYPE_MACROS
  95 #endif
  96
  97 #include <stdio.h>
  98 #include <stdlib.h>
  99 #ifdef HAVE_STRING_H
 100 # include <string.h>
 101 #else
 102 # include <strings.h>
 103 #endif
 104 #include <assert.h>
 105
 106 #include "wget.h"
 107 #include "html-parse.h"
 108
 109 #ifdef STANDALONE
 110 # undef xmalloc
 111 # undef xrealloc
 112 # undef xfree
 113 # define xmalloc malloc
 114 # define xrealloc realloc
 115 # define xfree free
 116
 117 # undef ISSPACE
 118 # undef ISDIGIT
 119 # undef ISXDIGIT
 120 # undef ISALPHA
 121 # undef ISALNUM
 122 # undef TOLOWER
 123 # undef TOUPPER
 124
 125 # define ISSPACE(x) isspace (x)
 126 # define ISDIGIT(x) isdigit (x)
 127 # define ISXDIGIT(x) isxdigit (x)
 128 # define ISALPHA(x) isalpha (x)
 129 # define ISALNUM(x) isalnum (x)
 130 # define TOLOWER(x) tolower (x)
 131 # define TOUPPER(x) toupper (x)
 132
 133 static struct options opt;
 134 #endif /* STANDALONE */
 135
 136 /* Pool support.  A pool is a resizable chunk of memory.  It is first
 137    allocated on the stack, and moved to the heap if it needs to be
 138    larger than originally expected.  map_html_tags() uses it to store
 139    the zero-terminated names and values of tags and attributes.
 140
 141    Thus taginfo->name, and attr->name and attr->value for each
 142    attribute, do not point into separately allocated areas, but into
 143    different parts of the pool, separated only by terminating zeros.
 144    This ensures minimum amount of allocation and, for most tags, no
 145    allocation because the entire pool is kept on the stack.  */
 146
 147 struct pool {
 148   char *contents;               /* pointer to the contents. */
 149   int size;                     /* size of the pool. */
 150   int tail;                     /* next available position index. */
 151   int resized;                  /* whether the pool has been resized
 152                                    using malloc. */
 153
 154   char *orig_contents;          /* original pool contents, usually
 155                                    stack-allocated.  used by POOL_FREE
 156                                    to restore the pool to the initial
 157                                    state. */
 158   int orig_size;
 159 };
 160
 161 /* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
 162
 163 #define POOL_INIT(p, initial_storage, initial_size) do {        \
 164   struct pool *P = (p);                                         \
 165   P->contents = (initial_storage);                              \
 166   P->size = (initial_size);                                     \
 167   P->tail = 0;                                                  \
 168   P->resized = 0;                                               \
 169   P->orig_contents = P->contents;                               \
 170   P->orig_size = P->size;                                       \
 171 } while (0)
 172
 173 /* Grow the pool to accomodate at least SIZE new bytes.  If the pool
 174    already has room to accomodate SIZE bytes of data, this is a no-op.  */
 175
 176 #define POOL_GROW(p, increase)                                  \
 177   GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + increase,   \
 178               (p)->resized, char)
 179
 180 /* Append text in the range [beg, end) to POOL.  No zero-termination
 181    is done.  */
 182
 183 #define POOL_APPEND(p, beg, end) do {                   \
 184   const char *PA_beg = (beg);                           \
 185   int PA_size = (end) - PA_beg;                         \
 186   POOL_GROW (p, PA_size);                               \
 187   memcpy ((p)->contents + (p)->tail, PA_beg, PA_size);  \
 188   (p)->tail += PA_size;                                 \
 189 } while (0)
 190
 191 /* Append one character to the pool.  Can be used to zero-terminate
 192    pool strings.  */
 193
 194 #define POOL_APPEND_CHR(p, ch) do {             \
 195   char PAC_char = (ch);                         \
 196   POOL_GROW (p, 1);                             \
 197   (p)->contents[(p)->tail++] = PAC_char;        \
 198 } while (0)
 199
 200 /* Forget old pool contents.  The allocated memory is not freed. */
 201 #define POOL_REWIND(p) (p)->tail = 0
 202
 203 /* Free heap-allocated memory for contents of POOL.  This calls
 204    xfree() if the memory was allocated through malloc.  It also
 205    restores `contents' and `size' to their original, pre-malloc
 206    values.  That way after POOL_FREE, the pool is fully usable, just
 207    as if it were freshly initialized with POOL_INIT.  */
 208
 209 #define POOL_FREE(p) do {                       \
 210   struct pool *P = p;                           \
 211   if (P->resized)                               \
 212     xfree (P->contents);                        \
 213   P->contents = P->orig_contents;               \
 214   P->size = P->orig_size;                       \
 215   P->tail = 0;                                  \
 216   P->resized = 0;                               \
 217 } while (0)
 218
 219 /* Used for small stack-allocated memory chunks that might grow.  Like
 220    DO_REALLOC, this macro grows BASEVAR as necessary to take
 221    NEEDED_SIZE items of TYPE.
 222
 223    The difference is that on the first resize, it will use
 224    malloc+memcpy rather than realloc.  That way you can stack-allocate
 225    the initial chunk, and only resort to heap allocation if you
 226    stumble upon large data.
 227
 228    After the first resize, subsequent ones are performed with realloc,
 229    just like DO_REALLOC.  */
 230
 231 #define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do {           \
 232   long ga_needed_size = (needed_size);                                          \
 233   long ga_newsize = (sizevar);                                                  \
 234   while (ga_newsize < ga_needed_size)                                           \
 235     ga_newsize <<= 1;                                                           \
 236   if (ga_newsize != (sizevar))                                                  \
 237     {                                                                           \
 238       if (resized)                                                              \
 239         basevar = (type *)xrealloc (basevar, ga_newsize * sizeof (type));       \
 240       else                                                                      \
 241         {                                                                       \
 242           void *ga_new = xmalloc (ga_newsize * sizeof (type));                  \
 243           memcpy (ga_new, basevar, (sizevar) * sizeof (type));                  \
 244           (basevar) = ga_new;                                                   \
 245           resized = 1;                                                          \
 246         }                                                                       \
 247       (sizevar) = ga_newsize;                                                   \
 248     }                                                                           \
 249 } while (0)
 250 \f
 251 #define AP_DOWNCASE             1
 252 #define AP_PROCESS_ENTITIES     2
 253 #define AP_TRIM_BLANKS          4
 254
 255 /* Copy the text in the range [BEG, END) to POOL, optionally
 256    performing operations specified by FLAGS.  FLAGS may be any
 257    combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_TRIM_BLANKS
 258    with the following meaning:
 259
 260    * AP_DOWNCASE -- downcase all the letters;
 261
 262    * AP_PROCESS_ENTITIES -- process the SGML entities and write out
 263    the decoded string.  Recognized entities are &lt, &gt, &amp, &quot,
 264    &nbsp and the numerical entities.
 265
 266    * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
 267    of text.  */
 268
 269 static void
 270 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
 271 {
 272   int old_tail = pool->tail;
 273   int size;
 274
 275   /* First, skip blanks if required.  We must do this before entities
 276      are processed, so that blanks can still be inserted as, for
 277      instance, `&#32;'.  */
 278   if (flags & AP_TRIM_BLANKS)
 279     {
 280       while (beg < end && ISSPACE (*beg))
 281         ++beg;
 282       while (end > beg && ISSPACE (end[-1]))
 283         --end;
 284     }
 285   size = end - beg;
 286
 287   if (flags & AP_PROCESS_ENTITIES)
 288     {
 289       /* Grow the pool, then copy the text to the pool character by
 290          character, processing the encountered entities as we go
 291          along.
 292
 293          It's safe (and necessary) to grow the pool in advance because
 294          processing the entities can only *shorten* the string, it can
 295          never lengthen it.  */
 296       POOL_GROW (pool, end - beg);
 297       const char *from = beg;
 298       char *to = pool->contents + pool->tail;
 299
 300       while (from < end)
 301         {
 302           if (*from != '&')
 303             *to++ = *from++;
 304           else
 305             {
 306               const char *save = from;
 307               int remain;
 308
 309               if (++from == end)
 310                 goto lose;
 311               remain = end - from;
 312
 313               /* Process numeric entities "&#DDD;" and "&#xHH;".  */
 314               if (*from == '#')
 315                 {
 316                   int numeric = 0, digits = 0;
 317                   ++from;
 318                   if (*from == 'x')
 319                     {
 320                       ++from;
 321                       for (; from < end && ISXDIGIT (*from); from++, digits++)
 322                         numeric = (numeric << 4) + XDIGIT_TO_NUM (*from);
 323                     }
 324                   else
 325                     {
 326                       for (; from < end && ISDIGIT (*from); from++, digits++)
 327                         numeric = (numeric * 10) + (*from - '0');
 328                     }
 329                   if (!digits)
 330                     goto lose;
 331                   numeric &= 0xff;
 332                   *to++ = numeric;
 333                 }
 334 #define FROB(x) (remain >= (sizeof (x) - 1)                     \
 335                  && 0 == memcmp (from, x, sizeof (x) - 1)       \
 336                  && (*(from + sizeof (x) - 1) == ';'            \
 337                      || remain == sizeof (x) - 1                \
 338                      || !ISALNUM (*(from + sizeof (x) - 1))))
 339               else if (FROB ("lt"))
 340                 *to++ = '<', from += 2;
 341               else if (FROB ("gt"))
 342                 *to++ = '>', from += 2;
 343               else if (FROB ("amp"))
 344                 *to++ = '&', from += 3;
 345               else if (FROB ("quot"))
 346                 *to++ = '\"', from += 4;
 347               /* We don't implement the proposed "Added Latin 1"
 348                  entities (except for nbsp), because it is unnecessary
 349                  in the context of Wget, and would require hashing to
 350                  work efficiently.  */
 351               else if (FROB ("nbsp"))
 352                 *to++ = 160, from += 4;
 353               else
 354                 goto lose;
 355 #undef FROB
 356               /* If the entity was followed by `;', we step over the
 357                  `;'.  Otherwise, it was followed by either a
 358                  non-alphanumeric or EOB, in which case we do nothing.  */
 359               if (from < end && *from == ';')
 360                 ++from;
 361               continue;
 362
 363             lose:
 364               /* This was not an entity after all.  Back out.  */
 365               from = save;
 366               *to++ = *from++;
 367             }
 368         }
 369       /* Verify that we haven't exceeded the original size.  (It
 370          shouldn't happen, hence the assert.)  */
 371       assert (to - (pool->contents + pool->tail) <= end - beg);
 372
 373       /* Make POOL's tail point to the position following the string
 374          we've written.  */
 375       pool->tail = to - pool->contents;
 376       POOL_APPEND_CHR (pool, '\0');
 377     }
 378   else
 379     {
 380       /* Just copy the text to the pool.  */
 381       POOL_APPEND (pool, beg, end);
 382       POOL_APPEND_CHR (pool, '\0');
 383     }
 384
 385   if (flags & AP_DOWNCASE)
 386     {
 387       char *p = pool->contents + old_tail;
 388       for (; *p; p++)
 389         *p = TOLOWER (*p);
 390     }
 391 }
 392 \f
 393 /* Check whether the contents of [POS, POS+LENGTH) match any of the
 394    strings in the ARRAY.  */
 395 static int
 396 array_allowed (const char **array, const char *beg, const char *end)
 397 {
 398   int length = end - beg;
 399   if (array)
 400     {
 401       for (; *array; array++)
 402         if (length >= strlen (*array)
 403             && !strncasecmp (*array, beg, length))
 404           break;
 405       if (!*array)
 406         return 0;
 407     }
 408   return 1;
 409 }
 410 \f
 411 /* Originally we used to adhere to rfc 1866 here, and allowed only
 412    letters, digits, periods, and hyphens as names (of tags or
 413    attributes).  However, this broke too many pages which used
 414    proprietary or strange attributes, e.g. <img src="a.gif"
 415    v:shapes="whatever">.
 416
 417    So now we allow any character except:
 418      * whitespace
 419      * 8-bit and control chars
 420      * characters that clearly cannot be part of name:
 421        '=', '>', '/'.
 422
 423    This only affects attribute and tag names; attribute values allow
 424    an even greater variety of characters.  */
 425
 426 #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
 427                         && (x) != '=' && (x) != '>' && (x) != '/')
 428
 429 #ifdef STANDALONE
 430 static int comment_backout_count;
 431 #endif
 432
 433 /* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
 434    strict comments mode, this is used for skipping over comments as
 435    well.
 436
 437    To recap: any SGML declaration may have comments associated with
 438    it, e.g.
 439        <!MY-DECL -- isn't this fun? -- foo bar>
 440
 441    An HTML comment is merely an empty declaration (<!>) with a comment
 442    attached, like this:
 443        <!-- some stuff here -->
 444
 445    Several comments may be embedded in one comment declaration:
 446        <!-- have -- -- fun -->
 447
 448    Whitespace is allowed between and after the comments, but not
 449    before the first comment.  Additionally, this function attempts to
 450    handle double quotes in SGML declarations correctly.  */
 451
 452 static const char *
 453 advance_declaration (const char *beg, const char *end)
 454 {
 455   const char *p = beg;
 456   char quote_char = '\0';       /* shut up, gcc! */
 457   char ch;
 458
 459   enum {
 460     AC_S_DONE,
 461     AC_S_BACKOUT,
 462     AC_S_BANG,
 463     AC_S_DEFAULT,
 464     AC_S_DCLNAME,
 465     AC_S_DASH1,
 466     AC_S_DASH2,
 467     AC_S_COMMENT,
 468     AC_S_DASH3,
 469     AC_S_DASH4,
 470     AC_S_QUOTE1,
 471     AC_S_IN_QUOTE,
 472     AC_S_QUOTE2,
 473   } state = AC_S_BANG;
 474
 475   if (beg == end)
 476     return beg;
 477   ch = *p++;
 478
 479   /* It looked like a good idea to write this as a state machine, but
 480      now I wonder...  */
 481
 482   while (state != AC_S_DONE && state != AC_S_BACKOUT)
 483     {
 484       if (p == end)
 485         state = AC_S_BACKOUT;
 486       switch (state)
 487         {
 488         case AC_S_DONE:
 489         case AC_S_BACKOUT:
 490           break;
 491         case AC_S_BANG:
 492           if (ch == '!')
 493             {
 494               ch = *p++;
 495               state = AC_S_DEFAULT;
 496             }
 497           else
 498             state = AC_S_BACKOUT;
 499           break;
 500         case AC_S_DEFAULT:
 501           switch (ch)
 502             {
 503             case '-':
 504               state = AC_S_DASH1;
 505               break;
 506             case ' ':
 507             case '\t':
 508             case '\r':
 509             case '\n':
 510               ch = *p++;
 511               break;
 512             case '>':
 513               state = AC_S_DONE;
 514               break;
 515             case '\'':
 516             case '\"':
 517               state = AC_S_QUOTE1;
 518               break;
 519             default:
 520               if (NAME_CHAR_P (ch))
 521                 state = AC_S_DCLNAME;
 522               else
 523                 state = AC_S_BACKOUT;
 524               break;
 525             }
 526           break;
 527         case AC_S_DCLNAME:
 528           if (ch == '-')
 529             state = AC_S_DASH1;
 530           else if (NAME_CHAR_P (ch))
 531             ch = *p++;
 532           else
 533             state = AC_S_DEFAULT;
 534           break;
 535         case AC_S_QUOTE1:
 536           /* We must use 0x22 because broken assert macros choke on
 537              '"' and '\"'.  */
 538           assert (ch == '\'' || ch == 0x22);
 539           quote_char = ch;      /* cheating -- I really don't feel like
 540                                    introducing more different states for
 541                                    different quote characters. */
 542           ch = *p++;
 543           state = AC_S_IN_QUOTE;
 544           break;
 545         case AC_S_IN_QUOTE:
 546           if (ch == quote_char)
 547             state = AC_S_QUOTE2;
 548           else
 549             ch = *p++;
 550           break;
 551         case AC_S_QUOTE2:
 552           assert (ch == quote_char);
 553           ch = *p++;
 554           state = AC_S_DEFAULT;
 555           break;
 556         case AC_S_DASH1:
 557           assert (ch == '-');
 558           ch = *p++;
 559           state = AC_S_DASH2;
 560           break;
 561         case AC_S_DASH2:
 562           switch (ch)
 563             {
 564             case '-':
 565               ch = *p++;
 566               state = AC_S_COMMENT;
 567               break;
 568             default:
 569               state = AC_S_BACKOUT;
 570             }
 571           break;
 572         case AC_S_COMMENT:
 573           switch (ch)
 574             {
 575             case '-':
 576               state = AC_S_DASH3;
 577               break;
 578             default:
 579               ch = *p++;
 580               break;
 581             }
 582           break;
 583         case AC_S_DASH3:
 584           assert (ch == '-');
 585           ch = *p++;
 586           state = AC_S_DASH4;
 587           break;
 588         case AC_S_DASH4:
 589           switch (ch)
 590             {
 591             case '-':
 592               ch = *p++;
 593               state = AC_S_DEFAULT;
 594               break;
 595             default:
 596               state = AC_S_COMMENT;
 597               break;
 598             }
 599           break;
 600         }
 601     }
 602
 603   if (state == AC_S_BACKOUT)
 604     {
 605 #ifdef STANDALONE
 606       ++comment_backout_count;
 607 #endif
 608       return beg + 1;
 609     }
 610   return p;
 611 }
 612
 613 /* Find the first occurrence of the substring "-->" in [BEG, END) and
 614    return the pointer to the character after the substring.  If the
 615    substring is not found, return NULL.  */
 616
 617 static const char *
 618 find_comment_end (const char *beg, const char *end)
 619 {
 620   /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
 621      if it's not '>' or '-', advance by three characters.  Otherwise,
 622      look at the preceding characters and try to find a match.  */
 623
 624   const char *p = beg - 1;
 625
 626   while ((p += 3) < end)
 627     switch (p[0])
 628       {
 629       case '>':
 630         if (p[-1] == '-' && p[-2] == '-')
 631           return p + 1;
 632         break;
 633       case '-':
 634       at_dash:
 635         if (p[-1] == '-')
 636           {
 637           at_dash_dash:
 638             if (++p == end) return NULL;
 639             switch (p[0])
 640               {
 641               case '>': return p + 1;
 642               case '-': goto at_dash_dash;
 643               }
 644           }
 645         else
 646           {
 647             if ((p += 2) >= end) return NULL;
 648             switch (p[0])
 649               {
 650               case '>':
 651                 if (p[-1] == '-')
 652                   return p + 1;
 653                 break;
 654               case '-':
 655                 goto at_dash;
 656               }
 657           }
 658       }
 659   return NULL;
 660 }
 661 \f
 662 /* Advance P (a char pointer), with the explicit intent of being able
 663    to read the next character.  If this is not possible, go to finish.  */
 664
 665 #define ADVANCE(p) do {                         \
 666   ++p;                                          \
 667   if (p >= end)                                 \
 668     goto finish;                                \
 669 } while (0)
 670
 671 /* Skip whitespace, if any. */
 672
 673 #define SKIP_WS(p) do {                         \
 674   while (ISSPACE (*p)) {                        \
 675     ADVANCE (p);                                \
 676   }                                             \
 677 } while (0)
 678
 679 /* Skip non-whitespace, if any. */
 680
 681 #define SKIP_NON_WS(p) do {                     \
 682   while (!ISSPACE (*p)) {                       \
 683     ADVANCE (p);                                \
 684   }                                             \
 685 } while (0)
 686
 687 #ifdef STANDALONE
 688 static int tag_backout_count;
 689 #endif
 690
 691 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
 692    MAPFUN will be called with two arguments: pointer to an initialized
 693    struct taginfo, and CLOSURE.
 694
 695    ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to
 696    be processed by this function.  If it is NULL, all the tags are
 697    allowed.  The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES.
 698
 699    (Obviously, the caller can filter out unwanted tags and attributes
 700    just as well, but this is just an optimization designed to avoid
 701    unnecessary copying for tags/attributes which the caller doesn't
 702    want to know about.  These lists are searched linearly; therefore,
 703    if you're interested in a large number of tags or attributes, you'd
 704    better set these to NULL and filter them out yourself with a
 705    hashing process most appropriate for your application.)  */
 706
 707 void
 708 map_html_tags (const char *text, int size,
 709                const char **allowed_tag_names,
 710                const char **allowed_attribute_names,
 711                void (*mapfun) (struct taginfo *, void *),
 712                void *closure)
 713 {
 714   /* storage for strings passed to MAPFUN callback; if 256 bytes is
 715      too little, POOL_APPEND allocates more with malloc. */
 716   char pool_initial_storage[256];
 717   struct pool pool;
 718
 719   const char *p = text;
 720   const char *end = text + size;
 721
 722   struct attr_pair attr_pair_initial_storage[8];
 723   int attr_pair_size = countof (attr_pair_initial_storage);
 724   int attr_pair_resized = 0;
 725   struct attr_pair *pairs = attr_pair_initial_storage;
 726
 727   if (!size)
 728     return;
 729
 730   POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));
 731
 732   {
 733     int nattrs, end_tag;
 734     const char *tag_name_begin, *tag_name_end;
 735     const char *tag_start_position;
 736     int uninteresting_tag;
 737
 738   look_for_tag:
 739     POOL_REWIND (&pool);
 740
 741     nattrs = 0;
 742     end_tag = 0;
 743
 744     /* Find beginning of tag.  We use memchr() instead of the usual
 745        looping with ADVANCE() for speed. */
 746     p = memchr (p, '<', end - p);
 747     if (!p)
 748       goto finish;
 749
 750     tag_start_position = p;
 751     ADVANCE (p);
 752
 753     /* Establish the type of the tag (start-tag, end-tag or
 754        declaration).  */
 755     if (*p == '!')
 756       {
 757         if (!opt.strict_comments
 758             && p < end + 3 && p[1] == '-' && p[2] == '-')
 759           {
 760             /* If strict comments are not enforced and if we know
 761                we're looking at a comment, simply look for the
 762                terminating "-->".  Non-strict is the default because
 763                it works in other browsers and most HTML writers can't
 764                be bothered with getting the comments right.  */
 765             const char *comment_end = find_comment_end (p + 3, end);
 766             if (comment_end)
 767               p = comment_end;
 768           }
 769         else
 770           {
 771             /* Either in strict comment mode or looking at a non-empty
 772                declaration.  Real declarations are much less likely to
 773                be misused the way comments are, so advance over them
 774                properly regardless of strictness.  */
 775             p = advance_declaration (p, end);
 776           }
 777         if (p == end)
 778           goto finish;
 779         goto look_for_tag;
 780       }
 781     else if (*p == '/')
 782       {
 783         end_tag = 1;
 784         ADVANCE (p);
 785       }
 786     tag_name_begin = p;
 787     while (NAME_CHAR_P (*p))
 788       ADVANCE (p);
 789     if (p == tag_name_begin)
 790       goto look_for_tag;
 791     tag_name_end = p;
 792     SKIP_WS (p);
 793     if (end_tag && *p != '>')
 794       goto backout_tag;
 795
 796     if (!array_allowed (allowed_tag_names, tag_name_begin, tag_name_end))
 797       /* We can't just say "goto look_for_tag" here because we need
 798          the loop below to properly advance over the tag's attributes.  */
 799       uninteresting_tag = 1;
 800     else
 801       {
 802         uninteresting_tag = 0;
 803         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
 804       }
 805
 806     /* Find the attributes. */
 807     while (1)
 808       {
 809         const char *attr_name_begin, *attr_name_end;
 810         const char *attr_value_begin, *attr_value_end;
 811         const char *attr_raw_value_begin, *attr_raw_value_end;
 812         int operation = AP_DOWNCASE; /* stupid compiler. */
 813
 814         SKIP_WS (p);
 815
 816         if (*p == '/')
 817           {
 818             /* A slash at this point means the tag is about to be
 819                closed.  This is legal in XML and has been popularized
 820                in HTML via XHTML.  */
 821             /* <foo a=b c=d /> */
 822             /*              ^  */
 823             ADVANCE (p);
 824             SKIP_WS (p);
 825             if (*p != '>')
 826               goto backout_tag;
 827           }
 828
 829         /* Check for end of tag definition. */
 830         if (*p == '>')
 831           break;
 832
 833         /* Establish bounds of attribute name. */
 834         attr_name_begin = p;    /* <foo bar ...> */
 835                                 /*      ^        */
 836         while (NAME_CHAR_P (*p))
 837           ADVANCE (p);
 838         attr_name_end = p;      /* <foo bar ...> */
 839                                 /*         ^     */
 840         if (attr_name_begin == attr_name_end)
 841           goto backout_tag;
 842
 843         /* Establish bounds of attribute value. */
 844         SKIP_WS (p);
 845         if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
 846           {
 847             /* Minimized attribute syntax allows `=' to be omitted.
 848                For example, <UL COMPACT> is a valid shorthand for <UL
 849                COMPACT="compact">.  Even if such attributes are not
 850                useful to Wget, we need to support them, so that the
 851                tags containing them can be parsed correctly. */
 852             attr_raw_value_begin = attr_value_begin = attr_name_begin;
 853             attr_raw_value_end = attr_value_end = attr_name_end;
 854           }
 855         else if (*p == '=')
 856           {
 857             ADVANCE (p);
 858             SKIP_WS (p);
 859             if (*p == '\"' || *p == '\'')
 860               {
 861                 int newline_seen = 0;
 862                 char quote_char = *p;
 863                 attr_raw_value_begin = p;
 864                 ADVANCE (p);
 865                 attr_value_begin = p; /* <foo bar="baz"> */
 866                                       /*           ^     */
 867                 while (*p != quote_char)
 868                   {
 869                     if (!newline_seen && *p == '\n')
 870                       {
 871                         /* If a newline is seen within the quotes, it
 872                            is most likely that someone forgot to close
 873                            the quote.  In that case, we back out to
 874                            the value beginning, and terminate the tag
 875                            at either `>' or the delimiter, whichever
 876                            comes first.  Such a tag terminated at `>'
 877                            is discarded.  */
 878                         p = attr_value_begin;
 879                         newline_seen = 1;
 880                         continue;
 881                       }
 882                     else if (newline_seen && *p == '>')
 883                       break;
 884                     ADVANCE (p);
 885                   }
 886                 attr_value_end = p; /* <foo bar="baz"> */
 887                                     /*              ^  */
 888                 if (*p == quote_char)
 889                   ADVANCE (p);
 890                 else
 891                   goto look_for_tag;
 892                 attr_raw_value_end = p; /* <foo bar="baz"> */
 893                                         /*               ^ */
 894                 /* The AP_TRIM_BLANKS is there for buggy HTML
 895                    generators that generate <a href=" foo"> instead of
 896                    <a href="foo"> (Netscape ignores spaces as well.)
                   If you really mean space, use &#32; or %20.  */
 898                 operation = AP_PROCESS_ENTITIES | AP_TRIM_BLANKS;
 899               }
 900             else
 901               {
 902                 attr_value_begin = p; /* <foo bar=baz> */
 903                                       /*          ^    */
 904                 /* According to SGML, a name token should consist only
 905                    of alphanumerics, . and -.  However, this is often
 906                    violated by, for instance, `%' in `width=75%'.
 907                    We'll be liberal and allow just about anything as
 908                    an attribute value.  */
 909                 while (!ISSPACE (*p) && *p != '>')
 910                   ADVANCE (p);
 911                 attr_value_end = p; /* <foo bar=baz qux=quix> */
 912                                     /*             ^          */
 913                 if (attr_value_begin == attr_value_end)
 914                   /* <foo bar=> */
 915                   /*          ^ */
 916                   goto backout_tag;
 917                 attr_raw_value_begin = attr_value_begin;
 918                 attr_raw_value_end = attr_value_end;
 919                 operation = AP_PROCESS_ENTITIES;
 920               }
 921           }
 922         else
 923           {
 924             /* We skipped the whitespace and found something that is
 925                neither `=' nor the beginning of the next attribute's
 926                name.  Back out.  */
 927             goto backout_tag;   /* <foo bar [... */
 928                                 /*          ^    */
 929           }
 930
 931         /* If we're not interested in the tag, don't bother with any
 932            of the attributes.  */
 933         if (uninteresting_tag)
 934           continue;
 935
 936         /* If we aren't interested in the attribute, skip it.  We
 937            cannot do this test any sooner, because our text pointer
 938            needs to correctly advance over the attribute.  */
 939         if (allowed_attribute_names
 940             && !array_allowed (allowed_attribute_names, attr_name_begin,
 941                                attr_name_end))
 942           continue;
 943
 944         GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
 945                     struct attr_pair);
 946
 947         pairs[nattrs].name_pool_index = pool.tail;
 948         convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
 949
 950         pairs[nattrs].value_pool_index = pool.tail;
 951         convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
 952         pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
 953         pairs[nattrs].value_raw_size = (attr_raw_value_end
 954                                         - attr_raw_value_begin);
 955         ++nattrs;
 956       }
 957
 958     if (uninteresting_tag)
 959       {
 960         ADVANCE (p);
 961         goto look_for_tag;
 962       }
 963
 964     /* By now, we have a valid tag with a name and zero or more
 965        attributes.  Fill in the data and call the mapper function.  */
 966     {
 967       int i;
 968       struct taginfo taginfo;
 969
 970       taginfo.name      = pool.contents;
 971       taginfo.end_tag_p = end_tag;
 972       taginfo.nattrs    = nattrs;
 973       /* We fill in the char pointers only now, when pool can no
 974          longer get realloc'ed.  If we did that above, we could get
 975          hosed by reallocation.  Obviously, after this point, the pool
 976          may no longer be grown.  */
 977       for (i = 0; i < nattrs; i++)
 978         {
 979           pairs[i].name = pool.contents + pairs[i].name_pool_index;
 980           pairs[i].value = pool.contents + pairs[i].value_pool_index;
 981         }
 982       taginfo.attrs = pairs;
 983       taginfo.start_position = tag_start_position;
 984       taginfo.end_position   = p + 1;
 985       /* Ta-dam! */
 986       (*mapfun) (&taginfo, closure);
 987       ADVANCE (p);
 988     }
 989     goto look_for_tag;
 990
 991   backout_tag:
 992 #ifdef STANDALONE
 993     ++tag_backout_count;
 994 #endif
 995     /* The tag wasn't really a tag.  Treat its contents as ordinary
 996        data characters. */
 997     p = tag_start_position + 1;
 998     goto look_for_tag;
 999   }
1000
1001  finish:
1002   POOL_FREE (&pool);
1003   if (attr_pair_resized)
1004     xfree (pairs);
1005 }
1006
1007 #undef ADVANCE
1008 #undef SKIP_WS
1009 #undef SKIP_NON_WS
1010 \f
1011 #ifdef STANDALONE
1012 static void
1013 test_mapper (struct taginfo *taginfo, void *arg)
1014 {
1015   int i;
1016
1017   printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
1018   for (i = 0; i < taginfo->nattrs; i++)
1019     printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
1020   putchar ('\n');
1021   ++*(int *)arg;
1022 }
1023
1024 int main ()
1025 {
1026   int size = 256;
1027   char *x = (char *)xmalloc (size);
1028   int length = 0;
1029   int read_count;
1030   int tag_counter = 0;
1031
1032   while ((read_count = fread (x + length, 1, size - length, stdin)))
1033     {
1034       length += read_count;
1035       size <<= 1;
1036       x = (char *)xrealloc (x, size);
1037     }
1038
1039   map_html_tags (x, length, NULL, NULL, test_mapper, &tag_counter);
1040   printf ("TAGS: %d\n", tag_counter);
1041   printf ("Tag backouts:     %d\n", tag_backout_count);
1042   printf ("Comment backouts: %d\n", comment_backout_count);
1043   return 0;
1044 }
1045 #endif /* STANDALONE */