sjero.net Git - wget/blob - src/html-parse.c

   1 /* HTML parser for Wget.
   2    Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 /* The only entry point to this module is map_html_tags(), which see.  */
  31
  32 /* TODO:
  33
  34    - Allow hooks for callers to process contents outside tags.  This
  35      is needed to implement handling <style> and <script>.  The
  36      taginfo structure already carries the information about where the
  37      tags are, but this is not enough, because one would also want to
  38      skip the comments.  (The funny thing is that for <style> and
  39      <script> you *don't* want to skip comments!)
  40
  41    - Create a test suite for regression testing. */
  42
  43 /* HISTORY:
  44
  45    This is the third HTML parser written for Wget.  The first one was
  46    written some time during the Geturl 1.0 beta cycle, and was very
  47    inefficient and buggy.  It also contained some very complex code to
  48    remember a list of parser states, because it was supposed to be
  49    reentrant.
  50
  51    The second HTML parser was written for Wget 1.4 (the first version
  52    by the name `Wget'), and was a complete rewrite.  Although the new
  53    parser behaved much better and made no claims of reentrancy, it
  54    still shared many of the fundamental flaws of the old version -- it
  55    only regarded HTML in terms tag-attribute pairs, where the
  56    attribute's value was a URL to be returned.  Any other property of
  57    HTML, such as <base href=...>, or strange way to specify a URL,
  58    such as <meta http-equiv=Refresh content="0; URL=..."> had to be
  59    crudely hacked in -- and the caller had to be aware of these hacks.
  60    Like its predecessor, this parser did not support HTML comments.
  61
  62    After Wget 1.5.1 was released, I set out to write a third HTML
  63    parser.  The objectives of the new parser were to: (1) provide a
  64    clean way to analyze HTML lexically, (2) separate interpretation of
  65    the markup from the parsing process, (3) be as correct as possible,
  66    e.g. correctly skipping comments and other SGML declarations, (4)
  67    understand the most common errors in markup and skip them or be
  68    relaxed towrds them, and (5) be reasonably efficient (no regexps,
  69    minimum copying and minimum or no heap allocation).
  70
  71    I believe this parser meets all of the above goals.  It is
  72    reasonably well structured, and could be relatively easily
  73    separated from Wget and used elsewhere.  While some of its
  74    intrinsic properties limit its value as a general-purpose HTML
  75    parser, I believe that, with minimum modifications, it could serve
  76    as a backend for one.
  77
  78    Due to time and other constraints, this parser was not integrated
  79    into Wget until the version 1.7. */
  80
  81 /* DESCRIPTION:
  82
  83    The single entry point of this parser is map_html_tags(), which
  84    works by calling a function you specify for each tag.  The function
  85    gets called with the pointer to a structure describing the tag and
  86    its attributes.  */
  87
  88 /* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
  89    still need Wget headers to compile.  */
  90
  91 #include <config.h>
  92
  93 #ifdef STANDALONE
  94 # define I_REALLY_WANT_CTYPE_MACROS
  95 #endif
  96
  97 #include <stdio.h>
  98 #include <stdlib.h>
  99 #include <string.h>
 100 #include <assert.h>
 101
 102 #include "wget.h"
 103 #include "html-parse.h"
 104
 105 #ifdef STANDALONE
 106 # undef xmalloc
 107 # undef xrealloc
 108 # undef xfree
 109 # define xmalloc malloc
 110 # define xrealloc realloc
 111 # define xfree free
 112
 113 # undef ISSPACE
 114 # undef ISDIGIT
 115 # undef ISXDIGIT
 116 # undef ISALPHA
 117 # undef ISALNUM
 118 # undef TOLOWER
 119 # undef TOUPPER
 120
 121 # define ISSPACE(x) isspace (x)
 122 # define ISDIGIT(x) isdigit (x)
 123 # define ISXDIGIT(x) isxdigit (x)
 124 # define ISALPHA(x) isalpha (x)
 125 # define ISALNUM(x) isalnum (x)
 126 # define TOLOWER(x) tolower (x)
 127 # define TOUPPER(x) toupper (x)
 128
 129 struct hash_table {
 130   int dummy;
 131 };
 132 static void *
 133 hash_table_get (const struct hash_table *ht, void *ptr)
 134 {
 135   return ptr;
 136 }
 137 #else  /* not STANDALONE */
 138 # include "hash.h"
 139 #endif
 140
 141 /* Pool support.  A pool is a resizable chunk of memory.  It is first
 142    allocated on the stack, and moved to the heap if it needs to be
 143    larger than originally expected.  map_html_tags() uses it to store
 144    the zero-terminated names and values of tags and attributes.
 145
 146    Thus taginfo->name, and attr->name and attr->value for each
 147    attribute, do not point into separately allocated areas, but into
 148    different parts of the pool, separated only by terminating zeros.
 149    This ensures minimum amount of allocation and, for most tags, no
 150    allocation because the entire pool is kept on the stack.  */
 151
 152 struct pool {
 153   char *contents;               /* pointer to the contents. */
 154   int size;                     /* size of the pool. */
 155   int tail;                     /* next available position index. */
 156   int resized;                  /* whether the pool has been resized
 157                                    using malloc. */
 158
 159   char *orig_contents;          /* original pool contents, usually
 160                                    stack-allocated.  used by POOL_FREE
 161                                    to restore the pool to the initial
 162                                    state. */
 163   int orig_size;
 164 };
 165
 166 /* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
 167
 168 #define POOL_INIT(p, initial_storage, initial_size) do {        \
 169   struct pool *P = (p);                                         \
 170   P->contents = (initial_storage);                              \
 171   P->size = (initial_size);                                     \
 172   P->tail = 0;                                                  \
 173   P->resized = 0;                                               \
 174   P->orig_contents = P->contents;                               \
 175   P->orig_size = P->size;                                       \
 176 } while (0)
 177
 178 /* Grow the pool to accomodate at least SIZE new bytes.  If the pool
 179    already has room to accomodate SIZE bytes of data, this is a no-op.  */
 180
 181 #define POOL_GROW(p, increase)                                  \
 182   GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + (increase), \
 183               (p)->resized, char)
 184
 185 /* Append text in the range [beg, end) to POOL.  No zero-termination
 186    is done.  */
 187
 188 #define POOL_APPEND(p, beg, end) do {                   \
 189   const char *PA_beg = (beg);                           \
 190   int PA_size = (end) - PA_beg;                         \
 191   POOL_GROW (p, PA_size);                               \
 192   memcpy ((p)->contents + (p)->tail, PA_beg, PA_size);  \
 193   (p)->tail += PA_size;                                 \
 194 } while (0)
 195
 196 /* Append one character to the pool.  Can be used to zero-terminate
 197    pool strings.  */
 198
 199 #define POOL_APPEND_CHR(p, ch) do {             \
 200   char PAC_char = (ch);                         \
 201   POOL_GROW (p, 1);                             \
 202   (p)->contents[(p)->tail++] = PAC_char;        \
 203 } while (0)
 204
 205 /* Forget old pool contents.  The allocated memory is not freed. */
 206 #define POOL_REWIND(p) (p)->tail = 0
 207
 208 /* Free heap-allocated memory for contents of POOL.  This calls
 209    xfree() if the memory was allocated through malloc.  It also
 210    restores `contents' and `size' to their original, pre-malloc
 211    values.  That way after POOL_FREE, the pool is fully usable, just
 212    as if it were freshly initialized with POOL_INIT.  */
 213
 214 #define POOL_FREE(p) do {                       \
 215   struct pool *P = p;                           \
 216   if (P->resized)                               \
 217     xfree (P->contents);                        \
 218   P->contents = P->orig_contents;               \
 219   P->size = P->orig_size;                       \
 220   P->tail = 0;                                  \
 221   P->resized = 0;                               \
 222 } while (0)
 223
 224 /* Used for small stack-allocated memory chunks that might grow.  Like
 225    DO_REALLOC, this macro grows BASEVAR as necessary to take
 226    NEEDED_SIZE items of TYPE.
 227
 228    The difference is that on the first resize, it will use
 229    malloc+memcpy rather than realloc.  That way you can stack-allocate
 230    the initial chunk, and only resort to heap allocation if you
 231    stumble upon large data.
 232
 233    After the first resize, subsequent ones are performed with realloc,
 234    just like DO_REALLOC.  */
 235
 236 #define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do {           \
 237   long ga_needed_size = (needed_size);                                          \
 238   long ga_newsize = (sizevar);                                                  \
 239   while (ga_newsize < ga_needed_size)                                           \
 240     ga_newsize <<= 1;                                                           \
 241   if (ga_newsize != (sizevar))                                                  \
 242     {                                                                           \
 243       if (resized)                                                              \
 244         basevar = xrealloc (basevar, ga_newsize * sizeof (type));               \
 245       else                                                                      \
 246         {                                                                       \
 247           void *ga_new = xmalloc (ga_newsize * sizeof (type));                  \
 248           memcpy (ga_new, basevar, (sizevar) * sizeof (type));                  \
 249           (basevar) = ga_new;                                                   \
 250           resized = 1;                                                          \
 251         }                                                                       \
 252       (sizevar) = ga_newsize;                                                   \
 253     }                                                                           \
 254 } while (0)
 255 \f
 256 /* Test whether n+1-sized entity name fits in P.  We don't support
 257    IE-style non-terminated entities, e.g. "&ltfoo" -> "<foo".
 258    However, "&lt;foo" will work, as will "&lt!foo", "&lt", etc.  In
 259    other words an entity needs to be terminated by either a
 260    non-alphanumeric or the end of string.  */
 261 #define FITS(p, n) (p + n == end || (p + n < end && !ISALNUM (p[n])))
 262
 263 /* Macros that test entity names by returning true if P is followed by
 264    the specified characters.  */
 265 #define ENT1(p, c0) (FITS (p, 1) && p[0] == c0)
 266 #define ENT2(p, c0, c1) (FITS (p, 2) && p[0] == c0 && p[1] == c1)
 267 #define ENT3(p, c0, c1, c2) (FITS (p, 3) && p[0]==c0 && p[1]==c1 && p[2]==c2)
 268
 269 /* Increment P by INC chars.  If P lands at a semicolon, increment it
 270    past the semicolon.  This ensures that e.g. "&lt;foo" is converted
 271    to "<foo", but "&lt,foo" to "<,foo".  */
 272 #define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)
 273
 274 /* Decode the HTML character entity at *PTR, considering END to be end
 275    of buffer.  It is assumed that the "&" character that marks the
 276    beginning of the entity has been seen at *PTR-1.  If a recognized
 277    ASCII entity is seen, it is returned, and *PTR is moved to the end
 278    of the entity.  Otherwise, -1 is returned and *PTR left unmodified.
 279
 280    The recognized entities are: &lt, &gt, &amp, &apos, and &quot.  */
 281
 282 static int
 283 decode_entity (const char **ptr, const char *end)
 284 {
 285   const char *p = *ptr;
 286   int value = -1;
 287
 288   if (++p == end)
 289     return -1;
 290
 291   switch (*p++)
 292     {
 293     case '#':
 294       /* Process numeric entities "&#DDD;" and "&#xHH;".  */
 295       {
 296         int digits = 0;
 297         value = 0;
 298         if (*p == 'x')
 299           for (++p; value < 256 && p < end && ISXDIGIT (*p); p++, digits++)
 300             value = (value << 4) + XDIGIT_TO_NUM (*p);
 301         else
 302           for (; value < 256 && p < end && ISDIGIT (*p); p++, digits++)
 303             value = (value * 10) + (*p - '0');
 304         if (!digits)
 305           return -1;
 306         /* Don't interpret 128+ codes and NUL because we cannot
 307            portably reinserted them into HTML.  */
 308         if (!value || (value & ~0x7f))
 309           return -1;
 310         *ptr = SKIP_SEMI (p, 0);
 311         return value;
 312       }
 313     /* Process named ASCII entities.  */
 314     case 'g':
 315       if (ENT1 (p, 't'))
 316         value = '>', *ptr = SKIP_SEMI (p, 1);
 317       break;
 318     case 'l':
 319       if (ENT1 (p, 't'))
 320         value = '<', *ptr = SKIP_SEMI (p, 1);
 321       break;
 322     case 'a':
 323       if (ENT2 (p, 'm', 'p'))
 324         value = '&', *ptr = SKIP_SEMI (p, 2);
 325       else if (ENT3 (p, 'p', 'o', 's'))
 326         /* handle &apos for the sake of the XML/XHTML crowd. */
 327         value = '\'', *ptr = SKIP_SEMI (p, 3);
 328       break;
 329     case 'q':
 330       if (ENT3 (p, 'u', 'o', 't'))
 331         value = '\"', *ptr = SKIP_SEMI (p, 3);
 332       break;
 333     }
 334   return value;
 335 }
 336 #undef ENT1
 337 #undef ENT2
 338 #undef ENT3
 339 #undef FITS
 340 #undef SKIP_SEMI
 341
 342 enum {
 343   AP_DOWNCASE           = 1,
 344   AP_DECODE_ENTITIES    = 2,
 345   AP_TRIM_BLANKS        = 4
 346 };
 347
 348 /* Copy the text in the range [BEG, END) to POOL, optionally
 349    performing operations specified by FLAGS.  FLAGS may be any
 350    combination of AP_DOWNCASE, AP_DECODE_ENTITIES and AP_TRIM_BLANKS
 351    with the following meaning:
 352
 353    * AP_DOWNCASE -- downcase all the letters;
 354
 355    * AP_DECODE_ENTITIES -- decode the named and numeric entities in
 356      the ASCII range when copying the string.
 357
 358    * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
 359      of text, as well as embedded newlines.  */
 360
 361 static void
 362 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
 363 {
 364   int old_tail = pool->tail;
 365
 366   /* Skip blanks if required.  We must do this before entities are
 367      processed, so that blanks can still be inserted as, for instance,
 368      `&#32;'.  */
 369   if (flags & AP_TRIM_BLANKS)
 370     {
 371       while (beg < end && ISSPACE (*beg))
 372         ++beg;
 373       while (end > beg && ISSPACE (end[-1]))
 374         --end;
 375     }
 376
 377   if (flags & AP_DECODE_ENTITIES)
 378     {
 379       /* Grow the pool, then copy the text to the pool character by
 380          character, processing the encountered entities as we go
 381          along.
 382
 383          It's safe (and necessary) to grow the pool in advance because
 384          processing the entities can only *shorten* the string, it can
 385          never lengthen it.  */
 386       const char *from = beg;
 387       char *to;
 388       int squash_newlines = flags & AP_TRIM_BLANKS;
 389
 390       POOL_GROW (pool, end - beg);
 391       to = pool->contents + pool->tail;
 392
 393       while (from < end)
 394         {
 395           if (*from == '&')
 396             {
 397               int entity = decode_entity (&from, end);
 398               if (entity != -1)
 399                 *to++ = entity;
 400               else
 401                 *to++ = *from++;
 402             }
 403           else if ((*from == '\n' || *from == '\r') && squash_newlines)
 404             ++from;
 405           else
 406             *to++ = *from++;
 407         }
 408       /* Verify that we haven't exceeded the original size.  (It
 409          shouldn't happen, hence the assert.)  */
 410       assert (to - (pool->contents + pool->tail) <= end - beg);
 411
 412       /* Make POOL's tail point to the position following the string
 413          we've written.  */
 414       pool->tail = to - pool->contents;
 415       POOL_APPEND_CHR (pool, '\0');
 416     }
 417   else
 418     {
 419       /* Just copy the text to the pool.  */
 420       POOL_APPEND (pool, beg, end);
 421       POOL_APPEND_CHR (pool, '\0');
 422     }
 423
 424   if (flags & AP_DOWNCASE)
 425     {
 426       char *p = pool->contents + old_tail;
 427       for (; *p; p++)
 428         *p = TOLOWER (*p);
 429     }
 430 }
 431 \f
 432 /* Originally we used to adhere to rfc 1866 here, and allowed only
 433    letters, digits, periods, and hyphens as names (of tags or
 434    attributes).  However, this broke too many pages which used
 435    proprietary or strange attributes, e.g. <img src="a.gif"
 436    v:shapes="whatever">.
 437
 438    So now we allow any character except:
 439      * whitespace
 440      * 8-bit and control chars
 441      * characters that clearly cannot be part of name:
 442        '=', '>', '/'.
 443
 444    This only affects attribute and tag names; attribute values allow
 445    an even greater variety of characters.  */
 446
 447 #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
 448                         && (x) != '=' && (x) != '>' && (x) != '/')
 449
 450 #ifdef STANDALONE
 451 static int comment_backout_count;
 452 #endif
 453
 454 /* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
 455    strict comments mode, this is used for skipping over comments as
 456    well.
 457
 458    To recap: any SGML declaration may have comments associated with
 459    it, e.g.
 460        <!MY-DECL -- isn't this fun? -- foo bar>
 461
 462    An HTML comment is merely an empty declaration (<!>) with a comment
 463    attached, like this:
 464        <!-- some stuff here -->
 465
 466    Several comments may be embedded in one comment declaration:
 467        <!-- have -- -- fun -->
 468
 469    Whitespace is allowed between and after the comments, but not
 470    before the first comment.  Additionally, this function attempts to
 471    handle double quotes in SGML declarations correctly.  */
 472
 473 static const char *
 474 advance_declaration (const char *beg, const char *end)
 475 {
 476   const char *p = beg;
 477   char quote_char = '\0';       /* shut up, gcc! */
 478   char ch;
 479
 480   enum {
 481     AC_S_DONE,
 482     AC_S_BACKOUT,
 483     AC_S_BANG,
 484     AC_S_DEFAULT,
 485     AC_S_DCLNAME,
 486     AC_S_DASH1,
 487     AC_S_DASH2,
 488     AC_S_COMMENT,
 489     AC_S_DASH3,
 490     AC_S_DASH4,
 491     AC_S_QUOTE1,
 492     AC_S_IN_QUOTE,
 493     AC_S_QUOTE2
 494   } state = AC_S_BANG;
 495
 496   if (beg == end)
 497     return beg;
 498   ch = *p++;
 499
 500   /* It looked like a good idea to write this as a state machine, but
 501      now I wonder...  */
 502
 503   while (state != AC_S_DONE && state != AC_S_BACKOUT)
 504     {
 505       if (p == end)
 506         state = AC_S_BACKOUT;
 507       switch (state)
 508         {
 509         case AC_S_DONE:
 510         case AC_S_BACKOUT:
 511           break;
 512         case AC_S_BANG:
 513           if (ch == '!')
 514             {
 515               ch = *p++;
 516               state = AC_S_DEFAULT;
 517             }
 518           else
 519             state = AC_S_BACKOUT;
 520           break;
 521         case AC_S_DEFAULT:
 522           switch (ch)
 523             {
 524             case '-':
 525               state = AC_S_DASH1;
 526               break;
 527             case ' ':
 528             case '\t':
 529             case '\r':
 530             case '\n':
 531               ch = *p++;
 532               break;
 533             case '>':
 534               state = AC_S_DONE;
 535               break;
 536             case '\'':
 537             case '\"':
 538               state = AC_S_QUOTE1;
 539               break;
 540             default:
 541               if (NAME_CHAR_P (ch))
 542                 state = AC_S_DCLNAME;
 543               else
 544                 state = AC_S_BACKOUT;
 545               break;
 546             }
 547           break;
 548         case AC_S_DCLNAME:
 549           if (ch == '-')
 550             state = AC_S_DASH1;
 551           else if (NAME_CHAR_P (ch))
 552             ch = *p++;
 553           else
 554             state = AC_S_DEFAULT;
 555           break;
 556         case AC_S_QUOTE1:
 557           /* We must use 0x22 because broken assert macros choke on
 558              '"' and '\"'.  */
 559           assert (ch == '\'' || ch == 0x22);
 560           quote_char = ch;      /* cheating -- I really don't feel like
 561                                    introducing more different states for
 562                                    different quote characters. */
 563           ch = *p++;
 564           state = AC_S_IN_QUOTE;
 565           break;
 566         case AC_S_IN_QUOTE:
 567           if (ch == quote_char)
 568             state = AC_S_QUOTE2;
 569           else
 570             ch = *p++;
 571           break;
 572         case AC_S_QUOTE2:
 573           assert (ch == quote_char);
 574           ch = *p++;
 575           state = AC_S_DEFAULT;
 576           break;
 577         case AC_S_DASH1:
 578           assert (ch == '-');
 579           ch = *p++;
 580           state = AC_S_DASH2;
 581           break;
 582         case AC_S_DASH2:
 583           switch (ch)
 584             {
 585             case '-':
 586               ch = *p++;
 587               state = AC_S_COMMENT;
 588               break;
 589             default:
 590               state = AC_S_BACKOUT;
 591             }
 592           break;
 593         case AC_S_COMMENT:
 594           switch (ch)
 595             {
 596             case '-':
 597               state = AC_S_DASH3;
 598               break;
 599             default:
 600               ch = *p++;
 601               break;
 602             }
 603           break;
 604         case AC_S_DASH3:
 605           assert (ch == '-');
 606           ch = *p++;
 607           state = AC_S_DASH4;
 608           break;
 609         case AC_S_DASH4:
 610           switch (ch)
 611             {
 612             case '-':
 613               ch = *p++;
 614               state = AC_S_DEFAULT;
 615               break;
 616             default:
 617               state = AC_S_COMMENT;
 618               break;
 619             }
 620           break;
 621         }
 622     }
 623
 624   if (state == AC_S_BACKOUT)
 625     {
 626 #ifdef STANDALONE
 627       ++comment_backout_count;
 628 #endif
 629       return beg + 1;
 630     }
 631   return p;
 632 }
 633
 634 /* Find the first occurrence of the substring "-->" in [BEG, END) and
 635    return the pointer to the character after the substring.  If the
 636    substring is not found, return NULL.  */
 637
 638 static const char *
 639 find_comment_end (const char *beg, const char *end)
 640 {
 641   /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
 642      if it's not '>' or '-', advance by three characters.  Otherwise,
 643      look at the preceding characters and try to find a match.  */
 644
 645   const char *p = beg - 1;
 646
 647   while ((p += 3) < end)
 648     switch (p[0])
 649       {
 650       case '>':
 651         if (p[-1] == '-' && p[-2] == '-')
 652           return p + 1;
 653         break;
 654       case '-':
 655       at_dash:
 656         if (p[-1] == '-')
 657           {
 658           at_dash_dash:
 659             if (++p == end) return NULL;
 660             switch (p[0])
 661               {
 662               case '>': return p + 1;
 663               case '-': goto at_dash_dash;
 664               }
 665           }
 666         else
 667           {
 668             if ((p += 2) >= end) return NULL;
 669             switch (p[0])
 670               {
 671               case '>':
 672                 if (p[-1] == '-')
 673                   return p + 1;
 674                 break;
 675               case '-':
 676                 goto at_dash;
 677               }
 678           }
 679       }
 680   return NULL;
 681 }
 682 \f
 683 /* Return non-zero of the string inside [b, e) are present in hash
 684    table HT.  */
 685
 686 static int
 687 name_allowed (const struct hash_table *ht, const char *b, const char *e)
 688 {
 689   char *copy;
 690   if (!ht)
 691     return 1;
 692   BOUNDED_TO_ALLOCA (b, e, copy);
 693   return hash_table_get (ht, copy) != NULL;
 694 }
 695
 696 /* Advance P (a char pointer), with the explicit intent of being able
 697    to read the next character.  If this is not possible, go to finish.  */
 698
 699 #define ADVANCE(p) do {                         \
 700   ++p;                                          \
 701   if (p >= end)                                 \
 702     goto finish;                                \
 703 } while (0)
 704
 705 /* Skip whitespace, if any. */
 706
 707 #define SKIP_WS(p) do {                         \
 708   while (ISSPACE (*p)) {                        \
 709     ADVANCE (p);                                \
 710   }                                             \
 711 } while (0)
 712
 713 /* Skip non-whitespace, if any. */
 714
 715 #define SKIP_NON_WS(p) do {                     \
 716   while (!ISSPACE (*p)) {                       \
 717     ADVANCE (p);                                \
 718   }                                             \
 719 } while (0)
 720
 721 #ifdef STANDALONE
 722 static int tag_backout_count;
 723 #endif
 724
 725 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
 726    MAPFUN will be called with two arguments: pointer to an initialized
 727    struct taginfo, and MAPARG.
 728
 729    ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of
 730    which are the tags and attribute names that this function should
 731    use.  If ALLOWED_TAGS is NULL, all tags are processed; if
 732    ALLOWED_ATTRIBUTES is NULL, all attributes are returned.
 733
 734    (Obviously, the caller can filter out unwanted tags and attributes
 735    just as well, but this is just an optimization designed to avoid
 736    unnecessary copying of tags/attributes which the caller doesn't
 737    care about.)  */
 738
 739 void
 740 map_html_tags (const char *text, int size,
 741                void (*mapfun) (struct taginfo *, void *), void *maparg,
 742                int flags,
 743                const struct hash_table *allowed_tags,
 744                const struct hash_table *allowed_attributes)
 745 {
 746   /* storage for strings passed to MAPFUN callback; if 256 bytes is
 747      too little, POOL_APPEND allocates more with malloc. */
 748   char pool_initial_storage[256];
 749   struct pool pool;
 750
 751   const char *p = text;
 752   const char *end = text + size;
 753
 754   struct attr_pair attr_pair_initial_storage[8];
 755   int attr_pair_size = countof (attr_pair_initial_storage);
 756   int attr_pair_resized = 0;
 757   struct attr_pair *pairs = attr_pair_initial_storage;
 758
 759   if (!size)
 760     return;
 761
 762   POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));
 763
 764   {
 765     int nattrs, end_tag;
 766     const char *tag_name_begin, *tag_name_end;
 767     const char *tag_start_position;
 768     int uninteresting_tag;
 769
 770   look_for_tag:
 771     POOL_REWIND (&pool);
 772
 773     nattrs = 0;
 774     end_tag = 0;
 775
 776     /* Find beginning of tag.  We use memchr() instead of the usual
 777        looping with ADVANCE() for speed. */
 778     p = memchr (p, '<', end - p);
 779     if (!p)
 780       goto finish;
 781
 782     tag_start_position = p;
 783     ADVANCE (p);
 784
 785     /* Establish the type of the tag (start-tag, end-tag or
 786        declaration).  */
 787     if (*p == '!')
 788       {
 789         if (!(flags & MHT_STRICT_COMMENTS)
 790             && p < end + 3 && p[1] == '-' && p[2] == '-')
 791           {
 792             /* If strict comments are not enforced and if we know
 793                we're looking at a comment, simply look for the
 794                terminating "-->".  Non-strict is the default because
 795                it works in other browsers and most HTML writers can't
 796                be bothered with getting the comments right.  */
 797             const char *comment_end = find_comment_end (p + 3, end);
 798             if (comment_end)
 799               p = comment_end;
 800           }
 801         else
 802           {
 803             /* Either in strict comment mode or looking at a non-empty
 804                declaration.  Real declarations are much less likely to
 805                be misused the way comments are, so advance over them
 806                properly regardless of strictness.  */
 807             p = advance_declaration (p, end);
 808           }
 809         if (p == end)
 810           goto finish;
 811         goto look_for_tag;
 812       }
 813     else if (*p == '/')
 814       {
 815         end_tag = 1;
 816         ADVANCE (p);
 817       }
 818     tag_name_begin = p;
 819     while (NAME_CHAR_P (*p))
 820       ADVANCE (p);
 821     if (p == tag_name_begin)
 822       goto look_for_tag;
 823     tag_name_end = p;
 824     SKIP_WS (p);
 825     if (end_tag && *p != '>')
 826       goto backout_tag;
 827
 828     if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
 829       /* We can't just say "goto look_for_tag" here because we need
 830          the loop below to properly advance over the tag's attributes.  */
 831       uninteresting_tag = 1;
 832     else
 833       {
 834         uninteresting_tag = 0;
 835         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
 836       }
 837
 838     /* Find the attributes. */
 839     while (1)
 840       {
 841         const char *attr_name_begin, *attr_name_end;
 842         const char *attr_value_begin, *attr_value_end;
 843         const char *attr_raw_value_begin, *attr_raw_value_end;
 844         int operation = AP_DOWNCASE; /* stupid compiler. */
 845
 846         SKIP_WS (p);
 847
 848         if (*p == '/')
 849           {
 850             /* A slash at this point means the tag is about to be
 851                closed.  This is legal in XML and has been popularized
 852                in HTML via XHTML.  */
 853             /* <foo a=b c=d /> */
 854             /*              ^  */
 855             ADVANCE (p);
 856             SKIP_WS (p);
 857             if (*p != '>')
 858               goto backout_tag;
 859           }
 860
 861         /* Check for end of tag definition. */
 862         if (*p == '>')
 863           break;
 864
 865         /* Establish bounds of attribute name. */
 866         attr_name_begin = p;    /* <foo bar ...> */
 867                                 /*      ^        */
 868         while (NAME_CHAR_P (*p))
 869           ADVANCE (p);
 870         attr_name_end = p;      /* <foo bar ...> */
 871                                 /*         ^     */
 872         if (attr_name_begin == attr_name_end)
 873           goto backout_tag;
 874
 875         /* Establish bounds of attribute value. */
 876         SKIP_WS (p);
 877         if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
 878           {
 879             /* Minimized attribute syntax allows `=' to be omitted.
 880                For example, <UL COMPACT> is a valid shorthand for <UL
 881                COMPACT="compact">.  Even if such attributes are not
 882                useful to Wget, we need to support them, so that the
 883                tags containing them can be parsed correctly. */
 884             attr_raw_value_begin = attr_value_begin = attr_name_begin;
 885             attr_raw_value_end = attr_value_end = attr_name_end;
 886           }
 887         else if (*p == '=')
 888           {
 889             ADVANCE (p);
 890             SKIP_WS (p);
 891             if (*p == '\"' || *p == '\'')
 892               {
 893                 int newline_seen = 0;
 894                 char quote_char = *p;
 895                 attr_raw_value_begin = p;
 896                 ADVANCE (p);
 897                 attr_value_begin = p; /* <foo bar="baz"> */
 898                                       /*           ^     */
 899                 while (*p != quote_char)
 900                   {
 901                     if (!newline_seen && *p == '\n')
 902                       {
 903                         /* If a newline is seen within the quotes, it
 904                            is most likely that someone forgot to close
 905                            the quote.  In that case, we back out to
 906                            the value beginning, and terminate the tag
 907                            at either `>' or the delimiter, whichever
 908                            comes first.  Such a tag terminated at `>'
 909                            is discarded.  */
 910                         p = attr_value_begin;
 911                         newline_seen = 1;
 912                         continue;
 913                       }
 914                     else if (newline_seen && *p == '>')
 915                       break;
 916                     ADVANCE (p);
 917                   }
 918                 attr_value_end = p; /* <foo bar="baz"> */
 919                                     /*              ^  */
 920                 if (*p == quote_char)
 921                   ADVANCE (p);
 922                 else
 923                   goto look_for_tag;
 924                 attr_raw_value_end = p; /* <foo bar="baz"> */
 925                                         /*               ^ */
 926                 operation = AP_DECODE_ENTITIES;
 927                 if (flags & MHT_TRIM_VALUES)
 928                   operation |= AP_TRIM_BLANKS;
 929               }
 930             else
 931               {
 932                 attr_value_begin = p; /* <foo bar=baz> */
 933                                       /*          ^    */
 934                 /* According to SGML, a name token should consist only
 935                    of alphanumerics, . and -.  However, this is often
 936                    violated by, for instance, `%' in `width=75%'.
 937                    We'll be liberal and allow just about anything as
 938                    an attribute value.  */
 939                 while (!ISSPACE (*p) && *p != '>')
 940                   ADVANCE (p);
 941                 attr_value_end = p; /* <foo bar=baz qux=quix> */
 942                                     /*             ^          */
 943                 if (attr_value_begin == attr_value_end)
 944                   /* <foo bar=> */
 945                   /*          ^ */
 946                   goto backout_tag;
 947                 attr_raw_value_begin = attr_value_begin;
 948                 attr_raw_value_end = attr_value_end;
 949                 operation = AP_DECODE_ENTITIES;
 950               }
 951           }
 952         else
 953           {
 954             /* We skipped the whitespace and found something that is
 955                neither `=' nor the beginning of the next attribute's
 956                name.  Back out.  */
 957             goto backout_tag;   /* <foo bar [... */
 958                                 /*          ^    */
 959           }
 960
 961         /* If we're not interested in the tag, don't bother with any
 962            of the attributes.  */
 963         if (uninteresting_tag)
 964           continue;
 965
 966         /* If we aren't interested in the attribute, skip it.  We
 967            cannot do this test any sooner, because our text pointer
 968            needs to correctly advance over the attribute.  */
 969         if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
 970           continue;
 971
 972         GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
 973                     struct attr_pair);
 974
 975         pairs[nattrs].name_pool_index = pool.tail;
 976         convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
 977
 978         pairs[nattrs].value_pool_index = pool.tail;
 979         convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
 980         pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
 981         pairs[nattrs].value_raw_size = (attr_raw_value_end
 982                                         - attr_raw_value_begin);
 983         ++nattrs;
 984       }
 985
 986     if (uninteresting_tag)
 987       {
 988         ADVANCE (p);
 989         goto look_for_tag;
 990       }
 991
 992     /* By now, we have a valid tag with a name and zero or more
 993        attributes.  Fill in the data and call the mapper function.  */
 994     {
 995       int i;
 996       struct taginfo taginfo;
 997
 998       taginfo.name      = pool.contents;
 999       taginfo.end_tag_p = end_tag;
1000       taginfo.nattrs    = nattrs;
1001       /* We fill in the char pointers only now, when pool can no
1002          longer get realloc'ed.  If we did that above, we could get
1003          hosed by reallocation.  Obviously, after this point, the pool
1004          may no longer be grown.  */
1005       for (i = 0; i < nattrs; i++)
1006         {
1007           pairs[i].name = pool.contents + pairs[i].name_pool_index;
1008           pairs[i].value = pool.contents + pairs[i].value_pool_index;
1009         }
1010       taginfo.attrs = pairs;
1011       taginfo.start_position = tag_start_position;
1012       taginfo.end_position   = p + 1;
1013       mapfun (&taginfo, maparg);
1014       ADVANCE (p);
1015     }
1016     goto look_for_tag;
1017
1018   backout_tag:
1019 #ifdef STANDALONE
1020     ++tag_backout_count;
1021 #endif
1022     /* The tag wasn't really a tag.  Treat its contents as ordinary
1023        data characters. */
1024     p = tag_start_position + 1;
1025     goto look_for_tag;
1026   }
1027
1028  finish:
1029   POOL_FREE (&pool);
1030   if (attr_pair_resized)
1031     xfree (pairs);
1032 }
1033
1034 #undef ADVANCE
1035 #undef SKIP_WS
1036 #undef SKIP_NON_WS
1037 \f
1038 #ifdef STANDALONE
1039 static void
1040 test_mapper (struct taginfo *taginfo, void *arg)
1041 {
1042   int i;
1043
1044   printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
1045   for (i = 0; i < taginfo->nattrs; i++)
1046     printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
1047   putchar ('\n');
1048   ++*(int *)arg;
1049 }
1050
1051 int main ()
1052 {
1053   int size = 256;
1054   char *x = xmalloc (size);
1055   int length = 0;
1056   int read_count;
1057   int tag_counter = 0;
1058
1059   while ((read_count = fread (x + length, 1, size - length, stdin)))
1060     {
1061       length += read_count;
1062       size <<= 1;
1063       x = xrealloc (x, size);
1064     }
1065
1066   map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);
1067   printf ("TAGS: %d\n", tag_counter);
1068   printf ("Tag backouts:     %d\n", tag_backout_count);
1069   printf ("Comment backouts: %d\n", comment_backout_count);
1070   return 0;
1071 }
1072 #endif /* STANDALONE */