sjero.net Git - wget/blob - src/html-parse.c

   1 /* HTML parser for Wget.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 /* The only entry point to this module is map_html_tags(), which see.  */
  32
  33 /* TODO:
  34
  35    - Allow hooks for callers to process contents outside tags.  This
  36      is needed to implement handling <style> and <script>.  The
  37      taginfo structure already carries the information about where the
  38      tags are, but this is not enough, because one would also want to
  39      skip the comments.  (The funny thing is that for <style> and
  40      <script> you *don't* want to skip comments!)
  41
  42    - Create a test suite for regression testing. */
  43
  44 /* HISTORY:
  45
  46    This is the third HTML parser written for Wget.  The first one was
  47    written some time during the Geturl 1.0 beta cycle, and was very
  48    inefficient and buggy.  It also contained some very complex code to
  49    remember a list of parser states, because it was supposed to be
  50    reentrant.
  51
  52    The second HTML parser was written for Wget 1.4 (the first version
  53    by the name `Wget'), and was a complete rewrite.  Although the new
  54    parser behaved much better and made no claims of reentrancy, it
  55    still shared many of the fundamental flaws of the old version -- it
  56    only regarded HTML in terms tag-attribute pairs, where the
  57    attribute's value was a URL to be returned.  Any other property of
  58    HTML, such as <base href=...>, or strange way to specify a URL,
  59    such as <meta http-equiv=Refresh content="0; URL=..."> had to be
  60    crudely hacked in -- and the caller had to be aware of these hacks.
  61    Like its predecessor, this parser did not support HTML comments.
  62
  63    After Wget 1.5.1 was released, I set out to write a third HTML
  64    parser.  The objectives of the new parser were to: (1) provide a
  65    clean way to analyze HTML lexically, (2) separate interpretation of
  66    the markup from the parsing process, (3) be as correct as possible,
  67    e.g. correctly skipping comments and other SGML declarations, (4)
  68    understand the most common errors in markup and skip them or be
  69    relaxed towrds them, and (5) be reasonably efficient (no regexps,
  70    minimum copying and minimum or no heap allocation).
  71
  72    I believe this parser meets all of the above goals.  It is
  73    reasonably well structured, and could be relatively easily
  74    separated from Wget and used elsewhere.  While some of its
  75    intrinsic properties limit its value as a general-purpose HTML
  76    parser, I believe that, with minimum modifications, it could serve
  77    as a backend for one.
  78
  79    Due to time and other constraints, this parser was not integrated
  80    into Wget until the version 1.7. */
  81
  82 /* DESCRIPTION:
  83
  84    The single entry point of this parser is map_html_tags(), which
  85    works by calling a function you specify for each tag.  The function
  86    gets called with the pointer to a structure describing the tag and
  87    its attributes.  */
  88
  89 /* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
  90    still need Wget headers to compile.  */
  91
  92 #define USE_GNULIB_ALLOC
  93
  94 #include "wget.h"
  95
  96 #ifdef STANDALONE
  97 # define I_REALLY_WANT_CTYPE_MACROS
  98 #endif
  99
 100 #include <stdio.h>
 101 #include <stdlib.h>
 102 #include <string.h>
 103 #include <assert.h>
 104
 105 #include "utils.h"
 106 #include "html-parse.h"
 107
 108 #ifdef STANDALONE
 109 # undef xmalloc
 110 # undef xrealloc
 111 # undef xfree
 112 # define xmalloc malloc
 113 # define xrealloc realloc
 114 # define xfree free
 115
 116 # undef c_isspace
 117 # undef c_isdigit
 118 # undef c_isxdigit
 119 # undef c_isalpha
 120 # undef c_isalnum
 121 # undef c_tolower
 122 # undef c_toupper
 123
 124 # define c_isspace(x) isspace (x)
 125 # define c_isdigit(x) isdigit (x)
 126 # define c_isxdigit(x) isxdigit (x)
 127 # define c_isalpha(x) isalpha (x)
 128 # define c_isalnum(x) isalnum (x)
 129 # define c_tolower(x) tolower (x)
 130 # define c_toupper(x) toupper (x)
 131
 132 struct hash_table {
 133   int dummy;
 134 };
 135 static void *
 136 hash_table_get (const struct hash_table *ht, void *ptr)
 137 {
 138   return ptr;
 139 }
 140 #else  /* not STANDALONE */
 141 # include "hash.h"
 142 #endif
 143
 144 /* Pool support.  A pool is a resizable chunk of memory.  It is first
 145    allocated on the stack, and moved to the heap if it needs to be
 146    larger than originally expected.  map_html_tags() uses it to store
 147    the zero-terminated names and values of tags and attributes.
 148
 149    Thus taginfo->name, and attr->name and attr->value for each
 150    attribute, do not point into separately allocated areas, but into
 151    different parts of the pool, separated only by terminating zeros.
 152    This ensures minimum amount of allocation and, for most tags, no
 153    allocation because the entire pool is kept on the stack.  */
 154
 155 struct pool {
 156   char *contents;               /* pointer to the contents. */
 157   int size;                     /* size of the pool. */
 158   int tail;                     /* next available position index. */
 159   bool resized;                 /* whether the pool has been resized
 160                                    using malloc. */
 161
 162   char *orig_contents;          /* original pool contents, usually
 163                                    stack-allocated.  used by POOL_FREE
 164                                    to restore the pool to the initial
 165                                    state. */
 166   int orig_size;
 167 };
 168
 169 /* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
 170
 171 #define POOL_INIT(p, initial_storage, initial_size) do {        \
 172   struct pool *P = (p);                                         \
 173   P->contents = (initial_storage);                              \
 174   P->size = (initial_size);                                     \
 175   P->tail = 0;                                                  \
 176   P->resized = false;                                           \
 177   P->orig_contents = P->contents;                               \
 178   P->orig_size = P->size;                                       \
 179 } while (0)
 180
 181 /* Grow the pool to accomodate at least SIZE new bytes.  If the pool
 182    already has room to accomodate SIZE bytes of data, this is a no-op.  */
 183
 184 #define POOL_GROW(p, increase)                                  \
 185   GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + (increase), \
 186               (p)->resized, char)
 187
 188 /* Append text in the range [beg, end) to POOL.  No zero-termination
 189    is done.  */
 190
 191 #define POOL_APPEND(p, beg, end) do {                   \
 192   const char *PA_beg = (beg);                           \
 193   int PA_size = (end) - PA_beg;                         \
 194   POOL_GROW (p, PA_size);                               \
 195   memcpy ((p)->contents + (p)->tail, PA_beg, PA_size);  \
 196   (p)->tail += PA_size;                                 \
 197 } while (0)
 198
 199 /* Append one character to the pool.  Can be used to zero-terminate
 200    pool strings.  */
 201
 202 #define POOL_APPEND_CHR(p, ch) do {             \
 203   char PAC_char = (ch);                         \
 204   POOL_GROW (p, 1);                             \
 205   (p)->contents[(p)->tail++] = PAC_char;        \
 206 } while (0)
 207
 208 /* Forget old pool contents.  The allocated memory is not freed. */
 209 #define POOL_REWIND(p) (p)->tail = 0
 210
 211 /* Free heap-allocated memory for contents of POOL.  This calls
 212    xfree() if the memory was allocated through malloc.  It also
 213    restores `contents' and `size' to their original, pre-malloc
 214    values.  That way after POOL_FREE, the pool is fully usable, just
 215    as if it were freshly initialized with POOL_INIT.  */
 216
 217 #define POOL_FREE(p) do {                       \
 218   struct pool *P = p;                           \
 219   if (P->resized)                               \
 220     xfree (P->contents);                        \
 221   P->contents = P->orig_contents;               \
 222   P->size = P->orig_size;                       \
 223   P->tail = 0;                                  \
 224   P->resized = false;                           \
 225 } while (0)
 226
 227 /* Used for small stack-allocated memory chunks that might grow.  Like
 228    DO_REALLOC, this macro grows BASEVAR as necessary to take
 229    NEEDED_SIZE items of TYPE.
 230
 231    The difference is that on the first resize, it will use
 232    malloc+memcpy rather than realloc.  That way you can stack-allocate
 233    the initial chunk, and only resort to heap allocation if you
 234    stumble upon large data.
 235
 236    After the first resize, subsequent ones are performed with realloc,
 237    just like DO_REALLOC.  */
 238
 239 #define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do {           \
 240   long ga_needed_size = (needed_size);                                          \
 241   long ga_newsize = (sizevar);                                                  \
 242   while (ga_newsize < ga_needed_size)                                           \
 243     ga_newsize <<= 1;                                                           \
 244   if (ga_newsize != (sizevar))                                                  \
 245     {                                                                           \
 246       if (resized)                                                              \
 247         basevar = xrealloc (basevar, ga_newsize * sizeof (type));               \
 248       else                                                                      \
 249         {                                                                       \
 250           void *ga_new = xmalloc (ga_newsize * sizeof (type));                  \
 251           memcpy (ga_new, basevar, (sizevar) * sizeof (type));                  \
 252           (basevar) = ga_new;                                                   \
 253           resized = true;                                                       \
 254         }                                                                       \
 255       (sizevar) = ga_newsize;                                                   \
 256     }                                                                           \
 257 } while (0)
 258 \f
 259 /* Test whether n+1-sized entity name fits in P.  We don't support
 260    IE-style non-terminated entities, e.g. "&ltfoo" -> "<foo".
 261    However, "&lt;foo" will work, as will "&lt!foo", "&lt", etc.  In
 262    other words an entity needs to be terminated by either a
 263    non-alphanumeric or the end of string.  */
 264 #define FITS(p, n) (p + n == end || (p + n < end && !c_isalnum (p[n])))
 265
 266 /* Macros that test entity names by returning true if P is followed by
 267    the specified characters.  */
 268 #define ENT1(p, c0) (FITS (p, 1) && p[0] == c0)
 269 #define ENT2(p, c0, c1) (FITS (p, 2) && p[0] == c0 && p[1] == c1)
 270 #define ENT3(p, c0, c1, c2) (FITS (p, 3) && p[0]==c0 && p[1]==c1 && p[2]==c2)
 271
 272 /* Increment P by INC chars.  If P lands at a semicolon, increment it
 273    past the semicolon.  This ensures that e.g. "&lt;foo" is converted
 274    to "<foo", but "&lt,foo" to "<,foo".  */
 275 #define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)
 276
 277 /* Decode the HTML character entity at *PTR, considering END to be end
 278    of buffer.  It is assumed that the "&" character that marks the
 279    beginning of the entity has been seen at *PTR-1.  If a recognized
 280    ASCII entity is seen, it is returned, and *PTR is moved to the end
 281    of the entity.  Otherwise, -1 is returned and *PTR left unmodified.
 282
 283    The recognized entities are: &lt, &gt, &amp, &apos, and &quot.  */
 284
 285 static int
 286 decode_entity (const char **ptr, const char *end)
 287 {
 288   const char *p = *ptr;
 289   int value = -1;
 290
 291   if (++p == end)
 292     return -1;
 293
 294   switch (*p++)
 295     {
 296     case '#':
 297       /* Process numeric entities "&#DDD;" and "&#xHH;".  */
 298       {
 299         int digits = 0;
 300         value = 0;
 301         if (*p == 'x')
 302           for (++p; value < 256 && p < end && c_isxdigit (*p); p++, digits++)
 303             value = (value << 4) + XDIGIT_TO_NUM (*p);
 304         else
 305           for (; value < 256 && p < end && c_isdigit (*p); p++, digits++)
 306             value = (value * 10) + (*p - '0');
 307         if (!digits)
 308           return -1;
 309         /* Don't interpret 128+ codes and NUL because we cannot
 310            portably reinserted them into HTML.  */
 311         if (!value || (value & ~0x7f))
 312           return -1;
 313         *ptr = SKIP_SEMI (p, 0);
 314         return value;
 315       }
 316     /* Process named ASCII entities.  */
 317     case 'g':
 318       if (ENT1 (p, 't'))
 319         value = '>', *ptr = SKIP_SEMI (p, 1);
 320       break;
 321     case 'l':
 322       if (ENT1 (p, 't'))
 323         value = '<', *ptr = SKIP_SEMI (p, 1);
 324       break;
 325     case 'a':
 326       if (ENT2 (p, 'm', 'p'))
 327         value = '&', *ptr = SKIP_SEMI (p, 2);
 328       else if (ENT3 (p, 'p', 'o', 's'))
 329         /* handle &apos for the sake of the XML/XHTML crowd. */
 330         value = '\'', *ptr = SKIP_SEMI (p, 3);
 331       break;
 332     case 'q':
 333       if (ENT3 (p, 'u', 'o', 't'))
 334         value = '\"', *ptr = SKIP_SEMI (p, 3);
 335       break;
 336     }
 337   return value;
 338 }
 339 #undef ENT1
 340 #undef ENT2
 341 #undef ENT3
 342 #undef FITS
 343 #undef SKIP_SEMI
 344
 345 enum {
 346   AP_DOWNCASE           = 1,
 347   AP_DECODE_ENTITIES    = 2,
 348   AP_TRIM_BLANKS        = 4
 349 };
 350
 351 /* Copy the text in the range [BEG, END) to POOL, optionally
 352    performing operations specified by FLAGS.  FLAGS may be any
 353    combination of AP_DOWNCASE, AP_DECODE_ENTITIES and AP_TRIM_BLANKS
 354    with the following meaning:
 355
 356    * AP_DOWNCASE -- downcase all the letters;
 357
 358    * AP_DECODE_ENTITIES -- decode the named and numeric entities in
 359      the ASCII range when copying the string.
 360
 361    * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
 362      of text, as well as embedded newlines.  */
 363
 364 static void
 365 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
 366 {
 367   int old_tail = pool->tail;
 368
 369   /* Skip blanks if required.  We must do this before entities are
 370      processed, so that blanks can still be inserted as, for instance,
 371      `&#32;'.  */
 372   if (flags & AP_TRIM_BLANKS)
 373     {
 374       while (beg < end && c_isspace (*beg))
 375         ++beg;
 376       while (end > beg && c_isspace (end[-1]))
 377         --end;
 378     }
 379
 380   if (flags & AP_DECODE_ENTITIES)
 381     {
 382       /* Grow the pool, then copy the text to the pool character by
 383          character, processing the encountered entities as we go
 384          along.
 385
 386          It's safe (and necessary) to grow the pool in advance because
 387          processing the entities can only *shorten* the string, it can
 388          never lengthen it.  */
 389       const char *from = beg;
 390       char *to;
 391       bool squash_newlines = !!(flags & AP_TRIM_BLANKS);
 392
 393       POOL_GROW (pool, end - beg);
 394       to = pool->contents + pool->tail;
 395
 396       while (from < end)
 397         {
 398           if (*from == '&')
 399             {
 400               int entity = decode_entity (&from, end);
 401               if (entity != -1)
 402                 *to++ = entity;
 403               else
 404                 *to++ = *from++;
 405             }
 406           else if ((*from == '\n' || *from == '\r') && squash_newlines)
 407             ++from;
 408           else
 409             *to++ = *from++;
 410         }
 411       /* Verify that we haven't exceeded the original size.  (It
 412          shouldn't happen, hence the assert.)  */
 413       assert (to - (pool->contents + pool->tail) <= end - beg);
 414
 415       /* Make POOL's tail point to the position following the string
 416          we've written.  */
 417       pool->tail = to - pool->contents;
 418       POOL_APPEND_CHR (pool, '\0');
 419     }
 420   else
 421     {
 422       /* Just copy the text to the pool.  */
 423       POOL_APPEND (pool, beg, end);
 424       POOL_APPEND_CHR (pool, '\0');
 425     }
 426
 427   if (flags & AP_DOWNCASE)
 428     {
 429       char *p = pool->contents + old_tail;
 430       for (; *p; p++)
 431         *p = c_tolower (*p);
 432     }
 433 }
 434 \f
 435 /* Originally we used to adhere to rfc 1866 here, and allowed only
 436    letters, digits, periods, and hyphens as names (of tags or
 437    attributes).  However, this broke too many pages which used
 438    proprietary or strange attributes, e.g. <img src="a.gif"
 439    v:shapes="whatever">.
 440
 441    So now we allow any character except:
 442      * whitespace
 443      * 8-bit and control chars
 444      * characters that clearly cannot be part of name:
 445        '=', '>', '/'.
 446
 447    This only affects attribute and tag names; attribute values allow
 448    an even greater variety of characters.  */
 449
 450 #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
 451                         && (x) != '=' && (x) != '>' && (x) != '/')
 452
 453 #ifdef STANDALONE
 454 static int comment_backout_count;
 455 #endif
 456
 457 /* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
 458    strict comments mode, this is used for skipping over comments as
 459    well.
 460
 461    To recap: any SGML declaration may have comments associated with
 462    it, e.g.
 463        <!MY-DECL -- isn't this fun? -- foo bar>
 464
 465    An HTML comment is merely an empty declaration (<!>) with a comment
 466    attached, like this:
 467        <!-- some stuff here -->
 468
 469    Several comments may be embedded in one comment declaration:
 470        <!-- have -- -- fun -->
 471
 472    Whitespace is allowed between and after the comments, but not
 473    before the first comment.  Additionally, this function attempts to
 474    handle double quotes in SGML declarations correctly.  */
 475
 476 static const char *
 477 advance_declaration (const char *beg, const char *end)
 478 {
 479   const char *p = beg;
 480   char quote_char = '\0';       /* shut up, gcc! */
 481   char ch;
 482
 483   enum {
 484     AC_S_DONE,
 485     AC_S_BACKOUT,
 486     AC_S_BANG,
 487     AC_S_DEFAULT,
 488     AC_S_DCLNAME,
 489     AC_S_DASH1,
 490     AC_S_DASH2,
 491     AC_S_COMMENT,
 492     AC_S_DASH3,
 493     AC_S_DASH4,
 494     AC_S_QUOTE1,
 495     AC_S_IN_QUOTE,
 496     AC_S_QUOTE2
 497   } state = AC_S_BANG;
 498
 499   if (beg == end)
 500     return beg;
 501   ch = *p++;
 502
 503   /* It looked like a good idea to write this as a state machine, but
 504      now I wonder...  */
 505
 506   while (state != AC_S_DONE && state != AC_S_BACKOUT)
 507     {
 508       if (p == end)
 509         state = AC_S_BACKOUT;
 510       switch (state)
 511         {
 512         case AC_S_DONE:
 513         case AC_S_BACKOUT:
 514           break;
 515         case AC_S_BANG:
 516           if (ch == '!')
 517             {
 518               ch = *p++;
 519               state = AC_S_DEFAULT;
 520             }
 521           else
 522             state = AC_S_BACKOUT;
 523           break;
 524         case AC_S_DEFAULT:
 525           switch (ch)
 526             {
 527             case '-':
 528               state = AC_S_DASH1;
 529               break;
 530             case ' ':
 531             case '\t':
 532             case '\r':
 533             case '\n':
 534               ch = *p++;
 535               break;
 536             case '>':
 537               state = AC_S_DONE;
 538               break;
 539             case '\'':
 540             case '\"':
 541               state = AC_S_QUOTE1;
 542               break;
 543             default:
 544               if (NAME_CHAR_P (ch))
 545                 state = AC_S_DCLNAME;
 546               else
 547                 state = AC_S_BACKOUT;
 548               break;
 549             }
 550           break;
 551         case AC_S_DCLNAME:
 552           if (ch == '-')
 553             state = AC_S_DASH1;
 554           else if (NAME_CHAR_P (ch))
 555             ch = *p++;
 556           else
 557             state = AC_S_DEFAULT;
 558           break;
 559         case AC_S_QUOTE1:
 560           /* We must use 0x22 because broken assert macros choke on
 561              '"' and '\"'.  */
 562           assert (ch == '\'' || ch == 0x22);
 563           quote_char = ch;      /* cheating -- I really don't feel like
 564                                    introducing more different states for
 565                                    different quote characters. */
 566           ch = *p++;
 567           state = AC_S_IN_QUOTE;
 568           break;
 569         case AC_S_IN_QUOTE:
 570           if (ch == quote_char)
 571             state = AC_S_QUOTE2;
 572           else
 573             ch = *p++;
 574           break;
 575         case AC_S_QUOTE2:
 576           assert (ch == quote_char);
 577           ch = *p++;
 578           state = AC_S_DEFAULT;
 579           break;
 580         case AC_S_DASH1:
 581           assert (ch == '-');
 582           ch = *p++;
 583           state = AC_S_DASH2;
 584           break;
 585         case AC_S_DASH2:
 586           switch (ch)
 587             {
 588             case '-':
 589               ch = *p++;
 590               state = AC_S_COMMENT;
 591               break;
 592             default:
 593               state = AC_S_BACKOUT;
 594             }
 595           break;
 596         case AC_S_COMMENT:
 597           switch (ch)
 598             {
 599             case '-':
 600               state = AC_S_DASH3;
 601               break;
 602             default:
 603               ch = *p++;
 604               break;
 605             }
 606           break;
 607         case AC_S_DASH3:
 608           assert (ch == '-');
 609           ch = *p++;
 610           state = AC_S_DASH4;
 611           break;
 612         case AC_S_DASH4:
 613           switch (ch)
 614             {
 615             case '-':
 616               ch = *p++;
 617               state = AC_S_DEFAULT;
 618               break;
 619             default:
 620               state = AC_S_COMMENT;
 621               break;
 622             }
 623           break;
 624         }
 625     }
 626
 627   if (state == AC_S_BACKOUT)
 628     {
 629 #ifdef STANDALONE
 630       ++comment_backout_count;
 631 #endif
 632       return beg + 1;
 633     }
 634   return p;
 635 }
 636
 637 /* Find the first occurrence of the substring "-->" in [BEG, END) and
 638    return the pointer to the character after the substring.  If the
 639    substring is not found, return NULL.  */
 640
 641 static const char *
 642 find_comment_end (const char *beg, const char *end)
 643 {
 644   /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
 645      if it's not '>' or '-', advance by three characters.  Otherwise,
 646      look at the preceding characters and try to find a match.  */
 647
 648   const char *p = beg - 1;
 649
 650   while ((p += 3) < end)
 651     switch (p[0])
 652       {
 653       case '>':
 654         if (p[-1] == '-' && p[-2] == '-')
 655           return p + 1;
 656         break;
 657       case '-':
 658       at_dash:
 659         if (p[-1] == '-')
 660           {
 661           at_dash_dash:
 662             if (++p == end) return NULL;
 663             switch (p[0])
 664               {
 665               case '>': return p + 1;
 666               case '-': goto at_dash_dash;
 667               }
 668           }
 669         else
 670           {
 671             if ((p += 2) >= end) return NULL;
 672             switch (p[0])
 673               {
 674               case '>':
 675                 if (p[-1] == '-')
 676                   return p + 1;
 677                 break;
 678               case '-':
 679                 goto at_dash;
 680               }
 681           }
 682       }
 683   return NULL;
 684 }
 685 \f
 686 /* Return true if the string containing of characters inside [b, e) is
 687    present in hash table HT.  */
 688
 689 static bool
 690 name_allowed (const struct hash_table *ht, const char *b, const char *e)
 691 {
 692   char *copy;
 693   if (!ht)
 694     return true;
 695   BOUNDED_TO_ALLOCA (b, e, copy);
 696   return hash_table_get (ht, copy) != NULL;
 697 }
 698
 699 /* Advance P (a char pointer), with the explicit intent of being able
 700    to read the next character.  If this is not possible, go to finish.  */
 701
 702 #define ADVANCE(p) do {                         \
 703   ++p;                                          \
 704   if (p >= end)                                 \
 705     goto finish;                                \
 706 } while (0)
 707
 708 /* Skip whitespace, if any. */
 709
 710 #define SKIP_WS(p) do {                         \
 711   while (c_isspace (*p)) {                        \
 712     ADVANCE (p);                                \
 713   }                                             \
 714 } while (0)
 715
 716 /* Skip non-whitespace, if any. */
 717
 718 #define SKIP_NON_WS(p) do {                     \
 719   while (!c_isspace (*p)) {                       \
 720     ADVANCE (p);                                \
 721   }                                             \
 722 } while (0)
 723
 724 #ifdef STANDALONE
 725 static int tag_backout_count;
 726 #endif
 727
 728 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
 729    MAPFUN will be called with two arguments: pointer to an initialized
 730    struct taginfo, and MAPARG.
 731
 732    ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of
 733    which are the tags and attribute names that this function should
 734    use.  If ALLOWED_TAGS is NULL, all tags are processed; if
 735    ALLOWED_ATTRIBUTES is NULL, all attributes are returned.
 736
 737    (Obviously, the caller can filter out unwanted tags and attributes
 738    just as well, but this is just an optimization designed to avoid
 739    unnecessary copying of tags/attributes which the caller doesn't
 740    care about.)  */
 741
 742 void
 743 map_html_tags (const char *text, int size,
 744                void (*mapfun) (struct taginfo *, void *), void *maparg,
 745                int flags,
 746                const struct hash_table *allowed_tags,
 747                const struct hash_table *allowed_attributes)
 748 {
 749   /* storage for strings passed to MAPFUN callback; if 256 bytes is
 750      too little, POOL_APPEND allocates more with malloc. */
 751   char pool_initial_storage[256];
 752   struct pool pool;
 753
 754   const char *p = text;
 755   const char *end = text + size;
 756
 757   struct attr_pair attr_pair_initial_storage[8];
 758   int attr_pair_size = countof (attr_pair_initial_storage);
 759   bool attr_pair_resized = false;
 760   struct attr_pair *pairs = attr_pair_initial_storage;
 761
 762   if (!size)
 763     return;
 764
 765   POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));
 766
 767   {
 768     int nattrs, end_tag;
 769     const char *tag_name_begin, *tag_name_end;
 770     const char *tag_start_position;
 771     bool uninteresting_tag;
 772
 773   look_for_tag:
 774     POOL_REWIND (&pool);
 775
 776     nattrs = 0;
 777     end_tag = 0;
 778
 779     /* Find beginning of tag.  We use memchr() instead of the usual
 780        looping with ADVANCE() for speed. */
 781     p = memchr (p, '<', end - p);
 782     if (!p)
 783       goto finish;
 784
 785     tag_start_position = p;
 786     ADVANCE (p);
 787
 788     /* Establish the type of the tag (start-tag, end-tag or
 789        declaration).  */
 790     if (*p == '!')
 791       {
 792         if (!(flags & MHT_STRICT_COMMENTS)
 793             && p < end + 3 && p[1] == '-' && p[2] == '-')
 794           {
 795             /* If strict comments are not enforced and if we know
 796                we're looking at a comment, simply look for the
 797                terminating "-->".  Non-strict is the default because
 798                it works in other browsers and most HTML writers can't
 799                be bothered with getting the comments right.  */
 800             const char *comment_end = find_comment_end (p + 3, end);
 801             if (comment_end)
 802               p = comment_end;
 803           }
 804         else
 805           {
 806             /* Either in strict comment mode or looking at a non-empty
 807                declaration.  Real declarations are much less likely to
 808                be misused the way comments are, so advance over them
 809                properly regardless of strictness.  */
 810             p = advance_declaration (p, end);
 811           }
 812         if (p == end)
 813           goto finish;
 814         goto look_for_tag;
 815       }
 816     else if (*p == '/')
 817       {
 818         end_tag = 1;
 819         ADVANCE (p);
 820       }
 821     tag_name_begin = p;
 822     while (NAME_CHAR_P (*p))
 823       ADVANCE (p);
 824     if (p == tag_name_begin)
 825       goto look_for_tag;
 826     tag_name_end = p;
 827     SKIP_WS (p);
 828     if (end_tag && *p != '>')
 829       goto backout_tag;
 830
 831     if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
 832       /* We can't just say "goto look_for_tag" here because we need
 833          the loop below to properly advance over the tag's attributes.  */
 834       uninteresting_tag = true;
 835     else
 836       {
 837         uninteresting_tag = false;
 838         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
 839       }
 840
 841     /* Find the attributes. */
 842     while (1)
 843       {
 844         const char *attr_name_begin, *attr_name_end;
 845         const char *attr_value_begin, *attr_value_end;
 846         const char *attr_raw_value_begin, *attr_raw_value_end;
 847         int operation = AP_DOWNCASE; /* stupid compiler. */
 848
 849         SKIP_WS (p);
 850
 851         if (*p == '/')
 852           {
 853             /* A slash at this point means the tag is about to be
 854                closed.  This is legal in XML and has been popularized
 855                in HTML via XHTML.  */
 856             /* <foo a=b c=d /> */
 857             /*              ^  */
 858             ADVANCE (p);
 859             SKIP_WS (p);
 860             if (*p != '>')
 861               goto backout_tag;
 862           }
 863
 864         /* Check for end of tag definition. */
 865         if (*p == '>')
 866           break;
 867
 868         /* Establish bounds of attribute name. */
 869         attr_name_begin = p;    /* <foo bar ...> */
 870                                 /*      ^        */
 871         while (NAME_CHAR_P (*p))
 872           ADVANCE (p);
 873         attr_name_end = p;      /* <foo bar ...> */
 874                                 /*         ^     */
 875         if (attr_name_begin == attr_name_end)
 876           goto backout_tag;
 877
 878         /* Establish bounds of attribute value. */
 879         SKIP_WS (p);
 880         if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
 881           {
 882             /* Minimized attribute syntax allows `=' to be omitted.
 883                For example, <UL COMPACT> is a valid shorthand for <UL
 884                COMPACT="compact">.  Even if such attributes are not
 885                useful to Wget, we need to support them, so that the
 886                tags containing them can be parsed correctly. */
 887             attr_raw_value_begin = attr_value_begin = attr_name_begin;
 888             attr_raw_value_end = attr_value_end = attr_name_end;
 889           }
 890         else if (*p == '=')
 891           {
 892             ADVANCE (p);
 893             SKIP_WS (p);
 894             if (*p == '\"' || *p == '\'')
 895               {
 896                 bool newline_seen = false;
 897                 char quote_char = *p;
 898                 attr_raw_value_begin = p;
 899                 ADVANCE (p);
 900                 attr_value_begin = p; /* <foo bar="baz"> */
 901                                       /*           ^     */
 902                 while (*p != quote_char)
 903                   {
 904                     if (!newline_seen && *p == '\n')
 905                       {
 906                         /* If a newline is seen within the quotes, it
 907                            is most likely that someone forgot to close
 908                            the quote.  In that case, we back out to
 909                            the value beginning, and terminate the tag
 910                            at either `>' or the delimiter, whichever
 911                            comes first.  Such a tag terminated at `>'
 912                            is discarded.  */
 913                         p = attr_value_begin;
 914                         newline_seen = true;
 915                         continue;
 916                       }
 917                     else if (newline_seen && *p == '>')
 918                       break;
 919                     ADVANCE (p);
 920                   }
 921                 attr_value_end = p; /* <foo bar="baz"> */
 922                                     /*              ^  */
 923                 if (*p == quote_char)
 924                   ADVANCE (p);
 925                 else
 926                   goto look_for_tag;
 927                 attr_raw_value_end = p; /* <foo bar="baz"> */
 928                                         /*               ^ */
 929                 operation = AP_DECODE_ENTITIES;
 930                 if (flags & MHT_TRIM_VALUES)
 931                   operation |= AP_TRIM_BLANKS;
 932               }
 933             else
 934               {
 935                 attr_value_begin = p; /* <foo bar=baz> */
 936                                       /*          ^    */
 937                 /* According to SGML, a name token should consist only
 938                    of alphanumerics, . and -.  However, this is often
 939                    violated by, for instance, `%' in `width=75%'.
 940                    We'll be liberal and allow just about anything as
 941                    an attribute value.  */
 942                 while (!c_isspace (*p) && *p != '>')
 943                   ADVANCE (p);
 944                 attr_value_end = p; /* <foo bar=baz qux=quix> */
 945                                     /*             ^          */
 946                 if (attr_value_begin == attr_value_end)
 947                   /* <foo bar=> */
 948                   /*          ^ */
 949                   goto backout_tag;
 950                 attr_raw_value_begin = attr_value_begin;
 951                 attr_raw_value_end = attr_value_end;
 952                 operation = AP_DECODE_ENTITIES;
 953               }
 954           }
 955         else
 956           {
 957             /* We skipped the whitespace and found something that is
 958                neither `=' nor the beginning of the next attribute's
 959                name.  Back out.  */
 960             goto backout_tag;   /* <foo bar [... */
 961                                 /*          ^    */
 962           }
 963
 964         /* If we're not interested in the tag, don't bother with any
 965            of the attributes.  */
 966         if (uninteresting_tag)
 967           continue;
 968
 969         /* If we aren't interested in the attribute, skip it.  We
 970            cannot do this test any sooner, because our text pointer
 971            needs to correctly advance over the attribute.  */
 972         if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
 973           continue;
 974
 975         GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
 976                     struct attr_pair);
 977
 978         pairs[nattrs].name_pool_index = pool.tail;
 979         convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
 980
 981         pairs[nattrs].value_pool_index = pool.tail;
 982         convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
 983         pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
 984         pairs[nattrs].value_raw_size = (attr_raw_value_end
 985                                         - attr_raw_value_begin);
 986         ++nattrs;
 987       }
 988
 989     if (uninteresting_tag)
 990       {
 991         ADVANCE (p);
 992         goto look_for_tag;
 993       }
 994
 995     /* By now, we have a valid tag with a name and zero or more
 996        attributes.  Fill in the data and call the mapper function.  */
 997     {
 998       int i;
 999       struct taginfo taginfo;
1000
1001       taginfo.name      = pool.contents;
1002       taginfo.end_tag_p = end_tag;
1003       taginfo.nattrs    = nattrs;
1004       /* We fill in the char pointers only now, when pool can no
1005          longer get realloc'ed.  If we did that above, we could get
1006          hosed by reallocation.  Obviously, after this point, the pool
1007          may no longer be grown.  */
1008       for (i = 0; i < nattrs; i++)
1009         {
1010           pairs[i].name = pool.contents + pairs[i].name_pool_index;
1011           pairs[i].value = pool.contents + pairs[i].value_pool_index;
1012         }
1013       taginfo.attrs = pairs;
1014       taginfo.start_position = tag_start_position;
1015       taginfo.end_position   = p + 1;
1016       mapfun (&taginfo, maparg);
1017       ADVANCE (p);
1018     }
1019     goto look_for_tag;
1020
1021   backout_tag:
1022 #ifdef STANDALONE
1023     ++tag_backout_count;
1024 #endif
1025     /* The tag wasn't really a tag.  Treat its contents as ordinary
1026        data characters. */
1027     p = tag_start_position + 1;
1028     goto look_for_tag;
1029   }
1030
1031  finish:
1032   POOL_FREE (&pool);
1033   if (attr_pair_resized)
1034     xfree (pairs);
1035 }
1036
1037 #undef ADVANCE
1038 #undef SKIP_WS
1039 #undef SKIP_NON_WS
1040 \f
1041 #ifdef STANDALONE
1042 static void
1043 test_mapper (struct taginfo *taginfo, void *arg)
1044 {
1045   int i;
1046
1047   printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
1048   for (i = 0; i < taginfo->nattrs; i++)
1049     printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
1050   putchar ('\n');
1051   ++*(int *)arg;
1052 }
1053
1054 int main ()
1055 {
1056   int size = 256;
1057   char *x = xmalloc (size);
1058   int length = 0;
1059   int read_count;
1060   int tag_counter = 0;
1061
1062   while ((read_count = fread (x + length, 1, size - length, stdin)))
1063     {
1064       length += read_count;
1065       size <<= 1;
1066       x = xrealloc (x, size);
1067     }
1068
1069   map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);
1070   printf ("TAGS: %d\n", tag_counter);
1071   printf ("Tag backouts:     %d\n", tag_backout_count);
1072   printf ("Comment backouts: %d\n", comment_backout_count);
1073   return 0;
1074 }
1075 #endif /* STANDALONE */