[svn] Merge of fix for bugs 20341 and 20410.

[wget] / src / html-parse.c
diff --git a/src/html-parse.c b/src/html-parse.c

index 2a09ff09c0e6cd1c9e8e2cc86867d261e31b16e3..5033f8e3926be042c3b2c44dd18c27bea6112420 100644 (file)
--- a/src/html-parse.c
+++ b/src/html-parse.c
@@ -1,11 +1,11 @@
  /* HTML parser for Wget.
-   Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc.
+   Copyright (C) 1998-2006 Free Software Foundation, Inc.
  
  This file is part of GNU Wget.
  
  GNU Wget is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or (at
+the Free Software Foundation; either version 3 of the License, or (at
  your option) any later version.
  
  GNU Wget is distributed in the hope that it will be useful,
@@ -14,8 +14,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  
  In addition, as a special exception, the Free Software Foundation
  gives permission to link the code of its release of Wget with the
@@ -96,11 +95,7 @@ so, delete this exception statement from your version.  */
  
  #include <stdio.h>
  #include <stdlib.h>
-#ifdef HAVE_STRING_H
-# include <string.h>
-#else
-# include <strings.h>
-#endif
+#include <string.h>
  #include <assert.h>
  
  #include "wget.h"
@@ -157,7 +152,7 @@ struct pool {
    char *contents;              /* pointer to the contents. */
    int size;                    /* size of the pool. */
    int tail;                    /* next available position index. */
-  int resized;                 /* whether the pool has been resized
+  bool resized;                        /* whether the pool has been resized
                                    using malloc. */
  
    char *orig_contents;         /* original pool contents, usually
@@ -174,7 +169,7 @@ struct pool {
    P->contents = (initial_storage);                             \
    P->size = (initial_size);                                    \
    P->tail = 0;                                                 \
-  P->resized = 0;                                              \
+  P->resized = false;                                          \
    P->orig_contents = P->contents;                              \
    P->orig_size = P->size;                                      \
  } while (0)
@@ -222,7 +217,7 @@ struct pool {
    P->contents = P->orig_contents;              \
    P->size = P->orig_size;                      \
    P->tail = 0;                                 \
-  P->resized = 0;                              \
+  P->resized = false;                          \
  } while (0)
  
  /* Used for small stack-allocated memory chunks that might grow.  Like
@@ -245,13 +240,13 @@ struct pool {
    if (ga_newsize != (sizevar))                                                 \
      {                                                                          \
        if (resized)                                                             \
-       basevar = (type *)xrealloc (basevar, ga_newsize * sizeof (type));       \
+       basevar = xrealloc (basevar, ga_newsize * sizeof (type));               \
        else                                                                     \
         {                                                                       \
           void *ga_new = xmalloc (ga_newsize * sizeof (type));                  \
           memcpy (ga_new, basevar, (sizevar) * sizeof (type));                  \
           (basevar) = ga_new;                                                   \
-         resized = 1;                                                          \
+         resized = true;                                                       \
         }                                                                       \
        (sizevar) = ga_newsize;                                                  \
      }                                                                          \
@@ -360,17 +355,16 @@ enum {
       the ASCII range when copying the string.
  
     * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
-     of text.  */
+     of text, as well as embedded newlines.  */
  
  static void
  convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
  {
    int old_tail = pool->tail;
-  int size;
  
-  /* First, skip blanks if required.  We must do this before entities
-     are processed, so that blanks can still be inserted as, for
-     instance, `&#32;'.  */
+  /* Skip blanks if required.  We must do this before entities are
+     processed, so that blanks can still be inserted as, for instance,
+     `&#32;'.  */
    if (flags & AP_TRIM_BLANKS)
      {
        while (beg < end && ISSPACE (*beg))
@@ -378,7 +372,6 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
        while (end > beg && ISSPACE (end[-1]))
         --end;
      }
-  size = end - beg;
  
    if (flags & AP_DECODE_ENTITIES)
      {
@@ -391,15 +384,14 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
          never lengthen it.  */
        const char *from = beg;
        char *to;
+      bool squash_newlines = !!(flags & AP_TRIM_BLANKS);
  
        POOL_GROW (pool, end - beg);
        to = pool->contents + pool->tail;
  
        while (from < end)
         {
-         if (*from != '&')
-           *to++ = *from++;
-         else
+         if (*from == '&')
             {
               int entity = decode_entity (&from, end);
               if (entity != -1)
@@ -407,6 +399,10 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
               else
                 *to++ = *from++;
             }
+         else if ((*from == '\n' || *from == '\r') && squash_newlines)
+           ++from;
+         else
+           *to++ = *from++;
         }
        /* Verify that we haven't exceeded the original size.  (It
          shouldn't happen, hence the assert.)  */
@@ -683,15 +679,15 @@ find_comment_end (const char *beg, const char *end)
    return NULL;
  }
  \f
-/* Return non-zero of the string inside [b, e) are present in hash
-   table HT.  */
+/* Return true if the string consisting of the characters inside
+   [b, e) is present in hash table HT.  */
  
-static int
+static bool
  name_allowed (const struct hash_table *ht, const char *b, const char *e)
  {
    char *copy;
    if (!ht)
-    return 1;
+    return true;
    BOUNDED_TO_ALLOCA (b, e, copy);
    return hash_table_get (ht, copy) != NULL;
  }
@@ -729,17 +725,15 @@ static int tag_backout_count;
     MAPFUN will be called with two arguments: pointer to an initialized
     struct taginfo, and MAPARG.
  
-   ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to
-   be processed by this function.  If it is NULL, all the tags are
-   allowed.  The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES.
+   ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of
+   which are the tags and attribute names that this function should
+   use.  If ALLOWED_TAGS is NULL, all tags are processed; if
+   ALLOWED_ATTRIBUTES is NULL, all attributes are returned.
  
     (Obviously, the caller can filter out unwanted tags and attributes
     just as well, but this is just an optimization designed to avoid
-   unnecessary copying for tags/attributes which the caller doesn't
-   want to know about.  These lists are searched linearly; therefore,
-   if you're interested in a large number of tags or attributes, you'd
-   better set these to NULL and filter them out yourself with a
-   hashing process most appropriate for your application.)  */
+   unnecessary copying of tags/attributes which the caller doesn't
+   care about.)  */
  
  void
  map_html_tags (const char *text, int size,
@@ -758,7 +752,7 @@ map_html_tags (const char *text, int size,
  
    struct attr_pair attr_pair_initial_storage[8];
    int attr_pair_size = countof (attr_pair_initial_storage);
-  int attr_pair_resized = 0;
+  bool attr_pair_resized = false;
    struct attr_pair *pairs = attr_pair_initial_storage;
  
    if (!size)
@@ -770,7 +764,7 @@ map_html_tags (const char *text, int size,
      int nattrs, end_tag;
      const char *tag_name_begin, *tag_name_end;
      const char *tag_start_position;
-    int uninteresting_tag;
+    bool uninteresting_tag;
  
    look_for_tag:
      POOL_REWIND (&pool);
@@ -833,10 +827,10 @@ map_html_tags (const char *text, int size,
      if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
        /* We can't just say "goto look_for_tag" here because we need
           the loop below to properly advance over the tag's attributes.  */
-      uninteresting_tag = 1;
+      uninteresting_tag = true;
      else
        {
-       uninteresting_tag = 0;
+       uninteresting_tag = false;
         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
        }
  
@@ -895,7 +889,7 @@ map_html_tags (const char *text, int size,
             SKIP_WS (p);
             if (*p == '\"' || *p == '\'')
               {
-               int newline_seen = 0;
+               bool newline_seen = false;
                 char quote_char = *p;
                 attr_raw_value_begin = p;
                 ADVANCE (p);
@@ -913,7 +907,7 @@ map_html_tags (const char *text, int size,
                            comes first.  Such a tag terminated at `>'
                            is discarded.  */
                         p = attr_value_begin;
-                       newline_seen = 1;
+                       newline_seen = true;
                         continue;
                       }
                     else if (newline_seen && *p == '>')
@@ -1015,8 +1009,7 @@ map_html_tags (const char *text, int size,
        taginfo.attrs = pairs;
        taginfo.start_position = tag_start_position;
        taginfo.end_position   = p + 1;
-      /* Ta-dam! */
-      (*mapfun) (&taginfo, maparg);
+      mapfun (&taginfo, maparg);
        ADVANCE (p);
      }
      goto look_for_tag;
@@ -1057,7 +1050,7 @@ test_mapper (struct taginfo *taginfo, void *arg)
  int main ()
  {
    int size = 256;
-  char *x = (char *)xmalloc (size);
+  char *x = xmalloc (size);
    int length = 0;
    int read_count;
    int tag_counter = 0;
@@ -1066,7 +1059,7 @@ int main ()
      {
        length += read_count;
        size <<= 1;
-      x = (char *)xrealloc (x, size);
+      x = xrealloc (x, size);
      }
  
    map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);