Ted Mielczarek's CSS wonder-patch, applied against the source from around the time...

[wget] / src / html-parse.c
diff --git a/src/html-parse.c b/src/html-parse.c

index ea63254688059e74ef245b5f8202c26dd94ec101..8254c6dc15d416d42909aec750028b697bf0557a 100644 (file)
--- a/src/html-parse.c
+++ b/src/html-parse.c
@@ -1,5 +1,5 @@
  /* HTML parser for Wget.
-   Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc.
+   Copyright (C) 1998-2006 Free Software Foundation, Inc.
  
  This file is part of GNU Wget.
  
@@ -14,8 +14,8 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+along with Wget; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  
  In addition, as a special exception, the Free Software Foundation
  gives permission to link the code of its release of Wget with the
@@ -96,11 +96,7 @@ so, delete this exception statement from your version.  */
  
  #include <stdio.h>
  #include <stdlib.h>
-#ifdef HAVE_STRING_H
-# include <string.h>
-#else
-# include <strings.h>
-#endif
+#include <string.h>
  #include <assert.h>
  
  #include "wget.h"
@@ -157,7 +153,7 @@ struct pool {
    char *contents;              /* pointer to the contents. */
    int size;                    /* size of the pool. */
    int tail;                    /* next available position index. */
-  int resized;                 /* whether the pool has been resized
+  bool resized;                        /* whether the pool has been resized
                                    using malloc. */
  
    char *orig_contents;         /* original pool contents, usually
@@ -174,7 +170,7 @@ struct pool {
    P->contents = (initial_storage);                             \
    P->size = (initial_size);                                    \
    P->tail = 0;                                                 \
-  P->resized = 0;                                              \
+  P->resized = false;                                          \
    P->orig_contents = P->contents;                              \
    P->orig_size = P->size;                                      \
  } while (0)
@@ -222,7 +218,7 @@ struct pool {
    P->contents = P->orig_contents;              \
    P->size = P->orig_size;                      \
    P->tail = 0;                                 \
-  P->resized = 0;                              \
+  P->resized = false;                          \
  } while (0)
  
  /* Used for small stack-allocated memory chunks that might grow.  Like
@@ -245,13 +241,13 @@ struct pool {
    if (ga_newsize != (sizevar))                                                 \
      {                                                                          \
        if (resized)                                                             \
-       basevar = (type *)xrealloc (basevar, ga_newsize * sizeof (type));       \
+       basevar = xrealloc (basevar, ga_newsize * sizeof (type));               \
        else                                                                     \
         {                                                                       \
           void *ga_new = xmalloc (ga_newsize * sizeof (type));                  \
           memcpy (ga_new, basevar, (sizevar) * sizeof (type));                  \
           (basevar) = ga_new;                                                   \
-         resized = 1;                                                          \
+         resized = true;                                                       \
         }                                                                       \
        (sizevar) = ga_newsize;                                                  \
      }                                                                          \
@@ -275,6 +271,94 @@ struct pool {
     to "<foo", but "&lt,foo" to "<,foo".  */
  #define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)
  
+struct tagstack_item {            /* one start tag pushed by map_html_tags */
+  const char *tagname_begin;      /* first character of the tag name */
+  const char *tagname_end;        /* one past the last character of the name */
+  const char *contents_begin;     /* NULL until set to p+1 after the tag's attributes */
+  struct tagstack_item *prev;     /* neighbour toward the head; NULL at the head */
+  struct tagstack_item *next;     /* neighbour toward the tail; NULL at the tail */
+};
+
+struct tagstack_item *   /* allocate an item, link it in as the new *TAIL, return it */
+tagstack_push (struct tagstack_item **head, struct tagstack_item **tail)
+{
+  struct tagstack_item *ts = xmalloc(sizeof(struct tagstack_item));
+  if (*head == NULL)
+    {                           /* empty stack: TS becomes both ends */
+      *head = *tail = ts;
+      ts->prev = ts->next = NULL;
+    }
+  else
+    {                           /* otherwise append after the current tail */
+      (*tail)->next = ts;
+      ts->prev = *tail;
+      *tail = ts;
+      ts->next = NULL;
+    }
+
+  return ts;                    /* only prev/next are set; caller fills in the rest */
+}
+
+/* Unlink and free TS and every item after it; *HEAD and *TAIL are updated.  */
+void
+tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail,
+              struct tagstack_item *ts)
+{
+  if (*head == NULL)            /* empty stack: nothing to do */
+    return;
+
+  if (ts == *tail)              /* TS is the last item: free just that one */
+    {
+      if (ts == *head)
+        {                       /* ...and the only item: stack becomes empty */
+          xfree (ts);
+          *head = *tail = NULL;
+        }
+      else
+        {
+          ts->prev->next = NULL;
+          *tail = ts->prev;
+          xfree (ts);
+        }
+    }
+  else                          /* TS has successors: cut the list before TS */
+    {
+      if (ts == *head)
+        {
+          *head = NULL;
+        }
+      *tail = ts->prev;         /* NULL when TS was the head */
+
+      if (ts->prev)
+        {
+          ts->prev->next = NULL;
+        }
+      while (ts)                /* free TS and the rest of the chain */
+        {
+          struct tagstack_item *p = ts->next;
+          xfree (ts);
+          ts = p;
+        }
+    }
+}
+
+struct tagstack_item *   /* item nearest TAIL whose name matches, or NULL if none */
+tagstack_find (struct tagstack_item *tail, const char *tagname_begin,
+               const char *tagname_end)
+{
+  int len = tagname_end - tagname_begin;
+  while (tail)                  /* walk from TAIL back toward the head */
+    {
+      if (len == (tail->tagname_end - tail->tagname_begin))
+        {                       /* same length: compare names ignoring case */
+          if (0 == strncasecmp (tail->tagname_begin, tagname_begin, len))
+            return tail;
+        }
+      tail = tail->prev;
+    }
+  return NULL;
+}
+
  /* Decode the HTML character entity at *PTR, considering END to be end
     of buffer.  It is assumed that the "&" character that marks the
     beginning of the entity has been seen at *PTR-1.  If a recognized
@@ -340,6 +424,8 @@ decode_entity (const char **ptr, const char *end)
  #undef ENT1
  #undef ENT2
  #undef ENT3
+#undef FITS
+#undef SKIP_SEMI
  
  enum {
    AP_DOWNCASE          = 1,
@@ -358,17 +444,16 @@ enum {
       the ASCII range when copying the string.
  
     * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
-     of text.  */
+     of text, as well as embedded newlines.  */
  
  static void
  convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
  {
    int old_tail = pool->tail;
-  int size;
  
-  /* First, skip blanks if required.  We must do this before entities
-     are processed, so that blanks can still be inserted as, for
-     instance, `&#32;'.  */
+  /* Skip blanks if required.  We must do this before entities are
+     processed, so that blanks can still be inserted as, for instance,
+     `&#32;'.  */
    if (flags & AP_TRIM_BLANKS)
      {
        while (beg < end && ISSPACE (*beg))
@@ -376,7 +461,6 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
        while (end > beg && ISSPACE (end[-1]))
         --end;
      }
-  size = end - beg;
  
    if (flags & AP_DECODE_ENTITIES)
      {
@@ -389,15 +473,14 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
          never lengthen it.  */
        const char *from = beg;
        char *to;
+      bool squash_newlines = !!(flags & AP_TRIM_BLANKS);
  
        POOL_GROW (pool, end - beg);
        to = pool->contents + pool->tail;
  
        while (from < end)
         {
-         if (*from != '&')
-           *to++ = *from++;
-         else
+         if (*from == '&')
             {
               int entity = decode_entity (&from, end);
               if (entity != -1)
@@ -405,6 +488,10 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
               else
                 *to++ = *from++;
             }
+         else if ((*from == '\n' || *from == '\r') && squash_newlines)
+           ++from;
+         else
+           *to++ = *from++;
         }
        /* Verify that we haven't exceeded the original size.  (It
          shouldn't happen, hence the assert.)  */
@@ -681,15 +768,15 @@ find_comment_end (const char *beg, const char *end)
    return NULL;
  }
  \f
-/* Return non-zero of the string inside [b, e) are present in hash
-   table HT.  */
+/* Return true if the string consisting of characters inside [b, e) is
+   present in hash table HT.  */
  
-static int
+static bool
  name_allowed (const struct hash_table *ht, const char *b, const char *e)
  {
    char *copy;
    if (!ht)
-    return 1;
+    return true;
    BOUNDED_TO_ALLOCA (b, e, copy);
    return hash_table_get (ht, copy) != NULL;
  }
@@ -727,17 +814,15 @@ static int tag_backout_count;
     MAPFUN will be called with two arguments: pointer to an initialized
     struct taginfo, and MAPARG.
  
-   ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to
-   be processed by this function.  If it is NULL, all the tags are
-   allowed.  The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES.
+   ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of
+   which are the tags and attribute names that this function should
+   use.  If ALLOWED_TAGS is NULL, all tags are processed; if
+   ALLOWED_ATTRIBUTES is NULL, all attributes are returned.
  
     (Obviously, the caller can filter out unwanted tags and attributes
     just as well, but this is just an optimization designed to avoid
-   unnecessary copying for tags/attributes which the caller doesn't
-   want to know about.  These lists are searched linearly; therefore,
-   if you're interested in a large number of tags or attributes, you'd
-   better set these to NULL and filter them out yourself with a
-   hashing process most appropriate for your application.)  */
+   unnecessary copying of tags/attributes which the caller doesn't
+   care about.)  */
  
  void
  map_html_tags (const char *text, int size,
@@ -756,9 +841,12 @@ map_html_tags (const char *text, int size,
  
    struct attr_pair attr_pair_initial_storage[8];
    int attr_pair_size = countof (attr_pair_initial_storage);
-  int attr_pair_resized = 0;
+  bool attr_pair_resized = false;
    struct attr_pair *pairs = attr_pair_initial_storage;
  
+  struct tagstack_item *head = NULL;
+  struct tagstack_item *tail = NULL;
+
    if (!size)
      return;
  
@@ -768,7 +856,7 @@ map_html_tags (const char *text, int size,
      int nattrs, end_tag;
      const char *tag_name_begin, *tag_name_end;
      const char *tag_start_position;
-    int uninteresting_tag;
+    bool uninteresting_tag;
  
    look_for_tag:
      POOL_REWIND (&pool);
@@ -825,16 +913,28 @@ map_html_tags (const char *text, int size,
        goto look_for_tag;
      tag_name_end = p;
      SKIP_WS (p);
+
+    if (!end_tag)
+      {
+        struct tagstack_item *ts = tagstack_push (&head, &tail);
+        if (ts)
+          {
+            ts->tagname_begin  = tag_name_begin;
+            ts->tagname_end    = tag_name_end;
+            ts->contents_begin = NULL;
+          }
+      }
+
      if (end_tag && *p != '>')
        goto backout_tag;
  
      if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
        /* We can't just say "goto look_for_tag" here because we need
           the loop below to properly advance over the tag's attributes.  */
-      uninteresting_tag = 1;
+      uninteresting_tag = true;
      else
        {
-       uninteresting_tag = 0;
+       uninteresting_tag = false;
         convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
        }
  
@@ -893,7 +993,7 @@ map_html_tags (const char *text, int size,
             SKIP_WS (p);
             if (*p == '\"' || *p == '\'')
               {
-               int newline_seen = 0;
+               bool newline_seen = false;
                 char quote_char = *p;
                 attr_raw_value_begin = p;
                 ADVANCE (p);
@@ -911,7 +1011,7 @@ map_html_tags (const char *text, int size,
                            comes first.  Such a tag terminated at `>'
                            is discarded.  */
                         p = attr_value_begin;
-                       newline_seen = 1;
+                       newline_seen = true;
                         continue;
                       }
                     else if (newline_seen && *p == '>')
@@ -986,6 +1086,11 @@ map_html_tags (const char *text, int size,
         ++nattrs;
        }
  
+    if (!end_tag && tail && (tail->tagname_begin == tag_name_begin))
+      {
+        tail->contents_begin = p+1;
+      }
+
      if (uninteresting_tag)
        {
         ADVANCE (p);
@@ -997,6 +1102,7 @@ map_html_tags (const char *text, int size,
      {
        int i;
        struct taginfo taginfo;
+      struct tagstack_item *ts = NULL;
  
        taginfo.name      = pool.contents;
        taginfo.end_tag_p = end_tag;
@@ -1013,8 +1119,24 @@ map_html_tags (const char *text, int size,
        taginfo.attrs = pairs;
        taginfo.start_position = tag_start_position;
        taginfo.end_position   = p + 1;
-      /* Ta-dam! */
-      (*mapfun) (&taginfo, maparg);
+      taginfo.contents_begin = NULL;
+      taginfo.contents_end = NULL;
+
+      if (end_tag)
+        {
+          ts = tagstack_find (tail, tag_name_begin, tag_name_end);
+          if (ts)
+            {
+              if (ts->contents_begin)
+                {
+                  taginfo.contents_begin = ts->contents_begin;
+                  taginfo.contents_end   = tag_start_position;
+                }
+              tagstack_pop (&head, &tail, ts);
+            }
+        }
+
+      mapfun (&taginfo, maparg);
        ADVANCE (p);
      }
      goto look_for_tag;
@@ -1033,6 +1155,8 @@ map_html_tags (const char *text, int size,
    POOL_FREE (&pool);
    if (attr_pair_resized)
      xfree (pairs);
+  /* Free any items still left on the tag stack.  */
+  tagstack_pop (&head, &tail, head);
  }
  
  #undef ADVANCE
@@ -1055,7 +1179,7 @@ test_mapper (struct taginfo *taginfo, void *arg)
  int main ()
  {
    int size = 256;
-  char *x = (char *)xmalloc (size);
+  char *x = xmalloc (size);
    int length = 0;
    int read_count;
    int tag_counter = 0;
@@ -1064,7 +1188,7 @@ int main ()
      {
        length += read_count;
        size <<= 1;
-      x = (char *)xrealloc (x, size);
+      x = xrealloc (x, size);
      }
  
    map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);