Fix build when libpsl is not available

[wget] / src / html-parse.c
diff --git a/src/html-parse.c b/src/html-parse.c

index 2e7465a42c9b437e364d55dcfb77aa3d71d3c712..20791cd83450272210c17a129ecd6904aec86231 100644 (file)
--- a/src/html-parse.c
+++ b/src/html-parse.c
@@ -1,6 +1,6 @@
  /* HTML parser for Wget.
     Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
-   2007 Free Software Foundation, Inc.
+   2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
  
  This file is part of GNU Wget.
  
@@ -89,7 +89,7 @@ as that of the covered work.  */
  /* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
     still need Wget headers to compile.  */
  
-#include <config.h>
+#include "wget.h"
  
  #ifdef STANDALONE
  # define I_REALLY_WANT_CTYPE_MACROS
@@ -100,7 +100,7 @@ as that of the covered work.  */
  #include <string.h>
  #include <assert.h>
  
-#include "wget.h"
+#include "utils.h"
  #include "html-parse.h"
  
  #ifdef STANDALONE
@@ -111,21 +111,21 @@ as that of the covered work.  */
  # define xrealloc realloc
  # define xfree free
  
-# undef ISSPACE
-# undef ISDIGIT
-# undef ISXDIGIT
-# undef ISALPHA
-# undef ISALNUM
-# undef TOLOWER
-# undef TOUPPER
-
-# define ISSPACE(x) isspace (x)
-# define ISDIGIT(x) isdigit (x)
-# define ISXDIGIT(x) isxdigit (x)
-# define ISALPHA(x) isalpha (x)
-# define ISALNUM(x) isalnum (x)
-# define TOLOWER(x) tolower (x)
-# define TOUPPER(x) toupper (x)
+# undef c_isspace
+# undef c_isdigit
+# undef c_isxdigit
+# undef c_isalpha
+# undef c_isalnum
+# undef c_tolower
+# undef c_toupper
+
+# define c_isspace(x) isspace (x)
+# define c_isdigit(x) isdigit (x)
+# define c_isxdigit(x) isxdigit (x)
+# define c_isalpha(x) isalpha (x)
+# define c_isalnum(x) isalnum (x)
+# define c_tolower(x) tolower (x)
+# define c_toupper(x) toupper (x)
  
  struct hash_table {
    int dummy;
@@ -259,7 +259,7 @@ struct pool {
     However, "&lt;foo" will work, as will "&lt!foo", "&lt", etc.  In
     other words an entity needs to be terminated by either a
     non-alphanumeric or the end of string.  */
-#define FITS(p, n) (p + n == end || (p + n < end && !ISALNUM (p[n])))
+#define FITS(p, n) (p + n == end || (p + n < end && !c_isalnum (p[n])))
  
  /* Macros that test entity names by returning true if P is followed by
     the specified characters.  */
@@ -272,6 +272,94 @@ struct pool {
     to "<foo", but "&lt,foo" to "<,foo".  */
  #define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)
  
+struct tagstack_item {          /* one entry of the doubly linked tag stack */
+  const char *tagname_begin;    /* first character of the tag name */
+  const char *tagname_end;      /* one past the last character of the name */
+  const char *contents_begin;   /* start of the tag's contents, or NULL */
+  struct tagstack_item *prev;   /* entry pushed before this one, or NULL */
+  struct tagstack_item *next;   /* entry pushed after this one, or NULL */
+};
+
+/* Allocate a new item and append it at the tail of the stack described
+   by *HEAD and *TAIL, updating both as needed.  The item's name and
+   contents pointers start out NULL; the caller fills them in.
+
+   Returns the new item, or NULL if the allocation failed.  xmalloc is
+   presumably plain malloc in STANDALONE builds (compare the xrealloc
+   and xfree defines) -- confirm; the caller already checks for NULL.  */
+static struct tagstack_item *
+tagstack_push (struct tagstack_item **head, struct tagstack_item **tail)
+{
+  struct tagstack_item *ts = xmalloc (sizeof *ts);
+
+  if (ts == NULL)
+    return NULL;
+
+  ts->tagname_begin = NULL;
+  ts->tagname_end = NULL;
+  ts->contents_begin = NULL;
+  ts->next = NULL;
+
+  if (*head == NULL)
+    {
+      /* Empty stack: the new item is both the head and the tail.  */
+      ts->prev = NULL;
+      *head = ts;
+    }
+  else
+    {
+      ts->prev = *tail;
+      (*tail)->next = ts;
+    }
+  *tail = ts;
+
+  return ts;
+}
+
+/* Remove TS and every item pushed after it from the stack described by
+   *HEAD and *TAIL, freeing each removed item.  Does nothing when the
+   stack is empty.  TS must not be NULL when the stack is non-empty.  */
+static void
+tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail,
+              struct tagstack_item *ts)
+{
+  struct tagstack_item *below;
+
+  if (*head == NULL)
+    return;
+
+  /* The item underneath TS, if there is one, becomes the new tail.  */
+  below = ts->prev;
+  if (ts == *head)
+    *head = NULL;
+  *tail = below;
+  if (below)
+    below->next = NULL;
+
+  /* Release TS and everything that was pushed after it.  */
+  while (ts)
+    {
+      struct tagstack_item *above = ts->next;
+      xfree (ts);
+      ts = above;
+    }
+}
+
+/* Walk the stack from TAIL back through the prev links looking for an
+   item whose name has the same length as [TAGNAME_BEGIN, TAGNAME_END)
+   and compares equal to it under strncasecmp.  Returns the first such
+   item met, i.e. the one closest to TAIL, or NULL if there is none.  */
+static struct tagstack_item *
+tagstack_find (struct tagstack_item *tail, const char *tagname_begin,
+               const char *tagname_end)
+{
+  struct tagstack_item *item;
+  int len = tagname_end - tagname_begin;
+
+  for (item = tail; item != NULL; item = item->prev)
+    {
+      /* Names of different length can never match; skip the compare.  */
+      if ((item->tagname_end - item->tagname_begin) != len)
+        continue;
+      if (strncasecmp (item->tagname_begin, tagname_begin, len) == 0)
+        return item;
+    }
+
+  return NULL;
+}
+
  /* Decode the HTML character entity at *PTR, considering END to be end
     of buffer.  It is assumed that the "&" character that marks the
     beginning of the entity has been seen at *PTR-1.  If a recognized
@@ -297,10 +385,10 @@ decode_entity (const char **ptr, const char *end)
          int digits = 0;
          value = 0;
          if (*p == 'x')
-          for (++p; value < 256 && p < end && ISXDIGIT (*p); p++, digits++)
+          for (++p; value < 256 && p < end && c_isxdigit (*p); p++, digits++)
              value = (value << 4) + XDIGIT_TO_NUM (*p);
          else
-          for (; value < 256 && p < end && ISDIGIT (*p); p++, digits++)
+          for (; value < 256 && p < end && c_isdigit (*p); p++, digits++)
              value = (value * 10) + (*p - '0');
          if (!digits)
            return -1;
@@ -369,9 +457,9 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
       `&#32;'.  */
    if (flags & AP_TRIM_BLANKS)
      {
-      while (beg < end && ISSPACE (*beg))
+      while (beg < end && c_isspace (*beg))
          ++beg;
-      while (end > beg && ISSPACE (end[-1]))
+      while (end > beg && c_isspace (end[-1]))
          --end;
      }
  
@@ -426,7 +514,7 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
      {
        char *p = pool->contents + old_tail;
        for (; *p; p++)
-        *p = TOLOWER (*p);
+        *p = c_tolower (*p);
      }
  }
  \f
@@ -440,13 +528,14 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
       * whitespace
       * 8-bit and control chars
       * characters that clearly cannot be part of name:
-       '=', '>', '/'.
+       '=', '<', '>', '/'.
  
     This only affects attribute and tag names; attribute values allow
     an even greater variety of characters.  */
  
  #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
-                        && (x) != '=' && (x) != '>' && (x) != '/')
+                        && (x) != '=' && (x) != '<' && (x) != '>'       \
+                        && (x) != '/')
  
  #ifdef STANDALONE
  static int comment_backout_count;
@@ -531,6 +620,7 @@ advance_declaration (const char *beg, const char *end)
              case '\n':
                ch = *p++;
                break;
+            case '<':
              case '>':
                state = AC_S_DONE;
                break;
@@ -706,7 +796,7 @@ name_allowed (const struct hash_table *ht, const char *b, const char *e)
  /* Skip whitespace, if any. */
  
  #define SKIP_WS(p) do {                         \
-  while (ISSPACE (*p)) {                        \
+  while (c_isspace (*p)) {                        \
      ADVANCE (p);                                \
    }                                             \
  } while (0)
@@ -714,7 +804,7 @@ name_allowed (const struct hash_table *ht, const char *b, const char *e)
  /* Skip non-whitespace, if any. */
  
  #define SKIP_NON_WS(p) do {                     \
-  while (!ISSPACE (*p)) {                       \
+  while (!c_isspace (*p)) {                       \
      ADVANCE (p);                                \
    }                                             \
  } while (0)
@@ -757,6 +847,9 @@ map_html_tags (const char *text, int size,
    bool attr_pair_resized = false;
    struct attr_pair *pairs = attr_pair_initial_storage;
  
+  struct tagstack_item *head = NULL;
+  struct tagstack_item *tail = NULL;
+
    if (!size)
      return;
  
@@ -823,7 +916,19 @@ map_html_tags (const char *text, int size,
        goto look_for_tag;
      tag_name_end = p;
      SKIP_WS (p);
-    if (end_tag && *p != '>')
+
+    if (!end_tag)
+      {
+        struct tagstack_item *ts = tagstack_push (&head, &tail);
+        if (ts)
+          {
+            ts->tagname_begin  = tag_name_begin;
+            ts->tagname_end    = tag_name_end;
+            ts->contents_begin = NULL;
+          }
+      }
+
+    if (end_tag && *p != '>' && *p != '<')
        goto backout_tag;
  
      if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
@@ -855,12 +960,12 @@ map_html_tags (const char *text, int size,
              /*              ^  */
              ADVANCE (p);
              SKIP_WS (p);
-            if (*p != '>')
+            if (*p != '<' && *p != '>')
                goto backout_tag;
            }
  
          /* Check for end of tag definition. */
-        if (*p == '>')
+        if (*p == '<' || *p == '>')
            break;
  
          /* Establish bounds of attribute name. */
@@ -875,7 +980,8 @@ map_html_tags (const char *text, int size,
  
          /* Establish bounds of attribute value. */
          SKIP_WS (p);
-        if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
+
+        if (NAME_CHAR_P (*p) || *p == '/' || *p == '<' || *p == '>')
            {
              /* Minimized attribute syntax allows `=' to be omitted.
                 For example, <UL COMPACT> is a valid shorthand for <UL
@@ -912,7 +1018,7 @@ map_html_tags (const char *text, int size,
                          newline_seen = true;
                          continue;
                        }
-                    else if (newline_seen && *p == '>')
+                    else if (newline_seen && (*p == '<' || *p == '>'))
                        break;
                      ADVANCE (p);
                    }
@@ -937,7 +1043,7 @@ map_html_tags (const char *text, int size,
                     violated by, for instance, `%' in `width=75%'.
                     We'll be liberal and allow just about anything as
                     an attribute value.  */
-                while (!ISSPACE (*p) && *p != '>')
+                while (!c_isspace (*p) && *p != '<' && *p != '>')
                    ADVANCE (p);
                  attr_value_end = p; /* <foo bar=baz qux=quix> */
                                      /*             ^          */
@@ -984,6 +1090,11 @@ map_html_tags (const char *text, int size,
          ++nattrs;
        }
  
+    if (!end_tag && tail && (tail->tagname_begin == tag_name_begin))
+      {
+        tail->contents_begin = p+1;
+      }
+
      if (uninteresting_tag)
        {
          ADVANCE (p);
@@ -995,6 +1106,7 @@ map_html_tags (const char *text, int size,
      {
        int i;
        struct taginfo taginfo;
+      struct tagstack_item *ts = NULL;
  
        taginfo.name      = pool.contents;
        taginfo.end_tag_p = end_tag;
@@ -1011,8 +1123,26 @@ map_html_tags (const char *text, int size,
        taginfo.attrs = pairs;
        taginfo.start_position = tag_start_position;
        taginfo.end_position   = p + 1;
+      taginfo.contents_begin = NULL;
+      taginfo.contents_end = NULL;
+
+      if (end_tag)
+        {
+          ts = tagstack_find (tail, tag_name_begin, tag_name_end);
+          if (ts)
+            {
+              if (ts->contents_begin)
+                {
+                  taginfo.contents_begin = ts->contents_begin;
+                  taginfo.contents_end   = tag_start_position;
+                }
+              tagstack_pop (&head, &tail, ts);
+            }
+        }
+
        mapfun (&taginfo, maparg);
-      ADVANCE (p);
+      if (*p != '<')
+        ADVANCE (p);
      }
      goto look_for_tag;
  
@@ -1030,6 +1160,8 @@ map_html_tags (const char *text, int size,
    POOL_FREE (&pool);
    if (attr_pair_resized)
      xfree (pairs);
+  /* pop any tag stack that's left */
+  tagstack_pop (&head, &tail, head);
  }
  
  #undef ADVANCE