[svn] Updated long_to_string(); enhanced opt.downloaded to use

[wget] / src / html.c
diff --git a/src/html.c b/src/html.c

index a27edac157bb96d9c1b36efa07a6a0dc1783e234..7d9905058d523c4f6bc8d4579d5607acb89ee1a0 100644 (file)
--- a/src/html.c
+++ b/src/html.c
@@ -52,15 +52,43 @@ struct tag_attr {
  static int
  idmatch (struct tag_attr *tags, const char *tag, const char *attr)
  {
-  int i;
-
-  if (!tag || !attr)
-    return 0;
-
+  int  i, j;  /* i walks TAGS; j walks the user's ignore/follow lists */
+  /* Return TRUE iff TAG/ATTR is in TAGS and the tag filters allow it. */
+  if (tag == NULL || attr == NULL)
+    return FALSE;  /* a missing tag or attribute never matches */
+  
    for (i = 0; tags[i].tag; i++)
+    /* Scan TAGS (terminated by a NULL tag) for this tag/attribute pair. */
      if (!strcasecmp (tags[i].tag, tag) && !strcasecmp (tags[i].attr, attr))
-      return 1;
-  return 0;
+      /* The tag and attribute matched one of the ones wget cares about. */
+      {
+       if (opt.ignore_tags)
+         /* --ignore-tags was specified.  Do not match these specific tags.
+            --ignore-tags takes precedence over --follow-tags, so we process
+            --ignore first and fall through if there's no match. */
+         for (j = 0; opt.ignore_tags[j] != NULL; j++)
+           /* Loop through all the tags this user doesn't care about. */
+           if (strcasecmp(opt.ignore_tags[j], tag) == EQ)  /* tag name only; EQ presumably 0 */
+             return FALSE;  /* explicitly ignored by the user */
+       
+       if (opt.follow_tags)
+         /* --follow-tags was specified.  Only match these specific tags, so
+            return FALSE if we don't match one of them. */
+         {
+           for (j = 0; opt.follow_tags[j] != NULL; j++)
+             /* Loop through all the tags this user cares about. */
+             if (strcasecmp(opt.follow_tags[j], tag) == EQ)  /* tag name only; ATTR not consulted */
+               return TRUE;
+           
+           return FALSE;  /* wasn't one of the explicitly desired tags */
+         }
+       
+       /* If we get to here, --follow-tags isn't being used, and --ignore-tags,
+          if specified, didn't include this tag, so it's okay to follow. */
+       return TRUE;
+      }
+
+  return FALSE;  /* not one of the tag/attribute pairs wget ever cares about */
  }
  
  /* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
@@ -69,15 +97,15 @@ idmatch (struct tag_attr *tags, const char *tag, const char *attr)
     address and the length of the string.  Return NULL if no URL is
     found.  */
  const char *
-htmlfindurl (const char *buf, int bufsize, int *size, int init)
+htmlfindurl (const char *buf, int bufsize, int *size, int init,
+            int dash_p_leaf_HTML)
  {
    const char *p, *ph;
-  state_t *s;
+  state_t    *s = &global_state;
+
    /* NULL-terminated list of tags and modifiers someone would want to
       follow -- feel free to edit to suit your needs: */
    static struct tag_attr html_allow[] = {
-    { "a", "href" },
-    { "link", "href" },
      { "script", "src" },
      { "img", "src" },
      { "img", "href" },
@@ -90,7 +118,6 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init)
      { "script", "src" },
      { "embed", "src" },
      { "bgsound", "src" },
-    { "area", "href" },
      { "img", "lowsrc" },
      { "input", "src" },
      { "layer", "src" },
@@ -98,13 +125,15 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init)
      { "th", "background"},
      { "td", "background"},
      /* Tags below this line are treated specially.  */
+    { "a", "href" },
+    { "area", "href" },
      { "base", "href" },
+    { "link", "href" },
+    { "link", "rel" },
      { "meta", "content" },
      { NULL, NULL }
    };
  
-  s = &global_state;
-
    if (init)
      {
        DEBUGP (("Resetting a parser state.\n"));
@@ -113,6 +142,10 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init)
  
    while (1)
      {
+      const char*  link_href = NULL;
+      const char*  link_rel = NULL;
+      int          link_href_saved_size = 0; /* init. just to shut up warning */
+
        if (!bufsize)
         break;
        /* Let's look for a tag, if we are not already in one.  */
@@ -211,7 +244,8 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init)
           /* Now we must skip the spaces to find '='.  */
           if (*buf != '=')
             {
-             for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
+             for (; bufsize && ISSPACE (*buf) && *buf != '>';
+                  ++buf, --bufsize);
               if (!bufsize || *buf == '>')
                 break;
             }
@@ -235,7 +269,7 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init)
               for (++buf, --bufsize;
                    bufsize && *buf != s->quote_char && *buf != '\n';
                    ++buf, --bufsize)
-               if (*buf == '#')
+               if (!ph && *buf == '#' && *(buf - 1) != '&')
                   ph = buf;
               if (!bufsize)
                 {
@@ -257,8 +291,9 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init)
           else
             {
               p = buf;
-             for (; bufsize && !ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize)
-               if (*buf == '#')
+             for (; bufsize && !ISSPACE (*buf) && *buf != '>';
+                  ++buf, --bufsize)
+               if (!ph && *buf == '#' && *(buf - 1) != '&')
                   ph = buf;
               if (!bufsize)
                 break;
@@ -271,12 +306,72 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init)
              2) its tag and attribute are found in html_allow.  */
           if (*size && idmatch (html_allow, s->tag, s->attr))
             {
-             if (!strcasecmp (s->tag, "base") && !strcasecmp (s->attr, "href"))
+             if (strcasecmp(s->tag, "a") == EQ ||
+                 strcasecmp(s->tag, "area") == EQ)
+               {
+                 /* Only follow these if we're not at a -p leaf node, as they
+                    always link to external documents. */
+                 if (!dash_p_leaf_HTML)
+                   {
+                     s->at_value = 1;
+                     return p;
+                   }
+               }
+             else if (!strcasecmp (s->tag, "base") &&
+                      !strcasecmp (s->attr, "href"))
                 {
                   FREE_MAYBE (s->base);
                   s->base = strdupdelim (p, buf);
                 }
-             else if (!strcasecmp (s->tag, "meta") && !strcasecmp (s->attr, "content"))
+             else if (strcasecmp(s->tag, "link") == EQ)
+               {
+                 if (strcasecmp(s->attr, "href") == EQ)
+                   {
+                     link_href = p;
+                     link_href_saved_size = *size;  /* for restoration below */
+                   }
+                 else if (strcasecmp(s->attr, "rel") == EQ)
+                   link_rel = p;
+
+                 if (link_href != NULL && link_rel != NULL)
+                   /* Okay, we've now seen this <LINK> tag's HREF and REL
+                      attributes (they may be in either order), so it's now
+                      possible to decide if we want to traverse it. */
+                   if (!dash_p_leaf_HTML ||
+                       strncasecmp(link_rel, "stylesheet",
+                                   sizeof("stylesheet") - 1) == EQ)
+                     /* In the normal case, all <LINK> tags are fair game.
+                        
+                        In the special case of when -p is active, however, and
+                        we're at a leaf node (relative to the -l max. depth) in
+                        the HTML document tree, the only <LINK> tag we'll
+                        follow is a <LINK REL="stylesheet">, as it's necessary
+                        for displaying this document properly.  We won't follow
+                        other <LINK> tags, like <LINK REL="home">, for
+                        instance, as they refer to external documents.
+                        
+                        Note that the above strncasecmp() will incorrectly
+                        consider something like '<LINK REL="stylesheet.old"' as
+                        equivalent to '<LINK REL="stylesheet"'.  Not really
+                        worth the trouble to explicitly check for such cases --
+                        if time is spent, it should be spent ripping out wget's
+                        somewhat kludgy HTML parser and hooking in a real,
+                        componentized one. */
+                     {
+                       /* When we return, the 'size' IN/OUT parameter
+                          determines where in the buffer the end of the current
+                          attribute value is.  If REL came after HREF in this
+                          <LINK> tag, size is currently set to the size for
+                          REL's value -- set it to what it was when we were
+                          looking at HREF's value. */
+                       *size = link_href_saved_size;
+                       
+                       s->at_value = 1;
+                       return link_href;
+                     }
+               }
+             else if (!strcasecmp (s->tag, "meta") &&
+                      !strcasecmp (s->attr, "content"))
                 {
                   /* Some pages use a META tag to specify that the page
                      be refreshed by a new page after a given number of
@@ -294,7 +389,9 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init)
                   for (; *size && ISDIGIT (*p); p++, *size -= 1);
                   if (*p == ';')
                     {
-                     for (p++, *size -= 1; *size && ISSPACE (*p); p++, *size -= 1) ;
+                     for (p++, *size -= 1;
+                          *size && ISSPACE (*p);
+                          p++, *size -= 1) ;
                       if (!strncasecmp (p, "URL=", 4))
                         {
                           p += 4, *size -= 4;
@@ -339,6 +436,83 @@ html_base (void)
    return global_state.base;
  }
  
+/* Create a malloc'ed copy of text in the range [beg, end), but with
+   the HTML entities processed.  Recognized entities are &lt, &gt,
+   &amp, &quot, &nbsp and the numerical entities (taken modulo 256).
+   Other `&' are copied as-is; bytes at or past END are never read.  */
+
+char *
+html_decode_entities (const char *beg, const char *end)
+{
+  char *newstr = (char *)xmalloc (end - beg + 1); /* assume worst-case. */
+  const char *from = beg;
+  char *to = newstr;
+
+  while (from < end)
+    {
+      if (*from != '&')
+       *to++ = *from++;
+      else
+       {
+         const char *save = from;
+         int remain;
+
+         if (++from == end) goto lose;
+         remain = end - from;
+
+         if (*from == '#')
+           {
+             int numeric;
+             if (++from == end || !ISDIGIT (*from)) goto lose;
+             /* Reduce mod 256 per digit so NUMERIC cannot overflow.  */
+             for (numeric = 0; from < end && ISDIGIT (*from); from++)
+               numeric = (10 * numeric + (*from - '0')) & 0xff;
+             if (from < end && ISALPHA (*from)) goto lose;
+             *to++ = numeric;
+           }
+         /* True when LITERAL at FROM is followed by EOB, `;' or a non-alnum;
+            EOB is tested first so the byte at END is never dereferenced.  */
+#define FROB(literal) (remain >= (sizeof (literal) - 1)                        \
+                && !memcmp (from, literal, sizeof (literal) - 1)       \
+                && (remain == sizeof (literal) - 1                     \
+                    || *(from + sizeof (literal) - 1) == ';'           \
+                    || !ISALNUM (*(from + sizeof (literal) - 1))))
+         else if (FROB ("lt"))
+           *to++ = '<', from += 2;
+         else if (FROB ("gt"))
+           *to++ = '>', from += 2;
+         else if (FROB ("amp"))
+           *to++ = '&', from += 3;
+         else if (FROB ("quot"))
+           *to++ = '\"', from += 4;
+         /* We don't implement the "Added Latin 1" entities proposed
+            by rfc1866 (except for nbsp), because it is unnecessary
+            in the context of Wget, and would require hashing to work
+            efficiently.  */
+         else if (FROB ("nbsp"))
+           *to++ = 160, from += 4;
+         else
+           goto lose;
+#undef FROB
+         /* If the entity was followed by `;', we step over the `;'.
+            Otherwise, it was followed by either a non-alphanumeric
+            or EOB, in which case we do nothing.  */
+         if (from < end && *from == ';')
+           ++from;
+         continue;
+
+       lose:
+         /* This was not an entity after all.  Back out.  */
+         from = save;
+         *to++ = *from++;
+       }
+    }
+  *to++ = '\0';
+  /* #### Should we shrink the buffer to its final size here, i.e.
+     newstr = xrealloc (newstr, to - newstr)?  */
+  return newstr;
+}
+
  /* The function returns the pointer to the malloc-ed quoted version of
     string s.  It will recognize and quote numeric and special graphic
     entities, as per RFC1866: