[svn] Committed my patch from <sxsy9z4xz5m.fsf@florida.arsdigita.de>

author hniksic <devnull@localhost>

Wed, 1 Nov 2000 01:25:12 +0000 (17:25 -0800)

committer hniksic <devnull@localhost>

Wed, 1 Nov 2000 01:25:12 +0000 (17:25 -0800)
author hniksic <devnull@localhost>
Wed, 1 Nov 2000 01:25:12 +0000 (17:25 -0800)
committer hniksic <devnull@localhost>
Wed, 1 Nov 2000 01:25:12 +0000 (17:25 -0800)
diff --git a/src/ChangeLog b/src/ChangeLog

index 88c4501360025f1abeba3a35630416ea97170ff0..f712bacc78de6b46b1abe385aff299693508b7e6 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,12 @@
+2000-11-01  Hrvoje Niksic  <hniksic@arsdigita.com>
+
+       * url.c (get_urls_html): Decode HTML entities using
+       html_decode_entities.
+
+       * html.c (htmlfindurl): Don't count the `#' in numeric entities
+       (&#NNN;) as an HTML fragment.
+       (html_decode_entities): New function.
+
  2000-11-01  Hrvoje Niksic  <hniksic@arsdigita.com>
  
         * html.c (htmlfindurl): Fix recognition of # HTML fragments.
diff --git a/src/html.c b/src/html.c

index ace0e31b8dcf00ef4b2bbf8a383f64727b55dd19..7d9905058d523c4f6bc8d4579d5607acb89ee1a0 100644 (file)
--- a/src/html.c
+++ b/src/html.c
@@ -91,7 +91,6 @@ idmatch (struct tag_attr *tags, const char *tag, const char *attr)
    return FALSE;  /* not one of the tag/attribute pairs wget ever cares about */
  }
  
-
  /* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
     describing URLs to follow.  When a tag is encountered, extract its
     components (as described by html_allow[] array), and return the
@@ -270,7 +269,7 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init,
               for (++buf, --bufsize;
                    bufsize && *buf != s->quote_char && *buf != '\n';
                    ++buf, --bufsize)
-               if (!ph && *buf == '#')
+               if (!ph && *buf == '#' && *(buf - 1) != '&')
                   ph = buf;
               if (!bufsize)
                 {
@@ -294,7 +293,7 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init,
               p = buf;
               for (; bufsize && !ISSPACE (*buf) && *buf != '>';
                    ++buf, --bufsize)
-               if (!ph && *buf == '#')
+               if (!ph && *buf == '#' && *(buf - 1) != '&')
                   ph = buf;
               if (!bufsize)
                 break;
@@ -437,6 +436,83 @@ html_base (void)
    return global_state.base;
  }
  
+/* Create a malloc'ed copy of text in the range [beg, end), but with
+   the HTML entities processed.  Recognized entities are &lt, &gt,
+   &amp, &quot, &nbsp and the numerical entities.  */
+
+char *
+html_decode_entities (const char *beg, const char *end)
+{
+  char *newstr = (char *)xmalloc (end - beg + 1); /* assume worst-case. */
+  const char *from = beg;
+  char *to = newstr;
+
+  while (from < end)
+    {
+      if (*from != '&')
+       *to++ = *from++;
+      else
+       {
+         const char *save = from;
+         int remain;
+
+         if (++from == end) goto lose;
+         remain = end - from;
+
+         if (*from == '#')
+           {
+             int numeric;
+             ++from;
+             if (from == end || !ISDIGIT (*from)) goto lose;
+             for (numeric = 0; from < end && ISDIGIT (*from); from++)
+               numeric = 10 * numeric + (*from) - '0';
+             if (from < end && ISALPHA (*from)) goto lose;
+             numeric &= 0xff;
+             *to++ = numeric;
+           }
+#define FROB(literal) (remain >= (sizeof (literal) - 1)                        \
+                && !memcmp (from, literal, sizeof (literal) - 1)       \
+                && (*(from + sizeof (literal) - 1) == ';'              \
+                    || remain == sizeof (literal) - 1                  \
+                    || !ISALNUM (*(from + sizeof (literal) - 1))))
+         else if (FROB ("lt"))
+           *to++ = '<', from += 2;
+         else if (FROB ("gt"))
+           *to++ = '>', from += 2;
+         else if (FROB ("amp"))
+           *to++ = '&', from += 3;
+         else if (FROB ("quot"))
+           *to++ = '\"', from += 4;
+         /* We don't implement the "Added Latin 1" entities proposed
+            by rfc1866 (except for nbsp), because it is unnecessary
+            in the context of Wget, and would require hashing to work
+            efficiently.  */
+         else if (FROB ("nbsp"))
+           *to++ = 160, from += 4;
+         else
+           goto lose;
+#undef FROB
+         /* If the entity was followed by `;', we step over the `;'.
+            Otherwise, it was followed by either a non-alphanumeric
+            or EOB, in which case we do nothing.  */
+         if (from < end && *from == ';')
+           ++from;
+         continue;
+
+       lose:
+         /* This was not an entity after all.  Back out.  */
+         from = save;
+         *to++ = *from++;
+       }
+    }
+  *to++ = '\0';
+  /* #### Should we try to do this: */
+#if 0
+  newstr = xrealloc (newstr, to - newstr);
+#endif
+  return newstr;
+}
+
  /* The function returns the pointer to the malloc-ed quoted version of
     string s.  It will recognize and quote numeric and special graphic
     entities, as per RFC1866:
diff --git a/src/html.h b/src/html.h

index 7fa0132e1534f71fa530961aa68fe0c0dfbccf2b..824b6ca951c63176a05528c23e7934f8b08aa9a9 100644 (file)
--- a/src/html.h
+++ b/src/html.h
@@ -34,6 +34,7 @@ struct fileinfo;
  /* Function declarations */
  const char *htmlfindurl PARAMS ((const char *, int, int *, int, int));
  const char *html_base PARAMS ((void));
+char *html_decode_entities PARAMS ((const char *, const char *));
  uerr_t ftp_index PARAMS ((const char *, struct urlinfo *, struct fileinfo *));
  
  #endif /* HTML_H */
diff --git a/src/url.c b/src/url.c

index d68f1851e2e69da211983aa647578c1781413b40..0a9fa4daeac2f4ea760c90e9fa5356c37023f03e 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -869,6 +869,7 @@ get_urls_html (const char *file, const char *this_url, int silent,
        const char *pbuf = buf;
        char *constr, *base;
        const char *cbase;
+      char *needs_freeing, *url_data;
  
        first_time = 0;
  
@@ -889,16 +890,27 @@ get_urls_html (const char *file, const char *this_url, int silent,
        if (!size)
         break;
  
+      /* It would be nice if we could avoid allocating memory in this
+         loop, but I don't see an easy way.  To process the entities,
+         we need to either copy the data, or change it destructively.
+         I choose the former.
+
+        We have two pointers: needs_freeing and url_data, because the
+        code below does things like url_data += <something>, and we
+        want to pass the original string to free(). */
+      needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
+      size = strlen (url_data);
+
        for (i = 0; protostrings[i]; i++)
         {
-         if (!strncasecmp (protostrings[i], pbuf,
+         if (!strncasecmp (protostrings[i], url_data,
                             MINVAL (strlen (protostrings[i]), size)))
             break;
         }
        /* Check for http:RELATIVE_URI.  See below for details.  */
        if (protostrings[i]
-         && !(strncasecmp (pbuf, "http:", 5) == 0
-              && strncasecmp (pbuf, "http://", 7) != 0))
+         && !(strncasecmp (url_data, "http:", 5) == 0
+              && strncasecmp (url_data, "http://", 7) != 0))
         {
           no_proto = 0;
         }
@@ -909,20 +921,23 @@ get_urls_html (const char *file, const char *this_url, int silent,
              relative URI-s as <a href="http:URL">.  Just strip off the
              silly leading "http:" (as well as any leading blanks
              before it).  */
-         if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
-           pbuf += 5, size -= 5;
+         if ((size > 5) && !strncasecmp ("http:", url_data, 5))
+           url_data += 5, size -= 5;
         }
        if (!no_proto)
         {
           for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
             {
-             if (!strncasecmp (sup_protos[i].name, pbuf,
+             if (!strncasecmp (sup_protos[i].name, url_data,
                                MINVAL (strlen (sup_protos[i].name), size)))
                 break;
             }
           /* Do *not* accept a non-supported protocol.  */
           if (i == ARRAY_SIZE (sup_protos))
-           continue;
+           {
+             free (needs_freeing);
+             continue;
+           }
         }
        if (no_proto)
         {
@@ -945,13 +960,14 @@ get_urls_html (const char *file, const char *this_url, int silent,
                   /* Use malloc, not alloca because this is called in
                       a loop. */
                   char *temp = (char *)malloc (size + 1);
-                 strncpy (temp, pbuf, size);
+                 strncpy (temp, url_data, size);
                   temp[size] = '\0';
                   logprintf (LOG_NOTQUIET,
                              _("Error (%s): Link %s without a base provided.\n"),
                              file, temp);
                   free (temp);
                 }
+             free (needs_freeing);
               continue;
             }
           if (this_url)
@@ -966,17 +982,18 @@ get_urls_html (const char *file, const char *this_url, int silent,
                   logprintf (LOG_NOTQUIET, _("\
  Error (%s): Base %s relative, without referer URL.\n"),
                              file, cbase);
+                 free (needs_freeing);
                   continue;
                 }
               base = xstrdup (cbase);
             }
-         constr = construct (base, pbuf, size, no_proto);
+         constr = construct (base, url_data, size, no_proto);
           free (base);
         }
        else /* has proto */
         {
           constr = (char *)xmalloc (size + 1);
-         strncpy (constr, pbuf, size);
+         strncpy (constr, url_data, size);
           constr[size] = '\0';
         }
  #ifdef DEBUG
@@ -988,7 +1005,7 @@ Error (%s): Base %s relative, without referer URL.\n"),
           tmp2 = html_base ();
           /* Use malloc, not alloca because this is called in a loop. */
           tmp = (char *)xmalloc (size + 1);
-         strncpy (tmp, pbuf, size);
+         strncpy (tmp, url_data, size);
           tmp[size] = '\0';
           logprintf (LOG_ALWAYS,
                      "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
@@ -1009,14 +1026,15 @@ Error (%s): Base %s relative, without referer URL.\n"),
        memset (current, 0, sizeof (*current));
        current->next = NULL;
        current->url = constr;
-      current->size = size;
-      current->pos = pbuf - orig_buf;
+      current->size = step;
+      current->pos = buf - orig_buf;
        /* A URL is relative if the host and protocol are not named,
          and the name does not start with `/'.  */
-      if (no_proto && *pbuf != '/')
+      if (no_proto && *url_data != '/')
         current->flags |= (URELATIVE | UNOPROTO);
        else if (no_proto)
         current->flags |= UNOPROTO;
+      free (needs_freeing);
      }
    free (orig_buf);
author	hniksic <devnull@localhost>
	Wed, 1 Nov 2000 01:25:12 +0000 (17:25 -0800)
committer	hniksic <devnull@localhost>
	Wed, 1 Nov 2000 01:25:12 +0000 (17:25 -0800)
src/ChangeLog		patch \| blob \| history
src/html.c		patch \| blob \| history
src/html.h		patch \| blob \| history
src/url.c		patch \| blob \| history