[svn] Don't descend into HTML that was downloaded by following <img src=...>

author hniksic <devnull@localhost>

Fri, 10 Oct 2003 14:25:10 +0000 (07:25 -0700)

committer hniksic <devnull@localhost>

Fri, 10 Oct 2003 14:25:10 +0000 (07:25 -0700)
author hniksic <devnull@localhost>
Fri, 10 Oct 2003 14:25:10 +0000 (07:25 -0700)
committer hniksic <devnull@localhost>
Fri, 10 Oct 2003 14:25:10 +0000 (07:25 -0700)
diff --git a/src/ChangeLog b/src/ChangeLog

index e31b1989e7584a11e3cdc8d71af2f70ab4f48eb4..a554f15070a62347411876fa61c8490fbf19f451 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,13 @@
+2003-10-10  Hrvoje Niksic  <hniksic@xemacs.org>
+
+       * recur.c (retrieve_tree): Don't descend into documents that are
+       not expected to contain HTML, regardless of their content-type.
+
+       * html-url.c (tag_url_attributes): Record which attributes are
+       supposed to yield HTML links that can be followed.
+       (tag_find_urls): Propagate that information to the caller through
+       struct urlpos.
+
  2003-10-10  Hrvoje Niksic  <hniksic@xemacs.org>
  
         * hash.c (find_mapping): Return the next available mapping when
diff --git a/src/convert.h b/src/convert.h

index 66b7dfc7d9dd9ff81e946352443ac4c3e5cfdef4..fff8410f009966af6224a518689658a33b8b90c3 100644 (file)
--- a/src/convert.h
+++ b/src/convert.h
@@ -56,11 +56,11 @@ struct urlpos {
  
    /* Information about the original link: */
  
-  unsigned int link_relative_p :1; /* was the link relative? */
-  unsigned int link_complete_p :1; /* was the link complete (with the
-                                      host name, etc.) */
-  unsigned int link_base_p     :1; /* was the link <base href=...> */
-  unsigned int link_inline_p   :1; /* needed to render the page. */
+  unsigned int link_relative_p :1; /* the link was relative */
+  unsigned int link_complete_p :1; /* the link was complete (had host name) */
+  unsigned int link_base_p     :1; /* the url came from <base href=...> */
+  unsigned int link_inline_p   :1; /* needed to render the page */
+  unsigned int link_expect_html        :1; /* expected to contain HTML */
  
    unsigned int link_refresh_p  :1; /* link was received from
                                        <meta http-equiv=refresh content=...> */
diff --git a/src/html-url.c b/src/html-url.c

index 80f5b96c19ffcec006a0f9d3565868ddd11be95a..c2ed2c588c26f27355e2864e42838165a81bdde6 100644 (file)
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -121,11 +121,19 @@ static struct known_tag {
  /* tag_url_attributes documents which attributes of which tags contain
     URLs to harvest.  It is used by tag_find_urls.  */
  
-/* Defines for the FLAGS field; currently only one flag is defined. */
+/* Defines for the FLAGS. */
  
-/* This tag points to an external document not necessary for rendering this 
-   document (i.e. it's not an inlined image, stylesheet, etc.). */
-#define TUA_EXTERNAL 1
+/* The link is "inline", i.e. needs to be retrieved for this document
+   to be correctly rendered.  Inline links include inlined images,
+   stylesheets, children frames, etc.  */
+#define ATTR_INLINE    1
+
+/* The link is expected to yield HTML contents.  It's important not to
+   try to follow HTML obtained by following e.g. <img src="...">
+   regardless of content-type.  Doing this causes infinite loops for
+   "images" that return non-404 error pages with links to the same
+   image.  */
+#define ATTR_HTML      2
  
  /* For tags handled by tag_find_urls: attributes that contain URLs to
     download. */
@@ -134,26 +142,26 @@ static struct {
    const char *attr_name;
    int flags;
  } tag_url_attributes[] = {
-  { TAG_A,             "href",         TUA_EXTERNAL },
-  { TAG_APPLET,                "code",         0 },
-  { TAG_AREA,          "href",         TUA_EXTERNAL },
-  { TAG_BGSOUND,       "src",          0 },
-  { TAG_BODY,          "background",   0 },
-  { TAG_EMBED,         "href",         TUA_EXTERNAL },
-  { TAG_EMBED,         "src",          0 },
-  { TAG_FIG,           "src",          0 },
-  { TAG_FRAME,         "src",          0 },
-  { TAG_IFRAME,                "src",          0 },
-  { TAG_IMG,           "href",         0 },
-  { TAG_IMG,           "lowsrc",       0 },
-  { TAG_IMG,           "src",          0 },
-  { TAG_INPUT,         "src",          0 },
-  { TAG_LAYER,         "src",          0 },
-  { TAG_OVERLAY,       "src",          0 },
-  { TAG_SCRIPT,                "src",          0 },
-  { TAG_TABLE,         "background",   0 },
-  { TAG_TD,            "background",   0 },
-  { TAG_TH,            "background",   0 }
+  { TAG_A,             "href",         ATTR_HTML },
+  { TAG_APPLET,                "code",         ATTR_INLINE },
+  { TAG_AREA,          "href",         ATTR_HTML },
+  { TAG_BGSOUND,       "src",          ATTR_INLINE },
+  { TAG_BODY,          "background",   ATTR_INLINE },
+  { TAG_EMBED,         "href",         ATTR_HTML },
+  { TAG_EMBED,         "src",          ATTR_INLINE | ATTR_HTML },
+  { TAG_FIG,           "src",          ATTR_INLINE },
+  { TAG_FRAME,         "src",          ATTR_INLINE | ATTR_HTML },
+  { TAG_IFRAME,                "src",          ATTR_INLINE | ATTR_HTML },
+  { TAG_IMG,           "href",         ATTR_INLINE },
+  { TAG_IMG,           "lowsrc",       ATTR_INLINE },
+  { TAG_IMG,           "src",          ATTR_INLINE },
+  { TAG_INPUT,         "src",          ATTR_INLINE },
+  { TAG_LAYER,         "src",          ATTR_INLINE | ATTR_HTML },
+  { TAG_OVERLAY,       "src",          ATTR_INLINE | ATTR_HTML },
+  { TAG_SCRIPT,                "src",          ATTR_INLINE },
+  { TAG_TABLE,         "background",   ATTR_INLINE },
+  { TAG_TD,            "background",   ATTR_INLINE },
+  { TAG_TH,            "background",   ATTR_INLINE }
  };
  
  /* The lists of interesting tags and attributes are built dynamically,
@@ -262,7 +270,7 @@ struct map_context {
     size.  */
  
  static struct urlpos *
-append_one_url (const char *link_uri, int inlinep,
+append_one_url (const char *link_uri,
                 struct taginfo *tag, int attrind, struct map_context *ctx)
  {
    int link_has_scheme = url_has_scheme (link_uri);
@@ -326,7 +334,6 @@ append_one_url (const char *link_uri, int inlinep,
    newel->url = url;
    newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
    newel->size = tag->attrs[attrind].value_raw_size;
-  newel->link_inline_p = inlinep;
  
    /* A URL is relative if the host is not named, and the name does not
       start with `/'.  */
@@ -393,8 +400,15 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
           if (0 == strcasecmp (tag->attrs[attrind].name,
                                tag_url_attributes[i].attr_name))
             {
-             int flags = tag_url_attributes[i].flags;
-             append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
+             struct urlpos *up = append_one_url (link, tag, attrind, ctx);
+             if (up)
+               {
+                 int flags = tag_url_attributes[i].flags;
+                 if (flags & ATTR_INLINE)
+                   up->link_inline_p = 1;
+                 if (flags & ATTR_HTML)
+                   up->link_expect_html = 1;
+               }
             }
         }
      }
@@ -411,7 +425,7 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
    if (!newbase)
      return;
  
-  base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
+  base_urlpos = append_one_url (newbase, tag, attrind, ctx);
    if (!base_urlpos)
      return;
    base_urlpos->ignore_when_downloading = 1;
@@ -434,10 +448,9 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
    char *action = find_attr (tag, "action", &attrind);
    if (action)
      {
-      struct urlpos *action_urlpos = append_one_url (action, 0, tag,
-                                                    attrind, ctx);
-      if (action_urlpos)
-       action_urlpos->ignore_when_downloading = 1;
+      struct urlpos *up = append_one_url (action, tag, attrind, ctx);
+      if (up)
+       up->ignore_when_downloading = 1;
      }
  }
  
@@ -458,11 +471,15 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
    */
    if (href)
      {
-      char *rel  = find_attr (tag, "rel", NULL);
-      int inlinep = (rel
-                    && (0 == strcasecmp (rel, "stylesheet")
-                        || 0 == strcasecmp (rel, "shortcut icon")));
-      append_one_url (href, inlinep, tag, attrind, ctx);
+      struct urlpos *up = append_one_url (href, tag, attrind, ctx);
+      if (up)
+       {
+         char *rel = find_attr (tag, "rel", NULL);
+         if (rel
+             && (0 == strcasecmp (rel, "stylesheet")
+                 || 0 == strcasecmp (rel, "shortcut icon")))
+           up->link_inline_p = 1;
+       }
      }
  }
  
@@ -511,7 +528,7 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
        while (ISSPACE (*p))
         ++p;
  
-      entry = append_one_url (p, 0, tag, attrind, ctx);
+      entry = append_one_url (p, tag, attrind, ctx);
        if (entry)
         {
           entry->link_refresh_p = 1;
diff --git a/src/recur.c b/src/recur.c

index 007354b76770af936ed3b3588e46a4d2e1d856d8..bf367074d9a0f2c4a6846f7b473e0cc32a834e45 100644 (file)
--- a/src/recur.c
+++ b/src/recur.c
@@ -6,7 +6,7 @@ This file is part of GNU Wget.
  GNU Wget is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+ (at your option) any later version.
  
  GNU Wget is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -66,10 +66,13 @@ extern struct hash_table *downloaded_html_set;
  /* Functions for maintaining the URL queue.  */
  
  struct queue_element {
-  const char *url;
-  const char *referer;
-  int depth;
-  struct queue_element *next;
+  const char *url;             /* the URL to download */
+  const char *referer;         /* the referring document */
+  int depth;                   /* the depth */
+  unsigned int html_allowed :1;        /* whether the document is allowed to
+                                  be treated as HTML. */
+
+  struct queue_element *next;  /* next element in queue */
  };
  
  struct url_queue {
@@ -102,12 +105,13 @@ url_queue_delete (struct url_queue *queue)
  
  static void
  url_enqueue (struct url_queue *queue,
-            const char *url, const char *referer, int depth)
+            const char *url, const char *referer, int depth, int html_allowed)
  {
    struct queue_element *qel = xmalloc (sizeof (*qel));
    qel->url = url;
    qel->referer = referer;
    qel->depth = depth;
+  qel->html_allowed = html_allowed;
    qel->next = NULL;
  
    ++queue->count;
@@ -130,7 +134,8 @@ url_enqueue (struct url_queue *queue,
  
  static int
  url_dequeue (struct url_queue *queue,
-            const char **url, const char **referer, int *depth)
+            const char **url, const char **referer, int *depth,
+            int *html_allowed)
  {
    struct queue_element *qel = queue->head;
  
@@ -144,6 +149,7 @@ url_dequeue (struct url_queue *queue,
    *url = qel->url;
    *referer = qel->referer;
    *depth = qel->depth;
+  *html_allowed = qel->html_allowed;
  
    --queue->count;
  
@@ -208,14 +214,14 @@ retrieve_tree (const char *start_url)
  
    /* Enqueue the starting URL.  Use start_url_parsed->url rather than
       just URL so we enqueue the canonical form of the URL.  */
-  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0);
+  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1);
    string_set_add (blacklist, start_url_parsed->url);
  
    while (1)
      {
        int descend = 0;
        char *url, *referer, *file = NULL;
-      int depth;
+      int depth, html_allowed;
        boolean dash_p_leaf_HTML = FALSE;
  
        if (downloaded_exceeds_quota ())
@@ -227,7 +233,7 @@ retrieve_tree (const char *start_url)
  
        if (!url_dequeue (queue,
                         (const char **)&url, (const char **)&referer,
-                       &depth))
+                       &depth, &html_allowed))
         break;
  
        /* ...and download it.  Note that this download is in most cases
@@ -245,7 +251,8 @@ retrieve_tree (const char *start_url)
           DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
                    url, file));
  
-         if (downloaded_html_set
+         if (html_allowed
+             && downloaded_html_set
               && string_set_contains (downloaded_html_set, file))
             descend = 1;
         }
@@ -259,7 +266,7 @@ retrieve_tree (const char *start_url)
           status = retrieve_url (url, &file, &redirected, referer, &dt);
           opt.recursive = oldrec;
  
-         if (file && status == RETROK
+         if (html_allowed && file && status == RETROK
               && (dt & RETROKF) && (dt & TEXTHTML))
             descend = 1;
  
@@ -341,7 +348,8 @@ retrieve_tree (const char *start_url)
                                         blacklist))
                     {
                       url_enqueue (queue, xstrdup (child->url->url),
-                                  xstrdup (url), depth + 1);
+                                  xstrdup (url), depth + 1,
+                                  child->link_expect_html);
                       /* We blacklist the URL we have enqueued, because we
                          don't want to enqueue (and hence download) the
                          same URL twice.  */
@@ -382,8 +390,9 @@ retrieve_tree (const char *start_url)
       now.  */
    {
      char *d1, *d2;
-    int d3;
-    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
+    int d3, d4;
+    while (url_dequeue (queue,
+                       (const char **)&d1, (const char **)&d2, &d3, &d4))
        {
         xfree (d1);
         FREE_MAYBE (d2);
author	hniksic <devnull@localhost>
	Fri, 10 Oct 2003 14:25:10 +0000 (07:25 -0700)
committer	hniksic <devnull@localhost>
	Fri, 10 Oct 2003 14:25:10 +0000 (07:25 -0700)
src/ChangeLog		patch | blob | history
src/convert.h		patch | blob | history
src/html-url.c		patch | blob | history
src/recur.c		patch | blob | history