xfree (qel);
return 1;
}
-
+\f
static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
struct url *, struct hash_table *));
+static int descend_redirect_p PARAMS ((const char *, const char *, int,
+ struct url *, struct hash_table *));
+
/* Retrieve a part of the web beginning with START_URL. This used to
be called "recursive retrieval", because the old function was
/* The queue of URLs we need to load. */
struct url_queue *queue = url_queue_new ();
- /* The URLs we decided we don't want to load. */
+ /* The URLs we do not wish to enqueue, because they are already in
+ the queue or have already been downloaded. */
struct hash_table *blacklist = make_string_hash_table (0);
/* We'll need various components of this, so better get it over with
status = retrieve_url (url, &file, &redirected, NULL, &dt);
opt.recursive = oldrec;
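+ /* The download succeeded and the result is an HTML document, so
+ this URL is a candidate for descending into the links it
+ contains. */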
+ if (file && status == RETROK
+ && (dt & RETROKF) && (dt & TEXTHTML))
+ descend = 1;
+
if (redirected)
{
+ /* We have been redirected, possibly to another host, or
+ different path, or wherever. Check whether we really
+ want to follow it. */
+ if (descend)
+ {
+ if (!descend_redirect_p (redirected, url, depth,
+ start_url_parsed, blacklist))
+ descend = 0;
+ else
+ /* Make sure that the old pre-redirect form gets
+ blacklisted. */
+ string_set_add (blacklist, url);
+ }
+
xfree (url);
url = redirected;
}
- if (file && status == RETROK
- && (dt & RETROKF) && (dt & TEXTHTML))
- descend = 1;
}
if (descend
&& depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
{
- if (opt.page_requisites && depth == opt.reclevel)
- /* When -p is specified, we can do one more partial
- recursion from the "leaf nodes" on the HTML document
- tree. The recursion is partial in that we won't
- traverse any <A> or <AREA> tags, nor any <LINK> tags
- except for <LINK REL="stylesheet">. */
- /* #### This would be the place to implement the TODO
- entry saying that -p should do two more hops on
- framesets. */
- dash_p_leaf_HTML = TRUE;
+ if (opt.page_requisites
+ && (depth == opt.reclevel || depth == opt.reclevel + 1))
+ {
+ /* When -p is specified, we are allowed to exceed the
+ maximum depth, but only for the "inline" links,
+ i.e. those that are needed to display the page.
+ Originally this could exceed the depth at most by
+ one, but we allow one more level so that the leaf
+ pages that contain frames can be loaded
+ correctly. */
+ dash_p_leaf_HTML = TRUE;
+ }
else
{
/* Either -p wasn't specified or it was and we've
- already gone the one extra (pseudo-)level that it
+ already spent the two extra (pseudo-)levels that it
affords us, so we need to bail out. */
DEBUGP (("Not descending further; at depth %d, max. %d.\n",
depth, opt.reclevel));
if (descend)
{
int meta_disallow_follow = 0;
- struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
- &meta_disallow_follow);
+ struct urlpos *children
+ = get_urls_html (file, url, &meta_disallow_follow);
if (opt.use_robots && meta_disallow_follow)
{
{
if (child->ignore_when_downloading)
continue;
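+ /* In -p "leaf" mode, follow only the inline links, i.e. the
+ page requisites needed to display this document. */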
+ if (dash_p_leaf_HTML && !child->link_inline_p)
+ continue;
if (descend_url_p (child, url_parsed, depth, start_url_parsed,
blacklist))
{
opt.delete_after ? "--delete-after" :
"recursive rejection criteria"));
logprintf (LOG_VERBOSE,
- (opt.delete_after ? _("Removing %s.\n")
+ (opt.delete_after
+ ? _("Removing %s.\n")
: _("Removing %s since it should be rejected.\n")),
file);
if (unlink (file))
/* Based on the context provided by retrieve_tree, decide whether a
URL is to be descended to. This is only ever called from
- retrieve_tree, but is in a separate function for clarity. */
+ retrieve_tree, but is in a separate function for clarity.
+
+ The most expensive checks (such as those for robots) are memoized
+ by storing the rejected URLs in BLACKLIST. This may or may not
+ help; it pays off when those URLs are encountered many times. */
static int
descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
&& !(u->scheme == SCHEME_FTP && opt.follow_ftp))
{
DEBUGP (("Not following non-HTTP schemes.\n"));
- goto blacklist;
+ goto out;
}
/* 2. If it is an absolute link and they are not followed, throw it
if (opt.relative_only && !upos->link_relative_p)
{
DEBUGP (("It doesn't really look like a relative link.\n"));
- goto blacklist;
+ goto out;
}
/* 3. If its domain is not to be accepted/looked-up, chuck it
if (!accept_domain (u))
{
DEBUGP (("The domain was not accepted.\n"));
- goto blacklist;
+ goto out;
}
/* 4. Check for parent directory.
If we descended to a different host or changed the scheme, ignore
- opt.no_parent. Also ignore it for -p leaf retrievals. */
+ opt.no_parent. Also ignore it for documents needed to display
+ the parent page when in -p mode. */
if (opt.no_parent
- && u->scheme == parent->scheme
- && 0 == strcasecmp (u->host, parent->host)
- && u->port == parent->port)
+ && u->scheme == start_url_parsed->scheme
+ && 0 == strcasecmp (u->host, start_url_parsed->host)
+ && u->port == start_url_parsed->port
+ && !(opt.page_requisites && upos->link_inline_p))
{
- if (!frontcmp (parent->dir, u->dir))
+ if (!frontcmp (start_url_parsed->dir, u->dir))
{
- DEBUGP (("Trying to escape the root directory with no_parent in effect.\n"));
- goto blacklist;
+ DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
+ u->dir, start_url_parsed->dir));
+ goto out;
}
}
if (!accdir (u->dir, ALLABS))
{
DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
- goto blacklist;
+ goto out;
}
}
/* 6. */
{
- char *suf = NULL;
+ char *suf;
/* Check for acceptance/rejection rules. We ignore these rules
for HTML documents because they might lead to other files which
need to be downloaded. Of course, we don't know which
if (u->file[0] != '\0'
&& ((suf = suffix (url)) == NULL
|| (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
- || (opt.reclevel == INFINITE_RECURSION && depth >= opt.reclevel)))
+ || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
{
if (!acceptable (u->file))
{
DEBUGP (("%s (%s) does not match acc/rej rules.\n",
url, u->file));
- FREE_MAYBE (suf);
- goto blacklist;
+ goto out;
}
}
- FREE_MAYBE (suf);
}
/* 7. */
{
DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
u->host, parent->host));
- goto blacklist;
+ goto out;
}
/* 8. */
if (!res_match_path (specs, u->path))
{
DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
- goto blacklist;
+ string_set_add (blacklist, url);
+ goto out;
}
}
return 1;
- blacklist:
- string_set_add (blacklist, url);
-
out:
DEBUGP (("Decided NOT to load it.\n"));
return 0;
}
+
+/* This function determines whether we should descend to the children
+ of the URL whose download resulted in a redirection, possibly to
+ another host, etc. It is needed very rarely, and thus it is merely
+ a simple-minded wrapper around descend_url_p. */
+
+static int
+descend_redirect_p (const char *redirected, const char *original, int depth,
+ struct url *start_url_parsed, struct hash_table *blacklist)
+{
+ struct url *orig_parsed, *new_parsed;
+ struct urlpos *upos;
+ int success;
+
+ orig_parsed = url_parse (original, NULL);
+ assert (orig_parsed != NULL);
+
+ new_parsed = url_parse (redirected, NULL);
+ assert (new_parsed != NULL);
+
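+ /* Wrap the redirect target in a zeroed-out urlpos so that
+ descend_url_p can examine it like any other link discovered
+ in the parent document. */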
+ upos = xmalloc (sizeof (struct urlpos));
+ memset (upos, 0, sizeof (*upos));
+ upos->url = new_parsed;
+
+ success = descend_url_p (upos, orig_parsed, depth,
+ start_url_parsed, blacklist);
+
+ url_free (orig_parsed);
+ url_free (new_parsed);
+ xfree (upos);
+
+ if (!success)
+ DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
+
+ return success;
+}
+
\f
/* Register that URL has been successfully downloaded to FILE. */
downloaded_html_files = slist_prepend (downloaded_html_files, file);
}
-/* convert_links() is called from recursive_retrieve() after we're
- done with an HTML file. This call to convert_links is not complete
- because it converts only the downloaded files, and Wget cannot know
- which files will be downloaded afterwards. So, if we have file
- fileone.html with:
-
- <a href="/c/something.gif">
-
- and /c/something.gif was not downloaded because it exceeded the
- recursion depth, the reference will *not* be changed.
-
- However, later we can encounter /c/something.gif from an "upper"
- level HTML (let's call it filetwo.html), and it gets downloaded.
+/* This function is called when the retrieval is done to convert the
+ links that have been downloaded. It has to be called at the end of
+ the retrieval, because only then does Wget know conclusively which
+ URLs have been downloaded, and which not, so it can tell which
+ direction to convert to.
- But now we have a problem because /c/something.gif will be
- correctly transformed in filetwo.html, but not in fileone.html,
- since Wget could not have known that /c/something.gif will be
- downloaded in the future.
+ The "direction" means that the URLs to the files that have been
+ downloaded get converted to the relative URL which will point to
+ that file. And the other URLs get converted to the remote URL on
+ the server.
- This is why Wget must, after the whole retrieval, call
- convert_all_links to go once more through the entire list of
- retrieved HTMLs, and re-convert them.
+ All the downloaded HTMLs are kept in downloaded_html_files, and
+ downloaded URLs in urls_downloaded. All the information is
+ extracted from these two lists. */
- All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs
- in urls_downloaded. From these two lists information is
- extracted. */
void
convert_all_links (void)
{
slist *html;
+ struct wget_timer *timer;
+ long msecs;
+ int file_count = 0;
+
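+ /* Time the conversion so that it can be reported when we are
+ done. */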
+ timer = wtimer_new ();
/* Destructively reverse downloaded_html_files to get it in the right order.
recursive_retrieve() used slist_prepend() consistently. */
DEBUGP (("I cannot find the corresponding URL.\n"));
/* Parse the HTML file... */
- urls = get_urls_html (html->string, url, FALSE, NULL);
+ urls = get_urls_html (html->string, url, NULL);
/* We don't respect meta_disallow_follow here because, even if
the file is not followed, we might still want to convert the
cur_url->local_name = NULL;
}
}
+
/* Convert the links in the file. */
convert_links (html->string, urls);
+ ++file_count;
+
/* Free the data. */
free_urlpos (urls);
}
+
+ msecs = wtimer_elapsed (timer);
+ wtimer_delete (timer);
+ logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
+ file_count, (double)msecs / 1000);
}
/* Cleanup the data structures associated with recursive retrieving