[svn] Ignore -np for page requisites (inline links) when in -p mode.

author hniksic <devnull@localhost>

Fri, 30 Nov 2001 21:17:53 +0000 (13:17 -0800)

committer hniksic <devnull@localhost>

Fri, 30 Nov 2001 21:17:53 +0000 (13:17 -0800)
author hniksic <devnull@localhost>
Fri, 30 Nov 2001 21:17:53 +0000 (13:17 -0800)
committer hniksic <devnull@localhost>
Fri, 30 Nov 2001 21:17:53 +0000 (13:17 -0800)
diff --git a/TODO b/TODO

index 4589bab9e1d14011fb5935c0b44b55853d6a1f3f..84c796a3ed4bd0a2b2cbb2481a8a11d751d0b8fe 100644 (file)
--- a/TODO
+++ b/TODO
@@ -17,12 +17,6 @@ changes.
  
  * -p should probably go "_two_ more hops" on <FRAMESET> pages.
  
-* Only normal link-following recursion should respect -np.  Page-requisite
-  recursion should not.  When -np -p is specified, Wget should still retrieve
-  requisite images and such on the server, even if they aren't in that directory
-  or a subdirectory of it.  Likewise, -H -np -p should retrieve requisite files
-  from other hosts. 
-
  * Add a --range parameter allowing you to explicitly specify a range of bytes to
    get from a file over HTTP (FTP only supports ranges ending at the end of the
    file, though forcibly disconnecting from the server at the desired endpoint
diff --git a/src/ChangeLog b/src/ChangeLog

index ed710030e4ccdd110e63be242838c9c1e49690ce..f480009a390f27a5afdbeca6185ca258074fdda0 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,16 @@
+2001-11-30  Hrvoje Niksic  <hniksic@arsdigita.com>
+
+       * recur.c (retrieve_tree): Skip the non-inline entries when
+       enqueuing the children of a leaf HTML node in -p mode.
+       (descend_url_p): Ignore opt.no_parent when in -p mode and UPOS is
+       "inline".
+
+       * html-url.c (get_urls_html): Don't accept dash_p_leaf_HTML.
+       (collect_tags_mapper): When an entry is "inline", mark it as such.
+
+       * recur.c (descend_url_p): Fix the inverted INFINITE_RECURSION
+       test when checking for acceptance/rejection rules.
+
  2001-10-31 Daniel BODEA <dali@dali-designs.com>
  
         * netrc.c (search_netrc): When slack_default is 0, still look for
diff --git a/src/html-url.c b/src/html-url.c

index 051f5057439fcefd6e31b16aabaaf0603f1a3eff..5942a49f61e8d8c0358f646ee42ee88ad817b5af 100644 (file)
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -287,9 +287,6 @@ struct collect_urls_closure {
    struct urlpos *head, *tail;  /* List of URLs */
    const char *parent_base;     /* Base of the current document. */
    const char *document_file;   /* File name of this document. */
-  int dash_p_leaf_HTML;                /* Whether -p is specified, and this
-                                   document is the "leaf" node of the
-                                   HTML tree. */
    int nofollow;                        /* whether NOFOLLOW was specified in a
                                     <meta name=robots> tag. */
  };
@@ -413,20 +410,18 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
             for (i = first; (i < size && url_tag_attr_map[i].tagid == tagid);
                  i++)
               {
-               char *attr_value;
-               if (closure->dash_p_leaf_HTML
-                   && (url_tag_attr_map[i].flags & AF_EXTERNAL))
-                 /* If we're at a -p leaf node, we don't want to retrieve
-                    links to references we know are external to this document,
-                    such as <a href=...>.  */
-                 continue;
-
-               if (!strcasecmp (tag->attrs[id].name,
-                                url_tag_attr_map[i].attr_name))
+               if (0 == strcasecmp (tag->attrs[id].name,
+                                    url_tag_attr_map[i].attr_name))
                   {
-                   attr_value = tag->attrs[id].value;
+                   char *attr_value = tag->attrs[id].value;
                     if (attr_value)
-                     handle_link (closure, attr_value, tag, id);
+                     {
+                       struct urlpos *entry;
+                       entry = handle_link (closure, attr_value, tag, id);
+                       if (entry != NULL
+                           && !(url_tag_attr_map[i].flags & AF_EXTERNAL))
+                         entry->link_inline_p = 1;
+                     }
                   }
               }
           }
@@ -460,24 +455,20 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
         case TAG_LINK:
           {
             int id;
-           char *rel  = find_attr (tag, "rel", NULL);
             char *href = find_attr (tag, "href", &id);
+
+           /* All <link href="..."> link references are external,
+              except for <link rel="stylesheet" href="...">.  */
             if (href)
               {
-               /* In the normal case, all <link href=...> tags are
-                  fair game.
-
-                  In the special case of when -p is active, however,
-                  and we're at a leaf node (relative to the -l
-                  max. depth) in the HTML document tree, the only
-                  <LINK> tag we'll follow is a <LINK REL=
-                  "stylesheet">, as it'll be necessary for displaying
-                  this document properly.  We won't follow other
-                  <LINK> tags, like <LINK REL="home">, for instance,
-                  as they refer to external documents.  */
-               if (!closure->dash_p_leaf_HTML
-                   || (rel && !strcasecmp (rel, "stylesheet")))
-                 handle_link (closure, href, tag, id);
+               struct urlpos *entry;
+               entry = handle_link (closure, href, tag, id);
+               if (entry != NULL)
+                 {
+                   char *rel  = find_attr (tag, "rel", NULL);
+                   if (rel && 0 == strcasecmp (rel, "stylesheet"))
+                     entry->link_inline_p = 1;
+                 }
               }
           }
           break;
@@ -557,13 +548,9 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
  
  /* Analyze HTML tags FILE and construct a list of URLs referenced from
     it.  It merges relative links in FILE with URL.  It is aware of
-   <base href=...> and does the right thing.
-
-   If dash_p_leaf_HTML is non-zero, only the elements needed to render
-   FILE ("non-external" links) will be returned.  */
+   <base href=...> and does the right thing.  */
  struct urlpos *
-get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
-              int *meta_disallow_follow)
+get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
  {
    struct file_memory *fm;
    struct collect_urls_closure closure;
@@ -582,7 +569,6 @@ get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
    closure.base = NULL;
    closure.parent_base = url ? url : opt.base_href;
    closure.document_file = file;
-  closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
    closure.nofollow = 0;
  
    if (!interesting_tags)
diff --git a/src/recur.c b/src/recur.c

index 0aa96498fa2c9cb059bef6164996e7d836b891d1..6b8c41b079e9c6cf72581ada7c5823ce2ea9b19b 100644 (file)
--- a/src/recur.c
+++ b/src/recur.c
@@ -279,8 +279,8 @@ retrieve_tree (const char *start_url)
        if (descend)
         {
           int meta_disallow_follow = 0;
-         struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
-                                                  &meta_disallow_follow);
+         struct urlpos *children
+           = get_urls_html (file, url, &meta_disallow_follow);
  
           if (opt.use_robots && meta_disallow_follow)
             {
@@ -298,6 +298,8 @@ retrieve_tree (const char *start_url)
                 {
                   if (child->ignore_when_downloading)
                     continue;
+                 if (dash_p_leaf_HTML && !child->link_inline_p)
+                   continue;
                   if (descend_url_p (child, url_parsed, depth, start_url_parsed,
                                      blacklist))
                     {
@@ -435,11 +437,13 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
    /* 4. Check for parent directory.
  
       If we descended to a different host or changed the scheme, ignore
-     opt.no_parent.  Also ignore it for -p leaf retrievals.  */
+     opt.no_parent.  Also ignore it for documents needed to display
+     the parent page when in -p mode.  */
    if (opt.no_parent
        && u->scheme == start_url_parsed->scheme
        && 0 == strcasecmp (u->host, start_url_parsed->host)
-      && u->port == start_url_parsed->port)
+      && u->port == start_url_parsed->port
+      && !(opt.page_requisites && upos->link_inline_p))
      {
        if (!frontcmp (start_url_parsed->dir, u->dir))
         {
@@ -482,7 +486,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
      if (u->file[0] != '\0'
         && ((suf = suffix (url)) == NULL
             || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
-           || (opt.reclevel == INFINITE_RECURSION && depth >= opt.reclevel)))
+           || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
        {
         if (!acceptable (u->file))
           {
@@ -674,7 +678,7 @@ convert_all_links (void)
         DEBUGP (("I cannot find the corresponding URL.\n"));
  
        /* Parse the HTML file...  */
-      urls = get_urls_html (html->string, url, FALSE, NULL);
+      urls = get_urls_html (html->string, url, NULL);
  
        /* We don't respect meta_disallow_follow here because, even if
           the file is not followed, we might still want to convert the
diff --git a/src/retr.c b/src/retr.c

index 858226799194df2f7350424c89fdb02cb353f98d..6c12462a2a52b5444fdb9b924e564dee0dae8ba2 100644 (file)
--- a/src/retr.c
+++ b/src/retr.c
@@ -535,7 +535,7 @@ retrieve_from_file (const char *file, int html, int *count)
    uerr_t status;
    struct urlpos *url_list, *cur_url;
  
-  url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
+  url_list = (html ? get_urls_html (file, NULL, NULL)
               : get_urls_file (file));
    status = RETROK;             /* Suppose everything is OK.  */
    *count = 0;                  /* Reset the URL count.  */
diff --git a/src/url.h b/src/url.h

index 836cdad1bb2661eb22cb78456b6458a8015efa85..3c42c5aa1c661199970dd2b1b7f2d450bdc996b4 100644 (file)
--- a/src/url.h
+++ b/src/url.h
@@ -79,16 +79,17 @@ struct urlpos {
    char *local_name;            /* local file to which it was saved
                                    (used by convert_links) */
  
-  int ignore_when_downloading; /* reserved for special links such as
-                                  <base href="..."> which are used
-                                  when converting links, but ignored
-                                  when downloading.  */
+  /* reserved for special links such as <base href="..."> which are
+     used when converting links, but ignored when downloading.  */
+  unsigned int ignore_when_downloading :1;
  
    /* Information about the original link: */
-  int link_relative_p;         /* was the link relative? */
-  int link_complete_p;         /* was the link complete (with the
-                                   host name, etc.) */
-  int link_base_p;             /* was the link <base href=...> */
+
+  unsigned int link_relative_p :1; /* was the link relative? */
+  unsigned int link_complete_p :1; /* was the link complete (with the
+                                      host name, etc.) */
+  unsigned int link_base_p     :1; /* was the link <base href=...> */
+  unsigned int link_inline_p   :1; /* needed to render the page. */
  
    /* Conversion requirements: */
    enum convert_options convert;        /* is conversion required? */
@@ -134,7 +135,7 @@ int url_skip_uname PARAMS ((const char *));
  char *url_string PARAMS ((const struct url *, int));
  
  struct urlpos *get_urls_file PARAMS ((const char *));
-struct urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *));
+struct urlpos *get_urls_html PARAMS ((const char *, const char *, int *));
  void free_urlpos PARAMS ((struct urlpos *));
  
  char *uri_merge PARAMS ((const char *, const char *));
author	hniksic <devnull@localhost>
	Fri, 30 Nov 2001 21:17:53 +0000 (13:17 -0800)
committer	hniksic <devnull@localhost>
	Fri, 30 Nov 2001 21:17:53 +0000 (13:17 -0800)
TODO		patch \| blob \| history
src/ChangeLog		patch \| blob \| history
src/html-url.c		patch \| blob \| history
src/recur.c		patch \| blob \| history
src/retr.c		patch \| blob \| history
src/url.h		patch \| blob \| history