[svn] Handle <base href=...> when converting links.

author hniksic <devnull@localhost>

Sun, 25 Nov 2001 18:40:55 +0000 (10:40 -0800)

committer hniksic <devnull@localhost>

Sun, 25 Nov 2001 18:40:55 +0000 (10:40 -0800)
author hniksic <devnull@localhost>
Sun, 25 Nov 2001 18:40:55 +0000 (10:40 -0800)
committer hniksic <devnull@localhost>
Sun, 25 Nov 2001 18:40:55 +0000 (10:40 -0800)
diff --git a/NEWS b/NEWS

index 8a3b0b70d62bca1899d977616d41a0fdafd1bae4..d9849ef0f57c391afcca172c1111d7fc611a06a4 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -7,13 +7,24 @@ Please send GNU Wget bug reports to <bug-wget@gnu.org>.
  \f
  * Changes in Wget 1.8.
  
-** "Recursive retrieval" now uses a breadth-first algorithm.
-Recursive downloads are faster and consume *significantly* less memory
-than before.
-
  ** A new progress indicator is now available.  Try it with
  --progress=bar or using `progress = bar' in `.wgetrc'.
  
+** "Recursive retrieval" has been revamped:
+
+*** Wget now traverses links breadth-first.  This makes the
+calculation of depth much more reliable than before.  Also, recursive
+downloads are faster and consume *significantly* less memory than
+before.
+
+*** Links are converted only when the entire retrieval is complete.
+This is the only safe thing to do, as only then is it known what URLs
+have been downloaded.
+
+*** BASE tags are handled correctly when converting links.  Since Wget
+already resolves <base href="..."> when resolving handling URLs, link
+conversion now makes the BASE tags point to an empty string.
+
  ** Host directories now contain port information if the URL is at a
  non-standard port.
  
diff --git a/TODO b/TODO

index a9cb902e8f7aa90c0d685e18bccf90e31e3b6b0a..83ad664dc696ea6d090a497adc75955cb1a9f382 100644 (file)
--- a/TODO
+++ b/TODO
@@ -55,8 +55,6 @@ changes.
  * Make -K compare X.orig to X and move the former on top of the latter if 
    they're the same, rather than leaving identical .orig files laying around.
  
-* Make `-k' convert <base href=...> too.
-
  * Make `-k' check for files that were downloaded in the past and convert links 
    to them in newly-downloaded documents.
  
diff --git a/src/ChangeLog b/src/ChangeLog

index 58ed40e74eb27cfebbb2c23ee4b9dd73d959da64..6d2bbcf18a3fc8f5cff09a11b864e24c13d2f869 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,14 @@
+2001-11-25  Hrvoje Niksic  <hniksic@arsdigita.com>
+
+       * url.c (convert_links): Handle CO_NULLIFY_BASE.
+
+       * recur.c (retrieve_tree): Ignore download-ignorable children.
+       (convert_all_links): Specify CO_NULLIFY_BASE when link_base_p.
+
+       * html-url.c (handle_link): Return the newly created urlpos.
+       (collect_tags_mapper): When dealing with BASE, store the base
+       reference and mark it as download-ignorable.
+
  2001-11-25  Hrvoje Niksic  <hniksic@arsdigita.com>
  
         * url.c (convert_links): Attempt to quote '?' as "%3F" when
diff --git a/src/html-url.c b/src/html-url.c

index 918778203a2c46ae6ddb6a961d40ebd4d48d8c5e..051f5057439fcefd6e31b16aabaaf0603f1a3eff 100644 (file)
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -297,7 +297,7 @@ struct collect_urls_closure {
  /* Resolve LINK_URI and append it to closure->tail.  TAG and ATTRID
     are the necessary context to store the position and size.  */
  
-static void
+static struct urlpos *
  handle_link (struct collect_urls_closure *closure, const char *link_uri,
              struct taginfo *tag, int attrid)
  {
@@ -316,7 +316,7 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
           /* We have no base, and the link does not have a host
              attached to it.  Nothing we can do.  */
           /* #### Should we print a warning here?  Wget 1.5.x used to.  */
-         return;
+         return NULL;
         }
  
        url = url_parse (link_uri, NULL);
@@ -324,7 +324,7 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
         {
           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
                    closure->document_file, link_uri));
-         return;
+         return NULL;
         }
      }
    else
@@ -344,7 +344,7 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
                    closure->document_file, complete_uri));
           xfree (complete_uri);
-         return;
+         return NULL;
         }
        xfree (complete_uri);
      }
@@ -371,6 +371,8 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
      }
    else
      closure->tail = closure->head = newel;
+
+  return newel;
  }
  
  /* Examine name and attributes of TAG and take appropriate action.
@@ -435,9 +437,18 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
         {
         case TAG_BASE:
           {
-           char *newbase = find_attr (tag, "href", NULL);
+           struct urlpos *base_urlpos;
+           int id;
+           char *newbase = find_attr (tag, "href", &id);
             if (!newbase)
               break;
+
+           base_urlpos = handle_link (closure, newbase, tag, id);
+           if (!base_urlpos)
+             break;
+           base_urlpos->ignore_when_downloading = 1;
+           base_urlpos->link_base_p = 1;
+
             if (closure->base)
               xfree (closure->base);
             if (closure->parent_base)
@@ -545,13 +556,13 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
  }
  
  /* Analyze HTML tags FILE and construct a list of URLs referenced from
-   it.  It merges relative links in FILE with THIS_URL.  It is aware
-   of <base href=...> and does the right thing.
+   it.  It merges relative links in FILE with URL.  It is aware of
+   <base href=...> and does the right thing.
  
     If dash_p_leaf_HTML is non-zero, only the elements needed to render
     FILE ("non-external" links) will be returned.  */
  struct urlpos *
-get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
+get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
                int *meta_disallow_follow)
  {
    struct file_memory *fm;
@@ -569,7 +580,7 @@ get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
    closure.text = fm->content;
    closure.head = closure.tail = NULL;
    closure.base = NULL;
-  closure.parent_base = this_url ? this_url : opt.base_href;
+  closure.parent_base = url ? url : opt.base_href;
    closure.document_file = file;
    closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
    closure.nofollow = 0;
diff --git a/src/recur.c b/src/recur.c

index 2c26157933646915eba8801cea8dd8f2d84da284..7af7d0509809f7e7cddad9896255f1a68d0724d8 100644 (file)
--- a/src/recur.c
+++ b/src/recur.c
@@ -280,6 +280,8 @@ retrieve_tree (const char *start_url)
  
               for (; child; child = child->next)
                 {
+                 if (child->ignore_when_downloading)
+                   continue;
                   if (descend_url_p (child, url_parsed, depth, start_url_parsed,
                                      blacklist))
                     {
@@ -634,6 +636,15 @@ convert_all_links (void)
           char *local_name;
           struct url *u = cur_url->url;
  
+         if (cur_url->link_base_p)
+           {
+             /* Base references have been resolved by our parser, so
+                we turn the base URL into an empty string.  (Perhaps
+                we should remove the tag entirely?)  */
+             cur_url->convert = CO_NULLIFY_BASE;
+             continue;
+           }
+
           /* We decide the direction of conversion according to whether
              a URL was downloaded.  Downloaded URLs will be converted
              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
@@ -642,7 +653,7 @@ convert_all_links (void)
             DEBUGP (("%s marked for conversion, local %s\n",
                      u->url, local_name));
  
-         /* Decide on the conversion direction.  */
+         /* Decide on the conversion type.  */
           if (local_name)
             {
               /* We've downloaded this URL.  Convert it to relative
diff --git a/src/retr.c b/src/retr.c

index d613da7c8037f0ab0fe0308cae3949778919629b..07d37d8bf8529054a44e9641eb94a0f5084ce724 100644 (file)
--- a/src/retr.c
+++ b/src/retr.c
@@ -462,6 +462,9 @@ retrieve_from_file (const char *file, int html, int *count)
        char *filename = NULL, *new_file;
        int dt;
  
+      if (cur_url->ignore_when_downloading)
+       continue;
+
        if (downloaded_exceeds_quota ())
         {
           status = QUOTEXC;
diff --git a/src/url.c b/src/url.c

index 26642e59d9e62f70f8c23f70c3cb22cd7fe0a873..1ce7222a7d06e2053e0ee21987d1b9c02b04d17a 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -1698,7 +1698,7 @@ no_proxy_match (const char *host, const char **no_proxy)
  }
  \f
  static void write_backup_file PARAMS ((const char *, downloaded_file_t));
-static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
+static const char *replace_attr PARAMS ((const char *, int, FILE *, const char *));
  static char *local_quote_string PARAMS ((const char *));
  
  /* Change the links in one HTML file.  LINKS is a list of links in the
@@ -1765,6 +1765,7 @@ convert_links (const char *file, struct urlpos *links)
        read_file_free (fm);
        return;
      }
+
    /* Here we loop through all the URLs in file, replacing those of
       them that are downloaded with relative references.  */
    p = fm->content;
@@ -1788,35 +1789,50 @@ convert_links (const char *file, struct urlpos *links)
           quote, to the outfile.  */
        fwrite (p, 1, url_start - p, fp);
        p = url_start;
-      if (link->convert == CO_CONVERT_TO_RELATIVE)
+
+      switch (link->convert)
         {
+       case CO_CONVERT_TO_RELATIVE:
           /* Convert absolute URL to relative. */
-         char *newname = construct_relative (file, link->local_name);
-         char *quoted_newname = local_quote_string (newname);
-         replace_attr (&p, link->size, fp, quoted_newname);
-         DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
-                  link->url->url, newname, link->pos, file));
-         xfree (newname);
-         xfree (quoted_newname);
-         ++to_file_count;
-       }
-      else if (link->convert == CO_CONVERT_TO_COMPLETE)
-       {
+         {
+           char *newname = construct_relative (file, link->local_name);
+           char *quoted_newname = local_quote_string (newname);
+           p = replace_attr (p, link->size, fp, quoted_newname);
+           DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
+                    link->url->url, newname, link->pos, file));
+           xfree (newname);
+           xfree (quoted_newname);
+           ++to_file_count;
+           break;
+         }
+       case CO_CONVERT_TO_COMPLETE:
           /* Convert the link to absolute URL. */
-         char *newlink = link->url->url;
-         char *quoted_newlink = html_quote_string (newlink);
-         replace_attr (&p, link->size, fp, quoted_newlink);
-         DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
-                  newlink, link->pos, file));
-         xfree (quoted_newlink);
-         ++to_url_count;
+         {
+           char *newlink = link->url->url;
+           char *quoted_newlink = html_quote_string (newlink);
+           p = replace_attr (p, link->size, fp, quoted_newlink);
+           DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
+                    newlink, link->pos, file));
+           xfree (quoted_newlink);
+           ++to_url_count;
+           break;
+         }
+       case CO_NULLIFY_BASE:
+         /* Change the base href to "". */
+         p = replace_attr (p, link->size, fp, "");
+         break;
+       case CO_NOCONVERT:
+         abort ();
+         break;
         }
      }
+
    /* Output the rest of the file. */
    if (p - fm->content < fm->length)
      fwrite (p, 1, fm->length - (p - fm->content), fp);
    fclose (fp);
    read_file_free (fm);
+
    logprintf (LOG_VERBOSE,
              _("%d-%d\n"), to_file_count, to_url_count);
  }
@@ -1960,13 +1976,15 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
  static int find_fragment PARAMS ((const char *, int, const char **,
                                   const char **));
  
-static void
-replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
+/* Replace an attribute's original text with NEW_TEXT. */
+
+static const char *
+replace_attr (const char *p, int size, FILE *fp, const char *new_text)
  {
-  const char *p = *pp;
    int quote_flag = 0;
-  int size = raw_size;
-  char quote_char = '\"';
+  char quote_char = '\"';      /* use "..." for quoting, unless the
+                                  original value is quoted, in which
+                                  case reuse its quoting char. */
    const char *frag_beg, *frag_end;
  
    /* Structure of our string is:
@@ -1984,7 +2002,7 @@ replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
        size -= 2;               /* disregard opening and closing quote */
      }
    putc (quote_char, fp);
-  fputs (new_str, fp);
+  fputs (new_text, fp);
  
    /* Look for fragment identifier, if any. */
    if (find_fragment (p, size, &frag_beg, &frag_end))
@@ -1993,7 +2011,8 @@ replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
    if (quote_flag)
      ++p;
    putc (quote_char, fp);
-  *pp = p;
+
+  return p;
  }
  
  /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
diff --git a/src/url.h b/src/url.h

index aed9bc382abac3de69cec21449178ff41219dfaf..836cdad1bb2661eb22cb78456b6458a8015efa85 100644 (file)
--- a/src/url.h
+++ b/src/url.h
@@ -65,8 +65,9 @@ enum convert_options {
    CO_NOCONVERT = 0,            /* don't convert this URL */
    CO_CONVERT_TO_RELATIVE,      /* convert to relative, e.g. to
                                     "../../otherdir/foo.gif" */
-  CO_CONVERT_TO_COMPLETE       /* convert to absolute, e.g. to
+  CO_CONVERT_TO_COMPLETE,      /* convert to absolute, e.g. to
                                    "http://orighost/somedir/bar.jpg". */
+  CO_NULLIFY_BASE              /* change to empty string. */
  };
  
  /* A structure that defines the whereabouts of a URL, i.e. its
@@ -78,10 +79,16 @@ struct urlpos {
    char *local_name;            /* local file to which it was saved
                                    (used by convert_links) */
  
+  int ignore_when_downloading; /* reserved for special links such as
+                                  <base href="..."> which are used
+                                  when converting links, but ignored
+                                  when downloading.  */
+
    /* Information about the original link: */
    int link_relative_p;         /* was the link relative? */
    int link_complete_p;         /* was the link complete (with the
                                     host name, etc.) */
+  int link_base_p;             /* was the link <base href=...> */
  
    /* Conversion requirements: */
    enum convert_options convert;        /* is conversion required? */
author	hniksic <devnull@localhost>
	Sun, 25 Nov 2001 18:40:55 +0000 (10:40 -0800)
committer	hniksic <devnull@localhost>
	Sun, 25 Nov 2001 18:40:55 +0000 (10:40 -0800)
NEWS		patch \| blob \| history
TODO		patch \| blob \| history
src/ChangeLog		patch \| blob \| history
src/html-url.c		patch \| blob \| history
src/recur.c		patch \| blob \| history
src/retr.c		patch \| blob \| history
src/url.c		patch \| blob \| history
src/url.h		patch \| blob \| history