+2003-09-22 Hrvoje Niksic <hniksic@xemacs.org>
+
+ * retr.c (getproxy): Moved from url.c.
+
+ * convert.c: Split the link conversion code off from url.c into a
+ separate file. Also moved the book-keeping code here from recur.c.
+
2003-09-21 Hrvoje Niksic <hniksic@xemacs.org>

 * init.c: Improved documentation of functions.
SSL_OBJ = @SSL_OBJ@
GETOPT_OBJ = @GETOPT_OBJ@
-OBJ = $(ALLOCA) cmpt$o connect$o cookies$o fnmatch$o ftp$o \
- ftp-basic$o ftp-ls$o $(OPIE_OBJ) $(GETOPT_OBJ) hash$o \
+OBJ = $(ALLOCA) cmpt$o connect$o convert$o cookies$o fnmatch$o \
+ ftp$o ftp-basic$o ftp-ls$o $(OPIE_OBJ) $(GETOPT_OBJ) hash$o \
headers$o host$o html-parse$o html-url$o http$o init$o \
log$o main$o $(MD5_OBJ) netrc$o progress$o rbuf$o recur$o \
res$o retr$o safe-ctype$o snprintf$o $(SSL_OBJ) url$o \
alloca$o:
cmpt$o: wget.h sysdep.h options.h safe-ctype.h
connect$o: wget.h sysdep.h options.h safe-ctype.h utils.h connect.h host.h
+convert$o: wget.h convert.h url.h recur.h utils.h hash.h
cookies$o: wget.h sysdep.h options.h safe-ctype.h cookies.h hash.h url.h utils.h
fnmatch$o: wget.h sysdep.h options.h safe-ctype.h fnmatch.h
ftp-basic$o: wget.h sysdep.h options.h safe-ctype.h utils.h rbuf.h connect.h \
--- /dev/null
+/* Conversion of links to local files.
+ Copyright (C) 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables. You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL". If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so. If you do not wish to do
+so, delete this exception statement from your version. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef HAVE_STRING_H
+# include <string.h>
+#else
+# include <strings.h>
+#endif /* HAVE_STRING_H */
+#ifdef HAVE_UNISTD_H
+# include <unistd.h>
+#endif /* HAVE_UNISTD_H */
+#include <errno.h>
+#include <assert.h>
+#include <sys/types.h>
+
+#include "wget.h"
+#include "convert.h"
+#include "url.h"
+#include "recur.h"
+#include "utils.h"
+#include "hash.h"
+
+static struct hash_table *dl_file_url_map;
+struct hash_table *dl_url_file_map;
+
+/* List of HTML files downloaded in this Wget run, used for link
+ conversion after Wget is done. The list and the set contain the
+ same information, except the list maintains the order. Perhaps I
+ should get rid of the list; it's only there for historical reasons. */
+static slist *downloaded_html_list;
+struct hash_table *downloaded_html_set;
+
+static void convert_links PARAMS ((const char *, struct urlpos *));
+
+/* This function is called when the retrieval is done to convert the
+ links that have been downloaded. It has to be called at the end of
+ the retrieval, because only then does Wget know conclusively which
+ URLs have been downloaded, and which not, so it can tell which
+ direction to convert to.
+
+ The "direction" means that the URLs to the files that have been
+ downloaded get converted to the relative URL which will point to
+ that file. And the other URLs get converted to the remote URL on
+ the server.
+
+ All the downloaded HTML files are kept in downloaded_html_list,
+ and the downloaded URLs in dl_url_file_map. All the information
+ is extracted from these two structures. */
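+
+/* Illustration (a hypothetical run, not output of this code): if
+ "http://host/dir/a.html" was saved as "host/dir/a.html" and links
+ to "http://host/dir/img.gif", that link becomes "img.gif" when
+ img.gif was also downloaded (CO_CONVERT_TO_RELATIVE), and is left
+ as the complete "http://host/dir/img.gif" otherwise
+ (CO_CONVERT_TO_COMPLETE). */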
+
+void
+convert_all_links (void)
+{
+ slist *html;
+ long msecs;
+ int file_count = 0;
+
+ struct wget_timer *timer = wtimer_new ();
+
+ /* Destructively reverse downloaded_html_list to get it in the right
+ order. register_html() uses slist_prepend() consistently. */
+ downloaded_html_list = slist_nreverse (downloaded_html_list);
+
+ for (html = downloaded_html_list; html; html = html->next)
+ {
+ struct urlpos *urls, *cur_url;
+ char *url;
+ char *file = html->string;
+
+ /* Determine the URL of the HTML file. get_urls_html will need
+ it. */
+ url = hash_table_get (dl_file_url_map, file);
+ if (!url)
+ {
+ DEBUGP (("Apparently %s has been removed.\n", file));
+ continue;
+ }
+
+ DEBUGP (("Scanning %s (from %s)\n", file, url));
+
+ /* Parse the HTML file... */
+ urls = get_urls_html (file, url, NULL);
+
+ /* We don't respect meta_disallow_follow here because, even if
+ the file is not followed, we might still want to convert the
+ links that have been followed from other files. */
+
+ for (cur_url = urls; cur_url; cur_url = cur_url->next)
+ {
+ char *local_name;
+ struct url *u = cur_url->url;
+
+ if (cur_url->link_base_p)
+ {
+ /* Base references have been resolved by our parser, so
+ we turn the base URL into an empty string. (Perhaps
+ we should remove the tag entirely?) */
+ cur_url->convert = CO_NULLIFY_BASE;
+ continue;
+ }
+
+ /* We decide the direction of conversion according to whether
+ a URL was downloaded. Downloaded URLs are converted to
+ relative form, non-downloaded ones to complete form. */
+ local_name = hash_table_get (dl_url_file_map, u->url);
+
+ /* Decide on the conversion type. */
+ if (local_name)
+ {
+ /* We've downloaded this URL. Convert it to relative
+ form. We do this even if the URL already is in
+ relative form, because our directory structure may
+ not be identical to that on the server (think `-nd',
+ `--cut-dirs', etc.) */
+ cur_url->convert = CO_CONVERT_TO_RELATIVE;
+ cur_url->local_name = xstrdup (local_name);
+ DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
+ }
+ else
+ {
+ /* We haven't downloaded this URL. If it's not already
+ complete (including a full host name), convert it to
+ that form, so it can be reached while browsing this
+ HTML locally. */
+ if (!cur_url->link_complete_p)
+ cur_url->convert = CO_CONVERT_TO_COMPLETE;
+ cur_url->local_name = NULL;
+ DEBUGP (("will convert url %s to complete\n", u->url));
+ }
+ }
+
+ /* Convert the links in the file. */
+ convert_links (file, urls);
+ ++file_count;
+
+ /* Free the data. */
+ free_urlpos (urls);
+ }
+
+ msecs = wtimer_elapsed (timer);
+ wtimer_delete (timer);
+ logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
+ file_count, (double)msecs / 1000);
+}
+
+static void write_backup_file PARAMS ((const char *, downloaded_file_t));
+static const char *replace_attr PARAMS ((const char *, int, FILE *,
+ const char *));
+static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
+ const char *, int));
+static char *local_quote_string PARAMS ((const char *));
+static char *construct_relative PARAMS ((const char *, const char *));
+
+/* Change the links in one HTML file. LINKS is a list of links in the
+ document, along with their positions and the desired direction of
+ the conversion. */
+static void
+convert_links (const char *file, struct urlpos *links)
+{
+ struct file_memory *fm;
+ FILE *fp;
+ const char *p;
+ downloaded_file_t downloaded_file_return;
+
+ struct urlpos *link;
+ int to_url_count = 0, to_file_count = 0;
+
+ logprintf (LOG_VERBOSE, _("Converting %s... "), file);
+
+ {
+ /* First we do a "dry run": go through the list L and see whether
+ any URL needs to be converted in the first place. If not, just
+ leave the file alone. */
+ int dry_count = 0;
+ struct urlpos *dry = links;
+ for (dry = links; dry; dry = dry->next)
+ if (dry->convert != CO_NOCONVERT)
+ ++dry_count;
+ if (!dry_count)
+ {
+ logputs (LOG_VERBOSE, _("nothing to do.\n"));
+ return;
+ }
+ }
+
+ fm = read_file (file);
+ if (!fm)
+ {
+ logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
+ file, strerror (errno));
+ return;
+ }
+
+ downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
+ if (opt.backup_converted && downloaded_file_return)
+ write_backup_file (file, downloaded_file_return);
+
+ /* Before opening the file for writing, unlink the file. This is
+ important if the data in FM is mmaped. In such case, nulling the
+ file, which is what fopen() below does, would make us read all
+ zeroes from the mmaped region. */
+ if (unlink (file) < 0 && errno != ENOENT)
+ {
+ logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
+ file, strerror (errno));
+ read_file_free (fm);
+ return;
+ }
+ /* Now open the file for writing. */
+ fp = fopen (file, "wb");
+ if (!fp)
+ {
+ logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
+ file, strerror (errno));
+ read_file_free (fm);
+ return;
+ }
+
+ /* Here we loop through all the URLs in file, replacing those of
+ them that are downloaded with relative references. */
+ p = fm->content;
+ for (link = links; link; link = link->next)
+ {
+ char *url_start = fm->content + link->pos;
+
+ if (link->pos >= fm->length)
+ {
+ DEBUGP (("Something strange is going on. Please investigate."));
+ break;
+ }
+ /* If the URL is not to be converted, skip it. */
+ if (link->convert == CO_NOCONVERT)
+ {
+ DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
+ continue;
+ }
+
+ /* Echo the file contents, up to the offending URL's opening
+ quote, to the outfile. */
+ fwrite (p, 1, url_start - p, fp);
+ p = url_start;
+
+ switch (link->convert)
+ {
+ case CO_CONVERT_TO_RELATIVE:
+ /* Convert absolute URL to relative. */
+ {
+ char *newname = construct_relative (file, link->local_name);
+ char *quoted_newname = local_quote_string (newname);
+
+ if (!link->link_refresh_p)
+ p = replace_attr (p, link->size, fp, quoted_newname);
+ else
+ p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
+ link->refresh_timeout);
+
+ DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
+ link->url->url, newname, link->pos, file));
+ xfree (newname);
+ xfree (quoted_newname);
+ ++to_file_count;
+ break;
+ }
+ case CO_CONVERT_TO_COMPLETE:
+ /* Convert the link to absolute URL. */
+ {
+ char *newlink = link->url->url;
+ char *quoted_newlink = html_quote_string (newlink);
+
+ if (!link->link_refresh_p)
+ p = replace_attr (p, link->size, fp, quoted_newlink);
+ else
+ p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
+ link->refresh_timeout);
+
+ DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
+ newlink, link->pos, file));
+ xfree (quoted_newlink);
+ ++to_url_count;
+ break;
+ }
+ case CO_NULLIFY_BASE:
+ /* Change the base href to "". */
+ p = replace_attr (p, link->size, fp, "");
+ break;
+ case CO_NOCONVERT:
+ abort ();
+ break;
+ }
+ }
+
+ /* Output the rest of the file. */
+ if (p - fm->content < fm->length)
+ fwrite (p, 1, fm->length - (p - fm->content), fp);
+ fclose (fp);
+ read_file_free (fm);
+
+ logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
+}
+
+/* Construct and return a malloced copy of the relative link from two
+ pieces of information: local name S1 of the referring file and
+ local name S2 of the referred file.
+
+ So, if S1 is "jagor.srce.hr/index.html" and S2 is
+ "jagor.srce.hr/images/news.gif", the function will return
+ "images/news.gif".
+
+ Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
+ "fly.cc.fer.hr/images/fly.gif", the function will return
+ "../images/fly.gif".
+
+ Caveats: S1 should not begin with `/', unless S2 also begins with
+ '/'. S1 should not contain things like ".." and such --
+ construct_relative ("fly/ioccc/../index.html",
+ "fly/images/fly.gif") will fail. (A workaround is to call
+ something like path_simplify() on S1). */
+static char *
+construct_relative (const char *s1, const char *s2)
+{
+ int i, cnt, sepdirs1;
+ char *res;
+
+ if (*s2 == '/')
+ return xstrdup (s2);
+ /* S1 should *not* be absolute, if S2 wasn't. */
+ assert (*s1 != '/');
+ i = cnt = 0;
+ /* Skip the directories common to both strings. */
+ while (1)
+ {
+ while (s1[i] && s2[i]
+ && (s1[i] == s2[i])
+ && (s1[i] != '/')
+ && (s2[i] != '/'))
+ ++i;
+ if (s1[i] == '/' && s2[i] == '/')
+ cnt = ++i;
+ else
+ break;
+ }
+ for (sepdirs1 = 0; s1[i]; i++)
+ if (s1[i] == '/')
+ ++sepdirs1;
+ /* Now construct the result:
+ - "../" repeated sepdirs1 times, followed by
+ - the part of S2 after the shared prefix (s2 + cnt). */
+ res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
+ for (i = 0; i < sepdirs1; i++)
+ memcpy (res + 3 * i, "../", 3);
+ strcpy (res + 3 * i, s2 + cnt);
+ return res;
+}
+
+static void
+write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
+{
+ /* Rather than just writing over the original .html file with the
+ converted version, save the former to *.orig. Note we only do
+ this for files we've _successfully_ downloaded, so we don't
+ clobber .orig files sitting around from previous invocations. */
+
+ /* Construct the backup filename as the original name plus ".orig". */
+ size_t filename_len = strlen(file);
+ char* filename_plus_orig_suffix;
+ boolean already_wrote_backup_file = FALSE;
+ slist* converted_file_ptr;
+ static slist* converted_files = NULL;
+
+ if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
+ {
+ /* Just write "orig" over "html". We need to do it this way
+ because when we're checking to see if we've downloaded the
+ file before (to see if we can skip downloading it), we don't
+ know if it's a text/html file. Therefore we don't know yet
+ at that stage that -E is going to cause us to tack on
+ ".html", so we need to compare vs. the original URL plus
+ ".orig", not the original URL plus ".html.orig". */
+ filename_plus_orig_suffix = alloca (filename_len + 1);
+ strcpy(filename_plus_orig_suffix, file);
+ strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
+ }
+ else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
+ {
+ /* Append ".orig" to the name. */
+ filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
+ strcpy(filename_plus_orig_suffix, file);
+ strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+ }
+
+ /* We can get called twice on the same URL thanks to the
+ convert_all_links() call in main(). If we write the .orig file
+ each time in such a case, it'll end up containing the first-pass
+ conversion, not the original file. So, see if we've already been
+ called on this file. */
+ converted_file_ptr = converted_files;
+ while (converted_file_ptr != NULL)
+ if (strcmp(converted_file_ptr->string, file) == 0)
+ {
+ already_wrote_backup_file = TRUE;
+ break;
+ }
+ else
+ converted_file_ptr = converted_file_ptr->next;
+
+ if (!already_wrote_backup_file)
+ {
+ /* Rename <file> to <file>.orig before former gets written over. */
+ if (rename(file, filename_plus_orig_suffix) != 0)
+ logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
+ file, filename_plus_orig_suffix, strerror (errno));
+
+ /* Remember that we've already written a .orig backup for this file.
+ Note that we never free this memory since we need it till the
+ convert_all_links() call, which is one of the last things the
+ program does before terminating. BTW, I'm not sure if it would be
+ safe to just set 'converted_file_ptr->string' to 'file' below,
+ rather than making a copy of the string... Another note is that I
+ thought I could just add a field to the urlpos structure saying
+ that we'd written a .orig file for this URL, but that didn't work,
+ so I had to make this separate list.
+ -- Dan Harkless <wget@harkless.org>
+
+ This [adding a field to the urlpos structure] didn't work
+ because convert_file() is called from convert_all_links at
+ the end of the retrieval with a freshly built new urlpos
+ list.
+ -- Hrvoje Niksic <hniksic@arsdigita.com>
+ */
+ converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
+ converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
+ converted_file_ptr->next = converted_files;
+ converted_files = converted_file_ptr;
+ }
+}
+
+static int find_fragment PARAMS ((const char *, int, const char **,
+ const char **));
+
+/* Replace an attribute's original text with NEW_TEXT. */
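+/* For example (illustrative values): if P points at the old value
+ "http://host/a.html#top" (quotes included in SIZE) and NEW_TEXT is
+ "a.html", the output is "a.html#top" -- the fragment is preserved
+ by find_fragment() below. */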
+
+static const char *
+replace_attr (const char *p, int size, FILE *fp, const char *new_text)
+{
+ int quote_flag = 0;
+ char quote_char = '\"'; /* use "..." for quoting, unless the
+ original value is quoted, in which
+ case reuse its quoting char. */
+ const char *frag_beg, *frag_end;
+
+ /* Structure of our string is:
+ "...old-contents..."
+ <--- size ---> (with quotes)
+ OR:
+ ...old-contents...
+ <--- size --> (no quotes) */
+
+ if (*p == '\"' || *p == '\'')
+ {
+ quote_char = *p;
+ quote_flag = 1;
+ ++p;
+ size -= 2; /* disregard opening and closing quote */
+ }
+ putc (quote_char, fp);
+ fputs (new_text, fp);
+
+ /* Look for fragment identifier, if any. */
+ if (find_fragment (p, size, &frag_beg, &frag_end))
+ fwrite (frag_beg, 1, frag_end - frag_beg, fp);
+ p += size;
+ if (quote_flag)
+ ++p;
+ putc (quote_char, fp);
+
+ return p;
+}
+
+/* The same as replace_attr, but used when replacing
+ <meta http-equiv=refresh content=...> because we need to
+ prepend "TIMEOUT; URL=" to NEW_TEXT. */
+
+static const char *
+replace_attr_refresh_hack (const char *p, int size, FILE *fp,
+ const char *new_text, int timeout)
+{
+ /* "0; URL=..." */
+ char *new_with_timeout = (char *)alloca (numdigit (timeout)
+ + 6 /* "; URL=" */
+ + strlen (new_text)
+ + 1);
+ sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
+
+ return replace_attr (p, size, fp, new_with_timeout);
+}
+
+/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
+ preceded by '&'. If the character is not found, return zero. If
+ the character is found, return 1 and set BP and EP to point to the
+ beginning and end of the region.
+
+ This is used for finding the fragment identifiers in URLs. */
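+
+/* Example (illustrative): in "a.html#sec", BP is set to the `#' and
+ EP to the end of the region, so "#sec" is carried over to the
+ output; in "a&#38;b" the `#' belongs to an SGML entity and is
+ skipped. */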
+
+static int
+find_fragment (const char *beg, int size, const char **bp, const char **ep)
+{
+ const char *end = beg + size;
+ int saw_amp = 0;
+ for (; beg < end; beg++)
+ {
+ switch (*beg)
+ {
+ case '&':
+ saw_amp = 1;
+ break;
+ case '#':
+ if (!saw_amp)
+ {
+ *bp = beg;
+ *ep = end;
+ return 1;
+ }
+ /* fallthrough */
+ default:
+ saw_amp = 0;
+ }
+ }
+ return 0;
+}
+
+/* Quote FILE for use as local reference to an HTML file.
+
+ We quote ? as %3F to avoid passing part of the file name as the
+ parameter when browsing the converted file through HTTP. However,
+ it is safe to do this only when `--html-extension' is turned on.
+ This is because converting "index.html?foo=bar" to
+ "index.html%3Ffoo=bar" would break local browsing, as the latter
+ isn't even recognized as an HTML file! However, converting
+ "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
+ safe for both local and HTTP-served browsing. */
+
+static char *
+local_quote_string (const char *file)
+{
+ const char *file_sans_qmark;
+ int qm;
+
+ if (!opt.html_extension)
+ return html_quote_string (file);
+
+ qm = count_char (file, '?');
+
+ if (qm)
+ {
+ const char *from = file;
+ char *to, *newname;
+
+ /* qm * 2 because we replace each question mark with "%3F",
+ i.e. replace one char with three, hence two more. */
+ int fsqlen = strlen (file) + qm * 2;
+
+ to = newname = (char *)alloca (fsqlen + 1);
+ for (; *from; from++)
+ {
+ if (*from != '?')
+ *to++ = *from;
+ else
+ {
+ *to++ = '%';
+ *to++ = '3';
+ *to++ = 'F';
+ }
+ }
+ assert (to - newname == fsqlen);
+ *to = '\0';
+
+ file_sans_qmark = newname;
+ }
+ else
+ file_sans_qmark = file;
+
+ return html_quote_string (file_sans_qmark);
+}
+\f
+/* Book-keeping code for dl_file_url_map, dl_url_file_map,
+ downloaded_html_list, and downloaded_html_set. Other code calls
+ these functions to let us know that a file has been downloaded. */
+
+#define ENSURE_TABLES_EXIST do { \
+ if (!dl_file_url_map) \
+ dl_file_url_map = make_string_hash_table (0); \
+ if (!dl_url_file_map) \
+ dl_url_file_map = make_string_hash_table (0); \
+} while (0)
+
+/* Return 1 if S1 and S2 are the same, except for "/index.html". The
+ five cases in which it returns one are (substitute any substring
+ for "foo"):
+
+ m("foo/index.html", "foo/") ==> 1
+ m("foo/", "foo/index.html") ==> 1
+ m("foo", "foo/index.html") ==> 1
+ m("foo", "foo/") ==> 1
+ m("foo", "foo") ==> 1 */
+
+static int
+match_except_index (const char *s1, const char *s2)
+{
+ int i;
+ const char *lng;
+
+ /* Skip common substring. */
+ for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
+ ;
+ if (i == 0)
+ /* Strings differ at the very beginning -- bail out. We need to
+ check this explicitly to avoid `lng - 1' reading outside the
+ array. */
+ return 0;
+
+ if (!*s1 && !*s2)
+ /* Both strings hit EOF -- strings are equal. */
+ return 1;
+ else if (*s1 && *s2)
+ /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
+ return 0;
+ else if (*s1)
+ /* S1 is the longer one. */
+ lng = s1;
+ else
+ /* S2 is the longer one. */
+ lng = s2;
+
+ /* foo */ /* foo/ */
+ /* foo/index.html */ /* or */ /* foo/index.html */
+ /* ^ */ /* ^ */
+
+ if (*lng != '/')
+ /* The right-hand case. */
+ --lng;
+
+ if (*lng == '/' && *(lng + 1) == '\0')
+ /* foo */
+ /* foo/ */
+ return 1;
+
+ return 0 == strcmp (lng, "/index.html");
+}
+
+static int
+dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
+{
+ char *mapping_url = (char *)key;
+ char *mapping_file = (char *)value;
+ char *file = (char *)arg;
+
+ if (0 == strcmp (mapping_file, file))
+ {
+ hash_table_remove (dl_url_file_map, mapping_url);
+ xfree (mapping_url);
+ xfree (mapping_file);
+ }
+
+ /* Continue mapping. */
+ return 0;
+}
+
+/* Remove all associations from various URLs to FILE from dl_url_file_map. */
+
+static void
+dissociate_urls_from_file (const char *file)
+{
+ hash_table_map (dl_url_file_map, dissociate_urls_from_file_mapper,
+ (char *)file);
+}
+
+/* Register that URL has been successfully downloaded to FILE. This
+ is used by the link conversion code to convert references to URLs
+ to references to local files. It is also being used to check if a
+ URL has already been downloaded. */
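+
+/* Illustration (hypothetical values): after
+ register_download ("http://host/", "host/index.html"),
+ dl_file_url_map maps "host/index.html" -> "http://host/", and
+ dl_url_file_map maps "http://host/" -> "host/index.html". */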
+
+void
+register_download (const char *url, const char *file)
+{
+ char *old_file, *old_url;
+
+ ENSURE_TABLES_EXIST;
+
+ /* With some forms of retrieval it is possible for two different
+ URLs to resolve to the same file name, although that is neither
+ likely nor particularly desirable. If both are downloaded, the
+ second download overrides the first one. When that happens,
+ dissociate the old file name from the URL. */
+
+ if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
+ {
+ if (0 == strcmp (url, old_url))
+ /* We have somehow managed to download the same URL twice.
+ Nothing to do. */
+ return;
+
+ if (match_except_index (url, old_url)
+ && !hash_table_contains (dl_url_file_map, url))
+ /* The two URLs differ only in the "index.html" ending. For
+ example, one is "http://www.server.com/", and the other is
+ "http://www.server.com/index.html". Don't remove the old
+ one, just add the new one as a non-canonical entry. */
+ goto url_only;
+
+ hash_table_remove (dl_file_url_map, file);
+ xfree (old_file);
+ xfree (old_url);
+
+ /* Remove all the URLs that point to this file. Yes, there can
+ be more than one such URL, because we store redirections as
+ multiple entries in dl_url_file_map. For example, if URL1
+ redirects to URL2 which gets downloaded to FILE, we map both
+ URL1 and URL2 to FILE in dl_url_file_map. (dl_file_url_map
+ only points to URL2.) When another URL gets loaded to FILE,
+ we want both URL1 and URL2 dissociated from it.
+
+ This is a relatively expensive operation because it performs
+ a linear search of the whole hash table, but it should be
+ called very rarely, only when two URLs resolve to the same
+ file name, *and* the "<file>.1" extensions are turned off.
+ In other words, almost never. */
+ dissociate_urls_from_file (file);
+ }
+
+ hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
+
+ url_only:
+ /* A URL->FILE mapping is not possible without a FILE->URL mapping.
+ If the latter were present, it should have been removed by the
+ above `if'. So we could write:
+
+ assert (!hash_table_contains (dl_url_file_map, url));
+
+ The above is correct when running in recursive mode where the
+ same URL always resolves to the same file. But if you do
+ something like:
+
+ wget URL URL
+
+ then the first URL will resolve to "FILE", and the other to
+ "FILE.1". In that case, FILE.1 will not be found in
+ dl_file_url_map, but URL will still point to FILE in
+ dl_url_file_map. */
+ if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
+ {
+ hash_table_remove (dl_url_file_map, url);
+ xfree (old_url);
+ xfree (old_file);
+ }
+
+ hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
+}
+
+/* Register that FROM has been redirected to TO. This assumes that TO
+ is successfully downloaded and already registered using
+ register_download() above. */
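+
+/* E.g. (illustrative): if "http://host/a" redirected to
+ "http://host/b", which was saved as "host/b" and registered,
+ register_redirection ("http://host/a", "http://host/b") adds the
+ extra mapping "http://host/a" -> "host/b" to dl_url_file_map. */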
+
+void
+register_redirection (const char *from, const char *to)
+{
+ char *file;
+
+ ENSURE_TABLES_EXIST;
+
+ file = hash_table_get (dl_url_file_map, to);
+ assert (file != NULL);
+ if (!hash_table_contains (dl_url_file_map, from))
+ hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
+}
+
+/* Register that the file has been deleted. */
+
+void
+register_delete_file (const char *file)
+{
+ char *old_url, *old_file;
+
+ ENSURE_TABLES_EXIST;
+
+ if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
+ return;
+
+ hash_table_remove (dl_file_url_map, file);
+ xfree (old_file);
+ xfree (old_url);
+ dissociate_urls_from_file (file);
+}
+
+/* Register that FILE is an HTML file that has been downloaded. */
+
+void
+register_html (const char *url, const char *file)
+{
+ if (!downloaded_html_set)
+ downloaded_html_set = make_string_hash_table (0);
+ else if (hash_table_contains (downloaded_html_set, file))
+ return;
+
+ /* The set and the list should use the same copy of FILE, but the
+ slist interface insists on strduping the string it gets. Oh
+ well. */
+ string_set_add (downloaded_html_set, file);
+ downloaded_html_list = slist_prepend (downloaded_html_list, file);
+}
+
+/* Clean up the data structures associated with link conversion
+ (the variables above). */
+void
+convert_cleanup (void)
+{
+ if (dl_file_url_map)
+ {
+ free_keys_and_values (dl_file_url_map);
+ hash_table_destroy (dl_file_url_map);
+ dl_file_url_map = NULL;
+ }
+ if (dl_url_file_map)
+ {
+ free_keys_and_values (dl_url_file_map);
+ hash_table_destroy (dl_url_file_map);
+ dl_url_file_map = NULL;
+ }
+ if (downloaded_html_set)
+ string_set_free (downloaded_html_set);
+ slist_free (downloaded_html_list);
+ downloaded_html_list = NULL;
+}
+\f
+/* Book-keeping code for downloaded files that enables extension
+ hacks. */
+
+/* This table should really be merged with dl_file_url_map and
+ downloaded_html_set. This was originally a list, but I changed
+ it to a hash table because it was actually taking a lot of time to
+ find things in it. */
+
+static struct hash_table *downloaded_files_hash;
+
+/* We're storing "modes" of type downloaded_file_t in the hash table.
+ However, our hash tables only accept pointers for keys and values.
+ So when we need a pointer, we use the address of a
+ downloaded_file_t variable of static storage. */
+
+static downloaded_file_t *
+downloaded_mode_to_ptr (downloaded_file_t mode)
+{
+ static downloaded_file_t
+ v1 = FILE_NOT_ALREADY_DOWNLOADED,
+ v2 = FILE_DOWNLOADED_NORMALLY,
+ v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
+ v4 = CHECK_FOR_FILE;
+
+ switch (mode)
+ {
+ case FILE_NOT_ALREADY_DOWNLOADED:
+ return &v1;
+ case FILE_DOWNLOADED_NORMALLY:
+ return &v2;
+ case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
+ return &v3;
+ case CHECK_FOR_FILE:
+ return &v4;
+ }
+ return NULL;
+}
+
+/* Remembers which files have been downloaded. In the standard case,
+ should be called with mode == FILE_DOWNLOADED_NORMALLY for each
+ file we actually download successfully (i.e. not for ones we have
+ failures on or that we skip due to -N).
+
+ When we've downloaded a file and tacked on a ".html" extension due
+ to -E, call this function with
+ FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
+ FILE_DOWNLOADED_NORMALLY.
+
+ If you just want to check if a file has been previously added
+ without adding it, call with mode == CHECK_FOR_FILE. Please be
+ sure to call this function with local filenames, not remote
+ URLs. */
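+
+/* Typical use (illustrative):
+
+ downloaded_file (FILE_DOWNLOADED_NORMALLY, "host/index.html");
+ ...
+ if (downloaded_file (CHECK_FOR_FILE, "host/index.html")
+ != FILE_NOT_ALREADY_DOWNLOADED)
+ ... the file was already retrieved in this run ... */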
+
+downloaded_file_t
+downloaded_file (downloaded_file_t mode, const char *file)
+{
+ downloaded_file_t *ptr;
+
+ if (mode == CHECK_FOR_FILE)
+ {
+ if (!downloaded_files_hash)
+ return FILE_NOT_ALREADY_DOWNLOADED;
+ ptr = hash_table_get (downloaded_files_hash, file);
+ if (!ptr)
+ return FILE_NOT_ALREADY_DOWNLOADED;
+ return *ptr;
+ }
+
+ if (!downloaded_files_hash)
+ downloaded_files_hash = make_string_hash_table (0);
+
+ ptr = hash_table_get (downloaded_files_hash, file);
+ if (ptr)
+ return *ptr;
+
+ ptr = downloaded_mode_to_ptr (mode);
+ hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
+
+ return FILE_NOT_ALREADY_DOWNLOADED;
+}
+
+static int
+df_free_mapper (void *key, void *value, void *ignored)
+{
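+ /* The value is a pointer to static storage inside
+ downloaded_mode_to_ptr, so only the key is freed. */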
+ xfree (key);
+ return 0;
+}
+
+void
+downloaded_files_free (void)
+{
+ if (downloaded_files_hash)
+ {
+ hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
+ hash_table_destroy (downloaded_files_hash);
+ downloaded_files_hash = NULL;
+ }
+}
--- /dev/null
+/* Declarations for convert.c
+ Copyright (C) 2003 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables. You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL". If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so. If you do not wish to do
+so, delete this exception statement from your version. */
+
+#ifndef CONVERT_H
+#define CONVERT_H
+
+enum convert_options {
+ CO_NOCONVERT = 0, /* don't convert this URL */
+ CO_CONVERT_TO_RELATIVE, /* convert to relative, e.g. to
+ "../../otherdir/foo.gif" */
+ CO_CONVERT_TO_COMPLETE, /* convert to absolute, e.g. to
+ "http://orighost/somedir/bar.jpg". */
+ CO_NULLIFY_BASE /* change to empty string. */
+};
+
+struct url;
+
+/* A structure that defines the whereabouts of a URL, i.e. its
+ position in an HTML document, etc. */
+
+struct urlpos {
+ struct url *url; /* the URL of the link, after it has
+ been merged with the base */
+ char *local_name; /* local file to which it was saved
+ (used by convert_links) */
+
+ /* reserved for special links such as <base href="..."> which are
+ used when converting links, but ignored when downloading. */
+ unsigned int ignore_when_downloading :1;
+
+ /* Information about the original link: */
+
+ unsigned int link_relative_p :1; /* was the link relative? */
+ unsigned int link_complete_p :1; /* was the link complete (with the
+ host name, etc.) */
+ unsigned int link_base_p :1; /* was the link <base href=...> */
+ unsigned int link_inline_p :1; /* needed to render the page. */
+
+ unsigned int link_refresh_p :1; /* link was received from
+ <meta http-equiv=refresh content=...> */
+ int refresh_timeout; /* for reconstructing the refresh. */
+
+ /* Conversion requirements: */
+ enum convert_options convert; /* is conversion required? */
+
+ /* URL's position in the buffer. */
+ int pos, size;
+
+ struct urlpos *next; /* next list element */
+};
+
+/* downloaded_file() takes a parameter of this type and returns this type. */
+typedef enum
+{
+ /* Return enumerators: */
+ FILE_NOT_ALREADY_DOWNLOADED = 0,
+
+ /* Return / parameter enumerators: */
+ FILE_DOWNLOADED_NORMALLY,
+ FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
+
+ /* Parameter enumerators: */
+ CHECK_FOR_FILE
+} downloaded_file_t;
+
+downloaded_file_t downloaded_file PARAMS ((downloaded_file_t, const char *));
+
+void register_download PARAMS ((const char *, const char *));
+void register_redirection PARAMS ((const char *, const char *));
+void register_html PARAMS ((const char *, const char *));
+void register_delete_file PARAMS ((const char *));
+void convert_all_links PARAMS ((void));
+void convert_cleanup PARAMS ((void));
+
+#endif /* CONVERT_H */
#include "host.h"
#include "fnmatch.h"
#include "netrc.h"
+#include "convert.h" /* for downloaded_file */
#ifndef errno
extern int errno;
/* If we get out of the switch above without continue'ing, we've
successfully downloaded a file. Remember this fact. */
- downloaded_file(FILE_DOWNLOADED_NORMALLY, locf);
+ downloaded_file (FILE_DOWNLOADED_NORMALLY, locf);
if (con->st & ON_YOUR_OWN)
{
#include "html-parse.h"
#include "url.h"
#include "utils.h"
+#include "convert.h"
#ifndef errno
extern int errno;
/* Analyze HTML tags FILE and construct a list of URLs referenced from
it. It merges relative links in FILE with URL. It is aware of
<base href=...> and does the right thing. */
+
struct urlpos *
get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
{
return ctx.head;
}
+/* This doesn't really have anything to do with HTML, but it's similar
+ to get_urls_html, so we put it here. */
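+
+/* The input file is expected to contain one URL per line, e.g.
+ (illustrative):
+
+ http://host/a.html
+ ftp://host/pub/b.tar.gz
+
+ Surrounding whitespace is stripped and empty lines are skipped. */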
+
+struct urlpos *
+get_urls_file (const char *file)
+{
+ struct file_memory *fm;
+ struct urlpos *head, *tail;
+ const char *text, *text_end;
+
+ /* Load the file. */
+ fm = read_file (file);
+ if (!fm)
+ {
+ logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
+ return NULL;
+ }
+ DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
+
+ head = tail = NULL;
+ text = fm->content;
+ text_end = fm->content + fm->length;
+ while (text < text_end)
+ {
+ int up_error_code;
+ char *url_text;
+ struct urlpos *entry;
+ struct url *url;
+
+ const char *line_beg = text;
+ const char *line_end = memchr (text, '\n', text_end - text);
+ if (!line_end)
+ line_end = text_end;
+ else
+ ++line_end;
+ text = line_end;
+
+ /* Strip whitespace from the beginning and end of line. */
+ while (line_beg < line_end && ISSPACE (*line_beg))
+ ++line_beg;
+ while (line_end > line_beg && ISSPACE (*(line_end - 1)))
+ --line_end;
+
+ if (line_beg == line_end)
+ continue;
+
+ /* The URL is in the [line_beg, line_end) region. */
+
+ /* We must copy the URL to a zero-terminated string, and we
+ can't use alloca because we're in a loop. *sigh*. */
+ url_text = strdupdelim (line_beg, line_end);
+
+ if (opt.base_href)
+ {
+ /* Merge opt.base_href with URL. */
+ char *merged = uri_merge (opt.base_href, url_text);
+ xfree (url_text);
+ url_text = merged;
+ }
+
+ url = url_parse (url_text, &up_error_code);
+ if (!url)
+ {
+ logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
+ file, url_text, url_error (up_error_code));
+ xfree (url_text);
+ continue;
+ }
+ xfree (url_text);
+
+ entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
+ memset (entry, 0, sizeof (*entry));
+ entry->next = NULL;
+ entry->url = url;
+
+ if (!head)
+ head = entry;
+ else
+ tail->next = entry;
+ tail = entry;
+ }
+ read_file_free (fm);
+ return head;
+}
+
void
cleanup_html_url (void)
{
#ifdef USE_DIGEST
# include "gen-md5.h"
#endif
+#include "convert.h"
extern char *version_string;
#include "utils.h"
#include "init.h"
#include "host.h"
-#include "recur.h"
#include "netrc.h"
#include "cookies.h" /* for cookie_jar_delete */
#include "progress.h"
memory which grows with the size of the program. */
#ifdef DEBUG_MALLOC
- recursive_cleanup ();
+ convert_cleanup ();
res_cleanup ();
http_cleanup ();
cleanup_html_url ();
#include "host.h"
#include "hash.h"
#include "res.h"
+#include "convert.h"
#ifndef errno
extern int errno;
extern char *version_string;
-static struct hash_table *dl_file_url_map;
-static struct hash_table *dl_url_file_map;
-
-/* List of HTML files downloaded in this Wget run, used for link
- conversion after Wget is done. The list and the set contain the
- same information, except the list maintains the order. Perhaps I
- should get rid of the list, it's there for historical reasons. */
-static slist *downloaded_html_list;
-static struct hash_table *downloaded_html_set;
-
-static void register_delete_file PARAMS ((const char *));
+extern struct hash_table *dl_url_file_map;
+extern struct hash_table *downloaded_html_set;
\f
/* Functions for maintaining the URL queue. */
return success;
}
-
-\f
-#define ENSURE_TABLES_EXIST do { \
- if (!dl_file_url_map) \
- dl_file_url_map = make_string_hash_table (0); \
- if (!dl_url_file_map) \
- dl_url_file_map = make_string_hash_table (0); \
-} while (0)
-
-/* Return 1 if S1 and S2 are the same, except for "/index.html". The
- three cases in which it returns one are (substitute any substring
- for "foo"):
-
- m("foo/index.html", "foo/") ==> 1
- m("foo/", "foo/index.html") ==> 1
- m("foo", "foo/index.html") ==> 1
- m("foo", "foo/" ==> 1
- m("foo", "foo") ==> 1 */
-
-static int
-match_except_index (const char *s1, const char *s2)
-{
- int i;
- const char *lng;
-
- /* Skip common substring. */
- for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
- ;
- if (i == 0)
- /* Strings differ at the very beginning -- bail out. We need to
- check this explicitly to avoid `lng - 1' reading outside the
- array. */
- return 0;
-
- if (!*s1 && !*s2)
- /* Both strings hit EOF -- strings are equal. */
- return 1;
- else if (*s1 && *s2)
- /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
- return 0;
- else if (*s1)
- /* S1 is the longer one. */
- lng = s1;
- else
- /* S2 is the longer one. */
- lng = s2;
-
- /* foo */ /* foo/ */
- /* foo/index.html */ /* or */ /* foo/index.html */
- /* ^ */ /* ^ */
-
- if (*lng != '/')
- /* The right-hand case. */
- --lng;
-
- if (*lng == '/' && *(lng + 1) == '\0')
- /* foo */
- /* foo/ */
- return 1;
-
- return 0 == strcmp (lng, "/index.html");
-}
-
-static int
-dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
-{
- char *mapping_url = (char *)key;
- char *mapping_file = (char *)value;
- char *file = (char *)arg;
-
- if (0 == strcmp (mapping_file, file))
- {
- hash_table_remove (dl_url_file_map, mapping_url);
- xfree (mapping_url);
- xfree (mapping_file);
- }
-
- /* Continue mapping. */
- return 0;
-}
-
-/* Remove all associations from various URLs to FILE from dl_url_file_map. */
-
-static void
-dissociate_urls_from_file (const char *file)
-{
- hash_table_map (dl_url_file_map, dissociate_urls_from_file_mapper,
- (char *)file);
-}
-
-/* Register that URL has been successfully downloaded to FILE. This
- is used by the link conversion code to convert references to URLs
- to references to local files. It is also being used to check if a
- URL has already been downloaded. */
-
-void
-register_download (const char *url, const char *file)
-{
- char *old_file, *old_url;
-
- ENSURE_TABLES_EXIST;
-
- /* With some forms of retrieval, it is possible, although not likely
- or particularly desirable. If both are downloaded, the second
- download will override the first one. When that happens,
- dissociate the old file name from the URL. */
-
- if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
- {
- if (0 == strcmp (url, old_url))
- /* We have somehow managed to download the same URL twice.
- Nothing to do. */
- return;
-
- if (match_except_index (url, old_url)
- && !hash_table_contains (dl_url_file_map, url))
- /* The two URLs differ only in the "index.html" ending. For
- example, one is "http://www.server.com/", and the other is
- "http://www.server.com/index.html". Don't remove the old
- one, just add the new one as a non-canonical entry. */
- goto url_only;
-
- hash_table_remove (dl_file_url_map, file);
- xfree (old_file);
- xfree (old_url);
-
- /* Remove all the URLs that point to this file. Yes, there can
- be more than one such URL, because we store redirections as
- multiple entries in dl_url_file_map. For example, if URL1
- redirects to URL2 which gets downloaded to FILE, we map both
- URL1 and URL2 to FILE in dl_url_file_map. (dl_file_url_map
- only points to URL2.) When another URL gets loaded to FILE,
- we want both URL1 and URL2 dissociated from it.
-
- This is a relatively expensive operation because it performs
- a linear search of the whole hash table, but it should be
- called very rarely, only when two URLs resolve to the same
- file name, *and* the "<file>.1" extensions are turned off.
- In other words, almost never. */
- dissociate_urls_from_file (file);
- }
-
- hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
-
- url_only:
- /* A URL->FILE mapping is not possible without a FILE->URL mapping.
- If the latter were present, it should have been removed by the
- above `if'. So we could write:
-
- assert (!hash_table_contains (dl_url_file_map, url));
-
- The above is correct when running in recursive mode where the
- same URL always resolves to the same file. But if you do
- something like:
-
- wget URL URL
-
- then the first URL will resolve to "FILE", and the other to
- "FILE.1". In that case, FILE.1 will not be found in
- dl_file_url_map, but URL will still point to FILE in
- dl_url_file_map. */
- if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
- {
- hash_table_remove (dl_url_file_map, url);
- xfree (old_url);
- xfree (old_file);
- }
-
- hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
-}
-
-/* Register that FROM has been redirected to TO. This assumes that TO
- is successfully downloaded and already registered using
- register_download() above. */
-
-void
-register_redirection (const char *from, const char *to)
-{
- char *file;
-
- ENSURE_TABLES_EXIST;
-
- file = hash_table_get (dl_url_file_map, to);
- assert (file != NULL);
- if (!hash_table_contains (dl_url_file_map, from))
- hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
-}
-
-/* Register that the file has been deleted. */
-
-static void
-register_delete_file (const char *file)
-{
- char *old_url, *old_file;
-
- ENSURE_TABLES_EXIST;
-
- if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
- return;
-
- hash_table_remove (dl_file_url_map, file);
- xfree (old_file);
- xfree (old_url);
- dissociate_urls_from_file (file);
-}
-
-/* Register that FILE is an HTML file that has been downloaded. */
-
-void
-register_html (const char *url, const char *file)
-{
- if (!downloaded_html_set)
- downloaded_html_set = make_string_hash_table (0);
- else if (hash_table_contains (downloaded_html_set, file))
- return;
-
- /* The set and the list should use the same copy of FILE, but the
- slist interface insists on strduping the string it gets. Oh
- well. */
- string_set_add (downloaded_html_set, file);
- downloaded_html_list = slist_prepend (downloaded_html_list, file);
-}
-
-/* This function is called when the retrieval is done to convert the
- links that have been downloaded. It has to be called at the end of
- the retrieval, because only then does Wget know conclusively which
- URLs have been downloaded, and which not, so it can tell which
- direction to convert to.
-
- The "direction" means that the URLs to the files that have been
- downloaded get converted to the relative URL which will point to
- that file. And the other URLs get converted to the remote URL on
- the server.
-
- All the downloaded HTMLs are kept in downloaded_html_files, and
- downloaded URLs in urls_downloaded. All the information is
- extracted from these two lists. */
-
-void
-convert_all_links (void)
-{
- slist *html;
- long msecs;
- int file_count = 0;
-
- struct wget_timer *timer = wtimer_new ();
-
- /* Destructively reverse downloaded_html_files to get it in the right order.
- recursive_retrieve() used slist_prepend() consistently. */
- downloaded_html_list = slist_nreverse (downloaded_html_list);
-
- for (html = downloaded_html_list; html; html = html->next)
- {
- struct urlpos *urls, *cur_url;
- char *url;
- char *file = html->string;
-
- /* Determine the URL of the HTML file. get_urls_html will need
- it. */
- url = hash_table_get (dl_file_url_map, file);
- if (!url)
- {
- DEBUGP (("Apparently %s has been removed.\n", file));
- continue;
- }
-
- DEBUGP (("Scanning %s (from %s)\n", file, url));
-
- /* Parse the HTML file... */
- urls = get_urls_html (file, url, NULL);
-
- /* We don't respect meta_disallow_follow here because, even if
- the file is not followed, we might still want to convert the
- links that have been followed from other files. */
-
- for (cur_url = urls; cur_url; cur_url = cur_url->next)
- {
- char *local_name;
- struct url *u = cur_url->url;
-
- if (cur_url->link_base_p)
- {
- /* Base references have been resolved by our parser, so
- we turn the base URL into an empty string. (Perhaps
- we should remove the tag entirely?) */
- cur_url->convert = CO_NULLIFY_BASE;
- continue;
- }
-
- /* We decide the direction of conversion according to whether
- a URL was downloaded. Downloaded URLs will be converted
- ABS2REL, whereas non-downloaded will be converted REL2ABS. */
- local_name = hash_table_get (dl_url_file_map, u->url);
-
- /* Decide on the conversion type. */
- if (local_name)
- {
- /* We've downloaded this URL. Convert it to relative
- form. We do this even if the URL already is in
- relative form, because our directory structure may
- not be identical to that on the server (think `-nd',
- `--cut-dirs', etc.) */
- cur_url->convert = CO_CONVERT_TO_RELATIVE;
- cur_url->local_name = xstrdup (local_name);
- DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
- }
- else
- {
- /* We haven't downloaded this URL. If it's not already
- complete (including a full host name), convert it to
- that form, so it can be reached while browsing this
- HTML locally. */
- if (!cur_url->link_complete_p)
- cur_url->convert = CO_CONVERT_TO_COMPLETE;
- cur_url->local_name = NULL;
- DEBUGP (("will convert url %s to complete\n", u->url));
- }
- }
-
- /* Convert the links in the file. */
- convert_links (file, urls);
- ++file_count;
-
- /* Free the data. */
- free_urlpos (urls);
- }
-
- msecs = wtimer_elapsed (timer);
- wtimer_delete (timer);
- logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
- file_count, (double)msecs / 1000);
-}
-
-/* Cleanup the data structures associated with recursive retrieving
- (the variables above). */
-void
-recursive_cleanup (void)
-{
- if (dl_file_url_map)
- {
- free_keys_and_values (dl_file_url_map);
- hash_table_destroy (dl_file_url_map);
- dl_file_url_map = NULL;
- }
- if (dl_url_file_map)
- {
- free_keys_and_values (dl_url_file_map);
- hash_table_destroy (dl_url_file_map);
- dl_url_file_map = NULL;
- }
- if (downloaded_html_set)
- string_set_free (downloaded_html_set);
- slist_free (downloaded_html_list);
- downloaded_html_list = NULL;
-}
#ifndef RECUR_H
#define RECUR_H
+struct urlpos;
+
-void recursive_cleanup PARAMS ((void));
uerr_t retrieve_tree PARAMS ((const char *));
-void register_download PARAMS ((const char *, const char *));
-void register_redirection PARAMS ((const char *, const char *));
-void register_html PARAMS ((const char *, const char *));
-void convert_all_links PARAMS ((void));
+/* get_urls_file and get_urls_html are really in html-url.c;
+ free_urlpos is in retr.c. */
+struct urlpos *get_urls_file PARAMS ((const char *));
+struct urlpos *get_urls_html PARAMS ((const char *, const char *, int *));
+void free_urlpos PARAMS ((struct urlpos *));
#endif /* RECUR_H */
#include "host.h"
#include "connect.h"
#include "hash.h"
+#include "convert.h"
#ifdef HAVE_SSL
# include "gen_sslfunc.h" /* for ssl_iread */
} \
} while (0)
+static char *getproxy PARAMS ((struct url *));
+
/* Retrieve the given URL. Decides which loop to call -- HTTP, FTP,
FTP proxy, etc.
}
}
}
+
+/* Free the linked list of urlpos. */
+void
+free_urlpos (struct urlpos *l)
+{
+ while (l)
+ {
+ struct urlpos *next = l->next;
+ if (l->url)
+ url_free (l->url);
+ FREE_MAYBE (l->local_name);
+ xfree (l);
+ l = next;
+ }
+}
+
+/* Rotate FNAME opt.backups times */
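+/* E.g. (illustrative) with opt.backups == 3, rotate_backups ("log")
+ renames "log.2" to "log.3", "log.1" to "log.2", then "log" to
+ "log.1". */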
+void
+rotate_backups(const char *fname)
+{
+ int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
+ char *from = (char *)alloca (maxlen);
+ char *to = (char *)alloca (maxlen);
+ struct stat sb;
+ int i;
+
+ if (stat (fname, &sb) == 0)
+ if (S_ISREG (sb.st_mode) == 0)
+ return;
+
+ for (i = opt.backups; i > 1; i--)
+ {
+ sprintf (from, "%s.%d", fname, i - 1);
+ sprintf (to, "%s.%d", fname, i);
+ rename (from, to);
+ }
+
+ sprintf (to, "%s.%d", fname, 1);
+ rename(fname, to);
+}
+
+static int no_proxy_match PARAMS ((const char *, const char **));
+
+/* Return the URL of the proxy appropriate for url U. */
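+
+/* Illustration (assumed shorthand): an http_proxy value such as
+ "proxy.example.com:8001" lacks a scheme; rewrite_shorthand_url()
+ below is expected to expand it into a full URL before it is
+ returned. */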
+
+static char *
+getproxy (struct url *u)
+{
+ char *proxy = NULL;
+ char *rewritten_url;
+ static char rewritten_storage[1024];
+
+ if (!opt.use_proxy)
+ return NULL;
+ if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
+ return NULL;
+
+ switch (u->scheme)
+ {
+ case SCHEME_HTTP:
+ proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
+ break;
+#ifdef HAVE_SSL
+ case SCHEME_HTTPS:
+ proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
+ break;
+#endif
+ case SCHEME_FTP:
+ proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
+ break;
+ case SCHEME_INVALID:
+ break;
+ }
+ if (!proxy || !*proxy)
+ return NULL;
+
+ /* Handle shorthands. `rewritten_storage' is a kludge to allow
+ getproxy() to return static storage. */
+ rewritten_url = rewrite_shorthand_url (proxy);
+ if (rewritten_url)
+ {
+ strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
+ rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
+ proxy = rewritten_storage;
+ }
+
+ return proxy;
+}
+
+/* Decide whether HOST should be accessed through a proxy, taking
+ the no_proxy exclusion list into account. Returns non-zero if the
+ proxy is to be used. */
+static int
+no_proxy_match (const char *host, const char **no_proxy)
+{
+ if (!no_proxy)
+ return 1;
+ else
+ return !sufmatch (no_proxy, host);
+}
void sleep_between_retrievals PARAMS ((int));
+void rotate_backups PARAMS ((const char *));
+
/* Because there's no http.h. */
struct url;
/* Forward declarations: */
-static char *construct_relative PARAMS ((const char *, const char *));
static int path_simplify PARAMS ((char *));
-
-
\f
/* Support for encoding and decoding of URL strings. We determine
whether a character is unsafe through static table lookup. This
xfree (url);
}
\f
-struct urlpos *
-get_urls_file (const char *file)
-{
- struct file_memory *fm;
- struct urlpos *head, *tail;
- const char *text, *text_end;
-
- /* Load the file. */
- fm = read_file (file);
- if (!fm)
- {
- logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
- return NULL;
- }
- DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
-
- head = tail = NULL;
- text = fm->content;
- text_end = fm->content + fm->length;
- while (text < text_end)
- {
- const char *line_beg = text;
- const char *line_end = memchr (text, '\n', text_end - text);
- if (!line_end)
- line_end = text_end;
- else
- ++line_end;
- text = line_end;
-
- /* Strip whitespace from the beginning and end of line. */
- while (line_beg < line_end && ISSPACE (*line_beg))
- ++line_beg;
- while (line_end > line_beg && ISSPACE (*(line_end - 1)))
- --line_end;
-
- if (line_end > line_beg)
- {
- /* URL is in the [line_beg, line_end) region. */
-
- int up_error_code;
- char *url_text;
- struct urlpos *entry;
- struct url *url;
-
- /* We must copy the URL to a zero-terminated string, and we
- can't use alloca because we're in a loop. *sigh*. */
- url_text = strdupdelim (line_beg, line_end);
-
- if (opt.base_href)
- {
- /* Merge opt.base_href with URL. */
- char *merged = uri_merge (opt.base_href, url_text);
- xfree (url_text);
- url_text = merged;
- }
-
- url = url_parse (url_text, &up_error_code);
- if (!url)
- {
- logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
- file, url_text, url_error (up_error_code));
- xfree (url_text);
- continue;
- }
- xfree (url_text);
-
- entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
- memset (entry, 0, sizeof (*entry));
- entry->next = NULL;
- entry->url = url;
-
- if (!head)
- head = entry;
- else
- tail->next = entry;
- tail = entry;
- }
- }
- read_file_free (fm);
- return head;
-}
-\f
-/* Free the linked list of urlpos. */
-void
-free_urlpos (struct urlpos *l)
-{
- while (l)
- {
- struct urlpos *next = l->next;
- if (l->url)
- url_free (l->url);
- FREE_MAYBE (l->local_name);
- xfree (l);
- l = next;
- }
-}
-
-/* Rotate FNAME opt.backups times */
-void
-rotate_backups(const char *fname)
-{
- int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
- char *from = (char *)alloca (maxlen);
- char *to = (char *)alloca (maxlen);
- struct stat sb;
- int i;
-
- if (stat (fname, &sb) == 0)
- if (S_ISREG (sb.st_mode) == 0)
- return;
-
- for (i = opt.backups; i > 1; i--)
- {
- sprintf (from, "%s.%d", fname, i - 1);
- sprintf (to, "%s.%d", fname, i);
- rename (from, to);
- }
-
- sprintf (to, "%s.%d", fname, 1);
- rename(fname, to);
-}
-
/* Create all the necessary directories for PATH (a file). Calls
mkdirhier() internally. */
int
return result;
}
\f
-/* Return the URL of the proxy appropriate for url U. */
-char *
-getproxy (struct url *u)
-{
- char *proxy = NULL;
- char *rewritten_url;
- static char rewritten_storage[1024];
-
- if (!opt.use_proxy)
- return NULL;
- if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
- return NULL;
-
- switch (u->scheme)
- {
- case SCHEME_HTTP:
- proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
- break;
-#ifdef HAVE_SSL
- case SCHEME_HTTPS:
- proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
- break;
-#endif
- case SCHEME_FTP:
- proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
- break;
- case SCHEME_INVALID:
- break;
- }
- if (!proxy || !*proxy)
- return NULL;
-
- /* Handle shorthands. `rewritten_storage' is a kludge to allow
- getproxy() to return static storage. */
- rewritten_url = rewrite_shorthand_url (proxy);
- if (rewritten_url)
- {
- strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
- rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
- proxy = rewritten_storage;
- }
-
- return proxy;
-}
-
-/* Should a host be accessed through proxy, concerning no_proxy? */
-int
-no_proxy_match (const char *host, const char **no_proxy)
-{
- if (!no_proxy)
- return 1;
- else
- return !sufmatch (no_proxy, host);
-}
-\f
-/* Support for converting links for local viewing in downloaded HTML
- files. This should be moved to another file, because it has
- nothing to do with processing URLs. */
-
-static void write_backup_file PARAMS ((const char *, downloaded_file_t));
-static const char *replace_attr PARAMS ((const char *, int, FILE *,
- const char *));
-static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
- const char *, int));
-static char *local_quote_string PARAMS ((const char *));
-
-/* Change the links in one HTML file. LINKS is a list of links in the
- document, along with their positions and the desired direction of
- the conversion. */
-void
-convert_links (const char *file, struct urlpos *links)
-{
- struct file_memory *fm;
- FILE *fp;
- const char *p;
- downloaded_file_t downloaded_file_return;
-
- struct urlpos *link;
- int to_url_count = 0, to_file_count = 0;
-
- logprintf (LOG_VERBOSE, _("Converting %s... "), file);
-
- {
- /* First we do a "dry run": go through the list L and see whether
- any URL needs to be converted in the first place. If not, just
- leave the file alone. */
- int dry_count = 0;
- struct urlpos *dry = links;
- for (dry = links; dry; dry = dry->next)
- if (dry->convert != CO_NOCONVERT)
- ++dry_count;
- if (!dry_count)
- {
- logputs (LOG_VERBOSE, _("nothing to do.\n"));
- return;
- }
- }
-
- fm = read_file (file);
- if (!fm)
- {
- logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
- file, strerror (errno));
- return;
- }
-
- downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
- if (opt.backup_converted && downloaded_file_return)
- write_backup_file (file, downloaded_file_return);
-
- /* Before opening the file for writing, unlink the file. This is
- important if the data in FM is mmapped. In that case, truncating
- the file, which is what fopen() below does, would make us read all
- zeroes from the mmapped region. */
- if (unlink (file) < 0 && errno != ENOENT)
- {
- logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
- file, strerror (errno));
- read_file_free (fm);
- return;
- }
- /* Now open the file for writing. */
- fp = fopen (file, "wb");
- if (!fp)
- {
- logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
- file, strerror (errno));
- read_file_free (fm);
- return;
- }
-
- /* Here we loop through all the URLs in the file, replacing the
- ones that have been downloaded with relative references. */
- p = fm->content;
- for (link = links; link; link = link->next)
- {
- char *url_start = fm->content + link->pos;
-
- if (link->pos >= fm->length)
- {
- DEBUGP (("Something strange is going on. Please investigate."));
- break;
- }
- /* If the URL is not to be converted, skip it. */
- if (link->convert == CO_NOCONVERT)
- {
- DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
- continue;
- }
-
- /* Echo the file contents, up to the offending URL's opening
- quote, to the outfile. */
- fwrite (p, 1, url_start - p, fp);
- p = url_start;
-
- switch (link->convert)
- {
- case CO_CONVERT_TO_RELATIVE:
- /* Convert absolute URL to relative. */
- {
- char *newname = construct_relative (file, link->local_name);
- char *quoted_newname = local_quote_string (newname);
-
- if (!link->link_refresh_p)
- p = replace_attr (p, link->size, fp, quoted_newname);
- else
- p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
- link->refresh_timeout);
-
- DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
- link->url->url, newname, link->pos, file));
- xfree (newname);
- xfree (quoted_newname);
- ++to_file_count;
- break;
- }
- case CO_CONVERT_TO_COMPLETE:
- /* Convert the link to absolute URL. */
- {
- char *newlink = link->url->url;
- char *quoted_newlink = html_quote_string (newlink);
-
- if (!link->link_refresh_p)
- p = replace_attr (p, link->size, fp, quoted_newlink);
- else
- p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
- link->refresh_timeout);
-
- DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
- newlink, link->pos, file));
- xfree (quoted_newlink);
- ++to_url_count;
- break;
- }
- case CO_NULLIFY_BASE:
- /* Change the base href to "". */
- p = replace_attr (p, link->size, fp, "");
- break;
- case CO_NOCONVERT:
- abort ();
- break;
- }
- }
-
- /* Output the rest of the file. */
- if (p - fm->content < fm->length)
- fwrite (p, 1, fm->length - (p - fm->content), fp);
- fclose (fp);
- read_file_free (fm);
-
- logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
-}
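-
-/* Usage sketch (editorial): convert_links() is normally driven by
- convert_all_links() after the retrieval finishes; roughly, with
- FILE and URL standing for the local file and the URL it came from
- (illustrative only): */
-#if 0
- struct urlpos *urls = get_urls_html (file, url, NULL);
- /* ... fill in each entry's local_name and convert fields ... */
- convert_links (file, urls);
- free_urlpos (urls);
-#endif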
-
-/* Construct and return a malloced copy of the relative link from two
- pieces of information: local name S1 of the referring file and
- local name S2 of the referred file.
-
- So, if S1 is "jagor.srce.hr/index.html" and S2 is
- "jagor.srce.hr/images/news.gif", the function will return
- "images/news.gif".
-
- Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
- "fly.cc.fer.hr/images/fly.gif", the function will return
- "../images/fly.gif".
-
- Caveats: S1 should not begin with `/', unless S2 also begins with
- `/'. S1 should not contain things like ".." and such --
- construct_relative ("fly/ioccc/../index.html",
- "fly/images/fly.gif") will fail. (A workaround is to call
- something like path_simplify() on S1). */
-static char *
-construct_relative (const char *s1, const char *s2)
-{
- int i, cnt, sepdirs1;
- char *res;
-
- if (*s2 == '/')
- return xstrdup (s2);
- /* S1 should *not* be absolute if S2 isn't. */
- assert (*s1 != '/');
- i = cnt = 0;
- /* Skip the directories common to both strings. */
- while (1)
- {
- while (s1[i] && s2[i]
- && (s1[i] == s2[i])
- && (s1[i] != '/')
- && (s2[i] != '/'))
- ++i;
- if (s1[i] == '/' && s2[i] == '/')
- cnt = ++i;
- else
- break;
- }
- for (sepdirs1 = 0; s1[i]; i++)
- if (s1[i] == '/')
- ++sepdirs1;
- /* Now construct the result:
- - "../" repeated sepdirs1 times
- - all the directories of S2 that the two names do not share. */
- res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
- for (i = 0; i < sepdirs1; i++)
- memcpy (res + 3 * i, "../", 3);
- strcpy (res + 3 * i, s2 + cnt);
- return res;
-}
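-
-/* Worked example (editorial): for
- construct_relative ("fly.cc.fer.hr/ioccc/index.html",
- "fly.cc.fer.hr/images/fly.gif")
- the common-prefix loop stops after "fly.cc.fer.hr/" (cnt == 14);
- one '/' remains in S1 beyond that point, so sepdirs1 == 1 and the
- result is "../" once, followed by "images/fly.gif". */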
-\f
-static void
-write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
-{
- /* Rather than just writing over the original .html file with the
- converted version, save the former to *.orig. Note we only do
- this for files we've _successfully_ downloaded, so we don't
- clobber .orig files sitting around from previous invocations. */
-
- /* Construct the backup filename as the original name plus ".orig". */
- size_t filename_len = strlen (file);
- char *filename_plus_orig_suffix;
- boolean already_wrote_backup_file = FALSE;
- slist *converted_file_ptr;
- static slist *converted_files = NULL;
-
- if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
- {
- /* Just write "orig" over "html". We need to do it this way
- because when we're checking to see if we've downloaded the
- file before (to see if we can skip downloading it), we don't
- know if it's a text/html file. Therefore we don't know yet
- at that stage that -E is going to cause us to tack on
- ".html", so we need to compare vs. the original URL plus
- ".orig", not the original URL plus ".html.orig". */
- filename_plus_orig_suffix = alloca (filename_len + 1);
- strcpy(filename_plus_orig_suffix, file);
- strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
- }
- else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
- {
- /* Append ".orig" to the name. */
- filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
- strcpy(filename_plus_orig_suffix, file);
- strcpy(filename_plus_orig_suffix + filename_len, ".orig");
- }
-
- /* We can get called twice on the same URL thanks to the
- convert_all_links() call in main(). If we write the .orig file
- each time in such a case, it'll end up containing the first-pass
- conversion, not the original file. So, see if we've already been
- called on this file. */
- converted_file_ptr = converted_files;
- while (converted_file_ptr != NULL)
- if (strcmp(converted_file_ptr->string, file) == 0)
- {
- already_wrote_backup_file = TRUE;
- break;
- }
- else
- converted_file_ptr = converted_file_ptr->next;
-
- if (!already_wrote_backup_file)
- {
- /* Rename <file> to <file>.orig before former gets written over. */
- if (rename(file, filename_plus_orig_suffix) != 0)
- logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
- file, filename_plus_orig_suffix, strerror (errno));
-
- /* Remember that we've already written a .orig backup for this file.
- Note that we never free this memory since we need it till the
- convert_all_links() call, which is one of the last things the
- program does before terminating. BTW, I'm not sure if it would be
- safe to just set 'converted_file_ptr->string' to 'file' below,
- rather than making a copy of the string... Another note is that I
- thought I could just add a field to the urlpos structure saying
- that we'd written a .orig file for this URL, but that didn't work,
- so I had to make this separate list.
- -- Dan Harkless <wget@harkless.org>
-
- This [adding a field to the urlpos structure] didn't work
- because convert_file() is called from convert_all_links at
- the end of the retrieval with a freshly built new urlpos
- list.
- -- Hrvoje Niksic <hniksic@arsdigita.com>
- */
- converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
- converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
- converted_file_ptr->next = converted_files;
- converted_files = converted_file_ptr;
- }
-}
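-
-/* Naming sketch (editorial): FILE_DOWNLOADED_NORMALLY backs up
- "foo/index.html" as "foo/index.html.orig", while
- FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED (the -E case) writes
- "orig" over the trailing "html", yielding "foo/index.orig", so the
- backup name matches the name the file had before ".html" was
- tacked on. */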
-
-static int find_fragment PARAMS ((const char *, int, const char **,
- const char **));
-
-/* Replace an attribute's original text with NEW_TEXT. */
-
-static const char *
-replace_attr (const char *p, int size, FILE *fp, const char *new_text)
-{
- int quote_flag = 0;
- char quote_char = '\"'; /* use "..." for quoting, unless the
- original value is quoted, in which
- case reuse its quoting char. */
- const char *frag_beg, *frag_end;
-
- /* Structure of our string is:
- "...old-contents..."
- <--- size ---> (with quotes)
- OR:
- ...old-contents...
- <--- size --> (no quotes) */
-
- if (*p == '\"' || *p == '\'')
- {
- quote_char = *p;
- quote_flag = 1;
- ++p;
- size -= 2; /* disregard opening and closing quote */
- }
- putc (quote_char, fp);
- fputs (new_text, fp);
-
- /* Look for fragment identifier, if any. */
- if (find_fragment (p, size, &frag_beg, &frag_end))
- fwrite (frag_beg, 1, frag_end - frag_beg, fp);
- p += size;
- if (quote_flag)
- ++p;
- putc (quote_char, fp);
-
- return p;
-}
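-
-/* Worked example (editorial): if P points at the quoted value
- "old/page.html#sec2" (SIZE covering the quotes) and NEW_TEXT is
- "page.html", the function emits "page.html#sec2" -- reusing the
- original quote character and carrying the fragment over -- and
- returns P advanced past the old value. */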
-
-/* The same as REPLACE_ATTR, but used when replacing
- <meta http-equiv=refresh content="new_text"> because we need to
- prepend "timeout_value; URL=" to the new text. */
-
-static const char *
-replace_attr_refresh_hack (const char *p, int size, FILE *fp,
- const char *new_text, int timeout)
-{
- /* "0; URL=..." */
- char *new_with_timeout = (char *)alloca (numdigit (timeout)
- + 6 /* "; URL=" */
- + strlen (new_text)
- + 1);
- sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
-
- return replace_attr (p, size, fp, new_with_timeout);
-}
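-
-/* Example (editorial): with TIMEOUT == 5 and NEW_TEXT "local.html",
- the attribute value is rewritten as "5; URL=local.html", the form
- that <meta http-equiv=refresh> expects. */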
-
-/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
- preceded by '&'. If the character is not found, return zero. If
- the character is found, return 1 and set BP and EP to point to the
- beginning and end of the region.
-
- This is used for finding the fragment identifiers in URLs. */
-
-static int
-find_fragment (const char *beg, int size, const char **bp, const char **ep)
-{
- const char *end = beg + size;
- int saw_amp = 0;
- for (; beg < end; beg++)
- {
- switch (*beg)
- {
- case '&':
- saw_amp = 1;
- break;
- case '#':
- if (!saw_amp)
- {
- *bp = beg;
- *ep = end;
- return 1;
- }
- /* fallthrough */
- default:
- saw_amp = 0;
- }
- }
- return 0;
-}
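-
-/* Examples (editorial): in "foo.html#bar" the '#' opens a fragment,
- so find_fragment() returns 1 with *BP at the '#'; in "foo&#38;bar"
- the '#' belongs to the "&#38;" entity (it is preceded by '&'), so
- it is skipped and the function returns 0. */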
-
-/* Quote FILE for use as a local reference to an HTML file.
-
- We quote ? as %3F to avoid passing part of the file name as the
- parameter when browsing the converted file through HTTP. However,
- it is safe to do this only when `--html-extension' is turned on.
- This is because converting "index.html?foo=bar" to
- "index.html%3Ffoo=bar" would break local browsing, as the latter
- isn't even recognized as an HTML file! However, converting
- "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
- safe for both local and HTTP-served browsing. */
-
-static char *
-local_quote_string (const char *file)
-{
- const char *file_sans_qmark;
- int qm;
-
- if (!opt.html_extension)
- return html_quote_string (file);
-
- qm = count_char (file, '?');
-
- if (qm)
- {
- const char *from = file;
- char *to, *newname;
-
- /* qm * 2 because we replace each question mark with "%3F",
- i.e. replace one char with three, hence two more. */
- int fsqlen = strlen (file) + qm * 2;
-
- to = newname = (char *)alloca (fsqlen + 1);
- for (; *from; from++)
- {
- if (*from != '?')
- *to++ = *from;
- else
- {
- *to++ = '%';
- *to++ = '3';
- *to++ = 'F';
- }
- }
- assert (to - newname == fsqlen);
- *to = '\0';
-
- file_sans_qmark = newname;
- }
- else
- file_sans_qmark = file;
-
- return html_quote_string (file_sans_qmark);
-}
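-
-/* Example (editorial): with `--html-extension' in effect,
- local_quote_string ("index.html?foo=bar") yields
- "index.html%3Ffoo=bar"; without it, the '?' is left alone and the
- name goes straight to html_quote_string(). */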
-
-/* We're storing "modes" of type downloaded_file_t in the hash table.
- However, our hash tables only accept pointers for keys and values.
- So when we need a pointer, we use the address of a
- downloaded_file_t variable of static storage. */
-
-static downloaded_file_t *
-downloaded_mode_to_ptr (downloaded_file_t mode)
-{
- static downloaded_file_t
- v1 = FILE_NOT_ALREADY_DOWNLOADED,
- v2 = FILE_DOWNLOADED_NORMALLY,
- v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
- v4 = CHECK_FOR_FILE;
-
- switch (mode)
- {
- case FILE_NOT_ALREADY_DOWNLOADED:
- return &v1;
- case FILE_DOWNLOADED_NORMALLY:
- return &v2;
- case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
- return &v3;
- case CHECK_FOR_FILE:
- return &v4;
- }
- return NULL;
-}
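-
-/* The idea (editorial note): the hash table stores void * values, so
- each enumerator is represented by the address of a static variable
- that holds it; dereferencing the stored pointer recovers the mode.
- Sketch, with H and KEY hypothetical: */
-#if 0
- hash_table_put (h, key, downloaded_mode_to_ptr (FILE_DOWNLOADED_NORMALLY));
- downloaded_file_t *p = hash_table_get (h, key);
- /* *p == FILE_DOWNLOADED_NORMALLY */
-#endif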
-
-/* This should really be merged with dl_file_url_map and
- downloaded_html_files in recur.c. This was originally a list, but
- I changed it to a hash table because it was actually taking a lot
- of time to find things in it. */
-
-static struct hash_table *downloaded_files_hash;
-
-/* Remembers which files have been downloaded. In the standard case,
- it should be called with mode == FILE_DOWNLOADED_NORMALLY for each
- file we actually download successfully (i.e. not for ones we have
- failures on or that we skip due to -N).
-
- When we've downloaded a file and tacked on a ".html" extension due to -E,
- call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
- FILE_DOWNLOADED_NORMALLY.
-
- If you just want to check if a file has been previously added without adding
- it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
- with local filenames, not remote URLs. */
-downloaded_file_t
-downloaded_file (downloaded_file_t mode, const char *file)
-{
- downloaded_file_t *ptr;
-
- if (mode == CHECK_FOR_FILE)
- {
- if (!downloaded_files_hash)
- return FILE_NOT_ALREADY_DOWNLOADED;
- ptr = hash_table_get (downloaded_files_hash, file);
- if (!ptr)
- return FILE_NOT_ALREADY_DOWNLOADED;
- return *ptr;
- }
-
- if (!downloaded_files_hash)
- downloaded_files_hash = make_string_hash_table (0);
-
- ptr = hash_table_get (downloaded_files_hash, file);
- if (ptr)
- return *ptr;
-
- ptr = downloaded_mode_to_ptr (mode);
- hash_table_put (downloaded_files_hash, xstrdup (file), ptr);
-
- return FILE_NOT_ALREADY_DOWNLOADED;
-}
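-
-/* Usage sketch (editorial): record each successful download, then
- query before retrieving again: */
-#if 0
- downloaded_file (FILE_DOWNLOADED_NORMALLY, "www.gnu.org/index.html");
- /* ... later ... */
- if (downloaded_file (CHECK_FOR_FILE, "www.gnu.org/index.html")
- != FILE_NOT_ALREADY_DOWNLOADED)
- /* already on disk; skip */ ;
-#endif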
-
-static int
-df_free_mapper (void *key, void *value, void *ignored)
-{
- xfree (key);
- return 0;
-}
-
-void
-downloaded_files_free (void)
-{
- if (downloaded_files_hash)
- {
- hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
- hash_table_destroy (downloaded_files_hash);
- downloaded_files_hash = NULL;
- }
-}
-
/* Return non-zero if scheme a is similar to scheme b.
Schemes are similar if they are equal. If SSL is supported, schemes
char *passwd;
};
-enum convert_options {
- CO_NOCONVERT = 0, /* don't convert this URL */
- CO_CONVERT_TO_RELATIVE, /* convert to relative, e.g. to
- "../../otherdir/foo.gif" */
- CO_CONVERT_TO_COMPLETE, /* convert to absolute, e.g. to
- "http://orighost/somedir/bar.jpg". */
- CO_NULLIFY_BASE /* change to empty string. */
-};
-
-/* A structure that defines the whereabouts of a URL, i.e. its
- position in an HTML document, etc. */
-
-struct urlpos {
- struct url *url; /* the URL of the link, after it has
- been merged with the base */
- char *local_name; /* local file to which it was saved
- (used by convert_links) */
-
- /* reserved for special links such as <base href="..."> which are
- used when converting links, but ignored when downloading. */
- unsigned int ignore_when_downloading :1;
-
- /* Information about the original link: */
-
- unsigned int link_relative_p :1; /* was the link relative? */
- unsigned int link_complete_p :1; /* was the link complete (with the
- host name, etc.)? */
- unsigned int link_base_p :1; /* was the link <base href=...> */
- unsigned int link_inline_p :1; /* needed to render the page. */
-
- unsigned int link_refresh_p :1; /* link was received from
- <meta http-equiv=refresh content=...> */
- int refresh_timeout; /* for reconstructing the refresh. */
-
- /* Conversion requirements: */
- enum convert_options convert; /* is conversion required? */
-
- /* URL's position in the buffer. */
- int pos, size;
-
- struct urlpos *next; /* next list element */
-};
-
-/* downloaded_file() takes a parameter of this type and returns this type. */
-typedef enum
-{
- /* Return enumerators: */
- FILE_NOT_ALREADY_DOWNLOADED = 0,
-
- /* Return / parameter enumerators: */
- FILE_DOWNLOADED_NORMALLY,
- FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
-
- /* Parameter enumerators: */
- CHECK_FOR_FILE
-} downloaded_file_t;
-
/* Function declarations */
char *url_escape PARAMS ((const char *));
void scheme_disable PARAMS ((enum url_scheme));
char *url_string PARAMS ((const struct url *, int));
-
-struct urlpos *get_urls_file PARAMS ((const char *));
-struct urlpos *get_urls_html PARAMS ((const char *, const char *, int *));
-void free_urlpos PARAMS ((struct urlpos *));
+char *url_file_name PARAMS ((const struct url *));
char *uri_merge PARAMS ((const char *, const char *));
-void rotate_backups PARAMS ((const char *));
int mkalldirs PARAMS ((const char *));
-char *url_file_name PARAMS ((const struct url *));
-
-char *getproxy PARAMS ((struct url *));
-int no_proxy_match PARAMS ((const char *, const char **));
-
-void convert_links PARAMS ((const char *, struct urlpos *));
-
-downloaded_file_t downloaded_file PARAMS ((downloaded_file_t, const char *));
char *rewrite_shorthand_url PARAMS ((const char *));
-
int schemes_are_similar_p PARAMS ((enum url_scheme a, enum url_scheme b));
#endif /* URL_H */