From: dan
Date: Thu, 2 Mar 2000 06:33:48 +0000 (-0800)
Subject: [svn] Implemented the item I formerly had in the TODO: When -K and -N are used
X-Git-Tag: v1.13~2519
X-Git-Url: http://sjero.net/git/?p=wget;a=commitdiff_plain;h=4331c39c9ad0a942bd2b1eb8663e762d2403b90d

[svn] Implemented the item I formerly had in the TODO: When -K and -N are used
together, we compare local file X.orig (if extant) against server file X.
Previously -k and -N were worthless in combination because the local converted
files always differed from the server versions.
---

diff --git a/ChangeLog b/ChangeLog
index 92039a92..6edc5cbc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2000-03-01  Dan Harkless
+
+        * NEWS (-K): Now possible to use -N with -k thanks to this option.
+
+        * TODO: Removed the -K / -N interaction item.
+
 2000-02-29  Dan Harkless
 
         * NEWS (-K / --backup-converted): Mentioned this new option.
diff --git a/NEWS b/NEWS
index 35c5dc41..eeab6a3e 100644
--- a/NEWS
+++ b/NEWS
@@ -8,7 +8,8 @@ Please send GNU Wget bug reports to .
 
 * Changes in Wget 1.5.3+dev
 
 ** New -K / --backup-converted / backup_converted = on option causes files
-modified due to -k to be saved with a .orig suffix before being changed.
+modified due to -k to be saved with a .orig suffix before being changed.  When
+using -N as well, it is these .orig files that are compared against the server.
 
 * Wget 1.5.3 is a bugfix release with no user-visible changes.
diff --git a/TODO b/TODO
index f25ac4f2..9f547699 100644
--- a/TODO
+++ b/TODO
@@ -76,6 +76,3 @@ particular order.  Not all of them are user-visible changes.
 * Implement more HTTP/1.1 bells and whistles (ETag, Content-MD5 etc.)
 
 * Support SSL encryption through SSLeay or OpenSSL.
-
-* When -K is used with -N, check local file X.orig (if extant) against server
-  file X.
diff --git a/src/ChangeLog b/src/ChangeLog
index 36613747..54704959 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,35 @@
+2000-03-01  Dan Harkless
+
+        * ftp.c (ftp_loop_internal): Call new downloaded_file() function,
+        even though we don't do conversion on HTML files retrieved via
+        FTP, so _current_ usage of downloaded_file() makes this call unneeded.
+        (ftp_retrieve_list): Added a comment saying where we need to
+        stat() a .orig file if FTP'd HTML file conversion is ever implemented.
+        (ftp_retrieve_list): "Local file '%s' is more recent," is sometimes
+        a lie -- reworded as "Server file no newer than local file '%s' --".
+
+        * http.c (http_loop): Fixed a typo and clarified a comment.
+        (http_loop): When -K and -N are specified together, compare size
+        and timestamp of server file X against local file X.orig (if
+        extant) rather than converted local file X.
+        (http_loop): "Local file '%s' is more recent," is sometimes a lie
+        -- reworded as "Server file no newer than local file '%s' --".
+        (http_loop): Call new downloaded_file() function to prevent
+        wrongful overwriting of .orig file when -N is specified.
+
+        * url.c (convert_links): When -K specified, only rename X to
+        X.orig if downloaded_file() returns TRUE.  Otherwise when we skip
+        file X due to -N, we clobber an X.orig from a previous invocation.
+        (convert_links): Call the failsafe xstrdup(), not the real strdup().
+        (convert_links): Added a note asking anyone who understands how
+        multiple URLs can correspond to a single file to comment it.
+        (downloaded_file): Added this new function.
+
+        * url.h (downloaded_file): Added prototype for this new function
+        as well as its downloaded_file_t enum type.
+
+        * wget.h (boolean): Added this new typedef and TRUE and FALSE #defines.
+
 2000-02-29  Dan Harkless
 
         * version.c: Upped version to developer-only "1.5.3+dev".
diff --git a/src/ftp.c b/src/ftp.c
index 61507db4..9077c674 100644
--- a/src/ftp.c
+++ b/src/ftp.c
@@ -947,6 +947,11 @@ ftp_loop_internal (struct urlinfo *u, struct fileinfo *f, ccon *con)
           /* Not as great.  */
           abort ();
         }
+
+      /* If we get out of the switch above without continue'ing, we've
+         successfully downloaded a file.  Remember this fact. */
+      downloaded_file(ADD_FILE, locf);
+
       if (con->st & ON_YOUR_OWN)
         {
           CLOSE (RBUF_FD (&con->rbuf));
@@ -1086,6 +1091,11 @@ ftp_retrieve_list (struct urlinfo *u, struct fileinfo *f, ccon *con)
       if (opt.timestamping && f->type == FT_PLAINFILE)
         {
           struct stat st;
+          /* If conversion of HTML files retrieved via FTP is ever implemented,
+             we'll need to stat() .orig here when -K has been specified.
+             I'm not implementing it now since files on an FTP server are much
+             more likely than files on an HTTP server to legitimately have a
+             .orig suffix. */
           if (!stat (u->local, &st))
             {
               /* Else, get it from the file.  */
@@ -1094,13 +1104,13 @@ ftp_retrieve_list (struct urlinfo *u, struct fileinfo *f, ccon *con)
           if (local_size == f->size && tml >= f->tstamp)
             {
               logprintf (LOG_VERBOSE, _("\
-Local file `%s' is more recent, not retrieving.\n\n"), u->local);
+Server file no newer than local file `%s' -- not retrieving.\n\n"), u->local);
               dlthis = 0;
             }
           else if (local_size != f->size)
             {
               logprintf (LOG_VERBOSE, _("\
-The sizes do not match (local %ld), retrieving.\n"), local_size);
+The sizes do not match (local %ld) -- retrieving.\n"), local_size);
             }
         }
     } /* opt.timestamping && f->type == FT_PLAINFILE */
diff --git a/src/http.c b/src/http.c
index 9ea0670f..c79e6fec 100644
--- a/src/http.c
+++ b/src/http.c
@@ -840,6 +840,7 @@ http_loop (struct urlinfo *u, char **newloc, int *dt)
   static int first_retrieval = 1;
 
   int count;
+  int local_dot_orig_file_exists = FALSE;
   int use_ts, got_head = 0;     /* time-stamping info */
   char *tms, *suf, *locf, *tmrate;
   uerr_t err;
@@ -851,7 +852,7 @@ http_loop (struct urlinfo *u, char **newloc, int *dt)
   *newloc = NULL;
 
   /* Warn on (likely bogus) wildcard usage in HTTP.  Don't use
-     has_wildcards_p because it would also warn on `?', and we that
+     has_wildcards_p because it would also warn on `?', and we know that
      shows up in CGI paths a *lot*.  */
   if (strchr (u->url, '*'))
     logputs (LOG_VERBOSE, _("Warning: wildcards not supported in HTTP.\n"));
@@ -888,7 +889,43 @@ File `%s' already there, will not retrieve.\n"), u->local);
   use_ts = 0;
   if (opt.timestamping)
     {
-      if (stat (u->local, &st) == 0)
+      boolean local_file_exists = FALSE;
+
+      if (opt.backup_converted)
+        /* If -K is specified, we'll act on the assumption that it was specified
+           last time these files were downloaded as well, and instead of just
+           comparing local file X against server file X, we'll compare local
+           file X.orig (if extant, else X) against server file X.  If -K
+           _wasn't_ specified last time, or the server contains files called
+           *.orig, -N will be back to not operating correctly with -k. */
+        {
+          size_t filename_len = strlen(u->local);
+          char* filename_plus_orig_suffix = malloc(filename_len +
+                                                   sizeof(".orig"));
+
+          /* Would a single s[n]printf() call be faster? */
+          strcpy(filename_plus_orig_suffix, u->local);
+          strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+
+          /* Try to stat() the .orig file. */
+          if (stat(filename_plus_orig_suffix, &st) == 0)
+            {
+              local_file_exists = TRUE;
+              local_dot_orig_file_exists = TRUE;
+            }
+
+          free(filename_plus_orig_suffix);
+        }
+
+      if (!local_dot_orig_file_exists)
+        /* Couldn't stat() <file>.orig, so try to stat() <file>. */
+        if (stat (u->local, &st) == 0)
+          local_file_exists = TRUE;
+
+      if (local_file_exists)
+        /* There was a local file, so we'll check later to see if the version
+           the server has is the same version we already have, allowing us to
+           skip a download. */
         {
           use_ts = 1;
           tml = st.st_mtime;
@@ -1051,14 +1088,26 @@ Last-modified header invalid -- time-stamp ignored.\n"));
           if (tml >= tmr &&
               (hstat.contlen == -1 || local_size == hstat.contlen))
             {
-              logprintf (LOG_VERBOSE, _("\
-Local file `%s' is more recent, not retrieving.\n\n"), u->local);
+              if (local_dot_orig_file_exists)
+                /* We can't collapse this down into just one logprintf()
+                   call with a variable set to u->local or the .orig
+                   filename because we have to malloc() space for the
+                   latter, and because there are multiple returns above (a
+                   coding style no-no by many measures, for reasons such as
+                   this) we'd have to remember to free() the string at each
+                   one to avoid a memory leak. */
+                logprintf (LOG_VERBOSE, _("\
+Server file no newer than local file `%s.orig' -- not retrieving.\n\n"),
+                           u->local);
+              else
+                logprintf (LOG_VERBOSE, _("\
+Server file no newer than local file `%s' -- not retrieving.\n\n"), u->local);
               FREEHSTAT (hstat);
               return RETROK;
             }
           else if (tml >= tmr)
             logprintf (LOG_VERBOSE, _("\
-The sizes do not match (local %ld), retrieving.\n"), local_size);
+The sizes do not match (local %ld) -- retrieving.\n"), local_size);
           else
             logputs (LOG_VERBOSE, _("Remote file is newer, retrieving.\n"));
 
@@ -1103,12 +1152,13 @@ The sizes do not match (local %ld), retrieving.\n"), local_size);
             }
           ++opt.numurls;
           opt.downloaded += hstat.len;
+          downloaded_file(ADD_FILE, locf);
           return RETROK;
         }
       else if (hstat.res == 0) /* No read error */
         {
-          if (hstat.contlen == -1)  /* We don't know how much we were
-                                       supposed to get, so... */
+          if (hstat.contlen == -1)  /* We don't know how much we were supposed
+                                       to get, so assume we succeeded. */
             {
               if (*dt & RETROKF)
                 {
@@ -1121,6 +1171,7 @@ The sizes do not match (local %ld), retrieving.\n"), local_size);
             }
           ++opt.numurls;
           opt.downloaded += hstat.len;
+          downloaded_file(ADD_FILE, locf);
           return RETROK;
         }
       else if (hstat.len < hstat.contlen) /* meaning we lost the
@@ -1142,6 +1193,7 @@ The sizes do not match (local %ld), retrieving.\n"), local_size);
                      tms, u->url, hstat.len, hstat.contlen, locf, count);
           ++opt.numurls;
           opt.downloaded += hstat.len;
+          downloaded_file(ADD_FILE, locf);
           return RETROK;
         }
       else /* the same, but not accepted */
diff --git a/src/retr.c b/src/retr.c
index 59250054..d90fa529 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -350,7 +350,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   if (result != URLOK || u->proto != URLHTTP)
     {
       if (u->proto == URLHTTP)
-        logprintf (LOG_NOTQUIET, "Proxy %s: %s.\n", proxy, uerrmsg (result));
+        logprintf (LOG_NOTQUIET, "Proxy %s: %s.\n", proxy, uerrmsg(result));
       else
         logprintf (LOG_NOTQUIET, _("Proxy %s: Must be HTTP.\n"), proxy);
       freeurl (u, 1);
@@ -455,7 +455,8 @@ retrieve_from_file (const char *file, int html, int *count)
         }
       status = retrieve_url (cur_url->url, &filename, &new_file, NULL, &dt);
       if (opt.recursive && status == RETROK && (dt & TEXTHTML))
-        status = recursive_retrieve (filename, new_file ? new_file : cur_url->url);
+        status = recursive_retrieve (filename, new_file ? new_file
+                                     : cur_url->url);
 
       if (filename && opt.delete_after && file_exists_p (filename))
         {
diff --git a/src/url.c b/src/url.c
index 87a97b30..33aa37b8 100644
--- a/src/url.c
+++ b/src/url.c
@@ -1366,10 +1366,9 @@ no_proxy_match (const char *host, const char **no_proxy)
 void
 convert_links (const char *file, urlpos *l)
 {
-  FILE          *fp;
-  char          *buf, *p, *p2;
-  long          size;
-  static slist* converted_files = NULL;
+  FILE *fp;
+  char *buf, *p, *p2;
+  long size;
 
   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
   /* Read from the file.... */
@@ -1383,28 +1382,34 @@ convert_links (const char *file, urlpos *l)
   /* ...to a buffer.  */
   load_file (fp, &buf, &size);
   fclose (fp);
-  if (opt.backup_converted)
+  if (opt.backup_converted && downloaded_file(CHECK_FOR_FILE, file))
     /* Rather than just writing over the original .html file with the converted
-       version, save the former to *.orig. */
+       version, save the former to *.orig.  Note we only do this for files we've
+       _successfully_ downloaded, so we don't clobber .orig files sitting around
+       from previous invocations. */
     {
       /* Construct the backup filename as the original name plus ".orig". */
-      size_t filename_len = strlen(file);
-      char*  filename_plus_orig_suffix = malloc(filename_len + sizeof(".orig"));
-      int    already_wrote_backup_file = 0;
-      slist* converted_file_ptr;
-
+      size_t filename_len = strlen(file);
+      char* filename_plus_orig_suffix = malloc(filename_len +
+                                               sizeof(".orig"));
+      boolean already_wrote_backup_file = FALSE;
+      slist* converted_file_ptr;
+      static slist* converted_files = NULL;
+
+      /* Would a single s[n]printf() call be faster? */
       strcpy(filename_plus_orig_suffix, file);
       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
 
       /* We can get called twice on the same URL thanks to the
          convert_all_links() call in main().  If we write the .orig file each
          time in such a case, it'll end up containing the first-pass conversion,
-         not the original file. */
+         not the original file.  So, see if we've already been called on this
+         file. */
       converted_file_ptr = converted_files;
       while (converted_file_ptr != NULL)
         if (strcmp(converted_file_ptr->string, file) == 0)
           {
-            already_wrote_backup_file = 1;
+            already_wrote_backup_file = TRUE;
            break;
           }
         else
@@ -1421,10 +1426,13 @@ convert_links (const char *file, urlpos *l)
              Note that we never free this memory since we need it till the
              convert_all_links() call, which is one of the last things the
              program does before terminating.  BTW, I'm not sure if it would be
-             safe to just set converted_file_ptr->string to file below, rather
-             than making a copy of the string... */
+             safe to just set 'converted_file_ptr->string' to 'file' below,
+             rather than making a copy of the string...  Another note is that I
+             thought I could just add a field to the urlpos structure saying
+             that we'd written a .orig file for this URL, but that didn't work,
+             so I had to make this separate list. */
           converted_file_ptr = malloc(sizeof(slist));
-          converted_file_ptr->string = strdup(file);
+          converted_file_ptr->string = xstrdup(file);   /* die on out-of-mem. */
           converted_file_ptr->next = converted_files;
           converted_files = converted_file_ptr;
         }
@@ -1440,6 +1448,8 @@ convert_links (const char *file, urlpos *l)
       free (buf);
       return;
     }
+  /* [If someone understands why multiple URLs can correspond to one local file,
+     can they please add a comment here...?] */
   for (p = buf; l; l = l->next)
     {
       if (l->pos >= size)
@@ -1547,3 +1557,44 @@ add_url (urlpos *l, const char *url, const char *file)
   t->next = l;
   return t;
 }
+
+
+/* Remembers which files have been downloaded.  Should be called with
+   add_or_check == ADD_FILE for each file we actually download successfully
+   (i.e. not for ones we have failures on or that we skip due to -N).  If you
+   just want to check if a file has been previously added without adding it,
+   call with add_or_check == CHECK_FOR_FILE.  Please be sure to call this
+   function with local filenames, not remote URLs -- by some means that isn't
+   commented well enough for me to understand, multiple remote URLs can
+   apparently correspond to a single local file. */
+boolean
+downloaded_file (downloaded_file_t add_or_check, const char* file)
+{
+  boolean found_file = FALSE;
+  static slist* downloaded_files = NULL;
+  slist* rover = downloaded_files;
+
+  while (rover != NULL)
+    if (strcmp(rover->string, file) == 0)
+      {
+        found_file = TRUE;
+        break;
+      }
+    else
+      rover = rover->next;
+
+  if (found_file)
+    return TRUE;  /* file had already been downloaded */
+  else
+    {
+      if (add_or_check == ADD_FILE)
+        {
+          rover = malloc(sizeof(slist));
+          rover->string = xstrdup(file);  /* die on out-of-mem. */
+          rover->next = downloaded_files;
+          downloaded_files = rover;
+        }
+
+      return FALSE; /* file had not already been downloaded */
+    }
+}
diff --git a/src/url.h b/src/url.h
index 771f17a4..87044cdd 100644
--- a/src/url.h
+++ b/src/url.h
@@ -62,6 +62,12 @@ typedef struct _urlpos
   struct _urlpos *next;         /* Next struct in list */
 } urlpos;
 
+/* Controls how downloaded_file() behaves. */
+typedef enum
+{
+  ADD_FILE,
+  CHECK_FOR_FILE
+} downloaded_file_t;
 
 /* Function declarations */
 
@@ -95,4 +101,6 @@ int no_proxy_match PARAMS ((const char *, const char **));
 void convert_links PARAMS ((const char *, urlpos *));
 urlpos *add_url PARAMS ((urlpos *, const char *, const char *));
 
+boolean downloaded_file PARAMS ((downloaded_file_t, const char *));
+
 #endif /* URL_H */
diff --git a/src/wget.h b/src/wget.h
index cdc301cf..4467f5fb 100644
--- a/src/wget.h
+++ b/src/wget.h
@@ -198,4 +198,12 @@ typedef enum
   ROBOTSOK, NOROBOTS, PROXERR, AUTHFAILED, QUOTEXC, WRITEFAILED
 } uerr_t;
 
+typedef unsigned char boolean;
+#ifndef FALSE
+#define FALSE 0
+#endif
+#ifndef TRUE
+#define TRUE 1
+#endif
+
 #endif /* WGET_H */
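
For readers who want the -K / -N interaction in one place rather than spread
across the http.c hunks, here is a minimal standalone sketch of the decision
the patch implements.  should_download(), server_size and server_tstamp are
names invented for this sketch, not wget identifiers, and the real code
additionally treats an unknown Content-Length (hstat.contlen == -1) as a size
match, as the hunk above shows.

    #include <stdlib.h>
    #include <string.h>
    #include <sys/stat.h>
    #include <time.h>

    /* Return nonzero if the server's copy should be retrieved: i.e. if no
       local file exists (X.orig when -K / backup_converted is on, else X),
       if the sizes differ, or if the server's copy is strictly newer. */
    static int
    should_download (const char *local, long server_size,
                     time_t server_tstamp, int backup_converted)
    {
      struct stat st;
      int have_local = 0;

      if (backup_converted)
        {
          /* -K: compare against the pristine X.orig, not the -k-converted X. */
          size_t len = strlen (local);
          char *orig = malloc (len + sizeof (".orig"));
          strcpy (orig, local);
          strcpy (orig + len, ".orig");
          have_local = (stat (orig, &st) == 0);
          free (orig);
        }
      if (!have_local)
        have_local = (stat (local, &st) == 0);

      if (!have_local)
        return 1;                          /* nothing to compare against */
      if (st.st_size != server_size)
        return 1;                          /* sizes do not match -- retrieve */
      return st.st_mtime < server_tstamp;  /* retrieve only if server newer */
    }

The ordering is the whole point: a -k-converted X essentially always differs
from the server's copy, so checking X.orig first is what makes -N useful again
alongside -k and -K.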
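The new code twice carries the comment "Would a single s[n]printf() call be
faster?".  For what it's worth, the single-call equivalent of the two strcpy()
calls would look like the following (a sketch only; whether it actually beats
two strcpy() calls is exactly the open question the comment poses, since
sprintf() must parse its format string at run time):

    char *filename_plus_orig_suffix = malloc (strlen (file) + sizeof (".orig"));
    sprintf (filename_plus_orig_suffix, "%s.orig", file);

The buffer is sized exactly -- sizeof(".orig") counts the terminating NUL --
so plain sprintf() cannot overflow it here.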
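Finally, the intended call pattern for the new downloaded_file() bookkeeping,
condensed from the ftp.c, http.c and url.c hunks above.  rename_to_orig() is a
hypothetical stand-in for the backup work convert_links() actually performs
inline:

    /* After each successful retrieval (see ftp_loop_internal() and the
       three http_loop() return paths above): */
    downloaded_file (ADD_FILE, locf);

    /* Later, in convert_links(): only back up files downloaded during
       *this* run, so that when -N skips the download, a .orig file left
       over from a previous invocation is not clobbered: */
    if (opt.backup_converted && downloaded_file (CHECK_FOR_FILE, file))
      rename_to_orig (file);   /* hypothetical stand-in for the real backup */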