[svn] Implemented the item I formerly had in the TODO: When -K and -N are used

author dan <devnull@localhost>

Thu, 2 Mar 2000 06:33:48 +0000 (22:33 -0800)

committer dan <devnull@localhost>

Thu, 2 Mar 2000 06:33:48 +0000 (22:33 -0800)
author dan <devnull@localhost>
Thu, 2 Mar 2000 06:33:48 +0000 (22:33 -0800)
committer dan <devnull@localhost>
Thu, 2 Mar 2000 06:33:48 +0000 (22:33 -0800)
diff --git a/ChangeLog b/ChangeLog

index 92039a925ad4ece92e34e5d5d725c8d0143bd0ae..6edc5cbc5aba5fb01b8649ac73625fc15f7bb259 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2000-03-01  Dan Harkless  <dan-wget@dilvish.speed.net>
+
+       * NEWS (-K): Now possible to use -N with -k thanks to this option.
+
+       * TODO: Removed the -K / -N interaction item.
+
  2000-02-29  Dan Harkless  <dan-wget@dilvish.speed.net>
  
         * NEWS (-K / --backup-converted): Mentioned this new option.
diff --git a/NEWS b/NEWS

index 35c5dc41b36cc1efab4c46b4897d2a20325f5aca..eeab6a3e82386da4f6449a17bb8d8763943efc2e 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -8,7 +8,8 @@ Please send GNU Wget bug reports to <bug-wget@gnu.org>.
  * Changes in Wget 1.5.3+dev
  
  ** New -K / --backup-converted / backup_converted = on option causes files
-modified due to -k to be saved with a .orig prefix before being changed.
+modified due to -k to be saved with a .orig prefix before being changed.  When
+using -N as well, it is these .orig files that are compared against the server.
  \f
  * Wget 1.5.3 is a bugfix release with no user-visible changes.
  \f
diff --git a/TODO b/TODO

index f25ac4f2d4817ecad864bdf652bdcea68d99a4e3..9f5476993fa95178bd0983119f0f78ae6f38c7c9 100644 (file)
--- a/TODO
+++ b/TODO
@@ -76,6 +76,3 @@ particular order.  Not all of them are user-visible changes.
  * Implement more HTTP/1.1 bells and whistles (ETag, Content-MD5 etc.)
  
  * Support SSL encryption through SSLeay or OpenSSL.
-
-* When -K is used with -N, check local file X.orig (if extant) against server 
-  file X.
diff --git a/src/ChangeLog b/src/ChangeLog

index 3661374715206105c3940a94e98e9066bd4fa45e..54704959744cb986e15e1b40cdcf6d39ff378211 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,35 @@
+2000-03-01  Dan Harkless  <dan-wget@dilvish.speed.net>
+
+       * ftp.c (ftp_loop_internal): Call new downloaded_file() function,
+       even though we don't do conversion on HTML files retrieved via
+       FTP, so _current_ usage of downloaded_file() makes this call unneeded. 
+       (ftp_retrieve_list): Added a comment saying where we need to
+       stat() a .orig file if FTP'd HTML file conversion is ever implemented.
+       (ftp_retrieve_list): "Local file '%s' is more recent," is sometimes
+       a lie -- reworded as "Server file no newer than local file '%s' --".
+       
+       * http.c (http_loop): Fixed a typo and clarified a comment.
+       (http_loop): When -K and -N are specified together, compare size
+       and timestamp of server file X against local file X.orig (if
+       extant) rather than converted local file X.
+       (http_loop): "Local file '%s' is more recent," is sometimes a lie
+       -- reworded as "Server file no newer than local file '%s' --".
+       (http_loop): Call new downloaded_file() function to prevent
+       wrongful overwriting of .orig file when -N is specified.
+       
+       * url.c (convert_links): When -K specified, only rename X to
+       X.orig if downloaded_file() returns TRUE.  Otherwise when we skip
+       file X due to -N, we clobber an X.orig from a previous invocation.
+       (convert_links): Call the failsafe xstrdup(), not the real strdup().
+       (convert_links): Added a note asking anyone who understands how
+       multiple URLs can correspond to a single file to comment it.
+       (downloaded_file): Added this new function.
+       
+       * url.h (downloaded_file): Added prototype for this new function
+       as well as its downloaded_file_t enum type.
+
+       * wget.h (boolean): Added this new typedef and TRUE and FALSE #defines.
+
  2000-02-29  Dan Harkless  <dan-wget@dilvish.speed.net>
  
         * version.c: Upped version to developer-only "1.5.3+dev".
diff --git a/src/ftp.c b/src/ftp.c

index 61507db4742a26618758cf7c7dc9dbf87ad96ea8..9077c674c4b243477754e304553698711a20e3dd 100644 (file)
--- a/src/ftp.c
+++ b/src/ftp.c
@@ -947,6 +947,11 @@ ftp_loop_internal (struct urlinfo *u, struct fileinfo *f, ccon *con)
           /* Not as great.  */
           abort ();
         }
+
+      /* If we get out of the switch above without continue'ing, we've
+        successfully downloaded a file.  Remember this fact. */
+      downloaded_file(ADD_FILE, locf);
+
        if (con->st & ON_YOUR_OWN)
         {
           CLOSE (RBUF_FD (&con->rbuf));
@@ -1086,6 +1091,11 @@ ftp_retrieve_list (struct urlinfo *u, struct fileinfo *f, ccon *con)
        if (opt.timestamping && f->type == FT_PLAINFILE)
         {
           struct stat st;
+         /* If conversion of HTML files retrieved via FTP is ever implemented,
+            we'll need to stat() <file>.orig here when -K has been specified.
+            I'm not implementing it now since files on an FTP server are much
+            more likely than files on an HTTP server to legitimately have a
+            .orig suffix. */
           if (!stat (u->local, &st))
             {
               /* Else, get it from the file.  */
@@ -1094,13 +1104,13 @@ ftp_retrieve_list (struct urlinfo *u, struct fileinfo *f, ccon *con)
               if (local_size == f->size && tml >= f->tstamp)
                 {
                   logprintf (LOG_VERBOSE, _("\
-Local file `%s' is more recent, not retrieving.\n\n"), u->local);
+Server file no newer than local file `%s' -- not retrieving.\n\n"), u->local);
                   dlthis = 0;
                 }
               else if (local_size != f->size)
                 {
                   logprintf (LOG_VERBOSE, _("\
-The sizes do not match (local %ld), retrieving.\n"), local_size);
+The sizes do not match (local %ld) -- retrieving.\n"), local_size);
                 }
             }
         }       /* opt.timestamping && f->type == FT_PLAINFILE */
diff --git a/src/http.c b/src/http.c

index 9ea0670f4988a5b756e7911cbf0ffcf8b85be956..c79e6fecbfe31210f30d3aaf9f3ba7abb0cade0c 100644 (file)
--- a/src/http.c
+++ b/src/http.c
@@ -840,6 +840,7 @@ http_loop (struct urlinfo *u, char **newloc, int *dt)
    static int first_retrieval = 1;
  
    int count;
+  int local_dot_orig_file_exists = FALSE;
    int use_ts, got_head = 0;    /* time-stamping info */
    char *tms, *suf, *locf, *tmrate;
    uerr_t err;
@@ -851,7 +852,7 @@ http_loop (struct urlinfo *u, char **newloc, int *dt)
    *newloc = NULL;
  
    /* Warn on (likely bogus) wildcard usage in HTTP.  Don't use
-     has_wildcards_p because it would also warn on `?', and we that
+     has_wildcards_p because it would also warn on `?', and we know that
       shows up in CGI paths a *lot*.  */
    if (strchr (u->url, '*'))
      logputs (LOG_VERBOSE, _("Warning: wildcards not supported in HTTP.\n"));
@@ -888,7 +889,43 @@ File `%s' already there, will not retrieve.\n"), u->local);
    use_ts = 0;
    if (opt.timestamping)
      {
-      if (stat (u->local, &st) == 0)
+      boolean  local_file_exists = FALSE;
+
+      if (opt.backup_converted)
+       /* If -K is specified, we'll act on the assumption that it was specified
+          last time these files were downloaded as well, and instead of just
+          comparing local file X against server file X, we'll compare local
+          file X.orig (if extant, else X) against server file X.  If -K
+          _wasn't_ specified last time, or the server contains files called
+          *.orig, -N will be back to not operating correctly with -k. */
+       {
+         size_t filename_len = strlen(u->local);
+         char*  filename_plus_orig_suffix = malloc(filename_len +
+                                                   sizeof(".orig"));
+
+         /* Would a single s[n]printf() call be faster? */
+         strcpy(filename_plus_orig_suffix, u->local);
+         strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+
+         /* Try to stat() the .orig file. */
+         if (stat(filename_plus_orig_suffix, &st) == 0)
+           {
+             local_file_exists = TRUE;
+             local_dot_orig_file_exists = TRUE;
+           }
+
+         free(filename_plus_orig_suffix);
+       }      
+
+      if (!local_dot_orig_file_exists)
+       /* Couldn't stat() <file>.orig, so try to stat() <file>. */
+       if (stat (u->local, &st) == 0)
+         local_file_exists = TRUE;
+
+      if (local_file_exists)
+       /* There was a local file, so we'll check later to see if the version
+          the server has is the same version we already have, allowing us to
+          skip a download. */
         {
           use_ts = 1;
           tml = st.st_mtime;
@@ -1051,14 +1088,26 @@ Last-modified header invalid -- time-stamp ignored.\n"));
               if (tml >= tmr &&
                   (hstat.contlen == -1 || local_size == hstat.contlen))
                 {
-                 logprintf (LOG_VERBOSE, _("\
-Local file `%s' is more recent, not retrieving.\n\n"), u->local);
+                 if (local_dot_orig_file_exists)
+                   /* We can't collapse this down into just one logprintf()
+                      call with a variable set to u->local or the .orig
+                      filename because we have to malloc() space for the
+                      latter, and because there are multiple returns above (a
+                      coding style no-no by many measures, for reasons such as
+                      this) we'd have to remember to free() the string at each
+                      one to avoid a memory leak. */
+                   logprintf (LOG_VERBOSE, _("\
+Server file no newer than local file `%s.orig' -- not retrieving.\n\n"),
+                              u->local);
+                 else
+                   logprintf (LOG_VERBOSE, _("\
+Server file no newer than local file `%s' -- not retrieving.\n\n"), u->local);
                   FREEHSTAT (hstat);
                   return RETROK;
                 }
               else if (tml >= tmr)
                 logprintf (LOG_VERBOSE, _("\
-The sizes do not match (local %ld), retrieving.\n"), local_size);
+The sizes do not match (local %ld) -- retrieving.\n"), local_size);
               else
                 logputs (LOG_VERBOSE,
                          _("Remote file is newer, retrieving.\n"));
@@ -1103,12 +1152,13 @@ The sizes do not match (local %ld), retrieving.\n"), local_size);
             }
           ++opt.numurls;
           opt.downloaded += hstat.len;
+         downloaded_file(ADD_FILE, locf);
           return RETROK;
         }
        else if (hstat.res == 0) /* No read error */
         {
-         if (hstat.contlen == -1)  /* We don't know how much we were
-                                      supposed to get, so...  */
+         if (hstat.contlen == -1)  /* We don't know how much we were supposed
+                                      to get, so assume we succeeded. */ 
             {
               if (*dt & RETROKF)
                 {
@@ -1121,6 +1171,7 @@ The sizes do not match (local %ld), retrieving.\n"), local_size);
                 }
               ++opt.numurls;
               opt.downloaded += hstat.len;
+             downloaded_file(ADD_FILE, locf);
               return RETROK;
             }
           else if (hstat.len < hstat.contlen) /* meaning we lost the
@@ -1142,6 +1193,7 @@ The sizes do not match (local %ld), retrieving.\n"), local_size);
                          tms, u->url, hstat.len, hstat.contlen, locf, count);
               ++opt.numurls;
               opt.downloaded += hstat.len;
+             downloaded_file(ADD_FILE, locf);
               return RETROK;
             }
           else                  /* the same, but not accepted */
diff --git a/src/retr.c b/src/retr.c

index 592500547255f3c9f5afdfb4a0d70790618e8958..d90fa5296fa8a48e90d7c61ad47af7d3b38452c9 100644 (file)
--- a/src/retr.c
+++ b/src/retr.c
@@ -350,7 +350,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
        if (result != URLOK || u->proto != URLHTTP)
         {
           if (u->proto == URLHTTP)
-           logprintf (LOG_NOTQUIET, "Proxy %s: %s.\n", proxy, uerrmsg (result));
+           logprintf (LOG_NOTQUIET, "Proxy %s: %s.\n", proxy, uerrmsg(result));
           else
             logprintf (LOG_NOTQUIET, _("Proxy %s: Must be HTTP.\n"), proxy);
           freeurl (u, 1);
@@ -455,7 +455,8 @@ retrieve_from_file (const char *file, int html, int *count)
         }
        status = retrieve_url (cur_url->url, &filename, &new_file, NULL, &dt);
        if (opt.recursive && status == RETROK && (dt & TEXTHTML))
-       status = recursive_retrieve (filename, new_file ? new_file : cur_url->url);
+       status = recursive_retrieve (filename, new_file ? new_file
+                                                       : cur_url->url);
  
        if (filename && opt.delete_after && file_exists_p (filename))
         {
diff --git a/src/url.c b/src/url.c

index 87a97b30cec0a0d6f2448fe5fe80517df601bda6..33aa37b82bfa07099931d041cd5ec30244f7477d 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -1366,10 +1366,9 @@ no_proxy_match (const char *host, const char **no_proxy)
  void
  convert_links (const char *file, urlpos *l)
  {
-  FILE           *fp;
-  char           *buf, *p, *p2;
-  long           size;
-  static slist*  converted_files = NULL;
+  FILE *fp;
+  char *buf, *p, *p2;
+  long size;
  
    logprintf (LOG_VERBOSE, _("Converting %s... "), file);
    /* Read from the file....  */
@@ -1383,28 +1382,34 @@ convert_links (const char *file, urlpos *l)
    /* ...to a buffer.  */
    load_file (fp, &buf, &size);
    fclose (fp);
-  if (opt.backup_converted)
+  if (opt.backup_converted && downloaded_file(CHECK_FOR_FILE, file))
      /* Rather than just writing over the original .html file with the converted
-       version, save the former to *.orig. */
+       version, save the former to *.orig.  Note we only do this for files we've
+       _successfully_ downloaded, so we don't clobber .orig files sitting around
+       from previous invocations. */
      {
        /* Construct the backup filename as the original name plus ".orig". */
-      size_t filename_len = strlen(file);
-      char*  filename_plus_orig_suffix = malloc(filename_len + sizeof(".orig"));
-      int    already_wrote_backup_file = 0;
-      slist* converted_file_ptr;
-
+      size_t         filename_len = strlen(file);
+      char*          filename_plus_orig_suffix = malloc(filename_len +
+                                                       sizeof(".orig"));
+      boolean        already_wrote_backup_file = FALSE;
+      slist*         converted_file_ptr;
+      static slist*  converted_files = NULL;
+
+      /* Would a single s[n]printf() call be faster? */
        strcpy(filename_plus_orig_suffix, file);
        strcpy(filename_plus_orig_suffix + filename_len, ".orig");
  
        /* We can get called twice on the same URL thanks to the
          convert_all_links() call in main().  If we write the .orig file each
          time in such a case, it'll end up containing the first-pass conversion,
-        not the original file. */
+        not the original file.  So, see if we've already been called on this
+        file. */
        converted_file_ptr = converted_files;
        while (converted_file_ptr != NULL)
         if (strcmp(converted_file_ptr->string, file) == 0)
           {
-           already_wrote_backup_file = 1;
+           already_wrote_backup_file = TRUE;
             break;
           }
         else
@@ -1421,10 +1426,13 @@ convert_links (const char *file, urlpos *l)
              Note that we never free this memory since we need it till the
              convert_all_links() call, which is one of the last things the
              program does before terminating.  BTW, I'm not sure if it would be
-            safe to just set converted_file_ptr->string to file below, rather
-            than making a copy of the string... */
+            safe to just set 'converted_file_ptr->string' to 'file' below,
+            rather than making a copy of the string...  Another note is that I
+            thought I could just add a field to the urlpos structure saying
+            that we'd written a .orig file for this URL, but that didn't work,
+            so I had to make this separate list. */
           converted_file_ptr = malloc(sizeof(slist));
-         converted_file_ptr->string = strdup(file);
+         converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
           converted_file_ptr->next = converted_files;
           converted_files = converted_file_ptr;
         }
@@ -1440,6 +1448,8 @@ convert_links (const char *file, urlpos *l)
        free (buf);
        return;
      }
+  /* [If someone understands why multiple URLs can correspond to one local file,
+     can they please add a comment here...?] */
    for (p = buf; l; l = l->next)
      {
        if (l->pos >= size)
@@ -1547,3 +1557,44 @@ add_url (urlpos *l, const char *url, const char *file)
    t->next = l;
    return t;
  }
+
+
+/* Remembers which files have been downloaded.  Should be called with
+   add_or_check == ADD_FILE for each file we actually download successfully
+   (i.e. not for ones we have failures on or that we skip due to -N).  If you
+   just want to check if a file has been previously added without adding it,
+   call with add_or_check == CHECK_FOR_FILE.  Please be sure to call this
+   function with local filenames, not remote URLs -- by some means that isn't
+   commented well enough for me understand, multiple remote URLs can apparently
+   correspond to a single local file. */
+boolean
+downloaded_file (downloaded_file_t  add_or_check, const char*  file)
+{
+  boolean        found_file = FALSE;
+  static slist*  downloaded_files = NULL;
+  slist*         rover = downloaded_files;
+
+  while (rover != NULL)
+    if (strcmp(rover->string, file) == 0)
+      {
+       found_file = TRUE;
+       break;
+      }
+    else
+      rover = rover->next;
+
+  if (found_file)
+    return TRUE;  /* file had already been downloaded */
+  else
+    {
+      if (add_or_check == ADD_FILE)
+       {
+         rover = malloc(sizeof(slist));
+         rover->string = xstrdup(file);  /* die on out-of-mem. */
+         rover->next = downloaded_files;
+         downloaded_files = rover;
+       }
+
+      return FALSE;  /* file had not already been downloaded */
+    }
+}
diff --git a/src/url.h b/src/url.h

index 771f17a4e89776ebbca6c49cb300f84c3f65d42e..87044cdd497c9309d54bb15a5df376cd4936e57c 100644 (file)
--- a/src/url.h
+++ b/src/url.h
@@ -62,6 +62,12 @@ typedef struct _urlpos
    struct _urlpos *next;        /* Next struct in list */
  } urlpos;
  
+/* Controls how downloaded_file() behaves. */
+typedef enum
+{
+  ADD_FILE,
+  CHECK_FOR_FILE
+} downloaded_file_t;
  
  /* Function declarations */
  
@@ -95,4 +101,6 @@ int no_proxy_match PARAMS ((const char *, const char **));
  void convert_links PARAMS ((const char *, urlpos *));
  urlpos *add_url PARAMS ((urlpos *, const char *, const char *));
  
+boolean downloaded_file PARAMS ((downloaded_file_t, const char *));
+
  #endif /* URL_H */
diff --git a/src/wget.h b/src/wget.h

index cdc301cfced6a39e10c72a7868ab530d30a925fd..4467f5fbd6829a787f62d89b7855a2f22fc77368 100644 (file)
--- a/src/wget.h
+++ b/src/wget.h
@@ -198,4 +198,12 @@ typedef enum
    ROBOTSOK, NOROBOTS, PROXERR, AUTHFAILED, QUOTEXC, WRITEFAILED
  } uerr_t;
  
+typedef unsigned char  boolean;
+#ifndef FALSE
+#define FALSE 0
+#endif
+#ifndef TRUE
+#define TRUE  1
+#endif
+
  #endif /* WGET_H */
author	dan <devnull@localhost>
	Thu, 2 Mar 2000 06:33:48 +0000 (22:33 -0800)
committer	dan <devnull@localhost>
	Thu, 2 Mar 2000 06:33:48 +0000 (22:33 -0800)
ChangeLog		patch \| blob \| history
NEWS		patch \| blob \| history
TODO		patch \| blob \| history
src/ChangeLog		patch \| blob \| history
src/ftp.c		patch \| blob \| history
src/http.c		patch \| blob \| history
src/retr.c		patch \| blob \| history
src/url.c		patch \| blob \| history
src/url.h		patch \| blob \| history
src/wget.h		patch \| blob \| history