From: dan
Date: Thu, 2 Mar 2000 06:33:48 +0000 (-0800)
Subject: [svn] Implemented the item I formerly had in the TODO: When -K and -N are used
X-Git-Tag: v1.13~2519
X-Git-Url: http://sjero.net/git/?p=wget;a=commitdiff_plain;h=4331c39c9ad0a942bd2b1eb8663e762d2403b90d

[svn] Implemented the item I formerly had in the TODO: When -K and -N are used
together, we compare local file X.orig (if extant) against server file X.
Previously -k and -N were worthless in combination because the local converted
files always differed from the server versions.
---

diff --git a/ChangeLog b/ChangeLog
index 92039a92..6edc5cbc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2000-03-01  Dan Harkless
+
+        * NEWS (-K): Now possible to use -N with -k thanks to this option.
+
+        * TODO: Removed the -K / -N interaction item.
+
 2000-02-29  Dan Harkless
 
         * NEWS (-K / --backup-converted): Mentioned this new option.
diff --git a/NEWS b/NEWS
index 35c5dc41..eeab6a3e 100644
--- a/NEWS
+++ b/NEWS
@@ -8,7 +8,8 @@ Please send GNU Wget bug reports to .
 
 * Changes in Wget 1.5.3+dev
 
 ** New -K / --backup-converted / backup_converted = on option causes files
-modified due to -k to be saved with a .orig suffix before being changed.
+modified due to -k to be saved with a .orig suffix before being changed.  When
+using -N as well, it is these .orig files that are compared against the server.
 
 * Wget 1.5.3 is a bugfix release with no user-visible changes.
diff --git a/TODO b/TODO
index f25ac4f2..9f547699 100644
--- a/TODO
+++ b/TODO
@@ -76,6 +76,3 @@ particular order.  Not all of them are user-visible changes.
 * Implement more HTTP/1.1 bells and whistles (ETag, Content-MD5 etc.)
 
 * Support SSL encryption through SSLeay or OpenSSL.
-
-* When -K is used with -N, check local file X.orig (if extant) against server
-  file X.
diff --git a/src/ChangeLog b/src/ChangeLog
index 36613747..54704959 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,35 @@
+2000-03-01  Dan Harkless
+
+        * ftp.c (ftp_loop_internal): Call new downloaded_file() function,
+        even though we don't do conversion on HTML files retrieved via
+        FTP, so _current_ usage of downloaded_file() makes this call unneeded.
+        (ftp_retrieve_list): Added a comment saying where we need to
+        stat() a .orig file if FTP'd HTML file conversion is ever implemented.
+        (ftp_retrieve_list): "Local file '%s' is more recent," is sometimes
+        a lie -- reworded as "Server file no newer than local file '%s' --".
+
+        * http.c (http_loop): Fixed a typo and clarified a comment.
+        (http_loop): When -K and -N are specified together, compare size
+        and timestamp of server file X against local file X.orig (if
+        extant) rather than converted local file X.
+        (http_loop): "Local file '%s' is more recent," is sometimes a lie
+        -- reworded as "Server file no newer than local file '%s' --".
+        (http_loop): Call new downloaded_file() function to prevent
+        wrongful overwriting of .orig file when -N is specified.
+
+        * url.c (convert_links): When -K specified, only rename X to
+        X.orig if downloaded_file() returns TRUE.  Otherwise when we skip
+        file X due to -N, we clobber an X.orig from a previous invocation.
+        (convert_links): Call the failsafe xstrdup(), not the real strdup().
+        (convert_links): Added a note asking anyone who understands how
+        multiple URLs can correspond to a single file to comment it.
+        (downloaded_file): Added this new function.
+
+        * url.h (downloaded_file): Added prototype for this new function
+        as well as its downloaded_file_t enum type.
+
+        * wget.h (boolean): Added this new typedef and TRUE and FALSE #defines.
+
 2000-02-29  Dan Harkless
 
         * version.c: Upped version to developer-only "1.5.3+dev".
diff --git a/src/ftp.c b/src/ftp.c
index 61507db4..9077c674 100644
--- a/src/ftp.c
+++ b/src/ftp.c
@@ -947,6 +947,11 @@ ftp_loop_internal (struct urlinfo *u, struct fileinfo *f, ccon *con)
           /* Not as great.  */
           abort ();
         }
+
+      /* If we get out of the switch above without continue'ing, we've
+         successfully downloaded a file.  Remember this fact. */
+      downloaded_file(ADD_FILE, locf);
+
       if (con->st & ON_YOUR_OWN)
         {
           CLOSE (RBUF_FD (&con->rbuf));
@@ -1086,6 +1091,11 @@ ftp_retrieve_list (struct urlinfo *u, struct fileinfo *f, ccon *con)
       if (opt.timestamping && f->type == FT_PLAINFILE)
         {
           struct stat st;
+          /* If conversion of HTML files retrieved via FTP is ever implemented,
+             we'll need to stat() .orig here when -K has been specified.
+             I'm not implementing it now since files on an FTP server are much
+             more likely than files on an HTTP server to legitimately have a
+             .orig suffix. */
           if (!stat (u->local, &st))
             {
               /* Else, get it from the file.  */
@@ -1094,13 +1104,13 @@ ftp_retrieve_list (struct urlinfo *u, struct fileinfo *f, ccon *con)
           if (local_size == f->size && tml >= f->tstamp)
             {
               logprintf (LOG_VERBOSE, _("\
-Local file `%s' is more recent, not retrieving.\n\n"), u->local);
+Server file no newer than local file `%s' -- not retrieving.\n\n"), u->local);
               dlthis = 0;
             }
           else if (local_size != f->size)
             {
               logprintf (LOG_VERBOSE, _("\
-The sizes do not match (local %ld), retrieving.\n"), local_size);
+The sizes do not match (local %ld) -- retrieving.\n"), local_size);
             }
         }
     } /* opt.timestamping && f->type == FT_PLAINFILE */
diff --git a/src/http.c b/src/http.c
index 9ea0670f..c79e6fec 100644
--- a/src/http.c
+++ b/src/http.c
@@ -840,6 +840,7 @@ http_loop (struct urlinfo *u, char **newloc, int *dt)
   static int first_retrieval = 1;
 
   int count;
+  int local_dot_orig_file_exists = FALSE;
   int use_ts, got_head = 0;     /* time-stamping info */
   char *tms, *suf, *locf, *tmrate;
   uerr_t err;
@@ -851,7 +852,7 @@ http_loop (struct urlinfo *u, char **newloc, int *dt)
   *newloc = NULL;
 
   /* Warn on (likely bogus) wildcard usage in HTTP.  Don't use
-     has_wildcards_p because it would also warn on `?', and we that
+     has_wildcards_p because it would also warn on `?', and we know that
      shows up in CGI paths a *lot*.  */
   if (strchr (u->url, '*'))
     logputs (LOG_VERBOSE, _("Warning: wildcards not supported in HTTP.\n"));
@@ -888,7 +889,43 @@ File `%s' already there, will not retrieve.\n"), u->local);
   use_ts = 0;
   if (opt.timestamping)
     {
-      if (stat (u->local, &st) == 0)
+      boolean local_file_exists = FALSE;
+
+      if (opt.backup_converted)
+        /* If -K is specified, we'll act on the assumption that it was specified
+           last time these files were downloaded as well, and instead of just
+           comparing local file X against server file X, we'll compare local
+           file X.orig (if extant, else X) against server file X.  If -K
+           _wasn't_ specified last time, or the server contains files called
+           *.orig, -N will be back to not operating correctly with -k. */
+        {
+          size_t filename_len = strlen(u->local);
+          char* filename_plus_orig_suffix = malloc(filename_len +
+                                                   sizeof(".orig"));
+
+          /* Would a single s[n]printf() call be faster? */
+          strcpy(filename_plus_orig_suffix, u->local);
+          strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+
+          /* Try to stat() the .orig file. */
+          if (stat(filename_plus_orig_suffix, &st) == 0)
+            {
+              local_file_exists = TRUE;
+              local_dot_orig_file_exists = TRUE;
+            }
+
+          free(filename_plus_orig_suffix);
+        }
+
+      if (!local_dot_orig_file_exists)
+        /* Couldn't stat() <file>.orig, so try to stat() <file>. */
+        if (stat (u->local, &st) == 0)
+          local_file_exists = TRUE;
+
+      if (local_file_exists)
+        /* There was a local file, so we'll check later to see if the version
+           the server has is the same version we already have, allowing us to
+           skip a download. */
         {
           use_ts = 1;
           tml = st.st_mtime;
@@ -1051,14 +1088,26 @@ Last-modified header invalid -- time-stamp ignored.\n"));
           if (tml >= tmr &&
               (hstat.contlen == -1 || local_size == hstat.contlen))
             {
-              logprintf (LOG_VERBOSE, _("\
-Local file `%s' is more recent, not retrieving.\n\n"), u->local);
+              if (local_dot_orig_file_exists)
+                /* We can't collapse this down into just one logprintf()
+                   call with a variable set to u->local or the .orig
+                   filename because we have to malloc() space for the
+                   latter, and because there are multiple returns above (a
+                   coding style no-no by many measures, for reasons such as
+                   this) we'd have to remember to free() the string at each
+                   one to avoid a memory leak. */
+                logprintf (LOG_VERBOSE, _("\
+Server file no newer than local file `%s.orig' -- not retrieving.\n\n"),
+                           u->local);
+              else
+                logprintf (LOG_VERBOSE, _("\
+Server file no newer than local file `%s' -- not retrieving.\n\n"), u->local);
               FREEHSTAT (hstat);
               return RETROK;
             }
           else if (tml >= tmr)
             logprintf (LOG_VERBOSE, _("\
-The sizes do not match (local %ld), retrieving.\n"), local_size);
+The sizes do not match (local %ld) -- retrieving.\n"), local_size);
           else
             logputs (LOG_VERBOSE, _("Remote file is newer, retrieving.\n"));
 
@@ -1103,12 +1152,13 @@ The sizes do not match (local %ld), retrieving.\n"), local_size);
             }
           ++opt.numurls;
           opt.downloaded += hstat.len;
+          downloaded_file(ADD_FILE, locf);
           return RETROK;
         }
       else if (hstat.res == 0) /* No read error */
         {
-          if (hstat.contlen == -1)  /* We don't know how much we were
-                                       supposed to get, so... */
+          if (hstat.contlen == -1)  /* We don't know how much we were supposed
+                                       to get, so assume we succeeded. */
             {
               if (*dt & RETROKF)
                 {
@@ -1121,6 +1171,7 @@ The sizes do not match (local %ld), retrieving.\n"), local_size);
             }
           ++opt.numurls;
           opt.downloaded += hstat.len;
+          downloaded_file(ADD_FILE, locf);
           return RETROK;
         }
       else if (hstat.len < hstat.contlen) /* meaning we lost the
@@ -1142,6 +1193,7 @@ The sizes do not match (local %ld), retrieving.\n"), local_size);
                      tms, u->url, hstat.len, hstat.contlen, locf, count);
           ++opt.numurls;
           opt.downloaded += hstat.len;
+          downloaded_file(ADD_FILE, locf);
           return RETROK;
         }
       else /* the same, but not accepted */
diff --git a/src/retr.c b/src/retr.c
index 59250054..d90fa529 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -350,7 +350,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   if (result != URLOK || u->proto != URLHTTP)
     {
       if (u->proto == URLHTTP)
-        logprintf (LOG_NOTQUIET, "Proxy %s: %s.\n", proxy, uerrmsg (result));
+        logprintf (LOG_NOTQUIET, "Proxy %s: %s.\n", proxy, uerrmsg(result));
       else
         logprintf (LOG_NOTQUIET, _("Proxy %s: Must be HTTP.\n"), proxy);
       freeurl (u, 1);
@@ -455,7 +455,8 @@ retrieve_from_file (const char *file, int html, int *count)
         }
       status = retrieve_url (cur_url->url, &filename, &new_file, NULL, &dt);
       if (opt.recursive && status == RETROK && (dt & TEXTHTML))
-        status = recursive_retrieve (filename, new_file ? new_file : cur_url->url);
+        status = recursive_retrieve (filename, new_file ? new_file
+                                     : cur_url->url);
 
       if (filename && opt.delete_after && file_exists_p (filename))
         {
diff --git a/src/url.c b/src/url.c
index 87a97b30..33aa37b8 100644
--- a/src/url.c
+++ b/src/url.c
@@ -1366,10 +1366,9 @@ no_proxy_match (const char *host, const char **no_proxy)
 void
 convert_links (const char *file, urlpos *l)
 {
-  FILE          *fp;
-  char          *buf, *p, *p2;
-  long          size;
-  static slist* converted_files = NULL;
+  FILE *fp;
+  char *buf, *p, *p2;
+  long size;
 
   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
   /* Read from the file.... */
@@ -1383,28 +1382,34 @@ convert_links (const char *file, urlpos *l)
   /* ...to a buffer.  */
   load_file (fp, &buf, &size);
   fclose (fp);
-  if (opt.backup_converted)
+  if (opt.backup_converted && downloaded_file(CHECK_FOR_FILE, file))
     /* Rather than just writing over the original .html file with the converted
-       version, save the former to *.orig. */
+       version, save the former to *.orig.  Note we only do this for files we've
+       _successfully_ downloaded, so we don't clobber .orig files sitting around
+       from previous invocations. */
     {
       /* Construct the backup filename as the original name plus ".orig". */
-      size_t filename_len = strlen(file);
-      char*  filename_plus_orig_suffix = malloc(filename_len + sizeof(".orig"));
-      int    already_wrote_backup_file = 0;
-      slist* converted_file_ptr;
-
+      size_t filename_len = strlen(file);
+      char* filename_plus_orig_suffix = malloc(filename_len +
+                                               sizeof(".orig"));
+      boolean already_wrote_backup_file = FALSE;
+      slist* converted_file_ptr;
+      static slist* converted_files = NULL;
+
+      /* Would a single s[n]printf() call be faster? */
       strcpy(filename_plus_orig_suffix, file);
       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
 
       /* We can get called twice on the same URL thanks to the
          convert_all_links() call in main().  If we write the .orig file each
          time in such a case, it'll end up containing the first-pass conversion,
-         not the original file. */
+         not the original file.  So, see if we've already been called on this
+         file. */
       converted_file_ptr = converted_files;
       while (converted_file_ptr != NULL)
         if (strcmp(converted_file_ptr->string, file) == 0)
           {
-            already_wrote_backup_file = 1;
+            already_wrote_backup_file = TRUE;
            break;
           }
         else
@@ -1421,10 +1426,13 @@ convert_links (const char *file, urlpos *l)
              Note that we never free this memory since we need it till the
              convert_all_links() call, which is one of the last things the
              program does before terminating.  BTW, I'm not sure if it would be
-             safe to just set converted_file_ptr->string to file below, rather
-             than making a copy of the string... */
+             safe to just set 'converted_file_ptr->string' to 'file' below,
+             rather than making a copy of the string...  Another note is that I
+             thought I could just add a field to the urlpos structure saying
+             that we'd written a .orig file for this URL, but that didn't work,
+             so I had to make this separate list. */
           converted_file_ptr = malloc(sizeof(slist));
-          converted_file_ptr->string = strdup(file);
+          converted_file_ptr->string = xstrdup(file);   /* die on out-of-mem. */
           converted_file_ptr->next = converted_files;
           converted_files = converted_file_ptr;
         }
@@ -1440,6 +1448,8 @@ convert_links (const char *file, urlpos *l)
       free (buf);
       return;
     }
+  /* [If someone understands why multiple URLs can correspond to one local file,
+     can they please add a comment here...?] */
   for (p = buf; l; l = l->next)
     {
       if (l->pos >= size)
@@ -1547,3 +1557,44 @@ add_url (urlpos *l, const char *url, const char *file)
   t->next = l;
   return t;
 }
+
+
+/* Remembers which files have been downloaded.  Should be called with
+   add_or_check == ADD_FILE for each file we actually download successfully
+   (i.e. not for ones we have failures on or that we skip due to -N).  If you
+   just want to check if a file has been previously added without adding it,
+   call with add_or_check == CHECK_FOR_FILE.  Please be sure to call this
+   function with local filenames, not remote URLs -- by some means that isn't
+   commented well enough for me to understand, multiple remote URLs can
+   apparently correspond to a single local file. */
+boolean
+downloaded_file (downloaded_file_t add_or_check, const char* file)
+{
+  boolean found_file = FALSE;
+  static slist* downloaded_files = NULL;
+  slist* rover = downloaded_files;
+
+  while (rover != NULL)
+    if (strcmp(rover->string, file) == 0)
+      {
+        found_file = TRUE;
+        break;
+      }
+    else
+      rover = rover->next;
+
+  if (found_file)
+    return TRUE;  /* file had already been downloaded */
+  else
+    {
+      if (add_or_check == ADD_FILE)
+        {
+          rover = malloc(sizeof(slist));
+          rover->string = xstrdup(file);  /* die on out-of-mem. */
+          rover->next = downloaded_files;
+          downloaded_files = rover;
+        }
+
+      return FALSE; /* file had not already been downloaded */
+    }
+}
diff --git a/src/url.h b/src/url.h
index 771f17a4..87044cdd 100644
--- a/src/url.h
+++ b/src/url.h
@@ -62,6 +62,12 @@ typedef struct _urlpos
   struct _urlpos *next;         /* Next struct in list */
 } urlpos;
 
+/* Controls how downloaded_file() behaves. */
+typedef enum
+{
+  ADD_FILE,
+  CHECK_FOR_FILE
+} downloaded_file_t;
 
 /* Function declarations */
 
@@ -95,4 +101,6 @@ int no_proxy_match PARAMS ((const char *, const char **));
 void convert_links PARAMS ((const char *, urlpos *));
 urlpos *add_url PARAMS ((urlpos *, const char *, const char *));
 
+boolean downloaded_file PARAMS ((downloaded_file_t, const char *));
+
 #endif /* URL_H */
diff --git a/src/wget.h b/src/wget.h
index cdc301cf..4467f5fb 100644
--- a/src/wget.h
+++ b/src/wget.h
@@ -198,4 +198,12 @@ typedef enum
   ROBOTSOK, NOROBOTS, PROXERR, AUTHFAILED, QUOTEXC, WRITEFAILED
 } uerr_t;
 
+typedef unsigned char boolean;
+#ifndef FALSE
+#define FALSE 0
+#endif
+#ifndef TRUE
+#define TRUE 1
+#endif
+
 #endif /* WGET_H */
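
For readers who want the -K / -N interaction in one place rather than spread
across the http.c hunks, here is a minimal standalone sketch of the decision
the patch implements.  should_download(), server_size and server_tstamp are
names invented for this sketch, not wget identifiers, and the real code
additionally treats an unknown Content-Length (hstat.contlen == -1) as a size
match, as the hunk above shows.

    #include <stdlib.h>
    #include <string.h>
    #include <sys/stat.h>
    #include <time.h>

    /* Return nonzero if the server's copy should be retrieved: i.e. if no
       local file exists (X.orig when -K / backup_converted is on, else X),
       if the sizes differ, or if the server's copy is strictly newer. */
    static int
    should_download (const char *local, long server_size,
                     time_t server_tstamp, int backup_converted)
    {
      struct stat st;
      int have_local = 0;

      if (backup_converted)
        {
          /* -K: compare against the pristine X.orig, not the -k-converted X. */
          size_t len = strlen (local);
          char *orig = malloc (len + sizeof (".orig"));
          strcpy (orig, local);
          strcpy (orig + len, ".orig");
          have_local = (stat (orig, &st) == 0);
          free (orig);
        }
      if (!have_local)
        have_local = (stat (local, &st) == 0);

      if (!have_local)
        return 1;                          /* nothing to compare against */
      if (st.st_size != server_size)
        return 1;                          /* sizes do not match -- retrieve */
      return st.st_mtime < server_tstamp;  /* retrieve only if server newer */
    }

The ordering is the whole point: a -k-converted X essentially always differs
from the server's copy, so checking X.orig first is what makes -N useful again
alongside -k and -K.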
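The new code twice carries the comment "Would a single s[n]printf() call be
faster?".  For what it's worth, the single-call equivalent of the two strcpy()
calls would look like the following (a sketch only; whether it actually beats
two strcpy() calls is exactly the open question the comment poses, since
sprintf() must parse its format string at run time):

    char *filename_plus_orig_suffix = malloc (strlen (file) + sizeof (".orig"));
    sprintf (filename_plus_orig_suffix, "%s.orig", file);

The buffer is sized exactly -- sizeof(".orig") counts the terminating NUL --
so plain sprintf() cannot overflow it here.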
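Finally, the intended call pattern for the new downloaded_file() bookkeeping,
condensed from the ftp.c, http.c and url.c hunks above.  rename_to_orig() is a
hypothetical stand-in for the backup work convert_links() actually performs
inline:

    /* After each successful retrieval (see ftp_loop_internal() and the
       three http_loop() return paths above): */
    downloaded_file (ADD_FILE, locf);

    /* Later, in convert_links(): only back up files downloaded during
       *this* run, so that when -N skips the download, a .orig file left
       over from a previous invocation is not clobbered: */
    if (opt.backup_converted && downloaded_file (CHECK_FOR_FILE, file))
      rename_to_orig (file);   /* hypothetical stand-in for the real backup */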