/* A list of unsafe characters for encoding, as per RFC1738. '@' and
':' (not listed in RFC) were added because of user/password
- encoding, and \033 for safe printing. */
+ encoding. */
#ifndef WINDOWS
-# define URL_UNSAFE " <>\"#%{}|\\^~[]`@:\033"
+# define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
#else /* WINDOWS */
-# define URL_UNSAFE " <>\"%{}|\\^[]`\033"
+# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
#endif /* WINDOWS */
+#define UNSAFE_CHAR(c) ( ((unsigned char)(c) <= ' ') /* control chars and space (<= ASCII 32) */ \
+ || ((unsigned char)(c) > '~') /* DEL and non-ASCII bytes (>= ASCII 127) */ \
+ || strchr (URL_UNSAFE_CHARS, c))
+
/* If S contains unsafe characters, free it and replace it with a
version that doesn't. */
#define URL_CLEANSE(s) do \
{
int i;
- if (toupper (url[0]) == 'U'
- && toupper (url[1]) == 'R'
- && toupper (url[2]) == 'L'
+ if (TOUPPER (url[0]) == 'U'
+ && TOUPPER (url[1]) == 'R'
+ && TOUPPER (url[2]) == 'L'
&& url[3] == ':')
{
/* Skip blanks. */
contains_unsafe (const char *s)
{
for (; *s; s++)
- if (strchr (URL_UNSAFE, *s))
+ if (UNSAFE_CHAR (*s))
return 1;
return 0;
}
*p = '\0';
}
-/* Encodes the unsafe characters (listed in URL_UNSAFE) in a given
- string, returning a malloc-ed %XX encoded string. */
+/* Encode the unsafe characters (as determined by URL_UNSAFE) in a
+ given string, returning a malloc-ed %XX encoded string. */
char *
encode_string (const char *s)
{
b = s;
for (i = 0; *s; s++, i++)
- if (strchr (URL_UNSAFE, *s))
+ if (UNSAFE_CHAR (*s))
i += 2; /* Two more characters (hex digits) */
res = (char *)xmalloc (i + 1);
s = b;
for (p = res; *s; s++)
- if (strchr (URL_UNSAFE, *s))
+ if (UNSAFE_CHAR (*s))
{
const unsigned char c = *s;
*p++ = '%';
{
u->ftp_type = process_ftp_type (u->path);
/* #### We don't handle type `d' correctly yet. */
- if (!u->ftp_type || toupper (u->ftp_type) == 'D')
+ if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
u->ftp_type = 'I';
}
DEBUGP (("opath %s -> ", u->path));
{
char *res, *host, *user, *passwd, *proto_name, *dir, *file;
int i, l, ln, lu, lh, lp, lf, ld;
+ unsigned short proto_default_port;
/* Look for the protocol name. */
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
if (i == ARRAY_SIZE (sup_protos))
return NULL;
proto_name = sup_protos[i].name;
+ proto_default_port = sup_protos[i].port;
host = CLEANDUP (u->host);
dir = CLEANDUP (u->dir);
file = CLEANDUP (u->file);
}
memcpy (res + l, host, lh);
l += lh;
- res[l++] = ':';
- long_to_string (res + l, (long)u->port);
- l += numdigit (u->port);
+ if (u->port != proto_default_port)
+ {
+ res[l++] = ':';
+ long_to_string (res + l, (long)u->port);
+ l += numdigit (u->port);
+ }
res[l++] = '/';
memcpy (res + l, dir, ld);
l += ld;
If SILENT is non-zero, do not barf on baseless relative links. */
urlpos *
-get_urls_html (const char *file, const char *this_url, int silent)
+get_urls_html (const char *file, const char *this_url, int silent,
+ int dash_p_leaf_HTML)
{
long nread;
FILE *fp;
first_time = 1;
/* Iterate over the URLs in BUF, picked by htmlfindurl(). */
for (buf = orig_buf;
- (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time));
+ (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time,
+ dash_p_leaf_HTML));
buf += step)
{
int i, no_proto;
void
convert_links (const char *file, urlpos *l)
{
- FILE *fp;
- char *buf, *p, *p2;
- long size;
- static slist* converted_files = NULL;
+ FILE *fp;
+ char *buf, *p, *p2;
+ downloaded_file_t downloaded_file_return;
+ long size;
logprintf (LOG_VERBOSE, _("Converting %s... "), file);
/* Read from the file.... */
/* ...to a buffer. */
load_file (fp, &buf, &size);
fclose (fp);
- if (opt.backup_converted)
+
+ downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file);
+
+ if (opt.backup_converted && downloaded_file_return)
/* Rather than just writing over the original .html file with the converted
- version, save the former to *.orig. */
+ version, save the former to *.orig. Note we only do this for files we've
+ _successfully_ downloaded, so we don't clobber .orig files sitting around
+ from previous invocations. */
{
/* Construct the backup filename as the original name plus ".orig". */
- size_t filename_len = strlen(file);
- char* filename_plus_orig_suffix = malloc(filename_len + sizeof(".orig"));
- int already_wrote_backup_file = 0;
- slist* converted_file_ptr;
+ size_t filename_len = strlen(file);
+ char* filename_plus_orig_suffix;
+ boolean already_wrote_backup_file = FALSE;
+ slist* converted_file_ptr;
+ static slist* converted_files = NULL;
- strcpy(filename_plus_orig_suffix, file);
- strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+ if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
+ {
+ /* Just write "orig" over "html". We need to do it this way because
+ when we're checking to see if we've downloaded the file before (to
+ see if we can skip downloading it), we don't know if it's a
+ text/html file. Therefore we don't know yet at that stage that -E
+ is going to cause us to tack on ".html", so we need to compare
+ vs. the original URL plus ".orig", not the original URL plus
+ ".html.orig". */
+ filename_plus_orig_suffix = xmalloc(filename_len + 1);
+ strcpy(filename_plus_orig_suffix, file);
+ strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
+ }
+ else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
+ {
+ /* Append ".orig" to the name. */
+ filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig"));
+ strcpy(filename_plus_orig_suffix, file);
+ strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+ }
/* We can get called twice on the same URL thanks to the
convert_all_links() call in main(). If we write the .orig file each
time in such a case, it'll end up containing the first-pass conversion,
- not the original file. */
+ not the original file. So, see if we've already been called on this
+ file. */
converted_file_ptr = converted_files;
while (converted_file_ptr != NULL)
if (strcmp(converted_file_ptr->string, file) == 0)
{
- already_wrote_backup_file = 1;
+ already_wrote_backup_file = TRUE;
break;
}
else
Note that we never free this memory since we need it till the
convert_all_links() call, which is one of the last things the
program does before terminating. BTW, I'm not sure if it would be
- safe to just set converted_file_ptr->string to file below, rather
- than making a copy of the string... */
- converted_file_ptr = malloc(sizeof(slist));
- converted_file_ptr->string = strdup(file);
+ safe to just set 'converted_file_ptr->string' to 'file' below,
+ rather than making a copy of the string... Another note is that I
+ thought I could just add a field to the urlpos structure saying
+ that we'd written a .orig file for this URL, but that didn't work,
+ so I had to make this separate list. */
+ converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
+ converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
converted_file_ptr->next = converted_files;
converted_files = converted_file_ptr;
}
free (buf);
return;
}
+ /* Presumably we have to loop through multiple URLs here (even though we're
+ only talking about a single local file) because of the -O option. */
for (p = buf; l; l = l->next)
{
if (l->pos >= size)
for (p2 = buf + l->pos; p < p2; p++)
putc (*p, fp);
if (l->flags & UABS2REL)
+ /* Convert absolute URL to relative. */
{
char *newname = construct_relative (file, l->local_name);
fprintf (fp, "%s", newname);
}
p += l->size;
}
+ /* Output the rest of the file. */
if (p - buf < size)
{
for (p2 = buf + size; p < p2; p++)
t->next = l;
return t;
}
+
+
+/* Remembers which files have been downloaded. In the standard case, should be
+ called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
+ download successfully (i.e. not for ones we have failures on or that we skip
+ due to -N).
+
+ When we've downloaded a file and tacked on a ".html" extension due to -E,
+ call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
+ FILE_DOWNLOADED_NORMALLY.
+
+ If you just want to check if a file has been previously added without adding
+ it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
+ with local filenames, not remote URLs. */
+downloaded_file_t
+downloaded_file (downloaded_file_t mode, const char* file)
+{
+ typedef struct _downloaded_file_list
+ {
+ char* file;
+ downloaded_file_t download_type;
+ struct _downloaded_file_list* next;
+ } downloaded_file_list;
+
+ boolean found_file = FALSE;
+ static downloaded_file_list* downloaded_files = NULL;
+ downloaded_file_list* rover = downloaded_files;
+
+ while (rover != NULL)
+ if (strcmp(rover->file, file) == 0)
+ {
+ found_file = TRUE;
+ break;
+ }
+ else
+ rover = rover->next;
+
+ if (found_file)
+ return rover->download_type; /* file had already been downloaded */
+ else
+ {
+ if (mode != CHECK_FOR_FILE)
+ {
+ rover = xmalloc(sizeof(*rover));
+ rover->file = xstrdup(file); /* use xstrdup() so we die on out-of-mem. */
+ rover->download_type = mode;
+ rover->next = downloaded_files;
+ downloaded_files = rover;
+ }
+
+ return FILE_NOT_ALREADY_DOWNLOADED;
+ }
+}