/* URL handling.
- Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
+ Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
#endif /* WINDOWS */
-#define UNSAFE_CHAR(c) (((c) >= 0 && (c) <= 32) \
+#define UNSAFE_CHAR(c) ( ((unsigned char)(c) <= ' ') /* ASCII 32 */ \
+ || ((unsigned char)(c) > '~') /* ASCII 127 */ \
|| strchr (URL_UNSAFE_CHARS, c))
/* If S contains unsafe characters, free it and replace it with a
/* Is a directory ".."? */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
+#if 0
+static void path_simplify_with_kludge PARAMS ((char *));
+#endif
+static int urlpath_length PARAMS ((const char *));
+
/* NULL-terminated list of strings to be recognized as prototypes (URL
schemes). Note that recognized doesn't mean supported -- only HTTP
and FTP are currently supported.
*p = '\0';
}
-/* Encodes the unsafe characters (listed in URL_UNSAFE_CHARS) in a
+/* Encode the unsafe characters (as determined by URL_UNSAFE) in a
given string, returning a malloc-ed %XX encoded string. */
char *
encode_string (const char *s)
}
/* If protocol is recognizable, but unsupported, bail out, else
suppose unknown. */
- if (recognizable && !sup_protos[i].name)
+ if (recognizable && i == ARRAY_SIZE (sup_protos))
return URLUNKNOWN;
else if (i == ARRAY_SIZE (sup_protos))
type = URLUNKNOWN;
strcat (u->path, *u->dir ? "/" : "");
strcat (u->path, u->file);
URL_CLEANSE (u->path);
+ DEBUGP (("newpath: %s\n", u->path));
/* Create the clean URL. */
u->url = str_url (u, 0);
return URLOK;
}
\f
+/* Special versions of DOTP and DDOTP for parse_dir(): unlike the
+   plain versions, they also accept "." and ".." when followed by a
+   query string ("?...").  */
+
+/* Is X the path component "." (or ".?query")?  */
+#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
+/* Is X the path component ".." (or "..?query")?  The second dot must
+   be tested at offset 1; testing *(x) twice, as the old code did,
+   made any string whose first char is '.' and whose third char is
+   '\0' or '?' (e.g. ".x") look like "..".  */
+#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
+                     && (!*((x) + 2) || *((x) + 2) == '?'))
+
/* Build the directory and filename components of the path. Both
components are *separately* malloc-ed strings! It does not change
the contents of path.
{
int i, l;
- for (i = l = strlen (path); i && path[i] != '/'; i--);
+ l = urlpath_length (path);
+ for (i = l; i && path[i] != '/'; i--);
+
if (!i && *path != '/') /* Just filename */
{
- if (DOTP (path) || DDOTP (path))
+ if (PD_DOTP (path) || PD_DDOTP (path))
{
- *dir = xstrdup (path);
- *file = xstrdup ("");
+ *dir = strdupdelim (path, path + l);
+ *file = xstrdup (path + l); /* normally empty, but could
+ contain ?... */
}
else
{
}
else if (!i) /* /filename */
{
- if (DOTP (path + 1) || DDOTP (path + 1))
+ if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
{
- *dir = xstrdup (path);
- *file = xstrdup ("");
+ *dir = strdupdelim (path, path + l);
+ *file = xstrdup (path + l); /* normally empty, but could
+ contain ?... */
}
else
{
}
else /* Nonempty directory with or without a filename */
{
- if (DOTP (path + i + 1) || DDOTP (path + i + 1))
+ if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
{
- *dir = xstrdup (path);
- *file = xstrdup ("");
+ *dir = strdupdelim (path, path + l);
+ *file = xstrdup (path + l); /* normally empty, but could
+ contain ?... */
}
else
{
*dir = strdupdelim (path, path + i);
- *file = strdupdelim (path + i + 1, path + l + 1);
+ *file = xstrdup (path + i + 1);
}
}
}
return '\0';
}
\f
-/* Return the URL as fine-formed string, with a proper protocol, port
- number, directory and optional user/password. If HIDE is non-zero,
- password will be hidden. The forbidden characters in the URL will
- be cleansed. */
+/* Return the URL as fine-formed string, with a proper protocol,
+ optional port number, directory and optional user/password. If
+ HIDE is non-zero, password will be hidden. The forbidden
+ characters in the URL will be cleansed. */
char *
str_url (const struct urlinfo *u, int hide)
{
char *res, *host, *user, *passwd, *proto_name, *dir, *file;
int i, l, ln, lu, lh, lp, lf, ld;
+ unsigned short proto_default_port;
/* Look for the protocol name. */
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
if (i == ARRAY_SIZE (sup_protos))
return NULL;
proto_name = sup_protos[i].name;
+ proto_default_port = sup_protos[i].port;
host = CLEANDUP (u->host);
dir = CLEANDUP (u->dir);
file = CLEANDUP (u->file);
{
char *tmp = (char *)xmalloc (strlen (dir) + 3);
/*sprintf (tmp, "%%2F%s", dir + 1);*/
- *tmp = '%';
+ tmp[0] = '%';
tmp[1] = '2';
tmp[2] = 'F';
strcpy (tmp + 3, dir + 1);
}
memcpy (res + l, host, lh);
l += lh;
- res[l++] = ':';
- long_to_string (res + l, (long)u->port);
- l += numdigit (u->port);
+ if (u->port != proto_default_port)
+ {
+ res[l++] = ':';
+ long_to_string (res + l, (long)u->port);
+ l += numdigit (u->port);
+ }
res[l++] = '/';
memcpy (res + l, dir, ld);
l += ld;
If SILENT is non-zero, do not barf on baseless relative links. */
urlpos *
-get_urls_html (const char *file, const char *this_url, int silent)
+get_urls_html (const char *file, const char *this_url, int silent,
+ int dash_p_leaf_HTML)
{
long nread;
FILE *fp;
first_time = 1;
/* Iterate over the URLs in BUF, picked by htmlfindurl(). */
for (buf = orig_buf;
- (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time));
+ (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time,
+ dash_p_leaf_HTML));
buf += step)
{
int i, no_proto;
const char *pbuf = buf;
char *constr, *base;
const char *cbase;
+ char *needs_freeing, *url_data;
first_time = 0;
if (!size)
break;
+ /* It would be nice if we could avoid allocating memory in this
+ loop, but I don't see an easy way. To process the entities,
+ we need to either copy the data, or change it destructively.
+ I choose the former.
+
+ We have two pointers: needs_freeing and url_data, because the
+      code below does things like url_data += <something>, and we
+ want to pass the original string to free(). */
+ needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
+ size = strlen (url_data);
+
for (i = 0; protostrings[i]; i++)
{
- if (!strncasecmp (protostrings[i], pbuf,
+ if (!strncasecmp (protostrings[i], url_data,
MINVAL (strlen (protostrings[i]), size)))
break;
}
/* Check for http:RELATIVE_URI. See below for details. */
if (protostrings[i]
- && !(strncasecmp (pbuf, "http:", 5) == 0
- && strncasecmp (pbuf, "http://", 7) != 0))
+ && !(strncasecmp (url_data, "http:", 5) == 0
+ && strncasecmp (url_data, "http://", 7) != 0))
{
no_proto = 0;
}
relative URI-s as <a href="http:URL">. Just strip off the
silly leading "http:" (as well as any leading blanks
before it). */
- if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
- pbuf += 5, size -= 5;
+ if ((size > 5) && !strncasecmp ("http:", url_data, 5))
+ url_data += 5, size -= 5;
}
if (!no_proto)
{
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
{
- if (!strncasecmp (sup_protos[i].name, pbuf,
+ if (!strncasecmp (sup_protos[i].name, url_data,
MINVAL (strlen (sup_protos[i].name), size)))
break;
}
/* Do *not* accept a non-supported protocol. */
if (i == ARRAY_SIZE (sup_protos))
- continue;
+ {
+ free (needs_freeing);
+ continue;
+ }
}
if (no_proto)
{
/* Use malloc, not alloca because this is called in
a loop. */
char *temp = (char *)malloc (size + 1);
- strncpy (temp, pbuf, size);
+ strncpy (temp, url_data, size);
temp[size] = '\0';
logprintf (LOG_NOTQUIET,
_("Error (%s): Link %s without a base provided.\n"),
file, temp);
free (temp);
}
+ free (needs_freeing);
continue;
}
if (this_url)
logprintf (LOG_NOTQUIET, _("\
Error (%s): Base %s relative, without referer URL.\n"),
file, cbase);
+ free (needs_freeing);
continue;
}
base = xstrdup (cbase);
}
- constr = construct (base, pbuf, size, no_proto);
+ constr = construct (base, url_data, size, no_proto);
free (base);
}
else /* has proto */
{
constr = (char *)xmalloc (size + 1);
- strncpy (constr, pbuf, size);
+ strncpy (constr, url_data, size);
constr[size] = '\0';
}
#ifdef DEBUG
tmp2 = html_base ();
/* Use malloc, not alloca because this is called in a loop. */
tmp = (char *)xmalloc (size + 1);
- strncpy (tmp, pbuf, size);
+ strncpy (tmp, url_data, size);
tmp[size] = '\0';
logprintf (LOG_ALWAYS,
"file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
memset (current, 0, sizeof (*current));
current->next = NULL;
current->url = constr;
- current->size = size;
- current->pos = pbuf - orig_buf;
+ current->size = step;
+ current->pos = buf - orig_buf;
/* A URL is relative if the host and protocol are not named,
and the name does not start with `/'. */
- if (no_proto && *pbuf != '/')
+ if (no_proto && *url_data != '/')
current->flags |= (URELATIVE | UNOPROTO);
else if (no_proto)
current->flags |= UNOPROTO;
+ free (needs_freeing);
}
free (orig_buf);
return name;
}
+/* Return the length of URL's path component -- i.e. the number of
+   characters up to, but not including, the first '?' (the start of
+   the query string), or the full strlen() if the URL carries no
+   query string.  */
+static int
+urlpath_length (const char *url)
+{
+  const char *q = strchr (url, '?');
+  if (q)
+    return q - url;
+  return strlen (url);
+}
+
+/* Scan backward from E down to (but not including) B and return a
+   pointer to the last occurrence of character C, or NULL if C does
+   not occur.  Note the asymmetric endpoints: *E itself is examined,
+   while *B is not.  */
+static const char *
+find_last_char (const char *b, const char *e, char c)
+{
+  for (; e > b; e--)
+    if (*e == c)
+      return e;
+  return NULL;
+}
+
/* Construct an absolute URL, given a (possibly) relative one. This
- is more tricky than it might seem, but it works. */
+ gets tricky if you want to cover all the "reasonable" cases, but
+ I'm satisfied with the result. */
static char *
construct (const char *url, const char *sub, int subsize, int no_proto)
{
if (no_proto)
{
- int i;
+ const char *end = url + urlpath_length (url);
if (*sub != '/')
{
- for (i = strlen (url); i && url[i] != '/'; i--);
- if (!i || (url[i] == url[i - 1]))
+ /* SUB is a relative URL: we need to replace everything
+ after last slash (possibly empty) with SUB.
+
+ So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
+ our result should be "whatever/foo/qux/xyzzy". */
+ int need_explicit_slash = 0;
+ int span;
+ const char *start_insert;
+ const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */
+ if (!last_slash)
+ {
+ /* No slash found at all. Append SUB to what we have,
+ but we'll need a slash as a separator.
+
+ Example: if url == "foo" and sub == "qux/xyzzy", then
+ we cannot just append sub to url, because we'd get
+ "fooqux/xyzzy", whereas what we want is
+ "foo/qux/xyzzy".
+
+ To make sure the / gets inserted, we set
+ need_explicit_slash to 1. We also set start_insert
+ to end + 1, so that the length calculations work out
+ correctly for one more (slash) character. Accessing
+ that character is fine, since it will be the
+ delimiter, '\0' or '?'. */
+ /* example: "foo?..." */
+ /* ^ ('?' gets changed to '/') */
+ start_insert = end + 1;
+ need_explicit_slash = 1;
+ }
+ else
{
- int l = strlen (url);
- char *t = (char *)alloca (l + 2);
- strcpy (t, url);
- t[l] = '/';
- t[l + 1] = '\0';
- url = t;
- i = l;
+ /* example: "whatever/foo/bar" */
+ /* ^ */
+ start_insert = last_slash + 1;
}
- constr = (char *)xmalloc (i + 1 + subsize + 1);
- strncpy (constr, url, i + 1);
- constr[i + 1] = '\0';
- strncat (constr, sub, subsize);
+
+ span = start_insert - url;
+ constr = (char *)xmalloc (span + subsize + 1);
+ if (span)
+ memcpy (constr, url, span);
+ if (need_explicit_slash)
+ constr[span - 1] = '/';
+ if (subsize)
+ memcpy (constr + span, sub, subsize);
+ constr[span + subsize] = '\0';
}
else /* *sub == `/' */
{
- int fl;
-
- i = 0;
- do
- {
- for (; url[i] && url[i] != '/'; i++);
- if (!url[i])
- break;
- fl = (url[i] == url[i + 1] && url[i + 1] == '/');
- if (fl)
- i += 2;
- }
- while (fl);
- if (!url[i])
- {
- int l = strlen (url);
- char *t = (char *)alloca (l + 2);
- strcpy (t, url);
- t[l] = '/';
- t[l + 1] = '\0';
- url = t;
- }
- constr = (char *)xmalloc (i + 1 + subsize + 1);
- strncpy (constr, url, i);
- constr[i] = '\0';
- strncat (constr + i, sub, subsize);
- constr[i + subsize] = '\0';
- } /* *sub == `/' */
+ /* SUB is an absolute path: we need to replace everything
+ after (and including) the FIRST slash with SUB.
+
+ So, if URL is "http://host/whatever/foo/bar", and SUB is
+ "/qux/xyzzy", our result should be
+ "http://host/qux/xyzzy". */
+ int span;
+ const char *slash, *start_insert;
+ const char *pos = url;
+ int seen_slash_slash = 0;
+ /* We're looking for the first slash, but want to ignore
+ double slash. */
+ again:
+ slash = memchr (pos, '/', end - pos);
+ if (slash && !seen_slash_slash)
+ if (*(slash + 1) == '/')
+ {
+ pos = slash + 2;
+ seen_slash_slash = 1;
+ goto again;
+ }
+
+ /* At this point, SLASH is the location of the first / after
+ "//", or the first slash altogether. START_INSERT is the
+ pointer to the location where SUB will be inserted. When
+ examining the last two examples, keep in mind that SUB
+ begins with '/'. */
+
+ if (!slash && !seen_slash_slash)
+ /* example: "foo" */
+ /* ^ */
+ start_insert = url;
+ else if (!slash && seen_slash_slash)
+ /* example: "http://foo" */
+ /* ^ */
+ start_insert = end;
+ else if (slash && !seen_slash_slash)
+ /* example: "foo/bar" */
+ /* ^ */
+ start_insert = url;
+ else if (slash && seen_slash_slash)
+ /* example: "http://something/" */
+ /* ^ */
+ start_insert = slash;
+
+ span = start_insert - url;
+ constr = (char *)xmalloc (span + subsize + 1);
+ if (span)
+ memcpy (constr, url, span);
+ if (subsize)
+ memcpy (constr + span, sub, subsize);
+ constr[span + subsize] = '\0';
+ }
}
else /* !no_proto */
{
- constr = (char *)xmalloc (subsize + 1);
- strncpy (constr, sub, subsize);
- constr[subsize] = '\0';
+ constr = strdupdelim (sub, sub + subsize);
}
return constr;
}
+
+/* Resolve NEW_URL against BASE_URL, like construct() above, but with
+   a saner caller interface: the substring length and the
+   has-protocol flag are computed here rather than by every caller.
+   Returns a freshly malloc-ed string that the caller must free.  */
+char *
+url_concat (const char *base_url, const char *new_url)
+{
+  return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));
+}
\f
/* Optimize URL by host, destructively replacing u->host with realhost
(u->host). Do this regardless of opt.simple_check. */
free (u->url);
u->url = str_url (u, 0);
}
+
+/* This beautiful kludge is fortunately not needed, as I've made
+   parse_dir do the (almost) right thing, so that a query can never
+   become a part of directory.  Kept under #if 0 for reference only;
+   it is never compiled.  */
+#if 0
+/* Call path_simplify, but make sure that the part after the
+   question-mark, if any, is not destroyed by path_simplify's
+   "optimizations".  Works by temporarily truncating PATH at the '?',
+   simplifying the prefix, then splicing the query back on.  */
+void
+path_simplify_with_kludge (char *path)
+{
+  char *query = strchr (path, '?');
+  if (query)
+    /* path_simplify also works destructively, so we also have the
+       license to write. */
+    *query = '\0';
+  path_simplify (path);
+  if (query)
+    {
+      char *newend = path + strlen (path);
+      *query = '?';
+      /* memmove, not memcpy: the source and destination regions may
+         overlap when the simplified path is shorter than the
+         original.  */
+      if (newend != query)
+	memmove (newend, query, strlen (query) + 1);
+    }
+}
+#endif
\f
/* Returns proxy host address, in accordance with PROTO. */
char *
void
convert_links (const char *file, urlpos *l)
{
- FILE *fp;
- char *buf, *p, *p2;
- long size;
+ FILE *fp;
+ char *buf, *p, *p2;
+ downloaded_file_t downloaded_file_return;
+ long size;
logprintf (LOG_VERBOSE, _("Converting %s... "), file);
/* Read from the file.... */
/* ...to a buffer. */
load_file (fp, &buf, &size);
fclose (fp);
- if (opt.backup_converted && downloaded_file(CHECK_FOR_FILE, file))
+
+ downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file);
+
+ if (opt.backup_converted && downloaded_file_return)
/* Rather than just writing over the original .html file with the converted
version, save the former to *.orig. Note we only do this for files we've
_successfully_ downloaded, so we don't clobber .orig files sitting around
{
/* Construct the backup filename as the original name plus ".orig". */
size_t filename_len = strlen(file);
- char* filename_plus_orig_suffix = malloc(filename_len +
- sizeof(".orig"));
+ char* filename_plus_orig_suffix;
boolean already_wrote_backup_file = FALSE;
slist* converted_file_ptr;
static slist* converted_files = NULL;
- /* Would a single s[n]printf() call be faster? */
- strcpy(filename_plus_orig_suffix, file);
- strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+ if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
+ {
+ /* Just write "orig" over "html". We need to do it this way because
+ when we're checking to see if we've downloaded the file before (to
+ see if we can skip downloading it), we don't know if it's a
+ text/html file. Therefore we don't know yet at that stage that -E
+ is going to cause us to tack on ".html", so we need to compare
+ vs. the original URL plus ".orig", not the original URL plus
+ ".html.orig". */
+ filename_plus_orig_suffix = xmalloc(filename_len + 1);
+ strcpy(filename_plus_orig_suffix, file);
+ strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
+ }
+ else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
+ {
+ /* Append ".orig" to the name. */
+ filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig"));
+ strcpy(filename_plus_orig_suffix, file);
+ strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+ }
/* We can get called twice on the same URL thanks to the
convert_all_links() call in main(). If we write the .orig file each
thought I could just add a field to the urlpos structure saying
that we'd written a .orig file for this URL, but that didn't work,
so I had to make this separate list. */
- converted_file_ptr = malloc(sizeof(slist));
+ converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
converted_file_ptr->next = converted_files;
converted_files = converted_file_ptr;
free (buf);
return;
}
- /* [If someone understands why multiple URLs can correspond to one local file,
- can they please add a comment here...?] */
+ /* Presumably we have to loop through multiple URLs here (even though we're
+ only talking about a single local file) because of the -O option. */
for (p = buf; l; l = l->next)
{
if (l->pos >= size)
for (p2 = buf + l->pos; p < p2; p++)
putc (*p, fp);
if (l->flags & UABS2REL)
+ /* Convert absolute URL to relative. */
{
char *newname = construct_relative (file, l->local_name);
fprintf (fp, "%s", newname);
}
p += l->size;
}
+ /* Output the rest of the file. */
if (p - buf < size)
{
for (p2 = buf + size; p < p2; p++)
}
-/* Remembers which files have been downloaded. Should be called with
- add_or_check == ADD_FILE for each file we actually download successfully
- (i.e. not for ones we have failures on or that we skip due to -N). If you
- just want to check if a file has been previously added without adding it,
- call with add_or_check == CHECK_FOR_FILE. Please be sure to call this
- function with local filenames, not remote URLs -- by some means that isn't
- commented well enough for me understand, multiple remote URLs can apparently
- correspond to a single local file. */
-boolean
-downloaded_file (downloaded_file_t add_or_check, const char* file)
+/* Remembers which files have been downloaded. In the standard case, should be
+ called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
+ download successfully (i.e. not for ones we have failures on or that we skip
+ due to -N).
+
+ When we've downloaded a file and tacked on a ".html" extension due to -E,
+ call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
+ FILE_DOWNLOADED_NORMALLY.
+
+ If you just want to check if a file has been previously added without adding
+ it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
+ with local filenames, not remote URLs. */
+downloaded_file_t
+downloaded_file (downloaded_file_t mode, const char* file)
{
- boolean found_file = FALSE;
- static slist* downloaded_files = NULL;
- slist* rover = downloaded_files;
+ typedef struct _downloaded_file_list
+ {
+ char* file;
+ downloaded_file_t download_type;
+ struct _downloaded_file_list* next;
+ } downloaded_file_list;
+
+ boolean found_file = FALSE;
+ static downloaded_file_list* downloaded_files = NULL;
+ downloaded_file_list* rover = downloaded_files;
while (rover != NULL)
- if (strcmp(rover->string, file) == 0)
+ if (strcmp(rover->file, file) == 0)
{
found_file = TRUE;
break;
rover = rover->next;
if (found_file)
- return TRUE; /* file had already been downloaded */
+ return rover->download_type; /* file had already been downloaded */
else
{
- if (add_or_check == ADD_FILE)
+ if (mode != CHECK_FOR_FILE)
{
- rover = malloc(sizeof(slist));
- rover->string = xstrdup(file); /* die on out-of-mem. */
+ rover = xmalloc(sizeof(*rover));
+ rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
+ rover->download_type = mode;
rover->next = downloaded_files;
downloaded_files = rover;
}
- return FALSE; /* file had not already been downloaded */
+ return FILE_NOT_ALREADY_DOWNLOADED;
}
}