#include "utils.h"
#include "url.h"
#include "host.h"
+#include "hash.h"
#ifndef errno
extern int errno;
struct scheme_data
{
- enum url_scheme scheme;
char *leading_string;
int default_port;
};
/* Supported schemes: */
static struct scheme_data supported_schemes[] =
{
- { SCHEME_HTTP, "http://", DEFAULT_HTTP_PORT },
+ { "http://", DEFAULT_HTTP_PORT },
#ifdef HAVE_SSL
- { SCHEME_HTTPS, "https://", DEFAULT_HTTPS_PORT },
+ { "https://", DEFAULT_HTTPS_PORT },
#endif
- { SCHEME_FTP, "ftp://", DEFAULT_FTP_PORT }
+ { "ftp://", DEFAULT_FTP_PORT },
+
+ /* SCHEME_INVALID */
+ { NULL, -1 }
};
-static void parse_dir PARAMS ((const char *, char **, char **));
-static uerr_t parse_uname PARAMS ((const char *, char **, char **));
static char *construct_relative PARAMS ((const char *, const char *));
-static char process_ftp_type PARAMS ((char *));
\f
/* Support for encoding and decoding of URL strings. We determine
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
-/* rfc1738 reserved chars. We don't use this yet; preservation of
- reserved chars will be implemented when I integrate the new
- `reencode_string' function. */
+/* rfc1738 reserved chars, preserved from encoding. */
#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
-/* Unsafe chars:
- - anything <= 32;
- - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
- - '@' and ':'; needed for encoding URL username and password.
- - anything >= 127. */
+/* rfc1738 unsafe chars, plus some more. */
#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
- U, 0, U, U, 0, U, R, 0, /* SP ! " # $ % & ' */
+ U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
- 0, 0, U, R, U, R, U, R, /* 8 9 : ; < = > ? */
+ 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
{
if (UNSAFE_CHAR (*p1))
{
- const unsigned char c = *p1++;
+ unsigned char c = *p1++;
*p2++ = '%';
*p2++ = XDIGIT_TO_XCHAR (c >> 4);
*p2++ = XDIGIT_TO_XCHAR (c & 0xf);
} \
} while (0)
\f
/* How to treat one input character during requoting: decode a %xx
   escape, %-encode the raw character, or copy it through unchanged.  */
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };

/* Decide how the character at P should be handled.  P points into a
   NUL-terminated string, so peeking at p[1]/p[2] is safe: ISXDIGIT
   fails on the terminator and short-circuits the lookahead.
   This used to be a macro, but it got a little too convoluted.  */
static inline enum copy_method
decide_copy_method (const char *p)
{
  if (*p != '%')
    {
      /* Plain character: encode it only when it is unsafe AND not
         reserved -- reserved chars must be preserved verbatim.  */
      if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
        return CM_ENCODE;
      return CM_PASSTHROUGH;
    }

  if (!ISXDIGIT (*(p + 1)) || !ISXDIGIT (*(p + 2)))
    /* Garbled %.. sequence: encode the `%' itself.  */
    return CM_ENCODE;

  {
    /* Valid %xx sequence: decode it, unless it would decode to an
       unsafe or a reserved char; in that case, leave it as is.  */
    char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4)
      + XCHAR_TO_XDIGIT (*(p + 2));

    if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
      return CM_PASSTHROUGH;
    return CM_DECODE;
  }
}
+
+/* Translate a %-quoting (but possibly non-conformant) input string S
+ into a %-quoting (and conformant) output string. If no characters
+ are encoded or decoded, return the same string S; otherwise, return
+ a freshly allocated string with the new contents.
+
+ After a URL has been run through this function, the protocols that
+ use `%' as the quote character can use the resulting string as-is,
+ while those that don't call decode_string() to get to the intended
+ data. This function is also stable: after an input string is
+ transformed the first time, all further transformations of the
+ result yield the same result string.
+
+ Let's discuss why this function is needed.
+
+ Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
+ space character would mess up the HTTP request, it needs to be
+ quoted, like this:
+
+ GET /abc%20def HTTP/1.0
+
+ So it appears that the unsafe chars need to be quoted, as with
+ encode_string. But what if we're requested to download
+ `abc%20def'? Remember that %-encoding is valid URL syntax, so what
+ the user meant was a literal space, and he was kind enough to quote
+ it. In that case, Wget should obviously leave the `%20' as is, and
+ send the same request as above. So in this case we may not call
+ encode_string.
+
+ But what if the requested URI is `abc%20 def'? If we call
+ encode_string, we end up with `/abc%2520%20def', which is almost
+ certainly not intended. If we don't call encode_string, we are
+ left with the embedded space and cannot send the request. What the
+ user meant was for Wget to request `/abc%20%20def', and this is
+ where reencode_string kicks in.
+
+ Wget used to solve this by first decoding %-quotes, and then
+ encoding all the "unsafe" characters found in the resulting string.
+ This was wrong because it didn't preserve certain URL special
+ (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
+ == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
+ whether we considered `+' reserved (it is). One of these results
+ is inevitable because by the second step we would lose information
+ on whether the `+' was originally encoded or not. Both results
+ were wrong because in CGI parameters + means space, while %2B means
+ literal plus. reencode_string correctly translates the above to
+ "a%2B+b", i.e. returns the original string.
+
+ This function uses an algorithm proposed by Anon Sricharoenchai:
+
+ 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
+ hexdigits.
+
+ 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
+ "+".
+
+ ...except that this code conflates the two steps, and decides
+ whether to encode, decode, or pass through each character in turn.
+ The function still uses two passes, but their logic is the same --
+ the first pass exists merely for the sake of allocation. Another
+ small difference is that we include `+' to URL_RESERVED.
+
+ Anon's test case:
+
+ "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
+ ->
+ "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
+
+ Simpler test cases:
+
+ "foo bar" -> "foo%20bar"
+ "foo%20bar" -> "foo%20bar"
+ "foo %20bar" -> "foo%20%20bar"
+ "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
+ "foo%25%20bar" -> "foo%25%20bar"
+ "foo%2%20bar" -> "foo%252%20bar"
+ "foo+bar" -> "foo+bar" (plus is reserved!)
+ "foo%2b+bar" -> "foo%2b+bar" */
+
+char *
+reencode_string (const char *s)
+{
+ const char *p1;
+ char *newstr, *p2;
+ int oldlen, newlen;
+
+ int encode_count = 0;
+ int decode_count = 0;
+
+ /* First, pass through the string to see if there's anything to do,
+ and to calculate the new length. */
+ for (p1 = s; *p1; p1++)
+ {
+ switch (decide_copy_method (p1))
+ {
+ case CM_ENCODE:
+ ++encode_count;
+ break;
+ case CM_DECODE:
+ ++decode_count;
+ break;
+ case CM_PASSTHROUGH:
+ break;
+ }
+ }
+
+ if (!encode_count && !decode_count)
+ /* The string is good as it is. */
+ return (char *)s; /* C const model sucks. */
+
+ oldlen = p1 - s;
+ /* Each encoding adds two characters (hex digits), while each
+ decoding removes two characters. */
+ newlen = oldlen + 2 * (encode_count - decode_count);
+ newstr = xmalloc (newlen + 1);
+
+ p1 = s;
+ p2 = newstr;
+
+ while (*p1)
+ {
+ switch (decide_copy_method (p1))
+ {
+ case CM_ENCODE:
+ {
+ unsigned char c = *p1++;
+ *p2++ = '%';
+ *p2++ = XDIGIT_TO_XCHAR (c >> 4);
+ *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
+ }
+ break;
+ case CM_DECODE:
+ *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
+ + (XCHAR_TO_XDIGIT (*(p1 + 2))));
+ p1 += 3; /* skip %xx */
+ break;
+ case CM_PASSTHROUGH:
+ *p2++ = *p1++;
+ }
+ }
+ *p2 = '\0';
+ assert (p2 - newstr == newlen);
+ return newstr;
+}
+
/* Run PTR_VAR (which must be an lvalue) through reencode_string.  If
   a fresh string was consed, free the old value and store the new one
   back into PTR_VAR.  */

#define REENCODE(ptr_var) do {                  \
  char *reencoded_ = reencode_string (ptr_var); \
  if (reencoded_ != ptr_var)                    \
    {                                           \
      xfree (ptr_var);                          \
      ptr_var = reencoded_;                     \
    }                                           \
} while (0)
+\f
/* Returns the scheme type if the scheme is supported, or
SCHEME_INVALID if not. */
enum url_scheme
{
int i;
- for (i = 0; i < ARRAY_SIZE (supported_schemes); i++)
+ for (i = 0; supported_schemes[i].leading_string; i++)
if (!strncasecmp (url, supported_schemes[i].leading_string,
strlen (supported_schemes[i].leading_string)))
- return supported_schemes[i].scheme;
+ return (enum url_scheme)i;
return SCHEME_INVALID;
}
return *p == ':';
}
+int
+scheme_default_port (enum url_scheme scheme)
+{
+ return supported_schemes[scheme].default_port;
+}
+
/* Skip the username and password, if present here. The function
should be called *not* with the complete URL, but with the part
right after the scheme.
url_skip_uname (const char *url)
{
const char *p;
- const char *q = NULL;
- for (p = url ; *p && *p != '/'; p++)
- if (*p == '@') q = p;
- /* If a `@' was found before the first occurrence of `/', skip
- it. */
- if (q != NULL)
- return q - url + 1;
- else
+
+ /* Look for '@' that comes before '/' or '?'. */
+ p = (const char *)strpbrk (url, "/?@");
+ if (!p || *p != '@')
return 0;
+
+ return p - url + 1;
+}
+
/* Parse LEN bytes at STR as "user" or "user:password".  On success,
   store freshly allocated copies in *USER and *PASSWD (*PASSWD is
   NULL when no password was given) and return 1.  An empty user name
   is rejected with 0; nothing is allocated in that case.  */
static int
parse_uname (const char *str, int len, char **user, char **passwd)
{
  const char *colon;

  if (len == 0)
    /* Empty user name not allowed. */
    return 0;

  colon = memchr (str, ':', len);
  if (colon == str)
    /* ":password" -- still an empty user name. */
    return 0;

  if (!colon)
    *passwd = NULL;
  else
    {
      /* Everything after the colon is the password; the user name
         part shrinks accordingly (the ':' itself is dropped).  */
      int pwlen = len - (colon - str) - 1;
      *passwd = xmalloc (pwlen + 1);
      memcpy (*passwd, colon + 1, pwlen);
      (*passwd)[pwlen] = '\0';
      len = colon - str;
    }

  *user = xmalloc (len + 1);
  memcpy (*user, str, len);
  (*user)[len] = '\0';

  return 1;
}
/* Used by main.c: detect URLs written using the "shorthand" URL forms
If the URL needs not or cannot be rewritten, return NULL. */
char *
-rewrite_url_maybe (const char *url)
+rewrite_shorthand_url (const char *url)
{
const char *p;
}
}
\f
-/* Allocate a new urlinfo structure, fill it with default values and
- return a pointer to it. */
-struct urlinfo *
-newurl (void)
-{
- struct urlinfo *u;
+static void parse_path PARAMS ((const char *, char **, char **));
- u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
- memset (u, 0, sizeof (*u));
- u->scheme = SCHEME_INVALID;
- return u;
/* Like strpbrk, but when no char from ACCEPT occurs in S, return a
   pointer to S's terminating NUL instead of NULL.  The result is
   therefore never NULL, which simplifies the scanning loops.  */
static char *
strpbrk_or_eos (const char *s, const char *accept)
{
  char *found = strpbrk (s, accept);
  return found ? found : (char *)s + strlen (s);
}
-/* Perform a "deep" free of the urlinfo structure. The structure
- should have been created with newurl, but need not have been used.
- If free_pointer is non-0, free the pointer itself. */
-void
-freeurl (struct urlinfo *u, int complete)
/* Turn STR into lowercase in place; return non-zero only if a
   character was actually changed.  Callers (url_parse) use the return
   value to decide whether the canonical URL must be regenerated, so a
   false positive causes needless work.  */

static int
lowercase_str (char *str)
{
  int change = 0;
  for (; *str; str++)
    {
      /* Compare against the folded form instead of testing !ISLOWER:
         the old test reported digits, '.', '-' etc. as "changed" even
         though TOLOWER leaves them alone, so nearly every host name
         was flagged as modified.  */
      char lc = TOLOWER (*str);
      if (lc != *str)
        {
          change = 1;
          *str = lc;
        }
    }
  return change;
}
-\f
-enum url_parse_error {
- PE_UNRECOGNIZED_SCHEME, PE_BAD_PORT
+
+static char *parse_errors[] = {
+#define PE_NO_ERROR 0
+ "No error",
+#define PE_UNRECOGNIZED_SCHEME 1
+ "Unrecognized scheme",
+#define PE_EMPTY_HOST 2
+ "Empty host",
+#define PE_BAD_PORT_NUMBER 3
+ "Bad port number",
+#define PE_INVALID_USER_NAME 4
+ "Invalid user name"
};
-/* Extract the given URL of the form
- (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
- 1. hostname (terminated with `/' or `:')
- 2. port number (terminated with `/'), or chosen for the scheme
- 3. dirname (everything after hostname)
- Most errors are handled. No allocation is done, you must supply
- pointers to allocated memory.
- ...and a host of other stuff :-)
-
- - Recognizes hostname:dir/file for FTP and
- hostname (:portnum)?/dir/file for HTTP.
- - Parses the path to yield directory and file
- - Parses the URL to yield the username and passwd (if present)
- - Decodes the strings, in case they contain "forbidden" characters
- - Writes the result to struct urlinfo
-
- If the argument STRICT is set, it recognizes only the canonical
- form. */
-uerr_t
-parseurl (const char *url, struct urlinfo *u, int strict)
+#define SETERR(p, v) do { \
+ if (p) \
+ *(p) = (v); \
+} while (0)
+
+/* Parse a URL.
+
+ Return a new struct url if successful, NULL on error. In case of
+ error, and if ERROR is not NULL, also set *ERROR to the appropriate
+ error code. */
+struct url *
+url_parse (const char *url, int *error)
{
- int i, l, abs_ftp;
- int recognizable; /* Recognizable URL is the one where
- the scheme was explicitly named,
- i.e. it wasn't deduced from the URL
- format. */
- uerr_t type;
-
- DEBUGP (("parseurl (\"%s\") -> ", url));
- recognizable = url_has_scheme (url);
- if (strict && !recognizable)
- return URLUNKNOWN;
- for (i = 0, l = 0; i < ARRAY_SIZE (supported_schemes); i++)
+ struct url *u;
+ const char *p;
+ int path_modified, host_modified;
+
+ enum url_scheme scheme;
+
+ const char *uname_b, *uname_e;
+ const char *host_b, *host_e;
+ const char *path_b, *path_e;
+ const char *params_b, *params_e;
+ const char *query_b, *query_e;
+ const char *fragment_b, *fragment_e;
+
+ int port;
+ char *user = NULL, *passwd = NULL;
+
+ char *url_encoded;
+
+ scheme = url_scheme (url);
+ if (scheme == SCHEME_INVALID)
{
- l = strlen (supported_schemes[i].leading_string);
- if (!strncasecmp (supported_schemes[i].leading_string, url, l))
- break;
+ SETERR (error, PE_UNRECOGNIZED_SCHEME);
+ return NULL;
}
- /* If scheme is recognizable, but unsupported, bail out, else
- suppose unknown. */
- if (recognizable && i == ARRAY_SIZE (supported_schemes))
- return URLUNKNOWN;
- else if (i == ARRAY_SIZE (supported_schemes))
- type = URLUNKNOWN;
- else
- u->scheme = type = supported_schemes[i].scheme;
-
- if (type == URLUNKNOWN)
- l = 0;
- /* Allow a username and password to be specified (i.e. just skip
- them for now). */
- if (recognizable)
- l += url_skip_uname (url + l);
- for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
- if (i == l)
- return URLBADHOST;
- /* Get the hostname. */
- u->host = strdupdelim (url + l, url + i);
- DEBUGP (("host %s -> ", u->host));
-
- /* Assume no port has been given. */
- u->port = 0;
- if (url[i] == ':')
+
+ url_encoded = reencode_string (url);
+ p = url_encoded;
+
+ p += strlen (supported_schemes[scheme].leading_string);
+ uname_b = p;
+ p += url_skip_uname (p);
+ uname_e = p;
+
+ /* scheme://user:pass@host[:port]... */
+ /* ^ */
+
+ /* We attempt to break down the URL into the components path,
+ params, query, and fragment. They are ordered like this:
+
+ scheme://host[:port][/path][;params][?query][#fragment] */
+
+ params_b = params_e = NULL;
+ query_b = query_e = NULL;
+ fragment_b = fragment_e = NULL;
+
+ host_b = p;
+ p = strpbrk_or_eos (p, ":/;?#");
+ host_e = p;
+
+ if (host_b == host_e)
+ {
+ SETERR (error, PE_EMPTY_HOST);
+ return NULL;
+ }
+
+ port = scheme_default_port (scheme);
+ if (*p == ':')
{
- /* We have a colon delimiting the hostname. It could mean that
- a port number is following it, or a directory. */
- if (ISDIGIT (url[++i])) /* A port number */
+ const char *port_b, *port_e, *pp;
+
+ /* scheme://host:port/tralala */
+ /* ^ */
+ ++p;
+ port_b = p;
+ p = strpbrk_or_eos (p, "/;?#");
+ port_e = p;
+
+ if (port_b == port_e)
{
- if (type == URLUNKNOWN)
- {
- type = URLHTTP;
- u->scheme = SCHEME_HTTP;
- }
- for (; url[i] && url[i] != '/'; i++)
- if (ISDIGIT (url[i]))
- u->port = 10 * u->port + (url[i] - '0');
- else
- return URLBADPORT;
- if (!u->port)
- return URLBADPORT;
- DEBUGP (("port %hu -> ", u->port));
+ /* http://host:/whatever */
+ /* ^ */
+ SETERR (error, PE_BAD_PORT_NUMBER);
+ return NULL;
}
- else if (type == URLUNKNOWN) /* or a directory */
+
+ for (port = 0, pp = port_b; pp < port_e; pp++)
{
- type = URLFTP;
- u->scheme = SCHEME_FTP;
+ if (!ISDIGIT (*pp))
+ {
+ /* http://host:12randomgarbage/blah */
+ /* ^ */
+ SETERR (error, PE_BAD_PORT_NUMBER);
+ return NULL;
+ }
+ port = 10 * port + (*pp - '0');
}
- else /* or just a misformed port number */
- return URLBADPORT;
}
- else if (type == URLUNKNOWN)
+
+ if (*p == '/')
{
- type = URLHTTP;
- u->scheme = SCHEME_HTTP;
+ ++p;
+ path_b = p;
+ p = strpbrk_or_eos (p, ";?#");
+ path_e = p;
}
- if (!u->port)
+ else
{
- int ind;
- for (ind = 0; ind < ARRAY_SIZE (supported_schemes); ind++)
- if (supported_schemes[ind].scheme == u->scheme)
- break;
- if (ind == ARRAY_SIZE (supported_schemes))
- return URLUNKNOWN;
- u->port = supported_schemes[ind].default_port;
+ /* Path is not allowed not to exist. */
+ path_b = path_e = p;
}
- /* Some delimiter troubles... */
- if (url[i] == '/' && url[i - 1] != ':')
- ++i;
- if (type == URLHTTP)
- while (url[i] && url[i] == '/')
- ++i;
- u->path = (char *)xmalloc (strlen (url + i) + 8);
- strcpy (u->path, url + i);
- if (type == URLFTP)
+
+ if (*p == ';')
{
- u->ftp_type = process_ftp_type (u->path);
- /* #### We don't handle type `d' correctly yet. */
- if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
- u->ftp_type = 'I';
- DEBUGP (("ftp_type %c -> ", u->ftp_type));
+ ++p;
+ params_b = p;
+ p = strpbrk_or_eos (p, "?#");
+ params_e = p;
}
- DEBUGP (("opath %s -> ", u->path));
- /* Parse the username and password (if existing). */
- parse_uname (url, &u->user, &u->passwd);
- /* Decode the strings, as per RFC 1738. */
- decode_string (u->host);
- decode_string (u->path);
- if (u->user)
- decode_string (u->user);
- if (u->passwd)
- decode_string (u->passwd);
- /* Parse the directory. */
- parse_dir (u->path, &u->dir, &u->file);
- DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
- /* Simplify the directory. */
- path_simplify (u->dir);
- /* Remove the leading `/' in HTTP. */
- if (type == URLHTTP && *u->dir == '/')
- strcpy (u->dir, u->dir + 1);
- DEBUGP (("ndir %s\n", u->dir));
- /* Strip trailing `/'. */
- l = strlen (u->dir);
- if (l > 1 && u->dir[l - 1] == '/')
- u->dir[l - 1] = '\0';
- /* Re-create the path: */
- abs_ftp = (u->scheme == SCHEME_FTP && *u->dir == '/');
- /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
- abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
- strcpy (u->path, abs_ftp ? "%2F" : "/");
- strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
- strcat (u->path, *u->dir ? "/" : "");
- strcat (u->path, u->file);
- ENCODE (u->path);
- DEBUGP (("newpath: %s\n", u->path));
- /* Create the clean URL. */
- u->url = str_url (u, 0);
- return URLOK;
-}
-\f
-/* Special versions of DOTP and DDOTP for parse_dir(). They work like
- DOTP and DDOTP, but they also recognize `?' as end-of-string
- delimiter. This is needed for correct handling of query
- strings. */
-
-#define PD_DOTP(x) ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
-#define PD_DDOTP(x) ((*(x) == '.') && (*(x) == '.') \
- && (!*((x) + 2) || *((x) + 2) == '?'))
-
-/* Build the directory and filename components of the path. Both
- components are *separately* malloc-ed strings! It does not change
- the contents of path.
-
- If the path ends with "." or "..", they are (correctly) counted as
- directories. */
-static void
-parse_dir (const char *path, char **dir, char **file)
-{
- int i, l;
-
- l = urlpath_length (path);
- for (i = l; i && path[i] != '/'; i--);
+ if (*p == '?')
+ {
+ ++p;
+ query_b = p;
+ p = strpbrk_or_eos (p, "#");
+ query_e = p;
+ }
+ if (*p == '#')
+ {
+ ++p;
+ fragment_b = p;
+ p += strlen (p);
+ fragment_e = p;
+ }
+ assert (*p == 0);
- if (!i && *path != '/') /* Just filename */
+ if (uname_b != uname_e)
{
- if (PD_DOTP (path) || PD_DDOTP (path))
- {
- *dir = strdupdelim (path, path + l);
- *file = xstrdup (path + l); /* normally empty, but could
- contain ?... */
- }
- else
+ /* http://user:pass@host */
+ /* ^ ^ */
+ /* uname_b uname_e */
+ if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
{
- *dir = xstrdup (""); /* This is required because of FTP */
- *file = xstrdup (path);
+ SETERR (error, PE_INVALID_USER_NAME);
+ return NULL;
}
}
- else if (!i) /* /filename */
+
+ u = (struct url *)xmalloc (sizeof (struct url));
+ memset (u, 0, sizeof (*u));
+
+ u->scheme = scheme;
+ u->host = strdupdelim (host_b, host_e);
+ u->port = port;
+ u->user = user;
+ u->passwd = passwd;
+
+ u->path = strdupdelim (path_b, path_e);
+ path_modified = path_simplify (u->path);
+ parse_path (u->path, &u->dir, &u->file);
+
+ host_modified = lowercase_str (u->host);
+
+ if (params_b)
+ u->params = strdupdelim (params_b, params_e);
+ if (query_b)
+ u->query = strdupdelim (query_b, query_e);
+ if (fragment_b)
+ u->fragment = strdupdelim (fragment_b, fragment_e);
+
+
+ if (path_modified || u->fragment || host_modified)
{
- if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
- {
- *dir = strdupdelim (path, path + l);
- *file = xstrdup (path + l); /* normally empty, but could
- contain ?... */
- }
- else
- {
- *dir = xstrdup ("/");
- *file = xstrdup (path + 1);
- }
+ /* If path_simplify modified the path, or if a fragment is
+ present, or if the original host name had caps in it, make
+ sure that u->url is equivalent to what would be printed by
+ url_string. */
+ u->url = url_string (u, 0);
+
+ if (url_encoded != url)
+ xfree ((char *) url_encoded);
}
- else /* Nonempty directory with or without a filename */
+ else
{
- if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
- {
- *dir = strdupdelim (path, path + l);
- *file = xstrdup (path + l); /* normally empty, but could
- contain ?... */
- }
+ if (url_encoded == url)
+ u->url = xstrdup (url);
else
- {
- *dir = strdupdelim (path, path + i);
- *file = xstrdup (path + i + 1);
- }
+ u->url = url_encoded;
}
+ url_encoded = NULL;
+
+ return u;
}
-/* Find the optional username and password within the URL, as per
- RFC1738. The returned user and passwd char pointers are
- malloc-ed. */
-static uerr_t
-parse_uname (const char *url, char **user, char **passwd)
+const char *
+url_error (int error_code)
{
- int l;
- const char *p, *q, *col;
- char **where;
-
- *user = NULL;
- *passwd = NULL;
-
- /* Look for the end of the scheme identifier. */
- l = url_skip_scheme (url);
- if (!l)
- return URLUNKNOWN;
- url += l;
- /* Is there an `@' character? */
- for (p = url; *p && *p != '/'; p++)
- if (*p == '@')
- break;
- /* If not, return. */
- if (*p != '@')
- return URLOK;
- /* Else find the username and password. */
- for (p = q = col = url; *p && *p != '/'; p++)
- {
- if (*p == ':' && !*user)
- {
- *user = (char *)xmalloc (p - url + 1);
- memcpy (*user, url, p - url);
- (*user)[p - url] = '\0';
- col = p + 1;
- }
- if (*p == '@') q = p;
- }
- /* Decide whether you have only the username or both. */
- where = *user ? passwd : user;
- *where = (char *)xmalloc (q - col + 1);
- memcpy (*where, col, q - col);
- (*where)[q - col] = '\0';
- return URLOK;
+ assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
+ return parse_errors[error_code];
}
-/* If PATH ends with `;type=X', return the character X. */
-static char
-process_ftp_type (char *path)
/* Split QUOTED_PATH into its directory and file components, stored in
   *DIR and *FILE as freshly allocated strings.  The path is %-decoded
   into a stack copy first, so the results are in decoded form.  A
   path without '/' yields an empty directory.  */
static void
parse_path (const char *quoted_path, char **dir, char **file)
{
  char *path, *last_slash;

  STRDUP_ALLOCA (path, quoted_path);
  decode_string (path);

  last_slash = strrchr (path, '/');
  if (last_slash)
    {
      *dir = strdupdelim (path, last_slash);
      *file = xstrdup (last_slash + 1);
    }
  else
    {
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }
}
-\f
-/* Recreate the URL string from the data in urlinfo. This can be used
- to create a "canonical" representation of the URL. If `hide' is
- non-zero (as it is when we're calling this on a URL we plan to
- print, but not when calling it to canonicalize a URL for use within
- the program), password will be hidden. The forbidden characters in
- the URL will be cleansed. */
+
+/* Note: URL's "full path" is the path with the query string and
+ params appended. The "fragment" (#foo) is intentionally ignored,
+ but that might be changed. For example, if the original URL was
+ "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
+ the full path will be "/foo/bar/baz;bullshit?querystring". */
+
+/* Return the length of the full path, without the terminating
+ zero. */
+
+static int
+full_path_length (const struct url *url)
+{
+ int len = 0;
+
+#define FROB(el) if (url->el) len += 1 + strlen (url->el)
+
+ FROB (path);
+ FROB (params);
+ FROB (query);
+
+#undef FROB
+
+ return len;
+}
+
+/* Write out the full path. */
+
+static void
+full_path_write (const struct url *url, char *where)
+{
+#define FROB(el, chr) do { \
+ char *f_el = url->el; \
+ if (f_el) { \
+ int l = strlen (f_el); \
+ *where++ = chr; \
+ memcpy (where, f_el, l); \
+ where += l; \
+ } \
+} while (0)
+
+ FROB (path, '/');
+ FROB (params, ';');
+ FROB (query, '?');
+
+#undef FROB
+}
+
+/* Public function for getting the "full path". */
/* Public function for getting the "full path" -- path plus params
   plus query -- as a freshly allocated, zero-terminated string.  The
   fragment is intentionally omitted (see the note above).  */
char *
url_full_path (const struct url *url)
{
  int length = full_path_length (url);
  char *full_path = (char *)xmalloc (length + 1);

  full_path_write (url, full_path);
  full_path[length] = '\0';

  return full_path;
}
+
+/* Sync u->path and u->url with u->dir and u->file. */
+static void
+sync_path (struct url *url)
+{
+ char *newpath;
+
+ xfree (url->path);
+
+ if (!*url->dir)
{
- memcpy (res + l, user, lu);
- l += lu;
- if (passwd)
- {
- res[l++] = ':';
- memcpy (res + l, passwd, lp);
- l += lp;
- }
- res[l++] = '@';
+ newpath = xstrdup (url->file);
+ REENCODE (newpath);
}
- memcpy (res + l, host, lh);
- l += lh;
- if (u->port != default_port)
+ else
{
- res[l++] = ':';
- long_to_string (res + l, (long)u->port);
- l += numdigit (u->port);
+ int dirlen = strlen (url->dir);
+ int filelen = strlen (url->file);
+
+ newpath = xmalloc (dirlen + 1 + filelen + 1);
+ memcpy (newpath, url->dir, dirlen);
+ newpath[dirlen] = '/';
+ memcpy (newpath + dirlen + 1, url->file, filelen);
+ newpath[dirlen + 1 + filelen] = '\0';
+ REENCODE (newpath);
}
- res[l++] = '/';
- memcpy (res + l, dir, ld);
- l += ld;
- if (*dir)
- res[l++] = '/';
- strcpy (res + l, file);
- xfree (host);
- xfree (dir);
- xfree (file);
- FREE_MAYBE (user);
- FREE_MAYBE (passwd);
- return res;
+
+ url->path = newpath;
+
+ /* Synchronize u->url. */
+ xfree (url->url);
+ url->url = url_string (url, 0);
}
-/* Check whether two URL-s are equivalent, i.e. pointing to the same
- location. Uses parseurl to parse them, and compares the canonical
- forms.
+/* Mutators. Code in ftp.c insists on changing u->dir and u->file.
+ This way we can sync u->path and u->url when they get changed. */
- Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
- return 0 on error. */
-/* Do not compile unused code. */
-#if 0
-int
-url_equal (const char *url1, const char *url2)
+void
+url_set_dir (struct url *url, const char *newdir)
{
- struct urlinfo *u1, *u2;
- uerr_t err;
- int res;
+ xfree (url->dir);
+ url->dir = xstrdup (newdir);
+ sync_path (url);
+}
- u1 = newurl ();
- err = parseurl (url1, u1, 0);
- if (err != URLOK)
- {
- freeurl (u1, 1);
- return 0;
- }
- u2 = newurl ();
- err = parseurl (url2, u2, 0);
- if (err != URLOK)
- {
- freeurl (u1, 1);
- freeurl (u2, 1);
- return 0;
- }
- res = !strcmp (u1->url, u2->url);
- freeurl (u1, 1);
- freeurl (u2, 1);
- return res;
+void
+url_set_file (struct url *url, const char *newfile)
+{
+ xfree (url->file);
+ url->file = xstrdup (newfile);
+ sync_path (url);
+}
+
+void
+url_free (struct url *url)
+{
+ xfree (url->host);
+ xfree (url->path);
+ xfree (url->url);
+
+ FREE_MAYBE (url->params);
+ FREE_MAYBE (url->query);
+ FREE_MAYBE (url->fragment);
+ FREE_MAYBE (url->user);
+ FREE_MAYBE (url->passwd);
+
+ xfree (url->dir);
+ xfree (url->file);
+
+ xfree (url);
}
-#endif /* 0 */
\f
-urlpos *
+struct urlpos *
get_urls_file (const char *file)
{
struct file_memory *fm;
- urlpos *head, *tail;
+ struct urlpos *head, *tail;
const char *text, *text_end;
/* Load the file. */
--line_end;
if (line_end > line_beg)
{
- urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
+ int up_error_code;
+ char *url_text;
+ struct urlpos *entry;
+ struct url *url;
+
+ /* We must copy the URL to a zero-terminated string. *sigh*. */
+ url_text = strdupdelim (line_beg, line_end);
+ url = url_parse (url_text, &up_error_code);
+ if (!url)
+ {
+ logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
+ file, url_text, url_error (up_error_code));
+ xfree (url_text);
+ continue;
+ }
+ xfree (url_text);
+
+ entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
memset (entry, 0, sizeof (*entry));
entry->next = NULL;
- entry->url = strdupdelim (line_beg, line_end);
+ entry->url = url;
+
if (!head)
head = entry;
else
\f
/* Free the linked list of urlpos. */
void
-free_urlpos (urlpos *l)
+free_urlpos (struct urlpos *l)
{
while (l)
{
- urlpos *next = l->next;
- xfree (l->url);
+ struct urlpos *next = l->next;
+ if (l->url)
+ url_free (l->url);
FREE_MAYBE (l->local_name);
xfree (l);
l = next;
/* Return the path name of the URL-equivalent file name, with a
remote-like structure of directories. */
static char *
-mkstruct (const struct urlinfo *u)
+mkstruct (const struct url *u)
{
- char *host, *dir, *file, *res, *dirpref;
+ char *dir, *dir_preencoding;
+ char *file, *res, *dirpref;
+ char *query = u->query && *u->query ? u->query : NULL;
int l;
- assert (u->dir != NULL);
- assert (u->host != NULL);
-
if (opt.cut_dirs)
{
char *ptr = u->dir + (*u->dir == '/');
else
dir = u->dir + (*u->dir == '/');
- host = xstrdup (u->host);
/* Check for the true name (or at least a consistent name for saving
to directory) of HOST, reusing the hlist if possible. */
- if (opt.add_hostdir && !opt.simple_check)
- {
- char *nhost = realhost (host);
- xfree (host);
- host = nhost;
- }
- /* Add dir_prefix and hostname (if required) to the beginning of
- dir. */
if (opt.add_hostdir)
{
+ /* Add dir_prefix and hostname (if required) to the beginning of
+ dir. */
+ dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
+ + strlen (u->host)
+ + 1 + numdigit (u->port)
+ + 1);
if (!DOTP (opt.dir_prefix))
+ sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
+ else
+ strcpy (dirpref, u->host);
+
+ if (u->port != scheme_default_port (u->scheme))
{
- dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
- + strlen (host) + 1);
- sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
+ int len = strlen (dirpref);
+ dirpref[len] = ':';
+ long_to_string (dirpref + len + 1, u->port);
}
- else
- STRDUP_ALLOCA (dirpref, host);
}
- else /* not add_hostdir */
+ else /* not add_hostdir */
{
if (!DOTP (opt.dir_prefix))
dirpref = opt.dir_prefix;
else
dirpref = "";
}
- xfree (host);
/* If there is a prefix, prepend it. */
if (*dirpref)
sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
dir = newdir;
}
-  dir = encode_string (dir);
+
+  /* NOTE(review): reencode_string presumably returns its argument
+     unchanged when no quoting is needed (hence the dir !=
+     dir_preencoding test below).  In that case the trailing-slash
+     truncation below modifies the caller's buffer in place --
+     possibly u->dir itself when no prefix was prepended -- which the
+     old encode_string copy never did.  Confirm this is intended.  */
+  dir_preencoding = dir;
+  dir = reencode_string (dir_preencoding);
+
   l = strlen (dir);
   if (l && dir[l - 1] == '/')
     dir[l - 1] = '\0';
   file = u->file;
   /* Finally, construct the full name. */
-  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
+  /* Room for DIR, '/', FILE, an optional "?QUERY", and '\0'.  */
+  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
+			 + (query ? (1 + strlen (query)) : 0)
+			 + 1);
   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
-  xfree (dir);
+  if (query)
+    {
+      strcat (res, "?");
+      strcat (res, query);
+    }
+  /* Free the reencoded copy only if reencode_string allocated one.  */
+  if (dir != dir_preencoding)
+    xfree (dir);
   return res;
 }
-/* Return a malloced copy of S, but protect any '/' characters. */
+/* Compose a file name out of BASE, an unescaped file name, and QUERY,
+   an escaped query string.  Unsafe characters in BASE are escaped to
+   %XX, and slashes in QUERY are escaped to %2F so they cannot create
+   unwanted directories.  Input too long for the fixed-size work
+   buffer is truncated, but never written past the buffer.  */
 static char *
-file_name_protect_query_string (const char *s)
+compose_file_name (char *base, char *query)
 {
-  const char *from;
-  char *to, *dest;
-  int destlen = 0;
-  for (from = s; *from; from++)
+  char result[256];
+  char *from;
+  char *to = result;
+  /* END reserves one byte for the terminating '\0'.  */
+  char *end = result + sizeof (result) - 1;
+
+  /* Copy BASE to RESULT and encode all unsafe characters.  */
+  from = base;
+  while (*from && to < end)
     {
-      ++destlen;
-      if (*from == '/')
-	destlen += 2;		/* each / gets replaced with %2F, so
-				   it adds two more chars.  */
+      if (UNSAFE_CHAR (*from))
+	{
+	  unsigned char c = *from++;
+	  /* Never emit a partial escape: the previous bound check of
+	     `to - result < sizeof (result)' allowed these three
+	     stores to run up to two bytes past RESULT.  */
+	  if (end - to < 3)
+	    break;
+	  *to++ = '%';
+	  *to++ = XDIGIT_TO_XCHAR (c >> 4);
+	  *to++ = XDIGIT_TO_XCHAR (c & 0xf);
+	}
+      else
+	*to++ = *from++;
     }
-  dest = (char *)xmalloc (destlen + 1);
-  for (from = s, to = dest; *from; from++)
+
+  if (query && to < end)
     {
-      if (*from != '/')
-	*to++ = *from;
-      else
+      *to++ = '?';
+
+      /* Copy QUERY to RESULT and encode all '/' characters.  */
+      from = query;
+      while (*from && to < end)
	{
-	  *to++ = '%';
-	  *to++ = '2';
-	  *to++ = 'F';
+	  if (*from == '/')
+	    {
+	      /* Same three-byte guard as above.  */
+	      if (end - to < 3)
+		break;
+	      *to++ = '%';
+	      *to++ = '2';
+	      *to++ = 'F';
+	      ++from;
+	    }
+	  else
+	    *to++ = *from++;
	}
     }
-  assert (to - dest == destlen);
-  *to = '\0';
-  return dest;
+
+  /* TO never reached END, so the terminator always fits; whatever
+     did not fit -- presumably a huge query string -- is silently
+     dropped.  */
+  *to = '\0';
+
+  return xstrdup (result);
 }
/* Create a unique filename, corresponding to a given URL. Calls
mkstruct if necessary. Does *not* actually create any directories. */
char *
-url_filename (const struct urlinfo *u)
+url_filename (const struct url *u)
{
char *file, *name;
int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
}
else
{
- if (!*u->file)
- file = xstrdup ("index.html");
- else
- {
- /* If the URL came with a query string, u->file will contain
- a question mark followed by query string contents. These
- contents can contain '/' which would make us create
- unwanted directories. These slashes must be protected
- explicitly. */
- if (!strchr (u->file, '/'))
- file = xstrdup (u->file);
- else
- {
- /*assert (strchr (u->file, '?') != NULL);*/
- file = file_name_protect_query_string (u->file);
- }
- }
+ char *base = *u->file ? u->file : "index.html";
+ char *query = u->query && *u->query ? u->query : NULL;
+ file = compose_file_name (base, query);
}
if (!have_prefix)
static int
urlpath_length (const char *url)
{
- const char *q = strchr (url, '?');
- if (q)
- return q - url;
- return strlen (url);
+ const char *q = strpbrk_or_eos (url, "?;#");
+ return q - url;
}
/* Find the last occurrence of character C in the range [b, e), or
{
const char *end = base + urlpath_length (base);
- if (*link != '/')
+ if (!*link)
{
- /* LINK is a relative URL: we need to replace everything
- after last slash (possibly empty) with LINK.
-
- So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
- our result should be "whatever/foo/qux/xyzzy". */
- int need_explicit_slash = 0;
- int span;
- const char *start_insert;
- const char *last_slash = find_last_char (base, end, '/');
- if (!last_slash)
- {
- /* No slash found at all. Append LINK to what we have,
- but we'll need a slash as a separator.
-
- Example: if base == "foo" and link == "qux/xyzzy", then
- we cannot just append link to base, because we'd get
- "fooqux/xyzzy", whereas what we want is
- "foo/qux/xyzzy".
-
- To make sure the / gets inserted, we set
- need_explicit_slash to 1. We also set start_insert
- to end + 1, so that the length calculations work out
- correctly for one more (slash) character. Accessing
- that character is fine, since it will be the
- delimiter, '\0' or '?'. */
- /* example: "foo?..." */
- /* ^ ('?' gets changed to '/') */
- start_insert = end + 1;
- need_explicit_slash = 1;
- }
- else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
- {
- /* example: http://host" */
- /* ^ */
- start_insert = end + 1;
- need_explicit_slash = 1;
- }
- else
- {
- /* example: "whatever/foo/bar" */
- /* ^ */
- start_insert = last_slash + 1;
- }
-
- span = start_insert - base;
- constr = (char *)xmalloc (span + linklength + 1);
- if (span)
- memcpy (constr, base, span);
- if (need_explicit_slash)
- constr[span - 1] = '/';
- if (linklength)
- memcpy (constr + span, link, linklength);
- constr[span + linklength] = '\0';
+ /* Empty LINK points back to BASE, query string and all. */
+ constr = xstrdup (base);
+ }
+ else if (*link == '?')
+ {
+ /* LINK points to the same location, but changes the query
+ string. Examples: */
+ /* uri_merge("path", "?new") -> "path?new" */
+ /* uri_merge("path?foo", "?new") -> "path?new" */
+ /* uri_merge("path?foo#bar", "?new") -> "path?new" */
+ /* uri_merge("path#foo", "?new") -> "path?new" */
+ int baselength = end - base;
+ constr = xmalloc (baselength + linklength + 1);
+ memcpy (constr, base, baselength);
+ memcpy (constr + baselength, link, linklength);
+ constr[baselength + linklength] = '\0';
+ }
+ else if (*link == '#')
+ {
+ /* uri_merge("path", "#new") -> "path#new" */
+ /* uri_merge("path#foo", "#new") -> "path#new" */
+ /* uri_merge("path?foo", "#new") -> "path?foo#new" */
+ /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
+ int baselength;
+ const char *end1 = strchr (base, '#');
+ if (!end1)
+ end1 = base + strlen (base);
+ baselength = end1 - base;
+ constr = xmalloc (baselength + linklength + 1);
+ memcpy (constr, base, baselength);
+ memcpy (constr + baselength, link, linklength);
+ constr[baselength + linklength] = '\0';
}
- else /* *link == `/' */
+ else if (*link == '/')
{
/* LINK is an absolute path: we need to replace everything
after (and including) the FIRST slash with LINK.
memcpy (constr + span, link, linklength);
constr[span + linklength] = '\0';
}
+ else
+ {
+ /* LINK is a relative URL: we need to replace everything
+ after last slash (possibly empty) with LINK.
+
+ So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
+ our result should be "whatever/foo/qux/xyzzy". */
+ int need_explicit_slash = 0;
+ int span;
+ const char *start_insert;
+ const char *last_slash = find_last_char (base, end, '/');
+ if (!last_slash)
+ {
+ /* No slash found at all. Append LINK to what we have,
+ but we'll need a slash as a separator.
+
+ Example: if base == "foo" and link == "qux/xyzzy", then
+ we cannot just append link to base, because we'd get
+ "fooqux/xyzzy", whereas what we want is
+ "foo/qux/xyzzy".
+
+ To make sure the / gets inserted, we set
+ need_explicit_slash to 1. We also set start_insert
+ to end + 1, so that the length calculations work out
+ correctly for one more (slash) character. Accessing
+ that character is fine, since it will be the
+ delimiter, '\0' or '?'. */
+ /* example: "foo?..." */
+ /* ^ ('?' gets changed to '/') */
+ start_insert = end + 1;
+ need_explicit_slash = 1;
+ }
+ else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
+ {
+ /* example: http://host" */
+ /* ^ */
+ start_insert = end + 1;
+ need_explicit_slash = 1;
+ }
+ else
+ {
+ /* example: "whatever/foo/bar" */
+ /* ^ */
+ start_insert = last_slash + 1;
+ }
+
+ span = start_insert - base;
+ constr = (char *)xmalloc (span + linklength + 1);
+ if (span)
+ memcpy (constr, base, span);
+ if (need_explicit_slash)
+ constr[span - 1] = '/';
+ if (linklength)
+ memcpy (constr + span, link, linklength);
+ constr[span + linklength] = '\0';
+ }
}
else /* !no_scheme */
{
return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
}
\f
-/* Optimize URL by host, destructively replacing u->host with realhost
-   (u->host).  Do this regardless of opt.simple_check.  */
-void
-opt_url (struct urlinfo *u)
+/* Append string S at P and advance P past it.  NOTE: S is evaluated
+   twice (strlen and memcpy), so pass only side-effect-free
+   expressions.  */
+#define APPEND(p, s) do {			\
+  int len = strlen (s);				\
+  memcpy (p, s, len);				\
+  p += len;					\
+} while (0)
+
+/* Use this instead of password when the actual password is supposed
+   to be hidden.  We intentionally use a generic string without giving
+   away the number of characters in the password, like previous
+   versions did.  */
+#define HIDDEN_PASSWORD "*password*"
+
+/* Recreate the URL string from the data in URL.
+
+   If HIDE is non-zero (as it is when we're calling this on a URL we
+   plan to print, but not when calling it to canonicalize a URL for
+   use within the program), password will be hidden.  Unsafe
+   characters in the URL will be quoted.  */
+
+char *
+url_string (const struct url *url, int hide_password)
 {
-  /* Find the "true" host.  */
-  char *host = realhost (u->host);
-  xfree (u->host);
-  u->host = host;
-  assert (u->dir != NULL);	/* the URL must have been parsed */
-  /* Refresh the printed representation.  */
-  xfree (u->url);
-  u->url = str_url (u, 0);
+  int size;
+  char *result, *p;
+  char *quoted_user = NULL, *quoted_passwd = NULL;
+
+  /* NOTE(review): this indexes supported_schemes[] directly with
+     URL->scheme; it assumes the enum url_scheme values match the
+     array order, and that the scheme is valid (scheme_str non-NULL,
+     asserted below).  */
+  int scheme_port = supported_schemes[url->scheme].default_port;
+  char *scheme_str = supported_schemes[url->scheme].leading_string;
+  int fplen = full_path_length (url);
+
+  assert (scheme_str != NULL);
+
+  /* Make sure the user name and password are quoted.  */
+  if (url->user)
+    {
+      quoted_user = encode_string_maybe (url->user);
+      if (url->passwd)
+	{
+	  if (hide_password)
+	    quoted_passwd = HIDDEN_PASSWORD;
+	  else
+	    quoted_passwd = encode_string_maybe (url->passwd);
+	}
+    }
+
+  /* SIZE is an exact byte count: scheme + [user[:password]@] + host
+     + [:port] + full path + '\0'; the assert below verifies it.  */
+  size = (strlen (scheme_str)
+	  + strlen (url->host)
+	  + fplen
+	  + 1);
+  if (url->port != scheme_port)
+    size += 1 + numdigit (url->port);
+  if (quoted_user)
+    {
+      size += 1 + strlen (quoted_user);
+      if (quoted_passwd)
+	size += 1 + strlen (quoted_passwd);
+    }
+
+  p = result = xmalloc (size);
+
+  APPEND (p, scheme_str);
+  if (quoted_user)
+    {
+      APPEND (p, quoted_user);
+      if (quoted_passwd)
+	{
+	  *p++ = ':';
+	  APPEND (p, quoted_passwd);
+	}
+      *p++ = '@';
+    }
+
+  APPEND (p, url->host);
+  if (url->port != scheme_port)
+    {
+      *p++ = ':';
+      long_to_string (p, url->port);
+      p += strlen (p);
+    }
+
+  full_path_write (url, p);
+  p += fplen;
+  *p++ = '\0';
+
+  assert (p - result == size);
+
+  /* encode_string_maybe presumably returns its argument unchanged
+     when no quoting was needed -- free only freshly allocated copies.
+     HIDDEN_PASSWORD is a static literal; the !hide_password test
+     keeps it from being freed.  */
+  if (quoted_user && quoted_user != url->user)
+    xfree (quoted_user);
+  if (quoted_passwd && !hide_password
+      && quoted_passwd != url->passwd)
+    xfree (quoted_passwd);
+
+  return result;
 }
\f
/* Returns proxy host address, in accordance with SCHEME. */
return NULL;
/* Handle shorthands. */
- rewritten_url = rewrite_url_maybe (proxy);
+ rewritten_url = rewrite_shorthand_url (proxy);
if (rewritten_url)
{
strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
/* Change the links in an HTML document. Accepts a structure that
defines the positions of all the links. */
void
-convert_links (const char *file, urlpos *l)
+convert_links (const char *file, struct urlpos *l)
{
struct file_memory *fm;
FILE *fp;
const char *p;
downloaded_file_t downloaded_file_return;
+ int to_url_count = 0, to_file_count = 0;
logprintf (LOG_VERBOSE, _("Converting %s... "), file);
/* First we do a "dry run": go through the list L and see whether
any URL needs to be converted in the first place. If not, just
leave the file alone. */
- int count = 0;
- urlpos *dry = l;
+ int dry_count = 0;
+ struct urlpos *dry = l;
for (dry = l; dry; dry = dry->next)
if (dry->convert != CO_NOCONVERT)
- ++count;
- if (!count)
+ ++dry_count;
+ if (!dry_count)
{
logputs (LOG_VERBOSE, _("nothing to do.\n"));
return;
/* If the URL is not to be converted, skip it. */
if (l->convert == CO_NOCONVERT)
{
- DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
+ DEBUGP (("Skipping %s at position %d.\n", l->url->url, l->pos));
continue;
}
char *quoted_newname = html_quote_string (newname);
replace_attr (&p, l->size, fp, quoted_newname);
DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
- l->url, newname, l->pos, file));
+ l->url->url, newname, l->pos, file));
xfree (newname);
xfree (quoted_newname);
+ ++to_file_count;
}
else if (l->convert == CO_CONVERT_TO_COMPLETE)
{
/* Convert the link to absolute URL. */
- char *newlink = l->url;
+ char *newlink = l->url->url;
char *quoted_newlink = html_quote_string (newlink);
replace_attr (&p, l->size, fp, quoted_newlink);
DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
newlink, l->pos, file));
xfree (quoted_newlink);
+ ++to_url_count;
}
}
/* Output the rest of the file. */
fwrite (p, 1, fm->length - (p - fm->content), fp);
fclose (fp);
read_file_free (fm);
- logputs (LOG_VERBOSE, _("done.\n"));
+ logprintf (LOG_VERBOSE,
+ _("%d-%d\n"), to_file_count, to_url_count);
}
/* Construct and return a malloced copy of the relative link from two
return res;
}
\f
-/* Add URL to the head of the list L. */
-urlpos *
-add_url (urlpos *l, const char *url, const char *file)
-{
- urlpos *t;
-
- t = (urlpos *)xmalloc (sizeof (urlpos));
- memset (t, 0, sizeof (*t));
- t->url = xstrdup (url);
- t->local_name = xstrdup (file);
- t->next = l;
- return t;
-}
-
static void
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
{
-- Dan Harkless <wget@harkless.org>
This [adding a field to the urlpos structure] didn't work
- because convert_file() is called twice: once after all its
- sublinks have been retrieved in recursive_retrieve(), and
- once at the end of the day in convert_all_links(). The
- original linked list collected in recursive_retrieve() is
- lost after the first invocation of convert_links(), and
- convert_all_links() makes a new one (it calls get_urls_html()
- for each file it covers.) That's why your first approach didn't
- work. The way to make it work is perhaps to make this flag a
- field in the `urls_html' list.
+ because convert_file() is called from convert_all_links at
+ the end of the retrieval with a freshly built new urlpos
+ list.
-- Hrvoje Niksic <hniksic@arsdigita.com>
*/
converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
return 0;
}
-typedef struct _downloaded_file_list {
- char* file;
- downloaded_file_t download_type;
- struct _downloaded_file_list* next;
-} downloaded_file_list;
+/* We're storing "modes" of type downloaded_file_t in the hash table.
+ However, our hash tables only accept pointers for keys and values.
+ So when we need a pointer, we use the address of a
+ downloaded_file_t variable of static storage. */
+
+/* Map MODE to the address of a static variable holding that same
+   value.  Each mode gets a distinct address that stays valid for the
+   lifetime of the program, so it is safe to store in the hash
+   table.  */
+static downloaded_file_t *
+downloaded_mode_to_ptr (downloaded_file_t mode)
+{
+  static downloaded_file_t
+    v1 = FILE_NOT_ALREADY_DOWNLOADED,
+    v2 = FILE_DOWNLOADED_NORMALLY,
+    v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
+    v4 = CHECK_FOR_FILE;
+
+  switch (mode)
+    {
+    case FILE_NOT_ALREADY_DOWNLOADED:
+      return &v1;
+    case FILE_DOWNLOADED_NORMALLY:
+      return &v2;
+    case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
+      return &v3;
+    case CHECK_FOR_FILE:
+      return &v4;
+    }
+  /* Not reached with a valid downloaded_file_t value.  */
+  return NULL;
+}
+
+/* This should really be merged with dl_file_url_map and
+ downloaded_html_files in recur.c. This was originally a list, but
+   I changed it to a hash table because it was actually taking a lot of
+ time to find things in it. */
-static downloaded_file_list *downloaded_files;
+static struct hash_table *downloaded_files_hash;
/* Remembers which files have been downloaded. In the standard case, should be
called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
with local filenames, not remote URLs. */
downloaded_file_t
-downloaded_file (downloaded_file_t mode, const char* file)
+downloaded_file (downloaded_file_t mode, const char *file)
{
- boolean found_file = FALSE;
- downloaded_file_list* rover = downloaded_files;
-
- while (rover != NULL)
- if (strcmp(rover->file, file) == 0)
- {
- found_file = TRUE;
- break;
- }
- else
- rover = rover->next;
+ downloaded_file_t *ptr;
- if (found_file)
- return rover->download_type; /* file had already been downloaded */
- else
+ if (mode == CHECK_FOR_FILE)
{
- if (mode != CHECK_FOR_FILE)
- {
- rover = xmalloc(sizeof(*rover));
- rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
- rover->download_type = mode;
- rover->next = downloaded_files;
- downloaded_files = rover;
- }
-
- return FILE_NOT_ALREADY_DOWNLOADED;
+ if (!downloaded_files_hash)
+ return FILE_NOT_ALREADY_DOWNLOADED;
+ ptr = hash_table_get (downloaded_files_hash, file);
+ if (!ptr)
+ return FILE_NOT_ALREADY_DOWNLOADED;
+ return *ptr;
}
+
+ if (!downloaded_files_hash)
+ downloaded_files_hash = make_string_hash_table (0);
+
+ ptr = hash_table_get (downloaded_files_hash, file);
+ if (ptr)
+ return *ptr;
+
+ ptr = downloaded_mode_to_ptr (mode);
+ hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
+
+ return FILE_NOT_ALREADY_DOWNLOADED;
+}
+
+/* Mapper for hash_table_map: free the strdup'ed KEY of each entry.
+   The values are not freed here (they are addresses of statics, not
+   heap objects).  Returns 0 -- presumably "keep iterating"; confirm
+   against the hash_table_map contract.  */
+static int
+df_free_mapper (void *key, void *value, void *ignored)
+{
+  xfree (key);
+  return 0;
+}
void
downloaded_files_free (void)
{
- downloaded_file_list* rover = downloaded_files;
- while (rover)
+ if (downloaded_files_hash)
{
- downloaded_file_list *next = rover->next;
- xfree (rover->file);
- xfree (rover);
- rover = next;
+ hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
+ hash_table_destroy (downloaded_files_hash);
+ downloaded_files_hash = NULL;
}
}