X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Furl.c;h=8cbb8923e2fed5898ca42967eefe0f479dcf671e;hb=857c3e2ca4ce9f87bc6b9bf71b80f63f6db08fef;hp=da270019ccea36f24f657253ca5de11fc0179dac;hpb=3a8f6494d7fefbea98114261cfce7b971b8b1fa5;p=wget diff --git a/src/url.c b/src/url.c index da270019..8cbb8923 100644 --- a/src/url.c +++ b/src/url.c @@ -1,6 +1,5 @@ /* URL handling. - Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003 - Free Software Foundation, Inc. + Copyright (C) 2005 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -88,13 +87,14 @@ static int path_simplify PARAMS ((char *)); changing the meaning of the URL. For example, you can't decode "/foo/%2f/bar" into "/foo///bar" because the number and contents of path components is different. Non-reserved characters can be - changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". Wget - uses the rfc1738 set of reserved characters, plus "$" and ",", as - recommended by rfc2396. - - An unsafe characters is the one that should be encoded when URLs - are placed in foreign environments. E.g. space and newline are - unsafe in HTTP contexts because HTTP uses them as separator and + changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". The + unsafe characters are loosely based on rfc1738, plus "$" and ",", + as recommended by rfc2396, and minus "~", which is very frequently + used (and sometimes unrecognized as %7E by broken servers). + + An unsafe character is the one that should be encoded when URLs are + placed in foreign environments. E.g. space and newline are unsafe + in HTTP contexts because HTTP uses them as separator and line terminator, so they must be encoded to %20 and %0A respectively. "*" is unsafe in shell context, etc. @@ -118,7 +118,7 @@ enum { #define U urlchr_unsafe #define RU R|U -const static unsigned char urlchr_table[256] = +static const unsigned char urlchr_table[256] = { U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */ U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */ @@ -135,7 +135,7 @@ const static unsigned char urlchr_table[256] = U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */ 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */ 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ - 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */ + 0, 0, 0, U, U, U, 0, U, /* x y z { | } ~ DEL */ U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, @@ -175,10 +175,16 @@ url_unescape (char *s) } else { + char c; /* Do nothing if '%' is not followed by two hex digits. */ if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2]))) goto copychar; - *t = X2DIGITS_TO_NUM (h[1], h[2]); + c = X2DIGITS_TO_NUM (h[1], h[2]); + /* Don't unescape %00 because there is no way to insert it + into a C string without effectively truncating it. */ + if (c == '\0') + goto copychar; + *t = c; h += 2; } } @@ -249,34 +255,27 @@ url_escape_allow_passthrough (const char *s) return url_escape_1 (s, urlchr_unsafe, 1); } -enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH }; +/* Decide whether the char at position P needs to be encoded. (It is + not enough to pass a single char *P because the function may need + to inspect the surrounding context.) -/* Decide whether to encode, decode, or pass through the char at P. - This used to be a macro, but it got a little too convoluted. */ -static inline enum copy_method -decide_copy_method (const char *p) + Return 1 if the char should be escaped as %XX, 0 otherwise. 
*/ + +static inline int +char_needs_escaping (const char *p) { if (*p == '%') { if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2))) - { - /* %xx sequence: decode it, unless it would decode to an - unsafe or a reserved char; in that case, leave it as - is. */ - char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2)); - if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt)) - return CM_PASSTHROUGH; - else - return CM_DECODE; - } + return 0; else /* Garbled %.. sequence: encode `%'. */ - return CM_ENCODE; + return 1; } else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p)) - return CM_ENCODE; + return 1; else - return CM_PASSTHROUGH; + return 0; } /* Translate a %-escaped (but possibly non-conformant) input string S @@ -286,34 +285,33 @@ decide_copy_method (const char *p) After a URL has been run through this function, the protocols that use `%' as the quote character can use the resulting string as-is, - while those that don't call url_unescape() to get to the intended - data. This function is also stable: after an input string is - transformed the first time, all further transformations of the - result yield the same result string. + while those that don't can use url_unescape to get to the intended + data. This function is stable: once the input is transformed, + further transformations of the result yield the same output. Let's discuss why this function is needed. - Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw - space character would mess up the HTTP request, it needs to be - quoted, like this: + Imagine Wget is asked to retrieve `http://abc.xyz/abc def'. Since + a raw space character would mess up the HTTP request, it needs to + be quoted, like this: GET /abc%20def HTTP/1.0 - It appears that the unsafe chars need to be quoted, for example - with url_escape. But what if we're requested to download + It would appear that the unsafe chars need to be quoted, for + example with url_escape. But what if we're requested to download `abc%20def'? url_escape transforms "%" to "%25", which would leave us with `abc%2520def'. This is incorrect -- since %-escapes are part of URL syntax, "%20" is the correct way to denote a literal - space on the Wget command line. This leaves us in the conclusion - that in that case Wget should not call url_escape, but leave the - `%20' as is. + space on the Wget command line. This leads to the conclusion that + in that case Wget should not call url_escape, but leave the `%20' + as is. This is clearly contradictory, but it only gets worse. - And what if the requested URI is `abc%20 def'? If we call - url_escape, we end up with `/abc%2520%20def', which is almost - certainly not intended. If we don't call url_escape, we are left - with the embedded space and cannot complete the request. What the - user meant was for Wget to request `/abc%20%20def', and this is - where reencode_escapes kicks in. + What if the requested URI is `abc%20 def'? If we call url_escape, + we end up with `/abc%2520%20def', which is almost certainly not + intended. If we don't call url_escape, we are left with the + embedded space and cannot complete the request. What the user + meant was for Wget to request `/abc%20%20def', and this is where + reencode_escapes kicks in. Wget used to solve this by first decoding %-quotes, and then encoding all the "unsafe" characters found in the resulting string. @@ -327,25 +325,25 @@ decide_copy_method (const char *p) literal plus. reencode_escapes correctly translates the above to "a%2B+b", i.e. returns the original string. 
- This function uses an algorithm proposed by Anon Sricharoenchai: + This function uses a modified version of the algorithm originally + proposed by Anon Sricharoenchai: - 1. Encode all URL_UNSAFE and the "%" that are not followed by 2 - hexdigits. + * Encode all "unsafe" characters, except those that are also + "reserved", to %XX. See urlchr_table for which characters are + unsafe and reserved. - 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and - "+". + * Encode the "%" characters not followed by two hex digits to + "%25". - ...except that this code conflates the two steps, and decides - whether to encode, decode, or pass through each character in turn. - The function still uses two passes, but their logic is the same -- - the first pass exists merely for the sake of allocation. Another - small difference is that we include `+' to URL_RESERVED. + * Pass through all other characters and %XX escapes as-is. (Up to + Wget 1.10 this decoded %XX escapes corresponding to "safe" + characters, but that was obtrusive and broke some servers.) Anon's test case: "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc" -> - "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc" + "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc" Simpler test cases: @@ -366,58 +364,38 @@ reencode_escapes (const char *s) int oldlen, newlen; int encode_count = 0; - int decode_count = 0; - /* First, pass through the string to see if there's anything to do, + /* First pass: inspect the string to see if there's anything to do, and to calculate the new length. */ for (p1 = s; *p1; p1++) - { - switch (decide_copy_method (p1)) - { - case CM_ENCODE: - ++encode_count; - break; - case CM_DECODE: - ++decode_count; - break; - case CM_PASSTHROUGH: - break; - } - } + if (char_needs_escaping (p1)) + ++encode_count; - if (!encode_count && !decode_count) + if (!encode_count) /* The string is good as it is. */ - return (char *)s; /* C const model sucks. */ + return (char *) s; /* C const model sucks. */ oldlen = p1 - s; - /* Each encoding adds two characters (hex digits), while each - decoding removes two characters. */ - newlen = oldlen + 2 * (encode_count - decode_count); + /* Each encoding adds two characters (hex digits). */ + newlen = oldlen + 2 * encode_count; newstr = xmalloc (newlen + 1); + /* Second pass: copy the string to the destination address, encoding + chars when needed. */ p1 = s; p2 = newstr; while (*p1) - { - switch (decide_copy_method (p1)) - { - case CM_ENCODE: - { - unsigned char c = *p1++; - *p2++ = '%'; - *p2++ = XNUM_TO_DIGIT (c >> 4); - *p2++ = XNUM_TO_DIGIT (c & 0xf); - } - break; - case CM_DECODE: - *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]); - p1 += 3; /* skip %xx */ - break; - case CM_PASSTHROUGH: - *p2++ = *p1++; - } - } + if (char_needs_escaping (p1)) + { + unsigned char c = *p1++; + *p2++ = '%'; + *p2++ = XNUM_TO_DIGIT (c >> 4); + *p2++ = XNUM_TO_DIGIT (c & 0xf); + } + else + *p2++ = *p1++; + *p2 = '\0'; assert (p2 - newstr == newlen); return newstr; @@ -545,7 +523,7 @@ rewrite_shorthand_url (const char *url) { const char *p; - if (url_has_scheme (url)) + if (url_scheme (url) != SCHEME_INVALID) return NULL; /* Look for a ':' or '/'. The former signifies NcFTP syntax, the @@ -556,6 +534,12 @@ rewrite_shorthand_url (const char *url) if (p == url) return NULL; + /* If we're looking at "://", it means the URL uses a scheme we + don't support, which may include "https" when compiled without + SSL support. Don't bogusly rewrite such URLs. 
*/ + if (p[0] == ':' && p[1] == '/' && p[2] == '/') + return NULL; + if (*p == ':') { const char *pp; @@ -605,21 +589,21 @@ static void split_path PARAMS ((const char *, char **, char **)); #define strpbrk_or_eos(s, accept) ({ \ char *SOE_p = strpbrk (s, accept); \ if (!SOE_p) \ - SOE_p = (char *)s + strlen (s); \ + SOE_p = strchr (s, '\0'); \ SOE_p; \ }) #else /* not __GNUC__ */ -static char * +static inline char * strpbrk_or_eos (const char *s, const char *accept) { char *p = strpbrk (s, accept); if (!p) - p = (char *)s + strlen (s); + p = strchr (s, '\0'); return p; } -#endif +#endif /* not __GNUC__ */ /* Turn STR into lowercase; return non-zero if a character was actually changed. */ @@ -688,7 +672,7 @@ url_parse (const char *url, int *error) if (scheme == SCHEME_INVALID) { error_code = PE_UNSUPPORTED_SCHEME; - goto error; + goto err; } url_encoded = reencode_escapes (url); @@ -726,7 +710,7 @@ url_parse (const char *url, int *error) if (!host_e) { error_code = PE_UNTERMINATED_IPV6_ADDRESS; - goto error; + goto err; } #ifdef ENABLE_IPV6 @@ -734,14 +718,14 @@ url_parse (const char *url, int *error) if (!is_valid_ipv6_address(host_b, host_e)) { error_code = PE_INVALID_IPV6_ADDRESS; - goto error; + goto err; } /* Continue parsing after the closing ']'. */ p = host_e + 1; #else error_code = PE_IPV6_NOT_SUPPORTED; - goto error; + goto err; #endif } else @@ -753,7 +737,7 @@ url_parse (const char *url, int *error) if (host_b == host_e) { error_code = PE_EMPTY_HOST; - goto error; + goto err; } port = scheme_default_port (scheme); @@ -778,9 +762,16 @@ url_parse (const char *url, int *error) /* http://host:12randomgarbage/blah */ /* ^ */ error_code = PE_BAD_PORT_NUMBER; - goto error; + goto err; } port = 10 * port + (*pp - '0'); + /* Check for too large port numbers here, before we have + a chance to overflow on bogus port values. */ + if (port > 65535) + { + error_code = PE_BAD_PORT_NUMBER; + goto err; + } } } } @@ -838,7 +829,7 @@ url_parse (const char *url, int *error) if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd)) { error_code = PE_INVALID_USER_NAME; - goto error; + goto err; } } @@ -856,8 +847,9 @@ url_parse (const char *url, int *error) host_modified = lowercase_str (u->host); /* Decode %HH sequences in host name. This is important not so much - to support %HH sequences, but to support binary characters (which - will have been converted to %HH by reencode_escapes). */ + to support %HH sequences in host names (which other browser + don't), but to support binary characters (which will have been + converted to %HH by reencode_escapes). */ if (strchr (u->host, '%')) { url_unescape (u->host); @@ -888,11 +880,10 @@ url_parse (const char *url, int *error) else u->url = url_encoded; } - url_encoded = NULL; return u; - error: + err: /* Cleanup in case of error: */ if (url_encoded && url_encoded != url) xfree (url_encoded); @@ -1002,7 +993,7 @@ char * url_full_path (const struct url *url) { int length = full_path_length (url); - char *full_path = (char *)xmalloc(length + 1); + char *full_path = (char *) xmalloc (length + 1); full_path_write (url, full_path); full_path[length] = '\0'; @@ -1010,35 +1001,41 @@ url_full_path (const struct url *url) return full_path; } -/* Escape unsafe and reserved characters, except for the slash - characters. */ +/* Unescape CHR in an otherwise escaped STR. Used to selectively + escaping of certain characters, such as "/" and ":". Returns a + count of unescaped chars. 
*/
 
-static char *
-url_escape_dir (const char *dir)
+static void
+unescape_single_char (char *str, char chr)
 {
-  char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
-  char *h, *t;
-  if (newdir == dir)
-    return (char *)dir;
-
-  /* Unescape slashes in NEWDIR. */
-
-  h = newdir;			/* hare */
-  t = newdir;			/* tortoise */
-
+  const char c1 = XNUM_TO_DIGIT (chr >> 4);
+  const char c2 = XNUM_TO_DIGIT (chr & 0xf);
+  char *h = str;		/* hare */
+  char *t = str;		/* tortoise */
   for (; *h; h++, t++)
     {
-      /* url_escape_1 having converted '/' to "%2F" exactly. */
-      if (*h == '%' && h[1] == '2' && h[2] == 'F')
+      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
 	{
-	  *t = '/';
+	  *t = chr;
 	  h += 2;
 	}
       else
 	*t = *h;
     }
   *t = '\0';
+}
+
+/* Escape unsafe and reserved characters, except for the slash
+   characters.  */
+static char *
+url_escape_dir (const char *dir)
+{
+  char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
+  if (newdir == dir)
+    return (char *)dir;
+
+  unescape_single_char (newdir, '/');
   return newdir;
 }
 
@@ -1075,7 +1072,7 @@ sync_path (struct url *u)
       *p++ = '/';
       memcpy (p, efile, filelen);
       p += filelen;
-      *p++ = '\0';
+      *p = '\0';
     }
 
   u->path = newpath;
@@ -1129,7 +1126,7 @@ url_free (struct url *url)
 }
 
 /* Create all the necessary directories for PATH (a file).  Calls
-   mkdirhier() internally.  */
+   make_directory internally.  */
 int
 mkalldirs (const char *path)
 {
@@ -1257,7 +1254,7 @@ enum {
    translate file name back to URL, this would become important
    crucial.  Right now, it's better to be minimal in escaping.  */
 
-const static unsigned char filechr_table[256] =
+static const unsigned char filechr_table[256] =
 {
 UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
   C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
@@ -1520,11 +1517,6 @@ url_file_name (const struct url *u)
    "back up one element."  Single leading and trailing slashes are
    preserved.
 
-   This function does not handle URL escapes explicitly.  If you're
-   passing paths from URLs, make sure to unquote "%2e" and "%2E" to
-   ".", so that this function can find the dots.  (Wget's URL parser
-   calls reencode_escapes, which see.)
-
    For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
    test examples are provided below.  If you change anything in this
    function, run test_path_simplify to make sure you haven't broken a
@@ -1838,7 +1830,7 @@ url_string (const struct url *url, int hide_password)
 {
   int size;
   char *result, *p;
-  char *quoted_user = NULL, *quoted_passwd = NULL;
+  char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
 
   int scheme_port = supported_schemes[url->scheme].default_port;
   const char *scheme_str = supported_schemes[url->scheme].leading_string;
@@ -1861,12 +1853,19 @@ url_string (const struct url *url, int hide_password)
 	}
     }
 
-  /* Numeric IPv6 addresses can contain ':' and need to be quoted with
-     brackets.  */
-  brackets_around_host = strchr (url->host, ':') != NULL;
+  /* In the unlikely event that the host name contains non-printable
+     characters, quote it for displaying to the user.  */
+  quoted_host = url_escape_allow_passthrough (url->host);
+
+  /* Undo the quoting of colons that URL escaping performs.  IPv6
+     addresses may legally contain colons, and in that case must be
+     placed in square brackets.  */
+  if (quoted_host != url->host)
+    unescape_single_char (quoted_host, ':');
+  brackets_around_host = strchr (quoted_host, ':') != NULL;
 
   size = (strlen (scheme_str)
-	  + strlen (url->host)
+	  + strlen (quoted_host)
 	  + (brackets_around_host ? 2 : 0)
 	  + fplen
 	  + 1);
 
@@ -1895,7 +1894,7 @@ url_string (const struct url *url, int hide_password)
 
   if (brackets_around_host)
     *p++ = '[';
-  APPEND (p, url->host);
+  APPEND (p, quoted_host);
   if (brackets_around_host)
     *p++ = ']';
   if (url->port != scheme_port)
@@ -1912,9 +1911,10 @@ url_string (const struct url *url, int hide_password)
 
   if (quoted_user && quoted_user != url->user)
     xfree (quoted_user);
-  if (quoted_passwd && !hide_password
-      && quoted_passwd != url->passwd)
+  if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
     xfree (quoted_passwd);
+  if (quoted_host != url->host)
+    xfree (quoted_host);
 
   return result;
 }
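
The two-pass scheme that the patch introduces in char_needs_escaping and
reencode_escapes can be exercised in isolation with the minimal sketch
below.  This is not the wget code: the urlchr_table lookup is replaced by a
reduced stand-in predicate is_unsafe(), and the helper names needs_escaping
and reencode are illustrative only.

/* Minimal sketch of the two-pass reencoding idea; not the wget
   implementation.  is_unsafe() is a reduced stand-in for urlchr_table,
   and ASCII input is assumed.  */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in "unsafe" predicate: controls, space, '"', DEL and above.  */
static int
is_unsafe (unsigned char c)
{
  return c <= 0x20 || c == '"' || c >= 0x7f;
}

/* Return 1 if the char at P must be escaped as %XX, 0 otherwise.  */
static int
needs_escaping (const char *p)
{
  if (*p == '%')
    /* Keep well-formed %XX escapes; a lone '%' becomes "%25".  */
    return !(isxdigit ((unsigned char) p[1])
             && isxdigit ((unsigned char) p[2]));
  return is_unsafe ((unsigned char) *p);
}

static char *
reencode (const char *s)
{
  static const char hex[] = "0123456789ABCDEF";
  const char *p;
  char *out, *q;
  size_t extra = 0;

  /* First pass: count the characters that grow from 1 to 3 bytes.  */
  for (p = s; *p; p++)
    if (needs_escaping (p))
      extra += 2;

  out = malloc (strlen (s) + extra + 1);
  if (!out)
    {
      perror ("malloc");
      exit (1);
    }

  /* Second pass: copy, escaping where needed.  */
  for (p = s, q = out; *p; p++)
    if (needs_escaping (p))
      {
        unsigned char c = (unsigned char) *p;
        *q++ = '%';
        *q++ = hex[c >> 4];
        *q++ = hex[c & 0xf];
      }
    else
      *q++ = *p;
  *q = '\0';
  return out;
}

int
main (void)
{
  char *once = reencode ("/abc%20 def%xy");
  char *twice = reencode (once);   /* stability check */
  printf ("%s\n%s\n", once, twice);
  free (once);
  free (twice);
  return 0;
}

Both printed lines are "/abc%20%20def%25xy": well-formed %XX escapes and
safe characters pass through, the bare "%" becomes "%25", and the space
becomes "%20".  Reencoding the result changes nothing, which is the
stability property the comment in the patch relies on.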
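
The selective unescaping that url_escape_dir and url_string lean on
(unescape_single_char in the patch) uses the same hare-and-tortoise
in-place rewrite.  A rough standalone sketch follows; unescape_one is a
hypothetical name rather than the wget function, and uppercase hex escapes
("%2F", not "%2f") are assumed, as produced by the escaping pass.

/* Standalone sketch of selective in-place unescaping; unescape_one is a
   hypothetical helper, not the wget function.  */

#include <stdio.h>

static void
unescape_one (char *str, char chr)
{
  static const char hex[] = "0123456789ABCDEF";
  const char c1 = hex[(unsigned char) chr >> 4];
  const char c2 = hex[(unsigned char) chr & 0xf];
  char *h = str;                /* hare: read cursor */
  char *t = str;                /* tortoise: write cursor */

  for (; *h; h++, t++)
    {
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
        {
          *t = chr;             /* collapse "%2F" back to '/', etc. */
          h += 2;
        }
      else
        *t = *h;
    }
  *t = '\0';
}

int
main (void)
{
  char dir[] = "a%2Fb%20c%2Fd";
  unescape_one (dir, '/');
  printf ("%s\n", dir);
  return 0;
}

This prints "a/b%20c/d": only the "%2F" escapes collapse back to slashes,
while "%20" is left untouched.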