2 Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
3 2005, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or (at
10 your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget. If not, see <http://www.gnu.org/licenses/>.
20 Additional permission under GNU GPL version 3 section 7
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work. */
42 #include "host.h" /* for is_valid_ipv6_address */
46 #endif /* def __VMS */
53 scm_disabled = 1, /* for https when OpenSSL fails to init. */
54 scm_has_params = 2, /* whether scheme has ;params */
55 scm_has_query = 4, /* whether scheme has ?query */
56 scm_has_fragment = 8 /* whether scheme has #fragment */
61 /* Short name of the scheme, such as "http" or "ftp". */
63 /* Leading string that identifies the scheme, such as "https://". */
64 const char *leading_string;
65 /* Default port of the scheme when none is specified. */
71 /* Supported schemes: */
72 static struct scheme_data supported_schemes[] =
74 { "http", "http://", DEFAULT_HTTP_PORT, scm_has_query|scm_has_fragment },
76 { "https", "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
78 { "ftp", "ftp://", DEFAULT_FTP_PORT, scm_has_params|scm_has_fragment },
84 /* Forward declarations: */
86 static bool path_simplify (enum url_scheme, char *);
88 /* Support for escaping and unescaping of URL strings. */
90 /* Table of "reserved" and "unsafe" characters. Those terms are
91 rfc1738-speak, as such largely obsoleted by rfc2396 and later
92 specs, but the general idea remains.
94 A reserved character is the one that you can't decode without
95 changing the meaning of the URL. For example, you can't decode
96 "/foo/%2f/bar" into "/foo///bar" because the number and contents of
97 path components is different. Non-reserved characters can be
98 changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". The
99 unsafe characters are loosely based on rfc1738, plus "$" and ",",
100 as recommended by rfc2396, and minus "~", which is very frequently
101 used (and sometimes unrecognized as %7E by broken servers).
103 An unsafe character is the one that should be encoded when URLs are
104 placed in foreign environments. E.g. space and newline are unsafe
105 in HTTP contexts because HTTP uses them as separator and line
106 terminator, so they must be encoded to %20 and %0A respectively.
107 "*" is unsafe in shell context, etc.
109 We determine whether a character is unsafe through static table
110 lookup. This code assumes ASCII character set and 8-bit chars. */
113 /* rfc1738 reserved chars + "$" and ",". */
116 /* rfc1738 unsafe chars, plus non-printables. */
/* Nonzero if the urlchr_table entry for character C has any of the
   bits in MASK set.  C is converted to unsigned char before it is
   used as the table index.  */
120 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
/* Convenience wrappers: test C against the urlchr_reserved mask and
   the urlchr_unsafe mask, respectively.  */
121 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
122 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
124 /* Shorthands for the table: */
125 #define R urlchr_reserved
126 #define U urlchr_unsafe
129 static const unsigned char urlchr_table[256] =
131 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
132 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
133 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
134 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
135 U, 0, U, RU, R, U, R, 0, /* SP ! " # $ % & ' */
136 0, 0, 0, R, R, 0, 0, R, /* ( ) * + , - . / */
137 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
138 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
139 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
140 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
141 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
142 0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
143 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
144 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
145 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
146 0, 0, 0, U, U, U, 0, U, /* x y z { | } ~ DEL */
148 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
149 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
150 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
151 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
153 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
154 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
155 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
156 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
162 /* URL-unescape the string S.
164 This is done by transforming the sequences "%HH" to the character
165 represented by the hexadecimal digits HH. If % is not followed by
166 two hexadecimal digits, it is inserted literally.
168 The transformation is done in place. If you need the original
169 string intact, make a copy before calling this function. */
172 url_unescape (char *s)
174 char *t = s; /* t - tortoise */
175 char *h = s; /* h - hare */
187 /* Do nothing if '%' is not followed by two hex digits. */
188 if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
190 c = X2DIGITS_TO_NUM (h[1], h[2]);
191 /* Don't unescape %00 because there is no way to insert it
192 into a C string without effectively truncating it. */
202 /* The core of url_escape_* functions. Escapes the characters that
203 match the provided mask in urlchr_table.
205 If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
206 returned unchanged. If ALLOW_PASSTHROUGH is false, a freshly
207 allocated string will be returned in all cases. */
210 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
217 for (p1 = s; *p1; p1++)
218 if (urlchr_test (*p1, mask))
219 addition += 2; /* Two more characters (hex digits) */
222 return allow_passthrough ? (char *)s : xstrdup (s);
224 newlen = (p1 - s) + addition;
225 newstr = xmalloc (newlen + 1);
231 /* Quote the characters that match the test mask. */
232 if (urlchr_test (*p1, mask))
234 unsigned char c = *p1++;
236 *p2++ = XNUM_TO_DIGIT (c >> 4);
237 *p2++ = XNUM_TO_DIGIT (c & 0xf);
242 assert (p2 - newstr == newlen);
248 /* URL-escape the unsafe characters (see urlchr_table) in a given
249 string, returning a freshly allocated string. */
252 url_escape (const char *s)
254 return url_escape_1 (s, urlchr_unsafe, false);
257 /* URL-escape the unsafe and reserved characters (see urlchr_table) in
258 a given string, returning a freshly allocated string. */
261 url_escape_unsafe_and_reserved (const char *s)
263 return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false);
266 /* URL-escape the unsafe characters (see urlchr_table) in a given
267 string. If no characters are unsafe, S is returned. */
270 url_escape_allow_passthrough (const char *s)
272 return url_escape_1 (s, urlchr_unsafe, true);
275 /* Decide whether the char at position P needs to be encoded. (It is
276 not enough to pass a single char *P because the function may need
277 to inspect the surrounding context.)
279 Return true if the char should be escaped as %XX, false otherwise. */
282 char_needs_escaping (const char *p)
286 if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2)))
289 /* Garbled %.. sequence: encode `%'. */
292 else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
298 /* Translate a %-escaped (but possibly non-conformant) input string S
299 into a %-escaped (and conformant) output string. If no characters
300 are encoded or decoded, return the same string S; otherwise, return
301 a freshly allocated string with the new contents.
303 After a URL has been run through this function, the protocols that
304 use `%' as the quote character can use the resulting string as-is,
305 while those that don't can use url_unescape to get to the intended
306 data. This function is stable: once the input is transformed,
307 further transformations of the result yield the same output.
309 Let's discuss why this function is needed.
311 Imagine Wget is asked to retrieve `http://abc.xyz/abc def'. Since
312 a raw space character would mess up the HTTP request, it needs to
313 be quoted, like this:
315 GET /abc%20def HTTP/1.0
317 It would appear that the unsafe chars need to be quoted, for
318 example with url_escape. But what if we're requested to download
319 `abc%20def'? url_escape transforms "%" to "%25", which would leave
320 us with `abc%2520def'. This is incorrect -- since %-escapes are
321 part of URL syntax, "%20" is the correct way to denote a literal
322 space on the Wget command line. This leads to the conclusion that
323 in that case Wget should not call url_escape, but leave the `%20'
324 as is. This is clearly contradictory, but it only gets worse.
326 What if the requested URI is `abc%20 def'? If we call url_escape,
327 we end up with `/abc%2520%20def', which is almost certainly not
328 intended. If we don't call url_escape, we are left with the
329 embedded space and cannot complete the request. What the user
330 meant was for Wget to request `/abc%20%20def', and this is where
331 reencode_escapes kicks in.
333 Wget used to solve this by first decoding %-quotes, and then
334 encoding all the "unsafe" characters found in the resulting string.
335 This was wrong because it didn't preserve certain URL special
336 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
337 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
338 whether we considered `+' reserved (it is). One of these results
339 is inevitable because by the second step we would lose information
340 on whether the `+' was originally encoded or not. Both results
341 were wrong because in CGI parameters + means space, while %2B means
342 literal plus. reencode_escapes correctly translates the above to
343 "a%2B+b", i.e. returns the original string.
345 This function uses a modified version of the algorithm originally
346 proposed by Anon Sricharoenchai:
348 * Encode all "unsafe" characters, except those that are also
349 "reserved", to %XX. See urlchr_table for which characters are
352 * Encode the "%" characters not followed by two hex digits to
355 * Pass through all other characters and %XX escapes as-is. (Up to
356 Wget 1.10 this decoded %XX escapes corresponding to "safe"
357 characters, but that was obtrusive and broke some servers.)
361 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
363 "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
367 "foo bar" -> "foo%20bar"
368 "foo%20bar" -> "foo%20bar"
369 "foo %20bar" -> "foo%20%20bar"
370 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
371 "foo%25%20bar" -> "foo%25%20bar"
372 "foo%2%20bar" -> "foo%252%20bar"
373 "foo+bar" -> "foo+bar" (plus is reserved!)
374 "foo%2b+bar" -> "foo%2b+bar" */
377 reencode_escapes (const char *s)
383 int encode_count = 0;
385 /* First pass: inspect the string to see if there's anything to do,
386 and to calculate the new length. */
387 for (p1 = s; *p1; p1++)
388 if (char_needs_escaping (p1))
392 /* The string is good as it is. */
393 return (char *) s; /* C const model sucks. */
396 /* Each encoding adds two characters (hex digits). */
397 newlen = oldlen + 2 * encode_count;
398 newstr = xmalloc (newlen + 1);
400 /* Second pass: copy the string to the destination address, encoding
401 chars when needed. */
406 if (char_needs_escaping (p1))
408 unsigned char c = *p1++;
410 *p2++ = XNUM_TO_DIGIT (c >> 4);
411 *p2++ = XNUM_TO_DIGIT (c & 0xf);
417 assert (p2 - newstr == newlen);
421 /* Returns the scheme type if the scheme is supported, or
422 SCHEME_INVALID if not. */
425 url_scheme (const char *url)
429 for (i = 0; supported_schemes[i].leading_string; i++)
430 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
431 strlen (supported_schemes[i].leading_string)))
433 if (!(supported_schemes[i].flags & scm_disabled))
434 return (enum url_scheme) i;
436 return SCHEME_INVALID;
439 return SCHEME_INVALID;
/* Nonzero if CH is alphanumeric (per c_isalnum), '-' or '+'.
   url_has_scheme uses this to scan the leading scheme name of a URL.
   Note that CH is evaluated more than once, so it must not have side
   effects.  */
442 #define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')
444 /* Return 1 if the URL begins with any "scheme", 0 otherwise. As
445 currently implemented, it returns true if URL begins with
449 url_has_scheme (const char *url)
453 /* The first char must be a scheme char. */
454 if (!*p || !SCHEME_CHAR (*p))
457 /* Followed by 0 or more scheme chars. */
458 while (*p && SCHEME_CHAR (*p))
460 /* Terminated by ':'. */
465 url_valid_scheme (const char *url)
467 enum url_scheme scheme = url_scheme (url);
468 return scheme != SCHEME_INVALID;
472 scheme_default_port (enum url_scheme scheme)
474 return supported_schemes[scheme].default_port;
478 scheme_disable (enum url_scheme scheme)
480 supported_schemes[scheme].flags |= scm_disabled;
483 /* Skip the username and password, if present in the URL. The
484 function should *not* be called with the complete URL, but with the
485 portion after the scheme.
487 If no username and password are found, return URL. */
490 url_skip_credentials (const char *url)
492 /* Look for '@' that comes before terminators, such as '/', '?',
494 const char *p = (const char *)strpbrk (url, "@/?#;");
500 /* Parse credentials contained in [BEG, END). The region is expected
501 to have come from a URL and is unescaped. */
504 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
510 return false; /* empty user name */
512 colon = memchr (beg, ':', end - beg);
514 return false; /* again empty user name */
518 *passwd = strdupdelim (colon + 1, end);
520 url_unescape (*passwd);
527 *user = strdupdelim (beg, userend);
528 url_unescape (*user);
532 /* Used by main.c: detect URLs written using the "shorthand" URL forms
533 originally popularized by Netscape and NcFTP. HTTP shorthands look
536 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
537 www.foo.com[:port] -> http://www.foo.com[:port]
539 FTP shorthands look like this:
541 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
542 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
544 If the URL need not or cannot be rewritten, return NULL. */
547 rewrite_shorthand_url (const char *url)
552 if (url_scheme (url) != SCHEME_INVALID)
555 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
557 p = strpbrk (url, ":/");
561 /* If we're looking at "://", it means the URL uses a scheme we
562 don't support, which may include "https" when compiled without
563 SSL support. Don't bogusly rewrite such URLs. */
564 if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
569 /* Colon indicates ftp, as in foo.bar.com:path. Check for
570 special case of http port number ("localhost:10000"). */
571 int digits = strspn (p + 1, "0123456789");
572 if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
575 /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
576 ret = aprintf ("ftp://%s", url);
577 ret[6 + (p - url)] = '/';
582 /* Just prepend "http://" to URL. */
583 ret = aprintf ("http://%s", url);
588 static void split_path (const char *, char **, char **);
590 /* Like strpbrk, with the exception that it returns the pointer to the
591 terminating zero (end-of-string aka "eos") if no matching character
595 strpbrk_or_eos (const char *s, const char *accept)
597 char *p = strpbrk (s, accept);
599 p = strchr (s, '\0');
603 /* Turn STR into lowercase; return true if a character was actually
607 lowercase_str (char *str)
609 bool changed = false;
611 if (c_isupper (*str))
614 *str = c_tolower (*str);
620 init_seps (enum url_scheme scheme)
622 static char seps[8] = ":/";
624 int flags = supported_schemes[scheme].flags;
626 if (flags & scm_has_params)
628 if (flags & scm_has_query)
630 if (flags & scm_has_fragment)
636 static const char *parse_errors[] = {
637 #define PE_NO_ERROR 0
639 #define PE_UNSUPPORTED_SCHEME 1
640 N_("Unsupported scheme %s"), /* support for format token only here */
641 #define PE_MISSING_SCHEME 2
642 N_("Scheme missing"),
643 #define PE_INVALID_HOST_NAME 3
644 N_("Invalid host name"),
645 #define PE_BAD_PORT_NUMBER 4
646 N_("Bad port number"),
647 #define PE_INVALID_USER_NAME 5
648 N_("Invalid user name"),
649 #define PE_UNTERMINATED_IPV6_ADDRESS 6
650 N_("Unterminated IPv6 numeric address"),
651 #define PE_IPV6_NOT_SUPPORTED 7
652 N_("IPv6 addresses not supported"),
653 #define PE_INVALID_IPV6_ADDRESS 8
654 N_("Invalid IPv6 numeric address")
659 Return a new struct url if successful, NULL on error. In case of
660 error, and if ERROR is not NULL, also set *ERROR to the appropriate
663 url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
667 bool path_modified, host_modified;
669 enum url_scheme scheme;
672 const char *uname_b, *uname_e;
673 const char *host_b, *host_e;
674 const char *path_b, *path_e;
675 const char *params_b, *params_e;
676 const char *query_b, *query_e;
677 const char *fragment_b, *fragment_e;
680 char *user = NULL, *passwd = NULL;
682 const char *url_encoded = NULL;
683 char *new_url = NULL;
687 scheme = url_scheme (url);
688 if (scheme == SCHEME_INVALID)
690 if (url_has_scheme (url))
691 error_code = PE_UNSUPPORTED_SCHEME;
693 error_code = PE_MISSING_SCHEME;
697 if (iri && iri->utf8_encode)
699 iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url);
700 if (!iri->utf8_encode)
703 iri->orig_url = xstrdup (url);
706 /* XXX XXX Could that change introduce (security) bugs ??? XXX XXX*/
708 url_encoded = reencode_escapes (new_url ? new_url : url);
710 url_encoded = new_url ? new_url : url;
714 if (new_url && url_encoded != new_url)
717 p += strlen (supported_schemes[scheme].leading_string);
719 p = url_skip_credentials (p);
722 /* scheme://user:pass@host[:port]... */
725 /* We attempt to break down the URL into the components path,
726 params, query, and fragment. They are ordered like this:
728 scheme://host[:port][/path][;params][?query][#fragment] */
730 path_b = path_e = NULL;
731 params_b = params_e = NULL;
732 query_b = query_e = NULL;
733 fragment_b = fragment_e = NULL;
735 /* Initialize separators for optional parts of URL, depending on the
736 scheme. For example, FTP has params, and HTTP and HTTPS have
737 query string and fragment. */
738 seps = init_seps (scheme);
744 /* Handle IPv6 address inside square brackets. Ideally we'd
745 just look for the terminating ']', but rfc2732 mandates
746 rejecting invalid IPv6 addresses. */
748 /* The address begins after '['. */
750 host_e = strchr (host_b, ']');
754 error_code = PE_UNTERMINATED_IPV6_ADDRESS;
759 /* Check if the IPv6 address is valid. */
760 if (!is_valid_ipv6_address(host_b, host_e))
762 error_code = PE_INVALID_IPV6_ADDRESS;
766 /* Continue parsing after the closing ']'. */
769 error_code = PE_IPV6_NOT_SUPPORTED;
773 /* The closing bracket must be followed by a separator or by the
775 /* http://[::1]... */
777 if (!strchr (seps, *p))
779 /* Trailing garbage after []-delimited IPv6 address. */
780 error_code = PE_INVALID_HOST_NAME;
786 p = strpbrk_or_eos (p, seps);
789 ++seps; /* advance to '/' */
791 if (host_b == host_e)
793 error_code = PE_INVALID_HOST_NAME;
797 port = scheme_default_port (scheme);
800 const char *port_b, *port_e, *pp;
802 /* scheme://host:port/tralala */
806 p = strpbrk_or_eos (p, seps);
809 /* Allow empty port, as per rfc2396. */
810 if (port_b != port_e)
811 for (port = 0, pp = port_b; pp < port_e; pp++)
813 if (!c_isdigit (*pp))
815 /* http://host:12randomgarbage/blah */
817 error_code = PE_BAD_PORT_NUMBER;
820 port = 10 * port + (*pp - '0');
821 /* Check for too large port numbers here, before we have
822 a chance to overflow on bogus port values. */
825 error_code = PE_BAD_PORT_NUMBER;
830 /* Advance to the first separator *after* '/' (either ';' or '?',
831 depending on the scheme). */
834 /* Get the optional parts of URL, each part being delimited by
835 current location and the position of the next separator. */
836 #define GET_URL_PART(sepchar, var) do { \
838 var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps); \
842 GET_URL_PART ('/', path);
843 if (supported_schemes[scheme].flags & scm_has_params)
844 GET_URL_PART (';', params);
845 if (supported_schemes[scheme].flags & scm_has_query)
846 GET_URL_PART ('?', query);
847 if (supported_schemes[scheme].flags & scm_has_fragment)
848 GET_URL_PART ('#', fragment);
853 if (uname_b != uname_e)
855 /* http://user:pass@host */
857 /* uname_b uname_e */
858 if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
860 error_code = PE_INVALID_USER_NAME;
865 u = xnew0 (struct url);
867 u->host = strdupdelim (host_b, host_e);
872 u->path = strdupdelim (path_b, path_e);
873 path_modified = path_simplify (scheme, u->path);
874 split_path (u->path, &u->dir, &u->file);
876 host_modified = lowercase_str (u->host);
878 /* Decode %HH sequences in host name. This is important not so much
879 to support %HH sequences in host names (which other browsers
880 don't), but to support binary characters (which will have been
881 converted to %HH by reencode_escapes). */
882 if (strchr (u->host, '%'))
884 url_unescape (u->host);
885 host_modified = true;
887 /* Apply IDNA regardless of iri->utf8_encode status */
888 if (opt.enable_iri && iri)
890 char *new = idn_encode (iri, u->host);
895 host_modified = true;
901 u->params = strdupdelim (params_b, params_e);
903 u->query = strdupdelim (query_b, query_e);
905 u->fragment = strdupdelim (fragment_b, fragment_e);
907 if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
909 /* If we suspect that a transformation has rendered what
910 url_string might return different from URL_ENCODED, rebuild
911 u->url using url_string. */
912 u->url = url_string (u, URL_AUTH_SHOW);
914 if (url_encoded != url)
915 xfree ((char *) url_encoded);
919 if (url_encoded == url)
920 u->url = xstrdup (url);
922 u->url = (char *) url_encoded;
928 /* Cleanup in case of error: */
929 if (url_encoded && url_encoded != url)
930 xfree ((char *) url_encoded);
932 /* Transmit the error code to the caller, if the caller wants to
939 /* Return the error message string from ERROR_CODE, which should have
940 been retrieved from url_parse. The error message is translated. */
943 url_error (const char *url, int error_code)
945 assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
947 if (error_code == PE_UNSUPPORTED_SCHEME)
950 char *scheme = xstrdup (url);
951 assert (url_has_scheme (url));
953 if ((p = strchr (scheme, ':')))
955 if (!strcasecmp (scheme, "https"))
956 error = aprintf (_("HTTPS support not compiled in"));
958 error = aprintf (_(parse_errors[error_code]), quote (scheme));
964 return xstrdup (_(parse_errors[error_code]));
967 /* Split PATH into DIR and FILE. PATH comes from the URL and is
968 expected to be URL-escaped.
970 The path is split into directory (the part up to the last slash)
971 and file (the part after the last slash), which are subsequently
975 "foo/bar/baz" "foo/bar" "baz"
976 "foo/bar/" "foo/bar" ""
978 "foo/bar/baz%2fqux" "foo/bar" "baz/qux" (!)
980 DIR and FILE are freshly allocated. */
983 split_path (const char *path, char **dir, char **file)
985 char *last_slash = strrchr (path, '/');
989 *file = xstrdup (path);
993 *dir = strdupdelim (path, last_slash);
994 *file = xstrdup (last_slash + 1);
997 url_unescape (*file);
1000 /* Note: URL's "full path" is the path with the query string and
1001 params appended. The "fragment" (#foo) is intentionally ignored,
1002 but that might be changed. For example, if the original URL was
1003 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1004 the full path will be "/foo/bar/baz;bullshit?querystring". */
1006 /* Return the length of the full path, without the terminating
1010 full_path_length (const struct url *url)
1014 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1025 /* Write out the full path. */
1028 full_path_write (const struct url *url, char *where)
1030 #define FROB(el, chr) do { \
1031 char *f_el = url->el; \
1033 int l = strlen (f_el); \
1035 memcpy (where, f_el, l); \
1047 /* Public function for getting the "full path". E.g. if u->path is
1048 "foo/bar" and u->query is "param=value", full_path will be
1049 "/foo/bar?param=value". */
1052 url_full_path (const struct url *url)
1054 int length = full_path_length (url);
1055 char *full_path = xmalloc (length + 1);
1057 full_path_write (url, full_path);
1058 full_path[length] = '\0';
1063 /* Unescape CHR in an otherwise escaped STR. Used for selective
1064 escaping of certain characters, such as "/" and ":". Returns a
1065 count of unescaped chars. */
1068 unescape_single_char (char *str, char chr)
1070 const char c1 = XNUM_TO_DIGIT (chr >> 4);
1071 const char c2 = XNUM_TO_DIGIT (chr & 0xf);
1072 char *h = str; /* hare */
1073 char *t = str; /* tortoise */
1074 for (; *h; h++, t++)
1076 if (h[0] == '%' && h[1] == c1 && h[2] == c2)
1087 /* Escape unsafe and reserved characters, except for the slash
1091 url_escape_dir (const char *dir)
1093 char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1097 unescape_single_char (newdir, '/');
1101 /* Sync u->path and u->url with u->dir and u->file. Called after
1102 u->file or u->dir have been changed, typically by the FTP code. */
1105 sync_path (struct url *u)
1107 char *newpath, *efile, *edir;
1111 /* u->dir and u->file are not escaped. URL-escape them before
1112 reassembling them into u->path. That way, if they contain
1113 separators like '?' or even if u->file contains slashes, the
1114 path will be correctly assembled. (u->file can contain slashes
1115 if the URL specifies it with %2f, or if an FTP server returns
1117 edir = url_escape_dir (u->dir);
1118 efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1121 newpath = xstrdup (efile);
1124 int dirlen = strlen (edir);
1125 int filelen = strlen (efile);
1127 /* Copy "DIR/FILE" to newpath. */
1128 char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1129 memcpy (p, edir, dirlen);
1132 memcpy (p, efile, filelen);
1141 if (efile != u->file)
1144 /* Regenerate u->url as well. */
1146 u->url = url_string (u, URL_AUTH_SHOW);
1149 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1150 This way we can sync u->path and u->url when they get changed. */
1153 url_set_dir (struct url *url, const char *newdir)
1156 url->dir = xstrdup (newdir);
1161 url_set_file (struct url *url, const char *newfile)
1164 url->file = xstrdup (newfile);
1169 url_free (struct url *url)
1175 xfree_null (url->params);
1176 xfree_null (url->query);
1177 xfree_null (url->fragment);
1178 xfree_null (url->user);
1179 xfree_null (url->passwd);
1187 /* Create all the necessary directories for PATH (a file). Calls
1188 make_directory internally. */
1190 mkalldirs (const char *path)
1197 p = path + strlen (path);
1198 for (; *p != '/' && p != path; p--)
1201 /* Don't create if it's just a file. */
1202 if ((p == path) && (*p != '/'))
1204 t = strdupdelim (path, p);
1206 /* Check whether the directory exists. */
1207 if ((stat (t, &st) == 0))
1209 if (S_ISDIR (st.st_mode))
1216 /* If the dir exists as a file name, remove it first. This
1217 is *only* for Wget to work with buggy old CERN http
1218 servers. Here is the scenario: When Wget tries to
1219 retrieve a directory without a slash, e.g.
1220 http://foo/bar (bar being a directory), CERN server will
1221 not redirect it to http://foo/bar/ -- it will generate a
1222 directory listing containing links to bar/file1,
1223 bar/file2, etc. Wget will lose because it saves this
1224 HTML listing to a file `bar', so it cannot create the
1225 directory. To work around this, if the file of the same
1226 name exists, we just remove it and create the directory
1228 DEBUGP (("Removing %s because of directory danger!\n", t));
1232 res = make_directory (t);
1234 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1239 /* Functions for constructing the file name out of URL components. */
1241 /* A growable string structure, used by url_file_name and friends.
1242 This should perhaps be moved to utils.c.
1244 The idea is to have a convenient and efficient way to construct a
1245 string by having various functions append data to it. Instead of
1246 passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1247 functions in question, we pass the pointer to this struct. */
1255 /* Ensure that the string can accept APPEND_SIZE more characters past
1256 the current TAIL position. If necessary, this will grow the string
1257 and update its allocated size. If the string is already large
1258 enough to take TAIL+APPEND_SIZE characters, this does nothing. */
1259 #define GROW(g, append_size) do { \
1260 struct growable *G_ = g; \
1261 DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \
1264 /* Return the tail position of the string. */
1265 #define TAIL(r) ((r)->base + (r)->tail)
1267 /* Move the tail position by APPEND_COUNT characters. */
1268 #define TAIL_INCR(r, append_count) ((r)->tail += append_count)
1270 /* Append the string STR to DEST. NOTICE: the string in DEST is not
1274 append_string (const char *str, struct growable *dest)
1276 int l = strlen (str);
1278 memcpy (TAIL (dest), str, l);
1279 TAIL_INCR (dest, l);
1282 /* Append CH to DEST. For example, append_char (0, DEST)
1283 zero-terminates DEST. */
1286 append_char (char ch, struct growable *dest)
1290 TAIL_INCR (dest, 1);
1294 filechr_not_unix = 1, /* unusable on Unix, / and \0 */
1295 filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */
1296 filechr_control = 4 /* a control character, e.g. 0-31 */
/* Nonzero if either opt.restrict_files_nonascii is set and C is not
   an ASCII character, or the filechr_table entry for C has any of the
   bits in MASK set.  C is converted to unsigned char before it is
   tested or used as the table index.  */
1299 #define FILE_CHAR_TEST(c, mask) \
1300 ((opt.restrict_files_nonascii && !c_isascii ((unsigned char)(c))) || \
1301 (filechr_table[(unsigned char)(c)] & (mask)))
1303 /* Shorthands for the table: */
1304 #define U filechr_not_unix
1305 #define W filechr_not_windows
1306 #define C filechr_control
1311 /* Table of characters unsafe under various conditions (see above).
1313 Arguably we could also claim `%' to be unsafe, since we use it as
1314 the escape character. If we ever want to be able to reliably
1315 translate file name back to URL, this would become
1316 crucial. Right now, it's better to be minimal in escaping. */
/* Indexed by the character converted to unsigned char; each entry is
   0 or an OR of the filechr_* bits, written with the shorthands.  */
1318 static const unsigned char filechr_table[256] =
1320 UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */
1321 C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */
1322 C, C, C, C, C, C, C, C, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
1323 C, C, C, C, C, C, C, C, /* CAN EM SUB ESC FS GS RS US */
1324 0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */
1325 0, 0, W, 0, 0, 0, 0, UW, /* ( ) * + , - . / */
1326 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
1327 0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? */
1328 0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
1329 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
1330 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
1331 0, 0, 0, 0, W, 0, 0, 0, /* X Y Z [ \ ] ^ _ */
1332 0, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
1333 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
1334 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
1335 0, 0, 0, 0, W, 0, 0, C, /* x y z { | } ~ DEL */
1337 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */
1338 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */
1339 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160-175 */
1340 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176-191 */
1342 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192-207 */
1343 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 208-223 */
1344 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 224-239 */
1345 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240-255 */
1353 /* FN_PORT_SEP is the separator between host and port in file names
1354 for non-standard port numbers. It is normally ':', as in
1355 "www.xemacs.org:4001/index.html". When opt.restrict_files_os is
1356 restrict_windows it is '+' instead, because Windows can't handle
   ':' in file names. The choice follows the option value, so it is
   made each time the macro is expanded. */
1357 #define FN_PORT_SEP (opt.restrict_files_os != restrict_windows ? ':' : '+')
1359 /* FN_QUERY_SEP is the separator between the file name and the URL
1360 query, normally '?'. Since Windows cannot handle '?' as part of
1361 file name, '@' is used instead when opt.restrict_files_os is
   restrict_windows. */
1362 #define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
1364 /* Quote path element, characters in [b, e), as file name, and append
1365 the quoted string to DEST. Each character is tested with
1366 FILE_CHAR_TEST against filechr_table and quoted if it is unsafe.
1368 If ESCAPED is true, the path element is considered to be
1369 URL-escaped and will be unescaped prior to inspection. */
1372 append_uri_pathel (const char *b, const char *e, bool escaped,
1373 struct growable *dest)
/* Choose the filechr_table bits that make a character unsafe: the
   Unix or the Windows set depending on opt.restrict_files_os, plus
   the control-character bit when opt.restrict_files_ctrl is set. */
1379 if (opt.restrict_files_os == restrict_unix)
1380 mask = filechr_not_unix;
1382 mask = filechr_not_windows;
1383 if (opt.restrict_files_ctrl)
1384 mask |= filechr_control;
1386 /* Copy [b, e) to UNESCAPED and URL-unescape it; E is then moved to
   the end of the unescaped copy. */
1390 BOUNDED_TO_ALLOCA (b, e, unescaped);
1391 url_unescape (unescaped);
1393 e = unescaped + strlen (unescaped);
1396 /* Defang ".." when found as component of path. Remember that path
1397 comes from the URL and might contain malicious input. */
1398 if (e - b == 2 && b[0] == '.' && b[1] == '.')
1404 /* Walk the PATHEL string and check how many characters we'll need to quote.
1407 for (p = b; p < e; p++)
1408 if (FILE_CHAR_TEST (*p, mask))
1411 /* Calculate the length of the output string. e-b is the input
1412 string length. Each quoted char introduces two additional
1413 characters in the string, hence 2*quoted. */
1414 outlen = (e - b) + (2 * quoted);
1415 GROW (dest, outlen);
1419 /* If there's nothing to quote, we can simply append the string
1420 without processing it again. */
1421 memcpy (TAIL (dest), b, outlen);
1425 char *q = TAIL (dest);
1426 for (p = b; p < e; p++)
1428 if (!FILE_CHAR_TEST (*p, mask))
/* Unsafe character: write its value as two hex digits, high nibble
   first. */
1432 unsigned char ch = *p;
1434 *q++ = XNUM_TO_DIGIT (ch >> 4);
1435 *q++ = XNUM_TO_DIGIT (ch & 0xf);
/* The number of bytes written has to match the precomputed OUTLEN. */
1438 assert (q - TAIL (dest) == outlen);
1441 /* Perform inline case transformation if required. */
1442 if (opt.restrict_files_case == restrict_lowercase
1443 || opt.restrict_files_case == restrict_uppercase)
/* Only the OUTLEN bytes appended by this call are converted. */
1446 for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1448 if (opt.restrict_files_case == restrict_lowercase)
1449 *q = c_tolower (*q);
1451 *q = c_toupper (*q);
1455 TAIL_INCR (dest, outlen);
1458 /* Append to DEST the directory structure that corresponds to the
1459 directory part of URL's path. For example, if the URL is
1460 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1462 Each path element ("dir1" and "dir2" in the above example) is
1463 examined, url-unescaped, and re-escaped as file name element.
1465 Additionally, it cuts as many directories from the path as
1466 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1467 will produce "/dir2" for the above example. For 2 or more, it will
1468 produce "".
1470 Each component of the path is quoted for use as file name. */
1473 append_dir_structure (const struct url *u, struct growable *dest)
1475 char *pathel, *next;
1476 int cut = opt.cut_dirs; /* from opt.cut_dirs: leading directories to cut */
1478 /* Go through the path components, de-URL-quote them, and quote them
1479 (if necessary) as file names. */
/* Each iteration handles the element [pathel, next), which ends at
   the next slash; the text after the last slash is never visited
   here. */
1482 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1487 /* Ignore empty pathels. */
1491 append_char ('/', dest);
/* ESCAPED is true: the element is still URL-escaped at this point. */
1492 append_uri_pathel (pathel, next, true, dest);
1496 /* Return a unique file name that matches the given URL as well as
1497 possible. Does not create directories on the file system. */
1500 url_file_name (const struct url *u, char *replaced_filename)
1502 struct growable fnres; /* stands for "file name result" */
1504 const char *u_file, *u_query;
1505 char *fname, *unique;
1506 char *index_filename = "index.html"; /* Name used when the URL has an empty file part */
1512 /* If an alternative index file was defined (opt.default_page), use
   it as index_filename */
1513 if (opt.default_page)
1514 index_filename = opt.default_page;
1517 /* Start with the directory prefix, if specified. */
1519 append_string (opt.dir_prefix, &fnres);
1521 /* If "dirstruct" is turned on (typically the case with -r), add
1522 the host and port (unless those have been turned off) and
1523 directory structure. */
1526 if (opt.protocol_directories)
1529 append_char ('/', &fnres);
1530 append_string (supported_schemes[u->scheme].name, &fnres);
1532 if (opt.add_hostdir)
1535 append_char ('/', &fnres);
1536 if (0 != strcmp (u->host, ".."))
1537 append_string (u->host, &fnres);
1539 /* Host name can come from the network; malicious DNS may
1540 allow ".." to be resolved, causing us to write to
1541 "../<file>". Defang such host names. */
1542 append_string ("%2E%2E", &fnres);
/* The port is added only when it differs from the default port of the
   scheme. */
1543 if (u->port != scheme_default_port (u->scheme))
1546 number_to_string (portstr, u->port);
1547 append_char (FN_PORT_SEP, &fnres);
1548 append_string (portstr, &fnres);
1552 append_dir_structure (u, &fnres);
/* Without REPLACED_FILENAME the name is built from the file part of
   the URL and its query. */
1555 if (!replaced_filename)
1557 /* Add the file name. */
1559 append_char ('/', &fnres);
/* An empty u->file falls back to INDEX_FILENAME. */
1560 u_file = *u->file ? u->file : index_filename;
1561 append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1563 /* Append "?query" to the file name. */
1564 u_query = u->query && *u->query ? u->query : NULL;
1567 append_char (FN_QUERY_SEP, &fnres);
1568 append_uri_pathel (u_query, u_query + strlen (u_query),
/* REPLACED_FILENAME was supplied: it is quoted and appended as the
   file name. */
1575 append_char ('/', &fnres);
1576 u_file = replaced_filename;
1577 append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1580 /* Zero-terminate the file name. */
1581 append_char ('\0', &fnres);
1585 /* Check the cases in which the unique extensions are not used:
1586 1) Clobbering is turned off (-nc).
1587 2) Retrieval with regetting.
1588 3) Timestamping is used.
1589 4) Hierarchy is built.
1591 The exception is the case when file does exist and is a
1592 directory (see `mkalldirs' for explanation). */
1594 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1595 && !(file_exists_p (fname) && !file_non_directory_p (fname)))
/* unique_name can hand back FNAME itself, hence the pointer
   comparison that follows. */
1601 unique = unique_name (fname, true);
1602 if (unique != fname)
1606 /* On VMS, alter the name as required. */
1611 unique2 = ods_conform( unique);
1612 if (unique2 != unique)
1618 #endif /* def __VMS */
1623 /* Resolve "." and ".." elements of PATH by destructively modifying
1624 PATH and return true if PATH has been modified, false otherwise.
1626 The algorithm is in spirit similar to the one described in rfc1808,
1627 although implemented differently, in one pass. To recap, path
1628 elements containing only "." are removed, and ".." is taken to mean
1629 "back up one element". Single leading and trailing slashes are
1630 preserved.
1632 For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
1633 test examples are provided below. If you change anything in this
1634 function, run test_path_simplify to make sure you haven't broken a
1635 test case. */
1638 path_simplify (enum url_scheme scheme, char *path)
1640 char *h = path; /* hare: read position scanning the input */
1641 char *t = path; /* tortoise: write position of the simplified path */
1643 char *end = strchr (path, '\0'); /* the terminating NUL of PATH */
1647 /* Hare should be at the beginning of a path element. */
/* A "." element, terminated by a slash or by the end of the string. */
1649 if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
/* A ".." element, terminated the same way. */
1654 else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
1656 /* Handle "../" by retreating the tortoise by one path
1657 element -- but not past the beginning. */
1660 /* Move backwards until T hits the beginning of the
1661 previous path element or the beginning of path. */
1662 for (--t; t > beg && t[-1] != '/'; t--)
1665 else if (scheme == SCHEME_FTP)
1667 /* If we're at the beginning, copy the "../" literally
1668 and move the beginning so a later ".." doesn't remove
1669 it. This violates RFC 3986; but we do it for FTP
1670 anyway because there is otherwise no way to get at a
1671 parent directory, when the FTP server drops us in a
1672 non-root directory (which is not uncommon). */
1681 /* A regular path element. If H hasn't advanced past T,
1682 simply skip to the next path element. Otherwise, copy
1683 the path element until the next slash. */
1686 /* Skip the path element, including the slash. */
1687 while (h < end && *h != '/')
1694 /* Copy the path element, including the final slash. */
1695 while (h < end && *h != '/')
1709 /* Return a pointer to the end of URL's path. Path is considered to be
1710 terminated by one or more of the ?query or ;params or #fragment,
1711 depending on the scheme. The result is a position inside URL, not
   a length; judging by the name of strpbrk_or_eos, it is the end of
   the string when no terminator is present. */
1714 path_end (const char *url)
1716 enum url_scheme scheme = url_scheme (url);
1718 if (scheme == SCHEME_INVALID)
1719 scheme = SCHEME_HTTP; /* use http semantics for rel links */
1720 /* +2 to ignore the first two separators ':' and '/' */
1721 seps = init_seps (scheme) + 2;
1722 return strpbrk_or_eos (url, seps);
1725 /* Find the last occurrence of character C in the range [b, e), or
1726 NULL, if none are present. Implemented with memrchr over (e) - (b)
   bytes; B is expanded twice, so it should be free of side effects. */
1727 #define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))
1729 /* Merge BASE with LINK and return the resulting URI. The result is
   newly allocated (with xstrdup or xmalloc).
1731 Either of the URIs may be absolute or relative, complete with the
1732 host name, or path only. This tries to reasonably handle all
1733 foreseeable cases. It only employs minimal URL parsing, without
1734 knowledge of the specifics of schemes.
1736 I briefly considered making this function call path_simplify after
1737 the merging process, as rfc1738 seems to suggest. This is a bad
1738 idea for several reasons: 1) it complexifies the code, and 2)
1739 url_parse has to simplify path anyway, so it's wasteful to boot. */
1742 uri_merge (const char *base, const char *link)
/* A LINK that carries its own scheme is already absolute. */
1748 if (url_has_scheme (link))
1749 return xstrdup (link);
1751 /* We may not examine BASE past END. */
1752 end = path_end (base);
1753 linklength = strlen (link);
1757 /* Empty LINK points back to BASE, query string and all. */
1758 return xstrdup (base);
1760 else if (*link == '?')
1762 /* LINK points to the same location, but changes the query
1763 string. Examples: */
1764 /* uri_merge("path", "?new") -> "path?new" */
1765 /* uri_merge("path?foo", "?new") -> "path?new" */
1766 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1767 /* uri_merge("path#foo", "?new") -> "path?new" */
1768 int baselength = end - base;
1769 merge = xmalloc (baselength + linklength + 1);
1770 memcpy (merge, base, baselength);
1771 memcpy (merge + baselength, link, linklength);
1772 merge[baselength + linklength] = '\0';
1774 else if (*link == '#')
1776 /* uri_merge("path", "#new") -> "path#new" */
1777 /* uri_merge("path#foo", "#new") -> "path#new" */
1778 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1779 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
/* Unlike the other branches, this one cuts BASE at its first '#' (or
   at its end) rather than at END, so an existing query is kept. */
1781 const char *end1 = strchr (base, '#');
1783 end1 = base + strlen (base);
1784 baselength = end1 - base;
1785 merge = xmalloc (baselength + linklength + 1);
1786 memcpy (merge, base, baselength);
1787 memcpy (merge + baselength, link, linklength);
1788 merge[baselength + linklength] = '\0';
1790 else if (*link == '/' && *(link + 1) == '/')
1792 /* LINK begins with "//" and so is a net path: we need to
1793 replace everything after (and including) the double slash with LINK.
1796 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1797 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1798 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1802 const char *start_insert;
1804 /* Look for first slash. */
1805 slash = memchr (base, '/', end - base);
1806 /* If found slash and it is a double slash, then replace
1807 from this point, else default to replacing from the beginning.
1809 if (slash && *(slash + 1) == '/')
1810 start_insert = slash;
1812 start_insert = base;
1814 span = start_insert - base;
1815 merge = xmalloc (span + linklength + 1);
1817 memcpy (merge, base, span);
1818 memcpy (merge + span, link, linklength);
1819 merge[span + linklength] = '\0';
1821 else if (*link == '/')
1823 /* LINK is an absolute path: we need to replace everything
1824 after (and including) the FIRST slash with LINK.
1826 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1827 "/qux/xyzzy", our result should be
1828 "http://host/qux/xyzzy". */
1831 const char *start_insert = NULL; /* for gcc to shut up. */
1832 const char *pos = base;
1833 bool seen_slash_slash = false;
1834 /* We're looking for the first slash, but want to ignore double slash.
1837 slash = memchr (pos, '/', end - pos);
1838 if (slash && !seen_slash_slash)
1839 if (*(slash + 1) == '/')
1842 seen_slash_slash = true;
1846 /* At this point, SLASH is the location of the first / after
1847 "//", or the first slash altogether. START_INSERT is the
1848 pointer to the location where LINK will be inserted. When
1849 examining the last two examples, keep in mind that LINK begins with '/'.
1852 if (!slash && !seen_slash_slash)
1853 /* example: "foo" */
1855 start_insert = base;
1856 else if (!slash && seen_slash_slash)
1857 /* example: "http://foo" */
1860 else if (slash && !seen_slash_slash)
1861 /* example: "foo/bar" */
1863 start_insert = base;
1864 else if (slash && seen_slash_slash)
1865 /* example: "http://something/" */
1867 start_insert = slash;
1869 span = start_insert - base;
1870 merge = xmalloc (span + linklength + 1);
1872 memcpy (merge, base, span);
1873 memcpy (merge + span, link, linklength);
1874 merge[span + linklength] = '\0';
1878 /* LINK is a relative URL: we need to replace everything
1879 after last slash (possibly empty) with LINK.
1881 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1882 our result should be "whatever/foo/qux/xyzzy". */
1883 bool need_explicit_slash = false;
1885 const char *start_insert;
1886 const char *last_slash = find_last_char (base, end, '/');
1889 /* No slash found at all. Replace what we have with LINK. */
1890 start_insert = base;
1892 else if (last_slash && last_slash >= base + 2
1893 && last_slash[-2] == ':' && last_slash[-1] == '/')
1895 /* example: "http://host" */
1897 start_insert = end + 1;
1898 need_explicit_slash = true;
1902 /* example: "whatever/foo/bar" */
1904 start_insert = last_slash + 1;
1907 span = start_insert - base;
1908 merge = xmalloc (span + linklength + 1);
1910 memcpy (merge, base, span);
/* With need_explicit_slash, START_INSERT was set to END + 1, so SPAN
   covers one byte beyond the path; that byte is overwritten with the
   separating slash here. */
1911 if (need_explicit_slash)
1912 merge[span - 1] = '/';
1913 memcpy (merge + span, link, linklength);
1914 merge[span + linklength] = '\0';
/* Copy the string S to the position P with memcpy; LEN is strlen (S).
   NOTE(review): url_string uses P as a running output cursor, so the
   macro presumably also advances P by LEN -- confirm against the
   complete definition.  */
1920 #define APPEND(p, s) do { \
1921 int len = strlen (s); \
1922 memcpy (p, s, len); \
1926 /* Use this instead of password when the actual password is supposed
1927 to be hidden. We intentionally use a generic string without giving
1928 away the number of characters in the password, like previous
1929 versions did. */
1930 #define HIDDEN_PASSWORD "*password*"
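1932 /* Recreate the URL string from the data in URL.
1934 AUTH_MODE selects how the user name and password are treated. With
1935 URL_AUTH_HIDE neither of them is quoted for output; with
1936 URL_AUTH_HIDE_PASSWD the password is replaced by HIDDEN_PASSWORD
1937 (meant for URLs we plan to print); otherwise the actual password is
   used, as needed when canonicalizing a URL for use within the
   program. Unsafe characters in the URL will be quoted. */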
1940 url_string (const struct url *url, enum url_auth_mode auth_mode)
1944 char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1946 int scheme_port = supported_schemes[url->scheme].default_port;
1947 const char *scheme_str = supported_schemes[url->scheme].leading_string;
1948 int fplen = full_path_length (url);
1950 bool brackets_around_host;
1952 assert (scheme_str != NULL);
1954 /* Make sure the user name and password are quoted. */
1957 if (auth_mode != URL_AUTH_HIDE)
1959 quoted_user = url_escape_allow_passthrough (url->user);
1962 if (auth_mode == URL_AUTH_HIDE_PASSWD)
1963 quoted_passwd = HIDDEN_PASSWORD;
1965 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1970 /* In the unlikely event that the host name contains non-printable
1971 characters, quote it for displaying to the user. */
1972 quoted_host = url_escape_allow_passthrough (url->host);
1974 /* Undo the quoting of colons that URL escaping performs. IPv6
1975 addresses may legally contain colons, and in that case must be
1976 placed in square brackets. */
1977 if (quoted_host != url->host)
1978 unescape_single_char (quoted_host, ':');
1979 brackets_around_host = strchr (quoted_host, ':') != NULL;
/* First pass: add up the number of bytes the result needs. */
1981 size = (strlen (scheme_str)
1982 + strlen (quoted_host)
1983 + (brackets_around_host ? 2 : 0)
1986 if (url->port != scheme_port)
1987 size += 1 + numdigit (url->port);
1990 size += 1 + strlen (quoted_user);
1992 size += 1 + strlen (quoted_passwd);
/* Second pass: write the pieces into the freshly allocated buffer. */
1995 p = result = xmalloc (size);
1997 APPEND (p, scheme_str);
2000 APPEND (p, quoted_user);
2004 APPEND (p, quoted_passwd);
2009 if (brackets_around_host)
2011 APPEND (p, quoted_host);
2012 if (brackets_around_host)
2014 if (url->port != scheme_port)
2017 p = number_to_string (p, url->port);
2020 full_path_write (url, p);
/* The bytes written have to match the size computed above. */
2024 assert (p - result == size);
/* Free only strings that are distinct from the fields of URL; the
   password is freed only under URL_AUTH_SHOW, because under
   URL_AUTH_HIDE_PASSWD it points at the HIDDEN_PASSWORD literal. */
2026 if (quoted_user && quoted_user != url->user)
2027 xfree (quoted_user);
2028 if (quoted_passwd && auth_mode == URL_AUTH_SHOW
2029 && quoted_passwd != url->passwd)
2030 xfree (quoted_passwd);
2031 if (quoted_host != url->host)
2032 xfree (quoted_host);
2037 /* Return true if scheme a is similar to scheme b.
2039 Schemes are similar if they are equal. If SSL is supported, schemes
2040 are also similar if one is http (SCHEME_HTTP) and the other is https
2041 (SCHEME_HTTPS). */
2043 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
/* http and https are matched against each other in either order. */
2048 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2049 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
/* Read one character from the string STR, which may contain escape
   sequences, and store it in *C.  STR must be non-NULL and non-empty
   (asserted).  For an escape sequence the two characters at p[1] and
   p[2] must be hex digits; they are combined with X2DIGITS_TO_NUM and
   the decoded value is then checked with URL_RESERVED_CHAR.  Returns 0
   for an invalid string; are_urls_equal treats any nonzero return as
   success.  NOTE(review): the nonzero value is presumably the number
   of bytes consumed from STR -- confirm.  */
2056 getchar_from_escaped_string (const char *str, char *c)
2058 const char *p = str;
2060 assert (str && *str);
2065 if (!c_isxdigit(p[1]) || !c_isxdigit(p[2]))
2073 return 0; /* error: invalid string */
2075 *c = X2DIGITS_TO_NUM (p[1], p[2]);
2076 if (URL_RESERVED_CHAR(*c))
/* Return true if the URL strings U1 and U2 are considered equal.
   Characters are fetched from both strings with
   getchar_from_escaped_string and compared after c_tolower, i.e.
   without regard to case; the result is true only if both strings
   are exhausted at the same time.  */
2094 are_urls_equal (const char *u1, const char *u2)
2105 && (pp = getchar_from_escaped_string (p, &ch1))
2106 && (qq = getchar_from_escaped_string (q, &ch2))
2107 && (c_tolower(ch1) == c_tolower(ch2)))
2113 return (*p == 0 && *q == 0 ? true : false);
2117 /* Debugging and testing support for path_simplify. */
2120 /* Debug: run path_simplify on PATH and return the result in a new
2121 string. Useful for calling from the debugger.
   NOTE(review): path_simplify is defined in this file with two
   parameters (scheme, path) and run_test calls it that way; the
   single-argument call below does not match that signature. */
2125 char *copy = xstrdup (path);
2126 path_simplify (copy);
/* Run path_simplify under SCHEME on a copy of TEST and check two
   things: that the simplified string equals EXPECTED_RESULT, and that
   the returned modified flag equals EXPECTED_CHANGE.  A diagnostic is
   printed for each mismatch.  test_path_simplify treats a non-NULL
   return value as a failure message.  */
2132 run_test (char *test, char *expected_result, enum url_scheme scheme,
2133 bool expected_change)
2135 char *test_copy = xstrdup (test);
2136 bool modified = path_simplify (scheme, test_copy);
2138 if (0 != strcmp (test_copy, expected_result))
2140 printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2141 test, expected_result, test_copy);
2144 if (modified != expected_change)
2146 if (expected_change)
2147 printf ("Expected modification with path_simplify(\"%s\").\n",
2150 printf ("Expected no modification with path_simplify(\"%s\").\n",
2154 mu_assert ("", modified == expected_change);
/* Table-driven test of path_simplify: every row is passed to run_test,
   and the first non-NULL message it returns is returned to the
   caller.  */
2159 test_path_simplify (void)
2162 char *test, *result;
2163 enum url_scheme scheme;
/* Row layout: input path, expected simplified path, scheme, and
   whether path_simplify is expected to report a modification. */
2166 { "", "", SCHEME_HTTP, false },
2167 { ".", "", SCHEME_HTTP, true },
2168 { "./", "", SCHEME_HTTP, true },
2169 { "..", "", SCHEME_HTTP, true },
2170 { "../", "", SCHEME_HTTP, true },
/* Under SCHEME_FTP a leading ".." is expected to be kept. */
2171 { "..", "..", SCHEME_FTP, false },
2172 { "../", "../", SCHEME_FTP, false },
2173 { "foo", "foo", SCHEME_HTTP, false },
2174 { "foo/bar", "foo/bar", SCHEME_HTTP, false },
2175 { "foo///bar", "foo///bar", SCHEME_HTTP, false },
2176 { "foo/.", "foo/", SCHEME_HTTP, true },
2177 { "foo/./", "foo/", SCHEME_HTTP, true },
2178 { "foo./", "foo./", SCHEME_HTTP, false },
2179 { "foo/../bar", "bar", SCHEME_HTTP, true },
2180 { "foo/../bar/", "bar/", SCHEME_HTTP, true },
2181 { "foo/bar/..", "foo/", SCHEME_HTTP, true },
2182 { "foo/bar/../x", "foo/x", SCHEME_HTTP, true },
2183 { "foo/bar/../x/", "foo/x/", SCHEME_HTTP, true },
2184 { "foo/..", "", SCHEME_HTTP, true },
2185 { "foo/../..", "", SCHEME_HTTP, true },
2186 { "foo/../../..", "", SCHEME_HTTP, true },
2187 { "foo/../../bar/../../baz", "baz", SCHEME_HTTP, true },
/* The same inputs under SCHEME_FTP keep the surplus ".." elements. */
2188 { "foo/../..", "..", SCHEME_FTP, true },
2189 { "foo/../../..", "../..", SCHEME_FTP, true },
2190 { "foo/../../bar/../../baz", "../../baz", SCHEME_FTP, true },
2191 { "a/b/../../c", "c", SCHEME_HTTP, true },
2192 { "./a/../b", "b", SCHEME_HTTP, true }
2196 for (i = 0; i < countof (tests); i++)
2198 const char *message;
2199 char *test = tests[i].test;
2200 char *expected_result = tests[i].result;
2201 enum url_scheme scheme = tests[i].scheme;
2202 bool expected_change = tests[i].should_modify;
2203 message = run_test (test, expected_result, scheme, expected_change);
2204 if (message) return message;
/* Test of append_uri_pathel: for every row, a growable is seeded with
   original_url, the input is appended through append_uri_pathel with
   the escaped flag of the row, and the NUL-terminated result is
   compared with expected_result.  */
2210 test_append_uri_pathel()
2217 char *expected_result;
2219 { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2222 for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2224 struct growable dest;
2225 const char *p = test_array[i].input;
/* Start every case from a zeroed growable. */
2227 memset (&dest, 0, sizeof (dest));
2229 append_string (test_array[i].original_url, &dest);
2230 append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2231 append_char ('\0', &dest);
2233 mu_assert ("test_append_uri_pathel: wrong result",
2234 strcmp (dest.base, test_array[i].expected_result) == 0);
/* Table-driven test of are_urls_equal: every row holds two URLs and
   the expected comparison result.  */
2241 test_are_urls_equal()
2247 bool expected_result;
2249 { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/", true },
2250 { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
2251 { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/", false },
/* An escaped tilde is expected to compare equal to a literal one. */
2252 { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/", true },
2253 { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/", false },
/* An escaped slash is expected to differ from a literal one. */
2254 { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/", false },
2257 for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2259 mu_assert ("test_are_urls_equal: wrong result",
2260 are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result);
2266 #endif /* TESTING */