2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
39 #include <sys/types.h>
49 #include "host.h" /* for is_valid_ipv6_address */
58 const char *leading_string;
63 /* Supported schemes: */
64 static struct scheme_data supported_schemes[] =
66 { "http", "http://", DEFAULT_HTTP_PORT, 1 },
68 { "https", "https://", DEFAULT_HTTPS_PORT, 1 },
70 { "ftp", "ftp://", DEFAULT_FTP_PORT, 1 },
76 /* Forward declarations: */
78 static int path_simplify PARAMS ((char *));
80 /* Support for escaping and unescaping of URL strings. */
82 /* Table of "reserved" and "unsafe" characters. Those terms are
83 rfc1738-speak, as such largely obsoleted by rfc2396 and later
84 specs, but the general idea remains.
86 A reserved character is the one that you can't decode without
87 changing the meaning of the URL. For example, you can't decode
88 "/foo/%2f/bar" into "/foo///bar" because the number and contents of
89 path components is different. Non-reserved characters can be
90 changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". The
91 unsafe characters are loosely based on rfc1738, plus "$" and ",",
92 as recommended by rfc2396, and minus "~", which is very frequently
93 used (and sometimes unrecognized as %7E by broken servers).
95 An unsafe character is the one that should be encoded when URLs are
96 placed in foreign environments. E.g. space and newline are unsafe
97 in HTTP contexts because HTTP uses them as separator and line
98 terminator, so they must be encoded to %20 and %0A respectively.
99 "*" is unsafe in shell context, etc.
101 We determine whether a character is unsafe through static table
102 lookup. This code assumes ASCII character set and 8-bit chars. */
105 /* rfc1738 reserved chars + "$" and ",". */
108 /* rfc1738 unsafe chars, plus non-printables. */
112 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
113 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
114 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
116 /* Shorthands for the table: */
117 #define R urlchr_reserved
118 #define U urlchr_unsafe
121 static const unsigned char urlchr_table[256] =
123 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
124 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
125 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
126 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
127 U, 0, U, RU, R, U, R, 0, /* SP ! " # $ % & ' */
128 0, 0, 0, R, R, 0, 0, R, /* ( ) * + , - . / */
129 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
130 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
131 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
132 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
133 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
134 0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
135 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
136 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
137 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
138 0, 0, 0, U, U, U, 0, U, /* x y z { | } ~ DEL */
140 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
141 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
142 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
143 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
145 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
146 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
147 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
148 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
154 /* URL-unescape the string S.
156 This is done by transforming the sequences "%HH" to the character
157 represented by the hexadecimal digits HH. If % is not followed by
158 two hexadecimal digits, it is inserted literally.
160 The transformation is done in place. If you need the original
161 string intact, make a copy before calling this function. */
164 url_unescape (char *s)
166 char *t = s; /* t - tortoise */
167 char *h = s; /* h - hare */
179 /* Do nothing if '%' is not followed by two hex digits. */
180 if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
182 c = X2DIGITS_TO_NUM (h[1], h[2]);
183 /* Don't unescape %00 because there is no way to insert it
184 into a C string without effectively truncating it. */
194 /* The core of url_escape_* functions. Escapes the characters that
195 match the provided mask in urlchr_table.
197 If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
198 will be returned unchanged. If ALLOW_PASSTHROUGH is zero, a
199 freshly allocated string will be returned in all cases. */
202 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
209 for (p1 = s; *p1; p1++)
210 if (urlchr_test (*p1, mask))
211 addition += 2; /* Two more characters (hex digits) */
214 return allow_passthrough ? (char *)s : xstrdup (s);
216 newlen = (p1 - s) + addition;
217 newstr = (char *)xmalloc (newlen + 1);
223 /* Quote the characters that match the test mask. */
224 if (urlchr_test (*p1, mask))
226 unsigned char c = *p1++;
228 *p2++ = XNUM_TO_DIGIT (c >> 4);
229 *p2++ = XNUM_TO_DIGIT (c & 0xf);
234 assert (p2 - newstr == newlen);
240 /* URL-escape the unsafe characters (see urlchr_table) in a given
241 string, returning a freshly allocated string. */
244 url_escape (const char *s)
246 return url_escape_1 (s, urlchr_unsafe, 0);
249 /* URL-escape the unsafe characters (see urlchr_table) in a given
250 string. If no characters are unsafe, S is returned. */
253 url_escape_allow_passthrough (const char *s)
255 return url_escape_1 (s, urlchr_unsafe, 1);
258 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
260 /* Decide whether to encode, decode, or pass through the char at P.
261 This used to be a macro, but it got a little too convoluted. */
262 static inline enum copy_method
263 decide_copy_method (const char *p)
267 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
269 /* %xx sequence: decode it, unless it would decode to an
270 unsafe or a reserved char; in that case, leave it as
272 char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
273 if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
274 return CM_PASSTHROUGH;
279 /* Garbled %.. sequence: encode `%'. */
282 else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
285 return CM_PASSTHROUGH;
288 /* Translate a %-escaped (but possibly non-conformant) input string S
289 into a %-escaped (and conformant) output string. If no characters
290 are encoded or decoded, return the same string S; otherwise, return
291 a freshly allocated string with the new contents.
293 After a URL has been run through this function, the protocols that
294 use `%' as the quote character can use the resulting string as-is,
295 while those that don't call url_unescape() to get to the intended
296 data. This function is also stable: after an input string is
297 transformed the first time, all further transformations of the
298 result yield the same result string.
300 Let's discuss why this function is needed.
302 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
303 space character would mess up the HTTP request, it needs to be
306 GET /abc%20def HTTP/1.0
308 It appears that the unsafe chars need to be quoted, for example
309 with url_escape. But what if we're requested to download
310 `abc%20def'? url_escape transforms "%" to "%25", which would leave
311 us with `abc%2520def'. This is incorrect -- since %-escapes are
312 part of URL syntax, "%20" is the correct way to denote a literal
313 space on the Wget command line. This leads us to the conclusion
314 that in that case Wget should not call url_escape, but leave the
317 And what if the requested URI is `abc%20 def'? If we call
318 url_escape, we end up with `/abc%2520%20def', which is almost
319 certainly not intended. If we don't call url_escape, we are left
320 with the embedded space and cannot complete the request. What the
321 user meant was for Wget to request `/abc%20%20def', and this is
322 where reencode_escapes kicks in.
324 Wget used to solve this by first decoding %-quotes, and then
325 encoding all the "unsafe" characters found in the resulting string.
326 This was wrong because it didn't preserve certain URL special
327 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
328 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
329 whether we considered `+' reserved (it is). One of these results
330 is inevitable because by the second step we would lose information
331 on whether the `+' was originally encoded or not. Both results
332 were wrong because in CGI parameters + means space, while %2B means
333 literal plus. reencode_escapes correctly translates the above to
334 "a%2B+b", i.e. returns the original string.
336 This function uses an algorithm proposed by Anon Sricharoenchai:
338 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
341 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
344 ...except that this code conflates the two steps, and decides
345 whether to encode, decode, or pass through each character in turn.
346 The function still uses two passes, but their logic is the same --
347 the first pass exists merely for the sake of allocation. Another
348 small difference is that we include `+' in URL_RESERVED.
352 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
354 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
358 "foo bar" -> "foo%20bar"
359 "foo%20bar" -> "foo%20bar"
360 "foo %20bar" -> "foo%20%20bar"
361 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
362 "foo%25%20bar" -> "foo%25%20bar"
363 "foo%2%20bar" -> "foo%252%20bar"
364 "foo+bar" -> "foo+bar" (plus is reserved!)
365 "foo%2b+bar" -> "foo%2b+bar" */
368 reencode_escapes (const char *s)
374 int encode_count = 0;
375 int decode_count = 0;
377 /* First, pass through the string to see if there's anything to do,
378 and to calculate the new length. */
379 for (p1 = s; *p1; p1++)
381 switch (decide_copy_method (p1))
394 if (!encode_count && !decode_count)
395 /* The string is good as it is. */
396 return (char *)s; /* C const model sucks. */
399 /* Each encoding adds two characters (hex digits), while each
400 decoding removes two characters. */
401 newlen = oldlen + 2 * (encode_count - decode_count);
402 newstr = xmalloc (newlen + 1);
409 switch (decide_copy_method (p1))
413 unsigned char c = *p1++;
415 *p2++ = XNUM_TO_DIGIT (c >> 4);
416 *p2++ = XNUM_TO_DIGIT (c & 0xf);
420 *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
421 p1 += 3; /* skip %xx */
428 assert (p2 - newstr == newlen);
432 /* Returns the scheme type if the scheme is supported, or
433 SCHEME_INVALID if not. */
436 url_scheme (const char *url)
440 for (i = 0; supported_schemes[i].leading_string; i++)
441 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
442 strlen (supported_schemes[i].leading_string)))
444 if (supported_schemes[i].enabled)
445 return (enum url_scheme) i;
447 return SCHEME_INVALID;
450 return SCHEME_INVALID;
453 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
455 /* Return 1 if the URL begins with any "scheme", 0 otherwise. As
456 currently implemented, it returns true if URL begins with
460 url_has_scheme (const char *url)
464 /* The first char must be a scheme char. */
465 if (!*p || !SCHEME_CHAR (*p))
468 /* Followed by 0 or more scheme chars. */
469 while (*p && SCHEME_CHAR (*p))
471 /* Terminated by ':'. */
476 scheme_default_port (enum url_scheme scheme)
478 return supported_schemes[scheme].default_port;
482 scheme_disable (enum url_scheme scheme)
484 supported_schemes[scheme].enabled = 0;
487 /* Skip the username and password, if present in the URL. The
488 function should *not* be called with the complete URL, but with the
489 portion after the scheme.
491 If no username and password are found, return URL. */
494 url_skip_credentials (const char *url)
496 /* Look for '@' that comes before terminators, such as '/', '?',
498 const char *p = (const char *)strpbrk (url, "@/?#;");
504 /* Parse credentials contained in [BEG, END). The region is expected
505 to have come from a URL and is unescaped. */
508 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
514 return 0; /* empty user name */
516 colon = memchr (beg, ':', end - beg);
518 return 0; /* again empty user name */
522 *passwd = strdupdelim (colon + 1, end);
524 url_unescape (*passwd);
531 *user = strdupdelim (beg, userend);
532 url_unescape (*user);
536 /* Used by main.c: detect URLs written using the "shorthand" URL forms
537 popularized by Netscape and NcFTP. HTTP shorthands look like this:
539 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
540 www.foo.com[:port] -> http://www.foo.com[:port]
542 FTP shorthands look like this:
544 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
545 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
547 If the URL need not or cannot be rewritten, return NULL. */
550 rewrite_shorthand_url (const char *url)
554 if (url_scheme (url) != SCHEME_INVALID)
557 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
559 for (p = url; *p && *p != ':' && *p != '/'; p++)
569 /* If the characters after the colon and before the next slash
570 or end of string are all digits, it's HTTP. */
572 for (pp = p + 1; ISDIGIT (*pp); pp++)
574 if (digits > 0 && (*pp == '/' || *pp == '\0'))
577 /* Prepend "ftp://" to the entire URL... */
578 res = xmalloc (6 + strlen (url) + 1);
579 sprintf (res, "ftp://%s", url);
580 /* ...and replace ':' with '/'. */
581 res[6 + (p - url)] = '/';
588 /* Just prepend "http://" to what we have. */
589 res = xmalloc (7 + strlen (url) + 1);
590 sprintf (res, "http://%s", url);
595 static void split_path PARAMS ((const char *, char **, char **));
597 /* Like strpbrk, with the exception that it returns the pointer to the
598 terminating zero (end-of-string aka "eos") if no matching character
601 Although I normally balk at Gcc-specific optimizations, it probably
602 makes sense here: glibc has optimizations that detect strpbrk being
603 called with literal string as ACCEPT and inline the search. That
604 optimization is defeated if strpbrk is hidden within the call to
605 another function. (And no, making strpbrk_or_eos inline doesn't
606 help because the check for literal accept is in the
611 #define strpbrk_or_eos(s, accept) ({ \
612 char *SOE_p = strpbrk (s, accept); \
614 SOE_p = (char *)s + strlen (s); \
618 #else /* not __GNUC__ */
621 strpbrk_or_eos (const char *s, const char *accept)
623 char *p = strpbrk (s, accept);
625 p = (char *)s + strlen (s);
630 /* Turn STR into lowercase; return non-zero if a character was
634 lowercase_str (char *str)
641 *str = TOLOWER (*str);
646 static const char *parse_errors[] = {
647 #define PE_NO_ERROR 0
649 #define PE_UNSUPPORTED_SCHEME 1
650 N_("Unsupported scheme"),
651 #define PE_EMPTY_HOST 2
653 #define PE_BAD_PORT_NUMBER 3
654 N_("Bad port number"),
655 #define PE_INVALID_USER_NAME 4
656 N_("Invalid user name"),
657 #define PE_UNTERMINATED_IPV6_ADDRESS 5
658 N_("Unterminated IPv6 numeric address"),
659 #define PE_IPV6_NOT_SUPPORTED 6
660 N_("IPv6 addresses not supported"),
661 #define PE_INVALID_IPV6_ADDRESS 7
662 N_("Invalid IPv6 numeric address")
667 Return a new struct url if successful, NULL on error. In case of
668 error, and if ERROR is not NULL, also set *ERROR to the appropriate
671 url_parse (const char *url, int *error)
675 int path_modified, host_modified;
677 enum url_scheme scheme;
679 const char *uname_b, *uname_e;
680 const char *host_b, *host_e;
681 const char *path_b, *path_e;
682 const char *params_b, *params_e;
683 const char *query_b, *query_e;
684 const char *fragment_b, *fragment_e;
687 char *user = NULL, *passwd = NULL;
689 char *url_encoded = NULL;
693 scheme = url_scheme (url);
694 if (scheme == SCHEME_INVALID)
696 error_code = PE_UNSUPPORTED_SCHEME;
700 url_encoded = reencode_escapes (url);
703 p += strlen (supported_schemes[scheme].leading_string);
705 p = url_skip_credentials (p);
708 /* scheme://user:pass@host[:port]... */
711 /* We attempt to break down the URL into the components path,
712 params, query, and fragment. They are ordered like this:
714 scheme://host[:port][/path][;params][?query][#fragment] */
716 params_b = params_e = NULL;
717 query_b = query_e = NULL;
718 fragment_b = fragment_e = NULL;
724 /* Handle IPv6 address inside square brackets. Ideally we'd
725 just look for the terminating ']', but rfc2732 mandates
726 rejecting invalid IPv6 addresses. */
728 /* The address begins after '['. */
730 host_e = strchr (host_b, ']');
734 error_code = PE_UNTERMINATED_IPV6_ADDRESS;
739 /* Check if the IPv6 address is valid. */
740 if (!is_valid_ipv6_address(host_b, host_e))
742 error_code = PE_INVALID_IPV6_ADDRESS;
746 /* Continue parsing after the closing ']'. */
749 error_code = PE_IPV6_NOT_SUPPORTED;
755 p = strpbrk_or_eos (p, ":/;?#");
759 if (host_b == host_e)
761 error_code = PE_EMPTY_HOST;
765 port = scheme_default_port (scheme);
768 const char *port_b, *port_e, *pp;
770 /* scheme://host:port/tralala */
774 p = strpbrk_or_eos (p, "/;?#");
777 /* Allow empty port, as per rfc2396. */
778 if (port_b != port_e)
780 for (port = 0, pp = port_b; pp < port_e; pp++)
784 /* http://host:12randomgarbage/blah */
786 error_code = PE_BAD_PORT_NUMBER;
789 port = 10 * port + (*pp - '0');
790 /* Check for too large port numbers here, before we have
791 a chance to overflow on bogus port values. */
794 error_code = PE_BAD_PORT_NUMBER;
805 p = strpbrk_or_eos (p, ";?#");
810 /* Path is not allowed not to exist. */
818 p = strpbrk_or_eos (p, "?#");
825 p = strpbrk_or_eos (p, "#");
828 /* Hack that allows users to use '?' (a wildcard character) in
829 FTP URLs without it being interpreted as a query string
831 if (scheme == SCHEME_FTP)
833 query_b = query_e = NULL;
846 if (uname_b != uname_e)
848 /* http://user:pass@host */
850 /* uname_b uname_e */
851 if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
853 error_code = PE_INVALID_USER_NAME;
858 u = xnew0 (struct url);
860 u->host = strdupdelim (host_b, host_e);
865 u->path = strdupdelim (path_b, path_e);
866 path_modified = path_simplify (u->path);
867 split_path (u->path, &u->dir, &u->file);
869 host_modified = lowercase_str (u->host);
871 /* Decode %HH sequences in host name. This is important not so much
872 to support %HH sequences, but to support binary characters (which
873 will have been converted to %HH by reencode_escapes). */
874 if (strchr (u->host, '%'))
876 url_unescape (u->host);
881 u->params = strdupdelim (params_b, params_e);
883 u->query = strdupdelim (query_b, query_e);
885 u->fragment = strdupdelim (fragment_b, fragment_e);
887 if (path_modified || u->fragment || host_modified || path_b == path_e)
889 /* If we suspect that a transformation has rendered what
890 url_string might return different from URL_ENCODED, rebuild
891 u->url using url_string. */
892 u->url = url_string (u, 0);
894 if (url_encoded != url)
895 xfree ((char *) url_encoded);
899 if (url_encoded == url)
900 u->url = xstrdup (url);
902 u->url = url_encoded;
909 /* Cleanup in case of error: */
910 if (url_encoded && url_encoded != url)
913 /* Transmit the error code to the caller, if the caller wants to
920 /* Return the error message string from ERROR_CODE, which should have
921 been retrieved from url_parse. The error message is translated. */
924 url_error (int error_code)
926 assert (error_code >= 0 && error_code < countof (parse_errors));
927 return _(parse_errors[error_code]);
930 /* Split PATH into DIR and FILE. PATH comes from the URL and is
931 expected to be URL-escaped.
933 The path is split into directory (the part up to the last slash)
934 and file (the part after the last slash), which are subsequently
938 "foo/bar/baz" "foo/bar" "baz"
939 "foo/bar/" "foo/bar" ""
941 "foo/bar/baz%2fqux" "foo/bar" "baz/qux" (!)
943 DIR and FILE are freshly allocated. */
946 split_path (const char *path, char **dir, char **file)
948 char *last_slash = strrchr (path, '/');
952 *file = xstrdup (path);
956 *dir = strdupdelim (path, last_slash);
957 *file = xstrdup (last_slash + 1);
960 url_unescape (*file);
963 /* Note: URL's "full path" is the path with the query string and
964 params appended. The "fragment" (#foo) is intentionally ignored,
965 but that might be changed. For example, if the original URL was
966 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
967 the full path will be "/foo/bar/baz;bullshit?querystring". */
969 /* Return the length of the full path, without the terminating
973 full_path_length (const struct url *url)
977 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
988 /* Write out the full path. */
991 full_path_write (const struct url *url, char *where)
993 #define FROB(el, chr) do { \
994 char *f_el = url->el; \
996 int l = strlen (f_el); \
998 memcpy (where, f_el, l); \
1010 /* Public function for getting the "full path". E.g. if u->path is
1011 "foo/bar" and u->query is "param=value", full_path will be
1012 "/foo/bar?param=value". */
1015 url_full_path (const struct url *url)
1017 int length = full_path_length (url);
1018 char *full_path = (char *) xmalloc (length + 1);
1020 full_path_write (url, full_path);
1021 full_path[length] = '\0';
1026 /* Unescape CHR in an otherwise escaped STR. Used to selectively
1027 undo the escaping of certain characters, such as "/" and ":". Returns a
1028 count of unescaped chars. */
1031 unescape_single_char (char *str, char chr)
1033 const char c1 = XNUM_TO_DIGIT (chr >> 4);
1034 const char c2 = XNUM_TO_DIGIT (chr & 0xf);
1035 char *h = str; /* hare */
1036 char *t = str; /* tortoise */
1037 for (; *h; h++, t++)
1039 if (h[0] == '%' && h[1] == c1 && h[2] == c2)
1050 /* Escape unsafe and reserved characters, except for the slash
1054 url_escape_dir (const char *dir)
1056 char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1060 unescape_single_char (newdir, '/');
1064 /* Sync u->path and u->url with u->dir and u->file. Called after
1065 u->file or u->dir have been changed, typically by the FTP code. */
1068 sync_path (struct url *u)
1070 char *newpath, *efile, *edir;
1074 /* u->dir and u->file are not escaped. URL-escape them before
1075 reassembling them into u->path. That way, if they contain
1076 separators like '?' or even if u->file contains slashes, the
1077 path will be correctly assembled. (u->file can contain slashes
1078 if the URL specifies it with %2f, or if an FTP server returns
1080 edir = url_escape_dir (u->dir);
1081 efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1084 newpath = xstrdup (efile);
1087 int dirlen = strlen (edir);
1088 int filelen = strlen (efile);
1090 /* Copy "DIR/FILE" to newpath. */
1091 char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1092 memcpy (p, edir, dirlen);
1095 memcpy (p, efile, filelen);
1104 if (efile != u->file)
1107 /* Regenerate u->url as well. */
1109 u->url = url_string (u, 0);
1112 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1113 This way we can sync u->path and u->url when they get changed. */
1116 url_set_dir (struct url *url, const char *newdir)
1119 url->dir = xstrdup (newdir);
1124 url_set_file (struct url *url, const char *newfile)
1127 url->file = xstrdup (newfile);
1132 url_free (struct url *url)
1138 xfree_null (url->params);
1139 xfree_null (url->query);
1140 xfree_null (url->fragment);
1141 xfree_null (url->user);
1142 xfree_null (url->passwd);
1150 /* Create all the necessary directories for PATH (a file). Calls
1151 mkdirhier() internally. */
1153 mkalldirs (const char *path)
1160 p = path + strlen (path);
1161 for (; *p != '/' && p != path; p--)
1164 /* Don't create if it's just a file. */
1165 if ((p == path) && (*p != '/'))
1167 t = strdupdelim (path, p);
1169 /* Check whether the directory exists. */
1170 if ((stat (t, &st) == 0))
1172 if (S_ISDIR (st.st_mode))
1179 /* If the dir exists as a file name, remove it first. This
1180 is *only* for Wget to work with buggy old CERN http
1181 servers. Here is the scenario: When Wget tries to
1182 retrieve a directory without a slash, e.g.
1183 http://foo/bar (bar being a directory), CERN server will
1184 not redirect it to http://foo/bar/ -- it will generate a
1185 directory listing containing links to bar/file1,
1186 bar/file2, etc. Wget will lose because it saves this
1187 HTML listing to a file `bar', so it cannot create the
1188 directory. To work around this, if the file of the same
1189 name exists, we just remove it and create the directory
1191 DEBUGP (("Removing %s because of directory danger!\n", t));
1195 res = make_directory (t);
1197 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1202 /* Functions for constructing the file name out of URL components. */
1204 /* A growable string structure, used by url_file_name and friends.
1205 This should perhaps be moved to utils.c.
1207 The idea is to have a convenient and efficient way to construct a
1208 string by having various functions append data to it. Instead of
1209 passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1210 functions in question, we pass the pointer to this struct. */
1218 /* Ensure that the string can accept APPEND_SIZE more characters past
1219 the current TAIL position. If necessary, this will grow the string
1220 and update its allocated size. If the string is already large
1221 enough to take TAIL+APPEND_SIZE characters, this does nothing. */
1222 #define GROW(g, append_size) do { \
1223 struct growable *G_ = g; \
1224 DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \
1227 /* Return the tail position of the string. */
1228 #define TAIL(r) ((r)->base + (r)->tail)
1230 /* Move the tail position by APPEND_COUNT characters. */
1231 #define TAIL_INCR(r, append_count) ((r)->tail += append_count)
1233 /* Append the string STR to DEST. NOTICE: the string in DEST is not
1237 append_string (const char *str, struct growable *dest)
1239 int l = strlen (str);
1241 memcpy (TAIL (dest), str, l);
1242 TAIL_INCR (dest, l);
1245 /* Append CH to DEST. For example, append_char (0, DEST)
1246 zero-terminates DEST. */
1249 append_char (char ch, struct growable *dest)
1253 TAIL_INCR (dest, 1);
1257 filechr_not_unix = 1, /* unusable on Unix, / and \0 */
1258 filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */
1259 filechr_control = 4 /* a control character, e.g. 0-31 */
1262 #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
1264 /* Shorthands for the table: */
1265 #define U filechr_not_unix
1266 #define W filechr_not_windows
1267 #define C filechr_control
1272 /* Table of characters unsafe under various conditions (see above).
1274 Arguably we could also claim `%' to be unsafe, since we use it as
1275 the escape character. If we ever want to be able to reliably
1276 translate file name back to URL, this would become
1277 crucial. Right now, it's better to be minimal in escaping. */
1279 static const unsigned char filechr_table[256] =
1281 UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */
1282 C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */
1283 C, C, C, C, C, C, C, C, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
1284 C, C, C, C, C, C, C, C, /* CAN EM SUB ESC FS GS RS US */
1285 0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */
1286 0, 0, W, 0, 0, 0, 0, UW, /* ( ) * + , - . / */
1287 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
1288 0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? */
1289 0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
1290 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
1291 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
1292 0, 0, 0, 0, W, 0, 0, 0, /* X Y Z [ \ ] ^ _ */
1293 0, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
1294 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
1295 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
1296 0, 0, 0, 0, 0, 0, 0, 0, /* x y z { | } ~ DEL */
1298 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */
1299 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */
1300 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1301 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1303 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1304 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1305 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1306 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1314 /* FN_PORT_SEP is the separator between host and port in file names
1315 for non-standard port numbers. On Unix this is normally ':', as in
1316 "www.xemacs.org:4001/index.html". Under Windows, we set it to +
1317 because Windows can't handle ':' in file names. */
1318 #define FN_PORT_SEP (opt.restrict_files_os != restrict_windows ? ':' : '+')
1320 /* FN_QUERY_SEP is the separator between the file name and the URL
1321 query, normally '?'. Since Windows cannot handle '?' as part of
1322 file name, we use '@' instead there. */
1323 #define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
1325 /* Quote path element, characters in [b, e), as file name, and append
1326 the quoted string to DEST. Each character is quoted as per
1327 FILE_CHAR_TEST and the corresponding table (filechr_table).
1329 If ESCAPED_P is non-zero, the path element is considered to be
1330 URL-escaped and will be unescaped prior to inspection. */
1333 append_uri_pathel (const char *b, const char *e, int escaped_p,
1334 struct growable *dest)
1340 if (opt.restrict_files_os == restrict_unix)
1341 mask = filechr_not_unix;
1343 mask = filechr_not_windows;
1344 if (opt.restrict_files_ctrl)
1345 mask |= filechr_control;
1347 /* Copy [b, e) to PATHEL and URL-unescape it. */
1351 BOUNDED_TO_ALLOCA (b, e, unescaped);
1352 url_unescape (unescaped);
1354 e = unescaped + strlen (unescaped);
1357 /* Defang ".." when found as component of path. Remember that path
1358 comes from the URL and might contain malicious input. */
1359 if (e - b == 2 && b[0] == '.' && b[1] == '.')
1365 /* Walk the PATHEL string and check how many characters we'll need
1368 for (p = b; p < e; p++)
1369 if (FILE_CHAR_TEST (*p, mask))
1372 /* Calculate the length of the output string. e-b is the input
1373 string length. Each quoted char introduces two additional
1374 characters in the string, hence 2*quoted. */
1375 outlen = (e - b) + (2 * quoted);
1376 GROW (dest, outlen);
1380 /* If there's nothing to quote, we can simply append the string
1381 without processing it again. */
1382 memcpy (TAIL (dest), b, outlen);
1386 char *q = TAIL (dest);
1387 for (p = b; p < e; p++)
1389 if (!FILE_CHAR_TEST (*p, mask))
1393 unsigned char ch = *p;
1395 *q++ = XNUM_TO_DIGIT (ch >> 4);
1396 *q++ = XNUM_TO_DIGIT (ch & 0xf);
1399 assert (q - TAIL (dest) == outlen);
1401 TAIL_INCR (dest, outlen);
1404 /* Append to DEST the directory structure that corresponds to the
1405 directory part of URL's path. For example, if the URL is
1406 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1408 Each path element ("dir1" and "dir2" in the above example) is
1409 examined, url-unescaped, and re-escaped as file name element.
1411 Additionally, it cuts as many directories from the path as
1412 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1413 will produce "bar" for the above example. For 2 or more, it will
1416 Each component of the path is quoted for use as file name. */
1419 append_dir_structure (const struct url *u, struct growable *dest)
1421 char *pathel, *next;
1422 int cut = opt.cut_dirs;
1424 /* Go through the path components, de-URL-quote them, and quote them
1425 (if necessary) as file names. */
1428 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1433 /* Ignore empty pathels. */
1437 append_char ('/', dest);
1438 append_uri_pathel (pathel, next, 1, dest);
1442 /* Return a unique file name that matches the given URL as well as
1443 possible. Does not create directories on the file system.
The name is assembled in this order: opt.dir_prefix; then, when
"dirstruct" is on, an optional scheme directory
(opt.protocol_directories), an optional host directory
(opt.add_hostdir, with the port appended after FN_PORT_SEP when it is
not the scheme's default port) and the URL's directory structure;
then the file name ("index.html" when the URL has none); and finally
the query, if any, after FN_QUERY_SEP. */
1446 url_file_name (const struct url *u)
1448 struct growable fnres; /* stands for "file name result" */
1450 const char *u_file, *u_query;
1451 char *fname, *unique;
1457 /* Start with the directory prefix, if specified. */
1459 append_string (opt.dir_prefix, &fnres);
1461 /* If "dirstruct" is turned on (typically the case with -r), add
1462 the host and port (unless those have been turned off) and
1463 directory structure. */
1466 if (opt.protocol_directories)
1469 append_char ('/', &fnres);
1470 append_string (supported_schemes[u->scheme].name, &fnres);
1472 if (opt.add_hostdir)
1475 append_char ('/', &fnres);
1476 if (0 != strcmp (u->host, ".."))
1477 append_string (u->host, &fnres);
1479 /* Host name can come from the network; malicious DNS may
1480 allow ".." to be resolved, causing us to write to
1481 "../<file>". Defang such host names. */
1482 append_string ("%2E%2E", &fnres);
1483 if (u->port != scheme_default_port (u->scheme))
1486 number_to_string (portstr, u->port);
1487 append_char (FN_PORT_SEP, &fnres);
1488 append_string (portstr, &fnres);
1492 append_dir_structure (u, &fnres);
1495 /* Add the file name. */
1497 append_char ('/', &fnres);
1498 u_file = *u->file ? u->file : "index.html";
/* escaped_p is 0 here, so the file name is quoted without being
   URL-unescaped first; the query below is passed with escaped_p 1. */
1499 append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1501 /* Append the query, if any, to the file name after FN_QUERY_SEP. */
1502 u_query = u->query && *u->query ? u->query : NULL;
1505 append_char (FN_QUERY_SEP, &fnres);
1506 append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1509 /* Zero-terminate the file name. */
1510 append_char ('\0', &fnres);
1514 /* Check the cases in which the unique extensions are not used:
1515 1) Clobbering is turned off (-nc).
1516 2) Retrieval with regetting.
1517 3) Timestamping is used.
1518 4) Hierarchy is built.
1520 The exception is the case when file does exist and is a
1521 directory (see `mkalldirs' for explanation). */
1523 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1524 && !(file_exists_p (fname) && !file_non_directory_p (fname)))
/* unique_name may hand back FNAME itself; the pointer comparison
   below tells the two cases apart. */
1527 unique = unique_name (fname, 1);
1528 if (unique != fname)
1533 /* Resolve "." and ".." elements of PATH by destructively modifying
1534 PATH and return non-zero if PATH has been modified, zero otherwise.
1536 The algorithm is in spirit similar to the one described in rfc1808,
1537 although implemented differently, in one pass. To recap, path
1538 elements containing only "." are removed, and ".." is taken to mean
1539 "back up one element". Single leading and trailing slashes are
1542 This function does not handle URL escapes explicitly. If you're
1543 passing paths from URLs, make sure to unquote "%2e" and "%2E" to
1544 ".", so that this function can find the dots. (Wget's URL parser
1545 calls reencode_escapes, which see.)
1547 For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
1548 test examples are provided below. If you change anything in this
1549 function, run test_path_simplify to make sure you haven't broken a
1553 path_simplify (char *path)
1555 char *h = path; /* hare */
1556 char *t = path; /* tortoise */
1557 char *beg = path; /* boundary for backing the tortoise */
1558 char *end = path + strlen (path);
/* H (the hare) scans the input one path element at a time; T (the
   tortoise) marks the end of the simplified output, which is built
   in place in PATH. BEG is the point T may not retreat past. */
1562 /* Hare should be at the beginning of a path element. */
/* A "." element: a dot followed by a slash or by the end of the
   string. */
1564 if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
1569 else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
1571 /* Handle "../" by retreating the tortoise by one path
1572 element -- but not past beginning. */
1575 /* Move backwards until T hits the beginning of the
1576 previous path element or the beginning of path. */
1577 for (--t; t > beg && t[-1] != '/'; t--)
1582 /* If we're at the beginning, copy the "../" literally and
1583 move the beginning so a later ".." doesn't remove
1593 /* A regular path element. If H hasn't advanced past T,
1594 simply skip to the next path element. Otherwise, copy
1595 the path element until the next slash. */
1598 /* Skip the path element, including the slash. */
1599 while (h < end && *h != '/')
1606 /* Copy the path element, including the final slash. */
1607 while (h < end && *h != '/')
/* NOTE(review): the search for a terminator starts at the first
   character of URL, so everything before the first '?', ';' or '#'
   -- including a scheme and host part, if URL has one -- is presumably
   counted in the result. Confirm against the callers. */
1621 /* Return the length of URL's path. Path is considered to be
1622 terminated by one of '?', ';', '#', or by the end of the
1626 path_length (const char *url)
1628 const char *q = strpbrk_or_eos (url, "?;#");
1632 /* Return a pointer to the last occurrence of character C in the
1633 half-open range [b, e), or NULL if C does not occur in it. We
1634 might want to use memrchr (a GNU extension) under GNU libc. */
1637 find_last_char (const char *b, const char *e, char c)
1645 /* Merge BASE with LINK and return the resulting URI.
1647 Either of the URIs may be absolute or relative, complete with the
1648 host name, or path only. This tries to reasonably handle all
1649 foreseeable cases. It only employs minimal URL parsing, without
1650 knowledge of the specifics of schemes.
The result is always freshly allocated (with xstrdup or xmalloc),
even when it is textually identical to BASE or LINK.
1652 I briefly considered making this function call path_simplify after
1653 the merging process, as rfc1738 seems to suggest. This is a bad
1654 idea for several reasons: 1) it complexifies the code, and 2)
1655 url_parse has to simplify path anyway, so it's wasteful to boot. */
1658 uri_merge (const char *base, const char *link)
1664 if (url_has_scheme (link))
1665 return xstrdup (link);
1667 /* We may not examine BASE past END. */
1668 end = base + path_length (base);
1669 linklength = strlen (link);
1673 /* Empty LINK points back to BASE, query string and all. */
1674 return xstrdup (base);
1676 else if (*link == '?')
1678 /* LINK points to the same location, but changes the query
1679 string. Examples: */
1680 /* uri_merge("path", "?new") -> "path?new" */
1681 /* uri_merge("path?foo", "?new") -> "path?new" */
1682 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1683 /* uri_merge("path#foo", "?new") -> "path?new" */
1684 int baselength = end - base;
1685 merge = xmalloc (baselength + linklength + 1);
1686 memcpy (merge, base, baselength);
1687 memcpy (merge + baselength, link, linklength);
1688 merge[baselength + linklength] = '\0';
1690 else if (*link == '#')
1692 /* uri_merge("path", "#new") -> "path#new" */
1693 /* uri_merge("path#foo", "#new") -> "path#new" */
1694 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1695 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1697 const char *end1 = strchr (base, '#');
1699 end1 = base + strlen (base);
1700 baselength = end1 - base;
1701 merge = xmalloc (baselength + linklength + 1);
1702 memcpy (merge, base, baselength);
1703 memcpy (merge + baselength, link, linklength);
1704 merge[baselength + linklength] = '\0';
1706 else if (*link == '/' && *(link + 1) == '/')
1708 /* LINK begins with "//" and so is a net path: we need to
1709 replace everything after (and including) the double slash
1712 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1713 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1714 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1718 const char *start_insert;
1720 /* Look for first slash. */
1721 slash = memchr (base, '/', end - base);
1722 /* If found slash and it is a double slash, then replace
1723 from this point, else default to replacing from the
1725 if (slash && *(slash + 1) == '/')
1726 start_insert = slash;
1728 start_insert = base;
1730 span = start_insert - base;
1731 merge = (char *)xmalloc (span + linklength + 1);
1733 memcpy (merge, base, span);
1734 memcpy (merge + span, link, linklength);
1735 merge[span + linklength] = '\0';
1737 else if (*link == '/')
1739 /* LINK is an absolute path: we need to replace everything
1740 after (and including) the FIRST slash with LINK.
1742 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1743 "/qux/xyzzy", our result should be
1744 "http://host/qux/xyzzy". */
1747 const char *start_insert = NULL; /* for gcc to shut up. */
1748 const char *pos = base;
1749 int seen_slash_slash = 0;
1750 /* We're looking for the first slash, but want to ignore
1753 slash = memchr (pos, '/', end - pos);
1754 if (slash && !seen_slash_slash)
1755 if (*(slash + 1) == '/')
1758 seen_slash_slash = 1;
1762 /* At this point, SLASH is the location of the first / after
1763 "//", or the first slash altogether. START_INSERT is the
1764 pointer to the location where LINK will be inserted. When
1765 examining the last two examples, keep in mind that LINK
1768 if (!slash && !seen_slash_slash)
1769 /* example: "foo" */
1771 start_insert = base;
1772 else if (!slash && seen_slash_slash)
1773 /* example: "http://foo" */
1776 else if (slash && !seen_slash_slash)
1777 /* example: "foo/bar" */
1779 start_insert = base;
1780 else if (slash && seen_slash_slash)
1781 /* example: "http://something/" */
1783 start_insert = slash;
1785 span = start_insert - base;
1786 merge = (char *)xmalloc (span + linklength + 1);
1788 memcpy (merge, base, span);
1789 memcpy (merge + span, link, linklength);
1790 merge[span + linklength] = '\0';
1794 /* LINK is a relative URL: we need to replace everything
1795 after last slash (possibly empty) with LINK.
1797 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1798 our result should be "whatever/foo/qux/xyzzy". */
1799 int need_explicit_slash = 0;
1801 const char *start_insert;
1802 const char *last_slash = find_last_char (base, end, '/');
1805 /* No slash found at all. Replace what we have with LINK. */
1806 start_insert = base;
1808 else if (last_slash && last_slash >= base + 2
1809 && last_slash[-2] == ':' && last_slash[-1] == '/')
1811 /* example: "http://host" -- the last slash belongs to the "://",
so a '/' has to be inserted between the host and LINK.
START_INSERT is set one past END to make room for it; the extra
byte copied from BASE is overwritten with '/' below. */
1813 start_insert = end + 1;
1814 need_explicit_slash = 1;
1818 /* example: "whatever/foo/bar" */
1820 start_insert = last_slash + 1;
1823 span = start_insert - base;
1824 merge = (char *)xmalloc (span + linklength + 1);
1826 memcpy (merge, base, span);
1827 if (need_explicit_slash)
1828 merge[span - 1] = '/';
1829 memcpy (merge + span, link, linklength);
1830 merge[span + linklength] = '\0';
/* Copy the characters of string S (without its terminating NUL) to P.
   S is expanded twice, once for strlen and once for memcpy, so it
   should be free of side effects. */
1836 #define APPEND(p, s) do { \
1837 int len = strlen (s); \
1838 memcpy (p, s, len); \
1842 /* Use this instead of password when the actual password is supposed
1843 to be hidden. We intentionally use a generic string without giving
1844 away the number of characters in the password, like previous
1846 #define HIDDEN_PASSWORD "*password*"
1848 /* Recreate the URL string from the data in URL.
1850 If HIDE_PASSWORD is non-zero (as it is when we're calling this on a URL we
1851 plan to print, but not when calling it to canonicalize a URL for
1852 use within the program), password will be hidden. Unsafe
1853 characters in the URL will be quoted.
The string is built in a buffer obtained from xmalloc whose size is
computed up front from the individual pieces. */
1856 url_string (const struct url *url, int hide_password)
1860 char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1862 int scheme_port = supported_schemes[url->scheme].default_port;
1863 const char *scheme_str = supported_schemes[url->scheme].leading_string;
1864 int fplen = full_path_length (url);
1866 int brackets_around_host;
1868 assert (scheme_str != NULL);
1870 /* Make sure the user name and password are quoted. */
1873 quoted_user = url_escape_allow_passthrough (url->user);
1877 quoted_passwd = HIDDEN_PASSWORD;
1879 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1883 /* In the unlikely event that the host name contains non-printable
1884 characters, quote it for displaying to the user. */
1885 quoted_host = url_escape_allow_passthrough (url->host);
1887 /* Undo the quoting of colons that URL escaping performs. IPv6
1888 addresses may legally contain colons, and in that case must be
1889 placed in square brackets. */
1890 if (quoted_host != url->host)
1891 unescape_single_char (quoted_host, ':');
1892 brackets_around_host = strchr (quoted_host, ':') != NULL;
/* Compute the exact output size before writing anything; the assert
   after the writes checks that the two stay in sync. */
1894 size = (strlen (scheme_str)
1895 + strlen (quoted_host)
1896 + (brackets_around_host ? 2 : 0)
1899 if (url->port != scheme_port)
1900 size += 1 + numdigit (url->port);
1903 size += 1 + strlen (quoted_user);
1905 size += 1 + strlen (quoted_passwd);
1908 p = result = xmalloc (size);
1910 APPEND (p, scheme_str);
1913 APPEND (p, quoted_user);
1917 APPEND (p, quoted_passwd);
1922 if (brackets_around_host)
1924 APPEND (p, quoted_host);
1925 if (brackets_around_host)
1927 if (url->port != scheme_port)
1930 p = number_to_string (p, url->port);
1933 full_path_write (url, p);
1937 assert (p - result == size);
/* Free only the quoted strings that are distinct from the originals
   in URL. When the password is hidden, QUOTED_PASSWD is the
   HIDDEN_PASSWORD literal and must not be freed, hence the
   !hide_password test. */
1939 if (quoted_user && quoted_user != url->user)
1940 xfree (quoted_user);
1941 if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
1942 xfree (quoted_passwd);
1943 if (quoted_host != url->host)
1944 xfree (quoted_host);
/* NOTE(review): the http/https pairing below is tested in both
   argument orders, so the relation is symmetric. The description
   says this pairing applies only "if SSL is supported"; confirm that
   the test is guarded accordingly. */
1949 /* Return non-zero if scheme a is similar to scheme b.
1951 Schemes are similar if they are equal. If SSL is supported, schemes
1952 are also similar if one is http (SCHEME_HTTP) and the other is https
1955 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1960 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1961 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1968 /* Debugging and testing support for path_simplify. */
1970 /* Debug: run path_simplify on a copy of PATH (made with xstrdup, so
PATH itself is left untouched) and return the result in a new
1971 string. Useful for calling from the debugger. */
1975 char *copy = xstrdup (path);
1976 path_simplify (copy);
/* Run path_simplify on a copy of TEST and report, via printf, when the
   simplified string differs from EXPECTED_RESULT or when the value
   returned by path_simplify differs from EXPECTED_CHANGE. TEST itself
   is not modified. */
1981 run_test (char *test, char *expected_result, int expected_change)
1983 char *test_copy = xstrdup (test);
1984 int modified = path_simplify (test_copy);
1986 if (0 != strcmp (test_copy, expected_result))
1988 printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
1989 test, expected_result, test_copy);
/* The "modified" return value is checked independently of the
   resulting string. */
1991 if (modified != expected_change)
1993 if (expected_change == 1)
1994 printf ("Expected modification with path_simplify(\"%s\").\n",
1997 printf ("Expected no modification with path_simplify(\"%s\").\n",
/* Table-driven self-test for path_simplify. Each table entry holds an
   input path, the expected simplified path, and the expected return
   value (1 if path_simplify should report a modification, 0 if not).
   Every entry is handed to run_test. */
2004 test_path_simplify (void)
2007 char *test, *result;
2014 { "../", "../", 0 },
2015 { "foo", "foo", 0 },
2016 { "foo/bar", "foo/bar", 0 },
2017 { "foo///bar", "foo///bar", 0 },
2018 { "foo/.", "foo/", 1 },
2019 { "foo/./", "foo/", 1 },
2020 { "foo./", "foo./", 0 },
2021 { "foo/../bar", "bar", 1 },
2022 { "foo/../bar/", "bar/", 1 },
2023 { "foo/bar/..", "foo/", 1 },
2024 { "foo/bar/../x", "foo/x", 1 },
2025 { "foo/bar/../x/", "foo/x/", 1 },
2026 { "foo/..", "", 1 },
2027 { "foo/../..", "..", 1 },
2028 { "foo/../../..", "../..", 1 },
2029 { "foo/../../bar/../../baz", "../../baz", 1 },
2030 { "a/b/../../c", "c", 1 },
2031 { "./a/../b", "b", 1 }
2035 for (i = 0; i < countof (tests); i++)
2037 char *test = tests[i].test;
2038 char *expected_result = tests[i].result;
2039 int expected_change = tests[i].should_modify;
2040 run_test (test, expected_result, expected_change);