2 Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003
3 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or (at
10 your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 In addition, as a special exception, the Free Software Foundation
22 gives permission to link the code of its release of Wget with the
23 OpenSSL project's "OpenSSL" library (or with modified versions of it
24 that use the same license as the "OpenSSL" library), and distribute
25 the linked executables. You must obey the GNU General Public License
26 in all respects for all of the code used other than "OpenSSL". If you
27 modify this file, you may extend this exception to your version of the
28 file, but you are not obligated to do so. If you do not wish to do
29 so, delete this exception statement from your version. */
40 #include <sys/types.h>
58 const char *leading_string;
63 /* Supported schemes: */
64 static struct scheme_data supported_schemes[] =
66 { "http", "http://", DEFAULT_HTTP_PORT, 1 },
68 { "https", "https://", DEFAULT_HTTPS_PORT, 1 },
70 { "ftp", "ftp://", DEFAULT_FTP_PORT, 1 },
76 /* Forward declarations: */
78 static int path_simplify PARAMS ((char *));
80 /* Support for escaping and unescaping of URL strings. */
82 /* Table of "reserved" and "unsafe" characters. Those terms are
83 rfc1738-speak, as such largely obsoleted by rfc2396 and later
84 specs, but the general idea remains.
86 A reserved character is the one that you can't decode without
87 changing the meaning of the URL. For example, you can't decode
88 "/foo/%2f/bar" into "/foo///bar" because the number and contents of
89 path components is different. Non-reserved characters can be
90 changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". Wget
91 uses the rfc1738 set of reserved characters, plus "$" and ",", as
92 recommended by rfc2396.
94 An unsafe character is one that should be encoded when URLs
95 are placed in foreign environments. E.g. space and newline are
96 unsafe in HTTP contexts because HTTP uses them as separator and
97 terminator, so they must be encoded to %20 and %0A respectively.
98 "*" is unsafe in shell context, etc.
100 We determine whether a character is unsafe through static table
101 lookup. This code assumes ASCII character set and 8-bit chars. */
104 /* rfc1738 reserved chars + "$" and ",". */
107 /* rfc1738 unsafe chars, plus non-printables. */
111 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
112 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
113 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
115 /* Shorthands for the table: */
116 #define R urlchr_reserved
117 #define U urlchr_unsafe
120 const static unsigned char urlchr_table[256] =
122 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
123 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
124 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
125 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
126 U, 0, U, RU, R, U, R, 0, /* SP ! " # $ % & ' */
127 0, 0, 0, R, R, 0, 0, R, /* ( ) * + , - . / */
128 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
129 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
130 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
131 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
132 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
133 0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
134 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
135 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
136 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
137 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
139 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
140 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
141 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
142 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
144 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
145 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
146 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
147 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
153 /* URL-unescape the string S.
155 This is done by transforming the sequences "%HH" to the character
156 represented by the hexadecimal digits HH. If % is not followed by
157 two hexadecimal digits, it is inserted literally.
159 The transformation is done in place. If you need the original
160 string intact, make a copy before calling this function. */
163 url_unescape (char *s)
165 char *t = s; /* t - tortoise */
166 char *h = s; /* h - hare */
177 /* Do nothing if '%' is not followed by two hex digits. */
178 if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
180 *t = X2DIGITS_TO_NUM (h[1], h[2]);
187 /* The core of url_escape_* functions. Escapes the characters that
188 match the provided mask in urlchr_table.
190 If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
191 will be returned unchanged. If ALLOW_PASSTHROUGH is zero, a
192 freshly allocated string will be returned in all cases. */
195 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
202 for (p1 = s; *p1; p1++)
203 if (urlchr_test (*p1, mask))
204 addition += 2; /* Two more characters (hex digits) */
207 return allow_passthrough ? (char *)s : xstrdup (s);
209 newlen = (p1 - s) + addition;
210 newstr = (char *)xmalloc (newlen + 1);
216 /* Quote the characters that match the test mask. */
217 if (urlchr_test (*p1, mask))
219 unsigned char c = *p1++;
221 *p2++ = XNUM_TO_DIGIT (c >> 4);
222 *p2++ = XNUM_TO_DIGIT (c & 0xf);
227 assert (p2 - newstr == newlen);
233 /* URL-escape the unsafe characters (see urlchr_table) in a given
234 string, returning a freshly allocated string. */
237 url_escape (const char *s)
239 return url_escape_1 (s, urlchr_unsafe, 0);
242 /* URL-escape the unsafe characters (see urlchr_table) in a given
243 string. If no characters are unsafe, S is returned. */
246 url_escape_allow_passthrough (const char *s)
248 return url_escape_1 (s, urlchr_unsafe, 1);
251 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
253 /* Decide whether to encode, decode, or pass through the char at P.
254 This used to be a macro, but it got a little too convoluted. */
255 static inline enum copy_method
256 decide_copy_method (const char *p)
260 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
262 /* %xx sequence: decode it, unless it would decode to an
263 unsafe or a reserved char; in that case, leave it as
265 char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
266 if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
267 return CM_PASSTHROUGH;
272 /* Garbled %.. sequence: encode `%'. */
275 else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
278 return CM_PASSTHROUGH;
281 /* Translate a %-escaped (but possibly non-conformant) input string S
282 into a %-escaped (and conformant) output string. If no characters
283 are encoded or decoded, return the same string S; otherwise, return
284 a freshly allocated string with the new contents.
286 After a URL has been run through this function, the protocols that
287 use `%' as the quote character can use the resulting string as-is,
288 while those that don't call url_unescape() to get to the intended
289 data. This function is also stable: after an input string is
290 transformed the first time, all further transformations of the
291 result yield the same result string.
293 Let's discuss why this function is needed.
295 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
296 space character would mess up the HTTP request, it needs to be
299 GET /abc%20def HTTP/1.0
301 It appears that the unsafe chars need to be quoted, for example
302 with url_escape. But what if we're requested to download
303 `abc%20def'? url_escape transforms "%" to "%25", which would leave
304 us with `abc%2520def'. This is incorrect -- since %-escapes are
305 part of URL syntax, "%20" is the correct way to denote a literal
306 space on the Wget command line. This leads us to the conclusion
307 that in that case Wget should not call url_escape, but leave the
310 And what if the requested URI is `abc%20 def'? If we call
311 url_escape, we end up with `/abc%2520%20def', which is almost
312 certainly not intended. If we don't call url_escape, we are left
313 with the embedded space and cannot complete the request. What the
314 user meant was for Wget to request `/abc%20%20def', and this is
315 where reencode_escapes kicks in.
317 Wget used to solve this by first decoding %-quotes, and then
318 encoding all the "unsafe" characters found in the resulting string.
319 This was wrong because it didn't preserve certain URL special
320 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
321 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
322 whether we considered `+' reserved (it is). One of these results
323 is inevitable because by the second step we would lose information
324 on whether the `+' was originally encoded or not. Both results
325 were wrong because in CGI parameters + means space, while %2B means
326 literal plus. reencode_escapes correctly translates the above to
327 "a%2B+b", i.e. returns the original string.
329 This function uses an algorithm proposed by Anon Sricharoenchai:
331 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
334 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
337 ...except that this code conflates the two steps, and decides
338 whether to encode, decode, or pass through each character in turn.
339 The function still uses two passes, but their logic is the same --
340 the first pass exists merely for the sake of allocation. Another
341 small difference is that we include `+' to URL_RESERVED.
345 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
347 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
351 "foo bar" -> "foo%20bar"
352 "foo%20bar" -> "foo%20bar"
353 "foo %20bar" -> "foo%20%20bar"
354 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
355 "foo%25%20bar" -> "foo%25%20bar"
356 "foo%2%20bar" -> "foo%252%20bar"
357 "foo+bar" -> "foo+bar" (plus is reserved!)
358 "foo%2b+bar" -> "foo%2b+bar" */
361 reencode_escapes (const char *s)
367 int encode_count = 0;
368 int decode_count = 0;
370 /* First, pass through the string to see if there's anything to do,
371 and to calculate the new length. */
372 for (p1 = s; *p1; p1++)
374 switch (decide_copy_method (p1))
387 if (!encode_count && !decode_count)
388 /* The string is good as it is. */
389 return (char *)s; /* C const model sucks. */
392 /* Each encoding adds two characters (hex digits), while each
393 decoding removes two characters. */
394 newlen = oldlen + 2 * (encode_count - decode_count);
395 newstr = xmalloc (newlen + 1);
402 switch (decide_copy_method (p1))
406 unsigned char c = *p1++;
408 *p2++ = XNUM_TO_DIGIT (c >> 4);
409 *p2++ = XNUM_TO_DIGIT (c & 0xf);
413 *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
414 p1 += 3; /* skip %xx */
421 assert (p2 - newstr == newlen);
425 /* Returns the scheme type if the scheme is supported, or
426 SCHEME_INVALID if not. */
429 url_scheme (const char *url)
433 for (i = 0; supported_schemes[i].leading_string; i++)
434 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
435 strlen (supported_schemes[i].leading_string)))
437 if (supported_schemes[i].enabled)
438 return (enum url_scheme) i;
440 return SCHEME_INVALID;
443 return SCHEME_INVALID;
446 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
448 /* Return 1 if the URL begins with any "scheme", 0 otherwise. As
449 currently implemented, it returns true if URL begins with
453 url_has_scheme (const char *url)
457 /* The first char must be a scheme char. */
458 if (!*p || !SCHEME_CHAR (*p))
461 /* Followed by 0 or more scheme chars. */
462 while (*p && SCHEME_CHAR (*p))
464 /* Terminated by ':'. */
469 scheme_default_port (enum url_scheme scheme)
471 return supported_schemes[scheme].default_port;
475 scheme_disable (enum url_scheme scheme)
477 supported_schemes[scheme].enabled = 0;
480 /* Skip the username and password, if present in the URL. The
481 function should *not* be called with the complete URL, but with the
482 portion after the scheme.
484 If no username and password are found, return URL. */
487 url_skip_credentials (const char *url)
489 /* Look for '@' that comes before terminators, such as '/', '?',
491 const char *p = (const char *)strpbrk (url, "@/?#;");
497 /* Parse credentials contained in [BEG, END). The region is expected
498 to have come from a URL and is unescaped. */
501 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
507 return 0; /* empty user name */
509 colon = memchr (beg, ':', end - beg);
511 return 0; /* again empty user name */
515 *passwd = strdupdelim (colon + 1, end);
517 url_unescape (*passwd);
524 *user = strdupdelim (beg, userend);
525 url_unescape (*user);
529 /* Used by main.c: detect URLs written using the "shorthand" URL forms
530 popularized by Netscape and NcFTP. HTTP shorthands look like this:
532 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
533 www.foo.com[:port] -> http://www.foo.com[:port]
535 FTP shorthands look like this:
537 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
538 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
540 If the URL need not or cannot be rewritten, return NULL. */
543 rewrite_shorthand_url (const char *url)
547 if (url_has_scheme (url))
550 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
552 for (p = url; *p && *p != ':' && *p != '/'; p++)
562 /* If the characters after the colon and before the next slash
563 or end of string are all digits, it's HTTP. */
565 for (pp = p + 1; ISDIGIT (*pp); pp++)
567 if (digits > 0 && (*pp == '/' || *pp == '\0'))
570 /* Prepend "ftp://" to the entire URL... */
571 res = xmalloc (6 + strlen (url) + 1);
572 sprintf (res, "ftp://%s", url);
573 /* ...and replace ':' with '/'. */
574 res[6 + (p - url)] = '/';
581 /* Just prepend "http://" to what we have. */
582 res = xmalloc (7 + strlen (url) + 1);
583 sprintf (res, "http://%s", url);
588 static void split_path PARAMS ((const char *, char **, char **));
590 /* Like strpbrk, with the exception that it returns the pointer to the
591 terminating zero (end-of-string aka "eos") if no matching character
594 Although I normally balk at Gcc-specific optimizations, it probably
595 makes sense here: glibc has optimizations that detect strpbrk being
596 called with literal string as ACCEPT and inline the search. That
597 optimization is defeated if strpbrk is hidden within the call to
598 another function. (And no, making strpbrk_or_eos inline doesn't
599 help because the check for literal accept is in the
604 #define strpbrk_or_eos(s, accept) ({ \
605 char *SOE_p = strpbrk (s, accept); \
607 SOE_p = (char *)s + strlen (s); \
611 #else /* not __GNUC__ */
614 strpbrk_or_eos (const char *s, const char *accept)
616 char *p = strpbrk (s, accept);
618 p = (char *)s + strlen (s);
623 /* Turn STR into lowercase; return non-zero if a character was
627 lowercase_str (char *str)
634 *str = TOLOWER (*str);
639 static const char *parse_errors[] = {
640 #define PE_NO_ERROR 0
642 #define PE_UNSUPPORTED_SCHEME 1
643 N_("Unsupported scheme"),
644 #define PE_EMPTY_HOST 2
646 #define PE_BAD_PORT_NUMBER 3
647 N_("Bad port number"),
648 #define PE_INVALID_USER_NAME 4
649 N_("Invalid user name"),
650 #define PE_UNTERMINATED_IPV6_ADDRESS 5
651 N_("Unterminated IPv6 numeric address"),
652 #define PE_IPV6_NOT_SUPPORTED 6
653 N_("IPv6 addresses not supported"),
654 #define PE_INVALID_IPV6_ADDRESS 7
655 N_("Invalid IPv6 numeric address")
659 /* The following two functions were adapted from glibc. */
662 is_valid_ipv4_address (const char *str, const char *end)
672 if (ch >= '0' && ch <= '9')
674 val = val * 10 + (ch - '0');
685 else if (ch == '.' && saw_digit == 1)
702 is_valid_ipv6_address (const char *str, const char *end)
722 /* Leading :: requires some special handling. */
726 if (str == end || *str != ':')
738 /* if ch is a number, add it to val. */
742 val |= XDIGIT_TO_NUM (ch);
749 /* if ch is a colon ... */
762 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
770 /* if ch is a dot ... */
771 if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ)
772 && is_valid_ipv4_address (curtok, end) == 1)
784 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
791 if (tp == NS_IN6ADDRSZ)
796 if (tp != NS_IN6ADDRSZ)
805 Return a new struct url if successful, NULL on error. In case of
806 error, and if ERROR is not NULL, also set *ERROR to the appropriate
809 url_parse (const char *url, int *error)
813 int path_modified, host_modified;
815 enum url_scheme scheme;
817 const char *uname_b, *uname_e;
818 const char *host_b, *host_e;
819 const char *path_b, *path_e;
820 const char *params_b, *params_e;
821 const char *query_b, *query_e;
822 const char *fragment_b, *fragment_e;
825 char *user = NULL, *passwd = NULL;
827 char *url_encoded = NULL;
831 scheme = url_scheme (url);
832 if (scheme == SCHEME_INVALID)
834 error_code = PE_UNSUPPORTED_SCHEME;
838 url_encoded = reencode_escapes (url);
841 p += strlen (supported_schemes[scheme].leading_string);
843 p = url_skip_credentials (p);
846 /* scheme://user:pass@host[:port]... */
849 /* We attempt to break down the URL into the components path,
850 params, query, and fragment. They are ordered like this:
852 scheme://host[:port][/path][;params][?query][#fragment] */
854 params_b = params_e = NULL;
855 query_b = query_e = NULL;
856 fragment_b = fragment_e = NULL;
862 /* Handle IPv6 address inside square brackets. Ideally we'd
863 just look for the terminating ']', but rfc2732 mandates
864 rejecting invalid IPv6 addresses. */
866 /* The address begins after '['. */
868 host_e = strchr (host_b, ']');
872 error_code = PE_UNTERMINATED_IPV6_ADDRESS;
877 /* Check if the IPv6 address is valid. */
878 if (!is_valid_ipv6_address(host_b, host_e))
880 error_code = PE_INVALID_IPV6_ADDRESS;
884 /* Continue parsing after the closing ']'. */
887 error_code = PE_IPV6_NOT_SUPPORTED;
893 p = strpbrk_or_eos (p, ":/;?#");
897 if (host_b == host_e)
899 error_code = PE_EMPTY_HOST;
903 port = scheme_default_port (scheme);
906 const char *port_b, *port_e, *pp;
908 /* scheme://host:port/tralala */
912 p = strpbrk_or_eos (p, "/;?#");
915 /* Allow empty port, as per rfc2396. */
916 if (port_b != port_e)
918 for (port = 0, pp = port_b; pp < port_e; pp++)
922 /* http://host:12randomgarbage/blah */
924 error_code = PE_BAD_PORT_NUMBER;
927 port = 10 * port + (*pp - '0');
936 p = strpbrk_or_eos (p, ";?#");
941 /* Path is not allowed not to exist. */
949 p = strpbrk_or_eos (p, "?#");
956 p = strpbrk_or_eos (p, "#");
959 /* Hack that allows users to use '?' (a wildcard character) in
960 FTP URLs without it being interpreted as a query string
962 if (scheme == SCHEME_FTP)
964 query_b = query_e = NULL;
977 if (uname_b != uname_e)
979 /* http://user:pass@host */
981 /* uname_b uname_e */
982 if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
984 error_code = PE_INVALID_USER_NAME;
989 u = xnew0 (struct url);
991 u->host = strdupdelim (host_b, host_e);
996 u->path = strdupdelim (path_b, path_e);
997 path_modified = path_simplify (u->path);
998 split_path (u->path, &u->dir, &u->file);
1000 host_modified = lowercase_str (u->host);
1002 /* Decode %HH sequences in host name. This is important not so much
1003 to support %HH sequences, but to support binary characters (which
1004 will have been converted to %HH by reencode_escapes). */
1005 if (strchr (u->host, '%'))
1007 url_unescape (u->host);
1012 u->params = strdupdelim (params_b, params_e);
1014 u->query = strdupdelim (query_b, query_e);
1016 u->fragment = strdupdelim (fragment_b, fragment_e);
1018 if (path_modified || u->fragment || host_modified || path_b == path_e)
1020 /* If we suspect that a transformation has rendered what
1021 url_string might return different from URL_ENCODED, rebuild
1022 u->url using url_string. */
1023 u->url = url_string (u, 0);
1025 if (url_encoded != url)
1026 xfree ((char *) url_encoded);
1030 if (url_encoded == url)
1031 u->url = xstrdup (url);
1033 u->url = url_encoded;
1040 /* Cleanup in case of error: */
1041 if (url_encoded && url_encoded != url)
1042 xfree (url_encoded);
1044 /* Transmit the error code to the caller, if the caller wants to
1047 *error = error_code;
1051 /* Return the error message string from ERROR_CODE, which should have
1052 been retrieved from url_parse. The error message is translated. */
1055 url_error (int error_code)
1057 assert (error_code >= 0 && error_code < countof (parse_errors));
1058 return _(parse_errors[error_code]);
1061 /* Split PATH into DIR and FILE. PATH comes from the URL and is
1062 expected to be URL-escaped.
1064 The path is split into directory (the part up to the last slash)
1065 and file (the part after the last slash), which are subsequently
1066 unescaped. Examples:
1069 "foo/bar/baz" "foo/bar" "baz"
1070 "foo/bar/" "foo/bar" ""
1072 "foo/bar/baz%2fqux" "foo/bar" "baz/qux" (!)
1074 DIR and FILE are freshly allocated. */
1077 split_path (const char *path, char **dir, char **file)
1079 char *last_slash = strrchr (path, '/');
1082 *dir = xstrdup ("");
1083 *file = xstrdup (path);
1087 *dir = strdupdelim (path, last_slash);
1088 *file = xstrdup (last_slash + 1);
1090 url_unescape (*dir);
1091 url_unescape (*file);
1094 /* Note: URL's "full path" is the path with the query string and
1095 params appended. The "fragment" (#foo) is intentionally ignored,
1096 but that might be changed. For example, if the original URL was
1097 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1098 the full path will be "/foo/bar/baz;bullshit?querystring". */
1100 /* Return the length of the full path, without the terminating
1104 full_path_length (const struct url *url)
1108 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1119 /* Write out the full path. */
1122 full_path_write (const struct url *url, char *where)
1124 #define FROB(el, chr) do { \
1125 char *f_el = url->el; \
1127 int l = strlen (f_el); \
1129 memcpy (where, f_el, l); \
1141 /* Public function for getting the "full path". E.g. if u->path is
1142 "foo/bar" and u->query is "param=value", full_path will be
1143 "/foo/bar?param=value". */
1146 url_full_path (const struct url *url)
1148 int length = full_path_length (url);
1149 char *full_path = (char *)xmalloc(length + 1);
1151 full_path_write (url, full_path);
1152 full_path[length] = '\0';
1157 /* Escape unsafe and reserved characters, except for the slash
1161 url_escape_dir (const char *dir)
1163 char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1168 /* Unescape slashes in NEWDIR. */
1170 h = newdir; /* hare */
1171 t = newdir; /* tortoise */
1173 for (; *h; h++, t++)
1175 /* url_escape_1 having converted '/' to "%2F" exactly. */
1176 if (*h == '%' && h[1] == '2' && h[2] == 'F')
1189 /* Sync u->path and u->url with u->dir and u->file. Called after
1190 u->file or u->dir have been changed, typically by the FTP code. */
1193 sync_path (struct url *u)
1195 char *newpath, *efile, *edir;
1199 /* u->dir and u->file are not escaped. URL-escape them before
1200 reassembling them into u->path. That way, if they contain
1201 separators like '?' or even if u->file contains slashes, the
1202 path will be correctly assembled. (u->file can contain slashes
1203 if the URL specifies it with %2f, or if an FTP server returns
1205 edir = url_escape_dir (u->dir);
1206 efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1209 newpath = xstrdup (efile);
1212 int dirlen = strlen (edir);
1213 int filelen = strlen (efile);
1215 /* Copy "DIR/FILE" to newpath. */
1216 char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1217 memcpy (p, edir, dirlen);
1220 memcpy (p, efile, filelen);
1229 if (efile != u->file)
1232 /* Regenerate u->url as well. */
1234 u->url = url_string (u, 0);
1237 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1238 This way we can sync u->path and u->url when they get changed. */
1241 url_set_dir (struct url *url, const char *newdir)
1244 url->dir = xstrdup (newdir);
1249 url_set_file (struct url *url, const char *newfile)
1252 url->file = xstrdup (newfile);
1257 url_free (struct url *url)
1263 xfree_null (url->params);
1264 xfree_null (url->query);
1265 xfree_null (url->fragment);
1266 xfree_null (url->user);
1267 xfree_null (url->passwd);
1275 /* Create all the necessary directories for PATH (a file). Calls
1276 mkdirhier() internally. */
1278 mkalldirs (const char *path)
1285 p = path + strlen (path);
1286 for (; *p != '/' && p != path; p--)
1289 /* Don't create if it's just a file. */
1290 if ((p == path) && (*p != '/'))
1292 t = strdupdelim (path, p);
1294 /* Check whether the directory exists. */
1295 if ((stat (t, &st) == 0))
1297 if (S_ISDIR (st.st_mode))
1304 /* If the dir exists as a file name, remove it first. This
1305 is *only* for Wget to work with buggy old CERN http
1306 servers. Here is the scenario: When Wget tries to
1307 retrieve a directory without a slash, e.g.
1308 http://foo/bar (bar being a directory), CERN server will
1309 not redirect it to http://foo/bar/ -- it will generate a
1310 directory listing containing links to bar/file1,
1311 bar/file2, etc. Wget will lose because it saves this
1312 HTML listing to a file `bar', so it cannot create the
1313 directory. To work around this, if the file of the same
1314 name exists, we just remove it and create the directory
1316 DEBUGP (("Removing %s because of directory danger!\n", t));
1320 res = make_directory (t);
1322 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1327 /* Functions for constructing the file name out of URL components. */
1329 /* A growable string structure, used by url_file_name and friends.
1330 This should perhaps be moved to utils.c.
1332 The idea is to have a convenient and efficient way to construct a
1333 string by having various functions append data to it. Instead of
1334 passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1335 functions in question, we pass the pointer to this struct. */
1343 /* Ensure that the string can accept APPEND_SIZE more characters past
1344 the current TAIL position. If necessary, this will grow the string
1345 and update its allocated size. If the string is already large
1346 enough to take TAIL+APPEND_SIZE characters, this does nothing. */
1347 #define GROW(g, append_size) do { \
1348 struct growable *G_ = g; \
1349 DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \
1352 /* Return the tail position of the string. */
1353 #define TAIL(r) ((r)->base + (r)->tail)
1355 /* Move the tail position by APPEND_COUNT characters. */
1356 #define TAIL_INCR(r, append_count) ((r)->tail += append_count)
1358 /* Append the string STR to DEST. NOTICE: the string in DEST is not
1362 append_string (const char *str, struct growable *dest)
1364 int l = strlen (str);
1366 memcpy (TAIL (dest), str, l);
1367 TAIL_INCR (dest, l);
1370 /* Append CH to DEST. For example, append_char (0, DEST)
1371 zero-terminates DEST. */
1374 append_char (char ch, struct growable *dest)
1378 TAIL_INCR (dest, 1);
1382 filechr_not_unix = 1, /* unusable on Unix, / and \0 */
1383 filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */
1384 filechr_control = 4 /* a control character, e.g. 0-31 */
1387 #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
1389 /* Shorthands for the table: */
1390 #define U filechr_not_unix
1391 #define W filechr_not_windows
1392 #define C filechr_control
1397 /* Table of characters unsafe under various conditions (see above).
1399 Arguably we could also claim `%' to be unsafe, since we use it as
1400 the escape character. If we ever want to be able to reliably
1401 translate file name back to URL, this would become
1402 crucial. Right now, it's better to be minimal in escaping. */
1404 const static unsigned char filechr_table[256] =
1406 UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */
1407 C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */
1408 C, C, C, C, C, C, C, C, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
1409 C, C, C, C, C, C, C, C, /* CAN EM SUB ESC FS GS RS US */
1410 0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */
1411 0, 0, W, 0, 0, 0, 0, UW, /* ( ) * + , - . / */
1412 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
1413 0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? */
1414 0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
1415 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
1416 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
1417 0, 0, 0, 0, W, 0, 0, 0, /* X Y Z [ \ ] ^ _ */
1418 0, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
1419 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
1420 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
1421 0, 0, 0, 0, 0, 0, 0, 0, /* x y z { | } ~ DEL */
1423 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */
1424 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */
1425 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1426 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1428 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1429 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1430 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1431 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1439 /* FN_PORT_SEP is the separator between host and port in file names
1440 for non-standard port numbers. On Unix this is normally ':', as in
1441 "www.xemacs.org:4001/index.html". Under Windows, we set it to +
1442 because Windows can't handle ':' in file names. */
1443 #define FN_PORT_SEP (opt.restrict_files_os != restrict_windows ? ':' : '+')
1445 /* FN_QUERY_SEP is the separator between the file name and the URL
1446 query, normally '?'. Since Windows cannot handle '?' as part of
1447 file name, we use '@' instead there. */
1448 #define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
1450 /* Quote path element, characters in [b, e), as file name, and append
1451 the quoted string to DEST. Each character is quoted as per
1452 FILE_CHAR_TEST and the corresponding table.
1454 If ESCAPED_P is non-zero, the path element is considered to be
1455 URL-escaped and will be unescaped prior to inspection. */
1458 append_uri_pathel (const char *b, const char *e, int escaped_p,
1459 struct growable *dest)
1465 if (opt.restrict_files_os == restrict_unix)
1466 mask = filechr_not_unix;
1468 mask = filechr_not_windows;
1469 if (opt.restrict_files_ctrl)
1470 mask |= filechr_control;
1472 /* Copy [b, e) to PATHEL and URL-unescape it. */
1476 BOUNDED_TO_ALLOCA (b, e, unescaped);
1477 url_unescape (unescaped);
1479 e = unescaped + strlen (unescaped);
1482 /* Defang ".." when found as component of path. Remember that path
1483 comes from the URL and might contain malicious input. */
1484 if (e - b == 2 && b[0] == '.' && b[1] == '.')
1490 /* Walk the PATHEL string and check how many characters we'll need
1493 for (p = b; p < e; p++)
1494 if (FILE_CHAR_TEST (*p, mask))
1497 /* Calculate the length of the output string. e-b is the input
1498 string length. Each quoted char introduces two additional
1499 characters in the string, hence 2*quoted. */
1500 outlen = (e - b) + (2 * quoted);
1501 GROW (dest, outlen);
1505 /* If there's nothing to quote, we can simply append the string
1506 without processing it again. */
1507 memcpy (TAIL (dest), b, outlen);
1511 char *q = TAIL (dest);
1512 for (p = b; p < e; p++)
1514 if (!FILE_CHAR_TEST (*p, mask))
1518 unsigned char ch = *p;
1520 *q++ = XNUM_TO_DIGIT (ch >> 4);
1521 *q++ = XNUM_TO_DIGIT (ch & 0xf);
1524 assert (q - TAIL (dest) == outlen);
1526 TAIL_INCR (dest, outlen);
1529 /* Append to DEST the directory structure that corresponds the
1530 directory part of URL's path. For example, if the URL is
1531 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1533 Each path element ("dir1" and "dir2" in the above example) is
1534 examined, url-unescaped, and re-escaped as file name element.
1536 Additionally, it cuts as many directories from the path as
1537 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1538 will produce "bar" for the above example. For 2 or more, it will
1541 Each component of the path is quoted for use as file name. */
1544 append_dir_structure (const struct url *u, struct growable *dest)
1546 char *pathel, *next;
1547 int cut = opt.cut_dirs;
1549 /* Go through the path components, de-URL-quote them, and quote them
1550 (if necessary) as file names. */
1553 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1558 /* Ignore empty pathels. */
1562 append_char ('/', dest);
1563 append_uri_pathel (pathel, next, 1, dest);
1567 /* Return a unique file name that matches the given URL as good as
1568 possible. Does not create directories on the file system. */
1571 url_file_name (const struct url *u)
1573 struct growable fnres;
1575 const char *u_file, *u_query;
1576 char *fname, *unique;
1582 /* Start with the directory prefix, if specified. */
1584 append_string (opt.dir_prefix, &fnres);
1586 /* If "dirstruct" is turned on (typically the case with -r), add
1587 the host and port (unless those have been turned off) and
1588 directory structure. */
1591 if (opt.protocol_directories)
1594 append_char ('/', &fnres);
1595 append_string (supported_schemes[u->scheme].name, &fnres);
1597 if (opt.add_hostdir)
1600 append_char ('/', &fnres);
1601 append_string (u->host, &fnres);
1602 if (u->port != scheme_default_port (u->scheme))
1605 number_to_string (portstr, u->port);
1606 append_char (FN_PORT_SEP, &fnres);
1607 append_string (portstr, &fnres);
1611 append_dir_structure (u, &fnres);
1614 /* Add the file name. */
1616 append_char ('/', &fnres);
1617 u_file = *u->file ? u->file : "index.html";
1618 append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1620 /* Append "?query" to the file name. */
1621 u_query = u->query && *u->query ? u->query : NULL;
1624 append_char (FN_QUERY_SEP, &fnres);
1625 append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1628 /* Zero-terminate the file name. */
1629 append_char ('\0', &fnres);
1633 /* Check the cases in which the unique extensions are not used:
1634 1) Clobbering is turned off (-nc).
1635 2) Retrieval with regetting.
1636 3) Timestamping is used.
1637 4) Hierarchy is built.
1639 The exception is the case when file does exist and is a
1640 directory (see `mkalldirs' for explanation). */
1642 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1643 && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1646 unique = unique_name (fname, 1);
1647 if (unique != fname)
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH and return non-zero if PATH has been modified, zero otherwise.

   The algorithm is in spirit similar to the one described in rfc1808,
   although implemented differently, in one pass.  To recap, path
   elements containing only "." are removed, and ".." is taken to mean
   "back up one element".  A ".." that has nothing left to back up
   over is kept literally, so "foo/../.." yields "..".  Empty path
   elements (consecutive slashes) are left alone.

   This function does not handle URL escapes explicitly.  If you're
   passing paths from URLs, make sure to unquote "%2e" and "%2E" to
   ".", so that this function can find the dots.  (Wget's URL parser
   calls reencode_escapes, which see.)

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided in test_path_simplify.  If you change
   anything in this function, run test_path_simplify to make sure you
   haven't broken a test case.  */

static int
path_simplify (char *path)
{
  char *h = path;               /* hare: next character to read */
  char *t = path;               /* tortoise: next character to write */
  char *beg = path;             /* boundary for backing the tortoise */
  char *end = path + strlen (path);

  while (h < end)
    {
      /* Hare is at the beginning of a path element here.  */

      if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
        {
          /* Ignore "./".  */
          h += 2;
          continue;
        }

      if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
        {
          if (t > beg)
            {
              /* Handle "../" by retreating the tortoise by one path
                 element: move backwards until T hits the beginning
                 of the previous path element or the boundary.  */
              for (--t; t > beg && t[-1] != '/'; t--)
                ;
              h += 3;
              continue;
            }

          /* Nothing to back up over.  Keep the "../" literally (it
             is copied below like any other element) and move the
             boundary past it so a later ".." doesn't remove it.  */
          beg = t + 3;
        }

      /* A regular path element.  If H hasn't advanced past T, simply
         skip to the next path element.  Otherwise, copy the path
         element, including the final slash.  */
      if (t == h)
        {
          while (h < end && *h != '/')
            t++, h++;
          if (h < end)
            t++, h++;
        }
      else
        {
          while (h < end && *h != '/')
            *t++ = *h++;
          if (h < end)
            *t++ = *h++;
        }
    }

  /* T trails H exactly when something was dropped.  */
  if (t != h)
    *t = '\0';

  return t != h;
}
/* Return the length of URL's path.  Path is considered to be
   terminated by one of '?', ';', '#', or by the end of the
   string.  */

static int
path_length (const char *url)
{
  /* strcspn counts the characters before the first terminator, or
     the whole string if none is present.  */
  return (int) strcspn (url, "?;#");
}
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  B itself is examined; the character at
   E is not.  We might want to use memrchr (a GNU extension) under
   GNU libc.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Pre-decrement so that the scan covers e[-1] down to b[0].  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
/* Helper for uri_merge: return a newly allocated string made of the
   first SPAN characters of BASE followed by the LINKLENGTH characters
   of LINK and a terminating '\0'.  */

static char *
uri_merge_splice (const char *base, int span,
                  const char *link, int linklength)
{
  char *merge = (char *) xmalloc (span + linklength + 1);
  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';
  return merge;
}

/* Merge BASE with LINK and return the resulting URI in a newly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   I briefly considered making this function call path_simplify after
   the merging process, as rfc1738 seems to suggest.  This is a bad
   idea for several reasons: 1) it complexifies the code, and 2)
   url_parse has to simplify path anyway, so it's wasteful to boot.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;
  char *merge;

  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END.  */
  end = base + path_length (base);
  linklength = strlen (link);

  if (!*link)
    {
      /* Empty LINK points back to BASE, query string and all.  */
      return xstrdup (base);
    }
  else if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      int baselength = end - base;
      merge = uri_merge_splice (base, baselength, link, linklength);
    }
  else if (*link == '#')
    {
      /* LINK only changes the fragment; everything in BASE up to an
         existing '#' (or all of BASE) is kept.  Examples: */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      int baselength;
      const char *end1 = strchr (base, '#');
      if (!end1)
        end1 = base + strlen (base);
      baselength = end1 - base;
      merge = uri_merge_splice (base, baselength, link, linklength);
    }
  else if (*link == '/' && *(link + 1) == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK.  */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      int span;
      const char *start_insert;

      /* Look for first slash.  If it is a double slash, replace from
         that point, else default to replacing from the beginning.  */
      const char *slash = memchr (base, '/', end - base);
      if (slash && *(slash + 1) == '/')
        start_insert = slash;
      else
        start_insert = base;

      span = start_insert - base;
      merge = uri_merge_splice (base, span, link, linklength);
    }
  else if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      int span;
      const char *start_insert;
      int seen_slash_slash = 0;

      /* We're looking for the first slash, but want to ignore a
         leading double slash: if the first slash found starts "//",
         search once more behind it.  */
      const char *slash = memchr (base, '/', end - base);
      if (slash && *(slash + 1) == '/')
        {
          const char *pos = slash + 2;
          seen_slash_slash = 1;
          slash = memchr (pos, '/', end - pos);
        }

      /* At this point, SLASH is the location of the first / after
         "//", or the first slash altogether.  START_INSERT is the
         pointer to the location where LINK will be inserted.  Keep
         in mind that LINK begins with '/'.  */
      if (!seen_slash_slash)
        /* examples: "foo", "foo/bar" -- replace everything.  */
        start_insert = base;
      else if (slash)
        /* example: "http://something/" -- replace from that slash.  */
        start_insert = slash;
      else
        /* example: "http://foo" -- append after the host.  */
        start_insert = end;

      span = start_insert - base;
      merge = uri_merge_splice (base, span, link, linklength);
    }
  else
    {
      /* LINK is a relative URL: we need to replace everything
         after last slash (possibly empty) with LINK.

         So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
         our result should be "whatever/foo/qux/xyzzy".  */
      int need_explicit_slash = 0;
      int span;
      const char *start_insert;
      const char *last_slash = find_last_char (base, end, '/');
      if (!last_slash)
        {
          /* No slash found at all.  Replace what we have with LINK.  */
          start_insert = base;
        }
      else if (last_slash >= base + 2
               && last_slash[-2] == ':' && last_slash[-1] == '/')
        {
          /* example: "http://host" -- the only slashes are those of
             "://".  Keep all of BASE and reserve one extra position
             after it for the '/' written below.  */
          start_insert = end + 1;
          need_explicit_slash = 1;
        }
      else
        {
          /* example: "whatever/foo/bar" -- keep through the last
             slash.  */
          start_insert = last_slash + 1;
        }

      span = start_insert - base;
      merge = uri_merge_splice (base, span, link, linklength);
      if (need_explicit_slash)
        /* The reserved position currently holds the character that
           terminated BASE's path; turn it into the separator.  */
        merge[span - 1] = '/';
    }

  return merge;
}
/* Copy the string S (without its terminating '\0') to the location
   P points to, and advance P past the copied characters.  P must be
   a modifiable pointer lvalue; S is evaluated more than once.  */
#define APPEND(p, s) do {                       \
  int len = strlen (s);                         \
  memcpy ((p), (s), len);                       \
  (p) += len;                                   \
} while (0)
/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1967 /* Recreate the URL string from the data in URL.
1969 If HIDE is non-zero (as it is when we're calling this on a URL we
1970 plan to print, but not when calling it to canonicalize a URL for
1971 use within the program), password will be hidden. Unsafe
1972 characters in the URL will be quoted. */
1975 url_string (const struct url *url, int hide_password)
1979 char *quoted_user = NULL, *quoted_passwd = NULL;
1981 int scheme_port = supported_schemes[url->scheme].default_port;
1982 const char *scheme_str = supported_schemes[url->scheme].leading_string;
1983 int fplen = full_path_length (url);
1985 int brackets_around_host;
1987 assert (scheme_str != NULL);
1989 /* Make sure the user name and password are quoted. */
1992 quoted_user = url_escape_allow_passthrough (url->user);
1996 quoted_passwd = HIDDEN_PASSWORD;
1998 quoted_passwd = url_escape_allow_passthrough (url->passwd);
2002 /* Numeric IPv6 addresses can contain ':' and need to be quoted with
2004 brackets_around_host = strchr (url->host, ':') != NULL;
2006 size = (strlen (scheme_str)
2007 + strlen (url->host)
2008 + (brackets_around_host ? 2 : 0)
2011 if (url->port != scheme_port)
2012 size += 1 + numdigit (url->port);
2015 size += 1 + strlen (quoted_user);
2017 size += 1 + strlen (quoted_passwd);
2020 p = result = xmalloc (size);
2022 APPEND (p, scheme_str);
2025 APPEND (p, quoted_user);
2029 APPEND (p, quoted_passwd);
2034 if (brackets_around_host)
2036 APPEND (p, url->host);
2037 if (brackets_around_host)
2039 if (url->port != scheme_port)
2042 p = number_to_string (p, url->port);
2045 full_path_write (url, p);
2049 assert (p - result == size);
2051 if (quoted_user && quoted_user != url->user)
2052 xfree (quoted_user);
2053 if (quoted_passwd && !hide_password
2054 && quoted_passwd != url->passwd)
2055 xfree (quoted_passwd);
2060 /* Return non-zero if scheme a is similar to scheme b.
2062 Schemes are similar if they are equal. If SSL is supported, schemes
2063 are also similar if one is http (SCHEME_HTTP) and the other is https
2066 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2071 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2072 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
/* Debugging and testing support for path_simplify.  */

/* Debug: run path_simplify on a copy of PATH and return the result
   in a new string, leaving PATH itself untouched.  Useful for
   calling from the debugger.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (copy);
  return copy;
}
/* Run path_simplify on a copy of TEST and report, on standard output,
   if the result differs from EXPECTED_RESULT or if the return value
   differs from EXPECTED_CHANGE.  Prints nothing when both match.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
              test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      if (expected_change == 1)
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
    }
  xfree (test_copy);
}
2115 test_path_simplify (void)
2118 char *test, *result;
2125 { "../", "../", 0 },
2126 { "foo", "foo", 0 },
2127 { "foo/bar", "foo/bar", 0 },
2128 { "foo///bar", "foo///bar", 0 },
2129 { "foo/.", "foo/", 1 },
2130 { "foo/./", "foo/", 1 },
2131 { "foo./", "foo./", 0 },
2132 { "foo/../bar", "bar", 1 },
2133 { "foo/../bar/", "bar/", 1 },
2134 { "foo/bar/..", "foo/", 1 },
2135 { "foo/bar/../x", "foo/x", 1 },
2136 { "foo/bar/../x/", "foo/x/", 1 },
2137 { "foo/..", "", 1 },
2138 { "foo/../..", "..", 1 },
2139 { "foo/../../..", "../..", 1 },
2140 { "foo/../../bar/../../baz", "../../baz", 1 },
2141 { "a/b/../../c", "c", 1 },
2142 { "./a/../b", "b", 1 }
2146 for (i = 0; i < countof (tests); i++)
2148 char *test = tests[i].test;
2149 char *expected_result = tests[i].result;
2150 int expected_change = tests[i].should_modify;
2151 run_test (test, expected_result, expected_change);