2 Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003
3 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or (at
10 your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 In addition, as a special exception, the Free Software Foundation
22 gives permission to link the code of its release of Wget with the
23 OpenSSL project's "OpenSSL" library (or with modified versions of it
24 that use the same license as the "OpenSSL" library), and distribute
25 the linked executables. You must obey the GNU General Public License
26 in all respects for all of the code used other than "OpenSSL". If you
27 modify this file, you may extend this exception to your version of the
28 file, but you are not obligated to do so. If you do not wish to do
29 so, delete this exception statement from your version. */
40 #include <sys/types.h>
58 const char *leading_string;
63 /* Supported schemes: */
64 static struct scheme_data supported_schemes[] =
66 { "http", "http://", DEFAULT_HTTP_PORT, 1 },
68 { "https", "https://", DEFAULT_HTTPS_PORT, 1 },
70 { "ftp", "ftp://", DEFAULT_FTP_PORT, 1 },
76 /* Forward declarations: */
78 static int path_simplify PARAMS ((char *));
80 /* Support for escaping and unescaping of URL strings. */
82 /* Table of "reserved" and "unsafe" characters. Those terms are
83 rfc1738-speak, as such largely obsoleted by rfc2396 and later
84 specs, but the general idea remains.
86 A reserved character is the one that you can't decode without
87 changing the meaning of the URL. For example, you can't decode
88 "/foo/%2f/bar" into "/foo///bar" because the number and contents of
89 path components is different. Non-reserved characters can be
90 changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". Wget
91 uses the rfc1738 set of reserved characters, plus "$" and ",", as
92 recommended by rfc2396.
94 An unsafe character is one that should be encoded when URLs
95 are placed in foreign environments. E.g. space and newline are
96 unsafe in HTTP contexts because HTTP uses them as separator and
97 terminator, so they must be encoded to %20 and %0A respectively.
98 "*" is unsafe in shell context, etc.
100 We determine whether a character is unsafe through static table
101 lookup. This code assumes ASCII character set and 8-bit chars. */
104 /* rfc1738 reserved chars + "$" and ",". */
107 /* rfc1738 unsafe chars, plus non-printables. */
/* Nonzero iff the urlchr_table entry for C (indexed as an unsigned
   char) has any of the bits in MASK set. */
111 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
/* Nonzero iff C is flagged urlchr_reserved in urlchr_table. */
112 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
/* Nonzero iff C is flagged urlchr_unsafe in urlchr_table. */
113 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
115 /* Shorthands for the table: */
116 #define R urlchr_reserved
117 #define U urlchr_unsafe
120 const static unsigned char urlchr_table[256] =
122 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
123 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
124 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
125 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
126 U, 0, U, RU, R, U, R, 0, /* SP ! " # $ % & ' */
127 0, 0, 0, R, R, 0, 0, R, /* ( ) * + , - . / */
128 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
129 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
130 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
131 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
132 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
133 0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
134 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
135 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
136 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
137 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
139 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
140 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
141 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
142 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
144 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
145 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
146 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
147 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
153 /* URL-unescape the string S.
155 This is done by transforming the sequences "%HH" to the character
156 represented by the hexadecimal digits HH. If % is not followed by
157 two hexadecimal digits, it is inserted literally.
159 The transformation is done in place. If you need the original
160 string intact, make a copy before calling this function. */
163 url_unescape (char *s)
165 char *t = s; /* t - tortoise */
166 char *h = s; /* h - hare */
177 /* Do nothing if '%' is not followed by two hex digits. */
178 if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
180 *t = X2DIGITS_TO_NUM (h[1], h[2]);
187 /* The core of url_escape_* functions. Escapes the characters that
188 match the provided mask in urlchr_table.
190 If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
191 will be returned unchanged. If ALLOW_PASSTHROUGH is zero, a
192 freshly allocated string will be returned in all cases. */
195 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
202 for (p1 = s; *p1; p1++)
203 if (urlchr_test (*p1, mask))
204 addition += 2; /* Two more characters (hex digits) */
207 return allow_passthrough ? (char *)s : xstrdup (s);
209 newlen = (p1 - s) + addition;
210 newstr = (char *)xmalloc (newlen + 1);
216 /* Quote the characters that match the test mask. */
217 if (urlchr_test (*p1, mask))
219 unsigned char c = *p1++;
221 *p2++ = XNUM_TO_DIGIT (c >> 4);
222 *p2++ = XNUM_TO_DIGIT (c & 0xf);
227 assert (p2 - newstr == newlen);
233 /* URL-escape the unsafe characters (see urlchr_table) in a given
234 string, returning a freshly allocated string. */
237 url_escape (const char *s)
239 return url_escape_1 (s, urlchr_unsafe, 0);
242 /* URL-escape the unsafe characters (see urlchr_table) in a given
243 string. If no characters are unsafe, S is returned. */
246 url_escape_allow_passthrough (const char *s)
248 return url_escape_1 (s, urlchr_unsafe, 1);
/* The action chosen for the character at the current position while
   re-encoding a URL: decode a %XX escape, %-encode the character, or
   copy it through unchanged. */
251 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
253 /* Decide whether to encode, decode, or pass through the char at P.
254 This used to be a macro, but it got a little too convoluted. */
255 static inline enum copy_method
256 decide_copy_method (const char *p)
260 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
262 /* %xx sequence: decode it, unless it would decode to an
263 unsafe or a reserved char; in that case, leave it as
265 char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
266 if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
267 return CM_PASSTHROUGH;
272 /* Garbled %.. sequence: encode `%'. */
275 else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
278 return CM_PASSTHROUGH;
281 /* Translate a %-escaped (but possibly non-conformant) input string S
282 into a %-escaped (and conformant) output string. If no characters
283 are encoded or decoded, return the same string S; otherwise, return
284 a freshly allocated string with the new contents.
286 After a URL has been run through this function, the protocols that
287 use `%' as the quote character can use the resulting string as-is,
288 while those that don't call url_unescape() to get to the intended
289 data. This function is also stable: after an input string is
290 transformed the first time, all further transformations of the
291 result yield the same result string.
293 Let's discuss why this function is needed.
295 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
296 space character would mess up the HTTP request, it needs to be
299 GET /abc%20def HTTP/1.0
301 It appears that the unsafe chars need to be quoted, for example
302 with url_escape. But what if we're requested to download
303 `abc%20def'? url_escape transforms "%" to "%25", which would leave
304 us with `abc%2520def'. This is incorrect -- since %-escapes are
305 part of URL syntax, "%20" is the correct way to denote a literal
306 space on the Wget command line. This leads us to the conclusion
307 that in that case Wget should not call url_escape, but leave the
310 And what if the requested URI is `abc%20 def'? If we call
311 url_escape, we end up with `/abc%2520%20def', which is almost
312 certainly not intended. If we don't call url_escape, we are left
313 with the embedded space and cannot complete the request. What the
314 user meant was for Wget to request `/abc%20%20def', and this is
315 where reencode_escapes kicks in.
317 Wget used to solve this by first decoding %-quotes, and then
318 encoding all the "unsafe" characters found in the resulting string.
319 This was wrong because it didn't preserve certain URL special
320 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
321 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
322 whether we considered `+' reserved (it is). One of these results
323 is inevitable because by the second step we would lose information
324 on whether the `+' was originally encoded or not. Both results
325 were wrong because in CGI parameters + means space, while %2B means
326 literal plus. reencode_escapes correctly translates the above to
327 "a%2B+b", i.e. returns the original string.
329 This function uses an algorithm proposed by Anon Sricharoenchai:
331 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
334 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
337 ...except that this code conflates the two steps, and decides
338 whether to encode, decode, or pass through each character in turn.
339 The function still uses two passes, but their logic is the same --
340 the first pass exists merely for the sake of allocation. Another
341 small difference is that we include `+' to URL_RESERVED.
345 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
347 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
351 "foo bar" -> "foo%20bar"
352 "foo%20bar" -> "foo%20bar"
353 "foo %20bar" -> "foo%20%20bar"
354 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
355 "foo%25%20bar" -> "foo%25%20bar"
356 "foo%2%20bar" -> "foo%252%20bar"
357 "foo+bar" -> "foo+bar" (plus is reserved!)
358 "foo%2b+bar" -> "foo%2b+bar" */
361 reencode_escapes (const char *s)
367 int encode_count = 0;
368 int decode_count = 0;
370 /* First, pass through the string to see if there's anything to do,
371 and to calculate the new length. */
372 for (p1 = s; *p1; p1++)
374 switch (decide_copy_method (p1))
387 if (!encode_count && !decode_count)
388 /* The string is good as it is. */
389 return (char *)s; /* C const model sucks. */
392 /* Each encoding adds two characters (hex digits), while each
393 decoding removes two characters. */
394 newlen = oldlen + 2 * (encode_count - decode_count);
395 newstr = xmalloc (newlen + 1);
402 switch (decide_copy_method (p1))
406 unsigned char c = *p1++;
408 *p2++ = XNUM_TO_DIGIT (c >> 4);
409 *p2++ = XNUM_TO_DIGIT (c & 0xf);
413 *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
414 p1 += 3; /* skip %xx */
421 assert (p2 - newstr == newlen);
425 /* Returns the scheme type if the scheme is supported, or
426 SCHEME_INVALID if not. */
429 url_scheme (const char *url)
433 for (i = 0; supported_schemes[i].leading_string; i++)
434 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
435 strlen (supported_schemes[i].leading_string)))
437 if (supported_schemes[i].enabled)
438 return (enum url_scheme) i;
440 return SCHEME_INVALID;
443 return SCHEME_INVALID;
/* True if CH is accepted as a scheme-name character: an alphanumeric
   (per ISALNUM), '-' or '+'. Note that CH is evaluated more than
   once, so it must not have side effects. */
446 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
448 /* Return 1 if the URL begins with any "scheme", 0 otherwise. As
449 currently implemented, it returns true if URL begins with
453 url_has_scheme (const char *url)
457 /* The first char must be a scheme char. */
458 if (!*p || !SCHEME_CHAR (*p))
461 /* Followed by 0 or more scheme chars. */
462 while (*p && SCHEME_CHAR (*p))
464 /* Terminated by ':'. */
469 scheme_default_port (enum url_scheme scheme)
471 return supported_schemes[scheme].default_port;
475 scheme_disable (enum url_scheme scheme)
477 supported_schemes[scheme].enabled = 0;
480 /* Skip the username and password, if present here. The function
481 should *not* be called with the complete URL, but with the part
482 right after the scheme.
484 If no username and password are found, return 0. */
487 url_skip_credentials (const char *url)
489 /* Look for '@' that comes before terminators, such as '/', '?',
491 const char *p = (const char *)strpbrk (url, "@/?#;");
497 /* Parse credentials contained in [BEG, END). The region is expected
498 to have come from a URL and is unescaped. */
501 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
507 return 0; /* empty user name */
509 colon = memchr (beg, ':', end - beg);
511 return 0; /* again empty user name */
515 *passwd = strdupdelim (colon + 1, end);
517 url_unescape (*passwd);
524 *user = strdupdelim (beg, userend);
525 url_unescape (*user);
529 /* Used by main.c: detect URLs written using the "shorthand" URL forms
530 popularized by Netscape and NcFTP. HTTP shorthands look like this:
532 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
533 www.foo.com[:port] -> http://www.foo.com[:port]
535 FTP shorthands look like this:
537 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
538 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
540 If the URL need not or cannot be rewritten, return NULL. */
543 rewrite_shorthand_url (const char *url)
547 if (url_has_scheme (url))
550 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
552 for (p = url; *p && *p != ':' && *p != '/'; p++)
562 /* If the characters after the colon and before the next slash
563 or end of string are all digits, it's HTTP. */
565 for (pp = p + 1; ISDIGIT (*pp); pp++)
567 if (digits > 0 && (*pp == '/' || *pp == '\0'))
570 /* Prepend "ftp://" to the entire URL... */
571 res = xmalloc (6 + strlen (url) + 1);
572 sprintf (res, "ftp://%s", url);
573 /* ...and replace ':' with '/'. */
574 res[6 + (p - url)] = '/';
581 /* Just prepend "http://" to what we have. */
582 res = xmalloc (7 + strlen (url) + 1);
583 sprintf (res, "http://%s", url);
588 static void split_path PARAMS ((const char *, char **, char **));
590 /* Like strpbrk, with the exception that it returns the pointer to the
591 terminating zero (end-of-string aka "eos") if no matching character
594 Although I normally balk at Gcc-specific optimizations, it probably
595 makes sense here: glibc has optimizations that detect strpbrk being
596 called with literal string as ACCEPT and inline the search. That
597 optimization is defeated if strpbrk is hidden within the call to
598 another function. (And no, making strpbrk_or_eos inline doesn't
599 help because the check for literal accept is in the
604 #define strpbrk_or_eos(s, accept) ({ \
605 char *SOE_p = strpbrk (s, accept); \
607 SOE_p = (char *)s + strlen (s); \
611 #else /* not __GNUC__ */
614 strpbrk_or_eos (const char *s, const char *accept)
616 char *p = strpbrk (s, accept);
618 p = (char *)s + strlen (s);
623 /* Turn STR into lowercase; return non-zero if a character was
627 lowercase_str (char *str)
634 *str = TOLOWER (*str);
639 static const char *parse_errors[] = {
640 #define PE_NO_ERROR 0
642 #define PE_UNSUPPORTED_SCHEME 1
643 N_("Unsupported scheme"),
644 #define PE_EMPTY_HOST 2
646 #define PE_BAD_PORT_NUMBER 3
647 N_("Bad port number"),
648 #define PE_INVALID_USER_NAME 4
649 N_("Invalid user name"),
650 #define PE_UNTERMINATED_IPV6_ADDRESS 5
651 N_("Unterminated IPv6 numeric address"),
652 #define PE_IPV6_NOT_SUPPORTED 6
653 N_("IPv6 addresses not supported"),
654 #define PE_INVALID_IPV6_ADDRESS 7
655 N_("Invalid IPv6 numeric address")
659 /* The following two functions were adapted from glibc. */
662 is_valid_ipv4_address (const char *str, const char *end)
672 if (ch >= '0' && ch <= '9')
674 val = val * 10 + (ch - '0');
685 else if (ch == '.' && saw_digit == 1)
702 is_valid_ipv6_address (const char *str, const char *end)
722 /* Leading :: requires some special handling. */
726 if (str == end || *str != ':')
738 /* if ch is a number, add it to val. */
742 val |= XDIGIT_TO_NUM (ch);
749 /* if ch is a colon ... */
762 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
770 /* if ch is a dot ... */
771 if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ)
772 && is_valid_ipv4_address (curtok, end) == 1)
784 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
791 if (tp == NS_IN6ADDRSZ)
796 if (tp != NS_IN6ADDRSZ)
805 Return a new struct url if successful, NULL on error. In case of
806 error, and if ERROR is not NULL, also set *ERROR to the appropriate
809 url_parse (const char *url, int *error)
813 int path_modified, host_modified;
815 enum url_scheme scheme;
817 const char *uname_b, *uname_e;
818 const char *host_b, *host_e;
819 const char *path_b, *path_e;
820 const char *params_b, *params_e;
821 const char *query_b, *query_e;
822 const char *fragment_b, *fragment_e;
825 char *user = NULL, *passwd = NULL;
827 char *url_encoded = NULL;
831 scheme = url_scheme (url);
832 if (scheme == SCHEME_INVALID)
834 error_code = PE_UNSUPPORTED_SCHEME;
838 url_encoded = reencode_escapes (url);
841 p += strlen (supported_schemes[scheme].leading_string);
843 p += url_skip_credentials (p);
846 /* scheme://user:pass@host[:port]... */
849 /* We attempt to break down the URL into the components path,
850 params, query, and fragment. They are ordered like this:
852 scheme://host[:port][/path][;params][?query][#fragment] */
854 params_b = params_e = NULL;
855 query_b = query_e = NULL;
856 fragment_b = fragment_e = NULL;
862 /* Handle IPv6 address inside square brackets. Ideally we'd
863 just look for the terminating ']', but rfc2732 mandates
864 rejecting invalid IPv6 addresses. */
866 /* The address begins after '['. */
868 host_e = strchr (host_b, ']');
872 error_code = PE_UNTERMINATED_IPV6_ADDRESS;
877 /* Check if the IPv6 address is valid. */
878 if (!is_valid_ipv6_address(host_b, host_e))
880 error_code = PE_INVALID_IPV6_ADDRESS;
884 /* Continue parsing after the closing ']'. */
887 error_code = PE_IPV6_NOT_SUPPORTED;
893 p = strpbrk_or_eos (p, ":/;?#");
897 if (host_b == host_e)
899 error_code = PE_EMPTY_HOST;
903 port = scheme_default_port (scheme);
906 const char *port_b, *port_e, *pp;
908 /* scheme://host:port/tralala */
912 p = strpbrk_or_eos (p, "/;?#");
915 /* Allow empty port, as per rfc2396. */
916 if (port_b != port_e)
918 for (port = 0, pp = port_b; pp < port_e; pp++)
922 /* http://host:12randomgarbage/blah */
924 error_code = PE_BAD_PORT_NUMBER;
927 port = 10 * port + (*pp - '0');
936 p = strpbrk_or_eos (p, ";?#");
941 /* Path is not allowed not to exist. */
949 p = strpbrk_or_eos (p, "?#");
956 p = strpbrk_or_eos (p, "#");
959 /* Hack that allows users to use '?' (a wildcard character) in
960 FTP URLs without it being interpreted as a query string
962 if (scheme == SCHEME_FTP)
964 query_b = query_e = NULL;
977 if (uname_b != uname_e)
979 /* http://user:pass@host */
981 /* uname_b uname_e */
982 if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
984 error_code = PE_INVALID_USER_NAME;
989 u = xnew0 (struct url);
991 u->host = strdupdelim (host_b, host_e);
996 u->path = strdupdelim (path_b, path_e);
997 path_modified = path_simplify (u->path);
998 split_path (u->path, &u->dir, &u->file);
1000 host_modified = lowercase_str (u->host);
1003 u->params = strdupdelim (params_b, params_e);
1005 u->query = strdupdelim (query_b, query_e);
1007 u->fragment = strdupdelim (fragment_b, fragment_e);
1009 if (path_modified || u->fragment || host_modified || path_b == path_e)
1011 /* If we suspect that a transformation has rendered what
1012 url_string might return different from URL_ENCODED, rebuild
1013 u->url using url_string. */
1014 u->url = url_string (u, 0);
1016 if (url_encoded != url)
1017 xfree ((char *) url_encoded);
1021 if (url_encoded == url)
1022 u->url = xstrdup (url);
1024 u->url = url_encoded;
1031 /* Cleanup in case of error: */
1032 if (url_encoded && url_encoded != url)
1033 xfree (url_encoded);
1035 /* Transmit the error code to the caller, if the caller wants to
1038 *error = error_code;
1042 /* Return the error message string from ERROR_CODE, which should have
1043 been retrieved from url_parse. The error message is translated. */
1046 url_error (int error_code)
1048 assert (error_code >= 0 && error_code < countof (parse_errors));
1049 return _(parse_errors[error_code]);
1052 /* Split PATH into DIR and FILE. PATH comes from the URL and is
1053 expected to be URL-escaped.
1055 The path is split into directory (the part up to the last slash)
1056 and file (the part after the last slash), which are subsequently
1057 unescaped. Examples:
1060 "foo/bar/baz" "foo/bar" "baz"
1061 "foo/bar/" "foo/bar" ""
1063 "foo/bar/baz%2fqux" "foo/bar" "baz/qux" (!)
1065 DIR and FILE are freshly allocated. */
1068 split_path (const char *path, char **dir, char **file)
1070 char *last_slash = strrchr (path, '/');
1073 *dir = xstrdup ("");
1074 *file = xstrdup (path);
1078 *dir = strdupdelim (path, last_slash);
1079 *file = xstrdup (last_slash + 1);
1081 url_unescape (*dir);
1082 url_unescape (*file);
1085 /* Note: URL's "full path" is the path with the query string and
1086 params appended. The "fragment" (#foo) is intentionally ignored,
1087 but that might be changed. For example, if the original URL was
1088 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1089 the full path will be "/foo/bar/baz;bullshit?querystring". */
1091 /* Return the length of the full path, without the terminating
1095 full_path_length (const struct url *url)
1099 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1110 /* Write out the full path. */
1113 full_path_write (const struct url *url, char *where)
1115 #define FROB(el, chr) do { \
1116 char *f_el = url->el; \
1118 int l = strlen (f_el); \
1120 memcpy (where, f_el, l); \
1132 /* Public function for getting the "full path". E.g. if u->path is
1133 "foo/bar" and u->query is "param=value", full_path will be
1134 "/foo/bar?param=value". */
1137 url_full_path (const struct url *url)
1139 int length = full_path_length (url);
1140 char *full_path = (char *)xmalloc(length + 1);
1142 full_path_write (url, full_path);
1143 full_path[length] = '\0';
1148 /* Escape unsafe and reserved characters, except for the slash
1152 url_escape_dir (const char *dir)
1154 char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1159 /* Unescape slashes in NEWDIR. */
1161 h = newdir; /* hare */
1162 t = newdir; /* tortoise */
1164 for (; *h; h++, t++)
1166 /* url_escape_1 having converted '/' to "%2F" exactly. */
1167 if (*h == '%' && h[1] == '2' && h[2] == 'F')
1180 /* Sync u->path and u->url with u->dir and u->file. Called after
1181 u->file or u->dir have been changed, typically by the FTP code. */
1184 sync_path (struct url *u)
1186 char *newpath, *efile, *edir;
1190 /* u->dir and u->file are not escaped. URL-escape them before
1191 reassembling them into u->path. That way, if they contain
1192 separators like '?' or even if u->file contains slashes, the
1193 path will be correctly assembled. (u->file can contain slashes
1194 if the URL specifies it with %2f, or if an FTP server returns
1196 edir = url_escape_dir (u->dir);
1197 efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1200 newpath = xstrdup (efile);
1203 int dirlen = strlen (edir);
1204 int filelen = strlen (efile);
1206 /* Copy "DIR/FILE" to newpath. */
1207 char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1208 memcpy (p, edir, dirlen);
1211 memcpy (p, efile, filelen);
1220 if (efile != u->file)
1223 /* Regenerate u->url as well. */
1225 u->url = url_string (u, 0);
1228 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1229 This way we can sync u->path and u->url when they get changed. */
1232 url_set_dir (struct url *url, const char *newdir)
1235 url->dir = xstrdup (newdir);
1240 url_set_file (struct url *url, const char *newfile)
1243 url->file = xstrdup (newfile);
1248 url_free (struct url *url)
1254 xfree_null (url->params);
1255 xfree_null (url->query);
1256 xfree_null (url->fragment);
1257 xfree_null (url->user);
1258 xfree_null (url->passwd);
1266 /* Create all the necessary directories for PATH (a file). Calls
1267 mkdirhier() internally. */
1269 mkalldirs (const char *path)
1276 p = path + strlen (path);
1277 for (; *p != '/' && p != path; p--)
1280 /* Don't create if it's just a file. */
1281 if ((p == path) && (*p != '/'))
1283 t = strdupdelim (path, p);
1285 /* Check whether the directory exists. */
1286 if ((stat (t, &st) == 0))
1288 if (S_ISDIR (st.st_mode))
1295 /* If the dir exists as a file name, remove it first. This
1296 is *only* for Wget to work with buggy old CERN http
1297 servers. Here is the scenario: When Wget tries to
1298 retrieve a directory without a slash, e.g.
1299 http://foo/bar (bar being a directory), CERN server will
1300 not redirect it to http://foo/bar/ -- it will generate a
1301 directory listing containing links to bar/file1,
1302 bar/file2, etc. Wget will lose because it saves this
1303 HTML listing to a file `bar', so it cannot create the
1304 directory. To work around this, if the file of the same
1305 name exists, we just remove it and create the directory
1307 DEBUGP (("Removing %s because of directory danger!\n", t));
1311 res = make_directory (t);
1313 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1318 /* Functions for constructing the file name out of URL components. */
1320 /* A growable string structure, used by url_file_name and friends.
1321 This should perhaps be moved to utils.c.
1323 The idea is to have a convenient and efficient way to construct a
1324 string by having various functions append data to it. Instead of
1325 passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1326 functions in question, we pass the pointer to this struct. */
1334 /* Ensure that the string can accept APPEND_SIZE more characters past
1335 the current TAIL position. If necessary, this will grow the string
1336 and update its allocated size. If the string is already large
1337 enough to take TAIL+APPEND_SIZE characters, this does nothing. */
1338 #define GROW(g, append_size) do { \
1339 struct growable *G_ = g; \
1340 DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \
1343 /* Return the tail position of the string. */
1344 #define TAIL(r) ((r)->base + (r)->tail)
1346 /* Move the tail position by APPEND_COUNT characters. */
1347 #define TAIL_INCR(r, append_count) ((r)->tail += append_count)
1349 /* Append the string STR to DEST. NOTICE: the string in DEST is not
1353 append_string (const char *str, struct growable *dest)
1355 int l = strlen (str);
1357 memcpy (TAIL (dest), str, l);
1358 TAIL_INCR (dest, l);
1361 /* Append CH to DEST. For example, append_char (0, DEST)
1362 zero-terminates DEST. */
1365 append_char (char ch, struct growable *dest)
1369 TAIL_INCR (dest, 1);
1373 filechr_not_unix = 1, /* unusable on Unix, / and \0 */
1374 filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */
1375 filechr_control = 4 /* a control character, e.g. 0-31 */
/* Nonzero iff the filechr_table entry for C (indexed as an unsigned
   char) has any of the bits in MASK set. */
1378 #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
1380 /* Shorthands for the table: */
1381 #define U filechr_not_unix
1382 #define W filechr_not_windows
1383 #define C filechr_control
1388 /* Table of characters unsafe under various conditions (see above).
1390 Arguably we could also claim `%' to be unsafe, since we use it as
1391 the escape character. If we ever want to be able to reliably
1392 translate file name back to URL, this would become
1393 crucial. Right now, it's better to be minimal in escaping. */
1395 const static unsigned char filechr_table[256] =
1397 UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */
1398 C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */
1399 C, C, C, C, C, C, C, C, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
1400 C, C, C, C, C, C, C, C, /* CAN EM SUB ESC FS GS RS US */
1401 0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */
1402 0, 0, W, 0, 0, 0, 0, UW, /* ( ) * + , - . / */
1403 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
1404 0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? */
1405 0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
1406 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
1407 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
1408 0, 0, 0, 0, W, 0, 0, 0, /* X Y Z [ \ ] ^ _ */
1409 0, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
1410 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
1411 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
1412 0, 0, 0, 0, 0, 0, 0, 0, /* x y z { | } ~ DEL */
1414 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */
1415 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */
1416 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1417 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1419 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1420 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1421 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1422 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1430 /* FN_PORT_SEP is the separator between host and port in file names
1431 for non-standard port numbers. On Unix this is normally ':', as in
1432 "www.xemacs.org:4001/index.html". Under Windows, we set it to +
1433 because Windows can't handle ':' in file names. */
1434 #define FN_PORT_SEP (opt.restrict_files_os != restrict_windows ? ':' : '+')
1436 /* FN_QUERY_SEP is the separator between the file name and the URL
1437 query, normally '?'. Since Windows cannot handle '?' as part of
1438 file name, we use '@' instead there. */
1439 #define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
1441 /* Quote path element, characters in [b, e), as file name, and append
1442 the quoted string to DEST. Each character is quoted as per
1443 FILE_CHAR_TEST and the corresponding table (filechr_table).
1445 If ESCAPED_P is non-zero, the path element is considered to be
1446 URL-escaped and will be unescaped prior to inspection. */
1449 append_uri_pathel (const char *b, const char *e, int escaped_p,
1450 struct growable *dest)
1456 if (opt.restrict_files_os == restrict_unix)
1457 mask = filechr_not_unix;
1459 mask = filechr_not_windows;
1460 if (opt.restrict_files_ctrl)
1461 mask |= filechr_control;
1463 /* Copy [b, e) to PATHEL and URL-unescape it. */
1467 BOUNDED_TO_ALLOCA (b, e, unescaped);
1468 url_unescape (unescaped);
1470 e = unescaped + strlen (unescaped);
1473 /* Defang ".." when found as component of path. Remember that path
1474 comes from the URL and might contain malicious input. */
1475 if (e - b == 2 && b[0] == '.' && b[1] == '.')
1481 /* Walk the PATHEL string and check how many characters we'll need
1484 for (p = b; p < e; p++)
1485 if (FILE_CHAR_TEST (*p, mask))
1488 /* Calculate the length of the output string. e-b is the input
1489 string length. Each quoted char introduces two additional
1490 characters in the string, hence 2*quoted. */
1491 outlen = (e - b) + (2 * quoted);
1492 GROW (dest, outlen);
1496 /* If there's nothing to quote, we can simply append the string
1497 without processing it again. */
1498 memcpy (TAIL (dest), b, outlen);
1502 char *q = TAIL (dest);
1503 for (p = b; p < e; p++)
1505 if (!FILE_CHAR_TEST (*p, mask))
1509 unsigned char ch = *p;
1511 *q++ = XNUM_TO_DIGIT (ch >> 4);
1512 *q++ = XNUM_TO_DIGIT (ch & 0xf);
1515 assert (q - TAIL (dest) == outlen);
1517 TAIL_INCR (dest, outlen);
1520 /* Append to DEST the directory structure that corresponds to the
1521 directory part of URL's path. For example, if the URL is
1522 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1524 Each path element ("dir1" and "dir2" in the above example) is
1525 examined, url-unescaped, and re-escaped as file name element.
1527 Additionally, it cuts as many directories from the path as
1528 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1529 will produce "bar" for the above example. For 2 or more, it will
1532 Each component of the path is quoted for use as file name. */
1535 append_dir_structure (const struct url *u, struct growable *dest)
1537 char *pathel, *next;
1538 int cut = opt.cut_dirs;
1540 /* Go through the path components, de-URL-quote them, and quote them
1541 (if necessary) as file names. */
1544 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1549 /* Ignore empty pathels. */
1553 append_char ('/', dest);
1554 append_uri_pathel (pathel, next, 1, dest);
1558 /* Return a unique file name that matches the given URL as good as
1559 possible. Does not create directories on the file system. */
1562 url_file_name (const struct url *u)
1564 struct growable fnres;
1566 const char *u_file, *u_query;
1567 char *fname, *unique;
1573 /* Start with the directory prefix, if specified. */
1575 append_string (opt.dir_prefix, &fnres);
1577 /* If "dirstruct" is turned on (typically the case with -r), add
1578 the host and port (unless those have been turned off) and
1579 directory structure. */
1582 if (opt.protocol_directories)
1585 append_char ('/', &fnres);
1586 append_string (supported_schemes[u->scheme].name, &fnres);
1588 if (opt.add_hostdir)
1591 append_char ('/', &fnres);
1592 append_string (u->host, &fnres);
1593 if (u->port != scheme_default_port (u->scheme))
1596 number_to_string (portstr, u->port);
1597 append_char (FN_PORT_SEP, &fnres);
1598 append_string (portstr, &fnres);
1602 append_dir_structure (u, &fnres);
1605 /* Add the file name. */
1607 append_char ('/', &fnres);
1608 u_file = *u->file ? u->file : "index.html";
1609 append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1611 /* Append "?query" to the file name. */
1612 u_query = u->query && *u->query ? u->query : NULL;
1615 append_char (FN_QUERY_SEP, &fnres);
1616 append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1619 /* Zero-terminate the file name. */
1620 append_char ('\0', &fnres);
1624 /* Check the cases in which the unique extensions are not used:
1625 1) Clobbering is turned off (-nc).
1626 2) Retrieval with regetting.
1627 3) Timestamping is used.
1628 4) Hierarchy is built.
1630 The exception is the case when file does exist and is a
1631 directory (see `mkalldirs' for explanation). */
1633 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1634 && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1637 unique = unique_name (fname, 1);
1638 if (unique != fname)
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH and return non-zero if PATH has been modified, zero otherwise.

   The algorithm is in spirit similar to the one described in rfc1808,
   although implemented differently, in one pass.  To recap, path
   elements containing only "." are removed, and ".." is taken to mean
   "back up one element".  Single leading and trailing slashes are
   preserved.  A ".." that has nothing left to back over is kept
   literally.

   This function does not handle URL escapes explicitly.  If you're
   passing paths from URLs, make sure to unquote "%2e" and "%2E" to
   ".", so that this function can find the dots.  (Wget's URL parser
   calls reencode_escapes, which see.)

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below.  If you change anything in this
   function, run test_path_simplify to make sure you haven't broken a
   test case.  */

static int
path_simplify (char *path)
{
  char *h = path;               /* hare: next character to read */
  char *t = path;               /* tortoise: next position to write */
  char *beg = path;             /* boundary for backing the tortoise */
  char *end = path + strlen (path);

  while (h < end)
    {
      /* Hare should be at the beginning of a path element. */

      if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
        {
          /* Ignore "./". */
          h += 2;
          continue;
        }

      if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
        {
          /* Handle "../" by retreating the tortoise by one path
             element -- but not past the beginning.  */
          if (t > beg)
            {
              /* Move backwards until T hits the beginning of the
                 previous path element or the beginning of path. */
              for (--t; t > beg && t[-1] != '/'; t--)
                ;
              h += 3;
              continue;
            }

          /* If we're at the beginning, copy the "../" literally and
             move the beginning so a later ".." doesn't remove it.
             The copying is done by the regular-element code below.  */
          beg = t + 3;
        }

      /* A regular path element.  If H hasn't advanced past T, simply
         skip to the next path element.  Otherwise, copy the path
         element until the next slash.  */
      if (t == h)
        {
          /* Skip the path element, including the slash.  */
          while (h < end && *h != '/')
            t++, h++;
          if (h < end)
            t++, h++;
        }
      else
        {
          /* Copy the path element, including the final slash.  */
          while (h < end && *h != '/')
            *t++ = *h++;
          if (h < end)
            *t++ = *h++;
        }
    }

  /* T trails H exactly when something was dropped; terminate the
     shortened string in that case.  */
  if (t != h)
    *t = '\0';

  return t != h;
}
/* Return the length of URL's path.  Path is considered to be
   terminated by one of '?', ';', '#', or by the end of the
   string.  */

static int
path_length (const char *url)
{
  /* Q points at the first terminator character, or at the
     terminating null byte when there is none.  */
  const char *q = strpbrk_or_eos (url, "?;#");
  return q - url;
}
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  The range is half-open: B[0] is
   examined, *E is not.  We might want to use memrchr (a GNU
   extension) under GNU libc.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Pre-decrement so that the first character looked at is e[-1] and
     the last one is b[0]; E itself is never dereferenced.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
/* Return a newly allocated, zero-terminated string consisting of the
   first SPAN bytes of BASE followed by the LINKLENGTH bytes of LINK.
   Helper for uri_merge.  */

static char *
uri_merge_concat (const char *base, int span,
                  const char *link, int linklength)
{
  char *merge = (char *)xmalloc (span + linklength + 1);
  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';
  return merge;
}

/* Merge BASE with LINK and return the resulting URI.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   The result is always a newly allocated string.

   I briefly considered making this function call path_simplify after
   the merging process, as rfc1738 seems to suggest.  This is a bad
   idea for several reasons: 1) it complexifies the code, and 2)
   url_parse has to simplify path anyway, so it's wasteful to boot.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;
  char *merge;

  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = base + path_length (base);
  linklength = strlen (link);

  if (!*link)
    {
      /* Empty LINK points back to BASE, query string and all. */
      return xstrdup (base);
    }
  else if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      merge = uri_merge_concat (base, end - base, link, linklength);
    }
  else if (*link == '#')
    {
      /* LINK changes only the fragment; everything up to the first
         '#' of BASE (query string included) is kept.  */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      const char *end1 = strchr (base, '#');
      if (!end1)
        end1 = base + strlen (base);
      merge = uri_merge_concat (base, end1 - base, link, linklength);
    }
  else if (*link == '/' && *(link + 1) == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      const char *start_insert;

      /* Look for first slash. */
      const char *slash = memchr (base, '/', end - base);
      /* If found slash and it is a double slash, then replace
         from this point, else default to replacing from the
         beginning.  */
      if (slash && *(slash + 1) == '/')
        start_insert = slash;
      else
        start_insert = base;

      merge = uri_merge_concat (base, start_insert - base, link, linklength);
    }
  else if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *start_insert;
      int seen_slash_slash = 0;

      /* We're looking for the first slash, but want to ignore
         double slash.  If the first slash found starts a "//",
         search once more right after it.  */
      const char *slash = memchr (base, '/', end - base);
      if (slash && *(slash + 1) == '/')
        {
          const char *pos = slash + 2;
          seen_slash_slash = 1;
          slash = memchr (pos, '/', end - pos);
        }

      /* At this point, SLASH is the location of the first / after
         "//", or the first slash altogether.  START_INSERT is the
         pointer to the location where LINK will be inserted.  Keep
         in mind that LINK begins with '/'.  */

      if (!seen_slash_slash)
        /* examples: "foo", "foo/bar" -- no host part, replace
           everything.  */
        start_insert = base;
      else if (!slash)
        /* example: "http://foo" -- append after the host.  */
        start_insert = end;
      else
        /* example: "http://something/" -- replace from the slash
           that follows the host.  */
        start_insert = slash;

      merge = uri_merge_concat (base, start_insert - base, link, linklength);
    }
  else
    {
      /* LINK is a relative URL: we need to replace everything
         after last slash (possibly empty) with LINK.

         So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
         our result should be "whatever/foo/qux/xyzzy".  */
      int need_explicit_slash = 0;
      int span;
      const char *start_insert;
      const char *last_slash = find_last_char (base, end, '/');
      if (!last_slash)
        {
          /* No slash found at all.  Replace what we have with LINK. */
          start_insert = base;
        }
      else if (last_slash >= base + 2
               && last_slash[-2] == ':' && last_slash[-1] == '/')
        {
          /* example: "http://host" -- the last slash belongs to
             "://", so the whole of BASE is kept and a separating
             slash has to be supplied.  SPAN is made one byte longer
             than the kept part; that extra byte is overwritten with
             '/' below.  */
          start_insert = end + 1;
          need_explicit_slash = 1;
        }
      else
        {
          /* example: "whatever/foo/bar" */
          /*                        ^    */
          start_insert = last_slash + 1;
        }

      span = start_insert - base;
      merge = uri_merge_concat (base, span, link, linklength);
      if (need_explicit_slash)
        merge[span - 1] = '/';
    }

  return merge;
}
/* Copy the zero-terminated string S to the location P points at and
   advance P past the copied bytes.  No terminating null byte is
   written.  P must be a modifiable pointer lvalue; S is evaluated
   exactly once.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)
/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic, fixed string so that
   the output does not give away the number of characters in the
   password, as previous versions did.  */
1956 #define HIDDEN_PASSWORD "*password*"
1958 /* Recreate the URL string from the data in URL.
1960 If HIDE is non-zero (as it is when we're calling this on a URL we
1961 plan to print, but not when calling it to canonicalize a URL for
1962 use within the program), password will be hidden. Unsafe
1963 characters in the URL will be quoted. */
1966 url_string (const struct url *url, int hide_password)
1970 char *quoted_user = NULL, *quoted_passwd = NULL;
1972 int scheme_port = supported_schemes[url->scheme].default_port;
1973 const char *scheme_str = supported_schemes[url->scheme].leading_string;
1974 int fplen = full_path_length (url);
1976 int brackets_around_host;
1978 assert (scheme_str != NULL);
1980 /* Make sure the user name and password are quoted. */
1983 quoted_user = url_escape_allow_passthrough (url->user);
1987 quoted_passwd = HIDDEN_PASSWORD;
1989 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1993 /* Numeric IPv6 addresses can contain ':' and need to be quoted with
1995 brackets_around_host = strchr (url->host, ':') != NULL;
1997 size = (strlen (scheme_str)
1998 + strlen (url->host)
1999 + (brackets_around_host ? 2 : 0)
2002 if (url->port != scheme_port)
2003 size += 1 + numdigit (url->port);
2006 size += 1 + strlen (quoted_user);
2008 size += 1 + strlen (quoted_passwd);
2011 p = result = xmalloc (size);
2013 APPEND (p, scheme_str);
2016 APPEND (p, quoted_user);
2020 APPEND (p, quoted_passwd);
2025 if (brackets_around_host)
2027 APPEND (p, url->host);
2028 if (brackets_around_host)
2030 if (url->port != scheme_port)
2033 p = number_to_string (p, url->port);
2036 full_path_write (url, p);
2040 assert (p - result == size);
2042 if (quoted_user && quoted_user != url->user)
2043 xfree (quoted_user);
2044 if (quoted_passwd && !hide_password
2045 && quoted_passwd != url->passwd)
2046 xfree (quoted_passwd);
2051 /* Return non-zero if scheme a is similar to scheme b.
2053 Schemes are similar if they are equal. If SSL is supported, schemes
2054 are also similar if one is http (SCHEME_HTTP) and the other is https
2057 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2062 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2063 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2070 /* Debugging and testing support for path_simplify. */
2072 /* Debug: run path_simplify on PATH and return the result in a new
2073 string. Useful for calling from the debugger. */
2077 char *copy = xstrdup (path);
2078 path_simplify (copy);
/* Run path_simplify on a copy of TEST and print a diagnostic to
   standard output if the simplified string differs from
   EXPECTED_RESULT, or if the "modified" return value differs from
   EXPECTED_CHANGE.  Prints nothing when both match.  */

static void
run_test (char *test, char *expected_result, int expected_change)
{
  /* path_simplify works in place, so operate on a private copy.  */
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
              test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      if (expected_change == 1)
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
    }
  xfree (test_copy);
}
/* Feed a table of sample paths to run_test.  Each row holds the
   input, the expected simplified path, and whether path_simplify is
   expected to report a modification.  */

static void
test_path_simplify (void)
{
  static struct {
    char *test, *result;
    int should_modify;
  } tests[] = {
    { "",                       "",             0 },
    { ".",                      "",             1 },
    { "./",                     "",             1 },
    { "..",                     "..",           0 },
    { "../",                    "../",          0 },
    { "foo",                    "foo",          0 },
    { "foo/bar",                "foo/bar",      0 },
    { "foo///bar",              "foo///bar",    0 },
    { "foo/.",                  "foo/",         1 },
    { "foo/./",                 "foo/",         1 },
    { "foo./",                  "foo./",        0 },
    { "foo/../bar",             "bar",          1 },
    { "foo/../bar/",            "bar/",         1 },
    { "foo/bar/..",             "foo/",         1 },
    { "foo/bar/../x",           "foo/x",        1 },
    { "foo/bar/../x/",          "foo/x/",       1 },
    { "foo/..",                 "",             1 },
    { "foo/../..",              "..",           1 },
    { "foo/../../..",           "../..",        1 },
    { "foo/../../bar/../../baz", "../../baz",   1 },
    { "a/b/../../c",            "c",            1 },
    { "./a/../b",               "b",            1 }
  };
  int i;

  for (i = 0; i < countof (tests); i++)
    {
      char *test = tests[i].test;
      char *expected_result = tests[i].result;
      int expected_change = tests[i].should_modify;
      run_test (test, expected_result, expected_change);
    }
}