2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
39 #include <sys/types.h>
49 #include "host.h" /* for is_valid_ipv6_address */
58 const char *leading_string;
63 /* Supported schemes: */
64 static struct scheme_data supported_schemes[] =
66 { "http", "http://", DEFAULT_HTTP_PORT, 1 },
68 { "https", "https://", DEFAULT_HTTPS_PORT, 1 },
70 { "ftp", "ftp://", DEFAULT_FTP_PORT, 1 },
76 /* Forward declarations: */
78 static int path_simplify PARAMS ((char *));
80 /* Support for escaping and unescaping of URL strings. */
82 /* Table of "reserved" and "unsafe" characters. Those terms are
83 rfc1738-speak, as such largely obsoleted by rfc2396 and later
84 specs, but the general idea remains.
86 A reserved character is the one that you can't decode without
87 changing the meaning of the URL. For example, you can't decode
88 "/foo/%2f/bar" into "/foo///bar" because the number and contents of
89 path components is different. Non-reserved characters can be
90 changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". Wget
91 uses the rfc1738 set of reserved characters, plus "$" and ",", as
92 recommended by rfc2396.
94 An unsafe character is one that should be encoded when URLs
95 are placed in foreign environments. E.g. space and newline are
96 unsafe in HTTP contexts because HTTP uses them as separator and
97 terminator, so they must be encoded to %20 and %0A respectively.
98 "*" is unsafe in shell context, etc.
100 We determine whether a character is unsafe through static table
101 lookup. This code assumes ASCII character set and 8-bit chars. */
104 /* rfc1738 reserved chars + "$" and ",". */
107 /* rfc1738 unsafe chars, plus non-printables. */
111 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
112 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
113 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
115 /* Shorthands for the table: */
116 #define R urlchr_reserved
117 #define U urlchr_unsafe
120 const static unsigned char urlchr_table[256] =
122 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
123 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
124 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
125 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
126 U, 0, U, RU, R, U, R, 0, /* SP ! " # $ % & ' */
127 0, 0, 0, R, R, 0, 0, R, /* ( ) * + , - . / */
128 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
129 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
130 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
131 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
132 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
133 0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
134 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
135 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
136 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
137 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
139 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
140 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
141 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
142 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
144 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
145 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
146 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
147 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
153 /* URL-unescape the string S.
155 This is done by transforming the sequences "%HH" to the character
156 represented by the hexadecimal digits HH. If % is not followed by
157 two hexadecimal digits, it is inserted literally.
159 The transformation is done in place. If you need the original
160 string intact, make a copy before calling this function. */
163 url_unescape (char *s)
165 char *t = s; /* t - tortoise */
166 char *h = s; /* h - hare */
177 /* Do nothing if '%' is not followed by two hex digits. */
178 if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
180 *t = X2DIGITS_TO_NUM (h[1], h[2]);
187 /* The core of url_escape_* functions. Escapes the characters that
188 match the provided mask in urlchr_table.
190 If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
191 will be returned unchanged. If ALLOW_PASSTHROUGH is zero, a
192 freshly allocated string will be returned in all cases. */
195 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
202 for (p1 = s; *p1; p1++)
203 if (urlchr_test (*p1, mask))
204 addition += 2; /* Two more characters (hex digits) */
207 return allow_passthrough ? (char *)s : xstrdup (s);
209 newlen = (p1 - s) + addition;
210 newstr = (char *)xmalloc (newlen + 1);
216 /* Quote the characters that match the test mask. */
217 if (urlchr_test (*p1, mask))
219 unsigned char c = *p1++;
221 *p2++ = XNUM_TO_DIGIT (c >> 4);
222 *p2++ = XNUM_TO_DIGIT (c & 0xf);
227 assert (p2 - newstr == newlen);
233 /* URL-escape the unsafe characters (see urlchr_table) in a given
234 string, returning a freshly allocated string. */
237 url_escape (const char *s)
239 return url_escape_1 (s, urlchr_unsafe, 0);
242 /* URL-escape the unsafe characters (see urlchr_table) in a given
243 string. If no characters are unsafe, S is returned. */
246 url_escape_allow_passthrough (const char *s)
248 return url_escape_1 (s, urlchr_unsafe, 1);
251 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
253 /* Decide whether to encode, decode, or pass through the char at P.
254 This used to be a macro, but it got a little too convoluted. */
255 static inline enum copy_method
256 decide_copy_method (const char *p)
260 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
262 /* %xx sequence: decode it, unless it would decode to an
263 unsafe or a reserved char; in that case, leave it as
265 char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
266 if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
267 return CM_PASSTHROUGH;
272 /* Garbled %.. sequence: encode `%'. */
275 else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
278 return CM_PASSTHROUGH;
281 /* Translate a %-escaped (but possibly non-conformant) input string S
282 into a %-escaped (and conformant) output string. If no characters
283 are encoded or decoded, return the same string S; otherwise, return
284 a freshly allocated string with the new contents.
286 After a URL has been run through this function, the protocols that
287 use `%' as the quote character can use the resulting string as-is,
288 while those that don't call url_unescape() to get to the intended
289 data. This function is also stable: after an input string is
290 transformed the first time, all further transformations of the
291 result yield the same result string.
293 Let's discuss why this function is needed.
295 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
296 space character would mess up the HTTP request, it needs to be
299 GET /abc%20def HTTP/1.0
301 It appears that the unsafe chars need to be quoted, for example
302 with url_escape. But what if we're requested to download
303 `abc%20def'? url_escape transforms "%" to "%25", which would leave
304 us with `abc%2520def'. This is incorrect -- since %-escapes are
305 part of URL syntax, "%20" is the correct way to denote a literal
306 space on the Wget command line. This leads us to the conclusion
307 that in that case Wget should not call url_escape, but leave the
310 And what if the requested URI is `abc%20 def'? If we call
311 url_escape, we end up with `/abc%2520%20def', which is almost
312 certainly not intended. If we don't call url_escape, we are left
313 with the embedded space and cannot complete the request. What the
314 user meant was for Wget to request `/abc%20%20def', and this is
315 where reencode_escapes kicks in.
317 Wget used to solve this by first decoding %-quotes, and then
318 encoding all the "unsafe" characters found in the resulting string.
319 This was wrong because it didn't preserve certain URL special
320 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
321 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
322 whether we considered `+' reserved (it is). One of these results
323 is inevitable because by the second step we would lose information
324 on whether the `+' was originally encoded or not. Both results
325 were wrong because in CGI parameters + means space, while %2B means
326 literal plus. reencode_escapes correctly translates the above to
327 "a%2B+b", i.e. returns the original string.
329 This function uses an algorithm proposed by Anon Sricharoenchai:
331 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
334 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
337 ...except that this code conflates the two steps, and decides
338 whether to encode, decode, or pass through each character in turn.
339 The function still uses two passes, but their logic is the same --
340 the first pass exists merely for the sake of allocation. Another
341 small difference is that we include `+' to URL_RESERVED.
345 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
347 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
351 "foo bar" -> "foo%20bar"
352 "foo%20bar" -> "foo%20bar"
353 "foo %20bar" -> "foo%20%20bar"
354 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
355 "foo%25%20bar" -> "foo%25%20bar"
356 "foo%2%20bar" -> "foo%252%20bar"
357 "foo+bar" -> "foo+bar" (plus is reserved!)
358 "foo%2b+bar" -> "foo%2b+bar" */
361 reencode_escapes (const char *s)
367 int encode_count = 0;
368 int decode_count = 0;
370 /* First, pass through the string to see if there's anything to do,
371 and to calculate the new length. */
372 for (p1 = s; *p1; p1++)
374 switch (decide_copy_method (p1))
387 if (!encode_count && !decode_count)
388 /* The string is good as it is. */
389 return (char *)s; /* C const model sucks. */
392 /* Each encoding adds two characters (hex digits), while each
393 decoding removes two characters. */
394 newlen = oldlen + 2 * (encode_count - decode_count);
395 newstr = xmalloc (newlen + 1);
402 switch (decide_copy_method (p1))
406 unsigned char c = *p1++;
408 *p2++ = XNUM_TO_DIGIT (c >> 4);
409 *p2++ = XNUM_TO_DIGIT (c & 0xf);
413 *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
414 p1 += 3; /* skip %xx */
421 assert (p2 - newstr == newlen);
425 /* Returns the scheme type if the scheme is supported, or
426 SCHEME_INVALID if not. */
429 url_scheme (const char *url)
433 for (i = 0; supported_schemes[i].leading_string; i++)
434 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
435 strlen (supported_schemes[i].leading_string)))
437 if (supported_schemes[i].enabled)
438 return (enum url_scheme) i;
440 return SCHEME_INVALID;
443 return SCHEME_INVALID;
446 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
448 /* Return 1 if the URL begins with any "scheme", 0 otherwise. As
449 currently implemented, it returns true if URL begins with
453 url_has_scheme (const char *url)
457 /* The first char must be a scheme char. */
458 if (!*p || !SCHEME_CHAR (*p))
461 /* Followed by 0 or more scheme chars. */
462 while (*p && SCHEME_CHAR (*p))
464 /* Terminated by ':'. */
469 scheme_default_port (enum url_scheme scheme)
471 return supported_schemes[scheme].default_port;
475 scheme_disable (enum url_scheme scheme)
477 supported_schemes[scheme].enabled = 0;
480 /* Skip the username and password, if present in the URL. The
481 function should *not* be called with the complete URL, but with the
482 portion after the scheme.
484 If no username and password are found, return URL. */
487 url_skip_credentials (const char *url)
489 /* Look for '@' that comes before terminators, such as '/', '?',
491 const char *p = (const char *)strpbrk (url, "@/?#;");
497 /* Parse credentials contained in [BEG, END). The region is expected
498 to have come from a URL and is unescaped. */
501 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
507 return 0; /* empty user name */
509 colon = memchr (beg, ':', end - beg);
511 return 0; /* again empty user name */
515 *passwd = strdupdelim (colon + 1, end);
517 url_unescape (*passwd);
524 *user = strdupdelim (beg, userend);
525 url_unescape (*user);
529 /* Used by main.c: detect URLs written using the "shorthand" URL forms
530 popularized by Netscape and NcFTP. HTTP shorthands look like this:
532 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
533 www.foo.com[:port] -> http://www.foo.com[:port]
535 FTP shorthands look like this:
537 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
538 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
540 If the URL need not or cannot be rewritten, return NULL. */
543 rewrite_shorthand_url (const char *url)
547 if (url_has_scheme (url))
550 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
552 for (p = url; *p && *p != ':' && *p != '/'; p++)
562 /* If the characters after the colon and before the next slash
563 or end of string are all digits, it's HTTP. */
565 for (pp = p + 1; ISDIGIT (*pp); pp++)
567 if (digits > 0 && (*pp == '/' || *pp == '\0'))
570 /* Prepend "ftp://" to the entire URL... */
571 res = xmalloc (6 + strlen (url) + 1);
572 sprintf (res, "ftp://%s", url);
573 /* ...and replace ':' with '/'. */
574 res[6 + (p - url)] = '/';
581 /* Just prepend "http://" to what we have. */
582 res = xmalloc (7 + strlen (url) + 1);
583 sprintf (res, "http://%s", url);
588 static void split_path PARAMS ((const char *, char **, char **));
590 /* Like strpbrk, with the exception that it returns the pointer to the
591 terminating zero (end-of-string aka "eos") if no matching character
594 Although I normally balk at Gcc-specific optimizations, it probably
595 makes sense here: glibc has optimizations that detect strpbrk being
596 called with literal string as ACCEPT and inline the search. That
597 optimization is defeated if strpbrk is hidden within the call to
598 another function. (And no, making strpbrk_or_eos inline doesn't
599 help because the check for literal accept is in the
604 #define strpbrk_or_eos(s, accept) ({ \
605 char *SOE_p = strpbrk (s, accept); \
607 SOE_p = (char *)s + strlen (s); \
611 #else /* not __GNUC__ */
614 strpbrk_or_eos (const char *s, const char *accept)
616 char *p = strpbrk (s, accept);
618 p = (char *)s + strlen (s);
623 /* Turn STR into lowercase; return non-zero if a character was
627 lowercase_str (char *str)
634 *str = TOLOWER (*str);
639 static const char *parse_errors[] = {
640 #define PE_NO_ERROR 0
642 #define PE_UNSUPPORTED_SCHEME 1
643 N_("Unsupported scheme"),
644 #define PE_EMPTY_HOST 2
646 #define PE_BAD_PORT_NUMBER 3
647 N_("Bad port number"),
648 #define PE_INVALID_USER_NAME 4
649 N_("Invalid user name"),
650 #define PE_UNTERMINATED_IPV6_ADDRESS 5
651 N_("Unterminated IPv6 numeric address"),
652 #define PE_IPV6_NOT_SUPPORTED 6
653 N_("IPv6 addresses not supported"),
654 #define PE_INVALID_IPV6_ADDRESS 7
655 N_("Invalid IPv6 numeric address")
660 Return a new struct url if successful, NULL on error. In case of
661 error, and if ERROR is not NULL, also set *ERROR to the appropriate
664 url_parse (const char *url, int *error)
668 int path_modified, host_modified;
670 enum url_scheme scheme;
672 const char *uname_b, *uname_e;
673 const char *host_b, *host_e;
674 const char *path_b, *path_e;
675 const char *params_b, *params_e;
676 const char *query_b, *query_e;
677 const char *fragment_b, *fragment_e;
680 char *user = NULL, *passwd = NULL;
682 char *url_encoded = NULL;
686 scheme = url_scheme (url);
687 if (scheme == SCHEME_INVALID)
689 error_code = PE_UNSUPPORTED_SCHEME;
693 url_encoded = reencode_escapes (url);
696 p += strlen (supported_schemes[scheme].leading_string);
698 p = url_skip_credentials (p);
701 /* scheme://user:pass@host[:port]... */
704 /* We attempt to break down the URL into the components path,
705 params, query, and fragment. They are ordered like this:
707 scheme://host[:port][/path][;params][?query][#fragment] */
709 params_b = params_e = NULL;
710 query_b = query_e = NULL;
711 fragment_b = fragment_e = NULL;
717 /* Handle IPv6 address inside square brackets. Ideally we'd
718 just look for the terminating ']', but rfc2732 mandates
719 rejecting invalid IPv6 addresses. */
721 /* The address begins after '['. */
723 host_e = strchr (host_b, ']');
727 error_code = PE_UNTERMINATED_IPV6_ADDRESS;
732 /* Check if the IPv6 address is valid. */
733 if (!is_valid_ipv6_address(host_b, host_e))
735 error_code = PE_INVALID_IPV6_ADDRESS;
739 /* Continue parsing after the closing ']'. */
742 error_code = PE_IPV6_NOT_SUPPORTED;
748 p = strpbrk_or_eos (p, ":/;?#");
752 if (host_b == host_e)
754 error_code = PE_EMPTY_HOST;
758 port = scheme_default_port (scheme);
761 const char *port_b, *port_e, *pp;
763 /* scheme://host:port/tralala */
767 p = strpbrk_or_eos (p, "/;?#");
770 /* Allow empty port, as per rfc2396. */
771 if (port_b != port_e)
773 for (port = 0, pp = port_b; pp < port_e; pp++)
777 /* http://host:12randomgarbage/blah */
779 error_code = PE_BAD_PORT_NUMBER;
782 port = 10 * port + (*pp - '0');
783 /* Check for too large port numbers here, before we have
784 a chance to overflow on bogus port values. */
787 error_code = PE_BAD_PORT_NUMBER;
798 p = strpbrk_or_eos (p, ";?#");
803 /* Path is not allowed not to exist. */
811 p = strpbrk_or_eos (p, "?#");
818 p = strpbrk_or_eos (p, "#");
821 /* Hack that allows users to use '?' (a wildcard character) in
822 FTP URLs without it being interpreted as a query string
824 if (scheme == SCHEME_FTP)
826 query_b = query_e = NULL;
839 if (uname_b != uname_e)
841 /* http://user:pass@host */
843 /* uname_b uname_e */
844 if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
846 error_code = PE_INVALID_USER_NAME;
851 u = xnew0 (struct url);
853 u->host = strdupdelim (host_b, host_e);
858 u->path = strdupdelim (path_b, path_e);
859 path_modified = path_simplify (u->path);
860 split_path (u->path, &u->dir, &u->file);
862 host_modified = lowercase_str (u->host);
864 /* Decode %HH sequences in host name. This is important not so much
865 to support %HH sequences, but to support binary characters (which
866 will have been converted to %HH by reencode_escapes). */
867 if (strchr (u->host, '%'))
869 url_unescape (u->host);
874 u->params = strdupdelim (params_b, params_e);
876 u->query = strdupdelim (query_b, query_e);
878 u->fragment = strdupdelim (fragment_b, fragment_e);
880 if (path_modified || u->fragment || host_modified || path_b == path_e)
882 /* If we suspect that a transformation has rendered what
883 url_string might return different from URL_ENCODED, rebuild
884 u->url using url_string. */
885 u->url = url_string (u, 0);
887 if (url_encoded != url)
888 xfree ((char *) url_encoded);
892 if (url_encoded == url)
893 u->url = xstrdup (url);
895 u->url = url_encoded;
902 /* Cleanup in case of error: */
903 if (url_encoded && url_encoded != url)
906 /* Transmit the error code to the caller, if the caller wants to
913 /* Return the error message string from ERROR_CODE, which should have
914 been retrieved from url_parse. The error message is translated. */
917 url_error (int error_code)
919 assert (error_code >= 0 && error_code < countof (parse_errors));
920 return _(parse_errors[error_code]);
923 /* Split PATH into DIR and FILE. PATH comes from the URL and is
924 expected to be URL-escaped.
926 The path is split into directory (the part up to the last slash)
927 and file (the part after the last slash), which are subsequently
931 "foo/bar/baz" "foo/bar" "baz"
932 "foo/bar/" "foo/bar" ""
934 "foo/bar/baz%2fqux" "foo/bar" "baz/qux" (!)
936 DIR and FILE are freshly allocated. */
939 split_path (const char *path, char **dir, char **file)
941 char *last_slash = strrchr (path, '/');
945 *file = xstrdup (path);
949 *dir = strdupdelim (path, last_slash);
950 *file = xstrdup (last_slash + 1);
953 url_unescape (*file);
956 /* Note: URL's "full path" is the path with the query string and
957 params appended. The "fragment" (#foo) is intentionally ignored,
958 but that might be changed. For example, if the original URL was
959 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
960 the full path will be "/foo/bar/baz;bullshit?querystring". */
962 /* Return the length of the full path, without the terminating
966 full_path_length (const struct url *url)
970 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
981 /* Write out the full path. */
984 full_path_write (const struct url *url, char *where)
986 #define FROB(el, chr) do { \
987 char *f_el = url->el; \
989 int l = strlen (f_el); \
991 memcpy (where, f_el, l); \
1003 /* Public function for getting the "full path". E.g. if u->path is
1004 "foo/bar" and u->query is "param=value", full_path will be
1005 "/foo/bar?param=value". */
1008 url_full_path (const struct url *url)
1010 int length = full_path_length (url);
1011 char *full_path = (char *) xmalloc (length + 1);
1013 full_path_write (url, full_path);
1014 full_path[length] = '\0';
1019 /* Unescape CHR in an otherwise escaped STR. Used to selectively
1020 unescape certain characters, such as "/" and ":". Returns a
1021 count of unescaped chars. */
1024 unescape_single_char (char *str, char chr)
1026 const char c1 = XNUM_TO_DIGIT (chr >> 4);
1027 const char c2 = XNUM_TO_DIGIT (chr & 0xf);
1028 char *h = str; /* hare */
1029 char *t = str; /* tortoise */
1030 for (; *h; h++, t++)
1032 if (h[0] == '%' && h[1] == c1 && h[2] == c2)
1043 /* Escape unsafe and reserved characters, except for the slash
1047 url_escape_dir (const char *dir)
1049 char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1053 unescape_single_char (newdir, '/');
1057 /* Sync u->path and u->url with u->dir and u->file. Called after
1058 u->file or u->dir have been changed, typically by the FTP code. */
1061 sync_path (struct url *u)
1063 char *newpath, *efile, *edir;
1067 /* u->dir and u->file are not escaped. URL-escape them before
1068 reassembling them into u->path. That way, if they contain
1069 separators like '?' or even if u->file contains slashes, the
1070 path will be correctly assembled. (u->file can contain slashes
1071 if the URL specifies it with %2f, or if an FTP server returns
1073 edir = url_escape_dir (u->dir);
1074 efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1077 newpath = xstrdup (efile);
1080 int dirlen = strlen (edir);
1081 int filelen = strlen (efile);
1083 /* Copy "DIR/FILE" to newpath. */
1084 char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1085 memcpy (p, edir, dirlen);
1088 memcpy (p, efile, filelen);
1097 if (efile != u->file)
1100 /* Regenerate u->url as well. */
1102 u->url = url_string (u, 0);
1105 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1106 This way we can sync u->path and u->url when they get changed. */
1109 url_set_dir (struct url *url, const char *newdir)
1112 url->dir = xstrdup (newdir);
1117 url_set_file (struct url *url, const char *newfile)
1120 url->file = xstrdup (newfile);
1125 url_free (struct url *url)
1131 xfree_null (url->params);
1132 xfree_null (url->query);
1133 xfree_null (url->fragment);
1134 xfree_null (url->user);
1135 xfree_null (url->passwd);
1143 /* Create all the necessary directories for PATH (a file). Calls
1144 mkdirhier() internally. */
1146 mkalldirs (const char *path)
1153 p = path + strlen (path);
1154 for (; *p != '/' && p != path; p--)
1157 /* Don't create if it's just a file. */
1158 if ((p == path) && (*p != '/'))
1160 t = strdupdelim (path, p);
1162 /* Check whether the directory exists. */
1163 if ((stat (t, &st) == 0))
1165 if (S_ISDIR (st.st_mode))
1172 /* If the dir exists as a file name, remove it first. This
1173 is *only* for Wget to work with buggy old CERN http
1174 servers. Here is the scenario: When Wget tries to
1175 retrieve a directory without a slash, e.g.
1176 http://foo/bar (bar being a directory), CERN server will
1177 not redirect it to http://foo/bar/ -- it will generate a
1178 directory listing containing links to bar/file1,
1179 bar/file2, etc. Wget will lose because it saves this
1180 HTML listing to a file `bar', so it cannot create the
1181 directory. To work around this, if the file of the same
1182 name exists, we just remove it and create the directory
1184 DEBUGP (("Removing %s because of directory danger!\n", t));
1188 res = make_directory (t);
1190 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1195 /* Functions for constructing the file name out of URL components. */
1197 /* A growable string structure, used by url_file_name and friends.
1198 This should perhaps be moved to utils.c.
1200 The idea is to have a convenient and efficient way to construct a
1201 string by having various functions append data to it. Instead of
1202 passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1203 functions in question, we pass the pointer to this struct. */
1211 /* Ensure that the string can accept APPEND_SIZE more characters past
1212 the current TAIL position. If necessary, this will grow the string
1213 and update its allocated size. If the string is already large
1214 enough to take TAIL+APPEND_SIZE characters, this does nothing. */
1215 #define GROW(g, append_size) do { \
1216 struct growable *G_ = g; \
1217 DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \
1220 /* Return the tail position of the string. */
1221 #define TAIL(r) ((r)->base + (r)->tail)
1223 /* Move the tail position by APPEND_COUNT characters. */
1224 #define TAIL_INCR(r, append_count) ((r)->tail += append_count)
1226 /* Append the string STR to DEST. NOTICE: the string in DEST is not
1230 append_string (const char *str, struct growable *dest)
1232 int l = strlen (str);
1234 memcpy (TAIL (dest), str, l);
1235 TAIL_INCR (dest, l);
1238 /* Append CH to DEST. For example, append_char (0, DEST)
1239 zero-terminates DEST. */
1242 append_char (char ch, struct growable *dest)
1246 TAIL_INCR (dest, 1);
1250 filechr_not_unix = 1, /* unusable on Unix, / and \0 */
1251 filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */
1252 filechr_control = 4 /* a control character, e.g. 0-31 */
1255 #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
1257 /* Shorthands for the table: */
1258 #define U filechr_not_unix
1259 #define W filechr_not_windows
1260 #define C filechr_control
1265 /* Table of characters unsafe under various conditions (see above).
1267 Arguably we could also claim `%' to be unsafe, since we use it as
1268 the escape character. If we ever want to be able to reliably
1269 translate file name back to URL, this would become
1270 crucial. Right now, it's better to be minimal in escaping. */
1272 const static unsigned char filechr_table[256] =
1274 UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */
1275 C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */
1276 C, C, C, C, C, C, C, C, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
1277 C, C, C, C, C, C, C, C, /* CAN EM SUB ESC FS GS RS US */
1278 0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */
1279 0, 0, W, 0, 0, 0, 0, UW, /* ( ) * + , - . / */
1280 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
1281 0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? */
1282 0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
1283 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
1284 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
1285 0, 0, 0, 0, W, 0, 0, 0, /* X Y Z [ \ ] ^ _ */
1286 0, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
1287 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
1288 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
1289 0, 0, 0, 0, 0, 0, 0, 0, /* x y z { | } ~ DEL */
1291 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */
1292 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */
1293 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1294 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1296 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1297 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1298 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1299 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1307 /* FN_PORT_SEP is the separator between host and port in file names
1308 for non-standard port numbers. On Unix this is normally ':', as in
1309 "www.xemacs.org:4001/index.html". Under Windows, we set it to +
1310 because Windows can't handle ':' in file names. */
1311 #define FN_PORT_SEP (opt.restrict_files_os != restrict_windows ? ':' : '+')
1313 /* FN_QUERY_SEP is the separator between the file name and the URL
1314 query, normally '?'. Since Windows cannot handle '?' as part of
1315 file name, we use '@' instead there. */
1316 #define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
1318 /* Quote path element, characters in [b, e), as file name, and append
1319 the quoted string to DEST. Each character is quoted as per
1320 FILE_CHAR_TEST and the corresponding table (filechr_table).
1322 If ESCAPED_P is non-zero, the path element is considered to be
1323 URL-escaped and will be unescaped prior to inspection. */
1326 append_uri_pathel (const char *b, const char *e, int escaped_p,
1327 struct growable *dest)
1333 if (opt.restrict_files_os == restrict_unix)
1334 mask = filechr_not_unix;
1336 mask = filechr_not_windows;
1337 if (opt.restrict_files_ctrl)
1338 mask |= filechr_control;
1340 /* Copy [b, e) to PATHEL and URL-unescape it. */
1344 BOUNDED_TO_ALLOCA (b, e, unescaped);
1345 url_unescape (unescaped);
1347 e = unescaped + strlen (unescaped);
1350 /* Defang ".." when found as component of path. Remember that path
1351 comes from the URL and might contain malicious input. */
1352 if (e - b == 2 && b[0] == '.' && b[1] == '.')
1358 /* Walk the PATHEL string and check how many characters we'll need
1361 for (p = b; p < e; p++)
1362 if (FILE_CHAR_TEST (*p, mask))
1365 /* Calculate the length of the output string. e-b is the input
1366 string length. Each quoted char introduces two additional
1367 characters in the string, hence 2*quoted. */
1368 outlen = (e - b) + (2 * quoted);
1369 GROW (dest, outlen);
1373 /* If there's nothing to quote, we can simply append the string
1374 without processing it again. */
1375 memcpy (TAIL (dest), b, outlen);
1379 char *q = TAIL (dest);
1380 for (p = b; p < e; p++)
1382 if (!FILE_CHAR_TEST (*p, mask))
1386 unsigned char ch = *p;
1388 *q++ = XNUM_TO_DIGIT (ch >> 4);
1389 *q++ = XNUM_TO_DIGIT (ch & 0xf);
1392 assert (q - TAIL (dest) == outlen);
1394 TAIL_INCR (dest, outlen);
1397 /* Append to DEST the directory structure that corresponds to the
1398 directory part of URL's path. For example, if the URL is
1399 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1401 Each path element ("dir1" and "dir2" in the above example) is
1402 examined, url-unescaped, and re-escaped as file name element.
1404 Additionally, it cuts as many directories from the path as
1405 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1406 will produce "bar" for the above example. For 2 or more, it will
1409 Each component of the path is quoted for use as file name. */
1412 append_dir_structure (const struct url *u, struct growable *dest)
1414 char *pathel, *next;
1415 int cut = opt.cut_dirs;
1417 /* Go through the path components, de-URL-quote them, and quote them
1418 (if necessary) as file names. */
1421 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1426 /* Ignore empty pathels. */
1430 append_char ('/', dest);
1431 append_uri_pathel (pathel, next, 1, dest);
1435 /* Return a unique file name that matches the given URL as well as
1436 possible. Does not create directories on the file system.
   The name is assembled in a growable buffer from, in this order:
   opt.dir_prefix; the scheme name (opt.protocol_directories); the
   host, plus the port when it differs from the scheme default
   (opt.add_hostdir); the directory structure of the URL; the file
   part, or "index.html" when the URL has an empty one; and the query
   string, if there is one.  A host that is exactly ".." is written as
   "%2E%2E" instead.
   NOTE(review): some conditions guarding these steps are on lines
   that this listing omits; confirm against the complete file. */
1439 url_file_name (const struct url *u)
1441 struct growable fnres; /* stands for "file name result" */
1443 const char *u_file, *u_query;
1444 char *fname, *unique;
1450 /* Start with the directory prefix, if specified. */
1452 append_string (opt.dir_prefix, &fnres);
1454 /* If "dirstruct" is turned on (typically the case with -r), add
1455 the host and port (unless those have been turned off) and
1456 directory structure. */
1459 if (opt.protocol_directories)
1462 append_char ('/', &fnres);
1463 append_string (supported_schemes[u->scheme].name, &fnres);
1465 if (opt.add_hostdir)
1468 append_char ('/', &fnres);
1469 if (0 != strcmp (u->host, ".."))
1470 append_string (u->host, &fnres);
1472 /* Host name can come from the network; malicious DNS may
1473 allow ".." to be resolved, causing us to write to
1474 "../<file>". Defang such host names. */
1475 append_string ("%2E%2E", &fnres);
1476 if (u->port != scheme_default_port (u->scheme))
1479 number_to_string (portstr, u->port);
1480 append_char (FN_PORT_SEP, &fnres);
1481 append_string (portstr, &fnres);
1485 append_dir_structure (u, &fnres);
1488 /* Add the file name. */
1490 append_char ('/', &fnres);
1491 u_file = *u->file ? u->file : "index.html";
1492 append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1494 /* Append "?query" to the file name. */
1495 u_query = u->query && *u->query ? u->query : NULL;
1498 append_char (FN_QUERY_SEP, &fnres);
1499 append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1502 /* Zero-terminate the file name. */
1503 append_char ('\0', &fnres);
1507 /* Check the cases in which the unique extensions are not used:
1508 1) Clobbering is turned off (-nc).
1509 2) Retrieval with regetting.
1510 3) Timestamping is used.
1511 4) Hierarchy is built.
1513 The exception is the case when file does exist and is a
1514 directory (see `mkalldirs' for explanation). */
1516 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1517 && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1520 unique = unique_name (fname, 1);
1521 if (unique != fname)
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH and return non-zero if PATH has been modified, zero otherwise.

   The algorithm is in spirit similar to the one described in rfc1808,
   although implemented differently, in one pass.  To recap, path
   elements containing only "." are removed, and ".." is taken to mean
   "back up one element".  Single leading and trailing slashes are
   preserved, and a ".." that has nothing left to back over is kept
   literally.

   This function does not handle URL escapes explicitly.  If you're
   passing paths from URLs, make sure to unquote "%2e" and "%2E" to
   ".", so that this function can find the dots.  (Wget's URL parser
   calls reencode_escapes, which see.)

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided in test_path_simplify.  If you change
   anything in this function, run test_path_simplify to make sure you
   haven't broken a test case.

   PATH must be a writable, zero-terminated string.  The result is
   never longer than the input, so the rewriting is done in place.
   The return value is exactly 0 or 1.  */

static int
path_simplify (char *path)
{
  char *h = path;               /* hare: next input character to read */
  char *t = path;               /* tortoise: next output position */
  char *beg = path;             /* the tortoise never retreats past this */
  char *end = path + strlen (path);

  while (h < end)
    {
      /* The hare is at the beginning of a path element.  Looking at
         h[1] and h[2] is safe because evaluation stops as soon as the
         terminating '\0' has been seen.  */
      int is_dot = (h[0] == '.' && (h[1] == '/' || h[1] == '\0'));
      int is_dotdot = (h[0] == '.' && h[1] == '.'
                       && (h[2] == '/' || h[2] == '\0'));

      if (is_dot)
        {
          /* Drop "./", or a "." that ends the path.  In the latter
             case the hare lands one past the terminator, which only
             ends the loop.  */
          h += 2;
          continue;
        }

      if (is_dotdot)
        {
          if (t > beg)
            {
              /* Handle "../" by retreating the tortoise by one path
                 element: move backwards until T hits the beginning of
                 the previous path element or the boundary.  */
              for (--t; t > beg && t[-1] != '/'; t--)
                ;
              h += 3;
              continue;
            }

          /* Nothing left to back over.  Keep this ".." literally and
             move the boundary past it, so that a later ".." does not
             remove it.  The element itself is handled as a regular
             one below.  */
          beg = t + 3;
        }

      /* A regular path element.  While the hare has not advanced past
         the tortoise nothing has been removed yet, so the element is
         merely skipped; otherwise it is copied down.  Either way the
         slash that ends the element, if any, goes with it.  */
      if (t == h)
        {
          while (h < end && *h != '/')
            t++, h++;
          if (h < end)
            t++, h++;
        }
      else
        {
          while (h < end && *h != '/')
            *t++ = *h++;
          if (h < end)
            *t++ = *h++;
        }
    }

  /* The two pointers differ exactly when something was removed; only
     then does the string need a new terminator.  */
  if (t != h)
    *t = '\0';

  return t != h;
}
1614 /* Return the length of URL's path. Path is considered to be
1615 terminated by one of '?', ';', '#', or by the end of the
   string.  The terminator is located with strpbrk_or_eos.
   NOTE(review): the closing of this comment, the return type and the
   return statement are on lines that this listing omits.
1619 path_length (const char *url)
1621 const char *q = strpbrk_or_eos (url, "?;#");
1625 /* Return a pointer to the last occurrence of character C in the
1626 half-open range [B, E), or NULL if C does not occur there. We might
1627 want to use memrchr (a GNU extension) under GNU libc.
   NOTE(review): the return type and the body are on lines that this
   listing omits. */
1630 find_last_char (const char *b, const char *e, char c)
1638 /* Merge BASE with LINK and return the resulting URI.
1640 Either of the URIs may be absolute or relative, complete with the
1641 host name, or path only. This tries to reasonably handle all
1642 foreseeable cases. It only employs minimal URL parsing, without
1643 knowledge of the specifics of schemes.
   The cases are tried in this order: a LINK that has a scheme is
   copied as is; an empty LINK yields a copy of BASE; a LINK starting
   with '?' replaces everything from the end of BASE's path; a LINK
   starting with '#' replaces everything from the first '#' of BASE;
   a LINK starting with "//" replaces BASE from its first slash when
   that slash begins a double slash, and all of BASE otherwise; a LINK
   starting with a single '/' replaces BASE from the insertion point
   worked out in that branch; any other LINK replaces what follows the
   last slash of BASE's path.  Every result visible here is newly
   allocated with xstrdup or xmalloc; presumably the caller frees it.
1645 I briefly considered making this function call path_simplify after
1646 the merging process, as rfc1738 seems to suggest. This is a bad
1647 idea for several reasons: 1) it complexifies the code, and 2)
1648 url_parse has to simplify path anyway, so it's wasteful to boot. */
1651 uri_merge (const char *base, const char *link)
1657 if (url_has_scheme (link))
1658 return xstrdup (link);
1660 /* We may not examine BASE past END. */
1661 end = base + path_length (base);
1662 linklength = strlen (link);
1666 /* Empty LINK points back to BASE, query string and all. */
1667 return xstrdup (base);
1669 else if (*link == '?')
1671 /* LINK points to the same location, but changes the query
1672 string. Examples: */
1673 /* uri_merge("path", "?new") -> "path?new" */
1674 /* uri_merge("path?foo", "?new") -> "path?new" */
1675 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1676 /* uri_merge("path#foo", "?new") -> "path?new" */
1677 int baselength = end - base;
1678 merge = xmalloc (baselength + linklength + 1);
1679 memcpy (merge, base, baselength);
1680 memcpy (merge + baselength, link, linklength);
1681 merge[baselength + linklength] = '\0';
1683 else if (*link == '#')
1685 /* uri_merge("path", "#new") -> "path#new" */
1686 /* uri_merge("path#foo", "#new") -> "path#new" */
1687 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1688 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1690 const char *end1 = strchr (base, '#');
1692 end1 = base + strlen (base);
1693 baselength = end1 - base;
1694 merge = xmalloc (baselength + linklength + 1);
1695 memcpy (merge, base, baselength);
1696 memcpy (merge + baselength, link, linklength);
1697 merge[baselength + linklength] = '\0';
1699 else if (*link == '/' && *(link + 1) == '/')
1701 /* LINK begins with "//" and so is a net path: we need to
1702 replace everything after (and including) the double slash
1705 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1706 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1707 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1711 const char *start_insert;
1713 /* Look for first slash. */
1714 slash = memchr (base, '/', end - base);
1715 /* If found slash and it is a double slash, then replace
1716 from this point, else default to replacing from the
1718 if (slash && *(slash + 1) == '/')
1719 start_insert = slash;
1721 start_insert = base;
1723 span = start_insert - base;
1724 merge = (char *)xmalloc (span + linklength + 1);
1726 memcpy (merge, base, span);
1727 memcpy (merge + span, link, linklength);
1728 merge[span + linklength] = '\0';
1730 else if (*link == '/')
1732 /* LINK is an absolute path: we need to replace everything
1733 after (and including) the FIRST slash with LINK.
1735 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1736 "/qux/xyzzy", our result should be
1737 "http://host/qux/xyzzy". */
1740 const char *start_insert = NULL; /* for gcc to shut up. */
1741 const char *pos = base;
1742 int seen_slash_slash = 0;
1743 /* We're looking for the first slash, but want to ignore
1746 slash = memchr (pos, '/', end - pos);
1747 if (slash && !seen_slash_slash)
1748 if (*(slash + 1) == '/')
1751 seen_slash_slash = 1;
1755 /* At this point, SLASH is the location of the first / after
1756 "//", or the first slash altogether. START_INSERT is the
1757 pointer to the location where LINK will be inserted. When
1758 examining the last two examples, keep in mind that LINK
1761 if (!slash && !seen_slash_slash)
1762 /* example: "foo" */
1764 start_insert = base;
1765 else if (!slash && seen_slash_slash)
1766 /* example: "http://foo" */
1769 else if (slash && !seen_slash_slash)
1770 /* example: "foo/bar" */
1772 start_insert = base;
1773 else if (slash && seen_slash_slash)
1774 /* example: "http://something/" */
1776 start_insert = slash;
1778 span = start_insert - base;
1779 merge = (char *)xmalloc (span + linklength + 1);
1781 memcpy (merge, base, span);
1782 memcpy (merge + span, link, linklength);
1783 merge[span + linklength] = '\0';
1787 /* LINK is a relative URL: we need to replace everything
1788 after last slash (possibly empty) with LINK.
1790 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1791 our result should be "whatever/foo/qux/xyzzy". */
1792 int need_explicit_slash = 0;
1794 const char *start_insert;
1795 const char *last_slash = find_last_char (base, end, '/');
1798 /* No slash found at all. Replace what we have with LINK. */
1799 start_insert = base;
1801 else if (last_slash && last_slash >= base + 2
1802 && last_slash[-2] == ':' && last_slash[-1] == '/')
1804 /* example: "http://host" */
1806 start_insert = end + 1;
1807 need_explicit_slash = 1;
1811 /* example: "whatever/foo/bar" */
1813 start_insert = last_slash + 1;
1816 span = start_insert - base;
1817 merge = (char *)xmalloc (span + linklength + 1);
1819 memcpy (merge, base, span);
/* When the insertion point was set to END + 1, SPAN covers one
   character beyond END.  That last copied character is replaced by
   the slash that separates the host from LINK. */
1820 if (need_explicit_slash)
1821 merge[span - 1] = '/';
1822 memcpy (merge + span, link, linklength);
1823 merge[span + linklength] = '\0';
/* Copy the characters of the string S to the location P with memcpy.
   The length comes from strlen, so no terminating zero is copied.
   NOTE(review): the end of this macro is on lines that this listing
   omits.  It is used for consecutive appends, so it presumably also
   advances P; confirm against the complete file. */
1829 #define APPEND(p, s) do { \
1830 int len = strlen (s); \
1831 memcpy (p, s, len); \
1835 /* Use this instead of password when the actual password is supposed
1836 to be hidden. We intentionally use a generic string without giving
1837 away the number of characters in the password, like previous
1839 #define HIDDEN_PASSWORD "*password*"
1841 /* Recreate the URL string from the data in URL.
1843 If HIDE_PASSWORD is non-zero (as it is when we're calling this on a
1844 URL we plan to print, but not when calling it to canonicalize a URL for
1845 use within the program), password will be hidden. Unsafe
1846 characters in the URL will be quoted.
   The string is written into a buffer obtained from xmalloc whose
   size is computed beforehand; an assertion checks that the number of
   characters written equals that size.  The port is included only
   when it differs from the default port of the scheme, and two extra
   characters are reserved for brackets when the host contains ':'.
   NOTE(review): the return statement is on a line that this listing
   omits; presumably the xmalloc'ed string is returned and owned by
   the caller. */
1849 url_string (const struct url *url, int hide_password)
1853 char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1855 int scheme_port = supported_schemes[url->scheme].default_port;
1856 const char *scheme_str = supported_schemes[url->scheme].leading_string;
1857 int fplen = full_path_length (url);
1859 int brackets_around_host;
1861 assert (scheme_str != NULL);
1863 /* Make sure the user name and password are quoted. */
1866 quoted_user = url_escape_allow_passthrough (url->user);
1870 quoted_passwd = HIDDEN_PASSWORD;
1872 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1876 /* In the unlikely event that the host name contains non-printable
1877 characters, quote it for displaying to the user. */
1878 quoted_host = url_escape_allow_passthrough (url->host);
1880 /* Undo the quoting of colons that URL escaping performs. IPv6
1881 addresses may legally contain colons, and in that case must be
1882 placed in square brackets. */
1883 if (quoted_host != url->host)
1884 unescape_single_char (quoted_host, ':');
1885 brackets_around_host = strchr (quoted_host, ':') != NULL;
1887 size = (strlen (scheme_str)
1888 + strlen (quoted_host)
1889 + (brackets_around_host ? 2 : 0)
1892 if (url->port != scheme_port)
1893 size += 1 + numdigit (url->port);
1896 size += 1 + strlen (quoted_user);
1898 size += 1 + strlen (quoted_passwd);
1901 p = result = xmalloc (size);
1903 APPEND (p, scheme_str);
1906 APPEND (p, quoted_user);
1910 APPEND (p, quoted_passwd);
1915 if (brackets_around_host)
1917 APPEND (p, quoted_host);
1918 if (brackets_around_host)
1920 if (url->port != scheme_port)
1923 p = number_to_string (p, url->port);
1926 full_path_write (url, p);
1930 assert (p - result == size);
/* Free only what the escaping step produced.  The pointer comparisons
   skip a string that is still the field of URL itself, and the
   password is also skipped when the HIDDEN_PASSWORD literal was used. */
1932 if (quoted_user && quoted_user != url->user)
1933 xfree (quoted_user);
1934 if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
1935 xfree (quoted_passwd);
1936 if (quoted_host != url->host)
1937 xfree (quoted_host);
1942 /* Return non-zero if scheme A is similar to scheme B.
1944 Schemes are similar if they are equal. If SSL is supported, schemes
1945 are also similar if one is http (SCHEME_HTTP) and the other is https
1948 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1953 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1954 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1961 /* Debugging and testing support for path_simplify. */
1963 /* Debug: run path_simplify on PATH and return the result in a new
1964 string. Useful for calling from the debugger.
   PATH itself is not modified: path_simplify works on a copy made
   with xstrdup.
   NOTE(review): the line with the function name and the return
   statement are omitted by this listing. */
1968 char *copy = xstrdup (path);
1969 path_simplify (copy);
/* Run path_simplify on a copy of TEST and check the outcome.  A
   message is printed when the simplified string differs from
   EXPECTED_RESULT, and another when the value returned by
   path_simplify differs from EXPECTED_CHANGE (1 if a modification is
   expected). */
1974 run_test (char *test, char *expected_result, int expected_change)
1976 char *test_copy = xstrdup (test);
1977 int modified = path_simplify (test_copy);
1979 if (0 != strcmp (test_copy, expected_result))
1981 printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
1982 test, expected_result, test_copy);
1984 if (modified != expected_change)
1986 if (expected_change == 1)
1987 printf ("Expected modification with path_simplify(\"%s\").\n",
1990 printf ("Expected no modification with path_simplify(\"%s\").\n",
/* Table-driven test of path_simplify.  Each entry of the table gives
   an input path, the expected simplified path, and whether
   path_simplify is expected to report a modification.  Every entry is
   passed to run_test. */
1997 test_path_simplify (void)
2000 char *test, *result;
2007 { "../", "../", 0 },
2008 { "foo", "foo", 0 },
2009 { "foo/bar", "foo/bar", 0 },
2010 { "foo///bar", "foo///bar", 0 },
2011 { "foo/.", "foo/", 1 },
2012 { "foo/./", "foo/", 1 },
2013 { "foo./", "foo./", 0 },
2014 { "foo/../bar", "bar", 1 },
2015 { "foo/../bar/", "bar/", 1 },
2016 { "foo/bar/..", "foo/", 1 },
2017 { "foo/bar/../x", "foo/x", 1 },
2018 { "foo/bar/../x/", "foo/x/", 1 },
2019 { "foo/..", "", 1 },
2020 { "foo/../..", "..", 1 },
2021 { "foo/../../..", "../..", 1 },
2022 { "foo/../../bar/../../baz", "../../baz", 1 },
2023 { "a/b/../../c", "c", 1 },
2024 { "./a/../b", "b", 1 }
2028 for (i = 0; i < countof (tests); i++)
2030 char *test = tests[i].test;
2031 char *expected_result = tests[i].result;
2032 int expected_change = tests[i].should_modify;
2033 run_test (test, expected_result, expected_change);