2 Copyright (C) 1996-2006 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget. If not, see <http://www.gnu.org/licenses/>.
19 In addition, as a special exception, the Free Software Foundation
20 gives permission to link the code of its release of Wget with the
21 OpenSSL project's "OpenSSL" library (or with modified versions of it
22 that use the same license as the "OpenSSL" library), and distribute
23 the linked executables. You must obey the GNU General Public License
24 in all respects for all of the code used other than "OpenSSL". If you
25 modify this file, you may extend this exception to your version of the
26 file, but you are not obligated to do so. If you do not wish to do
27 so, delete this exception statement from your version. */
43 #include "host.h" /* for is_valid_ipv6_address */
50 scm_disabled = 1, /* for https when OpenSSL fails to init. */
51 scm_has_params = 2, /* whether scheme has ;params */
52 scm_has_query = 4, /* whether scheme has ?query */
53 scm_has_fragment = 8 /* whether scheme has #fragment */
58 /* Short name of the scheme, such as "http" or "ftp". */
60 /* Leading string that identifies the scheme, such as "https://". */
61 const char *leading_string;
62 /* Default port of the scheme when none is specified. */
68 /* Supported schemes: */
69 static struct scheme_data supported_schemes[] =
71 { "http", "http://", DEFAULT_HTTP_PORT, scm_has_query|scm_has_fragment },
73 { "https", "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
75 { "ftp", "ftp://", DEFAULT_FTP_PORT, scm_has_params|scm_has_fragment },
81 /* Forward declarations: */
83 static bool path_simplify (char *);
85 /* Support for escaping and unescaping of URL strings. */
87 /* Table of "reserved" and "unsafe" characters. Those terms are
88 rfc1738-speak, as such largely obsoleted by rfc2396 and later
89 specs, but the general idea remains.
91 A reserved character is the one that you can't decode without
92 changing the meaning of the URL. For example, you can't decode
93 "/foo/%2f/bar" into "/foo///bar" because the number and contents of
94 path components are different. Non-reserved characters can be
95 changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". The
96 unsafe characters are loosely based on rfc1738, plus "$" and ",",
97 as recommended by rfc2396, and minus "~", which is very frequently
98 used (and sometimes unrecognized as %7E by broken servers).
100 An unsafe character is the one that should be encoded when URLs are
101 placed in foreign environments. E.g. space and newline are unsafe
102 in HTTP contexts because HTTP uses them as separator and line
103 terminator, so they must be encoded to %20 and %0A respectively.
104 "*" is unsafe in shell context, etc.
106 We determine whether a character is unsafe through static table
107 lookup. This code assumes ASCII character set and 8-bit chars. */
110 /* rfc1738 reserved chars + "$" and ",". */
113 /* rfc1738 unsafe chars, plus non-printables. */
117 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
118 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
119 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
121 /* Shorthands for the table: */
122 #define R urlchr_reserved
123 #define U urlchr_unsafe
126 static const unsigned char urlchr_table[256] =
128 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
129 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
130 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
131 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
132 U, 0, U, RU, R, U, R, 0, /* SP ! " # $ % & ' */
133 0, 0, 0, R, R, 0, 0, R, /* ( ) * + , - . / */
134 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
135 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
136 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
137 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
138 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
139 0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
140 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
141 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
142 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
143 0, 0, 0, U, U, U, 0, U, /* x y z { | } ~ DEL */
145 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
146 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
147 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
148 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
150 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
151 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
152 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
153 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
159 /* URL-unescape the string S.
161 This is done by transforming the sequences "%HH" to the character
162 represented by the hexadecimal digits HH. If % is not followed by
163 two hexadecimal digits, it is inserted literally.
165 The transformation is done in place. If you need the original
166 string intact, make a copy before calling this function. */
169 url_unescape (char *s)
171 char *t = s; /* t - tortoise */
172 char *h = s; /* h - hare */
184 /* Do nothing if '%' is not followed by two hex digits. */
185 if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
187 c = X2DIGITS_TO_NUM (h[1], h[2]);
188 /* Don't unescape %00 because there is no way to insert it
189 into a C string without effectively truncating it. */
199 /* The core of url_escape_* functions. Escapes the characters that
200 match the provided mask in urlchr_table.
202 If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
203 returned unchanged. If ALLOW_PASSTHROUGH is false, a freshly
204 allocated string will be returned in all cases. */
207 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
214 for (p1 = s; *p1; p1++)
215 if (urlchr_test (*p1, mask))
216 addition += 2; /* Two more characters (hex digits) */
219 return allow_passthrough ? (char *)s : xstrdup (s);
221 newlen = (p1 - s) + addition;
222 newstr = xmalloc (newlen + 1);
228 /* Quote the characters that match the test mask. */
229 if (urlchr_test (*p1, mask))
231 unsigned char c = *p1++;
233 *p2++ = XNUM_TO_DIGIT (c >> 4);
234 *p2++ = XNUM_TO_DIGIT (c & 0xf);
239 assert (p2 - newstr == newlen);
245 /* URL-escape the unsafe characters (see urlchr_table) in a given
246 string, returning a freshly allocated string. */
249 url_escape (const char *s)
251 return url_escape_1 (s, urlchr_unsafe, false);
254 /* URL-escape the unsafe characters (see urlchr_table) in a given
255 string. If no characters are unsafe, S is returned. */
258 url_escape_allow_passthrough (const char *s)
260 return url_escape_1 (s, urlchr_unsafe, true);
263 /* Decide whether the char at position P needs to be encoded. (It is
264 not enough to pass a single char *P because the function may need
265 to inspect the surrounding context.)
267 Return true if the char should be escaped as %XX, false otherwise. */
270 char_needs_escaping (const char *p)
274 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
277 /* Garbled %.. sequence: encode `%'. */
280 else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
286 /* Translate a %-escaped (but possibly non-conformant) input string S
287 into a %-escaped (and conformant) output string. If no characters
288 are encoded or decoded, return the same string S; otherwise, return
289 a freshly allocated string with the new contents.
291 After a URL has been run through this function, the protocols that
292 use `%' as the quote character can use the resulting string as-is,
293 while those that don't can use url_unescape to get to the intended
294 data. This function is stable: once the input is transformed,
295 further transformations of the result yield the same output.
297 Let's discuss why this function is needed.
299 Imagine Wget is asked to retrieve `http://abc.xyz/abc def'. Since
300 a raw space character would mess up the HTTP request, it needs to
301 be quoted, like this:
303 GET /abc%20def HTTP/1.0
305 It would appear that the unsafe chars need to be quoted, for
306 example with url_escape. But what if we're requested to download
307 `abc%20def'? url_escape transforms "%" to "%25", which would leave
308 us with `abc%2520def'. This is incorrect -- since %-escapes are
309 part of URL syntax, "%20" is the correct way to denote a literal
310 space on the Wget command line. This leads to the conclusion that
311 in that case Wget should not call url_escape, but leave the `%20'
312 as is. This is clearly contradictory, but it only gets worse.
314 What if the requested URI is `abc%20 def'? If we call url_escape,
315 we end up with `/abc%2520%20def', which is almost certainly not
316 intended. If we don't call url_escape, we are left with the
317 embedded space and cannot complete the request. What the user
318 meant was for Wget to request `/abc%20%20def', and this is where
319 reencode_escapes kicks in.
321 Wget used to solve this by first decoding %-quotes, and then
322 encoding all the "unsafe" characters found in the resulting string.
323 This was wrong because it didn't preserve certain URL special
324 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
325 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
326 whether we considered `+' reserved (it is). One of these results
327 is inevitable because by the second step we would lose information
328 on whether the `+' was originally encoded or not. Both results
329 were wrong because in CGI parameters + means space, while %2B means
330 literal plus. reencode_escapes correctly translates the above to
331 "a%2B+b", i.e. returns the original string.
333 This function uses a modified version of the algorithm originally
334 proposed by Anon Sricharoenchai:
336 * Encode all "unsafe" characters, except those that are also
337 "reserved", to %XX. See urlchr_table for which characters are
340 * Encode the "%" characters not followed by two hex digits to
343 * Pass through all other characters and %XX escapes as-is. (Up to
344 Wget 1.10 this decoded %XX escapes corresponding to "safe"
345 characters, but that was obtrusive and broke some servers.)
349 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
351 "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
355 "foo bar" -> "foo%20bar"
356 "foo%20bar" -> "foo%20bar"
357 "foo %20bar" -> "foo%20%20bar"
358 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
359 "foo%25%20bar" -> "foo%25%20bar"
360 "foo%2%20bar" -> "foo%252%20bar"
361 "foo+bar" -> "foo+bar" (plus is reserved!)
362 "foo%2b+bar" -> "foo%2b+bar" */
365 reencode_escapes (const char *s)
371 int encode_count = 0;
373 /* First pass: inspect the string to see if there's anything to do,
374 and to calculate the new length. */
375 for (p1 = s; *p1; p1++)
376 if (char_needs_escaping (p1))
380 /* The string is good as it is. */
381 return (char *) s; /* C const model sucks. */
384 /* Each encoding adds two characters (hex digits). */
385 newlen = oldlen + 2 * encode_count;
386 newstr = xmalloc (newlen + 1);
388 /* Second pass: copy the string to the destination address, encoding
389 chars when needed. */
394 if (char_needs_escaping (p1))
396 unsigned char c = *p1++;
398 *p2++ = XNUM_TO_DIGIT (c >> 4);
399 *p2++ = XNUM_TO_DIGIT (c & 0xf);
405 assert (p2 - newstr == newlen);
409 /* Returns the scheme type if the scheme is supported, or
410 SCHEME_INVALID if not. */
413 url_scheme (const char *url)
417 for (i = 0; supported_schemes[i].leading_string; i++)
418 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
419 strlen (supported_schemes[i].leading_string)))
421 if (!(supported_schemes[i].flags & scm_disabled))
422 return (enum url_scheme) i;
424 return SCHEME_INVALID;
427 return SCHEME_INVALID;
430 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
432 /* Return 1 if the URL begins with any "scheme", 0 otherwise. As
433 currently implemented, it returns true if URL begins with
437 url_has_scheme (const char *url)
441 /* The first char must be a scheme char. */
442 if (!*p || !SCHEME_CHAR (*p))
445 /* Followed by 0 or more scheme chars. */
446 while (*p && SCHEME_CHAR (*p))
448 /* Terminated by ':'. */
453 scheme_default_port (enum url_scheme scheme)
455 return supported_schemes[scheme].default_port;
459 scheme_disable (enum url_scheme scheme)
461 supported_schemes[scheme].flags |= scm_disabled;
464 /* Skip the username and password, if present in the URL. The
465 function should *not* be called with the complete URL, but with the
466 portion after the scheme.
468 If no username and password are found, return URL. */
471 url_skip_credentials (const char *url)
473 /* Look for '@' that comes before terminators, such as '/', '?',
475 const char *p = (const char *)strpbrk (url, "@/?#;");
481 /* Parse credentials contained in [BEG, END). The region is expected
482 to have come from a URL and is unescaped. */
485 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
491 return false; /* empty user name */
493 colon = memchr (beg, ':', end - beg);
495 return false; /* again empty user name */
499 *passwd = strdupdelim (colon + 1, end);
501 url_unescape (*passwd);
508 *user = strdupdelim (beg, userend);
509 url_unescape (*user);
513 /* Used by main.c: detect URLs written using the "shorthand" URL forms
514 originally popularized by Netscape and NcFTP. HTTP shorthands look
517 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
518 www.foo.com[:port] -> http://www.foo.com[:port]
520 FTP shorthands look like this:
522 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
523 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
525 If the URL need not or cannot be rewritten, return NULL. */
528 rewrite_shorthand_url (const char *url)
533 if (url_scheme (url) != SCHEME_INVALID)
536 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
538 p = strpbrk (url, ":/");
542 /* If we're looking at "://", it means the URL uses a scheme we
543 don't support, which may include "https" when compiled without
544 SSL support. Don't bogusly rewrite such URLs. */
545 if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
550 /* Colon indicates ftp, as in foo.bar.com:path. Check for
551 special case of http port number ("localhost:10000"). */
552 int digits = strspn (p + 1, "0123456789");
553 if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
556 /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
557 ret = aprintf ("ftp://%s", url);
558 ret[6 + (p - url)] = '/';
563 /* Just prepend "http://" to URL. */
564 ret = aprintf ("http://%s", url);
569 static void split_path (const char *, char **, char **);
571 /* Like strpbrk, with the exception that it returns the pointer to the
572 terminating zero (end-of-string aka "eos") if no matching character
576 strpbrk_or_eos (const char *s, const char *accept)
578 char *p = strpbrk (s, accept);
580 p = strchr (s, '\0');
584 /* Turn STR into lowercase; return true if a character was actually
588 lowercase_str (char *str)
590 bool changed = false;
595 *str = TOLOWER (*str);
601 init_seps (enum url_scheme scheme)
603 static char seps[8] = ":/";
605 int flags = supported_schemes[scheme].flags;
607 if (flags & scm_has_params)
609 if (flags & scm_has_query)
611 if (flags & scm_has_fragment)
617 static const char *parse_errors[] = {
618 #define PE_NO_ERROR 0
620 #define PE_UNSUPPORTED_SCHEME 1
621 N_("Unsupported scheme"),
622 #define PE_INVALID_HOST_NAME 2
623 N_("Invalid host name"),
624 #define PE_BAD_PORT_NUMBER 3
625 N_("Bad port number"),
626 #define PE_INVALID_USER_NAME 4
627 N_("Invalid user name"),
628 #define PE_UNTERMINATED_IPV6_ADDRESS 5
629 N_("Unterminated IPv6 numeric address"),
630 #define PE_IPV6_NOT_SUPPORTED 6
631 N_("IPv6 addresses not supported"),
632 #define PE_INVALID_IPV6_ADDRESS 7
633 N_("Invalid IPv6 numeric address")
638 Return a new struct url if successful, NULL on error. In case of
639 error, and if ERROR is not NULL, also set *ERROR to the appropriate
642 url_parse (const char *url, int *error)
646 bool path_modified, host_modified;
648 enum url_scheme scheme;
651 const char *uname_b, *uname_e;
652 const char *host_b, *host_e;
653 const char *path_b, *path_e;
654 const char *params_b, *params_e;
655 const char *query_b, *query_e;
656 const char *fragment_b, *fragment_e;
659 char *user = NULL, *passwd = NULL;
661 char *url_encoded = NULL;
665 scheme = url_scheme (url);
666 if (scheme == SCHEME_INVALID)
668 error_code = PE_UNSUPPORTED_SCHEME;
672 url_encoded = reencode_escapes (url);
675 p += strlen (supported_schemes[scheme].leading_string);
677 p = url_skip_credentials (p);
680 /* scheme://user:pass@host[:port]... */
683 /* We attempt to break down the URL into the components path,
684 params, query, and fragment. They are ordered like this:
686 scheme://host[:port][/path][;params][?query][#fragment] */
688 path_b = path_e = NULL;
689 params_b = params_e = NULL;
690 query_b = query_e = NULL;
691 fragment_b = fragment_e = NULL;
693 /* Initialize separators for optional parts of URL, depending on the
694 scheme. For example, FTP has params, and HTTP and HTTPS have
695 query string and fragment. */
696 seps = init_seps (scheme);
702 /* Handle IPv6 address inside square brackets. Ideally we'd
703 just look for the terminating ']', but rfc2732 mandates
704 rejecting invalid IPv6 addresses. */
706 /* The address begins after '['. */
708 host_e = strchr (host_b, ']');
712 error_code = PE_UNTERMINATED_IPV6_ADDRESS;
717 /* Check if the IPv6 address is valid. */
718 if (!is_valid_ipv6_address(host_b, host_e))
720 error_code = PE_INVALID_IPV6_ADDRESS;
724 /* Continue parsing after the closing ']'. */
727 error_code = PE_IPV6_NOT_SUPPORTED;
731 /* The closing bracket must be followed by a separator or by the
733 /* http://[::1]... */
735 if (!strchr (seps, *p))
737 /* Trailing garbage after []-delimited IPv6 address. */
738 error_code = PE_INVALID_HOST_NAME;
744 p = strpbrk_or_eos (p, seps);
747 ++seps; /* advance to '/' */
749 if (host_b == host_e)
751 error_code = PE_INVALID_HOST_NAME;
755 port = scheme_default_port (scheme);
758 const char *port_b, *port_e, *pp;
760 /* scheme://host:port/tralala */
764 p = strpbrk_or_eos (p, seps);
767 /* Allow empty port, as per rfc2396. */
768 if (port_b != port_e)
769 for (port = 0, pp = port_b; pp < port_e; pp++)
773 /* http://host:12randomgarbage/blah */
775 error_code = PE_BAD_PORT_NUMBER;
778 port = 10 * port + (*pp - '0');
779 /* Check for too large port numbers here, before we have
780 a chance to overflow on bogus port values. */
783 error_code = PE_BAD_PORT_NUMBER;
788 /* Advance to the first separator *after* '/' (either ';' or '?',
789 depending on the scheme). */
792 /* Get the optional parts of URL, each part being delimited by
793 current location and the position of the next separator. */
794 #define GET_URL_PART(sepchar, var) do { \
796 var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps); \
800 GET_URL_PART ('/', path);
801 if (supported_schemes[scheme].flags & scm_has_params)
802 GET_URL_PART (';', params);
803 if (supported_schemes[scheme].flags & scm_has_query)
804 GET_URL_PART ('?', query);
805 if (supported_schemes[scheme].flags & scm_has_fragment)
806 GET_URL_PART ('#', fragment);
811 if (uname_b != uname_e)
813 /* http://user:pass@host */
815 /* uname_b uname_e */
816 if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
818 error_code = PE_INVALID_USER_NAME;
823 u = xnew0 (struct url);
825 u->host = strdupdelim (host_b, host_e);
830 u->path = strdupdelim (path_b, path_e);
831 path_modified = path_simplify (u->path);
832 split_path (u->path, &u->dir, &u->file);
834 host_modified = lowercase_str (u->host);
836 /* Decode %HH sequences in host name. This is important not so much
837 to support %HH sequences in host names (which other browsers
838 don't), but to support binary characters (which will have been
839 converted to %HH by reencode_escapes). */
840 if (strchr (u->host, '%'))
842 url_unescape (u->host);
843 host_modified = true;
847 u->params = strdupdelim (params_b, params_e);
849 u->query = strdupdelim (query_b, query_e);
851 u->fragment = strdupdelim (fragment_b, fragment_e);
853 if (path_modified || u->fragment || host_modified || path_b == path_e)
855 /* If we suspect that a transformation has rendered what
856 url_string might return different from URL_ENCODED, rebuild
857 u->url using url_string. */
858 u->url = url_string (u, URL_AUTH_SHOW);
860 if (url_encoded != url)
861 xfree ((char *) url_encoded);
865 if (url_encoded == url)
866 u->url = xstrdup (url);
868 u->url = url_encoded;
874 /* Cleanup in case of error: */
875 if (url_encoded && url_encoded != url)
878 /* Transmit the error code to the caller, if the caller wants to
885 /* Return the error message string from ERROR_CODE, which should have
886 been retrieved from url_parse. The error message is translated. */
889 url_error (int error_code)
891 assert (error_code >= 0 && error_code < countof (parse_errors));
892 return _(parse_errors[error_code]);
895 /* Split PATH into DIR and FILE. PATH comes from the URL and is
896 expected to be URL-escaped.
898 The path is split into directory (the part up to the last slash)
899 and file (the part after the last slash), which are subsequently
903 "foo/bar/baz" "foo/bar" "baz"
904 "foo/bar/" "foo/bar" ""
906 "foo/bar/baz%2fqux" "foo/bar" "baz/qux" (!)
908 DIR and FILE are freshly allocated. */
911 split_path (const char *path, char **dir, char **file)
913 char *last_slash = strrchr (path, '/');
917 *file = xstrdup (path);
921 *dir = strdupdelim (path, last_slash);
922 *file = xstrdup (last_slash + 1);
925 url_unescape (*file);
928 /* Note: URL's "full path" is the path with the query string and
929 params appended. The "fragment" (#foo) is intentionally ignored,
930 but that might be changed. For example, if the original URL was
931 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
932 the full path will be "/foo/bar/baz;bullshit?querystring". */
934 /* Return the length of the full path, without the terminating
938 full_path_length (const struct url *url)
942 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
953 /* Write out the full path. */
956 full_path_write (const struct url *url, char *where)
958 #define FROB(el, chr) do { \
959 char *f_el = url->el; \
961 int l = strlen (f_el); \
963 memcpy (where, f_el, l); \
975 /* Public function for getting the "full path". E.g. if u->path is
976 "foo/bar" and u->query is "param=value", full_path will be
977 "/foo/bar?param=value". */
980 url_full_path (const struct url *url)
982 int length = full_path_length (url);
983 char *full_path = xmalloc (length + 1);
985 full_path_write (url, full_path);
986 full_path[length] = '\0';
991 /* Unescape CHR in an otherwise escaped STR. Used to selectively undo
992 the escaping of certain characters, such as "/" and ":". Returns a
993 count of unescaped chars. */
996 unescape_single_char (char *str, char chr)
998 const char c1 = XNUM_TO_DIGIT (chr >> 4);
999 const char c2 = XNUM_TO_DIGIT (chr & 0xf);
1000 char *h = str; /* hare */
1001 char *t = str; /* tortoise */
1002 for (; *h; h++, t++)
1004 if (h[0] == '%' && h[1] == c1 && h[2] == c2)
1015 /* Escape unsafe and reserved characters, except for the slash
1019 url_escape_dir (const char *dir)
1021 char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1025 unescape_single_char (newdir, '/');
1029 /* Sync u->path and u->url with u->dir and u->file. Called after
1030 u->file or u->dir have been changed, typically by the FTP code. */
1033 sync_path (struct url *u)
1035 char *newpath, *efile, *edir;
1039 /* u->dir and u->file are not escaped. URL-escape them before
1040 reassembling them into u->path. That way, if they contain
1041 separators like '?' or even if u->file contains slashes, the
1042 path will be correctly assembled. (u->file can contain slashes
1043 if the URL specifies it with %2f, or if an FTP server returns
1045 edir = url_escape_dir (u->dir);
1046 efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1049 newpath = xstrdup (efile);
1052 int dirlen = strlen (edir);
1053 int filelen = strlen (efile);
1055 /* Copy "DIR/FILE" to newpath. */
1056 char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1057 memcpy (p, edir, dirlen);
1060 memcpy (p, efile, filelen);
1069 if (efile != u->file)
1072 /* Regenerate u->url as well. */
1074 u->url = url_string (u, URL_AUTH_SHOW);
1077 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1078 This way we can sync u->path and u->url when they get changed. */
1081 url_set_dir (struct url *url, const char *newdir)
1084 url->dir = xstrdup (newdir);
1089 url_set_file (struct url *url, const char *newfile)
1092 url->file = xstrdup (newfile);
1097 url_free (struct url *url)
1103 xfree_null (url->params);
1104 xfree_null (url->query);
1105 xfree_null (url->fragment);
1106 xfree_null (url->user);
1107 xfree_null (url->passwd);
1115 /* Create all the necessary directories for PATH (a file). Calls
1116 make_directory internally. */
1118 mkalldirs (const char *path)
1125 p = path + strlen (path);
1126 for (; *p != '/' && p != path; p--)
1129 /* Don't create if it's just a file. */
1130 if ((p == path) && (*p != '/'))
1132 t = strdupdelim (path, p);
1134 /* Check whether the directory exists. */
1135 if ((stat (t, &st) == 0))
1137 if (S_ISDIR (st.st_mode))
1144 /* If the dir exists as a file name, remove it first. This
1145 is *only* for Wget to work with buggy old CERN http
1146 servers. Here is the scenario: When Wget tries to
1147 retrieve a directory without a slash, e.g.
1148 http://foo/bar (bar being a directory), CERN server will
1149 not redirect it to http://foo/bar/ -- it will generate a
1150 directory listing containing links to bar/file1,
1151 bar/file2, etc. Wget will lose because it saves this
1152 HTML listing to a file `bar', so it cannot create the
1153 directory. To work around this, if the file of the same
1154 name exists, we just remove it and create the directory
1156 DEBUGP (("Removing %s because of directory danger!\n", t));
1160 res = make_directory (t);
1162 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1167 /* Functions for constructing the file name out of URL components. */
1169 /* A growable string structure, used by url_file_name and friends.
1170 This should perhaps be moved to utils.c.
1172 The idea is to have a convenient and efficient way to construct a
1173 string by having various functions append data to it. Instead of
1174 passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1175 functions in question, we pass the pointer to this struct. */
1183 /* Ensure that the string can accept APPEND_COUNT more characters past
1184 the current TAIL position. If necessary, this will grow the string
1185 and update its allocated size. If the string is already large
1186 enough to take TAIL+APPEND_COUNT characters, this does nothing. */
1187 #define GROW(g, append_size) do { \
1188 struct growable *G_ = g; \
1189 DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \
1192 /* Return the tail position of the string. */
1193 #define TAIL(r) ((r)->base + (r)->tail)
1195 /* Move the tail position by APPEND_COUNT characters. */
1196 #define TAIL_INCR(r, append_count) ((r)->tail += append_count)
1198 /* Append the string STR to DEST. NOTICE: the string in DEST is not
1202 append_string (const char *str, struct growable *dest)
1204 int l = strlen (str);
1206 memcpy (TAIL (dest), str, l);
1207 TAIL_INCR (dest, l);
1210 /* Append CH to DEST. For example, append_char (0, DEST)
1211 zero-terminates DEST. */
1214 append_char (char ch, struct growable *dest)
1218 TAIL_INCR (dest, 1);
1222 filechr_not_unix = 1, /* unusable on Unix, / and \0 */
1223 filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */
1224 filechr_control = 4 /* a control character, e.g. 0-31 */
1227 #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
1229 /* Shorthands for the table: */
1230 #define U filechr_not_unix
1231 #define W filechr_not_windows
1232 #define C filechr_control
1237 /* Table of characters unsafe under various conditions (see above).
1239 Arguably we could also claim `%' to be unsafe, since we use it as
1240 the escape character. If we ever want to be able to reliably
1241 translate file name back to URL, this would become crucial.
1242 Right now, it's better to be minimal in escaping. */
1244 static const unsigned char filechr_table[256] =
1246 UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */
1247 C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */
1248 C, C, C, C, C, C, C, C, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
1249 C, C, C, C, C, C, C, C, /* CAN EM SUB ESC FS GS RS US */
1250 0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */
1251 0, 0, W, 0, 0, 0, 0, UW, /* ( ) * + , - . / */
1252 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
1253 0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? */
1254 0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
1255 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
1256 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
1257 0, 0, 0, 0, W, 0, 0, 0, /* X Y Z [ \ ] ^ _ */
1258 0, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
1259 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
1260 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
1261 0, 0, 0, 0, W, 0, 0, C, /* x y z { | } ~ DEL */
1263 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */
1264 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */
1265 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1266 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1268 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1269 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1270 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1271 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1279 /* FN_PORT_SEP is the separator between host and port in file names
1280 for non-standard port numbers. On Unix this is normally ':', as in
1281 "www.xemacs.org:4001/index.html". Under Windows, we set it to +
1282 because Windows can't handle ':' in file names. */
1283 #define FN_PORT_SEP (opt.restrict_files_os != restrict_windows ? ':' : '+')
1285 /* FN_QUERY_SEP is the separator between the file name and the URL
1286 query, normally '?'. Since Windows cannot handle '?' as part of
1287 file name, we use '@' instead there. */
1288 #define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
1290 /* Quote path element, characters in [b, e), as file name, and append
1291 the quoted string to DEST. Each character is quoted as per
1292 file_unsafe_char and the corresponding table.
1294 If ESCAPED is true, the path element is considered to be
1295 URL-escaped and will be unescaped prior to inspection. */
1298 append_uri_pathel (const char *b, const char *e, bool escaped,
1299 struct growable *dest)
1305 if (opt.restrict_files_os == restrict_unix)
1306 mask = filechr_not_unix;
1308 mask = filechr_not_windows;
1309 if (opt.restrict_files_ctrl)
1310 mask |= filechr_control;
1312 /* Copy [b, e) to PATHEL and URL-unescape it. */
1316 BOUNDED_TO_ALLOCA (b, e, unescaped);
1317 url_unescape (unescaped);
1319 e = unescaped + strlen (unescaped);
1322 /* Defang ".." when found as component of path. Remember that path
1323 comes from the URL and might contain malicious input. */
1324 if (e - b == 2 && b[0] == '.' && b[1] == '.')
1330 /* Walk the PATHEL string and check how many characters we'll need
1333 for (p = b; p < e; p++)
1334 if (FILE_CHAR_TEST (*p, mask))
1337 /* Calculate the length of the output string. e-b is the input
1338 string length. Each quoted char introduces two additional
1339 characters in the string, hence 2*quoted. */
1340 outlen = (e - b) + (2 * quoted);
1341 GROW (dest, outlen);
1345 /* If there's nothing to quote, we can simply append the string
1346 without processing it again. */
1347 memcpy (TAIL (dest), b, outlen);
1351 char *q = TAIL (dest);
1352 for (p = b; p < e; p++)
1354 if (!FILE_CHAR_TEST (*p, mask))
1358 unsigned char ch = *p;
1360 *q++ = XNUM_TO_DIGIT (ch >> 4);
1361 *q++ = XNUM_TO_DIGIT (ch & 0xf);
1364 assert (q - TAIL (dest) == outlen);
1367 /* Perform inline case transformation if required. */
1368 if (opt.restrict_files_case == restrict_lowercase
1369 || opt.restrict_files_case == restrict_uppercase)
1372 for (q = TAIL (dest); *q; ++q)
1374 if (opt.restrict_files_case == restrict_lowercase)
1381 TAIL_INCR (dest, outlen);
1384 /* Append to DEST the directory structure that corresponds the
1385 directory part of URL's path. For example, if the URL is
1386 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1388 Each path element ("dir1" and "dir2" in the above example) is
1389 examined, url-unescaped, and re-escaped as file name element.
1391 Additionally, it cuts as many directories from the path as
1392 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1393 will produce "bar" for the above example. For 2 or more, it will
1396 Each component of the path is quoted for use as file name. */
1399 append_dir_structure (const struct url *u, struct growable *dest)
1401 char *pathel, *next;
1402 int cut = opt.cut_dirs;
1404 /* Go through the path components, de-URL-quote them, and quote them
1405 (if necessary) as file names. */
1408 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1413 /* Ignore empty pathels. */
1417 append_char ('/', dest);
1418 append_uri_pathel (pathel, next, true, dest);
1422 /* Return a unique file name that matches the given URL as good as
1423 possible. Does not create directories on the file system. */
1426 url_file_name (const struct url *u)
1428 struct growable fnres; /* stands for "file name result" */
1430 const char *u_file, *u_query;
1431 char *fname, *unique;
1437 /* Start with the directory prefix, if specified. */
1439 append_string (opt.dir_prefix, &fnres);
1441 /* If "dirstruct" is turned on (typically the case with -r), add
1442 the host and port (unless those have been turned off) and
1443 directory structure. */
1446 if (opt.protocol_directories)
1449 append_char ('/', &fnres);
1450 append_string (supported_schemes[u->scheme].name, &fnres);
1452 if (opt.add_hostdir)
1455 append_char ('/', &fnres);
1456 if (0 != strcmp (u->host, ".."))
1457 append_string (u->host, &fnres);
1459 /* Host name can come from the network; malicious DNS may
1460 allow ".." to be resolved, causing us to write to
1461 "../<file>". Defang such host names. */
1462 append_string ("%2E%2E", &fnres);
1463 if (u->port != scheme_default_port (u->scheme))
1466 number_to_string (portstr, u->port);
1467 append_char (FN_PORT_SEP, &fnres);
1468 append_string (portstr, &fnres);
1472 append_dir_structure (u, &fnres);
1475 /* Add the file name. */
1477 append_char ('/', &fnres);
1478 u_file = *u->file ? u->file : "index.html";
1479 append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1481 /* Append "?query" to the file name. */
1482 u_query = u->query && *u->query ? u->query : NULL;
1485 append_char (FN_QUERY_SEP, &fnres);
1486 append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);
1489 /* Zero-terminate the file name. */
1490 append_char ('\0', &fnres);
1494 /* Check the cases in which the unique extensions are not used:
1495 1) Clobbering is turned off (-nc).
1496 2) Retrieval with regetting.
1497 3) Timestamping is used.
1498 4) Hierarchy is built.
1500 The exception is the case when file does exist and is a
1501 directory (see `mkalldirs' for explanation). */
1503 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1504 && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1507 unique = unique_name (fname, true);
1508 if (unique != fname)
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH and return true if PATH has been modified, false otherwise.

   The algorithm is in spirit similar to the one described in rfc1808,
   although implemented differently, in one pass.  To recap, path
   elements containing only "." are removed, and ".." is taken to mean
   "back up one element".  A ".." that would back up past the start of
   PATH is simply dropped.  Runs of slashes are left alone.

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below.  If you change anything in this
   function, run test_path_simplify to make sure you haven't broken a
   test case.  */

static bool
path_simplify (char *path)
{
  char *h = path;               /* hare: read position */
  char *t = path;               /* tortoise: write position */
  char *end = strchr (path, '\0');

  while (h < end)
    {
      /* Hare should be at the beginning of a path element. */

      if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
        {
          /* Ignore "./".  When the "." is the last element, H ends
             up one past the terminator, which also ends the loop.  */
          h += 2;
        }
      else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
        {
          /* Handle "../" by retreating the tortoise by one path
             element -- but not past beginning.  */
          if (t > path)
            {
              /* Move backwards until T hits the beginning of the
                 previous path element or the beginning of path. */
              for (--t; t > path && t[-1] != '/'; t--)
                ;
            }
          h += 3;
        }
      else
        {
          /* A regular path element.  If H hasn't advanced past T,
             simply skip to the next path element.  Otherwise, copy
             the path element until the next slash.  */
          if (t == h)
            {
              /* Skip the path element, including the slash.  */
              while (h < end && *h != '/')
                t++, h++;
              if (h < end)
                t++, h++;
            }
          else
            {
              /* Copy the path element, including the final slash.  */
              while (h < end && *h != '/')
                *t++ = *h++;
              if (h < end)
                *t++ = *h++;
            }
        }
    }

  /* T trails H exactly when something was removed; in that case the
     shortened string still needs its terminator.  */
  if (t != h)
    *t = '\0';

  return t != h;
}
1586 /* Return the length of URL's path. Path is considered to be
1587 terminated by one or more of the ?query or ;params or #fragment,
1588 depending on the scheme. */
1591 path_end (const char *url)
1593 enum url_scheme scheme = url_scheme (url);
1595 if (scheme == SCHEME_INVALID)
1596 scheme = SCHEME_HTTP; /* use http semantics for rel links */
1597 /* +2 to ignore the first two separators ':' and '/' */
1598 seps = init_seps (scheme) + 2;
1599 return strpbrk_or_eos (url, seps);
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  */
#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))

/* Return a freshly allocated, NUL-terminated string consisting of
   the first SPAN bytes of BASE followed by the LINKLENGTH bytes of
   LINK.  Shared tail of every uri_merge case.  */

static char *
uri_merge_splice (const char *base, int span,
                  const char *link, int linklength)
{
  char *merge = xmalloc (span + linklength + 1);
  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';
  return merge;
}

/* Merge BASE with LINK and return the resulting URI.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   The result is always newly allocated (xstrdup/xmalloc).

   I briefly considered making this function call path_simplify after
   the merging process, as rfc1738 seems to suggest.  This is a bad
   idea for several reasons: 1) it complexifies the code, and 2)
   url_parse has to simplify path anyway, so it's wasteful to boot.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;
  char *merge;

  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = path_end (base);
  linklength = strlen (link);

  if (!*link)
    {
      /* Empty LINK points back to BASE, query string and all. */
      return xstrdup (base);
    }
  else if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      int baselength = end - base;
      merge = uri_merge_splice (base, baselength, link, linklength);
    }
  else if (*link == '#')
    {
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      int baselength;
      const char *end1 = strchr (base, '#');
      if (!end1)
        end1 = base + strlen (base);
      baselength = end1 - base;
      merge = uri_merge_splice (base, baselength, link, linklength);
    }
  else if (*link == '/' && *(link + 1) == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      int span;
      const char *slash;
      const char *start_insert;

      /* Look for first slash. */
      slash = memchr (base, '/', end - base);
      /* If found slash and it is a double slash, then replace
         from this point, else default to replacing from the
         beginning.  */
      if (slash && *(slash + 1) == '/')
        start_insert = slash;
      else
        start_insert = base;

      span = start_insert - base;
      merge = uri_merge_splice (base, span, link, linklength);
    }
  else if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      int span;
      const char *slash;
      const char *start_insert = NULL; /* for gcc to shut up. */
      const char *pos = base;
      bool seen_slash_slash = false;
      /* We're looking for the first slash, but want to ignore
         double slash. */
    again:
      slash = memchr (pos, '/', end - pos);
      if (slash && !seen_slash_slash)
        if (*(slash + 1) == '/')
          {
            pos = slash + 2;
            seen_slash_slash = true;
            goto again;
          }

      /* At this point, SLASH is the location of the first / after
         "//", or the first slash altogether.  START_INSERT is the
         pointer to the location where LINK will be inserted.  When
         examining the last two examples, keep in mind that LINK
         begins with '/'. */

      if (!slash && !seen_slash_slash)
        /* example: "foo" */
        /*           ^    */
        start_insert = base;
      else if (!slash && seen_slash_slash)
        /* example: "http://foo" */
        /*                     ^ */
        start_insert = end;
      else if (slash && !seen_slash_slash)
        /* example: "foo/bar" */
        /*           ^        */
        start_insert = base;
      else if (slash && seen_slash_slash)
        /* example: "http://something/" */
        /*                           ^  */
        start_insert = slash;

      span = start_insert - base;
      merge = uri_merge_splice (base, span, link, linklength);
    }
  else
    {
      /* LINK is a relative URL: we need to replace everything
         after last slash (possibly empty) with LINK.

         So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
         our result should be "whatever/foo/qux/xyzzy".  */
      bool need_explicit_slash = false;
      int span;
      const char *start_insert;
      const char *last_slash = find_last_char (base, end, '/');
      if (!last_slash)
        {
          /* No slash found at all.  Replace what we have with LINK. */
          start_insert = base;
        }
      else if (last_slash && last_slash >= base + 2
               && last_slash[-2] == ':' && last_slash[-1] == '/')
        {
          /* example: http://host"  */
          /*                      ^ */
          start_insert = end + 1;
          need_explicit_slash = true;
        }
      else
        {
          /* example: "whatever/foo/bar" */
          /*                        ^    */
          start_insert = last_slash + 1;
        }

      span = start_insert - base;
      merge = uri_merge_splice (base, span, link, linklength);
      /* In the "http://host" case SPAN reaches one byte past END;
         that byte (copied from BASE) becomes the separating '/'.  */
      if (need_explicit_slash)
        merge[span - 1] = '/';
    }

  return merge;
}
/* Copy the NUL-terminated string S (without its terminator) to P and
   advance P past the copied bytes.  P must be a modifiable pointer
   lvalue; it is evaluated more than once.  */
#define APPEND(p, s) do {                       \
  int len = strlen (s);                         \
  memcpy (p, s, len);                           \
  p += len;                                     \
} while (0)
1803 /* Use this instead of password when the actual password is supposed
1804 to be hidden. We intentionally use a generic string without giving
1805 away the number of characters in the password, like previous
1807 #define HIDDEN_PASSWORD "*password*"
1809 /* Recreate the URL string from the data in URL.
1811 If HIDE is true (as it is when we're calling this on a URL we plan
1812 to print, but not when calling it to canonicalize a URL for use
1813 within the program), password will be hidden. Unsafe characters in
1814 the URL will be quoted. */
1817 url_string (const struct url *url, enum url_auth_mode auth_mode)
1821 char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1823 int scheme_port = supported_schemes[url->scheme].default_port;
1824 const char *scheme_str = supported_schemes[url->scheme].leading_string;
1825 int fplen = full_path_length (url);
1827 bool brackets_around_host;
1829 assert (scheme_str != NULL);
1831 /* Make sure the user name and password are quoted. */
1834 if (auth_mode != URL_AUTH_HIDE)
1836 quoted_user = url_escape_allow_passthrough (url->user);
1839 if (auth_mode == URL_AUTH_HIDE_PASSWD)
1840 quoted_passwd = HIDDEN_PASSWORD;
1842 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1847 /* In the unlikely event that the host name contains non-printable
1848 characters, quote it for displaying to the user. */
1849 quoted_host = url_escape_allow_passthrough (url->host);
1851 /* Undo the quoting of colons that URL escaping performs. IPv6
1852 addresses may legally contain colons, and in that case must be
1853 placed in square brackets. */
1854 if (quoted_host != url->host)
1855 unescape_single_char (quoted_host, ':');
1856 brackets_around_host = strchr (quoted_host, ':') != NULL;
1858 size = (strlen (scheme_str)
1859 + strlen (quoted_host)
1860 + (brackets_around_host ? 2 : 0)
1863 if (url->port != scheme_port)
1864 size += 1 + numdigit (url->port);
1867 size += 1 + strlen (quoted_user);
1869 size += 1 + strlen (quoted_passwd);
1872 p = result = xmalloc (size);
1874 APPEND (p, scheme_str);
1877 APPEND (p, quoted_user);
1881 APPEND (p, quoted_passwd);
1886 if (brackets_around_host)
1888 APPEND (p, quoted_host);
1889 if (brackets_around_host)
1891 if (url->port != scheme_port)
1894 p = number_to_string (p, url->port);
1897 full_path_write (url, p);
1901 assert (p - result == size);
1903 if (quoted_user && quoted_user != url->user)
1904 xfree (quoted_user);
1905 if (quoted_passwd && auth_mode == URL_AUTH_SHOW
1906 && quoted_passwd != url->passwd)
1907 xfree (quoted_passwd);
1908 if (quoted_host != url->host)
1909 xfree (quoted_host);
1914 /* Return true if scheme a is similar to scheme b.
1916 Schemes are similar if they are equal. If SSL is supported, schemes
1917 are also similar if one is http (SCHEME_HTTP) and the other is https
1920 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1925 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1926 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
/* Return the value (0-15) of the ASCII hexadecimal digit CH, or -1
   if CH is not a hexadecimal digit.  Locale-independent.  */
static int
escaped_hex_value (char ch)
{
  if (ch >= '0' && ch <= '9')
    return ch - '0';
  if (ch >= 'a' && ch <= 'f')
    return ch - 'a' + 10;
  if (ch >= 'A' && ch <= 'F')
    return ch - 'A' + 10;
  return -1;
}

/* Decode the character at the start of the non-empty, possibly
   URL-escaped string STR and store it in *C.

   Returns the number of bytes of STR the character occupies:
     3  for a "%XX" escape (XX being two hex digits);
     1  for an ordinary character, for "%%" (stored as '%'), and for
        a '%' followed by two characters that are not both hex
        digits (the '%' is then taken literally);
     0  if STR is a truncated escape, i.e. "%" or "%X" at the very
        end of the string.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  const char *p = str;
  int hi, lo;

  assert (str && *str);
  assert (c);

  if (p[0] != '%')
    {
      *c = p[0];
      return 1;
    }

  if (p[1] == 0)
    return 0; /* error: invalid string */

  if (p[1] == '%')
    {
      *c = '%';
      return 1;
    }

  if (p[2] == 0)
    return 0; /* error: invalid string */

  /* Never turn non-hex characters into a number: doing so yields an
     arbitrary byte that may spuriously match an unrelated
     character in the other URL.  */
  hi = escaped_hex_value (p[1]);
  lo = escaped_hex_value (p[2]);
  if (hi < 0 || lo < 0)
    {
      *c = '%';
      return 1;
    }

  *c = (char) ((hi << 4) | lo);
  return 3;
}
/* Return true if the URL strings U1 and U2 denote the same sequence
   of characters once each is read through
   getchar_from_escaped_string (so "%7e" matches "~"), comparing
   characters case-insensitively.  Returns false when either argument
   is NULL or when a malformed escape stops the comparison before
   both strings are exhausted.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *p, *q;
  int pp, qq;
  char ch1, ch2;

  if (!u1 || !u2)
    return false;

  p = u1;
  q = u2;

  /* Both strings must be non-empty at the current position before
     decoding; a decoder result of 0 (invalid escape) ends the walk.  */
  while (*p && *q
         && (pp = getchar_from_escaped_string (p, &ch1))
         && (qq = getchar_from_escaped_string (q, &ch2))
         && (TOLOWER(ch1) == TOLOWER(ch2)))
    {
      p += pp;
      q += qq;
    }

  /* Equal only if both strings were consumed completely.  */
  return *p == 0 && *q == 0;
}
1991 /* Debugging and testing support for path_simplify. */
1993 /* Debug: run path_simplify on PATH and return the result in a new
1994 string. Useful for calling from the debugger. */
/* The simplification is applied to an xstrdup'ed copy, so PATH itself
   is not modified by this helper.  */
1998 char *copy = xstrdup (path);
1999 path_simplify (copy);
/* run_test: simplify a copy of TEST with path_simplify and print a
   diagnostic when the resulting string differs from EXPECTED_RESULT,
   or when the "modified" flag returned by path_simplify differs from
   EXPECTED_CHANGE.  */
2004 run_test (char *test, char *expected_result, bool expected_change)
2006 char *test_copy = xstrdup (test);
2007 bool modified = path_simplify (test_copy);
/* First check: the simplified text itself.  */
2009 if (0 != strcmp (test_copy, expected_result))
2011 printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2012 test, expected_result, test_copy);
/* Second check: whether a modification was reported.  */
2014 if (modified != expected_change)
2016 if (expected_change)
2017 printf ("Expected modification with path_simplify(\"%s\").\n",
2020 printf ("Expected no modification with path_simplify(\"%s\").\n",
/* Table-driven check of path_simplify.  Each table entry lists an
   input path, the expected simplified path, and whether path_simplify
   is expected to report a modification; every entry is handed to
   run_test in turn.  */
2027 test_path_simplify (void)
2030 char *test, *result;
2037 { "../", "", true },
2038 { "foo", "foo", false },
2039 { "foo/bar", "foo/bar", false },
2040 { "foo///bar", "foo///bar", false },
2041 { "foo/.", "foo/", true },
2042 { "foo/./", "foo/", true },
2043 { "foo./", "foo./", false },
2044 { "foo/../bar", "bar", true },
2045 { "foo/../bar/", "bar/", true },
2046 { "foo/bar/..", "foo/", true },
2047 { "foo/bar/../x", "foo/x", true },
2048 { "foo/bar/../x/", "foo/x/", true },
2049 { "foo/..", "", true },
2050 { "foo/../..", "", true },
2051 { "foo/../../..", "", true },
2052 { "foo/../../bar/../../baz", "baz", true },
2053 { "a/b/../../c", "c", true },
2054 { "./a/../b", "b", true }
/* Run every table entry through run_test.  */
2058 for (i = 0; i < countof (tests); i++)
2060 char *test = tests[i].test;
2061 char *expected_result = tests[i].result;
2062 bool expected_change = tests[i].should_modify;
2063 run_test (test, expected_result, expected_change);
/* Unit test for append_uri_pathel.  For each table entry, a
   zero-initialized growable is seeded with ORIGINAL_URL, INPUT is
   appended through append_uri_pathel (with the entry's ESCAPED flag),
   and the buffer is compared against EXPECTED_RESULT via mu_assert.  */
2071 test_append_uri_pathel()
2078 char *expected_result;
2080 { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2083 for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2085 struct growable dest;
2086 const char *p = test_array[i].input;
/* Start from an empty growable.  */
2088 memset (&dest, 0, sizeof (dest));
2090 append_string (test_array[i].original_url, &dest);
2091 append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2093 mu_assert ("test_append_uri_pathel: wrong result",
2094 strcmp (dest.base, test_array[i].expected_result) == 0);
/* Unit test for are_urls_equal.  Each table entry pairs two URLs with
   the expected verdict; the last entry checks that an escaped
   character ("%7e") compares equal to its literal form ("~").  */
2101 test_are_urls_equal()
2107 bool expected_result;
2109 { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/", true },
2110 { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
2111 { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/", false },
2112 { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/", true },
2115 for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2117 mu_assert ("test_are_urls_equal: wrong result",
2118 are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result);
2124 #endif /* TESTING */