Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003
3 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or (at
10 your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 In addition, as a special exception, the Free Software Foundation
22 gives permission to link the code of its release of Wget with the
23 OpenSSL project's "OpenSSL" library (or with modified versions of it
24 that use the same license as the "OpenSSL" library), and distribute
25 the linked executables. You must obey the GNU General Public License
26 in all respects for all of the code used other than "OpenSSL". If you
27 modify this file, you may extend this exception to your version of the
28 file, but you are not obligated to do so. If you do not wish to do
29 so, delete this exception statement from your version. */
40 #include <sys/types.h>
58 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
60 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
62 static const int NS_INADDRSZ = 4;
63 static const int NS_IN6ADDRSZ = 16;
64 static const int NS_INT16SZ = 2;
74 /* Supported schemes: */
75 static struct scheme_data supported_schemes[] =
77 { "http://", DEFAULT_HTTP_PORT, 1 },
79 { "https://", DEFAULT_HTTPS_PORT, 1 },
81 { "ftp://", DEFAULT_FTP_PORT, 1 },
87 /* Forward declarations: */
89 static char *construct_relative PARAMS ((const char *, const char *));
90 static int path_simplify PARAMS ((char *));
94 /* Support for encoding and decoding of URL strings. We determine
95 whether a character is unsafe through static table lookup. This
96 code assumes ASCII character set and 8-bit chars. */
99 /* rfc1738 reserved chars, preserved from encoding. */
102 /* rfc1738 unsafe chars, plus some more. */
106 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
107 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
108 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
110 /* Shorthands for the table: */
111 #define R urlchr_reserved
112 #define U urlchr_unsafe
115 const static unsigned char urlchr_table[256] =
117 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
118 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
119 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
120 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
121 U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
122 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
123 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
124 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
125 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
126 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
127 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
128 0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
129 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
130 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
131 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
132 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
134 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
135 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
136 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
137 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
139 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
140 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
141 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
142 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
148 /* Decodes the forms %xy in a URL to the character the hexadecimal
149 code of which is xy. xy are hexadecimal digits from
150 [0123456789ABCDEF] (case-insensitive). If x or y are not
151 hex-digits or `%' precedes `\0', the sequence is inserted
155 url_unescape (char *s)
157 char *t = s; /* t - tortoise */
158 char *h = s; /* h - hare */
169 /* Do nothing if '%' is not followed by two hex digits. */
170 if (!*(h + 1) || !*(h + 2)
171 || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
173 *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
180 /* Like url_escape, but return S if there are no unsafe chars. */
183 url_escape_allow_passthrough (const char *s)
190 for (p1 = s; *p1; p1++)
191 if (URL_UNSAFE_CHAR (*p1))
192 addition += 2; /* Two more characters (hex digits) */
197 newlen = (p1 - s) + addition;
198 newstr = (char *)xmalloc (newlen + 1);
204 if (URL_UNSAFE_CHAR (*p1))
206 unsigned char c = *p1++;
208 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
209 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
215 assert (p2 - newstr == newlen);
220 /* Encode the unsafe characters (as determined by URL_UNSAFE_CHAR) in a
221 given string, returning a malloc-ed %XX encoded string. */
224 url_escape (const char *s)
226 char *encoded = url_escape_allow_passthrough (s);
233 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
234 the old value of PTR is freed and PTR is made to point to the newly
235 allocated storage. */
237 #define ENCODE(ptr) do { \
238 char *e_new = url_escape_allow_passthrough (ptr); \
246 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
248 /* Decide whether to encode, decode, or pass through the char at P.
249 This used to be a macro, but it got a little too convoluted. */
250 static inline enum copy_method
251 decide_copy_method (const char *p)
255 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
257 /* %xx sequence: decode it, unless it would decode to an
258 unsafe or a reserved char; in that case, leave it as
260 char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
261 XCHAR_TO_XDIGIT (*(p + 2));
263 if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
264 return CM_PASSTHROUGH;
269 /* Garbled %.. sequence: encode `%'. */
272 else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
275 return CM_PASSTHROUGH;
278 /* Translate a %-escaped (but possibly non-conformant) input string S
279 into a %-escaped (and conformant) output string. If no characters
280 are encoded or decoded, return the same string S; otherwise, return
281 a freshly allocated string with the new contents.
283 After a URL has been run through this function, the protocols that
284 use `%' as the quote character can use the resulting string as-is,
285 while those that don't call url_unescape() to get to the intended
286 data. This function is also stable: after an input string is
287 transformed the first time, all further transformations of the
288 result yield the same result string.
290 Let's discuss why this function is needed.
292 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
293 space character would mess up the HTTP request, it needs to be
296 GET /abc%20def HTTP/1.0
298 It appears that the unsafe chars need to be quoted, for example
299 with url_escape. But what if we're requested to download
300 `abc%20def'? url_escape transforms "%" to "%25", which would leave
301 us with `abc%2520def'. This is incorrect -- since %-escapes are
302 part of URL syntax, "%20" is the correct way to denote a literal
303 space on the Wget command line. This leaves us in the conclusion
304 that in that case Wget should not call url_escape, but leave the
307 And what if the requested URI is `abc%20 def'? If we call
308 url_escape, we end up with `/abc%2520%20def', which is almost
309 certainly not intended. If we don't call url_escape, we are left
310 with the embedded space and cannot complete the request. What the
311 user meant was for Wget to request `/abc%20%20def', and this is
312 where reencode_escapes kicks in.
314 Wget used to solve this by first decoding %-quotes, and then
315 encoding all the "unsafe" characters found in the resulting string.
316 This was wrong because it didn't preserve certain URL special
317 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
318 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
319 whether we considered `+' reserved (it is). One of these results
320 is inevitable because by the second step we would lose information
321 on whether the `+' was originally encoded or not. Both results
322 were wrong because in CGI parameters + means space, while %2B means
323 literal plus. reencode_escapes correctly translates the above to
324 "a%2B+b", i.e. returns the original string.
326 This function uses an algorithm proposed by Anon Sricharoenchai:
328 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
331 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
334 ...except that this code conflates the two steps, and decides
335 whether to encode, decode, or pass through each character in turn.
336 The function still uses two passes, but their logic is the same --
337 the first pass exists merely for the sake of allocation. Another
338 small difference is that we include `+' to URL_RESERVED.
342 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
344 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
348 "foo bar" -> "foo%20bar"
349 "foo%20bar" -> "foo%20bar"
350 "foo %20bar" -> "foo%20%20bar"
351 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
352 "foo%25%20bar" -> "foo%25%20bar"
353 "foo%2%20bar" -> "foo%252%20bar"
354 "foo+bar" -> "foo+bar" (plus is reserved!)
355 "foo%2b+bar" -> "foo%2b+bar" */
358 reencode_escapes (const char *s)
364 int encode_count = 0;
365 int decode_count = 0;
367 /* First, pass through the string to see if there's anything to do,
368 and to calculate the new length. */
369 for (p1 = s; *p1; p1++)
371 switch (decide_copy_method (p1))
384 if (!encode_count && !decode_count)
385 /* The string is good as it is. */
386 return (char *)s; /* C const model sucks. */
389 /* Each encoding adds two characters (hex digits), while each
390 decoding removes two characters. */
391 newlen = oldlen + 2 * (encode_count - decode_count);
392 newstr = xmalloc (newlen + 1);
399 switch (decide_copy_method (p1))
403 unsigned char c = *p1++;
405 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
406 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
410 *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
411 + (XCHAR_TO_XDIGIT (*(p1 + 2))));
412 p1 += 3; /* skip %xx */
419 assert (p2 - newstr == newlen);
423 /* Run PTR_VAR through reencode_escapes. If a new string is consed,
424 free PTR_VAR and make it point to the new storage. Obviously,
425 PTR_VAR needs to be an lvalue. */
427 #define REENCODE(ptr_var) do { \
428 char *rf_new = reencode_escapes (ptr_var); \
429 if (rf_new != ptr_var) \
436 /* Returns the scheme type if the scheme is supported, or
437 SCHEME_INVALID if not. */
439 url_scheme (const char *url)
443 for (i = 0; supported_schemes[i].leading_string; i++)
444 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
445 strlen (supported_schemes[i].leading_string)))
447 if (supported_schemes[i].enabled)
448 return (enum url_scheme) i;
450 return SCHEME_INVALID;
453 return SCHEME_INVALID;
456 /* Return the number of characters needed to skip the scheme part of
457 the URL, e.g. `http://'. If no scheme is found, returns 0. */
459 url_skip_scheme (const char *url)
463 /* Skip the scheme name. We allow `-' and `+' because of `whois++',
465 while (ISALNUM (*p) || *p == '-' || *p == '+')
472 /* Skip "//" if found. */
473 if (*p == '/' && *(p + 1) == '/')
479 /* Returns 1 if the URL begins with a scheme (supported or
480 unsupported), 0 otherwise. */
482 url_has_scheme (const char *url)
485 while (ISALNUM (*p) || *p == '-' || *p == '+')
491 scheme_default_port (enum url_scheme scheme)
493 return supported_schemes[scheme].default_port;
497 scheme_disable (enum url_scheme scheme)
499 supported_schemes[scheme].enabled = 0;
502 /* Skip the username and password, if present here. The function
503 should be called *not* with the complete URL, but with the part
504 right after the scheme.
506 If no username and password are found, return 0. */
508 url_skip_uname (const char *url)
512 /* Look for '@' that comes before '/' or '?'. */
513 p = (const char *)strpbrk (url, "/?@");
521 parse_uname (const char *str, int len, char **user, char **passwd)
526 /* Empty user name not allowed. */
529 colon = memchr (str, ':', len);
531 /* Empty user name again. */
536 int pwlen = len - (colon + 1 - str);
537 *passwd = xmalloc (pwlen + 1);
538 memcpy (*passwd, colon + 1, pwlen);
539 (*passwd)[pwlen] = '\0';
545 *user = xmalloc (len + 1);
546 memcpy (*user, str, len);
550 url_unescape (*user);
552 url_unescape (*passwd);
557 /* Used by main.c: detect URLs written using the "shorthand" URL forms
558 popularized by Netscape and NcFTP. HTTP shorthands look like this:
560 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
561 www.foo.com[:port] -> http://www.foo.com[:port]
563 FTP shorthands look like this:
565 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
566 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
568 If the URL needs not or cannot be rewritten, return NULL. */
570 rewrite_shorthand_url (const char *url)
574 if (url_has_scheme (url))
577 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
579 for (p = url; *p && *p != ':' && *p != '/'; p++)
589 /* If the characters after the colon and before the next slash
590 or end of string are all digits, it's HTTP. */
592 for (pp = p + 1; ISDIGIT (*pp); pp++)
594 if (digits > 0 && (*pp == '/' || *pp == '\0'))
597 /* Prepend "ftp://" to the entire URL... */
598 res = xmalloc (6 + strlen (url) + 1);
599 sprintf (res, "ftp://%s", url);
600 /* ...and replace ':' with '/'. */
601 res[6 + (p - url)] = '/';
608 /* Just prepend "http://" to what we have. */
609 res = xmalloc (7 + strlen (url) + 1);
610 sprintf (res, "http://%s", url);
615 static void parse_path PARAMS ((const char *, char **, char **));
617 /* Like strpbrk, with the exception that it returns the pointer to the
618 terminating zero (end-of-string aka "eos") if no matching character
622 strpbrk_or_eos (const char *s, const char *accept)
624 char *p = strpbrk (s, accept);
626 p = (char *)s + strlen (s);
630 /* Turn STR into lowercase; return non-zero if a character was
634 lowercase_str (char *str)
641 *str = TOLOWER (*str);
646 static char *parse_errors[] = {
647 #define PE_NO_ERROR 0
649 #define PE_UNSUPPORTED_SCHEME 1
650 "Unsupported scheme",
651 #define PE_EMPTY_HOST 2
653 #define PE_BAD_PORT_NUMBER 3
655 #define PE_INVALID_USER_NAME 4
657 #define PE_UNTERMINATED_IPV6_ADDRESS 5
658 "Unterminated IPv6 numeric address",
659 #define PE_IPV6_NOT_SUPPORTED 6
660 "IPv6 addresses not supported",
661 #define PE_INVALID_IPV6_ADDRESS 7
662 "Invalid IPv6 numeric address"
665 #define SETERR(p, v) do { \
671 /* The following two functions were adapted from glibc. */
674 is_valid_ipv4_address (const char *str, const char *end)
676 int saw_digit, octets;
686 if (ch >= '0' && ch <= '9') {
687 val = val * 10 + (ch - '0');
691 if (saw_digit == 0) {
696 } else if (ch == '.' && saw_digit == 1) {
711 is_valid_ipv6_address (const char *str, const char *end)
713 static const char xdigits[] = "0123456789abcdef";
726 /* Leading :: requires some special handling. */
730 if (str == end || *str != ':')
742 /* if ch is a number, add it to val. */
743 pch = strchr(xdigits, ch);
746 val |= (pch - xdigits);
753 /* if ch is a colon ... */
756 if (saw_xdigit == 0) {
761 } else if (str == end) {
764 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
772 /* if ch is a dot ... */
773 if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
774 is_valid_ipv4_address(curtok, end) == 1) {
783 if (saw_xdigit == 1) {
784 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
789 if (colonp != NULL) {
790 if (tp == NS_IN6ADDRSZ)
795 if (tp != NS_IN6ADDRSZ)
804 Return a new struct url if successful, NULL on error. In case of
805 error, and if ERROR is not NULL, also set *ERROR to the appropriate
808 url_parse (const char *url, int *error)
812 int path_modified, host_modified;
814 enum url_scheme scheme;
816 const char *uname_b, *uname_e;
817 const char *host_b, *host_e;
818 const char *path_b, *path_e;
819 const char *params_b, *params_e;
820 const char *query_b, *query_e;
821 const char *fragment_b, *fragment_e;
824 char *user = NULL, *passwd = NULL;
828 scheme = url_scheme (url);
829 if (scheme == SCHEME_INVALID)
831 SETERR (error, PE_UNSUPPORTED_SCHEME);
835 url_encoded = reencode_escapes (url);
838 p += strlen (supported_schemes[scheme].leading_string);
840 p += url_skip_uname (p);
843 /* scheme://user:pass@host[:port]... */
846 /* We attempt to break down the URL into the components path,
847 params, query, and fragment. They are ordered like this:
849 scheme://host[:port][/path][;params][?query][#fragment] */
851 params_b = params_e = NULL;
852 query_b = query_e = NULL;
853 fragment_b = fragment_e = NULL;
859 /* Handle IPv6 address inside square brackets. Ideally we'd
860 just look for the terminating ']', but rfc2732 mandates
861 rejecting invalid IPv6 addresses. */
863 /* The address begins after '['. */
865 host_e = strchr (host_b, ']');
869 SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
874 /* Check if the IPv6 address is valid. */
875 if (!is_valid_ipv6_address(host_b, host_e))
877 SETERR (error, PE_INVALID_IPV6_ADDRESS);
881 /* Continue parsing after the closing ']'. */
884 SETERR (error, PE_IPV6_NOT_SUPPORTED);
890 p = strpbrk_or_eos (p, ":/;?#");
894 if (host_b == host_e)
896 SETERR (error, PE_EMPTY_HOST);
900 port = scheme_default_port (scheme);
903 const char *port_b, *port_e, *pp;
905 /* scheme://host:port/tralala */
909 p = strpbrk_or_eos (p, "/;?#");
912 if (port_b == port_e)
914 /* http://host:/whatever */
916 SETERR (error, PE_BAD_PORT_NUMBER);
920 for (port = 0, pp = port_b; pp < port_e; pp++)
924 /* http://host:12randomgarbage/blah */
926 SETERR (error, PE_BAD_PORT_NUMBER);
930 port = 10 * port + (*pp - '0');
938 p = strpbrk_or_eos (p, ";?#");
943 /* Path is not allowed not to exist. */
951 p = strpbrk_or_eos (p, "?#");
958 p = strpbrk_or_eos (p, "#");
961 /* Hack that allows users to use '?' (a wildcard character) in
962 FTP URLs without it being interpreted as a query string
964 if (scheme == SCHEME_FTP)
966 query_b = query_e = NULL;
979 if (uname_b != uname_e)
981 /* http://user:pass@host */
983 /* uname_b uname_e */
984 if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
986 SETERR (error, PE_INVALID_USER_NAME);
991 u = (struct url *)xmalloc (sizeof (struct url));
992 memset (u, 0, sizeof (*u));
995 u->host = strdupdelim (host_b, host_e);
1000 u->path = strdupdelim (path_b, path_e);
1001 path_modified = path_simplify (u->path);
1002 parse_path (u->path, &u->dir, &u->file);
1004 host_modified = lowercase_str (u->host);
1007 u->params = strdupdelim (params_b, params_e);
1009 u->query = strdupdelim (query_b, query_e);
1011 u->fragment = strdupdelim (fragment_b, fragment_e);
1013 if (path_modified || u->fragment || host_modified || path_b == path_e)
1015 /* If we suspect that a transformation has rendered what
1016 url_string might return different from URL_ENCODED, rebuild
1017 u->url using url_string. */
1018 u->url = url_string (u, 0);
1020 if (url_encoded != url)
1021 xfree ((char *) url_encoded);
1025 if (url_encoded == url)
1026 u->url = xstrdup (url);
1028 u->url = url_encoded;
1036 url_error (int error_code)
1038 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
1039 return parse_errors[error_code];
1042 /* Parse PATH into dir and file. PATH is extracted from the URL and
1043 is URL-escaped. The function returns unescaped DIR and FILE. */
1046 parse_path (const char *path, char **dir, char **file)
1050 last_slash = strrchr (path, '/');
1053 *dir = xstrdup ("");
1054 *file = xstrdup (path);
1058 *dir = strdupdelim (path, last_slash);
1059 *file = xstrdup (last_slash + 1);
1061 url_unescape (*dir);
1062 url_unescape (*file);
1065 /* Note: URL's "full path" is the path with the query string and
1066 params appended. The "fragment" (#foo) is intentionally ignored,
1067 but that might be changed. For example, if the original URL was
1068 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1069 the full path will be "/foo/bar/baz;bullshit?querystring". */
1071 /* Return the length of the full path, without the terminating
1075 full_path_length (const struct url *url)
1079 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1090 /* Write out the full path. */
1093 full_path_write (const struct url *url, char *where)
1095 #define FROB(el, chr) do { \
1096 char *f_el = url->el; \
1098 int l = strlen (f_el); \
1100 memcpy (where, f_el, l); \
1112 /* Public function for getting the "full path". E.g. if u->path is
1113 "foo/bar" and u->query is "param=value", full_path will be
1114 "/foo/bar?param=value". */
1117 url_full_path (const struct url *url)
1119 int length = full_path_length (url);
1120 char *full_path = (char *)xmalloc(length + 1);
1122 full_path_write (url, full_path);
1123 full_path[length] = '\0';
1128 /* Sync u->path and u->url with u->dir and u->file. */
1131 sync_path (struct url *url)
1139 newpath = xstrdup (url->file);
1144 int dirlen = strlen (url->dir);
1145 int filelen = strlen (url->file);
1147 newpath = xmalloc (dirlen + 1 + filelen + 1);
1148 memcpy (newpath, url->dir, dirlen);
1149 newpath[dirlen] = '/';
1150 memcpy (newpath + dirlen + 1, url->file, filelen);
1151 newpath[dirlen + 1 + filelen] = '\0';
1155 url->path = newpath;
1157 /* Synchronize u->url. */
1159 url->url = url_string (url, 0);
1162 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1163 This way we can sync u->path and u->url when they get changed. */
1166 url_set_dir (struct url *url, const char *newdir)
1169 url->dir = xstrdup (newdir);
1174 url_set_file (struct url *url, const char *newfile)
1177 url->file = xstrdup (newfile);
1182 url_free (struct url *url)
1188 FREE_MAYBE (url->params);
1189 FREE_MAYBE (url->query);
1190 FREE_MAYBE (url->fragment);
1191 FREE_MAYBE (url->user);
1192 FREE_MAYBE (url->passwd);
1201 get_urls_file (const char *file)
1203 struct file_memory *fm;
1204 struct urlpos *head, *tail;
1205 const char *text, *text_end;
1207 /* Load the file. */
1208 fm = read_file (file);
1211 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1214 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1218 text_end = fm->content + fm->length;
1219 while (text < text_end)
1221 const char *line_beg = text;
1222 const char *line_end = memchr (text, '\n', text_end - text);
1224 line_end = text_end;
1229 /* Strip whitespace from the beginning and end of line. */
1230 while (line_beg < line_end && ISSPACE (*line_beg))
1232 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
1235 if (line_end > line_beg)
1237 /* URL is in the [line_beg, line_end) region. */
1241 struct urlpos *entry;
1244 /* We must copy the URL to a zero-terminated string, and we
1245 can't use alloca because we're in a loop. *sigh*. */
1246 url_text = strdupdelim (line_beg, line_end);
1250 /* Merge opt.base_href with URL. */
1251 char *merged = uri_merge (opt.base_href, url_text);
1256 url = url_parse (url_text, &up_error_code);
1259 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1260 file, url_text, url_error (up_error_code));
1266 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1267 memset (entry, 0, sizeof (*entry));
1278 read_file_free (fm);
1282 /* Free the linked list of urlpos. */
1284 free_urlpos (struct urlpos *l)
1288 struct urlpos *next = l->next;
1291 FREE_MAYBE (l->local_name);
1297 /* Rotate FNAME opt.backups times */
1299 rotate_backups(const char *fname)
1301 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1302 char *from = (char *)alloca (maxlen);
1303 char *to = (char *)alloca (maxlen);
1307 if (stat (fname, &sb) == 0)
1308 if (S_ISREG (sb.st_mode) == 0)
1311 for (i = opt.backups; i > 1; i--)
1313 sprintf (from, "%s.%d", fname, i - 1);
1314 sprintf (to, "%s.%d", fname, i);
1318 sprintf (to, "%s.%d", fname, 1);
1322 /* Create all the necessary directories for PATH (a file). Calls
1323 mkdirhier() internally. */
1325 mkalldirs (const char *path)
1332 p = path + strlen (path);
1333 for (; *p != '/' && p != path; p--)
1336 /* Don't create if it's just a file. */
1337 if ((p == path) && (*p != '/'))
1339 t = strdupdelim (path, p);
1341 /* Check whether the directory exists. */
1342 if ((stat (t, &st) == 0))
1344 if (S_ISDIR (st.st_mode))
1351 /* If the dir exists as a file name, remove it first. This
1352 is *only* for Wget to work with buggy old CERN http
1353 servers. Here is the scenario: When Wget tries to
1354 retrieve a directory without a slash, e.g.
1355 http://foo/bar (bar being a directory), CERN server will
1356 not redirect it too http://foo/bar/ -- it will generate a
1357 directory listing containing links to bar/file1,
1358 bar/file2, etc. Wget will lose because it saves this
1359 HTML listing to a file `bar', so it cannot create the
1360 directory. To work around this, if the file of the same
1361 name exists, we just remove it and create the directory
1363 DEBUGP (("Removing %s because of directory danger!\n", t));
1367 res = make_directory (t);
1369 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1374 /* Functions for constructing the file name out of URL components. */
1376 /* A growable string structure, used by url_file_name and friends.
1377 This should perhaps be moved to utils.c.
1379 The idea is to have an easy way to construct a string by having
1380 various functions append data to it. Instead of passing the
1381 obligatory BASEVAR, SIZEVAR and TAILPOS to all the functions in
1382 questions, we pass the pointer to this struct. */
1390 /* Ensure that the string can accept APPEND_COUNT more characters past
1391 the current TAIL position. If necessary, this will grow the string
1392 and update its allocated size. If the string is already large
1393 enough to take TAIL+APPEND_COUNT characters, this does nothing. */
1394 #define GROW(g, append_size) do { \
1395 struct growable *G_ = g; \
1396 DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \
1399 /* Return the tail position of the string. */
1400 #define TAIL(r) ((r)->base + (r)->tail)
1402 /* Move the tail position by APPEND_COUNT characters. */
1403 #define TAIL_INCR(r, append_count) ((r)->tail += append_count)
1405 /* Append the string STR to DEST. NOTICE: the string in DEST is not
1409 append_string (const char *str, struct growable *dest)
1411 int l = strlen (str);
1413 memcpy (TAIL (dest), str, l);
1414 TAIL_INCR (dest, l);
1417 /* Append CH to DEST. For example, append_char (0, DEST)
1418 zero-terminates DEST. */
1421 append_char (char ch, struct growable *dest)
1425 TAIL_INCR (dest, 1);
1429 filechr_unsafe_always = 1, /* always unsafe, e.g. / or \0 */
1430 filechr_unsafe_shell = 2, /* unsafe for shell use, e.g. control chars */
1431 filechr_unsafe_windows = 2, /* disallowed on Windows file system */
1434 #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
1436 /* Shorthands for the table: */
1437 #define A filechr_unsafe_always
1438 #define S filechr_unsafe_shell
1439 #define W filechr_unsafe_windows
1444 Unix shell: 0-31, 128-159
1445 Windows: \, |, /, <, >, ?, :
1447 Arguably we could also claim `%' to be unsafe, since we use it as
1448 the escape character. If we ever want to be able to reliably
1449 translate file name back to URL, this would become important
1450 crucial. Right now, it's better to be minimal in escaping. */
1452 const static unsigned char filechr_table[256] =
1454 A, S, S, S, S, S, S, S, /* NUL SOH STX ETX EOT ENQ ACK BEL */
1455 S, S, S, S, S, S, S, S, /* BS HT LF VT FF CR SO SI */
1456 S, S, S, S, S, S, S, S, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
1457 S, S, S, S, S, S, S, S, /* CAN EM SUB ESC FS GS RS US */
1458 0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */
1459 0, 0, W, 0, 0, 0, 0, A, /* ( ) * + , - . / */
1460 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
1461 0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? */
1462 0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
1463 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
1464 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
1465 0, 0, 0, 0, W, 0, 0, 0, /* X Y Z [ \ ] ^ _ */
1466 0, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
1467 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
1468 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
1469 0, 0, 0, 0, 0, 0, 0, 0, /* x y z { | } ~ DEL */
1471 S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 128-143 */
1472 S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 144-159 */
1473 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1474 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1476 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1477 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1478 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1479 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1482 /* Return non-zero if character CH is unsafe for use in file or
1483 directory name. Called by append_uri_pathel. */
1486 file_unsafe_char (char ch, int restrict)
1488 int mask = filechr_unsafe_always;
1489 if (restrict == restrict_shell)
1490 mask |= filechr_unsafe_shell;
1491 else if (restrict == restrict_windows)
1492 mask |= (filechr_unsafe_shell | filechr_unsafe_windows);
1493 return FILE_CHAR_TEST (ch, mask);
1496 /* FN_PORT_SEP is the separator between host and port in file names
1497 for non-standard port numbers. On Unix this is normally ':', as in
1498 "www.xemacs.org:4001/index.html". Under Windows, we set it to +
1499 because Windows can't handle ':' in file names. */
1500 #define FN_PORT_SEP (opt.restrict_file_names != restrict_windows ? ':' : '+')
1502 /* FN_QUERY_SEP is the separator between the file name and the URL
1503 query, normally '?'. Since Windows cannot handle '?' as part of
1504 file name, we use '@' instead there. */
1505 #define FN_QUERY_SEP (opt.restrict_file_names != restrict_windows ? '?' : '@')
1507 /* Quote path element, characters in [b, e), as file name, and append
1508 the quoted string to DEST. Each character is quoted as per
1509 file_unsafe_char and the corresponding table. */
1512 append_uri_pathel (const char *b, const char *e, struct growable *dest)
1520 /* Currently restrict_for_windows is determined at compile time
1521 only. But some users download files to Windows partitions; they
1522 should be able to say --windows-file-names so Wget escapes
1523 characters invalid on Windows. Similar run-time restrictions for
1524 other file systems can be implemented. */
1525 const int restrict = opt.restrict_file_names;
1527 /* Copy [b, e) to PATHEL and URL-unescape it. */
1528 BOUNDED_TO_ALLOCA (b, e, pathel);
1529 url_unescape (pathel);
1530 pathlen = strlen (pathel);
1532 /* Go through PATHEL and check how many characters we'll need to
1533 add for file quoting. */
1535 for (p = pathel; *p; p++)
1536 if (file_unsafe_char (*p, restrict))
1539 /* p - pathel is the string length. Each quoted char means two
1540 additional characters in the string, hence 2*quoted. */
1541 outlen = (p - pathel) + (2 * quoted);
1542 GROW (dest, outlen);
1546 /* If there's nothing to quote, we don't need to go through the
1547 string the second time. */
1548 memcpy (TAIL (dest), pathel, outlen);
1552 char *q = TAIL (dest);
1553 for (p = pathel; *p; p++)
1555 if (!file_unsafe_char (*p, restrict))
1559 unsigned char ch = *p;
1561 *q++ = XDIGIT_TO_XCHAR (ch >> 4);
1562 *q++ = XDIGIT_TO_XCHAR (ch & 0xf);
1565 assert (q - TAIL (dest) == outlen);
1567 TAIL_INCR (dest, outlen);
1570 /* Append to DEST the directory structure that corresponds the
1571 directory part of URL's path. For example, if the URL is
1572 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1574 Each path element ("dir1" and "dir2" in the above example) is
1575 examined, url-unescaped, and re-escaped as file name element.
1577 Additionally, it cuts as many directories from the path as
1578 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1579 will produce "bar" for the above example. For 2 or more, it will
1582 Each component of the path is quoted for use as file name. */
/* NOTE(review): lines are missing from this listing (numbering gaps):
   the return type, braces, the initialization of `pathel' (presumably
   from u->path), and the --cut handling inside the loop.  Verify against
   the complete file.  */
1585 append_dir_structure (const struct url *u, struct growable *dest)
1587 char *pathel, *next;
1588 int cut = opt.cut_dirs;
1590 /* Go through the path components, de-URL-quote them, and quote them
1591 (if necessary) as file names. */
1594 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1599 /* Ignore empty pathels. path_simplify should remove
1600 occurrences of "//" from the path, but it has special cases
1601 for starting / which generates an empty pathel here. */
1605 append_char ('/', dest);
1606 append_uri_pathel (pathel, next, dest);
1610 /* Return a unique file name that matches the given URL as good as
1611 possible. Does not create directories on the file system. */
/* NOTE(review): lines are missing from this listing (numbering gaps):
   the return type, braces, the initialization of `fnres', the assignment
   of `fname' from fnres.base, and the function's return statements.
   Surviving lines kept byte-identical.  */
1614 url_file_name (const struct url *u)
1616 struct growable fnres;
1618 char *u_file, *u_query;
1619 char *fname, *unique;
1625 /* Start with the directory prefix, if specified. */
1626 if (!DOTP (opt.dir_prefix))
1627 append_string (opt.dir_prefix, &fnres);
1629 /* If "dirstruct" is turned on (typically the case with -r), add
1630 the host and port (unless those have been turned off) and
1631 directory structure. */
1634 if (opt.add_hostdir)
1637 append_char ('/', &fnres);
1638 append_string (u->host, &fnres);
1639 if (u->port != scheme_default_port (u->scheme))
1642 number_to_string (portstr, u->port);
1643 append_char (FN_PORT_SEP, &fnres);
1644 append_string (portstr, &fnres);
1648 append_dir_structure (u, &fnres);
1651 /* Add the file name. */
1653 append_char ('/', &fnres);
1654 u_file = *u->file ? u->file : "index.html";
1655 append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
1657 /* Append "?query" to the file name. */
1658 u_query = u->query && *u->query ? u->query : NULL;
1661 append_char (FN_QUERY_SEP, &fnres);
1662 append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
1665 /* Zero-terminate the file name. */
1666 append_char ('\0', &fnres);
1670 /* Check the cases in which the unique extensions are not used:
1671 1) Clobbering is turned off (-nc).
1672 2) Retrieval with regetting.
1673 3) Timestamping is used.
1674 4) Hierarchy is built.
1676 The exception is the case when file does exist and is a
1677 directory (actually support for bad httpd-s). */
1679 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1680 && !(file_exists_p (fname) && !file_non_directory_p (fname)))
/* NOTE(review): presumably `return fname;' on a missing line here.  */
1683 /* Find a unique name. */
1684 unique = unique_name (fname);
/* Return the length of URL's path.  The path is considered to be
   terminated by one of '?', ';', '#', or by the end of the string,
   whichever comes first.  */
static int
path_length (const char *url)
{
  const char *q = strpbrk_or_eos (url, "?;#");
  return q - url;
}
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is equivalent to strrchr(b, c),
   except that it accepts an END argument instead of requiring the
   string to be zero-terminated.  Why is there no memrchr()?  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Walk backwards from E; E itself is excluded per the [b, e)
     contract.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH.  "." is resolved by removing that path element, and ".." is
   resolved by removing the preceding path element.  Empty elements
   ("//") are removed as well.  Leading and trailing slashes are
   preserved, and a ".." that would climb above the beginning of the
   path is simply dropped.

   Return non-zero if any changes have been made.

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided in test_path_simplify below.  If you
   change anything in this function, run test_path_simplify to make
   sure you haven't broken a test case.

   A previous version of this function was based on path_simplify()
   from GNU Bash, but it has been rewritten for Wget 1.8.1.  */
static int
path_simplify (char *path)
{
  char *in, *out, *floor;
  int modified = 0;

  in = out = path;
  if (*in == '/')
    {
      /* Preserve the leading '/'. */
      ++in;
      ++out;
    }
  floor = out;                  /* ".." never backs up past this point */

  while (*in)
    {
      char *elem = in;
      int len, followed_by_slash;

      /* Find the extent of the current path element. */
      while (*in && *in != '/')
        ++in;
      len = in - elem;
      followed_by_slash = (*in == '/');
      if (followed_by_slash)
        ++in;                   /* step over the separating slash */

      if (len == 0)
        {
          /* Empty element, produced by "//" -- drop it.  Not mandated
             by rfc1808 et al., but supporting "//" properly is hard
             (in which directory do you save http://x.com///y.html?)
             and the extra slashes don't seem useful.  */
          modified = 1;
        }
      else if (len == 1 && elem[0] == '.')
        {
          /* "." -- drop it. */
          modified = 1;
        }
      else if (len == 2 && elem[0] == '.' && elem[1] == '.')
        {
          /* ".." -- drop it together with the preceding path element,
             if there is one.  */
          if (out > floor)
            {
              --out;            /* back over the previous slash... */
              while (out > floor && *(out - 1) != '/')
                --out;          /* ...and over the previous element */
            }
          modified = 1;
        }
      else
        {
          /* Ordinary element -- keep it, along with its trailing
             slash, shifting it left if anything was removed before.  */
          if (out != elem)
            memmove (out, elem, len + followed_by_slash);
          out += len + followed_by_slash;
        }
    }
  *out = '\0';
  return modified;
}
1834 /* Resolve the result of "linking" a base URI (BASE) to a
1835 link-specified URI (LINK).
1837 Either of the URIs may be absolute or relative, complete with the
1838 host name, or path only. This tries to behave "reasonably" in all
1839 foreseeable cases. It employs little specific knowledge about
1840 schemes or URL-specific stuff -- it just works on strings.
1842 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1843 See uri_merge for a gentler interface to this functionality.
1845 Perhaps this function should call path_simplify so that the callers
1846 don't have to call url_parse unconditionally. */
/* NOTE(review): lines are missing from this listing (numbering gaps):
   the return type (presumably char *), the declaration of `constr',
   several `if'/`else' lines, braces, loop headers in the absolute-path
   branch, and the final return.  Surviving lines kept byte-identical;
   consult the complete file before reasoning about control flow.  */
1848 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1854 const char *end = base + path_length (base);
1858 /* Empty LINK points back to BASE, query string and all. */
1859 constr = xstrdup (base);
1861 else if (*link == '?')
1863 /* LINK points to the same location, but changes the query
1864 string. Examples: */
1865 /* uri_merge("path", "?new") -> "path?new" */
1866 /* uri_merge("path?foo", "?new") -> "path?new" */
1867 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1868 /* uri_merge("path#foo", "?new") -> "path?new" */
1869 int baselength = end - base;
1870 constr = xmalloc (baselength + linklength + 1);
1871 memcpy (constr, base, baselength);
1872 memcpy (constr + baselength, link, linklength);
1873 constr[baselength + linklength] = '\0';
1875 else if (*link == '#')
1877 /* uri_merge("path", "#new") -> "path#new" */
1878 /* uri_merge("path#foo", "#new") -> "path#new" */
1879 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1880 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1882 const char *end1 = strchr (base, '#');
1884 end1 = base + strlen (base);
1885 baselength = end1 - base;
1886 constr = xmalloc (baselength + linklength + 1);
1887 memcpy (constr, base, baselength);
1888 memcpy (constr + baselength, link, linklength);
1889 constr[baselength + linklength] = '\0';
1891 else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
1893 /* LINK begins with "//" and so is a net path: we need to
1894 replace everything after (and including) the double slash
1897 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1898 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1899 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1903 const char *start_insert;
1905 /* Look for first slash. */
1906 slash = memchr (base, '/', end - base);
1907 /* If found slash and it is a double slash, then replace
1908 from this point, else default to replacing from the
1910 if (slash && *(slash + 1) == '/')
1911 start_insert = slash;
1913 start_insert = base;
1915 span = start_insert - base;
1916 constr = (char *)xmalloc (span + linklength + 1);
1918 memcpy (constr, base, span);
1919 memcpy (constr + span, link, linklength);
1920 constr[span + linklength] = '\0';
1922 else if (*link == '/')
1924 /* LINK is an absolute path: we need to replace everything
1925 after (and including) the FIRST slash with LINK.
1927 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1928 "/qux/xyzzy", our result should be
1929 "http://host/qux/xyzzy". */
1932 const char *start_insert = NULL; /* for gcc to shut up. */
1933 const char *pos = base;
1934 int seen_slash_slash = 0;
1935 /* We're looking for the first slash, but want to ignore
1938 slash = memchr (pos, '/', end - pos);
1939 if (slash && !seen_slash_slash)
1940 if (*(slash + 1) == '/')
1943 seen_slash_slash = 1;
/* NOTE(review): the loop that retries the memchr after a "//" appears to
   be on missing lines here; verify against the complete file.  */
1947 /* At this point, SLASH is the location of the first / after
1948 "//", or the first slash altogether. START_INSERT is the
1949 pointer to the location where LINK will be inserted. When
1950 examining the last two examples, keep in mind that LINK
1953 if (!slash && !seen_slash_slash)
1954 /* example: "foo" */
1956 start_insert = base;
1957 else if (!slash && seen_slash_slash)
1958 /* example: "http://foo" */
1961 else if (slash && !seen_slash_slash)
1962 /* example: "foo/bar" */
1964 start_insert = base;
1965 else if (slash && seen_slash_slash)
1966 /* example: "http://something/" */
1968 start_insert = slash;
1970 span = start_insert - base;
1971 constr = (char *)xmalloc (span + linklength + 1);
1973 memcpy (constr, base, span);
1975 memcpy (constr + span, link, linklength);
1976 constr[span + linklength] = '\0';
1980 /* LINK is a relative URL: we need to replace everything
1981 after last slash (possibly empty) with LINK.
1983 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1984 our result should be "whatever/foo/qux/xyzzy". */
1985 int need_explicit_slash = 0;
1987 const char *start_insert;
1988 const char *last_slash = find_last_char (base, end, '/');
1991 /* No slash found at all. Append LINK to what we have,
1992 but we'll need a slash as a separator.
1994 Example: if base == "foo" and link == "qux/xyzzy", then
1995 we cannot just append link to base, because we'd get
1996 "fooqux/xyzzy", whereas what we want is
1999 To make sure the / gets inserted, we set
2000 need_explicit_slash to 1. We also set start_insert
2001 to end + 1, so that the length calculations work out
2002 correctly for one more (slash) character. Accessing
2003 that character is fine, since it will be the
2004 delimiter, '\0' or '?'. */
2005 /* example: "foo?..." */
2006 /* ^ ('?' gets changed to '/') */
2007 start_insert = end + 1;
2008 need_explicit_slash = 1;
2010 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
2012 /* example: http://host" */
2014 start_insert = end + 1;
2015 need_explicit_slash = 1;
2019 /* example: "whatever/foo/bar" */
2021 start_insert = last_slash + 1;
2024 span = start_insert - base;
2025 constr = (char *)xmalloc (span + linklength + 1);
2027 memcpy (constr, base, span);
2028 if (need_explicit_slash)
2029 constr[span - 1] = '/';
2031 memcpy (constr + span, link, linklength);
2032 constr[span + linklength] = '\0';
2035 else /* !no_scheme */
2037 constr = strdupdelim (link, link + linklength);
/* Merge BASE with LINK and return the resulting URI.  This is an
   interface to uri_merge_1 that assumes that LINK is a
   zero-terminated string.  The result is freshly allocated; the
   caller is responsible for freeing it.  */
char *
uri_merge (const char *base, const char *link)
{
  return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
}
/* Append the string S to the buffer pointed to by P, advancing P past
   the copied bytes.  The caller must guarantee that the destination
   has enough room; no NUL terminator is written.  */
#define APPEND(p, s) do {                       \
  int len = strlen (s);                         \
  memcpy (p, s, len);                           \
  p += len;                                     \
} while (0)
2057 /* Use this instead of password when the actual password is supposed
2058 to be hidden. We intentionally use a generic string without giving
2059 away the number of characters in the password, like previous
2061 #define HIDDEN_PASSWORD "*password*"
2063 /* Recreate the URL string from the data in URL.
2065 If HIDE is non-zero (as it is when we're calling this on a URL we
2066 plan to print, but not when calling it to canonicalize a URL for
2067 use within the program), password will be hidden. Unsafe
2068 characters in the URL will be quoted. */
/* NOTE(review): lines are missing from this listing (numbering gaps):
   the return type, braces, the declarations of size/p/result, the
   "user:password@" separators, the '[' / ']' emission around an IPv6
   host, the port-separator ':', the final '\0', and the return
   statement.  Surviving lines kept byte-identical.  */
2071 url_string (const struct url *url, int hide_password)
2075 char *quoted_user = NULL, *quoted_passwd = NULL;
2077 int scheme_port = supported_schemes[url->scheme].default_port;
2078 char *scheme_str = supported_schemes[url->scheme].leading_string;
2079 int fplen = full_path_length (url);
2081 int brackets_around_host = 0;
2083 assert (scheme_str != NULL);
2085 /* Make sure the user name and password are quoted. */
2088 quoted_user = url_escape_allow_passthrough (url->user);
2092 quoted_passwd = HIDDEN_PASSWORD;
2094 quoted_passwd = url_escape_allow_passthrough (url->passwd);
2098 if (strchr (url->host, ':'))
2099 brackets_around_host = 1;
2101 size = (strlen (scheme_str)
2102 + strlen (url->host)
2103 + (brackets_around_host ? 2 : 0)
2106 if (url->port != scheme_port)
2107 size += 1 + numdigit (url->port);
2110 size += 1 + strlen (quoted_user);
2112 size += 1 + strlen (quoted_passwd);
2115 p = result = xmalloc (size);
2117 APPEND (p, scheme_str);
2120 APPEND (p, quoted_user);
2124 APPEND (p, quoted_passwd);
2129 if (brackets_around_host)
2131 APPEND (p, url->host);
2132 if (brackets_around_host)
2134 if (url->port != scheme_port)
2137 p = number_to_string (p, url->port);
2140 full_path_write (url, p);
2144 assert (p - result == size);
2146 if (quoted_user && quoted_user != url->user)
2147 xfree (quoted_user);
2148 if (quoted_passwd && !hide_password
2149 && quoted_passwd != url->passwd)
2150 xfree (quoted_passwd);
2155 /* Return the URL of the proxy appropriate for url U. */
/* NOTE(review): lines are missing from this listing (numbering gaps):
   the return type, braces, the `proxy' declaration, the `switch
   (u->scheme)' header with its case labels for HTTP/HTTPS/FTP, the
   early `return NULL' paths, and the final `return proxy;'.  Surviving
   lines kept byte-identical.  */
2157 getproxy (struct url *u)
2160 char *rewritten_url;
2161 static char rewritten_storage[1024];
2165 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
2171 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
2175 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
2179 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
2181 case SCHEME_INVALID:
2184 if (!proxy || !*proxy)
2187 /* Handle shorthands. `rewritten_storage' is a kludge to allow
2188 getproxy() to return static storage. */
2189 rewritten_url = rewrite_shorthand_url (proxy);
2192 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
/* The explicit terminator below makes the strncpy safe even when
   rewritten_url is longer than the buffer (strncpy alone would leave
   the result unterminated).  */
2193 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
2194 proxy = rewritten_storage;
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns non-zero if the proxy should be used (NO_PROXY unset or HOST
   matches no suffix in it), zero if HOST matches an entry in the
   NO_PROXY list.  */
static int
no_proxy_match (const char *host, const char **no_proxy)
{
  if (!no_proxy)
    return 1;
  else
    return !sufmatch (no_proxy, host);
}
2210 /* Support for converting links for local viewing in downloaded HTML
2211 files. This should be moved to another file, because it has
2212 nothing to do with processing URLs. */
2214 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
2215 static const char *replace_attr PARAMS ((const char *, int, FILE *,
2217 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
2218 const char *, int));
2219 static char *local_quote_string PARAMS ((const char *));
2221 /* Change the links in one HTML file. LINKS is a list of links in the
2222 document, along with their positions and the desired direction of
/* NOTE(review): lines are missing from this listing (numbering gaps):
   the return type, braces, the declarations of `fp' and `p', the early
   returns after errors, the `p = url_start' advances, the counter
   increments for to_url_count/to_file_count, `break' statements in the
   switch, and the fclose of FP.  Surviving lines kept byte-identical.  */
2225 convert_links (const char *file, struct urlpos *links)
2227 struct file_memory *fm;
2230 downloaded_file_t downloaded_file_return;
2232 struct urlpos *link;
2233 int to_url_count = 0, to_file_count = 0;
2235 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
2238 /* First we do a "dry run": go through the list L and see whether
2239 any URL needs to be converted in the first place. If not, just
2240 leave the file alone. */
2242 struct urlpos *dry = links;
2243 for (dry = links; dry; dry = dry->next)
2244 if (dry->convert != CO_NOCONVERT)
2248 logputs (LOG_VERBOSE, _("nothing to do.\n"));
2253 fm = read_file (file);
2256 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2257 file, strerror (errno));
2261 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
2262 if (opt.backup_converted && downloaded_file_return)
2263 write_backup_file (file, downloaded_file_return);
2265 /* Before opening the file for writing, unlink the file. This is
2266 important if the data in FM is mmaped. In such case, nulling the
2267 file, which is what fopen() below does, would make us read all
2268 zeroes from the mmaped region. */
2269 if (unlink (file) < 0 && errno != ENOENT)
2271 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2272 file, strerror (errno));
2273 read_file_free (fm);
2276 /* Now open the file for writing. */
2277 fp = fopen (file, "wb");
2280 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2281 file, strerror (errno));
2282 read_file_free (fm);
2286 /* Here we loop through all the URLs in file, replacing those of
2287 them that are downloaded with relative references. */
2289 for (link = links; link; link = link->next)
2291 char *url_start = fm->content + link->pos;
2293 if (link->pos >= fm->length)
2295 DEBUGP (("Something strange is going on. Please investigate."));
2298 /* If the URL is not to be converted, skip it. */
2299 if (link->convert == CO_NOCONVERT)
2301 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2305 /* Echo the file contents, up to the offending URL's opening
2306 quote, to the outfile. */
2307 fwrite (p, 1, url_start - p, fp);
2310 switch (link->convert)
2312 case CO_CONVERT_TO_RELATIVE:
2313 /* Convert absolute URL to relative. */
2315 char *newname = construct_relative (file, link->local_name);
2316 char *quoted_newname = local_quote_string (newname);
2318 if (!link->link_refresh_p)
2319 p = replace_attr (p, link->size, fp, quoted_newname);
2321 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2322 link->refresh_timeout);
2324 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2325 link->url->url, newname, link->pos, file));
2327 xfree (quoted_newname);
2331 case CO_CONVERT_TO_COMPLETE:
2332 /* Convert the link to absolute URL. */
2334 char *newlink = link->url->url;
2335 char *quoted_newlink = html_quote_string (newlink);
2337 if (!link->link_refresh_p)
2338 p = replace_attr (p, link->size, fp, quoted_newlink);
2340 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2341 link->refresh_timeout);
2343 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2344 newlink, link->pos, file));
2345 xfree (quoted_newlink);
2349 case CO_NULLIFY_BASE:
2350 /* Change the base href to "". */
2351 p = replace_attr (p, link->size, fp, "");
2359 /* Output the rest of the file. */
2360 if (p - fm->content < fm->length)
2361 fwrite (p, 1, fm->length - (p - fm->content), fp);
2363 read_file_free (fm);
2365 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
2368 /* Construct and return a malloced copy of the relative link from two
2369 pieces of information: local name S1 of the referring file and
2370 local name S2 of the referred file.
2372 So, if S1 is "jagor.srce.hr/index.html" and S2 is
2373 "jagor.srce.hr/images/news.gif", the function will return
2376 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
2377 "fly.cc.fer.hr/images/fly.gif", the function will return
2378 "../images/fly.gif".
2380 Caveats: S1 should not begin with `/', unless S2 also begins with
2381 '/'. S1 should not contain things like ".." and such --
2382 construct_relative ("fly/ioccc/../index.html",
2383 "fly/images/fly.gif") will fail. (A workaround is to call
2384 something like path_simplify() on S1). */
/* NOTE(review): lines are missing from this listing (numbering gaps):
   the return type, braces, the declaration of `res', the absolute-S2
   test preceding the xstrdup, the initialization of i/cnt, the body of
   the common-prefix loop, the counting of separators in S1, and the
   final return.  Surviving lines kept byte-identical.  */
2386 construct_relative (const char *s1, const char *s2)
2388 int i, cnt, sepdirs1;
2392 return xstrdup (s2);
2393 /* S1 should *not* be absolute, if S2 wasn't. */
2394 assert (*s1 != '/');
2396 /* Skip the directories common to both strings. */
2399 while (s1[i] && s2[i]
2404 if (s1[i] == '/' && s2[i] == '/')
2409 for (sepdirs1 = 0; s1[i]; i++)
2412 /* Now, construct the file as of:
2413 - ../ repeated sepdirs1 time
2414 - all the non-mutual directories of S2. */
2415 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
2416 for (i = 0; i < sepdirs1; i++)
2417 memcpy (res + 3 * i, "../", 3);
2418 strcpy (res + 3 * i, s2 + cnt);
/* Save FILE to FILE.orig (or, for -E-renamed files, FILE with "html"
   replaced by "orig") before it is overwritten by link conversion, at
   most once per file per run.  */
/* NOTE(review): lines are missing from this listing (numbering gaps):
   the `static void' return type, braces, and `break' in the while loop.
   Surviving lines kept byte-identical.  */
2423 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2425 /* Rather than just writing over the original .html file with the
2426 converted version, save the former to *.orig. Note we only do
2427 this for files we've _successfully_ downloaded, so we don't
2428 clobber .orig files sitting around from previous invocations. */
2430 /* Construct the backup filename as the original name plus ".orig". */
2431 size_t filename_len = strlen(file);
2432 char* filename_plus_orig_suffix;
2433 boolean already_wrote_backup_file = FALSE;
2434 slist* converted_file_ptr;
2435 static slist* converted_files = NULL;
2437 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2439 /* Just write "orig" over "html". We need to do it this way
2440 because when we're checking to see if we've downloaded the
2441 file before (to see if we can skip downloading it), we don't
2442 know if it's a text/html file. Therefore we don't know yet
2443 at that stage that -E is going to cause us to tack on
2444 ".html", so we need to compare vs. the original URL plus
2445 ".orig", not the original URL plus ".html.orig". */
2446 filename_plus_orig_suffix = alloca (filename_len + 1);
2447 strcpy(filename_plus_orig_suffix, file);
2448 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2450 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2452 /* Append ".orig" to the name. */
2453 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2454 strcpy(filename_plus_orig_suffix, file);
2455 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2458 /* We can get called twice on the same URL thanks to the
2459 convert_all_links() call in main(). If we write the .orig file
2460 each time in such a case, it'll end up containing the first-pass
2461 conversion, not the original file. So, see if we've already been
2462 called on this file. */
2463 converted_file_ptr = converted_files;
2464 while (converted_file_ptr != NULL)
2465 if (strcmp(converted_file_ptr->string, file) == 0)
2467 already_wrote_backup_file = TRUE;
2471 converted_file_ptr = converted_file_ptr->next;
2473 if (!already_wrote_backup_file)
2475 /* Rename <file> to <file>.orig before former gets written over. */
2476 if (rename(file, filename_plus_orig_suffix) != 0)
2477 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2478 file, filename_plus_orig_suffix, strerror (errno));
2480 /* Remember that we've already written a .orig backup for this file.
2481 Note that we never free this memory since we need it till the
2482 convert_all_links() call, which is one of the last things the
2483 program does before terminating. BTW, I'm not sure if it would be
2484 safe to just set 'converted_file_ptr->string' to 'file' below,
2485 rather than making a copy of the string... Another note is that I
2486 thought I could just add a field to the urlpos structure saying
2487 that we'd written a .orig file for this URL, but that didn't work,
2488 so I had to make this separate list.
2489 -- Dan Harkless <wget@harkless.org>
2491 This [adding a field to the urlpos structure] didn't work
2492 because convert_file() is called from convert_all_links at
2493 the end of the retrieval with a freshly built new urlpos
2495 -- Hrvoje Niksic <hniksic@arsdigita.com>
2497 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2498 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2499 converted_file_ptr->next = converted_files;
2500 converted_files = converted_file_ptr;
2504 static int find_fragment PARAMS ((const char *, int, const char **,
2507 /* Replace an attribute's original text with NEW_TEXT. */
/* NOTE(review): lines are missing from this listing (numbering gaps):
   the `static const char *' return type, braces, the assignment of
   quote_char from *p when the value is quoted, the advance of P past
   the old value, and the return statement.  Surviving lines kept
   byte-identical.  */
2510 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2513 char quote_char = '\"'; /* use "..." for quoting, unless the
2514 original value is quoted, in which
2515 case reuse its quoting char. */
2516 const char *frag_beg, *frag_end;
2518 /* Structure of our string is:
2519 "...old-contents..."
2520 <--- size ---> (with quotes)
2523 <--- size --> (no quotes) */
2525 if (*p == '\"' || *p == '\'')
2530 size -= 2; /* disregard opening and closing quote */
2532 putc (quote_char, fp);
2533 fputs (new_text, fp);
2535 /* Look for fragment identifier, if any. */
2536 if (find_fragment (p, size, &frag_beg, &frag_end))
2537 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2541 putc (quote_char, fp);
/* The same as replace_attr, but used when replacing
   <meta http-equiv=refresh content="new_text"> because we need to
   prepend "timeout_value; URL=" to NEW_TEXT.  */
static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
                           const char *new_text, int timeout)
{
  /* Worst-case room: digits of TIMEOUT + "; URL=" + NEW_TEXT + NUL.  */
  char *new_with_timeout = (char *)alloca (numdigit (timeout)
                                           + 6 /* "; URL=" */
                                           + strlen (new_text)
                                           + 1);
  sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);

  return replace_attr (p, size, fp, new_with_timeout);
}
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return zero.  If
   the character is found, return 1 and set BP and EP to point to the
   beginning and end of the region.

   This is used for finding the fragment identifiers in URLs.  The
   '&' check avoids mistaking the '#' inside an HTML entity such as
   "&#38;" for a fragment separator.  */
static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  int saw_amp = 0;

  for (; beg < end; beg++)
    {
      switch (*beg)
        {
        case '&':
          saw_amp = 1;
          break;
        case '#':
          if (!saw_amp)
            {
              *bp = beg;
              *ep = end;
              return 1;
            }
          /* fallthrough */
        default:
          saw_amp = 0;
        }
    }
  return 0;
}
2598 /* Quote FILE for use as local reference to an HTML file.
2600 We quote ? as %3F to avoid passing part of the file name as the
2601 parameter when browsing the converted file through HTTP. However,
2602 it is safe to do this only when `--html-extension' is turned on.
2603 This is because converting "index.html?foo=bar" to
2604 "index.html%3Ffoo=bar" would break local browsing, as the latter
2605 isn't even recognized as an HTML file! However, converting
2606 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2607 safe for both local and HTTP-served browsing. */
2610 local_quote_string (const char *file)
2612 const char *file_sans_qmark;
2615 if (!opt.html_extension)
2616 return html_quote_string (file);
2618 qm = count_char (file, '?');
2622 const char *from = file;
2625 /* qm * 2 because we replace each question mark with "%3F",
2626 i.e. replace one char with three, hence two more. */
2627 int fsqlen = strlen (file) + qm * 2;
2629 to = newname = (char *)alloca (fsqlen + 1);
2630 for (; *from; from++)
2641 assert (to - newname == fsqlen);
2644 file_sans_qmark = newname;
2647 file_sans_qmark = file;
2649 return html_quote_string (file_sans_qmark);
2652 /* We're storing "modes" of type downloaded_file_t in the hash table.
2653 However, our hash tables only accept pointers for keys and values.
2654 So when we need a pointer, we use the address of a
2655 downloaded_file_t variable of static storage. */
2657 static downloaded_file_t *
2658 downloaded_mode_to_ptr (downloaded_file_t mode)
2660 static downloaded_file_t
2661 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2662 v2 = FILE_DOWNLOADED_NORMALLY,
2663 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2664 v4 = CHECK_FOR_FILE;
2668 case FILE_NOT_ALREADY_DOWNLOADED:
2670 case FILE_DOWNLOADED_NORMALLY:
2672 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2674 case CHECK_FOR_FILE:
2680 /* This should really be merged with dl_file_url_map and
2681 downloaded_html_files in recur.c. This was originally a list, but
2682 I changed it to a hash table beause it was actually taking a lot of
2683 time to find things in it. */
2685 static struct hash_table *downloaded_files_hash;
2687 /* Remembers which files have been downloaded. In the standard case, should be
2688 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2689 download successfully (i.e. not for ones we have failures on or that we skip
2692 When we've downloaded a file and tacked on a ".html" extension due to -E,
2693 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2694 FILE_DOWNLOADED_NORMALLY.
2696 If you just want to check if a file has been previously added without adding
2697 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2698 with local filenames, not remote URLs. */
/* NOTE(review): lines are missing from this listing (numbering gaps):
   the return type (presumably downloaded_file_t), braces, and the
   `return *ptr;' / early-return paths.  Surviving lines kept
   byte-identical.  */
2700 downloaded_file (downloaded_file_t mode, const char *file)
2702 downloaded_file_t *ptr;
2704 if (mode == CHECK_FOR_FILE)
2706 if (!downloaded_files_hash)
2707 return FILE_NOT_ALREADY_DOWNLOADED;
2708 ptr = hash_table_get (downloaded_files_hash, file);
2710 return FILE_NOT_ALREADY_DOWNLOADED;
2714 if (!downloaded_files_hash)
2715 downloaded_files_hash = make_string_hash_table (0);
2717 ptr = hash_table_get (downloaded_files_hash, file);
2721 ptr = downloaded_mode_to_ptr (mode);
/* NOTE(review): `&ptr' below stores the address of a local variable,
   which dangles once this function returns; presumably the intent was
   to store `ptr' (the static-storage pointer from
   downloaded_mode_to_ptr) itself -- verify against the complete file.  */
2722 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2724 return FILE_NOT_ALREADY_DOWNLOADED;
/* Hash-table mapper used by downloaded_files_free; presumably frees
   each key (the xstrdup'ed file name) -- the body is not present in
   this listing, verify against the complete file.  */
2728 df_free_mapper (void *key, void *value, void *ignored)
2735 downloaded_files_free (void)
2737 if (downloaded_files_hash)
2739 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2740 hash_table_destroy (downloaded_files_hash);
2741 downloaded_files_hash = NULL;
2745 /* Return non-zero if scheme a is similar to scheme b.
2747 Schemes are similar if they are equal. If SSL is supported, schemes
2748 are also similar if one is http (SCHEME_HTTP) and the other is https
2751 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2756 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2757 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
/* Debugging and testing support for path_simplify. */

/* Debug: run path_simplify on PATH and return the result in a new
   string.  Useful for calling from the debugger.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (copy);
  return copy;
}
/* Run path_simplify on a copy of TEST and complain on stdout if the
   result differs from EXPECTED_RESULT or the modified flag differs
   from EXPECTED_CHANGE.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
              test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      /* Fixed: the two messages were swapped.  When a change was
         expected (expected_change == 1) but none happened, the correct
         complaint is that a modification was expected, and vice
         versa.  */
      if (expected_change == 1)
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
      else if (expected_change == 0)
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
    }
  xfree (test_copy);
}
2800 test_path_simplify (void)
2803 char *test, *result;
2809 { "foo", "foo", 0 },
2810 { "foo/bar", "foo/bar", 0 },
2811 { "foo///bar", "foo/bar", 1 },
2812 { "foo/.", "foo/", 1 },
2813 { "foo/./", "foo/", 1 },
2814 { "foo./", "foo./", 0 },
2815 { "foo/../bar", "bar", 1 },
2816 { "foo/../bar/", "bar/", 1 },
2817 { "foo/bar/..", "foo/", 1 },
2818 { "foo/bar/../x", "foo/x", 1 },
2819 { "foo/bar/../x/", "foo/x/", 1 },
2820 { "foo/..", "", 1 },
2821 { "foo/../..", "", 1 },
2822 { "a/b/../../c", "c", 1 },
2823 { "./a/../b", "b", 1 }
2827 for (i = 0; i < ARRAY_SIZE (tests); i++)
2829 char *test = tests[i].test;
2830 char *expected_result = tests[i].result;
2831 int expected_change = tests[i].should_modify;
2832 run_test (test, expected_result, expected_change);
2835 /* Now run all the tests with a leading slash before the test case,
2836 to prove that the slash is being preserved. */
2837 for (i = 0; i < ARRAY_SIZE (tests); i++)
2839 char *test, *expected_result;
2840 int expected_change = tests[i].should_modify;
2842 test = xmalloc (1 + strlen (tests[i].test) + 1);
2843 sprintf (test, "/%s", tests[i].test);
2845 expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2846 sprintf (expected_result, "/%s", tests[i].result);
2848 run_test (test, expected_result, expected_change);
2851 xfree (expected_result);