2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
39 #include <sys/types.h>
/* Is X the string "."?  X is evaluated more than once -- pass only
   side-effect-free expressions. */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* Is X the string ".."?  Same multiple-evaluation caveat as DOTP. */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))

/* Sizes used by the IPv4/IPv6 address validators below.  These mirror
   the NS_* constants from <arpa/nameser.h>, declared here so we don't
   have to depend on that header. */
static const int NS_INADDRSZ  = 4;   /* bytes in an IPv4 address */
static const int NS_IN6ADDRSZ = 16;  /* bytes in an IPv6 address */
static const int NS_INT16SZ   = 2;   /* bytes in one 16-bit group */
73 /* Supported schemes: */
/* Order must match enum url_scheme: url_scheme() returns the array
   index cast to the enum, and scheme_default_port()/scheme_disable()
   index this array by scheme value.  NOTE(review): this extract does
   not show a terminating { NULL, ... } entry, but url_scheme()
   iterates until leading_string is NULL -- confirm the sentinel
   exists. */
74 static struct scheme_data supported_schemes[] =
76 { "http://", DEFAULT_HTTP_PORT, 1 },
78 { "https://", DEFAULT_HTTPS_PORT, 1 },
80 { "ftp://", DEFAULT_FTP_PORT, 1 },
86 /* Forward declarations: */
88 static char *construct_relative PARAMS ((const char *, const char *));
89 static int path_simplify PARAMS ((char *));
93 /* Support for encoding and decoding of URL strings. We determine
94 whether a character is unsafe through static table lookup. This
95 code assumes ASCII character set and 8-bit chars. */
/* Bit masks stored in urlchr_table: a character may be "reserved"
   (URL syntax, preserved when quoting) and/or "unsafe" (must be
   %XX-encoded).  NOTE(review): if an earlier, unseen part of this
   file already declares these constants, drop this enum. */
enum {
  urlchr_reserved = 1,
  urlchr_unsafe   = 2
};

#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))

/* rfc1738 reserved chars, preserved from encoding. */
#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)

/* rfc1738 unsafe chars, plus some more. */
#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* Lookup table: one entry per 8-bit character code.  This code
   assumes ASCII and 8-bit chars (see comment above). */
const static unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */

  /* The entire non-ASCII range (128-255) is unsafe. */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
146 /* Decodes the forms %xy in a URL to the character the hexadecimal
147 code of which is xy. xy are hexadecimal digits from
148 [0123456789ABCDEF] (case-insensitive). If x or y are not
149 hex-digits or `%' precedes `\0', the sequence is inserted
153 decode_string (char *s)
/* In-place %xy decoding; writes through T while reading through H,
   so the result can only shrink.  NOTE(review): this extract omits
   lines (leading numbers are original line numbers) -- the loop
   header, braces and the final '\0' store are not visible here. */
155 char *t = s; /* t - tortoise */
156 char *h = s; /* h - hare */
167 /* Do nothing if '%' is not followed by two hex digits. */
168 if (!*(h + 1) || !*(h + 2)
169 || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
/* Both hex digits present: combine them into one byte. */
171 *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
178 /* Like encode_string, but return S if there are no unsafe chars. */
181 encode_string_maybe (const char *s)
/* Pass 1: count unsafe chars to size the output ("%XY" replaces one
   character, so each unsafe char adds two). */
188 for (p1 = s; *p1; p1++)
189 if (UNSAFE_CHAR (*p1))
190 addition += 2; /* Two more characters (hex digits) */
/* If addition is 0 the function presumably returns S unchanged (the
   early return is not visible in this extract). */
195 newlen = (p1 - s) + addition;
196 newstr = (char *)xmalloc (newlen + 1);
/* Pass 2: copy, expanding each unsafe char to a %XY triplet (the
   '%' store itself is not visible in this extract). */
202 if (UNSAFE_CHAR (*p1))
204 unsigned char c = *p1++;
206 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
207 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* Both passes must agree on the output length. */
213 assert (p2 - newstr == newlen);
218 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
219 given string, returning a malloc-ed %XX encoded string. */
222 encode_string (const char *s)
224 char *encoded = encode_string_maybe (s);
/* encode_string_maybe returns S itself when nothing needed quoting;
   since this function promises malloc-ed storage, it presumably
   copies S in that case (the copy is not visible in this extract). */
231 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
232 the old value of PTR is freed and PTR is made to point to the newly
233 allocated storage. */
235 #define ENCODE(ptr) do { \
236 char *e_new = encode_string_maybe (ptr); \
244 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
246 /* Decide whether to encode, decode, or pass through the char at P.
247 This used to be a macro, but it got a little too convoluted. */
248 static inline enum copy_method
249 decide_copy_method (const char *p)
/* This branch presumably runs only when *p == '%' (the guard is not
   visible in this extract): check for a complete %xx sequence. */
253 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
255 /* %xx sequence: decode it, unless it would decode to an
256 unsafe or a reserved char; in that case, leave it as
/* Compute the byte this sequence would decode to. */
258 char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
259 XCHAR_TO_XDIGIT (*(p + 2));
261 if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
262 return CM_PASSTHROUGH;
267 /* Garbled %.. sequence: encode `%'. */
/* Ordinary character: encode it if unsafe and not reserved... */
270 else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
/* ...otherwise leave it alone. */
273 return CM_PASSTHROUGH;
276 /* Translate a %-quoting (but possibly non-conformant) input string S
277 into a %-quoting (and conformant) output string. If no characters
278 are encoded or decoded, return the same string S; otherwise, return
279 a freshly allocated string with the new contents.
281 After a URL has been run through this function, the protocols that
282 use `%' as the quote character can use the resulting string as-is,
283 while those that don't call decode_string() to get to the intended
284 data. This function is also stable: after an input string is
285 transformed the first time, all further transformations of the
286 result yield the same result string.
288 Let's discuss why this function is needed.
290 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
291 space character would mess up the HTTP request, it needs to be
294 GET /abc%20def HTTP/1.0
296 So it appears that the unsafe chars need to be quoted, as with
297 encode_string. But what if we're requested to download
298 `abc%20def'? Remember that %-encoding is valid URL syntax, so what
299 the user meant was a literal space, and he was kind enough to quote
300 it. In that case, Wget should obviously leave the `%20' as is, and
301 send the same request as above. So in this case we may not call
304 But what if the requested URI is `abc%20 def'? If we call
305 encode_string, we end up with `/abc%2520%20def', which is almost
306 certainly not intended. If we don't call encode_string, we are
307 left with the embedded space and cannot send the request. What the
308 user meant was for Wget to request `/abc%20%20def', and this is
309 where reencode_string kicks in.
311 Wget used to solve this by first decoding %-quotes, and then
312 encoding all the "unsafe" characters found in the resulting string.
313 This was wrong because it didn't preserve certain URL special
314 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
315 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
316 whether we considered `+' reserved (it is). One of these results
317 is inevitable because by the second step we would lose information
318 on whether the `+' was originally encoded or not. Both results
319 were wrong because in CGI parameters + means space, while %2B means
320 literal plus. reencode_string correctly translates the above to
321 "a%2B+b", i.e. returns the original string.
323 This function uses an algorithm proposed by Anon Sricharoenchai:
325 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
328 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
331 ...except that this code conflates the two steps, and decides
332 whether to encode, decode, or pass through each character in turn.
333 The function still uses two passes, but their logic is the same --
334 the first pass exists merely for the sake of allocation. Another
335 small difference is that we include `+' to URL_RESERVED.
339 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
341 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
345 "foo bar" -> "foo%20bar"
346 "foo%20bar" -> "foo%20bar"
347 "foo %20bar" -> "foo%20%20bar"
348 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
349 "foo%25%20bar" -> "foo%25%20bar"
350 "foo%2%20bar" -> "foo%252%20bar"
351 "foo+bar" -> "foo+bar" (plus is reserved!)
352 "foo%2b+bar" -> "foo%2b+bar" */
355 reencode_string (const char *s)
/* See the long comment above: encode/decode decisions are made per
   character by decide_copy_method; two passes, the first only for
   sizing. */
361 int encode_count = 0;
362 int decode_count = 0;
364 /* First, pass through the string to see if there's anything to do,
365 and to calculate the new length. */
366 for (p1 = s; *p1; p1++)
368 switch (decide_copy_method (p1))
/* Nothing to change: hand back the input unmodified.  Callers test
   pointer identity against S to know whether to free the result. */
381 if (!encode_count && !decode_count)
382 /* The string is good as it is. */
383 return (char *)s; /* C const model sucks. */
386 /* Each encoding adds two characters (hex digits), while each
387 decoding removes two characters. */
388 newlen = oldlen + 2 * (encode_count - decode_count);
389 newstr = xmalloc (newlen + 1);
/* Second pass: perform the copy using the same per-char decisions. */
396 switch (decide_copy_method (p1))
/* CM_ENCODE: expand one unsafe byte to %XY. */
400 unsigned char c = *p1++;
402 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
403 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* CM_DECODE: collapse a %xy triplet back to a single byte. */
407 *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
408 + (XCHAR_TO_XDIGIT (*(p1 + 2))));
409 p1 += 3; /* skip %xx */
/* Sizing pass and copy pass must agree. */
416 assert (p2 - newstr == newlen);
420 /* Run PTR_VAR through reencode_string. If a new string is consed,
421 free PTR_VAR and make it point to the new storage. Obviously,
422 PTR_VAR needs to be an lvalue. */
424 #define REENCODE(ptr_var) do { \
425 char *rf_new = reencode_string (ptr_var); \
426 if (rf_new != ptr_var) \
433 /* Returns the scheme type if the scheme is supported, or
434 SCHEME_INVALID if not. */
436 url_scheme (const char *url)
/* supported_schemes is ordered to match enum url_scheme, so the
   array index doubles as the enum value. */
440 for (i = 0; supported_schemes[i].leading_string; i++)
441 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
442 strlen (supported_schemes[i].leading_string)))
/* A match on a disabled scheme is reported as invalid, too. */
444 if (supported_schemes[i].enabled)
445 return (enum url_scheme) i;
447 return SCHEME_INVALID;
450 return SCHEME_INVALID;
453 /* Return the number of characters needed to skip the scheme part of
454 the URL, e.g. `http://'. If no scheme is found, returns 0. */
456 url_skip_scheme (const char *url)
460 /* Skip the scheme name. We allow `-' and `+' because of `whois++',
462 while (ISALNUM (*p) || *p == '-' || *p == '+')
/* NOTE(review): the check for the ':' terminating the scheme name
   (and the return 0 when absent) is not visible in this extract. */
469 /* Skip "//" if found. */
470 if (*p == '/' && *(p + 1) == '/')
/* Presumably returns p - url after this point (not visible). */
476 /* Returns 1 if the URL begins with a scheme (supported or
477 unsupported), 0 otherwise. */
479 url_has_scheme (const char *url)
/* Same scheme-name alphabet as url_skip_scheme: alphanumerics plus
   '-' and '+'.  The trailing ':' test is not visible in this
   extract. */
482 while (ISALNUM (*p) || *p == '-' || *p == '+')
488 scheme_default_port (enum url_scheme scheme)
/* SCHEME must be a valid enum value -- it indexes supported_schemes
   directly, unchecked. */
490 return supported_schemes[scheme].default_port;
494 scheme_disable (enum url_scheme scheme)
/* Once disabled, url_scheme() reports this scheme as SCHEME_INVALID. */
496 supported_schemes[scheme].enabled = 0;
499 /* Skip the username and password, if present here. The function
500 should be called *not* with the complete URL, but with the part
501 right after the scheme.
503 If no username and password are found, return 0. */
505 url_skip_uname (const char *url)
509 /* Look for '@' that comes before '/' or '?'. */
510 p = (const char *)strpbrk (url, "/?@");
518 parse_uname (const char *str, int len, char **user, char **passwd)
523 /* Empty user name not allowed. */
526 colon = memchr (str, ':', len);
528 /* Empty user name again. */
533 int pwlen = len - (colon + 1 - str);
534 *passwd = xmalloc (pwlen + 1);
535 memcpy (*passwd, colon + 1, pwlen);
536 (*passwd)[pwlen] = '\0';
542 *user = xmalloc (len + 1);
543 memcpy (*user, str, len);
547 decode_string (*user);
549 decode_string (*passwd);
554 /* Used by main.c: detect URLs written using the "shorthand" URL forms
555 popularized by Netscape and NcFTP. HTTP shorthands look like this:
557 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
558 www.foo.com[:port] -> http://www.foo.com[:port]
560 FTP shorthands look like this:
562 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
563 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
565 If the URL needs not or cannot be rewritten, return NULL. */
567 rewrite_shorthand_url (const char *url)
571 if (url_has_scheme (url))
574 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
576 for (p = url; *p && *p != ':' && *p != '/'; p++)
586 /* If the characters after the colon and before the next slash
587 or end of string are all digits, it's HTTP. */
589 for (pp = p + 1; ISDIGIT (*pp); pp++)
591 if (digits > 0 && (*pp == '/' || *pp == '\0'))
594 /* Prepend "ftp://" to the entire URL... */
595 res = xmalloc (6 + strlen (url) + 1);
596 sprintf (res, "ftp://%s", url);
597 /* ...and replace ':' with '/'. */
598 res[6 + (p - url)] = '/';
605 /* Just prepend "http://" to what we have. */
606 res = xmalloc (7 + strlen (url) + 1);
607 sprintf (res, "http://%s", url);
612 static void parse_path PARAMS ((const char *, char **, char **));
/* Return a pointer to the first occurrence in S of any character from
   ACCEPT, or to S's terminating '\0' if none occurs.  Unlike strpbrk,
   this never returns NULL, which simplifies the parsing loops that
   use the result as an end-of-component marker. */
static char *
strpbrk_or_eos (const char *s, const char *accept)
{
  char *p = strpbrk (s, accept);
  if (!p)
    p = (char *)s + strlen (s);
  return p;
}
623 /* Turn STR into lowercase; return non-zero if a character was
627 lowercase_str (char *str)
/* In-place lowercasing.  NOTE(review): the loop and the
   changed-flag bookkeeping are not visible in this extract. */
634 *str = TOLOWER (*str);
639 static char *parse_errors[] = {
/* Each PE_* index macro is defined next to its message string so the
   two stay in sync; url_error() indexes this array by error code.
   NOTE(review): the message strings for PE_NO_ERROR, PE_EMPTY_HOST,
   PE_BAD_PORT_NUMBER and PE_INVALID_USER_NAME are missing from this
   extract. */
640 #define PE_NO_ERROR 0
642 #define PE_UNSUPPORTED_SCHEME 1
643 "Unsupported scheme",
644 #define PE_EMPTY_HOST 2
646 #define PE_BAD_PORT_NUMBER 3
648 #define PE_INVALID_USER_NAME 4
650 #define PE_UNTERMINATED_IPV6_ADDRESS 5
651 "Unterminated IPv6 numeric address",
652 #define PE_IPV6_NOT_SUPPORTED 6
653 "IPv6 addresses not supported",
654 #define PE_INVALID_IPV6_ADDRESS 7
655 "Invalid IPv6 numeric address"
658 #define SETERR(p, v) do { \
664 /* The following two functions were adapted from glibc. */
/* Return 1 if the text in [STR, END) is a valid dotted-quad IPv4
   address -- exactly four decimal octets, each in 0..255 -- and 0
   otherwise.  Adapted from glibc's inet_pton. */
static int
is_valid_ipv4_address (const char *str, const char *end)
{
  int saw_digit, octets;
  int val;

  saw_digit = 0;
  octets = 0;
  val = 0;

  while (str < end) {
    int ch = *str++;

    if (ch >= '0' && ch <= '9') {
      val = val * 10 + (ch - '0');

      /* An octet must fit in a byte. */
      if (val > 255)
	return 0;
      if (saw_digit == 0) {
	/* Starting a new octet; no more than four allowed. */
	if (++octets > 4)
	  return 0;
	saw_digit = 1;
      }
    } else if (ch == '.' && saw_digit == 1) {
      /* A dot is valid only after at least one digit, and only
	 between (not after) the four octets. */
      if (octets == 4)
	return 0;
      val = 0;
      saw_digit = 0;
    } else
      return 0;
  }
  /* Must have seen all four octets. */
  if (octets < 4)
    return 0;

  return 1;
}
704 is_valid_ipv6_address (const char *str, const char *end)
/* Validates the text in [str, end) as an IPv6 literal, including the
   "::" shorthand and an embedded trailing dotted-quad IPv4 part.
   Adapted from glibc (see comment above).  NOTE(review): many lines
   are missing from this extract -- the variable declarations and
   several branches are not visible. */
706 static const char xdigits[] = "0123456789abcdef";
719 /* Leading :: requires some special handling. */
723 if (str == end || *str != ':')
735 /* if ch is a number, add it to val. */
736 pch = strchr(xdigits, ch);
739 val |= (pch - xdigits);
746 /* if ch is a colon ... */
/* A colon with no preceding hex digit is part of a "::" gap
   (tracked elsewhere via colonp -- not fully visible here). */
749 if (saw_xdigit == 0) {
754 } else if (str == end) {
/* Each completed group consumes NS_INT16SZ bytes of the address. */
757 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
765 /* if ch is a dot ... */
/* A trailing dotted quad (e.g. ::ffff:1.2.3.4) needs NS_INADDRSZ
   bytes of room and must itself validate as IPv4. */
766 if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
767 is_valid_ipv4_address(curtok, end) == 1) {
776 if (saw_xdigit == 1) {
777 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
/* With "::" present the address must actually elide something;
   without it, all NS_IN6ADDRSZ bytes must be accounted for. */
782 if (colonp != NULL) {
783 if (tp == NS_IN6ADDRSZ)
788 if (tp != NS_IN6ADDRSZ)
797 Return a new struct url if successful, NULL on error. In case of
798 error, and if ERROR is not NULL, also set *ERROR to the appropriate
801 url_parse (const char *url, int *error)
805 int path_modified, host_modified;
807 enum url_scheme scheme;
809 const char *uname_b, *uname_e;
810 const char *host_b, *host_e;
811 const char *path_b, *path_e;
812 const char *params_b, *params_e;
813 const char *query_b, *query_e;
814 const char *fragment_b, *fragment_e;
817 char *user = NULL, *passwd = NULL;
821 scheme = url_scheme (url);
822 if (scheme == SCHEME_INVALID)
824 SETERR (error, PE_UNSUPPORTED_SCHEME);
828 url_encoded = reencode_string (url);
831 p += strlen (supported_schemes[scheme].leading_string);
833 p += url_skip_uname (p);
836 /* scheme://user:pass@host[:port]... */
839 /* We attempt to break down the URL into the components path,
840 params, query, and fragment. They are ordered like this:
842 scheme://host[:port][/path][;params][?query][#fragment] */
844 params_b = params_e = NULL;
845 query_b = query_e = NULL;
846 fragment_b = fragment_e = NULL;
852 /* Handle IPv6 address inside square brackets. Ideally we'd
853 just look for the terminating ']', but rfc2732 mandates
854 rejecting invalid IPv6 addresses. */
856 /* The address begins after '['. */
858 host_e = strchr (host_b, ']');
862 SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
867 /* Check if the IPv6 address is valid. */
868 if (!is_valid_ipv6_address(host_b, host_e))
870 SETERR (error, PE_INVALID_IPV6_ADDRESS);
874 /* Continue parsing after the closing ']'. */
877 SETERR (error, PE_IPV6_NOT_SUPPORTED);
883 p = strpbrk_or_eos (p, ":/;?#");
887 if (host_b == host_e)
889 SETERR (error, PE_EMPTY_HOST);
893 port = scheme_default_port (scheme);
896 const char *port_b, *port_e, *pp;
898 /* scheme://host:port/tralala */
902 p = strpbrk_or_eos (p, "/;?#");
905 if (port_b == port_e)
907 /* http://host:/whatever */
909 SETERR (error, PE_BAD_PORT_NUMBER);
913 for (port = 0, pp = port_b; pp < port_e; pp++)
917 /* http://host:12randomgarbage/blah */
919 SETERR (error, PE_BAD_PORT_NUMBER);
923 port = 10 * port + (*pp - '0');
931 p = strpbrk_or_eos (p, ";?#");
936 /* Path is not allowed not to exist. */
944 p = strpbrk_or_eos (p, "?#");
951 p = strpbrk_or_eos (p, "#");
954 /* Hack that allows users to use '?' (a wildcard character) in
955 FTP URLs without it being interpreted as a query string
957 if (scheme == SCHEME_FTP)
959 query_b = query_e = NULL;
972 if (uname_b != uname_e)
974 /* http://user:pass@host */
976 /* uname_b uname_e */
977 if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
979 SETERR (error, PE_INVALID_USER_NAME);
984 u = (struct url *)xmalloc (sizeof (struct url));
985 memset (u, 0, sizeof (*u));
988 u->host = strdupdelim (host_b, host_e);
993 u->path = strdupdelim (path_b, path_e);
994 path_modified = path_simplify (u->path);
995 parse_path (u->path, &u->dir, &u->file);
997 host_modified = lowercase_str (u->host);
1000 u->params = strdupdelim (params_b, params_e);
1002 u->query = strdupdelim (query_b, query_e);
1004 u->fragment = strdupdelim (fragment_b, fragment_e);
1006 if (path_modified || u->fragment || host_modified || path_b == path_e)
1008 /* If we suspect that a transformation has rendered what
1009 url_string might return different from URL_ENCODED, rebuild
1010 u->url using url_string. */
1011 u->url = url_string (u, 0);
1013 if (url_encoded != url)
1014 xfree ((char *) url_encoded);
1018 if (url_encoded == url)
1019 u->url = xstrdup (url);
1021 u->url = url_encoded;
1029 url_error (int error_code)
1031 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
1032 return parse_errors[error_code];
1036 parse_path (const char *quoted_path, char **dir, char **file)
1038 char *path, *last_slash;
1040 STRDUP_ALLOCA (path, quoted_path);
1041 decode_string (path);
1043 last_slash = strrchr (path, '/');
1046 *dir = xstrdup ("");
1047 *file = xstrdup (path);
1051 *dir = strdupdelim (path, last_slash);
1052 *file = xstrdup (last_slash + 1);
1056 /* Note: URL's "full path" is the path with the query string and
1057 params appended. The "fragment" (#foo) is intentionally ignored,
1058 but that might be changed. For example, if the original URL was
1059 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1060 the full path will be "/foo/bar/baz;bullshit?querystring". */
1062 /* Return the length of the full path, without the terminating
1066 full_path_length (const struct url *url)
1070 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1081 /* Write out the full path. */
1084 full_path_write (const struct url *url, char *where)
1086 #define FROB(el, chr) do { \
1087 char *f_el = url->el; \
1089 int l = strlen (f_el); \
1091 memcpy (where, f_el, l); \
1103 /* Public function for getting the "full path". E.g. if u->path is
1104 "foo/bar" and u->query is "param=value", full_path will be
1105 "/foo/bar?param=value". */
1108 url_full_path (const struct url *url)
1110 int length = full_path_length (url);
1111 char *full_path = (char *)xmalloc(length + 1);
1113 full_path_write (url, full_path);
1114 full_path[length] = '\0';
1119 /* Sync u->path and u->url with u->dir and u->file. */
1122 sync_path (struct url *url)
1130 newpath = xstrdup (url->file);
1135 int dirlen = strlen (url->dir);
1136 int filelen = strlen (url->file);
1138 newpath = xmalloc (dirlen + 1 + filelen + 1);
1139 memcpy (newpath, url->dir, dirlen);
1140 newpath[dirlen] = '/';
1141 memcpy (newpath + dirlen + 1, url->file, filelen);
1142 newpath[dirlen + 1 + filelen] = '\0';
1146 url->path = newpath;
1148 /* Synchronize u->url. */
1150 url->url = url_string (url, 0);
1153 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1154 This way we can sync u->path and u->url when they get changed. */
1157 url_set_dir (struct url *url, const char *newdir)
1160 url->dir = xstrdup (newdir);
1165 url_set_file (struct url *url, const char *newfile)
1168 url->file = xstrdup (newfile);
1173 url_free (struct url *url)
1179 FREE_MAYBE (url->params);
1180 FREE_MAYBE (url->query);
1181 FREE_MAYBE (url->fragment);
1182 FREE_MAYBE (url->user);
1183 FREE_MAYBE (url->passwd);
1192 get_urls_file (const char *file)
1194 struct file_memory *fm;
1195 struct urlpos *head, *tail;
1196 const char *text, *text_end;
1198 /* Load the file. */
1199 fm = read_file (file);
1202 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1205 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1209 text_end = fm->content + fm->length;
1210 while (text < text_end)
1212 const char *line_beg = text;
1213 const char *line_end = memchr (text, '\n', text_end - text);
1215 line_end = text_end;
1220 /* Strip whitespace from the beginning and end of line. */
1221 while (line_beg < line_end && ISSPACE (*line_beg))
1223 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
1226 if (line_end > line_beg)
1228 /* URL is in the [line_beg, line_end) region. */
1232 struct urlpos *entry;
1235 /* We must copy the URL to a zero-terminated string, and we
1236 can't use alloca because we're in a loop. *sigh*. */
1237 url_text = strdupdelim (line_beg, line_end);
1241 /* Merge opt.base_href with URL. */
1242 char *merged = uri_merge (opt.base_href, url_text);
1247 url = url_parse (url_text, &up_error_code);
1250 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1251 file, url_text, url_error (up_error_code));
1257 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1258 memset (entry, 0, sizeof (*entry));
1269 read_file_free (fm);
1273 /* Free the linked list of urlpos. */
1275 free_urlpos (struct urlpos *l)
1279 struct urlpos *next = l->next;
1282 FREE_MAYBE (l->local_name);
1288 /* Rotate FNAME opt.backups times */
1290 rotate_backups(const char *fname)
1292 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1293 char *from = (char *)alloca (maxlen);
1294 char *to = (char *)alloca (maxlen);
1298 if (stat (fname, &sb) == 0)
1299 if (S_ISREG (sb.st_mode) == 0)
1302 for (i = opt.backups; i > 1; i--)
1304 sprintf (from, "%s.%d", fname, i - 1);
1305 sprintf (to, "%s.%d", fname, i);
1306 /* #### This will fail on machines without the rename() system
1311 sprintf (to, "%s.%d", fname, 1);
1315 /* Create all the necessary directories for PATH (a file). Calls
1316 mkdirhier() internally. */
1318 mkalldirs (const char *path)
1325 p = path + strlen (path);
1326 for (; *p != '/' && p != path; p--);
1327 /* Don't create if it's just a file. */
1328 if ((p == path) && (*p != '/'))
1330 t = strdupdelim (path, p);
1331 /* Check whether the directory exists. */
1332 if ((stat (t, &st) == 0))
1334 if (S_ISDIR (st.st_mode))
1341 /* If the dir exists as a file name, remove it first. This
1342 is *only* for Wget to work with buggy old CERN http
1343 servers. Here is the scenario: When Wget tries to
1344 retrieve a directory without a slash, e.g.
1345 http://foo/bar (bar being a directory), CERN server will
1346 not redirect it too http://foo/bar/ -- it will generate a
1347 directory listing containing links to bar/file1,
1348 bar/file2, etc. Wget will lose because it saves this
1349 HTML listing to a file `bar', so it cannot create the
1350 directory. To work around this, if the file of the same
1351 name exists, we just remove it and create the directory
1353 DEBUGP (("Removing %s because of directory danger!\n", t));
1357 res = make_directory (t);
1359 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1365 count_slashes (const char *s)
1374 /* Return the path name of the URL-equivalent file name, with a
1375 remote-like structure of directories. */
1377 mkstruct (const struct url *u)
1380 char *res, *dirpref;
1385 char *ptr = u->dir + (*u->dir == '/');
1386 int slash_count = 1 + count_slashes (ptr);
1387 int cut = MINVAL (opt.cut_dirs, slash_count);
1388 for (; cut && *ptr; ptr++)
1391 STRDUP_ALLOCA (dir, ptr);
1394 dir = u->dir + (*u->dir == '/');
1396 /* Check for the true name (or at least a consistent name for saving
1397 to directory) of HOST, reusing the hlist if possible. */
1398 if (opt.add_hostdir)
1400 /* Add dir_prefix and hostname (if required) to the beginning of
1402 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1404 + 1 + numdigit (u->port)
1406 if (!DOTP (opt.dir_prefix))
1407 sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1409 strcpy (dirpref, u->host);
1411 if (u->port != scheme_default_port (u->scheme))
1413 int len = strlen (dirpref);
1415 number_to_string (dirpref + len + 1, u->port);
1418 else /* not add_hostdir */
1420 if (!DOTP (opt.dir_prefix))
1421 dirpref = opt.dir_prefix;
1426 /* If there is a prefix, prepend it. */
1429 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1430 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1435 if (l && dir[l - 1] == '/')
1439 file = "index.html";
1443 /* Finally, construct the full name. */
1444 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1446 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1451 /* Compose a file name out of BASE, an unescaped file name, and QUERY,
1452 an escaped query string. The trick is to make sure that unsafe
1453 characters in BASE are escaped, and that slashes in QUERY are also
1457 compose_file_name (char *base, char *query)
1463 /* Copy BASE to RESULT and encode all unsafe characters. */
1465 while (*from && to - result < sizeof (result))
1467 if (UNSAFE_CHAR (*from))
1469 unsigned char c = *from++;
1471 *to++ = XDIGIT_TO_XCHAR (c >> 4);
1472 *to++ = XDIGIT_TO_XCHAR (c & 0xf);
1478 if (query && to - result < sizeof (result))
1482 /* Copy QUERY to RESULT and encode all '/' characters. */
1484 while (*from && to - result < sizeof (result))
1498 if (to - result < sizeof (result))
1501 /* Truncate input which is too long, presumably due to a huge
1503 result[sizeof (result) - 1] = '\0';
1505 return xstrdup (result);
1508 /* Create a unique filename, corresponding to a given URL. Calls
1509 mkstruct if necessary. Does *not* actually create any directories. */
1511 url_filename (const struct url *u)
1515 char *query = u->query && *u->query ? u->query : NULL;
1519 char *base = mkstruct (u);
1520 file = compose_file_name (base, query);
1525 char *base = *u->file ? u->file : "index.html";
1526 file = compose_file_name (base, query);
1528 /* Check whether the prefix directory is something other than "."
1529 before prepending it. */
1530 if (!DOTP (opt.dir_prefix))
1532 /* #### should just realloc FILE and prepend dir_prefix. */
1533 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1534 + 1 + strlen (file) + 1);
1535 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1541 /* DOS-ish file systems don't like `%' signs in them; we change it
1546 for (p = file; *p; p++)
1550 #endif /* WINDOWS */
1552 /* Check the cases in which the unique extensions are not used:
1553 1) Clobbering is turned off (-nc).
1554 2) Retrieval with regetting.
1555 3) Timestamping is used.
1556 4) Hierarchy is built.
1558 The exception is the case when file does exist and is a
1559 directory (actually support for bad httpd-s). */
1560 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1561 && !(file_exists_p (file) && !file_non_directory_p (file)))
1564 /* Find a unique name. */
1565 name = unique_name (file);
1570 /* Return the length of URL's path. Path is considered to be
1571 terminated by one of '?', ';', '#', or by the end of the
1574 path_length (const char *url)
/* strpbrk_or_eos never returns NULL, so q - url is always the offset
   of the first terminator (or of the trailing '\0'). */
1576 const char *q = strpbrk_or_eos (url, "?;#");
1580 /* Find the last occurrence of character C in the range [b, e), or
1581 NULL, if none are present. This is equivalent to strrchr(b, c),
1582 except that it accepts an END argument instead of requiring the
1583 string to be zero-terminated. Why is there no memrchr()? */
1585 find_last_char (const char *b, const char *e, char c)
1593 /* Resolve "." and ".." elements of PATH by destructively modifying
1594 PATH. "." is resolved by removing that path element, and ".." is
1595 resolved by removing the preceding path element. Leading and
1596 trailing slashes are preserved.
1598 Return non-zero if any changes have been made.
1600 For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
1601 test examples are provided below. If you change anything in this
1602 function, run test_path_simplify to make sure you haven't broken a
1605 A previous version of this function was based on path_simplify()
1606 from GNU Bash, but it has been rewritten for Wget 1.8.1. */
1609 path_simplify (char *path)
1615 ++path; /* preserve the leading '/'. */
1618 end = p + strlen (p) + 1; /* position past the terminating zero. */
1623 /* P should point to the beginning of a path element. */
1625 if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
1627 /* Handle "./foo" by moving "foo" two characters to the
1629 if (*(p + 1) == '/')
1632 memmove (p, p + 2, end - p);
1643 else if (*p == '.' && *(p + 1) == '.'
1644 && (*(p + 2) == '/' || *(p + 2) == '\0'))
1646 /* Handle "../foo" by moving "foo" one path element to the
1648 char *b = p; /* not p-1 because P can equal PATH */
1650 /* Backtrack by one path element, but not past the beginning
1653 /* foo/bar/../baz */
1659 /* Move backwards until B hits the beginning of the
1660 previous path element or the beginning of path. */
1661 for (--b; b > path && *(b - 1) != '/'; b--)
1666 if (*(p + 2) == '/')
1668 memmove (b, p + 3, end - (p + 3));
1682 /* Remove empty path elements. Not mandated by rfc1808 et
1683 al, but empty path elements are not all that useful, and
1684 the rest of Wget might not deal with them well. */
1694 memmove (p, q, end - q);
1699 /* Skip to the next path element. */
1700 while (*p && *p != '/')
1705 /* Make sure P points to the beginning of the next path element,
1706 which is location after the slash. */
1713 /* Resolve the result of "linking" a base URI (BASE) to a
1714 link-specified URI (LINK).
1716 Either of the URIs may be absolute or relative, complete with the
1717 host name, or path only. This tries to behave "reasonably" in all
1718 foreseeable cases. It employs little specific knowledge about
1719 schemes or URL-specific stuff -- it just works on strings.
1721 The parameter LINKLENGTH is useful if LINK is not zero-terminated.
1722 See uri_merge for a gentler interface to this functionality.
1724 Perhaps this function should call path_simplify so that the callers
1725 don't have to call url_parse unconditionally. */
/* Returns a freshly xmalloc'ed string CONSTR; the caller owns and
   must free it.  NOTE(review): this extract is line-sampled; the
   declarations of CONSTR and several braces are not visible. */
1727 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
/* END marks the end of BASE's path component (before '?'/'#'),
   per path_length. */
1733 const char *end = base + path_length (base);
1737 /* Empty LINK points back to BASE, query string and all. */
1738 constr = xstrdup (base);
1740 else if (*link == '?')
1742 /* LINK points to the same location, but changes the query
1743 string. Examples: */
1744 /* uri_merge("path", "?new") -> "path?new" */
1745 /* uri_merge("path?foo", "?new") -> "path?new" */
1746 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1747 /* uri_merge("path#foo", "?new") -> "path?new" */
1748 int baselength = end - base;
1749 constr = xmalloc (baselength + linklength + 1);
1750 memcpy (constr, base, baselength);
1751 memcpy (constr + baselength, link, linklength);
1752 constr[baselength + linklength] = '\0';
1754 else if (*link == '#')
1756 /* uri_merge("path", "#new") -> "path#new" */
1757 /* uri_merge("path#foo", "#new") -> "path#new" */
1758 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1759 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
/* Keep everything up to (but not including) an existing fragment. */
1761 const char *end1 = strchr (base, '#');
1763 end1 = base + strlen (base);
1764 baselength = end1 - base;
1765 constr = xmalloc (baselength + linklength + 1);
1766 memcpy (constr, base, baselength);
1767 memcpy (constr + baselength, link, linklength);
1768 constr[baselength + linklength] = '\0';
1770 else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
1772 /* LINK begins with "//" and so is a net path: we need to
1773 replace everything after (and including) the double slash
1776 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1777 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1778 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1782 const char *start_insert;
1784 /* Look for first slash. */
1785 slash = memchr (base, '/', end - base);
1786 /* If found slash and it is a double slash, then replace
1787 from this point, else default to replacing from the
1789 if (slash && *(slash + 1) == '/')
1790 start_insert = slash;
1792 start_insert = base;
1794 span = start_insert - base;
1795 constr = (char *)xmalloc (span + linklength + 1);
1797 memcpy (constr, base, span);
1798 memcpy (constr + span, link, linklength);
1799 constr[span + linklength] = '\0';
1801 else if (*link == '/')
1803 /* LINK is an absolute path: we need to replace everything
1804 after (and including) the FIRST slash with LINK.
1806 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1807 "/qux/xyzzy", our result should be
1808 "http://host/qux/xyzzy". */
1811 const char *start_insert = NULL; /* for gcc to shut up. */
1812 const char *pos = base;
1813 int seen_slash_slash = 0;
1814 /* We're looking for the first slash, but want to ignore
1817 slash = memchr (pos, '/', end - pos);
1818 if (slash && !seen_slash_slash)
1819 if (*(slash + 1) == '/')
1822 seen_slash_slash = 1;
1826 /* At this point, SLASH is the location of the first / after
1827 "//", or the first slash altogether. START_INSERT is the
1828 pointer to the location where LINK will be inserted. When
1829 examining the last two examples, keep in mind that LINK
1832 if (!slash && !seen_slash_slash)
1833 /* example: "foo" */
1835 start_insert = base;
1836 else if (!slash && seen_slash_slash)
1837 /* example: "http://foo" */
1840 else if (slash && !seen_slash_slash)
1841 /* example: "foo/bar" */
1843 start_insert = base;
1844 else if (slash && seen_slash_slash)
1845 /* example: "http://something/" */
1847 start_insert = slash;
1849 span = start_insert - base;
1850 constr = (char *)xmalloc (span + linklength + 1);
1852 memcpy (constr, base, span);
1854 memcpy (constr + span, link, linklength);
1855 constr[span + linklength] = '\0';
1859 /* LINK is a relative URL: we need to replace everything
1860 after last slash (possibly empty) with LINK.
1862 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1863 our result should be "whatever/foo/qux/xyzzy". */
1864 int need_explicit_slash = 0;
1866 const char *start_insert;
1867 const char *last_slash = find_last_char (base, end, '/');
1870 /* No slash found at all. Append LINK to what we have,
1871 but we'll need a slash as a separator.
1873 Example: if base == "foo" and link == "qux/xyzzy", then
1874 we cannot just append link to base, because we'd get
1875 "fooqux/xyzzy", whereas what we want is
1878 To make sure the / gets inserted, we set
1879 need_explicit_slash to 1. We also set start_insert
1880 to end + 1, so that the length calculations work out
1881 correctly for one more (slash) character. Accessing
1882 that character is fine, since it will be the
1883 delimiter, '\0' or '?'. */
1884 /* example: "foo?..." */
1885 /* ^ ('?' gets changed to '/') */
1886 start_insert = end + 1;
1887 need_explicit_slash = 1;
1889 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1891 /* example: http://host" */
1893 start_insert = end + 1;
1894 need_explicit_slash = 1;
1898 /* example: "whatever/foo/bar" */
1900 start_insert = last_slash + 1;
1903 span = start_insert - base;
1904 constr = (char *)xmalloc (span + linklength + 1);
1906 memcpy (constr, base, span);
/* Overwrite the delimiter character copied above with the
   separating slash. */
1907 if (need_explicit_slash)
1908 constr[span - 1] = '/';
1910 memcpy (constr + span, link, linklength);
1911 constr[span + linklength] = '\0';
1914 else /* !no_scheme */
/* LINK already carries a scheme: it is absolute, so BASE is
   ignored entirely. */
1916 constr = strdupdelim (link, link + linklength);
1921 /* Merge BASE with LINK and return the resulting URI. This is an
1922 interface to uri_merge_1 that assumes that LINK is a
1923 zero-terminated string. */
/* Returns freshly allocated memory (see uri_merge_1); caller frees. */
1925 uri_merge (const char *base, const char *link)
1927 return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
/* Append string S at pointer P and advance P past it.  Relies on the
   caller having sized the destination buffer in advance; no bounds
   check is performed here.  NOTE(review): the closing lines of this
   macro are not visible in this extract. */
1930 #define APPEND(p, s) do { \
1931 int len = strlen (s); \
1932 memcpy (p, s, len); \
1936 /* Use this instead of password when the actual password is supposed
1937 to be hidden. We intentionally use a generic string without giving
1938 away the number of characters in the password, like previous
1940 #define HIDDEN_PASSWORD "*password*"
1942 /* Recreate the URL string from the data in URL.
1944 If HIDE is non-zero (as it is when we're calling this on a URL we
1945 plan to print, but not when calling it to canonicalize a URL for
1946 use within the program), password will be hidden. Unsafe
1947 characters in the URL will be quoted. */
/* Returns a freshly xmalloc'ed string; the caller owns and frees it.
   NOTE(review): this extract is line-sampled; braces and some
   statements are not visible. */
1950 url_string (const struct url *url, int hide_password)
1954 char *quoted_user = NULL, *quoted_passwd = NULL;
1956 int scheme_port = supported_schemes[url->scheme].default_port;
1957 char *scheme_str = supported_schemes[url->scheme].leading_string;
1958 int fplen = full_path_length (url);
1960 int brackets_around_host = 0;
1962 assert (scheme_str != NULL);
1964 /* Make sure the user name and password are quoted. */
1967 quoted_user = encode_string_maybe (url->user);
/* When hiding, substitute the static placeholder instead of the
   encoded password; the cleanup below deliberately never frees it. */
1971 quoted_passwd = HIDDEN_PASSWORD;
1973 quoted_passwd = encode_string_maybe (url->passwd);
/* A ':' in the host indicates an IPv6 literal, which must be
   written inside [brackets] to keep it distinct from the port. */
1977 if (strchr (url->host, ':'))
1978 brackets_around_host = 1;
/* Precompute the exact output size so the assert below can verify
   that the APPENDs filled the buffer exactly. */
1980 size = (strlen (scheme_str)
1981 + strlen (url->host)
1982 + (brackets_around_host ? 2 : 0)
/* The port is only written when it differs from the scheme's
   default. */
1985 if (url->port != scheme_port)
1986 size += 1 + numdigit (url->port);
1989 size += 1 + strlen (quoted_user);
1991 size += 1 + strlen (quoted_passwd);
1994 p = result = xmalloc (size);
1996 APPEND (p, scheme_str);
1999 APPEND (p, quoted_user);
2003 APPEND (p, quoted_passwd);
2008 if (brackets_around_host)
2010 APPEND (p, url->host);
2011 if (brackets_around_host)
2013 if (url->port != scheme_port)
2016 p = number_to_string (p, url->port);
2019 full_path_write (url, p);
2023 assert (p - result == size);
/* encode_string_maybe may return its argument unchanged; only free
   when a new string was actually allocated. */
2025 if (quoted_user && quoted_user != url->user)
2026 xfree (quoted_user);
/* !hide_password guards against freeing the static HIDDEN_PASSWORD
   literal assigned above. */
2027 if (quoted_passwd && !hide_password
2028 && quoted_passwd != url->passwd)
2029 xfree (quoted_passwd);
2034 /* Return the URL of the proxy appropriate for url U. */
/* Returns NULL when no proxy applies (host matched no_proxy, or no
   proxy configured).  May return a pointer to static storage (see
   rewritten_storage below), so callers must not free or cache the
   result across calls.  NOTE(review): extract is line-sampled; the
   switch header and PROXY declaration are not visible. */
2036 getproxy (struct url *u)
2039 char *rewritten_url;
2040 static char rewritten_storage[1024];
2044 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
/* Command-line/wgetrc options take precedence over the conventional
   environment variables. */
2050 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
2054 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
2058 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
2060 case SCHEME_INVALID:
2063 if (!proxy || !*proxy)
2066 /* Handle shorthands. `rewritten_storage' is a kludge to allow
2067 getproxy() to return static storage. */
2068 rewritten_url = rewrite_shorthand_url (proxy);
/* The copy is explicitly NUL-terminated on the next line, so the
   strncpy truncation hazard is handled. */
2071 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
2072 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
2073 proxy = rewritten_storage;
2079 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns nonzero when HOST should go through the proxy, i.e. when it
   does NOT suffix-match any entry in the NO_PROXY vector. */
2081 no_proxy_match (const char *host, const char **no_proxy)
2086 return !sufmatch (no_proxy, host);
2089 /* Support for converting links for local viewing in downloaded HTML
2090 files. This should be moved to another file, because it has
2091 nothing to do with processing URLs. */
/* Forward declarations for the static helpers defined below. */
2093 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
2094 static const char *replace_attr PARAMS ((const char *, int, FILE *,
2096 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
2097 const char *, int));
2098 static char *local_quote_string PARAMS ((const char *));
2100 /* Change the links in one HTML file. LINKS is a list of links in the
2101 document, along with their positions and the desired direction of
/* Rewrites FILE in place: reads it into memory, unlinks it, then
   re-creates it while substituting each convertible link.
   NOTE(review): extract is line-sampled; braces, `p' initialization
   and the counter increments are not visible. */
2104 convert_links (const char *file, struct urlpos *links)
2106 struct file_memory *fm;
2109 downloaded_file_t downloaded_file_return;
2111 struct urlpos *link;
2112 int to_url_count = 0, to_file_count = 0;
2114 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
2117 /* First we do a "dry run": go through the list L and see whether
2118 any URL needs to be converted in the first place. If not, just
2119 leave the file alone. */
2121 struct urlpos *dry = links;
2122 for (dry = links; dry; dry = dry->next)
2123 if (dry->convert != CO_NOCONVERT)
2127 logputs (LOG_VERBOSE, _("nothing to do.\n"));
2132 fm = read_file (file);
2135 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2136 file, strerror (errno));
/* Back up the original before we clobber it, if -K was given. */
2140 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
2141 if (opt.backup_converted && downloaded_file_return)
2142 write_backup_file (file, downloaded_file_return);
2144 /* Before opening the file for writing, unlink the file. This is
2145 important if the data in FM is mmaped. In such case, nulling the
2146 file, which is what fopen() below does, would make us read all
2147 zeroes from the mmaped region. */
2148 if (unlink (file) < 0 && errno != ENOENT)
2150 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2151 file, strerror (errno));
2152 read_file_free (fm);
2155 /* Now open the file for writing. */
2156 fp = fopen (file, "wb");
2159 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2160 file, strerror (errno));
2161 read_file_free (fm);
2165 /* Here we loop through all the URLs in file, replacing those of
2166 them that are downloaded with relative references. */
2168 for (link = links; link; link = link->next)
2170 char *url_start = fm->content + link->pos;
/* Sanity check: a position past the buffer means the positions no
   longer match the file contents. */
2172 if (link->pos >= fm->length)
2174 DEBUGP (("Something strange is going on. Please investigate."));
2177 /* If the URL is not to be converted, skip it. */
2178 if (link->convert == CO_NOCONVERT)
2180 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2184 /* Echo the file contents, up to the offending URL's opening
2185 quote, to the outfile. */
2186 fwrite (p, 1, url_start - p, fp);
2189 switch (link->convert)
2191 case CO_CONVERT_TO_RELATIVE:
2192 /* Convert absolute URL to relative. */
2194 char *newname = construct_relative (file, link->local_name);
2195 char *quoted_newname = local_quote_string (newname);
/* Refresh-style <meta> attributes need the "N; URL=" prefix, hence
   the hack variant. */
2197 if (!link->link_refresh_p)
2198 p = replace_attr (p, link->size, fp, quoted_newname);
2200 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2201 link->refresh_timeout);
2203 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2204 link->url->url, newname, link->pos, file));
2206 xfree (quoted_newname);
2210 case CO_CONVERT_TO_COMPLETE:
2211 /* Convert the link to absolute URL. */
2213 char *newlink = link->url->url;
2214 char *quoted_newlink = html_quote_string (newlink);
2216 if (!link->link_refresh_p)
2217 p = replace_attr (p, link->size, fp, quoted_newlink);
2219 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2220 link->refresh_timeout);
2222 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2223 newlink, link->pos, file));
2224 xfree (quoted_newlink);
2228 case CO_NULLIFY_BASE:
2229 /* Change the base href to "". */
2230 p = replace_attr (p, link->size, fp, "");
2238 /* Output the rest of the file. */
2239 if (p - fm->content < fm->length)
2240 fwrite (p, 1, fm->length - (p - fm->content), fp);
2242 read_file_free (fm);
/* Summarize how many links were converted each way.  NOTE(review):
   the increments of these counters are in the elided lines. */
2244 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
2247 /* Construct and return a malloced copy of the relative link from two
2248 pieces of information: local name S1 of the referring file and
2249 local name S2 of the referred file.
2251 So, if S1 is "jagor.srce.hr/index.html" and S2 is
2252 "jagor.srce.hr/images/news.gif", the function will return
2255 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
2256 "fly.cc.fer.hr/images/fly.gif", the function will return
2257 "../images/fly.gif".
2259 Caveats: S1 should not begin with `/', unless S2 also begins with
2260 '/'. S1 should not contain things like ".." and such --
2261 construct_relative ("fly/ioccc/../index.html",
2262 "fly/images/fly.gif") will fail. (A workaround is to call
2263 something like path_simplify() on S1). */
/* Caller owns and frees the returned string.  NOTE(review): extract
   is line-sampled; the initialization of CNT and some loop bodies are
   not visible. */
2265 construct_relative (const char *s1, const char *s2)
2267 int i, cnt, sepdirs1;
/* If S2 is absolute, it is returned verbatim -- no relativization. */
2271 return xstrdup (s2);
2272 /* S1 should *not* be absolute, if S2 wasn't. */
2273 assert (*s1 != '/');
2275 /* Skip the directories common to both strings. */
2278 while (s1[i] && s2[i]
2283 if (s1[i] == '/' && s2[i] == '/')
/* SEPDIRS1 counts the directory levels left in S1 past the common
   prefix; each contributes one "../". */
2288 for (sepdirs1 = 0; s1[i]; i++)
2291 /* Now, construct the file as of:
2292 - ../ repeated sepdirs1 time
2293 - all the non-mutual directories of S2. */
2294 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
2295 for (i = 0; i < sepdirs1; i++)
2296 memcpy (res + 3 * i, "../", 3);
2297 strcpy (res + 3 * i, s2 + cnt);
/* Save FILE to FILE.orig (or FILE with "html" replaced by "orig")
   before convert_links overwrites it, remembering in a static list
   which files have already been backed up so a second conversion pass
   does not clobber the pristine copy. */
2302 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2304 /* Rather than just writing over the original .html file with the
2305 converted version, save the former to *.orig. Note we only do
2306 this for files we've _successfully_ downloaded, so we don't
2307 clobber .orig files sitting around from previous invocations. */
2309 /* Construct the backup filename as the original name plus ".orig". */
2310 size_t filename_len = strlen(file);
2311 char* filename_plus_orig_suffix;
2312 boolean already_wrote_backup_file = FALSE;
2313 slist* converted_file_ptr;
2314 static slist* converted_files = NULL;
2316 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2318 /* Just write "orig" over "html". We need to do it this way
2319 because when we're checking to see if we've downloaded the
2320 file before (to see if we can skip downloading it), we don't
2321 know if it's a text/html file. Therefore we don't know yet
2322 at that stage that -E is going to cause us to tack on
2323 ".html", so we need to compare vs. the original URL plus
2324 ".orig", not the original URL plus ".html.orig". */
/* The -4 offset assumes FILE ends in "html" (this branch is only
   taken when -E appended ".html"). */
2325 filename_plus_orig_suffix = alloca (filename_len + 1);
2326 strcpy(filename_plus_orig_suffix, file);
2327 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2329 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2331 /* Append ".orig" to the name. */
2332 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2333 strcpy(filename_plus_orig_suffix, file);
2334 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2337 /* We can get called twice on the same URL thanks to the
2338 convert_all_links() call in main(). If we write the .orig file
2339 each time in such a case, it'll end up containing the first-pass
2340 conversion, not the original file. So, see if we've already been
2341 called on this file. */
2342 converted_file_ptr = converted_files;
2343 while (converted_file_ptr != NULL)
2344 if (strcmp(converted_file_ptr->string, file) == 0)
2346 already_wrote_backup_file = TRUE;
2350 converted_file_ptr = converted_file_ptr->next;
2352 if (!already_wrote_backup_file)
2354 /* Rename <file> to <file>.orig before former gets written over. */
2355 if (rename(file, filename_plus_orig_suffix) != 0)
2356 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2357 file, filename_plus_orig_suffix, strerror (errno));
2359 /* Remember that we've already written a .orig backup for this file.
2360 Note that we never free this memory since we need it till the
2361 convert_all_links() call, which is one of the last things the
2362 program does before terminating. BTW, I'm not sure if it would be
2363 safe to just set 'converted_file_ptr->string' to 'file' below,
2364 rather than making a copy of the string... Another note is that I
2365 thought I could just add a field to the urlpos structure saying
2366 that we'd written a .orig file for this URL, but that didn't work,
2367 so I had to make this separate list.
2368 -- Dan Harkless <wget@harkless.org>
2370 This [adding a field to the urlpos structure] didn't work
2371 because convert_file() is called from convert_all_links at
2372 the end of the retrieval with a freshly built new urlpos
2374 -- Hrvoje Niksic <hniksic@arsdigita.com>
2376 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2377 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2378 converted_file_ptr->next = converted_files;
2379 converted_files = converted_file_ptr;
2383 static int find_fragment PARAMS ((const char *, int, const char **,
2386 /* Replace an attribute's original text with NEW_TEXT. */
/* P points at the attribute value (possibly quoted) of length SIZE in
   the input buffer; the replacement, preserving any #fragment found
   in the original value, is written to FP.  Returns the position in
   the input buffer just past the consumed attribute (per its use in
   convert_links).  NOTE(review): extract is line-sampled. */
2389 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2392 char quote_char = '\"'; /* use "..." for quoting, unless the
2393 original value is quoted, in which
2394 case reuse its quoting char. */
2395 const char *frag_beg, *frag_end;
2397 /* Structure of our string is:
2398 "...old-contents..."
2399 <--- size ---> (with quotes)
2402 <--- size --> (no quotes) */
2404 if (*p == '\"' || *p == '\'')
2409 size -= 2; /* disregard opening and closing quote */
2411 putc (quote_char, fp);
2412 fputs (new_text, fp);
2414 /* Look for fragment identifier, if any. */
2415 if (find_fragment (p, size, &frag_beg, &frag_end))
2416 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2420 putc (quote_char, fp);
2425 /* The same as REPLACE_ATTR, but used when replacing
2426 <meta http-equiv=refresh content="new_text"> because we need to
2427 append "timeout_value; URL=" before the next_text. */
2430 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2431 const char *new_text, int timeout)
/* Stack-allocated scratch: numdigit(timeout) digits plus the
   "; URL=" glue and NEW_TEXT (exact sizing is in the elided lines). */
2434 char *new_with_timeout = (char *)alloca (numdigit (timeout)
2438 sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2440 return replace_attr (p, size, fp, new_with_timeout);
2443 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2444 preceded by '&'. If the character is not found, return zero. If
2445 the character is found, return 1 and set BP and EP to point to the
2446 beginning and end of the region.
2448 This is used for finding the fragment identifiers in URLs. */
/* The '&' exclusion avoids mistaking SGML entities like "&#38;" for a
   fragment.  NOTE(review): the loop body is in the elided lines. */
2451 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2453 const char *end = beg + size;
2455 for (; beg < end; beg++)
2477 /* Quote FILE for use as local reference to an HTML file.
2479 We quote ? as %3F to avoid passing part of the file name as the
2480 parameter when browsing the converted file through HTTP. However,
2481 it is safe to do this only when `--html-extension' is turned on.
2482 This is because converting "index.html?foo=bar" to
2483 "index.html%3Ffoo=bar" would break local browsing, as the latter
2484 isn't even recognized as an HTML file! However, converting
2485 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2486 safe for both local and HTTP-served browsing. */
2489 local_quote_string (const char *file)
2491 const char *file_sans_qmark;
/* Without -E, leave '?' alone and only HTML-quote the name. */
2494 if (!opt.html_extension)
2495 return html_quote_string (file);
2497 qm = count_char (file, '?');
2501 const char *from = file;
2504 /* qm * 2 because we replace each question mark with "%3F",
2505 i.e. replace one char with three, hence two more. */
2506 int fsqlen = strlen (file) + qm * 2;
/* NEWNAME lives on the stack; it is only used as input to
   html_quote_string before this function returns, never escapes. */
2508 to = newname = (char *)alloca (fsqlen + 1);
2509 for (; *from; from++)
2520 assert (to - newname == fsqlen);
2523 file_sans_qmark = newname;
2526 file_sans_qmark = file;
2528 return html_quote_string (file_sans_qmark);
2531 /* We're storing "modes" of type downloaded_file_t in the hash table.
2532 However, our hash tables only accept pointers for keys and values.
2533 So when we need a pointer, we use the address of a
2534 downloaded_file_t variable of static storage. */
/* Maps MODE to the address of a static variable holding that same
   value, so the address can serve as a hash-table value. */
2536 static downloaded_file_t *
2537 downloaded_mode_to_ptr (downloaded_file_t mode)
2539 static downloaded_file_t
2540 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2541 v2 = FILE_DOWNLOADED_NORMALLY,
2542 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2543 v4 = CHECK_FOR_FILE;
2547 case FILE_NOT_ALREADY_DOWNLOADED:
2549 case FILE_DOWNLOADED_NORMALLY:
2551 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2553 case CHECK_FOR_FILE:
2559 /* This should really be merged with dl_file_url_map and
2560 downloaded_html_files in recur.c. This was originally a list, but
2561 I changed it to a hash table beause it was actually taking a lot of
2562 time to find things in it. */
/* Keys are local filenames (xstrdup'ed), values are pointers from
   downloaded_mode_to_ptr.  Created lazily on first insert. */
2564 static struct hash_table *downloaded_files_hash;
2566 /* Remembers which files have been downloaded. In the standard case, should be
2567 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2568 download successfully (i.e. not for ones we have failures on or that we skip
2571 When we've downloaded a file and tacked on a ".html" extension due to -E,
2572 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2573 FILE_DOWNLOADED_NORMALLY.
2575 If you just want to check if a file has been previously added without adding
2576 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2577 with local filenames, not remote URLs. */
2579 downloaded_file (downloaded_file_t mode, const char *file)
2581 downloaded_file_t *ptr;
/* Query-only path: never allocates the table or inserts. */
2583 if (mode == CHECK_FOR_FILE)
2585 if (!downloaded_files_hash)
2586 return FILE_NOT_ALREADY_DOWNLOADED;
2587 ptr = hash_table_get (downloaded_files_hash, file);
2589 return FILE_NOT_ALREADY_DOWNLOADED;
2593 if (!downloaded_files_hash)
2594 downloaded_files_hash = make_string_hash_table (0);
2596 ptr = hash_table_get (downloaded_files_hash, file);
2600 ptr = downloaded_mode_to_ptr (mode);
/* NOTE(review): &ptr stores the address of the local PTR rather than
   the pointer value itself -- looks suspicious; confirm against the
   full source (the canonical code stores PTR). */
2601 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2603 return FILE_NOT_ALREADY_DOWNLOADED;
/* hash_table_map callback: frees a key of downloaded_files_hash
   (body is in the elided lines). */
2607 df_free_mapper (void *key, void *value, void *ignored)
/* Release the table and its keys; safe to call when the table was
   never created. */
2614 downloaded_files_free (void)
2616 if (downloaded_files_hash)
2618 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2619 hash_table_destroy (downloaded_files_hash);
2620 downloaded_files_hash = NULL;
2624 /* Return non-zero if scheme a is similar to scheme b.
2626 Schemes are similar if they are equal. If SSL is supported, schemes
2627 are also similar if one is http (SCHEME_HTTP) and the other is https
2630 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
/* http and https count as the same scheme in either direction. */
2635 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2636 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2643 /* Debugging and testing support for path_simplify. */
2645 /* Debug: run path_simplify on PATH and return the result in a new
2646 string. Useful for calling from the debugger. */
/* Caller frees the returned copy; PATH itself is left untouched. */
2650 char *copy = xstrdup (path);
2651 path_simplify (copy);
/* Run path_simplify on a copy of TEST and report mismatches against
   EXPECTED_RESULT and EXPECTED_CHANGE (1 if simplification should
   modify the path, 0 if it should leave it alone).

   Fix over the previous version: the two diagnostic messages were
   swapped.  When `modified != expected_change' and EXPECTED_CHANGE is
   1, the path was NOT modified although modification was expected, so
   the correct complaint is "Expected modification", not "Expected no
   modification" (and vice versa for the other branch).  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
              test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      if (expected_change == 1)
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
    }
  xfree (test_copy);
}
/* Exercise path_simplify against a table of cases, then re-run every
   case with a leading '/' to verify the slash is preserved.
   NOTE(review): extract is line-sampled; the struct declaration for
   the table and several braces are not visible. */
2679 test_path_simplify (void)
2682 char *test, *result;
/* Table format: input, expected output, whether path_simplify should
   report a modification. */
2688 { "foo", "foo", 0 },
2689 { "foo/bar", "foo/bar", 0 },
2690 { "foo///bar", "foo/bar", 1 },
2691 { "foo/.", "foo/", 1 },
2692 { "foo/./", "foo/", 1 },
2693 { "foo./", "foo./", 0 },
2694 { "foo/../bar", "bar", 1 },
2695 { "foo/../bar/", "bar/", 1 },
2696 { "foo/bar/..", "foo/", 1 },
2697 { "foo/bar/../x", "foo/x", 1 },
2698 { "foo/bar/../x/", "foo/x/", 1 },
2699 { "foo/..", "", 1 },
2700 { "foo/../..", "", 1 },
2701 { "a/b/../../c", "c", 1 },
2702 { "./a/../b", "b", 1 }
2706 for (i = 0; i < ARRAY_SIZE (tests); i++)
2708 char *test = tests[i].test;
2709 char *expected_result = tests[i].result;
2710 int expected_change = tests[i].should_modify;
2711 run_test (test, expected_result, expected_change);
2714 /* Now run all the tests with a leading slash before the test case,
2715 to prove that the slash is being preserved. */
2716 for (i = 0; i < ARRAY_SIZE (tests); i++)
2718 char *test, *expected_result;
2719 int expected_change = tests[i].should_modify;
/* +1 for the prepended '/' and +1 for the NUL terminator. */
2721 test = xmalloc (1 + strlen (tests[i].test) + 1);
2722 sprintf (test, "/%s", tests[i].test);
2724 expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2725 sprintf (expected_result, "/%s", tests[i].result);
2727 run_test (test, expected_result, expected_change);
2730 xfree (expected_result);