2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
#include <sys/types.h>

/* DOTP(x): non-zero iff the string X is exactly ".".  */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))

/* DDOTP(x): non-zero iff the string X is exactly "..".  */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Supported schemes: */
/* Each entry is { leading_string, default_port, enabled }; the array
   index doubles as the url_scheme enum value, so the order here must
   match that enumeration.  NOTE(review): the struct's surrounding
   braces and the NULL terminator entry fall outside this excerpt.  */
static struct scheme_data supported_schemes[] =
  { "http://", DEFAULT_HTTP_PORT, 1 },
  { "https://", DEFAULT_HTTPS_PORT, 1 },
  { "ftp://", DEFAULT_FTP_PORT, 1 },

/* Forward declarations: */

static char *construct_relative PARAMS ((const char *, const char *));
static int path_simplify PARAMS ((char *));
/* Support for encoding and decoding of URL strings.  We determine
   whether a character is unsafe through static table lookup.  This
   code assumes ASCII character set and 8-bit chars.  */

/* Shorthands for the urlchr_table initializer below: R marks a
   character "reserved" (preserved when encoding), U marks it
   "unsafe" (must be %XX-quoted).  NOTE(review): the table also uses
   RU, whose definition -- presumably urlchr_reserved|urlchr_unsafe --
   is not part of this excerpt.  */
#define R urlchr_reserved
#define U urlchr_unsafe

/* Test character C against MASK via the 256-entry lookup table.  The
   cast to unsigned char guards against sign-extended indexing.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))

/* rfc1738 reserved chars, preserved from encoding. */

#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)

/* rfc1738 unsafe chars, plus some more. */

#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* One entry per 8-bit character code; the ASCII meaning of each row
   is annotated on the right.  */
const static unsigned char urlchr_table[256] =
  U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
  U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
  U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
  U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
  U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
  0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
  0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
  0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
  RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
  0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
  0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
  0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
  U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
  0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
  0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
  0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */

  /* Everything above 0x7f (non-ASCII) is unsafe.  */
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,

  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
/* Decodes the forms %xy in a URL to the character the hexadecimal
   code of which is xy.  xy are hexadecimal digits from
   [0123456789ABCDEF] (case-insensitive).  If x or y are not
   hex-digits or `%' precedes `\0', the sequence is inserted
   literally.  NOTE(review): tail of this comment and parts of the
   function body are missing from this excerpt.  */
decode_string (char *s)
  /* Decoding happens in place: T (write head) never outruns H (read
     head), so the string can only shrink.  */
  char *t = s;			/* t - tortoise */
  char *h = s;			/* h - hare */

  /* Do nothing if '%' is not followed by two hex digits. */
  if (!*(h + 1) || !*(h + 2)
      || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))

  /* Reassemble the byte from its two hex digits.  */
  *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
/* Like encode_string, but return S if there are no unsafe chars. */

encode_string_maybe (const char *s)
  /* First pass: count how much extra room the encoded string needs.
     Each unsafe char expands to "%XX", i.e. two extra characters.  */
  for (p1 = s; *p1; p1++)
    if (UNSAFE_CHAR (*p1))
      addition += 2;		/* Two more characters (hex digits) */

  /* NOTE(review): the early "return S" when ADDITION is zero is not
     visible in this excerpt.  */
  newlen = (p1 - s) + addition;
  newstr = (char *)xmalloc (newlen + 1);

  /* Second pass: copy into NEWSTR, expanding unsafe chars to %XX.  */
  if (UNSAFE_CHAR (*p1))
      unsigned char c = *p1++;

      *p2++ = XDIGIT_TO_XCHAR (c >> 4);
      *p2++ = XDIGIT_TO_XCHAR (c & 0xf);

  /* The length computed in the first pass must be exact.  */
  assert (p2 - newstr == newlen);
/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX encoded string. */

encode_string (const char *s)
  /* encode_string_maybe returns S itself when nothing needed quoting;
     NOTE(review): the copy made in that case so the caller always
     owns the result is outside this excerpt.  */
  char *encoded = encode_string_maybe (s);

/* Encode unsafe characters in PTR to %xx.  If such encoding is done,
   the old value of PTR is freed and PTR is made to point to the newly
   allocated storage. */

#define ENCODE(ptr) do {			\
  char *e_new = encode_string_maybe (ptr);	\
/* The three possible treatments of a character while normalizing a
   %-quoted URL: decode a %XX triple, encode the raw byte, or copy it
   through unchanged.  */
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };

/* Decide whether to encode, decode, or pass through the char at P.
   This used to be a macro, but it got a little too convoluted. */
static inline enum copy_method
decide_copy_method (const char *p)
  /* NOTE(review): the enclosing test that *P is '%' is not visible in
     this excerpt.  */
  if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
      /* %xx sequence: decode it, unless it would decode to an
	 unsafe or a reserved char; in that case, leave it as
	 is.  */
      char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
	XCHAR_TO_XDIGIT (*(p + 2));

      if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
	return CM_PASSTHROUGH;

  /* Garbled %.. sequence: encode `%'. */

  else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))

  /* Any other character is copied verbatim.  */
  return CM_PASSTHROUGH;
261 /* Translate a %-quoting (but possibly non-conformant) input string S
262 into a %-quoting (and conformant) output string. If no characters
263 are encoded or decoded, return the same string S; otherwise, return
264 a freshly allocated string with the new contents.
266 After a URL has been run through this function, the protocols that
267 use `%' as the quote character can use the resulting string as-is,
268 while those that don't call decode_string() to get to the intended
269 data. This function is also stable: after an input string is
270 transformed the first time, all further transformations of the
271 result yield the same result string.
273 Let's discuss why this function is needed.
275 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
276 space character would mess up the HTTP request, it needs to be
279 GET /abc%20def HTTP/1.0
281 So it appears that the unsafe chars need to be quoted, as with
282 encode_string. But what if we're requested to download
283 `abc%20def'? Remember that %-encoding is valid URL syntax, so what
284 the user meant was a literal space, and he was kind enough to quote
285 it. In that case, Wget should obviously leave the `%20' as is, and
286 send the same request as above. So in this case we may not call
289 But what if the requested URI is `abc%20 def'? If we call
290 encode_string, we end up with `/abc%2520%20def', which is almost
291 certainly not intended. If we don't call encode_string, we are
292 left with the embedded space and cannot send the request. What the
293 user meant was for Wget to request `/abc%20%20def', and this is
294 where reencode_string kicks in.
296 Wget used to solve this by first decoding %-quotes, and then
297 encoding all the "unsafe" characters found in the resulting string.
298 This was wrong because it didn't preserve certain URL special
299 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
300 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
301 whether we considered `+' reserved (it is). One of these results
302 is inevitable because by the second step we would lose information
303 on whether the `+' was originally encoded or not. Both results
304 were wrong because in CGI parameters + means space, while %2B means
305 literal plus. reencode_string correctly translates the above to
306 "a%2B+b", i.e. returns the original string.
308 This function uses an algorithm proposed by Anon Sricharoenchai:
310 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
313 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
316 ...except that this code conflates the two steps, and decides
317 whether to encode, decode, or pass through each character in turn.
318 The function still uses two passes, but their logic is the same --
319 the first pass exists merely for the sake of allocation. Another
320 small difference is that we include `+' to URL_RESERVED.
324 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
326 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
330 "foo bar" -> "foo%20bar"
331 "foo%20bar" -> "foo%20bar"
332 "foo %20bar" -> "foo%20%20bar"
333 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
334 "foo%25%20bar" -> "foo%25%20bar"
335 "foo%2%20bar" -> "foo%252%20bar"
336 "foo+bar" -> "foo+bar" (plus is reserved!)
337 "foo%2b+bar" -> "foo%2b+bar" */
reencode_string (const char *s)
  /* Tallies from the sizing pass: each encode adds two characters,
     each decode removes two.  */
  int encode_count = 0;
  int decode_count = 0;

  /* First, pass through the string to see if there's anything to do,
     and to calculate the new length. */
  for (p1 = s; *p1; p1++)
      switch (decide_copy_method (p1))

  if (!encode_count && !decode_count)
    /* The string is good as it is. */
    return (char *)s;		/* C const model sucks. */

  /* Each encoding adds two characters (hex digits), while each
     decoding removes two characters. */
  newlen = oldlen + 2 * (encode_count - decode_count);
  newstr = xmalloc (newlen + 1);

  /* Second pass: rewrite S into NEWSTR using the same decision
     procedure as above, so the lengths necessarily agree.  */
  switch (decide_copy_method (p1))
      /* CM_ENCODE: expand the byte at P1 to a %XX triple.  */
      unsigned char c = *p1++;

      *p2++ = XDIGIT_TO_XCHAR (c >> 4);
      *p2++ = XDIGIT_TO_XCHAR (c & 0xf);

      /* CM_DECODE: collapse the %xx triple at P1 to a single byte.  */
      *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
	       + (XCHAR_TO_XDIGIT (*(p1 + 2))));
      p1 += 3;		/* skip %xx */

  assert (p2 - newstr == newlen);
/* Run PTR_VAR through reencode_string.  If a new string is consed,
   free PTR_VAR and make it point to the new storage.  Obviously,
   PTR_VAR needs to be an lvalue. */

#define REENCODE(ptr_var) do {			\
  char *rf_new = reencode_string (ptr_var);	\
  if (rf_new != ptr_var)			\
/* Returns the scheme type if the scheme is supported, or
   SCHEME_INVALID if not. */

url_scheme (const char *url)
  /* Try each supported scheme's leading string ("http://", ...) as a
     case-insensitive prefix of URL.  */
  for (i = 0; supported_schemes[i].leading_string; i++)
    if (0 == strncasecmp (url, supported_schemes[i].leading_string,
			  strlen (supported_schemes[i].leading_string)))
	if (supported_schemes[i].enabled)
	  /* The array index doubles as the enum value.  */
	  return (enum url_scheme) i;
	  /* A recognized but disabled scheme reports as invalid.  */
	  return SCHEME_INVALID;

  return SCHEME_INVALID;
/* Return the number of characters needed to skip the scheme part of
   the URL, e.g. `http://'.  If no scheme is found, returns 0. */

url_skip_scheme (const char *url)
  /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
     etc.  NOTE(review): remainder of this comment and of the body is
     outside this excerpt.  */
  while (ISALNUM (*p) || *p == '-' || *p == '+')

  /* Skip "//" if found. */
  if (*p == '/' && *(p + 1) == '/')

/* Returns 1 if the URL begins with a scheme (supported or
   unsupported), 0 otherwise. */

url_has_scheme (const char *url)
  /* A scheme is a run of alphanumerics (plus `-'/`+'); the trailing
     check for `:' is outside this excerpt.  */
  while (ISALNUM (*p) || *p == '-' || *p == '+')
scheme_default_port (enum url_scheme scheme)
  /* SCHEME indexes directly into the supported_schemes table.  */
  return supported_schemes[scheme].default_port;

scheme_disable (enum url_scheme scheme)
  /* Once disabled, url_scheme will report this scheme as invalid.  */
  supported_schemes[scheme].enabled = 0;
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the scheme.

   If no username and password are found, return 0. */

url_skip_uname (const char *url)
  /* Look for '@' that comes before '/' or '?'. */
  p = (const char *)strpbrk (url, "/?@");

/* Split the "user[:password]" string STR (of length LEN) into
   malloc'd *USER and *PASSWD.  NOTE(review): parts of the body,
   including the success/failure returns, fall outside this
   excerpt.  */
parse_uname (const char *str, int len, char **user, char **passwd)
  /* Empty user name not allowed. */

  colon = memchr (str, ':', len);

  /* Empty user name again. */

      /* Everything after the colon is the password.  */
      int pwlen = len - (colon + 1 - str);
      *passwd = xmalloc (pwlen + 1);
      memcpy (*passwd, colon + 1, pwlen);
      (*passwd)[pwlen] = '\0';

  /* By now LEN covers just the user-name portion.  */
  *user = xmalloc (len + 1);
  memcpy (*user, str, len);

  /* Both components arrived %-quoted in the URL; decode in place.  */
  decode_string (*user);

  decode_string (*passwd);
/* Used by main.c: detect URLs written using the "shorthand" URL forms
   popularized by Netscape and NcFTP.  HTTP shorthands look like this:

   www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
   www.foo.com[:port] -> http://www.foo.com[:port]

   FTP shorthands look like this:

   foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
   foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file

   If the URL needs not or cannot be rewritten, return NULL. */

rewrite_shorthand_url (const char *url)
  /* A URL that already carries a scheme needs no rewriting.  */
  if (url_has_scheme (url))

  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
     latter HTTP.  NOTE(review): continuation outside this excerpt.  */
  for (p = url; *p && *p != ':' && *p != '/'; p++)

      /* If the characters after the colon and before the next slash
	 or end of string are all digits, it's HTTP. */

      for (pp = p + 1; ISDIGIT (*pp); pp++)

      if (digits > 0 && (*pp == '/' || *pp == '\0'))

      /* Prepend "ftp://" to the entire URL... */
      res = xmalloc (6 + strlen (url) + 1);
      sprintf (res, "ftp://%s", url);
      /* ...and replace ':' with '/'. */
      res[6 + (p - url)] = '/';

  /* Just prepend "http://" to what we have. */
  res = xmalloc (7 + strlen (url) + 1);
  sprintf (res, "http://%s", url);
597 static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, but instead of returning NULL when no character of
   ACCEPT occurs in S, return a pointer to S's terminating '\0'.
   Callers can thus uniformly treat end-of-string as a delimiter.  */
static char *
strpbrk_or_eos (const char *s, const char *accept)
{
  char *p = strpbrk (s, accept);
  if (!p)
    /* No delimiter found: point at the terminating zero instead of
       returning NULL.  (Without this guard a successful strpbrk
       result would be clobbered.)  */
    p = (char *)s + strlen (s);
  return p;
}
/* Turn STR into lowercase; return non-zero if a character was
   actually changed.  NOTE(review): the tail of this comment is
   outside the excerpt.  */

lowercase_str (char *str)
      /* NOTE(review): the loop over STR and the uppercase test
	 guarding this assignment are outside this excerpt.  */
      *str = TOLOWER (*str);
/* Human-readable messages for url_parse failures, indexed by the
   PE_* constants defined alongside each entry.  */
static char *parse_errors[] = {
#define PE_NO_ERROR 0
  /* NOTE(review): the message string for PE_NO_ERROR is outside this
     excerpt.  */
#define PE_UNSUPPORTED_SCHEME 1
  "Unsupported scheme",
#define PE_EMPTY_HOST 2
#define PE_BAD_PORT_NUMBER 3
#define PE_INVALID_USER_NAME 4
#define PE_UNTERMINATED_IPV6_ADDRESS 5
  "Unterminated IPv6 numeric address",
#define PE_INVALID_IPV6_ADDRESS 6
  "Invalid char in IPv6 numeric address"
/* Store error code V through the caller's error pointer P.
   NOTE(review): the macro tail (presumably a NULL check on P) is
   outside this excerpt.  */
#define SETERR(p, v) do {			\

/* Parse URL into a struct url.

   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   error code.  */
url_parse (const char *url, int *error)
  int path_modified, host_modified;

  enum url_scheme scheme;

  /* Begin/end pointer pairs delimiting each component inside the
     (re)encoded URL string.  */
  const char *uname_b, *uname_e;
  const char *host_b, *host_e;
  const char *path_b, *path_e;
  const char *params_b, *params_e;
  const char *query_b, *query_e;
  const char *fragment_b, *fragment_e;

  char *user = NULL, *passwd = NULL;

  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
      SETERR (error, PE_UNSUPPORTED_SCHEME);

  /* Normalize %-quoting; reencode_string returns URL itself when
     nothing changed.  */
  url_encoded = reencode_string (url);

  p += strlen (supported_schemes[scheme].leading_string);

  p += url_skip_uname (p);

  /* scheme://user:pass@host[:port]... */

  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:

     scheme://host[:port][/path][;params][?query][#fragment] */

  params_b = params_e = NULL;
  query_b = query_e = NULL;
  fragment_b = fragment_e = NULL;

  /* Support http://[::1]/ used by IPv6. */

      SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);

      SETERR (error, PE_INVALID_IPV6_ADDRESS);

      /* Don't include brackets in [host_b, host_p). */

  p = strpbrk_or_eos (p, ":/;?#");

  if (host_b == host_e)
      SETERR (error, PE_EMPTY_HOST);

  port = scheme_default_port (scheme);
      const char *port_b, *port_e, *pp;

      /* scheme://host:port/tralala */

      p = strpbrk_or_eos (p, "/;?#");

      if (port_b == port_e)
	  /* http://host:/whatever */

	  SETERR (error, PE_BAD_PORT_NUMBER);

      /* Parse the port as a decimal number, rejecting non-digits.  */
      for (port = 0, pp = port_b; pp < port_e; pp++)
	      /* http://host:12randomgarbage/blah */

	      SETERR (error, PE_BAD_PORT_NUMBER);

	  port = 10 * port + (*pp - '0');

  /* Carve out path, then params, then query, then fragment, each
     delimited by the remaining terminator characters.  */
  p = strpbrk_or_eos (p, ";?#");

  /* Path is not allowed not to exist. */

  p = strpbrk_or_eos (p, "?#");

  p = strpbrk_or_eos (p, "#");

  /* Hack that allows users to use '?' (a wildcard character) in
     FTP URLs without it being interpreted as a query string
     delimiter.  */
  if (scheme == SCHEME_FTP)
      query_b = query_e = NULL;

  if (uname_b != uname_e)
      /* http://user:pass@host */
      /*        ^        ^     */
      /*     uname_b  uname_e  */
      if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
	  SETERR (error, PE_INVALID_USER_NAME);

  /* All components validated; allocate and fill the result.  */
  u = (struct url *)xmalloc (sizeof (struct url));
  memset (u, 0, sizeof (*u));

  u->host = strdupdelim (host_b, host_e);

  u->path = strdupdelim (path_b, path_e);
  path_modified = path_simplify (u->path);
  parse_path (u->path, &u->dir, &u->file);

  host_modified = lowercase_str (u->host);

  u->params = strdupdelim (params_b, params_e);

  u->query = strdupdelim (query_b, query_e);

  u->fragment = strdupdelim (fragment_b, fragment_e);

  if (path_modified || u->fragment || host_modified || path_b == path_e)
      /* If we suspect that a transformation has rendered what
	 url_string might return different from URL_ENCODED, rebuild
	 u->url using url_string. */
      u->url = url_string (u, 0);

      if (url_encoded != url)
	xfree ((char *) url_encoded);

  if (url_encoded == url)
    u->url = xstrdup (url);
    /* Otherwise take ownership of the reencoded copy.  */
    u->url = url_encoded;
/* Return the message string for ERROR_CODE, one of the PE_* constants
   produced by url_parse.  */
url_error (int error_code)
  assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
  return parse_errors[error_code];

/* Split QUOTED_PATH into directory (*DIR) and file (*FILE)
   components, %-decoding it first.  Both outputs are malloc'd.  */
parse_path (const char *quoted_path, char **dir, char **file)
  char *path, *last_slash;

  /* Work on a stack copy so QUOTED_PATH itself stays untouched.  */
  STRDUP_ALLOCA (path, quoted_path);
  decode_string (path);

  last_slash = strrchr (path, '/');
      /* No slash at all: the whole path is the file name.  */
      *file = xstrdup (path);

      /* Split around the last slash; *DIR gets [path, last_slash),
	 i.e. without the slash itself.  */
      *dir = strdupdelim (path, last_slash);
      *file = xstrdup (last_slash + 1);
/* Note: URL's "full path" is the path with the query string and
   params appended.  The "fragment" (#foo) is intentionally ignored,
   but that might be changed.  For example, if the original URL was
   "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
   the full path will be "/foo/bar/baz;bullshit?querystring". */

/* Return the length of the full path, without the terminating
   zero.  */

full_path_length (const struct url *url)
  /* Each present component costs one separator character plus its own
     length.  */
#define FROB(el) if (url->el) len += 1 + strlen (url->el)

/* Write out the full path. */

full_path_write (const struct url *url, char *where)
  /* Emit separator CHR followed by the component, when present.  */
#define FROB(el, chr) do {			\
  char *f_el = url->el;				\
      int l = strlen (f_el);			\
      memcpy (where, f_el, l);			\

/* Public function for getting the "full path".  E.g. if u->path is
   "foo/bar" and u->query is "param=value", full_path will be
   "/foo/bar?param=value". */

url_full_path (const struct url *url)
  int length = full_path_length (url);
  char *full_path = (char *)xmalloc(length + 1);

  full_path_write (url, full_path);
  full_path[length] = '\0';
/* Sync u->path and u->url with u->dir and u->file. */

sync_path (struct url *url)
      /* No directory component: the path is just the file name.  */
      newpath = xstrdup (url->file);

      int dirlen = strlen (url->dir);
      int filelen = strlen (url->file);

      /* Rebuild the path as "<dir>/<file>".  */
      newpath = xmalloc (dirlen + 1 + filelen + 1);
      memcpy (newpath, url->dir, dirlen);
      newpath[dirlen] = '/';
      memcpy (newpath + dirlen + 1, url->file, filelen);
      newpath[dirlen + 1 + filelen] = '\0';

  /* Synchronize u->url. */

  url->url = url_string (url, 0);

/* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
   This way we can sync u->path and u->url when they get changed. */

url_set_dir (struct url *url, const char *newdir)
  url->dir = xstrdup (newdir);

url_set_file (struct url *url, const char *newfile)
  url->file = xstrdup (newfile);
/* Release heap storage owned by URL.  NOTE(review): the frees for
   host/path/dir/file/url and of URL itself are outside this
   excerpt.  */
url_free (struct url *url)
  FREE_MAYBE (url->params);
  FREE_MAYBE (url->query);
  FREE_MAYBE (url->fragment);
  FREE_MAYBE (url->user);
  FREE_MAYBE (url->passwd);
/* Read FILE, one URL per line, and return them as a linked list of
   struct urlpos.  */
get_urls_file (const char *file)
  struct file_memory *fm;
  struct urlpos *head, *tail;
  const char *text, *text_end;

  /* Load the file. */
  fm = read_file (file);
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));

  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));

  text_end = fm->content + fm->length;
  while (text < text_end)
      /* Process one line per iteration; the final line may lack a
	 terminating '\n'.  */
      const char *line_beg = text;
      const char *line_end = memchr (text, '\n', text_end - text);
	line_end = text_end;

      /* Strip whitespace from the beginning and end of line. */
      while (line_beg < line_end && ISSPACE (*line_beg))
      while (line_end > line_beg && ISSPACE (*(line_end - 1)))

      if (line_end > line_beg)
	  /* URL is in the [line_beg, line_end) region. */

	  struct urlpos *entry;

	  /* We must copy the URL to a zero-terminated string, and we
	     can't use alloca because we're in a loop.  *sigh*. */
	  url_text = strdupdelim (line_beg, line_end);

	      /* Merge opt.base_href with URL. */
	      char *merged = uri_merge (opt.base_href, url_text);

	  url = url_parse (url_text, &up_error_code);
	      /* Report unparsable lines but keep going.  */
	      logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
			 file, url_text, url_error (up_error_code));

	  entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
	  memset (entry, 0, sizeof (*entry));

  read_file_free (fm);
/* Free the linked list of urlpos. */

free_urlpos (struct urlpos *l)
      /* Save the successor before the node is freed.  */
      struct urlpos *next = l->next;

      FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times */

rotate_backups(const char *fname)
  /* Room for "<fname>.<number>" plus the terminating zero.  */
  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
  char *from = (char *)alloca (maxlen);
  char *to = (char *)alloca (maxlen);

  /* Only regular files get rotated.  */
  if (stat (fname, &sb) == 0)
    if (S_ISREG (sb.st_mode) == 0)

  /* Shift fname.1 -> fname.2 -> ..., oldest first so nothing is
     overwritten prematurely.  */
  for (i = opt.backups; i > 1; i--)
      sprintf (from, "%s.%d", fname, i - 1);
      sprintf (to, "%s.%d", fname, i);
      /* #### This will fail on machines without the rename() system
	 call.  NOTE(review): comment tail outside this excerpt.  */

  sprintf (to, "%s.%d", fname, 1);
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally. */

mkalldirs (const char *path)
  /* Locate the last path separator.  */
  p = path + strlen (path);
  for (; *p != '/' && p != path; p--);
  /* Don't create if it's just a file. */
  if ((p == path) && (*p != '/'))
  t = strdupdelim (path, p);
  /* Check whether the directory exists. */
  if ((stat (t, &st) == 0))
      if (S_ISDIR (st.st_mode))
	  /* If the dir exists as a file name, remove it first.  This
	     is *only* for Wget to work with buggy old CERN http
	     servers.  Here is the scenario: When Wget tries to
	     retrieve a directory without a slash, e.g.
	     http://foo/bar (bar being a directory), CERN server will
	     not redirect it to http://foo/bar/ -- it will generate a
	     directory listing containing links to bar/file1,
	     bar/file2, etc.  Wget will lose because it saves this
	     HTML listing to a file `bar', so it cannot create the
	     directory.  To work around this, if the file of the same
	     name exists, we just remove it and create the directory
	     anew.  */
	  DEBUGP (("Removing %s because of directory danger!\n", t));

  res = make_directory (t);
      logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Return the number of occurrences of '/' in the string S.  Used by
   mkstruct to size the directory structure.  */
static int
count_slashes (const char *s)
{
  int count = 0;
  for (; *s; s++)
    if (*s == '/')
      ++count;
  return count;
}
/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories. */

mkstruct (const struct url *u)
  char *res, *dirpref;

      /* Skip a leading '/', then drop opt.cut_dirs leading path
	 elements (the --cut-dirs option).  */
      char *ptr = u->dir + (*u->dir == '/');
      int slash_count = 1 + count_slashes (ptr);
      int cut = MINVAL (opt.cut_dirs, slash_count);
      for (; cut && *ptr; ptr++)

      STRDUP_ALLOCA (dir, ptr);

    dir = u->dir + (*u->dir == '/');

  /* Check for the true name (or at least a consistent name for saving
     to directory) of HOST, reusing the hlist if possible. */
  if (opt.add_hostdir)
      /* Add dir_prefix and hostname (if required) to the beginning of
	 dir.  NOTE(review): the comment tail and the rest of the
	 alloca size expression are outside this excerpt.  */
      dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
				+ 1 + numdigit (u->port)
      /* Skip the prefix when it is merely ".".  */
      if (!DOTP (opt.dir_prefix))
	sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
	strcpy (dirpref, u->host);

      /* Append the port only when it differs from the scheme's
	 default.  */
      if (u->port != scheme_default_port (u->scheme))
	  int len = strlen (dirpref);
	  number_to_string (dirpref + len + 1, u->port);
  else /* not add_hostdir */
      if (!DOTP (opt.dir_prefix))
	dirpref = opt.dir_prefix;

  /* If there is a prefix, prepend it. */
      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
      sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);

  /* Chop a trailing slash off DIR, if present.  */
  if (l && dir[l - 1] == '/')

    /* A directory URL is saved under the name index.html.  */
    file = "index.html";

  /* Finally, construct the full name. */
  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
  sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
/* Compose a file name out of BASE, an unescaped file name, and QUERY,
   an escaped query string.  The trick is to make sure that unsafe
   characters in BASE are escaped, and that slashes in QUERY are also
   escaped.  NOTE(review): comment tail outside this excerpt.  */

compose_file_name (char *base, char *query)
  /* Copy BASE to RESULT and encode all unsafe characters. */

  while (*from && to - result < sizeof (result))
      if (UNSAFE_CHAR (*from))
	  /* Expand the unsafe byte to a %XX triple.  */
	  unsigned char c = *from++;

	  *to++ = XDIGIT_TO_XCHAR (c >> 4);
	  *to++ = XDIGIT_TO_XCHAR (c & 0xf);

  if (query && to - result < sizeof (result))

      /* Copy QUERY to RESULT and encode all '/' characters. */

      while (*from && to - result < sizeof (result))

  if (to - result < sizeof (result))

      /* Truncate input which is too long, presumably due to a huge
	 query string.  */
      result[sizeof (result) - 1] = '\0';

  return xstrdup (result);
/* Create a unique filename, corresponding to a given URL.  Calls
   mkstruct if necessary.  Does *not* actually create any directories. */

url_filename (const struct url *u)
  /* Only a non-empty query string participates in the file name.  */
  char *query = u->query && *u->query ? u->query : NULL;

      char *base = mkstruct (u);
      file = compose_file_name (base, query);

      /* An empty file component defaults to index.html.  */
      char *base = *u->file ? u->file : "index.html";
      file = compose_file_name (base, query);

      /* Check whether the prefix directory is something other than "."
	 before prepending it. */
      if (!DOTP (opt.dir_prefix))
	  /* #### should just realloc FILE and prepend dir_prefix. */
	  char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
					 + 1 + strlen (file) + 1);
	  sprintf (nfile, "%s/%s", opt.dir_prefix, file);

  /* DOS-ish file systems don't like `%' signs in them; we change it.
     NOTE(review): the replacement character and loop body are outside
     this excerpt.  */

  for (p = file; *p; p++)

#endif /* WINDOWS */

  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.

     The exception is the case when file does exist and is a
     directory (actually support for bad httpd-s). */
  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
      && !(file_exists_p (file) && !file_non_directory_p (file)))

  /* Find a unique name. */
  name = unique_name (file);
/* Return the length of URL's path component: the number of characters
   before the first '?', ';', or '#' terminator, or the whole string's
   length when no terminator is present.  */
static int
path_length (const char *url)
{
  const char *q = strpbrk (url, "?;#");
  /* With no terminator found, the path runs to the end of the
     string.  */
  return (int) ((q ? q : url + strlen (url)) - url);
}
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is equivalent to strrchr(b, c),
   except that it accepts an END argument instead of requiring the
   string to be zero-terminated.  Why is there no memrchr()?  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Scan backwards starting just before E; E itself is excluded from
     the search, honoring the half-open [b, e) contract.  */
  while (e > b)
    {
      --e;
      if (*e == c)
	return e;
    }
  return NULL;
}
1445 /* Resolve "." and ".." elements of PATH by destructively modifying
1446 PATH. "." is resolved by removing that path element, and ".." is
1447 resolved by removing the preceding path element. Leading and
1448 trailing slashes are preserved.
1450 Return non-zero if any changes have been made.
1452 For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
1453 test examples are provided below. If you change anything in this
1454 function, run test_path_simplify to make sure you haven't broken a
1457 A previous version of this function was based on path_simplify()
1458 from GNU Bash, but it has been rewritten for Wget 1.8.1. */
1461 path_simplify (char *path)
1467 ++path; /* preserve the leading '/'. */
1470 end = p + strlen (p) + 1; /* position past the terminating zero. */
1475 /* P should point to the beginning of a path element. */
1477 if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
1479 /* Handle "./foo" by moving "foo" two characters to the
1481 if (*(p + 1) == '/')
1484 memmove (p, p + 2, end - p);
1495 else if (*p == '.' && *(p + 1) == '.'
1496 && (*(p + 2) == '/' || *(p + 2) == '\0'))
1498 /* Handle "../foo" by moving "foo" one path element to the
1500 char *b = p; /* not p-1 because P can equal PATH */
1502 /* Backtrack by one path element, but not past the beginning
1505 /* foo/bar/../baz */
1511 /* Move backwards until B hits the beginning of the
1512 previous path element or the beginning of path. */
1513 for (--b; b > path && *(b - 1) != '/'; b--)
1518 if (*(p + 2) == '/')
1520 memmove (b, p + 3, end - (p + 3));
1534 /* Remove empty path elements. Not mandated by rfc1808 et
1535 al, but empty path elements are not all that useful, and
1536 the rest of Wget might not deal with them well. */
1546 memmove (p, q, end - q);
1551 /* Skip to the next path element. */
1552 while (*p && *p != '/')
1557 /* Make sure P points to the beginning of the next path element,
1558 which is location after the slash. */
1565 /* Resolve the result of "linking" a base URI (BASE) to a
1566 link-specified URI (LINK).
1568 Either of the URIs may be absolute or relative, complete with the
1569 host name, or path only. This tries to behave "reasonably" in all
1570 foreseeable cases. It employs little specific knowledge about
1571 schemes or URL-specific stuff -- it just works on strings.
1573 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1574 See uri_merge for a gentler interface to this functionality.
1576 Perhaps this function should call path_simplify so that the callers
1577 don't have to call url_parse unconditionally. */
1579 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1585 const char *end = base + path_length (base);
1589 /* Empty LINK points back to BASE, query string and all. */
1590 constr = xstrdup (base);
1592 else if (*link == '?')
1594 /* LINK points to the same location, but changes the query
1595 string. Examples: */
1596 /* uri_merge("path", "?new") -> "path?new" */
1597 /* uri_merge("path?foo", "?new") -> "path?new" */
1598 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1599 /* uri_merge("path#foo", "?new") -> "path?new" */
1600 int baselength = end - base;
1601 constr = xmalloc (baselength + linklength + 1);
1602 memcpy (constr, base, baselength);
1603 memcpy (constr + baselength, link, linklength);
1604 constr[baselength + linklength] = '\0';
1606 else if (*link == '#')
1608 /* uri_merge("path", "#new") -> "path#new" */
1609 /* uri_merge("path#foo", "#new") -> "path#new" */
1610 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1611 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1613 const char *end1 = strchr (base, '#');
1615 end1 = base + strlen (base);
1616 baselength = end1 - base;
1617 constr = xmalloc (baselength + linklength + 1);
1618 memcpy (constr, base, baselength);
1619 memcpy (constr + baselength, link, linklength);
1620 constr[baselength + linklength] = '\0';
1622 else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
1624 /* LINK begins with "//" and so is a net path: we need to
1625 replace everything after (and including) the double slash
1628 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1629 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1630 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1634 const char *start_insert;
1636 /* Look for first slash. */
1637 slash = memchr (base, '/', end - base);
1638 /* If found slash and it is a double slash, then replace
1639 from this point, else default to replacing from the
1641 if (slash && *(slash + 1) == '/')
1642 start_insert = slash;
1644 start_insert = base;
1646 span = start_insert - base;
1647 constr = (char *)xmalloc (span + linklength + 1);
1649 memcpy (constr, base, span);
1650 memcpy (constr + span, link, linklength);
1651 constr[span + linklength] = '\0';
1653 else if (*link == '/')
1655 /* LINK is an absolute path: we need to replace everything
1656 after (and including) the FIRST slash with LINK.
1658 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1659 "/qux/xyzzy", our result should be
1660 "http://host/qux/xyzzy". */
1663 const char *start_insert = NULL; /* for gcc to shut up. */
1664 const char *pos = base;
1665 int seen_slash_slash = 0;
1666 /* We're looking for the first slash, but want to ignore
1669 slash = memchr (pos, '/', end - pos);
1670 if (slash && !seen_slash_slash)
1671 if (*(slash + 1) == '/')
1674 seen_slash_slash = 1;
1678 /* At this point, SLASH is the location of the first / after
1679 "//", or the first slash altogether. START_INSERT is the
1680 pointer to the location where LINK will be inserted. When
1681 examining the last two examples, keep in mind that LINK
1684 if (!slash && !seen_slash_slash)
1685 /* example: "foo" */
1687 start_insert = base;
1688 else if (!slash && seen_slash_slash)
1689 /* example: "http://foo" */
1692 else if (slash && !seen_slash_slash)
1693 /* example: "foo/bar" */
1695 start_insert = base;
1696 else if (slash && seen_slash_slash)
1697 /* example: "http://something/" */
1699 start_insert = slash;
1701 span = start_insert - base;
1702 constr = (char *)xmalloc (span + linklength + 1);
1704 memcpy (constr, base, span);
1706 memcpy (constr + span, link, linklength);
1707 constr[span + linklength] = '\0';
1711 /* LINK is a relative URL: we need to replace everything
1712 after last slash (possibly empty) with LINK.
1714 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1715 our result should be "whatever/foo/qux/xyzzy". */
1716 int need_explicit_slash = 0;
1718 const char *start_insert;
1719 const char *last_slash = find_last_char (base, end, '/');
1722 /* No slash found at all. Append LINK to what we have,
1723 but we'll need a slash as a separator.
1725 Example: if base == "foo" and link == "qux/xyzzy", then
1726 we cannot just append link to base, because we'd get
1727 "fooqux/xyzzy", whereas what we want is
1730 To make sure the / gets inserted, we set
1731 need_explicit_slash to 1. We also set start_insert
1732 to end + 1, so that the length calculations work out
1733 correctly for one more (slash) character. Accessing
1734 that character is fine, since it will be the
1735 delimiter, '\0' or '?'. */
1736 /* example: "foo?..." */
1737 /* ^ ('?' gets changed to '/') */
1738 start_insert = end + 1;
1739 need_explicit_slash = 1;
1741 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1743 /* example: http://host" */
1745 start_insert = end + 1;
1746 need_explicit_slash = 1;
1750 /* example: "whatever/foo/bar" */
1752 start_insert = last_slash + 1;
1755 span = start_insert - base;
1756 constr = (char *)xmalloc (span + linklength + 1);
1758 memcpy (constr, base, span);
1759 if (need_explicit_slash)
1760 constr[span - 1] = '/';
1762 memcpy (constr + span, link, linklength);
1763 constr[span + linklength] = '\0';
1766 else /* !no_scheme */
1768 constr = strdupdelim (link, link + linklength);
/* Merge BASE with LINK and return the resulting URI.  This is an
   interface to uri_merge_1 that assumes that LINK is a
   zero-terminated string.  */
char *
uri_merge (const char *base, const char *link)
{
  int link_length = strlen (link);
  int link_lacks_scheme = !url_has_scheme (link);
  return uri_merge_1 (base, link, link_length, link_lacks_scheme);
}
/* Append string S to the buffer pointed to by P and advance P past
   the copied bytes.  Wrapped in do { } while (0) so the macro behaves
   as a single statement.  Note: S is evaluated twice.  */
#define APPEND(p, s) do {			\
  int len = strlen (s);				\
  memcpy (p, s, len);				\
  p += len;					\
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1794 /* Recreate the URL string from the data in URL.
1796 If HIDE is non-zero (as it is when we're calling this on a URL we
1797 plan to print, but not when calling it to canonicalize a URL for
1798 use within the program), password will be hidden. Unsafe
1799 characters in the URL will be quoted. */
1802 url_string (const struct url *url, int hide_password)
1806 char *quoted_user = NULL, *quoted_passwd = NULL;
1808 int scheme_port = supported_schemes[url->scheme].default_port;
1809 char *scheme_str = supported_schemes[url->scheme].leading_string;
1810 int fplen = full_path_length (url);
1812 int brackets_around_host = 0;
1814 assert (scheme_str != NULL);
1816 /* Make sure the user name and password are quoted. */
1819 quoted_user = encode_string_maybe (url->user);
1823 quoted_passwd = HIDDEN_PASSWORD;
1825 quoted_passwd = encode_string_maybe (url->passwd);
1829 if (strchr (url->host, ':'))
1830 brackets_around_host = 1;
1832 size = (strlen (scheme_str)
1833 + strlen (url->host)
1834 + (brackets_around_host ? 2 : 0)
1837 if (url->port != scheme_port)
1838 size += 1 + numdigit (url->port);
1841 size += 1 + strlen (quoted_user);
1843 size += 1 + strlen (quoted_passwd);
1846 p = result = xmalloc (size);
1848 APPEND (p, scheme_str);
1851 APPEND (p, quoted_user);
1855 APPEND (p, quoted_passwd);
1860 if (brackets_around_host)
1862 APPEND (p, url->host);
1863 if (brackets_around_host)
1865 if (url->port != scheme_port)
1868 p = number_to_string (p, url->port);
1871 full_path_write (url, p);
1875 assert (p - result == size);
1877 if (quoted_user && quoted_user != url->user)
1878 xfree (quoted_user);
1879 if (quoted_passwd && !hide_password
1880 && quoted_passwd != url->passwd)
1881 xfree (quoted_passwd);
1886 /* Return the URL of the proxy appropriate for url U. */
1888 getproxy (struct url *u)
1891 char *rewritten_url;
1892 static char rewritten_storage[1024];
1896 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
1902 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1906 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1910 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1912 case SCHEME_INVALID:
1915 if (!proxy || !*proxy)
1918 /* Handle shorthands. `rewritten_storage' is a kludge to allow
1919 getproxy() to return static storage. */
1920 rewritten_url = rewrite_shorthand_url (proxy);
1923 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1924 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1925 proxy = rewritten_storage;
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns non-zero when HOST is not excluded by the NO_PROXY suffix
   list.  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  /* With no exclusion list, every host goes through the proxy.  */
  if (!no_proxy)
    return 1;
  return !sufmatch (no_proxy, host);
}
1941 /* Support for converting links for local viewing in downloaded HTML
1942 files. This should be moved to another file, because it has
1943 nothing to do with processing URLs. */
1945 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1946 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1948 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1949 const char *, int));
1950 static char *local_quote_string PARAMS ((const char *));
1952 /* Change the links in one HTML file. LINKS is a list of links in the
1953 document, along with their positions and the desired direction of
1956 convert_links (const char *file, struct urlpos *links)
1958 struct file_memory *fm;
1961 downloaded_file_t downloaded_file_return;
1963 struct urlpos *link;
1964 int to_url_count = 0, to_file_count = 0;
1966 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1969 /* First we do a "dry run": go through the list L and see whether
1970 any URL needs to be converted in the first place. If not, just
1971 leave the file alone. */
1973 struct urlpos *dry = links;
1974 for (dry = links; dry; dry = dry->next)
1975 if (dry->convert != CO_NOCONVERT)
1979 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1984 fm = read_file (file);
1987 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1988 file, strerror (errno));
1992 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1993 if (opt.backup_converted && downloaded_file_return)
1994 write_backup_file (file, downloaded_file_return);
1996 /* Before opening the file for writing, unlink the file. This is
1997 important if the data in FM is mmaped. In such case, nulling the
1998 file, which is what fopen() below does, would make us read all
1999 zeroes from the mmaped region. */
2000 if (unlink (file) < 0 && errno != ENOENT)
2002 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2003 file, strerror (errno));
2004 read_file_free (fm);
2007 /* Now open the file for writing. */
2008 fp = fopen (file, "wb");
2011 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2012 file, strerror (errno));
2013 read_file_free (fm);
2017 /* Here we loop through all the URLs in file, replacing those of
2018 them that are downloaded with relative references. */
2020 for (link = links; link; link = link->next)
2022 char *url_start = fm->content + link->pos;
2024 if (link->pos >= fm->length)
2026 DEBUGP (("Something strange is going on. Please investigate."));
2029 /* If the URL is not to be converted, skip it. */
2030 if (link->convert == CO_NOCONVERT)
2032 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2036 /* Echo the file contents, up to the offending URL's opening
2037 quote, to the outfile. */
2038 fwrite (p, 1, url_start - p, fp);
2041 switch (link->convert)
2043 case CO_CONVERT_TO_RELATIVE:
2044 /* Convert absolute URL to relative. */
2046 char *newname = construct_relative (file, link->local_name);
2047 char *quoted_newname = local_quote_string (newname);
2049 if (!link->link_refresh_p)
2050 p = replace_attr (p, link->size, fp, quoted_newname);
2052 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2053 link->refresh_timeout);
2055 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2056 link->url->url, newname, link->pos, file));
2058 xfree (quoted_newname);
2062 case CO_CONVERT_TO_COMPLETE:
2063 /* Convert the link to absolute URL. */
2065 char *newlink = link->url->url;
2066 char *quoted_newlink = html_quote_string (newlink);
2068 if (!link->link_refresh_p)
2069 p = replace_attr (p, link->size, fp, quoted_newlink);
2071 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2072 link->refresh_timeout);
2074 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2075 newlink, link->pos, file));
2076 xfree (quoted_newlink);
2080 case CO_NULLIFY_BASE:
2081 /* Change the base href to "". */
2082 p = replace_attr (p, link->size, fp, "");
2090 /* Output the rest of the file. */
2091 if (p - fm->content < fm->length)
2092 fwrite (p, 1, fm->length - (p - fm->content), fp);
2094 read_file_free (fm);
2096 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int i, cnt, sepdirs1;
  char *res;

  /* An absolute target needs no relativizing.  */
  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  cnt = 0;
  /* Skip the directories common to both strings.  After this loop CNT
     indexes the first character past the last shared "dir/" prefix.  */
  for (i = 0; ; i++)
    {
      while (s1[i] && s2[i]
	     && (s1[i] == s2[i])
	     && (s1[i] != '/')
	     && (s2[i] != '/'))
	++i;
      if (s1[i] == '/' && s2[i] == '/')
	cnt = i + 1;
      else
	break;
    }
  /* Count the directory separators remaining in S1; each one costs a
     "../" hop in the result.  */
  for (sepdirs1 = 0; s1[i]; i++)
    if (s1[i] == '/')
      ++sepdirs1;
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
  return res;
}
2154 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2156 /* Rather than just writing over the original .html file with the
2157 converted version, save the former to *.orig. Note we only do
2158 this for files we've _successfully_ downloaded, so we don't
2159 clobber .orig files sitting around from previous invocations. */
2161 /* Construct the backup filename as the original name plus ".orig". */
2162 size_t filename_len = strlen(file);
2163 char* filename_plus_orig_suffix;
2164 boolean already_wrote_backup_file = FALSE;
2165 slist* converted_file_ptr;
2166 static slist* converted_files = NULL;
2168 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2170 /* Just write "orig" over "html". We need to do it this way
2171 because when we're checking to see if we've downloaded the
2172 file before (to see if we can skip downloading it), we don't
2173 know if it's a text/html file. Therefore we don't know yet
2174 at that stage that -E is going to cause us to tack on
2175 ".html", so we need to compare vs. the original URL plus
2176 ".orig", not the original URL plus ".html.orig". */
2177 filename_plus_orig_suffix = alloca (filename_len + 1);
2178 strcpy(filename_plus_orig_suffix, file);
2179 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2181 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2183 /* Append ".orig" to the name. */
2184 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2185 strcpy(filename_plus_orig_suffix, file);
2186 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2189 /* We can get called twice on the same URL thanks to the
2190 convert_all_links() call in main(). If we write the .orig file
2191 each time in such a case, it'll end up containing the first-pass
2192 conversion, not the original file. So, see if we've already been
2193 called on this file. */
2194 converted_file_ptr = converted_files;
2195 while (converted_file_ptr != NULL)
2196 if (strcmp(converted_file_ptr->string, file) == 0)
2198 already_wrote_backup_file = TRUE;
2202 converted_file_ptr = converted_file_ptr->next;
2204 if (!already_wrote_backup_file)
2206 /* Rename <file> to <file>.orig before former gets written over. */
2207 if (rename(file, filename_plus_orig_suffix) != 0)
2208 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2209 file, filename_plus_orig_suffix, strerror (errno));
2211 /* Remember that we've already written a .orig backup for this file.
2212 Note that we never free this memory since we need it till the
2213 convert_all_links() call, which is one of the last things the
2214 program does before terminating. BTW, I'm not sure if it would be
2215 safe to just set 'converted_file_ptr->string' to 'file' below,
2216 rather than making a copy of the string... Another note is that I
2217 thought I could just add a field to the urlpos structure saying
2218 that we'd written a .orig file for this URL, but that didn't work,
2219 so I had to make this separate list.
2220 -- Dan Harkless <wget@harkless.org>
2222 This [adding a field to the urlpos structure] didn't work
2223 because convert_file() is called from convert_all_links at
2224 the end of the retrieval with a freshly built new urlpos
2226 -- Hrvoje Niksic <hniksic@arsdigita.com>
2228 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2229 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2230 converted_file_ptr->next = converted_files;
2231 converted_files = converted_file_ptr;
2235 static int find_fragment PARAMS ((const char *, int, const char **,
/* Replace an attribute's original text with NEW_TEXT.  P points at
   the attribute value (optionally quoted) in the input buffer, SIZE
   is its length including any quotes, and FP is the output stream.
   Any fragment identifier present in the old value is preserved.
   Returns the position in the input just past the old value.  */
static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
  int quote_flag = 0;
  char quote_char = '\"';	/* use "..." for quoting, unless the
				   original value is quoted, in which
				   case reuse its quoting char.  */
  const char *frag_beg, *frag_end;

  /* Structure of our string is:
       "...old-contents..."
       <---    size    --->  (with quotes)
     OR:
       ...old-contents...
       <---    size   -->    (no quotes)   */

  if (*p == '\"' || *p == '\'')
    {
      quote_char = *p;
      quote_flag = 1;
      ++p;
      size -= 2;		/* disregard opening and closing quote */
    }
  putc (quote_char, fp);
  fputs (new_text, fp);

  /* Look for fragment identifier, if any.  */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  p += size;
  if (quote_flag)
    ++p;
  putc (quote_char, fp);

  return p;
}
/* The same as REPLACE_ATTR, but used when replacing
   <meta http-equiv=refresh content="new_text"> because we need to
   prepend "TIMEOUT; URL=" to NEW_TEXT.  */
static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
			   const char *new_text, int timeout)
{
  /* Worst-case size: digits of TIMEOUT + "; URL=" + NEW_TEXT + '\0'.  */
  char *new_with_timeout = (char *)alloca (numdigit (timeout)
					   + 6 /* "; URL=" */
					   + strlen (new_text)
					   + 1);
  sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);

  return replace_attr (p, size, fp, new_with_timeout);
}
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return zero.  If
   the character is found, return 1 and set BP and EP to point to the
   beginning and end of the region.

   This is used for finding the fragment identifiers in URLs.  The
   '&' guard avoids mistaking SGML entities such as "&#38;" for
   fragments.  */
static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  int saw_amp = 0;
  for (; beg < end; beg++)
    {
      switch (*beg)
	{
	case '&':
	  saw_amp = 1;
	  break;
	case '#':
	  if (!saw_amp)
	    {
	      *bp = beg;
	      *ep = end;
	      return 1;
	    }
	  /* fallthrough */
	default:
	  saw_amp = 0;
	}
    }
  return 0;
}
2329 /* Quote FILE for use as local reference to an HTML file.
2331 We quote ? as %3F to avoid passing part of the file name as the
2332 parameter when browsing the converted file through HTTP. However,
2333 it is safe to do this only when `--html-extension' is turned on.
2334 This is because converting "index.html?foo=bar" to
2335 "index.html%3Ffoo=bar" would break local browsing, as the latter
2336 isn't even recognized as an HTML file! However, converting
2337 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2338 safe for both local and HTTP-served browsing. */
2341 local_quote_string (const char *file)
2343 const char *file_sans_qmark;
2346 if (!opt.html_extension)
2347 return html_quote_string (file);
2349 qm = count_char (file, '?');
2353 const char *from = file;
2356 /* qm * 2 because we replace each question mark with "%3F",
2357 i.e. replace one char with three, hence two more. */
2358 int fsqlen = strlen (file) + qm * 2;
2360 to = newname = (char *)alloca (fsqlen + 1);
2361 for (; *from; from++)
2372 assert (to - newname == fsqlen);
2375 file_sans_qmark = newname;
2378 file_sans_qmark = file;
2380 return html_quote_string (file_sans_qmark);
2383 /* We're storing "modes" of type downloaded_file_t in the hash table.
2384 However, our hash tables only accept pointers for keys and values.
2385 So when we need a pointer, we use the address of a
2386 downloaded_file_t variable of static storage. */
2388 static downloaded_file_t *
2389 downloaded_mode_to_ptr (downloaded_file_t mode)
2391 static downloaded_file_t
2392 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2393 v2 = FILE_DOWNLOADED_NORMALLY,
2394 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2395 v4 = CHECK_FOR_FILE;
2399 case FILE_NOT_ALREADY_DOWNLOADED:
2401 case FILE_DOWNLOADED_NORMALLY:
2403 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2405 case CHECK_FOR_FILE:
2411 /* This should really be merged with dl_file_url_map and
2412 downloaded_html_files in recur.c. This was originally a list, but
2413 I changed it to a hash table beause it was actually taking a lot of
2414 time to find things in it. */
2416 static struct hash_table *downloaded_files_hash;
2418 /* Remembers which files have been downloaded. In the standard case, should be
2419 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2420 download successfully (i.e. not for ones we have failures on or that we skip
2423 When we've downloaded a file and tacked on a ".html" extension due to -E,
2424 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2425 FILE_DOWNLOADED_NORMALLY.
2427 If you just want to check if a file has been previously added without adding
2428 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2429 with local filenames, not remote URLs. */
2431 downloaded_file (downloaded_file_t mode, const char *file)
2433 downloaded_file_t *ptr;
2435 if (mode == CHECK_FOR_FILE)
2437 if (!downloaded_files_hash)
2438 return FILE_NOT_ALREADY_DOWNLOADED;
2439 ptr = hash_table_get (downloaded_files_hash, file);
2441 return FILE_NOT_ALREADY_DOWNLOADED;
2445 if (!downloaded_files_hash)
2446 downloaded_files_hash = make_string_hash_table (0);
2448 ptr = hash_table_get (downloaded_files_hash, file);
2452 ptr = downloaded_mode_to_ptr (mode);
2453 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2455 return FILE_NOT_ALREADY_DOWNLOADED;
/* Hash-table mapper for downloaded_files_free: free the key (the
   strdup'ed file name).  Returning 0 keeps hash_table_map iterating
   over the remaining entries.  */
static int
df_free_mapper (void *key, void *value, void *ignored)
{
  xfree (key);
  return 0;
}
2466 downloaded_files_free (void)
2468 if (downloaded_files_hash)
2470 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2471 hash_table_destroy (downloaded_files_hash);
2472 downloaded_files_hash = NULL;
2476 /* Return non-zero if scheme a is similar to scheme b.
2478 Schemes are similar if they are equal. If SSL is supported, schemes
2479 are also similar if one is http (SCHEME_HTTP) and the other is https
2482 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2487 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2488 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
/* Debugging and testing support for path_simplify.  */

/* Debug: run path_simplify on PATH and return the result in a new
   string.  Useful for calling from the debugger.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (copy);
  return copy;
}
/* Run path_simplify on TEST and complain on stdout if the result
   differs from EXPECTED_RESULT, or if whether the path was modified
   differs from EXPECTED_CHANGE (1 = we expect a modification, 0 = we
   expect none).  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
	      test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      /* BUG FIX: the two diagnostics were swapped.  When
	 EXPECTED_CHANGE is 1 we expected path_simplify to modify the
	 path and it did not, so the correct complaint is "Expected
	 modification", and vice versa.  */
      if (expected_change == 1)
	printf ("Expected modification with path_simplify(\"%s\").\n",
		test);
      else if (expected_change == 0)
	printf ("Expected no modification with path_simplify(\"%s\").\n",
		test);
    }
  xfree (test_copy);
}
2531 test_path_simplify (void)
2534 char *test, *result;
2540 { "foo", "foo", 0 },
2541 { "foo/bar", "foo/bar", 0 },
2542 { "foo///bar", "foo/bar", 1 },
2543 { "foo/.", "foo/", 1 },
2544 { "foo/./", "foo/", 1 },
2545 { "foo./", "foo./", 0 },
2546 { "foo/../bar", "bar", 1 },
2547 { "foo/../bar/", "bar/", 1 },
2548 { "foo/bar/..", "foo/", 1 },
2549 { "foo/bar/../x", "foo/x", 1 },
2550 { "foo/bar/../x/", "foo/x/", 1 },
2551 { "foo/..", "", 1 },
2552 { "foo/../..", "", 1 },
2553 { "a/b/../../c", "c", 1 },
2554 { "./a/../b", "b", 1 }
2558 for (i = 0; i < ARRAY_SIZE (tests); i++)
2560 char *test = tests[i].test;
2561 char *expected_result = tests[i].result;
2562 int expected_change = tests[i].should_modify;
2563 run_test (test, expected_result, expected_change);
2566 /* Now run all the tests with a leading slash before the test case,
2567 to prove that the slash is being preserved. */
2568 for (i = 0; i < ARRAY_SIZE (tests); i++)
2570 char *test, *expected_result;
2571 int expected_change = tests[i].should_modify;
2573 test = xmalloc (1 + strlen (tests[i].test) + 1);
2574 sprintf (test, "/%s", tests[i].test);
2576 expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2577 sprintf (expected_result, "/%s", tests[i].result);
2579 run_test (test, expected_result, expected_change);
2582 xfree (expected_result);