2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* Test whether X points to the path element "." (a single dot
   followed by NUL). */
47 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* Test whether X points to the path element ".." (two dots followed
   by NUL). */
49 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
58 /* Supported schemes: */
/* Entries appear to be { leading_string, default_port, enabled };
   the entry's index doubles as its enum url_scheme value (see
   url_scheme() and scheme_default_port() below) -- the struct
   declaration itself is not visible here. */
59 static struct scheme_data supported_schemes[] =
61 { "http://", DEFAULT_HTTP_PORT, 1 },
63 { "https://", DEFAULT_HTTPS_PORT, 1 },
65 { "ftp://", DEFAULT_FTP_PORT, 1 },
71 /* Forward declarations: */
73 static char *construct_relative PARAMS ((const char *, const char *));
74 static int path_simplify PARAMS ((char *));
78 /* Support for encoding and decoding of URL strings. We determine
79 whether a character is unsafe through static table lookup. This
80 code assumes ASCII character set and 8-bit chars. */
/* Shorthand flag names used in the urlchr_table initializer below. */
87 #define R urlchr_reserved
88 #define U urlchr_unsafe
/* Test character C against MASK via the lookup table; the cast to
   unsigned char avoids negative indexing when plain char is signed. */
91 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
93 /* rfc1738 reserved chars, preserved from encoding. */
95 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
97 /* rfc1738 unsafe chars, plus some more. */
99 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
/* Per-byte classification table consulted by urlchr_test(): R marks
   rfc1738 reserved characters, U marks unsafe ones; all bytes with
   the high bit set (128-255) are unsafe.  Indexed by unsigned char;
   assumes ASCII and 8-bit chars, as noted above.
   Fix: `static' moved before `const' -- the C standard deprecates
   storage-class specifiers that are not first among the declaration
   specifiers (C90/C11 6.11.5). */
101 static const unsigned char urlchr_table[256] =
103 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
104 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
105 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
106 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
107 U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
108 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
109 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
110 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
111 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
112 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
113 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
114 0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
115 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
116 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
117 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
118 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
120 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
121 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
122 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
123 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
125 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
126 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
127 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
128 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
131 /* Decodes the forms %xy in a URL to the character the hexadecimal
132 code of which is xy. xy are hexadecimal digits from
133 [0123456789ABCDEF] (case-insensitive). If x or y are not
134 hex-digits or `%' precedes `\0', the sequence is inserted
138 decode_string (char *s)
/* In-place %xx decoding: T (write position) trails H (read
   position); the decoded string is never longer than the input. */
140 char *t = s; /* t - tortoise */
141 char *h = s; /* h - hare */
152 /* Do nothing if '%' is not followed by two hex digits. */
153 if (!*(h + 1) || !*(h + 2)
154 || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
/* Convert the two hex digits into one byte and store it at T. */
156 *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
163 /* Like encode_string, but return S if there are no unsafe chars. */
166 encode_string_maybe (const char *s)
/* First pass: count the extra bytes the %XX escapes will need. */
173 for (p1 = s; *p1; p1++)
174 if (UNSAFE_CHAR (*p1))
175 addition += 2; /* Two more characters (hex digits) */
180 newlen = (p1 - s) + addition;
181 newstr = (char *)xmalloc (newlen + 1);
/* Second pass: copy, replacing each unsafe byte with %XY. */
187 if (UNSAFE_CHAR (*p1))
189 unsigned char c = *p1++;
191 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
192 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* The two passes must agree on the output length. */
198 assert (p2 - newstr == newlen);
203 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
204 given string, returning a malloc-ed %XX encoded string. */
207 encode_string (const char *s)
/* encode_string_maybe returns S itself when nothing needs quoting;
   the elided remainder presumably copies S in that case so the
   caller always owns the result -- TODO confirm. */
209 char *encoded = encode_string_maybe (s);
216 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
217 the old value of PTR is freed and PTR is made to point to the newly
218 allocated storage. */
220 #define ENCODE(ptr) do { \
221 char *e_new = encode_string_maybe (ptr); \
/* Per-character verdict used by reencode_string's two passes. */
229 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
231 /* Decide whether to encode, decode, or pass through the char at P.
232 This used to be a macro, but it got a little too convoluted. */
233 static inline enum copy_method
234 decide_copy_method (const char *p)
/* A '%' followed by two hex digits is an existing %xx escape. */
238 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
240 /* %xx sequence: decode it, unless it would decode to an
241 unsafe or a reserved char; in that case, leave it as
243 char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
244 XCHAR_TO_XDIGIT (*(p + 2));
246 if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
247 return CM_PASSTHROUGH;
252 /* Garbled %.. sequence: encode `%'. */
/* Plain character: encode only if unsafe and not reserved. */
255 else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
258 return CM_PASSTHROUGH;
261 /* Translate a %-quoting (but possibly non-conformant) input string S
262 into a %-quoting (and conformant) output string. If no characters
263 are encoded or decoded, return the same string S; otherwise, return
264 a freshly allocated string with the new contents.
266 After a URL has been run through this function, the protocols that
267 use `%' as the quote character can use the resulting string as-is,
268 while those that don't call decode_string() to get to the intended
269 data. This function is also stable: after an input string is
270 transformed the first time, all further transformations of the
271 result yield the same result string.
273 Let's discuss why this function is needed.
275 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
276 space character would mess up the HTTP request, it needs to be
279 GET /abc%20def HTTP/1.0
281 So it appears that the unsafe chars need to be quoted, as with
282 encode_string. But what if we're requested to download
283 `abc%20def'? Remember that %-encoding is valid URL syntax, so what
284 the user meant was a literal space, and he was kind enough to quote
285 it. In that case, Wget should obviously leave the `%20' as is, and
286 send the same request as above. So in this case we may not call
289 But what if the requested URI is `abc%20 def'? If we call
290 encode_string, we end up with `/abc%2520%20def', which is almost
291 certainly not intended. If we don't call encode_string, we are
292 left with the embedded space and cannot send the request. What the
293 user meant was for Wget to request `/abc%20%20def', and this is
294 where reencode_string kicks in.
296 Wget used to solve this by first decoding %-quotes, and then
297 encoding all the "unsafe" characters found in the resulting string.
298 This was wrong because it didn't preserve certain URL special
299 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
300 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
301 whether we considered `+' reserved (it is). One of these results
302 is inevitable because by the second step we would lose information
303 on whether the `+' was originally encoded or not. Both results
304 were wrong because in CGI parameters + means space, while %2B means
305 literal plus. reencode_string correctly translates the above to
306 "a%2B+b", i.e. returns the original string.
308 This function uses an algorithm proposed by Anon Sricharoenchai:
310 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
313 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
316 ...except that this code conflates the two steps, and decides
317 whether to encode, decode, or pass through each character in turn.
318 The function still uses two passes, but their logic is the same --
319 the first pass exists merely for the sake of allocation. Another
320 small difference is that we include `+' to URL_RESERVED.
324 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
326 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
330 "foo bar" -> "foo%20bar"
331 "foo%20bar" -> "foo%20bar"
332 "foo %20bar" -> "foo%20%20bar"
333 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
334 "foo%25%20bar" -> "foo%25%20bar"
335 "foo%2%20bar" -> "foo%252%20bar"
336 "foo+bar" -> "foo+bar" (plus is reserved!)
337 "foo%2b+bar" -> "foo%2b+bar" */
/* See the long comment above for the rationale and examples.
   Both passes below classify each position with decide_copy_method;
   the first pass only measures, the second writes. */
340 reencode_string (const char *s)
346 int encode_count = 0;
347 int decode_count = 0;
349 /* First, pass through the string to see if there's anything to do,
350 and to calculate the new length. */
351 for (p1 = s; *p1; p1++)
353 switch (decide_copy_method (p1))
366 if (!encode_count && !decode_count)
367 /* The string is good as it is. */
368 return (char *)s; /* C const model sucks. */
371 /* Each encoding adds two characters (hex digits), while each
372 decoding removes two characters. */
373 newlen = oldlen + 2 * (encode_count - decode_count);
374 newstr = xmalloc (newlen + 1);
381 switch (decide_copy_method (p1))
/* CM_ENCODE: emit %XY for the unsafe byte. */
385 unsigned char c = *p1++;
387 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
388 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* CM_DECODE: collapse the %xx triple into a single byte. */
392 *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
393 + (XCHAR_TO_XDIGIT (*(p1 + 2))));
394 p1 += 3; /* skip %xx */
/* Both passes must agree on the output length. */
401 assert (p2 - newstr == newlen);
405 /* Run PTR_VAR through reencode_string. If a new string is consed,
406 free PTR_VAR and make it point to the new storage. Obviously,
407 PTR_VAR needs to be an lvalue. */
409 #define REENCODE(ptr_var) do { \
410 char *rf_new = reencode_string (ptr_var); \
411 if (rf_new != ptr_var) \
418 /* Returns the scheme type if the scheme is supported, or
419 SCHEME_INVALID if not. */
421 url_scheme (const char *url)
/* Linear search over supported_schemes; the table index is the
   enum url_scheme value. */
425 for (i = 0; supported_schemes[i].leading_string; i++)
426 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
427 strlen (supported_schemes[i].leading_string)))
429 if (supported_schemes[i].enabled)
430 return (enum url_scheme) i;
/* Scheme matched but was turned off via scheme_disable(). */
432 return SCHEME_INVALID;
435 return SCHEME_INVALID;
438 /* Return the number of characters needed to skip the scheme part of
439 the URL, e.g. `http://'. If no scheme is found, returns 0. */
441 url_skip_scheme (const char *url)
445 /* Skip the scheme name. We allow `-' and `+' because of `whois++',
447 while (ISALNUM (*p) || *p == '-' || *p == '+')
454 /* Skip "//" if found. */
455 if (*p == '/' && *(p + 1) == '/')
461 /* Returns 1 if the URL begins with a scheme (supported or
462 unsupported), 0 otherwise. */
464 url_has_scheme (const char *url)
467 while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Return SCHEME's default port, straight from the table. */
473 scheme_default_port (enum url_scheme scheme)
475 return supported_schemes[scheme].default_port;
/* Mark SCHEME as disabled so url_scheme() rejects it. */
479 scheme_disable (enum url_scheme scheme)
481 supported_schemes[scheme].enabled = 0;
484 /* Skip the username and password, if present here. The function
485 should be called *not* with the complete URL, but with the part
486 right after the scheme.
488 If no username and password are found, return 0. */
490 url_skip_uname (const char *url)
494 /* Look for '@' that comes before '/' or '?'. */
495 p = (const char *)strpbrk (url, "/?@");
/* Split LEN bytes of "user[:password]" at STR into malloc'd *USER
   and *PASSWD.  Returns zero on failure (judging from the caller in
   url_parse); the success-path return is in elided code. */
503 parse_uname (const char *str, int len, char **user, char **passwd)
508 /* Empty user name not allowed. */
511 colon = memchr (str, ':', len);
513 /* Empty user name again. */
/* Password is everything after the colon. */
518 int pwlen = len - (colon + 1 - str);
519 *passwd = xmalloc (pwlen + 1);
520 memcpy (*passwd, colon + 1, pwlen);
521 (*passwd)[pwlen] = '\0';
/* Copy the user-name part (LEN is adjusted in elided code). */
527 *user = xmalloc (len + 1);
528 memcpy (*user, str, len);
534 /* Used by main.c: detect URLs written using the "shorthand" URL forms
535 popularized by Netscape and NcFTP. HTTP shorthands look like this:
537 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
538 www.foo.com[:port] -> http://www.foo.com[:port]
540 FTP shorthands look like this:
542 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
543 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
545 If the URL needs not or cannot be rewritten, return NULL. */
547 rewrite_shorthand_url (const char *url)
/* Already has a scheme -- nothing to rewrite. */
551 if (url_has_scheme (url))
554 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
556 for (p = url; *p && *p != ':' && *p != '/'; p++)
566 /* If the characters after the colon and before the next slash
567 or end of string are all digits, it's HTTP. */
569 for (pp = p + 1; ISDIGIT (*pp); pp++)
571 if (digits > 0 && (*pp == '/' || *pp == '\0'))
/* Otherwise "host:path" is treated as NcFTP/FTP shorthand. */
574 /* Prepend "ftp://" to the entire URL... */
575 res = xmalloc (6 + strlen (url) + 1);
576 sprintf (res, "ftp://%s", url);
577 /* ...and replace ':' with '/'. */
578 res[6 + (p - url)] = '/';
585 /* Just prepend "http://" to what we have. */
586 res = xmalloc (7 + strlen (url) + 1);
587 sprintf (res, "http://%s", url);
592 static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, but return a pointer to S's terminating NUL instead
   of NULL when none of ACCEPT occurs. */
595 strpbrk_or_eos (const char *s, const char *accept)
597 char *p = strpbrk (s, accept);
599 p = (char *)s + strlen (s);
603 /* Turn STR into lowercase; return non-zero if a character was
607 lowercase_str (char *str)
614 *str = TOLOWER (*str);
/* Messages for url_parse() error codes; the PE_* macros double as
   indices into this array. */
619 static char *parse_errors[] = {
620 #define PE_NO_ERROR 0
622 #define PE_UNSUPPORTED_SCHEME 1
623 "Unsupported scheme",
624 #define PE_EMPTY_HOST 2
626 #define PE_BAD_PORT_NUMBER 3
628 #define PE_INVALID_USER_NAME 4
/* Store error code V through pointer P (macro body is in elided
   lines; presumably guarded against P == NULL -- TODO confirm). */
632 #define SETERR(p, v) do { \
639 Return a new struct url if successful, NULL on error. In case of
640 error, and if ERROR is not NULL, also set *ERROR to the appropriate
643 url_parse (const char *url, int *error)
647 int path_modified, host_modified;
649 enum url_scheme scheme;
/* [b, e) pointer pairs delimiting each component inside the
   %-requoted copy of URL. */
651 const char *uname_b, *uname_e;
652 const char *host_b, *host_e;
653 const char *path_b, *path_e;
654 const char *params_b, *params_e;
655 const char *query_b, *query_e;
656 const char *fragment_b, *fragment_e;
659 char *user = NULL, *passwd = NULL;
663 scheme = url_scheme (url);
664 if (scheme == SCHEME_INVALID)
666 SETERR (error, PE_UNSUPPORTED_SCHEME);
/* Work on a canonically %-quoted copy; reencode_string returns URL
   itself when nothing needed changing. */
670 url_encoded = reencode_string (url);
673 p += strlen (supported_schemes[scheme].leading_string);
675 p += url_skip_uname (p);
678 /* scheme://user:pass@host[:port]... */
681 /* We attempt to break down the URL into the components path,
682 params, query, and fragment. They are ordered like this:
684 scheme://host[:port][/path][;params][?query][#fragment] */
686 params_b = params_e = NULL;
687 query_b = query_e = NULL;
688 fragment_b = fragment_e = NULL;
691 p = strpbrk_or_eos (p, ":/;?#");
694 if (host_b == host_e)
696 SETERR (error, PE_EMPTY_HOST);
700 port = scheme_default_port (scheme);
703 const char *port_b, *port_e, *pp;
705 /* scheme://host:port/tralala */
709 p = strpbrk_or_eos (p, "/;?#");
712 if (port_b == port_e)
714 /* http://host:/whatever */
716 SETERR (error, PE_BAD_PORT_NUMBER);
/* Parse the port as a decimal number; any non-digit is an error. */
720 for (port = 0, pp = port_b; pp < port_e; pp++)
724 /* http://host:12randomgarbage/blah */
726 SETERR (error, PE_BAD_PORT_NUMBER);
729 port = 10 * port + (*pp - '0');
737 p = strpbrk_or_eos (p, ";?#");
742 /* Path is not allowed not to exist. */
750 p = strpbrk_or_eos (p, "?#");
757 p = strpbrk_or_eos (p, "#");
769 if (uname_b != uname_e)
771 /* http://user:pass@host */
773 /* uname_b uname_e */
774 if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
776 SETERR (error, PE_INVALID_USER_NAME);
/* All components validated: build the result object. */
781 u = (struct url *)xmalloc (sizeof (struct url));
782 memset (u, 0, sizeof (*u));
785 u->host = strdupdelim (host_b, host_e);
790 u->path = strdupdelim (path_b, path_e);
791 path_modified = path_simplify (u->path);
792 parse_path (u->path, &u->dir, &u->file);
794 host_modified = lowercase_str (u->host);
797 u->params = strdupdelim (params_b, params_e);
799 u->query = strdupdelim (query_b, query_e);
801 u->fragment = strdupdelim (fragment_b, fragment_e);
803 if (path_modified || u->fragment || host_modified || path_b == path_e)
805 /* If we suspect that a transformation has rendered what
806 url_string might return different from URL_ENCODED, rebuild
807 u->url using url_string. */
808 u->url = url_string (u, 0);
810 if (url_encoded != url)
811 xfree ((char *) url_encoded);
/* Otherwise reuse URL_ENCODED (or a fresh copy of URL) as u->url. */
815 if (url_encoded == url)
816 u->url = xstrdup (url);
818 u->url = url_encoded;
/* Map an url_parse() error code to its human-readable message. */
826 url_error (int error_code)
828 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
829 return parse_errors[error_code];
/* Split QUOTED_PATH into malloc'd *DIR and *FILE around the last
   '/'; the path is %-decoded first. */
833 parse_path (const char *quoted_path, char **dir, char **file)
835 char *path, *last_slash;
837 STRDUP_ALLOCA (path, quoted_path);
838 decode_string (path);
840 last_slash = strrchr (path, '/');
/* No slash at all: the whole path is the file name (the *dir
   assignment for this case is in elided code). */
844 *file = xstrdup (path);
848 *dir = strdupdelim (path, last_slash);
849 *file = xstrdup (last_slash + 1);
853 /* Note: URL's "full path" is the path with the query string and
854 params appended. The "fragment" (#foo) is intentionally ignored,
855 but that might be changed. For example, if the original URL was
856 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
857 the full path will be "/foo/bar/baz;bullshit?querystring". */
859 /* Return the length of the full path, without the terminating
863 full_path_length (const struct url *url)
/* Each present component costs one separator char plus its length. */
867 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
878 /* Write out the full path. */
881 full_path_write (const struct url *url, char *where)
/* Emit separator CHR followed by component EL, when present. */
883 #define FROB(el, chr) do { \
884 char *f_el = url->el; \
886 int l = strlen (f_el); \
888 memcpy (where, f_el, l); \
900 /* Public function for getting the "full path". E.g. if u->path is
901 "foo/bar" and u->query is "param=value", full_path will be
902 "/foo/bar?param=value". */
905 url_full_path (const struct url *url)
907 int length = full_path_length (url);
908 char *full_path = (char *)xmalloc(length + 1);
910 full_path_write (url, full_path);
911 full_path[length] = '\0';
916 /* Sync u->path and u->url with u->dir and u->file. */
919 sync_path (struct url *url)
/* Empty dir: the path is just the file name. */
927 newpath = xstrdup (url->file);
/* Otherwise join dir and file with a '/'. */
932 int dirlen = strlen (url->dir);
933 int filelen = strlen (url->file);
935 newpath = xmalloc (dirlen + 1 + filelen + 1);
936 memcpy (newpath, url->dir, dirlen);
937 newpath[dirlen] = '/';
938 memcpy (newpath + dirlen + 1, url->file, filelen);
939 newpath[dirlen + 1 + filelen] = '\0';
945 /* Synchronize u->url. */
947 url->url = url_string (url, 0);
950 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
951 This way we can sync u->path and u->url when they get changed. */
954 url_set_dir (struct url *url, const char *newdir)
957 url->dir = xstrdup (newdir);
962 url_set_file (struct url *url, const char *newfile)
965 url->file = xstrdup (newfile);
/* Release a struct url and all of its owned strings. */
970 url_free (struct url *url)
976 FREE_MAYBE (url->params);
977 FREE_MAYBE (url->query);
978 FREE_MAYBE (url->fragment);
979 FREE_MAYBE (url->user);
980 FREE_MAYBE (url->passwd);
/* Read FILE and build a linked list of urlpos entries, one per
   non-blank line; returns NULL-ish on read failure (elided). */
989 get_urls_file (const char *file)
991 struct file_memory *fm;
992 struct urlpos *head, *tail;
993 const char *text, *text_end;
996 fm = read_file (file);
999 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1002 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1006 text_end = fm->content + fm->length;
1007 while (text < text_end)
/* Carve out the next line; the last line may lack '\n'. */
1009 const char *line_beg = text;
1010 const char *line_end = memchr (text, '\n', text_end - text);
1012 line_end = text_end;
1017 /* Strip whitespace from the beginning and end of line. */
1018 while (line_beg < line_end && ISSPACE (*line_beg))
1020 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
1023 if (line_end > line_beg)
1025 /* URL is in the [line_beg, line_end) region. */
1029 struct urlpos *entry;
1032 /* We must copy the URL to a zero-terminated string, and we
1033 can't use alloca because we're in a loop. *sigh*. */
1034 url_text = strdupdelim (line_beg, line_end);
1038 /* Merge opt.base_href with URL. */
1039 char *merged = uri_merge (opt.base_href, url_text);
1044 url = url_parse (url_text, &up_error_code);
1047 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1048 file, url_text, url_error (up_error_code));
1054 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1055 memset (entry, 0, sizeof (*entry));
1066 read_file_free (fm);
1070 /* Free the linked list of urlpos. */
1072 free_urlpos (struct urlpos *l)
1076 struct urlpos *next = l->next;
1079 FREE_MAYBE (l->local_name);
1085 /* Rotate FNAME opt.backups times */
1087 rotate_backups(const char *fname)
/* Room for "FNAME.<n>" where <n> has at most numdigit(opt.backups)
   digits, plus the dot and the NUL. */
1089 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1090 char *from = (char *)alloca (maxlen);
1091 char *to = (char *)alloca (maxlen);
1095 if (stat (fname, &sb) == 0)
1096 if (S_ISREG (sb.st_mode) == 0)
/* Shift FNAME.1 ... FNAME.(backups-1) up by one suffix each. */
1099 for (i = opt.backups; i > 1; i--)
1101 sprintf (from, "%s.%d", fname, i - 1);
1102 sprintf (to, "%s.%d", fname, i);
1103 /* #### This will fail on machines without the rename() system
1108 sprintf (to, "%s.%d", fname, 1);
1112 /* Create all the necessary directories for PATH (a file). Calls
1113 mkdirhier() internally. */
1115 mkalldirs (const char *path)
/* Find the last '/', i.e. the end of the directory part. */
1122 p = path + strlen (path);
1123 for (; *p != '/' && p != path; p--);
1124 /* Don't create if it's just a file. */
1125 if ((p == path) && (*p != '/'))
1127 t = strdupdelim (path, p);
1128 /* Check whether the directory exists. */
1129 if ((stat (t, &st) == 0))
1131 if (S_ISDIR (st.st_mode))
1138 /* If the dir exists as a file name, remove it first. This
1139 is *only* for Wget to work with buggy old CERN http
1140 servers. Here is the scenario: When Wget tries to
1141 retrieve a directory without a slash, e.g.
1142 http://foo/bar (bar being a directory), CERN server will
1143 not redirect it to http://foo/bar/ -- it will generate a
1144 directory listing containing links to bar/file1,
1145 bar/file2, etc. Wget will lose because it saves this
1146 HTML listing to a file `bar', so it cannot create the
1147 directory. To work around this, if the file of the same
1148 name exists, we just remove it and create the directory
1150 DEBUGP (("Removing %s because of directory danger!\n", t));
1154 res = make_directory (t);
1156 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Count the '/' characters in S (body is in elided lines). */
1162 count_slashes (const char *s)
1171 /* Return the path name of the URL-equivalent file name, with a
1172 remote-like structure of directories. */
1174 mkstruct (const struct url *u)
1176 char *dir, *dir_preencoding;
1177 char *file, *res, *dirpref;
1178 char *query = u->query && *u->query ? u->query : NULL;
/* Honor --cut-dirs: drop the first opt.cut_dirs path components. */
1183 char *ptr = u->dir + (*u->dir == '/');
1184 int slash_count = 1 + count_slashes (ptr);
1185 int cut = MINVAL (opt.cut_dirs, slash_count);
1186 for (; cut && *ptr; ptr++)
1189 STRDUP_ALLOCA (dir, ptr);
1192 dir = u->dir + (*u->dir == '/');
1194 /* Check for the true name (or at least a consistent name for saving
1195 to directory) of HOST, reusing the hlist if possible. */
1196 if (opt.add_hostdir)
1198 /* Add dir_prefix and hostname (if required) to the beginning of
1200 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1202 + 1 + numdigit (u->port)
1204 if (!DOTP (opt.dir_prefix))
1205 sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1207 strcpy (dirpref, u->host);
/* Append ":port" when the port is not the scheme's default. */
1209 if (u->port != scheme_default_port (u->scheme))
1211 int len = strlen (dirpref);
1213 number_to_string (dirpref + len + 1, u->port);
1216 else /* not add_hostdir */
1218 if (!DOTP (opt.dir_prefix))
1219 dirpref = opt.dir_prefix;
1224 /* If there is a prefix, prepend it. */
1227 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1228 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
/* Re-%-quote the directory; reencode_string returns its argument
   unchanged when nothing needed quoting. */
1232 dir_preencoding = dir;
1233 dir = reencode_string (dir_preencoding);
1236 if (l && dir[l - 1] == '/')
/* Empty file component: fall back to "index.html". */
1240 file = "index.html";
1244 /* Finally, construct the full name. */
1245 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1246 + (query ? (1 + strlen (query)) : 0)
1248 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1252 strcat (res, query);
/* reencode_string allocated a new string only if it changed DIR. */
1254 if (dir != dir_preencoding)
1259 /* Compose a file name out of BASE, an unescaped file name, and QUERY,
1260 an escaped query string. The trick is to make sure that unsafe
1261 characters in BASE are escaped, and that slashes in QUERY are also
1265 compose_file_name (char *base, char *query)
1271 /* Copy BASE to RESULT and encode all unsafe characters. */
1273 while (*from && to - result < sizeof (result))
1275 if (UNSAFE_CHAR (*from))
1277 unsigned char c = *from++;
1279 *to++ = XDIGIT_TO_XCHAR (c >> 4);
1280 *to++ = XDIGIT_TO_XCHAR (c & 0xf);
1286 if (query && to - result < sizeof (result))
1290 /* Copy QUERY to RESULT and encode all '/' characters. */
1292 while (*from && to - result < sizeof (result))
/* RESULT is a fixed-size local buffer; guarantee NUL termination
   even when the input had to be truncated. */
1306 if (to - result < sizeof (result))
1309 /* Truncate input which is too long, presumably due to a huge
1311 result[sizeof (result) - 1] = '\0';
/* Hand back a heap copy the caller owns. */
1313 return xstrdup (result);
1316 /* Create a unique filename, corresponding to a given URL. Calls
1317 mkstruct if necessary. Does *not* actually create any directories. */
1319 url_filename (const struct url *u)
1322 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1326 file = mkstruct (u);
/* No directory structure requested: build a flat file name. */
1331 char *base = *u->file ? u->file : "index.html";
1332 char *query = u->query && *u->query ? u->query : NULL;
1333 file = compose_file_name (base, query);
1338 /* Check whether the prefix directory is something other than "."
1339 before prepending it. */
1340 if (!DOTP (opt.dir_prefix))
1342 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1343 + 1 + strlen (file) + 1);
1344 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1349 /* DOS-ish file systems don't like `%' signs in them; we change it
1354 for (p = file; *p; p++)
1358 #endif /* WINDOWS */
1360 /* Check the cases in which the unique extensions are not used:
1361 1) Clobbering is turned off (-nc).
1362 2) Retrieval with regetting.
1363 3) Timestamping is used.
1364 4) Hierarchy is built.
1366 The exception is the case when file does exist and is a
1367 directory (actually support for bad httpd-s). */
1368 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1369 && !(file_exists_p (file) && !file_non_directory_p (file)))
1372 /* Find a unique name. */
1373 name = unique_name (file);
1378 /* Return the length of URL's path. Path is considered to be
1379 terminated by one of '?', ';', '#', or by the end of the
1382 path_length (const char *url)
1384 const char *q = strpbrk_or_eos (url, "?;#");
1388 /* Find the last occurrence of character C in the range [b, e), or
1389 NULL, if none are present. This is equivalent to strrchr(b, c),
1390 except that it accepts an END argument instead of requiring the
1391 string to be zero-terminated. Why is there no memrchr()? */
1393 find_last_char (const char *b, const char *e, char c)
1401 /* Resolve "." and ".." elements of PATH by destructively modifying
1402 PATH. "." is resolved by removing that path element, and ".." is
1403 resolved by removing the preceding path element. Leading and
1404 trailing slashes are preserved.
1406 Return non-zero if any changes have been made.
1408 For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
1409 test examples are provided below. If you change anything in this
1410 function, run test_path_simplify to make sure you haven't broken a
1413 A previous version of this function was based on path_simplify()
1414 from GNU Bash, but it has been rewritten for Wget 1.8.1. */
1417 path_simplify (char *path)
1423 ++path; /* preserve the leading '/'. */
1426 end = p + strlen (p) + 1; /* position past the terminating zero. */
1431 /* P should point to the beginning of a path element. */
/* Current element is "." -- drop it. */
1433 if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
1435 /* Handle "./foo" by moving "foo" two characters to the
1437 if (*(p + 1) == '/')
1440 memmove (p, p + 2, end - p);
/* Current element is ".." -- drop it and the element before it. */
1451 else if (*p == '.' && *(p + 1) == '.'
1452 && (*(p + 2) == '/' || *(p + 2) == '\0'))
1454 /* Handle "../foo" by moving "foo" one path element to the
1456 char *b = p; /* not p-1 because P can equal PATH */
1458 /* Backtrack by one path element, but not past the beginning
1461 /* foo/bar/../baz */
1467 /* Move backwards until B hits the beginning of the
1468 previous path element or the beginning of path. */
1469 for (--b; b > path && *(b - 1) != '/'; b--)
1474 if (*(p + 2) == '/')
1476 memmove (b, p + 3, end - (p + 3));
1490 /* Remove empty path elements. Not mandated by rfc1808 et
1491 al, but empty path elements are not all that useful, and
1492 the rest of Wget might not deal with them well. */
1502 memmove (p, q, end - q);
1507 /* Skip to the next path element. */
1508 while (*p && *p != '/')
1513 /* Make sure P points to the beginning of the next path element,
1514 which is location after the slash. */
1521 /* Resolve the result of "linking" a base URI (BASE) to a
1522 link-specified URI (LINK).
1524 Either of the URIs may be absolute or relative, complete with the
1525 host name, or path only. This tries to behave "reasonably" in all
1526 foreseeable cases. It employs little specific knowledge about
1527 schemes or URL-specific stuff -- it just works on strings.
1529 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1530 See uri_merge for a gentler interface to this functionality.
1532 Perhaps this function should call path_simplify so that the callers
1533 don't have to call url_parse unconditionally. */
1535 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1541 const char *end = base + path_length (base);
1545 /* Empty LINK points back to BASE, query string and all. */
1546 constr = xstrdup (base);
1548 else if (*link == '?')
1550 /* LINK points to the same location, but changes the query
1551 string. Examples: */
1552 /* uri_merge("path", "?new") -> "path?new" */
1553 /* uri_merge("path?foo", "?new") -> "path?new" */
1554 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1555 /* uri_merge("path#foo", "?new") -> "path?new" */
1556 int baselength = end - base;
1557 constr = xmalloc (baselength + linklength + 1);
1558 memcpy (constr, base, baselength);
1559 memcpy (constr + baselength, link, linklength);
1560 constr[baselength + linklength] = '\0';
1562 else if (*link == '#')
1564 /* uri_merge("path", "#new") -> "path#new" */
1565 /* uri_merge("path#foo", "#new") -> "path#new" */
1566 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1567 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1569 const char *end1 = strchr (base, '#');
1571 end1 = base + strlen (base);
1572 baselength = end1 - base;
1573 constr = xmalloc (baselength + linklength + 1);
1574 memcpy (constr, base, baselength);
1575 memcpy (constr + baselength, link, linklength);
1576 constr[baselength + linklength] = '\0';
1578 else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
1580 /* LINK begins with "//" and so is a net path: we need to
1581 replace everything after (and including) the double slash
1584 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1585 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1586 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1590 const char *start_insert;
1592 /* Look for first slash. */
1593 slash = memchr (base, '/', end - base);
1594 /* If found slash and it is a double slash, then replace
1595 from this point, else default to replacing from the
1597 if (slash && *(slash + 1) == '/')
1598 start_insert = slash;
1600 start_insert = base;
1602 span = start_insert - base;
1603 constr = (char *)xmalloc (span + linklength + 1);
1605 memcpy (constr, base, span);
1606 memcpy (constr + span, link, linklength);
1607 constr[span + linklength] = '\0';
1609 else if (*link == '/')
1611 /* LINK is an absolute path: we need to replace everything
1612 after (and including) the FIRST slash with LINK.
1614 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1615 "/qux/xyzzy", our result should be
1616 "http://host/qux/xyzzy". */
1619 const char *start_insert = NULL; /* for gcc to shut up. */
1620 const char *pos = base;
1621 int seen_slash_slash = 0;
1622 /* We're looking for the first slash, but want to ignore
1625 slash = memchr (pos, '/', end - pos);
1626 if (slash && !seen_slash_slash)
1627 if (*(slash + 1) == '/')
1630 seen_slash_slash = 1;
1634 /* At this point, SLASH is the location of the first / after
1635 "//", or the first slash altogether. START_INSERT is the
1636 pointer to the location where LINK will be inserted. When
1637 examining the last two examples, keep in mind that LINK
1640 if (!slash && !seen_slash_slash)
1641 /* example: "foo" */
1643 start_insert = base;
1644 else if (!slash && seen_slash_slash)
1645 /* example: "http://foo" */
1648 else if (slash && !seen_slash_slash)
1649 /* example: "foo/bar" */
1651 start_insert = base;
1652 else if (slash && seen_slash_slash)
1653 /* example: "http://something/" */
1655 start_insert = slash;
1657 span = start_insert - base;
1658 constr = (char *)xmalloc (span + linklength + 1);
1660 memcpy (constr, base, span);
1662 memcpy (constr + span, link, linklength);
1663 constr[span + linklength] = '\0';
1667 /* LINK is a relative URL: we need to replace everything
1668 after last slash (possibly empty) with LINK.
1670 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1671 our result should be "whatever/foo/qux/xyzzy". */
1672 int need_explicit_slash = 0;
1674 const char *start_insert;
1675 const char *last_slash = find_last_char (base, end, '/');
1678 /* No slash found at all. Append LINK to what we have,
1679 but we'll need a slash as a separator.
1681 Example: if base == "foo" and link == "qux/xyzzy", then
1682 we cannot just append link to base, because we'd get
1683 "fooqux/xyzzy", whereas what we want is
1686 To make sure the / gets inserted, we set
1687 need_explicit_slash to 1. We also set start_insert
1688 to end + 1, so that the length calculations work out
1689 correctly for one more (slash) character. Accessing
1690 that character is fine, since it will be the
1691 delimiter, '\0' or '?'. */
1692 /* example: "foo?..." */
1693 /* ^ ('?' gets changed to '/') */
1694 start_insert = end + 1;
1695 need_explicit_slash = 1;
1697 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1699 /* example: http://host" */
1701 start_insert = end + 1;
1702 need_explicit_slash = 1;
1706 /* example: "whatever/foo/bar" */
1708 start_insert = last_slash + 1;
1711 span = start_insert - base;
1712 constr = (char *)xmalloc (span + linklength + 1);
1714 memcpy (constr, base, span);
1715 if (need_explicit_slash)
1716 constr[span - 1] = '/';
1718 memcpy (constr + span, link, linklength);
1719 constr[span + linklength] = '\0';
1722 else /* !no_scheme */
1724 constr = strdupdelim (link, link + linklength);
/* Merge BASE with LINK and return the resulting URI.  This is an
   interface to uri_merge_1 that assumes that LINK is a
   zero-terminated string.  The result is freshly allocated; the
   caller is responsible for freeing it.  */
char *
uri_merge (const char *base, const char *link)
{
  int link_len = strlen (link);
  /* A link that already carries a scheme ("http://...") needs no
     merging with BASE.  */
  int is_relative = !url_has_scheme (link);
  return uri_merge_1 (base, link, link_len, is_relative);
}
/* Append string S at cursor P and advance P past the copied bytes.
   S is evaluated more than once (strlen and memcpy), so pass only
   side-effect-free arguments.  NOTE(review): the macro's continuation
   lines are expected to advance P by LEN.  */
1738 #define APPEND(p, s) do { \
1739 int len = strlen (s); \
1740 memcpy (p, s, len); \
1744 /* Use this instead of password when the actual password is supposed
1745 to be hidden. We intentionally use a generic string without giving
1746 away the number of characters in the password, like previous
1748 #define HIDDEN_PASSWORD "*password*"
1750 /* Recreate the URL string from the data in URL.
1752 If HIDE is non-zero (as it is when we're calling this on a URL we
1753 plan to print, but not when calling it to canonicalize a URL for
1754 use within the program), password will be hidden. Unsafe
1755 characters in the URL will be quoted. */
/* Serialize URL into a freshly xmalloc'ed string of the form
   scheme://[user[:password]@]host[:port]/path.  Unsafe characters in
   the user and password are percent-quoted; when HIDE_PASSWORD is
   non-zero the password text is replaced by the HIDDEN_PASSWORD
   placeholder so the result is safe to print.  */
1758 url_string (const struct url *url, int hide_password)
1762 char *quoted_user = NULL, *quoted_passwd = NULL;
1764 int scheme_port = supported_schemes[url->scheme].default_port;
1765 char *scheme_str = supported_schemes[url->scheme].leading_string;
1766 int fplen = full_path_length (url);
1768 assert (scheme_str != NULL);
1770 /* Make sure the user name and password are quoted. */
1773 quoted_user = encode_string_maybe (url->user);
/* When hiding, substitute the string literal placeholder; it must not
   be xfree'd later (see the guarded frees at the bottom).  */
1777 quoted_passwd = HIDDEN_PASSWORD;
1779 quoted_passwd = encode_string_maybe (url->passwd);
/* Precompute the output size; every conditional append below must have
   a matching term here, or the final assert will trip.  */
1783 size = (strlen (scheme_str)
1784 + strlen (url->host)
/* ':' plus digits, only when the port differs from the scheme's
   default port.  */
1787 if (url->port != scheme_port)
1788 size += 1 + numdigit (url->port);
1791 size += 1 + strlen (quoted_user);
1793 size += 1 + strlen (quoted_passwd);
1796 p = result = xmalloc (size);
1798 APPEND (p, scheme_str);
1801 APPEND (p, quoted_user);
1805 APPEND (p, quoted_passwd);
1810 APPEND (p, url->host);
1811 if (url->port != scheme_port)
1814 p = number_to_string (p, url->port);
1817 full_path_write (url, p);
/* Cross-check: the bytes written must match the size computed above.  */
1821 assert (p - result == size);
/* Free the quoted copies only if quoting actually allocated new
   strings -- the pointer-equality guards cover the case where
   encode_string_maybe returned its argument unchanged -- and never
   free the HIDDEN_PASSWORD literal (excluded by !hide_password).  */
1823 if (quoted_user && quoted_user != url->user)
1824 xfree (quoted_user);
1825 if (quoted_passwd && !hide_password
1826 && quoted_passwd != url->passwd)
1827 xfree (quoted_passwd);
1832 /* Returns proxy host address, in accordance with SCHEME. */
/* Command-line options (opt.*_proxy) take precedence over the
   conventional environment variables.  Shorthand proxy specifications
   are rewritten into a static buffer, so the returned pointer may be
   overwritten by a later call -- this function is not reentrant and
   not thread-safe.  */
1834 getproxy (enum url_scheme scheme)
1837 char *rewritten_url;
/* Static storage so the rewritten URL survives the return.  */
1838 static char rewritten_storage[1024];
1843 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1847 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1851 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1853 case SCHEME_INVALID:
1856 if (!proxy || !*proxy)
1859 /* Handle shorthands. */
1860 rewritten_url = rewrite_shorthand_url (proxy);
/* strncpy does not NUL-terminate when the source fills the buffer;
   the explicit terminator on the next line makes the copy safe.  */
1863 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1864 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1865 proxy = rewritten_storage;
1871 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns non-zero when HOST is not suffix-matched by any entry of the
   NO_PROXY list, i.e. when the proxy should be used for it.  */
1873 no_proxy_match (const char *host, const char **no_proxy)
1878 return !sufmatch (no_proxy, host);
1881 /* Support for converting links for local viewing in downloaded HTML
1882 files. This should be moved to another file, because it has
1883 nothing to do with processing URLs. */
1885 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1886 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1888 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1889 const char *, int));
1890 static char *local_quote_string PARAMS ((const char *));
1892 /* Change the links in one HTML file. LINKS is a list of links in the
1893 document, along with their positions and the desired direction of
1896 convert_links (const char *file, struct urlpos *links)
1898 struct file_memory *fm;
1901 downloaded_file_t downloaded_file_return;
1903 struct urlpos *link;
/* Counters for the summary line printed at the end.  */
1904 int to_url_count = 0, to_file_count = 0;
1906 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1909 /* First we do a "dry run": go through the list L and see whether
1910 any URL needs to be converted in the first place. If not, just
1911 leave the file alone. */
1913 struct urlpos *dry = links;
1914 for (dry = links; dry; dry = dry->next)
1915 if (dry->convert != CO_NOCONVERT)
1919 logputs (LOG_VERBOSE, _("nothing to do.\n"));
/* Slurp the whole file into memory (possibly mmap'ed); conversion
   copies from this image into a freshly opened FILE below.  */
1924 fm = read_file (file);
1927 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1928 file, strerror (errno));
/* Optionally save a pristine *.orig copy before FILE is rewritten.  */
1932 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1933 if (opt.backup_converted && downloaded_file_return)
1934 write_backup_file (file, downloaded_file_return);
1936 /* Before opening the file for writing, unlink the file. This is
1937 important if the data in FM is mmaped. In such case, nulling the
1938 file, which is what fopen() below does, would make us read all
1939 zeroes from the mmaped region. */
1940 if (unlink (file) < 0 && errno != ENOENT)
1942 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1943 file, strerror (errno));
1944 read_file_free (fm);
1947 /* Now open the file for writing. */
1948 fp = fopen (file, "wb");
1951 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1952 file, strerror (errno));
1953 read_file_free (fm);
1957 /* Here we loop through all the URLs in file, replacing those of
1958 them that are downloaded with relative references. */
1960 for (link = links; link; link = link->next)
/* LINK->pos is a byte offset into the in-memory image.  */
1962 char *url_start = fm->content + link->pos;
/* A position past the end of the image means the LINKS list is
   inconsistent with the file contents.  */
1964 if (link->pos >= fm->length)
1966 DEBUGP (("Something strange is going on. Please investigate."));
1969 /* If the URL is not to be converted, skip it. */
1970 if (link->convert == CO_NOCONVERT)
1972 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
1976 /* Echo the file contents, up to the offending URL's opening
1977 quote, to the outfile. */
1978 fwrite (p, 1, url_start - p, fp);
1981 switch (link->convert)
1983 case CO_CONVERT_TO_RELATIVE:
1984 /* Convert absolute URL to relative. */
1986 char *newname = construct_relative (file, link->local_name);
1987 char *quoted_newname = local_quote_string (newname);
/* <meta http-equiv=refresh> attributes need the "N; URL=" prefix
   preserved, hence the special-cased writer.  */
1989 if (!link->link_refresh_p)
1990 p = replace_attr (p, link->size, fp, quoted_newname);
1992 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
1993 link->refresh_timeout);
1995 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1996 link->url->url, newname, link->pos, file));
1998 xfree (quoted_newname);
2002 case CO_CONVERT_TO_COMPLETE:
2003 /* Convert the link to absolute URL. */
2005 char *newlink = link->url->url;
2006 char *quoted_newlink = html_quote_string (newlink);
2008 if (!link->link_refresh_p)
2009 p = replace_attr (p, link->size, fp, quoted_newlink);
2011 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2012 link->refresh_timeout);
2014 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2015 newlink, link->pos, file));
2016 xfree (quoted_newlink);
2020 case CO_NULLIFY_BASE:
2021 /* Change the base href to "". */
2022 p = replace_attr (p, link->size, fp, "");
2030 /* Output the rest of the file. */
2031 if (p - fm->content < fm->length)
2032 fwrite (p, 1, fm->length - (p - fm->content), fp);
2034 read_file_free (fm);
/* Summary: number of links pointed at local files vs. remote URLs.  */
2036 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
2039 /* Construct and return a malloced copy of the relative link from two
2040 pieces of information: local name S1 of the referring file and
2041 local name S2 of the referred file.
2043 So, if S1 is "jagor.srce.hr/index.html" and S2 is
2044 "jagor.srce.hr/images/news.gif", the function will return
2047 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
2048 "fly.cc.fer.hr/images/fly.gif", the function will return
2049 "../images/fly.gif".
2051 Caveats: S1 should not begin with `/', unless S2 also begins with
2052 '/'. S1 should not contain things like ".." and such --
2053 construct_relative ("fly/ioccc/../index.html",
2054 "fly/images/fly.gif") will fail. (A workaround is to call
2055 something like path_simplify() on S1). */
/* See the contract in the comment above.  The result is heap-allocated
   via xmalloc/xstrdup.  */
2057 construct_relative (const char *s1, const char *s2)
/* I: scan index; CNT: position just past the last common '/';
   SEPDIRS1: directories of S1 below the common prefix.  */
2059 int i, cnt, sepdirs1;
/* An absolute S2 is already usable as-is.  */
2063 return xstrdup (s2);
2064 /* S1 should *not* be absolute, if S2 wasn't. */
2065 assert (*s1 != '/');
2067 /* Skip the directories common to both strings. */
2070 while (s1[i] && s2[i]
2075 if (s1[i] == '/' && s2[i] == '/')
/* Count the directory separators remaining in S1; each one becomes a
   leading "../" component of the result.  */
2080 for (sepdirs1 = 0; s1[i]; i++)
2083 /* Now, construct the file as of:
2084 - ../ repeated sepdirs1 time
2085 - all the non-mutual directories of S2. */
2086 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
2087 for (i = 0; i < sepdirs1; i++)
2088 memcpy (res + 3 * i, "../", 3);
/* strcpy also writes the terminating NUL accounted for above.  */
2089 strcpy (res + 3 * i, s2 + cnt);
/* Save FILE to FILE.orig (or overwrite a trailing ".html" with ".orig"
   when -E added the extension) before convert_links rewrites it.  Only
   the first call per FILE performs the rename; later calls are no-ops
   thanks to the static CONVERTED_FILES list.  */
2094 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2096 /* Rather than just writing over the original .html file with the
2097 converted version, save the former to *.orig. Note we only do
2098 this for files we've _successfully_ downloaded, so we don't
2099 clobber .orig files sitting around from previous invocations. */
2101 /* Construct the backup filename as the original name plus ".orig". */
2102 size_t filename_len = strlen(file);
2103 char* filename_plus_orig_suffix;
2104 boolean already_wrote_backup_file = FALSE;
2105 slist* converted_file_ptr;
/* Static: the remembered-files list must persist across calls for the
   lifetime of the process (see the long comment below).  */
2106 static slist* converted_files = NULL;
2108 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2110 /* Just write "orig" over "html". We need to do it this way
2111 because when we're checking to see if we've downloaded the
2112 file before (to see if we can skip downloading it), we don't
2113 know if it's a text/html file. Therefore we don't know yet
2114 at that stage that -E is going to cause us to tack on
2115 ".html", so we need to compare vs. the original URL plus
2116 ".orig", not the original URL plus ".html.orig". */
/* "html" and "orig" have equal length, so the copy plus the overwrite
   at len-4 fit exactly into filename_len + 1 bytes of stack space.
   NOTE(review): this branch assumes FILE ends in ".html" -- implied by
   the mode, but confirm upstream guarantees it.  */
2117 filename_plus_orig_suffix = alloca (filename_len + 1);
2118 strcpy(filename_plus_orig_suffix, file);
2119 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2121 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2123 /* Append ".orig" to the name. */
2124 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2125 strcpy(filename_plus_orig_suffix, file);
2126 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2129 /* We can get called twice on the same URL thanks to the
2130 convert_all_links() call in main(). If we write the .orig file
2131 each time in such a case, it'll end up containing the first-pass
2132 conversion, not the original file. So, see if we've already been
2133 called on this file. */
2134 converted_file_ptr = converted_files;
2135 while (converted_file_ptr != NULL)
2136 if (strcmp(converted_file_ptr->string, file) == 0)
2138 already_wrote_backup_file = TRUE;
2142 converted_file_ptr = converted_file_ptr->next;
2144 if (!already_wrote_backup_file)
2146 /* Rename <file> to <file>.orig before former gets written over. */
2147 if (rename(file, filename_plus_orig_suffix) != 0)
2148 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2149 file, filename_plus_orig_suffix, strerror (errno));
2151 /* Remember that we've already written a .orig backup for this file.
2152 Note that we never free this memory since we need it till the
2153 convert_all_links() call, which is one of the last things the
2154 program does before terminating. BTW, I'm not sure if it would be
2155 safe to just set 'converted_file_ptr->string' to 'file' below,
2156 rather than making a copy of the string... Another note is that I
2157 thought I could just add a field to the urlpos structure saying
2158 that we'd written a .orig file for this URL, but that didn't work,
2159 so I had to make this separate list.
2160 -- Dan Harkless <wget@harkless.org>
2162 This [adding a field to the urlpos structure] didn't work
2163 because convert_file() is called from convert_all_links at
2164 the end of the retrieval with a freshly built new urlpos
2166 -- Hrvoje Niksic <hniksic@arsdigita.com>
/* Prepend the new entry to the remembered-files list.  */
2168 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2169 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2170 converted_file_ptr->next = converted_files;
2171 converted_files = converted_file_ptr;
2175 static int find_fragment PARAMS ((const char *, int, const char **,
2178 /* Replace an attribute's original text with NEW_TEXT. */
/* P points at the attribute value in the input buffer (including the
   opening quote, if present); SIZE is the length of the original
   value.  The replacement is written to FP, reusing the original
   quoting character and re-appending any fragment ("#...") present in
   the old value.  NOTE(review): the return value (advanced input
   pointer, per the callers' `p = replace_attr (...)` usage) is
   computed on lines not shown here -- confirm.  */
2181 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2184 char quote_char = '\"'; /* use "..." for quoting, unless the
2185 original value is quoted, in which
2186 case reuse its quoting char. */
2187 const char *frag_beg, *frag_end;
2189 /* Structure of our string is:
2190 "...old-contents..."
2191 <--- size ---> (with quotes)
2194 <--- size --> (no quotes) */
2196 if (*p == '\"' || *p == '\'')
2201 size -= 2; /* disregard opening and closing quote */
/* Emit: quote, the new value, the preserved fragment, closing quote.  */
2203 putc (quote_char, fp);
2204 fputs (new_text, fp);
2206 /* Look for fragment identifier, if any. */
2207 if (find_fragment (p, size, &frag_beg, &frag_end))
2208 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2212 putc (quote_char, fp);
2217 /* The same as REPLACE_ATTR, but used when replacing
2218 <meta http-equiv=refresh content="new_text"> because we need to
2219 append "timeout_value; URL=" before the next_text. */
2222 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2223 const char *new_text, int timeout)
/* Stack scratch buffer, sized starting from the digit count of
   TIMEOUT.  NOTE(review): the size expression continues beyond this
   line; confirm it also covers "; URL=", NEW_TEXT and the NUL
   written by sprintf below.  */
2226 char *new_with_timeout = (char *)alloca (numdigit (timeout)
2230 sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2232 return replace_attr (p, size, fp, new_with_timeout);
2235 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2236 preceded by '&'. If the character is not found, return zero. If
2237 the character is found, return 1 and set BP and EP to point to the
2238 beginning and end of the region.
2240 This is used for finding the fragment indentifiers in URLs. */
/* BEG points into a buffer of SIZE bytes that is not necessarily
   NUL-terminated, hence the explicit END bound on the scan.  */
2243 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2245 const char *end = beg + size;
2247 for (; beg < end; beg++)
2269 /* Quote FILE for use as local reference to an HTML file.
2271 We quote ? as %3F to avoid passing part of the file name as the
2272 parameter when browsing the converted file through HTTP. However,
2273 it is safe to do this only when `--html-extension' is turned on.
2274 This is because converting "index.html?foo=bar" to
2275 "index.html%3Ffoo=bar" would break local browsing, as the latter
2276 isn't even recognized as an HTML file! However, converting
2277 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2278 safe for both local and HTTP-served browsing. */
/* Quote FILE for use as a local href (see the rationale above): with
   --html-extension each '?' is rewritten to "%3F" before the usual
   HTML quoting; without it the name is HTML-quoted unchanged.  */
2281 local_quote_string (const char *file)
2283 const char *file_sans_qmark;
2286 if (!opt.html_extension)
2287 return html_quote_string (file);
/* Count the question marks to size the rewritten copy exactly.  */
2289 qm = count_char (file, '?');
2293 const char *from = file;
2296 /* qm * 2 because we replace each question mark with "%3F",
2297 i.e. replace one char with three, hence two more. */
2298 int fsqlen = strlen (file) + qm * 2;
/* alloca: the copy only needs to live until the html_quote_string
   call at the bottom of the function.  */
2300 to = newname = (char *)alloca (fsqlen + 1);
2301 for (; *from; from++)
/* Verify the copy loop produced exactly the precomputed length.  */
2312 assert (to - newname == fsqlen);
2315 file_sans_qmark = newname;
2318 file_sans_qmark = file;
2320 return html_quote_string (file_sans_qmark);
2323 /* We're storing "modes" of type downloaded_file_t in the hash table.
2324 However, our hash tables only accept pointers for keys and values.
2325 So when we need a pointer, we use the address of a
2326 downloaded_file_t variable of static storage. */
2328 static downloaded_file_t *
2329 downloaded_mode_to_ptr (downloaded_file_t mode)
/* Static storage gives each mode value a stable address that outlives
   this call, so the returned pointer can safely be stored as a
   hash-table value (see downloaded_file).  */
2331 static downloaded_file_t
2332 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2333 v2 = FILE_DOWNLOADED_NORMALLY,
2334 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2335 v4 = CHECK_FOR_FILE;
2339 case FILE_NOT_ALREADY_DOWNLOADED:
2341 case FILE_DOWNLOADED_NORMALLY:
2343 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2345 case CHECK_FOR_FILE:
2351 /* This should really be merged with dl_file_url_map and
2352 downloaded_html_files in recur.c. This was originally a list, but
2353 I changed it to a hash table beause it was actually taking a lot of
2354 time to find things in it. */
2356 static struct hash_table *downloaded_files_hash;
2358 /* Remembers which files have been downloaded. In the standard case, should be
2359 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2360 download successfully (i.e. not for ones we have failures on or that we skip
2363 When we've downloaded a file and tacked on a ".html" extension due to -E,
2364 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2365 FILE_DOWNLOADED_NORMALLY.
2367 If you just want to check if a file has been previously added without adding
2368 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2369 with local filenames, not remote URLs. */
2371 downloaded_file (downloaded_file_t mode, const char *file)
2373 downloaded_file_t *ptr;
2375 if (mode == CHECK_FOR_FILE)
2377 if (!downloaded_files_hash)
2378 return FILE_NOT_ALREADY_DOWNLOADED;
2379 ptr = hash_table_get (downloaded_files_hash, file);
2381 return FILE_NOT_ALREADY_DOWNLOADED;
2385 if (!downloaded_files_hash)
2386 downloaded_files_hash = make_string_hash_table (0);
2388 ptr = hash_table_get (downloaded_files_hash, file);
2392 ptr = downloaded_mode_to_ptr (mode);
2393 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2395 return FILE_NOT_ALREADY_DOWNLOADED;
2399 df_free_mapper (void *key, void *value, void *ignored)
/* Release the downloaded-files registry.  Safe to call when the table
   was never created.  Resetting the pointer to NULL lets
   downloaded_file() lazily recreate the table later.  */
2406 downloaded_files_free (void)
2408 if (downloaded_files_hash)
2410 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2411 hash_table_destroy (downloaded_files_hash);
2412 downloaded_files_hash = NULL;
2417 /* Debugging and testing support for path_simplify. */
2419 /* Debug: run path_simplify on PATH and return the result in a new
2420 string. Useful for calling from the debugger. */
2424 char *copy = xstrdup (path);
2425 path_simplify (copy);
/* Run one path_simplify test case.  A copy of TEST is simplified in
   place; the result must equal EXPECTED_RESULT, and path_simplify's
   modified-flag must equal EXPECTED_CHANGE.  Failures are reported on
   stdout.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
              test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      /* Bug fix: the two diagnostics were swapped.  EXPECTED_CHANGE ==
         1 means we expected path_simplify to modify its argument, so
         reaching here with 1 means the modification did NOT happen --
         report "Expected modification", and vice versa.  */
      if (expected_change == 1)
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
    }
  xfree (test_copy);
}
/* Exercise path_simplify against a table of (input, expected output,
   expected modified-flag) triples, once as-is and once with a leading
   '/' prepended to prove the root slash is preserved.  */
2453 test_path_simplify (void)
2456 char *test, *result;
2462 { "foo", "foo", 0 },
2463 { "foo/bar", "foo/bar", 0 },
2464 { "foo///bar", "foo/bar", 1 },
2465 { "foo/.", "foo/", 1 },
2466 { "foo/./", "foo/", 1 },
2467 { "foo./", "foo./", 0 },
2468 { "foo/../bar", "bar", 1 },
2469 { "foo/../bar/", "bar/", 1 },
2470 { "foo/bar/..", "foo/", 1 },
2471 { "foo/bar/../x", "foo/x", 1 },
2472 { "foo/bar/../x/", "foo/x/", 1 },
2473 { "foo/..", "", 1 },
2474 { "foo/../..", "", 1 },
2475 { "a/b/../../c", "c", 1 },
2476 { "./a/../b", "b", 1 }
/* First pass: run every case verbatim.  */
2480 for (i = 0; i < ARRAY_SIZE (tests); i++)
2482 char *test = tests[i].test;
2483 char *expected_result = tests[i].result;
2484 int expected_change = tests[i].should_modify;
2485 run_test (test, expected_result, expected_change);
2488 /* Now run all the tests with a leading slash before the test case,
2489 to prove that the slash is being preserved. */
2490 for (i = 0; i < ARRAY_SIZE (tests); i++)
2492 char *test, *expected_result;
2493 int expected_change = tests[i].should_modify;
/* 1 byte for the leading '/', plus the string, plus the NUL.  */
2495 test = xmalloc (1 + strlen (tests[i].test) + 1);
2496 sprintf (test, "/%s", tests[i].test);
2498 expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2499 sprintf (expected_result, "/%s", tests[i].result);
2501 run_test (test, expected_result, expected_change);
2504 xfree (expected_result);