2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* Does the string X consist of the single character `.'?  X is
   evaluated more than once, so it must be free of side effects.
   Every use of the argument is parenthesized so that operands such
   as `cond ? a : b' expand with the intended precedence.  */
#define DOTP(x) ((*(x) == '.') && (!*((x) + 1)))

/* Does the string X consist of exactly ".."?  Same caveats as DOTP. */
#define DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') && (!*((x) + 2)))
58 /* Supported schemes: */
/* Table of recognized URL schemes.  The order of entries must match
   the url_scheme enum: url_scheme() returns the array index cast to
   that enum, and scheme_default_port()/scheme_disable() index by it.
   Fields (per visible initializers): leading string, default port,
   enabled flag.  NOTE(review): this excerpt elides intermediate
   lines; a terminating sentinel entry is presumably present --
   url_scheme() loops until leading_string is NULL.  */
59 static struct scheme_data supported_schemes[] =
61 { "http://", DEFAULT_HTTP_PORT, 1 },
63 { "https://", DEFAULT_HTTPS_PORT, 1 },
65 { "ftp://", DEFAULT_FTP_PORT, 1 },
71 /* Forward declarations: */
73 static char *construct_relative PARAMS ((const char *, const char *));
74 static int path_simplify PARAMS ((char *));
78 /* Support for encoding and decoding of URL strings. We determine
79 whether a character is unsafe through static table lookup. This
80 code assumes ASCII character set and 8-bit chars. */
/* R and U are one-letter aliases used only to keep the 256-entry
   table below readable.  They name the urlchr_* bit flags (declared
   outside this excerpt).  */
87 #define R urlchr_reserved
88 #define U urlchr_unsafe
/* Test character C against bit MASK.  The cast to unsigned char
   keeps the table index non-negative even where plain char is
   signed.  */
91 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
93 /* rfc1738 reserved chars, preserved from encoding. */
95 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
97 /* rfc1738 unsafe chars, plus some more. */
99 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
/* Flag table indexed by ASCII code: 0 = safe, R = reserved,
   U = unsafe.  RU is presumably R|U (its #define is elided from this
   excerpt -- verify).  All codes >= 0x80 are marked unsafe.  */
101 const static unsigned char urlchr_table[256] =
103 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
104 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
105 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
106 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
107 U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
108 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
109 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
110 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
111 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
112 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
113 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
114 0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
115 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
116 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
117 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
118 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
120 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
121 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
122 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
123 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
125 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
126 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
127 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
128 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
131 /* Decodes the forms %xy in a URL to the character the hexadecimal
132 code of which is xy. xy are hexadecimal digits from
133 [0123456789ABCDEF] (case-insensitive). If x or y are not
134 hex-digits or `%' precedes `\0', the sequence is inserted
/* In-place decode: T (write position) trails H (read position), so
   the string can only shrink.  */
138 decode_string (char *s)
140 char *t = s; /* t - tortoise */
141 char *h = s; /* h - hare */
152 /* Do nothing if '%' is not followed by two hex digits. */
153 if (!*(h + 1) || !*(h + 2)
154 || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
/* Valid %xy: fold the two hex digits into one byte. */
156 *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
163 /* Like encode_string, but return S if there are no unsafe chars. */
/* Two passes: first count how many characters need %XX escaping (each
   adds two bytes), then allocate once and copy/escape.  Returns S
   itself (not a copy) when ADDITION stays zero.  */
166 encode_string_maybe (const char *s)
173 for (p1 = s; *p1; p1++)
174 if (UNSAFE_CHAR (*p1))
175 addition += 2; /* Two more characters (hex digits) */
180 newlen = (p1 - s) + addition;
181 newstr = (char *)xmalloc (newlen + 1);
187 if (UNSAFE_CHAR (*p1))
189 unsigned char c = *p1++;
191 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
192 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* Sanity check: the second pass must produce exactly the length the
   first pass predicted.  */
198 assert (p2 - newstr == newlen);
203 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
204 given string, returning a malloc-ed %XX encoded string. */
/* Unlike encode_string_maybe, the caller always owns (and must free)
   the returned string.  */
207 encode_string (const char *s)
209 char *encoded = encode_string_maybe (s);
216 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
217 the old value of PTR is freed and PTR is made to point to the newly
218 allocated storage. */
220 #define ENCODE(ptr) do { \
221 char *e_new = encode_string_maybe (ptr); \
/* What to do with the character at P when normalizing a URL:
   decode a %xx triplet, %-encode the character, or copy it as-is. */
229 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
231 /* Decide whether to encode, decode, or pass through the char at P.
232 This used to be a macro, but it got a little too convoluted. */
233 static inline enum copy_method
234 decide_copy_method (const char *p)
238 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
240 /* %xx sequence: decode it, unless it would decode to an
241 unsafe or a reserved char; in that case, leave it as
/* PREEMPT is the byte the %xx triplet would decode to; decoding it
   would lose information if it is itself unsafe or reserved.  */
243 char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
244 XCHAR_TO_XDIGIT (*(p + 2));
246 if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
247 return CM_PASSTHROUGH;
252 /* Garbled %.. sequence: encode `%'. */
/* Plain character: encode only if unsafe and not reserved. */
255 else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
258 return CM_PASSTHROUGH;
261 /* Translate a %-quoting (but possibly non-conformant) input string S
262 into a %-quoting (and conformant) output string. If no characters
263 are encoded or decoded, return the same string S; otherwise, return
264 a freshly allocated string with the new contents.
266 After a URL has been run through this function, the protocols that
267 use `%' as the quote character can use the resulting string as-is,
268 while those that don't call decode_string() to get to the intended
269 data. This function is also stable: after an input string is
270 transformed the first time, all further transformations of the
271 result yield the same result string.
273 Let's discuss why this function is needed.
275 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
276 space character would mess up the HTTP request, it needs to be
279 GET /abc%20def HTTP/1.0
281 So it appears that the unsafe chars need to be quoted, as with
282 encode_string. But what if we're requested to download
283 `abc%20def'? Remember that %-encoding is valid URL syntax, so what
284 the user meant was a literal space, and he was kind enough to quote
285 it. In that case, Wget should obviously leave the `%20' as is, and
286 send the same request as above. So in this case we may not call
289 But what if the requested URI is `abc%20 def'? If we call
290 encode_string, we end up with `/abc%2520%20def', which is almost
291 certainly not intended. If we don't call encode_string, we are
292 left with the embedded space and cannot send the request. What the
293 user meant was for Wget to request `/abc%20%20def', and this is
294 where reencode_string kicks in.
296 Wget used to solve this by first decoding %-quotes, and then
297 encoding all the "unsafe" characters found in the resulting string.
298 This was wrong because it didn't preserve certain URL special
299 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
300 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
301 whether we considered `+' reserved (it is). One of these results
302 is inevitable because by the second step we would lose information
303 on whether the `+' was originally encoded or not. Both results
304 were wrong because in CGI parameters + means space, while %2B means
305 literal plus. reencode_string correctly translates the above to
306 "a%2B+b", i.e. returns the original string.
308 This function uses an algorithm proposed by Anon Sricharoenchai:
310 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
313 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
316 ...except that this code conflates the two steps, and decides
317 whether to encode, decode, or pass through each character in turn.
318 The function still uses two passes, but their logic is the same --
319 the first pass exists merely for the sake of allocation. Another
320 small difference is that we include `+' to URL_RESERVED.
324 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
326 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
330 "foo bar" -> "foo%20bar"
331 "foo%20bar" -> "foo%20bar"
332 "foo %20bar" -> "foo%20%20bar"
333 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
334 "foo%25%20bar" -> "foo%25%20bar"
335 "foo%2%20bar" -> "foo%252%20bar"
336 "foo+bar" -> "foo+bar" (plus is reserved!)
337 "foo%2b+bar" -> "foo%2b+bar" */
/* Normalize the %-quoting of S per the long comment above.  Two
   passes driven by decide_copy_method(): pass one only counts, so the
   result can be allocated in a single xmalloc; pass two transforms.
   Returns S itself (no allocation) when nothing changes.  */
340 reencode_string (const char *s)
346 int encode_count = 0;
347 int decode_count = 0;
349 /* First, pass through the string to see if there's anything to do,
350 and to calculate the new length. */
351 for (p1 = s; *p1; p1++)
353 switch (decide_copy_method (p1))
366 if (!encode_count && !decode_count)
367 /* The string is good as it is. */
368 return (char *)s; /* C const model sucks. */
371 /* Each encoding adds two characters (hex digits), while each
372 decoding removes two characters. */
373 newlen = oldlen + 2 * (encode_count - decode_count);
375 newstr = xmalloc (newlen + 1);
381 switch (decide_copy_method (p1))
/* CM_ENCODE: emit '%' plus two hex digits for the byte at P1. */
385 unsigned char c = *p1++;
387 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
388 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* CM_DECODE: collapse the %xx triplet at P1 into a single byte. */
392 *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
393 + (XCHAR_TO_XDIGIT (*(p1 + 2))));
394 p1 += 3; /* skip %xx */
/* Both passes must agree on the final length. */
401 assert (p2 - newstr == newlen);
405 /* Run PTR_VAR through reencode_string. If a new string is consed,
406 free PTR_VAR and make it point to the new storage. Obviously,
407 PTR_VAR needs to be an lvalue. */
409 #define REENCODE(ptr_var) do { \
410 char *rf_new = reencode_string (ptr_var); \
411 if (rf_new != ptr_var) \
418 /* Returns the scheme type if the scheme is supported, or
419 SCHEME_INVALID if not. */
/* Case-insensitive prefix match against supported_schemes[]; the
   matching index doubles as the enum value.  A matching but disabled
   scheme (see scheme_disable) is reported as SCHEME_INVALID too.  */
421 url_scheme (const char *url)
425 for (i = 0; supported_schemes[i].leading_string; i++)
426 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
427 strlen (supported_schemes[i].leading_string)))
429 if (supported_schemes[i].enabled)
430 return (enum url_scheme) i;
432 return SCHEME_INVALID;
435 return SCHEME_INVALID;
438 /* Return the number of characters needed to skip the scheme part of
439 the URL, e.g. `http://'. If no scheme is found, returns 0. */
441 url_skip_scheme (const char *url)
445 /* Skip the scheme name. We allow `-' and `+' because of `whois++',
447 while (ISALNUM (*p) || *p == '-' || *p == '+')
454 /* Skip "//" if found. */
455 if (*p == '/' && *(p + 1) == '/')
461 /* Returns 1 if the URL begins with a scheme (supported or
462 unsupported), 0 otherwise. */
464 url_has_scheme (const char *url)
467 while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Map a scheme enum value back to its table entry's default port. */
473 scheme_default_port (enum url_scheme scheme)
475 return supported_schemes[scheme].default_port;
/* Permanently mark SCHEME as unsupported for this run; url_scheme()
   will then reject URLs using it.  */
479 scheme_disable (enum url_scheme scheme)
481 supported_schemes[scheme].enabled = 0;
484 /* Skip the username and password, if present here. The function
485 should be called *not* with the complete URL, but with the part
486 right after the scheme.
488 If no username and password are found, return 0. */
490 url_skip_uname (const char *url)
494 /* Look for '@' that comes before '/' or '?'. */
495 p = (const char *)strpbrk (url, "/?@");
/* Split the LEN-byte region STR ("user" or "user:password") into
   freshly allocated, %-decoded *USER and *PASSWD strings.  Visible
   failure modes reject an empty user name.  Return value is its
   status code (success/failure); the exact convention is established
   by the caller's use in url_parse.  */
503 parse_uname (const char *str, int len, char **user, char **passwd)
508 /* Empty user name not allowed. */
511 colon = memchr (str, ':', len);
513 /* Empty user name again. */
/* Everything after the colon is the password. */
518 int pwlen = len - (colon + 1 - str);
519 *passwd = xmalloc (pwlen + 1);
520 memcpy (*passwd, colon + 1, pwlen);
521 (*passwd)[pwlen] = '\0';
/* NOTE(review): LEN here appears to be the user-name portion only
   (presumably trimmed before this point when a colon was found) --
   confirm against the elided lines.  */
527 *user = xmalloc (len + 1);
528 memcpy (*user, str, len);
/* %-escapes are legal in userinfo; decode both parts in place. */
532 decode_string (*user);
534 decode_string (*passwd);
539 /* Used by main.c: detect URLs written using the "shorthand" URL forms
540 popularized by Netscape and NcFTP. HTTP shorthands look like this:
542 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
543 www.foo.com[:port] -> http://www.foo.com[:port]
545 FTP shorthands look like this:
547 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
548 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
550 If the URL needs not or cannot be rewritten, return NULL. */
552 rewrite_shorthand_url (const char *url)
/* Already has an explicit scheme -- nothing to rewrite. */
556 if (url_has_scheme (url))
559 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
561 for (p = url; *p && *p != ':' && *p != '/'; p++)
571 /* If the characters after the colon and before the next slash
572 or end of string are all digits, it's HTTP. */
574 for (pp = p + 1; ISDIGIT (*pp); pp++)
576 if (digits > 0 && (*pp == '/' || *pp == '\0'))
579 /* Prepend "ftp://" to the entire URL... */
580 res = xmalloc (6 + strlen (url) + 1);
581 sprintf (res, "ftp://%s", url);
582 /* ...and replace ':' with '/'. */
/* 6 == strlen ("ftp://"); P - URL is the colon's offset. */
583 res[6 + (p - url)] = '/';
590 /* Just prepend "http://" to what we have. */
591 res = xmalloc (7 + strlen (url) + 1);
592 sprintf (res, "http://%s", url);
597 static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, but never returns NULL: when no character of ACCEPT
   occurs in S, point at S's terminating '\0' instead.  */
600 strpbrk_or_eos (const char *s, const char *accept)
602 char *p = strpbrk (s, accept);
604 p = (char *)s + strlen (s);
608 /* Turn STR into lowercase; return non-zero if a character was
/* Destructive, in-place; the return value lets url_parse know the
   host string changed and u->url must be rebuilt.  */
612 lowercase_str (char *str)
619 *str = TOLOWER (*str);
/* Human-readable messages for url_parse failures, indexed by the
   PE_* codes #defined inline below (url_error does the lookup).  */
624 static char *parse_errors[] = {
625 #define PE_NO_ERROR 0
627 #define PE_UNSUPPORTED_SCHEME 1
628 "Unsupported scheme",
629 #define PE_EMPTY_HOST 2
631 #define PE_BAD_PORT_NUMBER 3
633 #define PE_INVALID_USER_NAME 4
/* Store error code V through pointer P, if P is non-NULL. */
637 #define SETERR(p, v) do { \
644 Return a new struct url if successful, NULL on error. In case of
645 error, and if ERROR is not NULL, also set *ERROR to the appropriate
/* Parse URL into a freshly allocated struct url.  The URL is first
   normalized with reencode_string, then scanned left to right with
   strpbrk_or_eos marking each component boundary.  */
648 url_parse (const char *url, int *error)
652 int path_modified, host_modified;
654 enum url_scheme scheme;
/* _b/_e pairs delimit each component as [begin, end) within the
   reencoded string.  */
656 const char *uname_b, *uname_e;
657 const char *host_b, *host_e;
658 const char *path_b, *path_e;
659 const char *params_b, *params_e;
660 const char *query_b, *query_e;
661 const char *fragment_b, *fragment_e;
664 char *user = NULL, *passwd = NULL;
668 scheme = url_scheme (url);
669 if (scheme == SCHEME_INVALID)
671 SETERR (error, PE_UNSUPPORTED_SCHEME);
/* May return URL itself when nothing needed reencoding; that case is
   distinguished at the end when deciding what to free.  */
675 url_encoded = reencode_string (url);
678 p += strlen (supported_schemes[scheme].leading_string);
680 p += url_skip_uname (p);
683 /* scheme://user:pass@host[:port]... */
686 /* We attempt to break down the URL into the components path,
687 params, query, and fragment. They are ordered like this:
689 scheme://host[:port][/path][;params][?query][#fragment] */
691 params_b = params_e = NULL;
692 query_b = query_e = NULL;
693 fragment_b = fragment_e = NULL;
/* Host ends at the first ':', '/', ';', '?' or '#'. */
696 p = strpbrk_or_eos (p, ":/;?#");
699 if (host_b == host_e)
701 SETERR (error, PE_EMPTY_HOST);
705 port = scheme_default_port (scheme);
708 const char *port_b, *port_e, *pp;
710 /* scheme://host:port/tralala */
714 p = strpbrk_or_eos (p, "/;?#");
717 if (port_b == port_e)
719 /* http://host:/whatever */
721 SETERR (error, PE_BAD_PORT_NUMBER);
/* Hand-rolled decimal conversion; any non-digit is an error. */
725 for (port = 0, pp = port_b; pp < port_e; pp++)
729 /* http://host:12randomgarbage/blah */
731 SETERR (error, PE_BAD_PORT_NUMBER);
734 port = 10 * port + (*pp - '0');
742 p = strpbrk_or_eos (p, ";?#");
747 /* Path is not allowed not to exist. */
755 p = strpbrk_or_eos (p, "?#");
762 p = strpbrk_or_eos (p, "#");
774 if (uname_b != uname_e)
776 /* http://user:pass@host */
778 /* uname_b uname_e */
/* The "- 1" excludes the trailing '@' from the userinfo region. */
779 if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
781 SETERR (error, PE_INVALID_USER_NAME);
786 u = (struct url *)xmalloc (sizeof (struct url));
787 memset (u, 0, sizeof (*u));
790 u->host = strdupdelim (host_b, host_e);
795 u->path = strdupdelim (path_b, path_e);
796 path_modified = path_simplify (u->path);
797 parse_path (u->path, &u->dir, &u->file);
799 host_modified = lowercase_str (u->host);
802 u->params = strdupdelim (params_b, params_e);
804 u->query = strdupdelim (query_b, query_e);
806 u->fragment = strdupdelim (fragment_b, fragment_e);
808 if (path_modified || u->fragment || host_modified || path_b == path_e)
810 /* If we suspect that a transformation has rendered what
811 url_string might return different from URL_ENCODED, rebuild
812 u->url using url_string. */
813 u->url = url_string (u, 0);
/* url_encoded was a fresh allocation here; drop it. */
815 if (url_encoded != url)
816 xfree ((char *) url_encoded);
/* No transformation took place: reuse the reencoded string (or a
   copy of the caller's URL when reencode_string returned it as-is)
   as u->url, avoiding a second allocation.  */
820 if (url_encoded == url)
821 u->url = xstrdup (url);
823 u->url = url_encoded;
/* Map a PE_* code from url_parse to its message in parse_errors[]. */
831 url_error (int error_code)
833 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
834 return parse_errors[error_code];
/* Split QUOTED_PATH into *DIR (text before the last '/') and *FILE
   (text after it), both freshly allocated and %-decoded.  With no
   slash at all, the whole path is the file.  */
838 parse_path (const char *quoted_path, char **dir, char **file)
840 char *path, *last_slash;
/* Work on a stack copy so the caller's string is untouched. */
842 STRDUP_ALLOCA (path, quoted_path);
843 decode_string (path);
845 last_slash = strrchr (path, '/');
849 *file = xstrdup (path);
853 *dir = strdupdelim (path, last_slash);
854 *file = xstrdup (last_slash + 1);
858 /* Note: URL's "full path" is the path with the query string and
859 params appended. The "fragment" (#foo) is intentionally ignored,
860 but that might be changed. For example, if the original URL was
861 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
862 the full path will be "/foo/bar/baz;bullshit?querystring". */
864 /* Return the length of the full path, without the terminating
/* Each present component costs its length plus one separator
   character ('/', ';' or '?').  */
868 full_path_length (const struct url *url)
872 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
883 /* Write out the full path. */
/* WHERE must have room for full_path_length() bytes; the caller
   appends the terminating '\0' (see url_full_path).  */
886 full_path_write (const struct url *url, char *where)
888 #define FROB(el, chr) do { \
889 char *f_el = url->el; \
891 int l = strlen (f_el); \
893 memcpy (where, f_el, l); \
905 /* Public function for getting the "full path". E.g. if u->path is
906 "foo/bar" and u->query is "param=value", full_path will be
907 "/foo/bar?param=value". */
/* Returns a freshly allocated string; caller frees. */
910 url_full_path (const struct url *url)
912 int length = full_path_length (url);
913 char *full_path = (char *)xmalloc(length + 1);
915 full_path_write (url, full_path);
916 full_path[length] = '\0';
921 /* Sync u->path and u->url with u->dir and u->file. */
924 sync_path (struct url *url)
/* Empty dir: the path is just the file name. */
932 newpath = xstrdup (url->file);
937 int dirlen = strlen (url->dir);
938 int filelen = strlen (url->file);
/* Join as "<dir>/<file>": dir + '/' + file + '\0'. */
940 newpath = xmalloc (dirlen + 1 + filelen + 1);
941 memcpy (newpath, url->dir, dirlen);
942 newpath[dirlen] = '/';
943 memcpy (newpath + dirlen + 1, url->file, filelen);
944 newpath[dirlen + 1 + filelen] = '\0';
950 /* Synchronize u->url. */
952 url->url = url_string (url, 0);
955 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
956 This way we can sync u->path and u->url when they get changed. */
/* Replace u->dir with a copy of NEWDIR (path/url resync presumably
   follows in the elided lines via sync_path).  */
959 url_set_dir (struct url *url, const char *newdir)
962 url->dir = xstrdup (newdir);
/* Replace u->file with a copy of NEWFILE; same resync note. */
967 url_set_file (struct url *url, const char *newfile)
970 url->file = xstrdup (newfile);
/* Release every string owned by URL (optional members via
   FREE_MAYBE) and the struct itself.  */
975 url_free (struct url *url)
981 FREE_MAYBE (url->params);
982 FREE_MAYBE (url->query);
983 FREE_MAYBE (url->fragment);
984 FREE_MAYBE (url->user);
985 FREE_MAYBE (url->passwd);
/* Read FILE (one URL per line, surrounding whitespace ignored) and
   return the parsed URLs as a linked list of struct urlpos.  Lines
   that fail url_parse are reported and skipped.  */
994 get_urls_file (const char *file)
996 struct file_memory *fm;
997 struct urlpos *head, *tail;
998 const char *text, *text_end;
1000 /* Load the file. */
1001 fm = read_file (file);
1004 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1007 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1011 text_end = fm->content + fm->length;
/* Walk the in-memory buffer line by line; the content is not
   NUL-terminated, hence memchr over strchr.  */
1012 while (text < text_end)
1014 const char *line_beg = text;
1015 const char *line_end = memchr (text, '\n', text_end - text);
/* Last line without a trailing newline. */
1017 line_end = text_end;
1022 /* Strip whitespace from the beginning and end of line. */
1023 while (line_beg < line_end && ISSPACE (*line_beg))
1025 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
/* Skip lines that are empty after trimming. */
1028 if (line_end > line_beg)
1030 /* URL is in the [line_beg, line_end) region. */
1034 struct urlpos *entry;
1037 /* We must copy the URL to a zero-terminated string, and we
1038 can't use alloca because we're in a loop. *sigh*. */
1039 url_text = strdupdelim (line_beg, line_end);
1043 /* Merge opt.base_href with URL. */
1044 char *merged = uri_merge (opt.base_href, url_text);
1049 url = url_parse (url_text, &up_error_code);
1052 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1053 file, url_text, url_error (up_error_code));
/* Append a zero-initialized node to the result list. */
1059 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1060 memset (entry, 0, sizeof (*entry));
1071 read_file_free (fm);
1075 /* Free the linked list of urlpos. */
/* Walk the list, saving each node's successor before freeing it. */
1077 free_urlpos (struct urlpos *l)
1081 struct urlpos *next = l->next;
1084 FREE_MAYBE (l->local_name);
1090 /* Rotate FNAME opt.backups times */
/* Shift fname.1 -> fname.2 -> ... so that fname can become fname.1;
   a missing or non-regular fname is handled via the stat() check.  */
1092 rotate_backups(const char *fname)
/* Room for "<fname>.<digits>\0". */
1094 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1095 char *from = (char *)alloca (maxlen);
1096 char *to = (char *)alloca (maxlen);
1100 if (stat (fname, &sb) == 0)
1101 if (S_ISREG (sb.st_mode) == 0)
/* Rename highest-numbered backups first so nothing is clobbered. */
1104 for (i = opt.backups; i > 1; i--)
1106 sprintf (from, "%s.%d", fname, i - 1);
1107 sprintf (to, "%s.%d", fname, i);
1108 /* #### This will fail on machines without the rename() system
1113 sprintf (to, "%s.%d", fname, 1);
1117 /* Create all the necessary directories for PATH (a file). Calls
1118 mkdirhier() internally. */
1120 mkalldirs (const char *path)
/* Scan backwards for the last '/' to isolate the directory part. */
1127 p = path + strlen (path);
1128 for (; *p != '/' && p != path; p--);
1129 /* Don't create if it's just a file. */
1130 if ((p == path) && (*p != '/'))
1132 t = strdupdelim (path, p);
1133 /* Check whether the directory exists. */
1134 if ((stat (t, &st) == 0))
1136 if (S_ISDIR (st.st_mode))
1143 /* If the dir exists as a file name, remove it first. This
1144 is *only* for Wget to work with buggy old CERN http
1145 servers. Here is the scenario: When Wget tries to
1146 retrieve a directory without a slash, e.g.
1147 http://foo/bar (bar being a directory), CERN server will
1148 not redirect it to http://foo/bar/ -- it will generate a
1149 directory listing containing links to bar/file1,
1150 bar/file2, etc. Wget will lose because it saves this
1151 HTML listing to a file `bar', so it cannot create the
1152 directory. To work around this, if the file of the same
1153 name exists, we just remove it and create the directory
1155 DEBUGP (("Removing %s because of directory danger!\n", t));
1159 res = make_directory (t);
1161 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Count the '/' characters in S (helper for --cut-dirs below). */
1167 count_slashes (const char *s)
1176 /* Return the path name of the URL-equivalent file name, with a
1177 remote-like structure of directories. */
/* Builds "<prefix>[/host[:port]]/<dir>/<file>[?query]" honoring
   opt.cut_dirs, opt.add_hostdir and opt.dir_prefix.  Returns a
   malloc'd string; caller frees.  */
1179 mkstruct (const struct url *u)
1181 char *dir, *dir_preencoding;
1182 char *file, *res, *dirpref;
/* Treat an empty query the same as no query. */
1183 char *query = u->query && *u->query ? u->query : NULL;
/* --cut-dirs: skip the leading '/' (if any), then drop the first
   CUT path components of u->dir.  */
1188 char *ptr = u->dir + (*u->dir == '/');
1189 int slash_count = 1 + count_slashes (ptr);
1190 int cut = MINVAL (opt.cut_dirs, slash_count);
1191 for (; cut && *ptr; ptr++)
1194 STRDUP_ALLOCA (dir, ptr);
1197 dir = u->dir + (*u->dir == '/');
1199 /* Check for the true name (or at least a consistent name for saving
1200 to directory) of HOST, reusing the hlist if possible. */
1201 if (opt.add_hostdir)
1203 /* Add dir_prefix and hostname (if required) to the beginning of
/* Space for "prefix/host" plus an optional ":port" suffix. */
1205 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1207 + 1 + numdigit (u->port)
1209 if (!DOTP (opt.dir_prefix))
1210 sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1212 strcpy (dirpref, u->host);
/* Append ":port" only for non-default ports. */
1214 if (u->port != scheme_default_port (u->scheme))
1216 int len = strlen (dirpref);
1218 number_to_string (dirpref + len + 1, u->port);
1221 else /* not add_hostdir */
1223 if (!DOTP (opt.dir_prefix))
1224 dirpref = opt.dir_prefix;
1229 /* If there is a prefix, prepend it. */
1232 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1233 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
/* Re-escape the assembled directory; reencode_string may return the
   input unchanged, which is checked before freeing below.  */
1237 dir_preencoding = dir;
1238 dir = reencode_string (dir_preencoding);
/* Strip a trailing '/' so the join below adds exactly one. */
1241 if (l && dir[l - 1] == '/')
1245 file = "index.html";
1249 /* Finally, construct the full name. */
1250 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1251 + (query ? (1 + strlen (query)) : 0)
1253 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1257 strcat (res, query);
1259 if (dir != dir_preencoding)
1264 /* Compose a file name out of BASE, an unescaped file name, and QUERY,
1265 an escaped query string. The trick is to make sure that unsafe
1266 characters in BASE are escaped, and that slashes in QUERY are also
/* Builds the name in a fixed-size local RESULT buffer, bounding every
   write with `to - result < sizeof (result)'; over-long input is
   truncated rather than overflowing.  Returns a malloc'd copy.  */
1270 compose_file_name (char *base, char *query)
1276 /* Copy BASE to RESULT and encode all unsafe characters. */
1278 while (*from && to - result < sizeof (result))
1280 if (UNSAFE_CHAR (*from))
/* Emit the byte as %XY. */
1282 unsigned char c = *from++;
1284 *to++ = XDIGIT_TO_XCHAR (c >> 4);
1285 *to++ = XDIGIT_TO_XCHAR (c & 0xf);
1291 if (query && to - result < sizeof (result))
1295 /* Copy QUERY to RESULT and encode all '/' characters. */
1297 while (*from && to - result < sizeof (result))
1311 if (to - result < sizeof (result))
1314 /* Truncate input which is too long, presumably due to a huge
/* Buffer filled up: force termination at the last byte. */
1316 result[sizeof (result) - 1] = '\0';
1318 return xstrdup (result);
1321 /* Create a unique filename, corresponding to a given URL. Calls
1322 mkstruct if necessary. Does *not* actually create any directories. */
1324 url_filename (const struct url *u)
1327 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
/* With directory structure enabled, mkstruct already embeds the
   prefix; otherwise the name is built from file + query below.  */
1331 file = mkstruct (u);
/* Empty file component (URL ends in '/') defaults to index.html. */
1336 char *base = *u->file ? u->file : "index.html";
1337 char *query = u->query && *u->query ? u->query : NULL;
1338 file = compose_file_name (base, query);
1343 /* Check whether the prefix directory is something other than "."
1344 before prepending it. */
1345 if (!DOTP (opt.dir_prefix))
1347 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1348 + 1 + strlen (file) + 1);
1349 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1354 /* DOS-ish file systems don't like `%' signs in them; we change it
1359 for (p = file; *p; p++)
1363 #endif /* WINDOWS */
1365 /* Check the cases in which the unique extensions are not used:
1366 1) Clobbering is turned off (-nc).
1367 2) Retrieval with regetting.
1368 3) Timestamping is used.
1369 4) Hierarchy is built.
1371 The exception is the case when file does exist and is a
1372 directory (actually support for bad httpd-s). */
1373 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1374 && !(file_exists_p (file) && !file_non_directory_p (file)))
1377 /* Find a unique name. */
/* Derive a fresh "file.N"-style name that does not yet exist. */
1378 name = unique_name (file);
1383 /* Return the length of URL's path. Path is considered to be
1384 terminated by one of '?', ';', '#', or by the end of the
1387 path_length (const char *url)
/* strpbrk_or_eos never returns NULL, so Q - URL is always valid. */
1389 const char *q = strpbrk_or_eos (url, "?;#");
1393 /* Find the last occurrence of character C in the range [b, e), or
1394 NULL, if none are present. This is equivalent to strrchr(b, c),
1395 except that it accepts an END argument instead of requiring the
1396 string to be zero-terminated. Why is there no memrchr()? */
1398 find_last_char (const char *b, const char *e, char c)
1406 /* Resolve "." and ".." elements of PATH by destructively modifying
1407 PATH. "." is resolved by removing that path element, and ".." is
1408 resolved by removing the preceding path element. Leading and
1409 trailing slashes are preserved.
1411 Return non-zero if any changes have been made.
1413 For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
1414 test examples are provided below. If you change anything in this
1415 function, run test_path_simplify to make sure you haven't broken a
1418 A previous version of this function was based on path_simplify()
1419 from GNU Bash, but it has been rewritten for Wget 1.8.1. */
1422 path_simplify (char *path)
1428 ++path; /* preserve the leading '/'. */
/* END is one past the '\0' so memmove counts below include it and the
   string stays terminated after each shift.  */
1431 end = p + strlen (p) + 1; /* position past the terminating zero. */
1436 /* P should point to the beginning of a path element. */
/* Case 1: element is "." ("./foo" or a trailing "."). */
1438 if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
1440 /* Handle "./foo" by moving "foo" two characters to the
1442 if (*(p + 1) == '/')
1445 memmove (p, p + 2, end - p);
/* Case 2: element is ".." ("../foo" or a trailing ".."). */
1456 else if (*p == '.' && *(p + 1) == '.'
1457 && (*(p + 2) == '/' || *(p + 2) == '\0'))
1459 /* Handle "../foo" by moving "foo" one path element to the
1461 char *b = p; /* not p-1 because P can equal PATH */
1463 /* Backtrack by one path element, but not past the beginning
1466 /* foo/bar/../baz */
1472 /* Move backwards until B hits the beginning of the
1473 previous path element or the beginning of path. */
1474 for (--b; b > path && *(b - 1) != '/'; b--)
1479 if (*(p + 2) == '/')
1481 memmove (b, p + 3, end - (p + 3));
1495 /* Remove empty path elements. Not mandated by rfc1808 et
1496 al, but empty path elements are not all that useful, and
1497 the rest of Wget might not deal with them well. */
1507 memmove (p, q, end - q);
1512 /* Skip to the next path element. */
1513 while (*p && *p != '/')
1518 /* Make sure P points to the beginning of the next path element,
1519 which is location after the slash. */
1526 /* Resolve the result of "linking" a base URI (BASE) to a
1527 link-specified URI (LINK).
1529 Either of the URIs may be absolute or relative, complete with the
1530 host name, or path only. This tries to behave "reasonably" in all
1531 foreseeable cases. It employs little specific knowledge about
1532 schemes or URL-specific stuff -- it just works on strings.
1534 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1535 See uri_merge for a gentler interface to this functionality.
1537 Perhaps this function should call path_simplify so that the callers
1538 don't have to call url_parse unconditionally. */
1540 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1546 const char *end = base + path_length (base);
1550 /* Empty LINK points back to BASE, query string and all. */
1551 constr = xstrdup (base);
1553 else if (*link == '?')
1555 /* LINK points to the same location, but changes the query
1556 string. Examples: */
1557 /* uri_merge("path", "?new") -> "path?new" */
1558 /* uri_merge("path?foo", "?new") -> "path?new" */
1559 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1560 /* uri_merge("path#foo", "?new") -> "path?new" */
1561 int baselength = end - base;
1562 constr = xmalloc (baselength + linklength + 1);
1563 memcpy (constr, base, baselength);
1564 memcpy (constr + baselength, link, linklength);
1565 constr[baselength + linklength] = '\0';
1567 else if (*link == '#')
1569 /* uri_merge("path", "#new") -> "path#new" */
1570 /* uri_merge("path#foo", "#new") -> "path#new" */
1571 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1572 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1574 const char *end1 = strchr (base, '#');
1576 end1 = base + strlen (base);
1577 baselength = end1 - base;
1578 constr = xmalloc (baselength + linklength + 1);
1579 memcpy (constr, base, baselength);
1580 memcpy (constr + baselength, link, linklength);
1581 constr[baselength + linklength] = '\0';
1583 else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
1585 /* LINK begins with "//" and so is a net path: we need to
1586 replace everything after (and including) the double slash
1589 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1590 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1591 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1595 const char *start_insert;
1597 /* Look for first slash. */
1598 slash = memchr (base, '/', end - base);
1599 /* If found slash and it is a double slash, then replace
1600 from this point, else default to replacing from the
1602 if (slash && *(slash + 1) == '/')
1603 start_insert = slash;
1605 start_insert = base;
1607 span = start_insert - base;
1608 constr = (char *)xmalloc (span + linklength + 1);
1610 memcpy (constr, base, span);
1611 memcpy (constr + span, link, linklength);
1612 constr[span + linklength] = '\0';
1614 else if (*link == '/')
1616 /* LINK is an absolute path: we need to replace everything
1617 after (and including) the FIRST slash with LINK.
1619 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1620 "/qux/xyzzy", our result should be
1621 "http://host/qux/xyzzy". */
1624 const char *start_insert = NULL; /* for gcc to shut up. */
1625 const char *pos = base;
1626 int seen_slash_slash = 0;
1627 /* We're looking for the first slash, but want to ignore
1630 slash = memchr (pos, '/', end - pos);
1631 if (slash && !seen_slash_slash)
1632 if (*(slash + 1) == '/')
1635 seen_slash_slash = 1;
1639 /* At this point, SLASH is the location of the first / after
1640 "//", or the first slash altogether. START_INSERT is the
1641 pointer to the location where LINK will be inserted. When
1642 examining the last two examples, keep in mind that LINK
1645 if (!slash && !seen_slash_slash)
1646 /* example: "foo" */
1648 start_insert = base;
1649 else if (!slash && seen_slash_slash)
1650 /* example: "http://foo" */
1653 else if (slash && !seen_slash_slash)
1654 /* example: "foo/bar" */
1656 start_insert = base;
1657 else if (slash && seen_slash_slash)
1658 /* example: "http://something/" */
1660 start_insert = slash;
1662 span = start_insert - base;
1663 constr = (char *)xmalloc (span + linklength + 1);
1665 memcpy (constr, base, span);
1667 memcpy (constr + span, link, linklength);
1668 constr[span + linklength] = '\0';
1672 /* LINK is a relative URL: we need to replace everything
1673 after last slash (possibly empty) with LINK.
1675 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1676 our result should be "whatever/foo/qux/xyzzy". */
1677 int need_explicit_slash = 0;
1679 const char *start_insert;
1680 const char *last_slash = find_last_char (base, end, '/');
1683 /* No slash found at all. Append LINK to what we have,
1684 but we'll need a slash as a separator.
1686 Example: if base == "foo" and link == "qux/xyzzy", then
1687 we cannot just append link to base, because we'd get
1688 "fooqux/xyzzy", whereas what we want is
1691 To make sure the / gets inserted, we set
1692 need_explicit_slash to 1. We also set start_insert
1693 to end + 1, so that the length calculations work out
1694 correctly for one more (slash) character. Accessing
1695 that character is fine, since it will be the
1696 delimiter, '\0' or '?'. */
1697 /* example: "foo?..." */
1698 /* ^ ('?' gets changed to '/') */
1699 start_insert = end + 1;
1700 need_explicit_slash = 1;
1702 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1704 /* example: http://host" */
1706 start_insert = end + 1;
1707 need_explicit_slash = 1;
1711 /* example: "whatever/foo/bar" */
1713 start_insert = last_slash + 1;
1716 span = start_insert - base;
1717 constr = (char *)xmalloc (span + linklength + 1);
1719 memcpy (constr, base, span);
1720 if (need_explicit_slash)
1721 constr[span - 1] = '/';
1723 memcpy (constr + span, link, linklength);
1724 constr[span + linklength] = '\0';
1727 else /* !no_scheme */
1729 constr = strdupdelim (link, link + linklength);
/* Merge BASE with LINK and return the resulting URI.  This is an
   interface to uri_merge_1 that assumes that LINK is a
   zero-terminated string.  The result is freshly allocated; the
   caller is responsible for freeing it.

   Fix: the extraction dropped this function's return type and braces;
   restored here (uri_merge_1 returns a malloc'ed char *).  */
char *
uri_merge (const char *base, const char *link)
{
  return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
}
/* Append string S at write cursor P, without the terminating '\0'.
   NOTE(review): the extraction dropped this macro's trailing lines
   (presumably `p += len;' and the closing `} while (0)') -- restore
   them from the original source before compiling.  */
1743 #define APPEND(p, s) do { \
1744 int len = strlen (s); \
1745 memcpy (p, s, len); \
1749 /* Use this instead of password when the actual password is supposed
1750 to be hidden. We intentionally use a generic string without giving
1751 away the number of characters in the password, like previous
1753 #define HIDDEN_PASSWORD "*password*"
1755 /* Recreate the URL string from the data in URL.
1757 If HIDE is non-zero (as it is when we're calling this on a URL we
1758 plan to print, but not when calling it to canonicalize a URL for
1759 use within the program), password will be hidden. Unsafe
1760 characters in the URL will be quoted. */
/* Returns a freshly xmalloc'ed string; presumably the caller frees
   it -- the `return' line and several declarations (size, p, result)
   were dropped by the extraction; confirm against the original.  */
1763 url_string (const struct url *url, int hide_password)
1767 char *quoted_user = NULL, *quoted_passwd = NULL;
1769 int scheme_port = supported_schemes[url->scheme].default_port;
1770 char *scheme_str = supported_schemes[url->scheme].leading_string;
1771 int fplen = full_path_length (url);
1773 assert (scheme_str != NULL);
1775 /* Make sure the user name and password are quoted. */
1778 quoted_user = encode_string_maybe (url->user);
/* When hiding, substitute the shared HIDDEN_PASSWORD literal; the
   free logic at the bottom deliberately skips it by testing
   !hide_password, since a string literal must never be freed.  */
1782 quoted_passwd = HIDDEN_PASSWORD;
1784 quoted_passwd = encode_string_maybe (url->passwd);
/* First pass: compute the exact byte count of the output so a single
   xmalloc suffices; the assert below checks this accounting.  */
1788 size = (strlen (scheme_str)
1789 + strlen (url->host)
1792 if (url->port != scheme_port)
1793 size += 1 + numdigit (url->port);
1796 size += 1 + strlen (quoted_user);
1798 size += 1 + strlen (quoted_passwd);
/* Second pass: emit the components through the APPEND cursor P.  */
1801 p = result = xmalloc (size);
1803 APPEND (p, scheme_str);
1806 APPEND (p, quoted_user);
1810 APPEND (p, quoted_passwd);
1815 APPEND (p, url->host);
1816 if (url->port != scheme_port)
1819 p = number_to_string (p, url->port);
1822 full_path_write (url, p);
/* The precomputed size must match exactly what was written.  */
1826 assert (p - result == size);
/* encode_string_maybe may return its argument unchanged (hence the
   pointer-inequality tests); only free copies it actually made, and
   never the HIDDEN_PASSWORD literal (hence !hide_password).  */
1828 if (quoted_user && quoted_user != url->user)
1829 xfree (quoted_user);
1830 if (quoted_passwd && !hide_password
1831 && quoted_passwd != url->passwd)
1832 xfree (quoted_passwd);
1837 /* Returns proxy host address, in accordance with SCHEME. */
/* NOTE(review): the return type, braces, `switch (scheme)' header and
   the final return were dropped by the extraction.  */
1839 getproxy (enum url_scheme scheme)
1842 char *rewritten_url;
/* Static buffer: the returned pointer may reference this storage, so
   it is overwritten by the next call and the function is not
   thread-safe or reentrant.  */
1843 static char rewritten_storage[1024];
/* Per-scheme selection: command-line option wins, otherwise fall back
   to the conventional environment variable.  */
1848 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1852 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1856 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1858 case SCHEME_INVALID:
1861 if (!proxy || !*proxy)
1864 /* Handle shorthands. */
1865 rewritten_url = rewrite_shorthand_url (proxy);
/* strncpy alone does not guarantee NUL-termination; the next line
   supplies it.  Proxy URLs longer than 1023 bytes are silently
   truncated.  */
1868 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1869 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1870 proxy = rewritten_storage;
1876 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns nonzero when HOST should go through the proxy, i.e. when it
   does not suffix-match any entry of the NO_PROXY list.  NOTE(review):
   the extraction dropped the lines between the signature and the
   return (presumably braces and a `!no_proxy' early exit); restore
   from the original.  */
1878 no_proxy_match (const char *host, const char **no_proxy)
1883 return !sufmatch (no_proxy, host);
1886 /* Support for converting links for local viewing in downloaded HTML
1887 files. This should be moved to another file, because it has
1888 nothing to do with processing URLs. */
1890 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1891 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1893 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1894 const char *, int));
1895 static char *local_quote_string PARAMS ((const char *));
1897 /* Change the links in one HTML file. LINKS is a list of links in the
1898 document, along with their positions and the desired direction of
/* Reads FILE into memory, then rewrites it in place: every link
   marked for conversion is replaced (relative, absolute, or nullified
   base href) while all other bytes are copied through unchanged.
   NOTE(review): several lines were dropped by the extraction (the
   declarations of FP and the read cursor P, braces, `continue's, the
   counter increments for to_url_count/to_file_count, and the final
   fclose); confirm against the original.  */
1901 convert_links (const char *file, struct urlpos *links)
1903 struct file_memory *fm;
1906 downloaded_file_t downloaded_file_return;
1908 struct urlpos *link;
1909 int to_url_count = 0, to_file_count = 0;
1911 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1914 /* First we do a "dry run": go through the list L and see whether
1915 any URL needs to be converted in the first place. If not, just
1916 leave the file alone. */
1918 struct urlpos *dry = links;
1919 for (dry = links; dry; dry = dry->next)
1920 if (dry->convert != CO_NOCONVERT)
1924 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1929 fm = read_file (file);
1932 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1933 file, strerror (errno));
/* Optionally snapshot the pristine file as *.orig before rewriting.  */
1937 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1938 if (opt.backup_converted && downloaded_file_return)
1939 write_backup_file (file, downloaded_file_return);
1941 /* Before opening the file for writing, unlink the file. This is
1942 important if the data in FM is mmaped. In such case, nulling the
1943 file, which is what fopen() below does, would make us read all
1944 zeroes from the mmaped region. */
1945 if (unlink (file) < 0 && errno != ENOENT)
1947 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1948 file, strerror (errno));
1949 read_file_free (fm);
1952 /* Now open the file for writing. */
1953 fp = fopen (file, "wb");
1956 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1957 file, strerror (errno));
1958 read_file_free (fm);
1962 /* Here we loop through all the URLs in file, replacing those of
1963 them that are downloaded with relative references. */
1965 for (link = links; link; link = link->next)
1967 char *url_start = fm->content + link->pos;
/* Sanity check: a link offset past the file's end means LINKS and FM
   are out of sync.  */
1969 if (link->pos >= fm->length)
1971 DEBUGP (("Something strange is going on. Please investigate."));
1974 /* If the URL is not to be converted, skip it. */
1975 if (link->convert == CO_NOCONVERT)
1977 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
1981 /* Echo the file contents, up to the offending URL's opening
1982 quote, to the outfile. */
1983 fwrite (p, 1, url_start - p, fp);
1986 switch (link->convert)
1988 case CO_CONVERT_TO_RELATIVE:
1989 /* Convert absolute URL to relative. */
1991 char *newname = construct_relative (file, link->local_name);
1992 char *quoted_newname = local_quote_string (newname);
/* <meta http-equiv=refresh> needs the "timeout; URL=" prefix, hence
   the special-cased replacement below.  */
1994 if (!link->link_refresh_p)
1995 p = replace_attr (p, link->size, fp, quoted_newname);
1997 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
1998 link->refresh_timeout);
2000 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2001 link->url->url, newname, link->pos, file));
2003 xfree (quoted_newname);
2007 case CO_CONVERT_TO_COMPLETE:
2008 /* Convert the link to absolute URL. */
2010 char *newlink = link->url->url;
2011 char *quoted_newlink = html_quote_string (newlink);
2013 if (!link->link_refresh_p)
2014 p = replace_attr (p, link->size, fp, quoted_newlink);
2016 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2017 link->refresh_timeout);
2019 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2020 newlink, link->pos, file));
2021 xfree (quoted_newlink);
2025 case CO_NULLIFY_BASE:
2026 /* Change the base href to "". */
2027 p = replace_attr (p, link->size, fp, "");
2035 /* Output the rest of the file. */
2036 if (p - fm->content < fm->length)
2037 fwrite (p, 1, fm->length - (p - fm->content), fp);
2039 read_file_free (fm);
/* Summary line: files-converted-to-local vs. links-left-as-URLs.  */
2041 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
2044 /* Construct and return a malloced copy of the relative link from two
2045 pieces of information: local name S1 of the referring file and
2046 local name S2 of the referred file.
2048 So, if S1 is "jagor.srce.hr/index.html" and S2 is
2049 "jagor.srce.hr/images/news.gif", the function will return
2052 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
2053 "fly.cc.fer.hr/images/fly.gif", the function will return
2054 "../images/fly.gif".
2056 Caveats: S1 should not begin with `/', unless S2 also begins with
2057 '/'. S1 should not contain things like ".." and such --
2058 construct_relative ("fly/ioccc/../index.html",
2059 "fly/images/fly.gif") will fail. (A workaround is to call
2060 something like path_simplify() on S1). */
/* NOTE(review): the extraction dropped the return type, braces, the
   early `*s2 == '/'` test implied by the xstrdup below, and the
   bodies of the scanning loops; restore from the original.  */
2062 construct_relative (const char *s1, const char *s2)
2064 int i, cnt, sepdirs1;
/* Absolute S2 is returned verbatim (copied).  */
2068 return xstrdup (s2);
2069 /* S1 should *not* be absolute, if S2 wasn't. */
2070 assert (*s1 != '/');
2072 /* Skip the directories common to both strings. */
2075 while (s1[i] && s2[i]
2080 if (s1[i] == '/' && s2[i] == '/')
/* Count the remaining directory separators in S1: each one becomes a
   "../" in the result.  */
2085 for (sepdirs1 = 0; s1[i]; i++)
2088 /* Now, construct the file as of:
2089 - ../ repeated sepdirs1 time
2090 - all the non-mutual directories of S2. */
2091 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
2092 for (i = 0; i < sepdirs1; i++)
2093 memcpy (res + 3 * i, "../", 3);
2094 strcpy (res + 3 * i, s2 + cnt);
/* Rename FILE to FILE.orig (or, for -E-renamed files, swap the
   ".html" suffix for ".orig") before it gets overwritten with the
   link-converted version -- but only once per file per run.  */
2099 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2101 /* Rather than just writing over the original .html file with the
2102 converted version, save the former to *.orig. Note we only do
2103 this for files we've _successfully_ downloaded, so we don't
2104 clobber .orig files sitting around from previous invocations. */
2106 /* Construct the backup filename as the original name plus ".orig". */
2107 size_t filename_len = strlen(file);
2108 char* filename_plus_orig_suffix;
2109 boolean already_wrote_backup_file = FALSE;
2110 slist* converted_file_ptr;
/* Persists across calls: the set of files already backed up.  */
2111 static slist* converted_files = NULL;
2113 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2115 /* Just write "orig" over "html". We need to do it this way
2116 because when we're checking to see if we've downloaded the
2117 file before (to see if we can skip downloading it), we don't
2118 know if it's a text/html file. Therefore we don't know yet
2119 at that stage that -E is going to cause us to tack on
2120 ".html", so we need to compare vs. the original URL plus
2121 ".orig", not the original URL plus ".html.orig". */
/* filename_len + 1 suffices here: "orig" overwrites the final "html"
   (both 4 chars), so the result is the same length as FILE.  */
2122 filename_plus_orig_suffix = alloca (filename_len + 1);
2123 strcpy(filename_plus_orig_suffix, file);
2124 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2126 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2128 /* Append ".orig" to the name. */
2129 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2130 strcpy(filename_plus_orig_suffix, file);
2131 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2134 /* We can get called twice on the same URL thanks to the
2135 convert_all_links() call in main(). If we write the .orig file
2136 each time in such a case, it'll end up containing the first-pass
2137 conversion, not the original file. So, see if we've already been
2138 called on this file. */
2139 converted_file_ptr = converted_files;
2140 while (converted_file_ptr != NULL)
2141 if (strcmp(converted_file_ptr->string, file) == 0)
2143 already_wrote_backup_file = TRUE;
2147 converted_file_ptr = converted_file_ptr->next;
2149 if (!already_wrote_backup_file)
2151 /* Rename <file> to <file>.orig before former gets written over. */
2152 if (rename(file, filename_plus_orig_suffix) != 0)
2153 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2154 file, filename_plus_orig_suffix, strerror (errno));
2156 /* Remember that we've already written a .orig backup for this file.
2157 Note that we never free this memory since we need it till the
2158 convert_all_links() call, which is one of the last things the
2159 program does before terminating. BTW, I'm not sure if it would be
2160 safe to just set 'converted_file_ptr->string' to 'file' below,
2161 rather than making a copy of the string... Another note is that I
2162 thought I could just add a field to the urlpos structure saying
2163 that we'd written a .orig file for this URL, but that didn't work,
2164 so I had to make this separate list.
2165 -- Dan Harkless <wget@harkless.org>
2167 This [adding a field to the urlpos structure] didn't work
2168 because convert_file() is called from convert_all_links at
2169 the end of the retrieval with a freshly built new urlpos
2171 -- Hrvoje Niksic <hniksic@arsdigita.com>
/* Push the file name onto the remembered list (intentionally never
   freed -- see the note above in this same comment block).  */
2173 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2174 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2175 converted_file_ptr->next = converted_files;
2176 converted_files = converted_file_ptr;
2180 static int find_fragment PARAMS ((const char *, int, const char **,
2183 /* Replace an attribute's original text with NEW_TEXT. */
/* Writes the replacement (re-quoted, fragment preserved) to FP and
   presumably returns a pointer just past the original attribute value
   in the input buffer -- the return statement and the lines that
   detect/skip the opening quote were dropped by the extraction;
   confirm against the original.  */
2186 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2189 char quote_char = '\"'; /* use "..." for quoting, unless the
2190 original value is quoted, in which
2191 case reuse its quoting char. */
2192 const char *frag_beg, *frag_end;
2194 /* Structure of our string is:
2195 "...old-contents..."
2196 <--- size ---> (with quotes)
2199 <--- size --> (no quotes) */
2201 if (*p == '\"' || *p == '\'')
2206 size -= 2; /* disregard opening and closing quote */
2208 putc (quote_char, fp);
2209 fputs (new_text, fp);
/* Keep the original fragment (#anchor) so in-page links survive the
   rewrite.  */
2211 /* Look for fragment identifier, if any. */
2212 if (find_fragment (p, size, &frag_beg, &frag_end))
2213 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2217 putc (quote_char, fp);
2222 /* The same as REPLACE_ATTR, but used when replacing
2223 <meta http-equiv=refresh content="new_text"> because we need to
2224 append "timeout_value; URL=" before the new_text. */
/* NOTE(review): the extraction dropped the return type, braces, and
   the continuation of the alloca size expression -- it must also
   cover strlen ("; URL=") + strlen (new_text) + 1 for the sprintf
   below; confirm against the original.  */
2227 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2228 const char *new_text, int timeout)
2231 char *new_with_timeout = (char *)alloca (numdigit (timeout)
2235 sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2237 return replace_attr (p, size, fp, new_with_timeout);
2240 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2241 preceded by '&'. If the character is not found, return zero. If
2242 the character is found, return 1 and set BP and EP to point to the
2243 beginning and end of the region.
2245 This is used for finding the fragment identifiers in URLs. */
/* NOTE(review): the scanning loop's body (the '&'-escape handling and
   the assignments to *BP/*EP) was dropped by the extraction; restore
   from the original.  */
2248 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2250 const char *end = beg + size;
2252 for (; beg < end; beg++)
2274 /* Quote FILE for use as local reference to an HTML file.
2276 We quote ? as %3F to avoid passing part of the file name as the
2277 parameter when browsing the converted file through HTTP. However,
2278 it is safe to do this only when `--html-extension' is turned on.
2279 This is because converting "index.html?foo=bar" to
2280 "index.html%3Ffoo=bar" would break local browsing, as the latter
2281 isn't even recognized as an HTML file! However, converting
2282 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2283 safe for both local and HTTP-served browsing. */
/* NOTE(review): the return type, braces, the declaration of QM/TO,
   and the body of the copy loop (which expands each '?' to "%3F")
   were dropped by the extraction; confirm against the original.  */
2286 local_quote_string (const char *file)
2288 const char *file_sans_qmark;
/* Without -E, plain HTML-quoting is sufficient (see comment above).  */
2291 if (!opt.html_extension)
2292 return html_quote_string (file);
2294 qm = count_char (file, '?');
2298 const char *from = file;
2301 /* qm * 2 because we replace each question mark with "%3F",
2302 i.e. replace one char with three, hence two more. */
2303 int fsqlen = strlen (file) + qm * 2;
2305 to = newname = (char *)alloca (fsqlen + 1);
2306 for (; *from; from++)
2317 assert (to - newname == fsqlen);
2320 file_sans_qmark = newname;
2323 file_sans_qmark = file;
/* Finally apply the generic HTML quoting to the ?-escaped name.  */
2325 return html_quote_string (file_sans_qmark);
2328 /* We're storing "modes" of type downloaded_file_t in the hash table.
2329 However, our hash tables only accept pointers for keys and values.
2330 So when we need a pointer, we use the address of a
2331 downloaded_file_t variable of static storage. */
2333 static downloaded_file_t *
2334 downloaded_mode_to_ptr (downloaded_file_t mode)
/* One static per enumerator: their addresses are stable for the
   program's lifetime, so they are safe to store in the hash table.  */
2336 static downloaded_file_t
2337 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2338 v2 = FILE_DOWNLOADED_NORMALLY,
2339 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2340 v4 = CHECK_FOR_FILE;
/* NOTE(review): the `switch (mode)' header and the `return &vN;'
   lines under each case were dropped by the extraction; each case
   presumably returns the address of the matching static -- confirm.  */
2344 case FILE_NOT_ALREADY_DOWNLOADED:
2346 case FILE_DOWNLOADED_NORMALLY:
2348 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2350 case CHECK_FOR_FILE:
2356 /* This should really be merged with dl_file_url_map and
2357 downloaded_html_files in recur.c. This was originally a list, but
2358 I changed it to a hash table because it was actually taking a lot of
2359 time to find things in it. */
2361 static struct hash_table *downloaded_files_hash;
2363 /* Remembers which files have been downloaded. In the standard case, should be
2364 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2365 download successfully (i.e. not for ones we have failures on or that we skip
2368 When we've downloaded a file and tacked on a ".html" extension due to -E,
2369 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2370 FILE_DOWNLOADED_NORMALLY.
2372 If you just want to check if a file has been previously added without adding
2373 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2374 with local filenames, not remote URLs. */
2376 downloaded_file (downloaded_file_t mode, const char *file)
2378 downloaded_file_t *ptr;
2380 if (mode == CHECK_FOR_FILE)
2382 if (!downloaded_files_hash)
2383 return FILE_NOT_ALREADY_DOWNLOADED;
2384 ptr = hash_table_get (downloaded_files_hash, file);
2386 return FILE_NOT_ALREADY_DOWNLOADED;
2390 if (!downloaded_files_hash)
2391 downloaded_files_hash = make_string_hash_table (0);
2393 ptr = hash_table_get (downloaded_files_hash, file);
2397 ptr = downloaded_mode_to_ptr (mode);
2398 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2400 return FILE_NOT_ALREADY_DOWNLOADED;
/* Hash-table mapper used by downloaded_files_free; presumably frees
   each entry's strdup'ed key -- the function's return type and body
   were dropped by the extraction, so confirm against the original.  */
2404 df_free_mapper (void *key, void *value, void *ignored)
2411 downloaded_files_free (void)
2413 if (downloaded_files_hash)
2415 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2416 hash_table_destroy (downloaded_files_hash);
2417 downloaded_files_hash = NULL;
2422 /* Debugging and testing support for path_simplify. */
2424 /* Debug: run path_simplify on PATH and return the result in a new
2425 string. Useful for calling from the debugger. */
/* NOTE(review): the function's signature (presumably something like
   `static char *ps (char *path)'), braces, and the `return copy;'
   were dropped by the extraction; restore from the original.  */
2429 char *copy = xstrdup (path);
2430 path_simplify (copy);
/* Test driver: run path_simplify on a copy of TEST and report (a) a
   result different from EXPECTED_RESULT, and (b) a modified-flag
   different from EXPECTED_CHANGE.

   Fixes: (1) restored the return type, braces, `else' arm, and
   trailing printf arguments dropped by the extraction; (2) the two
   diagnostic messages were inverted -- "Expected no modification" was
   printed in the expected_change == 1 branch (i.e. exactly when a
   modification WAS expected) and vice versa; (3) free the scratch
   copy.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
	    test, expected_result, test_copy);

  if (modified != expected_change)
    {
      if (expected_change == 1)
	/* path_simplify left the string alone although we expected a
	   change.  */
	printf ("Expected modification with path_simplify(\"%s\").\n",
		test);
      else
	printf ("Expected no modification with path_simplify(\"%s\").\n",
		test);
    }

  xfree (test_copy);
}
2458 test_path_simplify (void)
2461 char *test, *result;
2467 { "foo", "foo", 0 },
2468 { "foo/bar", "foo/bar", 0 },
2469 { "foo///bar", "foo/bar", 1 },
2470 { "foo/.", "foo/", 1 },
2471 { "foo/./", "foo/", 1 },
2472 { "foo./", "foo./", 0 },
2473 { "foo/../bar", "bar", 1 },
2474 { "foo/../bar/", "bar/", 1 },
2475 { "foo/bar/..", "foo/", 1 },
2476 { "foo/bar/../x", "foo/x", 1 },
2477 { "foo/bar/../x/", "foo/x/", 1 },
2478 { "foo/..", "", 1 },
2479 { "foo/../..", "", 1 },
2480 { "a/b/../../c", "c", 1 },
2481 { "./a/../b", "b", 1 }
2485 for (i = 0; i < ARRAY_SIZE (tests); i++)
2487 char *test = tests[i].test;
2488 char *expected_result = tests[i].result;
2489 int expected_change = tests[i].should_modify;
2490 run_test (test, expected_result, expected_change);
2493 /* Now run all the tests with a leading slash before the test case,
2494 to prove that the slash is being preserved. */
2495 for (i = 0; i < ARRAY_SIZE (tests); i++)
2497 char *test, *expected_result;
2498 int expected_change = tests[i].should_modify;
2500 test = xmalloc (1 + strlen (tests[i].test) + 1);
2501 sprintf (test, "/%s", tests[i].test);
2503 expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2504 sprintf (expected_result, "/%s", tests[i].result);
2506 run_test (test, expected_result, expected_change);
2509 xfree (expected_result);