2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* NOTE(review): this chunk is a partial extraction of wget's url.c --
   many intermediate source lines are missing and each visible line
   still carries its original line-number token.  The fragments below
   are documented as-is; recover the complete file before making any
   code change. */
/* DOTP(x): true iff the string X is exactly ".". */
47 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* DDOTP(x): true iff the string X is exactly "..". */
49 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
51 static int urlpath_length PARAMS ((const char *));
59 /* Supported schemes: */
/* Entry order must match enum url_scheme: url_scheme() returns the
   index of the matching entry, and scheme_default_port()/url_string()
   index this table by the enum value. */
60 static struct scheme_data supported_schemes[] =
62 { "http://", DEFAULT_HTTP_PORT },
64 { "https://", DEFAULT_HTTPS_PORT },
66 { "ftp://", DEFAULT_FTP_PORT },
72 static char *construct_relative PARAMS ((const char *, const char *));
75 /* Support for encoding and decoding of URL strings. We determine
76 whether a character is unsafe through static table lookup. This
77 code assumes ASCII character set and 8-bit chars. */
/* One-letter shorthands keep the 256-entry table below readable. */
84 #define R urlchr_reserved
85 #define U urlchr_unsafe
/* NOTE(review): the table also uses RU, presumably (R | U); its
   #define is not visible in this extract -- confirm against the full
   file. */
88 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
90 /* rfc1738 reserved chars, preserved from encoding. */
92 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
94 /* rfc1738 unsafe chars, plus some more. */
96 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
/* Lookup table indexed by character value; each entry is a bitmask of
   urlchr_* flags tested via urlchr_test() above.  NOTE(review):
   "const static" is legal C but the conventional order is
   "static const". */
98 const static unsigned char urlchr_table[256] =
100 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
101 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
102 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
103 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
104 U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
105 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
106 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
107 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
108 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
109 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
110 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
111 0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
112 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
113 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
114 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
115 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
/* All bytes 0x80-0xff (non-ASCII) are flagged unsafe. */
117 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
118 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
119 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
120 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
122 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
123 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
124 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
125 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
128 /* Decodes the forms %xy in a URL to the character the hexadecimal
129 code of which is xy. xy are hexadecimal digits from
130 [0123456789ABCDEF] (case-insensitive). If x or y are not
131 hex-digits or `%' precedes `\0', the sequence is inserted
/* In-place %xy decoder; T (write) trails H (read), so the result can
   only shrink or stay the same length. */
135 decode_string (char *s)
137 char *t = s; /* t - tortoise */
138 char *h = s; /* h - hare */
149 /* Do nothing if '%' is not followed by two hex digits. */
150 if (!*(h + 1) || !*(h + 2)
151 || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
/* Combine the two hex digits into one byte and store it at T. */
153 *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
160 /* Like encode_string, but return S if there are no unsafe chars. */
/* Two passes: the first counts unsafe chars (each grows by two for
   the hex digits), the second writes %XX escapes into a fresh buffer. */
163 encode_string_maybe (const char *s)
170 for (p1 = s; *p1; p1++)
171 if (UNSAFE_CHAR (*p1))
172 addition += 2; /* Two more characters (hex digits) */
177 newlen = (p1 - s) + addition;
178 newstr = (char *)xmalloc (newlen + 1);
184 if (UNSAFE_CHAR (*p1))
186 unsigned char c = *p1++;
188 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
189 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* Sanity check: the second pass must fill exactly the bytes the first
   pass counted. */
195 assert (p2 - newstr == newlen);
200 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
201 given string, returning a malloc-ed %XX encoded string. */
/* Unlike encode_string_maybe this always returns malloc'd storage --
   presumably copying S when nothing needed quoting; the copy is in
   lines elided from this extract. */
204 encode_string (const char *s)
206 char *encoded = encode_string_maybe (s);
213 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
214 the old value of PTR is freed and PTR is made to point to the newly
215 allocated storage. */
217 #define ENCODE(ptr) do { \
218 char *e_new = encode_string_maybe (ptr); \
/* Three-way verdict used by reencode_string for the char at P. */
226 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
228 /* Decide whether to encode, decode, or pass through the char at P.
229 This used to be a macro, but it got a little too convoluted. */
230 static inline enum copy_method
231 decide_copy_method (const char *p)
235 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
237 /* %xx sequence: decode it, unless it would decode to an
238 unsafe or a reserved char; in that case, leave it as
/* PREEMPT is the byte the %xx sequence stands for. */
240 char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
241 XCHAR_TO_XDIGIT (*(p + 2));
243 if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
244 return CM_PASSTHROUGH;
249 /* Garbled %.. sequence: encode `%'. */
252 else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
255 return CM_PASSTHROUGH;
258 /* Translate a %-quoting (but possibly non-conformant) input string S
259 into a %-quoting (and conformant) output string. If no characters
260 are encoded or decoded, return the same string S; otherwise, return
261 a freshly allocated string with the new contents.
263 After a URL has been run through this function, the protocols that
264 use `%' as the quote character can use the resulting string as-is,
265 while those that don't call decode_string() to get to the intended
266 data. This function is also stable: after an input string is
267 transformed the first time, all further transformations of the
268 result yield the same result string.
270 Let's discuss why this function is needed.
272 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
273 space character would mess up the HTTP request, it needs to be
276 GET /abc%20def HTTP/1.0
278 So it appears that the unsafe chars need to be quoted, as with
279 encode_string. But what if we're requested to download
280 `abc%20def'? Remember that %-encoding is valid URL syntax, so what
281 the user meant was a literal space, and he was kind enough to quote
282 it. In that case, Wget should obviously leave the `%20' as is, and
283 send the same request as above. So in this case we may not call
286 But what if the requested URI is `abc%20 def'? If we call
287 encode_string, we end up with `/abc%2520%20def', which is almost
288 certainly not intended. If we don't call encode_string, we are
289 left with the embedded space and cannot send the request. What the
290 user meant was for Wget to request `/abc%20%20def', and this is
291 where reencode_string kicks in.
293 Wget used to solve this by first decoding %-quotes, and then
294 encoding all the "unsafe" characters found in the resulting string.
295 This was wrong because it didn't preserve certain URL special
296 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
297 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
298 whether we considered `+' reserved (it is). One of these results
299 is inevitable because by the second step we would lose information
300 on whether the `+' was originally encoded or not. Both results
301 were wrong because in CGI parameters + means space, while %2B means
302 literal plus. reencode_string correctly translates the above to
303 "a%2B+b", i.e. returns the original string.
305 This function uses an algorithm proposed by Anon Sricharoenchai:
307 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
310 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
313 ...except that this code conflates the two steps, and decides
314 whether to encode, decode, or pass through each character in turn.
315 The function still uses two passes, but their logic is the same --
316 the first pass exists merely for the sake of allocation. Another
317 small difference is that we include `+' to URL_RESERVED.
321 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
323 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
327 "foo bar" -> "foo%20bar"
328 "foo%20bar" -> "foo%20bar"
329 "foo %20bar" -> "foo%20%20bar"
330 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
331 "foo%25%20bar" -> "foo%25%20bar"
332 "foo%2%20bar" -> "foo%252%20bar"
333 "foo+bar" -> "foo+bar" (plus is reserved!)
334 "foo%2b+bar" -> "foo%2b+bar" */
/* See the long rationale comment above: this conflates "encode unsafe
   chars and stray %" with "decode harmless %XX" into one per-character
   decision (decide_copy_method), done in two passes -- the first pass
   only sizes the allocation. */
337 reencode_string (const char *s)
343 int encode_count = 0;
344 int decode_count = 0;
346 /* First, pass through the string to see if there's anything to do,
347 and to calculate the new length. */
348 for (p1 = s; *p1; p1++)
350 switch (decide_copy_method (p1))
/* Common case: nothing to change -- hand back the input itself. */
363 if (!encode_count && !decode_count)
364 /* The string is good as it is. */
365 return (char *)s; /* C const model sucks. */
368 /* Each encoding adds two characters (hex digits), while each
369 decoding removes two characters. */
370 newlen = oldlen + 2 * (encode_count - decode_count);
371 newstr = xmalloc (newlen + 1);
378 switch (decide_copy_method (p1))
/* CM_ENCODE: expand one byte into a %XX triple. */
382 unsigned char c = *p1++;
384 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
385 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* CM_DECODE: collapse the %xx triple into the byte it denotes. */
389 *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
390 + (XCHAR_TO_XDIGIT (*(p1 + 2))));
391 p1 += 3; /* skip %xx */
398 assert (p2 - newstr == newlen);
402 /* Run PTR_VAR through reencode_string. If a new string is consed,
403 free PTR_VAR and make it point to the new storage. Obviously,
404 PTR_VAR needs to be an lvalue. */
406 #define REENCODE(ptr_var) do { \
407 char *rf_new = reencode_string (ptr_var); \
408 if (rf_new != ptr_var) \
415 /* Returns the scheme type if the scheme is supported, or
416 SCHEME_INVALID if not. */
/* Linear, case-insensitive prefix match against supported_schemes;
   the table index doubles as the enum url_scheme value. */
418 url_scheme (const char *url)
422 for (i = 0; supported_schemes[i].leading_string; i++)
423 if (!strncasecmp (url, supported_schemes[i].leading_string,
424 strlen (supported_schemes[i].leading_string)))
425 return (enum url_scheme)i;
426 return SCHEME_INVALID;
429 /* Return the number of characters needed to skip the scheme part of
430 the URL, e.g. `http://'. If no scheme is found, returns 0. */
432 url_skip_scheme (const char *url)
436 /* Skip the scheme name. We allow `-' and `+' because of `whois++',
438 while (ISALNUM (*p) || *p == '-' || *p == '+')
445 /* Skip "//" if found. */
446 if (*p == '/' && *(p + 1) == '/')
452 /* Returns 1 if the URL begins with a scheme (supported or
453 unsupported), 0 otherwise. */
455 url_has_scheme (const char *url)
/* Accepts the same scheme-name characters as url_skip_scheme above. */
458 while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Map a scheme enum to its default port via the scheme table. */
464 scheme_default_port (enum url_scheme scheme)
466 return supported_schemes[scheme].default_port;
469 /* Skip the username and password, if present here. The function
470 should be called *not* with the complete URL, but with the part
471 right after the scheme.
473 If no username and password are found, return 0. */
475 url_skip_uname (const char *url)
479 /* Look for '@' that comes before '/' or '?'. */
480 p = (const char *)strpbrk (url, "/?@");
/* Split the "user[:password]" span STR[0..LEN) into freshly malloc'd
   *USER and (when a colon is present) *PASSWD.  Judging from the
   empty-user checks below it returns 0 on invalid input; the return
   statements themselves are elided from this extract. */
488 parse_uname (const char *str, int len, char **user, char **passwd)
493 /* Empty user name not allowed. */
496 colon = memchr (str, ':', len);
498 /* Empty user name again. */
/* Copy everything after the colon as the password. */
503 int pwlen = len - (colon + 1 - str);
504 *passwd = xmalloc (pwlen + 1);
505 memcpy (*passwd, colon + 1, pwlen);
506 (*passwd)[pwlen] = '\0';
/* Copy the user part; LEN has presumably been clipped to the colon by
   code not visible here -- confirm against the full file. */
512 *user = xmalloc (len + 1);
513 memcpy (*user, str, len);
519 /* Used by main.c: detect URLs written using the "shorthand" URL forms
520 popularized by Netscape and NcFTP. HTTP shorthands look like this:
522 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
523 www.foo.com[:port] -> http://www.foo.com[:port]
525 FTP shorthands look like this:
527 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
528 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
530 If the URL needs not or cannot be rewritten, return NULL. */
532 rewrite_shorthand_url (const char *url)
/* Already has an explicit scheme: nothing to rewrite. */
536 if (url_has_scheme (url))
539 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
541 for (p = url; *p && *p != ':' && *p != '/'; p++)
549 const char *pp, *path;
551 /* If the characters after the colon and before the next slash
552 or end of string are all digits, it's HTTP. */
554 for (pp = p + 1; ISDIGIT (*pp); pp++)
557 && (*pp == '/' || *pp == '\0'))
560 /* Prepend "ftp://" to the entire URL... */
/* 6 == strlen ("ftp://") */
562 res = xmalloc (6 + strlen (url) + 1);
563 sprintf (res, "ftp://%s", url);
564 /* ...and replace ':' with '/'. */
565 res[6 + (p - url)] = '/';
572 /* Just prepend "http://" to what we have. */
/* 7 == strlen ("http://") */
573 res = xmalloc (7 + strlen (url) + 1);
574 sprintf (res, "http://%s", url);
579 static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, but never returns NULL: when no ACCEPT character
   occurs, return a pointer to S's terminating '\0' instead. */
582 strpbrk_or_eos (const char *s, const char *accept)
584 char *p = strpbrk (s, accept);
586 p = (char *)s + strlen (s);
590 /* Turn STR into lowercase; return non-zero if a character was
594 lowercase_str (char *str)
601 *str = TOLOWER (*str);
/* Error messages indexed by the PE_* codes interleaved below; keep
   the strings and the #defines in sync. */
606 static char *parse_errors[] = {
607 #define PE_NO_ERROR 0
609 #define PE_UNRECOGNIZED_SCHEME 1
610 "Unrecognized scheme",
611 #define PE_EMPTY_HOST 2
613 #define PE_BAD_PORT_NUMBER 3
615 #define PE_INVALID_USER_NAME 4
/* Store error code V through P when P is non-NULL (macro body elided
   in this extract). */
619 #define SETERR(p, v) do { \
626 Return a new struct url if successful, NULL on error. In case of
627 error, and if ERROR is not NULL, also set *ERROR to the appropriate
/* Parse URL into a freshly allocated struct url.  The string is first
   normalized with reencode_string(); components are then carved out
   in order: scheme, user:pass, host, port, path, params, query,
   fragment. */
630 url_parse (const char *url, int *error)
634 int path_modified, host_modified;
636 enum url_scheme scheme;
/* Each _b/_e pair delimits a [begin, end) span inside the encoded
   URL string. */
638 const char *uname_b, *uname_e;
639 const char *host_b, *host_e;
640 const char *path_b, *path_e;
641 const char *params_b, *params_e;
642 const char *query_b, *query_e;
643 const char *fragment_b, *fragment_e;
646 char *user = NULL, *passwd = NULL;
650 scheme = url_scheme (url);
651 if (scheme == SCHEME_INVALID)
653 SETERR (error, PE_UNRECOGNIZED_SCHEME);
/* reencode_string may return URL itself when nothing needed
   requoting; freed at the end only if it is a fresh string. */
657 url_encoded = reencode_string (url);
660 p += strlen (supported_schemes[scheme].leading_string);
662 p += url_skip_uname (p);
665 /* scheme://user:pass@host[:port]... */
668 /* We attempt to break down the URL into the components path,
669 params, query, and fragment. They are ordered like this:
671 scheme://host[:port][/path][;params][?query][#fragment] */
673 params_b = params_e = NULL;
674 query_b = query_e = NULL;
675 fragment_b = fragment_e = NULL;
678 p = strpbrk_or_eos (p, ":/;?#");
681 if (host_b == host_e)
683 SETERR (error, PE_EMPTY_HOST);
687 port = scheme_default_port (scheme);
690 const char *port_b, *port_e, *pp;
692 /* scheme://host:port/tralala */
696 p = strpbrk_or_eos (p, "/;?#");
699 if (port_b == port_e)
701 /* http://host:/whatever */
703 SETERR (error, PE_BAD_PORT_NUMBER);
/* Manual base-10 conversion; any non-digit is an error.
   NOTE(review): no overflow/range guard is visible in this extract. */
707 for (port = 0, pp = port_b; pp < port_e; pp++)
711 /* http://host:12randomgarbage/blah */
713 SETERR (error, PE_BAD_PORT_NUMBER);
716 port = 10 * port + (*pp - '0');
724 p = strpbrk_or_eos (p, ";?#");
729 /* Path is not allowed not to exist. */
737 p = strpbrk_or_eos (p, "?#");
744 p = strpbrk_or_eos (p, "#");
756 if (uname_b != uname_e)
758 /* http://user:pass@host */
760 /* uname_b uname_e */
/* The "- 1" drops the trailing '@' from the span. */
761 if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
763 SETERR (error, PE_INVALID_USER_NAME);
768 u = (struct url *)xmalloc (sizeof (struct url));
769 memset (u, 0, sizeof (*u));
772 u->host = strdupdelim (host_b, host_e);
777 u->path = strdupdelim (path_b, path_e);
778 path_modified = path_simplify (u->path);
779 parse_path (u->path, &u->dir, &u->file);
781 host_modified = lowercase_str (u->host);
784 u->params = strdupdelim (params_b, params_e);
786 u->query = strdupdelim (query_b, query_e);
788 u->fragment = strdupdelim (fragment_b, fragment_e);
791 if (path_modified || u->fragment || host_modified)
793 /* If path_simplify modified the path, or if a fragment is
794 present, or if the original host name had caps in it, make
795 sure that u->url is equivalent to what would be printed by
797 u->url = url_string (u, 0);
799 if (url_encoded != url)
800 xfree ((char *) url_encoded);
/* Otherwise reuse (or copy) the encoded form as the canonical URL. */
804 if (url_encoded == url)
805 u->url = xstrdup (url);
807 u->url = url_encoded;
/* Map a PE_* code to its message in parse_errors[]. */
815 url_error (int error_code)
817 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
818 return parse_errors[error_code];
/* Split QUOTED_PATH (still %-quoted) into malloc'd *DIR and *FILE;
   an alloca'd working copy is decoded in place first. */
822 parse_path (const char *quoted_path, char **dir, char **file)
824 char *path, *last_slash;
826 STRDUP_ALLOCA (path, quoted_path);
827 decode_string (path);
829 last_slash = strrchr (path, '/');
/* No slash at all: the whole path is the file part. */
833 *file = xstrdup (path);
837 *dir = strdupdelim (path, last_slash);
838 *file = xstrdup (last_slash + 1);
842 /* Note: URL's "full path" is the path with the query string and
843 params appended. The "fragment" (#foo) is intentionally ignored,
844 but that might be changed. For example, if the original URL was
845 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
846 the full path will be "/foo/bar/baz;bullshit?querystring". */
848 /* Return the length of the full path, without the terminating
852 full_path_length (const struct url *url)
/* Each present element costs one separator character plus its own
   length. */
856 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
867 /* Write out the full path. */
/* Writes exactly full_path_length(url) bytes at WHERE; it does not
   append a terminating '\0' (callers do that). */
870 full_path_write (const struct url *url, char *where)
872 #define FROB(el, chr) do { \
873 char *f_el = url->el; \
875 int l = strlen (f_el); \
877 memcpy (where, f_el, l); \
889 /* Public function for getting the "full path". */
891 url_full_path (const struct url *url)
893 int length = full_path_length (url);
894 char *full_path = (char *)xmalloc(length + 1);
896 full_path_write (url, full_path);
897 full_path[length] = '\0';
902 /* Sync u->path and u->url with u->dir and u->file. */
904 sync_path (struct url *url)
/* Empty dir: the path is just the file name. */
912 newpath = xstrdup (url->file);
917 int dirlen = strlen (url->dir);
918 int filelen = strlen (url->file);
/* Rebuild "dir/file" by hand: dir + '/' + file + '\0'. */
920 newpath = xmalloc (dirlen + 1 + filelen + 1);
921 memcpy (newpath, url->dir, dirlen);
922 newpath[dirlen] = '/';
923 memcpy (newpath + dirlen + 1, url->file, filelen);
924 newpath[dirlen + 1 + filelen] = '\0';
930 /* Synchronize u->url. */
932 url->url = url_string (url, 0);
935 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
936 This way we can sync u->path and u->url when they get changed. */
939 url_set_dir (struct url *url, const char *newdir)
942 url->dir = xstrdup (newdir);
947 url_set_file (struct url *url, const char *newfile)
950 url->file = xstrdup (newfile);
/* Release a struct url and all of its owned strings. */
955 url_free (struct url *url)
961 FREE_MAYBE (url->params);
962 FREE_MAYBE (url->query);
963 FREE_MAYBE (url->fragment);
964 FREE_MAYBE (url->user);
965 FREE_MAYBE (url->passwd);
/* Read FILE into memory and return a linked list of urlpos entries,
   one per non-blank line that parses as a URL.  Lines are trimmed of
   surrounding whitespace; unparsable lines are logged and skipped. */
974 get_urls_file (const char *file)
976 struct file_memory *fm;
977 struct urlpos *head, *tail;
978 const char *text, *text_end;
981 fm = read_file (file);
984 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
987 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
990 text_end = fm->content + fm->length;
991 while (text < text_end)
993 const char *line_beg = text;
994 const char *line_end = memchr (text, '\n', text_end - text);
/* Trim leading, then trailing, whitespace. */
1000 while (line_beg < line_end
1001 && ISSPACE (*line_beg))
1003 while (line_end > line_beg + 1
1004 && ISSPACE (*(line_end - 1)))
1006 if (line_end > line_beg)
1010 struct urlpos *entry;
1013 /* We must copy the URL to a zero-terminated string. *sigh*. */
1014 url_text = strdupdelim (line_beg, line_end);
1015 url = url_parse (url_text, &up_error_code);
1018 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1019 file, url_text, url_error (up_error_code));
1025 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1026 memset (entry, 0, sizeof (*entry));
1037 read_file_free (fm);
1041 /* Free the linked list of urlpos. */
1043 free_urlpos (struct urlpos *l)
/* Save the next pointer before the current node is freed. */
1047 struct urlpos *next = l->next;
1050 FREE_MAYBE (l->local_name);
1056 /* Rotate FNAME opt.backups times */
/* Shift fname.1 -> fname.2 -> ... so that FNAME can become fname.1. */
1058 rotate_backups(const char *fname)
/* Room for "fname" + '.' + digits of the largest suffix + '\0'. */
1060 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1061 char *from = (char *)alloca (maxlen);
1062 char *to = (char *)alloca (maxlen);
1066 if (stat (fname, &sb) == 0)
1067 if (S_ISREG (sb.st_mode) == 0)
/* Rename from the highest suffix down so nothing is clobbered. */
1070 for (i = opt.backups; i > 1; i--)
1072 sprintf (from, "%s.%d", fname, i - 1);
1073 sprintf (to, "%s.%d", fname, i);
1074 /* #### This will fail on machines without the rename() system
1079 sprintf (to, "%s.%d", fname, 1);
1083 /* Create all the necessary directories for PATH (a file). Calls
1084 mkdirhier() internally. */
1086 mkalldirs (const char *path)
/* Scan backwards for the last '/' to isolate the directory part. */
1093 p = path + strlen (path);
1094 for (; *p != '/' && p != path; p--);
1095 /* Don't create if it's just a file. */
1096 if ((p == path) && (*p != '/'))
1098 t = strdupdelim (path, p);
1099 /* Check whether the directory exists. */
1100 if ((stat (t, &st) == 0))
1102 if (S_ISDIR (st.st_mode))
1109 /* If the dir exists as a file name, remove it first. This
1110 is *only* for Wget to work with buggy old CERN http
1111 servers. Here is the scenario: When Wget tries to
1112 retrieve a directory without a slash, e.g.
1113 http://foo/bar (bar being a directory), CERN server will
1114 not redirect it to http://foo/bar/ -- it will generate a
1115 directory listing containing links to bar/file1,
1116 bar/file2, etc. Wget will lose because it saves this
1117 HTML listing to a file `bar', so it cannot create the
1118 directory. To work around this, if the file of the same
1119 name exists, we just remove it and create the directory
1121 DEBUGP (("Removing %s because of directory danger!\n", t));
1125 res = make_directory (t);
1127 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Count '/' characters in S (body elided in this extract). */
1133 count_slashes (const char *s)
1142 /* Return the path name of the URL-equivalent file name, with a
1143 remote-like structure of directories. */
1145 mkstruct (const struct url *u)
1147 char *dir, *dir_preencoding;
1148 char *file, *res, *dirpref;
/* Treat an empty query string the same as no query at all. */
1149 char *query = u->query && *u->query ? u->query : NULL;
/* --cut-dirs: skip the leading CUT directory components of u->dir
   (the leading '/' is stepped over first). */
1154 char *ptr = u->dir + (*u->dir == '/');
1155 int slash_count = 1 + count_slashes (ptr);
1156 int cut = MINVAL (opt.cut_dirs, slash_count);
1157 for (; cut && *ptr; ptr++)
1160 STRDUP_ALLOCA (dir, ptr);
1163 dir = u->dir + (*u->dir == '/');
1165 /* Check for the true name (or at least a consistent name for saving
1166 to directory) of HOST, reusing the hlist if possible. */
1167 if (opt.add_hostdir)
1169 /* Add dir_prefix and hostname (if required) to the beginning of
1171 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1173 + 1 + numdigit (u->port)
1175 if (!DOTP (opt.dir_prefix))
1176 sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1178 strcpy (dirpref, u->host);
/* Non-default port: append ":port" after the host part. */
1180 if (u->port != scheme_default_port (u->scheme))
1182 int len = strlen (dirpref);
1184 long_to_string (dirpref + len + 1, u->port);
1187 else /* not add_hostdir */
1189 if (!DOTP (opt.dir_prefix))
1190 dirpref = opt.dir_prefix;
1195 /* If there is a prefix, prepend it. */
1198 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1199 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
/* Re-quote unsafe chars in the directory for use as a local file
   name; reencode_string may return its argument unchanged, which is
   checked below before freeing. */
1203 dir_preencoding = dir;
1204 dir = reencode_string (dir_preencoding);
1207 if (l && dir[l - 1] == '/')
/* Directory URL: save under the conventional index file name. */
1211 file = "index.html";
1215 /* Finally, construct the full name. */
1216 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1217 + (query ? (1 + strlen (query)) : 0)
1219 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1223 strcat (res, query);
1225 if (dir != dir_preencoding)
1230 /* Compose a file name out of BASE, an unescaped file name, and QUERY,
1231 an escaped query string. The trick is to make sure that unsafe
1232 characters in BASE are escaped, and that slashes in QUERY are also
/* RESULT is a fixed-size buffer; every copy loop below is bounded by
   "to - result < sizeof (result)", so oversized input is truncated
   rather than overrun. */
1236 compose_file_name (char *base, char *query)
1242 /* Copy BASE to RESULT and encode all unsafe characters. */
1244 while (*from && to - result < sizeof (result))
1246 if (UNSAFE_CHAR (*from))
1248 unsigned char c = *from++;
1250 *to++ = XDIGIT_TO_XCHAR (c >> 4);
1251 *to++ = XDIGIT_TO_XCHAR (c & 0xf);
1257 if (query && to - result < sizeof (result))
1261 /* Copy QUERY to RESULT and encode all '/' characters. */
1263 while (*from && to - result < sizeof (result))
1277 if (to - result < sizeof (result))
1280 /* Truncate input which is too long, presumably due to a huge
1282 result[sizeof (result) - 1] = '\0';
1284 return xstrdup (result);
1287 /* Create a unique filename, corresponding to a given URL. Calls
1288 mkstruct if necessary. Does *not* actually create any directories. */
1290 url_filename (const struct url *u)
1293 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
/* Directory-structure mode: mirror the remote layout via mkstruct. */
1297 file = mkstruct (u);
1302 char *base = *u->file ? u->file : "index.html";
1303 char *query = u->query && *u->query ? u->query : NULL;
1304 file = compose_file_name (base, query);
1309 /* Check whether the prefix directory is something other than "."
1310 before prepending it. */
1311 if (!DOTP (opt.dir_prefix))
1313 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1314 + 1 + strlen (file) + 1);
1315 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1320 /* DOS-ish file systems don't like `%' signs in them; we change it
1325 for (p = file; *p; p++)
1329 #endif /* WINDOWS */
1331 /* Check the cases in which the unique extensions are not used:
1332 1) Clobbering is turned off (-nc).
1333 2) Retrieval with regetting.
1334 3) Timestamping is used.
1335 4) Hierarchy is built.
1337 The exception is the case when file does exist and is a
1338 directory (actually support for bad httpd-s). */
1339 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1340 && !(file_exists_p (file) && !file_non_directory_p (file)))
1343 /* Find a unique name. */
1344 name = unique_name (file);
1349 /* Like strlen(), but allow the URL to be ended with '?'. */
/* Length of the path component only: stops at '?', ';' or '#'. */
1351 urlpath_length (const char *url)
1353 const char *q = strpbrk_or_eos (url, "?;#");
1357 /* Find the last occurrence of character C in the range [b, e), or
1358 NULL, if none are present. This is almost completely equivalent to
1359 { *e = '\0'; return strrchr(b, c); }, except that it doesn't change
1360 the contents of the string. */
1362 find_last_char (const char *b, const char *e, char c)
1370 /* Resolve the result of "linking" a base URI (BASE) to a
1371 link-specified URI (LINK).
1373 Either of the URIs may be absolute or relative, complete with the
1374 host name, or path only. This tries to behave "reasonably" in all
1375 foreseeable cases. It employs little specific knowledge about
1376 schemes or URL-specific stuff -- it just works on strings.
1378 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1379 See uri_merge for a gentler interface to this functionality.
1381 #### This function should handle `./' and `../' so that the evil
1382 path_simplify can go. */
1384 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
/* END marks the end of BASE's path component (before ?, ; or #). */
1390 const char *end = base + urlpath_length (base);
1394 /* Empty LINK points back to BASE, query string and all. */
1395 constr = xstrdup (base);
1397 else if (*link == '?')
1399 /* LINK points to the same location, but changes the query
1400 string. Examples: */
1401 /* uri_merge("path", "?new") -> "path?new" */
1402 /* uri_merge("path?foo", "?new") -> "path?new" */
1403 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1404 /* uri_merge("path#foo", "?new") -> "path?new" */
1405 int baselength = end - base;
1406 constr = xmalloc (baselength + linklength + 1);
1407 memcpy (constr, base, baselength);
1408 memcpy (constr + baselength, link, linklength);
1409 constr[baselength + linklength] = '\0';
1411 else if (*link == '#')
1413 /* uri_merge("path", "#new") -> "path#new" */
1414 /* uri_merge("path#foo", "#new") -> "path#new" */
1415 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1416 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
/* Keep everything up to any existing fragment, then append LINK. */
1418 const char *end1 = strchr (base, '#');
1420 end1 = base + strlen (base);
1421 baselength = end1 - base;
1422 constr = xmalloc (baselength + linklength + 1);
1423 memcpy (constr, base, baselength);
1424 memcpy (constr + baselength, link, linklength);
1425 constr[baselength + linklength] = '\0';
1427 else if (*link == '/')
1429 /* LINK is an absolute path: we need to replace everything
1430 after (and including) the FIRST slash with LINK.
1432 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1433 "/qux/xyzzy", our result should be
1434 "http://host/qux/xyzzy". */
1437 const char *start_insert = NULL; /* for gcc to shut up. */
1438 const char *pos = base;
1439 int seen_slash_slash = 0;
1440 /* We're looking for the first slash, but want to ignore
1443 slash = memchr (pos, '/', end - pos);
1444 if (slash && !seen_slash_slash)
1445 if (*(slash + 1) == '/')
1448 seen_slash_slash = 1;
1452 /* At this point, SLASH is the location of the first / after
1453 "//", or the first slash altogether. START_INSERT is the
1454 pointer to the location where LINK will be inserted. When
1455 examining the last two examples, keep in mind that LINK
1458 if (!slash && !seen_slash_slash)
1459 /* example: "foo" */
1461 start_insert = base;
1462 else if (!slash && seen_slash_slash)
1463 /* example: "http://foo" */
1466 else if (slash && !seen_slash_slash)
1467 /* example: "foo/bar" */
1469 start_insert = base;
1470 else if (slash && seen_slash_slash)
1471 /* example: "http://something/" */
1473 start_insert = slash;
/* Keep BASE up to the insertion point, then splice in LINK. */
1475 span = start_insert - base;
1476 constr = (char *)xmalloc (span + linklength + 1);
1478 memcpy (constr, base, span);
1480 memcpy (constr + span, link, linklength);
1481 constr[span + linklength] = '\0';
1485 /* LINK is a relative URL: we need to replace everything
1486 after last slash (possibly empty) with LINK.
1488 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1489 our result should be "whatever/foo/qux/xyzzy". */
1490 int need_explicit_slash = 0;
1492 const char *start_insert;
1493 const char *last_slash = find_last_char (base, end, '/');
1496 /* No slash found at all. Append LINK to what we have,
1497 but we'll need a slash as a separator.
1499 Example: if base == "foo" and link == "qux/xyzzy", then
1500 we cannot just append link to base, because we'd get
1501 "fooqux/xyzzy", whereas what we want is
1504 To make sure the / gets inserted, we set
1505 need_explicit_slash to 1. We also set start_insert
1506 to end + 1, so that the length calculations work out
1507 correctly for one more (slash) character. Accessing
1508 that character is fine, since it will be the
1509 delimiter, '\0' or '?'. */
1510 /* example: "foo?..." */
1511 /* ^ ('?' gets changed to '/') */
1512 start_insert = end + 1;
1513 need_explicit_slash = 1;
1515 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1517 /* example: http://host" */
1519 start_insert = end + 1;
1520 need_explicit_slash = 1;
1524 /* example: "whatever/foo/bar" */
1526 start_insert = last_slash + 1;
1529 span = start_insert - base;
1530 constr = (char *)xmalloc (span + linklength + 1);
1532 memcpy (constr, base, span);
/* Overwrite the extra copied delimiter with the separator slash. */
1533 if (need_explicit_slash)
1534 constr[span - 1] = '/';
1536 memcpy (constr + span, link, linklength);
1537 constr[span + linklength] = '\0';
/* LINK already carries its own scheme: take it verbatim. */
1540 else /* !no_scheme */
1542 constr = strdupdelim (link, link + linklength);
1547 /* Merge BASE with LINK and return the resulting URI. This is an
1548 interface to uri_merge_1 that assumes that LINK is a
1549 zero-terminated string. */
1551 uri_merge (const char *base, const char *link)
1553 return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
/* Append string S at P and advance P past it (P is an lvalue; the
   advancing statement is elided from this extract). */
1556 #define APPEND(p, s) do { \
1557 int len = strlen (s); \
1558 memcpy (p, s, len); \
1562 /* Use this instead of password when the actual password is supposed
1563 to be hidden. We intentionally use a generic string without giving
1564 away the number of characters in the password, like previous
1566 #define HIDDEN_PASSWORD "*password*"
1568 /* Recreate the URL string from the data in URL.
1570 If HIDE is non-zero (as it is when we're calling this on a URL we
1571 plan to print, but not when calling it to canonicalize a URL for
1572 use within the program), password will be hidden. Unsafe
1573 characters in the URL will be quoted. */
/* Sizes the result exactly first, then writes it piecewise with
   APPEND; the final assert cross-checks the two phases. */
1576 url_string (const struct url *url, int hide_password)
1577-ish: /* quoted_* may alias url->user / url->passwd when
   encode_string_maybe found nothing to quote -- see the conditional
   frees at the bottom. */
1580 char *quoted_user = NULL, *quoted_passwd = NULL;
1582 int scheme_port = supported_schemes[url->scheme].default_port;
1583 char *scheme_str = supported_schemes[url->scheme].leading_string;
1584 int fplen = full_path_length (url);
1586 assert (scheme_str != NULL);
1588 /* Make sure the user name and password are quoted. */
1591 quoted_user = encode_string_maybe (url->user);
1595 quoted_passwd = HIDDEN_PASSWORD;
1597 quoted_passwd = encode_string_maybe (url->passwd);
1601 size = (strlen (scheme_str)
1602 + strlen (url->host)
/* Only a non-default port costs ":NNN" characters. */
1605 if (url->port != scheme_port)
1606 size += 1 + numdigit (url->port);
1609 size += 1 + strlen (quoted_user);
1611 size += 1 + strlen (quoted_passwd);
1614 p = result = xmalloc (size);
1616 APPEND (p, scheme_str);
1619 APPEND (p, quoted_user);
1623 APPEND (p, quoted_passwd);
1628 APPEND (p, url->host);
1629 if (url->port != scheme_port)
1632 long_to_string (p, url->port);
1636 full_path_write (url, p);
/* Both phases must agree on the byte count. */
1640 assert (p - result == size);
/* Free the quoted strings only when encode_string_maybe actually
   allocated them (and never free the HIDDEN_PASSWORD literal). */
1642 if (quoted_user && quoted_user != url->user)
1643 xfree (quoted_user);
1644 if (quoted_passwd && !hide_password
1645 && quoted_passwd != url->passwd)
1646 xfree (quoted_passwd);
1651 /* Returns proxy host address, in accordance with SCHEME. */
/* NOTE(review): return type, braces, `switch` header and the final
   return are elided from this extract.  The command-line option
   (opt.*_proxy) takes precedence over the corresponding environment
   variable.  Shorthand proxy URLs are rewritten into a static 1024-byte
   buffer, so the returned pointer may reference static storage: the
   result is not reentrant and is clobbered by the next call. */
1653 getproxy (enum url_scheme scheme)
1656 char *rewritten_url;
1657 static char rewritten_storage[1024];
1662 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1666 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1670 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1672 case SCHEME_INVALID:
1675 if (!proxy || !*proxy)
1678 /* Handle shorthands. */
1679 rewritten_url = rewrite_shorthand_url (proxy);
/* strncpy alone does not guarantee NUL-termination; the explicit
   terminator on the next line covers the truncation case. */
1682 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1683 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1684 proxy = rewritten_storage;
1690 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns non-zero when HOST should go through the proxy, i.e. when no
   suffix listed in NO_PROXY matches HOST (hence the negated sufmatch).
   NOTE(review): return type and braces elided in this extract. */
1692 no_proxy_match (const char *host, const char **no_proxy)
1697 return !sufmatch (no_proxy, host);
1700 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1701 static const char *replace_attr PARAMS ((const char *, int, FILE *, const char *));
1702 static char *local_quote_string PARAMS ((const char *));
1704 /* Change the links in one HTML file. LINKS is a list of links in the
1705 document, along with their positions and the desired direction of
/* NOTE(review): the rest of this header comment, the return type,
   braces, and many statements (error `return`s, `continue`s, `break`s,
   counter increments) are elided from this extract.  Overall flow:
   dry-run over LINKS; bail out if nothing needs converting; read the
   file into memory; optionally back it up; unlink and reopen it for
   writing; then stream it back out, rewriting each convertible link
   via replace_attr. */
1708 convert_links (const char *file, struct urlpos *links)
1710 struct file_memory *fm;
1713 downloaded_file_t downloaded_file_return;
1715 struct urlpos *link;
/* Conversion counters reported at the end; the increments are in lines
   elided from this extract — presumably inside the switch below. */
1716 int to_url_count = 0, to_file_count = 0;
1718 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1721 /* First we do a "dry run": go through the list L and see whether
1722 any URL needs to be converted in the first place. If not, just
1723 leave the file alone. */
1725 struct urlpos *dry = links;
1726 for (dry = links; dry; dry = dry->next)
1727 if (dry->convert != CO_NOCONVERT)
1731 logputs (LOG_VERBOSE, _("nothing to do.\n"));
/* Slurp the whole file into memory (possibly mmaped — see the unlink
   comment below). */
1736 fm = read_file (file);
1739 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1740 file, strerror (errno));
/* Back up the original before overwriting, but only for files we
   actually downloaded (see write_backup_file for why). */
1744 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1745 if (opt.backup_converted && downloaded_file_return)
1746 write_backup_file (file, downloaded_file_return);
1748 /* Before opening the file for writing, unlink the file. This is
1749 important if the data in FM is mmaped. In such case, nulling the
1750 file, which is what fopen() below does, would make us read all
1751 zeroes from the mmaped region. */
1752 if (unlink (file) < 0 && errno != ENOENT)
1754 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1755 file, strerror (errno));
1756 read_file_free (fm);
1759 /* Now open the file for writing. */
1760 fp = fopen (file, "wb");
1763 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1764 file, strerror (errno));
1765 read_file_free (fm);
1769 /* Here we loop through all the URLs in file, replacing those of
1770 them that are downloaded with relative references. */
1772 for (link = links; link; link = link->next)
1774 char *url_start = fm->content + link->pos;
/* A position beyond the buffer means LINKS is inconsistent with the
   file contents; presumably the elided code skips or aborts here. */
1776 if (link->pos >= fm->length)
1778 DEBUGP (("Something strange is going on. Please investigate."));
1781 /* If the URL is not to be converted, skip it. */
1782 if (link->convert == CO_NOCONVERT)
1784 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
1788 /* Echo the file contents, up to the offending URL's opening
1789 quote, to the outfile. */
1790 fwrite (p, 1, url_start - p, fp);
1793 switch (link->convert)
1795 case CO_CONVERT_TO_RELATIVE:
1796 /* Convert absolute URL to relative. */
/* newname is malloced by construct_relative; its xfree appears to be
   among the elided lines — verify in the full source. */
1798 char *newname = construct_relative (file, link->local_name);
1799 char *quoted_newname = local_quote_string (newname);
1800 p = replace_attr (p, link->size, fp, quoted_newname);
1801 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1802 link->url->url, newname, link->pos, file));
1804 xfree (quoted_newname);
1808 case CO_CONVERT_TO_COMPLETE:
1809 /* Convert the link to absolute URL. */
1811 char *newlink = link->url->url;
1812 char *quoted_newlink = html_quote_string (newlink);
1813 p = replace_attr (p, link->size, fp, quoted_newlink);
1814 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1815 newlink, link->pos, file));
1816 xfree (quoted_newlink);
1820 case CO_NULLIFY_BASE:
1821 /* Change the base href to "". */
1822 p = replace_attr (p, link->size, fp, "");
1830 /* Output the rest of the file. */
1831 if (p - fm->content < fm->length)
1832 fwrite (p, 1, fm->length - (p - fm->content), fp);
1834 read_file_free (fm);
1836 logprintf (LOG_VERBOSE,
1837 _("%d-%d\n"), to_file_count, to_url_count);
1840 /* Construct and return a malloced copy of the relative link from two
1841 pieces of information: local name S1 of the referring file and
1842 local name S2 of the referred file.
1844 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1845 "jagor.srce.hr/images/news.gif", the function will return
1848 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1849 "fly.cc.fer.hr/images/fly.gif", the function will return
1850 "../images/fly.gif".
1852 Caveats: S1 should not begin with `/', unless S2 also begins with
1853 '/'. S1 should not contain things like ".." and such --
1854 construct_relative ("fly/ioccc/../index.html",
1855 "fly/images/fly.gif") will fail. (A workaround is to call
1856 something like path_simplify() on S1). */
/* NOTE(review): return type (`char *` per the comment's "malloced
   copy"), braces, loop bodies and several statements are elided from
   this extract. */
1858 construct_relative (const char *s1, const char *s2)
1860 int i, cnt, sepdirs1;
/* Absolute S2 needs no relativization — hand back a copy as-is.
   (The guarding `if` line is elided here.) */
1864 return xstrdup (s2);
1865 /* S1 should *not* be absolute, if S2 wasn't. */
1866 assert (*s1 != '/');
1868 /* Skip the directories common to both strings. */
1871 while (s1[i] && s2[i]
1876 if (s1[i] == '/' && s2[i] == '/')
/* Count the directory separators left in S1 past the common prefix;
   each one costs a "../" hop in the result. */
1881 for (sepdirs1 = 0; s1[i]; i++)
1884 /* Now, construct the file as of:
1885 - ../ repeated sepdirs1 time
1886 - all the non-mutual directories of S2. */
/* 3 bytes per "../" hop, plus the S2 remainder, plus the NUL. */
1887 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1888 for (i = 0; i < sepdirs1; i++)
1889 memcpy (res + 3 * i, "../", 3);
1890 strcpy (res + 3 * i, s2 + cnt);
/* Save the original FILE to a ".orig" backup before convert_links
   overwrites it.  NOTE(review): the `static void` line, braces and a
   few statements are elided from this extract. */
1895 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1897 /* Rather than just writing over the original .html file with the
1898 converted version, save the former to *.orig. Note we only do
1899 this for files we've _successfully_ downloaded, so we don't
1900 clobber .orig files sitting around from previous invocations. */
1902 /* Construct the backup filename as the original name plus ".orig". */
1903 size_t filename_len = strlen(file);
1904 char* filename_plus_orig_suffix;
1905 boolean already_wrote_backup_file = FALSE;
1906 slist* converted_file_ptr;
/* Function-lifetime list of files already backed up; deliberately
   never freed (see the long comment below). */
1907 static slist* converted_files = NULL;
1909 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1911 /* Just write "orig" over "html". We need to do it this way
1912 because when we're checking to see if we've downloaded the
1913 file before (to see if we can skip downloading it), we don't
1914 know if it's a text/html file. Therefore we don't know yet
1915 at that stage that -E is going to cause us to tack on
1916 ".html", so we need to compare vs. the original URL plus
1917 ".orig", not the original URL plus ".html.orig". */
/* Overwrites the trailing "html" (4 chars) with "orig" — this branch
   implicitly assumes FILE ends in ".html" (guaranteed by -E adding
   it); both strings are 4 chars so the +1 alloc suffices.  alloca is
   stack-allocated: fine for a path, but unchecked. */
1918 filename_plus_orig_suffix = alloca (filename_len + 1);
1919 strcpy(filename_plus_orig_suffix, file);
1920 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1922 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1924 /* Append ".orig" to the name. */
1925 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1926 strcpy(filename_plus_orig_suffix, file);
1927 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1930 /* We can get called twice on the same URL thanks to the
1931 convert_all_links() call in main(). If we write the .orig file
1932 each time in such a case, it'll end up containing the first-pass
1933 conversion, not the original file. So, see if we've already been
1934 called on this file. */
1935 converted_file_ptr = converted_files;
1936 while (converted_file_ptr != NULL)
1937 if (strcmp(converted_file_ptr->string, file) == 0)
1939 already_wrote_backup_file = TRUE;
1943 converted_file_ptr = converted_file_ptr->next;
1945 if (!already_wrote_backup_file)
1947 /* Rename <file> to <file>.orig before former gets written over. */
/* rename() failure is logged but not fatal — conversion proceeds
   without a backup in that case. */
1948 if (rename(file, filename_plus_orig_suffix) != 0)
1949 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1950 file, filename_plus_orig_suffix, strerror (errno));
1952 /* Remember that we've already written a .orig backup for this file.
1953 Note that we never free this memory since we need it till the
1954 convert_all_links() call, which is one of the last things the
1955 program does before terminating. BTW, I'm not sure if it would be
1956 safe to just set 'converted_file_ptr->string' to 'file' below,
1957 rather than making a copy of the string... Another note is that I
1958 thought I could just add a field to the urlpos structure saying
1959 that we'd written a .orig file for this URL, but that didn't work,
1960 so I had to make this separate list.
1961 -- Dan Harkless <wget@harkless.org>
1963 This [adding a field to the urlpos structure] didn't work
1964 because convert_file() is called from convert_all_links at
1965 the end of the retrieval with a freshly built new urlpos
1967 -- Hrvoje Niksic <hniksic@arsdigita.com>
/* Push FILE onto the remembered list (newest first). */
1969 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1970 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1971 converted_file_ptr->next = converted_files;
1972 converted_files = converted_file_ptr;
1976 static int find_fragment PARAMS ((const char *, int, const char **,
1979 /* Replace an attribute's original text with NEW_TEXT. */
/* Writes the replacement attribute value to FP — reusing the original
   quoting character when the old value was quoted — and preserves any
   trailing fragment identifier ("#...") from the old value.  Returns a
   pointer past the consumed input (per the `p = replace_attr (...)`
   call sites above).  NOTE(review): the `static const char *` line,
   braces and some statements are elided from this extract. */
1982 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
1985 char quote_char = '\"'; /* use "..." for quoting, unless the
1986 original value is quoted, in which
1987 case reuse its quoting char. */
1988 const char *frag_beg, *frag_end;
1990 /* Structure of our string is:
1991 "...old-contents..."
1992 <--- size ---> (with quotes)
1995 <--- size --> (no quotes) */
1997 if (*p == '\"' || *p == '\'')
2002 size -= 2; /* disregard opening and closing quote */
2004 putc (quote_char, fp);
2005 fputs (new_text, fp);
2007 /* Look for fragment identifier, if any. */
2008 if (find_fragment (p, size, &frag_beg, &frag_end))
2009 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2013 putc (quote_char, fp);
2018 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2019 preceded by '&'. If the character is not found, return zero. If
2020 the character is found, return 1 and set BP and EP to point to the
2021 beginning and end of the region.
2023 This is used for finding the fragment indentifiers in URLs. */
/* NOTE(review): the `static int` line, braces, and the entire loop
   body (the '#'/'&' scan and the BP/EP assignments described above)
   are elided from this extract — only the scan skeleton is visible. */
2026 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2028 const char *end = beg + size;
2030 for (; beg < end; beg++)
2052 /* The idea here was to quote ? as %3F to avoid passing part of the
2053 file name as the parameter when browsing the converted file through
2054 HTTP. However, actually doing that breaks local browsing because
2055 "index.html%3Ffoo=bar" isn't even recognized as an HTML file!
2056 Perhaps this should be controlled by an option, but for now I'm
2057 leaving the question marks.
2059 This is the original docstring of this function:
2061 FILE should be a relative link to a local file. It should be
2062 quoted as HTML because it will be used in HTML context. However,
2063 we need to quote ? as %3F to avoid passing part of the file name as
2064 the parameter. (This is not a problem when viewing locally, but is
2065 if the downloaded and converted tree is served by an HTTP
2068 /* Quote string as HTML. */
/* NOTE(review): per the comment above, the active behavior is simply
   html_quote_string(file) — the unconditional return below.  The code
   after it (the '?'→"%3F" rewriting) looks unreachable as shown;
   it is presumably disabled by a preprocessor conditional elided from
   this extract — confirm against the full source. */
2071 local_quote_string (const char *file)
2073 return html_quote_string (file);
2076 const char *file_sans_qmark;
2077 int qm = count_char (file, '?');
2081 const char *from = file;
2084 /* qm * 2 because we replace each question mark with "%3F",
2085 i.e. replace one char with three, hence two more. */
2086 int fsqlen = strlen (file) + qm * 2;
/* alloca: the expanded name lives on the stack only for the duration
   of this call; html_quote_string below returns a fresh string. */
2088 to = newname = (char *)alloca (fsqlen + 1);
2089 for (; *from; from++)
2100 assert (to - newname == fsqlen);
2103 file_sans_qmark = newname;
2106 file_sans_qmark = file;
2108 return html_quote_string (file_sans_qmark);
2112 /* We're storing "modes" of type downloaded_file_t in the hash table.
2113 However, our hash tables only accept pointers for keys and values.
2114 So when we need a pointer, we use the address of a
2115 downloaded_file_t variable of static storage. */
/* Maps each downloaded_file_t enumerator to the address of a static
   variable holding that value, so the value can be stored in a
   pointer-keyed hash table.  NOTE(review): the `return &vN;` bodies of
   the switch cases (and the switch header/braces) are elided from this
   extract. */
2117 static downloaded_file_t *
2118 downloaded_mode_to_ptr (downloaded_file_t mode)
2120 static downloaded_file_t
2121 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2122 v2 = FILE_DOWNLOADED_NORMALLY,
2123 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2124 v4 = CHECK_FOR_FILE;
2128 case FILE_NOT_ALREADY_DOWNLOADED:
2130 case FILE_DOWNLOADED_NORMALLY:
2132 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2134 case CHECK_FOR_FILE:
2140 /* This should really be merged with dl_file_url_map and
2141 downloaded_html_files in recur.c. This was originally a list, but
2142 I changed it to a hash table beause it was actually taking a lot of
2143 time to find things in it. */
/* Keys are local file names (xstrdup'ed), values are pointers obtained
   from downloaded_mode_to_ptr above.  Lazily created on first record. */
2145 static struct hash_table *downloaded_files_hash;
2147 /* Remembers which files have been downloaded. In the standard case, should be
2148 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2149 download successfully (i.e. not for ones we have failures on or that we skip
2152 When we've downloaded a file and tacked on a ".html" extension due to -E,
2153 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2154 FILE_DOWNLOADED_NORMALLY.
2156 If you just want to check if a file has been previously added without adding
2157 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2158 with local filenames, not remote URLs. */
/* NOTE(review): return type, braces, an assert and one or two returns
   are elided from this extract.  CHECK_FOR_FILE queries without
   inserting; any other mode inserts FILE if not already present and
   reports whether it was new. */
2160 downloaded_file (downloaded_file_t mode, const char *file)
2162 downloaded_file_t *ptr;
2164 if (mode == CHECK_FOR_FILE)
/* No hash yet means nothing has been recorded at all. */
2166 if (!downloaded_files_hash)
2167 return FILE_NOT_ALREADY_DOWNLOADED;
2168 ptr = hash_table_get (downloaded_files_hash, file);
2170 return FILE_NOT_ALREADY_DOWNLOADED;
2174 if (!downloaded_files_hash)
2175 downloaded_files_hash = make_string_hash_table (0);
2177 ptr = hash_table_get (downloaded_files_hash, file);
2181 ptr = downloaded_mode_to_ptr (mode);
/* NOTE(review): `&ptr` stores the address of the local variable PTR
   rather than the pointer value itself — in the canonical source this
   line stores `ptr`, not `&ptr`; verify whether this is an extraction
   artifact or a genuine bug before relying on lookups of this entry. */
2182 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2184 return FILE_NOT_ALREADY_DOWNLOADED;
/* Hash-table mapper used by downloaded_files_free below to release
   each entry.  NOTE(review): the return-type line and the entire body
   (presumably freeing the xstrdup'ed key) are elided from this
   extract. */
2188 df_free_mapper (void *key, void *value, void *ignored)
2195 downloaded_files_free (void)
2197 if (downloaded_files_hash)
2199 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2200 hash_table_destroy (downloaded_files_hash);
2201 downloaded_files_hash = NULL;