2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
47 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
49 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
51 static int urlpath_length PARAMS ((const char *));
59 /* Supported schemes: */
60 static struct scheme_data supported_schemes[] =
62 { "http://", DEFAULT_HTTP_PORT },
64 { "https://", DEFAULT_HTTPS_PORT },
66 { "ftp://", DEFAULT_FTP_PORT },
72 static char *construct_relative PARAMS ((const char *, const char *));
75 /* Support for encoding and decoding of URL strings. We determine
76 whether a character is unsafe through static table lookup. This
77 code assumes ASCII character set and 8-bit chars. */
84 #define R urlchr_reserved
85 #define U urlchr_unsafe
88 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
90 /* rfc1738 reserved chars, preserved from encoding. */
92 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
94 /* rfc1738 unsafe chars, plus some more. */
96 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
98 const static unsigned char urlchr_table[256] =
100 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
101 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
102 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
103 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
104 U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
105 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
106 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
107 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
108 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
109 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
110 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
111 0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
112 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
113 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
114 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
115 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
117 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
118 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
119 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
120 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
122 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
123 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
124 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
125 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
128 /* Decodes the forms %xy in a URL to the character the hexadecimal
129 code of which is xy. xy are hexadecimal digits from
130 [0123456789ABCDEF] (case-insensitive). If x or y are not
131 hex-digits or `%' precedes `\0', the sequence is inserted
/* NOTE(review): fragment of decode_string() -- decodes %XY escapes in
   place using a tortoise/hare two-pointer copy.  The enclosing loop,
   braces, and the passthrough/advance lines are missing from this
   extract; comments below describe only what is visible.  */
135 decode_string (char *s)
137   char *t = s;			/* t - tortoise */
138   char *h = s;			/* h - hare */
149 	  /* Do nothing if '%' is not followed by two hex digits. */
150 	  if (!*(h + 1) || !*(h + 2)
151 	      || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
              /* Reassemble the byte from the two hex nibbles after '%'. */
153 	  *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
160 /* Like encode_string, but return S if there are no unsafe chars. */
/* NOTE(review): fragment of encode_string_maybe() -- returns S itself
   when no unsafe chars are present, otherwise a freshly malloc'ed copy
   with each unsafe char expanded to %XX.  First pass counts, second
   pass writes; several lines (second-pass loop header, the early
   return, the final return) are missing from this extract.  */
163 encode_string_maybe (const char *s)
      /* Pass 1: count how many extra characters encoding will need.  */
170   for (p1 = s; *p1; p1++)
171     if (UNSAFE_CHAR (*p1))
172       addition += 2;	/* Two more characters (hex digits) */
177   newlen = (p1 - s) + addition;
178   newstr = (char *)xmalloc (newlen + 1);
      /* Pass 2 (loop header not visible here): emit %XX for each unsafe
         byte, copy everything else through.  */
184       if (UNSAFE_CHAR (*p1))
186 	  unsigned char c = *p1++;
188 	  *p2++ = XDIGIT_TO_XCHAR (c >> 4);
189 	  *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
195   assert (p2 - newstr == newlen);
200 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
201 given string, returning a malloc-ed %XX encoded string. */
/* NOTE(review): fragment -- like encode_string_maybe(), but always
   returns malloc'ed storage (presumably xstrdup's S when no encoding
   happened; the remainder of the body is not visible here).  */
204 encode_string (const char *s)
206   char *encoded = encode_string_maybe (s);
213 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
214 the old value of PTR is freed and PTR is made to point to the newly
215 allocated storage. */
217 #define ENCODE(ptr) do { \
218 char *e_new = encode_string_maybe (ptr); \
226 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
228 /* Decide whether to encode, decode, or pass through the char at P.
229 This used to be a macro, but it got a little too convoluted. */
/* NOTE(review): fragment of decide_copy_method() -- classifies the char
   at P as CM_DECODE, CM_ENCODE, or CM_PASSTHROUGH for reencode_string's
   conflated encode/decode pass.  The leading "if (*p == '%')" test and
   the CM_DECODE/CM_ENCODE returns are missing from this extract.  */
230 static inline enum copy_method
231 decide_copy_method (const char *p)
235       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
237 	  /* %xx sequence: decode it, unless it would decode to an
238 	     unsafe or a reserved char; in that case, leave it as
              /* PREEMPT is the byte the %xx sequence would decode to.  */
240 	  char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
241 	    XCHAR_TO_XDIGIT (*(p + 2));
243 	  if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
244 	    return CM_PASSTHROUGH;
249       /* Garbled %.. sequence: encode `%'. */
252   else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
255   return CM_PASSTHROUGH;
258 /* Translate a %-quoting (but possibly non-conformant) input string S
259 into a %-quoting (and conformant) output string. If no characters
260 are encoded or decoded, return the same string S; otherwise, return
261 a freshly allocated string with the new contents.
263 After a URL has been run through this function, the protocols that
264 use `%' as the quote character can use the resulting string as-is,
265 while those that don't call decode_string() to get to the intended
266 data. This function is also stable: after an input string is
267 transformed the first time, all further transformations of the
268 result yield the same result string.
270 Let's discuss why this function is needed.
272 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
273 space character would mess up the HTTP request, it needs to be
276 GET /abc%20def HTTP/1.0
278 So it appears that the unsafe chars need to be quoted, as with
279 encode_string. But what if we're requested to download
280 `abc%20def'? Remember that %-encoding is valid URL syntax, so what
281 the user meant was a literal space, and he was kind enough to quote
282 it. In that case, Wget should obviously leave the `%20' as is, and
283 send the same request as above. So in this case we may not call
286 But what if the requested URI is `abc%20 def'? If we call
287 encode_string, we end up with `/abc%2520%20def', which is almost
288 certainly not intended. If we don't call encode_string, we are
289 left with the embedded space and cannot send the request. What the
290 user meant was for Wget to request `/abc%20%20def', and this is
291 where reencode_string kicks in.
293 Wget used to solve this by first decoding %-quotes, and then
294 encoding all the "unsafe" characters found in the resulting string.
295 This was wrong because it didn't preserve certain URL special
296 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
297 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
298 whether we considered `+' reserved (it is). One of these results
299 is inevitable because by the second step we would lose information
300 on whether the `+' was originally encoded or not. Both results
301 were wrong because in CGI parameters + means space, while %2B means
302 literal plus. reencode_string correctly translates the above to
303 "a%2B+b", i.e. returns the original string.
305 This function uses an algorithm proposed by Anon Sricharoenchai:
307 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
310 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
313 ...except that this code conflates the two steps, and decides
314 whether to encode, decode, or pass through each character in turn.
315 The function still uses two passes, but their logic is the same --
316 the first pass exists merely for the sake of allocation. Another
317 small difference is that we include `+' to URL_RESERVED.
321 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
323 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
327 "foo bar" -> "foo%20bar"
328 "foo%20bar" -> "foo%20bar"
329 "foo %20bar" -> "foo%20%20bar"
330 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
331 "foo%25%20bar" -> "foo%25%20bar"
332 "foo%2%20bar" -> "foo%252%20bar"
333 "foo+bar" -> "foo+bar" (plus is reserved!)
334 "foo%2b+bar" -> "foo%2b+bar" */
/* NOTE(review): fragment of reencode_string() -- normalizes %-quoting
   in S per the long commentary above (Sricharoenchai algorithm).
   Returns S unchanged when nothing needs encoding or decoding,
   otherwise a fresh malloc'ed string.  Switch-case labels, OLDLEN's
   computation, and the final return are missing from this extract.  */
337 reencode_string (const char *s)
343   int encode_count = 0;
344   int decode_count = 0;
346   /* First, pass through the string to see if there's anything to do,
347      and to calculate the new length.  */
348   for (p1 = s; *p1; p1++)
350       switch (decide_copy_method (p1))
363   if (!encode_count && !decode_count)
364     /* The string is good as it is. */
365     return (char *)s;		/* C const model sucks. */
368   /* Each encoding adds two characters (hex digits), while each
369      decoding removes two characters.  */
370   newlen = oldlen + 2 * (encode_count - decode_count);
371   newstr = xmalloc (newlen + 1);
      /* Second pass: same classification, this time emitting output.  */
378       switch (decide_copy_method (p1))
              /* CM_ENCODE arm: expand the byte to %XX.  */
382 	    unsigned char c = *p1++;
384 	    *p2++ = XDIGIT_TO_XCHAR (c >> 4);
385 	    *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
              /* CM_DECODE arm: collapse %xx to the raw byte.  */
389 	    *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
390 		     + (XCHAR_TO_XDIGIT (*(p1 + 2))));
391 	    p1 += 3;		/* skip %xx */
398   assert (p2 - newstr == newlen);
402 /* Run PTR_VAR through reencode_string. If a new string is consed,
403 free PTR_VAR and make it point to the new storage. Obviously,
404 PTR_VAR needs to be an lvalue. */
406 #define REENCODE(ptr_var) do { \
407 char *rf_new = reencode_string (ptr_var); \
408 if (rf_new != ptr_var) \
415 /* Returns the scheme type if the scheme is supported, or
416 SCHEME_INVALID if not. */
/* NOTE(review): fragment -- linear, case-insensitive prefix match of
   URL against supported_schemes[]; the array index doubles as the
   enum url_scheme value, so the two orderings must stay in sync.  */
418 url_scheme (const char *url)
422   for (i = 0; supported_schemes[i].leading_string; i++)
423     if (!strncasecmp (url, supported_schemes[i].leading_string,
424 		      strlen (supported_schemes[i].leading_string)))
425       return (enum url_scheme)i;
426   return SCHEME_INVALID;
429 /* Return the number of characters needed to skip the scheme part of
430 the URL, e.g. `http://'. If no scheme is found, returns 0. */
/* NOTE(review): fragment of url_skip_scheme() -- the ':' check between
   the name scan and the "//" skip is missing from this extract.  */
432 url_skip_scheme (const char *url)
436   /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
438   while (ISALNUM (*p) || *p == '-' || *p == '+')
445   /* Skip "//" if found.  */
446   if (*p == '/' && *(p + 1) == '/')
452 /* Returns 1 if the URL begins with a scheme (supported or
453 unsupported), 0 otherwise. */
/* NOTE(review): fragment -- same alnum/-/+ scan as url_skip_scheme();
   presumably followed by a check for ':' (not visible here).  */
455 url_has_scheme (const char *url)
458   while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Map a (valid, non-SCHEME_INVALID) scheme to its default port.  */
464 scheme_default_port (enum url_scheme scheme)
466   return supported_schemes[scheme].default_port;
469 /* Skip the username and password, if present here. The function
470 should be called *not* with the complete URL, but with the part
471 right after the scheme.
473 If no username and password are found, return 0. */
/* NOTE(review): fragment -- finds the first of "/?@"; the logic that
   decides whether the '@' actually terminates a user:pass section is
   missing from this extract.  */
475 url_skip_uname (const char *url)
479   /* Look for '@' that comes before '/' or '?'. */
480   p = (const char *)strpbrk (url, "/?@");
/* NOTE(review): fragment of parse_uname() -- splits "user[:passwd]" of
   length LEN into two malloc'ed strings.  The empty-name checks' bodies
   and the success return are missing from this extract.  Note LEN is
   shortened to the colon position before *user is copied (line not
   visible here) -- confirm against upstream.  */
488 parse_uname (const char *str, int len, char **user, char **passwd)
493   /* Empty user name not allowed.  */
496   colon = memchr (str, ':', len);
498       /* Empty user name again.  */
503 	int pwlen = len - (colon + 1 - str);
504 	*passwd = xmalloc (pwlen + 1);
505 	memcpy (*passwd, colon + 1, pwlen);
506 	(*passwd)[pwlen] = '\0';
512   *user = xmalloc (len + 1);
513   memcpy (*user, str, len);
519 /* Used by main.c: detect URLs written using the "shorthand" URL forms
520 popularized by Netscape and NcFTP. HTTP shorthands look like this:
522 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
523 www.foo.com[:port] -> http://www.foo.com[:port]
525 FTP shorthands look like this:
527 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
528 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
530 If the URL needs not or cannot be rewritten, return NULL. */
/* NOTE(review): fragment of rewrite_shorthand_url() -- expands
   Netscape/NcFTP shorthands per the comment above (host:port/... ->
   http://, host:dir/... -> ftp://).  Several branch bodies and both
   "return res" statements are missing from this extract.  */
532 rewrite_shorthand_url (const char *url)
536   if (url_has_scheme (url))
539   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
541   for (p = url; *p && *p != ':' && *p != '/'; p++)
549       const char *pp, *path;
551       /* If the characters after the colon and before the next slash
552 	 or end of string are all digits, it's HTTP.  */
554       for (pp = p + 1; ISDIGIT (*pp); pp++)
557 	  && (*pp == '/' || *pp == '\0'))
560 	  /* Prepend "ftp://" to the entire URL... */
562 	  res = xmalloc (6 + strlen (url) + 1);
563 	  sprintf (res, "ftp://%s", url);
564 	  /* ...and replace ':' with '/'. */
565 	  res[6 + (p - url)] = '/';
572       /* Just prepend "http://" to what we have. */
573       res = xmalloc (7 + strlen (url) + 1);
574       sprintf (res, "http://%s", url);
579 static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk(), but return a pointer to S's terminating '\0' instead
   of NULL when no ACCEPT character occurs.  (The NULL check between
   these lines is missing from this extract.)  */
582 strpbrk_or_eos (const char *s, const char *accept)
584   char *p = strpbrk (s, accept);
586     p = (char *)s + strlen (s);
590 /* Turn STR into lowercase; return non-zero if a character was
/* NOTE(review): fragment -- lowercases STR in place; per the comment
   above it returns non-zero iff any character changed (the loop header
   and the change-tracking lines are not visible here).  */
594 lowercase_str (char *str)
601       *str = TOLOWER (*str);
606 static char *parse_errors[] = {
607 #define PE_NO_ERROR 0
609 #define PE_UNRECOGNIZED_SCHEME 1
610 "Unrecognized scheme",
611 #define PE_EMPTY_HOST 2
613 #define PE_BAD_PORT_NUMBER 3
615 #define PE_INVALID_USER_NAME 4
619 #define SETERR(p, v) do { \
626 Return a new struct url if successful, NULL on error. In case of
627 error, and if ERROR is not NULL, also set *ERROR to the appropriate
/* NOTE(review): fragment of url_parse() -- the central URL parser.
   Breaks URL_ENCODED (the reencode_string'd copy of URL) into scheme,
   optional user:pass, host, optional :port, then [/path][;params]
   [?query][#fragment], and fills a freshly allocated struct url.
   Returns NULL on error (setting *ERROR via SETERR).  Many statements
   (the error-cleanup gotos, the b/e marker assignments between the
   strpbrk_or_eos calls, u->scheme/u->port stores) are missing from
   this extract -- comments below cover only the visible lines.  */
630 url_parse (const char *url, int *error)
634   int path_modified, host_modified;
636   enum url_scheme scheme;
638   const char *uname_b,    *uname_e;
639   const char *host_b,     *host_e;
640   const char *path_b,     *path_e;
641   const char *params_b,   *params_e;
642   const char *query_b,    *query_e;
643   const char *fragment_b, *fragment_e;
646   char *user = NULL, *passwd = NULL;
650   scheme = url_scheme (url);
651   if (scheme == SCHEME_INVALID)
653       SETERR (error, PE_UNRECOGNIZED_SCHEME);
      /* Work on a %-normalized copy; may alias URL if nothing changed.  */
657   url_encoded = reencode_string (url);
660   p += strlen (supported_schemes[scheme].leading_string);
662   p += url_skip_uname (p);
665   /* scheme://user:pass@host[:port]... */
668   /* We attempt to break down the URL into the components path,
669      params, query, and fragment.  They are ordered like this:
671        scheme://host[:port][/path][;params][?query][#fragment]  */
673   params_b   = params_e   = NULL;
674   query_b    = query_e    = NULL;
675   fragment_b = fragment_e = NULL;
678   p = strpbrk_or_eos (p, ":/;?#");
681   if (host_b == host_e)
683       SETERR (error, PE_EMPTY_HOST);
687   port = scheme_default_port (scheme);
690       const char *port_b, *port_e, *pp;
692       /* scheme://host:port/tralala */
696       p = strpbrk_or_eos (p, "/;?#");
699       if (port_b == port_e)
701 	  /* http://host:/whatever */
703 	  SETERR (error, PE_BAD_PORT_NUMBER);
          /* Manual decimal conversion so garbage can be rejected.  */
707       for (port = 0, pp = port_b; pp < port_e; pp++)
711 	      /* http://host:12randomgarbage/blah */
713 	      SETERR (error, PE_BAD_PORT_NUMBER);
716 	  port = 10 * port + (*pp - '0');
      /* Successive strpbrk_or_eos calls peel off path, params, query,
         fragment in that fixed order.  */
724   p = strpbrk_or_eos (p, ";?#");
729   /* Path is not allowed not to exist. */
737       p = strpbrk_or_eos (p, "?#");
744       p = strpbrk_or_eos (p, "#");
756   if (uname_b != uname_e)
758       /* http://user:pass@host */
760       /*      uname_b       uname_e */
761       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
763 	  SETERR (error, PE_INVALID_USER_NAME);
768   u = (struct url *)xmalloc (sizeof (struct url));
769   memset (u, 0, sizeof (*u));
772   u->host = strdupdelim (host_b, host_e);
777   u->path = strdupdelim (path_b, path_e);
778   path_modified = path_simplify (u->path);
779   parse_path (u->path, &u->dir, &u->file);
781   host_modified = lowercase_str (u->host);
784     u->params = strdupdelim (params_b, params_e);
786     u->query = strdupdelim (query_b, query_e);
788     u->fragment = strdupdelim (fragment_b, fragment_e);
791   if (path_modified || u->fragment || host_modified)
793       /* If path_simplify modified the path, or if a fragment is
794 	 present, or if the original host name had caps in it, make
795 	 sure that u->url is equivalent to what would be printed by
797       u->url = url_string (u, 0);
799       if (url_encoded != url)
800 	xfree ((char *) url_encoded);
      /* Otherwise reuse URL_ENCODED as u->url (dup only if it still
         aliases the caller's string).  */
804       if (url_encoded == url)
805 	u->url = xstrdup (url);
807 	u->url = url_encoded;
/* Translate an url_parse() PE_* error code to its message string.  */
815 url_error (int error_code)
817   assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
818   return parse_errors[error_code];
/* NOTE(review): fragment of parse_path() -- splits the %-decoded copy
   of QUOTED_PATH at its last '/' into *DIR and *FILE (both freshly
   allocated; *DIR is "" when there is no slash -- the no-slash branch's
   *dir assignment is not visible here; confirm upstream).  */
822 parse_path (const char *quoted_path, char **dir, char **file)
824   char *path, *last_slash;
826   STRDUP_ALLOCA (path, quoted_path);
827   decode_string (path);
829   last_slash = strrchr (path, '/');
833       *file = xstrdup (path);
837       *dir = strdupdelim (path, last_slash);
838       *file = xstrdup (last_slash + 1);
842 /* Note: URL's "full path" is the path with the query string and
843 params appended. The "fragment" (#foo) is intentionally ignored,
844 but that might be changed. For example, if the original URL was
845 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
846 the full path will be "/foo/bar/baz;bullshit?querystring". */
848 /* Return the length of the full path, without the terminating
/* NOTE(review): fragment -- length of "/path[;params][?query]" without
   the terminator; the FROB applications and return are not visible.  */
852 full_path_length (const struct url *url)
856 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
867 /* Write out the full path. */
/* NOTE(review): fragment -- writes the full path into WHERE, which the
   caller must have sized with full_path_length(); the separator-char
   write and the FROB applications are missing from this extract.  */
870 full_path_write (const struct url *url, char *where)
872 #define FROB(el, chr) do {			\
873   char *f_el = url->el;				\
875       int l = strlen (f_el);			\
877       memcpy (where, f_el, l);			\
889 /* Public function for getting the "full path". E.g. if u->path is
890 "foo/bar" and u->query is "param=value", full_path will be
891 "/foo/bar?param=value". */
/* Return a malloc'ed "full path" ("/dir/file[;params][?query]") for
   URL; the caller owns (and frees) the result.  */
894 url_full_path (const struct url *url)
896   int length = full_path_length (url);
897   char *full_path = (char *)xmalloc(length + 1);
899   full_path_write (url, full_path);
900   full_path[length] = '\0';
905 /* Sync u->path and u->url with u->dir and u->file. */
/* NOTE(review): fragment of sync_path() -- rebuilds url->path as
   "dir/file" (or just "file" when dir is empty) after a mutator
   changed dir/file, then regenerates url->url.  The frees of the old
   path/url strings are not visible in this extract.  */
908 sync_path (struct url *url)
916       newpath = xstrdup (url->file);
921       int dirlen = strlen (url->dir);
922       int filelen = strlen (url->file);
924       newpath = xmalloc (dirlen + 1 + filelen + 1);
925       memcpy (newpath, url->dir, dirlen);
926       newpath[dirlen] = '/';
927       memcpy (newpath + dirlen + 1, url->file, filelen);
928       newpath[dirlen + 1 + filelen] = '\0';
934   /* Synchronize u->url. */
936   url->url = url_string (url, 0);
939 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
940 This way we can sync u->path and u->url when they get changed. */
/* Mutator: replace url->dir (presumably freeing the old value and
   calling sync_path() -- those lines are not visible here).  */
943 url_set_dir (struct url *url, const char *newdir)
946   url->dir = xstrdup (newdir);
/* Mutator: replace url->file (presumably freeing the old value and
   calling sync_path() -- those lines are not visible here).  */
951 url_set_file (struct url *url, const char *newfile)
954   url->file = xstrdup (newfile);
/* NOTE(review): fragment of url_free() -- releases the struct url and
   its owned strings; the frees of host/path/dir/file/url and of the
   struct itself are not visible in this extract.  */
959 url_free (struct url *url)
965   FREE_MAYBE (url->params);
966   FREE_MAYBE (url->query);
967   FREE_MAYBE (url->fragment);
968   FREE_MAYBE (url->user);
969   FREE_MAYBE (url->passwd);
/* NOTE(review): fragment of get_urls_file() -- reads FILE into memory,
   splits it into lines, trims whitespace, url_parse()s each line, and
   builds a linked list of struct urlpos (head returned; tail used for
   O(1) append -- the append statements are not visible here).  Invalid
   URLs are logged and skipped.  */
978 get_urls_file (const char *file)
980   struct file_memory *fm;
981   struct urlpos *head, *tail;
982   const char *text, *text_end;
985   fm = read_file (file);
988       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
991   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
994   text_end = fm->content + fm->length;
995   while (text < text_end)
997       const char *line_beg = text;
998       const char *line_end = memchr (text, '\n', text_end - text);
1000 	line_end = text_end;
          /* Trim leading, then trailing, whitespace.  */
1004       while (line_beg < line_end
1005 	     && ISSPACE (*line_beg))
1007       while (line_end > line_beg + 1
1008 	     && ISSPACE (*(line_end - 1)))
1010       if (line_end > line_beg)
1014 	  struct urlpos *entry;
1017 	  /* We must copy the URL to a zero-terminated string.  *sigh*. */
1018 	  url_text = strdupdelim (line_beg, line_end);
1019 	  url = url_parse (url_text, &up_error_code);
1022 	      logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1023 			 file, url_text, url_error (up_error_code));
1029 	  entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1030 	  memset (entry, 0, sizeof (*entry));
1041   read_file_free (fm);
1045 /* Free the linked list of urlpos. */
/* NOTE(review): fragment -- walks the urlpos list freeing each node
   (the loop header, url_free call, and free of L are not visible).  */
1047 free_urlpos (struct urlpos *l)
1051       struct urlpos *next = l->next;
1054       FREE_MAYBE (l->local_name);
1060 /* Rotate FNAME opt.backups times */
/* NOTE(review): fragment of rotate_backups() -- shifts FNAME.1 ..
   FNAME.(opt.backups-1) up by one and finally renames FNAME itself to
   FNAME.1; the rename() calls are not visible in this extract.  Uses
   alloca for the two scratch names, sized for the largest suffix.  */
1062 rotate_backups(const char *fname)
1064   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1065   char *from = (char *)alloca (maxlen);
1066   char *to = (char *)alloca (maxlen);
1070   if (stat (fname, &sb) == 0)
1071     if (S_ISREG (sb.st_mode) == 0)
      /* Rotate highest-numbered first so nothing is clobbered.  */
1074   for (i = opt.backups; i > 1; i--)
1076       sprintf (from, "%s.%d", fname, i - 1);
1077       sprintf (to, "%s.%d", fname, i);
1078       /* #### This will fail on machines without the rename() system
1083   sprintf (to, "%s.%d", fname, 1);
1087 /* Create all the necessary directories for PATH (a file). Calls
1088 mkdirhier() internally. */
/* NOTE(review): fragment of mkalldirs() -- creates every directory
   component of PATH.  Scans backward for the last '/', stats the
   directory prefix, and handles the CERN-server quirk described below
   by removing a same-named plain file.  Early returns and the unlink
   call are not visible in this extract.  */
1090 mkalldirs (const char *path)
1097   p = path + strlen (path);
1098   for (; *p != '/' && p != path; p--);
1099   /* Don't create if it's just a file.  */
1100   if ((p == path) && (*p != '/'))
1102   t = strdupdelim (path, p);
1103   /* Check whether the directory exists.  */
1104   if ((stat (t, &st) == 0))
1106       if (S_ISDIR (st.st_mode))
1113 	  /* If the dir exists as a file name, remove it first.  This
1114 	     is *only* for Wget to work with buggy old CERN http
1115 	     servers.  Here is the scenario: When Wget tries to
1116 	     retrieve a directory without a slash, e.g.
1117 	     http://foo/bar (bar being a directory), CERN server will
1118 	     not redirect it too http://foo/bar/ -- it will generate a
1119 	     directory listing containing links to bar/file1,
1120 	     bar/file2, etc.  Wget will lose because it saves this
1121 	     HTML listing to a file `bar', so it cannot create the
1122 	     directory.  To work around this, if the file of the same
1123 	     name exists, we just remove it and create the directory
1125 	  DEBUGP (("Removing %s because of directory danger!\n", t));
1129   res = make_directory (t);
1131     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1137 count_slashes (const char *s)
1146 /* Return the path name of the URL-equivalent file name, with a
1147 remote-like structure of directories. */
/* NOTE(review): fragment of mkstruct() -- builds the local "mirror"
   path for U: [dir_prefix/][host[.port]/]dir/file[?query], honoring
   opt.cut_dirs and opt.add_hostdir.  Several branch bodies, the FILE
   assignment from u->file, and the final return are not visible in
   this extract.  */
1149 mkstruct (const struct url *u)
1151   char *dir, *dir_preencoding;
1152   char *file, *res, *dirpref;
1153   char *query = u->query && *u->query ? u->query : NULL;
      /* --cut-dirs: drop the first CUT path components.  */
1158       char *ptr = u->dir + (*u->dir == '/');
1159       int slash_count = 1 + count_slashes (ptr);
1160       int cut = MINVAL (opt.cut_dirs, slash_count);
1161       for (; cut && *ptr; ptr++)
1164       STRDUP_ALLOCA (dir, ptr);
1167     dir = u->dir + (*u->dir == '/');
1169   /* Check for the true name (or at least a consistent name for saving
1170      to directory) of HOST, reusing the hlist if possible.  */
1171   if (opt.add_hostdir)
1173       /* Add dir_prefix and hostname (if required) to the beginning of
1175       dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1177 				+ 1 + numdigit (u->port)
1179       if (!DOTP (opt.dir_prefix))
1180 	sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1182 	strcpy (dirpref, u->host);
1184       if (u->port != scheme_default_port (u->scheme))
1186 	  int len = strlen (dirpref);
              /* Append ".port" when the port is non-default.  */
1188 	  long_to_string (dirpref + len + 1, u->port);
1191   else				/* not add_hostdir */
1193       if (!DOTP (opt.dir_prefix))
1194 	dirpref = opt.dir_prefix;
1199   /* If there is a prefix, prepend it.  */
1202       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1203       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
      /* Re-%-quote the assembled directory; free later only if a new
         string was actually consed.  */
1207   dir_preencoding = dir;
1208   dir = reencode_string (dir_preencoding);
1211   if (l && dir[l - 1] == '/')
1215     file = "index.html";
1219   /* Finally, construct the full name.  */
1220   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1221 			 + (query ? (1 + strlen (query)) : 0)
1223   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1227       strcat (res, query);
1229   if (dir != dir_preencoding)
1234 /* Compose a file name out of BASE, an unescaped file name, and QUERY,
1235 an escaped query string. The trick is to make sure that unsafe
1236 characters in BASE are escaped, and that slashes in QUERY are also
/* NOTE(review): fragment of compose_file_name() -- builds a local file
   name from BASE (unsafe chars %-escaped) and QUERY ('/' escaped so it
   cannot create directories), into a fixed-size on-stack RESULT buffer
   (declaration not visible here), truncating oversize input.  The
   signedness of the `to - result < sizeof (result)` comparisons
   depends on the invisible declarations -- confirm upstream.  */
1240 compose_file_name (char *base, char *query)
1246   /* Copy BASE to RESULT and encode all unsafe characters.  */
1248   while (*from && to - result < sizeof (result))
1250       if (UNSAFE_CHAR (*from))
1252 	  unsigned char c = *from++;
1254 	  *to++ = XDIGIT_TO_XCHAR (c >> 4);
1255 	  *to++ = XDIGIT_TO_XCHAR (c & 0xf);
1261   if (query && to - result < sizeof (result))
1265       /* Copy QUERY to RESULT and encode all '/' characters.  */
1267       while (*from && to - result < sizeof (result))
1281   if (to - result < sizeof (result))
1284     /* Truncate input which is too long, presumably due to a huge
1286     result[sizeof (result) - 1] = '\0';
1288   return xstrdup (result);
1291 /* Create a unique filename, corresponding to a given URL. Calls
1292 mkstruct if necessary. Does *not* actually create any directories. */
/* NOTE(review): fragment of url_filename() -- picks the local file
   name for U: mkstruct() when opt.dirstruct, else compose_file_name()
   on u->file/u->query, then prepends opt.dir_prefix and (when
   clobber-protection applies) uniquifies via unique_name().  Several
   returns and the WINDOWS '%'-replacement body are not visible.  */
1294 url_filename (const struct url *u)
1297   int have_prefix = 0;		/* whether we must prepend opt.dir_prefix */
1301       file = mkstruct (u);
1306       char *base = *u->file ? u->file : "index.html";
1307       char *query = u->query && *u->query ? u->query : NULL;
1308       file = compose_file_name (base, query);
1313   /* Check whether the prefix directory is something other than "."
1314      before prepending it.  */
1315   if (!DOTP (opt.dir_prefix))
1317       char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1318 				     + 1 + strlen (file) + 1);
1319       sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1324   /* DOS-ish file systems don't like `%' signs in them; we change it
1329       for (p = file; *p; p++)
1333 #endif /* WINDOWS */
1335   /* Check the cases in which the unique extensions are not used:
1336      1) Clobbering is turned off (-nc).
1337      2) Retrieval with regetting.
1338      3) Timestamping is used.
1339      4) Hierarchy is built.
1341      The exception is the case when file does exist and is a
1342      directory (actually support for bad httpd-s).  */
1343   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1344       && !(file_exists_p (file) && !file_non_directory_p (file)))
1347   /* Find a unique name.  */
1348   name = unique_name (file);
1353 /* Like strlen(), but allow the URL to be ended with '?'. */
/* Length of URL's path component: distance to the first of "?;#"
   (or the whole string).  The return is not visible in this extract.  */
1355 urlpath_length (const char *url)
1357   const char *q = strpbrk_or_eos (url, "?;#");
1361 /* Find the last occurrence of character C in the range [b, e), or
1362 NULL, if none are present. This is almost completely equivalent to
1363 { *e = '\0'; return strrchr(b); }, except that it doesn't change
1364 the contents of the string. */
1366 find_last_char (const char *b, const char *e, char c)
1374 /* Resolve the result of "linking" a base URI (BASE) to a
1375 link-specified URI (LINK).
1377 Either of the URIs may be absolute or relative, complete with the
1378 host name, or path only. This tries to behave "reasonably" in all
1379 foreseeable cases. It employs little specific knowledge about
1380 schemes or URL-specific stuff -- it just works on strings.
1382 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1383 See uri_merge for a gentler interface to this functionality.
1385 #### This function should handle `./' and `../' so that the evil
1386 path_simplify can go. */
/* NOTE(review): fragment of uri_merge_1() -- resolves LINK (length
   LINKLENGTH, possibly not NUL-terminated) against BASE, purely
   textually.  NO_SCHEME selects the relative-resolution path; when 0,
   LINK is simply duplicated.  Branch guards (the if (no_scheme), the
   empty-LINK test, the seen_slash_slash loop control) and the final
   return are missing from this extract.  */
1388 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
      /* END marks the end of BASE's path (before "?;#").  */
1394       const char *end = base + urlpath_length (base);
1398 	  /* Empty LINK points back to BASE, query string and all.  */
1399 	  constr = xstrdup (base);
1401       else if (*link == '?')
1403 	  /* LINK points to the same location, but changes the query
1404 	     string.  Examples: */
1405 	  /* uri_merge("path",         "?new") -> "path?new"     */
1406 	  /* uri_merge("path?foo",     "?new") -> "path?new"     */
1407 	  /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
1408 	  /* uri_merge("path#foo",     "?new") -> "path?new"     */
1409 	  int baselength = end - base;
1410 	  constr = xmalloc (baselength + linklength + 1);
1411 	  memcpy (constr, base, baselength);
1412 	  memcpy (constr + baselength, link, linklength);
1413 	  constr[baselength + linklength] = '\0';
1415       else if (*link == '#')
1417 	  /* uri_merge("path",         "#new") -> "path#new"     */
1418 	  /* uri_merge("path#foo",     "#new") -> "path#new"     */
1419 	  /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
1420 	  /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1422 	  const char *end1 = strchr (base, '#');
1424 	    end1 = base + strlen (base);
1425 	  baselength = end1 - base;
1426 	  constr = xmalloc (baselength + linklength + 1);
1427 	  memcpy (constr, base, baselength);
1428 	  memcpy (constr + baselength, link, linklength);
1429 	  constr[baselength + linklength] = '\0';
1431       else if (*link == '/')
1433 	  /* LINK is an absolute path: we need to replace everything
1434 	     after (and including) the FIRST slash with LINK.
1436 	     So, if BASE is "http://host/whatever/foo/bar", and LINK is
1437 	     "/qux/xyzzy", our result should be
1438 	     "http://host/qux/xyzzy".  */
1441 	  const char *start_insert = NULL; /* for gcc to shut up. */
1442 	  const char *pos = base;
1443 	  int seen_slash_slash = 0;
1444 	  /* We're looking for the first slash, but want to ignore
1447 	  slash = memchr (pos, '/', end - pos);
1448 	  if (slash && !seen_slash_slash)
1449 	    if (*(slash + 1) == '/')
1452 		seen_slash_slash = 1;
1456 	  /* At this point, SLASH is the location of the first / after
1457 	     "//", or the first slash altogether.  START_INSERT is the
1458 	     pointer to the location where LINK will be inserted.  When
1459 	     examining the last two examples, keep in mind that LINK
1462 	  if (!slash && !seen_slash_slash)
1463 	    /* example: "foo" */
1465 	    start_insert = base;
1466 	  else if (!slash && seen_slash_slash)
1467 	    /* example: "http://foo" */
1470 	  else if (slash && !seen_slash_slash)
1471 	    /* example: "foo/bar" */
1473 	    start_insert = base;
1474 	  else if (slash && seen_slash_slash)
1475 	    /* example: "http://something/" */
1477 	    start_insert = slash;
1479 	  span = start_insert - base;
1480 	  constr = (char *)xmalloc (span + linklength + 1);
1482 	    memcpy (constr, base, span);
1484 	    memcpy (constr + span, link, linklength);
1485 	  constr[span + linklength] = '\0';
1489 	  /* LINK is a relative URL: we need to replace everything
1490 	     after last slash (possibly empty) with LINK.
1492 	     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1493 	     our result should be "whatever/foo/qux/xyzzy".  */
1494 	  int need_explicit_slash = 0;
1496 	  const char *start_insert;
1497 	  const char *last_slash = find_last_char (base, end, '/');
1500 	      /* No slash found at all.  Append LINK to what we have,
1501 		 but we'll need a slash as a separator.
1503 		 Example: if base == "foo" and link == "qux/xyzzy", then
1504 		 we cannot just append link to base, because we'd get
1505 		 "fooqux/xyzzy", whereas what we want is
1508 		 To make sure the / gets inserted, we set
1509 		 need_explicit_slash to 1.  We also set start_insert
1510 		 to end + 1, so that the length calculations work out
1511 		 correctly for one more (slash) character.  Accessing
1512 		 that character is fine, since it will be the
1513 		 delimiter, '\0' or '?'.  */
1514 	      /* example: "foo?..." */
1515 	      /*               ^    ('?' gets changed to '/') */
1516 	      start_insert = end + 1;
1517 	      need_explicit_slash = 1;
1519 	  else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1521 	      /* example: http://host"  */
1523 	      start_insert = end + 1;
1524 	      need_explicit_slash = 1;
1528 	      /* example: "whatever/foo/bar" */
1530 	      start_insert = last_slash + 1;
1533 	  span = start_insert - base;
1534 	  constr = (char *)xmalloc (span + linklength + 1);
1536 	    memcpy (constr, base, span);
1537 	  if (need_explicit_slash)
1538 	    constr[span - 1] = '/';
1540 	    memcpy (constr + span, link, linklength);
1541 	  constr[span + linklength] = '\0';
1544   else /* !no_scheme */
          /* LINK already carries a scheme: it is absolute; ignore BASE.  */
1546       constr = strdupdelim (link, link + linklength);
1551 /* Merge BASE with LINK and return the resulting URI. This is an
1552 interface to uri_merge_1 that assumes that LINK is a
1553 zero-terminated string. */
/* Convenience wrapper: merge NUL-terminated LINK against BASE,
   treating LINK as relative unless it carries its own scheme.  */
1555 uri_merge (const char *base, const char *link)
1557   return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
1560 #define APPEND(p, s) do { \
1561 int len = strlen (s); \
1562 memcpy (p, s, len); \
1566 /* Use this instead of password when the actual password is supposed
1567 to be hidden. We intentionally use a generic string without giving
1568 away the number of characters in the password, like previous
   versions did.  */
1570 #define HIDDEN_PASSWORD "*password*"
1572 /* Recreate the URL string from the data in URL.
1574 If HIDE is non-zero (as it is when we're calling this on a URL we
1575 plan to print, but not when calling it to canonicalize a URL for
1576 use within the program), password will be hidden. Unsafe
1577 characters in the URL will be quoted. */
/* NOTE(review): returns a malloc'ed string built into RESULT below;
   the caller owns and must free it.  Several lines of this function
   (size terms, APPEND of separators, the return) are elided from this
   listing. */
1580 url_string (const struct url *url, int hide_password)
1584 char *quoted_user = NULL, *quoted_passwd = NULL;
1586 int scheme_port = supported_schemes[url->scheme].default_port;
1587 char *scheme_str = supported_schemes[url->scheme].leading_string;
1588 int fplen = full_path_length (url);
1590 assert (scheme_str != NULL);
1592 /* Make sure the user name and password are quoted. */
1595 quoted_user = encode_string_maybe (url->user);
/* When hiding, point at the static HIDDEN_PASSWORD literal -- it must
   never be freed; see the guarded xfree at the bottom. */
1599 quoted_passwd = HIDDEN_PASSWORD;
1601 quoted_passwd = encode_string_maybe (url->passwd);
/* Compute the exact output length up front so the assert below can
   verify that every byte was accounted for. */
1605 size = (strlen (scheme_str)
1606 + strlen (url->host)
/* Only emit ":port" when the port differs from the scheme default. */
1609 if (url->port != scheme_port)
1610 size += 1 + numdigit (url->port);
1613 size += 1 + strlen (quoted_user);
1615 size += 1 + strlen (quoted_passwd);
1618 p = result = xmalloc (size);
1620 APPEND (p, scheme_str);
1623 APPEND (p, quoted_user);
1627 APPEND (p, quoted_passwd);
1632 APPEND (p, url->host);
1633 if (url->port != scheme_port)
1636 long_to_string (p, url->port);
1640 full_path_write (url, p);
/* Sanity check: the writes above must exactly fill the buffer sized
   earlier. */
1644 assert (p - result == size);
/* encode_string_maybe may return its argument unchanged when no
   quoting was needed, so only free copies we actually own. */
1646 if (quoted_user && quoted_user != url->user)
1647 xfree (quoted_user);
/* The !hide_password guard also protects the static HIDDEN_PASSWORD
   string from being freed. */
1648 if (quoted_passwd && !hide_password
1649 && quoted_passwd != url->passwd)
1650 xfree (quoted_passwd);
1655 /* Returns proxy host address, in accordance with SCHEME. */
/* NOTE(review): falls back from command-line options to the
   corresponding environment variable; presumably returns NULL when no
   proxy is configured (the early-return line is elided from this
   listing).  The result may point into the static rewritten_storage
   buffer, so this function is not reentrant and a later call can
   clobber a previously returned value. */
1657 getproxy (enum url_scheme scheme)
1660 char *rewritten_url;
1661 static char rewritten_storage[1024];
1666 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1670 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1674 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1676 case SCHEME_INVALID:
1679 if (!proxy || !*proxy)
1682 /* Handle shorthands. */
1683 rewritten_url = rewrite_shorthand_url (proxy);
/* strncpy does not guarantee NUL termination; the next line adds it
   explicitly, silently truncating overlong rewritten URLs. */
1686 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1687 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1688 proxy = rewritten_storage;
1694 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns non-zero when HOST should go through the proxy, i.e. when
   HOST does not match any suffix listed in NO_PROXY. */
1696 no_proxy_match (const char *host, const char **no_proxy)
1701 return !sufmatch (no_proxy, host);
1704 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1705 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1707 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1708 const char *, int));
1709 static char *local_quote_string PARAMS ((const char *));
1711 /* Change the links in one HTML file. LINKS is a list of links in the
1712 document, along with their positions and the desired direction of
   the conversion.  */
/* NOTE(review): rewrites FILE in place -- reads it fully into memory,
   unlinks it, then re-creates it with converted links.  Many lines
   (braces, counters' increments, error returns) are elided from this
   listing. */
1715 convert_links (const char *file, struct urlpos *links)
1717 struct file_memory *fm;
1720 downloaded_file_t downloaded_file_return;
1722 struct urlpos *link;
1723 int to_url_count = 0, to_file_count = 0;
1725 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1728 /* First we do a "dry run": go through the list L and see whether
1729 any URL needs to be converted in the first place. If not, just
1730 leave the file alone. */
1732 struct urlpos *dry = links;
1733 for (dry = links; dry; dry = dry->next)
1734 if (dry->convert != CO_NOCONVERT)
1738 logputs (LOG_VERBOSE, _("nothing to do.\n"));
/* Read the whole file into memory (possibly mmaped). */
1743 fm = read_file (file);
1746 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1747 file, strerror (errno));
/* Optionally save the pristine file as *.orig before converting. */
1751 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1752 if (opt.backup_converted && downloaded_file_return)
1753 write_backup_file (file, downloaded_file_return);
1755 /* Before opening the file for writing, unlink the file. This is
1756 important if the data in FM is mmaped. In such case, nulling the
1757 file, which is what fopen() below does, would make us read all
1758 zeroes from the mmaped region. */
1759 if (unlink (file) < 0 && errno != ENOENT)
1761 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1762 file, strerror (errno));
1763 read_file_free (fm);
1766 /* Now open the file for writing. */
1767 fp = fopen (file, "wb");
1770 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1771 file, strerror (errno));
1772 read_file_free (fm);
1776 /* Here we loop through all the URLs in file, replacing those of
1777 them that are downloaded with relative references. */
1779 for (link = links; link; link = link->next)
/* Byte offset of this link's text within the in-memory copy. */
1781 char *url_start = fm->content + link->pos;
/* Defensive check: a position past EOF means the position data is
   inconsistent with the file contents. */
1783 if (link->pos >= fm->length)
1785 DEBUGP (("Something strange is going on. Please investigate."));
1788 /* If the URL is not to be converted, skip it. */
1789 if (link->convert == CO_NOCONVERT)
1791 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
1795 /* Echo the file contents, up to the offending URL's opening
1796 quote, to the outfile. */
1797 fwrite (p, 1, url_start - p, fp);
1800 switch (link->convert)
1802 case CO_CONVERT_TO_RELATIVE:
1803 /* Convert absolute URL to relative. */
1805 char *newname = construct_relative (file, link->local_name);
1806 char *quoted_newname = local_quote_string (newname);
/* <meta http-equiv=refresh> needs special treatment because the
   attribute also carries the timeout value. */
1808 if (!link->link_refresh_p)
1809 p = replace_attr (p, link->size, fp, quoted_newname);
1811 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
1812 link->refresh_timeout);
1814 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1815 link->url->url, newname, link->pos, file));
1817 xfree (quoted_newname);
1821 case CO_CONVERT_TO_COMPLETE:
1822 /* Convert the link to absolute URL. */
1824 char *newlink = link->url->url;
1825 char *quoted_newlink = html_quote_string (newlink);
1827 if (!link->link_refresh_p)
1828 p = replace_attr (p, link->size, fp, quoted_newlink);
1830 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
1831 link->refresh_timeout);
1833 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1834 newlink, link->pos, file));
1835 xfree (quoted_newlink);
1839 case CO_NULLIFY_BASE:
1840 /* Change the base href to "". */
1841 p = replace_attr (p, link->size, fp, "");
1849 /* Output the rest of the file. */
1850 if (p - fm->content < fm->length)
1851 fwrite (p, 1, fm->length - (p - fm->content), fp);
1853 read_file_free (fm);
/* Report how many links were converted to local files vs. to URLs. */
1855 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
1858 /* Construct and return a malloced copy of the relative link from two
1859 pieces of information: local name S1 of the referring file and
1860 local name S2 of the referred file.
1862 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1863 "jagor.srce.hr/images/news.gif", the function will return
1866 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1867 "fly.cc.fer.hr/images/fly.gif", the function will return
1868 "../images/fly.gif".
1870 Caveats: S1 should not begin with `/', unless S2 also begins with
1871 '/'. S1 should not contain things like ".." and such --
1872 construct_relative ("fly/ioccc/../index.html",
1873 "fly/images/fly.gif") will fail. (A workaround is to call
1874 something like path_simplify() on S1). */
/* NOTE(review): caller owns and frees the returned xmalloc'ed string.
   Several lines (the absolute-S2 test, CNT bookkeeping, loop bodies)
   are elided from this listing. */
1876 construct_relative (const char *s1, const char *s2)
1878 int i, cnt, sepdirs1;
/* If S2 is absolute, return a plain copy -- no relativization. */
1882 return xstrdup (s2);
1883 /* S1 should *not* be absolute, if S2 wasn't. */
1884 assert (*s1 != '/');
1886 /* Skip the directories common to both strings. */
1889 while (s1[i] && s2[i]
/* CNT presumably tracks the index just past the last common '/'
   (assignment lines elided) -- TODO confirm against full source. */
1894 if (s1[i] == '/' && s2[i] == '/')
/* Count the directory separators remaining in S1 after the common
   prefix; each one contributes a "../" below (loop body elided). */
1899 for (sepdirs1 = 0; s1[i]; i++)
1902 /* Now, construct the file as of:
1903 - ../ repeated sepdirs1 time
1904 - all the non-mutual directories of S2. */
1905 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1906 for (i = 0; i < sepdirs1; i++)
1907 memcpy (res + 3 * i, "../", 3);
1908 strcpy (res + 3 * i, s2 + cnt);
/* Save FILE to FILE.orig (or FILE with "html" replaced by "orig")
   before convert_links overwrites it.  Only called for files we
   actually downloaded; silently does nothing the second time it is
   called for the same FILE. */
1913 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1915 /* Rather than just writing over the original .html file with the
1916 converted version, save the former to *.orig. Note we only do
1917 this for files we've _successfully_ downloaded, so we don't
1918 clobber .orig files sitting around from previous invocations. */
1920 /* Construct the backup filename as the original name plus ".orig". */
1921 size_t filename_len = strlen(file);
1922 char* filename_plus_orig_suffix;
1923 boolean already_wrote_backup_file = FALSE;
1924 slist* converted_file_ptr;
1925 static slist* converted_files = NULL;
1927 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1929 /* Just write "orig" over "html". We need to do it this way
1930 because when we're checking to see if we've downloaded the
1931 file before (to see if we can skip downloading it), we don't
1932 know if it's a text/html file. Therefore we don't know yet
1933 at that stage that -E is going to cause us to tack on
1934 ".html", so we need to compare vs. the original URL plus
1935 ".orig", not the original URL plus ".html.orig". */
/* alloca: stack allocation, freed automatically on return. */
1936 filename_plus_orig_suffix = alloca (filename_len + 1);
1937 strcpy(filename_plus_orig_suffix, file);
/* The "- 4" overwrites the trailing "html" with "orig"; this assumes
   FILE really ends in ".html" (guaranteed by the -E code path that
   sets FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED) -- TODO confirm. */
1938 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig")
1940 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1942 /* Append ".orig" to the name. */
1943 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1944 strcpy(filename_plus_orig_suffix, file);
1945 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1948 /* We can get called twice on the same URL thanks to the
1949 convert_all_links() call in main(). If we write the .orig file
1950 each time in such a case, it'll end up containing the first-pass
1951 conversion, not the original file. So, see if we've already been
1952 called on this file. */
1953 converted_file_ptr = converted_files;
1954 while (converted_file_ptr != NULL)
1955 if (strcmp(converted_file_ptr->string, file) == 0)
1957 already_wrote_backup_file = TRUE;
1961 converted_file_ptr = converted_file_ptr->next;
1963 if (!already_wrote_backup_file)
1965 /* Rename <file> to <file>.orig before former gets written over. */
1966 if (rename(file, filename_plus_orig_suffix) != 0)
1967 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1968 file, filename_plus_orig_suffix, strerror (errno));
1970 /* Remember that we've already written a .orig backup for this file.
1971 Note that we never free this memory since we need it till the
1972 convert_all_links() call, which is one of the last things the
1973 program does before terminating. BTW, I'm not sure if it would be
1974 safe to just set 'converted_file_ptr->string' to 'file' below,
1975 rather than making a copy of the string... Another note is that I
1976 thought I could just add a field to the urlpos structure saying
1977 that we'd written a .orig file for this URL, but that didn't work,
1978 so I had to make this separate list.
1979 -- Dan Harkless <wget@harkless.org>
1981 This [adding a field to the urlpos structure] didn't work
1982 because convert_file() is called from convert_all_links at
1983 the end of the retrieval with a freshly built new urlpos
1985 -- Hrvoje Niksic <hniksic@arsdigita.com>
/* Prepend this file to the static list; intentionally never freed
   (see the comment above). */
1987 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1988 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1989 converted_file_ptr->next = converted_files;
1990 converted_files = converted_file_ptr;
1994 static int find_fragment PARAMS ((const char *, int, const char **,
1997 /* Replace an attribute's original text with NEW_TEXT. */
/* NOTE(review): P points at the attribute value (possibly quoted),
   SIZE is its length in the input.  Writes the quoted NEW_TEXT -- plus
   any fragment identifier preserved from the old value -- to FP, and
   presumably returns a pointer just past the old value so the caller
   can resume echoing the file (the return line is elided from this
   listing). */
2000 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2003 char quote_char = '\"'; /* use "..." for quoting, unless the
2004 original value is quoted, in which
2005 case reuse its quoting char. */
2006 const char *frag_beg, *frag_end;
2008 /* Structure of our string is:
2009 "...old-contents..."
2010 <--- size ---> (with quotes)
2013 <--- size --> (no quotes) */
2015 if (*p == '\"' || *p == '\'')
2020 size -= 2; /* disregard opening and closing quote */
2022 putc (quote_char, fp);
2023 fputs (new_text, fp);
2025 /* Look for fragment identifier, if any. */
2026 if (find_fragment (p, size, &frag_beg, &frag_end))
2027 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2031 putc (quote_char, fp);
2036 /* The same as REPLACE_ATTR, but used when replacing
2037 <meta http-equiv=refresh content="new_text"> because we need to
2038 append "timeout_value; URL=" before the next_text. */
/* NOTE(review): the alloca size expression continues on lines elided
   from this listing; it presumably adds room for "; URL=", NEW_TEXT
   and the terminating NUL to match the sprintf below -- confirm
   against the full source. */
2041 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2042 const char *new_text, int timeout)
2045 char *new_with_timeout = (char *)alloca (numdigit (timeout)
2049 sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2051 return replace_attr (p, size, fp, new_with_timeout);
2054 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2055 preceded by '&'. If the character is not found, return zero. If
2056 the character is found, return 1 and set BP and EP to point to the
2057 beginning and end of the region.
2059 This is used for finding the fragment identifiers in URLs. */
/* NOTE(review): the loop body (the '#'/'&' scan and the returns) is
   elided from this listing. */
2062 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2064 const char *end = beg + size;
2066 for (; beg < end; beg++)
2088 /* The idea here was to quote ? as %3F to avoid passing part of the
2089 file name as the parameter when browsing the converted file through
2090 HTTP. However, actually doing that breaks local browsing because
2091 "index.html%3Ffoo=bar" isn't even recognized as an HTML file!
2092 Perhaps this should be controlled by an option, but for now I'm
2093 leaving the question marks.
2095 This is the original docstring of this function:
2097 FILE should be a relative link to a local file. It should be
2098 quoted as HTML because it will be used in HTML context. However,
2099 we need to quote ? as %3F to avoid passing part of the file name as
2100 the parameter. (This is not a problem when viewing locally, but is
2101 if the downloaded and converted tree is served by an HTTP
   server.)  */
2104 /* Quote string as HTML. */
2107 local_quote_string (const char *file)
/* Per the comment above, question marks are deliberately left alone:
   we return immediately with plain HTML quoting. */
2109 return html_quote_string (file);
/* NOTE(review): everything below this return appears to be the old
   %3F-quoting implementation and looks unreachable -- unless an
   elided preprocessor conditional guards it.  Verify against the full
   source before touching. */
2112 const char *file_sans_qmark;
2113 int qm = count_char (file, '?');
2117 const char *from = file;
2120 /* qm * 2 because we replace each question mark with "%3F",
2121 i.e. replace one char with three, hence two more. */
2122 int fsqlen = strlen (file) + qm * 2;
2124 to = newname = (char *)alloca (fsqlen + 1);
2125 for (; *from; from++)
2136 assert (to - newname == fsqlen);
2139 file_sans_qmark = newname;
2142 file_sans_qmark = file;
2144 return html_quote_string (file_sans_qmark);
2148 /* We're storing "modes" of type downloaded_file_t in the hash table.
2149 However, our hash tables only accept pointers for keys and values.
2150 So when we need a pointer, we use the address of a
2151 downloaded_file_t variable of static storage. */
/* Map MODE to the address of the matching static variable below; the
   per-case return statements are elided from this listing. */
2153 static downloaded_file_t *
2154 downloaded_mode_to_ptr (downloaded_file_t mode)
2156 static downloaded_file_t
2157 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2158 v2 = FILE_DOWNLOADED_NORMALLY,
2159 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2160 v4 = CHECK_FOR_FILE;
2164 case FILE_NOT_ALREADY_DOWNLOADED:
2166 case FILE_DOWNLOADED_NORMALLY:
2168 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2170 case CHECK_FOR_FILE:
2176 /* This should really be merged with dl_file_url_map and
2177 downloaded_html_files in recur.c. This was originally a list, but
2178 I changed it to a hash table because it was actually taking a lot of
2179 time to find things in it. */
2181 static struct hash_table *downloaded_files_hash;
2183 /* Remembers which files have been downloaded. In the standard case, should be
2184 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2185 download successfully (i.e. not for ones we have failures on or that we skip
2188 When we've downloaded a file and tacked on a ".html" extension due to -E,
2189 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2190 FILE_DOWNLOADED_NORMALLY.
2192 If you just want to check if a file has been previously added without adding
2193 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2194 with local filenames, not remote URLs. */
2196 downloaded_file (downloaded_file_t mode, const char *file)
2198 downloaded_file_t *ptr;
/* Pure query: never create the hash table, never insert. */
2200 if (mode == CHECK_FOR_FILE)
2202 if (!downloaded_files_hash)
2203 return FILE_NOT_ALREADY_DOWNLOADED;
2204 ptr = hash_table_get (downloaded_files_hash, file);
2206 return FILE_NOT_ALREADY_DOWNLOADED;
/* Record mode: create the table lazily on first use. */
2210 if (!downloaded_files_hash)
2211 downloaded_files_hash = make_string_hash_table (0);
2213 ptr = hash_table_get (downloaded_files_hash, file);
2217 ptr = downloaded_mode_to_ptr (mode);
/* NOTE(review): `&ptr` stores the address of this stack-local pointer
   as the hash value, yet the lookup paths treat the stored value as a
   downloaded_file_t * (see CHECK_FOR_FILE above).  Upstream Wget
   passes plain `ptr` here -- this looks like a dangling-pointer bug or
   a transcription error; verify against the full source. */
2218 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2220 return FILE_NOT_ALREADY_DOWNLOADED;
/* Hash-table mapper used when tearing down downloaded_files_hash;
   presumably frees the strdup'ed KEY (the function body is elided
   from this listing -- confirm against the full source). */
2224 df_free_mapper (void *key, void *value, void *ignored)
2231 downloaded_files_free (void)
2233 if (downloaded_files_hash)
2235 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2236 hash_table_destroy (downloaded_files_hash);
2237 downloaded_files_hash = NULL;