2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* Is X exactly the one-character string "."?  */
#define DOTP(x) ((x)[0] == '.' && (x)[1] == '\0')
/* Is X exactly the two-character string ".."?  */
#define DDOTP(x) ((x)[0] == '.' && (x)[1] == '.' && (x)[2] == '\0')
50 static int urlpath_length PARAMS ((const char *));
54 enum url_scheme scheme;
59 /* Supported schemes: */
60 static struct scheme_data supported_schemes[] =
62 { SCHEME_HTTP, "http://", DEFAULT_HTTP_PORT },
64 { SCHEME_HTTPS, "https://", DEFAULT_HTTPS_PORT },
66 { SCHEME_FTP, "ftp://", DEFAULT_FTP_PORT }
69 static void parse_dir PARAMS ((const char *, char **, char **));
70 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
71 static char *construct_relative PARAMS ((const char *, const char *));
72 static char process_ftp_type PARAMS ((char *));
75 /* Support for encoding and decoding of URL strings. We determine
76 whether a character is unsafe through static table lookup. This
77 code assumes ASCII character set and 8-bit chars. */
84 #define R urlchr_reserved
85 #define U urlchr_unsafe
88 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
90 /* rfc1738 reserved chars. We don't use this yet; preservation of
91 reserved chars will be implemented when I integrate the new
92 `reencode_string' function. */
94 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
98 - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
99 - '@' and ':'; needed for encoding URL username and password.
100 - anything >= 127. */
102 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
104 const static unsigned char urlchr_table[256] =
106 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
107 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
108 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
109 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
110 U, 0, U, U, 0, U, R, 0, /* SP ! " # $ % & ' */
111 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
112 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
113 0, 0, U, R, U, R, U, R, /* 8 9 : ; < = > ? */
114 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
115 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
116 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
117 0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
118 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
119 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
120 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
121 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
123 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
124 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
125 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
126 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
128 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
129 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
130 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
131 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
134 /* Decodes the forms %xy in a URL to the character the hexadecimal
135 code of which is xy. xy are hexadecimal digits from
136 [0123456789ABCDEF] (case-insensitive). If x or y are not
137 hex-digits or `%' precedes `\0', the sequence is inserted
141 decode_string (char *s)
143 char *t = s; /* t - tortoise */
144 char *h = s; /* h - hare */
155 /* Do nothing if '%' is not followed by two hex digits. */
156 if (!*(h + 1) || !*(h + 2)
157 || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
159 *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
166 /* Like encode_string, but return S if there are no unsafe chars. */
169 encode_string_maybe (const char *s)
176 for (p1 = s; *p1; p1++)
177 if (UNSAFE_CHAR (*p1))
178 addition += 2; /* Two more characters (hex digits) */
183 newlen = (p1 - s) + addition;
184 newstr = (char *)xmalloc (newlen + 1);
190 if (UNSAFE_CHAR (*p1))
192 const unsigned char c = *p1++;
194 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
195 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
201 assert (p2 - newstr == newlen);
/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX encoded string.  Unlike
   encode_string_maybe, the caller always owns the returned storage.  */
char *
encode_string (const char *s)
{
  char *encoded = encode_string_maybe (s);

  if (encoded == s)
    /* Nothing needed encoding; duplicate so the result is always
       freshly allocated.  */
    return xstrdup (s);
  return encoded;
}
219 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
220 the old value of PTR is freed and PTR is made to point to the newly
221 allocated storage. */
223 #define ENCODE(ptr) do { \
224 char *e_new = encode_string_maybe (ptr); \
232 /* Returns the scheme type if the scheme is supported, or
233 SCHEME_INVALID if not. */
235 url_scheme (const char *url)
239 for (i = 0; i < ARRAY_SIZE (supported_schemes); i++)
240 if (!strncasecmp (url, supported_schemes[i].leading_string,
241 strlen (supported_schemes[i].leading_string)))
242 return supported_schemes[i].scheme;
243 return SCHEME_INVALID;
246 /* Return the number of characters needed to skip the scheme part of
247 the URL, e.g. `http://'. If no scheme is found, returns 0. */
249 url_skip_scheme (const char *url)
253 /* Skip the scheme name. We allow `-' and `+' because of `whois++',
255 while (ISALNUM (*p) || *p == '-' || *p == '+')
262 /* Skip "//" if found. */
263 if (*p == '/' && *(p + 1) == '/')
269 /* Returns 1 if the URL begins with a scheme (supported or
270 unsupported), 0 otherwise. */
272 url_has_scheme (const char *url)
275 while (ISALNUM (*p) || *p == '-' || *p == '+')
280 /* Skip the username and password, if present here. The function
281 should be called *not* with the complete URL, but with the part
282 right after the scheme.
284 If no username and password are found, return 0. */
286 url_skip_uname (const char *url)
289 const char *q = NULL;
290 for (p = url ; *p && *p != '/'; p++)
291 if (*p == '@') q = p;
292 /* If a `@' was found before the first occurrence of `/', skip
300 /* Allocate a new urlinfo structure, fill it with default values and
301 return a pointer to it. */
307 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
308 memset (u, 0, sizeof (*u));
309 u->scheme = SCHEME_INVALID;
313 /* Perform a "deep" free of the urlinfo structure. The structure
314 should have been created with newurl, but need not have been used.
315 If free_pointer is non-0, free the pointer itself. */
317 freeurl (struct urlinfo *u, int complete)
321 FREE_MAYBE (u->host);
322 FREE_MAYBE (u->path);
323 FREE_MAYBE (u->file);
325 FREE_MAYBE (u->user);
326 FREE_MAYBE (u->passwd);
327 FREE_MAYBE (u->local);
328 FREE_MAYBE (u->referer);
330 freeurl (u->proxy, 1);
336 enum url_parse_error {
337 PE_UNRECOGNIZED_SCHEME, PE_BAD_PORT
340 /* Extract the given URL of the form
341 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
342 1. hostname (terminated with `/' or `:')
343 2. port number (terminated with `/'), or chosen for the scheme
344 3. dirname (everything after hostname)
345 Most errors are handled. No allocation is done, you must supply
346 pointers to allocated memory.
347 ...and a host of other stuff :-)
349 - Recognizes hostname:dir/file for FTP and
350 hostname (:portnum)?/dir/file for HTTP.
351 - Parses the path to yield directory and file
352 - Parses the URL to yield the username and passwd (if present)
353 - Decodes the strings, in case they contain "forbidden" characters
354 - Writes the result to struct urlinfo
356 If the argument STRICT is set, it recognizes only the canonical
359 parseurl (const char *url, struct urlinfo *u, int strict)
362 int recognizable; /* Recognizable URL is the one where
363 the scheme was explicitly named,
364 i.e. it wasn't deduced from the URL
368 DEBUGP (("parseurl (\"%s\") -> ", url));
369 recognizable = url_has_scheme (url);
370 if (strict && !recognizable)
372 for (i = 0, l = 0; i < ARRAY_SIZE (supported_schemes); i++)
374 l = strlen (supported_schemes[i].leading_string);
375 if (!strncasecmp (supported_schemes[i].leading_string, url, l))
378 /* If scheme is recognizable, but unsupported, bail out, else
380 if (recognizable && i == ARRAY_SIZE (supported_schemes))
382 else if (i == ARRAY_SIZE (supported_schemes))
385 u->scheme = type = supported_schemes[i].scheme;
387 if (type == URLUNKNOWN)
389 /* Allow a username and password to be specified (i.e. just skip
392 l += url_skip_uname (url + l);
393 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
396 /* Get the hostname. */
397 u->host = strdupdelim (url + l, url + i);
398 DEBUGP (("host %s -> ", u->host));
400 /* Assume no port has been given. */
404 /* We have a colon delimiting the hostname. It could mean that
405 a port number is following it, or a directory. */
406 if (ISDIGIT (url[++i])) /* A port number */
408 if (type == URLUNKNOWN)
411 u->scheme = SCHEME_HTTP;
413 for (; url[i] && url[i] != '/'; i++)
414 if (ISDIGIT (url[i]))
415 u->port = 10 * u->port + (url[i] - '0');
420 DEBUGP (("port %hu -> ", u->port));
422 else if (type == URLUNKNOWN) /* or a directory */
425 u->scheme = SCHEME_FTP;
427 else /* or just a misformed port number */
430 else if (type == URLUNKNOWN)
433 u->scheme = SCHEME_HTTP;
438 for (ind = 0; ind < ARRAY_SIZE (supported_schemes); ind++)
439 if (supported_schemes[ind].scheme == u->scheme)
441 if (ind == ARRAY_SIZE (supported_schemes))
443 u->port = supported_schemes[ind].default_port;
445 /* Some delimiter troubles... */
446 if (url[i] == '/' && url[i - 1] != ':')
449 while (url[i] && url[i] == '/')
451 u->path = (char *)xmalloc (strlen (url + i) + 8);
452 strcpy (u->path, url + i);
455 u->ftp_type = process_ftp_type (u->path);
456 /* #### We don't handle type `d' correctly yet. */
457 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
459 DEBUGP (("ftp_type %c -> ", u->ftp_type));
461 DEBUGP (("opath %s -> ", u->path));
462 /* Parse the username and password (if existing). */
463 parse_uname (url, &u->user, &u->passwd);
464 /* Decode the strings, as per RFC 1738. */
465 decode_string (u->host);
466 decode_string (u->path);
468 decode_string (u->user);
470 decode_string (u->passwd);
471 /* Parse the directory. */
472 parse_dir (u->path, &u->dir, &u->file);
473 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
474 /* Simplify the directory. */
475 path_simplify (u->dir);
476 /* Remove the leading `/' in HTTP. */
477 if (type == URLHTTP && *u->dir == '/')
478 strcpy (u->dir, u->dir + 1);
479 DEBUGP (("ndir %s\n", u->dir));
480 /* Strip trailing `/'. */
482 if (l > 1 && u->dir[l - 1] == '/')
483 u->dir[l - 1] = '\0';
484 /* Re-create the path: */
485 abs_ftp = (u->scheme == SCHEME_FTP && *u->dir == '/');
486 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
487 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
488 strcpy (u->path, abs_ftp ? "%2F" : "/");
489 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
490 strcat (u->path, *u->dir ? "/" : "");
491 strcat (u->path, u->file);
493 DEBUGP (("newpath: %s\n", u->path));
494 /* Create the clean URL. */
495 u->url = str_url (u, 0);
/* Special versions of DOTP and DDOTP for parse_dir().  They work like
   DOTP and DDOTP, but they also recognize `?' as end-of-string
   delimiter.  This is needed for correct handling of query strings
   appended to "." and ".." path components.  */
#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
/* BUGFIX: the second test must examine *((x) + 1), not *(x) again.
   The original re-tested the first character, so any component
   beginning with '.' (e.g. ".a") wrongly matched "..", and for the
   one-character string "." the macro read one byte past the
   terminating '\0'.  */
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
		     && (!*((x) + 2) || *((x) + 2) == '?'))
508 /* Build the directory and filename components of the path. Both
509 components are *separately* malloc-ed strings! It does not change
510 the contents of path.
512 If the path ends with "." or "..", they are (correctly) counted as
515 parse_dir (const char *path, char **dir, char **file)
519 l = urlpath_length (path);
520 for (i = l; i && path[i] != '/'; i--);
522 if (!i && *path != '/') /* Just filename */
524 if (PD_DOTP (path) || PD_DDOTP (path))
526 *dir = strdupdelim (path, path + l);
527 *file = xstrdup (path + l); /* normally empty, but could
532 *dir = xstrdup (""); /* This is required because of FTP */
533 *file = xstrdup (path);
536 else if (!i) /* /filename */
538 if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
540 *dir = strdupdelim (path, path + l);
541 *file = xstrdup (path + l); /* normally empty, but could
546 *dir = xstrdup ("/");
547 *file = xstrdup (path + 1);
550 else /* Nonempty directory with or without a filename */
552 if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
554 *dir = strdupdelim (path, path + l);
555 *file = xstrdup (path + l); /* normally empty, but could
560 *dir = strdupdelim (path, path + i);
561 *file = xstrdup (path + i + 1);
566 /* Find the optional username and password within the URL, as per
567 RFC1738. The returned user and passwd char pointers are
570 parse_uname (const char *url, char **user, char **passwd)
573 const char *p, *q, *col;
579 /* Look for the end of the scheme identifier. */
580 l = url_skip_scheme (url);
584 /* Is there an `@' character? */
585 for (p = url; *p && *p != '/'; p++)
588 /* If not, return. */
591 /* Else find the username and password. */
592 for (p = q = col = url; *p && *p != '/'; p++)
594 if (*p == ':' && !*user)
596 *user = (char *)xmalloc (p - url + 1);
597 memcpy (*user, url, p - url);
598 (*user)[p - url] = '\0';
601 if (*p == '@') q = p;
603 /* Decide whether you have only the username or both. */
604 where = *user ? passwd : user;
605 *where = (char *)xmalloc (q - col + 1);
606 memcpy (*where, col, q - col);
607 (*where)[q - col] = '\0';
/* If PATH ends with `;type=X', truncate that suffix in place and
   return the type character X; otherwise leave PATH untouched and
   return '\0'.  */
static char
process_ftp_type (char *path)
{
  int len = strlen (path);

  /* ";type=X" is seven characters; anything shorter cannot carry the
     suffix.  */
  if (len < 7 || memcmp (path + len - 7, ";type=", 6) != 0)
    return '\0';

  /* Cut the suffix off and hand back the type letter that followed
     ";type=".  */
  path[len - 7] = '\0';
  return path[len - 1];
}
627 /* Recreate the URL string from the data in urlinfo. This can be used
628 to create a "canonical" representation of the URL. If `hide' is
629 non-zero (as it is when we're calling this on a URL we plan to
630 print, but not when calling it to canonicalize a URL for use within
631 the program), password will be hidden. The forbidden characters in
632 the URL will be cleansed. */
634 str_url (const struct urlinfo *u, int hide)
636 char *res, *host, *user, *passwd, *scheme_name, *dir, *file;
637 int i, l, ln, lu, lh, lp, lf, ld;
638 unsigned short default_port;
640 /* Look for the scheme. */
641 for (i = 0; i < ARRAY_SIZE (supported_schemes); i++)
642 if (supported_schemes[i].scheme == u->scheme)
644 if (i == ARRAY_SIZE (supported_schemes))
646 scheme_name = supported_schemes[i].leading_string;
647 default_port = supported_schemes[i].default_port;
648 host = encode_string (u->host);
649 dir = encode_string (u->dir);
650 file = encode_string (u->file);
651 user = passwd = NULL;
653 user = encode_string (u->user);
657 /* Don't output the password, or someone might see it over the user's
658 shoulder (or in saved wget output). Don't give away the number of
659 characters in the password, either, as we did in past versions of
660 this code, when we replaced the password characters with 'x's. */
661 passwd = xstrdup("<password>");
663 passwd = encode_string (u->passwd);
665 if (u->scheme == SCHEME_FTP && *dir == '/')
667 char *tmp = (char *)xmalloc (strlen (dir) + 3);
668 /*sprintf (tmp, "%%2F%s", dir + 1);*/
672 strcpy (tmp + 3, dir + 1);
677 ln = strlen (scheme_name);
678 lu = user ? strlen (user) : 0;
679 lp = passwd ? strlen (passwd) : 0;
683 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
684 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", scheme_name,
685 (user ? user : ""), (passwd ? ":" : ""),
686 (passwd ? passwd : ""), (user ? "@" : ""),
687 host, u->port, dir, *dir ? "/" : "", file); */
689 memcpy (res, scheme_name, ln);
693 memcpy (res + l, user, lu);
698 memcpy (res + l, passwd, lp);
703 memcpy (res + l, host, lh);
705 if (u->port != default_port)
708 long_to_string (res + l, (long)u->port);
709 l += numdigit (u->port);
712 memcpy (res + l, dir, ld);
716 strcpy (res + l, file);
725 /* Check whether two URL-s are equivalent, i.e. pointing to the same
726 location. Uses parseurl to parse them, and compares the canonical
729 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
730 return 0 on error. */
731 /* Do not compile unused code. */
734 url_equal (const char *url1, const char *url2)
736 struct urlinfo *u1, *u2;
741 err = parseurl (url1, u1, 0);
748 err = parseurl (url2, u2, 0);
755 res = !strcmp (u1->url, u2->url);
763 get_urls_file (const char *file)
765 struct file_memory *fm;
767 const char *text, *text_end;
770 fm = read_file (file);
773 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
776 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
779 text_end = fm->content + fm->length;
780 while (text < text_end)
782 const char *line_beg = text;
783 const char *line_end = memchr (text, '\n', text_end - text);
789 while (line_beg < line_end
790 && ISSPACE (*line_beg))
792 while (line_end > line_beg + 1
793 && ISSPACE (*(line_end - 1)))
795 if (line_end > line_beg)
797 urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
798 memset (entry, 0, sizeof (*entry));
800 entry->url = strdupdelim (line_beg, line_end);
812 /* Free the linked list of urlpos. */
814 free_urlpos (urlpos *l)
818 urlpos *next = l->next;
820 FREE_MAYBE (l->local_name);
826 /* Rotate FNAME opt.backups times */
828 rotate_backups(const char *fname)
830 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
831 char *from = (char *)alloca (maxlen);
832 char *to = (char *)alloca (maxlen);
836 if (stat (fname, &sb) == 0)
837 if (S_ISREG (sb.st_mode) == 0)
840 for (i = opt.backups; i > 1; i--)
842 sprintf (from, "%s.%d", fname, i - 1);
843 sprintf (to, "%s.%d", fname, i);
844 /* #### This will fail on machines without the rename() system
849 sprintf (to, "%s.%d", fname, 1);
853 /* Create all the necessary directories for PATH (a file). Calls
854 mkdirhier() internally. */
856 mkalldirs (const char *path)
863 p = path + strlen (path);
864 for (; *p != '/' && p != path; p--);
865 /* Don't create if it's just a file. */
866 if ((p == path) && (*p != '/'))
868 t = strdupdelim (path, p);
869 /* Check whether the directory exists. */
870 if ((stat (t, &st) == 0))
872 if (S_ISDIR (st.st_mode))
879 /* If the dir exists as a file name, remove it first. This
880 is *only* for Wget to work with buggy old CERN http
881 servers. Here is the scenario: When Wget tries to
882 retrieve a directory without a slash, e.g.
883 http://foo/bar (bar being a directory), CERN server will
884 not redirect it too http://foo/bar/ -- it will generate a
885 directory listing containing links to bar/file1,
886 bar/file2, etc. Wget will lose because it saves this
887 HTML listing to a file `bar', so it cannot create the
888 directory. To work around this, if the file of the same
889 name exists, we just remove it and create the directory
891 DEBUGP (("Removing %s because of directory danger!\n", t));
895 res = make_directory (t);
897 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Return the number of '/' characters in S.  */
static int
count_slashes (const char *s)
{
  int count = 0;

  for (; *s; s++)
    if (*s == '/')
      ++count;
  return count;
}
912 /* Return the path name of the URL-equivalent file name, with a
913 remote-like structure of directories. */
915 mkstruct (const struct urlinfo *u)
917 char *host, *dir, *file, *res, *dirpref;
920 assert (u->dir != NULL);
921 assert (u->host != NULL);
925 char *ptr = u->dir + (*u->dir == '/');
926 int slash_count = 1 + count_slashes (ptr);
927 int cut = MINVAL (opt.cut_dirs, slash_count);
928 for (; cut && *ptr; ptr++)
931 STRDUP_ALLOCA (dir, ptr);
934 dir = u->dir + (*u->dir == '/');
936 host = xstrdup (u->host);
937 /* Check for the true name (or at least a consistent name for saving
938 to directory) of HOST, reusing the hlist if possible. */
939 if (opt.add_hostdir && !opt.simple_check)
941 char *nhost = realhost (host);
945 /* Add dir_prefix and hostname (if required) to the beginning of
949 if (!DOTP (opt.dir_prefix))
951 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
952 + strlen (host) + 1);
953 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
956 STRDUP_ALLOCA (dirpref, host);
958 else /* not add_hostdir */
960 if (!DOTP (opt.dir_prefix))
961 dirpref = opt.dir_prefix;
967 /* If there is a prefix, prepend it. */
970 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
971 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
974 dir = encode_string (dir);
976 if (l && dir[l - 1] == '/')
984 /* Finally, construct the full name. */
985 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
986 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
/* Return a malloced copy of S in which every '/' is replaced by
   "%2F".  Used so that slashes inside a query string cannot create
   unwanted subdirectories in the local file name.  */
static char *
file_name_protect_query_string (const char *s)
{
  const char *from;
  char *dest, *to;
  int destlen = 0;

  /* First pass: measure the output.  Each '/' expands to the three
     characters "%2F"; everything else is copied verbatim.  */
  for (from = s; *from; from++)
    destlen += (*from == '/') ? 3 : 1;

  dest = (char *)xmalloc (destlen + 1);

  /* Second pass: copy, expanding slashes.  */
  for (from = s, to = dest; *from; from++)
    {
      if (*from == '/')
	{
	  *to++ = '%';
	  *to++ = '2';
	  *to++ = 'F';
	}
      else
	*to++ = *from;
    }
  *to = '\0';
  assert (to - dest == destlen);
  return dest;
}
1023 /* Create a unique filename, corresponding to a given URL. Calls
1024 mkstruct if necessary. Does *not* actually create any directories. */
1026 url_filename (const struct urlinfo *u)
1029 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1033 file = mkstruct (u);
1039 file = xstrdup ("index.html");
1042 /* If the URL came with a query string, u->file will contain
1043 a question mark followed by query string contents. These
1044 contents can contain '/' which would make us create
1045 unwanted directories. These slashes must be protected
1047 if (!strchr (u->file, '/'))
1048 file = xstrdup (u->file);
1051 /*assert (strchr (u->file, '?') != NULL);*/
1052 file = file_name_protect_query_string (u->file);
1059 /* Check whether the prefix directory is something other than "."
1060 before prepending it. */
1061 if (!DOTP (opt.dir_prefix))
1063 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1064 + 1 + strlen (file) + 1);
1065 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1070 /* DOS-ish file systems don't like `%' signs in them; we change it
1075 for (p = file; *p; p++)
1079 #endif /* WINDOWS */
1081 /* Check the cases in which the unique extensions are not used:
1082 1) Clobbering is turned off (-nc).
1083 2) Retrieval with regetting.
1084 3) Timestamping is used.
1085 4) Hierarchy is built.
1087 The exception is the case when file does exist and is a
1088 directory (actually support for bad httpd-s). */
1089 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1090 && !(file_exists_p (file) && !file_non_directory_p (file)))
1093 /* Find a unique name. */
1094 name = unique_name (file);
/* Like strlen(), but treat '?' (the start of a query string) as an
   end-of-path delimiter.  */
static int
urlpath_length (const char *url)
{
  const char *query = strchr (url, '?');

  if (query != NULL)
    return query - url;
  return strlen (url);
}
1109 /* Find the last occurrence of character C in the range [b, e), or
1110 NULL, if none are present. This is almost completely equivalent to
1111 { *e = '\0'; return strrchr(b); }, except that it doesn't change
1112 the contents of the string. */
1114 find_last_char (const char *b, const char *e, char c)
1122 /* Resolve the result of "linking" a base URI (BASE) to a
1123 link-specified URI (LINK).
1125 Either of the URIs may be absolute or relative, complete with the
1126 host name, or path only. This tries to behave "reasonably" in all
1127 foreseeable cases. It employs little specific knowledge about
1128 schemes or URL-specific stuff -- it just works on strings.
1130 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1131 See uri_merge for a gentler interface to this functionality.
1133 #### This function should handle `./' and `../' so that the evil
1134 path_simplify can go. */
1136 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1142 const char *end = base + urlpath_length (base);
1146 /* LINK is a relative URL: we need to replace everything
1147 after last slash (possibly empty) with LINK.
1149 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1150 our result should be "whatever/foo/qux/xyzzy". */
1151 int need_explicit_slash = 0;
1153 const char *start_insert;
1154 const char *last_slash = find_last_char (base, end, '/');
1157 /* No slash found at all. Append LINK to what we have,
1158 but we'll need a slash as a separator.
1160 Example: if base == "foo" and link == "qux/xyzzy", then
1161 we cannot just append link to base, because we'd get
1162 "fooqux/xyzzy", whereas what we want is
1165 To make sure the / gets inserted, we set
1166 need_explicit_slash to 1. We also set start_insert
1167 to end + 1, so that the length calculations work out
1168 correctly for one more (slash) character. Accessing
1169 that character is fine, since it will be the
1170 delimiter, '\0' or '?'. */
1171 /* example: "foo?..." */
1172 /* ^ ('?' gets changed to '/') */
1173 start_insert = end + 1;
1174 need_explicit_slash = 1;
1176 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1178 /* example: http://host" */
1180 start_insert = end + 1;
1181 need_explicit_slash = 1;
1185 /* example: "whatever/foo/bar" */
1187 start_insert = last_slash + 1;
1190 span = start_insert - base;
1191 constr = (char *)xmalloc (span + linklength + 1);
1193 memcpy (constr, base, span);
1194 if (need_explicit_slash)
1195 constr[span - 1] = '/';
1197 memcpy (constr + span, link, linklength);
1198 constr[span + linklength] = '\0';
1200 else /* *link == `/' */
1202 /* LINK is an absolute path: we need to replace everything
1203 after (and including) the FIRST slash with LINK.
1205 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1206 "/qux/xyzzy", our result should be
1207 "http://host/qux/xyzzy". */
1210 const char *start_insert = NULL; /* for gcc to shut up. */
1211 const char *pos = base;
1212 int seen_slash_slash = 0;
1213 /* We're looking for the first slash, but want to ignore
1216 slash = memchr (pos, '/', end - pos);
1217 if (slash && !seen_slash_slash)
1218 if (*(slash + 1) == '/')
1221 seen_slash_slash = 1;
1225 /* At this point, SLASH is the location of the first / after
1226 "//", or the first slash altogether. START_INSERT is the
1227 pointer to the location where LINK will be inserted. When
1228 examining the last two examples, keep in mind that LINK
1231 if (!slash && !seen_slash_slash)
1232 /* example: "foo" */
1234 start_insert = base;
1235 else if (!slash && seen_slash_slash)
1236 /* example: "http://foo" */
1239 else if (slash && !seen_slash_slash)
1240 /* example: "foo/bar" */
1242 start_insert = base;
1243 else if (slash && seen_slash_slash)
1244 /* example: "http://something/" */
1246 start_insert = slash;
1248 span = start_insert - base;
1249 constr = (char *)xmalloc (span + linklength + 1);
1251 memcpy (constr, base, span);
1253 memcpy (constr + span, link, linklength);
1254 constr[span + linklength] = '\0';
1257 else /* !no_scheme */
1259 constr = strdupdelim (link, link + linklength);
/* Merge BASE with LINK and return the resulting URI.  Convenience
   wrapper around uri_merge_1 for zero-terminated LINK strings.  */
char *
uri_merge (const char *base, const char *link)
{
  int link_len = strlen (link);
  int relative = !url_has_scheme (link);

  return uri_merge_1 (base, link, link_len, relative);
}
1273 /* Optimize URL by host, destructively replacing u->host with realhost
1274 (u->host). Do this regardless of opt.simple_check. */
1276 opt_url (struct urlinfo *u)
1278 /* Find the "true" host. */
1279 char *host = realhost (u->host);
1282 assert (u->dir != NULL); /* the URL must have been parsed */
1283 /* Refresh the printed representation. */
1285 u->url = str_url (u, 0);
1288 /* Returns proxy host address, in accordance with SCHEME. */
1290 getproxy (enum url_scheme scheme)
1297 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1301 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1305 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1307 case SCHEME_INVALID:
1310 if (!proxy || !*proxy)
1315 /* Should a host be accessed through proxy, concerning no_proxy? */
1317 no_proxy_match (const char *host, const char **no_proxy)
1322 return !sufmatch (no_proxy, host);
1325 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1326 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1328 /* Change the links in an HTML document. Accepts a structure that
1329 defines the positions of all the links. */
1331 convert_links (const char *file, urlpos *l)
1333 struct file_memory *fm;
1336 downloaded_file_t downloaded_file_return;
1338 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1341 /* First we do a "dry run": go through the list L and see whether
1342 any URL needs to be converted in the first place. If not, just
1343 leave the file alone. */
1346 for (dry = l; dry; dry = dry->next)
1347 if (dry->convert != CO_NOCONVERT)
1351 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1356 fm = read_file (file);
1359 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1360 file, strerror (errno));
1364 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1365 if (opt.backup_converted && downloaded_file_return)
1366 write_backup_file (file, downloaded_file_return);
1368 /* Before opening the file for writing, unlink the file. This is
1369 important if the data in FM is mmaped. In such case, nulling the
1370 file, which is what fopen() below does, would make us read all
1371 zeroes from the mmaped region. */
1372 if (unlink (file) < 0 && errno != ENOENT)
1374 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1375 file, strerror (errno));
1376 read_file_free (fm);
1379 /* Now open the file for writing. */
1380 fp = fopen (file, "wb");
1383 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1384 file, strerror (errno));
1385 read_file_free (fm);
1388 /* Here we loop through all the URLs in file, replacing those of
1389 them that are downloaded with relative references. */
1391 for (; l; l = l->next)
1393 char *url_start = fm->content + l->pos;
1395 if (l->pos >= fm->length)
1397 DEBUGP (("Something strange is going on. Please investigate."));
1400 /* If the URL is not to be converted, skip it. */
1401 if (l->convert == CO_NOCONVERT)
1403 DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1407 /* Echo the file contents, up to the offending URL's opening
1408 quote, to the outfile. */
1409 fwrite (p, 1, url_start - p, fp);
1411 if (l->convert == CO_CONVERT_TO_RELATIVE)
1413 /* Convert absolute URL to relative. */
1414 char *newname = construct_relative (file, l->local_name);
1415 char *quoted_newname = html_quote_string (newname);
1416 replace_attr (&p, l->size, fp, quoted_newname);
1417 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1418 l->url, newname, l->pos, file));
1420 xfree (quoted_newname);
1422 else if (l->convert == CO_CONVERT_TO_COMPLETE)
1424 /* Convert the link to absolute URL. */
1425 char *newlink = l->url;
1426 char *quoted_newlink = html_quote_string (newlink);
1427 replace_attr (&p, l->size, fp, quoted_newlink);
1428 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1429 newlink, l->pos, file));
1430 xfree (quoted_newlink);
1433 /* Output the rest of the file. */
1434 if (p - fm->content < fm->length)
1435 fwrite (p, 1, fm->length - (p - fm->content), fp);
1437 read_file_free (fm);
1438 logputs (LOG_VERBOSE, _("done.\n"));
1441 /* Construct and return a malloced copy of the relative link from two
1442 pieces of information: local name S1 of the referring file and
1443 local name S2 of the referred file.
1445 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1446 "jagor.srce.hr/images/news.gif", the function will return
1449 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1450 "fly.cc.fer.hr/images/fly.gif", the function will return
1451 "../images/fly.gif".
1453 Caveats: S1 should not begin with `/', unless S2 also begins with
1454 '/'. S1 should not contain things like ".." and such --
1455 construct_relative ("fly/ioccc/../index.html",
1456 "fly/images/fly.gif") will fail. (A workaround is to call
1457 something like path_simplify() on S1). */
/* NOTE(review): this excerpt elides several original lines (braces,
   loop bodies, the final return) -- comments below annotate only the
   statements that are visible.  */
1459 construct_relative (const char *s1, const char *s2)
/* i: scan index over both names; cnt: offset in S2 just past the last
   common '/' (start of the non-shared tail); sepdirs1: number of
   directory levels left in S1 below the common prefix.  */
1461 int i, cnt, sepdirs1;
/* An absolute S2 needs no relativization; return a fresh copy.  */
1465 return xstrdup (s2);
1466 /* S1 should *not* be absolute, if S2 wasn't. */
1467 assert (*s1 != '/');
1469 /* Skip the directories common to both strings. */
/* Advance while both names agree and neither has hit a separator.  */
1472 while (s1[i] && s2[i]
/* Matching '/' in both names: the common directory prefix extends at
   least this far.  */
1477 if (s1[i] == '/' && s2[i] == '/')
/* Count the directory separators remaining in S1 past the common
   prefix; each one requires a leading "../" in the result.  */
1482 for (sepdirs1 = 0; s1[i]; i++)
1485 /* Now, construct the file as of:
1486 - ../ repeated sepdirs1 time
1487 - all the non-mutual directories of S2. */
/* Size: sepdirs1 copies of "../" (3 bytes each), the S2 tail, and the
   terminating NUL.  */
1488 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1489 for (i = 0; i < sepdirs1; i++)
1490 memcpy (res + 3 * i, "../", 3);
/* strcpy supplies the trailing NUL after the "../" run.  */
1491 strcpy (res + 3 * i, s2 + cnt);
1495 /* Add URL to the head of the list L. */
/* Allocates a new urlpos node owning heap copies of URL and FILE.
   NOTE(review): the lines linking the node to L and returning it are
   elided from this excerpt -- presumably the new node becomes the list
   head; confirm against the full source.  */
1497 add_url (urlpos *l, const char *url, const char *file)
1501 t = (urlpos *)xmalloc (sizeof (urlpos));
/* Zero every field so members not set below start out NULL/0.  */
1502 memset (t, 0, sizeof (*t));
/* The node owns its own copies of both strings.  */
1503 t->url = xstrdup (url);
1504 t->local_name = xstrdup (file);
/* Back up FILE to FILE.orig (or, with -E, replace a ".html" suffix
   with ".orig") before link conversion rewrites it, but only once per
   file per program run.  */
1510 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1512 /* Rather than just writing over the original .html file with the
1513 converted version, save the former to *.orig. Note we only do
1514 this for files we've _successfully_ downloaded, so we don't
1515 clobber .orig files sitting around from previous invocations. */
1517 /* Construct the backup filename as the original name plus ".orig". */
1518 size_t filename_len = strlen(file);
1519 char* filename_plus_orig_suffix;
1520 boolean already_wrote_backup_file = FALSE;
1521 slist* converted_file_ptr;
/* Static: the set of already-backed-up files persists across calls
   for the lifetime of the process.  */
1522 static slist* converted_files = NULL;
1524 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1526 /* Just write "orig" over "html". We need to do it this way
1527 because when we're checking to see if we've downloaded the
1528 file before (to see if we can skip downloading it), we don't
1529 know if it's a text/html file. Therefore we don't know yet
1530 at that stage that -E is going to cause us to tack on
1531 ".html", so we need to compare vs. the original URL plus
1532 ".orig", not the original URL plus ".html.orig". */
/* NOTE(review): this branch assumes FILE ends in ".html"; the "orig"
   overwrite at filename_len - 4 and the len + 1 buffer are only
   correct under that assumption ("orig" is one byte shorter than
   "html." minus the dot bookkeeping) -- confirm the -E code path
   guarantees the suffix.  */
1533 filename_plus_orig_suffix = alloca (filename_len + 1);
1534 strcpy(filename_plus_orig_suffix, file);
/* Overwrite the final "html" with "orig", keeping the dot.  */
1535 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1537 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1539 /* Append ".orig" to the name. */
/* sizeof(".orig") == 6 covers the suffix plus the NUL.  */
1540 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1541 strcpy(filename_plus_orig_suffix, file);
1542 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1545 /* We can get called twice on the same URL thanks to the
1546 convert_all_links() call in main(). If we write the .orig file
1547 each time in such a case, it'll end up containing the first-pass
1548 conversion, not the original file. So, see if we've already been
1549 called on this file. */
1550 converted_file_ptr = converted_files;
1551 while (converted_file_ptr != NULL)
1552 if (strcmp(converted_file_ptr->string, file) == 0)
1554 already_wrote_backup_file = TRUE;
1558 converted_file_ptr = converted_file_ptr->next;
1560 if (!already_wrote_backup_file)
1562 /* Rename <file> to <file>.orig before former gets written over. */
/* Failure is reported but non-fatal: conversion proceeds without a
   backup.  */
1563 if (rename(file, filename_plus_orig_suffix) != 0)
1564 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1565 file, filename_plus_orig_suffix, strerror (errno));
1567 /* Remember that we've already written a .orig backup for this file.
1568 Note that we never free this memory since we need it till the
1569 convert_all_links() call, which is one of the last things the
1570 program does before terminating. BTW, I'm not sure if it would be
1571 safe to just set 'converted_file_ptr->string' to 'file' below,
1572 rather than making a copy of the string... Another note is that I
1573 thought I could just add a field to the urlpos structure saying
1574 that we'd written a .orig file for this URL, but that didn't work,
1575 so I had to make this separate list.
1576 -- Dan Harkless <wget@harkless.org>
1578 This [adding a field to the urlpos structure] didn't work
1579 because convert_file() is called twice: once after all its
1580 sublinks have been retrieved in recursive_retrieve(), and
1581 once at the end of the day in convert_all_links(). The
1582 original linked list collected in recursive_retrieve() is
1583 lost after the first invocation of convert_links(), and
1584 convert_all_links() makes a new one (it calls get_urls_html()
1585 for each file it covers.) That's why your first approach didn't
1586 work. The way to make it work is perhaps to make this flag a
1587 field in the `urls_html' list.
1588 -- Hrvoje Niksic <hniksic@arsdigita.com>
/* Prepend FILE to the remembered set (intentionally never freed; see
   the note above).  */
1590 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1591 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1592 converted_file_ptr->next = converted_files;
1593 converted_files = converted_file_ptr;
1597 static int find_fragment PARAMS ((const char *, int, const char **,
/* Write NEW_STR to FP in place of the RAW_SIZE-byte attribute value
   at *PP, preserving the original quoting style and any trailing
   "#fragment" found in the old value.  NOTE(review): the lines that
   advance *PP past the consumed input are elided from this excerpt --
   presumably *PP ends up RAW_SIZE bytes further on; confirm against
   the full source.  */
1601 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1603 const char *p = *pp;
1605 int size = raw_size;
/* Default when the old value is unquoted; overwritten below if a
   quote is present.  */
1606 char quote_char = '\"';
1607 const char *frag_beg, *frag_end;
1609 /* Structure of our string is:
1610 "...old-contents..."
1611 <--- l->size ---> (with quotes)
1614 <--- l->size --> (no quotes) */
1616 if (*p == '\"' || *p == '\'')
1621 size -= 2; /* disregard opening and closing quote */
/* Emit the replacement value, re-quoted.  */
1623 putc (quote_char, fp);
1624 fputs (new_str, fp);
1626 /* Look for fragment identifier, if any. */
/* Carry the old value's "#fragment" over onto the new URL.  */
1627 if (find_fragment (p, size, &frag_beg, &frag_end))
1628 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1632 putc (quote_char, fp);
1636 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1637 preceded by '&'. If the character is not found, return zero. If
1638 the character is found, return 1 and set BP and EP to point to the
1639 beginning and end of the region.
1641 This is used for finding the fragment identifiers in URLs. */
/* NOTE(review): the scan body (the '#'/'&' tests and the *bp/*ep
   assignments) is elided from this excerpt; only the loop header is
   visible below.  */
1644 find_fragment (const char *beg, int size, const char **bp, const char **ep)
/* One-past-the-end of the region to scan.  */
1646 const char *end = beg + size;
1648 for (; beg < end; beg++)
/* Singly-linked list node recording one downloaded local file and how
   it was downloaded.  NOTE(review): the string member (accessed as
   rover->file in downloaded_file() below) sits on a line elided from
   this excerpt.  */
1670 typedef struct _downloaded_file_list {
/* How the file was downloaded (normally, with -E rename, etc.).  */
1672 downloaded_file_t download_type;
1673 struct _downloaded_file_list* next;
1674 } downloaded_file_list;
/* Head of the process-wide list of files downloaded so far.  */
1676 static downloaded_file_list *downloaded_files;
1678 /* Remembers which files have been downloaded. In the standard case, should be
1679 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1680 download successfully (i.e. not for ones we have failures on or that we skip
1683 When we've downloaded a file and tacked on a ".html" extension due to -E,
1684 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1685 FILE_DOWNLOADED_NORMALLY.
1687 If you just want to check if a file has been previously added without adding
1688 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1689 with local filenames, not remote URLs. */
/* NOTE(review): several lines (braces, the found_file break) are
   elided from this excerpt; comments annotate only visible code.  */
1691 downloaded_file (downloaded_file_t mode, const char* file)
1693 boolean found_file = FALSE;
/* Linear scan of the remembered files for a name match.  */
1694 downloaded_file_list* rover = downloaded_files;
1696 while (rover != NULL)
1697 if (strcmp(rover->file, file) == 0)
1703 rover = rover->next;
1706 return rover->download_type; /* file had already been downloaded */
/* Not previously recorded: add it unless we were only asked to
   check.  */
1709 if (mode != CHECK_FOR_FILE)
1711 rover = xmalloc(sizeof(*rover));
1712 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1713 rover->download_type = mode;
/* Prepend to the global list.  */
1714 rover->next = downloaded_files;
1715 downloaded_files = rover;
1718 return FILE_NOT_ALREADY_DOWNLOADED;
1723 downloaded_files_free (void)
1725 downloaded_file_list* rover = downloaded_files;
1728 downloaded_file_list *next = rover->next;
1729 xfree (rover->file);