2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* True if X points to the one-character string ".".  */
#define DOTP(x) (((x)[0] == '.') && ((x)[1] == '\0'))
/* True if X points to the two-character string "..".  */
#define DDOTP(x) (((x)[0] == '.') && ((x)[1] == '.') && ((x)[2] == '\0'))
50 static int urlpath_length PARAMS ((const char *));
54 enum url_scheme scheme;
59 /* Supported schemes: */
60 static struct scheme_data supported_schemes[] =
62 { SCHEME_HTTP, "http://", DEFAULT_HTTP_PORT },
64 { SCHEME_HTTPS, "https://", DEFAULT_HTTPS_PORT },
66 { SCHEME_FTP, "ftp://", DEFAULT_FTP_PORT }
69 static void parse_dir PARAMS ((const char *, char **, char **));
70 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
71 static char *construct_relative PARAMS ((const char *, const char *));
72 static char process_ftp_type PARAMS ((char *));
75 /* Support for encoding and decoding of URL strings. We determine
76 whether a character is unsafe through static table lookup. This
77 code assumes ASCII character set and 8-bit chars. */
84 #define R urlchr_reserved
85 #define U urlchr_unsafe
/* Test whether character C's entry in urlchr_table has any bit of
   MASK set.  The cast to unsigned char keeps a negative plain-char
   value from indexing outside the 256-entry table.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
90 /* rfc1738 reserved chars. We don't use this yet; preservation of
91 reserved chars will be implemented when I integrate the new
92 `reencode_string' function. */
/* True if C is in the RFC 1738 "reserved" set, per urlchr_table.  */
#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
98 - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
99 - '@' and ':'; needed for encoding URL username and password.
100 - anything >= 127. */
/* True if C must be %-escaped when embedded in a URL, per urlchr_table.  */
#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
104 const static unsigned char urlchr_table[256] =
106 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
107 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
108 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
109 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
110 U, 0, U, U, 0, U, R, 0, /* SP ! " # $ % & ' */
111 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
112 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
113 0, 0, U, R, U, R, U, R, /* 8 9 : ; < = > ? */
114 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
115 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
116 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
117 0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
118 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
119 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
120 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
121 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
123 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
124 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
125 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
126 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
128 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
129 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
130 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
131 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
134 /* Decodes the forms %xy in a URL to the character the hexadecimal
135 code of which is xy. xy are hexadecimal digits from
136 [0123456789ABCDEF] (case-insensitive). If x or y are not
137 hex-digits or `%' precedes `\0', the sequence is inserted
141 decode_string (char *s)
143 char *t = s; /* t - tortoise */
144 char *h = s; /* h - hare */
155 /* Do nothing if '%' is not followed by two hex digits. */
156 if (!*(h + 1) || !*(h + 2)
157 || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
159 *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
166 /* Like encode_string, but return S if there are no unsafe chars. */
169 encode_string_maybe (const char *s)
176 for (p1 = s; *p1; p1++)
177 if (UNSAFE_CHAR (*p1))
178 addition += 2; /* Two more characters (hex digits) */
183 newlen = (p1 - s) + addition;
184 newstr = (char *)xmalloc (newlen + 1);
190 if (UNSAFE_CHAR (*p1))
192 const unsigned char c = *p1++;
194 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
195 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
201 assert (p2 - newstr == newlen);
206 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
207 given string, returning a malloc-ed %XX encoded string. */
210 encode_string (const char *s)
212 char *encoded = encode_string_maybe (s);
219 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
220 the old value of PTR is freed and PTR is made to point to the newly
221 allocated storage. */
223 #define ENCODE(ptr) do { \
224 char *e_new = encode_string_maybe (ptr); \
232 /* Returns the scheme type if the scheme is supported, or
233 SCHEME_INVALID if not. */
/* Return the enum value for URL's scheme if it is one of
   supported_schemes (http/https/ftp), else SCHEME_INVALID.
   NOTE(review): the return-type line, opening brace and the
   declaration of `i' appear elided in this excerpt.  */
url_scheme (const char *url)
  /* Compare URL case-insensitively against each scheme's leading
     string, e.g. "http://".  */
  for (i = 0; i < ARRAY_SIZE (supported_schemes); i++)
    if (!strncasecmp (url, supported_schemes[i].leading_string,
                      strlen (supported_schemes[i].leading_string)))
      return supported_schemes[i].scheme;
  return SCHEME_INVALID;
246 /* Return the number of characters needed to skip the scheme part of
247 the URL, e.g. `http://'. If no scheme is found, returns 0. */
249 url_skip_scheme (const char *url)
253 /* Skip the scheme name. We allow `-' and `+' because of `whois++',
255 while (ISALNUM (*p) || *p == '-' || *p == '+')
262 /* Skip "//" if found. */
263 if (*p == '/' && *(p + 1) == '/')
269 /* Returns 1 if the URL begins with a scheme (supported or
270 unsupported), 0 otherwise. */
272 url_has_scheme (const char *url)
275 while (ISALNUM (*p) || *p == '-' || *p == '+')
280 /* Skip the username and password, if present here. The function
281 should be called *not* with the complete URL, but with the part
282 right after the scheme.
284 If no username and password are found, return 0. */
286 url_skip_uname (const char *url)
289 const char *q = NULL;
290 for (p = url ; *p && *p != '/'; p++)
291 if (*p == '@') q = p;
292 /* If a `@' was found before the first occurrence of `/', skip
300 /* Used by main.c: detect URLs written using the "shorthand" URL forms
301 popularized by Netscape and NcFTP. HTTP shorthands look like this:
303 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
304 www.foo.com[:port] -> http://www.foo.com[:port]
306 FTP shorthands look like this:
308 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
309 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
311 If the URL needs not or cannot be rewritten, return NULL. */
313 rewrite_url_maybe (const char *url)
317 if (url_has_scheme (url))
320 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
322 for (p = url; *p && *p != ':' && *p != '/'; p++)
330 const char *pp, *path;
332 /* If the characters after the colon and before the next slash
333 or end of string are all digits, it's HTTP. */
335 for (pp = p + 1; ISDIGIT (*pp); pp++)
338 && (*pp == '/' || *pp == '\0'))
341 /* Prepend "ftp://" to the entire URL... */
343 res = xmalloc (6 + strlen (url) + 1);
344 sprintf (res, "ftp://%s", url);
345 /* ...and replace ':' with '/'. */
346 res[6 + (p - url)] = '/';
353 /* Just prepend "http://" to what we have. */
354 res = xmalloc (7 + strlen (url) + 1);
355 sprintf (res, "http://%s", url);
360 /* Allocate a new urlinfo structure, fill it with default values and
361 return a pointer to it. */
367 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
368 memset (u, 0, sizeof (*u));
369 u->scheme = SCHEME_INVALID;
373 /* Perform a "deep" free of the urlinfo structure. The structure
374 should have been created with newurl, but need not have been used.
375 If free_pointer is non-0, free the pointer itself. */
377 freeurl (struct urlinfo *u, int complete)
381 FREE_MAYBE (u->host);
382 FREE_MAYBE (u->path);
383 FREE_MAYBE (u->file);
385 FREE_MAYBE (u->user);
386 FREE_MAYBE (u->passwd);
387 FREE_MAYBE (u->local);
388 FREE_MAYBE (u->referer);
390 freeurl (u->proxy, 1);
396 enum url_parse_error {
397 PE_UNRECOGNIZED_SCHEME, PE_BAD_PORT
400 /* Extract the given URL of the form
401 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
402 1. hostname (terminated with `/' or `:')
403 2. port number (terminated with `/'), or chosen for the scheme
404 3. dirname (everything after hostname)
405 Most errors are handled. No allocation is done, you must supply
406 pointers to allocated memory.
407 ...and a host of other stuff :-)
409 - Recognizes hostname:dir/file for FTP and
410 hostname (:portnum)?/dir/file for HTTP.
411 - Parses the path to yield directory and file
412 - Parses the URL to yield the username and passwd (if present)
413 - Decodes the strings, in case they contain "forbidden" characters
414 - Writes the result to struct urlinfo
416 If the argument STRICT is set, it recognizes only the canonical
419 parseurl (const char *url, struct urlinfo *u, int strict)
422 int recognizable; /* Recognizable URL is the one where
423 the scheme was explicitly named,
424 i.e. it wasn't deduced from the URL
428 DEBUGP (("parseurl (\"%s\") -> ", url));
429 recognizable = url_has_scheme (url);
430 if (strict && !recognizable)
432 for (i = 0, l = 0; i < ARRAY_SIZE (supported_schemes); i++)
434 l = strlen (supported_schemes[i].leading_string);
435 if (!strncasecmp (supported_schemes[i].leading_string, url, l))
438 /* If scheme is recognizable, but unsupported, bail out, else
440 if (recognizable && i == ARRAY_SIZE (supported_schemes))
442 else if (i == ARRAY_SIZE (supported_schemes))
445 u->scheme = type = supported_schemes[i].scheme;
447 if (type == URLUNKNOWN)
449 /* Allow a username and password to be specified (i.e. just skip
452 l += url_skip_uname (url + l);
453 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
456 /* Get the hostname. */
457 u->host = strdupdelim (url + l, url + i);
458 DEBUGP (("host %s -> ", u->host));
460 /* Assume no port has been given. */
464 /* We have a colon delimiting the hostname. It could mean that
465 a port number is following it, or a directory. */
466 if (ISDIGIT (url[++i])) /* A port number */
468 if (type == URLUNKNOWN)
471 u->scheme = SCHEME_HTTP;
473 for (; url[i] && url[i] != '/'; i++)
474 if (ISDIGIT (url[i]))
475 u->port = 10 * u->port + (url[i] - '0');
480 DEBUGP (("port %hu -> ", u->port));
482 else if (type == URLUNKNOWN) /* or a directory */
485 u->scheme = SCHEME_FTP;
487 else /* or just a misformed port number */
490 else if (type == URLUNKNOWN)
493 u->scheme = SCHEME_HTTP;
498 for (ind = 0; ind < ARRAY_SIZE (supported_schemes); ind++)
499 if (supported_schemes[ind].scheme == u->scheme)
501 if (ind == ARRAY_SIZE (supported_schemes))
503 u->port = supported_schemes[ind].default_port;
505 /* Some delimiter troubles... */
506 if (url[i] == '/' && url[i - 1] != ':')
509 while (url[i] && url[i] == '/')
511 u->path = (char *)xmalloc (strlen (url + i) + 8);
512 strcpy (u->path, url + i);
515 u->ftp_type = process_ftp_type (u->path);
516 /* #### We don't handle type `d' correctly yet. */
517 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
519 DEBUGP (("ftp_type %c -> ", u->ftp_type));
521 DEBUGP (("opath %s -> ", u->path));
522 /* Parse the username and password (if existing). */
523 parse_uname (url, &u->user, &u->passwd);
524 /* Decode the strings, as per RFC 1738. */
525 decode_string (u->host);
526 decode_string (u->path);
528 decode_string (u->user);
530 decode_string (u->passwd);
531 /* Parse the directory. */
532 parse_dir (u->path, &u->dir, &u->file);
533 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
534 /* Simplify the directory. */
535 path_simplify (u->dir);
536 /* Remove the leading `/' in HTTP. */
537 if (type == URLHTTP && *u->dir == '/')
538 strcpy (u->dir, u->dir + 1);
539 DEBUGP (("ndir %s\n", u->dir));
540 /* Strip trailing `/'. */
542 if (l > 1 && u->dir[l - 1] == '/')
543 u->dir[l - 1] = '\0';
544 /* Re-create the path: */
545 abs_ftp = (u->scheme == SCHEME_FTP && *u->dir == '/');
546 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
547 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
548 strcpy (u->path, abs_ftp ? "%2F" : "/");
549 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
550 strcat (u->path, *u->dir ? "/" : "");
551 strcat (u->path, u->file);
553 DEBUGP (("newpath: %s\n", u->path));
554 /* Create the clean URL. */
555 u->url = str_url (u, 0);
559 /* Special versions of DOTP and DDOTP for parse_dir(). They work like
560 DOTP and DDOTP, but they also recognize `?' as end-of-string
561 delimiter. This is needed for correct handling of query
/* Like DOTP, but also accepts '?' (the start of a query string) as an
   end-of-string delimiter: matches "." and ".?...".  */
#define PD_DOTP(x) (((x)[0] == '.') && ((x)[1] == '\0' || (x)[1] == '?'))
/* Like DDOTP, but also accepts '?' (the start of a query string) as
   an end-of-string delimiter: matches ".." and "..?...".  The second
   conjunct must test the SECOND character, *((x) + 1) -- the previous
   version erroneously re-tested *(x), so strings like ".a?" matched
   and "." was probed one byte past its terminator.  */
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
		     && (!*((x) + 2) || *((x) + 2) == '?'))
568 /* Build the directory and filename components of the path. Both
569 components are *separately* malloc-ed strings! It does not change
570 the contents of path.
572 If the path ends with "." or "..", they are (correctly) counted as
575 parse_dir (const char *path, char **dir, char **file)
579 l = urlpath_length (path);
580 for (i = l; i && path[i] != '/'; i--);
582 if (!i && *path != '/') /* Just filename */
584 if (PD_DOTP (path) || PD_DDOTP (path))
586 *dir = strdupdelim (path, path + l);
587 *file = xstrdup (path + l); /* normally empty, but could
592 *dir = xstrdup (""); /* This is required because of FTP */
593 *file = xstrdup (path);
596 else if (!i) /* /filename */
598 if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
600 *dir = strdupdelim (path, path + l);
601 *file = xstrdup (path + l); /* normally empty, but could
606 *dir = xstrdup ("/");
607 *file = xstrdup (path + 1);
610 else /* Nonempty directory with or without a filename */
612 if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
614 *dir = strdupdelim (path, path + l);
615 *file = xstrdup (path + l); /* normally empty, but could
620 *dir = strdupdelim (path, path + i);
621 *file = xstrdup (path + i + 1);
626 /* Find the optional username and password within the URL, as per
627 RFC1738. The returned user and passwd char pointers are
630 parse_uname (const char *url, char **user, char **passwd)
633 const char *p, *q, *col;
639 /* Look for the end of the scheme identifier. */
640 l = url_skip_scheme (url);
644 /* Is there an `@' character? */
645 for (p = url; *p && *p != '/'; p++)
648 /* If not, return. */
651 /* Else find the username and password. */
652 for (p = q = col = url; *p && *p != '/'; p++)
654 if (*p == ':' && !*user)
656 *user = (char *)xmalloc (p - url + 1);
657 memcpy (*user, url, p - url);
658 (*user)[p - url] = '\0';
661 if (*p == '@') q = p;
663 /* Decide whether you have only the username or both. */
664 where = *user ? passwd : user;
665 *where = (char *)xmalloc (q - col + 1);
666 memcpy (*where, col, q - col);
667 (*where)[q - col] = '\0';
671 /* If PATH ends with `;type=X', return the character X. */
/* If PATH ends with `;type=X', destructively truncate that suffix off
   PATH and return the type character X; the surrounding length check
   and return-type line appear elided in this excerpt.  */
process_ftp_type (char *path)
  int len = strlen (path);
      && !memcmp (path + len - 7, ";type=", 6))
      /* Cut the ";type=X" suffix off PATH...  */
      path[len - 7] = '\0';
      /* ...then return X.  LEN is the pre-truncation length, and the
         byte at len - 1 is untouched by the write above, so this
         still reads the character that followed "=".  */
      return path[len - 1];
687 /* Recreate the URL string from the data in urlinfo. This can be used
688 to create a "canonical" representation of the URL. If `hide' is
689 non-zero (as it is when we're calling this on a URL we plan to
690 print, but not when calling it to canonicalize a URL for use within
691 the program), password will be hidden. The forbidden characters in
692 the URL will be cleansed. */
694 str_url (const struct urlinfo *u, int hide)
696 char *res, *host, *user, *passwd, *scheme_name, *dir, *file;
697 int i, l, ln, lu, lh, lp, lf, ld;
698 unsigned short default_port;
700 /* Look for the scheme. */
701 for (i = 0; i < ARRAY_SIZE (supported_schemes); i++)
702 if (supported_schemes[i].scheme == u->scheme)
704 if (i == ARRAY_SIZE (supported_schemes))
706 scheme_name = supported_schemes[i].leading_string;
707 default_port = supported_schemes[i].default_port;
708 host = encode_string (u->host);
709 dir = encode_string (u->dir);
710 file = encode_string (u->file);
711 user = passwd = NULL;
713 user = encode_string (u->user);
717 /* Don't output the password, or someone might see it over the user's
718 shoulder (or in saved wget output). Don't give away the number of
719 characters in the password, either, as we did in past versions of
720 this code, when we replaced the password characters with 'x's. */
721 passwd = xstrdup("<password>");
723 passwd = encode_string (u->passwd);
725 if (u->scheme == SCHEME_FTP && *dir == '/')
727 char *tmp = (char *)xmalloc (strlen (dir) + 3);
728 /*sprintf (tmp, "%%2F%s", dir + 1);*/
732 strcpy (tmp + 3, dir + 1);
737 ln = strlen (scheme_name);
738 lu = user ? strlen (user) : 0;
739 lp = passwd ? strlen (passwd) : 0;
743 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
744 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", scheme_name,
745 (user ? user : ""), (passwd ? ":" : ""),
746 (passwd ? passwd : ""), (user ? "@" : ""),
747 host, u->port, dir, *dir ? "/" : "", file); */
749 memcpy (res, scheme_name, ln);
753 memcpy (res + l, user, lu);
758 memcpy (res + l, passwd, lp);
763 memcpy (res + l, host, lh);
765 if (u->port != default_port)
768 long_to_string (res + l, (long)u->port);
769 l += numdigit (u->port);
772 memcpy (res + l, dir, ld);
776 strcpy (res + l, file);
785 /* Check whether two URL-s are equivalent, i.e. pointing to the same
786 location. Uses parseurl to parse them, and compares the canonical
789 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
790 return 0 on error. */
791 /* Do not compile unused code. */
794 url_equal (const char *url1, const char *url2)
796 struct urlinfo *u1, *u2;
801 err = parseurl (url1, u1, 0);
808 err = parseurl (url2, u2, 0);
815 res = !strcmp (u1->url, u2->url);
823 get_urls_file (const char *file)
825 struct file_memory *fm;
827 const char *text, *text_end;
830 fm = read_file (file);
833 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
836 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
839 text_end = fm->content + fm->length;
840 while (text < text_end)
842 const char *line_beg = text;
843 const char *line_end = memchr (text, '\n', text_end - text);
849 while (line_beg < line_end
850 && ISSPACE (*line_beg))
852 while (line_end > line_beg + 1
853 && ISSPACE (*(line_end - 1)))
855 if (line_end > line_beg)
857 urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
858 memset (entry, 0, sizeof (*entry));
860 entry->url = strdupdelim (line_beg, line_end);
872 /* Free the linked list of urlpos. */
874 free_urlpos (urlpos *l)
878 urlpos *next = l->next;
880 FREE_MAYBE (l->local_name);
886 /* Rotate FNAME opt.backups times */
888 rotate_backups(const char *fname)
890 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
891 char *from = (char *)alloca (maxlen);
892 char *to = (char *)alloca (maxlen);
896 if (stat (fname, &sb) == 0)
897 if (S_ISREG (sb.st_mode) == 0)
900 for (i = opt.backups; i > 1; i--)
902 sprintf (from, "%s.%d", fname, i - 1);
903 sprintf (to, "%s.%d", fname, i);
904 /* #### This will fail on machines without the rename() system
909 sprintf (to, "%s.%d", fname, 1);
913 /* Create all the necessary directories for PATH (a file). Calls
914 mkdirhier() internally. */
916 mkalldirs (const char *path)
923 p = path + strlen (path);
924 for (; *p != '/' && p != path; p--);
925 /* Don't create if it's just a file. */
926 if ((p == path) && (*p != '/'))
928 t = strdupdelim (path, p);
929 /* Check whether the directory exists. */
930 if ((stat (t, &st) == 0))
932 if (S_ISDIR (st.st_mode))
939 /* If the dir exists as a file name, remove it first. This
940 is *only* for Wget to work with buggy old CERN http
941 servers. Here is the scenario: When Wget tries to
942 retrieve a directory without a slash, e.g.
943 http://foo/bar (bar being a directory), CERN server will
944 not redirect it too http://foo/bar/ -- it will generate a
945 directory listing containing links to bar/file1,
946 bar/file2, etc. Wget will lose because it saves this
947 HTML listing to a file `bar', so it cannot create the
948 directory. To work around this, if the file of the same
949 name exists, we just remove it and create the directory
951 DEBUGP (("Removing %s because of directory danger!\n", t));
955 res = make_directory (t);
957 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
963 count_slashes (const char *s)
972 /* Return the path name of the URL-equivalent file name, with a
973 remote-like structure of directories. */
975 mkstruct (const struct urlinfo *u)
977 char *host, *dir, *file, *res, *dirpref;
980 assert (u->dir != NULL);
981 assert (u->host != NULL);
985 char *ptr = u->dir + (*u->dir == '/');
986 int slash_count = 1 + count_slashes (ptr);
987 int cut = MINVAL (opt.cut_dirs, slash_count);
988 for (; cut && *ptr; ptr++)
991 STRDUP_ALLOCA (dir, ptr);
994 dir = u->dir + (*u->dir == '/');
996 host = xstrdup (u->host);
997 /* Check for the true name (or at least a consistent name for saving
998 to directory) of HOST, reusing the hlist if possible. */
999 if (opt.add_hostdir && !opt.simple_check)
1001 char *nhost = realhost (host);
1005 /* Add dir_prefix and hostname (if required) to the beginning of
1007 if (opt.add_hostdir)
1009 if (!DOTP (opt.dir_prefix))
1011 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1012 + strlen (host) + 1);
1013 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
1016 STRDUP_ALLOCA (dirpref, host);
1018 else /* not add_hostdir */
1020 if (!DOTP (opt.dir_prefix))
1021 dirpref = opt.dir_prefix;
1027 /* If there is a prefix, prepend it. */
1030 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1031 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1034 dir = encode_string (dir);
1036 if (l && dir[l - 1] == '/')
1040 file = "index.html";
1044 /* Finally, construct the full name. */
1045 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1046 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1051 /* Return a malloced copy of S, but protect any '/' characters. */
1054 file_name_protect_query_string (const char *s)
1059 for (from = s; *from; from++)
1063 destlen += 2; /* each / gets replaced with %2F, so
1064 it adds two more chars. */
1066 dest = (char *)xmalloc (destlen + 1);
1067 for (from = s, to = dest; *from; from++)
1078 assert (to - dest == destlen);
1083 /* Create a unique filename, corresponding to a given URL. Calls
1084 mkstruct if necessary. Does *not* actually create any directories. */
1086 url_filename (const struct urlinfo *u)
1089 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1093 file = mkstruct (u);
1099 file = xstrdup ("index.html");
1102 /* If the URL came with a query string, u->file will contain
1103 a question mark followed by query string contents. These
1104 contents can contain '/' which would make us create
1105 unwanted directories. These slashes must be protected
1107 if (!strchr (u->file, '/'))
1108 file = xstrdup (u->file);
1111 /*assert (strchr (u->file, '?') != NULL);*/
1112 file = file_name_protect_query_string (u->file);
1119 /* Check whether the prefix directory is something other than "."
1120 before prepending it. */
1121 if (!DOTP (opt.dir_prefix))
1123 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1124 + 1 + strlen (file) + 1);
1125 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1130 /* DOS-ish file systems don't like `%' signs in them; we change it
1135 for (p = file; *p; p++)
1139 #endif /* WINDOWS */
1141 /* Check the cases in which the unique extensions are not used:
1142 1) Clobbering is turned off (-nc).
1143 2) Retrieval with regetting.
1144 3) Timestamping is used.
1145 4) Hierarchy is built.
1147 The exception is the case when file does exist and is a
1148 directory (actually support for bad httpd-s). */
1149 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1150 && !(file_exists_p (file) && !file_non_directory_p (file)))
1153 /* Find a unique name. */
1154 name = unique_name (file);
1159 /* Like strlen(), but allow the URL to be ended with '?'. */
/* Like strlen (), but treats '?' (the start of a query string) as an
   end-of-path delimiter.  NOTE(review): the branch returning q - url
   when Q is non-NULL appears elided in this excerpt; as visible, only
   the no-'?' fallthrough remains.  */
urlpath_length (const char *url)
  const char *q = strchr (url, '?');
  /* No query string: the whole URL is path.  */
  return strlen (url);
1169 /* Find the last occurrence of character C in the range [b, e), or
1170 NULL, if none are present. This is almost completely equivalent to
1171 { *e = '\0'; return strrchr(b); }, except that it doesn't change
1172 the contents of the string. */
1174 find_last_char (const char *b, const char *e, char c)
1182 /* Resolve the result of "linking" a base URI (BASE) to a
1183 link-specified URI (LINK).
1185 Either of the URIs may be absolute or relative, complete with the
1186 host name, or path only. This tries to behave "reasonably" in all
1187 foreseeable cases. It employs little specific knowledge about
1188 schemes or URL-specific stuff -- it just works on strings.
1190 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1191 See uri_merge for a gentler interface to this functionality.
1193 #### This function should handle `./' and `../' so that the evil
1194 path_simplify can go. */
1196 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1202 const char *end = base + urlpath_length (base);
1206 /* LINK is a relative URL: we need to replace everything
1207 after last slash (possibly empty) with LINK.
1209 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1210 our result should be "whatever/foo/qux/xyzzy". */
1211 int need_explicit_slash = 0;
1213 const char *start_insert;
1214 const char *last_slash = find_last_char (base, end, '/');
1217 /* No slash found at all. Append LINK to what we have,
1218 but we'll need a slash as a separator.
1220 Example: if base == "foo" and link == "qux/xyzzy", then
1221 we cannot just append link to base, because we'd get
1222 "fooqux/xyzzy", whereas what we want is
1225 To make sure the / gets inserted, we set
1226 need_explicit_slash to 1. We also set start_insert
1227 to end + 1, so that the length calculations work out
1228 correctly for one more (slash) character. Accessing
1229 that character is fine, since it will be the
1230 delimiter, '\0' or '?'. */
1231 /* example: "foo?..." */
1232 /* ^ ('?' gets changed to '/') */
1233 start_insert = end + 1;
1234 need_explicit_slash = 1;
1236 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1238 /* example: http://host" */
1240 start_insert = end + 1;
1241 need_explicit_slash = 1;
1245 /* example: "whatever/foo/bar" */
1247 start_insert = last_slash + 1;
1250 span = start_insert - base;
1251 constr = (char *)xmalloc (span + linklength + 1);
1253 memcpy (constr, base, span);
1254 if (need_explicit_slash)
1255 constr[span - 1] = '/';
1257 memcpy (constr + span, link, linklength);
1258 constr[span + linklength] = '\0';
1260 else /* *link == `/' */
1262 /* LINK is an absolute path: we need to replace everything
1263 after (and including) the FIRST slash with LINK.
1265 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1266 "/qux/xyzzy", our result should be
1267 "http://host/qux/xyzzy". */
1270 const char *start_insert = NULL; /* for gcc to shut up. */
1271 const char *pos = base;
1272 int seen_slash_slash = 0;
1273 /* We're looking for the first slash, but want to ignore
1276 slash = memchr (pos, '/', end - pos);
1277 if (slash && !seen_slash_slash)
1278 if (*(slash + 1) == '/')
1281 seen_slash_slash = 1;
1285 /* At this point, SLASH is the location of the first / after
1286 "//", or the first slash altogether. START_INSERT is the
1287 pointer to the location where LINK will be inserted. When
1288 examining the last two examples, keep in mind that LINK
1291 if (!slash && !seen_slash_slash)
1292 /* example: "foo" */
1294 start_insert = base;
1295 else if (!slash && seen_slash_slash)
1296 /* example: "http://foo" */
1299 else if (slash && !seen_slash_slash)
1300 /* example: "foo/bar" */
1302 start_insert = base;
1303 else if (slash && seen_slash_slash)
1304 /* example: "http://something/" */
1306 start_insert = slash;
1308 span = start_insert - base;
1309 constr = (char *)xmalloc (span + linklength + 1);
1311 memcpy (constr, base, span);
1313 memcpy (constr + span, link, linklength);
1314 constr[span + linklength] = '\0';
1317 else /* !no_scheme */
1319 constr = strdupdelim (link, link + linklength);
1324 /* Merge BASE with LINK and return the resulting URI. This is an
1325 interface to uri_merge_1 that assumes that LINK is a
1326 zero-terminated string. */
/* Merge BASE with the zero-terminated LINK and return the resulting
   URI (malloc-ed by uri_merge_1).  LINK is resolved relative to BASE
   unless it carries its own scheme, in which case it is taken as-is.  */
uri_merge (const char *base, const char *link)
  return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
1333 /* Optimize URL by host, destructively replacing u->host with realhost
1334 (u->host). Do this regardless of opt.simple_check. */
1336 opt_url (struct urlinfo *u)
1338 /* Find the "true" host. */
1339 char *host = realhost (u->host);
1342 assert (u->dir != NULL); /* the URL must have been parsed */
1343 /* Refresh the printed representation. */
1345 u->url = str_url (u, 0);
1348 /* Returns proxy host address, in accordance with SCHEME. */
1350 getproxy (enum url_scheme scheme)
1353 char *rewritten_url;
1354 static char rewritten_storage[1024];
1359 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1363 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1367 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1369 case SCHEME_INVALID:
1372 if (!proxy || !*proxy)
1375 /* Handle shorthands. */
1376 rewritten_url = rewrite_url_maybe (proxy);
1379 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1380 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1381 proxy = rewritten_storage;
1387 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Return non-zero if HOST should be accessed through a proxy, i.e. if
   HOST is NOT matched by any domain suffix in the NO_PROXY list.  */
no_proxy_match (const char *host, const char **no_proxy)
  return !sufmatch (no_proxy, host);
1397 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1398 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1400 /* Change the links in an HTML document. Accepts a structure that
1401 defines the positions of all the links. */
1403 convert_links (const char *file, urlpos *l)
1405 struct file_memory *fm;
1408 downloaded_file_t downloaded_file_return;
1410 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1413 /* First we do a "dry run": go through the list L and see whether
1414 any URL needs to be converted in the first place. If not, just
1415 leave the file alone. */
1418 for (dry = l; dry; dry = dry->next)
1419 if (dry->convert != CO_NOCONVERT)
1423 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1428 fm = read_file (file);
1431 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1432 file, strerror (errno));
1436 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1437 if (opt.backup_converted && downloaded_file_return)
1438 write_backup_file (file, downloaded_file_return);
1440 /* Before opening the file for writing, unlink the file. This is
1441 important if the data in FM is mmaped. In such case, nulling the
1442 file, which is what fopen() below does, would make us read all
1443 zeroes from the mmaped region. */
1444 if (unlink (file) < 0 && errno != ENOENT)
1446 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1447 file, strerror (errno));
1448 read_file_free (fm);
1451 /* Now open the file for writing. */
1452 fp = fopen (file, "wb");
1455 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1456 file, strerror (errno));
1457 read_file_free (fm);
1460 /* Here we loop through all the URLs in file, replacing those of
1461 them that are downloaded with relative references. */
1463 for (; l; l = l->next)
1465 char *url_start = fm->content + l->pos;
1467 if (l->pos >= fm->length)
1469 DEBUGP (("Something strange is going on. Please investigate."));
1472 /* If the URL is not to be converted, skip it. */
1473 if (l->convert == CO_NOCONVERT)
1475 DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1479 /* Echo the file contents, up to the offending URL's opening
1480 quote, to the outfile. */
1481 fwrite (p, 1, url_start - p, fp);
1483 if (l->convert == CO_CONVERT_TO_RELATIVE)
1485 /* Convert absolute URL to relative. */
1486 char *newname = construct_relative (file, l->local_name);
1487 char *quoted_newname = html_quote_string (newname);
1488 replace_attr (&p, l->size, fp, quoted_newname);
1489 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1490 l->url, newname, l->pos, file));
1492 xfree (quoted_newname);
1494 else if (l->convert == CO_CONVERT_TO_COMPLETE)
1496 /* Convert the link to absolute URL. */
1497 char *newlink = l->url;
1498 char *quoted_newlink = html_quote_string (newlink);
1499 replace_attr (&p, l->size, fp, quoted_newlink);
1500 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1501 newlink, l->pos, file));
1502 xfree (quoted_newlink);
1505 /* Output the rest of the file. */
1506 if (p - fm->content < fm->length)
1507 fwrite (p, 1, fm->length - (p - fm->content), fp);
1509 read_file_free (fm);
1510 logputs (LOG_VERBOSE, _("done.\n"));
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int idx, common, updirs;
  char *result;

  /* An absolute target needs no relativization.  */
  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');

  /* Find COMMON, the index just past the longest shared run of whole
     directory components.  Advance over matching characters; each time
     both names hit a '/' simultaneously, commit the component.  */
  idx = common = 0;
  for (;;)
    {
      while (s1[idx] && s2[idx]
             && (s1[idx] == s2[idx])
             && (s1[idx] != '/')
             && (s2[idx] != '/'))
        ++idx;
      if (s1[idx] == '/' && s2[idx] == '/')
        common = ++idx;
      else
        break;
    }

  /* UPDIRS = number of directories left in S1 past the common prefix;
     each one contributes a "../" hop.  */
  for (updirs = 0; s1[idx]; idx++)
    if (s1[idx] == '/')
      ++updirs;

  /* Assemble "../" x UPDIRS followed by the non-shared tail of S2.  */
  result = (char *)xmalloc (3 * updirs + strlen (s2 + common) + 1);
  for (idx = 0; idx < updirs; idx++)
    memcpy (result + 3 * idx, "../", 3);
  strcpy (result + 3 * idx, s2 + common);
  return result;
}
1567 /* Add URL to the head of the list L. */
1569 add_url (urlpos *l, const char *url, const char *file)
1573 t = (urlpos *)xmalloc (sizeof (urlpos));
1574 memset (t, 0, sizeof (*t));
1575 t->url = xstrdup (url);
1576 t->local_name = xstrdup (file);
1582 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1584 /* Rather than just writing over the original .html file with the
1585 converted version, save the former to *.orig. Note we only do
1586 this for files we've _successfully_ downloaded, so we don't
1587 clobber .orig files sitting around from previous invocations. */
1589 /* Construct the backup filename as the original name plus ".orig". */
1590 size_t filename_len = strlen(file);
1591 char* filename_plus_orig_suffix;
1592 boolean already_wrote_backup_file = FALSE;
1593 slist* converted_file_ptr;
1594 static slist* converted_files = NULL;
1596 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1598 /* Just write "orig" over "html". We need to do it this way
1599 because when we're checking to see if we've downloaded the
1600 file before (to see if we can skip downloading it), we don't
1601 know if it's a text/html file. Therefore we don't know yet
1602 at that stage that -E is going to cause us to tack on
1603 ".html", so we need to compare vs. the original URL plus
1604 ".orig", not the original URL plus ".html.orig". */
1605 filename_plus_orig_suffix = alloca (filename_len + 1);
1606 strcpy(filename_plus_orig_suffix, file);
1607 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1609 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1611 /* Append ".orig" to the name. */
1612 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1613 strcpy(filename_plus_orig_suffix, file);
1614 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1617 /* We can get called twice on the same URL thanks to the
1618 convert_all_links() call in main(). If we write the .orig file
1619 each time in such a case, it'll end up containing the first-pass
1620 conversion, not the original file. So, see if we've already been
1621 called on this file. */
1622 converted_file_ptr = converted_files;
1623 while (converted_file_ptr != NULL)
1624 if (strcmp(converted_file_ptr->string, file) == 0)
1626 already_wrote_backup_file = TRUE;
1630 converted_file_ptr = converted_file_ptr->next;
1632 if (!already_wrote_backup_file)
1634 /* Rename <file> to <file>.orig before former gets written over. */
1635 if (rename(file, filename_plus_orig_suffix) != 0)
1636 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1637 file, filename_plus_orig_suffix, strerror (errno));
1639 /* Remember that we've already written a .orig backup for this file.
1640 Note that we never free this memory since we need it till the
1641 convert_all_links() call, which is one of the last things the
1642 program does before terminating. BTW, I'm not sure if it would be
1643 safe to just set 'converted_file_ptr->string' to 'file' below,
1644 rather than making a copy of the string... Another note is that I
1645 thought I could just add a field to the urlpos structure saying
1646 that we'd written a .orig file for this URL, but that didn't work,
1647 so I had to make this separate list.
1648 -- Dan Harkless <wget@harkless.org>
1650 This [adding a field to the urlpos structure] didn't work
1651 because convert_file() is called twice: once after all its
1652 sublinks have been retrieved in recursive_retrieve(), and
1653 once at the end of the day in convert_all_links(). The
1654 original linked list collected in recursive_retrieve() is
1655 lost after the first invocation of convert_links(), and
1656 convert_all_links() makes a new one (it calls get_urls_html()
1657 for each file it covers.) That's why your first approach didn't
1658 work. The way to make it work is perhaps to make this flag a
1659 field in the `urls_html' list.
1660 -- Hrvoje Niksic <hniksic@arsdigita.com>
1662 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1663 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1664 converted_file_ptr->next = converted_files;
1665 converted_files = converted_file_ptr;
1669 static int find_fragment PARAMS ((const char *, int, const char **,
/* Replace an attribute's original text in file FP with NEW_STR.
   *PP points to the opening quote (or the first character, when the
   value is unquoted) of the attribute whose raw text occupies
   RAW_SIZE bytes.  The quote style of the original is preserved, a
   fragment identifier ("#foo") present in the old value is retained,
   and *PP is advanced past the original attribute text so the caller
   can continue echoing from there.  */
static void
replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
{
  const char *p = *pp;
  int quote_flag = 0;
  int size = raw_size;
  char quote_char = '\"';
  const char *frag_beg, *frag_end;

  /* Structure of our string is:
       "...old-contents..."
       <--- l->size --->  (with quotes)
     OR:
       ...old-contents...
       <--- l->size -->   (no quotes)   */

  if (*p == '\"' || *p == '\'')
    {
      quote_char = *p;
      quote_flag = 1;
      ++p;
      size -= 2;		/* disregard opening and closing quote */
    }
  putc (quote_char, fp);
  fputs (new_str, fp);

  /* Look for fragment identifier, if any. */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  /* Skip the original attribute value (and its closing quote, if
     any), then publish the advanced position back to the caller --
     without this write-back convert_links() would re-emit the old
     attribute text.  */
  p += size;
  if (quote_flag)
    ++p;
  putc (quote_char, fp);
  *pp = p;
}
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return zero.  If
   the character is found, return 1 and set BP and EP to point to the
   beginning and end of the region.

   This is used for finding the fragment identifiers in URLs.  */

static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  int saw_amp = 0;
  for (; beg < end; beg++)
    {
      switch (*beg)
        {
        case '&':
          /* A '#' right after '&' belongs to an SGML entity such as
             "&#32;", not to a fragment identifier.  */
          saw_amp = 1;
          break;
        case '#':
          if (!saw_amp)
            {
              *bp = beg;
              *ep = end;
              return 1;
            }
          /* fallthrough */
        default:
          saw_amp = 0;
        }
    }
  return 0;
}
1742 typedef struct _downloaded_file_list {
1744 downloaded_file_t download_type;
1745 struct _downloaded_file_list* next;
1746 } downloaded_file_list;
1748 static downloaded_file_list *downloaded_files;
1750 /* Remembers which files have been downloaded. In the standard case, should be
1751 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1752 download successfully (i.e. not for ones we have failures on or that we skip
1755 When we've downloaded a file and tacked on a ".html" extension due to -E,
1756 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1757 FILE_DOWNLOADED_NORMALLY.
1759 If you just want to check if a file has been previously added without adding
1760 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1761 with local filenames, not remote URLs. */
1763 downloaded_file (downloaded_file_t mode, const char* file)
1765 boolean found_file = FALSE;
1766 downloaded_file_list* rover = downloaded_files;
1768 while (rover != NULL)
1769 if (strcmp(rover->file, file) == 0)
1775 rover = rover->next;
1778 return rover->download_type; /* file had already been downloaded */
1781 if (mode != CHECK_FOR_FILE)
1783 rover = xmalloc(sizeof(*rover));
1784 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1785 rover->download_type = mode;
1786 rover->next = downloaded_files;
1787 downloaded_files = rover;
1790 return FILE_NOT_ALREADY_DOWNLOADED;
1795 downloaded_files_free (void)
1797 downloaded_file_list* rover = downloaded_files;
1800 downloaded_file_list *next = rover->next;
1801 xfree (rover->file);