   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#include <sys/types.h>

#define DOTP(x)  ((*(x) == '.') && (!*(x + 1)))
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))

static int urlpath_length PARAMS ((const char *));
  enum url_scheme scheme;

/* Supported schemes: */
static struct scheme_data supported_schemes[] =
{
  { SCHEME_HTTP,  "http://",  DEFAULT_HTTP_PORT },
  { SCHEME_HTTPS, "https://", DEFAULT_HTTPS_PORT },
  { SCHEME_FTP,   "ftp://",   DEFAULT_FTP_PORT }
};
static void parse_dir PARAMS ((const char *, char **, char **));
static uerr_t parse_uname PARAMS ((const char *, char **, char **));
static char *construct_relative PARAMS ((const char *, const char *));
static char process_ftp_type PARAMS ((char *));

/* Support for encoding and decoding of URL strings.  We determine
   whether a character is unsafe through static table lookup.  This
   code assumes ASCII character set and 8-bit chars.  */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U			/* both reserved and unsafe, as used
				   for `@' in the table below */

#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))

/* rfc1738 reserved chars.  We don't use this yet; preservation of
   reserved chars will be implemented when I integrate the new
   `reencode_string' function.  */
#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
/* Unsafe chars:
   - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
   - '@' and ':'; needed for encoding URL username and password.
   - anything >= 127.  */
#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
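
/* With the table below, for example (illustrative only):

     UNSAFE_CHAR (' ')   and  UNSAFE_CHAR ('%')   are non-zero,
     UNSAFE_CHAR ('a')   and  UNSAFE_CHAR ('-')   are zero,
     RESERVED_CHAR ('&') is non-zero.  */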
static const unsigned char urlchr_table[256] =
{
  U, U, U, U, U, U, U, U,   /* NUL SOH STX ETX EOT ENQ ACK BEL */
  U, U, U, U, U, U, U, U,   /* BS  HT  LF  VT  FF  CR  SO  SI  */
  U, U, U, U, U, U, U, U,   /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
  U, U, U, U, U, U, U, U,   /* CAN EM  SUB ESC FS  GS  RS  US  */
  U, 0, U, U, 0, U, R, 0,   /* SP  !   "   #   $   %   &   '   */
  0, 0, 0, R, 0, 0, 0, R,   /* (   )   *   +   ,   -   .   /   */
  0, 0, 0, 0, 0, 0, 0, 0,   /* 0   1   2   3   4   5   6   7   */
  0, 0, U, R, U, R, U, R,   /* 8   9   :   ;   <   =   >   ?   */
 RU, 0, 0, 0, 0, 0, 0, 0,   /* @   A   B   C   D   E   F   G   */
  0, 0, 0, 0, 0, 0, 0, 0,   /* H   I   J   K   L   M   N   O   */
  0, 0, 0, 0, 0, 0, 0, 0,   /* P   Q   R   S   T   U   V   W   */
  0, 0, 0, U, U, U, U, 0,   /* X   Y   Z   [   \   ]   ^   _   */
  U, 0, 0, 0, 0, 0, 0, 0,   /* `   a   b   c   d   e   f   g   */
  0, 0, 0, 0, 0, 0, 0, 0,   /* h   i   j   k   l   m   n   o   */
  0, 0, 0, 0, 0, 0, 0, 0,   /* p   q   r   s   t   u   v   w   */
  0, 0, 0, U, U, U, U, U,   /* x   y   z   {   |   }   ~   DEL */

  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,

  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
};
/* Decodes the forms %xy in a URL to the character the hexadecimal
   code of which is xy.  xy are hexadecimal digits from
   [0123456789ABCDEF] (case-insensitive).  If x or y are not
   hex-digits or `%' precedes `\0', the sequence is inserted
   literally.  */

decode_string (char *s)

  char *t = s;			/* t - tortoise */
  char *h = s;			/* h - hare     */

      /* Do nothing if '%' is not followed by two hex digits. */
      if (!*(h + 1) || !*(h + 2)
	  || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))

      *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
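
/* For example (illustrative only):

     char buf[] = "foo%20bar%3F";
     decode_string (buf);           => buf now holds "foo bar?"

   An ill-formed sequence such as "%zz", or a lone `%' at the very end
   of the string, is left untouched, as described above.  */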
/* Like encode_string, but return S if there are no unsafe chars.  */

encode_string_maybe (const char *s)

  for (p1 = s; *p1; p1++)
    if (UNSAFE_CHAR (*p1))
      addition += 2;		/* Two more characters (hex digits) */

  newlen = (p1 - s) + addition;
  newstr = (char *)xmalloc (newlen + 1);

      if (UNSAFE_CHAR (*p1))

	  const unsigned char c = *p1++;

	  *p2++ = XDIGIT_TO_XCHAR (c >> 4);
	  *p2++ = XDIGIT_TO_XCHAR (c & 0xf);

  assert (p2 - newstr == newlen);
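
/* For example (illustrative only):

     char *e = encode_string_maybe ("foo bar");   => "foo%20bar" (freshly allocated)
     char *f = encode_string_maybe ("foobar");    => "foobar" (S itself, not a copy)

   so a caller must compare the result against S before deciding
   whether to xfree() it, which is what the ENCODE macro below does.  */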
/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX encoded string.  */

encode_string (const char *s)

  char *encoded = encode_string_maybe (s);

/* Encode unsafe characters in PTR to %xx.  If such encoding is done,
   the old value of PTR is freed and PTR is made to point to the newly
   allocated storage.  */

#define ENCODE(ptr) do {			\
  char *e_new = encode_string_maybe (ptr);	\

/* Returns the scheme type if the scheme is supported, or
   SCHEME_INVALID if not.  */

url_scheme (const char *url)

  for (i = 0; i < ARRAY_SIZE (supported_schemes); i++)
    if (!strncasecmp (url, supported_schemes[i].leading_string,
		      strlen (supported_schemes[i].leading_string)))
      return supported_schemes[i].scheme;
  return SCHEME_INVALID;
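
/* For example (illustrative only):

     url_scheme ("http://www.gnu.org/")      => SCHEME_HTTP
     url_scheme ("FTP://ftp.gnu.org/")       => SCHEME_FTP     (match is case-insensitive)
     url_scheme ("mailto:bug-wget@gnu.org")  => SCHEME_INVALID (not in supported_schemes)  */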
/* Return the number of characters needed to skip the scheme part of
   the URL, e.g. `http://'.  If no scheme is found, returns 0.  */

url_skip_scheme (const char *url)

  /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
  while (ISALNUM (*p) || *p == '-' || *p == '+')

  /* Skip "//" if found.  */
  if (*p == '/' && *(p + 1) == '/')
/* Returns 1 if the URL begins with a scheme (supported or
   unsupported), 0 otherwise.  */

url_has_scheme (const char *url)

  while (ISALNUM (*p) || *p == '-' || *p == '+')

/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the scheme.

   If no username and password are found, return 0.  */

url_skip_uname (const char *url)

  const char *q = NULL;
  for (p = url; *p && *p != '/'; p++)
    if (*p == '@') q = p;
  /* If a `@' was found before the first occurrence of `/', skip
/* Used by main.c: detect URLs written using the "shorthand" URL forms
   popularized by Netscape and NcFTP.  HTTP shorthands look like this:

   www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
   www.foo.com[:port]            -> http://www.foo.com[:port]

   FTP shorthands look like this:

   foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
   foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file

   If the URL does not need to be, or cannot be, rewritten, return NULL.  */

rewrite_url_maybe (const char *url)

  if (url_has_scheme (url))

  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
  for (p = url; *p && *p != ':' && *p != '/'; p++)

      const char *pp, *path;

      /* If the characters after the colon and before the next slash
	 or end of string are all digits, it's HTTP.  */

      for (pp = p + 1; ISDIGIT (*pp); pp++)
	  && (*pp == '/' || *pp == '\0'))

	  /* Prepend "ftp://" to the entire URL... */
	  res = xmalloc (6 + strlen (url) + 1);
	  sprintf (res, "ftp://%s", url);
	  /* ...and replace ':' with '/'. */
	  res[6 + (p - url)] = '/';

      /* Just prepend "http://" to what we have. */
      res = xmalloc (7 + strlen (url) + 1);
      sprintf (res, "http://%s", url);
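
/* For example (illustrative only):

     rewrite_url_maybe ("www.gnu.org/software")   => "http://www.gnu.org/software"
     rewrite_url_maybe ("ftp.gnu.org:/gnu/wget")  => "ftp://ftp.gnu.org//gnu/wget"
     rewrite_url_maybe ("http://www.gnu.org/")    => NULL (already has a scheme)

   The non-NULL results are freshly allocated, so the caller is
   presumably expected to free them when done.  */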
/* Allocate a new urlinfo structure, fill it with default values and
   return a pointer to it.  */

  u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
  memset (u, 0, sizeof (*u));
  u->scheme = SCHEME_INVALID;

/* Perform a "deep" free of the urlinfo structure.  The structure
   should have been created with newurl, but need not have been used.
   If COMPLETE is non-0, free the pointer itself.  */

freeurl (struct urlinfo *u, int complete)

  FREE_MAYBE (u->host);
  FREE_MAYBE (u->path);
  FREE_MAYBE (u->file);

  FREE_MAYBE (u->user);
  FREE_MAYBE (u->passwd);
  FREE_MAYBE (u->local);
  FREE_MAYBE (u->referer);

    freeurl (u->proxy, 1);
enum url_parse_error {
  PE_UNRECOGNIZED_SCHEME, PE_BAD_PORT

/* Extract the given URL of the form
   (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?

   1. hostname (terminated with `/' or `:')
   2. port number (terminated with `/'), or chosen for the scheme
   3. dirname (everything after hostname)

   Most errors are handled.  No allocation is done, you must supply
   pointers to allocated memory.
   ...and a host of other stuff :-)

   - Recognizes hostname:dir/file for FTP and
     hostname (:portnum)?/dir/file for HTTP.
   - Parses the path to yield directory and file
   - Parses the URL to yield the username and passwd (if present)
   - Decodes the strings, in case they contain "forbidden" characters
   - Writes the result to struct urlinfo

   If the argument STRICT is set, it recognizes only the canonical

parseurl (const char *url, struct urlinfo *u, int strict)
  int recognizable;		/* Recognizable URL is the one where
				   the scheme was explicitly named,
				   i.e. it wasn't deduced from the URL

  uerr_t type = URLUNKNOWN;

  DEBUGP (("parseurl (\"%s\") -> ", url));
  recognizable = url_has_scheme (url);
  if (strict && !recognizable)

  for (i = 0, l = 0; i < ARRAY_SIZE (supported_schemes); i++)

      l = strlen (supported_schemes[i].leading_string);
      if (!strncasecmp (supported_schemes[i].leading_string, url, l))

  /* If scheme is recognizable, but unsupported, bail out, else
  if (recognizable && i == ARRAY_SIZE (supported_schemes))

  else if (i == ARRAY_SIZE (supported_schemes))

  u->scheme = supported_schemes[i].scheme;
  if (u->scheme == SCHEME_HTTP)

  if (u->scheme == SCHEME_HTTPS)

  if (u->scheme == SCHEME_FTP)

  if (type == URLUNKNOWN)

  /* Allow a username and password to be specified (i.e. just skip

  l += url_skip_uname (url + l);
  for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);

  /* Get the hostname. */
  u->host = strdupdelim (url + l, url + i);
  DEBUGP (("host %s -> ", u->host));

  /* Assume no port has been given. */

  /* We have a colon delimiting the hostname.  It could mean that
     a port number is following it, or a directory. */
  if (ISDIGIT (url[++i]))	/* A port number */

      if (type == URLUNKNOWN)

	  u->scheme = SCHEME_HTTP;

      for (; url[i] && url[i] != '/'; i++)
	if (ISDIGIT (url[i]))
	  u->port = 10 * u->port + (url[i] - '0');

      DEBUGP (("port %hu -> ", u->port));

  else if (type == URLUNKNOWN)	/* or a directory */

      u->scheme = SCHEME_FTP;

  else				/* or just a misformed port number */

  else if (type == URLUNKNOWN)

      u->scheme = SCHEME_HTTP;

  for (ind = 0; ind < ARRAY_SIZE (supported_schemes); ind++)
    if (supported_schemes[ind].scheme == u->scheme)
  if (ind == ARRAY_SIZE (supported_schemes))
  u->port = supported_schemes[ind].default_port;
  /* Some delimiter troubles... */
  if (url[i] == '/' && url[i - 1] != ':')

  if (u->scheme == SCHEME_HTTP)
    while (url[i] && url[i] == '/')

  u->path = (char *)xmalloc (strlen (url + i) + 8);
  strcpy (u->path, url + i);
  if (u->scheme == SCHEME_FTP)

      u->ftp_type = process_ftp_type (u->path);
      /* #### We don't handle type `d' correctly yet. */
      if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')

      DEBUGP (("ftp_type %c -> ", u->ftp_type));

  DEBUGP (("opath %s -> ", u->path));
  /* Parse the username and password (if existing). */
  parse_uname (url, &u->user, &u->passwd);
  /* Decode the strings, as per RFC 1738. */
  decode_string (u->host);
  decode_string (u->path);
    decode_string (u->user);
    decode_string (u->passwd);
  /* Parse the directory. */
  parse_dir (u->path, &u->dir, &u->file);
  DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
  /* Simplify the directory. */
  path_simplify (u->dir);
  /* Remove the leading `/' in HTTP. */
  if (u->scheme == SCHEME_HTTP && *u->dir == '/')
    strcpy (u->dir, u->dir + 1);
  DEBUGP (("ndir %s\n", u->dir));
  /* Strip trailing `/'. */
  if (l > 1 && u->dir[l - 1] == '/')
    u->dir[l - 1] = '\0';
  /* Re-create the path: */
  abs_ftp = (u->scheme == SCHEME_FTP && *u->dir == '/');
  /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
     abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
  strcpy (u->path, abs_ftp ? "%2F" : "/");
  strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
  strcat (u->path, *u->dir ? "/" : "");
  strcat (u->path, u->file);

  DEBUGP (("newpath: %s\n", u->path));
  /* Create the clean URL. */
  u->url = str_url (u, 0);
/* Special versions of DOTP and DDOTP for parse_dir().  They work like
   DOTP and DDOTP, but they also recognize `?' as end-of-string
   delimiter.  This is needed for correct handling of query
   strings.  */

#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
		     && (!*((x) + 2) || *((x) + 2) == '?'))
/* Build the directory and filename components of the path.  Both
   components are *separately* malloc-ed strings!  It does not change
   the contents of path.

   If the path ends with "." or "..", they are (correctly) counted as

parse_dir (const char *path, char **dir, char **file)

  l = urlpath_length (path);
  for (i = l; i && path[i] != '/'; i--);

  if (!i && *path != '/')	/* Just filename */

      if (PD_DOTP (path) || PD_DDOTP (path))

	  *dir = strdupdelim (path, path + l);
	  *file = xstrdup (path + l);	/* normally empty, but could

	  *dir = xstrdup ("");		/* This is required because of FTP */
	  *file = xstrdup (path);

  else if (!i)			/* /filename */

      if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))

	  *dir = strdupdelim (path, path + l);
	  *file = xstrdup (path + l);	/* normally empty, but could

	  *dir = xstrdup ("/");
	  *file = xstrdup (path + 1);

  else				/* Nonempty directory with or without a filename */

      if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))

	  *dir = strdupdelim (path, path + l);
	  *file = xstrdup (path + l);	/* normally empty, but could

	  *dir = strdupdelim (path, path + i);
	  *file = xstrdup (path + i + 1);
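
/* For example (illustrative only):

     parse_dir ("/foo/bar/baz.html", &d, &f)  => d = "/foo/bar", f = "baz.html"
     parse_dir ("/baz.html", &d, &f)          => d = "/",        f = "baz.html"
     parse_dir ("baz.html", &d, &f)           => d = "",         f = "baz.html"

   Both strings are separately allocated, as noted above, so the
   caller frees each of them on its own.  */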
/* Find the optional username and password within the URL, as per
   RFC1738.  The returned user and passwd char pointers are

parse_uname (const char *url, char **user, char **passwd)

  const char *p, *q, *col;

  /* Look for the end of the scheme identifier. */
  l = url_skip_scheme (url);

  /* Is there an `@' character? */
  for (p = url; *p && *p != '/'; p++)

  /* If not, return. */

  /* Else find the username and password. */
  for (p = q = col = url; *p && *p != '/'; p++)

      if (*p == ':' && !*user)

	  *user = (char *)xmalloc (p - url + 1);
	  memcpy (*user, url, p - url);
	  (*user)[p - url] = '\0';

      if (*p == '@') q = p;

  /* Decide whether you have only the username or both. */
  where = *user ? passwd : user;
  *where = (char *)xmalloc (q - col + 1);
  memcpy (*where, col, q - col);
  (*where)[q - col] = '\0';
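
/* For example (illustrative only), parsing
   "http://hniksic:secret@jagor.srce.hr/" should yield *user ==
   "hniksic" and *passwd == "secret", both freshly allocated and still
   %XX-encoded; parseurl runs decode_string on them afterwards.  */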
/* If PATH ends with `;type=X', return the character X.  */

process_ftp_type (char *path)

  int len = strlen (path);

      && !memcmp (path + len - 7, ";type=", 6))

      path[len - 7] = '\0';
      return path[len - 1];
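
/* For example (illustrative only), for a PATH of
   "/pub/gnu/wget.tar.gz;type=i" this returns 'i' and truncates the
   path to "/pub/gnu/wget.tar.gz"; with no `;type=' suffix it
   presumably returns 0, which parseurl treats as "no type given".  */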
/* Recreate the URL string from the data in urlinfo.  This can be used
   to create a "canonical" representation of the URL.  If `hide' is
   non-zero (as it is when we're calling this on a URL we plan to
   print, but not when calling it to canonicalize a URL for use within
   the program), password will be hidden.  The forbidden characters in
   the URL will be cleansed.  */

str_url (const struct urlinfo *u, int hide)

  char *res, *host, *user, *passwd, *scheme_name, *dir, *file;
  int i, l, ln, lu, lh, lp, lf, ld;
  unsigned short default_port;

  /* Look for the scheme. */
  for (i = 0; i < ARRAY_SIZE (supported_schemes); i++)
    if (supported_schemes[i].scheme == u->scheme)
  if (i == ARRAY_SIZE (supported_schemes))
  scheme_name = supported_schemes[i].leading_string;
  default_port = supported_schemes[i].default_port;
  host = encode_string (u->host);
  dir = encode_string (u->dir);
  file = encode_string (u->file);
  user = passwd = NULL;
    user = encode_string (u->user);

      /* Don't output the password, or someone might see it over the user's
	 shoulder (or in saved wget output).  Don't give away the number of
	 characters in the password, either, as we did in past versions of
	 this code, when we replaced the password characters with 'x's. */
      passwd = xstrdup ("<password>");

      passwd = encode_string (u->passwd);

  if (u->scheme == SCHEME_FTP && *dir == '/')

      char *tmp = (char *)xmalloc (strlen (dir) + 3);
      /*sprintf (tmp, "%%2F%s", dir + 1);*/

      strcpy (tmp + 3, dir + 1);

  ln = strlen (scheme_name);
  lu = user ? strlen (user) : 0;
  lp = passwd ? strlen (passwd) : 0;

  res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
  /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", scheme_name,
     (user ? user : ""), (passwd ? ":" : ""),
     (passwd ? passwd : ""), (user ? "@" : ""),
     host, u->port, dir, *dir ? "/" : "", file); */

  memcpy (res, scheme_name, ln);
      memcpy (res + l, user, lu);
      memcpy (res + l, passwd, lp);
  memcpy (res + l, host, lh);

  if (u->port != default_port)

      long_to_string (res + l, (long)u->port);
      l += numdigit (u->port);

  memcpy (res + l, dir, ld);

  strcpy (res + l, file);
/* Check whether two URL-s are equivalent, i.e. pointing to the same
   location.  Uses parseurl to parse them, and compares the canonical

   Returns 1 if URL1 is equivalent to URL2, 0 otherwise.  Also
   returns 0 on error.  */
/* Do not compile unused code. */

url_equal (const char *url1, const char *url2)

  struct urlinfo *u1, *u2;

  err = parseurl (url1, u1, 0);

  err = parseurl (url2, u2, 0);

  res = !strcmp (u1->url, u2->url);
get_urls_file (const char *file)

  struct file_memory *fm;

  const char *text, *text_end;

  fm = read_file (file);

      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));

  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));

  text_end = fm->content + fm->length;
  while (text < text_end)

      const char *line_beg = text;
      const char *line_end = memchr (text, '\n', text_end - text);

      while (line_beg < line_end
	     && ISSPACE (*line_beg))
      while (line_end > line_beg + 1
	     && ISSPACE (*(line_end - 1)))
      if (line_end > line_beg)

	  urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
	  memset (entry, 0, sizeof (*entry));

	  entry->url = strdupdelim (line_beg, line_end);

/* Free the linked list of urlpos. */

free_urlpos (urlpos *l)

      urlpos *next = l->next;

      FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times */

rotate_backups (const char *fname)

  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
  char *from = (char *)alloca (maxlen);
  char *to = (char *)alloca (maxlen);

  if (stat (fname, &sb) == 0)
    if (S_ISREG (sb.st_mode) == 0)

  for (i = opt.backups; i > 1; i--)

      sprintf (from, "%s.%d", fname, i - 1);
      sprintf (to, "%s.%d", fname, i);
      /* #### This will fail on machines without the rename() system

  sprintf (to, "%s.%d", fname, 1);
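
/* For example (illustrative only), with opt.backups == 3 and FNAME
   "wget.log", the loop above should end up renaming wget.log.2 to
   wget.log.3, then wget.log.1 to wget.log.2, and finally wget.log
   itself to wget.log.1.  */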
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally. */

mkalldirs (const char *path)

  p = path + strlen (path);
  for (; *p != '/' && p != path; p--);
  /* Don't create if it's just a file. */
  if ((p == path) && (*p != '/'))
  t = strdupdelim (path, p);
  /* Check whether the directory exists. */
  if ((stat (t, &st) == 0))

      if (S_ISDIR (st.st_mode))

	  /* If the dir exists as a file name, remove it first.  This
	     is *only* for Wget to work with buggy old CERN http
	     servers.  Here is the scenario: When Wget tries to
	     retrieve a directory without a slash, e.g.
	     http://foo/bar (bar being a directory), CERN server will
	     not redirect it to http://foo/bar/ -- it will generate a
	     directory listing containing links to bar/file1,
	     bar/file2, etc.  Wget will lose because it saves this
	     HTML listing to a file `bar', so it cannot create the
	     directory.  To work around this, if the file of the same
	     name exists, we just remove it and create the directory

	  DEBUGP (("Removing %s because of directory danger!\n", t));

  res = make_directory (t);

      logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));

count_slashes (const char *s)
/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories.  */

mkstruct (const struct urlinfo *u)

  char *host, *dir, *file, *res, *dirpref;

  assert (u->dir != NULL);
  assert (u->host != NULL);

      char *ptr = u->dir + (*u->dir == '/');
      int slash_count = 1 + count_slashes (ptr);
      int cut = MINVAL (opt.cut_dirs, slash_count);
      for (; cut && *ptr; ptr++)

      STRDUP_ALLOCA (dir, ptr);

    dir = u->dir + (*u->dir == '/');

  host = xstrdup (u->host);
  /* Check for the true name (or at least a consistent name for saving
     to directory) of HOST, reusing the hlist if possible. */
  if (opt.add_hostdir && !opt.simple_check)

      char *nhost = realhost (host);

  /* Add dir_prefix and hostname (if required) to the beginning of
  if (opt.add_hostdir)

      if (!DOTP (opt.dir_prefix))

	  dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
				    + strlen (host) + 1);
	  sprintf (dirpref, "%s/%s", opt.dir_prefix, host);

	STRDUP_ALLOCA (dirpref, host);

  else				/* not add_hostdir */

      if (!DOTP (opt.dir_prefix))
	dirpref = opt.dir_prefix;

  /* If there is a prefix, prepend it. */

      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
      sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);

  dir = encode_string (dir);

  if (l && dir[l - 1] == '/')

    file = "index.html";

  /* Finally, construct the full name. */
  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
  sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
/* Return a malloced copy of S, but protect any '/' characters. */

file_name_protect_query_string (const char *s)

  for (from = s; *from; from++)

	destlen += 2;		/* each / gets replaced with %2F, so
				   it adds two more chars.  */

  dest = (char *)xmalloc (destlen + 1);
  for (from = s, to = dest; *from; from++)

  assert (to - dest == destlen);
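
/* For example (illustrative only), "index.php?src=a/b" comes back as
   "index.php?src=a%2Fb", so a slash inside a query string can no
   longer be mistaken for a directory separator when the result is
   used as a local file name.  */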
/* Create a unique filename, corresponding to a given URL.  Calls
   mkstruct if necessary.  Does *not* actually create any directories.  */

url_filename (const struct urlinfo *u)

  int have_prefix = 0;		/* whether we must prepend opt.dir_prefix */

      file = mkstruct (u);

	file = xstrdup ("index.html");

	  /* If the URL came with a query string, u->file will contain
	     a question mark followed by query string contents.  These
	     contents can contain '/' which would make us create
	     unwanted directories.  These slashes must be protected

	  if (!strchr (u->file, '/'))
	    file = xstrdup (u->file);

	      /*assert (strchr (u->file, '?') != NULL);*/
	      file = file_name_protect_query_string (u->file);

  /* Check whether the prefix directory is something other than "."
     before prepending it. */
  if (!DOTP (opt.dir_prefix))

      char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
				     + 1 + strlen (file) + 1);
      sprintf (nfile, "%s/%s", opt.dir_prefix, file);

  /* DOS-ish file systems don't like `%' signs in them; we change it

  for (p = file; *p; p++)

#endif /* WINDOWS */

  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.

     The exception is the case when file does exist and is a
     directory (actually support for bad httpd-s).  */
  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
      && !(file_exists_p (file) && !file_non_directory_p (file)))

  /* Find a unique name. */
  name = unique_name (file);
/* Like strlen(), but allow the URL to be ended with '?'. */

urlpath_length (const char *url)

  const char *q = strchr (url, '?');

  return strlen (url);

/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is almost completely equivalent to
   { *e = '\0'; return strrchr (b, c); }, except that it doesn't change
   the contents of the string.  */

find_last_char (const char *b, const char *e, char c)

/* Resolve the result of "linking" a base URI (BASE) to a
   link-specified URI (LINK).

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to behave "reasonably" in all
   foreseeable cases.  It employs little specific knowledge about
   schemes or URL-specific stuff -- it just works on strings.

   The parameter LINKLENGTH is useful if LINK is not zero-terminated.
   See uri_merge for a gentler interface to this functionality.

   #### This function should handle `./' and `../' so that the evil
   path_simplify can go.  */
uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)

      const char *end = base + urlpath_length (base);

	  /* LINK is a relative URL: we need to replace everything
	     after last slash (possibly empty) with LINK.

	     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
	     our result should be "whatever/foo/qux/xyzzy".  */
	  int need_explicit_slash = 0;

	  const char *start_insert;
	  const char *last_slash = find_last_char (base, end, '/');

	      /* No slash found at all.  Append LINK to what we have,
		 but we'll need a slash as a separator.

		 Example: if base == "foo" and link == "qux/xyzzy", then
		 we cannot just append link to base, because we'd get
		 "fooqux/xyzzy", whereas what we want is
		 "foo/qux/xyzzy".

		 To make sure the / gets inserted, we set
		 need_explicit_slash to 1.  We also set start_insert
		 to end + 1, so that the length calculations work out
		 correctly for one more (slash) character.  Accessing
		 that character is fine, since it will be the
		 delimiter, '\0' or '?'.  */
	      /* example: "foo?..."  */
	      /*               ^    ('?' gets changed to '/') */
	      start_insert = end + 1;
	      need_explicit_slash = 1;

	  else if (last_slash && last_slash != base && *(last_slash - 1) == '/')

	      /* example: http://host"  */
	      start_insert = end + 1;
	      need_explicit_slash = 1;

	      /* example: "whatever/foo/bar" */
	      start_insert = last_slash + 1;

	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  memcpy (constr, base, span);
	  if (need_explicit_slash)
	    constr[span - 1] = '/';
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
      else /* *link == `/' */

	  /* LINK is an absolute path: we need to replace everything
	     after (and including) the FIRST slash with LINK.

	     So, if BASE is "http://host/whatever/foo/bar", and LINK is
	     "/qux/xyzzy", our result should be
	     "http://host/qux/xyzzy".  */

	  const char *start_insert = NULL; /* for gcc to shut up. */
	  const char *pos = base;
	  int seen_slash_slash = 0;
	  /* We're looking for the first slash, but want to ignore

	  slash = memchr (pos, '/', end - pos);
	  if (slash && !seen_slash_slash)
	    if (*(slash + 1) == '/')

		seen_slash_slash = 1;

	  /* At this point, SLASH is the location of the first / after
	     "//", or the first slash altogether.  START_INSERT is the
	     pointer to the location where LINK will be inserted.  When
	     examining the last two examples, keep in mind that LINK

	  if (!slash && !seen_slash_slash)
	    /* example: "foo" */
	    start_insert = base;
	  else if (!slash && seen_slash_slash)
	    /* example: "http://foo" */

	  else if (slash && !seen_slash_slash)
	    /* example: "foo/bar" */
	    start_insert = base;
	  else if (slash && seen_slash_slash)
	    /* example: "http://something/" */
	    start_insert = slash;

	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  memcpy (constr, base, span);
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';

  else /* !no_scheme */

      constr = strdupdelim (link, link + linklength);

/* Merge BASE with LINK and return the resulting URI.  This is an
   interface to uri_merge_1 that assumes that LINK is a
   zero-terminated string.  */

uri_merge (const char *base, const char *link)

  return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
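
/* For example (illustrative only):

     uri_merge ("http://host/dir/page.html", "img/x.gif")     => "http://host/dir/img/x.gif"
     uri_merge ("http://host/dir/page.html", "/top.html")     => "http://host/top.html"
     uri_merge ("http://host/dir/page.html", "ftp://other/f") => "ftp://other/f"

   i.e. a relative link replaces everything after the last slash of
   BASE, an absolute path replaces everything after the host, and a
   link that carries its own scheme is returned as a plain copy.  */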
/* Optimize URL by host, destructively replacing u->host with realhost
   (u->host).  Do this regardless of opt.simple_check.  */

opt_url (struct urlinfo *u)

  /* Find the "true" host. */
  char *host = realhost (u->host);

  assert (u->dir != NULL);	/* the URL must have been parsed */
  /* Refresh the printed representation. */

  u->url = str_url (u, 0);

/* Returns proxy host address, in accordance with SCHEME. */

getproxy (enum url_scheme scheme)

  char *rewritten_url;
  static char rewritten_storage[1024];

      proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");

      proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");

      proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");

    case SCHEME_INVALID:

  if (!proxy || !*proxy)

  /* Handle shorthands. */
  rewritten_url = rewrite_url_maybe (proxy);

      strncpy (rewritten_storage, rewritten_url, sizeof (rewritten_storage));
      rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
      proxy = rewritten_storage;

/* Should a host be accessed through a proxy, according to no_proxy? */

no_proxy_match (const char *host, const char **no_proxy)

    return !sufmatch (no_proxy, host);
static void write_backup_file PARAMS ((const char *, downloaded_file_t));
static void replace_attr PARAMS ((const char **, int, FILE *, const char *));

/* Change the links in an HTML document.  Accepts a structure that
   defines the positions of all the links.  */

convert_links (const char *file, urlpos *l)

  struct file_memory *fm;

  downloaded_file_t downloaded_file_return;

  logprintf (LOG_VERBOSE, _("Converting %s... "), file);

  /* First we do a "dry run": go through the list L and see whether
     any URL needs to be converted in the first place.  If not, just
     leave the file alone.  */

  for (dry = l; dry; dry = dry->next)
    if (dry->convert != CO_NOCONVERT)

      logputs (LOG_VERBOSE, _("nothing to do.\n"));

  fm = read_file (file);

      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
		 file, strerror (errno));

  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    write_backup_file (file, downloaded_file_return);

  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmaped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmaped region.  */
  if (unlink (file) < 0 && errno != ENOENT)

      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
		 file, strerror (errno));
      read_file_free (fm);

  /* Now open the file for writing. */
  fp = fopen (file, "wb");

      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
		 file, strerror (errno));
      read_file_free (fm);

  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references.  */

  for (; l; l = l->next)

      char *url_start = fm->content + l->pos;

      if (l->pos >= fm->length)

	  DEBUGP (("Something strange is going on.  Please investigate."));

      /* If the URL is not to be converted, skip it. */
      if (l->convert == CO_NOCONVERT)

	  DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));

      /* Echo the file contents, up to the offending URL's opening
	 quote, to the outfile.  */
      fwrite (p, 1, url_start - p, fp);

      if (l->convert == CO_CONVERT_TO_RELATIVE)

	  /* Convert absolute URL to relative. */
	  char *newname = construct_relative (file, l->local_name);
	  char *quoted_newname = html_quote_string (newname);
	  replace_attr (&p, l->size, fp, quoted_newname);
	  DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
		   l->url, newname, l->pos, file));

	  xfree (quoted_newname);

      else if (l->convert == CO_CONVERT_TO_COMPLETE)

	  /* Convert the link to absolute URL. */
	  char *newlink = l->url;
	  char *quoted_newlink = html_quote_string (newlink);
	  replace_attr (&p, l->size, fp, quoted_newlink);
	  DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
		   newlink, l->pos, file));
	  xfree (quoted_newlink);

  /* Output the rest of the file. */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);

  read_file_free (fm);
  logputs (LOG_VERBOSE, _("done.\n"));
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   `/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */

construct_relative (const char *s1, const char *s2)

  int i, cnt, sepdirs1;

    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't. */
  assert (*s1 != '/');

  /* Skip the directories common to both strings. */

  while (s1[i] && s2[i]

      if (s1[i] == '/' && s2[i] == '/')

  for (sepdirs1 = 0; s1[i]; i++)

  /* Now, construct the file as of:
     - ../ repeated sepdirs1 times
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
/* Add URL to the head of the list L. */

add_url (urlpos *l, const char *url, const char *file)

  t = (urlpos *)xmalloc (sizeof (urlpos));
  memset (t, 0, sizeof (*t));
  t->url = xstrdup (url);
  t->local_name = xstrdup (file);
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)

  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations. */

  /* Construct the backup filename as the original name plus ".orig". */
  size_t filename_len = strlen (file);
  char* filename_plus_orig_suffix;
  boolean already_wrote_backup_file = FALSE;
  slist* converted_file_ptr;
  static slist* converted_files = NULL;

  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)

      /* Just write "orig" over "html".  We need to do it this way
	 because when we're checking to see if we've downloaded the
	 file before (to see if we can skip downloading it), we don't
	 know if it's a text/html file.  Therefore we don't know yet
	 at that stage that -E is going to cause us to tack on
	 ".html", so we need to compare vs. the original URL plus
	 ".orig", not the original URL plus ".html.orig". */
      filename_plus_orig_suffix = alloca (filename_len + 1);
      strcpy (filename_plus_orig_suffix, file);
      strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");

  else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */

      /* Append ".orig" to the name. */
      filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
      strcpy (filename_plus_orig_suffix, file);
      strcpy (filename_plus_orig_suffix + filename_len, ".orig");

  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main().  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file. */
  converted_file_ptr = converted_files;
  while (converted_file_ptr != NULL)
    if (strcmp (converted_file_ptr->string, file) == 0)

	already_wrote_backup_file = TRUE;

      converted_file_ptr = converted_file_ptr->next;

  if (!already_wrote_backup_file)

      /* Rename <file> to <file>.orig before former gets written over. */
      if (rename (file, filename_plus_orig_suffix) != 0)
	logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
		   file, filename_plus_orig_suffix, strerror (errno));

      /* Remember that we've already written a .orig backup for this file.
	 Note that we never free this memory since we need it till the
	 convert_all_links() call, which is one of the last things the
	 program does before terminating.  BTW, I'm not sure if it would be
	 safe to just set 'converted_file_ptr->string' to 'file' below,
	 rather than making a copy of the string...  Another note is that I
	 thought I could just add a field to the urlpos structure saying
	 that we'd written a .orig file for this URL, but that didn't work,
	 so I had to make this separate list.
	 -- Dan Harkless <wget@harkless.org>

	 This [adding a field to the urlpos structure] didn't work
	 because convert_file() is called twice: once after all its
	 sublinks have been retrieved in recursive_retrieve(), and
	 once at the end of the day in convert_all_links().  The
	 original linked list collected in recursive_retrieve() is
	 lost after the first invocation of convert_links(), and
	 convert_all_links() makes a new one (it calls get_urls_html()
	 for each file it covers.)  That's why your first approach didn't
	 work.  The way to make it work is perhaps to make this flag a
	 field in the `urls_html' list.
	 -- Hrvoje Niksic <hniksic@arsdigita.com>  */

      converted_file_ptr = xmalloc (sizeof (*converted_file_ptr));
      converted_file_ptr->string = xstrdup (file);  /* die on out-of-mem. */
      converted_file_ptr->next = converted_files;
      converted_files = converted_file_ptr;
static int find_fragment PARAMS ((const char *, int, const char **,
				  const char **));

replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)

  const char *p = *pp;

  int size = raw_size;
  char quote_char = '\"';
  const char *frag_beg, *frag_end;

  /* Structure of our string is:
       "...old-contents..."
       <---    l->size   --->  (with quotes)

       <---   l->size  -->     (no quotes)   */

  if (*p == '\"' || *p == '\'')

      size -= 2;		/* disregard opening and closing quote */

  putc (quote_char, fp);
  fputs (new_str, fp);

  /* Look for fragment identifier, if any. */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);

  putc (quote_char, fp);
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return zero.  If
   the character is found, return 1 and set BP and EP to point to the
   beginning and end of the region.

   This is used for finding the fragment identifiers in URLs.  */

find_fragment (const char *beg, int size, const char **bp, const char **ep)

  const char *end = beg + size;

  for (; beg < end; beg++)
typedef struct _downloaded_file_list {
  char* file;
  downloaded_file_t download_type;
  struct _downloaded_file_list* next;
} downloaded_file_list;

static downloaded_file_list *downloaded_files;

/* Remembers which files have been downloaded.  In the standard case, should be
   called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
   download successfully (i.e. not for ones we have failures on or that we skip

   When we've downloaded a file and tacked on a ".html" extension due to -E,
   call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
   FILE_DOWNLOADED_NORMALLY.

   If you just want to check if a file has been previously added without adding
   it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
   with local filenames, not remote URLs. */

downloaded_file (downloaded_file_t mode, const char* file)

  boolean found_file = FALSE;
  downloaded_file_list* rover = downloaded_files;

  while (rover != NULL)
    if (strcmp (rover->file, file) == 0)

      rover = rover->next;

    return rover->download_type;	/* file had already been downloaded */

  if (mode != CHECK_FOR_FILE)

      rover = xmalloc (sizeof (*rover));
      rover->file = xstrdup (file);	/* use xstrdup() so die on out-of-mem. */
      rover->download_type = mode;
      rover->next = downloaded_files;
      downloaded_files = rover;

  return FILE_NOT_ALREADY_DOWNLOADED;

downloaded_files_free (void)

  downloaded_file_list* rover = downloaded_files;

      downloaded_file_list *next = rover->next;
      xfree (rover->file);