2 Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
46 /* Default port definitions */
47 #define DEFAULT_HTTP_PORT 80
48 #define DEFAULT_FTP_PORT 21
49 #define DEFAULT_HTTPS_PORT 443
51 /* Table of Unsafe chars. This is initialized in
52 init_unsafe_char_table. */
/* Lookup table: nonzero entry means the byte is "unsafe" in a URL and
   must be %XX-escaped.  Indexed by unsigned char, see UNSAFE_CHAR. */
54 static char unsafe_char_table[256];
/* Cast to unsigned char first: plain char may be signed, and a negative
   index into the table would be undefined behavior. */
56 #define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
58 /* If S contains unsafe characters, free it and replace it with a
59 version that doesn't. */
/* NOTE(review): this extract omits interior lines of the macro below;
   the visible continuation lines are not contiguous in the original. */
60 #define URL_CLEANSE(s) do \
62 if (contains_unsafe (s)) \
64 char *uc_tmp = encode_string (s); \
70 /* Is a directory "."? */
71 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
72 /* Is a directory ".."? */
73 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Forward declarations (PARAMS is the K&R/ANSI prototype shim). */
76 static void path_simplify_with_kludge PARAMS ((char *));
78 static int urlpath_length PARAMS ((const char *));
80 /* NULL-terminated list of strings to be recognized as prototypes (URL
81 schemes). Note that recognized doesn't mean supported -- only HTTP,
82 HTTPS and FTP are currently supported.
84 However, a string that does not match anything in the list will be
85 considered a relative URL. Thus it's important that this list has
86 anything anyone could think of being legal.
88 There are wild things here. :-) Take a look at
89 <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
/* NOTE(review): the initializer list of protostrings is elided in this
   extract. */
91 static char *protostrings[] =
133 /* Similar to former, but for supported protocols: */
/* Each entry maps a scheme prefix to its uerr_t tag and default port. */
134 static struct proto sup_protos[] =
136 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
138 { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
140 { "ftp://", URLFTP, DEFAULT_FTP_PORT },
141 /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
144 static void parse_dir PARAMS ((const char *, char **, char **));
145 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
146 static char *construct PARAMS ((const char *, const char *, int , int));
147 static char *construct_relative PARAMS ((const char *, const char *));
148 static char process_ftp_type PARAMS ((char *));
151 /* Returns the number of characters to be skipped if the first thing
152 in a URL is URL: (which is 0 or 4+). The optional spaces after
153 URL: are also skipped. */
/* NOTE(review): return type, the ':' test and the return statements are
   elided in this extract; the visible lines are not contiguous. */
155 skip_url (const char *url)
159 if (TOUPPER (url[0]) == 'U'
160 && TOUPPER (url[1]) == 'R'
161 && TOUPPER (url[2]) == 'L'
/* Skip whitespace following the "URL:" prefix (i starts past it). */
165 for (i = 4; url[i] && ISSPACE (url[i]); i++);
174 - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
175 - @ and :, for user/password encoding.
176 - everything over 127 (but we don't bother with recording those. */
/* Populate unsafe_char_table: controls (< 32) and DEL/high bytes (>= 127)
   are always unsafe; the elided middle of the condition presumably checks
   the RFC 1738 punctuation listed above -- TODO confirm against the full
   source. */
178 init_unsafe_char_table (void)
181 for (i = 0; i < 256; i++)
182 if (i < 32 || i >= 127
198 unsafe_char_table[i] = 1;
201 /* Returns 1 if the string contains unsafe characters, 0 otherwise. */
/* NOTE(review): the loop header and return statements are elided here. */
203 contains_unsafe (const char *s)
206 if (UNSAFE_CHAR (*s))
211 /* Decodes the forms %xy in a URL to the character the hexadecimal
212 code of which is xy. xy are hexadecimal digits from
213 [0123456789ABCDEF] (case-insensitive). If x or y are not
214 hex-digits or `%' precedes `\0', the sequence is inserted
/* In-place %XX decoding: writes through P while reading through S, so the
   result is never longer than the input.  Loop/advance logic elided. */
218 decode_string (char *s)
228 /* Do nothing if at the end of the string, or if the chars
229 are not hex-digits. */
230 if (!*(s + 1) || !*(s + 2)
231 || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
/* Combine the two hex digits into one byte. */
236 *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
243 /* Encode the unsafe characters (as determined by URL_UNSAFE) in a
244 given string, returning a malloc-ed %XX encoded string. */
/* Two passes: first measure the encoded length, then fill the buffer.
   Caller owns (and must free) the returned string. */
246 encode_string (const char *s)
253 for (i = 0; *s; s++, i++)
254 if (UNSAFE_CHAR (*s))
255 i += 2; /* Two more characters (hex digits) */
256 res = (char *)xmalloc (i + 1);
/* NOTE(review): the rewind of S between the two passes is elided. */
258 for (p = res; *s; s++)
259 if (UNSAFE_CHAR (*s))
/* unsigned char so the shift/mask below are well defined for high bytes. */
261 const unsigned char c = *s;
263 *p++ = HEXD2ASC (c >> 4);
264 *p++ = HEXD2ASC (c & 0xf);
272 /* Returns the proto-type if URL's protocol is supported, or
273 URLUNKNOWN if not. */
275 urlproto (const char *url)
279 url += skip_url (url);
/* First try an exact (case-insensitive) match against a supported scheme. */
280 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
281 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
282 return sup_protos[i].ind;
/* No scheme prefix: look for "host:port/..." heuristics.  Find the first
   ':' or '/'. */
283 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
/* NOTE(review): lines between the two loops are elided; the returns for
   the heuristic branches are also missing from this extract. */
286 for (++i; url[i] && url[i] != '/'; i++)
287 if (!ISDIGIT (url[i]))
289 if (url[i - 1] == ':')
298 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
299 part is found, returns 0. */
301 skip_proto (const char *url)
/* Scan the NULL-terminated protostrings list for a matching prefix. */
306 for (s = protostrings; *s; s++)
307 if (!strncasecmp (*s, url, strlen (*s)))
/* NOTE(review): the no-match return and length bookkeeping are elided. */
312 /* HTTP and FTP protocols are expected to yield exact host names
313 (i.e. the `//' part must be skipped, too). */
314 if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
319 /* Returns 1 if the URL begins with a protocol (supported or
320 unsupported), 0 otherwise. */
322 has_proto (const char *url)
326 url += skip_url (url);
/* Any prefix match in protostrings counts, supported or not. */
327 for (s = protostrings; *s; s++)
328 if (strncasecmp (url, *s, strlen (*s)) == 0)
333 /* Skip the username and password, if present here. The function
334 should be called *not* with the complete URL, but with the part
335 right after the protocol.
337 If no username and password are found, return 0. */
339 skip_uname (const char *url)
342 const char *q = NULL;
/* Remember the last '@' seen before the first '/' -- that delimits the
   user[:password] part per RFC 1738. */
343 for (p = url ; *p && *p != '/'; p++)
344 if (*p == '@') q = p;
345 /* If a `@' was found before the first occurrence of `/', skip
353 /* Allocate a new urlinfo structure, fill it with default values and
354 return a pointer to it. */
/* Zero-fill, then set the only non-zero default; caller frees via freeurl. */
360 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
361 memset (u, 0, sizeof (*u));
362 u->proto = URLUNKNOWN;
366 /* Perform a "deep" free of the urlinfo structure. The structure
367 should have been created with newurl, but need not have been used.
368 If free_pointer is non-0, free the pointer itself. */
/* FREE_MAYBE presumably frees only non-NULL members -- safe on a urlinfo
   that was never filled in.  TODO confirm macro definition. */
370 freeurl (struct urlinfo *u, int complete)
374 FREE_MAYBE (u->host);
375 FREE_MAYBE (u->path);
376 FREE_MAYBE (u->file);
378 FREE_MAYBE (u->user);
379 FREE_MAYBE (u->passwd);
380 FREE_MAYBE (u->local);
381 FREE_MAYBE (u->referer);
/* Recursively free a chained proxy urlinfo, including its pointer. */
383 freeurl (u->proxy, 1);
389 /* Extract the given URL of the form
390 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
391 1. hostname (terminated with `/' or `:')
392 2. port number (terminated with `/'), or chosen for the protocol
393 3. dirname (everything after hostname)
394 Most errors are handled. No allocation is done, you must supply
395 pointers to allocated memory.
396 ...and a host of other stuff :-)
398 - Recognizes hostname:dir/file for FTP and
399 hostname (:portnum)?/dir/file for HTTP.
400 - Parses the path to yield directory and file
401 - Parses the URL to yield the username and passwd (if present)
402 - Decodes the strings, in case they contain "forbidden" characters
403 - Writes the result to struct urlinfo
405 If the argument STRICT is set, it recognizes only the canonical
/* NOTE(review): many interior lines of this long function are elided in
   this extract (declarations, braces, error returns); the visible
   statements are not contiguous. */
408 parseurl (const char *url, struct urlinfo *u, int strict)
411 int recognizable; /* Recognizable URL is the one where
412 the protocol name was explicitly
413 named, i.e. it wasn't deduced from
417 DEBUGP (("parseurl (\"%s\") -> ", url));
418 url += skip_url (url);
419 recognizable = has_proto (url);
420 if (strict && !recognizable)
/* Find which supported protocol (if any) the URL starts with; L is left
   holding the length of the matched scheme prefix. */
422 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
424 l = strlen (sup_protos[i].name);
425 if (!strncasecmp (sup_protos[i].name, url, l))
428 /* If protocol is recognizable, but unsupported, bail out, else
430 if (recognizable && i == ARRAY_SIZE (sup_protos))
432 else if (i == ARRAY_SIZE (sup_protos))
435 u->proto = type = sup_protos[i].ind;
437 if (type == URLUNKNOWN)
439 /* Allow a username and password to be specified (i.e. just skip
442 l += skip_uname (url + l);
/* Hostname extends to the first ':' (port or FTP-style dir) or '/'. */
443 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
446 /* Get the hostname. */
447 u->host = strdupdelim (url + l, url + i);
448 DEBUGP (("host %s -> ", u->host));
450 /* Assume no port has been given. */
454 /* We have a colon delimiting the hostname. It could mean that
455 a port number is following it, or a directory. */
456 if (ISDIGIT (url[++i])) /* A port number */
/* A bare host:port with no scheme defaults to HTTP. */
458 if (type == URLUNKNOWN)
459 u->proto = type = URLHTTP;
460 for (; url[i] && url[i] != '/'; i++)
461 if (ISDIGIT (url[i]))
462 u->port = 10 * u->port + (url[i] - '0');
467 DEBUGP (("port %hu -> ", u->port));
469 else if (type == URLUNKNOWN) /* or a directory */
470 u->proto = type = URLFTP;
471 else /* or just a misformed port number */
474 else if (type == URLUNKNOWN)
475 u->proto = type = URLHTTP;
/* Look up the default port for the final protocol. */
479 for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
480 if (sup_protos[ind].ind == type)
482 if (ind == ARRAY_SIZE (sup_protos))
484 u->port = sup_protos[ind].port;
486 /* Some delimiter troubles... */
487 if (url[i] == '/' && url[i - 1] != ':')
490 while (url[i] && url[i] == '/')
/* +8 slack: path is later rebuilt in place and may grow by the "%2F"
   prefix plus separators (see the strcpy/strcat sequence below). */
492 u->path = (char *)xmalloc (strlen (url + i) + 8)
493 strcpy (u->path, url + i);
496 u->ftp_type = process_ftp_type (u->path);
497 /* #### We don't handle type `d' correctly yet. */
498 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
500 DEBUGP (("ftp_type %c -> ", u->ftp_type));
502 DEBUGP (("opath %s -> ", u->path));
503 /* Parse the username and password (if existing). */
504 parse_uname (url, &u->user, &u->passwd);
505 /* Decode the strings, as per RFC 1738. */
506 decode_string (u->host);
507 decode_string (u->path);
509 decode_string (u->user);
511 decode_string (u->passwd);
512 /* Parse the directory. */
513 parse_dir (u->path, &u->dir, &u->file);
514 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
515 /* Simplify the directory. */
516 path_simplify (u->dir);
517 /* Remove the leading `/' in HTTP. */
518 if (type == URLHTTP && *u->dir == '/')
/* NOTE(review): strcpy on overlapping buffers is undefined behavior per
   ISO C; memmove (u->dir, u->dir + 1, strlen (u->dir)) would be the safe
   form.  Left as-is because the surrounding body is elided here. */
519 strcpy (u->dir, u->dir + 1);
520 DEBUGP (("ndir %s\n", u->dir));
521 /* Strip trailing `/'. */
523 if (l && u->dir[l - 1] == '/')
524 u->dir[l - 1] = '\0';
525 /* Re-create the path: */
526 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
527 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
528 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
529 strcpy (u->path, abs_ftp ? "%2F" : "/");
530 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
531 strcat (u->path, *u->dir ? "/" : "");
532 strcat (u->path, u->file);
533 URL_CLEANSE (u->path);
534 DEBUGP (("newpath: %s\n", u->path));
535 /* Create the clean URL. */
536 u->url = str_url (u, 0);
540 /* Special versions of DOTP and DDOTP for parse_dir(). */
/* Component is "." -- a trailing `?' (start of a query string) also
   terminates the component, since parse_dir() works on paths that may
   still carry a query part (see urlpath_length). */
#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
/* Component is ".." -- same `?' allowance as PD_DOTP.  Bug fix: the
   original tested *(x) == '.' twice instead of checking the SECOND
   character, so any component starting with '.' whose third character
   was '\0' or '?' (e.g. ".x") was misclassified as "..". */
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
		     && (!*((x) + 2) || *((x) + 2) == '?'))
546 /* Build the directory and filename components of the path. Both
547 components are *separately* malloc-ed strings! It does not change
548 the contents of path.
550 If the path ends with "." or "..", they are (correctly) counted as
/* NOTE(review): interior lines (braces, else branches) are elided in
   this extract. */
553 parse_dir (const char *path, char **dir, char **file)
/* L is the path length up to any '?' query; I ends at the last '/'. */
557 l = urlpath_length (path);
558 for (i = l; i && path[i] != '/'; i--);
560 if (!i && *path != '/') /* Just filename */
562 if (PD_DOTP (path) || PD_DDOTP (path))
/* "." / ".." are directories, so *file gets only the (possibly empty)
   query remainder at path + l. */
564 *dir = strdupdelim (path, path + l);
565 *file = xstrdup (path + l); /* normally empty, but could
570 *dir = xstrdup (""); /* This is required because of FTP */
571 *file = xstrdup (path);
574 else if (!i) /* /filename */
576 if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
578 *dir = strdupdelim (path, path + l);
579 *file = xstrdup (path + l); /* normally empty, but could
584 *dir = xstrdup ("/");
585 *file = xstrdup (path + 1);
588 else /* Nonempty directory with or without a filename */
590 if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
592 *dir = strdupdelim (path, path + l);
593 *file = xstrdup (path + l); /* normally empty, but could
598 *dir = strdupdelim (path, path + i);
599 *file = xstrdup (path + i + 1);
604 /* Find the optional username and password within the URL, as per
605 RFC1738. The returned user and passwd char pointers are
/* NOTE(review): several interior lines (early returns, *user/*passwd
   initialization) are elided in this extract. */
608 parse_uname (const char *url, char **user, char **passwd)
611 const char *p, *q, *col;
616 url += skip_url (url);
617 /* Look for end of protocol string. */
618 l = skip_proto (url);
621 /* Add protocol offset. */
623 /* Is there an `@' character? */
624 for (p = url; *p && *p != '/'; p++)
627 /* If not, return. */
630 /* Else find the username and password. */
/* COL tracks the character after ':' (start of password), Q the '@'. */
631 for (p = q = col = url; *p != '/'; p++)
633 if (*p == ':' && !*user)
/* Everything before the first ':' is the username. */
635 *user = (char *)xmalloc (p - url + 1);
636 memcpy (*user, url, p - url);
637 (*user)[p - url] = '\0';
640 if (*p == '@') q = p;
642 /* Decide whether you have only the username or both. */
643 where = *user ? passwd : user;
644 *where = (char *)xmalloc (q - col + 1);
645 memcpy (*where, col, q - col);
646 (*where)[q - col] = '\0';
650 /* If PATH ends with `;type=X', return the character X. */
652 process_ftp_type (char *path)
654 int len = strlen (path);
/* The suffix ";type=X" is 7 chars: compare the 6-char ";type=" at
   len - 7, then truncate PATH there and return the trailing X.
   NOTE(review): the len >= 7 guard is elided in this extract. */
657 && !memcmp (path + len - 7, ";type=", 6))
659 path[len - 7] = '\0';
660 return path[len - 1];
666 /* Return the URL as fine-formed string, with a proper protocol, optional port
667 number, directory and optional user/password. If `hide' is non-zero (as it
668 is when we're calling this on a URL we plan to print, but not when calling it
669 to canonicalize a URL for use within the program), password will be hidden.
670 The forbidden characters in the URL will be cleansed. */
/* NOTE(review): interior lines (braces, separator characters between the
   memcpy calls, the return) are elided in this extract. */
672 str_url (const struct urlinfo *u, int hide)
674 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
675 int i, l, ln, lu, lh, lp, lf, ld;
676 unsigned short proto_default_port;
678 /* Look for the protocol name. */
679 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
680 if (sup_protos[i].ind == u->proto)
682 if (i == ARRAY_SIZE (sup_protos))
684 proto_name = sup_protos[i].name;
685 proto_default_port = sup_protos[i].port;
/* CLEANDUP presumably returns a freshly malloc-ed, %XX-escaped copy --
   TODO confirm macro definition. */
686 host = CLEANDUP (u->host);
687 dir = CLEANDUP (u->dir);
688 file = CLEANDUP (u->file);
689 user = passwd = NULL;
691 user = CLEANDUP (u->user);
695 /* Don't output the password, or someone might see it over the user's
696 shoulder (or in saved wget output). Don't give away the number of
697 characters in the password, either, as we did in past versions of
698 this code, when we replaced the password characters with 'x's. */
699 passwd = xstrdup("<password>");
701 passwd = CLEANDUP (u->passwd);
/* An absolute FTP dir is emitted as "%2F" so the leading '/' survives
   one level of decoding on the server side. */
703 if (u->proto == URLFTP && *dir == '/')
705 char *tmp = (char *)xmalloc (strlen (dir) + 3);
706 /*sprintf (tmp, "%%2F%s", dir + 1);*/
710 strcpy (tmp + 3, dir + 1);
/* Precompute component lengths for the manual memcpy assembly below. */
715 ln = strlen (proto_name);
716 lu = user ? strlen (user) : 0;
717 lp = passwd ? strlen (passwd) : 0;
721 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
722 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
723 (user ? user : ""), (passwd ? ":" : ""),
724 (passwd ? passwd : ""), (user ? "@" : ""),
725 host, u->port, dir, *dir ? "/" : "", file); */
727 memcpy (res, proto_name, ln);
731 memcpy (res + l, user, lu);
736 memcpy (res + l, passwd, lp);
741 memcpy (res + l, host, lh);
/* Only emit ":port" when it differs from the protocol default. */
743 if (u->port != proto_default_port)
746 long_to_string (res + l, (long)u->port);
747 l += numdigit (u->port);
750 memcpy (res + l, dir, ld);
754 strcpy (res + l, file);
763 /* Check whether two URL-s are equivalent, i.e. pointing to the same
764 location. Uses parseurl to parse them, and compares the canonical
767 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
768 return 0 on error. */
/* NOTE(review): the newurl() calls, error handling and freeurl cleanup
   are elided in this extract. */
770 url_equal (const char *url1, const char *url2)
772 struct urlinfo *u1, *u2;
777 err = parseurl (url1, u1, 0);
784 err = parseurl (url2, u2, 0);
/* Compare the canonicalized forms produced by str_url via parseurl. */
790 res = !strcmp (u1->url, u2->url);
/* Read a file of newline-separated URLs and build a urlpos list, one
   entry per non-blank line, trimming surrounding whitespace.
   NOTE(review): list linking and the final return are elided here. */
797 get_urls_file (const char *file)
799 struct file_memory *fm;
801 const char *text, *text_end;
804 fm = read_file (file);
807 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
810 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
813 text_end = fm->content + fm->length;
814 while (text < text_end)
816 const char *line_beg = text;
817 const char *line_end = memchr (text, '\n', text_end - text);
/* Trim leading, then trailing whitespace from the line. */
823 while (line_beg < line_end
824 && ISSPACE (*line_beg))
826 while (line_end > line_beg + 1
827 && ISSPACE (*(line_end - 1)))
829 if (line_end > line_beg)
831 urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
832 memset (entry, 0, sizeof (*entry));
834 entry->url = strdupdelim (line_beg, line_end);
846 /* Free the linked list of urlpos. */
/* NOTE(review): the loop, freeing of l->url / l itself, and the advance
   to NEXT are elided in this extract. */
848 free_urlpos (urlpos *l)
852 urlpos *next = l->next;
854 FREE_MAYBE (l->local_name);
860 /* Rotate FNAME opt.backups times */
/* Shift fname.1 -> fname.2 -> ... -> fname.N, then fname -> fname.1,
   so the newest backup is always suffix .1. */
862 rotate_backups(const char *fname)
/* Room for the name, '.', the widest backup number, and the NUL. */
864 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
865 char *from = (char *)alloca (maxlen);
866 char *to = (char *)alloca (maxlen);
/* Only rotate regular files; skip directories, fifos, etc. */
870 if (stat (fname, &sb) == 0)
871 if (S_ISREG (sb.st_mode) == 0)
874 for (i = opt.backups; i > 1; i--)
876 sprintf (from, "%s.%d", fname, i - 1);
877 sprintf (to, "%s.%d", fname, i);
878 /* #### This will fail on machines without the rename() system
883 sprintf (to, "%s.%d", fname, 1);
887 /* Create all the necessary directories for PATH (a file). Calls
888 mkdirhier() internally. */
/* NOTE(review): braces, early returns and the free of T are elided in
   this extract. */
890 mkalldirs (const char *path)
/* Find the last '/' -- everything before it is the directory part. */
897 p = path + strlen (path);
898 for (; *p != '/' && p != path; p--);
899 /* Don't create if it's just a file. */
900 if ((p == path) && (*p != '/'))
902 t = strdupdelim (path, p);
903 /* Check whether the directory exists. */
904 if ((stat (t, &st) == 0))
906 if (S_ISDIR (st.st_mode))
913 /* If the dir exists as a file name, remove it first. This
914 is *only* for Wget to work with buggy old CERN http
915 servers. Here is the scenario: When Wget tries to
916 retrieve a directory without a slash, e.g.
917 http://foo/bar (bar being a directory), CERN server will
918 not redirect it to http://foo/bar/ -- it will generate a
919 directory listing containing links to bar/file1,
920 bar/file2, etc. Wget will lose because it saves this
921 HTML listing to a file `bar', so it cannot create the
922 directory. To work around this, if the file of the same
923 name exists, we just remove it and create the directory
925 DEBUGP (("Removing %s because of directory danger!\n", t));
929 res = make_directory (t);
931 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Count the '/' characters in S.  NOTE(review): the return type and the
   entire body are elided in this extract. */
937 count_slashes (const char *s)
946 /* Return the path name of the URL-equivalent file name, with a
947 remote-like structure of directories. */
/* NOTE(review): braces, the opt.cut_dirs test, file-name selection and
   the return of RES are elided in this extract. */
949 mkstruct (const struct urlinfo *u)
951 char *host, *dir, *file, *res, *dirpref;
954 assert (u->dir != NULL);
955 assert (u->host != NULL);
/* --cut-dirs support: skip the leading '/' and then drop CUT directory
   components from the front of the dir. */
959 char *ptr = u->dir + (*u->dir == '/');
960 int slash_count = 1 + count_slashes (ptr);
961 int cut = MINVAL (opt.cut_dirs, slash_count);
962 for (; cut && *ptr; ptr++)
965 STRDUP_ALLOCA (dir, ptr);
968 dir = u->dir + (*u->dir == '/');
970 host = xstrdup (u->host);
971 /* Check for the true name (or at least a consistent name for saving
972 to directory) of HOST, reusing the hlist if possible. */
973 if (opt.add_hostdir && !opt.simple_check)
975 char *nhost = realhost (host);
979 /* Add dir_prefix and hostname (if required) to the beginning of
983 if (!DOTP (opt.dir_prefix))
985 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
986 + strlen (host) + 1);
987 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
990 STRDUP_ALLOCA (dirpref, host);
992 else /* not add_hostdir */
994 if (!DOTP (opt.dir_prefix))
995 dirpref = opt.dir_prefix;
1001 /* If there is a prefix, prepend it. */
1004 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1005 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1008 dir = xstrdup (dir);
/* Strip a trailing '/' before joining dir and file. */
1011 if (l && dir[l - 1] == '/')
/* Directory URLs get the conventional index document name. */
1015 file = "index.html";
1019 /* Finally, construct the full name. */
1020 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1021 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1026 /* Create a unique filename, corresponding to a given URL. Calls
1027 mkstruct if necessary. Does *not* actually create any directories. */
/* NOTE(review): braces, the dirstruct test and the final returns are
   elided in this extract. */
1029 url_filename (const struct urlinfo *u)
1032 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1036 file = mkstruct (u);
/* No remote file name -- fall back to the index document. */
1042 file = xstrdup ("index.html");
1044 file = xstrdup (u->file);
1049 /* Check whether the prefix directory is something other than "."
1050 before prepending it. */
1051 if (!DOTP (opt.dir_prefix))
1053 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1054 + 1 + strlen (file) + 1);
1055 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1060 /* DOS-ish file systems don't like `%' signs in them; we change it
1065 for (p = file; *p; p++)
1069 #endif /* WINDOWS */
1071 /* Check the cases in which the unique extensions are not used:
1072 1) Clobbering is turned off (-nc).
1073 2) Retrieval with regetting.
1074 3) Timestamping is used.
1075 4) Hierarchy is built.
1077 The exception is the case when file does exist and is a
1078 directory (actually support for bad httpd-s). */
1079 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1080 && !(file_exists_p (file) && !file_non_directory_p (file)))
1083 /* Find a unique name. */
1084 name = unique_name (file);
1089 /* Like strlen(), but allow the URL to be ended with '?'. */
/* NOTE(review): the branch returning Q - URL when a '?' is found is
   elided in this extract. */
1091 urlpath_length (const char *url)
1093 const char *q = strchr (url, '?');
1096 return strlen (url);
1099 /* Find the last occurrence of character C in the range [b, e), or
1100 NULL, if none are present. This is almost completely equivalent to
1101 { *e = '\0'; return strrchr(b); }, except that it doesn't change
1102 the contents of the string. */
/* NOTE(review): the entire scanning body is elided in this extract. */
1104 find_last_char (const char *b, const char *e, char c)
1112 /* Construct a URL by concatenating an absolute URL and a path, which
1113 may or may not be absolute. This tries to behave "reasonably" in
1114 all foreseeable cases. It employs little specific knowledge about
1115 protocols or URL-specific stuff -- it just works on strings. */
/* NOTE(review): braces, the no_proto branch test, the loop around the
   memchr scan, and the return of CONSTR are elided in this extract. */
1117 construct (const char *url, const char *sub, int subsize, int no_proto)
/* END points at the '?' (query start) or terminating NUL of URL. */
1123 const char *end = url + urlpath_length (url);
1127 /* SUB is a relative URL: we need to replace everything
1128 after last slash (possibly empty) with SUB.
1130 So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
1131 our result should be "whatever/foo/qux/xyzzy". */
1132 int need_explicit_slash = 0;
1134 const char *start_insert;
1135 const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */
1138 /* No slash found at all. Append SUB to what we have,
1139 but we'll need a slash as a separator.
1141 Example: if url == "foo" and sub == "qux/xyzzy", then
1142 we cannot just append sub to url, because we'd get
1143 "fooqux/xyzzy", whereas what we want is
1146 To make sure the / gets inserted, we set
1147 need_explicit_slash to 1. We also set start_insert
1148 to end + 1, so that the length calculations work out
1149 correctly for one more (slash) character. Accessing
1150 that character is fine, since it will be the
1151 delimiter, '\0' or '?'. */
1152 /* example: "foo?..." */
1153 /* ^ ('?' gets changed to '/') */
1154 start_insert = end + 1;
1155 need_explicit_slash = 1;
1157 else if (last_slash && last_slash != url && *(last_slash - 1) == '/')
1159 /* example: http://host" */
1161 start_insert = end + 1;
1162 need_explicit_slash = 1;
1166 /* example: "whatever/foo/bar" */
1168 start_insert = last_slash + 1;
/* Copy URL up to the insertion point, patch in the slash if needed,
   then append SUB. */
1171 span = start_insert - url;
1172 constr = (char *)xmalloc (span + subsize + 1);
1174 memcpy (constr, url, span);
1175 if (need_explicit_slash)
1176 constr[span - 1] = '/';
1178 memcpy (constr + span, sub, subsize);
1179 constr[span + subsize] = '\0';
1181 else /* *sub == `/' */
1183 /* SUB is an absolute path: we need to replace everything
1184 after (and including) the FIRST slash with SUB.
1186 So, if URL is "http://host/whatever/foo/bar", and SUB is
1187 "/qux/xyzzy", our result should be
1188 "http://host/qux/xyzzy". */
1191 const char *start_insert = NULL; /* for gcc to shut up. */
1192 const char *pos = url;
1193 int seen_slash_slash = 0;
1194 /* We're looking for the first slash, but want to ignore
1197 slash = memchr (pos, '/', end - pos);
1198 if (slash && !seen_slash_slash)
1199 if (*(slash + 1) == '/')
1202 seen_slash_slash = 1;
1206 /* At this point, SLASH is the location of the first / after
1207 "//", or the first slash altogether. START_INSERT is the
1208 pointer to the location where SUB will be inserted. When
1209 examining the last two examples, keep in mind that SUB
1212 if (!slash && !seen_slash_slash)
1213 /* example: "foo" */
1216 else if (!slash && seen_slash_slash)
1217 /* example: "http://foo" */
1220 else if (slash && !seen_slash_slash)
1221 /* example: "foo/bar" */
1224 else if (slash && seen_slash_slash)
1225 /* example: "http://something/" */
1227 start_insert = slash;
1229 span = start_insert - url;
1230 constr = (char *)xmalloc (span + subsize + 1);
1232 memcpy (constr, url, span);
1234 memcpy (constr + span, sub, subsize);
1235 constr[span + subsize] = '\0';
1238 else /* !no_proto */
/* SUB is itself an absolute URL -- just duplicate it. */
1240 constr = strdupdelim (sub, sub + subsize);
1245 /* Like the function above, but with a saner caller interface. */
/* SUB is treated as relative unless it carries a recognized scheme. */
1247 url_concat (const char *base_url, const char *new_url)
1249 return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));
1252 /* Optimize URL by host, destructively replacing u->host with realhost
1253 (u->host). Do this regardless of opt.simple_check. */
/* NOTE(review): the free of the old u->host / u->url is elided in this
   extract. */
1255 opt_url (struct urlinfo *u)
1257 /* Find the "true" host. */
1258 char *host = realhost (u->host);
1261 assert (u->dir != NULL); /* the URL must have been parsed */
1262 /* Refresh the printed representation. */
1264 u->url = str_url (u, 0);
1267 /* This beautiful kludge is fortunately not needed, as I've made
1268 parse_dir do the (almost) right thing, so that a query can never
1269 become a part of directory. */
1271 /* Call path_simplify, but make sure that the part after the
1272 question-mark, if any, is not destroyed by path_simplify's
/* Currently unused (see the comment above); kept for reference. */
1275 path_simplify_with_kludge (char *path)
1277 char *query = strchr (path, '?');
1279 /* path_simplify also works destructively, so we also have the
1280 license to write. */
1282 path_simplify (path);
/* If simplification shortened the path, slide the query part back so it
   immediately follows the simplified path (memmove: regions overlap). */
1285 char *newend = path + strlen (path);
1287 if (newend != query)
1288 memmove (newend, query, strlen (query) + 1);
1293 /* Returns proxy host address, in accordance with PROTO. */
/* Command-line / wgetrc options take precedence over the conventional
   lowercase environment variables. */
1295 getproxy (uerr_t proto)
1297 if (proto == URLHTTP)
1298 return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1299 else if (proto == URLFTP)
1300 return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
/* HTTPS proxy support is compiled in only with SSL. */
1302 else if (proto == URLHTTPS)
1303 return opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1304 #endif /* HAVE_SSL */
1309 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns nonzero when HOST does NOT match any no_proxy suffix, i.e.
   the proxy should be used. */
1311 no_proxy_match (const char *host, const char **no_proxy)
1316 return !sufmatch (no_proxy, host);
1319 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1320 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1322 /* Change the links in an HTML document. Accepts a structure that
1323 defines the positions of all the links. */
/* NOTE(review): braces, the early returns and the fclose of FP are
   elided in this extract. */
1325 convert_links (const char *file, urlpos *l)
1327 struct file_memory *fm;
1330 downloaded_file_t downloaded_file_return;
1332 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1335 /* First we do a "dry run": go through the list L and see whether
1336 any URL needs to be converted in the first place. If not, just
1337 leave the file alone. */
1340 for (dry = l; dry; dry = dry->next)
1341 if (dry->convert != CO_NOCONVERT)
1345 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1350 fm = read_file (file);
1353 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1354 file, strerror (errno));
1358 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1359 if (opt.backup_converted && downloaded_file_return)
1360 write_backup_file (file, downloaded_file_return);
1362 /* Before opening the file for writing, unlink the file. This is
1363 important if the data in FM is mmaped. In such case, nulling the
1364 file, which is what fopen() below does, would make us read all
1365 zeroes from the mmaped region. */
1366 if (unlink (file) < 0 && errno != ENOENT)
1368 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1369 file, strerror (errno));
1370 read_file_free (fm);
1373 /* Now open the file for writing. */
1374 fp = fopen (file, "wb");
1377 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1378 file, strerror (errno));
1379 read_file_free (fm);
1382 /* Here we loop through all the URLs in file, replacing those of
1383 them that are downloaded with relative references. */
1385 for (; l; l = l->next)
1387 char *url_start = fm->content + l->pos;
/* Sanity check: a link position past the buffer means the position
   list and the file on disk are out of sync. */
1389 if (l->pos >= fm->length)
1391 DEBUGP (("Something strange is going on. Please investigate."));
1394 /* If the URL is not to be converted, skip it. */
1395 if (l->convert == CO_NOCONVERT)
1397 DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1401 /* Echo the file contents, up to the offending URL's opening
1402 quote, to the outfile. */
1403 fwrite (p, 1, url_start - p, fp);
1405 if (l->convert == CO_CONVERT_TO_RELATIVE)
1407 /* Convert absolute URL to relative. */
1408 char *newname = construct_relative (file, l->local_name);
1409 char *quoted_newname = html_quote_string (newname);
/* replace_attr advances P past the original attribute value. */
1410 replace_attr (&p, l->size, fp, quoted_newname);
1411 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1412 l->url, newname, l->pos, file));
1414 xfree (quoted_newname);
1416 else if (l->convert == CO_CONVERT_TO_COMPLETE)
1418 /* Convert the link to absolute URL. */
1419 char *newlink = l->url;
1420 char *quoted_newlink = html_quote_string (newlink);
1421 replace_attr (&p, l->size, fp, quoted_newlink);
1422 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1423 newlink, l->pos, file));
1424 xfree (quoted_newlink);
1427 /* Output the rest of the file. */
1428 if (p - fm->content < fm->length)
1429 fwrite (p, 1, fm->length - (p - fm->content), fp);
1431 read_file_free (fm);
1432 logputs (LOG_VERBOSE, _("done.\n"));
1435 /* Construct and return a malloced copy of the relative link from two
1436 pieces of information: local name S1 of the referring file and
1437 local name S2 of the referred file.
1439 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1440 "jagor.srce.hr/images/news.gif", the function will return
1443 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1444 "fly.cc.fer.hr/images/fly.gif", the function will return
1445 "../images/fly.gif".
1447 Caveats: S1 should not begin with `/', unless S2 also begins with
1448 '/'. S1 should not contain things like ".." and such --
1449 construct_relative ("fly/ioccc/../index.html",
1450 "fly/images/fly.gif") will fail. (A workaround is to call
1451 something like path_simplify() on S1). */
/* NOTE(review): the absolute-S2 test, loop bodies and the return are
   partially elided in this extract. */
1453 construct_relative (const char *s1, const char *s2)
1455 int i, cnt, sepdirs1;
1459 return xstrdup (s2);
1460 /* S1 should *not* be absolute, if S2 wasn't. */
1461 assert (*s1 != '/');
1463 /* Skip the directories common to both strings. */
/* Walk while both strings agree; CNT remembers the index just past the
   last common '/'. */
1466 while (s1[i] && s2[i]
1471 if (s1[i] == '/' && s2[i] == '/')
/* Count how many directories remain in S1 past the common prefix; each
   becomes one "../" in the result. */
1476 for (sepdirs1 = 0; s1[i]; i++)
1479 /* Now, construct the file as of:
1480 - ../ repeated sepdirs1 time
1481 - all the non-mutual directories of S2. */
1482 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1483 for (i = 0; i < sepdirs1; i++)
1484 memcpy (res + 3 * i, "../", 3);
1485 strcpy (res + 3 * i, s2 + cnt);
1489 /* Add URL to the head of the list L. */
/* NOTE(review): the linking of T->next = L and the return of T are
   elided in this extract. */
1491 add_url (urlpos *l, const char *url, const char *file)
1495 t = (urlpos *)xmalloc (sizeof (urlpos));
1496 memset (t, 0, sizeof (*t));
1497 t->url = xstrdup (url);
1498 t->local_name = xstrdup (file);
/* Rename FILE to a ".orig" backup before link conversion overwrites
   it.  When -E tacked ".html" onto the name
   (FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED), the trailing "html" is
   replaced with "orig" instead of appending ".orig".  A static list
   ensures each file is backed up at most once per run.  */
1504 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1506 /* Rather than just writing over the original .html file with the
1507 converted version, save the former to *.orig. Note we only do
1508 this for files we've _successfully_ downloaded, so we don't
1509 clobber .orig files sitting around from previous invocations. */
1511 /* Construct the backup filename as the original name plus ".orig". */
1512 size_t filename_len = strlen(file);
1513 char* filename_plus_orig_suffix;
1514 boolean already_wrote_backup_file = FALSE;
1515 slist* converted_file_ptr;
/* Persists across calls: every file we have already backed up.  */
1516 static slist* converted_files = NULL;
1518 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1520 /* Just write "orig" over "html". We need to do it this way
1521 because when we're checking to see if we've downloaded the
1522 file before (to see if we can skip downloading it), we don't
1523 know if it's a text/html file. Therefore we don't know yet
1524 at that stage that -E is going to cause us to tack on
1525 ".html", so we need to compare vs. the original URL plus
1526 ".orig", not the original URL plus ".html.orig". */
/* alloca: freed automatically when this function returns.  */
1527 filename_plus_orig_suffix = alloca (filename_len + 1);
1528 strcpy(filename_plus_orig_suffix, file);
/* Overwrite the final "html" with "orig"; assumes FILE ends in
   ".html" -- presumably guaranteed by the -E code path (TODO confirm
   at the caller).  */
1529 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1531 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1533 /* Append ".orig" to the name. */
/* sizeof(".orig") includes the NUL terminator, so no extra +1.  */
1534 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1535 strcpy(filename_plus_orig_suffix, file);
1536 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1539 /* We can get called twice on the same URL thanks to the
1540 convert_all_links() call in main(). If we write the .orig file
1541 each time in such a case, it'll end up containing the first-pass
1542 conversion, not the original file. So, see if we've already been
1543 called on this file. */
1544 converted_file_ptr = converted_files;
1545 while (converted_file_ptr != NULL)
1546 if (strcmp(converted_file_ptr->string, file) == 0)
1548 already_wrote_backup_file = TRUE;
1552 converted_file_ptr = converted_file_ptr->next;
1554 if (!already_wrote_backup_file)
1556 /* Rename <file> to <file>.orig before former gets written over. */
/* Failure to rename is logged but non-fatal.  */
1557 if (rename(file, filename_plus_orig_suffix) != 0)
1558 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1559 file, filename_plus_orig_suffix, strerror (errno));
1561 /* Remember that we've already written a .orig backup for this file.
1562 Note that we never free this memory since we need it till the
1563 convert_all_links() call, which is one of the last things the
1564 program does before terminating. BTW, I'm not sure if it would be
1565 safe to just set 'converted_file_ptr->string' to 'file' below,
1566 rather than making a copy of the string... Another note is that I
1567 thought I could just add a field to the urlpos structure saying
1568 that we'd written a .orig file for this URL, but that didn't work,
1569 so I had to make this separate list.
1570 -- Dan Harkless <wget@harkless.org>
1572 This [adding a field to the urlpos structure] didn't work
1573 because convert_file() is called twice: once after all its
1574 sublinks have been retrieved in recursive_retrieve(), and
1575 once at the end of the day in convert_all_links(). The
1576 original linked list collected in recursive_retrieve() is
1577 lost after the first invocation of convert_links(), and
1578 convert_all_links() makes a new one (it calls get_urls_html()
1579 for each file it covers.) That's why your first approach didn't
1580 work. The way to make it work is perhaps to make this flag a
1581 field in the `urls_html' list.
1582 -- Hrvoje Niksic <hniksic@arsdigita.com>
/* Push FILE onto the remembered list (intentionally never freed;
   see the note above).  */
1584 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1585 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1586 converted_file_ptr->next = converted_files;
1587 converted_files = converted_file_ptr;
1591 static int find_fragment PARAMS ((const char *, int, const char **,
/* Write NEW_STR to FP in place of the attribute value of RAW_SIZE
   bytes starting at *PP, preserving the original quoting style and
   any trailing fragment identifier ("#anchor") of the old value.
   NOTE(review): the lines advancing *PP and detecting the actual
   quote character are elided in this view.  */
1595 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1597 const char *p = *pp;
1599 int size = raw_size;
/* Default quote, used when the old value was quoted; presumably
   overwritten with the actual opening character -- the assignment is
   elided here.  */
1600 char quote_char = '\"';
1601 const char *frag_beg, *frag_end;
1603 /* Structure of our string is:
1604 "...old-contents..."
1605 <--- l->size ---> (with quotes)
1608 <--- l->size --> (no quotes) */
1610 if (*p == '\"' || *p == '\'')
1615 size -= 2; /* disregard opening and closing quote */
1617 putc (quote_char, fp);
1618 fputs (new_str, fp);
1620 /* Look for fragment identifier, if any. */
/* Keep the old "#fragment" so in-page anchors still work after the
   link is rewritten.  */
1621 if (find_fragment (p, size, &frag_beg, &frag_end))
1622 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
/* Close with the same quote character we opened with.  */
1626 putc (quote_char, fp);
1630 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1631 preceded by '&'. If the character is not found, return zero. If
1632 the character is found, return 1 and set BP and EP to point to the
1633 beginning and end of the region.
1635 This is used for finding the fragment identifiers in URLs. */
/* Scan [BEG, BEG+SIZE) for an unescaped '#' per the comment above;
   on success sets *BP/*EP to the fragment's bounds and returns 1.
   NOTE(review): the loop body (the '#'/'&' state machine and the
   return statements) is elided in this view.  */
1638 find_fragment (const char *beg, int size, const char **bp, const char **ep)
1640 const char *end = beg + size;
1642 for (; beg < end; beg++)
/* Node of the singly-linked list remembering which local files have
   been downloaded, and how.  NOTE(review): the member holding the
   filename string (accessed as rover->file below) is elided from this
   view of the struct.  */
1664 typedef struct _downloaded_file_list {
/* How the file was obtained (normally, with -E extension, etc.).  */
1666 downloaded_file_t download_type;
1667 struct _downloaded_file_list* next;
1668 } downloaded_file_list;
1670 static downloaded_file_list *downloaded_files;
1672 /* Remembers which files have been downloaded. In the standard case, should be
1673 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1674 download successfully (i.e. not for ones we have failures on or that we skip
1677 When we've downloaded a file and tacked on a ".html" extension due to -E,
1678 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1679 FILE_DOWNLOADED_NORMALLY.
1681 If you just want to check if a file has been previously added without adding
1682 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1683 with local filenames, not remote URLs. */
/* Look FILE up in the downloaded-files list; if found, return its
   recorded download type.  Otherwise, unless MODE is CHECK_FOR_FILE,
   record FILE with MODE; in both cases return
   FILE_NOT_ALREADY_DOWNLOADED.  See the comment above for usage.
   NOTE(review): loop braces and the break on a match are elided in
   this view.  */
1685 downloaded_file (downloaded_file_t mode, const char* file)
1687 boolean found_file = FALSE;
1688 downloaded_file_list* rover = downloaded_files;
/* Linear search -- the list is small relative to network time.  */
1690 while (rover != NULL)
1691 if (strcmp(rover->file, file) == 0)
1697 rover = rover->next;
1700 return rover->download_type; /* file had already been downloaded */
1703 if (mode != CHECK_FOR_FILE)
/* Prepend a new record; xmalloc/xstrdup abort on out-of-memory.  */
1705 rover = xmalloc(sizeof(*rover));
1706 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1707 rover->download_type = mode;
1708 rover->next = downloaded_files;
1709 downloaded_files = rover;
1712 return FILE_NOT_ALREADY_DOWNLOADED;
/* Release the entire downloaded-files list, freeing each node's
   filename string and the node itself.  NOTE(review): the loop
   header, the free of the node, and the advance to NEXT are elided
   in this view.  */
1717 downloaded_files_free (void)
1719 downloaded_file_list* rover = downloaded_files;
/* Save the successor before freeing the current node.  */
1722 downloaded_file_list *next = rover->next;
1723 xfree (rover->file);
1729 /* Initialization of static stuff. */
1733 init_unsafe_char_table ();