2 Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
/* Default port definitions.  These are used when a URL does not name
   a port explicitly (see the sup_protos table below). */
#define DEFAULT_HTTP_PORT 80
#define DEFAULT_FTP_PORT 21
#define DEFAULT_HTTPS_PORT 443
/* Table of unsafe chars.  This is initialized in
   init_unsafe_char_table. */
static char unsafe_char_table[256];

/* Nonzero iff character C must be %XX-escaped in a URL.  C is cast to
   unsigned char so that negative char values index the table safely. */
#define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
/* If S contains unsafe characters, free it and replace it with a
   version that doesn't (uc_tmp holds the malloc-ed, %XX-encoded copy
   produced by encode_string). */
#define URL_CLEANSE(s) do \
  if (contains_unsafe (s)) \
    char *uc_tmp = encode_string (s); \
/* Is a directory "."? */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* Is a directory ".."? */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))

/* Forward declarations of file-local helpers defined further below. */
static void path_simplify_with_kludge PARAMS ((char *));
static int urlpath_length PARAMS ((const char *));
/* NULL-terminated list of strings to be recognized as prototypes (URL
   schemes).  Note that recognized doesn't mean supported -- only HTTP,
   HTTPS and FTP are currently supported.

   However, a string that does not match anything in the list will be
   considered a relative URL.  Thus it's important that this list has
   anything anyone could think of being legal.

   There are wild things here.  :-) Take a look at
   <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more. */
static char *protostrings[] =
/* Similar to former, but for supported protocols.  Each entry pairs a
   scheme prefix with its protocol id and default port; the table is
   searched linearly by prefix or by id throughout this file. */
static struct proto sup_protos[] =
  { "http://", URLHTTP, DEFAULT_HTTP_PORT },
  { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
  { "ftp://", URLFTP, DEFAULT_FTP_PORT },
  /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
/* Forward declarations for the URL-parsing helpers defined below. */
static void parse_dir PARAMS ((const char *, char **, char **));
static uerr_t parse_uname PARAMS ((const char *, char **, char **));
static char *construct PARAMS ((const char *, const char *, int , int));
static char *construct_relative PARAMS ((const char *, const char *));
static char process_ftp_type PARAMS ((char *));
/* Returns the number of characters to be skipped if the first thing
   in a URL is URL: (which is 0 or 4+).  The optional spaces after
   URL: are also skipped. */
skip_url (const char *url)
  /* Case-insensitive match of the "URL:" prefix. */
  if (TOUPPER (url[0]) == 'U'
      && TOUPPER (url[1]) == 'R'
      && TOUPPER (url[2]) == 'L'
    /* Skip any whitespace that follows "URL:". */
    for (i = 4; url[i] && ISSPACE (url[i]); i++);
   - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
   - @ and :, for user/password encoding.
   - everything over 127 (but we don't bother with recording those.) */
init_unsafe_char_table (void)
  /* Mark control characters and everything at or above DEL as unsafe. */
  for (i = 0; i < 256; i++)
    if (i < 32 || i >= 127
      unsafe_char_table[i] = 1;
/* Returns 1 if the string contains unsafe characters, 0 otherwise.
   Uses the UNSAFE_CHAR lookup table. */
contains_unsafe (const char *s)
  if (UNSAFE_CHAR (*s))
/* Decodes the forms %xy in a URL to the character the hexadecimal
   code of which is xy.  xy are hexadecimal digits from
   [0123456789ABCDEF] (case-insensitive).  If x or y are not
   hex-digits or `%' precedes `\0', the sequence is kept as-is.
   Works destructively, in place. */
decode_string (char *s)
      /* Do nothing if at the end of the string, or if the chars
         are not hex-digits. */
      if (!*(s + 1) || !*(s + 2)
          || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
      /* Combine the two hex digits into one byte. */
      *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
/* Encode the unsafe characters (as determined by URL_UNSAFE) in a
   given string, returning a malloc-ed %XX encoded string.  A first
   pass computes the required length, a second pass writes the
   escapes. */
encode_string (const char *s)
  /* Pass 1: measure.  Each unsafe char expands to three chars (%XX). */
  for (i = 0; *s; s++, i++)
    if (UNSAFE_CHAR (*s))
      i += 2; /* Two more characters (hex digits) */
  res = (char *)xmalloc (i + 1);
  /* Pass 2: copy, escaping unsafe bytes. */
  for (p = res; *s; s++)
    if (UNSAFE_CHAR (*s))
        const unsigned char c = *s;
        *p++ = HEXD2ASC (c >> 4);
        *p++ = HEXD2ASC (c & 0xf);
/* Returns the proto-type if URL's protocol is supported, or
   URLUNKNOWN if not. */
urlproto (const char *url)
  url += skip_url (url);
  /* Try each supported scheme prefix first. */
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
      return sup_protos[i].ind;
  /* No known prefix: scan to the first `:' or `/'. */
  for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
  /* Heuristic: if what follows the colon is all digits up to a `/',
     it looks like host:port. */
  for (++i; url[i] && url[i] != '/'; i++)
    if (!ISDIGIT (url[i]))
  if (url[i - 1] == ':')
/* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
   part is found, returns 0. */
skip_proto (const char *url)
  /* Linear search through the recognized scheme strings. */
  for (s = protostrings; *s; s++)
    if (!strncasecmp (*s, url, strlen (*s)))
  /* HTTP and FTP protocols are expected to yield exact host names
     (i.e. the `//' part must be skipped, too). */
  if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
/* Returns 1 if the URL begins with a protocol (supported or
   unsupported), 0 otherwise. */
has_proto (const char *url)
  url += skip_url (url);
  /* Any match in protostrings counts, even unsupported schemes. */
  for (s = protostrings; *s; s++)
    if (strncasecmp (url, *s, strlen (*s)) == 0)
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the protocol.

   If no username and password are found, return 0. */
skip_uname (const char *url)
  /* Scan up to the first `/' (end of the authority part). */
  for (p = url; *p && *p != '/'; p++)
  /* If a `@' was found before the first occurrence of `/', skip
     everything up to and including it. */
/* Allocate a new urlinfo structure, fill it with default values and
   return a pointer to it.  All fields start zeroed; the protocol is
   explicitly marked unknown. */
u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
memset (u, 0, sizeof (*u));
u->proto = URLUNKNOWN;
/* Perform a "deep" free of the urlinfo structure.  The structure
   should have been created with newurl, but need not have been used.
   If free_pointer is non-0, free the pointer itself. */
freeurl (struct urlinfo *u, int complete)
  /* FREE_MAYBE tolerates NULL members, so an unused structure is
     safe to free. */
  FREE_MAYBE (u->host);
  FREE_MAYBE (u->path);
  FREE_MAYBE (u->file);
  FREE_MAYBE (u->user);
  FREE_MAYBE (u->passwd);
  FREE_MAYBE (u->local);
  FREE_MAYBE (u->referer);
  /* Recursively free the associated proxy urlinfo, if any. */
  freeurl (u->proxy, 1);
/* Extract the given URL of the form
   (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
   1. hostname (terminated with `/' or `:')
   2. port number (terminated with `/'), or chosen for the protocol
   3. dirname (everything after hostname)
   Most errors are handled.  No allocation is done, you must supply
   pointers to allocated memory.
   ...and a host of other stuff :-)

   - Recognizes hostname:dir/file for FTP and
     hostname (:portnum)?/dir/file for HTTP.
   - Parses the path to yield directory and file
   - Parses the URL to yield the username and passwd (if present)
   - Decodes the strings, in case they contain "forbidden" characters
   - Writes the result to struct urlinfo

   If the argument STRICT is set, it recognizes only the canonical
   form with an explicit scheme. */
parseurl (const char *url, struct urlinfo *u, int strict)
  int recognizable;            /* Recognizable URL is the one where
                                  the protocol name was explicitly
                                  named, i.e. it wasn't deduced from
                                  the URL's form. */
  DEBUGP (("parseurl (\"%s\") -> ", url));
  url += skip_url (url);
  recognizable = has_proto (url);
  if (strict && !recognizable)
  /* Look for a supported scheme prefix; L is its length on a match. */
  for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
      l = strlen (sup_protos[i].name);
      if (!strncasecmp (sup_protos[i].name, url, l))
  /* If protocol is recognizable, but unsupported, bail out, else
     assume one of the supported ones (guessed further below). */
  if (recognizable && i == ARRAY_SIZE (sup_protos))
  else if (i == ARRAY_SIZE (sup_protos))
    u->proto = type = sup_protos[i].ind;
  if (type == URLUNKNOWN)
  /* Allow a username and password to be specified (i.e. just skip
     them for now). */
    l += skip_uname (url + l);
  /* The hostname ends at the first `:' or `/'. */
  for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
  /* Get the hostname. */
  u->host = strdupdelim (url + l, url + i);
  DEBUGP (("host %s -> ", u->host));
  /* Assume no port has been given. */
  /* We have a colon delimiting the hostname.  It could mean that
     a port number is following it, or a directory. */
  if (ISDIGIT (url[++i]))    /* A port number */
      if (type == URLUNKNOWN)
        u->proto = type = URLHTTP;
      for (; url[i] && url[i] != '/'; i++)
        if (ISDIGIT (url[i]))
          u->port = 10 * u->port + (url[i] - '0');
      DEBUGP (("port %hu -> ", u->port));
  else if (type == URLUNKNOWN)  /* or a directory */
    u->proto = type = URLFTP;
  else                          /* or just a misformed port number */
  else if (type == URLUNKNOWN)
    u->proto = type = URLHTTP;
  /* Find the default port for the detected protocol. */
  for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
    if (sup_protos[ind].ind == type)
  if (ind == ARRAY_SIZE (sup_protos))
  u->port = sup_protos[ind].port;
  /* Some delimiter troubles... */
  if (url[i] == '/' && url[i - 1] != ':')
  /* Collapse the leading run of slashes before the path. */
  while (url[i] && url[i] == '/')
  u->path = (char *)xmalloc (strlen (url + i) + 8);
  strcpy (u->path, url + i);
  /* Strip and record a trailing `;type=X' FTP modifier, if present. */
  u->ftp_type = process_ftp_type (u->path);
  /* #### We don't handle type `d' correctly yet. */
  if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
  DEBUGP (("opath %s -> ", u->path));
  /* Parse the username and password (if existing). */
  parse_uname (url, &u->user, &u->passwd);
  /* Decode the strings, as per RFC 1738. */
  decode_string (u->host);
  decode_string (u->path);
  decode_string (u->user);
  decode_string (u->passwd);
  /* Parse the directory. */
  parse_dir (u->path, &u->dir, &u->file);
  DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
  /* Simplify the directory. */
  path_simplify (u->dir);
  /* Remove the leading `/' in HTTP. */
  if (type == URLHTTP && *u->dir == '/')
    strcpy (u->dir, u->dir + 1);
  DEBUGP (("ndir %s\n", u->dir));
  /* Strip trailing `/'. */
  if (l && u->dir[l - 1] == '/')
    u->dir[l - 1] = '\0';
  /* Re-create the path: an absolute FTP directory keeps its leading
     slash encoded as %2F so it survives later processing. */
  abs_ftp = (u->proto == URLFTP && *u->dir == '/');
  /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
     abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
  strcpy (u->path, abs_ftp ? "%2F" : "/");
  strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
  strcat (u->path, *u->dir ? "/" : "");
  strcat (u->path, u->file);
  URL_CLEANSE (u->path);
  DEBUGP (("newpath: %s\n", u->path));
  /* Create the clean URL. */
  u->url = str_url (u, 0);
/* Special versions of DOTP and DDOTP for parse_dir().  Unlike
   DOTP/DDOTP, a path component may be terminated either by `\0' or by
   `?' (the start of a query string). */
#define PD_DOTP(x) ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
/* BUGFIX: the second conjunct must test *((x) + 1), not re-test *(x);
   the original re-test made any component starting with `.' whose
   third character was `\0' or `?' (e.g. ".a") look like "..". */
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
                     && (!*((x) + 2) || *((x) + 2) == '?'))
/* Build the directory and filename components of the path.  Both
   components are *separately* malloc-ed strings!  It does not change
   the contents of path.

   If the path ends with "." or "..", they are (correctly) counted as
   directories. */
parse_dir (const char *path, char **dir, char **file)
  /* Only the part before any `?' counts as the path proper. */
  l = urlpath_length (path);
  /* Find the last `/' within that span. */
  for (i = l; i && path[i] != '/'; i--);
  if (!i && *path != '/')   /* Just filename */
      if (PD_DOTP (path) || PD_DDOTP (path))
          *dir = strdupdelim (path, path + l);
          *file = xstrdup (path + l); /* normally empty, but could
                                         contain a query */
          *dir = xstrdup ("");     /* This is required because of FTP */
          *file = xstrdup (path);
  else if (!i)                 /* /filename */
      if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
          *dir = strdupdelim (path, path + l);
          *file = xstrdup (path + l); /* normally empty, but could
                                         contain a query */
          *dir = xstrdup ("/");
          *file = xstrdup (path + 1);
  else /* Nonempty directory with or without a filename */
      if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
          *dir = strdupdelim (path, path + l);
          *file = xstrdup (path + l); /* normally empty, but could
                                         contain a query */
          *dir = strdupdelim (path, path + i);
          *file = xstrdup (path + i + 1);
/* Find the optional username and password within the URL, as per
   RFC1738.  The returned user and passwd char pointers are
   malloc-ed. */
parse_uname (const char *url, char **user, char **passwd)
  url += skip_url (url);
  /* Look for end of protocol string. */
  l = skip_proto (url);
  /* Add protocol offset. */
  /* Is there an `@' character? */
  for (p = url; *p && *p != '/'; p++)
  /* If not, return. */
  /* Else find the username and password.  COL marks where the
     password (or the whole userinfo) starts. */
  for (p = col = url; *p != '@'; p++)
      if (*p == ':' && !*user)
          /* Everything before the colon is the username. */
          *user = (char *)xmalloc (p - url + 1);
          memcpy (*user, url, p - url);
          (*user)[p - url] = '\0';
  /* Decide whether you have only the username or both. */
  where = *user ? passwd : user;
  *where = (char *)xmalloc (p - col + 1);
  memcpy (*where, col, p - col);
  (*where)[p - col] = '\0';
/* If PATH ends with `;type=X', return the character X, truncating the
   suffix off PATH in place. */
process_ftp_type (char *path)
  int len = strlen (path);
      && !memcmp (path + len - 7, ";type=", 6))
      /* Cut the suffix; the type character is still readable past the
         new terminator within the original buffer. */
      path[len - 7] = '\0';
      return path[len - 1];
/* Return the URL as fine-formed string, with a proper protocol, optional port
   number, directory and optional user/password.  If `hide' is non-zero (as it
   is when we're calling this on a URL we plan to print, but not when calling it
   to canonicalize a URL for use within the program), password will be hidden.
   The forbidden characters in the URL will be cleansed. */
str_url (const struct urlinfo *u, int hide)
  char *res, *host, *user, *passwd, *proto_name, *dir, *file;
  int i, l, ln, lu, lh, lp, lf, ld;
  unsigned short proto_default_port;
  /* Look for the protocol name. */
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (sup_protos[i].ind == u->proto)
  if (i == ARRAY_SIZE (sup_protos))
  proto_name = sup_protos[i].name;
  proto_default_port = sup_protos[i].port;
  /* CLEANDUP escapes unsafe characters into fresh copies. */
  host = CLEANDUP (u->host);
  dir = CLEANDUP (u->dir);
  file = CLEANDUP (u->file);
  user = passwd = NULL;
  user = CLEANDUP (u->user);
  /* Don't output the password, or someone might see it over the user's
     shoulder (or in saved wget output).  Don't give away the number of
     characters in the password, either, as we did in past versions of
     this code, when we replaced the password characters with 'x's. */
  passwd = xstrdup("<password>");
  passwd = CLEANDUP (u->passwd);
  /* An absolute FTP directory gets its leading `/' re-encoded as %2F. */
  if (u->proto == URLFTP && *dir == '/')
      char *tmp = (char *)xmalloc (strlen (dir) + 3);
      /*sprintf (tmp, "%%2F%s", dir + 1);*/
      strcpy (tmp + 3, dir + 1);
  /* Compute the component lengths, then assemble piecewise. */
  ln = strlen (proto_name);
  lu = user ? strlen (user) : 0;
  lp = passwd ? strlen (passwd) : 0;
  res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
  /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
     (user ? user : ""), (passwd ? ":" : ""),
     (passwd ? passwd : ""), (user ? "@" : ""),
     host, u->port, dir, *dir ? "/" : "", file); */
  memcpy (res, proto_name, ln);
  memcpy (res + l, user, lu);
  memcpy (res + l, passwd, lp);
  memcpy (res + l, host, lh);
  /* Only print the port when it differs from the protocol default. */
  if (u->port != proto_default_port)
      long_to_string (res + l, (long)u->port);
      l += numdigit (u->port);
  memcpy (res + l, dir, ld);
  strcpy (res + l, file);
/* Check whether two URL-s are equivalent, i.e. pointing to the same
   location.  Uses parseurl to parse them, and compares the canonical
   forms.

   Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
   return 0 on error. */
url_equal (const char *url1, const char *url2)
  struct urlinfo *u1, *u2;
  err = parseurl (url1, u1, 0);
  err = parseurl (url2, u2, 0);
  /* Compare the canonicalized string forms produced by parseurl. */
  res = !strcmp (u1->url, u2->url);
/* Build a linked list of urlpos entries from FILE, one URL per line.
   Leading and trailing whitespace on each line is stripped; empty
   lines produce no entry. */
get_urls_file (const char *file)
  struct file_memory *fm;
  const char *text, *text_end;
  fm = read_file (file);
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
  text_end = fm->content + fm->length;
  while (text < text_end)
      const char *line_beg = text;
      const char *line_end = memchr (text, '\n', text_end - text);
      /* Trim leading whitespace... */
      while (line_beg < line_end
             && ISSPACE (*line_beg))
      /* ...and trailing whitespace. */
      while (line_end > line_beg + 1
             && ISSPACE (*(line_end - 1)))
      if (line_end > line_beg)
          urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
          memset (entry, 0, sizeof (*entry));
          entry->url = strdupdelim (line_beg, line_end);
/* Free the linked list of urlpos.  NEXT is saved before each node is
   released. */
free_urlpos (urlpos *l)
  urlpos *next = l->next;
  FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times: FNAME.(i-1) is renamed to FNAME.i
   for i = opt.backups down to 2, then FNAME itself becomes FNAME.1. */
rotate_backups(const char *fname)
  /* Room for "FNAME.<digits>\0". */
  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
  char *from = (char *)alloca (maxlen);
  char *to = (char *)alloca (maxlen);
  /* Only rotate regular files. */
  if (stat (fname, &sb) == 0)
    if (S_ISREG (sb.st_mode) == 0)
  for (i = opt.backups; i > 1; i--)
      sprintf (from, "%s.%d", fname, i - 1);
      sprintf (to, "%s.%d", fname, i);
      /* #### This will fail on machines without the rename() system
         call. */
  sprintf (to, "%s.%d", fname, 1);
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally. */
mkalldirs (const char *path)
  /* Find the last `/', i.e. the end of the directory portion. */
  p = path + strlen (path);
  for (; *p != '/' && p != path; p--);
  /* Don't create if it's just a file. */
  if ((p == path) && (*p != '/'))
  t = strdupdelim (path, p);
  /* Check whether the directory exists. */
  if ((stat (t, &st) == 0))
      if (S_ISDIR (st.st_mode))
      /* If the dir exists as a file name, remove it first.  This
         is *only* for Wget to work with buggy old CERN http
         servers.  Here is the scenario: When Wget tries to
         retrieve a directory without a slash, e.g.
         http://foo/bar (bar being a directory), CERN server will
         not redirect it to http://foo/bar/ -- it will generate a
         directory listing containing links to bar/file1,
         bar/file2, etc.  Wget will lose because it saves this
         HTML listing to a file `bar', so it cannot create the
         directory.  To work around this, if the file of the same
         name exists, we just remove it and create the directory. */
      DEBUGP (("Removing %s because of directory danger!\n", t));
  res = make_directory (t);
    logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Count the occurrences of `/' in S (used by mkstruct when cutting
   leading directory components). */
count_slashes (const char *s)
/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories.  The result is malloc-ed. */
mkstruct (const struct urlinfo *u)
  char *host, *dir, *file, *res, *dirpref;
  assert (u->dir != NULL);
  assert (u->host != NULL);
  /* Honor --cut-dirs: skip up to opt.cut_dirs leading directory
     components of u->dir. */
  char *ptr = u->dir + (*u->dir == '/');
  int slash_count = 1 + count_slashes (ptr);
  int cut = MINVAL (opt.cut_dirs, slash_count);
  for (; cut && *ptr; ptr++)
  STRDUP_ALLOCA (dir, ptr);
  dir = u->dir + (*u->dir == '/');
  host = xstrdup (u->host);
  /* Check for the true name (or at least a consistent name for saving
     to directory) of HOST, reusing the hlist if possible. */
  if (opt.add_hostdir && !opt.simple_check)
      char *nhost = realhost (host);
  /* Add dir_prefix and hostname (if required) to the beginning of
     dir. */
  if (!DOTP (opt.dir_prefix))
      dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
                                + strlen (host) + 1);
      sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
    STRDUP_ALLOCA (dirpref, host);
  else /* not add_hostdir */
      if (!DOTP (opt.dir_prefix))
        dirpref = opt.dir_prefix;
  /* If there is a prefix, prepend it. */
      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
      sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
  dir = xstrdup (dir);
  /* Drop a trailing slash from the directory part. */
  if (l && dir[l - 1] == '/')
  /* A URL with no file component gets the conventional index name. */
  file = "index.html";
  /* Finally, construct the full name. */
  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
  sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
/* Create a unique filename, corresponding to a given URL.  Calls
   mkstruct if necessary.  Does *not* actually create any directories. */
url_filename (const struct urlinfo *u)
  int have_prefix = 0;  /* whether we must prepend opt.dir_prefix */
  file = mkstruct (u);
  /* Without -x, derive the name from the URL's file part, falling
     back to "index.html" when it is empty. */
  file = xstrdup ("index.html");
  file = xstrdup (u->file);
  /* Check whether the prefix directory is something other than "."
     before prepending it. */
  if (!DOTP (opt.dir_prefix))
      char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
                                     + 1 + strlen (file) + 1);
      sprintf (nfile, "%s/%s", opt.dir_prefix, file);
  /* DOS-ish file systems don't like `%' signs in them; we change it
     to an underscore. */
  for (p = file; *p; p++)
#endif /* WINDOWS */
  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.

     The exception is the case when file does exist and is a
     directory (actually support for bad httpd-s). */
  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
      && !(file_exists_p (file) && !file_non_directory_p (file)))
  /* Find a unique name. */
  name = unique_name (file);
/* Like strlen(), but the path is considered terminated by `?' as well
   as by `\0'. */
urlpath_length (const char *url)
  const char *q = strchr (url, '?');
  return strlen (url);
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is almost completely equivalent to
   { *e = '\0'; return strrchr(b, c); }, except that it doesn't change
   the contents of the string. */
find_last_char (const char *b, const char *e, char c)
/* Construct a URL by concatenating an absolute URL and a path, which
   may or may not be absolute.  This tries to behave "reasonably" in
   all foreseeable cases.  It employs little specific knowledge about
   protocols or URL-specific stuff -- it just works on strings. */
construct (const char *url, const char *sub, int subsize, int no_proto)
  /* END bounds the path part of URL (a trailing query is excluded). */
  const char *end = url + urlpath_length (url);
      /* SUB is a relative URL: we need to replace everything
         after last slash (possibly empty) with SUB.

         So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
         our result should be "whatever/foo/qux/xyzzy". */
      int need_explicit_slash = 0;
      const char *start_insert;
      const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */
          /* No slash found at all.  Append SUB to what we have,
             but we'll need a slash as a separator.

             Example: if url == "foo" and sub == "qux/xyzzy", then
             we cannot just append sub to url, because we'd get
             "fooqux/xyzzy", whereas what we want is
             "foo/qux/xyzzy".

             To make sure the / gets inserted, we set
             need_explicit_slash to 1.  We also set start_insert
             to end + 1, so that the length calculations work out
             correctly for one more (slash) character.  Accessing
             that character is fine, since it will be the
             delimiter, '\0' or '?'. */
          /* example: "foo?..." */
          /*               ^    ('?' gets changed to '/') */
          start_insert = end + 1;
          need_explicit_slash = 1;
      else if (last_slash && last_slash != url && *(last_slash - 1) == '/')
          /* example: http://host" */
          start_insert = end + 1;
          need_explicit_slash = 1;
          /* example: "whatever/foo/bar" */
          start_insert = last_slash + 1;
      span = start_insert - url;
      constr = (char *)xmalloc (span + subsize + 1);
      memcpy (constr, url, span);
      if (need_explicit_slash)
        constr[span - 1] = '/';
      memcpy (constr + span, sub, subsize);
      constr[span + subsize] = '\0';
  else /* *sub == `/' */
      /* SUB is an absolute path: we need to replace everything
         after (and including) the FIRST slash with SUB.

         So, if URL is "http://host/whatever/foo/bar", and SUB is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy". */
      const char *start_insert = NULL; /* for gcc to shut up. */
      const char *pos = url;
      int seen_slash_slash = 0;
      /* We're looking for the first slash, but want to ignore
         the double slash of the scheme separator. */
      slash = memchr (pos, '/', end - pos);
      if (slash && !seen_slash_slash)
        if (*(slash + 1) == '/')
            seen_slash_slash = 1;
      /* At this point, SLASH is the location of the first / after
         "//", or the first slash altogether.  START_INSERT is the
         pointer to the location where SUB will be inserted.  When
         examining the last two examples, keep in mind that SUB
         begins with '/'. */
      if (!slash && !seen_slash_slash)
        /* example: "foo" */
      else if (!slash && seen_slash_slash)
        /* example: "http://foo" */
      else if (slash && !seen_slash_slash)
        /* example: "foo/bar" */
      else if (slash && seen_slash_slash)
        /* example: "http://something/" */
        start_insert = slash;
      span = start_insert - url;
      constr = (char *)xmalloc (span + subsize + 1);
      memcpy (constr, url, span);
      memcpy (constr + span, sub, subsize);
      constr[span + subsize] = '\0';
  else /* !no_proto */
      /* SUB is itself a full URL: just duplicate it. */
      constr = strdupdelim (sub, sub + subsize);
/* Like the function above, but with a saner caller interface. */
url_concat (const char *base_url, const char *new_url)
  return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));
/* Optimize URL by host, destructively replacing u->host with realhost
   (u->host).  Do this regardless of opt.simple_check. */
opt_url (struct urlinfo *u)
  /* Find the "true" host. */
  char *host = realhost (u->host);
  assert (u->dir != NULL);  /* the URL must have been parsed */
  /* Refresh the printed representation. */
  u->url = str_url (u, 0);
/* This beautiful kludge is fortunately not needed, as I've made
   parse_dir do the (almost) right thing, so that a query can never
   become a part of directory. */
/* Call path_simplify, but make sure that the part after the
   question-mark, if any, is not destroyed by path_simplify's
   in-place rewriting. */
path_simplify_with_kludge (char *path)
  char *query = strchr (path, '?');
  /* path_simplify also works destructively, so we also have the
     license to write. */
  path_simplify (path);
  /* Move the preserved query back to just after the simplified path. */
  char *newend = path + strlen (path);
  if (newend != query)
    memmove (newend, query, strlen (query) + 1);
/* Returns proxy host address, in accordance with PROTO.  The
   command-line option takes precedence over the environment
   variable of the same name. */
getproxy (uerr_t proto)
  if (proto == URLHTTP)
    return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
  else if (proto == URLFTP)
    return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
  else if (proto == URLHTTPS)
    return opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
#endif /* HAVE_SSL */
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns 0 when HOST matches a no_proxy suffix. */
no_proxy_match (const char *host, const char **no_proxy)
  return !sufmatch (no_proxy, host);

/* Helpers for convert_links, defined below. */
static void write_backup_file PARAMS ((const char *, downloaded_file_t));
static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
/* Change the links in an HTML document.  Accepts a structure that
   defines the positions of all the links. */
convert_links (const char *file, urlpos *l)
  struct file_memory *fm;
  downloaded_file_t downloaded_file_return;
  logprintf (LOG_VERBOSE, _("Converting %s... "), file);
  /* First we do a "dry run": go through the list L and see whether
     any URL needs to be converted in the first place.  If not, just
     leave the file alone. */
  for (dry = l; dry; dry = dry->next)
    if (dry->convert != CO_NOCONVERT)
      logputs (LOG_VERBOSE, _("nothing to do.\n"));
  fm = read_file (file);
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    write_backup_file (file, downloaded_file_return);
  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmaped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmaped region. */
  if (unlink (file) < 0 && errno != ENOENT)
      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
                 file, strerror (errno));
      read_file_free (fm);
  /* Now open the file for writing. */
  fp = fopen (file, "wb");
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      read_file_free (fm);
  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references. */
  for (; l; l = l->next)
      char *url_start = fm->content + l->pos;
      /* Sanity check: the recorded position must lie within the file. */
      if (l->pos >= fm->length)
          DEBUGP (("Something strange is going on. Please investigate."));
      /* If the URL is not to be converted, skip it. */
      if (l->convert == CO_NOCONVERT)
          DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
      /* Echo the file contents, up to the offending URL's opening
         quote, to the outfile. */
      fwrite (p, 1, url_start - p, fp);
      if (l->convert == CO_CONVERT_TO_RELATIVE)
          /* Convert absolute URL to relative. */
          char *newname = construct_relative (file, l->local_name);
          char *quoted_newname = html_quote_string (newname);
          replace_attr (&p, l->size, fp, quoted_newname);
          DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
                   l->url, newname, l->pos, file));
          xfree (quoted_newname);
      else if (l->convert == CO_CONVERT_TO_COMPLETE)
          /* Convert the link to absolute URL. */
          char *newlink = l->url;
          char *quoted_newlink = html_quote_string (newlink);
          replace_attr (&p, l->size, fp, quoted_newlink);
          DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
                   newlink, l->pos, file));
          xfree (quoted_newlink);
  /* Output the rest of the file. */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);
  read_file_free (fm);
  logputs (LOG_VERBOSE, _("done.\n"));
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1). */
construct_relative (const char *s1, const char *s2)
  int i, cnt, sepdirs1;
  return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't. */
  assert (*s1 != '/');
  /* Skip the directories common to both strings. */
  while (s1[i] && s2[i]
  if (s1[i] == '/' && s2[i] == '/')
  /* SEPDIRS1 counts the directories of S1 below the common prefix;
     each needs one "../" in the result. */
  for (sepdirs1 = 0; s1[i]; i++)
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2. */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
/* Add URL to the head of the list L, associating it with the local
   FILE name.  The new node is zero-initialized before the copies of
   URL and FILE are stored. */
add_url (urlpos *l, const char *url, const char *file)
  t = (urlpos *)xmalloc (sizeof (urlpos));
  memset (t, 0, sizeof (*t));
  t->url = xstrdup (url);
  t->local_name = xstrdup (file);
/* Back up FILE as FILE.orig (or, when -E appended ".html", with the
   trailing "html" overwritten by "orig") before link conversion
   rewrites it.  A static list of already-backed-up files prevents a
   second pass from clobbering the true original.
   NOTE(review): this extracted chunk is missing lines (braces and
   loop bodies); do not re-flow it blind.  */
1502 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1504 /* Rather than just writing over the original .html file with the
1505 converted version, save the former to *.orig. Note we only do
1506 this for files we've _successfully_ downloaded, so we don't
1507 clobber .orig files sitting around from previous invocations. */
1509 /* Construct the backup filename as the original name plus ".orig". */
1510 size_t filename_len = strlen(file);
1511 char* filename_plus_orig_suffix;
1512 boolean already_wrote_backup_file = FALSE;
1513 slist* converted_file_ptr;
/* Persists across calls; intentionally never freed (see comment
   near the end of this function).  */
1514 static slist* converted_files = NULL;
1516 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1518 /* Just write "orig" over "html". We need to do it this way
1519 because when we're checking to see if we've downloaded the
1520 file before (to see if we can skip downloading it), we don't
1521 know if it's a text/html file. Therefore we don't know yet
1522 at that stage that -E is going to cause us to tack on
1523 ".html", so we need to compare vs. the original URL plus
1524 ".orig", not the original URL plus ".html.orig". */
/* NOTE(review): the "- 4" below assumes FILE ends in "html"
   (presumably guaranteed because -E just added that extension); a
   shorter name would write before the buffer -- confirm caller.  */
1525 filename_plus_orig_suffix = alloca (filename_len + 1);
1526 strcpy(filename_plus_orig_suffix, file);
1527 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1529 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1531 /* Append ".orig" to the name. */
/* sizeof(".orig") includes the terminating NUL.  */
1532 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1533 strcpy(filename_plus_orig_suffix, file);
1534 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1537 /* We can get called twice on the same URL thanks to the
1538 convert_all_links() call in main(). If we write the .orig file
1539 each time in such a case, it'll end up containing the first-pass
1540 conversion, not the original file. So, see if we've already been
1541 called on this file. */
/* Linear scan of the already-backed-up list.  */
1542 converted_file_ptr = converted_files;
1543 while (converted_file_ptr != NULL)
1544 if (strcmp(converted_file_ptr->string, file) == 0)
1546 already_wrote_backup_file = TRUE;
1550 converted_file_ptr = converted_file_ptr->next;
1552 if (!already_wrote_backup_file)
1554 /* Rename <file> to <file>.orig before former gets written over. */
/* A failed backup is logged but deliberately not fatal.  */
1555 if (rename(file, filename_plus_orig_suffix) != 0)
1556 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1557 file, filename_plus_orig_suffix, strerror (errno));
1559 /* Remember that we've already written a .orig backup for this file.
1560 Note that we never free this memory since we need it till the
1561 convert_all_links() call, which is one of the last things the
1562 program does before terminating. BTW, I'm not sure if it would be
1563 safe to just set 'converted_file_ptr->string' to 'file' below,
1564 rather than making a copy of the string... Another note is that I
1565 thought I could just add a field to the urlpos structure saying
1566 that we'd written a .orig file for this URL, but that didn't work,
1567 so I had to make this separate list.
1568 -- Dan Harkless <wget@harkless.org>
1570 This [adding a field to the urlpos structure] didn't work
1571 because convert_file() is called twice: once after all its
1572 sublinks have been retrieved in recursive_retrieve(), and
1573 once at the end of the day in convert_all_links(). The
1574 original linked list collected in recursive_retrieve() is
1575 lost after the first invocation of convert_links(), and
1576 convert_all_links() makes a new one (it calls get_urls_html()
1577 for each file it covers.) That's why your first approach didn't
1578 work. The way to make it work is perhaps to make this flag a
1579 field in the `urls_html' list.
1580 -- Hrvoje Niksic <hniksic@arsdigita.com>
/* Push FILE onto the head of the static list.  */
1582 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1583 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1584 converted_file_ptr->next = converted_files;
1585 converted_files = converted_file_ptr;
/* NOTE(review): this forward declaration is cut mid-line by the
   extraction; its continuation is missing.  */
1589 static int find_fragment PARAMS ((const char *, int, const char **,
/* Write NEW_STR to FP in place of the RAW_SIZE-byte attribute value
   at *PP, preserving the original quoting style and any trailing
   "#fragment" of the old value; *PP is advanced past the consumed
   input.  NOTE(review): the lines that set quote_char from *p,
   advance p past the opening quote, and step *pp at the end are
   missing from this extraction.  */
1593 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1595 const char *p = *pp;
1597 int size = raw_size;
/* Default to double quote when the value was unquoted.  */
1598 char quote_char = '\"';
1599 const char *frag_beg, *frag_end;
1601 /* Structure of our string is:
1602 "...old-contents..."
1603 <--- l->size ---> (with quotes)
1606 <--- l->size --> (no quotes) */
1608 if (*p == '\"' || *p == '\'')
1613 size -= 2; /* disregard opening and closing quote */
/* Emit the replacement, quoted like the original.  */
1615 putc (quote_char, fp);
1616 fputs (new_str, fp);
1618 /* Look for fragment identifier, if any. */
1619 if (find_fragment (p, size, &frag_beg, &frag_end))
/* Carry the old "#fragment" over onto the new link.  */
1620 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1624 putc (quote_char, fp);
1628 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1629 preceded by '&'. If the character is not found, return zero. If
1630 the character is found, return 1 and set BP and EP to point to the
1631 beginning and end of the region.
1633 This is used for finding the fragment identifiers in URLs. */
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   directly preceded by '&'.  If found, return 1 and set *BP and *EP
   to the beginning and end of the fragment region (the '#' itself
   through BEG+SIZE); otherwise return 0 and leave *BP/*EP untouched.

   The '&' guard keeps a '#' inside an SGML numeric entity such as
   "&#38;" from being mistaken for a fragment separator.  */
static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  int saw_amp = 0;		/* was the previous char '&'? */

  for (; beg < end; beg++)
    {
      switch (*beg)
	{
	case '&':
	  saw_amp = 1;
	  break;
	case '#':
	  if (!saw_amp)
	    {
	      *bp = beg;
	      *ep = end;
	      return 1;
	    }
	  /* fall through: a '&'-prefixed '#' is ordinary data */
	default:
	  saw_amp = 0;
	}
    }
  return 0;
}
/* Node of the singly linked list remembering which local files have
   been downloaded, and how (see downloaded_file() below).
   NOTE(review): the member holding the filename -- referenced later
   as rover->file -- is missing from this extraction.  */
1662 typedef struct _downloaded_file_list {
/* How the file was downloaded (normally, with ".html" added, ...).  */
1664 downloaded_file_t download_type;
1665 struct _downloaded_file_list* next;
1666 } downloaded_file_list;
/* Head of the list; file-scope so it survives between calls.  */
1668 static downloaded_file_list *downloaded_files;
1670 /* Remembers which files have been downloaded. In the standard case, should be
1671 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1672 download successfully (i.e. not for ones we have failures on or that we skip).
1675 When we've downloaded a file and tacked on a ".html" extension due to -E,
1676 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1677 FILE_DOWNLOADED_NORMALLY.
1679 If you just want to check if a file has been previously added without adding
1680 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1681 with local filenames, not remote URLs. */
/* Look FILE up in the downloaded-files list; if MODE is not
   CHECK_FOR_FILE, record it with that mode when absent.  Returns the
   recorded download type, or FILE_NOT_ALREADY_DOWNLOADED.
   NOTE(review): braces and the loop-exit lines are missing from this
   extraction; the early return at 1698 presumably fires only when the
   scan found FILE -- confirm against the full source.  */
1683 downloaded_file (downloaded_file_t mode, const char* file)
1685 boolean found_file = FALSE;
1686 downloaded_file_list* rover = downloaded_files;
/* Linear scan for FILE.  */
1688 while (rover != NULL)
1689 if (strcmp(rover->file, file) == 0)
1695 rover = rover->next;
1698 return rover->download_type; /* file had already been downloaded */
/* Not found: record it unless the caller only wanted a lookup.  */
1701 if (mode != CHECK_FOR_FILE)
1703 rover = xmalloc(sizeof(*rover));
1704 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1705 rover->download_type = mode;
/* Push onto the head of the static list.  */
1706 rover->next = downloaded_files;
1707 downloaded_files = rover;
1710 return FILE_NOT_ALREADY_DOWNLOADED;
/* Release every node of the downloaded-files list.
   NOTE(review): the loop tail (freeing the node itself and advancing
   rover) is missing from this extraction.  */
1715 downloaded_files_free (void)
1717 downloaded_file_list* rover = downloaded_files;
/* Save the successor before the node's contents are freed.  */
1720 downloaded_file_list *next = rover->next;
1721 xfree (rover->file);
1727 /* Initialization of static stuff. */
1731 init_unsafe_char_table ();