2 Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* Table of Unsafe chars. This is initialized in
init_unsafe_char_table. */
/* A nonzero entry marks the corresponding octet as unsafe in a URL. */
static char unsafe_char_table[256];
/* Test whether character C is unsafe.  The cast to unsigned char keeps
   a negative plain `char' from indexing before the start of the table. */
#define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
52 /* If S contains unsafe characters, free it and replace it with a
53 version that doesn't. */
54 #define URL_CLEANSE(s) do \
56 if (contains_unsafe (s)) \
58 char *uc_tmp = encode_string (s); \
/* Is a directory "."?  Argument X is fully parenthesized so the macro
   stays correct for any expression (the original `*(x + 1)' would
   mis-bind for arguments such as `cond ? a : b'). */
#define DOTP(x) ((*(x) == '.') && (!*((x) + 1)))
/* Is a directory ".."? */
#define DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') && (!*((x) + 2)))
70 static void path_simplify_with_kludge PARAMS ((char *));
72 static int urlpath_length PARAMS ((const char *));
74 /* A NULL-terminated list of strings to be recognized as prototypes
75 (URL schemes). Note that recognized doesn't mean supported -- only
76 HTTP, HTTPS and FTP are currently supported.
78 However, a string that does not match anything in the list will be
79 considered a relative URL. Thus it's important that this list has
80 anything anyone could think of being legal.
82 There are wild things here. :-) Take a look at
83 <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
85 static char *protostrings[] =
127 /* Similar to former, but for supported protocols: */
128 static struct proto sup_protos[] =
130 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
132 { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
134 { "ftp://", URLFTP, DEFAULT_FTP_PORT }
/* Forward declarations of the static helpers defined later in this
   file.  NOTE(review): PARAMS appears to be the usual ANSI/K&R
   prototype-compatibility macro, defined elsewhere -- confirm. */
static void parse_dir PARAMS ((const char *, char **, char **));
static uerr_t parse_uname PARAMS ((const char *, char **, char **));
static char *construct PARAMS ((const char *, const char *, int , int));
static char *construct_relative PARAMS ((const char *, const char *));
static char process_ftp_type PARAMS ((char *));
- stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
- @ and :, for user/password encoding.
- everything over 127 (but we don't bother with recording those). */
/* Fill unsafe_char_table: set a 1 entry for every octet that must be
   %-escaped in a URL.  Control characters (< 32) and DEL/8-bit
   octets (>= 127) are always unsafe; the condition continues with
   the punctuation characters listed above (continuation not shown
   here). */
init_unsafe_char_table (void)
for (i = 0; i < 256; i++)
if (i < 32 || i >= 127
unsafe_char_table[i] = 1;
/* Returns 1 if the string contains unsafe characters, 0 otherwise. */
contains_unsafe (const char *s)
/* NOTE(review): the enclosing loop over S is elided here; each octet
   is tested against the table via UNSAFE_CHAR. */
if (UNSAFE_CHAR (*s))
/* Decodes the forms %xy in a URL to the character the hexadecimal
code of which is xy. xy are hexadecimal digits from
[0123456789ABCDEF] (case-insensitive). If x or y are not
hex-digits or `%' precedes `\0', the sequence is inserted
literally rather than decoded. */
decode_string (char *s)
/* Do nothing if at the end of the string, or if the chars
are not hex-digits. */
if (!*(s + 1) || !*(s + 2)
|| !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
/* Combine the two hex digits (high nibble then low nibble) into a
single octet, written through P.  NOTE(review): P presumably
trails S so decoding happens in place -- confirm against the
elided lines. */
*p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
/* Encode the unsafe characters (as determined by URL_UNSAFE) in a
given string, returning a malloc-ed %XX encoded string.
Two passes: the first measures the result, the second writes it. */
encode_string (const char *s)
/* Pass 1: each unsafe octet grows from one character to three
("%XX"), i.e. two extra characters. */
for (i = 0; *s; s++, i++)
if (UNSAFE_CHAR (*s))
i += 2; /* Two more characters (hex digits) */
res = (char *)xmalloc (i + 1);
/* Pass 2: emit the encoded copy.  NOTE(review): S must be rewound to
the start of the string before this loop; the rewind is in the
elided lines -- confirm. */
for (p = res; *s; s++)
if (UNSAFE_CHAR (*s))
const unsigned char c = *s;
/* The '%' is emitted in an elided line; here come the two hex
digits of the octet. */
*p++ = HEXD2ASC (c >> 4);
*p++ = HEXD2ASC (c & 0xf);
244 /* Returns the proto-type if URL's protocol is supported, or
245 URLUNKNOWN if not. */
247 urlproto (const char *url)
251 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
252 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
253 return sup_protos[i].ind;
254 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
257 for (++i; url[i] && url[i] != '/'; i++)
258 if (!ISDIGIT (url[i]))
260 if (url[i - 1] == ':')
269 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
270 part is found, returns 0. */
272 skip_proto (const char *url)
277 for (s = protostrings; *s; s++)
278 if (!strncasecmp (*s, url, strlen (*s)))
283 /* HTTP and FTP protocols are expected to yield exact host names
284 (i.e. the `//' part must be skipped, too). */
285 if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
290 /* Returns 1 if the URL begins with a protocol (supported or
291 unsupported), 0 otherwise. */
293 has_proto (const char *url)
297 for (s = protostrings; *s; s++)
298 if (strncasecmp (url, *s, strlen (*s)) == 0)
303 /* Skip the username and password, if present here. The function
304 should be called *not* with the complete URL, but with the part
305 right after the protocol.
307 If no username and password are found, return 0. */
309 skip_uname (const char *url)
312 const char *q = NULL;
313 for (p = url ; *p && *p != '/'; p++)
314 if (*p == '@') q = p;
315 /* If a `@' was found before the first occurrence of `/', skip
323 /* Allocate a new urlinfo structure, fill it with default values and
324 return a pointer to it. */
330 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
331 memset (u, 0, sizeof (*u));
332 u->proto = URLUNKNOWN;
336 /* Perform a "deep" free of the urlinfo structure. The structure
337 should have been created with newurl, but need not have been used.
338 If free_pointer is non-0, free the pointer itself. */
340 freeurl (struct urlinfo *u, int complete)
344 FREE_MAYBE (u->host);
345 FREE_MAYBE (u->path);
346 FREE_MAYBE (u->file);
348 FREE_MAYBE (u->user);
349 FREE_MAYBE (u->passwd);
350 FREE_MAYBE (u->local);
351 FREE_MAYBE (u->referer);
353 freeurl (u->proxy, 1);
359 /* Extract the given URL of the form
360 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
361 1. hostname (terminated with `/' or `:')
362 2. port number (terminated with `/'), or chosen for the protocol
363 3. dirname (everything after hostname)
364 Most errors are handled. No allocation is done, you must supply
365 pointers to allocated memory.
366 ...and a host of other stuff :-)
368 - Recognizes hostname:dir/file for FTP and
369 hostname (:portnum)?/dir/file for HTTP.
370 - Parses the path to yield directory and file
371 - Parses the URL to yield the username and passwd (if present)
372 - Decodes the strings, in case they contain "forbidden" characters
373 - Writes the result to struct urlinfo
375 If the argument STRICT is set, it recognizes only the canonical
378 parseurl (const char *url, struct urlinfo *u, int strict)
381 int recognizable; /* Recognizable URL is the one where
382 the protocol name was explicitly
383 named, i.e. it wasn't deduced from
387 DEBUGP (("parseurl (\"%s\") -> ", url));
388 recognizable = has_proto (url);
389 if (strict && !recognizable)
391 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
393 l = strlen (sup_protos[i].name);
394 if (!strncasecmp (sup_protos[i].name, url, l))
397 /* If protocol is recognizable, but unsupported, bail out, else
399 if (recognizable && i == ARRAY_SIZE (sup_protos))
401 else if (i == ARRAY_SIZE (sup_protos))
404 u->proto = type = sup_protos[i].ind;
406 if (type == URLUNKNOWN)
408 /* Allow a username and password to be specified (i.e. just skip
411 l += skip_uname (url + l);
412 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
415 /* Get the hostname. */
416 u->host = strdupdelim (url + l, url + i);
417 DEBUGP (("host %s -> ", u->host));
419 /* Assume no port has been given. */
423 /* We have a colon delimiting the hostname. It could mean that
424 a port number is following it, or a directory. */
425 if (ISDIGIT (url[++i])) /* A port number */
427 if (type == URLUNKNOWN)
428 u->proto = type = URLHTTP;
429 for (; url[i] && url[i] != '/'; i++)
430 if (ISDIGIT (url[i]))
431 u->port = 10 * u->port + (url[i] - '0');
436 DEBUGP (("port %hu -> ", u->port));
438 else if (type == URLUNKNOWN) /* or a directory */
439 u->proto = type = URLFTP;
440 else /* or just a misformed port number */
443 else if (type == URLUNKNOWN)
444 u->proto = type = URLHTTP;
448 for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
449 if (sup_protos[ind].ind == type)
451 if (ind == ARRAY_SIZE (sup_protos))
453 u->port = sup_protos[ind].port;
455 /* Some delimiter troubles... */
456 if (url[i] == '/' && url[i - 1] != ':')
459 while (url[i] && url[i] == '/')
461 u->path = (char *)xmalloc (strlen (url + i) + 8);
462 strcpy (u->path, url + i);
465 u->ftp_type = process_ftp_type (u->path);
466 /* #### We don't handle type `d' correctly yet. */
467 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
469 DEBUGP (("ftp_type %c -> ", u->ftp_type));
471 DEBUGP (("opath %s -> ", u->path));
472 /* Parse the username and password (if existing). */
473 parse_uname (url, &u->user, &u->passwd);
474 /* Decode the strings, as per RFC 1738. */
475 decode_string (u->host);
476 decode_string (u->path);
478 decode_string (u->user);
480 decode_string (u->passwd);
481 /* Parse the directory. */
482 parse_dir (u->path, &u->dir, &u->file);
483 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
484 /* Simplify the directory. */
485 path_simplify (u->dir);
486 /* Remove the leading `/' in HTTP. */
487 if (type == URLHTTP && *u->dir == '/')
488 strcpy (u->dir, u->dir + 1);
489 DEBUGP (("ndir %s\n", u->dir));
490 /* Strip trailing `/'. */
492 if (l > 1 && u->dir[l - 1] == '/')
493 u->dir[l - 1] = '\0';
494 /* Re-create the path: */
495 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
496 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
497 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
498 strcpy (u->path, abs_ftp ? "%2F" : "/");
499 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
500 strcat (u->path, *u->dir ? "/" : "");
501 strcat (u->path, u->file);
502 URL_CLEANSE (u->path);
503 DEBUGP (("newpath: %s\n", u->path));
504 /* Create the clean URL. */
505 u->url = str_url (u, 0);
/* Special versions of DOTP and DDOTP for parse_dir(): like DOTP and
   DDOTP, but also accept a trailing '?' (start of a query) in place
   of the terminating '\0'. */
#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
/* Bug fix: the second character must be tested at offset 1.  The
   original tested *(x) twice, so e.g. ".a" was misclassified as "..",
   and for the one-character string "." it read *((x) + 2), one byte
   past the terminator.  The corrected test also short-circuits before
   that out-of-bounds read can happen. */
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
                     && (!*((x) + 2) || *((x) + 2) == '?'))
515 /* Build the directory and filename components of the path. Both
516 components are *separately* malloc-ed strings! It does not change
517 the contents of path.
519 If the path ends with "." or "..", they are (correctly) counted as
522 parse_dir (const char *path, char **dir, char **file)
526 l = urlpath_length (path);
527 for (i = l; i && path[i] != '/'; i--);
529 if (!i && *path != '/') /* Just filename */
531 if (PD_DOTP (path) || PD_DDOTP (path))
533 *dir = strdupdelim (path, path + l);
534 *file = xstrdup (path + l); /* normally empty, but could
539 *dir = xstrdup (""); /* This is required because of FTP */
540 *file = xstrdup (path);
543 else if (!i) /* /filename */
545 if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
547 *dir = strdupdelim (path, path + l);
548 *file = xstrdup (path + l); /* normally empty, but could
553 *dir = xstrdup ("/");
554 *file = xstrdup (path + 1);
557 else /* Nonempty directory with or without a filename */
559 if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
561 *dir = strdupdelim (path, path + l);
562 *file = xstrdup (path + l); /* normally empty, but could
567 *dir = strdupdelim (path, path + i);
568 *file = xstrdup (path + i + 1);
573 /* Find the optional username and password within the URL, as per
574 RFC1738. The returned user and passwd char pointers are
577 parse_uname (const char *url, char **user, char **passwd)
580 const char *p, *q, *col;
586 /* Look for the end of the protocol string. */
587 l = skip_proto (url);
590 /* Add protocol offset. */
592 /* Is there an `@' character? */
593 for (p = url; *p && *p != '/'; p++)
596 /* If not, return. */
599 /* Else find the username and password. */
600 for (p = q = col = url; *p && *p != '/'; p++)
602 if (*p == ':' && !*user)
604 *user = (char *)xmalloc (p - url + 1);
605 memcpy (*user, url, p - url);
606 (*user)[p - url] = '\0';
609 if (*p == '@') q = p;
611 /* Decide whether you have only the username or both. */
612 where = *user ? passwd : user;
613 *where = (char *)xmalloc (q - col + 1);
614 memcpy (*where, col, q - col);
615 (*where)[q - col] = '\0';
/* If PATH ends with `;type=X', return the character X, stripping the
   `;type=X' suffix from PATH in place (by writing '\0' over the ';').
   NOTE(review): the length guard (len >= 7 or similar) and the
   no-match return path are in elided lines -- confirm. */
process_ftp_type (char *path)
int len = strlen (path);
&& !memcmp (path + len - 7, ";type=", 6))
path[len - 7] = '\0';
return path[len - 1];
635 /* Return the URL as fine-formed string, with a proper protocol, optional port
636 number, directory and optional user/password. If `hide' is non-zero (as it
637 is when we're calling this on a URL we plan to print, but not when calling it
638 to canonicalize a URL for use within the program), password will be hidden.
639 The forbidden characters in the URL will be cleansed. */
641 str_url (const struct urlinfo *u, int hide)
643 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
644 int i, l, ln, lu, lh, lp, lf, ld;
645 unsigned short proto_default_port;
647 /* Look for the protocol name. */
648 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
649 if (sup_protos[i].ind == u->proto)
651 if (i == ARRAY_SIZE (sup_protos))
653 proto_name = sup_protos[i].name;
654 proto_default_port = sup_protos[i].port;
655 host = CLEANDUP (u->host);
656 dir = CLEANDUP (u->dir);
657 file = CLEANDUP (u->file);
658 user = passwd = NULL;
660 user = CLEANDUP (u->user);
664 /* Don't output the password, or someone might see it over the user's
665 shoulder (or in saved wget output). Don't give away the number of
666 characters in the password, either, as we did in past versions of
667 this code, when we replaced the password characters with 'x's. */
668 passwd = xstrdup("<password>");
670 passwd = CLEANDUP (u->passwd);
672 if (u->proto == URLFTP && *dir == '/')
674 char *tmp = (char *)xmalloc (strlen (dir) + 3);
675 /*sprintf (tmp, "%%2F%s", dir + 1);*/
679 strcpy (tmp + 3, dir + 1);
684 ln = strlen (proto_name);
685 lu = user ? strlen (user) : 0;
686 lp = passwd ? strlen (passwd) : 0;
690 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
691 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
692 (user ? user : ""), (passwd ? ":" : ""),
693 (passwd ? passwd : ""), (user ? "@" : ""),
694 host, u->port, dir, *dir ? "/" : "", file); */
696 memcpy (res, proto_name, ln);
700 memcpy (res + l, user, lu);
705 memcpy (res + l, passwd, lp);
710 memcpy (res + l, host, lh);
712 if (u->port != proto_default_port)
715 long_to_string (res + l, (long)u->port);
716 l += numdigit (u->port);
719 memcpy (res + l, dir, ld);
723 strcpy (res + l, file);
732 /* Check whether two URL-s are equivalent, i.e. pointing to the same
733 location. Uses parseurl to parse them, and compares the canonical
736 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
737 return 0 on error. */
739 url_equal (const char *url1, const char *url2)
741 struct urlinfo *u1, *u2;
746 err = parseurl (url1, u1, 0);
753 err = parseurl (url2, u2, 0);
759 res = !strcmp (u1->url, u2->url);
766 get_urls_file (const char *file)
768 struct file_memory *fm;
770 const char *text, *text_end;
773 fm = read_file (file);
776 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
779 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
782 text_end = fm->content + fm->length;
783 while (text < text_end)
785 const char *line_beg = text;
786 const char *line_end = memchr (text, '\n', text_end - text);
792 while (line_beg < line_end
793 && ISSPACE (*line_beg))
795 while (line_end > line_beg + 1
796 && ISSPACE (*(line_end - 1)))
798 if (line_end > line_beg)
800 urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
801 memset (entry, 0, sizeof (*entry));
803 entry->url = strdupdelim (line_beg, line_end);
815 /* Free the linked list of urlpos. */
817 free_urlpos (urlpos *l)
821 urlpos *next = l->next;
823 FREE_MAYBE (l->local_name);
829 /* Rotate FNAME opt.backups times */
831 rotate_backups(const char *fname)
833 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
834 char *from = (char *)alloca (maxlen);
835 char *to = (char *)alloca (maxlen);
839 if (stat (fname, &sb) == 0)
840 if (S_ISREG (sb.st_mode) == 0)
843 for (i = opt.backups; i > 1; i--)
845 sprintf (from, "%s.%d", fname, i - 1);
846 sprintf (to, "%s.%d", fname, i);
847 /* #### This will fail on machines without the rename() system
852 sprintf (to, "%s.%d", fname, 1);
856 /* Create all the necessary directories for PATH (a file). Calls
857 mkdirhier() internally. */
859 mkalldirs (const char *path)
866 p = path + strlen (path);
867 for (; *p != '/' && p != path; p--);
868 /* Don't create if it's just a file. */
869 if ((p == path) && (*p != '/'))
871 t = strdupdelim (path, p);
872 /* Check whether the directory exists. */
873 if ((stat (t, &st) == 0))
875 if (S_ISDIR (st.st_mode))
882 /* If the dir exists as a file name, remove it first. This
883 is *only* for Wget to work with buggy old CERN http
884 servers. Here is the scenario: When Wget tries to
885 retrieve a directory without a slash, e.g.
886 http://foo/bar (bar being a directory), CERN server will
887 not redirect it to http://foo/bar/ -- it will generate a
888 directory listing containing links to bar/file1,
889 bar/file2, etc. Wget will lose because it saves this
890 HTML listing to a file `bar', so it cannot create the
891 directory. To work around this, if the file of the same
892 name exists, we just remove it and create the directory
894 DEBUGP (("Removing %s because of directory danger!\n", t));
898 res = make_directory (t);
900 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
906 count_slashes (const char *s)
915 /* Return the path name of the URL-equivalent file name, with a
916 remote-like structure of directories. */
918 mkstruct (const struct urlinfo *u)
920 char *host, *dir, *file, *res, *dirpref;
923 assert (u->dir != NULL);
924 assert (u->host != NULL);
928 char *ptr = u->dir + (*u->dir == '/');
929 int slash_count = 1 + count_slashes (ptr);
930 int cut = MINVAL (opt.cut_dirs, slash_count);
931 for (; cut && *ptr; ptr++)
934 STRDUP_ALLOCA (dir, ptr);
937 dir = u->dir + (*u->dir == '/');
939 host = xstrdup (u->host);
940 /* Check for the true name (or at least a consistent name for saving
941 to directory) of HOST, reusing the hlist if possible. */
942 if (opt.add_hostdir && !opt.simple_check)
944 char *nhost = realhost (host);
948 /* Add dir_prefix and hostname (if required) to the beginning of
952 if (!DOTP (opt.dir_prefix))
954 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
955 + strlen (host) + 1);
956 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
959 STRDUP_ALLOCA (dirpref, host);
961 else /* not add_hostdir */
963 if (!DOTP (opt.dir_prefix))
964 dirpref = opt.dir_prefix;
970 /* If there is a prefix, prepend it. */
973 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
974 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
980 if (l && dir[l - 1] == '/')
988 /* Finally, construct the full name. */
989 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
990 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
995 /* Create a unique filename, corresponding to a given URL. Calls
996 mkstruct if necessary. Does *not* actually create any directories. */
998 url_filename (const struct urlinfo *u)
1001 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1005 file = mkstruct (u);
1011 file = xstrdup ("index.html");
1013 file = xstrdup (u->file);
1018 /* Check whether the prefix directory is something other than "."
1019 before prepending it. */
1020 if (!DOTP (opt.dir_prefix))
1022 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1023 + 1 + strlen (file) + 1);
1024 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1029 /* DOS-ish file systems don't like `%' signs in them; we change it
1034 for (p = file; *p; p++)
1038 #endif /* WINDOWS */
1040 /* Check the cases in which the unique extensions are not used:
1041 1) Clobbering is turned off (-nc).
1042 2) Retrieval with regetting.
1043 3) Timestamping is used.
1044 4) Hierarchy is built.
1046 The exception is the case when file does exist and is a
1047 directory (actually support for bad httpd-s). */
1048 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1049 && !(file_exists_p (file) && !file_non_directory_p (file)))
1052 /* Find a unique name. */
1053 name = unique_name (file);
/* Like strlen(), but allow the URL to be ended with '?': return the
   length of URL up to (not including) the first '?', or the full
   string length when no '?' is present.  As shown, Q was computed but
   never used, so the documented '?' handling was missing; restored
   here. */
static int
urlpath_length (const char *url)
{
  const char *q = strchr (url, '?');
  if (q)
    return q - url;
  return strlen (url);
}
1068 /* Find the last occurrence of character C in the range [b, e), or
1069 NULL, if none are present. This is almost completely equivalent to
1070 { *e = '\0'; return strrchr(b, c); }, except that it doesn't change
1071 the contents of the string. */
1073 find_last_char (const char *b, const char *e, char c)
1081 /* Construct a URL by concatenating an absolute URL and a path, which
1082 may or may not be absolute. This tries to behave "reasonably" in
1083 all foreseeable cases. It employs little specific knowledge about
1084 protocols or URL-specific stuff -- it just works on strings. */
1086 construct (const char *url, const char *sub, int subsize, int no_proto)
1092 const char *end = url + urlpath_length (url);
1096 /* SUB is a relative URL: we need to replace everything
1097 after last slash (possibly empty) with SUB.
1099 So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
1100 our result should be "whatever/foo/qux/xyzzy". */
1101 int need_explicit_slash = 0;
1103 const char *start_insert;
1104 const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */
1107 /* No slash found at all. Append SUB to what we have,
1108 but we'll need a slash as a separator.
1110 Example: if url == "foo" and sub == "qux/xyzzy", then
1111 we cannot just append sub to url, because we'd get
1112 "fooqux/xyzzy", whereas what we want is
1115 To make sure the / gets inserted, we set
1116 need_explicit_slash to 1. We also set start_insert
1117 to end + 1, so that the length calculations work out
1118 correctly for one more (slash) character. Accessing
1119 that character is fine, since it will be the
1120 delimiter, '\0' or '?'. */
1121 /* example: "foo?..." */
1122 /* ^ ('?' gets changed to '/') */
1123 start_insert = end + 1;
1124 need_explicit_slash = 1;
1126 else if (last_slash && last_slash != url && *(last_slash - 1) == '/')
1128 /* example: http://host" */
1130 start_insert = end + 1;
1131 need_explicit_slash = 1;
1135 /* example: "whatever/foo/bar" */
1137 start_insert = last_slash + 1;
1140 span = start_insert - url;
1141 constr = (char *)xmalloc (span + subsize + 1);
1143 memcpy (constr, url, span);
1144 if (need_explicit_slash)
1145 constr[span - 1] = '/';
1147 memcpy (constr + span, sub, subsize);
1148 constr[span + subsize] = '\0';
1150 else /* *sub == `/' */
1152 /* SUB is an absolute path: we need to replace everything
1153 after (and including) the FIRST slash with SUB.
1155 So, if URL is "http://host/whatever/foo/bar", and SUB is
1156 "/qux/xyzzy", our result should be
1157 "http://host/qux/xyzzy". */
1160 const char *start_insert = NULL; /* for gcc to shut up. */
1161 const char *pos = url;
1162 int seen_slash_slash = 0;
1163 /* We're looking for the first slash, but want to ignore
1166 slash = memchr (pos, '/', end - pos);
1167 if (slash && !seen_slash_slash)
1168 if (*(slash + 1) == '/')
1171 seen_slash_slash = 1;
1175 /* At this point, SLASH is the location of the first / after
1176 "//", or the first slash altogether. START_INSERT is the
1177 pointer to the location where SUB will be inserted. When
1178 examining the last two examples, keep in mind that SUB
1181 if (!slash && !seen_slash_slash)
1182 /* example: "foo" */
1185 else if (!slash && seen_slash_slash)
1186 /* example: "http://foo" */
1189 else if (slash && !seen_slash_slash)
1190 /* example: "foo/bar" */
1193 else if (slash && seen_slash_slash)
1194 /* example: "http://something/" */
1196 start_insert = slash;
1198 span = start_insert - url;
1199 constr = (char *)xmalloc (span + subsize + 1);
1201 memcpy (constr, url, span);
1203 memcpy (constr + span, sub, subsize);
1204 constr[span + subsize] = '\0';
1207 else /* !no_proto */
1209 constr = strdupdelim (sub, sub + subsize);
1214 /* Like the function above, but with a saner caller interface. */
1216 url_concat (const char *base_url, const char *new_url)
1218 return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));
1221 /* Optimize URL by host, destructively replacing u->host with realhost
1222 (u->host). Do this regardless of opt.simple_check. */
1224 opt_url (struct urlinfo *u)
1226 /* Find the "true" host. */
1227 char *host = realhost (u->host);
1230 assert (u->dir != NULL); /* the URL must have been parsed */
1231 /* Refresh the printed representation. */
1233 u->url = str_url (u, 0);
1236 /* This beautiful kludge is fortunately not needed, as I've made
1237 parse_dir do the (almost) right thing, so that a query can never
1238 become a part of directory. */
1240 /* Call path_simplify, but make sure that the part after the
1241 question-mark, if any, is not destroyed by path_simplify's
1244 path_simplify_with_kludge (char *path)
1246 char *query = strchr (path, '?');
1248 /* path_simplify also works destructively, so we also have the
1249 license to write. */
1251 path_simplify (path);
1254 char *newend = path + strlen (path);
1256 if (newend != query)
1257 memmove (newend, query, strlen (query) + 1);
1262 /* Returns proxy host address, in accordance with PROTO. */
1264 getproxy (uerr_t proto)
1266 if (proto == URLHTTP)
1267 return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1268 else if (proto == URLFTP)
1269 return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1271 else if (proto == URLHTTPS)
1272 return opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1273 #endif /* HAVE_SSL */
1278 /* Should a host be accessed through proxy, concerning no_proxy? */
1280 no_proxy_match (const char *host, const char **no_proxy)
1285 return !sufmatch (no_proxy, host);
1288 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1289 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1291 /* Change the links in an HTML document. Accepts a structure that
1292 defines the positions of all the links. */
1294 convert_links (const char *file, urlpos *l)
1296 struct file_memory *fm;
1299 downloaded_file_t downloaded_file_return;
1301 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1304 /* First we do a "dry run": go through the list L and see whether
1305 any URL needs to be converted in the first place. If not, just
1306 leave the file alone. */
1309 for (dry = l; dry; dry = dry->next)
1310 if (dry->convert != CO_NOCONVERT)
1314 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1319 fm = read_file (file);
1322 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1323 file, strerror (errno));
1327 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1328 if (opt.backup_converted && downloaded_file_return)
1329 write_backup_file (file, downloaded_file_return);
1331 /* Before opening the file for writing, unlink the file. This is
1332 important if the data in FM is mmaped. In such case, nulling the
1333 file, which is what fopen() below does, would make us read all
1334 zeroes from the mmaped region. */
1335 if (unlink (file) < 0 && errno != ENOENT)
1337 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1338 file, strerror (errno));
1339 read_file_free (fm);
1342 /* Now open the file for writing. */
1343 fp = fopen (file, "wb");
1346 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1347 file, strerror (errno));
1348 read_file_free (fm);
1351 /* Here we loop through all the URLs in file, replacing those of
1352 them that are downloaded with relative references. */
1354 for (; l; l = l->next)
1356 char *url_start = fm->content + l->pos;
1358 if (l->pos >= fm->length)
1360 DEBUGP (("Something strange is going on. Please investigate."));
1363 /* If the URL is not to be converted, skip it. */
1364 if (l->convert == CO_NOCONVERT)
1366 DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1370 /* Echo the file contents, up to the offending URL's opening
1371 quote, to the outfile. */
1372 fwrite (p, 1, url_start - p, fp);
1374 if (l->convert == CO_CONVERT_TO_RELATIVE)
1376 /* Convert absolute URL to relative. */
1377 char *newname = construct_relative (file, l->local_name);
1378 char *quoted_newname = html_quote_string (newname);
1379 replace_attr (&p, l->size, fp, quoted_newname);
1380 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1381 l->url, newname, l->pos, file));
1383 xfree (quoted_newname);
1385 else if (l->convert == CO_CONVERT_TO_COMPLETE)
1387 /* Convert the link to absolute URL. */
1388 char *newlink = l->url;
1389 char *quoted_newlink = html_quote_string (newlink);
1390 replace_attr (&p, l->size, fp, quoted_newlink);
1391 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1392 newlink, l->pos, file));
1393 xfree (quoted_newlink);
1396 /* Output the rest of the file. */
1397 if (p - fm->content < fm->length)
1398 fwrite (p, 1, fm->length - (p - fm->content), fp);
1400 read_file_free (fm);
1401 logputs (LOG_VERBOSE, _("done.\n"));
1404 /* Construct and return a malloced copy of the relative link from two
1405 pieces of information: local name S1 of the referring file and
1406 local name S2 of the referred file.
1408 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1409 "jagor.srce.hr/images/news.gif", the function will return
1412 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1413 "fly.cc.fer.hr/images/fly.gif", the function will return
1414 "../images/fly.gif".
1416 Caveats: S1 should not begin with `/', unless S2 also begins with
1417 '/'. S1 should not contain things like ".." and such --
1418 construct_relative ("fly/ioccc/../index.html",
1419 "fly/images/fly.gif") will fail. (A workaround is to call
1420 something like path_simplify() on S1). */
1422 construct_relative (const char *s1, const char *s2)
1424 int i, cnt, sepdirs1;
1428 return xstrdup (s2);
1429 /* S1 should *not* be absolute, if S2 wasn't. */
1430 assert (*s1 != '/');
1432 /* Skip the directories common to both strings. */
1435 while (s1[i] && s2[i]
1440 if (s1[i] == '/' && s2[i] == '/')
1445 for (sepdirs1 = 0; s1[i]; i++)
1448 /* Now, construct the file as of:
1449 - ../ repeated sepdirs1 time
1450 - all the non-mutual directories of S2. */
1451 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1452 for (i = 0; i < sepdirs1; i++)
1453 memcpy (res + 3 * i, "../", 3);
1454 strcpy (res + 3 * i, s2 + cnt);
1458 /* Add URL to the head of the list L. */
1460 add_url (urlpos *l, const char *url, const char *file)
1464 t = (urlpos *)xmalloc (sizeof (urlpos));
1465 memset (t, 0, sizeof (*t));
1466 t->url = xstrdup (url);
1467 t->local_name = xstrdup (file);
/* Back up FILE before link conversion overwrites it: rename it to
   FILE.orig, or -- when -E already tacked ".html" onto the name --
   to the name with the trailing "html" replaced by "orig".  A static
   list of files already backed up prevents a second pass from
   clobbering the backup with an already-converted copy.
   NOTE(review): braces and a few statements (the `break' inside the
   search loop, the list-append tail) are elided from this excerpt.  */
1473 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1475 /* Rather than just writing over the original .html file with the
1476 converted version, save the former to *.orig. Note we only do
1477 this for files we've _successfully_ downloaded, so we don't
1478 clobber .orig files sitting around from previous invocations. */
1480 /* Construct the backup filename as the original name plus ".orig". */
1481 size_t filename_len = strlen(file);
1482 char* filename_plus_orig_suffix;
1483 boolean already_wrote_backup_file = FALSE;
1484 slist* converted_file_ptr;
1485 static slist* converted_files = NULL;
1487 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1489 /* Just write "orig" over "html". We need to do it this way
1490 because when we're checking to see if we've downloaded the
1491 file before (to see if we can skip downloading it), we don't
1492 know if it's a text/html file. Therefore we don't know yet
1493 at that stage that -E is going to cause us to tack on
1494 ".html", so we need to compare vs. the original URL plus
1495 ".orig", not the original URL plus ".html.orig". */
/* "html" and "orig" have the same length, so the buffer need only be
   filename_len + 1.  Stack allocation via alloca: freed on return.  */
1496 filename_plus_orig_suffix = alloca (filename_len + 1);
1497 strcpy(filename_plus_orig_suffix, file);
/* NOTE(review): assumes FILE ends in "html" (so filename_len >= 4)
   in this branch -- presumably guaranteed by the caller passing
   FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED; confirm at call sites.  */
1498 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig")
1500 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1502 /* Append ".orig" to the name. */
/* sizeof(".orig") == 6 includes the terminating '\0'.  */
1503 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1504 strcpy(filename_plus_orig_suffix, file);
1505 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1508 /* We can get called twice on the same URL thanks to the
1509 convert_all_links() call in main(). If we write the .orig file
1510 each time in such a case, it'll end up containing the first-pass
1511 conversion, not the original file. So, see if we've already been
1512 called on this file. */
1513 converted_file_ptr = converted_files;
1514 while (converted_file_ptr != NULL)
1515 if (strcmp(converted_file_ptr->string, file) == 0)
1517 already_wrote_backup_file = TRUE;
1521 converted_file_ptr = converted_file_ptr->next;
1523 if (!already_wrote_backup_file)
1525 /* Rename <file> to <file>.orig before former gets written over. */
/* A failed rename is logged but not treated as fatal -- conversion
   proceeds without a backup.  */
1526 if (rename(file, filename_plus_orig_suffix) != 0)
1527 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1528 file, filename_plus_orig_suffix, strerror (errno));
1530 /* Remember that we've already written a .orig backup for this file.
1531 Note that we never free this memory since we need it till the
1532 convert_all_links() call, which is one of the last things the
1533 program does before terminating. BTW, I'm not sure if it would be
1534 safe to just set 'converted_file_ptr->string' to 'file' below,
1535 rather than making a copy of the string... Another note is that I
1536 thought I could just add a field to the urlpos structure saying
1537 that we'd written a .orig file for this URL, but that didn't work,
1538 so I had to make this separate list.
1539 -- Dan Harkless <wget@harkless.org>
1541 This [adding a field to the urlpos structure] didn't work
1542 because convert_file() is called twice: once after all its
1543 sublinks have been retrieved in recursive_retrieve(), and
1544 once at the end of the day in convert_all_links(). The
1545 original linked list collected in recursive_retrieve() is
1546 lost after the first invocation of convert_links(), and
1547 convert_all_links() makes a new one (it calls get_urls_html()
1548 for each file it covers.) That's why your first approach didn't
1549 work. The way to make it work is perhaps to make this flag a
1550 field in the `urls_html' list.
1551 -- Hrvoje Niksic <hniksic@arsdigita.com>
/* Push FILE onto the head of the remembered-backups list.  */
1553 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1554 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1555 converted_file_ptr->next = converted_files;
1556 converted_files = converted_file_ptr;
1560 static int find_fragment PARAMS ((const char *, int, const char **,
/* Write NEW_STR to FP in place of the RAW_SIZE-byte attribute value
   at *PP, preserving the original quoting style and any trailing
   #fragment of the old value.
   NOTE(review): this excerpt elides several lines -- braces, the
   assignment of QUOTE_CHAR from *P, the advance of P past the opening
   quote, and the final update of *PP past the consumed input -- so
   only the visible statements are described below.  */
1564 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1566 const char *p = *pp;
1568 int size = raw_size;
/* Default quote used when the old value was unquoted.  */
1569 char quote_char = '\"';
1570 const char *frag_beg, *frag_end;
1572 /* Structure of our string is:
1573 "...old-contents..."
1574 <--- l->size ---> (with quotes)
1577 <--- l->size --> (no quotes) */
/* Detect whether the old value was quoted (with " or ').  */
1579 if (*p == '\"' || *p == '\'')
1584 size -= 2; /* disregard opening and closing quote */
/* Emit the replacement value, re-quoted in the original style.  */
1586 putc (quote_char, fp);
1587 fputs (new_str, fp);
1589 /* Look for fragment identifier, if any. */
/* Carry over the #fragment from the old value, since NEW_STR does
   not include it.  */
1590 if (find_fragment (p, size, &frag_beg, &frag_end))
1591 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1595 putc (quote_char, fp);
1599 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1600 preceded by '&'. If the character is not found, return zero. If
1601 the character is found, return 1 and set BP and EP to point to the
1602 beginning and end of the region.
1604 This is used for finding the fragment identifiers in URLs. */
/* NOTE(review): the loop body (the '#'/'&' tests, the assignments to
   *BP and *EP, and the return statements) is elided from this
   excerpt.  */
1607 find_fragment (const char *beg, int size, const char **bp, const char **ep)
1609 const char *end = beg + size;
1611 for (; beg < end; beg++)
/* Node of the singly-linked list that remembers which local files
   have been downloaded, and how (see downloaded_file() below).
   NOTE(review): the member holding the filename string is elided
   from this excerpt; downloaded_file() accesses it as `file'.
   NOTE(review): the tag `_downloaded_file_list' begins with an
   underscore -- such names are reserved at file scope in C; consider
   renaming if this struct is ever touched.  */
1633 typedef struct _downloaded_file_list {
/* How the file was downloaded (normally, with -E suffix, etc.).  */
1635 downloaded_file_t download_type;
1636 struct _downloaded_file_list* next;
1637 } downloaded_file_list;
/* Head of the list of files recorded by downloaded_file(); lives for
   the whole run and is torn down by downloaded_files_free().  */
1639 static downloaded_file_list *downloaded_files;
1641 /* Remembers which files have been downloaded. In the standard case, should be
1642 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1643 download successfully (i.e. not for ones we have failures on or that we skip
1646 When we've downloaded a file and tacked on a ".html" extension due to -E,
1647 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1648 FILE_DOWNLOADED_NORMALLY.
1650 If you just want to check if a file has been previously added without adding
1651 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1652 with local filenames, not remote URLs. */
/* NOTE(review): braces, the `break' that ends the search loop, and
   the `if (found_file)' guard in front of the early return below are
   elided from this excerpt.  */
1654 downloaded_file (downloaded_file_t mode, const char* file)
1656 boolean found_file = FALSE;
1657 downloaded_file_list* rover = downloaded_files;
/* Linear scan for FILE among the already-recorded downloads.  */
1659 while (rover != NULL)
1660 if (strcmp(rover->file, file) == 0)
1666 rover = rover->next;
/* Reached only when the scan found FILE (guard elided here).  */
1669 return rover->download_type; /* file had already been downloaded */
/* FILE was not in the list; record it unless we were only asked to
   check for its presence.  */
1672 if (mode != CHECK_FOR_FILE)
1674 rover = xmalloc(sizeof(*rover));
1675 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1676 rover->download_type = mode;
/* Push onto the head of the list.  */
1677 rover->next = downloaded_files;
1678 downloaded_files = rover;
1681 return FILE_NOT_ALREADY_DOWNLOADED;
/* Free every node of the downloaded_files list together with the
   filename string each node owns.
   NOTE(review): the loop construct surrounding these statements, the
   xfree of the node itself, and the advance `rover = next' are
   elided from this excerpt.  */
1686 downloaded_files_free (void)
1688 downloaded_file_list* rover = downloaded_files;
/* Save the successor before the current node is freed.  */
1691 downloaded_file_list *next = rover->next;
1692 xfree (rover->file);
1698 /* Initialization of static stuff. */
1702 init_unsafe_char_table ();