2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* DOTP(x): true iff the string at X is exactly ".".  */
46 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* DDOTP(x): true iff the string at X is exactly "..".  */
48 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Length of the path portion of a URL, stopping at `?' (see definition
   further below).  */
50 static int urlpath_length PARAMS ((const char *));
59 /* Supported protocols: */
60 static struct proto sup_protos[] =
62 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
64 { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
66 { "ftp://", URLFTP, DEFAULT_FTP_PORT }
69 static void parse_dir PARAMS ((const char *, char **, char **));
70 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
71 static char *construct_relative PARAMS ((const char *, const char *));
72 static char process_ftp_type PARAMS ((char *));
75 /* Support for encoding and decoding of URL strings. We determine
76 whether a character is unsafe through static table lookup. This
77 code assumes ASCII character set and 8-bit chars. */
/* Single-letter aliases used to keep the urlchr_table initializer
   readable: R marks RFC 1738 reserved characters, U marks unsafe
   characters that must be %-escaped.  */
84 #define R urlchr_reserved
85 #define U urlchr_unsafe
/* Test character C against MASK.  The cast to unsigned char matters:
   plain char may be signed, and a negative index into urlchr_table
   would be undefined behavior for bytes >= 0x80.  */
88 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
90 /* rfc1738 reserved chars. We don't use this yet; preservation of
91 reserved chars will be implemented when I integrate the new
92 `reencode_string' function. */
94 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
98 - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
99 - '@' and ':'; needed for encoding URL username and password.
100 - anything >= 127. */
102 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
104 const static unsigned char urlchr_table[256] =
106 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
107 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
108 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
109 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
110 U, 0, U, U, 0, U, R, 0, /* SP ! " # $ % & ' */
111 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
112 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
113 0, 0, U, R, U, R, U, R, /* 8 9 : ; < = > ? */
114 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
115 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
116 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
117 0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
118 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
119 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
120 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
121 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
123 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
124 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
125 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
126 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
128 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
129 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
130 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
131 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
134 /* Decodes the forms %xy in a URL to the character the hexadecimal
135 code of which is xy. xy are hexadecimal digits from
136 [0123456789ABCDEF] (case-insensitive). If x or y are not
137 hex-digits or `%' precedes `\0', the sequence is inserted
141 decode_string (char *s)
143 char *t = s; /* t - tortoise */
144 char *h = s; /* h - hare */
155 /* Do nothing if '%' is not followed by two hex digits. */
156 if (!*(h + 1) || !*(h + 2)
157 || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
159 *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
166 /* Like encode_string, but return S if there are no unsafe chars. */
169 encode_string_maybe (const char *s)
176 for (p1 = s; *p1; p1++)
177 if (UNSAFE_CHAR (*p1))
178 addition += 2; /* Two more characters (hex digits) */
183 newlen = (p1 - s) + addition;
184 newstr = (char *)xmalloc (newlen + 1);
190 if (UNSAFE_CHAR (*p1))
192 const unsigned char c = *p1++;
194 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
195 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
201 assert (p2 - newstr == newlen);
206 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
207 given string, returning a malloc-ed %XX encoded string. */
210 encode_string (const char *s)
212 char *encoded = encode_string_maybe (s);
219 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
220 the old value of PTR is freed and PTR is made to point to the newly
221 allocated storage. */
223 #define ENCODE(ptr) do { \
224 char *e_new = encode_string_maybe (ptr); \
232 /* Returns the protocol type if URL's protocol is supported, or
233 URLUNKNOWN if not. */
235 urlproto (const char *url)
239 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
240 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
241 return sup_protos[i].ind;
242 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
245 for (++i; url[i] && url[i] != '/'; i++)
246 if (!ISDIGIT (url[i]))
248 if (url[i - 1] == ':')
257 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
258 part is found, returns 0. */
260 skip_proto (const char *url)
264 /* Skip protocol name. We allow `-' and `+' because of `whois++',
266 while (ISALNUM (*p) || *p == '-' || *p == '+')
273 /* Skip "//" if found. */
274 if (*p == '/' && *(p + 1) == '/')
280 /* Returns 1 if the URL begins with a protocol (supported or
281 unsupported), 0 otherwise. */
283 has_proto (const char *url)
286 while (ISALNUM (*p) || *p == '-' || *p == '+')
291 /* Skip the username and password, if present here. The function
292 should be called *not* with the complete URL, but with the part
293 right after the protocol.
295 If no username and password are found, return 0. */
297 skip_uname (const char *url)
300 const char *q = NULL;
301 for (p = url ; *p && *p != '/'; p++)
302 if (*p == '@') q = p;
303 /* If a `@' was found before the first occurrence of `/', skip
311 /* Allocate a new urlinfo structure, fill it with default values and
312 return a pointer to it. */
318 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
319 memset (u, 0, sizeof (*u));
320 u->proto = URLUNKNOWN;
324 /* Perform a "deep" free of the urlinfo structure. The structure
325 should have been created with newurl, but need not have been used.
326 If free_pointer is non-0, free the pointer itself. */
328 freeurl (struct urlinfo *u, int complete)
332 FREE_MAYBE (u->host);
333 FREE_MAYBE (u->path);
334 FREE_MAYBE (u->file);
336 FREE_MAYBE (u->user);
337 FREE_MAYBE (u->passwd);
338 FREE_MAYBE (u->local);
339 FREE_MAYBE (u->referer);
341 freeurl (u->proxy, 1);
347 /* Extract the given URL of the form
348 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
349 1. hostname (terminated with `/' or `:')
350 2. port number (terminated with `/'), or chosen for the protocol
351 3. dirname (everything after hostname)
352 Most errors are handled. No allocation is done, you must supply
353 pointers to allocated memory.
354 ...and a host of other stuff :-)
356 - Recognizes hostname:dir/file for FTP and
357 hostname (:portnum)?/dir/file for HTTP.
358 - Parses the path to yield directory and file
359 - Parses the URL to yield the username and passwd (if present)
360 - Decodes the strings, in case they contain "forbidden" characters
361 - Writes the result to struct urlinfo
363 If the argument STRICT is set, it recognizes only the canonical
366 parseurl (const char *url, struct urlinfo *u, int strict)
369 int recognizable; /* Recognizable URL is the one where
370 the protocol name was explicitly
371 named, i.e. it wasn't deduced from
375 DEBUGP (("parseurl (\"%s\") -> ", url));
376 recognizable = has_proto (url);
377 if (strict && !recognizable)
379 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
381 l = strlen (sup_protos[i].name);
382 if (!strncasecmp (sup_protos[i].name, url, l))
385 /* If protocol is recognizable, but unsupported, bail out, else
387 if (recognizable && i == ARRAY_SIZE (sup_protos))
389 else if (i == ARRAY_SIZE (sup_protos))
392 u->proto = type = sup_protos[i].ind;
394 if (type == URLUNKNOWN)
396 /* Allow a username and password to be specified (i.e. just skip
399 l += skip_uname (url + l);
400 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
403 /* Get the hostname. */
404 u->host = strdupdelim (url + l, url + i);
405 DEBUGP (("host %s -> ", u->host));
407 /* Assume no port has been given. */
411 /* We have a colon delimiting the hostname. It could mean that
412 a port number is following it, or a directory. */
413 if (ISDIGIT (url[++i])) /* A port number */
415 if (type == URLUNKNOWN)
416 u->proto = type = URLHTTP;
417 for (; url[i] && url[i] != '/'; i++)
418 if (ISDIGIT (url[i]))
419 u->port = 10 * u->port + (url[i] - '0');
424 DEBUGP (("port %hu -> ", u->port));
426 else if (type == URLUNKNOWN) /* or a directory */
427 u->proto = type = URLFTP;
428 else /* or just a misformed port number */
431 else if (type == URLUNKNOWN)
432 u->proto = type = URLHTTP;
436 for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
437 if (sup_protos[ind].ind == type)
439 if (ind == ARRAY_SIZE (sup_protos))
441 u->port = sup_protos[ind].port;
443 /* Some delimiter troubles... */
444 if (url[i] == '/' && url[i - 1] != ':')
447 while (url[i] && url[i] == '/')
449 u->path = (char *)xmalloc (strlen (url + i) + 8);
450 strcpy (u->path, url + i);
453 u->ftp_type = process_ftp_type (u->path);
454 /* #### We don't handle type `d' correctly yet. */
455 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
457 DEBUGP (("ftp_type %c -> ", u->ftp_type));
459 DEBUGP (("opath %s -> ", u->path));
460 /* Parse the username and password (if existing). */
461 parse_uname (url, &u->user, &u->passwd);
462 /* Decode the strings, as per RFC 1738. */
463 decode_string (u->host);
464 decode_string (u->path);
466 decode_string (u->user);
468 decode_string (u->passwd);
469 /* Parse the directory. */
470 parse_dir (u->path, &u->dir, &u->file);
471 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
472 /* Simplify the directory. */
473 path_simplify (u->dir);
474 /* Remove the leading `/' in HTTP. */
475 if (type == URLHTTP && *u->dir == '/')
476 strcpy (u->dir, u->dir + 1);
477 DEBUGP (("ndir %s\n", u->dir));
478 /* Strip trailing `/'. */
480 if (l > 1 && u->dir[l - 1] == '/')
481 u->dir[l - 1] = '\0';
482 /* Re-create the path: */
483 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
484 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
485 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
486 strcpy (u->path, abs_ftp ? "%2F" : "/");
487 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
488 strcat (u->path, *u->dir ? "/" : "");
489 strcat (u->path, u->file);
491 DEBUGP (("newpath: %s\n", u->path));
492 /* Create the clean URL. */
493 u->url = str_url (u, 0);
497 /* Special versions of DOTP and DDOTP for parse_dir(). They work like
498 DOTP and DDOTP, but they also recognize `?' as end-of-string
499 delimiter. This is needed for correct handling of query
502 #define PD_DOTP(x) ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
/* Like DDOTP, but also recognizes `?' as an end-of-string delimiter,
   so that a ".." path component followed by a query string is handled
   correctly (mirrors PD_DOTP above).

   Bug fix: the second conjunct must test the character at offset 1;
   the original tested *(x) twice, so any string beginning with a
   single `.' (e.g. ".a" or "./") was misclassified as "..".  */
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
		     && (!*((x) + 2) || *((x) + 2) == '?'))
506 /* Build the directory and filename components of the path. Both
507 components are *separately* malloc-ed strings! It does not change
508 the contents of path.
510 If the path ends with "." or "..", they are (correctly) counted as
513 parse_dir (const char *path, char **dir, char **file)
517 l = urlpath_length (path);
518 for (i = l; i && path[i] != '/'; i--);
520 if (!i && *path != '/') /* Just filename */
522 if (PD_DOTP (path) || PD_DDOTP (path))
524 *dir = strdupdelim (path, path + l);
525 *file = xstrdup (path + l); /* normally empty, but could
530 *dir = xstrdup (""); /* This is required because of FTP */
531 *file = xstrdup (path);
534 else if (!i) /* /filename */
536 if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
538 *dir = strdupdelim (path, path + l);
539 *file = xstrdup (path + l); /* normally empty, but could
544 *dir = xstrdup ("/");
545 *file = xstrdup (path + 1);
548 else /* Nonempty directory with or without a filename */
550 if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
552 *dir = strdupdelim (path, path + l);
553 *file = xstrdup (path + l); /* normally empty, but could
558 *dir = strdupdelim (path, path + i);
559 *file = xstrdup (path + i + 1);
564 /* Find the optional username and password within the URL, as per
565 RFC1738. The returned user and passwd char pointers are
568 parse_uname (const char *url, char **user, char **passwd)
571 const char *p, *q, *col;
577 /* Look for the end of the protocol string. */
578 l = skip_proto (url);
581 /* Add protocol offset. */
583 /* Is there an `@' character? */
584 for (p = url; *p && *p != '/'; p++)
587 /* If not, return. */
590 /* Else find the username and password. */
591 for (p = q = col = url; *p && *p != '/'; p++)
593 if (*p == ':' && !*user)
595 *user = (char *)xmalloc (p - url + 1);
596 memcpy (*user, url, p - url);
597 (*user)[p - url] = '\0';
600 if (*p == '@') q = p;
602 /* Decide whether you have only the username or both. */
603 where = *user ? passwd : user;
604 *where = (char *)xmalloc (q - col + 1);
605 memcpy (*where, col, q - col);
606 (*where)[q - col] = '\0';
610 /* If PATH ends with `;type=X', return the character X. */
612 process_ftp_type (char *path)
614 int len = strlen (path);
617 && !memcmp (path + len - 7, ";type=", 6))
619 path[len - 7] = '\0';
620 return path[len - 1];
626 /* Return the URL as fine-formed string, with a proper protocol, optional port
627 number, directory and optional user/password. If `hide' is non-zero (as it
628 is when we're calling this on a URL we plan to print, but not when calling it
629 to canonicalize a URL for use within the program), password will be hidden.
630 The forbidden characters in the URL will be cleansed. */
632 str_url (const struct urlinfo *u, int hide)
634 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
635 int i, l, ln, lu, lh, lp, lf, ld;
636 unsigned short proto_default_port;
638 /* Look for the protocol name. */
639 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
640 if (sup_protos[i].ind == u->proto)
642 if (i == ARRAY_SIZE (sup_protos))
644 proto_name = sup_protos[i].name;
645 proto_default_port = sup_protos[i].port;
646 host = encode_string (u->host);
647 dir = encode_string (u->dir);
648 file = encode_string (u->file);
649 user = passwd = NULL;
651 user = encode_string (u->user);
655 /* Don't output the password, or someone might see it over the user's
656 shoulder (or in saved wget output). Don't give away the number of
657 characters in the password, either, as we did in past versions of
658 this code, when we replaced the password characters with 'x's. */
659 passwd = xstrdup("<password>");
661 passwd = encode_string (u->passwd);
663 if (u->proto == URLFTP && *dir == '/')
665 char *tmp = (char *)xmalloc (strlen (dir) + 3);
666 /*sprintf (tmp, "%%2F%s", dir + 1);*/
670 strcpy (tmp + 3, dir + 1);
675 ln = strlen (proto_name);
676 lu = user ? strlen (user) : 0;
677 lp = passwd ? strlen (passwd) : 0;
681 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
682 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
683 (user ? user : ""), (passwd ? ":" : ""),
684 (passwd ? passwd : ""), (user ? "@" : ""),
685 host, u->port, dir, *dir ? "/" : "", file); */
687 memcpy (res, proto_name, ln);
691 memcpy (res + l, user, lu);
696 memcpy (res + l, passwd, lp);
701 memcpy (res + l, host, lh);
703 if (u->port != proto_default_port)
706 long_to_string (res + l, (long)u->port);
707 l += numdigit (u->port);
710 memcpy (res + l, dir, ld);
714 strcpy (res + l, file);
723 /* Check whether two URL-s are equivalent, i.e. pointing to the same
724 location. Uses parseurl to parse them, and compares the canonical
727 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
728 return 0 on error. */
729 /* Do not compile unused code. */
732 url_equal (const char *url1, const char *url2)
734 struct urlinfo *u1, *u2;
739 err = parseurl (url1, u1, 0);
746 err = parseurl (url2, u2, 0);
753 res = !strcmp (u1->url, u2->url);
761 get_urls_file (const char *file)
763 struct file_memory *fm;
765 const char *text, *text_end;
768 fm = read_file (file);
771 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
774 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
777 text_end = fm->content + fm->length;
778 while (text < text_end)
780 const char *line_beg = text;
781 const char *line_end = memchr (text, '\n', text_end - text);
787 while (line_beg < line_end
788 && ISSPACE (*line_beg))
790 while (line_end > line_beg + 1
791 && ISSPACE (*(line_end - 1)))
793 if (line_end > line_beg)
795 urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
796 memset (entry, 0, sizeof (*entry));
798 entry->url = strdupdelim (line_beg, line_end);
810 /* Free the linked list of urlpos. */
812 free_urlpos (urlpos *l)
816 urlpos *next = l->next;
818 FREE_MAYBE (l->local_name);
824 /* Rotate FNAME opt.backups times */
826 rotate_backups(const char *fname)
828 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
829 char *from = (char *)alloca (maxlen);
830 char *to = (char *)alloca (maxlen);
834 if (stat (fname, &sb) == 0)
835 if (S_ISREG (sb.st_mode) == 0)
838 for (i = opt.backups; i > 1; i--)
840 sprintf (from, "%s.%d", fname, i - 1);
841 sprintf (to, "%s.%d", fname, i);
842 /* #### This will fail on machines without the rename() system
847 sprintf (to, "%s.%d", fname, 1);
851 /* Create all the necessary directories for PATH (a file). Calls
852 mkdirhier() internally. */
854 mkalldirs (const char *path)
861 p = path + strlen (path);
862 for (; *p != '/' && p != path; p--);
863 /* Don't create if it's just a file. */
864 if ((p == path) && (*p != '/'))
866 t = strdupdelim (path, p);
867 /* Check whether the directory exists. */
868 if ((stat (t, &st) == 0))
870 if (S_ISDIR (st.st_mode))
877 /* If the dir exists as a file name, remove it first. This
878 is *only* for Wget to work with buggy old CERN http
879 servers. Here is the scenario: When Wget tries to
880 retrieve a directory without a slash, e.g.
881 http://foo/bar (bar being a directory), CERN server will
882 not redirect it too http://foo/bar/ -- it will generate a
883 directory listing containing links to bar/file1,
884 bar/file2, etc. Wget will lose because it saves this
885 HTML listing to a file `bar', so it cannot create the
886 directory. To work around this, if the file of the same
887 name exists, we just remove it and create the directory
889 DEBUGP (("Removing %s because of directory danger!\n", t));
893 res = make_directory (t);
895 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
901 count_slashes (const char *s)
910 /* Return the path name of the URL-equivalent file name, with a
911 remote-like structure of directories. */
913 mkstruct (const struct urlinfo *u)
915 char *host, *dir, *file, *res, *dirpref;
918 assert (u->dir != NULL);
919 assert (u->host != NULL);
923 char *ptr = u->dir + (*u->dir == '/');
924 int slash_count = 1 + count_slashes (ptr);
925 int cut = MINVAL (opt.cut_dirs, slash_count);
926 for (; cut && *ptr; ptr++)
929 STRDUP_ALLOCA (dir, ptr);
932 dir = u->dir + (*u->dir == '/');
934 host = xstrdup (u->host);
935 /* Check for the true name (or at least a consistent name for saving
936 to directory) of HOST, reusing the hlist if possible. */
937 if (opt.add_hostdir && !opt.simple_check)
939 char *nhost = realhost (host);
943 /* Add dir_prefix and hostname (if required) to the beginning of
947 if (!DOTP (opt.dir_prefix))
949 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
950 + strlen (host) + 1);
951 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
954 STRDUP_ALLOCA (dirpref, host);
956 else /* not add_hostdir */
958 if (!DOTP (opt.dir_prefix))
959 dirpref = opt.dir_prefix;
965 /* If there is a prefix, prepend it. */
968 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
969 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
972 dir = encode_string (dir);
974 if (l && dir[l - 1] == '/')
982 /* Finally, construct the full name. */
983 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
984 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
989 /* Return a malloced copy of S, but protect any '/' characters. */
992 file_name_protect_query_string (const char *s)
997 for (from = s; *from; from++)
1001 destlen += 2; /* each / gets replaced with %2F, so
1002 it adds two more chars. */
1004 dest = (char *)xmalloc (destlen + 1);
1005 for (from = s, to = dest; *from; from++)
1016 assert (to - dest == destlen);
1021 /* Create a unique filename, corresponding to a given URL. Calls
1022 mkstruct if necessary. Does *not* actually create any directories. */
1024 url_filename (const struct urlinfo *u)
1027 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1031 file = mkstruct (u);
1037 file = xstrdup ("index.html");
1040 /* If the URL came with a query string, u->file will contain
1041 a question mark followed by query string contents. These
1042 contents can contain '/' which would make us create
1043 unwanted directories. These slashes must be protected
1045 if (!strchr (u->file, '/'))
1046 file = xstrdup (u->file);
1049 /*assert (strchr (u->file, '?') != NULL);*/
1050 file = file_name_protect_query_string (u->file);
1057 /* Check whether the prefix directory is something other than "."
1058 before prepending it. */
1059 if (!DOTP (opt.dir_prefix))
1061 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1062 + 1 + strlen (file) + 1);
1063 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1068 /* DOS-ish file systems don't like `%' signs in them; we change it
1073 for (p = file; *p; p++)
1077 #endif /* WINDOWS */
1079 /* Check the cases in which the unique extensions are not used:
1080 1) Clobbering is turned off (-nc).
1081 2) Retrieval with regetting.
1082 3) Timestamping is used.
1083 4) Hierarchy is built.
1085 The exception is the case when file does exist and is a
1086 directory (actually support for bad httpd-s). */
1087 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1088 && !(file_exists_p (file) && !file_non_directory_p (file)))
1091 /* Find a unique name. */
1092 name = unique_name (file);
1097 /* Like strlen(), but allow the URL to be ended with '?'. */
1099 urlpath_length (const char *url)
1101 const char *q = strchr (url, '?');
1104 return strlen (url);
1107 /* Find the last occurrence of character C in the range [b, e), or
1108 NULL, if none are present. This is almost completely equivalent to
1109 { *e = '\0'; return strrchr(b); }, except that it doesn't change
1110 the contents of the string. */
1112 find_last_char (const char *b, const char *e, char c)
1120 /* Resolve the result of "linking" a base URI (BASE) to a
1121 link-specified URI (LINK).
1123 Either of the URIs may be absolute or relative, complete with the
1124 host name, or path only. This tries to behave "reasonably" in all
1125 foreseeable cases. It employs little specific knowledge about
1126 protocols or URL-specific stuff -- it just works on strings.
1128 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1129 See uri_merge for a gentler interface to this functionality.
1131 #### This function should handle `./' and `../' so that the evil
1132 path_simplify can go. */
1134 uri_merge_1 (const char *base, const char *link, int linklength, int no_proto)
1140 const char *end = base + urlpath_length (base);
1144 /* LINK is a relative URL: we need to replace everything
1145 after last slash (possibly empty) with LINK.
1147 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1148 our result should be "whatever/foo/qux/xyzzy". */
1149 int need_explicit_slash = 0;
1151 const char *start_insert;
1152 const char *last_slash = find_last_char (base, end, '/');
1155 /* No slash found at all. Append LINK to what we have,
1156 but we'll need a slash as a separator.
1158 Example: if base == "foo" and link == "qux/xyzzy", then
1159 we cannot just append link to base, because we'd get
1160 "fooqux/xyzzy", whereas what we want is
1163 To make sure the / gets inserted, we set
1164 need_explicit_slash to 1. We also set start_insert
1165 to end + 1, so that the length calculations work out
1166 correctly for one more (slash) character. Accessing
1167 that character is fine, since it will be the
1168 delimiter, '\0' or '?'. */
1169 /* example: "foo?..." */
1170 /* ^ ('?' gets changed to '/') */
1171 start_insert = end + 1;
1172 need_explicit_slash = 1;
1174 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1176 /* example: http://host" */
1178 start_insert = end + 1;
1179 need_explicit_slash = 1;
1183 /* example: "whatever/foo/bar" */
1185 start_insert = last_slash + 1;
1188 span = start_insert - base;
1189 constr = (char *)xmalloc (span + linklength + 1);
1191 memcpy (constr, base, span);
1192 if (need_explicit_slash)
1193 constr[span - 1] = '/';
1195 memcpy (constr + span, link, linklength);
1196 constr[span + linklength] = '\0';
1198 else /* *link == `/' */
1200 /* LINK is an absolute path: we need to replace everything
1201 after (and including) the FIRST slash with LINK.
1203 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1204 "/qux/xyzzy", our result should be
1205 "http://host/qux/xyzzy". */
1208 const char *start_insert = NULL; /* for gcc to shut up. */
1209 const char *pos = base;
1210 int seen_slash_slash = 0;
1211 /* We're looking for the first slash, but want to ignore
1214 slash = memchr (pos, '/', end - pos);
1215 if (slash && !seen_slash_slash)
1216 if (*(slash + 1) == '/')
1219 seen_slash_slash = 1;
1223 /* At this point, SLASH is the location of the first / after
1224 "//", or the first slash altogether. START_INSERT is the
1225 pointer to the location where LINK will be inserted. When
1226 examining the last two examples, keep in mind that LINK
1229 if (!slash && !seen_slash_slash)
1230 /* example: "foo" */
1232 start_insert = base;
1233 else if (!slash && seen_slash_slash)
1234 /* example: "http://foo" */
1237 else if (slash && !seen_slash_slash)
1238 /* example: "foo/bar" */
1240 start_insert = base;
1241 else if (slash && seen_slash_slash)
1242 /* example: "http://something/" */
1244 start_insert = slash;
1246 span = start_insert - base;
1247 constr = (char *)xmalloc (span + linklength + 1);
1249 memcpy (constr, base, span);
1251 memcpy (constr + span, link, linklength);
1252 constr[span + linklength] = '\0';
1255 else /* !no_proto */
1257 constr = strdupdelim (link, link + linklength);
1262 /* Merge BASE with LINK and return the resulting URI. This is an
1263 interface to uri_merge_1 that assumes that LINK is a
1264 zero-terminated string. */
1266 uri_merge (const char *base, const char *link)
1268 return uri_merge_1 (base, link, strlen (link), !has_proto (link));
1271 /* Optimize URL by host, destructively replacing u->host with realhost
1272 (u->host). Do this regardless of opt.simple_check. */
1274 opt_url (struct urlinfo *u)
1276 /* Find the "true" host. */
1277 char *host = realhost (u->host);
1280 assert (u->dir != NULL); /* the URL must have been parsed */
1281 /* Refresh the printed representation. */
1283 u->url = str_url (u, 0);
1286 /* Returns proxy host address, in accordance with PROTO. */
1288 getproxy (uerr_t proto)
1292 if (proto == URLHTTP)
1293 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1294 else if (proto == URLFTP)
1295 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1297 else if (proto == URLHTTPS)
1298 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1299 #endif /* HAVE_SSL */
1302 if (!proxy || !*proxy)
1307 /* Should a host be accessed through proxy, concerning no_proxy? */
1309 no_proxy_match (const char *host, const char **no_proxy)
1314 return !sufmatch (no_proxy, host);
1317 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1318 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1320 /* Change the links in an HTML document. Accepts a structure that
1321 defines the positions of all the links. */
1323 convert_links (const char *file, urlpos *l)
1325 struct file_memory *fm;
1328 downloaded_file_t downloaded_file_return;
1330 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1333 /* First we do a "dry run": go through the list L and see whether
1334 any URL needs to be converted in the first place. If not, just
1335 leave the file alone. */
1338 for (dry = l; dry; dry = dry->next)
1339 if (dry->convert != CO_NOCONVERT)
1343 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1348 fm = read_file (file);
1351 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1352 file, strerror (errno));
1356 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1357 if (opt.backup_converted && downloaded_file_return)
1358 write_backup_file (file, downloaded_file_return);
1360 /* Before opening the file for writing, unlink the file. This is
1361 important if the data in FM is mmaped. In such case, nulling the
1362 file, which is what fopen() below does, would make us read all
1363 zeroes from the mmaped region. */
1364 if (unlink (file) < 0 && errno != ENOENT)
1366 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1367 file, strerror (errno));
1368 read_file_free (fm);
1371 /* Now open the file for writing. */
1372 fp = fopen (file, "wb");
1375 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1376 file, strerror (errno));
1377 read_file_free (fm);
1380 /* Here we loop through all the URLs in file, replacing those of
1381 them that are downloaded with relative references. */
1383 for (; l; l = l->next)
1385 char *url_start = fm->content + l->pos;
1387 if (l->pos >= fm->length)
1389 DEBUGP (("Something strange is going on. Please investigate."));
1392 /* If the URL is not to be converted, skip it. */
1393 if (l->convert == CO_NOCONVERT)
1395 DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1399 /* Echo the file contents, up to the offending URL's opening
1400 quote, to the outfile. */
1401 fwrite (p, 1, url_start - p, fp);
1403 if (l->convert == CO_CONVERT_TO_RELATIVE)
1405 /* Convert absolute URL to relative. */
1406 char *newname = construct_relative (file, l->local_name);
1407 char *quoted_newname = html_quote_string (newname);
1408 replace_attr (&p, l->size, fp, quoted_newname);
1409 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1410 l->url, newname, l->pos, file));
1412 xfree (quoted_newname);
1414 else if (l->convert == CO_CONVERT_TO_COMPLETE)
1416 /* Convert the link to absolute URL. */
1417 char *newlink = l->url;
1418 char *quoted_newlink = html_quote_string (newlink);
1419 replace_attr (&p, l->size, fp, quoted_newlink);
1420 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1421 newlink, l->pos, file));
1422 xfree (quoted_newlink);
1425 /* Output the rest of the file. */
1426 if (p - fm->content < fm->length)
1427 fwrite (p, 1, fm->length - (p - fm->content), fp);
1429 read_file_free (fm);
1430 logputs (LOG_VERBOSE, _("done.\n"));
1433 /* Construct and return a malloced copy of the relative link from two
1434 pieces of information: local name S1 of the referring file and
1435 local name S2 of the referred file.
1437 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1438 "jagor.srce.hr/images/news.gif", the function will return
1441 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1442 "fly.cc.fer.hr/images/fly.gif", the function will return
1443 "../images/fly.gif".
1445 Caveats: S1 should not begin with `/', unless S2 also begins with
1446 '/'. S1 should not contain things like ".." and such --
1447 construct_relative ("fly/ioccc/../index.html",
1448 "fly/images/fly.gif") will fail. (A workaround is to call
1449 something like path_simplify() on S1). */
1451 construct_relative (const char *s1, const char *s2)
/* i: scan index over both strings.
   cnt: presumably the offset just past the last '/' common to S1 and S2
        (the non-shared tail of S2 starts at s2 + cnt) -- the lines that
        initialize i and cnt are elided in this listing; confirm in full
        source.
   sepdirs1: number of '/' separators left in S1 after the common prefix,
        i.e. how many "../" components the result needs. */
1453 int i, cnt, sepdirs1;
/* If S2 is absolute, no relative form is possible; return a copy of S2
   verbatim.  (The `if' line itself is elided here.) */
1457 return xstrdup (s2);
1458 /* S1 should *not* be absolute, if S2 wasn't. */
1459 assert (*s1 != '/');
1461 /* Skip the directories common to both strings. */
1464 while (s1[i] && s2[i]
/* Remember the position just after each matching '/' so cnt ends up at
   the start of the first non-shared path component.  (Loop body elided
   in this listing.) */
1469 if (s1[i] == '/' && s2[i] == '/')
/* Count the directory separators remaining in S1 -- each one means one
   level to climb with "../". */
1474 for (sepdirs1 = 0; s1[i]; i++)
1477 /* Now, construct the file as of:
1478 - ../ repeated sepdirs1 time
1479 - all the non-mutual directories of S2. */
/* Allocation: sepdirs1 three-byte "../" units, plus the tail of S2,
   plus the terminating NUL. */
1480 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1481 for (i = 0; i < sepdirs1; i++)
1482 memcpy (res + 3 * i, "../", 3);
/* strcpy appends the NUL terminator that the memcpy calls above did not. */
1483 strcpy (res + 3 * i, s2 + cnt);
1487 /* Add URL to the head of the list L. */
/* Allocates a new urlpos node holding copies of URL and FILE (the local
   file name), zeroes every other field, and prepends it to list L.
   xmalloc/xstrdup abort on out-of-memory, so no NULL checks are needed.
   NOTE(review): the lines linking the node to L and returning it
   (presumably `t->next = l; return t;') are elided in this listing --
   confirm in the full source. */
1489 add_url (urlpos *l, const char *url, const char *file)
1493 t = (urlpos *)xmalloc (sizeof (urlpos));
1494 memset (t, 0, sizeof (*t));
1495 t->url = xstrdup (url);
1496 t->local_name = xstrdup (file);
/* Back up FILE as FILE.orig (or, when -E added an ".html" extension,
   as the original name with "html" replaced by "orig") before link
   conversion overwrites it.  Keeps a static list of files already
   backed up so a second conversion pass does not clobber the true
   original with a first-pass conversion.  Errors from rename() are
   logged but not fatal. */
1502 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1504 /* Rather than just writing over the original .html file with the
1505 converted version, save the former to *.orig. Note we only do
1506 this for files we've _successfully_ downloaded, so we don't
1507 clobber .orig files sitting around from previous invocations. */
1509 /* Construct the backup filename as the original name plus ".orig". */
1510 size_t filename_len = strlen(file);
1511 char* filename_plus_orig_suffix;
1512 boolean already_wrote_backup_file = FALSE;
1513 slist* converted_file_ptr;
/* Persists across calls: every file we have already renamed to .orig. */
1514 static slist* converted_files = NULL;
1516 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1518 /* Just write "orig" over "html". We need to do it this way
1519 because when we're checking to see if we've downloaded the
1520 file before (to see if we can skip downloading it), we don't
1521 know if it's a text/html file. Therefore we don't know yet
1522 at that stage that -E is going to cause us to tack on
1523 ".html", so we need to compare vs. the original URL plus
1524 ".orig", not the original URL plus ".html.orig". */
/* alloca: stack allocation, released automatically on return.
   The `- 4' below overwrites the trailing "html"; this branch assumes
   FILE ends in ".html" (filename_len >= 4) per the -E behavior
   described above. */
1525 filename_plus_orig_suffix = alloca (filename_len + 1);
1526 strcpy(filename_plus_orig_suffix, file);
1527 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1529 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1531 /* Append ".orig" to the name. */
/* sizeof(".orig") == 6 includes room for the NUL terminator. */
1532 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1533 strcpy(filename_plus_orig_suffix, file);
1534 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1537 /* We can get called twice on the same URL thanks to the
1538 convert_all_links() call in main(). If we write the .orig file
1539 each time in such a case, it'll end up containing the first-pass
1540 conversion, not the original file. So, see if we've already been
1541 called on this file. */
1542 converted_file_ptr = converted_files;
/* Linear scan of the already-backed-up list.  NOTE(review): the braces
   and the presumable `break' after the match are elided in this
   listing -- confirm the loop exits early in the full source. */
1543 while (converted_file_ptr != NULL)
1544 if (strcmp(converted_file_ptr->string, file) == 0)
1546 already_wrote_backup_file = TRUE;
1550 converted_file_ptr = converted_file_ptr->next;
1552 if (!already_wrote_backup_file)
1554 /* Rename <file> to <file>.orig before former gets written over. */
1555 if (rename(file, filename_plus_orig_suffix) != 0)
1556 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1557 file, filename_plus_orig_suffix, strerror (errno));
1559 /* Remember that we've already written a .orig backup for this file.
1560 Note that we never free this memory since we need it till the
1561 convert_all_links() call, which is one of the last things the
1562 program does before terminating. BTW, I'm not sure if it would be
1563 safe to just set 'converted_file_ptr->string' to 'file' below,
1564 rather than making a copy of the string... Another note is that I
1565 thought I could just add a field to the urlpos structure saying
1566 that we'd written a .orig file for this URL, but that didn't work,
1567 so I had to make this separate list.
1568 -- Dan Harkless <wget@harkless.org>
1570 This [adding a field to the urlpos structure] didn't work
1571 because convert_file() is called twice: once after all its
1572 sublinks have been retrieved in recursive_retrieve(), and
1573 once at the end of the day in convert_all_links(). The
1574 original linked list collected in recursive_retrieve() is
1575 lost after the first invocation of convert_links(), and
1576 convert_all_links() makes a new one (it calls get_urls_html()
1577 for each file it covers.) That's why your first approach didn't
1578 work. The way to make it work is perhaps to make this flag a
1579 field in the `urls_html' list.
1580 -- Hrvoje Niksic <hniksic@arsdigita.com>
/* Prepend this file to the persistent list; the copy is intentionally
   never freed (see the note above). */
1582 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1583 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1584 converted_file_ptr->next = converted_files;
1585 converted_files = converted_file_ptr;
1589 static int find_fragment PARAMS ((const char *, int, const char **,
/* Replace the attribute value of RAW_SIZE bytes starting at *PP with
   NEW_STR, writing the result to FP.  Any quoting of the original value
   is normalized, and a trailing #fragment present in the original value
   is preserved after NEW_STR.  NOTE(review): the elided tail presumably
   advances *PP past the replaced region so the caller can continue
   echoing the file -- confirm in the full source. */
1593 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1595 const char *p = *pp;
1597 int size = raw_size;
/* Default quote used for the rewritten value; the elided branch below
   presumably adopts the original's quote character instead. */
1598 char quote_char = '\"';
1599 const char *frag_beg, *frag_end;
1601 /* Structure of our string is:
1602 "...old-contents..."
1603 <--- l->size ---> (with quotes)
1606 <--- l->size --> (no quotes) */
1608 if (*p == '\"' || *p == '\'')
1613 size -= 2; /* disregard opening and closing quote */
/* Emit the replacement: opening quote, new value, preserved fragment
   (if any), closing quote. */
1615 putc (quote_char, fp);
1616 fputs (new_str, fp);
1618 /* Look for fragment identifier, if any. */
1619 if (find_fragment (p, size, &frag_beg, &frag_end))
1620 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1624 putc (quote_char, fp);
1628 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1629 preceded by '&'. If the character is not found, return zero. If
1630 the character is found, return 1 and set BP and EP to point to the
1631 beginning and end of the region.
1633 This is used for finding the fragment identifiers in URLs.
   The '&' exclusion avoids mistaking '#' inside an SGML character
   reference (e.g. "&#32;") for a URL fragment. */
1636 find_fragment (const char *beg, int size, const char **bp, const char **ep)
1638 const char *end = beg + size;
/* Scan byte-by-byte; the loop body (elided in this listing) performs
   the '#'-not-after-'&' test described above. */
1640 for (; beg < end; beg++)
/* Node of the singly linked list that records every locally saved file
   and how it was downloaded (see downloaded_file() below).
   NOTE(review): the member holding the file name (used as rover->file
   below; presumably `char *file;' at the elided line 1663) is missing
   from this listing.  Also, the tag `_downloaded_file_list' begins with
   an underscore at file scope, which C reserves for the
   implementation -- worth renaming, though that would touch every use. */
1662 typedef struct _downloaded_file_list {
1664 downloaded_file_t download_type;
1665 struct _downloaded_file_list* next;
1666 } downloaded_file_list;
/* Head of the list; NULL until the first file is recorded. */
1668 static downloaded_file_list *downloaded_files;
1670 /* Remembers which files have been downloaded. In the standard case, should be
1671 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1672 download successfully (i.e. not for ones we have failures on or that we skip
1675 When we've downloaded a file and tacked on a ".html" extension due to -E,
1676 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1677 FILE_DOWNLOADED_NORMALLY.
1679 If you just want to check if a file has been previously added without adding
1680 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1681 with local filenames, not remote URLs.
   Returns the recorded download_type when FILE is already on the list;
   otherwise records it (unless mode == CHECK_FOR_FILE) and returns
   FILE_NOT_ALREADY_DOWNLOADED.  O(n) linear scan per call. */
1683 downloaded_file (downloaded_file_t mode, const char* file)
/* NOTE(review): found_file is never read in the visible lines; the
   elided loop body (lines 1690-1694) presumably sets it and breaks on a
   match -- confirm in the full source. */
1685 boolean found_file = FALSE;
1686 downloaded_file_list* rover = downloaded_files;
1688 while (rover != NULL)
1689 if (strcmp(rover->file, file) == 0)
1695 rover = rover->next;
/* Presumably guarded by an elided `if (found_file)' (or rover != NULL)
   check at lines 1696-1697; otherwise rover would be NULL here. */
1698 return rover->download_type; /* file had already been downloaded */
1701 if (mode != CHECK_FOR_FILE)
/* Record FILE: prepend a new node to the static list. */
1703 rover = xmalloc(sizeof(*rover));
1704 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1705 rover->download_type = mode;
1706 rover->next = downloaded_files;
1707 downloaded_files = rover;
1710 return FILE_NOT_ALREADY_DOWNLOADED;
1715 downloaded_files_free (void)
1717 downloaded_file_list* rover = downloaded_files;
1720 downloaded_file_list *next = rover->next;
1721 xfree (rover->file);