2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* True iff X points to the one-character string ".", i.e. a
   current-directory path component.  */
#define DOTP(x) ((x)[0] == '.' && (x)[1] == '\0')
/* True iff X points to the two-character string "..", i.e. a
   parent-directory path component.  */
#define DDOTP(x) ((x)[0] == '.' && (x)[1] == '.' && (x)[2] == '\0')
50 static int urlpath_length PARAMS ((const char *));
/* A NULL-terminated list of strings to be recognized as protocol
   types (URL schemes).  Note that recognized doesn't mean supported
   -- only HTTP, HTTPS and FTP are currently supported.

   However, a string that does not match anything in the list will be
   considered a relative URL.  Thus it's important that this list has
   anything anyone could think of being legal.

   #### This is probably broken.  Wget should use other means to
   distinguish between absolute and relative URIs in HTML links.

   Take a look at <http://www.w3.org/pub/WWW/Addressing/schemes.html>
   for the full scheme registry.  */
/* NOTE(review): the initializer of this array is not visible in this
   chunk.  */
static char *protostrings[] =
/* Similar to former, but for supported protocols: each entry maps a
   scheme prefix to its protocol identifier and default port.  Scanned
   with strncasecmp() by urlproto(), parseurl() and str_url().  */
static struct proto sup_protos[] =
{ "http://", URLHTTP, DEFAULT_HTTP_PORT },
{ "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
{ "ftp://", URLFTP, DEFAULT_FTP_PORT }
117 static void parse_dir PARAMS ((const char *, char **, char **));
118 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
119 static char *construct_relative PARAMS ((const char *, const char *));
120 static char process_ftp_type PARAMS ((char *));
/* Support for encoding and decoding of URL strings.  We determine
   whether a character is unsafe through static table lookup.  This
   code assumes ASCII character set and 8-bit chars.  */

/* Single-letter shorthands for the urlchr_table entries below.  */
#define R urlchr_reserved
#define U urlchr_unsafe

/* Nonzero iff character C has any of the bits in MASK set in
   urlchr_table.  The cast guards against negative plain-char
   indexing.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))

/* rfc1738 reserved chars.  We don't use this yet; preservation of
   reserved chars will be implemented when I integrate the new
   `reencode_string' function.  */
#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)

/* Unsafe chars:
   - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
   - '@' and ':'; needed for encoding URL username and password.
   - anything >= 127. */
#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
/* Per-character classification table: R = reserved (rfc1738),
   U = unsafe, RU = both (only '@').  Indexed by the character code;
   queried through urlchr_test()/UNSAFE_CHAR()/RESERVED_CHAR().
   NOTE(review): `const static' is legal but `static const' is the
   conventional order; the opening brace of the initializer is not
   visible in this chunk.  */
const static unsigned char urlchr_table[256] =
U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
U, 0, U, U, 0, U, R, 0, /* SP ! " # $ % & ' */
0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
0, 0, U, R, U, R, U, R, /* 8 9 : ; < = > ? */
RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
/* Everything >= 128 is unsafe.  */
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
/* Decodes the forms %xy in a URL to the character the hexadecimal
   code of which is xy.  xy are hexadecimal digits from
   [0123456789ABCDEF] (case-insensitive).  If x or y are not
   hex-digits or `%' precedes `\0', the sequence is inserted
   literally.  Decoding is done in place, through a write cursor T
   that trails the read cursor H.  */
decode_string (char *s)
char *t = s; /* t - tortoise */
char *h = s; /* h - hare */
/* Do nothing if '%' is not followed by two hex digits. */
if (!*(h + 1) || !*(h + 2)
|| !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
/* Combine the two hex digits into a single byte at the write cursor. */
*t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
/* Like encode_string, but return S if there are no unsafe chars.
   Two passes: first count how much longer the encoded string will be
   (each unsafe char becomes "%XX", two extra chars), then allocate
   and fill.  */
encode_string_maybe (const char *s)
for (p1 = s; *p1; p1++)
if (UNSAFE_CHAR (*p1))
addition += 2; /* Two more characters (hex digits) */
newlen = (p1 - s) + addition;
newstr = (char *)xmalloc (newlen + 1);
/* Second pass: copy, %XX-escaping the unsafe characters. */
if (UNSAFE_CHAR (*p1))
const unsigned char c = *p1++;
*p2++ = XDIGIT_TO_XCHAR (c >> 4);
*p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* Sanity check: we must have written exactly the predicted length. */
assert (p2 - newstr == newlen);
/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX encoded string.  Unlike
   encode_string_maybe, the caller always owns the result.  */
encode_string (const char *s)
char *encoded = encode_string_maybe (s);
/* Encode unsafe characters in PTR to %xx.  If such encoding is done,
   the old value of PTR is freed and PTR is made to point to the newly
   allocated storage.  */
#define ENCODE(ptr) do { \
char *e_new = encode_string_maybe (ptr); \
/* Returns the protocol type if URL's protocol is supported, or
   URLUNKNOWN if not.  First tries an exact (case-insensitive) match
   against the sup_protos prefixes; failing that, inspects the text
   after the first `:' to distinguish a port number.  */
urlproto (const char *url)
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
return sup_protos[i].ind;
/* Skip the leading scheme-or-host token, up to `:' or `/'. */
for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
/* Scan what follows the colon; all-digits suggests host:port. */
for (++i; url[i] && url[i] != '/'; i++)
if (!ISDIGIT (url[i]))
if (url[i - 1] == ':')
/* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
   part is found, returns 0.  Otherwise returns the number of leading
   characters to skip.  */
skip_proto (const char *url)
for (s = protostrings; *s; s++)
if (!strncasecmp (*s, url, strlen (*s)))
/* HTTP and FTP protocols are expected to yield exact host names
   (i.e. the `//' part must be skipped, too). */
if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
/* Returns 1 if the URL begins with a protocol (supported or
   unsupported), 0 otherwise.  Matches case-insensitively against
   every entry of protostrings.  */
has_proto (const char *url)
for (s = protostrings; *s; s++)
if (strncasecmp (url, *s, strlen (*s)) == 0)
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the protocol.

   If no username and password are found, return 0.  */
skip_uname (const char *url)
const char *q = NULL;
/* Remember the last `@' seen before the first `/'. */
for (p = url ; *p && *p != '/'; p++)
if (*p == '@') q = p;
/* If a `@' was found before the first occurrence of `/', skip
   past it.  */
/* Allocate a new urlinfo structure, fill it with default values and
   return a pointer to it.  All fields are zeroed except proto, which
   starts out as URLUNKNOWN.  */
u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
memset (u, 0, sizeof (*u));
u->proto = URLUNKNOWN;
/* Perform a "deep" free of the urlinfo structure.  The structure
   should have been created with newurl, but need not have been used.
   If COMPLETE is non-0, free the pointer itself.  */
freeurl (struct urlinfo *u, int complete)
FREE_MAYBE (u->host);
FREE_MAYBE (u->path);
FREE_MAYBE (u->file);
FREE_MAYBE (u->user);
FREE_MAYBE (u->passwd);
FREE_MAYBE (u->local);
FREE_MAYBE (u->referer);
/* Recursively free the chained proxy urlinfo, pointer included. */
freeurl (u->proxy, 1);
/* Extract the given URL of the form
   (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
   1. hostname (terminated with `/' or `:')
   2. port number (terminated with `/'), or chosen for the protocol
   3. dirname (everything after hostname)
   Most errors are handled.  No allocation is done, you must supply
   pointers to allocated memory.
   ...and a host of other stuff :-)

   - Recognizes hostname:dir/file for FTP and
     hostname (:portnum)?/dir/file for HTTP.
   - Parses the path to yield directory and file
   - Parses the URL to yield the username and passwd (if present)
   - Decodes the strings, in case they contain "forbidden" characters
   - Writes the result to struct urlinfo

   If the argument STRICT is set, it recognizes only the canonical
   form.  */
parseurl (const char *url, struct urlinfo *u, int strict)
int recognizable; /* Recognizable URL is the one where
                     the protocol name was explicitly
                     named, i.e. it wasn't deduced from
                     the URL format.  */
DEBUGP (("parseurl (\"%s\") -> ", url));
recognizable = has_proto (url);
if (strict && !recognizable)
/* Find which supported scheme, if any, the URL starts with. */
for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
l = strlen (sup_protos[i].name);
if (!strncasecmp (sup_protos[i].name, url, l))
/* If protocol is recognizable, but unsupported, bail out, else
   assume it is unknown for now.  */
if (recognizable && i == ARRAY_SIZE (sup_protos))
else if (i == ARRAY_SIZE (sup_protos))
u->proto = type = sup_protos[i].ind;
if (type == URLUNKNOWN)
/* Allow a username and password to be specified (i.e. just skip
   them for now).  */
l += skip_uname (url + l);
for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
/* Get the hostname. */
u->host = strdupdelim (url + l, url + i);
DEBUGP (("host %s -> ", u->host));
/* Assume no port has been given. */
/* We have a colon delimiting the hostname.  It could mean that
   a port number is following it, or a directory.  */
if (ISDIGIT (url[++i])) /* A port number */
if (type == URLUNKNOWN)
u->proto = type = URLHTTP;
/* Accumulate the decimal port number digit by digit. */
for (; url[i] && url[i] != '/'; i++)
if (ISDIGIT (url[i]))
u->port = 10 * u->port + (url[i] - '0');
DEBUGP (("port %hu -> ", u->port));
else if (type == URLUNKNOWN) /* or a directory */
u->proto = type = URLFTP;
else /* or just a misformed port number */
else if (type == URLUNKNOWN)
u->proto = type = URLHTTP;
/* No explicit port: fall back to the scheme's default port. */
for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
if (sup_protos[ind].ind == type)
if (ind == ARRAY_SIZE (sup_protos))
u->port = sup_protos[ind].port;
/* Some delimiter troubles... */
if (url[i] == '/' && url[i - 1] != ':')
while (url[i] && url[i] == '/')
/* The +8 slack leaves room for the "%2F"/"/" rewrite performed
   below when the path is re-created.  */
u->path = (char *)xmalloc (strlen (url + i) + 8);
strcpy (u->path, url + i);
u->ftp_type = process_ftp_type (u->path);
/* #### We don't handle type `d' correctly yet. */
if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
DEBUGP (("ftp_type %c -> ", u->ftp_type));
DEBUGP (("opath %s -> ", u->path));
/* Parse the username and password (if existing). */
parse_uname (url, &u->user, &u->passwd);
/* Decode the strings, as per RFC 1738. */
decode_string (u->host);
decode_string (u->path);
decode_string (u->user);
decode_string (u->passwd);
/* Parse the directory. */
parse_dir (u->path, &u->dir, &u->file);
DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
/* Simplify the directory. */
path_simplify (u->dir);
/* Remove the leading `/' in HTTP. */
if (type == URLHTTP && *u->dir == '/')
/* NOTE(review): strcpy with overlapping source and destination is
   undefined behavior; memmove would be the safe choice here. */
strcpy (u->dir, u->dir + 1);
DEBUGP (("ndir %s\n", u->dir));
/* Strip trailing `/'. */
if (l > 1 && u->dir[l - 1] == '/')
u->dir[l - 1] = '\0';
/* Re-create the path: */
abs_ftp = (u->proto == URLFTP && *u->dir == '/');
/* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
   abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
strcpy (u->path, abs_ftp ? "%2F" : "/");
strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
strcat (u->path, *u->dir ? "/" : "");
strcat (u->path, u->file);
DEBUGP (("newpath: %s\n", u->path));
/* Create the clean URL. */
u->url = str_url (u, 0);
/* Special versions of DOTP and DDOTP for parse_dir().  They work like
   DOTP and DDOTP, but they also recognize `?' as end-of-string
   delimiter.  This is needed for correct handling of query
   strings.  */
#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
/* Fixed: the second conjunct must test *((x) + 1), not *(x) again.
   The previous version compared the first character twice, so any
   string starting with '.' whose third character was '\0' or '?'
   (e.g. ".a") was misclassified as "..".  */
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
                     && (!*((x) + 2) || *((x) + 2) == '?'))
/* Build the directory and filename components of the path.  Both
   components are *separately* malloc-ed strings!  It does not change
   the contents of path.

   If the path ends with "." or "..", they are (correctly) counted as
   directories.  */
parse_dir (const char *path, char **dir, char **file)
l = urlpath_length (path);
/* Find the last '/' within the path proper (query string excluded). */
for (i = l; i && path[i] != '/'; i--);
if (!i && *path != '/') /* Just filename */
if (PD_DOTP (path) || PD_DDOTP (path))
*dir = strdupdelim (path, path + l);
*file = xstrdup (path + l); /* normally empty, but could
                               contain a query string */
*dir = xstrdup (""); /* This is required because of FTP */
*file = xstrdup (path);
else if (!i) /* /filename */
if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
*dir = strdupdelim (path, path + l);
*file = xstrdup (path + l); /* normally empty, but could
                               contain a query string */
*dir = xstrdup ("/");
*file = xstrdup (path + 1);
else /* Nonempty directory with or without a filename */
if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
*dir = strdupdelim (path, path + l);
*file = xstrdup (path + l); /* normally empty, but could
                               contain a query string */
*dir = strdupdelim (path, path + i);
*file = xstrdup (path + i + 1);
/* Find the optional username and password within the URL, as per
   RFC1738.  The returned user and passwd char pointers are
   malloc-ed.  */
parse_uname (const char *url, char **user, char **passwd)
const char *p, *q, *col;
/* Look for the end of the protocol string. */
l = skip_proto (url);
/* Add protocol offset. */
/* Is there an `@' character? */
for (p = url; *p && *p != '/'; p++)
/* If not, return. */
/* Else find the username and password. */
for (p = q = col = url; *p && *p != '/'; p++)
/* The first `:' before the `@' separates user from password. */
if (*p == ':' && !*user)
*user = (char *)xmalloc (p - url + 1);
memcpy (*user, url, p - url);
(*user)[p - url] = '\0';
if (*p == '@') q = p;
/* Decide whether you have only the username or both. */
where = *user ? passwd : user;
*where = (char *)xmalloc (q - col + 1);
memcpy (*where, col, q - col);
(*where)[q - col] = '\0';
/* If PATH ends with `;type=X', return the character X.  The last 7
   characters must be ";type=X": ";type=" is compared at len - 7 and
   X sits at len - 1.  The suffix is then chopped off PATH.  */
process_ftp_type (char *path)
int len = strlen (path);
&& !memcmp (path + len - 7, ";type=", 6))
/* Truncate the ";type=X" suffix off the path. */
path[len - 7] = '\0';
return path[len - 1];
/* Return the URL as fine-formed string, with a proper protocol, optional port
   number, directory and optional user/password.  If `hide' is non-zero (as it
   is when we're calling this on a URL we plan to print, but not when calling it
   to canonicalize a URL for use within the program), password will be hidden.
   The forbidden characters in the URL will be cleansed.  */
str_url (const struct urlinfo *u, int hide)
char *res, *host, *user, *passwd, *proto_name, *dir, *file;
int i, l, ln, lu, lh, lp, lf, ld;
unsigned short proto_default_port;
/* Look for the protocol name. */
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
if (sup_protos[i].ind == u->proto)
if (i == ARRAY_SIZE (sup_protos))
proto_name = sup_protos[i].name;
proto_default_port = sup_protos[i].port;
/* %XX-encode each component separately. */
host = encode_string (u->host);
dir = encode_string (u->dir);
file = encode_string (u->file);
user = passwd = NULL;
user = encode_string (u->user);
/* Don't output the password, or someone might see it over the user's
   shoulder (or in saved wget output).  Don't give away the number of
   characters in the password, either, as we did in past versions of
   this code, when we replaced the password characters with 'x's. */
passwd = xstrdup("<password>");
passwd = encode_string (u->passwd);
/* Absolute FTP directories get a leading "%2F" instead of "/". */
if (u->proto == URLFTP && *dir == '/')
char *tmp = (char *)xmalloc (strlen (dir) + 3);
/*sprintf (tmp, "%%2F%s", dir + 1);*/
strcpy (tmp + 3, dir + 1);
ln = strlen (proto_name);
lu = user ? strlen (user) : 0;
lp = passwd ? strlen (passwd) : 0;
/* The +20 slack covers the fixed separators (":", "@", "/", port
   digits); assumes ports fit in that budget -- as sized by the
   original author.  */
res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
/* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
   (user ? user : ""), (passwd ? ":" : ""),
   (passwd ? passwd : ""), (user ? "@" : ""),
   host, u->port, dir, *dir ? "/" : "", file); */
memcpy (res, proto_name, ln);
memcpy (res + l, user, lu);
memcpy (res + l, passwd, lp);
memcpy (res + l, host, lh);
/* Emit the port only when it differs from the scheme default. */
if (u->port != proto_default_port)
long_to_string (res + l, (long)u->port);
l += numdigit (u->port);
memcpy (res + l, dir, ld);
strcpy (res + l, file);
/* Check whether two URL-s are equivalent, i.e. pointing to the same
   location.  Uses parseurl to parse them, and compares the canonical
   forms.

   Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
   return 0 on error.  */
/* Do not compile unused code. */
url_equal (const char *url1, const char *url2)
struct urlinfo *u1, *u2;
err = parseurl (url1, u1, 0);
err = parseurl (url2, u2, 0);
/* Compare the canonicalized string forms. */
res = !strcmp (u1->url, u2->url);
/* Read FILE into memory and build a urlpos entry for each
   non-empty line, trimming leading and trailing whitespace.  */
get_urls_file (const char *file)
struct file_memory *fm;
const char *text, *text_end;
fm = read_file (file);
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
text_end = fm->content + fm->length;
/* Walk the buffer line by line. */
while (text < text_end)
const char *line_beg = text;
const char *line_end = memchr (text, '\n', text_end - text);
/* Strip leading whitespace. */
while (line_beg < line_end
&& ISSPACE (*line_beg))
/* Strip trailing whitespace. */
while (line_end > line_beg + 1
&& ISSPACE (*(line_end - 1)))
if (line_end > line_beg)
urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
memset (entry, 0, sizeof (*entry));
entry->url = strdupdelim (line_beg, line_end);
/* Free the linked list of urlpos, including each node's URL-related
   strings.  */
free_urlpos (urlpos *l)
urlpos *next = l->next;
FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times: FNAME.(n-1) -> FNAME.n for
   descending n, then FNAME itself becomes FNAME.1.  */
rotate_backups(const char *fname)
/* Room for fname + '.' + the widest backup index + NUL. */
int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
char *from = (char *)alloca (maxlen);
char *to = (char *)alloca (maxlen);
/* Only rotate regular files. */
if (stat (fname, &sb) == 0)
if (S_ISREG (sb.st_mode) == 0)
for (i = opt.backups; i > 1; i--)
sprintf (from, "%s.%d", fname, i - 1);
sprintf (to, "%s.%d", fname, i);
/* #### This will fail on machines without the rename() system
   call.  */
sprintf (to, "%s.%d", fname, 1);
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally.  */
mkalldirs (const char *path)
/* Scan backwards for the last '/' to split off the directory part. */
p = path + strlen (path);
for (; *p != '/' && p != path; p--);
/* Don't create if it's just a file. */
if ((p == path) && (*p != '/'))
t = strdupdelim (path, p);
/* Check whether the directory exists. */
if ((stat (t, &st) == 0))
if (S_ISDIR (st.st_mode))
/* If the dir exists as a file name, remove it first.  This
   is *only* for Wget to work with buggy old CERN http
   servers.  Here is the scenario: When Wget tries to
   retrieve a directory without a slash, e.g.
   http://foo/bar (bar being a directory), CERN server will
   not redirect it too http://foo/bar/ -- it will generate a
   directory listing containing links to bar/file1,
   bar/file2, etc.  Wget will lose because it saves this
   HTML listing to a file `bar', so it cannot create the
   directory.  To work around this, if the file of the same
   name exists, we just remove it and create the directory
   anew.  */
DEBUGP (("Removing %s because of directory danger!\n", t));
res = make_directory (t);
logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
949 count_slashes (const char *s)
/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories.  The result is a fresh
   malloc-ed string.  */
mkstruct (const struct urlinfo *u)
char *host, *dir, *file, *res, *dirpref;
assert (u->dir != NULL);
assert (u->host != NULL);
/* Honor --cut-dirs: skip the first `cut' path components. */
char *ptr = u->dir + (*u->dir == '/');
int slash_count = 1 + count_slashes (ptr);
int cut = MINVAL (opt.cut_dirs, slash_count);
for (; cut && *ptr; ptr++)
STRDUP_ALLOCA (dir, ptr);
dir = u->dir + (*u->dir == '/');
host = xstrdup (u->host);
/* Check for the true name (or at least a consistent name for saving
   to directory) of HOST, reusing the hlist if possible. */
if (opt.add_hostdir && !opt.simple_check)
char *nhost = realhost (host);
/* Add dir_prefix and hostname (if required) to the beginning of
   dir.  */
if (!DOTP (opt.dir_prefix))
dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
+ strlen (host) + 1);
sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
STRDUP_ALLOCA (dirpref, host);
else /* not add_hostdir */
if (!DOTP (opt.dir_prefix))
dirpref = opt.dir_prefix;
/* If there is a prefix, prepend it. */
char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
dir = encode_string (dir);
/* Drop a trailing '/' from the directory part. */
if (l && dir[l - 1] == '/')
file = "index.html";
/* Finally, construct the full name. */
res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
/* Return a malloced copy of S, but protect any '/' characters
   (each becomes "%2F").  Two passes: measure, then copy.  */
file_name_protect_query_string (const char *s)
for (from = s; *from; from++)
destlen += 2; /* each / gets replaced with %2F, so
                 it adds two more chars. */
dest = (char *)xmalloc (destlen + 1);
for (from = s, to = dest; *from; from++)
/* Sanity check: output length must match the first-pass count. */
assert (to - dest == destlen);
/* Create a unique filename, corresponding to a given URL.  Calls
   mkstruct if necessary.  Does *not* actually create any directories.  */
url_filename (const struct urlinfo *u)
int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
file = mkstruct (u);
file = xstrdup ("index.html");
/* If the URL came with a query string, u->file will contain
   a question mark followed by query string contents.  These
   contents can contain '/' which would make us create
   unwanted directories.  These slashes must be protected
   explicitly.  */
if (!strchr (u->file, '/'))
file = xstrdup (u->file);
/*assert (strchr (u->file, '?') != NULL);*/
file = file_name_protect_query_string (u->file);
/* Check whether the prefix directory is something other than "."
   before prepending it. */
if (!DOTP (opt.dir_prefix))
char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
+ 1 + strlen (file) + 1);
sprintf (nfile, "%s/%s", opt.dir_prefix, file);
/* DOS-ish file systems don't like `%' signs in them; we change it
   to `@'.  */
for (p = file; *p; p++)
#endif /* WINDOWS */
/* Check the cases in which the unique extensions are not used:
   1) Clobbering is turned off (-nc).
   2) Retrieval with regetting.
   3) Timestamping is used.
   4) Hierarchy is built.

   The exception is the case when file does exist and is a
   directory (actually support for bad httpd-s). */
if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
&& !(file_exists_p (file) && !file_non_directory_p (file)))
/* Find a unique name. */
name = unique_name (file);
/* Like strlen(), but allow the URL to be ended with '?': the length
   reported stops at the first '?' (start of the query string), or at
   the terminating NUL when there is none.  */
urlpath_length (const char *url)
const char *q = strchr (url, '?');
return strlen (url);
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is almost completely equivalent to
   { *e = '\0'; return strrchr(b); }, except that it doesn't change
   the contents of the string.  */
find_last_char (const char *b, const char *e, char c)
/* Resolve the result of "linking" a base URI (BASE) to a
   link-specified URI (LINK).

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to behave "reasonably" in all
   foreseeable cases.  It employs little specific knowledge about
   protocols or URL-specific stuff -- it just works on strings.

   The parameter LINKLENGTH is useful if LINK is not zero-terminated.
   See uri_merge for a gentler interface to this functionality.

   #### This function should handle `./' and `../' so that the evil
   path_simplify can go.  */
uri_merge_1 (const char *base, const char *link, int linklength, int no_proto)
/* END marks the end of BASE's path proper (query string excluded). */
const char *end = base + urlpath_length (base);
/* LINK is a relative URL: we need to replace everything
   after last slash (possibly empty) with LINK.

   So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
   our result should be "whatever/foo/qux/xyzzy".  */
int need_explicit_slash = 0;
const char *start_insert;
const char *last_slash = find_last_char (base, end, '/');
/* No slash found at all.  Append LINK to what we have,
   but we'll need a slash as a separator.

   Example: if base == "foo" and link == "qux/xyzzy", then
   we cannot just append link to base, because we'd get
   "fooqux/xyzzy", whereas what we want is
   "foo/qux/xyzzy".

   To make sure the / gets inserted, we set
   need_explicit_slash to 1.  We also set start_insert
   to end + 1, so that the length calculations work out
   correctly for one more (slash) character.  Accessing
   that character is fine, since it will be the
   delimiter, '\0' or '?'.  */
/* example: "foo?..." */
/*               ^    ('?' gets changed to '/') */
start_insert = end + 1;
need_explicit_slash = 1;
else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
/* example: http://host" */
start_insert = end + 1;
need_explicit_slash = 1;
/* example: "whatever/foo/bar" */
start_insert = last_slash + 1;
span = start_insert - base;
constr = (char *)xmalloc (span + linklength + 1);
memcpy (constr, base, span);
if (need_explicit_slash)
constr[span - 1] = '/';
memcpy (constr + span, link, linklength);
constr[span + linklength] = '\0';
else /* *link == `/' */
/* LINK is an absolute path: we need to replace everything
   after (and including) the FIRST slash with LINK.

   So, if BASE is "http://host/whatever/foo/bar", and LINK is
   "/qux/xyzzy", our result should be
   "http://host/qux/xyzzy".  */
const char *start_insert = NULL; /* for gcc to shut up. */
const char *pos = base;
int seen_slash_slash = 0;
/* We're looking for the first slash, but want to ignore
   the double slash of the scheme separator.  */
slash = memchr (pos, '/', end - pos);
if (slash && !seen_slash_slash)
if (*(slash + 1) == '/')
seen_slash_slash = 1;
/* At this point, SLASH is the location of the first / after
   "//", or the first slash altogether.  START_INSERT is the
   pointer to the location where LINK will be inserted.  When
   examining the last two examples, keep in mind that LINK
   begins with '/'.  */
if (!slash && !seen_slash_slash)
/* example: "foo" */
start_insert = base;
else if (!slash && seen_slash_slash)
/* example: "http://foo" */
else if (slash && !seen_slash_slash)
/* example: "foo/bar" */
start_insert = base;
else if (slash && seen_slash_slash)
/* example: "http://something/" */
start_insert = slash;
span = start_insert - base;
constr = (char *)xmalloc (span + linklength + 1);
memcpy (constr, base, span);
memcpy (constr + span, link, linklength);
constr[span + linklength] = '\0';
else /* !no_proto */
/* LINK is already a full URI: return a copy of it unchanged. */
constr = strdupdelim (link, link + linklength);
/* Merge BASE with LINK and return the resulting URI.  This is an
   interface to uri_merge_1 that assumes that LINK is a
   zero-terminated string.  */
uri_merge (const char *base, const char *link)
return uri_merge_1 (base, link, strlen (link), !has_proto (link));
/* Optimize URL by host, destructively replacing u->host with realhost
   (u->host).  Do this regardless of opt.simple_check.  */
opt_url (struct urlinfo *u)
/* Find the "true" host. */
char *host = realhost (u->host);
assert (u->dir != NULL); /* the URL must have been parsed */
/* Refresh the printed representation. */
u->url = str_url (u, 0);
/* Returns proxy host address, in accordance with PROTO.  Command-line
   options take precedence over the corresponding environment
   variables.  Returns NULL when no (non-empty) proxy is configured.  */
getproxy (uerr_t proto)
if (proto == URLHTTP)
proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
else if (proto == URLFTP)
proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
else if (proto == URLHTTPS)
proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
#endif /* HAVE_SSL */
/* An empty proxy setting counts as no proxy at all. */
if (!proxy || !*proxy)
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns nonzero when HOST does not match any no_proxy entry
   (presumably sufmatch does suffix matching -- confirm in utils.c).  */
no_proxy_match (const char *host, const char **no_proxy)
return !sufmatch (no_proxy, host);
1365 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1366 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
/* Change the links in an HTML document.  Accepts a structure that
   defines the positions of all the links.  */
convert_links (const char *file, urlpos *l)
struct file_memory *fm;
downloaded_file_t downloaded_file_return;
logprintf (LOG_VERBOSE, _("Converting %s... "), file);
/* First we do a "dry run": go through the list L and see whether
   any URL needs to be converted in the first place.  If not, just
   leave the file alone. */
for (dry = l; dry; dry = dry->next)
if (dry->convert != CO_NOCONVERT)
logputs (LOG_VERBOSE, _("nothing to do.\n"));
fm = read_file (file);
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
file, strerror (errno));
downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
if (opt.backup_converted && downloaded_file_return)
write_backup_file (file, downloaded_file_return);
/* Before opening the file for writing, unlink the file.  This is
   important if the data in FM is mmaped.  In such case, nulling the
   file, which is what fopen() below does, would make us read all
   zeroes from the mmaped region. */
if (unlink (file) < 0 && errno != ENOENT)
logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
file, strerror (errno));
read_file_free (fm);
/* Now open the file for writing. */
fp = fopen (file, "wb");
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
file, strerror (errno));
read_file_free (fm);
/* Here we loop through all the URLs in file, replacing those of
   them that are downloaded with relative references. */
for (; l; l = l->next)
char *url_start = fm->content + l->pos;
if (l->pos >= fm->length)
DEBUGP (("Something strange is going on.  Please investigate."));
/* If the URL is not to be converted, skip it. */
if (l->convert == CO_NOCONVERT)
DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
/* Echo the file contents, up to the offending URL's opening
   quote, to the outfile. */
fwrite (p, 1, url_start - p, fp);
if (l->convert == CO_CONVERT_TO_RELATIVE)
/* Convert absolute URL to relative. */
char *newname = construct_relative (file, l->local_name);
char *quoted_newname = html_quote_string (newname);
replace_attr (&p, l->size, fp, quoted_newname);
DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
l->url, newname, l->pos, file));
xfree (quoted_newname);
else if (l->convert == CO_CONVERT_TO_COMPLETE)
/* Convert the link to absolute URL. */
char *newlink = l->url;
char *quoted_newlink = html_quote_string (newlink);
replace_attr (&p, l->size, fp, quoted_newlink);
DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
newlink, l->pos, file));
xfree (quoted_newlink);
/* Output the rest of the file. */
if (p - fm->content < fm->length)
fwrite (p, 1, fm->length - (p - fm->content), fp);
read_file_free (fm);
/* NOTE(review): no fclose (fp) is visible in this chunk -- confirm
   the stream is closed in the full source.  */
logputs (LOG_VERBOSE, _("done.\n"));
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int i, cnt, sepdirs1;
  char *res;

  /* An absolute S2 needs no relativizing -- return a copy as-is.  */
  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  i = cnt = 0;
  /* Skip the directories common to both strings.  */
  while (1)
    {
      /* Advance over the longest common run of characters that does
	 not cross a directory separator.  */
      while (s1[i] && s2[i]
	     && (s1[i] == s2[i])
	     && (s1[i] != '/')
	     && (s2[i] != '/'))
	++i;
      if (s1[i] == '/' && s2[i] == '/')
	cnt = ++i;		/* both hit a separator: common dir done */
      else
	break;			/* strings diverge inside a component */
    }
  /* Count the directory separators left in S1; each one means one
     "../" is needed to climb out of the referring file's directory.  */
  for (sepdirs1 = 0; s1[i]; i++)
    if (s1[i] == '/')
      ++sepdirs1;
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
  return res;
}
1535 /* Add URL to the head of the list L. */
1537 add_url (urlpos *l, const char *url, const char *file)
1541 t = (urlpos *)xmalloc (sizeof (urlpos));
1542 memset (t, 0, sizeof (*t));
1543 t->url = xstrdup (url);
1544 t->local_name = xstrdup (file);
1550 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1552 /* Rather than just writing over the original .html file with the
1553 converted version, save the former to *.orig. Note we only do
1554 this for files we've _successfully_ downloaded, so we don't
1555 clobber .orig files sitting around from previous invocations. */
1557 /* Construct the backup filename as the original name plus ".orig". */
1558 size_t filename_len = strlen(file);
1559 char* filename_plus_orig_suffix;
1560 boolean already_wrote_backup_file = FALSE;
1561 slist* converted_file_ptr;
1562 static slist* converted_files = NULL;
1564 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1566 /* Just write "orig" over "html". We need to do it this way
1567 because when we're checking to see if we've downloaded the
1568 file before (to see if we can skip downloading it), we don't
1569 know if it's a text/html file. Therefore we don't know yet
1570 at that stage that -E is going to cause us to tack on
1571 ".html", so we need to compare vs. the original URL plus
1572 ".orig", not the original URL plus ".html.orig". */
1573 filename_plus_orig_suffix = alloca (filename_len + 1);
1574 strcpy(filename_plus_orig_suffix, file);
1575 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1577 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1579 /* Append ".orig" to the name. */
1580 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1581 strcpy(filename_plus_orig_suffix, file);
1582 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1585 /* We can get called twice on the same URL thanks to the
1586 convert_all_links() call in main(). If we write the .orig file
1587 each time in such a case, it'll end up containing the first-pass
1588 conversion, not the original file. So, see if we've already been
1589 called on this file. */
1590 converted_file_ptr = converted_files;
1591 while (converted_file_ptr != NULL)
1592 if (strcmp(converted_file_ptr->string, file) == 0)
1594 already_wrote_backup_file = TRUE;
1598 converted_file_ptr = converted_file_ptr->next;
1600 if (!already_wrote_backup_file)
1602 /* Rename <file> to <file>.orig before former gets written over. */
1603 if (rename(file, filename_plus_orig_suffix) != 0)
1604 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1605 file, filename_plus_orig_suffix, strerror (errno));
1607 /* Remember that we've already written a .orig backup for this file.
1608 Note that we never free this memory since we need it till the
1609 convert_all_links() call, which is one of the last things the
1610 program does before terminating. BTW, I'm not sure if it would be
1611 safe to just set 'converted_file_ptr->string' to 'file' below,
1612 rather than making a copy of the string... Another note is that I
1613 thought I could just add a field to the urlpos structure saying
1614 that we'd written a .orig file for this URL, but that didn't work,
1615 so I had to make this separate list.
1616 -- Dan Harkless <wget@harkless.org>
1618 This [adding a field to the urlpos structure] didn't work
1619 because convert_file() is called twice: once after all its
1620 sublinks have been retrieved in recursive_retrieve(), and
1621 once at the end of the day in convert_all_links(). The
1622 original linked list collected in recursive_retrieve() is
1623 lost after the first invocation of convert_links(), and
1624 convert_all_links() makes a new one (it calls get_urls_html()
1625 for each file it covers.) That's why your first approach didn't
1626 work. The way to make it work is perhaps to make this flag a
1627 field in the `urls_html' list.
1628 -- Hrvoje Niksic <hniksic@arsdigita.com>
1630 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1631 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1632 converted_file_ptr->next = converted_files;
1633 converted_files = converted_file_ptr;
1637 static int find_fragment PARAMS ((const char *, int, const char **,
/* Replace the attribute value starting at *PP (RAW_SIZE bytes long,
   quotes included if present) by writing NEW_STR to FP instead,
   preserving the original quoting style and any trailing fragment
   identifier (e.g. "#section").  On return *PP is advanced past the
   old value so the caller can resume copying from there.  */
static void
replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
{
  const char *p = *pp;
  int quote_flag = 0;		/* was the original value quoted? */
  int size = raw_size;
  char quote_char = '\"';	/* quote character to emit around NEW_STR */
  const char *frag_beg, *frag_end;

  /* Structure of our string is:
       "...old-contents..."
       <--- l->size --->  (with quotes)
     OR:
       ...old-contents...
       <--- l->size -->   (no quotes)  */

  if (*p == '\"' || *p == '\'')
    {
      quote_char = *p;		/* reuse the original quote character */
      quote_flag = 1;
      ++p;
      size -= 2;		/* disregard opening and closing quote */
    }
  putc (quote_char, fp);
  fputs (new_str, fp);

  /* Look for fragment identifier, if any. */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  p += size;
  if (quote_flag)
    ++p;			/* step over the closing quote too */
  putc (quote_char, fp);
  *pp = p;
}
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return zero.  If
   the character is found, return 1 and set BP and EP to point to the
   beginning and end of the region.

   This is used for finding the fragment identifiers in URLs.  */

static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  int saw_amp = 0;		/* was the previous character an '&'? */
  for (; beg < end; beg++)
    {
      switch (*beg)
	{
	case '&':
	  saw_amp = 1;
	  break;
	case '#':
	  if (!saw_amp)
	    {
	      *bp = beg;
	      *ep = end;
	      return 1;
	    }
	  /* '#' right after '&' is skipped -- presumably so that
	     character references such as "&#38;" are not mistaken
	     for fragments.  Fall through to reset SAW_AMP.  */
	default:
	  saw_amp = 0;
	}
    }
  return 0;
}
1710 typedef struct _downloaded_file_list {
1712 downloaded_file_t download_type;
1713 struct _downloaded_file_list* next;
1714 } downloaded_file_list;
1716 static downloaded_file_list *downloaded_files;
1718 /* Remembers which files have been downloaded. In the standard case, should be
1719 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1720 download successfully (i.e. not for ones we have failures on or that we skip
1723 When we've downloaded a file and tacked on a ".html" extension due to -E,
1724 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1725 FILE_DOWNLOADED_NORMALLY.
1727 If you just want to check if a file has been previously added without adding
1728 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1729 with local filenames, not remote URLs. */
1731 downloaded_file (downloaded_file_t mode, const char* file)
1733 boolean found_file = FALSE;
1734 downloaded_file_list* rover = downloaded_files;
1736 while (rover != NULL)
1737 if (strcmp(rover->file, file) == 0)
1743 rover = rover->next;
1746 return rover->download_type; /* file had already been downloaded */
1749 if (mode != CHECK_FOR_FILE)
1751 rover = xmalloc(sizeof(*rover));
1752 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1753 rover->download_type = mode;
1754 rover->next = downloaded_files;
1755 downloaded_files = rover;
1758 return FILE_NOT_ALREADY_DOWNLOADED;
1763 downloaded_files_free (void)
1765 downloaded_file_list* rover = downloaded_files;
1768 downloaded_file_list *next = rover->next;
1769 xfree (rover->file);