2 Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
/* Default port definitions.  These are used when a URL does not name
   a port explicitly (e.g. "http://host/path").  */
#define DEFAULT_HTTP_PORT 80
#define DEFAULT_FTP_PORT 21

/* Characters that terminate a URL when scanning free-form text
   (used by findurl).  */
#define URL_SEPARATOR "!\"#'(),>`{}|<>"
/* A list of unsafe characters for encoding, as per RFC1738.  '@' and
   ':' (not listed in RFC) were added because of user/password
   encoding.

   NOTE(review): this excerpt showed two alternative definitions of
   URL_UNSAFE_CHARS with the surrounding preprocessor conditional
   missing; the #ifndef WINDOWS guard below is reconstructed from the
   original line numbering -- confirm the polarity against the
   upstream source.  */
#ifndef WINDOWS
# define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
#else  /* WINDOWS */
# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
#endif /* WINDOWS */

/* A character is "unsafe" when it is a control or space character,
   lies outside printable ASCII, or appears in URL_UNSAFE_CHARS.  */
#define UNSAFE_CHAR(c) (   ((unsigned char)(c) <= ' ')  /* ASCII 32  */ \
                        || ((unsigned char)(c) > '~')   /* ASCII 127 */ \
                        || strchr (URL_UNSAFE_CHARS, c))
/* NOTE(review): fragment of the URL_CLEANSE macro -- the do/while
   braces and the tail of the body (original lines 71, 73, 75-79) are
   missing from this excerpt, and each line carries a stray listing
   number.  Preserved verbatim.  Presumably the complete macro frees S
   and replaces it with the encode_string() result when S contains
   unsafe characters -- confirm against the original source.  */
68 /* If S contains unsafe characters, free it and replace it with a
69    version that doesn't. */
70 #define URL_CLEANSE(s) do \
72   if (contains_unsafe (s)) \
74       char *uc_tmp = encode_string (s); \
/* Is a directory "."?  True when X is exactly the one-character
   string ".".  */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* Is a directory ".."?  True when X is exactly the two-character
   string "..".  */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* NOTE(review): fragments of the file-scope tables and forward
   declarations.  The protostrings[] initializer (original lines
   97-137), the braces of sup_protos[], and the close of the leading
   comment are missing from this excerpt, and every line carries a
   stray listing number.  Preserved verbatim.  The visible lines show
   sup_protos[] mapping scheme prefixes to protocol ids and default
   ports, and PARAMS-style prototypes for the static helpers.  */
85 /* NULL-terminated list of strings to be recognized as prototypes (URL
86 schemes). Note that recognized doesn't mean supported -- only HTTP
87 and FTP are currently supported.
89 However, a string that does not match anything in the list will be
90 considered a relative URL. Thus it's important that this list has
91 anything anyone could think of being legal.
93 There are wild things here. :-) Take a look at
94 <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
96 static char *protostrings[] =
138 /* Similar to former, but for supported protocols: */
139 static struct proto sup_protos[] =
141 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
142 { "ftp://", URLFTP, DEFAULT_FTP_PORT },
143 /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
146 static void parse_dir PARAMS ((const char *, char **, char **));
147 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
148 static char *construct PARAMS ((const char *, const char *, int , int));
149 static char *construct_relative PARAMS ((const char *, const char *));
150 static char process_ftp_type PARAMS ((char *));
/* NOTE(review): fragment of skip_url() -- the return type, braces and
   several statements are missing from this excerpt; preserved
   verbatim.  Visible lines show a case-insensitive test for a leading
   "URL" and a loop skipping whitespace from offset 4 (presumably
   past "URL:").  */
153 /* Returns the number of characters to be skipped if the first thing
154 in a URL is URL: (which is 0 or 4+). The optional spaces after
155 URL: are also skipped. */
157 skip_url (const char *url)
161 if (TOUPPER (url[0]) == 'U'
162 && TOUPPER (url[1]) == 'R'
163 && TOUPPER (url[2]) == 'L'
167 for (i = 4; url[i] && ISSPACE (url[i]); i++);
/* NOTE(review): fragment of contains_unsafe() -- return type, braces,
   the loop construct and return statements are missing; preserved
   verbatim.  Presumably iterates S testing each character with
   UNSAFE_CHAR.  */
174 /* Returns 1 if the string contains unsafe characters, 0 otherwise. */
176 contains_unsafe (const char *s)
179 if (UNSAFE_CHAR (*s))
/* NOTE(review): fragment of decode_string() -- interior lines
   (including the scan loop, the '%' test and the copy of non-escape
   characters) are missing; preserved verbatim.  The visible lines
   show in-place %xy -> byte decoding via ASC2HEXD, with malformed
   escapes left untouched per the comment.  */
184 /* Decodes the forms %xy in a URL to the character the hexadecimal
185 code of which is xy. xy are hexadecimal digits from
186 [0123456789ABCDEF] (case-insensitive). If x or y are not
187 hex-digits or `%' precedes `\0', the sequence is inserted
191 decode_string (char *s)
201 /* Do nothing if at the end of the string, or if the chars
202 are not hex-digits. */
203 if (!*(s + 1) || !*(s + 2)
204 || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
209 *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
/* NOTE(review): fragment of encode_string() -- braces, the '%'
   output, the plain-character copy path and the return are missing;
   preserved verbatim.  Visible logic: first pass sizes the result
   (each unsafe char costs two extra hex digits), then a second pass
   emits HEXD2ASC nibbles into the xmalloc'd buffer.  */
216 /* Encode the unsafe characters (as determined by URL_UNSAFE) in a
217 given string, returning a malloc-ed %XX encoded string. */
219 encode_string (const char *s)
226 for (i = 0; *s; s++, i++)
227 if (UNSAFE_CHAR (*s))
228 i += 2; /* Two more characters (hex digits) */
229 res = (char *)xmalloc (i + 1);
231 for (p = res; *s; s++)
232 if (UNSAFE_CHAR (*s))
234 const unsigned char c = *s;
236 *p++ = HEXD2ASC (c >> 4);
237 *p++ = HEXD2ASC (c & 0xf);
/* NOTE(review): fragment of urlproto() -- return type, braces and the
   final returns are missing; preserved verbatim.  Visible logic:
   match URL against sup_protos[] prefixes; otherwise scan past the
   host for a ":port" pattern (presumably to classify bare
   "host:..." forms) -- confirm the fallback returns against the
   original.  */
245 /* Returns the proto-type if URL's protocol is supported, or
246 URLUNKNOWN if not. */
248 urlproto (const char *url)
252 url += skip_url (url);
253 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
254 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
255 return sup_protos[i].ind;
256 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
259 for (++i; url[i] && url[i] != '/'; i++)
260 if (!ISDIGIT (url[i]))
262 if (url[i - 1] == ':')
/* NOTE(review): fragment of skip_proto() -- braces, the length
   bookkeeping and return statements are missing; preserved verbatim.
   Visible logic: find a matching protostrings[] prefix; for plain
   "http:"/"ftp:" the following "//" must also be skipped per the
   comment.  */
271 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
272 part is found, returns 0. */
274 skip_proto (const char *url)
279 for (s = protostrings; *s; s++)
280 if (!strncasecmp (*s, url, strlen (*s)))
285 /* HTTP and FTP protocols are expected to yield exact host names
286 (i.e. the `//' part must be skipped, too). */
287 if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
/* NOTE(review): fragment of has_proto() -- return type, braces and
   return statements are missing; preserved verbatim.  Visible logic:
   case-insensitive prefix match of URL against each entry of
   protostrings[].  */
292 /* Returns 1 if the URL begins with a protocol (supported or
293 unsupported), 0 otherwise. */
295 has_proto (const char *url)
299 url += skip_url (url);
300 for (s = protostrings; *s; s++)
301 if (strncasecmp (url, *s, strlen (*s)) == 0)
/* NOTE(review): fragment of skip_uname() -- the '@' test and both
   return paths are missing; preserved verbatim.  Visible logic: scan
   up to the first '/', then (per the trailing comment) skip a
   user[:password]@ prefix if an '@' was seen before it.  */
306 /* Skip the username and password, if present here. The function
307 should be called *not* with the complete URL, but with the part
308 right after the protocol.
310 If no username and password are found, return 0. */
312 skip_uname (const char *url)
315 for (p = url; *p && *p != '/'; p++)
318 /* If a `@' was found before the first occurrence of `/', skip
/* NOTE(review): fragment of newurl() -- the signature line, braces
   and return are missing; preserved verbatim.  Visible logic:
   xmalloc a struct urlinfo, zero it, default proto to URLUNKNOWN.  */
326 /* Allocate a new urlinfo structure, fill it with default values and
327 return a pointer to it. */
333 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
334 memset (u, 0, sizeof (*u));
335 u->proto = URLUNKNOWN;
/* NOTE(review): fragment of freeurl() -- braces, the guard around the
   proxy recursion, and the final free of U itself are missing;
   preserved verbatim.  Visible logic: release every owned string
   member with FREE_MAYBE, recurse into u->proxy.  Note the comment
   says "free_pointer" while the parameter is named "complete" --
   original inconsistency, kept as-is.  */
339 /* Perform a "deep" free of the urlinfo structure. The structure
340 should have been created with newurl, but need not have been used.
341 If free_pointer is non-0, free the pointer itself. */
343 freeurl (struct urlinfo *u, int complete)
347 FREE_MAYBE (u->host);
348 FREE_MAYBE (u->path);
349 FREE_MAYBE (u->file);
351 FREE_MAYBE (u->user);
352 FREE_MAYBE (u->passwd);
353 FREE_MAYBE (u->local);
354 FREE_MAYBE (u->referer);
356 freeurl (u->proxy, 1);
/* NOTE(review): large fragment of parseurl() -- many interior lines
   (declarations of i/l/type/abs_ftp, braces, error returns such as
   URLUNKNOWN/URLBADPORT, NULL guards before decode_string of
   user/passwd) are missing from this excerpt; preserved verbatim.
   Visible pipeline: skip "URL:", identify the scheme via
   sup_protos[], skip user:pass@, extract host, parse an optional
   :port (digits only) or infer FTP from "host:dir", default the port
   from sup_protos[], copy the path, pull the FTP ";type=X" suffix,
   decode %xx escapes, split dir/file, simplify the dir, then rebuild
   u->path (with the "%2F" absolute-FTP prefix) and the canonical
   u->url.  The `+ 8` slack in the path xmalloc presumably leaves room
   for the rebuild below -- confirm against the original.  */
362 /* Extract the given URL of the form
363 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
364 1. hostname (terminated with `/' or `:')
365 2. port number (terminated with `/'), or chosen for the protocol
366 3. dirname (everything after hostname)
367 Most errors are handled. No allocation is done, you must supply
368 pointers to allocated memory.
369 ...and a host of other stuff :-)
371 - Recognizes hostname:dir/file for FTP and
372 hostname (:portnum)?/dir/file for HTTP.
373 - Parses the path to yield directory and file
374 - Parses the URL to yield the username and passwd (if present)
375 - Decodes the strings, in case they contain "forbidden" characters
376 - Writes the result to struct urlinfo
378 If the argument STRICT is set, it recognizes only the canonical
381 parseurl (const char *url, struct urlinfo *u, int strict)
384 int recognizable; /* Recognizable URL is the one where
385 the protocol name was explicitly
386 named, i.e. it wasn't deduced from
390 DEBUGP (("parseurl (\"%s\") -> ", url));
391 url += skip_url (url);
392 recognizable = has_proto (url);
393 if (strict && !recognizable)
395 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
397 l = strlen (sup_protos[i].name);
398 if (!strncasecmp (sup_protos[i].name, url, l))
401 /* If protocol is recognizable, but unsupported, bail out, else
403 if (recognizable && !sup_protos[i].name)
405 else if (i == ARRAY_SIZE (sup_protos))
408 u->proto = type = sup_protos[i].ind;
410 if (type == URLUNKNOWN)
412 /* Allow a username and password to be specified (i.e. just skip
415 l += skip_uname (url + l);
416 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
419 /* Get the hostname. */
420 u->host = strdupdelim (url + l, url + i);
421 DEBUGP (("host %s -> ", u->host));
423 /* Assume no port has been given. */
427 /* We have a colon delimiting the hostname. It could mean that
428 a port number is following it, or a directory. */
429 if (ISDIGIT (url[++i])) /* A port number */
431 if (type == URLUNKNOWN)
432 u->proto = type = URLHTTP;
433 for (; url[i] && url[i] != '/'; i++)
434 if (ISDIGIT (url[i]))
435 u->port = 10 * u->port + (url[i] - '0');
440 DEBUGP (("port %hu -> ", u->port));
442 else if (type == URLUNKNOWN) /* or a directory */
443 u->proto = type = URLFTP;
444 else /* or just a misformed port number */
447 else if (type == URLUNKNOWN)
448 u->proto = type = URLHTTP;
452 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
453 if (sup_protos[i].ind == type)
455 if (i == ARRAY_SIZE (sup_protos))
457 u->port = sup_protos[i].port;
459 /* Some delimiter troubles... */
460 if (url[i] == '/' && url[i - 1] != ':')
463 while (url[i] && url[i] == '/')
465 u->path = (char *)xmalloc (strlen (url + i) + 8);
466 strcpy (u->path, url + i);
469 u->ftp_type = process_ftp_type (u->path);
470 /* #### We don't handle type `d' correctly yet. */
471 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
474 DEBUGP (("opath %s -> ", u->path));
475 /* Parse the username and password (if existing). */
476 parse_uname (url, &u->user, &u->passwd);
477 /* Decode the strings, as per RFC 1738. */
478 decode_string (u->host);
479 decode_string (u->path);
481 decode_string (u->user);
483 decode_string (u->passwd);
484 /* Parse the directory. */
485 parse_dir (u->path, &u->dir, &u->file);
486 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
487 /* Simplify the directory. */
488 path_simplify (u->dir);
489 /* Remove the leading `/' in HTTP. */
490 if (type == URLHTTP && *u->dir == '/')
491 strcpy (u->dir, u->dir + 1);
492 DEBUGP (("ndir %s\n", u->dir));
493 /* Strip trailing `/'. */
495 if (l && u->dir[l - 1] == '/')
496 u->dir[l - 1] = '\0';
497 /* Re-create the path: */
498 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
499 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
500 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
501 strcpy (u->path, abs_ftp ? "%2F" : "/");
502 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
503 strcat (u->path, *u->dir ? "/" : "");
504 strcat (u->path, u->file);
505 URL_CLEANSE (u->path);
506 /* Create the clean URL. */
507 u->url = str_url (u, 0);
/* NOTE(review): fragment of parse_dir() -- braces, the signature's
   return type and the else-branch braces are missing; preserved
   verbatim.  Visible logic: locate the last '/', then handle three
   cases -- bare filename (dir ""), "/filename" (dir "/"), and
   dir/file split -- with "." and ".." always classified as
   directories with an empty file component.  */
511 /* Build the directory and filename components of the path. Both
512 components are *separately* malloc-ed strings! It does not change
513 the contents of path.
515 If the path ends with "." or "..", they are (correctly) counted as
518 parse_dir (const char *path, char **dir, char **file)
522 for (i = l = strlen (path); i && path[i] != '/'; i--);
523 if (!i && *path != '/') /* Just filename */
525 if (DOTP (path) || DDOTP (path))
527 *dir = xstrdup (path);
528 *file = xstrdup ("");
532 *dir = xstrdup (""); /* This is required because of FTP */
533 *file = xstrdup (path);
536 else if (!i) /* /filename */
538 if (DOTP (path + 1) || DDOTP (path + 1))
540 *dir = xstrdup (path);
541 *file = xstrdup ("");
545 *dir = xstrdup ("/");
546 *file = xstrdup (path + 1);
549 else /* Nonempty directory with or without a filename */
551 if (DOTP (path + i + 1) || DDOTP (path + i + 1))
553 *dir = xstrdup (path);
554 *file = xstrdup ("");
558 *dir = strdupdelim (path, path + i);
559 *file = strdupdelim (path + i + 1, path + l + 1);
/* NOTE(review): fragment of parse_uname() -- initialization of
   *user/*passwd, the '@' detection return, the col bookkeeping after
   the ':' branch, and the final return are missing; preserved
   verbatim.  Visible logic: skip "URL:" and the scheme, look for '@'
   before the first '/', copy the text before ':' into *user, then
   copy the remainder before '@' into *passwd (or *user when no ':'
   was seen).  */
564 /* Find the optional username and password within the URL, as per
565 RFC1738. The returned user and passwd char pointers are
568 parse_uname (const char *url, char **user, char **passwd)
576 url += skip_url (url);
577 /* Look for end of protocol string. */
578 l = skip_proto (url);
581 /* Add protocol offset. */
583 /* Is there an `@' character? */
584 for (p = url; *p && *p != '/'; p++)
587 /* If not, return. */
590 /* Else find the username and password. */
591 for (p = col = url; *p != '@'; p++)
593 if (*p == ':' && !*user)
595 *user = (char *)xmalloc (p - url + 1);
596 memcpy (*user, url, p - url);
597 (*user)[p - url] = '\0';
601 /* Decide whether you have only the username or both. */
602 where = *user ? passwd : user;
603 *where = (char *)xmalloc (p - col + 1);
604 memcpy (*where, col, p - col);
605 (*where)[p - col] = '\0';
/* NOTE(review): fragment of process_ftp_type() -- the length guard
   (original line 615, presumably len > 7) and the no-suffix return
   are missing; preserved verbatim.  Visible logic: match ";type=" at
   len-7, truncate the path there, and return the type character at
   len-1.  Without the missing guard the len-7 index would underflow
   for short paths -- confirm the guard exists upstream.  */
609 /* If PATH ends with `;type=X', return the character X. */
611 process_ftp_type (char *path)
613 int len = strlen (path);
616 && !memcmp (path + len - 7, ";type=", 6))
618 path[len - 7] = '\0';
619 return path[len - 1];
/* NOTE(review): fragment of str_url() -- braces, the password-hiding
   'x' overwrite body, the lh/ld/lf length assignments, the "://",
   ':', '@', and '/' separator writes, and the return are missing;
   preserved verbatim.  Visible logic: find the protocol entry for
   u->proto, CLEANDUP each component, rewrite a leading '/' of an FTP
   dir as "%2F", then assemble proto://[user[:passwd]@]host[:port]/
   dir/file into one xmalloc'd buffer, emitting the port only when it
   differs from the protocol default.  */
625 /* Return the URL as fine-formed string, with a proper protocol,
626 optional port number, directory and optional user/password. If
627 HIDE is non-zero, password will be hidden. The forbidden
628 characters in the URL will be cleansed. */
630 str_url (const struct urlinfo *u, int hide)
632 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
633 int i, l, ln, lu, lh, lp, lf, ld;
634 unsigned short proto_default_port;
636 /* Look for the protocol name. */
637 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
638 if (sup_protos[i].ind == u->proto)
640 if (i == ARRAY_SIZE (sup_protos))
642 proto_name = sup_protos[i].name;
643 proto_default_port = sup_protos[i].port;
644 host = CLEANDUP (u->host);
645 dir = CLEANDUP (u->dir);
646 file = CLEANDUP (u->file);
647 user = passwd = NULL;
649 user = CLEANDUP (u->user);
653 passwd = CLEANDUP (u->passwd);
655 for (i = 0; passwd[i]; i++)
658 if (u->proto == URLFTP && *dir == '/')
660 char *tmp = (char *)xmalloc (strlen (dir) + 3);
661 /*sprintf (tmp, "%%2F%s", dir + 1);*/
665 strcpy (tmp + 3, dir + 1);
670 ln = strlen (proto_name);
671 lu = user ? strlen (user) : 0;
672 lp = passwd ? strlen (passwd) : 0;
676 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
677 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
678 (user ? user : ""), (passwd ? ":" : ""),
679 (passwd ? passwd : ""), (user ? "@" : ""),
680 host, u->port, dir, *dir ? "/" : "", file); */
682 memcpy (res, proto_name, ln);
686 memcpy (res + l, user, lu);
691 memcpy (res + l, passwd, lp);
696 memcpy (res + l, host, lh);
698 if (u->port != proto_default_port)
701 long_to_string (res + l, (long)u->port);
702 l += numdigit (u->port);
705 memcpy (res + l, dir, ld);
709 strcpy (res + l, file);
/* NOTE(review): fragment of url_equal() -- the newurl() allocations,
   the error-path freeurl() calls and the final return are missing;
   preserved verbatim.  Visible logic: parse both URLs non-strictly
   and compare their canonical u->url strings.  */
718 /* Check whether two URL-s are equivalent, i.e. pointing to the same
719 location. Uses parseurl to parse them, and compares the canonical
722 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
723 return 0 on error. */
725 url_equal (const char *url1, const char *url2)
727 struct urlinfo *u1, *u2;
732 err = parseurl (url1, u1, 0);
739 err = parseurl (url2, u2, 0);
745 res = !strcmp (u1->url, u2->url);
/* NOTE(review): fragment of findurl() -- the signature's return type,
   braces, and the return statements (match and no-match) are missing;
   preserved verbatim.  Visible logic: slide over the buffer looking
   for a protostrings[] prefix, then measure the URL's extent up to
   whitespace, a non-printable byte, or a URL_SEPARATOR character,
   reporting the length via *count.  */
751 /* Find URL of format scheme:hostname[:port]/dir in a buffer. The
752 buffer may contain pretty much anything; no errors are signaled. */
754 findurl (const char *buf, int howmuch, int *count)
759 for (s1 = buf; howmuch; s1++, howmuch--)
760 for (prot = protostrings; *prot; prot++)
761 if (howmuch <= strlen (*prot))
763 else if (!strncasecmp (*prot, s1, strlen (*prot)))
765 for (s2 = s1, *count = 0;
766 howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
767 !strchr (URL_SEPARATOR, *s2);
768 s2++, (*count)++, howmuch--);
/* NOTE(review): fragment of get_urls_file() -- declarations, the
   stdin fallback when FILE is "-", fclose, the list-linking lines and
   the return are missing; preserved verbatim.  Visible logic: open
   the file (binary), load it into a buffer, repeatedly call findurl()
   and append each found URL as a fresh urlpos node, then free the
   buffer.  */
774 /* Scans the file for signs of URL-s. Returns a vector of pointers,
775 each pointer representing a URL string. The file is *not* assumed
778 get_urls_file (const char *file)
785 urlpos *first, *current, *old;
787 if (file && !HYPHENP (file))
789 fp = fopen (file, "rb");
792 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
799 load_file (fp, &buf, &nread);
800 if (file && !HYPHENP (file))
802 DEBUGP (("Loaded %s (size %ld).\n", file, nread));
803 first = current = NULL;
804 /* Fill the linked list with URLs. */
805 for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
808 /* Allocate the space. */
810 current = (urlpos *)xmalloc (sizeof (urlpos));
813 memset (current, 0, sizeof (*current));
814 current->next = NULL;
815 current->url = (char *)xmalloc (size + 1);
816 memcpy (current->url, pbuf, size);
817 current->url[size] = '\0';
821 /* Free the buffer. */
/* NOTE(review): large fragment of get_urls_html() -- declarations,
   many braces, the htmlfindurl() loop header, the base/cbase cleanup,
   and the list head/tail bookkeeping are missing from this excerpt;
   preserved verbatim.  Visible pipeline per link: trim surrounding
   whitespace, decode HTML entities (needs_freeing keeps the original
   pointer for free()), detect whether the link has a protocol
   (treating "http:RELATIVE" specially), reject unsupported schemes,
   build an absolute URL via construct() against <base href>,
   opt.base_href or this_url (warning on baseless relatives), and
   append a urlpos node with URELATIVE/UNOPROTO flags.  */
827 /* Similar to get_urls_file, but for HTML files. FILE is scanned as
828 an HTML document using htmlfindurl(), which see. get_urls_html()
829 constructs the HTML-s from the relative href-s.
831 If SILENT is non-zero, do not barf on baseless relative links. */
833 get_urls_html (const char *file, const char *this_url, int silent,
834 int dash_p_leaf_HTML)
840 int step, first_time;
841 urlpos *first, *current, *old;
843 if (file && !HYPHENP (file))
845 fp = fopen (file, "rb");
848 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
855 load_file (fp, &orig_buf, &nread);
856 if (file && !HYPHENP (file))
858 DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
859 first = current = NULL;
861 /* Iterate over the URLs in BUF, picked by htmlfindurl(). */
863 (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time,
869 const char *pbuf = buf;
872 char *needs_freeing, *url_data;
876 /* A frequent phenomenon that needs to be handled are pages
877 generated by brain-damaged HTML generators, which refer to to
878 URI-s as <a href="<spaces>URI<spaces>">. We simply ignore
879 any spaces at the beginning or at the end of the string.
880 This is probably not strictly correct, but that's what the
881 browsers do, so we may follow. May the authors of "WYSIWYG"
882 HTML tools burn in hell for the damage they've inflicted! */
883 while ((pbuf < buf + step) && ISSPACE (*pbuf))
888 while (size && ISSPACE (pbuf[size - 1]))
893 /* It would be nice if we could avoid allocating memory in this
894 loop, but I don't see an easy way. To process the entities,
895 we need to either copy the data, or change it destructively.
898 We have two pointers: needs_freeing and url_data, because the
899 code below does thing like url_data += <something>, and we
900 want to pass the original string to free(). */
901 needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
902 size = strlen (url_data);
904 for (i = 0; protostrings[i]; i++)
906 if (!strncasecmp (protostrings[i], url_data,
907 MINVAL (strlen (protostrings[i]), size)))
910 /* Check for http:RELATIVE_URI. See below for details. */
912 && !(strncasecmp (url_data, "http:", 5) == 0
913 && strncasecmp (url_data, "http://", 7) != 0))
920 /* This is for extremely brain-damaged pages that refer to
921 relative URI-s as <a href="http:URL">. Just strip off the
922 silly leading "http:" (as well as any leading blanks
924 if ((size > 5) && !strncasecmp ("http:", url_data, 5))
925 url_data += 5, size -= 5;
929 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
931 if (!strncasecmp (sup_protos[i].name, url_data,
932 MINVAL (strlen (sup_protos[i].name), size)))
935 /* Do *not* accept a non-supported protocol. */
936 if (i == ARRAY_SIZE (sup_protos))
938 free (needs_freeing);
944 /* First, construct the base, which can be relative itself.
946 Criteria for creating the base are:
947 1) html_base created by <base href="...">
949 3) base provided from the command line */
950 cbase = html_base ();
954 cbase = opt.base_href;
955 if (!cbase) /* Error condition -- a baseless
958 if (!opt.quiet && !silent)
960 /* Use malloc, not alloca because this is called in
962 char *temp = (char *)malloc (size + 1);
963 strncpy (temp, url_data, size);
965 logprintf (LOG_NOTQUIET,
966 _("Error (%s): Link %s without a base provided.\n"),
970 free (needs_freeing);
974 base = construct (this_url, cbase, strlen (cbase),
978 /* Base must now be absolute, with host name and
980 if (!has_proto (cbase))
982 logprintf (LOG_NOTQUIET, _("\
983 Error (%s): Base %s relative, without referer URL.\n"),
985 free (needs_freeing);
988 base = xstrdup (cbase);
990 constr = construct (base, url_data, size, no_proto);
995 constr = (char *)xmalloc (size + 1);
996 strncpy (constr, url_data, size);
1005 tmp2 = html_base ();
1006 /* Use malloc, not alloca because this is called in a loop. */
1007 tmp = (char *)xmalloc (size + 1);
1008 strncpy (tmp, url_data, size);
1010 logprintf (LOG_ALWAYS,
1011 "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
1012 file, this_url ? this_url : "(null)",
1013 tmp2 ? tmp2 : "(null)", tmp, constr);
1018 /* Allocate the space. */
1020 current = (urlpos *)xmalloc (sizeof (urlpos));
1022 old->next = current;
1025 /* Fill the values. */
1026 memset (current, 0, sizeof (*current));
1027 current->next = NULL;
1028 current->url = constr;
1029 current->size = step;
1030 current->pos = buf - orig_buf;
1031 /* A URL is relative if the host and protocol are not named,
1032 and the name does not start with `/'. */
1033 if (no_proto && *url_data != '/')
1034 current->flags |= (URELATIVE | UNOPROTO);
1036 current->flags |= UNOPROTO;
1037 free (needs_freeing);
/* NOTE(review): fragment of free_urlpos() -- the loop construct, the
   free of l->url and of the node itself are missing; preserved
   verbatim.  Visible logic: walk the list via a saved next pointer,
   releasing the optional local_name of each node.  */
1044 /* Free the linked list of urlpos. */
1046 free_urlpos (urlpos *l)
1050 urlpos *next = l->next;
1052 FREE_MAYBE (l->local_name);
/* NOTE(review): fragment of rotate_backups() -- the struct stat
   declaration, the early return for non-regular files, and the
   rename() calls themselves are missing; preserved verbatim.  Visible
   logic: shift fname.1 .. fname.(backups-1) up by one suffix, then
   (presumably) rename fname to fname.1.  */
1058 /* Rotate FNAME opt.backups times */
1060 rotate_backups(const char *fname)
1062 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1063 char *from = (char *)alloca (maxlen);
1064 char *to = (char *)alloca (maxlen);
1068 if (stat (fname, &sb) == 0)
1069 if (S_ISREG (sb.st_mode) == 0)
1072 for (i = opt.backups; i > 1; i--)
1074 sprintf (from, "%s.%d", fname, i - 1);
1075 sprintf (to, "%s.%d", fname, i);
1076 /* #### This will fail on machines without the rename() system
1081 sprintf (to, "%s.%d", fname, 1);
/* NOTE(review): fragment of mkalldirs() -- declarations, several
   returns, the unlink of a same-named file, and the free of T are
   missing; preserved verbatim.  Visible logic: strip the trailing
   filename component, return early for a bare filename, skip work if
   the directory already exists, remove a same-named plain file (CERN
   httpd workaround described in the long comment), then call
   make_directory() and log failures.  */
1085 /* Create all the necessary directories for PATH (a file). Calls
1086 mkdirhier() internally. */
1088 mkalldirs (const char *path)
1095 p = path + strlen (path);
1096 for (; *p != '/' && p != path; p--);
1097 /* Don't create if it's just a file. */
1098 if ((p == path) && (*p != '/'))
1100 t = strdupdelim (path, p);
1101 /* Check whether the directory exists. */
1102 if ((stat (t, &st) == 0))
1104 if (S_ISDIR (st.st_mode))
1111 /* If the dir exists as a file name, remove it first. This
1112 is *only* for Wget to work with buggy old CERN http
1113 servers. Here is the scenario: When Wget tries to
1114 retrieve a directory without a slash, e.g.
1115 http://foo/bar (bar being a directory), CERN server will
1116 not redirect it too http://foo/bar/ -- it will generate a
1117 directory listing containing links to bar/file1,
1118 bar/file2, etc. Wget will lose because it saves this
1119 HTML listing to a file `bar', so it cannot create the
1120 directory. To work around this, if the file of the same
1121 name exists, we just remove it and create the directory
1123 DEBUGP (("Removing %s because of directory danger!\n", t));
1127 res = make_directory (t);
1129 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* NOTE(review): fragments of count_slashes() (only its signature
   survives here) and mkstruct() -- braces, the opt.cut_dirs guard,
   the l/strlen bookkeeping, the file-vs-index.html choice and several
   branches are missing; preserved verbatim.  Visible mkstruct logic:
   optionally cut leading directory components per opt.cut_dirs,
   resolve the canonical host name when opt.add_hostdir is on, prepend
   opt.dir_prefix and/or the host to the directory, strip a trailing
   '/', default the filename to "index.html", and sprintf the final
   "dir/file" path into an xmalloc'd buffer.  */
1135 count_slashes (const char *s)
1144 /* Return the path name of the URL-equivalent file name, with a
1145 remote-like structure of directories. */
1147 mkstruct (const struct urlinfo *u)
1149 char *host, *dir, *file, *res, *dirpref;
1152 assert (u->dir != NULL);
1153 assert (u->host != NULL);
1157 char *ptr = u->dir + (*u->dir == '/');
1158 int slash_count = 1 + count_slashes (ptr);
1159 int cut = MINVAL (opt.cut_dirs, slash_count);
1160 for (; cut && *ptr; ptr++)
1163 STRDUP_ALLOCA (dir, ptr);
1166 dir = u->dir + (*u->dir == '/');
1168 host = xstrdup (u->host);
1169 /* Check for the true name (or at least a consistent name for saving
1170 to directory) of HOST, reusing the hlist if possible. */
1171 if (opt.add_hostdir && !opt.simple_check)
1173 char *nhost = realhost (host);
1177 /* Add dir_prefix and hostname (if required) to the beginning of
1179 if (opt.add_hostdir)
1181 if (!DOTP (opt.dir_prefix))
1183 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1184 + strlen (host) + 1);
1185 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
1188 STRDUP_ALLOCA (dirpref, host);
1190 else /* not add_hostdir */
1192 if (!DOTP (opt.dir_prefix))
1193 dirpref = opt.dir_prefix;
1199 /* If there is a prefix, prepend it. */
1202 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1203 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1206 dir = xstrdup (dir);
1209 if (l && dir[l - 1] == '/')
1213 file = "index.html";
1217 /* Finally, construct the full name. */
1218 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1219 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
/* NOTE(review): fragment of url_filename() -- the opt.dirstruct
   branch, the WINDOWS #ifdef opener and the '%'-replacement body, the
   file free/reassign after prefixing, and the final returns are
   missing; preserved verbatim.  Visible logic: use mkstruct() when
   building a hierarchy, otherwise u->file or "index.html"; prepend
   opt.dir_prefix unless it is "."; on DOS-ish systems replace '%'
   characters; finally pick a unique_name() unless clobber-avoidance
   options make the existing name acceptable.  */
1224 /* Create a unique filename, corresponding to a given URL. Calls
1225 mkstruct if necessary. Does *not* actually create any directories. */
1227 url_filename (const struct urlinfo *u)
1230 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1234 file = mkstruct (u);
1240 file = xstrdup ("index.html");
1242 file = xstrdup (u->file);
1247 /* Check whether the prefix directory is something other than "."
1248 before prepending it. */
1249 if (!DOTP (opt.dir_prefix))
1251 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1252 + 1 + strlen (file) + 1);
1253 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1258 /* DOS-ish file systems don't like `%' signs in them; we change it
1263 for (p = file; *p; p++)
1267 #endif /* WINDOWS */
1269 /* Check the cases in which the unique extensions are not used:
1270 1) Clobbering is turned off (-nc).
1271 2) Retrieval with regetting.
1272 3) Timestamping is used.
1273 4) Hierarchy is built.
1275 The exception is the case when file does exist and is a
1276 directory (actually support for bad httpd-s). */
1277 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1278 && !(file_exists_p (file) && !file_non_directory_p (file)))
1281 /* Find a unique name. */
1282 name = unique_name (file);
/* NOTE(review): fragments of urlpath_length() and find_last_char() --
   return types, braces, the q-found branch of urlpath_length and the
   entire body of find_last_char are missing; preserved verbatim.
   urlpath_length presumably returns q - url when a '?' is found,
   else strlen(url); find_last_char presumably scans [b, e) backwards
   for C -- confirm against the original.  */
1287 /* Like strlen(), but allow the URL to be ended with '?'. */
1289 urlpath_length (const char *url)
1291 const char *q = strchr (url, '?');
1294 return strlen (url);
1298 find_last_char (const char *b, const char *e, char c)
/* NOTE(review): large fragment of construct() -- the declarations of
   constr/span, several braces, the loop around the "//" detection,
   the start_insert assignments in three of the four final cases, and
   the return are missing from this excerpt; preserved verbatim.
   Visible logic: when SUB lacks a protocol, splice it into URL either
   after the last slash (relative SUB, inserting a '/' when URL has
   none) or at the first slash following the "//" authority marker
   (absolute SUB); when SUB already has a protocol, just duplicate
   it.  */
1306 /* Construct an absolute URL, given a (possibly) relative one. This
1307 gets tricky if you want to cover all the "reasonable" cases, but
1308 I'm satisfied with the result. */
1310 construct (const char *url, const char *sub, int subsize, int no_proto)
1316 const char *end = url + urlpath_length (url);
1320 /* SUB is a relative URL: we need to replace everything
1321 after last slash (possibly empty) with SUB.
1323 So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
1324 our result should be "whatever/foo/qux/xyzzy". */
1325 int need_explicit_slash = 0;
1327 const char *start_insert;
1328 const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */
1331 /* No slash found at all. Append SUB to what we have,
1332 but we'll need a slash as a separator.
1334 Example: if url == "foo" and sub == "qux/xyzzy", then
1335 we cannot just append sub to url, because we'd get
1336 "fooqux/xyzzy", whereas what we want is
1339 To make sure the / gets inserted, we set
1340 need_explicit_slash to 1. We also set start_insert
1341 to end + 1, so that the length calculations work out
1342 correctly for one more (slash) character. Accessing
1343 that character is fine, since it will be the
1344 delimiter, '\0' or '?'. */
1345 /* example: "foo?..." */
1346 /* ^ ('?' gets changed to '/') */
1347 start_insert = end + 1;
1348 need_explicit_slash = 1;
1352 /* example: "whatever/foo/bar" */
1354 start_insert = last_slash + 1;
1357 span = start_insert - url;
1358 constr = (char *)xmalloc (span + subsize + 1);
1360 memcpy (constr, url, span);
1361 if (need_explicit_slash)
1362 constr[span - 1] = '/';
1364 memcpy (constr + span, sub, subsize);
1365 constr[span + subsize] = '\0';
1367 else /* *sub == `/' */
1369 /* SUB is an absolute path: we need to replace everything
1370 after (and including) the FIRST slash with SUB.
1372 So, if URL is "http://host/whatever/foo/bar", and SUB is
1373 "/qux/xyzzy", our result should be
1374 "http://host/qux/xyzzy". */
1376 const char *slash, *start_insert;
1377 const char *pos = url;
1378 int seen_slash_slash = 0;
1379 /* We're looking for the first slash, but want to ignore
1382 slash = memchr (pos, '/', end - pos);
1383 if (slash && !seen_slash_slash)
1384 if (*(slash + 1) == '/')
1387 seen_slash_slash = 1;
1391 /* At this point, SLASH is the location of the first / after
1392 "//", or the first slash altogether. START_INSERT is the
1393 pointer to the location where SUB will be inserted. When
1394 examining the last two examples, keep in mind that SUB
1397 if (!slash && !seen_slash_slash)
1398 /* example: "foo" */
1401 else if (!slash && seen_slash_slash)
1402 /* example: "http://foo" */
1405 else if (slash && !seen_slash_slash)
1406 /* example: "foo/bar" */
1409 else if (slash && seen_slash_slash)
1410 /* example: "http://something/" */
1412 start_insert = slash;
1414 span = start_insert - url;
1415 constr = (char *)xmalloc (span + subsize + 1);
1417 memcpy (constr, url, span);
1419 memcpy (constr + span, sub, subsize);
1420 constr[span + subsize] = '\0';
1423 else /* !no_proto */
1425 constr = strdupdelim (sub, sub + subsize);
/* NOTE(review): fragment of url_concat() -- the return type and
   braces are missing; preserved verbatim.  Wraps construct(),
   deriving the no_proto flag from has_proto(new_url).  */
1430 /* Like the function above, but with a saner caller interface. */
1432 url_concat (const char *base_url, const char *new_url)
1434 return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));
/* NOTE(review): fragment of opt_url() -- braces, the free/assignment
   of u->host, and the free of the old u->url are missing; preserved
   verbatim.  Visible logic: resolve the canonical host with
   realhost() and regenerate the printed URL via str_url().  */
1437 /* Optimize URL by host, destructively replacing u->host with realhost
1438 (u->host). Do this regardless of opt.simple_check. */
1440 opt_url (struct urlinfo *u)
1442 /* Find the "true" host. */
1443 char *host = realhost (u->host);
1446 assert (u->dir != NULL); /* the URL must have been parsed */
1447 /* Refresh the printed representation. */
1449 u->url = str_url (u, 0);
/* NOTE(review): fragment of getproxy() -- the return type, braces and
   the fallback return for other protocols are missing; preserved
   verbatim.  Visible logic: command-line proxy options take
   precedence over the http_proxy/ftp_proxy environment variables.  */
1452 /* Returns proxy host address, in accordance with PROTO. */
1454 getproxy (uerr_t proto)
1456 if (proto == URLHTTP)
1457 return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1458 else if (proto == URLFTP)
1459 return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
/* NOTE(review): fragment of no_proxy_match() -- the return type,
   braces and the NULL-list early return are missing; preserved
   verbatim.  Visible logic: proxy is used unless HOST suffix-matches
   an entry of the no_proxy list.  */
1464 /* Should a host be accessed through proxy, concerning no_proxy? */
1466 no_proxy_match (const char *host, const char **no_proxy)
1471 return !sufmatch (no_proxy, host);
1474 /* Change the links in an HTML document. Accepts a structure that
1475 defines the positions of all the links. */
     /* FILE is the local HTML file to rewrite in place; L is the list of
        link positions/targets collected while parsing.  The file is read
        fully into memory, optionally backed up as <file>.orig (under
        -K/--backup-converted), then rewritten with absolute links turned
        into relative ones.  NOTE(review): several lines of this function
        (brace placement, fclose calls, variable declarations) are not
        visible in this excerpt. */
1477 convert_links (const char *file, urlpos *l)
1481 downloaded_file_t downloaded_file_return;
1484 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1485 /* Read from the file.... */
     /* "rb" so the byte offsets recorded in L->pos match exactly even on
        platforms with text-mode translation (e.g. Windows CRLF). */
1486 fp = fopen (file, "rb");
1489 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1490 file, strerror (errno));
1493 /* ...to a buffer. */
1494 load_file (fp, &buf, &size);
     /* Query-only check: CHECK_FOR_FILE does not add FILE to the
        downloaded-files list (see downloaded_file() below). */
1497 downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file);
1499 if (opt.backup_converted && downloaded_file_return)
1500 /* Rather than just writing over the original .html file with the converted
1501 version, save the former to *.orig. Note we only do this for files we've
1502 _successfully_ downloaded, so we don't clobber .orig files sitting around
1503 from previous invocations. */
1505 /* Construct the backup filename as the original name plus ".orig". */
1506 size_t filename_len = strlen(file);
1507 char* filename_plus_orig_suffix;
1508 boolean already_wrote_backup_file = FALSE;
1509 slist* converted_file_ptr;
     /* Persists across calls: remembers which files already have a .orig
        backup, so a second conversion pass doesn't overwrite the true
        original with a first-pass conversion. */
1510 static slist* converted_files = NULL;
1512 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1514 /* Just write "orig" over "html". We need to do it this way because
1515 when we're checking to see if we've downloaded the file before (to
1516 see if we can skip downloading it), we don't know if it's a
1517 text/html file. Therefore we don't know yet at that stage that -E
1518 is going to cause us to tack on ".html", so we need to compare
1519 vs. the original URL plus ".orig", not the original URL plus
     /* NOTE(review): this branch assumes FILE ends in "html" (length >= 4);
        the "html" -> "orig" overwrite keeps the total length unchanged,
        hence the filename_len + 1 allocation. */
1521 filename_plus_orig_suffix = xmalloc(filename_len + 1);
1522 strcpy(filename_plus_orig_suffix, file);
1523 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1525 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1527 /* Append ".orig" to the name. */
     /* sizeof(".orig") == 6 already includes the terminating NUL. */
1528 filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig"));
1529 strcpy(filename_plus_orig_suffix, file);
1530 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1533 /* We can get called twice on the same URL thanks to the
1534 convert_all_links() call in main(). If we write the .orig file each
1535 time in such a case, it'll end up containing the first-pass conversion,
1536 not the original file. So, see if we've already been called on this
     /* Linear scan of the remembered-files list for FILE. */
1538 converted_file_ptr = converted_files;
1539 while (converted_file_ptr != NULL)
1540 if (strcmp(converted_file_ptr->string, file) == 0)
1542 already_wrote_backup_file = TRUE;
1546 converted_file_ptr = converted_file_ptr->next;
1548 if (!already_wrote_backup_file)
1550 /* Rename <file> to <file>.orig before former gets written over. */
     /* Failure to back up is reported but non-fatal: conversion proceeds. */
1551 if (rename(file, filename_plus_orig_suffix) != 0)
1552 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1553 file, filename_plus_orig_suffix, strerror (errno));
1555 /* Remember that we've already written a .orig backup for this file.
1556 Note that we never free this memory since we need it till the
1557 convert_all_links() call, which is one of the last things the
1558 program does before terminating. BTW, I'm not sure if it would be
1559 safe to just set 'converted_file_ptr->string' to 'file' below,
1560 rather than making a copy of the string... Another note is that I
1561 thought I could just add a field to the urlpos structure saying
1562 that we'd written a .orig file for this URL, but that didn't work,
1563 so I had to make this separate list. */
1564 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1565 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1566 converted_file_ptr->next = converted_files;
1567 converted_files = converted_file_ptr;
1570 free(filename_plus_orig_suffix);
1572 /* Now open the file for writing. */
     /* Safe even after rename(): we recreate FILE fresh; the backup keeps
        the original bytes. */
1573 fp = fopen (file, "wb");
1576 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1577 file, strerror (errno));
1581 /* Presumably we have to loop through multiple URLs here (even though we're
1582 only talking about a single local file) because of the -O option. */
     /* P walks the in-memory copy BUF; each iteration copies the bytes up
        to the next link position, then emits either the original or the
        converted link text. */
1583 for (p = buf; l; l = l->next)
1587 DEBUGP (("Something strange is going on. Please investigate."));
1590 /* If the URL already is relative or it is not to be converted
1591 for some other reason (e.g. because of not having been
1592 downloaded in the first place), skip it. */
1593 if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
1595 DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
1599 /* Else, reach the position of the offending URL, echoing
1600 everything up to it to the outfile. */
     /* NOTE(review): the loop body that copies each byte to FP is not
        visible in this excerpt. */
1601 for (p2 = buf + l->pos; p < p2; p++)
1603 if (l->flags & UABS2REL)
1604 /* Convert absolute URL to relative. */
1606 char *newname = construct_relative (file, l->local_name);
1607 fprintf (fp, "%s", newname);
1608 DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
1609 l->url, newname, l->pos, file));
1614 /* Output the rest of the file. */
1617 for (p2 = buf + size; p < p2; p++)
1622 logputs (LOG_VERBOSE, _("done.\n"));
1625 /* Construct and return a malloced copy of the relative link from two
1626 pieces of information: local name S1 of the referring file and
1627 local name S2 of the referred file.
1629 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1630 "jagor.srce.hr/images/news.gif", the function will return
1633 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1634 "fly.cc.fer.hr/images/fly.gif", the function will return
1635 "../images/fly.gif".
1637 Caveats: S1 should not begin with `/', unless S2 also begins with
1638 '/'. S1 should not contain things like ".." and such --
1639 construct_relative ("fly/ioccc/../index.html",
1640 "fly/images/fly.gif") will fail. (A workaround is to call
1641 something like path_simplify() on S1). */
     /* Caller owns the returned string and must free() it.
        NOTE(review): the return-type line and several loop bodies are not
        visible in this excerpt. */
1643 construct_relative (const char *s1, const char *s2)
1645 int i, cnt, sepdirs1;
     /* Absolute S2: nothing to relativize -- hand back a copy as-is. */
1649 return xstrdup (s2);
1650 /* S1 should *not* be absolute, if S2 wasn't. */
1651 assert (*s1 != '/');
1653 /* Skip the directories common to both strings. */
1656 while (s1[i] && s2[i]
     /* CNT presumably tracks the index just past the last shared '/',
        i.e. the start of S2's non-common tail -- confirm against the
        elided loop body. */
1661 if (s1[i] == '/' && s2[i] == '/')
     /* Count the directory separators remaining in S1 after the common
        prefix; each one costs a "../" in the result. */
1666 for (sepdirs1 = 0; s1[i]; i++)
1669 /* Now, construct the file as of:
1670 - ../ repeated sepdirs1 time
1671 - all the non-mutual directories of S2. */
1672 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1673 for (i = 0; i < sepdirs1; i++)
1674 memcpy (res + 3 * i, "../", 3);
1675 strcpy (res + 3 * i, s2 + cnt);
1679 /* Add URL to the head of the list L. */
     /* Allocates a fresh zeroed urlpos node holding copies of URL and
        FILE.  NOTE(review): the lines linking the node to L and returning
        it are outside this excerpt -- presumably t->next = l; return t;
        (confirm against the full source). */
1681 add_url (urlpos *l, const char *url, const char *file)
1685 t = (urlpos *)xmalloc (sizeof (urlpos));
     /* Zero the whole node so flags/pointers not set below start clean. */
1686 memset (t, 0, sizeof (*t));
1687 t->url = xstrdup (url);
1688 t->local_name = xstrdup (file);
1694 /* Remembers which files have been downloaded. In the standard case, should be
1695 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1696 download successfully (i.e. not for ones we have failures on or that we skip
1699 When we've downloaded a file and tacked on a ".html" extension due to -E,
1700 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1701 FILE_DOWNLOADED_NORMALLY.
1703 If you just want to check if a file has been previously added without adding
1704 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1705 with local filenames, not remote URLs. */
     /* Returns the recorded download_type for FILE if it is already on the
        list; otherwise records it (unless mode == CHECK_FOR_FILE) and
        returns FILE_NOT_ALREADY_DOWNLOADED.  The list lives for the whole
        program run and is never freed. */
1707 downloaded_file (downloaded_file_t mode, const char* file)
     /* Private singly-linked list node: one entry per remembered file. */
1709 typedef struct _downloaded_file_list
1712 downloaded_file_t download_type;
1713 struct _downloaded_file_list* next;
1714 } downloaded_file_list;
1716 boolean found_file = FALSE;
     /* Head of the process-wide remembered-files list. */
1717 static downloaded_file_list* downloaded_files = NULL;
1718 downloaded_file_list* rover = downloaded_files;
     /* Linear search; fine for the typical number of files per run. */
1720 while (rover != NULL)
1721 if (strcmp(rover->file, file) == 0)
1727 rover = rover->next;
1730 return rover->download_type; /* file had already been downloaded */
     /* Not found: record it, unless this was a query-only call. */
1733 if (mode != CHECK_FOR_FILE)
1735 rover = xmalloc(sizeof(*rover));
1736 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1737 rover->download_type = mode;
     /* Push onto the head of the list. */
1738 rover->next = downloaded_files;
1739 downloaded_files = rover;
1742 return FILE_NOT_ALREADY_DOWNLOADED;