2 Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
47 /* Default port definitions */
48 #define DEFAULT_HTTP_PORT 80
49 #define DEFAULT_FTP_PORT 21
51 /* URL separator (for findurl) */
52 #define URL_SEPARATOR "!\"#'(),>`{}|<>"
54 /* A list of unsafe characters for encoding, as per RFC1738. '@' and
55 ':' (not listed in RFC) were added because of user/password
56 encoding, and \033 for safe printing. */
/* NOTE(review): two alternative URL_UNSAFE definitions appear below;
   presumably they are selected by an #if/#else that is not visible in
   this excerpt — TODO confirm against the full file. */
59 # define URL_UNSAFE " <>\"#%{}|\\^~[]`@:\033"
61 # define URL_UNSAFE " <>\"%{}|\\^[]`\033"
64 /* If S contains unsafe characters, free it and replace it with a
65 version that doesn't. */
/* Multi-statement macro wrapped in do { } while (0) (closing part of the
   expansion is elided in this excerpt). */
66 #define URL_CLEANSE(s) do \
68 if (contains_unsafe (s)) \
70 char *uc_tmp = encode_string (s); \
76 /* Is a directory "."? */
77 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
78 /* Is a directory ".."? */
79 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
81 /* NULL-terminated list of strings to be recognized as prototypes (URL
82 schemes). Note that recognized doesn't mean supported -- only HTTP
83 and FTP are currently supported.
85 However, a string that does not match anything in the list will be
86 considered a relative URL. Thus it's important that this list has
87 anything anyone could think of being legal.
89 There are wild things here. :-) Take a look at
90 <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
/* Table contents (the scheme strings themselves) are elided in this
   excerpt. */
92 static char *protostrings[] =
134 /* Similar to former, but for supported protocols: */
/* Each entry maps a scheme prefix to its protocol enum value and its
   default port number. */
135 static struct proto sup_protos[] =
137 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
138 { "ftp://", URLFTP, DEFAULT_FTP_PORT },
139 /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
/* Forward declarations for the static helpers defined later in this
   file. */
142 static void parse_dir PARAMS ((const char *, char **, char **));
143 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
144 static char *construct PARAMS ((const char *, const char *, int , int));
145 static char *construct_relative PARAMS ((const char *, const char *));
146 static char process_ftp_type PARAMS ((char *));
149 /* Returns the number of characters to be skipped if the first thing
150 in a URL is URL: (which is 0 or 4+). The optional spaces after
151 URL: are also skipped. */
153 skip_url (const char *url)
/* Case-insensitive check for a leading "URL" (the ':' check at url[3]
   is presumably in an elided line — TODO confirm). */
157 if (toupper (url[0]) == 'U'
158 && toupper (url[1]) == 'R'
159 && toupper (url[2]) == 'L'
/* Skip any whitespace following the "URL:" prefix. */
163 for (i = 4; url[i] && ISSPACE (url[i]); i++);
170 /* Returns 1 if the string contains unsafe characters, 0 otherwise. */
172 contains_unsafe (const char *s)
/* Scan S; any character present in URL_UNSAFE makes it "unsafe". */
175 if (strchr (URL_UNSAFE, *s))
180 /* Decodes the forms %xy in a URL to the character the hexadecimal
181 code of which is xy. xy are hexadecimal digits from
182 [0123456789ABCDEF] (case-insensitive). If x or y are not
183 hex-digits or `%' precedes `\0', the sequence is inserted
187 decode_string (char *s)
/* In-place decode: P writes over S, which is safe because the decoded
   string is never longer than the encoded one. */
197 /* Do nothing if at the end of the string, or if the chars
198 are not hex-digits. */
199 if (!*(s + 1) || !*(s + 2)
200 || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
/* Combine the two hex digits into one byte. */
205 *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
212 /* Encodes the unsafe characters (listed in URL_UNSAFE) in a given
213 string, returning a malloc-ed %XX encoded string. */
215 encode_string (const char *s)
/* First pass: count output length (each unsafe char expands from one
   byte to "%XX", i.e. two extra bytes). */
222 for (i = 0; *s; s++, i++)
223 if (strchr (URL_UNSAFE, *s))
224 i += 2; /* Two more characters (hex digits) */
225 res = (char *)xmalloc (i + 1);
/* Second pass: copy, replacing unsafe chars with %XX. The rewind of S
   to the start of the string is presumably in an elided line. */
227 for (p = res; *s; s++)
228 if (strchr (URL_UNSAFE, *s))
230 const unsigned char c = *s;
232 *p++ = HEXD2ASC (c >> 4);
233 *p++ = HEXD2ASC (c & 0xf);
241 /* Returns the proto-type if URL's protocol is supported, or
242 URLUNKNOWN if not. */
244 urlproto (const char *url)
248 url += skip_url (url);
/* Direct match against the supported-protocol prefixes. */
249 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
250 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
251 return sup_protos[i].ind;
/* No explicit scheme: look for a "host:port/..." shape. Skip to the
   first ':' or '/'. */
252 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
/* After a ':', check whether everything up to the next '/' is digits
   (i.e. a port number). */
255 for (++i; url[i] && url[i] != '/'; i++)
256 if (!ISDIGIT (url[i]))
258 if (url[i - 1] == ':')
267 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
268 part is found, returns 0. */
270 skip_proto (const char *url)
/* Try each recognized scheme prefix in turn. */
275 for (s = protostrings; *s; s++)
276 if (!strncasecmp (*s, url, strlen (*s)))
281 /* HTTP and FTP protocols are expected to yield exact host names
282 (i.e. the `//' part must be skipped, too). */
283 if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
288 /* Returns 1 if the URL begins with a protocol (supported or
289 unsupported), 0 otherwise. */
291 has_proto (const char *url)
295 url += skip_url (url);
/* Match against every recognized scheme, not just the supported ones. */
296 for (s = protostrings; *s; s++)
297 if (strncasecmp (url, *s, strlen (*s)) == 0)
302 /* Skip the username and password, if present here. The function
303 should be called *not* with the complete URL, but with the part
304 right after the protocol.
306 If no username and password are found, return 0. */
308 skip_uname (const char *url)
/* The user:password part, if any, must occur before the first '/'. */
311 for (p = url; *p && *p != '/'; p++)
314 /* If a `@' was found before the first occurrence of `/', skip
322 /* Allocate a new urlinfo structure, fill it with default values and
323 return a pointer to it. */
/* Zero-fill so all pointer members start NULL; only proto needs an
   explicit non-zero default. */
329 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
330 memset (u, 0, sizeof (*u));
331 u->proto = URLUNKNOWN;
335 /* Perform a "deep" free of the urlinfo structure. The structure
336 should have been created with newurl, but need not have been used.
337 If free_pointer is non-0, free the pointer itself. */
339 freeurl (struct urlinfo *u, int complete)
/* FREE_MAYBE presumably frees only non-NULL pointers — TODO confirm
   the macro definition (not visible in this excerpt). */
343 FREE_MAYBE (u->host);
344 FREE_MAYBE (u->path);
345 FREE_MAYBE (u->file);
347 FREE_MAYBE (u->user);
348 FREE_MAYBE (u->passwd);
349 FREE_MAYBE (u->local);
350 FREE_MAYBE (u->referer);
/* Recursively free the proxy urlinfo, including its pointer. */
352 freeurl (u->proxy, 1);
358 /* Extract the given URL of the form
359 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
360 1. hostname (terminated with `/' or `:')
361 2. port number (terminated with `/'), or chosen for the protocol
362 3. dirname (everything after hostname)
363 Most errors are handled. No allocation is done, you must supply
364 pointers to allocated memory.
365 ...and a host of other stuff :-)
367 - Recognizes hostname:dir/file for FTP and
368 hostname (:portnum)?/dir/file for HTTP.
369 - Parses the path to yield directory and file
370 - Parses the URL to yield the username and passwd (if present)
371 - Decodes the strings, in case they contain "forbidden" characters
372 - Writes the result to struct urlinfo
374 If the argument STRICT is set, it recognizes only the canonical
377 parseurl (const char *url, struct urlinfo *u, int strict)
380 int recognizable; /* Recognizable URL is the one where
381 the protocol name was explicitly
382 named, i.e. it wasn't deduced from
386 DEBUGP (("parseurl (\"%s\") -> ", url));
387 url += skip_url (url);
388 recognizable = has_proto (url);
389 if (strict && !recognizable)
/* Find which supported protocol (if any) the URL starts with; L ends
   up as the length of the matched scheme prefix. */
391 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
393 l = strlen (sup_protos[i].name);
394 if (!strncasecmp (sup_protos[i].name, url, l))
397 /* If protocol is recognizable, but unsupported, bail out, else
399 if (recognizable && !sup_protos[i].name)
401 else if (i == ARRAY_SIZE (sup_protos))
404 u->proto = type = sup_protos[i].ind;
406 if (type == URLUNKNOWN)
408 /* Allow a username and password to be specified (i.e. just skip
411 l += skip_uname (url + l);
/* The hostname extends to the first ':' or '/'. */
412 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
415 /* Get the hostname. */
416 u->host = strdupdelim (url + l, url + i);
417 DEBUGP (("host %s -> ", u->host));
419 /* Assume no port has been given. */
423 /* We have a colon delimiting the hostname. It could mean that
424 a port number is following it, or a directory. */
425 if (ISDIGIT (url[++i])) /* A port number */
427 if (type == URLUNKNOWN)
428 u->proto = type = URLHTTP;
/* Accumulate the decimal port digits up to the next '/'. */
429 for (; url[i] && url[i] != '/'; i++)
430 if (ISDIGIT (url[i]))
431 u->port = 10 * u->port + (url[i] - '0');
436 DEBUGP (("port %hu -> ", u->port));
438 else if (type == URLUNKNOWN) /* or a directory */
439 u->proto = type = URLFTP;
440 else /* or just a misformed port number */
443 else if (type == URLUNKNOWN)
444 u->proto = type = URLHTTP;
/* No explicit port: fall back to the protocol's default from
   sup_protos. */
448 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
449 if (sup_protos[i].ind == type)
451 if (i == ARRAY_SIZE (sup_protos))
453 u->port = sup_protos[i].port;
455 /* Some delimiter troubles... */
456 if (url[i] == '/' && url[i - 1] != ':')
/* Collapse any run of leading slashes before the path. */
459 while (url[i] && url[i] == '/')
462 /* dfb: break "path" into "path" and "qstring" if the URL is HTTP
463 if it's not an HTTP url, set l to the last character, so the
464 xmalloc and strncpy work as desired */
465 if (type == URLHTTP) {
466 for (l = i; url[l] && url[l] != '?'; l++);
467 if (l != strlen(url)) {
468 /* copy the query string, including the '?' into u->qstring */
469 u->qstring = (char *)xmalloc (strlen (url + l) + 8);
470 strcpy (u->qstring, url + l);
477 u->path = strdupdelim (url + i, url + l);
/* FTP only: honor a trailing ";type=X" suffix (stripped from the
   path by process_ftp_type). */
480 u->ftp_type = process_ftp_type (u->path);
481 /* #### We don't handle type `d' correctly yet. */
482 if (!u->ftp_type || toupper (u->ftp_type) == 'D')
485 DEBUGP (("opath %s -> ", u->path));
486 /* Parse the username and password (if existing). */
487 parse_uname (url, &u->user, &u->passwd);
488 /* Decode the strings, as per RFC 1738. */
489 decode_string (u->host);
490 decode_string (u->path);
492 decode_string (u->user);
494 decode_string (u->passwd);
495 /* Parse the directory. */
496 parse_dir (u->path, &u->dir, &u->file);
497 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
498 if (type == URLHTTP && u->qstring)
499 DEBUGP (("query-string %s -> ", u->qstring));
500 /* Simplify the directory. */
501 path_simplify (u->dir);
502 /* Remove the leading `/' in HTTP. */
503 if (type == URLHTTP && *u->dir == '/')
504 strcpy (u->dir, u->dir + 1);
505 DEBUGP (("ndir %s\n", u->dir));
506 /* Strip trailing `/'. */
508 if (l && u->dir[l - 1] == '/')
509 u->dir[l - 1] = '\0';
510 /* Re-create the path: */
511 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
512 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
513 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
/* An absolute FTP path is written back with an escaped leading slash
   ("%2F") so it survives re-parsing. */
514 strcpy (u->path, abs_ftp ? "%2F" : "/");
515 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
516 strcat (u->path, *u->dir ? "/" : "");
517 strcat (u->path, u->file);
518 URL_CLEANSE (u->path);
519 /* Create the clean URL. */
520 u->url = str_url (u, 0);
524 /* Build the directory and filename components of the path. Both
525 components are *separately* malloc-ed strings! It does not change
526 the contents of path.
528 If the path ends with "." or "..", they are (correctly) counted as
531 parse_dir (const char *path, char **dir, char **file)
/* Find the last '/' in PATH; I is left at its index (or 0). */
535 for (i = l = strlen (path); i && path[i] != '/'; i--);
536 if (!i && *path != '/') /* Just filename */
/* A bare "." or ".." is a directory, not a file. */
538 if (DOTP (path) || DDOTP (path))
540 *dir = xstrdup (path);
541 *file = xstrdup ("");
545 *dir = xstrdup (""); /* This is required because of FTP */
546 *file = xstrdup (path);
549 else if (!i) /* /filename */
551 if (DOTP (path + 1) || DDOTP (path + 1))
553 *dir = xstrdup (path);
554 *file = xstrdup ("");
558 *dir = xstrdup ("/");
559 *file = xstrdup (path + 1);
562 else /* Nonempty directory with or without a filename */
564 if (DOTP (path + i + 1) || DDOTP (path + i + 1))
566 *dir = xstrdup (path);
567 *file = xstrdup ("");
/* Split at the last '/': everything before it is the dir, the rest
   is the file. */
571 *dir = strdupdelim (path, path + i);
572 *file = strdupdelim (path + i + 1, path + l + 1);
577 /* Find the optional username and password within the URL, as per
578 RFC1738. The returned user and passwd char pointers are
581 parse_uname (const char *url, char **user, char **passwd)
589 url += skip_url (url);
590 /* Look for end of protocol string. */
591 l = skip_proto (url);
594 /* Add protocol offset. */
596 /* Is there an `@' character? */
597 for (p = url; *p && *p != '/'; p++)
600 /* If not, return. */
603 /* Else find the username and password. */
/* COL tracks the start of the current field; a ':' ends the username
   and starts the password. */
604 for (p = col = url; *p != '@'; p++)
606 if (*p == ':' && !*user)
608 *user = (char *)xmalloc (p - url + 1);
609 memcpy (*user, url, p - url);
610 (*user)[p - url] = '\0';
614 /* Decide whether you have only the username or both. */
615 where = *user ? passwd : user;
616 *where = (char *)xmalloc (p - col + 1);
617 memcpy (*where, col, p - col);
618 (*where)[p - col] = '\0';
622 /* If PATH ends with `;type=X', return the character X. */
624 process_ftp_type (char *path)
626 int len = strlen (path);
/* The 6 bytes at len-7 must be ";type=" and the last byte is X. */
629 && !memcmp (path + len - 7, ";type=", 6))
/* Strip the ";type=X" suffix from PATH in place before returning X. */
631 path[len - 7] = '\0';
632 return path[len - 1];
638 /* Return the URL as fine-formed string, with a proper protocol, port
639 number, directory and optional user/password. If HIDE is non-zero,
640 password will be hidden. The forbidden characters in the URL will
643 str_url (const struct urlinfo *u, int hide)
645 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
646 int i, l, ln, lu, lh, lp, lf, ld, lq;
648 /* Look for the protocol name. */
649 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
650 if (sup_protos[i].ind == u->proto)
652 if (i == ARRAY_SIZE (sup_protos))
654 proto_name = sup_protos[i].name;
/* CLEANDUP presumably duplicates the string with unsafe characters
   %XX-encoded — TODO confirm the macro (not visible here). */
655 host = CLEANDUP (u->host);
656 dir = CLEANDUP (u->dir);
657 file = CLEANDUP (u->file);
658 user = passwd = NULL;
660 user = CLEANDUP (u->user);
664 passwd = CLEANDUP (u->passwd);
/* When HIDE is set, the password characters are overwritten (the
   replacement character is on an elided line). */
666 for (i = 0; passwd[i]; i++)
669 if (u->proto == URLFTP && *dir == '/')
/* Escape a leading '/' of an absolute FTP dir as "%2F". */
671 char *tmp = (char *)xmalloc (strlen (dir) + 3);
672 /*sprintf (tmp, "%%2F%s", dir + 1);*/
676 strcpy (tmp + 3, dir + 1);
/* Precompute the component lengths for the final allocation. */
681 ln = strlen (proto_name);
682 lu = user ? strlen (user) : 0;
683 lp = passwd ? strlen (passwd) : 0;
687 lq = (u->proto == URLHTTP && u->qstring) ? strlen (u->qstring) : 0;
688 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + lq + 20); /* safe sex */
689 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
690 (user ? user : ""), (passwd ? ":" : ""),
691 (passwd ? passwd : ""), (user ? "@" : ""),
692 host, u->port, dir, *dir ? "/" : "", file); */
/* Assemble the URL piecewise with memcpy; L tracks the write offset. */
694 memcpy (res, proto_name, ln);
698 memcpy (res + l, user, lu);
703 memcpy (res + l, passwd, lp);
708 memcpy (res + l, host, lh);
711 long_to_string (res + l, (long)u->port);
712 l += numdigit (u->port);
714 memcpy (res + l, dir, ld);
718 strcpy (res + l, file);
725 /* copy in the raw query string to avoid munging arguments */
726 memcpy (res + l, u->qstring, lq);
733 /* Check whether two URL-s are equivalent, i.e. pointing to the same
734 location. Uses parseurl to parse them, and compares the canonical
737 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
738 return 0 on error. */
740 url_equal (const char *url1, const char *url2)
742 struct urlinfo *u1, *u2;
/* Parse both URLs non-strictly and compare their canonical forms. */
747 err = parseurl (url1, u1, 0);
754 err = parseurl (url2, u2, 0);
760 res = !strcmp (u1->url, u2->url);
766 /* Find URL of format scheme:hostname[:port]/dir in a buffer. The
767 buffer may contain pretty much anything; no errors are signaled. */
769 findurl (const char *buf, int howmuch, int *count)
/* Slide a window over BUF, testing each position against every
   recognized scheme prefix. */
774 for (s1 = buf; howmuch; s1++, howmuch--)
775 for (prot = protostrings; *prot; prot++)
776 if (howmuch <= strlen (*prot))
778 else if (!strncasecmp (*prot, s1, strlen (*prot)))
/* Found a scheme: extend the match over printable, non-space,
   non-separator characters; *COUNT receives the URL length. */
780 for (s2 = s1, *count = 0;
781 howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
782 !strchr (URL_SEPARATOR, *s2);
783 s2++, (*count)++, howmuch--);
789 /* Scans the file for signs of URL-s. Returns a vector of pointers,
790 each pointer representing a URL string. The file is *not* assumed
793 get_urls_file (const char *file)
800 urlpos *first, *current, *old;
/* HYPHENP presumably tests for the "-" (stdin) filename — TODO
   confirm the macro (not visible here). */
802 if (file && !HYPHENP (file))
804 fp = fopen (file, "rb");
807 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
814 load_file (fp, &buf, &nread);
815 if (file && !HYPHENP (file))
817 DEBUGP (("Loaded %s (size %ld).\n", file, nread));
818 first = current = NULL;
819 /* Fill the linked list with URLs. */
/* Repeatedly call findurl on the remainder of the buffer; SIZE is the
   length of each match. */
820 for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
823 /* Allocate the space. */
825 current = (urlpos *)xmalloc (sizeof (urlpos));
828 memset (current, 0, sizeof (*current));
829 current->next = NULL;
830 current->url = (char *)xmalloc (size + 1);
831 memcpy (current->url, pbuf, size);
832 current->url[size] = '\0';
836 /* Free the buffer. */
842 /* Similar to get_urls_file, but for HTML files. FILE is scanned as
843 an HTML document using htmlfindurl(), which see. get_urls_html()
844 constructs the HTML-s from the relative href-s.
846 If SILENT is non-zero, do not barf on baseless relative links. */
848 get_urls_html (const char *file, const char *this_url, int silent)
854 int step, first_time;
855 urlpos *first, *current, *old;
857 if (file && !HYPHENP (file))
859 fp = fopen (file, "rb");
862 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
869 load_file (fp, &orig_buf, &nread);
870 if (file && !HYPHENP (file))
872 DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
873 first = current = NULL;
875 /* Iterate over the URLs in BUF, picked by htmlfindurl(). */
877 (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time));
882 const char *pbuf = buf;
888 /* A frequent phenomenon that needs to be handled are pages
889 generated by brain-damaged HTML generators, which refer to
890 URI-s as <a href="<spaces>URI<spaces>">. We simply ignore
891 any spaces at the beginning or at the end of the string.
892 This is probably not strictly correct, but that's what the
893 browsers do, so we may follow. May the authors of "WYSIWYG"
894 HTML tools burn in hell for the damage they've inflicted! */
895 while ((pbuf < buf + step) && ISSPACE (*pbuf))
900 while (size && ISSPACE (pbuf[size - 1]))
/* Decide whether the link carries an explicit scheme; if not it will
   be treated as relative (no_proto). */
905 for (i = 0; protostrings[i]; i++)
907 if (!strncasecmp (protostrings[i], pbuf,
908 MINVAL (strlen (protostrings[i]), size)))
911 /* Check for http:RELATIVE_URI. See below for details. */
913 && !(strncasecmp (pbuf, "http:", 5) == 0
914 && strncasecmp (pbuf, "http://", 7) != 0))
921 /* This is for extremely brain-damaged pages that refer to
922 relative URI-s as <a href="http:URL">. Just strip off the
923 silly leading "http:" (as well as any leading blanks
925 if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
926 pbuf += 5, size -= 5;
930 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
932 if (!strncasecmp (sup_protos[i].name, pbuf,
933 MINVAL (strlen (sup_protos[i].name), size)))
936 /* Do *not* accept a non-supported protocol. */
937 if (i == ARRAY_SIZE (sup_protos))
942 /* First, construct the base, which can be relative itself.
944 Criteria for creating the base are:
945 1) html_base created by <base href="...">
947 3) base provided from the command line */
948 cbase = html_base ();
952 cbase = opt.base_href;
953 if (!cbase) /* Error condition -- a baseless
956 if (!opt.quiet && !silent)
958 /* Use malloc, not alloca because this is called in
960 char *temp = (char *)malloc (size + 1);
961 strncpy (temp, pbuf, size);
963 logprintf (LOG_NOTQUIET,
964 _("Error (%s): Link %s without a base provided.\n"),
971 base = construct (this_url, cbase, strlen (cbase),
975 /* Base must now be absolute, with host name and
977 if (!has_proto (cbase))
979 logprintf (LOG_NOTQUIET, _("\
980 Error (%s): Base %s relative, without referer URL.\n"),
984 base = xstrdup (cbase);
/* Merge the (possibly relative) link with the absolute base. */
986 constr = construct (base, pbuf, size, no_proto);
991 constr = (char *)xmalloc (size + 1);
992 strncpy (constr, pbuf, size);
1001 tmp2 = html_base ();
1002 /* Use malloc, not alloca because this is called in a loop. */
1003 tmp = (char *)xmalloc (size + 1);
1004 strncpy (tmp, pbuf, size);
1006 logprintf (LOG_ALWAYS,
1007 "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
1008 file, this_url ? this_url : "(null)",
1009 tmp2 ? tmp2 : "(null)", tmp, constr);
1014 /* Allocate the space. */
1016 current = (urlpos *)xmalloc (sizeof (urlpos));
1018 old->next = current;
1021 /* Fill the values. */
1022 memset (current, 0, sizeof (*current));
1023 current->next = NULL;
1024 current->url = constr;
1025 current->size = size;
1026 current->pos = pbuf - orig_buf;
1027 /* A URL is relative if the host and protocol are not named,
1028 and the name does not start with `/'. */
1029 if (no_proto && *pbuf != '/')
1030 current->flags |= (URELATIVE | UNOPROTO);
1032 current->flags |= UNOPROTO;
1039 /* Free the linked list of urlpos. */
1041 free_urlpos (urlpos *l)
/* Save the next pointer before freeing the current node. */
1045 urlpos *next = l->next;
1047 FREE_MAYBE (l->local_name);
1053 /* Rotate FNAME opt.backups times */
1055 rotate_backups(const char *fname)
/* maxlen: FNAME + '.' + widest backup number + NUL. */
1057 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1058 char *from = (char *)alloca (maxlen);
1059 char *to = (char *)alloca (maxlen);
/* Only rotate regular files. */
1063 if (stat (fname, &sb) == 0)
1064 if (S_ISREG (sb.st_mode) == 0)
/* Shift fname.(i-1) -> fname.i, from the oldest down. */
1067 for (i = opt.backups; i > 1; i--)
1069 sprintf (from, "%s.%d", fname, i - 1)
1070 sprintf (to, "%s.%d", fname, i);
1071 /* #### This will fail on machines without the rename() system
1076 sprintf (to, "%s.%d", fname, 1);
1080 /* Create all the necessary directories for PATH (a file). Calls
1081 mkdirhier() internally. */
1083 mkalldirs (const char *path)
/* Find the last '/' in PATH; everything before it is the directory to
   create. */
1090 p = path + strlen (path);
1091 for (; *p != '/' && p != path; p--);
1092 /* Don't create if it's just a file. */
1093 if ((p == path) && (*p != '/'))
1095 t = strdupdelim (path, p);
1096 /* Check whether the directory exists. */
1097 if ((stat (t, &st) == 0))
1099 if (S_ISDIR (st.st_mode))
1106 /* If the dir exists as a file name, remove it first. This
1107 is *only* for Wget to work with buggy old CERN http
1108 servers. Here is the scenario: When Wget tries to
1109 retrieve a directory without a slash, e.g.
1110 http://foo/bar (bar being a directory), CERN server will
1111 not redirect it to http://foo/bar/ -- it will generate a
1112 directory listing containing links to bar/file1,
1113 bar/file2, etc. Wget will lose because it saves this
1114 HTML listing to a file `bar', so it cannot create the
1115 directory. To work around this, if the file of the same
1116 name exists, we just remove it and create the directory
1118 DEBUGP (("Removing %s because of directory danger!\n", t));
1122 res = make_directory (t);
1124 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Count the number of '/' characters in S. */
1130 count_slashes (const char *s)
1139 /* Return the path name of the URL-equivalent file name, with a
1140 remote-like structure of directories. */
1142 mkstruct (const struct urlinfo *u)
1144 char *host, *dir, *file, *res, *dirpref;
1147 assert (u->dir != NULL);
1148 assert (u->host != NULL);
/* --cut-dirs support: drop up to opt.cut_dirs leading directory
   components from u->dir. */
1152 char *ptr = u->dir + (*u->dir == '/');
1153 int slash_count = 1 + count_slashes (ptr);
1154 int cut = MINVAL (opt.cut_dirs, slash_count);
1155 for (; cut && *ptr; ptr++)
1158 STRDUP_ALLOCA (dir, ptr);
1161 dir = u->dir + (*u->dir == '/');
1163 host = xstrdup (u->host);
1164 /* Check for the true name (or at least a consistent name for saving
1165 to directory) of HOST, reusing the hlist if possible. */
1166 if (opt.add_hostdir && !opt.simple_check)
1168 char *nhost = realhost (host);
1172 /* Add dir_prefix and hostname (if required) to the beginning of
1174 if (opt.add_hostdir)
1176 if (!DOTP (opt.dir_prefix))
1178 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1179 + strlen (host) + 1);
1180 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
1183 STRDUP_ALLOCA (dirpref, host);
1185 else /* not add_hostdir */
1187 if (!DOTP (opt.dir_prefix))
1188 dirpref = opt.dir_prefix;
1194 /* If there is a prefix, prepend it. */
1197 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1198 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1201 dir = xstrdup (dir);
/* Strip a trailing '/' from the directory part. */
1204 if (l && dir[l - 1] == '/')
/* Directory URLs get a default file name. */
1208 file = "index.html";
1212 /* Finally, construct the full name. */
1213 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1214 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1219 /* Create a unique filename, corresponding to a given URL. Calls
1220 mkstruct if necessary. Does *not* actually create any directories. */
1222 url_filename (const struct urlinfo *u)
1225 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1229 file = mkstruct (u);
/* No directory structure requested: use u->file, or a default name
   for directory URLs. */
1235 file = xstrdup ("index.html");
1237 file = xstrdup (u->file);
1242 /* Check whether the prefix directory is something other than "."
1243 before prepending it. */
1244 if (!DOTP (opt.dir_prefix))
1246 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1247 + 1 + strlen (file) + 1);
1248 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1253 /* DOS-ish file systems don't like `%' signs in them; we change it
1258 for (p = file; *p; p++)
1262 #endif /* WINDOWS */
1264 /* Check the cases in which the unique extensions are not used:
1265 1) Clobbering is turned off (-nc).
1266 2) Retrieval with regetting.
1267 3) Timestamping is used.
1268 4) Hierarchy is built.
1270 The exception is the case when file does exist and is a
1271 directory (actually support for bad httpd-s). */
1272 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1273 && !(file_exists_p (file) && !file_non_directory_p (file)))
1276 /* Find a unique name. */
1277 name = unique_name (file);
1282 /* Construct an absolute URL, given a (possibly) relative one. This
1283 is more tricky than it might seem, but it works. */
1285 construct (const char *url, const char *sub, int subsize, int no_proto)
/* Relative SUB without a leading '/': replace the last path component
   of URL with SUB. */
1295 for (i = strlen (url); i && url[i] != '/'; i--);
1296 if (!i || (url[i] == url[i - 1]))
/* URL has no usable '/' (or ends in "//"): append one. */
1298 int l = strlen (url);
1299 char *t = (char *)alloca (l + 2);
1306 constr = (char *)xmalloc (i + 1 + subsize + 1);
1307 strncpy (constr, url, i + 1);
1308 constr[i + 1] = '\0';
1309 strncat (constr, sub, subsize);
1311 else /* *sub == `/' */
/* SUB is host-absolute: keep only URL's scheme+host, then append SUB.
   Skip over the scheme (and "//" if present) to find the host end. */
1318 for (; url[i] && url[i] != '/'; i++);
1321 fl = (url[i] == url[i + 1] && url[i + 1] == '/');
1328 int l = strlen (url);
1329 char *t = (char *)alloca (l + 2);
1335 constr = (char *)xmalloc (i + 1 + subsize + 1);
1336 strncpy (constr, url, i);
1338 strncat (constr + i, sub, subsize);
1339 constr[i + subsize] = '\0';
1342 else /* !no_proto */
/* SUB is already absolute (has its own scheme): just copy it. */
1344 constr = (char *)xmalloc (subsize + 1);
1345 strncpy (constr, sub, subsize);
1346 constr[subsize] = '\0';
1351 /* Optimize URL by host, destructively replacing u->host with realhost
1352 (u->host). Do this regardless of opt.simple_check. */
1354 opt_url (struct urlinfo *u)
1356 /* Find the "true" host. */
1357 char *host = realhost (u->host);
1360 assert (u->dir != NULL); /* the URL must have been parsed */
1361 /* Refresh the printed representation. */
1363 u->url = str_url (u, 0);
1366 /* Returns proxy host address, in accordance with PROTO. */
1368 getproxy (uerr_t proto)
/* Command-line/wgetrc options take precedence over the environment
   variables. */
1370 if (proto == URLHTTP)
1371 return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1372 else if (proto == URLFTP)
1373 return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1378 /* Should a host be accessed through proxy, concerning no_proxy? */
1380 no_proxy_match (const char *host, const char **no_proxy)
/* sufmatch presumably does a domain-suffix match of HOST against the
   no_proxy list — TODO confirm (definition not visible here). */
1385 return !sufmatch (no_proxy, host);
1388 /* Change the links in an HTML document. Accepts a structure that
1389 defines the positions of all the links. */
1391 convert_links (const char *file, urlpos *l)
1397 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1398 /* Read from the file.... */
1399 fp = fopen (file, "rb");
1402 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1403 file, strerror (errno));
1406 /* ...to a buffer. */
1407 load_file (fp, &buf, &size);
1409 if (opt.backup_converted && downloaded_file(CHECK_FOR_FILE, file))
1410 /* Rather than just writing over the original .html file with the converted
1411 version, save the former to *.orig. Note we only do this for files we've
1412 _successfully_ downloaded, so we don't clobber .orig files sitting around
1413 from previous invocations. */
1415 /* Construct the backup filename as the original name plus ".orig". */
1416 size_t filename_len = strlen(file);
/* NOTE(review): plain malloc here, unlike the xmalloc used elsewhere in
   this file; the result is presumably not NULL-checked — worth
   confirming against the full function body. */
1417 char* filename_plus_orig_suffix = malloc(filename_len +
1419 boolean already_wrote_backup_file = FALSE;
1420 slist* converted_file_ptr;
1421 static slist* converted_files = NULL;
1423 /* Would a single s[n]printf() call be faster? */
1424 strcpy(filename_plus_orig_suffix, file);
1425 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1427 /* We can get called twice on the same URL thanks to the
1428 convert_all_links() call in main(). If we write the .orig file each
1429 time in such a case, it'll end up containing the first-pass conversion,
1430 not the original file. So, see if we've already been called on this
1432 converted_file_ptr = converted_files;
1433 while (converted_file_ptr != NULL)
1434 if (strcmp(converted_file_ptr->string, file) == 0)
1436 already_wrote_backup_file = TRUE;
1440 converted_file_ptr = converted_file_ptr->next;
1442 if (!already_wrote_backup_file)
1444 /* Rename <file> to <file>.orig before former gets written over. */
1445 if (rename(file, filename_plus_orig_suffix) != 0)
1446 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1447 file, filename_plus_orig_suffix, strerror (errno));
1449 /* Remember that we've already written a .orig backup for this file.
1450 Note that we never free this memory since we need it till the
1451 convert_all_links() call, which is one of the last things the
1452 program does before terminating. BTW, I'm not sure if it would be
1453 safe to just set 'converted_file_ptr->string' to 'file' below,
1454 rather than making a copy of the string... Another note is that I
1455 thought I could just add a field to the urlpos structure saying
1456 that we'd written a .orig file for this URL, but that didn't work,
1457 so I had to make this separate list. */
1458 converted_file_ptr = malloc(sizeof(slist));
1459 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1460 converted_file_ptr->next = converted_files;
1461 converted_files = converted_file_ptr;
1464 free(filename_plus_orig_suffix);
1466 /* Now open the file for writing. */
1467 fp = fopen (file, "wb");
1470 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1471 file, strerror (errno));
1475 /* [If someone understands why multiple URLs can correspond to one local file,
1476 can they please add a comment here...?] */
1477 for (p = buf; l; l = l->next)
1481 DEBUGP (("Something strange is going on. Please investigate."));
1484 /* If the URL already is relative or it is not to be converted
1485 for some other reason (e.g. because of not having been
1486 downloaded in the first place), skip it. */
1487 if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
1489 DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
1493 /* Else, reach the position of the offending URL, echoing
1494 everything up to it to the outfile. */
1495 for (p2 = buf + l->pos; p < p2; p++)
1497 if (l->flags & UABS2REL)
/* Replace the absolute URL in the output with a relative path to the
   locally saved copy. */
1499 char *newname = construct_relative (file, l->local_name);
1500 fprintf (fp, "%s", newname);
1501 DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
1502 l->url, newname, l->pos, file));
/* Echo the remainder of the buffer after the last converted link. */
1509 for (p2 = buf + size; p < p2; p++)
1514 logputs (LOG_VERBOSE, _("done.\n"));
1517 /* Construct and return a malloced copy of the relative link from two
1518 pieces of information: local name S1 of the referring file and
1519 local name S2 of the referred file.
1521 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1522 "jagor.srce.hr/images/news.gif", the function will return
1525 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1526 "fly.cc.fer.hr/images/fly.gif", the function will return
1527 "../images/fly.gif".
1529 Caveats: S1 should not begin with `/', unless S2 also begins with
1530 '/'. S1 should not contain things like ".." and such --
1531 construct_relative ("fly/ioccc/../index.html",
1532 "fly/images/fly.gif") will fail. (A workaround is to call
1533 something like path_simplify() on S1). */
1535 construct_relative (const char *s1, const char *s2)
1537 int i, cnt, sepdirs1;
1541 return xstrdup (s2);
1542 /* S1 should *not* be absolute, if S2 wasn't. */
1543 assert (*s1 != '/');
1545 /* Skip the directories common to both strings. */
/* CNT ends up at the index just past the last common '/'. */
1548 while (s1[i] && s2[i]
1553 if (s1[i] == '/' && s2[i] == '/')
/* Count how many directories remain in S1 below the common prefix;
   each needs a "../" in the result. */
1558 for (sepdirs1 = 0; s1[i]; i++)
1561 /* Now, construct the file as of:
1562 - ../ repeated sepdirs1 time
1563 - all the non-mutual directories of S2. */
1564 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1565 for (i = 0; i < sepdirs1; i++)
1566 memcpy (res + 3 * i, "../", 3);
1567 strcpy (res + 3 * i, s2 + cnt);
1571 /* Add URL to the head of the list L. */
1573 add_url (urlpos *l, const char *url, const char *file)
/* Build a fresh, zeroed node holding copies of URL and FILE. */
1577 t = (urlpos *)xmalloc (sizeof (urlpos));
1578 memset (t, 0, sizeof (*t));
1579 t->url = xstrdup (url);
1580 t->local_name = xstrdup (file);
1586 /* Remembers which files have been downloaded. Should be called with
1587 add_or_check == ADD_FILE for each file we actually download successfully
1588 (i.e. not for ones we have failures on or that we skip due to -N). If you
1589 just want to check if a file has been previously added without adding it,
1590 call with add_or_check == CHECK_FOR_FILE. Please be sure to call this
1591 function with local filenames, not remote URLs -- by some means that isn't
1592 commented well enough for me understand, multiple remote URLs can apparently
1593 correspond to a single local file. */
1595 downloaded_file (downloaded_file_t add_or_check, const char* file)
1597 boolean found_file = FALSE;
/* Process-lifetime registry of downloaded local filenames; deliberately
   never freed. */
1598 static slist* downloaded_files = NULL;
1599 slist* rover = downloaded_files;
/* Linear scan for FILE in the list. */
1601 while (rover != NULL)
1602 if (strcmp(rover->string, file) == 0)
1608 rover = rover->next;
1611 return TRUE; /* file had already been downloaded */
/* Not found: record it when ADD_FILE was requested. */
1614 if (add_or_check == ADD_FILE)
1616 rover = malloc(sizeof(slist));
1617 rover->string = xstrdup(file); /* die on out-of-mem. */
1618 rover->next = downloaded_files;
1619 downloaded_files = rover;
1622 return FALSE; /* file had not already been downloaded */