2 Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
47 /* Default port definitions */
48 #define DEFAULT_HTTP_PORT 80
49 #define DEFAULT_FTP_PORT 21
51 /* URL separator (for findurl) */
/* Characters that terminate a URL candidate when scanning raw buffers
   in findurl().  */
52 #define URL_SEPARATOR "!\"#'(),>`{}|<>"
54 /* A list of unsafe characters for encoding, as per RFC1738. '@' and
55 ':' (not listed in RFC) were added because of user/password
/* NOTE(review): the #ifdef WINDOWS / #else / #endif lines selecting
   between these two variants are elided in this fragment — the first
   (stricter) set presumably applies on Windows; confirm against the
   full source.  */
59 # define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
61 # define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
/* A character is "unsafe" if it is a control char or space (0..32) or
   one of the reserved characters above.
   NOTE(review): if `c' is a negative signed char, the range test is
   false and the raw value is passed to strchr — presumably benign on
   the platforms of the day, but worth confirming.  */
64 #define UNSAFE_CHAR(c) (((c) >= 0 && (c) <= 32) \
65 || strchr (URL_UNSAFE_CHARS, c))
67 /* If S contains unsafe characters, free it and replace it with a
68 version that doesn't. */
/* NOTE(review): multi-statement macro body is truncated in this
   fragment; the do/while(0) closing lines and the free/reassign of S
   are elided.  */
69 #define URL_CLEANSE(s) do \
71 if (contains_unsafe (s)) \
73 char *uc_tmp = encode_string (s); \
79 /* Is a directory "."? */
80 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
81 /* Is a directory ".."? */
82 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
84 /* NULL-terminated list of strings to be recognized as prototypes (URL
85 schemes). Note that recognized doesn't mean supported -- only HTTP
86 and FTP are currently supported.
88 However, a string that does not match anything in the list will be
89 considered a relative URL. Thus it's important that this list has
90 anything anyone could think of being legal.
92 There are wild things here. :-) Take a look at
93 <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
/* NOTE(review): the table initializer itself (dozens of scheme
   strings ending in a NULL sentinel) is elided in this fragment.  */
95 static char *protostrings[] =
137 /* Similar to former, but for supported protocols: */
/* Each entry maps a scheme prefix to its uerr_t proto tag and default
   port; searched linearly with ARRAY_SIZE elsewhere in this file.  */
138 static struct proto sup_protos[] =
140 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
141 { "ftp://", URLFTP, DEFAULT_FTP_PORT },
142 /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
/* Forward declarations for the static helpers defined below.  */
145 static void parse_dir PARAMS ((const char *, char **, char **));
146 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
147 static char *construct PARAMS ((const char *, const char *, int , int));
148 static char *construct_relative PARAMS ((const char *, const char *));
149 static char process_ftp_type PARAMS ((char *));
/* Return the number of characters to be skipped if the first thing in
   a URL is the redundant "URL:" marker -- 0 if absent, otherwise 4
   plus however many spaces follow it.  The comparison is
   case-insensitive ("url:" matches too).  */
int
skip_url (const char *url)
{
  int i;

  if (TOUPPER (url[0]) == 'U'
      && TOUPPER (url[1]) == 'R'
      && TOUPPER (url[2]) == 'L'
      && url[3] == ':')
    {
      /* Skip the optional whitespace following "URL:".  */
      for (i = 4; url[i] && ISSPACE (url[i]); i++);
      return i;
    }
  else
    return 0;
}
/* Return 1 if the string contains at least one character classified
   as unsafe by UNSAFE_CHAR (controls, space, or the RFC1738 reserved
   set), 0 otherwise.  */
int
contains_unsafe (const char *s)
{
  for (; *s; s++)
    if (UNSAFE_CHAR (*s))
      return 1;
  return 0;
}
/* Decode the forms %xy in a URL, in place, to the character whose
   hexadecimal code is xy.  X and Y are hex digits (case-insensitive);
   if either is missing or not a hex digit -- including `%' right
   before the terminating '\0' -- the sequence is copied through
   verbatim.  The decoded string is never longer than the input, so
   in-place rewriting via a trailing write pointer is safe.  */
void
decode_string (char *s)
{
  char *p = s;

  for (; *s; s++, p++)
    {
      if (*s != '%')
	*p = *s;
      else
	{
	  /* Do nothing if at the end of the string, or if the chars
	     are not hex-digits.  */
	  if (!*(s + 1) || !*(s + 2)
	      || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
	    *p = *s;
	  else
	    {
	      *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
	      s += 2;       /* Consumed the two hex digits as well.  */
	    }
	}
    }
  *p = '\0';
}
/* Encode the unsafe characters (as classified by UNSAFE_CHAR) in a
   given string, returning a freshly xmalloc-ed %XX-encoded copy.
   Two passes: the first measures the result (each unsafe byte costs
   two extra characters), the second writes it.  The caller owns and
   must free the returned string.  */
char *
encode_string (const char *s)
{
  const char *b;
  char *p, *res;
  int i;

  b = s;
  /* First pass: compute the encoded length.  */
  for (i = 0; *s; s++, i++)
    if (UNSAFE_CHAR (*s))
      i += 2;		/* Two more characters (hex digits).  */
  res = (char *)xmalloc (i + 1);
  s = b;
  /* Second pass: emit either "%XY" or the byte itself.  */
  for (p = res; *s; s++)
    if (UNSAFE_CHAR (*s))
      {
	const unsigned char c = *s;
	*p++ = '%';
	*p++ = HEXD2ASC (c >> 4);
	*p++ = HEXD2ASC (c & 0xf);
      }
    else
      *p++ = *s;
  *p = '\0';
  return res;
}
244 /* Returns the proto-type if URL's protocol is supported, or
245 URLUNKNOWN if not. */
/* NOTE(review): fragment -- the return type line, the declarations,
   and several branch/return lines are elided.  The visible logic:
   first match the URL against the supported-scheme table, then, if a
   colon follows the host part, decide between a port number (HTTP)
   and an FTP-style `host:path' form by inspecting the characters
   after the colon.  */
247 urlproto (const char *url)
251 url += skip_url (url);
252 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
253 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
254 return sup_protos[i].ind;
255 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
/* Presumably inside an `if (url[i] == ':')' branch (elided): scan the
   characters after the colon for non-digits.  */
258 for (++i; url[i] && url[i] != '/'; i++)
259 if (!ISDIGIT (url[i]))
261 if (url[i - 1] == ':')
270 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
271 part is found, returns 0. */
/* NOTE(review): fragment -- the return type, the `char **s'
   declaration, and the length bookkeeping after a match are elided.
   Matches URL against each recognized scheme prefix,
   case-insensitively.  */
273 skip_proto (const char *url)
278 for (s = protostrings; *s; s++)
279 if (!strncasecmp (*s, url, strlen (*s)))
284 /* HTTP and FTP protocols are expected to yield exact host names
285 (i.e. the `//' part must be skipped, too). */
286 if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
291 /* Returns 1 if the URL begins with a protocol (supported or
292 unsupported), 0 otherwise. */
294 has_proto (const char *url)
298 url += skip_url (url);
299 for (s = protostrings; *s; s++)
300 if (strncasecmp (url, *s, strlen (*s)) == 0)
305 /* Skip the username and password, if present here. The function
306 should be called *not* with the complete URL, but with the part
307 right after the protocol.
309 If no username and password are found, return 0. */
/* NOTE(review): fragment -- the return type, the `const char *p'
   declaration, and the returns are elided.  Scans up to the first
   `/' looking for a `@'; on success presumably returns the offset
   just past the `@'.  */
311 skip_uname (const char *url)
314 for (p = url; *p && *p != '/'; p++)
317 /* If a `@' was found before the first occurrence of `/', skip
325 /* Allocate a new urlinfo structure, fill it with default values and
326 return a pointer to it. */
332 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
333 memset (u, 0, sizeof (*u));
334 u->proto = URLUNKNOWN;
338 /* Perform a "deep" free of the urlinfo structure. The structure
339 should have been created with newurl, but need not have been used.
340 If free_pointer is non-0, free the pointer itself. */
342 freeurl (struct urlinfo *u, int complete)
346 FREE_MAYBE (u->host);
347 FREE_MAYBE (u->path);
348 FREE_MAYBE (u->file);
350 FREE_MAYBE (u->user);
351 FREE_MAYBE (u->passwd);
352 FREE_MAYBE (u->local);
353 FREE_MAYBE (u->referer);
355 freeurl (u->proxy, 1);
361 /* Extract the given URL of the form
362 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
363 1. hostname (terminated with `/' or `:')
364 2. port number (terminated with `/'), or chosen for the protocol
365 3. dirname (everything after hostname)
366 Most errors are handled. No allocation is done, you must supply
367 pointers to allocated memory.
368 ...and a host of other stuff :-)
370 - Recognizes hostname:dir/file for FTP and
371 hostname (:portnum)?/dir/file for HTTP.
372 - Parses the path to yield directory and file
373 - Parses the URL to yield the username and passwd (if present)
374 - Decodes the strings, in case they contain "forbidden" characters
375 - Writes the result to struct urlinfo
377 If the argument STRICT is set, it recognizes only the canonical
/* NOTE(review): large fragment -- the return type, declarations of
   i/l/type/abs_ftp, braces, and many error-return lines are elided
   throughout this function.  */
380 parseurl (const char *url, struct urlinfo *u, int strict)
383 int recognizable; /* Recognizable URL is the one where
384 the protocol name was explicitly
385 named, i.e. it wasn't deduced from
389 DEBUGP (("parseurl (\"%s\") -> ", url));
390 url += skip_url (url);
391 recognizable = has_proto (url);
392 if (strict && !recognizable)
/* Match the URL against the supported-scheme table; L is left as the
   length of the matched scheme prefix (offset of what follows).  */
394 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
396 l = strlen (sup_protos[i].name);
397 if (!strncasecmp (sup_protos[i].name, url, l))
400 /* If protocol is recognizable, but unsupported, bail out, else
402 if (recognizable && !sup_protos[i].name)
404 else if (i == ARRAY_SIZE (sup_protos))
407 u->proto = type = sup_protos[i].ind;
409 if (type == URLUNKNOWN)
411 /* Allow a username and password to be specified (i.e. just skip
414 l += skip_uname (url + l);
415 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
418 /* Get the hostname. */
419 u->host = strdupdelim (url + l, url + i);
420 DEBUGP (("host %s -> ", u->host));
422 /* Assume no port has been given. */
426 /* We have a colon delimiting the hostname. It could mean that
427 a port number is following it, or a directory. */
428 if (ISDIGIT (url[++i])) /* A port number */
430 if (type == URLUNKNOWN)
431 u->proto = type = URLHTTP;
432 for (; url[i] && url[i] != '/'; i++)
433 if (ISDIGIT (url[i]))
434 u->port = 10 * u->port + (url[i] - '0')
435;
439 DEBUGP (("port %hu -> ", u->port));
441 else if (type == URLUNKNOWN) /* or a directory */
442 u->proto = type = URLFTP;
443 else /* or just a misformed port number */
446 else if (type == URLUNKNOWN)
447 u->proto = type = URLHTTP;
/* No explicit port: pick the scheme's default from the table.  */
451 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
452 if (sup_protos[i].ind == type)
454 if (i == ARRAY_SIZE (sup_protos))
456 u->port = sup_protos[i].port;
458 /* Some delimiter troubles... */
459 if (url[i] == '/' && url[i - 1] != ':')
462 while (url[i] && url[i] == '/')
464 u->path = (char *)xmalloc (strlen (url + i) + 8);
465 strcpy (u->path, url + i);
/* FTP only: strip and record a trailing ";type=X" specifier.  */
468 u->ftp_type = process_ftp_type (u->path);
469 /* #### We don't handle type `d' correctly yet. */
470 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
473 DEBUGP (("opath %s -> ", u->path));
474 /* Parse the username and password (if existing). */
475 parse_uname (url, &u->user, &u->passwd);
476 /* Decode the strings, as per RFC 1738. */
477 decode_string (u->host);
478 decode_string (u->path);
480 decode_string (u->user);
482 decode_string (u->passwd);
483 /* Parse the directory. */
484 parse_dir (u->path, &u->dir, &u->file);
485 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
486 /* Simplify the directory. */
487 path_simplify (u->dir);
488 /* Remove the leading `/' in HTTP. */
/* NOTE(review): strcpy with overlapping source and destination is
   undefined behavior per ISO C; memmove would be correct here.  */
489 if (type == URLHTTP && *u->dir == '/')
490 strcpy (u->dir, u->dir + 1);
491 DEBUGP (("ndir %s\n", u->dir));
492 /* Strip trailing `/'. */
494 if (l && u->dir[l - 1] == '/')
495 u->dir[l - 1] = '\0';
496 /* Re-create the path: */
497 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
498 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
499 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
500 strcpy (u->path, abs_ftp ? "%2F" : "/");
501 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
502 strcat (u->path, *u->dir ? "/" : "");
503 strcat (u->path, u->file);
504 URL_CLEANSE (u->path);
505 /* Create the clean URL. */
506 u->url = str_url (u, 0);
510 /* Build the directory and filename components of the path. Both
511 components are *separately* malloc-ed strings! It does not change
512 the contents of path.
514 If the path ends with "." or "..", they are (correctly) counted as
/* NOTE(review): fragment -- the `static void' line, declarations of
   i/l, and the brace structure are elided.  Three cases: bare
   filename, "/filename", and a nonempty directory.  */
517 parse_dir (const char *path, char **dir, char **file)
521 for (i = l = strlen (path); i && path[i] != '/'; i--);
522 if (!i && *path != '/') /* Just filename */
524 if (DOTP (path) || DDOTP (path))
/* "." and ".." are treated as directories, with an empty file part. */
526 *dir = xstrdup (path);
527 *file = xstrdup ("");
531 *dir = xstrdup (""); /* This is required because of FTP */
532 *file = xstrdup (path);
535 else if (!i) /* /filename */
537 if (DOTP (path + 1) || DDOTP (path + 1))
539 *dir = xstrdup (path);
540 *file = xstrdup ("");
544 *dir = xstrdup ("/");
545 *file = xstrdup (path + 1);
548 else /* Nonempty directory with or without a filename */
550 if (DOTP (path + i + 1) || DDOTP (path + i + 1))
552 *dir = xstrdup (path);
553 *file = xstrdup ("");
557 *dir = strdupdelim (path, path + i);
558 *file = strdupdelim (path + i + 1, path + l + 1);
563 /* Find the optional username and password within the URL, as per
564 RFC1738. The returned user and passwd char pointers are
/* NOTE(review): fragment -- the `static uerr_t' line, declarations of
   l/p/col/where, initialization of *user/*passwd to NULL, and the
   returns are elided.  */
567 parse_uname (const char *url, char **user, char **passwd)
575 url += skip_url (url);
576 /* Look for end of protocol string. */
577 l = skip_proto (url);
580 /* Add protocol offset. */
582 /* Is there an `@' character? */
583 for (p = url; *p && *p != '/'; p++)
586 /* If not, return. */
589 /* Else find the username and password. */
/* COL tracks the start of the current component; when a `:' is seen
   the username part is captured and COL presumably advances past it. */
590 for (p = col = url; *p != '@'; p++)
592 if (*p == ':' && !*user)
594 *user = (char *)xmalloc (p - url + 1);
595 memcpy (*user, url, p - url);
596 (*user)[p - url] = '\0';
600 /* Decide whether you have only the username or both. */
601 where = *user ? passwd : user;
602 *where = (char *)xmalloc (p - col + 1);
603 memcpy (*where, col, p - col);
604 (*where)[p - col] = '\0';
/* If PATH ends with `;type=X', truncate that suffix off PATH in place
   and return the character X; otherwise return '\0' and leave PATH
   untouched.  The length guard keeps the memcmp from reading before
   the start of short paths.  */
static char
process_ftp_type (char *path)
{
  int len = strlen (path);

  if (len >= 7
      && !memcmp (path + len - 7, ";type=", 6))
    {
      path[len - 7] = '\0';
      /* The type character is still in the buffer past the new
	 terminator.  */
      return path[len - 1];
    }
  else
    return '\0';
}
624 /* Return the URL as fine-formed string, with a proper protocol, port
625 number, directory and optional user/password. If HIDE is non-zero,
626 password will be hidden. The forbidden characters in the URL will
/* NOTE(review): fragment -- the return type, braces, the lh/ld/lf
   length assignments, several `l += ...' updates, the separator
   characters written between components, and the frees/return are
   elided.  The result is assembled piecewise with memcpy into a
   single xmalloc-ed buffer.  */
629 str_url (const struct urlinfo *u, int hide)
631 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
632 int i, l, ln, lu, lh, lp, lf, ld;
634 /* Look for the protocol name. */
635 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
636 if (sup_protos[i].ind == u->proto)
638 if (i == ARRAY_SIZE (sup_protos))
640 proto_name = sup_protos[i].name;
641 host = CLEANDUP (u->host);
642 dir = CLEANDUP (u->dir);
643 file = CLEANDUP (u->file);
644 user = passwd = NULL;
646 user = CLEANDUP (u->user);
650 passwd = CLEANDUP (u->passwd);
/* With HIDE set, presumably each password character is overwritten
   (e.g. with 'x') in this loop -- body elided.  */
652 for (i = 0; passwd[i]; i++)
/* An absolute FTP directory gets its leading `/' re-encoded as %2F.  */
655 if (u->proto == URLFTP && *dir == '/')
657 char *tmp = (char *)xmalloc (strlen (dir) + 3);
658 /*sprintf (tmp, "%%2F%s", dir + 1);*/
662 strcpy (tmp + 3, dir + 1);
667 ln = strlen (proto_name);
668 lu = user ? strlen (user) : 0;
669 lp = passwd ? strlen (passwd) : 0;
673 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
674 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
675 (user ? user : ""), (passwd ? ":" : ""),
676 (passwd ? passwd : ""), (user ? "@" : ""),
677 host, u->port, dir, *dir ? "/" : "", file); */
679 memcpy (res, proto_name, ln);
683 memcpy (res + l, user, lu);
688 memcpy (res + l, passwd, lp);
693 memcpy (res + l, host, lh);
696 long_to_string (res + l, (long)u->port);
697 l += numdigit (u->port);
699 memcpy (res + l, dir, ld);
703 strcpy (res + l, file);
712 /* Check whether two URL-s are equivalent, i.e. pointing to the same
713 location. Uses parseurl to parse them, and compares the canonical
716 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
717 return 0 on error. */
/* NOTE(review): fragment -- the return type, `uerr_t err' and `int
   res' declarations, the newurl() calls, the error-path freeurl()s,
   and the final cleanup/return are elided.  */
719 url_equal (const char *url1, const char *url2)
721 struct urlinfo *u1, *u2;
726 err = parseurl (url1, u1, 0);
733 err = parseurl (url2, u2, 0);
/* Equality is decided by comparing the canonicalized u->url forms.  */
739 res = !strcmp (u1->url, u2->url);
745 /* Find URL of format scheme:hostname[:port]/dir in a buffer. The
746 buffer may contain pretty much anything; no errors are signaled. */
/* NOTE(review): fragment -- the return type, s1/s2/prot declarations,
   and the return statements are elided.  Scans the buffer for any
   recognized scheme prefix, then measures the URL's extent in *COUNT.
   NOTE(review): `howmuch <= strlen (*prot)' compares signed int to
   size_t -- presumably benign for positive HOWMUCH, but worth
   confirming; also `*s2 >= 32' assumes non-negative char values.  */
748 findurl (const char *buf, int howmuch, int *count)
753 for (s1 = buf; howmuch; s1++, howmuch--)
754 for (prot = protostrings; *prot; prot++)
755 if (howmuch <= strlen (*prot))
757 else if (!strncasecmp (*prot, s1, strlen (*prot)))
/* Extend the match until a separator, control char, or space.  */
759 for (s2 = s1, *count = 0;
760 howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
761 !strchr (URL_SEPARATOR, *s2);
762 s2++, (*count)++, howmuch--);
768 /* Scans the file for signs of URL-s. Returns a vector of pointers,
769 each pointer representing a URL string. The file is *not* assumed
/* NOTE(review): fragment -- the return type (urlpos *), fp/buf/pbuf/
   nread/size declarations, fclose calls, list-linking statements, the
   free of BUF, and the return are elided.  HYPHENP presumably detects
   "-" meaning stdin.  */
772 get_urls_file (const char *file)
779 urlpos *first, *current, *old;
781 if (file && !HYPHENP (file))
783 fp = fopen (file, "rb");
786 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
/* Slurp the whole file (or stdin) into BUF.  */
793 load_file (fp, &buf, &nread);
794 if (file && !HYPHENP (file))
796 DEBUGP (("Loaded %s (size %ld).\n", file, nread));
797 first = current = NULL;
798 /* Fill the linked list with URLs. */
799 for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
802 /* Allocate the space. */
804 current = (urlpos *)xmalloc (sizeof (urlpos));
807 memset (current, 0, sizeof (*current));
808 current->next = NULL;
809 current->url = (char *)xmalloc (size + 1);
810 memcpy (current->url, pbuf, size);
811 current->url[size] = '\0';
815 /* Free the buffer. */
821 /* Similar to get_urls_file, but for HTML files. FILE is scanned as
822 an HTML document using htmlfindurl(), which see. get_urls_html()
823 constructs the HTML-s from the relative href-s.
825 If SILENT is non-zero, do not barf on baseless relative links. */
/* NOTE(review): large fragment -- the return type (urlpos *), many
   declarations (fp, orig_buf, buf, nread, size, no_proto, base,
   cbase, constr, i), braces, frees, and the trailing return are
   elided throughout.  */
827 get_urls_html (const char *file, const char *this_url, int silent)
833 int step, first_time;
834 urlpos *first, *current, *old;
836 if (file && !HYPHENP (file))
838 fp = fopen (file, "rb");
841 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
848 load_file (fp, &orig_buf, &nread);
849 if (file && !HYPHENP (file))
851 DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
852 first = current = NULL;
854 /* Iterate over the URLs in BUF, picked by htmlfindurl(). */
856 (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time));
861 const char *pbuf = buf;
867 /* A frequent phenomenon that needs to be handled are pages
868 generated by brain-damaged HTML generators, which refer to to
869 URI-s as <a href="<spaces>URI<spaces>">. We simply ignore
870 any spaces at the beginning or at the end of the string.
871 This is probably not strictly correct, but that's what the
872 browsers do, so we may follow. May the authors of "WYSIWYG"
873 HTML tools burn in hell for the damage they've inflicted! */
874 while ((pbuf < buf + step) && ISSPACE (*pbuf))
879 while (size && ISSPACE (pbuf[size - 1]))
/* Decide whether this link carries any recognized scheme at all.  */
884 for (i = 0; protostrings[i]; i++)
886 if (!strncasecmp (protostrings[i], pbuf,
887 MINVAL (strlen (protostrings[i]), size)))
890 /* Check for http:RELATIVE_URI. See below for details. */
892 && !(strncasecmp (pbuf, "http:", 5) == 0
893 && strncasecmp (pbuf, "http://", 7) != 0))
900 /* This is for extremely brain-damaged pages that refer to
901 relative URI-s as <a href="http:URL">. Just strip off the
902 silly leading "http:" (as well as any leading blanks
904 if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
905 pbuf += 5, size -= 5;
/* A scheme is present: verify it is one we actually support.  */
909 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
911 if (!strncasecmp (sup_protos[i].name, pbuf,
912 MINVAL (strlen (sup_protos[i].name), size)))
915 /* Do *not* accept a non-supported protocol. */
916 if (i == ARRAY_SIZE (sup_protos))
921 /* First, construct the base, which can be relative itself.
923 Criteria for creating the base are:
924 1) html_base created by <base href="...">
926 3) base provided from the command line */
927 cbase = html_base ();
931 cbase = opt.base_href;
932 if (!cbase) /* Error condition -- a baseless
935 if (!opt.quiet && !silent)
937 /* Use malloc, not alloca because this is called in
/* NOTE(review): plain malloc result is used unchecked here --
   presumably tolerated for a diagnostic-only buffer; confirm.  */
939 char *temp = (char *)malloc (size + 1);
940 strncpy (temp, pbuf, size);
942 logprintf (LOG_NOTQUIET,
943 _("Error (%s): Link %s without a base provided.\n"),
950 base = construct (this_url, cbase, strlen (cbase),
954 /* Base must now be absolute, with host name and
956 if (!has_proto (cbase))
958 logprintf (LOG_NOTQUIET, _("\
959 Error (%s): Base %s relative, without referer URL.\n"),
963 base = xstrdup (cbase);
/* Resolve the (possibly relative) link against the base.  */
965 constr = construct (base, pbuf, size, no_proto);
970 constr = (char *)xmalloc (size + 1);
971 strncpy (constr, pbuf, size);
981 /* Use malloc, not alloca because this is called in a loop. */
982 tmp = (char *)xmalloc (size + 1);
983 strncpy (tmp, pbuf, size);
985 logprintf (LOG_ALWAYS,
986 "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
987 file, this_url ? this_url : "(null)",
988 tmp2 ? tmp2 : "(null)", tmp, constr);
993 /* Allocate the space. */
995 current = (urlpos *)xmalloc (sizeof (urlpos));
1000 /* Fill the values. */
1001 memset (current, 0, sizeof (*current));
1002 current->next = NULL;
1003 current->url = constr;
1004 current->size = size;
1005 current->pos = pbuf - orig_buf;
1006 /* A URL is relative if the host and protocol are not named,
1007 and the name does not start with `/'. */
1008 if (no_proto && *pbuf != '/')
1009 current->flags |= (URELATIVE | UNOPROTO);
1011 current->flags |= UNOPROTO;
1018 /* Free the linked list of urlpos. */
1020 free_urlpos (urlpos *l)
1024 urlpos *next = l->next;
1026 FREE_MAYBE (l->local_name);
1032 /* Rotate FNAME opt.backups times */
/* NOTE(review): fragment -- the return type, `int i', `struct stat
   sb', braces, the rename() calls for both the loop and the final
   fname -> fname.1 step, and the early return for non-regular files
   are elided.  Shifts fname.1 .. fname.(backups-1) up by one, then
   moves fname itself to fname.1.  */
1034 rotate_backups(const char *fname)
1036 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1037 char *from = (char *)alloca (maxlen);
1038 char *to = (char *)alloca (maxlen);
1042 if (stat (fname, &sb) == 0)
1043 if (S_ISREG (sb.st_mode) == 0)
1046 for (i = opt.backups; i > 1; i--)
1048 sprintf (from, "%s.%d", fname, i - 1);
1049 sprintf (to, "%s.%d", fname, i);
1050 /* #### This will fail on machines without the rename() system
1055 sprintf (to, "%s.%d", fname, 1);
1059 /* Create all the necessary directories for PATH (a file). Calls
1060 mkdirhier() internally. */
/* NOTE(review): fragment -- the return type, `char *t', `const char
   *p', `struct stat st', `int res', the free(t)/returns, and the
   unlink of a blocking plain file are elided.  */
1062 mkalldirs (const char *path)
/* Find the last `/' to split off the directory portion of PATH.  */
1069 p = path + strlen (path);
1070 for (; *p != '/' && p != path; p--);
1071 /* Don't create if it's just a file. */
1072 if ((p == path) && (*p != '/'))
1074 t = strdupdelim (path, p);
1075 /* Check whether the directory exists. */
1076 if ((stat (t, &st) == 0))
1078 if (S_ISDIR (st.st_mode))
1085 /* If the dir exists as a file name, remove it first. This
1086 is *only* for Wget to work with buggy old CERN http
1087 servers. Here is the scenario: When Wget tries to
1088 retrieve a directory without a slash, e.g.
1089 http://foo/bar (bar being a directory), CERN server will
1090 not redirect it too http://foo/bar/ -- it will generate a
1091 directory listing containing links to bar/file1,
1092 bar/file2, etc. Wget will lose because it saves this
1093 HTML listing to a file `bar', so it cannot create the
1094 directory. To work around this, if the file of the same
1095 name exists, we just remove it and create the directory
1097 DEBUGP (("Removing %s because of directory danger!\n", t));
1101 res = make_directory (t);
1103 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Return the number of `/' characters in S.  */
static int
count_slashes (const char *s)
{
  int i = 0;

  while (*s)
    if (*s++ == '/')
      ++i;
  return i;
}
1118 /* Return the path name of the URL-equivalent file name, with a
1119 remote-like structure of directories. */
/* NOTE(review): fragment -- the return type (char *), `int l',
   braces, the --cut-dirs loop body, the free of the old host after
   realhost(), and the free/return at the end are elided.  */
1121 mkstruct (const struct urlinfo *u)
1123 char *host, *dir, *file, *res, *dirpref;
1126 assert (u->dir != NULL);
1127 assert (u->host != NULL);
/* Honor --cut-dirs: drop the first CUT path components of u->dir.  */
1131 char *ptr = u->dir + (*u->dir == '/');
1132 int slash_count = 1 + count_slashes (ptr);
1133 int cut = MINVAL (opt.cut_dirs, slash_count);
1134 for (; cut && *ptr; ptr++)
1137 STRDUP_ALLOCA (dir, ptr);
1140 dir = u->dir + (*u->dir == '/');
1142 host = xstrdup (u->host);
1143 /* Check for the true name (or at least a consistent name for saving
1144 to directory) of HOST, reusing the hlist if possible. */
1145 if (opt.add_hostdir && !opt.simple_check)
1147 char *nhost = realhost (host);
1151 /* Add dir_prefix and hostname (if required) to the beginning of
1153 if (opt.add_hostdir)
1155 if (!DOTP (opt.dir_prefix))
1157 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1158 + strlen (host) + 1);
1159 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
1162 STRDUP_ALLOCA (dirpref, host);
1164 else /* not add_hostdir */
1166 if (!DOTP (opt.dir_prefix))
1167 dirpref = opt.dir_prefix;
1173 /* If there is a prefix, prepend it. */
1176 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1177 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1180 dir = xstrdup (dir);
1183 if (l && dir[l - 1] == '/')
/* A URL with an empty file part maps to "index.html".  */
1187 file = "index.html";
1191 /* Finally, construct the full name. */
1192 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1193 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1198 /* Create a unique filename, corresponding to a given URL. Calls
1199 mkstruct if necessary. Does *not* actually create any directories. */
/* NOTE(review): fragment -- the return type (char *), `char *file,
   *name, *p', the opt.dirstruct branch selection, the free of the
   un-prefixed name, the WINDOWS `%' -> `@' (presumably) substitution
   body, and the returns are elided.  */
1201 url_filename (const struct urlinfo *u)
1204 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1208 file = mkstruct (u);
1214 file = xstrdup ("index.html");
1216 file = xstrdup (u->file);
1221 /* Check whether the prefix directory is something other than "."
1222 before prepending it. */
1223 if (!DOTP (opt.dir_prefix))
1225 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1226 + 1 + strlen (file) + 1);
1227 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1232 /* DOS-ish file systems don't like `%' signs in them; we change it
1237 for (p = file; *p; p++)
1241 #endif /* WINDOWS */
1243 /* Check the cases in which the unique extensions are not used:
1244 1) Clobbering is turned off (-nc).
1245 2) Retrieval with regetting.
1246 3) Timestamping is used.
1247 4) Hierarchy is built.
1249 The exception is the case when file does exist and is a
1250 directory (actually support for bad httpd-s). */
1251 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1252 && !(file_exists_p (file) && !file_non_directory_p (file)))
1255 /* Find a unique name. */
1256 name = unique_name (file);
1261 /* Construct an absolute URL, given a (possibly) relative one. This
1262 is more tricky than it might seem, but it works. */
/* NOTE(review): fragment -- the `static char *' line, declarations of
   i/constr/fl/t, braces, the dispatch on NO_PROTO and on whether SUB
   starts with `/', and the return are elided.  Three visible cases:
   relative SUB appended after URL's last `/'; host-absolute SUB
   (leading `/') appended after URL's host part; and SUB already
   carrying its own protocol, returned as-is.  */
1264 construct (const char *url, const char *sub, int subsize, int no_proto)
/* Find the last `/' in URL: the directory context for a relative SUB. */
1274 for (i = strlen (url); i && url[i] != '/'; i--);
1275 if (!i || (url[i] == url[i - 1]))
1277 int l = strlen (url);
1278 char *t = (char *)alloca (l + 2);
1285 constr = (char *)xmalloc (i + 1 + subsize + 1);
1286 strncpy (constr, url, i + 1);
1287 constr[i + 1] = '\0';
1288 strncat (constr, sub, subsize);
1290 else /* *sub == `/' */
1297 for (; url[i] && url[i] != '/'; i++);
/* FL is non-zero for the `//' of a scheme, meaning the host part
   still follows and must be skipped too.  */
1300 fl = (url[i] == url[i + 1] && url[i + 1] == '/');
1307 int l = strlen (url);
1308 char *t = (char *)alloca (l + 2);
1314 constr = (char *)xmalloc (i + 1 + subsize + 1);
1315 strncpy (constr, url, i);
1317 strncat (constr + i, sub, subsize);
1318 constr[i + subsize] = '\0';
1321 else /* !no_proto */
1323 constr = (char *)xmalloc (subsize + 1);
1324 strncpy (constr, sub, subsize);
1325 constr[subsize] = '\0';
1330 /* Optimize URL by host, destructively replacing u->host with realhost
1331 (u->host). Do this regardless of opt.simple_check. */
1333 opt_url (struct urlinfo *u)
1335 /* Find the "true" host. */
1336 char *host = realhost (u->host);
1339 assert (u->dir != NULL); /* the URL must have been parsed */
1340 /* Refresh the printed representation. */
1342 u->url = str_url (u, 0);
1345 /* Returns proxy host address, in accordance with PROTO. */
1347 getproxy (uerr_t proto)
1349 if (proto == URLHTTP)
1350 return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1351 else if (proto == URLFTP)
1352 return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns non-zero ("use the proxy") when NO_PROXY is unset or HOST
   matches none of its suffixes.  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  if (!no_proxy)
    return 1;
  else
    return !sufmatch (no_proxy, host);
}
1367 /* Change the links in an HTML document. Accepts a structure that
1368 defines the positions of all the links. */
/* NOTE(review): large fragment -- the return type, fp/buf/size/p/p2
   declarations, braces, fclose calls, the putc() copy loops' bodies,
   and the trailing free(buf) are elided throughout.  */
1370 convert_links (const char *file, urlpos *l)
1376 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1377 /* Read from the file.... */
1378 fp = fopen (file, "rb");
1381 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1382 file, strerror (errno));
1385 /* ...to a buffer. */
1386 load_file (fp, &buf, &size);
1388 if (opt.backup_converted && downloaded_file(CHECK_FOR_FILE, file))
1389 /* Rather than just writing over the original .html file with the converted
1390 version, save the former to *.orig. Note we only do this for files we've
1391 _successfully_ downloaded, so we don't clobber .orig files sitting around
1392 from previous invocations. */
1394 /* Construct the backup filename as the original name plus ".orig". */
1395 size_t filename_len = strlen(file);
/* NOTE(review): plain malloc, result used below without a NULL check
   -- inconsistent with the xmalloc convention used elsewhere.  */
1396 char* filename_plus_orig_suffix = malloc(filename_len +
1398 boolean already_wrote_backup_file = FALSE;
1399 slist* converted_file_ptr;
1400 static slist* converted_files = NULL;
1402 /* Would a single s[n]printf() call be faster? */
1403 strcpy(filename_plus_orig_suffix, file);
1404 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1406 /* We can get called twice on the same URL thanks to the
1407 convert_all_links() call in main(). If we write the .orig file each
1408 time in such a case, it'll end up containing the first-pass conversion,
1409 not the original file. So, see if we've already been called on this
1411 converted_file_ptr = converted_files;
1412 while (converted_file_ptr != NULL)
1413 if (strcmp(converted_file_ptr->string, file) == 0)
1415 already_wrote_backup_file = TRUE;
1419 converted_file_ptr = converted_file_ptr->next;
1421 if (!already_wrote_backup_file)
1423 /* Rename <file> to <file>.orig before former gets written over. */
1424 if (rename(file, filename_plus_orig_suffix) != 0)
1425 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1426 file, filename_plus_orig_suffix, strerror (errno));
1428 /* Remember that we've already written a .orig backup for this file.
1429 Note that we never free this memory since we need it till the
1430 convert_all_links() call, which is one of the last things the
1431 program does before terminating. BTW, I'm not sure if it would be
1432 safe to just set 'converted_file_ptr->string' to 'file' below,
1433 rather than making a copy of the string... Another note is that I
1434 thought I could just add a field to the urlpos structure saying
1435 that we'd written a .orig file for this URL, but that didn't work,
1436 so I had to make this separate list. */
1437 converted_file_ptr = malloc(sizeof(slist));
1438 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1439 converted_file_ptr->next = converted_files;
1440 converted_files = converted_file_ptr;
1443 free(filename_plus_orig_suffix);
1445 /* Now open the file for writing. */
1446 fp = fopen (file, "wb");
1449 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1450 file, strerror (errno));
1454 /* [If someone understands why multiple URLs can correspond to one local file,
1455 can they please add a comment here...?] */
1456 for (p = buf; l; l = l->next)
1460 DEBUGP (("Something strange is going on. Please investigate."));
1463 /* If the URL already is relative or it is not to be converted
1464 for some other reason (e.g. because of not having been
1465 downloaded in the first place), skip it. */
1466 if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
1468 DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
1472 /* Else, reach the position of the offending URL, echoing
1473 everything up to it to the outfile. */
1474 for (p2 = buf + l->pos; p < p2; p++)
1476 if (l->flags & UABS2REL)
/* Replace the absolute URL in the output with a relative one
   computed from the two local file names.  */
1478 char *newname = construct_relative (file, l->local_name);
1479 fprintf (fp, "%s", newname);
1480 DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
1481 l->url, newname, l->pos, file));
/* Copy the remainder of the buffer after the last converted link.  */
1488 for (p2 = buf + size; p < p2; p++)
1493 logputs (LOG_VERBOSE, _("done.\n"));
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int i, cnt, sepdirs1;
  char *res;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  i = cnt = 0;
  /* Skip the directories common to both strings.  */
  while (1)
    {
      while (s1[i] && s2[i]
	     && (s1[i] == s2[i])
	     && (s1[i] != '/')
	     && (s2[i] != '/'))
	++i;
      if (s1[i] == '/' && s2[i] == '/')
	cnt = ++i;	/* CNT marks the start past the shared prefix.  */
      else
	break;
    }
  /* Count the directory components remaining in S1; each one needs a
     "../" to climb out of.  */
  for (sepdirs1 = 0; s1[i]; i++)
    if (s1[i] == '/')
      ++sepdirs1;
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
  return res;
}
1550 /* Add URL to the head of the list L. */
1552 add_url (urlpos *l, const char *url, const char *file)
1556 t = (urlpos *)xmalloc (sizeof (urlpos));
1557 memset (t, 0, sizeof (*t));
1558 t->url = xstrdup (url);
1559 t->local_name = xstrdup (file);
1565 /* Remembers which files have been downloaded. Should be called with
1566 add_or_check == ADD_FILE for each file we actually download successfully
1567 (i.e. not for ones we have failures on or that we skip due to -N). If you
1568 just want to check if a file has been previously added without adding it,
1569 call with add_or_check == CHECK_FOR_FILE. Please be sure to call this
1570 function with local filenames, not remote URLs -- by some means that isn't
1571 commented well enough for me understand, multiple remote URLs can apparently
1572 correspond to a single local file. */
/* NOTE(review): fragment -- the return type (boolean), braces, the
   loop-break on a match, and intermediate returns are elided.  Uses a
   function-static linked list, so the memory intentionally persists
   for the life of the process.  */
1574 downloaded_file (downloaded_file_t add_or_check, const char* file)
1576 boolean found_file = FALSE;
1577 static slist* downloaded_files = NULL;
1578 slist* rover = downloaded_files;
1580 while (rover != NULL)
1581 if (strcmp(rover->string, file) == 0)
1587 rover = rover->next;
1590 return TRUE; /* file had already been downloaded */
/* Not seen before: record it (prepend) only when asked to ADD.
   NOTE(review): plain malloc result used unchecked, unlike the
   xmalloc convention used elsewhere in this file.  */
1593 if (add_or_check == ADD_FILE)
1595 rover = malloc(sizeof(slist));
1596 rover->string = xstrdup(file); /* die on out-of-mem. */
1597 rover->next = downloaded_files;
1598 downloaded_files = rover;
1601 return FALSE; /* file had not already been downloaded */