   Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.

   This file is part of Wget.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#include <sys/types.h>

/* Default port definitions */
#define DEFAULT_HTTP_PORT 80
#define DEFAULT_FTP_PORT 21

/* URL separator (for findurl) */
#define URL_SEPARATOR "!\"#'(),>`{}|<>"
/* A list of unsafe characters for encoding, as per RFC1738.  '@' and
   ':' (not listed in the RFC) were added because of user/password
   encoding.  */
# define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"

#define UNSAFE_CHAR(c) (   ((unsigned char)(c) <= ' ')  /* ASCII 32 */  \
                        || ((unsigned char)(c) > '~')   /* ASCII 127 */ \
                        || strchr (URL_UNSAFE_CHARS, c))
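/* For illustration, with either definition of URL_UNSAFE_CHARS above
   the macro behaves roughly like this:

     UNSAFE_CHAR (' ')   =>  non-zero   (control characters and space, <= ASCII 32)
     UNSAFE_CHAR ('<')   =>  non-zero   (listed in URL_UNSAFE_CHARS)
     UNSAFE_CHAR ('a')   =>  0          (printable and not listed)  */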
/* If S contains unsafe characters, free it and replace it with a
   version that doesn't.  */
#define URL_CLEANSE(s) do                       \
  if (contains_unsafe (s))                      \
      char *uc_tmp = encode_string (s);         \

/* Is a directory "."?  */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* Is a directory ".."?  */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* NULL-terminated list of strings to be recognized as prototypes (URL
   schemes).  Note that recognized doesn't mean supported -- only HTTP
   and FTP are currently supported.

   However, a string that does not match anything in the list will be
   considered a relative URL.  Thus it's important that this list
   contain anything anyone could think of as being legal.

   There are wild things here. :-)  Take a look at
   <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
   information.  */
static char *protostrings[] =

/* Similar to the former, but for supported protocols: */
static struct proto sup_protos[] =
  { "http://", URLHTTP, DEFAULT_HTTP_PORT },
  { "ftp://", URLFTP, DEFAULT_FTP_PORT },
  /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
static void parse_dir PARAMS ((const char *, char **, char **));
static uerr_t parse_uname PARAMS ((const char *, char **, char **));
static char *construct PARAMS ((const char *, const char *, int, int));
static char *construct_relative PARAMS ((const char *, const char *));
static char process_ftp_type PARAMS ((char *));
/* Return the number of characters to be skipped if a URL begins with
   "URL:" -- 0 if it does not, otherwise at least 4.  The optional
   spaces after "URL:" are also skipped.  */
skip_url (const char *url)
  if (TOUPPER (url[0]) == 'U'
      && TOUPPER (url[1]) == 'R'
      && TOUPPER (url[2]) == 'L'
    for (i = 4; url[i] && ISSPACE (url[i]); i++);
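/* For instance (assuming the elided check for the `:' as well), one
   would expect roughly:

     skip_url ("URL:http://www.gnu.org/")    =>  4
     skip_url ("URL:  http://www.gnu.org/")  =>  6
     skip_url ("http://www.gnu.org/")        =>  0  */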
/* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
contains_unsafe (const char *s)
    if (UNSAFE_CHAR (*s))
/* Decode each %XY form in a URL to the character whose hexadecimal
   code is XY.  X and Y are hexadecimal digits from [0123456789ABCDEF]
   (case-insensitive).  If X or Y is not a hex digit, or `%' precedes
   `\0', the sequence is inserted literally.  */
decode_string (char *s)
      /* Do nothing if at the end of the string, or if the chars
         are not hex-digits.  */
      if (!*(s + 1) || !*(s + 2)
          || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
      *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX encoded string.  */
encode_string (const char *s)
  for (i = 0; *s; s++, i++)
    if (UNSAFE_CHAR (*s))
      i += 2;                   /* Two more characters (hex digits) */
  res = (char *)xmalloc (i + 1);
  for (p = res; *s; s++)
    if (UNSAFE_CHAR (*s))
        const unsigned char c = *s;
        *p++ = HEXD2ASC (c >> 4);
        *p++ = HEXD2ASC (c & 0xf);
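/* Roughly speaking, encode_string and decode_string are inverses of
   one another.  For example:

     char *enc = encode_string ("foo bar");   =>  "foo%20bar", malloc-ed
     decode_string (enc);                     =>  enc now holds "foo bar"

   The caller is responsible for freeing the returned string.  */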
/* Returns the proto-type if URL's protocol is supported, or
   URLUNKNOWN if not.  */
urlproto (const char *url)
  url += skip_url (url);
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
      return sup_protos[i].ind;
  for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
      for (++i; url[i] && url[i] != '/'; i++)
        if (!ISDIGIT (url[i]))
      if (url[i - 1] == ':')
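/* From the loop over sup_protos above, one would expect e.g.:

     urlproto ("http://www.gnu.org/")  =>  URLHTTP
     urlproto ("ftp://ftp.gnu.org/")   =>  URLFTP

   Anything else falls through to the colon/digit heuristic that
   inspects what follows the first `:'.  */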
/* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
   part is found, returns 0.  */
skip_proto (const char *url)
  for (s = protostrings; *s; s++)
    if (!strncasecmp (*s, url, strlen (*s)))
  /* HTTP and FTP protocols are expected to yield exact host names
     (i.e. the `//' part must be skipped, too).  */
  if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))

/* Returns 1 if the URL begins with a protocol (supported or
   unsupported), 0 otherwise.  */
has_proto (const char *url)
  url += skip_url (url);
  for (s = protostrings; *s; s++)
    if (strncasecmp (url, *s, strlen (*s)) == 0)
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the protocol.

   If no username and password are found, return 0.  */
skip_uname (const char *url)
  for (p = url; *p && *p != '/'; p++)
  /* If a `@' was found before the first occurrence of `/', skip
     it.  */
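/* For example, called on the part right after "ftp://" (illustrative
   values):

     skip_uname ("jan:secret@ftp.gnu.org/pub/")  =>  11, the length of "jan:secret@"
     skip_uname ("ftp.gnu.org/pub/")             =>  0  */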
/* Allocate a new urlinfo structure, fill it with default values and
   return a pointer to it.  */
  u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
  memset (u, 0, sizeof (*u));
  u->proto = URLUNKNOWN;

/* Perform a "deep" free of the urlinfo structure.  The structure
   should have been created with newurl, but need not have been used.
   If COMPLETE is non-zero, free the pointer itself.  */
freeurl (struct urlinfo *u, int complete)
  FREE_MAYBE (u->host);
  FREE_MAYBE (u->path);
  FREE_MAYBE (u->file);
  FREE_MAYBE (u->user);
  FREE_MAYBE (u->passwd);
  FREE_MAYBE (u->local);
  FREE_MAYBE (u->referer);
    freeurl (u->proxy, 1);
/* Extract the given URL of the form
   (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
   1. hostname (terminated with `/' or `:')
   2. port number (terminated with `/'), or chosen for the protocol
   3. dirname (everything after hostname)
   Most errors are handled.  No allocation is done, you must supply
   pointers to allocated memory.
   ...and a host of other stuff :-)

   - Recognizes hostname:dir/file for FTP and
     hostname (:portnum)?/dir/file for HTTP.
   - Parses the path to yield directory and file
   - Parses the URL to yield the username and passwd (if present)
   - Decodes the strings, in case they contain "forbidden" characters
   - Writes the result to struct urlinfo

   If the argument STRICT is set, it recognizes only the canonical
   form.  */
parseurl (const char *url, struct urlinfo *u, int strict)
  int recognizable;            /* Recognizable URL is the one where
                                  the protocol name was explicitly
                                  named, i.e. it wasn't deduced from
                                  the URL format.  */

  DEBUGP (("parseurl (\"%s\") -> ", url));
  url += skip_url (url);
  recognizable = has_proto (url);
  if (strict && !recognizable)
  for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
      l = strlen (sup_protos[i].name);
      if (!strncasecmp (sup_protos[i].name, url, l))
  /* If protocol is recognizable, but unsupported, bail out, else
     assume it is unknown.  */
  if (recognizable && !sup_protos[i].name)
  else if (i == ARRAY_SIZE (sup_protos))
    u->proto = type = sup_protos[i].ind;

  if (type == URLUNKNOWN)
  /* Allow a username and password to be specified (i.e. just skip
     them here).  */
  l += skip_uname (url + l);
  for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);

  /* Get the hostname.  */
  u->host = strdupdelim (url + l, url + i);
  DEBUGP (("host %s -> ", u->host));

  /* Assume no port has been given.  */
      /* We have a colon delimiting the hostname.  It could mean that
         a port number is following it, or a directory.  */
      if (ISDIGIT (url[++i]))    /* A port number */
          if (type == URLUNKNOWN)
            u->proto = type = URLHTTP;
          for (; url[i] && url[i] != '/'; i++)
            if (ISDIGIT (url[i]))
              u->port = 10 * u->port + (url[i] - '0');
          DEBUGP (("port %hu -> ", u->port));
      else if (type == URLUNKNOWN)    /* or a directory */
        u->proto = type = URLFTP;
      else                            /* or just a malformed port number */
  else if (type == URLUNKNOWN)
    u->proto = type = URLHTTP;
      for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
        if (sup_protos[i].ind == type)
      if (i == ARRAY_SIZE (sup_protos))
      u->port = sup_protos[i].port;
  /* Some delimiter troubles...  */
  if (url[i] == '/' && url[i - 1] != ':')
  while (url[i] && url[i] == '/')
  u->path = (char *)xmalloc (strlen (url + i) + 8);
  strcpy (u->path, url + i);
      u->ftp_type = process_ftp_type (u->path);
      /* #### We don't handle type `d' correctly yet.  */
      if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
  DEBUGP (("opath %s -> ", u->path));
  /* Parse the username and password (if existing).  */
  parse_uname (url, &u->user, &u->passwd);
  /* Decode the strings, as per RFC 1738.  */
  decode_string (u->host);
  decode_string (u->path);
    decode_string (u->user);
    decode_string (u->passwd);
  /* Parse the directory.  */
  parse_dir (u->path, &u->dir, &u->file);
  DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
  /* Simplify the directory.  */
  path_simplify (u->dir);
  /* Remove the leading `/' in HTTP.  */
  if (type == URLHTTP && *u->dir == '/')
    strcpy (u->dir, u->dir + 1);
  DEBUGP (("ndir %s\n", u->dir));
  /* Strip trailing `/'.  */
  if (l && u->dir[l - 1] == '/')
    u->dir[l - 1] = '\0';
  /* Re-create the path: */
  abs_ftp = (u->proto == URLFTP && *u->dir == '/');
  /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
     abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
  strcpy (u->path, abs_ftp ? "%2F" : "/");
  strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
  strcat (u->path, *u->dir ? "/" : "");
  strcat (u->path, u->file);
  URL_CLEANSE (u->path);
  /* Create the clean URL.  */
  u->url = str_url (u, 0);
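/* As a rough illustration (host and credentials below are made-up
   values; the exact struct layout is in url.h), after

     parseurl ("http://jan:secret@www.gnu.org:8000/software/wget/manual.html",
               u, 0);

   one would expect something like:

     u->proto == URLHTTP           u->host == "www.gnu.org"
     u->port  == 8000              u->user == "jan", u->passwd == "secret"
     u->dir   == "software/wget"   u->file == "manual.html"
     u->path  == "/software/wget/manual.html"
     u->url   == the cleaned-up printable URL produced by str_url ()  */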
/* Build the directory and filename components of the path.  Both
   components are *separately* malloc-ed strings!  It does not change
   the contents of path.

   If the path ends with "." or "..", they are (correctly) counted as
   directories.  */
parse_dir (const char *path, char **dir, char **file)
  for (i = l = strlen (path); i && path[i] != '/'; i--);
  if (!i && *path != '/')      /* Just filename */
      if (DOTP (path) || DDOTP (path))
          *dir = xstrdup (path);
          *file = xstrdup ("");
          *dir = xstrdup ("");  /* This is required because of FTP */
          *file = xstrdup (path);
  else if (!i)                 /* /filename */
      if (DOTP (path + 1) || DDOTP (path + 1))
          *dir = xstrdup (path);
          *file = xstrdup ("");
          *dir = xstrdup ("/");
          *file = xstrdup (path + 1);
  else                         /* Nonempty directory with or without a filename */
      if (DOTP (path + i + 1) || DDOTP (path + i + 1))
          *dir = xstrdup (path);
          *file = xstrdup ("");
          *dir = strdupdelim (path, path + i);
          *file = strdupdelim (path + i + 1, path + l + 1);
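/* A few representative splits, following the branches above (d and f
   stand for the caller's char * variables):

     parse_dir ("/software/wget/manual.html", &d, &f)  =>  d = "/software/wget", f = "manual.html"
     parse_dir ("manual.html", &d, &f)                 =>  d = "",  f = "manual.html"
     parse_dir ("/manual.html", &d, &f)                =>  d = "/", f = "manual.html"
     parse_dir ("/software/..", &d, &f)                =>  d = "/software/..", f = ""  */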
/* Find the optional username and password within the URL, as per
   RFC1738.  The returned user and passwd char pointers are
   malloc-ed.  */
parse_uname (const char *url, char **user, char **passwd)
  url += skip_url (url);
  /* Look for end of protocol string.  */
  l = skip_proto (url);
  /* Add protocol offset.  */
  /* Is there an `@' character?  */
  for (p = url; *p && *p != '/'; p++)
  /* If not, return.  */
  /* Else find the username and password.  */
  for (p = col = url; *p != '@'; p++)
      if (*p == ':' && !*user)
          *user = (char *)xmalloc (p - url + 1);
          memcpy (*user, url, p - url);
          (*user)[p - url] = '\0';
  /* Decide whether you have only the username or both.  */
  where = *user ? passwd : user;
  *where = (char *)xmalloc (p - col + 1);
  memcpy (*where, col, p - col);
  (*where)[p - col] = '\0';
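/* For instance, for the following inputs (illustrative values) one
   would expect:

     "ftp://jan:secret@ftp.gnu.org/pub/"  =>  *user = "jan", *passwd = "secret"
     "ftp://jan@ftp.gnu.org/pub/"         =>  *user = "jan", *passwd untouched
     "http://www.gnu.org/"                =>  neither is set (no `@' before `/')  */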
/* If PATH ends with `;type=X', return the character X.  */
process_ftp_type (char *path)
  int len = strlen (path);
      && !memcmp (path + len - 7, ";type=", 6))
      path[len - 7] = '\0';
      return path[len - 1];
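/* E.g. for a path of "/pub/gnu/README;type=a" this returns 'a' and
   truncates the path, in place, to "/pub/gnu/README".  Without such a
   suffix a zero character is returned (see the check in parseurl).  */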
/* Return the URL as a well-formed string, with a proper protocol,
   port number, directory and optional user/password.  If HIDE is
   non-zero, the password will be hidden.  The forbidden characters in
   the URL will be cleansed.  */
str_url (const struct urlinfo *u, int hide)
  char *res, *host, *user, *passwd, *proto_name, *dir, *file;
  int i, l, ln, lu, lh, lp, lf, ld;

  /* Look for the protocol name.  */
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (sup_protos[i].ind == u->proto)
  if (i == ARRAY_SIZE (sup_protos))
  proto_name = sup_protos[i].name;
  host = CLEANDUP (u->host);
  dir = CLEANDUP (u->dir);
  file = CLEANDUP (u->file);
  user = passwd = NULL;
      user = CLEANDUP (u->user);
          passwd = CLEANDUP (u->passwd);
            for (i = 0; passwd[i]; i++)
  if (u->proto == URLFTP && *dir == '/')
      char *tmp = (char *)xmalloc (strlen (dir) + 3);
      /*sprintf (tmp, "%%2F%s", dir + 1);*/
      strcpy (tmp + 3, dir + 1);
  ln = strlen (proto_name);
  lu = user ? strlen (user) : 0;
  lp = passwd ? strlen (passwd) : 0;
  res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
  /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
     (user ? user : ""), (passwd ? ":" : ""),
     (passwd ? passwd : ""), (user ? "@" : ""),
     host, u->port, dir, *dir ? "/" : "", file); */
  memcpy (res, proto_name, ln);
  memcpy (res + l, user, lu);
  memcpy (res + l, passwd, lp);
  memcpy (res + l, host, lh);
  long_to_string (res + l, (long)u->port);
  l += numdigit (u->port);
  memcpy (res + l, dir, ld);
  strcpy (res + l, file);
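/* Following the layout of the commented-out sprintf above, the result
   looks roughly like (illustrative values):

     "http://jan:secret@www.gnu.org:8000/software/wget/manual.html"

   and, with HIDE non-zero, the password characters are blotted out.  */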
/* Check whether two URL-s are equivalent, i.e. pointing to the same
   location.  Uses parseurl to parse them, and compares the canonical
   forms.

   Returns 1 if URL1 is equivalent to URL2, 0 otherwise.  Also
   returns 0 on error.  */
url_equal (const char *url1, const char *url2)
  struct urlinfo *u1, *u2;
  err = parseurl (url1, u1, 0);
  err = parseurl (url2, u2, 0);
  res = !strcmp (u1->url, u2->url);
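/* Since the comparison is done on the canonical forms, something like

     url_equal ("http://www.gnu.org", "http://www.gnu.org/")  =>  1

   should hold, while a URL that fails to parse yields 0.  */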
/* Find URL of format scheme:hostname[:port]/dir in a buffer.  The
   buffer may contain pretty much anything; no errors are signaled.  */
findurl (const char *buf, int howmuch, int *count)
  for (s1 = buf; howmuch; s1++, howmuch--)
    for (prot = protostrings; *prot; prot++)
      if (howmuch <= strlen (*prot))
      else if (!strncasecmp (*prot, s1, strlen (*prot)))
          for (s2 = s1, *count = 0;
               howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
               !strchr (URL_SEPARATOR, *s2);
               s2++, (*count)++, howmuch--);
/* Scan the file for signs of URL-s, returning them as a linked list
   of urlpos elements, one per URL found.  The file is *not* assumed
   to be HTML.  */
get_urls_file (const char *file)
  urlpos *first, *current, *old;

  if (file && !HYPHENP (file))
      fp = fopen (file, "rb");
          logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
  load_file (fp, &buf, &nread);
  if (file && !HYPHENP (file))
  DEBUGP (("Loaded %s (size %ld).\n", file, nread));
  first = current = NULL;
  /* Fill the linked list with URLs.  */
  for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
      /* Allocate the space.  */
      current = (urlpos *)xmalloc (sizeof (urlpos));
      memset (current, 0, sizeof (*current));
      current->next = NULL;
      current->url = (char *)xmalloc (size + 1);
      memcpy (current->url, pbuf, size);
      current->url[size] = '\0';
  /* Free the buffer.  */
/* Similar to get_urls_file, but for HTML files.  FILE is scanned as
   an HTML document using htmlfindurl(), which see.  get_urls_html()
   constructs the URL-s from the relative href-s.

   If SILENT is non-zero, do not barf on baseless relative links.  */
get_urls_html (const char *file, const char *this_url, int silent)
  int step, first_time;
  urlpos *first, *current, *old;

  if (file && !HYPHENP (file))
      fp = fopen (file, "rb");
          logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
  load_file (fp, &orig_buf, &nread);
  if (file && !HYPHENP (file))
  DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
  first = current = NULL;

  /* Iterate over the URLs in BUF, picked by htmlfindurl().  */
       (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time));
      const char *pbuf = buf;

      /* A frequent phenomenon that needs to be handled are pages
         generated by brain-damaged HTML generators, which refer to
         URI-s as <a href="<spaces>URI<spaces>">.  We simply ignore
         any spaces at the beginning or at the end of the string.
         This is probably not strictly correct, but that's what the
         browsers do, so we may follow.  May the authors of "WYSIWYG"
         HTML tools burn in hell for the damage they've inflicted!  */
      while ((pbuf < buf + step) && ISSPACE (*pbuf))
      while (size && ISSPACE (pbuf[size - 1]))
      for (i = 0; protostrings[i]; i++)
          if (!strncasecmp (protostrings[i], pbuf,
                            MINVAL (strlen (protostrings[i]), size)))
      /* Check for http:RELATIVE_URI.  See below for details.  */
          && !(strncasecmp (pbuf, "http:", 5) == 0
               && strncasecmp (pbuf, "http://", 7) != 0))
      /* This is for extremely brain-damaged pages that refer to
         relative URI-s as <a href="http:URL">.  Just strip off the
         silly leading "http:" (as well as any leading blanks).  */
      if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
        pbuf += 5, size -= 5;
          for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
              if (!strncasecmp (sup_protos[i].name, pbuf,
                                MINVAL (strlen (sup_protos[i].name), size)))
          /* Do *not* accept a non-supported protocol.  */
          if (i == ARRAY_SIZE (sup_protos))
          /* First, construct the base, which can be relative itself.

             Criteria for creating the base are:
             1) html_base created by <base href="...">
             3) base provided from the command line */
          cbase = html_base ();
            cbase = opt.base_href;
          if (!cbase)           /* Error condition -- a baseless
                                   relative link.  */
              if (!opt.quiet && !silent)
                  /* Use malloc, not alloca because this is called in
                     a loop.  */
                  char *temp = (char *)malloc (size + 1);
                  strncpy (temp, pbuf, size);
                  logprintf (LOG_NOTQUIET,
                             _("Error (%s): Link %s without a base provided.\n"),
            base = construct (this_url, cbase, strlen (cbase),
              /* Base must now be absolute, with host name and
                 absolute directory.  */
              if (!has_proto (cbase))
                  logprintf (LOG_NOTQUIET, _("\
Error (%s): Base %s relative, without referer URL.\n"),
              base = xstrdup (cbase);
          constr = construct (base, pbuf, size, no_proto);
          constr = (char *)xmalloc (size + 1);
          strncpy (constr, pbuf, size);
          /* Use malloc, not alloca because this is called in a loop.  */
          tmp = (char *)xmalloc (size + 1);
          strncpy (tmp, pbuf, size);
          logprintf (LOG_ALWAYS,
                     "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
                     file, this_url ? this_url : "(null)",
                     tmp2 ? tmp2 : "(null)", tmp, constr);
      /* Allocate the space.  */
      current = (urlpos *)xmalloc (sizeof (urlpos));
      /* Fill the values.  */
      memset (current, 0, sizeof (*current));
      current->next = NULL;
      current->url = constr;
      current->size = size;
      current->pos = pbuf - orig_buf;
      /* A URL is relative if the host and protocol are not named,
         and the name does not start with `/'.  */
      if (no_proto && *pbuf != '/')
        current->flags |= (URELATIVE | UNOPROTO);
        current->flags |= UNOPROTO;
/* Free the linked list of urlpos.  */
free_urlpos (urlpos *l)
      urlpos *next = l->next;
      FREE_MAYBE (l->local_name);

/* Rotate FNAME opt.backups times */
rotate_backups (const char *fname)
  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
  char *from = (char *)alloca (maxlen);
  char *to = (char *)alloca (maxlen);

  if (stat (fname, &sb) == 0)
    if (S_ISREG (sb.st_mode) == 0)
  for (i = opt.backups; i > 1; i--)
      sprintf (from, "%s.%d", fname, i - 1);
      sprintf (to, "%s.%d", fname, i);
      /* #### This will fail on machines without the rename() system
         call.  */
  sprintf (to, "%s.%d", fname, 1);
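/* For example, with opt.backups == 3 and FNAME "wget.log"
   (illustrative name), the loop above renames wget.log.2 to
   wget.log.3 and wget.log.1 to wget.log.2, leaving room for wget.log
   itself to be renamed to wget.log.1.  */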
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally.  */
mkalldirs (const char *path)
  p = path + strlen (path);
  for (; *p != '/' && p != path; p--);
  /* Don't create if it's just a file.  */
  if ((p == path) && (*p != '/'))
  t = strdupdelim (path, p);
  /* Check whether the directory exists.  */
  if ((stat (t, &st) == 0))
      if (S_ISDIR (st.st_mode))
          /* If the dir exists as a file name, remove it first.  This
             is *only* for Wget to work with buggy old CERN http
             servers.  Here is the scenario: When Wget tries to
             retrieve a directory without a slash, e.g.
             http://foo/bar (bar being a directory), the CERN server
             will not redirect it to http://foo/bar/ -- it will
             generate a directory listing containing links to
             bar/file1, bar/file2, etc.  Wget will lose because it
             saves this HTML listing to a file `bar', so it cannot
             create the directory.  To work around this, if a file of
             the same name exists, we just remove it and create the
             directory.  */
          DEBUGP (("Removing %s because of directory danger!\n", t));
  res = make_directory (t);
    logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
count_slashes (const char *s)

/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories.  */
mkstruct (const struct urlinfo *u)
  char *host, *dir, *file, *res, *dirpref;

  assert (u->dir != NULL);
  assert (u->host != NULL);

      char *ptr = u->dir + (*u->dir == '/');
      int slash_count = 1 + count_slashes (ptr);
      int cut = MINVAL (opt.cut_dirs, slash_count);
      for (; cut && *ptr; ptr++)
      STRDUP_ALLOCA (dir, ptr);
    dir = u->dir + (*u->dir == '/');

  host = xstrdup (u->host);
  /* Check for the true name (or at least a consistent name for saving
     to directory) of HOST, reusing the hlist if possible.  */
  if (opt.add_hostdir && !opt.simple_check)
      char *nhost = realhost (host);
  /* Add dir_prefix and hostname (if required) to the beginning of
     the local directory.  */
  if (opt.add_hostdir)
      if (!DOTP (opt.dir_prefix))
          dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
                                    + strlen (host) + 1);
          sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
        STRDUP_ALLOCA (dirpref, host);
  else                          /* not add_hostdir */
      if (!DOTP (opt.dir_prefix))
        dirpref = opt.dir_prefix;

  /* If there is a prefix, prepend it.  */
      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
      sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
  dir = xstrdup (dir);
  if (l && dir[l - 1] == '/')
    file = "index.html";
  /* Finally, construct the full name.  */
  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
  sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
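/* A rough sketch of the result, assuming opt.dir_prefix is "." and
   opt.add_hostdir is set: for a URL parsed into host "www.gnu.org",
   dir "software/wget" and an empty file, mkstruct() should yield

     "www.gnu.org/software/wget/index.html"

   i.e. a local path mirroring the remote directory structure.  */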
/* Create a unique filename, corresponding to a given URL.  Calls
   mkstruct if necessary.  Does *not* actually create any directories.  */
url_filename (const struct urlinfo *u)
  int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */

    file = mkstruct (u);
        file = xstrdup ("index.html");
        file = xstrdup (u->file);
      /* Check whether the prefix directory is something other than "."
         before prepending it.  */
      if (!DOTP (opt.dir_prefix))
          char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
                                         + 1 + strlen (file) + 1);
          sprintf (nfile, "%s/%s", opt.dir_prefix, file);
  /* DOS-ish file systems don't like `%' signs in them; we change it
  for (p = file; *p; p++)
#endif /* WINDOWS */

  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.

     The exception is the case when file does exist and is a
     directory (actually support for bad httpd-s).  */
  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
      && !(file_exists_p (file) && !file_non_directory_p (file)))
  /* Find a unique name.  */
  name = unique_name (file);
/* Construct an absolute URL, given a (possibly) relative one.  This
   is more tricky than it might seem, but it works.  */
construct (const char *url, const char *sub, int subsize, int no_proto)
          for (i = strlen (url); i && url[i] != '/'; i--);
          if (!i || (url[i] == url[i - 1]))
              int l = strlen (url);
              char *t = (char *)alloca (l + 2);
          constr = (char *)xmalloc (i + 1 + subsize + 1);
          strncpy (constr, url, i + 1);
          constr[i + 1] = '\0';
          strncat (constr, sub, subsize);
      else                      /* *sub == `/' */
          for (; url[i] && url[i] != '/'; i++);
          fl = (url[i] == url[i + 1] && url[i + 1] == '/');
              int l = strlen (url);
              char *t = (char *)alloca (l + 2);
          constr = (char *)xmalloc (i + 1 + subsize + 1);
          strncpy (constr, url, i);
          strncat (constr + i, sub, subsize);
          constr[i + subsize] = '\0';
  else                          /* !no_proto */
      constr = (char *)xmalloc (subsize + 1);
      strncpy (constr, sub, subsize);
      constr[subsize] = '\0';
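/* A few hedged examples of what the branches above produce
   (illustrative URLs):

     construct ("http://www.gnu.org/software/wget/", "manual.html", 11, 1)
         =>  "http://www.gnu.org/software/wget/manual.html"
     construct ("http://www.gnu.org/software/wget/x.html", "../index.html", 13, 1)
         =>  "http://www.gnu.org/software/wget/../index.html"
             (no simplification here; parseurl's path_simplify handles that)
     construct ("http://www.gnu.org/software/wget/", "/gnu/", 5, 1)
         =>  "http://www.gnu.org/gnu/"   (SUB absolute: keep only scheme://host)
     construct (anything, "ftp://ftp.gnu.org/", 18, 0)
         =>  a plain copy of SUB, since NO_PROTO is 0  */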
/* Optimize URL by host, destructively replacing u->host with realhost
   (u->host).  Do this regardless of opt.simple_check.  */
opt_url (struct urlinfo *u)
  /* Find the "true" host.  */
  char *host = realhost (u->host);
  assert (u->dir != NULL);      /* the URL must have been parsed */
  /* Refresh the printed representation.  */
  u->url = str_url (u, 0);

/* Returns proxy host address, in accordance with PROTO.  */
getproxy (uerr_t proto)
  if (proto == URLHTTP)
    return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
  else if (proto == URLFTP)
    return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
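/* So, for HTTP, a proxy configured in Wget's own options takes
   precedence and the http_proxy environment variable is the fallback;
   likewise ftp_proxy for FTP.  Roughly:

     getproxy (URLHTTP)  =>  opt.http_proxy, or else getenv ("http_proxy")  */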
/* Should a host be accessed through proxy, concerning no_proxy?  */
no_proxy_match (const char *host, const char **no_proxy)
    return !sufmatch (no_proxy, host);
/* Change the links in an HTML document.  Accepts a structure that
   defines the positions of all the links.  */
convert_links (const char *file, urlpos *l)
  logprintf (LOG_VERBOSE, _("Converting %s... "), file);
  /* Read from the file....  */
  fp = fopen (file, "rb");
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
  /* ...to a buffer.  */
  load_file (fp, &buf, &size);

  if (opt.backup_converted && downloaded_file (CHECK_FOR_FILE, file))
    /* Rather than just writing over the original .html file with the
       converted version, save the former to *.orig.  Note we only do
       this for files we've _successfully_ downloaded, so we don't
       clobber .orig files sitting around from previous invocations.  */
      /* Construct the backup filename as the original name plus ".orig".  */
      size_t filename_len = strlen (file);
      char *filename_plus_orig_suffix = malloc (filename_len +
      boolean already_wrote_backup_file = FALSE;
      slist *converted_file_ptr;
      static slist *converted_files = NULL;

      /* Would a single s[n]printf() call be faster?  */
      strcpy (filename_plus_orig_suffix, file);
      strcpy (filename_plus_orig_suffix + filename_len, ".orig");

      /* We can get called twice on the same URL thanks to the
         convert_all_links() call in main().  If we write the .orig
         file each time in such a case, it'll end up containing the
         first-pass conversion, not the original file.  So, see if
         we've already been called on this file.  */
      converted_file_ptr = converted_files;
      while (converted_file_ptr != NULL)
        if (strcmp (converted_file_ptr->string, file) == 0)
            already_wrote_backup_file = TRUE;
        converted_file_ptr = converted_file_ptr->next;

      if (!already_wrote_backup_file)
          /* Rename <file> to <file>.orig before the former gets
             written over.  */
          if (rename (file, filename_plus_orig_suffix) != 0)
            logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
                       file, filename_plus_orig_suffix, strerror (errno));

          /* Remember that we've already written a .orig backup for
             this file.  Note that we never free this memory since we
             need it till the convert_all_links() call, which is one of
             the last things the program does before terminating.  BTW,
             I'm not sure if it would be safe to just set
             'converted_file_ptr->string' to 'file' below, rather than
             making a copy of the string...  Another note is that I
             thought I could just add a field to the urlpos structure
             saying that we'd written a .orig file for this URL, but
             that didn't work, so I had to make this separate list.  */
          converted_file_ptr = malloc (sizeof (slist));
          converted_file_ptr->string = xstrdup (file);  /* die on out-of-mem. */
          converted_file_ptr->next = converted_files;
          converted_files = converted_file_ptr;
      free (filename_plus_orig_suffix);

  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
  /* [If someone understands why multiple URLs can correspond to one
     local file, can they please add a comment here...?]  */
  for (p = buf; l; l = l->next)
          DEBUGP (("Something strange is going on.  Please investigate."));
      /* If the URL already is relative or it is not to be converted
         for some other reason (e.g. because of not having been
         downloaded in the first place), skip it.  */
      if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
          DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
      /* Else, reach the position of the offending URL, echoing
         everything up to it to the outfile.  */
      for (p2 = buf + l->pos; p < p2; p++)
      if (l->flags & UABS2REL)
          char *newname = construct_relative (file, l->local_name);
          fprintf (fp, "%s", newname);
          DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
                   l->url, newname, l->pos, file));
  for (p2 = buf + size; p < p2; p++)
  logputs (LOG_VERBOSE, _("done.\n"));
/* Construct and return a malloc-ed copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   `/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1.)  */
construct_relative (const char *s1, const char *s2)
  int i, cnt, sepdirs1;

    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  /* Skip the directories common to both strings.  */
  while (s1[i] && s2[i]
      if (s1[i] == '/' && s2[i] == '/')
  for (sepdirs1 = 0; s1[i]; i++)
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 times
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
/* Add URL to the head of the list L.  */
add_url (urlpos *l, const char *url, const char *file)
  t = (urlpos *)xmalloc (sizeof (urlpos));
  memset (t, 0, sizeof (*t));
  t->url = xstrdup (url);
  t->local_name = xstrdup (file);
/* Remembers which files have been downloaded.  Should be called with
   add_or_check == ADD_FILE for each file we actually download
   successfully (i.e. not for ones we have failures on or that we skip
   due to -N).  If you just want to check whether a file has been
   previously added without adding it, call with add_or_check ==
   CHECK_FOR_FILE.  Please be sure to call this function with local
   filenames, not remote URLs -- by some means that isn't commented
   well enough for me to understand, multiple remote URLs can
   apparently correspond to a single local file.  */
downloaded_file (downloaded_file_t add_or_check, const char *file)
  boolean found_file = FALSE;
  static slist *downloaded_files = NULL;
  slist *rover = downloaded_files;

  while (rover != NULL)
    if (strcmp (rover->string, file) == 0)
    rover = rover->next;
    return TRUE;                /* file had already been downloaded */
  if (add_or_check == ADD_FILE)
      rover = malloc (sizeof (slist));
      rover->string = xstrdup (file);   /* die on out-of-mem. */
      rover->next = downloaded_files;
      downloaded_files = rover;
  return FALSE;                 /* file had not already been downloaded */
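/* Typical usage, with an illustrative file name:

     downloaded_file (CHECK_FOR_FILE, "www.gnu.org/index.html")  =>  FALSE
     downloaded_file (ADD_FILE, "www.gnu.org/index.html")        =>  FALSE, but records the name
     downloaded_file (CHECK_FOR_FILE, "www.gnu.org/index.html")  =>  TRUE  */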