2 Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
47 /* Default port definitions */
48 #define DEFAULT_HTTP_PORT 80
49 #define DEFAULT_FTP_PORT 21
51 /* URL separator (for findurl) */
52 #define URL_SEPARATOR "!\"#'(),>`{}|<>"
54 /* A list of unsafe characters for encoding, as per RFC1738. '@' and
55 ':' (not listed in RFC) were added because of user/password
56 encoding, and \033 for safe printing. */
/* NOTE(review): two alternative URL_UNSAFE definitions appear below;
   the conditional (#if/#else) selecting between them falls on source
   lines absent from this listing -- presumably one variant is for
   strict RFC 1738 quoting and the other a relaxed set; confirm in
   the full source.  */
59 # define URL_UNSAFE " <>\"#%{}|\\^~[]`@:\033"
61 # define URL_UNSAFE " <>\"%{}|\\^[]`\033"
64 /* If S contains unsafe characters, free it and replace it with a
65 version that doesn't. */
/* NOTE(review): the do/while(0) body of URL_CLEANSE is only partially
   visible here; the lines that free S and assign the encoded copy are
   absent from this listing.  */
66 #define URL_CLEANSE(s) do \
68 if (contains_unsafe (s)) \
70 char *uc_tmp = encode_string (s); \
76 /* Is a directory "."? */
77 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
78 /* Is a directory ".."? */
79 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
81 /* NULL-terminated list of strings to be recognized as prototypes (URL
82 schemes). Note that recognized doesn't mean supported -- only HTTP
83 and FTP are currently supported.
85 However, a string that does not match anything in the list will be
86 considered a relative URL. Thus it's important that this list has
87 anything anyone could think of being legal.
89 There are wild things here. :-) Take a look at
90 <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
/* NOTE(review): the initializer list of protostrings[] (the scheme
   strings themselves, plus the NULL terminator) is on source lines
   absent from this listing.  */
92 static char *protostrings[] =
134 /* Similar to former, but for supported protocols: */
/* Each entry maps a scheme prefix to its proto enum value and its
   default port; used by urlproto(), parseurl() and str_url().  */
135 static struct proto sup_protos[] =
137 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
138 { "ftp://", URLFTP, DEFAULT_FTP_PORT },
139 /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
/* Forward declarations of static helpers defined later in this file.
   PARAMS is presumably a K&R/ANSI prototype-compatibility macro
   defined in a project header -- confirm there.  */
142 static void parse_dir PARAMS ((const char *, char **, char **));
143 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
144 static char *construct PARAMS ((const char *, const char *, int , int));
145 static char *construct_relative PARAMS ((const char *, const char *));
146 static char process_ftp_type PARAMS ((char *));
149 /* Returns the number of characters to be skipped if the first thing
150 in a URL is URL: (which is 0 or 4+). The optional spaces after
151 URL: are also skipped. */
153 skip_url (const char *url)
/* Case-insensitive match of the literal "URL" prefix, one character
   at a time.  The ':' test and the return statements fall on source
   lines absent from this listing.  */
157 if (toupper (url[0]) == 'U'
158 && toupper (url[1]) == 'R'
159 && toupper (url[2]) == 'L'
/* Skip optional whitespace after "URL:" (empty loop body intended). */
163 for (i = 4; url[i] && ISSPACE (url[i]); i++);
170 /* Returns 1 if the string contains unsafe characters, 0 otherwise. */
172 contains_unsafe (const char *s)
/* Any byte of S found in URL_UNSAFE makes the string "unsafe".  The
   enclosing loop and the return statements are on source lines absent
   from this listing.  */
175 if (strchr (URL_UNSAFE, *s))
180 /* Decodes the forms %xy in a URL to the character the hexadecimal
181 code of which is xy. xy are hexadecimal digits from
182 [0123456789ABCDEF] (case-insensitive). If x or y are not
183 hex-digits or `%' precedes `\0', the sequence is inserted
/* Decodes in place: writes through P while reading through S.  Only
   part of the loop is visible; the '%' detection and pointer
   advancement are on source lines absent from this listing.  */
187 decode_string (char *s)
197 /* Do nothing if at the end of the string, or if the chars
198 are not hex-digits. */
199 if (!*(s + 1) || !*(s + 2)
200 || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
/* Reassemble the byte from its two hex digits (high nibble first). */
205 *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
212 /* Encodes the unsafe characters (listed in URL_UNSAFE) in a given
213 string, returning a malloc-ed %XX encoded string. */
215 encode_string (const char *s)
/* Pass 1: measure.  Each unsafe byte becomes "%XX", i.e. two extra
   characters beyond the one counted by i++.  */
222 for (i = 0; *s; s++, i++)
223 if (strchr (URL_UNSAFE, *s))
224 i += 2; /* Two more characters (hex digits) */
225 res = (char *)xmalloc (i + 1);
/* Pass 2: fill.  S is presumably rewound between the passes on a
   source line absent from this listing -- confirm in the full file.  */
227 for (p = res; *s; s++)
228 if (strchr (URL_UNSAFE, *s))
230 const unsigned char c = *s;
232 *p++ = HEXD2ASC (c >> 4);
233 *p++ = HEXD2ASC (c & 0xf);
241 /* Returns the proto-type if URL's protocol is supported, or
242 URLUNKNOWN if not. */
244 urlproto (const char *url)
248 url += skip_url (url);
/* Exact case-insensitive prefix match against the supported-scheme
   table wins immediately.  */
249 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
250 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
251 return sup_protos[i].ind;
/* Otherwise apply the "host:port/..." heuristic: scan to the first
   ':' or '/', then check whether only digits follow the colon.  The
   branch taken on success is on source lines absent from this
   listing.  */
252 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
255 for (++i; url[i] && url[i] != '/'; i++)
256 if (!ISDIGIT (url[i]))
258 if (url[i - 1] == ':')
267 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
268 part is found, returns 0. */
270 skip_proto (const char *url)
/* Linear scan of the recognized-scheme table for a case-insensitive
   prefix match; loop exit/return lines are absent from this listing.  */
275 for (s = protostrings; *s; s++)
276 if (!strncasecmp (*s, url, strlen (*s)))
281 /* HTTP and FTP protocols are expected to yield exact host names
282 (i.e. the `//' part must be skipped, too). */
283 if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
288 /* Returns 1 if the URL begins with a protocol (supported or
289 unsupported), 0 otherwise. */
291 has_proto (const char *url)
295 url += skip_url (url);
/* Any recognized scheme prefix counts, even unsupported ones; the
   return statements are on source lines absent from this listing.  */
296 for (s = protostrings; *s; s++)
297 if (strncasecmp (url, *s, strlen (*s)) == 0)
302 /* Skip the username and password, if present here. The function
303 should be called *not* with the complete URL, but with the part
304 right after the protocol.
306 If no username and password are found, return 0. */
308 skip_uname (const char *url)
/* Look for '@' before the first '/'; only the scan loop is visible
   here, the '@' test and returns are absent from this listing.  */
311 for (p = url; *p && *p != '/'; p++)
314 /* If a `@' was found before the first occurrence of `/', skip
322 /* Allocate a new urlinfo structure, fill it with default values and
323 return a pointer to it. */
/* xmalloc presumably aborts on OOM (project convention); the struct is
   zeroed so every pointer member starts NULL, which freeurl() relies
   on via FREE_MAYBE.  */
329 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
330 memset (u, 0, sizeof (*u));
331 u->proto = URLUNKNOWN;
335 /* Perform a "deep" free of the urlinfo structure. The structure
336 should have been created with newurl, but need not have been used.
337 If free_pointer is non-0, free the pointer itself. */
339 freeurl (struct urlinfo *u, int complete)
/* FREE_MAYBE presumably frees only non-NULL members (safe on a fresh
   newurl() result).  */
343 FREE_MAYBE (u->host);
344 FREE_MAYBE (u->path);
345 FREE_MAYBE (u->file);
347 FREE_MAYBE (u->user);
348 FREE_MAYBE (u->passwd);
349 FREE_MAYBE (u->local);
350 FREE_MAYBE (u->referer);
/* Recursive call releases a chained proxy urlinfo, pointer included. */
352 freeurl (u->proxy, 1);
358 /* Extract the given URL of the form
359 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
360 1. hostname (terminated with `/' or `:')
361 2. port number (terminated with `/'), or chosen for the protocol
362 3. dirname (everything after hostname)
363 Most errors are handled. No allocation is done, you must supply
364 pointers to allocated memory.
365 ...and a host of other stuff :-)
367 - Recognizes hostname:dir/file for FTP and
368 hostname (:portnum)?/dir/file for HTTP.
369 - Parses the path to yield directory and file
370 - Parses the URL to yield the username and passwd (if present)
371 - Decodes the strings, in case they contain "forbidden" characters
372 - Writes the result to struct urlinfo
374 If the argument STRICT is set, it recognizes only the canonical
377 parseurl (const char *url, struct urlinfo *u, int strict)
380 int recognizable; /* Recognizable URL is the one where
381 the protocol name was explicitly
382 named, i.e. it wasn't deduced from
386 DEBUGP (("parseurl (\"%s\") -> ", url));
387 url += skip_url (url);
388 recognizable = has_proto (url);
/* In strict mode an URL without an explicit scheme is rejected; the
   error return sits on a source line absent from this listing.  */
389 if (strict && !recognizable)
/* Find which supported scheme (if any) prefixes the URL; L ends up as
   the length of the matched scheme prefix.  */
391 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
393 l = strlen (sup_protos[i].name);
394 if (!strncasecmp (sup_protos[i].name, url, l))
397 /* If protocol is recognizable, but unsupported, bail out, else
399 if (recognizable && !sup_protos[i].name)
401 else if (i == ARRAY_SIZE (sup_protos))
404 u->proto = type = sup_protos[i].ind;
406 if (type == URLUNKNOWN)
408 /* Allow a username and password to be specified (i.e. just skip
411 l += skip_uname (url + l);
412 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
415 /* Get the hostname. */
416 u->host = strdupdelim (url + l, url + i);
417 DEBUGP (("host %s -> ", u->host));
419 /* Assume no port has been given. */
423 /* We have a colon delimiting the hostname. It could mean that
424 a port number is following it, or a directory. */
425 if (ISDIGIT (url[++i])) /* A port number */
427 if (type == URLUNKNOWN)
428 u->proto = type = URLHTTP;
/* Accumulate the decimal port digit by digit up to the next '/'. */
429 for (; url[i] && url[i] != '/'; i++)
430 if (ISDIGIT (url[i]))
431 u->port = 10 * u->port + (url[i] - '0');
436 DEBUGP (("port %hu -> ", u->port));
438 else if (type == URLUNKNOWN) /* or a directory */
439 u->proto = type = URLFTP;
440 else /* or just a misformed port number */
443 else if (type == URLUNKNOWN)
444 u->proto = type = URLHTTP;
/* No explicit port: fall back to the scheme's default from the
   sup_protos table.  */
448 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
449 if (sup_protos[i].ind == type)
451 if (i == ARRAY_SIZE (sup_protos))
453 u->port = sup_protos[i].port;
455 /* Some delimiter troubles... */
456 if (url[i] == '/' && url[i - 1] != ':')
/* Collapse any run of leading slashes before copying the path. */
459 while (url[i] && url[i] == '/')
/* +8 slack presumably covers the later "%2F"/"/" rebuild below --
   TODO confirm against the full source.  */
461 u->path = (char *)xmalloc (strlen (url + i) + 8)
462 strcpy (u->path, url + i);
465 u->ftp_type = process_ftp_type (u->path);
466 /* #### We don't handle type `d' correctly yet. */
467 if (!u->ftp_type || toupper (u->ftp_type) == 'D')
470 DEBUGP (("opath %s -> ", u->path));
471 /* Parse the username and password (if existing). */
472 parse_uname (url, &u->user, &u->passwd);
473 /* Decode the strings, as per RFC 1738. */
474 decode_string (u->host);
475 decode_string (u->path);
477 decode_string (u->user);
479 decode_string (u->passwd);
480 /* Parse the directory. */
481 parse_dir (u->path, &u->dir, &u->file);
482 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
483 /* Simplify the directory. */
484 path_simplify (u->dir);
485 /* Remove the leading `/' in HTTP. */
486 if (type == URLHTTP && *u->dir == '/')
/* NOTE(review): strcpy on overlapping buffers is formally undefined
   behavior -- memmove would be the safe equivalent.  Left untouched
   here because surrounding lines are missing from this listing.  */
487 strcpy (u->dir, u->dir + 1);
488 DEBUGP (("ndir %s\n", u->dir));
489 /* Strip trailing `/'. */
491 if (l && u->dir[l - 1] == '/')
492 u->dir[l - 1] = '\0';
493 /* Re-create the path: */
494 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
495 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
496 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
497 strcpy (u->path, abs_ftp ? "%2F" : "/");
498 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
499 strcat (u->path, *u->dir ? "/" : "");
500 strcat (u->path, u->file);
501 URL_CLEANSE (u->path);
502 /* Create the clean URL. */
503 u->url = str_url (u, 0);
507 /* Build the directory and filename components of the path. Both
508 components are *separately* malloc-ed strings! It does not change
509 the contents of path.
511 If the path ends with "." or "..", they are (correctly) counted as
514 parse_dir (const char *path, char **dir, char **file)
/* Find the last '/' (I ends at its index, or 0 if none).  L keeps the
   total length for the final strdupdelim.  */
518 for (i = l = strlen (path); i && path[i] != '/'; i--);
519 if (!i && *path != '/') /* Just filename */
/* A bare "." or ".." is treated as a directory with an empty file. */
521 if (DOTP (path) || DDOTP (path))
523 *dir = xstrdup (path);
524 *file = xstrdup ("");
528 *dir = xstrdup (""); /* This is required because of FTP */
529 *file = xstrdup (path);
532 else if (!i) /* /filename */
534 if (DOTP (path + 1) || DDOTP (path + 1))
536 *dir = xstrdup (path);
537 *file = xstrdup ("");
541 *dir = xstrdup ("/");
542 *file = xstrdup (path + 1);
545 else /* Nonempty directory with or without a filename */
547 if (DOTP (path + i + 1) || DDOTP (path + i + 1))
549 *dir = xstrdup (path);
550 *file = xstrdup ("");
/* Split at the last slash: [0, i) is the dir, (i, l] the file.  */
554 *dir = strdupdelim (path, path + i);
555 *file = strdupdelim (path + i + 1, path + l + 1);
560 /* Find the optional username and password within the URL, as per
561 RFC1738. The returned user and passwd char pointers are
564 parse_uname (const char *url, char **user, char **passwd)
572 url += skip_url (url);
573 /* Look for end of protocol string. */
574 l = skip_proto (url);
577 /* Add protocol offset. */
579 /* Is there an `@' character? */
580 for (p = url; *p && *p != '/'; p++)
583 /* If not, return. */
586 /* Else find the username and password. */
/* COL tracks the start of the current field; it is presumably bumped
   past the ':' when one is found (on a line absent from this
   listing), so the second field starts after the colon.  */
587 for (p = col = url; *p != '@'; p++)
589 if (*p == ':' && !*user)
/* Everything before the first ':' is the username.  */
591 *user = (char *)xmalloc (p - url + 1);
592 memcpy (*user, url, p - url);
593 (*user)[p - url] = '\0';
597 /* Decide whether you have only the username or both. */
598 where = *user ? passwd : user;
599 *where = (char *)xmalloc (p - col + 1);
600 memcpy (*where, col, p - col);
601 (*where)[p - col] = '\0';
605 /* If PATH ends with `;type=X', return the character X. */
607 process_ftp_type (char *path)
609 int len = strlen (path);
/* Matches the 7-char tail ";type=X": the memcmp checks only the
   first 6 bytes (";type="), the type letter X itself is returned
   below.  The len guard preceding this condition is on a source line
   absent from this listing.  */
612 && !memcmp (path + len - 7, ";type=", 6))
/* Truncate the suffix off PATH in place, then return the type
   character that followed "=" (still readable past the new NUL).  */
614 path[len - 7] = '\0';
615 return path[len - 1];
621 /* Return the URL as fine-formed string, with a proper protocol, port
622 number, directory and optional user/password. If HIDE is non-zero,
623 password will be hidden. The forbidden characters in the URL will
626 str_url (const struct urlinfo *u, int hide)
628 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
629 int i, l, ln, lu, lh, lp, lf, ld;
631 /* Look for the protocol name. */
632 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
633 if (sup_protos[i].ind == u->proto)
635 if (i == ARRAY_SIZE (sup_protos))
637 proto_name = sup_protos[i].name;
/* CLEANDUP presumably yields a freshly allocated, %XX-escaped copy
   -- confirm the macro definition in the full source.  */
638 host = CLEANDUP (u->host);
639 dir = CLEANDUP (u->dir);
640 file = CLEANDUP (u->file);
641 user = passwd = NULL;
643 user = CLEANDUP (u->user);
647 passwd = CLEANDUP (u->passwd);
/* With HIDE set, the password characters are overwritten in this
   loop (replacement character is on a line absent from this
   listing).  */
649 for (i = 0; passwd[i]; i++)
/* Absolute FTP paths re-encode the leading '/' as "%2F".  */
652 if (u->proto == URLFTP && *dir == '/')
654 char *tmp = (char *)xmalloc (strlen (dir) + 3);
655 /*sprintf (tmp, "%%2F%s", dir + 1);*/
659 strcpy (tmp + 3, dir + 1);
/* Assemble the result piecewise with memcpy instead of sprintf
   (see the commented-out sprintf below for the intended layout).  */
664 ln = strlen (proto_name);
665 lu = user ? strlen (user) : 0;
666 lp = passwd ? strlen (passwd) : 0;
670 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
671 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
672 (user ? user : ""), (passwd ? ":" : ""),
673 (passwd ? passwd : ""), (user ? "@" : ""),
674 host, u->port, dir, *dir ? "/" : "", file); */
676 memcpy (res, proto_name, ln);
680 memcpy (res + l, user, lu);
685 memcpy (res + l, passwd, lp);
690 memcpy (res + l, host, lh);
693 long_to_string (res + l, (long)u->port);
694 l += numdigit (u->port);
696 memcpy (res + l, dir, ld);
700 strcpy (res + l, file);
709 /* Check whether two URL-s are equivalent, i.e. pointing to the same
710 location. Uses parseurl to parse them, and compares the canonical
713 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
714 return 0 on error. */
716 url_equal (const char *url1, const char *url2)
718 struct urlinfo *u1, *u2;
/* Parse both URLs non-strictly, then compare the canonical u->url
   strings; error-check and freeurl cleanup lines are absent from this
   listing.  */
723 err = parseurl (url1, u1, 0);
730 err = parseurl (url2, u2, 0);
736 res = !strcmp (u1->url, u2->url);
742 /* Find URL of format scheme:hostname[:port]/dir in a buffer. The
743 buffer may contain pretty much anything; no errors are signaled. */
745 findurl (const char *buf, int howmuch, int *count)
/* Slide a window over the buffer and try every recognized scheme at
   every offset.  */
750 for (s1 = buf; howmuch; s1++, howmuch--)
751 for (prot = protostrings; *prot; prot++)
752 if (howmuch <= strlen (*prot))
754 else if (!strncasecmp (*prot, s1, strlen (*prot)))
/* On a scheme match, extend the URL over printable non-space bytes
   until a separator; *COUNT receives its length.  The return
   statements are on source lines absent from this listing.  */
756 for (s2 = s1, *count = 0;
757 howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
758 !strchr (URL_SEPARATOR, *s2);
759 s2++, (*count)++, howmuch--);
765 /* Scans the file for signs of URL-s. Returns a vector of pointers,
766 each pointer representing a URL string. The file is *not* assumed
769 get_urls_file (const char *file)
776 urlpos *first, *current, *old;
/* HYPHENP presumably tests for the "-" stdin convention; a real file
   name is opened, otherwise stdin is used (fallback lines absent from
   this listing).  */
778 if (file && !HYPHENP (file))
780 fp = fopen (file, "rb");
783 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
790 load_file (fp, &buf, &nread);
791 if (file && !HYPHENP (file))
793 DEBUGP (("Loaded %s (size %ld).\n", file, nread));
794 first = current = NULL;
795 /* Fill the linked list with URLs. */
/* findurl() advances PBUF past each hit; SIZE is the URL length.  */
796 for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
799 /* Allocate the space. */
801 current = (urlpos *)xmalloc (sizeof (urlpos));
804 memset (current, 0, sizeof (*current));
805 current->next = NULL;
806 current->url = (char *)xmalloc (size + 1);
807 memcpy (current->url, pbuf, size);
808 current->url[size] = '\0';
812 /* Free the buffer. */
818 /* Similar to get_urls_file, but for HTML files. FILE is scanned as
819 an HTML document using htmlfindurl(), which see. get_urls_html()
820 constructs the HTML-s from the relative href-s.
822 If SILENT is non-zero, do not barf on baseless relative links. */
824 get_urls_html (const char *file, const char *this_url, int silent)
830 int step, first_time;
831 urlpos *first, *current, *old;
833 if (file && !HYPHENP (file))
835 fp = fopen (file, "rb");
838 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
845 load_file (fp, &orig_buf, &nread);
846 if (file && !HYPHENP (file))
848 DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
849 first = current = NULL;
851 /* Iterate over the URLs in BUF, picked by htmlfindurl(). */
853 (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time));
858 const char *pbuf = buf;
864 /* A frequent phenomenon that needs to be handled are pages
865 generated by brain-damaged HTML generators, which refer to to
866 URI-s as <a href="<spaces>URI<spaces>">. We simply ignore
867 any spaces at the beginning or at the end of the string.
868 This is probably not strictly correct, but that's what the
869 browsers do, so we may follow. May the authors of "WYSIWYG"
870 HTML tools burn in hell for the damage they've inflicted! */
871 while ((pbuf < buf + step) && ISSPACE (*pbuf))
876 while (size && ISSPACE (pbuf[size - 1]))
/* Determine whether the (trimmed) link carries any recognized scheme
   prefix; absence of one means a relative link (no_proto).  */
881 for (i = 0; protostrings[i]; i++)
883 if (!strncasecmp (protostrings[i], pbuf,
884 MINVAL (strlen (protostrings[i]), size)))
887 /* Check for http:RELATIVE_URI. See below for details. */
889 && !(strncasecmp (pbuf, "http:", 5) == 0
890 && strncasecmp (pbuf, "http://", 7) != 0))
897 /* This is for extremely brain-damaged pages that refer to
898 relative URI-s as <a href="http:URL">. Just strip off the
899 silly leading "http:" (as well as any leading blanks
901 if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
902 pbuf += 5, size -= 5;
/* For absolute links, keep only schemes Wget can actually fetch.  */
906 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
908 if (!strncasecmp (sup_protos[i].name, pbuf,
909 MINVAL (strlen (sup_protos[i].name), size)))
912 /* Do *not* accept a non-supported protocol. */
913 if (i == ARRAY_SIZE (sup_protos))
918 /* First, construct the base, which can be relative itself.
920 Criteria for creating the base are:
921 1) html_base created by <base href="...">
923 3) base provided from the command line */
924 cbase = html_base ();
928 cbase = opt.base_href;
929 if (!cbase) /* Error condition -- a baseless
932 if (!opt.quiet && !silent)
934 /* Use malloc, not alloca because this is called in
936 char *temp = (char *)malloc (size + 1);
937 strncpy (temp, pbuf, size);
939 logprintf (LOG_NOTQUIET,
940 _("Error (%s): Link %s without a base provided.\n"),
947 base = construct (this_url, cbase, strlen (cbase),
951 /* Base must now be absolute, with host name and
953 if (!has_proto (cbase))
955 logprintf (LOG_NOTQUIET, _("\
956 Error (%s): Base %s relative, without referer URL.\n"),
960 base = xstrdup (cbase);
/* Resolve the (possibly relative) link against the base.  */
962 constr = construct (base, pbuf, size, no_proto);
967 constr = (char *)xmalloc (size + 1);
968 strncpy (constr, pbuf, size);
978 /* Use malloc, not alloca because this is called in a loop. */
979 tmp = (char *)xmalloc (size + 1);
980 strncpy (tmp, pbuf, size);
982 logprintf (LOG_ALWAYS,
983 "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
984 file, this_url ? this_url : "(null)",
985 tmp2 ? tmp2 : "(null)", tmp, constr);
990 /* Allocate the space. */
992 current = (urlpos *)xmalloc (sizeof (urlpos));
997 /* Fill the values. */
998 memset (current, 0, sizeof (*current));
999 current->next = NULL;
1000 current->url = constr;
1001 current->size = size;
1002 current->pos = pbuf - orig_buf;
1003 /* A URL is relative if the host and protocol are not named,
1004 and the name does not start with `/'. */
1005 if (no_proto && *pbuf != '/')
1006 current->flags |= (URELATIVE | UNOPROTO);
1008 current->flags |= UNOPROTO;
1015 /* Free the linked list of urlpos. */
1017 free_urlpos (urlpos *l)
/* Save the successor before the node is released (the frees of
   l->url and the node itself are on lines absent from this listing).  */
1021 urlpos *next = l->next;
1023 FREE_MAYBE (l->local_name);
1029 /* Rotate FNAME opt.backups times */
1031 rotate_backups(const char *fname)
/* Room for "<fname>.<backups>\0"; numdigit() presumably returns the
   decimal digit count.  */
1033 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1034 char *from = (char *)alloca (maxlen);
1035 char *to = (char *)alloca (maxlen);
/* Only rotate regular files; other types are left alone.  */
1039 if (stat (fname, &sb) == 0)
1040 if (S_ISREG (sb.st_mode) == 0)
/* Shift fname.(i-1) -> fname.i from the oldest down, then fname ->
   fname.1 (the rename calls sit on lines absent from this listing).  */
1043 for (i = opt.backups; i > 1; i--)
1045 sprintf (from, "%s.%d", fname, i - 1);
1046 sprintf (to, "%s.%d", fname, i);
1047 /* #### This will fail on machines without the rename() system
1052 sprintf (to, "%s.%d", fname, 1);
1056 /* Create all the necessary directories for PATH (a file). Calls
1057 mkdirhier() internally. */
1059 mkalldirs (const char *path)
/* Walk back from the end of PATH to the last '/' to isolate the
   directory component.  */
1066 p = path + strlen (path);
1067 for (; *p != '/' && p != path; p--);
1068 /* Don't create if it's just a file. */
1069 if ((p == path) && (*p != '/'))
1071 t = strdupdelim (path, p);
1072 /* Check whether the directory exists. */
1073 if ((stat (t, &st) == 0))
1075 if (S_ISDIR (st.st_mode))
1082 /* If the dir exists as a file name, remove it first. This
1083 is *only* for Wget to work with buggy old CERN http
1084 servers. Here is the scenario: When Wget tries to
1085 retrieve a directory without a slash, e.g.
1086 http://foo/bar (bar being a directory), CERN server will
1087 not redirect it too http://foo/bar/ -- it will generate a
1088 directory listing containing links to bar/file1,
1089 bar/file2, etc. Wget will lose because it saves this
1090 HTML listing to a file `bar', so it cannot create the
1091 directory. To work around this, if the file of the same
1092 name exists, we just remove it and create the directory
1094 DEBUGP (("Removing %s because of directory danger!\n", t));
1098 res = make_directory (t);
1100 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Count the '/' characters in S (body on lines absent from this
   listing; inferred from the name and its use in mkstruct below --
   confirm in the full source).  */
1106 count_slashes (const char *s)
1115 /* Return the path name of the URL-equivalent file name, with a
1116 remote-like structure of directories. */
1118 mkstruct (const struct urlinfo *u)
1120 char *host, *dir, *file, *res, *dirpref;
1123 assert (u->dir != NULL);
1124 assert (u->host != NULL);
/* Honor --cut-dirs: skip up to opt.cut_dirs leading path components
   (the pointer stays put if the dir has fewer components).  */
1128 char *ptr = u->dir + (*u->dir == '/');
1129 int slash_count = 1 + count_slashes (ptr);
1130 int cut = MINVAL (opt.cut_dirs, slash_count);
1131 for (; cut && *ptr; ptr++)
1134 STRDUP_ALLOCA (dir, ptr);
1137 dir = u->dir + (*u->dir == '/');
1139 host = xstrdup (u->host);
1140 /* Check for the true name (or at least a consistent name for saving
1141 to directory) of HOST, reusing the hlist if possible. */
1142 if (opt.add_hostdir && !opt.simple_check)
1144 char *nhost = realhost (host);
1148 /* Add dir_prefix and hostname (if required) to the beginning of
1150 if (opt.add_hostdir)
1152 if (!DOTP (opt.dir_prefix))
1154 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1155 + strlen (host) + 1);
1156 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
1159 STRDUP_ALLOCA (dirpref, host);
1161 else /* not add_hostdir */
1163 if (!DOTP (opt.dir_prefix))
1164 dirpref = opt.dir_prefix;
1170 /* If there is a prefix, prepend it. */
1173 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1174 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1177 dir = xstrdup (dir);
1180 if (l && dir[l - 1] == '/')
/* An empty file component (URL ends in '/') maps to index.html.  */
1184 file = "index.html";
1188 /* Finally, construct the full name. */
1189 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1190 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1195 /* Create a unique filename, corresponding to a given URL. Calls
1196 mkstruct if necessary. Does *not* actually create any directories. */
1198 url_filename (const struct urlinfo *u)
1201 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
/* With directory structure enabled the name comes from mkstruct();
   otherwise just the URL's file component (or index.html).  */
1205 file = mkstruct (u);
1211 file = xstrdup ("index.html");
1213 file = xstrdup (u->file);
1218 /* Check whether the prefix directory is something other than "."
1219 before prepending it. */
1220 if (!DOTP (opt.dir_prefix))
1222 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1223 + 1 + strlen (file) + 1);
1224 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1229 /* DOS-ish file systems don't like `%' signs in them; we change it
/* Windows-only character sanitization loop (replacement char is on a
   line absent from this listing).  */
1234 for (p = file; *p; p++)
1238 #endif /* WINDOWS */
1240 /* Check the cases in which the unique extensions are not used:
1241 1) Clobbering is turned off (-nc).
1242 2) Retrieval with regetting.
1243 3) Timestamping is used.
1244 4) Hierarchy is built.
1246 The exception is the case when file does exist and is a
1247 directory (actually support for bad httpd-s). */
1248 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1249 && !(file_exists_p (file) && !file_non_directory_p (file)))
1252 /* Find a unique name. */
1253 name = unique_name (file);
1258 /* Construct an absolute URL, given a (possibly) relative one. This
1259 is more tricky than it might seem, but it works. */
1261 construct (const char *url, const char *sub, int subsize, int no_proto)
/* Relative SUB not starting with '/': replace everything after the
   last '/' of URL with SUB.  */
1271 for (i = strlen (url); i && url[i] != '/'; i--);
1272 if (!i || (url[i] == url[i - 1]))
/* URL has no usable directory part -- append a '/' first (the copy
   into T and reassignment are on lines absent from this listing).  */
1274 int l = strlen (url);
1275 char *t = (char *)alloca (l + 2);
1282 constr = (char *)xmalloc (i + 1 + subsize + 1);
1283 strncpy (constr, url, i + 1);
1284 constr[i + 1] = '\0';
1285 strncat (constr, sub, subsize);
1287 else /* *sub == `/' */
/* SUB is host-relative: keep only URL's scheme+host.  Skip to the
   end of the scheme, detect the "//" after it, then cut before the
   path.  */
1294 for (; url[i] && url[i] != '/'; i++);
1297 fl = (url[i] == url[i + 1] && url[i + 1] == '/');
1304 int l = strlen (url);
1305 char *t = (char *)alloca (l + 2);
1311 constr = (char *)xmalloc (i + 1 + subsize + 1);
1312 strncpy (constr, url, i);
1314 strncat (constr + i, sub, subsize);
1315 constr[i + subsize] = '\0';
1318 else /* !no_proto */
/* SUB is already absolute; just duplicate its SUBSIZE bytes.  */
1320 constr = (char *)xmalloc (subsize + 1);
1321 strncpy (constr, sub, subsize);
1322 constr[subsize] = '\0';
1327 /* Optimize URL by host, destructively replacing u->host with realhost
1328 (u->host). Do this regardless of opt.simple_check. */
1330 opt_url (struct urlinfo *u)
1332 /* Find the "true" host. */
1333 char *host = realhost (u->host);
1336 assert (u->dir != NULL); /* the URL must have been parsed */
1337 /* Refresh the printed representation. */
/* The old u->url is presumably freed on a line absent from this
   listing before being regenerated here.  */
1339 u->url = str_url (u, 0);
1342 /* Returns proxy host address, in accordance with PROTO. */
1344 getproxy (uerr_t proto)
/* Command-line/wgetrc options take precedence over the conventional
   lowercase environment variables.  */
1346 if (proto == URLHTTP)
1347 return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1348 else if (proto == URLFTP)
1349 return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1354 /* Should a host be accessed through proxy, concerning no_proxy? */
1356 no_proxy_match (const char *host, const char **no_proxy)
/* sufmatch() presumably does suffix matching of HOST against the
   no_proxy domain list; a match means "do not use the proxy".  */
1361 return !sufmatch (no_proxy, host);
1364 /* Change the links in an HTML document. Accepts a structure that
1365 defines the positions of all the links. */
1367 convert_links (const char *file, urlpos *l)
1373 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1374 /* Read from the file.... */
1375 fp = fopen (file, "rb");
1378 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1379 file, strerror (errno));
1382 /* ...to a buffer. */
1383 load_file (fp, &buf, &size);
1385 if (opt.backup_converted && downloaded_file(CHECK_FOR_FILE, file))
1386 /* Rather than just writing over the original .html file with the converted
1387 version, save the former to *.orig. Note we only do this for files we've
1388 _successfully_ downloaded, so we don't clobber .orig files sitting around
1389 from previous invocations. */
1391 /* Construct the backup filename as the original name plus ".orig". */
1392 size_t filename_len = strlen(file);
1393 char* filename_plus_orig_suffix = malloc(filename_len +
1395 boolean already_wrote_backup_file = FALSE;
1396 slist* converted_file_ptr;
1397 static slist* converted_files = NULL;
1399 /* Would a single s[n]printf() call be faster? */
1400 strcpy(filename_plus_orig_suffix, file);
1401 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1403 /* We can get called twice on the same URL thanks to the
1404 convert_all_links() call in main(). If we write the .orig file each
1405 time in such a case, it'll end up containing the first-pass conversion,
1406 not the original file. So, see if we've already been called on this
/* Linear scan of the remembered-conversions list for FILE.  */
1408 converted_file_ptr = converted_files;
1409 while (converted_file_ptr != NULL)
1410 if (strcmp(converted_file_ptr->string, file) == 0)
1412 already_wrote_backup_file = TRUE;
1416 converted_file_ptr = converted_file_ptr->next;
1418 if (!already_wrote_backup_file)
1420 /* Rename <file> to <file>.orig before former gets written over. */
1421 if (rename(file, filename_plus_orig_suffix) != 0)
1422 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1423 file, filename_plus_orig_suffix, strerror (errno));
1425 /* Remember that we've already written a .orig backup for this file.
1426 Note that we never free this memory since we need it till the
1427 convert_all_links() call, which is one of the last things the
1428 program does before terminating. BTW, I'm not sure if it would be
1429 safe to just set 'converted_file_ptr->string' to 'file' below,
1430 rather than making a copy of the string... Another note is that I
1431 thought I could just add a field to the urlpos structure saying
1432 that we'd written a .orig file for this URL, but that didn't work,
1433 so I had to make this separate list. */
1434 converted_file_ptr = malloc(sizeof(slist));
1435 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1436 converted_file_ptr->next = converted_files;
1437 converted_files = converted_file_ptr;
1440 free(filename_plus_orig_suffix);
1442 /* Now open the file for writing. */
1443 fp = fopen (file, "wb");
1446 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1447 file, strerror (errno));
1451 /* [If someone understands why multiple URLs can correspond to one local file,
1452 can they please add a comment here...?] */
/* Replay the buffer to FP, rewriting each link at its recorded
   position (l->pos is a byte offset into the loaded buffer).  */
1453 for (p = buf; l; l = l->next)
1457 DEBUGP (("Something strange is going on. Please investigate."));
1460 /* If the URL already is relative or it is not to be converted
1461 for some other reason (e.g. because of not having been
1462 downloaded in the first place), skip it. */
1463 if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
1465 DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
1469 /* Else, reach the position of the offending URL, echoing
1470 everything up to it to the outfile. */
1471 for (p2 = buf + l->pos; p < p2; p++)
1473 if (l->flags & UABS2REL)
1475 char *newname = construct_relative (file, l->local_name);
1476 fprintf (fp, "%s", newname);
1477 DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
1478 l->url, newname, l->pos, file));
/* Copy the remainder of the buffer after the last converted link. */
1485 for (p2 = buf + size; p < p2; p++)
1490 logputs (LOG_VERBOSE, _("done.\n"));
1493 /* Construct and return a malloced copy of the relative link from two
1494 pieces of information: local name S1 of the referring file and
1495 local name S2 of the referred file.
1497 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1498 "jagor.srce.hr/images/news.gif", the function will return
1501 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1502 "fly.cc.fer.hr/images/fly.gif", the function will return
1503 "../images/fly.gif".
1505 Caveats: S1 should not begin with `/', unless S2 also begins with
1506 '/'. S1 should not contain things like ".." and such --
1507 construct_relative ("fly/ioccc/../index.html",
1508 "fly/images/fly.gif") will fail. (A workaround is to call
1509 something like path_simplify() on S1). */
1511 construct_relative (const char *s1, const char *s2)
1513 int i, cnt, sepdirs1;
/* An absolute S2 needs no relativization (the '/' test precedes this
   on a line absent from the listing).  */
1517 return xstrdup (s2);
1518 /* S1 should *not* be absolute, if S2 wasn't. */
1519 assert (*s1 != '/');
1521 /* Skip the directories common to both strings. */
/* CNT remembers the index just past the last common '/'.  */
1524 while (s1[i] && s2[i]
1529 if (s1[i] == '/' && s2[i] == '/')
/* Count the directory separators remaining in S1 after the common
   prefix -- each one becomes a "../" in the result.  */
1534 for (sepdirs1 = 0; s1[i]; i++)
1537 /* Now, construct the file as of:
1538 - ../ repeated sepdirs1 time
1539 - all the non-mutual directories of S2. */
1540 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1541 for (i = 0; i < sepdirs1; i++)
1542 memcpy (res + 3 * i, "../", 3);
1543 strcpy (res + 3 * i, s2 + cnt);
1547 /* Add URL to the head of the list L. */
1549 add_url (urlpos *l, const char *url, const char *file)
1553 t = (urlpos *)xmalloc (sizeof (urlpos));
1554 memset (t, 0, sizeof (*t));
1555 t->url = xstrdup (url);
1556 t->local_name = xstrdup (file);
/* The new node is linked ahead of L and returned on lines absent
   from this listing.  */
1562 /* Remembers which files have been downloaded. Should be called with
1563 add_or_check == ADD_FILE for each file we actually download successfully
1564 (i.e. not for ones we have failures on or that we skip due to -N). If you
1565 just want to check if a file has been previously added without adding it,
1566 call with add_or_check == CHECK_FOR_FILE. Please be sure to call this
1567 function with local filenames, not remote URLs -- by some means that isn't
1568 commented well enough for me understand, multiple remote URLs can apparently
1569 correspond to a single local file. */
1571 downloaded_file (downloaded_file_t add_or_check, const char* file)
1573 boolean found_file = FALSE;
/* Process-lifetime registry of downloaded local filenames; never
   freed (see the analogous list in convert_links).  */
1574 static slist* downloaded_files = NULL;
1575 slist* rover = downloaded_files;
1577 while (rover != NULL)
1578 if (strcmp(rover->string, file) == 0)
1584 rover = rover->next;
1587 return TRUE; /* file had already been downloaded */
/* Not found: record it when asked to ADD, pushing onto the list head. */
1590 if (add_or_check == ADD_FILE)
1592 rover = malloc(sizeof(slist));
1593 rover->string = xstrdup(file); /* die on out-of-mem. */
1594 rover->next = downloaded_files;
1595 downloaded_files = rover;
1598 return FALSE; /* file had not already been downloaded */