2 Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
/* NOTE(review): this file is a sparse line-sampled listing of the original
   source; the leading integer on each line is the original line number, and
   the gaps between them mean every definition below is incomplete here. */
47 /* Default port definitions */
48 #define DEFAULT_HTTP_PORT 80
49 #define DEFAULT_FTP_PORT 21
/* Characters that terminate a URL candidate when scanning raw buffers
   (used by findurl below). */
51 /* URL separator (for findurl) */
52 #define URL_SEPARATOR "!\"#'(),>`{}|<>"
54 /* A list of unsafe characters for encoding, as per RFC1738. '@' and
55 ':' (not listed in RFC) were added because of user/password
/* Two alternative unsafe-character sets follow; the preprocessor
   conditional selecting between them is among the omitted lines --
   presumably platform-dependent (TODO confirm against full source). */
59 # define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
61 # define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
/* True for control chars and space (<= 32), non-printable-ASCII (> '~'),
   or anything listed in URL_UNSAFE_CHARS.  NOTE(review): the argument is
   evaluated more than once, so avoid side-effecting expressions here. */
64 #define UNSAFE_CHAR(c) ( ((unsigned char)(c) <= ' ') /* ASCII 32 */ \
65 || ((unsigned char)(c) > '~') /* ASCII 127 */ \
66 || strchr (URL_UNSAFE_CHARS, c))
68 /* If S contains unsafe characters, free it and replace it with a
69 version that doesn't. */
/* Fragment: the tail of this do-while macro (freeing S and assigning
   uc_tmp back) falls on omitted lines. */
70 #define URL_CLEANSE(s) do \
72 if (contains_unsafe (s)) \
74 char *uc_tmp = encode_string (s); \
80 /* Is a directory "."? */
81 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
82 /* Is a directory ".."? */
83 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Forward declarations for file-local helpers (old-style PARAMS macro
   keeps K&R compilers happy). */
86 static void path_simplify_with_kludge PARAMS ((char *));
88 static int urlpath_length PARAMS ((const char *));
90 /* NULL-terminated list of strings to be recognized as prototypes (URL
91 schemes). Note that recognized doesn't mean supported -- only HTTP
92 and FTP are currently supported.
94 However, a string that does not match anything in the list will be
95 considered a relative URL. Thus it's important that this list has
96 anything anyone could think of being legal.
98 There are wild things here. :-) Take a look at
99 <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
/* Fragment: the actual initializer list of scheme strings (original
   lines between 101 and 143) is omitted from this listing. */
101 static char *protostrings[] =
143 /* Similar to former, but for supported protocols: */
/* Table mapping supported scheme prefixes to their proto enum value and
   default port; searched linearly by urlproto(), parseurl(), str_url(). */
144 static struct proto sup_protos[] =
146 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
147 { "ftp://", URLFTP, DEFAULT_FTP_PORT },
148 /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
151 static void parse_dir PARAMS ((const char *, char **, char **));
152 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
153 static char *construct PARAMS ((const char *, const char *, int , int));
154 static char *construct_relative PARAMS ((const char *, const char *));
155 static char process_ftp_type PARAMS ((char *));
158 /* Returns the number of characters to be skipped if the first thing
159 in a URL is URL: (which is 0 or 4+). The optional spaces after
160 URL: are also skipped. */
/* Fragment: return type, opening brace and the fourth character check
   ("url[3] == ':'") fall on omitted lines. */
162 skip_url (const char *url)
166 if (TOUPPER (url[0]) == 'U'
167 && TOUPPER (url[1]) == 'R'
168 && TOUPPER (url[2]) == 'L'
172 for (i = 4; url[i] && ISSPACE (url[i]); i++);
179 /* Returns 1 if the string contains unsafe characters, 0 otherwise. */
181 contains_unsafe (const char *s)
184 if (UNSAFE_CHAR (*s))
189 /* Decodes the forms %xy in a URL to the character the hexadecimal
190 code of which is xy. xy are hexadecimal digits from
191 [0123456789ABCDEF] (case-insensitive). If x or y are not
192 hex-digits or `%' precedes `\0', the sequence is inserted
/* In-place decoder: writes through a cursor p that trails s, so the
   result can only shrink.  Loop framework is on omitted lines. */
196 decode_string (char *s)
206 /* Do nothing if at the end of the string, or if the chars
207 are not hex-digits. */
208 if (!*(s + 1) || !*(s + 2)
209 || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
214 *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
221 /* Encode the unsafe characters (as determined by URL_UNSAFE) in a
222 given string, returning a malloc-ed %XX encoded string. */
/* Two-pass: first pass sizes the result (each unsafe char costs two
   extra hex digits), second pass writes it.  NOTE(review): the line
   rewinding `s' between the two passes is among the omitted lines --
   confirm in the full source. */
224 encode_string (const char *s)
231 for (i = 0; *s; s++, i++)
232 if (UNSAFE_CHAR (*s))
233 i += 2; /* Two more characters (hex digits) */
234 res = (char *)xmalloc (i + 1);
236 for (p = res; *s; s++)
237 if (UNSAFE_CHAR (*s))
239 const unsigned char c = *s;
241 *p++ = HEXD2ASC (c >> 4);
242 *p++ = HEXD2ASC (c & 0xf);
250 /* Returns the proto-type if URL's protocol is supported, or
251 URLUNKNOWN if not. */
/* First tries the sup_protos table; otherwise inspects a trailing
   ":<digits>" to guess.  Several branches fall on omitted lines. */
253 urlproto (const char *url)
257 url += skip_url (url);
258 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
259 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
260 return sup_protos[i].ind;
261 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
264 for (++i; url[i] && url[i] != '/'; i++)
265 if (!ISDIGIT (url[i]))
267 if (url[i - 1] == ':')
276 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
277 part is found, returns 0. */
/* Fragment: the returned length accumulation and the "//" skipping live
   on omitted lines. */
279 skip_proto (const char *url)
284 for (s = protostrings; *s; s++)
285 if (!strncasecmp (*s, url, strlen (*s)))
290 /* HTTP and FTP protocols are expected to yield exact host names
291 (i.e. the `//' part must be skipped, too). */
292 if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
297 /* Returns 1 if the URL begins with a protocol (supported or
298 unsupported), 0 otherwise. */
300 has_proto (const char *url)
304 url += skip_url (url);
305 for (s = protostrings; *s; s++)
306 if (strncasecmp (url, *s, strlen (*s)) == 0)
311 /* Skip the username and password, if present here. The function
312 should be called *not* with the complete URL, but with the part
313 right after the protocol.
315 If no username and password are found, return 0. */
317 skip_uname (const char *url)
320 for (p = url; *p && *p != '/'; p++)
323 /* If a `@' was found before the first occurrence of `/', skip
331 /* Allocate a new urlinfo structure, fill it with default values and
332 return a pointer to it. */
/* Constructor for urlinfo: zero-filled, proto defaulted to URLUNKNOWN. */
338 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
339 memset (u, 0, sizeof (*u));
340 u->proto = URLUNKNOWN;
344 /* Perform a "deep" free of the urlinfo structure. The structure
345 should have been created with newurl, but need not have been used.
346 If free_pointer is non-0, free the pointer itself. */
/* NOTE(review): the comment above says `free_pointer' but the parameter
   is named `complete' -- the doc comment is stale; align the two. */
348 freeurl (struct urlinfo *u, int complete)
352 FREE_MAYBE (u->host);
353 FREE_MAYBE (u->path);
354 FREE_MAYBE (u->file);
356 FREE_MAYBE (u->user);
357 FREE_MAYBE (u->passwd);
358 FREE_MAYBE (u->local);
359 FREE_MAYBE (u->referer);
/* Recursively frees an attached proxy urlinfo, always releasing the
   pointer itself (second argument 1). */
361 freeurl (u->proxy, 1);
367 /* Extract the given URL of the form
368 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
369 1. hostname (terminated with `/' or `:')
370 2. port number (terminated with `/'), or chosen for the protocol
371 3. dirname (everything after hostname)
372 Most errors are handled. No allocation is done, you must supply
373 pointers to allocated memory.
374 ...and a host of other stuff :-)
376 - Recognizes hostname:dir/file for FTP and
377 hostname (:portnum)?/dir/file for HTTP.
378 - Parses the path to yield directory and file
379 - Parses the URL to yield the username and passwd (if present)
380 - Decodes the strings, in case they contain "forbidden" characters
381 - Writes the result to struct urlinfo
383 If the argument STRICT is set, it recognizes only the canonical
/* Fragment: return type, braces, several error returns and local
   declarations fall on omitted lines. */
386 parseurl (const char *url, struct urlinfo *u, int strict)
389 int recognizable; /* Recognizable URL is the one where
390 the protocol name was explicitly
391 named, i.e. it wasn't deduced from
395 DEBUGP (("parseurl (\"%s\") -> ", url));
396 url += skip_url (url);
397 recognizable = has_proto (url);
398 if (strict && !recognizable)
/* Match the scheme against the supported-protocol table; l ends up as
   the length of the matched prefix (0 if none). */
400 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
402 l = strlen (sup_protos[i].name);
403 if (!strncasecmp (sup_protos[i].name, url, l))
406 /* If protocol is recognizable, but unsupported, bail out, else
408 if (recognizable && i == ARRAY_SIZE (sup_protos))
410 else if (i == ARRAY_SIZE (sup_protos))
413 u->proto = type = sup_protos[i].ind;
415 if (type == URLUNKNOWN)
417 /* Allow a username and password to be specified (i.e. just skip
420 l += skip_uname (url + l);
421 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
424 /* Get the hostname. */
425 u->host = strdupdelim (url + l, url + i);
426 DEBUGP (("host %s -> ", u->host));
428 /* Assume no port has been given. */
432 /* We have a colon delimiting the hostname. It could mean that
433 a port number is following it, or a directory. */
434 if (ISDIGIT (url[++i])) /* A port number */
436 if (type == URLUNKNOWN)
437 u->proto = type = URLHTTP;
438 for (; url[i] && url[i] != '/'; i++)
439 if (ISDIGIT (url[i]))
440 u->port = 10 * u->port + (url[i] - '0');
445 DEBUGP (("port %hu -> ", u->port));
447 else if (type == URLUNKNOWN) /* or a directory */
448 u->proto = type = URLFTP;
449 else /* or just a misformed port number */
452 else if (type == URLUNKNOWN)
453 u->proto = type = URLHTTP;
/* No explicit port: fall back to the table's default for this proto. */
457 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
458 if (sup_protos[i].ind == type)
460 if (i == ARRAY_SIZE (sup_protos))
462 u->port = sup_protos[i].port;
464 /* Some delimiter troubles... */
465 if (url[i] == '/' && url[i - 1] != ':')
468 while (url[i] && url[i] == '/')
/* +8 slack presumably covers the later "%2F"/"/" rebuild below --
   TODO confirm the bound against the full source. */
470 u->path = (char *)xmalloc (strlen (url + i) + 8);
471 strcpy (u->path, url + i);
474 u->ftp_type = process_ftp_type (u->path);
475 /* #### We don't handle type `d' correctly yet. */
476 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
479 DEBUGP (("opath %s -> ", u->path));
480 /* Parse the username and password (if existing). */
481 parse_uname (url, &u->user, &u->passwd);
482 /* Decode the strings, as per RFC 1738. */
483 decode_string (u->host);
484 decode_string (u->path);
486 decode_string (u->user);
488 decode_string (u->passwd);
489 /* Parse the directory. */
490 parse_dir (u->path, &u->dir, &u->file);
491 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
492 /* Simplify the directory. */
493 path_simplify (u->dir);
494 /* Remove the leading `/' in HTTP. */
495 if (type == URLHTTP && *u->dir == '/')
/* NOTE(review): strcpy with overlapping source/destination is undefined
   behavior per the C standard; memmove would be the safe choice here. */
496 strcpy (u->dir, u->dir + 1);
497 DEBUGP (("ndir %s\n", u->dir));
498 /* Strip trailing `/'. */
500 if (l && u->dir[l - 1] == '/')
501 u->dir[l - 1] = '\0';
502 /* Re-create the path: */
503 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
504 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
505 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
506 strcpy (u->path, abs_ftp ? "%2F" : "/");
507 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
508 strcat (u->path, *u->dir ? "/" : "");
509 strcat (u->path, u->file);
510 URL_CLEANSE (u->path);
511 DEBUGP (("newpath: %s\n", u->path));
512 /* Create the clean URL. */
513 u->url = str_url (u, 0);
517 /* Special versions of DOTP and DDOTP for parse_dir(). */
/* Like DOTP, but also treats "." followed by a '?' query as a dot
   component. */
519 #define PD_DOTP(x) ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
/* Like DDOTP, but also treats ".." followed by a '?' query as a
   dot-dot component.
   FIX(review): the second conjunct previously re-tested *(x) == '.'
   instead of the following character, so "a?" would have matched and
   ".." alone would decide on the first char twice; aligned with the
   sibling DDOTP macro, which checks *(x + 1) == '.'. */
520 #define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
521 && (!*((x) + 2) || *((x) + 2) == '?'))
523 /* Build the directory and filename components of the path. Both
524 components are *separately* malloc-ed strings! It does not change
525 the contents of path.
527 If the path ends with "." or "..", they are (correctly) counted as
/* Fragment: braces and the early-return plumbing fall on omitted
   lines; three cases are visible: bare filename, "/filename", and a
   nonempty directory. */
530 parse_dir (const char *path, char **dir, char **file)
534 l = urlpath_length (path);
535 for (i = l; i && path[i] != '/'; i--);
537 if (!i && *path != '/') /* Just filename */
539 if (PD_DOTP (path) || PD_DDOTP (path))
/* Trailing "." / ".." counts as directory; file keeps any "?query". */
541 *dir = strdupdelim (path, path + l);
542 *file = xstrdup (path + l); /* normally empty, but could
547 *dir = xstrdup (""); /* This is required because of FTP */
548 *file = xstrdup (path);
551 else if (!i) /* /filename */
553 if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
555 *dir = strdupdelim (path, path + l);
556 *file = xstrdup (path + l); /* normally empty, but could
561 *dir = xstrdup ("/");
562 *file = xstrdup (path + 1);
565 else /* Nonempty directory with or without a filename */
567 if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
569 *dir = strdupdelim (path, path + l);
570 *file = xstrdup (path + l); /* normally empty, but could
575 *dir = strdupdelim (path, path + i);
576 *file = xstrdup (path + i + 1);
581 /* Find the optional username and password within the URL, as per
582 RFC1738. The returned user and passwd char pointers are
/* Fragment: initialization of *user/*passwd to NULL and the early
   returns are on omitted lines. */
585 parse_uname (const char *url, char **user, char **passwd)
593 url += skip_url (url);
594 /* Look for end of protocol string. */
595 l = skip_proto (url);
598 /* Add protocol offset. */
600 /* Is there an `@' character? */
601 for (p = url; *p && *p != '/'; p++)
604 /* If not, return. */
607 /* Else find the username and password. */
/* col trails the ':' separator (if any); everything before it is the
   user, everything after it up to '@' is the password. */
608 for (p = col = url; *p != '@'; p++)
610 if (*p == ':' && !*user)
612 *user = (char *)xmalloc (p - url + 1);
613 memcpy (*user, url, p - url);
614 (*user)[p - url] = '\0';
618 /* Decide whether you have only the username or both. */
619 where = *user ? passwd : user;
620 *where = (char *)xmalloc (p - col + 1);
621 memcpy (*where, col, p - col);
622 (*where)[p - col] = '\0';
626 /* If PATH ends with `;type=X', return the character X. */
/* Destructive: truncates the ";type=X" suffix off PATH and returns X.
   The length guard (len >= 7 presumably) is on an omitted line --
   confirm before relying on the memcmp bound below. */
628 process_ftp_type (char *path)
630 int len = strlen (path);
633 && !memcmp (path + len - 7, ";type=", 6))
635 path[len - 7] = '\0';
636 return path[len - 1];
642 /* Return the URL as fine-formed string, with a proper protocol,
643 optional port number, directory and optional user/password. If
644 HIDE is non-zero, password will be hidden. The forbidden
645 characters in the URL will be cleansed. */
/* Fragment: error return, the hide-password overwrite body, length
   computations for lh/ld/lf and several separators fall on omitted
   lines. */
647 str_url (const struct urlinfo *u, int hide)
649 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
650 int i, l, ln, lu, lh, lp, lf, ld;
651 unsigned short proto_default_port;
653 /* Look for the protocol name. */
654 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
655 if (sup_protos[i].ind == u->proto)
657 if (i == ARRAY_SIZE (sup_protos))
659 proto_name = sup_protos[i].name;
660 proto_default_port = sup_protos[i].port;
/* Each component is duplicated with unsafe chars %-escaped. */
661 host = CLEANDUP (u->host);
662 dir = CLEANDUP (u->dir);
663 file = CLEANDUP (u->file);
664 user = passwd = NULL;
666 user = CLEANDUP (u->user);
670 passwd = CLEANDUP (u->passwd);
/* With HIDE set, the password characters are presumably overwritten
   (body on omitted lines). */
672 for (i = 0; passwd[i]; i++)
675 if (u->proto == URLFTP && *dir == '/')
/* Absolute FTP dir: re-prefix with the literal "%2F" escape. */
677 char *tmp = (char *)xmalloc (strlen (dir) + 3);
678 /*sprintf (tmp, "%%2F%s", dir + 1);*/
682 strcpy (tmp + 3, dir + 1);
687 ln = strlen (proto_name);
688 lu = user ? strlen (user) : 0;
689 lp = passwd ? strlen (passwd) : 0;
693 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
694 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
695 (user ? user : ""), (passwd ? ":" : ""),
696 (passwd ? passwd : ""), (user ? "@" : ""),
697 host, u->port, dir, *dir ? "/" : "", file); */
699 memcpy (res, proto_name, ln);
703 memcpy (res + l, user, lu);
708 memcpy (res + l, passwd, lp);
713 memcpy (res + l, host, lh);
/* Only print a port when it differs from the scheme's default. */
715 if (u->port != proto_default_port)
718 long_to_string (res + l, (long)u->port);
719 l += numdigit (u->port);
722 memcpy (res + l, dir, ld);
726 strcpy (res + l, file);
735 /* Check whether two URL-s are equivalent, i.e. pointing to the same
736 location. Uses parseurl to parse them, and compares the canonical
739 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
740 return 0 on error. */
/* Fragment: allocation of u1/u2 and the error/cleanup paths are on
   omitted lines. */
742 url_equal (const char *url1, const char *url2)
744 struct urlinfo *u1, *u2;
749 err = parseurl (url1, u1, 0);
756 err = parseurl (url2, u2, 0);
/* Equivalence == byte-identical canonical forms. */
762 res = !strcmp (u1->url, u2->url);
768 /* Find URL of format scheme:hostname[:port]/dir in a buffer. The
769 buffer may contain pretty much anything; no errors are signaled. */
771 findurl (const char *buf, int howmuch, int *count)
776 for (s1 = buf; howmuch; s1++, howmuch--)
777 for (prot = protostrings; *prot; prot++)
/* NOTE(review): `howmuch' is int and strlen() returns size_t, so this
   comparison promotes to unsigned; harmless while howmuch >= 0, but
   worth an explicit cast in the full source. */
778 if (howmuch <= strlen (*prot))
780 else if (!strncasecmp (*prot, s1, strlen (*prot)))
/* Extend the match until a separator, space, or non-printable byte. */
782 for (s2 = s1, *count = 0;
783 howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
784 !strchr (URL_SEPARATOR, *s2);
785 s2++, (*count)++, howmuch--);
791 /* Scans the file for signs of URL-s. Returns a vector of pointers,
792 each pointer representing a URL string. The file is *not* assumed
/* Fragment: "-" means stdin (HYPHENP); fclose, list linkage and the
   loop increment are on omitted lines. */
795 get_urls_file (const char *file)
802 urlpos *first, *current, *old;
804 if (file && !HYPHENP (file))
806 fp = fopen (file, "rb");
809 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
816 load_file (fp, &buf, &nread);
817 if (file && !HYPHENP (file))
819 DEBUGP (("Loaded %s (size %ld).\n", file, nread));
820 first = current = NULL;
821 /* Fill the linked list with URLs. */
/* findurl() returns the next candidate and its length in `size'. */
822 for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
825 /* Allocate the space. */
827 current = (urlpos *)xmalloc (sizeof (urlpos));
830 memset (current, 0, sizeof (*current));
831 current->next = NULL;
832 current->url = (char *)xmalloc (size + 1);
833 memcpy (current->url, pbuf, size);
834 current->url[size] = '\0';
838 /* Free the buffer. */
844 /* Similar to get_urls_file, but for HTML files. FILE is scanned as
845 an HTML document using htmlfindurl(), which see. get_urls_html()
846 constructs the HTML-s from the relative href-s.
848 If SILENT is non-zero, do not barf on baseless relative links. */
/* Fragment: many locals, the fopen error return, the loop header and
   several frees fall on omitted lines. */
850 get_urls_html (const char *file, const char *this_url, int silent,
851 int dash_p_leaf_HTML)
857 int step, first_time;
858 urlpos *first, *current, *old;
860 if (file && !HYPHENP (file))
862 fp = fopen (file, "rb");
865 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
872 load_file (fp, &orig_buf, &nread);
873 if (file && !HYPHENP (file))
875 DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
876 first = current = NULL;
878 /* Iterate over the URLs in BUF, picked by htmlfindurl(). */
880 (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time,
886 const char *pbuf = buf;
889 char *needs_freeing, *url_data;
893 /* A frequent phenomenon that needs to be handled are pages
894 generated by brain-damaged HTML generators, which refer to to
895 URI-s as <a href="<spaces>URI<spaces>">. We simply ignore
896 any spaces at the beginning or at the end of the string.
897 This is probably not strictly correct, but that's what the
898 browsers do, so we may follow. May the authors of "WYSIWYG"
899 HTML tools burn in hell for the damage they've inflicted! */
900 while ((pbuf < buf + step) && ISSPACE (*pbuf))
905 while (size && ISSPACE (pbuf[size - 1]))
910 /* It would be nice if we could avoid allocating memory in this
911 loop, but I don't see an easy way. To process the entities,
912 we need to either copy the data, or change it destructively.
915 We have two pointers: needs_freeing and url_data, because the
916 code below does thing like url_data += <something>, and we
917 want to pass the original string to free(). */
918 needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
919 size = strlen (url_data);
/* Decide whether the link names a scheme at all (no_proto logic;
   result assignment is on omitted lines). */
921 for (i = 0; protostrings[i]; i++)
923 if (!strncasecmp (protostrings[i], url_data,
924 MINVAL (strlen (protostrings[i]), size)))
927 /* Check for http:RELATIVE_URI. See below for details. */
929 && !(strncasecmp (url_data, "http:", 5) == 0
930 && strncasecmp (url_data, "http://", 7) != 0))
937 /* This is for extremely brain-damaged pages that refer to
938 relative URI-s as <a href="http:URL">. Just strip off the
939 silly leading "http:" (as well as any leading blanks
941 if ((size > 5) && !strncasecmp ("http:", url_data, 5))
942 url_data += 5, size -= 5;
946 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
948 if (!strncasecmp (sup_protos[i].name, url_data,
949 MINVAL (strlen (sup_protos[i].name), size)))
952 /* Do *not* accept a non-supported protocol. */
953 if (i == ARRAY_SIZE (sup_protos))
955 free (needs_freeing);
961 /* First, construct the base, which can be relative itself.
963 Criteria for creating the base are:
964 1) html_base created by <base href="...">
966 3) base provided from the command line */
967 cbase = html_base ();
971 cbase = opt.base_href;
972 if (!cbase) /* Error condition -- a baseless
975 if (!opt.quiet && !silent)
977 /* Use malloc, not alloca because this is called in
/* NOTE(review): strncpy does not NUL-terminate when the source fills
   the buffer; the terminating write is presumably on an omitted line
   (original 981) -- confirm. */
979 char *temp = (char *)malloc (size + 1);
980 strncpy (temp, url_data, size);
982 logprintf (LOG_NOTQUIET,
983 _("Error (%s): Link %s without a base provided.\n"),
987 free (needs_freeing);
991 base = construct (this_url, cbase, strlen (cbase),
995 /* Base must now be absolute, with host name and
997 if (!has_proto (cbase))
999 logprintf (LOG_NOTQUIET, _("\
1000 Error (%s): Base %s relative, without referer URL.\n"),
1002 free (needs_freeing);
1005 base = xstrdup (cbase);
1007 constr = construct (base, url_data, size, no_proto);
1010 else /* has proto */
1012 constr = (char *)xmalloc (size + 1);
1013 strncpy (constr, url_data, size);
1014 constr[size] = '\0';
1022 tmp2 = html_base ();
1023 /* Use malloc, not alloca because this is called in a loop. */
1024 tmp = (char *)xmalloc (size + 1);
1025 strncpy (tmp, url_data, size);
1027 logprintf (LOG_ALWAYS,
1028 "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
1029 file, this_url ? this_url : "(null)",
1030 tmp2 ? tmp2 : "(null)", tmp, constr);
1035 /* Allocate the space. */
1037 current = (urlpos *)xmalloc (sizeof (urlpos));
1039 old->next = current;
1042 /* Fill the values. */
1043 memset (current, 0, sizeof (*current));
1044 current->next = NULL;
1045 current->url = constr;
1046 current->size = step;
1047 current->pos = buf - orig_buf;
1048 /* A URL is relative if the host and protocol are not named,
1049 and the name does not start with `/'. */
1050 if (no_proto && *url_data != '/')
1051 current->flags |= (URELATIVE | UNOPROTO);
1053 current->flags |= UNOPROTO;
1054 free (needs_freeing);
1061 /* Free the linked list of urlpos. */
/* Fragment: loop header, free of l->url/l itself and the list advance
   are on omitted lines. */
1063 free_urlpos (urlpos *l)
1067 urlpos *next = l->next;
1069 FREE_MAYBE (l->local_name);
1075 /* Rotate FNAME opt.backups times */
/* Shifts fname.1 -> fname.2 -> ... then fname -> fname.1.
   NOTE(review): alloca'd name buffers; maxlen accounts for the widest
   numeric suffix plus '.' and the NUL. */
1077 rotate_backups(const char *fname)
1079 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1080 char *from = (char *)alloca (maxlen);
1081 char *to = (char *)alloca (maxlen);
1085 if (stat (fname, &sb) == 0)
1086 if (S_ISREG (sb.st_mode) == 0)
1089 for (i = opt.backups; i > 1; i--)
1091 sprintf (from, "%s.%d", fname, i - 1);
1092 sprintf (to, "%s.%d", fname, i);
1093 /* #### This will fail on machines without the rename() system
1098 sprintf (to, "%s.%d", fname, 1);
1102 /* Create all the necessary directories for PATH (a file). Calls
1103 mkdirhier() internally. */
/* Fragment: locals, early returns and the make_directory success path
   are on omitted lines. */
1105 mkalldirs (const char *path)
/* Walk backwards to the last '/' to isolate the directory part. */
1112 p = path + strlen (path);
1113 for (; *p != '/' && p != path; p--);
1114 /* Don't create if it's just a file. */
1115 if ((p == path) && (*p != '/'))
1117 t = strdupdelim (path, p);
1118 /* Check whether the directory exists. */
1119 if ((stat (t, &st) == 0))
1121 if (S_ISDIR (st.st_mode))
1128 /* If the dir exists as a file name, remove it first. This
1129 is *only* for Wget to work with buggy old CERN http
1130 servers. Here is the scenario: When Wget tries to
1131 retrieve a directory without a slash, e.g.
1132 http://foo/bar (bar being a directory), CERN server will
1133 not redirect it too http://foo/bar/ -- it will generate a
1134 directory listing containing links to bar/file1,
1135 bar/file2, etc. Wget will lose because it saves this
1136 HTML listing to a file `bar', so it cannot create the
1137 directory. To work around this, if the file of the same
1138 name exists, we just remove it and create the directory
1140 DEBUGP (("Removing %s because of directory danger!\n", t));
1144 res = make_directory (t);
1146 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Counts '/' characters in S; body is on omitted lines. */
1152 count_slashes (const char *s)
1161 /* Return the path name of the URL-equivalent file name, with a
1162 remote-like structure of directories. */
/* Fragment: braces, the --cut-dirs conditional and some cleanup fall
   on omitted lines. */
1164 mkstruct (const struct urlinfo *u)
1166 char *host, *dir, *file, *res, *dirpref;
1169 assert (u->dir != NULL);
1170 assert (u->host != NULL);
/* Apply --cut-dirs: drop up to `cut' leading path components. */
1174 char *ptr = u->dir + (*u->dir == '/');
1175 int slash_count = 1 + count_slashes (ptr);
1176 int cut = MINVAL (opt.cut_dirs, slash_count);
1177 for (; cut && *ptr; ptr++)
1180 STRDUP_ALLOCA (dir, ptr);
1183 dir = u->dir + (*u->dir == '/');
1185 host = xstrdup (u->host);
1186 /* Check for the true name (or at least a consistent name for saving
1187 to directory) of HOST, reusing the hlist if possible. */
1188 if (opt.add_hostdir && !opt.simple_check)
1190 char *nhost = realhost (host);
1194 /* Add dir_prefix and hostname (if required) to the beginning of
1196 if (opt.add_hostdir)
1198 if (!DOTP (opt.dir_prefix))
1200 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1201 + strlen (host) + 1);
1202 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
1205 STRDUP_ALLOCA (dirpref, host);
1207 else /* not add_hostdir */
1209 if (!DOTP (opt.dir_prefix))
1210 dirpref = opt.dir_prefix;
1216 /* If there is a prefix, prepend it. */
1219 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1220 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1223 dir = xstrdup (dir);
1226 if (l && dir[l - 1] == '/')
/* Directory URLs get a default filename. */
1230 file = "index.html";
1234 /* Finally, construct the full name. */
1235 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1236 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1241 /* Create a unique filename, corresponding to a given URL. Calls
1242 mkstruct if necessary. Does *not* actually create any directories. */
/* Fragment: the dirstruct branch selection and the WINDOWS '%'
   replacement body fall on omitted lines. */
1244 url_filename (const struct urlinfo *u)
1247 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1251 file = mkstruct (u);
1257 file = xstrdup ("index.html");
1259 file = xstrdup (u->file);
1264 /* Check whether the prefix directory is something other than "."
1265 before prepending it. */
1266 if (!DOTP (opt.dir_prefix))
1268 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1269 + 1 + strlen (file) + 1);
1270 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1275 /* DOS-ish file systems don't like `%' signs in them; we change it
1280 for (p = file; *p; p++)
1284 #endif /* WINDOWS */
1286 /* Check the cases in which the unique extensions are not used:
1287 1) Clobbering is turned off (-nc).
1288 2) Retrieval with regetting.
1289 3) Timestamping is used.
1290 4) Hierarchy is built.
1292 The exception is the case when file does exist and is a
1293 directory (actually support for bad httpd-s). */
1294 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1295 && !(file_exists_p (file) && !file_non_directory_p (file)))
1298 /* Find a unique name. */
1299 name = unique_name (file);
1304 /* Like strlen(), but allow the URL to be ended with '?'. */
/* Returns length up to (not including) the first '?', or the full
   length if no query is present. */
1306 urlpath_length (const char *url)
1308 const char *q = strchr (url, '?');
1311 return strlen (url);
1314 /* Find the last occurrence of character C in the range [b, e), or
1315 NULL, if none are present. This is almost completely equivalent to
1316 { *e = '\0'; return strrchr(b); }, except that it doesn't change
1317 the contents of the string. */
/* Fragment: the backwards scan body is on omitted lines. */
1319 find_last_char (const char *b, const char *e, char c)
1327 /* Construct a URL by concatenating an absolute URL and a path, which
1328 may or may not be absolute. This tries to behave "reasonably" in
1329 all foreseeable cases. It employs little specific knowledge about
1330 protocols or URL-specific stuff -- it just works on strings. */
/* Fragment: the no_proto test and the relative-vs-absolute dispatch
   (original lines between the visible ones) are omitted. */
1332 construct (const char *url, const char *sub, int subsize, int no_proto)
1338 const char *end = url + urlpath_length (url);
1342 /* SUB is a relative URL: we need to replace everything
1343 after last slash (possibly empty) with SUB.
1345 So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
1346 our result should be "whatever/foo/qux/xyzzy". */
1347 int need_explicit_slash = 0;
1349 const char *start_insert;
1350 const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */
1353 /* No slash found at all. Append SUB to what we have,
1354 but we'll need a slash as a separator.
1356 Example: if url == "foo" and sub == "qux/xyzzy", then
1357 we cannot just append sub to url, because we'd get
1358 "fooqux/xyzzy", whereas what we want is
1361 To make sure the / gets inserted, we set
1362 need_explicit_slash to 1. We also set start_insert
1363 to end + 1, so that the length calculations work out
1364 correctly for one more (slash) character. Accessing
1365 that character is fine, since it will be the
1366 delimiter, '\0' or '?'. */
1367 /* example: "foo?..." */
1368 /* ^ ('?' gets changed to '/') */
1369 start_insert = end + 1;
1370 need_explicit_slash = 1;
1374 /* example: "whatever/foo/bar" */
1376 start_insert = last_slash + 1;
1379 span = start_insert - url;
1380 constr = (char *)xmalloc (span + subsize + 1);
1382 memcpy (constr, url, span);
1383 if (need_explicit_slash)
1384 constr[span - 1] = '/';
1386 memcpy (constr + span, sub, subsize);
1387 constr[span + subsize] = '\0';
1389 else /* *sub == `/' */
1391 /* SUB is an absolute path: we need to replace everything
1392 after (and including) the FIRST slash with SUB.
1394 So, if URL is "http://host/whatever/foo/bar", and SUB is
1395 "/qux/xyzzy", our result should be
1396 "http://host/qux/xyzzy". */
1398 const char *slash, *start_insert;
1399 const char *pos = url;
1400 int seen_slash_slash = 0;
1401 /* We're looking for the first slash, but want to ignore
1404 slash = memchr (pos, '/', end - pos);
1405 if (slash && !seen_slash_slash)
1406 if (*(slash + 1) == '/')
1409 seen_slash_slash = 1;
1413 /* At this point, SLASH is the location of the first / after
1414 "//", or the first slash altogether. START_INSERT is the
1415 pointer to the location where SUB will be inserted. When
1416 examining the last two examples, keep in mind that SUB
1419 if (!slash && !seen_slash_slash)
1420 /* example: "foo" */
1423 else if (!slash && seen_slash_slash)
1424 /* example: "http://foo" */
1427 else if (slash && !seen_slash_slash)
1428 /* example: "foo/bar" */
1431 else if (slash && seen_slash_slash)
1432 /* example: "http://something/" */
1434 start_insert = slash;
1436 span = start_insert - url;
1437 constr = (char *)xmalloc (span + subsize + 1);
1439 memcpy (constr, url, span);
1441 memcpy (constr + span, sub, subsize);
1442 constr[span + subsize] = '\0';
1445 else /* !no_proto */
/* SUB is already absolute with a scheme: just duplicate it. */
1447 constr = strdupdelim (sub, sub + subsize);
1452 /* Like the function above, but with a saner caller interface. */
1454 url_concat (const char *base_url, const char *new_url)
1456 return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));
1459 /* Optimize URL by host, destructively replacing u->host with realhost
1460 (u->host). Do this regardless of opt.simple_check. */
/* Fragment: the free of the old host string and the free of the old
   u->url are on omitted lines. */
1462 opt_url (struct urlinfo *u)
1464 /* Find the "true" host. */
1465 char *host = realhost (u->host);
1468 assert (u->dir != NULL); /* the URL must have been parsed */
1469 /* Refresh the printed representation. */
1471 u->url = str_url (u, 0);
1474 /* This beautiful kludge is fortunately not needed, as I've made
1475 parse_dir do the (almost) right thing, so that a query can never
1476 become a part of directory. */
1478 /* Call path_simplify, but make sure that the part after the
1479 question-mark, if any, is not destroyed by path_simplify's
/* Dead/disabled helper per the comment above; simplifies the path while
   preserving any trailing "?query" by moving it back after the rewrite. */
1482 path_simplify_with_kludge (char *path)
1484 char *query = strchr (path, '?');
1486 /* path_simplify also works destructively, so we also have the
1487 license to write. */
1489 path_simplify (path);
1492 char *newend = path + strlen (path);
1494 if (newend != query)
1495 memmove (newend, query, strlen (query) + 1);
1500 /* Returns proxy host address, in accordance with PROTO. */
/* Command-line proxy settings take precedence over the environment. */
1502 getproxy (uerr_t proto)
1504 if (proto == URLHTTP)
1505 return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1506 else if (proto == URLFTP)
1507 return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1512 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns 0 (skip proxy) when HOST suffix-matches an entry in the
   no_proxy list; the NULL-list guard is on an omitted line. */
1514 no_proxy_match (const char *host, const char **no_proxy)
1519 return !sufmatch (no_proxy, host);
1522 /* Change the links in an HTML document. Accepts a structure that
1523 defines the positions of all the links. */
/* FILE is a local HTML file already on disk; L is the list of link
   occurrences (urlpos) found in it.  The file is read wholly into a
   buffer, optionally backed up as FILE.orig, then rewritten with
   absolute links converted to relative ones where flagged. */
1525 convert_links (const char *file, urlpos *l)
1529 downloaded_file_t downloaded_file_return;
1532 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1533 /* Read from the file.... */
1534 fp = fopen (file, "rb");
/* NOTE(review): the fopen-failure check itself is elided from this
   view; the log call below is presumably inside it. */
1537 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1538 file, strerror (errno));
1541 /* ...to a buffer. */
1542 load_file (fp, &buf, &size);
/* Query (without modifying) the download registry: was FILE actually
   downloaded by this run, and was ".html" tacked on via -E? */
1545 downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file);
1547 if (opt.backup_converted && downloaded_file_return)
1548 /* Rather than just writing over the original .html file with the converted
1549 version, save the former to *.orig. Note we only do this for files we've
1550 _successfully_ downloaded, so we don't clobber .orig files sitting around
1551 from previous invocations. */
1553 /* Construct the backup filename as the original name plus ".orig". */
1554 size_t filename_len = strlen(file);
1555 char* filename_plus_orig_suffix;
1556 boolean already_wrote_backup_file = FALSE;
1557 slist* converted_file_ptr;
1558 static slist* converted_files = NULL;
1560 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1562 /* Just write "orig" over "html". We need to do it this way because
1563 when we're checking to see if we've downloaded the file before (to
1564 see if we can skip downloading it), we don't know if it's a
1565 text/html file. Therefore we don't know yet at that stage that -E
1566 is going to cause us to tack on ".html", so we need to compare
1567 vs. the original URL plus ".orig", not the original URL plus
/* In this branch FILE is known to end in ".html" (added by -E), so
   filename_len >= 5 and overwriting the last 4 bytes is in bounds;
   result is same length as FILE, hence the "+ 1" for the NUL only. */
1569 filename_plus_orig_suffix = xmalloc(filename_len + 1);
1570 strcpy(filename_plus_orig_suffix, file);
1571 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1573 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1575 /* Append ".orig" to the name. */
/* sizeof(".orig") == 6 already counts the trailing NUL. */
1576 filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig"));
1577 strcpy(filename_plus_orig_suffix, file);
1578 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1581 /* We can get called twice on the same URL thanks to the
1582 convert_all_links() call in main(). If we write the .orig file each
1583 time in such a case, it'll end up containing the first-pass conversion,
1584 not the original file. So, see if we've already been called on this
/* Linear scan of the remembered files; the break after setting the
   flag is presumably in an elided line. */
1586 converted_file_ptr = converted_files;
1587 while (converted_file_ptr != NULL)
1588 if (strcmp(converted_file_ptr->string, file) == 0)
1590 already_wrote_backup_file = TRUE;
1594 converted_file_ptr = converted_file_ptr->next;
1596 if (!already_wrote_backup_file)
1598 /* Rename <file> to <file>.orig before former gets written over. */
1599 if (rename(file, filename_plus_orig_suffix) != 0)
1600 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1601 file, filename_plus_orig_suffix, strerror (errno));
1603 /* Remember that we've already written a .orig backup for this file.
1604 Note that we never free this memory since we need it till the
1605 convert_all_links() call, which is one of the last things the
1606 program does before terminating. BTW, I'm not sure if it would be
1607 safe to just set 'converted_file_ptr->string' to 'file' below,
1608 rather than making a copy of the string... Another note is that I
1609 thought I could just add a field to the urlpos structure saying
1610 that we'd written a .orig file for this URL, but that didn't work,
1611 so I had to make this separate list. */
1612 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1613 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1614 converted_file_ptr->next = converted_files;
1615 converted_files = converted_file_ptr;
1618 free(filename_plus_orig_suffix);
1620 /* Now open the file for writing. */
1621 fp = fopen (file, "wb");
/* As above, the failure check surrounding this log call is elided. */
1624 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1625 file, strerror (errno));
1629 /* Presumably we have to loop through multiple URLs here (even though we're
1630 only talking about a single local file) because of the -O option. */
/* P walks the in-memory copy of the old file (buf); text between
   links is echoed to FP verbatim, links themselves are rewritten. */
1631 for (p = buf; l; l = l->next)
1635 DEBUGP (("Something strange is going on. Please investigate."));
1638 /* If the URL already is relative or it is not to be converted
1639 for some other reason (e.g. because of not having been
1640 downloaded in the first place), skip it. */
1641 if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
1643 DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
1647 /* Else, reach the position of the offending URL, echoing
1648 everything up to it to the outfile. */
/* The putc()/echo of *p inside this loop is in an elided line. */
1649 for (p2 = buf + l->pos; p < p2; p++)
1651 if (l->flags & UABS2REL)
1652 /* Convert absolute URL to relative. */
1654 char *newname = construct_relative (file, l->local_name);
1655 fprintf (fp, "%s", newname);
1656 DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
1657 l->url, newname, l->pos, file));
1662 /* Output the rest of the file. */
1665 for (p2 = buf + size; p < p2; p++)
1670 logputs (LOG_VERBOSE, _("done.\n"));
1673 /* Construct and return a malloced copy of the relative link from two
1674 pieces of information: local name S1 of the referring file and
1675 local name S2 of the referred file.
1677 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1678 "jagor.srce.hr/images/news.gif", the function will return
1681 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1682 "fly.cc.fer.hr/images/fly.gif", the function will return
1683 "../images/fly.gif".
1685 Caveats: S1 should not begin with `/', unless S2 also begins with
1686 '/'. S1 should not contain things like ".." and such --
1687 construct_relative ("fly/ioccc/../index.html",
1688 "fly/images/fly.gif") will fail. (A workaround is to call
1689 something like path_simplify() on S1). */
1691 construct_relative (const char *s1, const char *s2)
1693 int i, cnt, sepdirs1;
/* If S2 is absolute (begins with '/'), no relative form is built —
   the check itself is elided here; this early return duplicates S2. */
1697 return xstrdup (s2);
1698 /* S1 should *not* be absolute, if S2 wasn't. */
1699 assert (*s1 != '/');
1701 /* Skip the directories common to both strings. */
/* Walk while characters match; CNT records the index just past the
   last common '/' (the assignment to cnt is in an elided line —
   TODO confirm). */
1704 while (s1[i] && s2[i]
1709 if (s1[i] == '/' && s2[i] == '/')
/* Count the directory separators remaining in S1 after the common
   prefix; each one needs a "../" hop in the result. */
1714 for (sepdirs1 = 0; s1[i]; i++)
1717 /* Now, construct the file as of:
1718 - ../ repeated sepdirs1 time
1719 - all the non-mutual directories of S2. */
/* 3 bytes per "../" plus the non-common tail of S2 plus NUL. */
1720 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1721 for (i = 0; i < sepdirs1; i++)
1722 memcpy (res + 3 * i, "../", 3);
/* After the loop i == sepdirs1, so this lands exactly after the
   last "../" and also NUL-terminates via strcpy. */
1723 strcpy (res + 3 * i, s2 + cnt);
1727 /* Add URL to the head of the list L. */
/* Allocates a zeroed urlpos node holding copies of URL and FILE;
   the linking of T onto L and the return are in lines elided from
   this view.  Both strings are duplicated, so the node owns them. */
1729 add_url (urlpos *l, const char *url, const char *file)
1733 t = (urlpos *)xmalloc (sizeof (urlpos));
/* Zero all fields (flags, pos, next, ...) before filling in. */
1734 memset (t, 0, sizeof (*t));
1735 t->url = xstrdup (url);
1736 t->local_name = xstrdup (file);
1742 /* Remembers which files have been downloaded. In the standard case, should be
1743 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1744 download successfully (i.e. not for ones we have failures on or that we skip
1747 When we've downloaded a file and tacked on a ".html" extension due to -E,
1748 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1749 FILE_DOWNLOADED_NORMALLY.
1751 If you just want to check if a file has been previously added without adding
1752 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1753 with local filenames, not remote URLs. */
1755 downloaded_file (downloaded_file_t mode, const char* file)
/* Registry node: one per remembered local filename, with the mode
   it was recorded under.  The list is function-static and never
   freed — it must live until the final convert_all_links() pass. */
1757 typedef struct _downloaded_file_list
1760 downloaded_file_t download_type;
1761 struct _downloaded_file_list* next;
1762 } downloaded_file_list;
1764 boolean found_file = FALSE;
1765 static downloaded_file_list* downloaded_files = NULL;
1766 downloaded_file_list* rover = downloaded_files;
/* Linear scan for FILE; setting found_file and breaking out are
   presumably in elided lines — TODO confirm against full source. */
1768 while (rover != NULL)
1769 if (strcmp(rover->file, file) == 0)
1775 rover = rover->next;
/* Hit: report how the file was originally recorded. */
1778 return rover->download_type; /* file had already been downloaded */
/* Miss: record FILE unless the caller only asked to check. */
1781 if (mode != CHECK_FOR_FILE)
1783 rover = xmalloc(sizeof(*rover));
1784 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1785 rover->download_type = mode;
/* Push onto the head of the static list. */
1786 rover->next = downloaded_files;
1787 downloaded_files = rover;
1790 return FILE_NOT_ALREADY_DOWNLOADED;