2 Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
/* NOTE(review): this listing is a fragmentary extraction -- interior
   source lines are elided throughout; code is kept byte-for-byte as
   extracted.  These macros define default ports, the character set
   that terminates a URL scanned out of arbitrary text (findurl), and
   the RFC 1738 "unsafe" set used for %XX encoding.  */
47 /* Default port definitions */
48 #define DEFAULT_HTTP_PORT 80
49 #define DEFAULT_FTP_PORT 21
51 /* URL separator (for findurl) */
52 #define URL_SEPARATOR "!\"#'(),>`{}|<>"
54 /* A list of unsafe characters for encoding, as per RFC1738. '@' and
55 ':' (not listed in RFC) were added because of user/password
59 # define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
61 # define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
/* NOTE(review): UNSAFE_CHAR evaluates its argument up to three times;
   do not call it with an expression that has side effects (e.g. *s++).
   The strchr() call can never be reached with c == '\0' because the
   "<= ' '" test short-circuits first.  */
64 #define UNSAFE_CHAR(c) ( ((unsigned char)(c) <= ' ') /* ASCII 32 */ \
65 || ((unsigned char)(c) > '~') /* ASCII 127 */ \
66 || strchr (URL_UNSAFE_CHARS, c))
68 /* If S contains unsafe characters, free it and replace it with a
69 version that doesn't. */
70 #define URL_CLEANSE(s) do \
72 if (contains_unsafe (s)) \
74 char *uc_tmp = encode_string (s); \
80 /* Is a directory "."? */
81 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
82 /* Is a directory ".."? */
83 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* NOTE(review): forward declarations plus the scheme tables.
   protostrings lists every scheme that is *recognized* (its entries,
   elided here, came from the W3C schemes registry); sup_protos lists
   the schemes Wget can actually *retrieve* (HTTP and FTP), paired
   with their uerr_t tag and default port.  */
86 static void path_simplify_with_kludge PARAMS ((char *));
88 static int urlpath_length PARAMS ((const char *));
90 /* NULL-terminated list of strings to be recognized as prototypes (URL
91 schemes). Note that recognized doesn't mean supported -- only HTTP
92 and FTP are currently supported.
94 However, a string that does not match anything in the list will be
95 considered a relative URL. Thus it's important that this list has
96 anything anyone could think of being legal.
98 There are wild things here. :-) Take a look at
99 <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
101 static char *protostrings[] =
143 /* Similar to former, but for supported protocols: */
144 static struct proto sup_protos[] =
146 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
147 { "ftp://", URLFTP, DEFAULT_FTP_PORT },
148 /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
151 static void parse_dir PARAMS ((const char *, char **, char **));
152 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
153 static char *construct PARAMS ((const char *, const char *, int , int));
154 static char *construct_relative PARAMS ((const char *, const char *));
155 static char process_ftp_type PARAMS ((char *));
/* NOTE(review): fragmentary -- the return type, braces and the early
   "return 0" path are among the elided lines.  Visible logic: detect a
   case-insensitive "URL:" prefix, then skip it plus any following
   whitespace, returning the number of characters consumed.  */
158 /* Returns the number of characters to be skipped if the first thing
159 in a URL is URL: (which is 0 or 4+). The optional spaces after
160 URL: are also skipped. */
162 skip_url (const char *url)
166 if (TOUPPER (url[0]) == 'U'
167 && TOUPPER (url[1]) == 'R'
168 && TOUPPER (url[2]) == 'L'
172 for (i = 4; url[i] && ISSPACE (url[i]); i++);
/* NOTE(review): fragmentary -- the enclosing loop over S and both
   return statements are elided.  Predicate used by URL_CLEANSE to
   decide whether encode_string() must be run.  */
179 /* Returns 1 if the string contains unsafe characters, 0 otherwise. */
181 contains_unsafe (const char *s)
184 if (UNSAFE_CHAR (*s))
/* NOTE(review): fragmentary -- the in-place copy loop (reading via S,
   writing via P) is only partially visible.  Decodes %XY escapes
   destructively; a '%' not followed by two hex digits is copied
   through literally per the header comment.  */
189 /* Decodes the forms %xy in a URL to the character the hexadecimal
190 code of which is xy. xy are hexadecimal digits from
191 [0123456789ABCDEF] (case-insensitive). If x or y are not
192 hex-digits or `%' precedes `\0', the sequence is inserted
196 decode_string (char *s)
206 /* Do nothing if at the end of the string, or if the chars
207 are not hex-digits. */
208 if (!*(s + 1) || !*(s + 2)
209 || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
214 *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
/* NOTE(review): two-pass encoder -- first pass sizes the result
   (each unsafe byte costs two extra characters for "%XY"), second
   pass writes it.  The reset of S back to the saved start pointer
   between the two passes is among the elided lines; as shown the
   second loop would start at the terminator -- presumed an extraction
   artifact, TODO confirm against the full source.  */
221 /* Encode the unsafe characters (as determined by URL_UNSAFE) in a
222 given string, returning a malloc-ed %XX encoded string. */
224 encode_string (const char *s)
231 for (i = 0; *s; s++, i++)
232 if (UNSAFE_CHAR (*s))
233 i += 2; /* Two more characters (hex digits) */
234 res = (char *)xmalloc (i + 1);
236 for (p = res; *s; s++)
237 if (UNSAFE_CHAR (*s))
239 const unsigned char c = *s;
241 *p++ = HEXD2ASC (c >> 4);
242 *p++ = HEXD2ASC (c & 0xf);
/* NOTE(review): fragmentary.  First tries an exact prefix match
   against the supported-scheme table; failing that, the visible tail
   appears to classify "host:digits/..." forms (the heuristic that a
   numeric port with no scheme means HTTP, used by parseurl) -- the
   returns between the loops are elided, so the exact fallthrough
   behavior cannot be confirmed from here.  */
250 /* Returns the proto-type if URL's protocol is supported, or
251 URLUNKNOWN if not. */
253 urlproto (const char *url)
257 url += skip_url (url);
258 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
259 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
260 return sup_protos[i].ind;
261 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
264 for (++i; url[i] && url[i] != '/'; i++)
265 if (!ISDIGIT (url[i]))
267 if (url[i - 1] == ':')
/* NOTE(review): fragmentary -- the length bookkeeping and return are
   elided.  Matches URL against each recognized scheme; for "http:" и
   "ftp:" entries the following "//" is also consumed so the caller
   lands directly on the host name.  */
276 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
277 part is found, returns 0. */
279 skip_proto (const char *url)
284 for (s = protostrings; *s; s++)
285 if (!strncasecmp (*s, url, strlen (*s)))
290 /* HTTP and FTP protocols are expected to yield exact host names
291 (i.e. the `//' part must be skipped, too). */
292 if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
/* NOTE(review): fragmentary -- the returns are elided.  Simple
   predicate over protostrings; any case-insensitive prefix match
   counts, whether or not the scheme is retrievable.  */
297 /* Returns 1 if the URL begins with a protocol (supported or
298 unsupported), 0 otherwise. */
300 has_proto (const char *url)
304 url += skip_url (url);
305 for (s = protostrings; *s; s++)
306 if (strncasecmp (url, *s, strlen (*s)) == 0)
/* NOTE(review): fragmentary -- the '@' test and the skip/return logic
   are elided.  Scans only up to the first '/', so a '@' inside the
   path is correctly ignored.  */
311 /* Skip the username and password, if present here. The function
312 should be called *not* with the complete URL, but with the part
313 right after the protocol.
315 If no username and password are found, return 0. */
317 skip_uname (const char *url)
320 for (p = url; *p && *p != '/'; p++)
323 /* If a `@' was found before the first occurrence of `/', skip
/* NOTE(review): two fragmentary definitions -- newurl() (allocator)
   and freeurl() (deep destructor).  The freeurl() header comment says
   "free_pointer" but the visible parameter is named "complete";
   presumably the same flag renamed -- worth reconciling.  The
   recursive freeurl(u->proxy, 1) call is guarded by an elided
   NULL-check line, TODO confirm.  */
331 /* Allocate a new urlinfo structure, fill it with default values and
332 return a pointer to it. */
338 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
339 memset (u, 0, sizeof (*u));
340 u->proto = URLUNKNOWN;
344 /* Perform a "deep" free of the urlinfo structure. The structure
345 should have been created with newurl, but need not have been used.
346 If free_pointer is non-0, free the pointer itself. */
348 freeurl (struct urlinfo *u, int complete)
352 FREE_MAYBE (u->host);
353 FREE_MAYBE (u->path);
354 FREE_MAYBE (u->file);
356 FREE_MAYBE (u->user);
357 FREE_MAYBE (u->passwd);
358 FREE_MAYBE (u->local);
359 FREE_MAYBE (u->referer);
361 freeurl (u->proxy, 1);
/* NOTE(review): the central URL parser; heavily fragmented here (many
   error-return and brace lines elided).  Visible pipeline: skip
   "URL:" prefix -> match scheme against sup_protos -> skip optional
   user:pass -> split host / port / path -> process ";type=X" for FTP
   -> decode %XX in each component -> split path into dir+file ->
   simplify dir -> rebuild u->path and the canonical u->url.  */
367 /* Extract the given URL of the form
368 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
369 1. hostname (terminated with `/' or `:')
370 2. port number (terminated with `/'), or chosen for the protocol
371 3. dirname (everything after hostname)
372 Most errors are handled. No allocation is done, you must supply
373 pointers to allocated memory.
374 ...and a host of other stuff :-)
376 - Recognizes hostname:dir/file for FTP and
377 hostname (:portnum)?/dir/file for HTTP.
378 - Parses the path to yield directory and file
379 - Parses the URL to yield the username and passwd (if present)
380 - Decodes the strings, in case they contain "forbidden" characters
381 - Writes the result to struct urlinfo
383 If the argument STRICT is set, it recognizes only the canonical
386 parseurl (const char *url, struct urlinfo *u, int strict)
389 int recognizable; /* Recognizable URL is the one where
390 the protocol name was explicitly
391 named, i.e. it wasn't deduced from
395 DEBUGP (("parseurl (\"%s\") -> ", url));
396 url += skip_url (url);
397 recognizable = has_proto (url);
398 if (strict && !recognizable)
400 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
402 l = strlen (sup_protos[i].name);
403 if (!strncasecmp (sup_protos[i].name, url, l))
406 /* If protocol is recognizable, but unsupported, bail out, else
408 if (recognizable && i == ARRAY_SIZE (sup_protos))
410 else if (i == ARRAY_SIZE (sup_protos))
413 u->proto = type = sup_protos[i].ind;
415 if (type == URLUNKNOWN)
417 /* Allow a username and password to be specified (i.e. just skip
420 l += skip_uname (url + l);
421 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
424 /* Get the hostname. */
425 u->host = strdupdelim (url + l, url + i);
426 DEBUGP (("host %s -> ", u->host));
428 /* Assume no port has been given. */
432 /* We have a colon delimiting the hostname. It could mean that
433 a port number is following it, or a directory. */
434 if (ISDIGIT (url[++i])) /* A port number */
436 if (type == URLUNKNOWN)
437 u->proto = type = URLHTTP;
438 for (; url[i] && url[i] != '/'; i++)
439 if (ISDIGIT (url[i]))
440 u->port = 10 * u->port + (url[i] - '0');
445 DEBUGP (("port %hu -> ", u->port));
447 else if (type == URLUNKNOWN) /* or a directory */
448 u->proto = type = URLFTP;
449 else /* or just a misformed port number */
452 else if (type == URLUNKNOWN)
453 u->proto = type = URLHTTP;
457 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
458 if (sup_protos[i].ind == type)
460 if (i == ARRAY_SIZE (sup_protos))
462 u->port = sup_protos[i].port;
464 /* Some delimiter troubles... */
465 if (url[i] == '/' && url[i - 1] != ':')
468 while (url[i] && url[i] == '/')
/* NOTE(review): +8 slack presumably leaves room for the "%2F" and '/'
   separators added when the path is rebuilt below -- TODO confirm.  */
470 u->path = (char *)xmalloc (strlen (url + i) + 8)
471 strcpy (u->path, url + i);
474 u->ftp_type = process_ftp_type (u->path);
475 /* #### We don't handle type `d' correctly yet. */
476 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
479 DEBUGP (("opath %s -> ", u->path));
480 /* Parse the username and password (if existing). */
481 parse_uname (url, &u->user, &u->passwd);
482 /* Decode the strings, as per RFC 1738. */
483 decode_string (u->host);
484 decode_string (u->path);
486 decode_string (u->user);
488 decode_string (u->passwd);
489 /* Parse the directory. */
490 parse_dir (u->path, &u->dir, &u->file);
491 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
492 /* Simplify the directory. */
493 path_simplify (u->dir);
494 /* Remove the leading `/' in HTTP. */
495 if (type == URLHTTP && *u->dir == '/')
/* NOTE(review): strcpy with overlapping source and destination is
   undefined behavior per ISO C; memmove (u->dir, u->dir + 1, ...) 
   would be the correct way to shift the string left.  */
496 strcpy (u->dir, u->dir + 1);
497 DEBUGP (("ndir %s\n", u->dir));
498 /* Strip trailing `/'. */
500 if (l && u->dir[l - 1] == '/')
501 u->dir[l - 1] = '\0';
502 /* Re-create the path: */
503 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
504 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
505 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
506 strcpy (u->path, abs_ftp ? "%2F" : "/");
507 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
508 strcat (u->path, *u->dir ? "/" : "");
509 strcat (u->path, u->file);
510 URL_CLEANSE (u->path);
511 DEBUGP (("newpath: %s\n", u->path));
512 /* Create the clean URL. */
513 u->url = str_url (u, 0);
/* Special versions of DOTP and DDOTP for parse_dir(): unlike
   DOTP/DDOTP they also treat a trailing "?query" as the end of the
   component, so "." / ".?q" and ".." / "..?q" all count as dot-dirs.

   Fix: PD_DDOTP previously tested `*(x) == '.'` twice, never looking
   at the second character, so any string starting with '.' whose
   third char was '\0' or '?' (e.g. ".x") was misclassified as "..".
   The second conjunct must examine *((x) + 1).  Note both macros
   evaluate X more than once -- no side effects in the argument.  */
#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
		     && (!*((x) + 2) || *((x) + 2) == '?'))
/* NOTE(review): fragmentary -- braces and several comment tails are
   elided.  Splits PATH (up to its '?', via urlpath_length) into a
   malloc'd DIR and FILE at the last '/'.  Three shapes handled:
   bare filename, "/filename", and "dir/.../file"; a final "." or ".."
   component (per PD_DOTP/PD_DDOTP) is folded into DIR with FILE left
   as the (possibly empty) query remainder.  */
523 /* Build the directory and filename components of the path. Both
524 components are *separately* malloc-ed strings! It does not change
525 the contents of path.
527 If the path ends with "." or "..", they are (correctly) counted as
530 parse_dir (const char *path, char **dir, char **file)
534 l = urlpath_length (path);
535 for (i = l; i && path[i] != '/'; i--);
537 if (!i && *path != '/') /* Just filename */
539 if (PD_DOTP (path) || PD_DDOTP (path))
541 *dir = strdupdelim (path, path + l);
542 *file = xstrdup (path + l); /* normally empty, but could
547 *dir = xstrdup (""); /* This is required because of FTP */
548 *file = xstrdup (path);
551 else if (!i) /* /filename */
553 if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
555 *dir = strdupdelim (path, path + l);
556 *file = xstrdup (path + l); /* normally empty, but could
561 *dir = xstrdup ("/");
562 *file = xstrdup (path + 1);
565 else /* Nonempty directory with or without a filename */
567 if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
569 *dir = strdupdelim (path, path + l);
570 *file = xstrdup (path + l); /* normally empty, but could
575 *dir = strdupdelim (path, path + i);
576 *file = xstrdup (path + i + 1);
/* NOTE(review): fragmentary.  Extracts user[:password] before the
   '@' that precedes the first '/'.  COL presumably tracks the
   character after the ':' separator (its update is elided); the
   final block copies whichever of user/passwd remains.  The '@' scan
   at line 608 relies on the earlier loop having proven an '@' exists
   before '/'.  */
581 /* Find the optional username and password within the URL, as per
582 RFC1738. The returned user and passwd char pointers are
585 parse_uname (const char *url, char **user, char **passwd)
593 url += skip_url (url);
594 /* Look for end of protocol string. */
595 l = skip_proto (url);
598 /* Add protocol offset. */
600 /* Is there an `@' character? */
601 for (p = url; *p && *p != '/'; p++)
604 /* If not, return. */
607 /* Else find the username and password. */
608 for (p = col = url; *p != '@'; p++)
610 if (*p == ':' && !*user)
612 *user = (char *)xmalloc (p - url + 1);
613 memcpy (*user, url, p - url);
614 (*user)[p - url] = '\0';
618 /* Decide whether you have only the username or both. */
619 where = *user ? passwd : user;
620 *where = (char *)xmalloc (p - col + 1);
621 memcpy (*where, col, p - col);
622 (*where)[p - col] = '\0';
/* NOTE(review): fragmentary -- the "len >= 7" guard and the '\0'
   return for the no-suffix case are elided.  With len-7 pointing at
   the ';', the 6-byte memcmp checks ";type=" and path[len-1] is X;
   truncating at len-7 strips the whole suffix from PATH in place.  */
626 /* If PATH ends with `;type=X', return the character X. */
628 process_ftp_type (char *path)
630 int len = strlen (path);
633 && !memcmp (path + len - 7, ";type=", 6))
635 path[len - 7] = '\0';
636 return path[len - 1];
/* NOTE(review): fragmentary -- the lh/ld/lf assignments, the 'x'
   masking of the password when HIDE is set, and the separator writes
   between memcpy calls are elided.  Assembles
   proto://[user[:pass]@]host[:port]/dir/file from CLEANDUP'd
   (%XX-escaped) copies of each component; the port is printed only
   when it differs from the scheme's default.  The "+ 20" in the
   xmalloc covers the fixed separators and port digits.  */
642 /* Return the URL as fine-formed string, with a proper protocol,
643 optional port number, directory and optional user/password. If
644 HIDE is non-zero, password will be hidden. The forbidden
645 characters in the URL will be cleansed. */
647 str_url (const struct urlinfo *u, int hide)
649 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
650 int i, l, ln, lu, lh, lp, lf, ld;
651 unsigned short proto_default_port;
653 /* Look for the protocol name. */
654 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
655 if (sup_protos[i].ind == u->proto)
657 if (i == ARRAY_SIZE (sup_protos))
659 proto_name = sup_protos[i].name;
660 proto_default_port = sup_protos[i].port;
661 host = CLEANDUP (u->host);
662 dir = CLEANDUP (u->dir);
663 file = CLEANDUP (u->file);
664 user = passwd = NULL;
666 user = CLEANDUP (u->user);
670 passwd = CLEANDUP (u->passwd);
672 for (i = 0; passwd[i]; i++)
675 if (u->proto == URLFTP && *dir == '/')
677 char *tmp = (char *)xmalloc (strlen (dir) + 3);
678 /*sprintf (tmp, "%%2F%s", dir + 1);*/
682 strcpy (tmp + 3, dir + 1);
687 ln = strlen (proto_name);
688 lu = user ? strlen (user) : 0;
689 lp = passwd ? strlen (passwd) : 0;
693 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
694 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
695 (user ? user : ""), (passwd ? ":" : ""),
696 (passwd ? passwd : ""), (user ? "@" : ""),
697 host, u->port, dir, *dir ? "/" : "", file); */
699 memcpy (res, proto_name, ln);
703 memcpy (res + l, user, lu);
708 memcpy (res + l, passwd, lp);
713 memcpy (res + l, host, lh);
715 if (u->port != proto_default_port)
718 long_to_string (res + l, (long)u->port);
719 l += numdigit (u->port);
722 memcpy (res + l, dir, ld);
726 strcpy (res + l, file);
/* NOTE(review): fragmentary -- the newurl() allocations, error
   cleanup (freeurl on failure paths) and final return are elided.
   Equivalence = byte equality of the two canonical u->url strings
   produced by parseurl.  */
735 /* Check whether two URL-s are equivalent, i.e. pointing to the same
736 location. Uses parseurl to parse them, and compares the canonical
739 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
740 return 0 on error. */
742 url_equal (const char *url1, const char *url2)
744 struct urlinfo *u1, *u2;
749 err = parseurl (url1, u1, 0);
756 err = parseurl (url2, u2, 0);
762 res = !strcmp (u1->url, u2->url);
/* NOTE(review): fragmentary.  Slides S1 over the buffer looking for a
   recognized scheme prefix, then extends S2 over printable,
   non-space, non-URL_SEPARATOR characters, leaving the match length
   in *COUNT.  Caution: "howmuch <= strlen (*prot)" compares a signed
   int against size_t -- a negative HOWMUCH would convert to a huge
   unsigned value; and "*s2 >= 32" is signedness-dependent for bytes
   >= 0x80 if char is signed -- worth confirming against callers.  */
768 /* Find URL of format scheme:hostname[:port]/dir in a buffer. The
769 buffer may contain pretty much anything; no errors are signaled. */
771 findurl (const char *buf, int howmuch, int *count)
776 for (s1 = buf; howmuch; s1++, howmuch--)
777 for (prot = protostrings; *prot; prot++)
778 if (howmuch <= strlen (*prot))
780 else if (!strncasecmp (*prot, s1, strlen (*prot)))
782 for (s2 = s1, *count = 0;
783 howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
784 !strchr (URL_SEPARATOR, *s2);
785 s2++, (*count)++, howmuch--);
/* NOTE(review): fragmentary -- stdin handling (HYPHENP means the
   file name "-"), fclose, list linking (first/old updates) and the
   free(buf) are elided.  Loads the whole file into memory, then
   repeatedly calls findurl() to build a linked list of urlpos nodes,
   each holding a NUL-terminated copy of one URL.  */
791 /* Scans the file for signs of URL-s. Returns a vector of pointers,
792 each pointer representing a URL string. The file is *not* assumed
795 get_urls_file (const char *file)
802 urlpos *first, *current, *old;
804 if (file && !HYPHENP (file))
806 fp = fopen (file, "rb");
809 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
816 load_file (fp, &buf, &nread);
817 if (file && !HYPHENP (file))
819 DEBUGP (("Loaded %s (size %ld).\n", file, nread));
820 first = current = NULL;
821 /* Fill the linked list with URLs. */
822 for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
825 /* Allocate the space. */
827 current = (urlpos *)xmalloc (sizeof (urlpos));
830 memset (current, 0, sizeof (*current));
831 current->next = NULL;
832 current->url = (char *)xmalloc (size + 1);
833 memcpy (current->url, pbuf, size);
834 current->url[size] = '\0';
838 /* Free the buffer. */
/* NOTE(review): the HTML link extractor; heavily fragmented (loop
   heads, base/cbase bookkeeping, and several free/continue paths are
   elided).  Per extracted link: trim surrounding spaces, decode HTML
   entities, decide whether the link has a scheme, resolve relative
   links against <base href>, the referring URL, or opt.base_href,
   reject unsupported schemes, and append a urlpos node carrying the
   constructed absolute URL plus position/size within the buffer.  */
844 /* Similar to get_urls_file, but for HTML files. FILE is scanned as
845 an HTML document using htmlfindurl(), which see. get_urls_html()
846 constructs the HTML-s from the relative href-s.
848 If SILENT is non-zero, do not barf on baseless relative links. */
850 get_urls_html (const char *file, const char *this_url, int silent,
851 int dash_p_leaf_HTML)
857 int step, first_time;
858 urlpos *first, *current, *old;
860 if (file && !HYPHENP (file))
862 fp = fopen (file, "rb");
865 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
872 load_file (fp, &orig_buf, &nread);
873 if (file && !HYPHENP (file))
875 DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
876 first = current = NULL;
878 /* Iterate over the URLs in BUF, picked by htmlfindurl(). */
880 (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time,
886 const char *pbuf = buf;
889 char *needs_freeing, *url_data;
893 /* A frequent phenomenon that needs to be handled are pages
894 generated by brain-damaged HTML generators, which refer to
895 URI-s as <a href="<spaces>URI<spaces>">. We simply ignore
896 any spaces at the beginning or at the end of the string.
897 This is probably not strictly correct, but that's what the
898 browsers do, so we may follow. May the authors of "WYSIWYG"
899 HTML tools burn in hell for the damage they've inflicted! */
900 while ((pbuf < buf + step) && ISSPACE (*pbuf))
905 while (size && ISSPACE (pbuf[size - 1]))
910 /* It would be nice if we could avoid allocating memory in this
911 loop, but I don't see an easy way. To process the entities,
912 we need to either copy the data, or change it destructively.
915 We have two pointers: needs_freeing and url_data, because the
916 code below does thing like url_data += <something>, and we
917 want to pass the original string to free(). */
918 needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
919 size = strlen (url_data);
921 for (i = 0; protostrings[i]; i++)
923 if (!strncasecmp (protostrings[i], url_data,
924 MINVAL (strlen (protostrings[i]), size)))
927 /* Check for http:RELATIVE_URI. See below for details. */
929 && !(strncasecmp (url_data, "http:", 5) == 0
930 && strncasecmp (url_data, "http://", 7) != 0))
937 /* This is for extremely brain-damaged pages that refer to
938 relative URI-s as <a href="http:URL">. Just strip off the
939 silly leading "http:" (as well as any leading blanks
941 if ((size > 5) && !strncasecmp ("http:", url_data, 5))
942 url_data += 5, size -= 5;
946 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
948 if (!strncasecmp (sup_protos[i].name, url_data,
949 MINVAL (strlen (sup_protos[i].name), size)))
952 /* Do *not* accept a non-supported protocol. */
953 if (i == ARRAY_SIZE (sup_protos))
955 free (needs_freeing);
961 /* First, construct the base, which can be relative itself.
963 Criteria for creating the base are:
964 1) html_base created by <base href="...">
966 3) base provided from the command line */
967 cbase = html_base ();
971 cbase = opt.base_href;
972 if (!cbase) /* Error condition -- a baseless
975 if (!opt.quiet && !silent)
977 /* Use malloc, not alloca because this is called in
/* NOTE(review): strncpy here does not NUL-terminate when the source
   fills the buffer; the terminating temp[size] = '\0' is presumably
   among the elided lines -- confirm before relying on TEMP.  */
979 char *temp = (char *)malloc (size + 1);
980 strncpy (temp, url_data, size);
982 logprintf (LOG_NOTQUIET,
983 _("Error (%s): Link %s without a base provided.\n"),
987 free (needs_freeing);
991 base = construct (this_url, cbase, strlen (cbase),
995 /* Base must now be absolute, with host name and
997 if (!has_proto (cbase))
999 logprintf (LOG_NOTQUIET, _("\
1000 Error (%s): Base %s relative, without referer URL.\n"),
1002 free (needs_freeing);
1005 base = xstrdup (cbase);
1007 constr = construct (base, url_data, size, no_proto);
1010 else /* has proto */
1012 constr = (char *)xmalloc (size + 1);
1013 strncpy (constr, url_data, size);
1014 constr[size] = '\0';
1022 tmp2 = html_base ();
1023 /* Use malloc, not alloca because this is called in a loop. */
1024 tmp = (char *)xmalloc (size + 1);
1025 strncpy (tmp, url_data, size);
1027 logprintf (LOG_ALWAYS,
1028 "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
1029 file, this_url ? this_url : "(null)",
1030 tmp2 ? tmp2 : "(null)", tmp, constr);
1035 /* Allocate the space. */
1037 current = (urlpos *)xmalloc (sizeof (urlpos));
1039 old->next = current;
1042 /* Fill the values. */
1043 memset (current, 0, sizeof (*current));
1044 current->next = NULL;
1045 current->url = constr;
1046 current->size = step;
1047 current->pos = buf - orig_buf;
1048 /* A URL is relative if the host and protocol are not named,
1049 and the name does not start with `/'. */
1050 if (no_proto && *url_data != '/')
1051 current->flags |= (URELATIVE | UNOPROTO);
1053 current->flags |= UNOPROTO;
1054 free (needs_freeing);
/* NOTE(review): fragmentary -- the loop, free(l->url), free(l) and
   the advance to NEXT are elided.  Walks the list freeing each node
   and its owned strings.  */
1061 /* Free the linked list of urlpos. */
1063 free_urlpos (urlpos *l)
1067 urlpos *next = l->next;
1069 FREE_MAYBE (l->local_name);
/* NOTE(review): fragmentary -- the rename() calls and the early
   return for non-regular files are elided.  Shifts fname.1 ->
   fname.2 -> ... -> fname.N, then (last line) prepares fname ->
   fname.1.  MAXLEN sizes the alloca buffers: name + '.' + digits of
   opt.backups + NUL.  */
1075 /* Rotate FNAME opt.backups times */
1077 rotate_backups(const char *fname)
1079 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1080 char *from = (char *)alloca (maxlen);
1081 char *to = (char *)alloca (maxlen);
1085 if (stat (fname, &sb) == 0)
1086 if (S_ISREG (sb.st_mode) == 0)
1089 for (i = opt.backups; i > 1; i--)
1091 sprintf (from, "%s.%d", fname, i - 1);
1092 sprintf (to, "%s.%d", fname, i);
1093 /* #### This will fail on machines without the rename() system
1098 sprintf (to, "%s.%d", fname, 1);
/* NOTE(review): fragmentary -- returns, free(t) and the unlink of a
   conflicting plain file are elided; the trailing count_slashes
   signature belongs to the next (separate) helper.  Strips the final
   path component, then ensures the remaining directory chain exists,
   working around ancient CERN httpd behavior (see comment below).  */
1102 /* Create all the necessary directories for PATH (a file). Calls
1103 mkdirhier() internally. */
1105 mkalldirs (const char *path)
1112 p = path + strlen (path);
1113 for (; *p != '/' && p != path; p--);
1114 /* Don't create if it's just a file. */
1115 if ((p == path) && (*p != '/'))
1117 t = strdupdelim (path, p);
1118 /* Check whether the directory exists. */
1119 if ((stat (t, &st) == 0))
1121 if (S_ISDIR (st.st_mode))
1128 /* If the dir exists as a file name, remove it first. This
1129 is *only* for Wget to work with buggy old CERN http
1130 servers. Here is the scenario: When Wget tries to
1131 retrieve a directory without a slash, e.g.
1132 http://foo/bar (bar being a directory), CERN server will
1133 not redirect it to http://foo/bar/ -- it will generate a
1134 directory listing containing links to bar/file1,
1135 bar/file2, etc. Wget will lose because it saves this
1136 HTML listing to a file `bar', so it cannot create the
1137 directory. To work around this, if the file of the same
1138 name exists, we just remove it and create the directory
1140 DEBUGP (("Removing %s because of directory danger!\n", t));
1144 res = make_directory (t);
1146 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1152 count_slashes (const char *s)
/* NOTE(review): fragmentary -- the opt.cut_dirs slash-skipping body,
   the realhost() swap, and the file-selection branches (index.html
   fallback for an empty u->file, per the visible assignment) are only
   partially shown.  Builds the local save path:
   [dir_prefix/][host/]dir/file, honoring --cut-dirs and -nH.  */
1161 /* Return the path name of the URL-equivalent file name, with a
1162 remote-like structure of directories. */
1164 mkstruct (const struct urlinfo *u)
1166 char *host, *dir, *file, *res, *dirpref;
1169 assert (u->dir != NULL);
1170 assert (u->host != NULL);
1174 char *ptr = u->dir + (*u->dir == '/');
1175 int slash_count = 1 + count_slashes (ptr);
1176 int cut = MINVAL (opt.cut_dirs, slash_count);
1177 for (; cut && *ptr; ptr++)
1180 STRDUP_ALLOCA (dir, ptr);
1183 dir = u->dir + (*u->dir == '/');
1185 host = xstrdup (u->host);
1186 /* Check for the true name (or at least a consistent name for saving
1187 to directory) of HOST, reusing the hlist if possible. */
1188 if (opt.add_hostdir && !opt.simple_check)
1190 char *nhost = realhost (host);
1194 /* Add dir_prefix and hostname (if required) to the beginning of
1196 if (opt.add_hostdir)
1198 if (!DOTP (opt.dir_prefix))
1200 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1201 + strlen (host) + 1);
1202 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
1205 STRDUP_ALLOCA (dirpref, host);
1207 else /* not add_hostdir */
1209 if (!DOTP (opt.dir_prefix))
1210 dirpref = opt.dir_prefix;
1216 /* If there is a prefix, prepend it. */
1219 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1220 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1223 dir = xstrdup (dir);
1226 if (l && dir[l - 1] == '/')
1230 file = "index.html";
1234 /* Finally, construct the full name. */
1235 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1236 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
/* NOTE(review): fragmentary -- the opt.dirstruct branch selection,
   the '%'->'@'-style character substitution loop for DOS filesystems
   (its replacement character is elided), and the final returns are
   missing.  Chooses between mkstruct() output and a flat name,
   prepends dir_prefix when it isn't ".", then uniquifies unless
   -nc/-c/-N/-x semantics say otherwise.  */
1241 /* Create a unique filename, corresponding to a given URL. Calls
1242 mkstruct if necessary. Does *not* actually create any directories. */
1244 url_filename (const struct urlinfo *u)
1247 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1251 file = mkstruct (u);
1257 file = xstrdup ("index.html");
1259 file = xstrdup (u->file);
1264 /* Check whether the prefix directory is something other than "."
1265 before prepending it. */
1266 if (!DOTP (opt.dir_prefix))
1268 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1269 + 1 + strlen (file) + 1);
1270 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1275 /* DOS-ish file systems don't like `%' signs in them; we change it
1280 for (p = file; *p; p++)
1284 #endif /* WINDOWS */
1286 /* Check the cases in which the unique extensions are not used:
1287 1) Clobbering is turned off (-nc).
1288 2) Retrieval with regetting.
1289 3) Timestamping is used.
1290 4) Hierarchy is built.
1292 The exception is the case when file does exist and is a
1293 directory (actually support for bad httpd-s). */
1294 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1295 && !(file_exists_p (file) && !file_non_directory_p (file)))
1298 /* Find a unique name. */
1299 name = unique_name (file);
/* Like strlen(), but allow the URL to be ended with '?': the length
   returned stops at the first '?', so a trailing query string is not
   counted as part of the path.  (NOTE(review): this block was garbled
   in extraction -- the strchr result was computed but the line using
   it was lost; the '?' branch below restores the behavior documented
   by the header comment and required by parse_dir()/construct().)  */
static int
urlpath_length (const char *url)
{
  const char *q = strchr (url, '?');
  if (q)
    return q - url;
  return strlen (url);
}
/* NOTE(review): only the header comment and signature survived
   extraction -- the scanning loop and return are elided, so the exact
   boundary handling (whether *e itself is examined) cannot be
   confirmed from here; the comment documents the half-open range
   [b, e).  */
1314 /* Find the last occurrence of character C in the range [b, e), or
1315 NULL, if none are present. This is almost completely equivalent to
1316 { *e = '\0'; return strrchr(b); }, except that it doesn't change
1317 the contents of the string. */
1319 find_last_char (const char *b, const char *e, char c)
/* NOTE(review): the relative-URL resolver; fragmented (the outer
   if (no_proto), the "//"-scanning loop frame, and the start_insert
   assignments for three of the four cases are elided).  Two regimes:
   SUB relative -> splice after the last '/' of URL's path (inserting
   a '/' when URL has none); SUB absolute ("/...") -> splice at the
   first '/' after any "//" authority marker.  With a protocol on
   SUB, URL is ignored entirely.  */
1327 /* Construct a URL by concatenating an absolute URL and a path, which
1328 may or may not be absolute. This tries to behave "reasonably" in
1329 all foreseeable cases. It employs little specific knowledge about
1330 protocols or URL-specific stuff -- it just works on strings. */
1332 construct (const char *url, const char *sub, int subsize, int no_proto)
1338 const char *end = url + urlpath_length (url);
1342 /* SUB is a relative URL: we need to replace everything
1343 after last slash (possibly empty) with SUB.
1345 So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
1346 our result should be "whatever/foo/qux/xyzzy". */
1347 int need_explicit_slash = 0;
1349 const char *start_insert;
1350 const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */
1353 /* No slash found at all. Append SUB to what we have,
1354 but we'll need a slash as a separator.
1356 Example: if url == "foo" and sub == "qux/xyzzy", then
1357 we cannot just append sub to url, because we'd get
1358 "fooqux/xyzzy", whereas what we want is
1361 To make sure the / gets inserted, we set
1362 need_explicit_slash to 1. We also set start_insert
1363 to end + 1, so that the length calculations work out
1364 correctly for one more (slash) character. Accessing
1365 that character is fine, since it will be the
1366 delimiter, '\0' or '?'. */
1367 /* example: "foo?..." */
1368 /* ^ ('?' gets changed to '/') */
1369 start_insert = end + 1;
1370 need_explicit_slash = 1;
1374 /* example: "whatever/foo/bar" */
1376 start_insert = last_slash + 1;
1379 span = start_insert - url;
1380 constr = (char *)xmalloc (span + subsize + 1);
1382 memcpy (constr, url, span);
1383 if (need_explicit_slash)
1384 constr[span - 1] = '/';
1386 memcpy (constr + span, sub, subsize);
1387 constr[span + subsize] = '\0';
1389 else /* *sub == `/' */
1391 /* SUB is an absolute path: we need to replace everything
1392 after (and including) the FIRST slash with SUB.
1394 So, if URL is "http://host/whatever/foo/bar", and SUB is
1395 "/qux/xyzzy", our result should be
1396 "http://host/qux/xyzzy". */
1399 const char *start_insert = NULL; /* for gcc to shut up. */
1400 const char *pos = url;
1401 int seen_slash_slash = 0;
1402 /* We're looking for the first slash, but want to ignore
1405 slash = memchr (pos, '/', end - pos);
1406 if (slash && !seen_slash_slash)
1407 if (*(slash + 1) == '/')
1410 seen_slash_slash = 1;
1414 /* At this point, SLASH is the location of the first / after
1415 "//", or the first slash altogether. START_INSERT is the
1416 pointer to the location where SUB will be inserted. When
1417 examining the last two examples, keep in mind that SUB
1420 if (!slash && !seen_slash_slash)
1421 /* example: "foo" */
1424 else if (!slash && seen_slash_slash)
1425 /* example: "http://foo" */
1428 else if (slash && !seen_slash_slash)
1429 /* example: "foo/bar" */
1432 else if (slash && seen_slash_slash)
1433 /* example: "http://something/" */
1435 start_insert = slash;
1437 span = start_insert - url;
1438 constr = (char *)xmalloc (span + subsize + 1);
1440 memcpy (constr, url, span);
1442 memcpy (constr + span, sub, subsize);
1443 constr[span + subsize] = '\0';
1446 else /* !no_proto */
1448 constr = strdupdelim (sub, sub + subsize);
/* NOTE(review): thin convenience wrapper -- derives construct()'s
   SUBSIZE and NO_PROTO arguments from NEW_URL itself.  Return type
   line is elided (construct returns char *).  */
1453 /* Like the function above, but with a saner caller interface. */
1455 url_concat (const char *base_url, const char *new_url)
1457 return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));
/* NOTE(review): fragmentary -- the free of the old u->host / u->url
   and the assignment u->host = host are elided.  Canonicalizes the
   host via realhost() and regenerates the printable URL.  */
1460 /* Optimize URL by host, destructively replacing u->host with realhost
1461 (u->host). Do this regardless of opt.simple_check. */
1463 opt_url (struct urlinfo *u)
1465 /* Find the "true" host. */
1466 char *host = realhost (u->host);
1469 assert (u->dir != NULL); /* the URL must have been parsed */
1470 /* Refresh the printed representation. */
1472 u->url = str_url (u, 0);
/* NOTE(review): fragmentary and, per its own comment, dead code kept
   for reference -- parse_dir now prevents a query from entering the
   directory part.  Idea: temporarily detach the "?query" tail so
   path_simplify() cannot mangle it, then memmove it back flush
   against the simplified path.  */
1475 /* This beautiful kludge is fortunately not needed, as I've made
1476 parse_dir do the (almost) right thing, so that a query can never
1477 become a part of directory. */
1479 /* Call path_simplify, but make sure that the part after the
1480 question-mark, if any, is not destroyed by path_simplify's
1483 path_simplify_with_kludge (char *path)
1485 char *query = strchr (path, '?');
1487 /* path_simplify also works destructively, so we also have the
1488 license to write. */
1490 path_simplify (path);
1493 char *newend = path + strlen (path);
1495 if (newend != query)
1496 memmove (newend, query, strlen (query) + 1);
/* NOTE(review): two fragmentary helpers.  getproxy(): command-line
   proxy settings take precedence over the http_proxy/ftp_proxy
   environment variables; the fall-through return for other protocols
   is elided.  no_proxy_match(): a host suffix-matching any no_proxy
   entry must NOT go through the proxy, hence the negation.  */
1501 /* Returns proxy host address, in accordance with PROTO. */
1503 getproxy (uerr_t proto)
1505 if (proto == URLHTTP)
1506 return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1507 else if (proto == URLFTP)
1508 return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1513 /* Should a host be accessed through proxy, concerning no_proxy? */
1515 no_proxy_match (const char *host, const char **no_proxy)
1520 return !sufmatch (no_proxy, host);
1523 /* Change the links in an HTML document. Accepts a structure that
1524 defines the positions of all the links. */
1526 convert_links (const char *file, urlpos *l)
1530 downloaded_file_t downloaded_file_return;
1533 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1534 /* Read from the file.... */
1535 fp = fopen (file, "rb");
1538 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1539 file, strerror (errno));
1542 /* ...to a buffer. */
1543 load_file (fp, &buf, &size);
1546 downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file);
1548 if (opt.backup_converted && downloaded_file_return)
1549 /* Rather than just writing over the original .html file with the converted
1550 version, save the former to *.orig. Note we only do this for files we've
1551 _successfully_ downloaded, so we don't clobber .orig files sitting around
1552 from previous invocations. */
1554 /* Construct the backup filename as the original name plus ".orig". */
1555 size_t filename_len = strlen(file);
1556 char* filename_plus_orig_suffix;
1557 boolean already_wrote_backup_file = FALSE;
1558 slist* converted_file_ptr;
1559 static slist* converted_files = NULL;
1561 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1563 /* Just write "orig" over "html". We need to do it this way because
1564 when we're checking to see if we've downloaded the file before (to
1565 see if we can skip downloading it), we don't know if it's a
1566 text/html file. Therefore we don't know yet at that stage that -E
1567 is going to cause us to tack on ".html", so we need to compare
1568 vs. the original URL plus ".orig", not the original URL plus
1570 filename_plus_orig_suffix = xmalloc(filename_len + 1);
1571 strcpy(filename_plus_orig_suffix, file);
1572 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1574 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1576 /* Append ".orig" to the name. */
1577 filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig"));
1578 strcpy(filename_plus_orig_suffix, file);
1579 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1582 /* We can get called twice on the same URL thanks to the
1583 convert_all_links() call in main(). If we write the .orig file each
1584 time in such a case, it'll end up containing the first-pass conversion,
1585 not the original file. So, see if we've already been called on this
1587 converted_file_ptr = converted_files;
1588 while (converted_file_ptr != NULL)
1589 if (strcmp(converted_file_ptr->string, file) == 0)
1591 already_wrote_backup_file = TRUE;
1595 converted_file_ptr = converted_file_ptr->next;
1597 if (!already_wrote_backup_file)
1599 /* Rename <file> to <file>.orig before former gets written over. */
1600 if (rename(file, filename_plus_orig_suffix) != 0)
1601 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1602 file, filename_plus_orig_suffix, strerror (errno));
1604 /* Remember that we've already written a .orig backup for this file.
1605 Note that we never free this memory since we need it till the
1606 convert_all_links() call, which is one of the last things the
1607 program does before terminating. BTW, I'm not sure if it would be
1608 safe to just set 'converted_file_ptr->string' to 'file' below,
1609 rather than making a copy of the string... Another note is that I
1610 thought I could just add a field to the urlpos structure saying
1611 that we'd written a .orig file for this URL, but that didn't work,
1612 so I had to make this separate list. */
1613 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1614 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1615 converted_file_ptr->next = converted_files;
1616 converted_files = converted_file_ptr;
1619 free(filename_plus_orig_suffix);
1621 /* Now open the file for writing. */
1622 fp = fopen (file, "wb");
1625 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1626 file, strerror (errno));
1630 /* Presumably we have to loop through multiple URLs here (even though we're
1631 only talking about a single local file) because of the -O option. */
1632 for (p = buf; l; l = l->next)
1636 DEBUGP (("Something strange is going on. Please investigate."));
1639 /* If the URL already is relative or it is not to be converted
1640 for some other reason (e.g. because of not having been
1641 downloaded in the first place), skip it. */
1642 if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
1644 DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
1648 /* Else, reach the position of the offending URL, echoing
1649 everything up to it to the outfile. */
1650 for (p2 = buf + l->pos; p < p2; p++)
1652 if (l->flags & UABS2REL)
1653 /* Convert absolute URL to relative. */
1655 char *newname = construct_relative (file, l->local_name);
1656 fprintf (fp, "%s", newname);
1657 DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
1658 l->url, newname, l->pos, file));
1663 /* Output the rest of the file. */
1666 for (p2 = buf + size; p < p2; p++)
1671 logputs (LOG_VERBOSE, _("done.\n"));
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int i, cnt, sepdirs1;
  char *res;

  /* An absolute S2 is already usable as-is.  */
  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  i = cnt = 0;
  /* Skip the directories common to both strings.  CNT ends up just
     past the last '/' of the shared prefix.  */
  while (1)
    {
      while (s1[i] && s2[i]
	     && (s1[i] == s2[i])
	     && (s1[i] != '/')
	     && (s2[i] != '/'))
	++i;
      if (s1[i] == '/' && s2[i] == '/')
	cnt = ++i;
      else
	break;
    }
  /* Each remaining separator in S1 means one directory we must climb
     out of with a "../".  */
  for (sepdirs1 = 0; s1[i]; i++)
    if (s1[i] == '/')
      ++sepdirs1;
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
  return res;
}
1728 /* Add URL to the head of the list L. */
1730 add_url (urlpos *l, const char *url, const char *file)
1734 t = (urlpos *)xmalloc (sizeof (urlpos));
1735 memset (t, 0, sizeof (*t));
1736 t->url = xstrdup (url);
1737 t->local_name = xstrdup (file);
1743 /* Remembers which files have been downloaded. In the standard case, should be
1744 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1745 download successfully (i.e. not for ones we have failures on or that we skip
1748 When we've downloaded a file and tacked on a ".html" extension due to -E,
1749 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1750 FILE_DOWNLOADED_NORMALLY.
1752 If you just want to check if a file has been previously added without adding
1753 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1754 with local filenames, not remote URLs. */
1756 downloaded_file (downloaded_file_t mode, const char* file)
1758 typedef struct _downloaded_file_list
1761 downloaded_file_t download_type;
1762 struct _downloaded_file_list* next;
1763 } downloaded_file_list;
1765 boolean found_file = FALSE;
1766 static downloaded_file_list* downloaded_files = NULL;
1767 downloaded_file_list* rover = downloaded_files;
1769 while (rover != NULL)
1770 if (strcmp(rover->file, file) == 0)
1776 rover = rover->next;
1779 return rover->download_type; /* file had already been downloaded */
1782 if (mode != CHECK_FOR_FILE)
1784 rover = xmalloc(sizeof(*rover));
1785 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1786 rover->download_type = mode;
1787 rover->next = downloaded_files;
1788 downloaded_files = rover;
1791 return FILE_NOT_ALREADY_DOWNLOADED;