2 Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
45 /* Table of unsafe characters.  This is initialized in
46 init_unsafe_char_table. */
/* One flag per possible byte value; a nonzero entry means that byte
   must be %XX-escaped when it appears in a URL. */
48 static char unsafe_char_table[256];
/* Look up C in the table.  The cast to unsigned char keeps the index
   non-negative even on platforms where plain char is signed. */
50 #define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
52 /* If S contains unsafe characters, free it and replace it with a
53 version that doesn't. */
54 #define URL_CLEANSE(s) do \
56 if (contains_unsafe (s)) \
58 char *uc_tmp = encode_string (s); \
64 /* Is a directory "."? */
65 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
66 /* Is a directory ".."? */
67 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Forward declarations of file-local helpers.  PARAMS is the
   traditional K&R/ANSI prototype-compatibility macro. */
70 static void path_simplify_with_kludge PARAMS ((char *));
72 static int urlpath_length PARAMS ((const char *));
74 /* NULL-terminated list of strings to be recognized as prototypes (URL
75 schemes). Note that recognized doesn't mean supported -- only HTTP,
76 HTTPS and FTP are currently supported.
78 However, a string that does not match anything in the list will be
79 considered a relative URL. Thus it's important that this list has
80 anything anyone could think of being legal.
82 There are wild things here. :-) Take a look at
83 <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
85 static char *protostrings[] =
127 /* Similar to former, but for supported protocols: */
128 static struct proto sup_protos[] =
130 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
132 { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
134 { "ftp://", URLFTP, DEFAULT_FTP_PORT },
135 /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
/* Forward declarations of the URL-parsing helpers defined later in
   this file. */
138 static void parse_dir PARAMS ((const char *, char **, char **));
139 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
140 static char *construct PARAMS ((const char *, const char *, int , int));
141 static char *construct_relative PARAMS ((const char *, const char *));
142 static char process_ftp_type PARAMS ((char *));
145 /* Returns the number of characters to be skipped if the first thing
146 in a URL is URL: (which is 0 or 4+). The optional spaces after
147 URL: are also skipped. */
149 skip_url (const char *url)
/* NOTE(review): several interior lines of this function are not
   visible in this excerpt (the declaration of i, the test of url[3]
   for ':', and the return statements); comments below describe only
   what is shown. */
153 if (TOUPPER (url[0]) == 'U'
154 && TOUPPER (url[1]) == 'R'
155 && TOUPPER (url[2]) == 'L'
/* Leading "URL" matched case-insensitively; skip past it and any
   whitespace that follows. */
159 for (i = 4; url[i] && ISSPACE (url[i]); i++);
168 - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
169 - @ and :, for user/password encoding.
170 - everything over 127 (but we don't bother with recording those. */
172 init_unsafe_char_table (void)
/* Mark every byte value that must be %XX-quoted in a URL.
   NOTE(review): the punctuation-specific tests that belong between the
   range check and the assignment are not visible in this excerpt. */
175 for (i = 0; i < 256; i++)
176 if (i < 32 || i >= 127
192 unsafe_char_table[i] = 1;
195 /* Returns 1 if the string contains unsafe characters, 0 otherwise. */
197 contains_unsafe (const char *s)
/* Scan S; any byte flagged in unsafe_char_table makes the answer 1.
   NOTE(review): the loop header and return statements are not visible
   in this excerpt. */
200 if (UNSAFE_CHAR (*s))
205 /* Decodes the forms %xy in a URL to the character the hexadecimal
206 code of which is xy. xy are hexadecimal digits from
207 [0123456789ABCDEF] (case-insensitive). If x or y are not
208 hex-digits or `%' precedes `\0', the sequence is inserted
212 decode_string (char *s)
/* Decoding happens in place: the write pointer P trails the read
   pointer S, which is safe because the decoded string is never longer
   than the encoded one. */
222 /* Do nothing if at the end of the string, or if the chars
223 are not hex-digits. */
224 if (!*(s + 1) || !*(s + 2)
225 || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
/* Combine the two hex digits following '%' into one byte. */
230 *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
237 /* Encode the unsafe characters (as determined by URL_UNSAFE) in a
238 given string, returning a malloc-ed %XX encoded string. */
240 encode_string (const char *s)
/* First pass: compute the length of the result.  Each unsafe byte
   expands from one character to three ("%XX"). */
247 for (i = 0; *s; s++, i++)
248 if (UNSAFE_CHAR (*s))
249 i += 2; /* Two more characters (hex digits) */
250 res = (char *)xmalloc (i + 1);
/* Second pass: copy, expanding unsafe bytes to %XX.  NOTE(review): the
   rewind of S between the two passes and the line emitting the '%'
   itself are not visible in this excerpt. */
252 for (p = res; *s; s++)
253 if (UNSAFE_CHAR (*s))
255 const unsigned char c = *s;
257 *p++ = HEXD2ASC (c >> 4);
258 *p++ = HEXD2ASC (c & 0xf);
266 /* Returns the proto-type if URL's protocol is supported, or
267 URLUNKNOWN if not. */
269 urlproto (const char *url)
273 url += skip_url (url);
274 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
275 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
276 return sup_protos[i].ind;
277 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
280 for (++i; url[i] && url[i] != '/'; i++)
281 if (!ISDIGIT (url[i]))
283 if (url[i - 1] == ':')
292 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
293 part is found, returns 0. */
295 skip_proto (const char *url)
300 for (s = protostrings; *s; s++)
301 if (!strncasecmp (*s, url, strlen (*s)))
306 /* HTTP and FTP protocols are expected to yield exact host names
307 (i.e. the `//' part must be skipped, too). */
308 if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
313 /* Returns 1 if the URL begins with a protocol (supported or
314 unsupported), 0 otherwise. */
316 has_proto (const char *url)
320 url += skip_url (url);
321 for (s = protostrings; *s; s++)
322 if (strncasecmp (url, *s, strlen (*s)) == 0)
327 /* Skip the username and password, if present here. The function
328 should be called *not* with the complete URL, but with the part
329 right after the protocol.
331 If no username and password are found, return 0. */
333 skip_uname (const char *url)
336 const char *q = NULL;
337 for (p = url ; *p && *p != '/'; p++)
338 if (*p == '@') q = p;
339 /* If a `@' was found before the first occurrence of `/', skip
347 /* Allocate a new urlinfo structure, fill it with default values and
348 return a pointer to it. */
354 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
355 memset (u, 0, sizeof (*u));
356 u->proto = URLUNKNOWN;
360 /* Perform a "deep" free of the urlinfo structure. The structure
361 should have been created with newurl, but need not have been used.
362 If free_pointer is non-0, free the pointer itself. */
364 freeurl (struct urlinfo *u, int complete)
368 FREE_MAYBE (u->host);
369 FREE_MAYBE (u->path);
370 FREE_MAYBE (u->file);
372 FREE_MAYBE (u->user);
373 FREE_MAYBE (u->passwd);
374 FREE_MAYBE (u->local);
375 FREE_MAYBE (u->referer);
377 freeurl (u->proxy, 1);
383 /* Extract the given URL of the form
384 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
385 1. hostname (terminated with `/' or `:')
386 2. port number (terminated with `/'), or chosen for the protocol
387 3. dirname (everything after hostname)
388 Most errors are handled. No allocation is done, you must supply
389 pointers to allocated memory.
390 ...and a host of other stuff :-)
392 - Recognizes hostname:dir/file for FTP and
393 hostname (:portnum)?/dir/file for HTTP.
394 - Parses the path to yield directory and file
395 - Parses the URL to yield the username and passwd (if present)
396 - Decodes the strings, in case they contain "forbidden" characters
397 - Writes the result to struct urlinfo
399 If the argument STRICT is set, it recognizes only the canonical
402 parseurl (const char *url, struct urlinfo *u, int strict)
405 int recognizable; /* Recognizable URL is the one where
406 the protocol name was explicitly
407 named, i.e. it wasn't deduced from
411 DEBUGP (("parseurl (\"%s\") -> ", url));
412 url += skip_url (url);
413 recognizable = has_proto (url);
414 if (strict && !recognizable)
416 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
418 l = strlen (sup_protos[i].name);
419 if (!strncasecmp (sup_protos[i].name, url, l))
422 /* If protocol is recognizable, but unsupported, bail out, else
424 if (recognizable && i == ARRAY_SIZE (sup_protos))
426 else if (i == ARRAY_SIZE (sup_protos))
429 u->proto = type = sup_protos[i].ind;
431 if (type == URLUNKNOWN)
433 /* Allow a username and password to be specified (i.e. just skip
436 l += skip_uname (url + l);
437 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
440 /* Get the hostname. */
441 u->host = strdupdelim (url + l, url + i);
442 DEBUGP (("host %s -> ", u->host));
444 /* Assume no port has been given. */
448 /* We have a colon delimiting the hostname. It could mean that
449 a port number is following it, or a directory. */
450 if (ISDIGIT (url[++i])) /* A port number */
452 if (type == URLUNKNOWN)
453 u->proto = type = URLHTTP;
454 for (; url[i] && url[i] != '/'; i++)
455 if (ISDIGIT (url[i]))
456 u->port = 10 * u->port + (url[i] - '0');
461 DEBUGP (("port %hu -> ", u->port));
463 else if (type == URLUNKNOWN) /* or a directory */
464 u->proto = type = URLFTP;
465 else /* or just a misformed port number */
468 else if (type == URLUNKNOWN)
469 u->proto = type = URLHTTP;
473 for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
474 if (sup_protos[ind].ind == type)
476 if (ind == ARRAY_SIZE (sup_protos))
478 u->port = sup_protos[ind].port;
480 /* Some delimiter troubles... */
481 if (url[i] == '/' && url[i - 1] != ':')
484 while (url[i] && url[i] == '/')
486 u->path = (char *)xmalloc (strlen (url + i) + 8);
487 strcpy (u->path, url + i);
490 u->ftp_type = process_ftp_type (u->path);
491 /* #### We don't handle type `d' correctly yet. */
492 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
494 DEBUGP (("ftp_type %c -> ", u->ftp_type));
496 DEBUGP (("opath %s -> ", u->path));
497 /* Parse the username and password (if existing). */
498 parse_uname (url, &u->user, &u->passwd);
499 /* Decode the strings, as per RFC 1738. */
500 decode_string (u->host);
501 decode_string (u->path);
503 decode_string (u->user);
505 decode_string (u->passwd);
506 /* Parse the directory. */
507 parse_dir (u->path, &u->dir, &u->file);
508 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
509 /* Simplify the directory. */
510 path_simplify (u->dir);
511 /* Remove the leading `/' in HTTP. */
512 if (type == URLHTTP && *u->dir == '/')
513 strcpy (u->dir, u->dir + 1);
514 DEBUGP (("ndir %s\n", u->dir));
515 /* Strip trailing `/'. */
517 if (l > 1 && u->dir[l - 1] == '/')
518 u->dir[l - 1] = '\0';
519 /* Re-create the path: */
520 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
521 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
522 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
523 strcpy (u->path, abs_ftp ? "%2F" : "/");
524 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
525 strcat (u->path, *u->dir ? "/" : "");
526 strcat (u->path, u->file);
527 URL_CLEANSE (u->path);
528 DEBUGP (("newpath: %s\n", u->path));
529 /* Create the clean URL. */
530 u->url = str_url (u, 0);
/* Special versions of DOTP and DDOTP for parse_dir(): they treat a
   path component as terminated either by '\0' or by the start of a
   query string ('?'). */
#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
/* Fixed: the second test previously re-checked *(x) instead of
   *((x) + 1), so any component that merely *started* with '.' (e.g.
   ".x") was misclassified as "..". */
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
                     && (!*((x) + 2) || *((x) + 2) == '?'))
540 /* Build the directory and filename components of the path. Both
541 components are *separately* malloc-ed strings! It does not change
542 the contents of path.
544 If the path ends with "." or "..", they are (correctly) counted as
547 parse_dir (const char *path, char **dir, char **file)
551 l = urlpath_length (path);
552 for (i = l; i && path[i] != '/'; i--);
554 if (!i && *path != '/') /* Just filename */
556 if (PD_DOTP (path) || PD_DDOTP (path))
558 *dir = strdupdelim (path, path + l);
559 *file = xstrdup (path + l); /* normally empty, but could
564 *dir = xstrdup (""); /* This is required because of FTP */
565 *file = xstrdup (path);
568 else if (!i) /* /filename */
570 if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
572 *dir = strdupdelim (path, path + l);
573 *file = xstrdup (path + l); /* normally empty, but could
578 *dir = xstrdup ("/");
579 *file = xstrdup (path + 1);
582 else /* Nonempty directory with or without a filename */
584 if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
586 *dir = strdupdelim (path, path + l);
587 *file = xstrdup (path + l); /* normally empty, but could
592 *dir = strdupdelim (path, path + i);
593 *file = xstrdup (path + i + 1);
598 /* Find the optional username and password within the URL, as per
599 RFC1738. The returned user and passwd char pointers are
602 parse_uname (const char *url, char **user, char **passwd)
605 const char *p, *q, *col;
610 url += skip_url (url);
611 /* Look for end of protocol string. */
612 l = skip_proto (url);
615 /* Add protocol offset. */
617 /* Is there an `@' character? */
618 for (p = url; *p && *p != '/'; p++)
621 /* If not, return. */
624 /* Else find the username and password. */
625 for (p = q = col = url; *p && *p != '/'; p++)
627 if (*p == ':' && !*user)
629 *user = (char *)xmalloc (p - url + 1);
630 memcpy (*user, url, p - url);
631 (*user)[p - url] = '\0';
634 if (*p == '@') q = p;
636 /* Decide whether you have only the username or both. */
637 where = *user ? passwd : user;
638 *where = (char *)xmalloc (q - col + 1);
639 memcpy (*where, col, q - col);
640 (*where)[q - col] = '\0';
644 /* If PATH ends with `;type=X', return the character X. */
646 process_ftp_type (char *path)
648 int len = strlen (path);
/* NOTE(review): the guard ensuring LEN is long enough for the
   ";type=X" suffix is not visible in this excerpt. */
651 && !memcmp (path + len - 7, ";type=", 6))
/* Truncate the ";type=X" suffix off PATH first; the type character at
   the old position len - 1 is still physically in the buffer, so it
   can be returned after the truncation. */
653 path[len - 7] = '\0';
654 return path[len - 1];
660 /* Return the URL as fine-formed string, with a proper protocol, optional port
661 number, directory and optional user/password. If `hide' is non-zero (as it
662 is when we're calling this on a URL we plan to print, but not when calling it
663 to canonicalize a URL for use within the program), password will be hidden.
664 The forbidden characters in the URL will be cleansed. */
666 str_url (const struct urlinfo *u, int hide)
668 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
669 int i, l, ln, lu, lh, lp, lf, ld;
670 unsigned short proto_default_port;
672 /* Look for the protocol name. */
673 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
674 if (sup_protos[i].ind == u->proto)
676 if (i == ARRAY_SIZE (sup_protos))
678 proto_name = sup_protos[i].name;
679 proto_default_port = sup_protos[i].port;
680 host = CLEANDUP (u->host);
681 dir = CLEANDUP (u->dir);
682 file = CLEANDUP (u->file);
683 user = passwd = NULL;
685 user = CLEANDUP (u->user);
689 /* Don't output the password, or someone might see it over the user's
690 shoulder (or in saved wget output). Don't give away the number of
691 characters in the password, either, as we did in past versions of
692 this code, when we replaced the password characters with 'x's. */
693 passwd = xstrdup("<password>");
695 passwd = CLEANDUP (u->passwd);
697 if (u->proto == URLFTP && *dir == '/')
699 char *tmp = (char *)xmalloc (strlen (dir) + 3);
700 /*sprintf (tmp, "%%2F%s", dir + 1);*/
704 strcpy (tmp + 3, dir + 1);
709 ln = strlen (proto_name);
710 lu = user ? strlen (user) : 0;
711 lp = passwd ? strlen (passwd) : 0;
715 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
716 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
717 (user ? user : ""), (passwd ? ":" : ""),
718 (passwd ? passwd : ""), (user ? "@" : ""),
719 host, u->port, dir, *dir ? "/" : "", file); */
721 memcpy (res, proto_name, ln);
725 memcpy (res + l, user, lu);
730 memcpy (res + l, passwd, lp);
735 memcpy (res + l, host, lh);
737 if (u->port != proto_default_port)
740 long_to_string (res + l, (long)u->port);
741 l += numdigit (u->port);
744 memcpy (res + l, dir, ld);
748 strcpy (res + l, file);
757 /* Check whether two URL-s are equivalent, i.e. pointing to the same
758 location. Uses parseurl to parse them, and compares the canonical
761 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
762 return 0 on error. */
764 url_equal (const char *url1, const char *url2)
766 struct urlinfo *u1, *u2;
771 err = parseurl (url1, u1, 0);
778 err = parseurl (url2, u2, 0);
784 res = !strcmp (u1->url, u2->url);
791 get_urls_file (const char *file)
793 struct file_memory *fm;
795 const char *text, *text_end;
798 fm = read_file (file);
801 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
804 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
807 text_end = fm->content + fm->length;
808 while (text < text_end)
810 const char *line_beg = text;
811 const char *line_end = memchr (text, '\n', text_end - text);
817 while (line_beg < line_end
818 && ISSPACE (*line_beg))
820 while (line_end > line_beg + 1
821 && ISSPACE (*(line_end - 1)))
823 if (line_end > line_beg)
825 urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
826 memset (entry, 0, sizeof (*entry));
828 entry->url = strdupdelim (line_beg, line_end);
840 /* Free the linked list of urlpos. */
842 free_urlpos (urlpos *l)
846 urlpos *next = l->next;
848 FREE_MAYBE (l->local_name);
854 /* Rotate FNAME opt.backups times */
856 rotate_backups(const char *fname)
858 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
859 char *from = (char *)alloca (maxlen);
860 char *to = (char *)alloca (maxlen);
864 if (stat (fname, &sb) == 0)
865 if (S_ISREG (sb.st_mode) == 0)
868 for (i = opt.backups; i > 1; i--)
870 sprintf (from, "%s.%d", fname, i - 1);
871 sprintf (to, "%s.%d", fname, i);
872 /* #### This will fail on machines without the rename() system
877 sprintf (to, "%s.%d", fname, 1);
881 /* Create all the necessary directories for PATH (a file). Calls
882 mkdirhier() internally. */
884 mkalldirs (const char *path)
891 p = path + strlen (path);
892 for (; *p != '/' && p != path; p--);
893 /* Don't create if it's just a file. */
894 if ((p == path) && (*p != '/'))
896 t = strdupdelim (path, p);
897 /* Check whether the directory exists. */
898 if ((stat (t, &st) == 0))
900 if (S_ISDIR (st.st_mode))
907 /* If the dir exists as a file name, remove it first. This
908 is *only* for Wget to work with buggy old CERN http
909 servers. Here is the scenario: When Wget tries to
910 retrieve a directory without a slash, e.g.
911 http://foo/bar (bar being a directory), CERN server will
912 not redirect it to http://foo/bar/ -- it will generate a
913 directory listing containing links to bar/file1,
914 bar/file2, etc. Wget will lose because it saves this
915 HTML listing to a file `bar', so it cannot create the
916 directory. To work around this, if the file of the same
917 name exists, we just remove it and create the directory
919 DEBUGP (("Removing %s because of directory danger!\n", t));
923 res = make_directory (t);
925 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
931 count_slashes (const char *s)
940 /* Return the path name of the URL-equivalent file name, with a
941 remote-like structure of directories. */
943 mkstruct (const struct urlinfo *u)
945 char *host, *dir, *file, *res, *dirpref;
948 assert (u->dir != NULL);
949 assert (u->host != NULL);
953 char *ptr = u->dir + (*u->dir == '/');
954 int slash_count = 1 + count_slashes (ptr);
955 int cut = MINVAL (opt.cut_dirs, slash_count);
956 for (; cut && *ptr; ptr++)
959 STRDUP_ALLOCA (dir, ptr);
962 dir = u->dir + (*u->dir == '/');
964 host = xstrdup (u->host);
965 /* Check for the true name (or at least a consistent name for saving
966 to directory) of HOST, reusing the hlist if possible. */
967 if (opt.add_hostdir && !opt.simple_check)
969 char *nhost = realhost (host);
973 /* Add dir_prefix and hostname (if required) to the beginning of
977 if (!DOTP (opt.dir_prefix))
979 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
980 + strlen (host) + 1);
981 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
984 STRDUP_ALLOCA (dirpref, host);
986 else /* not add_hostdir */
988 if (!DOTP (opt.dir_prefix))
989 dirpref = opt.dir_prefix;
995 /* If there is a prefix, prepend it. */
998 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
999 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1002 dir = xstrdup (dir);
1005 if (l && dir[l - 1] == '/')
1009 file = "index.html";
1013 /* Finally, construct the full name. */
1014 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1015 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1020 /* Create a unique filename, corresponding to a given URL. Calls
1021 mkstruct if necessary. Does *not* actually create any directories. */
1023 url_filename (const struct urlinfo *u)
1026 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1030 file = mkstruct (u);
1036 file = xstrdup ("index.html");
1038 file = xstrdup (u->file);
1043 /* Check whether the prefix directory is something other than "."
1044 before prepending it. */
1045 if (!DOTP (opt.dir_prefix))
1047 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1048 + 1 + strlen (file) + 1);
1049 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1054 /* DOS-ish file systems don't like `%' signs in them; we change it
1059 for (p = file; *p; p++)
1063 #endif /* WINDOWS */
1065 /* Check the cases in which the unique extensions are not used:
1066 1) Clobbering is turned off (-nc).
1067 2) Retrieval with regetting.
1068 3) Timestamping is used.
1069 4) Hierarchy is built.
1071 The exception is the case when file does exist and is a
1072 directory (actually support for bad httpd-s). */
1073 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1074 && !(file_exists_p (file) && !file_non_directory_p (file)))
1077 /* Find a unique name. */
1078 name = unique_name (file);
1083 /* Like strlen(), but allow the URL to be ended with '?'. */
1085 urlpath_length (const char *url)
1087 const char *q = strchr (url, '?');
/* NOTE(review): the branch returning q - url when a '?' was found is
   not visible in this excerpt; without a '?', the path length is the
   whole string. */
1090 return strlen (url);
1093 /* Find the last occurrence of character C in the range [b, e), or
1094 NULL, if none are present. This is almost completely equivalent to
1095 { *e = '\0'; return strrchr(b); }, except that it doesn't change
1096 the contents of the string. */
1098 find_last_char (const char *b, const char *e, char c)
1106 /* Construct a URL by concatenating an absolute URL and a path, which
1107 may or may not be absolute. This tries to behave "reasonably" in
1108 all foreseeable cases. It employs little specific knowledge about
1109 protocols or URL-specific stuff -- it just works on strings. */
1111 construct (const char *url, const char *sub, int subsize, int no_proto)
1117 const char *end = url + urlpath_length (url);
1121 /* SUB is a relative URL: we need to replace everything
1122 after last slash (possibly empty) with SUB.
1124 So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
1125 our result should be "whatever/foo/qux/xyzzy". */
1126 int need_explicit_slash = 0;
1128 const char *start_insert;
1129 const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */
1132 /* No slash found at all. Append SUB to what we have,
1133 but we'll need a slash as a separator.
1135 Example: if url == "foo" and sub == "qux/xyzzy", then
1136 we cannot just append sub to url, because we'd get
1137 "fooqux/xyzzy", whereas what we want is
1140 To make sure the / gets inserted, we set
1141 need_explicit_slash to 1. We also set start_insert
1142 to end + 1, so that the length calculations work out
1143 correctly for one more (slash) character. Accessing
1144 that character is fine, since it will be the
1145 delimiter, '\0' or '?'. */
1146 /* example: "foo?..." */
1147 /* ^ ('?' gets changed to '/') */
1148 start_insert = end + 1;
1149 need_explicit_slash = 1;
1151 else if (last_slash && last_slash != url && *(last_slash - 1) == '/')
1153 /* example: http://host" */
1155 start_insert = end + 1;
1156 need_explicit_slash = 1;
1160 /* example: "whatever/foo/bar" */
1162 start_insert = last_slash + 1;
1165 span = start_insert - url;
1166 constr = (char *)xmalloc (span + subsize + 1);
1168 memcpy (constr, url, span);
1169 if (need_explicit_slash)
1170 constr[span - 1] = '/';
1172 memcpy (constr + span, sub, subsize);
1173 constr[span + subsize] = '\0';
1175 else /* *sub == `/' */
1177 /* SUB is an absolute path: we need to replace everything
1178 after (and including) the FIRST slash with SUB.
1180 So, if URL is "http://host/whatever/foo/bar", and SUB is
1181 "/qux/xyzzy", our result should be
1182 "http://host/qux/xyzzy". */
1185 const char *start_insert = NULL; /* for gcc to shut up. */
1186 const char *pos = url;
1187 int seen_slash_slash = 0;
1188 /* We're looking for the first slash, but want to ignore
1191 slash = memchr (pos, '/', end - pos);
1192 if (slash && !seen_slash_slash)
1193 if (*(slash + 1) == '/')
1196 seen_slash_slash = 1;
1200 /* At this point, SLASH is the location of the first / after
1201 "//", or the first slash altogether. START_INSERT is the
1202 pointer to the location where SUB will be inserted. When
1203 examining the last two examples, keep in mind that SUB
1206 if (!slash && !seen_slash_slash)
1207 /* example: "foo" */
1210 else if (!slash && seen_slash_slash)
1211 /* example: "http://foo" */
1214 else if (slash && !seen_slash_slash)
1215 /* example: "foo/bar" */
1218 else if (slash && seen_slash_slash)
1219 /* example: "http://something/" */
1221 start_insert = slash;
1223 span = start_insert - url;
1224 constr = (char *)xmalloc (span + subsize + 1);
1226 memcpy (constr, url, span);
1228 memcpy (constr + span, sub, subsize);
1229 constr[span + subsize] = '\0';
1232 else /* !no_proto */
1234 constr = strdupdelim (sub, sub + subsize);
1239 /* Like the function above, but with a saner caller interface. */
1241 url_concat (const char *base_url, const char *new_url)
1243 return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));
1246 /* Optimize URL by host, destructively replacing u->host with realhost
1247 (u->host). Do this regardless of opt.simple_check. */
1249 opt_url (struct urlinfo *u)
1251 /* Find the "true" host. */
1252 char *host = realhost (u->host);
1255 assert (u->dir != NULL); /* the URL must have been parsed */
1256 /* Refresh the printed representation. */
1258 u->url = str_url (u, 0);
1261 /* This beautiful kludge is fortunately not needed, as I've made
1262 parse_dir do the (almost) right thing, so that a query can never
1263 become a part of directory. */
1265 /* Call path_simplify, but make sure that the part after the
1266 question-mark, if any, is not destroyed by path_simplify's
1269 path_simplify_with_kludge (char *path)
1271 char *query = strchr (path, '?');
1273 /* path_simplify also works destructively, so we also have the
1274 license to write. */
1276 path_simplify (path);
1279 char *newend = path + strlen (path);
1281 if (newend != query)
1282 memmove (newend, query, strlen (query) + 1);
1287 /* Returns proxy host address, in accordance with PROTO. */
1289 getproxy (uerr_t proto)
1291 if (proto == URLHTTP)
1292 return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1293 else if (proto == URLFTP)
1294 return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1296 else if (proto == URLHTTPS)
1297 return opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1298 #endif /* HAVE_SSL */
1303 /* Should a host be accessed through proxy, concerning no_proxy? */
1305 no_proxy_match (const char *host, const char **no_proxy)
1310 return !sufmatch (no_proxy, host);
1313 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1314 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1316 /* Change the links in an HTML document. Accepts a structure that
1317 defines the positions of all the links. */
1319 convert_links (const char *file, urlpos *l)
1321 struct file_memory *fm;
1324 downloaded_file_t downloaded_file_return;
1326 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1329 /* First we do a "dry run": go through the list L and see whether
1330 any URL needs to be converted in the first place. If not, just
1331 leave the file alone. */
1334 for (dry = l; dry; dry = dry->next)
1335 if (dry->convert != CO_NOCONVERT)
1339 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1344 fm = read_file (file);
1347 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1348 file, strerror (errno));
1352 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1353 if (opt.backup_converted && downloaded_file_return)
1354 write_backup_file (file, downloaded_file_return);
1356 /* Before opening the file for writing, unlink the file. This is
1357 important if the data in FM is mmaped. In such case, nulling the
1358 file, which is what fopen() below does, would make us read all
1359 zeroes from the mmaped region. */
1360 if (unlink (file) < 0 && errno != ENOENT)
1362 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1363 file, strerror (errno));
1364 read_file_free (fm);
1367 /* Now open the file for writing. */
1368 fp = fopen (file, "wb");
1371 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1372 file, strerror (errno));
1373 read_file_free (fm);
1376 /* Here we loop through all the URLs in file, replacing those of
1377 them that are downloaded with relative references. */
1379 for (; l; l = l->next)
1381 char *url_start = fm->content + l->pos;
1383 if (l->pos >= fm->length)
1385 DEBUGP (("Something strange is going on. Please investigate."));
1388 /* If the URL is not to be converted, skip it. */
1389 if (l->convert == CO_NOCONVERT)
1391 DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1395 /* Echo the file contents, up to the offending URL's opening
1396 quote, to the outfile. */
1397 fwrite (p, 1, url_start - p, fp);
1399 if (l->convert == CO_CONVERT_TO_RELATIVE)
1401 /* Convert absolute URL to relative. */
1402 char *newname = construct_relative (file, l->local_name);
1403 char *quoted_newname = html_quote_string (newname);
1404 replace_attr (&p, l->size, fp, quoted_newname);
1405 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1406 l->url, newname, l->pos, file));
1408 xfree (quoted_newname);
1410 else if (l->convert == CO_CONVERT_TO_COMPLETE)
1412 /* Convert the link to absolute URL. */
1413 char *newlink = l->url;
1414 char *quoted_newlink = html_quote_string (newlink);
1415 replace_attr (&p, l->size, fp, quoted_newlink);
1416 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1417 newlink, l->pos, file));
1418 xfree (quoted_newlink);
1421 /* Output the rest of the file. */
1422 if (p - fm->content < fm->length)
1423 fwrite (p, 1, fm->length - (p - fm->content), fp);
1425 read_file_free (fm);
1426 logputs (LOG_VERBOSE, _("done.\n"));
1429 /* Construct and return a malloced copy of the relative link from two
1430 pieces of information: local name S1 of the referring file and
1431 local name S2 of the referred file.
1433 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1434 "jagor.srce.hr/images/news.gif", the function will return
1437 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1438 "fly.cc.fer.hr/images/fly.gif", the function will return
1439 "../images/fly.gif".
1441 Caveats: S1 should not begin with `/', unless S2 also begins with
1442 '/'. S1 should not contain things like ".." and such --
1443 construct_relative ("fly/ioccc/../index.html",
1444 "fly/images/fly.gif") will fail. (A workaround is to call
1445 something like path_simplify() on S1). */
1447 construct_relative (const char *s1, const char *s2)
1449 int i, cnt, sepdirs1;
1453 return xstrdup (s2);
1454 /* S1 should *not* be absolute, if S2 wasn't. */
1455 assert (*s1 != '/');
1457 /* Skip the directories common to both strings. */
1460 while (s1[i] && s2[i]
1465 if (s1[i] == '/' && s2[i] == '/')
1470 for (sepdirs1 = 0; s1[i]; i++)
1473 /* Now, construct the file as of:
1474 - ../ repeated sepdirs1 time
1475 - all the non-mutual directories of S2. */
1476 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1477 for (i = 0; i < sepdirs1; i++)
1478 memcpy (res + 3 * i, "../", 3);
1479 strcpy (res + 3 * i, s2 + cnt);
1483 /* Add URL to the head of the list L. */
/* NOTE(review): elided extract -- the return type, braces, and the
   statements that link the new node onto L (and return it) are on
   missing lines.  Visible code only allocates and fills the node. */
1485 add_url (urlpos *l, const char *url, const char *file)
/* Allocate a urlpos node and zero every field so unset members
   (flags, position, next) start out NULL/0. */
1489 t = (urlpos *)xmalloc (sizeof (urlpos));
1490 memset (t, 0, sizeof (*t));
/* The node owns private copies of both strings (xstrdup aborts on
   out-of-memory, so no NULL checks are needed here). */
1491 t->url = xstrdup (url);
1492 t->local_name = xstrdup (file);
/* Back up FILE as FILE.orig (or, when -E added ".html", as the name
   with "html" replaced by "orig") before link conversion overwrites
   it.  Keeps a static list of files already backed up so a second
   conversion pass does not clobber the true original.
   NOTE(review): elided extract -- return type, braces, and some
   statements are on missing lines; line-number prefixes are extraction
   artifacts. */
1498 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1500 /* Rather than just writing over the original .html file with the
1501 converted version, save the former to *.orig. Note we only do
1502 this for files we've _successfully_ downloaded, so we don't
1503 clobber .orig files sitting around from previous invocations. */
1505 /* Construct the backup filename as the original name plus ".orig". */
1506 size_t filename_len = strlen(file);
1507 char* filename_plus_orig_suffix;
1508 boolean already_wrote_backup_file = FALSE;
1509 slist* converted_file_ptr;
/* Static: the backed-up-file list persists across calls for the whole
   run (intentionally never freed -- see the long comment below). */
1510 static slist* converted_files = NULL;
1512 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1514 /* Just write "orig" over "html". We need to do it this way
1515 because when we're checking to see if we've downloaded the
1516 file before (to see if we can skip downloading it), we don't
1517 know if it's a text/html file. Therefore we don't know yet
1518 at that stage that -E is going to cause us to tack on
1519 ".html", so we need to compare vs. the original URL plus
1520 ".orig", not the original URL plus ".html.orig". */
/* Stack allocation; +1 for the NUL.  Assumes FILE ends in "html"
   (guaranteed by this branch's mode) so the in-place overwrite of the
   last 4 characters below stays in bounds -- TODO confirm callers
   never pass a shorter name with this mode. */
1521 filename_plus_orig_suffix = alloca (filename_len + 1);
1522 strcpy(filename_plus_orig_suffix, file);
1523 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1525 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1527 /* Append ".orig" to the name. */
/* sizeof(".orig") == 6 already includes the NUL terminator. */
1528 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1529 strcpy(filename_plus_orig_suffix, file);
1530 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1533 /* We can get called twice on the same URL thanks to the
1534 convert_all_links() call in main(). If we write the .orig file
1535 each time in such a case, it'll end up containing the first-pass
1536 conversion, not the original file. So, see if we've already been
1537 called on this file. */
/* Linear scan of the remembered list; fine for the modest file counts
   a single wget run produces. */
1538 converted_file_ptr = converted_files;
1539 while (converted_file_ptr != NULL)
1540 if (strcmp(converted_file_ptr->string, file) == 0)
1542 already_wrote_backup_file = TRUE;
1546 converted_file_ptr = converted_file_ptr->next;
1548 if (!already_wrote_backup_file)
1550 /* Rename <file> to <file>.orig before former gets written over. */
/* On failure we only log -- conversion proceeds without a backup. */
1551 if (rename(file, filename_plus_orig_suffix) != 0)
1552 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1553 file, filename_plus_orig_suffix, strerror (errno));
1555 /* Remember that we've already written a .orig backup for this file.
1556 Note that we never free this memory since we need it till the
1557 convert_all_links() call, which is one of the last things the
1558 program does before terminating. BTW, I'm not sure if it would be
1559 safe to just set 'converted_file_ptr->string' to 'file' below,
1560 rather than making a copy of the string... Another note is that I
1561 thought I could just add a field to the urlpos structure saying
1562 that we'd written a .orig file for this URL, but that didn't work,
1563 so I had to make this separate list.
1564 -- Dan Harkless <wget@harkless.org>
1566 This [adding a field to the urlpos structure] didn't work
1567 because convert_file() is called twice: once after all its
1568 sublinks have been retrieved in recursive_retrieve(), and
1569 once at the end of the day in convert_all_links(). The
1570 original linked list collected in recursive_retrieve() is
1571 lost after the first invocation of convert_links(), and
1572 convert_all_links() makes a new one (it calls get_urls_html()
1573 for each file it covers.) That's why your first approach didn't
1574 work. The way to make it work is perhaps to make this flag a
1575 field in the `urls_html' list.
1576 -- Hrvoje Niksic <hniksic@arsdigita.com>
/* Push a copy of FILE onto the head of the static list. */
1578 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1579 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1580 converted_file_ptr->next = converted_files;
1581 converted_files = converted_file_ptr;
1585 static int find_fragment PARAMS ((const char *, int, const char **,
/* Write NEW_STR to FP in place of the attribute value at *PP (which is
   RAW_SIZE bytes long in the source buffer), preserving the original
   quote character and any trailing #fragment of the old value.
   NOTE(review): elided extract -- return type, braces, and several
   statements (including the unquoted-value path and the advance of
   *PP) are on missing lines. */
1589 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1591 const char *p = *pp;
1593 int size = raw_size;
/* Default quote used when re-emitting the value. */
1594 char quote_char = '\"';
1595 const char *frag_beg, *frag_end;
1597 /* Structure of our string is:
1598 "...old-contents..."
1599 <--- l->size ---> (with quotes)
1602 <--- l->size --> (no quotes) */
/* If the old value was quoted, presumably reuse its quote character
   and step past it (assignment lines elided -- TODO confirm). */
1604 if (*p == '\"' || *p == '\'')
1609 size -= 2; /* disregard opening and closing quote */
/* Emit: opening quote, the replacement string... */
1611 putc (quote_char, fp);
1612 fputs (new_str, fp);
1614 /* Look for fragment identifier, if any. */
/* ...then carry over the old value's "#fragment" verbatim, since the
   new link was built without it... */
1615 if (find_fragment (p, size, &frag_beg, &frag_end))
1616 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
/* ...and the closing quote. */
1620 putc (quote_char, fp);
1624 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1625 preceded by '&'. If the character is not found, return zero. If
1626 the character is found, return 1 and set BP and EP to point to the
1627 beginning and end of the region.
1629 This is used for finding the fragment indentifiers in URLs. */
/* NOTE(review): elided extract -- the return type line, braces, and
   the entire loop body (the '#'/'&' tests and the setting of *BP and
   *EP) are on missing lines; only the scan header survives. */
1632 find_fragment (const char *beg, int size, const char **bp, const char **ep)
1634 const char *end = beg + size;
/* Scan the half-open range [BEG, END) one byte at a time. */
1636 for (; beg < end; beg++)
/* Node of the singly linked list that records every local file this
   run has downloaded and how (see downloaded_file() below).
   NOTE(review): the member holding the filename (referenced elsewhere
   as `file`, a malloced string) is on an elided line. */
1658 typedef struct _downloaded_file_list {
/* How the file was obtained: FILE_DOWNLOADED_NORMALLY or
   FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED. */
1660 downloaded_file_t download_type;
1661 struct _downloaded_file_list* next;
1662 } downloaded_file_list;
/* Head of the list; file-scope static, grows for the whole run and is
   released only by downloaded_files_free(). */
1664 static downloaded_file_list *downloaded_files;
1666 /* Remembers which files have been downloaded. In the standard case, should be
1667 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1668 download successfully (i.e. not for ones we have failures on or that we skip
1671 When we've downloaded a file and tacked on a ".html" extension due to -E,
1672 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1673 FILE_DOWNLOADED_NORMALLY.
1675 If you just want to check if a file has been previously added without adding
1676 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1677 with local filenames, not remote URLs. */
/* NOTE(review): elided extract -- return type, braces, the loop body
   that sets found_file/breaks, and the branch structure around the
   two returns are on missing lines. */
1679 downloaded_file (downloaded_file_t mode, const char* file)
1681 boolean found_file = FALSE;
1682 downloaded_file_list* rover = downloaded_files;
/* Linear search of the global list for an exact filename match. */
1684 while (rover != NULL)
1685 if (strcmp(rover->file, file) == 0)
1691 rover = rover->next;
/* Hit: report how the file was previously downloaded (presumably
   reached only when found_file was set -- guard line elided). */
1694 return rover->download_type; /* file had already been downloaded */
/* Miss: record the file unless the caller only wanted a lookup. */
1697 if (mode != CHECK_FOR_FILE)
/* New node pushed onto the head of the global list. */
1699 rover = xmalloc(sizeof(*rover));
1700 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1701 rover->download_type = mode;
1702 rover->next = downloaded_files;
1703 downloaded_files = rover;
1706 return FILE_NOT_ALREADY_DOWNLOADED;
/* Release the whole downloaded-files list (nodes and their filename
   strings).  NOTE(review): elided extract -- return type, braces, the
   loop header, the free of the node itself, and the advance to `next`
   are on missing lines. */
1711 downloaded_files_free (void)
1713 downloaded_file_list* rover = downloaded_files;
/* NEXT is saved before the node is destroyed so traversal can
   continue safely. */
1716 downloaded_file_list *next = rover->next;
1717 xfree (rover->file);
1723 /* Initialization of static stuff. */
1727 init_unsafe_char_table ();