2 Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
/* Lookup table of "unsafe" URL characters: a nonzero entry at index C
   means character C must be %XX-escaped in a URL.  Filled in by
   init_unsafe_char_table below. */
46 /* Table of Unsafe chars. This is intialized in
47 init_unsafe_char_table. */
49 static char unsafe_char_table[256];
/* Nonzero iff C is unsafe.  The cast to unsigned char keeps negative
   plain-char values from indexing outside the table. */
51 #define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
/* Re-encode string S in place if it contains unsafe characters.
   NOTE(review): this excerpt is missing the macro's continuation lines
   (the braces, the xfree of the old S, the assignment of uc_tmp, and
   the closing while(0)) -- confirm against the complete source. */
53 /* If S contains unsafe characters, free it and replace it with a
54 version that doesn't. */
55 #define URL_CLEANSE(s) do \
57 if (contains_unsafe (s)) \
59 char *uc_tmp = encode_string (s); \
/* Is the directory component X exactly "."?  X is evaluated more than
   once, so pass only side-effect-free expressions. */
#define DOTP(x) ((x)[0] == '.' && (x)[1] == '\0')
/* Is the directory component X exactly ".."? */
#define DDOTP(x) ((x)[0] == '.' && (x)[1] == '.' && (x)[2] == '\0')
71 static void path_simplify_with_kludge PARAMS ((char *));
73 static int urlpath_length PARAMS ((const char *));
75 /* NULL-terminated list of strings to be recognized as prototypes (URL
76 schemes). Note that recognized doesn't mean supported -- only HTTP,
77 HTTPS and FTP are currently supported .
79 However, a string that does not match anything in the list will be
80 considered a relative URL. Thus it's important that this list has
81 anything anyone could think of being legal.
83 There are wild things here. :-) Take a look at
84 <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
86 static char *protostrings[] =
/* Table of *supported* protocols: scheme prefix, protocol identifier,
   and that protocol's default port.  NOTE(review): the opening and
   closing braces of the initializer are on lines elided from this
   excerpt. */
128 /* Similar to former, but for supported protocols: */
129 static struct proto sup_protos[] =
131 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
133 { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
135 { "ftp://", URLFTP, DEFAULT_FTP_PORT },
136 /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
139 static void parse_dir PARAMS ((const char *, char **, char **));
140 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
141 static char *construct PARAMS ((const char *, const char *, int , int));
142 static char *construct_relative PARAMS ((const char *, const char *));
143 static char process_ftp_type PARAMS ((char *));
146 /* Returns the number of characters to be skipped if the first thing
147 in a URL is URL: (which is 0 or 4+). The optional spaces after
148 URL: are also skipped. */
/* NOTE(review): excerpted body -- the test of url[3] == ':', the
   braces and the return statements are on lines not shown here. */
150 skip_url (const char *url)
154 if (TOUPPER (url[0]) == 'U'
155 && TOUPPER (url[1]) == 'R'
156 && TOUPPER (url[2]) == 'L'
/* Skip any whitespace that follows the "URL:" prefix. */
160 for (i = 4; url[i] && ISSPACE (url[i]); i++);
169 - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
170 - @ and :, for user/password encoding.
171 - everything over 127 (but we don't bother with recording those. */
/* Populate unsafe_char_table for UNSAFE_CHAR: control characters and
   DEL/high-bit characters are visibly marked here; the RFC 1738
   special characters are tested on lines elided from this excerpt. */
173 init_unsafe_char_table (void)
176 for (i = 0; i < 256; i++)
177 if (i < 32 || i >= 127
193 unsafe_char_table[i] = 1;
196 /* Returns 1 if the string contains unsafe characters, 0 otherwise. */
/* NOTE(review): the loop over S and the return statements are on
   elided lines; only the per-character test is visible here. */
198 contains_unsafe (const char *s)
201 if (UNSAFE_CHAR (*s))
/* In-place %XY -> byte decoding of URL string S.  Decoding can only
   shrink the string, so it is safe to do destructively. */
206 /* Decodes the forms %xy in a URL to the character the hexadecimal
207 code of which is xy. xy are hexadecimal digits from
208 [0123456789ABCDEF] (case-insensitive). If x or y are not
209 hex-digits or `%' precedes `\0', the sequence is inserted
213 decode_string (char *s)
223 /* Do nothing if at the end of the string, or if the chars
224 are not hex-digits. */
225 if (!*(s + 1) || !*(s + 2)
226 || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
/* Combine the two hex digits into one byte at the write cursor P
   (P's declaration and advancement are on elided lines). */
231 *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
238 /* Encode the unsafe characters (as determined by URL_UNSAFE) in a
239 given string, returning a malloc-ed %XX encoded string. */
/* Two-pass algorithm: first pass counts the output length, second
   pass writes.  NOTE(review): the reset of S back to its start
   between the passes is on a line elided from this excerpt. */
241 encode_string (const char *s)
248 for (i = 0; *s; s++, i++)
249 if (UNSAFE_CHAR (*s))
250 i += 2; /* Two more characters (hex digits) */
251 res = (char *)xmalloc (i + 1);
253 for (p = res; *s; s++)
254 if (UNSAFE_CHAR (*s))
/* Emit '%' (on an elided line) followed by the two hex digits. */
256 const unsigned char c = *s;
258 *p++ = HEXD2ASC (c >> 4);
259 *p++ = HEXD2ASC (c & 0xf);
267 /* Returns the proto-type if URL's protocol is supported, or
268 URLUNKNOWN if not. */
/* First try an exact scheme-prefix match against sup_protos; failing
   that, heuristically treat "host:digits/" as HTTP (returns on the
   elided lines). */
270 urlproto (const char *url)
274 url += skip_url (url);
275 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
276 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
277 return sup_protos[i].ind;
/* Scan up to the first ':' or '/', then check whether what follows
   the colon is all digits (a port number). */
278 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
281 for (++i; url[i] && url[i] != '/'; i++)
282 if (!ISDIGIT (url[i]))
284 if (url[i - 1] == ':')
293 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
294 part is found, returns 0. */
/* NOTE(review): excerpted -- the length bookkeeping and the skipping
   of "//" after http:/ftp: are on elided lines. */
296 skip_proto (const char *url)
301 for (s = protostrings; *s; s++)
302 if (!strncasecmp (*s, url, strlen (*s)))
307 /* HTTP and FTP protocols are expected to yield exact host names
308 (i.e. the `//' part must be skipped, too). */
309 if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
314 /* Returns 1 if the URL begins with a protocol (supported or
315 unsupported), 0 otherwise. */
/* Case-insensitive prefix match against every known scheme in
   protostrings (return statements are on elided lines). */
317 has_proto (const char *url)
321 url += skip_url (url);
322 for (s = protostrings; *s; s++)
323 if (strncasecmp (url, *s, strlen (*s)) == 0)
328 /* Skip the username and password, if present here. The function
329 should be called *not* with the complete URL, but with the part
330 right after the protocol.
332 If no username and password are found, return 0. */
/* NOTE(review): P's declaration and both return statements are on
   elided lines; Q remembers the last '@' before the first '/'. */
334 skip_uname (const char *url)
337 const char *q = NULL;
338 for (p = url ; *p && *p != '/'; p++)
339 if (*p == '@') q = p;
340 /* If a `@' was found before the first occurrence of `/', skip
348 /* Allocate a new urlinfo structure, fill it with default values and
349 return a pointer to it. */
/* NOTE(review): the signature and return are on elided lines; the
   struct is zeroed and only proto gets a non-zero default. */
355 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
356 memset (u, 0, sizeof (*u));
357 u->proto = URLUNKNOWN;
361 /* Perform a "deep" free of the urlinfo structure. The structure
362 should have been created with newurl, but need not have been used.
363 If free_pointer is non-0, free the pointer itself. */
365 freeurl (struct urlinfo *u, int complete)
369 FREE_MAYBE (u->host);
370 FREE_MAYBE (u->path);
371 FREE_MAYBE (u->file);
373 FREE_MAYBE (u->user);
374 FREE_MAYBE (u->passwd);
375 FREE_MAYBE (u->local);
376 FREE_MAYBE (u->referer);
378 freeurl (u->proxy, 1);
384 /* Extract the given URL of the form
385 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
386 1. hostname (terminated with `/' or `:')
387 2. port number (terminated with `/'), or chosen for the protocol
388 3. dirname (everything after hostname)
389 Most errors are handled. No allocation is done, you must supply
390 pointers to allocated memory.
391 ...and a host of other stuff :-)
393 - Recognizes hostname:dir/file for FTP and
394 hostname (:portnum)?/dir/file for HTTP.
395 - Parses the path to yield directory and file
396 - Parses the URL to yield the username and passwd (if present)
397 - Decodes the strings, in case they contain "forbidden" characters
398 - Writes the result to struct urlinfo
400 If the argument STRICT is set, it recognizes only the canonical
403 parseurl (const char *url, struct urlinfo *u, int strict)
/* NOTE(review): this is an excerpt -- braces, several declarations
   and the error-return paths are on elided lines. */
406 int recognizable; /* Recognizable URL is the one where
407 the protocol name was explicitly
408 named, i.e. it wasn't deduced from
412 DEBUGP (("parseurl (\"%s\") -> ", url));
413 url += skip_url (url);
414 recognizable = has_proto (url);
415 if (strict && !recognizable)
/* Find which supported scheme, if any, prefixes the URL; L ends up
   as the length of the matched scheme prefix. */
417 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
419 l = strlen (sup_protos[i].name);
420 if (!strncasecmp (sup_protos[i].name, url, l))
423 /* If protocol is recognizable, but unsupported, bail out, else
425 if (recognizable && i == ARRAY_SIZE (sup_protos))
427 else if (i == ARRAY_SIZE (sup_protos))
430 u->proto = type = sup_protos[i].ind;
432 if (type == URLUNKNOWN)
434 /* Allow a username and password to be specified (i.e. just skip
437 l += skip_uname (url + l);
438 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
441 /* Get the hostname. */
442 u->host = strdupdelim (url + l, url + i);
443 DEBUGP (("host %s -> ", u->host));
445 /* Assume no port has been given. */
449 /* We have a colon delimiting the hostname. It could mean that
450 a port number is following it, or a directory. */
451 if (ISDIGIT (url[++i])) /* A port number */
453 if (type == URLUNKNOWN)
454 u->proto = type = URLHTTP;
455 for (; url[i] && url[i] != '/'; i++)
456 if (ISDIGIT (url[i]))
457 u->port = 10 * u->port + (url[i] - '0');
462 DEBUGP (("port %hu -> ", u->port));
464 else if (type == URLUNKNOWN) /* or a directory */
465 u->proto = type = URLFTP;
466 else /* or just a misformed port number */
469 else if (type == URLUNKNOWN)
470 u->proto = type = URLHTTP;
/* Look up the default port for the protocol we settled on. */
474 for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
475 if (sup_protos[ind].ind == type)
477 if (ind == ARRAY_SIZE (sup_protos))
479 u->port = sup_protos[ind].port;
481 /* Some delimiter troubles... */
482 if (url[i] == '/' && url[i - 1] != ':')
485 while (url[i] && url[i] == '/')
487 u->path = (char *)xmalloc (strlen (url + i) + 8);
488 strcpy (u->path, url + i);
491 u->ftp_type = process_ftp_type (u->path);
492 /* #### We don't handle type `d' correctly yet. */
493 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
495 DEBUGP (("ftp_type %c -> ", u->ftp_type));
497 DEBUGP (("opath %s -> ", u->path));
498 /* Parse the username and password (if existing). */
499 parse_uname (url, &u->user, &u->passwd);
500 /* Decode the strings, as per RFC 1738. */
501 decode_string (u->host);
502 decode_string (u->path);
504 decode_string (u->user);
506 decode_string (u->passwd);
507 /* Parse the directory. */
508 parse_dir (u->path, &u->dir, &u->file);
509 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
510 /* Simplify the directory. */
511 path_simplify (u->dir);
512 /* Remove the leading `/' in HTTP. */
513 if (type == URLHTTP && *u->dir == '/')
/* NOTE(review): strcpy with overlapping src/dst is undefined
   behavior per ISO C; memmove (u->dir, u->dir + 1, strlen(u->dir))
   would be the safe form. */
514 strcpy (u->dir, u->dir + 1);
515 DEBUGP (("ndir %s\n", u->dir));
516 /* Strip trailing `/'. */
518 if (l && u->dir[l - 1] == '/')
519 u->dir[l - 1] = '\0';
520 /* Re-create the path: */
521 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
522 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
523 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
/* Rebuild u->path from dir and file; "%2F" encodes a literal leading
   slash for absolute FTP paths. */
524 strcpy (u->path, abs_ftp ? "%2F" : "/");
525 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
526 strcat (u->path, *u->dir ? "/" : "");
527 strcat (u->path, u->file);
528 URL_CLEANSE (u->path);
529 DEBUGP (("newpath: %s\n", u->path));
530 /* Create the clean URL. */
531 u->url = str_url (u, 0);
/* Special versions of DOTP and DDOTP for parse_dir(): a path
   component may be terminated by `?' (the start of a query string)
   as well as by `\0'. */
#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
/* Bug fix: the second character must be tested at (x) + 1.  The
   original tested *(x) twice, so any component starting with `.'
   (e.g. ".foo") was misclassified as "..". */
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
                     && (!*((x) + 2) || *((x) + 2) == '?'))
541 /* Build the directory and filename components of the path. Both
542 components are *separately* malloc-ed strings! It does not change
543 the contents of path.
545 If the path ends with "." or "..", they are (correctly) counted as
548 parse_dir (const char *path, char **dir, char **file)
/* NOTE(review): excerpted -- braces and some comment closers are on
   elided lines.  Three cases: bare filename, "/filename", and a
   nonempty directory; a trailing "." or ".." component is folded
   into the directory part in each case. */
552 l = urlpath_length (path);
553 for (i = l; i && path[i] != '/'; i--);
555 if (!i && *path != '/') /* Just filename */
557 if (PD_DOTP (path) || PD_DDOTP (path))
559 *dir = strdupdelim (path, path + l);
560 *file = xstrdup (path + l); /* normally empty, but could
565 *dir = xstrdup (""); /* This is required because of FTP */
566 *file = xstrdup (path);
569 else if (!i) /* /filename */
571 if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
573 *dir = strdupdelim (path, path + l);
574 *file = xstrdup (path + l); /* normally empty, but could
579 *dir = xstrdup ("/");
580 *file = xstrdup (path + 1);
583 else /* Nonempty directory with or without a filename */
585 if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
587 *dir = strdupdelim (path, path + l);
588 *file = xstrdup (path + l); /* normally empty, but could
593 *dir = strdupdelim (path, path + i);
594 *file = xstrdup (path + i + 1);
599 /* Find the optional username and password within the URL, as per
600 RFC1738. The returned user and passwd char pointers are
603 parse_uname (const char *url, char **user, char **passwd)
/* NOTE(review): excerpted -- the initialization of *user/*passwd to
   NULL, the early return when no '@' exists, and several braces are
   on elided lines. */
606 const char *p, *q, *col;
611 url += skip_url (url);
612 /* Look for end of protocol string. */
613 l = skip_proto (url);
616 /* Add protocol offset. */
618 /* Is there an `@' character? */
619 for (p = url; *p && *p != '/'; p++)
622 /* If not, return. */
625 /* Else find the username and password. */
626 for (p = q = col = url; *p != '/'; p++)
/* The first ':' before '@' terminates the username. */
628 if (*p == ':' && !*user)
630 *user = (char *)xmalloc (p - url + 1);
631 memcpy (*user, url, p - url);
632 (*user)[p - url] = '\0';
635 if (*p == '@') q = p;
637 /* Decide whether you have only the username or both. */
638 where = *user ? passwd : user;
639 *where = (char *)xmalloc (q - col + 1);
640 memcpy (*where, col, q - col);
641 (*where)[q - col] = '\0';
645 /* If PATH ends with `;type=X', return the character X. */
/* Destructive: truncates PATH at the `;' when the suffix is found.
   NOTE(review): the `len >= 7' guard and the `return '\0'' fallback
   are on elided lines. */
647 process_ftp_type (char *path)
649 int len = strlen (path);
652 && !memcmp (path + len - 7, ";type=", 6))
654 path[len - 7] = '\0';
655 return path[len - 1];
661 /* Return the URL as fine-formed string, with a proper protocol, optional port
662 number, directory and optional user/password. If `hide' is non-zero (as it
663 is when we're calling this on a URL we plan to print, but not when calling it
664 to canonicalize a URL for use within the program), password will be hidden.
665 The forbidden characters in the URL will be cleansed. */
667 str_url (const struct urlinfo *u, int hide)
669 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
670 int i, l, ln, lu, lh, lp, lf, ld;
671 unsigned short proto_default_port;
/* NOTE(review): excerpted -- braces, some length computations and the
   separator/'@'/':' byte writes between the memcpy calls are on
   elided lines. */
673 /* Look for the protocol name. */
674 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
675 if (sup_protos[i].ind == u->proto)
677 if (i == ARRAY_SIZE (sup_protos))
679 proto_name = sup_protos[i].name;
680 proto_default_port = sup_protos[i].port;
681 host = CLEANDUP (u->host);
682 dir = CLEANDUP (u->dir);
683 file = CLEANDUP (u->file);
684 user = passwd = NULL;
686 user = CLEANDUP (u->user);
690 /* Don't output the password, or someone might see it over the user's
691 shoulder (or in saved wget output). Don't give away the number of
692 characters in the password, either, as we did in past versions of
693 this code, when we replaced the password characters with 'x's. */
694 passwd = xstrdup("<password>");
696 passwd = CLEANDUP (u->passwd);
/* A leading '/' in an FTP directory means an absolute path; it is
   re-encoded as "%2F" so the round-trip is unambiguous. */
698 if (u->proto == URLFTP && *dir == '/')
700 char *tmp = (char *)xmalloc (strlen (dir) + 3);
701 /*sprintf (tmp, "%%2F%s", dir + 1);*/
705 strcpy (tmp + 3, dir + 1);
710 ln = strlen (proto_name);
711 lu = user ? strlen (user) : 0;
712 lp = passwd ? strlen (passwd) : 0;
716 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
717 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
718 (user ? user : ""), (passwd ? ":" : ""),
719 (passwd ? passwd : ""), (user ? "@" : ""),
720 host, u->port, dir, *dir ? "/" : "", file); */
722 memcpy (res, proto_name, ln);
726 memcpy (res + l, user, lu);
731 memcpy (res + l, passwd, lp);
736 memcpy (res + l, host, lh);
/* Append ":port" only when it differs from the protocol default. */
738 if (u->port != proto_default_port)
741 long_to_string (res + l, (long)u->port);
742 l += numdigit (u->port);
745 memcpy (res + l, dir, ld);
749 strcpy (res + l, file);
758 /* Check whether two URL-s are equivalent, i.e. pointing to the same
759 location. Uses parseurl to parse them, and compares the canonical
762 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
763 return 0 on error. */
/* NOTE(review): the newurl() allocations, error checks and freeurl()
   cleanup around these calls are on elided lines. */
765 url_equal (const char *url1, const char *url2)
767 struct urlinfo *u1, *u2;
772 err = parseurl (url1, u1, 0);
779 err = parseurl (url2, u2, 0);
/* Compare the canonical string forms produced by parseurl. */
785 res = !strcmp (u1->url, u2->url);
/* Read FILE into memory and return a linked list of urlpos entries,
   one per non-blank line, with leading/trailing whitespace stripped.
   NOTE(review): excerpted -- the list head/tail bookkeeping, the
   handling of a missing final newline, and read_file_free are on
   elided lines. */
792 get_urls_file (const char *file)
794 struct file_memory *fm;
796 const char *text, *text_end;
799 fm = read_file (file);
802 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
805 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
808 text_end = fm->content + fm->length;
809 while (text < text_end)
811 const char *line_beg = text;
812 const char *line_end = memchr (text, '\n', text_end - text);
/* Trim whitespace from both ends of the line. */
818 while (line_beg < line_end
819 && ISSPACE (*line_beg))
821 while (line_end > line_beg + 1
822 && ISSPACE (*(line_end - 1)))
824 if (line_end > line_beg)
826 urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
827 memset (entry, 0, sizeof (*entry));
829 entry->url = strdupdelim (line_beg, line_end);
841 /* Free the linked list of urlpos. */
/* NOTE(review): the loop, the xfree of l->url and of l itself, and
   the advance to `next' are on elided lines. */
843 free_urlpos (urlpos *l)
847 urlpos *next = l->next;
849 FREE_MAYBE (l->local_name);
855 /* Rotate FNAME opt.backups times */
/* Shifts fname.1 ... fname.(backups-1) up by one, then renames FNAME
   itself to fname.1.  Buffer size: name + '.' + digits + NUL. */
857 rotate_backups(const char *fname)
859 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
860 char *from = (char *)alloca (maxlen);
861 char *to = (char *)alloca (maxlen);
/* Only rotate regular files (non-regular files are skipped on the
   elided return path). */
865 if (stat (fname, &sb) == 0)
866 if (S_ISREG (sb.st_mode) == 0)
869 for (i = opt.backups; i > 1; i--)
871 sprintf (from, "%s.%d", fname, i - 1);
872 sprintf (to, "%s.%d", fname, i);
873 /* #### This will fail on machines without the rename() system
878 sprintf (to, "%s.%d", fname, 1);
882 /* Create all the necessary directories for PATH (a file). Calls
883 mkdirhier() internally. */
/* NOTE(review): excerpted -- braces, the declarations of p/t/res/st,
   and several return statements are on elided lines. */
885 mkalldirs (const char *path)
/* Find the last '/' to split off the directory portion. */
892 p = path + strlen (path);
893 for (; *p != '/' && p != path; p--);
894 /* Don't create if it's just a file. */
895 if ((p == path) && (*p != '/'))
897 t = strdupdelim (path, p);
898 /* Check whether the directory exists. */
899 if ((stat (t, &st) == 0))
901 if (S_ISDIR (st.st_mode))
908 /* If the dir exists as a file name, remove it first. This
909 is *only* for Wget to work with buggy old CERN http
910 servers. Here is the scenario: When Wget tries to
911 retrieve a directory without a slash, e.g.
912 http://foo/bar (bar being a directory), CERN server will
913 not redirect it too http://foo/bar/ -- it will generate a
914 directory listing containing links to bar/file1,
915 bar/file2, etc. Wget will lose because it saves this
916 HTML listing to a file `bar', so it cannot create the
917 directory. To work around this, if the file of the same
918 name exists, we just remove it and create the directory
920 DEBUGP (("Removing %s because of directory danger!\n", t));
924 res = make_directory (t);
926 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Count the '/' characters in S.  NOTE(review): the body is entirely
   on lines elided from this excerpt. */
932 count_slashes (const char *s)
941 /* Return the path name of the URL-equivalent file name, with a
942 remote-like structure of directories. */
/* NOTE(review): excerpted -- braces, the opt.cut_dirs branch guard,
   and the frees at the end are on elided lines. */
944 mkstruct (const struct urlinfo *u)
946 char *host, *dir, *file, *res, *dirpref;
949 assert (u->dir != NULL);
950 assert (u->host NULL);
1021 /* Create a unique filename, corresponding to a given URL. Calls
1022 mkstruct if necessary. Does *not* actually create any directories. */
/* NOTE(review): excerpted -- braces, the opt.dirstruct branch
   structure, the '%'-replacement character, and the final returns
   are on elided lines. */
1024 url_filename (const struct urlinfo *u)
1027 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1031 file = mkstruct (u);
1037 file = xstrdup ("index.html");
1039 file = xstrdup (u->file);
1044 /* Check whether the prefix directory is something other than "."
1045 before prepending it. */
1046 if (!DOTP (opt.dir_prefix))
1048 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1049 + 1 + strlen (file) + 1)
1050 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1055 /* DOS-ish file systems don't like `%' signs in them; we change it
1060 for (p = file; *p; p++)
1064 #endif /* WINDOWS */
1066 /* Check the cases in which the unique extensions are not used:
1067 1) Clobbering is turned off (-nc).
1068 2) Retrieval with regetting.
1069 3) Timestamping is used.
1070 4) Hierarchy is built.
1072 The exception is the case when file does exist and is a
1073 directory (actually support for bad httpd-s). */
1074 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1075 && !(file_exists_p (file) && !file_non_directory_p (file)))
1078 /* Find a unique name. */
1079 name = unique_name (file);
1084 /* Like strlen(), but allow the URL to be ended with '?'. */
/* NOTE(review): the `if (q) return q - url;' branch is on elided
   lines; without a '?' the full strlen is returned. */
1086 urlpath_length (const char *url)
1088 const char *q = strchr (url, '?');
1091 return strlen (url);
1094 /* Find the last occurrence of character C in the range [b, e), or
1095 NULL, if none are present. This is almost completely equivalent to
1096 { *e = '\0'; return strrchr(b); }, except that it doesn't change
1097 the contents of the string. */
/* NOTE(review): the body is entirely on lines elided from this
   excerpt. */
1099 find_last_char (const char *b, const char *e, char c)
1107 /* Construct a URL by concatenating an absolute URL and a path, which
1108 may or may not be absolute. This tries to behave "reasonably" in
1109 all foreseeable cases. It employs little specific knowledge about
1110 protocols or URL-specific stuff -- it just works on strings. */
/* NOTE(review): excerpted -- the no_proto branch structure, several
   braces and a few comment closers are on elided lines.  Three
   cases: SUB relative (replace after last '/'), SUB absolute
   (replace after the host), SUB with its own protocol (take SUB
   as-is). */
1112 construct (const char *url, const char *sub, int subsize, int no_proto)
1118 const char *end = url + urlpath_length (url);
1122 /* SUB is a relative URL: we need to replace everything
1123 after last slash (possibly empty) with SUB.
1125 So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
1126 our result should be "whatever/foo/qux/xyzzy". */
1127 int need_explicit_slash = 0;
1129 const char *start_insert;
1130 const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */
1133 /* No slash found at all. Append SUB to what we have,
1134 but we'll need a slash as a separator.
1136 Example: if url == "foo" and sub == "qux/xyzzy", then
1137 we cannot just append sub to url, because we'd get
1138 "fooqux/xyzzy", whereas what we want is
1141 To make sure the / gets inserted, we set
1142 need_explicit_slash to 1. We also set start_insert
1143 to end + 1, so that the length calculations work out
1144 correctly for one more (slash) character. Accessing
1145 that character is fine, since it will be the
1146 delimiter, '\0' or '?'. */
1147 /* example: "foo?..." */
1148 /* ^ ('?' gets changed to '/') */
1149 start_insert = end + 1;
1150 need_explicit_slash = 1;
1152 else if (last_slash && last_slash != url && *(last_slash - 1) == '/')
1154 /* example: http://host" */
1156 start_insert = end + 1;
1157 need_explicit_slash = 1;
1161 /* example: "whatever/foo/bar" */
1163 start_insert = last_slash + 1;
1166 span = start_insert - url;
1167 constr = (char *)xmalloc (span + subsize + 1);
1169 memcpy (constr, url, span);
1170 if (need_explicit_slash)
1171 constr[span - 1] = '/';
1173 memcpy (constr + span, sub, subsize);
1174 constr[span + subsize] = '\0';
1176 else /* *sub == `/' */
1178 /* SUB is an absolute path: we need to replace everything
1179 after (and including) the FIRST slash with SUB.
1181 So, if URL is "http://host/whatever/foo/bar", and SUB is
1182 "/qux/xyzzy", our result should be
1183 "http://host/qux/xyzzy". */
1186 const char *start_insert = NULL; /* for gcc to shut up. */
1187 const char *pos = url;
1188 int seen_slash_slash = 0;
1189 /* We're looking for the first slash, but want to ignore
1192 slash = memchr (pos, '/', end - pos);
1193 if (slash && !seen_slash_slash)
1194 if (*(slash + 1) == '/')
1197 seen_slash_slash = 1;
1201 /* At this point, SLASH is the location of the first / after
1202 "//", or the first slash altogether. START_INSERT is the
1203 pointer to the location where SUB will be inserted. When
1204 examining the last two examples, keep in mind that SUB
1207 if (!slash && !seen_slash_slash)
1208 /* example: "foo" */
1211 else if (!slash && seen_slash_slash)
1212 /* example: "http://foo" */
1215 else if (slash && !seen_slash_slash)
1216 /* example: "foo/bar" */
1219 else if (slash && seen_slash_slash)
1220 /* example: "http://something/" */
1222 start_insert = slash;
1224 span = start_insert - url;
1225 constr = (char *)xmalloc (span + subsize + 1);
1227 memcpy (constr, url, span);
1229 memcpy (constr + span, sub, subsize);
1230 constr[span + subsize] = '\0';
1233 else /* !no_proto */
1235 constr = strdupdelim (sub, sub + subsize);
1240 /* Like the function above, but with a saner caller interface. */
/* Thin wrapper: NEW_URL is taken verbatim when it carries its own
   protocol, otherwise it is resolved relative to BASE_URL. */
1242 url_concat (const char *base_url, const char *new_url)
1244 return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));
1247 /* Optimize URL by host, destructively replacing u->host with realhost
1248 (u->host). Do this regardless of opt.simple_check. */
/* NOTE(review): the free of the old u->host/u->url and the
   reassignment of host are on elided lines. */
1250 opt_url (struct urlinfo *u)
1252 /* Find the "true" host. */
1253 char *host = realhost (u->host);
1256 assert (u->dir != NULL); /* the URL must have been parsed */
1257 /* Refresh the printed representation. */
1259 u->url = str_url (u, 0);
1262 /* This beautiful kludge is fortunately not needed, as I've made
1263 parse_dir do the (almost) right thing, so that a query can never
1264 become a part of directory. */
1266 /* Call path_simplify, but make sure that the part after the
1267 question-mark, if any, is not destroyed by path_simplify's
1270 path_simplify_with_kludge (char *path)
1272 char *query = strchr (path, '?');
1274 /* path_simplify also works destructively, so we also have the
1275 license to write. */
1277 path_simplify (path);
/* If simplification shortened the path, slide the preserved query
   string down to the new end. */
1280 char *newend = path + strlen (path);
1282 if (newend != query)
1283 memmove (newend, query, strlen (query) + 1);
1288 /* Returns proxy host address, in accordance with PROTO. */
/* Command-line/.wgetrc proxy settings take precedence over the
   corresponding environment variables.  HTTPS is only handled when
   built with SSL support. */
1290 getproxy (uerr_t proto)
1292 if (proto == URLHTTP)
1293 return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1294 else if (proto == URLFTP)
1295 return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1297 else if (proto == URLHTTPS)
1298 return opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1299 #endif /* HAVE_SSL */
1304 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns nonzero when HOST does NOT match any suffix in the
   NO_PROXY list, i.e. when the proxy should be used. */
1306 no_proxy_match (const char *host, const char **no_proxy)
1311 return !sufmatch (no_proxy, host);
1314 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1315 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1317 /* Change the links in an HTML document. Accepts a structure that
1318 defines the positions of all the links. */
/* NOTE(review): excerpted -- braces, early returns, the declaration
   of the read cursor P and the fclose at the end are on elided
   lines. */
1320 convert_links (const char *file, urlpos *l)
1322 struct file_memory *fm;
1325 downloaded_file_t downloaded_file_return;
1327 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1330 /* First we do a "dry run": go through the list L and see whether
1331 any URL needs to be converted in the first place. If not, just
1332 leave the file alone. */
1335 for (dry = l; dry; dry = dry->next)
1336 if (dry->convert != CO_NOCONVERT)
1340 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1345 fm = read_file (file);
1348 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1349 file, strerror (errno));
1353 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1354 if (opt.backup_converted && downloaded_file_return)
1355 write_backup_file (file, downloaded_file_return);
1357 /* Before opening the file for writing, unlink the file. This is
1358 important if the data in FM is mmaped. In such case, nulling the
1359 file, which is what fopen() below does, would make us read all
1360 zeroes from the mmaped region. */
1361 if (unlink (file) < 0 && errno != ENOENT)
1363 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1364 file, strerror (errno));
1365 read_file_free (fm);
1368 /* Now open the file for writing. */
1369 fp = fopen (file, "wb");
1372 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1373 file, strerror (errno));
1374 read_file_free (fm);
1377 /* Here we loop through all the URLs in file, replacing those of
1378 them that are downloaded with relative references. */
1380 for (; l; l = l->next)
1382 char *url_start = fm->content + l->pos;
1384 if (l->pos >= fm->length)
1386 DEBUGP (("Something strange is going on. Please investigate."));
1389 /* If the URL is not to be converted, skip it. */
1390 if (l->convert == CO_NOCONVERT)
1392 DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1396 /* Echo the file contents, up to the offending URL's opening
1397 quote, to the outfile. */
1398 fwrite (p, 1, url_start - p, fp);
1400 if (l->convert == CO_CONVERT_TO_RELATIVE)
1402 /* Convert absolute URL to relative. */
1403 char *newname = construct_relative (file, l->local_name);
1404 char *quoted_newname = html_quote_string (newname);
1405 replace_attr (&p, l->size, fp, quoted_newname);
1406 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1407 l->url, newname, l->pos, file));
1409 xfree (quoted_newname);
1411 else if (l->convert == CO_CONVERT_TO_COMPLETE)
1413 /* Convert the link to absolute URL. */
1414 char *newlink = l->url;
1415 char *quoted_newlink = html_quote_string (newlink);
1416 replace_attr (&p, l->size, fp, quoted_newlink);
1417 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1418 newlink, l->pos, file));
1419 xfree (quoted_newlink);
1422 /* Output the rest of the file. */
1423 if (p - fm->content < fm->length)
1424 fwrite (p, 1, fm->length - (p - fm->content), fp);
1426 read_file_free (fm);
1427 logputs (LOG_VERBOSE, _("done.\n"));
1430 /* Construct and return a malloced copy of the relative link from two
1431 pieces of information: local name S1 of the referring file and
1432 local name S2 of the referred file.
1434 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1435 "jagor.srce.hr/images/news.gif", the function will return
1438 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1439 "fly.cc.fer.hr/images/fly.gif", the function will return
1440 "../images/fly.gif".
1442 Caveats: S1 should not begin with `/', unless S2 also begins with
1443 '/'. S1 should not contain things like ".." and such --
1444 construct_relative ("fly/ioccc/../index.html",
1445 "fly/images/fly.gif") will fail. (A workaround is to call
1446 something like path_simplify() on S1). */
/* NOTE(review): excerpted -- the declarations of res, the absolute-S2
   shortcut guard and the common-prefix bookkeeping (cnt) are partly
   on elided lines. */
1448 construct_relative (const char *s1, const char *s2)
1450 int i, cnt, sepdirs1;
1454 return xstrdup (s2);
1455 /* S1 should *not* be absolute, if S2 wasn't. */
1456 assert (*s1 != '/');
1458 /* Skip the directories common to both strings. */
1461 while (s1[i] && s2[i]
1466 if (s1[i] == '/' && s2[i] == '/')
/* Count how many directory levels remain in S1; each becomes a
   "../" in the result. */
1471 for (sepdirs1 = 0; s1[i]; i++)
1474 /* Now, construct the file as of:
1475 - ../ repeated sepdirs1 time
1476 - all the non-mutual directories of S2. */
1477 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1478 for (i = 0; i < sepdirs1; i++)
1479 memcpy (res + 3 * i, "../", 3);
1480 strcpy (res + 3 * i, s2 + cnt);
1484 /* Add URL to the head of the list L. */
/* Allocates a zeroed urlpos node owning copies of URL and FILE;
   the link to L and the return of the new head are on elided lines. */
1486 add_url (urlpos *l, const char *url, const char *file)
1490 t = (urlpos *)xmalloc (sizeof (urlpos));
1491 memset (t, 0, sizeof (*t));
1492 t->url = xstrdup (url);
1493 t->local_name = xstrdup (file);
/* Save the original of FILE to a ".orig" backup before the
   link-converted version overwrites it.  DOWNLOADED_FILE_RETURN
   selects the backup name: when -E tacked ".html" onto the name,
   the trailing "html" is overwritten with "orig"; otherwise ".orig"
   is simply appended.  A static list of files already backed up
   this run prevents a second call from clobbering the real
   original with a first-pass conversion.  */
1499 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1501 /* Rather than just writing over the original .html file with the
1502 converted version, save the former to *.orig. Note we only do
1503 this for files we've _successfully_ downloaded, so we don't
1504 clobber .orig files sitting around from previous invocations. */
1506 /* Construct the backup filename as the original name plus ".orig". */
1507 size_t filename_len = strlen(file);
1508 char* filename_plus_orig_suffix;
1509 boolean already_wrote_backup_file = FALSE;
1510 slist* converted_file_ptr;
/* Static: the list persists across calls for the whole run. */
1511 static slist* converted_files = NULL;
1513 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1515 /* Just write "orig" over "html". We need to do it this way
1516 because when we're checking to see if we've downloaded the
1517 file before (to see if we can skip downloading it), we don't
1518 know if it's a text/html file. Therefore we don't know yet
1519 at that stage that -E is going to cause us to tack on
1520 ".html", so we need to compare vs. the original URL plus
1521 ".orig", not the original URL plus ".html.orig". */
/* "html" and "orig" are the same length, so the buffer needs only
   one extra byte for the terminating NUL.  alloca: the buffer lives
   only for the duration of this call.  */
1522 filename_plus_orig_suffix = alloca (filename_len + 1);
1523 strcpy(filename_plus_orig_suffix, file);
1524 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1526 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1528 /* Append ".orig" to the name. */
/* sizeof(".orig") == 6 already counts the terminating NUL. */
1529 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1530 strcpy(filename_plus_orig_suffix, file);
1531 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1534 /* We can get called twice on the same URL thanks to the
1535 convert_all_links() call in main(). If we write the .orig file
1536 each time in such a case, it'll end up containing the first-pass
1537 conversion, not the original file. So, see if we've already been
1538 called on this file. */
/* Linear scan of the files already backed up during this run. */
1539 converted_file_ptr = converted_files;
1540 while (converted_file_ptr != NULL)
1541 if (strcmp(converted_file_ptr->string, file) == 0)
1543 already_wrote_backup_file = TRUE;
1547 converted_file_ptr = converted_file_ptr->next;
1549 if (!already_wrote_backup_file)
1551 /* Rename <file> to <file>.orig before former gets written over. */
/* Failure to back up is reported but is not fatal. */
1552 if (rename(file, filename_plus_orig_suffix) != 0)
1553 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1554 file, filename_plus_orig_suffix, strerror (errno));
1556 /* Remember that we've already written a .orig backup for this file.
1557 Note that we never free this memory since we need it till the
1558 convert_all_links() call, which is one of the last things the
1559 program does before terminating. BTW, I'm not sure if it would be
1560 safe to just set 'converted_file_ptr->string' to 'file' below,
1561 rather than making a copy of the string... Another note is that I
1562 thought I could just add a field to the urlpos structure saying
1563 that we'd written a .orig file for this URL, but that didn't work,
1564 so I had to make this separate list.
1565 -- Dan Harkless <wget@harkless.org>
1567 This [adding a field to the urlpos structure] didn't work
1568 because convert_file() is called twice: once after all its
1569 sublinks have been retrieved in recursive_retrieve(), and
1570 once at the end of the day in convert_all_links(). The
1571 original linked list collected in recursive_retrieve() is
1572 lost after the first invocation of convert_links(), and
1573 convert_all_links() makes a new one (it calls get_urls_html()
1574 for each file it covers.) That's why your first approach didn't
1575 work. The way to make it work is perhaps to make this flag a
1576 field in the `urls_html' list.
1577 -- Hrvoje Niksic <hniksic@arsdigita.com>
/* Push the newly backed-up name onto the head of the list. */
1579 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1580 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1581 converted_file_ptr->next = converted_files;
1582 converted_files = converted_file_ptr;
/* Forward declaration; find_fragment is defined below. */
1586 static int find_fragment PARAMS ((const char *, int, const char **,
/* Write NEW_STR to FP as the replacement for the RAW_SIZE-byte
   attribute value at *PP, reproducing the original quoting style
   and preserving any "#fragment" found in the old value.
   NOTE(review): PP is in/out -- presumably the cursor is advanced
   past the old value; confirm against the full source.  */
1590 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1592 const char *p = *pp;
1594 int size = raw_size;
/* Quote character to emit; '\"' is the default. */
1595 char quote_char = '\"';
1596 const char *frag_beg, *frag_end;
1598 /* Structure of our string is:
1599 "...old-contents..."
1600 <--- l->size ---> (with quotes)
1603 <--- l->size --> (no quotes) */
/* Detect a quoted old value (either quote style). */
1605 if (*p == '\"' || *p == '\'')
1610 size -= 2; /* disregard opening and closing quote */
1612 putc (quote_char, fp);
1613 fputs (new_str, fp);
1615 /* Look for fragment identifier, if any. */
1616 if (find_fragment (p, size, &frag_beg, &frag_end))
/* Preserve the old "#fragment" part verbatim after the new link. */
1617 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1621 putc (quote_char, fp);
1625 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1626 preceded by '&'. If the character is not found, return zero. If
1627 the character is found, return 1 and set BP and EP to point to the
1628 beginning and end of the region.
1630 This is used for finding the fragment identifiers in URLs. */
/* See the contract in the comment block directly above. */
1633 find_fragment (const char *beg, int size, const char **bp, const char **ep)
/* One-past-the-end sentinel for the scan over [BEG, BEG+SIZE). */
1635 const char *end = beg + size;
1637 for (; beg < end; beg++)
/* Node in the registry of local files downloaded during this run
   (see downloaded_file() below for how it is consulted/grown). */
1659 typedef struct _downloaded_file_list {
/* How the file was downloaded (see downloaded_file_t). */
1661 downloaded_file_t download_type;
/* Next node; NULL terminates the list. */
1662 struct _downloaded_file_list* next;
1663 } downloaded_file_list;
/* Head of the registry; new entries are pushed at the front. */
1665 static downloaded_file_list *downloaded_files;
1667 /* Remembers which files have been downloaded. In the standard case, should be
1668 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1669 download successfully (i.e. not for ones we have failures on or that we skip
1672 When we've downloaded a file and tacked on a ".html" extension due to -E,
1673 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1674 FILE_DOWNLOADED_NORMALLY.
1676 If you just want to check if a file has been previously added without adding
1677 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1678 with local filenames, not remote URLs. */
1680 downloaded_file (downloaded_file_t mode, const char* file)
1682 boolean found_file = FALSE;
1683 downloaded_file_list* rover = downloaded_files;
/* Linear search of the registry for FILE. */
1685 while (rover != NULL)
1686 if (strcmp(rover->file, file) == 0)
1692 rover = rover->next;
1695 return rover->download_type; /* file had already been downloaded */
/* FILE not seen before: register it, unless we are only checking. */
1698 if (mode != CHECK_FOR_FILE)
1700 rover = xmalloc(sizeof(*rover));
1701 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1702 rover->download_type = mode;
/* Push the new entry at the head of the list. */
1703 rover->next = downloaded_files;
1704 downloaded_files = rover;
1707 return FILE_NOT_ALREADY_DOWNLOADED;
/* Release every node of the downloaded-files registry. */
1712 downloaded_files_free (void)
1714 downloaded_file_list* rover = downloaded_files;
/* Save the successor before freeing the current node. */
1717 downloaded_file_list *next = rover->next;
1718 xfree (rover->file);
1724 /* Initialization of static stuff. */
1728 init_unsafe_char_table ();