2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
45 /* Table of unsafe chars. This is initialized in
46 init_unsafe_char_table. */
48 static char unsafe_char_table[256];
50 #define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
52 /* rfc1738 reserved chars. This is too short to warrant a table. We
53 don't use this yet; preservation of reserved chars will be
54 implemented when I integrate the new `reencode_string' function. */
56 #define RESERVED_CHAR(c) ( (c) == ';' || (c) == '/' || (c) == '?' \
57 || (c) == '@' || (c) == '=' || (c) == '&' \
61 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
63 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
65 static int urlpath_length PARAMS ((const char *));
67 /* A NULL-terminated list of strings to be recognized as protocol
68 types (URL schemes). Note that recognized doesn't mean supported
69 -- only HTTP, HTTPS and FTP are currently supported.
71 However, a string that does not match anything in the list will be
72 considered a relative URL. Thus it's important that this list
73 contain anything anyone could think of as being legal.
75 #### This is probably broken. Wget should use other means to
76 distinguish between absolute and relative URIs in HTML links.
78 Take a look at <http://www.w3.org/pub/WWW/Addressing/schemes.html>
80 static char *protostrings[] =
122 /* Similar to the former, but for supported protocols: */
123 static struct proto sup_protos[] =
125 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
127 { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
129 { "ftp://", URLFTP, DEFAULT_FTP_PORT }
132 static void parse_dir PARAMS ((const char *, char **, char **));
133 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
134 static char *construct_relative PARAMS ((const char *, const char *));
135 static char process_ftp_type PARAMS ((char *));
140 - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
141 - @ and :, for user/password encoding.
142 - everything over 127 (but we don't bother with recording those). */
144 init_unsafe_char_table (void)
147 for (i = 0; i < 256; i++)
148 if (i < 32 || i >= 127
164 unsafe_char_table[i] = 1;
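/* Illustrative sketch (kept out of the build): once the table above has
   been initialized, UNSAFE_CHAR() is a plain lookup.  The characters
   tested below follow the set described in the comment above. */
#if 0
  init_unsafe_char_table ();
  assert (UNSAFE_CHAR ('~'));           /* in the rfc1738 set */
  assert (UNSAFE_CHAR ((char) 0x80));   /* everything over 127 */
  assert (!UNSAFE_CHAR ('a'));          /* ordinary ASCII letters are safe */
#endif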
167 /* Decodes the forms %xy in a URL to the character the hexadecimal
168 code of which is xy. xy are hexadecimal digits from
169 [0123456789ABCDEF] (case-insensitive). If x or y are not
170 hex-digits or `%' precedes `\0', the sequence is inserted literally. */
174 decode_string (char *s)
184 /* Do nothing if at the end of the string, or if the chars
185 are not hex-digits. */
186 if (!*(s + 1) || !*(s + 2)
187 || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
192 *p = (XCHAR_TO_XDIGIT (*(s + 1)) << 4) + XCHAR_TO_XDIGIT (*(s + 2));
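/* Illustrative sketch (kept out of the build): decode_string() rewrites
   its argument in place, per the rules above; the path below is made up.
   A malformed escape such as "%zz" or a trailing "%" is left untouched. */
#if 0
  char buf[] = "/pub/%7Euser/file%20name";
  decode_string (buf);            /* buf now holds "/pub/~user/file name" */
#endif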
199 /* Like encode_string, but return S if there are no unsafe chars. */
202 encode_string_maybe (const char *s)
209 for (p1 = s; *p1; p1++)
210 if (UNSAFE_CHAR (*p1))
211 addition += 2; /* Two more characters (hex digits) */
216 newlen = (p1 - s) + addition;
217 newstr = (char *)xmalloc (newlen + 1);
223 if (UNSAFE_CHAR (*p1))
225 const unsigned char c = *p1++;
227 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
228 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
234 assert (p2 - newstr == newlen);
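/* Illustrative sketch (kept out of the build): encode_string_maybe()
   returns its argument unchanged when nothing needs quoting and a fresh
   malloc-ed copy otherwise; the ENCODE macro below relies on that to
   know whether the old pointer must be freed.  '~' is unsafe per the
   set described above. */
#if 0
  const char *clean = "/index.html";
  assert (encode_string_maybe (clean) == clean);        /* nothing to quote */
  assert (!strcmp (encode_string_maybe ("~user"), "%7Euser"));
#endif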
239 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
240 given string, returning a malloc-ed %XX encoded string. */
243 encode_string (const char *s)
245 char *encoded = encode_string_maybe (s);
252 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
253 the old value of PTR is freed and PTR is made to point to the newly
254 allocated storage. */
256 #define ENCODE(ptr) do { \
257 char *e_new = encode_string_maybe (ptr); \
265 /* Returns the protocol type if URL's protocol is supported, or
266 URLUNKNOWN if not. */
268 urlproto (const char *url)
272 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
273 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
274 return sup_protos[i].ind;
275 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
278 for (++i; url[i] && url[i] != '/'; i++)
279 if (!ISDIGIT (url[i]))
281 if (url[i - 1] == ':')
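/* Illustrative sketch (kept out of the build): expected results of
   urlproto() on made-up URLs.  The first two follow the sup_protos
   prefix match; the third relies on the digit-scanning fallback above,
   which lets a bare host:port spelling default to HTTP. */
#if 0
  assert (urlproto ("http://www.gnu.org/") == URLHTTP);
  assert (urlproto ("ftp://ftp.gnu.org/pub/") == URLFTP);
  assert (urlproto ("www.gnu.org:8080/dir/") == URLHTTP);   /* numeric port */
#endif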
290 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
291 part is found, returns 0. */
293 skip_proto (const char *url)
298 for (s = protostrings; *s; s++)
299 if (!strncasecmp (*s, url, strlen (*s)))
304 /* HTTP and FTP protocols are expected to yield exact host names
305 (i.e. the `//' part must be skipped, too). */
306 if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
311 /* Returns 1 if the URL begins with a protocol (supported or
312 unsupported), 0 otherwise. */
314 has_proto (const char *url)
318 for (s = protostrings; *s; s++)
319 if (strncasecmp (url, *s, strlen (*s)) == 0)
324 /* Skip the username and password, if present here. The function
325 should be called *not* with the complete URL, but with the part
326 right after the protocol.
328 If no username and password are found, return 0. */
330 skip_uname (const char *url)
333 const char *q = NULL;
334 for (p = url ; *p && *p != '/'; p++)
335 if (*p == '@') q = p;
336 /* If a `@' was found before the first occurrence of `/', skip everything up to and including it. */
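/* Illustrative sketch (kept out of the build): skip_uname() is handed
   the part after the scheme and returns how many characters the
   user/password prefix occupies, or 0 when there is none.  The host and
   user names are made up. */
#if 0
  assert (skip_uname ("user:secret@host.example/dir/") == 12);
  assert (skip_uname ("host.example/mail@page.html") == 0);  /* '@' after '/' */
#endif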
344 /* Allocate a new urlinfo structure, fill it with default values and
345 return a pointer to it. */
351 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
352 memset (u, 0, sizeof (*u));
353 u->proto = URLUNKNOWN;
357 /* Perform a "deep" free of the urlinfo structure. The structure
358 should have been created with newurl, but need not have been used.
359 If COMPLETE is non-zero, free the pointer itself. */
361 freeurl (struct urlinfo *u, int complete)
365 FREE_MAYBE (u->host);
366 FREE_MAYBE (u->path);
367 FREE_MAYBE (u->file);
369 FREE_MAYBE (u->user);
370 FREE_MAYBE (u->passwd);
371 FREE_MAYBE (u->local);
372 FREE_MAYBE (u->referer);
374 freeurl (u->proxy, 1);
380 /* Extract the given URL of the form
381 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
382 1. hostname (terminated with `/' or `:')
383 2. port number (terminated with `/'), or chosen for the protocol
384 3. dirname (everything after hostname)
385 Most errors are handled. No allocation is done; you must supply
386 pointers to allocated memory.
387 ...and a host of other stuff :-)
389 - Recognizes hostname:dir/file for FTP and
390 hostname (:portnum)?/dir/file for HTTP.
391 - Parses the path to yield directory and file
392 - Parses the URL to yield the username and passwd (if present)
393 - Decodes the strings, in case they contain "forbidden" characters
394 - Writes the result to struct urlinfo
396 If the argument STRICT is set, it recognizes only the canonical form. */
399 parseurl (const char *url, struct urlinfo *u, int strict)
402 int recognizable; /* A recognizable URL is one where the
403 protocol name was explicitly named,
404 i.e. it wasn't deduced from the URL format. */
408 DEBUGP (("parseurl (\"%s\") -> ", url));
409 recognizable = has_proto (url);
410 if (strict && !recognizable)
412 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
414 l = strlen (sup_protos[i].name);
415 if (!strncasecmp (sup_protos[i].name, url, l))
418 /* If protocol is recognizable, but unsupported, bail out, else
420 if (recognizable && i == ARRAY_SIZE (sup_protos))
422 else if (i == ARRAY_SIZE (sup_protos))
425 u->proto = type = sup_protos[i].ind;
427 if (type == URLUNKNOWN)
429 /* Allow a username and password to be specified (i.e. just skip them for now). */
432 l += skip_uname (url + l);
433 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
436 /* Get the hostname. */
437 u->host = strdupdelim (url + l, url + i);
438 DEBUGP (("host %s -> ", u->host));
440 /* Assume no port has been given. */
444 /* We have a colon delimiting the hostname. It could mean that
445 a port number is following it, or a directory. */
446 if (ISDIGIT (url[++i])) /* A port number */
448 if (type == URLUNKNOWN)
449 u->proto = type = URLHTTP;
450 for (; url[i] && url[i] != '/'; i++)
451 if (ISDIGIT (url[i]))
452 u->port = 10 * u->port + (url[i] - '0');
457 DEBUGP (("port %hu -> ", u->port));
459 else if (type == URLUNKNOWN) /* or a directory */
460 u->proto = type = URLFTP;
461 else /* or just a malformed port number */
464 else if (type == URLUNKNOWN)
465 u->proto = type = URLHTTP;
469 for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
470 if (sup_protos[ind].ind == type)
472 if (ind == ARRAY_SIZE (sup_protos))
474 u->port = sup_protos[ind].port;
476 /* Some delimiter troubles... */
477 if (url[i] == '/' && url[i - 1] != ':')
480 while (url[i] && url[i] == '/')
482 u->path = (char *)xmalloc (strlen (url + i) + 8);
483 strcpy (u->path, url + i);
486 u->ftp_type = process_ftp_type (u->path);
487 /* #### We don't handle type `d' correctly yet. */
488 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
490 DEBUGP (("ftp_type %c -> ", u->ftp_type));
492 DEBUGP (("opath %s -> ", u->path));
493 /* Parse the username and password (if existing). */
494 parse_uname (url, &u->user, &u->passwd);
495 /* Decode the strings, as per RFC 1738. */
496 decode_string (u->host);
497 decode_string (u->path);
499 decode_string (u->user);
501 decode_string (u->passwd);
502 /* Parse the directory. */
503 parse_dir (u->path, &u->dir, &u->file);
504 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
505 /* Simplify the directory. */
506 path_simplify (u->dir);
507 /* Remove the leading `/' in HTTP. */
508 if (type == URLHTTP && *u->dir == '/')
509 memmove (u->dir, u->dir + 1, strlen (u->dir)); /* regions overlap, so not strcpy */
510 DEBUGP (("ndir %s\n", u->dir));
511 /* Strip trailing `/'. */
513 if (l > 1 && u->dir[l - 1] == '/')
514 u->dir[l - 1] = '\0';
515 /* Re-create the path: */
516 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
517 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
518 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
519 strcpy (u->path, abs_ftp ? "%2F" : "/");
520 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
521 strcat (u->path, *u->dir ? "/" : "");
522 strcat (u->path, u->file);
524 DEBUGP (("newpath: %s\n", u->path));
525 /* Create the clean URL. */
526 u->url = str_url (u, 0);
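/* Illustrative sketch (kept out of the build): how the urlinfo fields
   come out for a typical HTTP URL after the steps above.  The URL is
   made up; the field values follow the parsing and canonicalization
   performed by this function. */
#if 0
  struct urlinfo *u = newurl ();
  if (parseurl ("http://www.gnu.org/software/wget/index.html", u, 0) == URLOK)
    {
      /* u->proto == URLHTTP, u->port == DEFAULT_HTTP_PORT,
         u->host == "www.gnu.org",
         u->dir  == "software/wget"   (leading `/' stripped for HTTP),
         u->file == "index.html",
         u->url  == "http://www.gnu.org/software/wget/index.html" */
    }
  freeurl (u, 1);
#endif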
530 /* Special versions of DOTP and DDOTP for parse_dir(). They work like
531 DOTP and DDOTP, but they also recognize `?' as end-of-string
532 delimiter. This is needed for correct handling of query strings. */
535 #define PD_DOTP(x) ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
536 #define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
537 && (!*((x) + 2) || *((x) + 2) == '?'))
539 /* Build the directory and filename components of the path. Both
540 components are *separately* malloc-ed strings! It does not change
541 the contents of path.
543 If the path ends with "." or "..", they are (correctly) counted as directories. */
546 parse_dir (const char *path, char **dir, char **file)
550 l = urlpath_length (path);
551 for (i = l; i && path[i] != '/'; i--);
553 if (!i && *path != '/') /* Just filename */
555 if (PD_DOTP (path) || PD_DDOTP (path))
557 *dir = strdupdelim (path, path + l);
558 *file = xstrdup (path + l); /* normally empty, but could
563 *dir = xstrdup (""); /* This is required because of FTP */
564 *file = xstrdup (path);
567 else if (!i) /* /filename */
569 if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
571 *dir = strdupdelim (path, path + l);
572 *file = xstrdup (path + l); /* normally empty, but could
577 *dir = xstrdup ("/");
578 *file = xstrdup (path + 1);
581 else /* Nonempty directory with or without a filename */
583 if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
585 *dir = strdupdelim (path, path + l);
586 *file = xstrdup (path + l); /* normally empty, but could
591 *dir = strdupdelim (path, path + i);
592 *file = xstrdup (path + i + 1);
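/* Illustrative sketch (kept out of the build): the three branches above
   applied to made-up paths. */
#if 0
  char *d, *f;
  parse_dir ("/software/wget/index.html", &d, &f); /* d="/software/wget", f="index.html" */
  parse_dir ("/index.html", &d, &f);               /* d="/", f="index.html" */
  parse_dir ("index.html", &d, &f);                /* d="",  f="index.html" */
#endif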
597 /* Find the optional username and password within the URL, as per
598 RFC 1738. The returned user and passwd char pointers are malloc-ed. */
601 parse_uname (const char *url, char **user, char **passwd)
604 const char *p, *q, *col;
610 /* Look for the end of the protocol string. */
611 l = skip_proto (url);
614 /* Add protocol offset. */
616 /* Is there an `@' character? */
617 for (p = url; *p && *p != '/'; p++)
620 /* If not, return. */
623 /* Else find the username and password. */
624 for (p = q = col = url; *p && *p != '/'; p++)
626 if (*p == ':' && !*user)
628 *user = (char *)xmalloc (p - url + 1);
629 memcpy (*user, url, p - url);
630 (*user)[p - url] = '\0';
633 if (*p == '@') q = p;
635 /* Decide whether you have only the username or both. */
636 where = *user ? passwd : user;
637 *where = (char *)xmalloc (q - col + 1);
638 memcpy (*where, col, q - col);
639 (*where)[q - col] = '\0';
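/* Illustrative sketch (kept out of the build): user and password
   extraction on a made-up URL; both pointers stay NULL when the URL
   carries no user information. */
#if 0
  char *user, *passwd;
  parse_uname ("ftp://luzer:secret@ftp.example.com/pub/", &user, &passwd);
  /* user == "luzer", passwd == "secret", both malloc-ed. */
#endif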
643 /* If PATH ends with `;type=X', return the character X. */
645 process_ftp_type (char *path)
647 int len = strlen (path);
650 && !memcmp (path + len - 7, ";type=", 6))
652 path[len - 7] = '\0';
653 return path[len - 1];
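/* Illustrative sketch (kept out of the build): a ";type=X" suffix is
   chopped off the path and the type letter returned.  The path is made
   up. */
#if 0
  char path[] = "/pub/README;type=a";
  assert (process_ftp_type (path) == 'a');
  assert (!strcmp (path, "/pub/README"));
#endif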
659 /* Return the URL as a well-formed string, with a proper protocol, optional port
660 number, directory and optional user/password. If `hide' is non-zero (as it
661 is when we're calling this on a URL we plan to print, but not when calling it
662 to canonicalize a URL for use within the program), the password will be hidden.
663 The forbidden characters in the URL will be cleansed. */
665 str_url (const struct urlinfo *u, int hide)
667 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
668 int i, l, ln, lu, lh, lp, lf, ld;
669 unsigned short proto_default_port;
671 /* Look for the protocol name. */
672 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
673 if (sup_protos[i].ind == u->proto)
675 if (i == ARRAY_SIZE (sup_protos))
677 proto_name = sup_protos[i].name;
678 proto_default_port = sup_protos[i].port;
679 host = encode_string (u->host);
680 dir = encode_string (u->dir);
681 file = encode_string (u->file);
682 user = passwd = NULL;
684 user = encode_string (u->user);
688 /* Don't output the password, or someone might see it over the user's
689 shoulder (or in saved wget output). Don't give away the number of
690 characters in the password, either, as we did in past versions of
691 this code, when we replaced the password characters with 'x's. */
692 passwd = xstrdup("<password>");
694 passwd = encode_string (u->passwd);
696 if (u->proto == URLFTP && *dir == '/')
698 char *tmp = (char *)xmalloc (strlen (dir) + 3);
699 /*sprintf (tmp, "%%2F%s", dir + 1);*/
703 strcpy (tmp + 3, dir + 1);
708 ln = strlen (proto_name);
709 lu = user ? strlen (user) : 0;
710 lp = passwd ? strlen (passwd) : 0;
714 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
715 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
716 (user ? user : ""), (passwd ? ":" : ""),
717 (passwd ? passwd : ""), (user ? "@" : ""),
718 host, u->port, dir, *dir ? "/" : "", file); */
720 memcpy (res, proto_name, ln);
724 memcpy (res + l, user, lu);
729 memcpy (res + l, passwd, lp);
734 memcpy (res + l, host, lh);
736 if (u->port != proto_default_port)
739 long_to_string (res + l, (long)u->port);
740 l += numdigit (u->port);
743 memcpy (res + l, dir, ld);
747 strcpy (res + l, file);
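/* Illustrative sketch (kept out of the build): the same urlinfo printed
   with and without password hiding, assuming u was filled in by
   parseurl() for the made-up URL in the comment below. */
#if 0
  /* u parsed from "http://user:secret@www.example.com:8000/dir/page.html" */
  char *shown = str_url (u, 1); /* "http://user:<password>@www.example.com:8000/dir/page.html" */
  char *clean = str_url (u, 0); /* "http://user:secret@www.example.com:8000/dir/page.html" */
#endif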
756 /* Check whether two URL-s are equivalent, i.e. pointing to the same
757 location. Uses parseurl to parse them, and compares the canonical forms.
760 Returns 1 if URL1 is equivalent to URL2, 0 otherwise. Also
761 returns 0 on error. */
763 url_equal (const char *url1, const char *url2)
765 struct urlinfo *u1, *u2;
770 err = parseurl (url1, u1, 0);
777 err = parseurl (url2, u2, 0);
783 res = !strcmp (u1->url, u2->url);
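/* Illustrative sketch (kept out of the build): equivalence is decided on
   the canonical forms, so spellings that converge during decoding and
   re-encoding compare equal.  The URLs are made up. */
#if 0
  assert (url_equal ("http://www.gnu.org/%7Ejoe/", "http://www.gnu.org/~joe/"));
#endif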
790 get_urls_file (const char *file)
792 struct file_memory *fm;
794 const char *text, *text_end;
797 fm = read_file (file);
800 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
803 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
806 text_end = fm->content + fm->length;
807 while (text < text_end)
809 const char *line_beg = text;
810 const char *line_end = memchr (text, '\n', text_end - text);
816 while (line_beg < line_end
817 && ISSPACE (*line_beg))
819 while (line_end > line_beg + 1
820 && ISSPACE (*(line_end - 1)))
822 if (line_end > line_beg)
824 urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
825 memset (entry, 0, sizeof (*entry));
827 entry->url = strdupdelim (line_beg, line_end);
839 /* Free the linked list of urlpos. */
841 free_urlpos (urlpos *l)
845 urlpos *next = l->next;
847 FREE_MAYBE (l->local_name);
853 /* Rotate FNAME opt.backups times */
855 rotate_backups(const char *fname)
857 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
858 char *from = (char *)alloca (maxlen);
859 char *to = (char *)alloca (maxlen);
863 if (stat (fname, &sb) == 0)
864 if (S_ISREG (sb.st_mode) == 0)
867 for (i = opt.backups; i > 1; i--)
869 sprintf (from, "%s.%d", fname, i - 1);
870 sprintf (to, "%s.%d", fname, i);
871 /* #### This will fail on machines without the rename() system call. */
876 sprintf (to, "%s.%d", fname, 1);
880 /* Create all the necessary directories for PATH (a file). Calls
881 mkdirhier() internally. */
883 mkalldirs (const char *path)
890 p = path + strlen (path);
891 for (; *p != '/' && p != path; p--);
892 /* Don't create if it's just a file. */
893 if ((p == path) && (*p != '/'))
895 t = strdupdelim (path, p);
896 /* Check whether the directory exists. */
897 if ((stat (t, &st) == 0))
899 if (S_ISDIR (st.st_mode))
906 /* If the dir exists as a file name, remove it first. This
907 is *only* for Wget to work with buggy old CERN http
908 servers. Here is the scenario: When Wget tries to
909 retrieve a directory without a slash, e.g.
910 http://foo/bar (bar being a directory), CERN server will
911 not redirect it to http://foo/bar/ -- it will generate a
912 directory listing containing links to bar/file1,
913 bar/file2, etc. Wget will lose because it saves this
914 HTML listing to a file `bar', so it cannot create the
915 directory. To work around this, if the file of the same
916 name exists, we just remove it and create the directory anyway. */
918 DEBUGP (("Removing %s because of directory danger!\n", t));
922 res = make_directory (t);
924 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
930 count_slashes (const char *s)
939 /* Return the path name of the URL-equivalent file name, with a
940 remote-like structure of directories. */
942 mkstruct (const struct urlinfo *u)
944 char *host, *dir, *file, *res, *dirpref;
947 assert (u->dir != NULL);
948 assert (u->host != NULL);
952 char *ptr = u->dir + (*u->dir == '/');
953 int slash_count = 1 + count_slashes (ptr);
954 int cut = MINVAL (opt.cut_dirs, slash_count);
955 for (; cut && *ptr; ptr++)
958 STRDUP_ALLOCA (dir, ptr);
961 dir = u->dir + (*u->dir == '/');
963 host = xstrdup (u->host);
964 /* Check for the true name (or at least a consistent name for saving
965 to directory) of HOST, reusing the hlist if possible. */
966 if (opt.add_hostdir && !opt.simple_check)
968 char *nhost = realhost (host);
972 /* Add dir_prefix and hostname (if required) to the beginning of dir. */
976 if (!DOTP (opt.dir_prefix))
978 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
979 + strlen (host) + 1);
980 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
983 STRDUP_ALLOCA (dirpref, host);
985 else /* not add_hostdir */
987 if (!DOTP (opt.dir_prefix))
988 dirpref = opt.dir_prefix;
994 /* If there is a prefix, prepend it. */
997 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
998 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1001 dir = encode_string (dir);
1003 if (l && dir[l - 1] == '/')
1007 file = "index.html";
1011 /* Finally, construct the full name. */
1012 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1013 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
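/* Illustrative sketch (kept out of the build): the local name produced
   for a parsed URL, assuming default options (dir_prefix ".", no
   --cut-dirs) and opt.add_hostdir set.  The URL is made up. */
#if 0
  /* u parsed from "http://www.gnu.org/software/wget/" */
  char *local = mkstruct (u);   /* "www.gnu.org/software/wget/index.html" */
#endif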
1018 /* Create a unique filename, corresponding to a given URL. Calls
1019 mkstruct if necessary. Does *not* actually create any directories. */
1021 url_filename (const struct urlinfo *u)
1024 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1028 file = mkstruct (u);
1034 file = xstrdup ("index.html");
1036 file = xstrdup (u->file);
1041 /* Check whether the prefix directory is something other than "."
1042 before prepending it. */
1043 if (!DOTP (opt.dir_prefix))
1045 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1046 + 1 + strlen (file) + 1);
1047 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1052 /* DOS-ish file systems don't like `%' signs in file names; we change it
1057 for (p = file; *p; p++)
1061 #endif /* WINDOWS */
1063 /* Check the cases in which the unique extensions are not used:
1064 1) Clobbering is turned off (-nc).
1065 2) Retrieval with regetting.
1066 3) Timestamping is used.
1067 4) Hierarchy is built.
1069 The exception is the case when file does exist and is a
1070 directory (actually support for bad httpd-s). */
1071 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1072 && !(file_exists_p (file) && !file_non_directory_p (file)))
1075 /* Find a unique name. */
1076 name = unique_name (file);
1081 /* Like strlen(), but allows the URL to end with '?'. */
1083 urlpath_length (const char *url)
1085 const char *q = strchr (url, '?');
1088 return strlen (url);
1091 /* Find the last occurrence of character C in the range [b, e), or
1092 NULL, if none are present. This is almost completely equivalent to
1093 { *e = '\0'; return strrchr(b, c); }, except that it doesn't change
1094 the contents of the string. */
1096 find_last_char (const char *b, const char *e, char c)
1104 /* Resolve the result of "linking" a base URI (BASE) to a
1105 link-specified URI (LINK).
1107 Either of the URIs may be absolute or relative, complete with the
1108 host name, or path only. This tries to behave "reasonably" in all
1109 foreseeable cases. It employs little specific knowledge about
1110 protocols or URL-specific stuff -- it just works on strings.
1112 The parameter LINKLENGTH is useful if LINK is not zero-terminated.
1113 See uri_merge for a gentler interface to this functionality.
1115 #### This function should handle `./' and `../' so that the evil
1116 path_simplify can go. */
1118 uri_merge_1 (const char *base, const char *link, int linklength, int no_proto)
1124 const char *end = base + urlpath_length (base);
1128 /* LINK is a relative URL: we need to replace everything
1129 after the last slash (possibly empty) with LINK.
1131 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1132 our result should be "whatever/foo/qux/xyzzy". */
1133 int need_explicit_slash = 0;
1135 const char *start_insert;
1136 const char *last_slash = find_last_char (base, end, '/');
1139 /* No slash found at all. Append LINK to what we have,
1140 but we'll need a slash as a separator.
1142 Example: if base == "foo" and link == "qux/xyzzy", then
1143 we cannot just append link to base, because we'd get
1144 "fooqux/xyzzy", whereas what we want is
1147 To make sure the / gets inserted, we set
1148 need_explicit_slash to 1. We also set start_insert
1149 to end + 1, so that the length calculations work out
1150 correctly for one more (slash) character. Accessing
1151 that character is fine, since it will be the
1152 delimiter, '\0' or '?'. */
1153 /* example: "foo?..." */
1154 /* ^ ('?' gets changed to '/') */
1155 start_insert = end + 1;
1156 need_explicit_slash = 1;
1158 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1160 /* example: http://host" */
1162 start_insert = end + 1;
1163 need_explicit_slash = 1;
1167 /* example: "whatever/foo/bar" */
1169 start_insert = last_slash + 1;
1172 span = start_insert - base;
1173 constr = (char *)xmalloc (span + linklength + 1);
1175 memcpy (constr, base, span);
1176 if (need_explicit_slash)
1177 constr[span - 1] = '/';
1179 memcpy (constr + span, link, linklength);
1180 constr[span + linklength] = '\0';
1182 else /* *link == `/' */
1184 /* LINK is an absolute path: we need to replace everything
1185 after (and including) the FIRST slash with LINK.
1187 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1188 "/qux/xyzzy", our result should be
1189 "http://host/qux/xyzzy". */
1192 const char *start_insert = NULL; /* for gcc to shut up. */
1193 const char *pos = base;
1194 int seen_slash_slash = 0;
1195 /* We're looking for the first slash, but want to ignore double slashes. */
1198 slash = memchr (pos, '/', end - pos);
1199 if (slash && !seen_slash_slash)
1200 if (*(slash + 1) == '/')
1203 seen_slash_slash = 1;
1207 /* At this point, SLASH is the location of the first / after
1208 "//", or the first slash altogether. START_INSERT is the
1209 pointer to the location where LINK will be inserted. When
1210 examining the last two examples, keep in mind that LINK begins with `/'. */
1213 if (!slash && !seen_slash_slash)
1214 /* example: "foo" */
1216 start_insert = base;
1217 else if (!slash && seen_slash_slash)
1218 /* example: "http://foo" */
1221 else if (slash && !seen_slash_slash)
1222 /* example: "foo/bar" */
1224 start_insert = base;
1225 else if (slash && seen_slash_slash)
1226 /* example: "http://something/" */
1228 start_insert = slash;
1230 span = start_insert - base;
1231 constr = (char *)xmalloc (span + linklength + 1);
1233 memcpy (constr, base, span);
1235 memcpy (constr + span, link, linklength);
1236 constr[span + linklength] = '\0';
1239 else /* !no_proto */
1241 constr = strdupdelim (link, link + linklength);
1246 /* Merge BASE with LINK and return the resulting URI. This is an
1247 interface to uri_merge_1 that assumes that LINK is a
1248 zero-terminated string. */
1250 uri_merge (const char *base, const char *link)
1252 return uri_merge_1 (base, link, strlen (link), !has_proto (link));
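/* Illustrative sketch (kept out of the build): typical merges following
   the relative/absolute cases spelled out in uri_merge_1().  The URLs
   are made up. */
#if 0
  uri_merge ("http://host/dir/page.html", "pic.gif");
      /* -> "http://host/dir/pic.gif" */
  uri_merge ("http://host/dir/page.html", "/top.html");
      /* -> "http://host/top.html" */
  uri_merge ("http://host/dir/page.html", "ftp://elsewhere/file");
      /* -> "ftp://elsewhere/file" (LINK already carries a scheme) */
#endif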
1255 /* Optimize URL by host, destructively replacing u->host with realhost
1256 (u->host). Do this regardless of opt.simple_check. */
1258 opt_url (struct urlinfo *u)
1260 /* Find the "true" host. */
1261 char *host = realhost (u->host);
1264 assert (u->dir != NULL); /* the URL must have been parsed */
1265 /* Refresh the printed representation. */
1267 u->url = str_url (u, 0);
1270 /* Returns proxy host address, in accordance with PROTO. */
1272 getproxy (uerr_t proto)
1274 if (proto == URLHTTP)
1275 return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1276 else if (proto == URLFTP)
1277 return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1279 else if (proto == URLHTTPS)
1280 return opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1281 #endif /* HAVE_SSL */
1286 /* Should a host be accessed through a proxy, according to no_proxy? */
1288 no_proxy_match (const char *host, const char **no_proxy)
1293 return !sufmatch (no_proxy, host);
1296 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1297 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1299 /* Change the links in an HTML document. Accepts a structure that
1300 defines the positions of all the links. */
1302 convert_links (const char *file, urlpos *l)
1304 struct file_memory *fm;
1307 downloaded_file_t downloaded_file_return;
1309 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1312 /* First we do a "dry run": go through the list L and see whether
1313 any URL needs to be converted in the first place. If not, just
1314 leave the file alone. */
1317 for (dry = l; dry; dry = dry->next)
1318 if (dry->convert != CO_NOCONVERT)
1322 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1327 fm = read_file (file);
1330 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1331 file, strerror (errno));
1335 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1336 if (opt.backup_converted && downloaded_file_return)
1337 write_backup_file (file, downloaded_file_return);
1339 /* Before opening the file for writing, unlink the file. This is
1340 important if the data in FM is mmaped. In that case, truncating the
1341 file, which is what fopen() below does, would make us read all
1342 zeroes from the mmaped region. */
1343 if (unlink (file) < 0 && errno != ENOENT)
1345 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1346 file, strerror (errno));
1347 read_file_free (fm);
1350 /* Now open the file for writing. */
1351 fp = fopen (file, "wb");
1354 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1355 file, strerror (errno));
1356 read_file_free (fm);
1359 /* Here we loop through all the URLs in file, replacing those of
1360 them that are downloaded with relative references. */
1362 for (; l; l = l->next)
1364 char *url_start = fm->content + l->pos;
1366 if (l->pos >= fm->length)
1368 DEBUGP (("Something strange is going on. Please investigate."));
1371 /* If the URL is not to be converted, skip it. */
1372 if (l->convert == CO_NOCONVERT)
1374 DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1378 /* Echo the file contents, up to the offending URL's opening
1379 quote, to the outfile. */
1380 fwrite (p, 1, url_start - p, fp);
1382 if (l->convert == CO_CONVERT_TO_RELATIVE)
1384 /* Convert absolute URL to relative. */
1385 char *newname = construct_relative (file, l->local_name);
1386 char *quoted_newname = html_quote_string (newname);
1387 replace_attr (&p, l->size, fp, quoted_newname);
1388 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1389 l->url, newname, l->pos, file));
1391 xfree (quoted_newname);
1393 else if (l->convert == CO_CONVERT_TO_COMPLETE)
1395 /* Convert the link to absolute URL. */
1396 char *newlink = l->url;
1397 char *quoted_newlink = html_quote_string (newlink);
1398 replace_attr (&p, l->size, fp, quoted_newlink);
1399 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1400 newlink, l->pos, file));
1401 xfree (quoted_newlink);
1404 /* Output the rest of the file. */
1405 if (p - fm->content < fm->length)
1406 fwrite (p, 1, fm->length - (p - fm->content), fp);
1408 read_file_free (fm);
1409 logputs (LOG_VERBOSE, _("done.\n"));
1412 /* Construct and return a malloced copy of the relative link from two
1413 pieces of information: local name S1 of the referring file and
1414 local name S2 of the referred file.
1416 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1417 "jagor.srce.hr/images/news.gif", the function will return
1420 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1421 "fly.cc.fer.hr/images/fly.gif", the function will return
1422 "../images/fly.gif".
1424 Caveats: S1 should not begin with `/', unless S2 also begins with
1425 '/'. S1 should not contain things like ".." and such --
1426 construct_relative ("fly/ioccc/../index.html",
1427 "fly/images/fly.gif") will fail. (A workaround is to call
1428 something like path_simplify() on S1). */
1430 construct_relative (const char *s1, const char *s2)
1432 int i, cnt, sepdirs1;
1436 return xstrdup (s2);
1437 /* S1 should *not* be absolute, if S2 wasn't. */
1438 assert (*s1 != '/');
1440 /* Skip the directories common to both strings. */
1443 while (s1[i] && s2[i]
1448 if (s1[i] == '/' && s2[i] == '/')
1453 for (sepdirs1 = 0; s1[i]; i++)
1456 /* Now, construct the result as:
1457 - "../" repeated sepdirs1 times
1458 - all the directories of S2 that the two don't share. */
1459 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1460 for (i = 0; i < sepdirs1; i++)
1461 memcpy (res + 3 * i, "../", 3);
1462 strcpy (res + 3 * i, s2 + cnt);
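/* Illustrative sketch (kept out of the build): the two cases from the
   comment above construct_relative(). */
#if 0
  construct_relative ("jagor.srce.hr/index.html",
                      "jagor.srce.hr/images/news.gif");   /* "images/news.gif" */
  construct_relative ("fly.cc.fer.hr/ioccc/index.html",
                      "fly.cc.fer.hr/images/fly.gif");    /* "../images/fly.gif" */
#endif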
1466 /* Add URL to the head of the list L. */
1468 add_url (urlpos *l, const char *url, const char *file)
1472 t = (urlpos *)xmalloc (sizeof (urlpos));
1473 memset (t, 0, sizeof (*t));
1474 t->url = xstrdup (url);
1475 t->local_name = xstrdup (file);
1481 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1483 /* Rather than just writing over the original .html file with the
1484 converted version, save the former to *.orig. Note we only do
1485 this for files we've _successfully_ downloaded, so we don't
1486 clobber .orig files sitting around from previous invocations. */
1488 /* Construct the backup filename as the original name plus ".orig". */
1489 size_t filename_len = strlen(file);
1490 char* filename_plus_orig_suffix;
1491 boolean already_wrote_backup_file = FALSE;
1492 slist* converted_file_ptr;
1493 static slist* converted_files = NULL;
1495 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1497 /* Just write "orig" over "html". We need to do it this way
1498 because when we're checking to see if we've downloaded the
1499 file before (to see if we can skip downloading it), we don't
1500 know if it's a text/html file. Therefore we don't know yet
1501 at that stage that -E is going to cause us to tack on
1502 ".html", so we need to compare vs. the original URL plus
1503 ".orig", not the original URL plus ".html.orig". */
1504 filename_plus_orig_suffix = alloca (filename_len + 1);
1505 strcpy(filename_plus_orig_suffix, file);
1506 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1508 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1510 /* Append ".orig" to the name. */
1511 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1512 strcpy(filename_plus_orig_suffix, file);
1513 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1516 /* We can get called twice on the same URL thanks to the
1517 convert_all_links() call in main(). If we write the .orig file
1518 each time in such a case, it'll end up containing the first-pass
1519 conversion, not the original file. So, see if we've already been
1520 called on this file. */
1521 converted_file_ptr = converted_files;
1522 while (converted_file_ptr != NULL)
1523 if (strcmp(converted_file_ptr->string, file) == 0)
1525 already_wrote_backup_file = TRUE;
1529 converted_file_ptr = converted_file_ptr->next;
1531 if (!already_wrote_backup_file)
1533 /* Rename <file> to <file>.orig before the former gets written over. */
1534 if (rename(file, filename_plus_orig_suffix) != 0)
1535 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1536 file, filename_plus_orig_suffix, strerror (errno));
1538 /* Remember that we've already written a .orig backup for this file.
1539 Note that we never free this memory since we need it till the
1540 convert_all_links() call, which is one of the last things the
1541 program does before terminating. BTW, I'm not sure if it would be
1542 safe to just set 'converted_file_ptr->string' to 'file' below,
1543 rather than making a copy of the string... Another note is that I
1544 thought I could just add a field to the urlpos structure saying
1545 that we'd written a .orig file for this URL, but that didn't work,
1546 so I had to make this separate list.
1547 -- Dan Harkless <wget@harkless.org>
1549 This [adding a field to the urlpos structure] didn't work
1550 because convert_file() is called twice: once after all its
1551 sublinks have been retrieved in recursive_retrieve(), and
1552 once at the end of the day in convert_all_links(). The
1553 original linked list collected in recursive_retrieve() is
1554 lost after the first invocation of convert_links(), and
1555 convert_all_links() makes a new one (it calls get_urls_html()
1556 for each file it covers.) That's why your first approach didn't
1557 work. The way to make it work is perhaps to make this flag a
1558 field in the `urls_html' list.
1559 -- Hrvoje Niksic <hniksic@arsdigita.com>
1561 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1562 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1563 converted_file_ptr->next = converted_files;
1564 converted_files = converted_file_ptr;
1568 static int find_fragment PARAMS ((const char *, int, const char **,
1572 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1574 const char *p = *pp;
1576 int size = raw_size;
1577 char quote_char = '\"';
1578 const char *frag_beg, *frag_end;
1580 /* Structure of our string is:
1581 "...old-contents..."
1582 <--- l->size ---> (with quotes)
1585 <--- l->size --> (no quotes) */
1587 if (*p == '\"' || *p == '\'')
1592 size -= 2; /* disregard opening and closing quote */
1594 putc (quote_char, fp);
1595 fputs (new_str, fp);
1597 /* Look for fragment identifier, if any. */
1598 if (find_fragment (p, size, &frag_beg, &frag_end))
1599 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1603 putc (quote_char, fp);
1607 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1608 preceded by '&'. If the character is not found, return zero. If
1609 the character is found, return 1 and set BP and EP to point to the
1610 beginning and end of the region.
1612 This is used for finding the fragment identifiers in URLs. */
1615 find_fragment (const char *beg, int size, const char **bp, const char **ep)
1617 const char *end = beg + size;
1619 for (; beg < end; beg++)
1641 typedef struct _downloaded_file_list {
1643 downloaded_file_t download_type;
1644 struct _downloaded_file_list* next;
1645 } downloaded_file_list;
1647 static downloaded_file_list *downloaded_files;
1649 /* Remembers which files have been downloaded. In the standard case, should be
1650 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1651 download successfully (i.e. not for ones we have failures on or that we skip
1654 When we've downloaded a file and tacked on a ".html" extension due to -E,
1655 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1656 FILE_DOWNLOADED_NORMALLY.
1658 If you just want to check if a file has been previously added without adding
1659 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1660 with local filenames, not remote URLs. */
1662 downloaded_file (downloaded_file_t mode, const char* file)
1664 boolean found_file = FALSE;
1665 downloaded_file_list* rover = downloaded_files;
1667 while (rover != NULL)
1668 if (strcmp(rover->file, file) == 0)
1674 rover = rover->next;
1677 return rover->download_type; /* file had already been downloaded */
1680 if (mode != CHECK_FOR_FILE)
1682 rover = xmalloc(sizeof(*rover));
1683 rover->file = xstrdup(file); /* use xstrdup() so we die on out-of-mem. */
1684 rover->download_type = mode;
1685 rover->next = downloaded_files;
1686 downloaded_files = rover;
1689 return FILE_NOT_ALREADY_DOWNLOADED;
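/* Illustrative sketch (kept out of the build): typical call pattern, per
   the comment above; the file name is made up. */
#if 0
  downloaded_file (FILE_DOWNLOADED_NORMALLY, "index.html");      /* record it */
  if (downloaded_file (CHECK_FOR_FILE, "index.html")
      != FILE_NOT_ALREADY_DOWNLOADED)
    /* seen before: the mode recorded at download time is returned */ ;
#endif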
1694 downloaded_files_free (void)
1696 downloaded_file_list* rover = downloaded_files;
1699 downloaded_file_list *next = rover->next;
1700 xfree (rover->file);
1706 /* Initialization of static stuff. */
1710 init_unsafe_char_table ();