2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
45 /* Table of Unsafe chars. This is initialized in
46 init_unsafe_char_table. */
48 static char unsafe_char_table[256];
50 #define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
52 /* rfc1738 reserved chars. This is too short to warrant a table. We
53 don't use this yet; preservation of reserved chars will be
54 implemented when I integrate the new `reencode_string'
56 #define RESERVED_CHAR(c) ( (c) == ';' || (c) == '/' || (c) == '?' \
57 || (c) == '@' || (c) == '=' || (c) == '&' \
61 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
63 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
65 static int urlpath_length PARAMS ((const char *));
67 /* A NULL-terminated list of strings to be recognized as protocol
68 types (URL schemes). Note that recognized doesn't mean supported
69 -- only HTTP, HTTPS and FTP are currently supported.
71 However, a string that does not match anything in the list will be
72 considered a relative URL. Thus it's important that this list
73 contains anything anyone could think of as being legal.
75 #### This is probably broken. Wget should use other means to
76 distinguish between absolute and relative URIs in HTML links.
78 Take a look at <http://www.w3.org/pub/WWW/Addressing/schemes.html>
80 static char *protostrings[] =
122 /* Similar to former, but for supported protocols: */
123 static struct proto sup_protos[] =
125 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
127 { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
129 { "ftp://", URLFTP, DEFAULT_FTP_PORT }
132 static void parse_dir PARAMS ((const char *, char **, char **));
133 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
134 static char *construct PARAMS ((const char *, const char *, int , int));
135 static char *construct_relative PARAMS ((const char *, const char *));
136 static char process_ftp_type PARAMS ((char *));
141 - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
142 - @ and :, for user/password encoding.
143 - everything over 127 (but we don't bother with recording those). */
145 init_unsafe_char_table (void)
148 for (i = 0; i < 256; i++)
149 if (i < 32 || i >= 127
165 unsafe_char_table[i] = 1;
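/* Illustration (not part of the original file): UNSAFE_CHAR is a plain
   table lookup, so it is cheap to call once per character.  The
   expected results below assume that the elided middle of
   init_unsafe_char_table also marks the rfc1738 characters listed in
   the comment above, in addition to the control range and 128-255.  */
#if 0  /* sketch only; never compiled */
static void
example_unsafe_char (void)
{
  init_unsafe_char_table ();
  assert (UNSAFE_CHAR ('<'));     /* in the rfc1738 set quoted above */
  assert (UNSAFE_CHAR ('\n'));    /* control character: i < 32 */
  assert (UNSAFE_CHAR ('\xe9'));  /* everything over 127 */
  assert (!UNSAFE_CHAR ('a'));    /* ordinary characters are left alone */
}
#endif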
168 /* Decodes the forms %xy in a URL to the character the hexadecimal
169 code of which is xy. xy are hexadecimal digits from
170 [0123456789ABCDEF] (case-insensitive). If x or y are not
171 hex-digits or `%' precedes `\0', the sequence is inserted
175 decode_string (char *s)
185 /* Do nothing if at the end of the string, or if the chars
186 are not hex-digits. */
187 if (!*(s + 1) || !*(s + 2)
188 || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
193 *p = (XCHAR_TO_XDIGIT (*(s + 1)) << 4) + XCHAR_TO_XDIGIT (*(s + 2));
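/* Illustration (not part of the original file): decode_string rewrites
   its argument in place, so it must be called on a writable buffer.  */
#if 0  /* sketch only; never compiled */
static void
example_decode_string (void)
{
  char buf[] = "/pub/%7Ehniksic/hello%20world";
  decode_string (buf);
  /* buf now reads "/pub/~hniksic/hello world".  A malformed escape
     such as "%4" at the end of the string would be left literally.  */
}
#endif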
200 /* Like encode_string, but return S if there are no unsafe chars. */
203 encode_string_maybe (const char *s)
210 for (p1 = s; *p1; p1++)
211 if (UNSAFE_CHAR (*p1))
212 addition += 2; /* Two more characters (hex digits) */
217 newlen = (p1 - s) + addition;
218 newstr = (char *)xmalloc (newlen + 1);
224 if (UNSAFE_CHAR (*p1))
226 const unsigned char c = *p1++;
228 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
229 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
235 assert (p2 - newstr == newlen);
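/* Illustration (not part of the original file): encode_string_maybe
   avoids a copy in the common case of an already-clean string; only
   the encoded result is malloc-ed.  The expectations assume '<' and
   '>' are in the unsafe table, per the rfc1738 list above, and that
   the hex digits are emitted in upper case (%XX).  */
#if 0  /* sketch only; never compiled */
static void
example_encode_string_maybe (void)
{
  const char *clean = "http://www.gnu.org/software/wget/";
  const char *dirty = "index<old>.html";
  char *a = encode_string_maybe (clean);
  char *b = encode_string_maybe (dirty);
  assert (a == clean);                          /* same pointer returned */
  assert (!strcmp (b, "index%3Cold%3E.html"));  /* '<' -> %3C, '>' -> %3E */
  xfree (b);
}
#endif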
240 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
241 given string, returning a malloc-ed %XX encoded string. */
244 encode_string (const char *s)
246 char *encoded = encode_string_maybe (s);
253 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
254 the old value of PTR is freed and PTR is made to point to the newly
255 allocated storage. */
257 #define ENCODE(ptr) do { \
258 char *e_new = encode_string_maybe (ptr); \
266 /* Returns the protocol type if URL's protocol is supported, or
267 URLUNKNOWN if not. */
269 urlproto (const char *url)
273 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
274 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
275 return sup_protos[i].ind;
276 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
279 for (++i; url[i] && url[i] != '/'; i++)
280 if (!ISDIGIT (url[i]))
282 if (url[i - 1] == ':')
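/* Illustration (not part of the original file): expected return values.
   The prefixed forms follow directly from the sup_protos loop above;
   the bare host:port heuristic depends on the elided tail of this
   function, so it is stated as an expectation only.  */
#if 0  /* sketch only; never compiled */
static void
example_urlproto (void)
{
  assert (urlproto ("http://www.gnu.org/") == URLHTTP);
  assert (urlproto ("ftp://ftp.gnu.org/pub/gnu/") == URLFTP);
  /* "www.gnu.org:8080/dir/" should be treated as HTTP (numeric port),
     while "ftp.gnu.org:/pub/" (colon right before the slash) as FTP.  */
}
#endif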
291 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
292 part is found, returns 0. */
294 skip_proto (const char *url)
299 for (s = protostrings; *s; s++)
300 if (!strncasecmp (*s, url, strlen (*s)))
305 /* HTTP and FTP protocols are expected to yield exact host names
306 (i.e. the `//' part must be skipped, too). */
307 if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
312 /* Returns 1 if the URL begins with a protocol (supported or
313 unsupported), 0 otherwise. */
315 has_proto (const char *url)
319 for (s = protostrings; *s; s++)
320 if (strncasecmp (url, *s, strlen (*s)) == 0)
325 /* Skip the username and password, if present here. The function
326 should be called *not* with the complete URL, but with the part
327 right after the protocol.
329 If no username and password are found, return 0. */
331 skip_uname (const char *url)
334 const char *q = NULL;
335 for (p = url ; *p && *p != '/'; p++)
336 if (*p == '@') q = p;
337 /* If a `@' was found before the first occurrence of `/', skip
345 /* Allocate a new urlinfo structure, fill it with default values and
346 return a pointer to it. */
352 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
353 memset (u, 0, sizeof (*u));
354 u->proto = URLUNKNOWN;
358 /* Perform a "deep" free of the urlinfo structure. The structure
359 should have been created with newurl, but need not have been used.
360 If COMPLETE is non-0, free the pointer itself. */
362 freeurl (struct urlinfo *u, int complete)
366 FREE_MAYBE (u->host);
367 FREE_MAYBE (u->path);
368 FREE_MAYBE (u->file);
370 FREE_MAYBE (u->user);
371 FREE_MAYBE (u->passwd);
372 FREE_MAYBE (u->local);
373 FREE_MAYBE (u->referer);
375 freeurl (u->proxy, 1);
381 /* Extract the given URL of the form
382 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
383 1. hostname (terminated with `/' or `:')
384 2. port number (terminated with `/'), or chosen for the protocol
385 3. dirname (everything after hostname)
386 Most errors are handled. No allocation is done, you must supply
387 pointers to allocated memory.
388 ...and a host of other stuff :-)
390 - Recognizes hostname:dir/file for FTP and
391 hostname (:portnum)?/dir/file for HTTP.
392 - Parses the path to yield directory and file
393 - Parses the URL to yield the username and passwd (if present)
394 - Decodes the strings, in case they contain "forbidden" characters
395 - Writes the result to struct urlinfo
397 If the argument STRICT is set, it recognizes only the canonical form. */
400 parseurl (const char *url, struct urlinfo *u, int strict)
403 int recognizable; /* Recognizable URL is the one where
404 the protocol name was explicitly
405 named, i.e. it wasn't deduced from
409 DEBUGP (("parseurl (\"%s\") -> ", url));
410 recognizable = has_proto (url);
411 if (strict && !recognizable)
413 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
415 l = strlen (sup_protos[i].name);
416 if (!strncasecmp (sup_protos[i].name, url, l))
419 /* If protocol is recognizable, but unsupported, bail out, else
421 if (recognizable && i == ARRAY_SIZE (sup_protos))
423 else if (i == ARRAY_SIZE (sup_protos))
426 u->proto = type = sup_protos[i].ind;
428 if (type == URLUNKNOWN)
430 /* Allow a username and password to be specified (i.e. just skip
433 l += skip_uname (url + l);
434 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
437 /* Get the hostname. */
438 u->host = strdupdelim (url + l, url + i);
439 DEBUGP (("host %s -> ", u->host));
441 /* Assume no port has been given. */
445 /* We have a colon delimiting the hostname. It could mean that
446 a port number is following it, or a directory. */
447 if (ISDIGIT (url[++i])) /* A port number */
449 if (type == URLUNKNOWN)
450 u->proto = type = URLHTTP;
451 for (; url[i] && url[i] != '/'; i++)
452 if (ISDIGIT (url[i]))
453 u->port = 10 * u->port + (url[i] - '0');
458 DEBUGP (("port %hu -> ", u->port));
460 else if (type == URLUNKNOWN) /* or a directory */
461 u->proto = type = URLFTP;
462 else /* or just a misformed port number */
465 else if (type == URLUNKNOWN)
466 u->proto = type = URLHTTP;
470 for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
471 if (sup_protos[ind].ind == type)
473 if (ind == ARRAY_SIZE (sup_protos))
475 u->port = sup_protos[ind].port;
477 /* Some delimiter troubles... */
478 if (url[i] == '/' && url[i - 1] != ':')
481 while (url[i] && url[i] == '/')
483 u->path = (char *)xmalloc (strlen (url + i) + 8);
484 strcpy (u->path, url + i);
487 u->ftp_type = process_ftp_type (u->path);
488 /* #### We don't handle type `d' correctly yet. */
489 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
491 DEBUGP (("ftp_type %c -> ", u->ftp_type));
493 DEBUGP (("opath %s -> ", u->path));
494 /* Parse the username and password (if existing). */
495 parse_uname (url, &u->user, &u->passwd);
496 /* Decode the strings, as per RFC 1738. */
497 decode_string (u->host);
498 decode_string (u->path);
500 decode_string (u->user);
502 decode_string (u->passwd);
503 /* Parse the directory. */
504 parse_dir (u->path, &u->dir, &u->file);
505 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
506 /* Simplify the directory. */
507 path_simplify (u->dir);
508 /* Remove the leading `/' in HTTP. */
509 if (type == URLHTTP && *u->dir == '/')
510 memmove (u->dir, u->dir + 1, strlen (u->dir)); /* regions overlap; strcpy would be undefined */
511 DEBUGP (("ndir %s\n", u->dir));
512 /* Strip trailing `/'. */
514 if (l > 1 && u->dir[l - 1] == '/')
515 u->dir[l - 1] = '\0';
516 /* Re-create the path: */
517 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
518 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
519 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
520 strcpy (u->path, abs_ftp ? "%2F" : "/");
521 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
522 strcat (u->path, *u->dir ? "/" : "");
523 strcat (u->path, u->file);
525 DEBUGP (("newpath: %s\n", u->path));
526 /* Create the clean URL. */
527 u->url = str_url (u, 0);
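/* Illustration (not part of the original file): what a successful parse
   of an HTTP URL with an explicit port is expected to leave in the
   urlinfo structure, given the steps above (the leading '/' is
   stripped from DIR for HTTP, and FILE is split off by parse_dir).  */
#if 0  /* sketch only; never compiled */
static void
example_parseurl (void)
{
  struct urlinfo *u = newurl ();
  parseurl ("http://www.gnu.org:8080/software/wget/index.html", u, 0);
  /* Assuming the parse succeeds, the expected contents are:
       u->proto == URLHTTP          u->host == "www.gnu.org"
       u->port  == 8080             u->dir  == "software/wget"
       u->file  == "index.html"     u->path == "/software/wget/index.html"
     and u->url holds the cleaned-up printable form.  */
  freeurl (u, 1);
}
#endif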
531 /* Special versions of DOTP and DDOTP for parse_dir(). They work like
532 DOTP and DDOTP, but they also recognize `?' as end-of-string
533 delimiter. This is needed for correct handling of query
536 #define PD_DOTP(x) ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
537 #define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
538 && (!*((x) + 2) || *((x) + 2) == '?'))
540 /* Build the directory and filename components of the path. Both
541 components are *separately* malloc-ed strings! It does not change
542 the contents of path.
544 If the path ends with "." or "..", they are (correctly) counted as
547 parse_dir (const char *path, char **dir, char **file)
551 l = urlpath_length (path);
552 for (i = l; i && path[i] != '/'; i--);
554 if (!i && *path != '/') /* Just filename */
556 if (PD_DOTP (path) || PD_DDOTP (path))
558 *dir = strdupdelim (path, path + l);
559 *file = xstrdup (path + l); /* normally empty, but could
564 *dir = xstrdup (""); /* This is required because of FTP */
565 *file = xstrdup (path);
568 else if (!i) /* /filename */
570 if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
572 *dir = strdupdelim (path, path + l);
573 *file = xstrdup (path + l); /* normally empty, but could
578 *dir = xstrdup ("/");
579 *file = xstrdup (path + 1);
582 else /* Nonempty directory with or without a filename */
584 if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
586 *dir = strdupdelim (path, path + l);
587 *file = xstrdup (path + l); /* normally empty, but could
592 *dir = strdupdelim (path, path + i);
593 *file = xstrdup (path + i + 1);
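/* Illustration (not part of the original file): how the three branches
   above split a few representative paths.  DIR and FILE are separately
   malloc-ed; frees are omitted for brevity.  */
#if 0  /* sketch only; never compiled */
static void
example_parse_dir (void)
{
  char *dir, *file;
  parse_dir ("/software/wget/index.html", &dir, &file);
  /* dir == "/software/wget", file == "index.html" */
  parse_dir ("/index.html", &dir, &file);
  /* dir == "/", file == "index.html" */
  parse_dir ("index.html", &dir, &file);
  /* dir == "", file == "index.html" */
  parse_dir ("/software/..", &dir, &file);
  /* ".." counts as a directory: dir == "/software/..", file == "" */
}
#endif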
598 /* Find the optional username and password within the URL, as per
599 RFC1738. The returned user and passwd char pointers are
602 parse_uname (const char *url, char **user, char **passwd)
605 const char *p, *q, *col;
611 /* Look for the end of the protocol string. */
612 l = skip_proto (url);
615 /* Add protocol offset. */
617 /* Is there an `@' character? */
618 for (p = url; *p && *p != '/'; p++)
621 /* If not, return. */
624 /* Else find the username and password. */
625 for (p = q = col = url; *p && *p != '/'; p++)
627 if (*p == ':' && !*user)
629 *user = (char *)xmalloc (p - url + 1);
630 memcpy (*user, url, p - url);
631 (*user)[p - url] = '\0';
634 if (*p == '@') q = p;
636 /* Decide whether you have only the username or both. */
637 where = *user ? passwd : user;
638 *where = (char *)xmalloc (q - col + 1);
639 memcpy (*where, col, q - col);
640 (*where)[q - col] = '\0';
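/* Illustration (not part of the original file): expected results for
   the two usual forms.  USER and PASSWD are only filled in when an `@'
   appears before the first '/' of the path.  */
#if 0  /* sketch only; never compiled */
static void
example_parse_uname (void)
{
  char *user, *passwd;
  parse_uname ("ftp://hniksic:secret@ftp.gnu.org/pub/", &user, &passwd);
  /* user == "hniksic", passwd == "secret" */
  parse_uname ("ftp://hniksic@ftp.gnu.org/pub/", &user, &passwd);
  /* user == "hniksic"; passwd is expected to stay NULL, assuming the
     elided prologue initializes both pointers to NULL.  */
}
#endif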
644 /* If PATH ends with `;type=X', return the character X. */
646 process_ftp_type (char *path)
648 int len = strlen (path);
651 && !memcmp (path + len - 7, ";type=", 6))
653 path[len - 7] = '\0';
654 return path[len - 1];
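/* Illustration (not part of the original file): the ";type=X" suffix is
   both reported and stripped, so the caller is left with a clean path.  */
#if 0  /* sketch only; never compiled */
static void
example_process_ftp_type (void)
{
  char path[] = "/pub/gnu/wget.tar.gz;type=i";
  char type = process_ftp_type (path);
  /* type == 'i' (binary transfer requested); path has been truncated
     in place to "/pub/gnu/wget.tar.gz".  */
}
#endif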
660 /* Return the URL as a well-formed string, with a proper protocol, optional port
661 number, directory and optional user/password. If `hide' is non-zero (as it
662 is when we're calling this on a URL we plan to print, but not when calling it
663 to canonicalize a URL for use within the program), password will be hidden.
664 The forbidden characters in the URL will be cleansed. */
666 str_url (const struct urlinfo *u, int hide)
668 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
669 int i, l, ln, lu, lh, lp, lf, ld;
670 unsigned short proto_default_port;
672 /* Look for the protocol name. */
673 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
674 if (sup_protos[i].ind == u->proto)
676 if (i == ARRAY_SIZE (sup_protos))
678 proto_name = sup_protos[i].name;
679 proto_default_port = sup_protos[i].port;
680 host = encode_string (u->host);
681 dir = encode_string (u->dir);
682 file = encode_string (u->file);
683 user = passwd = NULL;
685 user = encode_string (u->user);
689 /* Don't output the password, or someone might see it over the user's
690 shoulder (or in saved wget output). Don't give away the number of
691 characters in the password, either, as we did in past versions of
692 this code, when we replaced the password characters with 'x's. */
693 passwd = xstrdup("<password>");
695 passwd = encode_string (u->passwd);
697 if (u->proto == URLFTP && *dir == '/')
699 char *tmp = (char *)xmalloc (strlen (dir) + 3);
700 /*sprintf (tmp, "%%2F%s", dir + 1);*/
704 strcpy (tmp + 3, dir + 1);
709 ln = strlen (proto_name);
710 lu = user ? strlen (user) : 0;
711 lp = passwd ? strlen (passwd) : 0;
715 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
716 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
717 (user ? user : ""), (passwd ? ":" : ""),
718 (passwd ? passwd : ""), (user ? "@" : ""),
719 host, u->port, dir, *dir ? "/" : "", file); */
721 memcpy (res, proto_name, ln);
725 memcpy (res + l, user, lu);
730 memcpy (res + l, passwd, lp);
735 memcpy (res + l, host, lh);
737 if (u->port != proto_default_port)
740 long_to_string (res + l, (long)u->port);
741 l += numdigit (u->port);
744 memcpy (res + l, dir, ld);
748 strcpy (res + l, file);
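/* Illustration (not part of the original file): round-tripping a parsed
   URL through str_url.  The default port is omitted, and with HIDE
   non-zero the password is printed as "<password>".  */
#if 0  /* sketch only; never compiled */
static void
example_str_url (void)
{
  struct urlinfo *u = newurl ();
  parseurl ("ftp://hniksic:secret@ftp.gnu.org/pub/gnu/", u, 0);
  /* Assuming the parse succeeds: */
  {
    char *shown = str_url (u, 1);   /* for printing: hide the password */
    char *clean = str_url (u, 0);   /* canonical form: password intact */
    /* shown is expected to read
         "ftp://hniksic:<password>@ftp.gnu.org/pub/gnu/"
       while clean keeps "secret"; the default FTP port is omitted.  */
    xfree (shown);
    xfree (clean);
  }
  freeurl (u, 1);
}
#endif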
757 /* Check whether two URL-s are equivalent, i.e. pointing to the same
758 location. Uses parseurl to parse them, and compares the canonical
761 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
762 return 0 on error. */
764 url_equal (const char *url1, const char *url2)
766 struct urlinfo *u1, *u2;
771 err = parseurl (url1, u1, 0);
778 err = parseurl (url2, u2, 0);
784 res = !strcmp (u1->url, u2->url);
791 get_urls_file (const char *file)
793 struct file_memory *fm;
795 const char *text, *text_end;
798 fm = read_file (file);
801 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
804 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
807 text_end = fm->content + fm->length;
808 while (text < text_end)
810 const char *line_beg = text;
811 const char *line_end = memchr (text, '\n', text_end - text);
817 while (line_beg < line_end
818 && ISSPACE (*line_beg))
820 while (line_end > line_beg + 1
821 && ISSPACE (*(line_end - 1)))
823 if (line_end > line_beg)
825 urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
826 memset (entry, 0, sizeof (*entry));
828 entry->url = strdupdelim (line_beg, line_end);
840 /* Free the linked list of urlpos. */
842 free_urlpos (urlpos *l)
846 urlpos *next = l->next;
848 FREE_MAYBE (l->local_name);
854 /* Rotate FNAME opt.backups times */
856 rotate_backups(const char *fname)
858 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
859 char *from = (char *)alloca (maxlen);
860 char *to = (char *)alloca (maxlen);
864 if (stat (fname, &sb) == 0)
865 if (S_ISREG (sb.st_mode) == 0)
868 for (i = opt.backups; i > 1; i--)
870 sprintf (from, "%s.%d", fname, i - 1);
871 sprintf (to, "%s.%d", fname, i);
872 /* #### This will fail on machines without the rename() system
877 sprintf (to, "%s.%d", fname, 1);
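/* Illustration (not part of the original file): with opt.backups == 3,
   one rotation of "wget.log" performs, in order,
       wget.log.2 -> wget.log.3
       wget.log.1 -> wget.log.2
       wget.log   -> wget.log.1
   so the most recent backup always carries the ".1" suffix.  */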
881 /* Create all the necessary directories for PATH (a file). Calls
882 make_directory() internally. */
884 mkalldirs (const char *path)
891 p = path + strlen (path);
892 for (; *p != '/' && p != path; p--);
893 /* Don't create if it's just a file. */
894 if ((p == path) && (*p != '/'))
896 t = strdupdelim (path, p);
897 /* Check whether the directory exists. */
898 if ((stat (t, &st) == 0))
900 if (S_ISDIR (st.st_mode))
907 /* If the dir exists as a file name, remove it first. This
908 is *only* for Wget to work with buggy old CERN http
909 servers. Here is the scenario: When Wget tries to
910 retrieve a directory without a slash, e.g.
911 http://foo/bar (bar being a directory), CERN server will
912 not redirect it to http://foo/bar/ -- it will generate a
913 directory listing containing links to bar/file1,
914 bar/file2, etc. Wget will lose because it saves this
915 HTML listing to a file `bar', so it cannot create the
916 directory. To work around this, if the file of the same
917 name exists, we just remove it and create the directory
919 DEBUGP (("Removing %s because of directory danger!\n", t));
923 res = make_directory (t);
925 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
931 count_slashes (const char *s)
940 /* Return the path name of the URL-equivalent file name, with a
941 remote-like structure of directories. */
943 mkstruct (const struct urlinfo *u)
945 char *host, *dir, *file, *res, *dirpref;
948 assert (u->dir != NULL);
949 assert (u->host != NULL);
953 char *ptr = u->dir + (*u->dir == '/');
954 int slash_count = 1 + count_slashes (ptr);
955 int cut = MINVAL (opt.cut_dirs, slash_count);
956 for (; cut && *ptr; ptr++)
959 STRDUP_ALLOCA (dir, ptr);
962 dir = u->dir + (*u->dir == '/');
964 host = xstrdup (u->host);
965 /* Check for the true name (or at least a consistent name for saving
966 to directory) of HOST, reusing the hlist if possible. */
967 if (opt.add_hostdir && !opt.simple_check)
969 char *nhost = realhost (host);
973 /* Add dir_prefix and hostname (if required) to the beginning of
977 if (!DOTP (opt.dir_prefix))
979 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
980 + strlen (host) + 1);
981 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
984 STRDUP_ALLOCA (dirpref, host);
986 else /* not add_hostdir */
988 if (!DOTP (opt.dir_prefix))
989 dirpref = opt.dir_prefix;
995 /* If there is a prefix, prepend it. */
998 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
999 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1002 dir = encode_string (dir);
1004 if (l && dir[l - 1] == '/')
1008 file = "index.html";
1012 /* Finally, construct the full name. */
1013 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1014 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
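/* Illustration (not part of the original file): expected local names
   for http://www.gnu.org/software/wget/ under a default configuration
   (opt.dir_prefix == ".", host directories enabled, opt.cut_dirs == 0):
       www.gnu.org/software/wget/index.html
   With --cut-dirs=1 the first remote directory is dropped:
       www.gnu.org/wget/index.html
   and with host directories disabled (-nH):
       software/wget/index.html
   The empty file name at the end of the URL becomes "index.html", as
   the code above shows.  */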
1019 /* Create a unique filename, corresponding to a given URL. Calls
1020 mkstruct if necessary. Does *not* actually create any directories. */
1022 url_filename (const struct urlinfo *u)
1025 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1029 file = mkstruct (u);
1035 file = xstrdup ("index.html");
1037 file = xstrdup (u->file);
1042 /* Check whether the prefix directory is something other than "."
1043 before prepending it. */
1044 if (!DOTP (opt.dir_prefix))
1046 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1047 + 1 + strlen (file) + 1);
1048 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1053 /* DOS-ish file systems don't like `%' signs in them; we change it
1058 for (p = file; *p; p++)
1062 #endif /* WINDOWS */
1064 /* Check the cases in which the unique extensions are not used:
1065 1) Clobbering is turned off (-nc).
1066 2) Retrieval with regetting.
1067 3) Timestamping is used.
1068 4) Hierarchy is built.
1070 The exception is the case when file does exist and is a
1071 directory (actually support for bad httpd-s). */
1072 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1073 && !(file_exists_p (file) && !file_non_directory_p (file)))
1076 /* Find a unique name. */
1077 name = unique_name (file);
1082 /* Like strlen(), but allow the URL to be ended with '?'. */
1084 urlpath_length (const char *url)
1086 const char *q = strchr (url, '?');
1089 return strlen (url);
1092 /* Find the last occurrence of character C in the range [b, e), or
1093 NULL, if none are present. This is almost completely equivalent to
1094 { *e = '\0'; return strrchr(b, c); }, except that it doesn't change
1095 the contents of the string. */
1097 find_last_char (const char *b, const char *e, char c)
1105 /* Resolve the result of "linking" a base URI (BASE) to a
1106 link-specified URI (LINK).
1108 Either of the URIs may be absolute or relative, complete with the
1109 host name, or path only. This tries to behave "reasonably" in all
1110 foreseeable cases. It employs little specific knowledge about
1111 protocols or URL-specific stuff -- it just works on strings.
1113 The parameter LINKLENGTH is useful if LINK is not zero-terminated.
1114 See uri_merge for a gentler interface to this functionality.
1116 #### This function should handle `./' and `../' so that the evil
1117 path_simplify can go. */
1119 uri_merge_1 (const char *base, const char *link, int linklength, int no_proto)
1125 const char *end = base + urlpath_length (base);
1129 /* LINK is a relative URL: we need to replace everything
1130 after last slash (possibly empty) with LINK.
1132 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1133 our result should be "whatever/foo/qux/xyzzy". */
1134 int need_explicit_slash = 0;
1136 const char *start_insert;
1137 const char *last_slash = find_last_char (base, end, '/');
1140 /* No slash found at all. Append LINK to what we have,
1141 but we'll need a slash as a separator.
1143 Example: if base == "foo" and link == "qux/xyzzy", then
1144 we cannot just append link to base, because we'd get
1145 "fooqux/xyzzy", whereas what we want is
1148 To make sure the / gets inserted, we set
1149 need_explicit_slash to 1. We also set start_insert
1150 to end + 1, so that the length calculations work out
1151 correctly for one more (slash) character. Accessing
1152 that character is fine, since it will be the
1153 delimiter, '\0' or '?'. */
1154 /* example: "foo?..." */
1155 /* ^ ('?' gets changed to '/') */
1156 start_insert = end + 1;
1157 need_explicit_slash = 1;
1159 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1161 /* example: http://host" */
1163 start_insert = end + 1;
1164 need_explicit_slash = 1;
1168 /* example: "whatever/foo/bar" */
1170 start_insert = last_slash + 1;
1173 span = start_insert - base;
1174 constr = (char *)xmalloc (span + linklength + 1);
1176 memcpy (constr, base, span);
1177 if (need_explicit_slash)
1178 constr[span - 1] = '/';
1180 memcpy (constr + span, link, linklength);
1181 constr[span + linklength] = '\0';
1183 else /* *link == `/' */
1185 /* LINK is an absolute path: we need to replace everything
1186 after (and including) the FIRST slash with LINK.
1188 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1189 "/qux/xyzzy", our result should be
1190 "http://host/qux/xyzzy". */
1193 const char *start_insert = NULL; /* for gcc to shut up. */
1194 const char *pos = base;
1195 int seen_slash_slash = 0;
1196 /* We're looking for the first slash, but want to ignore
1199 slash = memchr (pos, '/', end - pos);
1200 if (slash && !seen_slash_slash)
1201 if (*(slash + 1) == '/')
1204 seen_slash_slash = 1;
1208 /* At this point, SLASH is the location of the first / after
1209 "//", or the first slash altogether. START_INSERT is the
1210 pointer to the location where LINK will be inserted. When
1211 examining the last two examples, keep in mind that LINK
1214 if (!slash && !seen_slash_slash)
1215 /* example: "foo" */
1217 start_insert = base;
1218 else if (!slash && seen_slash_slash)
1219 /* example: "http://foo" */
1222 else if (slash && !seen_slash_slash)
1223 /* example: "foo/bar" */
1225 start_insert = base;
1226 else if (slash && seen_slash_slash)
1227 /* example: "http://something/" */
1229 start_insert = slash;
1231 span = start_insert - base;
1232 constr = (char *)xmalloc (span + linklength + 1);
1234 memcpy (constr, base, span);
1236 memcpy (constr + span, link, linklength);
1237 constr[span + linklength] = '\0';
1240 else /* !no_proto */
1242 constr = strdupdelim (link, link + linklength);
1247 /* Merge BASE with LINK and return the resulting URI. This is an
1248 interface to uri_merge_1 that assumes that LINK is a
1249 zero-terminated string. */
1251 uri_merge (const char *base, const char *link)
1253 return uri_merge_1 (base, link, strlen (link), !has_proto (link));
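/* Illustration (not part of the original file): typical merges,
   matching the relative and absolute cases described in uri_merge_1.  */
#if 0  /* sketch only; never compiled */
static void
example_uri_merge (void)
{
  char *a, *b, *c;
  a = uri_merge ("http://www.gnu.org/software/wget/index.html",
                 "manual.html");
  /* a == "http://www.gnu.org/software/wget/manual.html" */
  b = uri_merge ("http://www.gnu.org/software/wget/index.html",
                 "/graphics/logo.gif");
  /* b == "http://www.gnu.org/graphics/logo.gif" */
  c = uri_merge ("http://www.gnu.org/", "ftp://ftp.gnu.org/pub/");
  /* LINK already carries a protocol, so a plain copy is returned:
     c == "ftp://ftp.gnu.org/pub/" */
  xfree (a); xfree (b); xfree (c);
}
#endif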
1256 /* Optimize URL by host, destructively replacing u->host with realhost
1257 (u->host). Do this regardless of opt.simple_check. */
1259 opt_url (struct urlinfo *u)
1261 /* Find the "true" host. */
1262 char *host = realhost (u->host);
1265 assert (u->dir != NULL); /* the URL must have been parsed */
1266 /* Refresh the printed representation. */
1268 u->url = str_url (u, 0);
1271 /* Returns proxy host address, in accordance with PROTO. */
1273 getproxy (uerr_t proto)
1275 if (proto == URLHTTP)
1276 return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1277 else if (proto == URLFTP)
1278 return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1280 else if (proto == URLHTTPS)
1281 return opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1282 #endif /* HAVE_SSL */
1287 /* Should a host be accessed through a proxy, according to no_proxy? */
1289 no_proxy_match (const char *host, const char **no_proxy)
1294 return !sufmatch (no_proxy, host);
1297 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1298 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1300 /* Change the links in an HTML document. Accepts a structure that
1301 defines the positions of all the links. */
1303 convert_links (const char *file, urlpos *l)
1305 struct file_memory *fm;
1308 downloaded_file_t downloaded_file_return;
1310 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1313 /* First we do a "dry run": go through the list L and see whether
1314 any URL needs to be converted in the first place. If not, just
1315 leave the file alone. */
1318 for (dry = l; dry; dry = dry->next)
1319 if (dry->convert != CO_NOCONVERT)
1323 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1328 fm = read_file (file);
1331 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1332 file, strerror (errno));
1336 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1337 if (opt.backup_converted && downloaded_file_return)
1338 write_backup_file (file, downloaded_file_return);
1340 /* Before opening the file for writing, unlink the file. This is
1341 important if the data in FM is mmaped. In such case, nulling the
1342 file, which is what fopen() below does, would make us read all
1343 zeroes from the mmaped region. */
1344 if (unlink (file) < 0 && errno != ENOENT)
1346 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1347 file, strerror (errno));
1348 read_file_free (fm);
1351 /* Now open the file for writing. */
1352 fp = fopen (file, "wb");
1355 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1356 file, strerror (errno));
1357 read_file_free (fm);
1360 /* Here we loop through all the URLs in file, replacing those of
1361 them that are downloaded with relative references. */
1363 for (; l; l = l->next)
1365 char *url_start = fm->content + l->pos;
1367 if (l->pos >= fm->length)
1369 DEBUGP (("Something strange is going on. Please investigate."));
1372 /* If the URL is not to be converted, skip it. */
1373 if (l->convert == CO_NOCONVERT)
1375 DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1379 /* Echo the file contents, up to the offending URL's opening
1380 quote, to the outfile. */
1381 fwrite (p, 1, url_start - p, fp);
1383 if (l->convert == CO_CONVERT_TO_RELATIVE)
1385 /* Convert absolute URL to relative. */
1386 char *newname = construct_relative (file, l->local_name);
1387 char *quoted_newname = html_quote_string (newname);
1388 replace_attr (&p, l->size, fp, quoted_newname);
1389 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1390 l->url, newname, l->pos, file));
1392 xfree (quoted_newname);
1394 else if (l->convert == CO_CONVERT_TO_COMPLETE)
1396 /* Convert the link to absolute URL. */
1397 char *newlink = l->url;
1398 char *quoted_newlink = html_quote_string (newlink);
1399 replace_attr (&p, l->size, fp, quoted_newlink);
1400 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1401 newlink, l->pos, file));
1402 xfree (quoted_newlink);
1405 /* Output the rest of the file. */
1406 if (p - fm->content < fm->length)
1407 fwrite (p, 1, fm->length - (p - fm->content), fp);
1409 read_file_free (fm);
1410 logputs (LOG_VERBOSE, _("done.\n"));
1413 /* Construct and return a malloced copy of the relative link from two
1414 pieces of information: local name S1 of the referring file and
1415 local name S2 of the referred file.
1417 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1418 "jagor.srce.hr/images/news.gif", the function will return
1421 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1422 "fly.cc.fer.hr/images/fly.gif", the function will return
1423 "../images/fly.gif".
1425 Caveats: S1 should not begin with `/', unless S2 also begins with
1426 '/'. S1 should not contain things like ".." and such --
1427 construct_relative ("fly/ioccc/../index.html",
1428 "fly/images/fly.gif") will fail. (A workaround is to call
1429 something like path_simplify() on S1). */
1431 construct_relative (const char *s1, const char *s2)
1433 int i, cnt, sepdirs1;
1437 return xstrdup (s2);
1438 /* S1 should *not* be absolute, if S2 wasn't. */
1439 assert (*s1 != '/');
1441 /* Skip the directories common to both strings. */
1444 while (s1[i] && s2[i]
1449 if (s1[i] == '/' && s2[i] == '/')
1454 for (sepdirs1 = 0; s1[i]; i++)
1457 /* Now, construct the file as of:
1458 - ../ repeated sepdirs1 times
1459 - all the non-mutual directories of S2. */
1460 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1461 for (i = 0; i < sepdirs1; i++)
1462 memcpy (res + 3 * i, "../", 3);
1463 strcpy (res + 3 * i, s2 + cnt);
1467 /* Add URL to the head of the list L. */
1469 add_url (urlpos *l, const char *url, const char *file)
1473 t = (urlpos *)xmalloc (sizeof (urlpos));
1474 memset (t, 0, sizeof (*t));
1475 t->url = xstrdup (url);
1476 t->local_name = xstrdup (file);
1482 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1484 /* Rather than just writing over the original .html file with the
1485 converted version, save the former to *.orig. Note we only do
1486 this for files we've _successfully_ downloaded, so we don't
1487 clobber .orig files sitting around from previous invocations. */
1489 /* Construct the backup filename as the original name plus ".orig". */
1490 size_t filename_len = strlen(file);
1491 char* filename_plus_orig_suffix;
1492 boolean already_wrote_backup_file = FALSE;
1493 slist* converted_file_ptr;
1494 static slist* converted_files = NULL;
1496 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1498 /* Just write "orig" over "html". We need to do it this way
1499 because when we're checking to see if we've downloaded the
1500 file before (to see if we can skip downloading it), we don't
1501 know if it's a text/html file. Therefore we don't know yet
1502 at that stage that -E is going to cause us to tack on
1503 ".html", so we need to compare vs. the original URL plus
1504 ".orig", not the original URL plus ".html.orig". */
1505 filename_plus_orig_suffix = alloca (filename_len + 1);
1506 strcpy(filename_plus_orig_suffix, file);
1507 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1509 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1511 /* Append ".orig" to the name. */
1512 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1513 strcpy(filename_plus_orig_suffix, file);
1514 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1517 /* We can get called twice on the same URL thanks to the
1518 convert_all_links() call in main(). If we write the .orig file
1519 each time in such a case, it'll end up containing the first-pass
1520 conversion, not the original file. So, see if we've already been
1521 called on this file. */
1522 converted_file_ptr = converted_files;
1523 while (converted_file_ptr != NULL)
1524 if (strcmp(converted_file_ptr->string, file) == 0)
1526 already_wrote_backup_file = TRUE;
1530 converted_file_ptr = converted_file_ptr->next;
1532 if (!already_wrote_backup_file)
1534 /* Rename <file> to <file>.orig before former gets written over. */
1535 if (rename(file, filename_plus_orig_suffix) != 0)
1536 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1537 file, filename_plus_orig_suffix, strerror (errno));
1539 /* Remember that we've already written a .orig backup for this file.
1540 Note that we never free this memory since we need it till the
1541 convert_all_links() call, which is one of the last things the
1542 program does before terminating. BTW, I'm not sure if it would be
1543 safe to just set 'converted_file_ptr->string' to 'file' below,
1544 rather than making a copy of the string... Another note is that I
1545 thought I could just add a field to the urlpos structure saying
1546 that we'd written a .orig file for this URL, but that didn't work,
1547 so I had to make this separate list.
1548 -- Dan Harkless <wget@harkless.org>
1550 This [adding a field to the urlpos structure] didn't work
1551 because convert_file() is called twice: once after all its
1552 sublinks have been retrieved in recursive_retrieve(), and
1553 once at the end of the day in convert_all_links(). The
1554 original linked list collected in recursive_retrieve() is
1555 lost after the first invocation of convert_links(), and
1556 convert_all_links() makes a new one (it calls get_urls_html()
1557 for each file it covers.) That's why your first approach didn't
1558 work. The way to make it work is perhaps to make this flag a
1559 field in the `urls_html' list.
1560 -- Hrvoje Niksic <hniksic@arsdigita.com>
1562 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1563 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1564 converted_file_ptr->next = converted_files;
1565 converted_files = converted_file_ptr;
1569 static int find_fragment PARAMS ((const char *, int, const char **,
1573 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1575 const char *p = *pp;
1577 int size = raw_size;
1578 char quote_char = '\"';
1579 const char *frag_beg, *frag_end;
1581 /* Structure of our string is:
1582 "...old-contents..."
1583 <--- l->size ---> (with quotes)
1586 <--- l->size --> (no quotes) */
1588 if (*p == '\"' || *p == '\'')
1593 size -= 2; /* disregard opening and closing quote */
1595 putc (quote_char, fp);
1596 fputs (new_str, fp);
1598 /* Look for fragment identifier, if any. */
1599 if (find_fragment (p, size, &frag_beg, &frag_end))
1600 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1604 putc (quote_char, fp);
1608 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1609 preceded by '&'. If the character is not found, return zero. If
1610 the character is found, return 1 and set BP and EP to point to the
1611 beginning and end of the region.
1613 This is used for finding the fragment identifiers in URLs. */
1616 find_fragment (const char *beg, int size, const char **bp, const char **ep)
1618 const char *end = beg + size;
1620 for (; beg < end; beg++)
1642 typedef struct _downloaded_file_list {
1644 downloaded_file_t download_type;
1645 struct _downloaded_file_list* next;
1646 } downloaded_file_list;
1648 static downloaded_file_list *downloaded_files;
1650 /* Remembers which files have been downloaded. In the standard case, should be
1651 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1652 download successfully (i.e. not for ones we have failures on or that we skip
1655 When we've downloaded a file and tacked on a ".html" extension due to -E,
1656 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1657 FILE_DOWNLOADED_NORMALLY.
1659 If you just want to check if a file has been previously added without adding
1660 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1661 with local filenames, not remote URLs. */
1663 downloaded_file (downloaded_file_t mode, const char* file)
1665 boolean found_file = FALSE;
1666 downloaded_file_list* rover = downloaded_files;
1668 while (rover != NULL)
1669 if (strcmp(rover->file, file) == 0)
1675 rover = rover->next;
1678 return rover->download_type; /* file had already been downloaded */
1681 if (mode != CHECK_FOR_FILE)
1683 rover = xmalloc(sizeof(*rover));
1684 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1685 rover->download_type = mode;
1686 rover->next = downloaded_files;
1687 downloaded_files = rover;
1690 return FILE_NOT_ALREADY_DOWNLOADED;
1695 downloaded_files_free (void)
1697 downloaded_file_list* rover = downloaded_files;
1700 downloaded_file_list *next = rover->next;
1701 xfree (rover->file);
1707 /* Initialization of static stuff. */
1711 init_unsafe_char_table ();