/* Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.

   This file is part of Wget.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or (at
   your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#include <sys/types.h>

/* Default port definitions.  */
#define DEFAULT_HTTP_PORT 80
#define DEFAULT_FTP_PORT 21
/* Table of unsafe characters.  This is initialized in
   init_unsafe_char_table.  */
static char unsafe_char_table[256];

#define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
/* If S contains unsafe characters, free it and replace it with a
   version that doesn't.  */
#define URL_CLEANSE(s) do {			\
  if (contains_unsafe (s)) {			\
    char *uc_tmp = encode_string (s);		\
    xfree (s); (s) = uc_tmp;			\
  }						\
} while (0)
/* Is a directory "."?  */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* Is a directory ".."?  */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))

static void path_simplify_with_kludge PARAMS ((char *));

static int urlpath_length PARAMS ((const char *));
/* NULL-terminated list of strings to be recognized as prototypes (URL
   schemes).  Note that recognized doesn't mean supported -- only HTTP
   and FTP are currently supported.

   However, a string that does not match anything in the list will be
   considered a relative URL.  Thus it's important that this list
   contains anything anyone could think of being legal.

   There are wild things here.  :-) Take a look at
   <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more.  */
static char *protostrings[] =
/* Similar to the former, but for the supported protocols: */
static struct proto sup_protos[] =
  { "http://", URLHTTP, DEFAULT_HTTP_PORT },
  { "ftp://", URLFTP, DEFAULT_FTP_PORT },
  /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
static void parse_dir PARAMS ((const char *, char **, char **));
static uerr_t parse_uname PARAMS ((const char *, char **, char **));
static char *construct PARAMS ((const char *, const char *, int, int));
static char *construct_relative PARAMS ((const char *, const char *));
static char process_ftp_type PARAMS ((char *));
/* Returns the number of characters to be skipped if the first thing
   in a URL is URL: (which is 0 or 4+).  The optional spaces after
   URL: are also skipped.  */
skip_url (const char *url)

  if (TOUPPER (url[0]) == 'U'
      && TOUPPER (url[1]) == 'R'
      && TOUPPER (url[2]) == 'L'

    for (i = 4; url[i] && ISSPACE (url[i]); i++);
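/* Illustrative sketch (not from the original file): how skip_url() is
   meant to be used by a caller.  The literal URL string below is made
   up for the example.  */
#if 0
  const char *spec = "URL: http://www.example.com/";
  spec += skip_url (spec);   /* spec now points at "http://www.example.com/" */
#endif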
/* Characters treated as unsafe:
   - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
   - @ and :, for user/password encoding.
   - everything over 127 (but we don't bother with recording those).  */
init_unsafe_char_table (void)

  for (i = 0; i < 256; i++)
    if (i < 32 || i >= 127
      unsafe_char_table[i] = 1;
/* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
contains_unsafe (const char *s)

    if (UNSAFE_CHAR (*s))
/* Decodes the forms %xy in a URL to the character whose hexadecimal
   code is xy.  x and y are hexadecimal digits from
   [0123456789ABCDEF] (case-insensitive).  If x or y are not
   hex digits or `%' precedes `\0', the sequence is copied
   literally.  */
decode_string (char *s)

      /* Do nothing if at the end of the string, or if the chars
         are not hex digits.  */
      if (!*(s + 1) || !*(s + 2)
          || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))

      *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX-encoded string.  */
encode_string (const char *s)

  for (i = 0; *s; s++, i++)
    if (UNSAFE_CHAR (*s))
      i += 2;   /* Two more characters (hex digits) */
  res = (char *)xmalloc (i + 1);

  for (p = res; *s; s++)
    if (UNSAFE_CHAR (*s))
        const unsigned char c = *s;
        *p++ = HEXD2ASC (c >> 4);
        *p++ = HEXD2ASC (c & 0xf);
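/* Illustrative sketch (not from the original file): how contains_unsafe(),
   encode_string() and decode_string() fit together.  The path string is
   made up; '^' is one of the characters marked unsafe above.  */
#if 0
  char *raw = xstrdup ("/dir/file^name");
  if (contains_unsafe (raw))
    {
      char *enc = encode_string (raw);   /* "/dir/file%5Ename" */
      decode_string (enc);               /* back to "/dir/file^name", in place */
      xfree (enc);
    }
  xfree (raw);
#endif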
/* Returns the proto-type if URL's protocol is supported, or
   URLUNKNOWN if not.  */
urlproto (const char *url)

  url += skip_url (url);
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
      return sup_protos[i].ind;
  for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);

  for (++i; url[i] && url[i] != '/'; i++)
    if (!ISDIGIT (url[i]))

  if (url[i - 1] == ':')
/* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
   part is found, returns 0.  */
skip_proto (const char *url)

  for (s = protostrings; *s; s++)
    if (!strncasecmp (*s, url, strlen (*s)))

  /* HTTP and FTP protocols are expected to yield exact host names
     (i.e. the `//' part must be skipped, too).  */
  if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
/* Returns 1 if the URL begins with a protocol (supported or
   unsupported), 0 otherwise.  */
has_proto (const char *url)

  url += skip_url (url);
  for (s = protostrings; *s; s++)
    if (strncasecmp (url, *s, strlen (*s)) == 0)
/* Skip the username and password, if present.  The function should
   be called *not* with the complete URL, but with the part right
   after the protocol.

   If no username and password are found, return 0.  */
skip_uname (const char *url)

  for (p = url; *p && *p != '/'; p++)

  /* If a `@' was found before the first occurrence of `/', skip
     it.  */
/* Allocate a new urlinfo structure, fill it with default values and
   return a pointer to it.  */

  u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
  memset (u, 0, sizeof (*u));
  u->proto = URLUNKNOWN;
/* Perform a "deep" free of the urlinfo structure.  The structure
   should have been created with newurl, but need not have been used.
   If COMPLETE is non-0, free the pointer itself.  */
freeurl (struct urlinfo *u, int complete)

  FREE_MAYBE (u->host);
  FREE_MAYBE (u->path);
  FREE_MAYBE (u->file);
  FREE_MAYBE (u->user);
  FREE_MAYBE (u->passwd);
  FREE_MAYBE (u->local);
  FREE_MAYBE (u->referer);
    freeurl (u->proxy, 1);
/* Extract the given URL of the form
   (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
   1. hostname (terminated with `/' or `:')
   2. port number (terminated with `/'), or chosen for the protocol
   3. dirname (everything after hostname)
   Most errors are handled.  No allocation is done, you must supply
   pointers to allocated memory.
   ...and a host of other stuff :-)

   - Recognizes hostname:dir/file for FTP and
     hostname (:portnum)?/dir/file for HTTP.
   - Parses the path to yield directory and file
   - Parses the URL to yield the username and passwd (if present)
   - Decodes the strings, in case they contain "forbidden" characters
   - Writes the result to struct urlinfo

   If the argument STRICT is set, it recognizes only the canonical
   form.  */
parseurl (const char *url, struct urlinfo *u, int strict)
  int recognizable;   /* A recognizable URL is one where the protocol
                         name was explicitly named, i.e. it wasn't
                         deduced from the URL format.  */

  DEBUGP (("parseurl (\"%s\") -> ", url));
  url += skip_url (url);
  recognizable = has_proto (url);
  if (strict && !recognizable)

  for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
      l = strlen (sup_protos[i].name);
      if (!strncasecmp (sup_protos[i].name, url, l))

  /* If the protocol is recognizable but unsupported, bail out; else
     treat it as unknown.  */
  if (recognizable && i == ARRAY_SIZE (sup_protos))
  else if (i == ARRAY_SIZE (sup_protos))
    u->proto = type = sup_protos[i].ind;

  if (type == URLUNKNOWN)

  /* Allow a username and password to be specified (i.e. just skip
     them for now).  */
  l += skip_uname (url + l);
  for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);

  /* Get the hostname.  */
  u->host = strdupdelim (url + l, url + i);
  DEBUGP (("host %s -> ", u->host));
  /* Assume no port has been given.  */

      /* We have a colon delimiting the hostname.  It could mean that
         a port number is following it, or a directory.  */
      if (ISDIGIT (url[++i]))   /* A port number */
          if (type == URLUNKNOWN)
            u->proto = type = URLHTTP;
          for (; url[i] && url[i] != '/'; i++)
            if (ISDIGIT (url[i]))
              u->port = 10 * u->port + (url[i] - '0');
          DEBUGP (("port %hu -> ", u->port));
      else if (type == URLUNKNOWN)   /* or a directory */
        u->proto = type = URLFTP;
      else   /* or just a malformed port number */

  else if (type == URLUNKNOWN)
    u->proto = type = URLHTTP;

  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (sup_protos[i].ind == type)
  if (i == ARRAY_SIZE (sup_protos))
  u->port = sup_protos[i].port;
  /* Some delimiter troubles...  */
  if (url[i] == '/' && url[i - 1] != ':')

  while (url[i] && url[i] == '/')

  u->path = (char *)xmalloc (strlen (url + i) + 8);
  strcpy (u->path, url + i);

      u->ftp_type = process_ftp_type (u->path);
      /* #### We don't handle type `d' correctly yet.  */
      if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')

  DEBUGP (("opath %s -> ", u->path));
  /* Parse the username and password (if existing).  */
  parse_uname (url, &u->user, &u->passwd);
  /* Decode the strings, as per RFC 1738.  */
  decode_string (u->host);
  decode_string (u->path);
    decode_string (u->user);
    decode_string (u->passwd);
  /* Parse the directory.  */
  parse_dir (u->path, &u->dir, &u->file);
  DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
  /* Simplify the directory.  */
  path_simplify (u->dir);
  /* Remove the leading `/' in HTTP.  */
  if (type == URLHTTP && *u->dir == '/')
    strcpy (u->dir, u->dir + 1);
  DEBUGP (("ndir %s\n", u->dir));
  /* Strip trailing `/'.  */
  if (l && u->dir[l - 1] == '/')
    u->dir[l - 1] = '\0';
  /* Re-create the path: */
  abs_ftp = (u->proto == URLFTP && *u->dir == '/');
  /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
     abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
  strcpy (u->path, abs_ftp ? "%2F" : "/");
  strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
  strcat (u->path, *u->dir ? "/" : "");
  strcat (u->path, u->file);
  URL_CLEANSE (u->path);
  DEBUGP (("newpath: %s\n", u->path));
  /* Create the clean URL.  */
  u->url = str_url (u, 0);
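/* Illustrative sketch (not from the original file): one way a caller
   might drive parseurl() and str_url().  The URL literal is made up,
   and URLOK is assumed to be the uerr_t success value (it is not shown
   in this excerpt).  */
#if 0
  struct urlinfo *u = newurl ();
  if (parseurl ("http://user:secret@host.example.com:8000/a/b/file.html",
                u, 0) == URLOK)
    {
      /* Roughly: u->host is "host.example.com", u->port is 8000,
         u->dir is "a/b", u->file is "file.html", u->user is "user"
         and u->passwd is "secret".  */
      char *clean = str_url (u, 1);   /* printable form, password hidden */
      xfree (clean);
    }
  freeurl (u, 1);
#endif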
/* Special versions of DOTP and DDOTP for parse_dir().  */

#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.')	\
		     && (!*((x) + 2) || *((x) + 2) == '?'))
/* Build the directory and filename components of the path.  Both
   components are *separately* malloc-ed strings!  It does not change
   the contents of path.

   If the path ends with "." or "..", they are (correctly) counted as
   directories.  */
parse_dir (const char *path, char **dir, char **file)
  l = urlpath_length (path);
  for (i = l; i && path[i] != '/'; i--);

  if (!i && *path != '/')   /* Just a filename */
      if (PD_DOTP (path) || PD_DDOTP (path))
          *dir = strdupdelim (path, path + l);
          *file = xstrdup (path + l);   /* normally empty, but could
                                           contain a query string */

          *dir = xstrdup ("");   /* This is required because of FTP */
          *file = xstrdup (path);

  else if (!i)   /* /filename */
      if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
          *dir = strdupdelim (path, path + l);
          *file = xstrdup (path + l);   /* normally empty, but could
                                           contain a query string */

          *dir = xstrdup ("/");
          *file = xstrdup (path + 1);

  else   /* Nonempty directory with or without a filename */
      if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
          *dir = strdupdelim (path, path + l);
          *file = xstrdup (path + l);   /* normally empty, but could
                                           contain a query string */

          *dir = strdupdelim (path, path + i);
          *file = xstrdup (path + i + 1);
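/* Illustrative sketch (not from the original file): the split that
   parse_dir() performs.  Both returned strings are separately
   malloc-ed; the example paths are made up.  */
#if 0
  char *dir, *file;
  parse_dir ("/images/logo.png", &dir, &file);
  /* dir is "/images", file is "logo.png".  */
  xfree (dir);
  xfree (file);
  parse_dir ("README", &dir, &file);
  /* No slash at all: dir is "" and file is "README".  */
  xfree (dir);
  xfree (file);
#endif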
/* Find the optional username and password within the URL, as per
   RFC1738.  The returned user and passwd char pointers are
   malloc-ed.  */
parse_uname (const char *url, char **user, char **passwd)

  url += skip_url (url);
  /* Look for the end of the protocol string.  */
  l = skip_proto (url);

  /* Add the protocol offset.  */

  /* Is there an `@' character?  */
  for (p = url; *p && *p != '/'; p++)

  /* If not, return.  */

  /* Else find the username and password.  */
  for (p = col = url; *p != '@'; p++)
      if (*p == ':' && !*user)
          *user = (char *)xmalloc (p - url + 1);
          memcpy (*user, url, p - url);
          (*user)[p - url] = '\0';

  /* Decide whether you have only the username or both.  */
  where = *user ? passwd : user;
  *where = (char *)xmalloc (p - col + 1);
  memcpy (*where, col, p - col);
  (*where)[p - col] = '\0';
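/* Illustrative sketch (not from the original file): extracting the
   credentials from a URL.  The FTP URL is made up; user and passwd
   come back as separately malloc-ed strings, or stay NULL when the
   URL carries none.  */
#if 0
  char *user = NULL, *passwd = NULL;
  parse_uname ("ftp://joe:secret@ftp.example.com/pub/", &user, &passwd);
  /* user is "joe", passwd is "secret".  */
  FREE_MAYBE (user);
  FREE_MAYBE (passwd);
#endif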
/* If PATH ends with `;type=X', return the character X.  */
process_ftp_type (char *path)

  int len = strlen (path);

      && !memcmp (path + len - 7, ";type=", 6))
      path[len - 7] = '\0';
      return path[len - 1];
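/* Illustrative sketch (not from the original file): the ";type=X"
   suffix is stripped from the path and the type letter is returned.
   The path literal is made up.  */
#if 0
  char path[] = "/pub/file.txt;type=a";
  char t = process_ftp_type (path);
  /* t is 'a', and path has been truncated to "/pub/file.txt".  */
#endif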
/* Return the URL as a well-formed string, with a proper protocol,
   optional port number, directory and optional user/password.  If
   HIDE is non-zero, the password will be hidden.  The forbidden
   characters in the URL will be cleansed.  */
str_url (const struct urlinfo *u, int hide)
  char *res, *host, *user, *passwd, *proto_name, *dir, *file;
  int i, l, ln, lu, lh, lp, lf, ld;
  unsigned short proto_default_port;

  /* Look for the protocol name.  */
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (sup_protos[i].ind == u->proto)
  if (i == ARRAY_SIZE (sup_protos))
  proto_name = sup_protos[i].name;
  proto_default_port = sup_protos[i].port;
  host = CLEANDUP (u->host);
  dir = CLEANDUP (u->dir);
  file = CLEANDUP (u->file);
  user = passwd = NULL;
    user = CLEANDUP (u->user);
      passwd = CLEANDUP (u->passwd);
    for (i = 0; passwd[i]; i++)

  if (u->proto == URLFTP && *dir == '/')
      char *tmp = (char *)xmalloc (strlen (dir) + 3);
      /*sprintf (tmp, "%%2F%s", dir + 1);*/
      strcpy (tmp + 3, dir + 1);

  ln = strlen (proto_name);
  lu = user ? strlen (user) : 0;
  lp = passwd ? strlen (passwd) : 0;

  res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
  /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
     (user ? user : ""), (passwd ? ":" : ""),
     (passwd ? passwd : ""), (user ? "@" : ""),
     host, u->port, dir, *dir ? "/" : "", file); */
  memcpy (res, proto_name, ln);
  memcpy (res + l, user, lu);
  memcpy (res + l, passwd, lp);
  memcpy (res + l, host, lh);

  if (u->port != proto_default_port)
      long_to_string (res + l, (long)u->port);
      l += numdigit (u->port);

  memcpy (res + l, dir, ld);
  strcpy (res + l, file);
/* Check whether two URLs are equivalent, i.e. point to the same
   location.  Uses parseurl to parse them, and compares the canonical
   forms.

   Returns 1 if URL1 is equivalent to URL2, 0 otherwise.  Also
   returns 0 on error.  */
url_equal (const char *url1, const char *url2)

  struct urlinfo *u1, *u2;

  err = parseurl (url1, u1, 0);
  err = parseurl (url2, u2, 0);
  res = !strcmp (u1->url, u2->url);
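/* Illustrative sketch (not from the original file): url_equal()
   compares the canonical forms produced by parseurl()/str_url(), so
   superficial differences such as an explicit default port should not
   matter.  The host name is made up.  */
#if 0
  int same = url_equal ("http://host.example.com:80/a/b.html",
                        "http://host.example.com/a/b.html");
  /* same is expected to be 1: both parse to the same canonical URL.  */
#endif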
get_urls_file (const char *file)

  struct file_memory *fm;
  const char *text, *text_end;

  fm = read_file (file);
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));

  text_end = fm->content + fm->length;
  while (text < text_end)
      const char *line_beg = text;
      const char *line_end = memchr (text, '\n', text_end - text);

      while (line_beg < line_end
             && ISSPACE (*line_beg))
      while (line_end > line_beg + 1
             && ISSPACE (*(line_end - 1)))
      if (line_end > line_beg)
          urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
          memset (entry, 0, sizeof (*entry));
          entry->url = strdupdelim (line_beg, line_end);
/* Free the linked list of urlpos.  */
free_urlpos (urlpos *l)

      urlpos *next = l->next;
      FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times.  */
rotate_backups (const char *fname)

  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
  char *from = (char *)alloca (maxlen);
  char *to = (char *)alloca (maxlen);

  if (stat (fname, &sb) == 0)
    if (S_ISREG (sb.st_mode) == 0)

  for (i = opt.backups; i > 1; i--)
      sprintf (from, "%s.%d", fname, i - 1);
      sprintf (to, "%s.%d", fname, i);
      /* #### This will fail on machines without the rename() system
         call.  */

  sprintf (to, "%s.%d", fname, 1);
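/* Illustrative sketch (not from the original file): with opt.backups == 3,
   a rotation of "log" shifts the numbered copies up by one so that the
   caller can then write a fresh "log" without losing older versions.  */
#if 0
  /* Before: log, log.1, log.2   After: log.1, log.2, log.3
     ("log" itself becomes "log.1"; the oldest copy would be overwritten
     on the next rotation).  */
  rotate_backups ("log");
#endif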
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally.  */
mkalldirs (const char *path)

  p = path + strlen (path);
  for (; *p != '/' && p != path; p--);

  /* Don't create if it's just a file.  */
  if ((p == path) && (*p != '/'))

  t = strdupdelim (path, p);

  /* Check whether the directory exists.  */
  if ((stat (t, &st) == 0))
      if (S_ISDIR (st.st_mode))

          /* If the dir exists as a file name, remove it first.  This
             is *only* for Wget to work with buggy old CERN http
             servers.  Here is the scenario: When Wget tries to
             retrieve a directory without a slash, e.g.
             http://foo/bar (bar being a directory), the CERN server will
             not redirect it to http://foo/bar/ -- it will generate a
             directory listing containing links to bar/file1,
             bar/file2, etc.  Wget will lose because it saves this
             HTML listing to a file `bar', so it cannot create the
             directory.  To work around this, if the file of the same
             name exists, we just remove it and create the directory
             anew.  */
          DEBUGP (("Removing %s because of directory danger!\n", t));

  res = make_directory (t);
    logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
count_slashes (const char *s)

/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories.  */
mkstruct (const struct urlinfo *u)

  char *host, *dir, *file, *res, *dirpref;

  assert (u->dir != NULL);
  assert (u->host != NULL);

      char *ptr = u->dir + (*u->dir == '/');
      int slash_count = 1 + count_slashes (ptr);
      int cut = MINVAL (opt.cut_dirs, slash_count);
      for (; cut && *ptr; ptr++)
      STRDUP_ALLOCA (dir, ptr);

    dir = u->dir + (*u->dir == '/');

  host = xstrdup (u->host);
  /* Check for the true name (or at least a consistent name for saving
     to directory) of HOST, reusing the hlist if possible.  */
  if (opt.add_hostdir && !opt.simple_check)
      char *nhost = realhost (host);

  /* Add dir_prefix and hostname (if required) to the beginning of
     dir.  */
      if (!DOTP (opt.dir_prefix))
          dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
                                    + strlen (host) + 1);
          sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
        STRDUP_ALLOCA (dirpref, host);
  else   /* not add_hostdir */
      if (!DOTP (opt.dir_prefix))
        dirpref = opt.dir_prefix;

  /* If there is a prefix, prepend it.  */
      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
      sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);

  if (l && dir[l - 1] == '/')

    file = "index.html";

  /* Finally, construct the full name.  */
  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
  sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
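/* Illustrative sketch (not from the original file): the kind of local
   path mkstruct() builds.  The exact result depends on opt.dir_prefix,
   opt.add_hostdir and opt.cut_dirs; the value shown assumes a prefix of
   ".", host directories enabled and no directory cutting.  URLOK is
   assumed to be the uerr_t success value.  */
#if 0
  struct urlinfo *u = newurl ();
  if (parseurl ("http://www.example.com/pub/images/logo.gif", u, 0) == URLOK)
    {
      char *local = mkstruct (u);
      /* local would be something like "www.example.com/pub/images/logo.gif",
         allocated with xmalloc().  */
      xfree (local);
    }
  freeurl (u, 1);
#endif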
/* Create a unique filename, corresponding to a given URL.  Calls
   mkstruct if necessary.  Does *not* actually create any directories.  */
url_filename (const struct urlinfo *u)

  int have_prefix = 0;   /* whether we must prepend opt.dir_prefix */

      file = mkstruct (u);
        file = xstrdup ("index.html");
        file = xstrdup (u->file);

      /* Check whether the prefix directory is something other than "."
         before prepending it.  */
      if (!DOTP (opt.dir_prefix))
          char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
                                         + 1 + strlen (file) + 1);
          sprintf (nfile, "%s/%s", opt.dir_prefix, file);

  /* DOS-ish file systems don't like `%' signs in them; we substitute
     a different character.  */
      for (p = file; *p; p++)
#endif /* WINDOWS */

  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.

     The exception is the case when the file does exist and is a
     directory (actually support for bad httpd-s).  */
  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
      && !(file_exists_p (file) && !file_non_directory_p (file)))

  /* Find a unique name.  */
  name = unique_name (file);
/* Like strlen(), but allow the URL to be ended with '?'.  */
urlpath_length (const char *url)

  const char *q = strchr (url, '?');
  return strlen (url);

/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is almost completely equivalent to
   { *e = '\0'; return strrchr(b, c); }, except that it doesn't change
   the contents of the string.  */
find_last_char (const char *b, const char *e, char c)
/* Construct a URL by concatenating an absolute URL and a path, which
   may or may not be absolute.  This tries to behave "reasonably" in
   all foreseeable cases.  It employs little specific knowledge about
   protocols or URL-specific stuff -- it just works on strings.  */
construct (const char *url, const char *sub, int subsize, int no_proto)

      const char *end = url + urlpath_length (url);

          /* SUB is a relative URL: we need to replace everything
             after the last slash (possibly empty) with SUB.

             So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
             our result should be "whatever/foo/qux/xyzzy".  */
          int need_explicit_slash = 0;
          const char *start_insert;
          const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */

              /* No slash found at all.  Append SUB to what we have,
                 but we'll need a slash as a separator.

                 Example: if url == "foo" and sub == "qux/xyzzy", then
                 we cannot just append sub to url, because we'd get
                 "fooqux/xyzzy", whereas what we want is
                 "foo/qux/xyzzy".

                 To make sure the / gets inserted, we set
                 need_explicit_slash to 1.  We also set start_insert
                 to end + 1, so that the length calculations work out
                 correctly for one more (slash) character.  Accessing
                 that character is fine, since it will be the
                 delimiter, '\0' or '?'.  */
              /* example: "foo?..."                            */
              /*               ^   ('?' gets changed to '/')   */
              start_insert = end + 1;
              need_explicit_slash = 1;
          else if (last_slash && last_slash != url && *(last_slash - 1) == '/')
              /* example: "http://host"  */
              start_insert = end + 1;
              need_explicit_slash = 1;

              /* example: "whatever/foo/bar" */
              start_insert = last_slash + 1;

          span = start_insert - url;
          constr = (char *)xmalloc (span + subsize + 1);
          memcpy (constr, url, span);
          if (need_explicit_slash)
            constr[span - 1] = '/';
          memcpy (constr + span, sub, subsize);
          constr[span + subsize] = '\0';
      else   /* *sub == `/' */
          /* SUB is an absolute path: we need to replace everything
             after (and including) the FIRST slash with SUB.

             So, if URL is "http://host/whatever/foo/bar", and SUB is
             "/qux/xyzzy", our result should be
             "http://host/qux/xyzzy".  */

          const char *start_insert = NULL; /* for gcc to shut up. */
          const char *pos = url;
          int seen_slash_slash = 0;
          /* We're looking for the first slash, but want to ignore
             the double slash after the protocol.  */
          slash = memchr (pos, '/', end - pos);
          if (slash && !seen_slash_slash)
            if (*(slash + 1) == '/')
                seen_slash_slash = 1;

          /* At this point, SLASH is the location of the first / after
             "//", or the first slash altogether.  START_INSERT is the
             pointer to the location where SUB will be inserted.  When
             examining the last two examples, keep in mind that SUB
             begins with `/'.  */

          if (!slash && !seen_slash_slash)
            /* example: "foo" */
          else if (!slash && seen_slash_slash)
            /* example: "http://foo" */
          else if (slash && !seen_slash_slash)
            /* example: "foo/bar" */
          else if (slash && seen_slash_slash)
            /* example: "http://something/" */
            start_insert = slash;

          span = start_insert - url;
          constr = (char *)xmalloc (span + subsize + 1);
          memcpy (constr, url, span);
          memcpy (constr + span, sub, subsize);
          constr[span + subsize] = '\0';

  else   /* !no_proto */
      constr = strdupdelim (sub, sub + subsize);
/* Like the function above, but with a saner caller interface.  */
url_concat (const char *base_url, const char *new_url)

  return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));
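/* Illustrative sketch (not from the original file): resolving links
   found in a page against the page's own URL, showing the two cases
   handled by construct() above.  The URLs are made up.  */
#if 0
  char *a = url_concat ("http://host/dir/page.html", "pics/photo.jpg");
  /* a is "http://host/dir/pics/photo.jpg": a relative link replaces
     everything after the last slash.  */
  char *b = url_concat ("http://host/dir/page.html", "/top.html");
  /* b is "http://host/top.html": an absolute path replaces everything
     after the host.  */
  xfree (a);
  xfree (b);
#endif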
/* Optimize URL by host, destructively replacing u->host with realhost
   (u->host).  Do this regardless of opt.simple_check.  */
opt_url (struct urlinfo *u)

  /* Find the "true" host.  */
  char *host = realhost (u->host);

  assert (u->dir != NULL);   /* the URL must have been parsed */
  /* Refresh the printed representation.  */
  u->url = str_url (u, 0);
/* This beautiful kludge is fortunately not needed, as I've made
   parse_dir do the (almost) right thing, so that a query can never
   become a part of the directory.  */

/* Call path_simplify, but make sure that the part after the
   question mark, if any, is not destroyed by path_simplify's
   "optimizations".  */
path_simplify_with_kludge (char *path)

  char *query = strchr (path, '?');

  /* path_simplify also works destructively, so we also have the
     license to write.  */
  path_simplify (path);

      char *newend = path + strlen (path);
      if (newend != query)
        memmove (newend, query, strlen (query) + 1);
/* Returns the proxy host address, in accordance with PROTO.  */
getproxy (uerr_t proto)

  if (proto == URLHTTP)
    return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
  else if (proto == URLFTP)
    return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");

/* Should a host be accessed through a proxy, according to no_proxy?  */
no_proxy_match (const char *host, const char **no_proxy)

    return !sufmatch (no_proxy, host);
static void write_backup_file PARAMS ((const char *, downloaded_file_t));

/* Change the links in an HTML document.  Accepts a structure that
   defines the positions of all the links.  */
convert_links (const char *file, urlpos *l)

  struct file_memory *fm;
  downloaded_file_t downloaded_file_return;

  logprintf (LOG_VERBOSE, _("Converting %s... "), file);

  /* First we do a "dry run": go through the list L and see whether
     any URL needs to be converted in the first place.  If not, just
     leave the file alone.  */
  for (dry = l; dry; dry = dry->next)
    if (dry->convert != CO_NOCONVERT)

    logputs (LOG_VERBOSE, _("nothing to do.\n"));
  fm = read_file (file);
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));

  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    write_backup_file (file, downloaded_file_return);

  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmapped.  In that case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmapped region.  */
  if (unlink (file) < 0 && errno != ENOENT)
      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
                 file, strerror (errno));
      read_file_free (fm);

  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      read_file_free (fm);
  /* Here we loop through all the URLs in the file, replacing those
     that have been downloaded with relative references.  */
  for (; l; l = l->next)
      char *url_start = fm->content + l->pos;

      if (l->pos >= fm->length)
          DEBUGP (("Something strange is going on.  Please investigate."));

      /* If the URL is not to be converted, skip it.  */
      if (l->convert == CO_NOCONVERT)
          DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));

      /* Echo the file contents, up to the offending URL's opening
         quote, to the outfile.  */
      fwrite (p, 1, url_start - p, fp);

      if (l->convert == CO_CONVERT_TO_RELATIVE)
          /* Convert the absolute URL to a relative one.  */
          char *newname = construct_relative (file, l->local_name);
          char *quoted_newname = html_quote_string (newname);
          putc (*p, fp);   /* quoting char */
          fputs (quoted_newname, fp);
          putc (*p, fp);   /* close quote */
          xfree (quoted_newname);
          DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
                   l->url, newname, l->pos, file));
      else if (l->convert == CO_CONVERT_TO_COMPLETE)
          /* Convert the link to an absolute URL.  */
          char *newlink = l->url;
          char *quoted_newlink = html_quote_string (newlink);
          putc (*p, fp);   /* quoting char */
          fputs (quoted_newlink, fp);
          putc (*p, fp);   /* close quote */
          xfree (quoted_newlink);
          DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
                   newlink, l->pos, file));

  /* Output the rest of the file.  */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);
  read_file_free (fm);
  logputs (LOG_VERBOSE, _("done.\n"));
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternatively, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1.)  */
construct_relative (const char *s1, const char *s2)

  int i, cnt, sepdirs1;

    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');

  /* Skip the directories common to both strings.  */
  while (s1[i] && s2[i]
  if (s1[i] == '/' && s2[i] == '/')
  for (sepdirs1 = 0; s1[i]; i++)

  /* Now, construct the result as:
     - "../" repeated sepdirs1 times,
     - followed by all the non-shared directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
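/* Illustrative sketch (not from the original file): the relative links
   produced for two local file names, matching the examples given in
   the comment above.  */
#if 0
  char *rel1 = construct_relative ("jagor.srce.hr/index.html",
                                   "jagor.srce.hr/images/news.gif");
  /* rel1 is "images/news.gif" -- same directory level, no "../" needed.  */
  char *rel2 = construct_relative ("fly.cc.fer.hr/ioccc/index.html",
                                   "fly.cc.fer.hr/images/fly.gif");
  /* rel2 is "../images/fly.gif" -- one directory has to be climbed.  */
  xfree (rel1);
  xfree (rel2);
#endif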
/* Add URL to the head of the list L.  */
add_url (urlpos *l, const char *url, const char *file)

  t = (urlpos *)xmalloc (sizeof (urlpos));
  memset (t, 0, sizeof (*t));
  t->url = xstrdup (url);
  t->local_name = xstrdup (file);
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)

  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations.  */

  /* Construct the backup filename as the original name plus ".orig".  */
  size_t filename_len = strlen (file);
  char *filename_plus_orig_suffix;
  boolean already_wrote_backup_file = FALSE;
  slist *converted_file_ptr;
  static slist *converted_files = NULL;

  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
      /* Just write "orig" over "html".  We need to do it this way
         because when we're checking to see if we've downloaded the
         file before (to see if we can skip downloading it), we don't
         know if it's a text/html file.  Therefore we don't know yet
         at that stage that -E is going to cause us to tack on
         ".html", so we need to compare vs. the original URL plus
         ".orig", not the original URL plus ".html.orig".  */
      filename_plus_orig_suffix = alloca (filename_len + 1);
      strcpy (filename_plus_orig_suffix, file);
      strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
  else   /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
      /* Append ".orig" to the name.  */
      filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
      strcpy (filename_plus_orig_suffix, file);
      strcpy (filename_plus_orig_suffix + filename_len, ".orig");
  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main().  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file.  */
  converted_file_ptr = converted_files;
  while (converted_file_ptr != NULL)
      if (strcmp (converted_file_ptr->string, file) == 0)
          already_wrote_backup_file = TRUE;
      converted_file_ptr = converted_file_ptr->next;

  if (!already_wrote_backup_file)
      /* Rename <file> to <file>.orig before the former gets written over.  */
      if (rename (file, filename_plus_orig_suffix) != 0)
        logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
                   file, filename_plus_orig_suffix, strerror (errno));

      /* Remember that we've already written a .orig backup for this file.
         Note that we never free this memory since we need it till the
         convert_all_links() call, which is one of the last things the
         program does before terminating.  BTW, I'm not sure if it would be
         safe to just set 'converted_file_ptr->string' to 'file' below,
         rather than making a copy of the string...  Another note is that I
         thought I could just add a field to the urlpos structure saying
         that we'd written a .orig file for this URL, but that didn't work,
         so I had to make this separate list.

         This [adding a field to the urlpos structure] didn't work
         because convert_file() is called twice: once after all its
         sublinks have been retrieved in recursive_retrieve(), and
         once at the end of the day in convert_all_links().  The
         original linked list collected in recursive_retrieve() is
         lost after the first invocation of convert_links(), and
         convert_all_links() makes a new one (it calls get_urls_html()
         for each file it covers.)  That's why your approach didn't
         work.  The way to make it work is perhaps to make this flag a
         field in the `urls_html' list.  */
      converted_file_ptr = xmalloc (sizeof (*converted_file_ptr));
      converted_file_ptr->string = xstrdup (file);   /* die on out-of-mem. */
      converted_file_ptr->next = converted_files;
      converted_files = converted_file_ptr;
typedef struct _downloaded_file_list
{
  char *file;                   /* local file name (the list key) */
  downloaded_file_t download_type;
  struct _downloaded_file_list *next;
} downloaded_file_list;

static downloaded_file_list *downloaded_files;
/* Remembers which files have been downloaded.  In the standard case, should be
   called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
   download successfully (i.e. not for ones we have failures on or that we
   skip).

   When we've downloaded a file and tacked on a ".html" extension due to -E,
   call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
   FILE_DOWNLOADED_NORMALLY.

   If you just want to check if a file has been previously added without adding
   it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
   with local filenames, not remote URLs.  */
downloaded_file (downloaded_file_t mode, const char *file)

  boolean found_file = FALSE;
  downloaded_file_list *rover = downloaded_files;

  while (rover != NULL)
    if (strcmp (rover->file, file) == 0)
    rover = rover->next;

    return rover->download_type;   /* file had already been downloaded */

  if (mode != CHECK_FOR_FILE)
      rover = xmalloc (sizeof (*rover));
      rover->file = xstrdup (file);   /* use xstrdup() so we die on out-of-mem. */
      rover->download_type = mode;
      rover->next = downloaded_files;
      downloaded_files = rover;

  return FILE_NOT_ALREADY_DOWNLOADED;
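/* Illustrative sketch (not from the original file): recording a download
   and later querying the list.  Call sites pass local file names, never
   remote URLs; the file name below is made up.  */
#if 0
  downloaded_file (FILE_DOWNLOADED_NORMALLY, "www.example.com/index.html");
  if (downloaded_file (CHECK_FOR_FILE, "www.example.com/index.html")
      != FILE_NOT_ALREADY_DOWNLOADED)
    {
      /* The file is already on the list; e.g. skip re-downloading or
         re-backing it up.  */
    }
#endif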
downloaded_files_free (void)

  downloaded_file_list *rover = downloaded_files;
      downloaded_file_list *next = rover->next;
      xfree (rover->file);

/* Initialization of static stuff.  */
  init_unsafe_char_table ();