Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.

This file is part of Wget.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#include <sys/types.h>

/* Default port definitions */
#define DEFAULT_HTTP_PORT 80
#define DEFAULT_FTP_PORT 21
/* Table of Unsafe chars.  This is initialized in
   init_unsafe_char_table.  */

static char unsafe_char_table[256];

#define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
/* If S contains unsafe characters, free it and replace it with a
   version that doesn't.  */
#define URL_CLEANSE(s) do {                             \
  if (contains_unsafe (s))                              \
    { char *uc_tmp = encode_string (s);                 \
      xfree (s); (s) = uc_tmp; } } while (0)
/* Is a directory "."?  */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* Is a directory ".."?  */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))

static void path_simplify_with_kludge PARAMS ((char *));

static int urlpath_length PARAMS ((const char *));
/* NULL-terminated list of strings to be recognized as prototypes (URL
   schemes).  Note that recognized doesn't mean supported -- only HTTP
   and FTP are currently supported.

   However, a string that does not match anything in the list will be
   considered a relative URL.  Thus it's important that this list has
   anything anyone could think of being legal.

   There are wild things here.  :-) Take a look at
   <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more.  */

static char *protostrings[] =
/* Similar to the former, but for supported protocols: */
static struct proto sup_protos[] =
{
  { "http://", URLHTTP, DEFAULT_HTTP_PORT },
  { "ftp://", URLFTP, DEFAULT_FTP_PORT },
  /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
};

static void parse_dir PARAMS ((const char *, char **, char **));
static uerr_t parse_uname PARAMS ((const char *, char **, char **));
static char *construct PARAMS ((const char *, const char *, int, int));
static char *construct_relative PARAMS ((const char *, const char *));
static char process_ftp_type PARAMS ((char *));
/* Returns the number of characters to be skipped if the first thing
   in a URL is URL: (which is 0 or 4+).  The optional spaces after
   URL: are also skipped.  */
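/* Illustrative values (not part of the original source), assuming the
   behavior described above:

     skip_url ("URL:http://host/")    ->  4   (skip "URL:")
     skip_url ("URL:  http://host/")  ->  6   (skip "URL:" and the spaces)
     skip_url ("http://host/")        ->  0   (nothing to skip)  */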
skip_url (const char *url)
  if (TOUPPER (url[0]) == 'U'
      && TOUPPER (url[1]) == 'R'
      && TOUPPER (url[2]) == 'L'
  for (i = 4; url[i] && ISSPACE (url[i]); i++);
   - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
   - @ and :, for user/password encoding.
   - everything over 127 (but we don't bother recording those).  */
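/* For illustration (example values only): with a table built from the
   rules above, UNSAFE_CHAR ('a') and UNSAFE_CHAR ('/') are 0, while
   UNSAFE_CHAR ('%'), UNSAFE_CHAR ('@') and UNSAFE_CHAR (':') are 1, so a
   string such as "a@b:c" is the kind of input contains_unsafe() reports
   and encode_string() rewrites.  */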
init_unsafe_char_table (void)
  for (i = 0; i < 256; i++)
    if (i < 32 || i >= 127
      unsafe_char_table[i] = 1;

/* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
contains_unsafe (const char *s)
  if (UNSAFE_CHAR (*s))
/* Decodes the forms %xy in a URL to the character the hexadecimal
   code of which is xy.  xy are hexadecimal digits from
   [0123456789ABCDEF] (case-insensitive).  If x or y are not
   hex-digits or `%' precedes `\0', the sequence is inserted literally.  */
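/* Illustrative behavior (example values, not from the original source):
   a buffer holding "foo%20bar%2Fbaz" is rewritten in place to
   "foo bar/baz", while malformed sequences such as "%zz" or a trailing
   "%2" are left exactly as they were.  */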
decode_string (char *s)
      /* Do nothing if at the end of the string, or if the chars
         are not hex-digits.  */
      if (!*(s + 1) || !*(s + 2)
          || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
      *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));

/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX encoded string.  */
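/* Illustrative behavior: encode_string ("foo bar@baz") would return a
   freshly malloc-ed "foo%20bar%40baz" (assuming ' ' and '@' are flagged
   in unsafe_char_table), leaving the input string untouched.  */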
encode_string (const char *s)
  for (i = 0; *s; s++, i++)
    if (UNSAFE_CHAR (*s))
      i += 2;               /* Two more characters (hex digits) */
  res = (char *)xmalloc (i + 1);
  for (p = res; *s; s++)
    if (UNSAFE_CHAR (*s))
        const unsigned char c = *s;
        *p++ = HEXD2ASC (c >> 4);
        *p++ = HEXD2ASC (c & 0xf);
/* Returns the proto-type if URL's protocol is supported, or
   URLUNKNOWN if not.  */
urlproto (const char *url)
  url += skip_url (url);
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
      return sup_protos[i].ind;
  for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
  for (++i; url[i] && url[i] != '/'; i++)
    if (!ISDIGIT (url[i]))
  if (url[i - 1] == ':')
/* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
   part is found, returns 0.  */
skip_proto (const char *url)
  for (s = protostrings; *s; s++)
    if (!strncasecmp (*s, url, strlen (*s)))
  /* HTTP and FTP protocols are expected to yield exact host names
     (i.e. the `//' part must be skipped, too).  */
  if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))

/* Returns 1 if the URL begins with a protocol (supported or
   unsupported), 0 otherwise.  */
has_proto (const char *url)
  url += skip_url (url);
  for (s = protostrings; *s; s++)
    if (strncasecmp (url, *s, strlen (*s)) == 0)

/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the protocol.

   If no username and password are found, return 0.  */
skip_uname (const char *url)
  for (p = url; *p && *p != '/'; p++)
  /* If a `@' was found before the first occurrence of `/', skip
/* Allocate a new urlinfo structure, fill it with default values and
   return a pointer to it.  */
  u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
  memset (u, 0, sizeof (*u));
  u->proto = URLUNKNOWN;

/* Perform a "deep" free of the urlinfo structure.  The structure
   should have been created with newurl, but need not have been used.
   If COMPLETE is non-0, free the pointer itself.  */
freeurl (struct urlinfo *u, int complete)
  FREE_MAYBE (u->host);
  FREE_MAYBE (u->path);
  FREE_MAYBE (u->file);
  FREE_MAYBE (u->user);
  FREE_MAYBE (u->passwd);
  FREE_MAYBE (u->local);
  FREE_MAYBE (u->referer);
  freeurl (u->proxy, 1);
/* Extract the given URL of the form
   (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
   1. hostname (terminated with `/' or `:')
   2. port number (terminated with `/'), or chosen for the protocol
   3. dirname (everything after hostname)
   Most errors are handled.  No allocation is done, you must supply
   pointers to allocated memory.
   ...and a host of other stuff :-)

   - Recognizes hostname:dir/file for FTP and
     hostname (:portnum)?/dir/file for HTTP.
   - Parses the path to yield directory and file
   - Parses the URL to yield the username and passwd (if present)
   - Decodes the strings, in case they contain "forbidden" characters
   - Writes the result to struct urlinfo

   If the argument STRICT is set, it recognizes only the canonical
   form.  */
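/* Worked example (illustrative, not from the original source): parsing
   "http://john:doe@www.server.com:8000/dir/sub/file.html?q=1" with
   STRICT == 0 should leave roughly this in *u:

     proto = URLHTTP            port   = 8000
     host  = "www.server.com"   user   = "john"      passwd = "doe"
     dir   = "dir/sub"          file   = "file.html?q=1"
     path  = "/dir/sub/file.html?q=1"

   (the leading `/' of DIR is stripped for HTTP, and anything after `?'
   stays attached to FILE).  */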
parseurl (const char *url, struct urlinfo *u, int strict)
  int recognizable;            /* Recognizable URL is the one where
                                  the protocol name was explicitly
                                  named, i.e. it wasn't deduced from
  DEBUGP (("parseurl (\"%s\") -> ", url));
  url += skip_url (url);
  recognizable = has_proto (url);
  if (strict && !recognizable)
  for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
      l = strlen (sup_protos[i].name);
      if (!strncasecmp (sup_protos[i].name, url, l))
  /* If protocol is recognizable, but unsupported, bail out, else
  if (recognizable && i == ARRAY_SIZE (sup_protos))
  else if (i == ARRAY_SIZE (sup_protos))
  u->proto = type = sup_protos[i].ind;
  if (type == URLUNKNOWN)
  /* Allow a username and password to be specified (i.e. just skip
  l += skip_uname (url + l);
  for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
  /* Get the hostname.  */
  u->host = strdupdelim (url + l, url + i);
  DEBUGP (("host %s -> ", u->host));
  /* Assume no port has been given.  */
  /* We have a colon delimiting the hostname.  It could mean that
     a port number is following it, or a directory.  */
  if (ISDIGIT (url[++i]))    /* A port number */
      if (type == URLUNKNOWN)
        u->proto = type = URLHTTP;
      for (; url[i] && url[i] != '/'; i++)
        if (ISDIGIT (url[i]))
          u->port = 10 * u->port + (url[i] - '0');
      DEBUGP (("port %hu -> ", u->port));
  else if (type == URLUNKNOWN)  /* or a directory */
    u->proto = type = URLFTP;
  else                          /* or just a malformed port number */
  else if (type == URLUNKNOWN)
    u->proto = type = URLHTTP;
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (sup_protos[i].ind == type)
  if (i == ARRAY_SIZE (sup_protos))
  u->port = sup_protos[i].port;
  /* Some delimiter troubles...  */
  if (url[i] == '/' && url[i - 1] != ':')
  while (url[i] && url[i] == '/')
  u->path = (char *)xmalloc (strlen (url + i) + 8);
  strcpy (u->path, url + i);
      u->ftp_type = process_ftp_type (u->path);
      /* #### We don't handle type `d' correctly yet.  */
      if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
  DEBUGP (("opath %s -> ", u->path));
  /* Parse the username and password (if existing).  */
  parse_uname (url, &u->user, &u->passwd);
  /* Decode the strings, as per RFC 1738.  */
  decode_string (u->host);
  decode_string (u->path);
    decode_string (u->user);
    decode_string (u->passwd);
  /* Parse the directory.  */
  parse_dir (u->path, &u->dir, &u->file);
  DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
  /* Simplify the directory.  */
  path_simplify (u->dir);
  /* Remove the leading `/' in HTTP.  */
  if (type == URLHTTP && *u->dir == '/')
    strcpy (u->dir, u->dir + 1);
  DEBUGP (("ndir %s\n", u->dir));
  /* Strip trailing `/'.  */
  if (l && u->dir[l - 1] == '/')
    u->dir[l - 1] = '\0';
  /* Re-create the path: */
  abs_ftp = (u->proto == URLFTP && *u->dir == '/');
  /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
     abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
  strcpy (u->path, abs_ftp ? "%2F" : "/");
  strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
  strcat (u->path, *u->dir ? "/" : "");
  strcat (u->path, u->file);
  URL_CLEANSE (u->path);
  DEBUGP (("newpath: %s\n", u->path));
  /* Create the clean URL.  */
  u->url = str_url (u, 0);
/* Special versions of DOTP and DDOTP for parse_dir().  */

#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
                     && (!*((x) + 2) || *((x) + 2) == '?'))
/* Build the directory and filename components of the path.  Both
   components are *separately* malloc-ed strings!  It does not change
   the contents of path.

   If the path ends with "." or "..", they are (correctly) counted as
   directories.  */
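/* Illustrative splits (example values, not from the original source):

     "foo/bar/baz.gif"  ->  dir "foo/bar",  file "baz.gif"
     "/fly.gif"         ->  dir "/",        file "fly.gif"
     "baz.gif"          ->  dir "",         file "baz.gif"
     "foo/bar/"         ->  dir "foo/bar",  file ""

   A trailing "." or ".." (optionally followed by `?') is treated as a
   directory: DIR gets the whole path and FILE is left (normally) empty.  */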
parse_dir (const char *path, char **dir, char **file)
  l = urlpath_length (path);
  for (i = l; i && path[i] != '/'; i--);

  if (!i && *path != '/')   /* Just filename */
      if (PD_DOTP (path) || PD_DDOTP (path))
          *dir = strdupdelim (path, path + l);
          *file = xstrdup (path + l); /* normally empty, but could
          *dir = xstrdup (""); /* This is required because of FTP */
          *file = xstrdup (path);
  else if (!i)              /* /filename */
      if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
          *dir = strdupdelim (path, path + l);
          *file = xstrdup (path + l); /* normally empty, but could
          *dir = xstrdup ("/");
          *file = xstrdup (path + 1);
  else                      /* Nonempty directory with or without a filename */
      if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
          *dir = strdupdelim (path, path + l);
          *file = xstrdup (path + l); /* normally empty, but could
          *dir = strdupdelim (path, path + i);
          *file = xstrdup (path + i + 1);
/* Find the optional username and password within the URL, as per
   RFC1738.  The returned user and passwd char pointers are
   malloc-ed.  */
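/* Illustrative results (example values only):

     "ftp://john:secret@host/dir/"  ->  *user = "john", *passwd = "secret"
     "ftp://john@host/dir/"         ->  *user = "john", *passwd left unset
     "http://host/dir/"             ->  neither USER nor PASSWD is set  */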
parse_uname (const char *url, char **user, char **passwd)
  url += skip_url (url);
  /* Look for end of protocol string.  */
  l = skip_proto (url);
  /* Add protocol offset.  */
  /* Is there an `@' character?  */
  for (p = url; *p && *p != '/'; p++)
  /* If not, return.  */
  /* Else find the username and password.  */
  for (p = col = url; *p != '@'; p++)
    if (*p == ':' && !*user)
        *user = (char *)xmalloc (p - url + 1);
        memcpy (*user, url, p - url);
        (*user)[p - url] = '\0';
  /* Decide whether you have only the username or both.  */
  where = *user ? passwd : user;
  *where = (char *)xmalloc (p - col + 1);
  memcpy (*where, col, p - col);
  (*where)[p - col] = '\0';
/* If PATH ends with `;type=X', return the character X.  */
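/* Illustrative behavior: given a PATH of "/pub/README;type=a", the
   function truncates PATH to "/pub/README" and returns 'a'; a path
   without the suffix is left untouched and a "no type" value
   (presumably '\0') is returned instead.  */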
process_ftp_type (char *path)
  int len = strlen (path);
      && !memcmp (path + len - 7, ";type=", 6))
      path[len - 7] = '\0';
      return path[len - 1];
/* Return the URL as a well-formed string, with a proper protocol,
   optional port number, directory and optional user/password.  If
   HIDE is non-zero, the password will be hidden.  The forbidden
   characters in the URL will be cleansed.  */
str_url (const struct urlinfo *u, int hide)
  char *res, *host, *user, *passwd, *proto_name, *dir, *file;
  int i, l, ln, lu, lh, lp, lf, ld;
  unsigned short proto_default_port;

  /* Look for the protocol name.  */
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (sup_protos[i].ind == u->proto)
  if (i == ARRAY_SIZE (sup_protos))
  proto_name = sup_protos[i].name;
  proto_default_port = sup_protos[i].port;
  host = CLEANDUP (u->host);
  dir = CLEANDUP (u->dir);
  file = CLEANDUP (u->file);
  user = passwd = NULL;
    user = CLEANDUP (u->user);
    passwd = CLEANDUP (u->passwd);
    for (i = 0; passwd[i]; i++)
  if (u->proto == URLFTP && *dir == '/')
      char *tmp = (char *)xmalloc (strlen (dir) + 3);
      /*sprintf (tmp, "%%2F%s", dir + 1);*/
      strcpy (tmp + 3, dir + 1);
  ln = strlen (proto_name);
  lu = user ? strlen (user) : 0;
  lp = passwd ? strlen (passwd) : 0;
  res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
  /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
     (user ? user : ""), (passwd ? ":" : ""),
     (passwd ? passwd : ""), (user ? "@" : ""),
     host, u->port, dir, *dir ? "/" : "", file); */
  memcpy (res, proto_name, ln);
      memcpy (res + l, user, lu);
          memcpy (res + l, passwd, lp);
  memcpy (res + l, host, lh);
  if (u->port != proto_default_port)
      long_to_string (res + l, (long)u->port);
      l += numdigit (u->port);
  memcpy (res + l, dir, ld);
  strcpy (res + l, file);
/* Check whether two URL-s are equivalent, i.e. pointing to the same
   location.  Uses parseurl to parse them, and compares the canonical
   forms.

   Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
   return 0 on error.  */
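/* Illustrative use (example values only): url_equal ("http://host/a/../b.html",
   "http://host/b.html") should yield 1, since both should parse to the
   same canonical u->url, while any URL that parseurl() rejects makes the
   function return 0.  */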
url_equal (const char *url1, const char *url2)
  struct urlinfo *u1, *u2;
  err = parseurl (url1, u1, 0);
  err = parseurl (url2, u2, 0);
  res = !strcmp (u1->url, u2->url);

get_urls_file (const char *file)
  struct file_memory *fm;
  const char *text, *text_end;

  fm = read_file (file);
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
  text_end = fm->content + fm->length;
  while (text < text_end)
      const char *line_beg = text;
      const char *line_end = memchr (text, '\n', text_end - text);
      while (line_beg < line_end
             && ISSPACE (*line_beg))
      while (line_end > line_beg + 1
             && ISSPACE (*(line_end - 1)))
      if (line_end > line_beg)
          urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
          memset (entry, 0, sizeof (*entry));
          entry->url = strdupdelim (line_beg, line_end);

/* Free the linked list of urlpos.  */
free_urlpos (urlpos *l)
      urlpos *next = l->next;
      FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times */
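/* Illustrative effect with opt.backups == 3 and FNAME "log": an existing
   "log.2" is renamed to "log.3", "log.1" to "log.2", and finally "log"
   itself is (presumably) renamed to "log.1", so the freshest backup is
   always the ".1" file.  */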
rotate_backups (const char *fname)
  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
  char *from = (char *)alloca (maxlen);
  char *to = (char *)alloca (maxlen);

  if (stat (fname, &sb) == 0)
    if (S_ISREG (sb.st_mode) == 0)
  for (i = opt.backups; i > 1; i--)
      sprintf (from, "%s.%d", fname, i - 1);
      sprintf (to, "%s.%d", fname, i);
      /* #### This will fail on machines without the rename() system
  sprintf (to, "%s.%d", fname, 1);
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally.  */
mkalldirs (const char *path)
  p = path + strlen (path);
  for (; *p != '/' && p != path; p--);
  /* Don't create if it's just a file.  */
  if ((p == path) && (*p != '/'))
  t = strdupdelim (path, p);
  /* Check whether the directory exists.  */
  if ((stat (t, &st) == 0))
      if (S_ISDIR (st.st_mode))
          /* If the dir exists as a file name, remove it first.  This
             is *only* for Wget to work with buggy old CERN http
             servers.  Here is the scenario: When Wget tries to
             retrieve a directory without a slash, e.g.
             http://foo/bar (bar being a directory), CERN server will
             not redirect it to http://foo/bar/ -- it will generate a
             directory listing containing links to bar/file1,
             bar/file2, etc.  Wget will lose because it saves this
             HTML listing to a file `bar', so it cannot create the
             directory.  To work around this, if the file of the same
             name exists, we just remove it and create the directory
             anyway.  */
          DEBUGP (("Removing %s because of directory danger!\n", t));
  res = make_directory (t);
      logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
count_slashes (const char *s)

/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories.  */
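/* Illustrative result (example values, not from the original source): for
   a URL parsed from "http://www.server.com/pub/gnu/index.html", mkstruct
   would typically return "www.server.com/pub/gnu/index.html" when
   opt.add_hostdir is set and "pub/gnu/index.html" when it is not, with
   opt.dir_prefix prepended when it is something other than "." and
   opt.cut_dirs leading path components removed.  */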
mkstruct (const struct urlinfo *u)
  char *host, *dir, *file, *res, *dirpref;

  assert (u->dir != NULL);
  assert (u->host != NULL);
      char *ptr = u->dir + (*u->dir == '/');
      int slash_count = 1 + count_slashes (ptr);
      int cut = MINVAL (opt.cut_dirs, slash_count);
      for (; cut && *ptr; ptr++)
      STRDUP_ALLOCA (dir, ptr);
    dir = u->dir + (*u->dir == '/');
  host = xstrdup (u->host);
  /* Check for the true name (or at least a consistent name for saving
     to directory) of HOST, reusing the hlist if possible.  */
  if (opt.add_hostdir && !opt.simple_check)
      char *nhost = realhost (host);
  /* Add dir_prefix and hostname (if required) to the beginning of
      if (!DOTP (opt.dir_prefix))
          dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
                                    + strlen (host) + 1);
          sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
        STRDUP_ALLOCA (dirpref, host);
  else  /* not add_hostdir */
      if (!DOTP (opt.dir_prefix))
        dirpref = opt.dir_prefix;
  /* If there is a prefix, prepend it.  */
      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
      sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
  if (l && dir[l - 1] == '/')
  file = "index.html";
  /* Finally, construct the full name.  */
  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
  sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);

/* Create a unique filename, corresponding to a given URL.  Calls
   mkstruct if necessary.  Does *not* actually create any directories.  */
url_filename (const struct urlinfo *u)
  int have_prefix = 0;  /* whether we must prepend opt.dir_prefix */
    file = mkstruct (u);
        file = xstrdup ("index.html");
        file = xstrdup (u->file);
      /* Check whether the prefix directory is something other than "."
         before prepending it.  */
      if (!DOTP (opt.dir_prefix))
          char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
                                         + 1 + strlen (file) + 1);
          sprintf (nfile, "%s/%s", opt.dir_prefix, file);
  /* DOS-ish file systems don't like `%' signs in them; we change it
  for (p = file; *p; p++)
#endif /* WINDOWS */
  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.

     The exception is the case when file does exist and is a
     directory (actually support for bad httpd-s).  */
  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
      && !(file_exists_p (file) && !file_non_directory_p (file)))
  /* Find a unique name.  */
  name = unique_name (file);
/* Like strlen(), but allow the URL to be ended with '?'.  */
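/* Illustrative values: urlpath_length ("dir/file.html?foo=bar") -> 13
   (the part before `?'), and urlpath_length ("dir/file.html") -> 13 as
   well, i.e. plain strlen() when no `?' is present.  */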
urlpath_length (const char *url)
  const char *q = strchr (url, '?');
  return strlen (url);
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is almost completely equivalent to
   { *e = '\0'; return strrchr (b, c); }, except that it doesn't change
   the contents of the string.  */
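/* Illustrative use: with B pointing at "foo/bar/baz" and E at B + 7 (the
   range "foo/bar"), find_last_char (B, E, '/') returns a pointer to the
   slash at B + 3, and NULL if C does not occur anywhere in [B, E).  */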
find_last_char (const char *b, const char *e, char c)
/* Construct a URL by concatenating an absolute URL and a path, which
   may or may not be absolute.  This tries to behave "reasonably" in
   all foreseeable cases.  It employs little specific knowledge about
   protocols or URL-specific stuff -- it just works on strings.  */
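/* Illustrative results (example values only), with no_proto set:

     construct ("http://host/dir/page.html", "img/x.gif", 9, 1)
       -> "http://host/dir/img/x.gif"
     construct ("http://host/dir/page.html", "/top.html", 9, 1)
       -> "http://host/top.html"

   With no_proto == 0, SUB is taken to be a complete URL in its own right
   and is simply copied out.  */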
construct (const char *url, const char *sub, int subsize, int no_proto)
  const char *end = url + urlpath_length (url);

      /* SUB is a relative URL: we need to replace everything
         after last slash (possibly empty) with SUB.

         So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
         our result should be "whatever/foo/qux/xyzzy".  */
      int need_explicit_slash = 0;
      const char *start_insert;
      const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */

          /* No slash found at all.  Append SUB to what we have,
             but we'll need a slash as a separator.

             Example: if url == "foo" and sub == "qux/xyzzy", then
             we cannot just append sub to url, because we'd get
             "fooqux/xyzzy", whereas what we want is
             "foo/qux/xyzzy".

             To make sure the / gets inserted, we set
             need_explicit_slash to 1.  We also set start_insert
             to end + 1, so that the length calculations work out
             correctly for one more (slash) character.  Accessing
             that character is fine, since it will be the
             delimiter, '\0' or '?'.  */
          /* example: "foo?..."                       */
          /*               ^  ('?' gets changed to '/') */
          start_insert = end + 1;
          need_explicit_slash = 1;
      else if (last_slash && last_slash != url && *(last_slash - 1) == '/')
          /* example: "http://host" */
          start_insert = end + 1;
          need_explicit_slash = 1;
          /* example: "whatever/foo/bar" */
          start_insert = last_slash + 1;
      span = start_insert - url;
      constr = (char *)xmalloc (span + subsize + 1);
      memcpy (constr, url, span);
      if (need_explicit_slash)
        constr[span - 1] = '/';
      memcpy (constr + span, sub, subsize);
      constr[span + subsize] = '\0';
  else /* *sub == `/' */
      /* SUB is an absolute path: we need to replace everything
         after (and including) the FIRST slash with SUB.

         So, if URL is "http://host/whatever/foo/bar", and SUB is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *start_insert = NULL; /* for gcc to shut up. */
      const char *pos = url;
      int seen_slash_slash = 0;
      /* We're looking for the first slash, but want to ignore
          slash = memchr (pos, '/', end - pos);
          if (slash && !seen_slash_slash)
            if (*(slash + 1) == '/')
                seen_slash_slash = 1;
      /* At this point, SLASH is the location of the first / after
         "//", or the first slash altogether.  START_INSERT is the
         pointer to the location where SUB will be inserted.  When
         examining the last two examples, keep in mind that SUB
      if (!slash && !seen_slash_slash)
        /* example: "foo" */
      else if (!slash && seen_slash_slash)
        /* example: "http://foo" */
      else if (slash && !seen_slash_slash)
        /* example: "foo/bar" */
      else if (slash && seen_slash_slash)
        /* example: "http://something/" */
        start_insert = slash;
      span = start_insert - url;
      constr = (char *)xmalloc (span + subsize + 1);
      memcpy (constr, url, span);
      memcpy (constr + span, sub, subsize);
      constr[span + subsize] = '\0';
  else /* !no_proto */
      constr = strdupdelim (sub, sub + subsize);
/* Like the function above, but with a saner caller interface.  */
url_concat (const char *base_url, const char *new_url)
  return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));

/* Optimize URL by host, destructively replacing u->host with realhost
   (u->host).  Do this regardless of opt.simple_check.  */
opt_url (struct urlinfo *u)
  /* Find the "true" host.  */
  char *host = realhost (u->host);
  assert (u->dir != NULL);      /* the URL must have been parsed */
  /* Refresh the printed representation.  */
  u->url = str_url (u, 0);

/* This beautiful kludge is fortunately not needed, as I've made
   parse_dir do the (almost) right thing, so that a query can never
   become a part of directory.  */
/* Call path_simplify, but make sure that the part after the
   question-mark, if any, is not destroyed by path_simplify's
path_simplify_with_kludge (char *path)
  char *query = strchr (path, '?');
  /* path_simplify also works destructively, so we also have the
     license to write.  */
  path_simplify (path);
      char *newend = path + strlen (path);
      if (newend != query)
        memmove (newend, query, strlen (query) + 1);

/* Returns proxy host address, in accordance with PROTO.  */
getproxy (uerr_t proto)
  if (proto == URLHTTP)
    return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
  else if (proto == URLFTP)
    return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");

/* Should a host be accessed through proxy, concerning no_proxy?  */
no_proxy_match (const char *host, const char **no_proxy)
  return !sufmatch (no_proxy, host);
static void write_backup_file PARAMS ((const char *, downloaded_file_t));

/* Change the links in an HTML document.  Accepts a structure that
   defines the positions of all the links.  */
convert_links (const char *file, urlpos *l)
  struct file_memory *fm;
  downloaded_file_t downloaded_file_return;

  logprintf (LOG_VERBOSE, _("Converting %s... "), file);

  /* First we do a "dry run": go through the list L and see whether
     any URL needs to be converted in the first place.  If not, just
     leave the file alone.  */
  for (dry = l; dry; dry = dry->next)
    if (dry->convert != CO_NOCONVERT)
      logputs (LOG_VERBOSE, _("nothing to do.\n"));
  fm = read_file (file);
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    write_backup_file (file, downloaded_file_return);

  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmaped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmaped region.  */
  if (unlink (file) < 0 && errno != ENOENT)
      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
                 file, strerror (errno));
      read_file_free (fm);
  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      read_file_free (fm);
  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references.  */
  for (; l; l = l->next)
      char *url_start = fm->content + l->pos;
      if (l->pos >= fm->length)
          DEBUGP (("Something strange is going on.  Please investigate."));
      /* If the URL is not to be converted, skip it.  */
      if (l->convert == CO_NOCONVERT)
          DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
      /* Echo the file contents, up to the offending URL's opening
         quote, to the outfile.  */
      fwrite (p, 1, url_start - p, fp);
      if (l->convert == CO_CONVERT_TO_RELATIVE)
          /* Convert absolute URL to relative.  */
          char *newname = construct_relative (file, l->local_name);
          char *quoted_newname = html_quote_string (newname);
          putc (*p, fp);        /* quoting char */
          fputs (quoted_newname, fp);
          putc (*p, fp);        /* close quote */
          xfree (quoted_newname);
          DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
                   l->url, newname, l->pos, file));
      else if (l->convert == CO_CONVERT_TO_COMPLETE)
          /* Convert the link to absolute URL.  */
          char *newlink = l->url;
          char *quoted_newlink = html_quote_string (newlink);
          putc (*p, fp);        /* quoting char */
          fputs (quoted_newlink, fp);
          putc (*p, fp);        /* close quote */
          xfree (quoted_newlink);
          DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
                   newlink, l->pos, file));
  /* Output the rest of the file.  */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);
  read_file_free (fm);
  logputs (LOG_VERBOSE, _("done.\n"));
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
construct_relative (const char *s1, const char *s2)
  int i, cnt, sepdirs1;
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  /* Skip the directories common to both strings.  */
  while (s1[i] && s2[i]
  if (s1[i] == '/' && s2[i] == '/')
  for (sepdirs1 = 0; s1[i]; i++)
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 times
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
/* Add URL to the head of the list L.  */
add_url (urlpos *l, const char *url, const char *file)
  t = (urlpos *)xmalloc (sizeof (urlpos));
  memset (t, 0, sizeof (*t));
  t->url = xstrdup (url);
  t->local_name = xstrdup (file);
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations.  */

  /* Construct the backup filename as the original name plus ".orig".  */
  size_t filename_len = strlen(file);
  char* filename_plus_orig_suffix;
  boolean already_wrote_backup_file = FALSE;
  slist* converted_file_ptr;
  static slist* converted_files = NULL;

  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
      /* Just write "orig" over "html".  We need to do it this way
         because when we're checking to see if we've downloaded the
         file before (to see if we can skip downloading it), we don't
         know if it's a text/html file.  Therefore we don't know yet
         at that stage that -E is going to cause us to tack on
         ".html", so we need to compare vs. the original URL plus
         ".orig", not the original URL plus ".html.orig".  */
      filename_plus_orig_suffix = alloca (filename_len + 1);
      strcpy(filename_plus_orig_suffix, file);
      strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
  else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
      /* Append ".orig" to the name.  */
      filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
      strcpy(filename_plus_orig_suffix, file);
      strcpy(filename_plus_orig_suffix + filename_len, ".orig");
  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main().  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file.  */
  converted_file_ptr = converted_files;
  while (converted_file_ptr != NULL)
    if (strcmp(converted_file_ptr->string, file) == 0)
        already_wrote_backup_file = TRUE;
    converted_file_ptr = converted_file_ptr->next;

  if (!already_wrote_backup_file)
      /* Rename <file> to <file>.orig before former gets written over.  */
      if (rename(file, filename_plus_orig_suffix) != 0)
        logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
                   file, filename_plus_orig_suffix, strerror (errno));

      /* Remember that we've already written a .orig backup for this file.
         Note that we never free this memory since we need it till the
         convert_all_links() call, which is one of the last things the
         program does before terminating.  BTW, I'm not sure if it would be
         safe to just set 'converted_file_ptr->string' to 'file' below,
         rather than making a copy of the string...  Another note is that I
         thought I could just add a field to the urlpos structure saying
         that we'd written a .orig file for this URL, but that didn't work,
         so I had to make this separate list.

         This [adding a field to the urlpos structure] didn't work
         because convert_file() is called twice: once after all its
         sublinks have been retrieved in recursive_retrieve(), and
         once at the end of the day in convert_all_links().  The
         original linked list collected in recursive_retrieve() is
         lost after the first invocation of convert_links(), and
         convert_all_links() makes a new one (it calls get_urls_html()
         for each file it covers.)  That's why your approach didn't
         work.  The way to make it work is perhaps to make this flag a
         field in the `urls_html' list.  */
      converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
      converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
      converted_file_ptr->next = converted_files;
      converted_files = converted_file_ptr;
typedef struct _downloaded_file_list {
  char* file;
  downloaded_file_t download_type;
  struct _downloaded_file_list* next;
} downloaded_file_list;

static downloaded_file_list *downloaded_files;
/* Remembers which files have been downloaded.  In the standard case, should be
   called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
   download successfully (i.e. not for ones we have failures on or that we skip
   due to -nc).

   When we've downloaded a file and tacked on a ".html" extension due to -E,
   call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
   FILE_DOWNLOADED_NORMALLY.

   If you just want to check if a file has been previously added without adding
   it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
   with local filenames, not remote URLs.  */
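/* Illustrative call sequence (example values only):

     downloaded_file (FILE_DOWNLOADED_NORMALLY, "idx.html")
       records "idx.html" and returns FILE_NOT_ALREADY_DOWNLOADED;
     downloaded_file (CHECK_FOR_FILE, "idx.html")
       now returns FILE_DOWNLOADED_NORMALLY;
     downloaded_file (CHECK_FOR_FILE, "other.html")
       returns FILE_NOT_ALREADY_DOWNLOADED without recording anything.  */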
downloaded_file (downloaded_file_t mode, const char* file)
  boolean found_file = FALSE;
  downloaded_file_list* rover = downloaded_files;

  while (rover != NULL)
    if (strcmp(rover->file, file) == 0)
    rover = rover->next;
    return rover->download_type;  /* file had already been downloaded */
  if (mode != CHECK_FOR_FILE)
      rover = xmalloc(sizeof(*rover));
      rover->file = xstrdup(file);  /* use xstrdup() so die on out-of-mem. */
      rover->download_type = mode;
      rover->next = downloaded_files;
      downloaded_files = rover;
  return FILE_NOT_ALREADY_DOWNLOADED;
downloaded_files_free (void)
  downloaded_file_list* rover = downloaded_files;
      downloaded_file_list *next = rover->next;
      xfree (rover->file);

/* Initialization of static stuff.  */
  init_unsafe_char_table ();