2 Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
34 #include <sys/types.h>
37 #endif /* HAVE_UNISTD_H */
43 #endif /* HAVE_STRING_H */
59 # include "gen_sslfunc.h" /* for ssl_iread */
66 /* See the comment in gethttp() why this is needed. */
/* Bumped once per completed retrieve_url() call (see the
   ++global_download_count statement near the end of retrieve_url). */
67 int global_download_count;
69 /* Total size of downloaded files. Used to enforce quota. */
/* NOTE(review): compared against opt.quota in retrieve_from_file() below;
   presumably updated by the http/ftp download loops -- the updating code
   is not visible in this elided listing. */
70 LARGE_INT total_downloaded_bytes;
/* Reset the bandwidth-throttling accounting: start a fresh chunk with a
   zero byte count and a zero chunk-start timestamp.  Called from
   fd_read_body() before each download so limit_bandwidth() measures from
   a clean slate.  (Elided listing: return type and braces are among the
   missing lines.) */
80 limit_bandwidth_reset (void)
82 limit_data.chunk_bytes = 0;
83 limit_data.chunk_start = 0;
86 /* Limit the bandwidth by pausing the download for an amount of time.
87 BYTES is the number of bytes received from the network, and TIMER
88 is the timer that started at the beginning of download. */
/* NOTE(review): numbered listing -- gaps in the line numbers mark elided
   source lines (return type, braces, the xsleep call, etc.). */
91 limit_bandwidth (long bytes, struct wget_timer *timer)
/* Milliseconds elapsed since the start of the current accounting chunk
   (the 1000.0 factor below implies wtimer_read returns ms). */
93 double delta_t = wtimer_read (timer) - limit_data.chunk_start;
96 limit_data.chunk_bytes += bytes;
98 /* Calculate the amount of time we expect downloading the chunk
99 should take. If in reality it took less time, sleep to
100 compensate for the difference. */
101 expected = 1000.0 * limit_data.chunk_bytes / opt.limit_rate;
103 if (expected > delta_t)
/* sleep_adjust carries the error from the previous sleep forward so the
   long-run average rate converges on opt.limit_rate. */
105 double slp = expected - delta_t + limit_data.sleep_adjust;
/* Presumably taken when slp is below some minimum: defer tiny sleeps
   until enough delay accumulates -- TODO confirm against elided lines. */
109 DEBUGP (("deferring a %.2f ms sleep (%ld/%.2f).\n",
110 slp, limit_data.chunk_bytes, delta_t));
113 DEBUGP (("\nsleeping %.2f ms for %ld bytes, adjust %.2f ms\n",
114 slp, limit_data.chunk_bytes, limit_data.sleep_adjust));
/* Bracket the (elided) sleep with timer reads so the actual slept time
   t1 - t0 can be measured. */
116 t0 = wtimer_read (timer);
118 wtimer_update (timer);
119 t1 = wtimer_read (timer);
121 /* Due to scheduling, we probably slept slightly longer (or
122 shorter) than desired. Calculate the difference between the
123 desired and the actual sleep, and adjust the next sleep by
125 limit_data.sleep_adjust = slp - (t1 - t0);
/* Begin a new accounting chunk. */
128 limit_data.chunk_bytes = 0;
129 limit_data.chunk_start = wtimer_read (timer);
/* Two-argument minimum.  Each argument is evaluated twice, so callers
   must pass side-effect-free expressions. */
132 #define MIN(i, j) ((i) <= (j) ? (i) : (j))
134 /* Reads the contents of file descriptor FD, until it is closed, or a
135 read error occurs. The data is read in 8K chunks, and stored to
136 stream fp, which should have been open for writing.
138 The EXPECTED argument is passed to show_progress() unchanged, but
141 If opt.verbose is set, the progress is also shown. RESTVAL
142 represents a value from which to start downloading (which will be
143 shown accordingly). If RESTVAL is non-zero, the stream should have
144 been open for appending.
146 The function exits and returns codes of 0, -1 and -2 if the
147 connection was closed, there was a read error, or if it could not
148 write to the output stream, respectively. */
/* NOTE(review): elided listing -- return type, braces, and several
   statements (including the error-handling paths that produce the -1/-2
   returns documented above) are among the missing lines. */
151 fd_read_body (int fd, FILE *out, long *len, long restval, long expected,
152 int use_expected, double *elapsed)
/* Shared static buffer: this function is not reentrant. */
156 static char dlbuf[16384];
157 int dlbufsize = sizeof (dlbuf);
/* Timer measuring the whole download; freed at the bottom. */
159 struct wget_timer *timer = wtimer_allocate ();
160 double last_successful_read_tm;
162 /* The progress gauge, set according to the user preferences. */
163 void *progress = NULL;
165 /* Non-zero if the progress gauge is interactive, i.e. if it can
166 continually update the display. When true, smaller timeout
167 values are used so that the gauge can update the display when
168 data arrives slowly. */
169 int progress_interactive = 0;
/* Presumably guarded by an (elided) opt.verbose / gauge check. */
175 progress = progress_create (restval, expected);
176 progress_interactive = progress_interactive_p (progress);
180 limit_bandwidth_reset ();
181 wtimer_reset (timer);
182 last_successful_read_tm = 0;
184 /* Use a smaller buffer for low requested bandwidths. For example,
185 with --limit-rate=2k, it doesn't make sense to slurp in 16K of
186 data and then sleep for 8s. With buffer size equal to the limit,
187 we never have to sleep for more than one second. */
188 if (opt.limit_rate && opt.limit_rate < dlbufsize)
189 dlbufsize = opt.limit_rate;
191 /* Read from fd while there is available data.
193 Normally, if expected is 0, it means that it is not known how
194 much data is expected. However, if use_expected is specified,
195 then expected being zero means exactly that. */
196 while (!use_expected || (*len < expected))
/* Never read past EXPECTED when the caller gave us a byte count. */
198 int amount_to_read = (use_expected
199 ? MIN (expected - *len, dlbufsize) : dlbufsize);
200 double tmout = opt.read_timeout;
201 if (progress_interactive)
204 /* For interactive progress gauges, always specify a ~1s
205 timeout, so that the gauge can be updated regularly even
206 when the data arrives very slowly or stalls. */
/* Seconds spent waiting since data last arrived (timer is in ms). */
208 waittm = (wtimer_read (timer) - last_successful_read_tm) / 1000;
209 if (waittm + tmout > opt.read_timeout)
211 /* Don't allow waiting time to exceed read timeout. */
212 tmout = opt.read_timeout - waittm;
215 /* We've already exceeded the timeout. */
216 res = -1, errno = ETIMEDOUT;
221 res = fd_read (fd, dlbuf, amount_to_read, tmout);
/* EOF or a hard read error ends the loop; ETIMEDOUT here is only the
   short per-iteration gauge timeout, handled below. */
223 if (res == 0 || (res < 0 && errno != ETIMEDOUT))
226 res = 0; /* timeout */
228 wtimer_update (timer);
/* NOTE(review): fwrite's return value is presumably checked in elided
   lines (the -2 "can't write" return) -- TODO confirm. */
231 fwrite (dlbuf, 1, res, out);
232 /* Always flush the contents of the network packet. This
233 should not hinder performance: fast downloads will be
234 received in 16K chunks (which stdio would write out
235 anyway), and slow downloads won't be limited by disk
243 last_successful_read_tm = wtimer_read (timer);
/* Throttle if --limit-rate is in effect (guard elided). */
247 limit_bandwidth (res, timer);
251 progress_update (progress, res, wtimer_read (timer));
/* Windows console-title percentage (ws_percenttitle). */
253 if (use_expected && expected > 0)
254 ws_percenttitle (100.0 * (double)(*len) / (double)expected);
262 progress_finish (progress, wtimer_read (timer));
/* Report total elapsed time to the caller (NULL check elided). */
264 *elapsed = wtimer_read (timer);
265 wtimer_delete (timer);
/* Callback type for fd_read_until: given the buffer, the number of bytes
   already read (tail) and the number just peeked, return a pointer past
   the terminator, or NULL if not found. */
270 typedef const char *(*finder_t) PARAMS ((const char *, int, int));
272 /* Driver for fd_read_line and fd_read_head: keeps reading data until
273 a terminator (as decided by FINDER) occurs in the data. The trick
274 is that the data is first peeked at, and only then actually read.
275 That way the data after the terminator is never read. */
/* NOTE(review): elided listing -- the loop construct, error returns and
   several branches are among the missing lines. */
278 fd_read_until (int fd, finder_t finder, int bufsize)
280 int size = bufsize, tail = 0;
281 char *buf = xmalloc (size);
286 int pklen, rdlen, remain;
288 /* First, peek at the available data. */
/* Peek (MSG_PEEK-style) without consuming, so bytes after the
   terminator stay in the socket for the next reader. */
290 pklen = fd_peek (fd, buf + tail, size - tail, -1);
296 end = finder (buf, tail, pklen);
299 /* The data contains the terminator: we'll read the data up
300 to the end of the terminator. */
301 remain = end - (buf + tail);
302 /* Note +1 for trailing \0. */
303 if (size < tail + remain + 1)
305 size = tail + remain + 1;
306 buf = xrealloc (buf, size);
310 /* No terminator: simply read the data we know is (or should
314 /* Now, read the data. Note that we make no assumptions about
315 how much data we'll get. (Some TCP stacks are notorious for
316 read returning less data than the previous MSG_PEEK.) */
318 rdlen = fd_read (fd, buf + tail, remain, 0);
328 /* EOF without anything having been read */
333 /* Return what we received so far. */
336 size = tail + 1; /* expand the buffer to receive the
338 buf = xrealloc (buf, size);
344 if (end && rdlen == remain)
346 /* The end was seen and the data read -- we got what we came
352 /* Keep looping until all the data arrives. */
/* Grow the buffer for the next peek+read round (growth policy elided). */
357 buf = xrealloc (buf, size);
/* Finder callback for fd_read_line: look for the first '\n' in the
   newly peeked region [buf+tail, buf+tail+peeklen).  Returns a pointer
   one past the newline (so the '\n' is included in the line), or --
   presumably, via elided lines -- NULL when no newline was peeked. */
363 line_terminator (const char *buf, int tail, int peeklen)
365 const char *p = memchr (buf + tail, '\n', peeklen);
367 /* p+1 because we want the line to include '\n' */
372 /* Read one line from FD and return it. The line is allocated using
375 If an error occurs, or if no data can be read, NULL is returned.
376 In the former case errno indicates the error condition, and in the
377 latter case, errno is NULL. */
/* NOTE(review): "errno is NULL" above presumably means errno is 0 --
   errno is an int, not a pointer.  The 128 below is only the initial
   buffer size; fd_read_until grows it as needed. */
380 fd_read_line (int fd)
382 return fd_read_until (fd, line_terminator, 128);
/* Finder callback for fd_read_head: scan the peeked region for the blank
   line that terminates an HTTP header block.  Rescans from four bytes
   before the old tail so a terminator split across two peeks (e.g.
   "\r\n\r\n") is still found -- the actual match logic is in elided
   lines; TODO confirm which terminator sequences are accepted. */
386 head_terminator (const char *buf, int tail, int peeklen)
388 const char *start, *end;
/* Back up 4 bytes to catch a terminator straddling the previous read
   (guard for tail < 4 presumably elided). */
392 start = buf + tail - 4;
393 end = buf + tail + peeklen;
395 for (; start < end - 1; start++)
402 if (start[1] == '\n')
408 /* Read the request head from FD and return it. The chunk of data is
409 allocated using malloc.
411 If an error occurs, or if no data can be read, NULL is returned.
412 In the former case errno indicates the error condition, and in the
413 latter case, errno is NULL. */
/* NOTE(review): as with fd_read_line, "errno is NULL" presumably means
   errno is 0.  512 is the initial buffer size only. */
416 fd_read_head (int fd)
418 return fd_read_until (fd, head_terminator, 512);
421 /* Return a printed representation of the download rate, as
422 appropriate for the speed. If PAD is non-zero, strings will be
423 padded to the width of 7 characters (xxxx.xx). */
/* NOTE(review): res is declared in elided lines -- presumably a static
   buffer, which would make this non-reentrant and mean the returned
   pointer is overwritten by the next call; TODO confirm. */
425 retr_rate (long bytes, double msecs, int pad)
428 static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
/* calc_rate picks the unit (index into rate_names) and scales dlrate. */
431 double dlrate = calc_rate (bytes, msecs, &units);
432 sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
437 /* Calculate the download rate and trim it as appropriate for the
438 speed. Appropriate means that if rate is greater than 1K/s,
439 kilobytes are used, and if rate is greater than 1MB/s, megabytes
442 UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
/* NOTE(review): elided lines presumably assert msecs/bytes sanity and
   contain the "if (msecs == 0)" guard this branch belongs to. */
445 calc_rate (long bytes, double msecs, int *units)
453 /* If elapsed time is exactly zero, it means we're under the
454 granularity of the timer. This often happens on systems that
455 use time() for the timer. */
456 msecs = wtimer_granularity ();
/* bytes per second: msecs is in milliseconds. */
458 dlrate = (double)1000 * bytes / msecs;
461 else if (dlrate < 1024.0 * 1024.0)
462 *units = 1, dlrate /= 1024.0;
463 else if (dlrate < 1024.0 * 1024.0 * 1024.0)
464 *units = 2, dlrate /= (1024.0 * 1024.0);
466 /* Maybe someone will need this, one day. */
467 *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
472 /* Maximum number of allowed redirections. 20 was chosen as a
473 "reasonable" value, which is low enough to not cause havoc, yet
474 high enough to guarantee that normal retrievals will not be hurt by
477 #define MAX_REDIRECTIONS 20
/* Temporarily clear the global POST settings while following a
   redirection (a redirected POST is refetched as GET).  Uses the
   do { } while (0) idiom; references locals (post_data_suspended,
   saved_post_data, saved_post_file_name) declared in retrieve_url, so
   these macros are only usable inside that function. */
479 #define SUSPEND_POST_DATA do { \
480 post_data_suspended = 1; \
481 saved_post_data = opt.post_data; \
482 saved_post_file_name = opt.post_file_name; \
483 opt.post_data = NULL; \
484 opt.post_file_name = NULL; \
/* Undo SUSPEND_POST_DATA; a no-op if nothing was suspended. */
487 #define RESTORE_POST_DATA do { \
488 if (post_data_suspended) \
490 opt.post_data = saved_post_data; \
491 opt.post_file_name = saved_post_file_name; \
492 post_data_suspended = 0; \
496 static char *getproxy PARAMS ((struct url *));
498 /* Retrieve the given URL. Decides which loop to call -- HTTP, FTP,
501 /* #### This function should be rewritten so it doesn't return from
/* NOTE(review): heavily elided -- braces, the redirect goto/loop target,
   and many error-path statements are among the missing lines.  On
   success, *file receives the local file name (or NULL) and, presumably
   via elided lines, *newloc the final URL after redirections. */
505 retrieve_url (const char *origurl, char **file, char **newloc,
506 const char *refurl, int *dt)
510 int location_changed, dummy;
511 char *mynewloc, *proxy;
512 struct url *u, *proxy_url;
513 int up_error_code; /* url parse error code */
/* Counts redirections followed; bounded by MAX_REDIRECTIONS below. */
515 int redirection_count = 0;
/* State for SUSPEND_POST_DATA / RESTORE_POST_DATA. */
517 int post_data_suspended = 0;
518 char *saved_post_data = NULL;
519 char *saved_post_file_name = NULL;
521 /* If dt is NULL, use local storage. */
/* Work on a private copy so redirections can replace it. */
527 url = xstrdup (origurl);
533 u = url_parse (url, &up_error_code);
/* Parse failure path (the surrounding "if (!u)" is elided). */
536 logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
/* Fall back to the --referer option when no referrer was passed. */
542 refurl = opt.referer;
/* NULL when no proxy applies to this URL (see getproxy below). */
551 proxy = getproxy (u);
554 /* Parse the proxy URL. */
555 proxy_url = url_parse (proxy, &up_error_code);
558 logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
559 proxy, url_error (up_error_code));
/* A proxy must speak HTTP, or at least match the target scheme. */
564 if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
566 logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
567 url_free (proxy_url);
/* Dispatch: HTTP(S), or FTP-through-HTTP-proxy, goes to http_loop. */
574 if (u->scheme == SCHEME_HTTP
576 || u->scheme == SCHEME_HTTPS
578 || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
580 result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
582 else if (u->scheme == SCHEME_FTP)
584 /* If this is a redirection, we must not allow recursive FTP
585 retrieval, so we save recursion to oldrec, and restore it
587 int oldrec = opt.recursive;
588 if (redirection_count)
590 result = ftp_loop (u, dt, proxy_url);
591 opt.recursive = oldrec;
593 /* There is a possibility of having HTTP being redirected to
594 FTP. In these cases we must decide whether the text is HTML
595 according to the suffix. The HTML suffixes are `.html',
596 `.htm' and a few others, case-insensitive. */
597 if (redirection_count && local_file && u->scheme == SCHEME_FTP)
599 if (has_html_suffix_p (local_file))
606 url_free (proxy_url);
610 location_changed = (result == NEWLOCATION);
611 if (location_changed)
613 char *construced_newloc;
614 struct url *newloc_parsed;
/* http_loop/ftp_loop must have supplied the Location value. */
616 assert (mynewloc != NULL);
621 /* The HTTP specs only allow absolute URLs to appear in
622 redirects, but a ton of boneheaded webservers and CGIs out
623 there break the rules and use relative URLs, and popular
624 browsers are lenient about this, so wget should be too. */
625 construced_newloc = uri_merge (url, mynewloc);
627 mynewloc = construced_newloc;
629 /* Now, see if this new location makes sense. */
630 newloc_parsed = url_parse (mynewloc, &up_error_code);
633 logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
634 url_error (up_error_code));
642 /* Now mynewloc will become newloc_parsed->url, because if the
643 Location contained relative paths like .././something, we
644 don't want that propagating as url. */
646 mynewloc = xstrdup (newloc_parsed->url);
648 /* Check for max. number of redirections. */
649 if (++redirection_count > MAX_REDIRECTIONS)
651 logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
653 url_free (newloc_parsed);
666 /* If we're being redirected from POST, we don't want to POST
667 again. Many requests answer POST with a redirection to an
668 index page; that redirection is clearly a GET. We "suspend"
669 POST data for the duration of the redirections, and restore
670 it when we're done. */
671 if (!post_data_suspended)
/* Record the download for -k/-K bookkeeping (guards elided). */
681 register_download (u->url, local_file);
682 if (redirection_count && 0 != strcmp (origurl, u->url))
683 register_redirection (origurl, u->url);
685 register_html (u->url, local_file);
690 *file = local_file ? local_file : NULL;
692 xfree_null (local_file);
696 if (redirection_count)
710 ++global_download_count;
716 /* Find the URLs in the file and call retrieve_url() for each of
717 them. If HTML is non-zero, treat the file as HTML, and construct
718 the URLs accordingly.
720 If opt.recursive is set, call retrieve_tree() for each file. */
/* NOTE(review): elided listing -- return type, braces and a few
   statements are missing.  COUNT receives the number of URLs processed. */
723 retrieve_from_file (const char *file, int html, int *count)
726 struct urlpos *url_list, *cur_url;
/* -i input: either scrape an HTML file for links or read one URL per
   line from a plain file. */
728 url_list = (html ? get_urls_html (file, NULL, NULL)
729 : get_urls_file (file));
730 status = RETROK; /* Suppose everything is OK. */
731 *count = 0; /* Reset the URL count. */
733 for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
735 char *filename = NULL, *new_file = NULL;
738 if (cur_url->ignore_when_downloading)
/* Stop the whole run once the --quota limit has been exceeded. */
741 if (opt.quota && total_downloaded_bytes > opt.quota)
/* Recursive/page-requisite retrieval for non-FTP schemes; plain
   one-shot retrieve_url otherwise. */
746 if ((opt.recursive || opt.page_requisites)
747 && cur_url->url->scheme != SCHEME_FTP)
748 status = retrieve_tree (cur_url->url->url);
750 status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
752 if (filename && opt.delete_after && file_exists_p (filename))
754 DEBUGP (("Removing file due to --delete-after in"
755 " retrieve_from_file():\n"));
756 logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
757 if (unlink (filename))
758 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
762 xfree_null (new_file);
763 xfree_null (filename);
766 /* Free the linked list of URL-s. */
767 free_urlpos (url_list);
772 /* Print `giving up', or `retrying', depending on the impending
773 action. N1 and N2 are the attempt number and the attempt limit. */
775 printwhat (int n1, int n2)
777 logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
780 /* If opt.wait or opt.waitretry are specified, and if certain
781 conditions are met, sleep the appropriate number of seconds. See
782 the documentation of --wait and --waitretry for more information.
784 COUNT is the count of current retrieval, beginning with 1. */
/* NOTE(review): elided listing -- braces and some branches (including
   the clearing of first_retrieval) are among the missing lines. */
787 sleep_between_retrievals (int count)
/* Persists across calls: suppresses any delay before the very first
   fetch of the session.  Not thread-safe (static state). */
789 static int first_retrieval = 1;
793 /* Don't sleep before the very first retrieval. */
798 if (opt.waitretry && count > 1)
800 /* If opt.waitretry is specified and this is a retry, wait for
801 COUNT-1 number of seconds, or for opt.waitretry seconds. */
/* Linear backoff capped at opt.waitretry (the count-1 branch is in
   elided lines). */
802 if (count <= opt.waitretry)
805 xsleep (opt.waitretry);
809 if (!opt.random_wait || count > 1)
810 /* If random-wait is not specified, or if we are sleeping
811 between retries of the same download, sleep the fixed
816 /* Sleep a random amount of time averaging in opt.wait
817 seconds. The sleeping amount ranges from 0 to
818 opt.wait*2, inclusive. */
819 double waitsecs = 2 * opt.wait * random_float ();
820 DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
821 opt.wait, waitsecs));
827 /* Free the linked list of urlpos. */
/* Walks the list, saving each node's successor before freeing the node's
   members (xfree_null tolerates NULL fields).  Loop construct and the
   remaining frees are in elided lines. */
829 free_urlpos (struct urlpos *l)
833 struct urlpos *next = l->next;
836 xfree_null (l->local_name);
842 /* Rotate FNAME opt.backups times */
/* Shifts existing numbered backups up by one (fname.1 -> fname.2, ...)
   and finally renames FNAME itself to fname.1; the rename() calls are in
   elided lines.  Both name buffers live on the stack via alloca, sized
   for the widest possible ".<n>" suffix. */
844 rotate_backups(const char *fname)
846 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
847 char *from = (char *)alloca (maxlen);
848 char *to = (char *)alloca (maxlen);
/* Only regular files are rotated; bail out on directories etc. */
852 if (stat (fname, &sb) == 0)
853 if (S_ISREG (sb.st_mode) == 0)
/* Shift from the highest-numbered backup down to .1. */
856 for (i = opt.backups; i > 1; i--)
858 sprintf (from, "%s.%d", fname, i - 1);
859 sprintf (to, "%s.%d", fname, i);
863 sprintf (to, "%s.%d", fname, 1);
867 static int no_proxy_match PARAMS ((const char *, const char **));
869 /* Return the URL of the proxy appropriate for url U. */
/* NOTE(review): elided listing -- the scheme switch, braces, and the
   final return are among the missing lines.  Returns NULL when no proxy
   applies; may return a pointer into static storage (below), so the
   result must not be freed and is clobbered by the next call. */
872 getproxy (struct url *u)
876 static char rewritten_storage[1024];
/* Honor --no-proxy / no_proxy exclusions first. */
880 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
/* Per-scheme proxy: command-line option wins over environment. */
886 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
890 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
894 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
899 if (!proxy || !*proxy)
902 /* Handle shorthands. `rewritten_storage' is a kludge to allow
903 getproxy() to return static storage. */
904 rewritten_url = rewrite_shorthand_url (proxy);
/* strncpy may leave the buffer unterminated; the explicit NUL on the
   next line guards against that. */
907 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
908 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
909 proxy = rewritten_storage;
915 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns nonzero when HOST should go through the proxy, i.e. when it
   does NOT suffix-match any entry in the NO_PROXY list (the NULL-list
   shortcut is in elided lines). */
917 no_proxy_match (const char *host, const char **no_proxy)
922 return !sufmatch (no_proxy, host);