2 Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
34 #include <sys/types.h>
37 #endif /* HAVE_UNISTD_H */
43 #endif /* HAVE_STRING_H */
59 # include "gen_sslfunc.h" /* for ssl_iread */
66 /* See the comment in gethttp() why this is needed. */
67 int global_download_count;
/* NOTE(review): LARGE_INT is a project typedef (declared outside this
   excerpt) — presumably the widest integer type available; confirm in
   wget.h.  Compared against opt.quota to enforce --quota. */
69 /* Total size of downloaded files. Used to enforce quota. */
70 LARGE_INT total_downloaded_bytes;
/* Reset the file-scope limit_data accounting used by
   limit_bandwidth(): forget the bytes counted so far in the current
   chunk and restart the chunk clock.  Called before a download begins
   (see get_contents).  NOTE(review): the return type and braces of
   this function fall on lines not visible in this excerpt. */
80 limit_bandwidth_reset (void)
82   limit_data.chunk_bytes = 0;
83   limit_data.chunk_start = 0;
86 /* Limit the bandwidth by pausing the download for an amount of time.
87 BYTES is the number of bytes received from the network, and TIMER
88 is the timer that started at the beginning of download. */
91 limit_bandwidth (long bytes, struct wget_timer *timer)
/* delta_t: wall-clock ms elapsed since the current accounting chunk
   began (wtimer_read returns ms, per the 1000.0 factor below). */
93   double delta_t = wtimer_read (timer) - limit_data.chunk_start;
96   limit_data.chunk_bytes += bytes;
98   /* Calculate the amount of time we expect downloading the chunk
99      should take. If in reality it took less time, sleep to
100      compensate for the difference. */
101   expected = 1000.0 * limit_data.chunk_bytes / opt.limit_rate;
103   if (expected > delta_t)
/* slp carries over sleep_adjust, the error measured on the previous
   sleep, so scheduling jitter does not accumulate over time. */
105       double slp = expected - delta_t + limit_data.sleep_adjust;
/* NOTE(review): a branch that defers sub-threshold sleeps (returning
   early) appears to live on lines 106-112, not visible here. */
109           DEBUGP (("deferring a %.2f ms sleep (%ld/%.2f).\n",
110                    slp, limit_data.chunk_bytes, delta_t));
113       DEBUGP (("\nsleeping %.2f ms for %ld bytes, adjust %.2f ms\n",
114                slp, limit_data.chunk_bytes, limit_data.sleep_adjust));
116       t0 = wtimer_read (timer);
/* NOTE(review): the actual sleep call (presumably xsleep(slp / 1000))
   sits on line 117, which is missing from this excerpt — confirm. */
118       wtimer_update (timer);
119       t1 = wtimer_read (timer);
121       /* Due to scheduling, we probably slept slightly longer (or
122          shorter) than desired. Calculate the difference between the
123          desired and the actual sleep, and adjust the next sleep by
125       limit_data.sleep_adjust = slp - (t1 - t0);
/* Start a fresh accounting chunk from "now". */
128   limit_data.chunk_bytes = 0;
129   limit_data.chunk_start = wtimer_read (timer);
/* Classic textual MIN.  CAUTION: evaluates each argument twice, so do
   not pass expressions with side effects (e.g. MIN(i++, j)). */
132 #define MIN(i, j) ((i) <= (j) ? (i) : (j))
134 /* Reads the contents of file descriptor FD, until it is closed, or a
135    read error occurs. The data is read in 8K chunks, and stored to
136    stream fp, which should have been open for writing. If BUF is
137    non-NULL and its file descriptor is equal to FD, flush RBUF first.
138    This function will *not* use the rbuf_* functions!
140    The EXPECTED argument is passed to show_progress() unchanged, but
143    If opt.verbose is set, the progress is also shown. RESTVAL
144    represents a value from which to start downloading (which will be
145    shown accordingly). If RESTVAL is non-zero, the stream should have
146    been open for appending.
148    The function exits and returns codes of 0, -1 and -2 if the
149    connection was closed, there was a read error, or if it could not
150    write to the output stream, respectively.
152    IMPORTANT: The function flushes the contents of the buffer in
153    rbuf_flush() before actually reading from fd. If you wish to read
154    from fd immediately, flush or discard the buffer. */
156 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
157               struct rbuf *rbuf, int use_expected, double *elapsed)
/* NOTE: dlbuf is static, so this function is not reentrant and not
   thread-safe — fine for wget's single-threaded design. */
161   static char dlbuf[16384];
162   int dlbufsize = sizeof (dlbuf);
164   struct wget_timer *timer = wtimer_allocate ();
165   double last_successful_read_tm;
167   /* The progress gauge, set according to the user preferences. */
168   void *progress = NULL;
170   /* Non-zero if the progress gauge is interactive, i.e. if it can
171      continually update the display. When true, smaller timeout
172      values are used so that the gauge can update the display when
173      data arrives slowly. */
174   int progress_interactive = 0;
/* NOTE(review): the guard (presumably "if (opt.verbose)") for creating
   the gauge is on lines not visible in this excerpt. */
180       progress = progress_create (restval, expected);
181       progress_interactive = progress_interactive_p (progress);
/* Drain any data already buffered in RBUF for this fd before reading
   from the descriptor directly (see the IMPORTANT note above). */
184   if (rbuf && RBUF_FD (rbuf) == fd)
187       while ((res = rbuf_flush (rbuf, dlbuf, sizeof (dlbuf))) != 0)
/* NOTE(review): the fwrite result is not checked on this line; error
   detection presumably happens via ferror() on lines not shown. */
189           fwrite (dlbuf, 1, res, fp);
201           progress_update (progress, sz, 0);
/* Start bandwidth accounting and the download timer from zero. */
205   limit_bandwidth_reset ();
206   wtimer_reset (timer);
207   last_successful_read_tm = 0;
209   /* Use a smaller buffer for low requested bandwidths. For example,
210      with --limit-rate=2k, it doesn't make sense to slurp in 16K of
211      data and then sleep for 8s. With buffer size equal to the limit,
212      we never have to sleep for more than one second. */
213   if (opt.limit_rate && opt.limit_rate < dlbufsize)
214     dlbufsize = opt.limit_rate;
216   /* Read from fd while there is available data.
218      Normally, if expected is 0, it means that it is not known how
219      much data is expected. However, if use_expected is specified,
220      then expected being zero means exactly that. */
221   while (!use_expected || (*len < expected))
223       int amount_to_read = (use_expected
224                             ? MIN (expected - *len, dlbufsize) : dlbufsize);
225       double tmout = opt.read_timeout;
226       if (progress_interactive)
229           /* For interactive progress gauges, always specify a ~1s
230              timeout, so that the gauge can be updated regularly even
231              when the data arrives very slowly or stalls. */
/* waittm: seconds since the last successful read (timer is in ms). */
233           waittm = (wtimer_read (timer) - last_successful_read_tm) / 1000;
234           if (waittm + tmout > opt.read_timeout)
236               /* Don't allow waiting time to exceed read timeout. */
237               tmout = opt.read_timeout - waittm;
240                   /* We've already exceeded the timeout. */
241                   res = -1, errno = ETIMEDOUT;
246       res = fd_read (fd, dlbuf, amount_to_read, tmout);
/* res == 0: peer closed the connection (normal end of download).
   res < 0 with errno != ETIMEDOUT: hard read error -> bail out. */
248       if (res == 0 || (res < 0 && errno != ETIMEDOUT))
251         res = 0;                /* timeout */
253       wtimer_update (timer);
256           fwrite (dlbuf, 1, res, fp);
257           /* Always flush the contents of the network packet. This
258              should not hinder performance: fast downloads will be
259              received in 16K chunks (which stdio would write out
260              anyway), and slow downloads won't be limited by disk
268           last_successful_read_tm = wtimer_read (timer);
/* NOTE(review): presumably guarded by "if (opt.limit_rate)" on a line
   not visible here — confirm. */
272         limit_bandwidth (res, timer);
276         progress_update (progress, res, wtimer_read (timer));
/* Windows-only: reflect completion percentage in the console title. */
278       if (use_expected && expected > 0)
279         ws_percenttitle (100.0 * (double)(*len) / (double)expected);
287     progress_finish (progress, wtimer_read (timer));
/* NOTE(review): presumably guarded by "if (elapsed)" — confirm. */
289     *elapsed = wtimer_read (timer);
290   wtimer_delete (timer);
295 /* Return a printed representation of the download rate, as
296    appropriate for the speed. If PAD is non-zero, strings will be
297    padded to the width of 7 characters (xxxx.xx). */
299 retr_rate (long bytes, double msecs, int pad)
/* NOTE(review): `res` (the returned buffer, presumably static) and
   `units` are declared on lines not visible in this excerpt; if res is
   static, the returned string is overwritten by the next call. */
302   static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
/* calc_rate picks the unit (0..3) and scales dlrate accordingly, so
   units is always a valid index into rate_names. */
305   double dlrate = calc_rate (bytes, msecs, &units);
306   sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
311 /* Calculate the download rate and trim it as appropriate for the
312    speed. Appropriate means that if rate is greater than 1K/s,
313    kilobytes are used, and if rate is greater than 1MB/s, megabytes
316    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
319 calc_rate (long bytes, double msecs, int *units)
/* NOTE(review): the "if (msecs == 0)" guard presumably sits on a line
   not visible here, making the next statement its body — confirm. */
327   /* If elapsed time is exactly zero, it means we're under the
328      granularity of the timer. This often happens on systems that
329      use time() for the timer. */
330     msecs = wtimer_granularity ();
/* bytes-per-second: msecs is milliseconds, hence the factor 1000. */
332   dlrate = (double)1000 * bytes / msecs;
335   else if (dlrate < 1024.0 * 1024.0)
336     *units = 1, dlrate /= 1024.0;
337   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
338     *units = 2, dlrate /= (1024.0 * 1024.0);
340     /* Maybe someone will need this, one day. */
341     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
346 /* Maximum number of allowed redirections. 20 was chosen as a
347    "reasonable" value, which is low enough to not cause havoc, yet
348    high enough to guarantee that normal retrievals will not be hurt by
/* Checked against redirection_count in retrieve_url(). */
351 #define MAX_REDIRECTIONS 20
/* Temporarily stash the POST payload while following redirections:
   a redirect answered to a POST must be fetched with GET (see the
   comment at the use site in retrieve_url).  NOTE(review): the closing
   "} while (0)" of each macro is on lines not visible here.  These
   macros reference locals (post_data_suspended, saved_post_data,
   saved_post_file_name) of the enclosing function. */
353 #define SUSPEND_POST_DATA do {                  \
354   post_data_suspended = 1;                      \
355   saved_post_data = opt.post_data;              \
356   saved_post_file_name = opt.post_file_name;    \
357   opt.post_data = NULL;                         \
358   opt.post_file_name = NULL;                    \
/* Undo SUSPEND_POST_DATA; a no-op when nothing was suspended. */
361 #define RESTORE_POST_DATA do {                  \
362   if (post_data_suspended)                      \
364       opt.post_data = saved_post_data;          \
365       opt.post_file_name = saved_post_file_name; \
366       post_data_suspended = 0;                  \
370 static char *getproxy PARAMS ((struct url *));
372 /* Retrieve the given URL. Decides which loop to call -- HTTP, FTP,
375 /* #### This function should be rewritten so it doesn't return from
/* Fetch ORIGURL, following redirections (up to MAX_REDIRECTIONS).
   FILE, if non-NULL, receives the local file name (caller frees);
   NEWLOC receives the final location after redirects; REFURL is the
   Referer; DT receives document-type flags.  Returns a uerr_t status.
   NOTE(review): many interior lines (labels, frees, returns) are not
   visible in this excerpt; comments below are hedged accordingly. */
379 retrieve_url (const char *origurl, char **file, char **newloc,
380               const char *refurl, int *dt)
384   int location_changed, dummy;
385   char *mynewloc, *proxy;
386   struct url *u, *proxy_url;
387   int up_error_code;            /* url parse error code */
389   int redirection_count = 0;
/* State for SUSPEND_POST_DATA / RESTORE_POST_DATA across redirects. */
391   int post_data_suspended = 0;
392   char *saved_post_data = NULL;
393   char *saved_post_file_name = NULL;
395   /* If dt is NULL, use local storage. */
/* url is a mutable working copy; origurl stays untouched for the
   redirection-registration comparison below. */
401   url = xstrdup (origurl);
407   u = url_parse (url, &up_error_code);
/* NOTE(review): the NULL check on u presumably precedes this — the
   guard line is not visible here. */
410       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
/* Fall back to the global --referer option when none was passed. */
416     refurl = opt.referer;
/* redirected: (label not visible) — re-entered for each redirect. */
425   proxy = getproxy (u);
428       /* Parse the proxy URL. */
429       proxy_url = url_parse (proxy, &up_error_code);
432           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
433                      proxy, url_error (up_error_code));
/* Only HTTP proxies (or a proxy of the same scheme) are supported. */
438       if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
440           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
441           url_free (proxy_url);
/* Dispatch: HTTP(S), or anything going through an HTTP proxy, is
   handled by http_loop; plain FTP by ftp_loop. */
448   if (u->scheme == SCHEME_HTTP
450       || u->scheme == SCHEME_HTTPS
452       || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
454       result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
456   else if (u->scheme == SCHEME_FTP)
458       /* If this is a redirection, we must not allow recursive FTP
459          retrieval, so we save recursion to oldrec, and restore it
461       int oldrec = opt.recursive;
462       if (redirection_count)
464       result = ftp_loop (u, dt, proxy_url);
465       opt.recursive = oldrec;
467       /* There is a possibility of having HTTP being redirected to
468          FTP. In these cases we must decide whether the text is HTML
469          according to the suffix. The HTML suffixes are `.html',
470          `.htm' and a few others, case-insensitive. */
471       if (redirection_count && local_file && u->scheme == SCHEME_FTP)
473           if (has_html_suffix_p (local_file))
480     url_free (proxy_url);
484   location_changed = (result == NEWLOCATION);
485   if (location_changed)
487       char *construced_newloc;
488       struct url *newloc_parsed;
490       assert (mynewloc != NULL);
495       /* The HTTP specs only allow absolute URLs to appear in
496          redirects, but a ton of boneheaded webservers and CGIs out
497          there break the rules and use relative URLs, and popular
498          browsers are lenient about this, so wget should be too. */
499       construced_newloc = uri_merge (url, mynewloc);
501       mynewloc = construced_newloc;
503       /* Now, see if this new location makes sense. */
504       newloc_parsed = url_parse (mynewloc, &up_error_code);
507           logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
508                      url_error (up_error_code));
516       /* Now mynewloc will become newloc_parsed->url, because if the
517          Location contained relative paths like .././something, we
518          don't want that propagating as url. */
520       mynewloc = xstrdup (newloc_parsed->url);
522       /* Check for max. number of redirections. */
523       if (++redirection_count > MAX_REDIRECTIONS)
525           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
527           url_free (newloc_parsed);
540       /* If we're being redirected from POST, we don't want to POST
541          again. Many requests answer POST with a redirection to an
542          index page; that redirection is clearly a GET. We "suspend"
543          POST data for the duration of the redirections, and restore
544          it when we're done. */
545       if (!post_data_suspended)
/* NOTE(review): SUSPEND_POST_DATA and a "goto redirected" presumably
   follow on lines not visible here — confirm. */
/* Record the download for -k/--convert-links bookkeeping. */
555       register_download (u->url, local_file);
556       if (redirection_count && 0 != strcmp (origurl, u->url))
557         register_redirection (origurl, u->url);
559         register_html (u->url, local_file);
564       *file = local_file ? local_file : NULL;
566     xfree_null (local_file);
570   if (redirection_count)
584   ++global_download_count;
590 /* Find the URLs in the file and call retrieve_url() for each of
591    them. If HTML is non-zero, treat the file as HTML, and construct
592    the URLs accordingly.
594    If opt.recursive is set, call retrieve_tree() for each file. */
597 retrieve_from_file (const char *file, int html, int *count)
600   struct urlpos *url_list, *cur_url;
/* Extract the URL list either by parsing FILE as HTML or by reading
   one URL per line. */
602   url_list = (html ? get_urls_html (file, NULL, NULL)
603               : get_urls_file (file));
604   status = RETROK;              /* Suppose everything is OK.  */
605   *count = 0;                   /* Reset the URL count.  */
607   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
609       char *filename = NULL, *new_file = NULL;
612       if (cur_url->ignore_when_downloading)
/* Stop iterating once the --quota limit has been exceeded.
   NOTE(review): the loop-exit statement (presumably break with
   status = QUOTEXC) is on lines not visible here. */
615       if (opt.quota && total_downloaded_bytes > opt.quota)
/* Recursive/page-requisite retrieval only applies to non-FTP URLs;
   everything else goes through a single retrieve_url() call. */
620       if ((opt.recursive || opt.page_requisites)
621           && cur_url->url->scheme != SCHEME_FTP)
622         status = retrieve_tree (cur_url->url->url);
624         status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
626       if (filename && opt.delete_after && file_exists_p (filename))
628           DEBUGP (("Removing file due to --delete-after in"
629                    " retrieve_from_file():\n"));
630           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
631           if (unlink (filename))
632             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
636       xfree_null (new_file);
637       xfree_null (filename);
640   /* Free the linked list of URL-s.  */
641   free_urlpos (url_list);
646 /* Print `giving up', or `retrying', depending on the impending
647    action. N1 and N2 are the attempt number and the attempt limit. */
649 printwhat (int n1, int n2)
/* On the final attempt (n1 == n2) announce "Giving up"; otherwise
   "Retrying".  Verbose-level log only. */
651   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
654 /* If opt.wait or opt.waitretry are specified, and if certain
655    conditions are met, sleep the appropriate number of seconds. See
656    the documentation of --wait and --waitretry for more information.
658    COUNT is the count of current retrieval, beginning with 1. */
661 sleep_between_retrievals (int count)
/* Static flag: survives across calls so only the very first retrieval
   of the whole session skips the sleep.  Not thread-safe (fine for
   wget's single-threaded design). */
663   static int first_retrieval = 1;
667     /* Don't sleep before the very first retrieval. */
/* --waitretry: linear backoff between retries of the same URL —
   sleep count-1 seconds, capped at opt.waitretry. */
672   if (opt.waitretry && count > 1)
674       /* If opt.waitretry is specified and this is a retry, wait for
675          COUNT-1 number of seconds, or for opt.waitretry seconds. */
676       if (count <= opt.waitretry)
/* NOTE(review): the "sleep count-1 seconds" branch falls on lines not
   visible here; this line is the cap case. */
679         xsleep (opt.waitretry);
683       if (!opt.random_wait || count > 1)
684         /* If random-wait is not specified, or if we are sleeping
685            between retries of the same download, sleep the fixed
690           /* Sleep a random amount of time averaging in opt.wait
691              seconds. The sleeping amount ranges from 0 to
692              opt.wait*2, inclusive. */
693           double waitsecs = 2 * opt.wait * random_float ();
694           DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
695                    opt.wait, waitsecs));
701 /* Free the linked list of urlpos. */
703 free_urlpos (struct urlpos *l)
/* Walk the list, capturing `next` before freeing each node.
   NOTE(review): the loop header and the frees of l->url / l itself are
   on lines not visible in this excerpt. */
707       struct urlpos *next = l->next;
710       xfree_null (l->local_name);
716 /* Rotate FNAME opt.backups times */
718 rotate_backups(const char *fname)
/* maxlen: fname + '.' + widest backup number + NUL. */
720   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
/* alloca: stack-allocated; safe here because maxlen is small
   (a file name plus a few digits). */
721   char *from = (char *)alloca (maxlen);
722   char *to = (char *)alloca (maxlen);
/* Only rotate regular files; skip directories, FIFOs, etc.
   NOTE(review): the early return for the non-regular case is on a
   line not visible here. */
726   if (stat (fname, &sb) == 0)
727     if (S_ISREG (sb.st_mode) == 0)
/* Shift fname.1 -> fname.2 -> ... -> fname.opt.backups, oldest first.
   NOTE(review): the rename() calls presumably follow each sprintf
   pair on lines not shown. */
730   for (i = opt.backups; i > 1; i--)
732       sprintf (from, "%s.%d", fname, i - 1);
733       sprintf (to, "%s.%d", fname, i);
737   sprintf (to, "%s.%d", fname, 1);
741 static int no_proxy_match PARAMS ((const char *, const char **));
743 /* Return the URL of the proxy appropriate for url U. */
746 getproxy (struct url *u)
/* Static storage lets this function return a pointer into a rewritten
   shorthand URL without allocating; the result is overwritten by the
   next call and must not be freed by the caller. */
750   static char rewritten_storage[1024];
/* Honor --no-proxy / no_proxy: if the host matches, use no proxy.
   NOTE(review): the early "return NULL" is on a line not visible. */
754   if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
/* Per-scheme proxy choice: command-line option wins over the
   conventional environment variable. */
760       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
764       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
768       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
773   if (!proxy || !*proxy)
776   /* Handle shorthands. `rewritten_storage' is a kludge to allow
777      getproxy() to return static storage. */
778   rewritten_url = rewrite_shorthand_url (proxy);
/* strncpy may truncate a >1023-char proxy URL; the explicit NUL below
   guarantees termination either way. */
781       strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
782       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
783       proxy = rewritten_storage;
789 /* Should a host be accessed through proxy, concerning no_proxy? */
791 no_proxy_match (const char *host, const char **no_proxy)
/* Returns non-zero when HOST should go through the proxy, i.e. when
   it does NOT suffix-match any entry of the no_proxy list.
   NOTE(review): the "if (!no_proxy) return 1;" fast path presumably
   sits on lines not visible here. */
796     return !sufmatch (no_proxy, host);