Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#include <sys/types.h>
#endif /* HAVE_UNISTD_H */
#endif /* HAVE_STRING_H */
# include "gen_sslfunc.h"       /* for ssl_iread */
/* See the comment in gethttp() for why this is needed.  */
int global_download_count;
limit_bandwidth_reset (void)
  limit_data.dltime = 0;
/* Limit the bandwidth by pausing the download for an amount of time.
   BYTES is the number of bytes received from the network, and DELTA
   is the time it took to receive them, in milliseconds.  */
limit_bandwidth (long bytes, long delta)
  limit_data.bytes += bytes;
  limit_data.dltime += delta;
  expected = (long)(1000.0 * limit_data.bytes / opt.limit_rate);
  if (expected > limit_data.dltime)
      long slp = expected - limit_data.dltime;
          DEBUGP (("deferring a %ld ms sleep (%ld/%ld) until later.\n",
                   slp, limit_data.bytes, limit_data.dltime));
      DEBUGP (("sleeping %ld ms\n", slp));
  limit_data.dltime = 0;
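/* Standalone sketch of the throttling idea used above (not part of the
   original file): sleep long enough that the observed transfer rate
   does not exceed LIMIT_RATE bytes per second.  The helper name
   throttle_sketch and the direct call to usleep() are assumptions made
   for illustration only.  */
#if 0
static void
throttle_sketch (long bytes, long elapsed_ms, long limit_rate)
{
  /* Time the transfer should have taken at the allowed rate.  */
  long expected_ms = (long) (1000.0 * bytes / limit_rate);
  if (expected_ms > elapsed_ms)
    usleep (1000 * (expected_ms - elapsed_ms));   /* sleep off the surplus */
}
#endif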
#define MIN(i, j) ((i) <= (j) ? (i) : (j))
/* Reads the contents of file descriptor FD until it is closed, or a
   read error occurs.  The data is read in 8K chunks and stored to
   stream FP, which should have been opened for writing.  If RBUF is
   non-NULL and its file descriptor is equal to FD, flush RBUF first.
   This function will *not* use the rbuf_* functions!

   The EXPECTED argument is passed to show_progress() unchanged, but
   otherwise ignored.

   If opt.verbose is set, the progress is also shown.  RESTVAL
   represents a value from which to start downloading (which will be
   shown accordingly).  If RESTVAL is non-zero, the stream should have
   been opened for appending.

   The function exits with return code 0, -1, or -2 if the connection
   was closed, a read error occurred, or the output stream could not
   be written to, respectively.

   IMPORTANT: The function flushes the contents of the buffer in
   rbuf_flush() before actually reading from FD.  If you wish to read
   from FD immediately, flush or discard the buffer.  */
get_contents (int fd, FILE *fp, long *len, long restval, long expected,
              struct rbuf *rbuf, int use_expected, long *elapsed)
  void *progress = NULL;
  struct wget_timer *timer = wtimer_allocate ();
  long dltime = 0, last_dltime = 0;
    progress = progress_create (restval, expected);
  if (rbuf && RBUF_FD (rbuf) == fd)
      while ((res = rbuf_flush (rbuf, c, sizeof (c))) != 0)
          fwrite (c, sizeof (char), res, fp);
        progress_update (progress, sz, 0);
  limit_bandwidth_reset ();
  wtimer_reset (timer);
  /* Read from FD while there is data available.

     Normally, if EXPECTED is 0, it means that it is not known how
     much data is expected.  However, if USE_EXPECTED is specified,
     then EXPECTED being zero means exactly that -- that zero bytes
     are expected.  */
  while (!use_expected || (*len < expected))
      int amount_to_read = (use_expected
                            ? MIN (expected - *len, sizeof (c))
        res = ssl_iread (rbuf->ssl, c, amount_to_read);
#endif /* HAVE_SSL */
        res = iread (fd, c, amount_to_read);
          fwrite (c, sizeof (char), res, fp);
          /* Always flush the contents of the network packet.  This
             should not hurt performance, as network packets are
             typically not too tiny anyway.  */
      /* If bandwidth is not limited, one call to wtimer_elapsed is
         sufficient.  */
      dltime = wtimer_elapsed (timer);
          limit_bandwidth (res, dltime - last_dltime);
          dltime = wtimer_elapsed (timer);
          last_dltime = dltime;
        progress_update (progress, res, dltime);
    progress_finish (progress, dltime);
  wtimer_delete (timer);
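/* Hypothetical caller sketch (not in the original file) showing how the
   return codes documented above might be handled; fd and fp stand for
   an already-connected socket and an output stream opened for
   writing.  */
#if 0
  long len = 0;
  int err = get_contents (fd, fp, &len, 0L, 0L, NULL, 0, NULL);
  if (err == 0)
    ;   /* connection closed normally; len holds the byte count */
  else if (err == -1)
    ;   /* read error on the network side */
  else  /* err == -2 */
    ;   /* could not write to the output stream */
#endif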
/* Return a printed representation of the download rate, as
   appropriate for the speed.  If PAD is non-zero, strings will be
   padded to the width of 7 characters (xxxx.xx).  */
retr_rate (long bytes, long msecs, int pad)
  static char *rate_names[] = { "B/s", "KB/s", "MB/s", "GB/s" };
  double dlrate = calc_rate (bytes, msecs, &units);
  sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
/* Calculate the download rate and trim it as appropriate for the
   speed.  Appropriate means that if the rate is greater than 1KB/s,
   kilobytes are used, and if the rate is greater than 1MB/s,
   megabytes are used.

   UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
   GB/s.  */
calc_rate (long bytes, long msecs, int *units)
    /* If elapsed time is 0, it means we're under the granularity of
       the timer.  This often happens on systems that use time() for
       the timer.  */
    msecs = wtimer_granularity ();
  dlrate = (double)1000 * bytes / msecs;
  else if (dlrate < 1024.0 * 1024.0)
    *units = 1, dlrate /= 1024.0;
  else if (dlrate < 1024.0 * 1024.0 * 1024.0)
    *units = 2, dlrate /= (1024.0 * 1024.0);
    /* Maybe someone will need this one day.  More realistically, it
       will get tickled by buggy timers.  */
    *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
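/* Worked example with illustrative numbers (not part of the original
   file): 1048576 bytes transferred in 2000 ms is 524288 B/s, so
   calc_rate() reports 512.0 with *units == 1, and retr_rate() would
   print "512.00 KB/s".  */
#if 0
  int units;
  double r = calc_rate (1048576L, 2000L, &units);   /* r == 512.0, units == 1 */
#endif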
/* Maximum number of allowed redirections.  20 was chosen as a
   "reasonable" value, which is low enough to not cause havoc, yet
   high enough to guarantee that normal retrievals will not be hurt
   by the check.  */
#define MAX_REDIRECTIONS 20
#define SUSPEND_POST_DATA do {                          \
  post_data_suspended = 1;                              \
  saved_post_data = opt.post_data;                      \
  saved_post_file_name = opt.post_file_name;            \
  opt.post_data = NULL;                                 \
  opt.post_file_name = NULL;                            \

#define RESTORE_POST_DATA do {                          \
  if (post_data_suspended)                              \
      opt.post_data = saved_post_data;                  \
      opt.post_file_name = saved_post_file_name;        \
      post_data_suspended = 0;                          \
/* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
/* #### This function should be rewritten so it doesn't return from
retrieve_url (const char *origurl, char **file, char **newloc,
              const char *refurl, int *dt)
  int location_changed, dummy;
  char *mynewloc, *proxy;
  struct url *u, *proxy_url;
  int up_error_code;            /* url parse error code */
  int redirection_count = 0;
  int post_data_suspended = 0;
  char *saved_post_data;
  char *saved_post_file_name;
  /* If dt is NULL, just ignore it.  */
  url = xstrdup (origurl);
  u = url_parse (url, &up_error_code);
      logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
    refurl = opt.referer;
  proxy = getproxy (u);
      /* Parse the proxy URL.  */
      proxy_url = url_parse (proxy, &up_error_code);
          logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
                     proxy, url_error (up_error_code));
      if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
          logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
          url_free (proxy_url);
  if (u->scheme == SCHEME_HTTP
      || u->scheme == SCHEME_HTTPS
      || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
      result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
  else if (u->scheme == SCHEME_FTP)
      /* If this is a redirection, we must not allow recursive FTP
         retrieval, so we save the recursion flag to oldrec, and
         restore it later.  */
      int oldrec = opt.recursive;
      if (redirection_count)
      result = ftp_loop (u, dt, proxy_url);
      opt.recursive = oldrec;
  /* There is a possibility of HTTP being redirected to FTP.  In
     these cases we must decide whether the text is HTML according
     to the suffix.  The HTML suffixes are `.html', `.htm' and a few
     others, case-insensitive.  */
  if (redirection_count && local_file && u->scheme == SCHEME_FTP)
      if (has_html_suffix_p (local_file))
    url_free (proxy_url);
  location_changed = (result == NEWLOCATION);
  if (location_changed)
      char *constructed_newloc;
      struct url *newloc_parsed;
      assert (mynewloc != NULL);
      /* The HTTP specs only allow absolute URLs to appear in
         redirects, but a ton of boneheaded web servers and CGIs out
         there break the rules and use relative URLs, and popular
         browsers are lenient about this, so wget should be too.  */
      constructed_newloc = uri_merge (url, mynewloc);
      mynewloc = constructed_newloc;
      /* Now, see if this new location makes sense.  */
      newloc_parsed = url_parse (mynewloc, &up_error_code);
          logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
                     url_error (up_error_code));
      /* Now mynewloc will become newloc_parsed->url, because if the
         Location contained relative paths like .././something, we
         don't want it to propagate as the URL.  */
      mynewloc = xstrdup (newloc_parsed->url);
      /* Check for the maximum number of redirections.  */
      if (++redirection_count > MAX_REDIRECTIONS)
          logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
          url_free (newloc_parsed);
      /* If we're being redirected from POST, we don't want to POST
         again.  Many servers answer POST with a redirection to an
         index page; that redirection is clearly a GET.  We "suspend"
         POST data for the duration of the redirections, and restore
         it when we're done.  */
      if (!post_data_suspended)
      register_download (u->url, local_file);
      if (redirection_count && 0 != strcmp (origurl, u->url))
        register_redirection (origurl, u->url);
        register_html (u->url, local_file);
    *file = local_file ? local_file : NULL;
    FREE_MAYBE (local_file);
  if (redirection_count)
  ++global_download_count;
/* Find the URLs in the file and call retrieve_url() for each of
   them.  If HTML is non-zero, treat the file as HTML, and construct
   the URLs accordingly.

   If opt.recursive is set, call retrieve_tree() for each file.  */
retrieve_from_file (const char *file, int html, int *count)
  struct urlpos *url_list, *cur_url;
  url_list = (html ? get_urls_html (file, NULL, NULL)
              : get_urls_file (file));
  status = RETROK;              /* Suppose everything is OK.  */
  *count = 0;                   /* Reset the URL count.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
      char *filename = NULL, *new_file = NULL;
      if (cur_url->ignore_when_downloading)
      if (downloaded_exceeds_quota ())
      if (opt.recursive && cur_url->url->scheme != SCHEME_FTP)
        status = retrieve_tree (cur_url->url->url);
        status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
      if (filename && opt.delete_after && file_exists_p (filename))
          DEBUGP (("Removing file due to --delete-after in"
                   " retrieve_from_file():\n"));
          logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
          if (unlink (filename))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
      FREE_MAYBE (new_file);
      FREE_MAYBE (filename);
  /* Free the linked list of URLs.  */
  free_urlpos (url_list);

/* Print `giving up' or `retrying', depending on the impending
   action.  N1 and N2 are the attempt number and the attempt limit.  */
printwhat (int n1, int n2)
  logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
/* Increment opt.downloaded by BY_HOW_MUCH.  If an overflow occurs,
   set opt.downloaded_overflow to 1.  */
downloaded_increase (unsigned long by_how_much)
  if (opt.downloaded_overflow)
  old = opt.downloaded;
  opt.downloaded += by_how_much;
  if (opt.downloaded < old)     /* carry flag, where are you when I
                                   need you? */
      opt.downloaded_overflow = 1;
      opt.downloaded = ~((VERY_LONG_TYPE)0);
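/* Worked example of the wrap-around test above (hypothetical values,
   assuming a 32-bit counter for illustration): adding 0x20 to a total
   of 0xFFFFFFF0 wraps to 0x10, which is smaller than the old value,
   so the overflow flag is set and the counter is pinned at the
   maximum.  */
#if 0
  unsigned long old_total = 0xFFFFFFF0UL;
  unsigned long new_total = old_total + 0x20UL;   /* wraps to 0x10 on 32 bits */
  /* new_total < old_total -- exactly the condition tested above */
#endif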
/* Return non-zero if the number of downloaded bytes exceeds the
   desired quota.  If the quota is not set, or if the amount
   overflowed, 0 is returned.  */
downloaded_exceeds_quota (void)
  if (opt.downloaded_overflow)
    /* We don't really know.  (Wildly) assume not.  */
  return opt.downloaded > opt.quota;
/* If opt.wait or opt.waitretry are specified, and if certain
   conditions are met, sleep the appropriate number of seconds.  See
   the documentation of --wait and --waitretry for more information.

   COUNT is the count of the current retrieval, beginning with 1.  */
sleep_between_retrievals (int count)
  static int first_retrieval = 1;
  /* Don't sleep before the very first retrieval.  */
  if (opt.waitretry && count > 1)
      /* If opt.waitretry is specified and this is a retry, wait
         COUNT-1 seconds, or opt.waitretry seconds, whichever is
         smaller.  */
      if (count <= opt.waitretry)
        sleep (opt.waitretry);
  if (!opt.random_wait || count > 1)
    /* If random-wait is not specified, or if we are sleeping
       between retries of the same download, sleep the fixed
       interval.  */
      /* Sleep a random amount of time averaging opt.wait seconds.
         The sleeping amount ranges from 0 to opt.wait*2, inclusive.  */
      int waitsecs = random_number (opt.wait * 2 + 1);
      DEBUGP (("sleep_between_retrievals: norm=%ld,fuzz=%ld,sleep=%d\n",
               opt.wait, waitsecs - opt.wait, waitsecs));
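      /* Illustration of the fuzzing above (hypothetical values): with
         opt.wait == 5, random_number (5 * 2 + 1) yields an integer in
         [0, 10], so the actual sleep ranges from 0 to 10 seconds and
         averages the configured 5 seconds.  */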