2 Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
34 #include <sys/types.h>
37 #endif /* HAVE_UNISTD_H */
43 #endif /* HAVE_STRING_H */
59 # include "gen_sslfunc.h" /* for ssl_iread */
66 /* See the comment in gethttp() why this is needed. */
67 int global_download_count;
/* Reset the bandwidth limiter's bookkeeping in limit_data: no bytes
   are counted for the current chunk, and the chunk starts at time
   zero. */
77 limit_bandwidth_reset (void)
79 limit_data.chunk_bytes = 0;
80 limit_data.chunk_start = 0;
83 /* Limit the bandwidth by pausing the download for an amount of time.
84 BYTES is the number of bytes just received from the network, and
85 *DLTIME is the download time elapsed so far, in milliseconds; the
   time spent on the current chunk is *DLTIME minus the chunk's start
   time. TIMER is read again after sleeping to measure how long the
   sleep really took. */
88 limit_bandwidth (long bytes, double *dltime, struct wget_timer *timer)
90 double delta_t = *dltime - limit_data.chunk_start; /* time spent on the current chunk */
93 limit_data.chunk_bytes += bytes;
95 /* Calculate the amount of time we expect downloading the chunk
96 should take. If in reality it took less time, sleep to
97 compensate for the difference. */
98 expected = 1000.0 * limit_data.chunk_bytes / opt.limit_rate;
100 if (expected > delta_t)
102 double slp = expected - delta_t + limit_data.sleep_adjust;
106 DEBUGP (("deferring a %.2f ms sleep (%ld/%.2f).\n",
107 slp, limit_data.chunk_bytes, delta_t));
110 DEBUGP (("\nsleeping %.2f ms for %ld bytes, adjust %.2f ms\n",
111 slp, limit_data.chunk_bytes, limit_data.sleep_adjust));
114 usleep ((unsigned long) (1000 * slp)); /* SLP is in milliseconds; usleep() takes microseconds */
115 t1 = wtimer_elapsed (timer);
117 /* Due to scheduling, we probably slept slightly longer (or
118 shorter) than desired. Calculate the difference between the
119 desired and the actual sleep, and adjust the next sleep by
121 limit_data.sleep_adjust = slp - (t1 - t0);
123 /* Since we've called wtimer_elapsed, we might as well update
124 the caller's dltime. */
/* Start a new chunk at the caller's current download time. */
128 limit_data.chunk_bytes = 0;
129 limit_data.chunk_start = *dltime;
/* Yield the smaller of I and J. The argument that is selected is
   evaluated twice, so arguments must not have side effects. */
132 #define MIN(i, j) ((i) <= (j) ? (i) : (j))
134 /* Reads the contents of file descriptor FD, until it is closed, or a
135 read error occurs. The data is read in chunks of at most 16K (the
    size of dlbuf, or less when opt.limit_rate is smaller), and stored to
136 stream fp, which should have been open for writing. If RBUF is
137 non-NULL and its file descriptor is equal to FD, flush RBUF first.
138 This function will *not* use the rbuf_* functions!
140 The EXPECTED argument is passed to progress_create() unchanged; when
    USE_EXPECTED is non-zero it also bounds how many bytes are read.
143 If opt.verbose is set, the progress is also shown. RESTVAL
144 represents a value from which to start downloading (which will be
145 shown accordingly). If RESTVAL is non-zero, the stream should have
146 been open for appending.
148 The function exits and returns codes of 0, -1 and -2 if the
149 connection was closed, there was a read error, or if it could not
150 write to the output stream, respectively.
152 IMPORTANT: The function flushes the contents of the buffer in
153 rbuf_flush() before actually reading from fd. If you wish to read
154 from fd immediately, flush or discard the buffer. */
156 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
157 struct rbuf *rbuf, int use_expected, double *elapsed)
161 static char dlbuf[16384]; /* 16K transfer buffer; static, so shared by every call */
162 int dlbufsize = sizeof (dlbuf);
164 void *progress = NULL;
165 struct wget_timer *timer = wtimer_allocate ();
171 progress = progress_create (restval, expected);
173 if (rbuf && RBUF_FD (rbuf) == fd)
176 while ((res = rbuf_flush (rbuf, dlbuf, sizeof (dlbuf))) != 0)
178 fwrite (dlbuf, 1, res, fp);
190 progress_update (progress, sz, 0);
/* Start rate limiting and timing from a clean state. */
194 limit_bandwidth_reset ();
195 wtimer_reset (timer);
197 /* Use a smaller buffer for low requested bandwidths. For example,
198 with --limit-rate=2k, it doesn't make sense to slurp in 16K of
199 data and then sleep for 8s. With buffer size equal to the limit,
200 we never have to sleep for more than one second. */
201 if (opt.limit_rate && opt.limit_rate < dlbufsize)
202 dlbufsize = opt.limit_rate;
204 /* Read from fd while there is available data.
206 Normally, if expected is 0, it means that it is not known how
207 much data is expected. However, if use_expected is specified,
208 then expected being zero means exactly that. */
209 while (!use_expected || (*len < expected))
/* Never ask for more than the bytes still expected. */
211 int amount_to_read = (use_expected
212 ? MIN (expected - *len, dlbufsize) : dlbufsize);
215 res = ssl_iread (rbuf->ssl, dlbuf, amount_to_read);
217 #endif /* HAVE_SSL */
218 res = iread (fd, dlbuf, amount_to_read);
223 fwrite (dlbuf, 1, res, fp);
224 /* Always flush the contents of the network packet. This should
225 not hinder performance: fast downloads will be received in
226 16K chunks (which stdio would write out anyway), and slow
227 downloads won't be limited with disk performance. */
235 dltime = wtimer_elapsed (timer);
237 limit_bandwidth (res, &dltime, timer);
241 progress_update (progress, res, dltime);
243 if (use_expected && expected > 0)
244 ws_percenttitle (100.0 * (double)(*len) / (double)expected);
252 progress_finish (progress, dltime);
255 wtimer_delete (timer);
260 /* Return a printed representation of the download rate, as
261 appropriate for the speed. If PAD is non-zero, strings will be
262 padded to the width of 7 characters (xxxx.xx). BYTES is the amount
    downloaded and MSECS the time it took, in milliseconds; both are
    handed to calc_rate(), which also selects the unit. */
264 retr_rate (long bytes, double msecs, int pad)
267 static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" }; /* indexed by the UNITS value from calc_rate() */
270 double dlrate = calc_rate (bytes, msecs, &units);
271 sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
276 /* Calculate the download rate and trim it as appropriate for the
277 speed. Appropriate means that if rate is greater than 1K/s,
278 kilobytes are used, and if rate is greater than 1MB/s, megabytes are used.
281 UNITS is zero for B/s, one for KB/s, two for MB/s, and three for GB/s.
284 calc_rate (long bytes, double msecs, int *units)
292 /* If elapsed time is exactly zero, it means we're under the
293 granularity of the timer. This often happens on systems that
294 use time() for the timer. */
295 msecs = wtimer_granularity ();
297 dlrate = (double)1000 * bytes / msecs; /* bytes per second; MSECS is in milliseconds */
300 else if (dlrate < 1024.0 * 1024.0)
301 *units = 1, dlrate /= 1024.0;
302 else if (dlrate < 1024.0 * 1024.0 * 1024.0)
303 *units = 2, dlrate /= (1024.0 * 1024.0);
305 /* Maybe someone will need this, one day. */
306 *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
311 /* Maximum number of allowed redirections. 20 was chosen as a
312 "reasonable" value, which is low enough to not cause havoc, yet
313 high enough to guarantee that normal retrievals will not be hurt by
316 #define MAX_REDIRECTIONS 20
/* SUSPEND_POST_DATA stashes opt.post_data and opt.post_file_name in
   the saved_post_data and saved_post_file_name variables, clears both
   options and sets post_data_suspended. RESTORE_POST_DATA puts the
   saved values back if post_data_suspended is set, and clears the
   flag. Both macros rely on those three variables being in scope at
   the point of use. */
318 #define SUSPEND_POST_DATA do { \
319 post_data_suspended = 1; \
320 saved_post_data = opt.post_data; \
321 saved_post_file_name = opt.post_file_name; \
322 opt.post_data = NULL; \
323 opt.post_file_name = NULL; \
326 #define RESTORE_POST_DATA do { \
327 if (post_data_suspended) \
329 opt.post_data = saved_post_data; \
330 opt.post_file_name = saved_post_file_name; \
331 post_data_suspended = 0; \
335 static char *getproxy PARAMS ((struct url *));
337 /* Retrieve the given URL. Decides which loop to call -- HTTP, FTP,
340 /* #### This function should be rewritten so it doesn't return from
344 retrieve_url (const char *origurl, char **file, char **newloc,
345 const char *refurl, int *dt)
349 int location_changed, dummy;
350 char *mynewloc, *proxy;
351 struct url *u, *proxy_url;
352 int up_error_code; /* url parse error code */
354 int redirection_count = 0;
/* Bookkeeping for the SUSPEND_POST_DATA and RESTORE_POST_DATA macros. */
356 int post_data_suspended = 0;
357 char *saved_post_data = NULL;
358 char *saved_post_file_name = NULL;
360 /* If dt is NULL, just ignore it. */
363 url = xstrdup (origurl);
369 u = url_parse (url, &up_error_code);
372 logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
378 refurl = opt.referer;
387 proxy = getproxy (u);
390 /* Parse the proxy URL. */
391 proxy_url = url_parse (proxy, &up_error_code);
394 logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
395 proxy, url_error (up_error_code));
/* A proxy is accepted when it is HTTP or has the same scheme as U,
   although the message below mentions only HTTP. */
400 if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
402 logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
403 url_free (proxy_url);
/* HTTP and HTTPS URLs, and any URL reached through an HTTP proxy,
   are handled by http_loop(). */
410 if (u->scheme == SCHEME_HTTP
412 || u->scheme == SCHEME_HTTPS
414 || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
416 result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
418 else if (u->scheme == SCHEME_FTP)
420 /* If this is a redirection, we must not allow recursive FTP
421 retrieval, so we save recursion to oldrec, and restore it
423 int oldrec = opt.recursive;
424 if (redirection_count)
426 result = ftp_loop (u, dt, proxy_url);
427 opt.recursive = oldrec;
429 /* There is a possibility of having HTTP being redirected to
430 FTP. In these cases we must decide whether the text is HTML
431 according to the suffix. The HTML suffixes are `.html',
432 `.htm' and a few others, case-insensitive. */
433 if (redirection_count && local_file && u->scheme == SCHEME_FTP)
435 if (has_html_suffix_p (local_file))
442 url_free (proxy_url);
/* NEWLOCATION from the loop above means a redirect must be followed. */
446 location_changed = (result == NEWLOCATION);
447 if (location_changed)
449 char *construced_newloc; /* NOTE(review): "construced" is a misspelling of "constructed" */
450 struct url *newloc_parsed;
452 assert (mynewloc != NULL);
457 /* The HTTP specs only allow absolute URLs to appear in
458 redirects, but a ton of boneheaded webservers and CGIs out
459 there break the rules and use relative URLs, and popular
460 browsers are lenient about this, so wget should be too. */
461 construced_newloc = uri_merge (url, mynewloc);
463 mynewloc = construced_newloc;
465 /* Now, see if this new location makes sense. */
466 newloc_parsed = url_parse (mynewloc, &up_error_code);
469 logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
470 url_error (up_error_code));
478 /* Now mynewloc will become newloc_parsed->url, because if the
479 Location contained relative paths like .././something, we
480 don't want that propagating as url. */
482 mynewloc = xstrdup (newloc_parsed->url);
484 /* Check for max. number of redirections. */
485 if (++redirection_count > MAX_REDIRECTIONS)
487 logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
489 url_free (newloc_parsed);
502 /* If we're being redirected from POST, we don't want to POST
503 again. Many servers answer a POST with a redirection to an
504 index page; that redirection is clearly a GET. We "suspend"
505 POST data for the duration of the redirections, and restore
506 it when we're done. */
507 if (!post_data_suspended)
517 register_download (u->url, local_file);
/* Record a redirection only if the final URL differs from ORIGURL. */
518 if (redirection_count && 0 != strcmp (origurl, u->url))
519 register_redirection (origurl, u->url);
521 register_html (u->url, local_file);
526 *file = local_file ? local_file : NULL; /* NOTE(review): the conditional is redundant; same as *file = local_file */
528 FREE_MAYBE (local_file);
532 if (redirection_count)
546 ++global_download_count;
552 /* Find the URLs in the file and call retrieve_url() for each of
553 them. If HTML is non-zero, treat the file as HTML, and construct
554 the URLs accordingly. *COUNT is reset to zero and then incremented
    as the list of URLs is walked.
556 If opt.recursive is set, call retrieve_tree() instead for each URL
    whose scheme is not FTP. */
558 retrieve_from_file (const char *file, int html, int *count)
561 struct urlpos *url_list, *cur_url;
563 url_list = (html ? get_urls_html (file, NULL, NULL)
564 : get_urls_file (file));
565 status = RETROK; /* Suppose everything is OK. */
566 *count = 0; /* Reset the URL count. */
568 for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
570 char *filename = NULL, *new_file = NULL;
573 if (cur_url->ignore_when_downloading)
576 if (downloaded_exceeds_quota ())
/* FTP URLs are always fetched with retrieve_url(), even when
   opt.recursive is set. */
581 if (opt.recursive && cur_url->url->scheme != SCHEME_FTP)
582 status = retrieve_tree (cur_url->url->url);
584 status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
/* Honor --delete-after for the file that was just fetched. */
586 if (filename && opt.delete_after && file_exists_p (filename))
588 DEBUGP (("Removing file due to --delete-after in"
589 " retrieve_from_file():\n"));
590 logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
591 if (unlink (filename))
592 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
596 FREE_MAYBE (new_file);
597 FREE_MAYBE (filename);
600 /* Free the linked list of URL-s. */
601 free_urlpos (url_list);
606 /* Print `giving up', or `retrying', depending on the impending
607 action. N1 and N2 are the attempt number and the attempt limit;
    `Giving up' is printed when they are equal, `Retrying' otherwise.
    The message is logged at the LOG_VERBOSE level. */
609 printwhat (int n1, int n2)
611 logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
614 /* Increment opt.downloaded by BY_HOW_MUCH. If an overflow occurs,
615 set opt.downloaded_overflow to 1 and set every bit of
    opt.downloaded. Overflow is detected by the sum coming out
    smaller than the previous total. */
617 downloaded_increase (unsigned long by_how_much)
620 if (opt.downloaded_overflow)
622 old = opt.downloaded; /* previous total, used to detect wraparound */
623 opt.downloaded += by_how_much;
624 if (opt.downloaded < old) /* carry flag, where are you when I
628 opt.downloaded_overflow = 1;
629 opt.downloaded = ~((VERY_LONG_TYPE)0);
633 /* Return non-zero if the downloaded amount of bytes exceeds the
634 desired quota. If quota is not set or if the amount overflowed, 0 is returned.
637 downloaded_exceeds_quota (void)
641 if (opt.downloaded_overflow)
642 /* We don't really know. (Wildly) assume not. */
645 return opt.downloaded > opt.quota; /* strictly greater: reaching the quota exactly does not exceed it */
648 /* If opt.wait or opt.waitretry are specified, and if certain
649 conditions are met, sleep the appropriate number of seconds. See
650 the documentation of --wait and --waitretry for more information.
652 COUNT is the number of the current attempt at the retrieval,
    beginning with 1; values above 1 are treated as retries. */
655 sleep_between_retrievals (int count)
657 static int first_retrieval = 1;
661 /* Don't sleep before the very first retrieval. */
666 if (opt.waitretry && count > 1)
668 /* If opt.waitretry is specified and this is a retry, wait for
669 COUNT-1 number of seconds, or for opt.waitretry seconds. */
670 if (count <= opt.waitretry)
/* NOTE(review): POSIX allows usleep() to fail with EINVAL when the
   interval is 1,000,000 microseconds or more; the usleep() calls in
   this function pass whole seconds. Confirm this is acceptable on
   the target systems. */
673 usleep (1000000L * opt.waitretry);
677 if (!opt.random_wait || count > 1)
678 /* If random-wait is not specified, or if we are sleeping
679 between retries of the same download, sleep the fixed
681 usleep (1000000L * opt.wait);
684 /* Sleep a random amount of time averaging opt.wait
685 seconds. The sleeping amount ranges from 0 to
686 opt.wait*2, inclusive. */
687 double waitsecs = 2 * opt.wait * random_float ();
688 DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
689 opt.wait, waitsecs));
690 usleep (1000000L * waitsecs);
695 /* Free the linked list of urlpos that starts at L, including the
    local_name string of each node. */
697 free_urlpos (struct urlpos *l)
701 struct urlpos *next = l->next;
704 FREE_MAYBE (l->local_name);
710 /* Rotate FNAME opt.backups times: the name pairs FNAME.(i-1) and
    FNAME.i are built for i from opt.backups down to 2, and finally
    FNAME.1, so that each older backup can move up by one. FNAME is
    examined with stat() and S_ISREG() first. */
712 rotate_backups(const char *fname)
714 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1; /* FNAME + '.' + suffix digits + NUL */
715 char *from = (char *)alloca (maxlen);
716 char *to = (char *)alloca (maxlen);
720 if (stat (fname, &sb) == 0)
721 if (S_ISREG (sb.st_mode) == 0)
/* Work from the highest suffix downwards. */
724 for (i = opt.backups; i > 1; i--)
726 sprintf (from, "%s.%d", fname, i - 1);
727 sprintf (to, "%s.%d", fname, i);
731 sprintf (to, "%s.%d", fname, 1);
735 static int no_proxy_match PARAMS ((const char *, const char **));
737 /* Return the URL of the proxy appropriate for url U. For each kind
    of proxy the opt.*_proxy setting takes precedence over the
    environment variable of the same name. A rewritten shorthand URL
    is returned in static storage, which a later call overwrites. */
740 getproxy (struct url *u)
744 static char rewritten_storage[1024];
/* Despite its name, no_proxy_match() is non-zero when the host should
   be accessed through the proxy. */
748 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
754 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
758 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
762 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
767 if (!proxy || !*proxy)
770 /* Handle shorthands. `rewritten_storage' is a kludge to allow
771 getproxy() to return static storage. */
772 rewritten_url = rewrite_shorthand_url (proxy);
/* A rewritten URL longer than the buffer is silently truncated. */
775 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
776 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0'; /* strncpy() does not terminate on truncation */
777 proxy = rewritten_storage;
783 /* Return non-zero if HOST should be accessed through the proxy,
    taking the NO_PROXY list into account: the result below is the
    negation of sufmatch (NO_PROXY, HOST). */
785 no_proxy_match (const char *host, const char **no_proxy)
790 return !sufmatch (no_proxy, host);