2 Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
34 #include <sys/types.h>
37 #endif /* HAVE_UNISTD_H */
43 #endif /* HAVE_STRING_H */
59 # include "gen_sslfunc.h" /* for ssl_iread */
66 /* See the comment in gethttp() why this is needed. */
67 int global_download_count;
69 /* Total size of downloaded files. Used to enforce quota. */
70 LARGE_INT total_downloaded_bytes;
80 limit_bandwidth_reset (void)
82 limit_data.chunk_bytes = 0;
83 limit_data.chunk_start = 0;
86 /* Limit the bandwidth by pausing the download for an amount of time.
87 BYTES is the number of bytes received from the network, and DELTA
88 is the number of milliseconds it took to receive them. */
91 limit_bandwidth (long bytes, double *dltime, struct wget_timer *timer)
93 double delta_t = *dltime - limit_data.chunk_start;
96 limit_data.chunk_bytes += bytes;
98 /* Calculate the amount of time we expect downloading the chunk
99 should take. If in reality it took less time, sleep to
100 compensate for the difference. */
101 expected = 1000.0 * limit_data.chunk_bytes / opt.limit_rate;
103 if (expected > delta_t)
105 double slp = expected - delta_t + limit_data.sleep_adjust;
109 DEBUGP (("deferring a %.2f ms sleep (%ld/%.2f).\n",
110 slp, limit_data.chunk_bytes, delta_t));
113 DEBUGP (("\nsleeping %.2f ms for %ld bytes, adjust %.2f ms\n",
114 slp, limit_data.chunk_bytes, limit_data.sleep_adjust));
117 usleep ((unsigned long) (1000 * slp));
118 t1 = wtimer_elapsed (timer);
120 /* Due to scheduling, we probably slept slightly longer (or
121 shorter) than desired. Calculate the difference between the
122 desired and the actual sleep, and adjust the next sleep by
124 limit_data.sleep_adjust = slp - (t1 - t0);
126 /* Since we've called wtimer_elapsed, we might as well update
127 the caller's dltime. */
131 limit_data.chunk_bytes = 0;
132 limit_data.chunk_start = *dltime;
/* Smaller of two values.  Beware: evaluates each argument twice.  */
#define MIN(i, j) ((i) <= (j) ? (i) : (j))
/* NOTE(review): this function was garbled in extraction -- every line
   carries a stray leading line number, and the return type, local
   declarations (res, sz, dltime), braces, error-exit paths and the
   final return were dropped.  Code bytes are left untouched below;
   only comments were added.  Restore from upstream before building.  */
137 /* Reads the contents of file descriptor FD, until it is closed, or a
138 read error occurs. The data is read in 8K chunks, and stored to
139 stream fp, which should have been open for writing. If BUF is
140 non-NULL and its file descriptor is equal to FD, flush RBUF first.
141 This function will *not* use the rbuf_* functions!
143 The EXPECTED argument is passed to show_progress() unchanged, but
146 If opt.verbose is set, the progress is also shown. RESTVAL
147 represents a value from which to start downloading (which will be
148 shown accordingly). If RESTVAL is non-zero, the stream should have
149 been open for appending.
151 The function exits and returns codes of 0, -1 and -2 if the
152 connection was closed, there was a read error, or if it could not
153 write to the output stream, respectively.
155 IMPORTANT: The function flushes the contents of the buffer in
156 rbuf_flush() before actually reading from fd. If you wish to read
157 from fd immediately, flush or discard the buffer. */
159 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
160 struct rbuf *rbuf, int use_expected, double *elapsed)
/* 16K static download buffer shared across calls; dlbufsize may be
   shrunk below when --limit-rate is low.  */
164 static char dlbuf[16384];
165 int dlbufsize = sizeof (dlbuf);
167 void *progress = NULL;
168 struct wget_timer *timer = wtimer_allocate ();
/* Progress gauge -- presumably created only under opt.verbose; the
   guarding `if' was dropped by the extraction.  */
174 progress = progress_create (restval, expected);
/* First drain any data already buffered in RBUF for this fd.  */
176 if (rbuf && RBUF_FD (rbuf) == fd)
179 while ((res = rbuf_flush (rbuf, dlbuf, sizeof (dlbuf))) != 0)
181 fwrite (dlbuf, 1, res, fp);
193 progress_update (progress, sz, 0);
/* Reset the --limit-rate accounting and restart the timer before the
   main read loop.  NOTE(review): upstream guards the reset with
   `if (opt.limit_rate)' -- that line was dropped; verify.  */
197 limit_bandwidth_reset ();
198 wtimer_reset (timer);
200 /* Use a smaller buffer for low requested bandwidths. For example,
201 with --limit-rate=2k, it doesn't make sense to slurp in 16K of
202 data and then sleep for 8s. With buffer size equal to the limit,
203 we never have to sleep for more than one second. */
204 if (opt.limit_rate && opt.limit_rate < dlbufsize)
205 dlbufsize = opt.limit_rate;
207 /* Read from fd while there is available data.
209 Normally, if expected is 0, it means that it is not known how
210 much data is expected. However, if use_expected is specified,
211 then expected being zero means exactly that. */
212 while (!use_expected || (*len < expected))
214 int amount_to_read = (use_expected
215 ? MIN (expected - *len, dlbufsize) : dlbufsize);
/* SSL connections read through ssl_iread; the #ifdef HAVE_SSL /
   #else lines around these two reads were dropped.  */
218 res = ssl_iread (rbuf->ssl, dlbuf, amount_to_read);
220 #endif /* HAVE_SSL */
221 res = iread (fd, dlbuf, amount_to_read);
226 fwrite (dlbuf, 1, res, fp);
227 /* Always flush the contents of the network packet. This should
228 not hinder performance: fast downloads will be received in
229 16K chunks (which stdio would write out anyway), and slow
230 downloads won't be limited with disk performance. */
/* Per-iteration bookkeeping: elapsed time, bandwidth throttle,
   progress display, and (on Windows) title-bar percentage.  */
238 dltime = wtimer_elapsed (timer);
240 limit_bandwidth (res, &dltime, timer);
244 progress_update (progress, res, dltime);
246 if (use_expected && expected > 0)
247 ws_percenttitle (100.0 * (double)(*len) / (double)expected);
/* Teardown: finish the gauge and free the timer.  NOTE(review): the
   code storing the elapsed time into *elapsed was dropped here.  */
255 progress_finish (progress, dltime);
258 wtimer_delete (timer);
/* Return a printed representation of the download rate, as
   appropriate for the speed.  If PAD is non-zero, strings will be
   padded to the width of 7 characters (xxxx.xx).  */
char *
retr_rate (long bytes, double msecs, int pad)
{
  /* Static buffer: the returned string is only valid until the next
     call.  */
  static char res[20];
  static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
  int units = 0;

  double dlrate = calc_rate (bytes, msecs, &units);
  sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);

  return res;
}
/* Calculate the download rate and trim it as appropriate for the
   speed.  Appropriate means that if rate is greater than 1K/s,
   kilobytes are used, and if rate is greater than 1MB/s, megabytes
   are used.

   UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
   GB/s.  */
double
calc_rate (long bytes, double msecs, int *units)
{
  double dlrate;

  if (msecs == 0)
    /* If elapsed time is exactly zero, it means we're under the
       granularity of the timer.  This often happens on systems that
       use time() for the timer.  */
    msecs = wtimer_granularity ();

  dlrate = (double)1000 * bytes / msecs;
  if (dlrate < 1024.0)
    *units = 0;
  else if (dlrate < 1024.0 * 1024.0)
    *units = 1, dlrate /= 1024.0;
  else if (dlrate < 1024.0 * 1024.0 * 1024.0)
    *units = 2, dlrate /= (1024.0 * 1024.0);
  else
    /* Maybe someone will need this, one day. */
    *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);

  return dlrate;
}
/* Maximum number of allowed redirections.  20 was chosen as a
   "reasonable" value, which is low enough to not cause havoc, yet
   high enough to guarantee that normal retrievals will not be hurt by
   the check.  */

#define MAX_REDIRECTIONS 20
321 #define SUSPEND_POST_DATA do { \
322 post_data_suspended = 1; \
323 saved_post_data = opt.post_data; \
324 saved_post_file_name = opt.post_file_name; \
325 opt.post_data = NULL; \
326 opt.post_file_name = NULL; \
329 #define RESTORE_POST_DATA do { \
330 if (post_data_suspended) \
332 opt.post_data = saved_post_data; \
333 opt.post_file_name = saved_post_file_name; \
334 post_data_suspended = 0; \
338 static char *getproxy PARAMS ((struct url *));
/* NOTE(review): this function was garbled in extraction -- every line
   carries a stray leading line number, and large parts of the body
   (braces, the redirect goto-loop, error-exit paths, and the final
   return) are missing.  Code bytes are left untouched below; only
   comments were added.  Restore from upstream before building.  */
340 /* Retrieve the given URL. Decides which loop to call -- HTTP, FTP,
343 /* #### This function should be rewritten so it doesn't return from
347 retrieve_url (const char *origurl, char **file, char **newloc,
348 const char *refurl, int *dt)
352 int location_changed, dummy;
353 char *mynewloc, *proxy;
354 struct url *u, *proxy_url;
355 int up_error_code; /* url parse error code */
357 int redirection_count = 0;
/* POST state stashed while following redirections; see the
   SUSPEND_POST_DATA / RESTORE_POST_DATA macros earlier in the file.  */
359 int post_data_suspended = 0;
360 char *saved_post_data = NULL;
361 char *saved_post_file_name = NULL;
363 /* If dt is NULL, use local storage. */
369 url = xstrdup (origurl);
/* Parse the (possibly redirected-to) URL; on failure, log and bail
   out (the error path was dropped in extraction).  */
375 u = url_parse (url, &up_error_code);
378 logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
384 refurl = opt.referer;
/* Pick a proxy (if any) appropriate for U's scheme.  */
393 proxy = getproxy (u);
396 /* Parse the proxy URL. */
397 proxy_url = url_parse (proxy, &up_error_code);
400 logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
401 proxy, url_error (up_error_code));
406 if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
408 logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
409 url_free (proxy_url);
/* Dispatch on scheme: HTTP/HTTPS (or anything routed through an HTTP
   proxy) is handled by http_loop, FTP by ftp_loop.  */
416 if (u->scheme == SCHEME_HTTP
418 || u->scheme == SCHEME_HTTPS
420 || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
422 result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
424 else if (u->scheme == SCHEME_FTP)
426 /* If this is a redirection, we must not allow recursive FTP
427 retrieval, so we save recursion to oldrec, and restore it
429 int oldrec = opt.recursive;
430 if (redirection_count)
432 result = ftp_loop (u, dt, proxy_url);
433 opt.recursive = oldrec;
435 /* There is a possibility of having HTTP being redirected to
436 FTP. In these cases we must decide whether the text is HTML
437 according to the suffix. The HTML suffixes are `.html',
438 `.htm' and a few others, case-insensitive. */
439 if (redirection_count && local_file && u->scheme == SCHEME_FTP)
441 if (has_html_suffix_p (local_file))
448 url_free (proxy_url);
/* A NEWLOCATION result means the server redirected us; in the full
   source this branch loops back and retrieves the new URL.  */
452 location_changed = (result == NEWLOCATION);
453 if (location_changed)
455 char *construced_newloc;
456 struct url *newloc_parsed;
458 assert (mynewloc != NULL);
463 /* The HTTP specs only allow absolute URLs to appear in
464 redirects, but a ton of boneheaded webservers and CGIs out
465 there break the rules and use relative URLs, and popular
466 browsers are lenient about this, so wget should be too. */
467 construced_newloc = uri_merge (url, mynewloc);
469 mynewloc = construced_newloc;
471 /* Now, see if this new location makes sense. */
472 newloc_parsed = url_parse (mynewloc, &up_error_code);
475 logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
476 url_error (up_error_code));
484 /* Now mynewloc will become newloc_parsed->url, because if the
485 Location contained relative paths like .././something, we
486 don't want that propagating as url. */
488 mynewloc = xstrdup (newloc_parsed->url);
490 /* Check for max. number of redirections. */
491 if (++redirection_count > MAX_REDIRECTIONS)
493 logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
495 url_free (newloc_parsed);
508 /* If we're being redirected from POST, we don't want to POST
509 again. Many requests answer POST with a redirection to an
510 index page; that redirection is clearly a GET. We "suspend"
511 POST data for the duration of the redirections, and restore
512 it when we're done. */
513 if (!post_data_suspended)
/* Bookkeeping after a successful retrieval -- NOTE(review): purpose
   inferred from the register_* names; verify against their
   definitions elsewhere in the tree.  */
523 register_download (u->url, local_file);
524 if (redirection_count && 0 != strcmp (origurl, u->url))
525 register_redirection (origurl, u->url);
527 register_html (u->url, local_file);
/* Hand results back to the caller via the FILE/NEWLOC out-params.  */
532 *file = local_file ? local_file : NULL;
534 FREE_MAYBE (local_file);
538 if (redirection_count)
552 ++global_download_count;
558 /* Find the URLs in the file and call retrieve_url() for each of
559 them. If HTML is non-zero, treat the file as HTML, and construct
560 the URLs accordingly.
562 If opt.recursive is set, call retrieve_tree() for each file. */
565 retrieve_from_file (const char *file, int html, int *count)
568 struct urlpos *url_list, *cur_url;
570 url_list = (html ? get_urls_html (file, NULL, NULL)
571 : get_urls_file (file));
572 status = RETROK; /* Suppose everything is OK. */
573 *count = 0; /* Reset the URL count. */
575 for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
577 char *filename = NULL, *new_file = NULL;
580 if (cur_url->ignore_when_downloading)
583 if (opt.quota && total_downloaded_bytes > opt.quota)
588 if ((opt.recursive || opt.page_requisites)
589 && cur_url->url->scheme != SCHEME_FTP)
590 status = retrieve_tree (cur_url->url->url);
592 status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
594 if (filename && opt.delete_after && file_exists_p (filename))
596 DEBUGP (("Removing file due to --delete-after in"
597 " retrieve_from_file():\n"));
598 logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
599 if (unlink (filename))
600 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
604 FREE_MAYBE (new_file);
605 FREE_MAYBE (filename);
608 /* Free the linked list of URL-s. */
609 free_urlpos (url_list);
614 /* Print `giving up', or `retrying', depending on the impending
615 action. N1 and N2 are the attempt number and the attempt limit. */
617 printwhat (int n1, int n2)
619 logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
622 /* If opt.wait or opt.waitretry are specified, and if certain
623 conditions are met, sleep the appropriate number of seconds. See
624 the documentation of --wait and --waitretry for more information.
626 COUNT is the count of current retrieval, beginning with 1. */
629 sleep_between_retrievals (int count)
631 static int first_retrieval = 1;
635 /* Don't sleep before the very first retrieval. */
640 if (opt.waitretry && count > 1)
642 /* If opt.waitretry is specified and this is a retry, wait for
643 COUNT-1 number of seconds, or for opt.waitretry seconds. */
644 if (count <= opt.waitretry)
647 usleep (1000000L * opt.waitretry);
651 if (!opt.random_wait || count > 1)
652 /* If random-wait is not specified, or if we are sleeping
653 between retries of the same download, sleep the fixed
655 usleep (1000000L * opt.wait);
658 /* Sleep a random amount of time averaging in opt.wait
659 seconds. The sleeping amount ranges from 0 to
660 opt.wait*2, inclusive. */
661 double waitsecs = 2 * opt.wait * random_float ();
662 DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
663 opt.wait, waitsecs));
664 usleep (1000000L * waitsecs);
669 /* Free the linked list of urlpos. */
671 free_urlpos (struct urlpos *l)
675 struct urlpos *next = l->next;
678 FREE_MAYBE (l->local_name);
684 /* Rotate FNAME opt.backups times */
686 rotate_backups(const char *fname)
688 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
689 char *from = (char *)alloca (maxlen);
690 char *to = (char *)alloca (maxlen);
694 if (stat (fname, &sb) == 0)
695 if (S_ISREG (sb.st_mode) == 0)
698 for (i = opt.backups; i > 1; i--)
700 sprintf (from, "%s.%d", fname, i - 1);
701 sprintf (to, "%s.%d", fname, i);
705 sprintf (to, "%s.%d", fname, 1);
709 static int no_proxy_match PARAMS ((const char *, const char **));
711 /* Return the URL of the proxy appropriate for url U. */
714 getproxy (struct url *u)
718 static char rewritten_storage[1024];
722 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
728 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
732 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
736 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
741 if (!proxy || !*proxy)
744 /* Handle shorthands. `rewritten_storage' is a kludge to allow
745 getproxy() to return static storage. */
746 rewritten_url = rewrite_shorthand_url (proxy);
749 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
750 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
751 proxy = rewritten_storage;
/* Should a host be accessed through proxy, concerning no_proxy?  */
static int
no_proxy_match (const char *host, const char **no_proxy)
{
  /* With no exclusion list, every host goes through the proxy.  */
  if (!no_proxy)
    return 1;
  else
    return !sufmatch (no_proxy, host);
}