2 Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
34 #include <sys/types.h>
37 #endif /* HAVE_UNISTD_H */
43 #endif /* HAVE_STRING_H */
59 # include "gen_sslfunc.h" /* for ssl_iread */
66 /* See the comment in gethttp() why this is needed. */
/* Incremented once at the end of each retrieve_url() call (see below). */
67 int global_download_count;
69 /* Total size of downloaded files. Used to enforce quota. */
/* Compared against opt.quota in retrieve_from_file() to stop further
   downloads once the quota has been exceeded.  */
70 LARGE_INT total_downloaded_bytes;
/* Reset the bandwidth-limiting bookkeeping: forget the byte count of
   the current chunk and restart the chunk clock at zero.  Called from
   get_contents() before the download loop starts.  */
80 limit_bandwidth_reset (void)
82   limit_data.chunk_bytes = 0;
83   limit_data.chunk_start = 0;
86 /* Limit the bandwidth by pausing the download for an amount of time.
87    BYTES is the number of bytes received from the network, and TIMER
88    is the timer that started at the beginning of download. */
91 limit_bandwidth (long bytes, struct wget_timer *timer)
/* DELTA_T: wall-clock milliseconds spent on the current chunk so far
   (wtimer values appear to be in ms, given the 1000.0 factor below).  */
93   double delta_t = wtimer_read (timer) - limit_data.chunk_start;
96   limit_data.chunk_bytes += bytes;
98   /* Calculate the amount of time we expect downloading the chunk
99      should take.  If in reality it took less time, sleep to
100      compensate for the difference. */
101   expected = 1000.0 * limit_data.chunk_bytes / opt.limit_rate;
103   if (expected > delta_t)
/* SLP is the sleep needed, in ms: the shortfall plus the correction
   carried over from the previous sleep (sleep_adjust).  */
105       double slp = expected - delta_t + limit_data.sleep_adjust;
109 	  DEBUGP (("deferring a %.2f ms sleep (%ld/%.2f).\n",
110 		   slp, limit_data.chunk_bytes, delta_t));
113       DEBUGP (("\nsleeping %.2f ms for %ld bytes, adjust %.2f ms\n",
114 	       slp, limit_data.chunk_bytes, limit_data.sleep_adjust));
/* Bracket the sleep with two timer readings so the actual sleep
   duration can be compared against the requested one.  */
116       t0 = wtimer_read (timer);
118       wtimer_update (timer);
119       t1 = wtimer_read (timer);
121       /* Due to scheduling, we probably slept slightly longer (or
122 	 shorter) than desired.  Calculate the difference between the
123 	 desired and the actual sleep, and adjust the next sleep by
	 the difference.  */
125       limit_data.sleep_adjust = slp - (t1 - t0);
/* Start accounting for a fresh chunk.  */
128   limit_data.chunk_bytes = 0;
129   limit_data.chunk_start = wtimer_read (timer);
/* Return the lesser of I and J.  NOTE: arguments may be evaluated
   twice, so avoid side effects in them.  */
#define MIN(i, j) ((i) > (j) ? (j) : (i))
134 /* Reads the contents of file descriptor FD, until it is closed, or a
135    read error occurs.  The data is read in 8K chunks, and stored to
   (NOTE(review): dlbuf below is 16384 bytes, so reads are actually in
   up-to-16K chunks; the "8K" above looks stale.)
136    stream fp, which should have been open for writing.  If BUF is
137    non-NULL and its file descriptor is equal to FD, flush RBUF first.
138    This function will *not* use the rbuf_* functions!
140    The EXPECTED argument is passed to show_progress() unchanged, but
   (NOTE(review): EXPECTED is in fact handed to progress_create()
   below; "show_progress" appears to be an older name for the
   progress machinery.)
143    If opt.verbose is set, the progress is also shown.  RESTVAL
144    represents a value from which to start downloading (which will be
145    shown accordingly).  If RESTVAL is non-zero, the stream should have
146    been open for appending.
148    The function exits and returns codes of 0, -1 and -2 if the
149    connection was closed, there was a read error, or if it could not
150    write to the output stream, respectively.
152    IMPORTANT: The function flushes the contents of the buffer in
153    rbuf_flush() before actually reading from fd.  If you wish to read
154    from fd immediately, flush or discard the buffer. */
156 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
157 	      struct rbuf *rbuf, int use_expected, double *elapsed)
161   static char dlbuf[16384];
162   int dlbufsize = sizeof (dlbuf);
164   void *progress = NULL;
/* Timer serves both bandwidth limiting and the progress/ELAPSED
   reporting at the end.  */
165   struct wget_timer *timer = wtimer_allocate ();
170     progress = progress_create (restval, expected);
/* If leftover buffered data for this fd exists in RBUF, write it out
   to FP before reading anything from the network.  */
172   if (rbuf && RBUF_FD (rbuf) == fd)
175       while ((res = rbuf_flush (rbuf, dlbuf, sizeof (dlbuf))) != 0)
177 	  fwrite (dlbuf, 1, res, fp);
189 	    progress_update (progress, sz, 0);
193   limit_bandwidth_reset ();
194   wtimer_reset (timer);
196   /* Use a smaller buffer for low requested bandwidths.  For example,
197      with --limit-rate=2k, it doesn't make sense to slurp in 16K of
198      data and then sleep for 8s.  With buffer size equal to the limit,
199      we never have to sleep for more than one second. */
200   if (opt.limit_rate && opt.limit_rate < dlbufsize)
201     dlbufsize = opt.limit_rate;
203   /* Read from fd while there is available data.
205      Normally, if expected is 0, it means that it is not known how
206      much data is expected.  However, if use_expected is specified,
207      then expected being zero means exactly that. */
208   while (!use_expected || (*len < expected))
/* Never read past EXPECTED when the caller asked us to honor it.  */
210       int amount_to_read = (use_expected
211 			    ? MIN (expected - *len, dlbufsize) : dlbufsize);
/* NOTE(review): the -1 argument is presumably a "use default timeout"
   sentinel for xread -- confirm against xread's definition.  */
212       res = xread (fd, dlbuf, amount_to_read, -1);
217 	  fwrite (dlbuf, 1, res, fp);
218 	  /* Always flush the contents of the network packet.  This should
219 	     not hinder performance: fast downloads will be received in
220 	     16K chunks (which stdio would write out anyway), and slow
221 	     downloads won't be limited by disk performance. */
229 	  wtimer_update (timer);
231 	    limit_bandwidth (res, timer);
235 	    progress_update (progress, res, wtimer_read (timer));
/* Update the console title with percent done (Windows builds).  */
237 	  if (use_expected && expected > 0)
238 	    ws_percenttitle (100.0 * (double)(*len) / (double)expected);
246     progress_finish (progress, wtimer_read (timer));
/* Report total elapsed ms to the caller if requested.  */
248     *elapsed = wtimer_read (timer);
249   wtimer_delete (timer);
254 /* Return a printed representation of the download rate, as
255    appropriate for the speed.  If PAD is non-zero, strings will be
256    padded to the width of 7 characters (xxxx.xx). */
/* NOTE(review): the result buffer `res' is declared on lines not
   visible here -- presumably static storage, so the returned string
   is overwritten by the next call and must not be freed.  */
258 retr_rate (long bytes, double msecs, int pad)
261   static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
/* calc_rate() scales DLRATE and sets UNITS to the matching index
   into rate_names (0 = B/s .. 3 = GB/s).  */
264   double dlrate = calc_rate (bytes, msecs, &units);
265   sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
270 /* Calculate the download rate and trim it as appropriate for the
271    speed.  Appropriate means that if rate is greater than 1K/s,
272    kilobytes are used, and if rate is greater than 1MB/s, megabytes
275    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
278 calc_rate (long bytes, double msecs, int *units)
286   /* If elapsed time is exactly zero, it means we're under the
287      granularity of the timer.  This often happens on systems that
288      use time() for the timer. */
289     msecs = wtimer_granularity ();
/* MSECS is in milliseconds, so multiply by 1000 to get bytes/second.  */
291   dlrate = (double)1000 * bytes / msecs;
/* Pick the largest unit that keeps the number below 1024, scaling
   DLRATE accordingly.  (NOTE(review): the `*units = 0' branch for
   rates below 1024 B/s is on a line not visible in this excerpt.)  */
294   else if (dlrate < 1024.0 * 1024.0)
295     *units = 1, dlrate /= 1024.0;
296   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
297     *units = 2, dlrate /= (1024.0 * 1024.0);
299     /* Maybe someone will need this, one day. */
300     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
305 /* Maximum number of allowed redirections.  20 was chosen as a
306    "reasonable" value, which is low enough to not cause havoc, yet
307    high enough to guarantee that normal retrievals will not be hurt by
/* Checked in retrieve_url() each time a NEWLOCATION result is
   followed; exceeding it aborts the redirection chain.  */
310 #define MAX_REDIRECTIONS 20
/* Temporarily turn off POSTing (see the comment in retrieve_url()):
   stash opt.post_data / opt.post_file_name into the local `saved_*'
   variables and clear them, so that a redirected request is issued as
   a plain GET.  Expects post_data_suspended, saved_post_data and
   saved_post_file_name to be in scope at the expansion site.  */
312 #define SUSPEND_POST_DATA do { \
313   post_data_suspended = 1; \
314   saved_post_data = opt.post_data; \
315   saved_post_file_name = opt.post_file_name; \
316   opt.post_data = NULL; \
317   opt.post_file_name = NULL; \
/* Undo SUSPEND_POST_DATA: put the saved POST fields back into opt and
   clear the suspension flag.  A no-op when nothing was suspended.  */
320 #define RESTORE_POST_DATA do { \
321   if (post_data_suspended) \
323       opt.post_data = saved_post_data; \
324       opt.post_file_name = saved_post_file_name; \
325       post_data_suspended = 0; \
329 static char *getproxy PARAMS ((struct url *));
331 /* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
334 /* #### This function should be rewritten so it doesn't return from
/* ORIGURL is the URL to fetch; *FILE receives the local file name (or
   NULL); *NEWLOC receives the final URL after redirections; REFURL is
   the referer; *DT receives document-type flags.  NOTE(review): the
   label/goto that re-enters the fetch after a redirection is on lines
   not visible in this excerpt.  */
338 retrieve_url (const char *origurl, char **file, char **newloc,
339 const char *refurl, int *dt)
343 int location_changed, dummy;
344 char *mynewloc, *proxy;
345 struct url *u, *proxy_url;
346 int up_error_code; /* url parse error code */
/* Number of redirections followed so far, bounded by MAX_REDIRECTIONS.  */
348 int redirection_count = 0;
/* State for SUSPEND_POST_DATA / RESTORE_POST_DATA (see macros above).  */
350 int post_data_suspended = 0;
351 char *saved_post_data = NULL;
352 char *saved_post_file_name = NULL;
354 /* If dt is NULL, use local storage. */
/* Work on a private copy of the URL string.  */
360 url = xstrdup (origurl);
366 u = url_parse (url, &up_error_code);
369 logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
/* Fall back to the --referer option when no referer was passed in.  */
375 refurl = opt.referer;
384 proxy = getproxy (u);
387 /* Parse the proxy URL. */
388 proxy_url = url_parse (proxy, &up_error_code);
391 logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
392 proxy, url_error (up_error_code));
/* Only an HTTP proxy (or one matching the target scheme) is usable.  */
397 if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
399 logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
400 url_free (proxy_url);
/* Dispatch: HTTP/HTTPS (or anything through an HTTP proxy) goes to
   http_loop, FTP to ftp_loop.  */
407 if (u->scheme == SCHEME_HTTP
409 || u->scheme == SCHEME_HTTPS
411 || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
413 result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
415 else if (u->scheme == SCHEME_FTP)
417 /* If this is a redirection, we must not allow recursive FTP
418 retrieval, so we save recursion to oldrec, and restore it
420 int oldrec = opt.recursive;
421 if (redirection_count)
423 result = ftp_loop (u, dt, proxy_url);
424 opt.recursive = oldrec;
426 /* There is a possibility of having HTTP being redirected to
427 FTP. In these cases we must decide whether the text is HTML
428 according to the suffix. The HTML suffixes are `.html',
429 `.htm' and a few others, case-insensitive. */
430 if (redirection_count && local_file && u->scheme == SCHEME_FTP)
432 if (has_html_suffix_p (local_file))
439 url_free (proxy_url);
443 location_changed = (result == NEWLOCATION);
444 if (location_changed)
446 char *construced_newloc;
447 struct url *newloc_parsed;
/* A NEWLOCATION result guarantees the loop set mynewloc.  */
449 assert (mynewloc != NULL);
454 /* The HTTP specs only allow absolute URLs to appear in
455 redirects, but a ton of boneheaded webservers and CGIs out
456 there break the rules and use relative URLs, and popular
457 browsers are lenient about this, so wget should be too. */
458 construced_newloc = uri_merge (url, mynewloc);
460 mynewloc = construced_newloc;
462 /* Now, see if this new location makes sense. */
463 newloc_parsed = url_parse (mynewloc, &up_error_code);
466 logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
467 url_error (up_error_code));
475 /* Now mynewloc will become newloc_parsed->url, because if the
476 Location contained relative paths like .././something, we
477 don't want that propagating as url. */
479 mynewloc = xstrdup (newloc_parsed->url);
481 /* Check for max. number of redirections. */
482 if (++redirection_count > MAX_REDIRECTIONS)
484 logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
486 url_free (newloc_parsed);
499 /* If we're being redirected from POST, we don't want to POST
500 again. Many requests answer POST with a redirection to an
501 index page; that redirection is clearly a GET. We "suspend"
502 POST data for the duration of the redirections, and restore
503 it when we're done. */
504 if (!post_data_suspended)
/* Record the download (and any redirection) for -k/convert-links
   bookkeeping.  */
514 register_download (u->url, local_file);
515 if (redirection_count && 0 != strcmp (origurl, u->url))
516 register_redirection (origurl, u->url);
518 register_html (u->url, local_file);
523 *file = local_file ? local_file : NULL;
525 xfree_null (local_file);
529 if (redirection_count)
543 ++global_download_count;
549 /* Find the URLs in the file and call retrieve_url() for each of
550 them. If HTML is non-zero, treat the file as HTML, and construct
551 the URLs accordingly.
553 If opt.recursive is set, call retrieve_tree() for each file. */
556 retrieve_from_file (const char *file, int html, int *count)
559 struct urlpos *url_list, *cur_url;
/* Extract the URL list either from HTML markup or from a plain list,
   one URL per line.  */
561 url_list = (html ? get_urls_html (file, NULL, NULL)
562 : get_urls_file (file));
563 status = RETROK; /* Suppose everything is OK. */
564 *count = 0; /* Reset the URL count. */
566 for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
568 char *filename = NULL, *new_file = NULL;
571 if (cur_url->ignore_when_downloading)
/* Stop the whole run once the download quota is exhausted.  */
574 if (opt.quota && total_downloaded_bytes > opt.quota)
/* Recursive/page-requisites retrieval for non-FTP URLs goes through
   retrieve_tree; everything else is a single retrieve_url call.  */
579 if ((opt.recursive || opt.page_requisites)
580 && cur_url->url->scheme != SCHEME_FTP)
581 status = retrieve_tree (cur_url->url->url);
583 status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
585 if (filename && opt.delete_after && file_exists_p (filename))
587 DEBUGP (("Removing file due to --delete-after in"
588 " retrieve_from_file():\n"));
589 logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
590 if (unlink (filename))
591 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
595 xfree_null (new_file);
596 xfree_null (filename);
599 /* Free the linked list of URL-s. */
600 free_urlpos (url_list);
605 /* Print `giving up', or `retrying', depending on the impending
606    action.  N1 and N2 are the attempt number and the attempt limit. */
608 printwhat (int n1, int n2)
/* Verbose-only: "Giving up" when the last allowed attempt (N1 == N2)
   has been used, "Retrying" otherwise.  */
610   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
613 /* If opt.wait or opt.waitretry are specified, and if certain
614 conditions are met, sleep the appropriate number of seconds. See
615 the documentation of --wait and --waitretry for more information.
617 COUNT is the count of current retrieval, beginning with 1. */
620 sleep_between_retrievals (int count)
/* Set on the very first call so that no sleep precedes the first
   retrieval of the session.  */
622 static int first_retrieval = 1;
626 /* Don't sleep before the very first retrieval. */
631 if (opt.waitretry && count > 1)
633 /* If opt.waitretry is specified and this is a retry, wait for
634 COUNT-1 number of seconds, or for opt.waitretry seconds. */
/* Linear backoff, capped at opt.waitretry seconds.  */
635 if (count <= opt.waitretry)
638 xsleep (opt.waitretry);
/* NOTE(review): this branch presumably sits under an `else if
   (opt.wait)' test on a line not visible in this excerpt.  */
642 if (!opt.random_wait || count > 1)
643 /* If random-wait is not specified, or if we are sleeping
644 between retries of the same download, sleep the fixed
649 /* Sleep a random amount of time averaging in opt.wait
650 seconds. The sleeping amount ranges from 0 to
651 opt.wait*2, inclusive. */
652 double waitsecs = 2 * opt.wait * random_float ();
653 DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
654 opt.wait, waitsecs));
660 /* Free the linked list of urlpos. */
/* Walks the list, freeing each node and its owned strings.
   NOTE(review): the loop construct and the freeing of the node itself
   are on lines not visible in this excerpt.  */
662 free_urlpos (struct urlpos *l)
/* Save the successor before the current node is released.  */
666 struct urlpos *next = l->next;
669 xfree_null (l->local_name);
675 /* Rotate FNAME opt.backups times */
/* Shifts fname.1 -> fname.2, ..., then (on lines not visible here,
   presumably via rename) moves FNAME itself to fname.1, making room
   for a fresh download of FNAME.  */
677 rotate_backups(const char *fname)
/* Room for "<fname>.<backups>" plus the NUL terminator.  */
679 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
/* alloca: stack-allocated scratch names, freed on return.  */
680 char *from = (char *)alloca (maxlen);
681 char *to = (char *)alloca (maxlen);
/* Only rotate regular files; skip anything else.  */
685 if (stat (fname, &sb) == 0)
686 if (S_ISREG (sb.st_mode) == 0)
/* Shift existing backups upward, highest number first so nothing is
   overwritten.  */
689 for (i = opt.backups; i > 1; i--)
691 sprintf (from, "%s.%d", fname, i - 1);
692 sprintf (to, "%s.%d", fname, i);
696 sprintf (to, "%s.%d", fname, 1);
700 static int no_proxy_match PARAMS ((const char *, const char **));
702 /* Return the URL of the proxy appropriate for url U. */
/* Returns NULL when no proxy applies (no_proxy match, or no proxy
   configured for U's scheme).  The returned pointer is either an
   option/environment string or the static rewritten_storage buffer,
   so callers must not free it; not reentrant.  */
705 getproxy (struct url *u)
709 static char rewritten_storage[1024];
/* Honor the no_proxy exclusion list before anything else.  */
713 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
/* Per-scheme proxy: command-line/wgetrc option first, then the
   conventional environment variable.  */
719 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
723 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
727 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
732 if (!proxy || !*proxy)
735 /* Handle shorthands. `rewritten_storage' is a kludge to allow
736 getproxy() to return static storage. */
737 rewritten_url = rewrite_shorthand_url (proxy);
/* strncpy alone does not guarantee NUL termination; the next line
   forces it, at the cost of silently truncating very long URLs.  */
740 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
741 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
742 proxy = rewritten_storage;
748 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns nonzero when the proxy SHOULD be used, i.e. when HOST does
   not suffix-match any entry of the NO_PROXY list.  NOTE(review): the
   NULL-list guard presumably lives on a line not visible here.  */
750 no_proxy_match (const char *host, const char **no_proxy)
755     return !sufmatch (no_proxy, host);