2 Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
24 #include <sys/types.h>
27 #endif /* HAVE_UNISTD_H */
33 #endif /* HAVE_STRING_H */
48 # include "gen_sslfunc.h" /* for ssl_iread */
/* See the comment in gethttp() why this is needed. */
/* Incremented once per completed retrieve_url() call in this file;
   NOTE(review): the consumers in gethttp() are not visible here.  */
int global_download_count;
65 limit_bandwidth_reset (void)
68 limit_data.dltime = 0;
/* Limit the bandwidth by pausing the download for an amount of time.
   BYTES is the number of bytes received from the network, DELTA is
   how long it took to receive them, DLTIME the current download time,
   TIMER the timer, and ADJUSTMENT the previous. */
/* NOTE(review): this listing is incomplete -- the return type,
   braces, the declaration of `expected', the actual sleep call and
   the defer-threshold test are missing from the visible text.
   Comments below describe only what is visible.  */
limit_bandwidth (long bytes, long delta)
  /* Accumulate bytes and elapsed time since the last reset.  */
  limit_data.bytes += bytes;
  limit_data.dltime += delta;
  /* Time in ms the transfer *should* have taken at opt.limit_rate
     (bytes per second).  */
  expected = (long)(1000.0 * limit_data.bytes / opt.limit_rate);
  /* If we are ahead of schedule, sleep off the difference.  */
  if (expected > limit_data.dltime)
      long slp = expected - limit_data.dltime;
      /* Short sleeps are deferred (left accumulated in limit_data)
	 until a later call; the threshold test itself is not
	 visible in this listing.  */
      DEBUGP (("deferring a %ld ms sleep (%ld/%ld) until later.\n",
	       slp, limit_data.bytes, limit_data.dltime));
      DEBUGP (("sleeping %ld ms\n", slp));
  limit_data.dltime = 0;
/* NOTE: evaluates its arguments more than once; do not pass
   expressions with side effects.  */
#define MIN(i, j) ((i) <= (j) ? (i) : (j))

/* Reads the contents of file descriptor FD, until it is closed, or a
   read error occurs.  The data is read in 8K chunks, and stored to
   stream fp, which should have been open for writing.  If BUF is
   non-NULL and its file descriptor is equal to FD, flush RBUF first.
   This function will *not* use the rbuf_* functions!

   The EXPECTED argument is passed to show_progress() unchanged, but

   If opt.verbose is set, the progress is also shown.  RESTVAL
   represents a value from which to start downloading (which will be
   shown accordingly).  If RESTVAL is non-zero, the stream should have
   been open for appending.

   The function exits and returns codes of 0, -1 and -2 if the
   connection was closed, there was a read error, or if it could not
   write to the output stream, respectively.

   IMPORTANT: The function flushes the contents of the buffer in
   rbuf_flush() before actually reading from fd.  If you wish to read
   from fd immediately, flush or discard the buffer. */
/* NOTE(review): this listing is incomplete -- the return type,
   braces, several declarations (`res', `sz', the read buffer `c')
   and a number of statements are missing from the visible text.
   Comments below describe only what is visible.  */
get_contents (int fd, FILE *fp, long *len, long restval, long expected,
	      struct rbuf *rbuf, int use_expected, long *elapsed)
  void *progress = NULL;
  /* Wall-clock timer used for the progress display and for
     bandwidth limiting.  */
  struct wget_timer *timer = wtimer_allocate ();
  long dltime = 0, last_dltime = 0;

  /* Presumably created only under a verbosity guard that is not
     visible in this listing -- TODO confirm.  */
  progress = progress_create (restval, expected);

  /* Flush any data already buffered in RBUF for this descriptor
     before reading from the descriptor itself.  */
  if (rbuf && RBUF_FD (rbuf) == fd)
    while ((res = rbuf_flush (rbuf, c, sizeof (c))) != 0)
      fwrite (c, sizeof (char), res, fp);
  progress_update (progress, sz, 0);

  /* Start rate-limit accounting and timing from zero.  */
  limit_bandwidth_reset ();
  wtimer_reset (timer);

  /* Read from fd while there is available data.

     Normally, if expected is 0, it means that it is not known how
     much data is expected.  However, if use_expected is specified,
     then expected being zero means exactly that.  */
  while (!use_expected || (*len < expected))
      /* Never read past EXPECTED when a total length is known.  */
      int amount_to_read = (use_expected
			    ? MIN (expected - *len, sizeof (c))
      res = ssl_iread (rbuf->ssl, c, amount_to_read);
#endif /* HAVE_SSL */
      res = iread (fd, c, amount_to_read);
      fwrite (c, sizeof (char), res, fp);
      /* Always flush the contents of the network packet.  This
	 should not be adverse to performance, as the network
	 packets typically won't be too tiny anyway. */
      /* If bandwidth is not limited, one call to wtimer_elapsed
	 would presumably suffice -- the original comment is cut off
	 in this listing.  */
      dltime = wtimer_elapsed (timer);
      /* Sleep as needed to hold the average rate at opt.limit_rate,
	 then re-read the clock, since the sleep advanced it.  */
      limit_bandwidth (res, dltime - last_dltime);
      dltime = wtimer_elapsed (timer);
      last_dltime = dltime;
      progress_update (progress, res, dltime);
  progress_finish (progress, dltime);
  wtimer_delete (timer);
/* Return a printed representation of the download rate, as
   appropriate for the speed.  If PAD is non-zero, strings will be
   padded to the width of 7 characters (xxxx.xx).

   The result points to a static buffer: the caller must not free it,
   and the function is not reentrant.  */
char *
retr_rate (long bytes, long msecs, int pad)
{
  static char res[20];
  static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
  int units = 0;

  /* calc_rate scales the rate down and sets UNITS to the matching
     index into rate_names.  */
  double dlrate = calc_rate (bytes, msecs, &units);
  sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);

  return res;
}
/* Calculate the download rate and trim it as appropriate for the
   speed.  Appropriate means that if rate is greater than 1K/s,
   kilobytes are used, and if rate is greater than 1MB/s, megabytes
   are used.

   UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
   GB/s.  Returns the rate, scaled to the chosen unit.  */
double
calc_rate (long bytes, long msecs, int *units)
{
  double dlrate;

  if (msecs == 0)
    /* If elapsed time is 0, it means we're under the granularity of
       the timer.  This often happens on systems that use time() for
       the timer.  Substitute the granularity to avoid dividing by
       zero.  */
    msecs = wtimer_granularity ();

  dlrate = (double)1000 * bytes / msecs;
  if (dlrate < 1024.0)
    *units = 0;
  else if (dlrate < 1024.0 * 1024.0)
    *units = 1, dlrate /= 1024.0;
  else if (dlrate < 1024.0 * 1024.0 * 1024.0)
    *units = 2, dlrate /= (1024.0 * 1024.0);
  else
    /* Maybe someone will need this one day.  More realistically, it
       will get tickled by buggy timers. */
    *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);

  return dlrate;
}
/* Mapper for register_all_redirections() below: registers each
   (redirected_from -> redirected_to) pair, skipping the degenerate
   case where the source equals the final destination.  VALUE is
   unused.  Returns 0 so the hash-table traversal continues.  */
static int
register_redirections_mapper (void *key, void *value, void *arg)
{
  const char *redirected_from = (const char *)key;
  const char *redirected_to = (const char *)arg;
  if (0 != strcmp (redirected_from, redirected_to))
    register_redirection (redirected_from, redirected_to);
  return 0;
}
287 /* Register the redirections that lead to the successful download of
288 this URL. This is necessary so that the link converter can convert
289 redirected URLs to the local file. */
292 register_all_redirections (struct hash_table *redirections, const char *final)
294 hash_table_map (redirections, register_redirections_mapper, (void *)final);
/* Non-zero if URL U should go through a proxy: a proxy must be
   configured for U's scheme, and U's host must pass the no_proxy
   filter.  */
#define USE_PROXY_P(u) (opt.use_proxy && getproxy((u)->scheme)		\
			&& no_proxy_match((u)->host,			\
					  (const char **)opt.no_proxy))

/* Maximum number of allowed redirections.  20 was chosen as a
   "reasonable" value, which is low enough to not cause havoc, yet
   high enough to guarantee that normal retrievals will not be hurt by
   the check.  */
#define MAX_REDIRECTIONS 20

/* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
   or a proxied fetch -- and follows any redirections that result.
   (The original comment is cut off in this listing.)  */
/* NOTE(review): this listing is incomplete -- the return type, most
   braces, several declarations (`u', `url', `result', `local_file',
   `use_proxy'), the error-path `return's and the loop/goto that
   re-enters after a redirection are all missing from the visible
   text.  Comments below describe only what is visible.  */
retrieve_url (const char *origurl, char **file, char **newloc,
	      const char *refurl, int *dt)
  int location_changed, dummy;
  char *mynewloc, *proxy;
  int up_error_code;		/* url parse error code */
  /* Created lazily on the first redirection; NULL means none seen
     yet.  */
  struct hash_table *redirections = NULL;
  int redirection_count = 0;

  /* If dt is NULL, just ignore it. */
  /* Work on a private copy of the URL, since redirections replace
     it below.  */
  url = xstrdup (origurl);
  u = url_parse (url, &up_error_code);
  logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
  string_set_free (redirections);
  refurl = opt.referer;

  use_proxy = USE_PROXY_P (u);
  struct url *proxy_url;

  /* Get the proxy server for the current scheme. */
  proxy = getproxy (u->scheme);
  logputs (LOG_NOTQUIET, _("Could not find proxy host.\n"));
  string_set_free (redirections);

  /* Parse the proxy URL. */
  proxy_url = url_parse (proxy, &up_error_code);
  logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
	     proxy, url_error (up_error_code));
  string_set_free (redirections);
  /* Only HTTP proxies are supported here.  */
  if (proxy_url->scheme != SCHEME_HTTP)
      logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
      url_free (proxy_url);
      string_set_free (redirections);
  /* Proxied fetch goes through the HTTP loop regardless of the
     target scheme.  */
  result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
  url_free (proxy_url);
  else if (u->scheme == SCHEME_HTTP
	   || u->scheme == SCHEME_HTTPS
    result = http_loop (u, &mynewloc, &local_file, refurl, dt, NULL);
  else if (u->scheme == SCHEME_FTP)
      /* If this is a redirection, we must not allow recursive FTP
	 retrieval, so we save recursion to oldrec, and restore it
	 afterwards.  (Comment completed; cut off in listing.)  */
      int oldrec = opt.recursive;
      result = ftp_loop (u, dt);
      opt.recursive = oldrec;

      /* There is a possibility of having HTTP being redirected to
	 FTP.  In these cases we must decide whether the text is HTML
	 according to the suffix.  The HTML suffixes are `.html' and
	 `.htm', case-insensitive. */
      if (redirections && local_file && u->scheme == SCHEME_FTP)
	  char *suf = suffix (local_file);
	  if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))

  location_changed = (result == NEWLOCATION);
  if (location_changed)
      char *construced_newloc;
      struct url *newloc_parsed;

      /* The *_loop call above must have produced mynewloc.  */
      assert (mynewloc != NULL);

      /* The HTTP specs only allow absolute URLs to appear in
	 redirects, but a ton of boneheaded webservers and CGIs out
	 there break the rules and use relative URLs, and popular
	 browsers are lenient about this, so wget should be too. */
      construced_newloc = uri_merge (url, mynewloc);
      mynewloc = construced_newloc;

      /* Now, see if this new location makes sense. */
      newloc_parsed = url_parse (mynewloc, &up_error_code);
      logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
		 url_error (up_error_code));
      string_set_free (redirections);

      /* Now mynewloc will become newloc_parsed->url, because if the
	 Location contained relative paths like .././something, we
	 don't want that propagating as url. */
      mynewloc = xstrdup (newloc_parsed->url);

      redirections = make_string_hash_table (0);
      /* Add current URL immediately so we can detect it as soon
	 as possible in case of a cycle. */
      string_set_add (redirections, u->url);

      /* The new location is OK.  Check for max. number of
	 redirections.  (Comment completed; cut off in listing.)  */
      if (++redirection_count > MAX_REDIRECTIONS)
	  logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
	  url_free (newloc_parsed);
	  string_set_free (redirections);

      /* Check for redirection cycle by
	 peeking through the history of redirections. */
      if (string_set_contains (redirections, newloc_parsed->url))
	  logprintf (LOG_NOTQUIET, _("%s: Redirection cycle detected.\n"),
	  url_free (newloc_parsed);
	  string_set_free (redirections);

      string_set_add (redirections, newloc_parsed->url);

  /* Record the download and its redirection history for the link
     converter.  */
  register_download (u->url, local_file);
  register_all_redirections (redirections, u->url);
  register_html (u->url, local_file);

  /* Hand the local file name back to the caller, who owns it
     unless it was not requested (then it is freed below).  */
  *file = local_file ? local_file : NULL;
  FREE_MAYBE (local_file);
  string_set_free (redirections);

  ++global_download_count;
/* Find the URLs in the file and call retrieve_url() for each of
   them.  If HTML is non-zero, treat the file as HTML, and construct
   the URLs accordingly.

   If opt.recursive is set, call recursive_retrieve() for each file. */
/* NOTE(review): this listing is incomplete -- the return type,
   braces, the declarations of `status' and `dt', and the bodies of
   the first two `if's inside the loop (presumably a `continue' and
   a quota-exceeded break) are missing from the visible text.  */
retrieve_from_file (const char *file, int html, int *count)
  struct urlpos *url_list, *cur_url;

  /* Extract the URL list from the input file, either parsed as HTML
     or as a plain list of URLs.  */
  url_list = (html ? get_urls_html (file, NULL, NULL)
	      : get_urls_file (file));
  status = RETROK;             /* Suppose everything is OK. */
  *count = 0;                  /* Reset the URL count. */

  for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
      char *filename = NULL, *new_file = NULL;

      /* Skip URLs flagged as not-to-download.  */
      if (cur_url->ignore_when_downloading)
      /* Stop retrieving once the download quota is exceeded.  */
      if (downloaded_exceeds_quota ())
      /* Recursive retrieval, except for FTP URLs, goes through the
	 tree retriever; everything else is a single fetch.  */
      if (opt.recursive && cur_url->url->scheme != SCHEME_FTP)
	status = retrieve_tree (cur_url->url->url);
      status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);

      /* Honor --delete-after: remove the file right after fetching
	 it.  */
      if (filename && opt.delete_after && file_exists_p (filename))
	  DEBUGP (("Removing file due to --delete-after in"
		   " retrieve_from_file():\n"));
	  logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
	  if (unlink (filename))
	    logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));

      FREE_MAYBE (new_file);
      FREE_MAYBE (filename);

  /* Free the linked list of URL-s. */
  free_urlpos (url_list);
603 /* Print `giving up', or `retrying', depending on the impending
604 action. N1 and N2 are the attempt number and the attempt limit. */
606 printwhat (int n1, int n2)
608 logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
611 /* Increment opt.downloaded by BY_HOW_MUCH. If an overflow occurs,
612 set opt.downloaded_overflow to 1. */
614 downloaded_increase (unsigned long by_how_much)
617 if (opt.downloaded_overflow)
619 old = opt.downloaded;
620 opt.downloaded += by_how_much;
621 if (opt.downloaded < old) /* carry flag, where are you when I
625 opt.downloaded_overflow = 1;
626 opt.downloaded = ~((VERY_LONG_TYPE)0);
630 /* Return non-zero if the downloaded amount of bytes exceeds the
631 desired quota. If quota is not set or if the amount overflowed, 0
634 downloaded_exceeds_quota (void)
638 if (opt.downloaded_overflow)
639 /* We don't really know. (Wildly) assume not. */
642 return opt.downloaded > opt.quota;
645 /* If opt.wait or opt.waitretry are specified, and if certain
646 conditions are met, sleep the appropriate number of seconds. See
647 the documentation of --wait and --waitretry for more information.
649 COUNT is the count of current retrieval, beginning with 1. */
652 sleep_between_retrievals (int count)
654 static int first_retrieval = 1;
656 if (!first_retrieval && (opt.wait || opt.waitretry))
658 if (opt.waitretry && count > 1)
660 /* If opt.waitretry is specified and this is a retry, wait
661 for COUNT-1 number of seconds, or for opt.waitretry
663 if (count <= opt.waitretry)
666 sleep (opt.waitretry);
670 /* Otherwise, check if opt.wait is specified. If so, sleep. */
671 if (count > 1 || !opt.random_wait)
675 /* Sleep a random amount of time averaging in opt.wait
676 seconds. The sleeping amount ranges from 0 to
677 opt.wait*2, inclusive. */
678 int waitsecs = random_number (opt.wait * 2 + 1);
680 DEBUGP (("sleep_between_retrievals: norm=%ld,fuzz=%ld,sleep=%d\n",
681 opt.wait, waitsecs - opt.wait, waitsecs));