2 Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
34 #include <sys/types.h>
37 #endif /* HAVE_UNISTD_H */
43 #endif /* HAVE_STRING_H */
59 # include "gen_sslfunc.h" /* for ssl_iread */
66 /* See the comment in gethttp() why this is needed. */
/* Count of downloads completed in this run; incremented at the end of
   retrieve_url.  NOTE(review): gethttp() is not visible in this chunk --
   confirm the referenced comment there.  */
67 int global_download_count;
69 /* Total size of downloaded files. Used to enforce quota. */
/* Compared against opt.quota (see retrieve_from_file) to stop further
   retrievals once the user-specified download quota is exceeded.  */
70 LARGE_INT total_downloaded_bytes;
80 limit_bandwidth_reset (void)
82 limit_data.chunk_bytes = 0;
83 limit_data.chunk_start = 0;
86 /* Limit the bandwidth by pausing the download for an amount of time.
87 BYTES is the number of bytes received from the network, and TIMER
88 is the timer that started at the beginning of download. */
/* NOTE(review): this extract is incomplete -- the function header
   ("static void"), braces, the declarations of `expected', `t0', `t1',
   and the xsleep() call between the t0 and t1 reads appear to be
   missing from the visible text.  Comments below annotate only the
   statements that are visible.  */
91 limit_bandwidth (long bytes, struct wget_timer *timer)
/* Elapsed time since the current accounting chunk began.  (Presumably
   milliseconds, given the 1000.0 factor below -- confirm wtimer_read's
   unit.)  */
93 double delta_t = wtimer_read (timer) - limit_data.chunk_start;
96 limit_data.chunk_bytes += bytes;
98 /* Calculate the amount of time we expect downloading the chunk
99 should take. If in reality it took less time, sleep to
100 compensate for the difference. */
101 expected = 1000.0 * limit_data.chunk_bytes / opt.limit_rate;
/* We received data faster than the requested rate: sleep off the
   surplus.  */
103 if (expected > delta_t)
/* sleep_adjust carries the error of the previous sleep forward so
   scheduling inaccuracies do not accumulate.  */
105 double slp = expected - delta_t + limit_data.sleep_adjust;
/* Tiny sleeps are deferred into the next chunk rather than performed;
   the guard condition for this branch is not visible here.  */
109 DEBUGP (("deferring a %.2f ms sleep (%ld/%.2f).\n",
110 slp, limit_data.chunk_bytes, delta_t));
113 DEBUGP (("\nsleeping %.2f ms for %ld bytes, adjust %.2f ms\n",
114 slp, limit_data.chunk_bytes, limit_data.sleep_adjust));
/* Bracket the (missing) xsleep call to measure the actual sleep.  */
116 t0 = wtimer_read (timer);
118 wtimer_update (timer);
119 t1 = wtimer_read (timer);
121 /* Due to scheduling, we probably slept slightly longer (or
122 shorter) than desired. Calculate the difference between the
123 desired and the actual sleep, and adjust the next sleep by
125 limit_data.sleep_adjust = slp - (t1 - t0);
/* Start a fresh accounting chunk.  */
128 limit_data.chunk_bytes = 0;
129 limit_data.chunk_start = wtimer_read (timer);
/* Classic two-argument minimum.  Beware: evaluates each argument more
   than once, so do not pass expressions with side effects.  */
133 # define MIN(i, j) ((i) <= (j) ? (i) : (j))
136 /* Reads the contents of file descriptor FD, until it is closed, or a
137 read error occurs. The data is read in 8K chunks, and stored to
138 stream fp, which should have been open for writing.
140 The EXPECTED argument is passed to show_progress() unchanged, but
143 If opt.verbose is set, the progress is also shown. RESTVAL
144 represents a value from which to start downloading (which will be
145 shown accordingly). If RESTVAL is non-zero, the stream should have
146 been open for appending.
148 The function exits and returns codes of 0, -1 and -2 if the
149 connection was closed, there was a read error, or if it could not
150 write to the output stream, respectively. */
/* NOTE(review): many lines of this function are missing from the
   extract (return type, braces, the `res' and `waittm' declarations,
   the fwrite error check producing the -2 result, and the final
   return).  Comments below annotate only the visible statements.  */
153 fd_read_body (int fd, FILE *out, long *len, long restval, long expected,
154 int use_expected, double *elapsed)
/* Static download buffer shared across calls -- safe only because wget
   is single-threaded; not reentrant.  */
158 static char dlbuf[16384];
159 int dlbufsize = sizeof (dlbuf);
161 struct wget_timer *timer = wtimer_allocate ();
162 double last_successful_read_tm;
164 /* The progress gauge, set according to the user preferences. */
165 void *progress = NULL;
167 /* Non-zero if the progress gauge is interactive, i.e. if it can
168 continually update the display. When true, smaller timeout
169 values are used so that the gauge can update the display when
170 data arrives slowly. */
171 int progress_interactive = 0;
177 progress = progress_create (restval, expected);
178 progress_interactive = progress_interactive_p (progress);
182 limit_bandwidth_reset ();
183 wtimer_reset (timer);
184 last_successful_read_tm = 0;
186 /* Use a smaller buffer for low requested bandwidths. For example,
187 with --limit-rate=2k, it doesn't make sense to slurp in 16K of
188 data and then sleep for 8s. With buffer size equal to the limit,
189 we never have to sleep for more than one second. */
190 if (opt.limit_rate && opt.limit_rate < dlbufsize)
191 dlbufsize = opt.limit_rate;
193 /* Read from fd while there is available data.
195 Normally, if expected is 0, it means that it is not known how
196 much data is expected. However, if use_expected is specified,
197 then expected being zero means exactly that. */
198 while (!use_expected || (*len < expected))
/* Never read past the expected length when it is known.  */
200 int amount_to_read = (use_expected
201 ? MIN (expected - *len, dlbufsize) : dlbufsize);
202 double tmout = opt.read_timeout;
203 if (progress_interactive)
206 /* For interactive progress gauges, always specify a ~1s
207 timeout, so that the gauge can be updated regularly even
208 when the data arrives very slowly or stalls. */
/* Seconds since the last successful read; the / 1000 suggests
   wtimer_read returns milliseconds -- confirm.  */
210 waittm = (wtimer_read (timer) - last_successful_read_tm) / 1000;
211 if (waittm + tmout > opt.read_timeout)
213 /* Don't allow waiting time to exceed read timeout. */
214 tmout = opt.read_timeout - waittm;
217 /* We've already exceeded the timeout. */
218 res = -1, errno = ETIMEDOUT;
223 res = fd_read (fd, dlbuf, amount_to_read, tmout);
/* EOF (res == 0) or a real, non-timeout error terminates the loop;
   an ETIMEDOUT here is only the short per-iteration gauge timeout.  */
225 if (res == 0 || (res < 0 && errno != ETIMEDOUT))
228 res = 0; /* timeout */
230 wtimer_update (timer);
/* NOTE(review): the original checks fwrite's result a few lines below
   (missing here) to yield the -2 "write error" return.  */
233 fwrite (dlbuf, 1, res, out);
234 /* Always flush the contents of the network packet. This
235 should not hinder performance: fast downloads will be
236 received in 16K chunks (which stdio would write out
237 anyway), and slow downloads won't be limited by disk
245 last_successful_read_tm = wtimer_read (timer);
/* Throttle when --limit-rate was given.  */
249 limit_bandwidth (res, timer);
253 progress_update (progress, res, wtimer_read (timer));
/* Windows-only console-title percentage display.  */
255 if (use_expected && expected > 0)
256 ws_percenttitle (100.0 * (double)(*len) / (double)expected);
264 progress_finish (progress, wtimer_read (timer));
266 *elapsed = wtimer_read (timer);
267 wtimer_delete (timer);
/* Pointer-to-terminator-finder type; same shape as hunk_terminator_t
   used below.  */
272 typedef const char *(*finder_t) PARAMS ((const char *, int, int));
274 /* Read a hunk of data from FD, up until a terminator. The terminator
275 is whatever the TERMINATOR function determines it to be; for
276 example, it can be a line of data, or the head of an HTTP response.
277 The function returns the data read allocated with malloc.
279 In case of error, NULL is returned. In case of EOF and no data
280 read, NULL is returned and errno set to 0. In case of EOF with
281 data having been read, the data is returned, but it will
282 (obviously) not contain the terminator.
284 The idea is to be able to read a line of input, or otherwise a hunk
285 of text, such as the head of an HTTP request, without crossing the
286 boundary, so that the next call to fd_read etc. reads the data
287 after the hunk. To achieve that, this function does the following:
289 1. Peek at available data.
291 2. Determine whether the peeked data, along with the previously
292 read data, includes the terminator.
294 2a. If yes, read the data until the end of the terminator, and
297 2b. If no, read the peeked data and goto 1.
299 The function is careful to assume as little as possible about the
300 implementation of peeking. For example, every peek is followed by
301 a read. If the read returns a different amount of data, the
302 process is retried until all data arrives safely.
304 BUFSIZE is the size of the initial buffer expected to read all the
305 data in the typical case.
307 This function should be used as a building block for other
308 functions -- see fd_read_line as a simple example. */
/* NOTE(review): several implementation lines are missing from this
   extract (the enclosing for(;;) loop, the `end' declaration, error
   checks on pklen/rdlen, the returns, and the buffer-doubling
   arithmetic).  Comments below annotate the visible statements only.  */
311 fd_read_hunk (int fd, hunk_terminator_t hunk_terminator, int bufsize)
313 char *hunk = xmalloc (bufsize);
314 int tail = 0; /* tail position in HUNK */
319 int pklen, rdlen, remain;
321 /* First, peek at the available data. */
/* bufsize - 1 leaves room for a terminating '\0'; the -1 timeout
   presumably means "use the default read timeout" -- confirm against
   fd_peek.  */
323 pklen = fd_peek (fd, hunk + tail, bufsize - 1 - tail, -1);
/* Ask the caller-supplied predicate whether the terminator is now
   present; it returns a pointer just past the terminator.  */
329 end = hunk_terminator (hunk, tail, pklen);
332 /* The data contains the terminator: we'll drain the data up
333 to the end of the terminator. */
334 remain = end - (hunk + tail);
337 /* No more data needs to be read. */
/* Grow the buffer if draining `remain' more bytes would not fit.  */
341 if (bufsize - 1 < tail + remain)
343 bufsize = tail + remain + 1;
344 hunk = xrealloc (hunk, bufsize);
348 /* No terminator: simply read the data we know is (or should
352 /* Now, read the data. Note that we make no assumptions about
353 how much data we'll get. (Some TCP stacks are notorious for
354 read returning less data than the previous MSG_PEEK.) */
356 rdlen = fd_read (fd, hunk + tail, remain, 0);
369 /* EOF without anything having been read */
375 /* EOF seen: return the data we've read. */
378 if (end && rdlen == remain)
379 /* The terminator was seen and the remaining data drained --
380 we got what we came for. */
383 /* Keep looping until all the data arrives. */
/* Buffer full and still no terminator: enlarge and continue.  */
385 if (tail == bufsize - 1)
388 hunk = xrealloc (hunk, bufsize);
/* Terminator predicate for fd_read_hunk that looks for a newline.
   HUNK holds OLDLEN previously-seen bytes followed by PEEKLEN newly
   peeked bytes; only the new bytes need to be scanned.  Returns a
   pointer one past the '\n' so the returned line includes it, or NULL
   if no newline is present.  (Reconstructed: the extract was missing
   the function header, braces and returns.)  */
static const char *
line_terminator (const char *hunk, int oldlen, int peeklen)
{
  const char *p = memchr (hunk + oldlen, '\n', peeklen);
  if (p)
    /* p+1 because we want the line to include '\n' */
    return p + 1;
  return NULL;
}
403 /* Read one line from FD and return it. The line is allocated using
406 If an error occurs, or if no data can be read, NULL is returned.
407 In the former case errno indicates the error condition, and in the
408 latter case, errno is NULL. */
411 fd_read_line (int fd)
413 return fd_read_hunk (fd, line_terminator, 128);
/* Return a printed representation of the download rate, as
   appropriate for the speed.  If PAD is non-zero, strings will be
   padded to the width of 7 characters (xxxx.xx).

   Returns a pointer to static storage: the result is overwritten by
   the next call and must not be freed.  (Reconstructed: the extract
   was missing the function header, the result buffer and the return.)  */
char *
retr_rate (long bytes, double msecs, int pad)
{
  static char res[20];
  static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
  int units = 0;

  double dlrate = calc_rate (bytes, msecs, &units);
  /* Both format strings are literals; the ternary only selects the
     padded variant, so this is not a format-string hazard.  */
  sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);

  return res;
}
/* Calculate the download rate and trim it as appropriate for the
   speed.  Appropriate means that if rate is greater than 1K/s,
   kilobytes are used, and if rate is greater than 1MB/s, megabytes
   are used.

   UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
   GB/s.  Returns the scaled rate.  (Reconstructed: the extract was
   missing the function header, asserts, branches and return.)  */
double
calc_rate (long bytes, double msecs, int *units)
{
  double dlrate;

  assert (msecs >= 0);
  assert (bytes >= 0);

  if (msecs == 0)
    /* If elapsed time is exactly zero, it means we're under the
       granularity of the timer.  This often happens on systems that
       use time() for the timer.  */
    msecs = wtimer_granularity ();

  dlrate = (double)1000 * bytes / msecs;
  if (dlrate < 1024.0)
    *units = 0;
  else if (dlrate < 1024.0 * 1024.0)
    *units = 1, dlrate /= 1024.0;
  else if (dlrate < 1024.0 * 1024.0 * 1024.0)
    *units = 2, dlrate /= (1024.0 * 1024.0);
  else
    /* Maybe someone will need this, one day. */
    *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);

  return dlrate;
}
467 /* Maximum number of allowed redirections. 20 was chosen as a
468 "reasonable" value, which is low enough to not cause havoc, yet
469 high enough to guarantee that normal retrievals will not be hurt by
472 #define MAX_REDIRECTIONS 20
474 #define SUSPEND_POST_DATA do { \
475 post_data_suspended = 1; \
476 saved_post_data = opt.post_data; \
477 saved_post_file_name = opt.post_file_name; \
478 opt.post_data = NULL; \
479 opt.post_file_name = NULL; \
482 #define RESTORE_POST_DATA do { \
483 if (post_data_suspended) \
485 opt.post_data = saved_post_data; \
486 opt.post_file_name = saved_post_file_name; \
487 post_data_suspended = 0; \
491 static char *getproxy PARAMS ((struct url *));
493 /* Retrieve the given URL. Decides which loop to call -- HTTP, FTP,
496 /* #### This function should be rewritten so it doesn't return from
/* NOTE(review): a large number of lines of retrieve_url are missing
   from this extract (the uerr_t `result' declaration, braces, several
   error-path returns, the `redirected:' loop plumbing, SUSPEND/RESTORE
   macro invocations, and the final return).  Comments below annotate
   only the statements that are visible.  */
500 retrieve_url (const char *origurl, char **file, char **newloc,
501 const char *refurl, int *dt)
505 int location_changed, dummy;
506 char *mynewloc, *proxy;
507 struct url *u, *proxy_url;
508 int up_error_code; /* url parse error code */
/* Guards against redirect loops; compared with MAX_REDIRECTIONS.  */
510 int redirection_count = 0;
/* POST state saved/restored across redirects -- a redirect of a POST
   is retrieved with GET (see comment further below).  */
512 int post_data_suspended = 0;
513 char *saved_post_data = NULL;
514 char *saved_post_file_name = NULL;
516 /* If dt is NULL, use local storage. */
/* Work on a heap copy so redirects can replace the URL string.  */
522 url = xstrdup (origurl);
528 u = url_parse (url, &up_error_code);
531 logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
/* --referer overrides the caller-supplied referrer.  */
537 refurl = opt.referer;
/* Pick the proxy (if any) appropriate for U's scheme.  */
546 proxy = getproxy (u);
549 /* Parse the proxy URL. */
550 proxy_url = url_parse (proxy, &up_error_code);
553 logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
554 proxy, url_error (up_error_code));
/* Only an HTTP proxy (or one matching the target scheme) is usable.  */
559 if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
561 logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
562 url_free (proxy_url);
/* Dispatch: HTTP(S), or anything going through an HTTP proxy, uses
   http_loop; plain FTP uses ftp_loop.  */
569 if (u->scheme == SCHEME_HTTP
571 || u->scheme == SCHEME_HTTPS
573 || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
575 result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
577 else if (u->scheme == SCHEME_FTP)
579 /* If this is a redirection, we must not allow recursive FTP
580 retrieval, so we save recursion to oldrec, and restore it
582 int oldrec = opt.recursive;
583 if (redirection_count)
585 result = ftp_loop (u, dt, proxy_url);
586 opt.recursive = oldrec;
588 /* There is a possibility of having HTTP being redirected to
589 FTP. In these cases we must decide whether the text is HTML
590 according to the suffix. The HTML suffixes are `.html',
591 `.htm' and a few others, case-insensitive. */
592 if (redirection_count && local_file && u->scheme == SCHEME_FTP)
594 if (has_html_suffix_p (local_file))
601 url_free (proxy_url);
/* NEWLOCATION from the scheme loop signals an HTTP redirect.  */
605 location_changed = (result == NEWLOCATION);
606 if (location_changed)
608 char *construced_newloc;
609 struct url *newloc_parsed;
611 assert (mynewloc != NULL);
616 /* The HTTP specs only allow absolute URLs to appear in
617 redirects, but a ton of boneheaded webservers and CGIs out
618 there break the rules and use relative URLs, and popular
619 browsers are lenient about this, so wget should be too. */
620 construced_newloc = uri_merge (url, mynewloc);
622 mynewloc = construced_newloc;
624 /* Now, see if this new location makes sense. */
625 newloc_parsed = url_parse (mynewloc, &up_error_code);
628 logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
629 url_error (up_error_code));
637 /* Now mynewloc will become newloc_parsed->url, because if the
638 Location contained relative paths like .././something, we
639 don't want that propagating as url. */
641 mynewloc = xstrdup (newloc_parsed->url);
643 /* Check for max. number of redirections. */
644 if (++redirection_count > MAX_REDIRECTIONS)
646 logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
648 url_free (newloc_parsed);
661 /* If we're being redirected from POST, we don't want to POST
662 again. Many requests answer POST with a redirection to an
663 index page; that redirection is clearly a GET. We "suspend"
664 POST data for the duration of the redirections, and restore
665 it when we're done. */
666 if (!post_data_suspended)
/* Record the finished download for link conversion et al.  */
676 register_download (u->url, local_file);
677 if (redirection_count && 0 != strcmp (origurl, u->url))
678 register_redirection (origurl, u->url);
680 register_html (u->url, local_file);
685 *file = local_file ? local_file : NULL;
687 xfree_null (local_file);
691 if (redirection_count)
705 ++global_download_count;
711 /* Find the URLs in the file and call retrieve_url() for each of
712 them. If HTML is non-zero, treat the file as HTML, and construct
713 the URLs accordingly.
715 If opt.recursive is set, call retrieve_tree() for each file. */
/* NOTE(review): lines are missing from this extract (the uerr_t
   return type, the `status'/`dt' declarations, braces, the `continue'
   and quota `break', and the final `return status').  */
718 retrieve_from_file (const char *file, int html, int *count)
721 struct urlpos *url_list, *cur_url;
/* Extract the URL list either from an HTML document or from a plain
   text file with one URL per line.  */
723 url_list = (html ? get_urls_html (file, NULL, NULL)
724 : get_urls_file (file));
725 status = RETROK; /* Suppose everything is OK. */
726 *count = 0; /* Reset the URL count. */
728 for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
730 char *filename = NULL, *new_file = NULL;
733 if (cur_url->ignore_when_downloading)
/* Stop the whole run once the --quota limit is exceeded.  */
736 if (opt.quota && total_downloaded_bytes > opt.quota)
/* Recursive/page-requisite retrieval goes through retrieve_tree;
   FTP URLs and plain downloads go through retrieve_url.  */
741 if ((opt.recursive || opt.page_requisites)
742 && cur_url->url->scheme != SCHEME_FTP)
743 status = retrieve_tree (cur_url->url->url);
745 status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
/* --delete-after: remove the local copy once retrieved.  */
747 if (filename && opt.delete_after && file_exists_p (filename))
749 DEBUGP (("Removing file due to --delete-after in"
750 " retrieve_from_file():\n"));
751 logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
752 if (unlink (filename))
753 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
757 xfree_null (new_file);
758 xfree_null (filename);
761 /* Free the linked list of URL-s. */
762 free_urlpos (url_list);
767 /* Print `giving up', or `retrying', depending on the impending
768 action. N1 and N2 are the attempt number and the attempt limit. */
770 printwhat (int n1, int n2)
772 logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
775 /* If opt.wait or opt.waitretry are specified, and if certain
776 conditions are met, sleep the appropriate number of seconds. See
777 the documentation of --wait and --waitretry for more information.
779 COUNT is the count of current retrieval, beginning with 1. */
/* NOTE(review): this extract is missing the function header, braces,
   the early return for the first retrieval, the linear-backoff
   xsleep(count - 1) branch, the fixed xsleep(opt.wait), the
   xsleep(waitsecs) call, and the trailing `first_retrieval = 0'.  */
782 sleep_between_retrievals (int count)
/* Persists across calls so that only the very first retrieval of the
   entire run skips sleeping.  */
784 static int first_retrieval = 1;
788 /* Don't sleep before the very first retrieval. */
793 if (opt.waitretry && count > 1)
795 /* If opt.waitretry is specified and this is a retry, wait for
796 COUNT-1 number of seconds, or for opt.waitretry seconds. */
797 if (count <= opt.waitretry)
/* Linear backoff capped at opt.waitretry seconds.  */
800 xsleep (opt.waitretry);
804 if (!opt.random_wait || count > 1)
805 /* If random-wait is not specified, or if we are sleeping
806 between retries of the same download, sleep the fixed
811 /* Sleep a random amount of time averaging in opt.wait
812 seconds. The sleeping amount ranges from 0 to
813 opt.wait*2, inclusive. */
814 double waitsecs = 2 * opt.wait * random_float ();
815 DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
816 opt.wait, waitsecs));
822 /* Free the linked list of urlpos. */
824 free_urlpos (struct urlpos *l)
828 struct urlpos *next = l->next;
831 xfree_null (l->local_name);
837 /* Rotate FNAME opt.backups times */
839 rotate_backups(const char *fname)
841 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
842 char *from = (char *)alloca (maxlen);
843 char *to = (char *)alloca (maxlen);
847 if (stat (fname, &sb) == 0)
848 if (S_ISREG (sb.st_mode) == 0)
851 for (i = opt.backups; i > 1; i--)
853 sprintf (from, "%s.%d", fname, i - 1);
854 sprintf (to, "%s.%d", fname, i);
858 sprintf (to, "%s.%d", fname, 1);
862 static int no_proxy_match PARAMS ((const char *, const char **));
864 /* Return the URL of the proxy appropriate for url U. */
867 getproxy (struct url *u)
871 static char rewritten_storage[1024];
875 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
881 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
885 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
889 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
894 if (!proxy || !*proxy)
897 /* Handle shorthands. `rewritten_storage' is a kludge to allow
898 getproxy() to return static storage. */
899 rewritten_url = rewrite_shorthand_url (proxy);
902 strncpy (rewritten_storage, rewritten_url, sizeof (rewritten_storage));
903 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
904 proxy = rewritten_storage;
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns non-zero when HOST should go through the proxy, zero when a
   --no-proxy suffix matches it.  A NULL NO_PROXY list means "proxy
   everything".  (Reconstructed: the extract was missing the function
   header, braces and the NULL-list branch.)  */
static int
no_proxy_match (const char *host, const char **no_proxy)
{
  if (!no_proxy)
    return 1;
  else
    return !sufmatch (no_proxy, host);
}