2 Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
34 #include <sys/types.h>
37 #endif /* HAVE_UNISTD_H */
43 #endif /* HAVE_STRING_H */
59 # include "gen_sslfunc.h" /* for ssl_iread */
66 /* Total size of downloaded files. Used to enforce quota. */
67 LARGE_INT total_downloaded_bytes;
69 /* If non-NULL, the stream to which output should be written. This
70 stream is initialized when `-O' is used. */
/* NOTE(review): the actual `output_stream` FILE* declaration is not
   visible in this chunk -- presumably it sits between these comments;
   confirm against the full file. */
73 /* Whether output_document is a regular file we can manipulate,
74 i.e. not `-' or a device file. */
75 int output_stream_regular;
/* Reset the per-chunk bandwidth-throttling state: zero the byte count
   and the chunk start timestamp in `limit_data`.  Called before a new
   download so the throttling in limit_bandwidth() starts fresh. */
84 limit_bandwidth_reset (void)
86 limit_data.chunk_bytes = 0;
87 limit_data.chunk_start = 0;
90 /* Limit the bandwidth by pausing the download for an amount of time.
91 BYTES is the number of bytes received from the network, and TIMER
92 is the timer that started at the beginning of download. */
95 limit_bandwidth (wgint bytes, struct wget_timer *timer)
97 double delta_t = wtimer_read (timer) - limit_data.chunk_start;
100 limit_data.chunk_bytes += bytes;
102 /* Calculate the amount of time we expect downloading the chunk
103 should take. If in reality it took less time, sleep to
104 compensate for the difference. */
105 expected = 1000.0 * limit_data.chunk_bytes / opt.limit_rate;
107 if (expected > delta_t)
/* sleep_adjust carries over the over/under-sleep error measured in the
   previous call, so scheduling jitter does not accumulate over time. */
109 double slp = expected - delta_t + limit_data.sleep_adjust;
/* Very small sleeps are deferred (accumulated into the next chunk)
   rather than performed -- the branch condition is elided from this
   view; confirm the threshold against the full file. */
113 DEBUGP (("deferring a %.2f ms sleep (%s/%.2f).\n",
114 slp, number_to_static_string (limit_data.chunk_bytes),
118 DEBUGP (("\nsleeping %.2f ms for %s bytes, adjust %.2f ms\n",
119 slp, number_to_static_string (limit_data.chunk_bytes),
120 limit_data.sleep_adjust));
/* Bracket the actual sleep (xsleep call elided from this view) with
   timer reads so we can measure how long we really slept. */
122 t0 = wtimer_read (timer);
124 wtimer_update (timer);
125 t1 = wtimer_read (timer);
127 /* Due to scheduling, we probably slept slightly longer (or
128 shorter) than desired. Calculate the difference between the
129 desired and the actual sleep, and adjust the next sleep by
131 limit_data.sleep_adjust = slp - (t1 - t0);
/* Start accounting for a new chunk. */
134 limit_data.chunk_bytes = 0;
135 limit_data.chunk_start = wtimer_read (timer);
/* NOTE(review): classic MIN macro -- evaluates each argument twice, so
   never pass expressions with side effects (e.g. MIN(i++, j)). */
139 # define MIN(i, j) ((i) <= (j) ? (i) : (j))
142 /* Write data in BUF to OUT. However, if *SKIP is non-zero, skip that
143 amount of data and decrease SKIP. Increment *TOTAL by the amount
147 write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
/* NOTE(review): the fwrite return value is not checked here; ferror()
   below is what detects a short/failed write. */
166 fwrite (buf, 1, bufsize, out);
169 /* Immediately flush the downloaded data. This should not hinder
170 performance: fast downloads will arrive in large 16K chunks
171 (which stdio would write out immediately anyway), and slow
172 downloads wouldn't be limited by disk speed. */
/* Success is "no error flag set on the stream". */
174 return !ferror (out);
177 /* Read the contents of file descriptor FD until it the connection
178 terminates or a read error occurs. The data is read in portions of
179 up to 16K and written to OUT as it arrives. If opt.verbose is set,
180 the progress is shown.
182 TOREAD is the amount of data expected to arrive, normally only used
183 by the progress gauge.
185 STARTPOS is the position from which the download starts, used by
186 the progress gauge. If QTYREAD is non-NULL, the value it points to
187 is incremented by the amount of data read from the network. If
188 QTYWRITTEN is non-NULL, the value it points to is incremented by
189 the amount of data written to disk. The time it took to download
190 the data (in milliseconds) is stored to ELAPSED.
192 The function exits and returns the amount of data read. In case of
193 error while reading data, -1 is returned. In case of error while
194 writing data, -2 is returned. */
197 fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
198 wgint *qtyread, wgint *qtywritten, double *elapsed, int flags)
/* Static download buffer: this function is not reentrant/thread-safe. */
202 static char dlbuf[16384];
203 int dlbufsize = sizeof (dlbuf);
205 struct wget_timer *timer = NULL;
206 double last_successful_read_tm = 0;
208 /* The progress gauge, set according to the user preferences. */
209 void *progress = NULL;
211 /* Non-zero if the progress gauge is interactive, i.e. if it can
212 continually update the display. When true, smaller timeout
213 values are used so that the gauge can update the display when
214 data arrives slowly. */
215 int progress_interactive = 0;
217 int exact = flags & rb_read_exactly;
220 /* How much data we've read/written. */
222 wgint sum_written = 0;
/* rb_skip_startpos: the caller wants the first STARTPOS bytes read from
   the network but not written to OUT (skip assignment elided here). */
224 if (flags & rb_skip_startpos)
229 /* If we're skipping STARTPOS bytes, pass 0 as the INITIAL
230 argument to progress_create because the indicator doesn't
231 (yet) know about "skipping" data. */
232 progress = progress_create (skip ? 0 : startpos, startpos + toread);
233 progress_interactive = progress_interactive_p (progress);
237 limit_bandwidth_reset ();
239 /* A timer is needed for tracking progress, for throttling, and for
240 tracking elapsed time. If either of these are requested, start
242 if (progress || opt.limit_rate || elapsed)
244 timer = wtimer_new ();
245 last_successful_read_tm = 0;
248 /* Use a smaller buffer for low requested bandwidths. For example,
249 with --limit-rate=2k, it doesn't make sense to slurp in 16K of
250 data and then sleep for 8s. With buffer size equal to the limit,
251 we never have to sleep for more than one second. */
252 if (opt.limit_rate && opt.limit_rate < dlbufsize)
253 dlbufsize = opt.limit_rate;
255 /* Read from FD while there is data to read. Normally toread==0
256 means that it is unknown how much data is to arrive. However, if
257 EXACT is set, then toread==0 means what it says: that no data
259 while (!exact || (sum_read < toread))
261 int rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize;
262 double tmout = opt.read_timeout;
263 if (progress_interactive)
265 /* For interactive progress gauges, always specify a ~1s
266 timeout, so that the gauge can be updated regularly even
267 when the data arrives very slowly or stalls. */
269 if (opt.read_timeout)
/* waittm = seconds of idle time since the last successful read. */
272 waittm = (wtimer_read (timer) - last_successful_read_tm) / 1000;
273 if (waittm + tmout > opt.read_timeout)
275 /* Don't let total idle time exceed read timeout. */
276 tmout = opt.read_timeout - waittm;
279 /* We've already exceeded the timeout. */
280 ret = -1, errno = ETIMEDOUT;
286 ret = fd_read (fd, dlbuf, rdsize, tmout);
288 /* when retrieving from http-proxy wget sometimes does not trust the
289 * file length reported by server.
290 * this check is to tell wget not to stubbornly try to read again and
291 * again until another errno code was received. */
292 if ( ret == -1 && errno == ETIMEDOUT && sum_read == toread && toread > 0 )
295 if (ret == 0 || (ret < 0 && errno != ETIMEDOUT))
296 break; /* read error */
/* A plain ETIMEDOUT with an interactive gauge is not fatal; treat it
   as zero bytes read and loop again so the gauge can refresh. */
298 ret = 0; /* read timeout */
300 if (progress || opt.limit_rate)
302 wtimer_update (timer);
/* Remember when we last got actual data, for the idle-timeout math
   above (the ret > 0 guard is elided from this view). */
304 last_successful_read_tm = wtimer_read (timer);
/* -2 signals a WRITE error to the caller, distinct from -1 (read). */
310 if (!write_data (out, dlbuf, ret, &skip, &sum_written))
318 limit_bandwidth (ret, timer);
321 progress_update (progress, ret, wtimer_read (timer));
/* Windows console-title percentage (WINDOWS-only block in the full
   file -- the #ifdef is elided from this view). */
323 if (toread > 0 && !opt.quiet)
324 ws_percenttitle (100.0 *
325 (startpos + sum_read) / (startpos + toread));
333 progress_finish (progress, wtimer_read (timer));
336 *elapsed = wtimer_read (timer);
338 wtimer_delete (timer);
/* Report totals back through the optional out-parameters. */
341 *qtyread += sum_read;
343 *qtywritten += sum_written;
348 /* Read a hunk of data from FD, up until a terminator. The terminator
349 is whatever the TERMINATOR function determines it to be; for
350 example, it can be a line of data, or the head of an HTTP response.
351 The function returns the data read allocated with malloc.
353 In case of error, NULL is returned. In case of EOF and no data
354 read, NULL is returned and errno set to 0. In case of EOF with
355 data having been read, the data is returned, but it will
356 (obviously) not contain the terminator.
358 The idea is to be able to read a line of input, or otherwise a hunk
359 of text, such as the head of an HTTP request, without crossing the
360 boundary, so that the next call to fd_read etc. reads the data
361 after the hunk. To achieve that, this function does the following:
363 1. Peek at available data.
365 2. Determine whether the peeked data, along with the previously
366 read data, includes the terminator.
368 2a. If yes, read the data until the end of the terminator, and
371 2b. If no, read the peeked data and goto 1.
373 The function is careful to assume as little as possible about the
374 implementation of peeking. For example, every peek is followed by
375 a read. If the read returns a different amount of data, the
376 process is retried until all data arrives safely.
378 BUFSIZE is the size of the initial buffer expected to read all the
379 data in the typical case.
381 This function should be used as a building block for other
382 functions -- see fd_read_line as a simple example. */
385 fd_read_hunk (int fd, hunk_terminator_t hunk_terminator, int bufsize)
387 char *hunk = xmalloc (bufsize);
388 int tail = 0; /* tail position in HUNK */
393 int pklen, rdlen, remain;
395 /* First, peek at the available data. */
/* Reserve one byte for the NUL terminator, hence "bufsize - 1". */
397 pklen = fd_peek (fd, hunk + tail, bufsize - 1 - tail, -1);
/* Ask the caller-supplied terminator function whether the terminator
   occurs within the data read so far plus the peeked bytes. */
403 end = hunk_terminator (hunk, tail, pklen);
406 /* The data contains the terminator: we'll drain the data up
407 to the end of the terminator. */
408 remain = end - (hunk + tail);
411 /* No more data needs to be read. */
/* Grow the buffer if the remaining drain doesn't fit (plus NUL). */
415 if (bufsize - 1 < tail + remain)
417 bufsize = tail + remain + 1;
418 hunk = xrealloc (hunk, bufsize);
422 /* No terminator: simply read the data we know is (or should
426 /* Now, read the data. Note that we make no assumptions about
427 how much data we'll get. (Some TCP stacks are notorious for
428 read returning less data than the previous MSG_PEEK.) */
430 rdlen = fd_read (fd, hunk + tail, remain, 0);
443 /* EOF without anything having been read */
449 /* EOF seen: return the data we've read. */
452 if (end && rdlen == remain)
453 /* The terminator was seen and the remaining data drained --
454 we got what we came for. */
457 /* Keep looping until all the data arrives. */
/* Buffer full: enlarge it (the growth expression, presumably a
   doubling, is elided from this view) and peek again. */
459 if (tail == bufsize - 1)
462 hunk = xrealloc (hunk, bufsize);
/* Terminator callback for fd_read_hunk that treats '\n' as the hunk
   boundary.  Searches only the newly peeked region (HUNK + OLDLEN,
   PEEKLEN bytes). */
468 line_terminator (const char *hunk, int oldlen, int peeklen)
470 const char *p = memchr (hunk + oldlen, '\n', peeklen);
472 /* p+1 because we want the line to include '\n' */
477 /* Read one line from FD and return it. The line is allocated using
480 If an error occurs, or if no data can be read, NULL is returned.
481 In the former case errno indicates the error condition, and in the
482 latter case, errno is NULL. */
485 fd_read_line (int fd)
/* 128 bytes is the initial buffer; fd_read_hunk grows it as needed. */
487 return fd_read_hunk (fd, line_terminator, 128);
490 /* Return a printed representation of the download rate, as
491 appropriate for the speed. If PAD is non-zero, strings will be
492 padded to the width of 7 characters (xxxx.xx). */
494 retr_rate (wgint bytes, double msecs, int pad)
497 static const char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
/* calc_rate scales the rate and sets `units` to index rate_names. */
500 double dlrate = calc_rate (bytes, msecs, &units);
/* NOTE(review): `res` is declared on a line elided from this view --
   presumably a static buffer (the function returns it), which makes
   this function non-reentrant; confirm against the full file. */
501 sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
506 /* Calculate the download rate and trim it as appropriate for the
507 speed. Appropriate means that if rate is greater than 1K/s,
508 kilobytes are used, and if rate is greater than 1MB/s, megabytes
511 UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
514 calc_rate (wgint bytes, double msecs, int *units)
522 /* If elapsed time is exactly zero, it means we're under the
523 granularity of the timer. This often happens on systems that
524 use time() for the timer. */
525 msecs = wtimer_granularity ();
/* msecs -> seconds conversion folded into the rate computation. */
527 dlrate = (double)1000 * bytes / msecs;
/* Pick the largest unit that keeps the number below 1024. */
530 else if (dlrate < 1024.0 * 1024.0)
531 *units = 1, dlrate /= 1024.0;
532 else if (dlrate < 1024.0 * 1024.0 * 1024.0)
533 *units = 2, dlrate /= (1024.0 * 1024.0);
535 /* Maybe someone will need this, one day. */
536 *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
541 /* Maximum number of allowed redirections. 20 was chosen as a
542 "reasonable" value, which is low enough to not cause havoc, yet
543 high enough to guarantee that normal retrievals will not be hurt by
546 #define MAX_REDIRECTIONS 20
/* Stash opt.post_data/opt.post_file_name in locals and clear them, so
   that a redirect following a POST is fetched with GET.  Expects the
   locals `post_data_suspended', `saved_post_data' and
   `saved_post_file_name' to be in scope (see retrieve_url).  The
   do/while(0) closers are elided from this view. */
548 #define SUSPEND_POST_DATA do { \
549 post_data_suspended = 1; \
550 saved_post_data = opt.post_data; \
551 saved_post_file_name = opt.post_file_name; \
552 opt.post_data = NULL; \
553 opt.post_file_name = NULL; \
/* Undo SUSPEND_POST_DATA, restoring the saved POST options. */
556 #define RESTORE_POST_DATA do { \
557 if (post_data_suspended) \
559 opt.post_data = saved_post_data; \
560 opt.post_file_name = saved_post_file_name; \
561 post_data_suspended = 0; \
565 static char *getproxy PARAMS ((struct url *));
567 /* Retrieve the given URL. Decides which loop to call -- HTTP, FTP,
570 /* #### This function should be rewritten so it doesn't return from
574 retrieve_url (const char *origurl, char **file, char **newloc,
575 const char *refurl, int *dt)
579 int location_changed, dummy;
580 char *mynewloc, *proxy;
581 struct url *u, *proxy_url;
582 int up_error_code; /* url parse error code */
584 int redirection_count = 0;
/* State for the SUSPEND_POST_DATA/RESTORE_POST_DATA macros: POST data
   is shelved while following redirects and restored afterwards. */
586 int post_data_suspended = 0;
587 char *saved_post_data = NULL;
588 char *saved_post_file_name = NULL;
590 /* If dt is NULL, use local storage. */
/* Work on a private copy of the URL; it is rewritten on redirects. */
596 url = xstrdup (origurl);
602 u = url_parse (url, &up_error_code);
605 logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
/* Fall back to the --referer option when no referrer was passed in. */
611 refurl = opt.referer;
/* `redirected:' loop target is elided from this view; control returns
   here after each redirect. */
620 proxy = getproxy (u);
623 /* Parse the proxy URL. */
624 proxy_url = url_parse (proxy, &up_error_code);
627 logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
628 proxy, url_error (up_error_code));
/* Only HTTP proxies (or a proxy speaking the target scheme) are
   supported. */
633 if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
635 logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
636 url_free (proxy_url);
/* Dispatch on scheme: HTTP/HTTPS (or anything through an HTTP proxy)
   goes to http_loop, FTP to ftp_loop. */
643 if (u->scheme == SCHEME_HTTP
645 || u->scheme == SCHEME_HTTPS
647 || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
649 result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
651 else if (u->scheme == SCHEME_FTP)
653 /* If this is a redirection, we must not allow recursive FTP
654 retrieval, so we save recursion to oldrec, and restore it
656 int oldrec = opt.recursive;
657 if (redirection_count)
659 result = ftp_loop (u, dt, proxy_url);
660 opt.recursive = oldrec;
662 /* There is a possibility of having HTTP being redirected to
663 FTP. In these cases we must decide whether the text is HTML
664 according to the suffix. The HTML suffixes are `.html',
665 `.htm' and a few others, case-insensitive. */
666 if (redirection_count && local_file && u->scheme == SCHEME_FTP)
668 if (has_html_suffix_p (local_file))
675 url_free (proxy_url);
679 location_changed = (result == NEWLOCATION);
680 if (location_changed)
682 char *construced_newloc;
683 struct url *newloc_parsed;
685 assert (mynewloc != NULL);
690 /* The HTTP specs only allow absolute URLs to appear in
691 redirects, but a ton of boneheaded webservers and CGIs out
692 there break the rules and use relative URLs, and popular
693 browsers are lenient about this, so wget should be too. */
694 construced_newloc = uri_merge (url, mynewloc);
696 mynewloc = construced_newloc;
698 /* Now, see if this new location makes sense. */
699 newloc_parsed = url_parse (mynewloc, &up_error_code);
702 logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
703 url_error (up_error_code));
711 /* Now mynewloc will become newloc_parsed->url, because if the
712 Location contained relative paths like .././something, we
713 don't want that propagating as url. */
715 mynewloc = xstrdup (newloc_parsed->url);
717 /* Check for max. number of redirections. */
718 if (++redirection_count > MAX_REDIRECTIONS)
720 logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
722 url_free (newloc_parsed);
735 /* If we're being redirected from POST, we don't want to POST
736 again. Many requests answer POST with a redirection to an
737 index page; that redirection is clearly a GET. We "suspend"
738 POST data for the duration of the redirections, and restore
739 it when we're done. */
740 if (!post_data_suspended)
/* Successful retrieval: record the download for the link-conversion
   and redirection bookkeeping. */
750 register_download (u->url, local_file);
751 if (redirection_count && 0 != strcmp (origurl, u->url))
752 register_redirection (origurl, u->url);
754 register_html (u->url, local_file);
759 *file = local_file ? local_file : NULL;
761 xfree_null (local_file);
765 if (redirection_count)
784 /* Find the URLs in the file and call retrieve_url() for each of
785 them. If HTML is non-zero, treat the file as HTML, and construct
786 the URLs accordingly.
788 If opt.recursive is set, call retrieve_tree() for each file. */
791 retrieve_from_file (const char *file, int html, int *count)
794 struct urlpos *url_list, *cur_url;
796 url_list = (html ? get_urls_html (file, NULL, NULL)
797 : get_urls_file (file));
798 status = RETROK; /* Suppose everything is OK. */
799 *count = 0; /* Reset the URL count. */
801 for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
803 char *filename = NULL, *new_file = NULL;
806 if (cur_url->ignore_when_downloading)
/* Stop processing once the --quota limit has been exceeded. */
809 if (opt.quota && total_downloaded_bytes > opt.quota)
/* Recursive/page-requisite retrieval for non-FTP URLs; plain
   retrieval otherwise. */
814 if ((opt.recursive || opt.page_requisites)
815 && cur_url->url->scheme != SCHEME_FTP)
816 status = retrieve_tree (cur_url->url->url);
818 status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
820 if (filename && opt.delete_after && file_exists_p (filename))
822 DEBUGP (("Removing file due to --delete-after in"
823 " retrieve_from_file():\n"));
824 logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
825 if (unlink (filename))
826 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
830 xfree_null (new_file);
831 xfree_null (filename);
834 /* Free the linked list of URL-s. */
835 free_urlpos (url_list);
840 /* Print `giving up', or `retrying', depending on the impending
841 action. N1 and N2 are the attempt number and the attempt limit. */
843 printwhat (int n1, int n2)
/* Last allowed attempt -> "Giving up"; otherwise -> "Retrying". */
845 logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
848 /* If opt.wait or opt.waitretry are specified, and if certain
849 conditions are met, sleep the appropriate number of seconds. See
850 the documentation of --wait and --waitretry for more information.
852 COUNT is the count of current retrieval, beginning with 1. */
855 sleep_between_retrievals (int count)
/* Persists across calls so the very first retrieval is never delayed. */
857 static int first_retrieval = 1;
861 /* Don't sleep before the very first retrieval. */
866 if (opt.waitretry && count > 1)
868 /* If opt.waitretry is specified and this is a retry, wait for
869 COUNT-1 number of seconds, or for opt.waitretry seconds. */
870 if (count <= opt.waitretry)
/* Linear backoff capped at opt.waitretry seconds. */
873 xsleep (opt.waitretry);
877 if (!opt.random_wait || count > 1)
878 /* If random-wait is not specified, or if we are sleeping
879 between retries of the same download, sleep the fixed
884 /* Sleep a random amount of time averaging in opt.wait
885 seconds. The sleeping amount ranges from 0 to
886 opt.wait*2, inclusive. */
887 double waitsecs = 2 * opt.wait * random_float ();
888 DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
889 opt.wait, waitsecs));
895 /* Free the linked list of urlpos. */
897 free_urlpos (struct urlpos *l)
/* Save the next pointer before freeing the current node. */
901 struct urlpos *next = l->next;
904 xfree_null (l->local_name);
910 /* Rotate FNAME opt.backups times */
912 rotate_backups(const char *fname)
/* Room for "<fname>.<digits>\0"; backup numbers go up to opt.backups. */
914 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
/* NOTE(review): alloca allocates on the stack -- fine for a bounded
   filename, but fails catastrophically for pathological lengths. */
915 char *from = (char *)alloca (maxlen);
916 char *to = (char *)alloca (maxlen);
/* Only rotate regular files. */
920 if (stat (fname, &sb) == 0)
921 if (S_ISREG (sb.st_mode) == 0)
/* Shift existing backups up: fname.N-1 -> fname.N, highest first so
   nothing is overwritten before it is moved. */
924 for (i = opt.backups; i > 1; i--)
926 sprintf (from, "%s.%d", fname, i - 1)
927 sprintf (to, "%s.%d", fname, i);
/* Finally move the current file into the ".1" slot. */
931 sprintf (to, "%s.%d", fname, 1);
935 static int no_proxy_match PARAMS ((const char *, const char **));
937 /* Return the URL of the proxy appropriate for url U. */
940 getproxy (struct url *u)
/* Returned storage for rewritten shorthand proxies -- static, so this
   function is not reentrant and the result must not be freed. */
944 static char rewritten_storage[1024];
/* Respect the no_proxy list: hosts matching it get no proxy at all. */
948 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
/* Per-scheme proxy selection: command-line option first, environment
   variable as fallback. */
954 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
958 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
962 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
967 if (!proxy || !*proxy)
970 /* Handle shorthands. `rewritten_storage' is a kludge to allow
971 getproxy() to return static storage. */
972 rewritten_url = rewrite_shorthand_url (proxy);
/* strncpy may not NUL-terminate on truncation; the explicit write of
   the final byte below guarantees termination. */
975 strncpy (rewritten_storage, rewritten_url, sizeof (rewritten_storage));
976 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
977 proxy = rewritten_storage;
983 /* Should a host be accessed through proxy, concerning no_proxy? */
985 no_proxy_match (const char *host, const char **no_proxy)
/* Non-zero (use proxy) unless HOST suffix-matches an entry in the
   no_proxy list. */
990 return !sufmatch (no_proxy, host);