sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <sys/types.h>
  35 #ifdef HAVE_UNISTD_H
  36 # include <unistd.h>
  37 #endif /* HAVE_UNISTD_H */
  38 #include <errno.h>
  39 #ifdef HAVE_STRING_H
  40 # include <string.h>
  41 #else
  42 # include <strings.h>
  43 #endif /* HAVE_STRING_H */
  44 #include <assert.h>
  45
  46 #include "wget.h"
  47 #include "utils.h"
  48 #include "retr.h"
  49 #include "progress.h"
  50 #include "url.h"
  51 #include "recur.h"
  52 #include "ftp.h"
  53 #include "host.h"
  54 #include "connect.h"
  55 #include "hash.h"
  56
  57 #ifdef HAVE_SSL
  58 # include "gen_sslfunc.h"       /* for ssl_iread */
  59 #endif
  60
  61 #ifndef errno
  62 extern int errno;
  63 #endif
  64
  65 /* See the comment in gethttp() why this is needed. */
  66 int global_download_count;
  67
  68 \f
  69 static struct {
  70   long chunk_bytes;
  71   double chunk_start;
  72   double sleep_adjust;
  73 } limit_data;
  74
  75 static void
  76 limit_bandwidth_reset (void)
  77 {
  78   limit_data.chunk_bytes = 0;
  79   limit_data.chunk_start = 0;
  80 }
  81
  82 /* Limit the bandwidth by pausing the download for an amount of time.
  83    BYTES is the number of bytes received from the network, and DELTA
  84    is the number of milliseconds it took to receive them.  */
  85
  86 static void
  87 limit_bandwidth (long bytes, double *dltime, struct wget_timer *timer)
  88 {
  89   double delta_t = *dltime - limit_data.chunk_start;
  90   double expected;
  91
  92   limit_data.chunk_bytes += bytes;
  93
  94   /* Calculate the amount of time we expect downloading the chunk
  95      should take.  If in reality it took less time, sleep to
  96      compensate for the difference.  */
  97   expected = 1000.0 * limit_data.chunk_bytes / opt.limit_rate;
  98
  99   if (expected > delta_t)
 100     {
 101       double slp = expected - delta_t + limit_data.sleep_adjust;
 102       double t0, t1;
 103       if (slp < 200)
 104         {
 105           DEBUGP (("deferring a %.2f ms sleep (%ld/%.2f).\n",
 106                    slp, limit_data.chunk_bytes, delta_t));
 107           return;
 108         }
 109       DEBUGP (("\nsleeping %.2f ms for %ld bytes, adjust %.2f ms\n",
 110                slp, limit_data.chunk_bytes, limit_data.sleep_adjust));
 111
 112       t0 = *dltime;
 113       usleep ((unsigned long) (1000 * slp));
 114       t1 = wtimer_elapsed (timer);
 115
 116       /* Due to scheduling, we probably slept slightly longer (or
 117          shorter) than desired.  Calculate the difference between the
 118          desired and the actual sleep, and adjust the next sleep by
 119          that amount.  */
 120       limit_data.sleep_adjust = slp - (t1 - t0);
 121
 122       /* Since we've called wtimer_elapsed, we might as well update
 123          the caller's dltime. */
 124       *dltime = t1;
 125     }
 126
 127   limit_data.chunk_bytes = 0;
 128   limit_data.chunk_start = *dltime;
 129 }
 130
 131 #define MIN(i, j) ((i) <= (j) ? (i) : (j))
 132
 133 /* Reads the contents of file descriptor FD, until it is closed, or a
 134    read error occurs.  The data is read in 8K chunks, and stored to
 135    stream fp, which should have been open for writing.  If BUF is
 136    non-NULL and its file descriptor is equal to FD, flush RBUF first.
 137    This function will *not* use the rbuf_* functions!
 138
 139    The EXPECTED argument is passed to show_progress() unchanged, but
 140    otherwise ignored.
 141
 142    If opt.verbose is set, the progress is also shown.  RESTVAL
 143    represents a value from which to start downloading (which will be
 144    shown accordingly).  If RESTVAL is non-zero, the stream should have
 145    been open for appending.
 146
 147    The function exits and returns codes of 0, -1 and -2 if the
 148    connection was closed, there was a read error, or if it could not
 149    write to the output stream, respectively.
 150
 151    IMPORTANT: The function flushes the contents of the buffer in
 152    rbuf_flush() before actually reading from fd.  If you wish to read
 153    from fd immediately, flush or discard the buffer.  */
 154 int
 155 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
 156               struct rbuf *rbuf, int use_expected, double *elapsed)
 157 {
 158   int res = 0;
 159
 160   static char dlbuf[16384];
 161   int dlbufsize = sizeof (dlbuf);
 162
 163   void *progress = NULL;
 164   struct wget_timer *timer = wtimer_allocate ();
 165   double dltime = 0;
 166
 167   *len = restval;
 168
 169   if (opt.verbose)
 170     progress = progress_create (restval, expected);
 171
 172   if (rbuf && RBUF_FD (rbuf) == fd)
 173     {
 174       int sz = 0;
 175       while ((res = rbuf_flush (rbuf, dlbuf, sizeof (dlbuf))) != 0)
 176         {
 177           fwrite (dlbuf, 1, res, fp);
 178           *len += res;
 179           sz += res;
 180         }
 181       if (sz)
 182         fflush (fp);
 183       if (ferror (fp))
 184         {
 185           res = -2;
 186           goto out;
 187         }
 188       if (progress)
 189         progress_update (progress, sz, 0);
 190     }
 191
 192   if (opt.limit_rate)
 193     limit_bandwidth_reset ();
 194   wtimer_reset (timer);
 195
 196   /* If we're limiting the download, set our buffer size to the
 197      limit.  */
 198   if (opt.limit_rate && opt.limit_rate < dlbufsize)
 199     dlbufsize = opt.limit_rate;
 200
 201   /* Read from fd while there is available data.
 202
 203      Normally, if expected is 0, it means that it is not known how
 204      much data is expected.  However, if use_expected is specified,
 205      then expected being zero means exactly that.  */
 206   while (!use_expected || (*len < expected))
 207     {
 208       int amount_to_read = (use_expected
 209                             ? MIN (expected - *len, dlbufsize) : dlbufsize);
 210 #ifdef HAVE_SSL
 211       if (rbuf->ssl!=NULL)
 212         res = ssl_iread (rbuf->ssl, dlbuf, amount_to_read);
 213       else
 214 #endif /* HAVE_SSL */
 215         res = iread (fd, dlbuf, amount_to_read);
 216
 217       if (res <= 0)
 218         break;
 219
 220       fwrite (dlbuf, 1, res, fp);
 221       /* Always flush the contents of the network packet.  This should
 222          not hinder performance: fast downloads will be received in
 223          16K chunks (which stdio would write out anyway), and slow
 224          downloads won't be limited with disk performance.  */
 225       fflush (fp);
 226       if (ferror (fp))
 227         {
 228           res = -2;
 229           goto out;
 230         }
 231
 232       dltime = wtimer_elapsed (timer);
 233       if (opt.limit_rate)
 234         limit_bandwidth (res, &dltime, timer);
 235
 236       if (progress)
 237         progress_update (progress, res, dltime);
 238       *len += res;
 239     }
 240   if (res < -1)
 241     res = -1;
 242
 243  out:
 244   if (progress)
 245     progress_finish (progress, dltime);
 246   if (elapsed)
 247     *elapsed = dltime;
 248   wtimer_delete (timer);
 249
 250   return res;
 251 }
 252 \f
 253 /* Return a printed representation of the download rate, as
 254    appropriate for the speed.  If PAD is non-zero, strings will be
 255    padded to the width of 7 characters (xxxx.xx).  */
 256 char *
 257 retr_rate (long bytes, double msecs, int pad)
 258 {
 259   static char res[20];
 260   static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 261   int units = 0;
 262
 263   double dlrate = calc_rate (bytes, msecs, &units);
 264   sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
 265
 266   return res;
 267 }
 268
 269 /* Calculate the download rate and trim it as appropriate for the
 270    speed.  Appropriate means that if rate is greater than 1K/s,
 271    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 272    are used.
 273
 274    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 275    GB/s.  */
 276 double
 277 calc_rate (long bytes, double msecs, int *units)
 278 {
 279   double dlrate;
 280
 281   assert (msecs >= 0);
 282   assert (bytes >= 0);
 283
 284   if (msecs == 0)
 285     /* If elapsed time is exactly zero, it means we're under the
 286        granularity of the timer.  This often happens on systems that
 287        use time() for the timer.  */
 288     msecs = wtimer_granularity ();
 289
 290   dlrate = (double)1000 * bytes / msecs;
 291   if (dlrate < 1024.0)
 292     *units = 0;
 293   else if (dlrate < 1024.0 * 1024.0)
 294     *units = 1, dlrate /= 1024.0;
 295   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 296     *units = 2, dlrate /= (1024.0 * 1024.0);
 297   else
 298     /* Maybe someone will need this, one day. */
 299     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
 300
 301   return dlrate;
 302 }
 303 \f
 304 /* Maximum number of allowed redirections.  20 was chosen as a
 305    "reasonable" value, which is low enough to not cause havoc, yet
 306    high enough to guarantee that normal retrievals will not be hurt by
 307    the check.  */
 308
 309 #define MAX_REDIRECTIONS 20
 310
 311 #define SUSPEND_POST_DATA do {                  \
 312   post_data_suspended = 1;                      \
 313   saved_post_data = opt.post_data;              \
 314   saved_post_file_name = opt.post_file_name;    \
 315   opt.post_data = NULL;                         \
 316   opt.post_file_name = NULL;                    \
 317 } while (0)
 318
 319 #define RESTORE_POST_DATA do {                          \
 320   if (post_data_suspended)                              \
 321     {                                                   \
 322       opt.post_data = saved_post_data;                  \
 323       opt.post_file_name = saved_post_file_name;        \
 324       post_data_suspended = 0;                          \
 325     }                                                   \
 326 } while (0)
 327
 328 /* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
 329    FTP, proxy, etc.  */
 330
 331 /* #### This function should be rewritten so it doesn't return from
 332    multiple points. */
 333
 334 uerr_t
 335 retrieve_url (const char *origurl, char **file, char **newloc,
 336               const char *refurl, int *dt)
 337 {
 338   uerr_t result;
 339   char *url;
 340   int location_changed, dummy;
 341   char *mynewloc, *proxy;
 342   struct url *u, *proxy_url;
 343   int up_error_code;            /* url parse error code */
 344   char *local_file;
 345   int redirection_count = 0;
 346
 347   int post_data_suspended = 0;
 348   char *saved_post_data = NULL;
 349   char *saved_post_file_name = NULL;
 350
 351   /* If dt is NULL, just ignore it.  */
 352   if (!dt)
 353     dt = &dummy;
 354   url = xstrdup (origurl);
 355   if (newloc)
 356     *newloc = NULL;
 357   if (file)
 358     *file = NULL;
 359
 360   u = url_parse (url, &up_error_code);
 361   if (!u)
 362     {
 363       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
 364       xfree (url);
 365       return URLERROR;
 366     }
 367
 368   if (!refurl)
 369     refurl = opt.referer;
 370
 371  redirected:
 372
 373   result = NOCONERROR;
 374   mynewloc = NULL;
 375   local_file = NULL;
 376   proxy_url = NULL;
 377
 378   proxy = getproxy (u);
 379   if (proxy)
 380     {
 381       /* Parse the proxy URL.  */
 382       proxy_url = url_parse (proxy, &up_error_code);
 383       if (!proxy_url)
 384         {
 385           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 386                      proxy, url_error (up_error_code));
 387           xfree (url);
 388           RESTORE_POST_DATA;
 389           return PROXERR;
 390         }
 391       if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
 392         {
 393           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 394           url_free (proxy_url);
 395           xfree (url);
 396           RESTORE_POST_DATA;
 397           return PROXERR;
 398         }
 399     }
 400
 401   if (u->scheme == SCHEME_HTTP
 402 #ifdef HAVE_SSL
 403       || u->scheme == SCHEME_HTTPS
 404 #endif
 405       || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
 406     {
 407       result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
 408     }
 409   else if (u->scheme == SCHEME_FTP)
 410     {
 411       /* If this is a redirection, we must not allow recursive FTP
 412          retrieval, so we save recursion to oldrec, and restore it
 413          later.  */
 414       int oldrec = opt.recursive;
 415       if (redirection_count)
 416         opt.recursive = 0;
 417       result = ftp_loop (u, dt, proxy_url);
 418       opt.recursive = oldrec;
 419
 420       /* There is a possibility of having HTTP being redirected to
 421          FTP.  In these cases we must decide whether the text is HTML
 422          according to the suffix.  The HTML suffixes are `.html',
 423          `.htm' and a few others, case-insensitive.  */
 424       if (redirection_count && local_file && u->scheme == SCHEME_FTP)
 425         {
 426           if (has_html_suffix_p (local_file))
 427             *dt |= TEXTHTML;
 428         }
 429     }
 430
 431   if (proxy_url)
 432     {
 433       url_free (proxy_url);
 434       proxy_url = NULL;
 435     }
 436
 437   location_changed = (result == NEWLOCATION);
 438   if (location_changed)
 439     {
 440       char *construced_newloc;
 441       struct url *newloc_parsed;
 442
 443       assert (mynewloc != NULL);
 444
 445       if (local_file)
 446         xfree (local_file);
 447
 448       /* The HTTP specs only allow absolute URLs to appear in
 449          redirects, but a ton of boneheaded webservers and CGIs out
 450          there break the rules and use relative URLs, and popular
 451          browsers are lenient about this, so wget should be too. */
 452       construced_newloc = uri_merge (url, mynewloc);
 453       xfree (mynewloc);
 454       mynewloc = construced_newloc;
 455
 456       /* Now, see if this new location makes sense. */
 457       newloc_parsed = url_parse (mynewloc, &up_error_code);
 458       if (!newloc_parsed)
 459         {
 460           logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
 461                      url_error (up_error_code));
 462           url_free (u);
 463           xfree (url);
 464           xfree (mynewloc);
 465           RESTORE_POST_DATA;
 466           return result;
 467         }
 468
 469       /* Now mynewloc will become newloc_parsed->url, because if the
 470          Location contained relative paths like .././something, we
 471          don't want that propagating as url.  */
 472       xfree (mynewloc);
 473       mynewloc = xstrdup (newloc_parsed->url);
 474
 475       /* Check for max. number of redirections.  */
 476       if (++redirection_count > MAX_REDIRECTIONS)
 477         {
 478           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
 479                      MAX_REDIRECTIONS);
 480           url_free (newloc_parsed);
 481           url_free (u);
 482           xfree (url);
 483           xfree (mynewloc);
 484           RESTORE_POST_DATA;
 485           return WRONGCODE;
 486         }
 487
 488       xfree (url);
 489       url = mynewloc;
 490       url_free (u);
 491       u = newloc_parsed;
 492
 493       /* If we're being redirected from POST, we don't want to POST
 494          again.  Many requests answer POST with a redirection to an
 495          index page; that redirection is clearly a GET.  We "suspend"
 496          POST data for the duration of the redirections, and restore
 497          it when we're done. */
 498       if (!post_data_suspended)
 499         SUSPEND_POST_DATA;
 500
 501       goto redirected;
 502     }
 503
 504   if (local_file)
 505     {
 506       if (*dt & RETROKF)
 507         {
 508           register_download (u->url, local_file);
 509           if (redirection_count && 0 != strcmp (origurl, u->url))
 510             register_redirection (origurl, u->url);
 511           if (*dt & TEXTHTML)
 512             register_html (u->url, local_file);
 513         }
 514     }
 515
 516   if (file)
 517     *file = local_file ? local_file : NULL;
 518   else
 519     FREE_MAYBE (local_file);
 520
 521   url_free (u);
 522
 523   if (redirection_count)
 524     {
 525       if (newloc)
 526         *newloc = url;
 527       else
 528         xfree (url);
 529     }
 530   else
 531     {
 532       if (newloc)
 533         *newloc = NULL;
 534       xfree (url);
 535     }
 536
 537   ++global_download_count;
 538   RESTORE_POST_DATA;
 539
 540   return result;
 541 }
 542
 543 /* Find the URLs in the file and call retrieve_url() for each of
 544    them.  If HTML is non-zero, treat the file as HTML, and construct
 545    the URLs accordingly.
 546
 547    If opt.recursive is set, call recursive_retrieve() for each file.  */
 548 uerr_t
 549 retrieve_from_file (const char *file, int html, int *count)
 550 {
 551   uerr_t status;
 552   struct urlpos *url_list, *cur_url;
 553
 554   url_list = (html ? get_urls_html (file, NULL, NULL)
 555               : get_urls_file (file));
 556   status = RETROK;             /* Suppose everything is OK.  */
 557   *count = 0;                  /* Reset the URL count.  */
 558
 559   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
 560     {
 561       char *filename = NULL, *new_file = NULL;
 562       int dt;
 563
 564       if (cur_url->ignore_when_downloading)
 565         continue;
 566
 567       if (downloaded_exceeds_quota ())
 568         {
 569           status = QUOTEXC;
 570           break;
 571         }
 572       if (opt.recursive && cur_url->url->scheme != SCHEME_FTP)
 573         status = retrieve_tree (cur_url->url->url);
 574       else
 575         status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
 576
 577       if (filename && opt.delete_after && file_exists_p (filename))
 578         {
 579           DEBUGP (("Removing file due to --delete-after in"
 580                    " retrieve_from_file():\n"));
 581           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
 582           if (unlink (filename))
 583             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 584           dt &= ~RETROKF;
 585         }
 586
 587       FREE_MAYBE (new_file);
 588       FREE_MAYBE (filename);
 589     }
 590
 591   /* Free the linked list of URL-s.  */
 592   free_urlpos (url_list);
 593
 594   return status;
 595 }
 596
 597 /* Print `giving up', or `retrying', depending on the impending
 598    action.  N1 and N2 are the attempt number and the attempt limit.  */
 599 void
 600 printwhat (int n1, int n2)
 601 {
 602   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
 603 }
 604
 605 /* Increment opt.downloaded by BY_HOW_MUCH.  If an overflow occurs,
 606    set opt.downloaded_overflow to 1. */
 607 void
 608 downloaded_increase (unsigned long by_how_much)
 609 {
 610   VERY_LONG_TYPE old;
 611   if (opt.downloaded_overflow)
 612     return;
 613   old = opt.downloaded;
 614   opt.downloaded += by_how_much;
 615   if (opt.downloaded < old)     /* carry flag, where are you when I
 616                                    need you? */
 617     {
 618       /* Overflow. */
 619       opt.downloaded_overflow = 1;
 620       opt.downloaded = ~((VERY_LONG_TYPE)0);
 621     }
 622 }
 623
 624 /* Return non-zero if the downloaded amount of bytes exceeds the
 625    desired quota.  If quota is not set or if the amount overflowed, 0
 626    is returned. */
 627 int
 628 downloaded_exceeds_quota (void)
 629 {
 630   if (!opt.quota)
 631     return 0;
 632   if (opt.downloaded_overflow)
 633     /* We don't really know.  (Wildly) assume not. */
 634     return 0;
 635
 636   return opt.downloaded > opt.quota;
 637 }
 638
 639 /* If opt.wait or opt.waitretry are specified, and if certain
 640    conditions are met, sleep the appropriate number of seconds.  See
 641    the documentation of --wait and --waitretry for more information.
 642
 643    COUNT is the count of current retrieval, beginning with 1. */
 644
 645 void
 646 sleep_between_retrievals (int count)
 647 {
 648   static int first_retrieval = 1;
 649
 650   if (first_retrieval)
 651     {
 652       /* Don't sleep before the very first retrieval. */
 653       first_retrieval = 0;
 654       return;
 655     }
 656
 657   if (opt.waitretry && count > 1)
 658     {
 659       /* If opt.waitretry is specified and this is a retry, wait for
 660          COUNT-1 number of seconds, or for opt.waitretry seconds.  */
 661       if (count <= opt.waitretry)
 662         sleep (count - 1);
 663       else
 664         usleep (1000000L * opt.waitretry);
 665     }
 666   else if (opt.wait)
 667     {
 668       if (!opt.random_wait || count > 1)
 669         /* If random-wait is not specified, or if we are sleeping
 670            between retries of the same download, sleep the fixed
 671            interval.  */
 672         usleep (1000000L * opt.wait);
 673       else
 674         {
 675           /* Sleep a random amount of time averaging in opt.wait
 676              seconds.  The sleeping amount ranges from 0 to
 677              opt.wait*2, inclusive.  */
 678           double waitsecs = 2 * opt.wait * random_float ();
 679           DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
 680                    opt.wait, waitsecs));
 681           usleep (1000000L * waitsecs);
 682         }
 683     }
 684 }