sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <sys/types.h>
  25 #ifdef HAVE_UNISTD_H
  26 # include <unistd.h>
  27 #endif /* HAVE_UNISTD_H */
  28 #include <errno.h>
  29 #ifdef HAVE_STRING_H
  30 # include <string.h>
  31 #else
  32 # include <strings.h>
  33 #endif /* HAVE_STRING_H */
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "retr.h"
  39 #include "progress.h"
  40 #include "url.h"
  41 #include "recur.h"
  42 #include "ftp.h"
  43 #include "host.h"
  44 #include "connect.h"
  45 #include "hash.h"
  46
  47 #ifdef HAVE_SSL
  48 # include "gen_sslfunc.h"       /* for ssl_iread */
  49 #endif
  50
  51 #ifndef errno
  52 extern int errno;
  53 #endif
  54
/* See the comment in gethttp() for why this is needed. */
  56 int global_download_count;
  57
  58 \f
/* Running totals used for bandwidth limiting.  They are accumulated
   by limit_bandwidth() and cleared by limit_bandwidth_reset().  */
static struct {
  /* Bytes received since the totals were last cleared.  */
  long bytes;
  /* Time, in milliseconds, spent receiving those bytes.  */
  long dltime;
} limit_data;
  63
  64 static void
  65 limit_bandwidth_reset (void)
  66 {
  67   limit_data.bytes  = 0;
  68   limit_data.dltime = 0;
  69 }
  70
  71 /* Limit the bandwidth by pausing the download for an amount of time.
  72    BYTES is the number of bytes received from the network, DELTA is
  73    how long it took to receive them, DLTIME the current download time,
  74    TIMER the timer, and ADJUSTMENT the previous.  */
  75
  76 static void
  77 limit_bandwidth (long bytes, long delta)
  78 {
  79   long expected;
  80
  81   limit_data.bytes += bytes;
  82   limit_data.dltime += delta;
  83
  84   expected = (long)(1000.0 * limit_data.bytes / opt.limit_rate);
  85
  86   if (expected > limit_data.dltime)
  87     {
  88       long slp = expected - limit_data.dltime;
  89       if (slp < 200)
  90         {
  91           DEBUGP (("deferring a %ld ms sleep (%ld/%ld) until later.\n",
  92                    slp, limit_data.bytes, limit_data.dltime));
  93           return;
  94         }
  95       DEBUGP (("sleeping %ld ms\n", slp));
  96       usleep (1000 * slp);
  97     }
  98
  99   limit_data.bytes = 0;
 100   limit_data.dltime = 0;
 101 }
 102
 103 #define MIN(i, j) ((i) <= (j) ? (i) : (j))
 104
 105 /* Reads the contents of file descriptor FD, until it is closed, or a
 106    read error occurs.  The data is read in 8K chunks, and stored to
 107    stream fp, which should have been open for writing.  If BUF is
 108    non-NULL and its file descriptor is equal to FD, flush RBUF first.
 109    This function will *not* use the rbuf_* functions!
 110
 111    The EXPECTED argument is passed to show_progress() unchanged, but
 112    otherwise ignored.
 113
 114    If opt.verbose is set, the progress is also shown.  RESTVAL
 115    represents a value from which to start downloading (which will be
 116    shown accordingly).  If RESTVAL is non-zero, the stream should have
 117    been open for appending.
 118
 119    The function exits and returns codes of 0, -1 and -2 if the
 120    connection was closed, there was a read error, or if it could not
 121    write to the output stream, respectively.
 122
 123    IMPORTANT: The function flushes the contents of the buffer in
 124    rbuf_flush() before actually reading from fd.  If you wish to read
 125    from fd immediately, flush or discard the buffer.  */
 126 int
 127 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
 128               struct rbuf *rbuf, int use_expected, long *elapsed)
 129 {
 130   int res = 0;
 131   static char c[8192];
 132   void *progress = NULL;
 133   struct wget_timer *timer = wtimer_allocate ();
 134   long dltime = 0, last_dltime = 0;
 135
 136   *len = restval;
 137
 138   if (opt.verbose)
 139     progress = progress_create (restval, expected);
 140
 141   if (rbuf && RBUF_FD (rbuf) == fd)
 142     {
 143       int sz = 0;
 144       while ((res = rbuf_flush (rbuf, c, sizeof (c))) != 0)
 145         {
 146           fwrite (c, sizeof (char), res, fp);
 147           *len += res;
 148           sz += res;
 149         }
 150       if (sz)
 151         fflush (fp);
 152       if (ferror (fp))
 153         {
 154           res = -2;
 155           goto out;
 156         }
 157       if (opt.verbose)
 158         progress_update (progress, sz, 0);
 159     }
 160
 161   if (opt.limit_rate)
 162     limit_bandwidth_reset ();
 163   wtimer_reset (timer);
 164
 165   /* Read from fd while there is available data.
 166
 167      Normally, if expected is 0, it means that it is not known how
 168      much data is expected.  However, if use_expected is specified,
 169      then expected being zero means exactly that.  */
 170   while (!use_expected || (*len < expected))
 171     {
 172       int amount_to_read = (use_expected
 173                             ? MIN (expected - *len, sizeof (c))
 174                             : sizeof (c));
 175 #ifdef HAVE_SSL
 176       if (rbuf->ssl!=NULL)
 177         res = ssl_iread (rbuf->ssl, c, amount_to_read);
 178       else
 179 #endif /* HAVE_SSL */
 180         res = iread (fd, c, amount_to_read);
 181
 182       if (res > 0)
 183         {
 184           fwrite (c, sizeof (char), res, fp);
 185           /* Always flush the contents of the network packet.  This
 186              should not be adverse to performance, as the network
 187              packets typically won't be too tiny anyway.  */
 188           fflush (fp);
 189           if (ferror (fp))
 190             {
 191               res = -2;
 192               goto out;
 193             }
 194
 195           /* If bandwidth is not limited, one call to wtimer_elapsed
 196              is sufficient.  */
 197           dltime = wtimer_elapsed (timer);
 198           if (opt.limit_rate)
 199             {
 200               limit_bandwidth (res, dltime - last_dltime);
 201               dltime = wtimer_elapsed (timer);
 202               last_dltime = dltime;
 203             }
 204
 205           if (opt.verbose)
 206             progress_update (progress, res, dltime);
 207           *len += res;
 208         }
 209       else
 210         break;
 211     }
 212   if (res < -1)
 213     res = -1;
 214
 215  out:
 216   if (opt.verbose)
 217     progress_finish (progress, dltime);
 218   if (elapsed)
 219     *elapsed = dltime;
 220   wtimer_delete (timer);
 221
 222   return res;
 223 }
 224 \f
 225 /* Return a printed representation of the download rate, as
 226    appropriate for the speed.  If PAD is non-zero, strings will be
 227    padded to the width of 7 characters (xxxx.xx).  */
 228 char *
 229 retr_rate (long bytes, long msecs, int pad)
 230 {
 231   static char res[20];
 232   static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 233   int units = 0;
 234
 235   double dlrate = calc_rate (bytes, msecs, &units);
 236   sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
 237
 238   return res;
 239 }
 240
 241 /* Calculate the download rate and trim it as appropriate for the
 242    speed.  Appropriate means that if rate is greater than 1K/s,
 243    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 244    are used.
 245
 246    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 247    GB/s.  */
 248 double
 249 calc_rate (long bytes, long msecs, int *units)
 250 {
 251   double dlrate;
 252
 253   assert (msecs >= 0);
 254   assert (bytes >= 0);
 255
 256   if (msecs == 0)
 257     /* If elapsed time is 0, it means we're under the granularity of
 258        the timer.  This often happens on systems that use time() for
 259        the timer.  */
 260     msecs = wtimer_granularity ();
 261
 262   dlrate = (double)1000 * bytes / msecs;
 263   if (dlrate < 1024.0)
 264     *units = 0;
 265   else if (dlrate < 1024.0 * 1024.0)
 266     *units = 1, dlrate /= 1024.0;
 267   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 268     *units = 2, dlrate /= (1024.0 * 1024.0);
 269   else
 270     /* Maybe someone will need this one day.  More realistically, it
 271        will get tickled by buggy timers. */
 272     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
 273
 274   return dlrate;
 275 }
 276 \f
/* Non-zero when a proxy should be used for the parsed URL U: proxying
   is enabled (opt.use_proxy), getproxy() returns a proxy for U's
   scheme, and no_proxy_match() returns non-zero for U's host checked
   against opt.no_proxy.  U is evaluated twice.  */
#define USE_PROXY_P(u) (opt.use_proxy && getproxy((u)->scheme)          \
                        && no_proxy_match((u)->host,                    \
                                          (const char **)opt.no_proxy))
 280
 281 /* Maximum number of allowed redirections.  20 was chosen as a
 282    "reasonable" value, which is low enough to not cause havoc, yet
 283    high enough to guarantee that normal retrievals will not be hurt by
 284    the check.  */
 285
 286 #define MAX_REDIRECTIONS 20
 287
 288 /* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
 289    FTP, proxy, etc.  */
 290
 291 uerr_t
 292 retrieve_url (const char *origurl, char **file, char **newloc,
 293               const char *refurl, int *dt)
 294 {
 295   uerr_t result;
 296   char *url;
 297   int location_changed, dummy;
 298   int use_proxy;
 299   char *mynewloc, *proxy;
 300   struct url *u;
 301   int up_error_code;            /* url parse error code */
 302   char *local_file;
 303   int redirection_count = 0;
 304
 305   /* If dt is NULL, just ignore it.  */
 306   if (!dt)
 307     dt = &dummy;
 308   url = xstrdup (origurl);
 309   if (newloc)
 310     *newloc = NULL;
 311   if (file)
 312     *file = NULL;
 313
 314   u = url_parse (url, &up_error_code);
 315   if (!u)
 316     {
 317       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
 318       xfree (url);
 319       return URLERROR;
 320     }
 321
 322   if (!refurl)
 323     refurl = opt.referer;
 324
 325  redirected:
 326
 327   result = NOCONERROR;
 328   mynewloc = NULL;
 329   local_file = NULL;
 330
 331   use_proxy = USE_PROXY_P (u);
 332   if (use_proxy)
 333     {
 334       struct url *proxy_url;
 335
 336       /* Get the proxy server for the current scheme.  */
 337       proxy = getproxy (u->scheme);
 338       if (!proxy)
 339         {
 340           logputs (LOG_NOTQUIET, _("Could not find proxy host.\n"));
 341           url_free (u);
 342           xfree (url);
 343           return PROXERR;
 344         }
 345
 346       /* Parse the proxy URL.  */
 347       proxy_url = url_parse (proxy, &up_error_code);
 348       if (!proxy_url)
 349         {
 350           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 351                      proxy, url_error (up_error_code));
 352           xfree (url);
 353           return PROXERR;
 354         }
 355       if (proxy_url->scheme != SCHEME_HTTP)
 356         {
 357           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 358           url_free (proxy_url);
 359           xfree (url);
 360           return PROXERR;
 361         }
 362
 363       result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
 364       url_free (proxy_url);
 365     }
 366   else if (u->scheme == SCHEME_HTTP
 367 #ifdef HAVE_SSL
 368       || u->scheme == SCHEME_HTTPS
 369 #endif
 370       )
 371     {
 372       result = http_loop (u, &mynewloc, &local_file, refurl, dt, NULL);
 373     }
 374   else if (u->scheme == SCHEME_FTP)
 375     {
 376       /* If this is a redirection, we must not allow recursive FTP
 377          retrieval, so we save recursion to oldrec, and restore it
 378          later.  */
 379       int oldrec = opt.recursive;
 380       if (redirection_count)
 381         opt.recursive = 0;
 382       result = ftp_loop (u, dt);
 383       opt.recursive = oldrec;
 384
 385       /* There is a possibility of having HTTP being redirected to
 386          FTP.  In these cases we must decide whether the text is HTML
 387          according to the suffix.  The HTML suffixes are `.html' and
 388          `.htm', case-insensitive.  */
 389       if (redirection_count && local_file && u->scheme == SCHEME_FTP)
 390         {
 391           char *suf = suffix (local_file);
 392           if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
 393             *dt |= TEXTHTML;
 394         }
 395     }
 396   location_changed = (result == NEWLOCATION);
 397   if (location_changed)
 398     {
 399       char *construced_newloc;
 400       struct url *newloc_parsed;
 401
 402       assert (mynewloc != NULL);
 403
 404       if (local_file)
 405         xfree (local_file);
 406
 407       /* The HTTP specs only allow absolute URLs to appear in
 408          redirects, but a ton of boneheaded webservers and CGIs out
 409          there break the rules and use relative URLs, and popular
 410          browsers are lenient about this, so wget should be too. */
 411       construced_newloc = uri_merge (url, mynewloc);
 412       xfree (mynewloc);
 413       mynewloc = construced_newloc;
 414
 415       /* Now, see if this new location makes sense. */
 416       newloc_parsed = url_parse (mynewloc, &up_error_code);
 417       if (!newloc_parsed)
 418         {
 419           logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
 420                      url_error (up_error_code));
 421           url_free (u);
 422           xfree (url);
 423           xfree (mynewloc);
 424           return result;
 425         }
 426
 427       /* Now mynewloc will become newloc_parsed->url, because if the
 428          Location contained relative paths like .././something, we
 429          don't want that propagating as url.  */
 430       xfree (mynewloc);
 431       mynewloc = xstrdup (newloc_parsed->url);
 432
 433       /* Check for max. number of redirections.  */
 434       if (++redirection_count > MAX_REDIRECTIONS)
 435         {
 436           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
 437                      MAX_REDIRECTIONS);
 438           url_free (newloc_parsed);
 439           url_free (u);
 440           xfree (url);
 441           xfree (mynewloc);
 442           return WRONGCODE;
 443         }
 444
 445       xfree (url);
 446       url = mynewloc;
 447       url_free (u);
 448       u = newloc_parsed;
 449       goto redirected;
 450     }
 451
 452   if (local_file)
 453     {
 454       if (*dt & RETROKF)
 455         {
 456           register_download (u->url, local_file);
 457           if (redirection_count && 0 != strcmp (origurl, u->url))
 458             register_redirection (origurl, u->url);
 459           if (*dt & TEXTHTML)
 460             register_html (u->url, local_file);
 461         }
 462     }
 463
 464   if (file)
 465     *file = local_file ? local_file : NULL;
 466   else
 467     FREE_MAYBE (local_file);
 468
 469   url_free (u);
 470
 471   if (redirection_count)
 472     {
 473       if (newloc)
 474         *newloc = url;
 475       else
 476         xfree (url);
 477     }
 478   else
 479     {
 480       if (newloc)
 481         *newloc = NULL;
 482       xfree (url);
 483     }
 484
 485   ++global_download_count;
 486
 487   return result;
 488 }
 489
 490 /* Find the URLs in the file and call retrieve_url() for each of
 491    them.  If HTML is non-zero, treat the file as HTML, and construct
 492    the URLs accordingly.
 493
 494    If opt.recursive is set, call recursive_retrieve() for each file.  */
 495 uerr_t
 496 retrieve_from_file (const char *file, int html, int *count)
 497 {
 498   uerr_t status;
 499   struct urlpos *url_list, *cur_url;
 500
 501   url_list = (html ? get_urls_html (file, NULL, NULL)
 502               : get_urls_file (file));
 503   status = RETROK;             /* Suppose everything is OK.  */
 504   *count = 0;                  /* Reset the URL count.  */
 505
 506   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
 507     {
 508       char *filename = NULL, *new_file = NULL;
 509       int dt;
 510
 511       if (cur_url->ignore_when_downloading)
 512         continue;
 513
 514       if (downloaded_exceeds_quota ())
 515         {
 516           status = QUOTEXC;
 517           break;
 518         }
 519       if (opt.recursive && cur_url->url->scheme != SCHEME_FTP)
 520         status = retrieve_tree (cur_url->url->url);
 521       else
 522         status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
 523
 524       if (filename && opt.delete_after && file_exists_p (filename))
 525         {
 526           DEBUGP (("Removing file due to --delete-after in"
 527                    " retrieve_from_file():\n"));
 528           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
 529           if (unlink (filename))
 530             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 531           dt &= ~RETROKF;
 532         }
 533
 534       FREE_MAYBE (new_file);
 535       FREE_MAYBE (filename);
 536     }
 537
 538   /* Free the linked list of URL-s.  */
 539   free_urlpos (url_list);
 540
 541   return status;
 542 }
 543
 544 /* Print `giving up', or `retrying', depending on the impending
 545    action.  N1 and N2 are the attempt number and the attempt limit.  */
 546 void
 547 printwhat (int n1, int n2)
 548 {
 549   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
 550 }
 551
 552 /* Increment opt.downloaded by BY_HOW_MUCH.  If an overflow occurs,
 553    set opt.downloaded_overflow to 1. */
 554 void
 555 downloaded_increase (unsigned long by_how_much)
 556 {
 557   VERY_LONG_TYPE old;
 558   if (opt.downloaded_overflow)
 559     return;
 560   old = opt.downloaded;
 561   opt.downloaded += by_how_much;
 562   if (opt.downloaded < old)     /* carry flag, where are you when I
 563                                    need you? */
 564     {
 565       /* Overflow. */
 566       opt.downloaded_overflow = 1;
 567       opt.downloaded = ~((VERY_LONG_TYPE)0);
 568     }
 569 }
 570
 571 /* Return non-zero if the downloaded amount of bytes exceeds the
 572    desired quota.  If quota is not set or if the amount overflowed, 0
 573    is returned. */
 574 int
 575 downloaded_exceeds_quota (void)
 576 {
 577   if (!opt.quota)
 578     return 0;
 579   if (opt.downloaded_overflow)
 580     /* We don't really know.  (Wildly) assume not. */
 581     return 0;
 582
 583   return opt.downloaded > opt.quota;
 584 }
 585
 586 /* If opt.wait or opt.waitretry are specified, and if certain
 587    conditions are met, sleep the appropriate number of seconds.  See
 588    the documentation of --wait and --waitretry for more information.
 589
 590    COUNT is the count of current retrieval, beginning with 1. */
 591
 592 void
 593 sleep_between_retrievals (int count)
 594 {
 595   static int first_retrieval = 1;
 596
 597   if (first_retrieval)
 598     {
 599       /* Don't sleep before the very first retrieval. */
 600       first_retrieval = 0;
 601       return;
 602     }
 603
 604   if (opt.waitretry && count > 1)
 605     {
 606       /* If opt.waitretry is specified and this is a retry, wait for
 607          COUNT-1 number of seconds, or for opt.waitretry seconds.  */
 608       if (count <= opt.waitretry)
 609         sleep (count - 1);
 610       else
 611         sleep (opt.waitretry);
 612     }
 613   else if (opt.wait)
 614     {
 615       if (!opt.random_wait || count > 1)
 616         /* If random-wait is not specified, or if we are sleeping
 617            between retries of the same download, sleep the fixed
 618            interval.  */
 619         sleep (opt.wait);
 620       else
 621         {
 622           /* Sleep a random amount of time averaging in opt.wait
 623              seconds.  The sleeping amount ranges from 0 to
 624              opt.wait*2, inclusive.  */
 625           int waitsecs = random_number (opt.wait * 2 + 1);
 626
 627           DEBUGP (("sleep_between_retrievals: norm=%ld,fuzz=%ld,sleep=%d\n",
 628                    opt.wait, waitsecs - opt.wait, waitsecs));
 629
 630           if (waitsecs)
 631             sleep (waitsecs);
 632         }
 633     }
 634 }