sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <sys/types.h>
  35 #ifdef HAVE_UNISTD_H
  36 # include <unistd.h>
  37 #endif /* HAVE_UNISTD_H */
  38 #include <errno.h>
  39 #ifdef HAVE_STRING_H
  40 # include <string.h>
  41 #else
  42 # include <strings.h>
  43 #endif /* HAVE_STRING_H */
  44 #include <assert.h>
  45
  46 #include "wget.h"
  47 #include "utils.h"
  48 #include "retr.h"
  49 #include "progress.h"
  50 #include "url.h"
  51 #include "recur.h"
  52 #include "ftp.h"
  53 #include "host.h"
  54 #include "connect.h"
  55 #include "hash.h"
  56
  57 #ifdef HAVE_SSL
  58 # include "gen_sslfunc.h"       /* for ssl_iread */
  59 #endif
  60
  61 #ifndef errno
  62 extern int errno;
  63 #endif
  64
  65 /* See the comment in gethttp() why this is needed. */
  66 int global_download_count;
  67
  68 \f
  69 static struct {
  70   long bytes;
  71   double dltime;
  72 } limit_data;
  73
  74 static void
  75 limit_bandwidth_reset (void)
  76 {
  77   limit_data.bytes  = 0;
  78   limit_data.dltime = 0;
  79 }
  80
  81 /* Limit the bandwidth by pausing the download for an amount of time.
  82    BYTES is the number of bytes received from the network, and DELTA
  83    is how long it took to receive them.  */
  84
  85 static void
  86 limit_bandwidth (long bytes, double delta)
  87 {
  88   double expected;
  89
  90   limit_data.bytes += bytes;
  91   limit_data.dltime += delta;
  92
  93   expected = 1000.0 * limit_data.bytes / opt.limit_rate;
  94
  95   if (expected > limit_data.dltime)
  96     {
  97       double slp = expected - limit_data.dltime;
  98       if (slp < 200)
  99         {
 100           DEBUGP (("deferring a %.2f ms sleep (%ld/%.2f).\n",
 101                    slp, limit_data.bytes, limit_data.dltime));
 102           return;
 103         }
 104       DEBUGP (("sleeping %.2f ms\n", slp));
 105       usleep ((unsigned long) (1000 * slp));
 106     }
 107
 108   limit_data.bytes = 0;
 109   limit_data.dltime = 0;
 110 }
 111
/* Evaluate to the smaller of I and J.  Each argument may be evaluated
   twice, so do not pass expressions that have side effects.  */
#define MIN(i, j) ((i) <= (j) ? (i) : (j))
 113
 114 /* Reads the contents of file descriptor FD, until it is closed, or a
 115    read error occurs.  The data is read in 8K chunks, and stored to
 116    stream fp, which should have been open for writing.  If BUF is
 117    non-NULL and its file descriptor is equal to FD, flush RBUF first.
 118    This function will *not* use the rbuf_* functions!
 119
 120    The EXPECTED argument is passed to show_progress() unchanged, but
 121    otherwise ignored.
 122
 123    If opt.verbose is set, the progress is also shown.  RESTVAL
 124    represents a value from which to start downloading (which will be
 125    shown accordingly).  If RESTVAL is non-zero, the stream should have
 126    been open for appending.
 127
 128    The function exits and returns codes of 0, -1 and -2 if the
 129    connection was closed, there was a read error, or if it could not
 130    write to the output stream, respectively.
 131
 132    IMPORTANT: The function flushes the contents of the buffer in
 133    rbuf_flush() before actually reading from fd.  If you wish to read
 134    from fd immediately, flush or discard the buffer.  */
 135 int
 136 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
 137               struct rbuf *rbuf, int use_expected, double *elapsed)
 138 {
 139   int res = 0;
 140
 141   static char dlbuf[16384];
 142   int dlbufsize = sizeof (dlbuf);
 143
 144   void *progress = NULL;
 145   struct wget_timer *timer = wtimer_allocate ();
 146   double dltime = 0, last_dltime = 0;
 147
 148   *len = restval;
 149
 150   if (opt.verbose)
 151     progress = progress_create (restval, expected);
 152
 153   if (rbuf && RBUF_FD (rbuf) == fd)
 154     {
 155       int sz = 0;
 156       while ((res = rbuf_flush (rbuf, dlbuf, sizeof (dlbuf))) != 0)
 157         {
 158           fwrite (dlbuf, 1, res, fp);
 159           *len += res;
 160           sz += res;
 161         }
 162       if (sz)
 163         fflush (fp);
 164       if (ferror (fp))
 165         {
 166           res = -2;
 167           goto out;
 168         }
 169       if (progress)
 170         progress_update (progress, sz, 0);
 171     }
 172
 173   if (opt.limit_rate)
 174     limit_bandwidth_reset ();
 175   wtimer_reset (timer);
 176
 177   /* If we're limiting the download, set our buffer size to the
 178      limit.  */
 179   if (opt.limit_rate && opt.limit_rate < dlbufsize)
 180     dlbufsize = opt.limit_rate;
 181
 182   /* Read from fd while there is available data.
 183
 184      Normally, if expected is 0, it means that it is not known how
 185      much data is expected.  However, if use_expected is specified,
 186      then expected being zero means exactly that.  */
 187   while (!use_expected || (*len < expected))
 188     {
 189       int amount_to_read = (use_expected
 190                             ? MIN (expected - *len, dlbufsize) : dlbufsize);
 191 #ifdef HAVE_SSL
 192       if (rbuf->ssl!=NULL)
 193         res = ssl_iread (rbuf->ssl, dlbuf, amount_to_read);
 194       else
 195 #endif /* HAVE_SSL */
 196         res = iread (fd, dlbuf, amount_to_read);
 197
 198       if (res <= 0)
 199         break;
 200
 201       fwrite (dlbuf, 1, res, fp);
 202       /* Always flush the contents of the network packet.  This should
 203          not hinder performance: fast downloads will be received in
 204          16K chunks (which stdio would write out anyway), and slow
 205          downloads won't be limited with disk performance.  */
 206       fflush (fp);
 207       if (ferror (fp))
 208         {
 209           res = -2;
 210           goto out;
 211         }
 212
 213       /* If bandwidth is not limited, one call to wtimer_elapsed is
 214          sufficient.  */
 215       dltime = wtimer_elapsed (timer);
 216       if (opt.limit_rate)
 217         {
 218           limit_bandwidth (res, dltime - last_dltime);
 219           dltime = wtimer_elapsed (timer);
 220           last_dltime = dltime;
 221         }
 222
 223       if (progress)
 224         progress_update (progress, res, dltime);
 225       *len += res;
 226     }
 227   if (res < -1)
 228     res = -1;
 229
 230  out:
 231   if (progress)
 232     progress_finish (progress, dltime);
 233   if (elapsed)
 234     *elapsed = dltime;
 235   wtimer_delete (timer);
 236
 237   return res;
 238 }
 239 \f
 240 /* Return a printed representation of the download rate, as
 241    appropriate for the speed.  If PAD is non-zero, strings will be
 242    padded to the width of 7 characters (xxxx.xx).  */
 243 char *
 244 retr_rate (long bytes, double msecs, int pad)
 245 {
 246   static char res[20];
 247   static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 248   int units = 0;
 249
 250   double dlrate = calc_rate (bytes, msecs, &units);
 251   sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
 252
 253   return res;
 254 }
 255
 256 /* Calculate the download rate and trim it as appropriate for the
 257    speed.  Appropriate means that if rate is greater than 1K/s,
 258    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 259    are used.
 260
 261    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 262    GB/s.  */
 263 double
 264 calc_rate (long bytes, double msecs, int *units)
 265 {
 266   double dlrate;
 267
 268   assert (msecs >= 0);
 269   assert (bytes >= 0);
 270
 271   if (msecs == 0)
 272     /* If elapsed time is exactly zero, it means we're under the
 273        granularity of the timer.  This often happens on systems that
 274        use time() for the timer.  */
 275     msecs = wtimer_granularity ();
 276
 277   dlrate = (double)1000 * bytes / msecs;
 278   if (dlrate < 1024.0)
 279     *units = 0;
 280   else if (dlrate < 1024.0 * 1024.0)
 281     *units = 1, dlrate /= 1024.0;
 282   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 283     *units = 2, dlrate /= (1024.0 * 1024.0);
 284   else
 285     /* Maybe someone will need this, one day. */
 286     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
 287
 288   return dlrate;
 289 }
 290 \f
 291 /* Maximum number of allowed redirections.  20 was chosen as a
 292    "reasonable" value, which is low enough to not cause havoc, yet
 293    high enough to guarantee that normal retrievals will not be hurt by
 294    the check.  */
 295
 296 #define MAX_REDIRECTIONS 20
 297
/* Stash opt.post_data and opt.post_file_name in saved_post_data and
   saved_post_file_name, set both options to NULL, and set
   post_data_suspended.  The three variables the macro writes to are
   not declared here; they must be in scope where the macro is
   used.  */
#define SUSPEND_POST_DATA do {                  \
  post_data_suspended = 1;                      \
  saved_post_data = opt.post_data;              \
  saved_post_file_name = opt.post_file_name;    \
  opt.post_data = NULL;                         \
  opt.post_file_name = NULL;                    \
} while (0)
 305
/* If post_data_suspended is set, copy saved_post_data and
   saved_post_file_name back into opt.post_data and
   opt.post_file_name, and clear the flag.  Does nothing when the flag
   is not set, so it can be used on every exit path.  The three
   variables must be in scope where the macro is used.  */
#define RESTORE_POST_DATA do {                          \
  if (post_data_suspended)                              \
    {                                                   \
      opt.post_data = saved_post_data;                  \
      opt.post_file_name = saved_post_file_name;        \
      post_data_suspended = 0;                          \
    }                                                   \
} while (0)
 314
 315 /* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
 316    FTP, proxy, etc.  */
 317
 318 /* #### This function should be rewritten so it doesn't return from
 319    multiple points. */
 320
 321 uerr_t
 322 retrieve_url (const char *origurl, char **file, char **newloc,
 323               const char *refurl, int *dt)
 324 {
 325   uerr_t result;
 326   char *url;
 327   int location_changed, dummy;
 328   char *mynewloc, *proxy;
 329   struct url *u, *proxy_url;
 330   int up_error_code;            /* url parse error code */
 331   char *local_file;
 332   int redirection_count = 0;
 333
 334   int post_data_suspended = 0;
 335   char *saved_post_data = NULL;
 336   char *saved_post_file_name = NULL;
 337
 338   /* If dt is NULL, just ignore it.  */
 339   if (!dt)
 340     dt = &dummy;
 341   url = xstrdup (origurl);
 342   if (newloc)
 343     *newloc = NULL;
 344   if (file)
 345     *file = NULL;
 346
 347   u = url_parse (url, &up_error_code);
 348   if (!u)
 349     {
 350       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
 351       xfree (url);
 352       return URLERROR;
 353     }
 354
 355   if (!refurl)
 356     refurl = opt.referer;
 357
 358  redirected:
 359
 360   result = NOCONERROR;
 361   mynewloc = NULL;
 362   local_file = NULL;
 363   proxy_url = NULL;
 364
 365   proxy = getproxy (u);
 366   if (proxy)
 367     {
 368       /* Parse the proxy URL.  */
 369       proxy_url = url_parse (proxy, &up_error_code);
 370       if (!proxy_url)
 371         {
 372           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 373                      proxy, url_error (up_error_code));
 374           xfree (url);
 375           RESTORE_POST_DATA;
 376           return PROXERR;
 377         }
 378       if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
 379         {
 380           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 381           url_free (proxy_url);
 382           xfree (url);
 383           RESTORE_POST_DATA;
 384           return PROXERR;
 385         }
 386     }
 387
 388   if (u->scheme == SCHEME_HTTP
 389 #ifdef HAVE_SSL
 390       || u->scheme == SCHEME_HTTPS
 391 #endif
 392       || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
 393     {
 394       result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
 395     }
 396   else if (u->scheme == SCHEME_FTP)
 397     {
 398       /* If this is a redirection, we must not allow recursive FTP
 399          retrieval, so we save recursion to oldrec, and restore it
 400          later.  */
 401       int oldrec = opt.recursive;
 402       if (redirection_count)
 403         opt.recursive = 0;
 404       result = ftp_loop (u, dt, proxy_url);
 405       opt.recursive = oldrec;
 406
 407       /* There is a possibility of having HTTP being redirected to
 408          FTP.  In these cases we must decide whether the text is HTML
 409          according to the suffix.  The HTML suffixes are `.html',
 410          `.htm' and a few others, case-insensitive.  */
 411       if (redirection_count && local_file && u->scheme == SCHEME_FTP)
 412         {
 413           if (has_html_suffix_p (local_file))
 414             *dt |= TEXTHTML;
 415         }
 416     }
 417
 418   if (proxy_url)
 419     {
 420       url_free (proxy_url);
 421       proxy_url = NULL;
 422     }
 423
 424   location_changed = (result == NEWLOCATION);
 425   if (location_changed)
 426     {
 427       char *construced_newloc;
 428       struct url *newloc_parsed;
 429
 430       assert (mynewloc != NULL);
 431
 432       if (local_file)
 433         xfree (local_file);
 434
 435       /* The HTTP specs only allow absolute URLs to appear in
 436          redirects, but a ton of boneheaded webservers and CGIs out
 437          there break the rules and use relative URLs, and popular
 438          browsers are lenient about this, so wget should be too. */
 439       construced_newloc = uri_merge (url, mynewloc);
 440       xfree (mynewloc);
 441       mynewloc = construced_newloc;
 442
 443       /* Now, see if this new location makes sense. */
 444       newloc_parsed = url_parse (mynewloc, &up_error_code);
 445       if (!newloc_parsed)
 446         {
 447           logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
 448                      url_error (up_error_code));
 449           url_free (u);
 450           xfree (url);
 451           xfree (mynewloc);
 452           RESTORE_POST_DATA;
 453           return result;
 454         }
 455
 456       /* Now mynewloc will become newloc_parsed->url, because if the
 457          Location contained relative paths like .././something, we
 458          don't want that propagating as url.  */
 459       xfree (mynewloc);
 460       mynewloc = xstrdup (newloc_parsed->url);
 461
 462       /* Check for max. number of redirections.  */
 463       if (++redirection_count > MAX_REDIRECTIONS)
 464         {
 465           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
 466                      MAX_REDIRECTIONS);
 467           url_free (newloc_parsed);
 468           url_free (u);
 469           xfree (url);
 470           xfree (mynewloc);
 471           RESTORE_POST_DATA;
 472           return WRONGCODE;
 473         }
 474
 475       xfree (url);
 476       url = mynewloc;
 477       url_free (u);
 478       u = newloc_parsed;
 479
 480       /* If we're being redirected from POST, we don't want to POST
 481          again.  Many requests answer POST with a redirection to an
 482          index page; that redirection is clearly a GET.  We "suspend"
 483          POST data for the duration of the redirections, and restore
 484          it when we're done. */
 485       if (!post_data_suspended)
 486         SUSPEND_POST_DATA;
 487
 488       goto redirected;
 489     }
 490
 491   if (local_file)
 492     {
 493       if (*dt & RETROKF)
 494         {
 495           register_download (u->url, local_file);
 496           if (redirection_count && 0 != strcmp (origurl, u->url))
 497             register_redirection (origurl, u->url);
 498           if (*dt & TEXTHTML)
 499             register_html (u->url, local_file);
 500         }
 501     }
 502
 503   if (file)
 504     *file = local_file ? local_file : NULL;
 505   else
 506     FREE_MAYBE (local_file);
 507
 508   url_free (u);
 509
 510   if (redirection_count)
 511     {
 512       if (newloc)
 513         *newloc = url;
 514       else
 515         xfree (url);
 516     }
 517   else
 518     {
 519       if (newloc)
 520         *newloc = NULL;
 521       xfree (url);
 522     }
 523
 524   ++global_download_count;
 525   RESTORE_POST_DATA;
 526
 527   return result;
 528 }
 529
 530 /* Find the URLs in the file and call retrieve_url() for each of
 531    them.  If HTML is non-zero, treat the file as HTML, and construct
 532    the URLs accordingly.
 533
 534    If opt.recursive is set, call recursive_retrieve() for each file.  */
 535 uerr_t
 536 retrieve_from_file (const char *file, int html, int *count)
 537 {
 538   uerr_t status;
 539   struct urlpos *url_list, *cur_url;
 540
 541   url_list = (html ? get_urls_html (file, NULL, NULL)
 542               : get_urls_file (file));
 543   status = RETROK;             /* Suppose everything is OK.  */
 544   *count = 0;                  /* Reset the URL count.  */
 545
 546   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
 547     {
 548       char *filename = NULL, *new_file = NULL;
 549       int dt;
 550
 551       if (cur_url->ignore_when_downloading)
 552         continue;
 553
 554       if (downloaded_exceeds_quota ())
 555         {
 556           status = QUOTEXC;
 557           break;
 558         }
 559       if (opt.recursive && cur_url->url->scheme != SCHEME_FTP)
 560         status = retrieve_tree (cur_url->url->url);
 561       else
 562         status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
 563
 564       if (filename && opt.delete_after && file_exists_p (filename))
 565         {
 566           DEBUGP (("Removing file due to --delete-after in"
 567                    " retrieve_from_file():\n"));
 568           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
 569           if (unlink (filename))
 570             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 571           dt &= ~RETROKF;
 572         }
 573
 574       FREE_MAYBE (new_file);
 575       FREE_MAYBE (filename);
 576     }
 577
 578   /* Free the linked list of URL-s.  */
 579   free_urlpos (url_list);
 580
 581   return status;
 582 }
 583
 584 /* Print `giving up', or `retrying', depending on the impending
 585    action.  N1 and N2 are the attempt number and the attempt limit.  */
 586 void
 587 printwhat (int n1, int n2)
 588 {
 589   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
 590 }
 591
 592 /* Increment opt.downloaded by BY_HOW_MUCH.  If an overflow occurs,
 593    set opt.downloaded_overflow to 1. */
 594 void
 595 downloaded_increase (unsigned long by_how_much)
 596 {
 597   VERY_LONG_TYPE old;
 598   if (opt.downloaded_overflow)
 599     return;
 600   old = opt.downloaded;
 601   opt.downloaded += by_how_much;
 602   if (opt.downloaded < old)     /* carry flag, where are you when I
 603                                    need you? */
 604     {
 605       /* Overflow. */
 606       opt.downloaded_overflow = 1;
 607       opt.downloaded = ~((VERY_LONG_TYPE)0);
 608     }
 609 }
 610
 611 /* Return non-zero if the downloaded amount of bytes exceeds the
 612    desired quota.  If quota is not set or if the amount overflowed, 0
 613    is returned. */
 614 int
 615 downloaded_exceeds_quota (void)
 616 {
 617   if (!opt.quota)
 618     return 0;
 619   if (opt.downloaded_overflow)
 620     /* We don't really know.  (Wildly) assume not. */
 621     return 0;
 622
 623   return opt.downloaded > opt.quota;
 624 }
 625
 626 /* If opt.wait or opt.waitretry are specified, and if certain
 627    conditions are met, sleep the appropriate number of seconds.  See
 628    the documentation of --wait and --waitretry for more information.
 629
 630    COUNT is the count of current retrieval, beginning with 1. */
 631
 632 void
 633 sleep_between_retrievals (int count)
 634 {
 635   static int first_retrieval = 1;
 636
 637   if (first_retrieval)
 638     {
 639       /* Don't sleep before the very first retrieval. */
 640       first_retrieval = 0;
 641       return;
 642     }
 643
 644   if (opt.waitretry && count > 1)
 645     {
 646       /* If opt.waitretry is specified and this is a retry, wait for
 647          COUNT-1 number of seconds, or for opt.waitretry seconds.  */
 648       if (count <= opt.waitretry)
 649         sleep (count - 1);
 650       else
 651         sleep (opt.waitretry);
 652     }
 653   else if (opt.wait)
 654     {
 655       if (!opt.random_wait || count > 1)
 656         /* If random-wait is not specified, or if we are sleeping
 657            between retries of the same download, sleep the fixed
 658            interval.  */
 659         sleep (opt.wait);
 660       else
 661         {
 662           /* Sleep a random amount of time averaging in opt.wait
 663              seconds.  The sleeping amount ranges from 0 to
 664              opt.wait*2, inclusive.  */
 665           int waitsecs = random_number (opt.wait * 2 + 1);
 666
 667           DEBUGP (("sleep_between_retrievals: norm=%ld,fuzz=%ld,sleep=%d\n",
 668                    opt.wait, waitsecs - opt.wait, waitsecs));
 669
 670           if (waitsecs)
 671             sleep (waitsecs);
 672         }
 673     }
 674 }