sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <sys/types.h>
  35 #ifdef HAVE_UNISTD_H
  36 # include <unistd.h>
  37 #endif /* HAVE_UNISTD_H */
  38 #include <errno.h>
  39 #ifdef HAVE_STRING_H
  40 # include <string.h>
  41 #else
  42 # include <strings.h>
  43 #endif /* HAVE_STRING_H */
  44 #include <assert.h>
  45
  46 #include "wget.h"
  47 #include "utils.h"
  48 #include "retr.h"
  49 #include "progress.h"
  50 #include "url.h"
  51 #include "recur.h"
  52 #include "ftp.h"
  53 #include "host.h"
  54 #include "connect.h"
  55 #include "hash.h"
  56
  57 #ifdef HAVE_SSL
  58 # include "gen_sslfunc.h"       /* for ssl_iread */
  59 #endif
  60
  61 #ifndef errno
  62 extern int errno;
  63 #endif
  64
  65 /* See the comment in gethttp() why this is needed. */
  66 int global_download_count;
  67
  68 \f
  69 static struct {
  70   long bytes;
  71   long dltime;
  72 } limit_data;
  73
  74 static void
  75 limit_bandwidth_reset (void)
  76 {
  77   limit_data.bytes  = 0;
  78   limit_data.dltime = 0;
  79 }
  80
  81 /* Limit the bandwidth by pausing the download for an amount of time.
  82    BYTES is the number of bytes received from the network, DELTA is
  83    how long it took to receive them, DLTIME the current download time,
  84    TIMER the timer, and ADJUSTMENT the previous.  */
  85
  86 static void
  87 limit_bandwidth (long bytes, long delta)
  88 {
  89   long expected;
  90
  91   limit_data.bytes += bytes;
  92   limit_data.dltime += delta;
  93
  94   expected = (long)(1000.0 * limit_data.bytes / opt.limit_rate);
  95
  96   if (expected > limit_data.dltime)
  97     {
  98       long slp = expected - limit_data.dltime;
  99       if (slp < 200)
 100         {
 101           DEBUGP (("deferring a %ld ms sleep (%ld/%ld) until later.\n",
 102                    slp, limit_data.bytes, limit_data.dltime));
 103           return;
 104         }
 105       DEBUGP (("sleeping %ld ms\n", slp));
 106       usleep (1000 * slp);
 107     }
 108
 109   limit_data.bytes = 0;
 110   limit_data.dltime = 0;
 111 }
 112
 113 #define MIN(i, j) ((i) <= (j) ? (i) : (j))
 114
 115 /* Reads the contents of file descriptor FD, until it is closed, or a
 116    read error occurs.  The data is read in 8K chunks, and stored to
 117    stream fp, which should have been open for writing.  If BUF is
 118    non-NULL and its file descriptor is equal to FD, flush RBUF first.
 119    This function will *not* use the rbuf_* functions!
 120
 121    The EXPECTED argument is passed to show_progress() unchanged, but
 122    otherwise ignored.
 123
 124    If opt.verbose is set, the progress is also shown.  RESTVAL
 125    represents a value from which to start downloading (which will be
 126    shown accordingly).  If RESTVAL is non-zero, the stream should have
 127    been open for appending.
 128
 129    The function exits and returns codes of 0, -1 and -2 if the
 130    connection was closed, there was a read error, or if it could not
 131    write to the output stream, respectively.
 132
 133    IMPORTANT: The function flushes the contents of the buffer in
 134    rbuf_flush() before actually reading from fd.  If you wish to read
 135    from fd immediately, flush or discard the buffer.  */
 136 int
 137 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
 138               struct rbuf *rbuf, int use_expected, long *elapsed)
 139 {
 140   int res = 0;
 141   static char c[8192];
 142   void *progress = NULL;
 143   struct wget_timer *timer = wtimer_allocate ();
 144   long dltime = 0, last_dltime = 0;
 145
 146   *len = restval;
 147
 148   if (opt.verbose)
 149     progress = progress_create (restval, expected);
 150
 151   if (rbuf && RBUF_FD (rbuf) == fd)
 152     {
 153       int sz = 0;
 154       while ((res = rbuf_flush (rbuf, c, sizeof (c))) != 0)
 155         {
 156           fwrite (c, sizeof (char), res, fp);
 157           *len += res;
 158           sz += res;
 159         }
 160       if (sz)
 161         fflush (fp);
 162       if (ferror (fp))
 163         {
 164           res = -2;
 165           goto out;
 166         }
 167       if (progress)
 168         progress_update (progress, sz, 0);
 169     }
 170
 171   if (opt.limit_rate)
 172     limit_bandwidth_reset ();
 173   wtimer_reset (timer);
 174
 175   /* Read from fd while there is available data.
 176
 177      Normally, if expected is 0, it means that it is not known how
 178      much data is expected.  However, if use_expected is specified,
 179      then expected being zero means exactly that.  */
 180   while (!use_expected || (*len < expected))
 181     {
 182       int amount_to_read = (use_expected
 183                             ? MIN (expected - *len, sizeof (c))
 184                             : sizeof (c));
 185 #ifdef HAVE_SSL
 186       if (rbuf->ssl!=NULL)
 187         res = ssl_iread (rbuf->ssl, c, amount_to_read);
 188       else
 189 #endif /* HAVE_SSL */
 190         res = iread (fd, c, amount_to_read);
 191
 192       if (res > 0)
 193         {
 194           fwrite (c, sizeof (char), res, fp);
 195           /* Always flush the contents of the network packet.  This
 196              should not be adverse to performance, as the network
 197              packets typically won't be too tiny anyway.  */
 198           fflush (fp);
 199           if (ferror (fp))
 200             {
 201               res = -2;
 202               goto out;
 203             }
 204
 205           /* If bandwidth is not limited, one call to wtimer_elapsed
 206              is sufficient.  */
 207           dltime = wtimer_elapsed (timer);
 208           if (opt.limit_rate)
 209             {
 210               limit_bandwidth (res, dltime - last_dltime);
 211               dltime = wtimer_elapsed (timer);
 212               last_dltime = dltime;
 213             }
 214
 215           if (progress)
 216             progress_update (progress, res, dltime);
 217           *len += res;
 218         }
 219       else
 220         break;
 221     }
 222   if (res < -1)
 223     res = -1;
 224
 225  out:
 226   if (progress)
 227     progress_finish (progress, dltime);
 228   if (elapsed)
 229     *elapsed = dltime;
 230   wtimer_delete (timer);
 231
 232   return res;
 233 }
 234 \f
 235 /* Return a printed representation of the download rate, as
 236    appropriate for the speed.  If PAD is non-zero, strings will be
 237    padded to the width of 7 characters (xxxx.xx).  */
 238 char *
 239 retr_rate (long bytes, long msecs, int pad)
 240 {
 241   static char res[20];
 242   static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 243   int units = 0;
 244
 245   double dlrate = calc_rate (bytes, msecs, &units);
 246   sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
 247
 248   return res;
 249 }
 250
 251 /* Calculate the download rate and trim it as appropriate for the
 252    speed.  Appropriate means that if rate is greater than 1K/s,
 253    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 254    are used.
 255
 256    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 257    GB/s.  */
 258 double
 259 calc_rate (long bytes, long msecs, int *units)
 260 {
 261   double dlrate;
 262
 263   assert (msecs >= 0);
 264   assert (bytes >= 0);
 265
 266   if (msecs == 0)
 267     /* If elapsed time is 0, it means we're under the granularity of
 268        the timer.  This often happens on systems that use time() for
 269        the timer.  */
 270     msecs = wtimer_granularity ();
 271
 272   dlrate = (double)1000 * bytes / msecs;
 273   if (dlrate < 1024.0)
 274     *units = 0;
 275   else if (dlrate < 1024.0 * 1024.0)
 276     *units = 1, dlrate /= 1024.0;
 277   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 278     *units = 2, dlrate /= (1024.0 * 1024.0);
 279   else
 280     /* Maybe someone will need this one day.  More realistically, it
 281        will get tickled by buggy timers. */
 282     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
 283
 284   return dlrate;
 285 }
 286 \f
 287 /* Maximum number of allowed redirections.  20 was chosen as a
 288    "reasonable" value, which is low enough to not cause havoc, yet
 289    high enough to guarantee that normal retrievals will not be hurt by
 290    the check.  */
 291
 292 #define MAX_REDIRECTIONS 20
 293
 294 #define SUSPEND_POST_DATA do {                  \
 295   post_data_suspended = 1;                      \
 296   saved_post_data = opt.post_data;              \
 297   saved_post_file_name = opt.post_file_name;    \
 298   opt.post_data = NULL;                         \
 299   opt.post_file_name = NULL;                    \
 300 } while (0)
 301
 302 #define RESTORE_POST_DATA do {                          \
 303   if (post_data_suspended)                              \
 304     {                                                   \
 305       opt.post_data = saved_post_data;                  \
 306       opt.post_file_name = saved_post_file_name;        \
 307       post_data_suspended = 0;                          \
 308     }                                                   \
 309 } while (0)
 310
 311 /* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
 312    FTP, proxy, etc.  */
 313
 314 /* #### This function should be rewritten so it doesn't return from
 315    multiple points. */
 316
 317 uerr_t
 318 retrieve_url (const char *origurl, char **file, char **newloc,
 319               const char *refurl, int *dt)
 320 {
 321   uerr_t result;
 322   char *url;
 323   int location_changed, dummy;
 324   char *mynewloc, *proxy;
 325   struct url *u, *proxy_url;
 326   int up_error_code;            /* url parse error code */
 327   char *local_file;
 328   int redirection_count = 0;
 329
 330   int post_data_suspended = 0;
 331   char *saved_post_data = NULL;
 332   char *saved_post_file_name = NULL;
 333
 334   /* If dt is NULL, just ignore it.  */
 335   if (!dt)
 336     dt = &dummy;
 337   url = xstrdup (origurl);
 338   if (newloc)
 339     *newloc = NULL;
 340   if (file)
 341     *file = NULL;
 342
 343   u = url_parse (url, &up_error_code);
 344   if (!u)
 345     {
 346       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
 347       xfree (url);
 348       return URLERROR;
 349     }
 350
 351   if (!refurl)
 352     refurl = opt.referer;
 353
 354  redirected:
 355
 356   result = NOCONERROR;
 357   mynewloc = NULL;
 358   local_file = NULL;
 359   proxy_url = NULL;
 360
 361   proxy = getproxy (u);
 362   if (proxy)
 363     {
 364       /* Parse the proxy URL.  */
 365       proxy_url = url_parse (proxy, &up_error_code);
 366       if (!proxy_url)
 367         {
 368           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 369                      proxy, url_error (up_error_code));
 370           xfree (url);
 371           RESTORE_POST_DATA;
 372           return PROXERR;
 373         }
 374       if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
 375         {
 376           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 377           url_free (proxy_url);
 378           xfree (url);
 379           RESTORE_POST_DATA;
 380           return PROXERR;
 381         }
 382     }
 383
 384   if (u->scheme == SCHEME_HTTP
 385 #ifdef HAVE_SSL
 386       || u->scheme == SCHEME_HTTPS
 387 #endif
 388       || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
 389     {
 390       result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
 391     }
 392   else if (u->scheme == SCHEME_FTP)
 393     {
 394       /* If this is a redirection, we must not allow recursive FTP
 395          retrieval, so we save recursion to oldrec, and restore it
 396          later.  */
 397       int oldrec = opt.recursive;
 398       if (redirection_count)
 399         opt.recursive = 0;
 400       result = ftp_loop (u, dt, proxy_url);
 401       opt.recursive = oldrec;
 402
 403       /* There is a possibility of having HTTP being redirected to
 404          FTP.  In these cases we must decide whether the text is HTML
 405          according to the suffix.  The HTML suffixes are `.html',
 406          `.htm' and a few others, case-insensitive.  */
 407       if (redirection_count && local_file && u->scheme == SCHEME_FTP)
 408         {
 409           if (has_html_suffix_p (local_file))
 410             *dt |= TEXTHTML;
 411         }
 412     }
 413
 414   if (proxy_url)
 415     {
 416       url_free (proxy_url);
 417       proxy_url = NULL;
 418     }
 419
 420   location_changed = (result == NEWLOCATION);
 421   if (location_changed)
 422     {
 423       char *construced_newloc;
 424       struct url *newloc_parsed;
 425
 426       assert (mynewloc != NULL);
 427
 428       if (local_file)
 429         xfree (local_file);
 430
 431       /* The HTTP specs only allow absolute URLs to appear in
 432          redirects, but a ton of boneheaded webservers and CGIs out
 433          there break the rules and use relative URLs, and popular
 434          browsers are lenient about this, so wget should be too. */
 435       construced_newloc = uri_merge (url, mynewloc);
 436       xfree (mynewloc);
 437       mynewloc = construced_newloc;
 438
 439       /* Now, see if this new location makes sense. */
 440       newloc_parsed = url_parse (mynewloc, &up_error_code);
 441       if (!newloc_parsed)
 442         {
 443           logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
 444                      url_error (up_error_code));
 445           url_free (u);
 446           xfree (url);
 447           xfree (mynewloc);
 448           RESTORE_POST_DATA;
 449           return result;
 450         }
 451
 452       /* Now mynewloc will become newloc_parsed->url, because if the
 453          Location contained relative paths like .././something, we
 454          don't want that propagating as url.  */
 455       xfree (mynewloc);
 456       mynewloc = xstrdup (newloc_parsed->url);
 457
 458       /* Check for max. number of redirections.  */
 459       if (++redirection_count > MAX_REDIRECTIONS)
 460         {
 461           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
 462                      MAX_REDIRECTIONS);
 463           url_free (newloc_parsed);
 464           url_free (u);
 465           xfree (url);
 466           xfree (mynewloc);
 467           RESTORE_POST_DATA;
 468           return WRONGCODE;
 469         }
 470
 471       xfree (url);
 472       url = mynewloc;
 473       url_free (u);
 474       u = newloc_parsed;
 475
 476       /* If we're being redirected from POST, we don't want to POST
 477          again.  Many requests answer POST with a redirection to an
 478          index page; that redirection is clearly a GET.  We "suspend"
 479          POST data for the duration of the redirections, and restore
 480          it when we're done. */
 481       if (!post_data_suspended)
 482         SUSPEND_POST_DATA;
 483
 484       goto redirected;
 485     }
 486
 487   if (local_file)
 488     {
 489       if (*dt & RETROKF)
 490         {
 491           register_download (u->url, local_file);
 492           if (redirection_count && 0 != strcmp (origurl, u->url))
 493             register_redirection (origurl, u->url);
 494           if (*dt & TEXTHTML)
 495             register_html (u->url, local_file);
 496         }
 497     }
 498
 499   if (file)
 500     *file = local_file ? local_file : NULL;
 501   else
 502     FREE_MAYBE (local_file);
 503
 504   url_free (u);
 505
 506   if (redirection_count)
 507     {
 508       if (newloc)
 509         *newloc = url;
 510       else
 511         xfree (url);
 512     }
 513   else
 514     {
 515       if (newloc)
 516         *newloc = NULL;
 517       xfree (url);
 518     }
 519
 520   ++global_download_count;
 521   RESTORE_POST_DATA;
 522
 523   return result;
 524 }
 525
 526 /* Find the URLs in the file and call retrieve_url() for each of
 527    them.  If HTML is non-zero, treat the file as HTML, and construct
 528    the URLs accordingly.
 529
 530    If opt.recursive is set, call recursive_retrieve() for each file.  */
 531 uerr_t
 532 retrieve_from_file (const char *file, int html, int *count)
 533 {
 534   uerr_t status;
 535   struct urlpos *url_list, *cur_url;
 536
 537   url_list = (html ? get_urls_html (file, NULL, NULL)
 538               : get_urls_file (file));
 539   status = RETROK;             /* Suppose everything is OK.  */
 540   *count = 0;                  /* Reset the URL count.  */
 541
 542   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
 543     {
 544       char *filename = NULL, *new_file = NULL;
 545       int dt;
 546
 547       if (cur_url->ignore_when_downloading)
 548         continue;
 549
 550       if (downloaded_exceeds_quota ())
 551         {
 552           status = QUOTEXC;
 553           break;
 554         }
 555       if (opt.recursive && cur_url->url->scheme != SCHEME_FTP)
 556         status = retrieve_tree (cur_url->url->url);
 557       else
 558         status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
 559
 560       if (filename && opt.delete_after && file_exists_p (filename))
 561         {
 562           DEBUGP (("Removing file due to --delete-after in"
 563                    " retrieve_from_file():\n"));
 564           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
 565           if (unlink (filename))
 566             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 567           dt &= ~RETROKF;
 568         }
 569
 570       FREE_MAYBE (new_file);
 571       FREE_MAYBE (filename);
 572     }
 573
 574   /* Free the linked list of URL-s.  */
 575   free_urlpos (url_list);
 576
 577   return status;
 578 }
 579
 580 /* Print `giving up', or `retrying', depending on the impending
 581    action.  N1 and N2 are the attempt number and the attempt limit.  */
 582 void
 583 printwhat (int n1, int n2)
 584 {
 585   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
 586 }
 587
 588 /* Increment opt.downloaded by BY_HOW_MUCH.  If an overflow occurs,
 589    set opt.downloaded_overflow to 1. */
 590 void
 591 downloaded_increase (unsigned long by_how_much)
 592 {
 593   VERY_LONG_TYPE old;
 594   if (opt.downloaded_overflow)
 595     return;
 596   old = opt.downloaded;
 597   opt.downloaded += by_how_much;
 598   if (opt.downloaded < old)     /* carry flag, where are you when I
 599                                    need you? */
 600     {
 601       /* Overflow. */
 602       opt.downloaded_overflow = 1;
 603       opt.downloaded = ~((VERY_LONG_TYPE)0);
 604     }
 605 }
 606
 607 /* Return non-zero if the downloaded amount of bytes exceeds the
 608    desired quota.  If quota is not set or if the amount overflowed, 0
 609    is returned. */
 610 int
 611 downloaded_exceeds_quota (void)
 612 {
 613   if (!opt.quota)
 614     return 0;
 615   if (opt.downloaded_overflow)
 616     /* We don't really know.  (Wildly) assume not. */
 617     return 0;
 618
 619   return opt.downloaded > opt.quota;
 620 }
 621
 622 /* If opt.wait or opt.waitretry are specified, and if certain
 623    conditions are met, sleep the appropriate number of seconds.  See
 624    the documentation of --wait and --waitretry for more information.
 625
 626    COUNT is the count of current retrieval, beginning with 1. */
 627
 628 void
 629 sleep_between_retrievals (int count)
 630 {
 631   static int first_retrieval = 1;
 632
 633   if (first_retrieval)
 634     {
 635       /* Don't sleep before the very first retrieval. */
 636       first_retrieval = 0;
 637       return;
 638     }
 639
 640   if (opt.waitretry && count > 1)
 641     {
 642       /* If opt.waitretry is specified and this is a retry, wait for
 643          COUNT-1 number of seconds, or for opt.waitretry seconds.  */
 644       if (count <= opt.waitretry)
 645         sleep (count - 1);
 646       else
 647         sleep (opt.waitretry);
 648     }
 649   else if (opt.wait)
 650     {
 651       if (!opt.random_wait || count > 1)
 652         /* If random-wait is not specified, or if we are sleeping
 653            between retries of the same download, sleep the fixed
 654            interval.  */
 655         sleep (opt.wait);
 656       else
 657         {
 658           /* Sleep a random amount of time averaging in opt.wait
 659              seconds.  The sleeping amount ranges from 0 to
 660              opt.wait*2, inclusive.  */
 661           int waitsecs = random_number (opt.wait * 2 + 1);
 662
 663           DEBUGP (("sleep_between_retrievals: norm=%ld,fuzz=%ld,sleep=%d\n",
 664                    opt.wait, waitsecs - opt.wait, waitsecs));
 665
 666           if (waitsecs)
 667             sleep (waitsecs);
 668         }
 669     }
 670 }