sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <sys/types.h>
  35 #ifdef HAVE_UNISTD_H
  36 # include <unistd.h>
  37 #endif /* HAVE_UNISTD_H */
  38 #include <errno.h>
  39 #ifdef HAVE_STRING_H
  40 # include <string.h>
  41 #else
  42 # include <strings.h>
  43 #endif /* HAVE_STRING_H */
  44 #include <assert.h>
  45
  46 #include "wget.h"
  47 #include "utils.h"
  48 #include "retr.h"
  49 #include "progress.h"
  50 #include "url.h"
  51 #include "recur.h"
  52 #include "ftp.h"
  53 #include "host.h"
  54 #include "connect.h"
  55 #include "hash.h"
  56
  57 #ifdef HAVE_SSL
  58 # include "gen_sslfunc.h"       /* for ssl_iread */
  59 #endif
  60
  61 #ifndef errno
  62 extern int errno;
  63 #endif
  64
  65 /* See the comment in gethttp() why this is needed. */
  66 int global_download_count;
  67
  68 \f
  69 static struct {
  70   long bytes;
  71   double dltime;
  72 } limit_data;
  73
  74 static void
  75 limit_bandwidth_reset (void)
  76 {
  77   limit_data.bytes  = 0;
  78   limit_data.dltime = 0;
  79 }
  80
  81 /* Limit the bandwidth by pausing the download for an amount of time.
  82    BYTES is the number of bytes received from the network, DELTA is
  83    how long it took to receive them, DLTIME the current download time,
  84    TIMER the timer, and ADJUSTMENT the previous.  */
  85
  86 static void
  87 limit_bandwidth (long bytes, double delta)
  88 {
  89   double expected;
  90
  91   limit_data.bytes += bytes;
  92   limit_data.dltime += delta;
  93
  94   expected = 1000.0 * limit_data.bytes / opt.limit_rate;
  95
  96   if (expected > limit_data.dltime)
  97     {
  98       double slp = expected - limit_data.dltime;
  99       if (slp < 200)
 100         {
 101           DEBUGP (("deferring a %.2f ms sleep (%ld/%.2f).\n",
 102                    slp, limit_data.bytes, limit_data.dltime));
 103           return;
 104         }
 105       DEBUGP (("sleeping %.2f ms\n", slp));
 106       usleep ((unsigned long) (1000 * slp));
 107     }
 108
 109   limit_data.bytes = 0;
 110   limit_data.dltime = 0;
 111 }
 112
/* Yield the smaller of I and J.  This is a macro, so the argument
   that ends up being selected is evaluated twice; do not pass
   expressions with side effects.  */
#define MIN(i, j) ((i) <= (j) ? (i) : (j))
 114
 115 /* Reads the contents of file descriptor FD, until it is closed, or a
 116    read error occurs.  The data is read in 8K chunks, and stored to
 117    stream fp, which should have been open for writing.  If BUF is
 118    non-NULL and its file descriptor is equal to FD, flush RBUF first.
 119    This function will *not* use the rbuf_* functions!
 120
 121    The EXPECTED argument is passed to show_progress() unchanged, but
 122    otherwise ignored.
 123
 124    If opt.verbose is set, the progress is also shown.  RESTVAL
 125    represents a value from which to start downloading (which will be
 126    shown accordingly).  If RESTVAL is non-zero, the stream should have
 127    been open for appending.
 128
 129    The function exits and returns codes of 0, -1 and -2 if the
 130    connection was closed, there was a read error, or if it could not
 131    write to the output stream, respectively.
 132
 133    IMPORTANT: The function flushes the contents of the buffer in
 134    rbuf_flush() before actually reading from fd.  If you wish to read
 135    from fd immediately, flush or discard the buffer.  */
 136 int
 137 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
 138               struct rbuf *rbuf, int use_expected, double *elapsed)
 139 {
 140   int res = 0;
 141
 142   static char dlbuf[16384];
 143   int dlbufsize = sizeof (dlbuf);
 144
 145   void *progress = NULL;
 146   struct wget_timer *timer = wtimer_allocate ();
 147   double dltime = 0, last_dltime = 0;
 148
 149   *len = restval;
 150
 151   if (opt.verbose)
 152     progress = progress_create (restval, expected);
 153
 154   if (rbuf && RBUF_FD (rbuf) == fd)
 155     {
 156       int sz = 0;
 157       while ((res = rbuf_flush (rbuf, dlbuf, sizeof (dlbuf))) != 0)
 158         {
 159           fwrite (dlbuf, 1, res, fp);
 160           *len += res;
 161           sz += res;
 162         }
 163       if (sz)
 164         fflush (fp);
 165       if (ferror (fp))
 166         {
 167           res = -2;
 168           goto out;
 169         }
 170       if (progress)
 171         progress_update (progress, sz, 0);
 172     }
 173
 174   if (opt.limit_rate)
 175     limit_bandwidth_reset ();
 176   wtimer_reset (timer);
 177
 178   /* If we're limiting the download, set our buffer size to the
 179      limit.  */
 180   if (opt.limit_rate && opt.limit_rate < dlbufsize)
 181     dlbufsize = opt.limit_rate;
 182
 183   /* Read from fd while there is available data.
 184
 185      Normally, if expected is 0, it means that it is not known how
 186      much data is expected.  However, if use_expected is specified,
 187      then expected being zero means exactly that.  */
 188   while (!use_expected || (*len < expected))
 189     {
 190       int amount_to_read = (use_expected
 191                             ? MIN (expected - *len, dlbufsize) : dlbufsize);
 192 #ifdef HAVE_SSL
 193       if (rbuf->ssl!=NULL)
 194         res = ssl_iread (rbuf->ssl, dlbufsize, amount_to_read);
 195       else
 196 #endif /* HAVE_SSL */
 197         res = iread (fd, dlbuf, amount_to_read);
 198
 199       if (res > 0)
 200         {
 201           fwrite (dlbuf, 1, res, fp);
 202           /* Always flush the contents of the network packet.  This
 203              should not be adverse to performance, as the network
 204              packets typically won't be too tiny anyway.  */
 205           fflush (fp);
 206           if (ferror (fp))
 207             {
 208               res = -2;
 209               goto out;
 210             }
 211
 212           /* If bandwidth is not limited, one call to wtimer_elapsed
 213              is sufficient.  */
 214           dltime = wtimer_elapsed (timer);
 215           if (opt.limit_rate)
 216             {
 217               limit_bandwidth (res, dltime - last_dltime);
 218               dltime = wtimer_elapsed (timer);
 219               last_dltime = dltime;
 220             }
 221
 222           if (progress)
 223             progress_update (progress, res, dltime);
 224           *len += res;
 225         }
 226       else
 227         break;
 228     }
 229   if (res < -1)
 230     res = -1;
 231
 232  out:
 233   if (progress)
 234     progress_finish (progress, dltime);
 235   if (elapsed)
 236     *elapsed = dltime;
 237   wtimer_delete (timer);
 238
 239   return res;
 240 }
 241 \f
 242 /* Return a printed representation of the download rate, as
 243    appropriate for the speed.  If PAD is non-zero, strings will be
 244    padded to the width of 7 characters (xxxx.xx).  */
 245 char *
 246 retr_rate (long bytes, double msecs, int pad)
 247 {
 248   static char res[20];
 249   static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 250   int units = 0;
 251
 252   double dlrate = calc_rate (bytes, msecs, &units);
 253   sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
 254
 255   return res;
 256 }
 257
 258 /* Calculate the download rate and trim it as appropriate for the
 259    speed.  Appropriate means that if rate is greater than 1K/s,
 260    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 261    are used.
 262
 263    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 264    GB/s.  */
 265 double
 266 calc_rate (long bytes, double msecs, int *units)
 267 {
 268   double dlrate;
 269
 270   assert (msecs >= 0);
 271   assert (bytes >= 0);
 272
 273   if (msecs == 0)
 274     /* If elapsed time is exactly zero, it means we're under the
 275        granularity of the timer.  This often happens on systems that
 276        use time() for the timer.  */
 277     msecs = wtimer_granularity ();
 278
 279   dlrate = (double)1000 * bytes / msecs;
 280   if (dlrate < 1024.0)
 281     *units = 0;
 282   else if (dlrate < 1024.0 * 1024.0)
 283     *units = 1, dlrate /= 1024.0;
 284   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 285     *units = 2, dlrate /= (1024.0 * 1024.0);
 286   else
 287     /* Maybe someone will need this one day.  More realistically, it
 288        will get tickled by buggy timers. */
 289     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
 290
 291   return dlrate;
 292 }
 293 \f
/* Maximum number of allowed redirections.  20 was chosen as a
   "reasonable" value, which is low enough to not cause havoc, yet
   high enough to guarantee that normal retrievals will not be hurt by
   the check.  */

#define MAX_REDIRECTIONS 20

/* Stash opt.post_data and opt.post_file_name in local variables and
   clear them in OPT.  The macro expands to code that uses the locals
   post_data_suspended, saved_post_data and saved_post_file_name,
   which must be declared in the function where it is used.  */
#define SUSPEND_POST_DATA do {                  \
  post_data_suspended = 1;                      \
  saved_post_data = opt.post_data;              \
  saved_post_file_name = opt.post_file_name;    \
  opt.post_data = NULL;                         \
  opt.post_file_name = NULL;                    \
} while (0)

/* Undo SUSPEND_POST_DATA: put the saved values back into OPT.  Does
   nothing if the post data is not currently suspended, so it is safe
   to use on every exit path.  Relies on the same locals as
   SUSPEND_POST_DATA.  */
#define RESTORE_POST_DATA do {                          \
  if (post_data_suspended)                              \
    {                                                   \
      opt.post_data = saved_post_data;                  \
      opt.post_file_name = saved_post_file_name;        \
      post_data_suspended = 0;                          \
    }                                                   \
} while (0)
 317
 318 /* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
 319    FTP, proxy, etc.  */
 320
 321 /* #### This function should be rewritten so it doesn't return from
 322    multiple points. */
 323
 324 uerr_t
 325 retrieve_url (const char *origurl, char **file, char **newloc,
 326               const char *refurl, int *dt)
 327 {
 328   uerr_t result;
 329   char *url;
 330   int location_changed, dummy;
 331   char *mynewloc, *proxy;
 332   struct url *u, *proxy_url;
 333   int up_error_code;            /* url parse error code */
 334   char *local_file;
 335   int redirection_count = 0;
 336
 337   int post_data_suspended = 0;
 338   char *saved_post_data = NULL;
 339   char *saved_post_file_name = NULL;
 340
 341   /* If dt is NULL, just ignore it.  */
 342   if (!dt)
 343     dt = &dummy;
 344   url = xstrdup (origurl);
 345   if (newloc)
 346     *newloc = NULL;
 347   if (file)
 348     *file = NULL;
 349
 350   u = url_parse (url, &up_error_code);
 351   if (!u)
 352     {
 353       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
 354       xfree (url);
 355       return URLERROR;
 356     }
 357
 358   if (!refurl)
 359     refurl = opt.referer;
 360
 361  redirected:
 362
 363   result = NOCONERROR;
 364   mynewloc = NULL;
 365   local_file = NULL;
 366   proxy_url = NULL;
 367
 368   proxy = getproxy (u);
 369   if (proxy)
 370     {
 371       /* Parse the proxy URL.  */
 372       proxy_url = url_parse (proxy, &up_error_code);
 373       if (!proxy_url)
 374         {
 375           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 376                      proxy, url_error (up_error_code));
 377           xfree (url);
 378           RESTORE_POST_DATA;
 379           return PROXERR;
 380         }
 381       if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
 382         {
 383           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 384           url_free (proxy_url);
 385           xfree (url);
 386           RESTORE_POST_DATA;
 387           return PROXERR;
 388         }
 389     }
 390
 391   if (u->scheme == SCHEME_HTTP
 392 #ifdef HAVE_SSL
 393       || u->scheme == SCHEME_HTTPS
 394 #endif
 395       || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
 396     {
 397       result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
 398     }
 399   else if (u->scheme == SCHEME_FTP)
 400     {
 401       /* If this is a redirection, we must not allow recursive FTP
 402          retrieval, so we save recursion to oldrec, and restore it
 403          later.  */
 404       int oldrec = opt.recursive;
 405       if (redirection_count)
 406         opt.recursive = 0;
 407       result = ftp_loop (u, dt, proxy_url);
 408       opt.recursive = oldrec;
 409
 410       /* There is a possibility of having HTTP being redirected to
 411          FTP.  In these cases we must decide whether the text is HTML
 412          according to the suffix.  The HTML suffixes are `.html',
 413          `.htm' and a few others, case-insensitive.  */
 414       if (redirection_count && local_file && u->scheme == SCHEME_FTP)
 415         {
 416           if (has_html_suffix_p (local_file))
 417             *dt |= TEXTHTML;
 418         }
 419     }
 420
 421   if (proxy_url)
 422     {
 423       url_free (proxy_url);
 424       proxy_url = NULL;
 425     }
 426
 427   location_changed = (result == NEWLOCATION);
 428   if (location_changed)
 429     {
 430       char *construced_newloc;
 431       struct url *newloc_parsed;
 432
 433       assert (mynewloc != NULL);
 434
 435       if (local_file)
 436         xfree (local_file);
 437
 438       /* The HTTP specs only allow absolute URLs to appear in
 439          redirects, but a ton of boneheaded webservers and CGIs out
 440          there break the rules and use relative URLs, and popular
 441          browsers are lenient about this, so wget should be too. */
 442       construced_newloc = uri_merge (url, mynewloc);
 443       xfree (mynewloc);
 444       mynewloc = construced_newloc;
 445
 446       /* Now, see if this new location makes sense. */
 447       newloc_parsed = url_parse (mynewloc, &up_error_code);
 448       if (!newloc_parsed)
 449         {
 450           logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
 451                      url_error (up_error_code));
 452           url_free (u);
 453           xfree (url);
 454           xfree (mynewloc);
 455           RESTORE_POST_DATA;
 456           return result;
 457         }
 458
 459       /* Now mynewloc will become newloc_parsed->url, because if the
 460          Location contained relative paths like .././something, we
 461          don't want that propagating as url.  */
 462       xfree (mynewloc);
 463       mynewloc = xstrdup (newloc_parsed->url);
 464
 465       /* Check for max. number of redirections.  */
 466       if (++redirection_count > MAX_REDIRECTIONS)
 467         {
 468           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
 469                      MAX_REDIRECTIONS);
 470           url_free (newloc_parsed);
 471           url_free (u);
 472           xfree (url);
 473           xfree (mynewloc);
 474           RESTORE_POST_DATA;
 475           return WRONGCODE;
 476         }
 477
 478       xfree (url);
 479       url = mynewloc;
 480       url_free (u);
 481       u = newloc_parsed;
 482
 483       /* If we're being redirected from POST, we don't want to POST
 484          again.  Many requests answer POST with a redirection to an
 485          index page; that redirection is clearly a GET.  We "suspend"
 486          POST data for the duration of the redirections, and restore
 487          it when we're done. */
 488       if (!post_data_suspended)
 489         SUSPEND_POST_DATA;
 490
 491       goto redirected;
 492     }
 493
 494   if (local_file)
 495     {
 496       if (*dt & RETROKF)
 497         {
 498           register_download (u->url, local_file);
 499           if (redirection_count && 0 != strcmp (origurl, u->url))
 500             register_redirection (origurl, u->url);
 501           if (*dt & TEXTHTML)
 502             register_html (u->url, local_file);
 503         }
 504     }
 505
 506   if (file)
 507     *file = local_file ? local_file : NULL;
 508   else
 509     FREE_MAYBE (local_file);
 510
 511   url_free (u);
 512
 513   if (redirection_count)
 514     {
 515       if (newloc)
 516         *newloc = url;
 517       else
 518         xfree (url);
 519     }
 520   else
 521     {
 522       if (newloc)
 523         *newloc = NULL;
 524       xfree (url);
 525     }
 526
 527   ++global_download_count;
 528   RESTORE_POST_DATA;
 529
 530   return result;
 531 }
 532
 533 /* Find the URLs in the file and call retrieve_url() for each of
 534    them.  If HTML is non-zero, treat the file as HTML, and construct
 535    the URLs accordingly.
 536
 537    If opt.recursive is set, call recursive_retrieve() for each file.  */
 538 uerr_t
 539 retrieve_from_file (const char *file, int html, int *count)
 540 {
 541   uerr_t status;
 542   struct urlpos *url_list, *cur_url;
 543
 544   url_list = (html ? get_urls_html (file, NULL, NULL)
 545               : get_urls_file (file));
 546   status = RETROK;             /* Suppose everything is OK.  */
 547   *count = 0;                  /* Reset the URL count.  */
 548
 549   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
 550     {
 551       char *filename = NULL, *new_file = NULL;
 552       int dt;
 553
 554       if (cur_url->ignore_when_downloading)
 555         continue;
 556
 557       if (downloaded_exceeds_quota ())
 558         {
 559           status = QUOTEXC;
 560           break;
 561         }
 562       if (opt.recursive && cur_url->url->scheme != SCHEME_FTP)
 563         status = retrieve_tree (cur_url->url->url);
 564       else
 565         status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
 566
 567       if (filename && opt.delete_after && file_exists_p (filename))
 568         {
 569           DEBUGP (("Removing file due to --delete-after in"
 570                    " retrieve_from_file():\n"));
 571           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
 572           if (unlink (filename))
 573             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 574           dt &= ~RETROKF;
 575         }
 576
 577       FREE_MAYBE (new_file);
 578       FREE_MAYBE (filename);
 579     }
 580
 581   /* Free the linked list of URL-s.  */
 582   free_urlpos (url_list);
 583
 584   return status;
 585 }
 586
 587 /* Print `giving up', or `retrying', depending on the impending
 588    action.  N1 and N2 are the attempt number and the attempt limit.  */
 589 void
 590 printwhat (int n1, int n2)
 591 {
 592   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
 593 }
 594
 595 /* Increment opt.downloaded by BY_HOW_MUCH.  If an overflow occurs,
 596    set opt.downloaded_overflow to 1. */
 597 void
 598 downloaded_increase (unsigned long by_how_much)
 599 {
 600   VERY_LONG_TYPE old;
 601   if (opt.downloaded_overflow)
 602     return;
 603   old = opt.downloaded;
 604   opt.downloaded += by_how_much;
 605   if (opt.downloaded < old)     /* carry flag, where are you when I
 606                                    need you? */
 607     {
 608       /* Overflow. */
 609       opt.downloaded_overflow = 1;
 610       opt.downloaded = ~((VERY_LONG_TYPE)0);
 611     }
 612 }
 613
 614 /* Return non-zero if the downloaded amount of bytes exceeds the
 615    desired quota.  If quota is not set or if the amount overflowed, 0
 616    is returned. */
 617 int
 618 downloaded_exceeds_quota (void)
 619 {
 620   if (!opt.quota)
 621     return 0;
 622   if (opt.downloaded_overflow)
 623     /* We don't really know.  (Wildly) assume not. */
 624     return 0;
 625
 626   return opt.downloaded > opt.quota;
 627 }
 628
 629 /* If opt.wait or opt.waitretry are specified, and if certain
 630    conditions are met, sleep the appropriate number of seconds.  See
 631    the documentation of --wait and --waitretry for more information.
 632
 633    COUNT is the count of current retrieval, beginning with 1. */
 634
 635 void
 636 sleep_between_retrievals (int count)
 637 {
 638   static int first_retrieval = 1;
 639
 640   if (first_retrieval)
 641     {
 642       /* Don't sleep before the very first retrieval. */
 643       first_retrieval = 0;
 644       return;
 645     }
 646
 647   if (opt.waitretry && count > 1)
 648     {
 649       /* If opt.waitretry is specified and this is a retry, wait for
 650          COUNT-1 number of seconds, or for opt.waitretry seconds.  */
 651       if (count <= opt.waitretry)
 652         sleep (count - 1);
 653       else
 654         sleep (opt.waitretry);
 655     }
 656   else if (opt.wait)
 657     {
 658       if (!opt.random_wait || count > 1)
 659         /* If random-wait is not specified, or if we are sleeping
 660            between retries of the same download, sleep the fixed
 661            interval.  */
 662         sleep (opt.wait);
 663       else
 664         {
 665           /* Sleep a random amount of time averaging in opt.wait
 666              seconds.  The sleeping amount ranges from 0 to
 667              opt.wait*2, inclusive.  */
 668           int waitsecs = random_number (opt.wait * 2 + 1);
 669
 670           DEBUGP (("sleep_between_retrievals: norm=%ld,fuzz=%ld,sleep=%d\n",
 671                    opt.wait, waitsecs - opt.wait, waitsecs));
 672
 673           if (waitsecs)
 674             sleep (waitsecs);
 675         }
 676     }
 677 }