sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <sys/types.h>
  25 #ifdef HAVE_UNISTD_H
  26 # include <unistd.h>
  27 #endif /* HAVE_UNISTD_H */
  28 #include <errno.h>
  29 #ifdef HAVE_STRING_H
  30 # include <string.h>
  31 #else
  32 # include <strings.h>
  33 #endif /* HAVE_STRING_H */
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "retr.h"
  39 #include "progress.h"
  40 #include "url.h"
  41 #include "recur.h"
  42 #include "ftp.h"
  43 #include "host.h"
  44 #include "connect.h"
  45 #include "hash.h"
  46
  47 #ifdef HAVE_SSL
  48 # include "gen_sslfunc.h"       /* for ssl_iread */
  49 #endif
  50
  51 #ifndef errno
  52 extern int errno;
  53 #endif
  54
  55 /* See the comment in gethttp() for why this is needed. */
  56 int global_download_count;
  57
  58 \f
  59 static struct {
  60   long bytes;
  61   long dltime;
  62 } limit_data;
  63
  64 static void
  65 limit_bandwidth_reset (void)
  66 {
  67   limit_data.bytes  = 0;
  68   limit_data.dltime = 0;
  69 }
  70
  71 /* Limit the bandwidth by pausing the download for an amount of time.
  72    BYTES is the number of bytes received from the network, DELTA is
  73    how long it took to receive them, DLTIME the current download time,
  74    TIMER the timer, and ADJUSTMENT the previous.  */
  75
  76 static void
  77 limit_bandwidth (long bytes, long delta)
  78 {
  79   long expected;
  80
  81   limit_data.bytes += bytes;
  82   limit_data.dltime += delta;
  83
  84   expected = (long)(1000.0 * limit_data.bytes / opt.limit_rate);
  85
  86   if (expected > limit_data.dltime)
  87     {
  88       long slp = expected - limit_data.dltime;
  89       if (slp < 200)
  90         {
  91           DEBUGP (("deferring a %ld ms sleep (%ld/%ld) until later.\n",
  92                    slp, limit_data.bytes, limit_data.dltime));
  93           return;
  94         }
  95       DEBUGP (("sleeping %ld ms\n", slp));
  96       usleep (1000 * slp);
  97     }
  98
  99   limit_data.bytes = 0;
 100   limit_data.dltime = 0;
 101 }
 102
 103 #define MIN(i, j) ((i) <= (j) ? (i) : (j))
 104
 105 /* Reads the contents of file descriptor FD, until it is closed, or a
 106    read error occurs.  The data is read in 8K chunks, and stored to
 107    stream fp, which should have been open for writing.  If BUF is
 108    non-NULL and its file descriptor is equal to FD, flush RBUF first.
 109    This function will *not* use the rbuf_* functions!
 110
 111    The EXPECTED argument is passed to show_progress() unchanged, but
 112    otherwise ignored.
 113
 114    If opt.verbose is set, the progress is also shown.  RESTVAL
 115    represents a value from which to start downloading (which will be
 116    shown accordingly).  If RESTVAL is non-zero, the stream should have
 117    been open for appending.
 118
 119    The function exits and returns codes of 0, -1 and -2 if the
 120    connection was closed, there was a read error, or if it could not
 121    write to the output stream, respectively.
 122
 123    IMPORTANT: The function flushes the contents of the buffer in
 124    rbuf_flush() before actually reading from fd.  If you wish to read
 125    from fd immediately, flush or discard the buffer.  */
 126 int
 127 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
 128               struct rbuf *rbuf, int use_expected, long *elapsed)
 129 {
 130   int res = 0;
 131   static char c[8192];
 132   void *progress = NULL;
 133   struct wget_timer *timer = wtimer_allocate ();
 134   long dltime = 0, last_dltime = 0;
 135
 136   *len = restval;
 137
 138   if (opt.verbose)
 139     progress = progress_create (restval, expected);
 140
 141   if (rbuf && RBUF_FD (rbuf) == fd)
 142     {
 143       int sz = 0;
 144       while ((res = rbuf_flush (rbuf, c, sizeof (c))) != 0)
 145         {
 146           fwrite (c, sizeof (char), res, fp);
 147           *len += res;
 148           sz += res;
 149         }
 150       if (sz)
 151         fflush (fp);
 152       if (ferror (fp))
 153         {
 154           res = -2;
 155           goto out;
 156         }
 157       if (progress)
 158         progress_update (progress, sz, 0);
 159     }
 160
 161   if (opt.limit_rate)
 162     limit_bandwidth_reset ();
 163   wtimer_reset (timer);
 164
 165   /* Read from fd while there is available data.
 166
 167      Normally, if expected is 0, it means that it is not known how
 168      much data is expected.  However, if use_expected is specified,
 169      then expected being zero means exactly that.  */
 170   while (!use_expected || (*len < expected))
 171     {
 172       int amount_to_read = (use_expected
 173                             ? MIN (expected - *len, sizeof (c))
 174                             : sizeof (c));
 175 #ifdef HAVE_SSL
 176       if (rbuf->ssl!=NULL)
 177         res = ssl_iread (rbuf->ssl, c, amount_to_read);
 178       else
 179 #endif /* HAVE_SSL */
 180         res = iread (fd, c, amount_to_read);
 181
 182       if (res > 0)
 183         {
 184           fwrite (c, sizeof (char), res, fp);
 185           /* Always flush the contents of the network packet.  This
 186              should not be adverse to performance, as the network
 187              packets typically won't be too tiny anyway.  */
 188           fflush (fp);
 189           if (ferror (fp))
 190             {
 191               res = -2;
 192               goto out;
 193             }
 194
 195           /* If bandwidth is not limited, one call to wtimer_elapsed
 196              is sufficient.  */
 197           dltime = wtimer_elapsed (timer);
 198           if (opt.limit_rate)
 199             {
 200               limit_bandwidth (res, dltime - last_dltime);
 201               dltime = wtimer_elapsed (timer);
 202               last_dltime = dltime;
 203             }
 204
 205           if (progress)
 206             progress_update (progress, res, dltime);
 207           *len += res;
 208         }
 209       else
 210         break;
 211     }
 212   if (res < -1)
 213     res = -1;
 214
 215  out:
 216   if (progress)
 217     progress_finish (progress, dltime);
 218   if (elapsed)
 219     *elapsed = dltime;
 220   wtimer_delete (timer);
 221
 222   return res;
 223 }
 224 \f
 225 /* Return a printed representation of the download rate, as
 226    appropriate for the speed.  If PAD is non-zero, strings will be
 227    padded to the width of 7 characters (xxxx.xx).  */
 228 char *
 229 retr_rate (long bytes, long msecs, int pad)
 230 {
 231   static char res[20];
 232   static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 233   int units = 0;
 234
 235   double dlrate = calc_rate (bytes, msecs, &units);
 236   sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
 237
 238   return res;
 239 }
 240
 241 /* Calculate the download rate and trim it as appropriate for the
 242    speed.  Appropriate means that if rate is greater than 1K/s,
 243    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 244    are used.
 245
 246    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 247    GB/s.  */
 248 double
 249 calc_rate (long bytes, long msecs, int *units)
 250 {
 251   double dlrate;
 252
 253   assert (msecs >= 0);
 254   assert (bytes >= 0);
 255
 256   if (msecs == 0)
 257     /* If elapsed time is 0, it means we're under the granularity of
 258        the timer.  This often happens on systems that use time() for
 259        the timer.  */
 260     msecs = wtimer_granularity ();
 261
 262   dlrate = (double)1000 * bytes / msecs;
 263   if (dlrate < 1024.0)
 264     *units = 0;
 265   else if (dlrate < 1024.0 * 1024.0)
 266     *units = 1, dlrate /= 1024.0;
 267   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 268     *units = 2, dlrate /= (1024.0 * 1024.0);
 269   else
 270     /* Maybe someone will need this one day.  More realistically, it
 271        will get tickled by buggy timers. */
 272     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
 273
 274   return dlrate;
 275 }
 276 \f
 277 /* Maximum number of allowed redirections.  20 was chosen as a
 278    "reasonable" value, which is low enough to not cause havoc, yet
 279    high enough to guarantee that normal retrievals will not be hurt by
 280    the check.  */
 281
 282 #define MAX_REDIRECTIONS 20
 283
 284 #define SUSPEND_POST_DATA do {                  \
 285   post_data_suspended = 1;                      \
 286   saved_post_data = opt.post_data;              \
 287   saved_post_file_name = opt.post_file_name;    \
 288   opt.post_data = NULL;                         \
 289   opt.post_file_name = NULL;                    \
 290 } while (0)
 291
 292 #define RESTORE_POST_DATA do {                          \
 293   if (post_data_suspended)                              \
 294     {                                                   \
 295       opt.post_data = saved_post_data;                  \
 296       opt.post_file_name = saved_post_file_name;        \
 297       post_data_suspended = 0;                          \
 298     }                                                   \
 299 } while (0)
 300
 301 /* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
 302    FTP, proxy, etc.  */
 303
 304 /* #### This function should be rewritten so it doesn't return from
 305    multiple points. */
 306
 307 uerr_t
 308 retrieve_url (const char *origurl, char **file, char **newloc,
 309               const char *refurl, int *dt)
 310 {
 311   uerr_t result;
 312   char *url;
 313   int location_changed, dummy;
 314   char *mynewloc, *proxy;
 315   struct url *u, *proxy_url;
 316   int up_error_code;            /* url parse error code */
 317   char *local_file;
 318   int redirection_count = 0;
 319
 320   int post_data_suspended = 0;
 321   char *saved_post_data;
 322   char *saved_post_file_name;
 323
 324   /* If dt is NULL, just ignore it.  */
 325   if (!dt)
 326     dt = &dummy;
 327   url = xstrdup (origurl);
 328   if (newloc)
 329     *newloc = NULL;
 330   if (file)
 331     *file = NULL;
 332
 333   u = url_parse (url, &up_error_code);
 334   if (!u)
 335     {
 336       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
 337       xfree (url);
 338       return URLERROR;
 339     }
 340
 341   if (!refurl)
 342     refurl = opt.referer;
 343
 344  redirected:
 345
 346   result = NOCONERROR;
 347   mynewloc = NULL;
 348   local_file = NULL;
 349   proxy_url = NULL;
 350
 351   proxy = getproxy (u);
 352   if (proxy)
 353     {
 354       /* Parse the proxy URL.  */
 355       proxy_url = url_parse (proxy, &up_error_code);
 356       if (!proxy_url)
 357         {
 358           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 359                      proxy, url_error (up_error_code));
 360           xfree (url);
 361           RESTORE_POST_DATA;
 362           return PROXERR;
 363         }
 364       if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
 365         {
 366           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 367           url_free (proxy_url);
 368           xfree (url);
 369           RESTORE_POST_DATA;
 370           return PROXERR;
 371         }
 372     }
 373
 374   if (u->scheme == SCHEME_HTTP
 375 #ifdef HAVE_SSL
 376       || u->scheme == SCHEME_HTTPS
 377 #endif
 378       || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
 379     {
 380       result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
 381     }
 382   else if (u->scheme == SCHEME_FTP)
 383     {
 384       /* If this is a redirection, we must not allow recursive FTP
 385          retrieval, so we save recursion to oldrec, and restore it
 386          later.  */
 387       int oldrec = opt.recursive;
 388       if (redirection_count)
 389         opt.recursive = 0;
 390       result = ftp_loop (u, dt, proxy_url);
 391       opt.recursive = oldrec;
 392
 393       /* There is a possibility of having HTTP being redirected to
 394          FTP.  In these cases we must decide whether the text is HTML
 395          according to the suffix.  The HTML suffixes are `.html',
 396          `.htm' and a few others, case-insensitive.  */
 397       if (redirection_count && local_file && u->scheme == SCHEME_FTP)
 398         {
 399           if (has_html_suffix_p (local_file))
 400             *dt |= TEXTHTML;
 401         }
 402     }
 403
 404   if (proxy_url)
 405     {
 406       url_free (proxy_url);
 407       proxy_url = NULL;
 408     }
 409
 410   location_changed = (result == NEWLOCATION);
 411   if (location_changed)
 412     {
 413       char *construced_newloc;
 414       struct url *newloc_parsed;
 415
 416       assert (mynewloc != NULL);
 417
 418       if (local_file)
 419         xfree (local_file);
 420
 421       /* The HTTP specs only allow absolute URLs to appear in
 422          redirects, but a ton of boneheaded webservers and CGIs out
 423          there break the rules and use relative URLs, and popular
 424          browsers are lenient about this, so wget should be too. */
 425       construced_newloc = uri_merge (url, mynewloc);
 426       xfree (mynewloc);
 427       mynewloc = construced_newloc;
 428
 429       /* Now, see if this new location makes sense. */
 430       newloc_parsed = url_parse (mynewloc, &up_error_code);
 431       if (!newloc_parsed)
 432         {
 433           logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
 434                      url_error (up_error_code));
 435           url_free (u);
 436           xfree (url);
 437           xfree (mynewloc);
 438           RESTORE_POST_DATA;
 439           return result;
 440         }
 441
 442       /* Now mynewloc will become newloc_parsed->url, because if the
 443          Location contained relative paths like .././something, we
 444          don't want that propagating as url.  */
 445       xfree (mynewloc);
 446       mynewloc = xstrdup (newloc_parsed->url);
 447
 448       /* Check for max. number of redirections.  */
 449       if (++redirection_count > MAX_REDIRECTIONS)
 450         {
 451           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
 452                      MAX_REDIRECTIONS);
 453           url_free (newloc_parsed);
 454           url_free (u);
 455           xfree (url);
 456           xfree (mynewloc);
 457           RESTORE_POST_DATA;
 458           return WRONGCODE;
 459         }
 460
 461       xfree (url);
 462       url = mynewloc;
 463       url_free (u);
 464       u = newloc_parsed;
 465
 466       /* If we're being redirected from POST, we don't want to POST
 467          again.  Many requests answer POST with a redirection to an
 468          index page; that redirection is clearly a GET.  We "suspend"
 469          POST data for the duration of the redirections, and restore
 470          it when we're done. */
 471       if (!post_data_suspended)
 472         SUSPEND_POST_DATA;
 473
 474       goto redirected;
 475     }
 476
 477   if (local_file)
 478     {
 479       if (*dt & RETROKF)
 480         {
 481           register_download (u->url, local_file);
 482           if (redirection_count && 0 != strcmp (origurl, u->url))
 483             register_redirection (origurl, u->url);
 484           if (*dt & TEXTHTML)
 485             register_html (u->url, local_file);
 486         }
 487     }
 488
 489   if (file)
 490     *file = local_file ? local_file : NULL;
 491   else
 492     FREE_MAYBE (local_file);
 493
 494   url_free (u);
 495
 496   if (redirection_count)
 497     {
 498       if (newloc)
 499         *newloc = url;
 500       else
 501         xfree (url);
 502     }
 503   else
 504     {
 505       if (newloc)
 506         *newloc = NULL;
 507       xfree (url);
 508     }
 509
 510   ++global_download_count;
 511   RESTORE_POST_DATA;
 512
 513   return result;
 514 }
 515
 516 /* Find the URLs in the file and call retrieve_url() for each of
 517    them.  If HTML is non-zero, treat the file as HTML, and construct
 518    the URLs accordingly.
 519
 520    If opt.recursive is set, call recursive_retrieve() for each file.  */
 521 uerr_t
 522 retrieve_from_file (const char *file, int html, int *count)
 523 {
 524   uerr_t status;
 525   struct urlpos *url_list, *cur_url;
 526
 527   url_list = (html ? get_urls_html (file, NULL, NULL)
 528               : get_urls_file (file));
 529   status = RETROK;             /* Suppose everything is OK.  */
 530   *count = 0;                  /* Reset the URL count.  */
 531
 532   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
 533     {
 534       char *filename = NULL, *new_file = NULL;
 535       int dt;
 536
 537       if (cur_url->ignore_when_downloading)
 538         continue;
 539
 540       if (downloaded_exceeds_quota ())
 541         {
 542           status = QUOTEXC;
 543           break;
 544         }
 545       if (opt.recursive && cur_url->url->scheme != SCHEME_FTP)
 546         status = retrieve_tree (cur_url->url->url);
 547       else
 548         status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
 549
 550       if (filename && opt.delete_after && file_exists_p (filename))
 551         {
 552           DEBUGP (("Removing file due to --delete-after in"
 553                    " retrieve_from_file():\n"));
 554           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
 555           if (unlink (filename))
 556             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 557           dt &= ~RETROKF;
 558         }
 559
 560       FREE_MAYBE (new_file);
 561       FREE_MAYBE (filename);
 562     }
 563
 564   /* Free the linked list of URL-s.  */
 565   free_urlpos (url_list);
 566
 567   return status;
 568 }
 569
 570 /* Print `giving up', or `retrying', depending on the impending
 571    action.  N1 and N2 are the attempt number and the attempt limit.  */
 572 void
 573 printwhat (int n1, int n2)
 574 {
 575   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
 576 }
 577
 578 /* Increment opt.downloaded by BY_HOW_MUCH.  If an overflow occurs,
 579    set opt.downloaded_overflow to 1. */
 580 void
 581 downloaded_increase (unsigned long by_how_much)
 582 {
 583   VERY_LONG_TYPE old;
 584   if (opt.downloaded_overflow)
 585     return;
 586   old = opt.downloaded;
 587   opt.downloaded += by_how_much;
 588   if (opt.downloaded < old)     /* carry flag, where are you when I
 589                                    need you? */
 590     {
 591       /* Overflow. */
 592       opt.downloaded_overflow = 1;
 593       opt.downloaded = ~((VERY_LONG_TYPE)0);
 594     }
 595 }
 596
 597 /* Return non-zero if the downloaded amount of bytes exceeds the
 598    desired quota.  If quota is not set or if the amount overflowed, 0
 599    is returned. */
 600 int
 601 downloaded_exceeds_quota (void)
 602 {
 603   if (!opt.quota)
 604     return 0;
 605   if (opt.downloaded_overflow)
 606     /* We don't really know.  (Wildly) assume not. */
 607     return 0;
 608
 609   return opt.downloaded > opt.quota;
 610 }
 611
 612 /* If opt.wait or opt.waitretry are specified, and if certain
 613    conditions are met, sleep the appropriate number of seconds.  See
 614    the documentation of --wait and --waitretry for more information.
 615
 616    COUNT is the count of current retrieval, beginning with 1. */
 617
 618 void
 619 sleep_between_retrievals (int count)
 620 {
 621   static int first_retrieval = 1;
 622
 623   if (first_retrieval)
 624     {
 625       /* Don't sleep before the very first retrieval. */
 626       first_retrieval = 0;
 627       return;
 628     }
 629
 630   if (opt.waitretry && count > 1)
 631     {
 632       /* If opt.waitretry is specified and this is a retry, wait for
 633          COUNT-1 number of seconds, or for opt.waitretry seconds.  */
 634       if (count <= opt.waitretry)
 635         sleep (count - 1);
 636       else
 637         sleep (opt.waitretry);
 638     }
 639   else if (opt.wait)
 640     {
 641       if (!opt.random_wait || count > 1)
 642         /* If random-wait is not specified, or if we are sleeping
 643            between retries of the same download, sleep the fixed
 644            interval.  */
 645         sleep (opt.wait);
 646       else
 647         {
 648           /* Sleep a random amount of time averaging in opt.wait
 649              seconds.  The sleeping amount ranges from 0 to
 650              opt.wait*2, inclusive.  */
 651           int waitsecs = random_number (opt.wait * 2 + 1);
 652
 653           DEBUGP (("sleep_between_retrievals: norm=%ld,fuzz=%ld,sleep=%d\n",
 654                    opt.wait, waitsecs - opt.wait, waitsecs));
 655
 656           if (waitsecs)
 657             sleep (waitsecs);
 658         }
 659     }
 660 }