sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <sys/types.h>
  25 #ifdef HAVE_UNISTD_H
  26 # include <unistd.h>
  27 #endif /* HAVE_UNISTD_H */
  28 #include <errno.h>
  29 #ifdef HAVE_STRING_H
  30 # include <string.h>
  31 #else
  32 # include <strings.h>
  33 #endif /* HAVE_STRING_H */
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "retr.h"
  39 #include "progress.h"
  40 #include "url.h"
  41 #include "recur.h"
  42 #include "ftp.h"
  43 #include "host.h"
  44 #include "connect.h"
  45 #include "hash.h"
  46
  47 #ifndef errno
  48 extern int errno;
  49 #endif
  50
/* Running count of retrievals.  In this file it is incremented once
   at the end of every retrieve_url() call that reaches its normal
   return (the early error returns do not touch it).  See the comment
   in gethttp() for why this is needed. */
int global_download_count;
  53
  54 \f
  55 static struct {
  56   long bytes;
  57   long dltime;
  58 } limit_data;
  59
  60 static void
  61 limit_bandwidth_reset (void)
  62 {
  63   limit_data.bytes  = 0;
  64   limit_data.dltime = 0;
  65 }
  66
  67 /* Limit the bandwidth by pausing the download for an amount of time.
  68    BYTES is the number of bytes received from the network, DELTA is
  69    how long it took to receive them, DLTIME the current download time,
  70    TIMER the timer, and ADJUSTMENT the previous.  */
  71
  72 static void
  73 limit_bandwidth (long bytes, long delta)
  74 {
  75   long expected;
  76
  77   limit_data.bytes += bytes;
  78   limit_data.dltime += delta;
  79
  80   expected = (long)(1000.0 * limit_data.bytes / opt.limit_rate);
  81
  82   if (expected > limit_data.dltime)
  83     {
  84       long slp = expected - limit_data.dltime;
  85       if (slp < 200)
  86         {
  87           DEBUGP (("deferring a %ld ms sleep (%ld/%ld) until later.\n",
  88                    slp, limit_data.bytes, limit_data.dltime));
  89           return;
  90         }
  91       DEBUGP (("sleeping %ld ms\n", slp));
  92       usleep (1000 * slp);
  93     }
  94
  95   limit_data.bytes = 0;
  96   limit_data.dltime = 0;
  97 }
  98
  99 #define MIN(i, j) ((i) <= (j) ? (i) : (j))
 100
 101 /* Reads the contents of file descriptor FD, until it is closed, or a
 102    read error occurs.  The data is read in 8K chunks, and stored to
 103    stream fp, which should have been open for writing.  If BUF is
 104    non-NULL and its file descriptor is equal to FD, flush RBUF first.
 105    This function will *not* use the rbuf_* functions!
 106
 107    The EXPECTED argument is passed to show_progress() unchanged, but
 108    otherwise ignored.
 109
 110    If opt.verbose is set, the progress is also shown.  RESTVAL
 111    represents a value from which to start downloading (which will be
 112    shown accordingly).  If RESTVAL is non-zero, the stream should have
 113    been open for appending.
 114
 115    The function exits and returns codes of 0, -1 and -2 if the
 116    connection was closed, there was a read error, or if it could not
 117    write to the output stream, respectively.
 118
 119    IMPORTANT: The function flushes the contents of the buffer in
 120    rbuf_flush() before actually reading from fd.  If you wish to read
 121    from fd immediately, flush or discard the buffer.  */
 122 int
 123 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
 124               struct rbuf *rbuf, int use_expected, long *elapsed)
 125 {
 126   int res = 0;
 127   static char c[8192];
 128   void *progress = NULL;
 129   struct wget_timer *timer = wtimer_allocate ();
 130   long dltime = 0, last_dltime = 0;
 131
 132   *len = restval;
 133
 134   if (opt.verbose)
 135     progress = progress_create (restval, expected);
 136
 137   if (rbuf && RBUF_FD (rbuf) == fd)
 138     {
 139       int sz = 0;
 140       while ((res = rbuf_flush (rbuf, c, sizeof (c))) != 0)
 141         {
 142           fwrite (c, sizeof (char), res, fp);
 143           *len += res;
 144           sz += res;
 145         }
 146       if (sz)
 147         fflush (fp);
 148       if (ferror (fp))
 149         {
 150           res = -2;
 151           goto out;
 152         }
 153       if (opt.verbose)
 154         progress_update (progress, sz, 0);
 155     }
 156
 157   if (opt.limit_rate)
 158     limit_bandwidth_reset ();
 159   wtimer_reset (timer);
 160
 161   /* Read from fd while there is available data.
 162
 163      Normally, if expected is 0, it means that it is not known how
 164      much data is expected.  However, if use_expected is specified,
 165      then expected being zero means exactly that.  */
 166   while (!use_expected || (*len < expected))
 167     {
 168       int amount_to_read = (use_expected
 169                             ? MIN (expected - *len, sizeof (c))
 170                             : sizeof (c));
 171 #ifdef HAVE_SSL
 172       if (rbuf->ssl!=NULL)
 173         res = ssl_iread (rbuf->ssl, c, amount_to_read);
 174       else
 175 #endif /* HAVE_SSL */
 176         res = iread (fd, c, amount_to_read);
 177
 178       if (res > 0)
 179         {
 180           fwrite (c, sizeof (char), res, fp);
 181           /* Always flush the contents of the network packet.  This
 182              should not be adverse to performance, as the network
 183              packets typically won't be too tiny anyway.  */
 184           fflush (fp);
 185           if (ferror (fp))
 186             {
 187               res = -2;
 188               goto out;
 189             }
 190
 191           /* If bandwidth is not limited, one call to wtimer_elapsed
 192              is sufficient.  */
 193           dltime = wtimer_elapsed (timer);
 194           if (opt.limit_rate)
 195             {
 196               limit_bandwidth (res, dltime - last_dltime);
 197               dltime = wtimer_elapsed (timer);
 198               last_dltime = dltime;
 199             }
 200
 201           if (opt.verbose)
 202             progress_update (progress, res, dltime);
 203           *len += res;
 204         }
 205       else
 206         break;
 207     }
 208   if (res < -1)
 209     res = -1;
 210
 211  out:
 212   if (opt.verbose)
 213     progress_finish (progress, dltime);
 214   if (elapsed)
 215     *elapsed = dltime;
 216   wtimer_delete (timer);
 217
 218   return res;
 219 }
 220 \f
 221 /* Return a printed representation of the download rate, as
 222    appropriate for the speed.  If PAD is non-zero, strings will be
 223    padded to the width of 7 characters (xxxx.xx).  */
 224 char *
 225 retr_rate (long bytes, long msecs, int pad)
 226 {
 227   static char res[20];
 228   static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 229   int units = 0;
 230
 231   double dlrate = calc_rate (bytes, msecs, &units);
 232   sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
 233
 234   return res;
 235 }
 236
 237 /* Calculate the download rate and trim it as appropriate for the
 238    speed.  Appropriate means that if rate is greater than 1K/s,
 239    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 240    are used.
 241
 242    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 243    GB/s.  */
 244 double
 245 calc_rate (long bytes, long msecs, int *units)
 246 {
 247   double dlrate;
 248
 249   assert (msecs >= 0);
 250   assert (bytes >= 0);
 251
 252   if (msecs == 0)
 253     /* If elapsed time is 0, it means we're under the granularity of
 254        the timer.  This often happens on systems that use time() for
 255        the timer.  */
 256     msecs = wtimer_granularity ();
 257
 258   dlrate = (double)1000 * bytes / msecs;
 259   if (dlrate < 1024.0)
 260     *units = 0;
 261   else if (dlrate < 1024.0 * 1024.0)
 262     *units = 1, dlrate /= 1024.0;
 263   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 264     *units = 2, dlrate /= (1024.0 * 1024.0);
 265   else
 266     /* Maybe someone will need this one day.  More realistically, it
 267        will get tickled by buggy timers. */
 268     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
 269
 270   return dlrate;
 271 }
 272 \f
 273 static int
 274 register_redirections_mapper (void *key, void *value, void *arg)
 275 {
 276   const char *redirected_from = (const char *)key;
 277   const char *redirected_to   = (const char *)arg;
 278   if (0 != strcmp (redirected_from, redirected_to))
 279     register_redirection (redirected_from, redirected_to);
 280   return 0;
 281 }
 282
 283 /* Register the redirections that lead to the successful download of
 284    this URL.  This is necessary so that the link converter can convert
 285    redirected URLs to the local file.  */
 286
 287 static void
 288 register_all_redirections (struct hash_table *redirections, const char *final)
 289 {
 290   hash_table_map (redirections, register_redirections_mapper, (void *)final);
 291 }
 292
 293 #define USE_PROXY_P(u) (opt.use_proxy && getproxy((u)->scheme)          \
 294                         && no_proxy_match((u)->host,                    \
 295                                           (const char **)opt.no_proxy))
 296
 297 /* Retrieve the given URL.  Decides which loop to call -- HTTP(S), FTP,
 298    or simply copy it with file:// (#### the latter not yet
 299    implemented!).  */
 300 uerr_t
 301 retrieve_url (const char *origurl, char **file, char **newloc,
 302               const char *refurl, int *dt)
 303 {
 304   uerr_t result;
 305   char *url;
 306   int location_changed, dummy;
 307   int use_proxy;
 308   char *mynewloc, *proxy;
 309   struct url *u;
 310   int up_error_code;            /* url parse error code */
 311   char *local_file;
 312   struct hash_table *redirections = NULL;
 313
 314   /* If dt is NULL, just ignore it.  */
 315   if (!dt)
 316     dt = &dummy;
 317   url = xstrdup (origurl);
 318   if (newloc)
 319     *newloc = NULL;
 320   if (file)
 321     *file = NULL;
 322
 323   u = url_parse (url, &up_error_code);
 324   if (!u)
 325     {
 326       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
 327       if (redirections)
 328         string_set_free (redirections);
 329       xfree (url);
 330       return URLERROR;
 331     }
 332
 333   if (!refurl)
 334     refurl = opt.referer;
 335
 336  redirected:
 337
 338   result = NOCONERROR;
 339   mynewloc = NULL;
 340   local_file = NULL;
 341
 342   use_proxy = USE_PROXY_P (u);
 343   if (use_proxy)
 344     {
 345       struct url *proxy_url;
 346
 347       /* Get the proxy server for the current scheme.  */
 348       proxy = getproxy (u->scheme);
 349       if (!proxy)
 350         {
 351           logputs (LOG_NOTQUIET, _("Could not find proxy host.\n"));
 352           url_free (u);
 353           if (redirections)
 354             string_set_free (redirections);
 355           xfree (url);
 356           return PROXERR;
 357         }
 358
 359       /* Parse the proxy URL.  */
 360       proxy_url = url_parse (proxy, &up_error_code);
 361       if (!proxy_url)
 362         {
 363           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 364                      proxy, url_error (up_error_code));
 365           if (redirections)
 366             string_set_free (redirections);
 367           xfree (url);
 368           return PROXERR;
 369         }
 370       if (proxy_url->scheme != SCHEME_HTTP)
 371         {
 372           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 373           url_free (proxy_url);
 374           if (redirections)
 375             string_set_free (redirections);
 376           xfree (url);
 377           return PROXERR;
 378         }
 379
 380       result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
 381       url_free (proxy_url);
 382     }
 383   else if (u->scheme == SCHEME_HTTP
 384 #ifdef HAVE_SSL
 385       || u->scheme == SCHEME_HTTPS
 386 #endif
 387       )
 388     {
 389       result = http_loop (u, &mynewloc, &local_file, refurl, dt, NULL);
 390     }
 391   else if (u->scheme == SCHEME_FTP)
 392     {
 393       /* If this is a redirection, we must not allow recursive FTP
 394          retrieval, so we save recursion to oldrec, and restore it
 395          later.  */
 396       int oldrec = opt.recursive;
 397       if (redirections)
 398         opt.recursive = 0;
 399       result = ftp_loop (u, dt);
 400       opt.recursive = oldrec;
 401 #if 0
 402       /* There is a possibility of having HTTP being redirected to
 403          FTP.  In these cases we must decide whether the text is HTML
 404          according to the suffix.  The HTML suffixes are `.html' and
 405          `.htm', case-insensitive.  */
 406       if (redirections && u->local && (u->scheme == SCHEME_FTP))
 407         {
 408           char *suf = suffix (u->local);
 409           if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
 410             *dt |= TEXTHTML;
 411         }
 412 #endif
 413     }
 414   location_changed = (result == NEWLOCATION);
 415   if (location_changed)
 416     {
 417       char *construced_newloc;
 418       struct url *newloc_parsed;
 419
 420       assert (mynewloc != NULL);
 421
 422       if (local_file)
 423         xfree (local_file);
 424
 425       /* The HTTP specs only allow absolute URLs to appear in
 426          redirects, but a ton of boneheaded webservers and CGIs out
 427          there break the rules and use relative URLs, and popular
 428          browsers are lenient about this, so wget should be too. */
 429       construced_newloc = uri_merge (url, mynewloc);
 430       xfree (mynewloc);
 431       mynewloc = construced_newloc;
 432
 433       /* Now, see if this new location makes sense. */
 434       newloc_parsed = url_parse (mynewloc, &up_error_code);
 435       if (!newloc_parsed)
 436         {
 437           logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
 438                      url_error (up_error_code));
 439           url_free (u);
 440           if (redirections)
 441             string_set_free (redirections);
 442           xfree (url);
 443           xfree (mynewloc);
 444           return result;
 445         }
 446
 447       /* Now mynewloc will become newloc_parsed->url, because if the
 448          Location contained relative paths like .././something, we
 449          don't want that propagating as url.  */
 450       xfree (mynewloc);
 451       mynewloc = xstrdup (newloc_parsed->url);
 452
 453       if (!redirections)
 454         {
 455           redirections = make_string_hash_table (0);
 456           /* Add current URL immediately so we can detect it as soon
 457              as possible in case of a cycle. */
 458           string_set_add (redirections, u->url);
 459         }
 460
 461       /* The new location is OK.  Check for redirection cycle by
 462          peeking through the history of redirections. */
 463       if (string_set_contains (redirections, newloc_parsed->url))
 464         {
 465           logprintf (LOG_NOTQUIET, _("%s: Redirection cycle detected.\n"),
 466                      mynewloc);
 467           url_free (newloc_parsed);
 468           url_free (u);
 469           if (redirections)
 470             string_set_free (redirections);
 471           xfree (url);
 472           xfree (mynewloc);
 473           return WRONGCODE;
 474         }
 475       string_set_add (redirections, newloc_parsed->url);
 476
 477       xfree (url);
 478       url = mynewloc;
 479       url_free (u);
 480       u = newloc_parsed;
 481       goto redirected;
 482     }
 483
 484   if (local_file)
 485     {
 486       if (*dt & RETROKF)
 487         {
 488           register_download (url, local_file);
 489           if (redirections)
 490             register_all_redirections (redirections, url);
 491           if (*dt & TEXTHTML)
 492             register_html (url, local_file);
 493         }
 494     }
 495
 496   if (file)
 497     *file = local_file ? local_file : NULL;
 498   else
 499     FREE_MAYBE (local_file);
 500
 501   url_free (u);
 502
 503   if (redirections)
 504     {
 505       string_set_free (redirections);
 506       if (newloc)
 507         *newloc = url;
 508       else
 509         xfree (url);
 510     }
 511   else
 512     {
 513       if (newloc)
 514         *newloc = NULL;
 515       xfree (url);
 516     }
 517
 518   ++global_download_count;
 519
 520   return result;
 521 }
 522
 523 /* Find the URLs in the file and call retrieve_url() for each of
 524    them.  If HTML is non-zero, treat the file as HTML, and construct
 525    the URLs accordingly.
 526
 527    If opt.recursive is set, call recursive_retrieve() for each file.  */
 528 uerr_t
 529 retrieve_from_file (const char *file, int html, int *count)
 530 {
 531   uerr_t status;
 532   struct urlpos *url_list, *cur_url;
 533
 534   url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
 535               : get_urls_file (file));
 536   status = RETROK;             /* Suppose everything is OK.  */
 537   *count = 0;                  /* Reset the URL count.  */
 538
 539   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
 540     {
 541       char *filename = NULL, *new_file;
 542       int dt;
 543
 544       if (cur_url->ignore_when_downloading)
 545         continue;
 546
 547       if (downloaded_exceeds_quota ())
 548         {
 549           status = QUOTEXC;
 550           break;
 551         }
 552       if (opt.recursive && cur_url->url->scheme != SCHEME_FTP)
 553         status = retrieve_tree (cur_url->url->url);
 554       else
 555         status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
 556
 557       if (filename && opt.delete_after && file_exists_p (filename))
 558         {
 559           DEBUGP (("Removing file due to --delete-after in"
 560                    " retrieve_from_file():\n"));
 561           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
 562           if (unlink (filename))
 563             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 564           dt &= ~RETROKF;
 565         }
 566
 567       FREE_MAYBE (new_file);
 568       FREE_MAYBE (filename);
 569     }
 570
 571   /* Free the linked list of URL-s.  */
 572   free_urlpos (url_list);
 573
 574   return status;
 575 }
 576
 577 /* Print `giving up', or `retrying', depending on the impending
 578    action.  N1 and N2 are the attempt number and the attempt limit.  */
 579 void
 580 printwhat (int n1, int n2)
 581 {
 582   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
 583 }
 584
 585 /* Increment opt.downloaded by BY_HOW_MUCH.  If an overflow occurs,
 586    set opt.downloaded_overflow to 1. */
 587 void
 588 downloaded_increase (unsigned long by_how_much)
 589 {
 590   VERY_LONG_TYPE old;
 591   if (opt.downloaded_overflow)
 592     return;
 593   old = opt.downloaded;
 594   opt.downloaded += by_how_much;
 595   if (opt.downloaded < old)     /* carry flag, where are you when I
 596                                    need you? */
 597     {
 598       /* Overflow. */
 599       opt.downloaded_overflow = 1;
 600       opt.downloaded = ~((VERY_LONG_TYPE)0);
 601     }
 602 }
 603
 604 /* Return non-zero if the downloaded amount of bytes exceeds the
 605    desired quota.  If quota is not set or if the amount overflowed, 0
 606    is returned. */
 607 int
 608 downloaded_exceeds_quota (void)
 609 {
 610   if (!opt.quota)
 611     return 0;
 612   if (opt.downloaded_overflow)
 613     /* We don't really know.  (Wildly) assume not. */
 614     return 0;
 615
 616   return opt.downloaded > opt.quota;
 617 }
 618
 619 /* If opt.wait or opt.waitretry are specified, and if certain
 620    conditions are met, sleep the appropriate number of seconds.  See
 621    the documentation of --wait and --waitretry for more information.
 622
 623    COUNT is the count of current retrieval, beginning with 1. */
 624
 625 void
 626 sleep_between_retrievals (int count)
 627 {
 628   static int first_retrieval = 1;
 629
 630   if (!first_retrieval && (opt.wait || opt.waitretry))
 631     {
 632       if (opt.waitretry && count > 1)
 633         {
 634           /* If opt.waitretry is specified and this is a retry, wait
 635              for COUNT-1 number of seconds, or for opt.waitretry
 636              seconds.  */
 637           if (count <= opt.waitretry)
 638             sleep (count - 1);
 639           else
 640             sleep (opt.waitretry);
 641         }
 642       else if (opt.wait)
 643         {
 644           /* Otherwise, check if opt.wait is specified.  If so, sleep.  */
 645           if (count > 1 || !opt.random_wait)
 646             sleep (opt.wait);
 647           else
 648             {
 649               int waitsecs = random() % (opt.wait * 2 + 1);
 650               DEBUGP(("sleep_between_retrievals: norm=%ld,random=%ld,sleep=%d\n",
 651                       opt.wait, waitsecs - opt.wait, waitsecs));
 652               sleep(waitsecs);
 653             }
 654         }
 655     }
 656   if (first_retrieval)
 657     first_retrieval = 0;
 658 }