sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <sys/types.h>
  25 #ifdef HAVE_UNISTD_H
  26 # include <unistd.h>
  27 #endif /* HAVE_UNISTD_H */
  28 #include <errno.h>
  29 #ifdef HAVE_STRING_H
  30 # include <string.h>
  31 #else
  32 # include <strings.h>
  33 #endif /* HAVE_STRING_H */
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "retr.h"
  39 #include "progress.h"
  40 #include "url.h"
  41 #include "recur.h"
  42 #include "ftp.h"
  43 #include "host.h"
  44 #include "connect.h"
  45 #include "hash.h"
  46
  47 #ifndef errno
  48 extern int errno;
  49 #endif
  50
/* Number of times retrieve_url() has run through to its normal exit
   (it is incremented there, just before returning).  See the comment
   in gethttp() for why this is needed.  */
int global_download_count;
  53
  54 \f
  55 #define MIN(i, j) ((i) <= (j) ? (i) : (j))
  56
  57 /* Reads the contents of file descriptor FD, until it is closed, or a
  58    read error occurs.  The data is read in 8K chunks, and stored to
  59    stream fp, which should have been open for writing.  If BUF is
  60    non-NULL and its file descriptor is equal to FD, flush RBUF first.
  61    This function will *not* use the rbuf_* functions!
  62
  63    The EXPECTED argument is passed to show_progress() unchanged, but
  64    otherwise ignored.
  65
  66    If opt.verbose is set, the progress is also shown.  RESTVAL
  67    represents a value from which to start downloading (which will be
  68    shown accordingly).  If RESTVAL is non-zero, the stream should have
  69    been open for appending.
  70
  71    The function exits and returns codes of 0, -1 and -2 if the
  72    connection was closed, there was a read error, or if it could not
  73    write to the output stream, respectively.
  74
  75    IMPORTANT: The function flushes the contents of the buffer in
  76    rbuf_flush() before actually reading from fd.  If you wish to read
  77    from fd immediately, flush or discard the buffer.  */
  78 int
  79 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
  80               struct rbuf *rbuf, int use_expected, long *elapsed)
  81 {
  82   int res = 0;
  83   static char c[8192];
  84   void *progress = NULL;
  85   struct wget_timer *timer = NULL;
  86
  87   *len = restval;
  88
  89   if (opt.verbose)
  90     progress = progress_create (restval, expected);
  91   if (opt.verbose || elapsed != NULL)
  92     timer = wtimer_new ();
  93
  94   if (rbuf && RBUF_FD (rbuf) == fd)
  95     {
  96       int sz = 0;
  97       while ((res = rbuf_flush (rbuf, c, sizeof (c))) != 0)
  98         {
  99           fwrite (c, sizeof (char), res, fp);
 100           *len += res;
 101           sz += res;
 102         }
 103       if (sz)
 104         fflush (fp);
 105       if (ferror (fp))
 106         {
 107           res = -2;
 108           goto out;
 109         }
 110       if (opt.verbose)
 111         progress_update (progress, sz, wtimer_elapsed (timer));
 112     }
 113   /* Read from fd while there is available data.
 114
 115      Normally, if expected is 0, it means that it is not known how
 116      much data is expected.  However, if use_expected is specified,
 117      then expected being zero means exactly that.  */
 118   while (!use_expected || (*len < expected))
 119     {
 120       int amount_to_read = (use_expected
 121                             ? MIN (expected - *len, sizeof (c))
 122                             : sizeof (c));
 123 #ifdef HAVE_SSL
 124                 if (rbuf->ssl!=NULL) {
 125                   res = ssl_iread (rbuf->ssl, c, amount_to_read);
 126                 } else {
 127 #endif /* HAVE_SSL */
 128                   res = iread (fd, c, amount_to_read);
 129 #ifdef HAVE_SSL
 130                 }
 131 #endif /* HAVE_SSL */
 132       if (res > 0)
 133         {
 134           fwrite (c, sizeof (char), res, fp);
 135           /* Always flush the contents of the network packet.  This
 136              should not be adverse to performance, as the network
 137              packets typically won't be too tiny anyway.  */
 138           fflush (fp);
 139           if (ferror (fp))
 140             {
 141               res = -2;
 142               goto out;
 143             }
 144           if (opt.verbose)
 145             progress_update (progress, res, wtimer_elapsed (timer));
 146           *len += res;
 147         }
 148       else
 149         break;
 150     }
 151   if (res < -1)
 152     res = -1;
 153
 154  out:
 155   if (timer)
 156     {
 157       long dltime = wtimer_elapsed (timer);
 158       if (opt.verbose)
 159         progress_finish (progress, dltime);
 160       if (elapsed)
 161         *elapsed = dltime;
 162       wtimer_delete (timer);
 163     }
 164   return res;
 165 }
 166 \f
 167 /* Return a printed representation of the download rate, as
 168    appropriate for the speed.  If PAD is non-zero, strings will be
 169    padded to the width of 7 characters (xxxx.xx).  */
 170 char *
 171 retr_rate (long bytes, long msecs, int pad)
 172 {
 173   static char res[20];
 174   static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 175   int units = 0;
 176
 177   double dlrate = calc_rate (bytes, msecs, &units);
 178   sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
 179
 180   return res;
 181 }
 182
 183 /* Calculate the download rate and trim it as appropriate for the
 184    speed.  Appropriate means that if rate is greater than 1K/s,
 185    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 186    are used.
 187
 188    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 189    GB/s.  */
 190 double
 191 calc_rate (long bytes, long msecs, int *units)
 192 {
 193   double dlrate;
 194
 195   assert (msecs >= 0);
 196   assert (bytes >= 0);
 197
 198   if (msecs == 0)
 199     /* If elapsed time is 0, it means we're under the granularity of
 200        the timer.  This often happens on systems that use time() for
 201        the timer.  */
 202     msecs = wtimer_granularity ();
 203
 204   dlrate = (double)1000 * bytes / msecs;
 205   if (dlrate < 1024.0)
 206     *units = 0;
 207   else if (dlrate < 1024.0 * 1024.0)
 208     *units = 1, dlrate /= 1024.0;
 209   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 210     *units = 2, dlrate /= (1024.0 * 1024.0);
 211   else
 212     /* Maybe someone will need this one day.  More realistically, it
 213        will get tickled by buggy timers. */
 214     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
 215
 216   return dlrate;
 217 }
 218 \f
 219 static int
 220 register_redirections_mapper (void *key, void *value, void *arg)
 221 {
 222   const char *redirected_from = (const char *)key;
 223   const char *redirected_to   = (const char *)arg;
 224   if (0 != strcmp (redirected_from, redirected_to))
 225     register_redirection (redirected_from, redirected_to);
 226   return 0;
 227 }
 228
 229 /* Register the redirections that lead to the successful download of
 230    this URL.  This is necessary so that the link converter can convert
 231    redirected URLs to the local file.  */
 232
 233 static void
 234 register_all_redirections (struct hash_table *redirections, const char *final)
 235 {
 236   hash_table_map (redirections, register_redirections_mapper, (void *)final);
 237 }
 238
 239 #define USE_PROXY_P(u) (opt.use_proxy && getproxy((u)->scheme)          \
 240                         && no_proxy_match((u)->host,                    \
 241                                           (const char **)opt.no_proxy))
 242
 243 /* Retrieve the given URL.  Decides which loop to call -- HTTP(S), FTP,
 244    or simply copy it with file:// (#### the latter not yet
 245    implemented!).  */
 246 uerr_t
 247 retrieve_url (const char *origurl, char **file, char **newloc,
 248               const char *refurl, int *dt)
 249 {
 250   uerr_t result;
 251   char *url;
 252   int location_changed, dummy;
 253   int use_proxy;
 254   char *mynewloc, *proxy;
 255   struct url *u;
 256   int up_error_code;            /* url parse error code */
 257   char *local_file;
 258   struct hash_table *redirections = NULL;
 259
 260   /* If dt is NULL, just ignore it.  */
 261   if (!dt)
 262     dt = &dummy;
 263   url = xstrdup (origurl);
 264   if (newloc)
 265     *newloc = NULL;
 266   if (file)
 267     *file = NULL;
 268
 269   u = url_parse (url, &up_error_code);
 270   if (!u)
 271     {
 272       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
 273       if (redirections)
 274         string_set_free (redirections);
 275       xfree (url);
 276       return URLERROR;
 277     }
 278
 279   if (!refurl)
 280     refurl = opt.referer;
 281
 282  redirected:
 283
 284   result = NOCONERROR;
 285   mynewloc = NULL;
 286   local_file = NULL;
 287
 288   use_proxy = USE_PROXY_P (u);
 289   if (use_proxy)
 290     {
 291       struct url *proxy_url;
 292
 293       /* Get the proxy server for the current scheme.  */
 294       proxy = getproxy (u->scheme);
 295       if (!proxy)
 296         {
 297           logputs (LOG_NOTQUIET, _("Could not find proxy host.\n"));
 298           url_free (u);
 299           if (redirections)
 300             string_set_free (redirections);
 301           xfree (url);
 302           return PROXERR;
 303         }
 304
 305       /* Parse the proxy URL.  */
 306       proxy_url = url_parse (proxy, &up_error_code);
 307       if (!proxy_url)
 308         {
 309           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 310                      proxy, url_error (up_error_code));
 311           if (redirections)
 312             string_set_free (redirections);
 313           xfree (url);
 314           return PROXERR;
 315         }
 316       if (proxy_url->scheme != SCHEME_HTTP)
 317         {
 318           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 319           url_free (proxy_url);
 320           if (redirections)
 321             string_set_free (redirections);
 322           xfree (url);
 323           return PROXERR;
 324         }
 325
 326       result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
 327       url_free (proxy_url);
 328     }
 329   else if (u->scheme == SCHEME_HTTP
 330 #ifdef HAVE_SSL
 331       || u->scheme == SCHEME_HTTPS
 332 #endif
 333       )
 334     {
 335       result = http_loop (u, &mynewloc, &local_file, refurl, dt, NULL);
 336     }
 337   else if (u->scheme == SCHEME_FTP)
 338     {
 339       /* If this is a redirection, we must not allow recursive FTP
 340          retrieval, so we save recursion to oldrec, and restore it
 341          later.  */
 342       int oldrec = opt.recursive;
 343       if (redirections)
 344         opt.recursive = 0;
 345       result = ftp_loop (u, dt);
 346       opt.recursive = oldrec;
 347 #if 0
 348       /* There is a possibility of having HTTP being redirected to
 349          FTP.  In these cases we must decide whether the text is HTML
 350          according to the suffix.  The HTML suffixes are `.html' and
 351          `.htm', case-insensitive.  */
 352       if (redirections && u->local && (u->scheme == SCHEME_FTP))
 353         {
 354           char *suf = suffix (u->local);
 355           if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
 356             *dt |= TEXTHTML;
 357         }
 358 #endif
 359     }
 360   location_changed = (result == NEWLOCATION);
 361   if (location_changed)
 362     {
 363       char *construced_newloc;
 364       struct url *newloc_parsed;
 365
 366       assert (mynewloc != NULL);
 367
 368       if (local_file)
 369         xfree (local_file);
 370
 371       /* The HTTP specs only allow absolute URLs to appear in
 372          redirects, but a ton of boneheaded webservers and CGIs out
 373          there break the rules and use relative URLs, and popular
 374          browsers are lenient about this, so wget should be too. */
 375       construced_newloc = uri_merge (url, mynewloc);
 376       xfree (mynewloc);
 377       mynewloc = construced_newloc;
 378
 379       /* Now, see if this new location makes sense. */
 380       newloc_parsed = url_parse (mynewloc, &up_error_code);
 381       if (!newloc_parsed)
 382         {
 383           logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
 384                      url_error (up_error_code));
 385           url_free (u);
 386           if (redirections)
 387             string_set_free (redirections);
 388           xfree (url);
 389           xfree (mynewloc);
 390           return result;
 391         }
 392
 393       /* Now mynewloc will become newloc_parsed->url, because if the
 394          Location contained relative paths like .././something, we
 395          don't want that propagating as url.  */
 396       xfree (mynewloc);
 397       mynewloc = xstrdup (newloc_parsed->url);
 398
 399       if (!redirections)
 400         {
 401           redirections = make_string_hash_table (0);
 402           /* Add current URL immediately so we can detect it as soon
 403              as possible in case of a cycle. */
 404           string_set_add (redirections, u->url);
 405         }
 406
 407       /* The new location is OK.  Check for redirection cycle by
 408          peeking through the history of redirections. */
 409       if (string_set_contains (redirections, newloc_parsed->url))
 410         {
 411           logprintf (LOG_NOTQUIET, _("%s: Redirection cycle detected.\n"),
 412                      mynewloc);
 413           url_free (newloc_parsed);
 414           url_free (u);
 415           if (redirections)
 416             string_set_free (redirections);
 417           xfree (url);
 418           xfree (mynewloc);
 419           return WRONGCODE;
 420         }
 421       string_set_add (redirections, newloc_parsed->url);
 422
 423       xfree (url);
 424       url = mynewloc;
 425       url_free (u);
 426       u = newloc_parsed;
 427       goto redirected;
 428     }
 429
 430   if (local_file)
 431     {
 432       if (*dt & RETROKF)
 433         {
 434           register_download (url, local_file);
 435           if (redirections)
 436             register_all_redirections (redirections, url);
 437           if (*dt & TEXTHTML)
 438             register_html (url, local_file);
 439         }
 440     }
 441
 442   if (file)
 443     *file = local_file ? local_file : NULL;
 444   else
 445     FREE_MAYBE (local_file);
 446
 447   url_free (u);
 448
 449   if (redirections)
 450     {
 451       string_set_free (redirections);
 452       if (newloc)
 453         *newloc = url;
 454       else
 455         xfree (url);
 456     }
 457   else
 458     {
 459       if (newloc)
 460         *newloc = NULL;
 461       xfree (url);
 462     }
 463
 464   ++global_download_count;
 465
 466   return result;
 467 }
 468
 469 /* Find the URLs in the file and call retrieve_url() for each of
 470    them.  If HTML is non-zero, treat the file as HTML, and construct
 471    the URLs accordingly.
 472
 473    If opt.recursive is set, call recursive_retrieve() for each file.  */
 474 uerr_t
 475 retrieve_from_file (const char *file, int html, int *count)
 476 {
 477   uerr_t status;
 478   struct urlpos *url_list, *cur_url;
 479
 480   url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
 481               : get_urls_file (file));
 482   status = RETROK;             /* Suppose everything is OK.  */
 483   *count = 0;                  /* Reset the URL count.  */
 484
 485   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
 486     {
 487       char *filename = NULL, *new_file;
 488       int dt;
 489
 490       if (cur_url->ignore_when_downloading)
 491         continue;
 492
 493       if (downloaded_exceeds_quota ())
 494         {
 495           status = QUOTEXC;
 496           break;
 497         }
 498       if (opt.recursive && cur_url->url->scheme != SCHEME_FTP)
 499         status = retrieve_tree (cur_url->url->url);
 500       else
 501         status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
 502
 503       if (filename && opt.delete_after && file_exists_p (filename))
 504         {
 505           DEBUGP (("Removing file due to --delete-after in"
 506                    " retrieve_from_file():\n"));
 507           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
 508           if (unlink (filename))
 509             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 510           dt &= ~RETROKF;
 511         }
 512
 513       FREE_MAYBE (new_file);
 514       FREE_MAYBE (filename);
 515     }
 516
 517   /* Free the linked list of URL-s.  */
 518   free_urlpos (url_list);
 519
 520   return status;
 521 }
 522
 523 /* Print `giving up', or `retrying', depending on the impending
 524    action.  N1 and N2 are the attempt number and the attempt limit.  */
 525 void
 526 printwhat (int n1, int n2)
 527 {
 528   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
 529 }
 530
 531 /* Increment opt.downloaded by BY_HOW_MUCH.  If an overflow occurs,
 532    set opt.downloaded_overflow to 1. */
 533 void
 534 downloaded_increase (unsigned long by_how_much)
 535 {
 536   VERY_LONG_TYPE old;
 537   if (opt.downloaded_overflow)
 538     return;
 539   old = opt.downloaded;
 540   opt.downloaded += by_how_much;
 541   if (opt.downloaded < old)     /* carry flag, where are you when I
 542                                    need you? */
 543     {
 544       /* Overflow. */
 545       opt.downloaded_overflow = 1;
 546       opt.downloaded = ~((VERY_LONG_TYPE)0);
 547     }
 548 }
 549
 550 /* Return non-zero if the downloaded amount of bytes exceeds the
 551    desired quota.  If quota is not set or if the amount overflowed, 0
 552    is returned. */
 553 int
 554 downloaded_exceeds_quota (void)
 555 {
 556   if (!opt.quota)
 557     return 0;
 558   if (opt.downloaded_overflow)
 559     /* We don't really know.  (Wildly) assume not. */
 560     return 0;
 561
 562   return opt.downloaded > opt.quota;
 563 }
 564
 565 /* If opt.wait or opt.waitretry are specified, and if certain
 566    conditions are met, sleep the appropriate number of seconds.  See
 567    the documentation of --wait and --waitretry for more information.
 568
 569    COUNT is the count of current retrieval, beginning with 1. */
 570
 571 void
 572 sleep_between_retrievals (int count)
 573 {
 574   static int first_retrieval = 1;
 575
 576   if (!first_retrieval && (opt.wait || opt.waitretry))
 577     {
 578       if (opt.waitretry && count > 1)
 579         {
 580           /* If opt.waitretry is specified and this is a retry, wait
 581              for COUNT-1 number of seconds, or for opt.waitretry
 582              seconds.  */
 583           if (count <= opt.waitretry)
 584             sleep (count - 1);
 585           else
 586             sleep (opt.waitretry);
 587         }
 588       else if (opt.wait)
 589         {
 590           /* Otherwise, check if opt.wait is specified.  If so, sleep.  */
 591           if (count > 1 || !opt.random_wait)
 592             sleep (opt.wait);
 593           else
 594             {
 595               int waitsecs = random() % (opt.wait * 2 + 1);
 596               DEBUGP(("sleep_between_retrievals: norm=%ld,random=%ld,sleep=%d\n",
 597                       opt.wait, waitsecs - opt.wait, waitsecs));
 598               sleep(waitsecs);
 599             }
 600         }
 601     }
 602   if (first_retrieval)
 603     first_retrieval = 0;
 604 }