sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <sys/types.h>
  25 #ifdef HAVE_UNISTD_H
  26 # include <unistd.h>
  27 #endif /* HAVE_UNISTD_H */
  28 #include <errno.h>
  29 #ifdef HAVE_STRING_H
  30 # include <string.h>
  31 #else
  32 # include <strings.h>
  33 #endif /* HAVE_STRING_H */
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "retr.h"
  39 #include "progress.h"
  40 #include "url.h"
  41 #include "recur.h"
  42 #include "ftp.h"
  43 #include "host.h"
  44 #include "connect.h"
  45 #include "hash.h"
  46
  47 #ifndef errno
  48 extern int errno;
  49 #endif
  50
  51 /* See the comment in gethttp() why this is needed. */
  52 int global_download_count;
  53
  54 \f
  55 #define MIN(i, j) ((i) <= (j) ? (i) : (j))
  56
  57 /* Reads the contents of file descriptor FD, until it is closed, or a
  58    read error occurs.  The data is read in 8K chunks, and stored to
  59    stream fp, which should have been open for writing.  If BUF is
  60    non-NULL and its file descriptor is equal to FD, flush RBUF first.
  61    This function will *not* use the rbuf_* functions!
  62
  63    The EXPECTED argument is passed to show_progress() unchanged, but
  64    otherwise ignored.
  65
  66    If opt.verbose is set, the progress is also shown.  RESTVAL
  67    represents a value from which to start downloading (which will be
  68    shown accordingly).  If RESTVAL is non-zero, the stream should have
  69    been open for appending.
  70
  71    The function exits and returns codes of 0, -1 and -2 if the
  72    connection was closed, there was a read error, or if it could not
  73    write to the output stream, respectively.
  74
  75    IMPORTANT: The function flushes the contents of the buffer in
  76    rbuf_flush() before actually reading from fd.  If you wish to read
  77    from fd immediately, flush or discard the buffer.  */
  78 int
  79 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
  80               struct rbuf *rbuf, int use_expected)
  81 {
  82   int res = 0;
  83   static char c[8192];
  84   void *progress = NULL;
  85
  86   *len = restval;
  87   if (opt.verbose)
  88     progress = progress_create (restval, expected);
  89
  90   if (rbuf && RBUF_FD (rbuf) == fd)
  91     {
  92       int need_flush = 0;
  93       while ((res = rbuf_flush (rbuf, c, sizeof (c))) != 0)
  94         {
  95           if (fwrite (c, sizeof (char), res, fp) < res)
  96             return -2;
  97           if (opt.verbose)
  98             progress_update (progress, res);
  99           *len += res;
 100           need_flush = 1;
 101         }
 102       if (need_flush)
 103         fflush (fp);
 104       if (ferror (fp))
 105         return -2;
 106     }
 107   /* Read from fd while there is available data.
 108
 109      Normally, if expected is 0, it means that it is not known how
 110      much data is expected.  However, if use_expected is specified,
 111      then expected being zero means exactly that.  */
 112   while (!use_expected || (*len < expected))
 113     {
 114       int amount_to_read = (use_expected
 115                             ? MIN (expected - *len, sizeof (c))
 116                             : sizeof (c));
 117 #ifdef HAVE_SSL
 118                 if (rbuf->ssl!=NULL) {
 119                   res = ssl_iread (rbuf->ssl, c, amount_to_read);
 120                 } else {
 121 #endif /* HAVE_SSL */
 122                   res = iread (fd, c, amount_to_read);
 123 #ifdef HAVE_SSL
 124                 }
 125 #endif /* HAVE_SSL */
 126       if (res > 0)
 127         {
 128           fwrite (c, sizeof (char), res, fp);
 129           /* Always flush the contents of the network packet.  This
 130              should not be adverse to performance, as the network
 131              packets typically won't be too tiny anyway.  */
 132           fflush (fp);
 133           if (ferror (fp))
 134             return -2;
 135           if (opt.verbose)
 136             progress_update (progress, res);
 137           *len += res;
 138         }
 139       else
 140         break;
 141     }
 142   if (res < -1)
 143     res = -1;
 144   if (opt.verbose)
 145     progress_finish (progress);
 146   return res;
 147 }
 148 \f
 149 /* Return a printed representation of the download rate, as
 150    appropriate for the speed.  If PAD is non-zero, strings will be
 151    padded to the width of 7 characters (xxxx.xx).  */
 152 char *
 153 retr_rate (long bytes, long msecs, int pad)
 154 {
 155   static char res[20];
 156   static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 157   int units = 0;
 158
 159   double dlrate = calc_rate (bytes, msecs, &units);
 160   sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
 161
 162   return res;
 163 }
 164
 165 /* Calculate the download rate and trim it as appropriate for the
 166    speed.  Appropriate means that if rate is greater than 1K/s,
 167    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 168    are used.
 169
 170    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 171    GB/s.  */
 172 double
 173 calc_rate (long bytes, long msecs, int *units)
 174 {
 175   double dlrate;
 176
 177   assert (msecs >= 0);
 178   assert (bytes >= 0);
 179
 180   if (msecs == 0)
 181     /* If elapsed time is 0, it means we're under the granularity of
 182        the timer.  This often happens on systems that use time() for
 183        the timer.  */
 184     msecs = wtimer_granularity ();
 185
 186   dlrate = (double)1000 * bytes / msecs;
 187   if (dlrate < 1024.0)
 188     *units = 0;
 189   else if (dlrate < 1024.0 * 1024.0)
 190     *units = 1, dlrate /= 1024.0;
 191   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 192     *units = 2, dlrate /= (1024.0 * 1024.0);
 193   else
 194     /* Maybe someone will need this one day.  More realistically, it
 195        will get tickled by buggy timers. */
 196     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
 197
 198   return dlrate;
 199 }
 200 \f
 201 static int
 202 register_redirections_mapper (void *key, void *value, void *arg)
 203 {
 204   const char *redirected_from = (const char *)key;
 205   const char *redirected_to   = (const char *)arg;
 206   if (0 != strcmp (redirected_from, redirected_to))
 207     register_redirection (redirected_from, redirected_to);
 208   return 0;
 209 }
 210
 211 /* Register the redirections that lead to the successful download of
 212    this URL.  This is necessary so that the link converter can convert
 213    redirected URLs to the local file.  */
 214
 215 static void
 216 register_all_redirections (struct hash_table *redirections, const char *final)
 217 {
 218   hash_table_map (redirections, register_redirections_mapper, (void *)final);
 219 }
 220
 221 #define USE_PROXY_P(u) (opt.use_proxy && getproxy((u)->scheme)          \
 222                         && no_proxy_match((u)->host,                    \
 223                                           (const char **)opt.no_proxy))
 224
 225 /* Retrieve the given URL.  Decides which loop to call -- HTTP(S), FTP,
 226    or simply copy it with file:// (#### the latter not yet
 227    implemented!).  */
 228 uerr_t
 229 retrieve_url (const char *origurl, char **file, char **newloc,
 230               const char *refurl, int *dt)
 231 {
 232   uerr_t result;
 233   char *url;
 234   int location_changed, dummy;
 235   int use_proxy;
 236   char *mynewloc, *proxy;
 237   struct url *u;
 238   int up_error_code;            /* url parse error code */
 239   char *local_file;
 240   struct hash_table *redirections = NULL;
 241
 242   /* If dt is NULL, just ignore it.  */
 243   if (!dt)
 244     dt = &dummy;
 245   url = xstrdup (origurl);
 246   if (newloc)
 247     *newloc = NULL;
 248   if (file)
 249     *file = NULL;
 250
 251   u = url_parse (url, &up_error_code);
 252   if (!u)
 253     {
 254       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
 255       if (redirections)
 256         string_set_free (redirections);
 257       xfree (url);
 258       return URLERROR;
 259     }
 260
 261   if (!refurl)
 262     refurl = opt.referer;
 263
 264  redirected:
 265
 266   result = NOCONERROR;
 267   mynewloc = NULL;
 268   local_file = NULL;
 269
 270   use_proxy = USE_PROXY_P (u);
 271   if (use_proxy)
 272     {
 273       struct url *proxy_url;
 274
 275       /* Get the proxy server for the current scheme.  */
 276       proxy = getproxy (u->scheme);
 277       if (!proxy)
 278         {
 279           logputs (LOG_NOTQUIET, _("Could not find proxy host.\n"));
 280           url_free (u);
 281           if (redirections)
 282             string_set_free (redirections);
 283           xfree (url);
 284           return PROXERR;
 285         }
 286
 287       /* Parse the proxy URL.  */
 288       proxy_url = url_parse (proxy, &up_error_code);
 289       if (!proxy_url)
 290         {
 291           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 292                      proxy, url_error (up_error_code));
 293           if (redirections)
 294             string_set_free (redirections);
 295           xfree (url);
 296           return PROXERR;
 297         }
 298       if (proxy_url->scheme != SCHEME_HTTP)
 299         {
 300           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 301           url_free (proxy_url);
 302           if (redirections)
 303             string_set_free (redirections);
 304           xfree (url);
 305           return PROXERR;
 306         }
 307
 308       result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
 309       url_free (proxy_url);
 310     }
 311   else if (u->scheme == SCHEME_HTTP
 312 #ifdef HAVE_SSL
 313       || u->scheme == SCHEME_HTTPS
 314 #endif
 315       )
 316     {
 317       result = http_loop (u, &mynewloc, &local_file, refurl, dt, NULL);
 318     }
 319   else if (u->scheme == SCHEME_FTP)
 320     {
 321       /* If this is a redirection, we must not allow recursive FTP
 322          retrieval, so we save recursion to oldrec, and restore it
 323          later.  */
 324       int oldrec = opt.recursive;
 325       if (redirections)
 326         opt.recursive = 0;
 327       result = ftp_loop (u, dt);
 328       opt.recursive = oldrec;
 329 #if 0
 330       /* There is a possibility of having HTTP being redirected to
 331          FTP.  In these cases we must decide whether the text is HTML
 332          according to the suffix.  The HTML suffixes are `.html' and
 333          `.htm', case-insensitive.  */
 334       if (redirections && u->local && (u->scheme == SCHEME_FTP))
 335         {
 336           char *suf = suffix (u->local);
 337           if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
 338             *dt |= TEXTHTML;
 339         }
 340 #endif
 341     }
 342   location_changed = (result == NEWLOCATION);
 343   if (location_changed)
 344     {
 345       char *construced_newloc;
 346       struct url *newloc_parsed;
 347
 348       assert (mynewloc != NULL);
 349
 350       if (local_file)
 351         xfree (local_file);
 352
 353       /* The HTTP specs only allow absolute URLs to appear in
 354          redirects, but a ton of boneheaded webservers and CGIs out
 355          there break the rules and use relative URLs, and popular
 356          browsers are lenient about this, so wget should be too. */
 357       construced_newloc = uri_merge (url, mynewloc);
 358       xfree (mynewloc);
 359       mynewloc = construced_newloc;
 360
 361       /* Now, see if this new location makes sense. */
 362       newloc_parsed = url_parse (mynewloc, &up_error_code);
 363       if (!newloc_parsed)
 364         {
 365           logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
 366                      url_error (up_error_code));
 367           url_free (u);
 368           if (redirections)
 369             string_set_free (redirections);
 370           xfree (url);
 371           xfree (mynewloc);
 372           return result;
 373         }
 374
 375       /* Now mynewloc will become newloc_parsed->url, because if the
 376          Location contained relative paths like .././something, we
 377          don't want that propagating as url.  */
 378       xfree (mynewloc);
 379       mynewloc = xstrdup (newloc_parsed->url);
 380
 381       if (!redirections)
 382         {
 383           redirections = make_string_hash_table (0);
 384           /* Add current URL immediately so we can detect it as soon
 385              as possible in case of a cycle. */
 386           string_set_add (redirections, u->url);
 387         }
 388
 389       /* The new location is OK.  Check for redirection cycle by
 390          peeking through the history of redirections. */
 391       if (string_set_contains (redirections, newloc_parsed->url))
 392         {
 393           logprintf (LOG_NOTQUIET, _("%s: Redirection cycle detected.\n"),
 394                      mynewloc);
 395           url_free (newloc_parsed);
 396           url_free (u);
 397           if (redirections)
 398             string_set_free (redirections);
 399           xfree (url);
 400           xfree (mynewloc);
 401           return WRONGCODE;
 402         }
 403       string_set_add (redirections, newloc_parsed->url);
 404
 405       xfree (url);
 406       url = mynewloc;
 407       url_free (u);
 408       u = newloc_parsed;
 409       goto redirected;
 410     }
 411
 412   if (local_file)
 413     {
 414       if (*dt & RETROKF)
 415         {
 416           register_download (url, local_file);
 417           if (redirections)
 418             register_all_redirections (redirections, url);
 419           if (*dt & TEXTHTML)
 420             register_html (url, local_file);
 421         }
 422     }
 423
 424   if (file)
 425     *file = local_file ? local_file : NULL;
 426   else
 427     FREE_MAYBE (local_file);
 428
 429   url_free (u);
 430   if (redirections)
 431     string_set_free (redirections);
 432
 433   if (newloc)
 434     *newloc = url;
 435   else
 436     xfree (url);
 437
 438   ++global_download_count;
 439
 440   return result;
 441 }
 442
 443 /* Find the URLs in the file and call retrieve_url() for each of
 444    them.  If HTML is non-zero, treat the file as HTML, and construct
 445    the URLs accordingly.
 446
 447    If opt.recursive is set, call recursive_retrieve() for each file.  */
 448 uerr_t
 449 retrieve_from_file (const char *file, int html, int *count)
 450 {
 451   uerr_t status;
 452   struct urlpos *url_list, *cur_url;
 453
 454   url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
 455               : get_urls_file (file));
 456   status = RETROK;             /* Suppose everything is OK.  */
 457   *count = 0;                  /* Reset the URL count.  */
 458
 459   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
 460     {
 461       char *filename = NULL, *new_file;
 462       int dt;
 463
 464       if (cur_url->ignore_when_downloading)
 465         continue;
 466
 467       if (downloaded_exceeds_quota ())
 468         {
 469           status = QUOTEXC;
 470           break;
 471         }
 472       if (opt.recursive && cur_url->url->scheme != SCHEME_FTP)
 473         status = retrieve_tree (cur_url->url->url);
 474       else
 475         status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
 476
 477       if (filename && opt.delete_after && file_exists_p (filename))
 478         {
 479           DEBUGP (("Removing file due to --delete-after in"
 480                    " retrieve_from_file():\n"));
 481           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
 482           if (unlink (filename))
 483             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 484           dt &= ~RETROKF;
 485         }
 486
 487       FREE_MAYBE (new_file);
 488       FREE_MAYBE (filename);
 489     }
 490
 491   /* Free the linked list of URL-s.  */
 492   free_urlpos (url_list);
 493
 494   return status;
 495 }
 496
 497 /* Print `giving up', or `retrying', depending on the impending
 498    action.  N1 and N2 are the attempt number and the attempt limit.  */
 499 void
 500 printwhat (int n1, int n2)
 501 {
 502   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
 503 }
 504
 505 /* Increment opt.downloaded by BY_HOW_MUCH.  If an overflow occurs,
 506    set opt.downloaded_overflow to 1. */
 507 void
 508 downloaded_increase (unsigned long by_how_much)
 509 {
 510   VERY_LONG_TYPE old;
 511   if (opt.downloaded_overflow)
 512     return;
 513   old = opt.downloaded;
 514   opt.downloaded += by_how_much;
 515   if (opt.downloaded < old)     /* carry flag, where are you when I
 516                                    need you? */
 517     {
 518       /* Overflow. */
 519       opt.downloaded_overflow = 1;
 520       opt.downloaded = ~((VERY_LONG_TYPE)0);
 521     }
 522 }
 523
 524 /* Return non-zero if the downloaded amount of bytes exceeds the
 525    desired quota.  If quota is not set or if the amount overflowed, 0
 526    is returned. */
 527 int
 528 downloaded_exceeds_quota (void)
 529 {
 530   if (!opt.quota)
 531     return 0;
 532   if (opt.downloaded_overflow)
 533     /* We don't really know.  (Wildly) assume not. */
 534     return 0;
 535
 536   return opt.downloaded > opt.quota;
 537 }
 538
 539 /* If opt.wait or opt.waitretry are specified, and if certain
 540    conditions are met, sleep the appropriate number of seconds.  See
 541    the documentation of --wait and --waitretry for more information.
 542
 543    COUNT is the count of current retrieval, beginning with 1. */
 544
 545 void
 546 sleep_between_retrievals (int count)
 547 {
 548   static int first_retrieval = 1;
 549
 550   if (!first_retrieval && (opt.wait || opt.waitretry))
 551     {
 552       if (opt.waitretry && count > 1)
 553         {
 554           /* If opt.waitretry is specified and this is a retry, wait
 555              for COUNT-1 number of seconds, or for opt.waitretry
 556              seconds.  */
 557           if (count <= opt.waitretry)
 558             sleep (count - 1);
 559           else
 560             sleep (opt.waitretry);
 561         }
 562       else if (opt.wait)
 563         {
 564           /* Otherwise, check if opt.wait is specified.  If so, sleep.  */
 565           if (count > 1 || !opt.random_wait)
 566             sleep (opt.wait);
 567           else
 568             {
 569               int waitsecs = random() % (opt.wait * 2 + 1);
 570               DEBUGP(("sleep_between_retrievals: norm=%ld,random=%ld,sleep=%d\n",
 571                       opt.wait, waitsecs - opt.wait, waitsecs));
 572               sleep(waitsecs);
 573             }
 574         }
 575     }
 576   if (first_retrieval)
 577     first_retrieval = 0;
 578 }