sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <sys/types.h>
  25 #ifdef HAVE_UNISTD_H
  26 # include <unistd.h>
  27 #endif /* HAVE_UNISTD_H */
  28 #include <errno.h>
  29 #ifdef HAVE_STRING_H
  30 # include <string.h>
  31 #else
  32 # include <strings.h>
  33 #endif /* HAVE_STRING_H */
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "retr.h"
  39 #include "progress.h"
  40 #include "url.h"
  41 #include "recur.h"
  42 #include "ftp.h"
  43 #include "host.h"
  44 #include "connect.h"
  45 #include "hash.h"
  46
  47 #ifndef errno
  48 extern int errno;
  49 #endif
  50
  51 /* See the comment in gethttp() why this is needed. */
  52 int global_download_count;
  53
  54 \f
  55 #define MIN(i, j) ((i) <= (j) ? (i) : (j))
  56
  57 /* Reads the contents of file descriptor FD, until it is closed, or a
  58    read error occurs.  The data is read in 8K chunks, and stored to
  59    stream fp, which should have been open for writing.  If BUF is
  60    non-NULL and its file descriptor is equal to FD, flush RBUF first.
  61    This function will *not* use the rbuf_* functions!
  62
  63    The EXPECTED argument is passed to show_progress() unchanged, but
  64    otherwise ignored.
  65
  66    If opt.verbose is set, the progress is also shown.  RESTVAL
  67    represents a value from which to start downloading (which will be
  68    shown accordingly).  If RESTVAL is non-zero, the stream should have
  69    been open for appending.
  70
  71    The function exits and returns codes of 0, -1 and -2 if the
  72    connection was closed, there was a read error, or if it could not
  73    write to the output stream, respectively.
  74
  75    IMPORTANT: The function flushes the contents of the buffer in
  76    rbuf_flush() before actually reading from fd.  If you wish to read
  77    from fd immediately, flush or discard the buffer.  */
  78 int
  79 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
  80               struct rbuf *rbuf, int use_expected)
  81 {
  82   int res = 0;
  83   static char c[8192];
  84   void *progress = NULL;
  85
  86   *len = restval;
  87   if (opt.verbose)
  88     progress = progress_create (restval, expected);
  89
  90   if (rbuf && RBUF_FD (rbuf) == fd)
  91     {
  92       int need_flush = 0;
  93       while ((res = rbuf_flush (rbuf, c, sizeof (c))) != 0)
  94         {
  95           if (fwrite (c, sizeof (char), res, fp) < res)
  96             return -2;
  97           if (opt.verbose)
  98             progress_update (progress, res);
  99           *len += res;
 100           need_flush = 1;
 101         }
 102       if (need_flush)
 103         fflush (fp);
 104       if (ferror (fp))
 105         return -2;
 106     }
 107   /* Read from fd while there is available data.
 108
 109      Normally, if expected is 0, it means that it is not known how
 110      much data is expected.  However, if use_expected is specified,
 111      then expected being zero means exactly that.  */
 112   while (!use_expected || (*len < expected))
 113     {
 114       int amount_to_read = (use_expected
 115                             ? MIN (expected - *len, sizeof (c))
 116                             : sizeof (c));
 117 #ifdef HAVE_SSL
 118                 if (rbuf->ssl!=NULL) {
 119                   res = ssl_iread (rbuf->ssl, c, amount_to_read);
 120                 } else {
 121 #endif /* HAVE_SSL */
 122                   res = iread (fd, c, amount_to_read);
 123 #ifdef HAVE_SSL
 124                 }
 125 #endif /* HAVE_SSL */
 126       if (res > 0)
 127         {
 128           fwrite (c, sizeof (char), res, fp);
 129           /* Always flush the contents of the network packet.  This
 130              should not be adverse to performance, as the network
 131              packets typically won't be too tiny anyway.  */
 132           fflush (fp);
 133           if (ferror (fp))
 134             return -2;
 135           if (opt.verbose)
 136             progress_update (progress, res);
 137           *len += res;
 138         }
 139       else
 140         break;
 141     }
 142   if (res < -1)
 143     res = -1;
 144   if (opt.verbose)
 145     progress_finish (progress);
 146   return res;
 147 }
 148 \f
 149 /* Return a printed representation of the download rate, as
 150    appropriate for the speed.  Appropriate means that if rate is
 151    greater than 1K/s, kilobytes are used, and if rate is greater than
 152    1MB/s, megabytes are used.
 153
 154    If PAD is non-zero, strings will be padded to the width of 7
 155    characters (xxxx.xx).  */
 156 char *
 157 rate (long bytes, long msecs, int pad)
 158 {
 159   static char res[15];
 160   double dlrate;
 161
 162   assert (msecs >= 0);
 163   assert (bytes >= 0);
 164
 165   if (msecs == 0)
 166     /* If elapsed time is 0, it means we're under the granularity of
 167        the timer.  This often happens on systems that use time() for
 168        the timer.  */
 169     msecs = wtimer_granularity ();
 170
 171   dlrate = (double)1000 * bytes / msecs;
 172   if (dlrate < 1024.0)
 173     sprintf (res, pad ? "%7.2f B/s" : "%.2f B/s", dlrate);
 174   else if (dlrate < 1024.0 * 1024.0)
 175     sprintf (res, pad ? "%7.2f K/s" : "%.2f K/s", dlrate / 1024.0);
 176   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 177     sprintf (res, pad ? "%7.2f M/s" : "%.2f M/s", dlrate / (1024.0 * 1024.0));
 178   else
 179     /* Maybe someone will need this one day.  More realistically, it
 180        will get tickled by buggy timers. */
 181     sprintf (res, pad ? "%7.2f GB/s" : "%.2f GB/s",
 182              dlrate / (1024.0 * 1024.0 * 1024.0));
 183
 184   return res;
 185 }
 186 \f
 187 #define USE_PROXY_P(u) (opt.use_proxy && getproxy((u)->scheme)          \
 188                         && no_proxy_match((u)->host,                    \
 189                                           (const char **)opt.no_proxy))
 190
 191 /* Retrieve the given URL.  Decides which loop to call -- HTTP(S), FTP,
 192    or simply copy it with file:// (#### the latter not yet
 193    implemented!).  */
 194 uerr_t
 195 retrieve_url (const char *origurl, char **file, char **newloc,
 196               const char *refurl, int *dt)
 197 {
 198   uerr_t result;
 199   char *url;
 200   int location_changed, dummy;
 201   int use_proxy;
 202   char *mynewloc, *proxy;
 203   struct url *u;
 204   int up_error_code;            /* url parse error code */
 205   char *local_file;
 206   struct hash_table *redirections = NULL;
 207
 208   /* If dt is NULL, just ignore it.  */
 209   if (!dt)
 210     dt = &dummy;
 211   url = xstrdup (origurl);
 212   if (newloc)
 213     *newloc = NULL;
 214   if (file)
 215     *file = NULL;
 216
 217   u = url_parse (url, &up_error_code);
 218   if (!u)
 219     {
 220       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
 221       if (redirections)
 222         string_set_free (redirections);
 223       xfree (url);
 224       return URLERROR;
 225     }
 226
 227   if (!refurl)
 228     refurl = opt.referer;
 229
 230  redirected:
 231
 232   result = NOCONERROR;
 233   mynewloc = NULL;
 234   local_file = NULL;
 235
 236   use_proxy = USE_PROXY_P (u);
 237   if (use_proxy)
 238     {
 239       struct url *proxy_url;
 240
 241       /* Get the proxy server for the current scheme.  */
 242       proxy = getproxy (u->scheme);
 243       if (!proxy)
 244         {
 245           logputs (LOG_NOTQUIET, _("Could not find proxy host.\n"));
 246           url_free (u);
 247           if (redirections)
 248             string_set_free (redirections);
 249           xfree (url);
 250           return PROXERR;
 251         }
 252
 253       /* Parse the proxy URL.  */
 254       proxy_url = url_parse (proxy, &up_error_code);
 255       if (!proxy_url)
 256         {
 257           logprintf (LOG_NOTQUIET, "Error parsing proxy URL %s: %s.\n",
 258                      proxy, url_error (up_error_code));
 259           if (redirections)
 260             string_set_free (redirections);
 261           xfree (url);
 262           return PROXERR;
 263         }
 264       if (proxy_url->scheme != SCHEME_HTTP)
 265         {
 266           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 267           url_free (proxy_url);
 268           if (redirections)
 269             string_set_free (redirections);
 270           xfree (url);
 271           return PROXERR;
 272         }
 273
 274       result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
 275       url_free (proxy_url);
 276     }
 277   else if (u->scheme == SCHEME_HTTP
 278 #ifdef HAVE_SSL
 279       || u->scheme == SCHEME_HTTPS
 280 #endif
 281       )
 282     {
 283       result = http_loop (u, &mynewloc, &local_file, refurl, dt, NULL);
 284     }
 285   else if (u->scheme == SCHEME_FTP)
 286     {
 287       /* If this is a redirection, we must not allow recursive FTP
 288          retrieval, so we save recursion to oldrec, and restore it
 289          later.  */
 290       int oldrec = opt.recursive;
 291       if (redirections)
 292         opt.recursive = 0;
 293       result = ftp_loop (u, dt);
 294       opt.recursive = oldrec;
 295 #if 0
 296       /* There is a possibility of having HTTP being redirected to
 297          FTP.  In these cases we must decide whether the text is HTML
 298          according to the suffix.  The HTML suffixes are `.html' and
 299          `.htm', case-insensitive.  */
 300       if (redirections && u->local && (u->scheme == SCHEME_FTP))
 301         {
 302           char *suf = suffix (u->local);
 303           if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
 304             *dt |= TEXTHTML;
 305           FREE_MAYBE (suf);
 306         }
 307 #endif
 308     }
 309   location_changed = (result == NEWLOCATION);
 310   if (location_changed)
 311     {
 312       char *construced_newloc;
 313       struct url *newloc_struct;
 314
 315       assert (mynewloc != NULL);
 316
 317       if (local_file)
 318         xfree (local_file);
 319
 320       /* The HTTP specs only allow absolute URLs to appear in
 321          redirects, but a ton of boneheaded webservers and CGIs out
 322          there break the rules and use relative URLs, and popular
 323          browsers are lenient about this, so wget should be too. */
 324       construced_newloc = uri_merge (url, mynewloc);
 325       xfree (mynewloc);
 326       mynewloc = construced_newloc;
 327
 328       /* Now, see if this new location makes sense. */
 329       newloc_struct = url_parse (mynewloc, &up_error_code);
 330       if (!newloc_struct)
 331         {
 332           logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
 333                      url_error (up_error_code));
 334           url_free (newloc_struct);
 335           url_free (u);
 336           if (redirections)
 337             string_set_free (redirections);
 338           xfree (url);
 339           xfree (mynewloc);
 340           return result;
 341         }
 342
 343       /* Now mynewloc will become newloc_struct->url, because if the
 344          Location contained relative paths like .././something, we
 345          don't want that propagating as url.  */
 346       xfree (mynewloc);
 347       mynewloc = xstrdup (newloc_struct->url);
 348
 349       if (!redirections)
 350         {
 351           redirections = make_string_hash_table (0);
 352           /* Add current URL immediately so we can detect it as soon
 353              as possible in case of a cycle. */
 354           string_set_add (redirections, u->url);
 355         }
 356
 357       /* The new location is OK.  Check for redirection cycle by
 358          peeking through the history of redirections. */
 359       if (string_set_contains (redirections, newloc_struct->url))
 360         {
 361           logprintf (LOG_NOTQUIET, _("%s: Redirection cycle detected.\n"),
 362                      mynewloc);
 363           url_free (newloc_struct);
 364           url_free (u);
 365           if (redirections)
 366             string_set_free (redirections);
 367           xfree (url);
 368           xfree (mynewloc);
 369           return WRONGCODE;
 370         }
 371       string_set_add (redirections, newloc_struct->url);
 372
 373       xfree (url);
 374       url = mynewloc;
 375       url_free (u);
 376       u = newloc_struct;
 377       goto redirected;
 378     }
 379
 380   if (local_file)
 381     {
 382       if (*dt & RETROKF)
 383         {
 384           register_download (url, local_file);
 385           if (*dt & TEXTHTML)
 386             register_html (url, local_file);
 387         }
 388     }
 389
 390   if (file)
 391     *file = local_file ? local_file : NULL;
 392   else
 393     FREE_MAYBE (local_file);
 394
 395   url_free (u);
 396   if (redirections)
 397     string_set_free (redirections);
 398
 399   if (newloc)
 400     *newloc = url;
 401   else
 402     xfree (url);
 403
 404   ++global_download_count;
 405
 406   return result;
 407 }
 408
 409 /* Find the URLs in the file and call retrieve_url() for each of
 410    them.  If HTML is non-zero, treat the file as HTML, and construct
 411    the URLs accordingly.
 412
 413    If opt.recursive is set, call recursive_retrieve() for each file.  */
 414 uerr_t
 415 retrieve_from_file (const char *file, int html, int *count)
 416 {
 417   uerr_t status;
 418   urlpos *url_list, *cur_url;
 419
 420   url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
 421               : get_urls_file (file));
 422   status = RETROK;             /* Suppose everything is OK.  */
 423   *count = 0;                  /* Reset the URL count.  */
 424   recursive_reset ();
 425   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
 426     {
 427       char *filename, *new_file;
 428       int dt;
 429
 430       if (downloaded_exceeds_quota ())
 431         {
 432           status = QUOTEXC;
 433           break;
 434         }
 435       status = retrieve_url (cur_url->url, &filename, &new_file, NULL, &dt);
 436       if (opt.recursive && status == RETROK && (dt & TEXTHTML))
 437         status = recursive_retrieve (filename, new_file ? new_file
 438                                                         : cur_url->url);
 439
 440       if (filename && opt.delete_after && file_exists_p (filename))
 441         {
 442           DEBUGP (("Removing file due to --delete-after in"
 443                    " retrieve_from_file():\n"));
 444           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
 445           if (unlink (filename))
 446             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 447           dt &= ~RETROKF;
 448         }
 449
 450       FREE_MAYBE (new_file);
 451       FREE_MAYBE (filename);
 452     }
 453
 454   /* Free the linked list of URL-s.  */
 455   free_urlpos (url_list);
 456
 457   return status;
 458 }
 459
 460 /* Print `giving up', or `retrying', depending on the impending
 461    action.  N1 and N2 are the attempt number and the attempt limit.  */
 462 void
 463 printwhat (int n1, int n2)
 464 {
 465   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
 466 }
 467
 468 /* Increment opt.downloaded by BY_HOW_MUCH.  If an overflow occurs,
 469    set opt.downloaded_overflow to 1. */
 470 void
 471 downloaded_increase (unsigned long by_how_much)
 472 {
 473   VERY_LONG_TYPE old;
 474   if (opt.downloaded_overflow)
 475     return;
 476   old = opt.downloaded;
 477   opt.downloaded += by_how_much;
 478   if (opt.downloaded < old)     /* carry flag, where are you when I
 479                                    need you? */
 480     {
 481       /* Overflow. */
 482       opt.downloaded_overflow = 1;
 483       opt.downloaded = ~((VERY_LONG_TYPE)0);
 484     }
 485 }
 486
 487 /* Return non-zero if the downloaded amount of bytes exceeds the
 488    desired quota.  If quota is not set or if the amount overflowed, 0
 489    is returned. */
 490 int
 491 downloaded_exceeds_quota (void)
 492 {
 493   if (!opt.quota)
 494     return 0;
 495   if (opt.downloaded_overflow)
 496     /* We don't really know.  (Wildly) assume not. */
 497     return 0;
 498
 499   return opt.downloaded > opt.quota;
 500 }
 501
 502 /* If opt.wait or opt.waitretry are specified, and if certain
 503    conditions are met, sleep the appropriate number of seconds.  See
 504    the documentation of --wait and --waitretry for more information.
 505
 506    COUNT is the count of current retrieval, beginning with 1. */
 507
 508 void
 509 sleep_between_retrievals (int count)
 510 {
 511   static int first_retrieval = 1;
 512
 513   if (!first_retrieval && (opt.wait || opt.waitretry))
 514     {
 515       if (opt.waitretry && count > 1)
 516         {
 517           /* If opt.waitretry is specified and this is a retry, wait
 518              for COUNT-1 number of seconds, or for opt.waitretry
 519              seconds.  */
 520           if (count <= opt.waitretry)
 521             sleep (count - 1);
 522           else
 523             sleep (opt.waitretry);
 524         }
 525       else if (opt.wait)
 526         /* Otherwise, check if opt.wait is specified.  If so, sleep.  */
 527         sleep (opt.wait);
 528     }
 529   if (first_retrieval)
 530     first_retrieval = 0;
 531 }