sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <sys/types.h>
  25 #ifdef HAVE_UNISTD_H
  26 # include <unistd.h>
  27 #endif /* HAVE_UNISTD_H */
  28 #include <errno.h>
  29 #ifdef HAVE_STRING_H
  30 # include <string.h>
  31 #else
  32 # include <strings.h>
  33 #endif /* HAVE_STRING_H */
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "retr.h"
  39 #include "progress.h"
  40 #include "url.h"
  41 #include "recur.h"
  42 #include "ftp.h"
  43 #include "host.h"
  44 #include "connect.h"
  45 #include "hash.h"
  46
  47 #ifndef errno
  48 extern int errno;
  49 #endif
  50
/* Incremented by retrieve_url() just before its final return; the
   early error-return paths of that function leave it unchanged.
   See the comment in gethttp() why this is needed. */
int global_download_count;
  53
  54 \f
  55 #define MIN(i, j) ((i) <= (j) ? (i) : (j))
  56
  57 /* Reads the contents of file descriptor FD, until it is closed, or a
  58    read error occurs.  The data is read in 8K chunks, and stored to
  59    stream fp, which should have been open for writing.  If BUF is
  60    non-NULL and its file descriptor is equal to FD, flush RBUF first.
  61    This function will *not* use the rbuf_* functions!
  62
  63    The EXPECTED argument is passed to show_progress() unchanged, but
  64    otherwise ignored.
  65
  66    If opt.verbose is set, the progress is also shown.  RESTVAL
  67    represents a value from which to start downloading (which will be
  68    shown accordingly).  If RESTVAL is non-zero, the stream should have
  69    been open for appending.
  70
  71    The function exits and returns codes of 0, -1 and -2 if the
  72    connection was closed, there was a read error, or if it could not
  73    write to the output stream, respectively.
  74
  75    IMPORTANT: The function flushes the contents of the buffer in
  76    rbuf_flush() before actually reading from fd.  If you wish to read
  77    from fd immediately, flush or discard the buffer.  */
  78 int
  79 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
  80               struct rbuf *rbuf, int use_expected)
  81 {
  82   int res = 0;
  83   static char c[8192];
  84   void *progress = NULL;
  85
  86   *len = restval;
  87   if (opt.verbose)
  88     progress = progress_create (restval, expected);
  89
  90   if (rbuf && RBUF_FD (rbuf) == fd)
  91     {
  92       int need_flush = 0;
  93       while ((res = rbuf_flush (rbuf, c, sizeof (c))) != 0)
  94         {
  95           if (fwrite (c, sizeof (char), res, fp) < res)
  96             return -2;
  97           if (opt.verbose)
  98             progress_update (progress, res);
  99           *len += res;
 100           need_flush = 1;
 101         }
 102       if (need_flush)
 103         fflush (fp);
 104       if (ferror (fp))
 105         return -2;
 106     }
 107   /* Read from fd while there is available data.
 108
 109      Normally, if expected is 0, it means that it is not known how
 110      much data is expected.  However, if use_expected is specified,
 111      then expected being zero means exactly that.  */
 112   while (!use_expected || (*len < expected))
 113     {
 114       int amount_to_read = (use_expected
 115                             ? MIN (expected - *len, sizeof (c))
 116                             : sizeof (c));
 117 #ifdef HAVE_SSL
 118                 if (rbuf->ssl!=NULL) {
 119                   res = ssl_iread (rbuf->ssl, c, amount_to_read);
 120                 } else {
 121 #endif /* HAVE_SSL */
 122                   res = iread (fd, c, amount_to_read);
 123 #ifdef HAVE_SSL
 124                 }
 125 #endif /* HAVE_SSL */
 126       if (res > 0)
 127         {
 128           fwrite (c, sizeof (char), res, fp);
 129           /* Always flush the contents of the network packet.  This
 130              should not be adverse to performance, as the network
 131              packets typically won't be too tiny anyway.  */
 132           fflush (fp);
 133           if (ferror (fp))
 134             return -2;
 135           if (opt.verbose)
 136             progress_update (progress, res);
 137           *len += res;
 138         }
 139       else
 140         break;
 141     }
 142   if (res < -1)
 143     res = -1;
 144   if (opt.verbose)
 145     progress_finish (progress);
 146   return res;
 147 }
 148 \f
 149 /* Return a printed representation of the download rate, as
 150    appropriate for the speed.  Appropriate means that if rate is
 151    greater than 1K/s, kilobytes are used, and if rate is greater than
 152    1MB/s, megabytes are used.
 153
 154    If PAD is non-zero, strings will be padded to the width of 7
 155    characters (xxxx.xx).  */
 156 char *
 157 rate (long bytes, long msecs, int pad)
 158 {
 159   static char res[15];
 160   double dlrate;
 161
 162   assert (msecs >= 0);
 163   assert (bytes >= 0);
 164
 165   if (msecs == 0)
 166     /* If elapsed time is 0, it means we're under the granularity of
 167        the timer.  This often happens on systems that use time() for
 168        the timer.  */
 169     msecs = wtimer_granularity ();
 170
 171   dlrate = (double)1000 * bytes / msecs;
 172   if (dlrate < 1024.0)
 173     sprintf (res, pad ? "%7.2f B/s" : "%.2f B/s", dlrate);
 174   else if (dlrate < 1024.0 * 1024.0)
 175     sprintf (res, pad ? "%7.2f K/s" : "%.2f K/s", dlrate / 1024.0);
 176   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 177     sprintf (res, pad ? "%7.2f M/s" : "%.2f M/s", dlrate / (1024.0 * 1024.0));
 178   else
 179     /* Maybe someone will need this one day.  More realistically, it
 180        will get tickled by buggy timers. */
 181     sprintf (res, pad ? "%7.2f GB/s" : "%.2f GB/s",
 182              dlrate / (1024.0 * 1024.0 * 1024.0));
 183
 184   return res;
 185 }
 186 \f
 187 #define USE_PROXY_P(u) (opt.use_proxy && getproxy((u)->scheme)          \
 188                         && no_proxy_match((u)->host,                    \
 189                                           (const char **)opt.no_proxy))
 190
 191 /* Retrieve the given URL.  Decides which loop to call -- HTTP(S), FTP,
 192    or simply copy it with file:// (#### the latter not yet
 193    implemented!).  */
 194 uerr_t
 195 retrieve_url (const char *origurl, char **file, char **newloc,
 196               const char *refurl, int *dt)
 197 {
 198   uerr_t result;
 199   char *url;
 200   int location_changed, dummy;
 201   int use_proxy;
 202   char *mynewloc, *proxy;
 203   struct url *u;
 204   int up_error_code;            /* url parse error code */
 205   char *local_file;
 206   struct hash_table *redirections = NULL;
 207
 208   /* If dt is NULL, just ignore it.  */
 209   if (!dt)
 210     dt = &dummy;
 211   url = xstrdup (origurl);
 212   if (newloc)
 213     *newloc = NULL;
 214   if (file)
 215     *file = NULL;
 216
 217   u = url_parse (url, &up_error_code);
 218   if (!u)
 219     {
 220       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
 221       if (redirections)
 222         string_set_free (redirections);
 223       xfree (url);
 224       return URLERROR;
 225     }
 226
 227   if (!refurl)
 228     refurl = opt.referer;
 229
 230  redirected:
 231
 232   result = NOCONERROR;
 233   mynewloc = NULL;
 234   local_file = NULL;
 235
 236   use_proxy = USE_PROXY_P (u);
 237   if (use_proxy)
 238     {
 239       struct url *proxy_url;
 240
 241       /* Get the proxy server for the current scheme.  */
 242       proxy = getproxy (u->scheme);
 243       if (!proxy)
 244         {
 245           logputs (LOG_NOTQUIET, _("Could not find proxy host.\n"));
 246           url_free (u);
 247           if (redirections)
 248             string_set_free (redirections);
 249           xfree (url);
 250           return PROXERR;
 251         }
 252
 253       /* Parse the proxy URL.  */
 254       proxy_url = url_parse (proxy, &up_error_code);
 255       if (!proxy_url)
 256         {
 257           logprintf (LOG_NOTQUIET, "Error parsing proxy URL %s: %s.\n",
 258                      proxy, url_error (up_error_code));
 259           if (redirections)
 260             string_set_free (redirections);
 261           xfree (url);
 262           return PROXERR;
 263         }
 264       if (proxy_url->scheme != SCHEME_HTTP)
 265         {
 266           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 267           url_free (proxy_url);
 268           if (redirections)
 269             string_set_free (redirections);
 270           xfree (url);
 271           return PROXERR;
 272         }
 273
 274       result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
 275       url_free (proxy_url);
 276     }
 277   else if (u->scheme == SCHEME_HTTP
 278 #ifdef HAVE_SSL
 279       || u->scheme == SCHEME_HTTPS
 280 #endif
 281       )
 282     {
 283       result = http_loop (u, &mynewloc, &local_file, refurl, dt, NULL);
 284     }
 285   else if (u->scheme == SCHEME_FTP)
 286     {
 287       /* If this is a redirection, we must not allow recursive FTP
 288          retrieval, so we save recursion to oldrec, and restore it
 289          later.  */
 290       int oldrec = opt.recursive;
 291       if (redirections)
 292         opt.recursive = 0;
 293       result = ftp_loop (u, dt);
 294       opt.recursive = oldrec;
 295 #if 0
 296       /* There is a possibility of having HTTP being redirected to
 297          FTP.  In these cases we must decide whether the text is HTML
 298          according to the suffix.  The HTML suffixes are `.html' and
 299          `.htm', case-insensitive.  */
 300       if (redirections && u->local && (u->scheme == SCHEME_FTP))
 301         {
 302           char *suf = suffix (u->local);
 303           if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
 304             *dt |= TEXTHTML;
 305           FREE_MAYBE (suf);
 306         }
 307 #endif
 308     }
 309   location_changed = (result == NEWLOCATION);
 310   if (location_changed)
 311     {
 312       char *construced_newloc;
 313       struct url *newloc_struct;
 314
 315       assert (mynewloc != NULL);
 316
 317       if (local_file)
 318         xfree (local_file);
 319
 320       /* The HTTP specs only allow absolute URLs to appear in
 321          redirects, but a ton of boneheaded webservers and CGIs out
 322          there break the rules and use relative URLs, and popular
 323          browsers are lenient about this, so wget should be too. */
 324       construced_newloc = uri_merge (url, mynewloc);
 325       xfree (mynewloc);
 326       mynewloc = construced_newloc;
 327
 328       /* Now, see if this new location makes sense. */
 329       newloc_struct = url_parse (mynewloc, NULL);
 330       if (!newloc_struct)
 331         {
 332           logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc, "UNKNOWN");
 333           url_free (newloc_struct);
 334           url_free (u);
 335           if (redirections)
 336             string_set_free (redirections);
 337           xfree (url);
 338           xfree (mynewloc);
 339           return result;
 340         }
 341
 342       /* Now mynewloc will become newloc_struct->url, because if the
 343          Location contained relative paths like .././something, we
 344          don't want that propagating as url.  */
 345       xfree (mynewloc);
 346       mynewloc = xstrdup (newloc_struct->url);
 347
 348       if (!redirections)
 349         {
 350           redirections = make_string_hash_table (0);
 351           /* Add current URL immediately so we can detect it as soon
 352              as possible in case of a cycle. */
 353           string_set_add (redirections, u->url);
 354         }
 355
 356       /* The new location is OK.  Check for redirection cycle by
 357          peeking through the history of redirections. */
 358       if (string_set_contains (redirections, newloc_struct->url))
 359         {
 360           logprintf (LOG_NOTQUIET, _("%s: Redirection cycle detected.\n"),
 361                      mynewloc);
 362           url_free (newloc_struct);
 363           url_free (u);
 364           if (redirections)
 365             string_set_free (redirections);
 366           xfree (url);
 367           xfree (mynewloc);
 368           return WRONGCODE;
 369         }
 370       string_set_add (redirections, newloc_struct->url);
 371
 372       xfree (url);
 373       url = mynewloc;
 374       url_free (u);
 375       u = newloc_struct;
 376       goto redirected;
 377     }
 378
 379   if (local_file)
 380     {
 381       if (*dt & RETROKF)
 382         {
 383           register_download (url, local_file);
 384           if (*dt & TEXTHTML)
 385             register_html (url, local_file);
 386         }
 387     }
 388
 389   if (file)
 390     *file = local_file ? local_file : NULL;
 391   else
 392     FREE_MAYBE (local_file);
 393
 394   url_free (u);
 395   if (redirections)
 396     string_set_free (redirections);
 397
 398   if (newloc)
 399     *newloc = url;
 400   else
 401     xfree (url);
 402
 403   ++global_download_count;
 404
 405   return result;
 406 }
 407
 408 /* Find the URLs in the file and call retrieve_url() for each of
 409    them.  If HTML is non-zero, treat the file as HTML, and construct
 410    the URLs accordingly.
 411
 412    If opt.recursive is set, call recursive_retrieve() for each file.  */
 413 uerr_t
 414 retrieve_from_file (const char *file, int html, int *count)
 415 {
 416   uerr_t status;
 417   urlpos *url_list, *cur_url;
 418
 419   url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
 420               : get_urls_file (file));
 421   status = RETROK;             /* Suppose everything is OK.  */
 422   *count = 0;                  /* Reset the URL count.  */
 423   recursive_reset ();
 424   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
 425     {
 426       char *filename, *new_file;
 427       int dt;
 428
 429       if (downloaded_exceeds_quota ())
 430         {
 431           status = QUOTEXC;
 432           break;
 433         }
 434       status = retrieve_url (cur_url->url, &filename, &new_file, NULL, &dt);
 435       if (opt.recursive && status == RETROK && (dt & TEXTHTML))
 436         status = recursive_retrieve (filename, new_file ? new_file
 437                                                         : cur_url->url);
 438
 439       if (filename && opt.delete_after && file_exists_p (filename))
 440         {
 441           DEBUGP (("Removing file due to --delete-after in"
 442                    " retrieve_from_file():\n"));
 443           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
 444           if (unlink (filename))
 445             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 446           dt &= ~RETROKF;
 447         }
 448
 449       FREE_MAYBE (new_file);
 450       FREE_MAYBE (filename);
 451     }
 452
 453   /* Free the linked list of URL-s.  */
 454   free_urlpos (url_list);
 455
 456   return status;
 457 }
 458
 459 /* Print `giving up', or `retrying', depending on the impending
 460    action.  N1 and N2 are the attempt number and the attempt limit.  */
 461 void
 462 printwhat (int n1, int n2)
 463 {
 464   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
 465 }
 466
 467 /* Increment opt.downloaded by BY_HOW_MUCH.  If an overflow occurs,
 468    set opt.downloaded_overflow to 1. */
 469 void
 470 downloaded_increase (unsigned long by_how_much)
 471 {
 472   VERY_LONG_TYPE old;
 473   if (opt.downloaded_overflow)
 474     return;
 475   old = opt.downloaded;
 476   opt.downloaded += by_how_much;
 477   if (opt.downloaded < old)     /* carry flag, where are you when I
 478                                    need you? */
 479     {
 480       /* Overflow. */
 481       opt.downloaded_overflow = 1;
 482       opt.downloaded = ~((VERY_LONG_TYPE)0);
 483     }
 484 }
 485
 486 /* Return non-zero if the downloaded amount of bytes exceeds the
 487    desired quota.  If quota is not set or if the amount overflowed, 0
 488    is returned. */
 489 int
 490 downloaded_exceeds_quota (void)
 491 {
 492   if (!opt.quota)
 493     return 0;
 494   if (opt.downloaded_overflow)
 495     /* We don't really know.  (Wildly) assume not. */
 496     return 0;
 497
 498   return opt.downloaded > opt.quota;
 499 }
 500
 501 /* If opt.wait or opt.waitretry are specified, and if certain
 502    conditions are met, sleep the appropriate number of seconds.  See
 503    the documentation of --wait and --waitretry for more information.
 504
 505    COUNT is the count of current retrieval, beginning with 1. */
 506
 507 void
 508 sleep_between_retrievals (int count)
 509 {
 510   static int first_retrieval = 1;
 511
 512   if (!first_retrieval && (opt.wait || opt.waitretry))
 513     {
 514       if (opt.waitretry && count > 1)
 515         {
 516           /* If opt.waitretry is specified and this is a retry, wait
 517              for COUNT-1 number of seconds, or for opt.waitretry
 518              seconds.  */
 519           if (count <= opt.waitretry)
 520             sleep (count - 1);
 521           else
 522             sleep (opt.waitretry);
 523         }
 524       else if (opt.wait)
 525         /* Otherwise, check if opt.wait is specified.  If so, sleep.  */
 526         sleep (opt.wait);
 527     }
 528   if (first_retrieval)
 529     first_retrieval = 0;
 530 }