sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <sys/types.h>
  25 #ifdef HAVE_UNISTD_H
  26 # include <unistd.h>
  27 #endif /* HAVE_UNISTD_H */
  28 #include <errno.h>
  29 #ifdef HAVE_STRING_H
  30 # include <string.h>
  31 #else
  32 # include <strings.h>
  33 #endif /* HAVE_STRING_H */
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "retr.h"
  39 #include "progress.h"
  40 #include "url.h"
  41 #include "recur.h"
  42 #include "ftp.h"
  43 #include "host.h"
  44 #include "connect.h"
  45 #include "hash.h"
  46
  47 #ifndef errno
  48 extern int errno;
  49 #endif
  50
/* See the comment in gethttp() for why this is needed. */
  52 int global_download_count;
  53
  54 \f
  55 #define MIN(i, j) ((i) <= (j) ? (i) : (j))
  56
  57 /* Reads the contents of file descriptor FD, until it is closed, or a
  58    read error occurs.  The data is read in 8K chunks, and stored to
  59    stream fp, which should have been open for writing.  If BUF is
  60    non-NULL and its file descriptor is equal to FD, flush RBUF first.
  61    This function will *not* use the rbuf_* functions!
  62
  63    The EXPECTED argument is passed to show_progress() unchanged, but
  64    otherwise ignored.
  65
  66    If opt.verbose is set, the progress is also shown.  RESTVAL
  67    represents a value from which to start downloading (which will be
  68    shown accordingly).  If RESTVAL is non-zero, the stream should have
  69    been open for appending.
  70
  71    The function exits and returns codes of 0, -1 and -2 if the
  72    connection was closed, there was a read error, or if it could not
  73    write to the output stream, respectively.
  74
  75    IMPORTANT: The function flushes the contents of the buffer in
  76    rbuf_flush() before actually reading from fd.  If you wish to read
  77    from fd immediately, flush or discard the buffer.  */
  78 int
  79 get_contents (int fd, FILE *fp, long *len, long restval, long expected,
  80               struct rbuf *rbuf, int use_expected)
  81 {
  82   int res = 0;
  83   static char c[8192];
  84   void *progress = NULL;
  85
  86   *len = restval;
  87   if (opt.verbose)
  88     progress = progress_create (restval, expected);
  89
  90   if (rbuf && RBUF_FD (rbuf) == fd)
  91     {
  92       int need_flush = 0;
  93       while ((res = rbuf_flush (rbuf, c, sizeof (c))) != 0)
  94         {
  95           if (fwrite (c, sizeof (char), res, fp) < res)
  96             return -2;
  97           if (opt.verbose)
  98             progress_update (progress, res);
  99           *len += res;
 100           need_flush = 1;
 101         }
 102       if (need_flush)
 103         fflush (fp);
 104       if (ferror (fp))
 105         return -2;
 106     }
 107   /* Read from fd while there is available data.
 108
 109      Normally, if expected is 0, it means that it is not known how
 110      much data is expected.  However, if use_expected is specified,
 111      then expected being zero means exactly that.  */
 112   while (!use_expected || (*len < expected))
 113     {
 114       int amount_to_read = (use_expected
 115                             ? MIN (expected - *len, sizeof (c))
 116                             : sizeof (c));
 117 #ifdef HAVE_SSL
 118                 if (rbuf->ssl!=NULL) {
 119                   res = ssl_iread (rbuf->ssl, c, amount_to_read);
 120                 } else {
 121 #endif /* HAVE_SSL */
 122                   res = iread (fd, c, amount_to_read);
 123 #ifdef HAVE_SSL
 124                 }
 125 #endif /* HAVE_SSL */
 126       if (res > 0)
 127         {
 128           fwrite (c, sizeof (char), res, fp);
 129           /* Always flush the contents of the network packet.  This
 130              should not be adverse to performance, as the network
 131              packets typically won't be too tiny anyway.  */
 132           fflush (fp);
 133           if (ferror (fp))
 134             return -2;
 135           if (opt.verbose)
 136             progress_update (progress, res);
 137           *len += res;
 138         }
 139       else
 140         break;
 141     }
 142   if (res < -1)
 143     res = -1;
 144   if (opt.verbose)
 145     progress_finish (progress);
 146   return res;
 147 }
 148 \f
 149 /* Return a printed representation of the download rate, as
 150    appropriate for the speed.  If PAD is non-zero, strings will be
 151    padded to the width of 7 characters (xxxx.xx).  */
 152 char *
 153 retr_rate (long bytes, long msecs, int pad)
 154 {
 155   static char res[20];
 156   static char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 157   int units = 0;
 158
 159   double dlrate = calc_rate (bytes, msecs, &units);
 160   sprintf (res, pad ? "%7.2f %s" : "%.2f %s", dlrate, rate_names[units]);
 161
 162   return res;
 163 }
 164
 165 /* Calculate the download rate and trim it as appropriate for the
 166    speed.  Appropriate means that if rate is greater than 1K/s,
 167    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 168    are used.
 169
 170    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 171    GB/s.  */
 172 double
 173 calc_rate (long bytes, long msecs, int *units)
 174 {
 175   double dlrate;
 176
 177   assert (msecs >= 0);
 178   assert (bytes >= 0);
 179
 180   if (msecs == 0)
 181     /* If elapsed time is 0, it means we're under the granularity of
 182        the timer.  This often happens on systems that use time() for
 183        the timer.  */
 184     msecs = wtimer_granularity ();
 185
 186   dlrate = (double)1000 * bytes / msecs;
 187   if (dlrate < 1024.0)
 188     *units = 0;
 189   else if (dlrate < 1024.0 * 1024.0)
 190     *units = 1, dlrate /= 1024.0;
 191   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 192     *units = 2, dlrate /= (1024.0 * 1024.0);
 193   else
 194     /* Maybe someone will need this one day.  More realistically, it
 195        will get tickled by buggy timers. */
 196     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
 197
 198   return dlrate;
 199 }
 200 \f
 201 static int
 202 register_redirections_mapper (void *key, void *value, void *arg)
 203 {
 204   const char *redirected_from = (const char *)key;
 205   const char *redirected_to   = (const char *)arg;
 206   if (0 != strcmp (redirected_from, redirected_to))
 207     register_redirection (redirected_from, redirected_to);
 208   return 0;
 209 }
 210
 211 /* Register the redirections that lead to the successful download of
 212    this URL.  This is necessary so that the link converter can convert
 213    redirected URLs to the local file.  */
 214
 215 static void
 216 register_all_redirections (struct hash_table *redirections, const char *final)
 217 {
 218   hash_table_map (redirections, register_redirections_mapper, (void *)final);
 219 }
 220
 221 #define USE_PROXY_P(u) (opt.use_proxy && getproxy((u)->scheme)          \
 222                         && no_proxy_match((u)->host,                    \
 223                                           (const char **)opt.no_proxy))
 224
 225 /* Retrieve the given URL.  Decides which loop to call -- HTTP(S), FTP,
 226    or simply copy it with file:// (#### the latter not yet
 227    implemented!).  */
 228 uerr_t
 229 retrieve_url (const char *origurl, char **file, char **newloc,
 230               const char *refurl, int *dt)
 231 {
 232   uerr_t result;
 233   char *url;
 234   int location_changed, dummy;
 235   int use_proxy;
 236   char *mynewloc, *proxy;
 237   struct url *u;
 238   int up_error_code;            /* url parse error code */
 239   char *local_file;
 240   struct hash_table *redirections = NULL;
 241
 242   /* If dt is NULL, just ignore it.  */
 243   if (!dt)
 244     dt = &dummy;
 245   url = xstrdup (origurl);
 246   if (newloc)
 247     *newloc = NULL;
 248   if (file)
 249     *file = NULL;
 250
 251   u = url_parse (url, &up_error_code);
 252   if (!u)
 253     {
 254       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
 255       if (redirections)
 256         string_set_free (redirections);
 257       xfree (url);
 258       return URLERROR;
 259     }
 260
 261   if (!refurl)
 262     refurl = opt.referer;
 263
 264  redirected:
 265
 266   result = NOCONERROR;
 267   mynewloc = NULL;
 268   local_file = NULL;
 269
 270   use_proxy = USE_PROXY_P (u);
 271   if (use_proxy)
 272     {
 273       struct url *proxy_url;
 274
 275       /* Get the proxy server for the current scheme.  */
 276       proxy = getproxy (u->scheme);
 277       if (!proxy)
 278         {
 279           logputs (LOG_NOTQUIET, _("Could not find proxy host.\n"));
 280           url_free (u);
 281           if (redirections)
 282             string_set_free (redirections);
 283           xfree (url);
 284           return PROXERR;
 285         }
 286
 287       /* Parse the proxy URL.  */
 288       proxy_url = url_parse (proxy, &up_error_code);
 289       if (!proxy_url)
 290         {
 291           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 292                      proxy, url_error (up_error_code));
 293           if (redirections)
 294             string_set_free (redirections);
 295           xfree (url);
 296           return PROXERR;
 297         }
 298       if (proxy_url->scheme != SCHEME_HTTP)
 299         {
 300           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 301           url_free (proxy_url);
 302           if (redirections)
 303             string_set_free (redirections);
 304           xfree (url);
 305           return PROXERR;
 306         }
 307
 308       result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
 309       url_free (proxy_url);
 310     }
 311   else if (u->scheme == SCHEME_HTTP
 312 #ifdef HAVE_SSL
 313       || u->scheme == SCHEME_HTTPS
 314 #endif
 315       )
 316     {
 317       result = http_loop (u, &mynewloc, &local_file, refurl, dt, NULL);
 318     }
 319   else if (u->scheme == SCHEME_FTP)
 320     {
 321       /* If this is a redirection, we must not allow recursive FTP
 322          retrieval, so we save recursion to oldrec, and restore it
 323          later.  */
 324       int oldrec = opt.recursive;
 325       if (redirections)
 326         opt.recursive = 0;
 327       result = ftp_loop (u, dt);
 328       opt.recursive = oldrec;
 329 #if 0
 330       /* There is a possibility of having HTTP being redirected to
 331          FTP.  In these cases we must decide whether the text is HTML
 332          according to the suffix.  The HTML suffixes are `.html' and
 333          `.htm', case-insensitive.  */
 334       if (redirections && u->local && (u->scheme == SCHEME_FTP))
 335         {
 336           char *suf = suffix (u->local);
 337           if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
 338             *dt |= TEXTHTML;
 339           FREE_MAYBE (suf);
 340         }
 341 #endif
 342     }
 343   location_changed = (result == NEWLOCATION);
 344   if (location_changed)
 345     {
 346       char *construced_newloc;
 347       struct url *newloc_parsed;
 348
 349       assert (mynewloc != NULL);
 350
 351       if (local_file)
 352         xfree (local_file);
 353
 354       /* The HTTP specs only allow absolute URLs to appear in
 355          redirects, but a ton of boneheaded webservers and CGIs out
 356          there break the rules and use relative URLs, and popular
 357          browsers are lenient about this, so wget should be too. */
 358       construced_newloc = uri_merge (url, mynewloc);
 359       xfree (mynewloc);
 360       mynewloc = construced_newloc;
 361
 362       /* Now, see if this new location makes sense. */
 363       newloc_parsed = url_parse (mynewloc, &up_error_code);
 364       if (!newloc_parsed)
 365         {
 366           logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc,
 367                      url_error (up_error_code));
 368           url_free (u);
 369           if (redirections)
 370             string_set_free (redirections);
 371           xfree (url);
 372           xfree (mynewloc);
 373           return result;
 374         }
 375
 376       /* Now mynewloc will become newloc_parsed->url, because if the
 377          Location contained relative paths like .././something, we
 378          don't want that propagating as url.  */
 379       xfree (mynewloc);
 380       mynewloc = xstrdup (newloc_parsed->url);
 381
 382       if (!redirections)
 383         {
 384           redirections = make_string_hash_table (0);
 385           /* Add current URL immediately so we can detect it as soon
 386              as possible in case of a cycle. */
 387           string_set_add (redirections, u->url);
 388         }
 389
 390       /* The new location is OK.  Check for redirection cycle by
 391          peeking through the history of redirections. */
 392       if (string_set_contains (redirections, newloc_parsed->url))
 393         {
 394           logprintf (LOG_NOTQUIET, _("%s: Redirection cycle detected.\n"),
 395                      mynewloc);
 396           url_free (newloc_parsed);
 397           url_free (u);
 398           if (redirections)
 399             string_set_free (redirections);
 400           xfree (url);
 401           xfree (mynewloc);
 402           return WRONGCODE;
 403         }
 404       string_set_add (redirections, newloc_parsed->url);
 405
 406       xfree (url);
 407       url = mynewloc;
 408       url_free (u);
 409       u = newloc_parsed;
 410       goto redirected;
 411     }
 412
 413   if (local_file)
 414     {
 415       if (*dt & RETROKF)
 416         {
 417           register_download (url, local_file);
 418           if (redirections)
 419             register_all_redirections (redirections, url);
 420           if (*dt & TEXTHTML)
 421             register_html (url, local_file);
 422         }
 423     }
 424
 425   if (file)
 426     *file = local_file ? local_file : NULL;
 427   else
 428     FREE_MAYBE (local_file);
 429
 430   url_free (u);
 431   if (redirections)
 432     string_set_free (redirections);
 433
 434   if (newloc)
 435     *newloc = url;
 436   else
 437     xfree (url);
 438
 439   ++global_download_count;
 440
 441   return result;
 442 }
 443
 444 /* Find the URLs in the file and call retrieve_url() for each of
 445    them.  If HTML is non-zero, treat the file as HTML, and construct
 446    the URLs accordingly.
 447
 448    If opt.recursive is set, call recursive_retrieve() for each file.  */
 449 uerr_t
 450 retrieve_from_file (const char *file, int html, int *count)
 451 {
 452   uerr_t status;
 453   struct urlpos *url_list, *cur_url;
 454
 455   url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
 456               : get_urls_file (file));
 457   status = RETROK;             /* Suppose everything is OK.  */
 458   *count = 0;                  /* Reset the URL count.  */
 459
 460   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
 461     {
 462       char *filename = NULL, *new_file;
 463       int dt;
 464
 465       if (cur_url->ignore_when_downloading)
 466         continue;
 467
 468       if (downloaded_exceeds_quota ())
 469         {
 470           status = QUOTEXC;
 471           break;
 472         }
 473       if (opt.recursive && cur_url->url->scheme != SCHEME_FTP)
 474         status = retrieve_tree (cur_url->url->url);
 475       else
 476         status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
 477
 478       if (filename && opt.delete_after && file_exists_p (filename))
 479         {
 480           DEBUGP (("Removing file due to --delete-after in"
 481                    " retrieve_from_file():\n"));
 482           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
 483           if (unlink (filename))
 484             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 485           dt &= ~RETROKF;
 486         }
 487
 488       FREE_MAYBE (new_file);
 489       FREE_MAYBE (filename);
 490     }
 491
 492   /* Free the linked list of URL-s.  */
 493   free_urlpos (url_list);
 494
 495   return status;
 496 }
 497
 498 /* Print `giving up', or `retrying', depending on the impending
 499    action.  N1 and N2 are the attempt number and the attempt limit.  */
 500 void
 501 printwhat (int n1, int n2)
 502 {
 503   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
 504 }
 505
 506 /* Increment opt.downloaded by BY_HOW_MUCH.  If an overflow occurs,
 507    set opt.downloaded_overflow to 1. */
 508 void
 509 downloaded_increase (unsigned long by_how_much)
 510 {
 511   VERY_LONG_TYPE old;
 512   if (opt.downloaded_overflow)
 513     return;
 514   old = opt.downloaded;
 515   opt.downloaded += by_how_much;
 516   if (opt.downloaded < old)     /* carry flag, where are you when I
 517                                    need you? */
 518     {
 519       /* Overflow. */
 520       opt.downloaded_overflow = 1;
 521       opt.downloaded = ~((VERY_LONG_TYPE)0);
 522     }
 523 }
 524
 525 /* Return non-zero if the downloaded amount of bytes exceeds the
 526    desired quota.  If quota is not set or if the amount overflowed, 0
 527    is returned. */
 528 int
 529 downloaded_exceeds_quota (void)
 530 {
 531   if (!opt.quota)
 532     return 0;
 533   if (opt.downloaded_overflow)
 534     /* We don't really know.  (Wildly) assume not. */
 535     return 0;
 536
 537   return opt.downloaded > opt.quota;
 538 }
 539
 540 /* If opt.wait or opt.waitretry are specified, and if certain
 541    conditions are met, sleep the appropriate number of seconds.  See
 542    the documentation of --wait and --waitretry for more information.
 543
 544    COUNT is the count of current retrieval, beginning with 1. */
 545
 546 void
 547 sleep_between_retrievals (int count)
 548 {
 549   static int first_retrieval = 1;
 550
 551   if (!first_retrieval && (opt.wait || opt.waitretry))
 552     {
 553       if (opt.waitretry && count > 1)
 554         {
 555           /* If opt.waitretry is specified and this is a retry, wait
 556              for COUNT-1 number of seconds, or for opt.waitretry
 557              seconds.  */
 558           if (count <= opt.waitretry)
 559             sleep (count - 1);
 560           else
 561             sleep (opt.waitretry);
 562         }
 563       else if (opt.wait)
 564         /* Otherwise, check if opt.wait is specified.  If so, sleep.  */
 565         sleep (opt.wait);
 566     }
 567   if (first_retrieval)
 568     first_retrieval = 0;
 569 }