sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   3    2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation,
   4    Inc.
   5
   6 This file is part of GNU Wget.
   7
   8 GNU Wget is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 3 of the License, or (at
  11 your option) any later version.
  12
  13 GNU Wget is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  20
  21 Additional permission under GNU GPL version 3 section 7
  22
  23 If you modify this program, or any covered work, by linking or
  24 combining it with the OpenSSL project's OpenSSL library (or a
  25 modified version of that library), containing parts covered by the
  26 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  27 grants you additional permission to convey the resulting work.
  28 Corresponding Source for a non-source form of such a combination
  29 shall include the source code for the parts of OpenSSL used as well
  30 as that of the covered work.  */
  31
  32 #include "wget.h"
  33
  34 #include <stdio.h>
  35 #include <stdlib.h>
  36 #include <unistd.h>
  37 #include <errno.h>
  38 #include <string.h>
  39 #include <assert.h>
  40 #ifdef VMS
  41 # include <unixio.h>            /* For delete(). */
  42 #endif
  43
  44 #include "exits.h"
  45 #include "utils.h"
  46 #include "retr.h"
  47 #include "progress.h"
  48 #include "url.h"
  49 #include "recur.h"
  50 #include "ftp.h"
  51 #include "http.h"
  52 #include "host.h"
  53 #include "connect.h"
  54 #include "hash.h"
  55 #include "convert.h"
  56 #include "ptimer.h"
  57 #include "html-url.h"
  58 #include "iri.h"
  59
/* Total size of downloaded files, accumulated over the whole run.
   Used to enforce quota.  */
SUM_SIZE_INT total_downloaded_bytes;

/* Total download time in seconds, accumulated over the whole run.  */
double total_download_time;

/* If non-NULL, the stream to which output should be written.  This
   stream is initialized when `-O' is used.  */
FILE *output_stream;

/* Whether output_document is a regular file we can manipulate,
   i.e. not `-' or a device file.  */
bool output_stream_regular;
  73 \f
/* State kept by limit_bandwidth between calls while throttling a
   download (see --limit-rate handling in fd_read_body).  */
static struct {
  wgint chunk_bytes;            /* bytes received since the current
                                   chunk started */
  double chunk_start;           /* timer reading at which the current
                                   chunk started */
  double sleep_adjust;          /* desired minus actual length of the
                                   last sleep; added to the next one */
} limit_data;
  79
  80 static void
  81 limit_bandwidth_reset (void)
  82 {
  83   xzero (limit_data);
  84 }
  85
  86 /* Limit the bandwidth by pausing the download for an amount of time.
  87    BYTES is the number of bytes received from the network, and TIMER
  88    is the timer that started at the beginning of download.  */
  89
  90 static void
  91 limit_bandwidth (wgint bytes, struct ptimer *timer)
  92 {
  93   double delta_t = ptimer_read (timer) - limit_data.chunk_start;
  94   double expected;
  95
  96   limit_data.chunk_bytes += bytes;
  97
  98   /* Calculate the amount of time we expect downloading the chunk
  99      should take.  If in reality it took less time, sleep to
 100      compensate for the difference.  */
 101   expected = (double) limit_data.chunk_bytes / opt.limit_rate;
 102
 103   if (expected > delta_t)
 104     {
 105       double slp = expected - delta_t + limit_data.sleep_adjust;
 106       double t0, t1;
 107       if (slp < 0.2)
 108         {
 109           DEBUGP (("deferring a %.2f ms sleep (%s/%.2f).\n",
 110                    slp * 1000, number_to_static_string (limit_data.chunk_bytes),
 111                    delta_t));
 112           return;
 113         }
 114       DEBUGP (("\nsleeping %.2f ms for %s bytes, adjust %.2f ms\n",
 115                slp * 1000, number_to_static_string (limit_data.chunk_bytes),
 116                limit_data.sleep_adjust));
 117
 118       t0 = ptimer_read (timer);
 119       xsleep (slp);
 120       t1 = ptimer_measure (timer);
 121
 122       /* Due to scheduling, we probably slept slightly longer (or
 123          shorter) than desired.  Calculate the difference between the
 124          desired and the actual sleep, and adjust the next sleep by
 125          that amount.  */
 126       limit_data.sleep_adjust = slp - (t1 - t0);
 127       /* If sleep_adjust is very large, it's likely due to suspension
 128          and not clock inaccuracy.  Don't enforce those.  */
 129       if (limit_data.sleep_adjust > 0.5)
 130         limit_data.sleep_adjust = 0.5;
 131       else if (limit_data.sleep_adjust < -0.5)
 132         limit_data.sleep_adjust = -0.5;
 133     }
 134
 135   limit_data.chunk_bytes = 0;
 136   limit_data.chunk_start = ptimer_read (timer);
 137 }
 138
 139 #ifndef MIN
 140 # define MIN(i, j) ((i) <= (j) ? (i) : (j))
 141 #endif
 142
 143 /* Write data in BUF to OUT.  However, if *SKIP is non-zero, skip that
 144    amount of data and decrease SKIP.  Increment *TOTAL by the amount
 145    of data written.  If OUT2 is not NULL, also write BUF to OUT2.
 146    In case of error writing to OUT, -1 is returned.  In case of error
 147    writing to OUT2, -2 is returned.  Return 1 if the whole BUF was
 148    skipped.  */
 149
 150 static int
 151 write_data (FILE *out, FILE *out2, const char *buf, int bufsize,
 152             wgint *skip, wgint *written)
 153 {
 154   if (out == NULL && out2 == NULL)
 155     return 1;
 156   if (*skip > bufsize)
 157     {
 158       *skip -= bufsize;
 159       return 1;
 160     }
 161   if (*skip)
 162     {
 163       buf += *skip;
 164       bufsize -= *skip;
 165       *skip = 0;
 166       if (bufsize == 0)
 167         return 1;
 168     }
 169
 170   if (out != NULL)
 171     fwrite (buf, 1, bufsize, out);
 172   if (out2 != NULL)
 173     fwrite (buf, 1, bufsize, out2);
 174   *written += bufsize;
 175
 176   /* Immediately flush the downloaded data.  This should not hinder
 177      performance: fast downloads will arrive in large 16K chunks
 178      (which stdio would write out immediately anyway), and slow
 179      downloads wouldn't be limited by disk speed.  */
 180
 181   /* 2005-04-20 SMS.
 182      Perhaps it shouldn't hinder performance, but it sure does, at least
 183      on VMS (more than 2X).  Rather than speculate on what it should or
 184      shouldn't do, it might make more sense to test it.  Even better, it
 185      might be nice to explain what possible benefit it could offer, as
 186      it appears to be a clear invitation to poor performance with no
 187      actual justification.  (Also, why 16K?  Anyone test other values?)
 188   */
 189 #ifndef __VMS
 190   if (out != NULL)
 191     fflush (out);
 192   if (out2 != NULL)
 193     fflush (out2);
 194 #endif /* ndef __VMS */
 195   if (out != NULL && ferror (out))
 196     return -1;
 197   else if (out2 != NULL && ferror (out2))
 198     return -2;
 199   else
 200     return 0;
 201 }
 202
 203 /* Read the contents of file descriptor FD until it the connection
 204    terminates or a read error occurs.  The data is read in portions of
 205    up to 16K and written to OUT as it arrives.  If opt.verbose is set,
 206    the progress is shown.
 207
 208    TOREAD is the amount of data expected to arrive, normally only used
 209    by the progress gauge.
 210
 211    STARTPOS is the position from which the download starts, used by
 212    the progress gauge.  If QTYREAD is non-NULL, the value it points to
 213    is incremented by the amount of data read from the network.  If
 214    QTYWRITTEN is non-NULL, the value it points to is incremented by
 215    the amount of data written to disk.  The time it took to download
 216    the data is stored to ELAPSED.
 217
 218    If OUT2 is non-NULL, the contents is also written to OUT2.
 219    OUT2 will get an exact copy of the response: if this is a chunked
 220    response, everything -- including the chunk headers -- is written
 221    to OUT2.  (OUT will only get the unchunked response.)
 222
 223    The function exits and returns the amount of data read.  In case of
 224    error while reading data, -1 is returned.  In case of error while
 225    writing data to OUT, -2 is returned.  In case of error while writing
 226    data to OUT2, -3 is returned.  */
 227
 228 int
 229 fd_read_body (const char *downloaded_filename, int fd, FILE *out, wgint toread, wgint startpos,
 230
 231               wgint *qtyread, wgint *qtywritten, double *elapsed, int flags,
 232               FILE *out2)
 233 {
 234   int ret = 0;
 235 #undef max
 236 #define max(a,b) ((a) > (b) ? (a) : (b))
 237   int dlbufsize = max (BUFSIZ, 8 * 1024);
 238   char *dlbuf = xmalloc (dlbufsize);
 239
 240   struct ptimer *timer = NULL;
 241   double last_successful_read_tm = 0;
 242
 243   /* The progress gauge, set according to the user preferences. */
 244   void *progress = NULL;
 245
 246   /* Non-zero if the progress gauge is interactive, i.e. if it can
 247      continually update the display.  When true, smaller timeout
 248      values are used so that the gauge can update the display when
 249      data arrives slowly. */
 250   bool progress_interactive = false;
 251
 252   bool exact = !!(flags & rb_read_exactly);
 253
 254   /* Used only by HTTP/HTTPS chunked transfer encoding.  */
 255   bool chunked = flags & rb_chunked_transfer_encoding;
 256   wgint skip = 0;
 257
 258   /* How much data we've read/written.  */
 259   wgint sum_read = 0;
 260   wgint sum_written = 0;
 261   wgint remaining_chunk_size = 0;
 262
 263   if (flags & rb_skip_startpos)
 264     skip = startpos;
 265
 266   if (opt.show_progress)
 267     {
 268       /* If we're skipping STARTPOS bytes, pass 0 as the INITIAL
 269          argument to progress_create because the indicator doesn't
 270          (yet) know about "skipping" data.  */
 271       wgint start = skip ? 0 : startpos;
 272       progress = progress_create (downloaded_filename, start, start + toread);
 273       progress_interactive = progress_interactive_p (progress);
 274     }
 275
 276   if (opt.limit_rate)
 277     limit_bandwidth_reset ();
 278
 279   /* A timer is needed for tracking progress, for throttling, and for
 280      tracking elapsed time.  If either of these are requested, start
 281      the timer.  */
 282   if (progress || opt.limit_rate || elapsed)
 283     {
 284       timer = ptimer_new ();
 285       last_successful_read_tm = 0;
 286     }
 287
 288   /* Use a smaller buffer for low requested bandwidths.  For example,
 289      with --limit-rate=2k, it doesn't make sense to slurp in 16K of
 290      data and then sleep for 8s.  With buffer size equal to the limit,
 291      we never have to sleep for more than one second.  */
 292   if (opt.limit_rate && opt.limit_rate < dlbufsize)
 293     dlbufsize = opt.limit_rate;
 294
 295   /* Read from FD while there is data to read.  Normally toread==0
 296      means that it is unknown how much data is to arrive.  However, if
 297      EXACT is set, then toread==0 means what it says: that no data
 298      should be read.  */
 299   while (!exact || (sum_read < toread))
 300     {
 301       int rdsize;
 302       double tmout = opt.read_timeout;
 303
 304       if (chunked)
 305         {
 306           if (remaining_chunk_size == 0)
 307             {
 308               char *line = fd_read_line (fd);
 309               char *endl;
 310               if (line == NULL)
 311                 {
 312                   ret = -1;
 313                   break;
 314                 }
 315               else if (out2 != NULL)
 316                 fwrite (line, 1, strlen (line), out2);
 317
 318               remaining_chunk_size = strtol (line, &endl, 16);
 319               xfree (line);
 320
 321               if (remaining_chunk_size == 0)
 322                 {
 323                   ret = 0;
 324                   line = fd_read_line (fd);
 325                   if (line == NULL)
 326                     ret = -1;
 327                   else
 328                     {
 329                       if (out2 != NULL)
 330                         fwrite (line, 1, strlen (line), out2);
 331                       xfree (line);
 332                     }
 333                   break;
 334                 }
 335             }
 336
 337           rdsize = MIN (remaining_chunk_size, dlbufsize);
 338         }
 339       else
 340         rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize;
 341
 342       if (progress_interactive)
 343         {
 344           /* For interactive progress gauges, always specify a ~1s
 345              timeout, so that the gauge can be updated regularly even
 346              when the data arrives very slowly or stalls.  */
 347           tmout = 0.95;
 348           if (opt.read_timeout)
 349             {
 350               double waittm;
 351               waittm = ptimer_read (timer) - last_successful_read_tm;
 352               if (waittm + tmout > opt.read_timeout)
 353                 {
 354                   /* Don't let total idle time exceed read timeout. */
 355                   tmout = opt.read_timeout - waittm;
 356                   if (tmout < 0)
 357                     {
 358                       /* We've already exceeded the timeout. */
 359                       ret = -1, errno = ETIMEDOUT;
 360                       break;
 361                     }
 362                 }
 363             }
 364         }
 365       ret = fd_read (fd, dlbuf, rdsize, tmout);
 366
 367       if (progress_interactive && ret < 0 && errno == ETIMEDOUT)
 368         ret = 0;                /* interactive timeout, handled above */
 369       else if (ret <= 0)
 370         break;                  /* EOF or read error */
 371
 372       if (progress || opt.limit_rate || elapsed)
 373         {
 374           ptimer_measure (timer);
 375           if (ret > 0)
 376             last_successful_read_tm = ptimer_read (timer);
 377         }
 378
 379       if (ret > 0)
 380         {
 381           sum_read += ret;
 382           int write_res = write_data (out, out2, dlbuf, ret, &skip, &sum_written);
 383           if (write_res < 0)
 384             {
 385               ret = (write_res == -3) ? -3 : -2;
 386               goto out;
 387             }
 388           if (chunked)
 389             {
 390               remaining_chunk_size -= ret;
 391               if (remaining_chunk_size == 0)
 392                 {
 393                   char *line = fd_read_line (fd);
 394                   if (line == NULL)
 395                     {
 396                       ret = -1;
 397                       break;
 398                     }
 399                   else
 400                     {
 401                       if (out2 != NULL)
 402                         fwrite (line, 1, strlen (line), out2);
 403                       xfree (line);
 404                     }
 405                 }
 406             }
 407         }
 408
 409       if (opt.limit_rate)
 410         limit_bandwidth (ret, timer);
 411
 412       if (progress)
 413         progress_update (progress, ret, ptimer_read (timer));
 414 #ifdef WINDOWS
 415       if (toread > 0 && opt.show_progress)
 416         ws_percenttitle (100.0 *
 417                          (startpos + sum_read) / (startpos + toread));
 418 #endif
 419     }
 420   if (ret < -1)
 421     ret = -1;
 422
 423  out:
 424   if (progress)
 425     progress_finish (progress, ptimer_read (timer));
 426
 427   if (elapsed)
 428     *elapsed = ptimer_read (timer);
 429   if (timer)
 430     ptimer_destroy (timer);
 431
 432   if (qtyread)
 433     *qtyread += sum_read;
 434   if (qtywritten)
 435     *qtywritten += sum_written;
 436
 437   free (dlbuf);
 438
 439   return ret;
 440 }
 441 \f
 442 /* Read a hunk of data from FD, up until a terminator.  The hunk is
 443    limited by whatever the TERMINATOR callback chooses as its
 444    terminator.  For example, if terminator stops at newline, the hunk
 445    will consist of a line of data; if terminator stops at two
 446    newlines, it can be used to read the head of an HTTP response.
 447    Upon determining the boundary, the function returns the data (up to
 448    the terminator) in malloc-allocated storage.
 449
 450    In case of read error, NULL is returned.  In case of EOF and no
 451    data read, NULL is returned and errno set to 0.  In case of having
 452    read some data, but encountering EOF before seeing the terminator,
 453    the data that has been read is returned, but it will (obviously)
 454    not contain the terminator.
 455
 456    The TERMINATOR function is called with three arguments: the
 457    beginning of the data read so far, the beginning of the current
 458    block of peeked-at data, and the length of the current block.
 459    Depending on its needs, the function is free to choose whether to
 460    analyze all data or just the newly arrived data.  If TERMINATOR
 461    returns NULL, it means that the terminator has not been seen.
 462    Otherwise it should return a pointer to the charactre immediately
 463    following the terminator.
 464
 465    The idea is to be able to read a line of input, or otherwise a hunk
 466    of text, such as the head of an HTTP request, without crossing the
 467    boundary, so that the next call to fd_read etc. reads the data
 468    after the hunk.  To achieve that, this function does the following:
 469
 470    1. Peek at incoming data.
 471
 472    2. Determine whether the peeked data, along with the previously
 473       read data, includes the terminator.
 474
 475       2a. If yes, read the data until the end of the terminator, and
 476           exit.
 477
 478       2b. If no, read the peeked data and goto 1.
 479
 480    The function is careful to assume as little as possible about the
 481    implementation of peeking.  For example, every peek is followed by
 482    a read.  If the read returns a different amount of data, the
 483    process is retried until all data arrives safely.
 484
 485    SIZEHINT is the buffer size sufficient to hold all the data in the
 486    typical case (it is used as the initial buffer size).  MAXSIZE is
 487    the maximum amount of memory this function is allowed to allocate,
 488    or 0 if no upper limit is to be enforced.
 489
 490    This function should be used as a building block for other
 491    functions -- see fd_read_line as a simple example.  */
 492
 493 char *
 494 fd_read_hunk (int fd, hunk_terminator_t terminator, long sizehint, long maxsize)
 495 {
 496   long bufsize = sizehint;
 497   char *hunk = xmalloc (bufsize);
 498   int tail = 0;                 /* tail position in HUNK */
 499
 500   assert (!maxsize || maxsize >= bufsize);
 501
 502   while (1)
 503     {
 504       const char *end;
 505       int pklen, rdlen, remain;
 506
 507       /* First, peek at the available data. */
 508
 509       pklen = fd_peek (fd, hunk + tail, bufsize - 1 - tail, -1);
 510       if (pklen < 0)
 511         {
 512           xfree (hunk);
 513           return NULL;
 514         }
 515       end = terminator (hunk, hunk + tail, pklen);
 516       if (end)
 517         {
 518           /* The data contains the terminator: we'll drain the data up
 519              to the end of the terminator.  */
 520           remain = end - (hunk + tail);
 521           assert (remain >= 0);
 522           if (remain == 0)
 523             {
 524               /* No more data needs to be read. */
 525               hunk[tail] = '\0';
 526               return hunk;
 527             }
 528           if (bufsize - 1 < tail + remain)
 529             {
 530               bufsize = tail + remain + 1;
 531               hunk = xrealloc (hunk, bufsize);
 532             }
 533         }
 534       else
 535         /* No terminator: simply read the data we know is (or should
 536            be) available.  */
 537         remain = pklen;
 538
 539       /* Now, read the data.  Note that we make no assumptions about
 540          how much data we'll get.  (Some TCP stacks are notorious for
 541          read returning less data than the previous MSG_PEEK.)  */
 542
 543       rdlen = fd_read (fd, hunk + tail, remain, 0);
 544       if (rdlen < 0)
 545         {
 546           xfree_null (hunk);
 547           return NULL;
 548         }
 549       tail += rdlen;
 550       hunk[tail] = '\0';
 551
 552       if (rdlen == 0)
 553         {
 554           if (tail == 0)
 555             {
 556               /* EOF without anything having been read */
 557               xfree (hunk);
 558               errno = 0;
 559               return NULL;
 560             }
 561           else
 562             /* EOF seen: return the data we've read. */
 563             return hunk;
 564         }
 565       if (end && rdlen == remain)
 566         /* The terminator was seen and the remaining data drained --
 567            we got what we came for.  */
 568         return hunk;
 569
 570       /* Keep looping until all the data arrives. */
 571
 572       if (tail == bufsize - 1)
 573         {
 574           /* Double the buffer size, but refuse to allocate more than
 575              MAXSIZE bytes.  */
 576           if (maxsize && bufsize >= maxsize)
 577             {
 578               xfree (hunk);
 579               errno = ENOMEM;
 580               return NULL;
 581             }
 582           bufsize <<= 1;
 583           if (maxsize && bufsize > maxsize)
 584             bufsize = maxsize;
 585           hunk = xrealloc (hunk, bufsize);
 586         }
 587     }
 588 }
 589
 590 static const char *
 591 line_terminator (const char *start, const char *peeked, int peeklen)
 592 {
 593   const char *p = memchr (peeked, '\n', peeklen);
 594   if (p)
 595     /* p+1 because the line must include '\n' */
 596     return p + 1;
 597   return NULL;
 598 }
 599
 600 /* The maximum size of the single line we agree to accept.  This is
 601    not meant to impose an arbitrary limit, but to protect the user
 602    from Wget slurping up available memory upon encountering malicious
 603    or buggy server output.  Define it to 0 to remove the limit.  */
 604 #define FD_READ_LINE_MAX 4096
 605
 606 /* Read one line from FD and return it.  The line is allocated using
 607    malloc, but is never larger than FD_READ_LINE_MAX.
 608
 609    If an error occurs, or if no data can be read, NULL is returned.
 610    In the former case errno indicates the error condition, and in the
 611    latter case, errno is NULL.  */
 612
 613 char *
 614 fd_read_line (int fd)
 615 {
 616   return fd_read_hunk (fd, line_terminator, 128, FD_READ_LINE_MAX);
 617 }
 618 \f
 619 /* Return a printed representation of the download rate, along with
 620    the units appropriate for the download speed.  */
 621
 622 const char *
 623 retr_rate (wgint bytes, double secs)
 624 {
 625   static char res[20];
 626   static const char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 627   static const char *rate_names_bits[] = {"b/s", "Kb/s", "Mb/s", "Gb/s" };
 628   int units;
 629
 630   double dlrate = calc_rate (bytes, secs, &units);
 631   /* Use more digits for smaller numbers (regardless of unit used),
 632      e.g. "1022", "247", "12.5", "2.38".  */
 633   sprintf (res, "%.*f %s",
 634            dlrate >= 99.95 ? 0 : dlrate >= 9.995 ? 1 : 2,
 635            dlrate, !opt.report_bps ? rate_names[units]: rate_names_bits[units]);
 636
 637   return res;
 638 }
 639
 640 /* Calculate the download rate and trim it as appropriate for the
 641    speed.  Appropriate means that if rate is greater than 1K/s,
 642    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 643    are used.
 644
 645    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 646    GB/s.  */
 647
 648 double
 649 calc_rate (wgint bytes, double secs, int *units)
 650 {
 651   double dlrate;
 652   double bibyte = 1000.0;
 653
 654   if (!opt.report_bps)
 655     bibyte = 1024.0;
 656
 657
 658   assert (secs >= 0);
 659   assert (bytes >= 0);
 660
 661   if (secs == 0)
 662     /* If elapsed time is exactly zero, it means we're under the
 663        resolution of the timer.  This can easily happen on systems
 664        that use time() for the timer.  Since the interval lies between
 665        0 and the timer's resolution, assume half the resolution.  */
 666     secs = ptimer_resolution () / 2.0;
 667
 668   dlrate = convert_to_bits (bytes) / secs;
 669   if (dlrate < bibyte)
 670     *units = 0;
 671   else if (dlrate < (bibyte * bibyte))
 672     *units = 1, dlrate /= bibyte;
 673   else if (dlrate < (bibyte * bibyte * bibyte))
 674     *units = 2, dlrate /= (bibyte * bibyte);
 675
 676   else
 677     /* Maybe someone will need this, one day. */
 678     *units = 3, dlrate /= (bibyte * bibyte * bibyte);
 679
 680   return dlrate;
 681 }
 682 \f
 683
/* Stash the request method and body settings (opt.method,
   opt.body_data, opt.body_file) and clear them in OPT, recording the
   fact in method_suspended.  Expects the locals method_suspended,
   saved_body_data, saved_body_file_name and saved_method to be in
   scope, as they are in retrieve_url.  Undone by RESTORE_METHOD.  */
#define SUSPEND_METHOD do {                     \
  method_suspended = true;                      \
  saved_body_data = opt.body_data;              \
  saved_body_file_name = opt.body_file;         \
  saved_method = opt.method;                    \
  opt.body_data = NULL;                         \
  opt.body_file = NULL;                         \
  opt.method = NULL;                            \
} while (0)
 693
/* Put back the request method and body settings stashed by
   SUSPEND_METHOD and clear method_suspended.  Does nothing if the
   method is not currently suspended, so it is safe to use on every
   exit path.  Expects the same locals as SUSPEND_METHOD.  */
#define RESTORE_METHOD do {                             \
  if (method_suspended)                                 \
    {                                                   \
      opt.body_data = saved_body_data;                  \
      opt.body_file = saved_body_file_name;             \
      opt.method = saved_method;                        \
      method_suspended = false;                         \
    }                                                   \
} while (0)
 703
/* Forward declaration; the definition is not in this part of the
   file.  retrieve_url treats a non-NULL result as the proxy URL to
   parse for the given URL.  */
static char *getproxy (struct url *);
 705
/* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
   proxy, etc.  */
 708
 709 /* #### This function should be rewritten so it doesn't return from
 710    multiple points. */
 711
 712 uerr_t
 713 retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
 714               char **newloc, const char *refurl, int *dt, bool recursive,
 715               struct iri *iri, bool register_status)
 716 {
 717   uerr_t result;
 718   char *url;
 719   bool location_changed;
 720   bool iri_fallbacked = 0;
 721   int dummy;
 722   char *mynewloc, *proxy;
 723   struct url *u = orig_parsed, *proxy_url;
 724   int up_error_code;            /* url parse error code */
 725   char *local_file;
 726   int redirection_count = 0;
 727
 728   bool method_suspended = false;
 729   char *saved_body_data = NULL;
 730   char *saved_method = NULL;
 731   char *saved_body_file_name = NULL;
 732
 733   /* If dt is NULL, use local storage.  */
 734   if (!dt)
 735     {
 736       dt = &dummy;
 737       dummy = 0;
 738     }
 739   url = xstrdup (origurl);
 740   if (newloc)
 741     *newloc = NULL;
 742   if (file)
 743     *file = NULL;
 744
 745   if (!refurl)
 746     refurl = opt.referer;
 747
 748  redirected:
 749   /* (also for IRI fallbacking) */
 750
 751   result = NOCONERROR;
 752   mynewloc = NULL;
 753   local_file = NULL;
 754   proxy_url = NULL;
 755
 756   proxy = getproxy (u);
 757   if (proxy)
 758     {
 759       struct iri *pi = iri_new ();
 760       set_uri_encoding (pi, opt.locale, true);
 761       pi->utf8_encode = false;
 762
 763       /* Parse the proxy URL.  */
 764       proxy_url = url_parse (proxy, &up_error_code, NULL, true);
 765       if (!proxy_url)
 766         {
 767           char *error = url_error (proxy, up_error_code);
 768           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 769                      proxy, error);
 770           xfree (url);
 771           xfree (error);
 772           RESTORE_METHOD;
 773           result = PROXERR;
 774           goto bail;
 775         }
 776       if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
 777         {
 778           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 779           url_free (proxy_url);
 780           xfree (url);
 781           RESTORE_METHOD;
 782           result = PROXERR;
 783           goto bail;
 784         }
 785     }
 786
 787   if (u->scheme == SCHEME_HTTP
 788 #ifdef HAVE_SSL
 789       || u->scheme == SCHEME_HTTPS
 790 #endif
 791       || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
 792     {
 793       result = http_loop (u, orig_parsed, &mynewloc, &local_file, refurl, dt,
 794                           proxy_url, iri);
 795     }
 796   else if (u->scheme == SCHEME_FTP)
 797     {
 798       /* If this is a redirection, temporarily turn off opt.ftp_glob
 799          and opt.recursive, both being undesirable when following
 800          redirects.  */
 801       bool oldrec = recursive, glob = opt.ftp_glob;
 802       if (redirection_count)
 803         oldrec = glob = false;
 804
 805       result = ftp_loop (u, &local_file, dt, proxy_url, recursive, glob);
 806       recursive = oldrec;
 807
 808       /* There is a possibility of having HTTP being redirected to
 809          FTP.  In these cases we must decide whether the text is HTML
 810          according to the suffix.  The HTML suffixes are `.html',
 811          `.htm' and a few others, case-insensitive.  */
 812       if (redirection_count && local_file && u->scheme == SCHEME_FTP)
 813         {
 814           if (has_html_suffix_p (local_file))
 815             *dt |= TEXTHTML;
 816         }
 817     }
 818
 819   if (proxy_url)
 820     {
 821       url_free (proxy_url);
 822       proxy_url = NULL;
 823     }
 824
 825   location_changed = (result == NEWLOCATION || result == NEWLOCATION_KEEP_POST);
 826   if (location_changed)
 827     {
 828       char *construced_newloc;
 829       struct url *newloc_parsed;
 830
 831       assert (mynewloc != NULL);
 832
 833       if (local_file)
 834         xfree (local_file);
 835
 836       /* The HTTP specs only allow absolute URLs to appear in
 837          redirects, but a ton of boneheaded webservers and CGIs out
 838          there break the rules and use relative URLs, and popular
 839          browsers are lenient about this, so wget should be too. */
 840       construced_newloc = uri_merge (url, mynewloc);
 841       xfree (mynewloc);
 842       mynewloc = construced_newloc;
 843
 844       /* Reset UTF-8 encoding state, keep the URI encoding and reset
 845          the content encoding. */
 846       iri->utf8_encode = opt.enable_iri;
 847       set_content_encoding (iri, NULL);
 848       xfree_null (iri->orig_url);
 849       iri->orig_url = NULL;
 850
 851       /* Now, see if this new location makes sense. */
 852       newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
 853       if (!newloc_parsed)
 854         {
 855           char *error = url_error (mynewloc, up_error_code);
 856           logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
 857                      error);
 858           if (orig_parsed != u)
 859             {
 860               url_free (u);
 861             }
 862           xfree (url);
 863           xfree (mynewloc);
 864           xfree (error);
 865           RESTORE_METHOD;
 866           goto bail;
 867         }
 868
 869       /* Now mynewloc will become newloc_parsed->url, because if the
 870          Location contained relative paths like .././something, we
 871          don't want that propagating as url.  */
 872       xfree (mynewloc);
 873       mynewloc = xstrdup (newloc_parsed->url);
 874
 875       /* Check for max. number of redirections.  */
 876       if (++redirection_count > opt.max_redirect)
 877         {
 878           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
 879                      opt.max_redirect);
 880           url_free (newloc_parsed);
 881           if (orig_parsed != u)
 882             {
 883               url_free (u);
 884             }
 885           xfree (url);
 886           xfree (mynewloc);
 887           RESTORE_METHOD;
 888           result = WRONGCODE;
 889           goto bail;
 890         }
 891
 892       xfree (url);
 893       url = mynewloc;
 894       if (orig_parsed != u)
 895         {
 896           url_free (u);
 897         }
 898       u = newloc_parsed;
 899
 900       /* If we're being redirected from POST, and we received a
 901          redirect code different than 307, we don't want to POST
 902          again.  Many requests answer POST with a redirection to an
 903          index page; that redirection is clearly a GET.  We "suspend"
 904          POST data for the duration of the redirections, and restore
 905          it when we're done.
 906
 907          RFC2616 HTTP/1.1 introduces code 307 Temporary Redirect
 908          specifically to preserve the method of the request.
 909          */
 910       if (result != NEWLOCATION_KEEP_POST && !method_suspended)
 911         SUSPEND_METHOD;
 912
 913       goto redirected;
 914     }
 915
 916   /* Try to not encode in UTF-8 if fetching failed */
 917   if (!(*dt & RETROKF) && iri->utf8_encode)
 918     {
 919       iri->utf8_encode = false;
 920       if (orig_parsed != u)
 921         {
 922           url_free (u);
 923         }
 924       u = url_parse (origurl, NULL, iri, true);
 925       if (u)
 926         {
 927           DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
 928           url = xstrdup (u->url);
 929           iri_fallbacked = 1;
 930           goto redirected;
 931         }
 932       else
 933           DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url)));
 934     }
 935
 936   if (local_file && u && *dt & RETROKF)
 937     {
 938       register_download (u->url, local_file);
 939
 940       if (!opt.spider && redirection_count && 0 != strcmp (origurl, u->url))
 941         register_redirection (origurl, u->url);
 942
 943       if (*dt & TEXTHTML)
 944         register_html (local_file);
 945
 946       if (*dt & TEXTCSS)
 947         register_css (local_file);
 948     }
 949
 950   if (file)
 951     *file = local_file ? local_file : NULL;
 952   else
 953     xfree_null (local_file);
 954
 955   if (orig_parsed != u)
 956     {
 957       url_free (u);
 958     }
 959
 960   if (redirection_count || iri_fallbacked)
 961     {
 962       if (newloc)
 963         *newloc = url;
 964       else
 965         xfree (url);
 966     }
 967   else
 968     {
 969       if (newloc)
 970         *newloc = NULL;
 971       xfree (url);
 972     }
 973
 974   RESTORE_METHOD;
 975
 976 bail:
 977   if (register_status)
 978     inform_exit_status (result);
 979   return result;
 980 }
 981
 982 /* Find the URLs in the file and call retrieve_url() for each of them.
 983    If HTML is true, treat the file as HTML, and construct the URLs
 984    accordingly.
 985
 986    If opt.recursive is set, call retrieve_tree() for each file.  */
 987
 988 uerr_t
 989 retrieve_from_file (const char *file, bool html, int *count)
 990 {
 991   uerr_t status;
 992   struct urlpos *url_list, *cur_url;
 993   struct iri *iri = iri_new();
 994
 995   char *input_file, *url_file = NULL;
 996   const char *url = file;
 997
 998   status = RETROK;             /* Suppose everything is OK.  */
 999   *count = 0;                  /* Reset the URL count.  */
1000
1001   /* sXXXav : Assume filename and links in the file are in the locale */
1002   set_uri_encoding (iri, opt.locale, true);
1003   set_content_encoding (iri, opt.locale);
1004
1005   if (url_valid_scheme (url))
1006     {
1007       int dt,url_err;
1008       uerr_t status;
1009       struct url *url_parsed = url_parse (url, &url_err, iri, true);
1010       if (!url_parsed)
1011         {
1012           char *error = url_error (url, url_err);
1013           logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
1014           xfree (error);
1015           return URLERROR;
1016         }
1017
1018       if (!opt.base_href)
1019         opt.base_href = xstrdup (url);
1020
1021       status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
1022                              false, iri, true);
1023       url_free (url_parsed);
1024
1025       if (!url_file || (status != RETROK))
1026         return status;
1027
1028       if (dt & TEXTHTML)
1029         html = true;
1030
1031       /* If we have a found a content encoding, use it.
1032        * ( == is okay, because we're checking for identical object) */
1033       if (iri->content_encoding != opt.locale)
1034           set_uri_encoding (iri, iri->content_encoding, false);
1035
1036       /* Reset UTF-8 encode status */
1037       iri->utf8_encode = opt.enable_iri;
1038       xfree_null (iri->orig_url);
1039       iri->orig_url = NULL;
1040
1041       input_file = url_file;
1042     }
1043   else
1044     input_file = (char *) file;
1045
1046   url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
1047               : get_urls_file (input_file));
1048
1049   xfree_null (url_file);
1050
1051   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
1052     {
1053       char *filename = NULL, *new_file = NULL;
1054       int dt;
1055       struct iri *tmpiri = iri_dup (iri);
1056       struct url *parsed_url = NULL;
1057
1058       if (cur_url->ignore_when_downloading)
1059         continue;
1060
1061       if (opt.quota && total_downloaded_bytes > opt.quota)
1062         {
1063           status = QUOTEXC;
1064           break;
1065         }
1066
1067       parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true);
1068
1069       if ((opt.recursive || opt.page_requisites)
1070           && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
1071         {
1072           int old_follow_ftp = opt.follow_ftp;
1073
1074           /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
1075           if (cur_url->url->scheme == SCHEME_FTP)
1076             opt.follow_ftp = 1;
1077
1078           status = retrieve_tree (parsed_url ? parsed_url : cur_url->url,
1079                                   tmpiri);
1080
1081           opt.follow_ftp = old_follow_ftp;
1082         }
1083       else
1084         status = retrieve_url (parsed_url ? parsed_url : cur_url->url,
1085                                cur_url->url->url, &filename,
1086                                &new_file, NULL, &dt, opt.recursive, tmpiri,
1087                                true);
1088
1089       if (parsed_url)
1090           url_free (parsed_url);
1091
1092       if (filename && opt.delete_after && file_exists_p (filename))
1093         {
1094           DEBUGP (("\
1095 Removing file due to --delete-after in retrieve_from_file():\n"));
1096           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
1097           if (unlink (filename))
1098             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
1099           dt &= ~RETROKF;
1100         }
1101
1102       xfree_null (new_file);
1103       xfree_null (filename);
1104       iri_free (tmpiri);
1105     }
1106
1107   /* Free the linked list of URL-s.  */
1108   free_urlpos (url_list);
1109
1110   iri_free (iri);
1111
1112   return status;
1113 }
1114
1115 /* Print `giving up', or `retrying', depending on the impending
1116    action.  N1 and N2 are the attempt number and the attempt limit.  */
1117 void
1118 printwhat (int n1, int n2)
1119 {
1120   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
1121 }
1122
1123 /* If opt.wait or opt.waitretry are specified, and if certain
1124    conditions are met, sleep the appropriate number of seconds.  See
1125    the documentation of --wait and --waitretry for more information.
1126
1127    COUNT is the count of current retrieval, beginning with 1. */
1128
1129 void
1130 sleep_between_retrievals (int count)
1131 {
1132   static bool first_retrieval = true;
1133
1134   if (first_retrieval)
1135     {
1136       /* Don't sleep before the very first retrieval. */
1137       first_retrieval = false;
1138       return;
1139     }
1140
1141   if (opt.waitretry && count > 1)
1142     {
1143       /* If opt.waitretry is specified and this is a retry, wait for
1144          COUNT-1 number of seconds, or for opt.waitretry seconds.  */
1145       if (count <= opt.waitretry)
1146         xsleep (count - 1);
1147       else
1148         xsleep (opt.waitretry);
1149     }
1150   else if (opt.wait)
1151     {
1152       if (!opt.random_wait || count > 1)
1153         /* If random-wait is not specified, or if we are sleeping
1154            between retries of the same download, sleep the fixed
1155            interval.  */
1156         xsleep (opt.wait);
1157       else
1158         {
1159           /* Sleep a random amount of time averaging in opt.wait
1160              seconds.  The sleeping amount ranges from 0.5*opt.wait to
1161              1.5*opt.wait.  */
1162           double waitsecs = (0.5 + random_float ()) * opt.wait;
1163           DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
1164                    opt.wait, waitsecs));
1165           xsleep (waitsecs);
1166         }
1167     }
1168 }
1169
1170 /* Free the linked list of urlpos.  */
1171 void
1172 free_urlpos (struct urlpos *l)
1173 {
1174   while (l)
1175     {
1176       struct urlpos *next = l->next;
1177       if (l->url)
1178         url_free (l->url);
1179       xfree_null (l->local_name);
1180       xfree (l);
1181       l = next;
1182     }
1183 }
1184
1185 /* Rotate FNAME opt.backups times */
1186 void
1187 rotate_backups(const char *fname)
1188 {
1189 #ifdef __VMS
1190 # define SEP "_"
1191 # define AVS ";*"                       /* All-version suffix. */
1192 # define AVSL (sizeof (AVS) - 1)
1193 #else
1194 # define SEP "."
1195 # define AVSL 0
1196 #endif
1197
1198   int maxlen = strlen (fname) + sizeof (SEP) + numdigit (opt.backups) + AVSL;
1199   char *from = (char *)alloca (maxlen);
1200   char *to = (char *)alloca (maxlen);
1201   struct_stat sb;
1202   int i;
1203
1204   if (stat (fname, &sb) == 0)
1205     if (S_ISREG (sb.st_mode) == 0)
1206       return;
1207
1208   for (i = opt.backups; i > 1; i--)
1209     {
1210 #ifdef VMS
1211       /* Delete (all versions of) any existing max-suffix file, to avoid
1212        * creating multiple versions of it.  (On VMS, rename() will
1213        * create a new version of an existing destination file, not
1214        * destroy/overwrite it.)
1215        */
1216       if (i == opt.backups)
1217         {
1218           sprintf (to, "%s%s%d%s", fname, SEP, i, AVS);
1219           delete (to);
1220         }
1221 #endif
1222       sprintf (to, "%s%s%d", fname, SEP, i);
1223       sprintf (from, "%s%s%d", fname, SEP, i - 1);
1224       rename (from, to);
1225     }
1226
1227   sprintf (to, "%s%s%d", fname, SEP, 1);
1228   rename(fname, to);
1229 }
1230
1231 static bool no_proxy_match (const char *, const char **);
1232
1233 /* Return the URL of the proxy appropriate for url U.  */
1234
1235 static char *
1236 getproxy (struct url *u)
1237 {
1238   char *proxy = NULL;
1239   char *rewritten_url;
1240   static char rewritten_storage[1024];
1241
1242   if (!opt.use_proxy)
1243     return NULL;
1244   if (no_proxy_match (u->host, (const char **)opt.no_proxy))
1245     return NULL;
1246
1247   switch (u->scheme)
1248     {
1249     case SCHEME_HTTP:
1250       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1251       break;
1252 #ifdef HAVE_SSL
1253     case SCHEME_HTTPS:
1254       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1255       break;
1256 #endif
1257     case SCHEME_FTP:
1258       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1259       break;
1260     case SCHEME_INVALID:
1261       break;
1262     }
1263   if (!proxy || !*proxy)
1264     return NULL;
1265
1266   /* Handle shorthands.  `rewritten_storage' is a kludge to allow
1267      getproxy() to return static storage. */
1268   rewritten_url = rewrite_shorthand_url (proxy);
1269   if (rewritten_url)
1270     {
1271       strncpy (rewritten_storage, rewritten_url, sizeof (rewritten_storage));
1272       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1273       proxy = rewritten_storage;
1274     }
1275
1276   return proxy;
1277 }
1278
/* Returns true if URL would be downloaded through a proxy, i.e. if
   getproxy() yields a proxy for U.  A NULL U gives false.  */

bool
url_uses_proxy (struct url * u)
{
  if (u == NULL)
    return false;

  return getproxy (u) != NULL;
}
1290
/* Return true if HOST matches the NO_PROXY list, as decided by
   sufmatch(), meaning the proxy should be bypassed for it.  A NULL
   list never matches.  */
static bool
no_proxy_match (const char *host, const char **no_proxy)
{
  return no_proxy != NULL && sufmatch (no_proxy, host);
}
1300
1301 /* Set the file parameter to point to the local file string.  */
1302 void
1303 set_local_file (const char **file, const char *default_file)
1304 {
1305   if (opt.output_document)
1306     {
1307       if (output_stream_regular)
1308         *file = opt.output_document;
1309     }
1310   else
1311     *file = default_file;
1312 }
1313
/* Return true for an input file's own URL, false otherwise.

   True is returned only once: on the first call where INPUT_FILE is
   non-NULL and url_has_scheme() accepts it.  Every later call
   returns false.  */
bool
input_file_url (const char *input_file)
{
  static bool first = true;

  if (!input_file || !url_has_scheme (input_file) || !first)
    return false;

  /* Remember that the one-time answer has been given.  */
  first = false;
  return true;
}