sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   3    2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation,
   4    Inc.
   5
   6 This file is part of GNU Wget.
   7
   8 GNU Wget is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 3 of the License, or (at
  11 your option) any later version.
  12
  13 GNU Wget is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  20
  21 Additional permission under GNU GPL version 3 section 7
  22
  23 If you modify this program, or any covered work, by linking or
  24 combining it with the OpenSSL project's OpenSSL library (or a
  25 modified version of that library), containing parts covered by the
  26 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  27 grants you additional permission to convey the resulting work.
  28 Corresponding Source for a non-source form of such a combination
  29 shall include the source code for the parts of OpenSSL used as well
  30 as that of the covered work.  */
  31
  32 #include "wget.h"
  33
  34 #include <stdio.h>
  35 #include <stdlib.h>
  36 #include <unistd.h>
  37 #include <errno.h>
  38 #include <string.h>
  39 #include <assert.h>
  40
  41 #include "exits.h"
  42 #include "utils.h"
  43 #include "retr.h"
  44 #include "progress.h"
  45 #include "url.h"
  46 #include "recur.h"
  47 #include "ftp.h"
  48 #include "http.h"
  49 #include "host.h"
  50 #include "connect.h"
  51 #include "hash.h"
  52 #include "convert.h"
  53 #include "ptimer.h"
  54 #include "html-url.h"
  55 #include "iri.h"
  56
/* Total size of downloaded files, in bytes.  Used to enforce the
   download quota.  */
SUM_SIZE_INT total_downloaded_bytes;

/* Total download time in seconds. */
double total_download_time;

/* If non-NULL, the stream to which output should be written.  This
   stream is initialized when `-O' is used.  */
FILE *output_stream;

/* Whether output_document is a regular file we can manipulate,
   i.e. not `-' or a device file. */
bool output_stream_regular;
  70 \f
/* Bandwidth-limiting state shared between limit_bandwidth_reset and
   limit_bandwidth.  */
static struct {
  wgint chunk_bytes;            /* bytes received since chunk_start */
  double chunk_start;           /* timer reading when the current chunk
                                   began */
  double sleep_adjust;          /* desired minus actual duration of the
                                   previous sleep, in seconds, clamped
                                   to [-0.5, 0.5] */
} limit_data;
  76
/* Reset the bandwidth-limiting state in limit_data.  fd_read_body
   calls this before a rate-limited download starts.
   NOTE(review): xzero is defined elsewhere; presumably it zero-fills
   its argument -- confirm.  */
static void
limit_bandwidth_reset (void)
{
  xzero (limit_data);
}
  82
  83 /* Limit the bandwidth by pausing the download for an amount of time.
  84    BYTES is the number of bytes received from the network, and TIMER
  85    is the timer that started at the beginning of download.  */
  86
  87 static void
  88 limit_bandwidth (wgint bytes, struct ptimer *timer)
  89 {
  90   double delta_t = ptimer_read (timer) - limit_data.chunk_start;
  91   double expected;
  92
  93   limit_data.chunk_bytes += bytes;
  94
  95   /* Calculate the amount of time we expect downloading the chunk
  96      should take.  If in reality it took less time, sleep to
  97      compensate for the difference.  */
  98   expected = (double) limit_data.chunk_bytes / opt.limit_rate;
  99
 100   if (expected > delta_t)
 101     {
 102       double slp = expected - delta_t + limit_data.sleep_adjust;
 103       double t0, t1;
 104       if (slp < 0.2)
 105         {
 106           DEBUGP (("deferring a %.2f ms sleep (%s/%.2f).\n",
 107                    slp * 1000, number_to_static_string (limit_data.chunk_bytes),
 108                    delta_t));
 109           return;
 110         }
 111       DEBUGP (("\nsleeping %.2f ms for %s bytes, adjust %.2f ms\n",
 112                slp * 1000, number_to_static_string (limit_data.chunk_bytes),
 113                limit_data.sleep_adjust));
 114
 115       t0 = ptimer_read (timer);
 116       xsleep (slp);
 117       t1 = ptimer_measure (timer);
 118
 119       /* Due to scheduling, we probably slept slightly longer (or
 120          shorter) than desired.  Calculate the difference between the
 121          desired and the actual sleep, and adjust the next sleep by
 122          that amount.  */
 123       limit_data.sleep_adjust = slp - (t1 - t0);
 124       /* If sleep_adjust is very large, it's likely due to suspension
 125          and not clock inaccuracy.  Don't enforce those.  */
 126       if (limit_data.sleep_adjust > 0.5)
 127         limit_data.sleep_adjust = 0.5;
 128       else if (limit_data.sleep_adjust < -0.5)
 129         limit_data.sleep_adjust = -0.5;
 130     }
 131
 132   limit_data.chunk_bytes = 0;
 133   limit_data.chunk_start = ptimer_read (timer);
 134 }
 135
/* The smaller of I and J.  Each argument may be evaluated twice, so
   neither should have side effects.  Defined here only when MIN is
   not already defined.  */
#ifndef MIN
# define MIN(i, j) ((i) <= (j) ? (i) : (j))
#endif
 139
 140 /* Write data in BUF to OUT.  However, if *SKIP is non-zero, skip that
 141    amount of data and decrease SKIP.  Increment *TOTAL by the amount
 142    of data written.  If OUT2 is not NULL, also write BUF to OUT2.
 143    In case of error writing to OUT, -1 is returned.  In case of error
 144    writing to OUT2, -2 is returned.  In case of any other error,
 145    1 is returned.  */
 146
 147 static int
 148 write_data (FILE *out, FILE *out2, const char *buf, int bufsize,
 149             wgint *skip, wgint *written)
 150 {
 151   if (out == NULL && out2 == NULL)
 152     return 1;
 153   if (*skip > bufsize)
 154     {
 155       *skip -= bufsize;
 156       return 1;
 157     }
 158   if (*skip)
 159     {
 160       buf += *skip;
 161       bufsize -= *skip;
 162       *skip = 0;
 163       if (bufsize == 0)
 164         return 1;
 165     }
 166
 167   if (out != NULL)
 168     fwrite (buf, 1, bufsize, out);
 169   if (out2 != NULL)
 170     fwrite (buf, 1, bufsize, out2);
 171   *written += bufsize;
 172
 173   /* Immediately flush the downloaded data.  This should not hinder
 174      performance: fast downloads will arrive in large 16K chunks
 175      (which stdio would write out immediately anyway), and slow
 176      downloads wouldn't be limited by disk speed.  */
 177
 178   /* 2005-04-20 SMS.
 179      Perhaps it shouldn't hinder performance, but it sure does, at least
 180      on VMS (more than 2X).  Rather than speculate on what it should or
 181      shouldn't do, it might make more sense to test it.  Even better, it
 182      might be nice to explain what possible benefit it could offer, as
 183      it appears to be a clear invitation to poor performance with no
 184      actual justification.  (Also, why 16K?  Anyone test other values?)
 185   */
 186 #ifndef __VMS
 187   if (out != NULL)
 188     fflush (out);
 189   if (out2 != NULL)
 190     fflush (out2);
 191 #endif /* ndef __VMS */
 192   if (out != NULL && ferror (out))
 193     return -1;
 194   else if (out2 != NULL && ferror (out2))
 195     return -2;
 196   else
 197     return 0;
 198 }
 199
 200 /* Read the contents of file descriptor FD until it the connection
 201    terminates or a read error occurs.  The data is read in portions of
 202    up to 16K and written to OUT as it arrives.  If opt.verbose is set,
 203    the progress is shown.
 204
 205    TOREAD is the amount of data expected to arrive, normally only used
 206    by the progress gauge.
 207
 208    STARTPOS is the position from which the download starts, used by
 209    the progress gauge.  If QTYREAD is non-NULL, the value it points to
 210    is incremented by the amount of data read from the network.  If
 211    QTYWRITTEN is non-NULL, the value it points to is incremented by
 212    the amount of data written to disk.  The time it took to download
 213    the data is stored to ELAPSED.
 214
 215    If OUT2 is non-NULL, the contents is also written to OUT2.
 216
 217    The function exits and returns the amount of data read.  In case of
 218    error while reading data, -1 is returned.  In case of error while
 219    writing data to OUT, -2 is returned.  In case of error while writing
 220    data to OUT2, -3 is returned.  */
 221
 222 int
 223 fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
 224               wgint *qtyread, wgint *qtywritten, double *elapsed, int flags,
 225               FILE *out2)
 226 {
 227   int ret = 0;
 228 #undef max
 229 #define max(a,b) ((a) > (b) ? (a) : (b))
 230   int dlbufsize = max (BUFSIZ, 8 * 1024);
 231   char *dlbuf = xmalloc (dlbufsize);
 232
 233   struct ptimer *timer = NULL;
 234   double last_successful_read_tm = 0;
 235
 236   /* The progress gauge, set according to the user preferences. */
 237   void *progress = NULL;
 238
 239   /* Non-zero if the progress gauge is interactive, i.e. if it can
 240      continually update the display.  When true, smaller timeout
 241      values are used so that the gauge can update the display when
 242      data arrives slowly. */
 243   bool progress_interactive = false;
 244
 245   bool exact = !!(flags & rb_read_exactly);
 246
 247   /* Used only by HTTP/HTTPS chunked transfer encoding.  */
 248   bool chunked = flags & rb_chunked_transfer_encoding;
 249   wgint skip = 0;
 250
 251   /* How much data we've read/written.  */
 252   wgint sum_read = 0;
 253   wgint sum_written = 0;
 254   wgint remaining_chunk_size = 0;
 255
 256   if (flags & rb_skip_startpos)
 257     skip = startpos;
 258
 259   if (opt.verbose)
 260     {
 261       /* If we're skipping STARTPOS bytes, pass 0 as the INITIAL
 262          argument to progress_create because the indicator doesn't
 263          (yet) know about "skipping" data.  */
 264       wgint start = skip ? 0 : startpos;
 265       progress = progress_create (start, start + toread);
 266       progress_interactive = progress_interactive_p (progress);
 267     }
 268
 269   if (opt.limit_rate)
 270     limit_bandwidth_reset ();
 271
 272   /* A timer is needed for tracking progress, for throttling, and for
 273      tracking elapsed time.  If either of these are requested, start
 274      the timer.  */
 275   if (progress || opt.limit_rate || elapsed)
 276     {
 277       timer = ptimer_new ();
 278       last_successful_read_tm = 0;
 279     }
 280
 281   /* Use a smaller buffer for low requested bandwidths.  For example,
 282      with --limit-rate=2k, it doesn't make sense to slurp in 16K of
 283      data and then sleep for 8s.  With buffer size equal to the limit,
 284      we never have to sleep for more than one second.  */
 285   if (opt.limit_rate && opt.limit_rate < dlbufsize)
 286     dlbufsize = opt.limit_rate;
 287
 288   /* Read from FD while there is data to read.  Normally toread==0
 289      means that it is unknown how much data is to arrive.  However, if
 290      EXACT is set, then toread==0 means what it says: that no data
 291      should be read.  */
 292   while (!exact || (sum_read < toread))
 293     {
 294       int rdsize;
 295       double tmout = opt.read_timeout;
 296
 297       if (chunked)
 298         {
 299           if (remaining_chunk_size == 0)
 300             {
 301               char *line = fd_read_line (fd);
 302               char *endl;
 303               if (line == NULL)
 304                 {
 305                   ret = -1;
 306                   break;
 307                 }
 308
 309               remaining_chunk_size = strtol (line, &endl, 16);
 310               if (remaining_chunk_size == 0)
 311                 {
 312                   ret = 0;
 313                   if (fd_read_line (fd) == NULL)
 314                     ret = -1;
 315                   break;
 316                 }
 317             }
 318
 319           rdsize = MIN (remaining_chunk_size, dlbufsize);
 320         }
 321       else
 322         rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize;
 323
 324       if (progress_interactive)
 325         {
 326           /* For interactive progress gauges, always specify a ~1s
 327              timeout, so that the gauge can be updated regularly even
 328              when the data arrives very slowly or stalls.  */
 329           tmout = 0.95;
 330           if (opt.read_timeout)
 331             {
 332               double waittm;
 333               waittm = ptimer_read (timer) - last_successful_read_tm;
 334               if (waittm + tmout > opt.read_timeout)
 335                 {
 336                   /* Don't let total idle time exceed read timeout. */
 337                   tmout = opt.read_timeout - waittm;
 338                   if (tmout < 0)
 339                     {
 340                       /* We've already exceeded the timeout. */
 341                       ret = -1, errno = ETIMEDOUT;
 342                       break;
 343                     }
 344                 }
 345             }
 346         }
 347       ret = fd_read (fd, dlbuf, rdsize, tmout);
 348
 349       if (progress_interactive && ret < 0 && errno == ETIMEDOUT)
 350         ret = 0;                /* interactive timeout, handled above */
 351       else if (ret <= 0)
 352         break;                  /* EOF or read error */
 353
 354       if (progress || opt.limit_rate || elapsed)
 355         {
 356           ptimer_measure (timer);
 357           if (ret > 0)
 358             last_successful_read_tm = ptimer_read (timer);
 359         }
 360
 361       if (ret > 0)
 362         {
 363           sum_read += ret;
 364           int write_res = write_data (out, out2, dlbuf, ret, &skip, &sum_written);
 365           if (write_res != 0)
 366             {
 367               ret = (write_res == -3) ? -3 : -2;
 368               goto out;
 369             }
 370           if (chunked)
 371             {
 372               remaining_chunk_size -= ret;
 373               if (remaining_chunk_size == 0)
 374                 if (fd_read_line (fd) == NULL)
 375                   {
 376                     ret = -1;
 377                     break;
 378                   }
 379             }
 380         }
 381
 382       if (opt.limit_rate)
 383         limit_bandwidth (ret, timer);
 384
 385       if (progress)
 386         progress_update (progress, ret, ptimer_read (timer));
 387 #ifdef WINDOWS
 388       if (toread > 0 && !opt.quiet)
 389         ws_percenttitle (100.0 *
 390                          (startpos + sum_read) / (startpos + toread));
 391 #endif
 392     }
 393   if (ret < -1)
 394     ret = -1;
 395
 396  out:
 397   if (progress)
 398     progress_finish (progress, ptimer_read (timer));
 399
 400   if (elapsed)
 401     *elapsed = ptimer_read (timer);
 402   if (timer)
 403     ptimer_destroy (timer);
 404
 405   if (qtyread)
 406     *qtyread += sum_read;
 407   if (qtywritten)
 408     *qtywritten += sum_written;
 409
 410   free (dlbuf);
 411
 412   return ret;
 413 }
 414 \f
/* Read a hunk of data from FD, up until a terminator.  The hunk is
   limited by whatever the TERMINATOR callback chooses as its
   terminator.  For example, if terminator stops at newline, the hunk
   will consist of a line of data; if terminator stops at two
   newlines, it can be used to read the head of an HTTP response.
   Upon determining the boundary, the function returns the data (up to
   the terminator) in malloc-allocated storage, NUL-terminated.  The
   caller is responsible for freeing it.

   In case of read error, NULL is returned.  In case of EOF and no
   data read, NULL is returned and errno set to 0.  In case of having
   read some data, but encountering EOF before seeing the terminator,
   the data that has been read is returned, but it will (obviously)
   not contain the terminator.  If the buffer would have to grow
   beyond MAXSIZE, NULL is returned and errno is set to ENOMEM.

   The TERMINATOR function is called with three arguments: the
   beginning of the data read so far, the beginning of the current
   block of peeked-at data, and the length of the current block.
   Depending on its needs, the function is free to choose whether to
   analyze all data or just the newly arrived data.  If TERMINATOR
   returns NULL, it means that the terminator has not been seen.
   Otherwise it should return a pointer to the character immediately
   following the terminator.

   The idea is to be able to read a line of input, or otherwise a hunk
   of text, such as the head of an HTTP request, without crossing the
   boundary, so that the next call to fd_read etc. reads the data
   after the hunk.  To achieve that, this function does the following:

   1. Peek at incoming data.

   2. Determine whether the peeked data, along with the previously
      read data, includes the terminator.

      2a. If yes, read the data until the end of the terminator, and
          exit.

      2b. If no, read the peeked data and goto 1.

   The function is careful to assume as little as possible about the
   implementation of peeking.  For example, every peek is followed by
   a read.  If the read returns a different amount of data, the
   process is retried until all data arrives safely.

   SIZEHINT is the buffer size sufficient to hold all the data in the
   typical case (it is used as the initial buffer size).  MAXSIZE is
   the maximum amount of memory this function is allowed to allocate,
   or 0 if no upper limit is to be enforced.

   This function should be used as a building block for other
   functions -- see fd_read_line as a simple example.  */

char *
fd_read_hunk (int fd, hunk_terminator_t terminator, long sizehint, long maxsize)
{
  long bufsize = sizehint;
  char *hunk = xmalloc (bufsize);
  int tail = 0;                 /* tail position in HUNK */

  assert (!maxsize || maxsize >= bufsize);

  while (1)
    {
      const char *end;
      int pklen, rdlen, remain;

      /* First, peek at the available data.  One byte of the buffer
         is held back for the terminating NUL.  */

      pklen = fd_peek (fd, hunk + tail, bufsize - 1 - tail, -1);
      if (pklen < 0)
        {
          xfree (hunk);
          return NULL;
        }
      end = terminator (hunk, hunk + tail, pklen);
      if (end)
        {
          /* The data contains the terminator: we'll drain the data up
             to the end of the terminator.  */
          remain = end - (hunk + tail);
          assert (remain >= 0);
          if (remain == 0)
            {
              /* No more data needs to be read. */
              hunk[tail] = '\0';
              return hunk;
            }
          /* Make sure the buffer can hold the data up to the
             terminator plus the trailing NUL.  */
          if (bufsize - 1 < tail + remain)
            {
              bufsize = tail + remain + 1;
              hunk = xrealloc (hunk, bufsize);
            }
        }
      else
        /* No terminator: simply read the data we know is (or should
           be) available.  */
        remain = pklen;

      /* Now, read the data.  Note that we make no assumptions about
         how much data we'll get.  (Some TCP stacks are notorious for
         read returning less data than the previous MSG_PEEK.)  */

      rdlen = fd_read (fd, hunk + tail, remain, 0);
      if (rdlen < 0)
        {
          xfree_null (hunk);
          return NULL;
        }
      tail += rdlen;
      hunk[tail] = '\0';

      if (rdlen == 0)
        {
          if (tail == 0)
            {
              /* EOF without anything having been read */
              xfree (hunk);
              errno = 0;
              return NULL;
            }
          else
            /* EOF seen: return the data we've read. */
            return hunk;
        }
      if (end && rdlen == remain)
        /* The terminator was seen and the remaining data drained --
           we got what we came for.  */
        return hunk;

      /* Keep looping until all the data arrives. */

      if (tail == bufsize - 1)
        {
          /* Double the buffer size, but refuse to allocate more than
             MAXSIZE bytes.  */
          if (maxsize && bufsize >= maxsize)
            {
              xfree (hunk);
              errno = ENOMEM;
              return NULL;
            }
          bufsize <<= 1;
          if (maxsize && bufsize > maxsize)
            bufsize = maxsize;
          hunk = xrealloc (hunk, bufsize);
        }
    }
}
 562
 563 static const char *
 564 line_terminator (const char *start, const char *peeked, int peeklen)
 565 {
 566   const char *p = memchr (peeked, '\n', peeklen);
 567   if (p)
 568     /* p+1 because the line must include '\n' */
 569     return p + 1;
 570   return NULL;
 571 }
 572
/* The maximum size of the single line we agree to accept.  This is
   not meant to impose an arbitrary limit, but to protect the user
   from Wget slurping up available memory upon encountering malicious
   or buggy server output.  Define it to 0 to remove the limit.  */
#define FD_READ_LINE_MAX 4096

/* Read one line from FD and return it.  The line is allocated using
   malloc, but is never larger than FD_READ_LINE_MAX; the caller is
   responsible for freeing it.  The returned line includes the
   terminating newline, unless EOF was reached first.

   If an error occurs, or if no data can be read, NULL is returned.
   In the former case errno indicates the error condition, and in the
   latter case, errno is 0.  */

char *
fd_read_line (int fd)
{
  /* 128 is the initial buffer size; fd_read_hunk grows the buffer as
     needed, up to FD_READ_LINE_MAX.  */
  return fd_read_hunk (fd, line_terminator, 128, FD_READ_LINE_MAX);
}
 591 \f
 592 /* Return a printed representation of the download rate, along with
 593    the units appropriate for the download speed.  */
 594
 595 const char *
 596 retr_rate (wgint bytes, double secs)
 597 {
 598   static char res[20];
 599   static const char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 600   int units;
 601
 602   double dlrate = calc_rate (bytes, secs, &units);
 603   /* Use more digits for smaller numbers (regardless of unit used),
 604      e.g. "1022", "247", "12.5", "2.38".  */
 605   sprintf (res, "%.*f %s",
 606            dlrate >= 99.95 ? 0 : dlrate >= 9.995 ? 1 : 2,
 607            dlrate, rate_names[units]);
 608
 609   return res;
 610 }
 611
 612 /* Calculate the download rate and trim it as appropriate for the
 613    speed.  Appropriate means that if rate is greater than 1K/s,
 614    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 615    are used.
 616
 617    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 618    GB/s.  */
 619
 620 double
 621 calc_rate (wgint bytes, double secs, int *units)
 622 {
 623   double dlrate;
 624
 625   assert (secs >= 0);
 626   assert (bytes >= 0);
 627
 628   if (secs == 0)
 629     /* If elapsed time is exactly zero, it means we're under the
 630        resolution of the timer.  This can easily happen on systems
 631        that use time() for the timer.  Since the interval lies between
 632        0 and the timer's resolution, assume half the resolution.  */
 633     secs = ptimer_resolution () / 2.0;
 634
 635   dlrate = bytes / secs;
 636   if (dlrate < 1024.0)
 637     *units = 0;
 638   else if (dlrate < 1024.0 * 1024.0)
 639     *units = 1, dlrate /= 1024.0;
 640   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 641     *units = 2, dlrate /= (1024.0 * 1024.0);
 642   else
 643     /* Maybe someone will need this, one day. */
 644     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
 645
 646   return dlrate;
 647 }
 648 \f
 649
/* Helpers for retrieve_url.  Both macros expect the locals
   post_data_suspended, saved_post_data and saved_post_file_name to be
   in scope at the point of use.  */

/* Stash opt.post_data and opt.post_file_name in the saved_* locals
   and clear them in OPT, marking the POST data as suspended.  */
#define SUSPEND_POST_DATA do {                  \
  post_data_suspended = true;                   \
  saved_post_data = opt.post_data;              \
  saved_post_file_name = opt.post_file_name;    \
  opt.post_data = NULL;                         \
  opt.post_file_name = NULL;                    \
} while (0)

/* Undo SUSPEND_POST_DATA: put the stashed values back into OPT.  Does
   nothing if the POST data is not currently suspended.  */
#define RESTORE_POST_DATA do {                          \
  if (post_data_suspended)                              \
    {                                                   \
      opt.post_data = saved_post_data;                  \
      opt.post_file_name = saved_post_file_name;        \
      post_data_suspended = false;                      \
    }                                                   \
} while (0)
 666
 667 static char *getproxy (struct url *);
 668
 669 /* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
 670    FTP, proxy, etc.  */
 671
 672 /* #### This function should be rewritten so it doesn't return from
 673    multiple points. */
 674
 675 uerr_t
 676 retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
 677               char **newloc, const char *refurl, int *dt, bool recursive,
 678               struct iri *iri, bool register_status)
 679 {
 680   uerr_t result;
 681   char *url;
 682   bool location_changed;
 683   bool iri_fallbacked = 0;
 684   int dummy;
 685   char *mynewloc, *proxy;
 686   struct url *u = orig_parsed, *proxy_url;
 687   int up_error_code;            /* url parse error code */
 688   char *local_file;
 689   int redirection_count = 0;
 690
 691   bool post_data_suspended = false;
 692   char *saved_post_data = NULL;
 693   char *saved_post_file_name = NULL;
 694
 695   /* If dt is NULL, use local storage.  */
 696   if (!dt)
 697     {
 698       dt = &dummy;
 699       dummy = 0;
 700     }
 701   url = xstrdup (origurl);
 702   if (newloc)
 703     *newloc = NULL;
 704   if (file)
 705     *file = NULL;
 706
 707   if (!refurl)
 708     refurl = opt.referer;
 709
 710  redirected:
 711   /* (also for IRI fallbacking) */
 712
 713   result = NOCONERROR;
 714   mynewloc = NULL;
 715   local_file = NULL;
 716   proxy_url = NULL;
 717
 718   proxy = getproxy (u);
 719   if (proxy)
 720     {
 721       struct iri *pi = iri_new ();
 722       set_uri_encoding (pi, opt.locale, true);
 723       pi->utf8_encode = false;
 724
 725       /* Parse the proxy URL.  */
 726       proxy_url = url_parse (proxy, &up_error_code, NULL, true);
 727       if (!proxy_url)
 728         {
 729           char *error = url_error (proxy, up_error_code);
 730           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 731                      proxy, error);
 732           xfree (url);
 733           xfree (error);
 734           RESTORE_POST_DATA;
 735           result = PROXERR;
 736           goto bail;
 737         }
 738       if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
 739         {
 740           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 741           url_free (proxy_url);
 742           xfree (url);
 743           RESTORE_POST_DATA;
 744           result = PROXERR;
 745           goto bail;
 746         }
 747     }
 748
 749   if (u->scheme == SCHEME_HTTP
 750 #ifdef HAVE_SSL
 751       || u->scheme == SCHEME_HTTPS
 752 #endif
 753       || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
 754     {
 755       result = http_loop (u, orig_parsed, &mynewloc, &local_file, refurl, dt,
 756                           proxy_url, iri);
 757     }
 758   else if (u->scheme == SCHEME_FTP)
 759     {
 760       /* If this is a redirection, temporarily turn off opt.ftp_glob
 761          and opt.recursive, both being undesirable when following
 762          redirects.  */
 763       bool oldrec = recursive, glob = opt.ftp_glob;
 764       if (redirection_count)
 765         oldrec = glob = false;
 766
 767       result = ftp_loop (u, &local_file, dt, proxy_url, recursive, glob);
 768       recursive = oldrec;
 769
 770       /* There is a possibility of having HTTP being redirected to
 771          FTP.  In these cases we must decide whether the text is HTML
 772          according to the suffix.  The HTML suffixes are `.html',
 773          `.htm' and a few others, case-insensitive.  */
 774       if (redirection_count && local_file && u->scheme == SCHEME_FTP)
 775         {
 776           if (has_html_suffix_p (local_file))
 777             *dt |= TEXTHTML;
 778         }
 779     }
 780
 781   if (proxy_url)
 782     {
 783       url_free (proxy_url);
 784       proxy_url = NULL;
 785     }
 786
 787   location_changed = (result == NEWLOCATION || result == NEWLOCATION_KEEP_POST);
 788   if (location_changed)
 789     {
 790       char *construced_newloc;
 791       struct url *newloc_parsed;
 792
 793       assert (mynewloc != NULL);
 794
 795       if (local_file)
 796         xfree (local_file);
 797
 798       /* The HTTP specs only allow absolute URLs to appear in
 799          redirects, but a ton of boneheaded webservers and CGIs out
 800          there break the rules and use relative URLs, and popular
 801          browsers are lenient about this, so wget should be too. */
 802       construced_newloc = uri_merge (url, mynewloc);
 803       xfree (mynewloc);
 804       mynewloc = construced_newloc;
 805
 806       /* Reset UTF-8 encoding state, keep the URI encoding and reset
 807          the content encoding. */
 808       iri->utf8_encode = opt.enable_iri;
 809       set_content_encoding (iri, NULL);
 810       xfree_null (iri->orig_url);
 811
 812       /* Now, see if this new location makes sense. */
 813       newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
 814       if (!newloc_parsed)
 815         {
 816           char *error = url_error (mynewloc, up_error_code);
 817           logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
 818                      error);
 819           if (orig_parsed != u)
 820             {
 821               url_free (u);
 822             }
 823           xfree (url);
 824           xfree (mynewloc);
 825           xfree (error);
 826           RESTORE_POST_DATA;
 827           goto bail;
 828         }
 829
 830       /* Now mynewloc will become newloc_parsed->url, because if the
 831          Location contained relative paths like .././something, we
 832          don't want that propagating as url.  */
 833       xfree (mynewloc);
 834       mynewloc = xstrdup (newloc_parsed->url);
 835
 836       /* Check for max. number of redirections.  */
 837       if (++redirection_count > opt.max_redirect)
 838         {
 839           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
 840                      opt.max_redirect);
 841           url_free (newloc_parsed);
 842           if (orig_parsed != u)
 843             {
 844               url_free (u);
 845             }
 846           xfree (url);
 847           xfree (mynewloc);
 848           RESTORE_POST_DATA;
 849           result = WRONGCODE;
 850           goto bail;
 851         }
 852
 853       xfree (url);
 854       url = mynewloc;
 855       if (orig_parsed != u)
 856         {
 857           url_free (u);
 858         }
 859       u = newloc_parsed;
 860
 861       /* If we're being redirected from POST, and we received a
 862          redirect code different than 307, we don't want to POST
 863          again.  Many requests answer POST with a redirection to an
 864          index page; that redirection is clearly a GET.  We "suspend"
 865          POST data for the duration of the redirections, and restore
 866          it when we're done.
 867
 868          RFC2616 HTTP/1.1 introduces code 307 Temporary Redirect
 869          specifically to preserve the method of the request.
 870          */
 871       if (result != NEWLOCATION_KEEP_POST && !post_data_suspended)
 872         SUSPEND_POST_DATA;
 873
 874       goto redirected;
 875     }
 876
 877   /* Try to not encode in UTF-8 if fetching failed */
 878   if (!(*dt & RETROKF) && iri->utf8_encode)
 879     {
 880       iri->utf8_encode = false;
 881       if (orig_parsed != u)
 882         {
 883           url_free (u);
 884         }
 885       u = url_parse (origurl, NULL, iri, true);
 886       if (u)
 887         {
 888           DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
 889           url = xstrdup (u->url);
 890           iri_fallbacked = 1;
 891           goto redirected;
 892         }
 893       else
 894           DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url)));
 895     }
 896
 897   if (local_file && u && *dt & RETROKF)
 898     {
 899       register_download (u->url, local_file);
 900
 901       if (!opt.spider && redirection_count && 0 != strcmp (origurl, u->url))
 902         register_redirection (origurl, u->url);
 903
 904       if (*dt & TEXTHTML)
 905         register_html (u->url, local_file);
 906
 907       if (*dt & TEXTCSS)
 908         register_css (u->url, local_file);
 909     }
 910
 911   if (file)
 912     *file = local_file ? local_file : NULL;
 913   else
 914     xfree_null (local_file);
 915
 916   if (orig_parsed != u)
 917     {
 918       url_free (u);
 919     }
 920
 921   if (redirection_count || iri_fallbacked)
 922     {
 923       if (newloc)
 924         *newloc = url;
 925       else
 926         xfree (url);
 927     }
 928   else
 929     {
 930       if (newloc)
 931         *newloc = NULL;
 932       xfree (url);
 933     }
 934
 935   RESTORE_POST_DATA;
 936
 937 bail:
 938   if (register_status)
 939     inform_exit_status (result);
 940   return result;
 941 }
 942
 943 /* Find the URLs in the file and call retrieve_url() for each of them.
 944    If HTML is true, treat the file as HTML, and construct the URLs
 945    accordingly.
 946
 947    If opt.recursive is set, call retrieve_tree() for each file.  */
 948
 949 uerr_t
 950 retrieve_from_file (const char *file, bool html, int *count)
 951 {
 952   uerr_t status;
 953   struct urlpos *url_list, *cur_url;
 954   struct iri *iri = iri_new();
 955
 956   char *input_file, *url_file = NULL;
 957   const char *url = file;
 958
 959   status = RETROK;             /* Suppose everything is OK.  */
 960   *count = 0;                  /* Reset the URL count.  */
 961
 962   /* sXXXav : Assume filename and links in the file are in the locale */
 963   set_uri_encoding (iri, opt.locale, true);
 964   set_content_encoding (iri, opt.locale);
 965
 966   if (url_valid_scheme (url))
 967     {
 968       int dt,url_err;
 969       uerr_t status;
 970       struct url *url_parsed = url_parse (url, &url_err, iri, true);
 971       if (!url_parsed)
 972         {
 973           char *error = url_error (url, url_err);
 974           logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
 975           xfree (error);
 976           return URLERROR;
 977         }
 978
 979       if (!opt.base_href)
 980         opt.base_href = xstrdup (url);
 981
 982       status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
 983                              false, iri, true);
 984       url_free (url_parsed);
 985
 986       if (!url_file || (status != RETROK))
 987         return status;
 988
 989       if (dt & TEXTHTML)
 990         html = true;
 991
 992       /* If we have a found a content encoding, use it.
 993        * ( == is okay, because we're checking for identical object) */
 994       if (iri->content_encoding != opt.locale)
 995           set_uri_encoding (iri, iri->content_encoding, false);
 996
 997       /* Reset UTF-8 encode status */
 998       iri->utf8_encode = opt.enable_iri;
 999       xfree_null (iri->orig_url);
1000       iri->orig_url = NULL;
1001
1002       input_file = url_file;
1003     }
1004   else
1005     input_file = (char *) file;
1006
1007   url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
1008               : get_urls_file (input_file));
1009
1010   xfree_null (url_file);
1011
1012   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
1013     {
1014       char *filename = NULL, *new_file = NULL;
1015       int dt;
1016       struct iri *tmpiri = iri_dup (iri);
1017       struct url *parsed_url = NULL;
1018
1019       if (cur_url->ignore_when_downloading)
1020         continue;
1021
1022       if (opt.quota && total_downloaded_bytes > opt.quota)
1023         {
1024           status = QUOTEXC;
1025           break;
1026         }
1027
1028       parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true);
1029
1030       if ((opt.recursive || opt.page_requisites)
1031           && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
1032         {
1033           int old_follow_ftp = opt.follow_ftp;
1034
1035           /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
1036           if (cur_url->url->scheme == SCHEME_FTP)
1037             opt.follow_ftp = 1;
1038
1039           status = retrieve_tree (parsed_url ? parsed_url : cur_url->url,
1040                                   tmpiri);
1041
1042           opt.follow_ftp = old_follow_ftp;
1043         }
1044       else
1045         status = retrieve_url (parsed_url ? parsed_url : cur_url->url,
1046                                cur_url->url->url, &filename,
1047                                &new_file, NULL, &dt, opt.recursive, tmpiri,
1048                                true);
1049
1050       if (parsed_url)
1051           url_free (parsed_url);
1052
1053       if (filename && opt.delete_after && file_exists_p (filename))
1054         {
1055           DEBUGP (("\
1056 Removing file due to --delete-after in retrieve_from_file():\n"));
1057           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
1058           if (unlink (filename))
1059             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
1060           dt &= ~RETROKF;
1061         }
1062
1063       xfree_null (new_file);
1064       xfree_null (filename);
1065       iri_free (tmpiri);
1066     }
1067
1068   /* Free the linked list of URL-s.  */
1069   free_urlpos (url_list);
1070
1071   iri_free (iri);
1072
1073   return status;
1074 }
1075
1076 /* Print `giving up', or `retrying', depending on the impending
1077    action.  N1 and N2 are the attempt number and the attempt limit.  */
1078 void
1079 printwhat (int n1, int n2)
1080 {
1081   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
1082 }
1083
1084 /* If opt.wait or opt.waitretry are specified, and if certain
1085    conditions are met, sleep the appropriate number of seconds.  See
1086    the documentation of --wait and --waitretry for more information.
1087
1088    COUNT is the count of current retrieval, beginning with 1. */
1089
1090 void
1091 sleep_between_retrievals (int count)
1092 {
1093   static bool first_retrieval = true;
1094
1095   if (first_retrieval)
1096     {
1097       /* Don't sleep before the very first retrieval. */
1098       first_retrieval = false;
1099       return;
1100     }
1101
1102   if (opt.waitretry && count > 1)
1103     {
1104       /* If opt.waitretry is specified and this is a retry, wait for
1105          COUNT-1 number of seconds, or for opt.waitretry seconds.  */
1106       if (count <= opt.waitretry)
1107         xsleep (count - 1);
1108       else
1109         xsleep (opt.waitretry);
1110     }
1111   else if (opt.wait)
1112     {
1113       if (!opt.random_wait || count > 1)
1114         /* If random-wait is not specified, or if we are sleeping
1115            between retries of the same download, sleep the fixed
1116            interval.  */
1117         xsleep (opt.wait);
1118       else
1119         {
1120           /* Sleep a random amount of time averaging in opt.wait
1121              seconds.  The sleeping amount ranges from 0.5*opt.wait to
1122              1.5*opt.wait.  */
1123           double waitsecs = (0.5 + random_float ()) * opt.wait;
1124           DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
1125                    opt.wait, waitsecs));
1126           xsleep (waitsecs);
1127         }
1128     }
1129 }
1130
1131 /* Free the linked list of urlpos.  */
1132 void
1133 free_urlpos (struct urlpos *l)
1134 {
1135   while (l)
1136     {
1137       struct urlpos *next = l->next;
1138       if (l->url)
1139         url_free (l->url);
1140       xfree_null (l->local_name);
1141       xfree (l);
1142       l = next;
1143     }
1144 }
1145
1146 /* Rotate FNAME opt.backups times */
1147 void
1148 rotate_backups(const char *fname)
1149 {
1150   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1151   char *from = (char *)alloca (maxlen);
1152   char *to = (char *)alloca (maxlen);
1153   struct_stat sb;
1154   int i;
1155
1156   if (stat (fname, &sb) == 0)
1157     if (S_ISREG (sb.st_mode) == 0)
1158       return;
1159
1160   for (i = opt.backups; i > 1; i--)
1161     {
1162       sprintf (from, "%s.%d", fname, i - 1);
1163       sprintf (to, "%s.%d", fname, i);
1164       rename (from, to);
1165     }
1166
1167   sprintf (to, "%s.%d", fname, 1);
1168   rename(fname, to);
1169 }
1170
1171 static bool no_proxy_match (const char *, const char **);
1172
1173 /* Return the URL of the proxy appropriate for url U.  */
1174
1175 static char *
1176 getproxy (struct url *u)
1177 {
1178   char *proxy = NULL;
1179   char *rewritten_url;
1180   static char rewritten_storage[1024];
1181
1182   if (!opt.use_proxy)
1183     return NULL;
1184   if (no_proxy_match (u->host, (const char **)opt.no_proxy))
1185     return NULL;
1186
1187   switch (u->scheme)
1188     {
1189     case SCHEME_HTTP:
1190       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1191       break;
1192 #ifdef HAVE_SSL
1193     case SCHEME_HTTPS:
1194       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1195       break;
1196 #endif
1197     case SCHEME_FTP:
1198       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1199       break;
1200     case SCHEME_INVALID:
1201       break;
1202     }
1203   if (!proxy || !*proxy)
1204     return NULL;
1205
1206   /* Handle shorthands.  `rewritten_storage' is a kludge to allow
1207      getproxy() to return static storage. */
1208   rewritten_url = rewrite_shorthand_url (proxy);
1209   if (rewritten_url)
1210     {
1211       strncpy (rewritten_storage, rewritten_url, sizeof (rewritten_storage));
1212       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1213       proxy = rewritten_storage;
1214     }
1215
1216   return proxy;
1217 }
1218
/* Tell whether getproxy() finds a proxy for U.  A NULL U yields
   false without consulting getproxy().  */

bool
url_uses_proxy (struct url * u)
{
  return u != NULL && getproxy (u) != NULL;
}
1230
/* Return true if NO_PROXY is non-NULL and sufmatch() reports that
   HOST matches one of its entries; return false otherwise, in
   particular when there is no NO_PROXY list at all.  */
static bool
no_proxy_match (const char *host, const char **no_proxy)
{
  return no_proxy ? sufmatch (no_proxy, host) : false;
}
1240
1241 /* Set the file parameter to point to the local file string.  */
1242 void
1243 set_local_file (const char **file, const char *default_file)
1244 {
1245   if (opt.output_document)
1246     {
1247       if (output_stream_regular)
1248         *file = opt.output_document;
1249     }
1250   else
1251     *file = default_file;
1252 }
1253
/* Return true the first time this is called with a non-NULL
   INPUT_FILE that url_has_scheme() accepts.  Every other call,
   including all later ones with such an argument, returns false.  */
bool
input_file_url (const char *input_file)
{
  static bool first = true;

  if (!input_file || !url_has_scheme (input_file) || !first)
    return false;

  /* Only the first qualifying call is reported.  */
  first = false;
  return true;
}