sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   3    2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation,
   4    Inc.
   5
   6 This file is part of GNU Wget.
   7
   8 GNU Wget is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 3 of the License, or (at
  11 your option) any later version.
  12
  13 GNU Wget is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  20
  21 Additional permission under GNU GPL version 3 section 7
  22
  23 If you modify this program, or any covered work, by linking or
  24 combining it with the OpenSSL project's OpenSSL library (or a
  25 modified version of that library), containing parts covered by the
  26 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  27 grants you additional permission to convey the resulting work.
  28 Corresponding Source for a non-source form of such a combination
  29 shall include the source code for the parts of OpenSSL used as well
  30 as that of the covered work.  */
  31
  32 #include "wget.h"
  33
  34 #include <stdio.h>
  35 #include <stdlib.h>
  36 #include <unistd.h>
  37 #include <errno.h>
  38 #include <string.h>
  39 #include <assert.h>
  40
  41 #include "exits.h"
  42 #include "utils.h"
  43 #include "retr.h"
  44 #include "progress.h"
  45 #include "url.h"
  46 #include "recur.h"
  47 #include "ftp.h"
  48 #include "http.h"
  49 #include "host.h"
  50 #include "connect.h"
  51 #include "hash.h"
  52 #include "convert.h"
  53 #include "ptimer.h"
  54 #include "html-url.h"
  55 #include "iri.h"
  56
/* Total size of downloaded files.  Used to enforce quota.  Defined
   without `static', so it is visible to other translation units.  */
SUM_SIZE_INT total_downloaded_bytes;

/* Total download time in seconds.  Defined without `static', so it is
   visible to other translation units.  */
double total_download_time;

/* If non-NULL, the stream to which output should be written.  This
   stream is initialized when `-O' is used.  As a file-scope object it
   starts out NULL.  */
FILE *output_stream;

/* Whether output_document is a regular file we can manipulate,
   i.e. not `-' or a device file.  As a file-scope object it starts
   out false.  */
bool output_stream_regular;
  70 \f
/* Bookkeeping for bandwidth throttling (opt.limit_rate).  Cleared by
   limit_bandwidth_reset and updated by limit_bandwidth.  */
static struct {
  /* Bytes received since CHUNK_START; reset to 0 after each
     completed (non-deferred) throttling step.  */
  wgint chunk_bytes;
  /* Timer reading, in seconds, at which the current chunk began.  */
  double chunk_start;
  /* Correction, in seconds, added to the next sleep to compensate for
     the previous sleep having been too long or too short.  Kept
     within [-0.5, 0.5] by limit_bandwidth.  */
  double sleep_adjust;
} limit_data;
  76
/* Clear the throttling state kept in limit_data.  fd_read_body calls
   this before starting a download when opt.limit_rate is set.
   NOTE(review): xzero is defined elsewhere; presumably it zero-fills
   its argument -- confirm.  */
static void
limit_bandwidth_reset (void)
{
  xzero (limit_data);
}
  82
  83 /* Limit the bandwidth by pausing the download for an amount of time.
  84    BYTES is the number of bytes received from the network, and TIMER
  85    is the timer that started at the beginning of download.  */
  86
  87 static void
  88 limit_bandwidth (wgint bytes, struct ptimer *timer)
  89 {
  90   double delta_t = ptimer_read (timer) - limit_data.chunk_start;
  91   double expected;
  92
  93   limit_data.chunk_bytes += bytes;
  94
  95   /* Calculate the amount of time we expect downloading the chunk
  96      should take.  If in reality it took less time, sleep to
  97      compensate for the difference.  */
  98   expected = (double) limit_data.chunk_bytes / opt.limit_rate;
  99
 100   if (expected > delta_t)
 101     {
 102       double slp = expected - delta_t + limit_data.sleep_adjust;
 103       double t0, t1;
 104       if (slp < 0.2)
 105         {
 106           DEBUGP (("deferring a %.2f ms sleep (%s/%.2f).\n",
 107                    slp * 1000, number_to_static_string (limit_data.chunk_bytes),
 108                    delta_t));
 109           return;
 110         }
 111       DEBUGP (("\nsleeping %.2f ms for %s bytes, adjust %.2f ms\n",
 112                slp * 1000, number_to_static_string (limit_data.chunk_bytes),
 113                limit_data.sleep_adjust));
 114
 115       t0 = ptimer_read (timer);
 116       xsleep (slp);
 117       t1 = ptimer_measure (timer);
 118
 119       /* Due to scheduling, we probably slept slightly longer (or
 120          shorter) than desired.  Calculate the difference between the
 121          desired and the actual sleep, and adjust the next sleep by
 122          that amount.  */
 123       limit_data.sleep_adjust = slp - (t1 - t0);
 124       /* If sleep_adjust is very large, it's likely due to suspension
 125          and not clock inaccuracy.  Don't enforce those.  */
 126       if (limit_data.sleep_adjust > 0.5)
 127         limit_data.sleep_adjust = 0.5;
 128       else if (limit_data.sleep_adjust < -0.5)
 129         limit_data.sleep_adjust = -0.5;
 130     }
 131
 132   limit_data.chunk_bytes = 0;
 133   limit_data.chunk_start = ptimer_read (timer);
 134 }
 135
/* Smaller of I and J.  This fallback is only defined when no MIN macro
   already exists.  Each argument may be evaluated twice, so do not
   pass expressions with side effects.  */
#ifndef MIN
# define MIN(i, j) ((i) <= (j) ? (i) : (j))
#endif
 139
 140 /* Write data in BUF to OUT.  However, if *SKIP is non-zero, skip that
 141    amount of data and decrease SKIP.  Increment *TOTAL by the amount
 142    of data written.  If OUT2 is not NULL, also write BUF to OUT2.
 143    In case of error writing to OUT, -1 is returned.  In case of error
 144    writing to OUT2, -2 is returned.  Return 1 if the whole BUF was
 145    skipped.  */
 146
 147 static int
 148 write_data (FILE *out, FILE *out2, const char *buf, int bufsize,
 149             wgint *skip, wgint *written)
 150 {
 151   if (out == NULL && out2 == NULL)
 152     return 1;
 153   if (*skip > bufsize)
 154     {
 155       *skip -= bufsize;
 156       return 1;
 157     }
 158   if (*skip)
 159     {
 160       buf += *skip;
 161       bufsize -= *skip;
 162       *skip = 0;
 163       if (bufsize == 0)
 164         return 1;
 165     }
 166
 167   if (out != NULL)
 168     fwrite (buf, 1, bufsize, out);
 169   if (out2 != NULL)
 170     fwrite (buf, 1, bufsize, out2);
 171   *written += bufsize;
 172
 173   /* Immediately flush the downloaded data.  This should not hinder
 174      performance: fast downloads will arrive in large 16K chunks
 175      (which stdio would write out immediately anyway), and slow
 176      downloads wouldn't be limited by disk speed.  */
 177
 178   /* 2005-04-20 SMS.
 179      Perhaps it shouldn't hinder performance, but it sure does, at least
 180      on VMS (more than 2X).  Rather than speculate on what it should or
 181      shouldn't do, it might make more sense to test it.  Even better, it
 182      might be nice to explain what possible benefit it could offer, as
 183      it appears to be a clear invitation to poor performance with no
 184      actual justification.  (Also, why 16K?  Anyone test other values?)
 185   */
 186 #ifndef __VMS
 187   if (out != NULL)
 188     fflush (out);
 189   if (out2 != NULL)
 190     fflush (out2);
 191 #endif /* ndef __VMS */
 192   if (out != NULL && ferror (out))
 193     return -1;
 194   else if (out2 != NULL && ferror (out2))
 195     return -2;
 196   else
 197     return 0;
 198 }
 199
 200 /* Read the contents of file descriptor FD until it the connection
 201    terminates or a read error occurs.  The data is read in portions of
 202    up to 16K and written to OUT as it arrives.  If opt.verbose is set,
 203    the progress is shown.
 204
 205    TOREAD is the amount of data expected to arrive, normally only used
 206    by the progress gauge.
 207
 208    STARTPOS is the position from which the download starts, used by
 209    the progress gauge.  If QTYREAD is non-NULL, the value it points to
 210    is incremented by the amount of data read from the network.  If
 211    QTYWRITTEN is non-NULL, the value it points to is incremented by
 212    the amount of data written to disk.  The time it took to download
 213    the data is stored to ELAPSED.
 214
 215    If OUT2 is non-NULL, the contents is also written to OUT2.
 216    OUT2 will get an exact copy of the response: if this is a chunked
 217    response, everything -- including the chunk headers -- is written
 218    to OUT2.  (OUT will only get the unchunked response.)
 219
 220    The function exits and returns the amount of data read.  In case of
 221    error while reading data, -1 is returned.  In case of error while
 222    writing data to OUT, -2 is returned.  In case of error while writing
 223    data to OUT2, -3 is returned.  */
 224
 225 int
 226 fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
 227               wgint *qtyread, wgint *qtywritten, double *elapsed, int flags,
 228               FILE *out2)
 229 {
 230   int ret = 0;
 231 #undef max
 232 #define max(a,b) ((a) > (b) ? (a) : (b))
 233   int dlbufsize = max (BUFSIZ, 8 * 1024);
 234   char *dlbuf = xmalloc (dlbufsize);
 235
 236   struct ptimer *timer = NULL;
 237   double last_successful_read_tm = 0;
 238
 239   /* The progress gauge, set according to the user preferences. */
 240   void *progress = NULL;
 241
 242   /* Non-zero if the progress gauge is interactive, i.e. if it can
 243      continually update the display.  When true, smaller timeout
 244      values are used so that the gauge can update the display when
 245      data arrives slowly. */
 246   bool progress_interactive = false;
 247
 248   bool exact = !!(flags & rb_read_exactly);
 249
 250   /* Used only by HTTP/HTTPS chunked transfer encoding.  */
 251   bool chunked = flags & rb_chunked_transfer_encoding;
 252   wgint skip = 0;
 253
 254   /* How much data we've read/written.  */
 255   wgint sum_read = 0;
 256   wgint sum_written = 0;
 257   wgint remaining_chunk_size = 0;
 258
 259   if (flags & rb_skip_startpos)
 260     skip = startpos;
 261
 262   if (opt.verbose)
 263     {
 264       /* If we're skipping STARTPOS bytes, pass 0 as the INITIAL
 265          argument to progress_create because the indicator doesn't
 266          (yet) know about "skipping" data.  */
 267       wgint start = skip ? 0 : startpos;
 268       progress = progress_create (start, start + toread);
 269       progress_interactive = progress_interactive_p (progress);
 270     }
 271
 272   if (opt.limit_rate)
 273     limit_bandwidth_reset ();
 274
 275   /* A timer is needed for tracking progress, for throttling, and for
 276      tracking elapsed time.  If either of these are requested, start
 277      the timer.  */
 278   if (progress || opt.limit_rate || elapsed)
 279     {
 280       timer = ptimer_new ();
 281       last_successful_read_tm = 0;
 282     }
 283
 284   /* Use a smaller buffer for low requested bandwidths.  For example,
 285      with --limit-rate=2k, it doesn't make sense to slurp in 16K of
 286      data and then sleep for 8s.  With buffer size equal to the limit,
 287      we never have to sleep for more than one second.  */
 288   if (opt.limit_rate && opt.limit_rate < dlbufsize)
 289     dlbufsize = opt.limit_rate;
 290
 291   /* Read from FD while there is data to read.  Normally toread==0
 292      means that it is unknown how much data is to arrive.  However, if
 293      EXACT is set, then toread==0 means what it says: that no data
 294      should be read.  */
 295   while (!exact || (sum_read < toread))
 296     {
 297       int rdsize;
 298       double tmout = opt.read_timeout;
 299
 300       if (chunked)
 301         {
 302           if (remaining_chunk_size == 0)
 303             {
 304               char *line = fd_read_line (fd);
 305               char *endl;
 306               if (line == NULL)
 307                 {
 308                   ret = -1;
 309                   break;
 310                 }
 311               else if (out2 != NULL)
 312                 fwrite (line, 1, strlen (line), out2);
 313
 314               remaining_chunk_size = strtol (line, &endl, 16);
 315               xfree (line);
 316
 317               if (remaining_chunk_size == 0)
 318                 {
 319                   ret = 0;
 320                   line = fd_read_line (fd);
 321                   if (line == NULL)
 322                     ret = -1;
 323                   else
 324                     {
 325                       if (out2 != NULL)
 326                         fwrite (line, 1, strlen (line), out2);
 327                       xfree (line);
 328                     }
 329                   break;
 330                 }
 331             }
 332
 333           rdsize = MIN (remaining_chunk_size, dlbufsize);
 334         }
 335       else
 336         rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize;
 337
 338       if (progress_interactive)
 339         {
 340           /* For interactive progress gauges, always specify a ~1s
 341              timeout, so that the gauge can be updated regularly even
 342              when the data arrives very slowly or stalls.  */
 343           tmout = 0.95;
 344           if (opt.read_timeout)
 345             {
 346               double waittm;
 347               waittm = ptimer_read (timer) - last_successful_read_tm;
 348               if (waittm + tmout > opt.read_timeout)
 349                 {
 350                   /* Don't let total idle time exceed read timeout. */
 351                   tmout = opt.read_timeout - waittm;
 352                   if (tmout < 0)
 353                     {
 354                       /* We've already exceeded the timeout. */
 355                       ret = -1, errno = ETIMEDOUT;
 356                       break;
 357                     }
 358                 }
 359             }
 360         }
 361       ret = fd_read (fd, dlbuf, rdsize, tmout);
 362
 363       if (progress_interactive && ret < 0 && errno == ETIMEDOUT)
 364         ret = 0;                /* interactive timeout, handled above */
 365       else if (ret <= 0)
 366         break;                  /* EOF or read error */
 367
 368       if (progress || opt.limit_rate || elapsed)
 369         {
 370           ptimer_measure (timer);
 371           if (ret > 0)
 372             last_successful_read_tm = ptimer_read (timer);
 373         }
 374
 375       if (ret > 0)
 376         {
 377           sum_read += ret;
 378           int write_res = write_data (out, out2, dlbuf, ret, &skip, &sum_written);
 379           if (write_res < 0)
 380             {
 381               ret = (write_res == -3) ? -3 : -2;
 382               goto out;
 383             }
 384           if (chunked)
 385             {
 386               remaining_chunk_size -= ret;
 387               if (remaining_chunk_size == 0)
 388                 {
 389                   char *line = fd_read_line (fd);
 390                   if (line == NULL)
 391                     {
 392                       ret = -1;
 393                       break;
 394                     }
 395                   else
 396                     {
 397                       if (out2 != NULL)
 398                         fwrite (line, 1, strlen (line), out2);
 399                       xfree (line);
 400                     }
 401                 }
 402             }
 403         }
 404
 405       if (opt.limit_rate)
 406         limit_bandwidth (ret, timer);
 407
 408       if (progress)
 409         progress_update (progress, ret, ptimer_read (timer));
 410 #ifdef WINDOWS
 411       if (toread > 0 && !opt.quiet)
 412         ws_percenttitle (100.0 *
 413                          (startpos + sum_read) / (startpos + toread));
 414 #endif
 415     }
 416   if (ret < -1)
 417     ret = -1;
 418
 419  out:
 420   if (progress)
 421     progress_finish (progress, ptimer_read (timer));
 422
 423   if (elapsed)
 424     *elapsed = ptimer_read (timer);
 425   if (timer)
 426     ptimer_destroy (timer);
 427
 428   if (qtyread)
 429     *qtyread += sum_read;
 430   if (qtywritten)
 431     *qtywritten += sum_written;
 432
 433   free (dlbuf);
 434
 435   return ret;
 436 }
 437 \f
 438 /* Read a hunk of data from FD, up until a terminator.  The hunk is
 439    limited by whatever the TERMINATOR callback chooses as its
 440    terminator.  For example, if terminator stops at newline, the hunk
 441    will consist of a line of data; if terminator stops at two
 442    newlines, it can be used to read the head of an HTTP response.
 443    Upon determining the boundary, the function returns the data (up to
 444    the terminator) in malloc-allocated storage.
 445
 446    In case of read error, NULL is returned.  In case of EOF and no
 447    data read, NULL is returned and errno set to 0.  In case of having
 448    read some data, but encountering EOF before seeing the terminator,
 449    the data that has been read is returned, but it will (obviously)
 450    not contain the terminator.
 451
 452    The TERMINATOR function is called with three arguments: the
 453    beginning of the data read so far, the beginning of the current
 454    block of peeked-at data, and the length of the current block.
 455    Depending on its needs, the function is free to choose whether to
 456    analyze all data or just the newly arrived data.  If TERMINATOR
 457    returns NULL, it means that the terminator has not been seen.
 458    Otherwise it should return a pointer to the charactre immediately
 459    following the terminator.
 460
 461    The idea is to be able to read a line of input, or otherwise a hunk
 462    of text, such as the head of an HTTP request, without crossing the
 463    boundary, so that the next call to fd_read etc. reads the data
 464    after the hunk.  To achieve that, this function does the following:
 465
 466    1. Peek at incoming data.
 467
 468    2. Determine whether the peeked data, along with the previously
 469       read data, includes the terminator.
 470
 471       2a. If yes, read the data until the end of the terminator, and
 472           exit.
 473
 474       2b. If no, read the peeked data and goto 1.
 475
 476    The function is careful to assume as little as possible about the
 477    implementation of peeking.  For example, every peek is followed by
 478    a read.  If the read returns a different amount of data, the
 479    process is retried until all data arrives safely.
 480
 481    SIZEHINT is the buffer size sufficient to hold all the data in the
 482    typical case (it is used as the initial buffer size).  MAXSIZE is
 483    the maximum amount of memory this function is allowed to allocate,
 484    or 0 if no upper limit is to be enforced.
 485
 486    This function should be used as a building block for other
 487    functions -- see fd_read_line as a simple example.  */
 488
 489 char *
 490 fd_read_hunk (int fd, hunk_terminator_t terminator, long sizehint, long maxsize)
 491 {
 492   long bufsize = sizehint;
 493   char *hunk = xmalloc (bufsize);
 494   int tail = 0;                 /* tail position in HUNK */
 495
 496   assert (!maxsize || maxsize >= bufsize);
 497
 498   while (1)
 499     {
 500       const char *end;
 501       int pklen, rdlen, remain;
 502
 503       /* First, peek at the available data. */
 504
 505       pklen = fd_peek (fd, hunk + tail, bufsize - 1 - tail, -1);
 506       if (pklen < 0)
 507         {
 508           xfree (hunk);
 509           return NULL;
 510         }
 511       end = terminator (hunk, hunk + tail, pklen);
 512       if (end)
 513         {
 514           /* The data contains the terminator: we'll drain the data up
 515              to the end of the terminator.  */
 516           remain = end - (hunk + tail);
 517           assert (remain >= 0);
 518           if (remain == 0)
 519             {
 520               /* No more data needs to be read. */
 521               hunk[tail] = '\0';
 522               return hunk;
 523             }
 524           if (bufsize - 1 < tail + remain)
 525             {
 526               bufsize = tail + remain + 1;
 527               hunk = xrealloc (hunk, bufsize);
 528             }
 529         }
 530       else
 531         /* No terminator: simply read the data we know is (or should
 532            be) available.  */
 533         remain = pklen;
 534
 535       /* Now, read the data.  Note that we make no assumptions about
 536          how much data we'll get.  (Some TCP stacks are notorious for
 537          read returning less data than the previous MSG_PEEK.)  */
 538
 539       rdlen = fd_read (fd, hunk + tail, remain, 0);
 540       if (rdlen < 0)
 541         {
 542           xfree_null (hunk);
 543           return NULL;
 544         }
 545       tail += rdlen;
 546       hunk[tail] = '\0';
 547
 548       if (rdlen == 0)
 549         {
 550           if (tail == 0)
 551             {
 552               /* EOF without anything having been read */
 553               xfree (hunk);
 554               errno = 0;
 555               return NULL;
 556             }
 557           else
 558             /* EOF seen: return the data we've read. */
 559             return hunk;
 560         }
 561       if (end && rdlen == remain)
 562         /* The terminator was seen and the remaining data drained --
 563            we got what we came for.  */
 564         return hunk;
 565
 566       /* Keep looping until all the data arrives. */
 567
 568       if (tail == bufsize - 1)
 569         {
 570           /* Double the buffer size, but refuse to allocate more than
 571              MAXSIZE bytes.  */
 572           if (maxsize && bufsize >= maxsize)
 573             {
 574               xfree (hunk);
 575               errno = ENOMEM;
 576               return NULL;
 577             }
 578           bufsize <<= 1;
 579           if (maxsize && bufsize > maxsize)
 580             bufsize = maxsize;
 581           hunk = xrealloc (hunk, bufsize);
 582         }
 583     }
 584 }
 585
 586 static const char *
 587 line_terminator (const char *start, const char *peeked, int peeklen)
 588 {
 589   const char *p = memchr (peeked, '\n', peeklen);
 590   if (p)
 591     /* p+1 because the line must include '\n' */
 592     return p + 1;
 593   return NULL;
 594 }
 595
 596 /* The maximum size of the single line we agree to accept.  This is
 597    not meant to impose an arbitrary limit, but to protect the user
 598    from Wget slurping up available memory upon encountering malicious
 599    or buggy server output.  Define it to 0 to remove the limit.  */
 600 #define FD_READ_LINE_MAX 4096
 601
 602 /* Read one line from FD and return it.  The line is allocated using
 603    malloc, but is never larger than FD_READ_LINE_MAX.
 604
 605    If an error occurs, or if no data can be read, NULL is returned.
 606    In the former case errno indicates the error condition, and in the
 607    latter case, errno is NULL.  */
 608
 609 char *
 610 fd_read_line (int fd)
 611 {
 612   return fd_read_hunk (fd, line_terminator, 128, FD_READ_LINE_MAX);
 613 }
 614 \f
 615 /* Return a printed representation of the download rate, along with
 616    the units appropriate for the download speed.  */
 617
 618 const char *
 619 retr_rate (wgint bytes, double secs)
 620 {
 621   static char res[20];
 622   static const char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 623   static const char *rate_names_bits[] = {"b/s", "Kb/s", "Mb/s", "Gb/s" };
 624   int units;
 625
 626   double dlrate = calc_rate (bytes, secs, &units);
 627   /* Use more digits for smaller numbers (regardless of unit used),
 628      e.g. "1022", "247", "12.5", "2.38".  */
 629   sprintf (res, "%.*f %s",
 630            dlrate >= 99.95 ? 0 : dlrate >= 9.995 ? 1 : 2,
 631            dlrate, !opt.report_bps ? rate_names[units]: rate_names_bits[units]);
 632
 633   return res;
 634 }
 635
 636 /* Calculate the download rate and trim it as appropriate for the
 637    speed.  Appropriate means that if rate is greater than 1K/s,
 638    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 639    are used.
 640
 641    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 642    GB/s.  */
 643
 644 double
 645 calc_rate (wgint bytes, double secs, int *units)
 646 {
 647   double dlrate;
 648   double bibyte = 1000.0;
 649
 650   if (!opt.report_bps)
 651     bibyte = 1024.0;
 652
 653
 654   assert (secs >= 0);
 655   assert (bytes >= 0);
 656
 657   if (secs == 0)
 658     /* If elapsed time is exactly zero, it means we're under the
 659        resolution of the timer.  This can easily happen on systems
 660        that use time() for the timer.  Since the interval lies between
 661        0 and the timer's resolution, assume half the resolution.  */
 662     secs = ptimer_resolution () / 2.0;
 663
 664   dlrate = convert_to_bits (bytes) / secs;
 665   if (dlrate < bibyte)
 666     *units = 0;
 667   else if (dlrate < (bibyte * bibyte))
 668     *units = 1, dlrate /= bibyte;
 669   else if (dlrate < (bibyte * bibyte * bibyte))
 670     *units = 2, dlrate /= (bibyte * bibyte);
 671
 672   else
 673     /* Maybe someone will need this, one day. */
 674     *units = 3, dlrate /= (bibyte * bibyte * bibyte);
 675
 676   return dlrate;
 677 }
 678 \f
 679
/* Stash the request method and body settings (opt.method,
   opt.body_data, opt.body_file) and clear them in OPT, marking the
   method as suspended.  Expands to code that requires the locals
   method_suspended, saved_body_data, saved_body_file_name and
   saved_method to be in scope at the point of use.  */
#define SUSPEND_METHOD do {                     \
  method_suspended = true;                      \
  saved_body_data = opt.body_data;              \
  saved_body_file_name = opt.body_file;         \
  saved_method = opt.method;                    \
  opt.body_data = NULL;                         \
  opt.body_file = NULL;                         \
  opt.method = NULL;                            \
} while (0)

/* Undo SUSPEND_METHOD: if the method is currently suspended, copy the
   saved values back into OPT and clear the suspended flag.  Does
   nothing otherwise, so it can be used unconditionally.  Requires the
   same locals as SUSPEND_METHOD.  */
#define RESTORE_METHOD do {                             \
  if (method_suspended)                                 \
    {                                                   \
      opt.body_data = saved_body_data;                  \
      opt.body_file = saved_body_file_name;             \
      opt.method = saved_method;                        \
      method_suspended = false;                         \
    }                                                   \
} while (0)
 699
 700 static char *getproxy (struct url *);
 701
 702 /* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
 703    proxy, etc.  */
 704
 705 /* #### This function should be rewritten so it doesn't return from
 706    multiple points. */
 707
 708 uerr_t
 709 retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
 710               char **newloc, const char *refurl, int *dt, bool recursive,
 711               struct iri *iri, bool register_status)
 712 {
 713   uerr_t result;
 714   char *url;
 715   bool location_changed;
 716   bool iri_fallbacked = 0;
 717   int dummy;
 718   char *mynewloc, *proxy;
 719   struct url *u = orig_parsed, *proxy_url;
 720   int up_error_code;            /* url parse error code */
 721   char *local_file;
 722   int redirection_count = 0;
 723
 724   bool method_suspended = false;
 725   char *saved_body_data = NULL;
 726   char *saved_method = NULL;
 727   char *saved_body_file_name = NULL;
 728
 729   /* If dt is NULL, use local storage.  */
 730   if (!dt)
 731     {
 732       dt = &dummy;
 733       dummy = 0;
 734     }
 735   url = xstrdup (origurl);
 736   if (newloc)
 737     *newloc = NULL;
 738   if (file)
 739     *file = NULL;
 740
 741   if (!refurl)
 742     refurl = opt.referer;
 743
 744  redirected:
 745   /* (also for IRI fallbacking) */
 746
 747   result = NOCONERROR;
 748   mynewloc = NULL;
 749   local_file = NULL;
 750   proxy_url = NULL;
 751
 752   proxy = getproxy (u);
 753   if (proxy)
 754     {
 755       struct iri *pi = iri_new ();
 756       set_uri_encoding (pi, opt.locale, true);
 757       pi->utf8_encode = false;
 758
 759       /* Parse the proxy URL.  */
 760       proxy_url = url_parse (proxy, &up_error_code, NULL, true);
 761       if (!proxy_url)
 762         {
 763           char *error = url_error (proxy, up_error_code);
 764           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 765                      proxy, error);
 766           xfree (url);
 767           xfree (error);
 768           RESTORE_METHOD;
 769           result = PROXERR;
 770           goto bail;
 771         }
 772       if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
 773         {
 774           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 775           url_free (proxy_url);
 776           xfree (url);
 777           RESTORE_METHOD;
 778           result = PROXERR;
 779           goto bail;
 780         }
 781     }
 782
 783   if (u->scheme == SCHEME_HTTP
 784 #ifdef HAVE_SSL
 785       || u->scheme == SCHEME_HTTPS
 786 #endif
 787       || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
 788     {
 789       result = http_loop (u, orig_parsed, &mynewloc, &local_file, refurl, dt,
 790                           proxy_url, iri);
 791     }
 792   else if (u->scheme == SCHEME_FTP)
 793     {
 794       /* If this is a redirection, temporarily turn off opt.ftp_glob
 795          and opt.recursive, both being undesirable when following
 796          redirects.  */
 797       bool oldrec = recursive, glob = opt.ftp_glob;
 798       if (redirection_count)
 799         oldrec = glob = false;
 800
 801       result = ftp_loop (u, &local_file, dt, proxy_url, recursive, glob);
 802       recursive = oldrec;
 803
 804       /* There is a possibility of having HTTP being redirected to
 805          FTP.  In these cases we must decide whether the text is HTML
 806          according to the suffix.  The HTML suffixes are `.html',
 807          `.htm' and a few others, case-insensitive.  */
 808       if (redirection_count && local_file && u->scheme == SCHEME_FTP)
 809         {
 810           if (has_html_suffix_p (local_file))
 811             *dt |= TEXTHTML;
 812         }
 813     }
 814
 815   if (proxy_url)
 816     {
 817       url_free (proxy_url);
 818       proxy_url = NULL;
 819     }
 820
 821   location_changed = (result == NEWLOCATION || result == NEWLOCATION_KEEP_POST);
 822   if (location_changed)
 823     {
 824       char *construced_newloc;
 825       struct url *newloc_parsed;
 826
 827       assert (mynewloc != NULL);
 828
 829       if (local_file)
 830         xfree (local_file);
 831
 832       /* The HTTP specs only allow absolute URLs to appear in
 833          redirects, but a ton of boneheaded webservers and CGIs out
 834          there break the rules and use relative URLs, and popular
 835          browsers are lenient about this, so wget should be too. */
 836       construced_newloc = uri_merge (url, mynewloc);
 837       xfree (mynewloc);
 838       mynewloc = construced_newloc;
 839
 840       /* Reset UTF-8 encoding state, keep the URI encoding and reset
 841          the content encoding. */
 842       iri->utf8_encode = opt.enable_iri;
 843       set_content_encoding (iri, NULL);
 844       xfree_null (iri->orig_url);
 845       iri->orig_url = NULL;
 846
 847       /* Now, see if this new location makes sense. */
 848       newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
 849       if (!newloc_parsed)
 850         {
 851           char *error = url_error (mynewloc, up_error_code);
 852           logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
 853                      error);
 854           if (orig_parsed != u)
 855             {
 856               url_free (u);
 857             }
 858           xfree (url);
 859           xfree (mynewloc);
 860           xfree (error);
 861           RESTORE_METHOD;
 862           goto bail;
 863         }
 864
 865       /* Now mynewloc will become newloc_parsed->url, because if the
 866          Location contained relative paths like .././something, we
 867          don't want that propagating as url.  */
 868       xfree (mynewloc);
 869       mynewloc = xstrdup (newloc_parsed->url);
 870
 871       /* Check for max. number of redirections.  */
 872       if (++redirection_count > opt.max_redirect)
 873         {
 874           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
 875                      opt.max_redirect);
 876           url_free (newloc_parsed);
 877           if (orig_parsed != u)
 878             {
 879               url_free (u);
 880             }
 881           xfree (url);
 882           xfree (mynewloc);
 883           RESTORE_METHOD;
 884           result = WRONGCODE;
 885           goto bail;
 886         }
 887
 888       xfree (url);
 889       url = mynewloc;
 890       if (orig_parsed != u)
 891         {
 892           url_free (u);
 893         }
 894       u = newloc_parsed;
 895
 896       /* If we're being redirected from POST, and we received a
 897          redirect code different than 307, we don't want to POST
 898          again.  Many requests answer POST with a redirection to an
 899          index page; that redirection is clearly a GET.  We "suspend"
 900          POST data for the duration of the redirections, and restore
 901          it when we're done.
 902
 903          RFC2616 HTTP/1.1 introduces code 307 Temporary Redirect
 904          specifically to preserve the method of the request.
 905          */
 906       if (result != NEWLOCATION_KEEP_POST && !method_suspended)
 907         SUSPEND_METHOD;
 908
 909       goto redirected;
 910     }
 911
 912   /* Try to not encode in UTF-8 if fetching failed */
 913   if (!(*dt & RETROKF) && iri->utf8_encode)
 914     {
 915       iri->utf8_encode = false;
 916       if (orig_parsed != u)
 917         {
 918           url_free (u);
 919         }
 920       u = url_parse (origurl, NULL, iri, true);
 921       if (u)
 922         {
 923           DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
 924           url = xstrdup (u->url);
 925           iri_fallbacked = 1;
 926           goto redirected;
 927         }
 928       else
 929           DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url)));
 930     }
 931
 932   if (local_file && u && *dt & RETROKF)
 933     {
 934       register_download (u->url, local_file);
 935
 936       if (!opt.spider && redirection_count && 0 != strcmp (origurl, u->url))
 937         register_redirection (origurl, u->url);
 938
 939       if (*dt & TEXTHTML)
 940         register_html (local_file);
 941
 942       if (*dt & TEXTCSS)
 943         register_css (local_file);
 944     }
 945
 946   if (file)
 947     *file = local_file ? local_file : NULL;
 948   else
 949     xfree_null (local_file);
 950
 951   if (orig_parsed != u)
 952     {
 953       url_free (u);
 954     }
 955
 956   if (redirection_count || iri_fallbacked)
 957     {
 958       if (newloc)
 959         *newloc = url;
 960       else
 961         xfree (url);
 962     }
 963   else
 964     {
 965       if (newloc)
 966         *newloc = NULL;
 967       xfree (url);
 968     }
 969
 970   RESTORE_METHOD;
 971
 972 bail:
 973   if (register_status)
 974     inform_exit_status (result);
 975   return result;
 976 }
 977
 978 /* Find the URLs in the file and call retrieve_url() for each of them.
 979    If HTML is true, treat the file as HTML, and construct the URLs
 980    accordingly.
 981
 982    If opt.recursive is set, call retrieve_tree() for each file.  */
 983
 984 uerr_t
 985 retrieve_from_file (const char *file, bool html, int *count)
 986 {
 987   uerr_t status;
 988   struct urlpos *url_list, *cur_url;
 989   struct iri *iri = iri_new();
 990
 991   char *input_file, *url_file = NULL;
 992   const char *url = file;
 993
 994   status = RETROK;             /* Suppose everything is OK.  */
 995   *count = 0;                  /* Reset the URL count.  */
 996
 997   /* sXXXav : Assume filename and links in the file are in the locale */
 998   set_uri_encoding (iri, opt.locale, true);
 999   set_content_encoding (iri, opt.locale);
1000
1001   if (url_valid_scheme (url))
1002     {
1003       int dt,url_err;
1004       uerr_t status;
1005       struct url *url_parsed = url_parse (url, &url_err, iri, true);
1006       if (!url_parsed)
1007         {
1008           char *error = url_error (url, url_err);
1009           logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
1010           xfree (error);
1011           return URLERROR;
1012         }
1013
1014       if (!opt.base_href)
1015         opt.base_href = xstrdup (url);
1016
1017       status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
1018                              false, iri, true);
1019       url_free (url_parsed);
1020
1021       if (!url_file || (status != RETROK))
1022         return status;
1023
1024       if (dt & TEXTHTML)
1025         html = true;
1026
1027       /* If we have a found a content encoding, use it.
1028        * ( == is okay, because we're checking for identical object) */
1029       if (iri->content_encoding != opt.locale)
1030           set_uri_encoding (iri, iri->content_encoding, false);
1031
1032       /* Reset UTF-8 encode status */
1033       iri->utf8_encode = opt.enable_iri;
1034       xfree_null (iri->orig_url);
1035       iri->orig_url = NULL;
1036
1037       input_file = url_file;
1038     }
1039   else
1040     input_file = (char *) file;
1041
1042   url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
1043               : get_urls_file (input_file));
1044
1045   xfree_null (url_file);
1046
1047   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
1048     {
1049       char *filename = NULL, *new_file = NULL;
1050       int dt;
1051       struct iri *tmpiri = iri_dup (iri);
1052       struct url *parsed_url = NULL;
1053
1054       if (cur_url->ignore_when_downloading)
1055         continue;
1056
1057       if (opt.quota && total_downloaded_bytes > opt.quota)
1058         {
1059           status = QUOTEXC;
1060           break;
1061         }
1062
1063       parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true);
1064
1065       if ((opt.recursive || opt.page_requisites)
1066           && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
1067         {
1068           int old_follow_ftp = opt.follow_ftp;
1069
1070           /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
1071           if (cur_url->url->scheme == SCHEME_FTP)
1072             opt.follow_ftp = 1;
1073
1074           status = retrieve_tree (parsed_url ? parsed_url : cur_url->url,
1075                                   tmpiri);
1076
1077           opt.follow_ftp = old_follow_ftp;
1078         }
1079       else
1080         status = retrieve_url (parsed_url ? parsed_url : cur_url->url,
1081                                cur_url->url->url, &filename,
1082                                &new_file, NULL, &dt, opt.recursive, tmpiri,
1083                                true);
1084
1085       if (parsed_url)
1086           url_free (parsed_url);
1087
1088       if (filename && opt.delete_after && file_exists_p (filename))
1089         {
1090           DEBUGP (("\
1091 Removing file due to --delete-after in retrieve_from_file():\n"));
1092           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
1093           if (unlink (filename))
1094             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
1095           dt &= ~RETROKF;
1096         }
1097
1098       xfree_null (new_file);
1099       xfree_null (filename);
1100       iri_free (tmpiri);
1101     }
1102
1103   /* Free the linked list of URL-s.  */
1104   free_urlpos (url_list);
1105
1106   iri_free (iri);
1107
1108   return status;
1109 }
1110
1111 /* Print `giving up', or `retrying', depending on the impending
1112    action.  N1 and N2 are the attempt number and the attempt limit.  */
1113 void
1114 printwhat (int n1, int n2)
1115 {
1116   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
1117 }
1118
1119 /* If opt.wait or opt.waitretry are specified, and if certain
1120    conditions are met, sleep the appropriate number of seconds.  See
1121    the documentation of --wait and --waitretry for more information.
1122
1123    COUNT is the count of current retrieval, beginning with 1. */
1124
1125 void
1126 sleep_between_retrievals (int count)
1127 {
1128   static bool first_retrieval = true;
1129
1130   if (first_retrieval)
1131     {
1132       /* Don't sleep before the very first retrieval. */
1133       first_retrieval = false;
1134       return;
1135     }
1136
1137   if (opt.waitretry && count > 1)
1138     {
1139       /* If opt.waitretry is specified and this is a retry, wait for
1140          COUNT-1 number of seconds, or for opt.waitretry seconds.  */
1141       if (count <= opt.waitretry)
1142         xsleep (count - 1);
1143       else
1144         xsleep (opt.waitretry);
1145     }
1146   else if (opt.wait)
1147     {
1148       if (!opt.random_wait || count > 1)
1149         /* If random-wait is not specified, or if we are sleeping
1150            between retries of the same download, sleep the fixed
1151            interval.  */
1152         xsleep (opt.wait);
1153       else
1154         {
1155           /* Sleep a random amount of time averaging in opt.wait
1156              seconds.  The sleeping amount ranges from 0.5*opt.wait to
1157              1.5*opt.wait.  */
1158           double waitsecs = (0.5 + random_float ()) * opt.wait;
1159           DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
1160                    opt.wait, waitsecs));
1161           xsleep (waitsecs);
1162         }
1163     }
1164 }
1165
1166 /* Free the linked list of urlpos.  */
1167 void
1168 free_urlpos (struct urlpos *l)
1169 {
1170   while (l)
1171     {
1172       struct urlpos *next = l->next;
1173       if (l->url)
1174         url_free (l->url);
1175       xfree_null (l->local_name);
1176       xfree (l);
1177       l = next;
1178     }
1179 }
1180
1181 /* Rotate FNAME opt.backups times */
1182 void
1183 rotate_backups(const char *fname)
1184 {
1185   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1186   char *from = (char *)alloca (maxlen);
1187   char *to = (char *)alloca (maxlen);
1188   struct_stat sb;
1189   int i;
1190
1191   if (stat (fname, &sb) == 0)
1192     if (S_ISREG (sb.st_mode) == 0)
1193       return;
1194
1195   for (i = opt.backups; i > 1; i--)
1196     {
1197       sprintf (from, "%s.%d", fname, i - 1);
1198       sprintf (to, "%s.%d", fname, i);
1199       rename (from, to);
1200     }
1201
1202   sprintf (to, "%s.%d", fname, 1);
1203   rename(fname, to);
1204 }
1205
1206 static bool no_proxy_match (const char *, const char **);
1207
1208 /* Return the URL of the proxy appropriate for url U.  */
1209
1210 static char *
1211 getproxy (struct url *u)
1212 {
1213   char *proxy = NULL;
1214   char *rewritten_url;
1215   static char rewritten_storage[1024];
1216
1217   if (!opt.use_proxy)
1218     return NULL;
1219   if (no_proxy_match (u->host, (const char **)opt.no_proxy))
1220     return NULL;
1221
1222   switch (u->scheme)
1223     {
1224     case SCHEME_HTTP:
1225       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1226       break;
1227 #ifdef HAVE_SSL
1228     case SCHEME_HTTPS:
1229       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1230       break;
1231 #endif
1232     case SCHEME_FTP:
1233       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1234       break;
1235     case SCHEME_INVALID:
1236       break;
1237     }
1238   if (!proxy || !*proxy)
1239     return NULL;
1240
1241   /* Handle shorthands.  `rewritten_storage' is a kludge to allow
1242      getproxy() to return static storage. */
1243   rewritten_url = rewrite_shorthand_url (proxy);
1244   if (rewritten_url)
1245     {
1246       strncpy (rewritten_storage, rewritten_url, sizeof (rewritten_storage));
1247       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1248       proxy = rewritten_storage;
1249     }
1250
1251   return proxy;
1252 }
1253
/* Tell whether U would be downloaded through a proxy.  A NULL U
   never is.  */

bool
url_uses_proxy (struct url * u)
{
  return u != NULL && getproxy (u) != NULL;
}
1265
/* Return true if HOST is matched by the NO_PROXY suffix list, i.e.
   the result of sufmatch() on it.  A NULL list matches nothing.  */
static bool
no_proxy_match (const char *host, const char **no_proxy)
{
  return no_proxy != NULL && sufmatch (no_proxy, host);
}
1275
1276 /* Set the file parameter to point to the local file string.  */
1277 void
1278 set_local_file (const char **file, const char *default_file)
1279 {
1280   if (opt.output_document)
1281     {
1282       if (output_stream_regular)
1283         *file = opt.output_document;
1284     }
1285   else
1286     *file = default_file;
1287 }
1288
/* Return true for an input file's own URL, false otherwise.  True is
   returned at most once: only for the first call in which INPUT_FILE
   is non-NULL and has a URL scheme.  */
bool
input_file_url (const char *input_file)
{
  static bool first = true;

  if (!input_file || !url_has_scheme (input_file) || !first)
    return false;

  first = false;
  return true;
}