sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   3    2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation,
   4    Inc.
   5
   6 This file is part of GNU Wget.
   7
   8 GNU Wget is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 3 of the License, or (at
  11 your option) any later version.
  12
  13 GNU Wget is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  20
  21 Additional permission under GNU GPL version 3 section 7
  22
  23 If you modify this program, or any covered work, by linking or
  24 combining it with the OpenSSL project's OpenSSL library (or a
  25 modified version of that library), containing parts covered by the
  26 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  27 grants you additional permission to convey the resulting work.
  28 Corresponding Source for a non-source form of such a combination
  29 shall include the source code for the parts of OpenSSL used as well
  30 as that of the covered work.  */
  31
  32 #include "wget.h"
  33
  34 #include <stdio.h>
  35 #include <stdlib.h>
  36 #include <unistd.h>
  37 #include <errno.h>
  38 #include <string.h>
  39 #include <assert.h>
  40 #ifdef VMS
  41 # include <unixio.h>            /* For delete(). */
  42 #endif
  43
  44 #include "exits.h"
  45 #include "utils.h"
  46 #include "retr.h"
  47 #include "progress.h"
  48 #include "url.h"
  49 #include "recur.h"
  50 #include "ftp.h"
  51 #include "http.h"
  52 #include "host.h"
  53 #include "connect.h"
  54 #include "hash.h"
  55 #include "convert.h"
  56 #include "ptimer.h"
  57 #include "html-url.h"
  58 #include "iri.h"
  59
  60 /* Total size of downloaded files.  Used to enforce quota.  */
  61 SUM_SIZE_INT total_downloaded_bytes;
  62
  63 /* Total download time in seconds. */
  64 double total_download_time;
  65
  66 /* If non-NULL, the stream to which output should be written.  This
  67    stream is initialized when `-O' is used.  */
  68 FILE *output_stream;
  69
  70 /* Whether output_document is a regular file we can manipulate,
  71    i.e. not `-' or a device file. */
  72 bool output_stream_regular;
  73 \f
/* Bookkeeping for --limit-rate throttling; it persists across calls
   to limit_bandwidth and is cleared by limit_bandwidth_reset.  */
static struct {
  wgint chunk_bytes;            /* bytes accumulated since chunk_start */
  double chunk_start;           /* timer reading when the current chunk began */
  double sleep_adjust;          /* correction added to the next sleep, derived
                                   from how far the last sleep missed */
} limit_data;
  79
  80 static void
  81 limit_bandwidth_reset (void)
  82 {
  83   xzero (limit_data);
  84 }
  85
  86 /* Limit the bandwidth by pausing the download for an amount of time.
  87    BYTES is the number of bytes received from the network, and TIMER
  88    is the timer that started at the beginning of download.  */
  89
  90 static void
  91 limit_bandwidth (wgint bytes, struct ptimer *timer)
  92 {
  93   double delta_t = ptimer_read (timer) - limit_data.chunk_start;
  94   double expected;
  95
  96   limit_data.chunk_bytes += bytes;
  97
  98   /* Calculate the amount of time we expect downloading the chunk
  99      should take.  If in reality it took less time, sleep to
 100      compensate for the difference.  */
 101   expected = (double) limit_data.chunk_bytes / opt.limit_rate;
 102
 103   if (expected > delta_t)
 104     {
 105       double slp = expected - delta_t + limit_data.sleep_adjust;
 106       double t0, t1;
 107       if (slp < 0.2)
 108         {
 109           DEBUGP (("deferring a %.2f ms sleep (%s/%.2f).\n",
 110                    slp * 1000, number_to_static_string (limit_data.chunk_bytes),
 111                    delta_t));
 112           return;
 113         }
 114       DEBUGP (("\nsleeping %.2f ms for %s bytes, adjust %.2f ms\n",
 115                slp * 1000, number_to_static_string (limit_data.chunk_bytes),
 116                limit_data.sleep_adjust));
 117
 118       t0 = ptimer_read (timer);
 119       xsleep (slp);
 120       t1 = ptimer_measure (timer);
 121
 122       /* Due to scheduling, we probably slept slightly longer (or
 123          shorter) than desired.  Calculate the difference between the
 124          desired and the actual sleep, and adjust the next sleep by
 125          that amount.  */
 126       limit_data.sleep_adjust = slp - (t1 - t0);
 127       /* If sleep_adjust is very large, it's likely due to suspension
 128          and not clock inaccuracy.  Don't enforce those.  */
 129       if (limit_data.sleep_adjust > 0.5)
 130         limit_data.sleep_adjust = 0.5;
 131       else if (limit_data.sleep_adjust < -0.5)
 132         limit_data.sleep_adjust = -0.5;
 133     }
 134
 135   limit_data.chunk_bytes = 0;
 136   limit_data.chunk_start = ptimer_read (timer);
 137 }
 138
/* Yield the smaller of I and J.  Provided only when MIN is not
   already defined.  One of the arguments is evaluated twice, so do
   not pass expressions with side effects.  */
#ifndef MIN
# define MIN(i, j) ((i) <= (j) ? (i) : (j))
#endif
 142
 143 /* Write data in BUF to OUT.  However, if *SKIP is non-zero, skip that
 144    amount of data and decrease SKIP.  Increment *TOTAL by the amount
 145    of data written.  If OUT2 is not NULL, also write BUF to OUT2.
 146    In case of error writing to OUT, -1 is returned.  In case of error
 147    writing to OUT2, -2 is returned.  Return 1 if the whole BUF was
 148    skipped.  */
 149
 150 static int
 151 write_data (FILE *out, FILE *out2, const char *buf, int bufsize,
 152             wgint *skip, wgint *written)
 153 {
 154   if (out == NULL && out2 == NULL)
 155     return 1;
 156   if (*skip > bufsize)
 157     {
 158       *skip -= bufsize;
 159       return 1;
 160     }
 161   if (*skip)
 162     {
 163       buf += *skip;
 164       bufsize -= *skip;
 165       *skip = 0;
 166       if (bufsize == 0)
 167         return 1;
 168     }
 169
 170   if (out != NULL)
 171     fwrite (buf, 1, bufsize, out);
 172   if (out2 != NULL)
 173     fwrite (buf, 1, bufsize, out2);
 174   *written += bufsize;
 175
 176   /* Immediately flush the downloaded data.  This should not hinder
 177      performance: fast downloads will arrive in large 16K chunks
 178      (which stdio would write out immediately anyway), and slow
 179      downloads wouldn't be limited by disk speed.  */
 180
 181   /* 2005-04-20 SMS.
 182      Perhaps it shouldn't hinder performance, but it sure does, at least
 183      on VMS (more than 2X).  Rather than speculate on what it should or
 184      shouldn't do, it might make more sense to test it.  Even better, it
 185      might be nice to explain what possible benefit it could offer, as
 186      it appears to be a clear invitation to poor performance with no
 187      actual justification.  (Also, why 16K?  Anyone test other values?)
 188   */
 189 #ifndef __VMS
 190   if (out != NULL)
 191     fflush (out);
 192   if (out2 != NULL)
 193     fflush (out2);
 194 #endif /* ndef __VMS */
 195   if (out != NULL && ferror (out))
 196     return -1;
 197   else if (out2 != NULL && ferror (out2))
 198     return -2;
 199   else
 200     return 0;
 201 }
 202
 203 /* Read the contents of file descriptor FD until it the connection
 204    terminates or a read error occurs.  The data is read in portions of
 205    up to 16K and written to OUT as it arrives.  If opt.verbose is set,
 206    the progress is shown.
 207
 208    TOREAD is the amount of data expected to arrive, normally only used
 209    by the progress gauge.
 210
 211    STARTPOS is the position from which the download starts, used by
 212    the progress gauge.  If QTYREAD is non-NULL, the value it points to
 213    is incremented by the amount of data read from the network.  If
 214    QTYWRITTEN is non-NULL, the value it points to is incremented by
 215    the amount of data written to disk.  The time it took to download
 216    the data is stored to ELAPSED.
 217
 218    If OUT2 is non-NULL, the contents is also written to OUT2.
 219    OUT2 will get an exact copy of the response: if this is a chunked
 220    response, everything -- including the chunk headers -- is written
 221    to OUT2.  (OUT will only get the unchunked response.)
 222
 223    The function exits and returns the amount of data read.  In case of
 224    error while reading data, -1 is returned.  In case of error while
 225    writing data to OUT, -2 is returned.  In case of error while writing
 226    data to OUT2, -3 is returned.  */
 227
 228 int
 229 fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
 230               wgint *qtyread, wgint *qtywritten, double *elapsed, int flags,
 231               FILE *out2)
 232 {
 233   int ret = 0;
 234 #undef max
 235 #define max(a,b) ((a) > (b) ? (a) : (b))
 236   int dlbufsize = max (BUFSIZ, 8 * 1024);
 237   char *dlbuf = xmalloc (dlbufsize);
 238
 239   struct ptimer *timer = NULL;
 240   double last_successful_read_tm = 0;
 241
 242   /* The progress gauge, set according to the user preferences. */
 243   void *progress = NULL;
 244
 245   /* Non-zero if the progress gauge is interactive, i.e. if it can
 246      continually update the display.  When true, smaller timeout
 247      values are used so that the gauge can update the display when
 248      data arrives slowly. */
 249   bool progress_interactive = false;
 250
 251   bool exact = !!(flags & rb_read_exactly);
 252
 253   /* Used only by HTTP/HTTPS chunked transfer encoding.  */
 254   bool chunked = flags & rb_chunked_transfer_encoding;
 255   wgint skip = 0;
 256
 257   /* How much data we've read/written.  */
 258   wgint sum_read = 0;
 259   wgint sum_written = 0;
 260   wgint remaining_chunk_size = 0;
 261
 262   if (flags & rb_skip_startpos)
 263     skip = startpos;
 264
 265   if (opt.verbose)
 266     {
 267       /* If we're skipping STARTPOS bytes, pass 0 as the INITIAL
 268          argument to progress_create because the indicator doesn't
 269          (yet) know about "skipping" data.  */
 270       wgint start = skip ? 0 : startpos;
 271       progress = progress_create (start, start + toread);
 272       progress_interactive = progress_interactive_p (progress);
 273     }
 274
 275   if (opt.limit_rate)
 276     limit_bandwidth_reset ();
 277
 278   /* A timer is needed for tracking progress, for throttling, and for
 279      tracking elapsed time.  If either of these are requested, start
 280      the timer.  */
 281   if (progress || opt.limit_rate || elapsed)
 282     {
 283       timer = ptimer_new ();
 284       last_successful_read_tm = 0;
 285     }
 286
 287   /* Use a smaller buffer for low requested bandwidths.  For example,
 288      with --limit-rate=2k, it doesn't make sense to slurp in 16K of
 289      data and then sleep for 8s.  With buffer size equal to the limit,
 290      we never have to sleep for more than one second.  */
 291   if (opt.limit_rate && opt.limit_rate < dlbufsize)
 292     dlbufsize = opt.limit_rate;
 293
 294   /* Read from FD while there is data to read.  Normally toread==0
 295      means that it is unknown how much data is to arrive.  However, if
 296      EXACT is set, then toread==0 means what it says: that no data
 297      should be read.  */
 298   while (!exact || (sum_read < toread))
 299     {
 300       int rdsize;
 301       double tmout = opt.read_timeout;
 302
 303       if (chunked)
 304         {
 305           if (remaining_chunk_size == 0)
 306             {
 307               char *line = fd_read_line (fd);
 308               char *endl;
 309               if (line == NULL)
 310                 {
 311                   ret = -1;
 312                   break;
 313                 }
 314               else if (out2 != NULL)
 315                 fwrite (line, 1, strlen (line), out2);
 316
 317               remaining_chunk_size = strtol (line, &endl, 16);
 318               xfree (line);
 319
 320               if (remaining_chunk_size == 0)
 321                 {
 322                   ret = 0;
 323                   line = fd_read_line (fd);
 324                   if (line == NULL)
 325                     ret = -1;
 326                   else
 327                     {
 328                       if (out2 != NULL)
 329                         fwrite (line, 1, strlen (line), out2);
 330                       xfree (line);
 331                     }
 332                   break;
 333                 }
 334             }
 335
 336           rdsize = MIN (remaining_chunk_size, dlbufsize);
 337         }
 338       else
 339         rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize;
 340
 341       if (progress_interactive)
 342         {
 343           /* For interactive progress gauges, always specify a ~1s
 344              timeout, so that the gauge can be updated regularly even
 345              when the data arrives very slowly or stalls.  */
 346           tmout = 0.95;
 347           if (opt.read_timeout)
 348             {
 349               double waittm;
 350               waittm = ptimer_read (timer) - last_successful_read_tm;
 351               if (waittm + tmout > opt.read_timeout)
 352                 {
 353                   /* Don't let total idle time exceed read timeout. */
 354                   tmout = opt.read_timeout - waittm;
 355                   if (tmout < 0)
 356                     {
 357                       /* We've already exceeded the timeout. */
 358                       ret = -1, errno = ETIMEDOUT;
 359                       break;
 360                     }
 361                 }
 362             }
 363         }
 364       ret = fd_read (fd, dlbuf, rdsize, tmout);
 365
 366       if (progress_interactive && ret < 0 && errno == ETIMEDOUT)
 367         ret = 0;                /* interactive timeout, handled above */
 368       else if (ret <= 0)
 369         break;                  /* EOF or read error */
 370
 371       if (progress || opt.limit_rate || elapsed)
 372         {
 373           ptimer_measure (timer);
 374           if (ret > 0)
 375             last_successful_read_tm = ptimer_read (timer);
 376         }
 377
 378       if (ret > 0)
 379         {
 380           sum_read += ret;
 381           int write_res = write_data (out, out2, dlbuf, ret, &skip, &sum_written);
 382           if (write_res < 0)
 383             {
 384               ret = (write_res == -3) ? -3 : -2;
 385               goto out;
 386             }
 387           if (chunked)
 388             {
 389               remaining_chunk_size -= ret;
 390               if (remaining_chunk_size == 0)
 391                 {
 392                   char *line = fd_read_line (fd);
 393                   if (line == NULL)
 394                     {
 395                       ret = -1;
 396                       break;
 397                     }
 398                   else
 399                     {
 400                       if (out2 != NULL)
 401                         fwrite (line, 1, strlen (line), out2);
 402                       xfree (line);
 403                     }
 404                 }
 405             }
 406         }
 407
 408       if (opt.limit_rate)
 409         limit_bandwidth (ret, timer);
 410
 411       if (progress)
 412         progress_update (progress, ret, ptimer_read (timer));
 413 #ifdef WINDOWS
 414       if (toread > 0 && !opt.quiet)
 415         ws_percenttitle (100.0 *
 416                          (startpos + sum_read) / (startpos + toread));
 417 #endif
 418     }
 419   if (ret < -1)
 420     ret = -1;
 421
 422  out:
 423   if (progress)
 424     progress_finish (progress, ptimer_read (timer));
 425
 426   if (elapsed)
 427     *elapsed = ptimer_read (timer);
 428   if (timer)
 429     ptimer_destroy (timer);
 430
 431   if (qtyread)
 432     *qtyread += sum_read;
 433   if (qtywritten)
 434     *qtywritten += sum_written;
 435
 436   free (dlbuf);
 437
 438   return ret;
 439 }
 440 \f
 441 /* Read a hunk of data from FD, up until a terminator.  The hunk is
 442    limited by whatever the TERMINATOR callback chooses as its
 443    terminator.  For example, if terminator stops at newline, the hunk
 444    will consist of a line of data; if terminator stops at two
 445    newlines, it can be used to read the head of an HTTP response.
 446    Upon determining the boundary, the function returns the data (up to
 447    the terminator) in malloc-allocated storage.
 448
 449    In case of read error, NULL is returned.  In case of EOF and no
 450    data read, NULL is returned and errno set to 0.  In case of having
 451    read some data, but encountering EOF before seeing the terminator,
 452    the data that has been read is returned, but it will (obviously)
 453    not contain the terminator.
 454
 455    The TERMINATOR function is called with three arguments: the
 456    beginning of the data read so far, the beginning of the current
 457    block of peeked-at data, and the length of the current block.
 458    Depending on its needs, the function is free to choose whether to
 459    analyze all data or just the newly arrived data.  If TERMINATOR
 460    returns NULL, it means that the terminator has not been seen.
 461    Otherwise it should return a pointer to the charactre immediately
 462    following the terminator.
 463
 464    The idea is to be able to read a line of input, or otherwise a hunk
 465    of text, such as the head of an HTTP request, without crossing the
 466    boundary, so that the next call to fd_read etc. reads the data
 467    after the hunk.  To achieve that, this function does the following:
 468
 469    1. Peek at incoming data.
 470
 471    2. Determine whether the peeked data, along with the previously
 472       read data, includes the terminator.
 473
 474       2a. If yes, read the data until the end of the terminator, and
 475           exit.
 476
 477       2b. If no, read the peeked data and goto 1.
 478
 479    The function is careful to assume as little as possible about the
 480    implementation of peeking.  For example, every peek is followed by
 481    a read.  If the read returns a different amount of data, the
 482    process is retried until all data arrives safely.
 483
 484    SIZEHINT is the buffer size sufficient to hold all the data in the
 485    typical case (it is used as the initial buffer size).  MAXSIZE is
 486    the maximum amount of memory this function is allowed to allocate,
 487    or 0 if no upper limit is to be enforced.
 488
 489    This function should be used as a building block for other
 490    functions -- see fd_read_line as a simple example.  */
 491
 492 char *
 493 fd_read_hunk (int fd, hunk_terminator_t terminator, long sizehint, long maxsize)
 494 {
 495   long bufsize = sizehint;
 496   char *hunk = xmalloc (bufsize);
 497   int tail = 0;                 /* tail position in HUNK */
 498
 499   assert (!maxsize || maxsize >= bufsize);
 500
 501   while (1)
 502     {
 503       const char *end;
 504       int pklen, rdlen, remain;
 505
 506       /* First, peek at the available data. */
 507
 508       pklen = fd_peek (fd, hunk + tail, bufsize - 1 - tail, -1);
 509       if (pklen < 0)
 510         {
 511           xfree (hunk);
 512           return NULL;
 513         }
 514       end = terminator (hunk, hunk + tail, pklen);
 515       if (end)
 516         {
 517           /* The data contains the terminator: we'll drain the data up
 518              to the end of the terminator.  */
 519           remain = end - (hunk + tail);
 520           assert (remain >= 0);
 521           if (remain == 0)
 522             {
 523               /* No more data needs to be read. */
 524               hunk[tail] = '\0';
 525               return hunk;
 526             }
 527           if (bufsize - 1 < tail + remain)
 528             {
 529               bufsize = tail + remain + 1;
 530               hunk = xrealloc (hunk, bufsize);
 531             }
 532         }
 533       else
 534         /* No terminator: simply read the data we know is (or should
 535            be) available.  */
 536         remain = pklen;
 537
 538       /* Now, read the data.  Note that we make no assumptions about
 539          how much data we'll get.  (Some TCP stacks are notorious for
 540          read returning less data than the previous MSG_PEEK.)  */
 541
 542       rdlen = fd_read (fd, hunk + tail, remain, 0);
 543       if (rdlen < 0)
 544         {
 545           xfree_null (hunk);
 546           return NULL;
 547         }
 548       tail += rdlen;
 549       hunk[tail] = '\0';
 550
 551       if (rdlen == 0)
 552         {
 553           if (tail == 0)
 554             {
 555               /* EOF without anything having been read */
 556               xfree (hunk);
 557               errno = 0;
 558               return NULL;
 559             }
 560           else
 561             /* EOF seen: return the data we've read. */
 562             return hunk;
 563         }
 564       if (end && rdlen == remain)
 565         /* The terminator was seen and the remaining data drained --
 566            we got what we came for.  */
 567         return hunk;
 568
 569       /* Keep looping until all the data arrives. */
 570
 571       if (tail == bufsize - 1)
 572         {
 573           /* Double the buffer size, but refuse to allocate more than
 574              MAXSIZE bytes.  */
 575           if (maxsize && bufsize >= maxsize)
 576             {
 577               xfree (hunk);
 578               errno = ENOMEM;
 579               return NULL;
 580             }
 581           bufsize <<= 1;
 582           if (maxsize && bufsize > maxsize)
 583             bufsize = maxsize;
 584           hunk = xrealloc (hunk, bufsize);
 585         }
 586     }
 587 }
 588
 589 static const char *
 590 line_terminator (const char *start, const char *peeked, int peeklen)
 591 {
 592   const char *p = memchr (peeked, '\n', peeklen);
 593   if (p)
 594     /* p+1 because the line must include '\n' */
 595     return p + 1;
 596   return NULL;
 597 }
 598
/* The maximum size of the single line we agree to accept.  This is
   not meant to impose an arbitrary limit, but to protect the user
   from Wget slurping up available memory upon encountering malicious
   or buggy server output.  Define it to 0 to remove the limit.
   fd_read_line passes this value to fd_read_hunk as MAXSIZE.  */
#define FD_READ_LINE_MAX 4096
 604
 605 /* Read one line from FD and return it.  The line is allocated using
 606    malloc, but is never larger than FD_READ_LINE_MAX.
 607
 608    If an error occurs, or if no data can be read, NULL is returned.
 609    In the former case errno indicates the error condition, and in the
 610    latter case, errno is NULL.  */
 611
 612 char *
 613 fd_read_line (int fd)
 614 {
 615   return fd_read_hunk (fd, line_terminator, 128, FD_READ_LINE_MAX);
 616 }
 617 \f
 618 /* Return a printed representation of the download rate, along with
 619    the units appropriate for the download speed.  */
 620
 621 const char *
 622 retr_rate (wgint bytes, double secs)
 623 {
 624   static char res[20];
 625   static const char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 626   static const char *rate_names_bits[] = {"b/s", "Kb/s", "Mb/s", "Gb/s" };
 627   int units;
 628
 629   double dlrate = calc_rate (bytes, secs, &units);
 630   /* Use more digits for smaller numbers (regardless of unit used),
 631      e.g. "1022", "247", "12.5", "2.38".  */
 632   sprintf (res, "%.*f %s",
 633            dlrate >= 99.95 ? 0 : dlrate >= 9.995 ? 1 : 2,
 634            dlrate, !opt.report_bps ? rate_names[units]: rate_names_bits[units]);
 635
 636   return res;
 637 }
 638
 639 /* Calculate the download rate and trim it as appropriate for the
 640    speed.  Appropriate means that if rate is greater than 1K/s,
 641    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 642    are used.
 643
 644    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 645    GB/s.  */
 646
 647 double
 648 calc_rate (wgint bytes, double secs, int *units)
 649 {
 650   double dlrate;
 651   double bibyte = 1000.0;
 652
 653   if (!opt.report_bps)
 654     bibyte = 1024.0;
 655
 656
 657   assert (secs >= 0);
 658   assert (bytes >= 0);
 659
 660   if (secs == 0)
 661     /* If elapsed time is exactly zero, it means we're under the
 662        resolution of the timer.  This can easily happen on systems
 663        that use time() for the timer.  Since the interval lies between
 664        0 and the timer's resolution, assume half the resolution.  */
 665     secs = ptimer_resolution () / 2.0;
 666
 667   dlrate = convert_to_bits (bytes) / secs;
 668   if (dlrate < bibyte)
 669     *units = 0;
 670   else if (dlrate < (bibyte * bibyte))
 671     *units = 1, dlrate /= bibyte;
 672   else if (dlrate < (bibyte * bibyte * bibyte))
 673     *units = 2, dlrate /= (bibyte * bibyte);
 674
 675   else
 676     /* Maybe someone will need this, one day. */
 677     *units = 3, dlrate /= (bibyte * bibyte * bibyte);
 678
 679   return dlrate;
 680 }
 681 \f
 682
/* Stash opt.body_data, opt.body_file and opt.method in the saved_*
   variables and clear them in OPT, marking the method as suspended.
   Expands in place, so the variables method_suspended,
   saved_body_data, saved_body_file_name and saved_method must be in
   scope at the point of use.  Undone by RESTORE_METHOD.  */
#define SUSPEND_METHOD do {                     \
  method_suspended = true;                      \
  saved_body_data = opt.body_data;              \
  saved_body_file_name = opt.body_file;         \
  saved_method = opt.method;                    \
  opt.body_data = NULL;                         \
  opt.body_file = NULL;                         \
  opt.method = NULL;                            \
} while (0)
 692
/* If the method is currently suspended, copy the saved_* variables
   back into opt.body_data, opt.body_file and opt.method and clear the
   suspended flag.  Does nothing otherwise, so it is safe to invoke on
   every exit path.  Relies on the same in-scope variables as
   SUSPEND_METHOD.  */
#define RESTORE_METHOD do {                             \
  if (method_suspended)                                 \
    {                                                   \
      opt.body_data = saved_body_data;                  \
      opt.body_file = saved_body_file_name;             \
      opt.method = saved_method;                        \
      method_suspended = false;                         \
    }                                                   \
} while (0)
 702
 703 static char *getproxy (struct url *);
 704
/* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
   proxy, etc.  */
 707
 708 /* #### This function should be rewritten so it doesn't return from
 709    multiple points. */
 710
 711 uerr_t
 712 retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
 713               char **newloc, const char *refurl, int *dt, bool recursive,
 714               struct iri *iri, bool register_status)
 715 {
 716   uerr_t result;
 717   char *url;
 718   bool location_changed;
 719   bool iri_fallbacked = 0;
 720   int dummy;
 721   char *mynewloc, *proxy;
 722   struct url *u = orig_parsed, *proxy_url;
 723   int up_error_code;            /* url parse error code */
 724   char *local_file;
 725   int redirection_count = 0;
 726
 727   bool method_suspended = false;
 728   char *saved_body_data = NULL;
 729   char *saved_method = NULL;
 730   char *saved_body_file_name = NULL;
 731
 732   /* If dt is NULL, use local storage.  */
 733   if (!dt)
 734     {
 735       dt = &dummy;
 736       dummy = 0;
 737     }
 738   url = xstrdup (origurl);
 739   if (newloc)
 740     *newloc = NULL;
 741   if (file)
 742     *file = NULL;
 743
 744   if (!refurl)
 745     refurl = opt.referer;
 746
 747  redirected:
 748   /* (also for IRI fallbacking) */
 749
 750   result = NOCONERROR;
 751   mynewloc = NULL;
 752   local_file = NULL;
 753   proxy_url = NULL;
 754
 755   proxy = getproxy (u);
 756   if (proxy)
 757     {
 758       struct iri *pi = iri_new ();
 759       set_uri_encoding (pi, opt.locale, true);
 760       pi->utf8_encode = false;
 761
 762       /* Parse the proxy URL.  */
 763       proxy_url = url_parse (proxy, &up_error_code, NULL, true);
 764       if (!proxy_url)
 765         {
 766           char *error = url_error (proxy, up_error_code);
 767           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 768                      proxy, error);
 769           xfree (url);
 770           xfree (error);
 771           RESTORE_METHOD;
 772           result = PROXERR;
 773           goto bail;
 774         }
 775       if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
 776         {
 777           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 778           url_free (proxy_url);
 779           xfree (url);
 780           RESTORE_METHOD;
 781           result = PROXERR;
 782           goto bail;
 783         }
 784     }
 785
 786   if (u->scheme == SCHEME_HTTP
 787 #ifdef HAVE_SSL
 788       || u->scheme == SCHEME_HTTPS
 789 #endif
 790       || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
 791     {
 792       result = http_loop (u, orig_parsed, &mynewloc, &local_file, refurl, dt,
 793                           proxy_url, iri);
 794     }
 795   else if (u->scheme == SCHEME_FTP)
 796     {
 797       /* If this is a redirection, temporarily turn off opt.ftp_glob
 798          and opt.recursive, both being undesirable when following
 799          redirects.  */
 800       bool oldrec = recursive, glob = opt.ftp_glob;
 801       if (redirection_count)
 802         oldrec = glob = false;
 803
 804       result = ftp_loop (u, &local_file, dt, proxy_url, recursive, glob);
 805       recursive = oldrec;
 806
 807       /* There is a possibility of having HTTP being redirected to
 808          FTP.  In these cases we must decide whether the text is HTML
 809          according to the suffix.  The HTML suffixes are `.html',
 810          `.htm' and a few others, case-insensitive.  */
 811       if (redirection_count && local_file && u->scheme == SCHEME_FTP)
 812         {
 813           if (has_html_suffix_p (local_file))
 814             *dt |= TEXTHTML;
 815         }
 816     }
 817
 818   if (proxy_url)
 819     {
 820       url_free (proxy_url);
 821       proxy_url = NULL;
 822     }
 823
 824   location_changed = (result == NEWLOCATION || result == NEWLOCATION_KEEP_POST);
 825   if (location_changed)
 826     {
 827       char *construced_newloc;
 828       struct url *newloc_parsed;
 829
 830       assert (mynewloc != NULL);
 831
 832       if (local_file)
 833         xfree (local_file);
 834
 835       /* The HTTP specs only allow absolute URLs to appear in
 836          redirects, but a ton of boneheaded webservers and CGIs out
 837          there break the rules and use relative URLs, and popular
 838          browsers are lenient about this, so wget should be too. */
 839       construced_newloc = uri_merge (url, mynewloc);
 840       xfree (mynewloc);
 841       mynewloc = construced_newloc;
 842
 843       /* Reset UTF-8 encoding state, keep the URI encoding and reset
 844          the content encoding. */
 845       iri->utf8_encode = opt.enable_iri;
 846       set_content_encoding (iri, NULL);
 847       xfree_null (iri->orig_url);
 848       iri->orig_url = NULL;
 849
 850       /* Now, see if this new location makes sense. */
 851       newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
 852       if (!newloc_parsed)
 853         {
 854           char *error = url_error (mynewloc, up_error_code);
 855           logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
 856                      error);
 857           if (orig_parsed != u)
 858             {
 859               url_free (u);
 860             }
 861           xfree (url);
 862           xfree (mynewloc);
 863           xfree (error);
 864           RESTORE_METHOD;
 865           goto bail;
 866         }
 867
 868       /* Now mynewloc will become newloc_parsed->url, because if the
 869          Location contained relative paths like .././something, we
 870          don't want that propagating as url.  */
 871       xfree (mynewloc);
 872       mynewloc = xstrdup (newloc_parsed->url);
 873
 874       /* Check for max. number of redirections.  */
 875       if (++redirection_count > opt.max_redirect)
 876         {
 877           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
 878                      opt.max_redirect);
 879           url_free (newloc_parsed);
 880           if (orig_parsed != u)
 881             {
 882               url_free (u);
 883             }
 884           xfree (url);
 885           xfree (mynewloc);
 886           RESTORE_METHOD;
 887           result = WRONGCODE;
 888           goto bail;
 889         }
 890
 891       xfree (url);
 892       url = mynewloc;
 893       if (orig_parsed != u)
 894         {
 895           url_free (u);
 896         }
 897       u = newloc_parsed;
 898
 899       /* If we're being redirected from POST, and we received a
 900          redirect code different than 307, we don't want to POST
 901          again.  Many requests answer POST with a redirection to an
 902          index page; that redirection is clearly a GET.  We "suspend"
 903          POST data for the duration of the redirections, and restore
 904          it when we're done.
 905
 906          RFC2616 HTTP/1.1 introduces code 307 Temporary Redirect
 907          specifically to preserve the method of the request.
 908          */
 909       if (result != NEWLOCATION_KEEP_POST && !method_suspended)
 910         SUSPEND_METHOD;
 911
 912       goto redirected;
 913     }
 914
 915   /* Try to not encode in UTF-8 if fetching failed */
 916   if (!(*dt & RETROKF) && iri->utf8_encode)
 917     {
 918       iri->utf8_encode = false;
 919       if (orig_parsed != u)
 920         {
 921           url_free (u);
 922         }
 923       u = url_parse (origurl, NULL, iri, true);
 924       if (u)
 925         {
 926           DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
 927           url = xstrdup (u->url);
 928           iri_fallbacked = 1;
 929           goto redirected;
 930         }
 931       else
 932           DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url)));
 933     }
 934
 935   if (local_file && u && *dt & RETROKF)
 936     {
 937       register_download (u->url, local_file);
 938
 939       if (!opt.spider && redirection_count && 0 != strcmp (origurl, u->url))
 940         register_redirection (origurl, u->url);
 941
 942       if (*dt & TEXTHTML)
 943         register_html (local_file);
 944
 945       if (*dt & TEXTCSS)
 946         register_css (local_file);
 947     }
 948
 949   if (file)
 950     *file = local_file ? local_file : NULL;
 951   else
 952     xfree_null (local_file);
 953
 954   if (orig_parsed != u)
 955     {
 956       url_free (u);
 957     }
 958
 959   if (redirection_count || iri_fallbacked)
 960     {
 961       if (newloc)
 962         *newloc = url;
 963       else
 964         xfree (url);
 965     }
 966   else
 967     {
 968       if (newloc)
 969         *newloc = NULL;
 970       xfree (url);
 971     }
 972
 973   RESTORE_METHOD;
 974
 975 bail:
 976   if (register_status)
 977     inform_exit_status (result);
 978   return result;
 979 }
 980
 981 /* Find the URLs in the file and call retrieve_url() for each of them.
 982    If HTML is true, treat the file as HTML, and construct the URLs
 983    accordingly.
 984
 985    If opt.recursive is set, call retrieve_tree() for each file.  */
 986
 987 uerr_t
 988 retrieve_from_file (const char *file, bool html, int *count)
 989 {
 990   uerr_t status;
 991   struct urlpos *url_list, *cur_url;
 992   struct iri *iri = iri_new();
 993
 994   char *input_file, *url_file = NULL;
 995   const char *url = file;
 996
 997   status = RETROK;             /* Suppose everything is OK.  */
 998   *count = 0;                  /* Reset the URL count.  */
 999
1000   /* sXXXav : Assume filename and links in the file are in the locale */
1001   set_uri_encoding (iri, opt.locale, true);
1002   set_content_encoding (iri, opt.locale);
1003
1004   if (url_valid_scheme (url))
1005     {
1006       int dt,url_err;
1007       uerr_t status;
1008       struct url *url_parsed = url_parse (url, &url_err, iri, true);
1009       if (!url_parsed)
1010         {
1011           char *error = url_error (url, url_err);
1012           logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
1013           xfree (error);
1014           return URLERROR;
1015         }
1016
1017       if (!opt.base_href)
1018         opt.base_href = xstrdup (url);
1019
1020       status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
1021                              false, iri, true);
1022       url_free (url_parsed);
1023
1024       if (!url_file || (status != RETROK))
1025         return status;
1026
1027       if (dt & TEXTHTML)
1028         html = true;
1029
1030       /* If we have a found a content encoding, use it.
1031        * ( == is okay, because we're checking for identical object) */
1032       if (iri->content_encoding != opt.locale)
1033           set_uri_encoding (iri, iri->content_encoding, false);
1034
1035       /* Reset UTF-8 encode status */
1036       iri->utf8_encode = opt.enable_iri;
1037       xfree_null (iri->orig_url);
1038       iri->orig_url = NULL;
1039
1040       input_file = url_file;
1041     }
1042   else
1043     input_file = (char *) file;
1044
1045   url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
1046               : get_urls_file (input_file));
1047
1048   xfree_null (url_file);
1049
1050   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
1051     {
1052       char *filename = NULL, *new_file = NULL;
1053       int dt;
1054       struct iri *tmpiri = iri_dup (iri);
1055       struct url *parsed_url = NULL;
1056
1057       if (cur_url->ignore_when_downloading)
1058         continue;
1059
1060       if (opt.quota && total_downloaded_bytes > opt.quota)
1061         {
1062           status = QUOTEXC;
1063           break;
1064         }
1065
1066       parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true);
1067
1068       if ((opt.recursive || opt.page_requisites)
1069           && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
1070         {
1071           int old_follow_ftp = opt.follow_ftp;
1072
1073           /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
1074           if (cur_url->url->scheme == SCHEME_FTP)
1075             opt.follow_ftp = 1;
1076
1077           status = retrieve_tree (parsed_url ? parsed_url : cur_url->url,
1078                                   tmpiri);
1079
1080           opt.follow_ftp = old_follow_ftp;
1081         }
1082       else
1083         status = retrieve_url (parsed_url ? parsed_url : cur_url->url,
1084                                cur_url->url->url, &filename,
1085                                &new_file, NULL, &dt, opt.recursive, tmpiri,
1086                                true);
1087
1088       if (parsed_url)
1089           url_free (parsed_url);
1090
1091       if (filename && opt.delete_after && file_exists_p (filename))
1092         {
1093           DEBUGP (("\
1094 Removing file due to --delete-after in retrieve_from_file():\n"));
1095           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
1096           if (unlink (filename))
1097             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
1098           dt &= ~RETROKF;
1099         }
1100
1101       xfree_null (new_file);
1102       xfree_null (filename);
1103       iri_free (tmpiri);
1104     }
1105
1106   /* Free the linked list of URL-s.  */
1107   free_urlpos (url_list);
1108
1109   iri_free (iri);
1110
1111   return status;
1112 }
1113
1114 /* Print `giving up', or `retrying', depending on the impending
1115    action.  N1 and N2 are the attempt number and the attempt limit.  */
1116 void
1117 printwhat (int n1, int n2)
1118 {
1119   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
1120 }
1121
1122 /* If opt.wait or opt.waitretry are specified, and if certain
1123    conditions are met, sleep the appropriate number of seconds.  See
1124    the documentation of --wait and --waitretry for more information.
1125
1126    COUNT is the count of current retrieval, beginning with 1. */
1127
1128 void
1129 sleep_between_retrievals (int count)
1130 {
1131   static bool first_retrieval = true;
1132
1133   if (first_retrieval)
1134     {
1135       /* Don't sleep before the very first retrieval. */
1136       first_retrieval = false;
1137       return;
1138     }
1139
1140   if (opt.waitretry && count > 1)
1141     {
1142       /* If opt.waitretry is specified and this is a retry, wait for
1143          COUNT-1 number of seconds, or for opt.waitretry seconds.  */
1144       if (count <= opt.waitretry)
1145         xsleep (count - 1);
1146       else
1147         xsleep (opt.waitretry);
1148     }
1149   else if (opt.wait)
1150     {
1151       if (!opt.random_wait || count > 1)
1152         /* If random-wait is not specified, or if we are sleeping
1153            between retries of the same download, sleep the fixed
1154            interval.  */
1155         xsleep (opt.wait);
1156       else
1157         {
1158           /* Sleep a random amount of time averaging in opt.wait
1159              seconds.  The sleeping amount ranges from 0.5*opt.wait to
1160              1.5*opt.wait.  */
1161           double waitsecs = (0.5 + random_float ()) * opt.wait;
1162           DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
1163                    opt.wait, waitsecs));
1164           xsleep (waitsecs);
1165         }
1166     }
1167 }
1168
1169 /* Free the linked list of urlpos.  */
1170 void
1171 free_urlpos (struct urlpos *l)
1172 {
1173   while (l)
1174     {
1175       struct urlpos *next = l->next;
1176       if (l->url)
1177         url_free (l->url);
1178       xfree_null (l->local_name);
1179       xfree (l);
1180       l = next;
1181     }
1182 }
1183
1184 /* Rotate FNAME opt.backups times */
1185 void
1186 rotate_backups(const char *fname)
1187 {
1188 #ifdef __VMS
1189 # define SEP "_"
1190 # define AVS ";*"                       /* All-version suffix. */
1191 # define AVSL (sizeof (AVS) - 1)
1192 #else
1193 # define SEP "."
1194 # define AVSL 0
1195 #endif
1196
1197   int maxlen = strlen (fname) + sizeof (SEP) + numdigit (opt.backups) + AVSL;
1198   char *from = (char *)alloca (maxlen);
1199   char *to = (char *)alloca (maxlen);
1200   struct_stat sb;
1201   int i;
1202
1203   if (stat (fname, &sb) == 0)
1204     if (S_ISREG (sb.st_mode) == 0)
1205       return;
1206
1207   for (i = opt.backups; i > 1; i--)
1208     {
1209 #ifdef VMS
1210       /* Delete (all versions of) any existing max-suffix file, to avoid
1211        * creating multiple versions of it.  (On VMS, rename() will
1212        * create a new version of an existing destination file, not
1213        * destroy/overwrite it.)
1214        */
1215       if (i == opt.backups)
1216         {
1217           sprintf (to, "%s%s%d%s", fname, SEP, i, AVS);
1218           delete (to);
1219         }
1220 #endif
1221       sprintf (to, "%s%s%d", fname, SEP, i);
1222       sprintf (from, "%s%s%d", fname, SEP, i - 1);
1223       rename (from, to);
1224     }
1225
1226   sprintf (to, "%s%s%d", fname, SEP, 1);
1227   rename(fname, to);
1228 }
1229
1230 static bool no_proxy_match (const char *, const char **);
1231
1232 /* Return the URL of the proxy appropriate for url U.  */
1233
1234 static char *
1235 getproxy (struct url *u)
1236 {
1237   char *proxy = NULL;
1238   char *rewritten_url;
1239   static char rewritten_storage[1024];
1240
1241   if (!opt.use_proxy)
1242     return NULL;
1243   if (no_proxy_match (u->host, (const char **)opt.no_proxy))
1244     return NULL;
1245
1246   switch (u->scheme)
1247     {
1248     case SCHEME_HTTP:
1249       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1250       break;
1251 #ifdef HAVE_SSL
1252     case SCHEME_HTTPS:
1253       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1254       break;
1255 #endif
1256     case SCHEME_FTP:
1257       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1258       break;
1259     case SCHEME_INVALID:
1260       break;
1261     }
1262   if (!proxy || !*proxy)
1263     return NULL;
1264
1265   /* Handle shorthands.  `rewritten_storage' is a kludge to allow
1266      getproxy() to return static storage. */
1267   rewritten_url = rewrite_shorthand_url (proxy);
1268   if (rewritten_url)
1269     {
1270       strncpy (rewritten_storage, rewritten_url, sizeof (rewritten_storage));
1271       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1272       proxy = rewritten_storage;
1273     }
1274
1275   return proxy;
1276 }
1277
/* Tell whether U would be downloaded through a proxy, i.e. whether
   getproxy() yields a proxy for it.  A NULL U yields false.  */

bool
url_uses_proxy (struct url * u)
{
  return u != NULL && getproxy (u) != NULL;
}
1289
/* Check HOST against the NO_PROXY list.  Returns false when there is
   no list at all; otherwise returns whatever sufmatch() reports for
   the list and HOST.  */
static bool
no_proxy_match (const char *host, const char **no_proxy)
{
  if (no_proxy != NULL)
    return sufmatch (no_proxy, host);

  return false;
}
1299
1300 /* Set the file parameter to point to the local file string.  */
1301 void
1302 set_local_file (const char **file, const char *default_file)
1303 {
1304   if (opt.output_document)
1305     {
1306       if (output_stream_regular)
1307         *file = opt.output_document;
1308     }
1309   else
1310     *file = default_file;
1311 }
1312
/* Return true for an input file's own URL, false otherwise.

   True is returned at most once: on the first call where INPUT_FILE
   is non-NULL and url_has_scheme() accepts it.  Every later call
   returns false.  */
bool
input_file_url (const char *input_file)
{
  static bool unclaimed = true;

  if (!input_file || !url_has_scheme (input_file) || !unclaimed)
    return false;

  unclaimed = false;
  return true;
}