sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   3    2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation,
   4    Inc.
   5
   6 This file is part of GNU Wget.
   7
   8 GNU Wget is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 3 of the License, or (at
  11 your option) any later version.
  12
  13 GNU Wget is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  20
  21 Additional permission under GNU GPL version 3 section 7
  22
  23 If you modify this program, or any covered work, by linking or
  24 combining it with the OpenSSL project's OpenSSL library (or a
  25 modified version of that library), containing parts covered by the
  26 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  27 grants you additional permission to convey the resulting work.
  28 Corresponding Source for a non-source form of such a combination
  29 shall include the source code for the parts of OpenSSL used as well
  30 as that of the covered work.  */
  31
  32 #include "wget.h"
  33
  34 #include <stdio.h>
  35 #include <stdlib.h>
  36 #include <unistd.h>
  37 #include <errno.h>
  38 #include <string.h>
  39 #include <assert.h>
  40
  41 #include "exits.h"
  42 #include "utils.h"
  43 #include "retr.h"
  44 #include "progress.h"
  45 #include "url.h"
  46 #include "recur.h"
  47 #include "ftp.h"
  48 #include "http.h"
  49 #include "host.h"
  50 #include "connect.h"
  51 #include "hash.h"
  52 #include "convert.h"
  53 #include "ptimer.h"
  54 #include "html-url.h"
  55 #include "iri.h"
  56
/* Total size of downloaded files, in bytes.  Used to enforce quota.  */
SUM_SIZE_INT total_downloaded_bytes;

/* Total download time in seconds.  */
double total_download_time;

/* If non-NULL, the stream to which output should be written.  This
   stream is initialized when `-O' is used.  */
FILE *output_stream;

/* Whether output_document is a regular file we can manipulate,
   i.e. not `-' or a device file.  */
bool output_stream_regular;
  70 \f
  71 static struct {
  72   wgint chunk_bytes;
  73   double chunk_start;
  74   double sleep_adjust;
  75 } limit_data;
  76
  77 static void
  78 limit_bandwidth_reset (void)
  79 {
  80   xzero (limit_data);
  81 }
  82
  83 /* Limit the bandwidth by pausing the download for an amount of time.
  84    BYTES is the number of bytes received from the network, and TIMER
  85    is the timer that started at the beginning of download.  */
  86
  87 static void
  88 limit_bandwidth (wgint bytes, struct ptimer *timer)
  89 {
  90   double delta_t = ptimer_read (timer) - limit_data.chunk_start;
  91   double expected;
  92
  93   limit_data.chunk_bytes += bytes;
  94
  95   /* Calculate the amount of time we expect downloading the chunk
  96      should take.  If in reality it took less time, sleep to
  97      compensate for the difference.  */
  98   expected = (double) limit_data.chunk_bytes / opt.limit_rate;
  99
 100   if (expected > delta_t)
 101     {
 102       double slp = expected - delta_t + limit_data.sleep_adjust;
 103       double t0, t1;
 104       if (slp < 0.2)
 105         {
 106           DEBUGP (("deferring a %.2f ms sleep (%s/%.2f).\n",
 107                    slp * 1000, number_to_static_string (limit_data.chunk_bytes),
 108                    delta_t));
 109           return;
 110         }
 111       DEBUGP (("\nsleeping %.2f ms for %s bytes, adjust %.2f ms\n",
 112                slp * 1000, number_to_static_string (limit_data.chunk_bytes),
 113                limit_data.sleep_adjust));
 114
 115       t0 = ptimer_read (timer);
 116       xsleep (slp);
 117       t1 = ptimer_measure (timer);
 118
 119       /* Due to scheduling, we probably slept slightly longer (or
 120          shorter) than desired.  Calculate the difference between the
 121          desired and the actual sleep, and adjust the next sleep by
 122          that amount.  */
 123       limit_data.sleep_adjust = slp - (t1 - t0);
 124       /* If sleep_adjust is very large, it's likely due to suspension
 125          and not clock inaccuracy.  Don't enforce those.  */
 126       if (limit_data.sleep_adjust > 0.5)
 127         limit_data.sleep_adjust = 0.5;
 128       else if (limit_data.sleep_adjust < -0.5)
 129         limit_data.sleep_adjust = -0.5;
 130     }
 131
 132   limit_data.chunk_bytes = 0;
 133   limit_data.chunk_start = ptimer_read (timer);
 134 }
 135
 136 #ifndef MIN
 137 # define MIN(i, j) ((i) <= (j) ? (i) : (j))
 138 #endif
 139
 140 /* Write data in BUF to OUT.  However, if *SKIP is non-zero, skip that
 141    amount of data and decrease SKIP.  Increment *TOTAL by the amount
 142    of data written.  If OUT2 is not NULL, also write BUF to OUT2.
 143    In case of error writing to OUT, -1 is returned.  In case of error
 144    writing to OUT2, -2 is returned.  In case of any other error,
 145    1 is returned.  */
 146
 147 static int
 148 write_data (FILE *out, FILE *out2, const char *buf, int bufsize,
 149             wgint *skip, wgint *written)
 150 {
 151   if (out == NULL && out2 == NULL)
 152     return 1;
 153   if (*skip > bufsize)
 154     {
 155       *skip -= bufsize;
 156       return 1;
 157     }
 158   if (*skip)
 159     {
 160       buf += *skip;
 161       bufsize -= *skip;
 162       *skip = 0;
 163       if (bufsize == 0)
 164         return 1;
 165     }
 166
 167   if (out != NULL)
 168     fwrite (buf, 1, bufsize, out);
 169   if (out2 != NULL)
 170     fwrite (buf, 1, bufsize, out2);
 171   *written += bufsize;
 172
 173   /* Immediately flush the downloaded data.  This should not hinder
 174      performance: fast downloads will arrive in large 16K chunks
 175      (which stdio would write out immediately anyway), and slow
 176      downloads wouldn't be limited by disk speed.  */
 177
 178   /* 2005-04-20 SMS.
 179      Perhaps it shouldn't hinder performance, but it sure does, at least
 180      on VMS (more than 2X).  Rather than speculate on what it should or
 181      shouldn't do, it might make more sense to test it.  Even better, it
 182      might be nice to explain what possible benefit it could offer, as
 183      it appears to be a clear invitation to poor performance with no
 184      actual justification.  (Also, why 16K?  Anyone test other values?)
 185   */
 186 #ifndef __VMS
 187   if (out != NULL)
 188     fflush (out);
 189   if (out2 != NULL)
 190     fflush (out2);
 191 #endif /* ndef __VMS */
 192   if (out != NULL && ferror (out))
 193     return -1;
 194   else if (out2 != NULL && ferror (out2))
 195     return -2;
 196   else
 197     return 0;
 198 }
 199
 200 /* Read the contents of file descriptor FD until it the connection
 201    terminates or a read error occurs.  The data is read in portions of
 202    up to 16K and written to OUT as it arrives.  If opt.verbose is set,
 203    the progress is shown.
 204
 205    TOREAD is the amount of data expected to arrive, normally only used
 206    by the progress gauge.
 207
 208    STARTPOS is the position from which the download starts, used by
 209    the progress gauge.  If QTYREAD is non-NULL, the value it points to
 210    is incremented by the amount of data read from the network.  If
 211    QTYWRITTEN is non-NULL, the value it points to is incremented by
 212    the amount of data written to disk.  The time it took to download
 213    the data is stored to ELAPSED.
 214
 215    If OUT2 is non-NULL, the contents is also written to OUT2.
 216
 217    The function exits and returns the amount of data read.  In case of
 218    error while reading data, -1 is returned.  In case of error while
 219    writing data to OUT, -2 is returned.  In case of error while writing
 220    data to OUT2, -3 is returned.  */
 221
 222 int
 223 fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
 224               wgint *qtyread, wgint *qtywritten, double *elapsed, int flags,
 225               FILE *out2)
 226 {
 227   int ret = 0;
 228 #undef max
 229 #define max(a,b) ((a) > (b) ? (a) : (b))
 230   int dlbufsize = max (BUFSIZ, 8 * 1024);
 231   char *dlbuf = xmalloc (dlbufsize);
 232
 233   struct ptimer *timer = NULL;
 234   double last_successful_read_tm = 0;
 235
 236   /* The progress gauge, set according to the user preferences. */
 237   void *progress = NULL;
 238
 239   /* Non-zero if the progress gauge is interactive, i.e. if it can
 240      continually update the display.  When true, smaller timeout
 241      values are used so that the gauge can update the display when
 242      data arrives slowly. */
 243   bool progress_interactive = false;
 244
 245   bool exact = !!(flags & rb_read_exactly);
 246
 247   /* Used only by HTTP/HTTPS chunked transfer encoding.  */
 248   bool chunked = flags & rb_chunked_transfer_encoding;
 249   wgint skip = 0;
 250
 251   /* How much data we've read/written.  */
 252   wgint sum_read = 0;
 253   wgint sum_written = 0;
 254   wgint remaining_chunk_size = 0;
 255
 256   if (flags & rb_skip_startpos)
 257     skip = startpos;
 258
 259   if (opt.verbose)
 260     {
 261       /* If we're skipping STARTPOS bytes, pass 0 as the INITIAL
 262          argument to progress_create because the indicator doesn't
 263          (yet) know about "skipping" data.  */
 264       wgint start = skip ? 0 : startpos;
 265       progress = progress_create (start, start + toread);
 266       progress_interactive = progress_interactive_p (progress);
 267     }
 268
 269   if (opt.limit_rate)
 270     limit_bandwidth_reset ();
 271
 272   /* A timer is needed for tracking progress, for throttling, and for
 273      tracking elapsed time.  If either of these are requested, start
 274      the timer.  */
 275   if (progress || opt.limit_rate || elapsed)
 276     {
 277       timer = ptimer_new ();
 278       last_successful_read_tm = 0;
 279     }
 280
 281   /* Use a smaller buffer for low requested bandwidths.  For example,
 282      with --limit-rate=2k, it doesn't make sense to slurp in 16K of
 283      data and then sleep for 8s.  With buffer size equal to the limit,
 284      we never have to sleep for more than one second.  */
 285   if (opt.limit_rate && opt.limit_rate < dlbufsize)
 286     dlbufsize = opt.limit_rate;
 287
 288   /* Read from FD while there is data to read.  Normally toread==0
 289      means that it is unknown how much data is to arrive.  However, if
 290      EXACT is set, then toread==0 means what it says: that no data
 291      should be read.  */
 292   while (!exact || (sum_read < toread))
 293     {
 294       int rdsize;
 295       double tmout = opt.read_timeout;
 296
 297       if (chunked)
 298         {
 299           if (remaining_chunk_size == 0)
 300             {
 301               char *line = fd_read_line (fd);
 302               char *endl;
 303               if (line == NULL)
 304                 {
 305                   ret = -1;
 306                   break;
 307                 }
 308
 309               remaining_chunk_size = strtol (line, &endl, 16);
 310               xfree (line);
 311
 312               if (remaining_chunk_size == 0)
 313                 {
 314                   ret = 0;
 315                   line = fd_read_line (fd);
 316                   if (line == NULL)
 317                     ret = -1;
 318                   else
 319                     xfree (line);
 320                   break;
 321                 }
 322             }
 323
 324           rdsize = MIN (remaining_chunk_size, dlbufsize);
 325         }
 326       else
 327         rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize;
 328
 329       if (progress_interactive)
 330         {
 331           /* For interactive progress gauges, always specify a ~1s
 332              timeout, so that the gauge can be updated regularly even
 333              when the data arrives very slowly or stalls.  */
 334           tmout = 0.95;
 335           if (opt.read_timeout)
 336             {
 337               double waittm;
 338               waittm = ptimer_read (timer) - last_successful_read_tm;
 339               if (waittm + tmout > opt.read_timeout)
 340                 {
 341                   /* Don't let total idle time exceed read timeout. */
 342                   tmout = opt.read_timeout - waittm;
 343                   if (tmout < 0)
 344                     {
 345                       /* We've already exceeded the timeout. */
 346                       ret = -1, errno = ETIMEDOUT;
 347                       break;
 348                     }
 349                 }
 350             }
 351         }
 352       ret = fd_read (fd, dlbuf, rdsize, tmout);
 353
 354       if (progress_interactive && ret < 0 && errno == ETIMEDOUT)
 355         ret = 0;                /* interactive timeout, handled above */
 356       else if (ret <= 0)
 357         break;                  /* EOF or read error */
 358
 359       if (progress || opt.limit_rate || elapsed)
 360         {
 361           ptimer_measure (timer);
 362           if (ret > 0)
 363             last_successful_read_tm = ptimer_read (timer);
 364         }
 365
 366       if (ret > 0)
 367         {
 368           sum_read += ret;
 369           int write_res = write_data (out, out2, dlbuf, ret, &skip, &sum_written);
 370           if (write_res != 0)
 371             {
 372               ret = (write_res == -3) ? -3 : -2;
 373               goto out;
 374             }
 375           if (chunked)
 376             {
 377               remaining_chunk_size -= ret;
 378               if (remaining_chunk_size == 0)
 379                 {
 380                   char *line = fd_read_line (fd);
 381                   if (line == NULL)
 382                     {
 383                       ret = -1;
 384                       break;
 385                     }
 386                   else
 387                     xfree (line);
 388                 }
 389             }
 390         }
 391
 392       if (opt.limit_rate)
 393         limit_bandwidth (ret, timer);
 394
 395       if (progress)
 396         progress_update (progress, ret, ptimer_read (timer));
 397 #ifdef WINDOWS
 398       if (toread > 0 && !opt.quiet)
 399         ws_percenttitle (100.0 *
 400                          (startpos + sum_read) / (startpos + toread));
 401 #endif
 402     }
 403   if (ret < -1)
 404     ret = -1;
 405
 406  out:
 407   if (progress)
 408     progress_finish (progress, ptimer_read (timer));
 409
 410   if (elapsed)
 411     *elapsed = ptimer_read (timer);
 412   if (timer)
 413     ptimer_destroy (timer);
 414
 415   if (qtyread)
 416     *qtyread += sum_read;
 417   if (qtywritten)
 418     *qtywritten += sum_written;
 419
 420   free (dlbuf);
 421
 422   return ret;
 423 }
 424 \f
 425 /* Read a hunk of data from FD, up until a terminator.  The hunk is
 426    limited by whatever the TERMINATOR callback chooses as its
 427    terminator.  For example, if terminator stops at newline, the hunk
 428    will consist of a line of data; if terminator stops at two
 429    newlines, it can be used to read the head of an HTTP response.
 430    Upon determining the boundary, the function returns the data (up to
 431    the terminator) in malloc-allocated storage.
 432
 433    In case of read error, NULL is returned.  In case of EOF and no
 434    data read, NULL is returned and errno set to 0.  In case of having
 435    read some data, but encountering EOF before seeing the terminator,
 436    the data that has been read is returned, but it will (obviously)
 437    not contain the terminator.
 438
 439    The TERMINATOR function is called with three arguments: the
 440    beginning of the data read so far, the beginning of the current
 441    block of peeked-at data, and the length of the current block.
 442    Depending on its needs, the function is free to choose whether to
 443    analyze all data or just the newly arrived data.  If TERMINATOR
 444    returns NULL, it means that the terminator has not been seen.
 445    Otherwise it should return a pointer to the charactre immediately
 446    following the terminator.
 447
 448    The idea is to be able to read a line of input, or otherwise a hunk
 449    of text, such as the head of an HTTP request, without crossing the
 450    boundary, so that the next call to fd_read etc. reads the data
 451    after the hunk.  To achieve that, this function does the following:
 452
 453    1. Peek at incoming data.
 454
 455    2. Determine whether the peeked data, along with the previously
 456       read data, includes the terminator.
 457
 458       2a. If yes, read the data until the end of the terminator, and
 459           exit.
 460
 461       2b. If no, read the peeked data and goto 1.
 462
 463    The function is careful to assume as little as possible about the
 464    implementation of peeking.  For example, every peek is followed by
 465    a read.  If the read returns a different amount of data, the
 466    process is retried until all data arrives safely.
 467
 468    SIZEHINT is the buffer size sufficient to hold all the data in the
 469    typical case (it is used as the initial buffer size).  MAXSIZE is
 470    the maximum amount of memory this function is allowed to allocate,
 471    or 0 if no upper limit is to be enforced.
 472
 473    This function should be used as a building block for other
 474    functions -- see fd_read_line as a simple example.  */
 475
 476 char *
 477 fd_read_hunk (int fd, hunk_terminator_t terminator, long sizehint, long maxsize)
 478 {
 479   long bufsize = sizehint;
 480   char *hunk = xmalloc (bufsize);
 481   int tail = 0;                 /* tail position in HUNK */
 482
 483   assert (!maxsize || maxsize >= bufsize);
 484
 485   while (1)
 486     {
 487       const char *end;
 488       int pklen, rdlen, remain;
 489
 490       /* First, peek at the available data. */
 491
 492       pklen = fd_peek (fd, hunk + tail, bufsize - 1 - tail, -1);
 493       if (pklen < 0)
 494         {
 495           xfree (hunk);
 496           return NULL;
 497         }
 498       end = terminator (hunk, hunk + tail, pklen);
 499       if (end)
 500         {
 501           /* The data contains the terminator: we'll drain the data up
 502              to the end of the terminator.  */
 503           remain = end - (hunk + tail);
 504           assert (remain >= 0);
 505           if (remain == 0)
 506             {
 507               /* No more data needs to be read. */
 508               hunk[tail] = '\0';
 509               return hunk;
 510             }
 511           if (bufsize - 1 < tail + remain)
 512             {
 513               bufsize = tail + remain + 1;
 514               hunk = xrealloc (hunk, bufsize);
 515             }
 516         }
 517       else
 518         /* No terminator: simply read the data we know is (or should
 519            be) available.  */
 520         remain = pklen;
 521
 522       /* Now, read the data.  Note that we make no assumptions about
 523          how much data we'll get.  (Some TCP stacks are notorious for
 524          read returning less data than the previous MSG_PEEK.)  */
 525
 526       rdlen = fd_read (fd, hunk + tail, remain, 0);
 527       if (rdlen < 0)
 528         {
 529           xfree_null (hunk);
 530           return NULL;
 531         }
 532       tail += rdlen;
 533       hunk[tail] = '\0';
 534
 535       if (rdlen == 0)
 536         {
 537           if (tail == 0)
 538             {
 539               /* EOF without anything having been read */
 540               xfree (hunk);
 541               errno = 0;
 542               return NULL;
 543             }
 544           else
 545             /* EOF seen: return the data we've read. */
 546             return hunk;
 547         }
 548       if (end && rdlen == remain)
 549         /* The terminator was seen and the remaining data drained --
 550            we got what we came for.  */
 551         return hunk;
 552
 553       /* Keep looping until all the data arrives. */
 554
 555       if (tail == bufsize - 1)
 556         {
 557           /* Double the buffer size, but refuse to allocate more than
 558              MAXSIZE bytes.  */
 559           if (maxsize && bufsize >= maxsize)
 560             {
 561               xfree (hunk);
 562               errno = ENOMEM;
 563               return NULL;
 564             }
 565           bufsize <<= 1;
 566           if (maxsize && bufsize > maxsize)
 567             bufsize = maxsize;
 568           hunk = xrealloc (hunk, bufsize);
 569         }
 570     }
 571 }
 572
 573 static const char *
 574 line_terminator (const char *start, const char *peeked, int peeklen)
 575 {
 576   const char *p = memchr (peeked, '\n', peeklen);
 577   if (p)
 578     /* p+1 because the line must include '\n' */
 579     return p + 1;
 580   return NULL;
 581 }
 582
 583 /* The maximum size of the single line we agree to accept.  This is
 584    not meant to impose an arbitrary limit, but to protect the user
 585    from Wget slurping up available memory upon encountering malicious
 586    or buggy server output.  Define it to 0 to remove the limit.  */
 587 #define FD_READ_LINE_MAX 4096
 588
 589 /* Read one line from FD and return it.  The line is allocated using
 590    malloc, but is never larger than FD_READ_LINE_MAX.
 591
 592    If an error occurs, or if no data can be read, NULL is returned.
 593    In the former case errno indicates the error condition, and in the
 594    latter case, errno is NULL.  */
 595
 596 char *
 597 fd_read_line (int fd)
 598 {
 599   return fd_read_hunk (fd, line_terminator, 128, FD_READ_LINE_MAX);
 600 }
 601 \f
 602 /* Return a printed representation of the download rate, along with
 603    the units appropriate for the download speed.  */
 604
 605 const char *
 606 retr_rate (wgint bytes, double secs)
 607 {
 608   static char res[20];
 609   static const char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 610   int units;
 611
 612   double dlrate = calc_rate (bytes, secs, &units);
 613   /* Use more digits for smaller numbers (regardless of unit used),
 614      e.g. "1022", "247", "12.5", "2.38".  */
 615   sprintf (res, "%.*f %s",
 616            dlrate >= 99.95 ? 0 : dlrate >= 9.995 ? 1 : 2,
 617            dlrate, rate_names[units]);
 618
 619   return res;
 620 }
 621
 622 /* Calculate the download rate and trim it as appropriate for the
 623    speed.  Appropriate means that if rate is greater than 1K/s,
 624    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 625    are used.
 626
 627    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 628    GB/s.  */
 629
 630 double
 631 calc_rate (wgint bytes, double secs, int *units)
 632 {
 633   double dlrate;
 634
 635   assert (secs >= 0);
 636   assert (bytes >= 0);
 637
 638   if (secs == 0)
 639     /* If elapsed time is exactly zero, it means we're under the
 640        resolution of the timer.  This can easily happen on systems
 641        that use time() for the timer.  Since the interval lies between
 642        0 and the timer's resolution, assume half the resolution.  */
 643     secs = ptimer_resolution () / 2.0;
 644
 645   dlrate = bytes / secs;
 646   if (dlrate < 1024.0)
 647     *units = 0;
 648   else if (dlrate < 1024.0 * 1024.0)
 649     *units = 1, dlrate /= 1024.0;
 650   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 651     *units = 2, dlrate /= (1024.0 * 1024.0);
 652   else
 653     /* Maybe someone will need this, one day. */
 654     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
 655
 656   return dlrate;
 657 }
 658 \f
 659
 660 #define SUSPEND_POST_DATA do {                  \
 661   post_data_suspended = true;                   \
 662   saved_post_data = opt.post_data;              \
 663   saved_post_file_name = opt.post_file_name;    \
 664   opt.post_data = NULL;                         \
 665   opt.post_file_name = NULL;                    \
 666 } while (0)
 667
 668 #define RESTORE_POST_DATA do {                          \
 669   if (post_data_suspended)                              \
 670     {                                                   \
 671       opt.post_data = saved_post_data;                  \
 672       opt.post_file_name = saved_post_file_name;        \
 673       post_data_suspended = false;                      \
 674     }                                                   \
 675 } while (0)
 676
 677 static char *getproxy (struct url *);
 678
 679 /* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
 680    FTP, proxy, etc.  */
 681
 682 /* #### This function should be rewritten so it doesn't return from
 683    multiple points. */
 684
 685 uerr_t
 686 retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
 687               char **newloc, const char *refurl, int *dt, bool recursive,
 688               struct iri *iri, bool register_status)
 689 {
 690   uerr_t result;
 691   char *url;
 692   bool location_changed;
 693   bool iri_fallbacked = 0;
 694   int dummy;
 695   char *mynewloc, *proxy;
 696   struct url *u = orig_parsed, *proxy_url;
 697   int up_error_code;            /* url parse error code */
 698   char *local_file;
 699   int redirection_count = 0;
 700
 701   bool post_data_suspended = false;
 702   char *saved_post_data = NULL;
 703   char *saved_post_file_name = NULL;
 704
 705   /* If dt is NULL, use local storage.  */
 706   if (!dt)
 707     {
 708       dt = &dummy;
 709       dummy = 0;
 710     }
 711   url = xstrdup (origurl);
 712   if (newloc)
 713     *newloc = NULL;
 714   if (file)
 715     *file = NULL;
 716
 717   if (!refurl)
 718     refurl = opt.referer;
 719
 720  redirected:
 721   /* (also for IRI fallbacking) */
 722
 723   result = NOCONERROR;
 724   mynewloc = NULL;
 725   local_file = NULL;
 726   proxy_url = NULL;
 727
 728   proxy = getproxy (u);
 729   if (proxy)
 730     {
 731       struct iri *pi = iri_new ();
 732       set_uri_encoding (pi, opt.locale, true);
 733       pi->utf8_encode = false;
 734
 735       /* Parse the proxy URL.  */
 736       proxy_url = url_parse (proxy, &up_error_code, NULL, true);
 737       if (!proxy_url)
 738         {
 739           char *error = url_error (proxy, up_error_code);
 740           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 741                      proxy, error);
 742           xfree (url);
 743           xfree (error);
 744           RESTORE_POST_DATA;
 745           result = PROXERR;
 746           goto bail;
 747         }
 748       if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
 749         {
 750           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 751           url_free (proxy_url);
 752           xfree (url);
 753           RESTORE_POST_DATA;
 754           result = PROXERR;
 755           goto bail;
 756         }
 757     }
 758
 759   if (u->scheme == SCHEME_HTTP
 760 #ifdef HAVE_SSL
 761       || u->scheme == SCHEME_HTTPS
 762 #endif
 763       || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
 764     {
 765       result = http_loop (u, orig_parsed, &mynewloc, &local_file, refurl, dt,
 766                           proxy_url, iri);
 767     }
 768   else if (u->scheme == SCHEME_FTP)
 769     {
 770       /* If this is a redirection, temporarily turn off opt.ftp_glob
 771          and opt.recursive, both being undesirable when following
 772          redirects.  */
 773       bool oldrec = recursive, glob = opt.ftp_glob;
 774       if (redirection_count)
 775         oldrec = glob = false;
 776
 777       result = ftp_loop (u, &local_file, dt, proxy_url, recursive, glob);
 778       recursive = oldrec;
 779
 780       /* There is a possibility of having HTTP being redirected to
 781          FTP.  In these cases we must decide whether the text is HTML
 782          according to the suffix.  The HTML suffixes are `.html',
 783          `.htm' and a few others, case-insensitive.  */
 784       if (redirection_count && local_file && u->scheme == SCHEME_FTP)
 785         {
 786           if (has_html_suffix_p (local_file))
 787             *dt |= TEXTHTML;
 788         }
 789     }
 790
 791   if (proxy_url)
 792     {
 793       url_free (proxy_url);
 794       proxy_url = NULL;
 795     }
 796
 797   location_changed = (result == NEWLOCATION || result == NEWLOCATION_KEEP_POST);
 798   if (location_changed)
 799     {
 800       char *construced_newloc;
 801       struct url *newloc_parsed;
 802
 803       assert (mynewloc != NULL);
 804
 805       if (local_file)
 806         xfree (local_file);
 807
 808       /* The HTTP specs only allow absolute URLs to appear in
 809          redirects, but a ton of boneheaded webservers and CGIs out
 810          there break the rules and use relative URLs, and popular
 811          browsers are lenient about this, so wget should be too. */
 812       construced_newloc = uri_merge (url, mynewloc);
 813       xfree (mynewloc);
 814       mynewloc = construced_newloc;
 815
 816       /* Reset UTF-8 encoding state, keep the URI encoding and reset
 817          the content encoding. */
 818       iri->utf8_encode = opt.enable_iri;
 819       set_content_encoding (iri, NULL);
 820       xfree_null (iri->orig_url);
 821
 822       /* Now, see if this new location makes sense. */
 823       newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
 824       if (!newloc_parsed)
 825         {
 826           char *error = url_error (mynewloc, up_error_code);
 827           logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
 828                      error);
 829           if (orig_parsed != u)
 830             {
 831               url_free (u);
 832             }
 833           xfree (url);
 834           xfree (mynewloc);
 835           xfree (error);
 836           RESTORE_POST_DATA;
 837           goto bail;
 838         }
 839
 840       /* Now mynewloc will become newloc_parsed->url, because if the
 841          Location contained relative paths like .././something, we
 842          don't want that propagating as url.  */
 843       xfree (mynewloc);
 844       mynewloc = xstrdup (newloc_parsed->url);
 845
 846       /* Check for max. number of redirections.  */
 847       if (++redirection_count > opt.max_redirect)
 848         {
 849           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
 850                      opt.max_redirect);
 851           url_free (newloc_parsed);
 852           if (orig_parsed != u)
 853             {
 854               url_free (u);
 855             }
 856           xfree (url);
 857           xfree (mynewloc);
 858           RESTORE_POST_DATA;
 859           result = WRONGCODE;
 860           goto bail;
 861         }
 862
 863       xfree (url);
 864       url = mynewloc;
 865       if (orig_parsed != u)
 866         {
 867           url_free (u);
 868         }
 869       u = newloc_parsed;
 870
 871       /* If we're being redirected from POST, and we received a
 872          redirect code different than 307, we don't want to POST
 873          again.  Many requests answer POST with a redirection to an
 874          index page; that redirection is clearly a GET.  We "suspend"
 875          POST data for the duration of the redirections, and restore
 876          it when we're done.
 877
 878          RFC2616 HTTP/1.1 introduces code 307 Temporary Redirect
 879          specifically to preserve the method of the request.
 880          */
 881       if (result != NEWLOCATION_KEEP_POST && !post_data_suspended)
 882         SUSPEND_POST_DATA;
 883
 884       goto redirected;
 885     }
 886
 887   /* Try to not encode in UTF-8 if fetching failed */
 888   if (!(*dt & RETROKF) && iri->utf8_encode)
 889     {
 890       iri->utf8_encode = false;
 891       if (orig_parsed != u)
 892         {
 893           url_free (u);
 894         }
 895       u = url_parse (origurl, NULL, iri, true);
 896       if (u)
 897         {
 898           DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
 899           url = xstrdup (u->url);
 900           iri_fallbacked = 1;
 901           goto redirected;
 902         }
 903       else
 904           DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url)));
 905     }
 906
 907   if (local_file && u && *dt & RETROKF)
 908     {
 909       register_download (u->url, local_file);
 910
 911       if (!opt.spider && redirection_count && 0 != strcmp (origurl, u->url))
 912         register_redirection (origurl, u->url);
 913
 914       if (*dt & TEXTHTML)
 915         register_html (u->url, local_file);
 916
 917       if (*dt & TEXTCSS)
 918         register_css (u->url, local_file);
 919     }
 920
 921   if (file)
 922     *file = local_file ? local_file : NULL;
 923   else
 924     xfree_null (local_file);
 925
 926   if (orig_parsed != u)
 927     {
 928       url_free (u);
 929     }
 930
 931   if (redirection_count || iri_fallbacked)
 932     {
 933       if (newloc)
 934         *newloc = url;
 935       else
 936         xfree (url);
 937     }
 938   else
 939     {
 940       if (newloc)
 941         *newloc = NULL;
 942       xfree (url);
 943     }
 944
 945   RESTORE_POST_DATA;
 946
 947 bail:
 948   if (register_status)
 949     inform_exit_status (result);
 950   return result;
 951 }
 952
 953 /* Find the URLs in the file and call retrieve_url() for each of them.
 954    If HTML is true, treat the file as HTML, and construct the URLs
 955    accordingly.
 956
 957    If opt.recursive is set, call retrieve_tree() for each file.  */
 958
 959 uerr_t
 960 retrieve_from_file (const char *file, bool html, int *count)
 961 {
 962   uerr_t status;
 963   struct urlpos *url_list, *cur_url;
 964   struct iri *iri = iri_new();
 965
 966   char *input_file, *url_file = NULL;
 967   const char *url = file;
 968
 969   status = RETROK;             /* Suppose everything is OK.  */
 970   *count = 0;                  /* Reset the URL count.  */
 971
 972   /* sXXXav : Assume filename and links in the file are in the locale */
 973   set_uri_encoding (iri, opt.locale, true);
 974   set_content_encoding (iri, opt.locale);
 975
 976   if (url_valid_scheme (url))
 977     {
 978       int dt,url_err;
 979       uerr_t status;
 980       struct url *url_parsed = url_parse (url, &url_err, iri, true);
 981       if (!url_parsed)
 982         {
 983           char *error = url_error (url, url_err);
 984           logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
 985           xfree (error);
 986           return URLERROR;
 987         }
 988
 989       if (!opt.base_href)
 990         opt.base_href = xstrdup (url);
 991
 992       status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
 993                              false, iri, true);
 994       url_free (url_parsed);
 995
 996       if (!url_file || (status != RETROK))
 997         return status;
 998
 999       if (dt & TEXTHTML)
1000         html = true;
1001
1002       /* If we have a found a content encoding, use it.
1003        * ( == is okay, because we're checking for identical object) */
1004       if (iri->content_encoding != opt.locale)
1005           set_uri_encoding (iri, iri->content_encoding, false);
1006
1007       /* Reset UTF-8 encode status */
1008       iri->utf8_encode = opt.enable_iri;
1009       xfree_null (iri->orig_url);
1010       iri->orig_url = NULL;
1011
1012       input_file = url_file;
1013     }
1014   else
1015     input_file = (char *) file;
1016
1017   url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
1018               : get_urls_file (input_file));
1019
1020   xfree_null (url_file);
1021
1022   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
1023     {
1024       char *filename = NULL, *new_file = NULL;
1025       int dt;
1026       struct iri *tmpiri = iri_dup (iri);
1027       struct url *parsed_url = NULL;
1028
1029       if (cur_url->ignore_when_downloading)
1030         continue;
1031
1032       if (opt.quota && total_downloaded_bytes > opt.quota)
1033         {
1034           status = QUOTEXC;
1035           break;
1036         }
1037
1038       parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true);
1039
1040       if ((opt.recursive || opt.page_requisites)
1041           && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
1042         {
1043           int old_follow_ftp = opt.follow_ftp;
1044
1045           /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
1046           if (cur_url->url->scheme == SCHEME_FTP)
1047             opt.follow_ftp = 1;
1048
1049           status = retrieve_tree (parsed_url ? parsed_url : cur_url->url,
1050                                   tmpiri);
1051
1052           opt.follow_ftp = old_follow_ftp;
1053         }
1054       else
1055         status = retrieve_url (parsed_url ? parsed_url : cur_url->url,
1056                                cur_url->url->url, &filename,
1057                                &new_file, NULL, &dt, opt.recursive, tmpiri,
1058                                true);
1059
1060       if (parsed_url)
1061           url_free (parsed_url);
1062
1063       if (filename && opt.delete_after && file_exists_p (filename))
1064         {
1065           DEBUGP (("\
1066 Removing file due to --delete-after in retrieve_from_file():\n"));
1067           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
1068           if (unlink (filename))
1069             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
1070           dt &= ~RETROKF;
1071         }
1072
1073       xfree_null (new_file);
1074       xfree_null (filename);
1075       iri_free (tmpiri);
1076     }
1077
1078   /* Free the linked list of URL-s.  */
1079   free_urlpos (url_list);
1080
1081   iri_free (iri);
1082
1083   return status;
1084 }
1085
1086 /* Print `giving up', or `retrying', depending on the impending
1087    action.  N1 and N2 are the attempt number and the attempt limit.  */
1088 void
1089 printwhat (int n1, int n2)
1090 {
1091   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
1092 }
1093
1094 /* If opt.wait or opt.waitretry are specified, and if certain
1095    conditions are met, sleep the appropriate number of seconds.  See
1096    the documentation of --wait and --waitretry for more information.
1097
1098    COUNT is the count of current retrieval, beginning with 1. */
1099
1100 void
1101 sleep_between_retrievals (int count)
1102 {
1103   static bool first_retrieval = true;
1104
1105   if (first_retrieval)
1106     {
1107       /* Don't sleep before the very first retrieval. */
1108       first_retrieval = false;
1109       return;
1110     }
1111
1112   if (opt.waitretry && count > 1)
1113     {
1114       /* If opt.waitretry is specified and this is a retry, wait for
1115          COUNT-1 number of seconds, or for opt.waitretry seconds.  */
1116       if (count <= opt.waitretry)
1117         xsleep (count - 1);
1118       else
1119         xsleep (opt.waitretry);
1120     }
1121   else if (opt.wait)
1122     {
1123       if (!opt.random_wait || count > 1)
1124         /* If random-wait is not specified, or if we are sleeping
1125            between retries of the same download, sleep the fixed
1126            interval.  */
1127         xsleep (opt.wait);
1128       else
1129         {
1130           /* Sleep a random amount of time averaging in opt.wait
1131              seconds.  The sleeping amount ranges from 0.5*opt.wait to
1132              1.5*opt.wait.  */
1133           double waitsecs = (0.5 + random_float ()) * opt.wait;
1134           DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
1135                    opt.wait, waitsecs));
1136           xsleep (waitsecs);
1137         }
1138     }
1139 }
1140
1141 /* Free the linked list of urlpos.  */
1142 void
1143 free_urlpos (struct urlpos *l)
1144 {
1145   while (l)
1146     {
1147       struct urlpos *next = l->next;
1148       if (l->url)
1149         url_free (l->url);
1150       xfree_null (l->local_name);
1151       xfree (l);
1152       l = next;
1153     }
1154 }
1155
1156 /* Rotate FNAME opt.backups times */
1157 void
1158 rotate_backups(const char *fname)
1159 {
1160   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1161   char *from = (char *)alloca (maxlen);
1162   char *to = (char *)alloca (maxlen);
1163   struct_stat sb;
1164   int i;
1165
1166   if (stat (fname, &sb) == 0)
1167     if (S_ISREG (sb.st_mode) == 0)
1168       return;
1169
1170   for (i = opt.backups; i > 1; i--)
1171     {
1172       sprintf (from, "%s.%d", fname, i - 1);
1173       sprintf (to, "%s.%d", fname, i);
1174       rename (from, to);
1175     }
1176
1177   sprintf (to, "%s.%d", fname, 1);
1178   rename(fname, to);
1179 }
1180
1181 static bool no_proxy_match (const char *, const char **);
1182
1183 /* Return the URL of the proxy appropriate for url U.  */
1184
1185 static char *
1186 getproxy (struct url *u)
1187 {
1188   char *proxy = NULL;
1189   char *rewritten_url;
1190   static char rewritten_storage[1024];
1191
1192   if (!opt.use_proxy)
1193     return NULL;
1194   if (no_proxy_match (u->host, (const char **)opt.no_proxy))
1195     return NULL;
1196
1197   switch (u->scheme)
1198     {
1199     case SCHEME_HTTP:
1200       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1201       break;
1202 #ifdef HAVE_SSL
1203     case SCHEME_HTTPS:
1204       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1205       break;
1206 #endif
1207     case SCHEME_FTP:
1208       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1209       break;
1210     case SCHEME_INVALID:
1211       break;
1212     }
1213   if (!proxy || !*proxy)
1214     return NULL;
1215
1216   /* Handle shorthands.  `rewritten_storage' is a kludge to allow
1217      getproxy() to return static storage. */
1218   rewritten_url = rewrite_shorthand_url (proxy);
1219   if (rewritten_url)
1220     {
1221       strncpy (rewritten_storage, rewritten_url, sizeof (rewritten_storage));
1222       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1223       proxy = rewritten_storage;
1224     }
1225
1226   return proxy;
1227 }
1228
/* Tell whether U would be downloaded through a proxy, i.e. whether
   getproxy() finds a proxy for it.  A NULL U yields false.  */

bool
url_uses_proxy (struct url * u)
{
  return u != NULL && getproxy (u) != NULL;
}
1240
/* Check HOST against the NO_PROXY list.  Returns the result of
   sufmatch (NO_PROXY, HOST), or false when there is no list at all.
   getproxy() treats a true result as "do not use a proxy".  */
static bool
no_proxy_match (const char *host, const char **no_proxy)
{
  return no_proxy ? sufmatch (no_proxy, host) : false;
}
1250
1251 /* Set the file parameter to point to the local file string.  */
1252 void
1253 set_local_file (const char **file, const char *default_file)
1254 {
1255   if (opt.output_document)
1256     {
1257       if (output_stream_regular)
1258         *file = opt.output_document;
1259     }
1260   else
1261     *file = default_file;
1262 }
1263
/* Return true the first time this is called with a non-NULL INPUT_FILE
   for which url_has_scheme() holds, and false on every other call.  */
bool
input_file_url (const char *input_file)
{
  /* Cleared for good once the one `true' answer has been given.  */
  static bool first = true;

  if (!input_file)
    return false;
  if (!url_has_scheme (input_file))
    return false;
  if (!first)
    return false;

  first = false;
  return true;
}