sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   3    2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #ifdef HAVE_UNISTD_H
  36 # include <unistd.h>
  37 #endif /* HAVE_UNISTD_H */
  38 #include <errno.h>
  39 #include <string.h>
  40 #include <assert.h>
  41
  42 #include "utils.h"
  43 #include "retr.h"
  44 #include "progress.h"
  45 #include "url.h"
  46 #include "recur.h"
  47 #include "ftp.h"
  48 #include "http.h"
  49 #include "host.h"
  50 #include "connect.h"
  51 #include "hash.h"
  52 #include "convert.h"
  53 #include "ptimer.h"
  54
  55 /* Total size of downloaded files.  Used to enforce quota.  */
  56 SUM_SIZE_INT total_downloaded_bytes;
  57
  58 /* Total download time in seconds. */
  59 double total_download_time;
  60
  61 /* If non-NULL, the stream to which output should be written.  This
  62    stream is initialized when `-O' is used.  */
  63 FILE *output_stream;
  64
  65 /* Whether output_document is a regular file we can manipulate,
  66    i.e. not `-' or a device file. */
  67 bool output_stream_regular;
  68 \f
  69 static struct {
  70   wgint chunk_bytes;
  71   double chunk_start;
  72   double sleep_adjust;
  73 } limit_data;
  74
  75 static void
  76 limit_bandwidth_reset (void)
  77 {
  78   xzero (limit_data);
  79 }
  80
  81 /* Limit the bandwidth by pausing the download for an amount of time.
  82    BYTES is the number of bytes received from the network, and TIMER
  83    is the timer that started at the beginning of download.  */
  84
  85 static void
  86 limit_bandwidth (wgint bytes, struct ptimer *timer)
  87 {
  88   double delta_t = ptimer_read (timer) - limit_data.chunk_start;
  89   double expected;
  90
  91   limit_data.chunk_bytes += bytes;
  92
  93   /* Calculate the amount of time we expect downloading the chunk
  94      should take.  If in reality it took less time, sleep to
  95      compensate for the difference.  */
  96   expected = (double) limit_data.chunk_bytes / opt.limit_rate;
  97
  98   if (expected > delta_t)
  99     {
 100       double slp = expected - delta_t + limit_data.sleep_adjust;
 101       double t0, t1;
 102       if (slp < 0.2)
 103         {
 104           DEBUGP (("deferring a %.2f ms sleep (%s/%.2f).\n",
 105                    slp * 1000, number_to_static_string (limit_data.chunk_bytes),
 106                    delta_t));
 107           return;
 108         }
 109       DEBUGP (("\nsleeping %.2f ms for %s bytes, adjust %.2f ms\n",
 110                slp * 1000, number_to_static_string (limit_data.chunk_bytes),
 111                limit_data.sleep_adjust));
 112
 113       t0 = ptimer_read (timer);
 114       xsleep (slp);
 115       t1 = ptimer_measure (timer);
 116
 117       /* Due to scheduling, we probably slept slightly longer (or
 118          shorter) than desired.  Calculate the difference between the
 119          desired and the actual sleep, and adjust the next sleep by
 120          that amount.  */
 121       limit_data.sleep_adjust = slp - (t1 - t0);
 122       /* If sleep_adjust is very large, it's likely due to suspension
 123          and not clock inaccuracy.  Don't enforce those.  */
 124       if (limit_data.sleep_adjust > 0.5)
 125         limit_data.sleep_adjust = 0.5;
 126       else if (limit_data.sleep_adjust < -0.5)
 127         limit_data.sleep_adjust = -0.5;
 128     }
 129
 130   limit_data.chunk_bytes = 0;
 131   limit_data.chunk_start = ptimer_read (timer);
 132 }
 133
 134 #ifndef MIN
 135 # define MIN(i, j) ((i) <= (j) ? (i) : (j))
 136 #endif
 137
 138 /* Write data in BUF to OUT.  However, if *SKIP is non-zero, skip that
 139    amount of data and decrease SKIP.  Increment *TOTAL by the amount
 140    of data written.  */
 141
 142 static int
 143 write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
 144             wgint *written, int flags)
 145 {
 146   static int cr_pending = 0;    /* Found CR in ASCII FTP data. */
 147
 148   if (!out)
 149     return 1;
 150   if (*skip > bufsize)
 151     {
 152       *skip -= bufsize;
 153       return 1;
 154     }
 155   if (*skip)
 156     {
 157       buf += *skip;
 158       bufsize -= *skip;
 159       *skip = 0;
 160       if (bufsize == 0)
 161         return 1;
 162     }
 163
 164 /* Note: This code assumes that "\n" is the universal line ending
 165    character, as on UNIX and VMS.  If this is not true, then here's
 166    where to change it.
 167 */
 168
 169 #if 1
 170 # define EOL_STRING "\n"
 171 #else /* 1 */
 172 # define EOL_STRING "\r\n"
 173 #endif /* 1 [else] */
 174 #define EOL_STRING_LEN (sizeof( EOL_STRING)- 1)
 175
 176   if (flags & rb_ftp_ascii)
 177     {
 178       const char *bufend;
 179
 180       /* ASCII transfer.  Put out lines delimited by CRLF. */
 181       bufend = buf+ bufsize;
 182       while (buf < bufend)
 183         {
 184           /* If CR, put out any pending CR, then set CR-pending flag. */
 185           if (*buf == '\r')
 186             {
 187               if (cr_pending)
 188                 {
 189                   fwrite ("\r", 1, 1, out);
 190                   *written += 1;
 191                 }
 192               cr_pending = 1;
 193               buf++;
 194               continue;
 195             }
 196
 197           if (cr_pending)
 198             {
 199               if (*buf == '\n')
 200                 {
 201                   /* Found FTP EOL (CRLF).  Put out local EOL. */
 202                   fwrite (EOL_STRING, 1, EOL_STRING_LEN, out);
 203                   *written += EOL_STRING_LEN;
 204                 }
 205               else
 206                 {
 207                   /* Normal character.  Put out pending CR and it. */
 208                   fwrite ("\r", 1, 1, out);
 209                   fwrite (buf, 1, 1, out);
 210                   *written += 2;
 211                 }
 212               buf++;
 213               cr_pending = 0;
 214             }
 215           else
 216             {
 217               /* Normal character.  Put it out. */
 218               fwrite (buf, 1, 1, out);
 219               *written += 1;
 220               buf++;
 221             }
 222         }
 223     }
 224   else
 225     {
 226       /* Image transfer.  Put out buffer. */
 227       fwrite (buf, 1, bufsize, out);
 228       *written += bufsize;
 229     }
 230
 231   /* Immediately flush the downloaded data.  This should not hinder
 232      performance: fast downloads will arrive in large 16K chunks
 233      (which stdio would write out immediately anyway), and slow
 234      downloads wouldn't be limited by disk speed.  */
 235
 236   /* 2005-04-20 SMS.
 237      Perhaps it shouldn't hinder performance, but it sure does, at least
 238      on VMS (more than 2X).  Rather than speculate on what it should or
 239      shouldn't do, it might make more sense to test it.  Even better, it
 240      might be nice to explain what possible benefit it could offer, as
 241      it appears to be a clear invitation to poor performance with no
 242      actual justification.  (Also, why 16K?  Anyone test other values?)
 243   */
 244 #ifndef __VMS
 245   fflush (out);
 246 #endif /* ndef __VMS */
 247   return !ferror (out);
 248 }
 249
 250 /* Read the contents of file descriptor FD until it the connection
 251    terminates or a read error occurs.  The data is read in portions of
 252    up to 16K and written to OUT as it arrives.  If opt.verbose is set,
 253    the progress is shown.
 254
 255    TOREAD is the amount of data expected to arrive, normally only used
 256    by the progress gauge.
 257
 258    STARTPOS is the position from which the download starts, used by
 259    the progress gauge.  If QTYREAD is non-NULL, the value it points to
 260    is incremented by the amount of data read from the network.  If
 261    QTYWRITTEN is non-NULL, the value it points to is incremented by
 262    the amount of data written to disk.  The time it took to download
 263    the data is stored to ELAPSED.
 264
 265    The function exits and returns the amount of data read.  In case of
 266    error while reading data, -1 is returned.  In case of error while
 267    writing data, -2 is returned.  */
 268
 269 int
 270 fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
 271               wgint *qtyread, wgint *qtywritten, double *elapsed, int flags)
 272 {
 273   int ret = 0;
 274
 275   static char dlbuf[16384];
 276   int dlbufsize = sizeof (dlbuf);
 277
 278   struct ptimer *timer = NULL;
 279   double last_successful_read_tm = 0;
 280
 281   /* The progress gauge, set according to the user preferences. */
 282   void *progress = NULL;
 283
 284   /* Non-zero if the progress gauge is interactive, i.e. if it can
 285      continually update the display.  When true, smaller timeout
 286      values are used so that the gauge can update the display when
 287      data arrives slowly. */
 288   bool progress_interactive = false;
 289
 290   bool exact = !!(flags & rb_read_exactly);
 291   wgint skip = 0;
 292
 293   /* How much data we've read/written.  */
 294   wgint sum_read = 0;
 295   wgint sum_written = 0;
 296
 297   if (flags & rb_skip_startpos)
 298     skip = startpos;
 299
 300   if (opt.verbose)
 301     {
 302       /* If we're skipping STARTPOS bytes, pass 0 as the INITIAL
 303          argument to progress_create because the indicator doesn't
 304          (yet) know about "skipping" data.  */
 305       progress = progress_create (skip ? 0 : startpos, startpos + toread);
 306       progress_interactive = progress_interactive_p (progress);
 307     }
 308
 309   if (opt.limit_rate)
 310     limit_bandwidth_reset ();
 311
 312   /* A timer is needed for tracking progress, for throttling, and for
 313      tracking elapsed time.  If either of these are requested, start
 314      the timer.  */
 315   if (progress || opt.limit_rate || elapsed)
 316     {
 317       timer = ptimer_new ();
 318       last_successful_read_tm = 0;
 319     }
 320
 321   /* Use a smaller buffer for low requested bandwidths.  For example,
 322      with --limit-rate=2k, it doesn't make sense to slurp in 16K of
 323      data and then sleep for 8s.  With buffer size equal to the limit,
 324      we never have to sleep for more than one second.  */
 325   if (opt.limit_rate && opt.limit_rate < dlbufsize)
 326     dlbufsize = opt.limit_rate;
 327
 328   /* Read from FD while there is data to read.  Normally toread==0
 329      means that it is unknown how much data is to arrive.  However, if
 330      EXACT is set, then toread==0 means what it says: that no data
 331      should be read.  */
 332   while (!exact || (sum_read < toread))
 333     {
 334       int rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize;
 335       double tmout = opt.read_timeout;
 336       if (progress_interactive)
 337         {
 338           /* For interactive progress gauges, always specify a ~1s
 339              timeout, so that the gauge can be updated regularly even
 340              when the data arrives very slowly or stalls.  */
 341           tmout = 0.95;
 342           if (opt.read_timeout)
 343             {
 344               double waittm;
 345               waittm = ptimer_read (timer) - last_successful_read_tm;
 346               if (waittm + tmout > opt.read_timeout)
 347                 {
 348                   /* Don't let total idle time exceed read timeout. */
 349                   tmout = opt.read_timeout - waittm;
 350                   if (tmout < 0)
 351                     {
 352                       /* We've already exceeded the timeout. */
 353                       ret = -1, errno = ETIMEDOUT;
 354                       break;
 355                     }
 356                 }
 357             }
 358         }
 359       ret = fd_read (fd, dlbuf, rdsize, tmout);
 360
 361       if (progress_interactive && ret < 0 && errno == ETIMEDOUT)
 362         ret = 0;                /* interactive timeout, handled above */
 363       else if (ret <= 0)
 364         break;                  /* EOF or read error */
 365
 366       if (progress || opt.limit_rate)
 367         {
 368           ptimer_measure (timer);
 369           if (ret > 0)
 370             last_successful_read_tm = ptimer_read (timer);
 371         }
 372
 373       if (ret > 0)
 374         {
 375           sum_read += ret;
 376           if (!write_data (out, dlbuf, ret, &skip, &sum_written, flags))
 377             {
 378               ret = -2;
 379               goto out;
 380             }
 381         }
 382
 383       if (opt.limit_rate)
 384         limit_bandwidth (ret, timer);
 385
 386       if (progress)
 387         progress_update (progress, ret, ptimer_read (timer));
 388 #ifdef WINDOWS
 389       if (toread > 0 && !opt.quiet)
 390         ws_percenttitle (100.0 *
 391                          (startpos + sum_read) / (startpos + toread));
 392 #endif
 393     }
 394   if (ret < -1)
 395     ret = -1;
 396
 397  out:
 398   if (progress)
 399     progress_finish (progress, ptimer_read (timer));
 400
 401   if (elapsed)
 402     *elapsed = ptimer_read (timer);
 403   if (timer)
 404     ptimer_destroy (timer);
 405
 406   if (qtyread)
 407     *qtyread += sum_read;
 408   if (qtywritten)
 409     *qtywritten += sum_written;
 410
 411   return ret;
 412 }
 413 \f
 414 /* Read a hunk of data from FD, up until a terminator.  The hunk is
 415    limited by whatever the TERMINATOR callback chooses as its
 416    terminator.  For example, if terminator stops at newline, the hunk
 417    will consist of a line of data; if terminator stops at two
 418    newlines, it can be used to read the head of an HTTP response.
 419    Upon determining the boundary, the function returns the data (up to
 420    the terminator) in malloc-allocated storage.
 421
 422    In case of read error, NULL is returned.  In case of EOF and no
 423    data read, NULL is returned and errno set to 0.  In case of having
 424    read some data, but encountering EOF before seeing the terminator,
 425    the data that has been read is returned, but it will (obviously)
 426    not contain the terminator.
 427
 428    The TERMINATOR function is called with three arguments: the
 429    beginning of the data read so far, the beginning of the current
 430    block of peeked-at data, and the length of the current block.
 431    Depending on its needs, the function is free to choose whether to
 432    analyze all data or just the newly arrived data.  If TERMINATOR
 433    returns NULL, it means that the terminator has not been seen.
 434    Otherwise it should return a pointer to the charactre immediately
 435    following the terminator.
 436
 437    The idea is to be able to read a line of input, or otherwise a hunk
 438    of text, such as the head of an HTTP request, without crossing the
 439    boundary, so that the next call to fd_read etc. reads the data
 440    after the hunk.  To achieve that, this function does the following:
 441
 442    1. Peek at incoming data.
 443
 444    2. Determine whether the peeked data, along with the previously
 445       read data, includes the terminator.
 446
 447       2a. If yes, read the data until the end of the terminator, and
 448           exit.
 449
 450       2b. If no, read the peeked data and goto 1.
 451
 452    The function is careful to assume as little as possible about the
 453    implementation of peeking.  For example, every peek is followed by
 454    a read.  If the read returns a different amount of data, the
 455    process is retried until all data arrives safely.
 456
 457    SIZEHINT is the buffer size sufficient to hold all the data in the
 458    typical case (it is used as the initial buffer size).  MAXSIZE is
 459    the maximum amount of memory this function is allowed to allocate,
 460    or 0 if no upper limit is to be enforced.
 461
 462    This function should be used as a building block for other
 463    functions -- see fd_read_line as a simple example.  */
 464
 465 char *
 466 fd_read_hunk (int fd, hunk_terminator_t terminator, long sizehint, long maxsize)
 467 {
 468   long bufsize = sizehint;
 469   char *hunk = xmalloc (bufsize);
 470   int tail = 0;                 /* tail position in HUNK */
 471
 472   assert (maxsize >= bufsize);
 473
 474   while (1)
 475     {
 476       const char *end;
 477       int pklen, rdlen, remain;
 478
 479       /* First, peek at the available data. */
 480
 481       pklen = fd_peek (fd, hunk + tail, bufsize - 1 - tail, -1);
 482       if (pklen < 0)
 483         {
 484           xfree (hunk);
 485           return NULL;
 486         }
 487       end = terminator (hunk, hunk + tail, pklen);
 488       if (end)
 489         {
 490           /* The data contains the terminator: we'll drain the data up
 491              to the end of the terminator.  */
 492           remain = end - (hunk + tail);
 493           assert (remain >= 0);
 494           if (remain == 0)
 495             {
 496               /* No more data needs to be read. */
 497               hunk[tail] = '\0';
 498               return hunk;
 499             }
 500           if (bufsize - 1 < tail + remain)
 501             {
 502               bufsize = tail + remain + 1;
 503               hunk = xrealloc (hunk, bufsize);
 504             }
 505         }
 506       else
 507         /* No terminator: simply read the data we know is (or should
 508            be) available.  */
 509         remain = pklen;
 510
 511       /* Now, read the data.  Note that we make no assumptions about
 512          how much data we'll get.  (Some TCP stacks are notorious for
 513          read returning less data than the previous MSG_PEEK.)  */
 514
 515       rdlen = fd_read (fd, hunk + tail, remain, 0);
 516       if (rdlen < 0)
 517         {
 518           xfree_null (hunk);
 519           return NULL;
 520         }
 521       tail += rdlen;
 522       hunk[tail] = '\0';
 523
 524       if (rdlen == 0)
 525         {
 526           if (tail == 0)
 527             {
 528               /* EOF without anything having been read */
 529               xfree (hunk);
 530               errno = 0;
 531               return NULL;
 532             }
 533           else
 534             /* EOF seen: return the data we've read. */
 535             return hunk;
 536         }
 537       if (end && rdlen == remain)
 538         /* The terminator was seen and the remaining data drained --
 539            we got what we came for.  */
 540         return hunk;
 541
 542       /* Keep looping until all the data arrives. */
 543
 544       if (tail == bufsize - 1)
 545         {
 546           /* Double the buffer size, but refuse to allocate more than
 547              MAXSIZE bytes.  */
 548           if (maxsize && bufsize >= maxsize)
 549             {
 550               xfree (hunk);
 551               errno = ENOMEM;
 552               return NULL;
 553             }
 554           bufsize <<= 1;
 555           if (maxsize && bufsize > maxsize)
 556             bufsize = maxsize;
 557           hunk = xrealloc (hunk, bufsize);
 558         }
 559     }
 560 }
 561
 562 static const char *
 563 line_terminator (const char *start, const char *peeked, int peeklen)
 564 {
 565   const char *p = memchr (peeked, '\n', peeklen);
 566   if (p)
 567     /* p+1 because the line must include '\n' */
 568     return p + 1;
 569   return NULL;
 570 }
 571
 572 /* The maximum size of the single line we agree to accept.  This is
 573    not meant to impose an arbitrary limit, but to protect the user
 574    from Wget slurping up available memory upon encountering malicious
 575    or buggy server output.  Define it to 0 to remove the limit.  */
 576 #define FD_READ_LINE_MAX 4096
 577
 578 /* Read one line from FD and return it.  The line is allocated using
 579    malloc, but is never larger than FD_READ_LINE_MAX.
 580
 581    If an error occurs, or if no data can be read, NULL is returned.
 582    In the former case errno indicates the error condition, and in the
 583    latter case, errno is NULL.  */
 584
 585 char *
 586 fd_read_line (int fd)
 587 {
 588   return fd_read_hunk (fd, line_terminator, 128, FD_READ_LINE_MAX);
 589 }
 590 \f
 591 /* Return a printed representation of the download rate, along with
 592    the units appropriate for the download speed.  */
 593
 594 const char *
 595 retr_rate (wgint bytes, double secs)
 596 {
 597   static char res[20];
 598   static const char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 599   int units;
 600
 601   double dlrate = calc_rate (bytes, secs, &units);
 602   /* Use more digits for smaller numbers (regardless of unit used),
 603      e.g. "1022", "247", "12.5", "2.38".  */
 604   sprintf (res, "%.*f %s",
 605            dlrate >= 99.95 ? 0 : dlrate >= 9.995 ? 1 : 2,
 606            dlrate, rate_names[units]);
 607
 608   return res;
 609 }
 610
 611 /* Calculate the download rate and trim it as appropriate for the
 612    speed.  Appropriate means that if rate is greater than 1K/s,
 613    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 614    are used.
 615
 616    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 617    GB/s.  */
 618
 619 double
 620 calc_rate (wgint bytes, double secs, int *units)
 621 {
 622   double dlrate;
 623
 624   assert (secs >= 0);
 625   assert (bytes >= 0);
 626
 627   if (secs == 0)
 628     /* If elapsed time is exactly zero, it means we're under the
 629        resolution of the timer.  This can easily happen on systems
 630        that use time() for the timer.  Since the interval lies between
 631        0 and the timer's resolution, assume half the resolution.  */
 632     secs = ptimer_resolution () / 2.0;
 633
 634   dlrate = bytes / secs;
 635   if (dlrate < 1024.0)
 636     *units = 0;
 637   else if (dlrate < 1024.0 * 1024.0)
 638     *units = 1, dlrate /= 1024.0;
 639   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 640     *units = 2, dlrate /= (1024.0 * 1024.0);
 641   else
 642     /* Maybe someone will need this, one day. */
 643     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
 644
 645   return dlrate;
 646 }
 647 \f
 648
 649 #define SUSPEND_POST_DATA do {                  \
 650   post_data_suspended = true;                   \
 651   saved_post_data = opt.post_data;              \
 652   saved_post_file_name = opt.post_file_name;    \
 653   opt.post_data = NULL;                         \
 654   opt.post_file_name = NULL;                    \
 655 } while (0)
 656
 657 #define RESTORE_POST_DATA do {                          \
 658   if (post_data_suspended)                              \
 659     {                                                   \
 660       opt.post_data = saved_post_data;                  \
 661       opt.post_file_name = saved_post_file_name;        \
 662       post_data_suspended = false;                      \
 663     }                                                   \
 664 } while (0)
 665
 666 static char *getproxy (struct url *);
 667
 668 /* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
 669    FTP, proxy, etc.  */
 670
 671 /* #### This function should be rewritten so it doesn't return from
 672    multiple points. */
 673
 674 uerr_t
 675 retrieve_url (const char *origurl, char **file, char **newloc,
 676               const char *refurl, int *dt, bool recursive)
 677 {
 678   uerr_t result;
 679   char *url;
 680   bool location_changed;
 681   int dummy;
 682   char *mynewloc, *proxy;
 683   struct url *u, *proxy_url;
 684   int up_error_code;            /* url parse error code */
 685   char *local_file;
 686   int redirection_count = 0;
 687
 688   bool post_data_suspended = false;
 689   char *saved_post_data = NULL;
 690   char *saved_post_file_name = NULL;
 691
 692   /* If dt is NULL, use local storage.  */
 693   if (!dt)
 694     {
 695       dt = &dummy;
 696       dummy = 0;
 697     }
 698   url = xstrdup (origurl);
 699   if (newloc)
 700     *newloc = NULL;
 701   if (file)
 702     *file = NULL;
 703
 704   u = url_parse (url, &up_error_code);
 705   if (!u)
 706     {
 707       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
 708       xfree (url);
 709       return URLERROR;
 710     }
 711
 712   if (!refurl)
 713     refurl = opt.referer;
 714
 715  redirected:
 716
 717   result = NOCONERROR;
 718   mynewloc = NULL;
 719   local_file = NULL;
 720   proxy_url = NULL;
 721
 722   proxy = getproxy (u);
 723   if (proxy)
 724     {
 725       /* Parse the proxy URL.  */
 726       proxy_url = url_parse (proxy, &up_error_code);
 727       if (!proxy_url)
 728         {
 729           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 730                      proxy, url_error (up_error_code));
 731           xfree (url);
 732           RESTORE_POST_DATA;
 733           return PROXERR;
 734         }
 735       if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
 736         {
 737           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 738           url_free (proxy_url);
 739           xfree (url);
 740           RESTORE_POST_DATA;
 741           return PROXERR;
 742         }
 743     }
 744
 745   if (u->scheme == SCHEME_HTTP
 746 #ifdef HAVE_SSL
 747       || u->scheme == SCHEME_HTTPS
 748 #endif
 749       || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
 750     {
 751       result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
 752     }
 753   else if (u->scheme == SCHEME_FTP)
 754     {
 755       /* If this is a redirection, temporarily turn off opt.ftp_glob
 756          and opt.recursive, both being undesirable when following
 757          redirects.  */
 758       bool oldrec = recursive, glob = opt.ftp_glob;
 759       if (redirection_count)
 760         oldrec = glob = false;
 761
 762       result = ftp_loop (u, dt, proxy_url, recursive, glob);
 763       recursive = oldrec;
 764
 765       /* There is a possibility of having HTTP being redirected to
 766          FTP.  In these cases we must decide whether the text is HTML
 767          according to the suffix.  The HTML suffixes are `.html',
 768          `.htm' and a few others, case-insensitive.  */
 769       if (redirection_count && local_file && u->scheme == SCHEME_FTP)
 770         {
 771           if (has_html_suffix_p (local_file))
 772             *dt |= TEXTHTML;
 773         }
 774     }
 775
 776   if (proxy_url)
 777     {
 778       url_free (proxy_url);
 779       proxy_url = NULL;
 780     }
 781
 782   location_changed = (result == NEWLOCATION);
 783   if (location_changed)
 784     {
 785       char *construced_newloc;
 786       struct url *newloc_parsed;
 787
 788       assert (mynewloc != NULL);
 789
 790       if (local_file)
 791         xfree (local_file);
 792
 793       /* The HTTP specs only allow absolute URLs to appear in
 794          redirects, but a ton of boneheaded webservers and CGIs out
 795          there break the rules and use relative URLs, and popular
 796          browsers are lenient about this, so wget should be too. */
 797       construced_newloc = uri_merge (url, mynewloc);
 798       xfree (mynewloc);
 799       mynewloc = construced_newloc;
 800
 801       /* Now, see if this new location makes sense. */
 802       newloc_parsed = url_parse (mynewloc, &up_error_code);
 803       if (!newloc_parsed)
 804         {
 805           logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
 806                      url_error (up_error_code));
 807           url_free (u);
 808           xfree (url);
 809           xfree (mynewloc);
 810           RESTORE_POST_DATA;
 811           return result;
 812         }
 813
 814       /* Now mynewloc will become newloc_parsed->url, because if the
 815          Location contained relative paths like .././something, we
 816          don't want that propagating as url.  */
 817       xfree (mynewloc);
 818       mynewloc = xstrdup (newloc_parsed->url);
 819
 820       /* Check for max. number of redirections.  */
 821       if (++redirection_count > opt.max_redirect)
 822         {
 823           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
 824                      opt.max_redirect);
 825           url_free (newloc_parsed);
 826           url_free (u);
 827           xfree (url);
 828           xfree (mynewloc);
 829           RESTORE_POST_DATA;
 830           return WRONGCODE;
 831         }
 832
 833       xfree (url);
 834       url = mynewloc;
 835       url_free (u);
 836       u = newloc_parsed;
 837
 838       /* If we're being redirected from POST, we don't want to POST
 839          again.  Many requests answer POST with a redirection to an
 840          index page; that redirection is clearly a GET.  We "suspend"
 841          POST data for the duration of the redirections, and restore
 842          it when we're done. */
 843       if (!post_data_suspended)
 844         SUSPEND_POST_DATA;
 845
 846       goto redirected;
 847     }
 848
 849   if (local_file)
 850     {
 851       if (*dt & RETROKF)
 852         {
 853           register_download (u->url, local_file);
 854           if (redirection_count && 0 != strcmp (origurl, u->url))
 855             register_redirection (origurl, u->url);
 856           if (*dt & TEXTHTML)
 857             register_html (u->url, local_file);
 858         }
 859     }
 860
 861   if (file)
 862     *file = local_file ? local_file : NULL;
 863   else
 864     xfree_null (local_file);
 865
 866   url_free (u);
 867
 868   if (redirection_count)
 869     {
 870       if (newloc)
 871         *newloc = url;
 872       else
 873         xfree (url);
 874     }
 875   else
 876     {
 877       if (newloc)
 878         *newloc = NULL;
 879       xfree (url);
 880     }
 881
 882   RESTORE_POST_DATA;
 883
 884   return result;
 885 }
 886
 887 /* Find the URLs in the file and call retrieve_url() for each of them.
 888    If HTML is true, treat the file as HTML, and construct the URLs
 889    accordingly.
 890
 891    If opt.recursive is set, call retrieve_tree() for each file.  */
 892
 893 uerr_t
 894 retrieve_from_file (const char *file, bool html, int *count)
 895 {
 896   uerr_t status;
 897   struct urlpos *url_list, *cur_url;
 898
 899   url_list = (html ? get_urls_html (file, NULL, NULL)
 900               : get_urls_file (file));
 901   status = RETROK;             /* Suppose everything is OK.  */
 902   *count = 0;                  /* Reset the URL count.  */
 903
 904   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
 905     {
 906       char *filename = NULL, *new_file = NULL;
 907       int dt;
 908
 909       if (cur_url->ignore_when_downloading)
 910         continue;
 911
 912       if (opt.quota && total_downloaded_bytes > opt.quota)
 913         {
 914           status = QUOTEXC;
 915           break;
 916         }
 917       if ((opt.recursive || opt.page_requisites)
 918           && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
 919         {
 920           int old_follow_ftp = opt.follow_ftp;
 921
 922           /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
 923           if (cur_url->url->scheme == SCHEME_FTP)
 924             opt.follow_ftp = 1;
 925
 926           status = retrieve_tree (cur_url->url->url);
 927
 928           opt.follow_ftp = old_follow_ftp;
 929         }
 930       else
 931         status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
 932
 933       if (filename && opt.delete_after && file_exists_p (filename))
 934         {
 935           DEBUGP (("\
 936 Removing file due to --delete-after in retrieve_from_file():\n"));
 937           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
 938           if (unlink (filename))
 939             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 940           dt &= ~RETROKF;
 941         }
 942
 943       xfree_null (new_file);
 944       xfree_null (filename);
 945     }
 946
 947   /* Free the linked list of URL-s.  */
 948   free_urlpos (url_list);
 949
 950   return status;
 951 }
 952
 953 /* Print `giving up', or `retrying', depending on the impending
 954    action.  N1 and N2 are the attempt number and the attempt limit.  */
 955 void
 956 printwhat (int n1, int n2)
 957 {
 958   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
 959 }
 960
 961 /* If opt.wait or opt.waitretry are specified, and if certain
 962    conditions are met, sleep the appropriate number of seconds.  See
 963    the documentation of --wait and --waitretry for more information.
 964
 965    COUNT is the count of current retrieval, beginning with 1. */
 966
 967 void
 968 sleep_between_retrievals (int count)
 969 {
 970   static bool first_retrieval = true;
 971
 972   if (first_retrieval)
 973     {
 974       /* Don't sleep before the very first retrieval. */
 975       first_retrieval = false;
 976       return;
 977     }
 978
 979   if (opt.waitretry && count > 1)
 980     {
 981       /* If opt.waitretry is specified and this is a retry, wait for
 982          COUNT-1 number of seconds, or for opt.waitretry seconds.  */
 983       if (count <= opt.waitretry)
 984         xsleep (count - 1);
 985       else
 986         xsleep (opt.waitretry);
 987     }
 988   else if (opt.wait)
 989     {
 990       if (!opt.random_wait || count > 1)
 991         /* If random-wait is not specified, or if we are sleeping
 992            between retries of the same download, sleep the fixed
 993            interval.  */
 994         xsleep (opt.wait);
 995       else
 996         {
 997           /* Sleep a random amount of time averaging in opt.wait
 998              seconds.  The sleeping amount ranges from 0.5*opt.wait to
 999              1.5*opt.wait.  */
1000           double waitsecs = (0.5 + random_float ()) * opt.wait;
1001           DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
1002                    opt.wait, waitsecs));
1003           xsleep (waitsecs);
1004         }
1005     }
1006 }
1007
1008 /* Free the linked list of urlpos.  */
1009 void
1010 free_urlpos (struct urlpos *l)
1011 {
1012   while (l)
1013     {
1014       struct urlpos *next = l->next;
1015       if (l->url)
1016         url_free (l->url);
1017       xfree_null (l->local_name);
1018       xfree (l);
1019       l = next;
1020     }
1021 }
1022
1023 /* Rotate FNAME opt.backups times */
1024 void
1025 rotate_backups(const char *fname)
1026 {
1027   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1028   char *from = (char *)alloca (maxlen);
1029   char *to = (char *)alloca (maxlen);
1030   struct_stat sb;
1031   int i;
1032
1033   if (stat (fname, &sb) == 0)
1034     if (S_ISREG (sb.st_mode) == 0)
1035       return;
1036
1037   for (i = opt.backups; i > 1; i--)
1038     {
1039       sprintf (from, "%s.%d", fname, i - 1);
1040       sprintf (to, "%s.%d", fname, i);
1041       rename (from, to);
1042     }
1043
1044   sprintf (to, "%s.%d", fname, 1);
1045   rename(fname, to);
1046 }
1047
1048 static bool no_proxy_match (const char *, const char **);
1049
1050 /* Return the URL of the proxy appropriate for url U.  */
1051
1052 static char *
1053 getproxy (struct url *u)
1054 {
1055   char *proxy = NULL;
1056   char *rewritten_url;
1057   static char rewritten_storage[1024];
1058
1059   if (!opt.use_proxy)
1060     return NULL;
1061   if (no_proxy_match (u->host, (const char **)opt.no_proxy))
1062     return NULL;
1063
1064   switch (u->scheme)
1065     {
1066     case SCHEME_HTTP:
1067       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1068       break;
1069 #ifdef HAVE_SSL
1070     case SCHEME_HTTPS:
1071       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1072       break;
1073 #endif
1074     case SCHEME_FTP:
1075       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1076       break;
1077     case SCHEME_INVALID:
1078       break;
1079     }
1080   if (!proxy || !*proxy)
1081     return NULL;
1082
1083   /* Handle shorthands.  `rewritten_storage' is a kludge to allow
1084      getproxy() to return static storage. */
1085   rewritten_url = rewrite_shorthand_url (proxy);
1086   if (rewritten_url)
1087     {
1088       strncpy (rewritten_storage, rewritten_url, sizeof (rewritten_storage));
1089       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1090       proxy = rewritten_storage;
1091     }
1092
1093   return proxy;
1094 }
1095
/* Tell whether URL would be downloaded through a proxy, i.e. whether
   getproxy() yields a proxy for it.  A URL that url_parse() rejects
   is reported as not using a proxy.  */

bool
url_uses_proxy (const char *url)
{
  bool proxied = false;
  struct url *parsed = url_parse (url, NULL);

  if (parsed)
    {
      proxied = (getproxy (parsed) != NULL);
      url_free (parsed);
    }

  return proxied;
}
1109
/* Return whether HOST is matched by the NO_PROXY list, as decided by
   sufmatch().  A NULL list matches nothing.  getproxy() treats a
   match as "do not use a proxy for this host".  */
static bool
no_proxy_match (const char *host, const char **no_proxy)
{
  return no_proxy ? sufmatch (no_proxy, host) : false;
}