sjero.net Git - wget/blob - src/retr.c

   1 /* File retrieval.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   3    2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #ifdef HAVE_UNISTD_H
  36 # include <unistd.h>
  37 #endif /* HAVE_UNISTD_H */
  38 #include <errno.h>
  39 #include <string.h>
  40 #include <assert.h>
  41
  42 #include "utils.h"
  43 #include "retr.h"
  44 #include "progress.h"
  45 #include "url.h"
  46 #include "recur.h"
  47 #include "ftp.h"
  48 #include "http.h"
  49 #include "host.h"
  50 #include "connect.h"
  51 #include "hash.h"
  52 #include "convert.h"
  53 #include "ptimer.h"
  54 #include "html-url.h"
  55 #include "iri.h"
  56
  57 /* Total size of downloaded files.  Used to enforce quota.  */
  58 SUM_SIZE_INT total_downloaded_bytes;
  59
  60 /* Total download time in seconds. */
  61 double total_download_time;
  62
  63 /* If non-NULL, the stream to which output should be written.  This
  64    stream is initialized when `-O' is used.  */
  65 FILE *output_stream;
  66
  67 /* Whether output_document is a regular file we can manipulate,
  68    i.e. not `-' or a device file. */
  69 bool output_stream_regular;
  70 \f
/* Bookkeeping for bandwidth throttling (opt.limit_rate), shared by
   limit_bandwidth_reset and limit_bandwidth.  */
static struct {
  wgint chunk_bytes;            /* bytes received since the current chunk began */
  double chunk_start;           /* timer reading taken when the current chunk began */
  double sleep_adjust;          /* desired minus actual length of the last sleep
                                   (clamped to +-0.5s); added to the next sleep */
} limit_data;
  76
/* Clear the throttling state kept in limit_data.  fd_read_body calls
   this before each download when opt.limit_rate is set.  */
static void
limit_bandwidth_reset (void)
{
  /* xzero is defined elsewhere; presumably it zero-fills the object.  */
  xzero (limit_data);
}
  82
  83 /* Limit the bandwidth by pausing the download for an amount of time.
  84    BYTES is the number of bytes received from the network, and TIMER
  85    is the timer that started at the beginning of download.  */
  86
  87 static void
  88 limit_bandwidth (wgint bytes, struct ptimer *timer)
  89 {
  90   double delta_t = ptimer_read (timer) - limit_data.chunk_start;
  91   double expected;
  92
  93   limit_data.chunk_bytes += bytes;
  94
  95   /* Calculate the amount of time we expect downloading the chunk
  96      should take.  If in reality it took less time, sleep to
  97      compensate for the difference.  */
  98   expected = (double) limit_data.chunk_bytes / opt.limit_rate;
  99
 100   if (expected > delta_t)
 101     {
 102       double slp = expected - delta_t + limit_data.sleep_adjust;
 103       double t0, t1;
 104       if (slp < 0.2)
 105         {
 106           DEBUGP (("deferring a %.2f ms sleep (%s/%.2f).\n",
 107                    slp * 1000, number_to_static_string (limit_data.chunk_bytes),
 108                    delta_t));
 109           return;
 110         }
 111       DEBUGP (("\nsleeping %.2f ms for %s bytes, adjust %.2f ms\n",
 112                slp * 1000, number_to_static_string (limit_data.chunk_bytes),
 113                limit_data.sleep_adjust));
 114
 115       t0 = ptimer_read (timer);
 116       xsleep (slp);
 117       t1 = ptimer_measure (timer);
 118
 119       /* Due to scheduling, we probably slept slightly longer (or
 120          shorter) than desired.  Calculate the difference between the
 121          desired and the actual sleep, and adjust the next sleep by
 122          that amount.  */
 123       limit_data.sleep_adjust = slp - (t1 - t0);
 124       /* If sleep_adjust is very large, it's likely due to suspension
 125          and not clock inaccuracy.  Don't enforce those.  */
 126       if (limit_data.sleep_adjust > 0.5)
 127         limit_data.sleep_adjust = 0.5;
 128       else if (limit_data.sleep_adjust < -0.5)
 129         limit_data.sleep_adjust = -0.5;
 130     }
 131
 132   limit_data.chunk_bytes = 0;
 133   limit_data.chunk_start = ptimer_read (timer);
 134 }
 135
/* Smaller of I and J.  Only defined if no header has provided MIN
   already.  Each argument may be evaluated twice, so do not pass
   expressions with side effects.  */
#ifndef MIN
# define MIN(i, j) ((i) <= (j) ? (i) : (j))
#endif
 139
 140 /* Write data in BUF to OUT.  However, if *SKIP is non-zero, skip that
 141    amount of data and decrease SKIP.  Increment *TOTAL by the amount
 142    of data written.  */
 143
 144 static int
 145 write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
 146             wgint *written)
 147 {
 148   if (!out)
 149     return 1;
 150   if (*skip > bufsize)
 151     {
 152       *skip -= bufsize;
 153       return 1;
 154     }
 155   if (*skip)
 156     {
 157       buf += *skip;
 158       bufsize -= *skip;
 159       *skip = 0;
 160       if (bufsize == 0)
 161         return 1;
 162     }
 163
 164   fwrite (buf, 1, bufsize, out);
 165   *written += bufsize;
 166
 167   /* Immediately flush the downloaded data.  This should not hinder
 168      performance: fast downloads will arrive in large 16K chunks
 169      (which stdio would write out immediately anyway), and slow
 170      downloads wouldn't be limited by disk speed.  */
 171
 172   /* 2005-04-20 SMS.
 173      Perhaps it shouldn't hinder performance, but it sure does, at least
 174      on VMS (more than 2X).  Rather than speculate on what it should or
 175      shouldn't do, it might make more sense to test it.  Even better, it
 176      might be nice to explain what possible benefit it could offer, as
 177      it appears to be a clear invitation to poor performance with no
 178      actual justification.  (Also, why 16K?  Anyone test other values?)
 179   */
 180 #ifndef __VMS
 181   fflush (out);
 182 #endif /* ndef __VMS */
 183   return !ferror (out);
 184 }
 185
 186 /* Read the contents of file descriptor FD until it the connection
 187    terminates or a read error occurs.  The data is read in portions of
 188    up to 16K and written to OUT as it arrives.  If opt.verbose is set,
 189    the progress is shown.
 190
 191    TOREAD is the amount of data expected to arrive, normally only used
 192    by the progress gauge.
 193
 194    STARTPOS is the position from which the download starts, used by
 195    the progress gauge.  If QTYREAD is non-NULL, the value it points to
 196    is incremented by the amount of data read from the network.  If
 197    QTYWRITTEN is non-NULL, the value it points to is incremented by
 198    the amount of data written to disk.  The time it took to download
 199    the data is stored to ELAPSED.
 200
 201    The function exits and returns the amount of data read.  In case of
 202    error while reading data, -1 is returned.  In case of error while
 203    writing data, -2 is returned.  */
 204
 205 int
 206 fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
 207               wgint *qtyread, wgint *qtywritten, double *elapsed, int flags)
 208 {
 209   int ret = 0;
 210
 211   static char dlbuf[16384];
 212   int dlbufsize = sizeof (dlbuf);
 213
 214   struct ptimer *timer = NULL;
 215   double last_successful_read_tm = 0;
 216
 217   /* The progress gauge, set according to the user preferences. */
 218   void *progress = NULL;
 219
 220   /* Non-zero if the progress gauge is interactive, i.e. if it can
 221      continually update the display.  When true, smaller timeout
 222      values are used so that the gauge can update the display when
 223      data arrives slowly. */
 224   bool progress_interactive = false;
 225
 226   bool exact = !!(flags & rb_read_exactly);
 227   wgint skip = 0;
 228
 229   /* How much data we've read/written.  */
 230   wgint sum_read = 0;
 231   wgint sum_written = 0;
 232
 233   if (flags & rb_skip_startpos)
 234     skip = startpos;
 235
 236   if (opt.verbose)
 237     {
 238       /* If we're skipping STARTPOS bytes, pass 0 as the INITIAL
 239          argument to progress_create because the indicator doesn't
 240          (yet) know about "skipping" data.  */
 241       wgint start = skip ? 0 : startpos;
 242       progress = progress_create (start, start + toread);
 243       progress_interactive = progress_interactive_p (progress);
 244     }
 245
 246   if (opt.limit_rate)
 247     limit_bandwidth_reset ();
 248
 249   /* A timer is needed for tracking progress, for throttling, and for
 250      tracking elapsed time.  If either of these are requested, start
 251      the timer.  */
 252   if (progress || opt.limit_rate || elapsed)
 253     {
 254       timer = ptimer_new ();
 255       last_successful_read_tm = 0;
 256     }
 257
 258   /* Use a smaller buffer for low requested bandwidths.  For example,
 259      with --limit-rate=2k, it doesn't make sense to slurp in 16K of
 260      data and then sleep for 8s.  With buffer size equal to the limit,
 261      we never have to sleep for more than one second.  */
 262   if (opt.limit_rate && opt.limit_rate < dlbufsize)
 263     dlbufsize = opt.limit_rate;
 264
 265   /* Read from FD while there is data to read.  Normally toread==0
 266      means that it is unknown how much data is to arrive.  However, if
 267      EXACT is set, then toread==0 means what it says: that no data
 268      should be read.  */
 269   while (!exact || (sum_read < toread))
 270     {
 271       int rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize;
 272       double tmout = opt.read_timeout;
 273       if (progress_interactive)
 274         {
 275           /* For interactive progress gauges, always specify a ~1s
 276              timeout, so that the gauge can be updated regularly even
 277              when the data arrives very slowly or stalls.  */
 278           tmout = 0.95;
 279           if (opt.read_timeout)
 280             {
 281               double waittm;
 282               waittm = ptimer_read (timer) - last_successful_read_tm;
 283               if (waittm + tmout > opt.read_timeout)
 284                 {
 285                   /* Don't let total idle time exceed read timeout. */
 286                   tmout = opt.read_timeout - waittm;
 287                   if (tmout < 0)
 288                     {
 289                       /* We've already exceeded the timeout. */
 290                       ret = -1, errno = ETIMEDOUT;
 291                       break;
 292                     }
 293                 }
 294             }
 295         }
 296       ret = fd_read (fd, dlbuf, rdsize, tmout);
 297
 298       if (progress_interactive && ret < 0 && errno == ETIMEDOUT)
 299         ret = 0;                /* interactive timeout, handled above */
 300       else if (ret <= 0)
 301         break;                  /* EOF or read error */
 302
 303       if (progress || opt.limit_rate)
 304         {
 305           ptimer_measure (timer);
 306           if (ret > 0)
 307             last_successful_read_tm = ptimer_read (timer);
 308         }
 309
 310       if (ret > 0)
 311         {
 312           sum_read += ret;
 313           if (!write_data (out, dlbuf, ret, &skip, &sum_written))
 314             {
 315               ret = -2;
 316               goto out;
 317             }
 318         }
 319
 320       if (opt.limit_rate)
 321         limit_bandwidth (ret, timer);
 322
 323       if (progress)
 324         progress_update (progress, ret, ptimer_read (timer));
 325 #ifdef WINDOWS
 326       if (toread > 0 && !opt.quiet)
 327         ws_percenttitle (100.0 *
 328                          (startpos + sum_read) / (startpos + toread));
 329 #endif
 330     }
 331   if (ret < -1)
 332     ret = -1;
 333
 334  out:
 335   if (progress)
 336     progress_finish (progress, ptimer_read (timer));
 337
 338   if (elapsed)
 339     *elapsed = ptimer_read (timer);
 340   if (timer)
 341     ptimer_destroy (timer);
 342
 343   if (qtyread)
 344     *qtyread += sum_read;
 345   if (qtywritten)
 346     *qtywritten += sum_written;
 347
 348   return ret;
 349 }
 350 \f
 351 /* Read a hunk of data from FD, up until a terminator.  The hunk is
 352    limited by whatever the TERMINATOR callback chooses as its
 353    terminator.  For example, if terminator stops at newline, the hunk
 354    will consist of a line of data; if terminator stops at two
 355    newlines, it can be used to read the head of an HTTP response.
 356    Upon determining the boundary, the function returns the data (up to
 357    the terminator) in malloc-allocated storage.
 358
 359    In case of read error, NULL is returned.  In case of EOF and no
 360    data read, NULL is returned and errno set to 0.  In case of having
 361    read some data, but encountering EOF before seeing the terminator,
 362    the data that has been read is returned, but it will (obviously)
 363    not contain the terminator.
 364
 365    The TERMINATOR function is called with three arguments: the
 366    beginning of the data read so far, the beginning of the current
 367    block of peeked-at data, and the length of the current block.
 368    Depending on its needs, the function is free to choose whether to
 369    analyze all data or just the newly arrived data.  If TERMINATOR
 370    returns NULL, it means that the terminator has not been seen.
 371    Otherwise it should return a pointer to the charactre immediately
 372    following the terminator.
 373
 374    The idea is to be able to read a line of input, or otherwise a hunk
 375    of text, such as the head of an HTTP request, without crossing the
 376    boundary, so that the next call to fd_read etc. reads the data
 377    after the hunk.  To achieve that, this function does the following:
 378
 379    1. Peek at incoming data.
 380
 381    2. Determine whether the peeked data, along with the previously
 382       read data, includes the terminator.
 383
 384       2a. If yes, read the data until the end of the terminator, and
 385           exit.
 386
 387       2b. If no, read the peeked data and goto 1.
 388
 389    The function is careful to assume as little as possible about the
 390    implementation of peeking.  For example, every peek is followed by
 391    a read.  If the read returns a different amount of data, the
 392    process is retried until all data arrives safely.
 393
 394    SIZEHINT is the buffer size sufficient to hold all the data in the
 395    typical case (it is used as the initial buffer size).  MAXSIZE is
 396    the maximum amount of memory this function is allowed to allocate,
 397    or 0 if no upper limit is to be enforced.
 398
 399    This function should be used as a building block for other
 400    functions -- see fd_read_line as a simple example.  */
 401
 402 char *
 403 fd_read_hunk (int fd, hunk_terminator_t terminator, long sizehint, long maxsize)
 404 {
 405   long bufsize = sizehint;
 406   char *hunk = xmalloc (bufsize);
 407   int tail = 0;                 /* tail position in HUNK */
 408
 409   assert (!maxsize || maxsize >= bufsize);
 410
 411   while (1)
 412     {
 413       const char *end;
 414       int pklen, rdlen, remain;
 415
 416       /* First, peek at the available data. */
 417
 418       pklen = fd_peek (fd, hunk + tail, bufsize - 1 - tail, -1);
 419       if (pklen < 0)
 420         {
 421           xfree (hunk);
 422           return NULL;
 423         }
 424       end = terminator (hunk, hunk + tail, pklen);
 425       if (end)
 426         {
 427           /* The data contains the terminator: we'll drain the data up
 428              to the end of the terminator.  */
 429           remain = end - (hunk + tail);
 430           assert (remain >= 0);
 431           if (remain == 0)
 432             {
 433               /* No more data needs to be read. */
 434               hunk[tail] = '\0';
 435               return hunk;
 436             }
 437           if (bufsize - 1 < tail + remain)
 438             {
 439               bufsize = tail + remain + 1;
 440               hunk = xrealloc (hunk, bufsize);
 441             }
 442         }
 443       else
 444         /* No terminator: simply read the data we know is (or should
 445            be) available.  */
 446         remain = pklen;
 447
 448       /* Now, read the data.  Note that we make no assumptions about
 449          how much data we'll get.  (Some TCP stacks are notorious for
 450          read returning less data than the previous MSG_PEEK.)  */
 451
 452       rdlen = fd_read (fd, hunk + tail, remain, 0);
 453       if (rdlen < 0)
 454         {
 455           xfree_null (hunk);
 456           return NULL;
 457         }
 458       tail += rdlen;
 459       hunk[tail] = '\0';
 460
 461       if (rdlen == 0)
 462         {
 463           if (tail == 0)
 464             {
 465               /* EOF without anything having been read */
 466               xfree (hunk);
 467               errno = 0;
 468               return NULL;
 469             }
 470           else
 471             /* EOF seen: return the data we've read. */
 472             return hunk;
 473         }
 474       if (end && rdlen == remain)
 475         /* The terminator was seen and the remaining data drained --
 476            we got what we came for.  */
 477         return hunk;
 478
 479       /* Keep looping until all the data arrives. */
 480
 481       if (tail == bufsize - 1)
 482         {
 483           /* Double the buffer size, but refuse to allocate more than
 484              MAXSIZE bytes.  */
 485           if (maxsize && bufsize >= maxsize)
 486             {
 487               xfree (hunk);
 488               errno = ENOMEM;
 489               return NULL;
 490             }
 491           bufsize <<= 1;
 492           if (maxsize && bufsize > maxsize)
 493             bufsize = maxsize;
 494           hunk = xrealloc (hunk, bufsize);
 495         }
 496     }
 497 }
 498
 499 static const char *
 500 line_terminator (const char *start, const char *peeked, int peeklen)
 501 {
 502   const char *p = memchr (peeked, '\n', peeklen);
 503   if (p)
 504     /* p+1 because the line must include '\n' */
 505     return p + 1;
 506   return NULL;
 507 }
 508
 509 /* The maximum size of the single line we agree to accept.  This is
 510    not meant to impose an arbitrary limit, but to protect the user
 511    from Wget slurping up available memory upon encountering malicious
 512    or buggy server output.  Define it to 0 to remove the limit.  */
 513 #define FD_READ_LINE_MAX 4096
 514
/* Read one line from FD and return it.  The line is allocated using
   malloc, in a buffer that is never larger than FD_READ_LINE_MAX
   bytes.  The returned line includes its terminating newline, unless
   EOF was reached before one was seen.

   If an error occurs, or if no data can be read, NULL is returned.
   In the former case errno indicates the error condition (ENOMEM
   when the line does not fit within FD_READ_LINE_MAX), and in the
   latter case, errno is 0.  */

char *
fd_read_line (int fd)
{
  /* 128 is the initial buffer size; fd_read_hunk grows it on demand.  */
  return fd_read_hunk (fd, line_terminator, 128, FD_READ_LINE_MAX);
}
 527 \f
 528 /* Return a printed representation of the download rate, along with
 529    the units appropriate for the download speed.  */
 530
 531 const char *
 532 retr_rate (wgint bytes, double secs)
 533 {
 534   static char res[20];
 535   static const char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
 536   int units;
 537
 538   double dlrate = calc_rate (bytes, secs, &units);
 539   /* Use more digits for smaller numbers (regardless of unit used),
 540      e.g. "1022", "247", "12.5", "2.38".  */
 541   sprintf (res, "%.*f %s",
 542            dlrate >= 99.95 ? 0 : dlrate >= 9.995 ? 1 : 2,
 543            dlrate, rate_names[units]);
 544
 545   return res;
 546 }
 547
 548 /* Calculate the download rate and trim it as appropriate for the
 549    speed.  Appropriate means that if rate is greater than 1K/s,
 550    kilobytes are used, and if rate is greater than 1MB/s, megabytes
 551    are used.
 552
 553    UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
 554    GB/s.  */
 555
 556 double
 557 calc_rate (wgint bytes, double secs, int *units)
 558 {
 559   double dlrate;
 560
 561   assert (secs >= 0);
 562   assert (bytes >= 0);
 563
 564   if (secs == 0)
 565     /* If elapsed time is exactly zero, it means we're under the
 566        resolution of the timer.  This can easily happen on systems
 567        that use time() for the timer.  Since the interval lies between
 568        0 and the timer's resolution, assume half the resolution.  */
 569     secs = ptimer_resolution () / 2.0;
 570
 571   dlrate = bytes / secs;
 572   if (dlrate < 1024.0)
 573     *units = 0;
 574   else if (dlrate < 1024.0 * 1024.0)
 575     *units = 1, dlrate /= 1024.0;
 576   else if (dlrate < 1024.0 * 1024.0 * 1024.0)
 577     *units = 2, dlrate /= (1024.0 * 1024.0);
 578   else
 579     /* Maybe someone will need this, one day. */
 580     *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
 581
 582   return dlrate;
 583 }
 584 \f
 585
/* Stash opt.post_data and opt.post_file_name and clear them in OPT.
   Expands in a scope that declares the locals post_data_suspended,
   saved_post_data and saved_post_file_name (see retrieve_url).  */
#define SUSPEND_POST_DATA do {                  \
  post_data_suspended = true;                   \
  saved_post_data = opt.post_data;              \
  saved_post_file_name = opt.post_file_name;    \
  opt.post_data = NULL;                         \
  opt.post_file_name = NULL;                    \
} while (0)

/* Undo SUSPEND_POST_DATA: put the stashed values back into OPT.  Does
   nothing if the data is not currently suspended, so it is safe to
   use on every exit path.  */
#define RESTORE_POST_DATA do {                          \
  if (post_data_suspended)                              \
    {                                                   \
      opt.post_data = saved_post_data;                  \
      opt.post_file_name = saved_post_file_name;        \
      post_data_suspended = false;                      \
    }                                                   \
} while (0)
 602
 603 static char *getproxy (struct url *);
 604
 605 /* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
 606    FTP, proxy, etc.  */
 607
 608 /* #### This function should be rewritten so it doesn't return from
 609    multiple points. */
 610
 611 uerr_t
 612 retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
 613               char **newloc, const char *refurl, int *dt, bool recursive,
 614               struct iri *iri)
 615 {
 616   uerr_t result;
 617   char *url;
 618   bool location_changed;
 619   bool iri_fallbacked = 0;
 620   int dummy;
 621   char *mynewloc, *proxy;
 622   struct url *u = orig_parsed, *proxy_url;
 623   int up_error_code;            /* url parse error code */
 624   char *local_file;
 625   int redirection_count = 0;
 626
 627   bool post_data_suspended = false;
 628   char *saved_post_data = NULL;
 629   char *saved_post_file_name = NULL;
 630
 631   /* If dt is NULL, use local storage.  */
 632   if (!dt)
 633     {
 634       dt = &dummy;
 635       dummy = 0;
 636     }
 637   url = xstrdup (origurl);
 638   if (newloc)
 639     *newloc = NULL;
 640   if (file)
 641     *file = NULL;
 642
 643   if (!refurl)
 644     refurl = opt.referer;
 645
 646  redirected:
 647   /* (also for IRI fallbacking) */
 648
 649   result = NOCONERROR;
 650   mynewloc = NULL;
 651   local_file = NULL;
 652   proxy_url = NULL;
 653
 654   proxy = getproxy (u);
 655   if (proxy)
 656     {
 657       struct iri *pi = iri_new ();
 658       set_uri_encoding (pi, opt.locale, true);
 659       pi->utf8_encode = false;
 660
 661       /* Parse the proxy URL.  */
 662       proxy_url = url_parse (proxy, &up_error_code, NULL, true);
 663       if (!proxy_url)
 664         {
 665           char *error = url_error (proxy, up_error_code);
 666           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
 667                      proxy, error);
 668           xfree (url);
 669           xfree (error);
 670           RESTORE_POST_DATA;
 671           return PROXERR;
 672         }
 673       if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
 674         {
 675           logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
 676           url_free (proxy_url);
 677           xfree (url);
 678           RESTORE_POST_DATA;
 679           return PROXERR;
 680         }
 681     }
 682
 683   if (u->scheme == SCHEME_HTTP
 684 #ifdef HAVE_SSL
 685       || u->scheme == SCHEME_HTTPS
 686 #endif
 687       || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
 688     {
 689       result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
 690     }
 691   else if (u->scheme == SCHEME_FTP)
 692     {
 693       /* If this is a redirection, temporarily turn off opt.ftp_glob
 694          and opt.recursive, both being undesirable when following
 695          redirects.  */
 696       bool oldrec = recursive, glob = opt.ftp_glob;
 697       if (redirection_count)
 698         oldrec = glob = false;
 699
 700       result = ftp_loop (u, dt, proxy_url, recursive, glob);
 701       recursive = oldrec;
 702
 703       /* There is a possibility of having HTTP being redirected to
 704          FTP.  In these cases we must decide whether the text is HTML
 705          according to the suffix.  The HTML suffixes are `.html',
 706          `.htm' and a few others, case-insensitive.  */
 707       if (redirection_count && local_file && u->scheme == SCHEME_FTP)
 708         {
 709           if (has_html_suffix_p (local_file))
 710             *dt |= TEXTHTML;
 711         }
 712     }
 713
 714   if (proxy_url)
 715     {
 716       url_free (proxy_url);
 717       proxy_url = NULL;
 718     }
 719
 720   location_changed = (result == NEWLOCATION);
 721   if (location_changed)
 722     {
 723       char *construced_newloc;
 724       struct url *newloc_parsed;
 725
 726       assert (mynewloc != NULL);
 727
 728       if (local_file)
 729         xfree (local_file);
 730
 731       /* The HTTP specs only allow absolute URLs to appear in
 732          redirects, but a ton of boneheaded webservers and CGIs out
 733          there break the rules and use relative URLs, and popular
 734          browsers are lenient about this, so wget should be too. */
 735       construced_newloc = uri_merge (url, mynewloc);
 736       xfree (mynewloc);
 737       mynewloc = construced_newloc;
 738
 739       /* Reset UTF-8 encoding state, keep the URI encoding and reset
 740          the content encoding. */
 741       iri->utf8_encode = opt.enable_iri;
 742       set_content_encoding (iri, NULL);
 743       xfree_null (iri->orig_url);
 744
 745       /* Now, see if this new location makes sense. */
 746       newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
 747       if (!newloc_parsed)
 748         {
 749           char *error = url_error (mynewloc, up_error_code);
 750           logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
 751                      error);
 752           if (orig_parsed != u)
 753             {
 754               url_free (u);
 755             }
 756           xfree (url);
 757           xfree (mynewloc);
 758           xfree (error);
 759           RESTORE_POST_DATA;
 760           return result;
 761         }
 762
 763       /* Now mynewloc will become newloc_parsed->url, because if the
 764          Location contained relative paths like .././something, we
 765          don't want that propagating as url.  */
 766       xfree (mynewloc);
 767       mynewloc = xstrdup (newloc_parsed->url);
 768
 769       /* Check for max. number of redirections.  */
 770       if (++redirection_count > opt.max_redirect)
 771         {
 772           logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
 773                      opt.max_redirect);
 774           url_free (newloc_parsed);
 775           if (orig_parsed != u)
 776             {
 777               url_free (u);
 778             }
 779           xfree (url);
 780           xfree (mynewloc);
 781           RESTORE_POST_DATA;
 782           return WRONGCODE;
 783         }
 784
 785       xfree (url);
 786       url = mynewloc;
 787       if (orig_parsed != u)
 788         {
 789           url_free (u);
 790         }
 791       u = newloc_parsed;
 792
 793       /* If we're being redirected from POST, we don't want to POST
 794          again.  Many requests answer POST with a redirection to an
 795          index page; that redirection is clearly a GET.  We "suspend"
 796          POST data for the duration of the redirections, and restore
 797          it when we're done. */
 798       if (!post_data_suspended)
 799         SUSPEND_POST_DATA;
 800
 801       goto redirected;
 802     }
 803
 804   /* Try to not encode in UTF-8 if fetching failed */
 805   if (!(*dt & RETROKF) && iri->utf8_encode)
 806     {
 807       iri->utf8_encode = false;
 808       if (orig_parsed != u)
 809         {
 810           url_free (u);
 811         }
 812       u = url_parse (origurl, NULL, iri, true);
 813       if (u)
 814         {
 815           DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
 816           url = xstrdup (u->url);
 817           iri_fallbacked = 1;
 818           goto redirected;
 819         }
 820       else
 821           DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url)));
 822     }
 823
 824   if (local_file && *dt & RETROKF)
 825     {
 826       register_download (u->url, local_file);
 827       if (redirection_count && 0 != strcmp (origurl, u->url))
 828         register_redirection (origurl, u->url);
 829       if (*dt & TEXTHTML)
 830         register_html (u->url, local_file);
 831       if (*dt & RETROKF)
 832         {
 833           register_download (u->url, local_file);
 834           if (redirection_count && 0 != strcmp (origurl, u->url))
 835             register_redirection (origurl, u->url);
 836           if (*dt & TEXTHTML)
 837             register_html (u->url, local_file);
 838           if (*dt & TEXTCSS)
 839             register_css (u->url, local_file);
 840         }
 841     }
 842
 843   if (file)
 844     *file = local_file ? local_file : NULL;
 845   else
 846     xfree_null (local_file);
 847
 848   if (orig_parsed != u)
 849     {
 850       url_free (u);
 851     }
 852
 853   if (redirection_count || iri_fallbacked)
 854     {
 855       if (newloc)
 856         *newloc = url;
 857       else
 858         xfree (url);
 859     }
 860   else
 861     {
 862       if (newloc)
 863         *newloc = NULL;
 864       xfree (url);
 865     }
 866
 867   RESTORE_POST_DATA;
 868
 869   return result;
 870 }
 871
 872 /* Find the URLs in the file and call retrieve_url() for each of them.
 873    If HTML is true, treat the file as HTML, and construct the URLs
 874    accordingly.
 875
 876    If opt.recursive is set, call retrieve_tree() for each file.  */
 877
 878 uerr_t
 879 retrieve_from_file (const char *file, bool html, int *count)
 880 {
 881   uerr_t status;
 882   struct urlpos *url_list, *cur_url;
 883   struct iri *iri = iri_new();
 884
 885   char *input_file = NULL;
 886   const char *url = file;
 887
 888   status = RETROK;             /* Suppose everything is OK.  */
 889   *count = 0;                  /* Reset the URL count.  */
 890
 891   /* sXXXav : Assume filename and links in the file are in the locale */
 892   set_uri_encoding (iri, opt.locale, true);
 893   set_content_encoding (iri, opt.locale);
 894
 895   if (url_has_scheme (url))
 896     {
 897       int dt,url_err;
 898       uerr_t status;
 899       struct url * url_parsed = url_parse(url, &url_err, iri, true);
 900
 901       if (!url_parsed)
 902         {
 903           char *error = url_error (url, url_err);
 904           logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
 905           xfree (error);
 906           return URLERROR;
 907         }
 908
 909       if (!opt.base_href)
 910         opt.base_href = xstrdup (url);
 911
 912       status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt,
 913                              false, iri);
 914       if (status != RETROK)
 915         return status;
 916
 917       if (dt & TEXTHTML)
 918         html = true;
 919
 920       /* If we have a found a content encoding, use it.
 921        * ( == is okay, because we're checking for identical object) */
 922       if (iri->content_encoding != opt.locale)
 923           set_uri_encoding (iri, iri->content_encoding, false);
 924
 925       /* Reset UTF-8 encode status */
 926       iri->utf8_encode = opt.enable_iri;
 927       xfree_null (iri->orig_url);
 928       iri->orig_url = NULL;
 929     }
 930   else
 931     input_file = (char *) file;
 932
 933   url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
 934               : get_urls_file (input_file));
 935
 936   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
 937     {
 938       char *filename = NULL, *new_file = NULL;
 939       int dt;
 940       struct iri *tmpiri = iri_dup (iri);
 941       struct url *parsed_url = NULL;
 942
 943       if (cur_url->ignore_when_downloading)
 944         continue;
 945
 946       if (opt.quota && total_downloaded_bytes > opt.quota)
 947         {
 948           status = QUOTEXC;
 949           break;
 950         }
 951
 952       /* Need to reparse the url, since it didn't have iri information. */
 953       if (opt.enable_iri)
 954           parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true);
 955
 956       if ((opt.recursive || opt.page_requisites)
 957           && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
 958         {
 959           int old_follow_ftp = opt.follow_ftp;
 960
 961           /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
 962           if (cur_url->url->scheme == SCHEME_FTP)
 963             opt.follow_ftp = 1;
 964
 965           status = retrieve_tree (parsed_url ? parsed_url : cur_url->url,
 966                                   tmpiri);
 967
 968           opt.follow_ftp = old_follow_ftp;
 969         }
 970       else
 971         status = retrieve_url (parsed_url ? parsed_url : cur_url->url,
 972                                cur_url->url->url, &filename,
 973                                &new_file, NULL, &dt, opt.recursive, tmpiri);
 974
 975       if (parsed_url)
 976           url_free (parsed_url);
 977
 978       if (filename && opt.delete_after && file_exists_p (filename))
 979         {
 980           DEBUGP (("\
 981 Removing file due to --delete-after in retrieve_from_file():\n"));
 982           logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
 983           if (unlink (filename))
 984             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 985           dt &= ~RETROKF;
 986         }
 987
 988       xfree_null (new_file);
 989       xfree_null (filename);
 990       iri_free (tmpiri);
 991     }
 992
 993   /* Free the linked list of URL-s.  */
 994   free_urlpos (url_list);
 995
 996   iri_free (iri);
 997
 998   return status;
 999 }
1000
1001 /* Print `giving up', or `retrying', depending on the impending
1002    action.  N1 and N2 are the attempt number and the attempt limit.  */
1003 void
1004 printwhat (int n1, int n2)
1005 {
1006   logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
1007 }
1008
1009 /* If opt.wait or opt.waitretry are specified, and if certain
1010    conditions are met, sleep the appropriate number of seconds.  See
1011    the documentation of --wait and --waitretry for more information.
1012
1013    COUNT is the count of current retrieval, beginning with 1. */
1014
1015 void
1016 sleep_between_retrievals (int count)
1017 {
1018   static bool first_retrieval = true;
1019
1020   if (first_retrieval)
1021     {
1022       /* Don't sleep before the very first retrieval. */
1023       first_retrieval = false;
1024       return;
1025     }
1026
1027   if (opt.waitretry && count > 1)
1028     {
1029       /* If opt.waitretry is specified and this is a retry, wait for
1030          COUNT-1 number of seconds, or for opt.waitretry seconds.  */
1031       if (count <= opt.waitretry)
1032         xsleep (count - 1);
1033       else
1034         xsleep (opt.waitretry);
1035     }
1036   else if (opt.wait)
1037     {
1038       if (!opt.random_wait || count > 1)
1039         /* If random-wait is not specified, or if we are sleeping
1040            between retries of the same download, sleep the fixed
1041            interval.  */
1042         xsleep (opt.wait);
1043       else
1044         {
1045           /* Sleep a random amount of time averaging in opt.wait
1046              seconds.  The sleeping amount ranges from 0.5*opt.wait to
1047              1.5*opt.wait.  */
1048           double waitsecs = (0.5 + random_float ()) * opt.wait;
1049           DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
1050                    opt.wait, waitsecs));
1051           xsleep (waitsecs);
1052         }
1053     }
1054 }
1055
1056 /* Free the linked list of urlpos.  */
1057 void
1058 free_urlpos (struct urlpos *l)
1059 {
1060   while (l)
1061     {
1062       struct urlpos *next = l->next;
1063       if (l->url)
1064         url_free (l->url);
1065       xfree_null (l->local_name);
1066       xfree (l);
1067       l = next;
1068     }
1069 }
1070
1071 /* Rotate FNAME opt.backups times */
1072 void
1073 rotate_backups(const char *fname)
1074 {
1075   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1076   char *from = (char *)alloca (maxlen);
1077   char *to = (char *)alloca (maxlen);
1078   struct_stat sb;
1079   int i;
1080
1081   if (stat (fname, &sb) == 0)
1082     if (S_ISREG (sb.st_mode) == 0)
1083       return;
1084
1085   for (i = opt.backups; i > 1; i--)
1086     {
1087       sprintf (from, "%s.%d", fname, i - 1);
1088       sprintf (to, "%s.%d", fname, i);
1089       rename (from, to);
1090     }
1091
1092   sprintf (to, "%s.%d", fname, 1);
1093   rename(fname, to);
1094 }
1095
1096 static bool no_proxy_match (const char *, const char **);
1097
1098 /* Return the URL of the proxy appropriate for url U.  */
1099
1100 static char *
1101 getproxy (struct url *u)
1102 {
1103   char *proxy = NULL;
1104   char *rewritten_url;
1105   static char rewritten_storage[1024];
1106
1107   if (!opt.use_proxy)
1108     return NULL;
1109   if (no_proxy_match (u->host, (const char **)opt.no_proxy))
1110     return NULL;
1111
1112   switch (u->scheme)
1113     {
1114     case SCHEME_HTTP:
1115       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1116       break;
1117 #ifdef HAVE_SSL
1118     case SCHEME_HTTPS:
1119       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1120       break;
1121 #endif
1122     case SCHEME_FTP:
1123       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1124       break;
1125     case SCHEME_INVALID:
1126       break;
1127     }
1128   if (!proxy || !*proxy)
1129     return NULL;
1130
1131   /* Handle shorthands.  `rewritten_storage' is a kludge to allow
1132      getproxy() to return static storage. */
1133   rewritten_url = rewrite_shorthand_url (proxy);
1134   if (rewritten_url)
1135     {
1136       strncpy (rewritten_storage, rewritten_url, sizeof (rewritten_storage));
1137       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1138       proxy = rewritten_storage;
1139     }
1140
1141   return proxy;
1142 }
1143
/* Report whether U would be downloaded through a proxy, i.e. whether
   getproxy() yields a proxy URL for it.  A NULL U yields false.  */

bool
url_uses_proxy (struct url * u)
{
  return u != NULL && getproxy (u) != NULL;
}
1155
/* Check HOST against the NO_PROXY list.  Returns false when NO_PROXY
   is NULL; otherwise returns whatever sufmatch() reports for the
   list and HOST.  */
static bool
no_proxy_match (const char *host, const char **no_proxy)
{
  return no_proxy ? sufmatch (no_proxy, host) : false;
}
1165
1166 /* Set the file parameter to point to the local file string.  */
1167 void
1168 set_local_file (const char **file, const char *default_file)
1169 {
1170   if (opt.output_document)
1171     {
1172       if (output_stream_regular)
1173         *file = opt.output_document;
1174     }
1175   else
1176     *file = default_file;
1177 }