sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <ctype.h>
  30 #include <sys/types.h>
  31 #ifdef HAVE_UNISTD_H
  32 # include <unistd.h>
  33 #endif
  34 #include <errno.h>
  35 #include <assert.h>
  36
  37 #include "wget.h"
  38 #include "utils.h"
  39 #include "url.h"
  40 #include "host.h"
  41 #include "html.h"
  42
  43 #ifndef errno
  44 extern int errno;
  45 #endif
  46
/* Default port numbers.  They are stored in the sup_protos table and
   used when a URL does not carry an explicit port.  */
#define DEFAULT_HTTP_PORT 80
#define DEFAULT_FTP_PORT 21

/* Characters that end a URL found in free-form text (see findurl).  */
#define URL_SEPARATOR "!\"#'(),>`{}|<>"
  53
/* A list of unsafe characters for encoding, as per RFC1738.  '@' and
   ':' (not listed in RFC) were added because of user/password
   encoding.  The WINDOWS variant of the list leaves out '#', '~',
   '@' and ':'.  */

#ifndef WINDOWS
# define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
#else  /* WINDOWS */
# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
#endif /* WINDOWS */

/* Non-zero if the character C has to be %XX-encoded: it is a space or
   control character, it lies above '~', or it is listed in
   URL_UNSAFE_CHARS.  C is evaluated more than once, so it must not
   have side effects.  */
#define UNSAFE_CHAR(c) (   ((unsigned char)(c) <= ' ')  /* up to ASCII 32 */  \
                        || ((unsigned char)(c) >  '~')  /* ASCII 127 and above */  \
                        || strchr (URL_UNSAFE_CHARS, c))
  67
/* If S contains unsafe characters, free it and replace it with a
   version that doesn't.  S is assigned to and passed to free(), so it
   must be a modifiable pointer to storage that may be freed; the
   replacement is the string returned by encode_string().  The
   do/while (0) wrapper lets the macro be used as a single
   statement.  */
#define URL_CLEANSE(s) do                       \
{                                               \
  if (contains_unsafe (s))                      \
    {                                           \
      char *uc_tmp = encode_string (s);         \
      free (s);                                 \
      (s) = uc_tmp;                             \
    }                                           \
} while (0)
  79
  80 /* Is a directory "."?  */
  81 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  82 /* Is a directory ".."?  */
  83 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  84
#if 0
/* Compiled out: prototype of a helper that is currently disabled.  */
static void path_simplify_with_kludge PARAMS ((char *));
#endif
/* Forward declaration; parse_dir() uses it to find how much of a path
   to examine.  */
static int urlpath_length PARAMS ((const char *));
  89
/* NULL-terminated list of strings to be recognized as protocols (URL
   schemes).  Note that recognized doesn't mean supported -- only HTTP
   and FTP are currently supported (see sup_protos).

   However, a string that does not match anything in the list will be
   considered a relative URL.  Thus it's important that this list has
   anything anyone could think of being legal.

   Entries are compared as case-insensitive prefixes by skip_proto,
   has_proto and findurl.

   There are wild things here.  :-) Take a look at
   <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
   fun.  */
static char *protostrings[] =
{
  "cid:",
  "clsid:",
  "file:",
  "finger:",
  "ftp:",
  "gopher:",
  "hdl:",
  "http:",
  "https:",
  "ilu:",
  "ior:",
  "irc:",
  "java:",
  "javascript:",
  "lifn:",
  "mailto:",
  "mid:",
  "news:",
  "nntp:",
  "path:",
  "prospero:",
  "rlogin:",
  "service:",
  "shttp:",
  "snews:",
  "stanf:",
  "telnet:",
  "tn3270:",
  "wais:",
  "whois++:",
  NULL
};
 135
/* Description of one supported protocol.  */
struct proto
{
  char *name;                   /* URL prefix, including the "//" */
  uerr_t ind;                   /* protocol code reported to callers */
  unsigned short port;          /* port used when the URL names none */
};

/* Similar to protostrings, but for supported protocols only.  The
   table is walked with ARRAY_SIZE, so it has no terminating entry.  */
static struct proto sup_protos[] =
{
  { "http://", URLHTTP, DEFAULT_HTTP_PORT },
  { "ftp://", URLFTP, DEFAULT_FTP_PORT },
  /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
};
 150
/* Forward declarations of file-local helpers.  */
static void parse_dir PARAMS ((const char *, char **, char **));
static uerr_t parse_uname PARAMS ((const char *, char **, char **));
static char *construct PARAMS ((const char *, const char *, int , int));
static char *construct_relative PARAMS ((const char *, const char *));
static char process_ftp_type PARAMS ((char *));
 156
 157 \f
 158 /* Returns the number of characters to be skipped if the first thing
 159    in a URL is URL: (which is 0 or 4+).  The optional spaces after
 160    URL: are also skipped.  */
 161 int
 162 skip_url (const char *url)
 163 {
 164   int i;
 165
 166   if (TOUPPER (url[0]) == 'U'
 167       && TOUPPER (url[1]) == 'R'
 168       && TOUPPER (url[2]) == 'L'
 169       && url[3] == ':')
 170     {
 171       /* Skip blanks.  */
 172       for (i = 4; url[i] && ISSPACE (url[i]); i++);
 173       return i;
 174     }
 175   else
 176     return 0;
 177 }
 178
 179 /* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
 180 int
 181 contains_unsafe (const char *s)
 182 {
 183   for (; *s; s++)
 184     if (UNSAFE_CHAR (*s))
 185       return 1;
 186   return 0;
 187 }
 188
 189 /* Decodes the forms %xy in a URL to the character the hexadecimal
 190    code of which is xy.  xy are hexadecimal digits from
 191    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 192    hex-digits or `%' precedes `\0', the sequence is inserted
 193    literally.  */
 194
 195 static void
 196 decode_string (char *s)
 197 {
 198   char *p = s;
 199
 200   for (; *s; s++, p++)
 201     {
 202       if (*s != '%')
 203         *p = *s;
 204       else
 205         {
 206           /* Do nothing if at the end of the string, or if the chars
 207              are not hex-digits.  */
 208           if (!*(s + 1) || !*(s + 2)
 209               || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
 210             {
 211               *p = *s;
 212               continue;
 213             }
 214           *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
 215           s += 2;
 216         }
 217     }
 218   *p = '\0';
 219 }
 220
 221 /* Encode the unsafe characters (as determined by URL_UNSAFE) in a
 222    given string, returning a malloc-ed %XX encoded string.  */
 223 char *
 224 encode_string (const char *s)
 225 {
 226   const char *b;
 227   char *p, *res;
 228   int i;
 229
 230   b = s;
 231   for (i = 0; *s; s++, i++)
 232     if (UNSAFE_CHAR (*s))
 233       i += 2; /* Two more characters (hex digits) */
 234   res = (char *)xmalloc (i + 1);
 235   s = b;
 236   for (p = res; *s; s++)
 237     if (UNSAFE_CHAR (*s))
 238       {
 239         const unsigned char c = *s;
 240         *p++ = '%';
 241         *p++ = HEXD2ASC (c >> 4);
 242         *p++ = HEXD2ASC (c & 0xf);
 243       }
 244     else
 245       *p++ = *s;
 246   *p = '\0';
 247   return res;
 248 }
 249 \f
 250 /* Returns the proto-type if URL's protocol is supported, or
 251    URLUNKNOWN if not.  */
 252 uerr_t
 253 urlproto (const char *url)
 254 {
 255   int i;
 256
 257   url += skip_url (url);
 258   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 259     if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
 260       return sup_protos[i].ind;
 261   for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
 262   if (url[i] == ':')
 263     {
 264       for (++i; url[i] && url[i] != '/'; i++)
 265         if (!ISDIGIT (url[i]))
 266           return URLBADPORT;
 267       if (url[i - 1] == ':')
 268         return URLFTP;
 269       else
 270         return URLHTTP;
 271     }
 272   else
 273     return URLHTTP;
 274 }
 275
 276 /* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
 277    part is found, returns 0.  */
 278 int
 279 skip_proto (const char *url)
 280 {
 281   char **s;
 282   int l;
 283
 284   for (s = protostrings; *s; s++)
 285     if (!strncasecmp (*s, url, strlen (*s)))
 286       break;
 287   if (!*s)
 288     return 0;
 289   l = strlen (*s);
 290   /* HTTP and FTP protocols are expected to yield exact host names
 291      (i.e. the `//' part must be skipped, too).  */
 292   if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
 293     l += 2;
 294   return l;
 295 }
 296
 297 /* Returns 1 if the URL begins with a protocol (supported or
 298    unsupported), 0 otherwise.  */
 299 static int
 300 has_proto (const char *url)
 301 {
 302   char **s;
 303
 304   url += skip_url (url);
 305   for (s = protostrings; *s; s++)
 306     if (strncasecmp (url, *s, strlen (*s)) == 0)
 307       return 1;
 308   return 0;
 309 }
 310
 311 /* Skip the username and password, if present here.  The function
 312    should be called *not* with the complete URL, but with the part
 313    right after the protocol.
 314
 315    If no username and password are found, return 0.  */
 316 int
 317 skip_uname (const char *url)
 318 {
 319   const char *p;
 320   for (p = url; *p && *p != '/'; p++)
 321     if (*p == '@')
 322       break;
 323   /* If a `@' was found before the first occurrence of `/', skip
 324      it.  */
 325   if (*p == '@')
 326     return p - url + 1;
 327   else
 328     return 0;
 329 }
 330 \f
 331 /* Allocate a new urlinfo structure, fill it with default values and
 332    return a pointer to it.  */
 333 struct urlinfo *
 334 newurl (void)
 335 {
 336   struct urlinfo *u;
 337
 338   u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
 339   memset (u, 0, sizeof (*u));
 340   u->proto = URLUNKNOWN;
 341   return u;
 342 }
 343
 344 /* Perform a "deep" free of the urlinfo structure.  The structure
 345    should have been created with newurl, but need not have been used.
 346    If free_pointer is non-0, free the pointer itself.  */
 347 void
 348 freeurl (struct urlinfo *u, int complete)
 349 {
 350   assert (u != NULL);
 351   FREE_MAYBE (u->url);
 352   FREE_MAYBE (u->host);
 353   FREE_MAYBE (u->path);
 354   FREE_MAYBE (u->file);
 355   FREE_MAYBE (u->dir);
 356   FREE_MAYBE (u->user);
 357   FREE_MAYBE (u->passwd);
 358   FREE_MAYBE (u->local);
 359   FREE_MAYBE (u->referer);
 360   if (u->proxy)
 361     freeurl (u->proxy, 1);
 362   if (complete)
 363     free (u);
 364   return;
 365 }
 366 \f
 367 /* Extract the given URL of the form
 368    (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
 369    1. hostname (terminated with `/' or `:')
 370    2. port number (terminated with `/'), or chosen for the protocol
 371    3. dirname (everything after hostname)
 372    Most errors are handled.  No allocation is done, you must supply
 373    pointers to allocated memory.
 374    ...and a host of other stuff :-)
 375
 376    - Recognizes hostname:dir/file for FTP and
 377      hostname (:portnum)?/dir/file for HTTP.
 378    - Parses the path to yield directory and file
 379    - Parses the URL to yield the username and passwd (if present)
 380    - Decodes the strings, in case they contain "forbidden" characters
 381    - Writes the result to struct urlinfo
 382
 383    If the argument STRICT is set, it recognizes only the canonical
 384    form.  */
 385 uerr_t
 386 parseurl (const char *url, struct urlinfo *u, int strict)
 387 {
 388   int i, l, abs_ftp;
 389   int recognizable;            /* Recognizable URL is the one where
 390                                   the protocol name was explicitly
 391                                   named, i.e. it wasn't deduced from
 392                                   the URL format.  */
 393   uerr_t type;
 394
 395   DEBUGP (("parseurl (\"%s\") -> ", url));
 396   url += skip_url (url);
 397   recognizable = has_proto (url);
 398   if (strict && !recognizable)
 399     return URLUNKNOWN;
 400   for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
 401     {
 402       l = strlen (sup_protos[i].name);
 403       if (!strncasecmp (sup_protos[i].name, url, l))
 404         break;
 405     }
 406   /* If protocol is recognizable, but unsupported, bail out, else
 407      suppose unknown.  */
 408   if (recognizable && i == ARRAY_SIZE (sup_protos))
 409     return URLUNKNOWN;
 410   else if (i == ARRAY_SIZE (sup_protos))
 411     type = URLUNKNOWN;
 412   else
 413     u->proto = type = sup_protos[i].ind;
 414
 415   if (type == URLUNKNOWN)
 416     l = 0;
 417   /* Allow a username and password to be specified (i.e. just skip
 418      them for now).  */
 419   if (recognizable)
 420     l += skip_uname (url + l);
 421   for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
 422   if (i == l)
 423     return URLBADHOST;
 424   /* Get the hostname.  */
 425   u->host = strdupdelim (url + l, url + i);
 426   DEBUGP (("host %s -> ", u->host));
 427
 428   /* Assume no port has been given.  */
 429   u->port = 0;
 430   if (url[i] == ':')
 431     {
 432       /* We have a colon delimiting the hostname.  It could mean that
 433          a port number is following it, or a directory.  */
 434       if (ISDIGIT (url[++i]))    /* A port number */
 435         {
 436           if (type == URLUNKNOWN)
 437             u->proto = type = URLHTTP;
 438           for (; url[i] && url[i] != '/'; i++)
 439             if (ISDIGIT (url[i]))
 440               u->port = 10 * u->port + (url[i] - '0');
 441             else
 442               return URLBADPORT;
 443           if (!u->port)
 444             return URLBADPORT;
 445           DEBUGP (("port %hu -> ", u->port));
 446         }
 447       else if (type == URLUNKNOWN) /* or a directory */
 448         u->proto = type = URLFTP;
 449       else                      /* or just a misformed port number */
 450         return URLBADPORT;
 451     }
 452   else if (type == URLUNKNOWN)
 453     u->proto = type = URLHTTP;
 454   if (!u->port)
 455     {
 456       int i;
 457       for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 458         if (sup_protos[i].ind == type)
 459           break;
 460       if (i == ARRAY_SIZE (sup_protos))
 461         return URLUNKNOWN;
 462       u->port = sup_protos[i].port;
 463     }
 464   /* Some delimiter troubles...  */
 465   if (url[i] == '/' && url[i - 1] != ':')
 466     ++i;
 467   if (type == URLHTTP)
 468     while (url[i] && url[i] == '/')
 469       ++i;
 470   u->path = (char *)xmalloc (strlen (url + i) + 8);
 471   strcpy (u->path, url + i);
 472   if (type == URLFTP)
 473     {
 474       u->ftp_type = process_ftp_type (u->path);
 475       /* #### We don't handle type `d' correctly yet.  */
 476       if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
 477         u->ftp_type = 'I';
 478     }
 479   DEBUGP (("opath %s -> ", u->path));
 480   /* Parse the username and password (if existing).  */
 481   parse_uname (url, &u->user, &u->passwd);
 482   /* Decode the strings, as per RFC 1738.  */
 483   decode_string (u->host);
 484   decode_string (u->path);
 485   if (u->user)
 486     decode_string (u->user);
 487   if (u->passwd)
 488     decode_string (u->passwd);
 489   /* Parse the directory.  */
 490   parse_dir (u->path, &u->dir, &u->file);
 491   DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
 492   /* Simplify the directory.  */
 493   path_simplify (u->dir);
 494   /* Remove the leading `/' in HTTP.  */
 495   if (type == URLHTTP && *u->dir == '/')
 496     strcpy (u->dir, u->dir + 1);
 497   DEBUGP (("ndir %s\n", u->dir));
 498   /* Strip trailing `/'.  */
 499   l = strlen (u->dir);
 500   if (l && u->dir[l - 1] == '/')
 501     u->dir[l - 1] = '\0';
 502   /* Re-create the path: */
 503   abs_ftp = (u->proto == URLFTP && *u->dir == '/');
 504   /*  sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
 505       abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
 506   strcpy (u->path, abs_ftp ? "%2F" : "/");
 507   strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
 508   strcat (u->path, *u->dir ? "/" : "");
 509   strcat (u->path, u->file);
 510   URL_CLEANSE (u->path);
 511   DEBUGP (("newpath: %s\n", u->path));
 512   /* Create the clean URL.  */
 513   u->url = str_url (u, 0);
 514   return URLOK;
 515 }
 516 \f
 517 /* Special versions of DOTP and DDOTP for parse_dir(). */
 518
 519 #define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
 520 #define PD_DDOTP(x) ((*(x) == '.') && (*(x) == '.')             \
 521                      && (!*((x) + 2) || *((x) + 2) == '?'))
 522
 523 /* Build the directory and filename components of the path.  Both
 524    components are *separately* malloc-ed strings!  It does not change
 525    the contents of path.
 526
 527    If the path ends with "." or "..", they are (correctly) counted as
 528    directories.  */
 529 static void
 530 parse_dir (const char *path, char **dir, char **file)
 531 {
 532   int i, l;
 533
 534   l = urlpath_length (path);
 535   for (i = l; i && path[i] != '/'; i--);
 536
 537   if (!i && *path != '/')   /* Just filename */
 538     {
 539       if (PD_DOTP (path) || PD_DDOTP (path))
 540         {
 541           *dir = strdupdelim (path, path + l);
 542           *file = xstrdup (path + l); /* normally empty, but could
 543                                          contain ?... */
 544         }
 545       else
 546         {
 547           *dir = xstrdup ("");     /* This is required because of FTP */
 548           *file = xstrdup (path);
 549         }
 550     }
 551   else if (!i)                 /* /filename */
 552     {
 553       if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
 554         {
 555           *dir = strdupdelim (path, path + l);
 556           *file = xstrdup (path + l); /* normally empty, but could
 557                                          contain ?... */
 558         }
 559       else
 560         {
 561           *dir = xstrdup ("/");
 562           *file = xstrdup (path + 1);
 563         }
 564     }
 565   else /* Nonempty directory with or without a filename */
 566     {
 567       if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
 568         {
 569           *dir = strdupdelim (path, path + l);
 570           *file = xstrdup (path + l); /* normally empty, but could
 571                                          contain ?... */
 572         }
 573       else
 574         {
 575           *dir = strdupdelim (path, path + i);
 576           *file = xstrdup (path + i + 1);
 577         }
 578     }
 579 }
 580
 581 /* Find the optional username and password within the URL, as per
 582    RFC1738.  The returned user and passwd char pointers are
 583    malloc-ed.  */
 584 static uerr_t
 585 parse_uname (const char *url, char **user, char **passwd)
 586 {
 587   int l;
 588   const char *p, *col;
 589   char **where;
 590
 591   *user = NULL;
 592   *passwd = NULL;
 593   url += skip_url (url);
 594   /* Look for end of protocol string.  */
 595   l = skip_proto (url);
 596   if (!l)
 597     return URLUNKNOWN;
 598   /* Add protocol offset.  */
 599   url += l;
 600   /* Is there an `@' character?  */
 601   for (p = url; *p && *p != '/'; p++)
 602     if (*p == '@')
 603       break;
 604   /* If not, return.  */
 605   if (*p != '@')
 606     return URLOK;
 607   /* Else find the username and password.  */
 608   for (p = col = url; *p != '@'; p++)
 609     {
 610       if (*p == ':' && !*user)
 611         {
 612           *user = (char *)xmalloc (p - url + 1);
 613           memcpy (*user, url, p - url);
 614           (*user)[p - url] = '\0';
 615           col = p + 1;
 616         }
 617     }
 618   /* Decide whether you have only the username or both.  */
 619   where = *user ? passwd : user;
 620   *where = (char *)xmalloc (p - col + 1);
 621   memcpy (*where, col, p - col);
 622   (*where)[p - col] = '\0';
 623   return URLOK;
 624 }
 625
 626 /* If PATH ends with `;type=X', return the character X.  */
 627 static char
 628 process_ftp_type (char *path)
 629 {
 630   int len = strlen (path);
 631
 632   if (len >= 7
 633       && !memcmp (path + len - 7, ";type=", 6))
 634     {
 635       path[len - 7] = '\0';
 636       return path[len - 1];
 637     }
 638   else
 639     return '\0';
 640 }
 641 \f
 642 /* Return the URL as fine-formed string, with a proper protocol,
 643    optional port number, directory and optional user/password.  If
 644    HIDE is non-zero, password will be hidden.  The forbidden
 645    characters in the URL will be cleansed.  */
 646 char *
 647 str_url (const struct urlinfo *u, int hide)
 648 {
 649   char *res, *host, *user, *passwd, *proto_name, *dir, *file;
 650   int i, l, ln, lu, lh, lp, lf, ld;
 651   unsigned short proto_default_port;
 652
 653   /* Look for the protocol name.  */
 654   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 655     if (sup_protos[i].ind == u->proto)
 656       break;
 657   if (i == ARRAY_SIZE (sup_protos))
 658     return NULL;
 659   proto_name = sup_protos[i].name;
 660   proto_default_port = sup_protos[i].port;
 661   host = CLEANDUP (u->host);
 662   dir = CLEANDUP (u->dir);
 663   file = CLEANDUP (u->file);
 664   user = passwd = NULL;
 665   if (u->user)
 666     user = CLEANDUP (u->user);
 667   if (u->passwd)
 668     {
 669       int i;
 670       passwd = CLEANDUP (u->passwd);
 671       if (hide)
 672         for (i = 0; passwd[i]; i++)
 673           passwd[i] = 'x';
 674     }
 675   if (u->proto == URLFTP && *dir == '/')
 676     {
 677       char *tmp = (char *)xmalloc (strlen (dir) + 3);
 678       /*sprintf (tmp, "%%2F%s", dir + 1);*/
 679       tmp[0] = '%';
 680       tmp[1] = '2';
 681       tmp[2] = 'F';
 682       strcpy (tmp + 3, dir + 1);
 683       free (dir);
 684       dir = tmp;
 685     }
 686
 687   ln = strlen (proto_name);
 688   lu = user ? strlen (user) : 0;
 689   lp = passwd ? strlen (passwd) : 0;
 690   lh = strlen (host);
 691   ld = strlen (dir);
 692   lf = strlen (file);
 693   res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
 694   /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
 695      (user ? user : ""), (passwd ? ":" : ""),
 696      (passwd ? passwd : ""), (user ? "@" : ""),
 697      host, u->port, dir, *dir ? "/" : "", file); */
 698   l = 0;
 699   memcpy (res, proto_name, ln);
 700   l += ln;
 701   if (user)
 702     {
 703       memcpy (res + l, user, lu);
 704       l += lu;
 705       if (passwd)
 706         {
 707           res[l++] = ':';
 708           memcpy (res + l, passwd, lp);
 709           l += lp;
 710         }
 711       res[l++] = '@';
 712     }
 713   memcpy (res + l, host, lh);
 714   l += lh;
 715   if (u->port != proto_default_port)
 716     {
 717       res[l++] = ':';
 718       long_to_string (res + l, (long)u->port);
 719       l += numdigit (u->port);
 720     }
 721   res[l++] = '/';
 722   memcpy (res + l, dir, ld);
 723   l += ld;
 724   if (*dir)
 725     res[l++] = '/';
 726   strcpy (res + l, file);
 727   free (host);
 728   free (dir);
 729   free (file);
 730   FREE_MAYBE (user);
 731   FREE_MAYBE (passwd);
 732   return res;
 733 }
 734
 735 /* Check whether two URL-s are equivalent, i.e. pointing to the same
 736    location.  Uses parseurl to parse them, and compares the canonical
 737    forms.
 738
 739    Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
 740    return 0 on error.  */
 741 int
 742 url_equal (const char *url1, const char *url2)
 743 {
 744   struct urlinfo *u1, *u2;
 745   uerr_t err;
 746   int res;
 747
 748   u1 = newurl ();
 749   err = parseurl (url1, u1, 0);
 750   if (err != URLOK)
 751     {
 752       freeurl (u1, 1);
 753       return 0;
 754     }
 755   u2 = newurl ();
 756   err = parseurl (url2, u2, 0);
 757   if (err != URLOK)
 758     {
 759       freeurl (u2, 1);
 760       return 0;
 761     }
 762   res = !strcmp (u1->url, u2->url);
 763   freeurl (u1, 1);
 764   freeurl (u2, 1);
 765   return res;
 766 }
 767 \f
 768 /* Find URL of format scheme:hostname[:port]/dir in a buffer.  The
 769    buffer may contain pretty much anything; no errors are signaled.  */
 770 static const char *
 771 findurl (const char *buf, int howmuch, int *count)
 772 {
 773   char **prot;
 774   const char *s1, *s2;
 775
 776   for (s1 = buf; howmuch; s1++, howmuch--)
 777     for (prot = protostrings; *prot; prot++)
 778       if (howmuch <= strlen (*prot))
 779         continue;
 780       else if (!strncasecmp (*prot, s1, strlen (*prot)))
 781         {
 782           for (s2 = s1, *count = 0;
 783                howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
 784                  !strchr (URL_SEPARATOR, *s2);
 785                s2++, (*count)++, howmuch--);
 786           return s1;
 787         }
 788   return NULL;
 789 }
 790
 791 /* Scans the file for signs of URL-s.  Returns a vector of pointers,
 792    each pointer representing a URL string.  The file is *not* assumed
 793    to be HTML.  */
 794 urlpos *
 795 get_urls_file (const char *file)
 796 {
 797   long nread;
 798   FILE *fp;
 799   char *buf;
 800   const char *pbuf;
 801   int size;
 802   urlpos *first, *current, *old;
 803
 804   if (file && !HYPHENP (file))
 805     {
 806       fp = fopen (file, "rb");
 807       if (!fp)
 808         {
 809           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 810           return NULL;
 811         }
 812     }
 813   else
 814     fp = stdin;
 815   /* Load the file.  */
 816   load_file (fp, &buf, &nread);
 817   if (file && !HYPHENP (file))
 818     fclose (fp);
 819   DEBUGP (("Loaded %s (size %ld).\n", file, nread));
 820   first = current = NULL;
 821   /* Fill the linked list with URLs.  */
 822   for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
 823        pbuf += size)
 824     {
 825       /* Allocate the space.  */
 826       old = current;
 827       current = (urlpos *)xmalloc (sizeof (urlpos));
 828       if (old)
 829         old->next = current;
 830       memset (current, 0, sizeof (*current));
 831       current->next = NULL;
 832       current->url = (char *)xmalloc (size + 1);
 833       memcpy (current->url, pbuf, size);
 834       current->url[size] = '\0';
 835       if (!first)
 836         first = current;
 837     }
 838   /* Free the buffer.  */
 839   free (buf);
 840
 841   return first;
 842 }
 843
 844 /* Similar to get_urls_file, but for HTML files.  FILE is scanned as
 845    an HTML document using htmlfindurl(), which see.  get_urls_html()
 846    constructs the HTML-s from the relative href-s.
 847
 848    If SILENT is non-zero, do not barf on baseless relative links.  */
 849 urlpos *
 850 get_urls_html (const char *file, const char *this_url, int silent,
 851                int dash_p_leaf_HTML)
 852 {
 853   long nread;
 854   FILE *fp;
 855   char *orig_buf;
 856   const char *buf;
 857   int step, first_time;
 858   urlpos *first, *current, *old;
 859
 860   if (file && !HYPHENP (file))
 861     {
 862       fp = fopen (file, "rb");
 863       if (!fp)
 864         {
 865           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 866           return NULL;
 867         }
 868     }
 869   else
 870     fp = stdin;
 871   /* Load the file.  */
 872   load_file (fp, &orig_buf, &nread);
 873   if (file && !HYPHENP (file))
 874     fclose (fp);
 875   DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
 876   first = current = NULL;
 877   first_time = 1;
 878   /* Iterate over the URLs in BUF, picked by htmlfindurl().  */
 879   for (buf = orig_buf;
 880        (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time,
 881                            dash_p_leaf_HTML));
 882        buf += step)
 883     {
 884       int i, no_proto;
 885       int size = step;
 886       const char *pbuf = buf;
 887       char *constr, *base;
 888       const char *cbase;
 889       char *needs_freeing, *url_data;
 890
 891       first_time = 0;
 892
 893       /* A frequent phenomenon that needs to be handled are pages
 894          generated by brain-damaged HTML generators, which refer to to
 895          URI-s as <a href="<spaces>URI<spaces>">.  We simply ignore
 896          any spaces at the beginning or at the end of the string.
 897          This is probably not strictly correct, but that's what the
 898          browsers do, so we may follow.  May the authors of "WYSIWYG"
 899          HTML tools burn in hell for the damage they've inflicted!  */
 900       while ((pbuf < buf + step) && ISSPACE (*pbuf))
 901         {
 902           ++pbuf;
 903           --size;
 904         }
 905       while (size && ISSPACE (pbuf[size - 1]))
 906         --size;
 907       if (!size)
 908         break;
 909
 910       /* It would be nice if we could avoid allocating memory in this
 911          loop, but I don't see an easy way.  To process the entities,
 912          we need to either copy the data, or change it destructively.
 913          I choose the former.
 914
 915          We have two pointers: needs_freeing and url_data, because the
 916          code below does thing like url_data += <something>, and we
 917          want to pass the original string to free(). */
 918       needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
 919       size = strlen (url_data);
 920
 921       for (i = 0; protostrings[i]; i++)
 922         {
 923           if (!strncasecmp (protostrings[i], url_data,
 924                             MINVAL (strlen (protostrings[i]), size)))
 925             break;
 926         }
 927       /* Check for http:RELATIVE_URI.  See below for details.  */
 928       if (protostrings[i]
 929           && !(strncasecmp (url_data, "http:", 5) == 0
 930                && strncasecmp (url_data, "http://", 7) != 0))
 931         {
 932           no_proto = 0;
 933         }
 934       else
 935         {
 936           no_proto = 1;
 937           /* This is for extremely brain-damaged pages that refer to
 938              relative URI-s as <a href="http:URL">.  Just strip off the
 939              silly leading "http:" (as well as any leading blanks
 940              before it).  */
 941           if ((size > 5) && !strncasecmp ("http:", url_data, 5))
 942             url_data += 5, size -= 5;
 943         }
 944       if (!no_proto)
 945         {
 946           for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 947             {
 948               if (!strncasecmp (sup_protos[i].name, url_data,
 949                                MINVAL (strlen (sup_protos[i].name), size)))
 950                 break;
 951             }
 952           /* Do *not* accept a non-supported protocol.  */
 953           if (i == ARRAY_SIZE (sup_protos))
 954             {
 955               free (needs_freeing);
 956               continue;
 957             }
 958         }
 959       if (no_proto)
 960         {
 961           /* First, construct the base, which can be relative itself.
 962
 963              Criteria for creating the base are:
 964              1) html_base created by <base href="...">
 965              2) current URL
 966              3) base provided from the command line */
 967           cbase = html_base ();
 968           if (!cbase)
 969             cbase = this_url;
 970           if (!cbase)
 971             cbase = opt.base_href;
 972           if (!cbase)             /* Error condition -- a baseless
 973                                      relative link.  */
 974             {
 975               if (!opt.quiet && !silent)
 976                 {
 977                   /* Use malloc, not alloca because this is called in
 978                      a loop. */
 979                   char *temp = (char *)malloc (size + 1);
 980                   strncpy (temp, url_data, size);
 981                   temp[size] = '\0';
 982                   logprintf (LOG_NOTQUIET,
 983                              _("Error (%s): Link %s without a base provided.\n"),
 984                              file, temp);
 985                   free (temp);
 986                 }
 987               free (needs_freeing);
 988               continue;
 989             }
 990           if (this_url)
 991             base = construct (this_url, cbase, strlen (cbase),
 992                               !has_proto (cbase));
 993           else
 994             {
 995               /* Base must now be absolute, with host name and
 996                  protocol.  */
 997               if (!has_proto (cbase))
 998                 {
 999                   logprintf (LOG_NOTQUIET, _("\
1000 Error (%s): Base %s relative, without referer URL.\n"),
1001                              file, cbase);
1002                   free (needs_freeing);
1003                   continue;
1004                 }
1005               base = xstrdup (cbase);
1006             }
1007           constr = construct (base, url_data, size, no_proto);
1008           free (base);
1009         }
1010       else /* has proto */
1011         {
1012           constr = (char *)xmalloc (size + 1);
1013           strncpy (constr, url_data, size);
1014           constr[size] = '\0';
1015         }
1016 #ifdef DEBUG
1017       if (opt.debug)
1018         {
1019           char *tmp;
1020           const char *tmp2;
1021
1022           tmp2 = html_base ();
1023           /* Use malloc, not alloca because this is called in a loop. */
1024           tmp = (char *)xmalloc (size + 1);
1025           strncpy (tmp, url_data, size);
1026           tmp[size] = '\0';
1027           logprintf (LOG_ALWAYS,
1028                      "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
1029                      file, this_url ? this_url : "(null)",
1030                      tmp2 ? tmp2 : "(null)", tmp, constr);
1031           free (tmp);
1032         }
1033 #endif
1034
1035       /* Allocate the space.  */
1036       old = current;
1037       current = (urlpos *)xmalloc (sizeof (urlpos));
1038       if (old)
1039         old->next = current;
1040       if (!first)
1041         first = current;
1042       /* Fill the values.  */
1043       memset (current, 0, sizeof (*current));
1044       current->next = NULL;
1045       current->url = constr;
1046       current->size = step;
1047       current->pos = buf - orig_buf;
1048       /* A URL is relative if the host and protocol are not named,
1049          and the name does not start with `/'.  */
1050       if (no_proto && *url_data != '/')
1051         current->flags |= (URELATIVE | UNOPROTO);
1052       else if (no_proto)
1053         current->flags |= UNOPROTO;
1054       free (needs_freeing);
1055     }
1056   free (orig_buf);
1057
1058   return first;
1059 }
1060 \f
1061 /* Free the linked list of urlpos.  */
1062 void
1063 free_urlpos (urlpos *l)
1064 {
1065   while (l)
1066     {
1067       urlpos *next = l->next;
1068       free (l->url);
1069       FREE_MAYBE (l->local_name);
1070       free (l);
1071       l = next;
1072     }
1073 }
1074
1075 /* Rotate FNAME opt.backups times */
1076 void
1077 rotate_backups(const char *fname)
1078 {
1079   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1080   char *from = (char *)alloca (maxlen);
1081   char *to = (char *)alloca (maxlen);
1082   struct stat sb;
1083   int i;
1084
1085   if (stat (fname, &sb) == 0)
1086     if (S_ISREG (sb.st_mode) == 0)
1087       return;
1088
1089   for (i = opt.backups; i > 1; i--)
1090     {
1091       sprintf (from, "%s.%d", fname, i - 1);
1092       sprintf (to, "%s.%d", fname, i);
1093       /* #### This will fail on machines without the rename() system
1094          call.  */
1095       rename (from, to);
1096     }
1097
1098   sprintf (to, "%s.%d", fname, 1);
1099   rename(fname, to);
1100 }
1101
1102 /* Create all the necessary directories for PATH (a file).  Calls
1103    mkdirhier() internally.  */
1104 int
1105 mkalldirs (const char *path)
1106 {
1107   const char *p;
1108   char *t;
1109   struct stat st;
1110   int res;
1111
1112   p = path + strlen (path);
1113   for (; *p != '/' && p != path; p--);
1114   /* Don't create if it's just a file.  */
1115   if ((p == path) && (*p != '/'))
1116     return 0;
1117   t = strdupdelim (path, p);
1118   /* Check whether the directory exists.  */
1119   if ((stat (t, &st) == 0))
1120     {
1121       if (S_ISDIR (st.st_mode))
1122         {
1123           free (t);
1124           return 0;
1125         }
1126       else
1127         {
1128           /* If the dir exists as a file name, remove it first.  This
1129              is *only* for Wget to work with buggy old CERN http
1130              servers.  Here is the scenario: When Wget tries to
1131              retrieve a directory without a slash, e.g.
1132              http://foo/bar (bar being a directory), CERN server will
1133              not redirect it too http://foo/bar/ -- it will generate a
1134              directory listing containing links to bar/file1,
1135              bar/file2, etc.  Wget will lose because it saves this
1136              HTML listing to a file `bar', so it cannot create the
1137              directory.  To work around this, if the file of the same
1138              name exists, we just remove it and create the directory
1139              anyway.  */
1140           DEBUGP (("Removing %s because of directory danger!\n", t));
1141           unlink (t);
1142         }
1143     }
1144   res = make_directory (t);
1145   if (res != 0)
1146     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1147   free (t);
1148   return res;
1149 }
1150
/* Return the number of '/' characters in the NUL-terminated string S.  */
static int
count_slashes (const char *s)
{
  int total = 0;
  const char *p;

  for (p = s; *p != '\0'; p++)
    {
      if (*p == '/')
        total++;
    }
  return total;
}
1160
1161 /* Return the path name of the URL-equivalent file name, with a
1162    remote-like structure of directories.  */
1163 static char *
1164 mkstruct (const struct urlinfo *u)
1165 {
1166   char *host, *dir, *file, *res, *dirpref;
1167   int l;
1168
1169   assert (u->dir != NULL);
1170   assert (u->host != NULL);
1171
1172   if (opt.cut_dirs)
1173     {
1174       char *ptr = u->dir + (*u->dir == '/');
1175       int slash_count = 1 + count_slashes (ptr);
1176       int cut = MINVAL (opt.cut_dirs, slash_count);
1177       for (; cut && *ptr; ptr++)
1178         if (*ptr == '/')
1179           --cut;
1180       STRDUP_ALLOCA (dir, ptr);
1181     }
1182   else
1183     dir = u->dir + (*u->dir == '/');
1184
1185   host = xstrdup (u->host);
1186   /* Check for the true name (or at least a consistent name for saving
1187      to directory) of HOST, reusing the hlist if possible.  */
1188   if (opt.add_hostdir && !opt.simple_check)
1189     {
1190       char *nhost = realhost (host);
1191       free (host);
1192       host = nhost;
1193     }
1194   /* Add dir_prefix and hostname (if required) to the beginning of
1195      dir.  */
1196   if (opt.add_hostdir)
1197     {
1198       if (!DOTP (opt.dir_prefix))
1199         {
1200           dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1201                                     + strlen (host) + 1);
1202           sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
1203         }
1204       else
1205         STRDUP_ALLOCA (dirpref, host);
1206     }
1207   else                         /* not add_hostdir */
1208     {
1209       if (!DOTP (opt.dir_prefix))
1210         dirpref = opt.dir_prefix;
1211       else
1212         dirpref = "";
1213     }
1214   free (host);
1215
1216   /* If there is a prefix, prepend it.  */
1217   if (*dirpref)
1218     {
1219       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1220       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1221       dir = newdir;
1222     }
1223   dir = xstrdup (dir);
1224   URL_CLEANSE (dir);
1225   l = strlen (dir);
1226   if (l && dir[l - 1] == '/')
1227     dir[l - 1] = '\0';
1228
1229   if (!*u->file)
1230     file = "index.html";
1231   else
1232     file = u->file;
1233
1234   /* Finally, construct the full name.  */
1235   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1236   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1237   free (dir);
1238   return res;
1239 }
1240
1241 /* Create a unique filename, corresponding to a given URL.  Calls
1242    mkstruct if necessary.  Does *not* actually create any directories.  */
1243 char *
1244 url_filename (const struct urlinfo *u)
1245 {
1246   char *file, *name;
1247   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1248
1249   if (opt.dirstruct)
1250     {
1251       file = mkstruct (u);
1252       have_prefix = 1;
1253     }
1254   else
1255     {
1256       if (!*u->file)
1257         file = xstrdup ("index.html");
1258       else
1259         file = xstrdup (u->file);
1260     }
1261
1262   if (!have_prefix)
1263     {
1264       /* Check whether the prefix directory is something other than "."
1265          before prepending it.  */
1266       if (!DOTP (opt.dir_prefix))
1267         {
1268           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1269                                          + 1 + strlen (file) + 1);
1270           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1271           free (file);
1272           file = nfile;
1273         }
1274     }
1275   /* DOS-ish file systems don't like `%' signs in them; we change it
1276      to `@'.  */
1277 #ifdef WINDOWS
1278   {
1279     char *p = file;
1280     for (p = file; *p; p++)
1281       if (*p == '%')
1282         *p = '@';
1283   }
1284 #endif /* WINDOWS */
1285
1286   /* Check the cases in which the unique extensions are not used:
1287      1) Clobbering is turned off (-nc).
1288      2) Retrieval with regetting.
1289      3) Timestamping is used.
1290      4) Hierarchy is built.
1291
1292      The exception is the case when file does exist and is a
1293      directory (actually support for bad httpd-s).  */
1294   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1295       && !(file_exists_p (file) && !file_non_directory_p (file)))
1296     return file;
1297
1298   /* Find a unique name.  */
1299   name = unique_name (file);
1300   free (file);
1301   return name;
1302 }
1303
/* Length of URL up to, but not including, its first '?'.  When URL
   contains no '?', this is the same as strlen (URL).  */
static int
urlpath_length (const char *url)
{
  int len = 0;

  while (url[len] != '\0' && url[len] != '?')
    ++len;
  return len;
}
1313
/* Scan backwards from E towards B for the character C and return a
   pointer to the first occurrence met, or NULL when there is none.
   The character at E itself is examined; the character at B is not.  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  const char *p = e;

  while (p > b)
    {
      if (*p == c)
        return p;
      --p;
    }
  return NULL;
}
1322
/* Resolve SUB (SUBSIZE bytes long) against the base URL and return the
   result as a freshly allocated, NUL-terminated string.

   When NO_PROTO is zero, SUB is taken as it stands and simply copied.
   Otherwise anything in URL from its first '?' on is disregarded and:

   - A SUB that does not begin with '/' replaces whatever follows the
     last slash of URL (possibly nothing).  "whatever/foo/bar" plus
     "qux/xyzzy" gives "whatever/foo/qux/xyzzy".  If find_last_char()
     reports no slash, SUB is appended behind a separating slash:
     "foo" plus "qux/xyzzy" gives "foo/qux/xyzzy", not "fooqux/xyzzy".

   - A SUB that begins with '/' replaces the path of URL.
     "http://host/whatever/foo/bar" plus "/qux/xyzzy" gives
     "http://host/qux/xyzzy".  A URL without "//" is dropped
     altogether, so that the result is just SUB.  */
static char *
construct (const char *url, const char *sub, int subsize, int no_proto)
{
  const char *path_end;         /* end of URL, or its '?' */
  const char *keep_end;         /* one past the last byte of URL kept */
  int add_slash = 0;            /* insert '/' between URL and SUB? */
  int kept;
  char *result, *out;

  if (!no_proto)
    return strdupdelim (sub, sub + subsize);

  path_end = url + urlpath_length (url);

  if (*sub != '/')
    {
      /* Relative SUB: keep URL up to and including its last slash.  */
      const char *last = find_last_char (url, path_end, '/');
      if (last)
        {
          /* example: "whatever/foo/bar" */
          /*                        ^    */
          keep_end = last + 1;
        }
      else
        {
          /* No slash: keep the entire path and put a slash after it.
             example: "foo?..." becomes "foo/" followed by SUB.  */
          keep_end = path_end;
          add_slash = 1;
        }
    }
  else
    {
      /* Absolute path: look for the first slash of URL.  Only if it
         is the start of a "//" does any of URL survive, namely
         everything up to the next slash, or up to the end of the
         path when no further slash exists.  Keep in mind that SUB
         brings its own leading '/'.  */
      const char *first = memchr (url, '/', path_end - url);
      if (first && *(first + 1) == '/')
        {
          const char *after = first + 2;
          const char *path = memchr (after, '/', path_end - after);
          /* example: "http://something/"   or   "http://foo" */
          /*                           ^                    ^ */
          keep_end = path ? path : path_end;
        }
      else
        {
          /* example: "foo" or "foo/bar" -- nothing is kept.  */
          keep_end = url;
        }
    }

  /* Assemble: kept part of URL, optional slash, SUB, NUL.  */
  kept = keep_end - url;
  result = (char *)xmalloc (kept + add_slash + subsize + 1);
  out = result;
  if (kept)
    {
      memcpy (out, url, kept);
      out += kept;
    }
  if (add_slash)
    *out++ = '/';
  if (subsize)
    {
      memcpy (out, sub, subsize);
      out += subsize;
    }
  *out = '\0';
  return result;
}
1446
/* Front end to construct() for NUL-terminated strings: merge NEW_URL
   with BASE_URL.  NEW_URL is handled as protocol-less exactly when
   has_proto() reports no protocol in it.  */
char *
url_concat (const char *base_url, const char *new_url)
{
  int lacks_proto = !has_proto (new_url);
  int new_len = strlen (new_url);

  return construct (base_url, new_url, new_len, lacks_proto);
}
1453 \f
1454 /* Optimize URL by host, destructively replacing u->host with realhost
1455    (u->host).  Do this regardless of opt.simple_check.  */
1456 void
1457 opt_url (struct urlinfo *u)
1458 {
1459   /* Find the "true" host.  */
1460   char *host = realhost (u->host);
1461   free (u->host);
1462   u->host = host;
1463   assert (u->dir != NULL);      /* the URL must have been parsed */
1464   /* Refresh the printed representation.  */
1465   free (u->url);
1466   u->url = str_url (u, 0);
1467 }
1468
/* This beautiful kludge is fortunately not needed, as I've made
   parse_dir do the (almost) right thing, so that a query can never
   become a part of directory.  The code below is compiled out.  */
#if 0
/* Call path_simplify, but make sure that the part after the
   question-mark, if any, is not destroyed by path_simplify's
   "optimizations".

   PATH is modified in place: it is cut at the first '?', handed to
   path_simplify, and the query (from the '?' on) is then re-attached
   right behind the simplified path.  */
void
path_simplify_with_kludge (char *path)
{
  char *query = strchr (path, '?');
  if (query)
    /* path_simplify also works destructively, so we also have the
       license to write. */
    *query = '\0';
  path_simplify (path);
  if (query)
    {
      /* NEWEND is where the simplified path now stops.  */
      char *newend = path + strlen (path);
      /* Restore the '?' that was overwritten above.  */
      *query = '?';
      /* If the path changed length, slide the query (including its
         terminating NUL) to sit directly behind it.  memmove is used
         because the two regions may overlap.  */
      if (newend != query)
        memmove (newend, query, strlen (query) + 1);
    }
}
#endif
1494 \f
1495 /* Returns proxy host address, in accordance with PROTO.  */
1496 char *
1497 getproxy (uerr_t proto)
1498 {
1499   if (proto == URLHTTP)
1500     return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1501   else if (proto == URLFTP)
1502     return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1503   else
1504     return NULL;
1505 }
1506
/* Should HOST be accessed through the proxy, given the NO_PROXY list?
   Returns 1 when NO_PROXY is NULL; otherwise returns the logical
   negation of sufmatch (NO_PROXY, HOST).  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  return no_proxy == NULL || !sufmatch (no_proxy, host);
}
1516 \f
/* Change the links in an HTML document.  Accepts a structure that
   defines the positions of all the links.

   FILE is loaded into a buffer, optionally renamed to a backup name
   (when opt.backup_converted is set and downloaded_file() knows the
   file), and then written anew under its own name.  The list L is
   walked in order; each entry gives a position (pos) and a length
   (size) within the buffer.  Entries flagged UABS2REL and not
   URELATIVE have their text replaced by the result of
   construct_relative (FILE, local_name); everything else is copied
   through unchanged.  Failures to open FILE are logged and end the
   function.  */
void
convert_links (const char *file, urlpos *l)
{
  FILE               *fp;
  char               *buf, *p, *p2;
  downloaded_file_t  downloaded_file_return;
  long               size;

  logprintf (LOG_VERBOSE, _("Converting %s... "), file);
  /* Read from the file....  */
  fp = fopen (file, "rb");
  if (!fp)
    {
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      return;
    }
  /* ...to a buffer.  */
  load_file (fp, &buf, &size);
  fclose (fp);

  /* Look the file up without registering it; a zero result makes the
     backup step below a no-op.  */
  downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file);

  if (opt.backup_converted && downloaded_file_return)
    /* Rather than just writing over the original .html file with the converted
       version, save the former to *.orig.  Note we only do this for files we've
       _successfully_ downloaded, so we don't clobber .orig files sitting around
       from previous invocations. */
    {
      /* Construct the backup filename as the original name plus ".orig". */
      size_t         filename_len = strlen(file);
      char*          filename_plus_orig_suffix;
      boolean        already_wrote_backup_file = FALSE;
      slist*         converted_file_ptr;
      /* Persists across calls: names of the files already backed up.  */
      static slist*  converted_files = NULL;

      if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
        {
          /* Just write "orig" over "html".  We need to do it this way because
             when we're checking to see if we've downloaded the file before (to
             see if we can skip downloading it), we don't know if it's a
             text/html file.  Therefore we don't know yet at that stage that -E
             is going to cause us to tack on ".html", so we need to compare
             vs. the original URL plus ".orig", not the original URL plus
             ".html.orig". */
          /* NOTE(review): overwriting the last four characters relies on
             FILE being at least four characters long -- presumably
             guaranteed by the appended ".html"; confirm with the caller.  */
          filename_plus_orig_suffix = xmalloc(filename_len + 1);
          strcpy(filename_plus_orig_suffix, file);
          strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
        }
      else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
        {
          /* Append ".orig" to the name.  sizeof(".orig") accounts for
             the terminating NUL as well. */
          filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig"));
          strcpy(filename_plus_orig_suffix, file);
          strcpy(filename_plus_orig_suffix + filename_len, ".orig");
        }

      /* We can get called twice on the same URL thanks to the
         convert_all_links() call in main().  If we write the .orig file each
         time in such a case, it'll end up containing the first-pass conversion,
         not the original file.  So, see if we've already been called on this
         file. */
      converted_file_ptr = converted_files;
      while (converted_file_ptr != NULL)
        if (strcmp(converted_file_ptr->string, file) == 0)
          {
            already_wrote_backup_file = TRUE;
            break;
          }
        else
          converted_file_ptr = converted_file_ptr->next;

      if (!already_wrote_backup_file)
        {
          /* Rename <file> to <file>.orig before former gets written over.
             A failure is only logged; the file is still recorded below
             and rewritten afterwards. */
          if (rename(file, filename_plus_orig_suffix) != 0)
            logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
                       file, filename_plus_orig_suffix, strerror (errno));

          /* Remember that we've already written a .orig backup for this file.
             Note that we never free this memory since we need it till the
             convert_all_links() call, which is one of the last things the
             program does before terminating.  BTW, I'm not sure if it would be
             safe to just set 'converted_file_ptr->string' to 'file' below,
             rather than making a copy of the string...  Another note is that I
             thought I could just add a field to the urlpos structure saying
             that we'd written a .orig file for this URL, but that didn't work,
             so I had to make this separate list. */
          converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
          converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
          converted_file_ptr->next = converted_files;
          converted_files = converted_file_ptr;
        }

      free(filename_plus_orig_suffix);
    }
  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
  if (!fp)
    {
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      free (buf);
      return;
    }
  /* Presumably we have to loop through multiple URLs here (even though we're
     only talking about a single local file) because of the -O option. */
  /* P tracks how much of the buffer has been dealt with so far.  */
  for (p = buf; l; l = l->next)
    {
      /* A link position outside the loaded data cannot be honored;
         stop converting and just copy out the remainder.  */
      if (l->pos >= size)
        {
          DEBUGP (("Something strange is going on.  Please investigate."));
          break;
        }
      /* If the URL already is relative or it is not to be converted
         for some other reason (e.g. because of not having been
         downloaded in the first place), skip it.  */
      if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
        {
          DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
                   l->pos, l->flags));
          continue;
        }
      /* Else, reach the position of the offending URL, echoing
         everything up to it to the outfile.  */
      for (p2 = buf + l->pos; p < p2; p++)
        putc (*p, fp);
      if (l->flags & UABS2REL)
        /* Convert absolute URL to relative. */
        {
          char *newname = construct_relative (file, l->local_name);
          fprintf (fp, "%s", newname);
          DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
                   l->url, newname, l->pos, file));
          free (newname);
        }
      /* Step over the original link text so it is not copied.  */
      p += l->size;
    }
  /* Output the rest of the file. */
  if (p - buf < size)
    {
      for (p2 = buf + size; p < p2; p++)
        putc (*p, fp);
    }
  fclose (fp);
  free (buf);
  logputs (LOG_VERBOSE, _("done.\n"));
}
1667
/* Return a malloced relative link that leads from the file with local
   name S1 to the file with local name S2.

   Examples:
     S1 "jagor.srce.hr/index.html", S2 "jagor.srce.hr/images/news.gif"
       gives "images/news.gif";
     S1 "fly.cc.fer.hr/ioccc/index.html", S2 "fly.cc.fer.hr/images/fly.gif"
       gives "../images/fly.gif".

   An S2 that begins with '/' is returned as a plain copy.  Otherwise
   S1 must not begin with '/' either (this is asserted).  S1 should
   not contain things like ".." -- construct_relative
   ("fly/ioccc/../index.html", "fly/images/fly.gif") will fail; running
   something like path_simplify() on S1 first is a workaround.  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int pos = 0;                  /* scan position, same in S1 and S2 */
  int shared = 0;               /* length of the common directories */
  int ups = 0;                  /* number of "../" steps required */
  int n;
  char *result, *out;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');

  /* Skip the directories common to both strings, one path component
     per round.  */
  for (;;)
    {
      while (s1[pos] && s1[pos] == s2[pos] && s1[pos] != '/')
        ++pos;
      if (s1[pos] != '/' || s2[pos] != '/')
        break;
      shared = ++pos;
    }

  /* Each slash left in S1 is one directory level to climb out of.  */
  for (; s1[pos]; pos++)
    {
      if (s1[pos] == '/')
        ++ups;
    }

  /* The result is "../" repeated UPS times, followed by the part of
     S2 that lies beyond the common directories.  */
  result = (char *)xmalloc (3 * ups + strlen (s2 + shared) + 1);
  out = result;
  for (n = 0; n < ups; n++)
    {
      memcpy (out, "../", 3);
      out += 3;
    }
  strcpy (out, s2 + shared);
  return result;
}
1721 \f
1722 /* Add URL to the head of the list L.  */
1723 urlpos *
1724 add_url (urlpos *l, const char *url, const char *file)
1725 {
1726   urlpos *t;
1727
1728   t = (urlpos *)xmalloc (sizeof (urlpos));
1729   memset (t, 0, sizeof (*t));
1730   t->url = xstrdup (url);
1731   t->local_name = xstrdup (file);
1732   t->next = l;
1733   return t;
1734 }
1735
1736
1737 /* Remembers which files have been downloaded.  In the standard case, should be
1738    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1739    download successfully (i.e. not for ones we have failures on or that we skip
1740    due to -N).
1741
1742    When we've downloaded a file and tacked on a ".html" extension due to -E,
1743    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1744    FILE_DOWNLOADED_NORMALLY.
1745
1746    If you just want to check if a file has been previously added without adding
1747    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
1748    with local filenames, not remote URLs. */
1749 downloaded_file_t
1750 downloaded_file (downloaded_file_t  mode, const char*  file)
1751 {
1752   typedef struct _downloaded_file_list
1753   {
1754     char*                          file;
1755     downloaded_file_t              download_type;
1756     struct _downloaded_file_list*  next;
1757   } downloaded_file_list;
1758
1759   boolean                       found_file = FALSE;
1760   static downloaded_file_list*  downloaded_files = NULL;
1761   downloaded_file_list*         rover = downloaded_files;
1762
1763   while (rover != NULL)
1764     if (strcmp(rover->file, file) == 0)
1765       {
1766         found_file = TRUE;
1767         break;
1768       }
1769     else
1770       rover = rover->next;
1771
1772   if (found_file)
1773     return rover->download_type;  /* file had already been downloaded */
1774   else
1775     {
1776       if (mode != CHECK_FOR_FILE)
1777         {
1778           rover = xmalloc(sizeof(*rover));
1779           rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1780           rover->download_type = mode;
1781           rover->next = downloaded_files;
1782           downloaded_files = rover;
1783         }
1784
1785       return FILE_NOT_ALREADY_DOWNLOADED;
1786     }
1787 }