sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <ctype.h>
  30 #include <sys/types.h>
  31 #ifdef HAVE_UNISTD_H
  32 # include <unistd.h>
  33 #endif
  34 #include <errno.h>
  35 #include <assert.h>
  36
  37 #include "wget.h"
  38 #include "utils.h"
  39 #include "url.h"
  40 #include "host.h"
  41
  42 #ifndef errno
  43 extern int errno;
  44 #endif
  45
  46 /* Default port definitions */
  47 #define DEFAULT_HTTP_PORT 80
  48 #define DEFAULT_FTP_PORT 21
  49 #define DEFAULT_HTTPS_PORT 443
  50
  51 /* Table of Unsafe chars.  This is intialized in
  52    init_unsafe_char_table.  */
  53
  54 static char unsafe_char_table[256];
  55
  56 #define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
  57
  58 /* If S contains unsafe characters, free it and replace it with a
  59    version that doesn't.  */
  60 #define URL_CLEANSE(s) do                       \
  61 {                                               \
  62   if (contains_unsafe (s))                      \
  63     {                                           \
  64       char *uc_tmp = encode_string (s);         \
  65       xfree (s);                                \
  66       (s) = uc_tmp;                             \
  67     }                                           \
  68 } while (0)
  69
  70 /* Is a directory "."?  */
  71 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  72 /* Is a directory ".."?  */
  73 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  74
  75 #if 0
  76 static void path_simplify_with_kludge PARAMS ((char *));
  77 #endif
  78 static int urlpath_length PARAMS ((const char *));
  79
  80 /* NULL-terminated list of strings to be recognized as prototypes (URL
  81    schemes).  Note that recognized doesn't mean supported -- only HTTP,
  82    HTTPS and FTP are currently supported .
  83
  84    However, a string that does not match anything in the list will be
  85    considered a relative URL.  Thus it's important that this list has
  86    anything anyone could think of being legal.
  87
  88    There are wild things here.  :-) Take a look at
  89    <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
  90    fun.  */
  91 static char *protostrings[] =
  92 {
  93   "cid:",
  94   "clsid:",
  95   "file:",
  96   "finger:",
  97   "ftp:",
  98   "gopher:",
  99   "hdl:",
 100   "http:",
 101   "https:",
 102   "ilu:",
 103   "ior:",
 104   "irc:",
 105   "java:",
 106   "javascript:",
 107   "lifn:",
 108   "mailto:",
 109   "mid:",
 110   "news:",
 111   "nntp:",
 112   "path:",
 113   "prospero:",
 114   "rlogin:",
 115   "service:",
 116   "shttp:",
 117   "snews:",
 118   "stanf:",
 119   "telnet:",
 120   "tn3270:",
 121   "wais:",
 122   "whois++:",
 123   NULL
 124 };
 125
 126 struct proto
 127 {
 128   char *name;
 129   uerr_t ind;
 130   unsigned short port;
 131 };
 132
 133 /* Similar to former, but for supported protocols: */
 134 static struct proto sup_protos[] =
 135 {
 136   { "http://", URLHTTP, DEFAULT_HTTP_PORT },
 137 #ifdef HAVE_SSL
 138   { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
 139 #endif
 140   { "ftp://", URLFTP, DEFAULT_FTP_PORT },
 141   /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
 142 };
 143
 144 static void parse_dir PARAMS ((const char *, char **, char **));
 145 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
 146 static char *construct PARAMS ((const char *, const char *, int , int));
 147 static char *construct_relative PARAMS ((const char *, const char *));
 148 static char process_ftp_type PARAMS ((char *));
 149
 150 \f
 151 /* Returns the number of characters to be skipped if the first thing
 152    in a URL is URL: (which is 0 or 4+).  The optional spaces after
 153    URL: are also skipped.  */
 154 int
 155 skip_url (const char *url)
 156 {
 157   int i;
 158
 159   if (TOUPPER (url[0]) == 'U'
 160       && TOUPPER (url[1]) == 'R'
 161       && TOUPPER (url[2]) == 'L'
 162       && url[3] == ':')
 163     {
 164       /* Skip blanks.  */
 165       for (i = 4; url[i] && ISSPACE (url[i]); i++);
 166       return i;
 167     }
 168   else
 169     return 0;
 170 }
 171
 172 /* Unsafe chars:
 173    - anything <= 32;
 174    - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
 175    - @ and :, for user/password encoding.
 176    - everything over 127 (but we don't bother with recording those.  */
 177 void
 178 init_unsafe_char_table (void)
 179 {
 180   int i;
 181   for (i = 0; i < 256; i++)
 182     if (i < 32 || i >= 127
 183         || i == ' '
 184         || i == '<'
 185         || i == '>'
 186         || i == '\"'
 187         || i == '#'
 188         || i == '%'
 189         || i == '{'
 190         || i == '}'
 191         || i == '|'
 192         || i == '\\'
 193         || i == '^'
 194         || i == '~'
 195         || i == '['
 196         || i == ']'
 197         || i == '`')
 198       unsafe_char_table[i] = 1;
 199 }
 200
 201 /* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
 202 int
 203 contains_unsafe (const char *s)
 204 {
 205   for (; *s; s++)
 206     if (UNSAFE_CHAR (*s))
 207       return 1;
 208   return 0;
 209 }
 210
 211 /* Decodes the forms %xy in a URL to the character the hexadecimal
 212    code of which is xy.  xy are hexadecimal digits from
 213    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 214    hex-digits or `%' precedes `\0', the sequence is inserted
 215    literally.  */
 216
 217 static void
 218 decode_string (char *s)
 219 {
 220   char *p = s;
 221
 222   for (; *s; s++, p++)
 223     {
 224       if (*s != '%')
 225         *p = *s;
 226       else
 227         {
 228           /* Do nothing if at the end of the string, or if the chars
 229              are not hex-digits.  */
 230           if (!*(s + 1) || !*(s + 2)
 231               || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
 232             {
 233               *p = *s;
 234               continue;
 235             }
 236           *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
 237           s += 2;
 238         }
 239     }
 240   *p = '\0';
 241 }
 242
 243 /* Encode the unsafe characters (as determined by URL_UNSAFE) in a
 244    given string, returning a malloc-ed %XX encoded string.  */
 245 char *
 246 encode_string (const char *s)
 247 {
 248   const char *b;
 249   char *p, *res;
 250   int i;
 251
 252   b = s;
 253   for (i = 0; *s; s++, i++)
 254     if (UNSAFE_CHAR (*s))
 255       i += 2; /* Two more characters (hex digits) */
 256   res = (char *)xmalloc (i + 1);
 257   s = b;
 258   for (p = res; *s; s++)
 259     if (UNSAFE_CHAR (*s))
 260       {
 261         const unsigned char c = *s;
 262         *p++ = '%';
 263         *p++ = HEXD2ASC (c >> 4);
 264         *p++ = HEXD2ASC (c & 0xf);
 265       }
 266     else
 267       *p++ = *s;
 268   *p = '\0';
 269   return res;
 270 }
 271 \f
 272 /* Returns the proto-type if URL's protocol is supported, or
 273    URLUNKNOWN if not.  */
 274 uerr_t
 275 urlproto (const char *url)
 276 {
 277   int i;
 278
 279   url += skip_url (url);
 280   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 281     if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
 282       return sup_protos[i].ind;
 283   for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
 284   if (url[i] == ':')
 285     {
 286       for (++i; url[i] && url[i] != '/'; i++)
 287         if (!ISDIGIT (url[i]))
 288           return URLBADPORT;
 289       if (url[i - 1] == ':')
 290         return URLFTP;
 291       else
 292         return URLHTTP;
 293     }
 294   else
 295     return URLHTTP;
 296 }
 297
 298 /* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
 299    part is found, returns 0.  */
 300 int
 301 skip_proto (const char *url)
 302 {
 303   char **s;
 304   int l;
 305
 306   for (s = protostrings; *s; s++)
 307     if (!strncasecmp (*s, url, strlen (*s)))
 308       break;
 309   if (!*s)
 310     return 0;
 311   l = strlen (*s);
 312   /* HTTP and FTP protocols are expected to yield exact host names
 313      (i.e. the `//' part must be skipped, too).  */
 314   if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
 315     l += 2;
 316   return l;
 317 }
 318
 319 /* Returns 1 if the URL begins with a protocol (supported or
 320    unsupported), 0 otherwise.  */
 321 int
 322 has_proto (const char *url)
 323 {
 324   char **s;
 325
 326   url += skip_url (url);
 327   for (s = protostrings; *s; s++)
 328     if (strncasecmp (url, *s, strlen (*s)) == 0)
 329       return 1;
 330   return 0;
 331 }
 332
 333 /* Skip the username and password, if present here.  The function
 334    should be called *not* with the complete URL, but with the part
 335    right after the protocol.
 336
 337    If no username and password are found, return 0.  */
 338 int
 339 skip_uname (const char *url)
 340 {
 341   const char *p;
 342   for (p = url; *p && *p != '/'; p++)
 343     if (*p == '@')
 344       break;
 345   /* If a `@' was found before the first occurrence of `/', skip
 346      it.  */
 347   if (*p == '@')
 348     return p - url + 1;
 349   else
 350     return 0;
 351 }
 352 \f
 353 /* Allocate a new urlinfo structure, fill it with default values and
 354    return a pointer to it.  */
 355 struct urlinfo *
 356 newurl (void)
 357 {
 358   struct urlinfo *u;
 359
 360   u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
 361   memset (u, 0, sizeof (*u));
 362   u->proto = URLUNKNOWN;
 363   return u;
 364 }
 365
 366 /* Perform a "deep" free of the urlinfo structure.  The structure
 367    should have been created with newurl, but need not have been used.
 368    If free_pointer is non-0, free the pointer itself.  */
 369 void
 370 freeurl (struct urlinfo *u, int complete)
 371 {
 372   assert (u != NULL);
 373   FREE_MAYBE (u->url);
 374   FREE_MAYBE (u->host);
 375   FREE_MAYBE (u->path);
 376   FREE_MAYBE (u->file);
 377   FREE_MAYBE (u->dir);
 378   FREE_MAYBE (u->user);
 379   FREE_MAYBE (u->passwd);
 380   FREE_MAYBE (u->local);
 381   FREE_MAYBE (u->referer);
 382   if (u->proxy)
 383     freeurl (u->proxy, 1);
 384   if (complete)
 385     xfree (u);
 386   return;
 387 }
 388 \f
 389 /* Extract the given URL of the form
 390    (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
 391    1. hostname (terminated with `/' or `:')
 392    2. port number (terminated with `/'), or chosen for the protocol
 393    3. dirname (everything after hostname)
 394    Most errors are handled.  No allocation is done, you must supply
 395    pointers to allocated memory.
 396    ...and a host of other stuff :-)
 397
 398    - Recognizes hostname:dir/file for FTP and
 399      hostname (:portnum)?/dir/file for HTTP.
 400    - Parses the path to yield directory and file
 401    - Parses the URL to yield the username and passwd (if present)
 402    - Decodes the strings, in case they contain "forbidden" characters
 403    - Writes the result to struct urlinfo
 404
 405    If the argument STRICT is set, it recognizes only the canonical
 406    form.  */
 407 uerr_t
 408 parseurl (const char *url, struct urlinfo *u, int strict)
 409 {
 410   int i, l, abs_ftp;
 411   int recognizable;            /* Recognizable URL is the one where
 412                                   the protocol name was explicitly
 413                                   named, i.e. it wasn't deduced from
 414                                   the URL format.  */
 415   uerr_t type;
 416
 417   DEBUGP (("parseurl (\"%s\") -> ", url));
 418   url += skip_url (url);
 419   recognizable = has_proto (url);
 420   if (strict && !recognizable)
 421     return URLUNKNOWN;
 422   for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
 423     {
 424       l = strlen (sup_protos[i].name);
 425       if (!strncasecmp (sup_protos[i].name, url, l))
 426         break;
 427     }
 428   /* If protocol is recognizable, but unsupported, bail out, else
 429      suppose unknown.  */
 430   if (recognizable && i == ARRAY_SIZE (sup_protos))
 431     return URLUNKNOWN;
 432   else if (i == ARRAY_SIZE (sup_protos))
 433     type = URLUNKNOWN;
 434   else
 435     u->proto = type = sup_protos[i].ind;
 436
 437   if (type == URLUNKNOWN)
 438     l = 0;
 439   /* Allow a username and password to be specified (i.e. just skip
 440      them for now).  */
 441   if (recognizable)
 442     l += skip_uname (url + l);
 443   for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
 444   if (i == l)
 445     return URLBADHOST;
 446   /* Get the hostname.  */
 447   u->host = strdupdelim (url + l, url + i);
 448   DEBUGP (("host %s -> ", u->host));
 449
 450   /* Assume no port has been given.  */
 451   u->port = 0;
 452   if (url[i] == ':')
 453     {
 454       /* We have a colon delimiting the hostname.  It could mean that
 455          a port number is following it, or a directory.  */
 456       if (ISDIGIT (url[++i]))    /* A port number */
 457         {
 458           if (type == URLUNKNOWN)
 459             u->proto = type = URLHTTP;
 460           for (; url[i] && url[i] != '/'; i++)
 461             if (ISDIGIT (url[i]))
 462               u->port = 10 * u->port + (url[i] - '0');
 463             else
 464               return URLBADPORT;
 465           if (!u->port)
 466             return URLBADPORT;
 467           DEBUGP (("port %hu -> ", u->port));
 468         }
 469       else if (type == URLUNKNOWN) /* or a directory */
 470         u->proto = type = URLFTP;
 471       else                      /* or just a misformed port number */
 472         return URLBADPORT;
 473     }
 474   else if (type == URLUNKNOWN)
 475     u->proto = type = URLHTTP;
 476   if (!u->port)
 477     {
 478       int ind;
 479       for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
 480         if (sup_protos[ind].ind == type)
 481           break;
 482       if (ind == ARRAY_SIZE (sup_protos))
 483         return URLUNKNOWN;
 484       u->port = sup_protos[ind].port;
 485     }
 486   /* Some delimiter troubles...  */
 487   if (url[i] == '/' && url[i - 1] != ':')
 488     ++i;
 489   if (type == URLHTTP)
 490     while (url[i] && url[i] == '/')
 491       ++i;
 492   u->path = (char *)xmalloc (strlen (url + i) + 8);
 493   strcpy (u->path, url + i);
 494   if (type == URLFTP)
 495     {
 496       u->ftp_type = process_ftp_type (u->path);
 497       /* #### We don't handle type `d' correctly yet.  */
 498       if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
 499         u->ftp_type = 'I';
 500     }
 501   DEBUGP (("opath %s -> ", u->path));
 502   /* Parse the username and password (if existing).  */
 503   parse_uname (url, &u->user, &u->passwd);
 504   /* Decode the strings, as per RFC 1738.  */
 505   decode_string (u->host);
 506   decode_string (u->path);
 507   if (u->user)
 508     decode_string (u->user);
 509   if (u->passwd)
 510     decode_string (u->passwd);
 511   /* Parse the directory.  */
 512   parse_dir (u->path, &u->dir, &u->file);
 513   DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
 514   /* Simplify the directory.  */
 515   path_simplify (u->dir);
 516   /* Remove the leading `/' in HTTP.  */
 517   if (type == URLHTTP && *u->dir == '/')
 518     strcpy (u->dir, u->dir + 1);
 519   DEBUGP (("ndir %s\n", u->dir));
 520   /* Strip trailing `/'.  */
 521   l = strlen (u->dir);
 522   if (l && u->dir[l - 1] == '/')
 523     u->dir[l - 1] = '\0';
 524   /* Re-create the path: */
 525   abs_ftp = (u->proto == URLFTP && *u->dir == '/');
 526   /*  sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
 527       abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
 528   strcpy (u->path, abs_ftp ? "%2F" : "/");
 529   strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
 530   strcat (u->path, *u->dir ? "/" : "");
 531   strcat (u->path, u->file);
 532   URL_CLEANSE (u->path);
 533   DEBUGP (("newpath: %s\n", u->path));
 534   /* Create the clean URL.  */
 535   u->url = str_url (u, 0);
 536   return URLOK;
 537 }
 538 \f
 539 /* Special versions of DOTP and DDOTP for parse_dir(). */
 540
 541 #define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
 542 #define PD_DDOTP(x) ((*(x) == '.') && (*(x) == '.')             \
 543                      && (!*((x) + 2) || *((x) + 2) == '?'))
 544
 545 /* Build the directory and filename components of the path.  Both
 546    components are *separately* malloc-ed strings!  It does not change
 547    the contents of path.
 548
 549    If the path ends with "." or "..", they are (correctly) counted as
 550    directories.  */
 551 static void
 552 parse_dir (const char *path, char **dir, char **file)
 553 {
 554   int i, l;
 555
 556   l = urlpath_length (path);
 557   for (i = l; i && path[i] != '/'; i--);
 558
 559   if (!i && *path != '/')   /* Just filename */
 560     {
 561       if (PD_DOTP (path) || PD_DDOTP (path))
 562         {
 563           *dir = strdupdelim (path, path + l);
 564           *file = xstrdup (path + l); /* normally empty, but could
 565                                          contain ?... */
 566         }
 567       else
 568         {
 569           *dir = xstrdup ("");     /* This is required because of FTP */
 570           *file = xstrdup (path);
 571         }
 572     }
 573   else if (!i)                 /* /filename */
 574     {
 575       if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
 576         {
 577           *dir = strdupdelim (path, path + l);
 578           *file = xstrdup (path + l); /* normally empty, but could
 579                                          contain ?... */
 580         }
 581       else
 582         {
 583           *dir = xstrdup ("/");
 584           *file = xstrdup (path + 1);
 585         }
 586     }
 587   else /* Nonempty directory with or without a filename */
 588     {
 589       if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
 590         {
 591           *dir = strdupdelim (path, path + l);
 592           *file = xstrdup (path + l); /* normally empty, but could
 593                                          contain ?... */
 594         }
 595       else
 596         {
 597           *dir = strdupdelim (path, path + i);
 598           *file = xstrdup (path + i + 1);
 599         }
 600     }
 601 }
 602
 603 /* Find the optional username and password within the URL, as per
 604    RFC1738.  The returned user and passwd char pointers are
 605    malloc-ed.  */
 606 static uerr_t
 607 parse_uname (const char *url, char **user, char **passwd)
 608 {
 609   int l;
 610   const char *p, *col;
 611   char **where;
 612
 613   *user = NULL;
 614   *passwd = NULL;
 615   url += skip_url (url);
 616   /* Look for end of protocol string.  */
 617   l = skip_proto (url);
 618   if (!l)
 619     return URLUNKNOWN;
 620   /* Add protocol offset.  */
 621   url += l;
 622   /* Is there an `@' character?  */
 623   for (p = url; *p && *p != '/'; p++)
 624     if (*p == '@')
 625       break;
 626   /* If not, return.  */
 627   if (*p != '@')
 628     return URLOK;
 629   /* Else find the username and password.  */
 630   for (p = col = url; *p != '@'; p++)
 631     {
 632       if (*p == ':' && !*user)
 633         {
 634           *user = (char *)xmalloc (p - url + 1);
 635           memcpy (*user, url, p - url);
 636           (*user)[p - url] = '\0';
 637           col = p + 1;
 638         }
 639     }
 640   /* Decide whether you have only the username or both.  */
 641   where = *user ? passwd : user;
 642   *where = (char *)xmalloc (p - col + 1);
 643   memcpy (*where, col, p - col);
 644   (*where)[p - col] = '\0';
 645   return URLOK;
 646 }
 647
 648 /* If PATH ends with `;type=X', return the character X.  */
 649 static char
 650 process_ftp_type (char *path)
 651 {
 652   int len = strlen (path);
 653
 654   if (len >= 7
 655       && !memcmp (path + len - 7, ";type=", 6))
 656     {
 657       path[len - 7] = '\0';
 658       return path[len - 1];
 659     }
 660   else
 661     return '\0';
 662 }
 663 \f
 664 /* Return the URL as fine-formed string, with a proper protocol,
 665    optional port number, directory and optional user/password.  If
 666    HIDE is non-zero, password will be hidden.  The forbidden
 667    characters in the URL will be cleansed.  */
 668 char *
 669 str_url (const struct urlinfo *u, int hide)
 670 {
 671   char *res, *host, *user, *passwd, *proto_name, *dir, *file;
 672   int i, l, ln, lu, lh, lp, lf, ld;
 673   unsigned short proto_default_port;
 674
 675   /* Look for the protocol name.  */
 676   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 677     if (sup_protos[i].ind == u->proto)
 678       break;
 679   if (i == ARRAY_SIZE (sup_protos))
 680     return NULL;
 681   proto_name = sup_protos[i].name;
 682   proto_default_port = sup_protos[i].port;
 683   host = CLEANDUP (u->host);
 684   dir = CLEANDUP (u->dir);
 685   file = CLEANDUP (u->file);
 686   user = passwd = NULL;
 687   if (u->user)
 688     user = CLEANDUP (u->user);
 689   if (u->passwd)
 690     {
 691       int j;
 692       passwd = CLEANDUP (u->passwd);
 693       if (hide)
 694         for (j = 0; passwd[j]; j++)
 695           passwd[j] = 'x';
 696     }
 697   if (u->proto == URLFTP && *dir == '/')
 698     {
 699       char *tmp = (char *)xmalloc (strlen (dir) + 3);
 700       /*sprintf (tmp, "%%2F%s", dir + 1);*/
 701       tmp[0] = '%';
 702       tmp[1] = '2';
 703       tmp[2] = 'F';
 704       strcpy (tmp + 3, dir + 1);
 705       xfree (dir);
 706       dir = tmp;
 707     }
 708
 709   ln = strlen (proto_name);
 710   lu = user ? strlen (user) : 0;
 711   lp = passwd ? strlen (passwd) : 0;
 712   lh = strlen (host);
 713   ld = strlen (dir);
 714   lf = strlen (file);
 715   res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
 716   /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
 717      (user ? user : ""), (passwd ? ":" : ""),
 718      (passwd ? passwd : ""), (user ? "@" : ""),
 719      host, u->port, dir, *dir ? "/" : "", file); */
 720   l = 0;
 721   memcpy (res, proto_name, ln);
 722   l += ln;
 723   if (user)
 724     {
 725       memcpy (res + l, user, lu);
 726       l += lu;
 727       if (passwd)
 728         {
 729           res[l++] = ':';
 730           memcpy (res + l, passwd, lp);
 731           l += lp;
 732         }
 733       res[l++] = '@';
 734     }
 735   memcpy (res + l, host, lh);
 736   l += lh;
 737   if (u->port != proto_default_port)
 738     {
 739       res[l++] = ':';
 740       long_to_string (res + l, (long)u->port);
 741       l += numdigit (u->port);
 742     }
 743   res[l++] = '/';
 744   memcpy (res + l, dir, ld);
 745   l += ld;
 746   if (*dir)
 747     res[l++] = '/';
 748   strcpy (res + l, file);
 749   xfree (host);
 750   xfree (dir);
 751   xfree (file);
 752   FREE_MAYBE (user);
 753   FREE_MAYBE (passwd);
 754   return res;
 755 }
 756
 757 /* Check whether two URL-s are equivalent, i.e. pointing to the same
 758    location.  Uses parseurl to parse them, and compares the canonical
 759    forms.
 760
 761    Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
 762    return 0 on error.  */
 763 int
 764 url_equal (const char *url1, const char *url2)
 765 {
 766   struct urlinfo *u1, *u2;
 767   uerr_t err;
 768   int res;
 769
 770   u1 = newurl ();
 771   err = parseurl (url1, u1, 0);
 772   if (err != URLOK)
 773     {
 774       freeurl (u1, 1);
 775       return 0;
 776     }
 777   u2 = newurl ();
 778   err = parseurl (url2, u2, 0);
 779   if (err != URLOK)
 780     {
 781       freeurl (u2, 1);
 782       return 0;
 783     }
 784   res = !strcmp (u1->url, u2->url);
 785   freeurl (u1, 1);
 786   freeurl (u2, 1);
 787   return res;
 788 }
 789 \f
 790 urlpos *
 791 get_urls_file (const char *file)
 792 {
 793   struct file_memory *fm;
 794   urlpos *head, *tail;
 795   const char *text, *text_end;
 796
 797   /* Load the file.  */
 798   fm = read_file (file);
 799   if (!fm)
 800     {
 801       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 802       return NULL;
 803     }
 804   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 805   head = tail = NULL;
 806   text = fm->content;
 807   text_end = fm->content + fm->length;
 808   while (text < text_end)
 809     {
 810       const char *line_beg = text;
 811       const char *line_end = memchr (text, '\n', text_end - text);
 812       if (!line_end)
 813         line_end = text_end;
 814       else
 815         ++line_end;
 816       text = line_end;
 817       while (line_beg < line_end
 818              && ISSPACE (*line_beg))
 819         ++line_beg;
 820       while (line_end > line_beg + 1
 821              && ISSPACE (*(line_end - 1)))
 822         --line_end;
 823       if (line_end > line_beg)
 824         {
 825           urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
 826           memset (entry, 0, sizeof (*entry));
 827           entry->next = NULL;
 828           entry->url = strdupdelim (line_beg, line_end);
 829           if (!head)
 830             head = entry;
 831           else
 832             tail->next = entry;
 833           tail = entry;
 834         }
 835     }
 836   read_file_free (fm);
 837   return head;
 838 }
 839 \f
 840 /* Free the linked list of urlpos.  */
 841 void
 842 free_urlpos (urlpos *l)
 843 {
 844   while (l)
 845     {
 846       urlpos *next = l->next;
 847       xfree (l->url);
 848       FREE_MAYBE (l->local_name);
 849       xfree (l);
 850       l = next;
 851     }
 852 }
 853
 854 /* Rotate FNAME opt.backups times */
 855 void
 856 rotate_backups(const char *fname)
 857 {
 858   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
 859   char *from = (char *)alloca (maxlen);
 860   char *to = (char *)alloca (maxlen);
 861   struct stat sb;
 862   int i;
 863
 864   if (stat (fname, &sb) == 0)
 865     if (S_ISREG (sb.st_mode) == 0)
 866       return;
 867
 868   for (i = opt.backups; i > 1; i--)
 869     {
 870       sprintf (from, "%s.%d", fname, i - 1);
 871       sprintf (to, "%s.%d", fname, i);
 872       /* #### This will fail on machines without the rename() system
 873          call.  */
 874       rename (from, to);
 875     }
 876
 877   sprintf (to, "%s.%d", fname, 1);
 878   rename(fname, to);
 879 }
 880
 881 /* Create all the necessary directories for PATH (a file).  Calls
 882    mkdirhier() internally.  */
 883 int
 884 mkalldirs (const char *path)
 885 {
 886   const char *p;
 887   char *t;
 888   struct stat st;
 889   int res;
 890
 891   p = path + strlen (path);
 892   for (; *p != '/' && p != path; p--);
 893   /* Don't create if it's just a file.  */
 894   if ((p == path) && (*p != '/'))
 895     return 0;
 896   t = strdupdelim (path, p);
 897   /* Check whether the directory exists.  */
 898   if ((stat (t, &st) == 0))
 899     {
 900       if (S_ISDIR (st.st_mode))
 901         {
 902           xfree (t);
 903           return 0;
 904         }
 905       else
 906         {
 907           /* If the dir exists as a file name, remove it first.  This
 908              is *only* for Wget to work with buggy old CERN http
 909              servers.  Here is the scenario: When Wget tries to
 910              retrieve a directory without a slash, e.g.
 911              http://foo/bar (bar being a directory), CERN server will
 912              not redirect it too http://foo/bar/ -- it will generate a
 913              directory listing containing links to bar/file1,
 914              bar/file2, etc.  Wget will lose because it saves this
 915              HTML listing to a file `bar', so it cannot create the
 916              directory.  To work around this, if the file of the same
 917              name exists, we just remove it and create the directory
 918              anyway.  */
 919           DEBUGP (("Removing %s because of directory danger!\n", t));
 920           unlink (t);
 921         }
 922     }
 923   res = make_directory (t);
 924   if (res != 0)
 925     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
 926   xfree (t);
 927   return res;
 928 }
 929
 930 static int
 931 count_slashes (const char *s)
 932 {
 933   int i = 0;
 934   while (*s)
 935     if (*s++ == '/')
 936       ++i;
 937   return i;
 938 }
 939
 940 /* Return the path name of the URL-equivalent file name, with a
 941    remote-like structure of directories.  */
 942 static char *
 943 mkstruct (const struct urlinfo *u)
 944 {
 945   char *host, *dir, *file, *res, *dirpref;
 946   int l;
 947
 948   assert (u->dir != NULL);
 949   assert (u->host != NULL);
 950
 951   if (opt.cut_dirs)
 952     {
 953       char *ptr = u->dir + (*u->dir == '/');
 954       int slash_count = 1 + count_slashes (ptr);
 955       int cut = MINVAL (opt.cut_dirs, slash_count);
 956       for (; cut && *ptr; ptr++)
 957         if (*ptr == '/')
 958           --cut;
 959       STRDUP_ALLOCA (dir, ptr);
 960     }
 961   else
 962     dir = u->dir + (*u->dir == '/');
 963
 964   host = xstrdup (u->host);
 965   /* Check for the true name (or at least a consistent name for saving
 966      to directory) of HOST, reusing the hlist if possible.  */
 967   if (opt.add_hostdir && !opt.simple_check)
 968     {
 969       char *nhost = realhost (host);
 970       xfree (host);
 971       host = nhost;
 972     }
 973   /* Add dir_prefix and hostname (if required) to the beginning of
 974      dir.  */
 975   if (opt.add_hostdir)
 976     {
 977       if (!DOTP (opt.dir_prefix))
 978         {
 979           dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
 980                                     + strlen (host) + 1);
 981           sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
 982         }
 983       else
 984         STRDUP_ALLOCA (dirpref, host);
 985     }
 986   else                         /* not add_hostdir */
 987     {
 988       if (!DOTP (opt.dir_prefix))
 989         dirpref = opt.dir_prefix;
 990       else
 991         dirpref = "";
 992     }
 993   xfree (host);
 994
 995   /* If there is a prefix, prepend it.  */
 996   if (*dirpref)
 997     {
 998       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
 999       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1000       dir = newdir;
1001     }
1002   dir = xstrdup (dir);
1003   URL_CLEANSE (dir);
1004   l = strlen (dir);
1005   if (l && dir[l - 1] == '/')
1006     dir[l - 1] = '\0';
1007
1008   if (!*u->file)
1009     file = "index.html";
1010   else
1011     file = u->file;
1012
1013   /* Finally, construct the full name.  */
1014   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1015   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1016   xfree (dir);
1017   return res;
1018 }
1019
1020 /* Create a unique filename, corresponding to a given URL.  Calls
1021    mkstruct if necessary.  Does *not* actually create any directories.  */
1022 char *
1023 url_filename (const struct urlinfo *u)
1024 {
1025   char *file, *name;
1026   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1027
1028   if (opt.dirstruct)
1029     {
1030       file = mkstruct (u);
1031       have_prefix = 1;
1032     }
1033   else
1034     {
1035       if (!*u->file)
1036         file = xstrdup ("index.html");
1037       else
1038         file = xstrdup (u->file);
1039     }
1040
1041   if (!have_prefix)
1042     {
1043       /* Check whether the prefix directory is something other than "."
1044          before prepending it.  */
1045       if (!DOTP (opt.dir_prefix))
1046         {
1047           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1048                                          + 1 + strlen (file) + 1);
1049           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1050           xfree (file);
1051           file = nfile;
1052         }
1053     }
1054   /* DOS-ish file systems don't like `%' signs in them; we change it
1055      to `@'.  */
1056 #ifdef WINDOWS
1057   {
1058     char *p = file;
1059     for (p = file; *p; p++)
1060       if (*p == '%')
1061         *p = '@';
1062   }
1063 #endif /* WINDOWS */
1064
1065   /* Check the cases in which the unique extensions are not used:
1066      1) Clobbering is turned off (-nc).
1067      2) Retrieval with regetting.
1068      3) Timestamping is used.
1069      4) Hierarchy is built.
1070
1071      The exception is the case when file does exist and is a
1072      directory (actually support for bad httpd-s).  */
1073   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1074       && !(file_exists_p (file) && !file_non_directory_p (file)))
1075     return file;
1076
1077   /* Find a unique name.  */
1078   name = unique_name (file);
1079   xfree (file);
1080   return name;
1081 }
1082
/* Return the number of characters in URL that precede its first '?',
   or the full length of URL when it contains no '?'.  */
static int
urlpath_length (const char *url)
{
  /* strcspn counts the leading characters that are not '?', stopping
     at the terminating NUL if no '?' is present.  */
  return strcspn (url, "?");
}
1092
/* Find the last occurrence of character C in the half-open range
   [B, E) and return a pointer to it, or NULL if C does not occur
   there.  This is almost completely equivalent to
   { *e = '\0'; return strrchr(b); }, except that it doesn't change
   the contents of the string.

   The character at E itself is never examined, and the character at B
   is, so the scanned bytes are exactly B[0] .. E[-1].  An empty range
   (E <= B) yields NULL.  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Walk backwards; pre-decrementing keeps E inside [B, E) at every
     dereference.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1105
/* Combine the absolute URL with the SUBSIZE-byte reference SUB and
   return the result in freshly allocated storage.  This tries to
   behave "reasonably" in all foreseeable cases; it knows very little
   about protocols and works purely on strings.

   When NO_PROTO is zero, SUB is simply copied.  Otherwise SUB is
   spliced into URL, looking only at the part of URL before its first
   '?' (see urlpath_length):

     - SUB not starting with '/' is relative: everything after the
       last slash of URL is replaced by SUB.
       "whatever/foo/bar" + "qux/xyzzy" -> "whatever/foo/qux/xyzzy"

     - SUB starting with '/' is an absolute path: everything from the
       first slash that follows the initial "//" is replaced by SUB.
       "http://host/whatever/foo/bar" + "/qux/xyzzy"
         -> "http://host/qux/xyzzy"  */
static char *
construct (const char *url, const char *sub, int subsize, int no_proto)
{
  const char *end;              /* the '?' or '\0' that ends URL's path */
  const char *start_insert;     /* SUB goes at this offset of the result */
  int need_explicit_slash = 0;  /* overwrite the byte before SUB with '/' */
  int span;                     /* number of URL bytes kept */
  char *constr;

  if (!no_proto)
    return strdupdelim (sub, sub + subsize);

  end = url + urlpath_length (url);

  if (*sub != '/')
    {
      const char *last_slash = find_last_char (url, end, '/');

      if (!last_slash || (last_slash != url && last_slash[-1] == '/'))
        {
          /* Either URL has no slash at all ("foo", "foo?..."), or its
             last slash is the second one of a "//" ("http://host").
             In both cases SUB must be appended after a separating
             slash: plain concatenation would turn "foo" + "qux/xyzzy"
             into "fooqux/xyzzy" instead of "foo/qux/xyzzy".

             Pointing START_INSERT one past END reserves room for that
             slash.  The byte copied from END is the delimiter ('\0'
             or '?'), and it is overwritten with '/' below.  */
          start_insert = end + 1;
          need_explicit_slash = 1;
        }
      else
        {
          /* example: "whatever/foo/bar" */
          /*                        ^    */
          start_insert = last_slash + 1;
        }
    }
  else
    {
      /* Look for the first slash, stepping over one leading "//".  */
      const char *slash = memchr (url, '/', end - url);
      int seen_slash_slash = 0;

      if (slash && slash[1] == '/')
        {
          const char *pos = slash + 2;
          seen_slash_slash = 1;
          slash = memchr (pos, '/', end - pos);
        }

      /* SLASH is now the first slash after "//", or the first slash
         altogether, or NULL.  Keep in mind that SUB itself begins
         with '/'.  */
      if (!seen_slash_slash)
        /* examples: "foo", "foo/bar" -- SUB replaces all of URL.  */
        start_insert = url;
      else if (slash)
        /* example: "http://something/" */
        /*                           ^  */
        start_insert = slash;
      else
        /* example: "http://foo" */
        /*                     ^ */
        start_insert = end;
    }

  /* Assemble: SPAN bytes of URL, then SUBSIZE bytes of SUB, then NUL.  */
  span = start_insert - url;
  constr = (char *)xmalloc (span + subsize + 1);
  if (span)
    memcpy (constr, url, span);
  if (need_explicit_slash)
    constr[span - 1] = '/';
  if (subsize)
    memcpy (constr + span, sub, subsize);
  constr[span + subsize] = '\0';
  return constr;
}
1238
/* Friendlier front end to construct: merge NEW_URL into BASE_URL.
   NEW_URL is passed in full (its strlen), and is treated as lacking a
   protocol exactly when has_proto says it has none.  */
char *
url_concat (const char *base_url, const char *new_url)
{
  int new_len = strlen (new_url);
  int lacks_proto = !has_proto (new_url);

  return construct (base_url, new_url, new_len, lacks_proto);
}
1245 \f
1246 /* Optimize URL by host, destructively replacing u->host with realhost
1247    (u->host).  Do this regardless of opt.simple_check.  */
1248 void
1249 opt_url (struct urlinfo *u)
1250 {
1251   /* Find the "true" host.  */
1252   char *host = realhost (u->host);
1253   xfree (u->host);
1254   u->host = host;
1255   assert (u->dir != NULL);      /* the URL must have been parsed */
1256   /* Refresh the printed representation.  */
1257   xfree (u->url);
1258   u->url = str_url (u, 0);
1259 }
1260
/* The kludge below is kept for reference only: parse_dir now does the
   (almost) right thing, so a query can never end up as part of the
   directory and this wrapper is not needed.  */
#if 0
/* Run path_simplify on PATH while shielding the query part (anything
   from the first '?' on) from path_simplify's "optimizations".  */
void
path_simplify_with_kludge (char *path)
{
  char *query = strchr (path, '?');

  if (!query)
    {
      path_simplify (path);
      return;
    }

  /* path_simplify works destructively, so writing to PATH is fair
     game: cut the string at the '?' for the duration of the call.  */
  *query = '\0';
  path_simplify (path);
  {
    char *newend = path + strlen (path);
    *query = '?';
    /* If the path part shrank, slide the query down to follow it.  */
    if (newend != query)
      memmove (newend, query, strlen (query) + 1);
  }
}
#endif
1286 \f
1287 /* Returns proxy host address, in accordance with PROTO.  */
1288 char *
1289 getproxy (uerr_t proto)
1290 {
1291   if (proto == URLHTTP)
1292     return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1293   else if (proto == URLFTP)
1294     return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1295 #ifdef HAVE_SSL
1296   else if (proto == URLHTTPS)
1297     return opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1298 #endif /* HAVE_SSL */
1299   else
1300     return NULL;
1301 }
1302
/* Decide whether HOST should be accessed through the proxy, given the
   NO_PROXY list.  Returns 1 when NO_PROXY is NULL, and otherwise the
   logical negation of sufmatch (NO_PROXY, HOST).  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  return !no_proxy || !sufmatch (no_proxy, host);
}
1312 \f
1313 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1314 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1315
1316 /* Change the links in an HTML document.  Accepts a structure that
1317    defines the positions of all the links.  */
1318 void
1319 convert_links (const char *file, urlpos *l)
1320 {
1321   struct file_memory *fm;
1322   FILE               *fp;
1323   const char         *p;
1324   downloaded_file_t  downloaded_file_return;
1325
1326   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1327
1328   {
1329     /* First we do a "dry run": go through the list L and see whether
1330        any URL needs to be converted in the first place.  If not, just
1331        leave the file alone.  */
1332     int count = 0;
1333     urlpos *dry = l;
1334     for (dry = l; dry; dry = dry->next)
1335       if (dry->convert != CO_NOCONVERT)
1336         ++count;
1337     if (!count)
1338       {
1339         logputs (LOG_VERBOSE, _("nothing to do.\n"));
1340         return;
1341       }
1342   }
1343
1344   fm = read_file (file);
1345   if (!fm)
1346     {
1347       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1348                  file, strerror (errno));
1349       return;
1350     }
1351
1352   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1353   if (opt.backup_converted && downloaded_file_return)
1354     write_backup_file (file, downloaded_file_return);
1355
1356   /* Before opening the file for writing, unlink the file.  This is
1357      important if the data in FM is mmaped.  In such case, nulling the
1358      file, which is what fopen() below does, would make us read all
1359      zeroes from the mmaped region.  */
1360   if (unlink (file) < 0 && errno != ENOENT)
1361     {
1362       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1363                  file, strerror (errno));
1364       read_file_free (fm);
1365       return;
1366     }
1367   /* Now open the file for writing.  */
1368   fp = fopen (file, "wb");
1369   if (!fp)
1370     {
1371       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1372                  file, strerror (errno));
1373       read_file_free (fm);
1374       return;
1375     }
1376   /* Here we loop through all the URLs in file, replacing those of
1377      them that are downloaded with relative references.  */
1378   p = fm->content;
1379   for (; l; l = l->next)
1380     {
1381       char *url_start = fm->content + l->pos;
1382
1383       if (l->pos >= fm->length)
1384         {
1385           DEBUGP (("Something strange is going on.  Please investigate."));
1386           break;
1387         }
1388       /* If the URL is not to be converted, skip it.  */
1389       if (l->convert == CO_NOCONVERT)
1390         {
1391           DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1392           continue;
1393         }
1394
1395       /* Echo the file contents, up to the offending URL's opening
1396          quote, to the outfile.  */
1397       fwrite (p, 1, url_start - p, fp);
1398       p = url_start;
1399       if (l->convert == CO_CONVERT_TO_RELATIVE)
1400         {
1401           /* Convert absolute URL to relative. */
1402           char *newname = construct_relative (file, l->local_name);
1403           char *quoted_newname = html_quote_string (newname);
1404           replace_attr (&p, l->size, fp, quoted_newname);
1405           DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1406                    l->url, newname, l->pos, file));
1407           xfree (newname);
1408           xfree (quoted_newname);
1409         }
1410       else if (l->convert == CO_CONVERT_TO_COMPLETE)
1411         {
1412           /* Convert the link to absolute URL. */
1413           char *newlink = l->url;
1414           char *quoted_newlink = html_quote_string (newlink);
1415           replace_attr (&p, l->size, fp, quoted_newlink);
1416           DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1417                    newlink, l->pos, file));
1418           xfree (quoted_newlink);
1419         }
1420     }
1421   /* Output the rest of the file. */
1422   if (p - fm->content < fm->length)
1423     fwrite (p, 1, fm->length - (p - fm->content), fp);
1424   fclose (fp);
1425   read_file_free (fm);
1426   logputs (LOG_VERBOSE, _("done.\n"));
1427 }
1428
/* Return a malloced relative link leading from the file with local
   name S1 (the referrer) to the file with local name S2 (the
   referred).

   Examples:
     S1 "jagor.srce.hr/index.html", S2 "jagor.srce.hr/images/news.gif"
       -> "images/news.gif"
     S1 "fly.cc.fer.hr/ioccc/index.html", S2 "fly.cc.fer.hr/images/fly.gif"
       -> "../images/fly.gif"

   If S2 begins with '/', a copy of S2 is returned unchanged.

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int pos = 0;                  /* scanning index into S1 and S2 */
  int common = 0;               /* length of the shared directory prefix */
  int updirs = 0;               /* number of "../" components needed */
  int k;
  char *result, *out;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');

  /* Step over the leading directories that both names share.  */
  for (;;)
    {
      /* Advance through one path component while the names agree.  */
      while (s1[pos] && s1[pos] == s2[pos] && s1[pos] != '/')
        ++pos;
      if (s1[pos] != '/' || s2[pos] != '/')
        break;
      /* Both components ended together: the directory is common.  */
      common = ++pos;
    }

  /* Every slash left in S1 is a directory to climb out of.  */
  for (; s1[pos]; pos++)
    if (s1[pos] == '/')
      ++updirs;

  /* Result: "../" UPDIRS times, then the non-shared tail of S2.  */
  result = (char *)xmalloc (3 * updirs + strlen (s2 + common) + 1);
  out = result;
  for (k = 0; k < updirs; k++)
    {
      memcpy (out, "../", 3);
      out += 3;
    }
  strcpy (out, s2 + common);
  return result;
}
1482 \f
1483 /* Add URL to the head of the list L.  */
1484 urlpos *
1485 add_url (urlpos *l, const char *url, const char *file)
1486 {
1487   urlpos *t;
1488
1489   t = (urlpos *)xmalloc (sizeof (urlpos));
1490   memset (t, 0, sizeof (*t));
1491   t->url = xstrdup (url);
1492   t->local_name = xstrdup (file);
1493   t->next = l;
1494   return t;
1495 }
1496
1497 static void
1498 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1499 {
1500   /* Rather than just writing over the original .html file with the
1501      converted version, save the former to *.orig.  Note we only do
1502      this for files we've _successfully_ downloaded, so we don't
1503      clobber .orig files sitting around from previous invocations. */
1504
1505   /* Construct the backup filename as the original name plus ".orig". */
1506   size_t         filename_len = strlen(file);
1507   char*          filename_plus_orig_suffix;
1508   boolean        already_wrote_backup_file = FALSE;
1509   slist*         converted_file_ptr;
1510   static slist*  converted_files = NULL;
1511
1512   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1513     {
1514       /* Just write "orig" over "html".  We need to do it this way
1515          because when we're checking to see if we've downloaded the
1516          file before (to see if we can skip downloading it), we don't
1517          know if it's a text/html file.  Therefore we don't know yet
1518          at that stage that -E is going to cause us to tack on
1519          ".html", so we need to compare vs. the original URL plus
1520          ".orig", not the original URL plus ".html.orig". */
1521       filename_plus_orig_suffix = alloca (filename_len + 1);
1522       strcpy(filename_plus_orig_suffix, file);
1523       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1524     }
1525   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1526     {
1527       /* Append ".orig" to the name. */
1528       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1529       strcpy(filename_plus_orig_suffix, file);
1530       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1531     }
1532
1533   /* We can get called twice on the same URL thanks to the
1534      convert_all_links() call in main().  If we write the .orig file
1535      each time in such a case, it'll end up containing the first-pass
1536      conversion, not the original file.  So, see if we've already been
1537      called on this file. */
1538   converted_file_ptr = converted_files;
1539   while (converted_file_ptr != NULL)
1540     if (strcmp(converted_file_ptr->string, file) == 0)
1541       {
1542         already_wrote_backup_file = TRUE;
1543         break;
1544       }
1545     else
1546       converted_file_ptr = converted_file_ptr->next;
1547
1548   if (!already_wrote_backup_file)
1549     {
1550       /* Rename <file> to <file>.orig before former gets written over. */
1551       if (rename(file, filename_plus_orig_suffix) != 0)
1552         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1553                    file, filename_plus_orig_suffix, strerror (errno));
1554
1555       /* Remember that we've already written a .orig backup for this file.
1556          Note that we never free this memory since we need it till the
1557          convert_all_links() call, which is one of the last things the
1558          program does before terminating.  BTW, I'm not sure if it would be
1559          safe to just set 'converted_file_ptr->string' to 'file' below,
1560          rather than making a copy of the string...  Another note is that I
1561          thought I could just add a field to the urlpos structure saying
1562          that we'd written a .orig file for this URL, but that didn't work,
1563          so I had to make this separate list.
1564
1565          This [adding a field to the urlpos structure] didn't work
1566          because convert_file() is called twice: once after all its
1567          sublinks have been retrieved in recursive_retrieve(), and
1568          once at the end of the day in convert_all_links().  The
1569          original linked list collected in recursive_retrieve() is
1570          lost after the first invocation of convert_links(), and
1571          convert_all_links() makes a new one (it calls get_urls_html()
1572          for each file it covers.)  That's why your approach didn't
1573          work.  The way to make it work is perhaps to make this flag a
1574          field in the `urls_html' list.  */
1575
1576       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1577       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
1578       converted_file_ptr->next = converted_files;
1579       converted_files = converted_file_ptr;
1580     }
1581 }
1582
1583 static int find_fragment PARAMS ((const char *, int, const char **,
1584                                   const char **));
1585
/* Write to FP a replacement for the attribute value that starts at
   *PP and spans RAW_SIZE bytes, then advance *PP past the old value.

   The old value has one of two shapes:
       "...old-contents..."     RAW_SIZE includes both quotes
       ...old-contents...       no quotes

   The output is NEW_STR wrapped in quotes -- the old value's own
   quote character if it had one, a double quote otherwise --
   followed, inside the quotes, by the old value's fragment (from its
   '#' on) if find_fragment locates one.  */
static void
replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
{
  const char *cur = *pp;
  const char *frag_beg, *frag_end;
  char quote_char = '\"';
  int inner_size = raw_size;
  int quoted = (*cur == '\"' || *cur == '\'');

  if (quoted)
    {
      /* Step inside the quotes; neither quote counts as contents.  */
      quote_char = *cur++;
      inner_size -= 2;
    }

  putc (quote_char, fp);
  fputs (new_str, fp);

  /* Carry over the fragment identifier, if any.  */
  if (find_fragment (cur, inner_size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);

  putc (quote_char, fp);

  /* Skip the old contents, plus the closing quote if there was one.  */
  *pp = cur + inner_size + (quoted ? 1 : 0);
}
1621
/* Look through [BEG, BEG+SIZE) for the first '#' that is not
   immediately preceded by '&'.  If there is one, store its address in
   *BP and the end of the range in *EP, and return 1.  Otherwise
   return 0 and leave *BP and *EP untouched.

   This is used for finding the fragment identifiers in URLs.  */

static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *limit = beg + size;
  const char *cur;
  int after_amp = 0;            /* was the previous character '&'? */

  for (cur = beg; cur < limit; cur++)
    {
      if (*cur == '#' && !after_amp)
        {
          *bp = cur;
          *ep = limit;
          return 1;
        }
      after_amp = (*cur == '&');
    }
  return 0;
}
1655
1656 typedef struct _downloaded_file_list {
1657   char*                          file;
1658   downloaded_file_t              download_type;
1659   struct _downloaded_file_list*  next;
1660 } downloaded_file_list;
1661
1662 static downloaded_file_list *downloaded_files;
1663
1664 /* Remembers which files have been downloaded.  In the standard case, should be
1665    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1666    download successfully (i.e. not for ones we have failures on or that we skip
1667    due to -N).
1668
1669    When we've downloaded a file and tacked on a ".html" extension due to -E,
1670    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1671    FILE_DOWNLOADED_NORMALLY.
1672
1673    If you just want to check if a file has been previously added without adding
1674    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
1675    with local filenames, not remote URLs. */
1676 downloaded_file_t
1677 downloaded_file (downloaded_file_t  mode, const char*  file)
1678 {
1679   boolean                       found_file = FALSE;
1680   downloaded_file_list*         rover = downloaded_files;
1681
1682   while (rover != NULL)
1683     if (strcmp(rover->file, file) == 0)
1684       {
1685         found_file = TRUE;
1686         break;
1687       }
1688     else
1689       rover = rover->next;
1690
1691   if (found_file)
1692     return rover->download_type;  /* file had already been downloaded */
1693   else
1694     {
1695       if (mode != CHECK_FOR_FILE)
1696         {
1697           rover = xmalloc(sizeof(*rover));
1698           rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1699           rover->download_type = mode;
1700           rover->next = downloaded_files;
1701           downloaded_files = rover;
1702         }
1703
1704       return FILE_NOT_ALREADY_DOWNLOADED;
1705     }
1706 }
1707
1708 void
1709 downloaded_files_free (void)
1710 {
1711   downloaded_file_list*         rover = downloaded_files;
1712   while (rover)
1713     {
1714       downloaded_file_list *next = rover->next;
1715       xfree (rover->file);
1716       xfree (rover);
1717       rover = next;
1718     }
1719 }
1720 \f
/* One-time setup of this module's static data: fills in the table of
   unsafe characters by calling init_unsafe_char_table.  */
void
url_init (void)
{
  init_unsafe_char_table ();
}