[svn] Implemented and documented new -E / --html-extension / html_extension option.

[wget] / src / url.c
diff --git a/src/url.c b/src/url.c

index 87a97b30cec0a0d6f2448fe5fe80517df601bda6..b6220e3db248e4ab85f4af3bac20f97d54c2d08c 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -53,14 +53,18 @@ extern int errno;
  
  /* A list of unsafe characters for encoding, as per RFC1738.  '@' and
     ':' (not listed in RFC) were added because of user/password
-   encoding, and \033 for safe printing.  */
+   encoding.  */
  
  #ifndef WINDOWS
-# define URL_UNSAFE " <>\"#%{}|\\^~[]`@:\033"
+# define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
  #else  /* WINDOWS */
-# define URL_UNSAFE " <>\"%{}|\\^[]`\033"
+# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
  #endif /* WINDOWS */
  
+#define UNSAFE_CHAR(c) (   ((unsigned char)(c) <= ' ')  /* <= ASCII 32  */ \
+                       || ((unsigned char)(c) >  '~')  /* >= ASCII 127 */ \
+                       || strchr (URL_UNSAFE_CHARS, c)) /* or listed above */
+
  /* If S contains unsafe characters, free it and replace it with a
     version that doesn't.  */
  #define URL_CLEANSE(s) do                      \
@@ -154,9 +158,9 @@ skip_url (const char *url)
  {
    int i;
  
-  if (toupper (url[0]) == 'U'
-      && toupper (url[1]) == 'R'
-      && toupper (url[2]) == 'L'
+  if (TOUPPER (url[0]) == 'U'
+      && TOUPPER (url[1]) == 'R'
+      && TOUPPER (url[2]) == 'L'
        && url[3] == ':')
      {
        /* Skip blanks.  */
@@ -172,7 +176,7 @@ int
  contains_unsafe (const char *s)
  {
    for (; *s; s++)
-    if (strchr (URL_UNSAFE, *s))
+    if (UNSAFE_CHAR (*s))
        return 1;
    return 0;
  }
@@ -209,8 +213,8 @@ decode_string (char *s)
    *p = '\0';
  }
  
-/* Encodes the unsafe characters (listed in URL_UNSAFE) in a given
-   string, returning a malloc-ed %XX encoded string.  */
+/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
+   given string, returning a malloc-ed %XX encoded string.  */
  char *
  encode_string (const char *s)
  {
@@ -220,12 +224,12 @@ encode_string (const char *s)
  
    b = s;
    for (i = 0; *s; s++, i++)
-    if (strchr (URL_UNSAFE, *s))
+    if (UNSAFE_CHAR (*s))
        i += 2; /* Two more characters (hex digits) */
    res = (char *)xmalloc (i + 1);
    s = b;
    for (p = res; *s; s++)
-    if (strchr (URL_UNSAFE, *s))
+    if (UNSAFE_CHAR (*s))
        {
         const unsigned char c = *s;
         *p++ = '%';
@@ -464,7 +468,7 @@ parseurl (const char *url, struct urlinfo *u, int strict)
      {
        u->ftp_type = process_ftp_type (u->path);
        /* #### We don't handle type `d' correctly yet.  */
-      if (!u->ftp_type || toupper (u->ftp_type) == 'D')
+      if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
         u->ftp_type = 'I';
      }
    DEBUGP (("opath %s -> ", u->path));
@@ -627,6 +631,7 @@ str_url (const struct urlinfo *u, int hide)
  {
    char *res, *host, *user, *passwd, *proto_name, *dir, *file;
    int i, l, ln, lu, lh, lp, lf, ld;
+  unsigned short proto_default_port;
  
    /* Look for the protocol name.  */
    for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
@@ -635,6 +640,7 @@ str_url (const struct urlinfo *u, int hide)
    if (i == ARRAY_SIZE (sup_protos))
      return NULL;
    proto_name = sup_protos[i].name;
+  proto_default_port = sup_protos[i].port;
    host = CLEANDUP (u->host);
    dir = CLEANDUP (u->dir);
    file = CLEANDUP (u->file);
@@ -689,9 +695,12 @@ str_url (const struct urlinfo *u, int hide)
      }
    memcpy (res + l, host, lh);
    l += lh;
-  res[l++] = ':';
-  long_to_string (res + l, (long)u->port);
-  l += numdigit (u->port);
+  if (u->port != proto_default_port)
+    {
+      res[l++] = ':';
+      long_to_string (res + l, (long)u->port);
+      l += numdigit (u->port);
+    }
    res[l++] = '/';
    memcpy (res + l, dir, ld);
    l += ld;
@@ -821,7 +830,8 @@ get_urls_file (const char *file)
  
     If SILENT is non-zero, do not barf on baseless relative links.  */
  urlpos *
-get_urls_html (const char *file, const char *this_url, int silent)
+get_urls_html (const char *file, const char *this_url, int silent,
+              int dash_p_leaf_HTML)
  {
    long nread;
    FILE *fp;
@@ -850,7 +860,8 @@ get_urls_html (const char *file, const char *this_url, int silent)
    first_time = 1;
    /* Iterate over the URLs in BUF, picked by htmlfindurl().  */
    for (buf = orig_buf;
-       (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time));
+       (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time,
+                          dash_p_leaf_HTML));
         buf += step)
      {
        int i, no_proto;
@@ -1366,10 +1377,10 @@ no_proxy_match (const char *host, const char **no_proxy)
  void
  convert_links (const char *file, urlpos *l)
  {
-  FILE           *fp;
-  char           *buf, *p, *p2;
-  long           size;
-  static slist*  converted_files = NULL;
+  FILE               *fp;
+  char               *buf, *p, *p2;
+  downloaded_file_t  downloaded_file_return;
+  long               size;
  
    logprintf (LOG_VERBOSE, _("Converting %s... "), file);
    /* Read from the file....  */
@@ -1383,28 +1394,53 @@ convert_links (const char *file, urlpos *l)
    /* ...to a buffer.  */
    load_file (fp, &buf, &size);
    fclose (fp);
-  if (opt.backup_converted)
+
+  downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file);
+
+  if (opt.backup_converted && downloaded_file_return)
      /* Rather than just writing over the original .html file with the converted
-       version, save the former to *.orig. */
+       version, save the former to *.orig.  Note we only do this for files we've
+       _successfully_ downloaded, so we don't clobber .orig files sitting around
+       from previous invocations. */
      {
        /* Construct the backup filename as the original name plus ".orig". */
-      size_t filename_len = strlen(file);
-      char*  filename_plus_orig_suffix = malloc(filename_len + sizeof(".orig"));
-      int    already_wrote_backup_file = 0;
-      slist* converted_file_ptr;
+      size_t         filename_len = strlen(file);
+      char*          filename_plus_orig_suffix;
+      boolean        already_wrote_backup_file = FALSE;
+      slist*         converted_file_ptr;
+      static slist*  converted_files = NULL;
  
-      strcpy(filename_plus_orig_suffix, file);
-      strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+      if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
+       {
+         /* Just write "orig" over "html".  We need to do it this way because
+            when we're checking to see if we've downloaded the file before (to
+            see if we can skip downloading it), we don't know if it's a
+            text/html file.  Therefore we don't know yet at that stage that -E
+            is going to cause us to tack on ".html", so we need to compare
+            vs. the original URL plus ".orig", not the original URL plus
+            ".html.orig". */
+         filename_plus_orig_suffix = xmalloc(filename_len + 1);
+         strcpy(filename_plus_orig_suffix, file);
+         strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
+       }
+      else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
+       {
+         /* Append ".orig" to the name. */
+         filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig"));
+         strcpy(filename_plus_orig_suffix, file);
+         strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+       }
  
        /* We can get called twice on the same URL thanks to the
          convert_all_links() call in main().  If we write the .orig file each
          time in such a case, it'll end up containing the first-pass conversion,
-        not the original file. */
+        not the original file.  So, see if we've already been called on this
+        file. */
        converted_file_ptr = converted_files;
        while (converted_file_ptr != NULL)
         if (strcmp(converted_file_ptr->string, file) == 0)
           {
-           already_wrote_backup_file = 1;
+           already_wrote_backup_file = TRUE;
             break;
           }
         else
@@ -1421,10 +1457,13 @@ convert_links (const char *file, urlpos *l)
              Note that we never free this memory since we need it till the
              convert_all_links() call, which is one of the last things the
              program does before terminating.  BTW, I'm not sure if it would be
-            safe to just set converted_file_ptr->string to file below, rather
-            than making a copy of the string... */
-         converted_file_ptr = malloc(sizeof(slist));
-         converted_file_ptr->string = strdup(file);
+            safe to just set 'converted_file_ptr->string' to 'file' below,
+            rather than making a copy of the string...  Another note is that I
+            thought I could just add a field to the urlpos structure saying
+            that we'd written a .orig file for this URL, but that didn't work,
+            so I had to make this separate list. */
+         converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
+         converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
           converted_file_ptr->next = converted_files;
           converted_files = converted_file_ptr;
         }
@@ -1440,6 +1479,8 @@ convert_links (const char *file, urlpos *l)
        free (buf);
        return;
      }
+  /* Presumably we have to loop through multiple URLs here (even though we're
+     only talking about a single local file) because of the -O option. */
    for (p = buf; l; l = l->next)
      {
        if (l->pos >= size)
@@ -1461,6 +1502,7 @@ convert_links (const char *file, urlpos *l)
        for (p2 = buf + l->pos; p < p2; p++)
         putc (*p, fp);
        if (l->flags & UABS2REL)
+       /* Convert absolute URL to relative. */
         {
           char *newname = construct_relative (file, l->local_name);
           fprintf (fp, "%s", newname);
@@ -1470,6 +1512,7 @@ convert_links (const char *file, urlpos *l)
         }
        p += l->size;
      }
+  /* Output the rest of the file. */
    if (p - buf < size)
      {
        for (p2 = buf + size; p < p2; p++)
@@ -1547,3 +1590,56 @@ add_url (urlpos *l, const char *url, const char *file)
    t->next = l;
    return t;
  }
+
+
+/* Remembers which files have been downloaded.  In the standard case, should be
+   called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
+   download successfully (i.e. not for ones we have failures on or that we skip
+   due to -N).  When we've downloaded a file and tacked on a ".html" extension
+   due to -E, call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED
+   rather than FILE_DOWNLOADED_NORMALLY.  If you just want to check if a file
+   has been previously added without adding it, call with mode ==
+   CHECK_FOR_FILE.  Please be sure to call this function with local filenames,
+   not remote URLs.
+   Returns the mode FILE was first recorded with if it is already on the list;
+   otherwise returns FILE_NOT_ALREADY_DOWNLOADED (after recording FILE with
+   MODE, unless MODE is CHECK_FOR_FILE). */
+downloaded_file_t
+downloaded_file (downloaded_file_t  mode, const char*  file)
+{
+  typedef struct _downloaded_file_list
+  {
+    char*                          file;
+    downloaded_file_t              download_type;
+    struct _downloaded_file_list*  next;
+  } downloaded_file_list;
+  
+  boolean                       found_file = FALSE;
+  static downloaded_file_list*  downloaded_files = NULL;
+  downloaded_file_list*         rover = downloaded_files;
+
+  while (rover != NULL)  /* linear search for an exact name match */
+    if (strcmp(rover->file, file) == 0)
+      {
+       found_file = TRUE;
+       break;
+      }
+    else
+      rover = rover->next;
+
+  if (found_file)
+    return rover->download_type;  /* file had already been downloaded */
+  else
+    {
+      if (mode != CHECK_FOR_FILE)  /* a pure query must not add an entry */
+       {
+         rover = xmalloc(sizeof(*rover));
+         rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
+         rover->download_type = mode;
+         rover->next = downloaded_files;  /* push onto the head of the list */
+         downloaded_files = rover;
+       }
+
+      return FILE_NOT_ALREADY_DOWNLOADED;
+    }
+}