From 5fa53b5a1debb7cd33b260292060d7a45fed282a Mon Sep 17 00:00:00 2001 From: hniksic Date: Tue, 10 Apr 2001 09:04:18 -0700 Subject: [PATCH] [svn] Implement better sorting and matching of cookies. Fix previously broken command-line options. Published in . --- src/ChangeLog | 12 +++ src/cookies.c | 210 ++++++++++++++++++++++++++++++++++++-------------- src/http.c | 7 ++ src/main.c | 19 +---- 4 files changed, 177 insertions(+), 71 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 5f5cba3b..1b2bc25d 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,15 @@ +2001-04-10 Hrvoje Niksic + + * cookies.c (build_cookies_request): Use and sort cookies from all + matching domains. + (build_cookies_request): Check for duplicates before generating + the `Cookies' header. + + * main.c (main): Don't load cookies here. + (main): Make loadcookies and savecookies call the correct command. + + * http.c (http_loop): Load cookies on-demand. + 2001-04-09 Hrvoje Niksic * http.c (gethttp): Fix indentation of SSL ifdef. diff --git a/src/cookies.c b/src/cookies.c index 94660b1b..8c44267e 100644 --- a/src/cookies.c +++ b/src/cookies.c @@ -646,7 +646,7 @@ parse_set_cookies (const char *sc) delete_cookie (cookie); if (state == S_ERROR) - logprintf (LOG_NOTQUIET, _("Error in Set-Cookie, at character `%c'.\n"), c); + logprintf (LOG_NOTQUIET, _("Syntax error in Set-Cookie at character `%c'.\n"), c); else abort (); return NULL; @@ -654,7 +654,7 @@ parse_set_cookies (const char *sc) eof: delete_cookie (cookie); logprintf (LOG_NOTQUIET, - _("Error in Set-Cookie: premature end of string.\n")); + _("Syntax error in Set-Cookie: premature end of string.\n")); return NULL; } @@ -864,24 +864,39 @@ count_char (const char *string, char chr) return count; } -/* Return the head of the cookie chain that matches HOST. */ +/* Store CHAIN to STORE if there is room in STORE. If not, inrecement + COUNT anyway, so that when the function is done, we end up with the + exact count of how much place we actually need. */ -static struct cookie * -find_cookie_chain (const char *host, int port) +#define STORE_CHAIN(st_chain, st_store, st_size, st_count) do { \ + if (st_count < st_size) \ + store[st_count] = st_chain; \ + ++st_count; \ +} while (0) + +/* Store cookie chains that match HOST, PORT. Since more than one + chain can match, the matches are written to STORE. No more than + SIZE matches are written; if more matches are present, return the + number of chains that would have been written. */ + +int +find_matching_chains (const char *host, int port, + struct cookie *store[], int size) { + struct cookie *chain; int dot_count; char *hash_key; - struct cookie *chain = NULL; + int count = 0; if (!cookies_hash_table) - return NULL; + return 0; SET_HOSTPORT (host, port, hash_key); /* Exact match. */ chain = hash_table_get (cookies_hash_table, hash_key); if (chain) - return chain; + STORE_CHAIN (chain, store, size, count); dot_count = count_char (host, '.'); @@ -896,10 +911,10 @@ find_cookie_chain (const char *host, int port) assert (p != NULL); chain = hash_table_get (cookies_hash_table, p); if (chain) - return chain; + STORE_CHAIN (chain, store, size, count); hash_key = p + 1; } - return NULL; + return count; } /* If FULL_PATH begins with PREFIX, return the length of PREFIX, zero @@ -917,19 +932,71 @@ path_matches (const char *full_path, const char *prefix) return len; } +static int +matching_cookie (const struct cookie *cookie, const char *path, + int connection_secure_p, int *path_goodness) +{ + int pg; + + if (cookie->expiry_time < cookies_now) + /* Ignore stale cookies. There is no need to unchain the cookie + at this point -- Wget is a relatively short-lived application, + and stale cookies will not be saved by `save_cookies'. */ + return 0; + if (cookie->secure && !connection_secure_p) + /* Don't transmit secure cookies over an insecure connection. */ + return 0; + pg = path_matches (path, cookie->path); + if (!pg) + return 0; + + if (path_goodness) + /* If the caller requested path_goodness, we return it. This is + an optimization, so that the caller doesn't need to call + path_matches() again. */ + *path_goodness = pg; + return 1; +} + struct weighed_cookie { struct cookie *cookie; + int domain_goodness; int path_goodness; }; +/* Comparator used for uniquifying the list. */ + +static int +equality_comparator (const void *p1, const void *p2) +{ + struct weighed_cookie *wc1 = (struct weighed_cookie *)p1; + struct weighed_cookie *wc2 = (struct weighed_cookie *)p2; + + int namecmp = strcmp (wc1->cookie->attr, wc2->cookie->attr); + int valuecmp = strcmp (wc1->cookie->value, wc2->cookie->value); + + /* We only really care whether both name and value are equal. We + return them in this order only for consistency... */ + return namecmp ? namecmp : valuecmp; +} + +/* Comparator used for sorting by quality. */ + static int goodness_comparator (const void *p1, const void *p2) { struct weighed_cookie *wc1 = (struct weighed_cookie *)p1; struct weighed_cookie *wc2 = (struct weighed_cookie *)p2; - /* It's goodness2-goodness1 because we want a sort in *decreasing* - order of goodness. */ - return wc2->path_goodness - wc1->path_goodness; + + /* Subtractions take `wc2' as the first argument becauase we want a + sort in *decreasing* order of goodness. */ + int dgdiff = wc2->domain_goodness - wc1->domain_goodness; + int pgdiff = wc2->path_goodness - wc1->path_goodness; + + /* Sort by domain goodness; if these are the same, sort by path + goodness. (The sorting order isn't really specified; maybe it + should be the other way around.) */ + return dgdiff ? dgdiff : pgdiff; } /* Build a `Cookies' header for a request that goes to HOST:PORT and @@ -942,70 +1009,97 @@ char * build_cookies_request (const char *host, int port, const char *path, int connection_secure_p) { - struct cookie *chain = find_cookie_chain (host, port); + struct cookie *chain_default_store[20]; + struct cookie **all_chains = chain_default_store; + int chain_store_size = ARRAY_SIZE (chain_default_store); + int chain_count; + struct cookie *cookie; struct weighed_cookie *outgoing; - int count, i; + int count, i, ocnt; char *result; int result_size, pos; - if (!chain) + again: + chain_count = find_matching_chains (host, port, all_chains, chain_store_size); + if (chain_count > chain_store_size) + { + /* It's extremely unlikely that more than 20 chains will ever + match. But in this case it's easy to not have the + limitation, so we don't. */ + all_chains = alloca (chain_count * sizeof (struct cookie *)); + goto again; + } + + if (!chain_count) return NULL; cookies_now = time (NULL); /* Count the number of cookies whose path matches. */ count = 0; - result_size = 0; - for (cookie = chain; cookie; cookie = cookie->next) - { - if (cookie->expiry_time < cookies_now) - /* Ignore stale cookies. There is no need to unchain the - cookie at this point -- Wget is a relatively short-lived - application, and stale cookies will not be saved by - `save_cookies'. */ - continue; - if (cookie->secure && !connection_secure_p) - /* Don't transmit secure cookies over an insecure - connection. */ - continue; - if (path_matches (path, cookie->path)) - { - ++count; - /* name=value */ - result_size += strlen (cookie->attr) + 1 + strlen (cookie->value); - } - } + for (i = 0; i < chain_count; i++) + for (cookie = all_chains[i]; cookie; cookie = cookie->next) + if (matching_cookie (cookie, path, connection_secure_p, NULL)) + ++count; if (!count) + /* No matching cookies. */ return NULL; /* Allocate the array. */ outgoing = alloca (count * sizeof (struct weighed_cookie)); - i = 0; - for (cookie = chain; cookie; cookie = cookie->next) + + ocnt = 0; + for (i = 0; i < chain_count; i++) + for (cookie = all_chains[i]; cookie; cookie = cookie->next) + { + int pg; + if (!matching_cookie (cookie, path, connection_secure_p, &pg)) + continue; + outgoing[ocnt].cookie = cookie; + outgoing[ocnt].domain_goodness = strlen (cookie->domain); + outgoing[ocnt].path_goodness = pg; + ++ocnt; + } + assert (ocnt == count); + + /* Eliminate duplicate cookies; that is, those whose name and value + are the same. We do it by first sorting the array, and then + uniq'ing it. */ + qsort (outgoing, count, sizeof (struct weighed_cookie), equality_comparator); + for (i = 0; i < count - 1; i++) { - int goodness; - /* #### These two if's are repeated verbatim from the loop - above. Should I put them in a separate function? */ - if (cookie->expiry_time < cookies_now) - continue; - if (cookie->secure && !connection_secure_p) - /* Don't transmit secure cookies over an insecure - connection. */ - continue; - goodness = path_matches (path, cookie->path); - if (!goodness) - continue; - outgoing[i].cookie = cookie; - outgoing[i].path_goodness = goodness; - ++i; + struct cookie *c1 = outgoing[i].cookie; + struct cookie *c2 = outgoing[i + 1].cookie; + if (!strcmp (c1->attr, c2->attr) && !strcmp (c1->value, c2->value)) + { + /* c1 and c2 are the same; get rid of c2. */ + if (count > i + 1) + /* move all ptrs from positions [i + 1, count) to i. */ + memmove (outgoing + i, outgoing + i + 1, + (count - (i + 1)) * sizeof (struct weighed_cookie)); + /* We decrement i to counter the ++i above. Remember that + we've just removed the element in front of us; we need to + remain in place to check whether outgoing[i] what used to + be outgoing[i + 2]. */ + --i; + --count; + } } - assert (i == count); - /* Sort the array so that paths that match our path better come - first. */ + /* Sort the array so that best-matching domains come first, and + that, within one domain, best-matching paths come first. */ qsort (outgoing, count, sizeof (struct weighed_cookie), goodness_comparator); + /* Count the space the name=value pairs will take. */ + result_size = 0; + for (i = 0; i < count; i++) + { + struct cookie *c = outgoing[i].cookie; + /* name=value */ + result_size += strlen (c->attr) + 1 + strlen (c->value); + } + /* Allocate output buffer: "Cookie: " -- 8 name=value pairs -- result_size @@ -1252,6 +1346,8 @@ save_cookies (const char *file) /* no cookies stored; nothing to do. */ return; + DEBUGP (("Saving cookies to %s.\n", file)); + cookies_now = time (NULL); fp = fopen (file, "w"); @@ -1275,6 +1371,8 @@ save_cookies (const char *file) if (fclose (fp) < 0) logprintf (LOG_NOTQUIET, _("Error closing `%s': %s\n"), file, strerror (errno)); + + DEBUGP (("Done saving cookies.\n", file)); } static int diff --git a/src/http.c b/src/http.c index 0345e2be..83108bce 100644 --- a/src/http.c +++ b/src/http.c @@ -78,6 +78,7 @@ extern int h_errno; # endif #endif +static int cookies_loaded_p; #define TEXTHTML_S "text/html" #define HTTP_ACCEPT "*/*" @@ -1374,6 +1375,12 @@ http_loop (struct urlinfo *u, char **newloc, int *dt) struct http_stat hstat; /* HTTP status */ struct stat st; + /* This used to be done in main(), but it's a better idea to do it + here so that we don't go through the hoops if we're just using + FTP or whatever. */ + if (opt.cookies && opt.cookies_input && !cookies_loaded_p) + load_cookies (opt.cookies_input); + *newloc = NULL; /* Warn on (likely bogus) wildcard usage in HTTP. Don't use diff --git a/src/main.c b/src/main.c index 70690551..360d67e1 100644 --- a/src/main.c +++ b/src/main.c @@ -290,7 +290,6 @@ main (int argc, char *const *argv) { "base", required_argument, NULL, 'B' }, { "bind-address", required_argument, NULL, 155 }, { "cache", required_argument, NULL, 'C' }, - { "cookie-file", required_argument, NULL, 161 }, { "cut-dirs", required_argument, NULL, 145 }, { "directory-prefix", required_argument, NULL, 'P' }, { "domains", required_argument, NULL, 'D' }, @@ -308,7 +307,7 @@ main (int argc, char *const *argv) { "include-directories", required_argument, NULL, 'I' }, { "input-file", required_argument, NULL, 'i' }, { "level", required_argument, NULL, 'l' }, - { "load-cookies", required_argument, NULL, 162 }, + { "load-cookies", required_argument, NULL, 161 }, { "no", required_argument, NULL, 'n' }, { "output-document", required_argument, NULL, 'O' }, { "output-file", required_argument, NULL, 'o' }, @@ -317,7 +316,7 @@ main (int argc, char *const *argv) { "proxy-user", required_argument, NULL, 143 }, { "quota", required_argument, NULL, 'Q' }, { "reject", required_argument, NULL, 'R' }, - { "save-cookies", required_argument, NULL, 163 }, + { "save-cookies", required_argument, NULL, 162 }, { "timeout", required_argument, NULL, 'T' }, { "tries", required_argument, NULL, 't' }, { "user-agent", required_argument, NULL, 'U' }, @@ -531,17 +530,10 @@ GNU General Public License for more details.\n")); setval ("cookies", "on"); break; case 161: - setval ("cookies", "on"); - setval ("cookiein", optarg); - setval ("cookieout", optarg); + setval ("loadcookies", optarg); break; case 162: - setval ("cookies", "on"); - setval ("cookiein", optarg); - break; - case 163: - setval ("cookies", "on"); - setval ("cookieout", optarg); + setval ("savecookies", optarg); break; case 157: setval ("referer", optarg); @@ -792,9 +784,6 @@ Can't timestamp and not clobber old files at the same time.\n")); ws_startup (); #endif - if (opt.cookies_input) - load_cookies (opt.cookies_input); - /* Setup the signal handler to redirect output when hangup is received. */ #ifdef HAVE_SIGNAL -- 2.39.2