X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-parse.c;h=5033f8e3926be042c3b2c44dd18c27bea6112420;hb=4d7c5e087b2bc82c9f503dff003916d1047903ce;hp=88aec75341f7d0553af4b83843eff698817a39ea;hpb=233ebb78de296361d5a25d0856e0957de1058f15;p=wget diff --git a/src/html-parse.c b/src/html-parse.c index 88aec753..5033f8e3 100644 --- a/src/html-parse.c +++ b/src/html-parse.c @@ -1,11 +1,11 @@ /* HTML parser for Wget. - Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc. + Copyright (C) 1998-2006 Free Software Foundation, Inc. This file is part of GNU Wget. GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at +the Free Software Foundation; either version 3 of the License, or (at your option) any later version. GNU Wget is distributed in the hope that it will be useful, @@ -14,8 +14,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with Wget; if not, write to the Free Software -Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +along with Wget. If not, see . In addition, as a special exception, the Free Software Foundation gives permission to link the code of its release of Wget with the @@ -96,11 +95,7 @@ so, delete this exception statement from your version. */ #include #include -#ifdef HAVE_STRING_H -# include -#else -# include -#endif +#include #include #include "wget.h" @@ -157,7 +152,7 @@ struct pool { char *contents; /* pointer to the contents. */ int size; /* size of the pool. */ int tail; /* next available position index. */ - int resized; /* whether the pool has been resized + bool resized; /* whether the pool has been resized using malloc. */ char *orig_contents; /* original pool contents, usually @@ -174,7 +169,7 @@ struct pool { P->contents = (initial_storage); \ P->size = (initial_size); \ P->tail = 0; \ - P->resized = 0; \ + P->resized = false; \ P->orig_contents = P->contents; \ P->orig_size = P->size; \ } while (0) @@ -222,7 +217,7 @@ struct pool { P->contents = P->orig_contents; \ P->size = P->orig_size; \ P->tail = 0; \ - P->resized = 0; \ + P->resized = false; \ } while (0) /* Used for small stack-allocated memory chunks that might grow. Like @@ -245,45 +240,131 @@ struct pool { if (ga_newsize != (sizevar)) \ { \ if (resized) \ - basevar = (type *)xrealloc (basevar, ga_newsize * sizeof (type)); \ + basevar = xrealloc (basevar, ga_newsize * sizeof (type)); \ else \ { \ void *ga_new = xmalloc (ga_newsize * sizeof (type)); \ memcpy (ga_new, basevar, (sizevar) * sizeof (type)); \ (basevar) = ga_new; \ - resized = 1; \ + resized = true; \ } \ (sizevar) = ga_newsize; \ } \ } while (0) -#define AP_DOWNCASE 1 -#define AP_PROCESS_ENTITIES 2 -#define AP_TRIM_BLANKS 4 +/* Test whether n+1-sized entity name fits in P. We don't support + IE-style non-terminated entities, e.g. "<foo" -> "tail; - int size; - /* First, skip blanks if required. We must do this before entities - are processed, so that blanks can still be inserted as, for - instance, ` '. */ + /* Skip blanks if required. We must do this before entities are + processed, so that blanks can still be inserted as, for instance, + ` '. */ if (flags & AP_TRIM_BLANKS) { while (beg < end && ISSPACE (*beg)) @@ -291,9 +372,8 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags while (end > beg && ISSPACE (end[-1])) --end; } - size = end - beg; - if (flags & AP_PROCESS_ENTITIES) + if (flags & AP_DECODE_ENTITIES) { /* Grow the pool, then copy the text to the pool character by character, processing the encountered entities as we go @@ -304,78 +384,25 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags never lengthen it. */ const char *from = beg; char *to; + bool squash_newlines = !!(flags & AP_TRIM_BLANKS); POOL_GROW (pool, end - beg); to = pool->contents + pool->tail; while (from < end) { - if (*from != '&') - *to++ = *from++; - else + if (*from == '&') { - const char *save = from; - int remain; - - if (++from == end) - goto lose; - remain = end - from; - - /* Process numeric entities "&#DDD;" and "&#xHH;". */ - if (*from == '#') - { - int numeric = 0, digits = 0; - ++from; - if (*from == 'x') - { - ++from; - for (; from < end && ISXDIGIT (*from); from++, digits++) - numeric = (numeric << 4) + XDIGIT_TO_NUM (*from); - } - else - { - for (; from < end && ISDIGIT (*from); from++, digits++) - numeric = (numeric * 10) + (*from - '0'); - } - if (!digits) - goto lose; - numeric &= 0xff; - *to++ = numeric; - } -#define FROB(x) (remain >= (sizeof (x) - 1) \ - && 0 == memcmp (from, x, sizeof (x) - 1) \ - && (*(from + sizeof (x) - 1) == ';' \ - || remain == sizeof (x) - 1 \ - || !ISALNUM (*(from + sizeof (x) - 1)))) - else if (FROB ("lt")) - *to++ = '<', from += 2; - else if (FROB ("gt")) - *to++ = '>', from += 2; - else if (FROB ("amp")) - *to++ = '&', from += 3; - else if (FROB ("quot")) - *to++ = '\"', from += 4; - /* We don't implement the proposed "Added Latin 1" - entities (except for nbsp), because it is unnecessary - in the context of Wget, and would require hashing to - work efficiently. */ - else if (FROB ("nbsp")) - *to++ = 160, from += 4; + int entity = decode_entity (&from, end); + if (entity != -1) + *to++ = entity; else - goto lose; -#undef FROB - /* If the entity was followed by `;', we step over the - `;'. Otherwise, it was followed by either a - non-alphanumeric or EOB, in which case we do nothing. */ - if (from < end && *from == ';') - ++from; - continue; - - lose: - /* This was not an entity after all. Back out. */ - from = save; - *to++ = *from++; + *to++ = *from++; } + else if ((*from == '\n' || *from == '\r') && squash_newlines) + ++from; + else + *to++ = *from++; } /* Verify that we haven't exceeded the original size. (It shouldn't happen, hence the assert.) */ @@ -462,7 +489,7 @@ advance_declaration (const char *beg, const char *end) AC_S_DASH4, AC_S_QUOTE1, AC_S_IN_QUOTE, - AC_S_QUOTE2, + AC_S_QUOTE2 } state = AC_S_BANG; if (beg == end) @@ -652,15 +679,15 @@ find_comment_end (const char *beg, const char *end) return NULL; } -/* Return non-zero of the string inside [b, e) are present in hash - table HT. */ +/* Return true if the string containing of characters inside [b, e) is + present in hash table HT. */ -static int +static bool name_allowed (const struct hash_table *ht, const char *b, const char *e) { char *copy; if (!ht) - return 1; + return true; BOUNDED_TO_ALLOCA (b, e, copy); return hash_table_get (ht, copy) != NULL; } @@ -698,17 +725,15 @@ static int tag_backout_count; MAPFUN will be called with two arguments: pointer to an initialized struct taginfo, and MAPARG. - ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to - be processed by this function. If it is NULL, all the tags are - allowed. The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES. + ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of + which are the tags and attribute names that this function should + use. If ALLOWED_TAGS is NULL, all tags are processed; if + ALLOWED_ATTRIBUTES is NULL, all attributes are returned. (Obviously, the caller can filter out unwanted tags and attributes just as well, but this is just an optimization designed to avoid - unnecessary copying for tags/attributes which the caller doesn't - want to know about. These lists are searched linearly; therefore, - if you're interested in a large number of tags or attributes, you'd - better set these to NULL and filter them out yourself with a - hashing process most appropriate for your application.) */ + unnecessary copying of tags/attributes which the caller doesn't + care about.) */ void map_html_tags (const char *text, int size, @@ -727,7 +752,7 @@ map_html_tags (const char *text, int size, struct attr_pair attr_pair_initial_storage[8]; int attr_pair_size = countof (attr_pair_initial_storage); - int attr_pair_resized = 0; + bool attr_pair_resized = false; struct attr_pair *pairs = attr_pair_initial_storage; if (!size) @@ -739,7 +764,7 @@ map_html_tags (const char *text, int size, int nattrs, end_tag; const char *tag_name_begin, *tag_name_end; const char *tag_start_position; - int uninteresting_tag; + bool uninteresting_tag; look_for_tag: POOL_REWIND (&pool); @@ -802,10 +827,10 @@ map_html_tags (const char *text, int size, if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end)) /* We can't just say "goto look_for_tag" here because we need the loop below to properly advance over the tag's attributes. */ - uninteresting_tag = 1; + uninteresting_tag = true; else { - uninteresting_tag = 0; + uninteresting_tag = false; convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE); } @@ -864,7 +889,7 @@ map_html_tags (const char *text, int size, SKIP_WS (p); if (*p == '\"' || *p == '\'') { - int newline_seen = 0; + bool newline_seen = false; char quote_char = *p; attr_raw_value_begin = p; ADVANCE (p); @@ -882,7 +907,7 @@ map_html_tags (const char *text, int size, comes first. Such a tag terminated at `>' is discarded. */ p = attr_value_begin; - newline_seen = 1; + newline_seen = true; continue; } else if (newline_seen && *p == '>') @@ -897,7 +922,7 @@ map_html_tags (const char *text, int size, goto look_for_tag; attr_raw_value_end = p; /* */ /* ^ */ - operation = AP_PROCESS_ENTITIES; + operation = AP_DECODE_ENTITIES; if (flags & MHT_TRIM_VALUES) operation |= AP_TRIM_BLANKS; } @@ -920,7 +945,7 @@ map_html_tags (const char *text, int size, goto backout_tag; attr_raw_value_begin = attr_value_begin; attr_raw_value_end = attr_value_end; - operation = AP_PROCESS_ENTITIES; + operation = AP_DECODE_ENTITIES; } } else @@ -984,8 +1009,7 @@ map_html_tags (const char *text, int size, taginfo.attrs = pairs; taginfo.start_position = tag_start_position; taginfo.end_position = p + 1; - /* Ta-dam! */ - (*mapfun) (&taginfo, maparg); + mapfun (&taginfo, maparg); ADVANCE (p); } goto look_for_tag; @@ -1026,7 +1050,7 @@ test_mapper (struct taginfo *taginfo, void *arg) int main () { int size = 256; - char *x = (char *)xmalloc (size); + char *x = xmalloc (size); int length = 0; int read_count; int tag_counter = 0; @@ -1035,7 +1059,7 @@ int main () { length += read_count; size <<= 1; - x = (char *)xrealloc (x, size); + x = xrealloc (x, size); } map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);