X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-parse.c;h=4a0a525771a97d173ee10bd713741e2c5f7a11f4;hb=277e840a0f8e3ec8800cfe7407fe3c16000bc622;hp=88aec75341f7d0553af4b83843eff698817a39ea;hpb=233ebb78de296361d5a25d0856e0957de1058f15;p=wget diff --git a/src/html-parse.c b/src/html-parse.c index 88aec753..4a0a5257 100644 --- a/src/html-parse.c +++ b/src/html-parse.c @@ -96,11 +96,7 @@ so, delete this exception statement from your version. */ #include #include -#ifdef HAVE_STRING_H -# include -#else -# include -#endif +#include #include #include "wget.h" @@ -257,33 +253,119 @@ struct pool { } \ } while (0) -#define AP_DOWNCASE 1 -#define AP_PROCESS_ENTITIES 2 -#define AP_TRIM_BLANKS 4 +/* Test whether n+1-sized entity name fits in P. We don't support + IE-style non-terminated entities, e.g. "<foo" -> "tail; - int size; - /* First, skip blanks if required. We must do this before entities - are processed, so that blanks can still be inserted as, for - instance, ` '. */ + /* Skip blanks if required. We must do this before entities are + processed, so that blanks can still be inserted as, for instance, + ` '. */ if (flags & AP_TRIM_BLANKS) { while (beg < end && ISSPACE (*beg)) @@ -291,9 +373,8 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags while (end > beg && ISSPACE (end[-1])) --end; } - size = end - beg; - if (flags & AP_PROCESS_ENTITIES) + if (flags & AP_DECODE_ENTITIES) { /* Grow the pool, then copy the text to the pool character by character, processing the encountered entities as we go @@ -304,78 +385,25 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags never lengthen it. */ const char *from = beg; char *to; + int squash_newlines = flags & AP_TRIM_BLANKS; POOL_GROW (pool, end - beg); to = pool->contents + pool->tail; while (from < end) { - if (*from != '&') - *to++ = *from++; - else + if (*from == '&') { - const char *save = from; - int remain; - - if (++from == end) - goto lose; - remain = end - from; - - /* Process numeric entities "&#DDD;" and "&#xHH;". */ - if (*from == '#') - { - int numeric = 0, digits = 0; - ++from; - if (*from == 'x') - { - ++from; - for (; from < end && ISXDIGIT (*from); from++, digits++) - numeric = (numeric << 4) + XDIGIT_TO_NUM (*from); - } - else - { - for (; from < end && ISDIGIT (*from); from++, digits++) - numeric = (numeric * 10) + (*from - '0'); - } - if (!digits) - goto lose; - numeric &= 0xff; - *to++ = numeric; - } -#define FROB(x) (remain >= (sizeof (x) - 1) \ - && 0 == memcmp (from, x, sizeof (x) - 1) \ - && (*(from + sizeof (x) - 1) == ';' \ - || remain == sizeof (x) - 1 \ - || !ISALNUM (*(from + sizeof (x) - 1)))) - else if (FROB ("lt")) - *to++ = '<', from += 2; - else if (FROB ("gt")) - *to++ = '>', from += 2; - else if (FROB ("amp")) - *to++ = '&', from += 3; - else if (FROB ("quot")) - *to++ = '\"', from += 4; - /* We don't implement the proposed "Added Latin 1" - entities (except for nbsp), because it is unnecessary - in the context of Wget, and would require hashing to - work efficiently. */ - else if (FROB ("nbsp")) - *to++ = 160, from += 4; + int entity = decode_entity (&from, end); + if (entity != -1) + *to++ = entity; else - goto lose; -#undef FROB - /* If the entity was followed by `;', we step over the - `;'. Otherwise, it was followed by either a - non-alphanumeric or EOB, in which case we do nothing. */ - if (from < end && *from == ';') - ++from; - continue; - - lose: - /* This was not an entity after all. Back out. */ - from = save; - *to++ = *from++; + *to++ = *from++; } + else if ((*from == '\n' || *from == '\r') && squash_newlines) + ++from; + else + *to++ = *from++; } /* Verify that we haven't exceeded the original size. (It shouldn't happen, hence the assert.) */ @@ -462,7 +490,7 @@ advance_declaration (const char *beg, const char *end) AC_S_DASH4, AC_S_QUOTE1, AC_S_IN_QUOTE, - AC_S_QUOTE2, + AC_S_QUOTE2 } state = AC_S_BANG; if (beg == end) @@ -698,17 +726,15 @@ static int tag_backout_count; MAPFUN will be called with two arguments: pointer to an initialized struct taginfo, and MAPARG. - ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to - be processed by this function. If it is NULL, all the tags are - allowed. The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES. + ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of + which are the tags and attribute names that this function should + use. If ALLOWED_TAGS is NULL, all tags are processed; if + ALLOWED_ATTRIBUTES is NULL, all attributes are returned. (Obviously, the caller can filter out unwanted tags and attributes just as well, but this is just an optimization designed to avoid - unnecessary copying for tags/attributes which the caller doesn't - want to know about. These lists are searched linearly; therefore, - if you're interested in a large number of tags or attributes, you'd - better set these to NULL and filter them out yourself with a - hashing process most appropriate for your application.) */ + unnecessary copying of tags/attributes which the caller doesn't + care about.) */ void map_html_tags (const char *text, int size, @@ -897,7 +923,7 @@ map_html_tags (const char *text, int size, goto look_for_tag; attr_raw_value_end = p; /* */ /* ^ */ - operation = AP_PROCESS_ENTITIES; + operation = AP_DECODE_ENTITIES; if (flags & MHT_TRIM_VALUES) operation |= AP_TRIM_BLANKS; } @@ -920,7 +946,7 @@ map_html_tags (const char *text, int size, goto backout_tag; attr_raw_value_begin = attr_value_begin; attr_raw_value_end = attr_value_end; - operation = AP_PROCESS_ENTITIES; + operation = AP_DECODE_ENTITIES; } } else @@ -984,8 +1010,7 @@ map_html_tags (const char *text, int size, taginfo.attrs = pairs; taginfo.start_position = tag_start_position; taginfo.end_position = p + 1; - /* Ta-dam! */ - (*mapfun) (&taginfo, maparg); + mapfun (&taginfo, maparg); ADVANCE (p); } goto look_for_tag;