From 504effdc1358285e50e3a4a8bd3fdee47b1e294a Mon Sep 17 00:00:00 2001 From: hniksic Date: Sun, 2 Nov 2003 06:57:31 -0800 Subject: [PATCH] [svn] Improved support for entities. --- src/ChangeLog | 9 +++ src/html-parse.c | 171 +++++++++++++++++++++++++++-------------------- 2 files changed, 109 insertions(+), 71 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 4b17fda2..b63d7ce1 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,12 @@ +2003-11-02 Hrvoje Niksic + + * html-parse.c (decode_entity): New function; split the decoding + of entities here. + (convert_and_copy): Use it to decode entities. + (decode_entity): Handle the &apos entity. + (decode_entity): Don't decode Latin 1 numeric entities. Don't + decode &#0;. + 2003-11-01 Hrvoje Niksic * ftp-opie.c (calculate_skey_response): Use uint32_t instead of diff --git a/src/html-parse.c b/src/html-parse.c index 71bc1a93..ea632546 100644 --- a/src/html-parse.c +++ b/src/html-parse.c @@ -257,23 +257,108 @@ struct pool { } \ } while (0) -#define AP_DOWNCASE 1 -#define AP_PROCESS_ENTITIES 2 -#define AP_TRIM_BLANKS 4 +/* Test whether n+1-sized entity name fits in P. We don't support + IE-style non-terminated entities, e.g. "<foo" -> "= (sizeof (x) - 1) \ - && 0 == memcmp (from, x, sizeof (x) - 1) \ - && (*(from + sizeof (x) - 1) == ';' \ - || remain == sizeof (x) - 1 \ - || !ISALNUM (*(from + sizeof (x) - 1)))) - else if (FROB ("lt")) - *to++ = '<', from += 2; - else if (FROB ("gt")) - *to++ = '>', from += 2; - else if (FROB ("amp")) - *to++ = '&', from += 3; - else if (FROB ("quot")) - *to++ = '\"', from += 4; - /* We don't implement the proposed "Added Latin 1" - entities (except for nbsp), because it is unnecessary - in the context of Wget, and would require hashing to - work efficiently. 
*/ - else if (FROB ("nbsp")) - *to++ = 160, from += 4; + int entity = decode_entity (&from, end); + if (entity != -1) + *to++ = entity; else - goto lose; -#undef FROB - /* If the entity was followed by `;', we step over the - `;'. Otherwise, it was followed by either a - non-alphanumeric or EOB, in which case we do nothing. */ - if (from < end && *from == ';') - ++from; - continue; - - lose: - /* This was not an entity after all. Back out. */ - from = save; - *to++ = *from++; + *to++ = *from++; } } /* Verify that we haven't exceeded the original size. (It @@ -897,7 +926,7 @@ map_html_tags (const char *text, int size, goto look_for_tag; attr_raw_value_end = p; /* */ /* ^ */ - operation = AP_PROCESS_ENTITIES; + operation = AP_DECODE_ENTITIES; if (flags & MHT_TRIM_VALUES) operation |= AP_TRIM_BLANKS; } @@ -920,7 +949,7 @@ map_html_tags (const char *text, int size, goto backout_tag; attr_raw_value_begin = attr_value_begin; attr_raw_value_end = attr_value_end; - operation = AP_PROCESS_ENTITIES; + operation = AP_DECODE_ENTITIES; } } else -- 2.39.2