+2003-11-26 Hrvoje Niksic <hniksic@xemacs.org>
+
+ * html-parse.c (convert_and_copy): Remove embedded newlines when
+ AP_TRIM_BLANKS is specified.
+
2003-11-26 Hrvoje Niksic <hniksic@xemacs.org>
* ftp.c: Set con->csock to -1 where rbuf_uninitialize was
the ASCII range when copying the string.
* AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
- of text. */
+ of text, as well as embedded newlines. */
static void
convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
{
int old_tail = pool->tail;
- int size;
- /* First, skip blanks if required. We must do this before entities
- are processed, so that blanks can still be inserted as, for
- instance, ` '. */
+ /* Skip blanks if required. We must do this before entities are
+ processed, so that blanks can still be inserted as, for instance,
+ ` '. */
if (flags & AP_TRIM_BLANKS)
{
while (beg < end && ISSPACE (*beg))
while (end > beg && ISSPACE (end[-1]))
--end;
}
- size = end - beg;
if (flags & AP_DECODE_ENTITIES)
{
never lengthen it. */
const char *from = beg;
char *to;
+ int squash_newlines = flags & AP_TRIM_BLANKS;
POOL_GROW (pool, end - beg);
to = pool->contents + pool->tail;
while (from < end)
{
- if (*from != '&')
- *to++ = *from++;
- else
+ if (*from == '&')
{
int entity = decode_entity (&from, end);
if (entity != -1)
else
*to++ = *from++;
}
+ else if ((*from == '\n' || *from == '\r') && squash_newlines)
+ ++from;
+ else
+ *to++ = *from++;
}
/* Verify that we haven't exceeded the original size. (It
shouldn't happen, hence the assert.) */
init_interesting ();
/* Specify MHT_TRIM_VALUES because of buggy HTML generators that
- generate <a href=" foo"> instead of <a href="foo"> (Netscape
- ignores spaces as well.) If you really mean space, use &32; or
- %20. */
+ generate <a href=" foo"> instead of <a href="foo"> (browsers
+ ignore spaces as well.) If you really mean space, use &32; or
+ %20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
+ e.g. in <img src="foo.[newline]html">. Such newlines are also
+ ignored by IE and Mozilla and are presumably introduced by
+ writing HTML with editors that force word wrap. */
flags = MHT_TRIM_VALUES;
if (opt.strict_comments)
flags |= MHT_STRICT_COMMENTS;