X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-parse.c;h=74d344400f33c7c6a3e6c584af72408923e922db;hb=89b37c7eff60d6e5e233330956049cea1807ad27;hp=8cdcd692fa2c99b99fa298cf6bf45b415aa83976;hpb=f21d888d7caff72dc65ff32a77b2046f969005a3;p=wget diff --git a/src/html-parse.c b/src/html-parse.c index 8cdcd692..74d34440 100644 --- a/src/html-parse.c +++ b/src/html-parse.c @@ -344,10 +344,23 @@ array_allowed (const char **array, const char *beg, const char *end) return 1; } -/* RFC1866: name [of attribute or tag] consists of letters, digits, - periods, or hyphens. We also allow _, for compatibility with - brain-damaged generators. */ -#define NAME_CHAR_P(x) (ISALNUM (x) || (x) == '.' || (x) == '-' || (x) == '_') +/* Originally we used to adhere to RFC1866 here, and allowed only + letters, digits, periods, and hyphens as names (of tags or + attributes). However, this broke too many pages which used + proprietary or strange attributes, e.g. <img src="a.gif" foo:bar="baz">. + + So now we allow any character except: + * whitespace + * 8-bit and control chars + * characters that clearly cannot be part of name: + '=', '>', '/'. + + This only affects attribute and tag names; attribute values allow + an even greater variety of characters. */ + +#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \ + && (x) != '=' && (x) != '>' && (x) != '/') /* States while advancing through comments. */ #define AC_S_DONE 0 @@ -450,10 +463,10 @@ advance_declaration (const char *beg, const char *end) } break; case AC_S_DCLNAME: - if (NAME_CHAR_P (ch)) - ch = *p++; - else if (ch == '-') + if (ch == '-') state = AC_S_DASH1; + else if (NAME_CHAR_P (ch)) + ch = *p++; else state = AC_S_DEFAULT; break;