sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
#include "wget.h"

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <limits.h>
#include <assert.h>

#include "exits.h"
#include "html-parse.h"
#include "url.h"
#include "utils.h"
#include "hash.h"
#include "convert.h"
#include "recur.h"
#include "html-url.h"
#include "css-url.h"
  48
/* Signature shared by every tag handler.  The arguments are the tag
   id (the tagid field of the matching known_tags entry), the parsed
   tag, and the mapping context the handler adds URLs to.  */
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);

/* Forward-declare a file-local function with the tag_handler_t
   signature.  */
#define DECLARE_TAG_HANDLER(fun)                                \
  static void fun (int, struct taginfo *, struct map_context *)

/* The handlers are defined further down, but the known_tags table
   refers to them, so they are declared up front.  */
DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
  59
/* Identifiers for the HTML tags this file knows about.  They are used
   as the tagid field of both known_tags and tag_url_attributes, and
   are what a tag handler receives as its first argument.  */
enum {
  TAG_A,
  TAG_APPLET,
  TAG_AREA,
  TAG_BASE,
  TAG_BGSOUND,
  TAG_BODY,
  TAG_EMBED,
  TAG_FIG,
  TAG_FORM,
  TAG_FRAME,
  TAG_IFRAME,
  TAG_IMG,
  TAG_INPUT,
  TAG_LAYER,
  TAG_LINK,
  TAG_META,
  TAG_OBJECT,
  TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE,
  TAG_TD,
  TAG_TH,
  TAG_VIDEO,
  TAG_AUDIO,
  TAG_SOURCE
};
  87
/* The list of known tags and functions used for handling them.  Most
   tags are simply harvested for URLs.

   init_interesting enters every element into the interesting_tags
   hash table keyed by NAME; collect_tags_mapper looks a tag up by
   name there and calls HANDLER with TAGID.  */
static struct known_tag {
  int tagid;
  const char *name;
  tag_handler_t handler;
} known_tags[] = {
  { TAG_A,       "a",           tag_find_urls },
  { TAG_APPLET,  "applet",      tag_find_urls },
  { TAG_AREA,    "area",        tag_find_urls },
  { TAG_BASE,    "base",        tag_handle_base },
  { TAG_BGSOUND, "bgsound",     tag_find_urls },
  { TAG_BODY,    "body",        tag_find_urls },
  { TAG_EMBED,   "embed",       tag_find_urls },
  { TAG_FIG,     "fig",         tag_find_urls },
  { TAG_FORM,    "form",        tag_handle_form },
  { TAG_FRAME,   "frame",       tag_find_urls },
  { TAG_IFRAME,  "iframe",      tag_find_urls },
  { TAG_IMG,     "img",         tag_find_urls },
  { TAG_INPUT,   "input",       tag_find_urls },
  { TAG_LAYER,   "layer",       tag_find_urls },
  { TAG_LINK,    "link",        tag_handle_link },
  { TAG_META,    "meta",        tag_handle_meta },
  { TAG_OBJECT,  "object",      tag_find_urls },
  { TAG_OVERLAY, "overlay",     tag_find_urls },
  { TAG_SCRIPT,  "script",      tag_find_urls },
  { TAG_TABLE,   "table",       tag_find_urls },
  { TAG_TD,      "td",          tag_find_urls },
  { TAG_TH,      "th",          tag_find_urls },
  { TAG_VIDEO,   "video",       tag_find_urls },
  { TAG_AUDIO,   "audio",       tag_find_urls },
  { TAG_SOURCE,  "source",      tag_find_urls }
};
 121
/* tag_url_attributes documents which attributes of which tags contain
   URLs to harvest.  It is used by tag_find_urls.  */

/* Defines for the FLAGS. */

/* The link is "inline", i.e. needs to be retrieved for this document
   to be correctly rendered.  Inline links include inlined images,
   stylesheets, children frames, etc.  tag_find_urls turns this flag
   into urlpos->link_inline_p.  */
#define ATTR_INLINE     1

/* The link is expected to yield HTML contents.  It's important not to
   try to follow HTML obtained by following e.g. <img src="...">
   regardless of content-type.  Doing this causes infinite loops for
   "images" that return non-404 error pages with links to the same
   image.  tag_find_urls turns this flag into
   urlpos->link_expect_html.  */
#define ATTR_HTML       2

/* For tags handled by tag_find_urls: attributes that contain URLs to
   download.

   All entries for a given tag must be adjacent: tag_find_urls locates
   the first entry whose TAGID matches and then scans forward only
   while TAGID keeps matching.  Every tag whose handler is
   tag_find_urls needs at least one entry here, because that function
   asserts that a matching entry exists.  */
static struct {
  int tagid;
  const char *attr_name;
  int flags;
} tag_url_attributes[] = {
  { TAG_A,              "href",         ATTR_HTML },
  { TAG_APPLET,         "code",         ATTR_INLINE },
  { TAG_AREA,           "href",         ATTR_HTML },
  { TAG_BGSOUND,        "src",          ATTR_INLINE },
  { TAG_BODY,           "background",   ATTR_INLINE },
  { TAG_EMBED,          "href",         ATTR_HTML },
  { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_FIG,            "src",          ATTR_INLINE },
  { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IMG,            "href",         ATTR_INLINE },
  { TAG_IMG,            "lowsrc",       ATTR_INLINE },
  { TAG_IMG,            "src",          ATTR_INLINE },
  { TAG_INPUT,          "src",          ATTR_INLINE },
  { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_OBJECT,         "data",         ATTR_INLINE },
  { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_SCRIPT,         "src",          ATTR_INLINE },
  { TAG_TABLE,          "background",   ATTR_INLINE },
  { TAG_TD,             "background",   ATTR_INLINE },
  { TAG_TH,             "background",   ATTR_INLINE },
  { TAG_VIDEO,          "src",          ATTR_INLINE },
  { TAG_VIDEO,          "poster",       ATTR_INLINE },
  { TAG_AUDIO,          "src",          ATTR_INLINE },
  { TAG_AUDIO,          "poster",       ATTR_INLINE },
  { TAG_SOURCE,         "src",          ATTR_INLINE }
};
 173
/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, some places in the code refer
   to attributes that are not mentioned there.  We add them manually:
   init_interesting enters each of these names into the
   interesting_attributes hash table alongside the attribute names
   taken from tag_url_attributes.  */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link  */
  "type",                       /* used by tag_handle_link  */
  "http-equiv",                 /* used by tag_handle_meta  */
  "name",                       /* used by tag_handle_meta  */
  "content",                    /* used by tag_handle_meta  */
  "action",                     /* used by tag_handle_form  */
  "style"                       /* used by check_style_attr */
};
 186
/* Case-insensitive hash tables built once by init_interesting and
   destroyed by cleanup_html_url.  interesting_tags maps a tag name to
   its known_tags entry; interesting_attributes is the set of attribute
   names handed to map_html_tags.  */
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;

/* Holds the (last) charset found in an 'http-equiv=content-type'
   meta tag.  Set by tag_handle_meta from the result of parse_charset
   and read by get_urls_html.  */
static char *meta_charset;
 193
 194 static void
 195 init_interesting (void)
 196 {
 197   /* Init the variables interesting_tags and interesting_attributes
 198      that are used by the HTML parser to know which tags and
 199      attributes we're interested in.  We initialize this only once,
 200      for performance reasons.
 201
 202      Here we also make sure that what we put in interesting_tags
 203      matches the user's preferences as specified through --ignore-tags
 204      and --follow-tags.  */
 205
 206   size_t i;
 207   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 208
 209   /* First, add all the tags we know hot to handle, mapped to their
 210      respective entries in known_tags.  */
 211   for (i = 0; i < countof (known_tags); i++)
 212     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 213
 214   /* Then remove the tags ignored through --ignore-tags.  */
 215   if (opt.ignore_tags)
 216     {
 217       char **ignored;
 218       for (ignored = opt.ignore_tags; *ignored; ignored++)
 219         hash_table_remove (interesting_tags, *ignored);
 220     }
 221
 222   /* If --follow-tags is specified, use only those tags.  */
 223   if (opt.follow_tags)
 224     {
 225       /* Create a new table intersecting --follow-tags and known_tags,
 226          and use it as interesting_tags.  */
 227       struct hash_table *intersect = make_nocase_string_hash_table (0);
 228       char **followed;
 229       for (followed = opt.follow_tags; *followed; followed++)
 230         {
 231           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 232           if (!t)
 233             continue;           /* ignore unknown --follow-tags entries. */
 234           hash_table_put (intersect, *followed, t);
 235         }
 236       hash_table_destroy (interesting_tags);
 237       interesting_tags = intersect;
 238     }
 239
 240   /* Add the attributes we care about. */
 241   interesting_attributes = make_nocase_string_hash_table (10);
 242   for (i = 0; i < countof (additional_attributes); i++)
 243     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 244   for (i = 0; i < countof (tag_url_attributes); i++)
 245     hash_table_put (interesting_attributes,
 246                     tag_url_attributes[i].attr_name, "1");
 247 }
 248
 249 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 250    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 251    index of the attribute in TAG will be stored there.  */
 252
 253 static char *
 254 find_attr (struct taginfo *tag, const char *name, int *attrind)
 255 {
 256   int i;
 257   for (i = 0; i < tag->nattrs; i++)
 258     if (!strcasecmp (tag->attrs[i].name, name))
 259       {
 260         if (attrind)
 261           *attrind = i;
 262         return tag->attrs[i].value;
 263       }
 264   return NULL;
 265 }
 266
 267 /* used for calls to append_url */
 268 #define ATTR_POS(tag, attrind, ctx) \
 269  (tag->attrs[attrind].value_raw_beginning - ctx->text)
 270 #define ATTR_SIZE(tag, attrind) \
 271  (tag->attrs[attrind].value_raw_size)
 272
 273 /* Append LINK_URI to the urlpos structure that is being built.
 274
 275    LINK_URI will be merged with the current document base.
 276 */
 277
 278 struct urlpos *
 279 append_url (const char *link_uri, int position, int size,
 280             struct map_context *ctx)
 281 {
 282   int link_has_scheme = url_has_scheme (link_uri);
 283   struct urlpos *newel;
 284   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 285   struct url *url;
 286
 287   struct iri *iri = iri_new ();
 288   set_uri_encoding (iri, opt.locale, true);
 289   iri->utf8_encode = true;
 290
 291   if (!base)
 292     {
 293       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 294                ctx->document_file, link_uri));
 295
 296       if (!link_has_scheme)
 297         {
 298           /* Base URL is unavailable, and the link does not have a
 299              location attached to it -- we have to give up.  Since
 300              this can only happen when using `--force-html -i', print
 301              a warning.  */
 302           logprintf (LOG_NOTQUIET,
 303                      _("%s: Cannot resolve incomplete link %s.\n"),
 304                      ctx->document_file, link_uri);
 305           return NULL;
 306         }
 307
 308       url = url_parse (link_uri, NULL, iri, false);
 309       if (!url)
 310         {
 311           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 312                    ctx->document_file, link_uri));
 313           return NULL;
 314         }
 315     }
 316   else
 317     {
 318       /* Merge BASE with LINK_URI, but also make sure the result is
 319          canonicalized, i.e. that "../" have been resolved.
 320          (parse_url will do that for us.) */
 321
 322       char *complete_uri = uri_merge (base, link_uri);
 323
 324       DEBUGP (("%s: merge(%s, %s) -> %s\n",
 325                quotearg_n_style (0, escape_quoting_style, ctx->document_file),
 326                quote_n (1, base),
 327                quote_n (2, link_uri),
 328                quotearg_n_style (3, escape_quoting_style, complete_uri)));
 329
 330       url = url_parse (complete_uri, NULL, iri, false);
 331       if (!url)
 332         {
 333           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 334                    ctx->document_file, complete_uri));
 335           xfree (complete_uri);
 336           return NULL;
 337         }
 338       xfree (complete_uri);
 339     }
 340
 341   iri_free (iri);
 342
 343   DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
 344
 345   newel = xnew0 (struct urlpos);
 346   newel->url = url;
 347   newel->pos = position;
 348   newel->size = size;
 349
 350   /* A URL is relative if the host is not named, and the name does not
 351      start with `/'.  */
 352   if (!link_has_scheme && *link_uri != '/')
 353     newel->link_relative_p = 1;
 354   else if (link_has_scheme)
 355     newel->link_complete_p = 1;
 356
 357   /* Append the new URL maintaining the order by position.  */
 358   if (ctx->head == NULL)
 359     ctx->head = newel;
 360   else
 361     {
 362       struct urlpos *it, *prev = NULL;
 363
 364       it = ctx->head;
 365       while (it && position > it->pos)
 366         {
 367           prev = it;
 368           it = it->next;
 369         }
 370
 371       newel->next = it;
 372
 373       if (prev)
 374         prev->next = newel;
 375       else
 376         ctx->head = newel;
 377     }
 378
 379   return newel;
 380 }
 381 \f
 382 static void
 383 check_style_attr (struct taginfo *tag, struct map_context *ctx)
 384 {
 385   int attrind;
 386   int raw_start;
 387   int raw_len;
 388   char *style = find_attr (tag, "style", &attrind);
 389   if (!style)
 390     return;
 391
 392   /* raw pos and raw size include the quotes, skip them when they are
 393      present.  */
 394   raw_start = ATTR_POS (tag, attrind, ctx);
 395   raw_len  = ATTR_SIZE (tag, attrind);
 396   if( *(char *)(ctx->text + raw_start) == '\''
 397       || *(char *)(ctx->text + raw_start) == '"')
 398     {
 399       raw_start += 1;
 400       raw_len -= 2;
 401     }
 402
 403   if(raw_len <= 0)
 404        return;
 405
 406   get_urls_css (ctx, raw_start, raw_len);
 407 }
 408
 409 /* All the tag_* functions are called from collect_tags_mapper, as
 410    specified by KNOWN_TAGS.  */
 411
 412 /* Default tag handler: collect URLs from attributes specified for
 413    this tag by tag_url_attributes.  */
 414
 415 static void
 416 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 417 {
 418   size_t i;
 419   int attrind;
 420   int first = -1;
 421
 422   for (i = 0; i < countof (tag_url_attributes); i++)
 423     if (tag_url_attributes[i].tagid == tagid)
 424       {
 425         /* We've found the index of tag_url_attributes where the
 426            attributes of our tag begin.  */
 427         first = i;
 428         break;
 429       }
 430   assert (first != -1);
 431
 432   /* Loop over the "interesting" attributes of this tag.  In this
 433      example, it will loop over "src" and "lowsrc".
 434
 435        <img src="foo.png" lowsrc="bar.png">
 436
 437      This has to be done in the outer loop so that the attributes are
 438      processed in the same order in which they appear in the page.
 439      This is required when converting links.  */
 440
 441   for (attrind = 0; attrind < tag->nattrs; attrind++)
 442     {
 443       /* Find whether TAG/ATTRIND is a combination that contains a
 444          URL. */
 445       char *link = tag->attrs[attrind].value;
 446       const size_t size = countof (tag_url_attributes);
 447
 448       /* If you're cringing at the inefficiency of the nested loops,
 449          remember that they both iterate over a very small number of
 450          items.  The worst-case inner loop is for the IMG tag, which
 451          has three attributes.  */
 452       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 453         {
 454           if (0 == strcasecmp (tag->attrs[attrind].name,
 455                                tag_url_attributes[i].attr_name))
 456             {
 457               struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
 458                                               ATTR_SIZE(tag,attrind), ctx);
 459               if (up)
 460                 {
 461                   int flags = tag_url_attributes[i].flags;
 462                   if (flags & ATTR_INLINE)
 463                     up->link_inline_p = 1;
 464                   if (flags & ATTR_HTML)
 465                     up->link_expect_html = 1;
 466                 }
 467             }
 468         }
 469     }
 470 }
 471
 472 /* Handle the BASE tag, for <base href=...>. */
 473
 474 static void
 475 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 476 {
 477   struct urlpos *base_urlpos;
 478   int attrind;
 479   char *newbase = find_attr (tag, "href", &attrind);
 480   if (!newbase)
 481     return;
 482
 483   base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
 484                             ATTR_SIZE(tag,attrind), ctx);
 485   if (!base_urlpos)
 486     return;
 487   base_urlpos->ignore_when_downloading = 1;
 488   base_urlpos->link_base_p = 1;
 489
 490   if (ctx->base)
 491     xfree (ctx->base);
 492   if (ctx->parent_base)
 493     ctx->base = uri_merge (ctx->parent_base, newbase);
 494   else
 495     ctx->base = xstrdup (newbase);
 496 }
 497
 498 /* Mark the URL found in <form action=...> for conversion. */
 499
 500 static void
 501 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 502 {
 503   int attrind;
 504   char *action = find_attr (tag, "action", &attrind);
 505
 506   if (action)
 507     {
 508       struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
 509                                       ATTR_SIZE(tag,attrind), ctx);
 510       if (up)
 511         up->ignore_when_downloading = 1;
 512     }
 513 }
 514
 515 /* Handle the LINK tag.  It requires special handling because how its
 516    links will be followed in -p mode depends on the REL attribute.  */
 517
 518 static void
 519 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 520 {
 521   int attrind;
 522   char *href = find_attr (tag, "href", &attrind);
 523
 524   /* All <link href="..."> link references are external, except those
 525      known not to be, such as style sheet and shortcut icon:
 526
 527      <link rel="stylesheet" href="...">
 528      <link rel="shortcut icon" href="...">
 529   */
 530   if (href)
 531     {
 532       struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
 533                                       ATTR_SIZE(tag,attrind), ctx);
 534       if (up)
 535         {
 536           char *rel = find_attr (tag, "rel", NULL);
 537           if (rel)
 538             {
 539               if (0 == strcasecmp (rel, "stylesheet"))
 540                 {
 541                   up->link_inline_p = 1;
 542                   up->link_expect_css = 1;
 543                 }
 544               else if (0 == strcasecmp (rel, "shortcut icon"))
 545                 {
 546                   up->link_inline_p = 1;
 547                 }
 548               else
 549                 {
 550                   /* The external ones usually point to HTML pages, such as
 551                      <link rel="next" href="...">
 552                      except when the type attribute says otherwise:
 553                      <link rel="alternate" type="application/rss+xml" href=".../?feed=rss2" />
 554                   */
 555                   char *type = find_attr (tag, "type", NULL);
 556                   if (!type || strcasecmp (type, "text/html") == 0)
 557                     up->link_expect_html = 1;
 558                 }
 559             }
 560         }
 561     }
 562 }
 563
 564 /* Handle the META tag.  This requires special handling because of the
 565    refresh feature and because of robot exclusion.  */
 566
 567 static void
 568 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 569 {
 570   char *name = find_attr (tag, "name", NULL);
 571   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 572
 573   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 574     {
 575       /* Some pages use a META tag to specify that the page be
 576          refreshed by a new page after a given number of seconds.  The
 577          general format for this is:
 578
 579            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 580
 581          So we just need to skip past the "NUMBER; URL=" garbage to
 582          get to the URL.  */
 583
 584       struct urlpos *entry;
 585       int attrind;
 586       int timeout = 0;
 587       char *p;
 588
 589       char *refresh = find_attr (tag, "content", &attrind);
 590       if (!refresh)
 591         return;
 592
 593       for (p = refresh; c_isdigit (*p); p++)
 594         timeout = 10 * timeout + *p - '0';
 595       if (*p++ != ';')
 596         return;
 597
 598       while (c_isspace (*p))
 599         ++p;
 600       if (!(   c_toupper (*p)       == 'U'
 601             && c_toupper (*(p + 1)) == 'R'
 602             && c_toupper (*(p + 2)) == 'L'
 603             &&          *(p + 3)  == '='))
 604         return;
 605       p += 4;
 606       while (c_isspace (*p))
 607         ++p;
 608
 609       entry = append_url (p, ATTR_POS(tag,attrind,ctx),
 610                           ATTR_SIZE(tag,attrind), ctx);
 611       if (entry)
 612         {
 613           entry->link_refresh_p = 1;
 614           entry->refresh_timeout = timeout;
 615           entry->link_expect_html = 1;
 616         }
 617     }
 618   else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
 619     {
 620       /* Handle stuff like:
 621          <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
 622
 623       char *mcharset;
 624       char *content = find_attr (tag, "content", NULL);
 625       if (!content)
 626         return;
 627
 628       mcharset = parse_charset (content);
 629       if (!mcharset)
 630         return;
 631
 632       xfree_null (meta_charset);
 633       meta_charset = mcharset;
 634     }
 635   else if (name && 0 == strcasecmp (name, "robots"))
 636     {
 637       /* Handle stuff like:
 638          <meta name="robots" content="index,nofollow"> */
 639       char *content = find_attr (tag, "content", NULL);
 640       if (!content)
 641         return;
 642       if (!strcasecmp (content, "none"))
 643         ctx->nofollow = true;
 644       else
 645         {
 646           while (*content)
 647             {
 648               char *end;
 649               /* Skip any initial whitespace. */
 650               content += strspn (content, " \f\n\r\t\v");
 651               /* Find the next occurrence of ',' or whitespace,
 652                * or the end of the string.  */
 653               end = content + strcspn (content, ", \f\n\r\t\v");
 654               if (!strncasecmp (content, "nofollow", end - content))
 655                 ctx->nofollow = true;
 656               /* Skip past the next comma, if any. */
 657               if (*end == ',')
 658                 ++end;
 659               else
 660                 {
 661                   end = strchr (end, ',');
 662                   if (end)
 663                     ++end;
 664                   else
 665                     end = content + strlen (content);
 666                 }
 667               content = end;
 668             }
 669         }
 670     }
 671 }
 672
 673 /* Dispatch the tag handler appropriate for the tag we're mapping
 674    over.  See known_tags[] for definition of tag handlers.  */
 675
 676 static void
 677 collect_tags_mapper (struct taginfo *tag, void *arg)
 678 {
 679   struct map_context *ctx = (struct map_context *)arg;
 680
 681   /* Find the tag in our table of tags.  This must not fail because
 682      map_html_tags only returns tags found in interesting_tags.
 683
 684      I've changed this for now, I'm passing NULL as interesting_tags
 685      to map_html_tags.  This way we can check all tags for a style
 686      attribute.
 687   */
 688   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 689
 690   if (t != NULL)
 691     t->handler (t->tagid, tag, ctx);
 692
 693   check_style_attr (tag, ctx);
 694
 695   if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style"))
 696       && tag->contents_begin && tag->contents_end
 697       && tag->contents_begin <= tag->contents_end)
 698   {
 699     /* parse contents */
 700     get_urls_css (ctx, tag->contents_begin - ctx->text,
 701                   tag->contents_end - tag->contents_begin);
 702   }
 703 }
 704 \f
 705 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 706    it.  It merges relative links in FILE with URL.  It is aware of
 707    <base href=...> and does the right thing.  */
 708
 709 struct urlpos *
 710 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
 711                struct iri *iri)
 712 {
 713   struct file_memory *fm;
 714   struct map_context ctx;
 715   int flags;
 716
 717   /* Load the file. */
 718   fm = wget_read_file (file);
 719   if (!fm)
 720     {
 721       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 722       return NULL;
 723     }
 724   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 725
 726   ctx.text = fm->content;
 727   ctx.head = NULL;
 728   ctx.base = NULL;
 729   ctx.parent_base = url ? url : opt.base_href;
 730   ctx.document_file = file;
 731   ctx.nofollow = false;
 732
 733   if (!interesting_tags)
 734     init_interesting ();
 735
 736   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 737      generate <a href=" foo"> instead of <a href="foo"> (browsers
 738      ignore spaces as well.)  If you really mean space, use &32; or
 739      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 740      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 741      ignored by IE and Mozilla and are presumably introduced by
 742      writing HTML with editors that force word wrap.  */
 743   flags = MHT_TRIM_VALUES;
 744   if (opt.strict_comments)
 745     flags |= MHT_STRICT_COMMENTS;
 746
 747   /* the NULL here used to be interesting_tags */
 748   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 749                  NULL, interesting_attributes);
 750
 751   /* If meta charset isn't null, override content encoding */
 752   if (iri && meta_charset)
 753     set_content_encoding (iri, meta_charset);
 754
 755   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 756   if (meta_disallow_follow)
 757     *meta_disallow_follow = ctx.nofollow;
 758
 759   xfree_null (ctx.base);
 760   wget_read_file_free (fm);
 761   return ctx.head;
 762 }
 763
 764 /* This doesn't really have anything to do with HTML, but it's similar
 765    to get_urls_html, so we put it here.  */
 766
 767 struct urlpos *
 768 get_urls_file (const char *file)
 769 {
 770   struct file_memory *fm;
 771   struct urlpos *head, *tail;
 772   const char *text, *text_end;
 773
 774   /* Load the file.  */
 775   fm = wget_read_file (file);
 776   if (!fm)
 777     {
 778       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 779       return NULL;
 780     }
 781   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 782
 783   head = tail = NULL;
 784   text = fm->content;
 785   text_end = fm->content + fm->length;
 786   while (text < text_end)
 787     {
 788       int up_error_code;
 789       char *url_text;
 790       struct urlpos *entry;
 791       struct url *url;
 792
 793       const char *line_beg = text;
 794       const char *line_end = memchr (text, '\n', text_end - text);
 795       if (!line_end)
 796         line_end = text_end;
 797       else
 798         ++line_end;
 799       text = line_end;
 800
 801       /* Strip whitespace from the beginning and end of line. */
 802       while (line_beg < line_end && c_isspace (*line_beg))
 803         ++line_beg;
 804       while (line_end > line_beg && c_isspace (*(line_end - 1)))
 805         --line_end;
 806
 807       if (line_beg == line_end)
 808         continue;
 809
 810       /* The URL is in the [line_beg, line_end) region. */
 811
 812       /* We must copy the URL to a zero-terminated string, and we
 813          can't use alloca because we're in a loop.  *sigh*.  */
 814       url_text = strdupdelim (line_beg, line_end);
 815
 816       if (opt.base_href)
 817         {
 818           /* Merge opt.base_href with URL. */
 819           char *merged = uri_merge (opt.base_href, url_text);
 820           xfree (url_text);
 821           url_text = merged;
 822         }
 823
 824       char *new_url = rewrite_shorthand_url (url_text);
 825       if (new_url)
 826         {
 827           xfree (url_text);
 828           url_text = new_url;
 829         }
 830
 831       url = url_parse (url_text, &up_error_code, NULL, false);
 832       if (!url)
 833         {
 834           char *error = url_error (url_text, up_error_code);
 835           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 836                      file, url_text, error);
 837           xfree (url_text);
 838           xfree (error);
 839           inform_exit_status (URLERROR);
 840           continue;
 841         }
 842       xfree (url_text);
 843
 844       entry = xnew0 (struct urlpos);
 845       entry->url = url;
 846
 847       if (!head)
 848         head = entry;
 849       else
 850         tail->next = entry;
 851       tail = entry;
 852     }
 853   wget_read_file_free (fm);
 854   return head;
 855 }
 856
 857 void
 858 cleanup_html_url (void)
 859 {
 860   /* Destroy the hash tables.  The hash table keys and values are not
 861      allocated by this code, so we don't need to free them here.  */
 862   if (interesting_tags)
 863     hash_table_destroy (interesting_tags);
 864   if (interesting_attributes)
 865     hash_table_destroy (interesting_attributes);
 866 }