sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include <limits.h>
  38
  39 #include "exits.h"
  40 #include "html-parse.h"
  41 #include "url.h"
  42 #include "utils.h"
  43 #include "hash.h"
  44 #include "convert.h"
  45 #include "recur.h"
  46 #include "html-url.h"
  47 #include "css-url.h"
  48
  49 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
  50
  51 #define DECLARE_TAG_HANDLER(fun)                                \
  52   static void fun (int, struct taginfo *, struct map_context *)
  53
  54 DECLARE_TAG_HANDLER (tag_find_urls);
  55 DECLARE_TAG_HANDLER (tag_handle_base);
  56 DECLARE_TAG_HANDLER (tag_handle_form);
  57 DECLARE_TAG_HANDLER (tag_handle_link);
  58 DECLARE_TAG_HANDLER (tag_handle_meta);
  59
  60 enum {
  61   TAG_A,
  62   TAG_APPLET,
  63   TAG_AREA,
  64   TAG_BASE,
  65   TAG_BGSOUND,
  66   TAG_BODY,
  67   TAG_EMBED,
  68   TAG_FIG,
  69   TAG_FORM,
  70   TAG_FRAME,
  71   TAG_IFRAME,
  72   TAG_IMG,
  73   TAG_INPUT,
  74   TAG_LAYER,
  75   TAG_LINK,
  76   TAG_META,
  77   TAG_OBJECT,
  78   TAG_OVERLAY,
  79   TAG_SCRIPT,
  80   TAG_TABLE,
  81   TAG_TD,
  82   TAG_TH,
  83   TAG_VIDEO,
  84   TAG_AUDIO,
  85   TAG_SOURCE
  86 };
  87
  88 /* The list of known tags and functions used for handling them.  Most
  89    tags are simply harvested for URLs. */
  90 static struct known_tag {
  91   int tagid;
  92   const char *name;
  93   tag_handler_t handler;
  94 } known_tags[] = {
  95   { TAG_A,       "a",           tag_find_urls },
  96   { TAG_APPLET,  "applet",      tag_find_urls },
  97   { TAG_AREA,    "area",        tag_find_urls },
  98   { TAG_BASE,    "base",        tag_handle_base },
  99   { TAG_BGSOUND, "bgsound",     tag_find_urls },
 100   { TAG_BODY,    "body",        tag_find_urls },
 101   { TAG_EMBED,   "embed",       tag_find_urls },
 102   { TAG_FIG,     "fig",         tag_find_urls },
 103   { TAG_FORM,    "form",        tag_handle_form },
 104   { TAG_FRAME,   "frame",       tag_find_urls },
 105   { TAG_IFRAME,  "iframe",      tag_find_urls },
 106   { TAG_IMG,     "img",         tag_find_urls },
 107   { TAG_INPUT,   "input",       tag_find_urls },
 108   { TAG_LAYER,   "layer",       tag_find_urls },
 109   { TAG_LINK,    "link",        tag_handle_link },
 110   { TAG_META,    "meta",        tag_handle_meta },
 111   { TAG_OBJECT,  "object",      tag_find_urls },
 112   { TAG_OVERLAY, "overlay",     tag_find_urls },
 113   { TAG_SCRIPT,  "script",      tag_find_urls },
 114   { TAG_TABLE,   "table",       tag_find_urls },
 115   { TAG_TD,      "td",          tag_find_urls },
 116   { TAG_TH,      "th",          tag_find_urls },
 117   { TAG_VIDEO,   "video",       tag_find_urls },
 118   { TAG_AUDIO,   "audio",       tag_find_urls },
 119   { TAG_SOURCE,  "source",      tag_find_urls }
 120 };
 121
 122 /* tag_url_attributes documents which attributes of which tags contain
 123    URLs to harvest.  It is used by tag_find_urls.  */
 124
 125 /* Defines for the FLAGS. */
 126
 127 /* The link is "inline", i.e. needs to be retrieved for this document
 128    to be correctly rendered.  Inline links include inlined images,
 129    stylesheets, children frames, etc.  */
 130 #define ATTR_INLINE     1
 131
 132 /* The link is expected to yield HTML contents.  It's important not to
 133    try to follow HTML obtained by following e.g. <img src="...">
 134    regardless of content-type.  Doing this causes infinite loops for
 135    "images" that return non-404 error pages with links to the same
 136    image.  */
 137 #define ATTR_HTML       2
 138
 139 /* For tags handled by tag_find_urls: attributes that contain URLs to
 140    download. */
 141 static struct {
 142   int tagid;
 143   const char *attr_name;
 144   int flags;
 145 } tag_url_attributes[] = {
 146   { TAG_A,              "href",         ATTR_HTML },
 147   { TAG_APPLET,         "code",         ATTR_INLINE },
 148   { TAG_AREA,           "href",         ATTR_HTML },
 149   { TAG_BGSOUND,        "src",          ATTR_INLINE },
 150   { TAG_BODY,           "background",   ATTR_INLINE },
 151   { TAG_EMBED,          "href",         ATTR_HTML },
 152   { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
 153   { TAG_FIG,            "src",          ATTR_INLINE },
 154   { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
 155   { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
 156   { TAG_IMG,            "href",         ATTR_INLINE },
 157   { TAG_IMG,            "lowsrc",       ATTR_INLINE },
 158   { TAG_IMG,            "src",          ATTR_INLINE },
 159   { TAG_INPUT,          "src",          ATTR_INLINE },
 160   { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
 161   { TAG_OBJECT,         "data",         ATTR_INLINE },
 162   { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
 163   { TAG_SCRIPT,         "src",          ATTR_INLINE },
 164   { TAG_TABLE,          "background",   ATTR_INLINE },
 165   { TAG_TD,             "background",   ATTR_INLINE },
 166   { TAG_TH,             "background",   ATTR_INLINE },
 167   { TAG_VIDEO,          "src",          ATTR_INLINE },
 168   { TAG_VIDEO,          "poster",       ATTR_INLINE },
 169   { TAG_AUDIO,          "src",          ATTR_INLINE },
 170   { TAG_AUDIO,          "poster",       ATTR_INLINE },
 171   { TAG_SOURCE,         "src",          ATTR_INLINE }
 172 };
 173
 174 /* The lists of interesting tags and attributes are built dynamically,
 175    from the information above.  However, some places in the code refer
 176    to the attributes not mentioned here.  We add them manually.  */
 177 static const char *additional_attributes[] = {
 178   "rel",                        /* used by tag_handle_link  */
 179   "type",                       /* used by tag_handle_link  */
 180   "http-equiv",                 /* used by tag_handle_meta  */
 181   "name",                       /* used by tag_handle_meta  */
 182   "content",                    /* used by tag_handle_meta  */
 183   "action",                     /* used by tag_handle_form  */
 184   "style"                       /* used by check_style_attr */
 185 };
 186
 187 static struct hash_table *interesting_tags;
 188 static struct hash_table *interesting_attributes;
 189
 190 /* Will contains the (last) charset found in 'http-equiv=content-type'
 191    meta tags  */
 192 static char *meta_charset;
 193
 194 static void
 195 init_interesting (void)
 196 {
 197   /* Init the variables interesting_tags and interesting_attributes
 198      that are used by the HTML parser to know which tags and
 199      attributes we're interested in.  We initialize this only once,
 200      for performance reasons.
 201
 202      Here we also make sure that what we put in interesting_tags
 203      matches the user's preferences as specified through --ignore-tags
 204      and --follow-tags.  */
 205
 206   size_t i;
 207   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 208
 209   /* First, add all the tags we know hot to handle, mapped to their
 210      respective entries in known_tags.  */
 211   for (i = 0; i < countof (known_tags); i++)
 212     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 213
 214   /* Then remove the tags ignored through --ignore-tags.  */
 215   if (opt.ignore_tags)
 216     {
 217       char **ignored;
 218       for (ignored = opt.ignore_tags; *ignored; ignored++)
 219         hash_table_remove (interesting_tags, *ignored);
 220     }
 221
 222   /* If --follow-tags is specified, use only those tags.  */
 223   if (opt.follow_tags)
 224     {
 225       /* Create a new table intersecting --follow-tags and known_tags,
 226          and use it as interesting_tags.  */
 227       struct hash_table *intersect = make_nocase_string_hash_table (0);
 228       char **followed;
 229       for (followed = opt.follow_tags; *followed; followed++)
 230         {
 231           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 232           if (!t)
 233             continue;           /* ignore unknown --follow-tags entries. */
 234           hash_table_put (intersect, *followed, t);
 235         }
 236       hash_table_destroy (interesting_tags);
 237       interesting_tags = intersect;
 238     }
 239
 240   /* Add the attributes we care about. */
 241   interesting_attributes = make_nocase_string_hash_table (10);
 242   for (i = 0; i < countof (additional_attributes); i++)
 243     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 244   for (i = 0; i < countof (tag_url_attributes); i++)
 245     hash_table_put (interesting_attributes,
 246                     tag_url_attributes[i].attr_name, "1");
 247 }
 248
 249 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 250    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 251    index of the attribute in TAG will be stored there.  */
 252
 253 static char *
 254 find_attr (struct taginfo *tag, const char *name, int *attrind)
 255 {
 256   int i;
 257   for (i = 0; i < tag->nattrs; i++)
 258     if (!strcasecmp (tag->attrs[i].name, name))
 259       {
 260         if (attrind)
 261           *attrind = i;
 262         return tag->attrs[i].value;
 263       }
 264   return NULL;
 265 }
 266
 267 /* used for calls to append_url */
 268 #define ATTR_POS(tag, attrind, ctx) \
 269  (tag->attrs[attrind].value_raw_beginning - ctx->text)
 270 #define ATTR_SIZE(tag, attrind) \
 271  (tag->attrs[attrind].value_raw_size)
 272
 273 /* Append LINK_URI to the urlpos structure that is being built.
 274
 275    LINK_URI will be merged with the current document base.
 276 */
 277
 278 struct urlpos *
 279 append_url (const char *link_uri, int position, int size,
 280             struct map_context *ctx)
 281 {
 282   int link_has_scheme = url_has_scheme (link_uri);
 283   struct urlpos *newel;
 284   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 285   struct url *url;
 286
 287   if (!base)
 288     {
 289       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 290                ctx->document_file, link_uri));
 291
 292       if (!link_has_scheme)
 293         {
 294           /* Base URL is unavailable, and the link does not have a
 295              location attached to it -- we have to give up.  Since
 296              this can only happen when using `--force-html -i', print
 297              a warning.  */
 298           logprintf (LOG_NOTQUIET,
 299                      _("%s: Cannot resolve incomplete link %s.\n"),
 300                      ctx->document_file, link_uri);
 301           return NULL;
 302         }
 303
 304       url = url_parse (link_uri, NULL, NULL, false);
 305       if (!url)
 306         {
 307           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 308                    ctx->document_file, link_uri));
 309           return NULL;
 310         }
 311     }
 312   else
 313     {
 314       /* Merge BASE with LINK_URI, but also make sure the result is
 315          canonicalized, i.e. that "../" have been resolved.
 316          (parse_url will do that for us.) */
 317
 318       char *complete_uri = uri_merge (base, link_uri);
 319
 320       DEBUGP (("%s: merge(%s, %s) -> %s\n",
 321                quotearg_n_style (0, escape_quoting_style, ctx->document_file),
 322                quote_n (1, base),
 323                quote_n (2, link_uri),
 324                quotearg_n_style (3, escape_quoting_style, complete_uri)));
 325
 326       url = url_parse (complete_uri, NULL, NULL, false);
 327       if (!url)
 328         {
 329           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 330                    ctx->document_file, complete_uri));
 331           xfree (complete_uri);
 332           return NULL;
 333         }
 334       xfree (complete_uri);
 335     }
 336
 337   DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
 338
 339   newel = xnew0 (struct urlpos);
 340   newel->url = url;
 341   newel->pos = position;
 342   newel->size = size;
 343
 344   /* A URL is relative if the host is not named, and the name does not
 345      start with `/'.  */
 346   if (!link_has_scheme && *link_uri != '/')
 347     newel->link_relative_p = 1;
 348   else if (link_has_scheme)
 349     newel->link_complete_p = 1;
 350
 351   /* Append the new URL maintaining the order by position.  */
 352   if (ctx->head == NULL)
 353     ctx->head = newel;
 354   else
 355     {
 356       struct urlpos *it, *prev = NULL;
 357
 358       it = ctx->head;
 359       while (it && position > it->pos)
 360         {
 361           prev = it;
 362           it = it->next;
 363         }
 364
 365       newel->next = it;
 366
 367       if (prev)
 368         prev->next = newel;
 369       else
 370         ctx->head = newel;
 371     }
 372
 373   return newel;
 374 }
 375 \f
 376 static void
 377 check_style_attr (struct taginfo *tag, struct map_context *ctx)
 378 {
 379   int attrind;
 380   int raw_start;
 381   int raw_len;
 382   char *style = find_attr (tag, "style", &attrind);
 383   if (!style)
 384     return;
 385
 386   /* raw pos and raw size include the quotes, skip them when they are
 387      present.  */
 388   raw_start = ATTR_POS (tag, attrind, ctx);
 389   raw_len  = ATTR_SIZE (tag, attrind);
 390   if( *(char *)(ctx->text + raw_start) == '\''
 391       || *(char *)(ctx->text + raw_start) == '"')
 392     {
 393       raw_start += 1;
 394       raw_len -= 2;
 395     }
 396
 397   if(raw_len <= 0)
 398        return;
 399
 400   get_urls_css (ctx, raw_start, raw_len);
 401 }
 402
 403 /* All the tag_* functions are called from collect_tags_mapper, as
 404    specified by KNOWN_TAGS.  */
 405
 406 /* Default tag handler: collect URLs from attributes specified for
 407    this tag by tag_url_attributes.  */
 408
 409 static void
 410 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 411 {
 412   size_t i;
 413   int attrind;
 414   int first = -1;
 415
 416   for (i = 0; i < countof (tag_url_attributes); i++)
 417     if (tag_url_attributes[i].tagid == tagid)
 418       {
 419         /* We've found the index of tag_url_attributes where the
 420            attributes of our tag begin.  */
 421         first = i;
 422         break;
 423       }
 424   assert (first != -1);
 425
 426   /* Loop over the "interesting" attributes of this tag.  In this
 427      example, it will loop over "src" and "lowsrc".
 428
 429        <img src="foo.png" lowsrc="bar.png">
 430
 431      This has to be done in the outer loop so that the attributes are
 432      processed in the same order in which they appear in the page.
 433      This is required when converting links.  */
 434
 435   for (attrind = 0; attrind < tag->nattrs; attrind++)
 436     {
 437       /* Find whether TAG/ATTRIND is a combination that contains a
 438          URL. */
 439       char *link = tag->attrs[attrind].value;
 440       const size_t size = countof (tag_url_attributes);
 441
 442       /* If you're cringing at the inefficiency of the nested loops,
 443          remember that they both iterate over a very small number of
 444          items.  The worst-case inner loop is for the IMG tag, which
 445          has three attributes.  */
 446       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 447         {
 448           if (0 == strcasecmp (tag->attrs[attrind].name,
 449                                tag_url_attributes[i].attr_name))
 450             {
 451               struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
 452                                               ATTR_SIZE(tag,attrind), ctx);
 453               if (up)
 454                 {
 455                   int flags = tag_url_attributes[i].flags;
 456                   if (flags & ATTR_INLINE)
 457                     up->link_inline_p = 1;
 458                   if (flags & ATTR_HTML)
 459                     up->link_expect_html = 1;
 460                 }
 461             }
 462         }
 463     }
 464 }
 465
 466 /* Handle the BASE tag, for <base href=...>. */
 467
 468 static void
 469 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 470 {
 471   struct urlpos *base_urlpos;
 472   int attrind;
 473   char *newbase = find_attr (tag, "href", &attrind);
 474   if (!newbase)
 475     return;
 476
 477   base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
 478                             ATTR_SIZE(tag,attrind), ctx);
 479   if (!base_urlpos)
 480     return;
 481   base_urlpos->ignore_when_downloading = 1;
 482   base_urlpos->link_base_p = 1;
 483
 484   if (ctx->base)
 485     xfree (ctx->base);
 486   if (ctx->parent_base)
 487     ctx->base = uri_merge (ctx->parent_base, newbase);
 488   else
 489     ctx->base = xstrdup (newbase);
 490 }
 491
 492 /* Mark the URL found in <form action=...> for conversion. */
 493
 494 static void
 495 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 496 {
 497   int attrind;
 498   char *action = find_attr (tag, "action", &attrind);
 499
 500   if (action)
 501     {
 502       struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
 503                                       ATTR_SIZE(tag,attrind), ctx);
 504       if (up)
 505         up->ignore_when_downloading = 1;
 506     }
 507 }
 508
 509 /* Handle the LINK tag.  It requires special handling because how its
 510    links will be followed in -p mode depends on the REL attribute.  */
 511
 512 static void
 513 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 514 {
 515   int attrind;
 516   char *href = find_attr (tag, "href", &attrind);
 517
 518   /* All <link href="..."> link references are external, except those
 519      known not to be, such as style sheet and shortcut icon:
 520
 521      <link rel="stylesheet" href="...">
 522      <link rel="shortcut icon" href="...">
 523   */
 524   if (href)
 525     {
 526       struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
 527                                       ATTR_SIZE(tag,attrind), ctx);
 528       if (up)
 529         {
 530           char *rel = find_attr (tag, "rel", NULL);
 531           if (rel)
 532             {
 533               if (0 == strcasecmp (rel, "stylesheet"))
 534                 {
 535                   up->link_inline_p = 1;
 536                   up->link_expect_css = 1;
 537                 }
 538               else if (0 == strcasecmp (rel, "shortcut icon"))
 539                 {
 540                   up->link_inline_p = 1;
 541                 }
 542               else
 543                 {
 544                   /* The external ones usually point to HTML pages, such as
 545                      <link rel="next" href="...">
 546                      except when the type attribute says otherwise:
 547                      <link rel="alternate" type="application/rss+xml" href=".../?feed=rss2" />
 548                   */
 549                   char *type = find_attr (tag, "type", NULL);
 550                   if (!type || strcasecmp (type, "text/html") == 0)
 551                     up->link_expect_html = 1;
 552                 }
 553             }
 554         }
 555     }
 556 }
 557
 558 /* Handle the META tag.  This requires special handling because of the
 559    refresh feature and because of robot exclusion.  */
 560
 561 static void
 562 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 563 {
 564   char *name = find_attr (tag, "name", NULL);
 565   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 566
 567   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 568     {
 569       /* Some pages use a META tag to specify that the page be
 570          refreshed by a new page after a given number of seconds.  The
 571          general format for this is:
 572
 573            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 574
 575          So we just need to skip past the "NUMBER; URL=" garbage to
 576          get to the URL.  */
 577
 578       struct urlpos *entry;
 579       int attrind;
 580       int timeout = 0;
 581       char *p;
 582
 583       char *refresh = find_attr (tag, "content", &attrind);
 584       if (!refresh)
 585         return;
 586
 587       for (p = refresh; c_isdigit (*p); p++)
 588         timeout = 10 * timeout + *p - '0';
 589       if (*p++ != ';')
 590         return;
 591
 592       while (c_isspace (*p))
 593         ++p;
 594       if (!(   c_toupper (*p)       == 'U'
 595             && c_toupper (*(p + 1)) == 'R'
 596             && c_toupper (*(p + 2)) == 'L'
 597             &&          *(p + 3)  == '='))
 598         return;
 599       p += 4;
 600       while (c_isspace (*p))
 601         ++p;
 602
 603       entry = append_url (p, ATTR_POS(tag,attrind,ctx),
 604                           ATTR_SIZE(tag,attrind), ctx);
 605       if (entry)
 606         {
 607           entry->link_refresh_p = 1;
 608           entry->refresh_timeout = timeout;
 609           entry->link_expect_html = 1;
 610         }
 611     }
 612   else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
 613     {
 614       /* Handle stuff like:
 615          <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
 616
 617       char *mcharset;
 618       char *content = find_attr (tag, "content", NULL);
 619       if (!content)
 620         return;
 621
 622       mcharset = parse_charset (content);
 623       if (!mcharset)
 624         return;
 625
 626       xfree_null (meta_charset);
 627       meta_charset = mcharset;
 628     }
 629   else if (name && 0 == strcasecmp (name, "robots"))
 630     {
 631       /* Handle stuff like:
 632          <meta name="robots" content="index,nofollow"> */
 633       char *content = find_attr (tag, "content", NULL);
 634       if (!content)
 635         return;
 636       if (!strcasecmp (content, "none"))
 637         ctx->nofollow = true;
 638       else
 639         {
 640           while (*content)
 641             {
 642               char *end;
 643               /* Skip any initial whitespace. */
 644               content += strspn (content, " \f\n\r\t\v");
 645               /* Find the next occurrence of ',' or whitespace,
 646                * or the end of the string.  */
 647               end = content + strcspn (content, ", \f\n\r\t\v");
 648               if (!strncasecmp (content, "nofollow", end - content))
 649                 ctx->nofollow = true;
 650               /* Skip past the next comma, if any. */
 651               if (*end == ',')
 652                 ++end;
 653               else
 654                 {
 655                   end = strchr (end, ',');
 656                   if (end)
 657                     ++end;
 658                   else
 659                     end = content + strlen (content);
 660                 }
 661               content = end;
 662             }
 663         }
 664     }
 665 }
 666
 667 /* Dispatch the tag handler appropriate for the tag we're mapping
 668    over.  See known_tags[] for definition of tag handlers.  */
 669
 670 static void
 671 collect_tags_mapper (struct taginfo *tag, void *arg)
 672 {
 673   struct map_context *ctx = (struct map_context *)arg;
 674
 675   /* Find the tag in our table of tags.  This must not fail because
 676      map_html_tags only returns tags found in interesting_tags.
 677
 678      I've changed this for now, I'm passing NULL as interesting_tags
 679      to map_html_tags.  This way we can check all tags for a style
 680      attribute.
 681   */
 682   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 683
 684   if (t != NULL)
 685     t->handler (t->tagid, tag, ctx);
 686
 687   check_style_attr (tag, ctx);
 688
 689   if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style"))
 690       && tag->contents_begin && tag->contents_end
 691       && tag->contents_begin <= tag->contents_end)
 692   {
 693     /* parse contents */
 694     get_urls_css (ctx, tag->contents_begin - ctx->text,
 695                   tag->contents_end - tag->contents_begin);
 696   }
 697 }
 698 \f
 699 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 700    it.  It merges relative links in FILE with URL.  It is aware of
 701    <base href=...> and does the right thing.  */
 702
 703 struct urlpos *
 704 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
 705                struct iri *iri)
 706 {
 707   struct file_memory *fm;
 708   struct map_context ctx;
 709   int flags;
 710
 711   /* Load the file. */
 712   fm = wget_read_file (file);
 713   if (!fm)
 714     {
 715       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 716       return NULL;
 717     }
 718   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 719
 720   ctx.text = fm->content;
 721   ctx.head = NULL;
 722   ctx.base = NULL;
 723   ctx.parent_base = url ? url : opt.base_href;
 724   ctx.document_file = file;
 725   ctx.nofollow = false;
 726
 727   if (!interesting_tags)
 728     init_interesting ();
 729
 730   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 731      generate <a href=" foo"> instead of <a href="foo"> (browsers
 732      ignore spaces as well.)  If you really mean space, use &32; or
 733      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 734      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 735      ignored by IE and Mozilla and are presumably introduced by
 736      writing HTML with editors that force word wrap.  */
 737   flags = MHT_TRIM_VALUES;
 738   if (opt.strict_comments)
 739     flags |= MHT_STRICT_COMMENTS;
 740
 741   /* the NULL here used to be interesting_tags */
 742   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 743                  NULL, interesting_attributes);
 744
 745   /* If meta charset isn't null, override content encoding */
 746   if (iri && meta_charset)
 747     set_content_encoding (iri, meta_charset);
 748
 749   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 750   if (meta_disallow_follow)
 751     *meta_disallow_follow = ctx.nofollow;
 752
 753   xfree_null (ctx.base);
 754   wget_read_file_free (fm);
 755   return ctx.head;
 756 }
 757
 758 /* This doesn't really have anything to do with HTML, but it's similar
 759    to get_urls_html, so we put it here.  */
 760
 761 struct urlpos *
 762 get_urls_file (const char *file)
 763 {
 764   struct file_memory *fm;
 765   struct urlpos *head, *tail;
 766   const char *text, *text_end;
 767
 768   /* Load the file.  */
 769   fm = wget_read_file (file);
 770   if (!fm)
 771     {
 772       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 773       return NULL;
 774     }
 775   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 776
 777   head = tail = NULL;
 778   text = fm->content;
 779   text_end = fm->content + fm->length;
 780   while (text < text_end)
 781     {
 782       int up_error_code;
 783       char *url_text;
 784       struct urlpos *entry;
 785       struct url *url;
 786
 787       const char *line_beg = text;
 788       const char *line_end = memchr (text, '\n', text_end - text);
 789       if (!line_end)
 790         line_end = text_end;
 791       else
 792         ++line_end;
 793       text = line_end;
 794
 795       /* Strip whitespace from the beginning and end of line. */
 796       while (line_beg < line_end && c_isspace (*line_beg))
 797         ++line_beg;
 798       while (line_end > line_beg && c_isspace (*(line_end - 1)))
 799         --line_end;
 800
 801       if (line_beg == line_end)
 802         continue;
 803
 804       /* The URL is in the [line_beg, line_end) region. */
 805
 806       /* We must copy the URL to a zero-terminated string, and we
 807          can't use alloca because we're in a loop.  *sigh*.  */
 808       url_text = strdupdelim (line_beg, line_end);
 809
 810       if (opt.base_href)
 811         {
 812           /* Merge opt.base_href with URL. */
 813           char *merged = uri_merge (opt.base_href, url_text);
 814           xfree (url_text);
 815           url_text = merged;
 816         }
 817
 818       char *new_url = rewrite_shorthand_url (url_text);
 819       if (new_url)
 820         {
 821           xfree (url_text);
 822           url_text = new_url;
 823         }
 824
 825       url = url_parse (url_text, &up_error_code, NULL, false);
 826       if (!url)
 827         {
 828           char *error = url_error (url_text, up_error_code);
 829           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 830                      file, url_text, error);
 831           xfree (url_text);
 832           xfree (error);
 833           inform_exit_status (URLERROR);
 834           continue;
 835         }
 836       xfree (url_text);
 837
 838       entry = xnew0 (struct urlpos);
 839       entry->url = url;
 840
 841       if (!head)
 842         head = entry;
 843       else
 844         tail->next = entry;
 845       tail = entry;
 846     }
 847   wget_read_file_free (fm);
 848   return head;
 849 }
 850
 851 void
 852 cleanup_html_url (void)
 853 {
 854   /* Destroy the hash tables.  The hash table keys and values are not
 855      allocated by this code, so we don't need to free them here.  */
 856   if (interesting_tags)
 857     hash_table_destroy (interesting_tags);
 858   if (interesting_attributes)
 859     hash_table_destroy (interesting_attributes);
 860 }