sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <string.h>
  35 #include <stdlib.h>
  36 #include <errno.h>
  37 #include <assert.h>
  38
  39 #include "exits.h"
  40 #include "html-parse.h"
  41 #include "url.h"
  42 #include "utils.h"
  43 #include "hash.h"
  44 #include "convert.h"
  45 #include "recur.h"
  46 #include "html-url.h"
  47 #include "css-url.h"
  48
/* Signature shared by every tag handler: the tag's id (one of the
   TAG_* constants), the parsed tag, and the per-document mapping
   context that collects the URLs.  */
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);

/* Forward-declare a file-local handler with the tag_handler_t
   signature, so that it can be named in known_tags[] before it is
   defined further down.  */
#define DECLARE_TAG_HANDLER(fun)                                \
  static void fun (int, struct taginfo *, struct map_context *)

DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
  59
/* Numeric ids for the HTML tags this file knows about.  They are
   stored in known_tags[] and tag_url_attributes[] and handed to the
   tag handlers as their first argument.  */
enum {
  TAG_A,
  TAG_APPLET,
  TAG_AREA,
  TAG_BASE,
  TAG_BGSOUND,
  TAG_BODY,
  TAG_EMBED,
  TAG_FIG,
  TAG_FORM,
  TAG_FRAME,
  TAG_IFRAME,
  TAG_IMG,
  TAG_INPUT,
  TAG_LAYER,
  TAG_LINK,
  TAG_META,
  TAG_OBJECT,
  TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE,
  TAG_TD,
  TAG_TH
};
  84
/* The list of known tags and functions used for handling them.  Most
   tags are simply harvested for URLs.

   init_interesting stores the address of each entry as a hash table
   value keyed by NAME, and collect_tags_mapper calls HANDLER with
   TAGID for every matching tag.  */
static struct known_tag {
  int tagid;                    /* one of the TAG_* constants */
  const char *name;             /* tag name, used as the hash key */
  tag_handler_t handler;        /* function invoked for this tag */
} known_tags[] = {
  { TAG_A,       "a",           tag_find_urls },
  { TAG_APPLET,  "applet",      tag_find_urls },
  { TAG_AREA,    "area",        tag_find_urls },
  { TAG_BASE,    "base",        tag_handle_base },
  { TAG_BGSOUND, "bgsound",     tag_find_urls },
  { TAG_BODY,    "body",        tag_find_urls },
  { TAG_EMBED,   "embed",       tag_find_urls },
  { TAG_FIG,     "fig",         tag_find_urls },
  { TAG_FORM,    "form",        tag_handle_form },
  { TAG_FRAME,   "frame",       tag_find_urls },
  { TAG_IFRAME,  "iframe",      tag_find_urls },
  { TAG_IMG,     "img",         tag_find_urls },
  { TAG_INPUT,   "input",       tag_find_urls },
  { TAG_LAYER,   "layer",       tag_find_urls },
  { TAG_LINK,    "link",        tag_handle_link },
  { TAG_META,    "meta",        tag_handle_meta },
  { TAG_OBJECT,  "object",      tag_find_urls },
  { TAG_OVERLAY, "overlay",     tag_find_urls },
  { TAG_SCRIPT,  "script",      tag_find_urls },
  { TAG_TABLE,   "table",       tag_find_urls },
  { TAG_TD,      "td",          tag_find_urls },
  { TAG_TH,      "th",          tag_find_urls }
};
 115
/* tag_url_attributes documents which attributes of which tags contain
   URLs to harvest.  It is used by tag_find_urls.  */

/* Defines for the FLAGS.  These are distinct bits and may be OR-ed
   together in a single table entry.  */

/* The link is "inline", i.e. needs to be retrieved for this document
   to be correctly rendered.  Inline links include inlined images,
   stylesheets, children frames, etc.  */
#define ATTR_INLINE     1

/* The link is expected to yield HTML contents.  It's important not to
   try to follow HTML obtained by following e.g. <img src="...">
   regardless of content-type.  Doing this causes infinite loops for
   "images" that return non-404 error pages with links to the same
   image.  */
#define ATTR_HTML       2
 132
/* For tags handled by tag_find_urls: attributes that contain URLs to
   download.

   Entries that belong to the same tag must stay adjacent:
   tag_find_urls locates the first entry with a matching TAGID and
   then scans forward only while TAGID keeps matching.  */
static struct {
  int tagid;                    /* one of the TAG_* constants */
  const char *attr_name;        /* attribute whose value is a URL */
  int flags;                    /* combination of ATTR_INLINE, ATTR_HTML */
} tag_url_attributes[] = {
  { TAG_A,              "href",         ATTR_HTML },
  { TAG_APPLET,         "code",         ATTR_INLINE },
  { TAG_AREA,           "href",         ATTR_HTML },
  { TAG_BGSOUND,        "src",          ATTR_INLINE },
  { TAG_BODY,           "background",   ATTR_INLINE },
  { TAG_EMBED,          "href",         ATTR_HTML },
  { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_FIG,            "src",          ATTR_INLINE },
  { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IMG,            "href",         ATTR_INLINE },
  { TAG_IMG,            "lowsrc",       ATTR_INLINE },
  { TAG_IMG,            "src",          ATTR_INLINE },
  { TAG_INPUT,          "src",          ATTR_INLINE },
  { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_OBJECT,         "data",         ATTR_INLINE },
  { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_SCRIPT,         "src",          ATTR_INLINE },
  { TAG_TABLE,          "background",   ATTR_INLINE },
  { TAG_TD,             "background",   ATTR_INLINE },
  { TAG_TH,             "background",   ATTR_INLINE }
};
 162
/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, some places in the code refer
   to the attributes not mentioned here.  We add them manually;
   init_interesting inserts each of these names into
   interesting_attributes.  */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link  */
  "type",                       /* used by tag_handle_link  */
  "http-equiv",                 /* used by tag_handle_meta  */
  "name",                       /* used by tag_handle_meta  */
  "content",                    /* used by tag_handle_meta  */
  "action",                     /* used by tag_handle_form  */
  "style"                       /* used by check_style_attr */
};
 175
/* Case-insensitive lookup tables built once by init_interesting:
   tag name -> entry in known_tags, and the set of attribute names the
   HTML parser should report.  */
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;

/* Will contain the (last) charset found in 'http-equiv=content-type'
   meta tags.  Set by tag_handle_meta, which frees any previous value
   before storing a new one.  */
static char *meta_charset;
 182
 183 static void
 184 init_interesting (void)
 185 {
 186   /* Init the variables interesting_tags and interesting_attributes
 187      that are used by the HTML parser to know which tags and
 188      attributes we're interested in.  We initialize this only once,
 189      for performance reasons.
 190
 191      Here we also make sure that what we put in interesting_tags
 192      matches the user's preferences as specified through --ignore-tags
 193      and --follow-tags.  */
 194
 195   size_t i;
 196   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 197
 198   /* First, add all the tags we know hot to handle, mapped to their
 199      respective entries in known_tags.  */
 200   for (i = 0; i < countof (known_tags); i++)
 201     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 202
 203   /* Then remove the tags ignored through --ignore-tags.  */
 204   if (opt.ignore_tags)
 205     {
 206       char **ignored;
 207       for (ignored = opt.ignore_tags; *ignored; ignored++)
 208         hash_table_remove (interesting_tags, *ignored);
 209     }
 210
 211   /* If --follow-tags is specified, use only those tags.  */
 212   if (opt.follow_tags)
 213     {
 214       /* Create a new table intersecting --follow-tags and known_tags,
 215          and use it as interesting_tags.  */
 216       struct hash_table *intersect = make_nocase_string_hash_table (0);
 217       char **followed;
 218       for (followed = opt.follow_tags; *followed; followed++)
 219         {
 220           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 221           if (!t)
 222             continue;           /* ignore unknown --follow-tags entries. */
 223           hash_table_put (intersect, *followed, t);
 224         }
 225       hash_table_destroy (interesting_tags);
 226       interesting_tags = intersect;
 227     }
 228
 229   /* Add the attributes we care about. */
 230   interesting_attributes = make_nocase_string_hash_table (10);
 231   for (i = 0; i < countof (additional_attributes); i++)
 232     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 233   for (i = 0; i < countof (tag_url_attributes); i++)
 234     hash_table_put (interesting_attributes,
 235                     tag_url_attributes[i].attr_name, "1");
 236 }
 237
 238 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 239    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 240    index of the attribute in TAG will be stored there.  */
 241
 242 static char *
 243 find_attr (struct taginfo *tag, const char *name, int *attrind)
 244 {
 245   int i;
 246   for (i = 0; i < tag->nattrs; i++)
 247     if (!strcasecmp (tag->attrs[i].name, name))
 248       {
 249         if (attrind)
 250           *attrind = i;
 251         return tag->attrs[i].value;
 252       }
 253   return NULL;
 254 }
 255
 256 /* used for calls to append_url */
 257 #define ATTR_POS(tag, attrind, ctx) \
 258  (tag->attrs[attrind].value_raw_beginning - ctx->text)
 259 #define ATTR_SIZE(tag, attrind) \
 260  (tag->attrs[attrind].value_raw_size)
 261
 262 /* Append LINK_URI to the urlpos structure that is being built.
 263
 264    LINK_URI will be merged with the current document base.
 265 */
 266
 267 struct urlpos *
 268 append_url (const char *link_uri, int position, int size,
 269             struct map_context *ctx)
 270 {
 271   int link_has_scheme = url_has_scheme (link_uri);
 272   struct urlpos *newel;
 273   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 274   struct url *url;
 275
 276   if (!base)
 277     {
 278       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 279                ctx->document_file, link_uri));
 280
 281       if (!link_has_scheme)
 282         {
 283           /* Base URL is unavailable, and the link does not have a
 284              location attached to it -- we have to give up.  Since
 285              this can only happen when using `--force-html -i', print
 286              a warning.  */
 287           logprintf (LOG_NOTQUIET,
 288                      _("%s: Cannot resolve incomplete link %s.\n"),
 289                      ctx->document_file, link_uri);
 290           return NULL;
 291         }
 292
 293       url = url_parse (link_uri, NULL, NULL, false);
 294       if (!url)
 295         {
 296           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 297                    ctx->document_file, link_uri));
 298           return NULL;
 299         }
 300     }
 301   else
 302     {
 303       /* Merge BASE with LINK_URI, but also make sure the result is
 304          canonicalized, i.e. that "../" have been resolved.
 305          (parse_url will do that for us.) */
 306
 307       char *complete_uri = uri_merge (base, link_uri);
 308
 309       DEBUGP (("%s: merge(%s, %s) -> %s\n",
 310                quotearg_n_style (0, escape_quoting_style, ctx->document_file),
 311                quote_n (1, base),
 312                quote_n (2, link_uri),
 313                quotearg_n_style (3, escape_quoting_style, complete_uri)));
 314
 315       url = url_parse (complete_uri, NULL, NULL, false);
 316       if (!url)
 317         {
 318           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 319                    ctx->document_file, complete_uri));
 320           xfree (complete_uri);
 321           return NULL;
 322         }
 323       xfree (complete_uri);
 324     }
 325
 326   DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
 327
 328   newel = xnew0 (struct urlpos);
 329   newel->url = url;
 330   newel->pos = position;
 331   newel->size = size;
 332
 333   /* A URL is relative if the host is not named, and the name does not
 334      start with `/'.  */
 335   if (!link_has_scheme && *link_uri != '/')
 336     newel->link_relative_p = 1;
 337   else if (link_has_scheme)
 338     newel->link_complete_p = 1;
 339
 340   /* Append the new URL maintaining the order by position.  */
 341   if (ctx->head == NULL)
 342     ctx->head = newel;
 343   else
 344     {
 345       struct urlpos *it, *prev = NULL;
 346
 347       it = ctx->head;
 348       while (it && position > it->pos)
 349         {
 350           prev = it;
 351           it = it->next;
 352         }
 353
 354       newel->next = it;
 355
 356       if (prev)
 357         prev->next = newel;
 358       else
 359         ctx->head = newel;
 360     }
 361
 362   return newel;
 363 }
 364 \f
 365 static void
 366 check_style_attr (struct taginfo *tag, struct map_context *ctx)
 367 {
 368   int attrind;
 369   int raw_start;
 370   int raw_len;
 371   char *style = find_attr (tag, "style", &attrind);
 372   if (!style)
 373     return;
 374
 375   /* raw pos and raw size include the quotes, skip them when they are
 376      present.  */
 377   raw_start = ATTR_POS (tag, attrind, ctx);
 378   raw_len  = ATTR_SIZE (tag, attrind);
 379   if( *(char *)(ctx->text + raw_start) == '\''
 380       || *(char *)(ctx->text + raw_start) == '"')
 381     {
 382       raw_start += 1;
 383       raw_len -= 2;
 384     }
 385
 386   if(raw_len <= 0)
 387        return;
 388
 389   get_urls_css (ctx, raw_start, raw_len);
 390 }
 391
 392 /* All the tag_* functions are called from collect_tags_mapper, as
 393    specified by KNOWN_TAGS.  */
 394
 395 /* Default tag handler: collect URLs from attributes specified for
 396    this tag by tag_url_attributes.  */
 397
 398 static void
 399 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 400 {
 401   size_t i;
 402   int attrind;
 403   int first = -1;
 404
 405   for (i = 0; i < countof (tag_url_attributes); i++)
 406     if (tag_url_attributes[i].tagid == tagid)
 407       {
 408         /* We've found the index of tag_url_attributes where the
 409            attributes of our tag begin.  */
 410         first = i;
 411         break;
 412       }
 413   assert (first != -1);
 414
 415   /* Loop over the "interesting" attributes of this tag.  In this
 416      example, it will loop over "src" and "lowsrc".
 417
 418        <img src="foo.png" lowsrc="bar.png">
 419
 420      This has to be done in the outer loop so that the attributes are
 421      processed in the same order in which they appear in the page.
 422      This is required when converting links.  */
 423
 424   for (attrind = 0; attrind < tag->nattrs; attrind++)
 425     {
 426       /* Find whether TAG/ATTRIND is a combination that contains a
 427          URL. */
 428       char *link = tag->attrs[attrind].value;
 429       const size_t size = countof (tag_url_attributes);
 430
 431       /* If you're cringing at the inefficiency of the nested loops,
 432          remember that they both iterate over a very small number of
 433          items.  The worst-case inner loop is for the IMG tag, which
 434          has three attributes.  */
 435       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 436         {
 437           if (0 == strcasecmp (tag->attrs[attrind].name,
 438                                tag_url_attributes[i].attr_name))
 439             {
 440               struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
 441                                               ATTR_SIZE(tag,attrind), ctx);
 442               if (up)
 443                 {
 444                   int flags = tag_url_attributes[i].flags;
 445                   if (flags & ATTR_INLINE)
 446                     up->link_inline_p = 1;
 447                   if (flags & ATTR_HTML)
 448                     up->link_expect_html = 1;
 449                 }
 450             }
 451         }
 452     }
 453 }
 454
 455 /* Handle the BASE tag, for <base href=...>. */
 456
 457 static void
 458 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 459 {
 460   struct urlpos *base_urlpos;
 461   int attrind;
 462   char *newbase = find_attr (tag, "href", &attrind);
 463   if (!newbase)
 464     return;
 465
 466   base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
 467                             ATTR_SIZE(tag,attrind), ctx);
 468   if (!base_urlpos)
 469     return;
 470   base_urlpos->ignore_when_downloading = 1;
 471   base_urlpos->link_base_p = 1;
 472
 473   if (ctx->base)
 474     xfree (ctx->base);
 475   if (ctx->parent_base)
 476     ctx->base = uri_merge (ctx->parent_base, newbase);
 477   else
 478     ctx->base = xstrdup (newbase);
 479 }
 480
 481 /* Mark the URL found in <form action=...> for conversion. */
 482
 483 static void
 484 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 485 {
 486   int attrind;
 487   char *action = find_attr (tag, "action", &attrind);
 488
 489   if (action)
 490     {
 491       struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
 492                                       ATTR_SIZE(tag,attrind), ctx);
 493       if (up)
 494         up->ignore_when_downloading = 1;
 495     }
 496 }
 497
 498 /* Handle the LINK tag.  It requires special handling because how its
 499    links will be followed in -p mode depends on the REL attribute.  */
 500
 501 static void
 502 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 503 {
 504   int attrind;
 505   char *href = find_attr (tag, "href", &attrind);
 506
 507   /* All <link href="..."> link references are external, except those
 508      known not to be, such as style sheet and shortcut icon:
 509
 510      <link rel="stylesheet" href="...">
 511      <link rel="shortcut icon" href="...">
 512   */
 513   if (href)
 514     {
 515       struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
 516                                       ATTR_SIZE(tag,attrind), ctx);
 517       if (up)
 518         {
 519           char *rel = find_attr (tag, "rel", NULL);
 520           if (rel)
 521             {
 522               if (0 == strcasecmp (rel, "stylesheet"))
 523                 {
 524                   up->link_inline_p = 1;
 525                   up->link_expect_css = 1;
 526                 }
 527               else if (0 == strcasecmp (rel, "shortcut icon"))
 528                 {
 529                   up->link_inline_p = 1;
 530                 }
 531               else
 532                 {
 533                   /* The external ones usually point to HTML pages, such as
 534                      <link rel="next" href="...">
 535                      except when the type attribute says otherwise:
 536                      <link rel="alternate" type="application/rss+xml" href=".../?feed=rss2" />
 537                   */
 538                   char *type = find_attr (tag, "type", NULL);
 539                   if (!type || strcasecmp (type, "text/html") == 0)
 540                     up->link_expect_html = 1;
 541                 }
 542             }
 543         }
 544     }
 545 }
 546
 547 /* Handle the META tag.  This requires special handling because of the
 548    refresh feature and because of robot exclusion.  */
 549
 550 static void
 551 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 552 {
 553   char *name = find_attr (tag, "name", NULL);
 554   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 555
 556   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 557     {
 558       /* Some pages use a META tag to specify that the page be
 559          refreshed by a new page after a given number of seconds.  The
 560          general format for this is:
 561
 562            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 563
 564          So we just need to skip past the "NUMBER; URL=" garbage to
 565          get to the URL.  */
 566
 567       struct urlpos *entry;
 568       int attrind;
 569       int timeout = 0;
 570       char *p;
 571
 572       char *refresh = find_attr (tag, "content", &attrind);
 573       if (!refresh)
 574         return;
 575
 576       for (p = refresh; c_isdigit (*p); p++)
 577         timeout = 10 * timeout + *p - '0';
 578       if (*p++ != ';')
 579         return;
 580
 581       while (c_isspace (*p))
 582         ++p;
 583       if (!(   c_toupper (*p)       == 'U'
 584             && c_toupper (*(p + 1)) == 'R'
 585             && c_toupper (*(p + 2)) == 'L'
 586             &&          *(p + 3)  == '='))
 587         return;
 588       p += 4;
 589       while (c_isspace (*p))
 590         ++p;
 591
 592       entry = append_url (p, ATTR_POS(tag,attrind,ctx),
 593                           ATTR_SIZE(tag,attrind), ctx);
 594       if (entry)
 595         {
 596           entry->link_refresh_p = 1;
 597           entry->refresh_timeout = timeout;
 598           entry->link_expect_html = 1;
 599         }
 600     }
 601   else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
 602     {
 603       /* Handle stuff like:
 604          <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
 605
 606       char *mcharset;
 607       char *content = find_attr (tag, "content", NULL);
 608       if (!content)
 609         return;
 610
 611       mcharset = parse_charset (content);
 612       if (!mcharset)
 613         return;
 614
 615       xfree_null (meta_charset);
 616       meta_charset = mcharset;
 617     }
 618   else if (name && 0 == strcasecmp (name, "robots"))
 619     {
 620       /* Handle stuff like:
 621          <meta name="robots" content="index,nofollow"> */
 622       char *content = find_attr (tag, "content", NULL);
 623       if (!content)
 624         return;
 625       if (!strcasecmp (content, "none"))
 626         ctx->nofollow = true;
 627       else
 628         {
 629           while (*content)
 630             {
 631               char *end;
 632               /* Skip any initial whitespace. */
 633               content += strspn (content, " \f\n\r\t\v");
 634               /* Find the next occurrence of ',' or whitespace,
 635                * or the end of the string.  */
 636               end = content + strcspn (content, ", \f\n\r\t\v");
 637               if (!strncasecmp (content, "nofollow", end - content))
 638                 ctx->nofollow = true;
 639               /* Skip past the next comma, if any. */
 640               if (*end == ',')
 641                 ++end;
 642               else
 643                 {
 644                   end = strchr (end, ',');
 645                   if (end)
 646                     ++end;
 647                   else
 648                     end = content + strlen (content);
 649                 }
 650               content = end;
 651             }
 652         }
 653     }
 654 }
 655
 656 /* Dispatch the tag handler appropriate for the tag we're mapping
 657    over.  See known_tags[] for definition of tag handlers.  */
 658
 659 static void
 660 collect_tags_mapper (struct taginfo *tag, void *arg)
 661 {
 662   struct map_context *ctx = (struct map_context *)arg;
 663
 664   /* Find the tag in our table of tags.  This must not fail because
 665      map_html_tags only returns tags found in interesting_tags.
 666
 667      I've changed this for now, I'm passing NULL as interesting_tags
 668      to map_html_tags.  This way we can check all tags for a style
 669      attribute.
 670   */
 671   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 672
 673   if (t != NULL)
 674     t->handler (t->tagid, tag, ctx);
 675
 676   check_style_attr (tag, ctx);
 677
 678   if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
 679       tag->contents_begin && tag->contents_end)
 680   {
 681     /* parse contents */
 682     get_urls_css (ctx, tag->contents_begin - ctx->text,
 683                   tag->contents_end - tag->contents_begin);
 684   }
 685 }
 686 \f
 687 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 688    it.  It merges relative links in FILE with URL.  It is aware of
 689    <base href=...> and does the right thing.  */
 690
 691 struct urlpos *
 692 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
 693                struct iri *iri)
 694 {
 695   struct file_memory *fm;
 696   struct map_context ctx;
 697   int flags;
 698
 699   /* Load the file. */
 700   fm = wget_read_file (file);
 701   if (!fm)
 702     {
 703       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 704       return NULL;
 705     }
 706   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 707
 708   ctx.text = fm->content;
 709   ctx.head = NULL;
 710   ctx.base = NULL;
 711   ctx.parent_base = url ? url : opt.base_href;
 712   ctx.document_file = file;
 713   ctx.nofollow = false;
 714
 715   if (!interesting_tags)
 716     init_interesting ();
 717
 718   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 719      generate <a href=" foo"> instead of <a href="foo"> (browsers
 720      ignore spaces as well.)  If you really mean space, use &32; or
 721      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 722      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 723      ignored by IE and Mozilla and are presumably introduced by
 724      writing HTML with editors that force word wrap.  */
 725   flags = MHT_TRIM_VALUES;
 726   if (opt.strict_comments)
 727     flags |= MHT_STRICT_COMMENTS;
 728
 729   /* the NULL here used to be interesting_tags */
 730   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 731                  NULL, interesting_attributes);
 732
 733   /* If meta charset isn't null, override content encoding */
 734   if (iri && meta_charset)
 735     set_content_encoding (iri, meta_charset);
 736
 737   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 738   if (meta_disallow_follow)
 739     *meta_disallow_follow = ctx.nofollow;
 740
 741   xfree_null (ctx.base);
 742   wget_read_file_free (fm);
 743   return ctx.head;
 744 }
 745
 746 /* This doesn't really have anything to do with HTML, but it's similar
 747    to get_urls_html, so we put it here.  */
 748
 749 struct urlpos *
 750 get_urls_file (const char *file)
 751 {
 752   struct file_memory *fm;
 753   struct urlpos *head, *tail;
 754   const char *text, *text_end;
 755
 756   /* Load the file.  */
 757   fm = wget_read_file (file);
 758   if (!fm)
 759     {
 760       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 761       return NULL;
 762     }
 763   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 764
 765   head = tail = NULL;
 766   text = fm->content;
 767   text_end = fm->content + fm->length;
 768   while (text < text_end)
 769     {
 770       int up_error_code;
 771       char *url_text;
 772       struct urlpos *entry;
 773       struct url *url;
 774
 775       const char *line_beg = text;
 776       const char *line_end = memchr (text, '\n', text_end - text);
 777       if (!line_end)
 778         line_end = text_end;
 779       else
 780         ++line_end;
 781       text = line_end;
 782
 783       /* Strip whitespace from the beginning and end of line. */
 784       while (line_beg < line_end && c_isspace (*line_beg))
 785         ++line_beg;
 786       while (line_end > line_beg && c_isspace (*(line_end - 1)))
 787         --line_end;
 788
 789       if (line_beg == line_end)
 790         continue;
 791
 792       /* The URL is in the [line_beg, line_end) region. */
 793
 794       /* We must copy the URL to a zero-terminated string, and we
 795          can't use alloca because we're in a loop.  *sigh*.  */
 796       url_text = strdupdelim (line_beg, line_end);
 797
 798       if (opt.base_href)
 799         {
 800           /* Merge opt.base_href with URL. */
 801           char *merged = uri_merge (opt.base_href, url_text);
 802           xfree (url_text);
 803           url_text = merged;
 804         }
 805
 806       url = url_parse (url_text, &up_error_code, NULL, false);
 807       if (!url)
 808         {
 809           char *error = url_error (url_text, up_error_code);
 810           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 811                      file, url_text, error);
 812           xfree (url_text);
 813           xfree (error);
 814           inform_exit_status (URLERROR);
 815           continue;
 816         }
 817       xfree (url_text);
 818
 819       entry = xnew0 (struct urlpos);
 820       entry->url = url;
 821
 822       if (!head)
 823         head = entry;
 824       else
 825         tail->next = entry;
 826       tail = entry;
 827     }
 828   wget_read_file_free (fm);
 829   return head;
 830 }
 831
 832 void
 833 cleanup_html_url (void)
 834 {
 835   /* Destroy the hash tables.  The hash table keys and values are not
 836      allocated by this code, so we don't need to free them here.  */
 837   if (interesting_tags)
 838     hash_table_destroy (interesting_tags);
 839   if (interesting_attributes)
 840     hash_table_destroy (interesting_attributes);
 841 }