sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008, 2009, 2010 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
#include "wget.h"

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <limits.h>
#include <assert.h>

#include "html-parse.h"
#include "url.h"
#include "utils.h"
#include "hash.h"
#include "convert.h"
#include "recur.h"
#include "html-url.h"
#include "css-url.h"
  47
/* Signature shared by every tag handler.  The arguments are the tag
   id (one of the TAG_* constants), the parsed tag, and the context of
   the document being scanned.  */
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);

/* Forward-declare a file-local handler FUN with the tag_handler_t
   signature, so it can be referenced from known_tags before it is
   defined.  */
#define DECLARE_TAG_HANDLER(fun)                                \
  static void fun (int, struct taginfo *, struct map_context *)

DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
  58
/* Numeric ids of the tags this file knows about.  They are stored in
   known_tags and tag_url_attributes and passed to the tag handlers as
   their first argument.  */
enum {
  TAG_A,
  TAG_APPLET,
  TAG_AREA,
  TAG_BASE,
  TAG_BGSOUND,
  TAG_BODY,
  TAG_EMBED,
  TAG_FIG,
  TAG_FORM,
  TAG_FRAME,
  TAG_IFRAME,
  TAG_IMG,
  TAG_INPUT,
  TAG_LAYER,
  TAG_LINK,
  TAG_META,
  TAG_OBJECT,
  TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE,
  TAG_TD,
  TAG_TH
};
  83
/* The list of known tags and functions used for handling them.  Most
   tags are simply harvested for URLs by tag_find_urls; BASE, FORM,
   LINK and META have dedicated handlers.  Each entry pairs the tag id
   with the tag's name as it appears in HTML and the handler to call
   for it.  */
static struct known_tag {
  int tagid;
  const char *name;
  tag_handler_t handler;
} known_tags[] = {
  { TAG_A,       "a",           tag_find_urls },
  { TAG_APPLET,  "applet",      tag_find_urls },
  { TAG_AREA,    "area",        tag_find_urls },
  { TAG_BASE,    "base",        tag_handle_base },
  { TAG_BGSOUND, "bgsound",     tag_find_urls },
  { TAG_BODY,    "body",        tag_find_urls },
  { TAG_EMBED,   "embed",       tag_find_urls },
  { TAG_FIG,     "fig",         tag_find_urls },
  { TAG_FORM,    "form",        tag_handle_form },
  { TAG_FRAME,   "frame",       tag_find_urls },
  { TAG_IFRAME,  "iframe",      tag_find_urls },
  { TAG_IMG,     "img",         tag_find_urls },
  { TAG_INPUT,   "input",       tag_find_urls },
  { TAG_LAYER,   "layer",       tag_find_urls },
  { TAG_LINK,    "link",        tag_handle_link },
  { TAG_META,    "meta",        tag_handle_meta },
  { TAG_OBJECT,  "object",      tag_find_urls },
  { TAG_OVERLAY, "overlay",     tag_find_urls },
  { TAG_SCRIPT,  "script",      tag_find_urls },
  { TAG_TABLE,   "table",       tag_find_urls },
  { TAG_TD,      "td",          tag_find_urls },
  { TAG_TH,      "th",          tag_find_urls }
};
 114
/* tag_url_attributes documents which attributes of which tags contain
   URLs to harvest.  It is used by tag_find_urls.  */

/* Defines for the FLAGS. */

/* The link is "inline", i.e. needs to be retrieved for this document
   to be correctly rendered.  Inline links include inlined images,
   stylesheets, children frames, etc.  */
#define ATTR_INLINE     1

/* The link is expected to yield HTML contents.  It's important not to
   try to follow HTML obtained by following e.g. <img src="...">
   regardless of content-type.  Doing this causes infinite loops for
   "images" that return non-404 error pages with links to the same
   image.  */
#define ATTR_HTML       2

/* For tags handled by tag_find_urls: attributes that contain URLs to
   download.

   All rows belonging to one tag must be adjacent: tag_find_urls
   locates the first row whose tagid matches and then scans forward
   only while the tagid keeps matching.  Every tag whose handler in
   known_tags is tag_find_urls needs at least one row here, because
   tag_find_urls asserts that a row exists.  */
static struct {
  int tagid;
  const char *attr_name;
  int flags;
} tag_url_attributes[] = {
  { TAG_A,              "href",         ATTR_HTML },
  { TAG_APPLET,         "code",         ATTR_INLINE },
  { TAG_AREA,           "href",         ATTR_HTML },
  { TAG_BGSOUND,        "src",          ATTR_INLINE },
  { TAG_BODY,           "background",   ATTR_INLINE },
  { TAG_EMBED,          "href",         ATTR_HTML },
  { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_FIG,            "src",          ATTR_INLINE },
  { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IMG,            "href",         ATTR_INLINE },
  { TAG_IMG,            "lowsrc",       ATTR_INLINE },
  { TAG_IMG,            "src",          ATTR_INLINE },
  { TAG_INPUT,          "src",          ATTR_INLINE },
  { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_OBJECT,         "data",         ATTR_INLINE },
  { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_SCRIPT,         "src",          ATTR_INLINE },
  { TAG_TABLE,          "background",   ATTR_INLINE },
  { TAG_TD,             "background",   ATTR_INLINE },
  { TAG_TH,             "background",   ATTR_INLINE }
};
 161
/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, some places in the code refer
   to attributes not mentioned there.  We add them manually.  */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link  */
  "type",                       /* used by tag_handle_link  */
  "http-equiv",                 /* used by tag_handle_meta  */
  "name",                       /* used by tag_handle_meta  */
  "content",                    /* used by tag_handle_meta  */
  "action",                     /* used by tag_handle_form  */
  "style"                       /* used by check_style_attr */
};

/* Case-insensitive tables built once by init_interesting.
   interesting_tags maps a tag name to its known_tags entry;
   interesting_attributes holds the attribute names handed to the HTML
   parser.  */
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;

/* Contains the (last) charset found in 'http-equiv=content-type'
   meta tags.  Set by tag_handle_meta and read by get_urls_html.  */
static char *meta_charset;
 181
 182 static void
 183 init_interesting (void)
 184 {
 185   /* Init the variables interesting_tags and interesting_attributes
 186      that are used by the HTML parser to know which tags and
 187      attributes we're interested in.  We initialize this only once,
 188      for performance reasons.
 189
 190      Here we also make sure that what we put in interesting_tags
 191      matches the user's preferences as specified through --ignore-tags
 192      and --follow-tags.  */
 193
 194   size_t i;
 195   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 196
 197   /* First, add all the tags we know hot to handle, mapped to their
 198      respective entries in known_tags.  */
 199   for (i = 0; i < countof (known_tags); i++)
 200     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 201
 202   /* Then remove the tags ignored through --ignore-tags.  */
 203   if (opt.ignore_tags)
 204     {
 205       char **ignored;
 206       for (ignored = opt.ignore_tags; *ignored; ignored++)
 207         hash_table_remove (interesting_tags, *ignored);
 208     }
 209
 210   /* If --follow-tags is specified, use only those tags.  */
 211   if (opt.follow_tags)
 212     {
 213       /* Create a new table intersecting --follow-tags and known_tags,
 214          and use it as interesting_tags.  */
 215       struct hash_table *intersect = make_nocase_string_hash_table (0);
 216       char **followed;
 217       for (followed = opt.follow_tags; *followed; followed++)
 218         {
 219           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 220           if (!t)
 221             continue;           /* ignore unknown --follow-tags entries. */
 222           hash_table_put (intersect, *followed, t);
 223         }
 224       hash_table_destroy (interesting_tags);
 225       interesting_tags = intersect;
 226     }
 227
 228   /* Add the attributes we care about. */
 229   interesting_attributes = make_nocase_string_hash_table (10);
 230   for (i = 0; i < countof (additional_attributes); i++)
 231     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 232   for (i = 0; i < countof (tag_url_attributes); i++)
 233     hash_table_put (interesting_attributes,
 234                     tag_url_attributes[i].attr_name, "1");
 235 }
 236
 237 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 238    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 239    index of the attribute in TAG will be stored there.  */
 240
 241 static char *
 242 find_attr (struct taginfo *tag, const char *name, int *attrind)
 243 {
 244   int i;
 245   for (i = 0; i < tag->nattrs; i++)
 246     if (!strcasecmp (tag->attrs[i].name, name))
 247       {
 248         if (attrind)
 249           *attrind = i;
 250         return tag->attrs[i].value;
 251       }
 252   return NULL;
 253 }
 254
 255 /* used for calls to append_url */
 256 #define ATTR_POS(tag, attrind, ctx) \
 257  (tag->attrs[attrind].value_raw_beginning - ctx->text)
 258 #define ATTR_SIZE(tag, attrind) \
 259  (tag->attrs[attrind].value_raw_size)
 260
 261 /* Append LINK_URI to the urlpos structure that is being built.
 262
 263    LINK_URI will be merged with the current document base.
 264 */
 265
 266 struct urlpos *
 267 append_url (const char *link_uri, int position, int size,
 268             struct map_context *ctx)
 269 {
 270   int link_has_scheme = url_has_scheme (link_uri);
 271   struct urlpos *newel;
 272   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 273   struct url *url;
 274
 275   if (!base)
 276     {
 277       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 278                ctx->document_file, link_uri));
 279
 280       if (!link_has_scheme)
 281         {
 282           /* Base URL is unavailable, and the link does not have a
 283              location attached to it -- we have to give up.  Since
 284              this can only happen when using `--force-html -i', print
 285              a warning.  */
 286           logprintf (LOG_NOTQUIET,
 287                      _("%s: Cannot resolve incomplete link %s.\n"),
 288                      ctx->document_file, link_uri);
 289           return NULL;
 290         }
 291
 292       url = url_parse (link_uri, NULL, NULL, false);
 293       if (!url)
 294         {
 295           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 296                    ctx->document_file, link_uri));
 297           return NULL;
 298         }
 299     }
 300   else
 301     {
 302       /* Merge BASE with LINK_URI, but also make sure the result is
 303          canonicalized, i.e. that "../" have been resolved.
 304          (parse_url will do that for us.) */
 305
 306       char *complete_uri = uri_merge (base, link_uri);
 307
 308       DEBUGP (("%s: merge(%s, %s) -> %s\n",
 309                quotearg_n_style (0, escape_quoting_style, ctx->document_file),
 310                quote_n (1, base),
 311                quote_n (2, link_uri),
 312                quotearg_n_style (3, escape_quoting_style, complete_uri)));
 313
 314       url = url_parse (complete_uri, NULL, NULL, false);
 315       if (!url)
 316         {
 317           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 318                    ctx->document_file, complete_uri));
 319           xfree (complete_uri);
 320           return NULL;
 321         }
 322       xfree (complete_uri);
 323     }
 324
 325   DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
 326
 327   newel = xnew0 (struct urlpos);
 328   newel->url = url;
 329   newel->pos = position;
 330   newel->size = size;
 331
 332   /* A URL is relative if the host is not named, and the name does not
 333      start with `/'.  */
 334   if (!link_has_scheme && *link_uri != '/')
 335     newel->link_relative_p = 1;
 336   else if (link_has_scheme)
 337     newel->link_complete_p = 1;
 338
 339   /* Append the new URL maintaining the order by position.  */
 340   if (ctx->head == NULL)
 341     ctx->head = newel;
 342   else
 343     {
 344       struct urlpos *it, *prev = NULL;
 345
 346       it = ctx->head;
 347       while (it && position > it->pos)
 348         {
 349           prev = it;
 350           it = it->next;
 351         }
 352
 353       newel->next = it;
 354
 355       if (prev)
 356         prev->next = newel;
 357       else
 358         ctx->head = newel;
 359     }
 360
 361   return newel;
 362 }
 363 \f
 364 static void
 365 check_style_attr (struct taginfo *tag, struct map_context *ctx)
 366 {
 367   int attrind;
 368   int raw_start;
 369   int raw_len;
 370   char *style = find_attr (tag, "style", &attrind);
 371   if (!style)
 372     return;
 373
 374   /* raw pos and raw size include the quotes, skip them when they are
 375      present.  */
 376   raw_start = ATTR_POS (tag, attrind, ctx);
 377   raw_len  = ATTR_SIZE (tag, attrind);
 378   if( *(char *)(ctx->text + raw_start) == '\''
 379       || *(char *)(ctx->text + raw_start) == '"')
 380     {
 381       raw_start += 1;
 382       raw_len -= 2;
 383     }
 384
 385   if(raw_len <= 0)
 386        return;
 387
 388   get_urls_css (ctx, raw_start, raw_len);
 389 }
 390
 391 /* All the tag_* functions are called from collect_tags_mapper, as
 392    specified by KNOWN_TAGS.  */
 393
 394 /* Default tag handler: collect URLs from attributes specified for
 395    this tag by tag_url_attributes.  */
 396
 397 static void
 398 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 399 {
 400   size_t i;
 401   int attrind;
 402   int first = -1;
 403
 404   for (i = 0; i < countof (tag_url_attributes); i++)
 405     if (tag_url_attributes[i].tagid == tagid)
 406       {
 407         /* We've found the index of tag_url_attributes where the
 408            attributes of our tag begin.  */
 409         first = i;
 410         break;
 411       }
 412   assert (first != -1);
 413
 414   /* Loop over the "interesting" attributes of this tag.  In this
 415      example, it will loop over "src" and "lowsrc".
 416
 417        <img src="foo.png" lowsrc="bar.png">
 418
 419      This has to be done in the outer loop so that the attributes are
 420      processed in the same order in which they appear in the page.
 421      This is required when converting links.  */
 422
 423   for (attrind = 0; attrind < tag->nattrs; attrind++)
 424     {
 425       /* Find whether TAG/ATTRIND is a combination that contains a
 426          URL. */
 427       char *link = tag->attrs[attrind].value;
 428       const size_t size = countof (tag_url_attributes);
 429
 430       /* If you're cringing at the inefficiency of the nested loops,
 431          remember that they both iterate over a very small number of
 432          items.  The worst-case inner loop is for the IMG tag, which
 433          has three attributes.  */
 434       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 435         {
 436           if (0 == strcasecmp (tag->attrs[attrind].name,
 437                                tag_url_attributes[i].attr_name))
 438             {
 439               struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
 440                                               ATTR_SIZE(tag,attrind), ctx);
 441               if (up)
 442                 {
 443                   int flags = tag_url_attributes[i].flags;
 444                   if (flags & ATTR_INLINE)
 445                     up->link_inline_p = 1;
 446                   if (flags & ATTR_HTML)
 447                     up->link_expect_html = 1;
 448                 }
 449             }
 450         }
 451     }
 452 }
 453
 454 /* Handle the BASE tag, for <base href=...>. */
 455
 456 static void
 457 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 458 {
 459   struct urlpos *base_urlpos;
 460   int attrind;
 461   char *newbase = find_attr (tag, "href", &attrind);
 462   if (!newbase)
 463     return;
 464
 465   base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
 466                             ATTR_SIZE(tag,attrind), ctx);
 467   if (!base_urlpos)
 468     return;
 469   base_urlpos->ignore_when_downloading = 1;
 470   base_urlpos->link_base_p = 1;
 471
 472   if (ctx->base)
 473     xfree (ctx->base);
 474   if (ctx->parent_base)
 475     ctx->base = uri_merge (ctx->parent_base, newbase);
 476   else
 477     ctx->base = xstrdup (newbase);
 478 }
 479
 480 /* Mark the URL found in <form action=...> for conversion. */
 481
 482 static void
 483 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 484 {
 485   int attrind;
 486   char *action = find_attr (tag, "action", &attrind);
 487
 488   if (action)
 489     {
 490       struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
 491                                       ATTR_SIZE(tag,attrind), ctx);
 492       if (up)
 493         up->ignore_when_downloading = 1;
 494     }
 495 }
 496
 497 /* Handle the LINK tag.  It requires special handling because how its
 498    links will be followed in -p mode depends on the REL attribute.  */
 499
 500 static void
 501 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 502 {
 503   int attrind;
 504   char *href = find_attr (tag, "href", &attrind);
 505
 506   /* All <link href="..."> link references are external, except those
 507      known not to be, such as style sheet and shortcut icon:
 508
 509      <link rel="stylesheet" href="...">
 510      <link rel="shortcut icon" href="...">
 511   */
 512   if (href)
 513     {
 514       struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
 515                                       ATTR_SIZE(tag,attrind), ctx);
 516       if (up)
 517         {
 518           char *rel = find_attr (tag, "rel", NULL);
 519           if (rel)
 520             {
 521               if (0 == strcasecmp (rel, "stylesheet"))
 522                 {
 523                   up->link_inline_p = 1;
 524                   up->link_expect_css = 1;
 525                 }
 526               else if (0 == strcasecmp (rel, "shortcut icon"))
 527                 {
 528                   up->link_inline_p = 1;
 529                 }
 530               else
 531                 {
 532                   /* The external ones usually point to HTML pages, such as
 533                      <link rel="next" href="...">
 534                      except when the type attribute says otherwise:
 535                      <link rel="alternate" type="application/rss+xml" href=".../?feed=rss2" />
 536                   */
 537                   char *type = find_attr (tag, "type", NULL);
 538                   if (!type || strcasecmp (type, "text/html") == 0)
 539                     up->link_expect_html = 1;
 540                 }
 541             }
 542         }
 543     }
 544 }
 545
 546 /* Handle the META tag.  This requires special handling because of the
 547    refresh feature and because of robot exclusion.  */
 548
 549 static void
 550 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 551 {
 552   char *name = find_attr (tag, "name", NULL);
 553   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 554
 555   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 556     {
 557       /* Some pages use a META tag to specify that the page be
 558          refreshed by a new page after a given number of seconds.  The
 559          general format for this is:
 560
 561            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 562
 563          So we just need to skip past the "NUMBER; URL=" garbage to
 564          get to the URL.  */
 565
 566       struct urlpos *entry;
 567       int attrind;
 568       int timeout = 0;
 569       char *p;
 570
 571       char *refresh = find_attr (tag, "content", &attrind);
 572       if (!refresh)
 573         return;
 574
 575       for (p = refresh; c_isdigit (*p); p++)
 576         timeout = 10 * timeout + *p - '0';
 577       if (*p++ != ';')
 578         return;
 579
 580       while (c_isspace (*p))
 581         ++p;
 582       if (!(   c_toupper (*p)       == 'U'
 583             && c_toupper (*(p + 1)) == 'R'
 584             && c_toupper (*(p + 2)) == 'L'
 585             &&          *(p + 3)  == '='))
 586         return;
 587       p += 4;
 588       while (c_isspace (*p))
 589         ++p;
 590
 591       entry = append_url (p, ATTR_POS(tag,attrind,ctx),
 592                           ATTR_SIZE(tag,attrind), ctx);
 593       if (entry)
 594         {
 595           entry->link_refresh_p = 1;
 596           entry->refresh_timeout = timeout;
 597           entry->link_expect_html = 1;
 598         }
 599     }
 600   else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
 601     {
 602       /* Handle stuff like:
 603          <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
 604
 605       char *mcharset;
 606       char *content = find_attr (tag, "content", NULL);
 607       if (!content)
 608         return;
 609
 610       mcharset = parse_charset (content);
 611       if (!mcharset)
 612         return;
 613
 614       xfree_null (meta_charset);
 615       meta_charset = mcharset;
 616     }
 617   else if (name && 0 == strcasecmp (name, "robots"))
 618     {
 619       /* Handle stuff like:
 620          <meta name="robots" content="index,nofollow"> */
 621       char *content = find_attr (tag, "content", NULL);
 622       if (!content)
 623         return;
 624       if (!strcasecmp (content, "none"))
 625         ctx->nofollow = true;
 626       else
 627         {
 628           while (*content)
 629             {
 630               char *end;
 631               /* Skip any initial whitespace. */
 632               content += strspn (content, " \f\n\r\t\v");
 633               /* Find the next occurrence of ',' or whitespace,
 634                * or the end of the string.  */
 635               end = content + strcspn (content, ", \f\n\r\t\v");
 636               if (!strncasecmp (content, "nofollow", end - content))
 637                 ctx->nofollow = true;
 638               /* Skip past the next comma, if any. */
 639               if (*end == ',')
 640                 ++end;
 641               else
 642                 {
 643                   end = strchr (end, ',');
 644                   if (end)
 645                     ++end;
 646                   else
 647                     end = content + strlen (content);
 648                 }
 649               content = end;
 650             }
 651         }
 652     }
 653 }
 654
 655 /* Dispatch the tag handler appropriate for the tag we're mapping
 656    over.  See known_tags[] for definition of tag handlers.  */
 657
 658 static void
 659 collect_tags_mapper (struct taginfo *tag, void *arg)
 660 {
 661   struct map_context *ctx = (struct map_context *)arg;
 662
 663   /* Find the tag in our table of tags.  This must not fail because
 664      map_html_tags only returns tags found in interesting_tags.
 665
 666      I've changed this for now, I'm passing NULL as interesting_tags
 667      to map_html_tags.  This way we can check all tags for a style
 668      attribute.
 669   */
 670   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 671
 672   if (t != NULL)
 673     t->handler (t->tagid, tag, ctx);
 674
 675   check_style_attr (tag, ctx);
 676
 677   if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
 678       tag->contents_begin && tag->contents_end)
 679   {
 680     /* parse contents */
 681     get_urls_css (ctx, tag->contents_begin - ctx->text,
 682                   tag->contents_end - tag->contents_begin);
 683   }
 684 }
 685 \f
 686 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 687    it.  It merges relative links in FILE with URL.  It is aware of
 688    <base href=...> and does the right thing.  */
 689
 690 struct urlpos *
 691 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
 692                struct iri *iri)
 693 {
 694   struct file_memory *fm;
 695   struct map_context ctx;
 696   int flags;
 697
 698   /* Load the file. */
 699   fm = wget_read_file (file);
 700   if (!fm)
 701     {
 702       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 703       return NULL;
 704     }
 705   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 706
 707   ctx.text = fm->content;
 708   ctx.head = NULL;
 709   ctx.base = NULL;
 710   ctx.parent_base = url ? url : opt.base_href;
 711   ctx.document_file = file;
 712   ctx.nofollow = false;
 713
 714   if (!interesting_tags)
 715     init_interesting ();
 716
 717   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 718      generate <a href=" foo"> instead of <a href="foo"> (browsers
 719      ignore spaces as well.)  If you really mean space, use &32; or
 720      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 721      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 722      ignored by IE and Mozilla and are presumably introduced by
 723      writing HTML with editors that force word wrap.  */
 724   flags = MHT_TRIM_VALUES;
 725   if (opt.strict_comments)
 726     flags |= MHT_STRICT_COMMENTS;
 727
 728   /* the NULL here used to be interesting_tags */
 729   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 730                  NULL, interesting_attributes);
 731
 732   /* If meta charset isn't null, override content encoding */
 733   if (iri && meta_charset)
 734     set_content_encoding (iri, meta_charset);
 735
 736   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 737   if (meta_disallow_follow)
 738     *meta_disallow_follow = ctx.nofollow;
 739
 740   xfree_null (ctx.base);
 741   wget_read_file_free (fm);
 742   return ctx.head;
 743 }
 744
 745 /* This doesn't really have anything to do with HTML, but it's similar
 746    to get_urls_html, so we put it here.  */
 747
 748 struct urlpos *
 749 get_urls_file (const char *file)
 750 {
 751   struct file_memory *fm;
 752   struct urlpos *head, *tail;
 753   const char *text, *text_end;
 754
 755   /* Load the file.  */
 756   fm = wget_read_file (file);
 757   if (!fm)
 758     {
 759       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 760       return NULL;
 761     }
 762   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 763
 764   head = tail = NULL;
 765   text = fm->content;
 766   text_end = fm->content + fm->length;
 767   while (text < text_end)
 768     {
 769       int up_error_code;
 770       char *url_text;
 771       struct urlpos *entry;
 772       struct url *url;
 773
 774       const char *line_beg = text;
 775       const char *line_end = memchr (text, '\n', text_end - text);
 776       if (!line_end)
 777         line_end = text_end;
 778       else
 779         ++line_end;
 780       text = line_end;
 781
 782       /* Strip whitespace from the beginning and end of line. */
 783       while (line_beg < line_end && c_isspace (*line_beg))
 784         ++line_beg;
 785       while (line_end > line_beg && c_isspace (*(line_end - 1)))
 786         --line_end;
 787
 788       if (line_beg == line_end)
 789         continue;
 790
 791       /* The URL is in the [line_beg, line_end) region. */
 792
 793       /* We must copy the URL to a zero-terminated string, and we
 794          can't use alloca because we're in a loop.  *sigh*.  */
 795       url_text = strdupdelim (line_beg, line_end);
 796
 797       if (opt.base_href)
 798         {
 799           /* Merge opt.base_href with URL. */
 800           char *merged = uri_merge (opt.base_href, url_text);
 801           xfree (url_text);
 802           url_text = merged;
 803         }
 804
 805       url = url_parse (url_text, &up_error_code, NULL, false);
 806       if (!url)
 807         {
 808           char *error = url_error (url_text, up_error_code);
 809           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 810                      file, url_text, error);
 811           xfree (url_text);
 812           xfree (error);
 813           continue;
 814         }
 815       xfree (url_text);
 816
 817       entry = xnew0 (struct urlpos);
 818       entry->url = url;
 819
 820       if (!head)
 821         head = entry;
 822       else
 823         tail->next = entry;
 824       tail = entry;
 825     }
 826   wget_read_file_free (fm);
 827   return head;
 828 }
 829
 830 void
 831 cleanup_html_url (void)
 832 {
 833   /* Destroy the hash tables.  The hash table keys and values are not
 834      allocated by this code, so we don't need to free them here.  */
 835   if (interesting_tags)
 836     hash_table_destroy (interesting_tags);
 837   if (interesting_attributes)
 838     hash_table_destroy (interesting_attributes);
 839 }