sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008, 2009, 2010 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
#include "wget.h"

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <limits.h>
#include <assert.h>

#include "html-parse.h"
#include "url.h"
#include "utils.h"
#include "hash.h"
#include "convert.h"
#include "recur.h"
#include "html-url.h"
#include "css-url.h"
  47
/* Signature shared by all tag handlers.  The arguments are the tag's
   id (one of the TAG_* constants), the parsed tag, and the context in
   which URLs are being collected.  */
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);

/* Forward-declare a static function FUN with the tag_handler_t
   signature, so it can be referenced from known_tags before it is
   defined.  */
#define DECLARE_TAG_HANDLER(fun)                                \
  static void fun (int, struct taginfo *, struct map_context *)

/* Generic handler, driven by tag_url_attributes.  */
DECLARE_TAG_HANDLER (tag_find_urls);
/* Handlers for tags that need special treatment.  */
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
  58
/* Numeric ids for the tags listed in known_tags.  tag_find_urls
   compares these against the tagid column of tag_url_attributes.  */
enum {
  TAG_A,
  TAG_APPLET,
  TAG_AREA,
  TAG_BASE,
  TAG_BGSOUND,
  TAG_BODY,
  TAG_EMBED,
  TAG_FIG,
  TAG_FORM,
  TAG_FRAME,
  TAG_IFRAME,
  TAG_IMG,
  TAG_INPUT,
  TAG_LAYER,
  TAG_LINK,
  TAG_META,
  TAG_OBJECT,
  TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE,
  TAG_TD,
  TAG_TH
};
  83
/* The list of known tags and functions used for handling them.  Most
   tags are simply harvested for URLs.  init_interesting enters every
   row into interesting_tags under its NAME; collect_tags_mapper looks
   the row up again and calls HANDLER with TAGID.  */
static struct known_tag {
  int tagid;                    /* TAG_* id passed to the handler */
  const char *name;             /* tag name, used as the hash key */
  tag_handler_t handler;        /* function invoked for this tag */
} known_tags[] = {
  { TAG_A,       "a",           tag_find_urls },
  { TAG_APPLET,  "applet",      tag_find_urls },
  { TAG_AREA,    "area",        tag_find_urls },
  { TAG_BASE,    "base",        tag_handle_base },
  { TAG_BGSOUND, "bgsound",     tag_find_urls },
  { TAG_BODY,    "body",        tag_find_urls },
  { TAG_EMBED,   "embed",       tag_find_urls },
  { TAG_FIG,     "fig",         tag_find_urls },
  { TAG_FORM,    "form",        tag_handle_form },
  { TAG_FRAME,   "frame",       tag_find_urls },
  { TAG_IFRAME,  "iframe",      tag_find_urls },
  { TAG_IMG,     "img",         tag_find_urls },
  { TAG_INPUT,   "input",       tag_find_urls },
  { TAG_LAYER,   "layer",       tag_find_urls },
  { TAG_LINK,    "link",        tag_handle_link },
  { TAG_META,    "meta",        tag_handle_meta },
  { TAG_OBJECT,  "object",      tag_find_urls },
  { TAG_OVERLAY, "overlay",     tag_find_urls },
  { TAG_SCRIPT,  "script",      tag_find_urls },
  { TAG_TABLE,   "table",       tag_find_urls },
  { TAG_TD,      "td",          tag_find_urls },
  { TAG_TH,      "th",          tag_find_urls }
};
 114
 115 /* tag_url_attributes documents which attributes of which tags contain
 116    URLs to harvest.  It is used by tag_find_urls.  */
 117
 118 /* Defines for the FLAGS. */
 119
 120 /* The link is "inline", i.e. needs to be retrieved for this document
 121    to be correctly rendered.  Inline links include inlined images,
 122    stylesheets, children frames, etc.  */
 123 #define ATTR_INLINE     1
 124
 125 /* The link is expected to yield HTML contents.  It's important not to
 126    try to follow HTML obtained by following e.g. <img src="...">
 127    regardless of content-type.  Doing this causes infinite loops for
 128    "images" that return non-404 error pages with links to the same
 129    image.  */
 130 #define ATTR_HTML       2
 131
 132 /* For tags handled by tag_find_urls: attributes that contain URLs to
 133    download. */
 134 static struct {
 135   int tagid;
 136   const char *attr_name;
 137   int flags;
 138 } tag_url_attributes[] = {
 139   { TAG_A,              "href",         ATTR_HTML },
 140   { TAG_APPLET,         "code",         ATTR_INLINE },
 141   { TAG_AREA,           "href",         ATTR_HTML },
 142   { TAG_BGSOUND,        "src",          ATTR_INLINE },
 143   { TAG_BODY,           "background",   ATTR_INLINE },
 144   { TAG_EMBED,          "href",         ATTR_HTML },
 145   { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
 146   { TAG_FIG,            "src",          ATTR_INLINE },
 147   { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
 148   { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
 149   { TAG_IMG,            "href",         ATTR_INLINE },
 150   { TAG_IMG,            "lowsrc",       ATTR_INLINE },
 151   { TAG_IMG,            "src",          ATTR_INLINE },
 152   { TAG_INPUT,          "src",          ATTR_INLINE },
 153   { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
 154   { TAG_OBJECT,         "data",         ATTR_INLINE },
 155   { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
 156   { TAG_SCRIPT,         "src",          ATTR_INLINE },
 157   { TAG_TABLE,          "background",   ATTR_INLINE },
 158   { TAG_TD,             "background",   ATTR_INLINE },
 159   { TAG_TH,             "background",   ATTR_INLINE }
 160 };
 161
 162 /* The lists of interesting tags and attributes are built dynamically,
 163    from the information above.  However, some places in the code refer
 164    to the attributes not mentioned here.  We add them manually.  */
 165 static const char *additional_attributes[] = {
 166   "rel",                        /* used by tag_handle_link  */
 167   "http-equiv",                 /* used by tag_handle_meta  */
 168   "name",                       /* used by tag_handle_meta  */
 169   "content",                    /* used by tag_handle_meta  */
 170   "action",                     /* used by tag_handle_form  */
 171   "style"                       /* used by check_style_attr */
 172 };
 173
/* Hash tables built by init_interesting: tag name -> known_tags
   entry, and the set of attribute names we care about.  Both are
   NULL until init_interesting has run.  */
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;

/* Contains the (last) charset found in an 'http-equiv=content-type'
   meta tag, or NULL when there is none.  */
static char *meta_charset;
 180
 181 static void
 182 init_interesting (void)
 183 {
 184   /* Init the variables interesting_tags and interesting_attributes
 185      that are used by the HTML parser to know which tags and
 186      attributes we're interested in.  We initialize this only once,
 187      for performance reasons.
 188
 189      Here we also make sure that what we put in interesting_tags
 190      matches the user's preferences as specified through --ignore-tags
 191      and --follow-tags.  */
 192
 193   size_t i;
 194   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 195
 196   /* First, add all the tags we know hot to handle, mapped to their
 197      respective entries in known_tags.  */
 198   for (i = 0; i < countof (known_tags); i++)
 199     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 200
 201   /* Then remove the tags ignored through --ignore-tags.  */
 202   if (opt.ignore_tags)
 203     {
 204       char **ignored;
 205       for (ignored = opt.ignore_tags; *ignored; ignored++)
 206         hash_table_remove (interesting_tags, *ignored);
 207     }
 208
 209   /* If --follow-tags is specified, use only those tags.  */
 210   if (opt.follow_tags)
 211     {
 212       /* Create a new table intersecting --follow-tags and known_tags,
 213          and use it as interesting_tags.  */
 214       struct hash_table *intersect = make_nocase_string_hash_table (0);
 215       char **followed;
 216       for (followed = opt.follow_tags; *followed; followed++)
 217         {
 218           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 219           if (!t)
 220             continue;           /* ignore unknown --follow-tags entries. */
 221           hash_table_put (intersect, *followed, t);
 222         }
 223       hash_table_destroy (interesting_tags);
 224       interesting_tags = intersect;
 225     }
 226
 227   /* Add the attributes we care about. */
 228   interesting_attributes = make_nocase_string_hash_table (10);
 229   for (i = 0; i < countof (additional_attributes); i++)
 230     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 231   for (i = 0; i < countof (tag_url_attributes); i++)
 232     hash_table_put (interesting_attributes,
 233                     tag_url_attributes[i].attr_name, "1");
 234 }
 235
 236 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 237    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 238    index of the attribute in TAG will be stored there.  */
 239
 240 static char *
 241 find_attr (struct taginfo *tag, const char *name, int *attrind)
 242 {
 243   int i;
 244   for (i = 0; i < tag->nattrs; i++)
 245     if (!strcasecmp (tag->attrs[i].name, name))
 246       {
 247         if (attrind)
 248           *attrind = i;
 249         return tag->attrs[i].value;
 250       }
 251   return NULL;
 252 }
 253
 254 /* used for calls to append_url */
 255 #define ATTR_POS(tag, attrind, ctx) \
 256  (tag->attrs[attrind].value_raw_beginning - ctx->text)
 257 #define ATTR_SIZE(tag, attrind) \
 258  (tag->attrs[attrind].value_raw_size)
 259
 260 /* Append LINK_URI to the urlpos structure that is being built.
 261
 262    LINK_URI will be merged with the current document base.
 263 */
 264
 265 struct urlpos *
 266 append_url (const char *link_uri, int position, int size,
 267             struct map_context *ctx)
 268 {
 269   int link_has_scheme = url_has_scheme (link_uri);
 270   struct urlpos *newel;
 271   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 272   struct url *url;
 273
 274   if (!base)
 275     {
 276       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 277                ctx->document_file, link_uri));
 278
 279       if (!link_has_scheme)
 280         {
 281           /* Base URL is unavailable, and the link does not have a
 282              location attached to it -- we have to give up.  Since
 283              this can only happen when using `--force-html -i', print
 284              a warning.  */
 285           logprintf (LOG_NOTQUIET,
 286                      _("%s: Cannot resolve incomplete link %s.\n"),
 287                      ctx->document_file, link_uri);
 288           return NULL;
 289         }
 290
 291       url = url_parse (link_uri, NULL, NULL, false);
 292       if (!url)
 293         {
 294           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 295                    ctx->document_file, link_uri));
 296           return NULL;
 297         }
 298     }
 299   else
 300     {
 301       /* Merge BASE with LINK_URI, but also make sure the result is
 302          canonicalized, i.e. that "../" have been resolved.
 303          (parse_url will do that for us.) */
 304
 305       char *complete_uri = uri_merge (base, link_uri);
 306
 307       DEBUGP (("%s: merge(%s, %s) -> %s\n",
 308                quotearg_n_style (0, escape_quoting_style, ctx->document_file),
 309                quote_n (1, base),
 310                quote_n (2, link_uri),
 311                quotearg_n_style (3, escape_quoting_style, complete_uri)));
 312
 313       url = url_parse (complete_uri, NULL, NULL, false);
 314       if (!url)
 315         {
 316           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 317                    ctx->document_file, complete_uri));
 318           xfree (complete_uri);
 319           return NULL;
 320         }
 321       xfree (complete_uri);
 322     }
 323
 324   DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
 325
 326   newel = xnew0 (struct urlpos);
 327   newel->url = url;
 328   newel->pos = position;
 329   newel->size = size;
 330
 331   /* A URL is relative if the host is not named, and the name does not
 332      start with `/'.  */
 333   if (!link_has_scheme && *link_uri != '/')
 334     newel->link_relative_p = 1;
 335   else if (link_has_scheme)
 336     newel->link_complete_p = 1;
 337
 338   /* Append the new URL maintaining the order by position.  */
 339   if (ctx->head == NULL)
 340     ctx->head = newel;
 341   else
 342     {
 343       struct urlpos *it, *prev = NULL;
 344
 345       it = ctx->head;
 346       while (it && position > it->pos)
 347         {
 348           prev = it;
 349           it = it->next;
 350         }
 351
 352       newel->next = it;
 353
 354       if (prev)
 355         prev->next = newel;
 356       else
 357         ctx->head = newel;
 358     }
 359
 360   return newel;
 361 }
 362 \f
 363 static void
 364 check_style_attr (struct taginfo *tag, struct map_context *ctx)
 365 {
 366   int attrind;
 367   int raw_start;
 368   int raw_len;
 369   char *style = find_attr (tag, "style", &attrind);
 370   if (!style)
 371     return;
 372
 373   /* raw pos and raw size include the quotes, skip them when they are
 374      present.  */
 375   raw_start = ATTR_POS (tag, attrind, ctx);
 376   raw_len  = ATTR_SIZE (tag, attrind);
 377   if( *(char *)(ctx->text + raw_start) == '\''
 378       || *(char *)(ctx->text + raw_start) == '"')
 379     {
 380       raw_start += 1;
 381       raw_len -= 2;
 382     }
 383
 384   if(raw_len <= 0)
 385        return;
 386
 387   get_urls_css (ctx, raw_start, raw_len);
 388 }
 389
 390 /* All the tag_* functions are called from collect_tags_mapper, as
 391    specified by KNOWN_TAGS.  */
 392
 393 /* Default tag handler: collect URLs from attributes specified for
 394    this tag by tag_url_attributes.  */
 395
 396 static void
 397 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 398 {
 399   size_t i;
 400   int attrind;
 401   int first = -1;
 402
 403   for (i = 0; i < countof (tag_url_attributes); i++)
 404     if (tag_url_attributes[i].tagid == tagid)
 405       {
 406         /* We've found the index of tag_url_attributes where the
 407            attributes of our tag begin.  */
 408         first = i;
 409         break;
 410       }
 411   assert (first != -1);
 412
 413   /* Loop over the "interesting" attributes of this tag.  In this
 414      example, it will loop over "src" and "lowsrc".
 415
 416        <img src="foo.png" lowsrc="bar.png">
 417
 418      This has to be done in the outer loop so that the attributes are
 419      processed in the same order in which they appear in the page.
 420      This is required when converting links.  */
 421
 422   for (attrind = 0; attrind < tag->nattrs; attrind++)
 423     {
 424       /* Find whether TAG/ATTRIND is a combination that contains a
 425          URL. */
 426       char *link = tag->attrs[attrind].value;
 427       const size_t size = countof (tag_url_attributes);
 428
 429       /* If you're cringing at the inefficiency of the nested loops,
 430          remember that they both iterate over a very small number of
 431          items.  The worst-case inner loop is for the IMG tag, which
 432          has three attributes.  */
 433       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 434         {
 435           if (0 == strcasecmp (tag->attrs[attrind].name,
 436                                tag_url_attributes[i].attr_name))
 437             {
 438               struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
 439                                               ATTR_SIZE(tag,attrind), ctx);
 440               if (up)
 441                 {
 442                   int flags = tag_url_attributes[i].flags;
 443                   if (flags & ATTR_INLINE)
 444                     up->link_inline_p = 1;
 445                   if (flags & ATTR_HTML)
 446                     up->link_expect_html = 1;
 447                 }
 448             }
 449         }
 450     }
 451 }
 452
 453 /* Handle the BASE tag, for <base href=...>. */
 454
 455 static void
 456 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 457 {
 458   struct urlpos *base_urlpos;
 459   int attrind;
 460   char *newbase = find_attr (tag, "href", &attrind);
 461   if (!newbase)
 462     return;
 463
 464   base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
 465                             ATTR_SIZE(tag,attrind), ctx);
 466   if (!base_urlpos)
 467     return;
 468   base_urlpos->ignore_when_downloading = 1;
 469   base_urlpos->link_base_p = 1;
 470
 471   if (ctx->base)
 472     xfree (ctx->base);
 473   if (ctx->parent_base)
 474     ctx->base = uri_merge (ctx->parent_base, newbase);
 475   else
 476     ctx->base = xstrdup (newbase);
 477 }
 478
 479 /* Mark the URL found in <form action=...> for conversion. */
 480
 481 static void
 482 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 483 {
 484   int attrind;
 485   char *action = find_attr (tag, "action", &attrind);
 486
 487   if (action)
 488     {
 489       struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
 490                                       ATTR_SIZE(tag,attrind), ctx);
 491       if (up)
 492         up->ignore_when_downloading = 1;
 493     }
 494 }
 495
 496 /* Handle the LINK tag.  It requires special handling because how its
 497    links will be followed in -p mode depends on the REL attribute.  */
 498
 499 static void
 500 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 501 {
 502   int attrind;
 503   char *href = find_attr (tag, "href", &attrind);
 504
 505   /* All <link href="..."> link references are external, except those
 506      known not to be, such as style sheet and shortcut icon:
 507
 508        <link rel="stylesheet" href="...">
 509        <link rel="shortcut icon" href="...">
 510   */
 511   if (href)
 512     {
 513       struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
 514                                       ATTR_SIZE(tag,attrind), ctx);
 515       if (up)
 516         {
 517           char *rel = find_attr (tag, "rel", NULL);
 518           if (rel)
 519             {
 520               if (0 == strcasecmp (rel, "stylesheet"))
 521                 {
 522                   up->link_inline_p = 1;
 523                   up->link_expect_css = 1;
 524                 }
 525               else if (0 == strcasecmp (rel, "shortcut icon"))
 526                 {
 527                   up->link_inline_p = 1;
 528                 }
 529             }
 530           else
 531             /* The external ones usually point to HTML pages, such as
 532                <link rel="next" href="..."> */
 533             up->link_expect_html = 1;
 534         }
 535     }
 536 }
 537
 538 /* Handle the META tag.  This requires special handling because of the
 539    refresh feature and because of robot exclusion.  */
 540
 541 static void
 542 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 543 {
 544   char *name = find_attr (tag, "name", NULL);
 545   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 546
 547   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 548     {
 549       /* Some pages use a META tag to specify that the page be
 550          refreshed by a new page after a given number of seconds.  The
 551          general format for this is:
 552
 553            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 554
 555          So we just need to skip past the "NUMBER; URL=" garbage to
 556          get to the URL.  */
 557
 558       struct urlpos *entry;
 559       int attrind;
 560       int timeout = 0;
 561       char *p;
 562
 563       char *refresh = find_attr (tag, "content", &attrind);
 564       if (!refresh)
 565         return;
 566
 567       for (p = refresh; c_isdigit (*p); p++)
 568         timeout = 10 * timeout + *p - '0';
 569       if (*p++ != ';')
 570         return;
 571
 572       while (c_isspace (*p))
 573         ++p;
 574       if (!(   c_toupper (*p)       == 'U'
 575             && c_toupper (*(p + 1)) == 'R'
 576             && c_toupper (*(p + 2)) == 'L'
 577             &&          *(p + 3)  == '='))
 578         return;
 579       p += 4;
 580       while (c_isspace (*p))
 581         ++p;
 582
 583       entry = append_url (p, ATTR_POS(tag,attrind,ctx),
 584                           ATTR_SIZE(tag,attrind), ctx);
 585       if (entry)
 586         {
 587           entry->link_refresh_p = 1;
 588           entry->refresh_timeout = timeout;
 589           entry->link_expect_html = 1;
 590         }
 591     }
 592   else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
 593     {
 594       /* Handle stuff like:
 595          <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
 596
 597       char *mcharset;
 598       char *content = find_attr (tag, "content", NULL);
 599       if (!content)
 600         return;
 601
 602       mcharset = parse_charset (content);
 603       if (!mcharset)
 604         return;
 605
 606       xfree_null (meta_charset);
 607       meta_charset = mcharset;
 608     }
 609   else if (name && 0 == strcasecmp (name, "robots"))
 610     {
 611       /* Handle stuff like:
 612          <meta name="robots" content="index,nofollow"> */
 613       char *content = find_attr (tag, "content", NULL);
 614       if (!content)
 615         return;
 616       if (!strcasecmp (content, "none"))
 617         ctx->nofollow = true;
 618       else
 619         {
 620           while (*content)
 621             {
 622               char *end;
 623               /* Skip any initial whitespace. */
 624               content += strspn (content, " \f\n\r\t\v");
 625               /* Find the next occurrence of ',' or whitespace,
 626                * or the end of the string.  */
 627               end = content + strcspn (content, ", \f\n\r\t\v");
 628               if (!strncasecmp (content, "nofollow", end - content))
 629                 ctx->nofollow = true;
 630               /* Skip past the next comma, if any. */
 631               if (*end == ',')
 632                 ++end;
 633               else
 634                 {
 635                   end = strchr (end, ',');
 636                   if (end)
 637                     ++end;
 638                   else
 639                     end = content + strlen (content);
 640                 }
 641               content = end;
 642             }
 643         }
 644     }
 645 }
 646
 647 /* Dispatch the tag handler appropriate for the tag we're mapping
 648    over.  See known_tags[] for definition of tag handlers.  */
 649
 650 static void
 651 collect_tags_mapper (struct taginfo *tag, void *arg)
 652 {
 653   struct map_context *ctx = (struct map_context *)arg;
 654
 655   /* Find the tag in our table of tags.  This must not fail because
 656      map_html_tags only returns tags found in interesting_tags.
 657
 658      I've changed this for now, I'm passing NULL as interesting_tags
 659      to map_html_tags.  This way we can check all tags for a style
 660      attribute.
 661   */
 662   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 663
 664   if (t != NULL)
 665     t->handler (t->tagid, tag, ctx);
 666
 667   check_style_attr (tag, ctx);
 668
 669   if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
 670       tag->contents_begin && tag->contents_end)
 671   {
 672     /* parse contents */
 673     get_urls_css (ctx, tag->contents_begin - ctx->text,
 674                   tag->contents_end - tag->contents_begin);
 675   }
 676 }
 677 \f
 678 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 679    it.  It merges relative links in FILE with URL.  It is aware of
 680    <base href=...> and does the right thing.  */
 681
 682 struct urlpos *
 683 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
 684                struct iri *iri)
 685 {
 686   struct file_memory *fm;
 687   struct map_context ctx;
 688   int flags;
 689
 690   /* Load the file. */
 691   fm = wget_read_file (file);
 692   if (!fm)
 693     {
 694       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 695       return NULL;
 696     }
 697   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 698
 699   ctx.text = fm->content;
 700   ctx.head = NULL;
 701   ctx.base = NULL;
 702   ctx.parent_base = url ? url : opt.base_href;
 703   ctx.document_file = file;
 704   ctx.nofollow = false;
 705
 706   if (!interesting_tags)
 707     init_interesting ();
 708
 709   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 710      generate <a href=" foo"> instead of <a href="foo"> (browsers
 711      ignore spaces as well.)  If you really mean space, use &32; or
 712      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 713      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 714      ignored by IE and Mozilla and are presumably introduced by
 715      writing HTML with editors that force word wrap.  */
 716   flags = MHT_TRIM_VALUES;
 717   if (opt.strict_comments)
 718     flags |= MHT_STRICT_COMMENTS;
 719
 720   /* the NULL here used to be interesting_tags */
 721   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 722                  NULL, interesting_attributes);
 723
 724   /* If meta charset isn't null, override content encoding */
 725   if (iri && meta_charset)
 726     set_content_encoding (iri, meta_charset);
 727
 728   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 729   if (meta_disallow_follow)
 730     *meta_disallow_follow = ctx.nofollow;
 731
 732   xfree_null (ctx.base);
 733   wget_read_file_free (fm);
 734   return ctx.head;
 735 }
 736
 737 /* This doesn't really have anything to do with HTML, but it's similar
 738    to get_urls_html, so we put it here.  */
 739
 740 struct urlpos *
 741 get_urls_file (const char *file)
 742 {
 743   struct file_memory *fm;
 744   struct urlpos *head, *tail;
 745   const char *text, *text_end;
 746
 747   /* Load the file.  */
 748   fm = wget_read_file (file);
 749   if (!fm)
 750     {
 751       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 752       return NULL;
 753     }
 754   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 755
 756   head = tail = NULL;
 757   text = fm->content;
 758   text_end = fm->content + fm->length;
 759   while (text < text_end)
 760     {
 761       int up_error_code;
 762       char *url_text;
 763       struct urlpos *entry;
 764       struct url *url;
 765
 766       const char *line_beg = text;
 767       const char *line_end = memchr (text, '\n', text_end - text);
 768       if (!line_end)
 769         line_end = text_end;
 770       else
 771         ++line_end;
 772       text = line_end;
 773
 774       /* Strip whitespace from the beginning and end of line. */
 775       while (line_beg < line_end && c_isspace (*line_beg))
 776         ++line_beg;
 777       while (line_end > line_beg && c_isspace (*(line_end - 1)))
 778         --line_end;
 779
 780       if (line_beg == line_end)
 781         continue;
 782
 783       /* The URL is in the [line_beg, line_end) region. */
 784
 785       /* We must copy the URL to a zero-terminated string, and we
 786          can't use alloca because we're in a loop.  *sigh*.  */
 787       url_text = strdupdelim (line_beg, line_end);
 788
 789       if (opt.base_href)
 790         {
 791           /* Merge opt.base_href with URL. */
 792           char *merged = uri_merge (opt.base_href, url_text);
 793           xfree (url_text);
 794           url_text = merged;
 795         }
 796
 797       url = url_parse (url_text, &up_error_code, NULL, false);
 798       if (!url)
 799         {
 800           char *error = url_error (url_text, up_error_code);
 801           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 802                      file, url_text, error);
 803           xfree (url_text);
 804           xfree (error);
 805           continue;
 806         }
 807       xfree (url_text);
 808
 809       entry = xnew0 (struct urlpos);
 810       entry->url = url;
 811
 812       if (!head)
 813         head = entry;
 814       else
 815         tail->next = entry;
 816       tail = entry;
 817     }
 818   wget_read_file_free (fm);
 819   return head;
 820 }
 821
 822 void
 823 cleanup_html_url (void)
 824 {
 825   /* Destroy the hash tables.  The hash table keys and values are not
 826      allocated by this code, so we don't need to free them here.  */
 827   if (interesting_tags)
 828     hash_table_destroy (interesting_tags);
 829   if (interesting_attributes)
 830     hash_table_destroy (interesting_attributes);
 831 }