sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
#include "wget.h"

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include <limits.h>

#include "html-parse.h"
#include "url.h"
#include "utils.h"
#include "hash.h"
#include "convert.h"
#include "recur.h"
#include "html-url.h"
#include "css-url.h"
  47
/* Signature shared by every tag handler.  The arguments are the tag id
   (one of the TAG_* enum values), the parsed tag, and the context in
   which URLs are being collected.  */
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);

/* Forward-declare a static function with the tag_handler_t signature,
   so that it can be named in the known_tags table before it is
   defined.  */
#define DECLARE_TAG_HANDLER(fun)                                \
  static void fun (int, struct taginfo *, struct map_context *)

DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
  58
/* Numeric ids of the tags this file knows about.  They are stored in
   the `tagid' field of known_tags and tag_url_attributes and passed as
   the first argument to the tag handlers.  */
enum {
  TAG_A,
  TAG_APPLET,
  TAG_AREA,
  TAG_BASE,
  TAG_BGSOUND,
  TAG_BODY,
  TAG_EMBED,
  TAG_FIG,
  TAG_FORM,
  TAG_FRAME,
  TAG_IFRAME,
  TAG_IMG,
  TAG_INPUT,
  TAG_LAYER,
  TAG_LINK,
  TAG_META,
  TAG_OBJECT,
  TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE,
  TAG_TD,
  TAG_TH
};
  83
/* The list of known tags and functions used for handling them.  Most
   tags are simply harvested for URLs. */
static struct known_tag {
  int tagid;                    /* one of the TAG_* enum values */
  const char *name;             /* tag name; key in interesting_tags */
  tag_handler_t handler;        /* invoked by collect_tags_mapper */
} known_tags[] = {
  { TAG_A,       "a",           tag_find_urls },
  { TAG_APPLET,  "applet",      tag_find_urls },
  { TAG_AREA,    "area",        tag_find_urls },
  { TAG_BASE,    "base",        tag_handle_base },
  { TAG_BGSOUND, "bgsound",     tag_find_urls },
  { TAG_BODY,    "body",        tag_find_urls },
  { TAG_EMBED,   "embed",       tag_find_urls },
  { TAG_FIG,     "fig",         tag_find_urls },
  { TAG_FORM,    "form",        tag_handle_form },
  { TAG_FRAME,   "frame",       tag_find_urls },
  { TAG_IFRAME,  "iframe",      tag_find_urls },
  { TAG_IMG,     "img",         tag_find_urls },
  { TAG_INPUT,   "input",       tag_find_urls },
  { TAG_LAYER,   "layer",       tag_find_urls },
  { TAG_LINK,    "link",        tag_handle_link },
  { TAG_META,    "meta",        tag_handle_meta },
  { TAG_OBJECT,  "object",      tag_find_urls },
  { TAG_OVERLAY, "overlay",     tag_find_urls },
  { TAG_SCRIPT,  "script",      tag_find_urls },
  { TAG_TABLE,   "table",       tag_find_urls },
  { TAG_TD,      "td",          tag_find_urls },
  { TAG_TH,      "th",          tag_find_urls }
};
 114
/* tag_url_attributes documents which attributes of which tags contain
   URLs to harvest.  It is used by tag_find_urls.  */

/* Defines for the FLAGS. */

/* The link is "inline", i.e. needs to be retrieved for this document
   to be correctly rendered.  Inline links include inlined images,
   stylesheets, children frames, etc.  */
#define ATTR_INLINE     1

/* The link is expected to yield HTML contents.  It's important not to
   try to follow HTML obtained by following e.g. <img src="...">
   regardless of content-type.  Doing this causes infinite loops for
   "images" that return non-404 error pages with links to the same
   image.  */
#define ATTR_HTML       2

/* For tags handled by tag_find_urls: attributes that contain URLs to
   download.

   Ordering matters: tag_find_urls locates the first entry for a tag
   and then walks forward only while the tagid stays the same, so all
   entries belonging to one tag must be adjacent.  Every tag whose
   handler is tag_find_urls needs at least one entry here, otherwise
   the assertion in tag_find_urls fails.  */
static struct {
  int tagid;                    /* one of the TAG_* enum values */
  const char *attr_name;        /* attribute holding the URL */
  int flags;                    /* bitwise OR of ATTR_INLINE / ATTR_HTML */
} tag_url_attributes[] = {
  { TAG_A,              "href",         ATTR_HTML },
  { TAG_APPLET,         "code",         ATTR_INLINE },
  { TAG_AREA,           "href",         ATTR_HTML },
  { TAG_BGSOUND,        "src",          ATTR_INLINE },
  { TAG_BODY,           "background",   ATTR_INLINE },
  { TAG_EMBED,          "href",         ATTR_HTML },
  { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_FIG,            "src",          ATTR_INLINE },
  { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IMG,            "href",         ATTR_INLINE },
  { TAG_IMG,            "lowsrc",       ATTR_INLINE },
  { TAG_IMG,            "src",          ATTR_INLINE },
  { TAG_INPUT,          "src",          ATTR_INLINE },
  { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_OBJECT,         "data",         ATTR_INLINE },
  { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_SCRIPT,         "src",          ATTR_INLINE },
  { TAG_TABLE,          "background",   ATTR_INLINE },
  { TAG_TD,             "background",   ATTR_INLINE },
  { TAG_TH,             "background",   ATTR_INLINE }
};
 161
/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, some places in the code refer
   to the attributes not mentioned here.  We add them manually.  */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link  */
  "http-equiv",                 /* used by tag_handle_meta  */
  "name",                       /* used by tag_handle_meta  */
  "content",                    /* used by tag_handle_meta  */
  "action",                     /* used by tag_handle_form  */
  "style"                       /* used by check_style_attr */
};

/* Case-insensitive tables filled in by init_interesting the first time
   get_urls_html runs, and destroyed by cleanup_html_url.
   interesting_tags maps a tag name to its known_tags entry;
   interesting_attributes is the set of attribute names handed to
   map_html_tags.  */
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;
 176
 177 static void
 178 init_interesting (void)
 179 {
 180   /* Init the variables interesting_tags and interesting_attributes
 181      that are used by the HTML parser to know which tags and
 182      attributes we're interested in.  We initialize this only once,
 183      for performance reasons.
 184
 185      Here we also make sure that what we put in interesting_tags
 186      matches the user's preferences as specified through --ignore-tags
 187      and --follow-tags.  */
 188
 189   size_t i;
 190   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 191
 192   /* First, add all the tags we know hot to handle, mapped to their
 193      respective entries in known_tags.  */
 194   for (i = 0; i < countof (known_tags); i++)
 195     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 196
 197   /* Then remove the tags ignored through --ignore-tags.  */
 198   if (opt.ignore_tags)
 199     {
 200       char **ignored;
 201       for (ignored = opt.ignore_tags; *ignored; ignored++)
 202         hash_table_remove (interesting_tags, *ignored);
 203     }
 204
 205   /* If --follow-tags is specified, use only those tags.  */
 206   if (opt.follow_tags)
 207     {
 208       /* Create a new table intersecting --follow-tags and known_tags,
 209          and use it as interesting_tags.  */
 210       struct hash_table *intersect = make_nocase_string_hash_table (0);
 211       char **followed;
 212       for (followed = opt.follow_tags; *followed; followed++)
 213         {
 214           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 215           if (!t)
 216             continue;           /* ignore unknown --follow-tags entries. */
 217           hash_table_put (intersect, *followed, t);
 218         }
 219       hash_table_destroy (interesting_tags);
 220       interesting_tags = intersect;
 221     }
 222
 223   /* Add the attributes we care about. */
 224   interesting_attributes = make_nocase_string_hash_table (10);
 225   for (i = 0; i < countof (additional_attributes); i++)
 226     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 227   for (i = 0; i < countof (tag_url_attributes); i++)
 228     hash_table_put (interesting_attributes,
 229                     tag_url_attributes[i].attr_name, "1");
 230 }
 231
 232 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 233    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 234    index of the attribute in TAG will be stored there.  */
 235
 236 static char *
 237 find_attr (struct taginfo *tag, const char *name, int *attrind)
 238 {
 239   int i;
 240   for (i = 0; i < tag->nattrs; i++)
 241     if (!strcasecmp (tag->attrs[i].name, name))
 242       {
 243         if (attrind)
 244           *attrind = i;
 245         return tag->attrs[i].value;
 246       }
 247   return NULL;
 248 }
 249
 250 /* used for calls to append_url */
 251 #define ATTR_POS(tag, attrind, ctx) \
 252  (tag->attrs[attrind].value_raw_beginning - ctx->text)
 253 #define ATTR_SIZE(tag, attrind) \
 254  (tag->attrs[attrind].value_raw_size)
 255
 256 /* Append LINK_URI to the urlpos structure that is being built.
 257
 258    LINK_URI will be merged with the current document base.
 259 */
 260
 261 struct urlpos *
 262 append_url (const char *link_uri, int position, int size,
 263             struct map_context *ctx)
 264 {
 265   int link_has_scheme = url_has_scheme (link_uri);
 266   struct urlpos *newel;
 267   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 268   struct url *url;
 269
 270   if (!base)
 271     {
 272       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 273                ctx->document_file, link_uri));
 274
 275       if (!link_has_scheme)
 276         {
 277           /* Base URL is unavailable, and the link does not have a
 278              location attached to it -- we have to give up.  Since
 279              this can only happen when using `--force-html -i', print
 280              a warning.  */
 281           logprintf (LOG_NOTQUIET,
 282                      _("%s: Cannot resolve incomplete link %s.\n"),
 283                      ctx->document_file, link_uri);
 284           return NULL;
 285         }
 286
 287       url = url_parse (link_uri, NULL);
 288       if (!url)
 289         {
 290           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 291                    ctx->document_file, link_uri));
 292           return NULL;
 293         }
 294     }
 295   else
 296     {
 297       /* Merge BASE with LINK_URI, but also make sure the result is
 298          canonicalized, i.e. that "../" have been resolved.
 299          (parse_url will do that for us.) */
 300
 301       char *complete_uri = uri_merge (base, link_uri);
 302
 303       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 304                ctx->document_file, base, link_uri, complete_uri));
 305
 306       url = url_parse (complete_uri, NULL);
 307       if (!url)
 308         {
 309           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 310                    ctx->document_file, complete_uri));
 311           xfree (complete_uri);
 312           return NULL;
 313         }
 314       xfree (complete_uri);
 315     }
 316
 317   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 318
 319   newel = xnew0 (struct urlpos);
 320   newel->url = url;
 321   newel->pos = position;
 322   newel->size = size;
 323
 324   /* A URL is relative if the host is not named, and the name does not
 325      start with `/'.  */
 326   if (!link_has_scheme && *link_uri != '/')
 327     newel->link_relative_p = 1;
 328   else if (link_has_scheme)
 329     newel->link_complete_p = 1;
 330
 331   if (ctx->tail)
 332     {
 333       ctx->tail->next = newel;
 334       ctx->tail = newel;
 335     }
 336   else
 337     ctx->tail = ctx->head = newel;
 338
 339   return newel;
 340 }
 341 \f
 342 static void
 343 check_style_attr (struct taginfo *tag, struct map_context *ctx)
 344 {
 345   int attrind;
 346   char *style = find_attr (tag, "style", &attrind);
 347   if (!style)
 348     return;
 349
 350   /* raw pos and raw size include the quotes, hence the +1 -2 */
 351   get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2);
 352 }
 353
 354 /* All the tag_* functions are called from collect_tags_mapper, as
 355    specified by KNOWN_TAGS.  */
 356
 357 /* Default tag handler: collect URLs from attributes specified for
 358    this tag by tag_url_attributes.  */
 359
 360 static void
 361 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 362 {
 363   size_t i;
 364   int attrind;
 365   int first = -1;
 366
 367   for (i = 0; i < countof (tag_url_attributes); i++)
 368     if (tag_url_attributes[i].tagid == tagid)
 369       {
 370         /* We've found the index of tag_url_attributes where the
 371            attributes of our tag begin.  */
 372         first = i;
 373         break;
 374       }
 375   assert (first != -1);
 376
 377   /* Loop over the "interesting" attributes of this tag.  In this
 378      example, it will loop over "src" and "lowsrc".
 379
 380        <img src="foo.png" lowsrc="bar.png">
 381
 382      This has to be done in the outer loop so that the attributes are
 383      processed in the same order in which they appear in the page.
 384      This is required when converting links.  */
 385
 386   for (attrind = 0; attrind < tag->nattrs; attrind++)
 387     {
 388       /* Find whether TAG/ATTRIND is a combination that contains a
 389          URL. */
 390       char *link = tag->attrs[attrind].value;
 391       const size_t size = countof (tag_url_attributes);
 392
 393       /* If you're cringing at the inefficiency of the nested loops,
 394          remember that they both iterate over a very small number of
 395          items.  The worst-case inner loop is for the IMG tag, which
 396          has three attributes.  */
 397       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 398         {
 399           if (0 == strcasecmp (tag->attrs[attrind].name,
 400                                tag_url_attributes[i].attr_name))
 401             {
 402               struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
 403                                               ATTR_SIZE(tag,attrind), ctx);
 404               if (up)
 405                 {
 406                   int flags = tag_url_attributes[i].flags;
 407                   if (flags & ATTR_INLINE)
 408                     up->link_inline_p = 1;
 409                   if (flags & ATTR_HTML)
 410                     up->link_expect_html = 1;
 411                 }
 412             }
 413         }
 414     }
 415 }
 416
 417 /* Handle the BASE tag, for <base href=...>. */
 418
 419 static void
 420 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 421 {
 422   struct urlpos *base_urlpos;
 423   int attrind;
 424   char *newbase = find_attr (tag, "href", &attrind);
 425   if (!newbase)
 426     return;
 427
 428   base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
 429                             ATTR_SIZE(tag,attrind), ctx);
 430   if (!base_urlpos)
 431     return;
 432   base_urlpos->ignore_when_downloading = 1;
 433   base_urlpos->link_base_p = 1;
 434
 435   if (ctx->base)
 436     xfree (ctx->base);
 437   if (ctx->parent_base)
 438     ctx->base = uri_merge (ctx->parent_base, newbase);
 439   else
 440     ctx->base = xstrdup (newbase);
 441 }
 442
 443 /* Mark the URL found in <form action=...> for conversion. */
 444
 445 static void
 446 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 447 {
 448   int attrind;
 449   char *action = find_attr (tag, "action", &attrind);
 450
 451   if (action)
 452     {
 453       struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
 454                                       ATTR_SIZE(tag,attrind), ctx);
 455       if (up)
 456         up->ignore_when_downloading = 1;
 457     }
 458 }
 459
 460 /* Handle the LINK tag.  It requires special handling because how its
 461    links will be followed in -p mode depends on the REL attribute.  */
 462
 463 static void
 464 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 465 {
 466   int attrind;
 467   char *href = find_attr (tag, "href", &attrind);
 468
 469   /* All <link href="..."> link references are external, except those
 470      known not to be, such as style sheet and shortcut icon:
 471
 472        <link rel="stylesheet" href="...">
 473        <link rel="shortcut icon" href="...">
 474   */
 475   if (href)
 476     {
 477       struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
 478                                       ATTR_SIZE(tag,attrind), ctx);
 479       if (up)
 480         {
 481           char *rel = find_attr (tag, "rel", NULL);
 482           if (rel)
 483             {
 484               if (0 == strcasecmp (rel, "stylesheet"))
 485                 {
 486                   up->link_inline_p = 1;
 487                   up->link_expect_css = 1;
 488                 }
 489               else if (0 == strcasecmp (rel, "shortcut icon"))
 490                 {
 491                   up->link_inline_p = 1;
 492                 }
 493             }
 494           else
 495             /* The external ones usually point to HTML pages, such as
 496                <link rel="next" href="..."> */
 497             up->link_expect_html = 1;
 498         }
 499     }
 500 }
 501
 502 /* Handle the META tag.  This requires special handling because of the
 503    refresh feature and because of robot exclusion.  */
 504
 505 static void
 506 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 507 {
 508   char *name = find_attr (tag, "name", NULL);
 509   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 510
 511   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 512     {
 513       /* Some pages use a META tag to specify that the page be
 514          refreshed by a new page after a given number of seconds.  The
 515          general format for this is:
 516
 517            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 518
 519          So we just need to skip past the "NUMBER; URL=" garbage to
 520          get to the URL.  */
 521
 522       struct urlpos *entry;
 523       int attrind;
 524       int timeout = 0;
 525       char *p;
 526
 527       char *refresh = find_attr (tag, "content", &attrind);
 528       if (!refresh)
 529         return;
 530
 531       for (p = refresh; c_isdigit (*p); p++)
 532         timeout = 10 * timeout + *p - '0';
 533       if (*p++ != ';')
 534         return;
 535
 536       while (c_isspace (*p))
 537         ++p;
 538       if (!(   c_toupper (*p)       == 'U'
 539             && c_toupper (*(p + 1)) == 'R'
 540             && c_toupper (*(p + 2)) == 'L'
 541             &&          *(p + 3)  == '='))
 542         return;
 543       p += 4;
 544       while (c_isspace (*p))
 545         ++p;
 546
 547       entry = append_url (p, ATTR_POS(tag,attrind,ctx),
 548                           ATTR_SIZE(tag,attrind), ctx);
 549       if (entry)
 550         {
 551           entry->link_refresh_p = 1;
 552           entry->refresh_timeout = timeout;
 553           entry->link_expect_html = 1;
 554         }
 555     }
 556   else if (name && 0 == strcasecmp (name, "robots"))
 557     {
 558       /* Handle stuff like:
 559          <meta name="robots" content="index,nofollow"> */
 560       char *content = find_attr (tag, "content", NULL);
 561       if (!content)
 562         return;
 563       if (!strcasecmp (content, "none"))
 564         ctx->nofollow = true;
 565       else
 566         {
 567           while (*content)
 568             {
 569               /* Find the next occurrence of ',' or the end of
 570                  the string.  */
 571               char *end = strchr (content, ',');
 572               if (end)
 573                 ++end;
 574               else
 575                 end = content + strlen (content);
 576               if (!strncasecmp (content, "nofollow", end - content))
 577                 ctx->nofollow = true;
 578               content = end;
 579             }
 580         }
 581     }
 582 }
 583
 584 /* Dispatch the tag handler appropriate for the tag we're mapping
 585    over.  See known_tags[] for definition of tag handlers.  */
 586
 587 static void
 588 collect_tags_mapper (struct taginfo *tag, void *arg)
 589 {
 590   struct map_context *ctx = (struct map_context *)arg;
 591
 592   /* Find the tag in our table of tags.  This must not fail because
 593      map_html_tags only returns tags found in interesting_tags.
 594
 595      I've changed this for now, I'm passing NULL as interesting_tags
 596      to map_html_tags.  This way we can check all tags for a style
 597      attribute.
 598   */
 599   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 600
 601   if (t != NULL)
 602     t->handler (t->tagid, tag, ctx);
 603
 604   check_style_attr (tag, ctx);
 605
 606   if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
 607       tag->contents_begin && tag->contents_end)
 608   {
 609     /* parse contents */
 610     get_urls_css (ctx, tag->contents_begin - ctx->text,
 611                   tag->contents_end - tag->contents_begin);
 612   }
 613 }
 614 \f
 615 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 616    it.  It merges relative links in FILE with URL.  It is aware of
 617    <base href=...> and does the right thing.  */
 618
 619 struct urlpos *
 620 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
 621 {
 622   struct file_memory *fm;
 623   struct map_context ctx;
 624   int flags;
 625
 626   /* Load the file. */
 627   fm = read_file (file);
 628   if (!fm)
 629     {
 630       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 631       return NULL;
 632     }
 633   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 634
 635   ctx.text = fm->content;
 636   ctx.head = ctx.tail = NULL;
 637   ctx.base = NULL;
 638   ctx.parent_base = url ? url : opt.base_href;
 639   ctx.document_file = file;
 640   ctx.nofollow = false;
 641
 642   if (!interesting_tags)
 643     init_interesting ();
 644
 645   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 646      generate <a href=" foo"> instead of <a href="foo"> (browsers
 647      ignore spaces as well.)  If you really mean space, use &32; or
 648      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 649      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 650      ignored by IE and Mozilla and are presumably introduced by
 651      writing HTML with editors that force word wrap.  */
 652   flags = MHT_TRIM_VALUES;
 653   if (opt.strict_comments)
 654     flags |= MHT_STRICT_COMMENTS;
 655
 656   /* the NULL here used to be interesting_tags */
 657   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 658                  NULL, interesting_attributes);
 659
 660   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 661   if (meta_disallow_follow)
 662     *meta_disallow_follow = ctx.nofollow;
 663
 664   xfree_null (ctx.base);
 665   read_file_free (fm);
 666   return ctx.head;
 667 }
 668
 669 /* This doesn't really have anything to do with HTML, but it's similar
 670    to get_urls_html, so we put it here.  */
 671
 672 struct urlpos *
 673 get_urls_file (const char *file)
 674 {
 675   struct file_memory *fm;
 676   struct urlpos *head, *tail;
 677   const char *text, *text_end;
 678
 679   /* Load the file.  */
 680   fm = read_file (file);
 681   if (!fm)
 682     {
 683       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 684       return NULL;
 685     }
 686   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 687
 688   head = tail = NULL;
 689   text = fm->content;
 690   text_end = fm->content + fm->length;
 691   while (text < text_end)
 692     {
 693       int up_error_code;
 694       char *url_text;
 695       struct urlpos *entry;
 696       struct url *url;
 697
 698       const char *line_beg = text;
 699       const char *line_end = memchr (text, '\n', text_end - text);
 700       if (!line_end)
 701         line_end = text_end;
 702       else
 703         ++line_end;
 704       text = line_end;
 705
 706       /* Strip whitespace from the beginning and end of line. */
 707       while (line_beg < line_end && c_isspace (*line_beg))
 708         ++line_beg;
 709       while (line_end > line_beg && c_isspace (*(line_end - 1)))
 710         --line_end;
 711
 712       if (line_beg == line_end)
 713         continue;
 714
 715       /* The URL is in the [line_beg, line_end) region. */
 716
 717       /* We must copy the URL to a zero-terminated string, and we
 718          can't use alloca because we're in a loop.  *sigh*.  */
 719       url_text = strdupdelim (line_beg, line_end);
 720
 721       if (opt.base_href)
 722         {
 723           /* Merge opt.base_href with URL. */
 724           char *merged = uri_merge (opt.base_href, url_text);
 725           xfree (url_text);
 726           url_text = merged;
 727         }
 728
 729       url = url_parse (url_text, &up_error_code);
 730       if (!url)
 731         {
 732           char *error = url_error (url_text, up_error_code);
 733           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 734                      file, url_text, error);
 735           xfree (url_text);
 736           xfree (error);
 737           continue;
 738         }
 739       xfree (url_text);
 740
 741       entry = xnew0 (struct urlpos);
 742       entry->url = url;
 743
 744       if (!head)
 745         head = entry;
 746       else
 747         tail->next = entry;
 748       tail = entry;
 749     }
 750   read_file_free (fm);
 751   return head;
 752 }
 753
 754 void
 755 cleanup_html_url (void)
 756 {
 757   /* Destroy the hash tables.  The hash table keys and values are not
 758      allocated by this code, so we don't need to free them here.  */
 759   if (interesting_tags)
 760     hash_table_destroy (interesting_tags);
 761   if (interesting_attributes)
 762     hash_table_destroy (interesting_attributes);
 763 }