sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <string.h>
  35 #include <stdlib.h>
  36 #include <errno.h>
  37 #include <assert.h>
  38
  39 #include "html-parse.h"
  40 #include "url.h"
  41 #include "utils.h"
  42 #include "hash.h"
  43 #include "convert.h"
  44 #include "recur.h"
  45 #include "html-url.h"
  46 #include "css-url.h"
  47 #include "iri.h"
  48
  49 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
  50
  51 #define DECLARE_TAG_HANDLER(fun)                                \
  52   static void fun (int, struct taginfo *, struct map_context *)
  53
  54 DECLARE_TAG_HANDLER (tag_find_urls);
  55 DECLARE_TAG_HANDLER (tag_handle_base);
  56 DECLARE_TAG_HANDLER (tag_handle_form);
  57 DECLARE_TAG_HANDLER (tag_handle_link);
  58 DECLARE_TAG_HANDLER (tag_handle_meta);
  59
  60 enum {
  61   TAG_A,
  62   TAG_APPLET,
  63   TAG_AREA,
  64   TAG_BASE,
  65   TAG_BGSOUND,
  66   TAG_BODY,
  67   TAG_EMBED,
  68   TAG_FIG,
  69   TAG_FORM,
  70   TAG_FRAME,
  71   TAG_IFRAME,
  72   TAG_IMG,
  73   TAG_INPUT,
  74   TAG_LAYER,
  75   TAG_LINK,
  76   TAG_META,
  77   TAG_OBJECT,
  78   TAG_OVERLAY,
  79   TAG_SCRIPT,
  80   TAG_TABLE,
  81   TAG_TD,
  82   TAG_TH
  83 };
  84
  85 /* The list of known tags and functions used for handling them.  Most
  86    tags are simply harvested for URLs. */
  87 static struct known_tag {
  88   int tagid;
  89   const char *name;
  90   tag_handler_t handler;
  91 } known_tags[] = {
  92   { TAG_A,       "a",           tag_find_urls },
  93   { TAG_APPLET,  "applet",      tag_find_urls },
  94   { TAG_AREA,    "area",        tag_find_urls },
  95   { TAG_BASE,    "base",        tag_handle_base },
  96   { TAG_BGSOUND, "bgsound",     tag_find_urls },
  97   { TAG_BODY,    "body",        tag_find_urls },
  98   { TAG_EMBED,   "embed",       tag_find_urls },
  99   { TAG_FIG,     "fig",         tag_find_urls },
 100   { TAG_FORM,    "form",        tag_handle_form },
 101   { TAG_FRAME,   "frame",       tag_find_urls },
 102   { TAG_IFRAME,  "iframe",      tag_find_urls },
 103   { TAG_IMG,     "img",         tag_find_urls },
 104   { TAG_INPUT,   "input",       tag_find_urls },
 105   { TAG_LAYER,   "layer",       tag_find_urls },
 106   { TAG_LINK,    "link",        tag_handle_link },
 107   { TAG_META,    "meta",        tag_handle_meta },
 108   { TAG_OBJECT,  "object",      tag_find_urls },
 109   { TAG_OVERLAY, "overlay",     tag_find_urls },
 110   { TAG_SCRIPT,  "script",      tag_find_urls },
 111   { TAG_TABLE,   "table",       tag_find_urls },
 112   { TAG_TD,      "td",          tag_find_urls },
 113   { TAG_TH,      "th",          tag_find_urls }
 114 };
 115
 116 /* tag_url_attributes documents which attributes of which tags contain
 117    URLs to harvest.  It is used by tag_find_urls.  */
 118
 119 /* Defines for the FLAGS. */
 120
 121 /* The link is "inline", i.e. needs to be retrieved for this document
 122    to be correctly rendered.  Inline links include inlined images,
 123    stylesheets, children frames, etc.  */
 124 #define ATTR_INLINE     1
 125
 126 /* The link is expected to yield HTML contents.  It's important not to
 127    try to follow HTML obtained by following e.g. <img src="...">
 128    regardless of content-type.  Doing this causes infinite loops for
 129    "images" that return non-404 error pages with links to the same
 130    image.  */
 131 #define ATTR_HTML       2
 132
 133 /* For tags handled by tag_find_urls: attributes that contain URLs to
 134    download. */
 135 static struct {
 136   int tagid;
 137   const char *attr_name;
 138   int flags;
 139 } tag_url_attributes[] = {
 140   { TAG_A,              "href",         ATTR_HTML },
 141   { TAG_APPLET,         "code",         ATTR_INLINE },
 142   { TAG_AREA,           "href",         ATTR_HTML },
 143   { TAG_BGSOUND,        "src",          ATTR_INLINE },
 144   { TAG_BODY,           "background",   ATTR_INLINE },
 145   { TAG_EMBED,          "href",         ATTR_HTML },
 146   { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
 147   { TAG_FIG,            "src",          ATTR_INLINE },
 148   { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
 149   { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
 150   { TAG_IMG,            "href",         ATTR_INLINE },
 151   { TAG_IMG,            "lowsrc",       ATTR_INLINE },
 152   { TAG_IMG,            "src",          ATTR_INLINE },
 153   { TAG_INPUT,          "src",          ATTR_INLINE },
 154   { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
 155   { TAG_OBJECT,         "data",         ATTR_INLINE },
 156   { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
 157   { TAG_SCRIPT,         "src",          ATTR_INLINE },
 158   { TAG_TABLE,          "background",   ATTR_INLINE },
 159   { TAG_TD,             "background",   ATTR_INLINE },
 160   { TAG_TH,             "background",   ATTR_INLINE }
 161 };
 162
 163 /* The lists of interesting tags and attributes are built dynamically,
 164    from the information above.  However, some places in the code refer
 165    to the attributes not mentioned here.  We add them manually.  */
 166 static const char *additional_attributes[] = {
 167   "rel",                        /* used by tag_handle_link  */
 168   "http-equiv",                 /* used by tag_handle_meta  */
 169   "name",                       /* used by tag_handle_meta  */
 170   "content",                    /* used by tag_handle_meta  */
 171   "action",                     /* used by tag_handle_form  */
 172   "style"                       /* used by check_style_attr */
 173 };
 174
 175 static struct hash_table *interesting_tags;
 176 static struct hash_table *interesting_attributes;
 177
 178 static void
 179 init_interesting (void)
 180 {
 181   /* Init the variables interesting_tags and interesting_attributes
 182      that are used by the HTML parser to know which tags and
 183      attributes we're interested in.  We initialize this only once,
 184      for performance reasons.
 185
 186      Here we also make sure that what we put in interesting_tags
 187      matches the user's preferences as specified through --ignore-tags
 188      and --follow-tags.  */
 189
 190   size_t i;
 191   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 192
 193   /* First, add all the tags we know hot to handle, mapped to their
 194      respective entries in known_tags.  */
 195   for (i = 0; i < countof (known_tags); i++)
 196     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 197
 198   /* Then remove the tags ignored through --ignore-tags.  */
 199   if (opt.ignore_tags)
 200     {
 201       char **ignored;
 202       for (ignored = opt.ignore_tags; *ignored; ignored++)
 203         hash_table_remove (interesting_tags, *ignored);
 204     }
 205
 206   /* If --follow-tags is specified, use only those tags.  */
 207   if (opt.follow_tags)
 208     {
 209       /* Create a new table intersecting --follow-tags and known_tags,
 210          and use it as interesting_tags.  */
 211       struct hash_table *intersect = make_nocase_string_hash_table (0);
 212       char **followed;
 213       for (followed = opt.follow_tags; *followed; followed++)
 214         {
 215           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 216           if (!t)
 217             continue;           /* ignore unknown --follow-tags entries. */
 218           hash_table_put (intersect, *followed, t);
 219         }
 220       hash_table_destroy (interesting_tags);
 221       interesting_tags = intersect;
 222     }
 223
 224   /* Add the attributes we care about. */
 225   interesting_attributes = make_nocase_string_hash_table (10);
 226   for (i = 0; i < countof (additional_attributes); i++)
 227     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 228   for (i = 0; i < countof (tag_url_attributes); i++)
 229     hash_table_put (interesting_attributes,
 230                     tag_url_attributes[i].attr_name, "1");
 231 }
 232
 233 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 234    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 235    index of the attribute in TAG will be stored there.  */
 236
 237 static char *
 238 find_attr (struct taginfo *tag, const char *name, int *attrind)
 239 {
 240   int i;
 241   for (i = 0; i < tag->nattrs; i++)
 242     if (!strcasecmp (tag->attrs[i].name, name))
 243       {
 244         if (attrind)
 245           *attrind = i;
 246         return tag->attrs[i].value;
 247       }
 248   return NULL;
 249 }
 250
 251 /* used for calls to append_url */
 252 #define ATTR_POS(tag, attrind, ctx) \
 253  (tag->attrs[attrind].value_raw_beginning - ctx->text)
 254 #define ATTR_SIZE(tag, attrind) \
 255  (tag->attrs[attrind].value_raw_size)
 256
 257 /* Append LINK_URI to the urlpos structure that is being built.
 258
 259    LINK_URI will be merged with the current document base.
 260 */
 261
 262 struct urlpos *
 263 append_url (const char *link_uri, int position, int size,
 264             struct map_context *ctx)
 265 {
 266   int link_has_scheme = url_has_scheme (link_uri);
 267   struct urlpos *newel;
 268   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 269   struct url *url;
 270
 271   if (!base)
 272     {
 273       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 274                ctx->document_file, link_uri));
 275
 276       if (!link_has_scheme)
 277         {
 278           /* Base URL is unavailable, and the link does not have a
 279              location attached to it -- we have to give up.  Since
 280              this can only happen when using `--force-html -i', print
 281              a warning.  */
 282           logprintf (LOG_NOTQUIET,
 283                      _("%s: Cannot resolve incomplete link %s.\n"),
 284                      ctx->document_file, link_uri);
 285           return NULL;
 286         }
 287
 288       set_ugly_no_encode (true);
 289       url = url_parse (link_uri, NULL);
 290       set_ugly_no_encode (false);
 291       if (!url)
 292         {
 293           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 294                    ctx->document_file, link_uri));
 295           return NULL;
 296         }
 297     }
 298   else
 299     {
 300       /* Merge BASE with LINK_URI, but also make sure the result is
 301          canonicalized, i.e. that "../" have been resolved.
 302          (parse_url will do that for us.) */
 303
 304       char *complete_uri = uri_merge (base, link_uri);
 305
 306       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 307                ctx->document_file, base, link_uri, complete_uri));
 308
 309       set_ugly_no_encode (true);
 310       url = url_parse (complete_uri, NULL);
 311       set_ugly_no_encode (false);
 312       if (!url)
 313         {
 314           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 315                    ctx->document_file, complete_uri));
 316           xfree (complete_uri);
 317           return NULL;
 318         }
 319       xfree (complete_uri);
 320     }
 321
 322   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 323
 324   newel = xnew0 (struct urlpos);
 325   newel->url = url;
 326   newel->pos = position;
 327   newel->size = size;
 328
 329   /* A URL is relative if the host is not named, and the name does not
 330      start with `/'.  */
 331   if (!link_has_scheme && *link_uri != '/')
 332     newel->link_relative_p = 1;
 333   else if (link_has_scheme)
 334     newel->link_complete_p = 1;
 335
 336   if (ctx->tail)
 337     {
 338       ctx->tail->next = newel;
 339       ctx->tail = newel;
 340     }
 341   else
 342     ctx->tail = ctx->head = newel;
 343
 344   return newel;
 345 }
 346 \f
 347 static void
 348 check_style_attr (struct taginfo *tag, struct map_context *ctx)
 349 {
 350   int attrind;
 351   char *style = find_attr (tag, "style", &attrind);
 352   if (!style)
 353     return;
 354
 355   /* raw pos and raw size include the quotes, hence the +1 -2 */
 356   get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2);
 357 }
 358
 359 /* All the tag_* functions are called from collect_tags_mapper, as
 360    specified by KNOWN_TAGS.  */
 361
 362 /* Default tag handler: collect URLs from attributes specified for
 363    this tag by tag_url_attributes.  */
 364
 365 static void
 366 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 367 {
 368   size_t i;
 369   int attrind;
 370   int first = -1;
 371
 372   for (i = 0; i < countof (tag_url_attributes); i++)
 373     if (tag_url_attributes[i].tagid == tagid)
 374       {
 375         /* We've found the index of tag_url_attributes where the
 376            attributes of our tag begin.  */
 377         first = i;
 378         break;
 379       }
 380   assert (first != -1);
 381
 382   /* Loop over the "interesting" attributes of this tag.  In this
 383      example, it will loop over "src" and "lowsrc".
 384
 385        <img src="foo.png" lowsrc="bar.png">
 386
 387      This has to be done in the outer loop so that the attributes are
 388      processed in the same order in which they appear in the page.
 389      This is required when converting links.  */
 390
 391   for (attrind = 0; attrind < tag->nattrs; attrind++)
 392     {
 393       /* Find whether TAG/ATTRIND is a combination that contains a
 394          URL. */
 395       char *link = tag->attrs[attrind].value;
 396       const size_t size = countof (tag_url_attributes);
 397
 398       /* If you're cringing at the inefficiency of the nested loops,
 399          remember that they both iterate over a very small number of
 400          items.  The worst-case inner loop is for the IMG tag, which
 401          has three attributes.  */
 402       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 403         {
 404           if (0 == strcasecmp (tag->attrs[attrind].name,
 405                                tag_url_attributes[i].attr_name))
 406             {
 407               struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
 408                                               ATTR_SIZE(tag,attrind), ctx);
 409               if (up)
 410                 {
 411                   int flags = tag_url_attributes[i].flags;
 412                   if (flags & ATTR_INLINE)
 413                     up->link_inline_p = 1;
 414                   if (flags & ATTR_HTML)
 415                     up->link_expect_html = 1;
 416                 }
 417             }
 418         }
 419     }
 420 }
 421
 422 /* Handle the BASE tag, for <base href=...>. */
 423
 424 static void
 425 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 426 {
 427   struct urlpos *base_urlpos;
 428   int attrind;
 429   char *newbase = find_attr (tag, "href", &attrind);
 430   if (!newbase)
 431     return;
 432
 433   base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
 434                             ATTR_SIZE(tag,attrind), ctx);
 435   if (!base_urlpos)
 436     return;
 437   base_urlpos->ignore_when_downloading = 1;
 438   base_urlpos->link_base_p = 1;
 439
 440   if (ctx->base)
 441     xfree (ctx->base);
 442   if (ctx->parent_base)
 443     ctx->base = uri_merge (ctx->parent_base, newbase);
 444   else
 445     ctx->base = xstrdup (newbase);
 446 }
 447
 448 /* Mark the URL found in <form action=...> for conversion. */
 449
 450 static void
 451 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 452 {
 453   int attrind;
 454   char *action = find_attr (tag, "action", &attrind);
 455
 456   if (action)
 457     {
 458       struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
 459                                       ATTR_SIZE(tag,attrind), ctx);
 460       if (up)
 461         up->ignore_when_downloading = 1;
 462     }
 463 }
 464
 465 /* Handle the LINK tag.  It requires special handling because how its
 466    links will be followed in -p mode depends on the REL attribute.  */
 467
 468 static void
 469 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 470 {
 471   int attrind;
 472   char *href = find_attr (tag, "href", &attrind);
 473
 474   /* All <link href="..."> link references are external, except those
 475      known not to be, such as style sheet and shortcut icon:
 476
 477        <link rel="stylesheet" href="...">
 478        <link rel="shortcut icon" href="...">
 479   */
 480   if (href)
 481     {
 482       struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
 483                                       ATTR_SIZE(tag,attrind), ctx);
 484       if (up)
 485         {
 486           char *rel = find_attr (tag, "rel", NULL);
 487           if (rel)
 488             {
 489               if (0 == strcasecmp (rel, "stylesheet"))
 490                 {
 491                   up->link_inline_p = 1;
 492                   up->link_expect_css = 1;
 493                 }
 494               else if (0 == strcasecmp (rel, "shortcut icon"))
 495                 {
 496                   up->link_inline_p = 1;
 497                 }
 498             }
 499           else
 500             /* The external ones usually point to HTML pages, such as
 501                <link rel="next" href="..."> */
 502             up->link_expect_html = 1;
 503         }
 504     }
 505 }
 506
 507 /* Handle the META tag.  This requires special handling because of the
 508    refresh feature and because of robot exclusion.  */
 509
 510 static void
 511 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 512 {
 513   char *name = find_attr (tag, "name", NULL);
 514   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 515
 516   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 517     {
 518       /* Some pages use a META tag to specify that the page be
 519          refreshed by a new page after a given number of seconds.  The
 520          general format for this is:
 521
 522            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 523
 524          So we just need to skip past the "NUMBER; URL=" garbage to
 525          get to the URL.  */
 526
 527       struct urlpos *entry;
 528       int attrind;
 529       int timeout = 0;
 530       char *p;
 531
 532       char *refresh = find_attr (tag, "content", &attrind);
 533       if (!refresh)
 534         return;
 535
 536       for (p = refresh; c_isdigit (*p); p++)
 537         timeout = 10 * timeout + *p - '0';
 538       if (*p++ != ';')
 539         return;
 540
 541       while (c_isspace (*p))
 542         ++p;
 543       if (!(   c_toupper (*p)       == 'U'
 544             && c_toupper (*(p + 1)) == 'R'
 545             && c_toupper (*(p + 2)) == 'L'
 546             &&          *(p + 3)  == '='))
 547         return;
 548       p += 4;
 549       while (c_isspace (*p))
 550         ++p;
 551
 552       entry = append_url (p, ATTR_POS(tag,attrind,ctx),
 553                           ATTR_SIZE(tag,attrind), ctx);
 554       if (entry)
 555         {
 556           entry->link_refresh_p = 1;
 557           entry->refresh_timeout = timeout;
 558           entry->link_expect_html = 1;
 559         }
 560     }
 561   else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
 562     {
 563       /* Handle stuff like:
 564          <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
 565
 566       char *mcharset;
 567       char *content = find_attr (tag, "content", NULL);
 568       if (!content)
 569         return;
 570
 571       mcharset = parse_charset (content);
 572       if (!mcharset)
 573         return;
 574
 575       /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
 576
 577       set_current_charset (mcharset);
 578       xfree (mcharset);
 579     }
 580   else if (name && 0 == strcasecmp (name, "robots"))
 581     {
 582       /* Handle stuff like:
 583          <meta name="robots" content="index,nofollow"> */
 584       char *content = find_attr (tag, "content", NULL);
 585       if (!content)
 586         return;
 587       if (!strcasecmp (content, "none"))
 588         ctx->nofollow = true;
 589       else
 590         {
 591           while (*content)
 592             {
 593               /* Find the next occurrence of ',' or the end of
 594                  the string.  */
 595               char *end = strchr (content, ',');
 596               if (end)
 597                 ++end;
 598               else
 599                 end = content + strlen (content);
 600               if (!strncasecmp (content, "nofollow", end - content))
 601                 ctx->nofollow = true;
 602               content = end;
 603             }
 604         }
 605     }
 606 }
 607
 608 /* Dispatch the tag handler appropriate for the tag we're mapping
 609    over.  See known_tags[] for definition of tag handlers.  */
 610
 611 static void
 612 collect_tags_mapper (struct taginfo *tag, void *arg)
 613 {
 614   struct map_context *ctx = (struct map_context *)arg;
 615
 616   /* Find the tag in our table of tags.  This must not fail because
 617      map_html_tags only returns tags found in interesting_tags.
 618
 619      I've changed this for now, I'm passing NULL as interesting_tags
 620      to map_html_tags.  This way we can check all tags for a style
 621      attribute.
 622   */
 623   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 624
 625   if (t != NULL)
 626     t->handler (t->tagid, tag, ctx);
 627
 628   check_style_attr (tag, ctx);
 629
 630   if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
 631       tag->contents_begin && tag->contents_end)
 632   {
 633     /* parse contents */
 634     get_urls_css (ctx, tag->contents_begin - ctx->text,
 635                   tag->contents_end - tag->contents_begin);
 636   }
 637 }
 638 \f
 639 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 640    it.  It merges relative links in FILE with URL.  It is aware of
 641    <base href=...> and does the right thing.  */
 642
 643 struct urlpos *
 644 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
 645 {
 646   struct file_memory *fm;
 647   struct map_context ctx;
 648   int flags;
 649
 650   /* Load the file. */
 651   fm = read_file (file);
 652   if (!fm)
 653     {
 654       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 655       return NULL;
 656     }
 657   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 658
 659   ctx.text = fm->content;
 660   ctx.head = ctx.tail = NULL;
 661   ctx.base = NULL;
 662   ctx.parent_base = url ? url : opt.base_href;
 663   ctx.document_file = file;
 664   ctx.nofollow = false;
 665
 666   if (!interesting_tags)
 667     init_interesting ();
 668
 669   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 670      generate <a href=" foo"> instead of <a href="foo"> (browsers
 671      ignore spaces as well.)  If you really mean space, use &32; or
 672      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 673      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 674      ignored by IE and Mozilla and are presumably introduced by
 675      writing HTML with editors that force word wrap.  */
 676   flags = MHT_TRIM_VALUES;
 677   if (opt.strict_comments)
 678     flags |= MHT_STRICT_COMMENTS;
 679
 680   /* the NULL here used to be interesting_tags */
 681   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 682                  NULL, interesting_attributes);
 683
 684   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 685   if (meta_disallow_follow)
 686     *meta_disallow_follow = ctx.nofollow;
 687
 688   xfree_null (ctx.base);
 689   read_file_free (fm);
 690   return ctx.head;
 691 }
 692
 693 /* This doesn't really have anything to do with HTML, but it's similar
 694    to get_urls_html, so we put it here.  */
 695
 696 struct urlpos *
 697 get_urls_file (const char *file)
 698 {
 699   struct file_memory *fm;
 700   struct urlpos *head, *tail;
 701   const char *text, *text_end;
 702
 703   /* Load the file.  */
 704   fm = read_file (file);
 705   if (!fm)
 706     {
 707       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 708       return NULL;
 709     }
 710   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 711
 712   head = tail = NULL;
 713   text = fm->content;
 714   text_end = fm->content + fm->length;
 715   while (text < text_end)
 716     {
 717       int up_error_code;
 718       char *url_text;
 719       struct urlpos *entry;
 720       struct url *url;
 721
 722       const char *line_beg = text;
 723       const char *line_end = memchr (text, '\n', text_end - text);
 724       if (!line_end)
 725         line_end = text_end;
 726       else
 727         ++line_end;
 728       text = line_end;
 729
 730       /* Strip whitespace from the beginning and end of line. */
 731       while (line_beg < line_end && c_isspace (*line_beg))
 732         ++line_beg;
 733       while (line_end > line_beg && c_isspace (*(line_end - 1)))
 734         --line_end;
 735
 736       if (line_beg == line_end)
 737         continue;
 738
 739       /* The URL is in the [line_beg, line_end) region. */
 740
 741       /* We must copy the URL to a zero-terminated string, and we
 742          can't use alloca because we're in a loop.  *sigh*.  */
 743       url_text = strdupdelim (line_beg, line_end);
 744
 745       if (opt.base_href)
 746         {
 747           /* Merge opt.base_href with URL. */
 748           char *merged = uri_merge (opt.base_href, url_text);
 749           xfree (url_text);
 750           url_text = merged;
 751         }
 752
 753       set_ugly_no_encode (true);
 754       url = url_parse (url_text, &up_error_code);
 755       set_ugly_no_encode (false);
 756       if (!url)
 757         {
 758           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 759                      file, url_text, url_error (up_error_code));
 760           xfree (url_text);
 761           continue;
 762         }
 763       xfree (url_text);
 764
 765       entry = xnew0 (struct urlpos);
 766       entry->url = url;
 767
 768       if (!head)
 769         head = entry;
 770       else
 771         tail->next = entry;
 772       tail = entry;
 773     }
 774   read_file_free (fm);
 775   return head;
 776 }
 777
 778 void
 779 cleanup_html_url (void)
 780 {
 781   /* Destroy the hash tables.  The hash table keys and values are not
 782      allocated by this code, so we don't need to free them here.  */
 783   if (interesting_tags)
 784     hash_table_destroy (interesting_tags);
 785   if (interesting_attributes)
 786     hash_table_destroy (interesting_attributes);
 787 }