sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <string.h>
  35 #include <stdlib.h>
  36 #include <errno.h>
  37 #include <assert.h>
  38
  39 #include "html-parse.h"
  40 #include "url.h"
  41 #include "utils.h"
  42 #include "hash.h"
  43 #include "convert.h"
  44 #include "recur.h"
  45 #include "html-url.h"
  46 #include "css-url.h"
  47
/* Signature shared by all tag handlers.  A handler receives the tag's
   id (one of the TAG_* enum constants), the parsed tag, and the
   mapping context of the document being scanned.  */
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);

/* Forward-declare a static function with the tag handler signature, so
   that it can be named in known_tags[] ahead of its definition.  */
#define DECLARE_TAG_HANDLER(fun)                                \
  static void fun (int, struct taginfo *, struct map_context *)

/* The handlers defined later in this file.  */
DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
  58
/* Numeric ids for the HTML tags this file knows about.  The same ids
   appear in known_tags[] (tag name -> handler) and in
   tag_url_attributes[] (tag -> URL-bearing attributes).  */
enum {
  TAG_A,
  TAG_APPLET,
  TAG_AREA,
  TAG_BASE,
  TAG_BGSOUND,
  TAG_BODY,
  TAG_EMBED,
  TAG_FIG,
  TAG_FORM,
  TAG_FRAME,
  TAG_IFRAME,
  TAG_IMG,
  TAG_INPUT,
  TAG_LAYER,
  TAG_LINK,
  TAG_META,
  TAG_OBJECT,
  TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE,
  TAG_TD,
  TAG_TH
};
  83
/* The list of known tags and functions used for handling them.  Most
   tags are simply harvested for URLs.

   init_interesting enters each element into the interesting_tags hash
   table under its NAME, and collect_tags_mapper calls HANDLER with
   TAGID for every parsed tag found in that table.  */
static struct known_tag {
  int tagid;
  const char *name;
  tag_handler_t handler;
} known_tags[] = {
  { TAG_A,       "a",           tag_find_urls },
  { TAG_APPLET,  "applet",      tag_find_urls },
  { TAG_AREA,    "area",        tag_find_urls },
  { TAG_BASE,    "base",        tag_handle_base },
  { TAG_BGSOUND, "bgsound",     tag_find_urls },
  { TAG_BODY,    "body",        tag_find_urls },
  { TAG_EMBED,   "embed",       tag_find_urls },
  { TAG_FIG,     "fig",         tag_find_urls },
  { TAG_FORM,    "form",        tag_handle_form },
  { TAG_FRAME,   "frame",       tag_find_urls },
  { TAG_IFRAME,  "iframe",      tag_find_urls },
  { TAG_IMG,     "img",         tag_find_urls },
  { TAG_INPUT,   "input",       tag_find_urls },
  { TAG_LAYER,   "layer",       tag_find_urls },
  { TAG_LINK,    "link",        tag_handle_link },
  { TAG_META,    "meta",        tag_handle_meta },
  { TAG_OBJECT,  "object",      tag_find_urls },
  { TAG_OVERLAY, "overlay",     tag_find_urls },
  { TAG_SCRIPT,  "script",      tag_find_urls },
  { TAG_TABLE,   "table",       tag_find_urls },
  { TAG_TD,      "td",          tag_find_urls },
  { TAG_TH,      "th",          tag_find_urls }
};
 114
/* tag_url_attributes documents which attributes of which tags contain
   URLs to harvest.  It is used by tag_find_urls.  */

/* Defines for the FLAGS. */

/* The link is "inline", i.e. needs to be retrieved for this document
   to be correctly rendered.  Inline links include inlined images,
   stylesheets, children frames, etc.  */
#define ATTR_INLINE     1

/* The link is expected to yield HTML contents.  It's important not to
   try to follow HTML obtained by following e.g. <img src="...">
   regardless of content-type.  Doing this causes infinite loops for
   "images" that return non-404 error pages with links to the same
   image.  */
#define ATTR_HTML       2

/* For tags handled by tag_find_urls: attributes that contain URLs to
   download.

   Ordering matters: all entries for one tag must be adjacent, because
   tag_find_urls finds the first entry with a matching TAGID and then
   scans forward only while the TAGID keeps matching.  Every tag whose
   handler is tag_find_urls needs at least one entry here; otherwise
   the assertion in tag_find_urls fails.  */
static struct {
  int tagid;
  const char *attr_name;
  int flags;
} tag_url_attributes[] = {
  { TAG_A,              "href",         ATTR_HTML },
  { TAG_APPLET,         "code",         ATTR_INLINE },
  { TAG_AREA,           "href",         ATTR_HTML },
  { TAG_BGSOUND,        "src",          ATTR_INLINE },
  { TAG_BODY,           "background",   ATTR_INLINE },
  { TAG_EMBED,          "href",         ATTR_HTML },
  { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_FIG,            "src",          ATTR_INLINE },
  { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IMG,            "href",         ATTR_INLINE },
  { TAG_IMG,            "lowsrc",       ATTR_INLINE },
  { TAG_IMG,            "src",          ATTR_INLINE },
  { TAG_INPUT,          "src",          ATTR_INLINE },
  { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_OBJECT,         "data",         ATTR_INLINE },
  { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_SCRIPT,         "src",          ATTR_INLINE },
  { TAG_TABLE,          "background",   ATTR_INLINE },
  { TAG_TD,             "background",   ATTR_INLINE },
  { TAG_TH,             "background",   ATTR_INLINE }
};
 161
/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, some places in the code look
   up attributes that are not mentioned in tag_url_attributes.  Those
   are listed here by hand, and init_interesting adds them to
   interesting_attributes alongside the ones from the table.  */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link  */
  "http-equiv",                 /* used by tag_handle_meta  */
  "name",                       /* used by tag_handle_meta  */
  "content",                    /* used by tag_handle_meta  */
  "action",                     /* used by tag_handle_form  */
  "style"                       /* used by check_style_attr */
};
 173
/* Case-insensitive tables of the tag names and attribute names we care
   about.  They are created by init_interesting the first time
   get_urls_html runs and destroyed by cleanup_html_url.  */
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;

/* Holds the charset from the most recently seen
   <meta http-equiv="content-type" ...> tag, as returned by
   parse_charset, or NULL if none has been seen.  */
static char *meta_charset;
 180
 181 static void
 182 init_interesting (void)
 183 {
 184   /* Init the variables interesting_tags and interesting_attributes
 185      that are used by the HTML parser to know which tags and
 186      attributes we're interested in.  We initialize this only once,
 187      for performance reasons.
 188
 189      Here we also make sure that what we put in interesting_tags
 190      matches the user's preferences as specified through --ignore-tags
 191      and --follow-tags.  */
 192
 193   size_t i;
 194   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 195
 196   /* First, add all the tags we know hot to handle, mapped to their
 197      respective entries in known_tags.  */
 198   for (i = 0; i < countof (known_tags); i++)
 199     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 200
 201   /* Then remove the tags ignored through --ignore-tags.  */
 202   if (opt.ignore_tags)
 203     {
 204       char **ignored;
 205       for (ignored = opt.ignore_tags; *ignored; ignored++)
 206         hash_table_remove (interesting_tags, *ignored);
 207     }
 208
 209   /* If --follow-tags is specified, use only those tags.  */
 210   if (opt.follow_tags)
 211     {
 212       /* Create a new table intersecting --follow-tags and known_tags,
 213          and use it as interesting_tags.  */
 214       struct hash_table *intersect = make_nocase_string_hash_table (0);
 215       char **followed;
 216       for (followed = opt.follow_tags; *followed; followed++)
 217         {
 218           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 219           if (!t)
 220             continue;           /* ignore unknown --follow-tags entries. */
 221           hash_table_put (intersect, *followed, t);
 222         }
 223       hash_table_destroy (interesting_tags);
 224       interesting_tags = intersect;
 225     }
 226
 227   /* Add the attributes we care about. */
 228   interesting_attributes = make_nocase_string_hash_table (10);
 229   for (i = 0; i < countof (additional_attributes); i++)
 230     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 231   for (i = 0; i < countof (tag_url_attributes); i++)
 232     hash_table_put (interesting_attributes,
 233                     tag_url_attributes[i].attr_name, "1");
 234 }
 235
 236 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 237    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 238    index of the attribute in TAG will be stored there.  */
 239
 240 static char *
 241 find_attr (struct taginfo *tag, const char *name, int *attrind)
 242 {
 243   int i;
 244   for (i = 0; i < tag->nattrs; i++)
 245     if (!strcasecmp (tag->attrs[i].name, name))
 246       {
 247         if (attrind)
 248           *attrind = i;
 249         return tag->attrs[i].value;
 250       }
 251   return NULL;
 252 }
 253
 254 /* used for calls to append_url */
 255 #define ATTR_POS(tag, attrind, ctx) \
 256  (tag->attrs[attrind].value_raw_beginning - ctx->text)
 257 #define ATTR_SIZE(tag, attrind) \
 258  (tag->attrs[attrind].value_raw_size)
 259
 260 /* Append LINK_URI to the urlpos structure that is being built.
 261
 262    LINK_URI will be merged with the current document base.
 263 */
 264
 265 struct urlpos *
 266 append_url (const char *link_uri, int position, int size,
 267             struct map_context *ctx)
 268 {
 269   int link_has_scheme = url_has_scheme (link_uri);
 270   struct urlpos *newel;
 271   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 272   struct url *url;
 273
 274   if (!base)
 275     {
 276       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 277                ctx->document_file, link_uri));
 278
 279       if (!link_has_scheme)
 280         {
 281           /* Base URL is unavailable, and the link does not have a
 282              location attached to it -- we have to give up.  Since
 283              this can only happen when using `--force-html -i', print
 284              a warning.  */
 285           logprintf (LOG_NOTQUIET,
 286                      _("%s: Cannot resolve incomplete link %s.\n"),
 287                      ctx->document_file, link_uri);
 288           return NULL;
 289         }
 290
 291       url = url_parse (link_uri, NULL, NULL, false);
 292       if (!url)
 293         {
 294           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 295                    ctx->document_file, link_uri));
 296           return NULL;
 297         }
 298     }
 299   else
 300     {
 301       /* Merge BASE with LINK_URI, but also make sure the result is
 302          canonicalized, i.e. that "../" have been resolved.
 303          (parse_url will do that for us.) */
 304
 305       char *complete_uri = uri_merge (base, link_uri);
 306
 307       DEBUGP (("%s: merge(%s, %s) -> %s\n",
 308                quotearg_n_style (0, escape_quoting_style, ctx->document_file),
 309                quote_n (1, base),
 310                quote_n (2, link_uri),
 311                quotearg_n_style (3, escape_quoting_style, complete_uri)));
 312
 313       url = url_parse (complete_uri, NULL, NULL, false);
 314       if (!url)
 315         {
 316           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 317                    ctx->document_file, complete_uri));
 318           xfree (complete_uri);
 319           return NULL;
 320         }
 321       xfree (complete_uri);
 322     }
 323
 324   DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
 325
 326   newel = xnew0 (struct urlpos);
 327   newel->url = url;
 328   newel->pos = position;
 329   newel->size = size;
 330
 331   /* A URL is relative if the host is not named, and the name does not
 332      start with `/'.  */
 333   if (!link_has_scheme && *link_uri != '/')
 334     newel->link_relative_p = 1;
 335   else if (link_has_scheme)
 336     newel->link_complete_p = 1;
 337
 338   if (ctx->tail)
 339     {
 340       ctx->tail->next = newel;
 341       ctx->tail = newel;
 342     }
 343   else
 344     ctx->tail = ctx->head = newel;
 345
 346   return newel;
 347 }
 348 \f
 349 static void
 350 check_style_attr (struct taginfo *tag, struct map_context *ctx)
 351 {
 352   int attrind;
 353   char *style = find_attr (tag, "style", &attrind);
 354   if (!style)
 355     return;
 356
 357   /* raw pos and raw size include the quotes, hence the +1 -2 */
 358   get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2);
 359 }
 360
 361 /* All the tag_* functions are called from collect_tags_mapper, as
 362    specified by KNOWN_TAGS.  */
 363
 364 /* Default tag handler: collect URLs from attributes specified for
 365    this tag by tag_url_attributes.  */
 366
 367 static void
 368 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 369 {
 370   size_t i;
 371   int attrind;
 372   int first = -1;
 373
 374   for (i = 0; i < countof (tag_url_attributes); i++)
 375     if (tag_url_attributes[i].tagid == tagid)
 376       {
 377         /* We've found the index of tag_url_attributes where the
 378            attributes of our tag begin.  */
 379         first = i;
 380         break;
 381       }
 382   assert (first != -1);
 383
 384   /* Loop over the "interesting" attributes of this tag.  In this
 385      example, it will loop over "src" and "lowsrc".
 386
 387        <img src="foo.png" lowsrc="bar.png">
 388
 389      This has to be done in the outer loop so that the attributes are
 390      processed in the same order in which they appear in the page.
 391      This is required when converting links.  */
 392
 393   for (attrind = 0; attrind < tag->nattrs; attrind++)
 394     {
 395       /* Find whether TAG/ATTRIND is a combination that contains a
 396          URL. */
 397       char *link = tag->attrs[attrind].value;
 398       const size_t size = countof (tag_url_attributes);
 399
 400       /* If you're cringing at the inefficiency of the nested loops,
 401          remember that they both iterate over a very small number of
 402          items.  The worst-case inner loop is for the IMG tag, which
 403          has three attributes.  */
 404       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 405         {
 406           if (0 == strcasecmp (tag->attrs[attrind].name,
 407                                tag_url_attributes[i].attr_name))
 408             {
 409               struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
 410                                               ATTR_SIZE(tag,attrind), ctx);
 411               if (up)
 412                 {
 413                   int flags = tag_url_attributes[i].flags;
 414                   if (flags & ATTR_INLINE)
 415                     up->link_inline_p = 1;
 416                   if (flags & ATTR_HTML)
 417                     up->link_expect_html = 1;
 418                 }
 419             }
 420         }
 421     }
 422 }
 423
 424 /* Handle the BASE tag, for <base href=...>. */
 425
 426 static void
 427 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 428 {
 429   struct urlpos *base_urlpos;
 430   int attrind;
 431   char *newbase = find_attr (tag, "href", &attrind);
 432   if (!newbase)
 433     return;
 434
 435   base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
 436                             ATTR_SIZE(tag,attrind), ctx);
 437   if (!base_urlpos)
 438     return;
 439   base_urlpos->ignore_when_downloading = 1;
 440   base_urlpos->link_base_p = 1;
 441
 442   if (ctx->base)
 443     xfree (ctx->base);
 444   if (ctx->parent_base)
 445     ctx->base = uri_merge (ctx->parent_base, newbase);
 446   else
 447     ctx->base = xstrdup (newbase);
 448 }
 449
 450 /* Mark the URL found in <form action=...> for conversion. */
 451
 452 static void
 453 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 454 {
 455   int attrind;
 456   char *action = find_attr (tag, "action", &attrind);
 457
 458   if (action)
 459     {
 460       struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
 461                                       ATTR_SIZE(tag,attrind), ctx);
 462       if (up)
 463         up->ignore_when_downloading = 1;
 464     }
 465 }
 466
 467 /* Handle the LINK tag.  It requires special handling because how its
 468    links will be followed in -p mode depends on the REL attribute.  */
 469
 470 static void
 471 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 472 {
 473   int attrind;
 474   char *href = find_attr (tag, "href", &attrind);
 475
 476   /* All <link href="..."> link references are external, except those
 477      known not to be, such as style sheet and shortcut icon:
 478
 479        <link rel="stylesheet" href="...">
 480        <link rel="shortcut icon" href="...">
 481   */
 482   if (href)
 483     {
 484       struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
 485                                       ATTR_SIZE(tag,attrind), ctx);
 486       if (up)
 487         {
 488           char *rel = find_attr (tag, "rel", NULL);
 489           if (rel)
 490             {
 491               if (0 == strcasecmp (rel, "stylesheet"))
 492                 {
 493                   up->link_inline_p = 1;
 494                   up->link_expect_css = 1;
 495                 }
 496               else if (0 == strcasecmp (rel, "shortcut icon"))
 497                 {
 498                   up->link_inline_p = 1;
 499                 }
 500             }
 501           else
 502             /* The external ones usually point to HTML pages, such as
 503                <link rel="next" href="..."> */
 504             up->link_expect_html = 1;
 505         }
 506     }
 507 }
 508
 509 /* Handle the META tag.  This requires special handling because of the
 510    refresh feature and because of robot exclusion.  */
 511
 512 static void
 513 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 514 {
 515   char *name = find_attr (tag, "name", NULL);
 516   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 517
 518   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 519     {
 520       /* Some pages use a META tag to specify that the page be
 521          refreshed by a new page after a given number of seconds.  The
 522          general format for this is:
 523
 524            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 525
 526          So we just need to skip past the "NUMBER; URL=" garbage to
 527          get to the URL.  */
 528
 529       struct urlpos *entry;
 530       int attrind;
 531       int timeout = 0;
 532       char *p;
 533
 534       char *refresh = find_attr (tag, "content", &attrind);
 535       if (!refresh)
 536         return;
 537
 538       for (p = refresh; c_isdigit (*p); p++)
 539         timeout = 10 * timeout + *p - '0';
 540       if (*p++ != ';')
 541         return;
 542
 543       while (c_isspace (*p))
 544         ++p;
 545       if (!(   c_toupper (*p)       == 'U'
 546             && c_toupper (*(p + 1)) == 'R'
 547             && c_toupper (*(p + 2)) == 'L'
 548             &&          *(p + 3)  == '='))
 549         return;
 550       p += 4;
 551       while (c_isspace (*p))
 552         ++p;
 553
 554       entry = append_url (p, ATTR_POS(tag,attrind,ctx),
 555                           ATTR_SIZE(tag,attrind), ctx);
 556       if (entry)
 557         {
 558           entry->link_refresh_p = 1;
 559           entry->refresh_timeout = timeout;
 560           entry->link_expect_html = 1;
 561         }
 562     }
 563   else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
 564     {
 565       /* Handle stuff like:
 566          <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
 567
 568       char *mcharset;
 569       char *content = find_attr (tag, "content", NULL);
 570       if (!content)
 571         return;
 572
 573       mcharset = parse_charset (content);
 574       if (!mcharset)
 575         return;
 576
 577       xfree_null (meta_charset);
 578       meta_charset = mcharset;
 579     }
 580   else if (name && 0 == strcasecmp (name, "robots"))
 581     {
 582       /* Handle stuff like:
 583          <meta name="robots" content="index,nofollow"> */
 584       char *content = find_attr (tag, "content", NULL);
 585       if (!content)
 586         return;
 587       if (!strcasecmp (content, "none"))
 588         ctx->nofollow = true;
 589       else
 590         {
 591           while (*content)
 592             {
 593               /* Find the next occurrence of ',' or the end of
 594                  the string.  */
 595               char *end = strchr (content, ',');
 596               if (end)
 597                 ++end;
 598               else
 599                 end = content + strlen (content);
 600               if (!strncasecmp (content, "nofollow", end - content))
 601                 ctx->nofollow = true;
 602               content = end;
 603             }
 604         }
 605     }
 606 }
 607
 608 /* Dispatch the tag handler appropriate for the tag we're mapping
 609    over.  See known_tags[] for definition of tag handlers.  */
 610
 611 static void
 612 collect_tags_mapper (struct taginfo *tag, void *arg)
 613 {
 614   struct map_context *ctx = (struct map_context *)arg;
 615
 616   /* Find the tag in our table of tags.  This must not fail because
 617      map_html_tags only returns tags found in interesting_tags.
 618
 619      I've changed this for now, I'm passing NULL as interesting_tags
 620      to map_html_tags.  This way we can check all tags for a style
 621      attribute.
 622   */
 623   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 624
 625   if (t != NULL)
 626     t->handler (t->tagid, tag, ctx);
 627
 628   check_style_attr (tag, ctx);
 629
 630   if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
 631       tag->contents_begin && tag->contents_end)
 632   {
 633     /* parse contents */
 634     get_urls_css (ctx, tag->contents_begin - ctx->text,
 635                   tag->contents_end - tag->contents_begin);
 636   }
 637 }
 638 \f
 639 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 640    it.  It merges relative links in FILE with URL.  It is aware of
 641    <base href=...> and does the right thing.  */
 642
 643 struct urlpos *
 644 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
 645                struct iri *iri)
 646 {
 647   struct file_memory *fm;
 648   struct map_context ctx;
 649   int flags;
 650
 651   /* Load the file. */
 652   fm = read_file (file);
 653   if (!fm)
 654     {
 655       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 656       return NULL;
 657     }
 658   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 659
 660   ctx.text = fm->content;
 661   ctx.head = ctx.tail = NULL;
 662   ctx.base = NULL;
 663   ctx.parent_base = url ? url : opt.base_href;
 664   ctx.document_file = file;
 665   ctx.nofollow = false;
 666
 667   if (!interesting_tags)
 668     init_interesting ();
 669
 670   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 671      generate <a href=" foo"> instead of <a href="foo"> (browsers
 672      ignore spaces as well.)  If you really mean space, use &32; or
 673      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 674      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 675      ignored by IE and Mozilla and are presumably introduced by
 676      writing HTML with editors that force word wrap.  */
 677   flags = MHT_TRIM_VALUES;
 678   if (opt.strict_comments)
 679     flags |= MHT_STRICT_COMMENTS;
 680
 681   /* the NULL here used to be interesting_tags */
 682   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 683                  NULL, interesting_attributes);
 684
 685   /* If meta charset isn't null, override content encoding */
 686   if (iri && meta_charset)
 687     set_content_encoding (iri, meta_charset);
 688
 689   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 690   if (meta_disallow_follow)
 691     *meta_disallow_follow = ctx.nofollow;
 692
 693   xfree_null (ctx.base);
 694   read_file_free (fm);
 695   return ctx.head;
 696 }
 697
 698 /* This doesn't really have anything to do with HTML, but it's similar
 699    to get_urls_html, so we put it here.  */
 700
 701 struct urlpos *
 702 get_urls_file (const char *file)
 703 {
 704   struct file_memory *fm;
 705   struct urlpos *head, *tail;
 706   const char *text, *text_end;
 707
 708   /* Load the file.  */
 709   fm = read_file (file);
 710   if (!fm)
 711     {
 712       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 713       return NULL;
 714     }
 715   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 716
 717   head = tail = NULL;
 718   text = fm->content;
 719   text_end = fm->content + fm->length;
 720   while (text < text_end)
 721     {
 722       int up_error_code;
 723       char *url_text;
 724       struct urlpos *entry;
 725       struct url *url;
 726
 727       const char *line_beg = text;
 728       const char *line_end = memchr (text, '\n', text_end - text);
 729       if (!line_end)
 730         line_end = text_end;
 731       else
 732         ++line_end;
 733       text = line_end;
 734
 735       /* Strip whitespace from the beginning and end of line. */
 736       while (line_beg < line_end && c_isspace (*line_beg))
 737         ++line_beg;
 738       while (line_end > line_beg && c_isspace (*(line_end - 1)))
 739         --line_end;
 740
 741       if (line_beg == line_end)
 742         continue;
 743
 744       /* The URL is in the [line_beg, line_end) region. */
 745
 746       /* We must copy the URL to a zero-terminated string, and we
 747          can't use alloca because we're in a loop.  *sigh*.  */
 748       url_text = strdupdelim (line_beg, line_end);
 749
 750       if (opt.base_href)
 751         {
 752           /* Merge opt.base_href with URL. */
 753           char *merged = uri_merge (opt.base_href, url_text);
 754           xfree (url_text);
 755           url_text = merged;
 756         }
 757
 758       url = url_parse (url_text, &up_error_code, NULL, false);
 759       if (!url)
 760         {
 761           char *error = url_error (url_text, up_error_code);
 762           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 763                      file, url_text, error);
 764           xfree (url_text);
 765           xfree (error);
 766           continue;
 767         }
 768       xfree (url_text);
 769
 770       entry = xnew0 (struct urlpos);
 771       entry->url = url;
 772
 773       if (!head)
 774         head = entry;
 775       else
 776         tail->next = entry;
 777       tail = entry;
 778     }
 779   read_file_free (fm);
 780   return head;
 781 }
 782
 783 void
 784 cleanup_html_url (void)
 785 {
 786   /* Destroy the hash tables.  The hash table keys and values are not
 787      allocated by this code, so we don't need to free them here.  */
 788   if (interesting_tags)
 789     hash_table_destroy (interesting_tags);
 790   if (interesting_attributes)
 791     hash_table_destroy (interesting_attributes);
 792 }