sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9  (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
#include <config.h>

#include <stdio.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include <limits.h>

#include "wget.h"
#include "html-parse.h"
#include "url.h"
#include "utils.h"
#include "hash.h"
#include "convert.h"
  48
  49 #ifndef errno
  50 extern int errno;
  51 #endif
  52
  53 struct map_context;
  54
  55 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
  56                                        struct map_context *));
  57
  58 #define DECLARE_TAG_HANDLER(fun)                                        \
  59   static void fun PARAMS ((int, struct taginfo *, struct map_context *))
  60
  61 DECLARE_TAG_HANDLER (tag_find_urls);
  62 DECLARE_TAG_HANDLER (tag_handle_base);
  63 DECLARE_TAG_HANDLER (tag_handle_form);
  64 DECLARE_TAG_HANDLER (tag_handle_link);
  65 DECLARE_TAG_HANDLER (tag_handle_meta);
  66
  67 enum {
  68   TAG_A,
  69   TAG_APPLET,
  70   TAG_AREA,
  71   TAG_BASE,
  72   TAG_BGSOUND,
  73   TAG_BODY,
  74   TAG_EMBED,
  75   TAG_FIG,
  76   TAG_FORM,
  77   TAG_FRAME,
  78   TAG_IFRAME,
  79   TAG_IMG,
  80   TAG_INPUT,
  81   TAG_LAYER,
  82   TAG_LINK,
  83   TAG_META,
  84   TAG_OBJECT,
  85   TAG_OVERLAY,
  86   TAG_SCRIPT,
  87   TAG_TABLE,
  88   TAG_TD,
  89   TAG_TH
  90 };
  91
  92 /* The list of known tags and functions used for handling them.  Most
  93    tags are simply harvested for URLs. */
  94 static struct known_tag {
  95   int tagid;
  96   const char *name;
  97   tag_handler_t handler;
  98 } known_tags[] = {
  99   { TAG_A,       "a",           tag_find_urls },
 100   { TAG_APPLET,  "applet",      tag_find_urls },
 101   { TAG_AREA,    "area",        tag_find_urls },
 102   { TAG_BASE,    "base",        tag_handle_base },
 103   { TAG_BGSOUND, "bgsound",     tag_find_urls },
 104   { TAG_BODY,    "body",        tag_find_urls },
 105   { TAG_EMBED,   "embed",       tag_find_urls },
 106   { TAG_FIG,     "fig",         tag_find_urls },
 107   { TAG_FORM,    "form",        tag_handle_form },
 108   { TAG_FRAME,   "frame",       tag_find_urls },
 109   { TAG_IFRAME,  "iframe",      tag_find_urls },
 110   { TAG_IMG,     "img",         tag_find_urls },
 111   { TAG_INPUT,   "input",       tag_find_urls },
 112   { TAG_LAYER,   "layer",       tag_find_urls },
 113   { TAG_LINK,    "link",        tag_handle_link },
 114   { TAG_META,    "meta",        tag_handle_meta },
 115   { TAG_OBJECT,  "object",      tag_find_urls },
 116   { TAG_OVERLAY, "overlay",     tag_find_urls },
 117   { TAG_SCRIPT,  "script",      tag_find_urls },
 118   { TAG_TABLE,   "table",       tag_find_urls },
 119   { TAG_TD,      "td",          tag_find_urls },
 120   { TAG_TH,      "th",          tag_find_urls }
 121 };
 122
 123 /* tag_url_attributes documents which attributes of which tags contain
 124    URLs to harvest.  It is used by tag_find_urls.  */
 125
 126 /* Defines for the FLAGS. */
 127
 128 /* The link is "inline", i.e. needs to be retrieved for this document
 129    to be correctly rendered.  Inline links include inlined images,
 130    stylesheets, children frames, etc.  */
 131 #define ATTR_INLINE     1
 132
 133 /* The link is expected to yield HTML contents.  It's important not to
 134    try to follow HTML obtained by following e.g. <img src="...">
 135    regardless of content-type.  Doing this causes infinite loops for
 136    "images" that return non-404 error pages with links to the same
 137    image.  */
 138 #define ATTR_HTML       2
 139
 140 /* For tags handled by tag_find_urls: attributes that contain URLs to
 141    download. */
 142 static struct {
 143   int tagid;
 144   const char *attr_name;
 145   int flags;
 146 } tag_url_attributes[] = {
 147   { TAG_A,              "href",         ATTR_HTML },
 148   { TAG_APPLET,         "code",         ATTR_INLINE },
 149   { TAG_AREA,           "href",         ATTR_HTML },
 150   { TAG_BGSOUND,        "src",          ATTR_INLINE },
 151   { TAG_BODY,           "background",   ATTR_INLINE },
 152   { TAG_EMBED,          "href",         ATTR_HTML },
 153   { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
 154   { TAG_FIG,            "src",          ATTR_INLINE },
 155   { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
 156   { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
 157   { TAG_IMG,            "href",         ATTR_INLINE },
 158   { TAG_IMG,            "lowsrc",       ATTR_INLINE },
 159   { TAG_IMG,            "src",          ATTR_INLINE },
 160   { TAG_INPUT,          "src",          ATTR_INLINE },
 161   { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
 162   { TAG_OBJECT,         "data",         ATTR_INLINE },
 163   { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
 164   { TAG_SCRIPT,         "src",          ATTR_INLINE },
 165   { TAG_TABLE,          "background",   ATTR_INLINE },
 166   { TAG_TD,             "background",   ATTR_INLINE },
 167   { TAG_TH,             "background",   ATTR_INLINE }
 168 };
 169
 170 /* The lists of interesting tags and attributes are built dynamically,
 171    from the information above.  However, some places in the code refer
 172    to the attributes not mentioned here.  We add them manually.  */
 173 static const char *additional_attributes[] = {
 174   "rel",                        /* used by tag_handle_link */
 175   "http-equiv",                 /* used by tag_handle_meta */
 176   "name",                       /* used by tag_handle_meta */
 177   "content",                    /* used by tag_handle_meta */
 178   "action"                      /* used by tag_handle_form */
 179 };
 180
 181 struct hash_table *interesting_tags;
 182 struct hash_table *interesting_attributes;
 183
 184 static void
 185 init_interesting (void)
 186 {
 187   /* Init the variables interesting_tags and interesting_attributes
 188      that are used by the HTML parser to know which tags and
 189      attributes we're interested in.  We initialize this only once,
 190      for performance reasons.
 191
 192      Here we also make sure that what we put in interesting_tags
 193      matches the user's preferences as specified through --ignore-tags
 194      and --follow-tags.  */
 195
 196   int i;
 197   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 198
 199   /* First, add all the tags we know hot to handle, mapped to their
 200      respective entries in known_tags.  */
 201   for (i = 0; i < countof (known_tags); i++)
 202     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 203
 204   /* Then remove the tags ignored through --ignore-tags.  */
 205   if (opt.ignore_tags)
 206     {
 207       char **ignored;
 208       for (ignored = opt.ignore_tags; *ignored; ignored++)
 209         hash_table_remove (interesting_tags, *ignored);
 210     }
 211
 212   /* If --follow-tags is specified, use only those tags.  */
 213   if (opt.follow_tags)
 214     {
 215       /* Create a new table intersecting --follow-tags and known_tags,
 216          and use it as interesting_tags.  */
 217       struct hash_table *intersect = make_nocase_string_hash_table (0);
 218       char **followed;
 219       for (followed = opt.follow_tags; *followed; followed++)
 220         {
 221           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 222           if (!t)
 223             continue;           /* ignore unknown --follow-tags entries. */
 224           hash_table_put (intersect, *followed, t);
 225         }
 226       hash_table_destroy (interesting_tags);
 227       interesting_tags = intersect;
 228     }
 229
 230   /* Add the attributes we care about. */
 231   interesting_attributes = make_nocase_string_hash_table (10);
 232   for (i = 0; i < countof (additional_attributes); i++)
 233     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 234   for (i = 0; i < countof (tag_url_attributes); i++)
 235     hash_table_put (interesting_attributes,
 236                     tag_url_attributes[i].attr_name, "1");
 237 }
 238
 239 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 240    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 241    index of the attribute in TAG will be stored there.  */
 242
 243 static char *
 244 find_attr (struct taginfo *tag, const char *name, int *attrind)
 245 {
 246   int i;
 247   for (i = 0; i < tag->nattrs; i++)
 248     if (!strcasecmp (tag->attrs[i].name, name))
 249       {
 250         if (attrind)
 251           *attrind = i;
 252         return tag->attrs[i].value;
 253       }
 254   return NULL;
 255 }
 256
 257 struct map_context {
 258   char *text;                   /* HTML text. */
 259   char *base;                   /* Base URI of the document, possibly
 260                                    changed through <base href=...>. */
 261   const char *parent_base;      /* Base of the current document. */
 262   const char *document_file;    /* File name of this document. */
 263   int nofollow;                 /* whether NOFOLLOW was specified in a
 264                                    <meta name=robots> tag. */
 265
 266   struct urlpos *head, *tail;   /* List of URLs that is being
 267                                    built. */
 268 };
 269
 270 /* Append LINK_URI to the urlpos structure that is being built.
 271
 272    LINK_URI will be merged with the current document base.  TAG and
 273    ATTRIND are the necessary context to store the position and
 274    size.  */
 275
 276 static struct urlpos *
 277 append_url (const char *link_uri,
 278             struct taginfo *tag, int attrind, struct map_context *ctx)
 279 {
 280   int link_has_scheme = url_has_scheme (link_uri);
 281   struct urlpos *newel;
 282   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 283   struct url *url;
 284
 285   if (!base)
 286     {
 287       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 288                ctx->document_file, link_uri));
 289
 290       if (!link_has_scheme)
 291         {
 292           /* Base URL is unavailable, and the link does not have a
 293              location attached to it -- we have to give up.  Since
 294              this can only happen when using `--force-html -i', print
 295              a warning.  */
 296           logprintf (LOG_NOTQUIET,
 297                      _("%s: Cannot resolve incomplete link %s.\n"),
 298                      ctx->document_file, link_uri);
 299           return NULL;
 300         }
 301
 302       url = url_parse (link_uri, NULL);
 303       if (!url)
 304         {
 305           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 306                    ctx->document_file, link_uri));
 307           return NULL;
 308         }
 309     }
 310   else
 311     {
 312       /* Merge BASE with LINK_URI, but also make sure the result is
 313          canonicalized, i.e. that "../" have been resolved.
 314          (parse_url will do that for us.) */
 315
 316       char *complete_uri = uri_merge (base, link_uri);
 317
 318       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 319                ctx->document_file, base, link_uri, complete_uri));
 320
 321       url = url_parse (complete_uri, NULL);
 322       if (!url)
 323         {
 324           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 325                    ctx->document_file, complete_uri));
 326           xfree (complete_uri);
 327           return NULL;
 328         }
 329       xfree (complete_uri);
 330     }
 331
 332   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 333
 334   newel = xnew0 (struct urlpos);
 335   newel->url = url;
 336   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 337   newel->size = tag->attrs[attrind].value_raw_size;
 338
 339   /* A URL is relative if the host is not named, and the name does not
 340      start with `/'.  */
 341   if (!link_has_scheme && *link_uri != '/')
 342     newel->link_relative_p = 1;
 343   else if (link_has_scheme)
 344     newel->link_complete_p = 1;
 345
 346   if (ctx->tail)
 347     {
 348       ctx->tail->next = newel;
 349       ctx->tail = newel;
 350     }
 351   else
 352     ctx->tail = ctx->head = newel;
 353
 354   return newel;
 355 }
 356 \f
 357 /* All the tag_* functions are called from collect_tags_mapper, as
 358    specified by KNOWN_TAGS.  */
 359
 360 /* Default tag handler: collect URLs from attributes specified for
 361    this tag by tag_url_attributes.  */
 362
 363 static void
 364 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 365 {
 366   int i, attrind;
 367   int first = -1;
 368
 369   for (i = 0; i < countof (tag_url_attributes); i++)
 370     if (tag_url_attributes[i].tagid == tagid)
 371       {
 372         /* We've found the index of tag_url_attributes where the
 373            attributes of our tag begin.  */
 374         first = i;
 375         break;
 376       }
 377   assert (first != -1);
 378
 379   /* Loop over the "interesting" attributes of this tag.  In this
 380      example, it will loop over "src" and "lowsrc".
 381
 382        <img src="foo.png" lowsrc="bar.png">
 383
 384      This has to be done in the outer loop so that the attributes are
 385      processed in the same order in which they appear in the page.
 386      This is required when converting links.  */
 387
 388   for (attrind = 0; attrind < tag->nattrs; attrind++)
 389     {
 390       /* Find whether TAG/ATTRIND is a combination that contains a
 391          URL. */
 392       char *link = tag->attrs[attrind].value;
 393       const int size = countof (tag_url_attributes);
 394
 395       /* If you're cringing at the inefficiency of the nested loops,
 396          remember that they both iterate over a very small number of
 397          items.  The worst-case inner loop is for the IMG tag, which
 398          has three attributes.  */
 399       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 400         {
 401           if (0 == strcasecmp (tag->attrs[attrind].name,
 402                                tag_url_attributes[i].attr_name))
 403             {
 404               struct urlpos *up = append_url (link, tag, attrind, ctx);
 405               if (up)
 406                 {
 407                   int flags = tag_url_attributes[i].flags;
 408                   if (flags & ATTR_INLINE)
 409                     up->link_inline_p = 1;
 410                   if (flags & ATTR_HTML)
 411                     up->link_expect_html = 1;
 412                 }
 413             }
 414         }
 415     }
 416 }
 417
 418 /* Handle the BASE tag, for <base href=...>. */
 419
 420 static void
 421 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 422 {
 423   struct urlpos *base_urlpos;
 424   int attrind;
 425   char *newbase = find_attr (tag, "href", &attrind);
 426   if (!newbase)
 427     return;
 428
 429   base_urlpos = append_url (newbase, tag, attrind, ctx);
 430   if (!base_urlpos)
 431     return;
 432   base_urlpos->ignore_when_downloading = 1;
 433   base_urlpos->link_base_p = 1;
 434
 435   if (ctx->base)
 436     xfree (ctx->base);
 437   if (ctx->parent_base)
 438     ctx->base = uri_merge (ctx->parent_base, newbase);
 439   else
 440     ctx->base = xstrdup (newbase);
 441 }
 442
 443 /* Mark the URL found in <form action=...> for conversion. */
 444
 445 static void
 446 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 447 {
 448   int attrind;
 449   char *action = find_attr (tag, "action", &attrind);
 450   if (action)
 451     {
 452       struct urlpos *up = append_url (action, tag, attrind, ctx);
 453       if (up)
 454         up->ignore_when_downloading = 1;
 455     }
 456 }
 457
 458 /* Handle the LINK tag.  It requires special handling because how its
 459    links will be followed in -p mode depends on the REL attribute.  */
 460
 461 static void
 462 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 463 {
 464   int attrind;
 465   char *href = find_attr (tag, "href", &attrind);
 466
 467   /* All <link href="..."> link references are external, except those
 468      known not to be, such as style sheet and shortcut icon:
 469
 470        <link rel="stylesheet" href="...">
 471        <link rel="shortcut icon" href="...">
 472   */
 473   if (href)
 474     {
 475       struct urlpos *up = append_url (href, tag, attrind, ctx);
 476       if (up)
 477         {
 478           char *rel = find_attr (tag, "rel", NULL);
 479           if (rel
 480               && (0 == strcasecmp (rel, "stylesheet")
 481                   || 0 == strcasecmp (rel, "shortcut icon")))
 482             up->link_inline_p = 1;
 483         }
 484     }
 485 }
 486
 487 /* Handle the META tag.  This requires special handling because of the
 488    refresh feature and because of robot exclusion.  */
 489
 490 static void
 491 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 492 {
 493   char *name = find_attr (tag, "name", NULL);
 494   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 495
 496   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 497     {
 498       /* Some pages use a META tag to specify that the page be
 499          refreshed by a new page after a given number of seconds.  The
 500          general format for this is:
 501
 502            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 503
 504          So we just need to skip past the "NUMBER; URL=" garbage to
 505          get to the URL.  */
 506
 507       struct urlpos *entry;
 508       int attrind;
 509       int timeout = 0;
 510       char *p;
 511
 512       char *refresh = find_attr (tag, "content", &attrind);
 513       if (!refresh)
 514         return;
 515
 516       for (p = refresh; ISDIGIT (*p); p++)
 517         timeout = 10 * timeout + *p - '0';
 518       if (*p++ != ';')
 519         return;
 520
 521       while (ISSPACE (*p))
 522         ++p;
 523       if (!(   TOUPPER (*p)       == 'U'
 524             && TOUPPER (*(p + 1)) == 'R'
 525             && TOUPPER (*(p + 2)) == 'L'
 526             &&          *(p + 3)  == '='))
 527         return;
 528       p += 4;
 529       while (ISSPACE (*p))
 530         ++p;
 531
 532       entry = append_url (p, tag, attrind, ctx);
 533       if (entry)
 534         {
 535           entry->link_refresh_p = 1;
 536           entry->refresh_timeout = timeout;
 537           entry->link_expect_html = 1;
 538         }
 539     }
 540   else if (name && 0 == strcasecmp (name, "robots"))
 541     {
 542       /* Handle stuff like:
 543          <meta name="robots" content="index,nofollow"> */
 544       char *content = find_attr (tag, "content", NULL);
 545       if (!content)
 546         return;
 547       if (!strcasecmp (content, "none"))
 548         ctx->nofollow = 1;
 549       else
 550         {
 551           while (*content)
 552             {
 553               /* Find the next occurrence of ',' or the end of
 554                  the string.  */
 555               char *end = strchr (content, ',');
 556               if (end)
 557                 ++end;
 558               else
 559                 end = content + strlen (content);
 560               if (!strncasecmp (content, "nofollow", end - content))
 561                 ctx->nofollow = 1;
 562               content = end;
 563             }
 564         }
 565     }
 566 }
 567
 568 /* Dispatch the tag handler appropriate for the tag we're mapping
 569    over.  See known_tags[] for definition of tag handlers.  */
 570
 571 static void
 572 collect_tags_mapper (struct taginfo *tag, void *arg)
 573 {
 574   struct map_context *ctx = (struct map_context *)arg;
 575
 576   /* Find the tag in our table of tags.  This must not fail because
 577      map_html_tags only returns tags found in interesting_tags.  */
 578   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 579   assert (t != NULL);
 580
 581   t->handler (t->tagid, tag, ctx);
 582 }
 583 \f
 584 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 585    it.  It merges relative links in FILE with URL.  It is aware of
 586    <base href=...> and does the right thing.  */
 587
 588 struct urlpos *
 589 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
 590 {
 591   struct file_memory *fm;
 592   struct map_context ctx;
 593   int flags;
 594
 595   /* Load the file. */
 596   fm = read_file (file);
 597   if (!fm)
 598     {
 599       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 600       return NULL;
 601     }
 602   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 603
 604   ctx.text = fm->content;
 605   ctx.head = ctx.tail = NULL;
 606   ctx.base = NULL;
 607   ctx.parent_base = url ? url : opt.base_href;
 608   ctx.document_file = file;
 609   ctx.nofollow = 0;
 610
 611   if (!interesting_tags)
 612     init_interesting ();
 613
 614   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 615      generate <a href=" foo"> instead of <a href="foo"> (browsers
 616      ignore spaces as well.)  If you really mean space, use &32; or
 617      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 618      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 619      ignored by IE and Mozilla and are presumably introduced by
 620      writing HTML with editors that force word wrap.  */
 621   flags = MHT_TRIM_VALUES;
 622   if (opt.strict_comments)
 623     flags |= MHT_STRICT_COMMENTS;
 624
 625   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 626                  interesting_tags, interesting_attributes);
 627
 628   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 629   if (meta_disallow_follow)
 630     *meta_disallow_follow = ctx.nofollow;
 631
 632   xfree_null (ctx.base);
 633   read_file_free (fm);
 634   return ctx.head;
 635 }
 636
 637 /* This doesn't really have anything to do with HTML, but it's similar
 638    to get_urls_html, so we put it here.  */
 639
 640 struct urlpos *
 641 get_urls_file (const char *file)
 642 {
 643   struct file_memory *fm;
 644   struct urlpos *head, *tail;
 645   const char *text, *text_end;
 646
 647   /* Load the file.  */
 648   fm = read_file (file);
 649   if (!fm)
 650     {
 651       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 652       return NULL;
 653     }
 654   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 655
 656   head = tail = NULL;
 657   text = fm->content;
 658   text_end = fm->content + fm->length;
 659   while (text < text_end)
 660     {
 661       int up_error_code;
 662       char *url_text;
 663       struct urlpos *entry;
 664       struct url *url;
 665
 666       const char *line_beg = text;
 667       const char *line_end = memchr (text, '\n', text_end - text);
 668       if (!line_end)
 669         line_end = text_end;
 670       else
 671         ++line_end;
 672       text = line_end;
 673
 674       /* Strip whitespace from the beginning and end of line. */
 675       while (line_beg < line_end && ISSPACE (*line_beg))
 676         ++line_beg;
 677       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
 678         --line_end;
 679
 680       if (line_beg == line_end)
 681         continue;
 682
 683       /* The URL is in the [line_beg, line_end) region. */
 684
 685       /* We must copy the URL to a zero-terminated string, and we
 686          can't use alloca because we're in a loop.  *sigh*.  */
 687       url_text = strdupdelim (line_beg, line_end);
 688
 689       if (opt.base_href)
 690         {
 691           /* Merge opt.base_href with URL. */
 692           char *merged = uri_merge (opt.base_href, url_text);
 693           xfree (url_text);
 694           url_text = merged;
 695         }
 696
 697       url = url_parse (url_text, &up_error_code);
 698       if (!url)
 699         {
 700           logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
 701                      file, url_text, url_error (up_error_code));
 702           xfree (url_text);
 703           continue;
 704         }
 705       xfree (url_text);
 706
 707       entry = xnew0 (struct urlpos);
 708       entry->next = NULL;
 709       entry->url = url;
 710
 711       if (!head)
 712         head = entry;
 713       else
 714         tail->next = entry;
 715       tail = entry;
 716     }
 717   read_file_free (fm);
 718   return head;
 719 }
 720
 721 void
 722 cleanup_html_url (void)
 723 {
 724   /* Destroy the hash tables.  The hash table keys and values are not
 725      allocated by this code, so we don't need to free them here.  */
 726   if (interesting_tags)
 727     hash_table_destroy (interesting_tags);
 728   if (interesting_attributes)
 729     hash_table_destroy (interesting_attributes);
 730 }