sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
#include "wget.h"

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include <limits.h>

#include "html-parse.h"
#include "url.h"
#include "utils.h"
#include "hash.h"
#include "convert.h"
#include "recur.h"              /* declaration of get_urls_html */
  44
  45 struct map_context;
  46
  47 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
  48
  49 #define DECLARE_TAG_HANDLER(fun)                                \
  50   static void fun (int, struct taginfo *, struct map_context *)
  51
  52 DECLARE_TAG_HANDLER (tag_find_urls);
  53 DECLARE_TAG_HANDLER (tag_handle_base);
  54 DECLARE_TAG_HANDLER (tag_handle_form);
  55 DECLARE_TAG_HANDLER (tag_handle_link);
  56 DECLARE_TAG_HANDLER (tag_handle_meta);
  57
  58 enum {
  59   TAG_A,
  60   TAG_APPLET,
  61   TAG_AREA,
  62   TAG_BASE,
  63   TAG_BGSOUND,
  64   TAG_BODY,
  65   TAG_EMBED,
  66   TAG_FIG,
  67   TAG_FORM,
  68   TAG_FRAME,
  69   TAG_IFRAME,
  70   TAG_IMG,
  71   TAG_INPUT,
  72   TAG_LAYER,
  73   TAG_LINK,
  74   TAG_META,
  75   TAG_OBJECT,
  76   TAG_OVERLAY,
  77   TAG_SCRIPT,
  78   TAG_TABLE,
  79   TAG_TD,
  80   TAG_TH
  81 };
  82
  83 /* The list of known tags and functions used for handling them.  Most
  84    tags are simply harvested for URLs. */
  85 static struct known_tag {
  86   int tagid;
  87   const char *name;
  88   tag_handler_t handler;
  89 } known_tags[] = {
  90   { TAG_A,       "a",           tag_find_urls },
  91   { TAG_APPLET,  "applet",      tag_find_urls },
  92   { TAG_AREA,    "area",        tag_find_urls },
  93   { TAG_BASE,    "base",        tag_handle_base },
  94   { TAG_BGSOUND, "bgsound",     tag_find_urls },
  95   { TAG_BODY,    "body",        tag_find_urls },
  96   { TAG_EMBED,   "embed",       tag_find_urls },
  97   { TAG_FIG,     "fig",         tag_find_urls },
  98   { TAG_FORM,    "form",        tag_handle_form },
  99   { TAG_FRAME,   "frame",       tag_find_urls },
 100   { TAG_IFRAME,  "iframe",      tag_find_urls },
 101   { TAG_IMG,     "img",         tag_find_urls },
 102   { TAG_INPUT,   "input",       tag_find_urls },
 103   { TAG_LAYER,   "layer",       tag_find_urls },
 104   { TAG_LINK,    "link",        tag_handle_link },
 105   { TAG_META,    "meta",        tag_handle_meta },
 106   { TAG_OBJECT,  "object",      tag_find_urls },
 107   { TAG_OVERLAY, "overlay",     tag_find_urls },
 108   { TAG_SCRIPT,  "script",      tag_find_urls },
 109   { TAG_TABLE,   "table",       tag_find_urls },
 110   { TAG_TD,      "td",          tag_find_urls },
 111   { TAG_TH,      "th",          tag_find_urls }
 112 };
 113
 114 /* tag_url_attributes documents which attributes of which tags contain
 115    URLs to harvest.  It is used by tag_find_urls.  */
 116
 117 /* Defines for the FLAGS. */
 118
 119 /* The link is "inline", i.e. needs to be retrieved for this document
 120    to be correctly rendered.  Inline links include inlined images,
 121    stylesheets, children frames, etc.  */
 122 #define ATTR_INLINE     1
 123
 124 /* The link is expected to yield HTML contents.  It's important not to
 125    try to follow HTML obtained by following e.g. <img src="...">
 126    regardless of content-type.  Doing this causes infinite loops for
 127    "images" that return non-404 error pages with links to the same
 128    image.  */
 129 #define ATTR_HTML       2
 130
 131 /* For tags handled by tag_find_urls: attributes that contain URLs to
 132    download. */
 133 static struct {
 134   int tagid;
 135   const char *attr_name;
 136   int flags;
 137 } tag_url_attributes[] = {
 138   { TAG_A,              "href",         ATTR_HTML },
 139   { TAG_APPLET,         "code",         ATTR_INLINE },
 140   { TAG_AREA,           "href",         ATTR_HTML },
 141   { TAG_BGSOUND,        "src",          ATTR_INLINE },
 142   { TAG_BODY,           "background",   ATTR_INLINE },
 143   { TAG_EMBED,          "href",         ATTR_HTML },
 144   { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
 145   { TAG_FIG,            "src",          ATTR_INLINE },
 146   { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
 147   { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
 148   { TAG_IMG,            "href",         ATTR_INLINE },
 149   { TAG_IMG,            "lowsrc",       ATTR_INLINE },
 150   { TAG_IMG,            "src",          ATTR_INLINE },
 151   { TAG_INPUT,          "src",          ATTR_INLINE },
 152   { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
 153   { TAG_OBJECT,         "data",         ATTR_INLINE },
 154   { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
 155   { TAG_SCRIPT,         "src",          ATTR_INLINE },
 156   { TAG_TABLE,          "background",   ATTR_INLINE },
 157   { TAG_TD,             "background",   ATTR_INLINE },
 158   { TAG_TH,             "background",   ATTR_INLINE }
 159 };
 160
 161 /* The lists of interesting tags and attributes are built dynamically,
 162    from the information above.  However, some places in the code refer
 163    to the attributes not mentioned here.  We add them manually.  */
 164 static const char *additional_attributes[] = {
 165   "rel",                        /* used by tag_handle_link */
 166   "http-equiv",                 /* used by tag_handle_meta */
 167   "name",                       /* used by tag_handle_meta */
 168   "content",                    /* used by tag_handle_meta */
 169   "action"                      /* used by tag_handle_form */
 170 };
 171
 172 static struct hash_table *interesting_tags;
 173 static struct hash_table *interesting_attributes;
 174
 175 static void
 176 init_interesting (void)
 177 {
 178   /* Init the variables interesting_tags and interesting_attributes
 179      that are used by the HTML parser to know which tags and
 180      attributes we're interested in.  We initialize this only once,
 181      for performance reasons.
 182
 183      Here we also make sure that what we put in interesting_tags
 184      matches the user's preferences as specified through --ignore-tags
 185      and --follow-tags.  */
 186
 187   int i;
 188   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 189
 190   /* First, add all the tags we know hot to handle, mapped to their
 191      respective entries in known_tags.  */
 192   for (i = 0; i < countof (known_tags); i++)
 193     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 194
 195   /* Then remove the tags ignored through --ignore-tags.  */
 196   if (opt.ignore_tags)
 197     {
 198       char **ignored;
 199       for (ignored = opt.ignore_tags; *ignored; ignored++)
 200         hash_table_remove (interesting_tags, *ignored);
 201     }
 202
 203   /* If --follow-tags is specified, use only those tags.  */
 204   if (opt.follow_tags)
 205     {
 206       /* Create a new table intersecting --follow-tags and known_tags,
 207          and use it as interesting_tags.  */
 208       struct hash_table *intersect = make_nocase_string_hash_table (0);
 209       char **followed;
 210       for (followed = opt.follow_tags; *followed; followed++)
 211         {
 212           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 213           if (!t)
 214             continue;           /* ignore unknown --follow-tags entries. */
 215           hash_table_put (intersect, *followed, t);
 216         }
 217       hash_table_destroy (interesting_tags);
 218       interesting_tags = intersect;
 219     }
 220
 221   /* Add the attributes we care about. */
 222   interesting_attributes = make_nocase_string_hash_table (10);
 223   for (i = 0; i < countof (additional_attributes); i++)
 224     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 225   for (i = 0; i < countof (tag_url_attributes); i++)
 226     hash_table_put (interesting_attributes,
 227                     tag_url_attributes[i].attr_name, "1");
 228 }
 229
 230 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 231    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 232    index of the attribute in TAG will be stored there.  */
 233
 234 static char *
 235 find_attr (struct taginfo *tag, const char *name, int *attrind)
 236 {
 237   int i;
 238   for (i = 0; i < tag->nattrs; i++)
 239     if (!strcasecmp (tag->attrs[i].name, name))
 240       {
 241         if (attrind)
 242           *attrind = i;
 243         return tag->attrs[i].value;
 244       }
 245   return NULL;
 246 }
 247
 248 struct map_context {
 249   char *text;                   /* HTML text. */
 250   char *base;                   /* Base URI of the document, possibly
 251                                    changed through <base href=...>. */
 252   const char *parent_base;      /* Base of the current document. */
 253   const char *document_file;    /* File name of this document. */
 254   bool nofollow;                /* whether NOFOLLOW was specified in a
 255                                    <meta name=robots> tag. */
 256
 257   struct urlpos *head, *tail;   /* List of URLs that is being
 258                                    built. */
 259 };
 260
 261 /* Append LINK_URI to the urlpos structure that is being built.
 262
 263    LINK_URI will be merged with the current document base.  TAG and
 264    ATTRIND are the necessary context to store the position and
 265    size.  */
 266
 267 static struct urlpos *
 268 append_url (const char *link_uri,
 269             struct taginfo *tag, int attrind, struct map_context *ctx)
 270 {
 271   int link_has_scheme = url_has_scheme (link_uri);
 272   struct urlpos *newel;
 273   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 274   struct url *url;
 275
 276   if (!base)
 277     {
 278       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 279                ctx->document_file, link_uri));
 280
 281       if (!link_has_scheme)
 282         {
 283           /* Base URL is unavailable, and the link does not have a
 284              location attached to it -- we have to give up.  Since
 285              this can only happen when using `--force-html -i', print
 286              a warning.  */
 287           logprintf (LOG_NOTQUIET,
 288                      _("%s: Cannot resolve incomplete link %s.\n"),
 289                      ctx->document_file, link_uri);
 290           return NULL;
 291         }
 292
 293       url = url_parse (link_uri, NULL);
 294       if (!url)
 295         {
 296           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 297                    ctx->document_file, link_uri));
 298           return NULL;
 299         }
 300     }
 301   else
 302     {
 303       /* Merge BASE with LINK_URI, but also make sure the result is
 304          canonicalized, i.e. that "../" have been resolved.
 305          (parse_url will do that for us.) */
 306
 307       char *complete_uri = uri_merge (base, link_uri);
 308
 309       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 310                ctx->document_file, base, link_uri, complete_uri));
 311
 312       url = url_parse (complete_uri, NULL);
 313       if (!url)
 314         {
 315           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 316                    ctx->document_file, complete_uri));
 317           xfree (complete_uri);
 318           return NULL;
 319         }
 320       xfree (complete_uri);
 321     }
 322
 323   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 324
 325   newel = xnew0 (struct urlpos);
 326   newel->url = url;
 327   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 328   newel->size = tag->attrs[attrind].value_raw_size;
 329
 330   /* A URL is relative if the host is not named, and the name does not
 331      start with `/'.  */
 332   if (!link_has_scheme && *link_uri != '/')
 333     newel->link_relative_p = 1;
 334   else if (link_has_scheme)
 335     newel->link_complete_p = 1;
 336
 337   if (ctx->tail)
 338     {
 339       ctx->tail->next = newel;
 340       ctx->tail = newel;
 341     }
 342   else
 343     ctx->tail = ctx->head = newel;
 344
 345   return newel;
 346 }
 347 \f
 348 /* All the tag_* functions are called from collect_tags_mapper, as
 349    specified by KNOWN_TAGS.  */
 350
 351 /* Default tag handler: collect URLs from attributes specified for
 352    this tag by tag_url_attributes.  */
 353
 354 static void
 355 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 356 {
 357   int i, attrind;
 358   int first = -1;
 359
 360   for (i = 0; i < countof (tag_url_attributes); i++)
 361     if (tag_url_attributes[i].tagid == tagid)
 362       {
 363         /* We've found the index of tag_url_attributes where the
 364            attributes of our tag begin.  */
 365         first = i;
 366         break;
 367       }
 368   assert (first != -1);
 369
 370   /* Loop over the "interesting" attributes of this tag.  In this
 371      example, it will loop over "src" and "lowsrc".
 372
 373        <img src="foo.png" lowsrc="bar.png">
 374
 375      This has to be done in the outer loop so that the attributes are
 376      processed in the same order in which they appear in the page.
 377      This is required when converting links.  */
 378
 379   for (attrind = 0; attrind < tag->nattrs; attrind++)
 380     {
 381       /* Find whether TAG/ATTRIND is a combination that contains a
 382          URL. */
 383       char *link = tag->attrs[attrind].value;
 384       const int size = countof (tag_url_attributes);
 385
 386       /* If you're cringing at the inefficiency of the nested loops,
 387          remember that they both iterate over a very small number of
 388          items.  The worst-case inner loop is for the IMG tag, which
 389          has three attributes.  */
 390       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 391         {
 392           if (0 == strcasecmp (tag->attrs[attrind].name,
 393                                tag_url_attributes[i].attr_name))
 394             {
 395               struct urlpos *up = append_url (link, tag, attrind, ctx);
 396               if (up)
 397                 {
 398                   int flags = tag_url_attributes[i].flags;
 399                   if (flags & ATTR_INLINE)
 400                     up->link_inline_p = 1;
 401                   if (flags & ATTR_HTML)
 402                     up->link_expect_html = 1;
 403                 }
 404             }
 405         }
 406     }
 407 }
 408
 409 /* Handle the BASE tag, for <base href=...>. */
 410
 411 static void
 412 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 413 {
 414   struct urlpos *base_urlpos;
 415   int attrind;
 416   char *newbase = find_attr (tag, "href", &attrind);
 417   if (!newbase)
 418     return;
 419
 420   base_urlpos = append_url (newbase, tag, attrind, ctx);
 421   if (!base_urlpos)
 422     return;
 423   base_urlpos->ignore_when_downloading = 1;
 424   base_urlpos->link_base_p = 1;
 425
 426   if (ctx->base)
 427     xfree (ctx->base);
 428   if (ctx->parent_base)
 429     ctx->base = uri_merge (ctx->parent_base, newbase);
 430   else
 431     ctx->base = xstrdup (newbase);
 432 }
 433
 434 /* Mark the URL found in <form action=...> for conversion. */
 435
 436 static void
 437 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 438 {
 439   int attrind;
 440   char *action = find_attr (tag, "action", &attrind);
 441   if (action)
 442     {
 443       struct urlpos *up = append_url (action, tag, attrind, ctx);
 444       if (up)
 445         up->ignore_when_downloading = 1;
 446     }
 447 }
 448
 449 /* Handle the LINK tag.  It requires special handling because how its
 450    links will be followed in -p mode depends on the REL attribute.  */
 451
 452 static void
 453 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 454 {
 455   int attrind;
 456   char *href = find_attr (tag, "href", &attrind);
 457
 458   /* All <link href="..."> link references are external, except those
 459      known not to be, such as style sheet and shortcut icon:
 460
 461        <link rel="stylesheet" href="...">
 462        <link rel="shortcut icon" href="...">
 463   */
 464   if (href)
 465     {
 466       struct urlpos *up = append_url (href, tag, attrind, ctx);
 467       if (up)
 468         {
 469           char *rel = find_attr (tag, "rel", NULL);
 470           if (rel
 471               && (0 == strcasecmp (rel, "stylesheet")
 472                   || 0 == strcasecmp (rel, "shortcut icon")))
 473             up->link_inline_p = 1;
 474           else
 475             /* The external ones usually point to HTML pages, such as
 476                <link rel="next" href="..."> */
 477             up->link_expect_html = 1;
 478         }
 479     }
 480 }
 481
 482 /* Handle the META tag.  This requires special handling because of the
 483    refresh feature and because of robot exclusion.  */
 484
 485 static void
 486 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 487 {
 488   char *name = find_attr (tag, "name", NULL);
 489   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 490
 491   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 492     {
 493       /* Some pages use a META tag to specify that the page be
 494          refreshed by a new page after a given number of seconds.  The
 495          general format for this is:
 496
 497            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 498
 499          So we just need to skip past the "NUMBER; URL=" garbage to
 500          get to the URL.  */
 501
 502       struct urlpos *entry;
 503       int attrind;
 504       int timeout = 0;
 505       char *p;
 506
 507       char *refresh = find_attr (tag, "content", &attrind);
 508       if (!refresh)
 509         return;
 510
 511       for (p = refresh; c_isdigit (*p); p++)
 512         timeout = 10 * timeout + *p - '0';
 513       if (*p++ != ';')
 514         return;
 515
 516       while (c_isspace (*p))
 517         ++p;
 518       if (!(   c_toupper (*p)       == 'U'
 519             && c_toupper (*(p + 1)) == 'R'
 520             && c_toupper (*(p + 2)) == 'L'
 521             &&          *(p + 3)  == '='))
 522         return;
 523       p += 4;
 524       while (c_isspace (*p))
 525         ++p;
 526
 527       entry = append_url (p, tag, attrind, ctx);
 528       if (entry)
 529         {
 530           entry->link_refresh_p = 1;
 531           entry->refresh_timeout = timeout;
 532           entry->link_expect_html = 1;
 533         }
 534     }
 535   else if (name && 0 == strcasecmp (name, "robots"))
 536     {
 537       /* Handle stuff like:
 538          <meta name="robots" content="index,nofollow"> */
 539       char *content = find_attr (tag, "content", NULL);
 540       if (!content)
 541         return;
 542       if (!strcasecmp (content, "none"))
 543         ctx->nofollow = true;
 544       else
 545         {
 546           while (*content)
 547             {
 548               /* Find the next occurrence of ',' or the end of
 549                  the string.  */
 550               char *end = strchr (content, ',');
 551               if (end)
 552                 ++end;
 553               else
 554                 end = content + strlen (content);
 555               if (!strncasecmp (content, "nofollow", end - content))
 556                 ctx->nofollow = true;
 557               content = end;
 558             }
 559         }
 560     }
 561 }
 562
 563 /* Dispatch the tag handler appropriate for the tag we're mapping
 564    over.  See known_tags[] for definition of tag handlers.  */
 565
 566 static void
 567 collect_tags_mapper (struct taginfo *tag, void *arg)
 568 {
 569   struct map_context *ctx = (struct map_context *)arg;
 570
 571   /* Find the tag in our table of tags.  This must not fail because
 572      map_html_tags only returns tags found in interesting_tags.  */
 573   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 574   assert (t != NULL);
 575
 576   t->handler (t->tagid, tag, ctx);
 577 }
 578 \f
 579 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 580    it.  It merges relative links in FILE with URL.  It is aware of
 581    <base href=...> and does the right thing.  */
 582
 583 struct urlpos *
 584 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
 585 {
 586   struct file_memory *fm;
 587   struct map_context ctx;
 588   int flags;
 589
 590   /* Load the file. */
 591   fm = read_file (file);
 592   if (!fm)
 593     {
 594       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 595       return NULL;
 596     }
 597   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 598
 599   ctx.text = fm->content;
 600   ctx.head = ctx.tail = NULL;
 601   ctx.base = NULL;
 602   ctx.parent_base = url ? url : opt.base_href;
 603   ctx.document_file = file;
 604   ctx.nofollow = false;
 605
 606   if (!interesting_tags)
 607     init_interesting ();
 608
 609   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 610      generate <a href=" foo"> instead of <a href="foo"> (browsers
 611      ignore spaces as well.)  If you really mean space, use &32; or
 612      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 613      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 614      ignored by IE and Mozilla and are presumably introduced by
 615      writing HTML with editors that force word wrap.  */
 616   flags = MHT_TRIM_VALUES;
 617   if (opt.strict_comments)
 618     flags |= MHT_STRICT_COMMENTS;
 619
 620   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 621                  interesting_tags, interesting_attributes);
 622
 623   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 624   if (meta_disallow_follow)
 625     *meta_disallow_follow = ctx.nofollow;
 626
 627   xfree_null (ctx.base);
 628   read_file_free (fm);
 629   return ctx.head;
 630 }
 631
 632 /* This doesn't really have anything to do with HTML, but it's similar
 633    to get_urls_html, so we put it here.  */
 634
 635 struct urlpos *
 636 get_urls_file (const char *file)
 637 {
 638   struct file_memory *fm;
 639   struct urlpos *head, *tail;
 640   const char *text, *text_end;
 641
 642   /* Load the file.  */
 643   fm = read_file (file);
 644   if (!fm)
 645     {
 646       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 647       return NULL;
 648     }
 649   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 650
 651   head = tail = NULL;
 652   text = fm->content;
 653   text_end = fm->content + fm->length;
 654   while (text < text_end)
 655     {
 656       int up_error_code;
 657       char *url_text;
 658       struct urlpos *entry;
 659       struct url *url;
 660
 661       const char *line_beg = text;
 662       const char *line_end = memchr (text, '\n', text_end - text);
 663       if (!line_end)
 664         line_end = text_end;
 665       else
 666         ++line_end;
 667       text = line_end;
 668
 669       /* Strip whitespace from the beginning and end of line. */
 670       while (line_beg < line_end && c_isspace (*line_beg))
 671         ++line_beg;
 672       while (line_end > line_beg && c_isspace (*(line_end - 1)))
 673         --line_end;
 674
 675       if (line_beg == line_end)
 676         continue;
 677
 678       /* The URL is in the [line_beg, line_end) region. */
 679
 680       /* We must copy the URL to a zero-terminated string, and we
 681          can't use alloca because we're in a loop.  *sigh*.  */
 682       url_text = strdupdelim (line_beg, line_end);
 683
 684       if (opt.base_href)
 685         {
 686           /* Merge opt.base_href with URL. */
 687           char *merged = uri_merge (opt.base_href, url_text);
 688           xfree (url_text);
 689           url_text = merged;
 690         }
 691
 692       url = url_parse (url_text, &up_error_code);
 693       if (!url)
 694         {
 695           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 696                      file, url_text, url_error (up_error_code));
 697           xfree (url_text);
 698           continue;
 699         }
 700       xfree (url_text);
 701
 702       entry = xnew0 (struct urlpos);
 703       entry->url = url;
 704
 705       if (!head)
 706         head = entry;
 707       else
 708         tail->next = entry;
 709       tail = entry;
 710     }
 711   read_file_free (fm);
 712   return head;
 713 }
 714
 715 void
 716 cleanup_html_url (void)
 717 {
 718   /* Destroy the hash tables.  The hash table keys and values are not
 719      allocated by this code, so we don't need to free them here.  */
 720   if (interesting_tags)
 721     hash_table_destroy (interesting_tags);
 722   if (interesting_attributes)
 723     hash_table_destroy (interesting_attributes);
 724 }