sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <string.h>
  34 #include <stdlib.h>
  35 #include <errno.h>
  36 #include <assert.h>
  37
  38 #include "wget.h"
  39 #include "html-parse.h"
  40 #include "url.h"
  41 #include "utils.h"
  42 #include "hash.h"
  43 #include "convert.h"
  44 #include "recur.h"              /* declaration of get_urls_html */
  45
/* Parsing state shared by all tag handlers.  Only a forward
   declaration is needed here; the structure itself is defined later
   in this file.  */
struct map_context;

/* Signature common to every tag handler.  The arguments are the
   TAG_* id of the tag, the parsed tag itself, and the parsing
   context.  */
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);

/* Declare FUN as a static function with the tag_handler_t
   signature.  */
#define DECLARE_TAG_HANDLER(fun)                                \
  static void fun (int, struct taginfo *, struct map_context *)

/* The handlers are declared up front because known_tags[] refers to
   them before their definitions appear.  */
DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
  58
/* Numeric ids of the HTML tags this file knows about.  Each id
   appears once in known_tags[], and is also used in
   tag_url_attributes[] to tie URL-bearing attributes to their
   tag.  */
enum {
  TAG_A,
  TAG_APPLET,
  TAG_AREA,
  TAG_BASE,
  TAG_BGSOUND,
  TAG_BODY,
  TAG_EMBED,
  TAG_FIG,
  TAG_FORM,
  TAG_FRAME,
  TAG_IFRAME,
  TAG_IMG,
  TAG_INPUT,
  TAG_LAYER,
  TAG_LINK,
  TAG_META,
  TAG_OBJECT,
  TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE,
  TAG_TD,
  TAG_TH
};
  83
/* The list of known tags and the functions used for handling them.
   Most tags are simply harvested for URLs by tag_find_urls; BASE,
   FORM, LINK and META have dedicated handlers.

   init_interesting stores pointers to these entries in the
   interesting_tags hash table, keyed by NAME, and
   collect_tags_mapper calls HANDLER with TAGID for each tag the
   parser reports.  */
static struct known_tag {
  int tagid;
  const char *name;
  tag_handler_t handler;
} known_tags[] = {
  { TAG_A,       "a",           tag_find_urls },
  { TAG_APPLET,  "applet",      tag_find_urls },
  { TAG_AREA,    "area",        tag_find_urls },
  { TAG_BASE,    "base",        tag_handle_base },
  { TAG_BGSOUND, "bgsound",     tag_find_urls },
  { TAG_BODY,    "body",        tag_find_urls },
  { TAG_EMBED,   "embed",       tag_find_urls },
  { TAG_FIG,     "fig",         tag_find_urls },
  { TAG_FORM,    "form",        tag_handle_form },
  { TAG_FRAME,   "frame",       tag_find_urls },
  { TAG_IFRAME,  "iframe",      tag_find_urls },
  { TAG_IMG,     "img",         tag_find_urls },
  { TAG_INPUT,   "input",       tag_find_urls },
  { TAG_LAYER,   "layer",       tag_find_urls },
  { TAG_LINK,    "link",        tag_handle_link },
  { TAG_META,    "meta",        tag_handle_meta },
  { TAG_OBJECT,  "object",      tag_find_urls },
  { TAG_OVERLAY, "overlay",     tag_find_urls },
  { TAG_SCRIPT,  "script",      tag_find_urls },
  { TAG_TABLE,   "table",       tag_find_urls },
  { TAG_TD,      "td",          tag_find_urls },
  { TAG_TH,      "th",          tag_find_urls }
};
 114
/* tag_url_attributes documents which attributes of which tags contain
   URLs to harvest.  It is used by tag_find_urls.  */

/* Defines for the FLAGS member of tag_url_attributes entries.  The
   values are distinct bits and may be OR-ed together.  */

/* The link is "inline", i.e. needs to be retrieved for this document
   to be correctly rendered.  Inline links include inlined images,
   stylesheets, children frames, etc.  tag_find_urls translates this
   flag into the link_inline_p mark on the collected URL.  */
#define ATTR_INLINE     1

/* The link is expected to yield HTML contents.  It's important not to
   try to follow HTML obtained by following e.g. <img src="...">
   regardless of content-type.  Doing this causes infinite loops for
   "images" that return non-404 error pages with links to the same
   image.  tag_find_urls translates this flag into the
   link_expect_html mark on the collected URL.  */
#define ATTR_HTML       2
 131
/* For tags handled by tag_find_urls: attributes that contain URLs to
   download, together with the ATTR_* flags describing the link.

   Entries belonging to the same tag must stay adjacent:
   tag_find_urls locates the first entry for a tag and then scans
   forward only while the tag id remains the same.  */
static struct {
  int tagid;
  const char *attr_name;
  int flags;
} tag_url_attributes[] = {
  { TAG_A,              "href",         ATTR_HTML },
  { TAG_APPLET,         "code",         ATTR_INLINE },
  { TAG_AREA,           "href",         ATTR_HTML },
  { TAG_BGSOUND,        "src",          ATTR_INLINE },
  { TAG_BODY,           "background",   ATTR_INLINE },
  { TAG_EMBED,          "href",         ATTR_HTML },
  { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_FIG,            "src",          ATTR_INLINE },
  { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IMG,            "href",         ATTR_INLINE },
  { TAG_IMG,            "lowsrc",       ATTR_INLINE },
  { TAG_IMG,            "src",          ATTR_INLINE },
  { TAG_INPUT,          "src",          ATTR_INLINE },
  { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_OBJECT,         "data",         ATTR_INLINE },
  { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_SCRIPT,         "src",          ATTR_INLINE },
  { TAG_TABLE,          "background",   ATTR_INLINE },
  { TAG_TD,             "background",   ATTR_INLINE },
  { TAG_TH,             "background",   ATTR_INLINE }
};
 161
/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, some places in the code refer
   to attributes that are not mentioned in tag_url_attributes.  Those
   are listed here so that init_interesting can add them to
   interesting_attributes as well.  ("href", which tag_handle_base
   and tag_handle_link look up, needs no entry because
   tag_url_attributes already contains it.)  */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link */
  "http-equiv",                 /* used by tag_handle_meta */
  "name",                       /* used by tag_handle_meta */
  "content",                    /* used by tag_handle_meta */
  "action"                      /* used by tag_handle_form */
};
 172
/* Tables handed to map_html_tags to tell it which tags and attributes
   to report.  Both are filled in by init_interesting, which
   get_urls_html calls while interesting_tags is still NULL, and both
   are destroyed by cleanup_html_url.  interesting_tags maps a tag
   name to its entry in known_tags[].  */
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;
 175
 176 static void
 177 init_interesting (void)
 178 {
 179   /* Init the variables interesting_tags and interesting_attributes
 180      that are used by the HTML parser to know which tags and
 181      attributes we're interested in.  We initialize this only once,
 182      for performance reasons.
 183
 184      Here we also make sure that what we put in interesting_tags
 185      matches the user's preferences as specified through --ignore-tags
 186      and --follow-tags.  */
 187
 188   int i;
 189   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 190
 191   /* First, add all the tags we know hot to handle, mapped to their
 192      respective entries in known_tags.  */
 193   for (i = 0; i < countof (known_tags); i++)
 194     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 195
 196   /* Then remove the tags ignored through --ignore-tags.  */
 197   if (opt.ignore_tags)
 198     {
 199       char **ignored;
 200       for (ignored = opt.ignore_tags; *ignored; ignored++)
 201         hash_table_remove (interesting_tags, *ignored);
 202     }
 203
 204   /* If --follow-tags is specified, use only those tags.  */
 205   if (opt.follow_tags)
 206     {
 207       /* Create a new table intersecting --follow-tags and known_tags,
 208          and use it as interesting_tags.  */
 209       struct hash_table *intersect = make_nocase_string_hash_table (0);
 210       char **followed;
 211       for (followed = opt.follow_tags; *followed; followed++)
 212         {
 213           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 214           if (!t)
 215             continue;           /* ignore unknown --follow-tags entries. */
 216           hash_table_put (intersect, *followed, t);
 217         }
 218       hash_table_destroy (interesting_tags);
 219       interesting_tags = intersect;
 220     }
 221
 222   /* Add the attributes we care about. */
 223   interesting_attributes = make_nocase_string_hash_table (10);
 224   for (i = 0; i < countof (additional_attributes); i++)
 225     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 226   for (i = 0; i < countof (tag_url_attributes); i++)
 227     hash_table_put (interesting_attributes,
 228                     tag_url_attributes[i].attr_name, "1");
 229 }
 230
 231 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 232    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 233    index of the attribute in TAG will be stored there.  */
 234
 235 static char *
 236 find_attr (struct taginfo *tag, const char *name, int *attrind)
 237 {
 238   int i;
 239   for (i = 0; i < tag->nattrs; i++)
 240     if (!strcasecmp (tag->attrs[i].name, name))
 241       {
 242         if (attrind)
 243           *attrind = i;
 244         return tag->attrs[i].value;
 245       }
 246   return NULL;
 247 }
 248
/* State threaded through map_html_tags to the tag handlers while one
   document is being scanned by get_urls_html.  */
struct map_context {
  /* HTML text.  append_url subtracts this pointer from an attribute's
     raw value position to obtain the offset stored in the urlpos.  */
  char *text;
  /* Base URI of the document, possibly changed through
     <base href=...>.  NULL until a BASE tag is seen; the string is
     allocated by tag_handle_base and released with xfree.  */
  char *base;
  /* Base of the current document: the URL passed to get_urls_html
     or, failing that, opt.base_href.  Used when BASE is NULL.  */
  const char *parent_base;
  /* File name of this document; used in diagnostics.  */
  const char *document_file;
  /* Whether NOFOLLOW was specified in a <meta name=robots> tag.  */
  bool nofollow;

  /* List of URLs that is being built: first and last element.  */
  struct urlpos *head, *tail;
};
 261
 262 /* Append LINK_URI to the urlpos structure that is being built.
 263
 264    LINK_URI will be merged with the current document base.  TAG and
 265    ATTRIND are the necessary context to store the position and
 266    size.  */
 267
 268 static struct urlpos *
 269 append_url (const char *link_uri,
 270             struct taginfo *tag, int attrind, struct map_context *ctx)
 271 {
 272   int link_has_scheme = url_has_scheme (link_uri);
 273   struct urlpos *newel;
 274   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 275   struct url *url;
 276
 277   if (!base)
 278     {
 279       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 280                ctx->document_file, link_uri));
 281
 282       if (!link_has_scheme)
 283         {
 284           /* Base URL is unavailable, and the link does not have a
 285              location attached to it -- we have to give up.  Since
 286              this can only happen when using `--force-html -i', print
 287              a warning.  */
 288           logprintf (LOG_NOTQUIET,
 289                      _("%s: Cannot resolve incomplete link %s.\n"),
 290                      ctx->document_file, link_uri);
 291           return NULL;
 292         }
 293
 294       url = url_parse (link_uri, NULL);
 295       if (!url)
 296         {
 297           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 298                    ctx->document_file, link_uri));
 299           return NULL;
 300         }
 301     }
 302   else
 303     {
 304       /* Merge BASE with LINK_URI, but also make sure the result is
 305          canonicalized, i.e. that "../" have been resolved.
 306          (parse_url will do that for us.) */
 307
 308       char *complete_uri = uri_merge (base, link_uri);
 309
 310       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 311                ctx->document_file, base, link_uri, complete_uri));
 312
 313       url = url_parse (complete_uri, NULL);
 314       if (!url)
 315         {
 316           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 317                    ctx->document_file, complete_uri));
 318           xfree (complete_uri);
 319           return NULL;
 320         }
 321       xfree (complete_uri);
 322     }
 323
 324   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 325
 326   newel = xnew0 (struct urlpos);
 327   newel->url = url;
 328   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 329   newel->size = tag->attrs[attrind].value_raw_size;
 330
 331   /* A URL is relative if the host is not named, and the name does not
 332      start with `/'.  */
 333   if (!link_has_scheme && *link_uri != '/')
 334     newel->link_relative_p = 1;
 335   else if (link_has_scheme)
 336     newel->link_complete_p = 1;
 337
 338   if (ctx->tail)
 339     {
 340       ctx->tail->next = newel;
 341       ctx->tail = newel;
 342     }
 343   else
 344     ctx->tail = ctx->head = newel;
 345
 346   return newel;
 347 }
 348 \f
 349 /* All the tag_* functions are called from collect_tags_mapper, as
 350    specified by KNOWN_TAGS.  */
 351
 352 /* Default tag handler: collect URLs from attributes specified for
 353    this tag by tag_url_attributes.  */
 354
 355 static void
 356 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 357 {
 358   int i, attrind;
 359   int first = -1;
 360
 361   for (i = 0; i < countof (tag_url_attributes); i++)
 362     if (tag_url_attributes[i].tagid == tagid)
 363       {
 364         /* We've found the index of tag_url_attributes where the
 365            attributes of our tag begin.  */
 366         first = i;
 367         break;
 368       }
 369   assert (first != -1);
 370
 371   /* Loop over the "interesting" attributes of this tag.  In this
 372      example, it will loop over "src" and "lowsrc".
 373
 374        <img src="foo.png" lowsrc="bar.png">
 375
 376      This has to be done in the outer loop so that the attributes are
 377      processed in the same order in which they appear in the page.
 378      This is required when converting links.  */
 379
 380   for (attrind = 0; attrind < tag->nattrs; attrind++)
 381     {
 382       /* Find whether TAG/ATTRIND is a combination that contains a
 383          URL. */
 384       char *link = tag->attrs[attrind].value;
 385       const int size = countof (tag_url_attributes);
 386
 387       /* If you're cringing at the inefficiency of the nested loops,
 388          remember that they both iterate over a very small number of
 389          items.  The worst-case inner loop is for the IMG tag, which
 390          has three attributes.  */
 391       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 392         {
 393           if (0 == strcasecmp (tag->attrs[attrind].name,
 394                                tag_url_attributes[i].attr_name))
 395             {
 396               struct urlpos *up = append_url (link, tag, attrind, ctx);
 397               if (up)
 398                 {
 399                   int flags = tag_url_attributes[i].flags;
 400                   if (flags & ATTR_INLINE)
 401                     up->link_inline_p = 1;
 402                   if (flags & ATTR_HTML)
 403                     up->link_expect_html = 1;
 404                 }
 405             }
 406         }
 407     }
 408 }
 409
 410 /* Handle the BASE tag, for <base href=...>. */
 411
 412 static void
 413 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 414 {
 415   struct urlpos *base_urlpos;
 416   int attrind;
 417   char *newbase = find_attr (tag, "href", &attrind);
 418   if (!newbase)
 419     return;
 420
 421   base_urlpos = append_url (newbase, tag, attrind, ctx);
 422   if (!base_urlpos)
 423     return;
 424   base_urlpos->ignore_when_downloading = 1;
 425   base_urlpos->link_base_p = 1;
 426
 427   if (ctx->base)
 428     xfree (ctx->base);
 429   if (ctx->parent_base)
 430     ctx->base = uri_merge (ctx->parent_base, newbase);
 431   else
 432     ctx->base = xstrdup (newbase);
 433 }
 434
 435 /* Mark the URL found in <form action=...> for conversion. */
 436
 437 static void
 438 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 439 {
 440   int attrind;
 441   char *action = find_attr (tag, "action", &attrind);
 442   if (action)
 443     {
 444       struct urlpos *up = append_url (action, tag, attrind, ctx);
 445       if (up)
 446         up->ignore_when_downloading = 1;
 447     }
 448 }
 449
 450 /* Handle the LINK tag.  It requires special handling because how its
 451    links will be followed in -p mode depends on the REL attribute.  */
 452
 453 static void
 454 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 455 {
 456   int attrind;
 457   char *href = find_attr (tag, "href", &attrind);
 458
 459   /* All <link href="..."> link references are external, except those
 460      known not to be, such as style sheet and shortcut icon:
 461
 462        <link rel="stylesheet" href="...">
 463        <link rel="shortcut icon" href="...">
 464   */
 465   if (href)
 466     {
 467       struct urlpos *up = append_url (href, tag, attrind, ctx);
 468       if (up)
 469         {
 470           char *rel = find_attr (tag, "rel", NULL);
 471           if (rel
 472               && (0 == strcasecmp (rel, "stylesheet")
 473                   || 0 == strcasecmp (rel, "shortcut icon")))
 474             up->link_inline_p = 1;
 475           else
 476             /* The external ones usually point to HTML pages, such as
 477                <link rel="next" href="..."> */
 478             up->link_expect_html = 1;
 479         }
 480     }
 481 }
 482
 483 /* Handle the META tag.  This requires special handling because of the
 484    refresh feature and because of robot exclusion.  */
 485
 486 static void
 487 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 488 {
 489   char *name = find_attr (tag, "name", NULL);
 490   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 491
 492   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 493     {
 494       /* Some pages use a META tag to specify that the page be
 495          refreshed by a new page after a given number of seconds.  The
 496          general format for this is:
 497
 498            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 499
 500          So we just need to skip past the "NUMBER; URL=" garbage to
 501          get to the URL.  */
 502
 503       struct urlpos *entry;
 504       int attrind;
 505       int timeout = 0;
 506       char *p;
 507
 508       char *refresh = find_attr (tag, "content", &attrind);
 509       if (!refresh)
 510         return;
 511
 512       for (p = refresh; ISDIGIT (*p); p++)
 513         timeout = 10 * timeout + *p - '0';
 514       if (*p++ != ';')
 515         return;
 516
 517       while (ISSPACE (*p))
 518         ++p;
 519       if (!(   TOUPPER (*p)       == 'U'
 520             && TOUPPER (*(p + 1)) == 'R'
 521             && TOUPPER (*(p + 2)) == 'L'
 522             &&          *(p + 3)  == '='))
 523         return;
 524       p += 4;
 525       while (ISSPACE (*p))
 526         ++p;
 527
 528       entry = append_url (p, tag, attrind, ctx);
 529       if (entry)
 530         {
 531           entry->link_refresh_p = 1;
 532           entry->refresh_timeout = timeout;
 533           entry->link_expect_html = 1;
 534         }
 535     }
 536   else if (name && 0 == strcasecmp (name, "robots"))
 537     {
 538       /* Handle stuff like:
 539          <meta name="robots" content="index,nofollow"> */
 540       char *content = find_attr (tag, "content", NULL);
 541       if (!content)
 542         return;
 543       if (!strcasecmp (content, "none"))
 544         ctx->nofollow = true;
 545       else
 546         {
 547           while (*content)
 548             {
 549               /* Find the next occurrence of ',' or the end of
 550                  the string.  */
 551               char *end = strchr (content, ',');
 552               if (end)
 553                 ++end;
 554               else
 555                 end = content + strlen (content);
 556               if (!strncasecmp (content, "nofollow", end - content))
 557                 ctx->nofollow = true;
 558               content = end;
 559             }
 560         }
 561     }
 562 }
 563
 564 /* Dispatch the tag handler appropriate for the tag we're mapping
 565    over.  See known_tags[] for definition of tag handlers.  */
 566
 567 static void
 568 collect_tags_mapper (struct taginfo *tag, void *arg)
 569 {
 570   struct map_context *ctx = (struct map_context *)arg;
 571
 572   /* Find the tag in our table of tags.  This must not fail because
 573      map_html_tags only returns tags found in interesting_tags.  */
 574   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 575   assert (t != NULL);
 576
 577   t->handler (t->tagid, tag, ctx);
 578 }
 579 \f
 580 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 581    it.  It merges relative links in FILE with URL.  It is aware of
 582    <base href=...> and does the right thing.  */
 583
 584 struct urlpos *
 585 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
 586 {
 587   struct file_memory *fm;
 588   struct map_context ctx;
 589   int flags;
 590
 591   /* Load the file. */
 592   fm = read_file (file);
 593   if (!fm)
 594     {
 595       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 596       return NULL;
 597     }
 598   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 599
 600   ctx.text = fm->content;
 601   ctx.head = ctx.tail = NULL;
 602   ctx.base = NULL;
 603   ctx.parent_base = url ? url : opt.base_href;
 604   ctx.document_file = file;
 605   ctx.nofollow = false;
 606
 607   if (!interesting_tags)
 608     init_interesting ();
 609
 610   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 611      generate <a href=" foo"> instead of <a href="foo"> (browsers
 612      ignore spaces as well.)  If you really mean space, use &32; or
 613      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 614      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 615      ignored by IE and Mozilla and are presumably introduced by
 616      writing HTML with editors that force word wrap.  */
 617   flags = MHT_TRIM_VALUES;
 618   if (opt.strict_comments)
 619     flags |= MHT_STRICT_COMMENTS;
 620
 621   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 622                  interesting_tags, interesting_attributes);
 623
 624   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 625   if (meta_disallow_follow)
 626     *meta_disallow_follow = ctx.nofollow;
 627
 628   xfree_null (ctx.base);
 629   read_file_free (fm);
 630   return ctx.head;
 631 }
 632
 633 /* This doesn't really have anything to do with HTML, but it's similar
 634    to get_urls_html, so we put it here.  */
 635
 636 struct urlpos *
 637 get_urls_file (const char *file)
 638 {
 639   struct file_memory *fm;
 640   struct urlpos *head, *tail;
 641   const char *text, *text_end;
 642
 643   /* Load the file.  */
 644   fm = read_file (file);
 645   if (!fm)
 646     {
 647       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 648       return NULL;
 649     }
 650   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 651
 652   head = tail = NULL;
 653   text = fm->content;
 654   text_end = fm->content + fm->length;
 655   while (text < text_end)
 656     {
 657       int up_error_code;
 658       char *url_text;
 659       struct urlpos *entry;
 660       struct url *url;
 661
 662       const char *line_beg = text;
 663       const char *line_end = memchr (text, '\n', text_end - text);
 664       if (!line_end)
 665         line_end = text_end;
 666       else
 667         ++line_end;
 668       text = line_end;
 669
 670       /* Strip whitespace from the beginning and end of line. */
 671       while (line_beg < line_end && ISSPACE (*line_beg))
 672         ++line_beg;
 673       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
 674         --line_end;
 675
 676       if (line_beg == line_end)
 677         continue;
 678
 679       /* The URL is in the [line_beg, line_end) region. */
 680
 681       /* We must copy the URL to a zero-terminated string, and we
 682          can't use alloca because we're in a loop.  *sigh*.  */
 683       url_text = strdupdelim (line_beg, line_end);
 684
 685       if (opt.base_href)
 686         {
 687           /* Merge opt.base_href with URL. */
 688           char *merged = uri_merge (opt.base_href, url_text);
 689           xfree (url_text);
 690           url_text = merged;
 691         }
 692
 693       url = url_parse (url_text, &up_error_code);
 694       if (!url)
 695         {
 696           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 697                      file, url_text, url_error (up_error_code));
 698           xfree (url_text);
 699           continue;
 700         }
 701       xfree (url_text);
 702
 703       entry = xnew0 (struct urlpos);
 704       entry->url = url;
 705
 706       if (!head)
 707         head = entry;
 708       else
 709         tail->next = entry;
 710       tail = entry;
 711     }
 712   read_file_free (fm);
 713   return head;
 714 }
 715
 716 void
 717 cleanup_html_url (void)
 718 {
 719   /* Destroy the hash tables.  The hash table keys and values are not
 720      allocated by this code, so we don't need to free them here.  */
 721   if (interesting_tags)
 722     hash_table_destroy (interesting_tags);
 723   if (interesting_attributes)
 724     hash_table_destroy (interesting_attributes);
 725 }