sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include <config.h>
  32
  33 #include <stdio.h>
  34 #include <string.h>
  35 #include <stdlib.h>
  36 #include <errno.h>
  37 #include <assert.h>
  38
  39 #include "wget.h"
  40 #include "html-parse.h"
  41 #include "url.h"
  42 #include "utils.h"
  43 #include "hash.h"
  44 #include "convert.h"
  45 #include "recur.h"              /* declaration of get_urls_html */
  46
/* Per-document state passed to every tag handler.  Only the name is
   needed here; the structure itself is defined further down in this
   file.  */
struct map_context;

/* Signature shared by all tag handlers.  The arguments are the
   numeric tag id (one of the TAG_* constants), the parsed tag, and
   the context of the document being scanned.  */
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);

/* Forward-declare a file-local tag handler named FUN.  The handlers
   are referenced by the known_tags table before their definitions
   appear, hence the declarations below.  */
#define DECLARE_TAG_HANDLER(fun)                                \
  static void fun (int, struct taginfo *, struct map_context *)

DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
  59
/* Numeric ids of the HTML tags this file knows about.  They are used
   as the `tagid' member of both known_tags and tag_url_attributes,
   and are passed to the tag handlers as their first argument.  */
enum {
  TAG_A,
  TAG_APPLET,
  TAG_AREA,
  TAG_BASE,
  TAG_BGSOUND,
  TAG_BODY,
  TAG_EMBED,
  TAG_FIG,
  TAG_FORM,
  TAG_FRAME,
  TAG_IFRAME,
  TAG_IMG,
  TAG_INPUT,
  TAG_LAYER,
  TAG_LINK,
  TAG_META,
  TAG_OBJECT,
  TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE,
  TAG_TD,
  TAG_TH
};
  84
/* The list of known tags and the functions used for handling them.
   Most tags are simply harvested for URLs by tag_find_urls; BASE,
   FORM, LINK and META have dedicated handlers.  init_interesting
   registers each entry in the interesting_tags hash table under its
   name, and collect_tags_mapper dispatches through it.  */
static struct known_tag {
  int tagid;                    /* one of the TAG_* constants */
  const char *name;             /* tag name used as the hash key */
  tag_handler_t handler;        /* function invoked for this tag */
} known_tags[] = {
  { TAG_A,       "a",           tag_find_urls },
  { TAG_APPLET,  "applet",      tag_find_urls },
  { TAG_AREA,    "area",        tag_find_urls },
  { TAG_BASE,    "base",        tag_handle_base },
  { TAG_BGSOUND, "bgsound",     tag_find_urls },
  { TAG_BODY,    "body",        tag_find_urls },
  { TAG_EMBED,   "embed",       tag_find_urls },
  { TAG_FIG,     "fig",         tag_find_urls },
  { TAG_FORM,    "form",        tag_handle_form },
  { TAG_FRAME,   "frame",       tag_find_urls },
  { TAG_IFRAME,  "iframe",      tag_find_urls },
  { TAG_IMG,     "img",         tag_find_urls },
  { TAG_INPUT,   "input",       tag_find_urls },
  { TAG_LAYER,   "layer",       tag_find_urls },
  { TAG_LINK,    "link",        tag_handle_link },
  { TAG_META,    "meta",        tag_handle_meta },
  { TAG_OBJECT,  "object",      tag_find_urls },
  { TAG_OVERLAY, "overlay",     tag_find_urls },
  { TAG_SCRIPT,  "script",      tag_find_urls },
  { TAG_TABLE,   "table",       tag_find_urls },
  { TAG_TD,      "td",          tag_find_urls },
  { TAG_TH,      "th",          tag_find_urls }
};
 115
/* tag_url_attributes (defined below, after the flag constants)
   documents which attributes of which tags contain URLs to harvest.
   It is used by tag_find_urls.  */

/* Defines for the FLAGS member of tag_url_attributes.  The values are
   distinct bits and may be OR-ed together.  */

/* The link is "inline", i.e. needs to be retrieved for this document
   to be correctly rendered.  Inline links include inlined images,
   stylesheets, children frames, etc.  */
#define ATTR_INLINE     1

/* The link is expected to yield HTML contents.  It's important not to
   try to follow HTML obtained by following e.g. <img src="...">
   regardless of content-type.  Doing this causes infinite loops for
   "images" that return non-404 error pages with links to the same
   image.  */
#define ATTR_HTML       2

/* For tags handled by tag_find_urls: attributes that contain URLs to
   download.

   All rows belonging to one tag must be adjacent: tag_find_urls
   locates the first row whose tagid matches and then walks forward
   only while the tagid keeps matching.  */
static struct {
  int tagid;                    /* one of the TAG_* constants */
  const char *attr_name;        /* attribute holding a URL */
  int flags;                    /* combination of ATTR_* bits */
} tag_url_attributes[] = {
  { TAG_A,              "href",         ATTR_HTML },
  { TAG_APPLET,         "code",         ATTR_INLINE },
  { TAG_AREA,           "href",         ATTR_HTML },
  { TAG_BGSOUND,        "src",          ATTR_INLINE },
  { TAG_BODY,           "background",   ATTR_INLINE },
  { TAG_EMBED,          "href",         ATTR_HTML },
  { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_FIG,            "src",          ATTR_INLINE },
  { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IMG,            "href",         ATTR_INLINE },
  { TAG_IMG,            "lowsrc",       ATTR_INLINE },
  { TAG_IMG,            "src",          ATTR_INLINE },
  { TAG_INPUT,          "src",          ATTR_INLINE },
  { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_OBJECT,         "data",         ATTR_INLINE },
  { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_SCRIPT,         "src",          ATTR_INLINE },
  { TAG_TABLE,          "background",   ATTR_INLINE },
  { TAG_TD,             "background",   ATTR_INLINE },
  { TAG_TH,             "background",   ATTR_INLINE }
};
 162
/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, some places in the code look
   up attributes that are not listed in tag_url_attributes.  Those are
   enumerated here so that init_interesting can add them to the
   interesting_attributes table as well.  */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link */
  "http-equiv",                 /* used by tag_handle_meta */
  "name",                       /* used by tag_handle_meta */
  "content",                    /* used by tag_handle_meta */
  "action"                      /* used by tag_handle_form */
};
 173
/* Case-insensitive hash tables handed to map_html_tags.  Both are
   created lazily by init_interesting (on the first get_urls_html
   call) and destroyed by cleanup_html_url.  interesting_tags maps a
   tag name to its known_tags entry; interesting_attributes maps an
   attribute name to the dummy value "1".  */
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;
 176
 177 static void
 178 init_interesting (void)
 179 {
 180   /* Init the variables interesting_tags and interesting_attributes
 181      that are used by the HTML parser to know which tags and
 182      attributes we're interested in.  We initialize this only once,
 183      for performance reasons.
 184
 185      Here we also make sure that what we put in interesting_tags
 186      matches the user's preferences as specified through --ignore-tags
 187      and --follow-tags.  */
 188
 189   int i;
 190   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 191
 192   /* First, add all the tags we know hot to handle, mapped to their
 193      respective entries in known_tags.  */
 194   for (i = 0; i < countof (known_tags); i++)
 195     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 196
 197   /* Then remove the tags ignored through --ignore-tags.  */
 198   if (opt.ignore_tags)
 199     {
 200       char **ignored;
 201       for (ignored = opt.ignore_tags; *ignored; ignored++)
 202         hash_table_remove (interesting_tags, *ignored);
 203     }
 204
 205   /* If --follow-tags is specified, use only those tags.  */
 206   if (opt.follow_tags)
 207     {
 208       /* Create a new table intersecting --follow-tags and known_tags,
 209          and use it as interesting_tags.  */
 210       struct hash_table *intersect = make_nocase_string_hash_table (0);
 211       char **followed;
 212       for (followed = opt.follow_tags; *followed; followed++)
 213         {
 214           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 215           if (!t)
 216             continue;           /* ignore unknown --follow-tags entries. */
 217           hash_table_put (intersect, *followed, t);
 218         }
 219       hash_table_destroy (interesting_tags);
 220       interesting_tags = intersect;
 221     }
 222
 223   /* Add the attributes we care about. */
 224   interesting_attributes = make_nocase_string_hash_table (10);
 225   for (i = 0; i < countof (additional_attributes); i++)
 226     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 227   for (i = 0; i < countof (tag_url_attributes); i++)
 228     hash_table_put (interesting_attributes,
 229                     tag_url_attributes[i].attr_name, "1");
 230 }
 231
 232 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 233    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 234    index of the attribute in TAG will be stored there.  */
 235
 236 static char *
 237 find_attr (struct taginfo *tag, const char *name, int *attrind)
 238 {
 239   int i;
 240   for (i = 0; i < tag->nattrs; i++)
 241     if (!strcasecmp (tag->attrs[i].name, name))
 242       {
 243         if (attrind)
 244           *attrind = i;
 245         return tag->attrs[i].value;
 246       }
 247   return NULL;
 248 }
 249
/* State shared by the tag handlers while one HTML document is being
   scanned.  get_urls_html fills it in before calling map_html_tags
   and returns HEAD to its caller afterwards.  */
struct map_context {
  char *text;                   /* HTML text.  URL positions are
                                   recorded as offsets from it. */
  char *base;                   /* Base URI of the document, possibly
                                   changed through <base href=...>.
                                   NULL until such a tag is seen;
                                   heap-allocated otherwise. */
  const char *parent_base;      /* Base of the current document. */
  const char *document_file;    /* File name of this document. */
  bool nofollow;                /* whether NOFOLLOW was specified in a
                                   <meta name=robots> tag. */

  struct urlpos *head, *tail;   /* List of URLs that is being
                                   built. */
};
 262
 263 /* Append LINK_URI to the urlpos structure that is being built.
 264
 265    LINK_URI will be merged with the current document base.  TAG and
 266    ATTRIND are the necessary context to store the position and
 267    size.  */
 268
 269 static struct urlpos *
 270 append_url (const char *link_uri,
 271             struct taginfo *tag, int attrind, struct map_context *ctx)
 272 {
 273   int link_has_scheme = url_has_scheme (link_uri);
 274   struct urlpos *newel;
 275   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 276   struct url *url;
 277
 278   if (!base)
 279     {
 280       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 281                ctx->document_file, link_uri));
 282
 283       if (!link_has_scheme)
 284         {
 285           /* Base URL is unavailable, and the link does not have a
 286              location attached to it -- we have to give up.  Since
 287              this can only happen when using `--force-html -i', print
 288              a warning.  */
 289           logprintf (LOG_NOTQUIET,
 290                      _("%s: Cannot resolve incomplete link %s.\n"),
 291                      ctx->document_file, link_uri);
 292           return NULL;
 293         }
 294
 295       url = url_parse (link_uri, NULL);
 296       if (!url)
 297         {
 298           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 299                    ctx->document_file, link_uri));
 300           return NULL;
 301         }
 302     }
 303   else
 304     {
 305       /* Merge BASE with LINK_URI, but also make sure the result is
 306          canonicalized, i.e. that "../" have been resolved.
 307          (parse_url will do that for us.) */
 308
 309       char *complete_uri = uri_merge (base, link_uri);
 310
 311       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 312                ctx->document_file, base, link_uri, complete_uri));
 313
 314       url = url_parse (complete_uri, NULL);
 315       if (!url)
 316         {
 317           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 318                    ctx->document_file, complete_uri));
 319           xfree (complete_uri);
 320           return NULL;
 321         }
 322       xfree (complete_uri);
 323     }
 324
 325   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 326
 327   newel = xnew0 (struct urlpos);
 328   newel->url = url;
 329   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 330   newel->size = tag->attrs[attrind].value_raw_size;
 331
 332   /* A URL is relative if the host is not named, and the name does not
 333      start with `/'.  */
 334   if (!link_has_scheme && *link_uri != '/')
 335     newel->link_relative_p = 1;
 336   else if (link_has_scheme)
 337     newel->link_complete_p = 1;
 338
 339   if (ctx->tail)
 340     {
 341       ctx->tail->next = newel;
 342       ctx->tail = newel;
 343     }
 344   else
 345     ctx->tail = ctx->head = newel;
 346
 347   return newel;
 348 }
 349 \f
 350 /* All the tag_* functions are called from collect_tags_mapper, as
 351    specified by KNOWN_TAGS.  */
 352
 353 /* Default tag handler: collect URLs from attributes specified for
 354    this tag by tag_url_attributes.  */
 355
 356 static void
 357 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 358 {
 359   int i, attrind;
 360   int first = -1;
 361
 362   for (i = 0; i < countof (tag_url_attributes); i++)
 363     if (tag_url_attributes[i].tagid == tagid)
 364       {
 365         /* We've found the index of tag_url_attributes where the
 366            attributes of our tag begin.  */
 367         first = i;
 368         break;
 369       }
 370   assert (first != -1);
 371
 372   /* Loop over the "interesting" attributes of this tag.  In this
 373      example, it will loop over "src" and "lowsrc".
 374
 375        <img src="foo.png" lowsrc="bar.png">
 376
 377      This has to be done in the outer loop so that the attributes are
 378      processed in the same order in which they appear in the page.
 379      This is required when converting links.  */
 380
 381   for (attrind = 0; attrind < tag->nattrs; attrind++)
 382     {
 383       /* Find whether TAG/ATTRIND is a combination that contains a
 384          URL. */
 385       char *link = tag->attrs[attrind].value;
 386       const int size = countof (tag_url_attributes);
 387
 388       /* If you're cringing at the inefficiency of the nested loops,
 389          remember that they both iterate over a very small number of
 390          items.  The worst-case inner loop is for the IMG tag, which
 391          has three attributes.  */
 392       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 393         {
 394           if (0 == strcasecmp (tag->attrs[attrind].name,
 395                                tag_url_attributes[i].attr_name))
 396             {
 397               struct urlpos *up = append_url (link, tag, attrind, ctx);
 398               if (up)
 399                 {
 400                   int flags = tag_url_attributes[i].flags;
 401                   if (flags & ATTR_INLINE)
 402                     up->link_inline_p = 1;
 403                   if (flags & ATTR_HTML)
 404                     up->link_expect_html = 1;
 405                 }
 406             }
 407         }
 408     }
 409 }
 410
 411 /* Handle the BASE tag, for <base href=...>. */
 412
 413 static void
 414 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 415 {
 416   struct urlpos *base_urlpos;
 417   int attrind;
 418   char *newbase = find_attr (tag, "href", &attrind);
 419   if (!newbase)
 420     return;
 421
 422   base_urlpos = append_url (newbase, tag, attrind, ctx);
 423   if (!base_urlpos)
 424     return;
 425   base_urlpos->ignore_when_downloading = 1;
 426   base_urlpos->link_base_p = 1;
 427
 428   if (ctx->base)
 429     xfree (ctx->base);
 430   if (ctx->parent_base)
 431     ctx->base = uri_merge (ctx->parent_base, newbase);
 432   else
 433     ctx->base = xstrdup (newbase);
 434 }
 435
 436 /* Mark the URL found in <form action=...> for conversion. */
 437
 438 static void
 439 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 440 {
 441   int attrind;
 442   char *action = find_attr (tag, "action", &attrind);
 443   if (action)
 444     {
 445       struct urlpos *up = append_url (action, tag, attrind, ctx);
 446       if (up)
 447         up->ignore_when_downloading = 1;
 448     }
 449 }
 450
 451 /* Handle the LINK tag.  It requires special handling because how its
 452    links will be followed in -p mode depends on the REL attribute.  */
 453
 454 static void
 455 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 456 {
 457   int attrind;
 458   char *href = find_attr (tag, "href", &attrind);
 459
 460   /* All <link href="..."> link references are external, except those
 461      known not to be, such as style sheet and shortcut icon:
 462
 463        <link rel="stylesheet" href="...">
 464        <link rel="shortcut icon" href="...">
 465   */
 466   if (href)
 467     {
 468       struct urlpos *up = append_url (href, tag, attrind, ctx);
 469       if (up)
 470         {
 471           char *rel = find_attr (tag, "rel", NULL);
 472           if (rel
 473               && (0 == strcasecmp (rel, "stylesheet")
 474                   || 0 == strcasecmp (rel, "shortcut icon")))
 475             up->link_inline_p = 1;
 476           else
 477             /* The external ones usually point to HTML pages, such as
 478                <link rel="next" href="..."> */
 479             up->link_expect_html = 1;
 480         }
 481     }
 482 }
 483
 484 /* Handle the META tag.  This requires special handling because of the
 485    refresh feature and because of robot exclusion.  */
 486
 487 static void
 488 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 489 {
 490   char *name = find_attr (tag, "name", NULL);
 491   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 492
 493   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 494     {
 495       /* Some pages use a META tag to specify that the page be
 496          refreshed by a new page after a given number of seconds.  The
 497          general format for this is:
 498
 499            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 500
 501          So we just need to skip past the "NUMBER; URL=" garbage to
 502          get to the URL.  */
 503
 504       struct urlpos *entry;
 505       int attrind;
 506       int timeout = 0;
 507       char *p;
 508
 509       char *refresh = find_attr (tag, "content", &attrind);
 510       if (!refresh)
 511         return;
 512
 513       for (p = refresh; ISDIGIT (*p); p++)
 514         timeout = 10 * timeout + *p - '0';
 515       if (*p++ != ';')
 516         return;
 517
 518       while (ISSPACE (*p))
 519         ++p;
 520       if (!(   TOUPPER (*p)       == 'U'
 521             && TOUPPER (*(p + 1)) == 'R'
 522             && TOUPPER (*(p + 2)) == 'L'
 523             &&          *(p + 3)  == '='))
 524         return;
 525       p += 4;
 526       while (ISSPACE (*p))
 527         ++p;
 528
 529       entry = append_url (p, tag, attrind, ctx);
 530       if (entry)
 531         {
 532           entry->link_refresh_p = 1;
 533           entry->refresh_timeout = timeout;
 534           entry->link_expect_html = 1;
 535         }
 536     }
 537   else if (name && 0 == strcasecmp (name, "robots"))
 538     {
 539       /* Handle stuff like:
 540          <meta name="robots" content="index,nofollow"> */
 541       char *content = find_attr (tag, "content", NULL);
 542       if (!content)
 543         return;
 544       if (!strcasecmp (content, "none"))
 545         ctx->nofollow = true;
 546       else
 547         {
 548           while (*content)
 549             {
 550               /* Find the next occurrence of ',' or the end of
 551                  the string.  */
 552               char *end = strchr (content, ',');
 553               if (end)
 554                 ++end;
 555               else
 556                 end = content + strlen (content);
 557               if (!strncasecmp (content, "nofollow", end - content))
 558                 ctx->nofollow = true;
 559               content = end;
 560             }
 561         }
 562     }
 563 }
 564
 565 /* Dispatch the tag handler appropriate for the tag we're mapping
 566    over.  See known_tags[] for definition of tag handlers.  */
 567
 568 static void
 569 collect_tags_mapper (struct taginfo *tag, void *arg)
 570 {
 571   struct map_context *ctx = (struct map_context *)arg;
 572
 573   /* Find the tag in our table of tags.  This must not fail because
 574      map_html_tags only returns tags found in interesting_tags.  */
 575   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 576   assert (t != NULL);
 577
 578   t->handler (t->tagid, tag, ctx);
 579 }
 580 \f
 581 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 582    it.  It merges relative links in FILE with URL.  It is aware of
 583    <base href=...> and does the right thing.  */
 584
 585 struct urlpos *
 586 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
 587 {
 588   struct file_memory *fm;
 589   struct map_context ctx;
 590   int flags;
 591
 592   /* Load the file. */
 593   fm = read_file (file);
 594   if (!fm)
 595     {
 596       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 597       return NULL;
 598     }
 599   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 600
 601   ctx.text = fm->content;
 602   ctx.head = ctx.tail = NULL;
 603   ctx.base = NULL;
 604   ctx.parent_base = url ? url : opt.base_href;
 605   ctx.document_file = file;
 606   ctx.nofollow = false;
 607
 608   if (!interesting_tags)
 609     init_interesting ();
 610
 611   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 612      generate <a href=" foo"> instead of <a href="foo"> (browsers
 613      ignore spaces as well.)  If you really mean space, use &32; or
 614      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 615      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 616      ignored by IE and Mozilla and are presumably introduced by
 617      writing HTML with editors that force word wrap.  */
 618   flags = MHT_TRIM_VALUES;
 619   if (opt.strict_comments)
 620     flags |= MHT_STRICT_COMMENTS;
 621
 622   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 623                  interesting_tags, interesting_attributes);
 624
 625   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 626   if (meta_disallow_follow)
 627     *meta_disallow_follow = ctx.nofollow;
 628
 629   xfree_null (ctx.base);
 630   read_file_free (fm);
 631   return ctx.head;
 632 }
 633
 634 /* This doesn't really have anything to do with HTML, but it's similar
 635    to get_urls_html, so we put it here.  */
 636
 637 struct urlpos *
 638 get_urls_file (const char *file)
 639 {
 640   struct file_memory *fm;
 641   struct urlpos *head, *tail;
 642   const char *text, *text_end;
 643
 644   /* Load the file.  */
 645   fm = read_file (file);
 646   if (!fm)
 647     {
 648       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 649       return NULL;
 650     }
 651   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 652
 653   head = tail = NULL;
 654   text = fm->content;
 655   text_end = fm->content + fm->length;
 656   while (text < text_end)
 657     {
 658       int up_error_code;
 659       char *url_text;
 660       struct urlpos *entry;
 661       struct url *url;
 662
 663       const char *line_beg = text;
 664       const char *line_end = memchr (text, '\n', text_end - text);
 665       if (!line_end)
 666         line_end = text_end;
 667       else
 668         ++line_end;
 669       text = line_end;
 670
 671       /* Strip whitespace from the beginning and end of line. */
 672       while (line_beg < line_end && ISSPACE (*line_beg))
 673         ++line_beg;
 674       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
 675         --line_end;
 676
 677       if (line_beg == line_end)
 678         continue;
 679
 680       /* The URL is in the [line_beg, line_end) region. */
 681
 682       /* We must copy the URL to a zero-terminated string, and we
 683          can't use alloca because we're in a loop.  *sigh*.  */
 684       url_text = strdupdelim (line_beg, line_end);
 685
 686       if (opt.base_href)
 687         {
 688           /* Merge opt.base_href with URL. */
 689           char *merged = uri_merge (opt.base_href, url_text);
 690           xfree (url_text);
 691           url_text = merged;
 692         }
 693
 694       url = url_parse (url_text, &up_error_code);
 695       if (!url)
 696         {
 697           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 698                      file, url_text, url_error (up_error_code));
 699           xfree (url_text);
 700           continue;
 701         }
 702       xfree (url_text);
 703
 704       entry = xnew0 (struct urlpos);
 705       entry->url = url;
 706
 707       if (!head)
 708         head = entry;
 709       else
 710         tail->next = entry;
 711       tail = entry;
 712     }
 713   read_file_free (fm);
 714   return head;
 715 }
 716
 717 void
 718 cleanup_html_url (void)
 719 {
 720   /* Destroy the hash tables.  The hash table keys and values are not
 721      allocated by this code, so we don't need to free them here.  */
 722   if (interesting_tags)
 723     hash_table_destroy (interesting_tags);
 724   if (interesting_attributes)
 725     hash_table_destroy (interesting_attributes);
 726 }