sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9  (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #ifdef HAVE_STRING_H
  34 # include <string.h>
  35 #else
  36 # include <strings.h>
  37 #endif
  38 #include <stdlib.h>
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "wget.h"
  43 #include "html-parse.h"
  44 #include "url.h"
  45 #include "utils.h"
  46 #include "hash.h"
  47 #include "convert.h"
  48 #include "recur.h"              /* declaration of get_urls_html */
  49
  50 #ifndef errno
  51 extern int errno;
  52 #endif
  53
  54 struct map_context;
  55
  56 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
  57                                        struct map_context *));
  58
  59 #define DECLARE_TAG_HANDLER(fun)                                        \
  60   static void fun PARAMS ((int, struct taginfo *, struct map_context *))
  61
  62 DECLARE_TAG_HANDLER (tag_find_urls);
  63 DECLARE_TAG_HANDLER (tag_handle_base);
  64 DECLARE_TAG_HANDLER (tag_handle_form);
  65 DECLARE_TAG_HANDLER (tag_handle_link);
  66 DECLARE_TAG_HANDLER (tag_handle_meta);
  67
  68 enum {
  69   TAG_A,
  70   TAG_APPLET,
  71   TAG_AREA,
  72   TAG_BASE,
  73   TAG_BGSOUND,
  74   TAG_BODY,
  75   TAG_EMBED,
  76   TAG_FIG,
  77   TAG_FORM,
  78   TAG_FRAME,
  79   TAG_IFRAME,
  80   TAG_IMG,
  81   TAG_INPUT,
  82   TAG_LAYER,
  83   TAG_LINK,
  84   TAG_META,
  85   TAG_OBJECT,
  86   TAG_OVERLAY,
  87   TAG_SCRIPT,
  88   TAG_TABLE,
  89   TAG_TD,
  90   TAG_TH
  91 };
  92
  93 /* The list of known tags and functions used for handling them.  Most
  94    tags are simply harvested for URLs. */
  95 static struct known_tag {
  96   int tagid;
  97   const char *name;
  98   tag_handler_t handler;
  99 } known_tags[] = {
 100   { TAG_A,       "a",           tag_find_urls },
 101   { TAG_APPLET,  "applet",      tag_find_urls },
 102   { TAG_AREA,    "area",        tag_find_urls },
 103   { TAG_BASE,    "base",        tag_handle_base },
 104   { TAG_BGSOUND, "bgsound",     tag_find_urls },
 105   { TAG_BODY,    "body",        tag_find_urls },
 106   { TAG_EMBED,   "embed",       tag_find_urls },
 107   { TAG_FIG,     "fig",         tag_find_urls },
 108   { TAG_FORM,    "form",        tag_handle_form },
 109   { TAG_FRAME,   "frame",       tag_find_urls },
 110   { TAG_IFRAME,  "iframe",      tag_find_urls },
 111   { TAG_IMG,     "img",         tag_find_urls },
 112   { TAG_INPUT,   "input",       tag_find_urls },
 113   { TAG_LAYER,   "layer",       tag_find_urls },
 114   { TAG_LINK,    "link",        tag_handle_link },
 115   { TAG_META,    "meta",        tag_handle_meta },
 116   { TAG_OBJECT,  "object",      tag_find_urls },
 117   { TAG_OVERLAY, "overlay",     tag_find_urls },
 118   { TAG_SCRIPT,  "script",      tag_find_urls },
 119   { TAG_TABLE,   "table",       tag_find_urls },
 120   { TAG_TD,      "td",          tag_find_urls },
 121   { TAG_TH,      "th",          tag_find_urls }
 122 };
 123
 124 /* tag_url_attributes documents which attributes of which tags contain
 125    URLs to harvest.  It is used by tag_find_urls.  */
 126
 127 /* Defines for the FLAGS. */
 128
 129 /* The link is "inline", i.e. needs to be retrieved for this document
 130    to be correctly rendered.  Inline links include inlined images,
 131    stylesheets, children frames, etc.  */
 132 #define ATTR_INLINE     1
 133
 134 /* The link is expected to yield HTML contents.  It's important not to
 135    try to follow HTML obtained by following e.g. <img src="...">
 136    regardless of content-type.  Doing this causes infinite loops for
 137    "images" that return non-404 error pages with links to the same
 138    image.  */
 139 #define ATTR_HTML       2
 140
 141 /* For tags handled by tag_find_urls: attributes that contain URLs to
 142    download. */
 143 static struct {
 144   int tagid;
 145   const char *attr_name;
 146   int flags;
 147 } tag_url_attributes[] = {
 148   { TAG_A,              "href",         ATTR_HTML },
 149   { TAG_APPLET,         "code",         ATTR_INLINE },
 150   { TAG_AREA,           "href",         ATTR_HTML },
 151   { TAG_BGSOUND,        "src",          ATTR_INLINE },
 152   { TAG_BODY,           "background",   ATTR_INLINE },
 153   { TAG_EMBED,          "href",         ATTR_HTML },
 154   { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
 155   { TAG_FIG,            "src",          ATTR_INLINE },
 156   { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
 157   { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
 158   { TAG_IMG,            "href",         ATTR_INLINE },
 159   { TAG_IMG,            "lowsrc",       ATTR_INLINE },
 160   { TAG_IMG,            "src",          ATTR_INLINE },
 161   { TAG_INPUT,          "src",          ATTR_INLINE },
 162   { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
 163   { TAG_OBJECT,         "data",         ATTR_INLINE },
 164   { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
 165   { TAG_SCRIPT,         "src",          ATTR_INLINE },
 166   { TAG_TABLE,          "background",   ATTR_INLINE },
 167   { TAG_TD,             "background",   ATTR_INLINE },
 168   { TAG_TH,             "background",   ATTR_INLINE }
 169 };
 170
 171 /* The lists of interesting tags and attributes are built dynamically,
 172    from the information above.  However, some places in the code refer
 173    to the attributes not mentioned here.  We add them manually.  */
 174 static const char *additional_attributes[] = {
 175   "rel",                        /* used by tag_handle_link */
 176   "http-equiv",                 /* used by tag_handle_meta */
 177   "name",                       /* used by tag_handle_meta */
 178   "content",                    /* used by tag_handle_meta */
 179   "action"                      /* used by tag_handle_form */
 180 };
 181
 182 struct hash_table *interesting_tags;
 183 struct hash_table *interesting_attributes;
 184
 185 static void
 186 init_interesting (void)
 187 {
 188   /* Init the variables interesting_tags and interesting_attributes
 189      that are used by the HTML parser to know which tags and
 190      attributes we're interested in.  We initialize this only once,
 191      for performance reasons.
 192
 193      Here we also make sure that what we put in interesting_tags
 194      matches the user's preferences as specified through --ignore-tags
 195      and --follow-tags.  */
 196
 197   int i;
 198   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 199
 200   /* First, add all the tags we know hot to handle, mapped to their
 201      respective entries in known_tags.  */
 202   for (i = 0; i < countof (known_tags); i++)
 203     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 204
 205   /* Then remove the tags ignored through --ignore-tags.  */
 206   if (opt.ignore_tags)
 207     {
 208       char **ignored;
 209       for (ignored = opt.ignore_tags; *ignored; ignored++)
 210         hash_table_remove (interesting_tags, *ignored);
 211     }
 212
 213   /* If --follow-tags is specified, use only those tags.  */
 214   if (opt.follow_tags)
 215     {
 216       /* Create a new table intersecting --follow-tags and known_tags,
 217          and use it as interesting_tags.  */
 218       struct hash_table *intersect = make_nocase_string_hash_table (0);
 219       char **followed;
 220       for (followed = opt.follow_tags; *followed; followed++)
 221         {
 222           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 223           if (!t)
 224             continue;           /* ignore unknown --follow-tags entries. */
 225           hash_table_put (intersect, *followed, t);
 226         }
 227       hash_table_destroy (interesting_tags);
 228       interesting_tags = intersect;
 229     }
 230
 231   /* Add the attributes we care about. */
 232   interesting_attributes = make_nocase_string_hash_table (10);
 233   for (i = 0; i < countof (additional_attributes); i++)
 234     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 235   for (i = 0; i < countof (tag_url_attributes); i++)
 236     hash_table_put (interesting_attributes,
 237                     tag_url_attributes[i].attr_name, "1");
 238 }
 239
 240 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 241    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 242    index of the attribute in TAG will be stored there.  */
 243
 244 static char *
 245 find_attr (struct taginfo *tag, const char *name, int *attrind)
 246 {
 247   int i;
 248   for (i = 0; i < tag->nattrs; i++)
 249     if (!strcasecmp (tag->attrs[i].name, name))
 250       {
 251         if (attrind)
 252           *attrind = i;
 253         return tag->attrs[i].value;
 254       }
 255   return NULL;
 256 }
 257
 258 struct map_context {
 259   char *text;                   /* HTML text. */
 260   char *base;                   /* Base URI of the document, possibly
 261                                    changed through <base href=...>. */
 262   const char *parent_base;      /* Base of the current document. */
 263   const char *document_file;    /* File name of this document. */
 264   int nofollow;                 /* whether NOFOLLOW was specified in a
 265                                    <meta name=robots> tag. */
 266
 267   struct urlpos *head, *tail;   /* List of URLs that is being
 268                                    built. */
 269 };
 270
 271 /* Append LINK_URI to the urlpos structure that is being built.
 272
 273    LINK_URI will be merged with the current document base.  TAG and
 274    ATTRIND are the necessary context to store the position and
 275    size.  */
 276
 277 static struct urlpos *
 278 append_url (const char *link_uri,
 279             struct taginfo *tag, int attrind, struct map_context *ctx)
 280 {
 281   int link_has_scheme = url_has_scheme (link_uri);
 282   struct urlpos *newel;
 283   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 284   struct url *url;
 285
 286   if (!base)
 287     {
 288       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 289                ctx->document_file, link_uri));
 290
 291       if (!link_has_scheme)
 292         {
 293           /* Base URL is unavailable, and the link does not have a
 294              location attached to it -- we have to give up.  Since
 295              this can only happen when using `--force-html -i', print
 296              a warning.  */
 297           logprintf (LOG_NOTQUIET,
 298                      _("%s: Cannot resolve incomplete link %s.\n"),
 299                      ctx->document_file, link_uri);
 300           return NULL;
 301         }
 302
 303       url = url_parse (link_uri, NULL);
 304       if (!url)
 305         {
 306           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 307                    ctx->document_file, link_uri));
 308           return NULL;
 309         }
 310     }
 311   else
 312     {
 313       /* Merge BASE with LINK_URI, but also make sure the result is
 314          canonicalized, i.e. that "../" have been resolved.
 315          (parse_url will do that for us.) */
 316
 317       char *complete_uri = uri_merge (base, link_uri);
 318
 319       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 320                ctx->document_file, base, link_uri, complete_uri));
 321
 322       url = url_parse (complete_uri, NULL);
 323       if (!url)
 324         {
 325           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 326                    ctx->document_file, complete_uri));
 327           xfree (complete_uri);
 328           return NULL;
 329         }
 330       xfree (complete_uri);
 331     }
 332
 333   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 334
 335   newel = xnew0 (struct urlpos);
 336   newel->url = url;
 337   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 338   newel->size = tag->attrs[attrind].value_raw_size;
 339
 340   /* A URL is relative if the host is not named, and the name does not
 341      start with `/'.  */
 342   if (!link_has_scheme && *link_uri != '/')
 343     newel->link_relative_p = 1;
 344   else if (link_has_scheme)
 345     newel->link_complete_p = 1;
 346
 347   if (ctx->tail)
 348     {
 349       ctx->tail->next = newel;
 350       ctx->tail = newel;
 351     }
 352   else
 353     ctx->tail = ctx->head = newel;
 354
 355   return newel;
 356 }
 357 \f
 358 /* All the tag_* functions are called from collect_tags_mapper, as
 359    specified by KNOWN_TAGS.  */
 360
 361 /* Default tag handler: collect URLs from attributes specified for
 362    this tag by tag_url_attributes.  */
 363
 364 static void
 365 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 366 {
 367   int i, attrind;
 368   int first = -1;
 369
 370   for (i = 0; i < countof (tag_url_attributes); i++)
 371     if (tag_url_attributes[i].tagid == tagid)
 372       {
 373         /* We've found the index of tag_url_attributes where the
 374            attributes of our tag begin.  */
 375         first = i;
 376         break;
 377       }
 378   assert (first != -1);
 379
 380   /* Loop over the "interesting" attributes of this tag.  In this
 381      example, it will loop over "src" and "lowsrc".
 382
 383        <img src="foo.png" lowsrc="bar.png">
 384
 385      This has to be done in the outer loop so that the attributes are
 386      processed in the same order in which they appear in the page.
 387      This is required when converting links.  */
 388
 389   for (attrind = 0; attrind < tag->nattrs; attrind++)
 390     {
 391       /* Find whether TAG/ATTRIND is a combination that contains a
 392          URL. */
 393       char *link = tag->attrs[attrind].value;
 394       const int size = countof (tag_url_attributes);
 395
 396       /* If you're cringing at the inefficiency of the nested loops,
 397          remember that they both iterate over a very small number of
 398          items.  The worst-case inner loop is for the IMG tag, which
 399          has three attributes.  */
 400       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 401         {
 402           if (0 == strcasecmp (tag->attrs[attrind].name,
 403                                tag_url_attributes[i].attr_name))
 404             {
 405               struct urlpos *up = append_url (link, tag, attrind, ctx);
 406               if (up)
 407                 {
 408                   int flags = tag_url_attributes[i].flags;
 409                   if (flags & ATTR_INLINE)
 410                     up->link_inline_p = 1;
 411                   if (flags & ATTR_HTML)
 412                     up->link_expect_html = 1;
 413                 }
 414             }
 415         }
 416     }
 417 }
 418
 419 /* Handle the BASE tag, for <base href=...>. */
 420
 421 static void
 422 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 423 {
 424   struct urlpos *base_urlpos;
 425   int attrind;
 426   char *newbase = find_attr (tag, "href", &attrind);
 427   if (!newbase)
 428     return;
 429
 430   base_urlpos = append_url (newbase, tag, attrind, ctx);
 431   if (!base_urlpos)
 432     return;
 433   base_urlpos->ignore_when_downloading = 1;
 434   base_urlpos->link_base_p = 1;
 435
 436   if (ctx->base)
 437     xfree (ctx->base);
 438   if (ctx->parent_base)
 439     ctx->base = uri_merge (ctx->parent_base, newbase);
 440   else
 441     ctx->base = xstrdup (newbase);
 442 }
 443
 444 /* Mark the URL found in <form action=...> for conversion. */
 445
 446 static void
 447 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 448 {
 449   int attrind;
 450   char *action = find_attr (tag, "action", &attrind);
 451   if (action)
 452     {
 453       struct urlpos *up = append_url (action, tag, attrind, ctx);
 454       if (up)
 455         up->ignore_when_downloading = 1;
 456     }
 457 }
 458
 459 /* Handle the LINK tag.  It requires special handling because how its
 460    links will be followed in -p mode depends on the REL attribute.  */
 461
 462 static void
 463 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 464 {
 465   int attrind;
 466   char *href = find_attr (tag, "href", &attrind);
 467
 468   /* All <link href="..."> link references are external, except those
 469      known not to be, such as style sheet and shortcut icon:
 470
 471        <link rel="stylesheet" href="...">
 472        <link rel="shortcut icon" href="...">
 473   */
 474   if (href)
 475     {
 476       struct urlpos *up = append_url (href, tag, attrind, ctx);
 477       if (up)
 478         {
 479           char *rel = find_attr (tag, "rel", NULL);
 480           if (rel
 481               && (0 == strcasecmp (rel, "stylesheet")
 482                   || 0 == strcasecmp (rel, "shortcut icon")))
 483             up->link_inline_p = 1;
 484           else
 485             /* The external ones usually point to HTML pages, such as
 486                <link rel="next" href="..."> */
 487             up->link_expect_html = 1;
 488         }
 489     }
 490 }
 491
 492 /* Handle the META tag.  This requires special handling because of the
 493    refresh feature and because of robot exclusion.  */
 494
 495 static void
 496 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 497 {
 498   char *name = find_attr (tag, "name", NULL);
 499   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 500
 501   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 502     {
 503       /* Some pages use a META tag to specify that the page be
 504          refreshed by a new page after a given number of seconds.  The
 505          general format for this is:
 506
 507            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 508
 509          So we just need to skip past the "NUMBER; URL=" garbage to
 510          get to the URL.  */
 511
 512       struct urlpos *entry;
 513       int attrind;
 514       int timeout = 0;
 515       char *p;
 516
 517       char *refresh = find_attr (tag, "content", &attrind);
 518       if (!refresh)
 519         return;
 520
 521       for (p = refresh; ISDIGIT (*p); p++)
 522         timeout = 10 * timeout + *p - '0';
 523       if (*p++ != ';')
 524         return;
 525
 526       while (ISSPACE (*p))
 527         ++p;
 528       if (!(   TOUPPER (*p)       == 'U'
 529             && TOUPPER (*(p + 1)) == 'R'
 530             && TOUPPER (*(p + 2)) == 'L'
 531             &&          *(p + 3)  == '='))
 532         return;
 533       p += 4;
 534       while (ISSPACE (*p))
 535         ++p;
 536
 537       entry = append_url (p, tag, attrind, ctx);
 538       if (entry)
 539         {
 540           entry->link_refresh_p = 1;
 541           entry->refresh_timeout = timeout;
 542           entry->link_expect_html = 1;
 543         }
 544     }
 545   else if (name && 0 == strcasecmp (name, "robots"))
 546     {
 547       /* Handle stuff like:
 548          <meta name="robots" content="index,nofollow"> */
 549       char *content = find_attr (tag, "content", NULL);
 550       if (!content)
 551         return;
 552       if (!strcasecmp (content, "none"))
 553         ctx->nofollow = 1;
 554       else
 555         {
 556           while (*content)
 557             {
 558               /* Find the next occurrence of ',' or the end of
 559                  the string.  */
 560               char *end = strchr (content, ',');
 561               if (end)
 562                 ++end;
 563               else
 564                 end = content + strlen (content);
 565               if (!strncasecmp (content, "nofollow", end - content))
 566                 ctx->nofollow = 1;
 567               content = end;
 568             }
 569         }
 570     }
 571 }
 572
 573 /* Dispatch the tag handler appropriate for the tag we're mapping
 574    over.  See known_tags[] for definition of tag handlers.  */
 575
 576 static void
 577 collect_tags_mapper (struct taginfo *tag, void *arg)
 578 {
 579   struct map_context *ctx = (struct map_context *)arg;
 580
 581   /* Find the tag in our table of tags.  This must not fail because
 582      map_html_tags only returns tags found in interesting_tags.  */
 583   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 584   assert (t != NULL);
 585
 586   t->handler (t->tagid, tag, ctx);
 587 }
 588 \f
 589 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 590    it.  It merges relative links in FILE with URL.  It is aware of
 591    <base href=...> and does the right thing.  */
 592
 593 struct urlpos *
 594 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
 595 {
 596   struct file_memory *fm;
 597   struct map_context ctx;
 598   int flags;
 599
 600   /* Load the file. */
 601   fm = read_file (file);
 602   if (!fm)
 603     {
 604       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 605       return NULL;
 606     }
 607   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 608
 609   ctx.text = fm->content;
 610   ctx.head = ctx.tail = NULL;
 611   ctx.base = NULL;
 612   ctx.parent_base = url ? url : opt.base_href;
 613   ctx.document_file = file;
 614   ctx.nofollow = 0;
 615
 616   if (!interesting_tags)
 617     init_interesting ();
 618
 619   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 620      generate <a href=" foo"> instead of <a href="foo"> (browsers
 621      ignore spaces as well.)  If you really mean space, use &32; or
 622      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 623      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 624      ignored by IE and Mozilla and are presumably introduced by
 625      writing HTML with editors that force word wrap.  */
 626   flags = MHT_TRIM_VALUES;
 627   if (opt.strict_comments)
 628     flags |= MHT_STRICT_COMMENTS;
 629
 630   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 631                  interesting_tags, interesting_attributes);
 632
 633   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 634   if (meta_disallow_follow)
 635     *meta_disallow_follow = ctx.nofollow;
 636
 637   xfree_null (ctx.base);
 638   read_file_free (fm);
 639   return ctx.head;
 640 }
 641
 642 /* This doesn't really have anything to do with HTML, but it's similar
 643    to get_urls_html, so we put it here.  */
 644
 645 struct urlpos *
 646 get_urls_file (const char *file)
 647 {
 648   struct file_memory *fm;
 649   struct urlpos *head, *tail;
 650   const char *text, *text_end;
 651
 652   /* Load the file.  */
 653   fm = read_file (file);
 654   if (!fm)
 655     {
 656       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 657       return NULL;
 658     }
 659   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 660
 661   head = tail = NULL;
 662   text = fm->content;
 663   text_end = fm->content + fm->length;
 664   while (text < text_end)
 665     {
 666       int up_error_code;
 667       char *url_text;
 668       struct urlpos *entry;
 669       struct url *url;
 670
 671       const char *line_beg = text;
 672       const char *line_end = memchr (text, '\n', text_end - text);
 673       if (!line_end)
 674         line_end = text_end;
 675       else
 676         ++line_end;
 677       text = line_end;
 678
 679       /* Strip whitespace from the beginning and end of line. */
 680       while (line_beg < line_end && ISSPACE (*line_beg))
 681         ++line_beg;
 682       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
 683         --line_end;
 684
 685       if (line_beg == line_end)
 686         continue;
 687
 688       /* The URL is in the [line_beg, line_end) region. */
 689
 690       /* We must copy the URL to a zero-terminated string, and we
 691          can't use alloca because we're in a loop.  *sigh*.  */
 692       url_text = strdupdelim (line_beg, line_end);
 693
 694       if (opt.base_href)
 695         {
 696           /* Merge opt.base_href with URL. */
 697           char *merged = uri_merge (opt.base_href, url_text);
 698           xfree (url_text);
 699           url_text = merged;
 700         }
 701
 702       url = url_parse (url_text, &up_error_code);
 703       if (!url)
 704         {
 705           logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
 706                      file, url_text, url_error (up_error_code));
 707           xfree (url_text);
 708           continue;
 709         }
 710       xfree (url_text);
 711
 712       entry = xnew0 (struct urlpos);
 713       entry->next = NULL;
 714       entry->url = url;
 715
 716       if (!head)
 717         head = entry;
 718       else
 719         tail->next = entry;
 720       tail = entry;
 721     }
 722   read_file_free (fm);
 723   return head;
 724 }
 725
 726 void
 727 cleanup_html_url (void)
 728 {
 729   /* Destroy the hash tables.  The hash table keys and values are not
 730      allocated by this code, so we don't need to free them here.  */
 731   if (interesting_tags)
 732     hash_table_destroy (interesting_tags);
 733   if (interesting_attributes)
 734     hash_table_destroy (interesting_attributes);
 735 }