sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9  (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #ifdef HAVE_STRING_H
  34 # include <string.h>
  35 #else
  36 # include <strings.h>
  37 #endif
  38 #include <stdlib.h>
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "wget.h"
  43 #include "html-parse.h"
  44 #include "url.h"
  45 #include "utils.h"
  46 #include "hash.h"
  47 #include "convert.h"
  48
  49 #ifndef errno
  50 extern int errno;
  51 #endif
  52
/* Forward declaration: the handler prototypes below take a pointer to
   the parsing context, whose definition appears later in this file.  */
struct map_context;

/* Signature shared by all tag handlers.  The arguments are the tag's
   id (one of the TAG_* constants), the parsed tag, and the context in
   which URLs are being collected.  */
typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
                                       struct map_context *));

/* Declare a file-local tag handler named FUN with the tag_handler_t
   signature.  */
#define DECLARE_TAG_HANDLER(fun)                                        \
  static void fun PARAMS ((int, struct taginfo *, struct map_context *))

/* The handlers are declared up front because the known_tags table
   refers to them before they are defined.  */
DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
  66
/* Ids of the tags this file knows how to process.  They are stored in
   known_tags, passed to the tag handlers, and used by
   tag_url_attributes to tie URL-bearing attributes to their tag.  */
enum {
  TAG_A,
  TAG_APPLET,
  TAG_AREA,
  TAG_BASE,
  TAG_BGSOUND,
  TAG_BODY,
  TAG_EMBED,
  TAG_FIG,
  TAG_FORM,
  TAG_FRAME,
  TAG_IFRAME,
  TAG_IMG,
  TAG_INPUT,
  TAG_LAYER,
  TAG_LINK,
  TAG_META,
  TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE,
  TAG_TD,
  TAG_TH
};
  90
/* The list of known tags and functions used for handling them.  Most
   tags are simply harvested for URLs by tag_find_urls; BASE, FORM,
   LINK and META have dedicated handlers.  */
static struct known_tag {
  int tagid;                    /* TAG_* id handed to the handler */
  const char *name;             /* tag name; key in interesting_tags */
  tag_handler_t handler;        /* called for each occurrence of the tag */
} known_tags[] = {
  { TAG_A,       "a",           tag_find_urls },
  { TAG_APPLET,  "applet",      tag_find_urls },
  { TAG_AREA,    "area",        tag_find_urls },
  { TAG_BASE,    "base",        tag_handle_base },
  { TAG_BGSOUND, "bgsound",     tag_find_urls },
  { TAG_BODY,    "body",        tag_find_urls },
  { TAG_EMBED,   "embed",       tag_find_urls },
  { TAG_FIG,     "fig",         tag_find_urls },
  { TAG_FORM,    "form",        tag_handle_form },
  { TAG_FRAME,   "frame",       tag_find_urls },
  { TAG_IFRAME,  "iframe",      tag_find_urls },
  { TAG_IMG,     "img",         tag_find_urls },
  { TAG_INPUT,   "input",       tag_find_urls },
  { TAG_LAYER,   "layer",       tag_find_urls },
  { TAG_LINK,    "link",        tag_handle_link },
  { TAG_META,    "meta",        tag_handle_meta },
  { TAG_OVERLAY, "overlay",     tag_find_urls },
  { TAG_SCRIPT,  "script",      tag_find_urls },
  { TAG_TABLE,   "table",       tag_find_urls },
  { TAG_TD,      "td",          tag_find_urls },
  { TAG_TH,      "th",          tag_find_urls }
};
 120
/* tag_url_attributes documents which attributes of which tags contain
   URLs to harvest.  It is used by tag_find_urls.  */

/* Defines for the FLAGS field; currently only one flag is defined. */

/* This tag points to an external document not necessary for rendering this
   document (i.e. it's not an inlined image, stylesheet, etc.). */
#define TUA_EXTERNAL 1

/* For tags handled by tag_find_urls: attributes that contain URLs to
   download.

   Two invariants must hold when editing this table:

   - Entries that share a TAGID must be adjacent.  tag_find_urls
     locates the first entry for a tag and then walks forward only
     while the tagid keeps matching.

   - Every tag whose handler in known_tags is tag_find_urls needs at
     least one entry here; tag_find_urls asserts that it finds one.  */
static struct {
  int tagid;                    /* one of the TAG_* constants */
  const char *attr_name;        /* attribute whose value is a URL */
  int flags;                    /* 0 or TUA_EXTERNAL */
} tag_url_attributes[] = {
  { TAG_A,              "href",         TUA_EXTERNAL },
  { TAG_APPLET,         "code",         0 },
  { TAG_AREA,           "href",         TUA_EXTERNAL },
  { TAG_BGSOUND,        "src",          0 },
  { TAG_BODY,           "background",   0 },
  { TAG_EMBED,          "href",         TUA_EXTERNAL },
  { TAG_EMBED,          "src",          0 },
  { TAG_FIG,            "src",          0 },
  { TAG_FRAME,          "src",          0 },
  { TAG_IFRAME,         "src",          0 },
  { TAG_IMG,            "href",         0 },
  { TAG_IMG,            "lowsrc",       0 },
  { TAG_IMG,            "src",          0 },
  { TAG_INPUT,          "src",          0 },
  { TAG_LAYER,          "src",          0 },
  { TAG_OVERLAY,        "src",          0 },
  { TAG_SCRIPT,         "src",          0 },
  { TAG_TABLE,          "background",   0 },
  { TAG_TD,             "background",   0 },
  { TAG_TH,             "background",   0 }
};
 158
/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, the dedicated tag handlers
   look up some attributes that tag_url_attributes does not mention.
   We list them here so that init_interesting can add them to
   interesting_attributes as well.  */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link */
  "http-equiv",                 /* used by tag_handle_meta */
  "name",                       /* used by tag_handle_meta */
  "content",                    /* used by tag_handle_meta */
  "action"                      /* used by tag_handle_form */
};
 169
/* Tables handed to map_html_tags to select what the parser reports.
   Both are created by init_interesting the first time get_urls_html
   runs.  interesting_tags maps a tag name to its known_tags entry;
   interesting_attributes is a set of attribute names.  */
struct hash_table *interesting_tags;
struct hash_table *interesting_attributes;
 172
 173 static void
 174 init_interesting (void)
 175 {
 176   /* Init the variables interesting_tags and interesting_attributes
 177      that are used by the HTML parser to know which tags and
 178      attributes we're interested in.  We initialize this only once,
 179      for performance reasons.
 180
 181      Here we also make sure that what we put in interesting_tags
 182      matches the user's preferences as specified through --ignore-tags
 183      and --follow-tags.  */
 184
 185   int i;
 186   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 187
 188   /* First, add all the tags we know hot to handle, mapped to their
 189      respective entries in known_tags.  */
 190   for (i = 0; i < countof (known_tags); i++)
 191     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 192
 193   /* Then remove the tags ignored through --ignore-tags.  */
 194   if (opt.ignore_tags)
 195     {
 196       char **ignored;
 197       for (ignored = opt.ignore_tags; *ignored; ignored++)
 198         hash_table_remove (interesting_tags, *ignored);
 199     }
 200
 201   /* If --follow-tags is specified, use only those tags.  */
 202   if (opt.follow_tags)
 203     {
 204       /* Create a new hash table with the intersection of tags in
 205          --follow-tags and known_tags, and use that as
 206          interesting_tags.  */
 207       struct hash_table *intersect = make_nocase_string_hash_table (0);
 208       char **followed;
 209       for (followed = opt.follow_tags; *followed; followed++)
 210         {
 211           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 212           if (!t)
 213             continue;           /* ignore unknown tags in --follow-tags. */
 214           hash_table_put (intersect, *followed, t);
 215         }
 216       hash_table_destroy (interesting_tags);
 217       interesting_tags = intersect;
 218     }
 219
 220   /* Add the attributes we care about. */
 221   interesting_attributes = make_nocase_string_hash_table (17);
 222   for (i = 0; i < countof (additional_attributes); i++)
 223     string_set_add (interesting_attributes, additional_attributes[i]);
 224   for (i = 0; i < countof (tag_url_attributes); i++)
 225     string_set_add (interesting_attributes, tag_url_attributes[i].attr_name);
 226 }
 227
 228 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 229    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 230    index of the attribute in TAG will be stored there.  */
 231
 232 static char *
 233 find_attr (struct taginfo *tag, const char *name, int *attrind)
 234 {
 235   int i;
 236   for (i = 0; i < tag->nattrs; i++)
 237     if (!strcasecmp (tag->attrs[i].name, name))
 238       {
 239         if (attrind)
 240           *attrind = i;
 241         return tag->attrs[i].value;
 242       }
 243   return NULL;
 244 }
 245
/* State shared by the tag handlers while one HTML document is being
   scanned for URLs.  */
struct map_context {
  char *text;                   /* HTML text; URL positions are stored
                                   as offsets from this pointer. */
  char *base;                   /* Base URI of the document, possibly
                                   changed through <base href=...>.
                                   NULL until a <base> tag is seen;
                                   takes precedence over parent_base. */
  const char *parent_base;      /* Base of the current document. */
  const char *document_file;    /* File name of this document. */
  int nofollow;                 /* whether NOFOLLOW was specified in a
                                   <meta name=robots> tag. */

  struct urlpos *head, *tail;   /* List of URLs that is being
                                   built. */
};
 258
 259 /* Append LINK_URI to the urlpos structure that is being built.
 260
 261    LINK_URI will be merged with the current document base.  TAG and
 262    ATTRIND are the necessary context to store the position and
 263    size.  */
 264
 265 static struct urlpos *
 266 append_one_url (const char *link_uri, int inlinep,
 267                 struct taginfo *tag, int attrind, struct map_context *ctx)
 268 {
 269   int link_has_scheme = url_has_scheme (link_uri);
 270   struct urlpos *newel;
 271   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 272   struct url *url;
 273
 274   if (!base)
 275     {
 276       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 277                ctx->document_file, link_uri));
 278
 279       if (!link_has_scheme)
 280         {
 281           /* Base URL is unavailable, and the link does not have a
 282              location attached to it -- we have to give up.  Since
 283              this can only happen when using `--force-html -i', print
 284              a warning.  */
 285           logprintf (LOG_NOTQUIET,
 286                      _("%s: Cannot resolve incomplete link %s.\n"),
 287                      ctx->document_file, link_uri);
 288           return NULL;
 289         }
 290
 291       url = url_parse (link_uri, NULL);
 292       if (!url)
 293         {
 294           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 295                    ctx->document_file, link_uri));
 296           return NULL;
 297         }
 298     }
 299   else
 300     {
 301       /* Merge BASE with LINK_URI, but also make sure the result is
 302          canonicalized, i.e. that "../" have been resolved.
 303          (parse_url will do that for us.) */
 304
 305       char *complete_uri = uri_merge (base, link_uri);
 306
 307       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 308                ctx->document_file, base, link_uri, complete_uri));
 309
 310       url = url_parse (complete_uri, NULL);
 311       if (!url)
 312         {
 313           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 314                    ctx->document_file, complete_uri));
 315           xfree (complete_uri);
 316           return NULL;
 317         }
 318       xfree (complete_uri);
 319     }
 320
 321   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 322
 323   newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
 324   memset (newel, 0, sizeof (*newel));
 325
 326   newel->next = NULL;
 327   newel->url = url;
 328   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 329   newel->size = tag->attrs[attrind].value_raw_size;
 330   newel->link_inline_p = inlinep;
 331
 332   /* A URL is relative if the host is not named, and the name does not
 333      start with `/'.  */
 334   if (!link_has_scheme && *link_uri != '/')
 335     newel->link_relative_p = 1;
 336   else if (link_has_scheme)
 337     newel->link_complete_p = 1;
 338
 339   if (ctx->tail)
 340     {
 341       ctx->tail->next = newel;
 342       ctx->tail = newel;
 343     }
 344   else
 345     ctx->tail = ctx->head = newel;
 346
 347   return newel;
 348 }
 349 \f
 350 /* All the tag_* functions are called from collect_tags_mapper, as
 351    specified by KNOWN_TAGS.  */
 352
 353 /* Default tag handler: collect URLs from attributes specified for
 354    this tag by tag_url_attributes.  */
 355
 356 static void
 357 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 358 {
 359   int i, attrind;
 360   int first = -1;
 361
 362   for (i = 0; i < countof (tag_url_attributes); i++)
 363     if (tag_url_attributes[i].tagid == tagid)
 364       {
 365         /* We've found the index of tag_url_attributes where the
 366            attributes of our tag begin.  */
 367         first = i;
 368         break;
 369       }
 370   assert (first != -1);
 371
 372   /* Loop over the "interesting" attributes of this tag.  In this
 373      example, it will loop over "src" and "lowsrc".
 374
 375        <img src="foo.png" lowsrc="bar.png">
 376
 377      This has to be done in the outer loop so that the attributes are
 378      processed in the same order in which they appear in the page.
 379      This is required when converting links.  */
 380
 381   for (attrind = 0; attrind < tag->nattrs; attrind++)
 382     {
 383       /* Find whether TAG/ATTRIND is a combination that contains a
 384          URL. */
 385       char *link = tag->attrs[attrind].value;
 386       const int size = countof (tag_url_attributes);
 387
 388       /* If you're cringing at the inefficiency of the nested loops,
 389          remember that they both iterate over a very small number of
 390          items.  The worst-case inner loop is for the IMG tag, which
 391          has three attributes.  */
 392       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 393         {
 394           if (0 == strcasecmp (tag->attrs[attrind].name,
 395                                tag_url_attributes[i].attr_name))
 396             {
 397               int flags = tag_url_attributes[i].flags;
 398               append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
 399             }
 400         }
 401     }
 402 }
 403
 404 /* Handle the BASE tag, for <base href=...>. */
 405
 406 static void
 407 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 408 {
 409   struct urlpos *base_urlpos;
 410   int attrind;
 411   char *newbase = find_attr (tag, "href", &attrind);
 412   if (!newbase)
 413     return;
 414
 415   base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
 416   if (!base_urlpos)
 417     return;
 418   base_urlpos->ignore_when_downloading = 1;
 419   base_urlpos->link_base_p = 1;
 420
 421   if (ctx->base)
 422     xfree (ctx->base);
 423   if (ctx->parent_base)
 424     ctx->base = uri_merge (ctx->parent_base, newbase);
 425   else
 426     ctx->base = xstrdup (newbase);
 427 }
 428
 429 /* Mark the URL found in <form action=...> for conversion. */
 430
 431 static void
 432 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 433 {
 434   int attrind;
 435   char *action = find_attr (tag, "action", &attrind);
 436   if (action)
 437     {
 438       struct urlpos *action_urlpos = append_one_url (action, 0, tag,
 439                                                      attrind, ctx);
 440       if (action_urlpos)
 441         action_urlpos->ignore_when_downloading = 1;
 442     }
 443 }
 444
 445 /* Handle the LINK tag.  It requires special handling because how its
 446    links will be followed in -p mode depends on the REL attribute.  */
 447
 448 static void
 449 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 450 {
 451   int attrind;
 452   char *href = find_attr (tag, "href", &attrind);
 453
 454   /* All <link href="..."> link references are external, except those
 455      known not to be, such as style sheet and shortcut icon:
 456
 457        <link rel="stylesheet" href="...">
 458        <link rel="shortcut icon" href="...">
 459   */
 460   if (href)
 461     {
 462       char *rel  = find_attr (tag, "rel", NULL);
 463       int inlinep = (rel
 464                      && (0 == strcasecmp (rel, "stylesheet")
 465                          || 0 == strcasecmp (rel, "shortcut icon")));
 466       append_one_url (href, inlinep, tag, attrind, ctx);
 467     }
 468 }
 469
 470 /* Handle the META tag.  This requires special handling because of the
 471    refresh feature and because of robot exclusion.  */
 472
 473 static void
 474 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 475 {
 476   char *name = find_attr (tag, "name", NULL);
 477   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 478
 479   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 480     {
 481       /* Some pages use a META tag to specify that the page be
 482          refreshed by a new page after a given number of seconds.  The
 483          general format for this is:
 484
 485            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 486
 487          So we just need to skip past the "NUMBER; URL=" garbage to
 488          get to the URL.  */
 489
 490       struct urlpos *entry;
 491       int attrind;
 492       int timeout = 0;
 493       char *p;
 494
 495       char *refresh = find_attr (tag, "content", &attrind);
 496       if (!refresh)
 497         return;
 498
 499       for (p = refresh; ISDIGIT (*p); p++)
 500         timeout = 10 * timeout + *p - '0';
 501       if (*p++ != ';')
 502         return;
 503
 504       while (ISSPACE (*p))
 505         ++p;
 506       if (!(   TOUPPER (*p)       == 'U'
 507             && TOUPPER (*(p + 1)) == 'R'
 508             && TOUPPER (*(p + 2)) == 'L'
 509             &&          *(p + 3)  == '='))
 510         return;
 511       p += 4;
 512       while (ISSPACE (*p))
 513         ++p;
 514
 515       entry = append_one_url (p, 0, tag, attrind, ctx);
 516       if (entry)
 517         {
 518           entry->link_refresh_p = 1;
 519           entry->refresh_timeout = timeout;
 520         }
 521     }
 522   else if (name && 0 == strcasecmp (name, "robots"))
 523     {
 524       /* Handle stuff like:
 525          <meta name="robots" content="index,nofollow"> */
 526       char *content = find_attr (tag, "content", NULL);
 527       if (!content)
 528         return;
 529       if (!strcasecmp (content, "none"))
 530         ctx->nofollow = 1;
 531       else
 532         {
 533           while (*content)
 534             {
 535               /* Find the next occurrence of ',' or the end of
 536                  the string.  */
 537               char *end = strchr (content, ',');
 538               if (end)
 539                 ++end;
 540               else
 541                 end = content + strlen (content);
 542               if (!strncasecmp (content, "nofollow", end - content))
 543                 ctx->nofollow = 1;
 544               content = end;
 545             }
 546         }
 547     }
 548 }
 549
 550 /* Dispatch the tag handler appropriate for the tag we're mapping
 551    over.  See known_tags[] for definition of tag handlers.  */
 552
 553 static void
 554 collect_tags_mapper (struct taginfo *tag, void *arg)
 555 {
 556   struct map_context *ctx = (struct map_context *)arg;
 557
 558   /* Find the tag in our table of tags.  This must not fail because
 559      map_html_tags only returns tags found in interesting_tags.  */
 560   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 561   assert (t != NULL);
 562
 563   t->handler (t->tagid, tag, ctx);
 564 }
 565 \f
 566 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 567    it.  It merges relative links in FILE with URL.  It is aware of
 568    <base href=...> and does the right thing.  */
 569
 570 struct urlpos *
 571 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
 572 {
 573   struct file_memory *fm;
 574   struct map_context ctx;
 575   int flags;
 576
 577   /* Load the file. */
 578   fm = read_file (file);
 579   if (!fm)
 580     {
 581       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 582       return NULL;
 583     }
 584   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 585
 586   ctx.text = fm->content;
 587   ctx.head = ctx.tail = NULL;
 588   ctx.base = NULL;
 589   ctx.parent_base = url ? url : opt.base_href;
 590   ctx.document_file = file;
 591   ctx.nofollow = 0;
 592
 593   if (!interesting_tags)
 594     init_interesting ();
 595
 596   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 597      generate <a href=" foo"> instead of <a href="foo"> (Netscape
 598      ignores spaces as well.)  If you really mean space, use &32; or
 599      %20.  */
 600   flags = MHT_TRIM_VALUES;
 601   if (opt.strict_comments)
 602     flags |= MHT_STRICT_COMMENTS;
 603
 604   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 605                  interesting_tags, interesting_attributes);
 606
 607   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 608   if (meta_disallow_follow)
 609     *meta_disallow_follow = ctx.nofollow;
 610
 611   FREE_MAYBE (ctx.base);
 612   read_file_free (fm);
 613   return ctx.head;
 614 }
 615
 616 /* This doesn't really have anything to do with HTML, but it's similar
 617    to get_urls_html, so we put it here.  */
 618
 619 struct urlpos *
 620 get_urls_file (const char *file)
 621 {
 622   struct file_memory *fm;
 623   struct urlpos *head, *tail;
 624   const char *text, *text_end;
 625
 626   /* Load the file.  */
 627   fm = read_file (file);
 628   if (!fm)
 629     {
 630       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 631       return NULL;
 632     }
 633   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 634
 635   head = tail = NULL;
 636   text = fm->content;
 637   text_end = fm->content + fm->length;
 638   while (text < text_end)
 639     {
 640       int up_error_code;
 641       char *url_text;
 642       struct urlpos *entry;
 643       struct url *url;
 644
 645       const char *line_beg = text;
 646       const char *line_end = memchr (text, '\n', text_end - text);
 647       if (!line_end)
 648         line_end = text_end;
 649       else
 650         ++line_end;
 651       text = line_end;
 652
 653       /* Strip whitespace from the beginning and end of line. */
 654       while (line_beg < line_end && ISSPACE (*line_beg))
 655         ++line_beg;
 656       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
 657         --line_end;
 658
 659       if (line_beg == line_end)
 660         continue;
 661
 662       /* The URL is in the [line_beg, line_end) region. */
 663
 664       /* We must copy the URL to a zero-terminated string, and we
 665          can't use alloca because we're in a loop.  *sigh*.  */
 666       url_text = strdupdelim (line_beg, line_end);
 667
 668       if (opt.base_href)
 669         {
 670           /* Merge opt.base_href with URL. */
 671           char *merged = uri_merge (opt.base_href, url_text);
 672           xfree (url_text);
 673           url_text = merged;
 674         }
 675
 676       url = url_parse (url_text, &up_error_code);
 677       if (!url)
 678         {
 679           logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
 680                      file, url_text, url_error (up_error_code));
 681           xfree (url_text);
 682           continue;
 683         }
 684       xfree (url_text);
 685
 686       entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
 687       memset (entry, 0, sizeof (*entry));
 688       entry->next = NULL;
 689       entry->url = url;
 690
 691       if (!head)
 692         head = entry;
 693       else
 694         tail->next = entry;
 695       tail = entry;
 696     }
 697   read_file_free (fm);
 698   return head;
 699 }
 700
 701 void
 702 cleanup_html_url (void)
 703 {
 704   FREE_MAYBE (interesting_tags);
 705   FREE_MAYBE (interesting_attributes);
 706 }