sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9  (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #ifdef HAVE_STRING_H
  34 # include <string.h>
  35 #else
  36 # include <strings.h>
  37 #endif
  38 #include <stdlib.h>
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "wget.h"
  43 #include "html-parse.h"
  44 #include "url.h"
  45 #include "utils.h"
  46 #include "hash.h"
  47 #include "convert.h"
  48
  49 #ifndef errno
  50 extern int errno;
  51 #endif
  52
  53 struct map_context;
  54
  55 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
  56                                        struct map_context *));
  57
  58 #define DECLARE_TAG_HANDLER(fun)                                        \
  59   static void fun PARAMS ((int, struct taginfo *, struct map_context *))
  60
  61 DECLARE_TAG_HANDLER (tag_find_urls);
  62 DECLARE_TAG_HANDLER (tag_handle_base);
  63 DECLARE_TAG_HANDLER (tag_handle_form);
  64 DECLARE_TAG_HANDLER (tag_handle_link);
  65 DECLARE_TAG_HANDLER (tag_handle_meta);
  66
  67 enum {
  68   TAG_A,
  69   TAG_APPLET,
  70   TAG_AREA,
  71   TAG_BASE,
  72   TAG_BGSOUND,
  73   TAG_BODY,
  74   TAG_EMBED,
  75   TAG_FIG,
  76   TAG_FORM,
  77   TAG_FRAME,
  78   TAG_IFRAME,
  79   TAG_IMG,
  80   TAG_INPUT,
  81   TAG_LAYER,
  82   TAG_LINK,
  83   TAG_META,
  84   TAG_OVERLAY,
  85   TAG_SCRIPT,
  86   TAG_TABLE,
  87   TAG_TD,
  88   TAG_TH
  89 };
  90
  91 /* The list of known tags and functions used for handling them.  Most
  92    tags are simply harvested for URLs. */
  93 static struct known_tag {
  94   int tagid;
  95   const char *name;
  96   tag_handler_t handler;
  97 } known_tags[] = {
  98   { TAG_A,       "a",           tag_find_urls },
  99   { TAG_APPLET,  "applet",      tag_find_urls },
 100   { TAG_AREA,    "area",        tag_find_urls },
 101   { TAG_BASE,    "base",        tag_handle_base },
 102   { TAG_BGSOUND, "bgsound",     tag_find_urls },
 103   { TAG_BODY,    "body",        tag_find_urls },
 104   { TAG_EMBED,   "embed",       tag_find_urls },
 105   { TAG_FIG,     "fig",         tag_find_urls },
 106   { TAG_FORM,    "form",        tag_handle_form },
 107   { TAG_FRAME,   "frame",       tag_find_urls },
 108   { TAG_IFRAME,  "iframe",      tag_find_urls },
 109   { TAG_IMG,     "img",         tag_find_urls },
 110   { TAG_INPUT,   "input",       tag_find_urls },
 111   { TAG_LAYER,   "layer",       tag_find_urls },
 112   { TAG_LINK,    "link",        tag_handle_link },
 113   { TAG_META,    "meta",        tag_handle_meta },
 114   { TAG_OVERLAY, "overlay",     tag_find_urls },
 115   { TAG_SCRIPT,  "script",      tag_find_urls },
 116   { TAG_TABLE,   "table",       tag_find_urls },
 117   { TAG_TD,      "td",          tag_find_urls },
 118   { TAG_TH,      "th",          tag_find_urls }
 119 };
 120
 121 /* tag_url_attributes documents which attributes of which tags contain
 122    URLs to harvest.  It is used by tag_find_urls.  */
 123
 124 /* Defines for the FLAGS field; currently only one flag is defined. */
 125
 126 /* This tag points to an external document not necessary for rendering this
 127    document (i.e. it's not an inlined image, stylesheet, etc.). */
 128 #define TUA_EXTERNAL 1
 129
 130 /* For tags handled by tag_find_urls: attributes that contain URLs to
 131    download. */
 132 static struct {
 133   int tagid;
 134   const char *attr_name;
 135   int flags;
 136 } tag_url_attributes[] = {
 137   { TAG_A,              "href",         TUA_EXTERNAL },
 138   { TAG_APPLET,         "code",         0 },
 139   { TAG_AREA,           "href",         TUA_EXTERNAL },
 140   { TAG_BGSOUND,        "src",          0 },
 141   { TAG_BODY,           "background",   0 },
 142   { TAG_EMBED,          "href",         TUA_EXTERNAL },
 143   { TAG_EMBED,          "src",          0 },
 144   { TAG_FIG,            "src",          0 },
 145   { TAG_FRAME,          "src",          0 },
 146   { TAG_IFRAME,         "src",          0 },
 147   { TAG_IMG,            "href",         0 },
 148   { TAG_IMG,            "lowsrc",       0 },
 149   { TAG_IMG,            "src",          0 },
 150   { TAG_INPUT,          "src",          0 },
 151   { TAG_LAYER,          "src",          0 },
 152   { TAG_OVERLAY,        "src",          0 },
 153   { TAG_SCRIPT,         "src",          0 },
 154   { TAG_TABLE,          "background",   0 },
 155   { TAG_TD,             "background",   0 },
 156   { TAG_TH,             "background",   0 }
 157 };
 158
 159 /* The lists of interesting tags and attributes are built dynamically,
 160    from the information above.  However, some places in the code refer
 161    to the attributes not mentioned here.  We add them manually.  */
 162 static const char *additional_attributes[] = {
 163   "rel",                        /* used by tag_handle_link */
 164   "http-equiv",                 /* used by tag_handle_meta */
 165   "name",                       /* used by tag_handle_meta */
 166   "content",                    /* used by tag_handle_meta */
 167   "action"                      /* used by tag_handle_form */
 168 };
 169
 170 struct hash_table *interesting_tags;
 171 struct hash_table *interesting_attributes;
 172
 173 static void
 174 init_interesting (void)
 175 {
 176   /* Init the variables interesting_tags and interesting_attributes
 177      that are used by the HTML parser to know which tags and
 178      attributes we're interested in.  We initialize this only once,
 179      for performance reasons.
 180
 181      Here we also make sure that what we put in interesting_tags
 182      matches the user's preferences as specified through --ignore-tags
 183      and --follow-tags.  */
 184
 185   int i;
 186   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 187
 188   /* First, add all the tags we know hot to handle, mapped to their
 189      respective entries in known_tags.  */
 190   for (i = 0; i < countof (known_tags); i++)
 191     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 192
 193   /* Then remove the tags ignored through --ignore-tags.  */
 194   if (opt.ignore_tags)
 195     {
 196       char **ignored;
 197       for (ignored = opt.ignore_tags; *ignored; ignored++)
 198         hash_table_remove (interesting_tags, *ignored);
 199     }
 200
 201   /* If --follow-tags is specified, use only those tags.  */
 202   if (opt.follow_tags)
 203     {
 204       /* Create a new table intersecting --follow-tags and known_tags,
 205          and use it as interesting_tags.  */
 206       struct hash_table *intersect = make_nocase_string_hash_table (0);
 207       char **followed;
 208       for (followed = opt.follow_tags; *followed; followed++)
 209         {
 210           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 211           if (!t)
 212             continue;           /* ignore unknown --follow-tags entries. */
 213           hash_table_put (intersect, *followed, t);
 214         }
 215       hash_table_destroy (interesting_tags);
 216       interesting_tags = intersect;
 217     }
 218
 219   /* Add the attributes we care about. */
 220   interesting_attributes = make_nocase_string_hash_table (10);
 221   for (i = 0; i < countof (additional_attributes); i++)
 222     string_set_add (interesting_attributes, additional_attributes[i]);
 223   for (i = 0; i < countof (tag_url_attributes); i++)
 224     string_set_add (interesting_attributes, tag_url_attributes[i].attr_name);
 225 }
 226
 227 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 228    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 229    index of the attribute in TAG will be stored there.  */
 230
 231 static char *
 232 find_attr (struct taginfo *tag, const char *name, int *attrind)
 233 {
 234   int i;
 235   for (i = 0; i < tag->nattrs; i++)
 236     if (!strcasecmp (tag->attrs[i].name, name))
 237       {
 238         if (attrind)
 239           *attrind = i;
 240         return tag->attrs[i].value;
 241       }
 242   return NULL;
 243 }
 244
 245 struct map_context {
 246   char *text;                   /* HTML text. */
 247   char *base;                   /* Base URI of the document, possibly
 248                                    changed through <base href=...>. */
 249   const char *parent_base;      /* Base of the current document. */
 250   const char *document_file;    /* File name of this document. */
 251   int nofollow;                 /* whether NOFOLLOW was specified in a
 252                                    <meta name=robots> tag. */
 253
 254   struct urlpos *head, *tail;   /* List of URLs that is being
 255                                    built. */
 256 };
 257
 258 /* Append LINK_URI to the urlpos structure that is being built.
 259
 260    LINK_URI will be merged with the current document base.  TAG and
 261    ATTRIND are the necessary context to store the position and
 262    size.  */
 263
 264 static struct urlpos *
 265 append_one_url (const char *link_uri, int inlinep,
 266                 struct taginfo *tag, int attrind, struct map_context *ctx)
 267 {
 268   int link_has_scheme = url_has_scheme (link_uri);
 269   struct urlpos *newel;
 270   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 271   struct url *url;
 272
 273   if (!base)
 274     {
 275       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 276                ctx->document_file, link_uri));
 277
 278       if (!link_has_scheme)
 279         {
 280           /* Base URL is unavailable, and the link does not have a
 281              location attached to it -- we have to give up.  Since
 282              this can only happen when using `--force-html -i', print
 283              a warning.  */
 284           logprintf (LOG_NOTQUIET,
 285                      _("%s: Cannot resolve incomplete link %s.\n"),
 286                      ctx->document_file, link_uri);
 287           return NULL;
 288         }
 289
 290       url = url_parse (link_uri, NULL);
 291       if (!url)
 292         {
 293           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 294                    ctx->document_file, link_uri));
 295           return NULL;
 296         }
 297     }
 298   else
 299     {
 300       /* Merge BASE with LINK_URI, but also make sure the result is
 301          canonicalized, i.e. that "../" have been resolved.
 302          (parse_url will do that for us.) */
 303
 304       char *complete_uri = uri_merge (base, link_uri);
 305
 306       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 307                ctx->document_file, base, link_uri, complete_uri));
 308
 309       url = url_parse (complete_uri, NULL);
 310       if (!url)
 311         {
 312           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 313                    ctx->document_file, complete_uri));
 314           xfree (complete_uri);
 315           return NULL;
 316         }
 317       xfree (complete_uri);
 318     }
 319
 320   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 321
 322   newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
 323   memset (newel, 0, sizeof (*newel));
 324
 325   newel->next = NULL;
 326   newel->url = url;
 327   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 328   newel->size = tag->attrs[attrind].value_raw_size;
 329   newel->link_inline_p = inlinep;
 330
 331   /* A URL is relative if the host is not named, and the name does not
 332      start with `/'.  */
 333   if (!link_has_scheme && *link_uri != '/')
 334     newel->link_relative_p = 1;
 335   else if (link_has_scheme)
 336     newel->link_complete_p = 1;
 337
 338   if (ctx->tail)
 339     {
 340       ctx->tail->next = newel;
 341       ctx->tail = newel;
 342     }
 343   else
 344     ctx->tail = ctx->head = newel;
 345
 346   return newel;
 347 }
 348 \f
 349 /* All the tag_* functions are called from collect_tags_mapper, as
 350    specified by KNOWN_TAGS.  */
 351
 352 /* Default tag handler: collect URLs from attributes specified for
 353    this tag by tag_url_attributes.  */
 354
 355 static void
 356 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 357 {
 358   int i, attrind;
 359   int first = -1;
 360
 361   for (i = 0; i < countof (tag_url_attributes); i++)
 362     if (tag_url_attributes[i].tagid == tagid)
 363       {
 364         /* We've found the index of tag_url_attributes where the
 365            attributes of our tag begin.  */
 366         first = i;
 367         break;
 368       }
 369   assert (first != -1);
 370
 371   /* Loop over the "interesting" attributes of this tag.  In this
 372      example, it will loop over "src" and "lowsrc".
 373
 374        <img src="foo.png" lowsrc="bar.png">
 375
 376      This has to be done in the outer loop so that the attributes are
 377      processed in the same order in which they appear in the page.
 378      This is required when converting links.  */
 379
 380   for (attrind = 0; attrind < tag->nattrs; attrind++)
 381     {
 382       /* Find whether TAG/ATTRIND is a combination that contains a
 383          URL. */
 384       char *link = tag->attrs[attrind].value;
 385       const int size = countof (tag_url_attributes);
 386
 387       /* If you're cringing at the inefficiency of the nested loops,
 388          remember that they both iterate over a very small number of
 389          items.  The worst-case inner loop is for the IMG tag, which
 390          has three attributes.  */
 391       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 392         {
 393           if (0 == strcasecmp (tag->attrs[attrind].name,
 394                                tag_url_attributes[i].attr_name))
 395             {
 396               int flags = tag_url_attributes[i].flags;
 397               append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
 398             }
 399         }
 400     }
 401 }
 402
 403 /* Handle the BASE tag, for <base href=...>. */
 404
 405 static void
 406 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 407 {
 408   struct urlpos *base_urlpos;
 409   int attrind;
 410   char *newbase = find_attr (tag, "href", &attrind);
 411   if (!newbase)
 412     return;
 413
 414   base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
 415   if (!base_urlpos)
 416     return;
 417   base_urlpos->ignore_when_downloading = 1;
 418   base_urlpos->link_base_p = 1;
 419
 420   if (ctx->base)
 421     xfree (ctx->base);
 422   if (ctx->parent_base)
 423     ctx->base = uri_merge (ctx->parent_base, newbase);
 424   else
 425     ctx->base = xstrdup (newbase);
 426 }
 427
 428 /* Mark the URL found in <form action=...> for conversion. */
 429
 430 static void
 431 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 432 {
 433   int attrind;
 434   char *action = find_attr (tag, "action", &attrind);
 435   if (action)
 436     {
 437       struct urlpos *action_urlpos = append_one_url (action, 0, tag,
 438                                                      attrind, ctx);
 439       if (action_urlpos)
 440         action_urlpos->ignore_when_downloading = 1;
 441     }
 442 }
 443
 444 /* Handle the LINK tag.  It requires special handling because how its
 445    links will be followed in -p mode depends on the REL attribute.  */
 446
 447 static void
 448 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 449 {
 450   int attrind;
 451   char *href = find_attr (tag, "href", &attrind);
 452
 453   /* All <link href="..."> link references are external, except those
 454      known not to be, such as style sheet and shortcut icon:
 455
 456        <link rel="stylesheet" href="...">
 457        <link rel="shortcut icon" href="...">
 458   */
 459   if (href)
 460     {
 461       char *rel  = find_attr (tag, "rel", NULL);
 462       int inlinep = (rel
 463                      && (0 == strcasecmp (rel, "stylesheet")
 464                          || 0 == strcasecmp (rel, "shortcut icon")));
 465       append_one_url (href, inlinep, tag, attrind, ctx);
 466     }
 467 }
 468
 469 /* Handle the META tag.  This requires special handling because of the
 470    refresh feature and because of robot exclusion.  */
 471
 472 static void
 473 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 474 {
 475   char *name = find_attr (tag, "name", NULL);
 476   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 477
 478   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 479     {
 480       /* Some pages use a META tag to specify that the page be
 481          refreshed by a new page after a given number of seconds.  The
 482          general format for this is:
 483
 484            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 485
 486          So we just need to skip past the "NUMBER; URL=" garbage to
 487          get to the URL.  */
 488
 489       struct urlpos *entry;
 490       int attrind;
 491       int timeout = 0;
 492       char *p;
 493
 494       char *refresh = find_attr (tag, "content", &attrind);
 495       if (!refresh)
 496         return;
 497
 498       for (p = refresh; ISDIGIT (*p); p++)
 499         timeout = 10 * timeout + *p - '0';
 500       if (*p++ != ';')
 501         return;
 502
 503       while (ISSPACE (*p))
 504         ++p;
 505       if (!(   TOUPPER (*p)       == 'U'
 506             && TOUPPER (*(p + 1)) == 'R'
 507             && TOUPPER (*(p + 2)) == 'L'
 508             &&          *(p + 3)  == '='))
 509         return;
 510       p += 4;
 511       while (ISSPACE (*p))
 512         ++p;
 513
 514       entry = append_one_url (p, 0, tag, attrind, ctx);
 515       if (entry)
 516         {
 517           entry->link_refresh_p = 1;
 518           entry->refresh_timeout = timeout;
 519         }
 520     }
 521   else if (name && 0 == strcasecmp (name, "robots"))
 522     {
 523       /* Handle stuff like:
 524          <meta name="robots" content="index,nofollow"> */
 525       char *content = find_attr (tag, "content", NULL);
 526       if (!content)
 527         return;
 528       if (!strcasecmp (content, "none"))
 529         ctx->nofollow = 1;
 530       else
 531         {
 532           while (*content)
 533             {
 534               /* Find the next occurrence of ',' or the end of
 535                  the string.  */
 536               char *end = strchr (content, ',');
 537               if (end)
 538                 ++end;
 539               else
 540                 end = content + strlen (content);
 541               if (!strncasecmp (content, "nofollow", end - content))
 542                 ctx->nofollow = 1;
 543               content = end;
 544             }
 545         }
 546     }
 547 }
 548
 549 /* Dispatch the tag handler appropriate for the tag we're mapping
 550    over.  See known_tags[] for definition of tag handlers.  */
 551
 552 static void
 553 collect_tags_mapper (struct taginfo *tag, void *arg)
 554 {
 555   struct map_context *ctx = (struct map_context *)arg;
 556
 557   /* Find the tag in our table of tags.  This must not fail because
 558      map_html_tags only returns tags found in interesting_tags.  */
 559   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 560   assert (t != NULL);
 561
 562   t->handler (t->tagid, tag, ctx);
 563 }
 564 \f
 565 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 566    it.  It merges relative links in FILE with URL.  It is aware of
 567    <base href=...> and does the right thing.  */
 568
 569 struct urlpos *
 570 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
 571 {
 572   struct file_memory *fm;
 573   struct map_context ctx;
 574   int flags;
 575
 576   /* Load the file. */
 577   fm = read_file (file);
 578   if (!fm)
 579     {
 580       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 581       return NULL;
 582     }
 583   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 584
 585   ctx.text = fm->content;
 586   ctx.head = ctx.tail = NULL;
 587   ctx.base = NULL;
 588   ctx.parent_base = url ? url : opt.base_href;
 589   ctx.document_file = file;
 590   ctx.nofollow = 0;
 591
 592   if (!interesting_tags)
 593     init_interesting ();
 594
 595   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 596      generate <a href=" foo"> instead of <a href="foo"> (Netscape
 597      ignores spaces as well.)  If you really mean space, use &32; or
 598      %20.  */
 599   flags = MHT_TRIM_VALUES;
 600   if (opt.strict_comments)
 601     flags |= MHT_STRICT_COMMENTS;
 602
 603   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 604                  interesting_tags, interesting_attributes);
 605
 606   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 607   if (meta_disallow_follow)
 608     *meta_disallow_follow = ctx.nofollow;
 609
 610   FREE_MAYBE (ctx.base);
 611   read_file_free (fm);
 612   return ctx.head;
 613 }
 614
 615 /* This doesn't really have anything to do with HTML, but it's similar
 616    to get_urls_html, so we put it here.  */
 617
 618 struct urlpos *
 619 get_urls_file (const char *file)
 620 {
 621   struct file_memory *fm;
 622   struct urlpos *head, *tail;
 623   const char *text, *text_end;
 624
 625   /* Load the file.  */
 626   fm = read_file (file);
 627   if (!fm)
 628     {
 629       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 630       return NULL;
 631     }
 632   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 633
 634   head = tail = NULL;
 635   text = fm->content;
 636   text_end = fm->content + fm->length;
 637   while (text < text_end)
 638     {
 639       int up_error_code;
 640       char *url_text;
 641       struct urlpos *entry;
 642       struct url *url;
 643
 644       const char *line_beg = text;
 645       const char *line_end = memchr (text, '\n', text_end - text);
 646       if (!line_end)
 647         line_end = text_end;
 648       else
 649         ++line_end;
 650       text = line_end;
 651
 652       /* Strip whitespace from the beginning and end of line. */
 653       while (line_beg < line_end && ISSPACE (*line_beg))
 654         ++line_beg;
 655       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
 656         --line_end;
 657
 658       if (line_beg == line_end)
 659         continue;
 660
 661       /* The URL is in the [line_beg, line_end) region. */
 662
 663       /* We must copy the URL to a zero-terminated string, and we
 664          can't use alloca because we're in a loop.  *sigh*.  */
 665       url_text = strdupdelim (line_beg, line_end);
 666
 667       if (opt.base_href)
 668         {
 669           /* Merge opt.base_href with URL. */
 670           char *merged = uri_merge (opt.base_href, url_text);
 671           xfree (url_text);
 672           url_text = merged;
 673         }
 674
 675       url = url_parse (url_text, &up_error_code);
 676       if (!url)
 677         {
 678           logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
 679                      file, url_text, url_error (up_error_code));
 680           xfree (url_text);
 681           continue;
 682         }
 683       xfree (url_text);
 684
 685       entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
 686       memset (entry, 0, sizeof (*entry));
 687       entry->next = NULL;
 688       entry->url = url;
 689
 690       if (!head)
 691         head = entry;
 692       else
 693         tail->next = entry;
 694       tail = entry;
 695     }
 696   read_file_free (fm);
 697   return head;
 698 }
 699
 700 void
 701 cleanup_html_url (void)
 702 {
 703   FREE_MAYBE (interesting_tags);
 704   FREE_MAYBE (interesting_attributes);
 705 }