sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9  (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #ifdef HAVE_STRING_H
  34 # include <string.h>
  35 #else
  36 # include <strings.h>
  37 #endif
  38 #include <stdlib.h>
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "wget.h"
  43 #include "html-parse.h"
  44 #include "url.h"
  45 #include "utils.h"
  46 #include "hash.h"
  47 #include "convert.h"
  48
  49 #ifndef errno
  50 extern int errno;
  51 #endif
  52
  53 struct map_context;
  54
  55 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
  56                                        struct map_context *));
  57
  58 #define DECLARE_TAG_HANDLER(fun)                                        \
  59   static void fun PARAMS ((int, struct taginfo *, struct map_context *))
  60
  61 DECLARE_TAG_HANDLER (tag_find_urls);
  62 DECLARE_TAG_HANDLER (tag_handle_base);
  63 DECLARE_TAG_HANDLER (tag_handle_form);
  64 DECLARE_TAG_HANDLER (tag_handle_link);
  65 DECLARE_TAG_HANDLER (tag_handle_meta);
  66
  67 enum {
  68   TAG_A,
  69   TAG_APPLET,
  70   TAG_AREA,
  71   TAG_BASE,
  72   TAG_BGSOUND,
  73   TAG_BODY,
  74   TAG_EMBED,
  75   TAG_FIG,
  76   TAG_FORM,
  77   TAG_FRAME,
  78   TAG_IFRAME,
  79   TAG_IMG,
  80   TAG_INPUT,
  81   TAG_LAYER,
  82   TAG_LINK,
  83   TAG_META,
  84   TAG_OVERLAY,
  85   TAG_SCRIPT,
  86   TAG_TABLE,
  87   TAG_TD,
  88   TAG_TH
  89 };
  90
  91 /* The list of known tags and functions used for handling them.  Most
  92    tags are simply harvested for URLs. */
  93 static struct known_tag {
  94   int tagid;
  95   const char *name;
  96   tag_handler_t handler;
  97 } known_tags[] = {
  98   { TAG_A,       "a",           tag_find_urls },
  99   { TAG_APPLET,  "applet",      tag_find_urls },
 100   { TAG_AREA,    "area",        tag_find_urls },
 101   { TAG_BASE,    "base",        tag_handle_base },
 102   { TAG_BGSOUND, "bgsound",     tag_find_urls },
 103   { TAG_BODY,    "body",        tag_find_urls },
 104   { TAG_EMBED,   "embed",       tag_find_urls },
 105   { TAG_FIG,     "fig",         tag_find_urls },
 106   { TAG_FORM,    "form",        tag_handle_form },
 107   { TAG_FRAME,   "frame",       tag_find_urls },
 108   { TAG_IFRAME,  "iframe",      tag_find_urls },
 109   { TAG_IMG,     "img",         tag_find_urls },
 110   { TAG_INPUT,   "input",       tag_find_urls },
 111   { TAG_LAYER,   "layer",       tag_find_urls },
 112   { TAG_LINK,    "link",        tag_handle_link },
 113   { TAG_META,    "meta",        tag_handle_meta },
 114   { TAG_OVERLAY, "overlay",     tag_find_urls },
 115   { TAG_SCRIPT,  "script",      tag_find_urls },
 116   { TAG_TABLE,   "table",       tag_find_urls },
 117   { TAG_TD,      "td",          tag_find_urls },
 118   { TAG_TH,      "th",          tag_find_urls }
 119 };
 120
 121 /* tag_url_attributes documents which attributes of which tags contain
 122    URLs to harvest.  It is used by tag_find_urls.  */
 123
 124 /* Defines for the FLAGS. */
 125
 126 /* The link is "inline", i.e. needs to be retrieved for this document
 127    to be correctly rendered.  Inline links include inlined images,
 128    stylesheets, children frames, etc.  */
 129 #define ATTR_INLINE     1
 130
 131 /* The link is expected to yield HTML contents.  It's important not to
 132    try to follow HTML obtained by following e.g. <img src="...">
 133    regardless of content-type.  Doing this causes infinite loops for
 134    "images" that return non-404 error pages with links to the same
 135    image.  */
 136 #define ATTR_HTML       2
 137
 138 /* For tags handled by tag_find_urls: attributes that contain URLs to
 139    download. */
 140 static struct {
 141   int tagid;
 142   const char *attr_name;
 143   int flags;
 144 } tag_url_attributes[] = {
 145   { TAG_A,              "href",         ATTR_HTML },
 146   { TAG_APPLET,         "code",         ATTR_INLINE },
 147   { TAG_AREA,           "href",         ATTR_HTML },
 148   { TAG_BGSOUND,        "src",          ATTR_INLINE },
 149   { TAG_BODY,           "background",   ATTR_INLINE },
 150   { TAG_EMBED,          "href",         ATTR_HTML },
 151   { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
 152   { TAG_FIG,            "src",          ATTR_INLINE },
 153   { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
 154   { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
 155   { TAG_IMG,            "href",         ATTR_INLINE },
 156   { TAG_IMG,            "lowsrc",       ATTR_INLINE },
 157   { TAG_IMG,            "src",          ATTR_INLINE },
 158   { TAG_INPUT,          "src",          ATTR_INLINE },
 159   { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
 160   { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
 161   { TAG_SCRIPT,         "src",          ATTR_INLINE },
 162   { TAG_TABLE,          "background",   ATTR_INLINE },
 163   { TAG_TD,             "background",   ATTR_INLINE },
 164   { TAG_TH,             "background",   ATTR_INLINE }
 165 };
 166
 167 /* The lists of interesting tags and attributes are built dynamically,
 168    from the information above.  However, some places in the code refer
 169    to the attributes not mentioned here.  We add them manually.  */
 170 static const char *additional_attributes[] = {
 171   "rel",                        /* used by tag_handle_link */
 172   "http-equiv",                 /* used by tag_handle_meta */
 173   "name",                       /* used by tag_handle_meta */
 174   "content",                    /* used by tag_handle_meta */
 175   "action"                      /* used by tag_handle_form */
 176 };
 177
 178 struct hash_table *interesting_tags;
 179 struct hash_table *interesting_attributes;
 180
 181 static void
 182 init_interesting (void)
 183 {
 184   /* Init the variables interesting_tags and interesting_attributes
 185      that are used by the HTML parser to know which tags and
 186      attributes we're interested in.  We initialize this only once,
 187      for performance reasons.
 188
 189      Here we also make sure that what we put in interesting_tags
 190      matches the user's preferences as specified through --ignore-tags
 191      and --follow-tags.  */
 192
 193   int i;
 194   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 195
 196   /* First, add all the tags we know hot to handle, mapped to their
 197      respective entries in known_tags.  */
 198   for (i = 0; i < countof (known_tags); i++)
 199     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 200
 201   /* Then remove the tags ignored through --ignore-tags.  */
 202   if (opt.ignore_tags)
 203     {
 204       char **ignored;
 205       for (ignored = opt.ignore_tags; *ignored; ignored++)
 206         hash_table_remove (interesting_tags, *ignored);
 207     }
 208
 209   /* If --follow-tags is specified, use only those tags.  */
 210   if (opt.follow_tags)
 211     {
 212       /* Create a new table intersecting --follow-tags and known_tags,
 213          and use it as interesting_tags.  */
 214       struct hash_table *intersect = make_nocase_string_hash_table (0);
 215       char **followed;
 216       for (followed = opt.follow_tags; *followed; followed++)
 217         {
 218           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 219           if (!t)
 220             continue;           /* ignore unknown --follow-tags entries. */
 221           hash_table_put (intersect, *followed, t);
 222         }
 223       hash_table_destroy (interesting_tags);
 224       interesting_tags = intersect;
 225     }
 226
 227   /* Add the attributes we care about. */
 228   interesting_attributes = make_nocase_string_hash_table (10);
 229   for (i = 0; i < countof (additional_attributes); i++)
 230     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 231   for (i = 0; i < countof (tag_url_attributes); i++)
 232     hash_table_put (interesting_attributes,
 233                     tag_url_attributes[i].attr_name, "1");
 234 }
 235
 236 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 237    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 238    index of the attribute in TAG will be stored there.  */
 239
 240 static char *
 241 find_attr (struct taginfo *tag, const char *name, int *attrind)
 242 {
 243   int i;
 244   for (i = 0; i < tag->nattrs; i++)
 245     if (!strcasecmp (tag->attrs[i].name, name))
 246       {
 247         if (attrind)
 248           *attrind = i;
 249         return tag->attrs[i].value;
 250       }
 251   return NULL;
 252 }
 253
 254 struct map_context {
 255   char *text;                   /* HTML text. */
 256   char *base;                   /* Base URI of the document, possibly
 257                                    changed through <base href=...>. */
 258   const char *parent_base;      /* Base of the current document. */
 259   const char *document_file;    /* File name of this document. */
 260   int nofollow;                 /* whether NOFOLLOW was specified in a
 261                                    <meta name=robots> tag. */
 262
 263   struct urlpos *head, *tail;   /* List of URLs that is being
 264                                    built. */
 265 };
 266
 267 /* Append LINK_URI to the urlpos structure that is being built.
 268
 269    LINK_URI will be merged with the current document base.  TAG and
 270    ATTRIND are the necessary context to store the position and
 271    size.  */
 272
 273 static struct urlpos *
 274 append_url (const char *link_uri,
 275             struct taginfo *tag, int attrind, struct map_context *ctx)
 276 {
 277   int link_has_scheme = url_has_scheme (link_uri);
 278   struct urlpos *newel;
 279   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 280   struct url *url;
 281
 282   if (!base)
 283     {
 284       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 285                ctx->document_file, link_uri));
 286
 287       if (!link_has_scheme)
 288         {
 289           /* Base URL is unavailable, and the link does not have a
 290              location attached to it -- we have to give up.  Since
 291              this can only happen when using `--force-html -i', print
 292              a warning.  */
 293           logprintf (LOG_NOTQUIET,
 294                      _("%s: Cannot resolve incomplete link %s.\n"),
 295                      ctx->document_file, link_uri);
 296           return NULL;
 297         }
 298
 299       url = url_parse (link_uri, NULL);
 300       if (!url)
 301         {
 302           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 303                    ctx->document_file, link_uri));
 304           return NULL;
 305         }
 306     }
 307   else
 308     {
 309       /* Merge BASE with LINK_URI, but also make sure the result is
 310          canonicalized, i.e. that "../" have been resolved.
 311          (parse_url will do that for us.) */
 312
 313       char *complete_uri = uri_merge (base, link_uri);
 314
 315       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 316                ctx->document_file, base, link_uri, complete_uri));
 317
 318       url = url_parse (complete_uri, NULL);
 319       if (!url)
 320         {
 321           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 322                    ctx->document_file, complete_uri));
 323           xfree (complete_uri);
 324           return NULL;
 325         }
 326       xfree (complete_uri);
 327     }
 328
 329   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 330
 331   newel = xnew0 (struct urlpos);
 332   newel->url = url;
 333   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 334   newel->size = tag->attrs[attrind].value_raw_size;
 335
 336   /* A URL is relative if the host is not named, and the name does not
 337      start with `/'.  */
 338   if (!link_has_scheme && *link_uri != '/')
 339     newel->link_relative_p = 1;
 340   else if (link_has_scheme)
 341     newel->link_complete_p = 1;
 342
 343   if (ctx->tail)
 344     {
 345       ctx->tail->next = newel;
 346       ctx->tail = newel;
 347     }
 348   else
 349     ctx->tail = ctx->head = newel;
 350
 351   return newel;
 352 }
 353 \f
 354 /* All the tag_* functions are called from collect_tags_mapper, as
 355    specified by KNOWN_TAGS.  */
 356
 357 /* Default tag handler: collect URLs from attributes specified for
 358    this tag by tag_url_attributes.  */
 359
 360 static void
 361 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 362 {
 363   int i, attrind;
 364   int first = -1;
 365
 366   for (i = 0; i < countof (tag_url_attributes); i++)
 367     if (tag_url_attributes[i].tagid == tagid)
 368       {
 369         /* We've found the index of tag_url_attributes where the
 370            attributes of our tag begin.  */
 371         first = i;
 372         break;
 373       }
 374   assert (first != -1);
 375
 376   /* Loop over the "interesting" attributes of this tag.  In this
 377      example, it will loop over "src" and "lowsrc".
 378
 379        <img src="foo.png" lowsrc="bar.png">
 380
 381      This has to be done in the outer loop so that the attributes are
 382      processed in the same order in which they appear in the page.
 383      This is required when converting links.  */
 384
 385   for (attrind = 0; attrind < tag->nattrs; attrind++)
 386     {
 387       /* Find whether TAG/ATTRIND is a combination that contains a
 388          URL. */
 389       char *link = tag->attrs[attrind].value;
 390       const int size = countof (tag_url_attributes);
 391
 392       /* If you're cringing at the inefficiency of the nested loops,
 393          remember that they both iterate over a very small number of
 394          items.  The worst-case inner loop is for the IMG tag, which
 395          has three attributes.  */
 396       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 397         {
 398           if (0 == strcasecmp (tag->attrs[attrind].name,
 399                                tag_url_attributes[i].attr_name))
 400             {
 401               struct urlpos *up = append_url (link, tag, attrind, ctx);
 402               if (up)
 403                 {
 404                   int flags = tag_url_attributes[i].flags;
 405                   if (flags & ATTR_INLINE)
 406                     up->link_inline_p = 1;
 407                   if (flags & ATTR_HTML)
 408                     up->link_expect_html = 1;
 409                 }
 410             }
 411         }
 412     }
 413 }
 414
 415 /* Handle the BASE tag, for <base href=...>. */
 416
 417 static void
 418 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 419 {
 420   struct urlpos *base_urlpos;
 421   int attrind;
 422   char *newbase = find_attr (tag, "href", &attrind);
 423   if (!newbase)
 424     return;
 425
 426   base_urlpos = append_url (newbase, tag, attrind, ctx);
 427   if (!base_urlpos)
 428     return;
 429   base_urlpos->ignore_when_downloading = 1;
 430   base_urlpos->link_base_p = 1;
 431
 432   if (ctx->base)
 433     xfree (ctx->base);
 434   if (ctx->parent_base)
 435     ctx->base = uri_merge (ctx->parent_base, newbase);
 436   else
 437     ctx->base = xstrdup (newbase);
 438 }
 439
 440 /* Mark the URL found in <form action=...> for conversion. */
 441
 442 static void
 443 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 444 {
 445   int attrind;
 446   char *action = find_attr (tag, "action", &attrind);
 447   if (action)
 448     {
 449       struct urlpos *up = append_url (action, tag, attrind, ctx);
 450       if (up)
 451         up->ignore_when_downloading = 1;
 452     }
 453 }
 454
 455 /* Handle the LINK tag.  It requires special handling because how its
 456    links will be followed in -p mode depends on the REL attribute.  */
 457
 458 static void
 459 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 460 {
 461   int attrind;
 462   char *href = find_attr (tag, "href", &attrind);
 463
 464   /* All <link href="..."> link references are external, except those
 465      known not to be, such as style sheet and shortcut icon:
 466
 467        <link rel="stylesheet" href="...">
 468        <link rel="shortcut icon" href="...">
 469   */
 470   if (href)
 471     {
 472       struct urlpos *up = append_url (href, tag, attrind, ctx);
 473       if (up)
 474         {
 475           char *rel = find_attr (tag, "rel", NULL);
 476           if (rel
 477               && (0 == strcasecmp (rel, "stylesheet")
 478                   || 0 == strcasecmp (rel, "shortcut icon")))
 479             up->link_inline_p = 1;
 480         }
 481     }
 482 }
 483
 484 /* Handle the META tag.  This requires special handling because of the
 485    refresh feature and because of robot exclusion.  */
 486
 487 static void
 488 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 489 {
 490   char *name = find_attr (tag, "name", NULL);
 491   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 492
 493   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 494     {
 495       /* Some pages use a META tag to specify that the page be
 496          refreshed by a new page after a given number of seconds.  The
 497          general format for this is:
 498
 499            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 500
 501          So we just need to skip past the "NUMBER; URL=" garbage to
 502          get to the URL.  */
 503
 504       struct urlpos *entry;
 505       int attrind;
 506       int timeout = 0;
 507       char *p;
 508
 509       char *refresh = find_attr (tag, "content", &attrind);
 510       if (!refresh)
 511         return;
 512
 513       for (p = refresh; ISDIGIT (*p); p++)
 514         timeout = 10 * timeout + *p - '0';
 515       if (*p++ != ';')
 516         return;
 517
 518       while (ISSPACE (*p))
 519         ++p;
 520       if (!(   TOUPPER (*p)       == 'U'
 521             && TOUPPER (*(p + 1)) == 'R'
 522             && TOUPPER (*(p + 2)) == 'L'
 523             &&          *(p + 3)  == '='))
 524         return;
 525       p += 4;
 526       while (ISSPACE (*p))
 527         ++p;
 528
 529       entry = append_url (p, tag, attrind, ctx);
 530       if (entry)
 531         {
 532           entry->link_refresh_p = 1;
 533           entry->refresh_timeout = timeout;
 534           entry->link_expect_html = 1;
 535         }
 536     }
 537   else if (name && 0 == strcasecmp (name, "robots"))
 538     {
 539       /* Handle stuff like:
 540          <meta name="robots" content="index,nofollow"> */
 541       char *content = find_attr (tag, "content", NULL);
 542       if (!content)
 543         return;
 544       if (!strcasecmp (content, "none"))
 545         ctx->nofollow = 1;
 546       else
 547         {
 548           while (*content)
 549             {
 550               /* Find the next occurrence of ',' or the end of
 551                  the string.  */
 552               char *end = strchr (content, ',');
 553               if (end)
 554                 ++end;
 555               else
 556                 end = content + strlen (content);
 557               if (!strncasecmp (content, "nofollow", end - content))
 558                 ctx->nofollow = 1;
 559               content = end;
 560             }
 561         }
 562     }
 563 }
 564
 565 /* Dispatch the tag handler appropriate for the tag we're mapping
 566    over.  See known_tags[] for definition of tag handlers.  */
 567
 568 static void
 569 collect_tags_mapper (struct taginfo *tag, void *arg)
 570 {
 571   struct map_context *ctx = (struct map_context *)arg;
 572
 573   /* Find the tag in our table of tags.  This must not fail because
 574      map_html_tags only returns tags found in interesting_tags.  */
 575   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 576   assert (t != NULL);
 577
 578   t->handler (t->tagid, tag, ctx);
 579 }
 580 \f
 581 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 582    it.  It merges relative links in FILE with URL.  It is aware of
 583    <base href=...> and does the right thing.  */
 584
 585 struct urlpos *
 586 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
 587 {
 588   struct file_memory *fm;
 589   struct map_context ctx;
 590   int flags;
 591
 592   /* Load the file. */
 593   fm = read_file (file);
 594   if (!fm)
 595     {
 596       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 597       return NULL;
 598     }
 599   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 600
 601   ctx.text = fm->content;
 602   ctx.head = ctx.tail = NULL;
 603   ctx.base = NULL;
 604   ctx.parent_base = url ? url : opt.base_href;
 605   ctx.document_file = file;
 606   ctx.nofollow = 0;
 607
 608   if (!interesting_tags)
 609     init_interesting ();
 610
 611   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 612      generate <a href=" foo"> instead of <a href="foo"> (Netscape
 613      ignores spaces as well.)  If you really mean space, use &32; or
 614      %20.  */
 615   flags = MHT_TRIM_VALUES;
 616   if (opt.strict_comments)
 617     flags |= MHT_STRICT_COMMENTS;
 618
 619   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 620                  interesting_tags, interesting_attributes);
 621
 622   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 623   if (meta_disallow_follow)
 624     *meta_disallow_follow = ctx.nofollow;
 625
 626   xfree_null (ctx.base);
 627   read_file_free (fm);
 628   return ctx.head;
 629 }
 630
 631 /* This doesn't really have anything to do with HTML, but it's similar
 632    to get_urls_html, so we put it here.  */
 633
 634 struct urlpos *
 635 get_urls_file (const char *file)
 636 {
 637   struct file_memory *fm;
 638   struct urlpos *head, *tail;
 639   const char *text, *text_end;
 640
 641   /* Load the file.  */
 642   fm = read_file (file);
 643   if (!fm)
 644     {
 645       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 646       return NULL;
 647     }
 648   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 649
 650   head = tail = NULL;
 651   text = fm->content;
 652   text_end = fm->content + fm->length;
 653   while (text < text_end)
 654     {
 655       int up_error_code;
 656       char *url_text;
 657       struct urlpos *entry;
 658       struct url *url;
 659
 660       const char *line_beg = text;
 661       const char *line_end = memchr (text, '\n', text_end - text);
 662       if (!line_end)
 663         line_end = text_end;
 664       else
 665         ++line_end;
 666       text = line_end;
 667
 668       /* Strip whitespace from the beginning and end of line. */
 669       while (line_beg < line_end && ISSPACE (*line_beg))
 670         ++line_beg;
 671       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
 672         --line_end;
 673
 674       if (line_beg == line_end)
 675         continue;
 676
 677       /* The URL is in the [line_beg, line_end) region. */
 678
 679       /* We must copy the URL to a zero-terminated string, and we
 680          can't use alloca because we're in a loop.  *sigh*.  */
 681       url_text = strdupdelim (line_beg, line_end);
 682
 683       if (opt.base_href)
 684         {
 685           /* Merge opt.base_href with URL. */
 686           char *merged = uri_merge (opt.base_href, url_text);
 687           xfree (url_text);
 688           url_text = merged;
 689         }
 690
 691       url = url_parse (url_text, &up_error_code);
 692       if (!url)
 693         {
 694           logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
 695                      file, url_text, url_error (up_error_code));
 696           xfree (url_text);
 697           continue;
 698         }
 699       xfree (url_text);
 700
 701       entry = xnew0 (struct urlpos);
 702       entry->next = NULL;
 703       entry->url = url;
 704
 705       if (!head)
 706         head = entry;
 707       else
 708         tail->next = entry;
 709       tail = entry;
 710     }
 711   read_file_free (fm);
 712   return head;
 713 }
 714
 715 void
 716 cleanup_html_url (void)
 717 {
 718   /* Destroy the hash tables.  The hash table keys and values are not
 719      allocated by this code, so we don't need to free them here.  */
 720   if (interesting_tags)
 721     hash_table_destroy (interesting_tags);
 722   if (interesting_attributes)
 723     hash_table_destroy (interesting_attributes);
 724 }