sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9  (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #ifdef HAVE_STRING_H
  34 # include <string.h>
  35 #else
  36 # include <strings.h>
  37 #endif
  38 #include <stdlib.h>
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "wget.h"
  43 #include "html-parse.h"
  44 #include "url.h"
  45 #include "utils.h"
  46 #include "hash.h"
  47 #include "convert.h"
  48
  49 #ifndef errno
  50 extern int errno;
  51 #endif
  52
/* Forward declaration: the handler prototypes below only need a
   pointer to the context; the full definition follows later in this
   file.  */
struct map_context;

/* Pointer to a tag handler.  A handler receives the numeric tag id
   stored in known_tags, the parsed tag, and the context in which the
   URL list is being accumulated.  */
typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
                                       struct map_context *));

/* Declare FUN as a file-local function with the tag handler
   signature.  */
#define DECLARE_TAG_HANDLER(fun)                                        \
  static void fun PARAMS ((int, struct taginfo *, struct map_context *))

/* Prototypes of the handlers referenced from the known_tags table,
   which is initialized before the handlers are defined.  */
DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
  66
/* Numeric ids of the HTML tags this file knows about.  The ids are
   stored in known_tags and are used as the lookup key into
   tag_url_attributes.  */
enum {
  TAG_A,
  TAG_APPLET,
  TAG_AREA,
  TAG_BASE,
  TAG_BGSOUND,
  TAG_BODY,
  TAG_EMBED,
  TAG_FIG,
  TAG_FORM,
  TAG_FRAME,
  TAG_IFRAME,
  TAG_IMG,
  TAG_INPUT,
  TAG_LAYER,
  TAG_LINK,
  TAG_META,
  TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE,
  TAG_TD,
  TAG_TH
};
  90
/* The list of known tags and functions used for handling them.  Most
   tags are simply harvested for URLs by tag_find_urls; BASE, FORM,
   LINK and META have dedicated handlers.

   init_interesting enters every element into interesting_tags under
   its NAME, and collect_tags_mapper later calls HANDLER with TAGID
   for each matching tag found in the document.  */
static struct known_tag {
  int tagid;                    /* one of the TAG_* constants */
  const char *name;             /* tag name, used as the hash key */
  tag_handler_t handler;        /* function called for this tag */
} known_tags[] = {
  { TAG_A,       "a",           tag_find_urls },
  { TAG_APPLET,  "applet",      tag_find_urls },
  { TAG_AREA,    "area",        tag_find_urls },
  { TAG_BASE,    "base",        tag_handle_base },
  { TAG_BGSOUND, "bgsound",     tag_find_urls },
  { TAG_BODY,    "body",        tag_find_urls },
  { TAG_EMBED,   "embed",       tag_find_urls },
  { TAG_FIG,     "fig",         tag_find_urls },
  { TAG_FORM,    "form",        tag_handle_form },
  { TAG_FRAME,   "frame",       tag_find_urls },
  { TAG_IFRAME,  "iframe",      tag_find_urls },
  { TAG_IMG,     "img",         tag_find_urls },
  { TAG_INPUT,   "input",       tag_find_urls },
  { TAG_LAYER,   "layer",       tag_find_urls },
  { TAG_LINK,    "link",        tag_handle_link },
  { TAG_META,    "meta",        tag_handle_meta },
  { TAG_OVERLAY, "overlay",     tag_find_urls },
  { TAG_SCRIPT,  "script",      tag_find_urls },
  { TAG_TABLE,   "table",       tag_find_urls },
  { TAG_TD,      "td",          tag_find_urls },
  { TAG_TH,      "th",          tag_find_urls }
};
 120
/* tag_url_attributes documents which attributes of which tags contain
   URLs to harvest.  It is used by tag_find_urls.  */

/* Defines for the FLAGS member of tag_url_attributes.  They are bit
   values and may be OR-ed together.  */

/* The link is "inline", i.e. needs to be retrieved for this document
   to be correctly rendered.  Inline links include inlined images,
   stylesheets, children frames, etc.  tag_find_urls translates this
   flag into the link_inline_p member of the resulting urlpos.  */
#define ATTR_INLINE     1

/* The link is expected to yield HTML contents.  It's important not to
   try to follow HTML obtained by following e.g. <img src="...">
   regardless of content-type.  Doing this causes infinite loops for
   "images" that return non-404 error pages with links to the same
   image.  tag_find_urls translates this flag into the
   link_expect_html member of the resulting urlpos.  */
#define ATTR_HTML       2
 137
/* For tags handled by tag_find_urls: attributes that contain URLs to
   download.

   The rows that belong to one tag must be adjacent: tag_find_urls
   locates the first row with a matching TAGID and then scans forward
   only while TAGID keeps matching.

   BASE, FORM, LINK and META have no rows here because they are
   processed by their own handlers (see known_tags).  */
static struct {
  int tagid;                    /* one of the TAG_* constants */
  const char *attr_name;        /* attribute whose value is a URL */
  int flags;                    /* ATTR_INLINE and/or ATTR_HTML */
} tag_url_attributes[] = {
  { TAG_A,              "href",         ATTR_HTML },
  { TAG_APPLET,         "code",         ATTR_INLINE },
  { TAG_AREA,           "href",         ATTR_HTML },
  { TAG_BGSOUND,        "src",          ATTR_INLINE },
  { TAG_BODY,           "background",   ATTR_INLINE },
  { TAG_EMBED,          "href",         ATTR_HTML },
  { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_FIG,            "src",          ATTR_INLINE },
  { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IMG,            "href",         ATTR_INLINE },
  { TAG_IMG,            "lowsrc",       ATTR_INLINE },
  { TAG_IMG,            "src",          ATTR_INLINE },
  { TAG_INPUT,          "src",          ATTR_INLINE },
  { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_SCRIPT,         "src",          ATTR_INLINE },
  { TAG_TABLE,          "background",   ATTR_INLINE },
  { TAG_TD,             "background",   ATTR_INLINE },
  { TAG_TH,             "background",   ATTR_INLINE }
};
 166
/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, some places in the code refer
   to the attributes not mentioned here.  We add them manually.

   "href", which tag_handle_base and tag_handle_link look up, does not
   need to be listed: init_interesting also adds every attribute name
   found in tag_url_attributes, and "href" is among them.  */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link */
  "http-equiv",                 /* used by tag_handle_meta */
  "name",                       /* used by tag_handle_meta */
  "content",                    /* used by tag_handle_meta */
  "action"                      /* used by tag_handle_form */
};
 177
/* Lookup tables built once by init_interesting and handed to
   map_html_tags by get_urls_html.  interesting_tags maps a tag name
   to its known_tags entry; interesting_attributes is a set of
   attribute names.  Both stay NULL until the first call to
   get_urls_html.  */
struct hash_table *interesting_tags;
struct hash_table *interesting_attributes;
 180
 181 static void
 182 init_interesting (void)
 183 {
 184   /* Init the variables interesting_tags and interesting_attributes
 185      that are used by the HTML parser to know which tags and
 186      attributes we're interested in.  We initialize this only once,
 187      for performance reasons.
 188
 189      Here we also make sure that what we put in interesting_tags
 190      matches the user's preferences as specified through --ignore-tags
 191      and --follow-tags.  */
 192
 193   int i;
 194   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 195
 196   /* First, add all the tags we know hot to handle, mapped to their
 197      respective entries in known_tags.  */
 198   for (i = 0; i < countof (known_tags); i++)
 199     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 200
 201   /* Then remove the tags ignored through --ignore-tags.  */
 202   if (opt.ignore_tags)
 203     {
 204       char **ignored;
 205       for (ignored = opt.ignore_tags; *ignored; ignored++)
 206         hash_table_remove (interesting_tags, *ignored);
 207     }
 208
 209   /* If --follow-tags is specified, use only those tags.  */
 210   if (opt.follow_tags)
 211     {
 212       /* Create a new table intersecting --follow-tags and known_tags,
 213          and use it as interesting_tags.  */
 214       struct hash_table *intersect = make_nocase_string_hash_table (0);
 215       char **followed;
 216       for (followed = opt.follow_tags; *followed; followed++)
 217         {
 218           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 219           if (!t)
 220             continue;           /* ignore unknown --follow-tags entries. */
 221           hash_table_put (intersect, *followed, t);
 222         }
 223       hash_table_destroy (interesting_tags);
 224       interesting_tags = intersect;
 225     }
 226
 227   /* Add the attributes we care about. */
 228   interesting_attributes = make_nocase_string_hash_table (10);
 229   for (i = 0; i < countof (additional_attributes); i++)
 230     string_set_add (interesting_attributes, additional_attributes[i]);
 231   for (i = 0; i < countof (tag_url_attributes); i++)
 232     string_set_add (interesting_attributes, tag_url_attributes[i].attr_name);
 233 }
 234
 235 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 236    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 237    index of the attribute in TAG will be stored there.  */
 238
 239 static char *
 240 find_attr (struct taginfo *tag, const char *name, int *attrind)
 241 {
 242   int i;
 243   for (i = 0; i < tag->nattrs; i++)
 244     if (!strcasecmp (tag->attrs[i].name, name))
 245       {
 246         if (attrind)
 247           *attrind = i;
 248         return tag->attrs[i].value;
 249       }
 250   return NULL;
 251 }
 252
/* State shared by the tag handlers while one document is being
   scanned.  get_urls_html sets it up and passes it to map_html_tags,
   which hands it back to collect_tags_mapper for every tag.  */
struct map_context {
  char *text;                   /* HTML text.  URL positions recorded
                                   by append_url are offsets from
                                   here. */
  char *base;                   /* Base URI of the document, possibly
                                   changed through <base href=...>.
                                   NULL until a BASE tag is seen;
                                   allocated, and released by
                                   get_urls_html. */
  const char *parent_base;      /* Base of the current document.  Used
                                   by append_url when BASE is NULL. */
  const char *document_file;    /* File name of this document. */
  int nofollow;                 /* whether NOFOLLOW was specified in a
                                   <meta name=robots> tag. */

  struct urlpos *head, *tail;   /* List of URLs that is being
                                   built. */
};
 265
 266 /* Append LINK_URI to the urlpos structure that is being built.
 267
 268    LINK_URI will be merged with the current document base.  TAG and
 269    ATTRIND are the necessary context to store the position and
 270    size.  */
 271
 272 static struct urlpos *
 273 append_url (const char *link_uri,
 274             struct taginfo *tag, int attrind, struct map_context *ctx)
 275 {
 276   int link_has_scheme = url_has_scheme (link_uri);
 277   struct urlpos *newel;
 278   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 279   struct url *url;
 280
 281   if (!base)
 282     {
 283       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 284                ctx->document_file, link_uri));
 285
 286       if (!link_has_scheme)
 287         {
 288           /* Base URL is unavailable, and the link does not have a
 289              location attached to it -- we have to give up.  Since
 290              this can only happen when using `--force-html -i', print
 291              a warning.  */
 292           logprintf (LOG_NOTQUIET,
 293                      _("%s: Cannot resolve incomplete link %s.\n"),
 294                      ctx->document_file, link_uri);
 295           return NULL;
 296         }
 297
 298       url = url_parse (link_uri, NULL);
 299       if (!url)
 300         {
 301           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 302                    ctx->document_file, link_uri));
 303           return NULL;
 304         }
 305     }
 306   else
 307     {
 308       /* Merge BASE with LINK_URI, but also make sure the result is
 309          canonicalized, i.e. that "../" have been resolved.
 310          (parse_url will do that for us.) */
 311
 312       char *complete_uri = uri_merge (base, link_uri);
 313
 314       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 315                ctx->document_file, base, link_uri, complete_uri));
 316
 317       url = url_parse (complete_uri, NULL);
 318       if (!url)
 319         {
 320           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 321                    ctx->document_file, complete_uri));
 322           xfree (complete_uri);
 323           return NULL;
 324         }
 325       xfree (complete_uri);
 326     }
 327
 328   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 329
 330   newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
 331   memset (newel, 0, sizeof (*newel));
 332
 333   newel->next = NULL;
 334   newel->url = url;
 335   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 336   newel->size = tag->attrs[attrind].value_raw_size;
 337
 338   /* A URL is relative if the host is not named, and the name does not
 339      start with `/'.  */
 340   if (!link_has_scheme && *link_uri != '/')
 341     newel->link_relative_p = 1;
 342   else if (link_has_scheme)
 343     newel->link_complete_p = 1;
 344
 345   if (ctx->tail)
 346     {
 347       ctx->tail->next = newel;
 348       ctx->tail = newel;
 349     }
 350   else
 351     ctx->tail = ctx->head = newel;
 352
 353   return newel;
 354 }
 355 \f
 356 /* All the tag_* functions are called from collect_tags_mapper, as
 357    specified by KNOWN_TAGS.  */
 358
 359 /* Default tag handler: collect URLs from attributes specified for
 360    this tag by tag_url_attributes.  */
 361
 362 static void
 363 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 364 {
 365   int i, attrind;
 366   int first = -1;
 367
 368   for (i = 0; i < countof (tag_url_attributes); i++)
 369     if (tag_url_attributes[i].tagid == tagid)
 370       {
 371         /* We've found the index of tag_url_attributes where the
 372            attributes of our tag begin.  */
 373         first = i;
 374         break;
 375       }
 376   assert (first != -1);
 377
 378   /* Loop over the "interesting" attributes of this tag.  In this
 379      example, it will loop over "src" and "lowsrc".
 380
 381        <img src="foo.png" lowsrc="bar.png">
 382
 383      This has to be done in the outer loop so that the attributes are
 384      processed in the same order in which they appear in the page.
 385      This is required when converting links.  */
 386
 387   for (attrind = 0; attrind < tag->nattrs; attrind++)
 388     {
 389       /* Find whether TAG/ATTRIND is a combination that contains a
 390          URL. */
 391       char *link = tag->attrs[attrind].value;
 392       const int size = countof (tag_url_attributes);
 393
 394       /* If you're cringing at the inefficiency of the nested loops,
 395          remember that they both iterate over a very small number of
 396          items.  The worst-case inner loop is for the IMG tag, which
 397          has three attributes.  */
 398       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 399         {
 400           if (0 == strcasecmp (tag->attrs[attrind].name,
 401                                tag_url_attributes[i].attr_name))
 402             {
 403               struct urlpos *up = append_url (link, tag, attrind, ctx);
 404               if (up)
 405                 {
 406                   int flags = tag_url_attributes[i].flags;
 407                   if (flags & ATTR_INLINE)
 408                     up->link_inline_p = 1;
 409                   if (flags & ATTR_HTML)
 410                     up->link_expect_html = 1;
 411                 }
 412             }
 413         }
 414     }
 415 }
 416
 417 /* Handle the BASE tag, for <base href=...>. */
 418
 419 static void
 420 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 421 {
 422   struct urlpos *base_urlpos;
 423   int attrind;
 424   char *newbase = find_attr (tag, "href", &attrind);
 425   if (!newbase)
 426     return;
 427
 428   base_urlpos = append_url (newbase, tag, attrind, ctx);
 429   if (!base_urlpos)
 430     return;
 431   base_urlpos->ignore_when_downloading = 1;
 432   base_urlpos->link_base_p = 1;
 433
 434   if (ctx->base)
 435     xfree (ctx->base);
 436   if (ctx->parent_base)
 437     ctx->base = uri_merge (ctx->parent_base, newbase);
 438   else
 439     ctx->base = xstrdup (newbase);
 440 }
 441
 442 /* Mark the URL found in <form action=...> for conversion. */
 443
 444 static void
 445 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 446 {
 447   int attrind;
 448   char *action = find_attr (tag, "action", &attrind);
 449   if (action)
 450     {
 451       struct urlpos *up = append_url (action, tag, attrind, ctx);
 452       if (up)
 453         up->ignore_when_downloading = 1;
 454     }
 455 }
 456
 457 /* Handle the LINK tag.  It requires special handling because how its
 458    links will be followed in -p mode depends on the REL attribute.  */
 459
 460 static void
 461 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 462 {
 463   int attrind;
 464   char *href = find_attr (tag, "href", &attrind);
 465
 466   /* All <link href="..."> link references are external, except those
 467      known not to be, such as style sheet and shortcut icon:
 468
 469        <link rel="stylesheet" href="...">
 470        <link rel="shortcut icon" href="...">
 471   */
 472   if (href)
 473     {
 474       struct urlpos *up = append_url (href, tag, attrind, ctx);
 475       if (up)
 476         {
 477           char *rel = find_attr (tag, "rel", NULL);
 478           if (rel
 479               && (0 == strcasecmp (rel, "stylesheet")
 480                   || 0 == strcasecmp (rel, "shortcut icon")))
 481             up->link_inline_p = 1;
 482         }
 483     }
 484 }
 485
 486 /* Handle the META tag.  This requires special handling because of the
 487    refresh feature and because of robot exclusion.  */
 488
 489 static void
 490 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 491 {
 492   char *name = find_attr (tag, "name", NULL);
 493   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 494
 495   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 496     {
 497       /* Some pages use a META tag to specify that the page be
 498          refreshed by a new page after a given number of seconds.  The
 499          general format for this is:
 500
 501            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 502
 503          So we just need to skip past the "NUMBER; URL=" garbage to
 504          get to the URL.  */
 505
 506       struct urlpos *entry;
 507       int attrind;
 508       int timeout = 0;
 509       char *p;
 510
 511       char *refresh = find_attr (tag, "content", &attrind);
 512       if (!refresh)
 513         return;
 514
 515       for (p = refresh; ISDIGIT (*p); p++)
 516         timeout = 10 * timeout + *p - '0';
 517       if (*p++ != ';')
 518         return;
 519
 520       while (ISSPACE (*p))
 521         ++p;
 522       if (!(   TOUPPER (*p)       == 'U'
 523             && TOUPPER (*(p + 1)) == 'R'
 524             && TOUPPER (*(p + 2)) == 'L'
 525             &&          *(p + 3)  == '='))
 526         return;
 527       p += 4;
 528       while (ISSPACE (*p))
 529         ++p;
 530
 531       entry = append_url (p, tag, attrind, ctx);
 532       if (entry)
 533         {
 534           entry->link_refresh_p = 1;
 535           entry->refresh_timeout = timeout;
 536           entry->link_expect_html = 1;
 537         }
 538     }
 539   else if (name && 0 == strcasecmp (name, "robots"))
 540     {
 541       /* Handle stuff like:
 542          <meta name="robots" content="index,nofollow"> */
 543       char *content = find_attr (tag, "content", NULL);
 544       if (!content)
 545         return;
 546       if (!strcasecmp (content, "none"))
 547         ctx->nofollow = 1;
 548       else
 549         {
 550           while (*content)
 551             {
 552               /* Find the next occurrence of ',' or the end of
 553                  the string.  */
 554               char *end = strchr (content, ',');
 555               if (end)
 556                 ++end;
 557               else
 558                 end = content + strlen (content);
 559               if (!strncasecmp (content, "nofollow", end - content))
 560                 ctx->nofollow = 1;
 561               content = end;
 562             }
 563         }
 564     }
 565 }
 566
 567 /* Dispatch the tag handler appropriate for the tag we're mapping
 568    over.  See known_tags[] for definition of tag handlers.  */
 569
 570 static void
 571 collect_tags_mapper (struct taginfo *tag, void *arg)
 572 {
 573   struct map_context *ctx = (struct map_context *)arg;
 574
 575   /* Find the tag in our table of tags.  This must not fail because
 576      map_html_tags only returns tags found in interesting_tags.  */
 577   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 578   assert (t != NULL);
 579
 580   t->handler (t->tagid, tag, ctx);
 581 }
 582 \f
 583 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 584    it.  It merges relative links in FILE with URL.  It is aware of
 585    <base href=...> and does the right thing.  */
 586
 587 struct urlpos *
 588 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
 589 {
 590   struct file_memory *fm;
 591   struct map_context ctx;
 592   int flags;
 593
 594   /* Load the file. */
 595   fm = read_file (file);
 596   if (!fm)
 597     {
 598       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 599       return NULL;
 600     }
 601   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 602
 603   ctx.text = fm->content;
 604   ctx.head = ctx.tail = NULL;
 605   ctx.base = NULL;
 606   ctx.parent_base = url ? url : opt.base_href;
 607   ctx.document_file = file;
 608   ctx.nofollow = 0;
 609
 610   if (!interesting_tags)
 611     init_interesting ();
 612
 613   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 614      generate <a href=" foo"> instead of <a href="foo"> (Netscape
 615      ignores spaces as well.)  If you really mean space, use &32; or
 616      %20.  */
 617   flags = MHT_TRIM_VALUES;
 618   if (opt.strict_comments)
 619     flags |= MHT_STRICT_COMMENTS;
 620
 621   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 622                  interesting_tags, interesting_attributes);
 623
 624   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 625   if (meta_disallow_follow)
 626     *meta_disallow_follow = ctx.nofollow;
 627
 628   FREE_MAYBE (ctx.base);
 629   read_file_free (fm);
 630   return ctx.head;
 631 }
 632
 633 /* This doesn't really have anything to do with HTML, but it's similar
 634    to get_urls_html, so we put it here.  */
 635
 636 struct urlpos *
 637 get_urls_file (const char *file)
 638 {
 639   struct file_memory *fm;
 640   struct urlpos *head, *tail;
 641   const char *text, *text_end;
 642
 643   /* Load the file.  */
 644   fm = read_file (file);
 645   if (!fm)
 646     {
 647       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 648       return NULL;
 649     }
 650   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 651
 652   head = tail = NULL;
 653   text = fm->content;
 654   text_end = fm->content + fm->length;
 655   while (text < text_end)
 656     {
 657       int up_error_code;
 658       char *url_text;
 659       struct urlpos *entry;
 660       struct url *url;
 661
 662       const char *line_beg = text;
 663       const char *line_end = memchr (text, '\n', text_end - text);
 664       if (!line_end)
 665         line_end = text_end;
 666       else
 667         ++line_end;
 668       text = line_end;
 669
 670       /* Strip whitespace from the beginning and end of line. */
 671       while (line_beg < line_end && ISSPACE (*line_beg))
 672         ++line_beg;
 673       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
 674         --line_end;
 675
 676       if (line_beg == line_end)
 677         continue;
 678
 679       /* The URL is in the [line_beg, line_end) region. */
 680
 681       /* We must copy the URL to a zero-terminated string, and we
 682          can't use alloca because we're in a loop.  *sigh*.  */
 683       url_text = strdupdelim (line_beg, line_end);
 684
 685       if (opt.base_href)
 686         {
 687           /* Merge opt.base_href with URL. */
 688           char *merged = uri_merge (opt.base_href, url_text);
 689           xfree (url_text);
 690           url_text = merged;
 691         }
 692
 693       url = url_parse (url_text, &up_error_code);
 694       if (!url)
 695         {
 696           logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
 697                      file, url_text, url_error (up_error_code));
 698           xfree (url_text);
 699           continue;
 700         }
 701       xfree (url_text);
 702
 703       entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
 704       memset (entry, 0, sizeof (*entry));
 705       entry->next = NULL;
 706       entry->url = url;
 707
 708       if (!head)
 709         head = entry;
 710       else
 711         tail->next = entry;
 712       tail = entry;
 713     }
 714   read_file_free (fm);
 715   return head;
 716 }
 717
 718 void
 719 cleanup_html_url (void)
 720 {
 721   FREE_MAYBE (interesting_tags);
 722   FREE_MAYBE (interesting_attributes);
 723 }