1 /* Collect URLs from HTML source.
2 Copyright (C) 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
43 #include "html-parse.h"
/* Signature shared by all tag handlers: the tag id (index into
   known_tags), the parsed tag, and the per-document parsing
   context.  */
54 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
55 struct map_context *));
/* Forward-declare a static tag handler with the standard signature,
   so the known_tags table below can reference it.  */
57 #define DECLARE_TAG_HANDLER(fun) \
58 static void fun PARAMS ((int, struct taginfo *, struct map_context *))
60 DECLARE_TAG_HANDLER (tag_find_urls);
61 DECLARE_TAG_HANDLER (tag_handle_base);
62 DECLARE_TAG_HANDLER (tag_handle_form);
63 DECLARE_TAG_HANDLER (tag_handle_link);
64 DECLARE_TAG_HANDLER (tag_handle_meta);
66 /* The list of known tags and functions used for handling them. Most
67 tags are simply harvested for URLs. */
/* Each entry maps a tag name to its handler; the entry's index is the
   tag id handed to the handler (cf. TAG_OVERLAY/TAG_SCRIPT below).
   The table is kept sorted by name -- find_tag's comment notes it
   relies on alphabetical order.  NOTE(review): the struct declaration
   opening this table is elided in this excerpt.  */
70 tag_handler_t handler;
73 { "a", tag_find_urls },
75 { "applet", tag_find_urls },
77 { "area", tag_find_urls },
79 { "base", tag_handle_base },
81 { "bgsound", tag_find_urls },
83 { "body", tag_find_urls },
85 { "embed", tag_find_urls },
87 { "fig", tag_find_urls },
89 { "form", tag_handle_form },
91 { "frame", tag_find_urls },
93 { "iframe", tag_find_urls },
95 { "img", tag_find_urls },
97 { "input", tag_find_urls },
99 { "layer", tag_find_urls },
101 { "link", tag_handle_link },
103 { "meta", tag_handle_meta },
/* "overlay" is the 17th entry, hence tag id 16.  */
104 #define TAG_OVERLAY 16
105 { "overlay", tag_find_urls },
106 #define TAG_SCRIPT 17
107 { "script", tag_find_urls },
109 { "table", tag_find_urls },
111 { "td", tag_find_urls },
113 { "th", tag_find_urls }
116 /* tag_url_attributes documents which attributes of which tags contain
117 URLs to harvest. It is used by tag_find_urls. */
119 /* Defines for the FLAGS field; currently only one flag is defined. */
121 /* This tag points to an external document not necessary for rendering this
122 document (i.e. it's not an inlined image, stylesheet, etc.). */
123 #define TUA_EXTERNAL 1
125 /* For tags handled by tag_find_urls: attributes that contain URLs to
/* Entries sharing a tagid are contiguous -- tag_find_urls locates the
   first matching entry and scans forward while tagid matches.
   NOTE(review): the TAG_A..TAG_TH id defines referenced here are
   elided in this excerpt; presumably they index known_tags above --
   verify against the full file.  */
129 const char *attr_name;
131 } tag_url_attributes[] = {
132 { TAG_A, "href", TUA_EXTERNAL },
133 { TAG_APPLET, "code", 0 },
134 { TAG_AREA, "href", TUA_EXTERNAL },
135 { TAG_BGSOUND, "src", 0 },
136 { TAG_BODY, "background", 0 },
137 { TAG_EMBED, "href", TUA_EXTERNAL },
138 { TAG_EMBED, "src", 0 },
139 { TAG_FIG, "src", 0 },
140 { TAG_FRAME, "src", 0 },
141 { TAG_IFRAME, "src", 0 },
142 { TAG_IMG, "href", 0 },
143 { TAG_IMG, "lowsrc", 0 },
144 { TAG_IMG, "src", 0 },
145 { TAG_INPUT, "src", 0 },
146 { TAG_LAYER, "src", 0 },
147 { TAG_OVERLAY, "src", 0 },
148 { TAG_SCRIPT, "src", 0 },
149 { TAG_TABLE, "background", 0 },
150 { TAG_TD, "background", 0 },
151 { TAG_TH, "background", 0 }
154 /* The lists of interesting tags and attributes are built dynamically,
155 from the information above. However, some places in the code refer
156 to the attributes not mentioned here. We add them manually. */
157 static const char *additional_attributes[] = {
158 "rel", /* used by tag_handle_link */
159 "http-equiv", /* used by tag_handle_meta */
160 "name", /* used by tag_handle_meta */
161 "content", /* used by tag_handle_meta */
162 "action" /* used by tag_handle_form */
/* NULL-terminated lists handed to map_html_tags by get_urls_html;
   built once by init_interesting, freed by cleanup_html_url.  */
165 static const char **interesting_tags;
166 static const char **interesting_attributes;
/* Build interesting_tags and interesting_attributes from known_tags,
   tag_url_attributes and additional_attributes, honoring the user's
   --follow-tags/--ignore-tags settings.  NOTE(review): the return
   type line and several statement lines are elided in this
   excerpt.  */
169 init_interesting (void)
171 /* Init the variables interesting_tags and interesting_attributes
172 that are used by the HTML parser to know which tags and
173 attributes we're interested in. We initialize this only once,
174 for performance reasons.
176 Here we also make sure that what we put in interesting_tags
177 matches the user's preferences as specified through --ignore-tags
180 This function is as large as this only because of the glorious
181 expressivity of the C programming language. */
185 int size = countof (known_tags);
/* +1 leaves room for the terminating NULL stored after the loop.  */
186 interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));
188 for (i = 0; i < size; i++)
190 const char *name = known_tags[i].name;
192 /* Normally here we could say:
193 interesting_tags[i] = name;
194 But we need to respect the settings of --ignore-tags and
195 --follow-tags, so the code gets a bit hairier. */
199 /* --ignore-tags was specified. Do not match these
200 specific tags. --ignore-tags takes precedence over
201 --follow-tags, so we process --ignore first and fall
202 through if there's no match. */
204 for (j = 0; opt.ignore_tags[j] != NULL; j++)
205 /* Loop through all the tags this user doesn't care about. */
206 if (strcasecmp(opt.ignore_tags[j], name) == EQ)
217 /* --follow-tags was specified. Only match these specific tags, so
218 continue back to top of for if we don't match one of them. */
220 for (j = 0; opt.follow_tags[j] != NULL; j++)
221 /* Loop through all the tags this user cares about. */
222 if (strcasecmp(opt.follow_tags[j], name) == EQ)
228 continue; /* wasn't one of the explicitly desired tags */
231 /* If we get to here, --follow-tags isn't being used or the
232 tag is among the ones that are followed, and --ignore-tags,
233 if specified, didn't include this tag, so it's an
234 "interesting" one. */
235 interesting_tags[ind++] = name;
/* NULL-terminate the tag list for map_html_tags.  */
237 interesting_tags[ind] = NULL;
240 /* The same for attributes, except we loop through tag_url_attributes.
241 Here we also need to make sure that the list of attributes is
242 unique, and to include the attributes from additional_attributes. */
245 const char **att = xmalloc ((countof (additional_attributes) + 1)
247 /* First copy the "additional" attributes. */
248 for (i = 0; i < countof (additional_attributes); i++)
249 att[i] = additional_attributes[i];
/* Then append each attr_name from tag_url_attributes, skipping
   names already present in ATT (dedup via the inner strcmp loop).  */
252 for (i = 0; i < countof (tag_url_attributes); i++)
255 const char *look_for = tag_url_attributes[i].attr_name;
256 for (j = 0; j < ind - 1; j++)
257 if (!strcmp (att[j], look_for))
/* Not a duplicate: grow ATT (+2 = new entry plus NULL terminator)
   and record the attribute name.  */
264 att = xrealloc (att, (ind + 2) * sizeof (*att));
265 att[ind++] = look_for;
269 interesting_attributes = att;
/* Look up TAG_NAME in known_tags and return its index (the tag id
   passed to handlers).  NOTE(review): most of this function is elided
   in this excerpt; it presumably returns -1 for an unknown tag, since
   collect_tags_mapper asserts tagid != -1 -- verify against the full
   file.  */
274 find_tag (const char *tag_name)
278 /* This is linear search; if the number of tags grow, we can switch
281 for (i = 0; i < countof (known_tags); i++)
283 int cmp = strcasecmp (known_tags[i].name, tag_name);
284 /* known_tags are sorted alphabetically, so we can
294 /* Find the value of attribute named NAME in the taginfo TAG. If the
295 attribute is not present, return NULL. If ATTRIND is non-NULL, the
296 index of the attribute in TAG will be stored there. */
298 find_attr (struct taginfo *tag, const char *name, int *attrind)
/* Linear scan; attribute names compare case-insensitively, as HTML
   attribute names are case-insensitive.  */
301 for (i = 0; i < tag->nattrs; i++)
302 if (!strcasecmp (tag->attrs[i].name, name))
306 return tag->attrs[i].value;
/* Per-document state threaded through every tag handler.
   NOTE(review): the "struct map_context {" line itself is elided in
   this excerpt.  */
312 char *text; /* HTML text. */
313 char *base; /* Base URI of the document, possibly
314 changed through <base href=...>. */
315 const char *parent_base; /* Base of the current document. */
316 const char *document_file; /* File name of this document. */
317 int nofollow; /* whether NOFOLLOW was specified in a
318 <meta name=robots> tag. */
320 struct urlpos *head, *tail; /* List of URLs that is being
324 /* Append LINK_URI to the urlpos structure that is being built.
326 LINK_URI will be merged with the current document base. TAG and
327 ATTRIND are the necessary context to store the position and
/* INLINEP marks the link as an inlined resource (image, frame, ...)
   as opposed to an external document; see tag_find_urls and
   TUA_EXTERNAL.  Returns the new list entry, used by callers such as
   tag_handle_base to set extra flags.  NOTE(review): several
   statement lines (branch openings, returns) are elided in this
   excerpt.  */
330 static struct urlpos *
331 append_one_url (const char *link_uri, int inlinep,
332 struct taginfo *tag, int attrind, struct map_context *ctx)
334 int link_has_scheme = url_has_scheme (link_uri);
335 struct urlpos *newel;
/* A <base href=...> seen earlier (ctx->base) overrides the document's
   own base URL (ctx->parent_base).  */
336 const char *base = ctx->base ? ctx->base : ctx->parent_base;
341 DEBUGP (("%s: no base, merge will use \"%s\".\n",
342 ctx->document_file, link_uri));
344 if (!link_has_scheme)
346 /* Base URL is unavailable, and the link does not have a
347 location attached to it -- we have to give up. Since
348 this can only happen when using `--force-html -i', print
350 logprintf (LOG_NOTQUIET,
351 _("%s: Cannot resolve incomplete link %s.\n"),
352 ctx->document_file, link_uri);
/* No base, but the link is absolute: parse it as-is.  */
356 url = url_parse (link_uri, NULL);
359 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
360 ctx->document_file, link_uri));
366 /* Merge BASE with LINK_URI, but also make sure the result is
367 canonicalized, i.e. that "../" have been resolved.
368 (parse_url will do that for us.) */
370 char *complete_uri = uri_merge (base, link_uri);
372 DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
373 ctx->document_file, base, link_uri, complete_uri));
375 url = url_parse (complete_uri, NULL);
378 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
379 ctx->document_file, complete_uri));
/* complete_uri was only needed for parsing; free it on both the
   failure and success paths.  */
380 xfree (complete_uri);
383 xfree (complete_uri);
386 DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
388 newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
389 memset (newel, 0, sizeof (*newel));
/* Record where the raw attribute value sits in ctx->text, so link
   conversion can later rewrite it in place.  */
393 newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
394 newel->size = tag->attrs[attrind].value_raw_size;
395 newel->link_inline_p = inlinep;
397 /* A URL is relative if the host is not named, and the name does not
399 if (!link_has_scheme && *link_uri != '/')
400 newel->link_relative_p = 1;
401 else if (link_has_scheme)
402 newel->link_complete_p = 1;
/* Append NEWEL to the ctx->head/ctx->tail list (first entry sets
   both pointers).  */
406 ctx->tail->next = newel;
410 ctx->tail = ctx->head = newel;
415 /* All the tag_* functions are called from collect_tags_mapper, as
416 specified by KNOWN_TAGS. */
418 /* Default tag handler: collect URLs from attributes specified for
419 this tag by tag_url_attributes. */
422 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
424 int i, attrind, first = -1;
425 int size = countof (tag_url_attributes);
/* Locate the first tag_url_attributes entry for TAGID; entries for
   one tag are contiguous in that table.  */
427 for (i = 0; i < size; i++)
428 if (tag_url_attributes[i].tagid == tagid)
430 /* We've found the index of tag_url_attributes where the
431 attributes of our tag begin. */
/* Every tag dispatched here must appear in tag_url_attributes.  */
435 assert (first != -1);
437 /* Loop over the "interesting" attributes of this tag. In this
438 example, it will loop over "src" and "lowsrc".
440 <img src="foo.png" lowsrc="bar.png">
442 This has to be done in the outer loop so that the attributes are
443 processed in the same order in which they appear in the page.
444 This is required when converting links. */
446 for (attrind = 0; attrind < tag->nattrs; attrind++)
448 /* Find whether TAG/ATTRIND is a combination that contains a
450 char *link = tag->attrs[attrind].value;
452 /* If you're cringing at the inefficiency of the nested loops,
453 remember that they both iterate over a laughably small
454 quantity of items. The worst-case inner loop is for the IMG
455 tag, which has three attributes. */
456 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
458 if (0 == strcasecmp (tag->attrs[attrind].name,
459 tag_url_attributes[i].attr_name))
461 int flags = tag_url_attributes[i].flags;
/* TUA_EXTERNAL links are not inlined resources, hence the
   negation for the INLINEP argument.  */
462 append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
468 /* Handle the BASE tag, for <base href=...>. */
471 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
473 struct urlpos *base_urlpos;
475 char *newbase = find_attr (tag, "href", &attrind);
/* Record the base URL in the urlpos list so link conversion can
   rewrite it, but never download it.  */
479 base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
482 base_urlpos->ignore_when_downloading = 1;
483 base_urlpos->link_base_p = 1;
/* A possibly-relative <base href> is resolved against the document's
   own base; the result overrides ctx->base for subsequent links.  */
487 if (ctx->parent_base)
488 ctx->base = uri_merge (ctx->parent_base, newbase);
490 ctx->base = xstrdup (newbase);
493 /* Mark the URL found in <form action=...> for conversion. */
496 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
499 char *action = find_attr (tag, "action", &attrind);
/* The action URL is recorded for link conversion only, never
   downloaded.  */
502 struct urlpos *action_urlpos = append_one_url (action, 0, tag,
505 action_urlpos->ignore_when_downloading = 1;
509 /* Handle the LINK tag. It requires special handling because how its
510 links will be followed in -p mode depends on the REL attribute. */
513 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
516 char *href = find_attr (tag, "href", &attrind);
518 /* All <link href="..."> link references are external, except those
519 known not to be, such as style sheet and shortcut icon:
521 <link rel="stylesheet" href="...">
522 <link rel="shortcut icon" href="...">
/* REL decides whether the link counts as an inlined resource
   (stylesheet / shortcut icon) or an external document.  */
526 char *rel = find_attr (tag, "rel", NULL);
528 && (0 == strcasecmp (rel, "stylesheet")
529 || 0 == strcasecmp (rel, "shortcut icon")));
530 append_one_url (href, inlinep, tag, attrind, ctx);
534 /* Handle the META tag. This requires special handling because of the
535 refresh feature and because of robot exclusion. */
538 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
540 char *name = find_attr (tag, "name", NULL);
541 char *http_equiv = find_attr (tag, "http-equiv", NULL);
543 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
545 /* Some pages use a META tag to specify that the page be
546 refreshed by a new page after a given number of seconds. The
547 general format for this is:
549 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
551 So we just need to skip past the "NUMBER; URL=" garbage to
554 struct urlpos *entry;
559 char *refresh = find_attr (tag, "content", &attrind);
/* Parse the leading decimal NUMBER as the refresh timeout.  */
563 for (p = refresh; ISDIGIT (*p); p++)
564 timeout = 10 * timeout + *p - '0';
/* Require a literal (case-insensitive) "URL" before the target;
   bail out of the handler otherwise.  */
570 if (!( TOUPPER (*p) == 'U'
571 && TOUPPER (*(p + 1)) == 'R'
572 && TOUPPER (*(p + 2)) == 'L'
/* P now points at the refresh target URL.  */
579 entry = append_one_url (p, 0, tag, attrind, ctx);
582 entry->link_refresh_p = 1;
583 entry->refresh_timeout = timeout;
586 else if (name && 0 == strcasecmp (name, "robots"))
588 /* Handle stuff like:
589 <meta name="robots" content="index,nofollow"> */
590 char *content = find_attr (tag, "content", NULL);
593 if (!strcasecmp (content, "none"))
599 /* Find the next occurrence of ',' or the end of
601 char *end = strchr (content, ',');
605 end = content + strlen (content);
/* Token-wise comparison: match "nofollow" up to the comma or end
   of string.  */
606 if (!strncasecmp (content, "nofollow", end - content))
614 /* Examine name and attributes of TAG and take appropriate action
615 according to the tag. */
618 collect_tags_mapper (struct taginfo *tag, void *arg)
620 struct map_context *ctx = (struct map_context *)arg;
622 tag_handler_t handler;
624 tagid = find_tag (tag->name);
/* map_html_tags only reports tags from interesting_tags, which is
   built from known_tags, so the lookup must succeed.  */
625 assert (tagid != -1);
626 handler = known_tags[tagid].handler;
628 handler (tagid, tag, ctx);
631 /* Analyze HTML tags FILE and construct a list of URLs referenced from
632 it. It merges relative links in FILE with URL. It is aware of
633 <base href=...> and does the right thing. */
/* If META_DISALLOW_FOLLOW is non-NULL, it receives the nofollow flag
   set by a <meta name=robots> tag.  NOTE(review): the return
   statement(s) and some initialization lines are elided in this
   excerpt.  */
636 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
638 struct file_memory *fm;
639 struct map_context ctx;
642 fm = read_file (file);
645 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
648 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
650 ctx.text = fm->content;
651 ctx.head = ctx.tail = NULL;
/* URL takes precedence; --base-href is the fallback document base.  */
653 ctx.parent_base = url ? url : opt.base_href;
654 ctx.document_file = file;
/* Lazily build the tag/attribute filter lists on first use.  */
657 if (!interesting_tags)
660 map_html_tags (fm->content, fm->length, interesting_tags,
661 interesting_attributes, collect_tags_mapper, &ctx);
663 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
664 if (meta_disallow_follow)
665 *meta_disallow_follow = ctx.nofollow;
667 FREE_MAYBE (ctx.base);
672 /* This doesn't really have anything to do with HTML, but it's similar
673 to get_urls_html, so we put it here. */
/* Read FILE as a plain list of URLs, one per line, and build a urlpos
   list from it.  Blank lines are skipped; surrounding whitespace is
   trimmed.  NOTE(review): several lines (return statements, list
   linking) are elided in this excerpt.  */
676 get_urls_file (const char *file)
678 struct file_memory *fm;
679 struct urlpos *head, *tail;
680 const char *text, *text_end;
683 fm = read_file (file);
686 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
689 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
693 text_end = fm->content + fm->length;
/* Process the buffer line by line.  */
694 while (text < text_end)
698 struct urlpos *entry;
701 const char *line_beg = text;
702 const char *line_end = memchr (text, '\n', text_end - text);
709 /* Strip whitespace from the beginning and end of line. */
710 while (line_beg < line_end && ISSPACE (*line_beg))
712 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
/* Skip lines that are empty after trimming.  */
715 if (line_beg == line_end)
718 /* The URL is in the [line_beg, line_end) region. */
720 /* We must copy the URL to a zero-terminated string, and we
721 can't use alloca because we're in a loop. *sigh*. */
722 url_text = strdupdelim (line_beg, line_end);
726 /* Merge opt.base_href with URL. */
727 char *merged = uri_merge (opt.base_href, url_text);
732 url = url_parse (url_text, &up_error_code);
/* Report unparsable URLs and continue with the next line.  */
735 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
736 file, url_text, url_error (up_error_code));
742 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
743 memset (entry, 0, sizeof (*entry));
/* Release the lazily-built tag/attribute filter lists (see
   init_interesting).  NOTE(review): the function's closing lines are
   not visible in this excerpt.  */
758 cleanup_html_url (void)
760 FREE_MAYBE (interesting_tags);
761 FREE_MAYBE (interesting_attributes);