sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #define USE_GNULIB_ALLOC
  32
#include "wget.h"

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include <limits.h>

#include "html-parse.h"
#include "url.h"
#include "utils.h"
#include "hash.h"
#include "convert.h"
#include "recur.h"              /* declaration of get_urls_html */
  47
  48 struct map_context;
  49
  50 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
  51
  52 #define DECLARE_TAG_HANDLER(fun)                                \
  53   static void fun (int, struct taginfo *, struct map_context *)
  54
  55 DECLARE_TAG_HANDLER (tag_find_urls);
  56 DECLARE_TAG_HANDLER (tag_handle_base);
  57 DECLARE_TAG_HANDLER (tag_handle_form);
  58 DECLARE_TAG_HANDLER (tag_handle_link);
  59 DECLARE_TAG_HANDLER (tag_handle_meta);
  60
  61 enum {
  62   TAG_A,
  63   TAG_APPLET,
  64   TAG_AREA,
  65   TAG_BASE,
  66   TAG_BGSOUND,
  67   TAG_BODY,
  68   TAG_EMBED,
  69   TAG_FIG,
  70   TAG_FORM,
  71   TAG_FRAME,
  72   TAG_IFRAME,
  73   TAG_IMG,
  74   TAG_INPUT,
  75   TAG_LAYER,
  76   TAG_LINK,
  77   TAG_META,
  78   TAG_OBJECT,
  79   TAG_OVERLAY,
  80   TAG_SCRIPT,
  81   TAG_TABLE,
  82   TAG_TD,
  83   TAG_TH
  84 };
  85
  86 /* The list of known tags and functions used for handling them.  Most
  87    tags are simply harvested for URLs. */
  88 static struct known_tag {
  89   int tagid;
  90   const char *name;
  91   tag_handler_t handler;
  92 } known_tags[] = {
  93   { TAG_A,       "a",           tag_find_urls },
  94   { TAG_APPLET,  "applet",      tag_find_urls },
  95   { TAG_AREA,    "area",        tag_find_urls },
  96   { TAG_BASE,    "base",        tag_handle_base },
  97   { TAG_BGSOUND, "bgsound",     tag_find_urls },
  98   { TAG_BODY,    "body",        tag_find_urls },
  99   { TAG_EMBED,   "embed",       tag_find_urls },
 100   { TAG_FIG,     "fig",         tag_find_urls },
 101   { TAG_FORM,    "form",        tag_handle_form },
 102   { TAG_FRAME,   "frame",       tag_find_urls },
 103   { TAG_IFRAME,  "iframe",      tag_find_urls },
 104   { TAG_IMG,     "img",         tag_find_urls },
 105   { TAG_INPUT,   "input",       tag_find_urls },
 106   { TAG_LAYER,   "layer",       tag_find_urls },
 107   { TAG_LINK,    "link",        tag_handle_link },
 108   { TAG_META,    "meta",        tag_handle_meta },
 109   { TAG_OBJECT,  "object",      tag_find_urls },
 110   { TAG_OVERLAY, "overlay",     tag_find_urls },
 111   { TAG_SCRIPT,  "script",      tag_find_urls },
 112   { TAG_TABLE,   "table",       tag_find_urls },
 113   { TAG_TD,      "td",          tag_find_urls },
 114   { TAG_TH,      "th",          tag_find_urls }
 115 };
 116
 117 /* tag_url_attributes documents which attributes of which tags contain
 118    URLs to harvest.  It is used by tag_find_urls.  */
 119
 120 /* Defines for the FLAGS. */
 121
 122 /* The link is "inline", i.e. needs to be retrieved for this document
 123    to be correctly rendered.  Inline links include inlined images,
 124    stylesheets, children frames, etc.  */
 125 #define ATTR_INLINE     1
 126
 127 /* The link is expected to yield HTML contents.  It's important not to
 128    try to follow HTML obtained by following e.g. <img src="...">
 129    regardless of content-type.  Doing this causes infinite loops for
 130    "images" that return non-404 error pages with links to the same
 131    image.  */
 132 #define ATTR_HTML       2
 133
 134 /* For tags handled by tag_find_urls: attributes that contain URLs to
 135    download. */
 136 static struct {
 137   int tagid;
 138   const char *attr_name;
 139   int flags;
 140 } tag_url_attributes[] = {
 141   { TAG_A,              "href",         ATTR_HTML },
 142   { TAG_APPLET,         "code",         ATTR_INLINE },
 143   { TAG_AREA,           "href",         ATTR_HTML },
 144   { TAG_BGSOUND,        "src",          ATTR_INLINE },
 145   { TAG_BODY,           "background",   ATTR_INLINE },
 146   { TAG_EMBED,          "href",         ATTR_HTML },
 147   { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
 148   { TAG_FIG,            "src",          ATTR_INLINE },
 149   { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
 150   { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
 151   { TAG_IMG,            "href",         ATTR_INLINE },
 152   { TAG_IMG,            "lowsrc",       ATTR_INLINE },
 153   { TAG_IMG,            "src",          ATTR_INLINE },
 154   { TAG_INPUT,          "src",          ATTR_INLINE },
 155   { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
 156   { TAG_OBJECT,         "data",         ATTR_INLINE },
 157   { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
 158   { TAG_SCRIPT,         "src",          ATTR_INLINE },
 159   { TAG_TABLE,          "background",   ATTR_INLINE },
 160   { TAG_TD,             "background",   ATTR_INLINE },
 161   { TAG_TH,             "background",   ATTR_INLINE }
 162 };
 163
 164 /* The lists of interesting tags and attributes are built dynamically,
 165    from the information above.  However, some places in the code refer
 166    to the attributes not mentioned here.  We add them manually.  */
 167 static const char *additional_attributes[] = {
 168   "rel",                        /* used by tag_handle_link */
 169   "http-equiv",                 /* used by tag_handle_meta */
 170   "name",                       /* used by tag_handle_meta */
 171   "content",                    /* used by tag_handle_meta */
 172   "action"                      /* used by tag_handle_form */
 173 };
 174
 175 static struct hash_table *interesting_tags;
 176 static struct hash_table *interesting_attributes;
 177
 178 static void
 179 init_interesting (void)
 180 {
 181   /* Init the variables interesting_tags and interesting_attributes
 182      that are used by the HTML parser to know which tags and
 183      attributes we're interested in.  We initialize this only once,
 184      for performance reasons.
 185
 186      Here we also make sure that what we put in interesting_tags
 187      matches the user's preferences as specified through --ignore-tags
 188      and --follow-tags.  */
 189
 190   int i;
 191   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 192
 193   /* First, add all the tags we know hot to handle, mapped to their
 194      respective entries in known_tags.  */
 195   for (i = 0; i < countof (known_tags); i++)
 196     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 197
 198   /* Then remove the tags ignored through --ignore-tags.  */
 199   if (opt.ignore_tags)
 200     {
 201       char **ignored;
 202       for (ignored = opt.ignore_tags; *ignored; ignored++)
 203         hash_table_remove (interesting_tags, *ignored);
 204     }
 205
 206   /* If --follow-tags is specified, use only those tags.  */
 207   if (opt.follow_tags)
 208     {
 209       /* Create a new table intersecting --follow-tags and known_tags,
 210          and use it as interesting_tags.  */
 211       struct hash_table *intersect = make_nocase_string_hash_table (0);
 212       char **followed;
 213       for (followed = opt.follow_tags; *followed; followed++)
 214         {
 215           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 216           if (!t)
 217             continue;           /* ignore unknown --follow-tags entries. */
 218           hash_table_put (intersect, *followed, t);
 219         }
 220       hash_table_destroy (interesting_tags);
 221       interesting_tags = intersect;
 222     }
 223
 224   /* Add the attributes we care about. */
 225   interesting_attributes = make_nocase_string_hash_table (10);
 226   for (i = 0; i < countof (additional_attributes); i++)
 227     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 228   for (i = 0; i < countof (tag_url_attributes); i++)
 229     hash_table_put (interesting_attributes,
 230                     tag_url_attributes[i].attr_name, "1");
 231 }
 232
 233 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 234    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 235    index of the attribute in TAG will be stored there.  */
 236
 237 static char *
 238 find_attr (struct taginfo *tag, const char *name, int *attrind)
 239 {
 240   int i;
 241   for (i = 0; i < tag->nattrs; i++)
 242     if (!strcasecmp (tag->attrs[i].name, name))
 243       {
 244         if (attrind)
 245           *attrind = i;
 246         return tag->attrs[i].value;
 247       }
 248   return NULL;
 249 }
 250
 251 struct map_context {
 252   char *text;                   /* HTML text. */
 253   char *base;                   /* Base URI of the document, possibly
 254                                    changed through <base href=...>. */
 255   const char *parent_base;      /* Base of the current document. */
 256   const char *document_file;    /* File name of this document. */
 257   bool nofollow;                /* whether NOFOLLOW was specified in a
 258                                    <meta name=robots> tag. */
 259
 260   struct urlpos *head, *tail;   /* List of URLs that is being
 261                                    built. */
 262 };
 263
 264 /* Append LINK_URI to the urlpos structure that is being built.
 265
 266    LINK_URI will be merged with the current document base.  TAG and
 267    ATTRIND are the necessary context to store the position and
 268    size.  */
 269
 270 static struct urlpos *
 271 append_url (const char *link_uri,
 272             struct taginfo *tag, int attrind, struct map_context *ctx)
 273 {
 274   int link_has_scheme = url_has_scheme (link_uri);
 275   struct urlpos *newel;
 276   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 277   struct url *url;
 278
 279   if (!base)
 280     {
 281       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 282                ctx->document_file, link_uri));
 283
 284       if (!link_has_scheme)
 285         {
 286           /* Base URL is unavailable, and the link does not have a
 287              location attached to it -- we have to give up.  Since
 288              this can only happen when using `--force-html -i', print
 289              a warning.  */
 290           logprintf (LOG_NOTQUIET,
 291                      _("%s: Cannot resolve incomplete link %s.\n"),
 292                      ctx->document_file, link_uri);
 293           return NULL;
 294         }
 295
 296       url = url_parse (link_uri, NULL);
 297       if (!url)
 298         {
 299           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 300                    ctx->document_file, link_uri));
 301           return NULL;
 302         }
 303     }
 304   else
 305     {
 306       /* Merge BASE with LINK_URI, but also make sure the result is
 307          canonicalized, i.e. that "../" have been resolved.
 308          (parse_url will do that for us.) */
 309
 310       char *complete_uri = uri_merge (base, link_uri);
 311
 312       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 313                ctx->document_file, base, link_uri, complete_uri));
 314
 315       url = url_parse (complete_uri, NULL);
 316       if (!url)
 317         {
 318           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 319                    ctx->document_file, complete_uri));
 320           xfree (complete_uri);
 321           return NULL;
 322         }
 323       xfree (complete_uri);
 324     }
 325
 326   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 327
 328   newel = xnew0 (struct urlpos);
 329   newel->url = url;
 330   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 331   newel->size = tag->attrs[attrind].value_raw_size;
 332
 333   /* A URL is relative if the host is not named, and the name does not
 334      start with `/'.  */
 335   if (!link_has_scheme && *link_uri != '/')
 336     newel->link_relative_p = 1;
 337   else if (link_has_scheme)
 338     newel->link_complete_p = 1;
 339
 340   if (ctx->tail)
 341     {
 342       ctx->tail->next = newel;
 343       ctx->tail = newel;
 344     }
 345   else
 346     ctx->tail = ctx->head = newel;
 347
 348   return newel;
 349 }
 350 \f
 351 /* All the tag_* functions are called from collect_tags_mapper, as
 352    specified by KNOWN_TAGS.  */
 353
 354 /* Default tag handler: collect URLs from attributes specified for
 355    this tag by tag_url_attributes.  */
 356
 357 static void
 358 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 359 {
 360   int i, attrind;
 361   int first = -1;
 362
 363   for (i = 0; i < countof (tag_url_attributes); i++)
 364     if (tag_url_attributes[i].tagid == tagid)
 365       {
 366         /* We've found the index of tag_url_attributes where the
 367            attributes of our tag begin.  */
 368         first = i;
 369         break;
 370       }
 371   assert (first != -1);
 372
 373   /* Loop over the "interesting" attributes of this tag.  In this
 374      example, it will loop over "src" and "lowsrc".
 375
 376        <img src="foo.png" lowsrc="bar.png">
 377
 378      This has to be done in the outer loop so that the attributes are
 379      processed in the same order in which they appear in the page.
 380      This is required when converting links.  */
 381
 382   for (attrind = 0; attrind < tag->nattrs; attrind++)
 383     {
 384       /* Find whether TAG/ATTRIND is a combination that contains a
 385          URL. */
 386       char *link = tag->attrs[attrind].value;
 387       const int size = countof (tag_url_attributes);
 388
 389       /* If you're cringing at the inefficiency of the nested loops,
 390          remember that they both iterate over a very small number of
 391          items.  The worst-case inner loop is for the IMG tag, which
 392          has three attributes.  */
 393       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 394         {
 395           if (0 == strcasecmp (tag->attrs[attrind].name,
 396                                tag_url_attributes[i].attr_name))
 397             {
 398               struct urlpos *up = append_url (link, tag, attrind, ctx);
 399               if (up)
 400                 {
 401                   int flags = tag_url_attributes[i].flags;
 402                   if (flags & ATTR_INLINE)
 403                     up->link_inline_p = 1;
 404                   if (flags & ATTR_HTML)
 405                     up->link_expect_html = 1;
 406                 }
 407             }
 408         }
 409     }
 410 }
 411
 412 /* Handle the BASE tag, for <base href=...>. */
 413
 414 static void
 415 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 416 {
 417   struct urlpos *base_urlpos;
 418   int attrind;
 419   char *newbase = find_attr (tag, "href", &attrind);
 420   if (!newbase)
 421     return;
 422
 423   base_urlpos = append_url (newbase, tag, attrind, ctx);
 424   if (!base_urlpos)
 425     return;
 426   base_urlpos->ignore_when_downloading = 1;
 427   base_urlpos->link_base_p = 1;
 428
 429   if (ctx->base)
 430     xfree (ctx->base);
 431   if (ctx->parent_base)
 432     ctx->base = uri_merge (ctx->parent_base, newbase);
 433   else
 434     ctx->base = xstrdup (newbase);
 435 }
 436
 437 /* Mark the URL found in <form action=...> for conversion. */
 438
 439 static void
 440 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 441 {
 442   int attrind;
 443   char *action = find_attr (tag, "action", &attrind);
 444   if (action)
 445     {
 446       struct urlpos *up = append_url (action, tag, attrind, ctx);
 447       if (up)
 448         up->ignore_when_downloading = 1;
 449     }
 450 }
 451
 452 /* Handle the LINK tag.  It requires special handling because how its
 453    links will be followed in -p mode depends on the REL attribute.  */
 454
 455 static void
 456 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 457 {
 458   int attrind;
 459   char *href = find_attr (tag, "href", &attrind);
 460
 461   /* All <link href="..."> link references are external, except those
 462      known not to be, such as style sheet and shortcut icon:
 463
 464        <link rel="stylesheet" href="...">
 465        <link rel="shortcut icon" href="...">
 466   */
 467   if (href)
 468     {
 469       struct urlpos *up = append_url (href, tag, attrind, ctx);
 470       if (up)
 471         {
 472           char *rel = find_attr (tag, "rel", NULL);
 473           if (rel
 474               && (0 == strcasecmp (rel, "stylesheet")
 475                   || 0 == strcasecmp (rel, "shortcut icon")))
 476             up->link_inline_p = 1;
 477           else
 478             /* The external ones usually point to HTML pages, such as
 479                <link rel="next" href="..."> */
 480             up->link_expect_html = 1;
 481         }
 482     }
 483 }
 484
 485 /* Handle the META tag.  This requires special handling because of the
 486    refresh feature and because of robot exclusion.  */
 487
 488 static void
 489 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 490 {
 491   char *name = find_attr (tag, "name", NULL);
 492   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 493
 494   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 495     {
 496       /* Some pages use a META tag to specify that the page be
 497          refreshed by a new page after a given number of seconds.  The
 498          general format for this is:
 499
 500            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 501
 502          So we just need to skip past the "NUMBER; URL=" garbage to
 503          get to the URL.  */
 504
 505       struct urlpos *entry;
 506       int attrind;
 507       int timeout = 0;
 508       char *p;
 509
 510       char *refresh = find_attr (tag, "content", &attrind);
 511       if (!refresh)
 512         return;
 513
 514       for (p = refresh; c_isdigit (*p); p++)
 515         timeout = 10 * timeout + *p - '0';
 516       if (*p++ != ';')
 517         return;
 518
 519       while (c_isspace (*p))
 520         ++p;
 521       if (!(   c_toupper (*p)       == 'U'
 522             && c_toupper (*(p + 1)) == 'R'
 523             && c_toupper (*(p + 2)) == 'L'
 524             &&          *(p + 3)  == '='))
 525         return;
 526       p += 4;
 527       while (c_isspace (*p))
 528         ++p;
 529
 530       entry = append_url (p, tag, attrind, ctx);
 531       if (entry)
 532         {
 533           entry->link_refresh_p = 1;
 534           entry->refresh_timeout = timeout;
 535           entry->link_expect_html = 1;
 536         }
 537     }
 538   else if (name && 0 == strcasecmp (name, "robots"))
 539     {
 540       /* Handle stuff like:
 541          <meta name="robots" content="index,nofollow"> */
 542       char *content = find_attr (tag, "content", NULL);
 543       if (!content)
 544         return;
 545       if (!strcasecmp (content, "none"))
 546         ctx->nofollow = true;
 547       else
 548         {
 549           while (*content)
 550             {
 551               /* Find the next occurrence of ',' or the end of
 552                  the string.  */
 553               char *end = strchr (content, ',');
 554               if (end)
 555                 ++end;
 556               else
 557                 end = content + strlen (content);
 558               if (!strncasecmp (content, "nofollow", end - content))
 559                 ctx->nofollow = true;
 560               content = end;
 561             }
 562         }
 563     }
 564 }
 565
 566 /* Dispatch the tag handler appropriate for the tag we're mapping
 567    over.  See known_tags[] for definition of tag handlers.  */
 568
 569 static void
 570 collect_tags_mapper (struct taginfo *tag, void *arg)
 571 {
 572   struct map_context *ctx = (struct map_context *)arg;
 573
 574   /* Find the tag in our table of tags.  This must not fail because
 575      map_html_tags only returns tags found in interesting_tags.  */
 576   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 577   assert (t != NULL);
 578
 579   t->handler (t->tagid, tag, ctx);
 580 }
 581 \f
 582 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 583    it.  It merges relative links in FILE with URL.  It is aware of
 584    <base href=...> and does the right thing.  */
 585
 586 struct urlpos *
 587 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
 588 {
 589   struct file_memory *fm;
 590   struct map_context ctx;
 591   int flags;
 592
 593   /* Load the file. */
 594   fm = read_file (file);
 595   if (!fm)
 596     {
 597       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 598       return NULL;
 599     }
 600   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 601
 602   ctx.text = fm->content;
 603   ctx.head = ctx.tail = NULL;
 604   ctx.base = NULL;
 605   ctx.parent_base = url ? url : opt.base_href;
 606   ctx.document_file = file;
 607   ctx.nofollow = false;
 608
 609   if (!interesting_tags)
 610     init_interesting ();
 611
 612   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 613      generate <a href=" foo"> instead of <a href="foo"> (browsers
 614      ignore spaces as well.)  If you really mean space, use &32; or
 615      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 616      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 617      ignored by IE and Mozilla and are presumably introduced by
 618      writing HTML with editors that force word wrap.  */
 619   flags = MHT_TRIM_VALUES;
 620   if (opt.strict_comments)
 621     flags |= MHT_STRICT_COMMENTS;
 622
 623   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 624                  interesting_tags, interesting_attributes);
 625
 626   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 627   if (meta_disallow_follow)
 628     *meta_disallow_follow = ctx.nofollow;
 629
 630   xfree_null (ctx.base);
 631   read_file_free (fm);
 632   return ctx.head;
 633 }
 634
 635 /* This doesn't really have anything to do with HTML, but it's similar
 636    to get_urls_html, so we put it here.  */
 637
 638 struct urlpos *
 639 get_urls_file (const char *file)
 640 {
 641   struct file_memory *fm;
 642   struct urlpos *head, *tail;
 643   const char *text, *text_end;
 644
 645   /* Load the file.  */
 646   fm = read_file (file);
 647   if (!fm)
 648     {
 649       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 650       return NULL;
 651     }
 652   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 653
 654   head = tail = NULL;
 655   text = fm->content;
 656   text_end = fm->content + fm->length;
 657   while (text < text_end)
 658     {
 659       int up_error_code;
 660       char *url_text;
 661       struct urlpos *entry;
 662       struct url *url;
 663
 664       const char *line_beg = text;
 665       const char *line_end = memchr (text, '\n', text_end - text);
 666       if (!line_end)
 667         line_end = text_end;
 668       else
 669         ++line_end;
 670       text = line_end;
 671
 672       /* Strip whitespace from the beginning and end of line. */
 673       while (line_beg < line_end && c_isspace (*line_beg))
 674         ++line_beg;
 675       while (line_end > line_beg && c_isspace (*(line_end - 1)))
 676         --line_end;
 677
 678       if (line_beg == line_end)
 679         continue;
 680
 681       /* The URL is in the [line_beg, line_end) region. */
 682
 683       /* We must copy the URL to a zero-terminated string, and we
 684          can't use alloca because we're in a loop.  *sigh*.  */
 685       url_text = strdupdelim (line_beg, line_end);
 686
 687       if (opt.base_href)
 688         {
 689           /* Merge opt.base_href with URL. */
 690           char *merged = uri_merge (opt.base_href, url_text);
 691           xfree (url_text);
 692           url_text = merged;
 693         }
 694
 695       url = url_parse (url_text, &up_error_code);
 696       if (!url)
 697         {
 698           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 699                      file, url_text, url_error (up_error_code));
 700           xfree (url_text);
 701           continue;
 702         }
 703       xfree (url_text);
 704
 705       entry = xnew0 (struct urlpos);
 706       entry->url = url;
 707
 708       if (!head)
 709         head = entry;
 710       else
 711         tail->next = entry;
 712       tail = entry;
 713     }
 714   read_file_free (fm);
 715   return head;
 716 }
 717
 718 void
 719 cleanup_html_url (void)
 720 {
 721   /* Destroy the hash tables.  The hash table keys and values are not
 722      allocated by this code, so we don't need to free them here.  */
 723   if (interesting_tags)
 724     hash_table_destroy (interesting_tags);
 725   if (interesting_attributes)
 726     hash_table_destroy (interesting_attributes);
 727 }