sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
#include "wget.h"

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include <limits.h>

#include "html-parse.h"
#include "url.h"
#include "utils.h"
#include "hash.h"
#include "convert.h"
#include "recur.h"              /* declaration of get_urls_html */
#include "iri.h"
  46
/* Defined further down in this file; only pointers to it are needed
   for the handler prototypes below.  */
struct map_context;

/* Signature shared by every tag handler.  collect_tags_mapper calls
   a handler with the tag's TAG_* id, the parsed tag, and the context
   in which the URL list is being built.  */
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);

/* Declare FUN as a file-local function with the tag_handler_t
   signature, so it can be referenced in known_tags before it is
   defined.  */
#define DECLARE_TAG_HANDLER(fun)                                \
  static void fun (int, struct taginfo *, struct map_context *)

DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
  59
/* Numeric ids of the HTML tags this file knows about.  They are
   stored in known_tags[].tagid and tag_url_attributes[].tagid, and
   are passed to the tag handlers as their first argument.  */
enum {
  TAG_A,
  TAG_APPLET,
  TAG_AREA,
  TAG_BASE,
  TAG_BGSOUND,
  TAG_BODY,
  TAG_EMBED,
  TAG_FIG,
  TAG_FORM,
  TAG_FRAME,
  TAG_IFRAME,
  TAG_IMG,
  TAG_INPUT,
  TAG_LAYER,
  TAG_LINK,
  TAG_META,
  TAG_OBJECT,
  TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE,
  TAG_TD,
  TAG_TH
};
  84
/* The list of known tags and functions used for handling them.  Most
   tags are simply harvested for URLs.

   init_interesting enters every element into interesting_tags under
   its NAME; collect_tags_mapper later looks the element up by tag
   name and calls its HANDLER with its TAGID.  */
static struct known_tag {
  int tagid;                    /* TAG_* constant passed to the handler */
  const char *name;             /* tag name, used as the hash table key */
  tag_handler_t handler;        /* function invoked for this tag */
} known_tags[] = {
  { TAG_A,       "a",           tag_find_urls },
  { TAG_APPLET,  "applet",      tag_find_urls },
  { TAG_AREA,    "area",        tag_find_urls },
  { TAG_BASE,    "base",        tag_handle_base },
  { TAG_BGSOUND, "bgsound",     tag_find_urls },
  { TAG_BODY,    "body",        tag_find_urls },
  { TAG_EMBED,   "embed",       tag_find_urls },
  { TAG_FIG,     "fig",         tag_find_urls },
  { TAG_FORM,    "form",        tag_handle_form },
  { TAG_FRAME,   "frame",       tag_find_urls },
  { TAG_IFRAME,  "iframe",      tag_find_urls },
  { TAG_IMG,     "img",         tag_find_urls },
  { TAG_INPUT,   "input",       tag_find_urls },
  { TAG_LAYER,   "layer",       tag_find_urls },
  { TAG_LINK,    "link",        tag_handle_link },
  { TAG_META,    "meta",        tag_handle_meta },
  { TAG_OBJECT,  "object",      tag_find_urls },
  { TAG_OVERLAY, "overlay",     tag_find_urls },
  { TAG_SCRIPT,  "script",      tag_find_urls },
  { TAG_TABLE,   "table",       tag_find_urls },
  { TAG_TD,      "td",          tag_find_urls },
  { TAG_TH,      "th",          tag_find_urls }
};
 115
/* tag_url_attributes documents which attributes of which tags contain
   URLs to harvest.  It is used by tag_find_urls.  */

/* Defines for the FLAGS. */

/* The link is "inline", i.e. needs to be retrieved for this document
   to be correctly rendered.  Inline links include inlined images,
   stylesheets, children frames, etc.  */
#define ATTR_INLINE     1

/* The link is expected to yield HTML contents.  It's important not to
   try to follow HTML obtained by following e.g. <img src="...">
   regardless of content-type.  Doing this causes infinite loops for
   "images" that return non-404 error pages with links to the same
   image.  */
#define ATTR_HTML       2

/* For tags handled by tag_find_urls: attributes that contain URLs to
   download.

   All entries for one tag must be adjacent: tag_find_urls locates the
   first entry whose tagid matches and then walks forward only while
   the tagid stays the same.  Every tag dispatched to tag_find_urls
   needs at least one entry here, since tag_find_urls asserts that it
   finds one.  */
static struct {
  int tagid;                    /* TAG_* constant of the owning tag */
  const char *attr_name;        /* attribute whose value is a URL */
  int flags;                    /* bitwise OR of ATTR_INLINE / ATTR_HTML */
} tag_url_attributes[] = {
  { TAG_A,              "href",         ATTR_HTML },
  { TAG_APPLET,         "code",         ATTR_INLINE },
  { TAG_AREA,           "href",         ATTR_HTML },
  { TAG_BGSOUND,        "src",          ATTR_INLINE },
  { TAG_BODY,           "background",   ATTR_INLINE },
  { TAG_EMBED,          "href",         ATTR_HTML },
  { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_FIG,            "src",          ATTR_INLINE },
  { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_IMG,            "href",         ATTR_INLINE },
  { TAG_IMG,            "lowsrc",       ATTR_INLINE },
  { TAG_IMG,            "src",          ATTR_INLINE },
  { TAG_INPUT,          "src",          ATTR_INLINE },
  { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_OBJECT,         "data",         ATTR_INLINE },
  { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
  { TAG_SCRIPT,         "src",          ATTR_INLINE },
  { TAG_TABLE,          "background",   ATTR_INLINE },
  { TAG_TD,             "background",   ATTR_INLINE },
  { TAG_TH,             "background",   ATTR_INLINE }
};
 162
/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, some places in the code refer
   to the attributes not mentioned here.  We add them manually.

   init_interesting enters each of these names into
   interesting_attributes alongside the attribute names taken from
   tag_url_attributes.  */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link */
  "http-equiv",                 /* used by tag_handle_meta */
  "name",                       /* used by tag_handle_meta */
  "content",                    /* used by tag_handle_meta */
  "action"                      /* used by tag_handle_form */
};
 173
/* Case-insensitive tables built once by init_interesting and passed
   to map_html_tags.  interesting_tags maps a tag name to its
   struct known_tag entry; interesting_attributes maps an attribute
   name to the dummy value "1".  Both are NULL until initialized.  */
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;
 176
 177 static void
 178 init_interesting (void)
 179 {
 180   /* Init the variables interesting_tags and interesting_attributes
 181      that are used by the HTML parser to know which tags and
 182      attributes we're interested in.  We initialize this only once,
 183      for performance reasons.
 184
 185      Here we also make sure that what we put in interesting_tags
 186      matches the user's preferences as specified through --ignore-tags
 187      and --follow-tags.  */
 188
 189   size_t i;
 190   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 191
 192   /* First, add all the tags we know hot to handle, mapped to their
 193      respective entries in known_tags.  */
 194   for (i = 0; i < countof (known_tags); i++)
 195     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 196
 197   /* Then remove the tags ignored through --ignore-tags.  */
 198   if (opt.ignore_tags)
 199     {
 200       char **ignored;
 201       for (ignored = opt.ignore_tags; *ignored; ignored++)
 202         hash_table_remove (interesting_tags, *ignored);
 203     }
 204
 205   /* If --follow-tags is specified, use only those tags.  */
 206   if (opt.follow_tags)
 207     {
 208       /* Create a new table intersecting --follow-tags and known_tags,
 209          and use it as interesting_tags.  */
 210       struct hash_table *intersect = make_nocase_string_hash_table (0);
 211       char **followed;
 212       for (followed = opt.follow_tags; *followed; followed++)
 213         {
 214           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 215           if (!t)
 216             continue;           /* ignore unknown --follow-tags entries. */
 217           hash_table_put (intersect, *followed, t);
 218         }
 219       hash_table_destroy (interesting_tags);
 220       interesting_tags = intersect;
 221     }
 222
 223   /* Add the attributes we care about. */
 224   interesting_attributes = make_nocase_string_hash_table (10);
 225   for (i = 0; i < countof (additional_attributes); i++)
 226     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 227   for (i = 0; i < countof (tag_url_attributes); i++)
 228     hash_table_put (interesting_attributes,
 229                     tag_url_attributes[i].attr_name, "1");
 230 }
 231
 232 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 233    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 234    index of the attribute in TAG will be stored there.  */
 235
 236 static char *
 237 find_attr (struct taginfo *tag, const char *name, int *attrind)
 238 {
 239   int i;
 240   for (i = 0; i < tag->nattrs; i++)
 241     if (!strcasecmp (tag->attrs[i].name, name))
 242       {
 243         if (attrind)
 244           *attrind = i;
 245         return tag->attrs[i].value;
 246       }
 247   return NULL;
 248 }
 249
/* State shared by the tag handlers while one document is scanned.
   get_urls_html fills it in, hands it to map_html_tags, and returns
   HEAD as the resulting URL list.  */
struct map_context {
  char *text;                   /* HTML text. */
  char *base;                   /* Base URI of the document, possibly
                                   changed through <base href=...>. */
  const char *parent_base;      /* Base of the current document. */
  const char *document_file;    /* File name of this document. */
  bool nofollow;                /* whether NOFOLLOW was specified in a
                                   <meta name=robots> tag. */

  struct urlpos *head, *tail;   /* List of URLs that is being
                                   built. */
};
 262
 263 /* Append LINK_URI to the urlpos structure that is being built.
 264
 265    LINK_URI will be merged with the current document base.  TAG and
 266    ATTRIND are the necessary context to store the position and
 267    size.  */
 268
 269 static struct urlpos *
 270 append_url (const char *link_uri,
 271             struct taginfo *tag, int attrind, struct map_context *ctx)
 272 {
 273   int link_has_scheme = url_has_scheme (link_uri);
 274   struct urlpos *newel;
 275   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 276   struct url *url;
 277
 278   if (!base)
 279     {
 280       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 281                ctx->document_file, link_uri));
 282
 283       if (!link_has_scheme)
 284         {
 285           /* Base URL is unavailable, and the link does not have a
 286              location attached to it -- we have to give up.  Since
 287              this can only happen when using `--force-html -i', print
 288              a warning.  */
 289           logprintf (LOG_NOTQUIET,
 290                      _("%s: Cannot resolve incomplete link %s.\n"),
 291                      ctx->document_file, link_uri);
 292           return NULL;
 293         }
 294
 295       url = url_parse (link_uri, NULL);
 296       if (!url)
 297         {
 298           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 299                    ctx->document_file, link_uri));
 300           return NULL;
 301         }
 302     }
 303   else
 304     {
 305       /* Merge BASE with LINK_URI, but also make sure the result is
 306          canonicalized, i.e. that "../" have been resolved.
 307          (parse_url will do that for us.) */
 308
 309       char *complete_uri = uri_merge (base, link_uri);
 310
 311       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 312                ctx->document_file, base, link_uri, complete_uri));
 313
 314       url = url_parse (complete_uri, NULL);
 315       if (!url)
 316         {
 317           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 318                    ctx->document_file, complete_uri));
 319           xfree (complete_uri);
 320           return NULL;
 321         }
 322       xfree (complete_uri);
 323     }
 324
 325   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 326
 327   newel = xnew0 (struct urlpos);
 328   newel->url = url;
 329   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 330   newel->size = tag->attrs[attrind].value_raw_size;
 331
 332   /* A URL is relative if the host is not named, and the name does not
 333      start with `/'.  */
 334   if (!link_has_scheme && *link_uri != '/')
 335     newel->link_relative_p = 1;
 336   else if (link_has_scheme)
 337     newel->link_complete_p = 1;
 338
 339   if (ctx->tail)
 340     {
 341       ctx->tail->next = newel;
 342       ctx->tail = newel;
 343     }
 344   else
 345     ctx->tail = ctx->head = newel;
 346
 347   return newel;
 348 }
 349 \f
 350 /* All the tag_* functions are called from collect_tags_mapper, as
 351    specified by KNOWN_TAGS.  */
 352
 353 /* Default tag handler: collect URLs from attributes specified for
 354    this tag by tag_url_attributes.  */
 355
 356 static void
 357 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 358 {
 359   size_t i;
 360   int attrind;
 361   int first = -1;
 362
 363   for (i = 0; i < countof (tag_url_attributes); i++)
 364     if (tag_url_attributes[i].tagid == tagid)
 365       {
 366         /* We've found the index of tag_url_attributes where the
 367            attributes of our tag begin.  */
 368         first = i;
 369         break;
 370       }
 371   assert (first != -1);
 372
 373   /* Loop over the "interesting" attributes of this tag.  In this
 374      example, it will loop over "src" and "lowsrc".
 375
 376        <img src="foo.png" lowsrc="bar.png">
 377
 378      This has to be done in the outer loop so that the attributes are
 379      processed in the same order in which they appear in the page.
 380      This is required when converting links.  */
 381
 382   for (attrind = 0; attrind < tag->nattrs; attrind++)
 383     {
 384       /* Find whether TAG/ATTRIND is a combination that contains a
 385          URL. */
 386       char *link = tag->attrs[attrind].value;
 387       const size_t size = countof (tag_url_attributes);
 388
 389       /* If you're cringing at the inefficiency of the nested loops,
 390          remember that they both iterate over a very small number of
 391          items.  The worst-case inner loop is for the IMG tag, which
 392          has three attributes.  */
 393       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 394         {
 395           if (0 == strcasecmp (tag->attrs[attrind].name,
 396                                tag_url_attributes[i].attr_name))
 397             {
 398               struct urlpos *up = append_url (link, tag, attrind, ctx);
 399               if (up)
 400                 {
 401                   int flags = tag_url_attributes[i].flags;
 402                   if (flags & ATTR_INLINE)
 403                     up->link_inline_p = 1;
 404                   if (flags & ATTR_HTML)
 405                     up->link_expect_html = 1;
 406                 }
 407             }
 408         }
 409     }
 410 }
 411
 412 /* Handle the BASE tag, for <base href=...>. */
 413
 414 static void
 415 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 416 {
 417   struct urlpos *base_urlpos;
 418   int attrind;
 419   char *newbase = find_attr (tag, "href", &attrind);
 420   if (!newbase)
 421     return;
 422
 423   base_urlpos = append_url (newbase, tag, attrind, ctx);
 424   if (!base_urlpos)
 425     return;
 426   base_urlpos->ignore_when_downloading = 1;
 427   base_urlpos->link_base_p = 1;
 428
 429   if (ctx->base)
 430     xfree (ctx->base);
 431   if (ctx->parent_base)
 432     ctx->base = uri_merge (ctx->parent_base, newbase);
 433   else
 434     ctx->base = xstrdup (newbase);
 435 }
 436
 437 /* Mark the URL found in <form action=...> for conversion. */
 438
 439 static void
 440 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 441 {
 442   int attrind;
 443   char *action = find_attr (tag, "action", &attrind);
 444   if (action)
 445     {
 446       struct urlpos *up = append_url (action, tag, attrind, ctx);
 447       if (up)
 448         up->ignore_when_downloading = 1;
 449     }
 450 }
 451
 452 /* Handle the LINK tag.  It requires special handling because how its
 453    links will be followed in -p mode depends on the REL attribute.  */
 454
 455 static void
 456 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 457 {
 458   int attrind;
 459   char *href = find_attr (tag, "href", &attrind);
 460
 461   /* All <link href="..."> link references are external, except those
 462      known not to be, such as style sheet and shortcut icon:
 463
 464        <link rel="stylesheet" href="...">
 465        <link rel="shortcut icon" href="...">
 466   */
 467   if (href)
 468     {
 469       struct urlpos *up = append_url (href, tag, attrind, ctx);
 470       if (up)
 471         {
 472           char *rel = find_attr (tag, "rel", NULL);
 473           if (rel
 474               && (0 == strcasecmp (rel, "stylesheet")
 475                   || 0 == strcasecmp (rel, "shortcut icon")))
 476             up->link_inline_p = 1;
 477           else
 478             /* The external ones usually point to HTML pages, such as
 479                <link rel="next" href="..."> */
 480             up->link_expect_html = 1;
 481         }
 482     }
 483 }
 484
 485 /* Handle the META tag.  This requires special handling because of the
 486    refresh feature and because of robot exclusion.  */
 487
 488 static void
 489 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 490 {
 491   char *name = find_attr (tag, "name", NULL);
 492   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 493
 494   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 495     {
 496       /* Some pages use a META tag to specify that the page be
 497          refreshed by a new page after a given number of seconds.  The
 498          general format for this is:
 499
 500            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 501
 502          So we just need to skip past the "NUMBER; URL=" garbage to
 503          get to the URL.  */
 504
 505       struct urlpos *entry;
 506       int attrind;
 507       int timeout = 0;
 508       char *p;
 509
 510       char *refresh = find_attr (tag, "content", &attrind);
 511       if (!refresh)
 512         return;
 513
 514       for (p = refresh; c_isdigit (*p); p++)
 515         timeout = 10 * timeout + *p - '0';
 516       if (*p++ != ';')
 517         return;
 518
 519       while (c_isspace (*p))
 520         ++p;
 521       if (!(   c_toupper (*p)       == 'U'
 522             && c_toupper (*(p + 1)) == 'R'
 523             && c_toupper (*(p + 2)) == 'L'
 524             &&          *(p + 3)  == '='))
 525         return;
 526       p += 4;
 527       while (c_isspace (*p))
 528         ++p;
 529
 530       entry = append_url (p, tag, attrind, ctx);
 531       if (entry)
 532         {
 533           entry->link_refresh_p = 1;
 534           entry->refresh_timeout = timeout;
 535           entry->link_expect_html = 1;
 536         }
 537     }
 538   else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
 539     {
 540       /* Handle stuff like:
 541          <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
 542
 543       char *mcharset;
 544       char *content = find_attr (tag, "content", NULL);
 545       if (!content)
 546         return;
 547
 548       mcharset = parse_charset (content);
 549       if (!mcharset)
 550         return;
 551
 552       logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));
 553
 554       /* sXXXav: Not used yet */
 555       xfree (mcharset);
 556     }
 557   else if (name && 0 == strcasecmp (name, "robots"))
 558     {
 559       /* Handle stuff like:
 560          <meta name="robots" content="index,nofollow"> */
 561       char *content = find_attr (tag, "content", NULL);
 562       if (!content)
 563         return;
 564       if (!strcasecmp (content, "none"))
 565         ctx->nofollow = true;
 566       else
 567         {
 568           while (*content)
 569             {
 570               /* Find the next occurrence of ',' or the end of
 571                  the string.  */
 572               char *end = strchr (content, ',');
 573               if (end)
 574                 ++end;
 575               else
 576                 end = content + strlen (content);
 577               if (!strncasecmp (content, "nofollow", end - content))
 578                 ctx->nofollow = true;
 579               content = end;
 580             }
 581         }
 582     }
 583 }
 584
 585 /* Dispatch the tag handler appropriate for the tag we're mapping
 586    over.  See known_tags[] for definition of tag handlers.  */
 587
 588 static void
 589 collect_tags_mapper (struct taginfo *tag, void *arg)
 590 {
 591   struct map_context *ctx = (struct map_context *)arg;
 592
 593   /* Find the tag in our table of tags.  This must not fail because
 594      map_html_tags only returns tags found in interesting_tags.  */
 595   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 596   assert (t != NULL);
 597
 598   t->handler (t->tagid, tag, ctx);
 599 }
 600 \f
 601 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 602    it.  It merges relative links in FILE with URL.  It is aware of
 603    <base href=...> and does the right thing.  */
 604
 605 struct urlpos *
 606 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
 607 {
 608   struct file_memory *fm;
 609   struct map_context ctx;
 610   int flags;
 611
 612   /* Load the file. */
 613   fm = read_file (file);
 614   if (!fm)
 615     {
 616       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 617       return NULL;
 618     }
 619   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 620
 621   ctx.text = fm->content;
 622   ctx.head = ctx.tail = NULL;
 623   ctx.base = NULL;
 624   ctx.parent_base = url ? url : opt.base_href;
 625   ctx.document_file = file;
 626   ctx.nofollow = false;
 627
 628   if (!interesting_tags)
 629     init_interesting ();
 630
 631   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 632      generate <a href=" foo"> instead of <a href="foo"> (browsers
 633      ignore spaces as well.)  If you really mean space, use &32; or
 634      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 635      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 636      ignored by IE and Mozilla and are presumably introduced by
 637      writing HTML with editors that force word wrap.  */
 638   flags = MHT_TRIM_VALUES;
 639   if (opt.strict_comments)
 640     flags |= MHT_STRICT_COMMENTS;
 641
 642   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 643                  interesting_tags, interesting_attributes);
 644
 645   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 646   if (meta_disallow_follow)
 647     *meta_disallow_follow = ctx.nofollow;
 648
 649   xfree_null (ctx.base);
 650   read_file_free (fm);
 651   return ctx.head;
 652 }
 653
 654 /* This doesn't really have anything to do with HTML, but it's similar
 655    to get_urls_html, so we put it here.  */
 656
 657 struct urlpos *
 658 get_urls_file (const char *file)
 659 {
 660   struct file_memory *fm;
 661   struct urlpos *head, *tail;
 662   const char *text, *text_end;
 663
 664   /* Load the file.  */
 665   fm = read_file (file);
 666   if (!fm)
 667     {
 668       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 669       return NULL;
 670     }
 671   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 672
 673   head = tail = NULL;
 674   text = fm->content;
 675   text_end = fm->content + fm->length;
 676   while (text < text_end)
 677     {
 678       int up_error_code;
 679       char *url_text;
 680       struct urlpos *entry;
 681       struct url *url;
 682
 683       const char *line_beg = text;
 684       const char *line_end = memchr (text, '\n', text_end - text);
 685       if (!line_end)
 686         line_end = text_end;
 687       else
 688         ++line_end;
 689       text = line_end;
 690
 691       /* Strip whitespace from the beginning and end of line. */
 692       while (line_beg < line_end && c_isspace (*line_beg))
 693         ++line_beg;
 694       while (line_end > line_beg && c_isspace (*(line_end - 1)))
 695         --line_end;
 696
 697       if (line_beg == line_end)
 698         continue;
 699
 700       /* The URL is in the [line_beg, line_end) region. */
 701
 702       /* We must copy the URL to a zero-terminated string, and we
 703          can't use alloca because we're in a loop.  *sigh*.  */
 704       url_text = strdupdelim (line_beg, line_end);
 705
 706       if (opt.base_href)
 707         {
 708           /* Merge opt.base_href with URL. */
 709           char *merged = uri_merge (opt.base_href, url_text);
 710           xfree (url_text);
 711           url_text = merged;
 712         }
 713
 714       url = url_parse (url_text, &up_error_code);
 715       if (!url)
 716         {
 717           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 718                      file, url_text, url_error (up_error_code));
 719           xfree (url_text);
 720           continue;
 721         }
 722       xfree (url_text);
 723
 724       entry = xnew0 (struct urlpos);
 725       entry->url = url;
 726
 727       if (!head)
 728         head = entry;
 729       else
 730         tail->next = entry;
 731       tail = entry;
 732     }
 733   read_file_free (fm);
 734   return head;
 735 }
 736
 737 void
 738 cleanup_html_url (void)
 739 {
 740   /* Destroy the hash tables.  The hash table keys and values are not
 741      allocated by this code, so we don't need to free them here.  */
 742   if (interesting_tags)
 743     hash_table_destroy (interesting_tags);
 744   if (interesting_attributes)
 745     hash_table_destroy (interesting_attributes);
 746 }