sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <string.h>
  35 #include <stdlib.h>
  36 #include <errno.h>
  37 #include <assert.h>
  38
  39 #include "html-parse.h"
  40 #include "url.h"
  41 #include "utils.h"
  42 #include "hash.h"
  43 #include "convert.h"
  44 #include "recur.h"              /* declaration of get_urls_html */
  45 #include "iri.h"
  46
  47 struct map_context;
  48
  49 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
  50
  51 #define DECLARE_TAG_HANDLER(fun)                                \
  52   static void fun (int, struct taginfo *, struct map_context *)
  53
  54 DECLARE_TAG_HANDLER (tag_find_urls);
  55 DECLARE_TAG_HANDLER (tag_handle_base);
  56 DECLARE_TAG_HANDLER (tag_handle_form);
  57 DECLARE_TAG_HANDLER (tag_handle_link);
  58 DECLARE_TAG_HANDLER (tag_handle_meta);
  59
  60 enum {
  61   TAG_A,
  62   TAG_APPLET,
  63   TAG_AREA,
  64   TAG_BASE,
  65   TAG_BGSOUND,
  66   TAG_BODY,
  67   TAG_EMBED,
  68   TAG_FIG,
  69   TAG_FORM,
  70   TAG_FRAME,
  71   TAG_IFRAME,
  72   TAG_IMG,
  73   TAG_INPUT,
  74   TAG_LAYER,
  75   TAG_LINK,
  76   TAG_META,
  77   TAG_OBJECT,
  78   TAG_OVERLAY,
  79   TAG_SCRIPT,
  80   TAG_TABLE,
  81   TAG_TD,
  82   TAG_TH
  83 };
  84
  85 /* The list of known tags and functions used for handling them.  Most
  86    tags are simply harvested for URLs. */
  87 static struct known_tag {
  88   int tagid;
  89   const char *name;
  90   tag_handler_t handler;
  91 } known_tags[] = {
  92   { TAG_A,       "a",           tag_find_urls },
  93   { TAG_APPLET,  "applet",      tag_find_urls },
  94   { TAG_AREA,    "area",        tag_find_urls },
  95   { TAG_BASE,    "base",        tag_handle_base },
  96   { TAG_BGSOUND, "bgsound",     tag_find_urls },
  97   { TAG_BODY,    "body",        tag_find_urls },
  98   { TAG_EMBED,   "embed",       tag_find_urls },
  99   { TAG_FIG,     "fig",         tag_find_urls },
 100   { TAG_FORM,    "form",        tag_handle_form },
 101   { TAG_FRAME,   "frame",       tag_find_urls },
 102   { TAG_IFRAME,  "iframe",      tag_find_urls },
 103   { TAG_IMG,     "img",         tag_find_urls },
 104   { TAG_INPUT,   "input",       tag_find_urls },
 105   { TAG_LAYER,   "layer",       tag_find_urls },
 106   { TAG_LINK,    "link",        tag_handle_link },
 107   { TAG_META,    "meta",        tag_handle_meta },
 108   { TAG_OBJECT,  "object",      tag_find_urls },
 109   { TAG_OVERLAY, "overlay",     tag_find_urls },
 110   { TAG_SCRIPT,  "script",      tag_find_urls },
 111   { TAG_TABLE,   "table",       tag_find_urls },
 112   { TAG_TD,      "td",          tag_find_urls },
 113   { TAG_TH,      "th",          tag_find_urls }
 114 };
 115
 116 /* tag_url_attributes documents which attributes of which tags contain
 117    URLs to harvest.  It is used by tag_find_urls.  */
 118
 119 /* Defines for the FLAGS. */
 120
 121 /* The link is "inline", i.e. needs to be retrieved for this document
 122    to be correctly rendered.  Inline links include inlined images,
 123    stylesheets, children frames, etc.  */
 124 #define ATTR_INLINE     1
 125
 126 /* The link is expected to yield HTML contents.  It's important not to
 127    try to follow HTML obtained by following e.g. <img src="...">
 128    regardless of content-type.  Doing this causes infinite loops for
 129    "images" that return non-404 error pages with links to the same
 130    image.  */
 131 #define ATTR_HTML       2
 132
 133 /* For tags handled by tag_find_urls: attributes that contain URLs to
 134    download. */
 135 static struct {
 136   int tagid;
 137   const char *attr_name;
 138   int flags;
 139 } tag_url_attributes[] = {
 140   { TAG_A,              "href",         ATTR_HTML },
 141   { TAG_APPLET,         "code",         ATTR_INLINE },
 142   { TAG_AREA,           "href",         ATTR_HTML },
 143   { TAG_BGSOUND,        "src",          ATTR_INLINE },
 144   { TAG_BODY,           "background",   ATTR_INLINE },
 145   { TAG_EMBED,          "href",         ATTR_HTML },
 146   { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
 147   { TAG_FIG,            "src",          ATTR_INLINE },
 148   { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
 149   { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
 150   { TAG_IMG,            "href",         ATTR_INLINE },
 151   { TAG_IMG,            "lowsrc",       ATTR_INLINE },
 152   { TAG_IMG,            "src",          ATTR_INLINE },
 153   { TAG_INPUT,          "src",          ATTR_INLINE },
 154   { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
 155   { TAG_OBJECT,         "data",         ATTR_INLINE },
 156   { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
 157   { TAG_SCRIPT,         "src",          ATTR_INLINE },
 158   { TAG_TABLE,          "background",   ATTR_INLINE },
 159   { TAG_TD,             "background",   ATTR_INLINE },
 160   { TAG_TH,             "background",   ATTR_INLINE }
 161 };
 162
 163 /* The lists of interesting tags and attributes are built dynamically,
 164    from the information above.  However, some places in the code refer
 165    to the attributes not mentioned here.  We add them manually.  */
 166 static const char *additional_attributes[] = {
 167   "rel",                        /* used by tag_handle_link */
 168   "http-equiv",                 /* used by tag_handle_meta */
 169   "name",                       /* used by tag_handle_meta */
 170   "content",                    /* used by tag_handle_meta */
 171   "action"                      /* used by tag_handle_form */
 172 };
 173
 174 static struct hash_table *interesting_tags;
 175 static struct hash_table *interesting_attributes;
 176
 177 static void
 178 init_interesting (void)
 179 {
 180   /* Init the variables interesting_tags and interesting_attributes
 181      that are used by the HTML parser to know which tags and
 182      attributes we're interested in.  We initialize this only once,
 183      for performance reasons.
 184
 185      Here we also make sure that what we put in interesting_tags
 186      matches the user's preferences as specified through --ignore-tags
 187      and --follow-tags.  */
 188
 189   size_t i;
 190   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 191
 192   /* First, add all the tags we know hot to handle, mapped to their
 193      respective entries in known_tags.  */
 194   for (i = 0; i < countof (known_tags); i++)
 195     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 196
 197   /* Then remove the tags ignored through --ignore-tags.  */
 198   if (opt.ignore_tags)
 199     {
 200       char **ignored;
 201       for (ignored = opt.ignore_tags; *ignored; ignored++)
 202         hash_table_remove (interesting_tags, *ignored);
 203     }
 204
 205   /* If --follow-tags is specified, use only those tags.  */
 206   if (opt.follow_tags)
 207     {
 208       /* Create a new table intersecting --follow-tags and known_tags,
 209          and use it as interesting_tags.  */
 210       struct hash_table *intersect = make_nocase_string_hash_table (0);
 211       char **followed;
 212       for (followed = opt.follow_tags; *followed; followed++)
 213         {
 214           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 215           if (!t)
 216             continue;           /* ignore unknown --follow-tags entries. */
 217           hash_table_put (intersect, *followed, t);
 218         }
 219       hash_table_destroy (interesting_tags);
 220       interesting_tags = intersect;
 221     }
 222
 223   /* Add the attributes we care about. */
 224   interesting_attributes = make_nocase_string_hash_table (10);
 225   for (i = 0; i < countof (additional_attributes); i++)
 226     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 227   for (i = 0; i < countof (tag_url_attributes); i++)
 228     hash_table_put (interesting_attributes,
 229                     tag_url_attributes[i].attr_name, "1");
 230 }
 231
 232 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 233    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 234    index of the attribute in TAG will be stored there.  */
 235
 236 static char *
 237 find_attr (struct taginfo *tag, const char *name, int *attrind)
 238 {
 239   int i;
 240   for (i = 0; i < tag->nattrs; i++)
 241     if (!strcasecmp (tag->attrs[i].name, name))
 242       {
 243         if (attrind)
 244           *attrind = i;
 245         return tag->attrs[i].value;
 246       }
 247   return NULL;
 248 }
 249
 250 struct map_context {
 251   char *text;                   /* HTML text. */
 252   char *base;                   /* Base URI of the document, possibly
 253                                    changed through <base href=...>. */
 254   const char *parent_base;      /* Base of the current document. */
 255   const char *document_file;    /* File name of this document. */
 256   bool nofollow;                /* whether NOFOLLOW was specified in a
 257                                    <meta name=robots> tag. */
 258
 259   struct urlpos *head, *tail;   /* List of URLs that is being
 260                                    built. */
 261 };
 262
 263 /* Append LINK_URI to the urlpos structure that is being built.
 264
 265    LINK_URI will be merged with the current document base.  TAG and
 266    ATTRIND are the necessary context to store the position and
 267    size.  */
 268
 269 static struct urlpos *
 270 append_url (const char *link_uri,
 271             struct taginfo *tag, int attrind, struct map_context *ctx)
 272 {
 273   int link_has_scheme = url_has_scheme (link_uri);
 274   struct urlpos *newel;
 275   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 276   struct url *url;
 277
 278   if (!base)
 279     {
 280       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 281                ctx->document_file, link_uri));
 282
 283       if (!link_has_scheme)
 284         {
 285           /* Base URL is unavailable, and the link does not have a
 286              location attached to it -- we have to give up.  Since
 287              this can only happen when using `--force-html -i', print
 288              a warning.  */
 289           logprintf (LOG_NOTQUIET,
 290                      _("%s: Cannot resolve incomplete link %s.\n"),
 291                      ctx->document_file, link_uri);
 292           return NULL;
 293         }
 294
 295       set_ugly_no_encode (true);
 296       url = url_parse (link_uri, NULL);
 297       set_ugly_no_encode (false);
 298       if (!url)
 299         {
 300           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 301                    ctx->document_file, link_uri));
 302           return NULL;
 303         }
 304     }
 305   else
 306     {
 307       /* Merge BASE with LINK_URI, but also make sure the result is
 308          canonicalized, i.e. that "../" have been resolved.
 309          (parse_url will do that for us.) */
 310
 311       char *complete_uri = uri_merge (base, link_uri);
 312
 313       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 314                ctx->document_file, base, link_uri, complete_uri));
 315
 316       set_ugly_no_encode (true);
 317       url = url_parse (complete_uri, NULL);
 318       set_ugly_no_encode (false);
 319       if (!url)
 320         {
 321           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 322                    ctx->document_file, complete_uri));
 323           xfree (complete_uri);
 324           return NULL;
 325         }
 326       xfree (complete_uri);
 327     }
 328
 329   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 330
 331   newel = xnew0 (struct urlpos);
 332   newel->url = url;
 333   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 334   newel->size = tag->attrs[attrind].value_raw_size;
 335
 336   /* A URL is relative if the host is not named, and the name does not
 337      start with `/'.  */
 338   if (!link_has_scheme && *link_uri != '/')
 339     newel->link_relative_p = 1;
 340   else if (link_has_scheme)
 341     newel->link_complete_p = 1;
 342
 343   if (ctx->tail)
 344     {
 345       ctx->tail->next = newel;
 346       ctx->tail = newel;
 347     }
 348   else
 349     ctx->tail = ctx->head = newel;
 350
 351   return newel;
 352 }
 353 \f
 354 /* All the tag_* functions are called from collect_tags_mapper, as
 355    specified by KNOWN_TAGS.  */
 356
 357 /* Default tag handler: collect URLs from attributes specified for
 358    this tag by tag_url_attributes.  */
 359
 360 static void
 361 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 362 {
 363   size_t i;
 364   int attrind;
 365   int first = -1;
 366
 367   for (i = 0; i < countof (tag_url_attributes); i++)
 368     if (tag_url_attributes[i].tagid == tagid)
 369       {
 370         /* We've found the index of tag_url_attributes where the
 371            attributes of our tag begin.  */
 372         first = i;
 373         break;
 374       }
 375   assert (first != -1);
 376
 377   /* Loop over the "interesting" attributes of this tag.  In this
 378      example, it will loop over "src" and "lowsrc".
 379
 380        <img src="foo.png" lowsrc="bar.png">
 381
 382      This has to be done in the outer loop so that the attributes are
 383      processed in the same order in which they appear in the page.
 384      This is required when converting links.  */
 385
 386   for (attrind = 0; attrind < tag->nattrs; attrind++)
 387     {
 388       /* Find whether TAG/ATTRIND is a combination that contains a
 389          URL. */
 390       char *link = tag->attrs[attrind].value;
 391       const size_t size = countof (tag_url_attributes);
 392
 393       /* If you're cringing at the inefficiency of the nested loops,
 394          remember that they both iterate over a very small number of
 395          items.  The worst-case inner loop is for the IMG tag, which
 396          has three attributes.  */
 397       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 398         {
 399           if (0 == strcasecmp (tag->attrs[attrind].name,
 400                                tag_url_attributes[i].attr_name))
 401             {
 402               struct urlpos *up = append_url (link, tag, attrind, ctx);
 403               if (up)
 404                 {
 405                   int flags = tag_url_attributes[i].flags;
 406                   if (flags & ATTR_INLINE)
 407                     up->link_inline_p = 1;
 408                   if (flags & ATTR_HTML)
 409                     up->link_expect_html = 1;
 410                 }
 411             }
 412         }
 413     }
 414 }
 415
 416 /* Handle the BASE tag, for <base href=...>. */
 417
 418 static void
 419 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 420 {
 421   struct urlpos *base_urlpos;
 422   int attrind;
 423   char *newbase = find_attr (tag, "href", &attrind);
 424   if (!newbase)
 425     return;
 426
 427   base_urlpos = append_url (newbase, tag, attrind, ctx);
 428   if (!base_urlpos)
 429     return;
 430   base_urlpos->ignore_when_downloading = 1;
 431   base_urlpos->link_base_p = 1;
 432
 433   if (ctx->base)
 434     xfree (ctx->base);
 435   if (ctx->parent_base)
 436     ctx->base = uri_merge (ctx->parent_base, newbase);
 437   else
 438     ctx->base = xstrdup (newbase);
 439 }
 440
 441 /* Mark the URL found in <form action=...> for conversion. */
 442
 443 static void
 444 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 445 {
 446   int attrind;
 447   char *action = find_attr (tag, "action", &attrind);
 448   if (action)
 449     {
 450       struct urlpos *up = append_url (action, tag, attrind, ctx);
 451       if (up)
 452         up->ignore_when_downloading = 1;
 453     }
 454 }
 455
 456 /* Handle the LINK tag.  It requires special handling because how its
 457    links will be followed in -p mode depends on the REL attribute.  */
 458
 459 static void
 460 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 461 {
 462   int attrind;
 463   char *href = find_attr (tag, "href", &attrind);
 464
 465   /* All <link href="..."> link references are external, except those
 466      known not to be, such as style sheet and shortcut icon:
 467
 468        <link rel="stylesheet" href="...">
 469        <link rel="shortcut icon" href="...">
 470   */
 471   if (href)
 472     {
 473       struct urlpos *up = append_url (href, tag, attrind, ctx);
 474       if (up)
 475         {
 476           char *rel = find_attr (tag, "rel", NULL);
 477           if (rel
 478               && (0 == strcasecmp (rel, "stylesheet")
 479                   || 0 == strcasecmp (rel, "shortcut icon")))
 480             up->link_inline_p = 1;
 481           else
 482             /* The external ones usually point to HTML pages, such as
 483                <link rel="next" href="..."> */
 484             up->link_expect_html = 1;
 485         }
 486     }
 487 }
 488
 489 /* Handle the META tag.  This requires special handling because of the
 490    refresh feature and because of robot exclusion.  */
 491
 492 static void
 493 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 494 {
 495   char *name = find_attr (tag, "name", NULL);
 496   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 497
 498   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 499     {
 500       /* Some pages use a META tag to specify that the page be
 501          refreshed by a new page after a given number of seconds.  The
 502          general format for this is:
 503
 504            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 505
 506          So we just need to skip past the "NUMBER; URL=" garbage to
 507          get to the URL.  */
 508
 509       struct urlpos *entry;
 510       int attrind;
 511       int timeout = 0;
 512       char *p;
 513
 514       char *refresh = find_attr (tag, "content", &attrind);
 515       if (!refresh)
 516         return;
 517
 518       for (p = refresh; c_isdigit (*p); p++)
 519         timeout = 10 * timeout + *p - '0';
 520       if (*p++ != ';')
 521         return;
 522
 523       while (c_isspace (*p))
 524         ++p;
 525       if (!(   c_toupper (*p)       == 'U'
 526             && c_toupper (*(p + 1)) == 'R'
 527             && c_toupper (*(p + 2)) == 'L'
 528             &&          *(p + 3)  == '='))
 529         return;
 530       p += 4;
 531       while (c_isspace (*p))
 532         ++p;
 533
 534       entry = append_url (p, tag, attrind, ctx);
 535       if (entry)
 536         {
 537           entry->link_refresh_p = 1;
 538           entry->refresh_timeout = timeout;
 539           entry->link_expect_html = 1;
 540         }
 541     }
 542   else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
 543     {
 544       /* Handle stuff like:
 545          <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
 546
 547       char *mcharset;
 548       char *content = find_attr (tag, "content", NULL);
 549       if (!content)
 550         return;
 551
 552       mcharset = parse_charset (content);
 553       if (!mcharset)
 554         return;
 555
 556       /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
 557
 558       set_current_charset (mcharset);
 559       xfree (mcharset);
 560     }
 561   else if (name && 0 == strcasecmp (name, "robots"))
 562     {
 563       /* Handle stuff like:
 564          <meta name="robots" content="index,nofollow"> */
 565       char *content = find_attr (tag, "content", NULL);
 566       if (!content)
 567         return;
 568       if (!strcasecmp (content, "none"))
 569         ctx->nofollow = true;
 570       else
 571         {
 572           while (*content)
 573             {
 574               /* Find the next occurrence of ',' or the end of
 575                  the string.  */
 576               char *end = strchr (content, ',');
 577               if (end)
 578                 ++end;
 579               else
 580                 end = content + strlen (content);
 581               if (!strncasecmp (content, "nofollow", end - content))
 582                 ctx->nofollow = true;
 583               content = end;
 584             }
 585         }
 586     }
 587 }
 588
 589 /* Dispatch the tag handler appropriate for the tag we're mapping
 590    over.  See known_tags[] for definition of tag handlers.  */
 591
 592 static void
 593 collect_tags_mapper (struct taginfo *tag, void *arg)
 594 {
 595   struct map_context *ctx = (struct map_context *)arg;
 596
 597   /* Find the tag in our table of tags.  This must not fail because
 598      map_html_tags only returns tags found in interesting_tags.  */
 599   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 600   assert (t != NULL);
 601
 602   t->handler (t->tagid, tag, ctx);
 603 }
 604 \f
 605 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 606    it.  It merges relative links in FILE with URL.  It is aware of
 607    <base href=...> and does the right thing.  */
 608
 609 struct urlpos *
 610 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
 611 {
 612   struct file_memory *fm;
 613   struct map_context ctx;
 614   int flags;
 615
 616   /* Load the file. */
 617   fm = read_file (file);
 618   if (!fm)
 619     {
 620       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 621       return NULL;
 622     }
 623   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 624
 625   ctx.text = fm->content;
 626   ctx.head = ctx.tail = NULL;
 627   ctx.base = NULL;
 628   ctx.parent_base = url ? url : opt.base_href;
 629   ctx.document_file = file;
 630   ctx.nofollow = false;
 631
 632   if (!interesting_tags)
 633     init_interesting ();
 634
 635   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 636      generate <a href=" foo"> instead of <a href="foo"> (browsers
 637      ignore spaces as well.)  If you really mean space, use &32; or
 638      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 639      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 640      ignored by IE and Mozilla and are presumably introduced by
 641      writing HTML with editors that force word wrap.  */
 642   flags = MHT_TRIM_VALUES;
 643   if (opt.strict_comments)
 644     flags |= MHT_STRICT_COMMENTS;
 645
 646   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 647                  interesting_tags, interesting_attributes);
 648
 649   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 650   if (meta_disallow_follow)
 651     *meta_disallow_follow = ctx.nofollow;
 652
 653   xfree_null (ctx.base);
 654   read_file_free (fm);
 655   return ctx.head;
 656 }
 657
 658 /* This doesn't really have anything to do with HTML, but it's similar
 659    to get_urls_html, so we put it here.  */
 660
 661 struct urlpos *
 662 get_urls_file (const char *file)
 663 {
 664   struct file_memory *fm;
 665   struct urlpos *head, *tail;
 666   const char *text, *text_end;
 667
 668   /* Load the file.  */
 669   fm = read_file (file);
 670   if (!fm)
 671     {
 672       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 673       return NULL;
 674     }
 675   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 676
 677   head = tail = NULL;
 678   text = fm->content;
 679   text_end = fm->content + fm->length;
 680   while (text < text_end)
 681     {
 682       int up_error_code;
 683       char *url_text;
 684       struct urlpos *entry;
 685       struct url *url;
 686
 687       const char *line_beg = text;
 688       const char *line_end = memchr (text, '\n', text_end - text);
 689       if (!line_end)
 690         line_end = text_end;
 691       else
 692         ++line_end;
 693       text = line_end;
 694
 695       /* Strip whitespace from the beginning and end of line. */
 696       while (line_beg < line_end && c_isspace (*line_beg))
 697         ++line_beg;
 698       while (line_end > line_beg && c_isspace (*(line_end - 1)))
 699         --line_end;
 700
 701       if (line_beg == line_end)
 702         continue;
 703
 704       /* The URL is in the [line_beg, line_end) region. */
 705
 706       /* We must copy the URL to a zero-terminated string, and we
 707          can't use alloca because we're in a loop.  *sigh*.  */
 708       url_text = strdupdelim (line_beg, line_end);
 709
 710       if (opt.base_href)
 711         {
 712           /* Merge opt.base_href with URL. */
 713           char *merged = uri_merge (opt.base_href, url_text);
 714           xfree (url_text);
 715           url_text = merged;
 716         }
 717
 718       set_ugly_no_encode (true);
 719       url = url_parse (url_text, &up_error_code);
 720       set_ugly_no_encode (false);
 721       if (!url)
 722         {
 723           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 724                      file, url_text, url_error (up_error_code));
 725           xfree (url_text);
 726           continue;
 727         }
 728       xfree (url_text);
 729
 730       entry = xnew0 (struct urlpos);
 731       entry->url = url;
 732
 733       if (!head)
 734         head = entry;
 735       else
 736         tail->next = entry;
 737       tail = entry;
 738     }
 739   read_file_free (fm);
 740   return head;
 741 }
 742
 743 void
 744 cleanup_html_url (void)
 745 {
 746   /* Destroy the hash tables.  The hash table keys and values are not
 747      allocated by this code, so we don't need to free them here.  */
 748   if (interesting_tags)
 749     hash_table_destroy (interesting_tags);
 750   if (interesting_attributes)
 751     hash_table_destroy (interesting_attributes);
 752 }