sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #ifdef HAVE_STRING_H
  34 # include <string.h>
  35 #else
  36 # include <strings.h>
  37 #endif
  38 #include <stdlib.h>
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "wget.h"
  43 #include "html-parse.h"
  44 #include "url.h"
  45 #include "utils.h"
  46 #include "convert.h"
  47
  48 #ifndef errno
  49 extern int errno;
  50 #endif
  51
  52 struct map_context;
  53
  54 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
  55                                        struct map_context *));
  56
  57 #define DECLARE_TAG_HANDLER(fun)                                        \
  58   static void fun PARAMS ((int, struct taginfo *, struct map_context *))
  59
  60 DECLARE_TAG_HANDLER (tag_find_urls);
  61 DECLARE_TAG_HANDLER (tag_handle_base);
  62 DECLARE_TAG_HANDLER (tag_handle_form);
  63 DECLARE_TAG_HANDLER (tag_handle_link);
  64 DECLARE_TAG_HANDLER (tag_handle_meta);
  65
  66 /* The list of known tags and functions used for handling them.  Most
  67    tags are simply harvested for URLs. */
  68 static struct {
  69   const char *name;
  70   tag_handler_t handler;
  71 } known_tags[] = {
  72 #define TAG_A           0
  73   { "a",        tag_find_urls },
  74 #define TAG_APPLET      1
  75   { "applet",   tag_find_urls },
  76 #define TAG_AREA        2
  77   { "area",     tag_find_urls },
  78 #define TAG_BASE        3
  79   { "base",     tag_handle_base },
  80 #define TAG_BGSOUND     4
  81   { "bgsound",  tag_find_urls },
  82 #define TAG_BODY        5
  83   { "body",     tag_find_urls },
  84 #define TAG_EMBED       6
  85   { "embed",    tag_find_urls },
  86 #define TAG_FIG         7
  87   { "fig",      tag_find_urls },
  88 #define TAG_FORM        8
  89   { "form",     tag_handle_form },
  90 #define TAG_FRAME       9
  91   { "frame",    tag_find_urls },
  92 #define TAG_IFRAME      10
  93   { "iframe",   tag_find_urls },
  94 #define TAG_IMG         11
  95   { "img",      tag_find_urls },
  96 #define TAG_INPUT       12
  97   { "input",    tag_find_urls },
  98 #define TAG_LAYER       13
  99   { "layer",    tag_find_urls },
 100 #define TAG_LINK        14
 101   { "link",     tag_handle_link },
 102 #define TAG_META        15
 103   { "meta",     tag_handle_meta },
 104 #define TAG_OVERLAY     16
 105   { "overlay",  tag_find_urls },
 106 #define TAG_SCRIPT      17
 107   { "script",   tag_find_urls },
 108 #define TAG_TABLE       18
 109   { "table",    tag_find_urls },
 110 #define TAG_TD          19
 111   { "td",       tag_find_urls },
 112 #define TAG_TH          20
 113   { "th",       tag_find_urls }
 114 };
 115
 116 /* tag_url_attributes documents which attributes of which tags contain
 117    URLs to harvest.  It is used by tag_find_urls.  */
 118
 119 /* Defines for the FLAGS field; currently only one flag is defined. */
 120
 121 /* This tag points to an external document not necessary for rendering this
 122    document (i.e. it's not an inlined image, stylesheet, etc.). */
 123 #define TUA_EXTERNAL 1
 124
 125 /* For tags handled by tag_find_urls: attributes that contain URLs to
 126    download. */
 127 static struct {
 128   int tagid;
 129   const char *attr_name;
 130   int flags;
 131 } tag_url_attributes[] = {
 132   { TAG_A,              "href",         TUA_EXTERNAL },
 133   { TAG_APPLET,         "code",         0 },
 134   { TAG_AREA,           "href",         TUA_EXTERNAL },
 135   { TAG_BGSOUND,        "src",          0 },
 136   { TAG_BODY,           "background",   0 },
 137   { TAG_EMBED,          "href",         TUA_EXTERNAL },
 138   { TAG_EMBED,          "src",          0 },
 139   { TAG_FIG,            "src",          0 },
 140   { TAG_FRAME,          "src",          0 },
 141   { TAG_IFRAME,         "src",          0 },
 142   { TAG_IMG,            "href",         0 },
 143   { TAG_IMG,            "lowsrc",       0 },
 144   { TAG_IMG,            "src",          0 },
 145   { TAG_INPUT,          "src",          0 },
 146   { TAG_LAYER,          "src",          0 },
 147   { TAG_OVERLAY,        "src",          0 },
 148   { TAG_SCRIPT,         "src",          0 },
 149   { TAG_TABLE,          "background",   0 },
 150   { TAG_TD,             "background",   0 },
 151   { TAG_TH,             "background",   0 }
 152 };
 153
 154 /* The lists of interesting tags and attributes are built dynamically,
 155    from the information above.  However, some places in the code refer
 156    to the attributes not mentioned here.  We add them manually.  */
 157 static const char *additional_attributes[] = {
 158   "rel",                        /* used by tag_handle_link */
 159   "http-equiv",                 /* used by tag_handle_meta */
 160   "name",                       /* used by tag_handle_meta */
 161   "content",                    /* used by tag_handle_meta */
 162   "action"                      /* used by tag_handle_form */
 163 };
 164
 165 static const char **interesting_tags;
 166 static const char **interesting_attributes;
 167
 168 static void
 169 init_interesting (void)
 170 {
 171   /* Init the variables interesting_tags and interesting_attributes
 172      that are used by the HTML parser to know which tags and
 173      attributes we're interested in.  We initialize this only once,
 174      for performance reasons.
 175
 176      Here we also make sure that what we put in interesting_tags
 177      matches the user's preferences as specified through --ignore-tags
 178      and --follow-tags.
 179
 180      This function is as large as this only because of the glorious
 181      expressivity of the C programming language.  */
 182
 183   {
 184     int i, ind = 0;
 185     int size = countof (known_tags);
 186     interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));
 187
 188     for (i = 0; i < size; i++)
 189       {
 190         const char *name = known_tags[i].name;
 191
 192         /* Normally here we could say:
 193            interesting_tags[i] = name;
 194            But we need to respect the settings of --ignore-tags and
 195            --follow-tags, so the code gets a bit hairier.  */
 196
 197         if (opt.ignore_tags)
 198           {
 199             /* --ignore-tags was specified.  Do not match these
 200                specific tags.  --ignore-tags takes precedence over
 201                --follow-tags, so we process --ignore first and fall
 202                through if there's no match. */
 203             int j, lose = 0;
 204             for (j = 0; opt.ignore_tags[j] != NULL; j++)
 205               /* Loop through all the tags this user doesn't care about. */
 206               if (strcasecmp(opt.ignore_tags[j], name) == EQ)
 207                 {
 208                   lose = 1;
 209                   break;
 210                 }
 211             if (lose)
 212               continue;
 213           }
 214
 215         if (opt.follow_tags)
 216           {
 217             /* --follow-tags was specified.  Only match these specific tags, so
 218                continue back to top of for if we don't match one of them. */
 219             int j, win = 0;
 220             for (j = 0; opt.follow_tags[j] != NULL; j++)
 221               /* Loop through all the tags this user cares about. */
 222               if (strcasecmp(opt.follow_tags[j], name) == EQ)
 223                 {
 224                   win = 1;
 225                   break;
 226                 }
 227             if (!win)
 228               continue;  /* wasn't one of the explicitly desired tags */
 229           }
 230
 231         /* If we get to here, --follow-tags isn't being used or the
 232            tag is among the ones that are followed, and --ignore-tags,
 233            if specified, didn't include this tag, so it's an
 234            "interesting" one. */
 235         interesting_tags[ind++] = name;
 236       }
 237     interesting_tags[ind] = NULL;
 238   }
 239
 240   /* The same for attributes, except we loop through tag_url_attributes.
 241      Here we also need to make sure that the list of attributes is
 242      unique, and to include the attributes from additional_attributes.  */
 243   {
 244     int i, ind;
 245     const char **att = xmalloc ((countof (additional_attributes) + 1)
 246                                 * sizeof (char *));
 247     /* First copy the "additional" attributes. */
 248     for (i = 0; i < countof (additional_attributes); i++)
 249       att[i] = additional_attributes[i];
 250     ind = i;
 251     att[ind] = NULL;
 252     for (i = 0; i < countof (tag_url_attributes); i++)
 253       {
 254         int j, seen = 0;
 255         const char *look_for = tag_url_attributes[i].attr_name;
 256         for (j = 0; j < ind - 1; j++)
 257           if (!strcmp (att[j], look_for))
 258             {
 259               seen = 1;
 260               break;
 261             }
 262         if (!seen)
 263           {
 264             att = xrealloc (att, (ind + 2) * sizeof (*att));
 265             att[ind++] = look_for;
 266             att[ind] = NULL;
 267           }
 268       }
 269     interesting_attributes = att;
 270   }
 271 }
 272
 273 /* Find tag with name TAG_NAME in KNOWN_TAGS and return its index.  */
 274
 275 static int
 276 find_tag (const char *tag_name)
 277 {
 278   /* Originally implemented as linear search.  In Wget 1.9 known_tags
 279      contains 21 elements, for which binary search requires max. 5
 280      comparisons, whereas linear search performs 10 on average.  */
 281
 282   int lo = 0, hi = countof (known_tags) - 1;
 283
 284   while (lo <= hi)
 285     {
 286       int mid = (lo + hi) >> 1;
 287       int cmp = strcasecmp (tag_name, known_tags[mid].name);
 288       if (cmp < 0)
 289         hi = mid - 1;
 290       else if (cmp > 0)
 291         lo = mid + 1;
 292       else
 293         return mid;
 294     }
 295
 296   return -1;
 297 }
 298
 299 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 300    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 301    index of the attribute in TAG will be stored there.  */
 302
 303 static char *
 304 find_attr (struct taginfo *tag, const char *name, int *attrind)
 305 {
 306   int i;
 307   for (i = 0; i < tag->nattrs; i++)
 308     if (!strcasecmp (tag->attrs[i].name, name))
 309       {
 310         if (attrind)
 311           *attrind = i;
 312         return tag->attrs[i].value;
 313       }
 314   return NULL;
 315 }
 316
 317 struct map_context {
 318   char *text;                   /* HTML text. */
 319   char *base;                   /* Base URI of the document, possibly
 320                                    changed through <base href=...>. */
 321   const char *parent_base;      /* Base of the current document. */
 322   const char *document_file;    /* File name of this document. */
 323   int nofollow;                 /* whether NOFOLLOW was specified in a
 324                                    <meta name=robots> tag. */
 325
 326   struct urlpos *head, *tail;   /* List of URLs that is being
 327                                    built. */
 328 };
 329
 330 /* Append LINK_URI to the urlpos structure that is being built.
 331
 332    LINK_URI will be merged with the current document base.  TAG and
 333    ATTRIND are the necessary context to store the position and
 334    size.  */
 335
 336 static struct urlpos *
 337 append_one_url (const char *link_uri, int inlinep,
 338                 struct taginfo *tag, int attrind, struct map_context *ctx)
 339 {
 340   int link_has_scheme = url_has_scheme (link_uri);
 341   struct urlpos *newel;
 342   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 343   struct url *url;
 344
 345   if (!base)
 346     {
 347       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 348                ctx->document_file, link_uri));
 349
 350       if (!link_has_scheme)
 351         {
 352           /* Base URL is unavailable, and the link does not have a
 353              location attached to it -- we have to give up.  Since
 354              this can only happen when using `--force-html -i', print
 355              a warning.  */
 356           logprintf (LOG_NOTQUIET,
 357                      _("%s: Cannot resolve incomplete link %s.\n"),
 358                      ctx->document_file, link_uri);
 359           return NULL;
 360         }
 361
 362       url = url_parse (link_uri, NULL);
 363       if (!url)
 364         {
 365           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 366                    ctx->document_file, link_uri));
 367           return NULL;
 368         }
 369     }
 370   else
 371     {
 372       /* Merge BASE with LINK_URI, but also make sure the result is
 373          canonicalized, i.e. that "../" have been resolved.
 374          (parse_url will do that for us.) */
 375
 376       char *complete_uri = uri_merge (base, link_uri);
 377
 378       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 379                ctx->document_file, base, link_uri, complete_uri));
 380
 381       url = url_parse (complete_uri, NULL);
 382       if (!url)
 383         {
 384           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 385                    ctx->document_file, complete_uri));
 386           xfree (complete_uri);
 387           return NULL;
 388         }
 389       xfree (complete_uri);
 390     }
 391
 392   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 393
 394   newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
 395   memset (newel, 0, sizeof (*newel));
 396
 397   newel->next = NULL;
 398   newel->url = url;
 399   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 400   newel->size = tag->attrs[attrind].value_raw_size;
 401   newel->link_inline_p = inlinep;
 402
 403   /* A URL is relative if the host is not named, and the name does not
 404      start with `/'.  */
 405   if (!link_has_scheme && *link_uri != '/')
 406     newel->link_relative_p = 1;
 407   else if (link_has_scheme)
 408     newel->link_complete_p = 1;
 409
 410   if (ctx->tail)
 411     {
 412       ctx->tail->next = newel;
 413       ctx->tail = newel;
 414     }
 415   else
 416     ctx->tail = ctx->head = newel;
 417
 418   return newel;
 419 }
 420 \f
 421 /* All the tag_* functions are called from collect_tags_mapper, as
 422    specified by KNOWN_TAGS.  */
 423
 424 /* Default tag handler: collect URLs from attributes specified for
 425    this tag by tag_url_attributes.  */
 426
 427 static void
 428 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 429 {
 430   int i, attrind, first = -1;
 431   int size = countof (tag_url_attributes);
 432
 433   for (i = 0; i < size; i++)
 434     if (tag_url_attributes[i].tagid == tagid)
 435       {
 436         /* We've found the index of tag_url_attributes where the
 437            attributes of our tag begin.  */
 438         first = i;
 439         break;
 440       }
 441   assert (first != -1);
 442
 443   /* Loop over the "interesting" attributes of this tag.  In this
 444      example, it will loop over "src" and "lowsrc".
 445
 446        <img src="foo.png" lowsrc="bar.png">
 447
 448      This has to be done in the outer loop so that the attributes are
 449      processed in the same order in which they appear in the page.
 450      This is required when converting links.  */
 451
 452   for (attrind = 0; attrind < tag->nattrs; attrind++)
 453     {
 454       /* Find whether TAG/ATTRIND is a combination that contains a
 455          URL. */
 456       char *link = tag->attrs[attrind].value;
 457
 458       /* If you're cringing at the inefficiency of the nested loops,
 459          remember that they both iterate over a laughably small
 460          quantity of items.  The worst-case inner loop is for the IMG
 461          tag, which has three attributes.  */
 462       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 463         {
 464           if (0 == strcasecmp (tag->attrs[attrind].name,
 465                                tag_url_attributes[i].attr_name))
 466             {
 467               int flags = tag_url_attributes[i].flags;
 468               append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
 469             }
 470         }
 471     }
 472 }
 473
 474 /* Handle the BASE tag, for <base href=...>. */
 475
 476 static void
 477 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 478 {
 479   struct urlpos *base_urlpos;
 480   int attrind;
 481   char *newbase = find_attr (tag, "href", &attrind);
 482   if (!newbase)
 483     return;
 484
 485   base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
 486   if (!base_urlpos)
 487     return;
 488   base_urlpos->ignore_when_downloading = 1;
 489   base_urlpos->link_base_p = 1;
 490
 491   if (ctx->base)
 492     xfree (ctx->base);
 493   if (ctx->parent_base)
 494     ctx->base = uri_merge (ctx->parent_base, newbase);
 495   else
 496     ctx->base = xstrdup (newbase);
 497 }
 498
 499 /* Mark the URL found in <form action=...> for conversion. */
 500
 501 static void
 502 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 503 {
 504   int attrind;
 505   char *action = find_attr (tag, "action", &attrind);
 506   if (action)
 507     {
 508       struct urlpos *action_urlpos = append_one_url (action, 0, tag,
 509                                                      attrind, ctx);
 510       if (action_urlpos)
 511         action_urlpos->ignore_when_downloading = 1;
 512     }
 513 }
 514
 515 /* Handle the LINK tag.  It requires special handling because how its
 516    links will be followed in -p mode depends on the REL attribute.  */
 517
 518 static void
 519 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 520 {
 521   int attrind;
 522   char *href = find_attr (tag, "href", &attrind);
 523
 524   /* All <link href="..."> link references are external, except those
 525      known not to be, such as style sheet and shortcut icon:
 526
 527        <link rel="stylesheet" href="...">
 528        <link rel="shortcut icon" href="...">
 529   */
 530   if (href)
 531     {
 532       char *rel  = find_attr (tag, "rel", NULL);
 533       int inlinep = (rel
 534                      && (0 == strcasecmp (rel, "stylesheet")
 535                          || 0 == strcasecmp (rel, "shortcut icon")));
 536       append_one_url (href, inlinep, tag, attrind, ctx);
 537     }
 538 }
 539
 540 /* Handle the META tag.  This requires special handling because of the
 541    refresh feature and because of robot exclusion.  */
 542
 543 static void
 544 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 545 {
 546   char *name = find_attr (tag, "name", NULL);
 547   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 548
 549   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 550     {
 551       /* Some pages use a META tag to specify that the page be
 552          refreshed by a new page after a given number of seconds.  The
 553          general format for this is:
 554
 555            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 556
 557          So we just need to skip past the "NUMBER; URL=" garbage to
 558          get to the URL.  */
 559
 560       struct urlpos *entry;
 561       int attrind;
 562       int timeout = 0;
 563       char *p;
 564
 565       char *refresh = find_attr (tag, "content", &attrind);
 566       if (!refresh)
 567         return;
 568
 569       for (p = refresh; ISDIGIT (*p); p++)
 570         timeout = 10 * timeout + *p - '0';
 571       if (*p++ != ';')
 572         return;
 573
 574       while (ISSPACE (*p))
 575         ++p;
 576       if (!(   TOUPPER (*p)       == 'U'
 577             && TOUPPER (*(p + 1)) == 'R'
 578             && TOUPPER (*(p + 2)) == 'L'
 579             &&          *(p + 3)  == '='))
 580         return;
 581       p += 4;
 582       while (ISSPACE (*p))
 583         ++p;
 584
 585       entry = append_one_url (p, 0, tag, attrind, ctx);
 586       if (entry)
 587         {
 588           entry->link_refresh_p = 1;
 589           entry->refresh_timeout = timeout;
 590         }
 591     }
 592   else if (name && 0 == strcasecmp (name, "robots"))
 593     {
 594       /* Handle stuff like:
 595          <meta name="robots" content="index,nofollow"> */
 596       char *content = find_attr (tag, "content", NULL);
 597       if (!content)
 598         return;
 599       if (!strcasecmp (content, "none"))
 600         ctx->nofollow = 1;
 601       else
 602         {
 603           while (*content)
 604             {
 605               /* Find the next occurrence of ',' or the end of
 606                  the string.  */
 607               char *end = strchr (content, ',');
 608               if (end)
 609                 ++end;
 610               else
 611                 end = content + strlen (content);
 612               if (!strncasecmp (content, "nofollow", end - content))
 613                 ctx->nofollow = 1;
 614               content = end;
 615             }
 616         }
 617     }
 618 }
 619
 620 /* Examine name and attributes of TAG and take appropriate action
 621    according to the tag.  */
 622
 623 static void
 624 collect_tags_mapper (struct taginfo *tag, void *arg)
 625 {
 626   struct map_context *ctx = (struct map_context *)arg;
 627   int tagid;
 628   tag_handler_t handler;
 629
 630   tagid = find_tag (tag->name);
 631   assert (tagid != -1);
 632   handler = known_tags[tagid].handler;
 633
 634   handler (tagid, tag, ctx);
 635 }
 636 \f
 637 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 638    it.  It merges relative links in FILE with URL.  It is aware of
 639    <base href=...> and does the right thing.  */
 640
 641 struct urlpos *
 642 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
 643 {
 644   struct file_memory *fm;
 645   struct map_context ctx;
 646   int flags;
 647
 648   /* Load the file. */
 649   fm = read_file (file);
 650   if (!fm)
 651     {
 652       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 653       return NULL;
 654     }
 655   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 656
 657   ctx.text = fm->content;
 658   ctx.head = ctx.tail = NULL;
 659   ctx.base = NULL;
 660   ctx.parent_base = url ? url : opt.base_href;
 661   ctx.document_file = file;
 662   ctx.nofollow = 0;
 663
 664   if (!interesting_tags)
 665     init_interesting ();
 666
 667   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 668      generate <a href=" foo"> instead of <a href="foo"> (Netscape
 669      ignores spaces as well.)  If you really mean space, use &32; or
 670      %20.  */
 671   flags = MHT_TRIM_VALUES;
 672   if (opt.strict_comments)
 673     flags |= MHT_STRICT_COMMENTS;
 674
 675   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 676                  interesting_tags, interesting_attributes);
 677
 678   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 679   if (meta_disallow_follow)
 680     *meta_disallow_follow = ctx.nofollow;
 681
 682   FREE_MAYBE (ctx.base);
 683   read_file_free (fm);
 684   return ctx.head;
 685 }
 686
 687 /* This doesn't really have anything to do with HTML, but it's similar
 688    to get_urls_html, so we put it here.  */
 689
 690 struct urlpos *
 691 get_urls_file (const char *file)
 692 {
 693   struct file_memory *fm;
 694   struct urlpos *head, *tail;
 695   const char *text, *text_end;
 696
 697   /* Load the file.  */
 698   fm = read_file (file);
 699   if (!fm)
 700     {
 701       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 702       return NULL;
 703     }
 704   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 705
 706   head = tail = NULL;
 707   text = fm->content;
 708   text_end = fm->content + fm->length;
 709   while (text < text_end)
 710     {
 711       int up_error_code;
 712       char *url_text;
 713       struct urlpos *entry;
 714       struct url *url;
 715
 716       const char *line_beg = text;
 717       const char *line_end = memchr (text, '\n', text_end - text);
 718       if (!line_end)
 719         line_end = text_end;
 720       else
 721         ++line_end;
 722       text = line_end;
 723
 724       /* Strip whitespace from the beginning and end of line. */
 725       while (line_beg < line_end && ISSPACE (*line_beg))
 726         ++line_beg;
 727       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
 728         --line_end;
 729
 730       if (line_beg == line_end)
 731         continue;
 732
 733       /* The URL is in the [line_beg, line_end) region. */
 734
 735       /* We must copy the URL to a zero-terminated string, and we
 736          can't use alloca because we're in a loop.  *sigh*.  */
 737       url_text = strdupdelim (line_beg, line_end);
 738
 739       if (opt.base_href)
 740         {
 741           /* Merge opt.base_href with URL. */
 742           char *merged = uri_merge (opt.base_href, url_text);
 743           xfree (url_text);
 744           url_text = merged;
 745         }
 746
 747       url = url_parse (url_text, &up_error_code);
 748       if (!url)
 749         {
 750           logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
 751                      file, url_text, url_error (up_error_code));
 752           xfree (url_text);
 753           continue;
 754         }
 755       xfree (url_text);
 756
 757       entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
 758       memset (entry, 0, sizeof (*entry));
 759       entry->next = NULL;
 760       entry->url = url;
 761
 762       if (!head)
 763         head = entry;
 764       else
 765         tail->next = entry;
 766       tail = entry;
 767     }
 768   read_file_free (fm);
 769   return head;
 770 }
 771
 772 void
 773 cleanup_html_url (void)
 774 {
 775   FREE_MAYBE (interesting_tags);
 776   FREE_MAYBE (interesting_attributes);
 777 }