sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <string.h>
  35 #include <stdlib.h>
  36 #include <errno.h>
  37 #include <assert.h>
  38
  39 #include "html-parse.h"
  40 #include "url.h"
  41 #include "utils.h"
  42 #include "hash.h"
  43 #include "convert.h"
  44 #include "recur.h"              /* declaration of get_urls_html */
  45 #include "iri.h"
  46
  47 struct map_context;
  48
  49 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
  50
  51 #define DECLARE_TAG_HANDLER(fun)                                \
  52   static void fun (int, struct taginfo *, struct map_context *)
  53
  54 DECLARE_TAG_HANDLER (tag_find_urls);
  55 DECLARE_TAG_HANDLER (tag_handle_base);
  56 DECLARE_TAG_HANDLER (tag_handle_form);
  57 DECLARE_TAG_HANDLER (tag_handle_link);
  58 DECLARE_TAG_HANDLER (tag_handle_meta);
  59
  60 enum {
  61   TAG_A,
  62   TAG_APPLET,
  63   TAG_AREA,
  64   TAG_BASE,
  65   TAG_BGSOUND,
  66   TAG_BODY,
  67   TAG_EMBED,
  68   TAG_FIG,
  69   TAG_FORM,
  70   TAG_FRAME,
  71   TAG_IFRAME,
  72   TAG_IMG,
  73   TAG_INPUT,
  74   TAG_LAYER,
  75   TAG_LINK,
  76   TAG_META,
  77   TAG_OBJECT,
  78   TAG_OVERLAY,
  79   TAG_SCRIPT,
  80   TAG_TABLE,
  81   TAG_TD,
  82   TAG_TH
  83 };
  84
  85 /* The list of known tags and functions used for handling them.  Most
  86    tags are simply harvested for URLs. */
  87 static struct known_tag {
  88   int tagid;
  89   const char *name;
  90   tag_handler_t handler;
  91 } known_tags[] = {
  92   { TAG_A,       "a",           tag_find_urls },
  93   { TAG_APPLET,  "applet",      tag_find_urls },
  94   { TAG_AREA,    "area",        tag_find_urls },
  95   { TAG_BASE,    "base",        tag_handle_base },
  96   { TAG_BGSOUND, "bgsound",     tag_find_urls },
  97   { TAG_BODY,    "body",        tag_find_urls },
  98   { TAG_EMBED,   "embed",       tag_find_urls },
  99   { TAG_FIG,     "fig",         tag_find_urls },
 100   { TAG_FORM,    "form",        tag_handle_form },
 101   { TAG_FRAME,   "frame",       tag_find_urls },
 102   { TAG_IFRAME,  "iframe",      tag_find_urls },
 103   { TAG_IMG,     "img",         tag_find_urls },
 104   { TAG_INPUT,   "input",       tag_find_urls },
 105   { TAG_LAYER,   "layer",       tag_find_urls },
 106   { TAG_LINK,    "link",        tag_handle_link },
 107   { TAG_META,    "meta",        tag_handle_meta },
 108   { TAG_OBJECT,  "object",      tag_find_urls },
 109   { TAG_OVERLAY, "overlay",     tag_find_urls },
 110   { TAG_SCRIPT,  "script",      tag_find_urls },
 111   { TAG_TABLE,   "table",       tag_find_urls },
 112   { TAG_TD,      "td",          tag_find_urls },
 113   { TAG_TH,      "th",          tag_find_urls }
 114 };
 115
 116 /* tag_url_attributes documents which attributes of which tags contain
 117    URLs to harvest.  It is used by tag_find_urls.  */
 118
 119 /* Defines for the FLAGS. */
 120
 121 /* The link is "inline", i.e. needs to be retrieved for this document
 122    to be correctly rendered.  Inline links include inlined images,
 123    stylesheets, children frames, etc.  */
 124 #define ATTR_INLINE     1
 125
 126 /* The link is expected to yield HTML contents.  It's important not to
 127    try to follow HTML obtained by following e.g. <img src="...">
 128    regardless of content-type.  Doing this causes infinite loops for
 129    "images" that return non-404 error pages with links to the same
 130    image.  */
 131 #define ATTR_HTML       2
 132
 133 /* For tags handled by tag_find_urls: attributes that contain URLs to
 134    download. */
 135 static struct {
 136   int tagid;
 137   const char *attr_name;
 138   int flags;
 139 } tag_url_attributes[] = {
 140   { TAG_A,              "href",         ATTR_HTML },
 141   { TAG_APPLET,         "code",         ATTR_INLINE },
 142   { TAG_AREA,           "href",         ATTR_HTML },
 143   { TAG_BGSOUND,        "src",          ATTR_INLINE },
 144   { TAG_BODY,           "background",   ATTR_INLINE },
 145   { TAG_EMBED,          "href",         ATTR_HTML },
 146   { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
 147   { TAG_FIG,            "src",          ATTR_INLINE },
 148   { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
 149   { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
 150   { TAG_IMG,            "href",         ATTR_INLINE },
 151   { TAG_IMG,            "lowsrc",       ATTR_INLINE },
 152   { TAG_IMG,            "src",          ATTR_INLINE },
 153   { TAG_INPUT,          "src",          ATTR_INLINE },
 154   { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
 155   { TAG_OBJECT,         "data",         ATTR_INLINE },
 156   { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
 157   { TAG_SCRIPT,         "src",          ATTR_INLINE },
 158   { TAG_TABLE,          "background",   ATTR_INLINE },
 159   { TAG_TD,             "background",   ATTR_INLINE },
 160   { TAG_TH,             "background",   ATTR_INLINE }
 161 };
 162
 163 /* The lists of interesting tags and attributes are built dynamically,
 164    from the information above.  However, some places in the code refer
 165    to the attributes not mentioned here.  We add them manually.  */
 166 static const char *additional_attributes[] = {
 167   "rel",                        /* used by tag_handle_link */
 168   "http-equiv",                 /* used by tag_handle_meta */
 169   "name",                       /* used by tag_handle_meta */
 170   "content",                    /* used by tag_handle_meta */
 171   "action"                      /* used by tag_handle_form */
 172 };
 173
 174 static struct hash_table *interesting_tags;
 175 static struct hash_table *interesting_attributes;
 176
 177 static void
 178 init_interesting (void)
 179 {
 180   /* Init the variables interesting_tags and interesting_attributes
 181      that are used by the HTML parser to know which tags and
 182      attributes we're interested in.  We initialize this only once,
 183      for performance reasons.
 184
 185      Here we also make sure that what we put in interesting_tags
 186      matches the user's preferences as specified through --ignore-tags
 187      and --follow-tags.  */
 188
 189   size_t i;
 190   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 191
 192   /* First, add all the tags we know hot to handle, mapped to their
 193      respective entries in known_tags.  */
 194   for (i = 0; i < countof (known_tags); i++)
 195     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 196
 197   /* Then remove the tags ignored through --ignore-tags.  */
 198   if (opt.ignore_tags)
 199     {
 200       char **ignored;
 201       for (ignored = opt.ignore_tags; *ignored; ignored++)
 202         hash_table_remove (interesting_tags, *ignored);
 203     }
 204
 205   /* If --follow-tags is specified, use only those tags.  */
 206   if (opt.follow_tags)
 207     {
 208       /* Create a new table intersecting --follow-tags and known_tags,
 209          and use it as interesting_tags.  */
 210       struct hash_table *intersect = make_nocase_string_hash_table (0);
 211       char **followed;
 212       for (followed = opt.follow_tags; *followed; followed++)
 213         {
 214           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 215           if (!t)
 216             continue;           /* ignore unknown --follow-tags entries. */
 217           hash_table_put (intersect, *followed, t);
 218         }
 219       hash_table_destroy (interesting_tags);
 220       interesting_tags = intersect;
 221     }
 222
 223   /* Add the attributes we care about. */
 224   interesting_attributes = make_nocase_string_hash_table (10);
 225   for (i = 0; i < countof (additional_attributes); i++)
 226     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 227   for (i = 0; i < countof (tag_url_attributes); i++)
 228     hash_table_put (interesting_attributes,
 229                     tag_url_attributes[i].attr_name, "1");
 230 }
 231
 232 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 233    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 234    index of the attribute in TAG will be stored there.  */
 235
 236 static char *
 237 find_attr (struct taginfo *tag, const char *name, int *attrind)
 238 {
 239   int i;
 240   for (i = 0; i < tag->nattrs; i++)
 241     if (!strcasecmp (tag->attrs[i].name, name))
 242       {
 243         if (attrind)
 244           *attrind = i;
 245         return tag->attrs[i].value;
 246       }
 247   return NULL;
 248 }
 249
 250 struct map_context {
 251   char *text;                   /* HTML text. */
 252   char *base;                   /* Base URI of the document, possibly
 253                                    changed through <base href=...>. */
 254   const char *parent_base;      /* Base of the current document. */
 255   const char *document_file;    /* File name of this document. */
 256   bool nofollow;                /* whether NOFOLLOW was specified in a
 257                                    <meta name=robots> tag. */
 258
 259   struct urlpos *head, *tail;   /* List of URLs that is being
 260                                    built. */
 261 };
 262
 263 /* Append LINK_URI to the urlpos structure that is being built.
 264
 265    LINK_URI will be merged with the current document base.  TAG and
 266    ATTRIND are the necessary context to store the position and
 267    size.  */
 268
 269 static struct urlpos *
 270 append_url (const char *link_uri,
 271             struct taginfo *tag, int attrind, struct map_context *ctx)
 272 {
 273   int link_has_scheme = url_has_scheme (link_uri);
 274   struct urlpos *newel;
 275   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 276   struct url *url;
 277   bool utf8_encode = false;
 278
 279   if (!base)
 280     {
 281       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 282                ctx->document_file, link_uri));
 283
 284       if (!link_has_scheme)
 285         {
 286           /* Base URL is unavailable, and the link does not have a
 287              location attached to it -- we have to give up.  Since
 288              this can only happen when using `--force-html -i', print
 289              a warning.  */
 290           logprintf (LOG_NOTQUIET,
 291                      _("%s: Cannot resolve incomplete link %s.\n"),
 292                      ctx->document_file, link_uri);
 293           return NULL;
 294         }
 295
 296       url = url_parse (link_uri, NULL, &utf8_encode);
 297       if (!url)
 298         {
 299           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 300                    ctx->document_file, link_uri));
 301           return NULL;
 302         }
 303     }
 304   else
 305     {
 306       /* Merge BASE with LINK_URI, but also make sure the result is
 307          canonicalized, i.e. that "../" have been resolved.
 308          (parse_url will do that for us.) */
 309
 310       char *complete_uri = uri_merge (base, link_uri);
 311
 312       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 313                ctx->document_file, base, link_uri, complete_uri));
 314
 315       url = url_parse (complete_uri, NULL, &utf8_encode);
 316       if (!url)
 317         {
 318           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 319                    ctx->document_file, complete_uri));
 320           xfree (complete_uri);
 321           return NULL;
 322         }
 323       xfree (complete_uri);
 324     }
 325
 326   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 327
 328   newel = xnew0 (struct urlpos);
 329   newel->url = url;
 330   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 331   newel->size = tag->attrs[attrind].value_raw_size;
 332
 333   /* A URL is relative if the host is not named, and the name does not
 334      start with `/'.  */
 335   if (!link_has_scheme && *link_uri != '/')
 336     newel->link_relative_p = 1;
 337   else if (link_has_scheme)
 338     newel->link_complete_p = 1;
 339
 340   if (ctx->tail)
 341     {
 342       ctx->tail->next = newel;
 343       ctx->tail = newel;
 344     }
 345   else
 346     ctx->tail = ctx->head = newel;
 347
 348   return newel;
 349 }
 350 \f
 351 /* All the tag_* functions are called from collect_tags_mapper, as
 352    specified by KNOWN_TAGS.  */
 353
 354 /* Default tag handler: collect URLs from attributes specified for
 355    this tag by tag_url_attributes.  */
 356
 357 static void
 358 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 359 {
 360   size_t i;
 361   int attrind;
 362   int first = -1;
 363
 364   for (i = 0; i < countof (tag_url_attributes); i++)
 365     if (tag_url_attributes[i].tagid == tagid)
 366       {
 367         /* We've found the index of tag_url_attributes where the
 368            attributes of our tag begin.  */
 369         first = i;
 370         break;
 371       }
 372   assert (first != -1);
 373
 374   /* Loop over the "interesting" attributes of this tag.  In this
 375      example, it will loop over "src" and "lowsrc".
 376
 377        <img src="foo.png" lowsrc="bar.png">
 378
 379      This has to be done in the outer loop so that the attributes are
 380      processed in the same order in which they appear in the page.
 381      This is required when converting links.  */
 382
 383   for (attrind = 0; attrind < tag->nattrs; attrind++)
 384     {
 385       /* Find whether TAG/ATTRIND is a combination that contains a
 386          URL. */
 387       char *link = tag->attrs[attrind].value;
 388       const size_t size = countof (tag_url_attributes);
 389
 390       /* If you're cringing at the inefficiency of the nested loops,
 391          remember that they both iterate over a very small number of
 392          items.  The worst-case inner loop is for the IMG tag, which
 393          has three attributes.  */
 394       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 395         {
 396           if (0 == strcasecmp (tag->attrs[attrind].name,
 397                                tag_url_attributes[i].attr_name))
 398             {
 399               struct urlpos *up = append_url (link, tag, attrind, ctx);
 400               if (up)
 401                 {
 402                   int flags = tag_url_attributes[i].flags;
 403                   if (flags & ATTR_INLINE)
 404                     up->link_inline_p = 1;
 405                   if (flags & ATTR_HTML)
 406                     up->link_expect_html = 1;
 407                 }
 408             }
 409         }
 410     }
 411 }
 412
 413 /* Handle the BASE tag, for <base href=...>. */
 414
 415 static void
 416 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 417 {
 418   struct urlpos *base_urlpos;
 419   int attrind;
 420   char *newbase = find_attr (tag, "href", &attrind);
 421   if (!newbase)
 422     return;
 423
 424   base_urlpos = append_url (newbase, tag, attrind, ctx);
 425   if (!base_urlpos)
 426     return;
 427   base_urlpos->ignore_when_downloading = 1;
 428   base_urlpos->link_base_p = 1;
 429
 430   if (ctx->base)
 431     xfree (ctx->base);
 432   if (ctx->parent_base)
 433     ctx->base = uri_merge (ctx->parent_base, newbase);
 434   else
 435     ctx->base = xstrdup (newbase);
 436 }
 437
 438 /* Mark the URL found in <form action=...> for conversion. */
 439
 440 static void
 441 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 442 {
 443   int attrind;
 444   char *action = find_attr (tag, "action", &attrind);
 445   if (action)
 446     {
 447       struct urlpos *up = append_url (action, tag, attrind, ctx);
 448       if (up)
 449         up->ignore_when_downloading = 1;
 450     }
 451 }
 452
 453 /* Handle the LINK tag.  It requires special handling because how its
 454    links will be followed in -p mode depends on the REL attribute.  */
 455
 456 static void
 457 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 458 {
 459   int attrind;
 460   char *href = find_attr (tag, "href", &attrind);
 461
 462   /* All <link href="..."> link references are external, except those
 463      known not to be, such as style sheet and shortcut icon:
 464
 465        <link rel="stylesheet" href="...">
 466        <link rel="shortcut icon" href="...">
 467   */
 468   if (href)
 469     {
 470       struct urlpos *up = append_url (href, tag, attrind, ctx);
 471       if (up)
 472         {
 473           char *rel = find_attr (tag, "rel", NULL);
 474           if (rel
 475               && (0 == strcasecmp (rel, "stylesheet")
 476                   || 0 == strcasecmp (rel, "shortcut icon")))
 477             up->link_inline_p = 1;
 478           else
 479             /* The external ones usually point to HTML pages, such as
 480                <link rel="next" href="..."> */
 481             up->link_expect_html = 1;
 482         }
 483     }
 484 }
 485
 486 /* Handle the META tag.  This requires special handling because of the
 487    refresh feature and because of robot exclusion.  */
 488
 489 static void
 490 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 491 {
 492   char *name = find_attr (tag, "name", NULL);
 493   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 494
 495   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 496     {
 497       /* Some pages use a META tag to specify that the page be
 498          refreshed by a new page after a given number of seconds.  The
 499          general format for this is:
 500
 501            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 502
 503          So we just need to skip past the "NUMBER; URL=" garbage to
 504          get to the URL.  */
 505
 506       struct urlpos *entry;
 507       int attrind;
 508       int timeout = 0;
 509       char *p;
 510
 511       char *refresh = find_attr (tag, "content", &attrind);
 512       if (!refresh)
 513         return;
 514
 515       for (p = refresh; c_isdigit (*p); p++)
 516         timeout = 10 * timeout + *p - '0';
 517       if (*p++ != ';')
 518         return;
 519
 520       while (c_isspace (*p))
 521         ++p;
 522       if (!(   c_toupper (*p)       == 'U'
 523             && c_toupper (*(p + 1)) == 'R'
 524             && c_toupper (*(p + 2)) == 'L'
 525             &&          *(p + 3)  == '='))
 526         return;
 527       p += 4;
 528       while (c_isspace (*p))
 529         ++p;
 530
 531       entry = append_url (p, tag, attrind, ctx);
 532       if (entry)
 533         {
 534           entry->link_refresh_p = 1;
 535           entry->refresh_timeout = timeout;
 536           entry->link_expect_html = 1;
 537         }
 538     }
 539   else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
 540     {
 541       /* Handle stuff like:
 542          <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
 543
 544       char *mcharset;
 545       char *content = find_attr (tag, "content", NULL);
 546       if (!content)
 547         return;
 548
 549       mcharset = parse_charset (content);
 550       if (!mcharset)
 551         return;
 552
 553       /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
 554
 555       set_current_charset (mcharset);
 556       xfree (mcharset);
 557     }
 558   else if (name && 0 == strcasecmp (name, "robots"))
 559     {
 560       /* Handle stuff like:
 561          <meta name="robots" content="index,nofollow"> */
 562       char *content = find_attr (tag, "content", NULL);
 563       if (!content)
 564         return;
 565       if (!strcasecmp (content, "none"))
 566         ctx->nofollow = true;
 567       else
 568         {
 569           while (*content)
 570             {
 571               /* Find the next occurrence of ',' or the end of
 572                  the string.  */
 573               char *end = strchr (content, ',');
 574               if (end)
 575                 ++end;
 576               else
 577                 end = content + strlen (content);
 578               if (!strncasecmp (content, "nofollow", end - content))
 579                 ctx->nofollow = true;
 580               content = end;
 581             }
 582         }
 583     }
 584 }
 585
 586 /* Dispatch the tag handler appropriate for the tag we're mapping
 587    over.  See known_tags[] for definition of tag handlers.  */
 588
 589 static void
 590 collect_tags_mapper (struct taginfo *tag, void *arg)
 591 {
 592   struct map_context *ctx = (struct map_context *)arg;
 593
 594   /* Find the tag in our table of tags.  This must not fail because
 595      map_html_tags only returns tags found in interesting_tags.  */
 596   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 597   assert (t != NULL);
 598
 599   t->handler (t->tagid, tag, ctx);
 600 }
 601 \f
 602 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 603    it.  It merges relative links in FILE with URL.  It is aware of
 604    <base href=...> and does the right thing.  */
 605
 606 struct urlpos *
 607 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
 608 {
 609   struct file_memory *fm;
 610   struct map_context ctx;
 611   int flags;
 612
 613   /* Load the file. */
 614   fm = read_file (file);
 615   if (!fm)
 616     {
 617       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 618       return NULL;
 619     }
 620   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 621
 622   ctx.text = fm->content;
 623   ctx.head = ctx.tail = NULL;
 624   ctx.base = NULL;
 625   ctx.parent_base = url ? url : opt.base_href;
 626   ctx.document_file = file;
 627   ctx.nofollow = false;
 628
 629   if (!interesting_tags)
 630     init_interesting ();
 631
 632   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 633      generate <a href=" foo"> instead of <a href="foo"> (browsers
 634      ignore spaces as well.)  If you really mean space, use &32; or
 635      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 636      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 637      ignored by IE and Mozilla and are presumably introduced by
 638      writing HTML with editors that force word wrap.  */
 639   flags = MHT_TRIM_VALUES;
 640   if (opt.strict_comments)
 641     flags |= MHT_STRICT_COMMENTS;
 642
 643   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 644                  interesting_tags, interesting_attributes);
 645
 646   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 647   if (meta_disallow_follow)
 648     *meta_disallow_follow = ctx.nofollow;
 649
 650   xfree_null (ctx.base);
 651   read_file_free (fm);
 652   return ctx.head;
 653 }
 654
 655 /* This doesn't really have anything to do with HTML, but it's similar
 656    to get_urls_html, so we put it here.  */
 657
 658 struct urlpos *
 659 get_urls_file (const char *file)
 660 {
 661   struct file_memory *fm;
 662   struct urlpos *head, *tail;
 663   const char *text, *text_end;
 664   bool utf8_encode = false;
 665
 666   /* Load the file.  */
 667   fm = read_file (file);
 668   if (!fm)
 669     {
 670       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 671       return NULL;
 672     }
 673   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 674
 675   head = tail = NULL;
 676   text = fm->content;
 677   text_end = fm->content + fm->length;
 678   while (text < text_end)
 679     {
 680       int up_error_code;
 681       char *url_text;
 682       struct urlpos *entry;
 683       struct url *url;
 684
 685       const char *line_beg = text;
 686       const char *line_end = memchr (text, '\n', text_end - text);
 687       if (!line_end)
 688         line_end = text_end;
 689       else
 690         ++line_end;
 691       text = line_end;
 692
 693       /* Strip whitespace from the beginning and end of line. */
 694       while (line_beg < line_end && c_isspace (*line_beg))
 695         ++line_beg;
 696       while (line_end > line_beg && c_isspace (*(line_end - 1)))
 697         --line_end;
 698
 699       if (line_beg == line_end)
 700         continue;
 701
 702       /* The URL is in the [line_beg, line_end) region. */
 703
 704       /* We must copy the URL to a zero-terminated string, and we
 705          can't use alloca because we're in a loop.  *sigh*.  */
 706       url_text = strdupdelim (line_beg, line_end);
 707
 708       if (opt.base_href)
 709         {
 710           /* Merge opt.base_href with URL. */
 711           char *merged = uri_merge (opt.base_href, url_text);
 712           xfree (url_text);
 713           url_text = merged;
 714         }
 715
 716       url = url_parse (url_text, &up_error_code, &utf8_encode);
 717       if (!url)
 718         {
 719           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 720                      file, url_text, url_error (up_error_code));
 721           xfree (url_text);
 722           continue;
 723         }
 724       xfree (url_text);
 725
 726       entry = xnew0 (struct urlpos);
 727       entry->url = url;
 728
 729       if (!head)
 730         head = entry;
 731       else
 732         tail->next = entry;
 733       tail = entry;
 734     }
 735   read_file_free (fm);
 736   return head;
 737 }
 738
 739 void
 740 cleanup_html_url (void)
 741 {
 742   /* Destroy the hash tables.  The hash table keys and values are not
 743      allocated by this code, so we don't need to free them here.  */
 744   if (interesting_tags)
 745     hash_table_destroy (interesting_tags);
 746   if (interesting_attributes)
 747     hash_table_destroy (interesting_attributes);
 748 }