sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <string.h>
  35 #include <stdlib.h>
  36 #include <errno.h>
  37 #include <assert.h>
  38
  39 #include "html-parse.h"
  40 #include "url.h"
  41 #include "utils.h"
  42 #include "hash.h"
  43 #include "convert.h"
  44 #include "recur.h"
  45 #include "html-url.h"
  46 #include "css-url.h"
  47
  48 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
  49
  50 #define DECLARE_TAG_HANDLER(fun)                                \
  51   static void fun (int, struct taginfo *, struct map_context *)
  52
  53 DECLARE_TAG_HANDLER (tag_find_urls);
  54 DECLARE_TAG_HANDLER (tag_handle_base);
  55 DECLARE_TAG_HANDLER (tag_handle_form);
  56 DECLARE_TAG_HANDLER (tag_handle_link);
  57 DECLARE_TAG_HANDLER (tag_handle_meta);
  58
  59 enum {
  60   TAG_A,
  61   TAG_APPLET,
  62   TAG_AREA,
  63   TAG_BASE,
  64   TAG_BGSOUND,
  65   TAG_BODY,
  66   TAG_EMBED,
  67   TAG_FIG,
  68   TAG_FORM,
  69   TAG_FRAME,
  70   TAG_IFRAME,
  71   TAG_IMG,
  72   TAG_INPUT,
  73   TAG_LAYER,
  74   TAG_LINK,
  75   TAG_META,
  76   TAG_OBJECT,
  77   TAG_OVERLAY,
  78   TAG_SCRIPT,
  79   TAG_TABLE,
  80   TAG_TD,
  81   TAG_TH
  82 };
  83
  84 /* The list of known tags and functions used for handling them.  Most
  85    tags are simply harvested for URLs. */
  86 static struct known_tag {
  87   int tagid;
  88   const char *name;
  89   tag_handler_t handler;
  90 } known_tags[] = {
  91   { TAG_A,       "a",           tag_find_urls },
  92   { TAG_APPLET,  "applet",      tag_find_urls },
  93   { TAG_AREA,    "area",        tag_find_urls },
  94   { TAG_BASE,    "base",        tag_handle_base },
  95   { TAG_BGSOUND, "bgsound",     tag_find_urls },
  96   { TAG_BODY,    "body",        tag_find_urls },
  97   { TAG_EMBED,   "embed",       tag_find_urls },
  98   { TAG_FIG,     "fig",         tag_find_urls },
  99   { TAG_FORM,    "form",        tag_handle_form },
 100   { TAG_FRAME,   "frame",       tag_find_urls },
 101   { TAG_IFRAME,  "iframe",      tag_find_urls },
 102   { TAG_IMG,     "img",         tag_find_urls },
 103   { TAG_INPUT,   "input",       tag_find_urls },
 104   { TAG_LAYER,   "layer",       tag_find_urls },
 105   { TAG_LINK,    "link",        tag_handle_link },
 106   { TAG_META,    "meta",        tag_handle_meta },
 107   { TAG_OBJECT,  "object",      tag_find_urls },
 108   { TAG_OVERLAY, "overlay",     tag_find_urls },
 109   { TAG_SCRIPT,  "script",      tag_find_urls },
 110   { TAG_TABLE,   "table",       tag_find_urls },
 111   { TAG_TD,      "td",          tag_find_urls },
 112   { TAG_TH,      "th",          tag_find_urls }
 113 };
 114
 115 /* tag_url_attributes documents which attributes of which tags contain
 116    URLs to harvest.  It is used by tag_find_urls.  */
 117
 118 /* Defines for the FLAGS. */
 119
 120 /* The link is "inline", i.e. needs to be retrieved for this document
 121    to be correctly rendered.  Inline links include inlined images,
 122    stylesheets, children frames, etc.  */
 123 #define ATTR_INLINE     1
 124
 125 /* The link is expected to yield HTML contents.  It's important not to
 126    try to follow HTML obtained by following e.g. <img src="...">
 127    regardless of content-type.  Doing this causes infinite loops for
 128    "images" that return non-404 error pages with links to the same
 129    image.  */
 130 #define ATTR_HTML       2
 131
 132 /* For tags handled by tag_find_urls: attributes that contain URLs to
 133    download. */
 134 static struct {
 135   int tagid;
 136   const char *attr_name;
 137   int flags;
 138 } tag_url_attributes[] = {
 139   { TAG_A,              "href",         ATTR_HTML },
 140   { TAG_APPLET,         "code",         ATTR_INLINE },
 141   { TAG_AREA,           "href",         ATTR_HTML },
 142   { TAG_BGSOUND,        "src",          ATTR_INLINE },
 143   { TAG_BODY,           "background",   ATTR_INLINE },
 144   { TAG_EMBED,          "href",         ATTR_HTML },
 145   { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
 146   { TAG_FIG,            "src",          ATTR_INLINE },
 147   { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
 148   { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
 149   { TAG_IMG,            "href",         ATTR_INLINE },
 150   { TAG_IMG,            "lowsrc",       ATTR_INLINE },
 151   { TAG_IMG,            "src",          ATTR_INLINE },
 152   { TAG_INPUT,          "src",          ATTR_INLINE },
 153   { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
 154   { TAG_OBJECT,         "data",         ATTR_INLINE },
 155   { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
 156   { TAG_SCRIPT,         "src",          ATTR_INLINE },
 157   { TAG_TABLE,          "background",   ATTR_INLINE },
 158   { TAG_TD,             "background",   ATTR_INLINE },
 159   { TAG_TH,             "background",   ATTR_INLINE }
 160 };
 161
 162 /* The lists of interesting tags and attributes are built dynamically,
 163    from the information above.  However, some places in the code refer
 164    to the attributes not mentioned here.  We add them manually.  */
 165 static const char *additional_attributes[] = {
 166   "rel",                        /* used by tag_handle_link  */
 167   "http-equiv",                 /* used by tag_handle_meta  */
 168   "name",                       /* used by tag_handle_meta  */
 169   "content",                    /* used by tag_handle_meta  */
 170   "action",                     /* used by tag_handle_form  */
 171   "style"                       /* used by check_style_attr */
 172 };
 173
 174 static struct hash_table *interesting_tags;
 175 static struct hash_table *interesting_attributes;
 176
 177 /* Will contains the (last) charset found in 'http-equiv=content-type'
 178    meta tags  */
 179 static char *meta_charset;
 180
 181 static void
 182 init_interesting (void)
 183 {
 184   /* Init the variables interesting_tags and interesting_attributes
 185      that are used by the HTML parser to know which tags and
 186      attributes we're interested in.  We initialize this only once,
 187      for performance reasons.
 188
 189      Here we also make sure that what we put in interesting_tags
 190      matches the user's preferences as specified through --ignore-tags
 191      and --follow-tags.  */
 192
 193   size_t i;
 194   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 195
 196   /* First, add all the tags we know hot to handle, mapped to their
 197      respective entries in known_tags.  */
 198   for (i = 0; i < countof (known_tags); i++)
 199     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 200
 201   /* Then remove the tags ignored through --ignore-tags.  */
 202   if (opt.ignore_tags)
 203     {
 204       char **ignored;
 205       for (ignored = opt.ignore_tags; *ignored; ignored++)
 206         hash_table_remove (interesting_tags, *ignored);
 207     }
 208
 209   /* If --follow-tags is specified, use only those tags.  */
 210   if (opt.follow_tags)
 211     {
 212       /* Create a new table intersecting --follow-tags and known_tags,
 213          and use it as interesting_tags.  */
 214       struct hash_table *intersect = make_nocase_string_hash_table (0);
 215       char **followed;
 216       for (followed = opt.follow_tags; *followed; followed++)
 217         {
 218           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 219           if (!t)
 220             continue;           /* ignore unknown --follow-tags entries. */
 221           hash_table_put (intersect, *followed, t);
 222         }
 223       hash_table_destroy (interesting_tags);
 224       interesting_tags = intersect;
 225     }
 226
 227   /* Add the attributes we care about. */
 228   interesting_attributes = make_nocase_string_hash_table (10);
 229   for (i = 0; i < countof (additional_attributes); i++)
 230     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 231   for (i = 0; i < countof (tag_url_attributes); i++)
 232     hash_table_put (interesting_attributes,
 233                     tag_url_attributes[i].attr_name, "1");
 234 }
 235
 236 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 237    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 238    index of the attribute in TAG will be stored there.  */
 239
 240 static char *
 241 find_attr (struct taginfo *tag, const char *name, int *attrind)
 242 {
 243   int i;
 244   for (i = 0; i < tag->nattrs; i++)
 245     if (!strcasecmp (tag->attrs[i].name, name))
 246       {
 247         if (attrind)
 248           *attrind = i;
 249         return tag->attrs[i].value;
 250       }
 251   return NULL;
 252 }
 253
 254 /* used for calls to append_url */
 255 #define ATTR_POS(tag, attrind, ctx) \
 256  (tag->attrs[attrind].value_raw_beginning - ctx->text)
 257 #define ATTR_SIZE(tag, attrind) \
 258  (tag->attrs[attrind].value_raw_size)
 259
 260 /* Append LINK_URI to the urlpos structure that is being built.
 261
 262    LINK_URI will be merged with the current document base.
 263 */
 264
 265 struct urlpos *
 266 append_url (const char *link_uri, int position, int size,
 267             struct map_context *ctx)
 268 {
 269   int link_has_scheme = url_has_scheme (link_uri);
 270   struct urlpos *newel;
 271   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 272   struct url *url;
 273
 274   if (!base)
 275     {
 276       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 277                ctx->document_file, link_uri));
 278
 279       if (!link_has_scheme)
 280         {
 281           /* Base URL is unavailable, and the link does not have a
 282              location attached to it -- we have to give up.  Since
 283              this can only happen when using `--force-html -i', print
 284              a warning.  */
 285           logprintf (LOG_NOTQUIET,
 286                      _("%s: Cannot resolve incomplete link %s.\n"),
 287                      ctx->document_file, link_uri);
 288           return NULL;
 289         }
 290
 291       url = url_parse (link_uri, NULL, NULL);
 292       if (!url)
 293         {
 294           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 295                    ctx->document_file, link_uri));
 296           return NULL;
 297         }
 298     }
 299   else
 300     {
 301       /* Merge BASE with LINK_URI, but also make sure the result is
 302          canonicalized, i.e. that "../" have been resolved.
 303          (parse_url will do that for us.) */
 304
 305       char *complete_uri = uri_merge (base, link_uri);
 306
 307       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 308                ctx->document_file, base, link_uri, complete_uri));
 309
 310       url = url_parse (complete_uri, NULL, NULL);
 311       if (!url)
 312         {
 313           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 314                    ctx->document_file, complete_uri));
 315           xfree (complete_uri);
 316           return NULL;
 317         }
 318       xfree (complete_uri);
 319     }
 320
 321   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 322
 323   newel = xnew0 (struct urlpos);
 324   newel->url = url;
 325   newel->pos = position;
 326   newel->size = size;
 327
 328   /* A URL is relative if the host is not named, and the name does not
 329      start with `/'.  */
 330   if (!link_has_scheme && *link_uri != '/')
 331     newel->link_relative_p = 1;
 332   else if (link_has_scheme)
 333     newel->link_complete_p = 1;
 334
 335   if (ctx->tail)
 336     {
 337       ctx->tail->next = newel;
 338       ctx->tail = newel;
 339     }
 340   else
 341     ctx->tail = ctx->head = newel;
 342
 343   return newel;
 344 }
 345 \f
 346 static void
 347 check_style_attr (struct taginfo *tag, struct map_context *ctx)
 348 {
 349   int attrind;
 350   char *style = find_attr (tag, "style", &attrind);
 351   if (!style)
 352     return;
 353
 354   /* raw pos and raw size include the quotes, hence the +1 -2 */
 355   get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2);
 356 }
 357
 358 /* All the tag_* functions are called from collect_tags_mapper, as
 359    specified by KNOWN_TAGS.  */
 360
 361 /* Default tag handler: collect URLs from attributes specified for
 362    this tag by tag_url_attributes.  */
 363
 364 static void
 365 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 366 {
 367   size_t i;
 368   int attrind;
 369   int first = -1;
 370
 371   for (i = 0; i < countof (tag_url_attributes); i++)
 372     if (tag_url_attributes[i].tagid == tagid)
 373       {
 374         /* We've found the index of tag_url_attributes where the
 375            attributes of our tag begin.  */
 376         first = i;
 377         break;
 378       }
 379   assert (first != -1);
 380
 381   /* Loop over the "interesting" attributes of this tag.  In this
 382      example, it will loop over "src" and "lowsrc".
 383
 384        <img src="foo.png" lowsrc="bar.png">
 385
 386      This has to be done in the outer loop so that the attributes are
 387      processed in the same order in which they appear in the page.
 388      This is required when converting links.  */
 389
 390   for (attrind = 0; attrind < tag->nattrs; attrind++)
 391     {
 392       /* Find whether TAG/ATTRIND is a combination that contains a
 393          URL. */
 394       char *link = tag->attrs[attrind].value;
 395       const size_t size = countof (tag_url_attributes);
 396
 397       /* If you're cringing at the inefficiency of the nested loops,
 398          remember that they both iterate over a very small number of
 399          items.  The worst-case inner loop is for the IMG tag, which
 400          has three attributes.  */
 401       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 402         {
 403           if (0 == strcasecmp (tag->attrs[attrind].name,
 404                                tag_url_attributes[i].attr_name))
 405             {
 406               struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
 407                                               ATTR_SIZE(tag,attrind), ctx);
 408               if (up)
 409                 {
 410                   int flags = tag_url_attributes[i].flags;
 411                   if (flags & ATTR_INLINE)
 412                     up->link_inline_p = 1;
 413                   if (flags & ATTR_HTML)
 414                     up->link_expect_html = 1;
 415                 }
 416             }
 417         }
 418     }
 419 }
 420
 421 /* Handle the BASE tag, for <base href=...>. */
 422
 423 static void
 424 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 425 {
 426   struct urlpos *base_urlpos;
 427   int attrind;
 428   char *newbase = find_attr (tag, "href", &attrind);
 429   if (!newbase)
 430     return;
 431
 432   base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
 433                             ATTR_SIZE(tag,attrind), ctx);
 434   if (!base_urlpos)
 435     return;
 436   base_urlpos->ignore_when_downloading = 1;
 437   base_urlpos->link_base_p = 1;
 438
 439   if (ctx->base)
 440     xfree (ctx->base);
 441   if (ctx->parent_base)
 442     ctx->base = uri_merge (ctx->parent_base, newbase);
 443   else
 444     ctx->base = xstrdup (newbase);
 445 }
 446
 447 /* Mark the URL found in <form action=...> for conversion. */
 448
 449 static void
 450 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 451 {
 452   int attrind;
 453   char *action = find_attr (tag, "action", &attrind);
 454
 455   if (action)
 456     {
 457       struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
 458                                       ATTR_SIZE(tag,attrind), ctx);
 459       if (up)
 460         up->ignore_when_downloading = 1;
 461     }
 462 }
 463
 464 /* Handle the LINK tag.  It requires special handling because how its
 465    links will be followed in -p mode depends on the REL attribute.  */
 466
 467 static void
 468 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 469 {
 470   int attrind;
 471   char *href = find_attr (tag, "href", &attrind);
 472
 473   /* All <link href="..."> link references are external, except those
 474      known not to be, such as style sheet and shortcut icon:
 475
 476        <link rel="stylesheet" href="...">
 477        <link rel="shortcut icon" href="...">
 478   */
 479   if (href)
 480     {
 481       struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
 482                                       ATTR_SIZE(tag,attrind), ctx);
 483       if (up)
 484         {
 485           char *rel = find_attr (tag, "rel", NULL);
 486           if (rel)
 487             {
 488               if (0 == strcasecmp (rel, "stylesheet"))
 489                 {
 490                   up->link_inline_p = 1;
 491                   up->link_expect_css = 1;
 492                 }
 493               else if (0 == strcasecmp (rel, "shortcut icon"))
 494                 {
 495                   up->link_inline_p = 1;
 496                 }
 497             }
 498           else
 499             /* The external ones usually point to HTML pages, such as
 500                <link rel="next" href="..."> */
 501             up->link_expect_html = 1;
 502         }
 503     }
 504 }
 505
 506 /* Handle the META tag.  This requires special handling because of the
 507    refresh feature and because of robot exclusion.  */
 508
 509 static void
 510 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 511 {
 512   char *name = find_attr (tag, "name", NULL);
 513   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 514
 515   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 516     {
 517       /* Some pages use a META tag to specify that the page be
 518          refreshed by a new page after a given number of seconds.  The
 519          general format for this is:
 520
 521            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 522
 523          So we just need to skip past the "NUMBER; URL=" garbage to
 524          get to the URL.  */
 525
 526       struct urlpos *entry;
 527       int attrind;
 528       int timeout = 0;
 529       char *p;
 530
 531       char *refresh = find_attr (tag, "content", &attrind);
 532       if (!refresh)
 533         return;
 534
 535       for (p = refresh; c_isdigit (*p); p++)
 536         timeout = 10 * timeout + *p - '0';
 537       if (*p++ != ';')
 538         return;
 539
 540       while (c_isspace (*p))
 541         ++p;
 542       if (!(   c_toupper (*p)       == 'U'
 543             && c_toupper (*(p + 1)) == 'R'
 544             && c_toupper (*(p + 2)) == 'L'
 545             &&          *(p + 3)  == '='))
 546         return;
 547       p += 4;
 548       while (c_isspace (*p))
 549         ++p;
 550
 551       entry = append_url (p, ATTR_POS(tag,attrind,ctx),
 552                           ATTR_SIZE(tag,attrind), ctx);
 553       if (entry)
 554         {
 555           entry->link_refresh_p = 1;
 556           entry->refresh_timeout = timeout;
 557           entry->link_expect_html = 1;
 558         }
 559     }
 560   else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
 561     {
 562       /* Handle stuff like:
 563          <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
 564
 565       char *mcharset;
 566       char *content = find_attr (tag, "content", NULL);
 567       if (!content)
 568         return;
 569
 570       mcharset = parse_charset (content);
 571       if (!mcharset)
 572         return;
 573
 574       /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
 575       xfree_null (meta_charset);
 576       meta_charset = mcharset;
 577     }
 578   else if (name && 0 == strcasecmp (name, "robots"))
 579     {
 580       /* Handle stuff like:
 581          <meta name="robots" content="index,nofollow"> */
 582       char *content = find_attr (tag, "content", NULL);
 583       if (!content)
 584         return;
 585       if (!strcasecmp (content, "none"))
 586         ctx->nofollow = true;
 587       else
 588         {
 589           while (*content)
 590             {
 591               /* Find the next occurrence of ',' or the end of
 592                  the string.  */
 593               char *end = strchr (content, ',');
 594               if (end)
 595                 ++end;
 596               else
 597                 end = content + strlen (content);
 598               if (!strncasecmp (content, "nofollow", end - content))
 599                 ctx->nofollow = true;
 600               content = end;
 601             }
 602         }
 603     }
 604 }
 605
 606 /* Dispatch the tag handler appropriate for the tag we're mapping
 607    over.  See known_tags[] for definition of tag handlers.  */
 608
 609 static void
 610 collect_tags_mapper (struct taginfo *tag, void *arg)
 611 {
 612   struct map_context *ctx = (struct map_context *)arg;
 613
 614   /* Find the tag in our table of tags.  This must not fail because
 615      map_html_tags only returns tags found in interesting_tags.
 616
 617      I've changed this for now, I'm passing NULL as interesting_tags
 618      to map_html_tags.  This way we can check all tags for a style
 619      attribute.
 620   */
 621   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 622
 623   if (t != NULL)
 624     t->handler (t->tagid, tag, ctx);
 625
 626   check_style_attr (tag, ctx);
 627
 628   if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
 629       tag->contents_begin && tag->contents_end)
 630   {
 631     /* parse contents */
 632     get_urls_css (ctx, tag->contents_begin - ctx->text,
 633                   tag->contents_end - tag->contents_begin);
 634   }
 635 }
 636 \f
 637 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 638    it.  It merges relative links in FILE with URL.  It is aware of
 639    <base href=...> and does the right thing.  */
 640
 641 struct urlpos *
 642 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
 643                struct iri *iri)
 644 {
 645   struct file_memory *fm;
 646   struct map_context ctx;
 647   int flags;
 648
 649   /* Load the file. */
 650   fm = read_file (file);
 651   if (!fm)
 652     {
 653       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 654       return NULL;
 655     }
 656   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 657
 658   ctx.text = fm->content;
 659   ctx.head = ctx.tail = NULL;
 660   ctx.base = NULL;
 661   ctx.parent_base = url ? url : opt.base_href;
 662   ctx.document_file = file;
 663   ctx.nofollow = false;
 664
 665   if (!interesting_tags)
 666     init_interesting ();
 667
 668   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 669      generate <a href=" foo"> instead of <a href="foo"> (browsers
 670      ignore spaces as well.)  If you really mean space, use &32; or
 671      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 672      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 673      ignored by IE and Mozilla and are presumably introduced by
 674      writing HTML with editors that force word wrap.  */
 675   flags = MHT_TRIM_VALUES;
 676   if (opt.strict_comments)
 677     flags |= MHT_STRICT_COMMENTS;
 678
 679   /* the NULL here used to be interesting_tags */
 680   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 681                  NULL, interesting_attributes);
 682
 683   /* If meta charset isn't null, override content encoding */
 684   if (iri && meta_charset)
 685     set_content_encoding (iri, meta_charset);
 686
 687   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 688   if (meta_disallow_follow)
 689     *meta_disallow_follow = ctx.nofollow;
 690
 691   xfree_null (ctx.base);
 692   read_file_free (fm);
 693   return ctx.head;
 694 }
 695
 696 /* This doesn't really have anything to do with HTML, but it's similar
 697    to get_urls_html, so we put it here.  */
 698
 699 struct urlpos *
 700 get_urls_file (const char *file)
 701 {
 702   struct file_memory *fm;
 703   struct urlpos *head, *tail;
 704   const char *text, *text_end;
 705
 706   /* Load the file.  */
 707   fm = read_file (file);
 708   if (!fm)
 709     {
 710       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 711       return NULL;
 712     }
 713   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 714
 715   head = tail = NULL;
 716   text = fm->content;
 717   text_end = fm->content + fm->length;
 718   while (text < text_end)
 719     {
 720       int up_error_code;
 721       char *url_text;
 722       struct urlpos *entry;
 723       struct url *url;
 724
 725       const char *line_beg = text;
 726       const char *line_end = memchr (text, '\n', text_end - text);
 727       if (!line_end)
 728         line_end = text_end;
 729       else
 730         ++line_end;
 731       text = line_end;
 732
 733       /* Strip whitespace from the beginning and end of line. */
 734       while (line_beg < line_end && c_isspace (*line_beg))
 735         ++line_beg;
 736       while (line_end > line_beg && c_isspace (*(line_end - 1)))
 737         --line_end;
 738
 739       if (line_beg == line_end)
 740         continue;
 741
 742       /* The URL is in the [line_beg, line_end) region. */
 743
 744       /* We must copy the URL to a zero-terminated string, and we
 745          can't use alloca because we're in a loop.  *sigh*.  */
 746       url_text = strdupdelim (line_beg, line_end);
 747
 748       if (opt.base_href)
 749         {
 750           /* Merge opt.base_href with URL. */
 751           char *merged = uri_merge (opt.base_href, url_text);
 752           xfree (url_text);
 753           url_text = merged;
 754         }
 755
 756       url = url_parse (url_text, &up_error_code, NULL);
 757       if (!url)
 758         {
 759           char *error = url_error (url_text, up_error_code);
 760           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 761                      file, url_text, error);
 762           xfree (url_text);
 763           xfree (error);
 764           continue;
 765         }
 766       xfree (url_text);
 767
 768       entry = xnew0 (struct urlpos);
 769       entry->url = url;
 770
 771       if (!head)
 772         head = entry;
 773       else
 774         tail->next = entry;
 775       tail = entry;
 776     }
 777   read_file_free (fm);
 778   return head;
 779 }
 780
 781 void
 782 cleanup_html_url (void)
 783 {
 784   /* Destroy the hash tables.  The hash table keys and values are not
 785      allocated by this code, so we don't need to free them here.  */
 786   if (interesting_tags)
 787     hash_table_destroy (interesting_tags);
 788   if (interesting_attributes)
 789     hash_table_destroy (interesting_attributes);
 790 }