sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   3    2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <limits.h>
#include <assert.h>
  38
  39 #include "html-parse.h"
  40 #include "url.h"
  41 #include "utils.h"
  42 #include "hash.h"
  43 #include "convert.h"
  44 #include "recur.h"
  45 #include "html-url.h"
  46 #include "css-url.h"
  47
  48 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
  49
  50 #define DECLARE_TAG_HANDLER(fun)                                \
  51   static void fun (int, struct taginfo *, struct map_context *)
  52
  53 DECLARE_TAG_HANDLER (tag_find_urls);
  54 DECLARE_TAG_HANDLER (tag_handle_base);
  55 DECLARE_TAG_HANDLER (tag_handle_form);
  56 DECLARE_TAG_HANDLER (tag_handle_link);
  57 DECLARE_TAG_HANDLER (tag_handle_meta);
  58
  59 enum {
  60   TAG_A,
  61   TAG_APPLET,
  62   TAG_AREA,
  63   TAG_BASE,
  64   TAG_BGSOUND,
  65   TAG_BODY,
  66   TAG_EMBED,
  67   TAG_FIG,
  68   TAG_FORM,
  69   TAG_FRAME,
  70   TAG_IFRAME,
  71   TAG_IMG,
  72   TAG_INPUT,
  73   TAG_LAYER,
  74   TAG_LINK,
  75   TAG_META,
  76   TAG_OBJECT,
  77   TAG_OVERLAY,
  78   TAG_SCRIPT,
  79   TAG_TABLE,
  80   TAG_TD,
  81   TAG_TH
  82 };
  83
  84 /* The list of known tags and functions used for handling them.  Most
  85    tags are simply harvested for URLs. */
  86 static struct known_tag {
  87   int tagid;
  88   const char *name;
  89   tag_handler_t handler;
  90 } known_tags[] = {
  91   { TAG_A,       "a",           tag_find_urls },
  92   { TAG_APPLET,  "applet",      tag_find_urls },
  93   { TAG_AREA,    "area",        tag_find_urls },
  94   { TAG_BASE,    "base",        tag_handle_base },
  95   { TAG_BGSOUND, "bgsound",     tag_find_urls },
  96   { TAG_BODY,    "body",        tag_find_urls },
  97   { TAG_EMBED,   "embed",       tag_find_urls },
  98   { TAG_FIG,     "fig",         tag_find_urls },
  99   { TAG_FORM,    "form",        tag_handle_form },
 100   { TAG_FRAME,   "frame",       tag_find_urls },
 101   { TAG_IFRAME,  "iframe",      tag_find_urls },
 102   { TAG_IMG,     "img",         tag_find_urls },
 103   { TAG_INPUT,   "input",       tag_find_urls },
 104   { TAG_LAYER,   "layer",       tag_find_urls },
 105   { TAG_LINK,    "link",        tag_handle_link },
 106   { TAG_META,    "meta",        tag_handle_meta },
 107   { TAG_OBJECT,  "object",      tag_find_urls },
 108   { TAG_OVERLAY, "overlay",     tag_find_urls },
 109   { TAG_SCRIPT,  "script",      tag_find_urls },
 110   { TAG_TABLE,   "table",       tag_find_urls },
 111   { TAG_TD,      "td",          tag_find_urls },
 112   { TAG_TH,      "th",          tag_find_urls }
 113 };
 114
 115 /* tag_url_attributes documents which attributes of which tags contain
 116    URLs to harvest.  It is used by tag_find_urls.  */
 117
 118 /* Defines for the FLAGS. */
 119
 120 /* The link is "inline", i.e. needs to be retrieved for this document
 121    to be correctly rendered.  Inline links include inlined images,
 122    stylesheets, children frames, etc.  */
 123 #define ATTR_INLINE     1
 124
 125 /* The link is expected to yield HTML contents.  It's important not to
 126    try to follow HTML obtained by following e.g. <img src="...">
 127    regardless of content-type.  Doing this causes infinite loops for
 128    "images" that return non-404 error pages with links to the same
 129    image.  */
 130 #define ATTR_HTML       2
 131
 132 /* For tags handled by tag_find_urls: attributes that contain URLs to
 133    download. */
 134 static struct {
 135   int tagid;
 136   const char *attr_name;
 137   int flags;
 138 } tag_url_attributes[] = {
 139   { TAG_A,              "href",         ATTR_HTML },
 140   { TAG_APPLET,         "code",         ATTR_INLINE },
 141   { TAG_AREA,           "href",         ATTR_HTML },
 142   { TAG_BGSOUND,        "src",          ATTR_INLINE },
 143   { TAG_BODY,           "background",   ATTR_INLINE },
 144   { TAG_EMBED,          "href",         ATTR_HTML },
 145   { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
 146   { TAG_FIG,            "src",          ATTR_INLINE },
 147   { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
 148   { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
 149   { TAG_IMG,            "href",         ATTR_INLINE },
 150   { TAG_IMG,            "lowsrc",       ATTR_INLINE },
 151   { TAG_IMG,            "src",          ATTR_INLINE },
 152   { TAG_INPUT,          "src",          ATTR_INLINE },
 153   { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
 154   { TAG_OBJECT,         "data",         ATTR_INLINE },
 155   { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
 156   { TAG_SCRIPT,         "src",          ATTR_INLINE },
 157   { TAG_TABLE,          "background",   ATTR_INLINE },
 158   { TAG_TD,             "background",   ATTR_INLINE },
 159   { TAG_TH,             "background",   ATTR_INLINE }
 160 };
 161
 162 /* The lists of interesting tags and attributes are built dynamically,
 163    from the information above.  However, some places in the code refer
 164    to the attributes not mentioned here.  We add them manually.  */
 165 static const char *additional_attributes[] = {
 166   "rel",                        /* used by tag_handle_link  */
 167   "http-equiv",                 /* used by tag_handle_meta  */
 168   "name",                       /* used by tag_handle_meta  */
 169   "content",                    /* used by tag_handle_meta  */
 170   "action",                     /* used by tag_handle_form  */
 171   "style"                       /* used by check_style_attr */
 172 };
 173
 174 static struct hash_table *interesting_tags;
 175 static struct hash_table *interesting_attributes;
 176
 177 /* Will contains the (last) charset found in 'http-equiv=content-type'
 178    meta tags  */
 179 static char *meta_charset;
 180
 181 static void
 182 init_interesting (void)
 183 {
 184   /* Init the variables interesting_tags and interesting_attributes
 185      that are used by the HTML parser to know which tags and
 186      attributes we're interested in.  We initialize this only once,
 187      for performance reasons.
 188
 189      Here we also make sure that what we put in interesting_tags
 190      matches the user's preferences as specified through --ignore-tags
 191      and --follow-tags.  */
 192
 193   size_t i;
 194   interesting_tags = make_nocase_string_hash_table (countof (known_tags));
 195
 196   /* First, add all the tags we know hot to handle, mapped to their
 197      respective entries in known_tags.  */
 198   for (i = 0; i < countof (known_tags); i++)
 199     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 200
 201   /* Then remove the tags ignored through --ignore-tags.  */
 202   if (opt.ignore_tags)
 203     {
 204       char **ignored;
 205       for (ignored = opt.ignore_tags; *ignored; ignored++)
 206         hash_table_remove (interesting_tags, *ignored);
 207     }
 208
 209   /* If --follow-tags is specified, use only those tags.  */
 210   if (opt.follow_tags)
 211     {
 212       /* Create a new table intersecting --follow-tags and known_tags,
 213          and use it as interesting_tags.  */
 214       struct hash_table *intersect = make_nocase_string_hash_table (0);
 215       char **followed;
 216       for (followed = opt.follow_tags; *followed; followed++)
 217         {
 218           struct known_tag *t = hash_table_get (interesting_tags, *followed);
 219           if (!t)
 220             continue;           /* ignore unknown --follow-tags entries. */
 221           hash_table_put (intersect, *followed, t);
 222         }
 223       hash_table_destroy (interesting_tags);
 224       interesting_tags = intersect;
 225     }
 226
 227   /* Add the attributes we care about. */
 228   interesting_attributes = make_nocase_string_hash_table (10);
 229   for (i = 0; i < countof (additional_attributes); i++)
 230     hash_table_put (interesting_attributes, additional_attributes[i], "1");
 231   for (i = 0; i < countof (tag_url_attributes); i++)
 232     hash_table_put (interesting_attributes,
 233                     tag_url_attributes[i].attr_name, "1");
 234 }
 235
 236 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 237    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 238    index of the attribute in TAG will be stored there.  */
 239
 240 static char *
 241 find_attr (struct taginfo *tag, const char *name, int *attrind)
 242 {
 243   int i;
 244   for (i = 0; i < tag->nattrs; i++)
 245     if (!strcasecmp (tag->attrs[i].name, name))
 246       {
 247         if (attrind)
 248           *attrind = i;
 249         return tag->attrs[i].value;
 250       }
 251   return NULL;
 252 }
 253
 254 /* used for calls to append_url */
 255 #define ATTR_POS(tag, attrind, ctx) \
 256  (tag->attrs[attrind].value_raw_beginning - ctx->text)
 257 #define ATTR_SIZE(tag, attrind) \
 258  (tag->attrs[attrind].value_raw_size)
 259
 260 /* Append LINK_URI to the urlpos structure that is being built.
 261
 262    LINK_URI will be merged with the current document base.
 263 */
 264
 265 struct urlpos *
 266 append_url (const char *link_uri, int position, int size,
 267             struct map_context *ctx)
 268 {
 269   int link_has_scheme = url_has_scheme (link_uri);
 270   struct urlpos *newel;
 271   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 272   struct url *url;
 273
 274   if (!base)
 275     {
 276       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 277                ctx->document_file, link_uri));
 278
 279       if (!link_has_scheme)
 280         {
 281           /* Base URL is unavailable, and the link does not have a
 282              location attached to it -- we have to give up.  Since
 283              this can only happen when using `--force-html -i', print
 284              a warning.  */
 285           logprintf (LOG_NOTQUIET,
 286                      _("%s: Cannot resolve incomplete link %s.\n"),
 287                      ctx->document_file, link_uri);
 288           return NULL;
 289         }
 290
 291       url = url_parse (link_uri, NULL, NULL, false);
 292       if (!url)
 293         {
 294           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 295                    ctx->document_file, link_uri));
 296           return NULL;
 297         }
 298     }
 299   else
 300     {
 301       /* Merge BASE with LINK_URI, but also make sure the result is
 302          canonicalized, i.e. that "../" have been resolved.
 303          (parse_url will do that for us.) */
 304
 305       char *complete_uri = uri_merge (base, link_uri);
 306
 307       DEBUGP (("%s: merge(%s, %s) -> %s\n",
 308                quotearg_n_style (0, escape_quoting_style, ctx->document_file),
 309                quote_n (1, base),
 310                quote_n (2, link_uri),
 311                quotearg_n_style (3, escape_quoting_style, complete_uri)));
 312
 313       url = url_parse (complete_uri, NULL, NULL, false);
 314       if (!url)
 315         {
 316           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 317                    ctx->document_file, complete_uri));
 318           xfree (complete_uri);
 319           return NULL;
 320         }
 321       xfree (complete_uri);
 322     }
 323
 324   DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
 325
 326   newel = xnew0 (struct urlpos);
 327   newel->url = url;
 328   newel->pos = position;
 329   newel->size = size;
 330
 331   /* A URL is relative if the host is not named, and the name does not
 332      start with `/'.  */
 333   if (!link_has_scheme && *link_uri != '/')
 334     newel->link_relative_p = 1;
 335   else if (link_has_scheme)
 336     newel->link_complete_p = 1;
 337
 338   if (ctx->tail)
 339     {
 340       ctx->tail->next = newel;
 341       ctx->tail = newel;
 342     }
 343   else
 344     ctx->tail = ctx->head = newel;
 345
 346   return newel;
 347 }
 348 \f
 349 static void
 350 check_style_attr (struct taginfo *tag, struct map_context *ctx)
 351 {
 352   int attrind;
 353   char *style = find_attr (tag, "style", &attrind);
 354   if (!style)
 355     return;
 356
 357   /* raw pos and raw size include the quotes, hence the +1 -2 */
 358   get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2);
 359 }
 360
 361 /* All the tag_* functions are called from collect_tags_mapper, as
 362    specified by KNOWN_TAGS.  */
 363
 364 /* Default tag handler: collect URLs from attributes specified for
 365    this tag by tag_url_attributes.  */
 366
 367 static void
 368 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 369 {
 370   size_t i;
 371   int attrind;
 372   int first = -1;
 373
 374   for (i = 0; i < countof (tag_url_attributes); i++)
 375     if (tag_url_attributes[i].tagid == tagid)
 376       {
 377         /* We've found the index of tag_url_attributes where the
 378            attributes of our tag begin.  */
 379         first = i;
 380         break;
 381       }
 382   assert (first != -1);
 383
 384   /* Loop over the "interesting" attributes of this tag.  In this
 385      example, it will loop over "src" and "lowsrc".
 386
 387        <img src="foo.png" lowsrc="bar.png">
 388
 389      This has to be done in the outer loop so that the attributes are
 390      processed in the same order in which they appear in the page.
 391      This is required when converting links.  */
 392
 393   for (attrind = 0; attrind < tag->nattrs; attrind++)
 394     {
 395       /* Find whether TAG/ATTRIND is a combination that contains a
 396          URL. */
 397       char *link = tag->attrs[attrind].value;
 398       const size_t size = countof (tag_url_attributes);
 399
 400       /* If you're cringing at the inefficiency of the nested loops,
 401          remember that they both iterate over a very small number of
 402          items.  The worst-case inner loop is for the IMG tag, which
 403          has three attributes.  */
 404       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 405         {
 406           if (0 == strcasecmp (tag->attrs[attrind].name,
 407                                tag_url_attributes[i].attr_name))
 408             {
 409               struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
 410                                               ATTR_SIZE(tag,attrind), ctx);
 411               if (up)
 412                 {
 413                   int flags = tag_url_attributes[i].flags;
 414                   if (flags & ATTR_INLINE)
 415                     up->link_inline_p = 1;
 416                   if (flags & ATTR_HTML)
 417                     up->link_expect_html = 1;
 418                 }
 419             }
 420         }
 421     }
 422 }
 423
 424 /* Handle the BASE tag, for <base href=...>. */
 425
 426 static void
 427 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 428 {
 429   struct urlpos *base_urlpos;
 430   int attrind;
 431   char *newbase = find_attr (tag, "href", &attrind);
 432   if (!newbase)
 433     return;
 434
 435   base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
 436                             ATTR_SIZE(tag,attrind), ctx);
 437   if (!base_urlpos)
 438     return;
 439   base_urlpos->ignore_when_downloading = 1;
 440   base_urlpos->link_base_p = 1;
 441
 442   if (ctx->base)
 443     xfree (ctx->base);
 444   if (ctx->parent_base)
 445     ctx->base = uri_merge (ctx->parent_base, newbase);
 446   else
 447     ctx->base = xstrdup (newbase);
 448 }
 449
 450 /* Mark the URL found in <form action=...> for conversion. */
 451
 452 static void
 453 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 454 {
 455   int attrind;
 456   char *action = find_attr (tag, "action", &attrind);
 457
 458   if (action)
 459     {
 460       struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
 461                                       ATTR_SIZE(tag,attrind), ctx);
 462       if (up)
 463         up->ignore_when_downloading = 1;
 464     }
 465 }
 466
 467 /* Handle the LINK tag.  It requires special handling because how its
 468    links will be followed in -p mode depends on the REL attribute.  */
 469
 470 static void
 471 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 472 {
 473   int attrind;
 474   char *href = find_attr (tag, "href", &attrind);
 475
 476   /* All <link href="..."> link references are external, except those
 477      known not to be, such as style sheet and shortcut icon:
 478
 479        <link rel="stylesheet" href="...">
 480        <link rel="shortcut icon" href="...">
 481   */
 482   if (href)
 483     {
 484       struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
 485                                       ATTR_SIZE(tag,attrind), ctx);
 486       if (up)
 487         {
 488           char *rel = find_attr (tag, "rel", NULL);
 489           if (rel)
 490             {
 491               if (0 == strcasecmp (rel, "stylesheet"))
 492                 {
 493                   up->link_inline_p = 1;
 494                   up->link_expect_css = 1;
 495                 }
 496               else if (0 == strcasecmp (rel, "shortcut icon"))
 497                 {
 498                   up->link_inline_p = 1;
 499                 }
 500             }
 501           else
 502             /* The external ones usually point to HTML pages, such as
 503                <link rel="next" href="..."> */
 504             up->link_expect_html = 1;
 505         }
 506     }
 507 }
 508
 509 /* Handle the META tag.  This requires special handling because of the
 510    refresh feature and because of robot exclusion.  */
 511
 512 static void
 513 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 514 {
 515   char *name = find_attr (tag, "name", NULL);
 516   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 517
 518   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 519     {
 520       /* Some pages use a META tag to specify that the page be
 521          refreshed by a new page after a given number of seconds.  The
 522          general format for this is:
 523
 524            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 525
 526          So we just need to skip past the "NUMBER; URL=" garbage to
 527          get to the URL.  */
 528
 529       struct urlpos *entry;
 530       int attrind;
 531       int timeout = 0;
 532       char *p;
 533
 534       char *refresh = find_attr (tag, "content", &attrind);
 535       if (!refresh)
 536         return;
 537
 538       for (p = refresh; c_isdigit (*p); p++)
 539         timeout = 10 * timeout + *p - '0';
 540       if (*p++ != ';')
 541         return;
 542
 543       while (c_isspace (*p))
 544         ++p;
 545       if (!(   c_toupper (*p)       == 'U'
 546             && c_toupper (*(p + 1)) == 'R'
 547             && c_toupper (*(p + 2)) == 'L'
 548             &&          *(p + 3)  == '='))
 549         return;
 550       p += 4;
 551       while (c_isspace (*p))
 552         ++p;
 553
 554       entry = append_url (p, ATTR_POS(tag,attrind,ctx),
 555                           ATTR_SIZE(tag,attrind), ctx);
 556       if (entry)
 557         {
 558           entry->link_refresh_p = 1;
 559           entry->refresh_timeout = timeout;
 560           entry->link_expect_html = 1;
 561         }
 562     }
 563   else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
 564     {
 565       /* Handle stuff like:
 566          <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
 567
 568       char *mcharset;
 569       char *content = find_attr (tag, "content", NULL);
 570       if (!content)
 571         return;
 572
 573       mcharset = parse_charset (content);
 574       if (!mcharset)
 575         return;
 576
 577       xfree_null (meta_charset);
 578       meta_charset = mcharset;
 579     }
 580   else if (name && 0 == strcasecmp (name, "robots"))
 581     {
 582       /* Handle stuff like:
 583          <meta name="robots" content="index,nofollow"> */
 584       char *content = find_attr (tag, "content", NULL);
 585       if (!content)
 586         return;
 587       if (!strcasecmp (content, "none"))
 588         ctx->nofollow = true;
 589       else
 590         {
 591           while (*content)
 592             {
 593               char *end;
 594               /* Skip any initial whitespace. */
 595               content += strspn (content, " \f\n\r\t\v");
 596               /* Find the next occurrence of ',' or whitespace,
 597                * or the end of the string.  */
 598               end = content + strcspn (content, ", \f\n\r\t\v");
 599               if (!strncasecmp (content, "nofollow", end - content))
 600                 ctx->nofollow = true;
 601               /* Skip past the next comma, if any. */
 602               if (*end == ',')
 603                 ++end;
 604               else
 605                 {
 606                   end = strchr (end, ',');
 607                   if (end)
 608                     ++end;
 609                   else
 610                     end = content + strlen (content);
 611                 }
 612               content = end;
 613             }
 614         }
 615     }
 616 }
 617
 618 /* Dispatch the tag handler appropriate for the tag we're mapping
 619    over.  See known_tags[] for definition of tag handlers.  */
 620
 621 static void
 622 collect_tags_mapper (struct taginfo *tag, void *arg)
 623 {
 624   struct map_context *ctx = (struct map_context *)arg;
 625
 626   /* Find the tag in our table of tags.  This must not fail because
 627      map_html_tags only returns tags found in interesting_tags.
 628
 629      I've changed this for now, I'm passing NULL as interesting_tags
 630      to map_html_tags.  This way we can check all tags for a style
 631      attribute.
 632   */
 633   struct known_tag *t = hash_table_get (interesting_tags, tag->name);
 634
 635   if (t != NULL)
 636     t->handler (t->tagid, tag, ctx);
 637
 638   check_style_attr (tag, ctx);
 639
 640   if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
 641       tag->contents_begin && tag->contents_end)
 642   {
 643     /* parse contents */
 644     get_urls_css (ctx, tag->contents_begin - ctx->text,
 645                   tag->contents_end - tag->contents_begin);
 646   }
 647 }
 648 \f
 649 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 650    it.  It merges relative links in FILE with URL.  It is aware of
 651    <base href=...> and does the right thing.  */
 652
 653 struct urlpos *
 654 get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
 655                struct iri *iri)
 656 {
 657   struct file_memory *fm;
 658   struct map_context ctx;
 659   int flags;
 660
 661   /* Load the file. */
 662   fm = read_file (file);
 663   if (!fm)
 664     {
 665       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 666       return NULL;
 667     }
 668   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 669
 670   ctx.text = fm->content;
 671   ctx.head = ctx.tail = NULL;
 672   ctx.base = NULL;
 673   ctx.parent_base = url ? url : opt.base_href;
 674   ctx.document_file = file;
 675   ctx.nofollow = false;
 676
 677   if (!interesting_tags)
 678     init_interesting ();
 679
 680   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
 681      generate <a href=" foo"> instead of <a href="foo"> (browsers
 682      ignore spaces as well.)  If you really mean space, use &32; or
 683      %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
 684      e.g. in <img src="foo.[newline]html">.  Such newlines are also
 685      ignored by IE and Mozilla and are presumably introduced by
 686      writing HTML with editors that force word wrap.  */
 687   flags = MHT_TRIM_VALUES;
 688   if (opt.strict_comments)
 689     flags |= MHT_STRICT_COMMENTS;
 690
 691   /* the NULL here used to be interesting_tags */
 692   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
 693                  NULL, interesting_attributes);
 694
 695   /* If meta charset isn't null, override content encoding */
 696   if (iri && meta_charset)
 697     set_content_encoding (iri, meta_charset);
 698
 699   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 700   if (meta_disallow_follow)
 701     *meta_disallow_follow = ctx.nofollow;
 702
 703   xfree_null (ctx.base);
 704   read_file_free (fm);
 705   return ctx.head;
 706 }
 707
 708 /* This doesn't really have anything to do with HTML, but it's similar
 709    to get_urls_html, so we put it here.  */
 710
 711 struct urlpos *
 712 get_urls_file (const char *file)
 713 {
 714   struct file_memory *fm;
 715   struct urlpos *head, *tail;
 716   const char *text, *text_end;
 717
 718   /* Load the file.  */
 719   fm = read_file (file);
 720   if (!fm)
 721     {
 722       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 723       return NULL;
 724     }
 725   DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
 726
 727   head = tail = NULL;
 728   text = fm->content;
 729   text_end = fm->content + fm->length;
 730   while (text < text_end)
 731     {
 732       int up_error_code;
 733       char *url_text;
 734       struct urlpos *entry;
 735       struct url *url;
 736
 737       const char *line_beg = text;
 738       const char *line_end = memchr (text, '\n', text_end - text);
 739       if (!line_end)
 740         line_end = text_end;
 741       else
 742         ++line_end;
 743       text = line_end;
 744
 745       /* Strip whitespace from the beginning and end of line. */
 746       while (line_beg < line_end && c_isspace (*line_beg))
 747         ++line_beg;
 748       while (line_end > line_beg && c_isspace (*(line_end - 1)))
 749         --line_end;
 750
 751       if (line_beg == line_end)
 752         continue;
 753
 754       /* The URL is in the [line_beg, line_end) region. */
 755
 756       /* We must copy the URL to a zero-terminated string, and we
 757          can't use alloca because we're in a loop.  *sigh*.  */
 758       url_text = strdupdelim (line_beg, line_end);
 759
 760       if (opt.base_href)
 761         {
 762           /* Merge opt.base_href with URL. */
 763           char *merged = uri_merge (opt.base_href, url_text);
 764           xfree (url_text);
 765           url_text = merged;
 766         }
 767
 768       url = url_parse (url_text, &up_error_code, NULL, false);
 769       if (!url)
 770         {
 771           char *error = url_error (url_text, up_error_code);
 772           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
 773                      file, url_text, error);
 774           xfree (url_text);
 775           xfree (error);
 776           continue;
 777         }
 778       xfree (url_text);
 779
 780       entry = xnew0 (struct urlpos);
 781       entry->url = url;
 782
 783       if (!head)
 784         head = entry;
 785       else
 786         tail->next = entry;
 787       tail = entry;
 788     }
 789   read_file_free (fm);
 790   return head;
 791 }
 792
 793 void
 794 cleanup_html_url (void)
 795 {
 796   /* Destroy the hash tables.  The hash table keys and values are not
 797      allocated by this code, so we don't need to free them here.  */
 798   if (interesting_tags)
 799     hash_table_destroy (interesting_tags);
 800   if (interesting_attributes)
 801     hash_table_destroy (interesting_attributes);
 802 }