sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #ifdef HAVE_STRING_H
  34 # include <string.h>
  35 #else
  36 # include <strings.h>
  37 #endif
  38 #include <stdlib.h>
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "wget.h"
  43 #include "html-parse.h"
  44 #include "url.h"
  45 #include "utils.h"
  46
  47 #ifndef errno
  48 extern int errno;
  49 #endif
  50
  51 struct map_context;
  52
  53 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
  54                                        struct map_context *));
  55
  56 #define DECLARE_TAG_HANDLER(fun)                                        \
  57   static void fun PARAMS ((int, struct taginfo *, struct map_context *))
  58
  59 DECLARE_TAG_HANDLER (tag_find_urls);
  60 DECLARE_TAG_HANDLER (tag_handle_base);
  61 DECLARE_TAG_HANDLER (tag_handle_form);
  62 DECLARE_TAG_HANDLER (tag_handle_link);
  63 DECLARE_TAG_HANDLER (tag_handle_meta);
  64
  65 /* The list of known tags and functions used for handling them.  Most
  66    tags are simply harvested for URLs. */
  67 static struct {
  68   const char *name;
  69   tag_handler_t handler;
  70 } known_tags[] = {
  71 #define TAG_A           0
  72   { "a",        tag_find_urls },
  73 #define TAG_APPLET      1
  74   { "applet",   tag_find_urls },
  75 #define TAG_AREA        2
  76   { "area",     tag_find_urls },
  77 #define TAG_BASE        3
  78   { "base",     tag_handle_base },
  79 #define TAG_BGSOUND     4
  80   { "bgsound",  tag_find_urls },
  81 #define TAG_BODY        5
  82   { "body",     tag_find_urls },
  83 #define TAG_EMBED       6
  84   { "embed",    tag_find_urls },
  85 #define TAG_FIG         7
  86   { "fig",      tag_find_urls },
  87 #define TAG_FORM        8
  88   { "form",     tag_handle_form },
  89 #define TAG_FRAME       9
  90   { "frame",    tag_find_urls },
  91 #define TAG_IFRAME      10
  92   { "iframe",   tag_find_urls },
  93 #define TAG_IMG         11
  94   { "img",      tag_find_urls },
  95 #define TAG_INPUT       12
  96   { "input",    tag_find_urls },
  97 #define TAG_LAYER       13
  98   { "layer",    tag_find_urls },
  99 #define TAG_LINK        14
 100   { "link",     tag_handle_link },
 101 #define TAG_META        15
 102   { "meta",     tag_handle_meta },
 103 #define TAG_OVERLAY     16
 104   { "overlay",  tag_find_urls },
 105 #define TAG_SCRIPT      17
 106   { "script",   tag_find_urls },
 107 #define TAG_TABLE       18
 108   { "table",    tag_find_urls },
 109 #define TAG_TD          19
 110   { "td",       tag_find_urls },
 111 #define TAG_TH          20
 112   { "th",       tag_find_urls }
 113 };
 114
 115 /* tag_url_attributes documents which attributes of which tags contain
 116    URLs to harvest.  It is used by tag_find_urls.  */
 117
 118 /* Defines for the FLAGS field; currently only one flag is defined. */
 119
 120 /* This tag points to an external document not necessary for rendering this
 121    document (i.e. it's not an inlined image, stylesheet, etc.). */
 122 #define TUA_EXTERNAL 1
 123
 124 /* For tags handled by tag_find_urls: attributes that contain URLs to
 125    download. */
 126 static struct {
 127   int tagid;
 128   const char *attr_name;
 129   int flags;
 130 } tag_url_attributes[] = {
 131   { TAG_A,              "href",         TUA_EXTERNAL },
 132   { TAG_APPLET,         "code",         0 },
 133   { TAG_AREA,           "href",         TUA_EXTERNAL },
 134   { TAG_BGSOUND,        "src",          0 },
 135   { TAG_BODY,           "background",   0 },
 136   { TAG_EMBED,          "href",         TUA_EXTERNAL },
 137   { TAG_EMBED,          "src",          0 },
 138   { TAG_FIG,            "src",          0 },
 139   { TAG_FRAME,          "src",          0 },
 140   { TAG_IFRAME,         "src",          0 },
 141   { TAG_IMG,            "href",         0 },
 142   { TAG_IMG,            "lowsrc",       0 },
 143   { TAG_IMG,            "src",          0 },
 144   { TAG_INPUT,          "src",          0 },
 145   { TAG_LAYER,          "src",          0 },
 146   { TAG_OVERLAY,        "src",          0 },
 147   { TAG_SCRIPT,         "src",          0 },
 148   { TAG_TABLE,          "background",   0 },
 149   { TAG_TD,             "background",   0 },
 150   { TAG_TH,             "background",   0 }
 151 };
 152
 153 /* The lists of interesting tags and attributes are built dynamically,
 154    from the information above.  However, some places in the code refer
 155    to the attributes not mentioned here.  We add them manually.  */
 156 static const char *additional_attributes[] = {
 157   "rel",                        /* used by tag_handle_link */
 158   "http-equiv",                 /* used by tag_handle_meta */
 159   "name",                       /* used by tag_handle_meta */
 160   "content",                    /* used by tag_handle_meta */
 161   "action"                      /* used by tag_handle_form */
 162 };
 163
 164 static const char **interesting_tags;
 165 static const char **interesting_attributes;
 166
 167 static void
 168 init_interesting (void)
 169 {
 170   /* Init the variables interesting_tags and interesting_attributes
 171      that are used by the HTML parser to know which tags and
 172      attributes we're interested in.  We initialize this only once,
 173      for performance reasons.
 174
 175      Here we also make sure that what we put in interesting_tags
 176      matches the user's preferences as specified through --ignore-tags
 177      and --follow-tags.
 178
 179      This function is as large as this only because of the glorious
 180      expressivity of the C programming language.  */
 181
 182   {
 183     int i, ind = 0;
 184     int size = ARRAY_SIZE (known_tags);
 185     interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));
 186
 187     for (i = 0; i < size; i++)
 188       {
 189         const char *name = known_tags[i].name;
 190
 191         /* Normally here we could say:
 192            interesting_tags[i] = name;
 193            But we need to respect the settings of --ignore-tags and
 194            --follow-tags, so the code gets a bit hairier.  */
 195
 196         if (opt.ignore_tags)
 197           {
 198             /* --ignore-tags was specified.  Do not match these
 199                specific tags.  --ignore-tags takes precedence over
 200                --follow-tags, so we process --ignore first and fall
 201                through if there's no match. */
 202             int j, lose = 0;
 203             for (j = 0; opt.ignore_tags[j] != NULL; j++)
 204               /* Loop through all the tags this user doesn't care about. */
 205               if (strcasecmp(opt.ignore_tags[j], name) == EQ)
 206                 {
 207                   lose = 1;
 208                   break;
 209                 }
 210             if (lose)
 211               continue;
 212           }
 213
 214         if (opt.follow_tags)
 215           {
 216             /* --follow-tags was specified.  Only match these specific tags, so
 217                continue back to top of for if we don't match one of them. */
 218             int j, win = 0;
 219             for (j = 0; opt.follow_tags[j] != NULL; j++)
 220               /* Loop through all the tags this user cares about. */
 221               if (strcasecmp(opt.follow_tags[j], name) == EQ)
 222                 {
 223                   win = 1;
 224                   break;
 225                 }
 226             if (!win)
 227               continue;  /* wasn't one of the explicitly desired tags */
 228           }
 229
 230         /* If we get to here, --follow-tags isn't being used or the
 231            tag is among the ones that are followed, and --ignore-tags,
 232            if specified, didn't include this tag, so it's an
 233            "interesting" one. */
 234         interesting_tags[ind++] = name;
 235       }
 236     interesting_tags[ind] = NULL;
 237   }
 238
 239   /* The same for attributes, except we loop through tag_url_attributes.
 240      Here we also need to make sure that the list of attributes is
 241      unique, and to include the attributes from additional_attributes.  */
 242   {
 243     int i, ind;
 244     const char **att = xmalloc ((ARRAY_SIZE (additional_attributes) + 1)
 245                                 * sizeof (char *));
 246     /* First copy the "additional" attributes. */
 247     for (i = 0; i < ARRAY_SIZE (additional_attributes); i++)
 248       att[i] = additional_attributes[i];
 249     ind = i;
 250     att[ind] = NULL;
 251     for (i = 0; i < ARRAY_SIZE (tag_url_attributes); i++)
 252       {
 253         int j, seen = 0;
 254         const char *look_for = tag_url_attributes[i].attr_name;
 255         for (j = 0; j < ind - 1; j++)
 256           if (!strcmp (att[j], look_for))
 257             {
 258               seen = 1;
 259               break;
 260             }
 261         if (!seen)
 262           {
 263             att = xrealloc (att, (ind + 2) * sizeof (*att));
 264             att[ind++] = look_for;
 265             att[ind] = NULL;
 266           }
 267       }
 268     interesting_attributes = att;
 269   }
 270 }
 271
 272 static int
 273 find_tag (const char *tag_name)
 274 {
 275   int i;
 276
 277   /* This is linear search; if the number of tags grow, we can switch
 278      to binary search.  */
 279
 280   for (i = 0; i < ARRAY_SIZE (known_tags); i++)
 281     {
 282       int cmp = strcasecmp (known_tags[i].name, tag_name);
 283       /* known_tags are sorted alphabetically, so we can
 284          micro-optimize.  */
 285       if (cmp > 0)
 286         break;
 287       else if (cmp == 0)
 288         return i;
 289     }
 290   return -1;
 291 }
 292
 293 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 294    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 295    index of the attribute in TAG will be stored there.  */
 296 static char *
 297 find_attr (struct taginfo *tag, const char *name, int *attrind)
 298 {
 299   int i;
 300   for (i = 0; i < tag->nattrs; i++)
 301     if (!strcasecmp (tag->attrs[i].name, name))
 302       {
 303         if (attrind)
 304           *attrind = i;
 305         return tag->attrs[i].value;
 306       }
 307   return NULL;
 308 }
 309
 310 struct map_context {
 311   char *text;                   /* HTML text. */
 312   char *base;                   /* Base URI of the document, possibly
 313                                    changed through <base href=...>. */
 314   const char *parent_base;      /* Base of the current document. */
 315   const char *document_file;    /* File name of this document. */
 316   int nofollow;                 /* whether NOFOLLOW was specified in a
 317                                    <meta name=robots> tag. */
 318
 319   struct urlpos *head, *tail;   /* List of URLs that is being
 320                                    built. */
 321 };
 322
 323 /* Append LINK_URI to the urlpos structure that is being built.
 324
 325    LINK_URI will be merged with the current document base.  TAG and
 326    ATTRIND are the necessary context to store the position and
 327    size.  */
 328
 329 static struct urlpos *
 330 append_one_url (const char *link_uri, int inlinep,
 331                 struct taginfo *tag, int attrind, struct map_context *ctx)
 332 {
 333   int link_has_scheme = url_has_scheme (link_uri);
 334   struct urlpos *newel;
 335   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 336   struct url *url;
 337
 338   if (!base)
 339     {
 340       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 341                ctx->document_file, link_uri));
 342
 343       if (!link_has_scheme)
 344         {
 345           /* Base URL is unavailable, and the link does not have a
 346              location attached to it -- we have to give up.  Since
 347              this can only happen when using `--force-html -i', print
 348              a warning.  */
 349           logprintf (LOG_NOTQUIET,
 350                      _("%s: Cannot resolve incomplete link %s.\n"),
 351                      ctx->document_file, link_uri);
 352           return NULL;
 353         }
 354
 355       url = url_parse (link_uri, NULL);
 356       if (!url)
 357         {
 358           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 359                    ctx->document_file, link_uri));
 360           return NULL;
 361         }
 362     }
 363   else
 364     {
 365       /* Merge BASE with LINK_URI, but also make sure the result is
 366          canonicalized, i.e. that "../" have been resolved.
 367          (parse_url will do that for us.) */
 368
 369       char *complete_uri = uri_merge (base, link_uri);
 370
 371       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 372                ctx->document_file, base, link_uri, complete_uri));
 373
 374       url = url_parse (complete_uri, NULL);
 375       if (!url)
 376         {
 377           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 378                    ctx->document_file, complete_uri));
 379           xfree (complete_uri);
 380           return NULL;
 381         }
 382       xfree (complete_uri);
 383     }
 384
 385   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 386
 387   newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
 388   memset (newel, 0, sizeof (*newel));
 389
 390   newel->next = NULL;
 391   newel->url = url;
 392   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 393   newel->size = tag->attrs[attrind].value_raw_size;
 394   newel->link_inline_p = inlinep;
 395
 396   /* A URL is relative if the host is not named, and the name does not
 397      start with `/'.  */
 398   if (!link_has_scheme && *link_uri != '/')
 399     newel->link_relative_p = 1;
 400   else if (link_has_scheme)
 401     newel->link_complete_p = 1;
 402
 403   if (ctx->tail)
 404     {
 405       ctx->tail->next = newel;
 406       ctx->tail = newel;
 407     }
 408   else
 409     ctx->tail = ctx->head = newel;
 410
 411   return newel;
 412 }
 413 \f
 414 /* All the tag_* functions are called from collect_tags_mapper, as
 415    specified by KNOWN_TAGS.  */
 416
 417 /* Default tag handler: collect URLs from attributes specified for
 418    this tag by tag_url_attributes.  */
 419
 420 static void
 421 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 422 {
 423   int i, attrind, first = -1;
 424   int size = ARRAY_SIZE (tag_url_attributes);
 425
 426   for (i = 0; i < size; i++)
 427     if (tag_url_attributes[i].tagid == tagid)
 428       {
 429         /* We've found the index of tag_url_attributes where the
 430            attributes of our tag begin.  */
 431         first = i;
 432         break;
 433       }
 434   assert (first != -1);
 435
 436   /* Loop over the "interesting" attributes of this tag.  In this
 437      example, it will loop over "src" and "lowsrc".
 438
 439        <img src="foo.png" lowsrc="bar.png">
 440
 441      This has to be done in the outer loop so that the attributes are
 442      processed in the same order in which they appear in the page.
 443      This is required when converting links.  */
 444
 445   for (attrind = 0; attrind < tag->nattrs; attrind++)
 446     {
 447       /* Find whether TAG/ATTRIND is a combination that contains a
 448          URL. */
 449       char *link = tag->attrs[attrind].value;
 450
 451       /* If you're cringing at the inefficiency of the nested loops,
 452          remember that they both iterate over a laughably small
 453          quantity of items.  The worst-case inner loop is for the IMG
 454          tag, which has three attributes.  */
 455       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 456         {
 457           if (0 == strcasecmp (tag->attrs[attrind].name,
 458                                tag_url_attributes[i].attr_name))
 459             {
 460               int flags = tag_url_attributes[i].flags;
 461               append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
 462             }
 463         }
 464     }
 465 }
 466
 467 /* Handle the BASE tag, for <base href=...>. */
 468
 469 static void
 470 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 471 {
 472   struct urlpos *base_urlpos;
 473   int attrind;
 474   char *newbase = find_attr (tag, "href", &attrind);
 475   if (!newbase)
 476     return;
 477
 478   base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
 479   if (!base_urlpos)
 480     return;
 481   base_urlpos->ignore_when_downloading = 1;
 482   base_urlpos->link_base_p = 1;
 483
 484   if (ctx->base)
 485     xfree (ctx->base);
 486   if (ctx->parent_base)
 487     ctx->base = uri_merge (ctx->parent_base, newbase);
 488   else
 489     ctx->base = xstrdup (newbase);
 490 }
 491
 492 /* Mark the URL found in <form action=...> for conversion. */
 493
 494 static void
 495 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 496 {
 497   int attrind;
 498   char *action = find_attr (tag, "action", &attrind);
 499   if (action)
 500     {
 501       struct urlpos *action_urlpos = append_one_url (action, 0, tag,
 502                                                      attrind, ctx);
 503       if (action_urlpos)
 504         action_urlpos->ignore_when_downloading = 1;
 505     }
 506 }
 507
 508 /* Handle the LINK tag.  It requires special handling because how its
 509    links will be followed in -p mode depends on the REL attribute.  */
 510
 511 static void
 512 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 513 {
 514   int attrind;
 515   char *href = find_attr (tag, "href", &attrind);
 516
 517   /* All <link href="..."> link references are external, except those
 518      known not to be, such as style sheet and shortcut icon:
 519
 520        <link rel="stylesheet" href="...">
 521        <link rel="shortcut icon" href="...">
 522   */
 523   if (href)
 524     {
 525       char *rel  = find_attr (tag, "rel", NULL);
 526       int inlinep = (rel
 527                      && (0 == strcasecmp (rel, "stylesheet")
 528                          || 0 == strcasecmp (rel, "shortcut icon")));
 529       append_one_url (href, inlinep, tag, attrind, ctx);
 530     }
 531 }
 532
 533 /* Handle the META tag.  This requires special handling because of the
 534    refresh feature and because of robot exclusion.  */
 535
 536 static void
 537 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 538 {
 539   char *name = find_attr (tag, "name", NULL);
 540   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 541
 542   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 543     {
 544       /* Some pages use a META tag to specify that the page be
 545          refreshed by a new page after a given number of seconds.  The
 546          general format for this is:
 547
 548            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 549
 550          So we just need to skip past the "NUMBER; URL=" garbage to
 551          get to the URL.  */
 552
 553       struct urlpos *entry;
 554       int attrind;
 555       int timeout = 0;
 556       char *p;
 557
 558       char *refresh = find_attr (tag, "content", &attrind);
 559       if (!refresh)
 560         return;
 561
 562       for (p = refresh; ISDIGIT (*p); p++)
 563         timeout = 10 * timeout + *p - '0';
 564       if (*p++ != ';')
 565         return;
 566
 567       while (ISSPACE (*p))
 568         ++p;
 569       if (!(   TOUPPER (*p)       == 'U'
 570             && TOUPPER (*(p + 1)) == 'R'
 571             && TOUPPER (*(p + 2)) == 'L'
 572             &&          *(p + 3)  == '='))
 573         return;
 574       p += 4;
 575       while (ISSPACE (*p))
 576         ++p;
 577
 578       entry = append_one_url (p, 0, tag, attrind, ctx);
 579       if (entry)
 580         {
 581           entry->link_refresh_p = 1;
 582           entry->refresh_timeout = timeout;
 583         }
 584     }
 585   else if (name && 0 == strcasecmp (name, "robots"))
 586     {
 587       /* Handle stuff like:
 588          <meta name="robots" content="index,nofollow"> */
 589       char *content = find_attr (tag, "content", NULL);
 590       if (!content)
 591         return;
 592       if (!strcasecmp (content, "none"))
 593         ctx->nofollow = 1;
 594       else
 595         {
 596           while (*content)
 597             {
 598               /* Find the next occurrence of ',' or the end of
 599                  the string.  */
 600               char *end = strchr (content, ',');
 601               if (end)
 602                 ++end;
 603               else
 604                 end = content + strlen (content);
 605               if (!strncasecmp (content, "nofollow", end - content))
 606                 ctx->nofollow = 1;
 607               content = end;
 608             }
 609         }
 610     }
 611 }
 612
 613 /* Examine name and attributes of TAG and take appropriate action
 614    according to the tag.  */
 615
 616 static void
 617 collect_tags_mapper (struct taginfo *tag, void *arg)
 618 {
 619   struct map_context *ctx = (struct map_context *)arg;
 620   int tagid;
 621   tag_handler_t handler;
 622
 623   tagid = find_tag (tag->name);
 624   assert (tagid != -1);
 625   handler = known_tags[tagid].handler;
 626
 627   handler (tagid, tag, ctx);
 628 }
 629 \f
 630 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 631    it.  It merges relative links in FILE with URL.  It is aware of
 632    <base href=...> and does the right thing.  */
 633 struct urlpos *
 634 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
 635 {
 636   struct file_memory *fm;
 637   struct map_context ctx;
 638
 639   /* Load the file. */
 640   fm = read_file (file);
 641   if (!fm)
 642     {
 643       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 644       return NULL;
 645     }
 646   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 647
 648   ctx.text = fm->content;
 649   ctx.head = ctx.tail = NULL;
 650   ctx.base = NULL;
 651   ctx.parent_base = url ? url : opt.base_href;
 652   ctx.document_file = file;
 653   ctx.nofollow = 0;
 654
 655   if (!interesting_tags)
 656     init_interesting ();
 657
 658   map_html_tags (fm->content, fm->length, interesting_tags,
 659                  interesting_attributes, collect_tags_mapper, &ctx);
 660
 661   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 662   if (meta_disallow_follow)
 663     *meta_disallow_follow = ctx.nofollow;
 664
 665   FREE_MAYBE (ctx.base);
 666   read_file_free (fm);
 667   return ctx.head;
 668 }
 669
 670 void
 671 cleanup_html_url (void)
 672 {
 673   FREE_MAYBE (interesting_tags);
 674   FREE_MAYBE (interesting_attributes);
 675 }