sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #ifdef HAVE_STRING_H
  24 # include <string.h>
  25 #else
  26 # include <strings.h>
  27 #endif
  28 #include <stdlib.h>
  29 #include <errno.h>
  30 #include <assert.h>
  31
  32 #include "wget.h"
  33 #include "html-parse.h"
  34 #include "url.h"
  35 #include "utils.h"
  36
  37 #ifndef errno
  38 extern int errno;
  39 #endif
  40
  41 struct map_context;
  42
  43 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
  44                                        struct map_context *));
  45
  46 #define DECLARE_TAG_HANDLER(fun)                                        \
  47   static void fun PARAMS ((int, struct taginfo *, struct map_context *))
  48
  49 DECLARE_TAG_HANDLER (tag_find_urls);
  50 DECLARE_TAG_HANDLER (tag_handle_base);
  51 DECLARE_TAG_HANDLER (tag_handle_form);
  52 DECLARE_TAG_HANDLER (tag_handle_link);
  53 DECLARE_TAG_HANDLER (tag_handle_meta);
  54
  55 /* The list of known tags and functions used for handling them.  Most
  56    tags are simply harvested for URLs. */
  57 static struct {
  58   const char *name;
  59   tag_handler_t handler;
  60 } known_tags[] = {
  61 #define TAG_A           0
  62   { "a",        tag_find_urls },
  63 #define TAG_APPLET      1
  64   { "applet",   tag_find_urls },
  65 #define TAG_AREA        2
  66   { "area",     tag_find_urls },
  67 #define TAG_BASE        3
  68   { "base",     tag_handle_base },
  69 #define TAG_BGSOUND     4
  70   { "bgsound",  tag_find_urls },
  71 #define TAG_BODY        5
  72   { "body",     tag_find_urls },
  73 #define TAG_EMBED       6
  74   { "embed",    tag_find_urls },
  75 #define TAG_FIG         7
  76   { "fig",      tag_find_urls },
  77 #define TAG_FORM        8
  78   { "form",     tag_handle_form },
  79 #define TAG_FRAME       9
  80   { "frame",    tag_find_urls },
  81 #define TAG_IFRAME      10
  82   { "iframe",   tag_find_urls },
  83 #define TAG_IMG         11
  84   { "img",      tag_find_urls },
  85 #define TAG_INPUT       12
  86   { "input",    tag_find_urls },
  87 #define TAG_LAYER       13
  88   { "layer",    tag_find_urls },
  89 #define TAG_LINK        14
  90   { "link",     tag_handle_link },
  91 #define TAG_META        15
  92   { "meta",     tag_handle_meta },
  93 #define TAG_OVERLAY     16
  94   { "overlay",  tag_find_urls },
  95 #define TAG_SCRIPT      17
  96   { "script",   tag_find_urls },
  97 #define TAG_TABLE       18
  98   { "table",    tag_find_urls },
  99 #define TAG_TD          19
 100   { "td",       tag_find_urls },
 101 #define TAG_TH          20
 102   { "th",       tag_find_urls }
 103 };
 104
 105 /* tag_url_attributes documents which attributes of which tags contain
 106    URLs to harvest.  It is used by tag_find_urls.  */
 107
 108 /* Defines for the FLAGS field; currently only one flag is defined. */
 109
 110 /* This tag points to an external document not necessary for rendering this
 111    document (i.e. it's not an inlined image, stylesheet, etc.). */
 112 #define TUA_EXTERNAL 1
 113
 114 /* For tags handled by tag_find_urls: attributes that contain URLs to
 115    download. */
 116 static struct {
 117   int tagid;
 118   const char *attr_name;
 119   int flags;
 120 } tag_url_attributes[] = {
 121   { TAG_A,              "href",         TUA_EXTERNAL },
 122   { TAG_APPLET,         "code",         0 },
 123   { TAG_AREA,           "href",         TUA_EXTERNAL },
 124   { TAG_BGSOUND,        "src",          0 },
 125   { TAG_BODY,           "background",   0 },
 126   { TAG_EMBED,          "href",         TUA_EXTERNAL },
 127   { TAG_EMBED,          "src",          0 },
 128   { TAG_FIG,            "src",          0 },
 129   { TAG_FRAME,          "src",          0 },
 130   { TAG_IFRAME,         "src",          0 },
 131   { TAG_IMG,            "href",         0 },
 132   { TAG_IMG,            "lowsrc",       0 },
 133   { TAG_IMG,            "src",          0 },
 134   { TAG_INPUT,          "src",          0 },
 135   { TAG_LAYER,          "src",          0 },
 136   { TAG_OVERLAY,        "src",          0 },
 137   { TAG_SCRIPT,         "src",          0 },
 138   { TAG_TABLE,          "background",   0 },
 139   { TAG_TD,             "background",   0 },
 140   { TAG_TH,             "background",   0 }
 141 };
 142
 143 /* The lists of interesting tags and attributes are built dynamically,
 144    from the information above.  However, some places in the code refer
 145    to the attributes not mentioned here.  We add them manually.  */
 146 static const char *additional_attributes[] = {
 147   "rel",                        /* used by tag_handle_link */
 148   "http-equiv",                 /* used by tag_handle_meta */
 149   "name",                       /* used by tag_handle_meta */
 150   "content",                    /* used by tag_handle_meta */
 151   "action"                      /* used by tag_handle_form */
 152 };
 153
 154 static const char **interesting_tags;
 155 static const char **interesting_attributes;
 156
 157 static void
 158 init_interesting (void)
 159 {
 160   /* Init the variables interesting_tags and interesting_attributes
 161      that are used by the HTML parser to know which tags and
 162      attributes we're interested in.  We initialize this only once,
 163      for performance reasons.
 164
 165      Here we also make sure that what we put in interesting_tags
 166      matches the user's preferences as specified through --ignore-tags
 167      and --follow-tags.
 168
 169      This function is as large as this only because of the glorious
 170      expressivity of the C programming language.  */
 171
 172   {
 173     int i, ind = 0;
 174     int size = ARRAY_SIZE (known_tags);
 175     interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));
 176
 177     for (i = 0; i < size; i++)
 178       {
 179         const char *name = known_tags[i].name;
 180
 181         /* Normally here we could say:
 182            interesting_tags[i] = name;
 183            But we need to respect the settings of --ignore-tags and
 184            --follow-tags, so the code gets a bit hairier.  */
 185
 186         if (opt.ignore_tags)
 187           {
 188             /* --ignore-tags was specified.  Do not match these
 189                specific tags.  --ignore-tags takes precedence over
 190                --follow-tags, so we process --ignore first and fall
 191                through if there's no match. */
 192             int j, lose = 0;
 193             for (j = 0; opt.ignore_tags[j] != NULL; j++)
 194               /* Loop through all the tags this user doesn't care about. */
 195               if (strcasecmp(opt.ignore_tags[j], name) == EQ)
 196                 {
 197                   lose = 1;
 198                   break;
 199                 }
 200             if (lose)
 201               continue;
 202           }
 203
 204         if (opt.follow_tags)
 205           {
 206             /* --follow-tags was specified.  Only match these specific tags, so
 207                continue back to top of for if we don't match one of them. */
 208             int j, win = 0;
 209             for (j = 0; opt.follow_tags[j] != NULL; j++)
 210               /* Loop through all the tags this user cares about. */
 211               if (strcasecmp(opt.follow_tags[j], name) == EQ)
 212                 {
 213                   win = 1;
 214                   break;
 215                 }
 216             if (!win)
 217               continue;  /* wasn't one of the explicitly desired tags */
 218           }
 219
 220         /* If we get to here, --follow-tags isn't being used or the
 221            tag is among the ones that are followed, and --ignore-tags,
 222            if specified, didn't include this tag, so it's an
 223            "interesting" one. */
 224         interesting_tags[ind++] = name;
 225       }
 226     interesting_tags[ind] = NULL;
 227   }
 228
 229   /* The same for attributes, except we loop through tag_url_attributes.
 230      Here we also need to make sure that the list of attributes is
 231      unique, and to include the attributes from additional_attributes.  */
 232   {
 233     int i, ind;
 234     const char **att = xmalloc ((ARRAY_SIZE (additional_attributes) + 1)
 235                                 * sizeof (char *));
 236     /* First copy the "additional" attributes. */
 237     for (i = 0; i < ARRAY_SIZE (additional_attributes); i++)
 238       att[i] = additional_attributes[i];
 239     ind = i;
 240     att[ind] = NULL;
 241     for (i = 0; i < ARRAY_SIZE (tag_url_attributes); i++)
 242       {
 243         int j, seen = 0;
 244         const char *look_for = tag_url_attributes[i].attr_name;
 245         for (j = 0; j < ind - 1; j++)
 246           if (!strcmp (att[j], look_for))
 247             {
 248               seen = 1;
 249               break;
 250             }
 251         if (!seen)
 252           {
 253             att = xrealloc (att, (ind + 2) * sizeof (*att));
 254             att[ind++] = look_for;
 255             att[ind] = NULL;
 256           }
 257       }
 258     interesting_attributes = att;
 259   }
 260 }
 261
 262 static int
 263 find_tag (const char *tag_name)
 264 {
 265   int i;
 266
 267   /* This is linear search; if the number of tags grow, we can switch
 268      to binary search.  */
 269
 270   for (i = 0; i < ARRAY_SIZE (known_tags); i++)
 271     {
 272       int cmp = strcasecmp (known_tags[i].name, tag_name);
 273       /* known_tags are sorted alphabetically, so we can
 274          micro-optimize.  */
 275       if (cmp > 0)
 276         break;
 277       else if (cmp == 0)
 278         return i;
 279     }
 280   return -1;
 281 }
 282
 283 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 284    attribute is not present, return NULL.  If ATTRIND is non-NULL, the
 285    index of the attribute in TAG will be stored there.  */
 286 static char *
 287 find_attr (struct taginfo *tag, const char *name, int *attrind)
 288 {
 289   int i;
 290   for (i = 0; i < tag->nattrs; i++)
 291     if (!strcasecmp (tag->attrs[i].name, name))
 292       {
 293         if (attrind)
 294           *attrind = i;
 295         return tag->attrs[i].value;
 296       }
 297   return NULL;
 298 }
 299
 300 struct map_context {
 301   char *text;                   /* HTML text. */
 302   char *base;                   /* Base URI of the document, possibly
 303                                    changed through <base href=...>. */
 304   const char *parent_base;      /* Base of the current document. */
 305   const char *document_file;    /* File name of this document. */
 306   int nofollow;                 /* whether NOFOLLOW was specified in a
 307                                    <meta name=robots> tag. */
 308
 309   struct urlpos *head, *tail;   /* List of URLs that is being
 310                                    built. */
 311 };
 312
 313 /* Append LINK_URI to the urlpos structure that is being built.
 314
 315    LINK_URI will be merged with the current document base.  TAG and
 316    ATTRIND are the necessary context to store the position and
 317    size.  */
 318
 319 static struct urlpos *
 320 append_one_url (const char *link_uri, int inlinep,
 321                 struct taginfo *tag, int attrind, struct map_context *ctx)
 322 {
 323   int link_has_scheme = url_has_scheme (link_uri);
 324   struct urlpos *newel;
 325   const char *base = ctx->base ? ctx->base : ctx->parent_base;
 326   struct url *url;
 327
 328   if (!base)
 329     {
 330       DEBUGP (("%s: no base, merge will use \"%s\".\n",
 331                ctx->document_file, link_uri));
 332
 333       if (!link_has_scheme)
 334         {
 335           /* Base URL is unavailable, and the link does not have a
 336              location attached to it -- we have to give up.  Since
 337              this can only happen when using `--force-html -i', print
 338              a warning.  */
 339           logprintf (LOG_NOTQUIET,
 340                      _("%s: Cannot resolve incomplete link %s.\n"),
 341                      ctx->document_file, link_uri);
 342           return NULL;
 343         }
 344
 345       url = url_parse (link_uri, NULL);
 346       if (!url)
 347         {
 348           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
 349                    ctx->document_file, link_uri));
 350           return NULL;
 351         }
 352     }
 353   else
 354     {
 355       /* Merge BASE with LINK_URI, but also make sure the result is
 356          canonicalized, i.e. that "../" have been resolved.
 357          (parse_url will do that for us.) */
 358
 359       char *complete_uri = uri_merge (base, link_uri);
 360
 361       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 362                ctx->document_file, base, link_uri, complete_uri));
 363
 364       url = url_parse (complete_uri, NULL);
 365       if (!url)
 366         {
 367           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
 368                    ctx->document_file, complete_uri));
 369           xfree (complete_uri);
 370           return NULL;
 371         }
 372       xfree (complete_uri);
 373     }
 374
 375   DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
 376
 377   newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
 378   memset (newel, 0, sizeof (*newel));
 379
 380   newel->next = NULL;
 381   newel->url = url;
 382   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
 383   newel->size = tag->attrs[attrind].value_raw_size;
 384   newel->link_inline_p = inlinep;
 385
 386   /* A URL is relative if the host is not named, and the name does not
 387      start with `/'.  */
 388   if (!link_has_scheme && *link_uri != '/')
 389     newel->link_relative_p = 1;
 390   else if (link_has_scheme)
 391     newel->link_complete_p = 1;
 392
 393   if (ctx->tail)
 394     {
 395       ctx->tail->next = newel;
 396       ctx->tail = newel;
 397     }
 398   else
 399     ctx->tail = ctx->head = newel;
 400
 401   return newel;
 402 }
 403 \f
 404 /* All the tag_* functions are called from collect_tags_mapper, as
 405    specified by KNOWN_TAGS.  */
 406
 407 /* Default tag handler: collect URLs from attributes specified for
 408    this tag by tag_url_attributes.  */
 409
 410 static void
 411 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 412 {
 413   int i, attrind, first = -1;
 414   int size = ARRAY_SIZE (tag_url_attributes);
 415
 416   for (i = 0; i < size; i++)
 417     if (tag_url_attributes[i].tagid == tagid)
 418       {
 419         /* We've found the index of tag_url_attributes where the
 420            attributes of our tag begin.  */
 421         first = i;
 422         break;
 423       }
 424   assert (first != -1);
 425
 426   /* Loop over the "interesting" attributes of this tag.  In this
 427      example, it will loop over "src" and "lowsrc".
 428
 429        <img src="foo.png" lowsrc="bar.png">
 430
 431      This has to be done in the outer loop so that the attributes are
 432      processed in the same order in which they appear in the page.
 433      This is required when converting links.  */
 434
 435   for (attrind = 0; attrind < tag->nattrs; attrind++)
 436     {
 437       /* Find whether TAG/ATTRIND is a combination that contains a
 438          URL. */
 439       char *link = tag->attrs[attrind].value;
 440
 441       /* If you're cringing at the inefficiency of the nested loops,
 442          remember that they both iterate over a laughably small
 443          quantity of items.  The worst-case inner loop is for the IMG
 444          tag, which has three attributes.  */
 445       for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
 446         {
 447           if (0 == strcasecmp (tag->attrs[attrind].name,
 448                                tag_url_attributes[i].attr_name))
 449             {
 450               int flags = tag_url_attributes[i].flags;
 451               append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
 452             }
 453         }
 454     }
 455 }
 456
 457 /* Handle the BASE tag, for <base href=...>. */
 458
 459 static void
 460 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
 461 {
 462   struct urlpos *base_urlpos;
 463   int attrind;
 464   char *newbase = find_attr (tag, "href", &attrind);
 465   if (!newbase)
 466     return;
 467
 468   base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
 469   if (!base_urlpos)
 470     return;
 471   base_urlpos->ignore_when_downloading = 1;
 472   base_urlpos->link_base_p = 1;
 473
 474   if (ctx->base)
 475     xfree (ctx->base);
 476   if (ctx->parent_base)
 477     ctx->base = uri_merge (ctx->parent_base, newbase);
 478   else
 479     ctx->base = xstrdup (newbase);
 480 }
 481
 482 /* Mark the URL found in <form action=...> for conversion. */
 483
 484 static void
 485 tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 486 {
 487   int attrind;
 488   char *action = find_attr (tag, "action", &attrind);
 489   if (action)
 490     {
 491       struct urlpos *action_urlpos = append_one_url (action, 0, tag,
 492                                                      attrind, ctx);
 493       if (action_urlpos)
 494         action_urlpos->ignore_when_downloading = 1;
 495     }
 496 }
 497
 498 /* Handle the LINK tag.  It requires special handling because how its
 499    links will be followed in -p mode depends on the REL attribute.  */
 500
 501 static void
 502 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
 503 {
 504   int attrind;
 505   char *href = find_attr (tag, "href", &attrind);
 506
 507   /* All <link href="..."> link references are external, except those
 508      known not to be, such as style sheet and shortcut icon:
 509
 510        <link rel="stylesheet" href="...">
 511        <link rel="shortcut icon" href="...">
 512   */
 513   if (href)
 514     {
 515       char *rel  = find_attr (tag, "rel", NULL);
 516       int inlinep = (rel
 517                      && (0 == strcasecmp (rel, "stylesheet")
 518                          || 0 == strcasecmp (rel, "shortcut icon")));
 519       append_one_url (href, inlinep, tag, attrind, ctx);
 520     }
 521 }
 522
 523 /* Handle the META tag.  This requires special handling because of the
 524    refresh feature and because of robot exclusion.  */
 525
 526 static void
 527 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 528 {
 529   char *name = find_attr (tag, "name", NULL);
 530   char *http_equiv = find_attr (tag, "http-equiv", NULL);
 531
 532   if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
 533     {
 534       /* Some pages use a META tag to specify that the page be
 535          refreshed by a new page after a given number of seconds.  The
 536          general format for this is:
 537
 538            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 539
 540          So we just need to skip past the "NUMBER; URL=" garbage to
 541          get to the URL.  */
 542
 543       struct urlpos *entry;
 544       int attrind;
 545       int timeout = 0;
 546       char *p;
 547
 548       char *refresh = find_attr (tag, "content", &attrind);
 549       if (!refresh)
 550         return;
 551
 552       for (p = refresh; ISDIGIT (*p); p++)
 553         timeout = 10 * timeout + *p - '0';
 554       if (*p++ != ';')
 555         return;
 556
 557       while (ISSPACE (*p))
 558         ++p;
 559       if (!(   TOUPPER (*p)       == 'U'
 560             && TOUPPER (*(p + 1)) == 'R'
 561             && TOUPPER (*(p + 2)) == 'L'
 562             &&          *(p + 3)  == '='))
 563         return;
 564       p += 4;
 565       while (ISSPACE (*p))
 566         ++p;
 567
 568       entry = append_one_url (p, 0, tag, attrind, ctx);
 569       if (entry)
 570         {
 571           entry->link_refresh_p = 1;
 572           entry->refresh_timeout = timeout;
 573         }
 574     }
 575   else if (name && 0 == strcasecmp (name, "robots"))
 576     {
 577       /* Handle stuff like:
 578          <meta name="robots" content="index,nofollow"> */
 579       char *content = find_attr (tag, "content", NULL);
 580       if (!content)
 581         return;
 582       if (!strcasecmp (content, "none"))
 583         ctx->nofollow = 1;
 584       else
 585         {
 586           while (*content)
 587             {
 588               /* Find the next occurrence of ',' or the end of
 589                  the string.  */
 590               char *end = strchr (content, ',');
 591               if (end)
 592                 ++end;
 593               else
 594                 end = content + strlen (content);
 595               if (!strncasecmp (content, "nofollow", end - content))
 596                 ctx->nofollow = 1;
 597               content = end;
 598             }
 599         }
 600     }
 601 }
 602
 603 /* Examine name and attributes of TAG and take appropriate action
 604    according to the tag.  */
 605
 606 static void
 607 collect_tags_mapper (struct taginfo *tag, void *arg)
 608 {
 609   struct map_context *ctx = (struct map_context *)arg;
 610   int tagid;
 611   tag_handler_t handler;
 612
 613   tagid = find_tag (tag->name);
 614   assert (tagid != -1);
 615   handler = known_tags[tagid].handler;
 616
 617   handler (tagid, tag, ctx);
 618 }
 619 \f
 620 /* Analyze HTML tags FILE and construct a list of URLs referenced from
 621    it.  It merges relative links in FILE with URL.  It is aware of
 622    <base href=...> and does the right thing.  */
 623 struct urlpos *
 624 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
 625 {
 626   struct file_memory *fm;
 627   struct map_context ctx;
 628
 629   /* Load the file. */
 630   fm = read_file (file);
 631   if (!fm)
 632     {
 633       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 634       return NULL;
 635     }
 636   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 637
 638   ctx.text = fm->content;
 639   ctx.head = ctx.tail = NULL;
 640   ctx.base = NULL;
 641   ctx.parent_base = url ? url : opt.base_href;
 642   ctx.document_file = file;
 643   ctx.nofollow = 0;
 644
 645   if (!interesting_tags)
 646     init_interesting ();
 647
 648   map_html_tags (fm->content, fm->length, interesting_tags,
 649                  interesting_attributes, collect_tags_mapper, &ctx);
 650
 651   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
 652   if (meta_disallow_follow)
 653     *meta_disallow_follow = ctx.nofollow;
 654
 655   FREE_MAYBE (ctx.base);
 656   read_file_free (fm);
 657   return ctx.head;
 658 }
 659
 660 void
 661 cleanup_html_url (void)
 662 {
 663   FREE_MAYBE (interesting_tags);
 664   FREE_MAYBE (interesting_attributes);
 665 }