1 /* Collect URLs from HTML source.
2 Copyright (C) 1998, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
33 #include "html-parse.h"
/* Type of a per-tag handler: receives the tag id (the tag's index in
   known_tags), the parsed tag, and the accumulation context.  */
43 typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
44 struct map_context *));
/* Shorthand for forward-declaring a static tag handler with the
   tag_handler_t signature.  */
46 #define DECLARE_TAG_HANDLER(fun) \
47 static void fun PARAMS ((int, struct taginfo *, struct map_context *))
/* The four handlers referenced by the known_tags table below.  */
49 DECLARE_TAG_HANDLER (tag_find_urls);
50 DECLARE_TAG_HANDLER (tag_handle_base);
51 DECLARE_TAG_HANDLER (tag_handle_link);
52 DECLARE_TAG_HANDLER (tag_handle_meta);
54 /* The list of known tags and functions used for handling them.  Most
55 tags are simply harvested for URLs. */
/* NOTE(review): the struct definition and the array declaration lines
   for this table are missing from this excerpt — only one member
   (`handler') and the initializers are visible.  An entry's position is
   its tag id (e.g. "overlay" is entry 15, matching TAG_OVERLAY below),
   and find_tag's comment says the names are kept sorted alphabetically.  */
58 tag_handler_t handler;
61 { "a", tag_find_urls },
63 { "applet", tag_find_urls },
65 { "area", tag_find_urls },
67 { "base", tag_handle_base },
69 { "bgsound", tag_find_urls },
71 { "body", tag_find_urls },
73 { "embed", tag_find_urls },
75 { "fig", tag_find_urls },
77 { "frame", tag_find_urls },
79 { "iframe", tag_find_urls },
81 { "img", tag_find_urls },
83 { "input", tag_find_urls },
85 { "layer", tag_find_urls },
87 { "link", tag_handle_link },
89 { "meta", tag_handle_meta },
/* TAG_OVERLAY == 15 == index of the "overlay" entry just below;
   presumably the other TAG_* ids are #defined on the lines elided
   from this excerpt — TODO confirm against the full file.  */
90 #define TAG_OVERLAY 15
91 { "overlay", tag_find_urls },
93 { "script", tag_find_urls },
95 { "table", tag_find_urls },
97 { "td", tag_find_urls },
99 { "th", tag_find_urls }
102 /* tag_url_attributes documents which attributes of which tags contain
103 URLs to harvest. It is used by tag_find_urls. */
105 /* Defines for the FLAGS field; currently only one flag is defined. */
107 /* This tag points to an external document not necessary for rendering this
108 document (i.e. it's not an inlined image, stylesheet, etc.). */
109 #define TUA_EXTERNAL 1
111 /* For tags handled by tag_find_urls: attributes that contain URLs to
/* NOTE(review): the struct header and the `tagid'/`flags' member lines
   are missing from this excerpt.  Entries for the same tag must be
   contiguous: tag_find_urls scans forward from the first match while
   tagid stays equal, so grouping (and ordering by tag id) is load-bearing.  */
115 const char *attr_name;
117 } tag_url_attributes[] = {
118 { TAG_A, "href", TUA_EXTERNAL },
119 { TAG_APPLET, "code", 0 },
120 { TAG_AREA, "href", TUA_EXTERNAL },
121 { TAG_BGSOUND, "src", 0 },
122 { TAG_BODY, "background", 0 },
123 { TAG_EMBED, "href", 0 },
124 { TAG_EMBED, "src", 0 },
125 { TAG_FIG, "src", 0 },
126 { TAG_FRAME, "src", 0 },
127 { TAG_IFRAME, "src", 0 },
128 { TAG_IMG, "href", 0 },
129 { TAG_IMG, "lowsrc", 0 },
130 { TAG_IMG, "src", 0 },
131 { TAG_INPUT, "src", 0 },
132 { TAG_LAYER, "src", 0 },
133 { TAG_OVERLAY, "src", 0 },
134 { TAG_SCRIPT, "src", 0 },
135 { TAG_TABLE, "background", 0 },
136 { TAG_TD, "background", 0 },
137 { TAG_TH, "background", 0 }
140 /* The lists of interesting tags and attributes are built dynamically,
141 from the information above. However, some places in the code refer
142 to the attributes not mentioned here. We add them manually. */
/* Attributes needed by tag_handle_link/tag_handle_meta that do not
   themselves carry URLs, so they never appear in tag_url_attributes.
   (The closing brace of the initializer is elided from this excerpt.)  */
143 static const char *additional_attributes[] = {
144 "rel", /* for TAG_LINK */
145 "http-equiv", /* for TAG_META */
146 "name", /* for TAG_META */
147 "content" /* for TAG_META */
/* NULL-terminated arrays built once by init_interesting and released
   by cleanup_html_url; passed to map_html_tags in get_urls_html.  */
150 static const char **interesting_tags;
151 static const char **interesting_attributes;
/* Build interesting_tags and interesting_attributes from known_tags,
   tag_url_attributes, and additional_attributes, honoring the user's
   --follow-tags / --ignore-tags settings.  Called once, lazily, from
   get_urls_html.  NOTE(review): several lines of this function
   (declarations of i/j/ind, the `continue'/break statements inside the
   option loops, closing braces) are elided from this excerpt.  */
154 init_interesting (void)
156 /* Init the variables interesting_tags and interesting_attributes
157 that are used by the HTML parser to know which tags and
158 attributes we're interested in. We initialize this only once,
159 for performance reasons.
161 Here we also make sure that what we put in interesting_tags
162 matches the user's preferences as specified through --ignore-tags
165 This function is as large as this only because of the glorious
166 expressivity of the C programming language. */
/* +1 leaves room for the terminating NULL stored after the loop.  */
170 int size = ARRAY_SIZE (known_tags);
171 interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));
173 for (i = 0; i < size; i++)
175 const char *name = known_tags[i].name;
177 /* Normally here we could say:
178 interesting_tags[i] = name;
179 But we need to respect the settings of --ignore-tags and
180 --follow-tags, so the code gets a bit hairier. */
184 /* --ignore-tags was specified. Do not match these
185 specific tags. --ignore-tags takes precedence over
186 --follow-tags, so we process --ignore first and fall
187 through if there's no match. */
189 for (j = 0; opt.ignore_tags[j] != NULL; j++)
190 /* Loop through all the tags this user doesn't care about. */
191 if (strcasecmp(opt.ignore_tags[j], name) == EQ)
202 /* --follow-tags was specified. Only match these specific tags, so
203 continue back to top of for if we don't match one of them. */
205 for (j = 0; opt.follow_tags[j] != NULL; j++)
206 /* Loop through all the tags this user cares about. */
207 if (strcasecmp(opt.follow_tags[j], name) == EQ)
213 continue; /* wasn't one of the explicitly desired tags */
216 /* If we get to here, --follow-tags isn't being used or the
217 tag is among the ones that are followed, and --ignore-tags,
218 if specified, didn't include this tag, so it's an
219 "interesting" one. */
220 interesting_tags[ind++] = name;
/* NULL-terminate; map_html_tags presumably walks until NULL — confirm
   against html-parse.c.  */
222 interesting_tags[ind] = NULL;
225 /* The same for attributes, except we loop through tag_url_attributes.
226 Here we also need to make sure that the list of attributes is
227 unique, and to include the attributes from additional_attributes. */
230 const char **att = xmalloc ((ARRAY_SIZE (additional_attributes) + 1)
232 /* First copy the "additional" attributes. */
233 for (i = 0; i < ARRAY_SIZE (additional_attributes); i++)
234 att[i] = additional_attributes[i];
/* Then append each URL-bearing attribute name unless already present
   (the inner loop below is a linear duplicate check).  */
237 for (i = 0; i < ARRAY_SIZE (tag_url_attributes); i++)
240 const char *look_for = tag_url_attributes[i].attr_name;
241 for (j = 0; j < ind - 1; j++)
242 if (!strcmp (att[j], look_for))
/* Grow one slot at a time; xrealloc aborts on OOM, so overwriting
   `att' directly cannot leak here.  */
249 att = xrealloc (att, (ind + 2) * sizeof (*att));
250 att[ind++] = look_for;
254 interesting_attributes = att;
/* Look up TAG_NAME in known_tags and return its index (the tag id).
   NOTE(review): the tail of this function is elided from this excerpt;
   judging by the assert in collect_tags_mapper it apparently returns -1
   when the tag is unknown — confirm against the full file.  */
259 find_tag (const char *tag_name)
263 /* This is linear search; if the number of tags grow, we can switch
266 for (i = 0; i < ARRAY_SIZE (known_tags); i++)
268 int cmp = strcasecmp (known_tags[i].name, tag_name);
269 /* known_tags are sorted alphabetically, so we can
279 /* Find the value of attribute named NAME in the taginfo TAG. If the
280 attribute is not present, return NULL. If ATTRIND is non-NULL, the
281 index of the attribute in TAG will be stored there. */
/* NOTE(review): the declaration of `i', the store through ATTRIND, and
   the trailing `return NULL' are elided from this excerpt.  Attribute
   names are matched case-insensitively.  */
283 find_attr (struct taginfo *tag, const char *name, int *attrind)
286 for (i = 0; i < tag->nattrs; i++)
287 if (!strcasecmp (tag->attrs[i].name, name))
291 return tag->attrs[i].value;
/* Per-document state threaded through the tag handlers via the `void *'
   argument of map_html_tags.  (The `struct map_context {' line is
   elided from this excerpt.)  */
297 char *text; /* HTML text. */
298 char *base; /* Base URI of the document, possibly
299 changed through <base href=...>. */
300 const char *parent_base; /* Base of the current document. */
301 const char *document_file; /* File name of this document. */
302 int nofollow; /* whether NOFOLLOW was specified in a
303 <meta name=robots> tag. */
305 struct urlpos *head, *tail; /* List of URLs that is being
309 /* Append LINK_URI to the urlpos structure that is being built.
311 LINK_URI will be merged with the current document base. TAG and
312 ATTRIND are the necessary context to store the position and
/* Returns the freshly appended urlpos so callers (tag_handle_base,
   tag_handle_meta) can set extra flags on it.  NOTE(review): the
   early-return paths for unparsable links and the final `return newel'
   are elided from this excerpt — apparently NULL is returned on
   failure, since callers test the result; confirm against the full file.  */
315 static struct urlpos *
316 append_one_url (const char *link_uri, int inlinep,
317 struct taginfo *tag, int attrind, struct map_context *ctx)
319 int link_has_scheme = url_has_scheme (link_uri);
320 struct urlpos *newel;
/* A <base href> seen earlier in this document overrides the base
   inherited from the enclosing document/--base option.  */
321 const char *base = ctx->base ? ctx->base : ctx->parent_base;
326 DEBUGP (("%s: no base, merge will use \"%s\".\n",
327 ctx->document_file, link_uri));
329 if (!link_has_scheme)
331 /* We have no base, and the link does not have a host
332 attached to it. Nothing we can do. */
333 /* #### Should we print a warning here? Wget 1.5.x used to. */
/* No base available: the link must stand on its own.  */
337 url = url_parse (link_uri, NULL);
340 DEBUGP (("%s: link \"%s\" doesn't parse.\n",
341 ctx->document_file, link_uri));
347 /* Merge BASE with LINK_URI, but also make sure the result is
348 canonicalized, i.e. that "../" have been resolved.
349 (parse_url will do that for us.) */
351 char *complete_uri = uri_merge (base, link_uri);
353 DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
354 ctx->document_file, base, link_uri, complete_uri));
356 url = url_parse (complete_uri, NULL);
359 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
360 ctx->document_file, complete_uri));
/* uri_merge allocates; free on both the failure and success paths.  */
361 xfree (complete_uri);
364 xfree (complete_uri);
367 newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
368 memset (newel, 0, sizeof (*newel));
/* Record where in ctx->text the raw attribute value sat, so link
   conversion can later rewrite it in place.  */
372 newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
373 newel->size = tag->attrs[attrind].value_raw_size;
374 newel->link_inline_p = inlinep;
376 /* A URL is relative if the host is not named, and the name does not
378 if (!link_has_scheme && *link_uri != '/')
379 newel->link_relative_p = 1;
380 else if (link_has_scheme)
381 newel->link_complete_p = 1;
/* Append to the tail of ctx's list, or start the list if empty.  */
385 ctx->tail->next = newel;
389 ctx->tail = ctx->head = newel;
394 /* All the tag_* functions are called from collect_tags_mapper, as
395 specified by KNOWN_TAGS. */
397 /* For most tags, all we want to do is harvest URLs from their
/* Generic handler: for each attribute of TAG that tag_url_attributes
   marks as URL-bearing, append its value to ctx's URL list.  */
401 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
403 int i, attrind, first = -1;
404 int size = ARRAY_SIZE (tag_url_attributes);
/* Locate the first tag_url_attributes entry for this tag; entries for
   one tag are contiguous, so FIRST anchors the inner scan below.  */
406 for (i = 0; i < size; i++)
407 if (tag_url_attributes[i].tagid == tagid)
409 /* We've found the index of tag_url_attributes where the
410 attributes of our tags begin. */
/* Every tag routed here by known_tags must have at least one entry.  */
414 assert (first != -1);
416 /* Loop over the "interesting" attributes of this tag. In this
417 example, it will loop over "src" and "lowsrc".
419 <img src="foo.png" lowsrc="bar.png">
421 This has to be done in the outer loop so that the attributes are
422 processed in the same order in which they appear in the page.
423 This is required when converting links. */
425 for (attrind = 0; attrind < tag->nattrs; attrind++)
427 /* Find whether TAG/ATTRIND is a combination that contains a
429 char *attrvalue = tag->attrs[attrind].value;
431 /* If you're cringing at the inefficiency of the nested loops,
432 remember that the number of attributes the inner loop
433 iterates over is laughably small -- three in the worst case
435 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
437 if (0 == strcasecmp (tag->attrs[attrind].name,
438 tag_url_attributes[i].attr_name))
/* TUA_EXTERNAL set means "external document", so inlinep is its
   negation.  */
440 int flags = tag_url_attributes[i].flags;
441 append_one_url (attrvalue, !(flags & TUA_EXTERNAL),
/* Handle <base href=...>: record the URL like any other (but flagged so
   it is never downloaded) and install it as ctx->base for resolving the
   document's subsequent relative links.  */
449 tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
451 struct urlpos *base_urlpos;
453 char *newbase = find_attr (tag, "href", &attrind);
457 base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
/* The base entry participates in link conversion but must not be
   fetched.  */
460 base_urlpos->ignore_when_downloading = 1;
461 base_urlpos->link_base_p = 1;
/* A relative <base href> is itself resolved against the parent base.  */
465 if (ctx->parent_base)
466 ctx->base = uri_merge (ctx->parent_base, newbase);
468 ctx->base = xstrdup (newbase);
/* Handle <link href=...>.  Only rel="stylesheet" links count as inline
   (needed to render the page); everything else is external.  */
472 tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
475 char *href = find_attr (tag, "href", &attrind);
477 /* All <link href="..."> link references are external,
478 except for <link rel="stylesheet" href="...">. */
481 char *rel = find_attr (tag, "rel", NULL);
482 int inlinep = (rel && 0 == strcasecmp (rel, "stylesheet"));
483 append_one_url (href, inlinep, tag, attrind, ctx);
487 /* Some pages use a META tag to specify that the page be refreshed by
488 a new page after a given number of seconds. The general format for
491 <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
493 So we just need to skip past the "NUMBER; URL=" garbage to get to
/* Also handles <meta name=robots ...>: a "nofollow" or "none" token in
   CONTENT sets ctx->nofollow.  NOTE(review): several parsing lines
   (skipping the "; URL=" separator, the p += 4 advance, early returns)
   are elided from this excerpt.  */
497 tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
499 char *name = find_attr (tag, "name", NULL);
500 char *http_equiv = find_attr (tag, "http-equiv", NULL);
502 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
504 struct urlpos *entry;
507 char *p, *refresh = find_attr (tag, "content", &attrind);
/* Parse the leading decimal NUMBER by hand.  */
510 for (p = refresh; ISDIGIT (*p); p++)
511 timeout = 10 * timeout + *p - '0';
/* Require a (case-insensitive) "URL" token before the target.  */
517 if (!( TOUPPER (*p) == 'U'
518 && TOUPPER (*(p + 1)) == 'R'
519 && TOUPPER (*(p + 2)) == 'L'
526 entry = append_one_url (p, 0, tag, attrind, ctx);
529 entry->link_refresh_p = 1;
530 entry->refresh_timeout = timeout;
533 else if (name && 0 == strcasecmp (name, "robots"))
535 /* Handle stuff like:
536 <meta name="robots" content="index,nofollow"> */
537 char *content = find_attr (tag, "content", NULL);
/* "none" implies noindex,nofollow.  */
540 if (!strcasecmp (content, "none"))
546 /* Find the next occurrence of ',' or the end of
548 char *end = strchr (content, ',');
552 end = content + strlen (content);
/* Compare only the current comma-delimited token.  */
553 if (!strncasecmp (content, "nofollow", end - content))
561 /* Examine name and attributes of TAG and take appropriate action
562 according to the tag. */
/* Callback passed to map_html_tags; ARG is the struct map_context.
   Dispatches to the handler recorded in known_tags.  */
565 collect_tags_mapper (struct taginfo *tag, void *arg)
567 struct map_context *ctx = (struct map_context *)arg;
569 tag_handler_t handler;
/* Only tags listed in interesting_tags reach this callback, so the
   lookup must succeed.  */
571 tagid = find_tag (tag->name);
572 assert (tagid != -1);
573 handler = known_tags[tagid].handler;
575 handler (tagid, tag, ctx);
578 /* Analyze HTML tags FILE and construct a list of URLs referenced from
579 it. It merges relative links in FILE with URL. It is aware of
580 <base href=...> and does the right thing. */
/* Returns the harvested urlpos list (ctx.head — the `return' itself is
   elided from this excerpt).  If META_DISALLOW_FOLLOW is non-NULL it
   receives the <meta name=robots> nofollow verdict.  NOTE(review): the
   `return NULL' on read failure, ctx.base/nofollow initialization, and
   the read_file cleanup are among the lines missing here.  */
582 get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
584 struct file_memory *fm;
585 struct map_context ctx;
588 fm = read_file (file);
591 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
594 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
596 ctx.text = fm->content;
597 ctx.head = ctx.tail = NULL;
/* URL takes precedence; --base sets opt.base_href as a fallback.  */
599 ctx.parent_base = url ? url : opt.base_href;
600 ctx.document_file = file;
/* Lazily build the tag/attribute filters on first use.  */
603 if (!interesting_tags)
606 map_html_tags (fm->content, fm->length, interesting_tags,
607 interesting_attributes, collect_tags_mapper, &ctx);
609 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
610 if (meta_disallow_follow)
611 *meta_disallow_follow = ctx.nofollow;
/* ctx.base is heap-allocated by tag_handle_base when a <base> was seen.  */
613 FREE_MAYBE (ctx.base);
/* Release the lazily-built filter arrays from init_interesting.
   (The array *contents* point into static tables and are not freed.)  */
619 cleanup_html_url (void)
621 FREE_MAYBE (interesting_tags);
622 FREE_MAYBE (interesting_attributes);