sjero.net Git - wget/blob - src/html-url.c

   1 /* Collect URLs from HTML source.
   2    Copyright (C) 1998, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #ifdef HAVE_STRING_H
  24 # include <string.h>
  25 #else
  26 # include <strings.h>
  27 #endif
  28 #include <stdlib.h>
  29 #include <ctype.h>
  30 #include <errno.h>
  31 #include <assert.h>
  32
  33 #include "wget.h"
  34 #include "html-parse.h"
  35 #include "url.h"
  36 #include "utils.h"
  37
  38 #ifndef errno
  39 extern int errno;
  40 #endif
  41
  42 enum tag_category { TC_LINK, TC_SPEC };
  43
  44 /* Here we try to categorize the known tags.  Each tag has its ID (a
  45    TAG_* macro equal to the tag's index in this array) and category.
  46    TC_LINK means that one or more of the tag's attributes contain links
  47    that should be retrieved.  TC_SPEC means that the tag has to be handled
  48    specially.  Keep entries sorted by name; find_tag depends on the order.  */
  49 static struct {
  50   const char *name;
  51   enum tag_category category;
  52 } known_tags[] = {
  53 #define TAG_A           0
  54   { "a",        TC_LINK },
  55 #define TAG_APPLET      1
  56   { "applet",   TC_LINK },
  57 #define TAG_AREA        2
  58   { "area",     TC_LINK },
  59 #define TAG_BASE        3
  60   { "base",     TC_SPEC },
  61 #define TAG_BGSOUND     4
  62   { "bgsound",  TC_LINK },
  63 #define TAG_BODY        5
  64   { "body",     TC_LINK },
  65 #define TAG_EMBED       6
  66   { "embed",    TC_LINK },
  67 #define TAG_FIG         7
  68   { "fig",      TC_LINK },
  69 #define TAG_FRAME       8
  70   { "frame",    TC_LINK },
  71 #define TAG_IFRAME      9
  72   { "iframe",   TC_LINK },
  73 #define TAG_IMG         10
  74   { "img",      TC_LINK },
  75 #define TAG_INPUT       11
  76   { "input",    TC_LINK },
  77 #define TAG_LAYER       12
  78   { "layer",    TC_LINK },
  79 #define TAG_LINK        13
  80   { "link",     TC_SPEC },
  81 #define TAG_META        14
  82   { "meta",     TC_SPEC },
  83 #define TAG_OVERLAY     15
  84   { "overlay",  TC_LINK },
  85 #define TAG_SCRIPT      16
  86   { "script",   TC_LINK },
  87 #define TAG_TABLE       17
  88   { "table",    TC_LINK },
  89 #define TAG_TD          18
  90   { "td",       TC_LINK },
  91 #define TAG_TH          19
  92   { "th",       TC_LINK }
  93 };
  94
  95
  96 /* Flags for specific url-attr pairs handled through TC_LINK: */
  97
  98 /* The attribute points to an external document not necessary for rendering
  99    this document (i.e. it's not an inlined image, stylesheet, etc.). */
 100 #define AF_EXTERNAL 1
 101
 102
 103 /* For tags handled by TC_LINK: attributes that contain URLs to download.
 104    Rows for one tag must be adjacent: collect_tags_mapper reads them as a run. */
 105 static struct {
 106   int tagid;
 107   const char *attr_name;
 108   int flags;
 109 } url_tag_attr_map[] = {
 110   { TAG_A,              "href",         AF_EXTERNAL },
 111   { TAG_APPLET,         "code",         0 },
 112   { TAG_AREA,           "href",         AF_EXTERNAL },
 113   { TAG_BGSOUND,        "src",          0 },
 114   { TAG_BODY,           "background",   0 },
 115   { TAG_EMBED,          "src",          0 },
 116   { TAG_FIG,            "src",          0 },
 117   { TAG_FRAME,          "src",          0 },
 118   { TAG_IFRAME,         "src",          0 },
 119   { TAG_IMG,            "href",         0 },
 120   { TAG_IMG,            "lowsrc",       0 },
 121   { TAG_IMG,            "src",          0 },
 122   { TAG_INPUT,          "src",          0 },
 123   { TAG_LAYER,          "src",          0 },
 124   { TAG_OVERLAY,        "src",          0 },
 125   { TAG_SCRIPT,         "src",          0 },
 126   { TAG_TABLE,          "background",   0 },
 127   { TAG_TD,             "background",   0 },
 128   { TAG_TH,             "background",   0 }
 129 };
 130
 131 /* The lists of interesting tags and attributes are built dynamically,
 132    from the tables above.  However, some places in the code look up
 133    attributes that url_tag_attr_map does not mention.  We add them manually.  */
 134 static const char *additional_attributes[] = {
 135   "rel",                        /* for TAG_LINK */
 136   "http-equiv",                 /* for TAG_META */
 137   "name",                       /* for TAG_META */
 138   "content"                     /* for TAG_META */
 139 };
 140
 141 static const char **interesting_tags;       /* NULL-terminated; set by init_interesting */
 142 static const char **interesting_attributes; /* NULL-terminated; set by init_interesting */
 143
 144 void
 145 init_interesting (void)
 146 {
 147   /* Init the variables interesting_tags and interesting_attributes
 148      that are used by the HTML parser to know which tags and
 149      attributes we're interested in.  We initialize this only once,
 150      for performance reasons.
 151
 152      Here we also make sure that what we put in interesting_tags
 153      matches the user's preferences as specified through --ignore-tags
 154      and --follow-tags.  */
 155
 156   {
 157     int i, ind = 0;
 158     int size = ARRAY_SIZE (known_tags);
 159     interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));
 160
 161     for (i = 0; i < size; i++)
 162       {
 163         const char *name = known_tags[i].name;
 164
 165         /* Normally here we could say:
 166            interesting_tags[i] = name;
 167            But we need to respect the settings of --ignore-tags and
 168            --follow-tags, so the code gets a bit harier.  */
 169
 170         if (opt.ignore_tags)
 171           {
 172             /* --ignore-tags was specified.  Do not match these
 173                specific tags.  --ignore-tags takes precedence over
 174                --follow-tags, so we process --ignore first and fall
 175                through if there's no match. */
 176             int j, lose = 0;
 177             for (j = 0; opt.ignore_tags[j] != NULL; j++)
 178               /* Loop through all the tags this user doesn't care
 179                  about. */
 180               if (strcasecmp(opt.ignore_tags[j], name) == EQ)
 181                 {
 182                   lose = 1;
 183                   break;
 184                 }
 185             if (lose)
 186               continue;
 187           }
 188
 189         if (opt.follow_tags)
 190           {
 191             /* --follow-tags was specified.  Only match these specific
 192                tags, so return FALSE if we don't match one of them. */
 193             int j, win = 0;
 194             for (j = 0; opt.follow_tags[j] != NULL; j++)
 195               /* Loop through all the tags this user cares about. */
 196               if (strcasecmp(opt.follow_tags[j], name) == EQ)
 197                 {
 198                   win = 1;
 199                   break;
 200                 }
 201             if (!win)
 202               continue;         /* wasn't one of the explicitly
 203                                    desired tags */
 204           }
 205
 206         /* If we get to here, --follow-tags isn't being used or the
 207            tag is among the ones that are follwed, and --ignore-tags,
 208            if specified, didn't include this tag, so it's an
 209            "interesting" one. */
 210         interesting_tags[ind++] = name;
 211       }
 212     interesting_tags[ind] = NULL;
 213   }
 214
 215   /* The same for attributes, except we loop through url_tag_attr_map.
 216      Here we also need to make sure that the list of attributes is
 217      unique, and to include the attributes from additional_attributes.  */
 218   {
 219     int i, ind;
 220     const char **att = xmalloc ((ARRAY_SIZE (additional_attributes) + 1)
 221                                 * sizeof (char *));
 222     /* First copy the "additional" attributes. */
 223     for (i = 0; i < ARRAY_SIZE (additional_attributes); i++)
 224       att[i] = additional_attributes[i];
 225     ind = i;
 226     att[ind] = NULL;
 227     for (i = 0; i < ARRAY_SIZE (url_tag_attr_map); i++)
 228       {
 229         int j, seen = 0;
 230         const char *look_for = url_tag_attr_map[i].attr_name;
 231         for (j = 0; j < ind - 1; j++)
 232           if (!strcmp (att[j], look_for))
 233             {
 234               seen = 1;
 235               break;
 236             }
 237         if (!seen)
 238           {
 239             att = xrealloc (att, (ind + 2) * sizeof (*att));
 240             att[ind++] = look_for;
 241             att[ind] = NULL;
 242           }
 243       }
 244     interesting_attributes = att;
 245   }
 246 }
 247
 248 static int
 249 find_tag (const char *tag_name)
 250 {
 251   int i;
 252
 253   /* This is linear search; if the number of tags grow, we can switch
 254      to binary search.  */
 255
 256   for (i = 0; i < ARRAY_SIZE (known_tags); i++)
 257     {
 258       int cmp = strcasecmp (known_tags[i].name, tag_name);
 259       /* known_tags are sorted alphabetically, so we can
 260          micro-optimize.  */
 261       if (cmp > 0)
 262         break;
 263       else if (cmp == 0)
 264         return i;
 265     }
 266   return -1;
 267 }
 268
 269 /* Find the value of attribute named NAME in the taginfo TAG.  If the
 270    attribute is not present, return NULL.  If ATTRID is non-NULL, the
 271    exact identity of the attribute will be returned.  */
 272 static char *
 273 find_attr (struct taginfo *tag, const char *name, int *attrid)
 274 {
 275   int i;
 276   for (i = 0; i < tag->nattrs; i++)
 277     if (!strcasecmp (tag->attrs[i].name, name))
 278       {
 279         if (attrid)
 280           *attrid = i;
 281         return tag->attrs[i].value;
 282       }
 283   return NULL;
 284 }
 285
 286 struct collect_urls_closure {    /* State handed to collect_tags_mapper. */
 287   char *text;                   /* HTML text; link positions are offsets into it. */
 288   char *base;                   /* Base URI of the document, set through
 289                                    <base href=...>; NULL until one is seen. */
 290   urlpos *head, *tail;          /* First and last element of the list of URLs */
 291   const char *parent_base;      /* Base to use while `base' is NULL. */
 292   const char *document_file;    /* File name of this document. */
 293   int dash_p_leaf_HTML;         /* Whether -p is specified, and this
 294                                    document is the "leaf" node of the
 295                                    HTML tree. */
 296   int nofollow;                 /* whether NOFOLLOW was specified in a
 297                                    <meta name=robots> tag. */
 298 };
 299
 300 /* Resolve LINK_URI and append it to closure->tail.  TAG and ATTRID
 301    are the necessary context to store the position and size.  */
 302
 303 static void
 304 handle_link (struct collect_urls_closure *closure, const char *link_uri,
 305              struct taginfo *tag, int attrid)
 306 {
 307   int no_proto = !has_proto (link_uri);
 308   urlpos *newel;
 309
 310   const char *base = closure->base ? closure->base : closure->parent_base;
 311   char *complete_uri;
 312
 313   char *fragment = strrchr (link_uri, '#');
 314
 315   if (fragment)
 316     {
 317       /* Nullify the fragment identifier, i.e. everything after the
 318          last occurrence of `#', inclusive.  This copying is
 319          relatively inefficient, but it doesn't matter because
 320          fragment identifiers don't come up all that often.  */
 321       int hashlen = fragment - link_uri;
 322       char *p = alloca (hashlen + 1);
 323       memcpy (p, link_uri, hashlen);
 324       p[hashlen] = '\0';
 325       link_uri = p;
 326     }
 327
 328   if (!base)
 329     {
 330       if (no_proto)
 331         {
 332           /* We have no base, and the link does not have a protocol or
 333              a host attached to it.  Nothing we can do.  */
 334           /* #### Should we print a warning here?  Wget 1.5.x used to.  */
 335           return;
 336         }
 337       else
 338         complete_uri = xstrdup (link_uri);
 339     }
 340   else
 341     complete_uri = url_concat (base, link_uri);
 342
 343   DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
 344            closure->document_file, base ? base : "(null)",
 345            link_uri, complete_uri));
 346
 347   newel = (urlpos *)xmalloc (sizeof (urlpos));
 348
 349   memset (newel, 0, sizeof (*newel));
 350   newel->next = NULL;
 351   newel->url = complete_uri;
 352   newel->pos = tag->attrs[attrid].value_raw_beginning - closure->text;
 353   newel->size = tag->attrs[attrid].value_raw_size;
 354
 355   /* A URL is relative if the host and protocol are not named, and the
 356      name does not start with `/'.  */
 357   if (no_proto && *link_uri != '/')
 358     newel->link_relative_p = 1;
 359   else if (!no_proto)
 360     newel->link_complete_p = 1;
 361
 362   if (closure->tail)
 363     {
 364       closure->tail->next = newel;
 365       closure->tail = newel;
 366     }
 367   else
 368     closure->tail = closure->head = newel;
 369 }
 370
 371 /* #### Document what this does.
 372    #### It would be nice to split this into several functions.  */
 373
 374 static void
 375 collect_tags_mapper (struct taginfo *tag, void *arg)
 376 {
 377   struct collect_urls_closure *closure = (struct collect_urls_closure *)arg;
 378   int tagid = find_tag (tag->name);
 379   assert (tagid != -1);
 380
 381   switch (known_tags[tagid].category)
 382     {
 383     case TC_LINK:
 384       {
 385         int i;
 386         int size = ARRAY_SIZE (url_tag_attr_map);
 387         for (i = 0; i < size; i++)
 388           if (url_tag_attr_map[i].tagid == tagid)
 389             break;
 390         /* We've found the index of url_tag_attr_map where the
 391            attributes of our tags begin.  Now, look for every one of
 392            them, and handle it.  */
 393         for (; (i < size && url_tag_attr_map[i].tagid == tagid); i++)
 394           {
 395             char *attr_value;
 396             int id;
 397             if (closure->dash_p_leaf_HTML
 398                 && (url_tag_attr_map[i].flags & AF_EXTERNAL))
 399               /* If we're at a -p leaf node, we don't want to retrieve
 400                  links to references we know are external to this document,
 401                  such as <a href=...>.  */
 402               continue;
 403
 404             /* This find_attr() buried in a loop may seem inefficient
 405                (O(n^2)), but it's not, since the number of attributes
 406                (n) we loop over is extremely small.  In the worst case
 407                of IMG with all its possible attributes, n^2 will be
 408                only 9.  */
 409             attr_value = find_attr (tag, url_tag_attr_map[i].attr_name, &id);
 410             if (attr_value)
 411               handle_link (closure, attr_value, tag, id);
 412           }
 413       }
 414       break;
 415     case TC_SPEC:
 416       switch (tagid)
 417         {
 418         case TAG_BASE:
 419           {
 420             char *newbase = find_attr (tag, "href", NULL);
 421             if (!newbase)
 422               break;
 423             if (closure->base)
 424               xfree (closure->base);
 425             if (closure->parent_base)
 426               closure->base = url_concat (closure->parent_base, newbase);
 427             else
 428               closure->base = xstrdup (newbase);
 429           }
 430           break;
 431         case TAG_LINK:
 432           {
 433             int id;
 434             char *rel  = find_attr (tag, "rel", NULL);
 435             char *href = find_attr (tag, "href", &id);
 436             if (href)
 437               {
 438                 /* In the normal case, all <link href=...> tags are
 439                    fair game.
 440
 441                    In the special case of when -p is active, however,
 442                    and we're at a leaf node (relative to the -l
 443                    max. depth) in the HTML document tree, the only
 444                    <LINK> tag we'll follow is a <LINK REL=
 445                    "stylesheet">, as it'll be necessary for displaying
 446                    this document properly.  We won't follow other
 447                    <LINK> tags, like <LINK REL="home">, for instance,
 448                    as they refer to external documents.  */
 449                 if (!closure->dash_p_leaf_HTML
 450                     || (rel && !strcasecmp (rel, "stylesheet")))
 451                   handle_link (closure, href, tag, id);
 452               }
 453           }
 454           break;
 455         case TAG_META:
 456           /* Some pages use a META tag to specify that the page be
 457              refreshed by a new page after a given number of seconds.
 458              The general format for this is:
 459
 460              <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
 461
 462              So we just need to skip past the "NUMBER; URL=" garbage
 463              to get to the URL.  */
 464           {
 465             int id;
 466             char *name = find_attr (tag, "name", NULL);
 467             char *http_equiv = find_attr (tag, "http-equiv", &id);
 468             if (http_equiv && !strcasecmp (http_equiv, "refresh"))
 469               {
 470                 char *refresh = find_attr (tag, "content", NULL);
 471                 char *p = refresh;
 472                 int offset;
 473                 while (ISDIGIT (*p))
 474                   ++p;
 475                 if (*p++ != ';')
 476                   return;
 477                 while (ISSPACE (*p))
 478                   ++p;
 479                 if (!(TOUPPER (*p) == 'U'
 480                       && TOUPPER (*(p + 1)) == 'R'
 481                       && TOUPPER (*(p + 2)) == 'L'
 482                       && *(p + 3) == '='))
 483                   return;
 484                 p += 4;
 485                 while (ISSPACE (*p))
 486                   ++p;
 487                 offset = p - refresh;
 488                 tag->attrs[id].value_raw_beginning += offset;
 489                 tag->attrs[id].value_raw_size -= offset;
 490                 handle_link (closure, p, tag, id);
 491               }
 492             else if (name && !strcasecmp (name, "robots"))
 493               {
 494                 /* Handle stuff like:
 495                    <meta name="robots" content="index,nofollow"> */
 496                 char *content = find_attr (tag, "content", NULL);
 497                 if (!content)
 498                   return;
 499                 if (!strcasecmp (content, "none"))
 500                   closure->nofollow = 1;
 501                 else
 502                   {
 503                     while (*content)
 504                       {
 505                         /* Find the next occurrence of ',' or the end of
 506                            the string.  */
 507                         char *end = strchr (content, ',');
 508                         if (end)
 509                           ++end;
 510                         else
 511                           end = content + strlen (content);
 512                         if (!strncasecmp (content, "nofollow", end - content))
 513                           closure->nofollow = 1;
 514                         content = end;
 515                       }
 516                   }
 517               }
 518           }
 519           break;
 520         default:
 521           /* Category is TC_SPEC, but tag name is unhandled.  This
 522              must not be.  */
 523           abort ();
 524         }
 525       break;
 526     }
 527 }
 528
 529 /* Scan FILE, retrieving links to HTML documents from it.  Each link is
 530
 531   Similar to get_urls_file, but for HTML files.  FILE is scanned as
 532    an HTML document.  get_urls_html() constructs the URLs from the
 533    relative href-s.
 534
 535    If SILENT is non-zero, do not barf on baseless relative links.  */
 536 urlpos *
 537 get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
 538                int *meta_disallow_follow)
 539 {
 540   struct file_memory *fm;
 541   struct collect_urls_closure closure;
 542
 543   /* Load the file. */
 544   fm = read_file (file);
 545   if (!fm)
 546     {
 547       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 548       return NULL;
 549     }
 550   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 551
 552   closure.text = fm->content;
 553   closure.head = closure.tail = NULL;
 554   closure.base = NULL;
 555   closure.parent_base = this_url ? this_url : opt.base_href;
 556   closure.document_file = file;
 557   closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
 558   closure.nofollow = 0;
 559
 560   if (!interesting_tags)
 561     init_interesting ();
 562
 563   map_html_tags (fm->content, fm->length, interesting_tags,
 564                  interesting_attributes, collect_tags_mapper, &closure);
 565
 566   DEBUGP (("no-follow in %s: %d\n", file, closure.nofollow));
 567   if (meta_disallow_follow)
 568     *meta_disallow_follow = closure.nofollow;
 569
 570   FREE_MAYBE (closure.base);
 571   read_file_free (fm);
 572   return closure.head;
 573 }
 574
 575 void
 576 cleanup_html_url (void)
 577 {
 578   FREE_MAYBE (interesting_tags);
 579   FREE_MAYBE (interesting_attributes);
 580 }