1 /* Parsing FTP `ls' output.
2 Copyright (C) 1996-2004 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software Foundation, Inc.,
18 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
45 #include "convert.h" /* for html_quote_string prototype */
46 #include "retr.h" /* for output_stream */
48 /* Converts symbolic permissions to number-style ones, e.g. string
49 rwxr-xr-x to 755. For now, it knows nothing of
50 setuid/setgid/sticky. ACLs are ignored. */
52 symperms (const char *s)
58 for (i = 0; i < 3; i++, s += 3)
61 perms += (((s[0] == 'r') << 2) + ((s[1] == 'w') << 1) +
62 (s[2] == 'x' || s[2] == 's'));
68 /* Cleans a line of text so that it can be consistently parsed. Destroys
69 <CR> and <LF> in case they occur at the end of the line and
70 replaces all <TAB> characters with <SPACE>. Returns the length of the
73 clean_line(char *line)
75 int len = strlen (line);
77 if (line[len - 1] == '\n')
79 if (line[len - 1] == '\r')
81 for ( ; *line ; line++ ) if (*line == '\t') *line = ' ';
85 /* Convert the Un*x-ish style directory listing stored in FILE to a
86 linked list of fileinfo (system-independent) entries. The contents
87 of FILE are considered to be produced by the standard Unix `ls -la'
88 output (whatever that might be). BSD (no group) and SYSV (with
89 group) listings are handled.
91 The time stamps are stored in a separate variable, time_t
92 compatible (I hope). The timezones are ignored. */
93 static struct fileinfo *
94 ftp_parse_unix_ls (const char *file, int ignore_perms)
97 static const char *months[] = {
98 "Jan", "Feb", "Mar", "Apr", "May", "Jun",
99 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
101 int next, len, i, error, ignore;
102 int year, month, day; /* for time analysis */
104 struct tm timestruct, *tnow;
107 char *line, *tok; /* tokenizer */
108 struct fileinfo *dir, *l, cur; /* list creation */
110 fp = fopen (file, "rb");
113 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
118 /* Line loop to end of file: */
119 while ((line = read_whole_line (fp)) != NULL)
121 len = clean_line (line);
122 /* Skip if total... */
123 if (!strncasecmp (line, "total", 5))
128 /* Get the first token (permissions). */
129 tok = strtok (line, " ");
139 /* Decide whether we deal with a file or a directory. */
143 cur.type = FT_PLAINFILE;
144 DEBUGP (("PLAINFILE; "));
147 cur.type = FT_DIRECTORY;
148 DEBUGP (("DIRECTORY; "));
151 cur.type = FT_SYMLINK;
152 DEBUGP (("SYMLINK; "));
155 cur.type = FT_UNKNOWN;
156 DEBUGP (("UNKNOWN; "));
171 /*cur.perms = 1023;*/ /* #### What is this? --hniksic */
174 DEBUGP (("implicit perms %0o; ", cur.perms));
178 cur.perms = symperms (tok + 1);
179 DEBUGP (("perms %0o; ", cur.perms));
182 error = ignore = 0; /* Erroneous and ignoring entries are
183 treated equally for now. */
184 year = hour = min = sec = 0; /* Silence the compiler. */
187 /* While there are tokens on the line, parse them. Next is the
188 number of tokens left until the filename.
190 Use the month-name token as the "anchor" (the place where the
191 position wrt the file name is "known"). When a month name is
192 encountered, `next' is set to 5. Also, the preceding
193 characters are parsed to get the file size.
195 This tactic is quite dubious when it comes to
196 internationalization issues (non-English month names), but it
198 while ((tok = strtok (NULL, " ")) != NULL)
201 if (next < 0) /* a month name was not encountered */
203 for (i = 0; i < 12; i++)
204 if (!strcmp (tok, months[i]))
206 /* If we got a month, it means the token before it is the
207 size, and the filename is three tokens away. */
212 /* Back up to the beginning of the previous token
213 and parse it with str_to_wgint. */
215 while (t > line && ISDIGIT (*t))
219 /* Something has gone wrong during parsing. */
224 size = str_to_wgint (t, NULL, 10);
225 if (size == WGINT_MAX && errno == ERANGE)
226 /* Out of range -- ignore the size. #### Should
227 we refuse to start the download. */
234 DEBUGP (("month: %s; ", months[month]));
237 else if (next == 4) /* days */
239 if (tok[1]) /* two-digit... */
240 day = 10 * (*tok - '0') + tok[1] - '0';
241 else /* ...or one-digit */
243 DEBUGP (("day: %d; ", day));
247 /* This ought to be either the time, or the year. Let's
250 If we have a number x, it's a year. If we have x:y,
251 it's hours and minutes. If we have x:y:z, z are
254 min = hour = sec = 0;
255 /* We must deal with digits. */
258 /* Suppose it's year. */
259 for (; ISDIGIT (*tok); tok++)
260 year = (*tok - '0') + 10 * year;
263 /* This means these were hours! */
267 /* Get the minutes... */
268 for (; ISDIGIT (*tok); tok++)
269 min = (*tok - '0') + 10 * min;
272 /* ...and the seconds. */
274 for (; ISDIGIT (*tok); tok++)
275 sec = (*tok - '0') + 10 * sec;
280 DEBUGP (("year: %d (no tm); ", year));
282 DEBUGP (("time: %02d:%02d:%02d (no yr); ", hour, min, sec));
284 else if (next == 2) /* The file name */
289 /* Since the file name may contain a SPC, it is possible
290 for strtok to handle it wrong. */
291 fnlen = strlen (tok);
292 if (fnlen < len - (tok - line))
294 /* So we have a SPC in the file name. Restore the
297 /* If the file is a symbolic link, it should have a
299 if (cur.type == FT_SYMLINK)
301 p = strstr (tok, " -> ");
307 cur.linkto = xstrdup (p + 4);
308 DEBUGP (("link to: %s\n", cur.linkto));
309 /* And separate it from the file name. */
313 /* If we have the filename, add it to the list of files or
315 /* "." and ".." are an exception! */
316 if (!strcmp (tok, ".") || !strcmp (tok, ".."))
318 DEBUGP (("\nIgnoring `.' and `..'; "));
322 /* Some FTP sites choose to have ls -F as their default
323 LIST output, which marks the symlinks with a trailing
324 `@', directory names with a trailing `/' and
325 executables with a trailing `*'. This is no problem
326 unless encountering a symbolic link ending with `@',
327 or an executable ending with `*' on a server without
328 default -F output. I believe these cases are very
330 fnlen = strlen (tok); /* re-calculate `fnlen' */
331 cur.name = xmalloc (fnlen + 1);
332 memcpy (cur.name, tok, fnlen + 1);
335 if (cur.type == FT_DIRECTORY && cur.name[fnlen - 1] == '/')
337 cur.name[fnlen - 1] = '\0';
338 DEBUGP (("trailing `/' on dir.\n"));
340 else if (cur.type == FT_SYMLINK && cur.name[fnlen - 1] == '@')
342 cur.name[fnlen - 1] = '\0';
343 DEBUGP (("trailing `@' on link.\n"));
345 else if (cur.type == FT_PLAINFILE
346 && (cur.perms & 0111)
347 && cur.name[fnlen - 1] == '*')
349 cur.name[fnlen - 1] = '\0';
350 DEBUGP (("trailing `*' on exec.\n"));
361 if (!cur.name || (cur.type == FT_SYMLINK && !cur.linkto))
368 DEBUGP (("Skipping.\n"));
369 xfree_null (cur.name);
370 xfree_null (cur.linkto);
377 l = dir = xnew (struct fileinfo);
378 memcpy (l, &cur, sizeof (cur));
379 l->prev = l->next = NULL;
384 l->next = xnew (struct fileinfo);
386 memcpy (l, &cur, sizeof (cur));
389 /* Get the current time. */
390 timenow = time (NULL);
391 tnow = localtime (&timenow);
392 /* Build the time-stamp (the idea by zaga@fly.cc.fer.hr). */
393 timestruct.tm_sec = sec;
394 timestruct.tm_min = min;
395 timestruct.tm_hour = hour;
396 timestruct.tm_mday = day;
397 timestruct.tm_mon = month;
400 /* Some listings will not specify the year if it is "obvious"
401 that the file was from the previous year. E.g. if today
402 is 97-01-12, and you see a file of Dec 15th, its year is
403 1996, not 1997. Thanks to Vladimir Volovich for
405 if (month > tnow->tm_mon)
406 timestruct.tm_year = tnow->tm_year - 1;
408 timestruct.tm_year = tnow->tm_year;
411 timestruct.tm_year = year;
412 if (timestruct.tm_year >= 1900)
413 timestruct.tm_year -= 1900;
414 timestruct.tm_wday = 0;
415 timestruct.tm_yday = 0;
416 timestruct.tm_isdst = -1;
417 l->tstamp = mktime (&timestruct); /* store the time-stamp */
426 static struct fileinfo *
427 ftp_parse_winnt_ls (const char *file)
431 int year, month, day; /* for time analysis */
433 struct tm timestruct;
435 char *line, *tok; /* tokenizer */
436 struct fileinfo *dir, *l, cur; /* list creation */
438 fp = fopen (file, "rb");
441 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
446 /* Line loop to end of file: */
447 while ((line = read_whole_line (fp)) != NULL)
449 len = clean_line (line);
451 /* Extracting name is a bit of black magic and we have to do it
452 before `strtok' inserted extra \0 characters in the line
453 string. For the moment let us just suppose that the name starts at
454 column 39 of the listing. This way we could also recognize
455 filenames that begin with a series of space characters (but who
456 really wants to use such filenames anyway?). */
457 if (len < 40) continue;
459 cur.name = xstrdup(tok);
460 DEBUGP(("Name: '%s'\n", cur.name));
462 /* First column: mm-dd-yy. Should atoi() on the month fail, January
464 tok = strtok(line, "-");
465 if (tok == NULL) continue;
466 month = atoi(tok) - 1;
467 if (month < 0) month = 0;
468 tok = strtok(NULL, "-");
469 if (tok == NULL) continue;
471 tok = strtok(NULL, " ");
472 if (tok == NULL) continue;
474 /* Assuming the epoch starting at 1.1.1970 */
475 if (year <= 70) year += 100;
477 /* Second column: hh:mm[AP]M, listing does not contain value for
479 tok = strtok(NULL, ":");
480 if (tok == NULL) continue;
482 tok = strtok(NULL, "M");
483 if (tok == NULL) continue;
485 /* Adjust hour from AM/PM. Just for the record, the sequence goes
486 11:00AM, 12:00PM, 01:00PM ... 11:00PM, 12:00AM, 01:00AM . */
488 if (hour == 12) hour = 0;
489 if (*tok == 'P') hour += 12;
491 DEBUGP(("YYYY/MM/DD HH:MM - %d/%02d/%02d %02d:%02d\n",
492 year+1900, month, day, hour, min));
494 /* Build the time-stamp (copy & paste from above) */
495 timestruct.tm_sec = 0;
496 timestruct.tm_min = min;
497 timestruct.tm_hour = hour;
498 timestruct.tm_mday = day;
499 timestruct.tm_mon = month;
500 timestruct.tm_year = year;
501 timestruct.tm_wday = 0;
502 timestruct.tm_yday = 0;
503 timestruct.tm_isdst = -1;
504 cur.tstamp = mktime (&timestruct); /* store the time-stamp */
506 DEBUGP(("Timestamp: %ld\n", cur.tstamp));
508 /* Third column: Either file length, or <DIR>. We also set the
509 permissions (guessed as 0644 for plain files and 0755 for
510 directories as the listing does not give us a clue) and filetype
512 tok = strtok(NULL, " ");
513 if (tok == NULL) continue;
514 while ((tok != NULL) && (*tok == '\0')) tok = strtok(NULL, " ");
515 if (tok == NULL) continue;
518 cur.type = FT_DIRECTORY;
521 DEBUGP(("Directory\n"));
526 cur.type = FT_PLAINFILE;
528 size = str_to_wgint (tok, NULL, 10);
529 if (size == WGINT_MAX && errno == ERANGE)
530 cur.size = 0; /* overflow */
534 DEBUGP(("File, size %s bytes\n", number_to_static_string (cur.size)));
539 /* And put everything into the linked list */
542 l = dir = xnew (struct fileinfo);
543 memcpy (l, &cur, sizeof (cur));
544 l->prev = l->next = NULL;
549 l->next = xnew (struct fileinfo);
551 memcpy (l, &cur, sizeof (cur));
562 /* Converts VMS symbolic permissions to number-style ones, e.g. string
563 RWED,RWE,RE to 755. "D" (delete) is taken to be equal to "W"
564 (write). Inspired by a patch of Stoyan Lekov <lekov@eda.bg>. */
566 vmsperms (const char *s)
573 case ',': perms <<= 3; break;
574 case 'R': perms |= 4; break;
575 case 'W': perms |= 2; break;
576 case 'D': perms |= 2; break;
577 case 'E': perms |= 1; break;
578 default: DEBUGP(("wrong VMS permissons!\n"));
586 static struct fileinfo *
587 ftp_parse_vms_ls (const char *file)
590 /* #### A third copy of more-or-less the same array ? */
591 static const char *months[] = {
592 "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
593 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
596 int year, month, day; /* for time analysis */
598 struct tm timestruct;
600 char *line, *tok; /* tokenizer */
601 struct fileinfo *dir, *l, cur; /* list creation */
603 fp = fopen (file, "rb");
606 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
611 /* Skip empty line. */
612 line = read_whole_line (fp);
615 /* Skip "Directory PUB$DEVICE[PUB]" */
616 line = read_whole_line (fp);
619 /* Skip empty line. */
620 line = read_whole_line (fp);
623 /* Line loop to end of file: */
624 while ((line = read_whole_line (fp)) != NULL)
627 i = clean_line (line);
634 /* First column: Name. A bit of black magic again. The name may be
635 either ABCD.EXT or ABCD.EXT;NUM and it might be on a separate
636 line. Therefore we will first try to get the complete name
637 until the first space character; if it fails, we assume that the name
638 occupies the whole line. After that we search for the version
639 separator ";", we remove it and check the extension of the file;
640 extension .DIR denotes directory. */
642 tok = strtok(line, " ");
643 if (tok == NULL) tok = line;
644 DEBUGP(("file name: '%s'\n", tok));
645 for (p = tok ; *p && *p != ';' ; p++)
647 if (*p == ';') *p = '\0';
648 p = tok + strlen(tok) - 4;
649 if (!strcmp(p, ".DIR")) *p = '\0';
650 cur.name = xstrdup(tok);
651 DEBUGP(("Name: '%s'\n", cur.name));
653 /* If the name ends in .DIR or .DIR;#, it's a directory. We also set
654 the file size to zero as the listing only tells us the size in
655 filesystem blocks - for an integrity check (when mirroring, for
656 example) we would need the size in bytes. */
660 cur.type = FT_DIRECTORY;
662 DEBUGP(("Directory\n"));
666 cur.type = FT_PLAINFILE;
672 /* The second column, if it exists, or the first column of the next line
673 contains the file size in blocks. We will skip it. */
675 tok = strtok(NULL, " ");
678 DEBUGP(("Getting additional line\n"));
680 line = read_whole_line (fp);
683 DEBUGP(("empty line read, leaving listing parser\n"));
686 i = clean_line (line);
689 DEBUGP(("confusing VMS listing item, leaving listing parser\n"));
693 tok = strtok(line, " ");
695 DEBUGP(("second token: '%s'\n", tok));
697 /* Third/Second column: Date DD-MMM-YYYY. */
699 tok = strtok(NULL, "-");
700 if (tok == NULL) continue;
701 DEBUGP(("day: '%s'\n",tok));
703 tok = strtok(NULL, "-");
706 /* If the server produces garbage like
707 'EA95_0PS.GZ;1 No privilege for attempted operation'
708 the first strtok(NULL, "-") will return everything until the end
709 of the line and only the next strtok() call will return NULL. */
710 DEBUGP(("nonsense in VMS listing, skipping this line\n"));
714 for (i=0; i<12; i++) if (!strcmp(tok,months[i])) break;
715 /* Unknown months are mapped to January */
717 tok = strtok (NULL, " ");
718 if (tok == NULL) continue;
719 year = atoi (tok) - 1900;
720 DEBUGP(("date parsed\n"));
722 /* Fourth/Third column: Time hh:mm[:ss] */
723 tok = strtok (NULL, " ");
724 if (tok == NULL) continue;
728 for (; *p && *p != ':'; ++p)
732 for (; *p && *p != ':'; ++p)
737 DEBUGP(("YYYY/MM/DD HH:MM:SS - %d/%02d/%02d %02d:%02d:%02d\n",
738 year+1900, month, day, hour, min, sec));
740 /* Build the time-stamp (copy & paste from above) */
741 timestruct.tm_sec = sec;
742 timestruct.tm_min = min;
743 timestruct.tm_hour = hour;
744 timestruct.tm_mday = day;
745 timestruct.tm_mon = month;
746 timestruct.tm_year = year;
747 timestruct.tm_wday = 0;
748 timestruct.tm_yday = 0;
749 timestruct.tm_isdst = -1;
750 cur.tstamp = mktime (&timestruct); /* store the time-stamp */
752 DEBUGP(("Timestamp: %ld\n", cur.tstamp));
754 /* Skip the fifth column */
756 tok = strtok(NULL, " ");
757 if (tok == NULL) continue;
759 /* Sixth column: Permissions */
761 tok = strtok(NULL, ","); /* Skip the VMS-specific SYSTEM permissions */
762 if (tok == NULL) continue;
763 tok = strtok(NULL, ")");
766 DEBUGP(("confusing VMS permissions, skipping line\n"));
770 /* Permissions have the format "RWED,RWED,RE" */
771 cur.perms = vmsperms(tok);
772 DEBUGP(("permissions: %s -> 0%o\n", tok, cur.perms));
776 /* And put everything into the linked list */
779 l = dir = xnew (struct fileinfo);
780 memcpy (l, &cur, sizeof (cur));
781 l->prev = l->next = NULL;
786 l->next = xnew (struct fileinfo);
788 memcpy (l, &cur, sizeof (cur));
800 /* This function selects the correct parsing routine depending on
801 the SYSTEM_TYPE. The system type should be based on the result of the
802 "SYST" response of the FTP server. According to this response we will
803 use one of the three different listing parsers that cover most of the FTP
804 servers used nowadays. */
807 ftp_parse_ls (const char *file, const enum stype system_type)
812 return ftp_parse_unix_ls (file, 0);
815 /* Detect whether the listing is simulating the UNIX format */
818 fp = fopen (file, "rb");
821 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
826 /* If the first character of the file is '0'-'9', it's WINNT
828 if (c >= '0' && c <='9')
829 return ftp_parse_winnt_ls (file);
831 return ftp_parse_unix_ls (file, 1);
834 return ftp_parse_vms_ls (file);
836 return ftp_parse_unix_ls (file, 1);
838 logprintf (LOG_NOTQUIET, _("\
839 Unsupported listing type, trying Unix listing parser.\n"));
840 return ftp_parse_unix_ls (file, 0);
844 /* Stuff for creating FTP index. */
846 /* The function creates an HTML index containing references to given
847 directories and files on the appropriate host. The references are
850 ftp_index (const char *file, struct url *u, struct fileinfo *f)
854 char *htclfile; /* HTML-clean file name */
858 fp = fopen (file, "wb");
861 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
869 char *tmpu, *tmpp; /* temporary, clean user and passwd */
871 tmpu = url_escape (u->user);
872 tmpp = u->passwd ? url_escape (u->passwd) : NULL;
874 upwd = concat_strings (tmpu, ":", tmpp, "@", (char *) 0);
876 upwd = concat_strings (tmpu, "@", (char *) 0);
882 fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
883 fprintf (fp, "<html>\n<head>\n<title>");
884 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
885 fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
886 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
887 fprintf (fp, "</h1>\n<hr>\n<pre>\n");
893 /* #### Should we translate the months? Or, even better, use
895 static const char *months[] = {
896 "Jan", "Feb", "Mar", "Apr", "May", "Jun",
897 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
899 struct tm *ptm = localtime ((time_t *)&f->tstamp);
901 fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
904 fprintf (fp, "%02d:%02d ", ptm->tm_hour, ptm->tm_min);
909 fprintf (fp, _("time unknown "));
913 fprintf (fp, _("File "));
916 fprintf (fp, _("Directory "));
919 fprintf (fp, _("Link "));
922 fprintf (fp, _("Not sure "));
925 htclfile = html_quote_string (f->name);
926 fprintf (fp, "<a href=\"ftp://%s%s:%d", upwd, u->host, u->port);
929 fprintf (fp, "%s", u->dir);
932 fprintf (fp, "%s", htclfile);
933 if (f->type == FT_DIRECTORY)
935 fprintf (fp, "\">%s", htclfile);
936 if (f->type == FT_DIRECTORY)
938 fprintf (fp, "</a> ");
939 if (f->type == FT_PLAINFILE)
940 fprintf (fp, _(" (%s bytes)"), number_to_static_string (f->size));
941 else if (f->type == FT_SYMLINK)
942 fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
947 fprintf (fp, "</pre>\n</body>\n</html>\n");