1 /* Parsing FTP `ls' output.
2 Copyright (C) 1995, 1996, 1997, 2000, 2001
3 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 In addition, as a special exception, the Free Software Foundation
22 gives permission to link the code of its release of Wget with the
23 OpenSSL project's "OpenSSL" library (or with modified versions of it
24 that use the same license as the "OpenSSL" library), and distribute
25 the linked executables. You must obey the GNU General Public License
26 in all respects for all of the code used other than "OpenSSL". If you
27 modify this file, you may extend this exception to your version of the
28 file, but you are not obligated to do so. If you do not wish to do
29 so, delete this exception statement from your version. */
46 #include "convert.h" /* for html_quote_string prototype */
47 #include "retr.h" /* for output_stream */
49 /* Converts symbolic permissions to number-style ones, e.g. string
50 rwxr-xr-x to 755. For now, it knows nothing of
51 setuid/setgid/sticky. ACLs are ignored. */
53 symperms (const char *s)
59 for (i = 0; i < 3; i++, s += 3)
62 perms += (((s[0] == 'r') << 2) + ((s[1] == 'w') << 1) +
63 (s[2] == 'x' || s[2] == 's'));
69 /* Cleans a line of text so that it can be consistently parsed. Destroys
70 <CR> and <LF> in case they occur at the end of the line and
71 replaces all <TAB> characters with <SPACE>. Returns the length of the
74 clean_line(char *line)
76 int len = strlen (line);
78 if (line[len - 1] == '\n')
80 if (line[len - 1] == '\r')
82 for ( ; *line ; line++ ) if (*line == '\t') *line = ' ';
86 /* Convert the Un*x-ish style directory listing stored in FILE to a
87 linked list of fileinfo (system-independent) entries. The contents
88 of FILE are considered to be produced by the standard Unix `ls -la'
89 output (whatever that might be). BSD (no group) and SYSV (with
90 group) listings are handled.
92 The time stamps are stored in a separate variable, time_t
93 compatible (I hope). The timezones are ignored. */
94 static struct fileinfo *
95 ftp_parse_unix_ls (const char *file, int ignore_perms)
98 static const char *months[] = {
99 "Jan", "Feb", "Mar", "Apr", "May", "Jun",
100 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
102 int next, len, i, error, ignore;
103 int year, month, day; /* for time analysis */
105 struct tm timestruct, *tnow;
108 char *line, *tok; /* tokenizer */
109 struct fileinfo *dir, *l, cur; /* list creation */
111 fp = fopen (file, "rb");
114 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
119 /* Line loop to end of file: */
120 while ((line = read_whole_line (fp)) != NULL)
122 len = clean_line (line);
123 /* Skip if total... */
124 if (!strncasecmp (line, "total", 5))
129 /* Get the first token (permissions). */
130 tok = strtok (line, " ");
140 /* Decide whether we deal with a file or a directory. */
144 cur.type = FT_PLAINFILE;
145 DEBUGP (("PLAINFILE; "));
148 cur.type = FT_DIRECTORY;
149 DEBUGP (("DIRECTORY; "));
152 cur.type = FT_SYMLINK;
153 DEBUGP (("SYMLINK; "));
156 cur.type = FT_UNKNOWN;
157 DEBUGP (("UNKNOWN; "));
172 /*cur.perms = 1023;*/ /* #### What is this? --hniksic */
175 DEBUGP (("implicit perms %0o; ", cur.perms));
179 cur.perms = symperms (tok + 1);
180 DEBUGP (("perms %0o; ", cur.perms));
183 error = ignore = 0; /* Erroneous and ignoring entries are
184 treated equally for now. */
185 year = hour = min = sec = 0; /* Silence the compiler. */
188 /* While there are tokens on the line, parse them. Next is the
189 number of tokens left until the filename.
191 Use the month-name token as the "anchor" (the place where the
192 position wrt the file name is "known"). When a month name is
193 encountered, `next' is set to 5. Also, the preceding
194 characters are parsed to get the file size.
196 This tactic is quite dubious when it comes to
197 internationalization issues (non-English month names), but it
199 while ((tok = strtok (NULL, " ")) != NULL)
202 if (next < 0) /* a month name was not encountered */
204 for (i = 0; i < 12; i++)
205 if (!strcmp (tok, months[i]))
207 /* If we got a month, it means the token before it is the
208 size, and the filename is three tokens away. */
213 /* Back up to the beginning of the previous token
214 and parse it with str_to_wgint. */
216 while (t > line && ISDIGIT (*t))
220 /* Something has gone wrong during parsing. */
225 size = str_to_wgint (t, NULL, 10);
226 if (size == WGINT_MAX && errno == ERANGE)
227 /* Out of range -- ignore the size. #### Should
228 we refuse to start the download? */
235 DEBUGP (("month: %s; ", months[month]));
238 else if (next == 4) /* days */
240 if (tok[1]) /* two-digit... */
241 day = 10 * (*tok - '0') + tok[1] - '0';
242 else /* ...or one-digit */
244 DEBUGP (("day: %d; ", day));
248 /* This ought to be either the time, or the year. Let's
251 If we have a number x, it's a year. If we have x:y,
252 it's hours and minutes. If we have x:y:z, z are
255 min = hour = sec = 0;
256 /* We must deal with digits. */
259 /* Suppose it's year. */
260 for (; ISDIGIT (*tok); tok++)
261 year = (*tok - '0') + 10 * year;
264 /* This means these were hours! */
268 /* Get the minutes... */
269 for (; ISDIGIT (*tok); tok++)
270 min = (*tok - '0') + 10 * min;
273 /* ...and the seconds. */
275 for (; ISDIGIT (*tok); tok++)
276 sec = (*tok - '0') + 10 * sec;
281 DEBUGP (("year: %d (no tm); ", year));
283 DEBUGP (("time: %02d:%02d:%02d (no yr); ", hour, min, sec));
285 else if (next == 2) /* The file name */
290 /* Since the file name may contain a SPC, it is possible
291 for strtok to handle it wrong. */
292 fnlen = strlen (tok);
293 if (fnlen < len - (tok - line))
295 /* So we have a SPC in the file name. Restore the
298 /* If the file is a symbolic link, it should have a
300 if (cur.type == FT_SYMLINK)
302 p = strstr (tok, " -> ");
308 cur.linkto = xstrdup (p + 4);
309 DEBUGP (("link to: %s\n", cur.linkto));
310 /* And separate it from the file name. */
314 /* If we have the filename, add it to the list of files or
316 /* "." and ".." are an exception! */
317 if (!strcmp (tok, ".") || !strcmp (tok, ".."))
319 DEBUGP (("\nIgnoring `.' and `..'; "));
323 /* Some FTP sites choose to have ls -F as their default
324 LIST output, which marks the symlinks with a trailing
325 `@', directory names with a trailing `/' and
326 executables with a trailing `*'. This is no problem
327 unless encountering a symbolic link ending with `@',
328 or an executable ending with `*' on a server without
329 default -F output. I believe these cases are very
331 fnlen = strlen (tok); /* re-calculate `fnlen' */
332 cur.name = xmalloc (fnlen + 1);
333 memcpy (cur.name, tok, fnlen + 1);
336 if (cur.type == FT_DIRECTORY && cur.name[fnlen - 1] == '/')
338 cur.name[fnlen - 1] = '\0';
339 DEBUGP (("trailing `/' on dir.\n"));
341 else if (cur.type == FT_SYMLINK && cur.name[fnlen - 1] == '@')
343 cur.name[fnlen - 1] = '\0';
344 DEBUGP (("trailing `@' on link.\n"));
346 else if (cur.type == FT_PLAINFILE
347 && (cur.perms & 0111)
348 && cur.name[fnlen - 1] == '*')
350 cur.name[fnlen - 1] = '\0';
351 DEBUGP (("trailing `*' on exec.\n"));
362 if (!cur.name || (cur.type == FT_SYMLINK && !cur.linkto))
369 DEBUGP (("Skipping.\n"));
370 xfree_null (cur.name);
371 xfree_null (cur.linkto);
378 l = dir = xnew (struct fileinfo);
379 memcpy (l, &cur, sizeof (cur));
380 l->prev = l->next = NULL;
385 l->next = xnew (struct fileinfo);
387 memcpy (l, &cur, sizeof (cur));
390 /* Get the current time. */
391 timenow = time (NULL);
392 tnow = localtime (&timenow);
393 /* Build the time-stamp (the idea by zaga@fly.cc.fer.hr). */
394 timestruct.tm_sec = sec;
395 timestruct.tm_min = min;
396 timestruct.tm_hour = hour;
397 timestruct.tm_mday = day;
398 timestruct.tm_mon = month;
401 /* Some listings will not specify the year if it is "obvious"
402 that the file was from the previous year. E.g. if today
403 is 97-01-12, and you see a file of Dec 15th, its year is
404 1996, not 1997. Thanks to Vladimir Volovich for
406 if (month > tnow->tm_mon)
407 timestruct.tm_year = tnow->tm_year - 1;
409 timestruct.tm_year = tnow->tm_year;
412 timestruct.tm_year = year;
413 if (timestruct.tm_year >= 1900)
414 timestruct.tm_year -= 1900;
415 timestruct.tm_wday = 0;
416 timestruct.tm_yday = 0;
417 timestruct.tm_isdst = -1;
418 l->tstamp = mktime (&timestruct); /* store the time-stamp */
427 static struct fileinfo *
428 ftp_parse_winnt_ls (const char *file)
432 int year, month, day; /* for time analysis */
434 struct tm timestruct;
436 char *line, *tok; /* tokenizer */
437 struct fileinfo *dir, *l, cur; /* list creation */
439 fp = fopen (file, "rb");
442 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
447 /* Line loop to end of file: */
448 while ((line = read_whole_line (fp)) != NULL)
450 len = clean_line (line);
452 /* Extracting name is a bit of black magic and we have to do it
453 before `strtok' inserted extra \0 characters in the line
454 string. For the moment let us just suppose that the name starts at
455 column 39 of the listing. This way we could also recognize
456 filenames that begin with a series of space characters (but who
457 really wants to use such filenames anyway?). */
458 if (len < 40) continue;
460 cur.name = xstrdup(tok);
461 DEBUGP(("Name: '%s'\n", cur.name));
463 /* First column: mm-dd-yy. Should atoi() on the month fail, january
465 tok = strtok(line, "-");
466 if (tok == NULL) continue;
467 month = atoi(tok) - 1;
468 if (month < 0) month = 0;
469 tok = strtok(NULL, "-");
470 if (tok == NULL) continue;
472 tok = strtok(NULL, " ");
473 if (tok == NULL) continue;
475 /* Assuming the epoch starting at 1.1.1970 */
476 if (year <= 70) year += 100;
478 /* Second column: hh:mm[AP]M, listing does not contain value for
480 tok = strtok(NULL, ":");
481 if (tok == NULL) continue;
483 tok = strtok(NULL, "M");
484 if (tok == NULL) continue;
486 /* Adjust hour from AM/PM. Just for the record, the sequence goes
487 11:00AM, 12:00PM, 01:00PM ... 11:00PM, 12:00AM, 01:00AM . */
489 if (hour == 12) hour = 0;
490 if (*tok == 'P') hour += 12;
492 DEBUGP(("YYYY/MM/DD HH:MM - %d/%02d/%02d %02d:%02d\n",
493 year+1900, month, day, hour, min));
495 /* Build the time-stamp (copy & paste from above) */
496 timestruct.tm_sec = 0;
497 timestruct.tm_min = min;
498 timestruct.tm_hour = hour;
499 timestruct.tm_mday = day;
500 timestruct.tm_mon = month;
501 timestruct.tm_year = year;
502 timestruct.tm_wday = 0;
503 timestruct.tm_yday = 0;
504 timestruct.tm_isdst = -1;
505 cur.tstamp = mktime (&timestruct); /* store the time-stamp */
507 DEBUGP(("Timestamp: %ld\n", cur.tstamp));
509 /* Third column: Either file length, or <DIR>. We also set the
510 permissions (guessed as 0644 for plain files and 0755 for
511 directories as the listing does not give us a clue) and filetype
513 tok = strtok(NULL, " ");
514 if (tok == NULL) continue;
515 while ((tok != NULL) && (*tok == '\0')) tok = strtok(NULL, " ");
516 if (tok == NULL) continue;
519 cur.type = FT_DIRECTORY;
522 DEBUGP(("Directory\n"));
527 cur.type = FT_PLAINFILE;
529 size = str_to_wgint (tok, NULL, 10);
530 if (size == WGINT_MAX && errno == ERANGE)
531 cur.size = 0; /* overflow */
535 DEBUGP(("File, size %s bytes\n", number_to_static_string (cur.size)));
540 /* And put everything into the linked list */
543 l = dir = xnew (struct fileinfo);
544 memcpy (l, &cur, sizeof (cur));
545 l->prev = l->next = NULL;
550 l->next = xnew (struct fileinfo);
552 memcpy (l, &cur, sizeof (cur));
563 /* Converts VMS symbolic permissions to number-style ones, e.g. string
564 RWED,RWE,RE to 755. "D" (delete) is taken to be equal to "W"
565 (write). Inspired by a patch of Stoyan Lekov <lekov@eda.bg>. */
567 vmsperms (const char *s)
574 case ',': perms <<= 3; break;
575 case 'R': perms |= 4; break;
576 case 'W': perms |= 2; break;
577 case 'D': perms |= 2; break;
578 case 'E': perms |= 1; break;
579 default: DEBUGP(("wrong VMS permissons!\n"));
587 static struct fileinfo *
588 ftp_parse_vms_ls (const char *file)
591 /* #### A third copy of more-or-less the same array ? */
592 static const char *months[] = {
593 "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
594 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
597 int year, month, day; /* for time analysis */
599 struct tm timestruct;
601 char *line, *tok; /* tokenizer */
602 struct fileinfo *dir, *l, cur; /* list creation */
604 fp = fopen (file, "rb");
607 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
612 /* Skip empty line. */
613 line = read_whole_line (fp);
616 /* Skip "Directory PUB$DEVICE[PUB]" */
617 line = read_whole_line (fp);
620 /* Skip empty line. */
621 line = read_whole_line (fp);
624 /* Line loop to end of file: */
625 while ((line = read_whole_line (fp)) != NULL)
628 i = clean_line (line);
635 /* First column: Name. A bit of black magic again. The name may be
636 either ABCD.EXT or ABCD.EXT;NUM and it might be on a separate
637 line. Therefore we will first try to get the complete name
638 until the first space character; if it fails, we assume that the name
639 occupies the whole line. After that we search for the version
640 separator ";", we remove it and check the extension of the file;
641 extension .DIR denotes directory. */
643 tok = strtok(line, " ");
644 if (tok == NULL) tok = line;
645 DEBUGP(("file name: '%s'\n", tok));
646 for (p = tok ; *p && *p != ';' ; p++);
647 if (*p == ';') *p = '\0';
648 p = tok + strlen(tok) - 4;
649 if (!strcmp(p, ".DIR")) *p = '\0';
650 cur.name = xstrdup(tok);
651 DEBUGP(("Name: '%s'\n", cur.name));
653 /* If the name ends on .DIR or .DIR;#, it's a directory. We also set
654 the file size to zero as the listing does tell us only the size in
655 filesystem blocks - for an integrity check (when mirroring, for
656 example) we would need the size in bytes. */
660 cur.type = FT_DIRECTORY;
662 DEBUGP(("Directory\n"));
666 cur.type = FT_PLAINFILE;
672 /* Second column, if exists, or the first column of the next line
673 contain file size in blocks. We will skip it. */
675 tok = strtok(NULL, " ");
678 DEBUGP(("Getting additional line\n"));
680 line = read_whole_line (fp);
683 DEBUGP(("empty line read, leaving listing parser\n"));
686 i = clean_line (line);
689 DEBUGP(("confusing VMS listing item, leaving listing parser\n"));
693 tok = strtok(line, " ");
695 DEBUGP(("second token: '%s'\n", tok));
697 /* Third/Second column: Date DD-MMM-YYYY. */
699 tok = strtok(NULL, "-");
700 if (tok == NULL) continue;
701 DEBUGP(("day: '%s'\n",tok));
703 tok = strtok(NULL, "-");
706 /* If the server produces garbage like
707 'EA95_0PS.GZ;1 No privilege for attempted operation'
708 the first strtok(NULL, "-") will return everything until the end
709 of the line and only the next strtok() call will return NULL. */
710 DEBUGP(("nonsense in VMS listing, skipping this line\n"));
714 for (i=0; i<12; i++) if (!strcmp(tok,months[i])) break;
715 /* Unknown months are mapped to January */
717 tok = strtok (NULL, " ");
718 if (tok == NULL) continue;
719 year = atoi (tok) - 1900;
720 DEBUGP(("date parsed\n"));
722 /* Fourth/Third column: Time hh:mm[:ss] */
723 tok = strtok (NULL, " ");
724 if (tok == NULL) continue;
728 for (; *p && *p != ':'; ++p);
731 for (; *p && *p != ':'; ++p);
735 DEBUGP(("YYYY/MM/DD HH:MM:SS - %d/%02d/%02d %02d:%02d:%02d\n",
736 year+1900, month, day, hour, min, sec));
738 /* Build the time-stamp (copy & paste from above) */
739 timestruct.tm_sec = sec;
740 timestruct.tm_min = min;
741 timestruct.tm_hour = hour;
742 timestruct.tm_mday = day;
743 timestruct.tm_mon = month;
744 timestruct.tm_year = year;
745 timestruct.tm_wday = 0;
746 timestruct.tm_yday = 0;
747 timestruct.tm_isdst = -1;
748 cur.tstamp = mktime (&timestruct); /* store the time-stamp */
750 DEBUGP(("Timestamp: %ld\n", cur.tstamp));
752 /* Skip the fifth column */
754 tok = strtok(NULL, " ");
755 if (tok == NULL) continue;
757 /* Sixth column: Permissions */
759 tok = strtok(NULL, ","); /* Skip the VMS-specific SYSTEM permissions */
760 if (tok == NULL) continue;
761 tok = strtok(NULL, ")");
764 DEBUGP(("confusing VMS permissions, skipping line\n"));
768 /* Permissions have the format "RWED,RWED,RE" */
769 cur.perms = vmsperms(tok);
770 DEBUGP(("permissions: %s -> 0%o\n", tok, cur.perms));
774 /* And put everything into the linked list */
777 l = dir = xnew (struct fileinfo);
778 memcpy (l, &cur, sizeof (cur));
779 l->prev = l->next = NULL;
784 l->next = xnew (struct fileinfo);
786 memcpy (l, &cur, sizeof (cur));
798 /* This function selects the correct parsing routine depending on
799 the SYSTEM_TYPE. The system type should be based on the result of the
800 "SYST" response of the FTP server. According to this response we will
801 use one of the three different listing parsers that cover most of the FTP
802 servers used nowadays. */
805 ftp_parse_ls (const char *file, const enum stype system_type)
810 return ftp_parse_unix_ls (file, 0);
813 /* Detect whether the listing is simulating the UNIX format */
816 fp = fopen (file, "rb");
819 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
824 /* If the first character of the file is '0'-'9', it's WINNT
826 if (c >= '0' && c <='9')
827 return ftp_parse_winnt_ls (file);
829 return ftp_parse_unix_ls (file, 1);
832 return ftp_parse_vms_ls (file);
834 return ftp_parse_unix_ls (file, 1);
836 logprintf (LOG_NOTQUIET, _("\
837 Unsupported listing type, trying Unix listing parser.\n"));
838 return ftp_parse_unix_ls (file, 0);
842 /* Stuff for creating FTP index. */
844 /* The function creates an HTML index containing references to given
845 directories and files on the appropriate host. The references are
848 ftp_index (const char *file, struct url *u, struct fileinfo *f)
852 char *htclfile; /* HTML-clean file name */
856 fp = fopen (file, "wb");
859 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
867 char *tmpu, *tmpp; /* temporary, clean user and passwd */
869 tmpu = url_escape (u->user);
870 tmpp = u->passwd ? url_escape (u->passwd) : NULL;
872 upwd = concat_strings (tmpu, ":", tmpp, "@", (char *) 0);
874 upwd = concat_strings (tmpu, "@", (char *) 0);
880 fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
881 fprintf (fp, "<html>\n<head>\n<title>");
882 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
883 fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
884 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
885 fprintf (fp, "</h1>\n<hr>\n<pre>\n");
891 /* #### Should we translate the months? Or, even better, use
893 static const char *months[] = {
894 "Jan", "Feb", "Mar", "Apr", "May", "Jun",
895 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
897 struct tm *ptm = localtime ((time_t *)&f->tstamp);
899 fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
902 fprintf (fp, "%02d:%02d ", ptm->tm_hour, ptm->tm_min);
907 fprintf (fp, _("time unknown "));
911 fprintf (fp, _("File "));
914 fprintf (fp, _("Directory "));
917 fprintf (fp, _("Link "));
920 fprintf (fp, _("Not sure "));
923 htclfile = html_quote_string (f->name);
924 fprintf (fp, "<a href=\"ftp://%s%s:%d", upwd, u->host, u->port);
927 fprintf (fp, "%s", u->dir);
930 fprintf (fp, "%s", htclfile);
931 if (f->type == FT_DIRECTORY)
933 fprintf (fp, "\">%s", htclfile);
934 if (f->type == FT_DIRECTORY)
936 fprintf (fp, "</a> ");
937 if (f->type == FT_PLAINFILE)
938 fprintf (fp, _(" (%s bytes)"), number_to_static_string (f->size));
939 else if (f->type == FT_SYMLINK)
940 fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
945 fprintf (fp, "</pre>\n</body>\n</html>\n");