1 /* Parsing FTP `ls' output.
2 Copyright (C) 1996-2004 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software Foundation, Inc.,
18 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
45 #include "convert.h" /* for html_quote_string prototype */
46 #include "retr.h" /* for output_stream */
48 /* Converts symbolic permissions to number-style ones, e.g. string
49 rwxr-xr-x to 755. For now, it knows nothing of
50 setuid/setgid/sticky. ACLs are ignored. */
52 symperms (const char *s)
58 for (i = 0; i < 3; i++, s += 3)
61 perms += (((s[0] == 'r') << 2) + ((s[1] == 'w') << 1) +
62 (s[2] == 'x' || s[2] == 's'));
68 /* Cleans a line of text so that it can be consistently parsed. Destroys
69 <CR> and <LF> in case that they occur at the end of the line and
70 replaces all <TAB> characters with <SPACE>. Returns the length of the
73 clean_line(char *line)
75 int len = strlen (line);
77 if (line[len - 1] == '\n')
79 if (line[len - 1] == '\r')
81 for ( ; *line ; line++ ) if (*line == '\t') *line = ' ';
85 /* Convert the Un*x-ish style directory listing stored in FILE to a
86 linked list of fileinfo (system-independent) entries. The contents
87 of FILE are considered to be produced by the standard Unix `ls -la'
88 output (whatever that might be). BSD (no group) and SYSV (with
89 group) listings are handled.
91 The time stamps are stored in a separate variable, time_t
92 compatible (I hope). The timezones are ignored. */
93 static struct fileinfo *
94 ftp_parse_unix_ls (const char *file, int ignore_perms)
97 static const char *months[] = {
98 "Jan", "Feb", "Mar", "Apr", "May", "Jun",
99 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
101 int next, len, i, error, ignore;
102 int year, month, day; /* for time analysis */
104 struct tm timestruct, *tnow;
107 char *line, *tok, *ptok; /* tokenizer */
108 struct fileinfo *dir, *l, cur; /* list creation */
110 fp = fopen (file, "rb");
113 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
118 /* Line loop to end of file: */
119 while ((line = read_whole_line (fp)) != NULL)
121 len = clean_line (line);
122 /* Skip if total... */
123 if (!strncasecmp (line, "total", 5))
128 /* Get the first token (permissions). */
129 tok = strtok (line, " ");
139 /* Decide whether we deal with a file or a directory. */
143 cur.type = FT_PLAINFILE;
144 DEBUGP (("PLAINFILE; "));
147 cur.type = FT_DIRECTORY;
148 DEBUGP (("DIRECTORY; "));
151 cur.type = FT_SYMLINK;
152 DEBUGP (("SYMLINK; "));
155 cur.type = FT_UNKNOWN;
156 DEBUGP (("UNKNOWN; "));
171 /*cur.perms = 1023;*/ /* #### What is this? --hniksic */
174 DEBUGP (("implicit perms %0o; ", cur.perms));
178 cur.perms = symperms (tok + 1);
179 DEBUGP (("perms %0o; ", cur.perms));
182 error = ignore = 0; /* Erroneous and ignoring entries are
183 treated equally for now. */
184 year = hour = min = sec = 0; /* Silence the compiler. */
187 /* While there are tokens on the line, parse them. Next is the
188 number of tokens left until the filename.
190 Use the month-name token as the "anchor" (the place where the
191 position wrt the file name is "known"). When a month name is
192 encountered, `next' is set to 5. Also, the preceding
193 characters are parsed to get the file size.
195 This tactic is quite dubious when it comes to
196 internationalization issues (non-English month names), but it
200 (tok = strtok (NULL, " ")) != NULL)
203 if (next < 0) /* a month name was not encountered */
205 for (i = 0; i < 12; i++)
206 if (!strcmp (tok, months[i]))
208 /* If we got a month, it means the token before it is the
209 size, and the filename is three tokens away. */
214 /* Back up to the beginning of the previous token
215 and parse it with str_to_wgint. */
217 while (t > line && ISDIGIT (*t))
221 /* Something has gone wrong during parsing. */
226 size = str_to_wgint (t, NULL, 10);
227 if (size == WGINT_MAX && errno == ERANGE)
228 /* Out of range -- ignore the size. #### Should
229 we refuse to start the download. */
236 DEBUGP (("month: %s; ", months[month]));
239 else if (next == 4) /* days */
241 if (tok[1]) /* two-digit... */
242 day = 10 * (*tok - '0') + tok[1] - '0';
243 else /* ...or one-digit */
245 DEBUGP (("day: %d; ", day));
249 /* This ought to be either the time, or the year. Let's
252 If we have a number x, it's a year. If we have x:y,
253 it's hours and minutes. If we have x:y:z, z are
256 min = hour = sec = 0;
257 /* We must deal with digits. */
260 /* Suppose it's year. */
261 for (; ISDIGIT (*tok); tok++)
262 year = (*tok - '0') + 10 * year;
265 /* This means these were hours! */
269 /* Get the minutes... */
270 for (; ISDIGIT (*tok); tok++)
271 min = (*tok - '0') + 10 * min;
274 /* ...and the seconds. */
276 for (; ISDIGIT (*tok); tok++)
277 sec = (*tok - '0') + 10 * sec;
282 DEBUGP (("year: %d (no tm); ", year));
284 DEBUGP (("time: %02d:%02d:%02d (no yr); ", hour, min, sec));
286 else if (next == 2) /* The file name */
291 /* Since the file name may contain a SPC, it is possible
292 for strtok to handle it wrong. */
293 fnlen = strlen (tok);
294 if (fnlen < len - (tok - line))
296 /* So we have a SPC in the file name. Restore the
299 /* If the file is a symbolic link, it should have a
301 if (cur.type == FT_SYMLINK)
303 p = strstr (tok, " -> ");
309 cur.linkto = xstrdup (p + 4);
310 DEBUGP (("link to: %s\n", cur.linkto));
311 /* And separate it from the file name. */
315 /* If we have the filename, add it to the list of files or
317 /* "." and ".." are an exception! */
318 if (!strcmp (tok, ".") || !strcmp (tok, ".."))
320 DEBUGP (("\nIgnoring `.' and `..'; "));
324 /* Some FTP sites choose to have ls -F as their default
325 LIST output, which marks the symlinks with a trailing
326 `@', directory names with a trailing `/' and
327 executables with a trailing `*'. This is no problem
328 unless encountering a symbolic link ending with `@',
329 or an executable ending with `*' on a server without
330 default -F output. I believe these cases are very
332 fnlen = strlen (tok); /* re-calculate `fnlen' */
333 cur.name = xmalloc (fnlen + 1);
334 memcpy (cur.name, tok, fnlen + 1);
337 if (cur.type == FT_DIRECTORY && cur.name[fnlen - 1] == '/')
339 cur.name[fnlen - 1] = '\0';
340 DEBUGP (("trailing `/' on dir.\n"));
342 else if (cur.type == FT_SYMLINK && cur.name[fnlen - 1] == '@')
344 cur.name[fnlen - 1] = '\0';
345 DEBUGP (("trailing `@' on link.\n"));
347 else if (cur.type == FT_PLAINFILE
348 && (cur.perms & 0111)
349 && cur.name[fnlen - 1] == '*')
351 cur.name[fnlen - 1] = '\0';
352 DEBUGP (("trailing `*' on exec.\n"));
363 if (!cur.name || (cur.type == FT_SYMLINK && !cur.linkto))
370 DEBUGP (("Skipping.\n"));
371 xfree_null (cur.name);
372 xfree_null (cur.linkto);
379 l = dir = xnew (struct fileinfo);
380 memcpy (l, &cur, sizeof (cur));
381 l->prev = l->next = NULL;
386 l->next = xnew (struct fileinfo);
388 memcpy (l, &cur, sizeof (cur));
391 /* Get the current time. */
392 timenow = time (NULL);
393 tnow = localtime (&timenow);
394 /* Build the time-stamp (the idea by zaga@fly.cc.fer.hr). */
395 timestruct.tm_sec = sec;
396 timestruct.tm_min = min;
397 timestruct.tm_hour = hour;
398 timestruct.tm_mday = day;
399 timestruct.tm_mon = month;
402 /* Some listings will not specify the year if it is "obvious"
403 that the file was from the previous year. E.g. if today
404 is 97-01-12, and you see a file of Dec 15th, its year is
405 1996, not 1997. Thanks to Vladimir Volovich for
407 if (month > tnow->tm_mon)
408 timestruct.tm_year = tnow->tm_year - 1;
410 timestruct.tm_year = tnow->tm_year;
413 timestruct.tm_year = year;
414 if (timestruct.tm_year >= 1900)
415 timestruct.tm_year -= 1900;
416 timestruct.tm_wday = 0;
417 timestruct.tm_yday = 0;
418 timestruct.tm_isdst = -1;
419 l->tstamp = mktime (&timestruct); /* store the time-stamp */
428 static struct fileinfo *
429 ftp_parse_winnt_ls (const char *file)
433 int year, month, day; /* for time analysis */
435 struct tm timestruct;
437 char *line, *tok; /* tokenizer */
438 struct fileinfo *dir, *l, cur; /* list creation */
440 fp = fopen (file, "rb");
443 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
448 /* Line loop to end of file: */
449 while ((line = read_whole_line (fp)) != NULL)
451 len = clean_line (line);
453 /* Extracting name is a bit of black magic and we have to do it
454 before `strtok' inserts extra \0 characters in the line
455 string. For the moment let us just suppose that the name starts at
456 column 39 of the listing. This way we could also recognize
457 filenames that begin with a series of space characters (but who
458 really wants to use such filenames anyway?). */
459 if (len < 40) continue;
461 cur.name = xstrdup(tok);
462 DEBUGP(("Name: '%s'\n", cur.name));
464 /* First column: mm-dd-yy. Should atoi() on the month fail, January
466 tok = strtok(line, "-");
467 if (tok == NULL) continue;
468 month = atoi(tok) - 1;
469 if (month < 0) month = 0;
470 tok = strtok(NULL, "-");
471 if (tok == NULL) continue;
473 tok = strtok(NULL, " ");
474 if (tok == NULL) continue;
476 /* Assuming the epoch starting at 1.1.1970 */
477 if (year <= 70) year += 100;
479 /* Second column: hh:mm[AP]M, listing does not contain value for
481 tok = strtok(NULL, ":");
482 if (tok == NULL) continue;
484 tok = strtok(NULL, "M");
485 if (tok == NULL) continue;
487 /* Adjust hour from AM/PM. Just for the record, the sequence goes
488 11:00AM, 12:00PM, 01:00PM ... 11:00PM, 12:00AM, 01:00AM . */
490 if (hour == 12) hour = 0;
491 if (*tok == 'P') hour += 12;
493 DEBUGP(("YYYY/MM/DD HH:MM - %d/%02d/%02d %02d:%02d\n",
494 year+1900, month, day, hour, min));
496 /* Build the time-stamp (copy & paste from above) */
497 timestruct.tm_sec = 0;
498 timestruct.tm_min = min;
499 timestruct.tm_hour = hour;
500 timestruct.tm_mday = day;
501 timestruct.tm_mon = month;
502 timestruct.tm_year = year;
503 timestruct.tm_wday = 0;
504 timestruct.tm_yday = 0;
505 timestruct.tm_isdst = -1;
506 cur.tstamp = mktime (&timestruct); /* store the time-stamp */
508 DEBUGP(("Timestamp: %ld\n", cur.tstamp));
510 /* Third column: Either file length, or <DIR>. We also set the
511 permissions (guessed as 0644 for plain files and 0755 for
512 directories as the listing does not give us a clue) and filetype
514 tok = strtok(NULL, " ");
515 if (tok == NULL) continue;
516 while ((tok != NULL) && (*tok == '\0')) tok = strtok(NULL, " ");
517 if (tok == NULL) continue;
520 cur.type = FT_DIRECTORY;
523 DEBUGP(("Directory\n"));
528 cur.type = FT_PLAINFILE;
530 size = str_to_wgint (tok, NULL, 10);
531 if (size == WGINT_MAX && errno == ERANGE)
532 cur.size = 0; /* overflow */
536 DEBUGP(("File, size %s bytes\n", number_to_static_string (cur.size)));
541 /* And put everything into the linked list */
544 l = dir = xnew (struct fileinfo);
545 memcpy (l, &cur, sizeof (cur));
546 l->prev = l->next = NULL;
551 l->next = xnew (struct fileinfo);
553 memcpy (l, &cur, sizeof (cur));
564 /* Converts VMS symbolic permissions to number-style ones, e.g. string
565 RWED,RWE,RE to 755. "D" (delete) is taken to be equal to "W"
566 (write). Inspired by a patch of Stoyan Lekov <lekov@eda.bg>. */
568 vmsperms (const char *s)
575 case ',': perms <<= 3; break;
576 case 'R': perms |= 4; break;
577 case 'W': perms |= 2; break;
578 case 'D': perms |= 2; break;
579 case 'E': perms |= 1; break;
580 default: DEBUGP(("wrong VMS permissons!\n"));
588 static struct fileinfo *
589 ftp_parse_vms_ls (const char *file)
592 /* #### A third copy of more-or-less the same array ? */
593 static const char *months[] = {
594 "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
595 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
598 int year, month, day; /* for time analysis */
600 struct tm timestruct;
602 char *line, *tok; /* tokenizer */
603 struct fileinfo *dir, *l, cur; /* list creation */
605 fp = fopen (file, "rb");
608 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
613 /* Skip empty line. */
614 line = read_whole_line (fp);
617 /* Skip "Directory PUB$DEVICE[PUB]" */
618 line = read_whole_line (fp);
621 /* Skip empty line. */
622 line = read_whole_line (fp);
625 /* Line loop to end of file: */
626 while ((line = read_whole_line (fp)) != NULL)
629 i = clean_line (line);
636 /* First column: Name. A bit of black magic again. The name may be
637 either ABCD.EXT or ABCD.EXT;NUM and it might be on a separate
638 line. Therefore we will first try to get the complete name
639 until the first space character; if it fails, we assume that the name
640 occupies the whole line. After that we search for the version
641 separator ";", we remove it and check the extension of the file;
642 extension .DIR denotes directory. */
644 tok = strtok(line, " ");
645 if (tok == NULL) tok = line;
646 DEBUGP(("file name: '%s'\n", tok));
647 for (p = tok ; *p && *p != ';' ; p++)
649 if (*p == ';') *p = '\0';
650 p = tok + strlen(tok) - 4;
651 if (!strcmp(p, ".DIR")) *p = '\0';
652 cur.name = xstrdup(tok);
653 DEBUGP(("Name: '%s'\n", cur.name));
655 /* If the name ends on .DIR or .DIR;#, it's a directory. We also set
656 the file size to zero as the listing tells us only the size in
657 filesystem blocks - for an integrity check (when mirroring, for
658 example) we would need the size in bytes. */
662 cur.type = FT_DIRECTORY;
664 DEBUGP(("Directory\n"));
668 cur.type = FT_PLAINFILE;
674 /* The second column, if it exists, or the first column of the next line
675 contains the file size in blocks. We will skip it. */
677 tok = strtok(NULL, " ");
680 DEBUGP(("Getting additional line\n"));
682 line = read_whole_line (fp);
685 DEBUGP(("empty line read, leaving listing parser\n"));
688 i = clean_line (line);
691 DEBUGP(("confusing VMS listing item, leaving listing parser\n"));
695 tok = strtok(line, " ");
697 DEBUGP(("second token: '%s'\n", tok));
699 /* Third/Second column: Date DD-MMM-YYYY. */
701 tok = strtok(NULL, "-");
702 if (tok == NULL) continue;
703 DEBUGP(("day: '%s'\n",tok));
705 tok = strtok(NULL, "-");
708 /* If the server produces garbage like
709 'EA95_0PS.GZ;1 No privilege for attempted operation'
710 the first strtok(NULL, "-") will return everything until the end
711 of the line and only the next strtok() call will return NULL. */
712 DEBUGP(("nonsense in VMS listing, skipping this line\n"));
716 for (i=0; i<12; i++) if (!strcmp(tok,months[i])) break;
717 /* Unknown months are mapped to January */
719 tok = strtok (NULL, " ");
720 if (tok == NULL) continue;
721 year = atoi (tok) - 1900;
722 DEBUGP(("date parsed\n"));
724 /* Fourth/Third column: Time hh:mm[:ss] */
725 tok = strtok (NULL, " ");
726 if (tok == NULL) continue;
730 for (; *p && *p != ':'; ++p)
734 for (; *p && *p != ':'; ++p)
739 DEBUGP(("YYYY/MM/DD HH:MM:SS - %d/%02d/%02d %02d:%02d:%02d\n",
740 year+1900, month, day, hour, min, sec));
742 /* Build the time-stamp (copy & paste from above) */
743 timestruct.tm_sec = sec;
744 timestruct.tm_min = min;
745 timestruct.tm_hour = hour;
746 timestruct.tm_mday = day;
747 timestruct.tm_mon = month;
748 timestruct.tm_year = year;
749 timestruct.tm_wday = 0;
750 timestruct.tm_yday = 0;
751 timestruct.tm_isdst = -1;
752 cur.tstamp = mktime (&timestruct); /* store the time-stamp */
754 DEBUGP(("Timestamp: %ld\n", cur.tstamp));
756 /* Skip the fifth column */
758 tok = strtok(NULL, " ");
759 if (tok == NULL) continue;
761 /* Sixth column: Permissions */
763 tok = strtok(NULL, ","); /* Skip the VMS-specific SYSTEM permissions */
764 if (tok == NULL) continue;
765 tok = strtok(NULL, ")");
768 DEBUGP(("confusing VMS permissions, skipping line\n"));
772 /* Permissions have the format "RWED,RWED,RE" */
773 cur.perms = vmsperms(tok);
774 DEBUGP(("permissions: %s -> 0%o\n", tok, cur.perms));
778 /* And put everything into the linked list */
781 l = dir = xnew (struct fileinfo);
782 memcpy (l, &cur, sizeof (cur));
783 l->prev = l->next = NULL;
788 l->next = xnew (struct fileinfo);
790 memcpy (l, &cur, sizeof (cur));
802 /* This function switches between the correct parsing routine depending on
803 the SYSTEM_TYPE. The system type should be based on the result of the
804 "SYST" response of the FTP server. According to this response we will
805 use one of the three different listing parsers that cover most of the FTP
806 servers used nowadays. */
809 ftp_parse_ls (const char *file, const enum stype system_type)
814 return ftp_parse_unix_ls (file, 0);
817 /* Detect whether the listing is simulating the UNIX format */
820 fp = fopen (file, "rb");
823 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
828 /* If the first character of the file is '0'-'9', it's WINNT
830 if (c >= '0' && c <='9')
831 return ftp_parse_winnt_ls (file);
833 return ftp_parse_unix_ls (file, 1);
836 return ftp_parse_vms_ls (file);
838 return ftp_parse_unix_ls (file, 1);
840 logprintf (LOG_NOTQUIET, _("\
841 Unsupported listing type, trying Unix listing parser.\n"));
842 return ftp_parse_unix_ls (file, 0);
846 /* Stuff for creating FTP index. */
848 /* The function creates an HTML index containing references to given
849 directories and files on the appropriate host. The references are
852 ftp_index (const char *file, struct url *u, struct fileinfo *f)
856 char *htclfile; /* HTML-clean file name */
860 fp = fopen (file, "wb");
863 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
871 char *tmpu, *tmpp; /* temporary, clean user and passwd */
873 tmpu = url_escape (u->user);
874 tmpp = u->passwd ? url_escape (u->passwd) : NULL;
876 upwd = concat_strings (tmpu, ":", tmpp, "@", (char *) 0);
878 upwd = concat_strings (tmpu, "@", (char *) 0);
884 fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
885 fprintf (fp, "<html>\n<head>\n<title>");
886 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
887 fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
888 fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
889 fprintf (fp, "</h1>\n<hr>\n<pre>\n");
895 /* #### Should we translate the months? Or, even better, use
897 static const char *months[] = {
898 "Jan", "Feb", "Mar", "Apr", "May", "Jun",
899 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
901 struct tm *ptm = localtime ((time_t *)&f->tstamp);
903 fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
906 fprintf (fp, "%02d:%02d ", ptm->tm_hour, ptm->tm_min);
911 fprintf (fp, _("time unknown "));
915 fprintf (fp, _("File "));
918 fprintf (fp, _("Directory "));
921 fprintf (fp, _("Link "));
924 fprintf (fp, _("Not sure "));
927 htclfile = html_quote_string (f->name);
928 fprintf (fp, "<a href=\"ftp://%s%s:%d", upwd, u->host, u->port);
931 fprintf (fp, "%s", u->dir);
934 fprintf (fp, "%s", htclfile);
935 if (f->type == FT_DIRECTORY)
937 fprintf (fp, "\">%s", htclfile);
938 if (f->type == FT_DIRECTORY)
940 fprintf (fp, "</a> ");
941 if (f->type == FT_PLAINFILE)
942 fprintf (fp, _(" (%s bytes)"), number_to_static_string (f->size));
943 else if (f->type == FT_SYMLINK)
944 fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
949 fprintf (fp, "</pre>\n</body>\n</html>\n");