#!/usr/bin/perl # Copyright (c) 2000 Flavio Glock . All rights reserved. # This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # repetidos.pl # Finds duplicate directories and files. # Makes a "cache" if asked. # # USAGE: # See: repetidos.pl --help # # TODO: # pod # --outfile to store report # use compare_text for text files (?) # --load-index=file --verify to check file integrity # --base-dir=dir --relative to make non-absolute filenames # save file dates (?) # invert compare - show what is different # $VERSION = "0.013"; # # 0.013 # use File::Glob -- fixes "pp" under Windows # fixed "help" # # 0.012 # glob # # 0.011 # doesn't change files atime # don't show files if they are in duplicate dirs already, unless some of them are in other places # --verbose to see "read: ..." # # 0.010 # corrected dir splice error # if parent dirs match, doesn't show subdirs # # 0.009 # save and load index files: can be used to make a "cache" for CD-ROMs # use Digest::MD5 instead of File::Compare # much more processing required! # safer separator: \t # # 0.008 # Last version using File::Compare # always group compares # # 0.007 # prints filenames using "\" in Windows # # 0.006 # more uniform compare # doesn't miss compares # Directories can match even if filenames are different # # 0.005 # nicer output # Uses more pointers # # 0.004 # Corrected Subdir size # # 0.003 # Sorts numerically by directory size # # 0.002 # Subdir compare is exact # # 0.001 # Subdir compare is uncertain - will show "?" # File compare is exact # use Digest::MD5; use Getopt::Long; use File::Glob qw(:globally :nocase); GetOptions( 'save-index=s' => \$SAVE_INDEX, 'load-index=s' => \@LOAD_INDEX, 'verbose!' => \$VERBOSE, 'version' => \&print_version, 'help' => \&usage, ) || usage(); $dots = 20; $join = '/'; $separator = "\t"; $nFields = 3; @b = (); @subdir = (); %visited = (); foreach (@LOAD_INDEX) { foreach (glob $_) { print "Loading: $_\n\n" if $VERBOSE; open (FILE, "<$_"); $type = 0; foreach() { chomp($_); if ($_ eq "DIRECTORIES") { $type = 1; } elsif ($_ eq "FILES") { $type = 2; } elsif ($type == 1) { push @subdir, $_; ($a) = /\d$separator(.*?)$separator/; $visited{$a} = 1; } elsif ($type == 2) { push @b, $_; } } close (FILE); } } $a = shift; usage() unless $a or @b or @subdir; print "Reading:" if $VERBOSE; while ($a) { foreach (glob $a) { $b = $_; # $b =~ s|\\|\/|g; $b =~ s/([^:])\/$/$1/; if ($visited{$b}) { print "\n\t(", &print_filename($b), " done)" if $VERBOSE; } else { $size = &readmydir ($b); if (-d $b) { push @subdir, "$size$separator$b$separator$md5"; $visited{$b} = 1; } } } $a = shift; } undef %visited; print "\n\n" if $VERBOSE; print "Sorting: " if $VERBOSE; # sort files in reverse order of filesize @b = sort {$b <=> $a} @b; # sort dirs in reverse order of the sum of filesizes @subdir = sort {$b <=> $a} @subdir; if ($SAVE_INDEX) { print "Saving: $SAVE_INDEX\n" if $VERBOSE; open (FILE, ">$SAVE_INDEX"); print FILE "DIRECTORIES\n"; print FILE join("\n", @subdir); print FILE "\nFILES\n"; print FILE join("\n", @b); close (FILE); } foreach(0 .. $#b) { ($filesize[$_], $filename[$_], $filemd5[$_]) = split($separator, $b[$_], $nFields); # print "($filesize[$_], $filename[$_], $filemd5[$_])\n"; } undef @b; foreach(0 .. $#subdir) { ($dirsize[$_], $dirname[$_], $dirmd5[$_]) = split($separator, $subdir[$_], $nFields); } undef @subdir; print "done.\n\n" if $VERBOSE; print "Duplicate Subdirs:\n"; foreach (0 .. $#dirname) { $this = $_; $last = $_ + 1; $match = 0; while (($last <= $#dirname) and ($dirsize[$this] == $dirsize[$last])) { if ( &compare_dir ($this, $last) == 0 ) { $report_size = $dirsize[$this]; $report_size =~ s/(.*)\.(.*)/$1 bytes in $2 file/; $report_size .= 's' if $2 > 1; print "\n\t", &print_filename($dirname[$this]), " \t$report_size\n" unless $match; print "\t", &print_filename($dirname[$last]), "\n"; # remove subdirs from list for ($i = $#dirname; $i > $last; $i--) { if ((substr($dirname[$i], 0, length($dirname[$this])) eq $dirname[$this]) or (substr($dirname[$i], 0, length($dirname[$last])) eq $dirname[$last]) ) { # print " $dirname[$i] is subdir\n"; splice(@dirname, $i, 1); splice(@dirsize, $i, 1); splice(@dirmd5, $i, 1); } } splice(@dirname, $last, 1); splice(@dirsize, $last, 1); splice(@dirmd5, $last, 1); $match = 1; } else { $last++; } } } print "\n"; print "Duplicate Files:\n"; foreach(0 .. $#filename) { $last = $_ + 1; $match = 0; $unmarked = 0; $report = ''; nextfile: while (($last < $#filename) and ($filesize[$_] == $filesize[$last])) { if ( &compare_file ($_, $last) == 0 ) { $unmarked = 1 if not $files_marked[$_]; $unmarked = 1 if not $files_marked[$last]; $report .= "\n\t" . &print_filename($filename[$_]) . " \t$filesize[$_] bytes\n" unless $match; $report .= "\t" . &print_filename($filename[$last]) . "\n"; splice(@filename, $last, 1); splice(@filesize, $last, 1); splice(@filemd5, $last, 1); splice(@files_marked, $last, 1); $match = 1; } else { $last++; } } print $report if $unmarked; } sub readmydir { # globals: @subdir, @b my ($a) = @_; my ($size, $atime, $mtime); my ($md5, $count); if (-d $a) { # is-subdir my @dir; my ($tmpsize, $tmpmd5); $size = 0; if ($visited{$a}) { print "\n\t(", &print_filename($a), " done)" if $VERBOSE; } else { print "\n\tread:\t", &print_filename($a), " \t" if $VERBOSE; my (@file_stat) = stat($a); if (@file_stat) { # $size = $file_stat[7]; $atime = $file_stat[8]; $mtime = $file_stat[9]; opendir(DIR,$a); @dir = readdir(DIR); closedir DIR; $count = 0; foreach(@dir) { if (($_ ne ".") and ($_ ne "..")) { $item = "$a$join$_"; ($tmpsize, $tmpmd5) = readmydir ($item); $size += $tmpsize; $count ++; # tmpmd5 ?? } } push @subdir, "$size.$count$separator$a$join$separator$md5"; $visited{$a} = 1; utime $atime, $mtime, $a; return $size; } else { return -1; } } } else { # is-file my (@file_stat) = stat($a); if (@file_stat) { $size = $file_stat[7]; $atime = $file_stat[8]; $mtime = $file_stat[9]; open(FILE,$a); binmode(FILE); # $digest = md5_base64(*FILE); $md5 = Digest::MD5->new->addfile(*FILE)->b64digest; # print "\n $md5 $a "; close(FILE); utime $atime, $mtime, $a; # print "FILE: $a = $md5\n"; if ($k++ > $dots) { print "." if $VERBOSE; $k = 0; } push @b, "$size$separator$a$separator$md5"; return ($size, $md5); } else { return (-1,""); } } } sub compare_file { # globals: @filesize, @filename my ($this, $other) = @_; return 1 if ($filesize[$this] < 1) or ($filesize[$other] < 1); return 0 if ($filemd5[$this] eq $filemd5[$other]); return 1; } sub compare_file_zero { # globals: @filesize, @filename my ($this, $other) = @_; return 0 if ($filemd5[$this] eq $filemd5[$other]); return 1; } sub compare_dir { # globals: @dirname, @dirsize, @filename my ($this, $other) = @_; my (@files_this, @files_other, $tmp, $tmp_this, $tmp_other); my (@files_matched); return -2 if ($this == $other); return 1 if ($dirsize[$this] != $dirsize[$other]); return -3 if ($dirsize[$this] < 1); return -4 if ($dirname[$this] eq $dirname[$other]); return -5 if ( substr($dirname[$other], 0, length($dirname[$this])) eq $dirname[$this]) or ( substr($dirname[$this], 0, length($dirname[$other])) eq $dirname[$other]); # get list of files in subdirs $tmp_this = $dirname[$this]; $tmp_other = $dirname[$other]; @files_this = (); @files_other = (); foreach(0 .. $#filename) { $tmp = substr($filename[$_], 0, length($tmp_this)); if ($tmp eq $tmp_this) { push @files_this, $_; } $tmp = substr($filename[$_], 0, length($tmp_other)); if ($tmp eq $tmp_other) { push @files_other, $_; } } # compare files CMPFILE: foreach $this (0 .. $#files_this) { foreach $other (0 .. $#files_other) { if ( &compare_file_zero ($files_this[$this], $files_other[$other]) == 0 ) { # match push @files_matched, $files_this[$this]; push @files_matched, $files_other[$other]; splice(@files_other, $other, 1); next CMPFILE; } } # no match return 1; } # equal foreach (@files_matched) { # print " * $_ "; $files_marked[$_] = 1; } # report return 0; } sub print_filename { if ($^O =~ /win32/i) { my $a = $_[0]; $a =~ tr|\/|\\|; return $a; } return $_[0]; } sub print_version { print "repetidos.pl version $VERSION\n"; exit 0; } sub usage { print <