#!/usr/bin/python # # Combine identical files. Ie, when two identical files are detected, # delete one and replace it with a hardlink to the other. # # If owners and permissions are not the same, prompt whether to continue and # which set of ownership/permissions should apply to the final, shared file. # Note: ACLs and other fancy inode mumbo-jumbo not supported at this time. # # # Sample run: # # # mkdir a; cd a; echo x | tee x1 > x2; echo y | tee y1 y2 > y3 # # echo manual | tee m1 > m2; ln m2 m3; chmod -w m1 # # combine-duplicates . # 7 bytes: # 0) inode 781808, mode 644, owner root, group root, modified Mon Feb 22 22:40:25 2010 # ./m2 # ./m3 # 1) inode 781809, mode 444, owner root, group root, modified Mon Feb 22 22:40:25 2010 # ./m1 # Select the menu # of the inode to keep, or "s" to skip: 0 # `./m1' => `./m2' # `./y3' => `./y1' # `./y2' => `./y1' # `./x2' => `./x1' # 8 files, 2 file sizes, 1 inode pairs, 1 inode groups # 5 hashings, 2 unique hashes, 3 pairwise cmps to verify hashes # 2 auto-merges, 1 manual prompts, 1 manual merges # 13 bytes recovered in 4 files import subprocess import sys import time stat_files = 0 stat_file_sizes = 0 stat_inode_pairs = 0 stat_inode_groups = 0 stat_hashings = 0 stat_unique_hashes = 0 stat_hashed_cmps = 0 stat_auto_merges = 0 stat_manual_merge_candidates = 0 stat_manual_merges_performed = 0 stat_clobbers = 0 stat_space_reclaimed = 0 def merge(inodes): """Merge inodes with ln -f. Keep inodes[0], clobber inodes[1:].""" global stat_clobbers, stat_space_reclaimed for inode in inodes[1:]: for file in inode: subprocess.Popen(['ln', '-f', '-v', inodes[0][0][6], file[6]]).wait() stat_clobbers += 1 stat_space_reclaimed += int(inode[0][0]) def mergable_inodes(inodes): """Consider merging a group of inodes. The inodes argument is a list of inodes. Each 'inode' is a list of seven-component file info things.""" global stat_auto_merges, stat_manual_merge_candidates, stat_manual_merges_performed # Check the easy case -- Do they all have the same owner, group, and perms? all_the_same = True for inode in inodes[1:]: if inode[0][2:5] != inodes[0][0][2:5]: all_the_same = False if all_the_same: # Auto-merge stat_auto_merges += 1 merge(inodes) else: stat_manual_merge_candidates += 1 print "\n%s bytes:" % inodes[0][0][0] for i in range(0,len(inodes)): print "%d) inode %s, mode %s, owner %s, group %s, modified %s " % ( i, inodes[i][0][1], inodes[i][0][2], inodes[i][0][3], inodes[i][0][4], time.ctime(float(inodes[i][0][5]))) for file in inodes[i]: print " %s" % file[6] while True: selection = raw_input('Select the menu # of the inode to keep, or "s" to skip: ') if selection == 's': return try: int_selection = int(selection) except: print '"%s" is not a number.' % selection continue if int_selection < 0 or int_selection >= len(inodes): print 'Please make a selection between 0 and %d' % (len(inodes) - 1) continue break inodes.insert(0, inodes.pop(int_selection)) stat_manual_merges_performed += 1 merge(inodes) def process(batch): """Process a batch of files with the same size. Calls mergable_inodes.""" global stat_file_sizes, stat_inode_pairs, stat_inode_groups, stat_hashings, stat_unique_hashes, stat_hashed_cmps stat_file_sizes += 1 if len(batch) <= 1: return # Can't combine 1 file with anything. # Group by inode (some of these may already be shared) inodes = {} for file in batch: if file[1] not in inodes: inodes[file[1]] = [] inodes[file[1]].append(file) if len(inodes) <= 1: return # Can't combine 1 inode with anything. inodes = inodes.values() inodes.sort(key=lambda x: int(x[0][5])) # by modification date # Compare two inodes with cmp if len(inodes) == 2: stat_inode_pairs += 1 cmp = subprocess.Popen(['cmp', '-s', inodes[0][0][6], inodes[1][0][6]]) if cmp.wait() == 0: mergable_inodes(inodes) return # There are more than two indodes with this size. Group by hash. # TODO: Hash the first 1MB first. Hashing three+ very large files that # differ widely and early just because they're the same size is wasteful. stat_inode_groups += 1 stat_hashings += len(inodes) hashes = {} for inode in inodes: hash = subprocess.Popen(['sha1sum', inode[0][6]], stdout=subprocess.PIPE).communicate()[0].split(' ', 1)[0] if hash not in hashes: hashes[hash] = [] hashes[hash].append(inode) stat_unique_hashes += len(hashes) # It is extremely likely that files with the same hash are indentical, # but check anyway. for hash_group in hashes.values(): if len(hash_group) <= 1: continue # Can't combine 1 inode with anything. for inode in hash_group[1:]: stat_hashed_cmps += 1 cmp = subprocess.Popen(['cmp', '-s', hash_group[0][0][6], inode[0][6]]) if cmp.wait() != 0: # Whoa! We observed a sha1 hash collision. This is more noteworthy # than just mundanely hardlink-merging files. Shout and abort. print "OBSERVED SHA1 HASH COLLISION!\n\"%s\"\nand \"%s\"\ndiffer but have the same sha1 hash!" % (hash_group[0][0][6], inode[0][6]) sys.exit(2) # Ok, all inodes in hash_group are verified to be identical. mergable_inodes(hash_group) if len(sys.argv) < 2: print "Usage: combine-duplicates path" sys.exit(1) find = subprocess.Popen( args=['find', sys.argv[1], '-type', 'f', '-size', '+0', '-printf', '%s %i %m %u %g %Ts %p\\n'], stdout=subprocess.PIPE) sort = subprocess.Popen( args=['sort', '-k1nr', '-k6n'], stdin=find.stdout, stdout=subprocess.PIPE) current_size = None files_with_this_size = [] try: # Pass batches of files to process() for line in sort.stdout: stat_files += 1 fields = line.rstrip('\n').split(' ', 6) if current_size and fields[0] != current_size: process(files_with_this_size) files_with_this_size = [] files_with_this_size.append(fields) current_size = fields[0] process(files_with_this_size) find.wait() sort.wait() finally: print '%d files, %d file sizes, %d inode pairs, %d inode groups' % ( stat_files, stat_file_sizes, stat_inode_pairs, stat_inode_groups) print '%d hashings, %d unique hashes, %d pairwise cmps to verify hashes' % ( stat_hashings, stat_unique_hashes, stat_hashed_cmps) print '%d auto-merges, %d manual prompts, %d manual merges' % ( stat_auto_merges, stat_manual_merge_candidates, stat_manual_merges_performed) print '%d bytes' % (stat_space_reclaimed), if stat_space_reclaimed > 2**30: print "(%f GB)" % (stat_space_reclaimed / 2.0**30), elif stat_space_reclaimed > 2**20: print "(%f MB)" % (stat_space_reclaimed / 2.0**20), elif stat_space_reclaimed > 2**10: print "(%f kB)" % (stat_space_reclaimed / 2.0**10), print 'recovered in %d files' % (stat_clobbers)