/* Dupmerge - Reclaim disk space by deleting redundant copies of files and * creating hardlinks in their place * Phil Karn, KA9Q * karn@ka9q.net * http://www.ka9q.net/ * New algorithm. I no longer do the duplicate unlinking and relinking inside the qsort comparison routine. * It was a cute idea, but probably not as good as just producing a list of files, sorting it by size, * and running through that. * Latest rewrite: May 2010 * The May 2010 version simplifies the algorithm (only one sort is performed) and fixes bugs in the handling of more * than two identical copies of a single file. This version should reliably identify and link all identical copies. * This program reads from standard input a list of files (such * as that generated by "find . -print") and discovers which files are * identical. Dupmerge unlinks one file of each identical pair and * recreates its path name as a link to the other. * * Non-plain files in the input (directories, pipes, devices, etc) * are ignored. Identical files must be on the same file system to be linked. * * Dupmerge prefers to keep the older of two identical files, as the older * timestamp is more likely to be the correct one given that many * copy utilities (e.g., 'cp') do not by default preserve modification * times. * * Command line arguments: * -0 Delimit file names with nulls rather than newlines; for use with 'find -print0' * -q Operate in quiet mode (otherwise, relinks are displayed on stdout) * -f Fast (very fast) mode that bypasses an exhaustive file comparison in favor of modification timestamps. * If the two files are the same size, have the same basename and exactly the same timestamp, they're probably the same. * This method seems to work well enough with rsync. * -t threshold_size * Apply the fast mode feature only to files larger than threshold_size bytes (default 100,000) * -n Dry run: simply list the actions that would be taken without actually unlinking or linking anything. Turns off -q. * -s By default, files are sorted in decreasing order of size so that the actual unlinking of duplicate files starts with * the largest files. This recovers disk space as quickly as possible, but if for some reason you want to start with the * smallest files, use this flag. * * Trivia: this program was inspired by a cheating scandal in the introductory computer science course CS-100 * at Cornell University the year after I graduated. The TAs simply sorted the projects by object code size and compared * those that were equal. That effectively found copies where only the variable names and comments had been changed. * * March 2009: the sort comparison function looks at the first page of each file when they're the same size. * This greatly reduces the number of comparisons done after the sort. * * Copyright Phil Karn, karn@ka9q.net. May be used under the terms of the GNU General Public License v 2.0 * * $Id: dupmerge.c,v 0.22 2010/05/09 12:51:02 karn Exp karn $ */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Darwin (OSX) has this, but Linux apparently doesn't */ #ifndef MAP_NOCACHE #define MAP_NOCACHE (0) #endif /* Linux has this and OSX apparently doesn't */ #ifndef MAP_POPULATE #define MAP_POPULATE (0) #endif #define HASHSIZE 20 #define PAGESIZE (4096) enum flag { NO=0,YES=1,UNKNOWN=-1 }; enum flag Fast_flag = NO; enum flag Zero_flag = NO; enum flag Quiet_flag = NO; enum flag Fast_threshold = 100000; enum flag No_do = NO; enum flag Small_first = NO; /* Statistics counts */ unsigned Regular_file = 0; unsigned FIFO = 0; unsigned Character_special = 0; unsigned Directory = 0; unsigned Block_special = 0; unsigned Symbolic_link = 0; unsigned Socket = 0; unsigned Whiteout = 0; unsigned Null_pathname = 0; unsigned Total_files = 0; unsigned Empty = 0; unsigned Stat_fail = 0; unsigned Not_accessible = 0; unsigned Files_deleted = 0; long long Blocks_reclaimed = 0; long long Full_hashes_computed = 0; long long Block_hashes_computed = 0; long long Full_hash_hits = 0; long long Block_hash_hits = 0; long long Partial_hit_full_fail = 0; long long Unlinks = 0; long long Unlink_failures = 0; long long Map_fails = 0; #define ENTRYCHUNK 5000 /* Allocation unit for file table */ /* File table entry */ struct entry { char *pathname; /* Path name, reserved space for null */ struct stat statbuf; /* file i-node data */ unsigned char partialhash[HASHSIZE]; unsigned char filehash[HASHSIZE]; int partialhash_present:1; int filehash_present:1; }; /* Comparison functions */ int comparison_equal(const void *ap,const void *bp); /* Does most of the work */ int comparison_sort(const void *ap,const void *bp); /* Version called by qsort() */ int main(int argc,char *argv[]){ int i,j; struct entry *entries = NULL; /* Dynamically allocated file table */ struct entry *ep; int entryarraysize = 0; /* Start with empty table, allocate on first pass */ int nfiles; /* Process command line args */ { char c; while((c = getopt(argc,argv,"snqf0t:")) != EOF){ switch(c){ default: fprintf(stderr,"Usage: %s [-s] [-n] [-q] [-f] [-0] [-t threshold_size]\n",argv[0]); break; case 's': Small_first = YES; break; case 'n': No_do = YES; break; case 'q': Quiet_flag = YES; break; case 'f': Fast_flag = YES; /* Just compare modification timestamps, not file contents */ break; case '0': Zero_flag = YES; /* Path names are delimited by nulls, e.g., from 'find . -print0' */ break; case 't': Fast_threshold = atoi(optarg); break; } } } if(No_do && Quiet_flag){ fprintf(stderr,"%s: -q flag forced off with -n set\n",argv[0]); Quiet_flag = NO; /* Force off */ } /* Start by reading stdin for list of path names * Check each one and ignore non-regular files, zero-length files, special files, errors, etc */ ep = NULL; for(nfiles=0;!feof(stdin) && !ferror(stdin);){ int ch; char pathname[PATH_MAX+1]; /* Expand file table if necessary and possible */ if(nfiles >= entryarraysize){ entries = (struct entry *)realloc(entries,(entryarraysize + ENTRYCHUNK) * sizeof(struct entry)); assert(entries != NULL); entryarraysize += ENTRYCHUNK; } ep = &entries[nfiles]; for(i=0;i< PATH_MAX;i++){ if(EOF == (ch = getc(stdin)) || '\0' == ch || (!Zero_flag && '\n' == ch)) break; pathname[i] = ch; } if(ch == EOF) break; pathname[i] = '\0'; Total_files++; /* Ignore null file names */ if(strlen(pathname) == 0){ Null_pathname++; continue; /* Reuse entry for next file */ } /* If we can't stat it, ignore it */ if(lstat(pathname,&entries[nfiles].statbuf) != 0){ Stat_fail++; continue; } #if DEBUG #ifdef __darwin__ { fprintf(stderr," nlink %d; uid %d; gid %d; atime %ld.%09ld; mtime %ld.%09ld; ctime %ld.%09ld; size %lld; gen %d %s\n", ep->statbuf.st_nlink,ep->statbuf.st_uid,ep->statbuf.st_gid, ep->statbuf.st_atimespec.tv_sec,ep->statbuf.st_atimespec.tv_nsec, ep->statbuf.st_mtimespec.tv_sec,ep->statbuf.st_mtimespec.tv_nsec, ep->statbuf.st_ctimespec.tv_sec,ep->statbuf.st_ctimespec.tv_nsec, (long long)ep->statbuf.st_size,ep->statbuf.st_gen,pathname); } #else { fprintf(stderr," nlink %d; uid %d; gid %d; atime %ld; mtime %ld; ctime %ld; size %lld %s\n", ep->statbuf.st_nlink,ep->statbuf.st_uid,ep->statbuf.st_gid, ep->statbuf.st_atime, ep->statbuf.st_mtime, ep->statbuf.st_ctime, (long long)ep->statbuf.st_size,pathname); } #endif #endif /* Ignore all but ordinary files */ switch(ep->statbuf.st_mode & S_IFMT){ case S_IFREG: Regular_file++; break; case S_IFIFO: FIFO++; break; case S_IFCHR: Character_special++; break; case S_IFDIR: Directory++; break; case S_IFBLK: Block_special++; break; case S_IFLNK: Symbolic_link++; break; case S_IFSOCK: Socket++; break; #ifdef S_IFWHT case S_IFWHT: Whiteout++; /* What's this? */ break; #endif } if((ep->statbuf.st_mode & S_IFMT) != S_IFREG) continue; /* Ignore empty files and files with no assigned data blocks (any data being stored in the inode). * Zero size files are often used as flags and locks we don't want to upset. And we won't recover * any data blocks from a file without any data blocks! * I should also exclude HFS files on OSX with resource forks */ if(ep->statbuf.st_blocks == 0 || ep->statbuf.st_size == 0){ Empty++; continue; } /* Ignore files we can't read */ if(access(pathname,R_OK) == -1){ Not_accessible++; continue; } /* Otherwise keep the filename and inode on our list */ ep->pathname = strdup(pathname); nfiles++; } if(!Quiet_flag){ fprintf(stderr,"%s: input files: total %u; ordinary %u",argv[0],Total_files,Regular_file); if(FIFO) fprintf(stderr,"; FIFO %u",FIFO); if(Character_special) fprintf(stderr,"; char special %u",Character_special); if(Directory) fprintf(stderr,"; directories %u",Directory); if(Block_special) fprintf(stderr,"; block specials %u",Block_special); if(Symbolic_link) fprintf(stderr,"; symbolic links %u",Symbolic_link); if(Socket) fprintf(stderr,"; sockets %u",Socket); #ifdef S_IFWHT if(Whiteout) fprintf(stderr,"; whiteouts %u",Whiteout); #endif if(Empty) fprintf(stderr,"; empties %u",Empty); putc('\n',stderr); if(Null_pathname) fprintf(stderr,"%s: null pathnames %u\n",argv[0],Null_pathname); if(Stat_fail) fprintf(stderr,"%s: stat failures %u\n",argv[0],Stat_fail); if(Not_accessible) fprintf(stderr,"%s: files not accessible %u\n",argv[0],Not_accessible); if(nfiles == 0){ fprintf(stderr,"%s: no files left to examine\n",argv[0]); exit(0); } } #if DEBUG for(i=0;i %s\n",argv[0],(long long)entries[j].statbuf.st_size,entries[j].pathname,entries[i].pathname); // fprintf(stderr,"debug: inodes %lld %lld\n",(long long)entries[j].statbuf.st_ino,(long long)entries[i].statbuf.st_ino); } if(entries[j].statbuf.st_nlink == 1){ /* Pathname has single remaining link, so its blocks will be recovered */ Blocks_reclaimed += entries[j].statbuf.st_blocks; } { /* Some last minute paranoid checks */ struct stat statbuf_a,statbuf_b; if(lstat(entries[i].pathname,&statbuf_a)){ fprintf(stderr,"%s: can't lstat(%s): %d %s\n",argv[0],entries[i].pathname,errno,strerror(errno)); abort(); } if(lstat(entries[j].pathname,&statbuf_b)){ fprintf(stderr,"%s: can't lstat(%s): %d %s\n",argv[0],entries[j].pathname,errno,strerror(errno)); abort(); } assert(statbuf_a.st_size == statbuf_b.st_size); assert(statbuf_a.st_ino != statbuf_b.st_ino); assert(statbuf_a.st_dev == statbuf_b.st_dev); assert(statbuf_a.st_mtime <= statbuf_b.st_mtime); } if(!No_do){ if(unlink(entries[j].pathname)) { Unlink_failures++; fprintf(stderr,"%s: can't unlink(%s): %d %s\n",argv[0],entries[j].pathname,errno,strerror(errno)); } else if(link(entries[i].pathname,entries[j].pathname)){ /* Should never fail */ fprintf(stderr,"%s: can't link(%s,%s): %d %s\n",argv[0],entries[i].pathname,entries[j].pathname,errno,strerror(errno)); abort(); } } /* Don't use this entry as a reference file later */ free(entries[j].pathname); entries[j].pathname = NULL; Unlinks++; } /* if same */ } /* End of inner for() loop */ } /* end of for loop looking ahead for match with file i */ if(!Quiet_flag){ if(No_do) fprintf(stderr,"%s: This was a dry run; no files were actually unlinked.\n",argv[0]); if(Unlinks) fprintf(stderr,"%s: Unlinks: %llu; Unlink failures: %llu; disk blocks reclaimed: %llu\n",argv[0],Unlinks,Unlink_failures,Blocks_reclaimed); fprintf(stderr,"%s: First page hashes: %llu; hits %llu\n",argv[0],Block_hashes_computed,Block_hash_hits); fprintf(stderr,"%s: Full file hashes: %llu; hits: %llu; full file hash mismatches: %llu; map fails: %llu\n",argv[0],Full_hashes_computed,Full_hash_hits,Partial_hit_full_fail,Map_fails); } /* Not really necessary since we're exiting */ free(entries); exit(0); } /* Compare files by size (used by second sort) * Return 0 means same size *and* on same device * Returning <0 causes the first argument to sort toward the top of the list * Returning >0 causes the second argument to sort toward the top of the list * We want the largest files to go to the top of the list, so "smaller is greater". * We also want empty entries to go to the end of the list, so they are always "greater" */ int comparison_sort(const void *ap,const void *bp){ struct entry *a,*b; a = (struct entry *)ap; b = (struct entry *)bp; assert(a != NULL); assert(b != NULL); if(a == b) return 0; /* Can this happen? */ /* Push all invalid entries to the bottom of the sort */ if(!b->pathname && !a->pathname) return 0; if(!b->pathname) return -1; if(!a->pathname) return 1; /* Distinguish first by size. * By default, bigger files sort first unless overridden with -s option */ if(b->statbuf.st_size != a->statbuf.st_size){ if(Small_first) return a->statbuf.st_size - b->statbuf.st_size; else return b->statbuf.st_size - a->statbuf.st_size; } /* Files are same size; distinguish if on different device; ordering is unimportant */ if(b->statbuf.st_dev != a->statbuf.st_dev) return b->statbuf.st_dev - a->statbuf.st_dev; /* Same size, same device; distinguish by modification time, older files first */ if(a->statbuf.st_mtime != b->statbuf.st_mtime) return a->statbuf.st_mtime - b->statbuf.st_mtime; /* Order equal, same time files with fewer links later so they'll be preferentially deleted sooner */ if(a->statbuf.st_nlink != b->statbuf.st_nlink) return b->statbuf.st_nlink - a->statbuf.st_nlink; /* Same size, same device, same modification time, same number of links; includes case of two links to same inode */ return 0; } void get_small_hash(struct entry *ep); void get_big_hash(struct entry *ep); int comparison_equal(const void *ap,const void *bp){ struct entry *a,*b; int i; a = (struct entry *)ap; b = (struct entry *)bp; if(a == b) return 0; /* Can this happen? */ assert(a != NULL); assert(b != NULL); if(a->statbuf.st_dev == b->statbuf.st_dev && a->statbuf.st_ino == b->statbuf.st_ino) return 0; /* Files are already hard-linked, so they're the same */ /* Make file size the most significant part of the comparison */ if(b->statbuf.st_size != a->statbuf.st_size) return b->statbuf.st_size - a->statbuf.st_size; /* Same-size files sort together only when they're on the same device */ if(b->statbuf.st_dev != a->statbuf.st_dev) return b->statbuf.st_dev - a->statbuf.st_dev; /* Optionally use the rsync heuristic -- if the files have the same size, mod timestamp and * the same base name, declare them the same without actually reading the contents * Do this only on files larger than Fast_threshold to further reduce chances of false equality */ if(Fast_flag && a->statbuf.st_size > Fast_threshold){ /* Rsync-style fast comparison */ char *bn1,*bn2; /* I could use the built-in basename() function, but what a mess it is */ if((bn1 = strrchr(a->pathname,'/')) == NULL) bn1 = a->pathname; if((bn2 = strrchr(b->pathname,'/')) == NULL) bn2 = b->pathname; /* Are the basenames and mod times identical? */ if(0 == strcmp(bn1,bn2) && a->statbuf.st_mtime == b->statbuf.st_mtime) return 0; } /* Next order of business: compare the partial file hashes */ if(!a->partialhash_present){ get_small_hash(a); } else Block_hash_hits++; if(!b->partialhash_present){ get_small_hash(b); } else Block_hash_hits++; i = memcmp(a->partialhash,b->partialhash,HASHSIZE); if(i != 0) /* They differ, no need to continue */ return i; /* Partial hashes are the same, compare full file hashes */ if(!a->filehash_present){ get_big_hash(a); } else Full_hash_hits++; if(!b->filehash_present){ get_big_hash(b); } else Full_hash_hits++; i = memcmp(a->filehash,b->filehash,HASHSIZE); if(i != 0){ Partial_hit_full_fail++; return i; } return 0; /* We've passed the gauntlet; the files are the same! */ } void get_small_hash(struct entry *ep){ if(!ep->filehash_present){ int fd,i,len; void *p; Block_hashes_computed++; if((fd = open(ep->pathname,O_RDONLY)) == -1){ fprintf(stderr,"can't open(%s): %d %s\n",ep->pathname,errno,strerror(errno)); abort(); } assert(fd != -1); len = ep->statbuf.st_size > PAGESIZE ? PAGESIZE : ep->statbuf.st_size; p = mmap(NULL, len, PROT_READ, MAP_NOCACHE|MAP_FILE|MAP_SHARED, fd, 0); assert(p != MAP_FAILED); /* No reason for it to fail */ SHA1(p,len,ep->partialhash); i = munmap(p,len); assert(i == 0); i = close(fd); assert(i == 0); ep->partialhash_present = 1; } } void get_big_hash(struct entry *ep){ if(!ep->filehash_present){ int fd,i; void *p; Full_hashes_computed++; if((fd = open(ep->pathname,O_RDONLY)) == -1){ fprintf(stderr,"can't open(%s): %d %s\n",ep->pathname,errno,strerror(errno)); abort(); } assert(fd != -1); p = mmap(NULL, ep->statbuf.st_size, PROT_READ, MAP_NOCACHE|MAP_FILE|MAP_SHARED|MAP_POPULATE, fd, 0); if(p != MAP_FAILED){ SHA1(p,ep->statbuf.st_size,ep->filehash); i = munmap(p,ep->statbuf.st_size); assert(i == 0); } else { /* Not enough address space to map entire file? */ unsigned long len; SHA_CTX context; char buffer[PAGESIZE]; Map_fails++; SHA1_Init(&context); while((len = read(fd,buffer,PAGESIZE)) > 0){ SHA1_Update(&context,buffer,len); } if(len < 0){ fprintf(stderr,"Read error on %s: %d %s\n",ep->pathname,errno,strerror(errno)); abort(); } SHA1_Final(ep->filehash,&context); } i = close(fd); assert(i == 0); ep->filehash_present = 1; } }