/* Dupmerge - Reclaim disk space by deleting redundant copies of files and * creating hardlinks in their place * Phil Karn, KA9Q * karn@ka9q.net * http://www.ka9q.net/ * Complete rewrite December 2008/January 2009 * New algorithm. I no longer do the duplicate unlinking and relinking inside the qsort comparison routine. * It was a cute idea, but probably not as good as just producing a list of files, sorting it by size, * and running through that. * * This program reads from standard input a list of files (such * as that generated by "find . -print") and discovers which files are * identical. Dupmerge unlinks one file of each identical pair and * recreates its path name as a link to the other. * * Non-plain files in the input (directories, pipes, devices, etc) * are ignored. Identical files must be on the same file system to be linked. * * Dupmerge prefers to keep the older of two identical files, as the older * timestamp is more likely to be the correct one given that many * copy utilities (e.g., 'cp') do not by default preserve modification * times. * * Command line arguments: * -0 Delimit file names with nulls rather than newlines; for use with 'find -print0' * -q Operate in quiet mode (otherwise, relinks are displayed on stdout) * -d 'expression' delete (rather than relink) copies whose names match the regular expression 'expression'. * This is intended for FAT16/32 file systems that don't support hard links. * WARNING: the -d option is not well thought out or tested. I will probably change it * or remove it entirely. Use at your own risk. * -f Fast (very fast) mode that bypasses an exhaustive file comparison in favor of modification timestamps. * If the two files are the same size, have the same basename and exactly the same timestamp, they're probably the same. * This method seems to work well enough with rsync. * -t threshold_size * Apply the fast mode feature only to files larger than threshold_size bytes (default 100,000) * * Trivia: this program was inspired by a cheating scandal in the introductory computer science course CS-100 * at Cornell University the year after I graduated. The TAs simply sorted the projects by object code size and compared * those that were equal. That effectively found copies where only the variable names and comments had been changed. * * Copyright Phil Karn, karn@ka9q.net. May be used under the terms of the GNU General Public License v 2.0 */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include enum flag { NO=0,YES=1,UNKNOWN=-1 }; enum flag Fast_flag = NO; enum flag Zero_flag = NO; enum flag Quiet_flag = NO; enum flag Fast_threshold = 100000; long long Files_deleted = 0; long long Blocks_reclaimed = 0; int Delete_flag = NO; regex_t Delete_exp; #define ENTRYCHUNK 5000 /* Allocation unit for file table */ /* File table entry */ struct entry { char *pathname; /* Path name */ struct stat statbuf; /* file i-node data */ }; /* Comparison function called by qsort() */ int comparison(const void *ap,const void *bp); int main(int argc,char *argv[]){ int first; struct entry *entries = NULL; /* Dynamically allocated file table */ int entryarraysize = 0; /* Start with empty table, allocate on first pass */ int nfiles; /* Process command line args */ { char c; int errcode; char errbuf[150]; while((c = getopt(argc,argv,"f0t:d:q")) != EOF){ switch(c){ case 'q': Quiet_flag = YES; break; case 'f': Fast_flag = YES; /* Just compare modification timestamps, not file contents */ break; case '0': Zero_flag = YES; /* Path names are delimited by nulls, e.g., from 'find . -print0' */ break; case 't': Fast_threshold = atoi(optarg); break; case 'd': Delete_flag = YES; if((errcode = regcomp(&Delete_exp,optarg,REG_BASIC|REG_NOSUB)) != 0){ regerror(errcode,&Delete_exp,errbuf,sizeof(errbuf)); fprintf(stderr,"%s: -d %s didn't compile - %s\n",argv[0],optarg,errbuf); exit(0); } break; } } } /* Start by reading stdin for list of path names * Check each one and ignore non-regular files, zero-length files, special files, errors, etc */ for(nfiles=0;!feof(stdin) && !ferror(stdin);){ char filename[PATH_MAX]; if(Zero_flag){ /* File names are delimited by binary nulls */ int i; for(i=0;i= entryarraysize){ struct entry *ne; ne = (struct entry *)realloc(entries,(entryarraysize + ENTRYCHUNK) * sizeof(struct entry)); if(ne != NULL){ entries = ne; entryarraysize += ENTRYCHUNK; } else { /* Storage allocation failed; this should be pretty rare! */ fprintf(stderr,"%s: Pathname table full, ignoring remainder\n",argv[0]); break; } } /* Ignore null file names */ if(strlen(filename) == 0) continue; /* If we can't stat it, ignore it */ if(lstat(filename,&entries[nfiles].statbuf) != 0) continue; #if 0 { struct stat *s; s = &entries[nfiles].statbuf; fprintf(stderr," nlink %d; uid %d; gid %d; atime %ld.%09ld; mtime %ld.%09ld; ctime %ld.%09ld; size %ld; gen %d\n", s->st_nlink,s->st_uid,s->st_gid, s->st_atimespec.tv_sec,s->st_atimespec.tv_nsec, s->st_mtimespec.tv_sec,s->st_mtimespec.tv_nsec, s->st_ctimespec.tv_sec,s->st_ctimespec.tv_nsec, s->st_size,s->st_gen); } #endif /* Ignore all but ordinary files */ if((entries[nfiles].statbuf.st_mode & S_IFMT) != S_IFREG) continue; /* Ignore empty files and files with no assigned data blocks (any data being stored in the inode). * Zero size files are often used as flags and locks we don't want to upset. And we won't recover * any data blocks from a file without any data blocks! * I should also exclude HFS files on OSX with resource forks */ if(entries[nfiles].statbuf.st_blocks == 0 || entries[nfiles].statbuf.st_size == 0) continue; /* Otherwise keep the filename and inode on our list */ entries[nfiles].pathname = strdup(filename); nfiles++; } if(!Quiet_flag) fprintf(stderr,"%s: analyzing %d files\n",argv[0],nfiles); /* Sort list of pathnames */ qsort(entries,nfiles,sizeof(struct entry),comparison); if(!Quiet_flag) fprintf(stderr,"%s: sort done\n",argv[0]); /* Walk through list looking for duplicates */ for(first=0; first < nfiles-1; first++){ int second; enum flag different; if(entries[first].pathname == NULL) continue; /* -d flag specified and a duplicate file has been deleted */ /* Starting with the next entry, scan forward as long as the sizes match */ for(second=first+1; entries[first].statbuf.st_size == entries[second].statbuf.st_size; second++){ int filesize; int k; if(entries[second].pathname == NULL) continue; /* -d flag specified and a duplicate file has been deleted */ /* Two files are same length */ filesize = entries[first].statbuf.st_size; if(entries[first].statbuf.st_dev != entries[second].statbuf.st_dev) continue; /* Different file systems, can't link */ if(entries[first].statbuf.st_ino == entries[second].statbuf.st_ino) continue; /* They're already linked! */ different = UNKNOWN; /* Optionally use the rsync heuristic -- if the files have the same size, mod timestamp and * the same base name, declare them the same without actually reading the contents * Do this only on files larger than Fast_threshold to further reduce chances of false equality */ if(Fast_flag && filesize > Fast_threshold){ /* Rsync-style fast comparison */ char *bn1,*bn2; /* Are the basenames same? I could use the built-in basename() function, but what a mess it is */ if((bn1 = strrchr(entries[first].pathname,'/')) == NULL) bn1 = entries[first].pathname; if((bn2 = strrchr(entries[second].pathname,'/')) == NULL) bn2 = entries[second].pathname; if(strcmp(bn1,bn2) == 0) different = NO; } if(different == UNKNOWN){ /* Need to keep testing? */ /* Compare file contents */ FILE *file_a; if(file_a = fopen(entries[first].pathname,"r")){ FILE *file_b; if(file_b = fopen(entries[second].pathname,"r")){ /* First try to compare with memory mapping */ void *file_a_p; if((file_a_p = mmap(NULL, filesize, PROT_READ, MAP_FILE|MAP_SHARED, fileno(file_a), 0)) != (void *)-1){ void *file_b_p; if((file_b_p = mmap(NULL, filesize, PROT_READ, MAP_FILE|MAP_SHARED, fileno(file_b), 0)) != (void *)-1){ different = (memcmp(file_a_p,file_b_p,filesize) == 0) ? NO : YES; k = munmap(file_b_p,filesize); assert(!k); } k = munmap(file_a_p,filesize); assert(!k); } if(different == UNKNOWN){ /* Couldn't do memory compare, so try stream compare instead */ for(k=0;k entries[second].statbuf.st_nlink){ deleted_entry = second; saved_entry = first; } else { deleted_entry = first; saved_entry = second; } } else if(entries[first].statbuf.st_mtime < entries[second].statbuf.st_mtime) { /* Keep the older version, as its mod timestamp is more likely meaningful */ deleted_entry = second; saved_entry = first; } else { deleted_entry = first; saved_entry = second; } } /* Account for reclaimed blocks if we're unlinking a file with link count 1 */ if(entries[deleted_entry].statbuf.st_nlink == 1) Blocks_reclaimed += entries[deleted_entry].statbuf.st_blocks; { /* Some last minute paranoid checks */ struct stat statbuf_a,statbuf_b; assert(deleted_entry >= 0 && saved_entry >= 0); assert(!lstat(entries[deleted_entry].pathname,&statbuf_a)); assert(!lstat(entries[saved_entry].pathname,&statbuf_b)); assert(filesize == statbuf_a.st_size); assert(filesize == statbuf_b.st_size); assert(statbuf_a.st_ino != statbuf_b.st_ino); assert(statbuf_a.st_dev == statbuf_b.st_dev); } if(Delete_flag){ if(!Quiet_flag) fprintf(stderr,"%d keeping %s, deleting %s\n",filesize,entries[saved_entry].pathname,entries[deleted_entry].pathname); unlink(entries[deleted_entry].pathname); free(entries[deleted_entry].pathname); entries[deleted_entry].pathname = NULL; bzero(&entries[deleted_entry].statbuf,sizeof(struct stat)); /* Exterminate stale data */ } else { if(!Quiet_flag) fprintf(stderr,"%d link %s -> %s\n",filesize,entries[saved_entry].pathname,entries[deleted_entry].pathname); if(unlink(entries[deleted_entry].pathname)) { perror("unlink"); /* Abort instead? */ } else if(link(entries[saved_entry].pathname,entries[deleted_entry].pathname)){ perror("link"); /* Abort instead? */ free(entries[deleted_entry].pathname); entries[deleted_entry].pathname = NULL; } else { entries[deleted_entry].statbuf = entries[saved_entry].statbuf; } } } } /* Done with this pair of files */ } /* Done with all files on list */ #if 0 for(first=0;first 0){ free(entries[--nfiles].pathname); } free(entries); regfree(&Delete_exp); } /* This is the comparison function called by qsort */ int comparison(const void *ap,const void *bp){ struct entry *a = (struct entry *)ap; struct entry *b = (struct entry *)bp; assert(a != NULL && b != NULL); assert(a->pathname != NULL); assert(b->pathname != NULL); return b->statbuf.st_size - a->statbuf.st_size; /* This puts the biggest files at the top, so we'll gain back disk space ASAP */ }