import sys, os, cPickle, hashlib, pprint
from datetime import datetime
from collections import defaultdict 

class FileInfo(object):
    """Index files by size and by a tail fingerprint to find likely duplicates.

    ``sizes`` maps a file size in bytes to the list of paths having that size.
    ``fingerprints`` maps a fingerprint (see ``get_fingerprint``) to the list
    of paths that produced it.

    The fingerprint only covers the last ``fingerprint_bytes`` bytes of a
    file, so two files in the same group are *candidates*, not proven
    duplicates.  The cache is neither loaded nor saved automatically; call
    ``load_cache`` / ``save_cache`` explicitly.
    """

    cache_file = "c:\\temp\\finddupes.cache"
    # Number of bytes, counted from the end of the file, that get hashed.
    fingerprint_bytes = 10240

    def __init__(self):
        # Per-instance indexes: as class attributes these dicts would be
        # shared (and mutated) by every FileInfo instance.
        self.sizes = defaultdict(list)
        self.fingerprints = defaultdict(list)

    def load_cache(self):
        """Replace ``sizes`` and ``fingerprints`` with the pickled cache.

        Any failure is reported on stdout and leaves both indexes untouched.
        """
        sys.stderr.write("Loading Cache\n")
        try:
            with open(self.cache_file, "rb") as cache:
                # SECURITY NOTE: unpickling can execute arbitrary code.  Only
                # point cache_file at a file this program wrote itself.
                sizes = cPickle.load(cache)
                fingerprints = cPickle.load(cache)
        except Exception as e:
            print("Couldn't read cache file:" + str(e))
            return
        # Assign only after both loads succeeded so a truncated cache cannot
        # leave the two indexes out of sync.
        self.sizes = sizes
        self.fingerprints = fingerprints

    def save_cache(self):
        """Pickle ``sizes`` then ``fingerprints`` into ``cache_file``.

        Any failure is reported on stdout instead of being raised.
        """
        sys.stderr.write("Saving Cache\n")
        try:
            with open(self.cache_file, "wb") as cache:
                cPickle.dump(self.sizes, cache)
                cPickle.dump(self.fingerprints, cache)
        except Exception as e:
            print("Couldn't write to cache file:" + str(e))

    def get_file_info(self, top):
        """Walk ``top`` bottom-up and record every file's path under its size.

        Files whose size cannot be determined are reported on stdout and
        skipped.  A running count is printed every 1000 files as progress.
        """
        count = 0
        for root, _dirs, files in os.walk(top, topdown=False):
            for name in files:
                if count % 1000 == 0:
                    print(count)
                # Fall back to the bare name in the error message if even
                # building the path fails.
                path = name
                try:
                    path = os.path.join(root, name)
                    size = os.path.getsize(path)
                except Exception as e:
                    # Deliberately broad: the scan is best-effort and one bad
                    # entry must not abort the whole walk.
                    print("Unable to get size of %s %s" % (path, e))
                else:
                    self.sizes[size].append(path)
                count += 1

    def get_same_sizes(self):
        """Yield ``(size, paths)`` for every size shared by more than one file."""
        for size, paths in self.sizes.items():
            if len(paths) > 1:
                yield size, paths

    def get_fingerprint(self, filename):
        """Return the SHA-256 hex digest of the tail of ``filename``.

        The last ``fingerprint_bytes`` bytes are hashed; a shorter file is
        hashed in full.  If the file cannot be opened or read, a message is
        printed and ``0`` is returned.
        """
        try:
            with open(filename, "rb") as f:
                f.seek(0, os.SEEK_END)
                # Clamp at 0: seeking to a negative offset fails for files
                # shorter than the fingerprint window.
                start = max(0, f.tell() - self.fingerprint_bytes)
                f.seek(start)
                tail = f.read(self.fingerprint_bytes)
        except (IOError, OSError) as e:
            print("%s : unable to read (%s)" % (filename, e))
            return 0
        return hashlib.sha256(tail).hexdigest()

    def get_dupes(self):
        """Yield ``(size, paths)`` for files sharing both size and fingerprint.

        Every successfully fingerprinted file is also recorded in
        ``self.fingerprints``.  A running count of processed size groups is
        printed every 100 groups as progress.
        """
        for count, (size, paths) in enumerate(self.get_same_sizes()):
            if count % 100 == 0:
                print(count)
            by_fingerprint = defaultdict(list)
            for path in paths:
                fp = self.get_fingerprint(path)
                if not fp:
                    # Failure sentinel: unreadable files must never be grouped
                    # together as if they were duplicates of each other.
                    continue
                by_fingerprint[fp].append(path)
                self.fingerprints[fp].append(path)
            for group in by_fingerprint.values():
                if len(group) > 1:
                    yield size, group

    def get_dupes_from_cache(self):
        """Return the path lists in ``fingerprints`` that hold more than one file."""
        return [paths for paths in self.fingerprints.values() if len(paths) > 1]

def run(top, output_path="c:\\temp\\dupes.txt", min_size=500):
    """Find duplicate files under ``top``, delete the extras, write a report.

    For every group of candidate duplicates reported by
    ``FileInfo.get_dupes`` whose size is at least ``min_size`` bytes, the
    first path is kept and each other path is deleted, but only after a full
    byte-for-byte comparison against the kept file confirms they are
    identical.  The complete candidate list (including groups below
    ``min_size`` and files that were not deleted) is pretty-printed to
    ``output_path``.

    Args:
        top: Directory to scan.
        output_path: File that receives the pretty-printed candidate list.
        min_size: Groups of files smaller than this many bytes are left alone.
    """
    # Local import so this function does not depend on the module's import
    # block being changed.
    import filecmp

    fi = FileInfo()
    print("%s Gathering sizes" % datetime.now())
    fi.get_file_info(top)
    print("%s Finding dupes" % datetime.now())
    dupes = sorted(fi.get_dupes())
    for size, group in dupes:
        if size < min_size:
            continue
        keep = group[0]
        # Guards against a path that is listed more than once in a group:
        # it must never be deleted as a "duplicate" of itself.
        handled = set([keep])
        for path in group[1:]:
            if path in handled:
                continue
            handled.add(path)
            # Matching size and fingerprint only make the files candidates;
            # confirm the full content before destroying anything.
            try:
                identical = filecmp.cmp(keep, path, shallow=False)
            except (IOError, OSError) as e:
                print("Unable to compare %s %s" % (path, e))
                continue
            if not identical:
                print("Skipping %s (content differs from %s)" % (path, keep))
                continue
            print("Deleting %s" % path)
            try:
                os.unlink(path)
            except OSError as e:
                # Keep going so one locked file does not abort the run before
                # the report is written.
                print("Unable to delete %s %s" % (path, e))
    print("%s Writing output" % datetime.now())
    with open(output_path, "w") as out:
        out.write(pprint.PrettyPrinter(indent=2).pformat(dupes))
    print("%s Done" % datetime.now())

if __name__ == "__main__":
    # The directory to scan is required; fail with a usage message instead of
    # an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <directory>\n" % sys.argv[0])
        sys.exit(2)
    run(sys.argv[1])