#!/usr/bin/env python3
# Copyright (c) 2008-11 Qtrac Ltd. All rights reserved.
# This program or module is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version. It is provided for educational
# purposes and is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.

import locale
locale.setlocale(locale.LC_ALL, "")

import collections
import hashlib
import itertools
import optparse
import os
import queue
import sys
import threading
import Util


class Worker(threading.Thread):
    """Thread that hashes groups of files and reports duplicates.

    Each worker repeatedly takes a ``(size, filenames)`` item from
    ``work_queue``, obtains the MD5 digest of every file in the group
    (reusing digests already stored in ``md5_from_filename``), and puts
    one formatted message on ``results_queue`` for every set of two or
    more files that share a digest.
    """

    # Class-level lock held around every read and write of the
    # md5_from_filename dictionary.
    Md5_lock = threading.Lock()

    # Number of bytes read per call while hashing. Reading in blocks
    # keeps memory use bounded regardless of file size.
    Block_size = 1024 * 1024

    def __init__(self, work_queue, md5_from_filename, results_queue,
                 number):
        """Store the queues, the digest cache and the message prefix.

        work_queue        -- queue yielding (size, filenames) items
        md5_from_filename -- dict mapping filename to MD5 digest
        results_queue     -- queue that receives the report strings
        number            -- string prepended to every report string
        """
        super().__init__()
        self.work_queue = work_queue
        self.md5_from_filename = md5_from_filename
        self.results_queue = results_queue
        self.number = number


    def run(self):
        """Process work items forever; the loop never returns."""
        while True:
            # Fetch outside the try block so that task_done() is only
            # ever called for an item actually taken off the queue.
            item = self.work_queue.get()
            try:
                size, names = item
                self.process(size, names)
            finally:
                self.work_queue.task_done()


    def process(self, size, filenames):
        """Report which of the given files have identical contents.

        size      -- the byte count shown in the report string
        filenames -- iterable of paths to compare with each other

        Files that cannot be opened or read (EnvironmentError) are
        skipped. Every group of two or more files with the same MD5
        digest produces one string on the results queue.
        """
        md5s = collections.defaultdict(set)
        for filename in filenames:
            with Worker.Md5_lock:
                md5 = self.md5_from_filename.get(filename, None)
            if md5 is None:
                try:
                    md5 = self._hash_file(filename)
                except EnvironmentError:
                    continue
                with Worker.Md5_lock:
                    self.md5_from_filename[filename] = md5
            md5s[md5].add(filename)
        for duplicates in md5s.values():
            if len(duplicates) == 1:
                continue
            self.results_queue.put("{0}Duplicate files ({1:n} bytes):"
                                   "\n\t{2}".format(self.number, size,
                                   "\n\t".join(sorted(duplicates))))


    def _hash_file(self, filename):
        """Return the MD5 digest of the file, read Block_size bytes at
        a time; raises EnvironmentError if it cannot be opened or read.
        """
        hasher = hashlib.md5()
        with open(filename, "rb") as fh:
            # iter() with a sentinel stops at the empty bytes object
            # that read() returns at end of file.
            for block in iter(lambda: fh.read(self.Block_size), b""):
                hasher.update(block)
        return hasher.digest()


def main():
    """Find duplicate files below the path given on the command line.

    Files are grouped by (size, base name); zero-length files and files
    whose size cannot be read are ignored. Every group holding more
    than one file is queued for the worker threads, whose reports are
    printed by a separate results thread. Returns once both queues
    have been fully processed.
    """
    opts, path = parse_options()
    verbose = opts.verbose

    if verbose:
        print("Creating file list...")
    files_by_key = collections.defaultdict(list)
    for root, _dirs, files in os.walk(path):
        for basename in files:
            fullname = os.path.join(root, basename)
            try:
                size = os.path.getsize(fullname)
            except EnvironmentError:
                continue
            if size != 0:
                files_by_key[size, basename].append(fullname)

    thread_total = opts.count
    if verbose:
        print("Creating {0} thread{1}...".format(
              thread_total, Util.s(thread_total)))
    work_queue = queue.PriorityQueue()
    results_queue = queue.Queue()
    md5_from_filename = {}
    for index in range(1, thread_total + 1):
        # The numeric prefix only appears in reports in debug mode.
        number = "{0}: ".format(index) if opts.debug else ""
        worker = Worker(work_queue, md5_from_filename, results_queue,
                        number)
        worker.daemon = True
        worker.start()

    results_thread = threading.Thread(target=print_results,
                                      args=(results_queue,))
    results_thread.daemon = True
    results_thread.start()

    for key in sorted(files_by_key):
        candidates = files_by_key[key]
        if len(candidates) > 1:
            work_queue.put((key[0], candidates))
    # The threads are daemons that loop forever, so completion is
    # detected by waiting for both queues to drain.
    work_queue.join()
    results_queue.join()


def print_results(results_queue):
    """Print each non-empty item taken from results_queue, forever.

    Every item fetched is acknowledged with task_done(), whether or
    not it was printed, so that results_queue.join() can return. The
    loop never ends; the function is meant to run in a daemon thread.
    """
    take = results_queue.get
    acknowledge = results_queue.task_done
    while True:
        try:
            text = take()
            if not text:
                continue
            print(text)
        finally:
            acknowledge()


def parse_options():
    """Parse the command line and return the pair (opts, path).

    opts carries the attributes count (int, 1..20, default 7),
    verbose and debug (both bool, default False). path is the first
    positional argument, or "." when none is given. A thread count
    outside 1..20 is reported through the parser's error() method.
    """
    usage = ("usage: %prog [options] [path]\n"
             "outputs a list of duplicate files in path "
             "using the MD5 algorithm\n"
             "ignores zero-length files\n"
             "path defaults to .")
    threads_help = ("the number of threads to use (1..20) "
                    "[default %default]")

    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-t", "--threads", dest="count", type="int",
                      default=7, help=threads_help)
    for short_flag, long_flag, dest in (("-v", "--verbose", "verbose"),
                                        ("-d", "--debug", "debug")):
        parser.add_option(short_flag, long_flag, dest=dest,
                          action="store_true", default=False)

    opts, args = parser.parse_args()
    if opts.count < 1 or opts.count > 20:
        parser.error("thread count must be 1..20")
    path = args[0] if args else "."
    return opts, path


# Run the program only when this file is executed as a script, so that
# importing the module does not parse sys.argv or start any threads.
if __name__ == "__main__":
    main()