קוד מהלייב: בדיקת קבצים כפולים

תגיות: ,

import collections
import hashlib
import os
import sys
from pathlib import Path
from typing import DefaultDict, Dict, Iterator, List


# Maps a content hash (hex digest string) to every file path found with that
# hash. A defaultdict is used so a new hash starts with an empty list.
DuplicatesDict = DefaultDict[str, List[Path]]


def get_file_paths(path: Path) -> Iterator[Path]:
    """Lazily yield the path of every file found beneath ``path``.

    The tree is traversed with ``os.walk``; only entries reported as files
    are yielded, each joined to the directory it was found in.
    """
    for current_dir, _subdirs, filenames in os.walk(path):
        # Build the directory Path once and reuse it for each file inside it.
        directory = Path(current_dir)
        for filename in filenames:
            yield directory / filename


def get_file_hash(path: Path) -> str:
    """Return the SHA-256 hex digest of the content of the file at ``path``.

    The file is read in fixed-size chunks and fed to the hash incrementally,
    so memory use stays bounded regardless of the file's size.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    chunk_size = 1024 * 1024  # 1 MiB per read
    digest = hashlib.sha256()
    with path.open('rb') as stream:
        # iter() with a sentinel keeps reading until read() returns b'' (EOF).
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def get_duplicate_files(path: Path) -> DuplicatesDict:
    """Group every file found under ``path`` by the hash of its content.

    Returns a mapping from hash to the list of paths sharing that hash.
    Hashes that belong to a single file are included as well; filtering
    them out is left to the caller.
    """
    files_by_hash: DuplicatesDict = collections.defaultdict(list)
    for candidate in get_file_paths(path):
        files_by_hash[get_file_hash(candidate)].append(candidate)
    return files_by_hash


def get_only_hashes_with_duplications(
        hashes: DuplicatesDict,
) -> Dict[str, List[Path]]:
    """Keep only the entries of ``hashes`` that list two or more paths.

    The result is a new plain dict; the path lists themselves are the same
    list objects held by ``hashes`` (they are not copied).
    """
    repeated: Dict[str, List[Path]] = {}
    for digest, paths in hashes.items():
        if len(paths) > 1:
            repeated[digest] = paths
    return repeated


def print_duplicates(duplicates: Dict[str, List[Path]]) -> None:
    """Print each group of duplicate files to standard output.

    Every path of a group goes on its own line, and each group is followed
    by a line of 40 dashes.
    """
    separator = '-' * 40
    for group in duplicates.values():
        report_lines = [str(entry) for entry in group]
        report_lines.append(separator)
        print('\n'.join(report_lines))


def main(path: str):
    """Find groups of files with identical content under ``path``.

    ``path`` is resolved and made absolute before scanning. Returns a dict
    mapping each content hash shared by at least two files to the list of
    those files' paths.
    """
    scan_root = Path(path).resolve().absolute()
    return get_only_hashes_with_duplications(get_duplicate_files(scan_root))


if __name__ == '__main__':
    # Exactly one command-line argument is expected: the directory to scan.
    if len(sys.argv) != 2:
        # sys.exit() with a string writes it to stderr and exits with
        # status 1, so wrong usage is reported as a failure to the caller
        # instead of printing to stdout and exiting successfully.
        sys.exit(f"Usage: python {__file__} <path>")
    print_duplicates(main(sys.argv[1]))
3 לייקים