import collections
import hashlib
import os
import sys
from pathlib import Path
from typing import DefaultDict, Dict, Iterator, List
# Maps a content hash (hex digest string) to the list of file paths that
# produced that hash.
DuplicatesDict = DefaultDict[str, List[Path]]
def get_file_paths(path: Path) -> Iterator[Path]:
    """Yield the path of every file found beneath *path*, recursively.

    Only the file entries reported by ``os.walk`` are yielded; the
    directories themselves are not.
    """
    for directory, _subdirs, filenames in os.walk(path):
        parent = Path(directory)
        yield from (parent / name for name in filenames)
def get_file_hash(path: Path, chunk_size: int = 1024 * 1024) -> str:
    """Return the SHA-256 hex digest of the file at *path*.

    The file is read in blocks of *chunk_size* bytes and fed to the hash
    incrementally, so a large file never has to be held in memory all at
    once. The resulting digest is the same as hashing the whole content
    in one call.

    Args:
        path: File whose content is hashed.
        chunk_size: Number of bytes read per step; must be positive.

    Raises:
        ValueError: If *chunk_size* is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) returns b'' immediately, which would silently produce the
        # digest of an empty file instead of the real content.
        raise ValueError("chunk_size must be a positive integer")
    digest = hashlib.sha256()
    with path.open('rb') as stream:
        # iter() with a sentinel keeps calling read() until it returns b''.
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
def get_duplicate_files(path: Path) -> DuplicatesDict:
    """Group every file beneath *path* by the hash of its content.

    Returns a mapping from content hash to the list of files that share
    it. Hashes that belong to a single file are included as well.
    """
    by_digest: DuplicatesDict = collections.defaultdict(list)
    for candidate in get_file_paths(path):
        by_digest[get_file_hash(candidate)].append(candidate)
    return by_digest
def get_only_hashes_with_duplications(
    hashes: DuplicatesDict,
) -> Dict[str, List[Path]]:
    """Return only the entries of *hashes* that list two or more files.

    The result is a new plain dict; the lists themselves are reused from
    *hashes*, not copied.
    """
    repeated: Dict[str, List[Path]] = {}
    for digest, paths in hashes.items():
        if len(paths) < 2:
            # A hash seen once has no duplicate.
            continue
        repeated[digest] = paths
    return repeated
def print_duplicates(duplicates: Dict[str, List[Path]]) -> None:
    """Print each group of files, one path per line.

    Every group is followed by a line of 40 dashes.
    """
    separator = '-' * 40
    for group in duplicates.values():
        for member in group:
            print(member)
        print(separator)
def main(path: str):
    """Find groups of files with identical content beneath *path*.

    *path* is resolved to an absolute path before scanning. Returns a
    dict mapping each content hash shared by two or more files to the
    list of those files.
    """
    root = Path(path)
    root = root.resolve()
    root = root.absolute()
    return get_only_hashes_with_duplications(get_duplicate_files(root))
if __name__ == '__main__':
    # Exactly one command-line argument is expected: the directory to scan.
    if len(sys.argv) != 2:
        print(f"Usage: python {__file__} <path>")
    else:
        print_duplicates(main(sys.argv[1]))
# 3 likes