# Files
# BDFR_Web/tests/test_hash_persistence.py
#
# 321 lines
# 14 KiB
# Python

#!/usr/bin/env python3
"""
Test script to verify hash persistence functionality.
"""
import json
import tempfile
import shutil
from pathlib import Path
from unittest.mock import Mock
# Import the necessary modules
import sys
sys.path.insert(0, '/Users/Daniel/Documents/GitHub/bulk-downloader-for-reddit')
from bdfr.configuration import Configuration
from bdfr.downloader import RedditDownloader
def test_hash_persistence():
    """Exercise save/load round-trips of the on-disk hash list.

    Covers: loading from an empty directory, saving/loading an empty
    list, persisting a real hash entry, restoring it in a fresh
    downloader instance, and graceful handling of a corrupted JSON
    hash file.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Minimal CLI-argument stand-in; only the attributes the
        # downloader's hash logic reads are populated.
        mock_args = Mock()
        mock_args.no_dupes = True
        mock_args.simple_check = False
        mock_args.search_existing = False
        mock_args.skip_subreddit = []
        mock_args.ignore_user = []
        mock_args.min_score = None
        mock_args.max_score = None
        mock_args.min_score_ratio = None
        mock_args.max_score_ratio = None
        mock_args.disable_module = []
        mock_args.make_hard_links = False
        mock_args.max_wait_time = 30

        def make_downloader():
            """Build a bare RedditDownloader (bypassing __init__) wired to temp_path."""
            d = RedditDownloader.__new__(RedditDownloader)
            d.args = mock_args
            d.download_directory = temp_path
            d.master_hash_list = {}
            d.excluded_submission_ids = set()
            d.file_name_formatter = Mock()
            d.download_filter = Mock()
            d.download_filter.check_url.return_value = True
            d.download_filter.check_resource.return_value = True
            d.authenticator = Mock()
            return d

        downloader = make_downloader()

        # Test 1: a directory with no hash file yields an empty list.
        print("Test 1: Loading from empty directory")
        hash_list = downloader._load_hash_list()
        assert len(hash_list) == 0, f"Expected empty hash list, got {len(hash_list)}"
        print("PASS Passed")

        # Test 2: saving an empty list still creates the hash file.
        print("Test 2: Saving empty hash list")
        downloader._save_hash_list()
        hash_file = temp_path / '.bdfr_hashes.json'
        assert hash_file.exists(), "Hash file should be created even when empty"
        print("PASS Passed")

        # Test 3: round-trip of the empty file.
        print("Test 3: Loading saved empty hash list")
        hash_list = downloader._load_hash_list()
        assert len(hash_list) == 0, f"Expected empty hash list, got {len(hash_list)}"
        print("PASS")

        # Test 4: persist one real hash entry and inspect the raw JSON.
        print("Test 4: Adding test data and saving")
        test_file = temp_path / 'test.txt'
        test_file.write_text("test content")
        downloader.master_hash_list['test_hash_123'] = test_file
        downloader._save_hash_list()
        with open(hash_file, 'r') as f:
            saved_data = json.load(f)
        assert 'test_hash_123' in saved_data, "Test hash should be in saved data"
        # Paths are stored relative to the download directory.
        assert saved_data['test_hash_123'] == 'test.txt', f"Expected 'test.txt', got {saved_data['test_hash_123']}"
        print("PASS Passed")

        # Test 5: a fresh downloader restores the saved entry and
        # resolves it back to the original absolute path.
        print("Test 5: Loading hash list with saved data")
        new_downloader = make_downloader()
        loaded_hash_list = new_downloader._load_hash_list()
        assert len(loaded_hash_list) == 1, f"Expected 1 hash, got {len(loaded_hash_list)}"
        assert 'test_hash_123' in loaded_hash_list, "Test hash should be loaded"
        assert loaded_hash_list['test_hash_123'] == test_file, f"File path should match: {loaded_hash_list['test_hash_123']} != {test_file}"
        print("PASS Passed")

        # Test 6: a corrupted (non-JSON) hash file must not crash the loader.
        print("Test 6: Testing corrupted hash file handling")
        with open(hash_file, 'w') as f:
            f.write("invalid json content")
        corrupted_downloader = make_downloader()
        # Should handle corrupted file gracefully and return empty dict.
        corrupted_hash_list = corrupted_downloader._load_hash_list()
        assert len(corrupted_hash_list) == 0, f"Expected empty hash list for corrupted file, got {len(corrupted_hash_list)}"
        print("PASS Passed")

        print("\nAll tests passed! Hash persistence functionality is working correctly.")
def test_simple_check_functionality():
    """Test the simple-check functionality with URL-based duplicate detection.

    Verifies the enhanced (v2.0) hash-file format: a 'files' section, a
    'urls' section mapping source URLs to hashes, and a 'metadata'
    section; then confirms URL mappings round-trip through save/load.
    """
    print("\n=== Testing Simple-Check Functionality ===")
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Minimal CLI-argument stand-in with simple_check enabled.
        mock_args = Mock()
        mock_args.no_dupes = True
        mock_args.simple_check = True
        mock_args.search_existing = False
        mock_args.skip_subreddit = []
        mock_args.ignore_user = []
        mock_args.min_score = None
        mock_args.max_score = None
        mock_args.min_score_ratio = None
        mock_args.max_score_ratio = None
        mock_args.disable_module = []
        mock_args.make_hard_links = False
        mock_args.max_wait_time = 30

        def make_downloader():
            """Build a bare RedditDownloader (bypassing __init__) with a URL list."""
            d = RedditDownloader.__new__(RedditDownloader)
            d.args = mock_args
            d.download_directory = temp_path
            d.master_hash_list = {}
            d.url_list = {}
            d.excluded_submission_ids = set()
            d.file_name_formatter = Mock()
            d.download_filter = Mock()
            d.download_filter.check_url.return_value = True
            d.download_filter.check_resource.return_value = True
            d.authenticator = Mock()
            return d

        downloader = make_downloader()

        # Test 1: empty directory yields empty hash list AND empty URL list.
        print("Test 1: Loading from empty directory with simple_check")
        hash_list = downloader._load_hash_list()
        assert len(hash_list) == 0, f"Expected empty hash list, got {len(hash_list)}"
        assert len(downloader.url_list) == 0, f"Expected empty URL list, got {len(downloader.url_list)}"
        print("PASS")

        # Test 2: saving with simple_check writes the enhanced v2.0 format.
        print("Test 2: Adding test data and saving with simple_check format")
        test_file = temp_path / 'test.txt'
        test_file.write_text("test content")
        test_url = "https://example.com/test.txt"
        test_hash = "test_hash_123"
        downloader.master_hash_list[test_hash] = test_file
        downloader.url_list[test_url] = test_hash
        downloader._save_hash_list()
        with open(temp_path / '.bdfr_hashes.json', 'r') as f:
            saved_data = json.load(f)
        assert 'files' in saved_data, "Enhanced format should have 'files' section"
        assert 'urls' in saved_data, "Enhanced format should have 'urls' section"
        assert 'metadata' in saved_data, "Enhanced format should have 'metadata' section"
        assert test_hash in saved_data['files'], "Test hash should be in files section"
        assert test_url in saved_data['urls'], "Test URL should be in urls section"
        assert saved_data['metadata']['version'] == '2.0', "Version should be 2.0"
        assert saved_data['metadata']['created_with'] == 'simple_check', "Should be created with simple_check"
        print("PASS")

        # Test 3: a fresh downloader restores both the hash and URL mappings.
        print("Test 3: Loading hash list with URL mappings")
        new_downloader = make_downloader()
        loaded_hash_list = new_downloader._load_hash_list()
        assert len(loaded_hash_list) == 1, f"Expected 1 hash, got {len(loaded_hash_list)}"
        assert len(new_downloader.url_list) == 1, f"Expected 1 URL, got {len(new_downloader.url_list)}"
        assert test_hash in loaded_hash_list, "Test hash should be loaded"
        assert test_url in new_downloader.url_list, "Test URL should be loaded"
        assert new_downloader.url_list[test_url] == test_hash, "URL should map to correct hash"
        print("PASS")

        # Test 4: URL-based duplicate detection plumbing.
        print("Test 4: Testing URL-based duplicate detection")
        # Mock resource whose URL matches the stored URL.
        mock_resource = Mock()
        mock_resource.url = test_url
        mock_resource.hash.hexdigest.return_value = test_hash
        # A destination file that already exists on disk.
        mock_destination = temp_path / 'existing_file.txt'
        mock_destination.parent.mkdir(parents=True, exist_ok=True)
        mock_destination.write_text("existing content")
        # Mock the formatter to hand back our destination/resource pair.
        downloader.file_name_formatter.format_resource_paths.return_value = [(mock_destination, mock_resource)]
        mock_submission = Mock()
        mock_submission.id = "test123"
        # We can't easily exercise the full _download_submission without
        # more mocking, but we can verify the URL list is wired up for
        # duplicate detection.
        assert test_url in downloader.url_list, "URL should be in url_list for duplicate detection"
        assert downloader.url_list[test_url] == test_hash, "URL should map to correct hash"
        print("PASS")

        print("\nAll simple-check tests passed! URL-based duplicate detection is working correctly.")
def test_backward_compatibility():
    """Test that old hash files still work with new implementation.

    Writes a legacy flat {hash: relative_path} file by hand, confirms
    the loader accepts it (with an empty URL list), then saves and
    checks the file was upgraded to the v2.0 sectioned format.
    """
    print("\n=== Testing Backward Compatibility ===")
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Create an old-format hash file manually, with real files on
        # disk at the relative paths it references.
        (temp_path / 'relative' / 'path').mkdir(parents=True, exist_ok=True)
        (temp_path / 'relative' / 'path' / 'file1.txt').write_text("content1")
        (temp_path / 'relative' / 'path' / 'file2.txt').write_text("content2")
        old_hash_data = {
            "hash1": "relative/path/file1.txt",
            "hash2": "relative/path/file2.txt"
        }
        hash_file = temp_path / '.bdfr_hashes.json'
        with open(hash_file, 'w') as f:
            json.dump(old_hash_data, f)

        # Create downloader and load the old format.
        mock_args = Mock()
        mock_args.no_dupes = True
        mock_args.simple_check = True  # enable simple_check to test the format upgrade
        downloader = RedditDownloader.__new__(RedditDownloader)
        downloader.args = mock_args
        downloader.download_directory = temp_path
        downloader.master_hash_list = {}
        downloader.url_list = {}
        loaded_hashes = downloader._load_hash_list()
        assert len(loaded_hashes) == 2, f"Expected 2 hashes from old format, got {len(loaded_hashes)}"
        assert "hash1" in loaded_hashes, "hash1 should be loaded from old format"
        assert "hash2" in loaded_hashes, "hash2 should be loaded from old format"
        # Legacy files carry no URL mappings.
        assert len(downloader.url_list) == 0, "URL list should be empty for old format"
        print("PASS - Old format loaded correctly")

        # Saving after a load should emit the new sectioned format.
        (temp_path / 'another').mkdir(parents=True, exist_ok=True)
        test_file = temp_path / 'another' / 'new_file.txt'
        test_file.write_text("new content")
        downloader.master_hash_list["new_hash"] = test_file
        downloader._save_hash_list()
        with open(hash_file, 'r') as f:
            new_data = json.load(f)
        assert 'files' in new_data, "New format should have 'files' section"
        assert 'urls' in new_data, "New format should have 'urls' section"
        assert 'metadata' in new_data, "New format should have 'metadata' section"
        assert new_data['metadata']['version'] == '2.0', "Should be version 2.0"
        print("PASS - Old format upgraded to new format correctly")

        print("\nBackward compatibility tests passed!")
if __name__ == "__main__":
    # Run all three test suites in sequence when executed as a script.
    test_hash_persistence()
    test_simple_check_functionality()
    test_backward_compatibility()