from torch.utils.data import Dataset
import os
from random import shuffle
import numpy as np
import torch
import scipy.sparse


class BooleanNGramDataset(Dataset):
    """Dataset of sparse feature matrices stored as SciPy ``.npz`` files.

    The dataset is described by a CSV index with one sample per line in the
    form ``<filepath>,<integer label>``. Each ``filepath`` must be loadable
    with ``scipy.sparse.load_npz``. Files are only read when a sample is
    requested, so construction touches nothing but the index itself.
    """

    def __init__(self, csv_filepath: str):
        """Read the CSV index and shuffle the sample order.

        Args:
            csv_filepath: Path to the index file. Blank lines are skipped.
                The label is taken from the text after the *last* comma, so
                file paths may themselves contain commas.

        Raises:
            ValueError: If a non-blank line is not ``<filepath>,<int label>``.
        """
        super().__init__()
        self.all_files = []
        with open(csv_filepath, "r") as f:
            for line_no, raw_line in enumerate(f, start=1):
                entry = raw_line.strip()
                if not entry:
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                try:
                    # Split once from the right: only the final field is the
                    # label, everything before it is the path.
                    filepath, label = entry.rsplit(",", 1)
                    sample = (filepath, int(label))
                except ValueError as err:
                    raise ValueError(
                        f"{csv_filepath}:{line_no}: expected "
                        f"'<filepath>,<int label>', got {entry!r}"
                    ) from err
                self.all_files.append(sample)
        # The order is randomized once here, using the global `random` state.
        shuffle(self.all_files)

    def __len__(self):
        """Return the number of samples listed in the index."""
        return len(self.all_files)

    def __getitem__(self, index):
        """Load one sample from disk.

        Args:
            index: Position of the sample in the (shuffled) index.

        Returns:
            A tuple ``(x, y)`` where ``x`` is the densified matrix as a
            ``torch.float`` tensor with all size-1 dimensions squeezed out,
            and ``y`` is a 0-dim tensor holding the integer label.
        """
        to_load, y = self.all_files[index]
        # Samples are stored sparse on disk and densified per item.
        dense_matrix = scipy.sparse.load_npz(to_load).toarray()
        x = torch.tensor(dense_matrix, dtype=torch.float)
        # squeeze() drops every size-1 dimension (e.g. (1, N) -> (N,)).
        x = x.squeeze()
        return x, torch.tensor(y)