from torch.utils.data import Dataset
import os
from random import shuffle
import numpy as np
import torch
import scipy.sparse


class NGramDataset(Dataset):
    """Map-style dataset of ``.npz`` feature files listed in a CSV index.

    Every non-blank line of the index has the form ``filepath,label``, where
    ``label`` is an integer. Each referenced archive is read lazily in
    ``__getitem__`` and must store its array under the key ``"arr_0"``
    (presumably written by ``np.savez`` with a positional array -- confirm
    against the code that produces the files).
    """

    def __init__(self, csv_filepath: str):
        """Parse the index file and shuffle the resulting sample list.

        Args:
            csv_filepath: Path to a text file with one ``filepath,label``
                entry per line. Blank lines are ignored.

        Raises:
            ValueError: If a non-blank line has no comma or its label is not
                an integer.
        """
        super().__init__()
        self.all_files = []
        with open(csv_filepath, "r") as f:
            for lineno, raw_line in enumerate(f, start=1):
                entry = raw_line.strip()
                if not entry:
                    # Tolerate empty lines, e.g. a trailing newline at EOF.
                    continue
                # Split at the LAST comma so a path containing commas is
                # still parsed correctly.
                filepath, sep, label = entry.rpartition(",")
                if not sep:
                    raise ValueError(
                        f"{csv_filepath}:{lineno}: expected 'filepath,label', "
                        f"got {entry!r}"
                    )
                try:
                    label_id = int(label)
                except ValueError as err:
                    raise ValueError(
                        f"{csv_filepath}:{lineno}: label {label!r} "
                        "is not an integer"
                    ) from err
                self.all_files.append((filepath, label_id))
        # Sample order is randomized once, at construction time.
        shuffle(self.all_files)

    def __len__(self) -> int:
        """Return the number of samples listed in the index."""
        return len(self.all_files)

    def __getitem__(self, index: int) -> "tuple[torch.Tensor, torch.Tensor]":
        """Load one sample from disk.

        Args:
            index: Position in the (shuffled) sample list.

        Returns:
            A pair ``(x, y)``: ``x`` is the stored array as a float tensor
            with all size-1 dimensions squeezed out, ``y`` is the integer
            label wrapped in a tensor.
        """
        to_load, y = self.all_files[index]
        # ``np.load`` on an .npz returns a lazy archive that holds an open
        # file handle; the context manager closes it once the array is read.
        with np.load(to_load) as archive:
            matrix = archive["arr_0"]
        x = torch.tensor(matrix, dtype=torch.float).squeeze()
        return x, torch.tensor(y)