import os
from random import shuffle

import numpy as np
import scipy.sparse
import torch
from torch.utils.data import Dataset


class BooleanNGramDataset(Dataset):
    """Dataset of sparse ``.npz`` feature matrices listed in a CSV index file.

    Each non-blank line of the index file has the form ``<filepath>,<label>``,
    where ``<filepath>`` points to a matrix saved with
    ``scipy.sparse.save_npz`` and ``<label>`` is an integer. The list of
    samples is shuffled once, in place, when the dataset is constructed.
    """

    def __init__(self, csv_filepath: str) -> None:
        """Read the index file and build the shuffled list of samples.

        Args:
            csv_filepath: Path to the ``<filepath>,<label>`` index file.

        Raises:
            ValueError: If a non-blank line has no comma or its label is not
                an integer.
        """
        super().__init__()
        self.all_files = []
        with open(csv_filepath, "r") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing empty line)
                    # instead of failing on the unpack below.
                    continue
                # The label is always the last field, so split from the right;
                # this keeps file paths that themselves contain commas intact.
                filepath, label = line.rsplit(",", 1)
                self.all_files.append((filepath, int(label)))
        shuffle(self.all_files)

    def __len__(self) -> int:
        """Return the number of samples listed in the index file."""
        return len(self.all_files)

    def __getitem__(self, index):
        """Load one sample from disk.

        Args:
            index: Position in the shuffled sample list.

        Returns:
            A tuple ``(x, y)``: ``x`` is the matrix stored at the sample's
            path, densified into a ``torch.float`` tensor with all size-1
            dimensions squeezed out; ``y`` is the integer label wrapped in a
            0-dim tensor.
        """
        to_load, y = self.all_files[index]

        # The matrix is stored sparse on disk and densified per item, so only
        # the requested sample is ever held in memory as a dense array.
        sparse_matrix = scipy.sparse.load_npz(to_load)
        dense_matrix = sparse_matrix.toarray()

        x = torch.tensor(dense_matrix, dtype=torch.float)
        # Drop singleton dimensions (e.g. a 1 x N matrix becomes a length-N
        # vector).
        x = x.squeeze()

        return x, torch.tensor(y)