| 1234567891011121314151617181920212223242526272829303132 |
- from torch.utils.data import Dataset
- import os
- from random import shuffle
- import numpy as np
- import torch
- import scipy.sparse
-
-
class BooleanNGramDataset(Dataset):
    """Dataset of SciPy sparse matrices stored as ``.npz`` files.

    The dataset is described by an index file with one ``filepath,label``
    entry per line, where ``label`` is an integer. Each sample is loaded
    lazily from disk when it is indexed.

    NOTE(review): the class name suggests the matrices are boolean n-gram
    feature vectors; nothing in this class enforces that -- confirm with the
    code that writes the ``.npz`` files.
    """

    def __init__(self, csv_filepath: str):
        """Read the index file and build the (shuffled) list of samples.

        Args:
            csv_filepath: Path to a text file whose non-blank lines each have
                the form ``filepath,label``.

        Raises:
            ValueError: If a non-blank line has no comma separator or its
                label is not an integer.
        """
        super().__init__()
        self.all_files = []
        with open(csv_filepath, "r") as f:
            for line_number, raw_line in enumerate(f, start=1):
                entry = raw_line.strip()
                if not entry:
                    # Blank lines (e.g. a trailing empty line) carry no sample.
                    continue
                # Split on the LAST comma so that file paths which themselves
                # contain commas are still parsed correctly.
                filepath, separator, label = entry.rpartition(",")
                if not separator:
                    raise ValueError(
                        f"{csv_filepath}:{line_number}: expected "
                        f"'filepath,label', got {entry!r}"
                    )
                self.all_files.append((filepath, int(label)))
        # The sample order is randomized once, at construction time.
        shuffle(self.all_files)

    def __len__(self):
        """Return the number of samples listed in the index file."""
        return len(self.all_files)

    def __getitem__(self, index):
        """Load one sample from disk.

        Args:
            index: Position in the shuffled sample list.

        Returns:
            A pair ``(x, y)``: ``x`` is the densified matrix as a float tensor
            with all size-1 dimensions squeezed out, and ``y`` is the integer
            label wrapped in a tensor.
        """
        to_load, y = self.all_files[index]
        # The matrix is stored sparse on disk; densify it before handing it
        # to torch.
        dense_matrix = scipy.sparse.load_npz(to_load).toarray()
        x = torch.tensor(dense_matrix, dtype=torch.float).squeeze()
        return x, torch.tensor(y)
|