Source code for torchsig.utils.file_handlers.npy

"""File handlers for NumPy standard binary *.npy files."""

# TorchSig
import bisect
import csv
import itertools
import json
from pathlib import Path

# Built-In
# Third Party
import numpy as np

from torchsig.signals.signal_types import Signal
from torchsig.utils.file_handlers import FileReader



[docs]
class NPYReader(FileReader):
    """Read an externally stored dataset into TorchSig from NumPy ``.npy`` files.

    Sample data is read from every ``*.npy`` file found directly under ``root``
    (in sorted file-name order), dataset-level information from
    ``<root>/info.json`` and per-sample metadata from ``<root>/metadata.csv``.
    """


[docs]
    def __init__(self, root: str):
        """Index the ``.npy`` files under ``root`` and load the dataset info.

        Args:
            root: Directory holding the ``*.npy`` data files and ``info.json``.

        Raises:
            FileNotFoundError: If ``root`` contains no ``*.npy`` files.
            ValueError: If ``info.json`` cannot be loaded or has no ``"size"``
                entry.
        """
        super().__init__(root=root)
        self.root_dir = root
        # Sorted so that the file order, and with it the global sample index,
        # is deterministic.
        self.npy_files = sorted(Path(root).glob("*.npy"))
        if not self.npy_files:
            raise FileNotFoundError("No .npy files found in directory.")

        # file_start_indices[i] is the global index of the first sample of
        # npy_files[i]; read() bisects this list to locate a sample's file.
        self.file_start_indices: list[int] = []
        total = 0
        for file_path in self.npy_files:
            self.file_start_indices.append(total)
            # Memory-map the file only to read its shape; the array contents
            # are not loaded and the mapping is not kept.
            total += np.load(file_path, mmap_mode="r").shape[0]
        self.total_elements: int = total

        self.class_list: list[str] = ["BPSK", "QPSK", "Noise"]

        # info.json is parsed once; the dataset size is taken from that result
        # instead of opening and decoding the same file a second time.
        self.dataset_metadata: dict = self._load_json_metadata()
        try:
            self.dataset_size: int = self.dataset_metadata["size"]
        except (KeyError, TypeError) as err:
            # KeyError: no "size" entry. TypeError: the JSON document is not
            # a mapping (e.g. a list or a bare number).
            raise ValueError(f"Error loading {self.root}/info.json") from err


    def _load_json_metadata(self) -> dict:
        try:
            with open(f"{self.root}/info.json") as f:
                dataset_info = json.load(f)
                return dataset_info
        except:
            raise ValueError(f"Error loading {self.root}/info.json")


[docs]
    def read(self, idx: int) -> tuple[np.ndarray, list[dict]]:
        """Read and return the sample at global index `idx`."""
        if idx < 0 or idx >= self.total_elements:
            raise IndexError(
                f"Index {idx} out of range (0 <= idx < {self.total_elements})."
            )

        # Data
        # determine which file contains this index using binary search on file start indices
        file_idx = bisect.bisect_right(self.file_start_indices, idx) - 1

        # compute index within selected file
        in_file_idx = idx - self.file_start_indices[file_idx]

        # load only needed file chunk (memory-mapped)
        file_path = self.npy_files[file_idx]
        arr = np.load(file_path, mmap_mode="r")  # memmap file
        data = arr[in_file_idx]  # retrieve specific sample

        # Metadata
        with open(f"{self.root}/metadata.csv") as f:
            reader = csv.DictReader(
                f, fieldnames=["index", "label", "modcod", "sample_rate"]
            )
            # get to idx row
            row = next(itertools.islice(reader, idx, idx + 1), None)
            if row is None:
                raise IndexError(f"Metadata idx {idx} is out of bounds")

            row["index"] = int(row["index"])
            row["sample_rate"] = float(row["sample_rate"])
            # add class_name
            row["class_name"] = row["label"].lower()
            # add class index
            row["class_index"] = self.class_list.index(row["label"])

            row["num_signals_max"] = 1

            metadata = row

        return Signal(data=data, component_signals=[], metadata=metadata)


    def __len__(self) -> int:
        return self.dataset_size