methane_super_emitters.dataset
import gc
import glob
import os
import random

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, ToTensor, Normalize

from methane_super_emitters.dataset_stats import normalize


class TROPOMISuperEmitterDataset(Dataset):
    """In-memory dataset of normalized ``.npz`` samples with binary labels.

    Samples under ``<data_dir>/positive`` get label ``1.0`` and samples under
    ``<data_dir>/negative`` get label ``0.0``. The list of negative files is
    truncated to the number of positive files, so negatives never outnumber
    positives.
    """

    def __init__(self, data_dir, fields):
        """Eagerly load and normalize every selected sample.

        Args:
            data_dir: Directory holding the ``positive`` and ``negative``
                sub-directories of ``.npz`` files.
            fields: Forwarded unchanged to ``normalize`` with each archive.
        """
        self.data_dir = data_dir
        self.samples = []
        self._find_sample_files()
        labelled = (
            (self.positive_filenames, 1.0),
            (self.negative_filenames, 0.0),
        )
        for filenames, label in labelled:
            for filename in filenames:
                # np.load on an .npz returns an NpzFile that keeps the archive
                # open; the context manager closes it so file descriptors do
                # not pile up. Assumes ``normalize`` returns materialized
                # arrays rather than a lazy view of ``data`` -- TODO confirm.
                with np.load(filename) as data:
                    self.samples.append((normalize(data, fields), label))

    def _find_sample_files(self):
        """Populate ``positive_filenames`` and ``negative_filenames``."""
        self.positive_filenames = glob.glob(
            os.path.join(self.data_dir, "positive", "*.npz")
        )
        negative_filenames = glob.glob(
            os.path.join(self.data_dir, "negative", "*.npz")
        )
        # NOTE: glob does not guarantee an ordering, so which negatives survive
        # this truncation depends on the filesystem.
        self.negative_filenames = negative_filenames[: len(self.positive_filenames)]

    def unload(self):
        """Make garbage collector collect the data."""
        self.samples = []
        gc.collect()

    def __len__(self):
        """Return the number of samples currently held in memory."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, label)`` at ``idx`` as two float tensors."""
        img, label = self.samples[idx]
        return (
            torch.tensor(img, dtype=torch.float),
            torch.tensor(label, dtype=torch.float),
        )


class TROPOMISuperEmitterLocatorDataset(TROPOMISuperEmitterDataset):
    """Variant whose target is each archive's ``"location"`` entry.

    ``__len__``, ``__getitem__`` and ``unload`` are inherited unchanged; only
    the way samples are built differs.
    """

    def __init__(self, data_dir, fields):
        """Eagerly load every selected sample with its ``"location"`` target.

        The parent ``__init__`` is deliberately not called: it would fill
        ``samples`` with binary labels instead of locations.

        Args:
            data_dir: Directory holding the ``positive`` and ``negative``
                sub-directories of ``.npz`` files.
            fields: Forwarded unchanged to ``normalize`` with each archive.
        """
        self.data_dir = data_dir
        self.samples = []
        self._find_sample_files()
        self.filenames = self.positive_filenames + self.negative_filenames
        for filename in self.filenames:
            # "location" must be read while the archive is still open.
            with np.load(filename) as data:
                self.samples.append((normalize(data, fields), data["location"]))
class TROPOMISuperEmitterDataset(Dataset):
    """In-memory dataset of normalized ``.npz`` samples with binary labels.

    Samples under ``<data_dir>/positive`` get label ``1.0`` and samples under
    ``<data_dir>/negative`` get label ``0.0``. The list of negative files is
    truncated to the number of positive files, so negatives never outnumber
    positives.
    """

    def __init__(self, data_dir, fields):
        """Eagerly load and normalize every selected sample.

        Args:
            data_dir: Directory holding the ``positive`` and ``negative``
                sub-directories of ``.npz`` files.
            fields: Forwarded unchanged to ``normalize`` with each archive.
        """
        self.data_dir = data_dir
        self.samples = []
        self.positive_filenames = glob.glob(os.path.join(data_dir, "positive", "*.npz"))
        negative_filenames = glob.glob(os.path.join(data_dir, "negative", "*.npz"))
        # NOTE: glob does not guarantee an ordering, so which negatives survive
        # this truncation depends on the filesystem.
        self.negative_filenames = negative_filenames[: len(self.positive_filenames)]
        labelled = (
            (self.positive_filenames, 1.0),
            (self.negative_filenames, 0.0),
        )
        for filenames, label in labelled:
            for filename in filenames:
                # np.load on an .npz returns an NpzFile that keeps the archive
                # open; the context manager closes it so file descriptors do
                # not pile up. Assumes ``normalize`` returns materialized
                # arrays rather than a lazy view of ``data`` -- TODO confirm.
                with np.load(filename) as data:
                    self.samples.append((normalize(data, fields), label))

    def unload(self):
        """Make garbage collector collect the data."""
        self.samples = []
        import gc

        gc.collect()

    def __len__(self):
        """Return the number of samples currently held in memory."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, label)`` at ``idx`` as two float tensors."""
        img, label = self.samples[idx]
        return (
            torch.tensor(img, dtype=torch.float),
            torch.tensor(label, dtype=torch.float),
        )
An abstract class representing a Dataset.
All datasets that represent a map from keys to data samples should subclass it.
All subclasses should override __getitem__(), which supports fetching a data sample for a given key.
Subclasses may also optionally override __len__(), which many ~torch.utils.data.Sampler implementations and the default options of ~torch.utils.data.DataLoader expect to return the size of the dataset.
Subclasses may also optionally implement __getitems__() to speed up batched sample loading. This method accepts a list of the indices of the samples in a batch and returns a list of samples.
Note: ~torch.utils.data.DataLoader by default constructs an index sampler that yields integral indices. To make it work with a map-style dataset with non-integral indices/keys, a custom sampler must be provided.
def __init__(self, data_dir, fields):
    """Eagerly load and normalize positive and negative ``.npz`` samples.

    Positive samples are stored with label ``1.0`` and negative samples with
    label ``0.0``. The negative file list is truncated to the number of
    positive files.

    Args:
        data_dir: Directory holding the ``positive`` and ``negative``
            sub-directories of ``.npz`` files.
        fields: Forwarded unchanged to ``normalize`` with each archive.
    """
    self.data_dir = data_dir
    self.samples = []
    self.positive_filenames = glob.glob(os.path.join(data_dir, "positive", "*.npz"))
    negative_filenames = glob.glob(os.path.join(data_dir, "negative", "*.npz"))
    # NOTE: glob does not guarantee an ordering, so which negatives survive
    # this truncation depends on the filesystem.
    self.negative_filenames = negative_filenames[: len(self.positive_filenames)]
    for filenames, label in (
        (self.positive_filenames, 1.0),
        (self.negative_filenames, 0.0),
    ):
        for filename in filenames:
            # Close each NpzFile as soon as it has been consumed; np.load
            # otherwise leaves the archive open and leaks a file descriptor
            # per sample. Assumes ``normalize`` returns materialized arrays
            # rather than a lazy view of ``data`` -- TODO confirm.
            with np.load(filename) as data:
                self.samples.append((normalize(data, fields), label))
class TROPOMISuperEmitterLocatorDataset(TROPOMISuperEmitterDataset):
    """Variant whose target is each archive's ``"location"`` entry.

    Only ``__init__`` is defined here; everything else comes from
    ``TROPOMISuperEmitterDataset``.
    """

    def __init__(self, data_dir, fields):
        """Eagerly load every selected sample with its ``"location"`` target.

        The parent ``__init__`` is deliberately not called: it would fill
        ``samples`` with binary labels instead of locations.

        Args:
            data_dir: Directory holding the ``positive`` and ``negative``
                sub-directories of ``.npz`` files.
            fields: Forwarded unchanged to ``normalize`` with each archive.
        """
        self.data_dir = data_dir
        self.samples = []
        self.positive_filenames = glob.glob(os.path.join(data_dir, "positive", "*.npz"))
        negative_filenames = glob.glob(os.path.join(data_dir, "negative", "*.npz"))
        # NOTE: glob does not guarantee an ordering, so which negatives survive
        # this truncation depends on the filesystem.
        self.negative_filenames = negative_filenames[: len(self.positive_filenames)]
        self.filenames = self.positive_filenames + self.negative_filenames
        for filename in self.filenames:
            # np.load on an .npz returns an NpzFile that keeps the archive
            # open; the context manager closes it. "location" must therefore
            # be read inside the ``with`` block. Assumes ``normalize`` returns
            # materialized arrays rather than a lazy view -- TODO confirm.
            with np.load(filename) as data:
                self.samples.append((normalize(data, fields), data["location"]))
An abstract class representing a Dataset.
All datasets that represent a map from keys to data samples should subclass it.
All subclasses should override __getitem__(), which supports fetching a data sample for a given key.
Subclasses may also optionally override __len__(), which many ~torch.utils.data.Sampler implementations and the default options of ~torch.utils.data.DataLoader expect to return the size of the dataset.
Subclasses may also optionally implement __getitems__() to speed up batched sample loading. This method accepts a list of the indices of the samples in a batch and returns a list of samples.
Note: ~torch.utils.data.DataLoader by default constructs an index sampler that yields integral indices. To make it work with a map-style dataset with non-integral indices/keys, a custom sampler must be provided.
def __init__(self, data_dir, fields):
    """Eagerly load every selected sample with its ``"location"`` target.

    Positive files come first in ``self.filenames``, followed by the negative
    files, whose list is truncated to the number of positive files.

    Args:
        data_dir: Directory holding the ``positive`` and ``negative``
            sub-directories of ``.npz`` files.
        fields: Forwarded unchanged to ``normalize`` with each archive.
    """
    self.data_dir = data_dir
    self.samples = []
    self.positive_filenames = glob.glob(os.path.join(data_dir, "positive", "*.npz"))
    negative_filenames = glob.glob(os.path.join(data_dir, "negative", "*.npz"))
    # NOTE: glob does not guarantee an ordering, so which negatives survive
    # this truncation depends on the filesystem.
    self.negative_filenames = negative_filenames[: len(self.positive_filenames)]
    self.filenames = self.positive_filenames + self.negative_filenames
    for filename in self.filenames:
        # Close each NpzFile once consumed so file descriptors are not
        # leaked; "location" is read while the archive is still open.
        # Assumes ``normalize`` returns materialized arrays rather than a
        # lazy view of ``data`` -- TODO confirm.
        with np.load(filename) as data:
            self.samples.append((normalize(data, fields), data["location"]))