methane_super_emitters.dataset_stats
This module gathers descriptive statistics of the dataset for normalization purposes.
1"""This module gathers descriptive statistics of the dataset for normalization porpoises.""" 2 3import click 4import os 5import glob 6from collections import defaultdict 7import numpy as np 8import json 9 10DATASET_STATS = { 11 "methane": { 12 "mean": 1862.5366583594632, 13 "median": 1878.5830688476562, 14 "std": 66.58710672017679, 15 "min": 971.0758666992188, 16 "max": 3324.72998046875 17 }, 18 "qa": { 19 "mean": 0.8106104458852114, 20 "median": 1.0, 21 "std": 0.3274058748852436, 22 "min": 0.0, 23 "max": 1.0 24 }, 25 "u10": { 26 "mean": -0.3505060237319564, 27 "median": -0.237032949924469, 28 "std": 3.0427688265223236, 29 "min": -16.58311653137207, 30 "max": 16.715341567993164 31 }, 32 "v10": { 33 "mean": -0.2016462336753655, 34 "median": -0.3121262937784195, 35 "std": 3.0363927475921657, 36 "min": -15.385231018066406, 37 "max": 22.70849609375 38 }, 39 "sza": { 40 "mean": 0.6852687383838644, 41 "median": 0.70140740275383, 42 "std": 0.17970663271587994, 43 "min": 0.24475011229515076, 44 "max": 0.9911531805992126 45 }, 46 "vza": { 47 "mean": 0.891019676507066, 48 "median": 0.928871214389801, 49 "std": 0.11010636053335394, 50 "min": 0.49698495864868164, 51 "max": 0.9998136758804321 52 }, 53 "scattering_angle": { 54 "mean": -0.6000556685447394, 55 "median": -0.6434199315569306, 56 "std": 0.26127084751804125, 57 "min": -0.9999982782919568, 58 "max": 0.4007348417791011 59 }, 60 "sa_std": { 61 "mean": 16.65754507928552, 62 "median": 8.775827407836914, 63 "std": 26.56312667561806, 64 "min": 0.0, 65 "max": 965.2076416015625 66 }, 67 "cloud_fraction": { 68 "mean": 0.010853144763690946, 69 "median": 0.0, 70 "std": 0.07413277621675883, 71 "min": 0.0, 72 "max": 1.0 73 }, 74 "cirrus_reflectance": { 75 "mean": 0.002408624468020358, 76 "median": 0.0, 77 "std": 0.00954486875404972, 78 "min": 0.0, 79 "max": 0.1243622899055481 80 }, 81 "methane_ratio_std": { 82 "mean": 0.0049173906056315655, 83 "median": 0.0, 84 "std": 0.0085810962161963, 85 "min": 0.0, 86 "max": 0.6137746572494507 87 }, 88 "methane_precision": { 89 "mean": 0.6099002507901794, 90 "median": 0.0, 91 "std": 1.0656924191550565, 92 "min": 0.0, 93 "max": 47.113624572753906 94 }, 95 "surface_albedo": { 96 "mean": 0.08097774154289568, 97 "median": 0.0, 98 "std": 0.16019963835712323, 99 "min": -0.05953392758965492, 100 "max": 0.7304432988166809 101 }, 102 "surface_albedo_precision": { 103 "mean": 6.182406963785989e-05, 104 "median": 0.0, 105 "std": 0.00012510325923893395, 106 "min": 0.0, 107 "max": 0.009150429628789425 108 }, 109 "aerosol_optical_thickness": { 110 "mean": 0.020228099796273182, 111 "median": 0.0, 112 "std": 0.04644573871021487, 113 "min": 0.0, 114 "max": 0.9478278756141663 115 } 116} 117 118def normalize(data, fields): 119 """Normalize the selected fields in the dataset. 
120 121 Parameters 122 ---------- 123 data: dict 124 A dictionary with the full dataset data from an .NPZ file 125 fields: list 126 A list with field names 127 128 Returns 129 ------- 130 NumPy array 131 """ 132 result = [] 133 for field in fields: 134 x = np.array(data[field]) 135 if field == "methane": 136 x[data["mask"]] = np.nanmedian(x) 137 x[np.argwhere(x > 1.0e30)] = 0.0 138 a = DATASET_STATS[field]["mean"] 139 b = DATASET_STATS[field]["std"] 140 x = (x - a) / b 141 result.append(x) 142 # for field in fields: 143 # if field in ["methane"]: 144 # m = np.array(data[field]) 145 # m[data["mask"]] = np.nanmedian(m) 146 # mean = DATASET_STATS[field]["mean"] 147 # std = DATASET_STATS[field]["std"] 148 # m = (m - mean) / std 149 # result.append(m) 150 # elif field == "qa": 151 # qa = np.array(data["qa"]) 152 # qa = (qa > 0.5).astype(np.float64) 153 # result.append(qa) 154 # elif field in ["u10", "v10"]: 155 # x = np.array(data[field]) 156 # x[np.argwhere(x > 1.0e30)] = 0.0 157 # mean = DATASET_STATS[field]["mean"] 158 # std = DATASET_STATS[field]["std"] 159 # x = (x - mean) / std 160 # result.append(x) 161 # elif field in ["sza", "vza", "scattering_angle", "sa_std"]: 162 # x = np.array(data[field]) 163 # mean = DATASET_STATS[field]["mean"] 164 # std = DATASET_STATS[field]["std"] 165 # x = (x - mean) / std 166 # result.append(x) 167 # else: 168 # result.append(np.array(data[field])) 169 return np.array(result) 170 171 172@click.command() 173@click.option("-i", "--input-dir", help="Directory with the full dataset") 174def main(input_dir): 175 results = defaultdict(list) 176 positive_filenames = glob.glob(os.path.join(input_dir, "positive", "*.npz")) 177 negative_filenames = glob.glob(os.path.join(input_dir, "negative", "*.npz")) 178 negative_filenames = negative_filenames[: len(positive_filenames)] 179 for filename in positive_filenames + negative_filenames: 180 data = np.load(filename) 181 for key in data: 182 if key not in [ 183 "time", 184 "location", 185 "lat", 186 "lon", 187 "lat_bounds", 188 "lon_bounds", 189 "mask", 190 "non_destriped", 191 ]: 192 x = data[key] 193 if key in [ 194 "u10", "v10", "cloud_fraction", "cirrus_reflectance", 195 "methane_ratio_std", "methane_precision", "surface_albedo", 196 "surface_albedo_precision", "aerosol_optical_thickness"]: 197 x[np.argwhere(x > 1.0e30)] = 0.0 198 results[key].append(x) 199 for key in results: 200 results[key] = np.array(results[key]).astype(np.float128) 201 stats = {} 202 for key in results: 203 stats[key] = { 204 "mean": float(np.nanmean(results[key])), 205 "median": float(np.nanmedian(results[key])), 206 "std": float(np.nanstd(results[key])), 207 "min": float(np.nanmin(results[key])), 208 "max": float(np.nanmax(results[key])), 209 } 210 print(json.dumps(stats, indent=4)) 211 212 213if __name__ == "__main__": 214 main()
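The statistics above were produced by main(). A minimal sketch of regenerating them programmatically follows, using click's test runner so no shell is needed; the dataset path is an assumption and should point at a directory with positive/ and negative/ subfolders of .npz patches, as main() expects.

import json

from click.testing import CliRunner

from methane_super_emitters.dataset_stats import main

runner = CliRunner()
# The path below is illustrative only; replace it with a real dataset directory.
result = runner.invoke(main, ["-i", "/data/tropomi_patches"])
assert result.exit_code == 0, result.output

stats = json.loads(result.output)  # same layout as DATASET_STATS
print(stats["methane"]["mean"], stats["methane"]["std"])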
DATASET_STATS =
{
    'methane': {'mean': 1862.5366583594632, 'median': 1878.5830688476562, 'std': 66.58710672017679, 'min': 971.0758666992188, 'max': 3324.72998046875},
    'qa': {'mean': 0.8106104458852114, 'median': 1.0, 'std': 0.3274058748852436, 'min': 0.0, 'max': 1.0},
    'u10': {'mean': -0.3505060237319564, 'median': -0.237032949924469, 'std': 3.0427688265223236, 'min': -16.58311653137207, 'max': 16.715341567993164},
    'v10': {'mean': -0.2016462336753655, 'median': -0.3121262937784195, 'std': 3.0363927475921657, 'min': -15.385231018066406, 'max': 22.70849609375},
    'sza': {'mean': 0.6852687383838644, 'median': 0.70140740275383, 'std': 0.17970663271587994, 'min': 0.24475011229515076, 'max': 0.9911531805992126},
    'vza': {'mean': 0.891019676507066, 'median': 0.928871214389801, 'std': 0.11010636053335394, 'min': 0.49698495864868164, 'max': 0.9998136758804321},
    'scattering_angle': {'mean': -0.6000556685447394, 'median': -0.6434199315569306, 'std': 0.26127084751804125, 'min': -0.9999982782919568, 'max': 0.4007348417791011},
    'sa_std': {'mean': 16.65754507928552, 'median': 8.775827407836914, 'std': 26.56312667561806, 'min': 0.0, 'max': 965.2076416015625},
    'cloud_fraction': {'mean': 0.010853144763690946, 'median': 0.0, 'std': 0.07413277621675883, 'min': 0.0, 'max': 1.0},
    'cirrus_reflectance': {'mean': 0.002408624468020358, 'median': 0.0, 'std': 0.00954486875404972, 'min': 0.0, 'max': 0.1243622899055481},
    'methane_ratio_std': {'mean': 0.0049173906056315655, 'median': 0.0, 'std': 0.0085810962161963, 'min': 0.0, 'max': 0.6137746572494507},
    'methane_precision': {'mean': 0.6099002507901794, 'median': 0.0, 'std': 1.0656924191550565, 'min': 0.0, 'max': 47.113624572753906},
    'surface_albedo': {'mean': 0.08097774154289568, 'median': 0.0, 'std': 0.16019963835712323, 'min': -0.05953392758965492, 'max': 0.7304432988166809},
    'surface_albedo_precision': {'mean': 6.182406963785989e-05, 'median': 0.0, 'std': 0.00012510325923893395, 'min': 0.0, 'max': 0.009150429628789425},
    'aerosol_optical_thickness': {'mean': 0.020228099796273182, 'median': 0.0, 'std': 0.04644573871021487, 'min': 0.0, 'max': 0.9478278756141663}
}
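Each entry drives a plain z-score transform, (x - mean) / std, which is what normalize() below applies per field. A small hand-rolled illustration follows; the 2x2 methane patch is made up for demonstration and is not real data.

import numpy as np

from methane_super_emitters.dataset_stats import DATASET_STATS

# Made-up methane mixing-ratio values, for illustration only.
patch = np.array([[1850.0, 1900.0],
                  [1795.0, 2100.0]])

m = DATASET_STATS["methane"]
z = (patch - m["mean"]) / m["std"]  # z-score per pixel
print(z.round(2))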
def normalize(data, fields):
Normalize the selected fields in the dataset.
Parameters
- data (dict): A dictionary-like object with the full data for one sample, loaded from an .npz file
- fields (list): A list of field names to normalize

Returns
- NumPy array: a stacked array with one normalized channel per requested field
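A hedged usage sketch for normalize(): load one sample and stack a few channels as model input. The file name and the particular field list are assumptions; any fields present both in the .npz file and in DATASET_STATS should work (the "methane" case also expects a "mask" array in the file).

import numpy as np

from methane_super_emitters.dataset_stats import normalize

data = np.load("positive/patch_0001.npz")  # hypothetical sample file
fields = ["methane", "u10", "v10", "qa"]   # assumed channel subset

channels = normalize(data, fields)  # one standardized 2-D channel per field
print(channels.shape)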