From eefce12ed2caf5bb5a54cbd9c3ae16a0b47a033f Mon Sep 17 00:00:00 2001 From: mtazzari Date: Tue, 5 Jul 2022 14:01:44 +0100 Subject: [PATCH 01/33] [gulpy] first implementation --- oasislmf/pytools/gul/random.py | 75 ++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/oasislmf/pytools/gul/random.py b/oasislmf/pytools/gul/random.py index f7b237a0e3..c4c67d8157 100644 --- a/oasislmf/pytools/gul/random.py +++ b/oasislmf/pytools/gul/random.py @@ -3,8 +3,10 @@ """ +from math import sqrt import logging import numpy as np +from scipy.stats import norm from numba import njit logger = logging.getLogger(__name__) @@ -72,6 +74,79 @@ def get_random_generator(random_generator): raise ValueError(f"No random generator exists for random_generator={random_generator}.") +EVENT_ID_HASH_CODE = np.int64(1943_272_559) +PERIL_CORRELATION_GROUP_HASH = np.int64(1836311903) +HASH_MOD_CODE = np.int64(2147483648) + + +@njit(cache=True, fastmath=True) +def generate_correlated_hash_vector(peril_correlation_group, event_id, base_seed=0): + """Generate hash for an `event_id`. + + Args: + event_id (int): event id. + base_seed (int, optional): base random seed. Defaults to 0. + + Returns: + int64: hash + """ + for i in range(1, peril_correlation_group.shape[0]): # why start from 1?? + peril_correlation_group[i] = (base_seed + + (i * PERIL_CORRELATION_GROUP_HASH) % HASH_MOD_CODE + + (event_id * EVENT_ID_HASH_CODE) % HASH_MOD_CODE) % HASH_MOD_CODE + + return peril_correlation_group + + +def compute_norm_inv_cdf_lookup(arr_min, arr_max, arr_N): + return norm.ppf(np.linspace(arr_min, arr_max, arr_N)) + + +def compute_norm_cdf_lookup(arr_min, arr_max, arr_N): + return norm.cdf(np.linspace(arr_min, arr_max, arr_N)) + + +norm_inv_cdf = compute_norm_inv_cdf_lookup(1e-16, 1 - 1e-16, 1000000) +norm_cdf = compute_norm_cdf_lookup(-20., 20., 1000000) +# pre-compute lookup tables for the Gaussian cdf and inverse cdf + +# Note: +# - the size of these arrays can be increased to achieve better resolution in the Gaussian cdf and inv cdf. +# - the function `get_corr_rval` to compute the correlated numbers is not affected by arr_N and arr_N_cdf +arr_min = 1e-16 +arr_max = 1 - 1e-16 +arr_N = 1000000 +norm_inv_cdf = norm.ppf(np.linspace(arr_min, arr_max, arr_N)) + +arr_min_cdf = -20. +arr_max_cdf = 20. +arr_N_cdf = 1000000 +norm_cdf = norm.cdf(np.linspace(arr_min_cdf, arr_max_cdf, arr_N_cdf)) + + +@njit(cache=True, fastmath=True) +def get_norm_cdf_cell_nb(x, arr_min, arr_max, arr_N): + return (x - arr_min) * (arr_N - 1) // (arr_max - arr_min) + + +@njit(cache=True, fastmath=True) +def get_corr_rval_v2(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, Nsamples, z_unif): + + sqrt_rho = sqrt(rho) + sqrt_1_minus_rho = sqrt(1. - rho) + + for i in range(Nsamples): + + x_norm = norm_inv_cdf[get_norm_cdf_cell_nb(x_unif[i], arr_min, arr_max, arr_N)] + y_norm = norm_inv_cdf[get_norm_cdf_cell_nb(y_unif[i], arr_min, arr_max, arr_N)] + z_norm = sqrt_rho * x_norm + sqrt_1_minus_rho * y_norm + + z_unif[i] = norm_cdf[get_norm_cdf_cell_nb(z_norm, arr_min_cdf, arr_max_cdf, arr_N_cdf)] + + +# return z_unif + + @njit(cache=True, fastmath=True) def random_MersenneTwister(seeds, n): """Generate random numbers using the default Mersenne Twister algorithm. 
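For reference, `get_corr_rval_v2` in the patch above builds correlated uniform samples with a Gaussian-copula construction: the two independent uniform draws are mapped to normal scores through the inverse normal CDF, combined as z = sqrt(rho) * x + sqrt(1 - rho) * y (which keeps z standard normal), and mapped back to the uniform scale through the normal CDF; the precomputed `norm_inv_cdf` and `norm_cdf` lookup tables stand in for direct scipy calls inside the numba-compiled loop. The following is a minimal sketch of the same construction using scipy directly; it is not part of the patch, and the helper name `correlate_uniforms` is illustrative only:

    import numpy as np
    from scipy.stats import norm

    def correlate_uniforms(x_unif, y_unif, rho):
        # map the uniforms to normal scores, mix them with weights
        # sqrt(rho) and sqrt(1 - rho), then map back to the uniform scale
        x_norm = norm.ppf(x_unif)
        y_norm = norm.ppf(y_unif)
        z_norm = np.sqrt(rho) * x_norm + np.sqrt(1.0 - rho) * y_norm
        return norm.cdf(z_norm)

    rng = np.random.default_rng(0)
    x, y = rng.random(5), rng.random(5)
    print(correlate_uniforms(x, y, rho=0.5))

Two outputs built from the same x draw but independent y draws have correlation rho on the normal-score scale, which is what the later patches in this series use to correlate items belonging to the same peril correlation group.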
From a8f89de751bbf877acccf2b66776c62e6b7483aa Mon Sep 17 00:00:00 2001 From: mtazzari Date: Tue, 5 Jul 2022 18:17:19 +0100 Subject: [PATCH 02/33] [gulpy] implementing correlated rng --- oasislmf/pytools/gul/manager.py | 42 ++++++++++++++++++++++++++++++--- oasislmf/pytools/gul/random.py | 25 +++----------------- oasislmf/pytools/gulpy.py | 2 ++ 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/oasislmf/pytools/gul/manager.py b/oasislmf/pytools/gul/manager.py index de8e9483aa..bdd553e7ff 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -2,6 +2,7 @@ This file is the entry point for the gul command for the package. """ +from random import sample import sys import os from select import select @@ -23,7 +24,12 @@ write_negative_sidx, write_sample_header, write_sample_rec, read_getmodel_stream ) -from oasislmf.pytools.gul.random import get_random_generator + +from oasislmf.pytools.gul.random import ( + get_random_generator, compute_norm_cdf_lookup, + compute_norm_inv_cdf_lookup, get_corr_rval +) + from oasislmf.pytools.gul.core import split_tiv, get_gul, setmaxloss, compute_mean_loss from oasislmf.pytools.gul.utils import append_to_dict_value, binary_search @@ -116,7 +122,7 @@ def generate_item_map(items, coverages): def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debug, - random_generator, file_in=None, file_out=None, **kwargs): + random_generator, file_in=None, file_out=None, correlated=False, **kwargs): """Execute the main gulpy worklow. Args: @@ -209,14 +215,44 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # create the array to store the seeds seeds = np.zeros(len(np.unique(items['group_id'])), dtype=Item.dtype['group_id']) + # pre-compute lookup tables for the Gaussian cdf and inverse cdf + # this is used for the generation of correlated random numbers + + # Note: + # - the size `arr_N` and `arr_N_cdf` can be increased to achieve better resolution in the Gaussian cdf and inv cdf. + # - the function `get_corr_rval` to compute the correlated numbers is not affected by arr_N and arr_N_cdf + arr_min = 1e-16 + arr_max = 1 - 1e-16 + arr_N = 1000000 + + arr_min_cdf = -20. + arr_max_cdf = 20. 
+ arr_N_cdf = 1000000 + + norm_inv_cdf = compute_norm_inv_cdf_lookup(arr_min, arr_max, arr_N) + norm_cdf = compute_norm_cdf_lookup(arr_min_cdf, arr_max_cdf, arr_N_cdf) + # create buffer to be reused to store all losses for one coverage losses_buffer = np.zeros((sample_size + NUM_IDX + 1, np.max(coverages[1:]['max_items'])), dtype=oasis_float) + z_unif = np.zeros(sample_size) for event_data in read_getmodel_stream(streams_in, item_map, coverages, compute, seeds): event_id, compute_i, items_data, recs, rec_idx_ptr, rng_index = event_data - rndms = generate_rndm(seeds[:rng_index], sample_size) + if not correlated: + rndms = generate_rndm(seeds[:rng_index], sample_size) + else: + rndms_x = generate_rndm(seeds[:rng_index], sample_size) + rndms_y = generate_rndm(seeds[:rng_index], sample_size) + logger.info(rndms_x.shape) + rho = 0.5 # get this from the map + # TODO: rndms_z needs to be 2d of shape Nseeds x samplesize + for i_seed in range(rndms_x.shape[0]): + rndms_z = get_corr_rval( + rndms_x[i_seed, :], rndms_y[i_seed, :], rho, arr_min, arr_max, arr_N, norm_inv_cdf, + arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, sample_size, z_unif + ) last_processed_coverage_ids_idx = 0 while last_processed_coverage_ids_idx < compute_i: diff --git a/oasislmf/pytools/gul/random.py b/oasislmf/pytools/gul/random.py index c4c67d8157..8201d23558 100644 --- a/oasislmf/pytools/gul/random.py +++ b/oasislmf/pytools/gul/random.py @@ -106,31 +106,13 @@ def compute_norm_cdf_lookup(arr_min, arr_max, arr_N): return norm.cdf(np.linspace(arr_min, arr_max, arr_N)) -norm_inv_cdf = compute_norm_inv_cdf_lookup(1e-16, 1 - 1e-16, 1000000) -norm_cdf = compute_norm_cdf_lookup(-20., 20., 1000000) -# pre-compute lookup tables for the Gaussian cdf and inverse cdf - -# Note: -# - the size of these arrays can be increased to achieve better resolution in the Gaussian cdf and inv cdf. -# - the function `get_corr_rval` to compute the correlated numbers is not affected by arr_N and arr_N_cdf -arr_min = 1e-16 -arr_max = 1 - 1e-16 -arr_N = 1000000 -norm_inv_cdf = norm.ppf(np.linspace(arr_min, arr_max, arr_N)) - -arr_min_cdf = -20. -arr_max_cdf = 20. -arr_N_cdf = 1000000 -norm_cdf = norm.cdf(np.linspace(arr_min_cdf, arr_max_cdf, arr_N_cdf)) - - @njit(cache=True, fastmath=True) def get_norm_cdf_cell_nb(x, arr_min, arr_max, arr_N): - return (x - arr_min) * (arr_N - 1) // (arr_max - arr_min) + return int((x - arr_min) * (arr_N - 1) // (arr_max - arr_min)) @njit(cache=True, fastmath=True) -def get_corr_rval_v2(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, Nsamples, z_unif): +def get_corr_rval(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, Nsamples, z_unif): sqrt_rho = sqrt(rho) sqrt_1_minus_rho = sqrt(1. 
- rho) @@ -143,8 +125,7 @@ def get_corr_rval_v2(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, z_unif[i] = norm_cdf[get_norm_cdf_cell_nb(z_norm, arr_min_cdf, arr_max_cdf, arr_N_cdf)] - -# return z_unif + return z_unif @njit(cache=True, fastmath=True) diff --git a/oasislmf/pytools/gulpy.py b/oasislmf/pytools/gulpy.py index ed022b0471..30be823f0d 100644 --- a/oasislmf/pytools/gulpy.py +++ b/oasislmf/pytools/gulpy.py @@ -12,6 +12,8 @@ ) parser.add_argument('-a', help='back-allocation rule', default=0, type=int, dest='alloc_rule') +parser.add_argument('-c', '--correlated', help='(wip) if passed, it activates the correlated rng', + action='store_true', dest='correlated', default=False) parser.add_argument('-d', help='output random numbers instead of gul (default: False).', default=False, action='store_true', dest='debug') parser.add_argument('-i', '--file-in', help='filename of input stream.', action='store', type=str, dest='file_in') From d8215d87e0389f402124a7f8550d2a12763212f9 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Wed, 6 Jul 2022 15:37:24 +0100 Subject: [PATCH 03/33] [wip] implementing correlated rng --- oasislmf/pytools/gul/manager.py | 49 +++++++++++++++++++-------------- oasislmf/pytools/gul/random.py | 19 ++++++++----- oasislmf/pytools/gulpy.py | 4 ++- 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/oasislmf/pytools/gul/manager.py b/oasislmf/pytools/gul/manager.py index bdd553e7ff..dca002b611 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -22,12 +22,12 @@ ) from oasislmf.pytools.gul.io import ( write_negative_sidx, write_sample_header, - write_sample_rec, read_getmodel_stream + write_sample_rec, read_getmodel_stream, ) from oasislmf.pytools.gul.random import ( get_random_generator, compute_norm_cdf_lookup, - compute_norm_inv_cdf_lookup, get_corr_rval + compute_norm_inv_cdf_lookup, get_corr_rval, generate_correlated_hash_vector ) from oasislmf.pytools.gul.core import split_tiv, get_gul, setmaxloss, compute_mean_loss @@ -240,26 +240,21 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu event_id, compute_i, items_data, recs, rec_idx_ptr, rng_index = event_data - if not correlated: - rndms = generate_rndm(seeds[:rng_index], sample_size) - else: - rndms_x = generate_rndm(seeds[:rng_index], sample_size) - rndms_y = generate_rndm(seeds[:rng_index], sample_size) - logger.info(rndms_x.shape) - rho = 0.5 # get this from the map - # TODO: rndms_z needs to be 2d of shape Nseeds x samplesize - for i_seed in range(rndms_x.shape[0]): - rndms_z = get_corr_rval( - rndms_x[i_seed, :], rndms_y[i_seed, :], rho, arr_min, arr_max, arr_N, norm_inv_cdf, - arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, sample_size, z_unif - ) + # generate the correlated samples for the whole event, for all peril correlation groups + peril_correlation_group_ids = [1, 2, 3] # TODO get it from input data + # Nperil_correlation_groups = len(peril_correlation_group_ids) + corr_seeds = generate_correlated_hash_vector(peril_correlation_group_ids, event_id) + eps_ij = generate_rndm(corr_seeds, sample_size) + # Nseeds = len(seeds[:rng_index]) + + rndms_base = generate_rndm(seeds[:rng_index], sample_size) last_processed_coverage_ids_idx = 0 while last_processed_coverage_ids_idx < compute_i: cursor, cursor_bytes, last_processed_coverage_ids_idx = compute_event_losses( event_id, coverages, compute[:compute_i], items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, - damage_bins, loss_threshold, losses_buffer, 
alloc_rule, rndms, debug, + damage_bins, loss_threshold, losses_buffer, alloc_rule, rndms_base, eps_ij, correlated, debug, GULPY_STREAM_BUFF_SIZE_WRITE, int32_mv_write, cursor ) @@ -275,7 +270,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu @njit(cache=True, fastmath=True) def compute_event_losses(event_id, coverages, coverage_ids, items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, damage_bins, - loss_threshold, losses, alloc_rule, rndms, debug, buff_size, + loss_threshold, losses, alloc_rule, rndms_base, eps_ij, correlated, debug, buff_size, int32_mv, cursor): """Compute losses for an event. @@ -326,7 +321,9 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, item = items[item_i] damagecdf_i = item['damagecdf_i'] rng_index = item['rng_index'] - + peril_correlation_group = item['peril_correlation_group'] + # probably we need a peril_correlation_group_index and an array that maps index to peril_correlation_group values. + # for now, let's assume peril_correlation_group start from 0. rec = recs[rec_idx_ptr[damagecdf_i]:rec_idx_ptr[damagecdf_i + 1]] prob_to = rec['prob_to'] bin_mean = rec['bin_mean'] @@ -344,14 +341,26 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, losses[MEAN_IDX, item_i] = gul_mean if sample_size > 0: + if correlated: + # TODO: pass these variables in + rndms = get_corr_rval( + eps_ij[peril_correlation_group], rndms_base[rng_index], + rho, arr_min, arr_max, arr_N, norm_inv_cdf, + arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, sample_size + ) + else: + rndms = rndms_base[rng_index] + + # this can be optimized by caching rndms with a dict + if debug: for sample_idx in range(1, sample_size + 1): - rval = rndms[rng_index][sample_idx - 1] + rval = rndms[sample_idx - 1] losses[sample_idx, item_i] = rval else: for sample_idx in range(1, sample_size + 1): # cap `rval` to the maximum `prob_to` value (which should be 1.) - rval = rndms[rng_index][sample_idx - 1] + rval = rndms[sample_idx - 1] if rval >= prob_to[Nbins - 1]: rval = prob_to[Nbins - 1] - 0.00000003 diff --git a/oasislmf/pytools/gul/random.py b/oasislmf/pytools/gul/random.py index 8201d23558..44eaa39826 100644 --- a/oasislmf/pytools/gul/random.py +++ b/oasislmf/pytools/gul/random.py @@ -80,7 +80,7 @@ def get_random_generator(random_generator): @njit(cache=True, fastmath=True) -def generate_correlated_hash_vector(peril_correlation_group, event_id, base_seed=0): +def generate_correlated_hash_vector(peril_correlation_groups, event_id, base_seed=0): """Generate hash for an `event_id`. Args: @@ -90,12 +90,17 @@ def generate_correlated_hash_vector(peril_correlation_group, event_id, base_seed Returns: int64: hash """ - for i in range(1, peril_correlation_group.shape[0]): # why start from 1?? - peril_correlation_group[i] = (base_seed + - (i * PERIL_CORRELATION_GROUP_HASH) % HASH_MOD_CODE + - (event_id * EVENT_ID_HASH_CODE) % HASH_MOD_CODE) % HASH_MOD_CODE + Nperil_correlation_groups = peril_correlation_groups.shape[0] + correlated_hashes = np.zeros(Nperil_correlation_groups, dtype='int64') - return peril_correlation_group + for i in range(Nperil_correlation_groups): # why start from 1?? 
+ correlated_hashes[i] = ( + base_seed + + (peril_correlation_groups[i] * PERIL_CORRELATION_GROUP_HASH) % HASH_MOD_CODE + + (event_id * EVENT_ID_HASH_CODE) % HASH_MOD_CODE + ) % HASH_MOD_CODE + + return correlated_hashes def compute_norm_inv_cdf_lookup(arr_min, arr_max, arr_N): @@ -116,9 +121,9 @@ def get_corr_rval(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, ar sqrt_rho = sqrt(rho) sqrt_1_minus_rho = sqrt(1. - rho) + z_unif = np.zeros(x_unif.shape[0], dtype='float64') for i in range(Nsamples): - x_norm = norm_inv_cdf[get_norm_cdf_cell_nb(x_unif[i], arr_min, arr_max, arr_N)] y_norm = norm_inv_cdf[get_norm_cdf_cell_nb(y_unif[i], arr_min, arr_max, arr_N)] z_norm = sqrt_rho * x_norm + sqrt_1_minus_rho * y_norm diff --git a/oasislmf/pytools/gulpy.py b/oasislmf/pytools/gulpy.py index 30be823f0d..6df677c0ce 100644 --- a/oasislmf/pytools/gulpy.py +++ b/oasislmf/pytools/gulpy.py @@ -12,7 +12,9 @@ ) parser.add_argument('-a', help='back-allocation rule', default=0, type=int, dest='alloc_rule') -parser.add_argument('-c', '--correlated', help='(wip) if passed, it activates the correlated rng', +parser.add_argument('-c', '--correlated', + help='[EXPERIMENTAL] if passed, uses peril correlation groups to produce ' + 'correlated samples for items within the same peril correlation group', action='store_true', dest='correlated', default=False) parser.add_argument('-d', help='output random numbers instead of gul (default: False).', default=False, action='store_true', dest='debug') From 614dd3de85d8779074020596a813f1f396108cd3 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Tue, 12 Jul 2022 16:46:39 +0100 Subject: [PATCH 04/33] [wip] --- oasislmf/pytools/gul/io.py | 2 ++ oasislmf/pytools/gul/manager.py | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/oasislmf/pytools/gul/io.py b/oasislmf/pytools/gul/io.py index 259f6a88f8..d17ad4ed46 100644 --- a/oasislmf/pytools/gul/io.py +++ b/oasislmf/pytools/gul/io.py @@ -245,6 +245,8 @@ def stream_to_data(int32_mv, valid_buf, size_cdf_entry, last_event_id, item_map, seeds[rng_index] = generate_hash(group_id, last_event_id) this_rng_index = rng_index rng_index += 1 + + # TODO Q for Stephane: should we create a hash for one group_id, for all peril correlation groups by default? else: this_rng_index = group_id_rng_index[group_id] diff --git a/oasislmf/pytools/gul/manager.py b/oasislmf/pytools/gul/manager.py index dca002b611..71433dd3c3 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -159,6 +159,15 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # read coverages from file coverages_tiv = get_coverages(input_path) + # TODO finish here + # get model settings + # from oasislmf.preparation.correlations import get_model_settings + # get_model_settings("model_settings.json") + + # from oasislmf.pytools.data_layer.conversions.correlations import CorrelationsData + # file_path = os.path.join(input_path, 'correlations.bin') + # data = CorrelationsData.from_bin(file_path=file_path) + # init the structure for computation # coverages are numbered from 1, therefore we skip element 0 in `coverages` coverages = np.zeros(coverages_tiv.shape[0] + 1, coverage_type) @@ -295,7 +304,7 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, buff_size (int): size in bytes of the output buffer. int32_mv (numpy.ndarray): int32 view of the memoryview where the output is buffered. cursor (int): index of int32_mv where to start writing. 
- + Returns: int, int, int: updated value of cursor, updated value of cursor_bytes, last last_processed_coverage_ids_idx """ From d407089d4d0561ab69caff2ae483df4d91dcaa67 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Fri, 15 Jul 2022 16:38:39 +0100 Subject: [PATCH 05/33] [gulpy] working implementation of the correlated random values --- oasislmf/pytools/gul/manager.py | 100 +++++++++++++++++++------------- oasislmf/pytools/gul/random.py | 29 +++++---- 2 files changed, 75 insertions(+), 54 deletions(-) diff --git a/oasislmf/pytools/gul/manager.py b/oasislmf/pytools/gul/manager.py index 71433dd3c3..0defb0e308 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -12,8 +12,10 @@ from numba import njit from numba.typed import Dict, List +from oasislmf.pytools.data_layer.conversions.correlations import CorrelationsData + from oasislmf.pytools.getmodel.manager import get_damage_bins, Item -from oasislmf.pytools.getmodel.common import oasis_float +from oasislmf.pytools.getmodel.common import oasis_float, Correlation from oasislmf.pytools.gul.common import ( MEAN_IDX, STD_DEV_IDX, TIV_IDX, CHANCE_OF_LOSS_IDX, MAX_LOSS_IDX, NUM_IDX, @@ -159,15 +161,6 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # read coverages from file coverages_tiv = get_coverages(input_path) - # TODO finish here - # get model settings - # from oasislmf.preparation.correlations import get_model_settings - # get_model_settings("model_settings.json") - - # from oasislmf.pytools.data_layer.conversions.correlations import CorrelationsData - # file_path = os.path.join(input_path, 'correlations.bin') - # data = CorrelationsData.from_bin(file_path=file_path) - # init the structure for computation # coverages are numbered from 1, therefore we skip element 0 in `coverages` coverages = np.zeros(coverages_tiv.shape[0] + 1, coverage_type) @@ -224,46 +217,71 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # create the array to store the seeds seeds = np.zeros(len(np.unique(items['group_id'])), dtype=Item.dtype['group_id']) - # pre-compute lookup tables for the Gaussian cdf and inverse cdf - # this is used for the generation of correlated random numbers + logger.info(f"Correlated random number generation: switched {'ON' if correlated else 'OFF'}") + if correlated: + file_path = os.path.join(input_path, 'correlations.bin') + data = CorrelationsData.from_bin(file_path=file_path).data + Nperil_correlation_groups = len(data) + + corr_data_by_item_id = np.ndarray(Nperil_correlation_groups + 1, dtype=Correlation) + corr_data_by_item_id[0] = (0, 0.) + corr_data_by_item_id[1:]['peril_correlation_group'] = np.array(data['peril_correlation_group']) + corr_data_by_item_id[1:]['correlation_value'] = np.array(data['correlation_value']) + + logger.info( + f"Correlation values for {Nperil_correlation_groups} peril correlation groups have been imported." + ) + + unique_peril_correlation_groups = np.unique(corr_data_by_item_id[1:]['peril_correlation_group']) - # Note: - # - the size `arr_N` and `arr_N_cdf` can be increased to achieve better resolution in the Gaussian cdf and inv cdf. - # - the function `get_corr_rval` to compute the correlated numbers is not affected by arr_N and arr_N_cdf - arr_min = 1e-16 - arr_max = 1 - 1e-16 - arr_N = 1000000 + # pre-compute lookup tables for the Gaussian cdf and inverse cdf + # Notes: + # - the size `arr_N` and `arr_N_cdf` can be increased to achieve better resolution in the Gaussian cdf and inv cdf. 
+ # - the function `get_corr_rval` to compute the correlated numbers is not affected by arr_N and arr_N_cdf + arr_min, arr_max, arr_N = 1e-16, 1 - 1e-16, 1000000 + arr_min_cdf, arr_max_cdf, arr_N_cdf = -20., 20., 1000000 + norm_inv_cdf = compute_norm_inv_cdf_lookup(arr_min, arr_max, arr_N) + norm_cdf = compute_norm_cdf_lookup(arr_min_cdf, arr_max_cdf, arr_N_cdf) - arr_min_cdf = -20. - arr_max_cdf = 20. - arr_N_cdf = 1000000 + # buffer to be re-used to store all the correlated random values + z_unif = np.zeros(sample_size, dtype='float64') - norm_inv_cdf = compute_norm_inv_cdf_lookup(arr_min, arr_max, arr_N) - norm_cdf = compute_norm_cdf_lookup(arr_min_cdf, arr_max_cdf, arr_N_cdf) + else: + # create dummy data structures with proper dtypes to allow correct numba compilation + corr_data_by_item_id = np.ndarray(1, dtype=Correlation) + arr_min, arr_max, arr_N = 0, 0, 0 + arr_min_cdf, arr_max_cdf, arr_N_cdf = 0, 0, 0 + norm_inv_cdf, norm_cdf = np.zeros(1, dtype='float64'), np.zeros(1, dtype='float64') + z_unif = np.zeros(1, dtype='float64') # create buffer to be reused to store all losses for one coverage losses_buffer = np.zeros((sample_size + NUM_IDX + 1, np.max(coverages[1:]['max_items'])), dtype=oasis_float) - z_unif = np.zeros(sample_size) for event_data in read_getmodel_stream(streams_in, item_map, coverages, compute, seeds): event_id, compute_i, items_data, recs, rec_idx_ptr, rng_index = event_data + # generation of "base" random values is done as before + rndms_base = generate_rndm(seeds[:rng_index], sample_size) + + # to generate the correlated part, we do the hashing here for now (instead of in stream_to_data) # generate the correlated samples for the whole event, for all peril correlation groups - peril_correlation_group_ids = [1, 2, 3] # TODO get it from input data - # Nperil_correlation_groups = len(peril_correlation_group_ids) - corr_seeds = generate_correlated_hash_vector(peril_correlation_group_ids, event_id) - eps_ij = generate_rndm(corr_seeds, sample_size) - # Nseeds = len(seeds[:rng_index]) + if correlated: + corr_seeds = generate_correlated_hash_vector(unique_peril_correlation_groups, event_id) + eps_ij = generate_rndm(corr_seeds, sample_size, skip_seeds=1) - rndms_base = generate_rndm(seeds[:rng_index], sample_size) + else: + # create dummy data structures with proper dtypes to allow correct numba compilation + corr_seeds = np.zeros(1, dtype='int64') + eps_ij = np.zeros((1, 1), dtype='float64') last_processed_coverage_ids_idx = 0 while last_processed_coverage_ids_idx < compute_i: cursor, cursor_bytes, last_processed_coverage_ids_idx = compute_event_losses( event_id, coverages, compute[:compute_i], items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, - damage_bins, loss_threshold, losses_buffer, alloc_rule, rndms_base, eps_ij, correlated, debug, + damage_bins, loss_threshold, losses_buffer, alloc_rule, correlated, rndms_base, eps_ij, corr_data_by_item_id, + arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, z_unif, debug, GULPY_STREAM_BUFF_SIZE_WRITE, int32_mv_write, cursor ) @@ -279,8 +297,9 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu @njit(cache=True, fastmath=True) def compute_event_losses(event_id, coverages, coverage_ids, items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, damage_bins, - loss_threshold, losses, alloc_rule, rndms_base, eps_ij, correlated, debug, buff_size, - int32_mv, cursor): + loss_threshold, losses, alloc_rule, correlated, 
rndms_base, eps_ij, corr_data_by_item_id, + arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, + z_unif, debug, buff_size, int32_mv, cursor): """Compute losses for an event. Args: @@ -309,6 +328,7 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, int, int, int: updated value of cursor, updated value of cursor_bytes, last last_processed_coverage_ids_idx """ max_size_per_item = (sample_size + NUM_IDX + 1) * gulSampleslevelRec_size + 2 * gulSampleslevelHeader_size + for coverage_i in range(last_processed_coverage_ids_idx, coverage_ids.shape[0]): coverage = coverages[coverage_ids[coverage_i]] tiv = coverage['tiv'] # coverages are indexed from 1 @@ -330,9 +350,6 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, item = items[item_i] damagecdf_i = item['damagecdf_i'] rng_index = item['rng_index'] - peril_correlation_group = item['peril_correlation_group'] - # probably we need a peril_correlation_group_index and an array that maps index to peril_correlation_group values. - # for now, let's assume peril_correlation_group start from 0. rec = recs[rec_idx_ptr[damagecdf_i]:rec_idx_ptr[damagecdf_i + 1]] prob_to = rec['prob_to'] bin_mean = rec['bin_mean'] @@ -351,12 +368,17 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, if sample_size > 0: if correlated: - # TODO: pass these variables in - rndms = get_corr_rval( + item_corr_data = corr_data_by_item_id[item['item_id']] + peril_correlation_group = item_corr_data['peril_correlation_group'] + rho = item_corr_data['correlation_value'] + + get_corr_rval( eps_ij[peril_correlation_group], rndms_base[rng_index], rho, arr_min, arr_max, arr_N, norm_inv_cdf, - arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, sample_size + arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, sample_size, z_unif ) + rndms = z_unif + else: rndms = rndms_base[rng_index] diff --git a/oasislmf/pytools/gul/random.py b/oasislmf/pytools/gul/random.py index 44eaa39826..b990064a5b 100644 --- a/oasislmf/pytools/gul/random.py +++ b/oasislmf/pytools/gul/random.py @@ -80,7 +80,7 @@ def get_random_generator(random_generator): @njit(cache=True, fastmath=True) -def generate_correlated_hash_vector(peril_correlation_groups, event_id, base_seed=0): +def generate_correlated_hash_vector(unique_peril_correlation_groups, event_id, base_seed=0): """Generate hash for an `event_id`. Args: @@ -90,13 +90,14 @@ def generate_correlated_hash_vector(peril_correlation_groups, event_id, base_see Returns: int64: hash """ - Nperil_correlation_groups = peril_correlation_groups.shape[0] - correlated_hashes = np.zeros(Nperil_correlation_groups, dtype='int64') + Nperil_correlation_groups = unique_peril_correlation_groups.shape[0] + correlated_hashes = np.empty(Nperil_correlation_groups + 1, dtype='int64') + correlated_hashes[0] = 0 - for i in range(Nperil_correlation_groups): # why start from 1?? 
+ for i in range(1, Nperil_correlation_groups + 1): correlated_hashes[i] = ( base_seed + - (peril_correlation_groups[i] * PERIL_CORRELATION_GROUP_HASH) % HASH_MOD_CODE + + (unique_peril_correlation_groups[i - 1] * PERIL_CORRELATION_GROUP_HASH) % HASH_MOD_CODE + (event_id * EVENT_ID_HASH_CODE) % HASH_MOD_CODE ) % HASH_MOD_CODE @@ -117,11 +118,11 @@ def get_norm_cdf_cell_nb(x, arr_min, arr_max, arr_N): @njit(cache=True, fastmath=True) -def get_corr_rval(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, Nsamples, z_unif): +def get_corr_rval(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, + arr_max_cdf, arr_N_cdf, norm_cdf, Nsamples, z_unif): sqrt_rho = sqrt(rho) sqrt_1_minus_rho = sqrt(1. - rho) - z_unif = np.zeros(x_unif.shape[0], dtype='float64') for i in range(Nsamples): x_norm = norm_inv_cdf[get_norm_cdf_cell_nb(x_unif[i], arr_min, arr_max, arr_N)] @@ -130,11 +131,9 @@ def get_corr_rval(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, ar z_unif[i] = norm_cdf[get_norm_cdf_cell_nb(z_norm, arr_min_cdf, arr_max_cdf, arr_N_cdf)] - return z_unif - @njit(cache=True, fastmath=True) -def random_MersenneTwister(seeds, n): +def random_MersenneTwister(seeds, n, skip_seeds=0): """Generate random numbers using the default Mersenne Twister algorithm. Args: @@ -150,9 +149,9 @@ def random_MersenneTwister(seeds, n): Nseeds = len(seeds) rndms = np.zeros((Nseeds, n), dtype='float64') - for seed_i, seed in enumerate(seeds): + for seed_i in range(skip_seeds, Nseeds, 1): # set the seed - np.random.seed(seed) + np.random.seed(seeds[seed_i]) # draw the random numbers for j in range(n): @@ -163,7 +162,7 @@ def random_MersenneTwister(seeds, n): @njit(cache=True, fastmath=True) -def random_LatinHypercube(seeds, n): +def random_LatinHypercube(seeds, n, skip_seeds=0): """Generate random numbers using the Latin Hypercube algorithm. Args: @@ -188,9 +187,9 @@ def random_LatinHypercube(seeds, n): samples = np.zeros(n, dtype='float64') perms = np.zeros(n, dtype='float64') - for seed_i, seed in enumerate(seeds): + for seed_i in range(skip_seeds, Nseeds, 1): # set the seed - np.random.seed(seed) + np.random.seed(seeds[seed_i]) # draw the random numbers and re-generate permutations array for i in range(n): From 5417f9b416cf1697c2a1663fd5036677879fa586 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Fri, 15 Jul 2022 17:31:43 +0100 Subject: [PATCH 06/33] minor cleanup --- oasislmf/pytools/gul/io.py | 3 +-- oasislmf/pytools/gul/manager.py | 5 ----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/oasislmf/pytools/gul/io.py b/oasislmf/pytools/gul/io.py index 6fb8ddcc7e..2b4f1343b7 100644 --- a/oasislmf/pytools/gul/io.py +++ b/oasislmf/pytools/gul/io.py @@ -94,7 +94,7 @@ def read_getmodel_stream(stream_in, item_map, coverages, compute, seeds, valid_a if valid_area_peril_id is not None: valid_area_peril_dict = gen_valid_area_peril(valid_area_peril_id) else: - valid_area_peril_dict=None + valid_area_peril_dict = None # init data structures group_id_rng_index, rec_idx_ptr = gen_structs() @@ -267,7 +267,6 @@ def stream_to_data(int32_mv, valid_buf, size_cdf_entry, last_event_id, item_map, this_rng_index = rng_index rng_index += 1 - # TODO Q for Stephane: should we create a hash for one group_id, for all peril correlation groups by default? 
else: this_rng_index = group_id_rng_index[group_id] diff --git a/oasislmf/pytools/gul/manager.py b/oasislmf/pytools/gul/manager.py index b06ff1f926..489381df05 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -2,7 +2,6 @@ This file is the entry point for the gul command for the package. """ -from random import sample import sys import os from select import select @@ -155,7 +154,6 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu input_path = os.path.join(run_dir, 'input') ignore_file_type = set(ignore_file_type) - damage_bins = get_damage_bins(static_path) # read coverages from file @@ -170,7 +168,6 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu else: valid_area_peril_id = None - # init the structure for computation # coverages are numbered from 1, therefore we skip element 0 in `coverages` coverages = np.zeros(coverages_tiv.shape[0] + 1, coverage_type) @@ -392,8 +389,6 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, else: rndms = rndms_base[rng_index] - # this can be optimized by caching rndms with a dict - if debug: for sample_idx in range(1, sample_size + 1): rval = rndms[sample_idx - 1] From 5ad10f9792ae855efe3a5409f8dcc2bc200fe4b6 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Fri, 22 Jul 2022 15:52:43 +0100 Subject: [PATCH 07/33] [gulpy] Update docstrings for random module functions --- oasislmf/pytools/gul/random.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/oasislmf/pytools/gul/random.py b/oasislmf/pytools/gul/random.py index b990064a5b..388626cb25 100644 --- a/oasislmf/pytools/gul/random.py +++ b/oasislmf/pytools/gul/random.py @@ -81,14 +81,15 @@ def get_random_generator(random_generator): @njit(cache=True, fastmath=True) def generate_correlated_hash_vector(unique_peril_correlation_groups, event_id, base_seed=0): - """Generate hash for an `event_id`. + """Generate hashes for all peril correlation groups for a given `event_id`. Args: + unique_peril_correlation_groups (List[int]): list of the unique peril correlation groups. event_id (int): event id. base_seed (int, optional): base random seed. Defaults to 0. Returns: - int64: hash + List[int64]: hashes """ Nperil_correlation_groups = unique_peril_correlation_groups.shape[0] correlated_hashes = np.empty(Nperil_correlation_groups + 1, dtype='int64') @@ -139,6 +140,10 @@ def random_MersenneTwister(seeds, n, skip_seeds=0): Args: seeds (List[int64]): List of seeds. n (int): number of random samples to generate for each seed. + skip_seeds (int): number of seeds to skip starting from the beginning + of the `seeds` array. For skipped seeds no random numbers are generated + and the output rndms will contain zeros at their corresponding row. + Default is 0, i.e. no seeds are skipped. Returns: rndms (array[float]): 2-d array of shape (number of seeds, n) @@ -174,6 +179,10 @@ def random_LatinHypercube(seeds, n, skip_seeds=0): containing the random values generated for each seed. rndms_idx (Dict[int64, int]): mapping between `seed` and the row in rndms that stores the corresponding random values. + skip_seeds (int): number of seeds to skip starting from the beginning + of the `seeds` array. For skipped seeds no random numbers are generated + and the output rndms will contain zeros at their corresponding row. + Default is 0, i.e. no seeds are skipped. Notes: Implementation follows scipy.stats.qmc.LatinHypercube v1.8.0. 
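The seed hashing whose docstring is updated above in `generate_correlated_hash_vector` can be restated in plain Python as below; the constants are the ones defined at the top of random.py, while the helper name `correlated_seed` is illustrative and not part of the module:

    EVENT_ID_HASH_CODE = 1943272559
    PERIL_CORRELATION_GROUP_HASH = 1836311903
    HASH_MOD_CODE = 2147483648  # 2 ** 31

    def correlated_seed(peril_correlation_group, event_id, base_seed=0):
        # one deterministic seed per (event_id, peril correlation group) pair
        return (base_seed
                + (peril_correlation_group * PERIL_CORRELATION_GROUP_HASH) % HASH_MOD_CODE
                + (event_id * EVENT_ID_HASH_CODE) % HASH_MOD_CODE) % HASH_MOD_CODE

    # e.g. seeds for peril correlation groups 1..3 of event 42
    print([correlated_seed(group, event_id=42) for group in (1, 2, 3)])

In the patches above, element 0 of the returned hash vector is a dummy entry and `generate_rndm` is called with `skip_seeds=1`, so the resulting `eps_ij` array can be indexed directly with the 1-based `peril_correlation_group` value.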
From 2066aa25a3e788cc7ab47c6b6ad8f8eb24cda74d Mon Sep 17 00:00:00 2001 From: mtazzari Date: Tue, 2 Aug 2022 11:05:50 +0100 Subject: [PATCH 08/33] [gulpy] remove unused generate_correlated_hash --- oasislmf/pytools/gul/random.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/oasislmf/pytools/gul/random.py b/oasislmf/pytools/gul/random.py index 388626cb25..2310bbdd14 100644 --- a/oasislmf/pytools/gul/random.py +++ b/oasislmf/pytools/gul/random.py @@ -35,22 +35,6 @@ def generate_hash(group_id, event_id, base_seed=0): return hash -@njit(cache=True, fastmath=True) -def generate_correlated_hash(event_id, base_seed=0): - """Generate hash for an `event_id`. - - Args: - event_id (int): event id. - base_seed (int, optional): base random seed. Defaults to 0. - - Returns: - int64: hash - """ - hash = (base_seed + (event_id * EVENT_ID_HASH_CODE) % HASH_MOD_CODE) % HASH_MOD_CODE - - return hash - - def get_random_generator(random_generator): """Get the random generator function. From f0311c0b37b980913245ba06ea0bdf261eece736 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Tue, 2 Aug 2022 11:57:23 +0100 Subject: [PATCH 09/33] [gulpy] introduce --ignore-correlation flag --- oasislmf/pytools/gul/manager.py | 32 ++++++++++++++++++++++---------- oasislmf/pytools/gulpy.py | 7 +++---- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/oasislmf/pytools/gul/manager.py b/oasislmf/pytools/gul/manager.py index 489381df05..ce4c0eeaca 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -125,7 +125,7 @@ def generate_item_map(items, coverages): def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debug, - random_generator, peril_filter=[], file_in=None, file_out=None, correlated=False, **kwargs): + random_generator, peril_filter=[], file_in=None, file_out=None, ignore_correlation=False, **kwargs): """Execute the main gulpy worklow. Args: @@ -139,6 +139,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu random_generator (int): random generator function id. file_in (str, optional): filename of input stream. Defaults to None. file_out (str, optional): filename of output stream. Defaults to None. + ignore_correlation (bool): if True, do not compute correlated random samples. Raises: ValueError: if alloc_rule is not 0, 1, or 2. 
@@ -224,11 +225,21 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # create the array to store the seeds seeds = np.zeros(len(np.unique(items['group_id'])), dtype=Item.dtype['group_id']) - logger.info(f"Correlated random number generation: switched {'ON' if correlated else 'OFF'}") - if correlated: - file_path = os.path.join(input_path, 'correlations.bin') - data = CorrelationsData.from_bin(file_path=file_path).data - Nperil_correlation_groups = len(data) + file_path = os.path.join(input_path, 'correlations.bin') + data = CorrelationsData.from_bin(file_path=file_path).data + Nperil_correlation_groups = len(data) + logger.info(f"Detected {Nperil_correlation_groups} peril correlation groups.") + + if Nperil_correlation_groups == 0: + ignore_correlation = True + logger.info(f"Correlated random number generation: switched OFF because 0 peril correlation groups were detected.") + + else: + if ignore_correlation: + logger.info(f"Correlated random number generation: switched OFF because --ignore-correlation is True.") + + if not ignore_correlation: + logger.info(f"Correlated random number generation: switched ON.") corr_data_by_item_id = np.ndarray(Nperil_correlation_groups + 1, dtype=Correlation) corr_data_by_item_id[0] = (0, 0.) @@ -273,7 +284,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # to generate the correlated part, we do the hashing here for now (instead of in stream_to_data) # generate the correlated samples for the whole event, for all peril correlation groups - if correlated: + if not ignore_correlation: corr_seeds = generate_correlated_hash_vector(unique_peril_correlation_groups, event_id) eps_ij = generate_rndm(corr_seeds, sample_size, skip_seeds=1) @@ -287,7 +298,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu cursor, cursor_bytes, last_processed_coverage_ids_idx = compute_event_losses( event_id, coverages, compute[:compute_i], items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, - damage_bins, loss_threshold, losses_buffer, alloc_rule, correlated, rndms_base, eps_ij, corr_data_by_item_id, + damage_bins, loss_threshold, losses_buffer, alloc_rule, ignore_correlation, rndms_base, eps_ij, corr_data_by_item_id, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, z_unif, debug, GULPY_STREAM_BUFF_SIZE_WRITE, int32_mv_write, cursor ) @@ -304,7 +315,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu @njit(cache=True, fastmath=True) def compute_event_losses(event_id, coverages, coverage_ids, items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, damage_bins, - loss_threshold, losses, alloc_rule, correlated, rndms_base, eps_ij, corr_data_by_item_id, + loss_threshold, losses, alloc_rule, ignore_correlation, rndms_base, eps_ij, corr_data_by_item_id, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, z_unif, debug, buff_size, int32_mv, cursor): """Compute losses for an event. @@ -323,6 +334,7 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, loss_threshold (float): threshold above which losses are printed to the output stream. losses (numpy.array[oasis_float]): array (to be re-used) to store losses for all item_ids. alloc_rule (int): back-allocation rule. + ignore_correlation (bool): if True, do not compute correlated random samples. 
rndms (numpy.array[float64]): 2d array of shape (number of seeds, sample_size) storing the random values drawn for each seed. debug (bool): if True, for each random sample, print to the output stream the random value @@ -374,7 +386,7 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, losses[MEAN_IDX, item_i] = gul_mean if sample_size > 0: - if correlated: + if not ignore_correlation: item_corr_data = corr_data_by_item_id[item['item_id']] peril_correlation_group = item_corr_data['peril_correlation_group'] rho = item_corr_data['correlation_value'] diff --git a/oasislmf/pytools/gulpy.py b/oasislmf/pytools/gulpy.py index 2d52bc586f..f9c961ffda 100644 --- a/oasislmf/pytools/gulpy.py +++ b/oasislmf/pytools/gulpy.py @@ -12,10 +12,9 @@ ) parser.add_argument('-a', help='back-allocation rule', default=0, type=int, dest='alloc_rule') -parser.add_argument('-c', '--correlated', - help='[EXPERIMENTAL] if passed, uses peril correlation groups to produce ' - 'correlated samples for items within the same peril correlation group', - action='store_true', dest='correlated', default=False) +parser.add_argument('--ignore-correlation', + help='if passed, peril correlation groups (if defined) are ignored for the generation of correlated samples', + action='store_true', dest='ignore_correlation', default=False) parser.add_argument('-d', help='output random numbers instead of gul (default: False).', default=False, action='store_true', dest='debug') parser.add_argument('-i', '--file-in', help='filename of input stream.', action='store', type=str, dest='file_in') From 1709cee8069ec760eb36dcc5c6e0a7e6a06144c4 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Wed, 3 Aug 2022 14:34:59 +0100 Subject: [PATCH 10/33] set hashed_group_id to True by default, cleanup --- oasislmf/computation/generate/files.py | 6 +++--- oasislmf/preparation/gul_inputs.py | 12 +++--------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index 6e9930d6a8..79d8620365 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -104,7 +104,8 @@ class GenerateFiles(ComputationStep): {'name': 'disable_summarise_exposure', 'flag':'-S', 'default': False, 'type': str2bool, 'const':True, 'nargs':'?', 'help': 'Disables creation of an exposure summary report'}, {'name': 'group_id_cols', 'flag':'-G', 'nargs':'+', 'help': 'Columns from loc file to set group_id', 'default': GROUP_ID_COLS}, {'name': 'lookup_multiprocessing', 'type': str2bool, 'const': False, 'nargs':'?', 'default': False, 'help': 'Flag to enable/disable lookup multiprocessing'}, - {"name": "hashed_group_id", "type": str2bool, "const": False, 'nargs':'?', "default": False, "help": "Hashes the group_id in the items.bin"}, + {"name": "hashed_group_id", "type": str2bool, "const": False, 'nargs': '?', + "default": True, "help": "Hashes the group_id in the items.bin"}, # Manager only options (pass data directy instead of filepaths) {'name': 'lookup_config'}, @@ -232,7 +233,7 @@ def run(self): keys_df, exposure_profile=location_profile, group_id_cols=group_id_cols, - hash_group_ids=self.hashed_group_id, + hashed_group_id=self.hashed_group_id, ) if self.model_settings_json is not None: @@ -267,7 +268,6 @@ def run(self): output_dir=self._get_output_dir(), oasis_files_prefixes=files_prefixes['gul'], chunksize=self.write_chunksize, - hashed_item_id=self.hashed_group_id ) gul_summary_mapping = get_summary_mapping(gul_inputs_df, oed_hierarchy) diff --git 
a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index a90421a9cc..87efd900bb 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -51,7 +51,7 @@ def get_gul_input_items( keys_df, exposure_profile=get_default_exposure_profile(), group_id_cols=['loc_id'], - hash_group_ids=False + hashed_group_id=True ): """ Generates and returns a Pandas dataframe of GUL input items. @@ -317,7 +317,7 @@ def get_gul_input_items( if correlation_check is True: gul_inputs_df['group_id'] = gul_inputs_df[correlation_group_id] - elif hash_group_ids is False: + elif hashed_group_id is False: if len(group_id_cols) > 1: gul_inputs_df['group_id'] = factorize_ndarray( @@ -331,7 +331,7 @@ def get_gul_input_items( gul_inputs_df[group_id_cols[0]].values )[0] - # this block gets fired if the hash_group_ids is True + # this block gets fired if the hashed_group_id is True else: gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[group_id_cols]).to_numpy() >> 33) @@ -446,7 +446,6 @@ def write_gul_input_files( output_dir, oasis_files_prefixes=copy.deepcopy(OASIS_FILES_PREFIXES['gul']), chunksize=(2 * 10 ** 5), - hashed_item_id=False ): """ Writes the standard Oasis GUL input files to a target directory, using a @@ -504,9 +503,4 @@ def write_gul_input_files( for fn in gul_input_files: getattr(this_module, 'write_{}_file'.format(fn))(gul_inputs_df.copy(deep=True), gul_input_files[fn], chunksize) - # if hashed_item_id is True: - # input_file = gul_input_files["items"] - # input_directory = "/".join(input_file.split("/")[:-1]) + "/" - # convert_item_csv_to_hash(input_directory=input_directory) - return gul_input_files From 2222be2cb930e09ced342c77fe7d3905533f76be Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Mon, 8 Aug 2022 14:11:40 +0100 Subject: [PATCH 11/33] adding haahing patch --- oasislmf/computation/generate/files.py | 2 +- oasislmf/preparation/gul_inputs.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index 6e9930d6a8..bdd55ae0d2 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -104,7 +104,7 @@ class GenerateFiles(ComputationStep): {'name': 'disable_summarise_exposure', 'flag':'-S', 'default': False, 'type': str2bool, 'const':True, 'nargs':'?', 'help': 'Disables creation of an exposure summary report'}, {'name': 'group_id_cols', 'flag':'-G', 'nargs':'+', 'help': 'Columns from loc file to set group_id', 'default': GROUP_ID_COLS}, {'name': 'lookup_multiprocessing', 'type': str2bool, 'const': False, 'nargs':'?', 'default': False, 'help': 'Flag to enable/disable lookup multiprocessing'}, - {"name": "hashed_group_id", "type": str2bool, "const": False, 'nargs':'?', "default": False, "help": "Hashes the group_id in the items.bin"}, + {"name": "hashed_group_id", "type": str2bool, "const": True, 'nargs':'?', "default": True, "help": "Hashes the group_id in the items.bin"}, # Manager only options (pass data directy instead of filepaths) {'name': 'lookup_config'}, diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index a90421a9cc..230dc03cf1 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -333,7 +333,10 @@ def get_gul_input_items( # this block gets fired if the hash_group_ids is True else: - gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[group_id_cols]).to_numpy() >> 33) + para_chain = 
gul_inputs_df.drop_duplicates(subset=group_id_cols).reset_index(drop=True) + para_chain["group_id"] = pd.util.hash_pandas_object(para_chain[group_id_cols]) + para_chain = para_chain[group_id_cols + ["group_id"]] + gul_inputs_df = pd.merge(gul_inputs_df, para_chain, how='left', left_on=group_id_cols, right_on=group_id_cols) gul_inputs_df['group_id'] = gul_inputs_df['group_id'].astype('uint32') From 2c0d5e3c230d81923d1c0463f56593a6b8a93dd9 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Mon, 8 Aug 2022 14:14:30 +0100 Subject: [PATCH 12/33] adding haahing patch --- oasislmf/computation/generate/files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index bdd55ae0d2..6e9930d6a8 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -104,7 +104,7 @@ class GenerateFiles(ComputationStep): {'name': 'disable_summarise_exposure', 'flag':'-S', 'default': False, 'type': str2bool, 'const':True, 'nargs':'?', 'help': 'Disables creation of an exposure summary report'}, {'name': 'group_id_cols', 'flag':'-G', 'nargs':'+', 'help': 'Columns from loc file to set group_id', 'default': GROUP_ID_COLS}, {'name': 'lookup_multiprocessing', 'type': str2bool, 'const': False, 'nargs':'?', 'default': False, 'help': 'Flag to enable/disable lookup multiprocessing'}, - {"name": "hashed_group_id", "type": str2bool, "const": True, 'nargs':'?', "default": True, "help": "Hashes the group_id in the items.bin"}, + {"name": "hashed_group_id", "type": str2bool, "const": False, 'nargs':'?', "default": False, "help": "Hashes the group_id in the items.bin"}, # Manager only options (pass data directy instead of filepaths) {'name': 'lookup_config'}, From bb0585863d8e1a54f0513913ba0e5fde40fc03a2 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Thu, 11 Aug 2022 15:34:26 +0100 Subject: [PATCH 13/33] [gulpy] minor cleanup files.py parameter on same line --- oasislmf/computation/generate/files.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index 79d8620365..0803074821 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -104,8 +104,7 @@ class GenerateFiles(ComputationStep): {'name': 'disable_summarise_exposure', 'flag':'-S', 'default': False, 'type': str2bool, 'const':True, 'nargs':'?', 'help': 'Disables creation of an exposure summary report'}, {'name': 'group_id_cols', 'flag':'-G', 'nargs':'+', 'help': 'Columns from loc file to set group_id', 'default': GROUP_ID_COLS}, {'name': 'lookup_multiprocessing', 'type': str2bool, 'const': False, 'nargs':'?', 'default': False, 'help': 'Flag to enable/disable lookup multiprocessing'}, - {"name": "hashed_group_id", "type": str2bool, "const": False, 'nargs': '?', - "default": True, "help": "Hashes the group_id in the items.bin"}, + {"name": "hashed_group_id", 'type': str2bool, 'const': False, 'nargs': '?', 'default': True, 'help': "Hashes the group_id in the items.bin"}, # Manager only options (pass data directy instead of filepaths) {'name': 'lookup_config'}, From e593f0ca0d60757f88c1fe7c757629ca430fed40 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Thu, 11 Aug 2022 16:20:39 +0100 Subject: [PATCH 14/33] [gulpy] run correlation only if rho>0 --- oasislmf/pytools/gul/manager.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/oasislmf/pytools/gul/manager.py 
b/oasislmf/pytools/gul/manager.py index 14102b60c8..b40e754798 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -389,15 +389,20 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, if sample_size > 0: if not ignore_correlation: item_corr_data = corr_data_by_item_id[item['item_id']] - peril_correlation_group = item_corr_data['peril_correlation_group'] rho = item_corr_data['correlation_value'] - get_corr_rval( - eps_ij[peril_correlation_group], rndms_base[rng_index], - rho, arr_min, arr_max, arr_N, norm_inv_cdf, - arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, sample_size, z_unif - ) - rndms = z_unif + if rho > 0: + peril_correlation_group = item_corr_data['peril_correlation_group'] + + get_corr_rval( + eps_ij[peril_correlation_group], rndms_base[rng_index], + rho, arr_min, arr_max, arr_N, norm_inv_cdf, + arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, sample_size, z_unif + ) + rndms = z_unif + + else: + rndms = rndms_base[rng_index] else: rndms = rndms_base[rng_index] From fbf1689e19ba887352e2c0f41d15b490d8e0b6a9 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Thu, 11 Aug 2022 16:40:58 +0100 Subject: [PATCH 15/33] updating hashing --- oasislmf/preparation/gul_inputs.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 230dc03cf1..765bae9410 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -279,7 +279,7 @@ def get_gul_input_items( # Concatenate chunks. Sort by index to preserve item_id order in generated outputs compared # to original code. - gul_inputs_df = pd.concat(gul_inputs_reformatted_chunks).sort_index().reset_index() + gul_inputs_df = pd.concat(gul_inputs_reformatted_chunks).sort_index().reset_index(drop=True, inplace=True) # Set default values and data types for BI coverage boolean, TIV, deductibles and limit dtypes = { **{t: 'uint8' for t in term_cols_ints + terms_ints}, @@ -333,10 +333,7 @@ def get_gul_input_items( # this block gets fired if the hash_group_ids is True else: - para_chain = gul_inputs_df.drop_duplicates(subset=group_id_cols).reset_index(drop=True) - para_chain["group_id"] = pd.util.hash_pandas_object(para_chain[group_id_cols]) - para_chain = para_chain[group_id_cols + ["group_id"]] - gul_inputs_df = pd.merge(gul_inputs_df, para_chain, how='left', left_on=group_id_cols, right_on=group_id_cols) + gul_inputs_df["group_id"] = pd.util.hash_pandas_object(gul_inputs_df[group_id_cols]) gul_inputs_df['group_id'] = gul_inputs_df['group_id'].astype('uint32') From d6118699ecd3833bbeb98e84080d013116537ab8 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Fri, 12 Aug 2022 11:55:42 +0100 Subject: [PATCH 16/33] [gulpy] improve flow depending on corr definitions --- oasislmf/pytools/gul/manager.py | 35 ++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/oasislmf/pytools/gul/manager.py b/oasislmf/pytools/gul/manager.py index b40e754798..445fdd720d 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -226,20 +226,23 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # create the array to store the seeds seeds = np.zeros(len(np.unique(items['group_id'])), dtype=Item.dtype['group_id']) - file_path = os.path.join(input_path, 'correlations.bin') - data = CorrelationsData.from_bin(file_path=file_path).data - Nperil_correlation_groups = len(data) - logger.info(f"Detected 
{Nperil_correlation_groups} peril correlation groups.") - - if Nperil_correlation_groups == 0: - ignore_correlation = True - logger.info(f"Correlated random number generation: switched OFF because 0 peril correlation groups were detected.") + do_correlation = False + if ignore_correlation: + logger.info(f"Correlated random number generation: switched OFF because --ignore-correlation is True.") else: - if ignore_correlation: - logger.info(f"Correlated random number generation: switched OFF because --ignore-correlation is True.") + file_path = os.path.join(input_path, 'correlations.bin') + data = CorrelationsData.from_bin(file_path=file_path).data + Nperil_correlation_groups = len(data) + logger.info(f"Detected {Nperil_correlation_groups} peril correlation groups.") + + if Nperil_correlation_groups > 0 and any(data['correlation_value'] > 0): + do_correlation = True + else: + logger.info(f"Correlated random number generation: switched OFF because 0 peril correlation groups were detected or " + "the correlation value is zero for all peril correlation groups.") - if not ignore_correlation: + if do_correlation: logger.info(f"Correlated random number generation: switched ON.") corr_data_by_item_id = np.ndarray(Nperil_correlation_groups + 1, dtype=Correlation) @@ -285,7 +288,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # to generate the correlated part, we do the hashing here for now (instead of in stream_to_data) # generate the correlated samples for the whole event, for all peril correlation groups - if not ignore_correlation: + if do_correlation: corr_seeds = generate_correlated_hash_vector(unique_peril_correlation_groups, event_id) eps_ij = generate_rndm(corr_seeds, sample_size, skip_seeds=1) @@ -299,7 +302,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu cursor, cursor_bytes, last_processed_coverage_ids_idx = compute_event_losses( event_id, coverages, compute[:compute_i], items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, - damage_bins, loss_threshold, losses_buffer, alloc_rule, ignore_correlation, rndms_base, eps_ij, corr_data_by_item_id, + damage_bins, loss_threshold, losses_buffer, alloc_rule, do_correlation, rndms_base, eps_ij, corr_data_by_item_id, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, z_unif, debug, GULPY_STREAM_BUFF_SIZE_WRITE, int32_mv_write, cursor ) @@ -316,7 +319,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu @njit(cache=True, fastmath=True) def compute_event_losses(event_id, coverages, coverage_ids, items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, damage_bins, - loss_threshold, losses, alloc_rule, ignore_correlation, rndms_base, eps_ij, corr_data_by_item_id, + loss_threshold, losses, alloc_rule, do_correlation, rndms_base, eps_ij, corr_data_by_item_id, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, z_unif, debug, buff_size, int32_mv, cursor): """Compute losses for an event. @@ -335,7 +338,7 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, loss_threshold (float): threshold above which losses are printed to the output stream. losses (numpy.array[oasis_float]): array (to be re-used) to store losses for all item_ids. alloc_rule (int): back-allocation rule. - ignore_correlation (bool): if True, do not compute correlated random samples. + do_correlation (bool): if True, compute correlated random samples. 
rndms (numpy.array[float64]): 2d array of shape (number of seeds, sample_size) storing the random values drawn for each seed. debug (bool): if True, for each random sample, print to the output stream the random value @@ -387,7 +390,7 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, losses[MEAN_IDX, item_i] = gul_mean if sample_size > 0: - if not ignore_correlation: + if do_correlation: item_corr_data = corr_data_by_item_id[item['item_id']] rho = item_corr_data['correlation_value'] From 82fb79c01861ef77dedbf5f60c9712c14bdbb2d5 Mon Sep 17 00:00:00 2001 From: sambles Date: Fri, 12 Aug 2022 14:25:41 +0100 Subject: [PATCH 17/33] Disable GroupID hashing for acceptance tests (#1094) * Update expected acceptance tests * Revert "Update expected acceptance tests" This reverts commit ad0907fd286387f811b00537b3c59680ce13c7d4. * Default "hashed_group_id" to false in exposure run * Move hashed_group_id=F default from "RunExposure" to "RunFmTest" * Fix/pip compile (#1097) * Only install pip-tools before pip-compile * Try pinning flake8 * Revert "Try pinning flake8" This reverts commit d845d5b2051ed7aa79a3282be000153b52966cad. * Try pinning virtualenv * add --upgrade to pip install pip-tools * Fix test_get_dataframe__from_csv_file__set_col_defaults_option_and_use_defaults_ and run with falsifying example * Remove falsifying example Co-authored-by: Marco Tazzari <6020226+mtazzari@users.noreply.github.com> --- .github/workflows/oasislmf-unittest.yml | 3 +-- oasislmf/computation/run/exposure.py | 6 +++++- tests/fm/test_fm.py | 2 ++ tests/fm/test_fmpy.py | 4 +++- tests/utils/test_data.py | 3 +++ 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/.github/workflows/oasislmf-unittest.yml b/.github/workflows/oasislmf-unittest.yml index 9c22e4a4d9..bec0a7bdc9 100644 --- a/.github/workflows/oasislmf-unittest.yml +++ b/.github/workflows/oasislmf-unittest.yml @@ -35,8 +35,7 @@ jobs: - name: Install pip-tools run: | python -m pip install --upgrade pip - pip install pip-tools - + pip install --upgrade pip-tools - name: Pip Compile run: | rm -f requirements.txt diff --git a/oasislmf/computation/run/exposure.py b/oasislmf/computation/run/exposure.py index a332851b84..82f322095e 100644 --- a/oasislmf/computation/run/exposure.py +++ b/oasislmf/computation/run/exposure.py @@ -54,6 +54,7 @@ class RunExposure(ComputationStep): {'name': 'fmpy_low_memory', 'default': False, 'type': str2bool, 'const':True, 'nargs':'?', 'help': 'use memory map instead of RAM to store loss array (may decrease performance but reduce RAM usage drastically)'}, {'name': 'fmpy_sort_output', 'default': True, 'type': str2bool, 'const': True, 'nargs': '?', 'help': 'order fmpy output by item_id'}, {'name': 'stream_type', 'flag':'-t', 'default': 2, 'type':int, 'help': 'Set the IL input stream type, 2 = default loss stream, 1 = deprecated cov/item stream'}, + {"name": "hashed_group_id", "default": True, "type": str2bool, "const": False, 'nargs': '?', "help": "Hashes the group_id in the items.bin"}, {'name': 'net_ri', 'default': True}, {'name': 'include_loss_factor', 'default': True}, {'name': 'print_summary', 'default': True}, @@ -120,6 +121,7 @@ def run(self): oed_info_csv=ri_info_fp, oed_scope_csv=ri_scope_fp, keys_data_csv=keys_fp, + hashed_group_id=self.hashed_group_id, ).run() # 3. 
Run Deterministic Losses @@ -312,6 +314,7 @@ class RunFmTest(ComputationStep): {'name': 'fmpy_low_memory', 'default': False, 'type': str2bool, 'const': True, 'nargs': '?', 'help': 'use memory map instead of RAM to store loss array (may decrease performance but reduce RAM usage drastically)'}, {'name': 'fmpy_sort_output', 'default': True, 'type': str2bool, 'const': True, 'nargs': '?', 'help': 'order fmpy output by item_id'}, {'name': 'update_expected', 'default': False}, + {'name': 'hashed_group_id', 'default': False}, {'name': 'expected_output_dir', 'default': "expected"}, ] @@ -408,7 +411,8 @@ def execute_test_case(self, test_case): num_subperils=self.num_subperils, fmpy=self.fmpy, fmpy_low_memory=self.fmpy_low_memory, - fmpy_sort_output=self.fmpy_sort_output + fmpy_sort_output=self.fmpy_sort_output, + hashed_group_id=self.hashed_group_id, ).run() expected_data_dir = os.path.join(test_dir, self.expected_output_dir) diff --git a/tests/fm/test_fm.py b/tests/fm/test_fm.py index 82133c94e3..b9bb672fa0 100644 --- a/tests/fm/test_fm.py +++ b/tests/fm/test_fm.py @@ -15,6 +15,7 @@ def setUp(self): self.test_cases_fp = os.path.join(sys.path[0], 'validation') self.update_expected = False self.keep_output = True + self.hashed_group_id = False def run_test(self, test_case, fmpy=False, subperils=1, expected_dir="expected"): with tempfile.TemporaryDirectory() as tmp_run_dir: @@ -35,6 +36,7 @@ def run_test(self, test_case, fmpy=False, subperils=1, expected_dir="expected"): num_subperils=subperils, test_tolerance=0.001, expected_output_dir=expected_dir, + hashed_group_id=self.hashed_group_id, ) self.assertTrue(result) diff --git a/tests/fm/test_fmpy.py b/tests/fm/test_fmpy.py index 077b342a28..c10ff17f8d 100644 --- a/tests/fm/test_fmpy.py +++ b/tests/fm/test_fmpy.py @@ -15,6 +15,7 @@ def setUp(self): self.test_cases_fp = os.path.join(sys.path[0], 'validation') self.update_expected = False self.keep_output = True + self.hashed_group_id = False def run_test(self, test_case, fmpy=False, subperils=1, expected_dir="expected"): with tempfile.TemporaryDirectory() as tmp_run_dir: @@ -37,6 +38,7 @@ def run_test(self, test_case, fmpy=False, subperils=1, expected_dir="expected"): num_subperils=subperils, test_tolerance=0.001, expected_output_dir=expected_dir, + hashed_group_id=self.hashed_group_id, ) self.assertTrue(result) @@ -91,4 +93,4 @@ def test_issues_2_subperils(self): self.run_test('issues', fmpy=True, subperils=2, expected_dir="expected_subperils") def test_insurance_policy_coverage_2_subperils(self): - self.run_test('insurance_policy_coverage',fmpy=True, subperils=2, expected_dir="expected_subperils") \ No newline at end of file + self.run_test('insurance_policy_coverage',fmpy=True, subperils=2, expected_dir="expected_subperils") diff --git a/tests/utils/test_data.py b/tests/utils/test_data.py index be99f55edf..73f52b37bf 100644 --- a/tests/utils/test_data.py +++ b/tests/utils/test_data.py @@ -39,6 +39,7 @@ get_timestamp, get_utctimestamp, get_location_df, + PANDAS_DEFAULT_NULL_VALUES, ) from oasislmf.utils.defaults import ( @@ -541,10 +542,12 @@ def test_get_dataframe__from_csv_file_with_mixed_case_cols__set_col_defaults_opt try: df = pd.DataFrame(data) df.to_csv(path_or_buf=fp, columns=df.columns, encoding='utf-8', index=False) + df['STR_COL'] = df['STR_COL'].map(lambda x: np.nan if x in PANDAS_DEFAULT_NULL_VALUES else x) fp.close() expected = df.copy(deep=True) expected.columns = expected.columns.str.lower() + for col, default in defaults.items(): expected.loc[:, col.lower()].fillna(defaults[col], 
inplace=True) From fee427ee04a1422b8300d92d6002911c55461f5c Mon Sep 17 00:00:00 2001 From: mtazzari Date: Fri, 12 Aug 2022 14:45:46 +0100 Subject: [PATCH 18/33] Update group_id_cols default in get_gul_input_items --- oasislmf/preparation/gul_inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 87efd900bb..7df06b66ab 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -50,7 +50,7 @@ def get_gul_input_items( exposure_df, keys_df, exposure_profile=get_default_exposure_profile(), - group_id_cols=['loc_id'], + group_id_cols=["PortNumber", "AccNumber", "LocNumber"], hashed_group_id=True ): """ From 45f8779f2b58ec38294f39006e19c18693582b77 Mon Sep 17 00:00:00 2001 From: Maxwell Flitton Date: Fri, 12 Aug 2022 15:46:00 +0100 Subject: [PATCH 19/33] Hashing investigation (#1096) * adding haahing patch * adding haahing patch * updating hashing * Update oasislmf/preparation/gul_inputs.py Co-authored-by: Marco Tazzari <6020226+mtazzari@users.noreply.github.com> --- oasislmf/preparation/gul_inputs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 7df06b66ab..279aa9cd7c 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -279,7 +279,7 @@ def get_gul_input_items( # Concatenate chunks. Sort by index to preserve item_id order in generated outputs compared # to original code. - gul_inputs_df = pd.concat(gul_inputs_reformatted_chunks).sort_index().reset_index() + gul_inputs_df = pd.concat(gul_inputs_reformatted_chunks).sort_index().reset_index(drop=True, inplace=True) # Set default values and data types for BI coverage boolean, TIV, deductibles and limit dtypes = { **{t: 'uint8' for t in term_cols_ints + terms_ints}, @@ -333,7 +333,7 @@ def get_gul_input_items( # this block gets fired if the hashed_group_id is True else: - gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[group_id_cols]).to_numpy() >> 33) + gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[group_id_cols], index=False).to_numpy() >> 33) gul_inputs_df['group_id'] = gul_inputs_df['group_id'].astype('uint32') From 461621f3b129d4a0c986dbb7a65384346bdbf8e6 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Fri, 12 Aug 2022 16:43:03 +0100 Subject: [PATCH 20/33] [gul_inputs] bugfix don't modify inplace --- oasislmf/preparation/gul_inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 279aa9cd7c..6a7b69d8ac 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -279,7 +279,7 @@ def get_gul_input_items( # Concatenate chunks. Sort by index to preserve item_id order in generated outputs compared # to original code. 
- gul_inputs_df = pd.concat(gul_inputs_reformatted_chunks).sort_index().reset_index(drop=True, inplace=True) + gul_inputs_df = pd.concat(gul_inputs_reformatted_chunks).sort_index().reset_index(drop=True) # Set default values and data types for BI coverage boolean, TIV, deductibles and limit dtypes = { **{t: 'uint8' for t in term_cols_ints + terms_ints}, From 77245a3301fd1f4749b83cd4c6b6e2d8dc208519 Mon Sep 17 00:00:00 2001 From: Sam Gamble Date: Fri, 12 Aug 2022 18:47:44 +0100 Subject: [PATCH 21/33] Update test_summaries.py to not rely on "loc_id" as default for group_id_cols --- tests/model_preparation/test_summaries.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/model_preparation/test_summaries.py b/tests/model_preparation/test_summaries.py index c19c97a2a4..e128b7510c 100644 --- a/tests/model_preparation/test_summaries.py +++ b/tests/model_preparation/test_summaries.py @@ -133,7 +133,7 @@ def test_single_peril__totals_correct(self, data): ) # Run Gul Proccessing - gul_inputs = get_gul_input_items(loc_df, keys_df) + gul_inputs = get_gul_input_items(loc_df, keys_df, group_id_cols=['loc_id']) gul_inputs = gul_inputs[gul_inputs['status'].isin(OASIS_KEYS_STATUS_MODELLED)] # Fetch expected TIVS @@ -200,7 +200,7 @@ def test_multi_perils__single_covarage(self, data): # Run Summary output check self.assertSummaryIsValid( loc_df, - get_gul_input_items(loc_df, keys_df), + get_gul_input_items(loc_df, keys_df, group_id_cols=['loc_id']), get_exposure_summary(exposure_df=loc_df, keys_df=keys_df), perils_returned ) @@ -244,7 +244,7 @@ def test_multi_perils__multi_covarage(self, data): # Run Summary output check exp_summary = get_exposure_summary(exposure_df=loc_df, keys_df=keys_df) - gul_inputs = get_gul_input_items(loc_df, keys_df) + gul_inputs = get_gul_input_items(loc_df, keys_df, group_id_cols=['loc_id']) self.assertSummaryIsValid( loc_df, gul_inputs, From bf500beed1eef1e06640f809342fb6161da8869c Mon Sep 17 00:00:00 2001 From: sambles Date: Tue, 23 Aug 2022 14:31:59 +0100 Subject: [PATCH 22/33] Always create a correlations.bin, if missing model_settings file is blank (#1101) --- oasislmf/computation/generate/files.py | 12 ++++-------- oasislmf/preparation/correlations.py | 7 +++++-- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index 0803074821..a8ad386b7d 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -234,14 +234,10 @@ def run(self): group_id_cols=group_id_cols, hashed_group_id=self.hashed_group_id, ) - - if self.model_settings_json is not None: - correlation_input_items = get_correlation_input_items( - model_settings_path=self.model_settings_json, - gul_inputs_df=gul_inputs_df - ) - else: - correlation_input_items = None + correlation_input_items = get_correlation_input_items( + model_settings_path=self.model_settings_json, + gul_inputs_df=gul_inputs_df + ) # If not in det. loss gen. 
scenario, write exposure summary file if summarise_exposure: diff --git a/oasislmf/preparation/correlations.py b/oasislmf/preparation/correlations.py index 6a5fed0bc6..bbb42ae5f9 100644 --- a/oasislmf/preparation/correlations.py +++ b/oasislmf/preparation/correlations.py @@ -43,8 +43,11 @@ def get_correlation_input_items(model_settings_path: str, gul_inputs_df: pd.Data Returns: (pd.DataFrame) the mapped data of correlations """ - model_settings_raw_data: dict = get_model_settings(model_settings_fp=model_settings_path) - correlation_map_df = map_data(data=model_settings_raw_data) + if model_settings_path == None: + correlation_map_df = None + else: + model_settings_raw_data: dict = get_model_settings(model_settings_fp=model_settings_path) + correlation_map_df = map_data(data=model_settings_raw_data) if correlation_map_df is not None: gul_inputs_df = gul_inputs_df.merge(correlation_map_df, left_on='peril_id', right_on='id').reset_index() From 2b3d2026f794397dc0b74b0d9312095aeb9f8518 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Mon, 5 Sep 2022 13:06:26 +0100 Subject: [PATCH 23/33] adding peril_correlation_group for valid_oasis_group_cols --- oasislmf/preparation/gul_inputs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 765bae9410..502e63a336 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -153,6 +153,7 @@ def get_gul_input_items( 'peril_id', 'coverage_id', 'coverage_type_id', + 'peril_correlation_group' ] for col in group_id_cols: if col not in list(exposure_df.columns) + valid_oasis_group_cols: From 7d772fb74d7de813ead482c8d2b4930a7f8b4dc2 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Mon, 5 Sep 2022 16:01:39 +0100 Subject: [PATCH 24/33] appending peril_correlation_group to columns if correlations group is present --- oasislmf/computation/generate/files.py | 1 + oasislmf/preparation/gul_inputs.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index a8ad386b7d..28a7e2aa03 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -230,6 +230,7 @@ def run(self): gul_inputs_df = get_gul_input_items( location_df, keys_df, + output_dir=self._get_output_dir(), exposure_profile=location_profile, group_id_cols=group_id_cols, hashed_group_id=self.hashed_group_id, diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index cfd0da489e..6a36df1048 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -49,6 +49,7 @@ def get_gul_input_items( exposure_df, keys_df, + output_dir, exposure_profile=get_default_exposure_profile(), group_id_cols=["PortNumber", "AccNumber", "LocNumber"], hashed_group_id=True @@ -62,6 +63,9 @@ def get_gul_input_items( :param keys_df: Keys dataframe :type keys_df: pandas.DataFrame + :param output_dir: the output directory where input files are stored + :type output_dir: str + :param exposure_profile: Exposure profile :type exposure_profile: dict @@ -160,6 +164,20 @@ def get_gul_input_items( warnings.warn('Column {} not found in loc file, or a valid internal oasis column'.format(col)) group_id_cols.remove(col) + # here we check to see if the correlation file is here, if it is then we need to add the "peril_correlation_group" to the valid_oasis_group_cols + peril_correlation_group = 'peril_correlation_group' + correlations_files = [ + 
f"{output_dir}/correlations.csv", + f"{output_dir}/correlations.bin", + ] + for file_path in correlations_files: + if os.path.exists(path=file_path): + if peril_correlation_group not in group_id_cols: + group_id_cols.append(peril_correlation_group) + break + + + # Should list of column names used to group_id be empty, revert to # default if len(group_id_cols) == 0: From e6ac89e944af08036ca53a6ca7f6e9a71413e250 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Wed, 7 Sep 2022 12:30:41 +0100 Subject: [PATCH 25/33] adding peril_correlation_group column to hashing of group IDs if correlations groups are used and hashing group IDs is done --- oasislmf/computation/generate/files.py | 20 ++++-- oasislmf/preparation/gul_inputs.py | 89 +++++++++++++++++--------- oasislmf/utils/data.py | 22 +++++++ run_test.py | 0 4 files changed, 96 insertions(+), 35 deletions(-) create mode 100644 run_test.py diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index 28a7e2aa03..63beabcff9 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -8,6 +8,8 @@ import json import os from pathlib import Path +from typing import List +import pandas as pd from .keys import GenerateKeys, GenerateKeysDeterministic from ..base import ComputationStep @@ -72,7 +74,10 @@ GULSummaryXrefFile, FMSummaryXrefFile ) -from oasislmf.preparation.correlations import get_correlation_input_items +from oasislmf.preparation.correlations import get_correlation_input_items, map_data +from oasislmf.preparation.gul_inputs import process_group_id_cols, hash_with_correlations +# from oasislmf.preparation.correlations import map_data +from oasislmf.utils.data import establish_correlations class GenerateFiles(ComputationStep): @@ -230,16 +235,24 @@ def run(self): gul_inputs_df = get_gul_input_items( location_df, keys_df, - output_dir=self._get_output_dir(), exposure_profile=location_profile, group_id_cols=group_id_cols, - hashed_group_id=self.hashed_group_id, + hashed_group_id=self.hashed_group_id ) correlation_input_items = get_correlation_input_items( model_settings_path=self.model_settings_json, gul_inputs_df=gul_inputs_df ) + correlations: bool = establish_correlations(model_settings_path=self.model_settings_json) + group_id_cols: List[str] = process_group_id_cols(group_id_cols=group_id_cols, + exposure_df_columns=list(location_df), + correlations=correlations) + + if self.hashed_group_id is True and correlations is True: + gul_inputs_df = pd.merge(gul_inputs_df, correlation_input_items, on="item_id") + gul_inputs_df = hash_with_correlations(gul_inputs_df=gul_inputs_df, hashing_columns=group_id_cols) + # If not in det. loss gen. 
scenario, write exposure summary file if summarise_exposure: write_exposure_summary( @@ -265,7 +278,6 @@ def run(self): oasis_files_prefixes=files_prefixes['gul'], chunksize=self.write_chunksize, ) - gul_summary_mapping = get_summary_mapping(gul_inputs_df, oed_hierarchy) write_mapping_file(gul_summary_mapping, target_dir) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 6a36df1048..33238a6139 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -10,6 +10,7 @@ import sys import warnings from collections import OrderedDict +from typing import List import pandas as pd @@ -44,12 +45,61 @@ pd.options.mode.chained_assignment = None warnings.simplefilter(action='ignore', category=FutureWarning) +VALID_OASIS_GROUP_COLS = [ + 'item_id', + 'peril_id', + 'coverage_id', + 'coverage_type_id', + 'peril_correlation_group' + ] + + +def process_group_id_cols(group_id_cols: List[str], exposure_df_columns: List[str], correlations: bool) -> List[str]: + """ + cleans out columns that are not valid oasis group columns. + + Valid group id columns can be either + 1. exist in the location file + 2. be listed as a useful internal col + + Args: + group_id_cols: (List[str]) the ID columns that are going to be filtered + exposure_df_columns: (List[str]) the columns in the exposure dataframe + correlations: (bool) if set to True means that we are hashing with correlations in mind therefore the + "peril_correlation_group" column is added + + Returns: (List[str]) the filtered columns + """ + for col in VALID_OASIS_GROUP_COLS: + if col not in list(exposure_df_columns) + VALID_OASIS_GROUP_COLS: + warnings.warn('Column {} not found in loc file, or a valid internal oasis column'.format(col)) + group_id_cols.remove(col) + + peril_correlation_group = 'peril_correlation_group' + if peril_correlation_group not in group_id_cols and correlations is True: + group_id_cols.append(peril_correlation_group) + return group_id_cols + + +def hash_with_correlations(gul_inputs_df: pd.DataFrame, hashing_columns: List[str]) -> pd.DataFrame: + """ + Creates a hash for the group ID field for the input data frame. + + Args: + gul_inputs_df: (pd.DataFrame) the gul inputs that are doing the have the group_id field rewritten with a hash + hashing_columns: (List[str]) the list of columns used in the hashing algorithm + + Returns: (pd.DataFrame) the gul_inputs_df with the new hashed group_id + """ + gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[hashing_columns], + index=False).to_numpy() >> 33) + return gul_inputs_df + @oasis_log def get_gul_input_items( exposure_df, keys_df, - output_dir, exposure_profile=get_default_exposure_profile(), group_id_cols=["PortNumber", "AccNumber", "LocNumber"], hashed_group_id=True @@ -148,35 +198,12 @@ def get_gul_input_items( # Remove any duplicate column names used to assign group_id group_id_cols = list(set(group_id_cols)) - # Ignore any column names used to assign group_id that are missing or not supported - # Valid group id columns can be either - # 1. exist in the location file - # 2. 
be listed as a useful internal col - valid_oasis_group_cols = [ - 'item_id', - 'peril_id', - 'coverage_id', - 'coverage_type_id', - 'peril_correlation_group' - ] - for col in group_id_cols: - if col not in list(exposure_df.columns) + valid_oasis_group_cols: - warnings.warn('Column {} not found in loc file, or a valid internal oasis column'.format(col)) - group_id_cols.remove(col) - - # here we check to see if the correlation file is here, if it is then we need to add the "peril_correlation_group" to the valid_oasis_group_cols - peril_correlation_group = 'peril_correlation_group' - correlations_files = [ - f"{output_dir}/correlations.csv", - f"{output_dir}/correlations.bin", - ] - for file_path in correlations_files: - if os.path.exists(path=file_path): - if peril_correlation_group not in group_id_cols: - group_id_cols.append(peril_correlation_group) - break - - + # it is assumed that correlations are False for now, correlations for group ID hashing are assessed later on in + # the process to re-hash the group ID with the correlation "peril_correlation_group" column name. This is because + # the correlations is achieved later in the process leading to a chicken and egg problem + group_id_cols = process_group_id_cols(group_id_cols=group_id_cols, + exposure_df_columns=list(exposure_df.columns), + correlations=False) # Should list of column names used to group_id be empty, revert to # default @@ -186,7 +213,7 @@ def get_gul_input_items( # Only add group col if not internal oasis col missing_group_id_cols = [] for col in group_id_cols: - if col in valid_oasis_group_cols: + if col in VALID_OASIS_GROUP_COLS: pass elif col not in exposure_df_gul_inputs_cols: missing_group_id_cols.append(col) diff --git a/oasislmf/utils/data.py b/oasislmf/utils/data.py index 24c7ccd41c..b4fa6e0391 100644 --- a/oasislmf/utils/data.py +++ b/oasislmf/utils/data.py @@ -44,6 +44,7 @@ from chardet.universaldetector import UniversalDetector from tabulate import tabulate +from typing import List, Optional import numpy as np import pandas as pd @@ -409,6 +410,27 @@ def get_model_settings(model_settings_fp, key=None, validate=True): return model_settings if not key else model_settings.get(key) +def establish_correlations(model_settings_path: str) -> bool: + """ + Checks the model settings to see if correlations are present. 
+ + Args: + model_settings_path: (str) path to the model setting JSON file + + Returns: (bool) True if correlations, False if not + """ + model_settings_raw_data: dict = get_model_settings(model_settings_fp=model_settings_path) + correlations: Optional[List[dict]] = model_settings_raw_data.get("correlation_settings") + + if correlations is None: + return False + if not isinstance(correlations, list): + return False + if len(correlations) == 0: + return False + return True + + def detect_encoding(filepath): """ Given a path to a CSV of unknown encoding diff --git a/run_test.py b/run_test.py new file mode 100644 index 0000000000..e69de29bb2 From 823add86efa0d011b2b59cf10ebb1cbcc57da3b1 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Thu, 15 Sep 2022 16:09:57 +0100 Subject: [PATCH 26/33] updating hashing group ID --- oasislmf/computation/generate/files.py | 38 ++++++-------- oasislmf/preparation/correlations.py | 50 ++++++++----------- oasislmf/preparation/gul_inputs.py | 29 +++++++---- .../data_layer/oasis_files/correlations.py | 2 + oasislmf/utils/data.py | 7 ++- tests/preparation/test_correlations.py | 4 +- 6 files changed, 63 insertions(+), 67 deletions(-) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index 63beabcff9..885b3d3013 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -9,9 +9,8 @@ import os from pathlib import Path from typing import List -import pandas as pd -from .keys import GenerateKeys, GenerateKeysDeterministic +from .keys import GenerateKeys from ..base import ComputationStep #from ...utils.coverages import SUPPORTED_COVERAGE_TYPES @@ -74,10 +73,10 @@ GULSummaryXrefFile, FMSummaryXrefFile ) -from oasislmf.preparation.correlations import get_correlation_input_items, map_data -from oasislmf.preparation.gul_inputs import process_group_id_cols, hash_with_correlations -# from oasislmf.preparation.correlations import map_data +from oasislmf.preparation.correlations import map_data +from oasislmf.preparation.gul_inputs import process_group_id_cols from oasislmf.utils.data import establish_correlations +from oasislmf.pytools.data_layer.oasis_files.correlations import CorrelationsData class GenerateFiles(ComputationStep): @@ -216,11 +215,14 @@ def run(self): # Columns from loc file to assign group_id model_group_fields = None + correlations: bool = False + model_settings = None + if self.model_settings_json: + model_settings = get_model_settings(self.model_settings_json) + correlations = establish_correlations(model_settings=model_settings) try: - model_group_fields = get_model_settings( - self.model_settings_json, key='data_settings' - ).get('group_fields') + model_group_fields = model_settings["data_settings"].get("group_fields") except (KeyError, AttributeError, OasisException) as e: self.logger.warn('WARNING: Failed to load {} - {}'.format(self.model_settings_json, e)) @@ -232,26 +234,18 @@ def run(self): group_id_cols = self.group_id_cols group_id_cols = list(map(lambda col: col.lower(), group_id_cols)) + group_id_cols: List[str] = process_group_id_cols(group_id_cols=group_id_cols, + exposure_df_columns=list(location_df), + has_correlation_groups=correlations) gul_inputs_df = get_gul_input_items( location_df, keys_df, + peril_correlation_group_df=map_data(data=model_settings), + correlations=correlations, exposure_profile=location_profile, group_id_cols=group_id_cols, hashed_group_id=self.hashed_group_id ) - correlation_input_items = get_correlation_input_items( - 
model_settings_path=self.model_settings_json, - gul_inputs_df=gul_inputs_df - ) - - correlations: bool = establish_correlations(model_settings_path=self.model_settings_json) - group_id_cols: List[str] = process_group_id_cols(group_id_cols=group_id_cols, - exposure_df_columns=list(location_df), - correlations=correlations) - - if self.hashed_group_id is True and correlations is True: - gul_inputs_df = pd.merge(gul_inputs_df, correlation_input_items, on="item_id") - gul_inputs_df = hash_with_correlations(gul_inputs_df=gul_inputs_df, hashing_columns=group_id_cols) # If not in det. loss gen. scenario, write exposure summary file if summarise_exposure: @@ -273,7 +267,7 @@ def run(self): gul_input_files = write_gul_input_files( gul_inputs_df, target_dir, - correlations_df=correlation_input_items, + correlations_df=gul_inputs_df[CorrelationsData.COLUMNS] if correlations is True else None, output_dir=self._get_output_dir(), oasis_files_prefixes=files_prefixes['gul'], chunksize=self.write_chunksize, diff --git a/oasislmf/preparation/correlations.py b/oasislmf/preparation/correlations.py index bbb42ae5f9..bfea8c2713 100644 --- a/oasislmf/preparation/correlations.py +++ b/oasislmf/preparation/correlations.py @@ -8,7 +8,7 @@ from oasislmf.utils.data import get_model_settings -def map_data(data: dict) -> Optional[pd.DataFrame]: +def map_data(data: Optional[dict]) -> Optional[pd.DataFrame]: """ Maps data from the model settings to to have Peril ID, peril_correlation_group, and correlation_value. @@ -17,43 +17,37 @@ def map_data(data: dict) -> Optional[pd.DataFrame]: Returns: (pd.DataFrame) the mapped data """ - supported_perils = data.get("lookup_settings", {}).get("supported_perils", []) - correlation_settings = data.get("correlation_settings", []) + if data is not None: + supported_perils = data.get("lookup_settings", {}).get("supported_perils", []) + correlation_settings = data.get("correlation_settings", []) - for supported_peril in supported_perils: - supported_peril["peril_correlation_group"] = supported_peril.get("peril_correlation_group", 0) + for supported_peril in supported_perils: + supported_peril["peril_correlation_group"] = supported_peril.get("peril_correlation_group", 0) - supported_perils_df = pd.DataFrame(supported_perils) - correlation_settings_df = pd.DataFrame(correlation_settings) + supported_perils_df = pd.DataFrame(supported_perils) + correlation_settings_df = pd.DataFrame(correlation_settings) - # merge allows duplicates of the "peril_correlation_group" in the supported perils - # merge does not allow duplicates of the "peril_correlation_group" in the correlation settings - if len(supported_perils_df) > 0 and len(correlation_settings_df) > 0: - mapped_data = pd.merge(supported_perils_df, correlation_settings_df, on="peril_correlation_group") - return mapped_data + # merge allows duplicates of the "peril_correlation_group" in the supported perils + # merge does not allow duplicates of the "peril_correlation_group" in the correlation settings + if len(supported_perils_df) > 0 and len(correlation_settings_df) > 0: + mapped_data = pd.merge(supported_perils_df, correlation_settings_df, on="peril_correlation_group") + return mapped_data -def get_correlation_input_items(model_settings_path: str, gul_inputs_df: pd.DataFrame) -> pd.DataFrame: +def get_correlation_input_items(gul_inputs_df: pd.DataFrame, correlation_map_df: pd.DataFrame) -> pd.DataFrame: """ Gets the correlation values with the peril ID from the model_settings. 
Args: - model_settings_path: (str) the path to the model settings JSON file + correlation_map_df: (pd.DataFrame) data from the model settings to to have Peril ID, peril_correlation_group, + and correlation_value gul_inputs_df: (pd.DataFrame) the data of the gul inputs to be mapped Returns: (pd.DataFrame) the mapped data of correlations """ - if model_settings_path == None: - correlation_map_df = None - else: - model_settings_raw_data: dict = get_model_settings(model_settings_fp=model_settings_path) - correlation_map_df = map_data(data=model_settings_raw_data) - - if correlation_map_df is not None: - gul_inputs_df = gul_inputs_df.merge(correlation_map_df, left_on='peril_id', right_on='id').reset_index() - gul_inputs_df["correlation_value"] = gul_inputs_df["correlation_value"].astype(float) - gul_inputs_df = gul_inputs_df.reindex(columns=list(gul_inputs_df)) - - correlation_df = gul_inputs_df[["item_id", "peril_correlation_group", "correlation_value"]] - return correlation_df.sort_values('item_id') - return pd.DataFrame(columns=["item_id", "peril_correlation_group", "correlation_value"]) + gul_inputs_df = gul_inputs_df.merge(correlation_map_df, left_on='peril_id', right_on='id').reset_index() + gul_inputs_df["correlation_value"] = gul_inputs_df["correlation_value"].astype(float) + gul_inputs_df = gul_inputs_df.reindex(columns=list(gul_inputs_df)) + + correlation_df = gul_inputs_df[["item_id", "peril_correlation_group", "correlation_value"]] + return correlation_df.sort_values('item_id') diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 33238a6139..2c5ec20969 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -54,7 +54,7 @@ ] -def process_group_id_cols(group_id_cols: List[str], exposure_df_columns: List[str], correlations: bool) -> List[str]: +def process_group_id_cols(group_id_cols, exposure_df_columns, has_correlation_groups): """ cleans out columns that are not valid oasis group columns. @@ -65,7 +65,7 @@ def process_group_id_cols(group_id_cols: List[str], exposure_df_columns: List[st Args: group_id_cols: (List[str]) the ID columns that are going to be filtered exposure_df_columns: (List[str]) the columns in the exposure dataframe - correlations: (bool) if set to True means that we are hashing with correlations in mind therefore the + has_correlation_groups: (bool) if set to True means that we are hashing with correlations in mind therefore the "peril_correlation_group" column is added Returns: (List[str]) the filtered columns @@ -76,12 +76,12 @@ def process_group_id_cols(group_id_cols: List[str], exposure_df_columns: List[st group_id_cols.remove(col) peril_correlation_group = 'peril_correlation_group' - if peril_correlation_group not in group_id_cols and correlations is True: + if peril_correlation_group not in group_id_cols and has_correlation_groups is True: group_id_cols.append(peril_correlation_group) return group_id_cols -def hash_with_correlations(gul_inputs_df: pd.DataFrame, hashing_columns: List[str]) -> pd.DataFrame: +def hash_group_id(gul_inputs_df: pd.DataFrame, hashing_columns: List[str]) -> pd.DataFrame: """ Creates a hash for the group ID field for the input data frame. 
@@ -100,6 +100,8 @@ def hash_with_correlations(gul_inputs_df: pd.DataFrame, hashing_columns: List[st def get_gul_input_items( exposure_df, keys_df, + correlations, + peril_correlation_group_df, exposure_profile=get_default_exposure_profile(), group_id_cols=["PortNumber", "AccNumber", "LocNumber"], hashed_group_id=True @@ -201,9 +203,9 @@ def get_gul_input_items( # it is assumed that correlations are False for now, correlations for group ID hashing are assessed later on in # the process to re-hash the group ID with the correlation "peril_correlation_group" column name. This is because # the correlations is achieved later in the process leading to a chicken and egg problem - group_id_cols = process_group_id_cols(group_id_cols=group_id_cols, - exposure_df_columns=list(exposure_df.columns), - correlations=False) + # group_id_cols = process_group_id_cols(group_id_cols=group_id_cols, + # exposure_df_columns=list(exposure_df.columns), + # has_correlation_groups=False) # Should list of column names used to group_id be empty, revert to # default @@ -358,8 +360,6 @@ def get_gul_input_items( # directly, otherwise create an index of the group id fields group_id_cols.sort() - col_key = group_id_cols[0] - if correlation_check is True: gul_inputs_df['group_id'] = gul_inputs_df[correlation_group_id] @@ -378,8 +378,14 @@ def get_gul_input_items( )[0] # this block gets fired if the hashed_group_id is True + elif correlations is False: + gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[group_id_cols], + index=False).to_numpy() >> 33) else: - gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[group_id_cols], index=False).to_numpy() >> 33) + # do merge with peril correlation df + gul_inputs_df = gul_inputs_df.merge(peril_correlation_group_df, left_on='peril_id', right_on='id').reset_index() + gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[group_id_cols], + index=False).to_numpy() >> 33) gul_inputs_df['group_id'] = gul_inputs_df['group_id'].astype('uint32') @@ -391,7 +397,8 @@ def get_gul_input_items( ['peril_id', 'coverage_type_id', 'tiv', 'areaperil_id', 'vulnerability_id'] + terms + (['model_data'] if 'model_data' in gul_inputs_df else []) + - ['is_bi_coverage', 'group_id', 'coverage_id', 'item_id', 'status'] + ['is_bi_coverage', 'group_id', 'coverage_id', 'item_id', 'status'] + + ["peril_correlation_group", "correlation_value"] if correlations is True else [] ) usecols = [col for col in usecols if col in gul_inputs_df] gul_inputs_df = gul_inputs_df[usecols] diff --git a/oasislmf/pytools/data_layer/oasis_files/correlations.py b/oasislmf/pytools/data_layer/oasis_files/correlations.py index 75de73ad79..afdd9c4547 100644 --- a/oasislmf/pytools/data_layer/oasis_files/correlations.py +++ b/oasislmf/pytools/data_layer/oasis_files/correlations.py @@ -16,6 +16,8 @@ class CorrelationsData: Attributes: data (Optional[pd.DataFrame): correlation data that is either loaded or saved """ + COLUMNS = ["item_id", "peril_correlation_group", "correlation_value"] + def __init__(self, data: Optional[pd.DataFrame] = None) -> None: """ The constructor for the CorrelationsData class. 
diff --git a/oasislmf/utils/data.py b/oasislmf/utils/data.py index b4fa6e0391..b3dfeb46d3 100644 --- a/oasislmf/utils/data.py +++ b/oasislmf/utils/data.py @@ -410,17 +410,16 @@ def get_model_settings(model_settings_fp, key=None, validate=True): return model_settings if not key else model_settings.get(key) -def establish_correlations(model_settings_path: str) -> bool: +def establish_correlations(model_settings: dict) -> bool: """ Checks the model settings to see if correlations are present. Args: - model_settings_path: (str) path to the model setting JSON file + model_settings: (dict) the model settings that are going to be checked Returns: (bool) True if correlations, False if not """ - model_settings_raw_data: dict = get_model_settings(model_settings_fp=model_settings_path) - correlations: Optional[List[dict]] = model_settings_raw_data.get("correlation_settings") + correlations: Optional[List[dict]] = model_settings.get("correlation_settings") if correlations is None: return False diff --git a/tests/preparation/test_correlations.py b/tests/preparation/test_correlations.py index 62c10112c9..3d2bc49456 100644 --- a/tests/preparation/test_correlations.py +++ b/tests/preparation/test_correlations.py @@ -27,10 +27,10 @@ def test_map_data(self): def test_get_correlation_input_items(self): gul_path = META_PATH + "gul_inputs_df.csv" - settings_path = META_PATH + "model_settings.json" gul_inputs_df = pd.read_csv(gul_path) - correlation_df = get_correlation_input_items(model_settings_path=settings_path, gul_inputs_df=gul_inputs_df) + correlation_df = get_correlation_input_items(correlation_map_df=map_data(data=self.model_settings), + gul_inputs_df=gul_inputs_df) correlation_df_check = pd.read_csv(f"{META_PATH}correlation_df.csv") correlation_df_check.equals(correlation_df) From ff9f5681d2a3dd84892ebbd165241d477dcef6d1 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Tue, 20 Sep 2022 16:38:35 +0100 Subject: [PATCH 27/33] updating to accomodate non-correlations --- oasislmf/preparation/gul_inputs.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 2c5ec20969..b82fd11053 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -397,9 +397,11 @@ def get_gul_input_items( ['peril_id', 'coverage_type_id', 'tiv', 'areaperil_id', 'vulnerability_id'] + terms + (['model_data'] if 'model_data' in gul_inputs_df else []) + - ['is_bi_coverage', 'group_id', 'coverage_id', 'item_id', 'status'] + - ["peril_correlation_group", "correlation_value"] if correlations is True else [] + ['is_bi_coverage', 'group_id', 'coverage_id', 'item_id', 'status'] ) + if correlations is True: + usecols += ["peril_correlation_group", "correlation_value"] + usecols = [col for col in usecols if col in gul_inputs_df] gul_inputs_df = gul_inputs_df[usecols] From 32a5f66d0c2a5d914ca0e02a2878bca5e7843b17 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Wed, 21 Sep 2022 14:19:32 +0100 Subject: [PATCH 28/33] fixxing run --- oasislmf/computation/generate/files.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index 885b3d3013..40addb86a5 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -218,7 +218,7 @@ def run(self): correlations: bool = False model_settings = None - if self.model_settings_json: + if self.model_settings_json is not None: model_settings = 
get_model_settings(self.model_settings_json) correlations = establish_correlations(model_settings=model_settings) try: @@ -226,6 +226,7 @@ def run(self): except (KeyError, AttributeError, OasisException) as e: self.logger.warn('WARNING: Failed to load {} - {}'.format(self.model_settings_json, e)) + # load group columns from model_settings.json if not set in kwargs (CLI) if model_group_fields and not self.kwargs.get('group_id_cols'): group_id_cols = model_group_fields @@ -264,6 +265,7 @@ def run(self): # Write the GUL input files files_prefixes = self.oasis_files_prefixes + gul_input_files = write_gul_input_files( gul_inputs_df, target_dir, From d60deb20ec4ded903523ef32ea9ea71a0d21dd2a Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Wed, 21 Sep 2022 15:00:28 +0100 Subject: [PATCH 29/33] fixing empty correlations df write header if empty correlations --- oasislmf/preparation/gul_inputs.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index b82fd11053..d93d3ad154 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -531,11 +531,13 @@ def write_gul_input_files( # Clean the target directory path target_dir = as_path(target_dir, 'Target IL input files directory', is_dir=True, preexists=False) + if correlations_df is None: + correlations_df = pd.DataFrame(columns=['item_id', 'peril_correlation_group', 'correlation_value']) + # write the correlations to a binary file - if correlations_df is not None: - correlation_data_handle = CorrelationsData(data=correlations_df) - correlation_data_handle.to_bin(file_path=f"{output_dir}/correlations.bin") - correlation_data_handle.to_csv(file_path=f"{output_dir}/correlations.csv") + correlation_data_handle = CorrelationsData(data=correlations_df) + correlation_data_handle.to_bin(file_path=f"{output_dir}/correlations.bin") + correlation_data_handle.to_csv(file_path=f"{output_dir}/correlations.csv") # Set chunk size for writing the CSV files - default is the minimum of 100K # or the GUL inputs frame size From e95dac662cc7375948e308f0e17bb961c5e0aebf Mon Sep 17 00:00:00 2001 From: Sam Gamble Date: Mon, 3 Oct 2022 13:07:58 +0100 Subject: [PATCH 30/33] Remove empty file --- run_test.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 run_test.py diff --git a/run_test.py b/run_test.py deleted file mode 100644 index e69de29bb2..0000000000 From 62c805f536963094e85c8d43b61236a0f3e9fc81 Mon Sep 17 00:00:00 2001 From: Sam Gamble Date: Mon, 3 Oct 2022 13:14:06 +0100 Subject: [PATCH 31/33] Add missing defaults to get_gul_input_items (backwards compatible) --- oasislmf/preparation/gul_inputs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index d93d3ad154..1c97b12369 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -100,8 +100,8 @@ def hash_group_id(gul_inputs_df: pd.DataFrame, hashing_columns: List[str]) -> pd def get_gul_input_items( exposure_df, keys_df, - correlations, - peril_correlation_group_df, + correlations=False, + peril_correlation_group_df=None, exposure_profile=get_default_exposure_profile(), group_id_cols=["PortNumber", "AccNumber", "LocNumber"], hashed_group_id=True From 5f6933adc8ada030be82f15f7cf969d79a12beac Mon Sep 17 00:00:00 2001 From: Sam Gamble Date: Mon, 3 Oct 2022 13:28:06 +0100 Subject: [PATCH 32/33] Fix Group_id valid column check --- 
oasislmf/preparation/gul_inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 1c97b12369..f3e5ad7aa6 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -70,7 +70,7 @@ def process_group_id_cols(group_id_cols, exposure_df_columns, has_correlation_gr Returns: (List[str]) the filtered columns """ - for col in VALID_OASIS_GROUP_COLS: + for col in group_id_cols: if col not in list(exposure_df_columns) + VALID_OASIS_GROUP_COLS: warnings.warn('Column {} not found in loc file, or a valid internal oasis column'.format(col)) group_id_cols.remove(col) From 44e43f2d53830fd8bd4778b21ad5393c89c74e78 Mon Sep 17 00:00:00 2001 From: Sam Gamble Date: Mon, 3 Oct 2022 14:33:34 +0100 Subject: [PATCH 33/33] Force retest --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 6369f55384..bced70a95d 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![Binder](https://static.mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/OasisLMF/OasisLMF/develop?filepath=FmTesting.ipynb) # OasisLMF - The `oasislmf` Python package, loosely called the *model development kit (MDK)* or the *MDK package*, provides a command line toolkit for developing, testing and running Oasis models end-to-end locally, or remotely via the Oasis API. It can generate ground-up losses (GUL), direct/insured losses (IL) and reinsurance losses (RIL). It can also generate deterministic losses at all these levels.
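
Note on the correlation flow introduced by the patches above: correlated sampling is only switched on when the model settings actually define correlation groups, and the per-peril mapping written to correlations.bin/correlations.csv is built by merging the supported perils with the correlation settings. The following is a minimal sketch, not code from the repository, that mirrors the merge performed by map_data in oasislmf/preparation/correlations.py; the peril ids, descriptions and the 0.5 correlation value are invented example inputs.

    # Hedged sketch of the peril-correlation mapping built by map_data().
    # The peril ids and correlation value below are made-up example inputs.
    import pandas as pd

    model_settings = {
        "lookup_settings": {
            "supported_perils": [
                {"id": "WTC", "desc": "Tropical cyclone wind", "peril_correlation_group": 1},
                {"id": "WSS", "desc": "Storm surge", "peril_correlation_group": 1},
            ]
        },
        "correlation_settings": [
            {"peril_correlation_group": 1, "correlation_value": 0.5},
        ],
    }

    supported_perils_df = pd.DataFrame(model_settings["lookup_settings"]["supported_perils"])
    correlation_settings_df = pd.DataFrame(model_settings["correlation_settings"])

    # One row per supported peril, carrying its group's correlation value.
    # An empty "correlation_settings" list would leave this merge empty, which is
    # the situation establish_correlations() reports as correlations switched off.
    peril_correlation_group_df = supported_perils_df.merge(
        correlation_settings_df, on="peril_correlation_group"
    )
    print(peril_correlation_group_df[["id", "peril_correlation_group", "correlation_value"]])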
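Note on the hashed group_id: the later patches reduce group-id assignment to a single row-wise hash of the chosen grouping columns, truncated to fit the uint32 group_id field written to items.bin, with peril_correlation_group appended to the default PortNumber/AccNumber/LocNumber columns when correlations are in use. Below is a minimal sketch of that path; the three-row DataFrame is an invented stand-in for the real gul_inputs_df, while the hashing expression follows the one added in the patches.

    # Hedged sketch of the hashed group_id path from get_gul_input_items /
    # hash_group_id. The toy DataFrame is an assumption; the hashing expression
    # mirrors the patched code.
    import pandas as pd

    gul_inputs_df = pd.DataFrame({
        "PortNumber": ["1", "1", "1"],
        "AccNumber": ["A11", "A11", "A12"],
        "LocNumber": ["L1", "L2", "L1"],
        "peril_correlation_group": [1, 1, 1],
    })

    # With correlations on, peril_correlation_group joins the default grouping
    # columns so the correlation group contributes to the grouping key.
    group_id_cols = ["PortNumber", "AccNumber", "LocNumber", "peril_correlation_group"]

    # hash_pandas_object returns one uint64 per row; index=False makes the hash a
    # function of the column values only, and >> 33 keeps the result within 31
    # bits so the subsequent uint32 cast is lossless.
    gul_inputs_df["group_id"] = (
        pd.util.hash_pandas_object(gul_inputs_df[group_id_cols], index=False).to_numpy() >> 33
    )
    gul_inputs_df["group_id"] = gul_inputs_df["group_id"].astype("uint32")

    print(gul_inputs_df[["LocNumber", "group_id"]])

Rows sharing the same values in every grouping column hash to the same group_id, which is how the full-dataframe hash replaces the earlier drop_duplicates/merge chain without changing the grouping semantics.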