From eefce12ed2caf5bb5a54cbd9c3ae16a0b47a033f Mon Sep 17 00:00:00 2001 From: mtazzari Date: Tue, 5 Jul 2022 14:01:44 +0100 Subject: [PATCH 01/33] [gulpy] first implementation --- oasislmf/pytools/gul/random.py | 75 ++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/oasislmf/pytools/gul/random.py b/oasislmf/pytools/gul/random.py index f7b237a0e3..c4c67d8157 100644 --- a/oasislmf/pytools/gul/random.py +++ b/oasislmf/pytools/gul/random.py @@ -3,8 +3,10 @@ """ +from math import sqrt import logging import numpy as np +from scipy.stats import norm from numba import njit logger = logging.getLogger(__name__) @@ -72,6 +74,79 @@ def get_random_generator(random_generator): raise ValueError(f"No random generator exists for random_generator={random_generator}.") +EVENT_ID_HASH_CODE = np.int64(1943_272_559) +PERIL_CORRELATION_GROUP_HASH = np.int64(1836311903) +HASH_MOD_CODE = np.int64(2147483648) + + +@njit(cache=True, fastmath=True) +def generate_correlated_hash_vector(peril_correlation_group, event_id, base_seed=0): + """Generate hash for an `event_id`. + + Args: + event_id (int): event id. + base_seed (int, optional): base random seed. Defaults to 0. + + Returns: + int64: hash + """ + for i in range(1, peril_correlation_group.shape[0]): # why start from 1?? + peril_correlation_group[i] = (base_seed + + (i * PERIL_CORRELATION_GROUP_HASH) % HASH_MOD_CODE + + (event_id * EVENT_ID_HASH_CODE) % HASH_MOD_CODE) % HASH_MOD_CODE + + return peril_correlation_group + + +def compute_norm_inv_cdf_lookup(arr_min, arr_max, arr_N): + return norm.ppf(np.linspace(arr_min, arr_max, arr_N)) + + +def compute_norm_cdf_lookup(arr_min, arr_max, arr_N): + return norm.cdf(np.linspace(arr_min, arr_max, arr_N)) + + +norm_inv_cdf = compute_norm_inv_cdf_lookup(1e-16, 1 - 1e-16, 1000000) +norm_cdf = compute_norm_cdf_lookup(-20., 20., 1000000) +# pre-compute lookup tables for the Gaussian cdf and inverse cdf + +# Note: +# - the size of these arrays can be increased to achieve better resolution in the Gaussian cdf and inv cdf. +# - the function `get_corr_rval` to compute the correlated numbers is not affected by arr_N and arr_N_cdf +arr_min = 1e-16 +arr_max = 1 - 1e-16 +arr_N = 1000000 +norm_inv_cdf = norm.ppf(np.linspace(arr_min, arr_max, arr_N)) + +arr_min_cdf = -20. +arr_max_cdf = 20. +arr_N_cdf = 1000000 +norm_cdf = norm.cdf(np.linspace(arr_min_cdf, arr_max_cdf, arr_N_cdf)) + + +@njit(cache=True, fastmath=True) +def get_norm_cdf_cell_nb(x, arr_min, arr_max, arr_N): + return (x - arr_min) * (arr_N - 1) // (arr_max - arr_min) + + +@njit(cache=True, fastmath=True) +def get_corr_rval_v2(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, Nsamples, z_unif): + + sqrt_rho = sqrt(rho) + sqrt_1_minus_rho = sqrt(1. - rho) + + for i in range(Nsamples): + + x_norm = norm_inv_cdf[get_norm_cdf_cell_nb(x_unif[i], arr_min, arr_max, arr_N)] + y_norm = norm_inv_cdf[get_norm_cdf_cell_nb(y_unif[i], arr_min, arr_max, arr_N)] + z_norm = sqrt_rho * x_norm + sqrt_1_minus_rho * y_norm + + z_unif[i] = norm_cdf[get_norm_cdf_cell_nb(z_norm, arr_min_cdf, arr_max_cdf, arr_N_cdf)] + + +# return z_unif + + @njit(cache=True, fastmath=True) def random_MersenneTwister(seeds, n): """Generate random numbers using the default Mersenne Twister algorithm. 
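For reference, `get_corr_rval_v2` in the patch above builds correlated uniform samples with a Gaussian-copula construction: the two independent uniform draws are mapped to normal scores through the inverse normal CDF, combined as z = sqrt(rho) * x + sqrt(1 - rho) * y (which keeps z standard normal), and mapped back to the uniform scale through the normal CDF; the precomputed `norm_inv_cdf` and `norm_cdf` lookup tables stand in for direct scipy calls inside the numba-compiled loop. The following is a minimal sketch of the same construction using scipy directly; it is not part of the patch, and the helper name `correlate_uniforms` is illustrative only:

    import numpy as np
    from scipy.stats import norm

    def correlate_uniforms(x_unif, y_unif, rho):
        # map the uniforms to normal scores, mix them with weights
        # sqrt(rho) and sqrt(1 - rho), then map back to the uniform scale
        x_norm = norm.ppf(x_unif)
        y_norm = norm.ppf(y_unif)
        z_norm = np.sqrt(rho) * x_norm + np.sqrt(1.0 - rho) * y_norm
        return norm.cdf(z_norm)

    rng = np.random.default_rng(0)
    x, y = rng.random(5), rng.random(5)
    print(correlate_uniforms(x, y, rho=0.5))

Two outputs built from the same x draw but independent y draws have correlation rho on the normal-score scale, which is what the later patches in this series use to correlate items belonging to the same peril correlation group.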
From a8f89de751bbf877acccf2b66776c62e6b7483aa Mon Sep 17 00:00:00 2001 From: mtazzari Date: Tue, 5 Jul 2022 18:17:19 +0100 Subject: [PATCH 02/33] [gulpy] implementing correlated rng --- oasislmf/pytools/gul/manager.py | 42 ++++++++++++++++++++++++++++++--- oasislmf/pytools/gul/random.py | 25 +++----------------- oasislmf/pytools/gulpy.py | 2 ++ 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/oasislmf/pytools/gul/manager.py b/oasislmf/pytools/gul/manager.py index de8e9483aa..bdd553e7ff 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -2,6 +2,7 @@ This file is the entry point for the gul command for the package. """ +from random import sample import sys import os from select import select @@ -23,7 +24,12 @@ write_negative_sidx, write_sample_header, write_sample_rec, read_getmodel_stream ) -from oasislmf.pytools.gul.random import get_random_generator + +from oasislmf.pytools.gul.random import ( + get_random_generator, compute_norm_cdf_lookup, + compute_norm_inv_cdf_lookup, get_corr_rval +) + from oasislmf.pytools.gul.core import split_tiv, get_gul, setmaxloss, compute_mean_loss from oasislmf.pytools.gul.utils import append_to_dict_value, binary_search @@ -116,7 +122,7 @@ def generate_item_map(items, coverages): def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debug, - random_generator, file_in=None, file_out=None, **kwargs): + random_generator, file_in=None, file_out=None, correlated=False, **kwargs): """Execute the main gulpy worklow. Args: @@ -209,14 +215,44 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # create the array to store the seeds seeds = np.zeros(len(np.unique(items['group_id'])), dtype=Item.dtype['group_id']) + # pre-compute lookup tables for the Gaussian cdf and inverse cdf + # this is used for the generation of correlated random numbers + + # Note: + # - the size `arr_N` and `arr_N_cdf` can be increased to achieve better resolution in the Gaussian cdf and inv cdf. + # - the function `get_corr_rval` to compute the correlated numbers is not affected by arr_N and arr_N_cdf + arr_min = 1e-16 + arr_max = 1 - 1e-16 + arr_N = 1000000 + + arr_min_cdf = -20. + arr_max_cdf = 20. 
+ arr_N_cdf = 1000000 + + norm_inv_cdf = compute_norm_inv_cdf_lookup(arr_min, arr_max, arr_N) + norm_cdf = compute_norm_cdf_lookup(arr_min_cdf, arr_max_cdf, arr_N_cdf) + # create buffer to be reused to store all losses for one coverage losses_buffer = np.zeros((sample_size + NUM_IDX + 1, np.max(coverages[1:]['max_items'])), dtype=oasis_float) + z_unif = np.zeros(sample_size) for event_data in read_getmodel_stream(streams_in, item_map, coverages, compute, seeds): event_id, compute_i, items_data, recs, rec_idx_ptr, rng_index = event_data - rndms = generate_rndm(seeds[:rng_index], sample_size) + if not correlated: + rndms = generate_rndm(seeds[:rng_index], sample_size) + else: + rndms_x = generate_rndm(seeds[:rng_index], sample_size) + rndms_y = generate_rndm(seeds[:rng_index], sample_size) + logger.info(rndms_x.shape) + rho = 0.5 # get this from the map + # TODO: rndms_z needs to be 2d of shape Nseeds x samplesize + for i_seed in range(rndms_x.shape[0]): + rndms_z = get_corr_rval( + rndms_x[i_seed, :], rndms_y[i_seed, :], rho, arr_min, arr_max, arr_N, norm_inv_cdf, + arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, sample_size, z_unif + ) last_processed_coverage_ids_idx = 0 while last_processed_coverage_ids_idx < compute_i: diff --git a/oasislmf/pytools/gul/random.py b/oasislmf/pytools/gul/random.py index c4c67d8157..8201d23558 100644 --- a/oasislmf/pytools/gul/random.py +++ b/oasislmf/pytools/gul/random.py @@ -106,31 +106,13 @@ def compute_norm_cdf_lookup(arr_min, arr_max, arr_N): return norm.cdf(np.linspace(arr_min, arr_max, arr_N)) -norm_inv_cdf = compute_norm_inv_cdf_lookup(1e-16, 1 - 1e-16, 1000000) -norm_cdf = compute_norm_cdf_lookup(-20., 20., 1000000) -# pre-compute lookup tables for the Gaussian cdf and inverse cdf - -# Note: -# - the size of these arrays can be increased to achieve better resolution in the Gaussian cdf and inv cdf. -# - the function `get_corr_rval` to compute the correlated numbers is not affected by arr_N and arr_N_cdf -arr_min = 1e-16 -arr_max = 1 - 1e-16 -arr_N = 1000000 -norm_inv_cdf = norm.ppf(np.linspace(arr_min, arr_max, arr_N)) - -arr_min_cdf = -20. -arr_max_cdf = 20. -arr_N_cdf = 1000000 -norm_cdf = norm.cdf(np.linspace(arr_min_cdf, arr_max_cdf, arr_N_cdf)) - - @njit(cache=True, fastmath=True) def get_norm_cdf_cell_nb(x, arr_min, arr_max, arr_N): - return (x - arr_min) * (arr_N - 1) // (arr_max - arr_min) + return int((x - arr_min) * (arr_N - 1) // (arr_max - arr_min)) @njit(cache=True, fastmath=True) -def get_corr_rval_v2(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, Nsamples, z_unif): +def get_corr_rval(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, Nsamples, z_unif): sqrt_rho = sqrt(rho) sqrt_1_minus_rho = sqrt(1. 
- rho) @@ -143,8 +125,7 @@ def get_corr_rval_v2(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, z_unif[i] = norm_cdf[get_norm_cdf_cell_nb(z_norm, arr_min_cdf, arr_max_cdf, arr_N_cdf)] - -# return z_unif + return z_unif @njit(cache=True, fastmath=True) diff --git a/oasislmf/pytools/gulpy.py b/oasislmf/pytools/gulpy.py index ed022b0471..30be823f0d 100644 --- a/oasislmf/pytools/gulpy.py +++ b/oasislmf/pytools/gulpy.py @@ -12,6 +12,8 @@ ) parser.add_argument('-a', help='back-allocation rule', default=0, type=int, dest='alloc_rule') +parser.add_argument('-c', '--correlated', help='(wip) if passed, it activates the correlated rng', + action='store_true', dest='correlated', default=False) parser.add_argument('-d', help='output random numbers instead of gul (default: False).', default=False, action='store_true', dest='debug') parser.add_argument('-i', '--file-in', help='filename of input stream.', action='store', type=str, dest='file_in') From d8215d87e0389f402124a7f8550d2a12763212f9 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Wed, 6 Jul 2022 15:37:24 +0100 Subject: [PATCH 03/33] [wip] implementing correlated rng --- oasislmf/pytools/gul/manager.py | 49 +++++++++++++++++++-------------- oasislmf/pytools/gul/random.py | 19 ++++++++----- oasislmf/pytools/gulpy.py | 4 ++- 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/oasislmf/pytools/gul/manager.py b/oasislmf/pytools/gul/manager.py index bdd553e7ff..dca002b611 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -22,12 +22,12 @@ ) from oasislmf.pytools.gul.io import ( write_negative_sidx, write_sample_header, - write_sample_rec, read_getmodel_stream + write_sample_rec, read_getmodel_stream, ) from oasislmf.pytools.gul.random import ( get_random_generator, compute_norm_cdf_lookup, - compute_norm_inv_cdf_lookup, get_corr_rval + compute_norm_inv_cdf_lookup, get_corr_rval, generate_correlated_hash_vector ) from oasislmf.pytools.gul.core import split_tiv, get_gul, setmaxloss, compute_mean_loss @@ -240,26 +240,21 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu event_id, compute_i, items_data, recs, rec_idx_ptr, rng_index = event_data - if not correlated: - rndms = generate_rndm(seeds[:rng_index], sample_size) - else: - rndms_x = generate_rndm(seeds[:rng_index], sample_size) - rndms_y = generate_rndm(seeds[:rng_index], sample_size) - logger.info(rndms_x.shape) - rho = 0.5 # get this from the map - # TODO: rndms_z needs to be 2d of shape Nseeds x samplesize - for i_seed in range(rndms_x.shape[0]): - rndms_z = get_corr_rval( - rndms_x[i_seed, :], rndms_y[i_seed, :], rho, arr_min, arr_max, arr_N, norm_inv_cdf, - arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, sample_size, z_unif - ) + # generate the correlated samples for the whole event, for all peril correlation groups + peril_correlation_group_ids = [1, 2, 3] # TODO get it from input data + # Nperil_correlation_groups = len(peril_correlation_group_ids) + corr_seeds = generate_correlated_hash_vector(peril_correlation_group_ids, event_id) + eps_ij = generate_rndm(corr_seeds, sample_size) + # Nseeds = len(seeds[:rng_index]) + + rndms_base = generate_rndm(seeds[:rng_index], sample_size) last_processed_coverage_ids_idx = 0 while last_processed_coverage_ids_idx < compute_i: cursor, cursor_bytes, last_processed_coverage_ids_idx = compute_event_losses( event_id, coverages, compute[:compute_i], items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, - damage_bins, loss_threshold, losses_buffer, 
alloc_rule, rndms, debug, + damage_bins, loss_threshold, losses_buffer, alloc_rule, rndms_base, eps_ij, correlated, debug, GULPY_STREAM_BUFF_SIZE_WRITE, int32_mv_write, cursor ) @@ -275,7 +270,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu @njit(cache=True, fastmath=True) def compute_event_losses(event_id, coverages, coverage_ids, items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, damage_bins, - loss_threshold, losses, alloc_rule, rndms, debug, buff_size, + loss_threshold, losses, alloc_rule, rndms_base, eps_ij, correlated, debug, buff_size, int32_mv, cursor): """Compute losses for an event. @@ -326,7 +321,9 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, item = items[item_i] damagecdf_i = item['damagecdf_i'] rng_index = item['rng_index'] - + peril_correlation_group = item['peril_correlation_group'] + # probably we need a peril_correlation_group_index and an array that maps index to peril_correlation_group values. + # for now, let's assume peril_correlation_group start from 0. rec = recs[rec_idx_ptr[damagecdf_i]:rec_idx_ptr[damagecdf_i + 1]] prob_to = rec['prob_to'] bin_mean = rec['bin_mean'] @@ -344,14 +341,26 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, losses[MEAN_IDX, item_i] = gul_mean if sample_size > 0: + if correlated: + # TODO: pass these variables in + rndms = get_corr_rval( + eps_ij[peril_correlation_group], rndms_base[rng_index], + rho, arr_min, arr_max, arr_N, norm_inv_cdf, + arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, sample_size + ) + else: + rndms = rndms_base[rng_index] + + # this can be optimized by caching rndms with a dict + if debug: for sample_idx in range(1, sample_size + 1): - rval = rndms[rng_index][sample_idx - 1] + rval = rndms[sample_idx - 1] losses[sample_idx, item_i] = rval else: for sample_idx in range(1, sample_size + 1): # cap `rval` to the maximum `prob_to` value (which should be 1.) - rval = rndms[rng_index][sample_idx - 1] + rval = rndms[sample_idx - 1] if rval >= prob_to[Nbins - 1]: rval = prob_to[Nbins - 1] - 0.00000003 diff --git a/oasislmf/pytools/gul/random.py b/oasislmf/pytools/gul/random.py index 8201d23558..44eaa39826 100644 --- a/oasislmf/pytools/gul/random.py +++ b/oasislmf/pytools/gul/random.py @@ -80,7 +80,7 @@ def get_random_generator(random_generator): @njit(cache=True, fastmath=True) -def generate_correlated_hash_vector(peril_correlation_group, event_id, base_seed=0): +def generate_correlated_hash_vector(peril_correlation_groups, event_id, base_seed=0): """Generate hash for an `event_id`. Args: @@ -90,12 +90,17 @@ def generate_correlated_hash_vector(peril_correlation_group, event_id, base_seed Returns: int64: hash """ - for i in range(1, peril_correlation_group.shape[0]): # why start from 1?? - peril_correlation_group[i] = (base_seed + - (i * PERIL_CORRELATION_GROUP_HASH) % HASH_MOD_CODE + - (event_id * EVENT_ID_HASH_CODE) % HASH_MOD_CODE) % HASH_MOD_CODE + Nperil_correlation_groups = peril_correlation_groups.shape[0] + correlated_hashes = np.zeros(Nperil_correlation_groups, dtype='int64') - return peril_correlation_group + for i in range(Nperil_correlation_groups): # why start from 1?? 
+ correlated_hashes[i] = ( + base_seed + + (peril_correlation_groups[i] * PERIL_CORRELATION_GROUP_HASH) % HASH_MOD_CODE + + (event_id * EVENT_ID_HASH_CODE) % HASH_MOD_CODE + ) % HASH_MOD_CODE + + return correlated_hashes def compute_norm_inv_cdf_lookup(arr_min, arr_max, arr_N): @@ -116,9 +121,9 @@ def get_corr_rval(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, ar sqrt_rho = sqrt(rho) sqrt_1_minus_rho = sqrt(1. - rho) + z_unif = np.zeros(x_unif.shape[0], dtype='float64') for i in range(Nsamples): - x_norm = norm_inv_cdf[get_norm_cdf_cell_nb(x_unif[i], arr_min, arr_max, arr_N)] y_norm = norm_inv_cdf[get_norm_cdf_cell_nb(y_unif[i], arr_min, arr_max, arr_N)] z_norm = sqrt_rho * x_norm + sqrt_1_minus_rho * y_norm diff --git a/oasislmf/pytools/gulpy.py b/oasislmf/pytools/gulpy.py index 30be823f0d..6df677c0ce 100644 --- a/oasislmf/pytools/gulpy.py +++ b/oasislmf/pytools/gulpy.py @@ -12,7 +12,9 @@ ) parser.add_argument('-a', help='back-allocation rule', default=0, type=int, dest='alloc_rule') -parser.add_argument('-c', '--correlated', help='(wip) if passed, it activates the correlated rng', +parser.add_argument('-c', '--correlated', + help='[EXPERIMENTAL] if passed, uses peril correlation groups to produce ' + 'correlated samples for items within the same peril correlation group', action='store_true', dest='correlated', default=False) parser.add_argument('-d', help='output random numbers instead of gul (default: False).', default=False, action='store_true', dest='debug') From 614dd3de85d8779074020596a813f1f396108cd3 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Tue, 12 Jul 2022 16:46:39 +0100 Subject: [PATCH 04/33] [wip] --- oasislmf/pytools/gul/io.py | 2 ++ oasislmf/pytools/gul/manager.py | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/oasislmf/pytools/gul/io.py b/oasislmf/pytools/gul/io.py index 259f6a88f8..d17ad4ed46 100644 --- a/oasislmf/pytools/gul/io.py +++ b/oasislmf/pytools/gul/io.py @@ -245,6 +245,8 @@ def stream_to_data(int32_mv, valid_buf, size_cdf_entry, last_event_id, item_map, seeds[rng_index] = generate_hash(group_id, last_event_id) this_rng_index = rng_index rng_index += 1 + + # TODO Q for Stephane: should we create a hash for one group_id, for all peril correlation groups by default? else: this_rng_index = group_id_rng_index[group_id] diff --git a/oasislmf/pytools/gul/manager.py b/oasislmf/pytools/gul/manager.py index dca002b611..71433dd3c3 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -159,6 +159,15 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # read coverages from file coverages_tiv = get_coverages(input_path) + # TODO finish here + # get model settings + # from oasislmf.preparation.correlations import get_model_settings + # get_model_settings("model_settings.json") + + # from oasislmf.pytools.data_layer.conversions.correlations import CorrelationsData + # file_path = os.path.join(input_path, 'correlations.bin') + # data = CorrelationsData.from_bin(file_path=file_path) + # init the structure for computation # coverages are numbered from 1, therefore we skip element 0 in `coverages` coverages = np.zeros(coverages_tiv.shape[0] + 1, coverage_type) @@ -295,7 +304,7 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, buff_size (int): size in bytes of the output buffer. int32_mv (numpy.ndarray): int32 view of the memoryview where the output is buffered. cursor (int): index of int32_mv where to start writing. 
- + Returns: int, int, int: updated value of cursor, updated value of cursor_bytes, last last_processed_coverage_ids_idx """ From d407089d4d0561ab69caff2ae483df4d91dcaa67 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Fri, 15 Jul 2022 16:38:39 +0100 Subject: [PATCH 05/33] [gulpy] working implementation of the correlated random values --- oasislmf/pytools/gul/manager.py | 100 +++++++++++++++++++------------- oasislmf/pytools/gul/random.py | 29 +++++---- 2 files changed, 75 insertions(+), 54 deletions(-) diff --git a/oasislmf/pytools/gul/manager.py b/oasislmf/pytools/gul/manager.py index 71433dd3c3..0defb0e308 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -12,8 +12,10 @@ from numba import njit from numba.typed import Dict, List +from oasislmf.pytools.data_layer.conversions.correlations import CorrelationsData + from oasislmf.pytools.getmodel.manager import get_damage_bins, Item -from oasislmf.pytools.getmodel.common import oasis_float +from oasislmf.pytools.getmodel.common import oasis_float, Correlation from oasislmf.pytools.gul.common import ( MEAN_IDX, STD_DEV_IDX, TIV_IDX, CHANCE_OF_LOSS_IDX, MAX_LOSS_IDX, NUM_IDX, @@ -159,15 +161,6 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # read coverages from file coverages_tiv = get_coverages(input_path) - # TODO finish here - # get model settings - # from oasislmf.preparation.correlations import get_model_settings - # get_model_settings("model_settings.json") - - # from oasislmf.pytools.data_layer.conversions.correlations import CorrelationsData - # file_path = os.path.join(input_path, 'correlations.bin') - # data = CorrelationsData.from_bin(file_path=file_path) - # init the structure for computation # coverages are numbered from 1, therefore we skip element 0 in `coverages` coverages = np.zeros(coverages_tiv.shape[0] + 1, coverage_type) @@ -224,46 +217,71 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # create the array to store the seeds seeds = np.zeros(len(np.unique(items['group_id'])), dtype=Item.dtype['group_id']) - # pre-compute lookup tables for the Gaussian cdf and inverse cdf - # this is used for the generation of correlated random numbers + logger.info(f"Correlated random number generation: switched {'ON' if correlated else 'OFF'}") + if correlated: + file_path = os.path.join(input_path, 'correlations.bin') + data = CorrelationsData.from_bin(file_path=file_path).data + Nperil_correlation_groups = len(data) + + corr_data_by_item_id = np.ndarray(Nperil_correlation_groups + 1, dtype=Correlation) + corr_data_by_item_id[0] = (0, 0.) + corr_data_by_item_id[1:]['peril_correlation_group'] = np.array(data['peril_correlation_group']) + corr_data_by_item_id[1:]['correlation_value'] = np.array(data['correlation_value']) + + logger.info( + f"Correlation values for {Nperil_correlation_groups} peril correlation groups have been imported." + ) + + unique_peril_correlation_groups = np.unique(corr_data_by_item_id[1:]['peril_correlation_group']) - # Note: - # - the size `arr_N` and `arr_N_cdf` can be increased to achieve better resolution in the Gaussian cdf and inv cdf. - # - the function `get_corr_rval` to compute the correlated numbers is not affected by arr_N and arr_N_cdf - arr_min = 1e-16 - arr_max = 1 - 1e-16 - arr_N = 1000000 + # pre-compute lookup tables for the Gaussian cdf and inverse cdf + # Notes: + # - the size `arr_N` and `arr_N_cdf` can be increased to achieve better resolution in the Gaussian cdf and inv cdf. 
+ # - the function `get_corr_rval` to compute the correlated numbers is not affected by arr_N and arr_N_cdf + arr_min, arr_max, arr_N = 1e-16, 1 - 1e-16, 1000000 + arr_min_cdf, arr_max_cdf, arr_N_cdf = -20., 20., 1000000 + norm_inv_cdf = compute_norm_inv_cdf_lookup(arr_min, arr_max, arr_N) + norm_cdf = compute_norm_cdf_lookup(arr_min_cdf, arr_max_cdf, arr_N_cdf) - arr_min_cdf = -20. - arr_max_cdf = 20. - arr_N_cdf = 1000000 + # buffer to be re-used to store all the correlated random values + z_unif = np.zeros(sample_size, dtype='float64') - norm_inv_cdf = compute_norm_inv_cdf_lookup(arr_min, arr_max, arr_N) - norm_cdf = compute_norm_cdf_lookup(arr_min_cdf, arr_max_cdf, arr_N_cdf) + else: + # create dummy data structures with proper dtypes to allow correct numba compilation + corr_data_by_item_id = np.ndarray(1, dtype=Correlation) + arr_min, arr_max, arr_N = 0, 0, 0 + arr_min_cdf, arr_max_cdf, arr_N_cdf = 0, 0, 0 + norm_inv_cdf, norm_cdf = np.zeros(1, dtype='float64'), np.zeros(1, dtype='float64') + z_unif = np.zeros(1, dtype='float64') # create buffer to be reused to store all losses for one coverage losses_buffer = np.zeros((sample_size + NUM_IDX + 1, np.max(coverages[1:]['max_items'])), dtype=oasis_float) - z_unif = np.zeros(sample_size) for event_data in read_getmodel_stream(streams_in, item_map, coverages, compute, seeds): event_id, compute_i, items_data, recs, rec_idx_ptr, rng_index = event_data + # generation of "base" random values is done as before + rndms_base = generate_rndm(seeds[:rng_index], sample_size) + + # to generate the correlated part, we do the hashing here for now (instead of in stream_to_data) # generate the correlated samples for the whole event, for all peril correlation groups - peril_correlation_group_ids = [1, 2, 3] # TODO get it from input data - # Nperil_correlation_groups = len(peril_correlation_group_ids) - corr_seeds = generate_correlated_hash_vector(peril_correlation_group_ids, event_id) - eps_ij = generate_rndm(corr_seeds, sample_size) - # Nseeds = len(seeds[:rng_index]) + if correlated: + corr_seeds = generate_correlated_hash_vector(unique_peril_correlation_groups, event_id) + eps_ij = generate_rndm(corr_seeds, sample_size, skip_seeds=1) - rndms_base = generate_rndm(seeds[:rng_index], sample_size) + else: + # create dummy data structures with proper dtypes to allow correct numba compilation + corr_seeds = np.zeros(1, dtype='int64') + eps_ij = np.zeros((1, 1), dtype='float64') last_processed_coverage_ids_idx = 0 while last_processed_coverage_ids_idx < compute_i: cursor, cursor_bytes, last_processed_coverage_ids_idx = compute_event_losses( event_id, coverages, compute[:compute_i], items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, - damage_bins, loss_threshold, losses_buffer, alloc_rule, rndms_base, eps_ij, correlated, debug, + damage_bins, loss_threshold, losses_buffer, alloc_rule, correlated, rndms_base, eps_ij, corr_data_by_item_id, + arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, z_unif, debug, GULPY_STREAM_BUFF_SIZE_WRITE, int32_mv_write, cursor ) @@ -279,8 +297,9 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu @njit(cache=True, fastmath=True) def compute_event_losses(event_id, coverages, coverage_ids, items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, damage_bins, - loss_threshold, losses, alloc_rule, rndms_base, eps_ij, correlated, debug, buff_size, - int32_mv, cursor): + loss_threshold, losses, alloc_rule, correlated, 
rndms_base, eps_ij, corr_data_by_item_id, + arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, + z_unif, debug, buff_size, int32_mv, cursor): """Compute losses for an event. Args: @@ -309,6 +328,7 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, int, int, int: updated value of cursor, updated value of cursor_bytes, last last_processed_coverage_ids_idx """ max_size_per_item = (sample_size + NUM_IDX + 1) * gulSampleslevelRec_size + 2 * gulSampleslevelHeader_size + for coverage_i in range(last_processed_coverage_ids_idx, coverage_ids.shape[0]): coverage = coverages[coverage_ids[coverage_i]] tiv = coverage['tiv'] # coverages are indexed from 1 @@ -330,9 +350,6 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, item = items[item_i] damagecdf_i = item['damagecdf_i'] rng_index = item['rng_index'] - peril_correlation_group = item['peril_correlation_group'] - # probably we need a peril_correlation_group_index and an array that maps index to peril_correlation_group values. - # for now, let's assume peril_correlation_group start from 0. rec = recs[rec_idx_ptr[damagecdf_i]:rec_idx_ptr[damagecdf_i + 1]] prob_to = rec['prob_to'] bin_mean = rec['bin_mean'] @@ -351,12 +368,17 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, if sample_size > 0: if correlated: - # TODO: pass these variables in - rndms = get_corr_rval( + item_corr_data = corr_data_by_item_id[item['item_id']] + peril_correlation_group = item_corr_data['peril_correlation_group'] + rho = item_corr_data['correlation_value'] + + get_corr_rval( eps_ij[peril_correlation_group], rndms_base[rng_index], rho, arr_min, arr_max, arr_N, norm_inv_cdf, - arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, sample_size + arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, sample_size, z_unif ) + rndms = z_unif + else: rndms = rndms_base[rng_index] diff --git a/oasislmf/pytools/gul/random.py b/oasislmf/pytools/gul/random.py index 44eaa39826..b990064a5b 100644 --- a/oasislmf/pytools/gul/random.py +++ b/oasislmf/pytools/gul/random.py @@ -80,7 +80,7 @@ def get_random_generator(random_generator): @njit(cache=True, fastmath=True) -def generate_correlated_hash_vector(peril_correlation_groups, event_id, base_seed=0): +def generate_correlated_hash_vector(unique_peril_correlation_groups, event_id, base_seed=0): """Generate hash for an `event_id`. Args: @@ -90,13 +90,14 @@ def generate_correlated_hash_vector(peril_correlation_groups, event_id, base_see Returns: int64: hash """ - Nperil_correlation_groups = peril_correlation_groups.shape[0] - correlated_hashes = np.zeros(Nperil_correlation_groups, dtype='int64') + Nperil_correlation_groups = unique_peril_correlation_groups.shape[0] + correlated_hashes = np.empty(Nperil_correlation_groups + 1, dtype='int64') + correlated_hashes[0] = 0 - for i in range(Nperil_correlation_groups): # why start from 1?? 
+ for i in range(1, Nperil_correlation_groups + 1): correlated_hashes[i] = ( base_seed + - (peril_correlation_groups[i] * PERIL_CORRELATION_GROUP_HASH) % HASH_MOD_CODE + + (unique_peril_correlation_groups[i - 1] * PERIL_CORRELATION_GROUP_HASH) % HASH_MOD_CODE + (event_id * EVENT_ID_HASH_CODE) % HASH_MOD_CODE ) % HASH_MOD_CODE @@ -117,11 +118,11 @@ def get_norm_cdf_cell_nb(x, arr_min, arr_max, arr_N): @njit(cache=True, fastmath=True) -def get_corr_rval(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, Nsamples, z_unif): +def get_corr_rval(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, + arr_max_cdf, arr_N_cdf, norm_cdf, Nsamples, z_unif): sqrt_rho = sqrt(rho) sqrt_1_minus_rho = sqrt(1. - rho) - z_unif = np.zeros(x_unif.shape[0], dtype='float64') for i in range(Nsamples): x_norm = norm_inv_cdf[get_norm_cdf_cell_nb(x_unif[i], arr_min, arr_max, arr_N)] @@ -130,11 +131,9 @@ def get_corr_rval(x_unif, y_unif, rho, arr_min, arr_max, arr_N, norm_inv_cdf, ar z_unif[i] = norm_cdf[get_norm_cdf_cell_nb(z_norm, arr_min_cdf, arr_max_cdf, arr_N_cdf)] - return z_unif - @njit(cache=True, fastmath=True) -def random_MersenneTwister(seeds, n): +def random_MersenneTwister(seeds, n, skip_seeds=0): """Generate random numbers using the default Mersenne Twister algorithm. Args: @@ -150,9 +149,9 @@ def random_MersenneTwister(seeds, n): Nseeds = len(seeds) rndms = np.zeros((Nseeds, n), dtype='float64') - for seed_i, seed in enumerate(seeds): + for seed_i in range(skip_seeds, Nseeds, 1): # set the seed - np.random.seed(seed) + np.random.seed(seeds[seed_i]) # draw the random numbers for j in range(n): @@ -163,7 +162,7 @@ def random_MersenneTwister(seeds, n): @njit(cache=True, fastmath=True) -def random_LatinHypercube(seeds, n): +def random_LatinHypercube(seeds, n, skip_seeds=0): """Generate random numbers using the Latin Hypercube algorithm. Args: @@ -188,9 +187,9 @@ def random_LatinHypercube(seeds, n): samples = np.zeros(n, dtype='float64') perms = np.zeros(n, dtype='float64') - for seed_i, seed in enumerate(seeds): + for seed_i in range(skip_seeds, Nseeds, 1): # set the seed - np.random.seed(seed) + np.random.seed(seeds[seed_i]) # draw the random numbers and re-generate permutations array for i in range(n): From 5417f9b416cf1697c2a1663fd5036677879fa586 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Fri, 15 Jul 2022 17:31:43 +0100 Subject: [PATCH 06/33] minor cleanup --- oasislmf/pytools/gul/io.py | 3 +-- oasislmf/pytools/gul/manager.py | 5 ----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/oasislmf/pytools/gul/io.py b/oasislmf/pytools/gul/io.py index 6fb8ddcc7e..2b4f1343b7 100644 --- a/oasislmf/pytools/gul/io.py +++ b/oasislmf/pytools/gul/io.py @@ -94,7 +94,7 @@ def read_getmodel_stream(stream_in, item_map, coverages, compute, seeds, valid_a if valid_area_peril_id is not None: valid_area_peril_dict = gen_valid_area_peril(valid_area_peril_id) else: - valid_area_peril_dict=None + valid_area_peril_dict = None # init data structures group_id_rng_index, rec_idx_ptr = gen_structs() @@ -267,7 +267,6 @@ def stream_to_data(int32_mv, valid_buf, size_cdf_entry, last_event_id, item_map, this_rng_index = rng_index rng_index += 1 - # TODO Q for Stephane: should we create a hash for one group_id, for all peril correlation groups by default? 
else: this_rng_index = group_id_rng_index[group_id] diff --git a/oasislmf/pytools/gul/manager.py b/oasislmf/pytools/gul/manager.py index b06ff1f926..489381df05 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -2,7 +2,6 @@ This file is the entry point for the gul command for the package. """ -from random import sample import sys import os from select import select @@ -155,7 +154,6 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu input_path = os.path.join(run_dir, 'input') ignore_file_type = set(ignore_file_type) - damage_bins = get_damage_bins(static_path) # read coverages from file @@ -170,7 +168,6 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu else: valid_area_peril_id = None - # init the structure for computation # coverages are numbered from 1, therefore we skip element 0 in `coverages` coverages = np.zeros(coverages_tiv.shape[0] + 1, coverage_type) @@ -392,8 +389,6 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, else: rndms = rndms_base[rng_index] - # this can be optimized by caching rndms with a dict - if debug: for sample_idx in range(1, sample_size + 1): rval = rndms[sample_idx - 1] From 5ad10f9792ae855efe3a5409f8dcc2bc200fe4b6 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Fri, 22 Jul 2022 15:52:43 +0100 Subject: [PATCH 07/33] [gulpy] Update docstrings for random module functions --- oasislmf/pytools/gul/random.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/oasislmf/pytools/gul/random.py b/oasislmf/pytools/gul/random.py index b990064a5b..388626cb25 100644 --- a/oasislmf/pytools/gul/random.py +++ b/oasislmf/pytools/gul/random.py @@ -81,14 +81,15 @@ def get_random_generator(random_generator): @njit(cache=True, fastmath=True) def generate_correlated_hash_vector(unique_peril_correlation_groups, event_id, base_seed=0): - """Generate hash for an `event_id`. + """Generate hashes for all peril correlation groups for a given `event_id`. Args: + unique_peril_correlation_groups (List[int]): list of the unique peril correlation groups. event_id (int): event id. base_seed (int, optional): base random seed. Defaults to 0. Returns: - int64: hash + List[int64]: hashes """ Nperil_correlation_groups = unique_peril_correlation_groups.shape[0] correlated_hashes = np.empty(Nperil_correlation_groups + 1, dtype='int64') @@ -139,6 +140,10 @@ def random_MersenneTwister(seeds, n, skip_seeds=0): Args: seeds (List[int64]): List of seeds. n (int): number of random samples to generate for each seed. + skip_seeds (int): number of seeds to skip starting from the beginning + of the `seeds` array. For skipped seeds no random numbers are generated + and the output rndms will contain zeros at their corresponding row. + Default is 0, i.e. no seeds are skipped. Returns: rndms (array[float]): 2-d array of shape (number of seeds, n) @@ -174,6 +179,10 @@ def random_LatinHypercube(seeds, n, skip_seeds=0): containing the random values generated for each seed. rndms_idx (Dict[int64, int]): mapping between `seed` and the row in rndms that stores the corresponding random values. + skip_seeds (int): number of seeds to skip starting from the beginning + of the `seeds` array. For skipped seeds no random numbers are generated + and the output rndms will contain zeros at their corresponding row. + Default is 0, i.e. no seeds are skipped. Notes: Implementation follows scipy.stats.qmc.LatinHypercube v1.8.0. 
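The seed hashing whose docstring is updated above in `generate_correlated_hash_vector` can be restated in plain Python as below; the constants are the ones defined at the top of random.py, while the helper name `correlated_seed` is illustrative and not part of the module:

    EVENT_ID_HASH_CODE = 1943272559
    PERIL_CORRELATION_GROUP_HASH = 1836311903
    HASH_MOD_CODE = 2147483648  # 2 ** 31

    def correlated_seed(peril_correlation_group, event_id, base_seed=0):
        # one deterministic seed per (event_id, peril correlation group) pair
        return (base_seed
                + (peril_correlation_group * PERIL_CORRELATION_GROUP_HASH) % HASH_MOD_CODE
                + (event_id * EVENT_ID_HASH_CODE) % HASH_MOD_CODE) % HASH_MOD_CODE

    # e.g. seeds for peril correlation groups 1..3 of event 42
    print([correlated_seed(group, event_id=42) for group in (1, 2, 3)])

In the patches above, element 0 of the returned hash vector is a dummy entry and `generate_rndm` is called with `skip_seeds=1`, so the resulting `eps_ij` array can be indexed directly with the 1-based `peril_correlation_group` value.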
From 2066aa25a3e788cc7ab47c6b6ad8f8eb24cda74d Mon Sep 17 00:00:00 2001 From: mtazzari Date: Tue, 2 Aug 2022 11:05:50 +0100 Subject: [PATCH 08/33] [gulpy] remove unused generate_correlated_hash --- oasislmf/pytools/gul/random.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/oasislmf/pytools/gul/random.py b/oasislmf/pytools/gul/random.py index 388626cb25..2310bbdd14 100644 --- a/oasislmf/pytools/gul/random.py +++ b/oasislmf/pytools/gul/random.py @@ -35,22 +35,6 @@ def generate_hash(group_id, event_id, base_seed=0): return hash -@njit(cache=True, fastmath=True) -def generate_correlated_hash(event_id, base_seed=0): - """Generate hash for an `event_id`. - - Args: - event_id (int): event id. - base_seed (int, optional): base random seed. Defaults to 0. - - Returns: - int64: hash - """ - hash = (base_seed + (event_id * EVENT_ID_HASH_CODE) % HASH_MOD_CODE) % HASH_MOD_CODE - - return hash - - def get_random_generator(random_generator): """Get the random generator function. From f0311c0b37b980913245ba06ea0bdf261eece736 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Tue, 2 Aug 2022 11:57:23 +0100 Subject: [PATCH 09/33] [gulpy] introduce --ignore-correlation flag --- oasislmf/pytools/gul/manager.py | 32 ++++++++++++++++++++++---------- oasislmf/pytools/gulpy.py | 7 +++---- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/oasislmf/pytools/gul/manager.py b/oasislmf/pytools/gul/manager.py index 489381df05..ce4c0eeaca 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -125,7 +125,7 @@ def generate_item_map(items, coverages): def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debug, - random_generator, peril_filter=[], file_in=None, file_out=None, correlated=False, **kwargs): + random_generator, peril_filter=[], file_in=None, file_out=None, ignore_correlation=False, **kwargs): """Execute the main gulpy worklow. Args: @@ -139,6 +139,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu random_generator (int): random generator function id. file_in (str, optional): filename of input stream. Defaults to None. file_out (str, optional): filename of output stream. Defaults to None. + ignore_correlation (bool): if True, do not compute correlated random samples. Raises: ValueError: if alloc_rule is not 0, 1, or 2. 
@@ -224,11 +225,21 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # create the array to store the seeds seeds = np.zeros(len(np.unique(items['group_id'])), dtype=Item.dtype['group_id']) - logger.info(f"Correlated random number generation: switched {'ON' if correlated else 'OFF'}") - if correlated: - file_path = os.path.join(input_path, 'correlations.bin') - data = CorrelationsData.from_bin(file_path=file_path).data - Nperil_correlation_groups = len(data) + file_path = os.path.join(input_path, 'correlations.bin') + data = CorrelationsData.from_bin(file_path=file_path).data + Nperil_correlation_groups = len(data) + logger.info(f"Detected {Nperil_correlation_groups} peril correlation groups.") + + if Nperil_correlation_groups == 0: + ignore_correlation = True + logger.info(f"Correlated random number generation: switched OFF because 0 peril correlation groups were detected.") + + else: + if ignore_correlation: + logger.info(f"Correlated random number generation: switched OFF because --ignore-correlation is True.") + + if not ignore_correlation: + logger.info(f"Correlated random number generation: switched ON.") corr_data_by_item_id = np.ndarray(Nperil_correlation_groups + 1, dtype=Correlation) corr_data_by_item_id[0] = (0, 0.) @@ -273,7 +284,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # to generate the correlated part, we do the hashing here for now (instead of in stream_to_data) # generate the correlated samples for the whole event, for all peril correlation groups - if correlated: + if not ignore_correlation: corr_seeds = generate_correlated_hash_vector(unique_peril_correlation_groups, event_id) eps_ij = generate_rndm(corr_seeds, sample_size, skip_seeds=1) @@ -287,7 +298,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu cursor, cursor_bytes, last_processed_coverage_ids_idx = compute_event_losses( event_id, coverages, compute[:compute_i], items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, - damage_bins, loss_threshold, losses_buffer, alloc_rule, correlated, rndms_base, eps_ij, corr_data_by_item_id, + damage_bins, loss_threshold, losses_buffer, alloc_rule, ignore_correlation, rndms_base, eps_ij, corr_data_by_item_id, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, z_unif, debug, GULPY_STREAM_BUFF_SIZE_WRITE, int32_mv_write, cursor ) @@ -304,7 +315,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu @njit(cache=True, fastmath=True) def compute_event_losses(event_id, coverages, coverage_ids, items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, damage_bins, - loss_threshold, losses, alloc_rule, correlated, rndms_base, eps_ij, corr_data_by_item_id, + loss_threshold, losses, alloc_rule, ignore_correlation, rndms_base, eps_ij, corr_data_by_item_id, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, z_unif, debug, buff_size, int32_mv, cursor): """Compute losses for an event. @@ -323,6 +334,7 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, loss_threshold (float): threshold above which losses are printed to the output stream. losses (numpy.array[oasis_float]): array (to be re-used) to store losses for all item_ids. alloc_rule (int): back-allocation rule. + ignore_correlation (bool): if True, do not compute correlated random samples. 
rndms (numpy.array[float64]): 2d array of shape (number of seeds, sample_size) storing the random values drawn for each seed. debug (bool): if True, for each random sample, print to the output stream the random value @@ -374,7 +386,7 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, losses[MEAN_IDX, item_i] = gul_mean if sample_size > 0: - if correlated: + if not ignore_correlation: item_corr_data = corr_data_by_item_id[item['item_id']] peril_correlation_group = item_corr_data['peril_correlation_group'] rho = item_corr_data['correlation_value'] diff --git a/oasislmf/pytools/gulpy.py b/oasislmf/pytools/gulpy.py index 2d52bc586f..f9c961ffda 100644 --- a/oasislmf/pytools/gulpy.py +++ b/oasislmf/pytools/gulpy.py @@ -12,10 +12,9 @@ ) parser.add_argument('-a', help='back-allocation rule', default=0, type=int, dest='alloc_rule') -parser.add_argument('-c', '--correlated', - help='[EXPERIMENTAL] if passed, uses peril correlation groups to produce ' - 'correlated samples for items within the same peril correlation group', - action='store_true', dest='correlated', default=False) +parser.add_argument('--ignore-correlation', + help='if passed, peril correlation groups (if defined) are ignored for the generation of correlated samples', + action='store_true', dest='ignore_correlation', default=False) parser.add_argument('-d', help='output random numbers instead of gul (default: False).', default=False, action='store_true', dest='debug') parser.add_argument('-i', '--file-in', help='filename of input stream.', action='store', type=str, dest='file_in') From 1709cee8069ec760eb36dcc5c6e0a7e6a06144c4 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Wed, 3 Aug 2022 14:34:59 +0100 Subject: [PATCH 10/33] set hashed_group_id to True by default, cleanup --- oasislmf/computation/generate/files.py | 6 +++--- oasislmf/preparation/gul_inputs.py | 12 +++--------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index 6e9930d6a8..79d8620365 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -104,7 +104,8 @@ class GenerateFiles(ComputationStep): {'name': 'disable_summarise_exposure', 'flag':'-S', 'default': False, 'type': str2bool, 'const':True, 'nargs':'?', 'help': 'Disables creation of an exposure summary report'}, {'name': 'group_id_cols', 'flag':'-G', 'nargs':'+', 'help': 'Columns from loc file to set group_id', 'default': GROUP_ID_COLS}, {'name': 'lookup_multiprocessing', 'type': str2bool, 'const': False, 'nargs':'?', 'default': False, 'help': 'Flag to enable/disable lookup multiprocessing'}, - {"name": "hashed_group_id", "type": str2bool, "const": False, 'nargs':'?', "default": False, "help": "Hashes the group_id in the items.bin"}, + {"name": "hashed_group_id", "type": str2bool, "const": False, 'nargs': '?', + "default": True, "help": "Hashes the group_id in the items.bin"}, # Manager only options (pass data directy instead of filepaths) {'name': 'lookup_config'}, @@ -232,7 +233,7 @@ def run(self): keys_df, exposure_profile=location_profile, group_id_cols=group_id_cols, - hash_group_ids=self.hashed_group_id, + hashed_group_id=self.hashed_group_id, ) if self.model_settings_json is not None: @@ -267,7 +268,6 @@ def run(self): output_dir=self._get_output_dir(), oasis_files_prefixes=files_prefixes['gul'], chunksize=self.write_chunksize, - hashed_item_id=self.hashed_group_id ) gul_summary_mapping = get_summary_mapping(gul_inputs_df, oed_hierarchy) diff --git 
a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index a90421a9cc..87efd900bb 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -51,7 +51,7 @@ def get_gul_input_items( keys_df, exposure_profile=get_default_exposure_profile(), group_id_cols=['loc_id'], - hash_group_ids=False + hashed_group_id=True ): """ Generates and returns a Pandas dataframe of GUL input items. @@ -317,7 +317,7 @@ def get_gul_input_items( if correlation_check is True: gul_inputs_df['group_id'] = gul_inputs_df[correlation_group_id] - elif hash_group_ids is False: + elif hashed_group_id is False: if len(group_id_cols) > 1: gul_inputs_df['group_id'] = factorize_ndarray( @@ -331,7 +331,7 @@ def get_gul_input_items( gul_inputs_df[group_id_cols[0]].values )[0] - # this block gets fired if the hash_group_ids is True + # this block gets fired if the hashed_group_id is True else: gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[group_id_cols]).to_numpy() >> 33) @@ -446,7 +446,6 @@ def write_gul_input_files( output_dir, oasis_files_prefixes=copy.deepcopy(OASIS_FILES_PREFIXES['gul']), chunksize=(2 * 10 ** 5), - hashed_item_id=False ): """ Writes the standard Oasis GUL input files to a target directory, using a @@ -504,9 +503,4 @@ def write_gul_input_files( for fn in gul_input_files: getattr(this_module, 'write_{}_file'.format(fn))(gul_inputs_df.copy(deep=True), gul_input_files[fn], chunksize) - # if hashed_item_id is True: - # input_file = gul_input_files["items"] - # input_directory = "/".join(input_file.split("/")[:-1]) + "/" - # convert_item_csv_to_hash(input_directory=input_directory) - return gul_input_files From 2222be2cb930e09ced342c77fe7d3905533f76be Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Mon, 8 Aug 2022 14:11:40 +0100 Subject: [PATCH 11/33] adding haahing patch --- oasislmf/computation/generate/files.py | 2 +- oasislmf/preparation/gul_inputs.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index 6e9930d6a8..bdd55ae0d2 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -104,7 +104,7 @@ class GenerateFiles(ComputationStep): {'name': 'disable_summarise_exposure', 'flag':'-S', 'default': False, 'type': str2bool, 'const':True, 'nargs':'?', 'help': 'Disables creation of an exposure summary report'}, {'name': 'group_id_cols', 'flag':'-G', 'nargs':'+', 'help': 'Columns from loc file to set group_id', 'default': GROUP_ID_COLS}, {'name': 'lookup_multiprocessing', 'type': str2bool, 'const': False, 'nargs':'?', 'default': False, 'help': 'Flag to enable/disable lookup multiprocessing'}, - {"name": "hashed_group_id", "type": str2bool, "const": False, 'nargs':'?', "default": False, "help": "Hashes the group_id in the items.bin"}, + {"name": "hashed_group_id", "type": str2bool, "const": True, 'nargs':'?', "default": True, "help": "Hashes the group_id in the items.bin"}, # Manager only options (pass data directy instead of filepaths) {'name': 'lookup_config'}, diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index a90421a9cc..230dc03cf1 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -333,7 +333,10 @@ def get_gul_input_items( # this block gets fired if the hash_group_ids is True else: - gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[group_id_cols]).to_numpy() >> 33) + para_chain = 
gul_inputs_df.drop_duplicates(subset=group_id_cols).reset_index(drop=True) + para_chain["group_id"] = pd.util.hash_pandas_object(para_chain[group_id_cols]) + para_chain = para_chain[group_id_cols + ["group_id"]] + gul_inputs_df = pd.merge(gul_inputs_df, para_chain, how='left', left_on=group_id_cols, right_on=group_id_cols) gul_inputs_df['group_id'] = gul_inputs_df['group_id'].astype('uint32') From 2c0d5e3c230d81923d1c0463f56593a6b8a93dd9 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Mon, 8 Aug 2022 14:14:30 +0100 Subject: [PATCH 12/33] adding haahing patch --- oasislmf/computation/generate/files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index bdd55ae0d2..6e9930d6a8 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -104,7 +104,7 @@ class GenerateFiles(ComputationStep): {'name': 'disable_summarise_exposure', 'flag':'-S', 'default': False, 'type': str2bool, 'const':True, 'nargs':'?', 'help': 'Disables creation of an exposure summary report'}, {'name': 'group_id_cols', 'flag':'-G', 'nargs':'+', 'help': 'Columns from loc file to set group_id', 'default': GROUP_ID_COLS}, {'name': 'lookup_multiprocessing', 'type': str2bool, 'const': False, 'nargs':'?', 'default': False, 'help': 'Flag to enable/disable lookup multiprocessing'}, - {"name": "hashed_group_id", "type": str2bool, "const": True, 'nargs':'?', "default": True, "help": "Hashes the group_id in the items.bin"}, + {"name": "hashed_group_id", "type": str2bool, "const": False, 'nargs':'?', "default": False, "help": "Hashes the group_id in the items.bin"}, # Manager only options (pass data directy instead of filepaths) {'name': 'lookup_config'}, From bb0585863d8e1a54f0513913ba0e5fde40fc03a2 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Thu, 11 Aug 2022 15:34:26 +0100 Subject: [PATCH 13/33] [gulpy] minor cleanup files.py parameter on same line --- oasislmf/computation/generate/files.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index 79d8620365..0803074821 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -104,8 +104,7 @@ class GenerateFiles(ComputationStep): {'name': 'disable_summarise_exposure', 'flag':'-S', 'default': False, 'type': str2bool, 'const':True, 'nargs':'?', 'help': 'Disables creation of an exposure summary report'}, {'name': 'group_id_cols', 'flag':'-G', 'nargs':'+', 'help': 'Columns from loc file to set group_id', 'default': GROUP_ID_COLS}, {'name': 'lookup_multiprocessing', 'type': str2bool, 'const': False, 'nargs':'?', 'default': False, 'help': 'Flag to enable/disable lookup multiprocessing'}, - {"name": "hashed_group_id", "type": str2bool, "const": False, 'nargs': '?', - "default": True, "help": "Hashes the group_id in the items.bin"}, + {"name": "hashed_group_id", 'type': str2bool, 'const': False, 'nargs': '?', 'default': True, 'help': "Hashes the group_id in the items.bin"}, # Manager only options (pass data directy instead of filepaths) {'name': 'lookup_config'}, From e593f0ca0d60757f88c1fe7c757629ca430fed40 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Thu, 11 Aug 2022 16:20:39 +0100 Subject: [PATCH 14/33] [gulpy] run correlation only if rho>0 --- oasislmf/pytools/gul/manager.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/oasislmf/pytools/gul/manager.py 
b/oasislmf/pytools/gul/manager.py index 14102b60c8..b40e754798 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -389,15 +389,20 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, if sample_size > 0: if not ignore_correlation: item_corr_data = corr_data_by_item_id[item['item_id']] - peril_correlation_group = item_corr_data['peril_correlation_group'] rho = item_corr_data['correlation_value'] - get_corr_rval( - eps_ij[peril_correlation_group], rndms_base[rng_index], - rho, arr_min, arr_max, arr_N, norm_inv_cdf, - arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, sample_size, z_unif - ) - rndms = z_unif + if rho > 0: + peril_correlation_group = item_corr_data['peril_correlation_group'] + + get_corr_rval( + eps_ij[peril_correlation_group], rndms_base[rng_index], + rho, arr_min, arr_max, arr_N, norm_inv_cdf, + arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, sample_size, z_unif + ) + rndms = z_unif + + else: + rndms = rndms_base[rng_index] else: rndms = rndms_base[rng_index] From fbf1689e19ba887352e2c0f41d15b490d8e0b6a9 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Thu, 11 Aug 2022 16:40:58 +0100 Subject: [PATCH 15/33] updating hashing --- oasislmf/preparation/gul_inputs.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 230dc03cf1..765bae9410 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -279,7 +279,7 @@ def get_gul_input_items( # Concatenate chunks. Sort by index to preserve item_id order in generated outputs compared # to original code. - gul_inputs_df = pd.concat(gul_inputs_reformatted_chunks).sort_index().reset_index() + gul_inputs_df = pd.concat(gul_inputs_reformatted_chunks).sort_index().reset_index(drop=True, inplace=True) # Set default values and data types for BI coverage boolean, TIV, deductibles and limit dtypes = { **{t: 'uint8' for t in term_cols_ints + terms_ints}, @@ -333,10 +333,7 @@ def get_gul_input_items( # this block gets fired if the hash_group_ids is True else: - para_chain = gul_inputs_df.drop_duplicates(subset=group_id_cols).reset_index(drop=True) - para_chain["group_id"] = pd.util.hash_pandas_object(para_chain[group_id_cols]) - para_chain = para_chain[group_id_cols + ["group_id"]] - gul_inputs_df = pd.merge(gul_inputs_df, para_chain, how='left', left_on=group_id_cols, right_on=group_id_cols) + gul_inputs_df["group_id"] = pd.util.hash_pandas_object(gul_inputs_df[group_id_cols]) gul_inputs_df['group_id'] = gul_inputs_df['group_id'].astype('uint32') From d6118699ecd3833bbeb98e84080d013116537ab8 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Fri, 12 Aug 2022 11:55:42 +0100 Subject: [PATCH 16/33] [gulpy] improve flow depending on corr definitions --- oasislmf/pytools/gul/manager.py | 35 ++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/oasislmf/pytools/gul/manager.py b/oasislmf/pytools/gul/manager.py index b40e754798..445fdd720d 100644 --- a/oasislmf/pytools/gul/manager.py +++ b/oasislmf/pytools/gul/manager.py @@ -226,20 +226,23 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # create the array to store the seeds seeds = np.zeros(len(np.unique(items['group_id'])), dtype=Item.dtype['group_id']) - file_path = os.path.join(input_path, 'correlations.bin') - data = CorrelationsData.from_bin(file_path=file_path).data - Nperil_correlation_groups = len(data) - logger.info(f"Detected 
{Nperil_correlation_groups} peril correlation groups.") - - if Nperil_correlation_groups == 0: - ignore_correlation = True - logger.info(f"Correlated random number generation: switched OFF because 0 peril correlation groups were detected.") + do_correlation = False + if ignore_correlation: + logger.info(f"Correlated random number generation: switched OFF because --ignore-correlation is True.") else: - if ignore_correlation: - logger.info(f"Correlated random number generation: switched OFF because --ignore-correlation is True.") + file_path = os.path.join(input_path, 'correlations.bin') + data = CorrelationsData.from_bin(file_path=file_path).data + Nperil_correlation_groups = len(data) + logger.info(f"Detected {Nperil_correlation_groups} peril correlation groups.") + + if Nperil_correlation_groups > 0 and any(data['correlation_value'] > 0): + do_correlation = True + else: + logger.info(f"Correlated random number generation: switched OFF because 0 peril correlation groups were detected or " + "the correlation value is zero for all peril correlation groups.") - if not ignore_correlation: + if do_correlation: logger.info(f"Correlated random number generation: switched ON.") corr_data_by_item_id = np.ndarray(Nperil_correlation_groups + 1, dtype=Correlation) @@ -285,7 +288,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu # to generate the correlated part, we do the hashing here for now (instead of in stream_to_data) # generate the correlated samples for the whole event, for all peril correlation groups - if not ignore_correlation: + if do_correlation: corr_seeds = generate_correlated_hash_vector(unique_peril_correlation_groups, event_id) eps_ij = generate_rndm(corr_seeds, sample_size, skip_seeds=1) @@ -299,7 +302,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu cursor, cursor_bytes, last_processed_coverage_ids_idx = compute_event_losses( event_id, coverages, compute[:compute_i], items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, - damage_bins, loss_threshold, losses_buffer, alloc_rule, ignore_correlation, rndms_base, eps_ij, corr_data_by_item_id, + damage_bins, loss_threshold, losses_buffer, alloc_rule, do_correlation, rndms_base, eps_ij, corr_data_by_item_id, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, z_unif, debug, GULPY_STREAM_BUFF_SIZE_WRITE, int32_mv_write, cursor ) @@ -316,7 +319,7 @@ def run(run_dir, ignore_file_type, sample_size, loss_threshold, alloc_rule, debu @njit(cache=True, fastmath=True) def compute_event_losses(event_id, coverages, coverage_ids, items_data, last_processed_coverage_ids_idx, sample_size, recs, rec_idx_ptr, damage_bins, - loss_threshold, losses, alloc_rule, ignore_correlation, rndms_base, eps_ij, corr_data_by_item_id, + loss_threshold, losses, alloc_rule, do_correlation, rndms_base, eps_ij, corr_data_by_item_id, arr_min, arr_max, arr_N, norm_inv_cdf, arr_min_cdf, arr_max_cdf, arr_N_cdf, norm_cdf, z_unif, debug, buff_size, int32_mv, cursor): """Compute losses for an event. @@ -335,7 +338,7 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, loss_threshold (float): threshold above which losses are printed to the output stream. losses (numpy.array[oasis_float]): array (to be re-used) to store losses for all item_ids. alloc_rule (int): back-allocation rule. - ignore_correlation (bool): if True, do not compute correlated random samples. + do_correlation (bool): if True, compute correlated random samples. 
rndms (numpy.array[float64]): 2d array of shape (number of seeds, sample_size) storing the random values drawn for each seed. debug (bool): if True, for each random sample, print to the output stream the random value @@ -387,7 +390,7 @@ def compute_event_losses(event_id, coverages, coverage_ids, items_data, losses[MEAN_IDX, item_i] = gul_mean if sample_size > 0: - if not ignore_correlation: + if do_correlation: item_corr_data = corr_data_by_item_id[item['item_id']] rho = item_corr_data['correlation_value'] From 82fb79c01861ef77dedbf5f60c9712c14bdbb2d5 Mon Sep 17 00:00:00 2001 From: sambles Date: Fri, 12 Aug 2022 14:25:41 +0100 Subject: [PATCH 17/33] Disable GroupID hashing for acceptance tests (#1094) * Update expected acceptance tests * Revert "Update expected acceptance tests" This reverts commit ad0907fd286387f811b00537b3c59680ce13c7d4. * Default "hashed_group_id" to false in exposure run * Move hashed_group_id=F default from "RunExposure" to "RunFmTest" * Fix/pip compile (#1097) * Only install pip-tools before pip-compile * Try pinning flake8 * Revert "Try pinning flake8" This reverts commit d845d5b2051ed7aa79a3282be000153b52966cad. * Try pinning virtualenv * add --upgrade to pip install pip-tools * Fix test_get_dataframe__from_csv_file__set_col_defaults_option_and_use_defaults_ and run with falsifying example * Remove falsifying example Co-authored-by: Marco Tazzari <6020226+mtazzari@users.noreply.github.com> --- .github/workflows/oasislmf-unittest.yml | 3 +-- oasislmf/computation/run/exposure.py | 6 +++++- tests/fm/test_fm.py | 2 ++ tests/fm/test_fmpy.py | 4 +++- tests/utils/test_data.py | 3 +++ 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/.github/workflows/oasislmf-unittest.yml b/.github/workflows/oasislmf-unittest.yml index 9c22e4a4d9..bec0a7bdc9 100644 --- a/.github/workflows/oasislmf-unittest.yml +++ b/.github/workflows/oasislmf-unittest.yml @@ -35,8 +35,7 @@ jobs: - name: Install pip-tools run: | python -m pip install --upgrade pip - pip install pip-tools - + pip install --upgrade pip-tools - name: Pip Compile run: | rm -f requirements.txt diff --git a/oasislmf/computation/run/exposure.py b/oasislmf/computation/run/exposure.py index a332851b84..82f322095e 100644 --- a/oasislmf/computation/run/exposure.py +++ b/oasislmf/computation/run/exposure.py @@ -54,6 +54,7 @@ class RunExposure(ComputationStep): {'name': 'fmpy_low_memory', 'default': False, 'type': str2bool, 'const':True, 'nargs':'?', 'help': 'use memory map instead of RAM to store loss array (may decrease performance but reduce RAM usage drastically)'}, {'name': 'fmpy_sort_output', 'default': True, 'type': str2bool, 'const': True, 'nargs': '?', 'help': 'order fmpy output by item_id'}, {'name': 'stream_type', 'flag':'-t', 'default': 2, 'type':int, 'help': 'Set the IL input stream type, 2 = default loss stream, 1 = deprecated cov/item stream'}, + {"name": "hashed_group_id", "default": True, "type": str2bool, "const": False, 'nargs': '?', "help": "Hashes the group_id in the items.bin"}, {'name': 'net_ri', 'default': True}, {'name': 'include_loss_factor', 'default': True}, {'name': 'print_summary', 'default': True}, @@ -120,6 +121,7 @@ def run(self): oed_info_csv=ri_info_fp, oed_scope_csv=ri_scope_fp, keys_data_csv=keys_fp, + hashed_group_id=self.hashed_group_id, ).run() # 3. 
Run Deterministic Losses @@ -312,6 +314,7 @@ class RunFmTest(ComputationStep): {'name': 'fmpy_low_memory', 'default': False, 'type': str2bool, 'const': True, 'nargs': '?', 'help': 'use memory map instead of RAM to store loss array (may decrease performance but reduce RAM usage drastically)'}, {'name': 'fmpy_sort_output', 'default': True, 'type': str2bool, 'const': True, 'nargs': '?', 'help': 'order fmpy output by item_id'}, {'name': 'update_expected', 'default': False}, + {'name': 'hashed_group_id', 'default': False}, {'name': 'expected_output_dir', 'default': "expected"}, ] @@ -408,7 +411,8 @@ def execute_test_case(self, test_case): num_subperils=self.num_subperils, fmpy=self.fmpy, fmpy_low_memory=self.fmpy_low_memory, - fmpy_sort_output=self.fmpy_sort_output + fmpy_sort_output=self.fmpy_sort_output, + hashed_group_id=self.hashed_group_id, ).run() expected_data_dir = os.path.join(test_dir, self.expected_output_dir) diff --git a/tests/fm/test_fm.py b/tests/fm/test_fm.py index 82133c94e3..b9bb672fa0 100644 --- a/tests/fm/test_fm.py +++ b/tests/fm/test_fm.py @@ -15,6 +15,7 @@ def setUp(self): self.test_cases_fp = os.path.join(sys.path[0], 'validation') self.update_expected = False self.keep_output = True + self.hashed_group_id = False def run_test(self, test_case, fmpy=False, subperils=1, expected_dir="expected"): with tempfile.TemporaryDirectory() as tmp_run_dir: @@ -35,6 +36,7 @@ def run_test(self, test_case, fmpy=False, subperils=1, expected_dir="expected"): num_subperils=subperils, test_tolerance=0.001, expected_output_dir=expected_dir, + hashed_group_id=self.hashed_group_id, ) self.assertTrue(result) diff --git a/tests/fm/test_fmpy.py b/tests/fm/test_fmpy.py index 077b342a28..c10ff17f8d 100644 --- a/tests/fm/test_fmpy.py +++ b/tests/fm/test_fmpy.py @@ -15,6 +15,7 @@ def setUp(self): self.test_cases_fp = os.path.join(sys.path[0], 'validation') self.update_expected = False self.keep_output = True + self.hashed_group_id = False def run_test(self, test_case, fmpy=False, subperils=1, expected_dir="expected"): with tempfile.TemporaryDirectory() as tmp_run_dir: @@ -37,6 +38,7 @@ def run_test(self, test_case, fmpy=False, subperils=1, expected_dir="expected"): num_subperils=subperils, test_tolerance=0.001, expected_output_dir=expected_dir, + hashed_group_id=self.hashed_group_id, ) self.assertTrue(result) @@ -91,4 +93,4 @@ def test_issues_2_subperils(self): self.run_test('issues', fmpy=True, subperils=2, expected_dir="expected_subperils") def test_insurance_policy_coverage_2_subperils(self): - self.run_test('insurance_policy_coverage',fmpy=True, subperils=2, expected_dir="expected_subperils") \ No newline at end of file + self.run_test('insurance_policy_coverage',fmpy=True, subperils=2, expected_dir="expected_subperils") diff --git a/tests/utils/test_data.py b/tests/utils/test_data.py index be99f55edf..73f52b37bf 100644 --- a/tests/utils/test_data.py +++ b/tests/utils/test_data.py @@ -39,6 +39,7 @@ get_timestamp, get_utctimestamp, get_location_df, + PANDAS_DEFAULT_NULL_VALUES, ) from oasislmf.utils.defaults import ( @@ -541,10 +542,12 @@ def test_get_dataframe__from_csv_file_with_mixed_case_cols__set_col_defaults_opt try: df = pd.DataFrame(data) df.to_csv(path_or_buf=fp, columns=df.columns, encoding='utf-8', index=False) + df['STR_COL'] = df['STR_COL'].map(lambda x: np.nan if x in PANDAS_DEFAULT_NULL_VALUES else x) fp.close() expected = df.copy(deep=True) expected.columns = expected.columns.str.lower() + for col, default in defaults.items(): expected.loc[:, col.lower()].fillna(defaults[col], 
inplace=True) From fee427ee04a1422b8300d92d6002911c55461f5c Mon Sep 17 00:00:00 2001 From: mtazzari Date: Fri, 12 Aug 2022 14:45:46 +0100 Subject: [PATCH 18/33] Update group_id_cols default in get_gul_input_items --- oasislmf/preparation/gul_inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 87efd900bb..7df06b66ab 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -50,7 +50,7 @@ def get_gul_input_items( exposure_df, keys_df, exposure_profile=get_default_exposure_profile(), - group_id_cols=['loc_id'], + group_id_cols=["PortNumber", "AccNumber", "LocNumber"], hashed_group_id=True ): """ From 45f8779f2b58ec38294f39006e19c18693582b77 Mon Sep 17 00:00:00 2001 From: Maxwell Flitton Date: Fri, 12 Aug 2022 15:46:00 +0100 Subject: [PATCH 19/33] Hashing investigation (#1096) * adding haahing patch * adding haahing patch * updating hashing * Update oasislmf/preparation/gul_inputs.py Co-authored-by: Marco Tazzari <6020226+mtazzari@users.noreply.github.com> --- oasislmf/preparation/gul_inputs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 7df06b66ab..279aa9cd7c 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -279,7 +279,7 @@ def get_gul_input_items( # Concatenate chunks. Sort by index to preserve item_id order in generated outputs compared # to original code. - gul_inputs_df = pd.concat(gul_inputs_reformatted_chunks).sort_index().reset_index() + gul_inputs_df = pd.concat(gul_inputs_reformatted_chunks).sort_index().reset_index(drop=True, inplace=True) # Set default values and data types for BI coverage boolean, TIV, deductibles and limit dtypes = { **{t: 'uint8' for t in term_cols_ints + terms_ints}, @@ -333,7 +333,7 @@ def get_gul_input_items( # this block gets fired if the hashed_group_id is True else: - gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[group_id_cols]).to_numpy() >> 33) + gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[group_id_cols], index=False).to_numpy() >> 33) gul_inputs_df['group_id'] = gul_inputs_df['group_id'].astype('uint32') From 461621f3b129d4a0c986dbb7a65384346bdbf8e6 Mon Sep 17 00:00:00 2001 From: mtazzari Date: Fri, 12 Aug 2022 16:43:03 +0100 Subject: [PATCH 20/33] [gul_inputs] bugfix don't modify inplace --- oasislmf/preparation/gul_inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 279aa9cd7c..6a7b69d8ac 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -279,7 +279,7 @@ def get_gul_input_items( # Concatenate chunks. Sort by index to preserve item_id order in generated outputs compared # to original code. 
- gul_inputs_df = pd.concat(gul_inputs_reformatted_chunks).sort_index().reset_index(drop=True, inplace=True) + gul_inputs_df = pd.concat(gul_inputs_reformatted_chunks).sort_index().reset_index(drop=True) # Set default values and data types for BI coverage boolean, TIV, deductibles and limit dtypes = { **{t: 'uint8' for t in term_cols_ints + terms_ints}, From 77245a3301fd1f4749b83cd4c6b6e2d8dc208519 Mon Sep 17 00:00:00 2001 From: Sam Gamble Date: Fri, 12 Aug 2022 18:47:44 +0100 Subject: [PATCH 21/33] Update test_summaries.py to not rely on "loc_id" as default for group_id_cols --- tests/model_preparation/test_summaries.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/model_preparation/test_summaries.py b/tests/model_preparation/test_summaries.py index c19c97a2a4..e128b7510c 100644 --- a/tests/model_preparation/test_summaries.py +++ b/tests/model_preparation/test_summaries.py @@ -133,7 +133,7 @@ def test_single_peril__totals_correct(self, data): ) # Run Gul Proccessing - gul_inputs = get_gul_input_items(loc_df, keys_df) + gul_inputs = get_gul_input_items(loc_df, keys_df, group_id_cols=['loc_id']) gul_inputs = gul_inputs[gul_inputs['status'].isin(OASIS_KEYS_STATUS_MODELLED)] # Fetch expected TIVS @@ -200,7 +200,7 @@ def test_multi_perils__single_covarage(self, data): # Run Summary output check self.assertSummaryIsValid( loc_df, - get_gul_input_items(loc_df, keys_df), + get_gul_input_items(loc_df, keys_df, group_id_cols=['loc_id']), get_exposure_summary(exposure_df=loc_df, keys_df=keys_df), perils_returned ) @@ -244,7 +244,7 @@ def test_multi_perils__multi_covarage(self, data): # Run Summary output check exp_summary = get_exposure_summary(exposure_df=loc_df, keys_df=keys_df) - gul_inputs = get_gul_input_items(loc_df, keys_df) + gul_inputs = get_gul_input_items(loc_df, keys_df, group_id_cols=['loc_id']) self.assertSummaryIsValid( loc_df, gul_inputs, From bf500beed1eef1e06640f809342fb6161da8869c Mon Sep 17 00:00:00 2001 From: sambles Date: Tue, 23 Aug 2022 14:31:59 +0100 Subject: [PATCH 22/33] Always create a correlations.bin, if missing model_settings file is blank (#1101) --- oasislmf/computation/generate/files.py | 12 ++++-------- oasislmf/preparation/correlations.py | 7 +++++-- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index 0803074821..a8ad386b7d 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -234,14 +234,10 @@ def run(self): group_id_cols=group_id_cols, hashed_group_id=self.hashed_group_id, ) - - if self.model_settings_json is not None: - correlation_input_items = get_correlation_input_items( - model_settings_path=self.model_settings_json, - gul_inputs_df=gul_inputs_df - ) - else: - correlation_input_items = None + correlation_input_items = get_correlation_input_items( + model_settings_path=self.model_settings_json, + gul_inputs_df=gul_inputs_df + ) # If not in det. loss gen. 
scenario, write exposure summary file if summarise_exposure: diff --git a/oasislmf/preparation/correlations.py b/oasislmf/preparation/correlations.py index 6a5fed0bc6..bbb42ae5f9 100644 --- a/oasislmf/preparation/correlations.py +++ b/oasislmf/preparation/correlations.py @@ -43,8 +43,11 @@ def get_correlation_input_items(model_settings_path: str, gul_inputs_df: pd.Data Returns: (pd.DataFrame) the mapped data of correlations """ - model_settings_raw_data: dict = get_model_settings(model_settings_fp=model_settings_path) - correlation_map_df = map_data(data=model_settings_raw_data) + if model_settings_path == None: + correlation_map_df = None + else: + model_settings_raw_data: dict = get_model_settings(model_settings_fp=model_settings_path) + correlation_map_df = map_data(data=model_settings_raw_data) if correlation_map_df is not None: gul_inputs_df = gul_inputs_df.merge(correlation_map_df, left_on='peril_id', right_on='id').reset_index() From 2b3d2026f794397dc0b74b0d9312095aeb9f8518 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Mon, 5 Sep 2022 13:06:26 +0100 Subject: [PATCH 23/33] adding peril_correlation_group for valid_oasis_group_cols --- oasislmf/preparation/gul_inputs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 765bae9410..502e63a336 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -153,6 +153,7 @@ def get_gul_input_items( 'peril_id', 'coverage_id', 'coverage_type_id', + 'peril_correlation_group' ] for col in group_id_cols: if col not in list(exposure_df.columns) + valid_oasis_group_cols: From 7d772fb74d7de813ead482c8d2b4930a7f8b4dc2 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Mon, 5 Sep 2022 16:01:39 +0100 Subject: [PATCH 24/33] appending peril_correlation_group to columns if correlations group is present --- oasislmf/computation/generate/files.py | 1 + oasislmf/preparation/gul_inputs.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index a8ad386b7d..28a7e2aa03 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -230,6 +230,7 @@ def run(self): gul_inputs_df = get_gul_input_items( location_df, keys_df, + output_dir=self._get_output_dir(), exposure_profile=location_profile, group_id_cols=group_id_cols, hashed_group_id=self.hashed_group_id, diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index cfd0da489e..6a36df1048 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -49,6 +49,7 @@ def get_gul_input_items( exposure_df, keys_df, + output_dir, exposure_profile=get_default_exposure_profile(), group_id_cols=["PortNumber", "AccNumber", "LocNumber"], hashed_group_id=True @@ -62,6 +63,9 @@ def get_gul_input_items( :param keys_df: Keys dataframe :type keys_df: pandas.DataFrame + :param output_dir: the output directory where input files are stored + :type output_dir: str + :param exposure_profile: Exposure profile :type exposure_profile: dict @@ -160,6 +164,20 @@ def get_gul_input_items( warnings.warn('Column {} not found in loc file, or a valid internal oasis column'.format(col)) group_id_cols.remove(col) + # here we check to see if the correlation file is here, if it is then we need to add the "peril_correlation_group" to the valid_oasis_group_cols + peril_correlation_group = 'peril_correlation_group' + correlations_files = [ + 
f"{output_dir}/correlations.csv", + f"{output_dir}/correlations.bin", + ] + for file_path in correlations_files: + if os.path.exists(path=file_path): + if peril_correlation_group not in group_id_cols: + group_id_cols.append(peril_correlation_group) + break + + + # Should list of column names used to group_id be empty, revert to # default if len(group_id_cols) == 0: From e6ac89e944af08036ca53a6ca7f6e9a71413e250 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Wed, 7 Sep 2022 12:30:41 +0100 Subject: [PATCH 25/33] adding peril_correlation_group column to hashing of group IDs if correlations groups are used and hashing group IDs is done --- oasislmf/computation/generate/files.py | 20 ++++-- oasislmf/preparation/gul_inputs.py | 89 +++++++++++++++++--------- oasislmf/utils/data.py | 22 +++++++ run_test.py | 0 4 files changed, 96 insertions(+), 35 deletions(-) create mode 100644 run_test.py diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index 28a7e2aa03..63beabcff9 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -8,6 +8,8 @@ import json import os from pathlib import Path +from typing import List +import pandas as pd from .keys import GenerateKeys, GenerateKeysDeterministic from ..base import ComputationStep @@ -72,7 +74,10 @@ GULSummaryXrefFile, FMSummaryXrefFile ) -from oasislmf.preparation.correlations import get_correlation_input_items +from oasislmf.preparation.correlations import get_correlation_input_items, map_data +from oasislmf.preparation.gul_inputs import process_group_id_cols, hash_with_correlations +# from oasislmf.preparation.correlations import map_data +from oasislmf.utils.data import establish_correlations class GenerateFiles(ComputationStep): @@ -230,16 +235,24 @@ def run(self): gul_inputs_df = get_gul_input_items( location_df, keys_df, - output_dir=self._get_output_dir(), exposure_profile=location_profile, group_id_cols=group_id_cols, - hashed_group_id=self.hashed_group_id, + hashed_group_id=self.hashed_group_id ) correlation_input_items = get_correlation_input_items( model_settings_path=self.model_settings_json, gul_inputs_df=gul_inputs_df ) + correlations: bool = establish_correlations(model_settings_path=self.model_settings_json) + group_id_cols: List[str] = process_group_id_cols(group_id_cols=group_id_cols, + exposure_df_columns=list(location_df), + correlations=correlations) + + if self.hashed_group_id is True and correlations is True: + gul_inputs_df = pd.merge(gul_inputs_df, correlation_input_items, on="item_id") + gul_inputs_df = hash_with_correlations(gul_inputs_df=gul_inputs_df, hashing_columns=group_id_cols) + # If not in det. loss gen. 
scenario, write exposure summary file if summarise_exposure: write_exposure_summary( @@ -265,7 +278,6 @@ def run(self): oasis_files_prefixes=files_prefixes['gul'], chunksize=self.write_chunksize, ) - gul_summary_mapping = get_summary_mapping(gul_inputs_df, oed_hierarchy) write_mapping_file(gul_summary_mapping, target_dir) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 6a36df1048..33238a6139 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -10,6 +10,7 @@ import sys import warnings from collections import OrderedDict +from typing import List import pandas as pd @@ -44,12 +45,61 @@ pd.options.mode.chained_assignment = None warnings.simplefilter(action='ignore', category=FutureWarning) +VALID_OASIS_GROUP_COLS = [ + 'item_id', + 'peril_id', + 'coverage_id', + 'coverage_type_id', + 'peril_correlation_group' + ] + + +def process_group_id_cols(group_id_cols: List[str], exposure_df_columns: List[str], correlations: bool) -> List[str]: + """ + cleans out columns that are not valid oasis group columns. + + Valid group id columns can be either + 1. exist in the location file + 2. be listed as a useful internal col + + Args: + group_id_cols: (List[str]) the ID columns that are going to be filtered + exposure_df_columns: (List[str]) the columns in the exposure dataframe + correlations: (bool) if set to True means that we are hashing with correlations in mind therefore the + "peril_correlation_group" column is added + + Returns: (List[str]) the filtered columns + """ + for col in VALID_OASIS_GROUP_COLS: + if col not in list(exposure_df_columns) + VALID_OASIS_GROUP_COLS: + warnings.warn('Column {} not found in loc file, or a valid internal oasis column'.format(col)) + group_id_cols.remove(col) + + peril_correlation_group = 'peril_correlation_group' + if peril_correlation_group not in group_id_cols and correlations is True: + group_id_cols.append(peril_correlation_group) + return group_id_cols + + +def hash_with_correlations(gul_inputs_df: pd.DataFrame, hashing_columns: List[str]) -> pd.DataFrame: + """ + Creates a hash for the group ID field for the input data frame. + + Args: + gul_inputs_df: (pd.DataFrame) the gul inputs that are doing the have the group_id field rewritten with a hash + hashing_columns: (List[str]) the list of columns used in the hashing algorithm + + Returns: (pd.DataFrame) the gul_inputs_df with the new hashed group_id + """ + gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[hashing_columns], + index=False).to_numpy() >> 33) + return gul_inputs_df + @oasis_log def get_gul_input_items( exposure_df, keys_df, - output_dir, exposure_profile=get_default_exposure_profile(), group_id_cols=["PortNumber", "AccNumber", "LocNumber"], hashed_group_id=True @@ -148,35 +198,12 @@ def get_gul_input_items( # Remove any duplicate column names used to assign group_id group_id_cols = list(set(group_id_cols)) - # Ignore any column names used to assign group_id that are missing or not supported - # Valid group id columns can be either - # 1. exist in the location file - # 2. 
be listed as a useful internal col - valid_oasis_group_cols = [ - 'item_id', - 'peril_id', - 'coverage_id', - 'coverage_type_id', - 'peril_correlation_group' - ] - for col in group_id_cols: - if col not in list(exposure_df.columns) + valid_oasis_group_cols: - warnings.warn('Column {} not found in loc file, or a valid internal oasis column'.format(col)) - group_id_cols.remove(col) - - # here we check to see if the correlation file is here, if it is then we need to add the "peril_correlation_group" to the valid_oasis_group_cols - peril_correlation_group = 'peril_correlation_group' - correlations_files = [ - f"{output_dir}/correlations.csv", - f"{output_dir}/correlations.bin", - ] - for file_path in correlations_files: - if os.path.exists(path=file_path): - if peril_correlation_group not in group_id_cols: - group_id_cols.append(peril_correlation_group) - break - - + # it is assumed that correlations are False for now, correlations for group ID hashing are assessed later on in + # the process to re-hash the group ID with the correlation "peril_correlation_group" column name. This is because + # the correlations is achieved later in the process leading to a chicken and egg problem + group_id_cols = process_group_id_cols(group_id_cols=group_id_cols, + exposure_df_columns=list(exposure_df.columns), + correlations=False) # Should list of column names used to group_id be empty, revert to # default @@ -186,7 +213,7 @@ def get_gul_input_items( # Only add group col if not internal oasis col missing_group_id_cols = [] for col in group_id_cols: - if col in valid_oasis_group_cols: + if col in VALID_OASIS_GROUP_COLS: pass elif col not in exposure_df_gul_inputs_cols: missing_group_id_cols.append(col) diff --git a/oasislmf/utils/data.py b/oasislmf/utils/data.py index 24c7ccd41c..b4fa6e0391 100644 --- a/oasislmf/utils/data.py +++ b/oasislmf/utils/data.py @@ -44,6 +44,7 @@ from chardet.universaldetector import UniversalDetector from tabulate import tabulate +from typing import List, Optional import numpy as np import pandas as pd @@ -409,6 +410,27 @@ def get_model_settings(model_settings_fp, key=None, validate=True): return model_settings if not key else model_settings.get(key) +def establish_correlations(model_settings_path: str) -> bool: + """ + Checks the model settings to see if correlations are present. 
+ + Args: + model_settings_path: (str) path to the model setting JSON file + + Returns: (bool) True if correlations, False if not + """ + model_settings_raw_data: dict = get_model_settings(model_settings_fp=model_settings_path) + correlations: Optional[List[dict]] = model_settings_raw_data.get("correlation_settings") + + if correlations is None: + return False + if not isinstance(correlations, list): + return False + if len(correlations) == 0: + return False + return True + + def detect_encoding(filepath): """ Given a path to a CSV of unknown encoding diff --git a/run_test.py b/run_test.py new file mode 100644 index 0000000000..e69de29bb2 From 823add86efa0d011b2b59cf10ebb1cbcc57da3b1 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Thu, 15 Sep 2022 16:09:57 +0100 Subject: [PATCH 26/33] updating hashing group ID --- oasislmf/computation/generate/files.py | 38 ++++++-------- oasislmf/preparation/correlations.py | 50 ++++++++----------- oasislmf/preparation/gul_inputs.py | 29 +++++++---- .../data_layer/oasis_files/correlations.py | 2 + oasislmf/utils/data.py | 7 ++- tests/preparation/test_correlations.py | 4 +- 6 files changed, 63 insertions(+), 67 deletions(-) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index 63beabcff9..885b3d3013 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -9,9 +9,8 @@ import os from pathlib import Path from typing import List -import pandas as pd -from .keys import GenerateKeys, GenerateKeysDeterministic +from .keys import GenerateKeys from ..base import ComputationStep #from ...utils.coverages import SUPPORTED_COVERAGE_TYPES @@ -74,10 +73,10 @@ GULSummaryXrefFile, FMSummaryXrefFile ) -from oasislmf.preparation.correlations import get_correlation_input_items, map_data -from oasislmf.preparation.gul_inputs import process_group_id_cols, hash_with_correlations -# from oasislmf.preparation.correlations import map_data +from oasislmf.preparation.correlations import map_data +from oasislmf.preparation.gul_inputs import process_group_id_cols from oasislmf.utils.data import establish_correlations +from oasislmf.pytools.data_layer.oasis_files.correlations import CorrelationsData class GenerateFiles(ComputationStep): @@ -216,11 +215,14 @@ def run(self): # Columns from loc file to assign group_id model_group_fields = None + correlations: bool = False + model_settings = None + if self.model_settings_json: + model_settings = get_model_settings(self.model_settings_json) + correlations = establish_correlations(model_settings=model_settings) try: - model_group_fields = get_model_settings( - self.model_settings_json, key='data_settings' - ).get('group_fields') + model_group_fields = model_settings["data_settings"].get("group_fields") except (KeyError, AttributeError, OasisException) as e: self.logger.warn('WARNING: Failed to load {} - {}'.format(self.model_settings_json, e)) @@ -232,26 +234,18 @@ def run(self): group_id_cols = self.group_id_cols group_id_cols = list(map(lambda col: col.lower(), group_id_cols)) + group_id_cols: List[str] = process_group_id_cols(group_id_cols=group_id_cols, + exposure_df_columns=list(location_df), + has_correlation_groups=correlations) gul_inputs_df = get_gul_input_items( location_df, keys_df, + peril_correlation_group_df=map_data(data=model_settings), + correlations=correlations, exposure_profile=location_profile, group_id_cols=group_id_cols, hashed_group_id=self.hashed_group_id ) - correlation_input_items = get_correlation_input_items( - 
model_settings_path=self.model_settings_json, - gul_inputs_df=gul_inputs_df - ) - - correlations: bool = establish_correlations(model_settings_path=self.model_settings_json) - group_id_cols: List[str] = process_group_id_cols(group_id_cols=group_id_cols, - exposure_df_columns=list(location_df), - correlations=correlations) - - if self.hashed_group_id is True and correlations is True: - gul_inputs_df = pd.merge(gul_inputs_df, correlation_input_items, on="item_id") - gul_inputs_df = hash_with_correlations(gul_inputs_df=gul_inputs_df, hashing_columns=group_id_cols) # If not in det. loss gen. scenario, write exposure summary file if summarise_exposure: @@ -273,7 +267,7 @@ def run(self): gul_input_files = write_gul_input_files( gul_inputs_df, target_dir, - correlations_df=correlation_input_items, + correlations_df=gul_inputs_df[CorrelationsData.COLUMNS] if correlations is True else None, output_dir=self._get_output_dir(), oasis_files_prefixes=files_prefixes['gul'], chunksize=self.write_chunksize, diff --git a/oasislmf/preparation/correlations.py b/oasislmf/preparation/correlations.py index bbb42ae5f9..bfea8c2713 100644 --- a/oasislmf/preparation/correlations.py +++ b/oasislmf/preparation/correlations.py @@ -8,7 +8,7 @@ from oasislmf.utils.data import get_model_settings -def map_data(data: dict) -> Optional[pd.DataFrame]: +def map_data(data: Optional[dict]) -> Optional[pd.DataFrame]: """ Maps data from the model settings to to have Peril ID, peril_correlation_group, and correlation_value. @@ -17,43 +17,37 @@ def map_data(data: dict) -> Optional[pd.DataFrame]: Returns: (pd.DataFrame) the mapped data """ - supported_perils = data.get("lookup_settings", {}).get("supported_perils", []) - correlation_settings = data.get("correlation_settings", []) + if data is not None: + supported_perils = data.get("lookup_settings", {}).get("supported_perils", []) + correlation_settings = data.get("correlation_settings", []) - for supported_peril in supported_perils: - supported_peril["peril_correlation_group"] = supported_peril.get("peril_correlation_group", 0) + for supported_peril in supported_perils: + supported_peril["peril_correlation_group"] = supported_peril.get("peril_correlation_group", 0) - supported_perils_df = pd.DataFrame(supported_perils) - correlation_settings_df = pd.DataFrame(correlation_settings) + supported_perils_df = pd.DataFrame(supported_perils) + correlation_settings_df = pd.DataFrame(correlation_settings) - # merge allows duplicates of the "peril_correlation_group" in the supported perils - # merge does not allow duplicates of the "peril_correlation_group" in the correlation settings - if len(supported_perils_df) > 0 and len(correlation_settings_df) > 0: - mapped_data = pd.merge(supported_perils_df, correlation_settings_df, on="peril_correlation_group") - return mapped_data + # merge allows duplicates of the "peril_correlation_group" in the supported perils + # merge does not allow duplicates of the "peril_correlation_group" in the correlation settings + if len(supported_perils_df) > 0 and len(correlation_settings_df) > 0: + mapped_data = pd.merge(supported_perils_df, correlation_settings_df, on="peril_correlation_group") + return mapped_data -def get_correlation_input_items(model_settings_path: str, gul_inputs_df: pd.DataFrame) -> pd.DataFrame: +def get_correlation_input_items(gul_inputs_df: pd.DataFrame, correlation_map_df: pd.DataFrame) -> pd.DataFrame: """ Gets the correlation values with the peril ID from the model_settings. 
Args: - model_settings_path: (str) the path to the model settings JSON file + correlation_map_df: (pd.DataFrame) data from the model settings to to have Peril ID, peril_correlation_group, + and correlation_value gul_inputs_df: (pd.DataFrame) the data of the gul inputs to be mapped Returns: (pd.DataFrame) the mapped data of correlations """ - if model_settings_path == None: - correlation_map_df = None - else: - model_settings_raw_data: dict = get_model_settings(model_settings_fp=model_settings_path) - correlation_map_df = map_data(data=model_settings_raw_data) - - if correlation_map_df is not None: - gul_inputs_df = gul_inputs_df.merge(correlation_map_df, left_on='peril_id', right_on='id').reset_index() - gul_inputs_df["correlation_value"] = gul_inputs_df["correlation_value"].astype(float) - gul_inputs_df = gul_inputs_df.reindex(columns=list(gul_inputs_df)) - - correlation_df = gul_inputs_df[["item_id", "peril_correlation_group", "correlation_value"]] - return correlation_df.sort_values('item_id') - return pd.DataFrame(columns=["item_id", "peril_correlation_group", "correlation_value"]) + gul_inputs_df = gul_inputs_df.merge(correlation_map_df, left_on='peril_id', right_on='id').reset_index() + gul_inputs_df["correlation_value"] = gul_inputs_df["correlation_value"].astype(float) + gul_inputs_df = gul_inputs_df.reindex(columns=list(gul_inputs_df)) + + correlation_df = gul_inputs_df[["item_id", "peril_correlation_group", "correlation_value"]] + return correlation_df.sort_values('item_id') diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 33238a6139..2c5ec20969 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -54,7 +54,7 @@ ] -def process_group_id_cols(group_id_cols: List[str], exposure_df_columns: List[str], correlations: bool) -> List[str]: +def process_group_id_cols(group_id_cols, exposure_df_columns, has_correlation_groups): """ cleans out columns that are not valid oasis group columns. @@ -65,7 +65,7 @@ def process_group_id_cols(group_id_cols: List[str], exposure_df_columns: List[st Args: group_id_cols: (List[str]) the ID columns that are going to be filtered exposure_df_columns: (List[str]) the columns in the exposure dataframe - correlations: (bool) if set to True means that we are hashing with correlations in mind therefore the + has_correlation_groups: (bool) if set to True means that we are hashing with correlations in mind therefore the "peril_correlation_group" column is added Returns: (List[str]) the filtered columns @@ -76,12 +76,12 @@ def process_group_id_cols(group_id_cols: List[str], exposure_df_columns: List[st group_id_cols.remove(col) peril_correlation_group = 'peril_correlation_group' - if peril_correlation_group not in group_id_cols and correlations is True: + if peril_correlation_group not in group_id_cols and has_correlation_groups is True: group_id_cols.append(peril_correlation_group) return group_id_cols -def hash_with_correlations(gul_inputs_df: pd.DataFrame, hashing_columns: List[str]) -> pd.DataFrame: +def hash_group_id(gul_inputs_df: pd.DataFrame, hashing_columns: List[str]) -> pd.DataFrame: """ Creates a hash for the group ID field for the input data frame. 
@@ -100,6 +100,8 @@ def hash_with_correlations(gul_inputs_df: pd.DataFrame, hashing_columns: List[st def get_gul_input_items( exposure_df, keys_df, + correlations, + peril_correlation_group_df, exposure_profile=get_default_exposure_profile(), group_id_cols=["PortNumber", "AccNumber", "LocNumber"], hashed_group_id=True @@ -201,9 +203,9 @@ def get_gul_input_items( # it is assumed that correlations are False for now, correlations for group ID hashing are assessed later on in # the process to re-hash the group ID with the correlation "peril_correlation_group" column name. This is because # the correlations is achieved later in the process leading to a chicken and egg problem - group_id_cols = process_group_id_cols(group_id_cols=group_id_cols, - exposure_df_columns=list(exposure_df.columns), - correlations=False) + # group_id_cols = process_group_id_cols(group_id_cols=group_id_cols, + # exposure_df_columns=list(exposure_df.columns), + # has_correlation_groups=False) # Should list of column names used to group_id be empty, revert to # default @@ -358,8 +360,6 @@ def get_gul_input_items( # directly, otherwise create an index of the group id fields group_id_cols.sort() - col_key = group_id_cols[0] - if correlation_check is True: gul_inputs_df['group_id'] = gul_inputs_df[correlation_group_id] @@ -378,8 +378,14 @@ def get_gul_input_items( )[0] # this block gets fired if the hashed_group_id is True + elif correlations is False: + gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[group_id_cols], + index=False).to_numpy() >> 33) else: - gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[group_id_cols], index=False).to_numpy() >> 33) + # do merge with peril correlation df + gul_inputs_df = gul_inputs_df.merge(peril_correlation_group_df, left_on='peril_id', right_on='id').reset_index() + gul_inputs_df["group_id"] = (pd.util.hash_pandas_object(gul_inputs_df[group_id_cols], + index=False).to_numpy() >> 33) gul_inputs_df['group_id'] = gul_inputs_df['group_id'].astype('uint32') @@ -391,7 +397,8 @@ def get_gul_input_items( ['peril_id', 'coverage_type_id', 'tiv', 'areaperil_id', 'vulnerability_id'] + terms + (['model_data'] if 'model_data' in gul_inputs_df else []) + - ['is_bi_coverage', 'group_id', 'coverage_id', 'item_id', 'status'] + ['is_bi_coverage', 'group_id', 'coverage_id', 'item_id', 'status'] + + ["peril_correlation_group", "correlation_value"] if correlations is True else [] ) usecols = [col for col in usecols if col in gul_inputs_df] gul_inputs_df = gul_inputs_df[usecols] diff --git a/oasislmf/pytools/data_layer/oasis_files/correlations.py b/oasislmf/pytools/data_layer/oasis_files/correlations.py index 75de73ad79..afdd9c4547 100644 --- a/oasislmf/pytools/data_layer/oasis_files/correlations.py +++ b/oasislmf/pytools/data_layer/oasis_files/correlations.py @@ -16,6 +16,8 @@ class CorrelationsData: Attributes: data (Optional[pd.DataFrame): correlation data that is either loaded or saved """ + COLUMNS = ["item_id", "peril_correlation_group", "correlation_value"] + def __init__(self, data: Optional[pd.DataFrame] = None) -> None: """ The constructor for the CorrelationsData class. 
diff --git a/oasislmf/utils/data.py b/oasislmf/utils/data.py index b4fa6e0391..b3dfeb46d3 100644 --- a/oasislmf/utils/data.py +++ b/oasislmf/utils/data.py @@ -410,17 +410,16 @@ def get_model_settings(model_settings_fp, key=None, validate=True): return model_settings if not key else model_settings.get(key) -def establish_correlations(model_settings_path: str) -> bool: +def establish_correlations(model_settings: dict) -> bool: """ Checks the model settings to see if correlations are present. Args: - model_settings_path: (str) path to the model setting JSON file + model_settings: (dict) the model settings that are going to be checked Returns: (bool) True if correlations, False if not """ - model_settings_raw_data: dict = get_model_settings(model_settings_fp=model_settings_path) - correlations: Optional[List[dict]] = model_settings_raw_data.get("correlation_settings") + correlations: Optional[List[dict]] = model_settings.get("correlation_settings") if correlations is None: return False diff --git a/tests/preparation/test_correlations.py b/tests/preparation/test_correlations.py index 62c10112c9..3d2bc49456 100644 --- a/tests/preparation/test_correlations.py +++ b/tests/preparation/test_correlations.py @@ -27,10 +27,10 @@ def test_map_data(self): def test_get_correlation_input_items(self): gul_path = META_PATH + "gul_inputs_df.csv" - settings_path = META_PATH + "model_settings.json" gul_inputs_df = pd.read_csv(gul_path) - correlation_df = get_correlation_input_items(model_settings_path=settings_path, gul_inputs_df=gul_inputs_df) + correlation_df = get_correlation_input_items(correlation_map_df=map_data(data=self.model_settings), + gul_inputs_df=gul_inputs_df) correlation_df_check = pd.read_csv(f"{META_PATH}correlation_df.csv") correlation_df_check.equals(correlation_df) From ff9f5681d2a3dd84892ebbd165241d477dcef6d1 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Tue, 20 Sep 2022 16:38:35 +0100 Subject: [PATCH 27/33] updating to accomodate non-correlations --- oasislmf/preparation/gul_inputs.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 2c5ec20969..b82fd11053 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -397,9 +397,11 @@ def get_gul_input_items( ['peril_id', 'coverage_type_id', 'tiv', 'areaperil_id', 'vulnerability_id'] + terms + (['model_data'] if 'model_data' in gul_inputs_df else []) + - ['is_bi_coverage', 'group_id', 'coverage_id', 'item_id', 'status'] + - ["peril_correlation_group", "correlation_value"] if correlations is True else [] + ['is_bi_coverage', 'group_id', 'coverage_id', 'item_id', 'status'] ) + if correlations is True: + usecols += ["peril_correlation_group", "correlation_value"] + usecols = [col for col in usecols if col in gul_inputs_df] gul_inputs_df = gul_inputs_df[usecols] From 32a5f66d0c2a5d914ca0e02a2878bca5e7843b17 Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Wed, 21 Sep 2022 14:19:32 +0100 Subject: [PATCH 28/33] fixxing run --- oasislmf/computation/generate/files.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/oasislmf/computation/generate/files.py b/oasislmf/computation/generate/files.py index 885b3d3013..40addb86a5 100644 --- a/oasislmf/computation/generate/files.py +++ b/oasislmf/computation/generate/files.py @@ -218,7 +218,7 @@ def run(self): correlations: bool = False model_settings = None - if self.model_settings_json: + if self.model_settings_json is not None: model_settings = 
get_model_settings(self.model_settings_json) correlations = establish_correlations(model_settings=model_settings) try: @@ -226,6 +226,7 @@ def run(self): except (KeyError, AttributeError, OasisException) as e: self.logger.warn('WARNING: Failed to load {} - {}'.format(self.model_settings_json, e)) + # load group columns from model_settings.json if not set in kwargs (CLI) if model_group_fields and not self.kwargs.get('group_id_cols'): group_id_cols = model_group_fields @@ -264,6 +265,7 @@ def run(self): # Write the GUL input files files_prefixes = self.oasis_files_prefixes + gul_input_files = write_gul_input_files( gul_inputs_df, target_dir, From d60deb20ec4ded903523ef32ea9ea71a0d21dd2a Mon Sep 17 00:00:00 2001 From: maxwellflitton Date: Wed, 21 Sep 2022 15:00:28 +0100 Subject: [PATCH 29/33] fixing empty correlations df write header if empty correlations --- oasislmf/preparation/gul_inputs.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index b82fd11053..d93d3ad154 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -531,11 +531,13 @@ def write_gul_input_files( # Clean the target directory path target_dir = as_path(target_dir, 'Target IL input files directory', is_dir=True, preexists=False) + if correlations_df is None: + correlations_df = pd.DataFrame(columns=['item_id', 'peril_correlation_group', 'correlation_value']) + # write the correlations to a binary file - if correlations_df is not None: - correlation_data_handle = CorrelationsData(data=correlations_df) - correlation_data_handle.to_bin(file_path=f"{output_dir}/correlations.bin") - correlation_data_handle.to_csv(file_path=f"{output_dir}/correlations.csv") + correlation_data_handle = CorrelationsData(data=correlations_df) + correlation_data_handle.to_bin(file_path=f"{output_dir}/correlations.bin") + correlation_data_handle.to_csv(file_path=f"{output_dir}/correlations.csv") # Set chunk size for writing the CSV files - default is the minimum of 100K # or the GUL inputs frame size From e95dac662cc7375948e308f0e17bb961c5e0aebf Mon Sep 17 00:00:00 2001 From: Sam Gamble Date: Mon, 3 Oct 2022 13:07:58 +0100 Subject: [PATCH 30/33] Remove empty file --- run_test.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 run_test.py diff --git a/run_test.py b/run_test.py deleted file mode 100644 index e69de29bb2..0000000000 From 62c805f536963094e85c8d43b61236a0f3e9fc81 Mon Sep 17 00:00:00 2001 From: Sam Gamble Date: Mon, 3 Oct 2022 13:14:06 +0100 Subject: [PATCH 31/33] Add missing defaults to get_gul_input_items (backwards compatible) --- oasislmf/preparation/gul_inputs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index d93d3ad154..1c97b12369 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -100,8 +100,8 @@ def hash_group_id(gul_inputs_df: pd.DataFrame, hashing_columns: List[str]) -> pd def get_gul_input_items( exposure_df, keys_df, - correlations, - peril_correlation_group_df, + correlations=False, + peril_correlation_group_df=None, exposure_profile=get_default_exposure_profile(), group_id_cols=["PortNumber", "AccNumber", "LocNumber"], hashed_group_id=True From 5f6933adc8ada030be82f15f7cf969d79a12beac Mon Sep 17 00:00:00 2001 From: Sam Gamble Date: Mon, 3 Oct 2022 13:28:06 +0100 Subject: [PATCH 32/33] Fix Group_id valid column check --- 
oasislmf/preparation/gul_inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oasislmf/preparation/gul_inputs.py b/oasislmf/preparation/gul_inputs.py index 1c97b12369..f3e5ad7aa6 100644 --- a/oasislmf/preparation/gul_inputs.py +++ b/oasislmf/preparation/gul_inputs.py @@ -70,7 +70,7 @@ def process_group_id_cols(group_id_cols, exposure_df_columns, has_correlation_gr Returns: (List[str]) the filtered columns """ - for col in VALID_OASIS_GROUP_COLS: + for col in group_id_cols: if col not in list(exposure_df_columns) + VALID_OASIS_GROUP_COLS: warnings.warn('Column {} not found in loc file, or a valid internal oasis column'.format(col)) group_id_cols.remove(col) From 44e43f2d53830fd8bd4778b21ad5393c89c74e78 Mon Sep 17 00:00:00 2001 From: Sam Gamble Date: Mon, 3 Oct 2022 14:33:34 +0100 Subject: [PATCH 33/33] Force retest --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 6369f55384..bced70a95d 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![Binder](https://static.mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/OasisLMF/OasisLMF/develop?filepath=FmTesting.ipynb) # OasisLMF - The `oasislmf` Python package, loosely called the *model development kit (MDK)* or the *MDK package*, provides a command line toolkit for developing, testing and running Oasis models end-to-end locally, or remotely via the Oasis API. It can generate ground-up losses (GUL), direct/insured losses (IL) and reinsurance losses (RIL). It can also generate deterministic losses at all these levels.
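
Note on the correlation flow introduced by the patches above: correlated sampling is only switched on when the model settings actually define correlation groups, and the per-peril mapping written to correlations.bin/correlations.csv is built by merging the supported perils with the correlation settings. The following is a minimal sketch, not code from the repository, that mirrors the merge performed by map_data in oasislmf/preparation/correlations.py; the peril ids, descriptions and the 0.5 correlation value are invented example inputs.

    # Hedged sketch of the peril-correlation mapping built by map_data().
    # The peril ids and correlation value below are made-up example inputs.
    import pandas as pd

    model_settings = {
        "lookup_settings": {
            "supported_perils": [
                {"id": "WTC", "desc": "Tropical cyclone wind", "peril_correlation_group": 1},
                {"id": "WSS", "desc": "Storm surge", "peril_correlation_group": 1},
            ]
        },
        "correlation_settings": [
            {"peril_correlation_group": 1, "correlation_value": 0.5},
        ],
    }

    supported_perils_df = pd.DataFrame(model_settings["lookup_settings"]["supported_perils"])
    correlation_settings_df = pd.DataFrame(model_settings["correlation_settings"])

    # One row per supported peril, carrying its group's correlation value.
    # An empty "correlation_settings" list would leave this merge empty, which is
    # the situation establish_correlations() reports as correlations switched off.
    peril_correlation_group_df = supported_perils_df.merge(
        correlation_settings_df, on="peril_correlation_group"
    )
    print(peril_correlation_group_df[["id", "peril_correlation_group", "correlation_value"]])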
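Note on the hashed group_id: the later patches reduce group-id assignment to a single row-wise hash of the chosen grouping columns, truncated to fit the uint32 group_id field written to items.bin, with peril_correlation_group appended to the default PortNumber/AccNumber/LocNumber columns when correlations are in use. Below is a minimal sketch of that path; the three-row DataFrame is an invented stand-in for the real gul_inputs_df, while the hashing expression follows the one added in the patches.

    # Hedged sketch of the hashed group_id path from get_gul_input_items /
    # hash_group_id. The toy DataFrame is an assumption; the hashing expression
    # mirrors the patched code.
    import pandas as pd

    gul_inputs_df = pd.DataFrame({
        "PortNumber": ["1", "1", "1"],
        "AccNumber": ["A11", "A11", "A12"],
        "LocNumber": ["L1", "L2", "L1"],
        "peril_correlation_group": [1, 1, 1],
    })

    # With correlations on, peril_correlation_group joins the default grouping
    # columns so the correlation group contributes to the grouping key.
    group_id_cols = ["PortNumber", "AccNumber", "LocNumber", "peril_correlation_group"]

    # hash_pandas_object returns one uint64 per row; index=False makes the hash a
    # function of the column values only, and >> 33 keeps the result within 31
    # bits so the subsequent uint32 cast is lossless.
    gul_inputs_df["group_id"] = (
        pd.util.hash_pandas_object(gul_inputs_df[group_id_cols], index=False).to_numpy() >> 33
    )
    gul_inputs_df["group_id"] = gul_inputs_df["group_id"].astype("uint32")

    print(gul_inputs_df[["LocNumber", "group_id"]])

Rows sharing the same values in every grouping column hash to the same group_id, which is how the full-dataframe hash replaces the earlier drop_duplicates/merge chain without changing the grouping semantics.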