Commit

feat: Adding new data
vivekkatial committed Jun 12, 2024
1 parent ca87413 commit 26babf5
Showing 17 changed files with 38,571 additions and 4,566 deletions.
637 changes: 637 additions & 0 deletions data/12-nodes/matilda_processed_graph-all.csv
637 changes: 637 additions & 0 deletions data/12-nodes/matilda_processed_graph-unweight-only.csv
3,947 changes: 3,947 additions & 0 deletions data/12-nodes/matilda_processed_graph-weight-only.csv
637 changes: 637 additions & 0 deletions data/12-nodes/matilda_processed_graph_weight-all.csv
3,947 changes: 3,947 additions & 0 deletions data/12-nodes/matilda_processed_graph_weight-weight-only.csv
637 changes: 637 additions & 0 deletions data/12-nodes/matilda_processed_weight-all.csv
4,583 changes: 4,583 additions & 0 deletions data/12-nodes/matilda_processed_weighted_unweighted-all.csv
3,947 changes: 3,947 additions & 0 deletions data/12-nodes/matilda_processed_weighted_unweighted-weight-only.csv
10,907 changes: 10,907 additions & 0 deletions data/initialisation_results_all_nodes.csv
4,901 changes: 4,901 additions & 0 deletions data/initialisation_results_nodes-10.csv
1,261 changes: 1,261 additions & 0 deletions data/initialisation_results_nodes-11.csv
1,211 changes: 1,211 additions & 0 deletions data/initialisation_results_nodes-13.csv
1,261 changes: 1,261 additions & 0 deletions data/initialisation_results_nodes-9.csv
4,562 changes: 0 additions & 4,562 deletions data/matilda_processed_weight.csv (this file was deleted)
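For orientation, a minimal sketch of how one of the newly added per-node-size result files might be loaded and inspected. The column names num_nodes, graph_type, and weight_type are taken from the src/plots_informs.py diff below; the rest is illustrative, not part of this commit:

import pandas as pd

# Load one of the per-node-size initialisation result files added in this commit
df = pd.read_csv("data/initialisation_results_nodes-10.csv")

# Inspect the instance metadata columns used later in src/plots_informs.py
print(df.shape)
print(df["graph_type"].value_counts())
print(df["weight_type"].value_counts())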

2 changes: 1 addition & 1 deletion src/extract_runs.py
@@ -15,7 +15,7 @@ def main():
     try:
         # Connect to MLFlow experiment
         EXPERIMENT_NAME = "QAOA-Parameter-Initialisation"
-        NUM_NODES = 12
+        NUM_NODES = 11
         graph_types = [
             "Nearly Complete BiPartite",
             "Uniform Random",
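Since the extraction script is re-run with a hand-edited NUM_NODES for each graph size (9 through 13, judging by the data files above), one option is to pass the size on the command line instead. A sketch under that assumption; the --num-nodes flag is hypothetical and not part of this repository:

import argparse

# Hypothetical CLI wrapper replacing the hand-edited constant
parser = argparse.ArgumentParser(description="Extract QAOA initialisation runs from MLflow")
parser.add_argument("--num-nodes", type=int, default=12, help="graph size to extract")
args = parser.parse_args()

EXPERIMENT_NAME = "QAOA-Parameter-Initialisation"
NUM_NODES = args.num_nodes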
7 changes: 4 additions & 3 deletions src/matilda_prep.py
@@ -49,6 +49,8 @@ def load_and_process_data(file_path, source_type='graph_weight', feature_filter=
 
     # Filter out rows with 'feature_weight_type' equal to 'None'
     selected_df = selected_df[selected_df['feature_weight_type'] != 'None']
+
+    # selected_df = selected_df[selected_df['feature_weight_type'] == 'None']
 
     # Create the 'Source' column based on the source_type parameter
     if source_type == 'graph_weight':
@@ -171,11 +173,10 @@ def test_only_numeric_features():
 
 if __name__ == "__main__":
     file_path = "data/initialisation_results_nodes-12.csv"
-    source_types = ['graph_weight', 'graph', 'weight']
-
+    source_types = ['graph_weight', 'graph', 'weight', 'weighted_unweighted']
     for source_type in source_types:
         d_matilda = load_and_process_data(file_path, source_type, feature_filter=False)
-        output_file = f"data/12-nodes/matilda_processed_{source_type}.csv"
+        output_file = f"data/12-nodes/matilda_processed_{source_type}-weight-only.csv"
 
         # Write to csv file
         d_matilda.to_csv(output_file, index=False)
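The new 'weighted_unweighted' source type can also be exercised directly. A minimal sketch, assuming matilda_prep is importable from the project root (the import path is an assumption):

from src.matilda_prep import load_and_process_data

# Process the 12-node results with the new source type and write the
# weight-only variant, mirroring the updated __main__ block above
d = load_and_process_data(
    "data/initialisation_results_nodes-12.csv",
    source_type="weighted_unweighted",
    feature_filter=False,
)
d.to_csv("data/12-nodes/matilda_processed_weighted_unweighted-weight-only.csv", index=False)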
53 changes: 53 additions & 0 deletions src/plots_informs.py
@@ -27,6 +27,7 @@ def plot_ar_distribution(df, output_file='ar_distribution.png'):
     # Create the FacetGrid
     g = sns.FacetGrid(df_melted, col='Approximation Type', col_wrap=4, height=4, sharex=True, sharey=True)
     g.map(sns.histplot, 'Approximation Ratio', bins=20, kde=False, color='blue', alpha=0.3)
+
 
     g.set(xlim=(0, 1))
 
@@ -49,3 +50,55 @@ def plot_ar_distribution(df, output_file='ar_distribution.png'):
 
 # Plot the distribution and save the plot
 plot_ar_distribution(df)
+
+# Read in the CSV files for all nodes
+node_sizes = [9, 10, 11, 12, 13]
+# Create an empty list to store the dataframes
+dfs = []
+for node_size in node_sizes:
+    fn = f'data/initialisation_results_nodes-{node_size}.csv'
+    df = pd.read_csv(fn)
+    # Cols that start with algo
+    algo_cols = [col for col in df.columns if col.startswith('algo')]
+    # AR cols
+    ar_cols = ['approximation_ratio_fixed_angles_constant',
+               'approximation_ratio_random',
+               'approximation_ratio_three_regular',
+               'approximation_ratio_qibpi',
+               'approximation_ratio_tqa',
+               'approximation_ratio_interp_p15',
+               'approximation_ratio_fourier_p15']
+
+    # Instance Class and Instance Size
+    instance_cols = ['num_nodes', 'graph_type', 'weight_type']
+
+    # Selected columns
+    selected_cols = ['run_id'] + instance_cols + ar_cols + algo_cols
+    df = df[selected_cols]
+
+    # Validate the columns
+    if not set(algo_cols).issubset(set(df.columns)):
+        print(f"Columns {algo_cols} not found in the dataframe {fn}.")
+        continue
+    else:
+        dfs.append(df)
+
+    # Print the number of rows and columns
+    print(f"Dataframe {fn} has {df.shape[0]} rows and {df.shape[1]} columns.")
+
+
+# Concatenate the dataframes
+df_all = pd.concat(dfs, ignore_index=True)
+
+# Print columns
+print(df_all.columns)
+print(df_all.shape)
+# Remove rows with NaN values
+df_all = df_all.dropna()
+print(df_all.head())
+
+# Print num instances for the number of nodes
+print(df_all['num_nodes'].value_counts())
+
+# Write to a CSV file
+df_all.to_csv('data/initialisation_results_all_nodes.csv', index=False)
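Note that plot_ar_distribution consumes a df_melted frame whose construction is not shown in this excerpt. A sketch of the reshaping that would yield the 'Approximation Type' and 'Approximation Ratio' columns used by the FacetGrid, assuming the ar_cols list defined above; the id_vars choice is an assumption:

# Hypothetical melt: wide approximation_ratio_* columns become a label column
# ('Approximation Type') and a value column ('Approximation Ratio')
df_melted = df_all.melt(
    id_vars=['run_id', 'num_nodes', 'graph_type', 'weight_type'],
    value_vars=ar_cols,
    var_name='Approximation Type',
    value_name='Approximation Ratio',
)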
