forked from Tancredi-Schettini-Gherardini/P5CY4ML
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data_Analysis_Fourfolds.py
317 lines (251 loc) · 11.3 KB
/
Data_Analysis_Fourfolds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
"""
Import the four-folds dataset, produce plots to show the interesting features
of the invariants and perform a clustering analysis.
The code below reproduces section 3.1 of the paper.
"""
#Import libraries
import numpy as np
import gzip
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from collections import Counter
#Define path to data
path = './Data/5dTransWH.all.gz'
# Import data
# The data contain the three Hodge numbers h^{1,1}, h^{1,2}, h^{1,3} and the Euler number \chi.
# h^{2,2} can be obtained from 4 + 2 h^{1,1} - 4 h^{1,2} + 2 h^{1,3} + h^{2,2} = \chi
with gzip.open(path, 'rb') as file:
    # Parse every record of the weight-system file into parallel lists:
    # the six weights, the Euler number, the Hodge numbers, and the
    # (h^{1,1}, h^{1,3}, h^{2,2}) triple used later as the ML target.
    weights, chi, h11, h12, h13, h22, triple_h = [], [], [], [], [], [], []
    for line in file.readlines():
        # Each raw bytes line is rendered via str(), giving "b'...\\n'"; the
        # chained replace() calls strip the bytes-repr artifacts and normalise
        # every field separator to a comma before splitting.
        # NOTE(review): this relies on the exact 5dTransWH.all record layout
        # (weights, "=d", then "[h11,h12,h13]:chi") — confirm against the data
        # file before touching any of the replace() calls.
        line_str = str(line).replace('b\'','').replace(']\\n\'','').replace('=d','').replace(':',',').replace('[','').replace(' ',',').split(',')
        chi_ = int(line_str[-1])   # Euler number is the last field
        h_11 = int(line_str[-4])
        h_12 = int(line_str[-3])
        h_13 = int(line_str[-2])
        # h^{2,2} from the index relation 4 + 2h^{1,1} - 4h^{1,2} + 2h^{1,3} + h^{2,2} = chi.
        h_22 = -(4 + 2*h_11 - 4*h_12 + 2*h_13 - chi_)
        weights.append(line_str[:6])   # first six fields are the weights
        chi.append(chi_)
        h11.append(h_11)
        h12.append(h_12)
        h13.append(h_13)
        h22.append(h_22)
        # Can be alternatively calculated as:
        # h22.append(1/3*(84 + chi_ + 6*h_11 + 6*h_13))
        triple_h.append([h_11, h_13, h_22])
# Given the extra constraint, coming from the Atiyah-Singer theorem,
# i.e. -4 h^{1,1} + 2 h^{2,1} - 4 h^{3,1} + h^{2,2} - 44 = 0,
# only 3 numbers out of the (Hodges, Euler) are really independent.
# Here we choose those three because they are not zero, so they give
# a well-defined MAPE when investigating ML performances.
# Bring the variables into a suitable format: integer weights,
# floating-point invariants. Building the float arrays directly replaces
# the original int-array -> astype(float) round trip (same result, one pass).
weights = np.array(weights, dtype='int')
chi = np.array(chi, dtype=float)
h11 = np.array(h11, dtype=float)
h12 = np.array(h12, dtype=float)
h13 = np.array(h13, dtype=float)
h22 = np.array(h22, dtype=float)
triple_h = np.array(triple_h, dtype=float)
#%% --------------------------------------------------------------------------------------
# We plot the histograms to show the distributions below.
#%% h11 Histogram
# Log-frequency histogram of h^{1,1}: the distribution is strongly peaked
# at small values, hence the log scale.
fig = plt.figure()
ax = fig.add_subplot(2, 1, 1)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.grid(alpha=0.5)
plt.ticklabel_format(axis='both', style='sci', scilimits=(5,5))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.hist(h11, bins=500, log=True, zorder=10)
plt.xlabel(r'$h^{1,1}$', fontsize=15)
plt.ylabel('Frequency', fontsize=15)
#plt.savefig('h11_hist_.jpg', dpi=1200, bbox_inches='tight')
# Typo fix: "Maximumx" -> "Maximum" in the printed summary.
print("Maximum is ", max(h11))
print("Minimum is ", min(h11))
print("Mean is ", sum(h11)/len(h11))
#%% h12 Histogram
# Log-frequency histogram of h^{1,2}; this invariant is much smaller than
# the others, hence the (3,3) scientific-notation scale.
fig = plt.figure()
ax = fig.add_subplot(2, 1, 1)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.grid(alpha=0.5)
plt.ticklabel_format(axis='both', style='sci', scilimits=(3,3))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.hist(h12, bins=500, log=True, zorder=10)
plt.xlabel(r'$h^{1,2}$', fontsize=15)
plt.ylabel('Frequency', fontsize=15)
#plt.savefig('h12_hist_.jpg', dpi=1200, bbox_inches='tight')
# Typo fix: "Maximumx" -> "Maximum" in the printed summary.
print("Maximum is ", max(h12))
print("Minimum is ", min(h12))
print("Mean is ", sum(h12)/len(h12))
#%% h13 Histogram
# Log-frequency histogram of h^{1,3}, on the same footing as the h^{1,1} one
# (the two are related by mirror symmetry).
fig = plt.figure()
ax = fig.add_subplot(2, 1, 1)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.grid(alpha=0.5)
plt.ticklabel_format(axis='both', style='sci', scilimits=(5,5))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.hist(h13, bins=500, log=True, zorder=10)
plt.xlabel(r'$h^{1,3}$', fontsize=15)
plt.ylabel('Frequency', fontsize=15)
#plt.savefig('h13_hist_.jpg', dpi=1200, bbox_inches='tight')
# Typo fix: "Maximumx" -> "Maximum" in the printed summary.
print("Maximum is ", max(h13))
print("Minimum is ", min(h13))
print("Mean is ", sum(h13)/len(h13))
#%% h22 Histogram
# Log-frequency histogram of the derived invariant h^{2,2}.
fig = plt.figure()
ax = fig.add_subplot(2, 1, 1)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.grid(alpha=0.5)
plt.ticklabel_format(axis='both', style='sci', scilimits=(6,6))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.hist(h22, bins=500, log=True, zorder=10)
plt.xlabel(r'$h^{2,2}$', fontsize=15)
plt.ylabel('Frequency', fontsize=15)
#plt.savefig('h22_hist_.jpg', dpi=1200, bbox_inches='tight')
# Typo fix: "Maximumx" -> "Maximum"; dropped the redundant .astype(float)
# (h22 is already a float array after the conversion step above).
print("Maximum is ", max(h22))
print("Minimum is ", min(h22))
print("Mean is ", sum(h22)/len(h22))
#%% Euler Histogram
# Log-frequency histogram of the Euler number \chi.
fig = plt.figure()
ax = fig.add_subplot(2, 1, 1)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.grid(alpha=0.5)
plt.ticklabel_format(axis='both', style='sci', scilimits=(6,6))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.hist(chi, bins=500, log=True, zorder=10)
plt.xlabel(r'$\chi$', fontsize=15)
plt.ylabel('Frequency', fontsize=15)
#plt.savefig('chi_hist_.jpg', dpi=1200, bbox_inches='tight')
# Typo fix: "Maximumx" -> "Maximum"; dropped the redundant .astype(float)
# (chi is already a float array after the conversion step above).
print("Maximum is ", max(chi))
print("Minimum is ", min(chi))
print("Mean is ", sum(chi)/len(chi))
#%% --------------------------------------------------------------------------------------
# We produce scatter plots for some of the Hodge data below.
#%% Classic Mirror Plot
# Scatter of (h^{1,1} - h^{1,3}, h^{1,1} + h^{1,3}): mirror symmetry shows up
# as the reflection symmetry of the wedge about the vertical axis.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
for side in ("top", "right"):
    ax.spines[side].set_visible(False)
ax.grid(alpha=0.5)
plt.ticklabel_format(axis='both', style='sci', scilimits=(5,5))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.scatter(h11 - h13, h11 + h13, zorder=10, s=0.2)
# Right edge of the wedge: the locus h^{1,3} = 0.
edge_pos = np.arange(0, 3.15*10**5, 0.1*10**5)
plt.plot(edge_pos, edge_pos, linewidth=5, color="orangered", alpha=0.5, label=r"$h^{1,3} = 0$")
# Left edge of the wedge: the locus h^{1,1} = 0.
edge_neg = np.arange(-3.15*10**5, 0, 0.1*10**5)
plt.plot(edge_neg, -edge_neg, linewidth=5, color="orangered", alpha=0.5, label=r"$h^{1,1} = 0$")
plt.xlabel(r'$h^{1,1} - h^{1,3}$', fontsize=15)
plt.ylabel(r'$h^{1,1} + h^{1,3}$', fontsize=15)
#plt.savefig('Mirror_plot.jpg', dpi=1200, bbox_inches = "tight")
#%% Two highest Hodges plot
# Scatter of the two largest invariants; they hug the line h^{2,2} = 4 h^{1,3} + 82.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
for side in ("top", "right"):
    ax.spines[side].set_visible(False)
ax.grid(alpha=0.5)
plt.ticklabel_format(axis='both', style='sci', scilimits=(5,5))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.scatter(h13, h22, zorder=10, s=0.3)
line_x = np.arange(0, 3.15*10**5, 0.1*10**5)
plt.plot(line_x, 4*line_x + 82, linewidth=3, color="orangered", alpha=0.5, label=r"$h^{2,2} = 4 \, h^{1,3}$")
plt.xlabel(r'$h^{1,3}$', fontsize=15)
plt.ylabel(r'$h^{2,2} $', fontsize=15)
plt.legend(loc="center right")
#plt.savefig('Mirror_plot_2.jpg', dpi=1200, bbox_inches = "tight")
#%% 3-d Hodges plot
# 3-d scatter of the three independent Hodge numbers (h^{1,1}, h^{1,2}, h^{1,3}).
fig = plt.figure(figsize=(9, 6))
ax = plt.axes(projection='3d')
ax.scatter3D(h11, h12, h13, color='steelblue', s=0.4)
ax.ticklabel_format(axis='x', style='sci', scilimits=(5,5))
plt.ticklabel_format(axis='y', style='sci', scilimits=(3,3))
# NOTE(review): axis='z' is only meaningful on a 3-d axes; confirm the
# installed matplotlib version accepts 'z' via pyplot.ticklabel_format here.
plt.ticklabel_format(axis='z', style='sci', scilimits=(5,5))
ax.view_init(elev=17, azim = 65)
# The "(1eN)" suffixes mirror the scientific-notation scale factors set above.
ax.set_xlabel(r"$h^{1,1}$ (1e5) ", fontsize = 12)
ax.set_ylabel(r"$h^{1,2}$ (1e3)", fontsize = 12)
ax.set_zlabel(r"$h^{1,3}$ (1e5)", fontsize = 12)
#plt.savefig('3dplot4_.jpg', dpi=1000, bbox_inches = "tight")
#%% h^{1,1} vs highest weight plot
# Axes/labels for the h^{1,1} vs largest-weight scatter that motivates the
# clustering analysis in the next section.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.grid(alpha=0.5)
plt.ticklabel_format(axis='both', style='sci', scilimits=(5,5))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.ylabel(r"$h^{1,1}$", fontsize = 15)
plt.xlabel(r"$w_{max}$", fontsize = 15)
# NOTE(review): the scatter itself is commented out, so this cell currently
# produces an empty, labelled figure — presumably disabled because plotting
# the full dataset is slow; uncomment to reproduce the plot.
#plt.scatter(weights[:,5],h11[:], c="steelblue", s=0.2)
#%% --------------------------------------------------------------------------------------
# We now focus on the clustering behaviour, which is evident from the scatter
# plot above.
#%% Perform clustering analysis on the data for which the linear forking is most
# evident, i.e. the systems with large weights.
# Select the points in the h^{1,1}-w_{max} plane that nicely show linear
# clustering. We identified the middle cluster, and used the line perpendicular
# to it as a lower bound to select the data. Simply choosing all the points
# with w_{max} > 3*10^5 would also be an option.
# Vectorised selection: same cut as the original per-point loop, in one pass.
selection = h11 + (1/0.1925)*weights[:, 5] > 20*10**5
h11_clust = h11[selection]
high_weight_clust = weights[selection, 5]
preset_number_clusters = 8  # Chose this just by inspection.
# Cluster the 1-d ratio h^{1,1}/w_{max}: each linear "fork" has its own slope.
ratio_data = np.array(h11_clust/high_weight_clust).reshape(-1, 1)
kmeans = KMeans(n_clusters=preset_number_clusters).fit(ratio_data)
#Compute clustering over the full ratio data (irrespective of whether full or outer used to identify clusters)
transformed_full_data = kmeans.transform(ratio_data) #...data transformed to list distance to all centres
kmeans_labels = np.argmin(transformed_full_data,axis=1) #...identify the closest cluster centre to each datapoint
full_data_inertia = np.sum(np.min(transformed_full_data, axis=1)**2) #...compute the inertia over the full dataset
cluster_sizes = Counter(kmeans_labels) #...compute the frequencies in each cluster
# Bug fix: report sizes for the actual number of clusters (was hard-coded
# range(10), which padded the list with two always-zero entries).
print('\nCluster Centres: '+str(kmeans.cluster_centers_.flatten())+'\nCluster sizes: '+str([cluster_sizes[x] for x in range(preset_number_clusters)])+'\n\nInertia: '+str(full_data_inertia)+'\nNormalised Inertia: '+str(full_data_inertia/len(h11_clust))+'\nNormalised Inertia / range: '+str((full_data_inertia/(len(h11_clust)*(max(ratio_data)-min(ratio_data))))[0]))
#%% Visualise the data that we have selected for the investigation as a histogram.
# The multi-modal structure of h^{1,1}/w_{max} is what the k-means fit captures.
fig = plt.figure()
ax = fig.add_subplot(2, 1, 1)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.grid(alpha=0.5)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Removed the unused local `inner = 500000` left over from an earlier version.
plt.hist(h11_clust/high_weight_clust, bins=50, zorder=10, color="steelblue")
plt.xlabel(r"$h^{1,1}/ w_{max}$", fontsize=15)
plt.ylabel("Frequency", fontsize=15)
#plt.savefig('clust_hist.jpg', dpi=1000, bbox_inches = "tight")
#%% Visualise the data and the clusters that were found above on the same plot.
fig = plt.figure()
ax = fig.add_subplot(2, 1, 1)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
plt.xlim(-100000, 2000000)
plt.ylim(-10000, 400000)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.grid(alpha=0.5)
plt.scatter(high_weight_clust, h11_clust, c='steelblue', s=0.1, zorder=10, label="Data")
# Each cluster centre is a slope in the h^{1,1}/w_{max} plane; draw its line
# through the origin. Label only the first line so the legend has one entry.
# Bug fix: the original plotted cluster 0's line twice (once for the label,
# then again inside the loop).
x_line = np.linspace(0, 2000000, 2)
for i, grad in enumerate(kmeans.cluster_centers_.flatten()):
    plt.plot(x_line, grad*x_line, color='black', lw=0.5, alpha=0.5, zorder=15,
             label="Cluster lines" if i == 0 else None)
plt.ylabel(r'$h^{1,1}$', fontsize=15)
plt.xlabel(r'$w_{max}$', fontsize=15)
plt.legend(loc="upper left", fontsize=11.5)
#plt.savefig('clust_lines_overlaid.jpg', dpi=1000, bbox_inches = "tight")