-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_interpolate.py
116 lines (90 loc) · 4.84 KB
/
data_interpolate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""
Impute or Interpolate missing data according to categorical & numerical features.
Removes outliers according to m*IQR -> (m=1.5 default).
"""
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
import numpy as np
import pandas as pd
# Features
# straight features are straightforward to extract with \d+
# NOTE 1: the current 'straight' feature is battery which seems to be mostly im mAH
# NOTE 2: should still check if its actually 'mAH' and remove or convert if not
straight_features = ["battery"]
all_features = ["oem", "launch_announced", "launch_status", "body_dimensions", "display_size", "comms_wlan",
"comms_usb",
"features_sensors", "platform_os", "platform_cpu", "platform_gpu", "memory_internal",
"main_camera_single", "main_camera_video", "misc_price",
"selfie_camera_video",
"selfie_camera_single", "battery"]
final_features = ["oem", "launch_announced", "body_dimensions", "screen_size", "scn_bdy_ratio",
"features_sensors", "clock_speed", "platform_gpu", "ram", "rom",
"main_camera_single", "main_camera_video", "misc_price",
"selfie_camera_video",
"selfie_camera_single", "battery"]
cols_to_drop = ['launch_status',
'comms_wlan', 'comms_usb', 'platform_os', 'core_count']
numeric_features = ["body_dimensions", "screen_size", "scn_bdy_ratio", "clock_speed", "memory_internal",
"main_camera_single", "main_camera_video", "misc_price",
"selfie_camera_video",
"selfie_camera_single", "battery"]
# There are some categorical features like 'main_camera_features' & etc,
# someone should include these features.
# Other features like 'body_weight' and 'body_sim' could be also included.
def rem_outliers(df):
for feature in numeric_features:
series_ = df[feature]
# Calc IQR for the column
Q1 = np.quantile(series_, .75)
Q3 = np.quantile(series_, .25)
out_factor = 1.5 * (Q3 - Q1)
out_cond = lambda x: x and (x >= Q1 - out_factor or x <= Q3 - out_factor)
# Check if each value is > 1.5 * IQR
# NOTE: does not work if a lot of examples were not properly recorded/extracted
df[feature] = series_.apply(lambda x: x if out_cond(x) else np.nan)
# Return outlier free data, at the cost of potentially many missing examples.
print(df.shape[0], df.shape[1])
return df
def fill_gaps(df, option):
# NOTE: Can also use some interpolation (linear, cubic) instead.
i_imp = IterativeImputer(max_iter=20, random_state=6)
s_imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# Infer the object types -> ensuring numeric encoding.
# NOTE: categorical/numeric split pipeline?
df = df.fillna(value=np.nan)
df_ret = df.infer_objects()
for feature in final_features:
df_ret[feature] = pd.to_numeric(df_ret[feature], downcast='float')
# Remove outliers for each column, if they are 1.5X IQR for the column.
# NOTE: apparently too many outliers -> perhaps data still not in correct form or data inconsistently sampled?
# df_ret = rem_outliers(df_ret)
if option == 'A':
# (A) Imputing
df_impute = pd.DataFrame(s_imp.fit_transform(df_ret))
df_impute.columns = df_ret.columns
df_impute.index = df_ret.index
# print("Dimensions of imputed df", df_impute.shape[0], df_impute.shape[1])
# df_impute.to_csv('imputed_df.csv')
# print("DF has been output to imputed_df.csv")
# (B) Interpolation
if option == 'B':
for feature in final_features:
# forward interpolate linearly -> 87% on RF
n_missing = df_ret[feature].isnull().sum()
# if n missing values > 5000, skip. NOTE: make this 4000.
# Someone change this to only fill small gaps instead. I.e. do not fill gaps of over 4 indices (mask).
if n_missing > 5000:
continue
# print(f"df[{feature}] contains {n_missing} missing values")
df_ret[feature].interpolate(method='linear', inplace=True)
# forward interpolate cubic spline -> 85% on RF
# df_ret[feature].interpolate(method='cubicspline', inplace=True)
# central differentiation approximation -> 87% on RF
# df_ret[feature].interpolate(method='from_derivatives', inplace=True)
# (C) Simply drop null cols. Result = 700~800 examples.
# Drop null cols
df_ret.dropna(inplace=True)
# Reindex the data
df_ret.reset_index(inplace=True)
# print("Dimensions of dataframe", str(df_ret.shape[0]) + "x" + str(df_ret.shape[1]))
return df_ret