mage-ai · skunichetty · Jul 15, 2022 · Jun 21, 2022 · Jun 21, 2022 · Jun 21, 2022
diff --git a/mage_ai/data_cleaner/cleaning_rules/remove_collinear_columns.py b/mage_ai/data_cleaner/cleaning_rules/remove_collinear_columns.py
@@ -1,13 +1,11 @@
 from mage_ai.data_cleaner.cleaning_rules.base import BaseRule
-from mage_ai.data_cleaner.column_types.constants import NUMBER_TYPES
 from mage_ai.data_cleaner.transformer_actions.constants import ActionType, Axis
 import numpy as np
 
-
 class RemoveCollinearColumns(BaseRule):
-    EPSILON = 1e-12
+    EPSILON = 1e-15
     MIN_ENTRIES = 3
-    ROW_SAMPLE_SIZE = 300
+    VIF_UB = 5
 
     default_config = dict(
         vif_ub=3,
@@ -22,19 +20,23 @@ def evaluate(self):
         suggestions = []
         if self.numeric_df.empty or len(self.numeric_df) < self.MIN_ENTRIES:
             return suggestions
+
+        C = self.numeric_df.corr().to_numpy()
         collinear_columns = []
-        self.numeric_df['intercept'] = np.ones(len(self.numeric_df))
-        for column in self.numeric_columns[:-1]:
-            variance_inflation_factor = self.get_variance_inflation_factor(column)
-            if variance_inflation_factor > self.config('vif_ub'):
-                collinear_columns.append(column)
-                self.numeric_df.drop(column, axis=1, inplace=True)
-        if len(collinear_columns) != len(self.numeric_columns) - 1:
-            # check the final column if and only if there are other columns to compare it to
-            column = self.numeric_columns[-1]
-            variance_inflation_factor = self.get_variance_inflation_factor(column)
-            if variance_inflation_factor > self.config('vif_ub'):
-                collinear_columns.append(column)
+        good_columns = self.numeric_columns.copy()
+        while True:
+            e_vals = np.linalg.eigvalsh(C)
+            vifs = np.sign(e_vals) / (abs(e_vals) + self.EPSILON)
+            collinearity = vifs >= self.VIF_UB
+
+            i = collinearity.argmax()
+            if i == 0 and collinearity[0] == 0:
+                break
+            else:
+                C = np.delete(C, i, axis=0)
+                C = np.delete(C, i, axis=1)
+                collinear_columns.append(good_columns.pop(i))
+
         if len(collinear_columns) != 0:
             suggestions.append(
                 self._build_transformer_action_suggestion(
@@ -46,29 +48,3 @@ def evaluate(self):
                 )
             )
         return suggestions
-
-    def get_variance_inflation_factor(self, column):
-        """
-        Variance Inflation Factor = 1 / (1 - <coefficient of determination on column k>)
-        Measures increase in regression model variance due to collinearity
-        => column k is multicollinear with others if model predicting its value
-        has this variance inflation greater than some amount
-        """
-        if self.numeric_df.empty:
-            raise RuntimeError('No other columns to compare \'{column}\' against')
-        if len(self.numeric_df) > self.ROW_SAMPLE_SIZE:
-            sample = self.numeric_df.sample(self.ROW_SAMPLE_SIZE)
-        else:
-            sample = self.numeric_df
-
-        responses = sample[column].to_numpy()
-        predictors = sample.drop(column, axis=1).to_numpy()
-        params, _, _, _ = np.linalg.lstsq(predictors, responses, rcond=None)
-
-        mean = responses.mean()
-        centered_predictions = predictors @ params - mean
-        sum_sq_model = np.sum(centered_predictions * centered_predictions)
-        centered_responses = responses - mean
-        sum_sq_to = np.sum(centered_responses * centered_responses)
-        r_sq = sum_sq_model / sum_sq_to if sum_sq_to else 0
-        return 1 / (1 - r_sq + self.EPSILON)
diff --git a/mage_ai/tests/data_cleaner/cleaning_rules/test_remove_collinear_columns.py b/mage_ai/tests/data_cleaner/cleaning_rules/test_remove_collinear_columns.py
@@ -431,7 +431,7 @@ def test_perfectly_collinear(self):
                 status='not_applied',
                 action_payload=dict(
                     action_type='remove',
-                    action_arguments=['number_of_users', 'views', 'revenue'],
+                    action_arguments=['views', 'revenue', 'number_of_users'],
                     axis='column',
                     action_options={},
                     action_variables={},
@@ -440,61 +440,4 @@ def test_perfectly_collinear(self):
                 ),
             )
         ]
-        self.assertEqual(result, expected_results)
-
-    def test_vif_calcuation(self):
-        df = pd.DataFrame(
-            [
-                [1000, 30000, 10, 100, 30, 1],
-                [500, 10000, 20, 3000, 20, 1],
-                [250, 7500, 25, 8000, 20, 1],
-                [1000, 45003, 20, 90, 40, 1],
-                [1500, 75000, 30, 70, 25, 1],
-                [1250, 60000, 50, 80, 20, 1],
-                [200, 5000, 30, 10000, 30, 1],
-                [800, 12050, 40, 2000, 45, 1],
-                [600, 11000, 50, 3000, 50, 1],
-                [700, 11750, 20, 2750, 55, 1],
-                [1200, 52000, 10, 75, 60, 1],
-            ],
-            columns=[
-                'number_of_users',
-                'views',
-                'number_of_creators',
-                'losses',
-                'number_of_advertisers',
-                'intercept',
-            ],
-        )
-        column_types = {
-            'number_of_users': 'number',
-            'views': 'number',
-            'number_of_creators': 'number',
-            'losses': 'number',
-            'number_of_advertisers': 'number',
-            'intercept': 'number',
-        }
-        statistics = {}
-        df = clean_dataframe(df, column_types, dropna=False)
-        rule = RemoveCollinearColumns(df, column_types, statistics)
-        expected_vifs_no_remove = (
-            37.32458129910488,
-            17.631950562635552,
-            1.0708742222569918,
-            9.011569965497548,
-            1.4493983644673827,
-        )
-        expected_vifs_remove = (
-            37.32458129910488,
-            2.3104977034413134,
-            1.0257954327379684,
-            1.0616108815019005,
-            1.0,
-        )
-        for column, expected_vif in zip(rule.numeric_columns, expected_vifs_no_remove):
-            vif = rule.get_variance_inflation_factor(column)
-            self.assertAlmostEqual(vif, expected_vif)
-        for column, expected_vif in zip(rule.numeric_columns[:-1], expected_vifs_remove):
-            vif = rule.get_variance_inflation_factor(column)
-            self.assertAlmostEqual(vif, expected_vif)
-            rule.numeric_df.drop(column, axis=1, inplace=True)
+        self.assertEqual(result, expected_results)