Created epic algo

sammce · Feb 11, 2021 · e2777ab · e2777ab
1 parent bf666c0
commit e2777ab
Show file tree

Hide file tree

Showing 10 changed files with 572,411 additions and 612,315 deletions.
diff --git a/__pycache__/clean.cpython-38.pyc b/__pycache__/clean.cpython-38.pyc
diff --git a/__pycache__/formatter.cpython-38.pyc b/__pycache__/formatter.cpython-38.pyc
diff --git a/__pycache__/process.cpython-38.pyc b/__pycache__/process.cpython-38.pyc
diff --git a/clean.py b/clean.py
@@ -4,7 +4,7 @@ class CleanedData(Formatter):
     # Create dictionary to house cleaned DataFrames
     frames = {}
 
-    def __init__(self):
+    def __init__(self, new=False):
         '''
         Reads in data from specific excel files and stores it in 
         various variables.
@@ -34,35 +34,9 @@ def __init__(self):
         # used as row and column identifiers respectively
         self.years = self.np.array([], dtype=int)
         self.places = self.np.array(self.new_house_data.iloc[0][1:8].tolist())
-        # END OF AVERAGE PRICES PARSING
-
-
-        # START OF RAW PRICES PARSING
-        self.raw_data = self.pd.read_csv("with_outliers.csv")
-        self.cleaned_data = self.pd.read_csv("cleaned.csv")
-
-        outlier_dict = {
-            'min': {
-                'Dublin': 150000,
-                'Cork': 125000,
-                'Galway': 135000,
-                'Limerick': 110000,
-                'Waterford': 100000,
-                'Other Areas': 80000
-            },
-            'max': {
-                'Dublin': 570000,
-                'Cork': 500000,
-                'Galway': 520000,
-                'Limerick': 440000,
-                'Waterford': 460000,
-                'Other Areas': 400000,
-            },
-            'new': 8,
-        }
+        self.places_no_national = self.np.array(self.new_house_data.iloc[0][2:8].tolist())
 
         self.clean_averages()
-        # self.clean_raw_prices(outlier_dict)
 
     def clean_averages(self):
         # Clean new listings excel sheet
@@ -99,45 +73,6 @@ def clean_averages(self):
                 break
 
         self.old_avg = self.pd.DataFrame.from_dict(data, orient='index', columns=self.places)
-
-    def clean_raw_prices(self, outlier_dict):
-        # self.raw_final_new = []
-
-        self.cleaned_prices = self.cleaned_prices.drop(columns=["Unnamed: 0.1"])
-        # self.raw_price = self.raw_price.rename(columns={
-        #     "Date of Sale (dd/mm/yyyy)":"Year",
-        #     "Price (€)":"Price",
-        #     "Description of Property":"Description",
-        # })
-
-        for index, row in self.raw_data.iterrows():
-            # removes first character (€), any separating commas and turns to int
-            price = row['Price']
-            place = row['County']
-
-            # if price < outlier_dict['min'][place] or price > outlier_dict['max'][place]:
-                # self.raw_price = self.raw_price.drop([index])
-        #     row_place = self.raw_price.loc[index, 'County']
-        #     row_desc = self.raw_price.loc[index, 'Description']
-
-        #     if not row_place in self.places:
-        #         self.raw_price.loc[index, 'County'] = "Other Areas"
-        #         row_place = "Other Areas"
-
-        #     self.raw_price.loc[index, 'Year'] = int(row['Year'].split('/')[-1])
-
-        #     if 'New' in row_desc:
-        #         self.raw_price.loc[index, 'Description'] = "New"
-        #         row_price = round((row_price / 100) * outlier_dict['new'])
-        #     else:
-        #         self.raw_price.loc[index, 'Description'] = "Second"
-
-        #     # outlier dictionary is passed where min and max values are defined
-            if price < outlier_dict['min'][place] or price > outlier_dict['max'][place]:
-                self.cleaned_prices = self.cleaned_prices.drop([index])
-
-        self.cleaned_prices.to_csv("cleaned.csv", index=False)
-        print("Success!")
 
     def get_every_nyears(self, df, n=1):
         '''
@@ -184,12 +119,27 @@ def search(self, df, location, year):
         Returns an int, or prints an error message if no results are found.
         '''
         return df.loc[int(year), location.title()]
+
+
 
+
+
+
 if __name__=='__main__':
-    cleaned = CleanedData()
-    dublin_houses = cleaned.cleaned_data.loc[cleaned.cleaned_data['County'] == 'Dublin']
-    houses_2015 = dublin_houses.loc[dublin_houses['Year'] == 2015]
-    mean_2015 = houses_2015['Price'].mean()
-    print(mean_2015)
-    # print(cleaned.raw_price)  
-#     print(cleaned.search('2004', 'dublin'))
+    import plotly.express as px
+    # cleaned = CleanedData(new=True)
+
+    # data = []
+    # for place in cleaned.places:
+    #     if place == 'National':
+    #         continue
+
+    #     new_df = df[
+    #         (df['Year'] == 2015) & (df['County'] == place) & (df['Description'] == 'New')
+    #     ]
+    #     data.append([place, round(new_df['Price'].mean())])
+
+    # our_averages_2015 = cleaned.pd.DataFrame(data, columns=["Place", 'Price'])
+    # print(our_averages_2015)
+    # graph = px.bar(our_averages_2015, x="Place", y="Price", title="Price of new 2015 houses across Ireland")
+    # graph.write_html("newhtml.html", full_html=False, include_plotlyjs=False)