This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Derive the price per square foot for every listing.
df['Price_sqft'] = df['Price'] / df['Built_Size']

# Rank locations by mean price per square foot, most expensive first.
# Work on a deep copy so the rows dropped below are still present in `df`.
dfc = df.copy(deep=True)

# The data contains infinite values (presumably from a Built_Size of 0 in the
# division above -- confirm). For simplicity they are converted to NaN and the
# rows whose Price_sqft is missing are dropped.
# NOTE: the replace() call applies to every column of dfc, not only Price_sqft.
dfc = dfc.replace([np.inf, -np.inf], np.nan).dropna(subset=["Price_sqft"], how="all")
all_property_price_sqft = dfc.groupby('Location')['Price_sqft'].mean().sort_values(ascending=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Mean built-up size per location, largest first, shown as a bar chart.
all_property_sqft = df.groupby('Location')['Built_Size'].mean().sort_values(ascending=False)
bx = all_property_sqft.plot(
    kind='bar',
    title="Property Size Distribution in Kuala Lumpur",
    figsize=(15, 10),
    legend=True,
    fontsize=10,
    rot=90,  # location names are long; rotate the tick labels vertically
)
bx.set_xlabel("Locations", fontsize=10)
bx.set_ylabel("Size", fontsize=10)
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Mean listing price per location in KL, most expensive first, as a bar chart.
all_property_prices = df.groupby('Location')['Price'].mean().sort_values(ascending=False)
ax = all_property_prices.plot(
    kind='bar',
    title="Property Price Distribution in Kuala Lumpur",
    figsize=(15, 10),
    legend=True,
    fontsize=10,
    rot=90,  # location names are long; rotate the tick labels vertically
)
ax.set_xlabel("Locations", fontsize=10)
ax.set_ylabel("Price", fontsize=10)
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# impute the values using KNN | |
# define the methods | |
def weighted_hamming(data): | |
categories_dist = [] | |
for category in data: | |
X = pd.get_dummies(data[category]) | |
X_mean = X * X.mean() | |
X_dot = X_mean.dot(X.transpose()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Clean the Price column by stripping the "RM" label and thousands commas.
def price_cleanup(price):
    """Convert a raw price value such as ``"RM 1,250,000"`` to an ``int``.

    Args:
        price: Raw cell value. Either a string that may contain the "RM"
            label, commas and surrounding whitespace, an already-numeric
            value, or a missing value (``None`` / ``NaN``).

    Returns:
        The price as an ``int``, or ``None`` when ``price`` is missing.

    Raises:
        ValueError: If the text left after cleaning is not a whole number.
    """
    if pd.isna(price):
        return None

    # Already-numeric cells have no label or commas to strip; calling
    # str.replace on them would raise AttributeError.
    if not isinstance(price, str):
        return int(price)

    cleaned = price.replace("RM", "").replace(",", "").strip()
    return int(cleaned)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from collections import defaultdict | |
from scipy.stats import hmean | |
from scipy.spatial.distance import cdist | |
from scipy import stats | |
import numbers | |
def weighted_hamming(data): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert Built_Size into numeric value | |
def convert_built_size_numeric(bsize): | |
try: | |
if re.search(r"sq\.*\s*ft\.*", bsize) is None: | |
return None | |
bsize = bsize.replace(",", "") #remove the commas in price | |
bsize = bsize.replace("'", "") # remove the ''' symbol in few records | |
bsize = bsize.replace("sq. ft.", "") # remove the sq. ft. from the records | |
bsize = bsize.replace("sf", "") # some records are in 'sf' format, clean them |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Split a Size value on ":" and return one of the resulting parts.
def split_property_size(size, tp=0):
    """Return the ``tp``-th ":"-separated part of ``size``, stripped.

    Args:
        size: Raw Size cell, expected to look like ``"<type> : <size>"``.
        tp: Index of the part to return (0 = text before the first ":",
            1 = text after it).

    Returns:
        The selected part with surrounding whitespace removed. A value
        without a ``split`` method (e.g. ``None`` or a float) is returned
        unchanged. ``None`` is returned when ``size`` has no part at index
        ``tp``, e.g. a value containing no ":" when ``tp=1``.
    """
    try:
        parts = size.split(":")
    except AttributeError:
        # Not a string (missing value, number, ...): pass it through as-is.
        return size

    try:
        return parts[tp].strip()
    except IndexError:
        # The requested component is absent; report it as missing instead of
        # aborting a whole DataFrame.apply() on one malformed row.
        return None
# Create a new column holding the built-up type (the text before the ":").
# NOTE: astype(str) turns missing Size values into literal strings such as
# "nan", so they are counted below under that string rather than as NaN.
df["Built_Type"] = df['Size'].astype(str).apply(split_property_size, tp=0)
df["Built_Type"].value_counts(dropna=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def clean_property_types(propType): | |
# Define the cleaned types without the extra details | |
cleanTypes = [ | |
'Condominium', | |
'Serviced Residence', | |
'Terrace/Link House', | |
'Bungalow', | |
'Semi-detached House', | |
'Apartment', | |
'Residential Land', |
NewerOlder