Airbnb Prediction Notebook
Jupyter notebook for Airbnb price prediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
Reading In the Data¶
# Load the training listings and widen notebook display so no columns are truncated.
df = pd.read_csv('train.csv')
pd.set_option('display.max_columns', None)  # show every column of this wide frame
df.head()
id | name | summary | space | description | experiences_offered | neighborhood_overview | notes | transit | access | interaction | house_rules | host_id | host_name | host_since | host_location | host_about | host_response_time | host_response_rate | host_acceptance_rate | host_is_superhost | host_neighbourhood | host_listings_count | host_verifications | host_has_profile_pic | host_identity_verified | neighbourhood_cleansed | neighbourhood_group_cleansed | city | state | zipcode | market | country_code | country | property_type | room_type | accommodates | bathrooms | bedrooms | beds | bed_type | amenities | square_feet | price | guests_included | extra_people | minimum_nights | maximum_nights | number_of_reviews | first_review | last_review | review_scores_rating | review_scores_accuracy | review_scores_cleanliness | review_scores_checkin | review_scores_communication | review_scores_location | review_scores_value | instant_bookable | is_business_travel_ready | cancellation_policy | require_guest_profile_picture | require_guest_phone_verification | calculated_host_listings_count | reviews_per_month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22267382 | Modern and Cozy Large Studio in Brooklyn | Modern large studio with new amenities and app... | Our place is a little quiet sanctuary in the h... | Modern large studio with new amenities and app... | none | BAM, Barclays, Brooklyn City Point, Fort Green... | NaN | Subway: 2,3,4,5,A,C,B,Q,G | Washer/Dryer Dishwasher Internet Gym Roof Top ... | Depending on the time of your visit, I'll be h... | - Please be respectful of our neighbors, no lo... | 1910170 | Katarina | 2012-03-12 | Los Angeles, California, United States | Traveler, Artist, Philanthropist | NaN | NaN | NaN | f | Fort Greene | 1.0 | ['email', 'phone', 'reviews', 'jumio', 'govern... | t | t | Fort Greene | Brooklyn | Brooklyn | NY | 11217.0 | New York | US | United States | Loft | Entire home/apt | 2 | 1.0 | 1.0 | 1.0 | Real Bed | {Wifi,"Air conditioning",Kitchen,Gym,Breakfast... | NaN | 145.0 | 2 | $30.00 | 7 | 12 | 6 | 2018-01-01 | 2018-07-03 | 100.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | t | f | flexible | f | f | 1 | 0.59 |
1 | 2473861 | Royal Harlem TRIPLEX Home 5 Beds | Harlem is back and so gorgeous! Visit and expl... | Harlem is back and so gorgeous! Visit and expl... | Harlem is back and so gorgeous! Visit and expl... | none | HARLEM is a piece of real NY history overflowi... | HARLEM RESTAURANTS Red Rooster Harlem -- excel... | PUBLIC TRANSPORTATION: Conveniently near all p... | The WHOLE ENTIRE HOUSE | NaN | Smoking, pets and unaccounted guests NOT permi... | 8826175 | Grover | 2013-09-14 | New York, New York, United States | As the owners of the property we have realized... | within an hour | 100% | NaN | f | Mid-Wilshire | 4.0 | ['email', 'phone', 'reviews', 'manual_offline'] | t | t | Harlem | Manhattan | New York | NY | 10027.0 | New York | US | United States | House | Entire home/apt | 8 | 1.0 | 3.0 | 5.0 | Real Bed | {TV,Internet,Wifi,"Air conditioning",Kitchen,"... | NaN | 175.0 | 3 | $19.00 | 3 | 30 | 137 | 2014-04-15 | 2018-10-20 | 91.0 | 9.0 | 9.0 | 9.0 | 9.0 | 9.0 | 9.0 | t | f | moderate | f | f | 3 | 2.47 |
2 | 25079703 | Sunny East Village Studio | Clean, hip and well designed sun drenched East... | This is a rare East Village studio with it's h... | Clean, hip and well designed sun drenched East... | none | East Village is one of the last remaining neig... | NaN | NaN | You'll have access to the entire space - it's ... | Very responsive via phone call, text or email. | NaN | 4383563 | Zander | 2012-12-11 | New York, New York, United States | New York City based, currently working as a te... | within a few hours | 100% | NaN | f | East Village | 1.0 | ['email', 'phone', 'facebook', 'reviews', 'kba'] | t | t | East Village | Manhattan | New York | NY | 10009.0 | New York | US | United States | Apartment | Entire home/apt | 2 | 1.0 | 0.0 | 1.0 | Real Bed | {TV,Wifi,"Air conditioning",Kitchen,Heating,"S... | NaN | 180.0 | 1 | $0.00 | 2 | 1125 | 3 | 2018-07-26 | 2018-10-14 | 100.0 | 10.0 | 9.0 | 9.0 | 10.0 | 10.0 | 10.0 | f | f | moderate | f | f | 1 | 0.89 |
3 | 9342478 | Beautiful, airy, light-filled room | Private, spacious, comfortable room in 2-bed f... | Big closet, two big windows, tall ceiling and ... | Private, spacious, comfortable room in 2-bed f... | none | One block from Morgan L stop. Super cool area.... | NaN | NaN | NaN | NaN | NaN | 10406276 | Kathleen | 2013-12-03 | New York, New York, United States | Australian actress living in New York. Love ex... | NaN | NaN | NaN | f | Williamsburg | 1.0 | ['email', 'phone', 'reviews', 'kba'] | t | t | Williamsburg | Brooklyn | Brooklyn | NY | 11237.0 | New York | US | United States | Apartment | Private room | 1 | 1.0 | 1.0 | 1.0 | Real Bed | {Wifi,"Air conditioning",Kitchen,Heating,Washe... | NaN | 42.0 | 1 | $0.00 | 3 | 1125 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | f | f | flexible | f | f | 1 | NaN |
4 | 4866426 | Private Room in Prime Brooklyn Spot | Comfy, quiet and big private room in a three b... | This big old apartment that we love and take c... | Comfy, quiet and big private room in a three b... | none | I absolutely love this neighborhood - right at... | Just a note about the space: The window in you... | Super convenient to almost all subway lines. A... | Your room has a very comfortable queen sized b... | We are my husband Joaquin and I, our sweet new... | This house is shoes off. Thank you! No guests ... | 2346300 | Donna And Joaquin | 2012-05-11 | Brooklyn, NY | Of the many reasons I love to travel perhaps t... | within a few hours | 100% | NaN | t | Boerum Hill | 1.0 | ['email', 'phone', 'reviews', 'jumio', 'govern... | t | t | Boerum Hill | Brooklyn | Brooklyn | NY | 11201.0 | New York | US | United States | Apartment | Private room | 2 | 1.0 | 1.0 | 1.0 | Real Bed | {Internet,Wifi,"Air conditioning",Kitchen,Brea... | NaN | 80.0 | 1 | $14.00 | 1 | 90 | 144 | 2015-01-31 | 2018-10-17 | 97.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | f | f | flexible | f | f | 1 | 3.14 |
Assessing the Data¶
Through a quick look at the data, I can see that it has a large number of columns, and a lot of that data would only act as noise when trying to predict the prices of the airbnbs. Hence, after looking through the data, I am dropping the columns that I think would not be useful in predicting the prices of the airbnbs.
# Columns removed to make the dataset smaller, determined by a quick assessment
# of the data and domain knowledge.
# NOTE(review): this literal list is duplicated inside remove_useless() below —
# the two sets are currently identical; keep them in sync (or call the helper).
df_clean = df.drop(columns=[
    'summary', 'space', 'description', 'experiences_offered', 'notes',
    'transit', 'access', 'interaction', 'house_rules', 'host_id', 'host_name', 'host_since', 'host_location',
    'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_listings_count',
    'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'state', 'market', 'country_code',
    'country', 'first_review', 'last_review', 'instant_bookable', 'is_business_travel_ready', 'cancellation_policy',
    'require_guest_profile_picture', 'require_guest_phone_verification', 'calculated_host_listings_count', 'host_neighbourhood',
    'neighborhood_overview', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'reviews_per_month', 'maximum_nights', 'review_scores_value', 'number_of_reviews',
    'name', 'neighbourhood_group_cleansed', 'city', 'zipcode', 'extra_people'
])
# Helper reused on both the train and test frames to discard columns judged
# (by manual inspection and domain knowledge) to add only noise for price prediction.
def remove_useless(df):
    """Return a copy of ``df`` without the uninformative columns.

    ``DataFrame.drop`` returns a new frame, so the caller's frame is untouched.
    """
    useless_cols = [
        # free-text / descriptive fields
        'name', 'summary', 'space', 'description', 'experiences_offered',
        'neighborhood_overview', 'notes', 'transit', 'access', 'interaction',
        'house_rules',
        # host metadata
        'host_id', 'host_name', 'host_since', 'host_location', 'host_about',
        'host_response_time', 'host_response_rate', 'host_acceptance_rate',
        'host_listings_count', 'host_verifications', 'host_has_profile_pic',
        'host_identity_verified', 'host_neighbourhood',
        # location fields made redundant by neighbourhood_cleansed
        'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
        'country_code', 'country',
        # review / booking details not used downstream
        'first_review', 'last_review', 'review_scores_accuracy',
        'review_scores_cleanliness', 'review_scores_checkin',
        'review_scores_communication', 'review_scores_value',
        'reviews_per_month', 'number_of_reviews', 'instant_bookable',
        'is_business_travel_ready', 'cancellation_policy',
        'require_guest_profile_picture', 'require_guest_phone_verification',
        'calculated_host_listings_count', 'maximum_nights', 'extra_people',
    ]
    return df.drop(columns=useless_cols)
Exploring the Data¶
First I am going to plot a heatmap to view the numerical data columns correlation with the price column. This will help me to see which columns are most correlated with the price column. As price is the target variable.
# Correlation of each numeric column with price.  Select the numeric columns
# explicitly: DataFrame.corr() silently dropped object columns in older pandas
# but raises on them in pandas >= 2.0, so this form is version-safe.
temp = df_clean.select_dtypes(include=np.number).corr()
sns.heatmap(temp[['price']].sort_values(by='price', ascending=False), annot=True, cmap='coolwarm')
<AxesSubplot:>
As expected, we can see the number of people the airbnb can accommodate has the largest correlation with the price, as it makes sense for an airbnb to cost more if it can accommodate more people, which in turn also tells us that the place is larger.
This leads me to a slight oddity in the correlation, which is square_feet. The correlation is still high, but I personally expected it to be higher. I shall assess that further later to figure out why this might be the case.
Overall all the correlations that I expected to be high are high, which is a good sign.
Next we are going to do a missing-value assessment of the data, since before we begin deeper EDA we need to make sure that the data is clean and does not have any missing values.
# Count the missing values in each remaining column.
df_clean.isnull().sum()
id 0 host_is_superhost 5 neighbourhood_cleansed 0 property_type 0 room_type 0 accommodates 0 bathrooms 60 bedrooms 33 beds 31 bed_type 0 amenities 0 square_feet 33197 price 0 guests_included 0 minimum_nights 0 review_scores_rating 7664 review_scores_location 7708 dtype: int64
As we can see the columns bathrooms
, bedrooms
, beds
, square_feet
, review_scores_rating
and review_scores_location
have missing values.
Relating back to the correlations, it now makes a lot more sense why the correlation of square_feet is not as high as I expected. The square_feet column has a lot of missing values, which means there is not as much data available for the correlation calculation — a likely determining factor in why the correlation is lower than expected.
Before we move forward with more EDA, I will first try to assess any clear patterns in the columns with missing values to find the best way to impute the missing data.
My initial thought was that the columns accommodates
and square_feet
should have some kind of relationship, as the more people a place can accommodate, the larger the place should be in theory. Hence I am going to plot a scatter plot to see if there is any relationship between the two columns, and also draw a line of best fit to see if there is any clear pattern.
# Relationship between accommodates and square_feet with a fitted trend line.
# regplot draws both the scatter points and the regression line itself, so the
# separate scatterplot call used previously double-plotted the same points.
sns.regplot(x='accommodates', y='square_feet', data=df_clean)
<AxesSubplot:xlabel='accommodates', ylabel='square_feet'>
As we can see there is a clear pattern between the two columns, which means that we can use the accommodates
column to impute the missing values in the square_feet
column by looking at the values of square_feet for the same accommodates value and then imputing the missing values with the median of those square_feet values.
# Summary statistics for the columns that still contain missing values,
# to compare mean vs. median before choosing an imputation strategy.
df_clean[['bathrooms', 'bedrooms', 'beds', 'review_scores_rating', 'review_scores_location']].describe()
bathrooms | bedrooms | beds | review_scores_rating | review_scores_location | |
---|---|---|---|---|---|
count | 33478.000000 | 33505.000000 | 33507.000000 | 25874.000000 | 25830.000000 |
mean | 1.141376 | 1.176869 | 1.571313 | 93.751488 | 9.500542 |
std | 0.425003 | 0.744365 | 1.054226 | 8.315867 | 0.781516 |
min | 0.000000 | 0.000000 | 0.000000 | 20.000000 | 2.000000 |
25% | 1.000000 | 1.000000 | 1.000000 | 91.000000 | 9.000000 |
50% | 1.000000 | 1.000000 | 1.000000 | 96.000000 | 10.000000 |
75% | 1.000000 | 1.000000 | 2.000000 | 100.000000 | 10.000000 |
max | 16.500000 | 10.000000 | 18.000000 | 100.000000 | 10.000000 |
Looking at the values of the bathrooms
, bedrooms
, beds
and review_scores_rating
columns, I can see that the missing values are not in any particular pattern and the mean and the median discrepancy is not big. Hence I will impute the missing values with the mean for review_scores_rating
and review_scores_location
as having float values for these columns still makes some sense.
However, for the bathrooms
, bedrooms
and beds
columns, I will impute the missing values with the median as having a whole number for these columns still make sense.
def impute_df(df):
    """Fill every missing value in the cleaned listings frame (mutates ``df``).

    Strategy (mirrors the EDA above):
      * review scores            -> column mean (fractional scores are fine)
      * bathrooms/bedrooms/beds  -> column median (keeps whole-ish numbers)
      * square_feet              -> median square footage of listings with the
        same ``accommodates`` value, falling back to the column maximum for
        groups with no data at all
      * host_is_superhost        -> most frequent value
    Also caps ``guests_included`` at ``accommodates``.

    NOTE(review): statistics come from the frame being imputed, so running
    this on the test set uses test-set statistics — confirm that is intended.
    """
    # Mean-impute the review scores.
    for col in ('review_scores_rating', 'review_scores_location'):
        df[col] = df[col].fillna(df[col].mean())
    # Median-impute the room counts.
    for col in ('bathrooms', 'bedrooms', 'beds'):
        df[col] = df[col].fillna(df[col].median())
    # Median square footage within each accommodates bucket...
    per_group_median = df.groupby('accommodates')['square_feet'].transform('median')
    df['square_feet'] = df['square_feet'].fillna(per_group_median)
    # ...and the column maximum where a bucket had no data at all.
    df['square_feet'] = df['square_feet'].fillna(df['square_feet'].max())
    # Mode-impute the superhost flag.
    df['host_is_superhost'] = df['host_is_superhost'].fillna(df['host_is_superhost'].mode()[0])
    # A listing cannot include more guests than it accommodates.
    too_many = df['guests_included'] > df['accommodates']
    df.loc[too_many, 'guests_included'] = df['accommodates']
    return df
# Run the imputation (impute_df also fills df_clean in place) and
# verify that no missing values remain.
imputed = impute_df(df_clean)
imputed.isnull().sum()
id 0 host_is_superhost 0 neighbourhood_cleansed 0 property_type 0 room_type 0 accommodates 0 bathrooms 0 bedrooms 0 beds 0 bed_type 0 amenities 0 square_feet 0 price 0 guests_included 0 minimum_nights 0 review_scores_rating 0 review_scores_location 0 dtype: int64
As we can see the missing values have been imputed and we do not have any missing values in the data anymore!
Now we can begin the EDA by checking the relationships with the price.
# Bar plots of price against each low-cardinality numeric column.
fig, ax = plt.subplots(2, 3, figsize=(20, 10))
sns.barplot(x='bathrooms', y='price', data=df_clean, ax=ax[0, 0])
sns.barplot(x='bedrooms', y='price', data=df_clean, ax=ax[0, 1])
sns.barplot(x='beds', y='price', data=df_clean, ax=ax[0, 2])
# NOTE(review): this one is a bivariate histplot, not a barplot like the
# others, and ax[1, 2] is left empty — confirm both are intentional.
sns.histplot(x='review_scores_rating', y='price', data=df_clean, ax=ax[1, 0])
sns.barplot(x='review_scores_location', y='price', data=df_clean, ax=ax[1, 1])
<AxesSubplot:xlabel='review_scores_location', ylabel='price'>
# Box plots of price against each low-cardinality column, one per axis.
fig, ax = plt.subplots(2, 2, figsize=(20, 10))
box_cols = ['bathrooms', 'bedrooms', 'beds', 'review_scores_location']
for axis, col in zip(ax.ravel(), box_cols):
    sns.boxplot(x=col, y='price', data=df_clean, ax=axis)
# Extra-wide figure: review_scores_rating has many distinct values.
plt.figure(figsize=(30, 10))
sns.boxplot(x='review_scores_rating', y='price', data=df_clean)
<AxesSubplot:xlabel='review_scores_rating', ylabel='price'>
# Peek at the frame after imputation (df_clean was filled in place by impute_df).
df_clean.head()
id | host_is_superhost | neighbourhood_cleansed | property_type | room_type | accommodates | bathrooms | bedrooms | beds | bed_type | amenities | square_feet | price | guests_included | minimum_nights | review_scores_rating | review_scores_location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22267382 | f | Fort Greene | Loft | Entire home/apt | 2 | 1.0 | 1.0 | 1.0 | Real Bed | {Wifi,"Air conditioning",Kitchen,Gym,Breakfast... | 550.0 | 145.0 | 2 | 7 | 100.000000 | 10.000000 |
1 | 2473861 | f | Harlem | House | Entire home/apt | 8 | 1.0 | 3.0 | 5.0 | Real Bed | {TV,Internet,Wifi,"Air conditioning",Kitchen,"... | 1650.0 | 175.0 | 3 | 3 | 91.000000 | 9.000000 |
2 | 25079703 | f | East Village | Apartment | Entire home/apt | 2 | 1.0 | 0.0 | 1.0 | Real Bed | {TV,Wifi,"Air conditioning",Kitchen,Heating,"S... | 550.0 | 180.0 | 1 | 2 | 100.000000 | 10.000000 |
3 | 9342478 | f | Williamsburg | Apartment | Private room | 1 | 1.0 | 1.0 | 1.0 | Real Bed | {Wifi,"Air conditioning",Kitchen,Heating,Washe... | 269.0 | 42.0 | 1 | 3 | 93.751488 | 9.500542 |
4 | 4866426 | t | Boerum Hill | Apartment | Private room | 2 | 1.0 | 1.0 | 1.0 | Real Bed | {Internet,Wifi,"Air conditioning",Kitchen,Brea... | 550.0 | 80.0 | 1 | 1 | 97.000000 | 10.000000 |
# Bag-of-words encode the amenities column.  The token pattern keeps word
# tokens but rejects those immediately followed by a comma, roughly splitting
# the {a,b,"c d"} amenity list format.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(token_pattern=r"(?u)\b\w+\b(?<!,)")
amenities_encoded = cv.fit_transform(imputed['amenities'])
# Rank amenity tokens by chi-squared score against price.
# NOTE(review): chi2 expects a discrete class label; `price` is continuous,
# so each distinct price acts as its own class — treat this ranking as a
# rough heuristic only.
from sklearn.feature_selection import SelectKBest, chi2
k = 10
selector = SelectKBest(chi2, k=k)
selector.fit(amenities_encoded, imputed['price'])
top_k_indices = selector.get_support(indices=True)
feature_names = cv.get_feature_names()  # hoisted: one call, not one per index
top_k_amenities = [feature_names[i] for i in top_k_indices]
print('Top {} amenities:'.format(k))
print(top_k_amenities)
Top 10 amenities: ['building', 'crib', 'floor', 'ground', 'gym', 'play', 'staff', 'travel', 'tv', 'wide']
# TF-IDF encode the amenities column, this time with unigrams and bigrams.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b(?<!,)", ngram_range=(1, 2))
# NOTE(review): this fits on the raw `df`, not the `imputed` frame used for
# the CountVectorizer above — confirm the inconsistency is intentional.
amenities_encoded = tfidf.fit_transform(df['amenities'])
# Example: the fitted vectorizer can transform unseen amenity strings.
new_amenities = 'Wifi, Kitchen, Washer, Dryer, Gym'
new_amenities_encoded = tfidf.transform([new_amenities])
# Rank phrases by their mean TF-IDF score across all listings.
import numpy as np
scores = np.asarray(amenities_encoded.mean(axis=0)).ravel()
top_k_indices = scores.argsort()[-10:]
feature_names = tfidf.get_feature_names()  # hoisted out of the comprehension
top_k_amenities = [feature_names[i] for i in top_k_indices]
print('Top 10 amenities:')
print(top_k_amenities)
Top 10 amenities: ['conditioning', 'air', 'essentials', 'heating', 'kitchen', 'wifi', 'friendly', 'dryer', 'tv', 'detector']
I will check whether the following amenities exist: ['conditioning', 'air', 'essentials', 'heating', 'kitchen', 'wifi', 'friendly', 'dryer', 'tv', 'detector', 'gym']
def create_features(df):
    """Add binary ``has_*`` amenity indicator columns, then drop raw columns.

    The indicators are written onto ``df`` itself; the returned frame is a
    copy without ``minimum_nights``, ``amenities`` and ``host_is_superhost``.
    """
    keyword_for = {
        'has_wifi': 'wifi',
        'has_kitchen': 'kitchen',
        'has_washer': 'washer',
        'has_dryer': 'dryer',
        'has_gym': 'gym',
        'has_heating': 'heating',
        'has_air_conditioning': 'air conditioning',
        'has_tv': 'tv',
        'has_breakfast': 'breakfast',
        'has_essentials': 'essentials',
    }
    for feature, keyword in keyword_for.items():
        # Case-insensitive substring match; a missing amenities string counts as absent.
        df[feature] = df['amenities'].str.contains(keyword, case=False, na=False).astype(int)
    # Raw columns no longer needed once the indicators exist.
    return df.drop(columns=['minimum_nights', 'amenities', 'host_is_superhost'])
def remove_outliers(df):
    """Return only the listings priced strictly below $1000 (drops price outliers)."""
    below_cap = df['price'] < 1000
    return df[below_cap]
# Rebuild the training frame from scratch with the full helper pipeline.
train_data = pd.read_csv('train.csv')
train_data = remove_useless(train_data)
train_data = impute_df(train_data)
train_data = create_features(train_data)
# Outlier removal was tried but left disabled.
# train_data = remove_outliers(train_data)
train_data.head()
id | neighbourhood_cleansed | property_type | room_type | accommodates | bathrooms | bedrooms | beds | bed_type | square_feet | price | guests_included | review_scores_rating | review_scores_location | has_wifi | has_kitchen | has_washer | has_dryer | has_gym | has_heating | has_air_conditioning | has_tv | has_breakfast | has_essentials | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22267382 | Fort Greene | Loft | Entire home/apt | 2 | 1.0 | 1.0 | 1.0 | Real Bed | 550.0 | 145.0 | 2 | 100.000000 | 10.000000 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 |
1 | 2473861 | Harlem | House | Entire home/apt | 8 | 1.0 | 3.0 | 5.0 | Real Bed | 1650.0 | 175.0 | 3 | 91.000000 | 9.000000 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 1 |
2 | 25079703 | East Village | Apartment | Entire home/apt | 2 | 1.0 | 0.0 | 1.0 | Real Bed | 550.0 | 180.0 | 1 | 100.000000 | 10.000000 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 |
3 | 9342478 | Williamsburg | Apartment | Private room | 1 | 1.0 | 1.0 | 1.0 | Real Bed | 269.0 | 42.0 | 1 | 93.751488 | 9.500542 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1 |
4 | 4866426 | Boerum Hill | Apartment | Private room | 2 | 1.0 | 1.0 | 1.0 | Real Bed | 550.0 | 80.0 | 1 | 97.000000 | 10.000000 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 1 |
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
import category_encoders as ce
def host_is_superhost_func(ser):
    """Encode the 't'/'f' superhost flag as 1/0 (any non-'t' value, including NaN, -> 0)."""
    return ser.eq('t').astype(int).to_frame()
def room_type_func(ser):
    """Ordinally encode room_type: Shared room -> 0, Private room -> 1, anything else -> 2."""
    codes = ser.map({'Shared room': 0, 'Private room': 1}).fillna(2).astype(int)
    return pd.DataFrame(codes)
# StandardScaler is used below but was never imported with the other
# sklearn.preprocessing names (only OneHotEncoder and FunctionTransformer
# were) — import it here so the cell runs without a NameError.
from sklearn.preprocessing import StandardScaler

host_is_superhost_encoder = FunctionTransformer(host_is_superhost_func)
room_type_encoder = FunctionTransformer(room_type_func)

# Column-wise preprocessing:
#   square_feet             -> standardized (zero mean, unit variance)
#   room_type               -> ordinal 0/1/2 encoding (room_type_func)
#   property_type, bed_type -> one-hot; unknown categories ignored at transform
#   neighbourhood_cleansed  -> target (mean-price) encoding
#   everything else         -> passed through unchanged
preprocessor = ColumnTransformer(transformers=[
    ('std_sf', StandardScaler(), ['square_feet']),
    # ('host_is_superhost', host_is_superhost_encoder, 'host_is_superhost'),
    ('room_type', room_type_encoder, 'room_type'),
    ('ohe', OneHotEncoder(handle_unknown='ignore'), ['property_type', 'bed_type']),
    ('te', ce.TargetEncoder(), ['neighbourhood_cleansed']),
], remainder='passthrough')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
# Feature/target split; drop the identifier so it is not used as a feature.
X = train_data.drop(columns=['price', 'id'])
y = train_data['price']
# NOTE(review): no random_state — the split (and every RMSE below) changes
# on each run; consider fixing a seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Baseline model: preprocessing + ordinary least squares.
lin_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
lin_reg.fit(X_train, y_train)
Pipeline(steps=[('preprocessor', ColumnTransformer(remainder='passthrough', transformers=[('std_sf', StandardScaler(), ['square_feet']), ('room_type', FunctionTransformer(func=<function room_type_func at 0x0000026628B99310>), 'room_type'), ('ohe', OneHotEncoder(handle_unknown='ignore'), ['property_type', 'bed_type']), ('te', TargetEncoder(), ['neighbourhood_cleansed'])])), ('regressor', LinearRegression())])
# Shape of the preprocessed design matrix (note: this refits the preprocessor
# outside the pipeline, purely to inspect the column count).
preprocessor.fit_transform(X_train, y_train).shape
(26830, 53)
# RMSE on training set
from sklearn.metrics import mean_squared_error
y_pred = lin_reg.predict(X_train)
mean_squared_error(y_train, y_pred, squared=False)  # squared=False -> RMSE
97.78753409712414
# RMSE on the held-out test split.
y_pred = lin_reg.predict(X_test)
mean_squared_error(y_test, y_pred, squared=False)
88.75963352020634
import matplotlib.pyplot as plt
# Diagnostic plot: predicted vs actual prices for the linear model.
plt.scatter(y_test, y_pred)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted')
plt.show()
# Learning curve for linear regression: 5-fold CV at 10 training-set sizes.
from sklearn.model_selection import learning_curve
# Scores are negated RMSE (sklearn maximizes), hence the sign flips below.
train_sizes, train_scores, test_scores = learning_curve(lin_reg, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10))
train_scores_mean = -train_scores.mean(axis=1)
test_scores_mean = -test_scores.mean(axis=1)
plt.plot(train_sizes, train_scores_mean, label='Training error')
plt.plot(train_sizes, test_scores_mean, label='Validation error')
plt.ylabel('RMSE', fontsize=14)
plt.xlabel('Training set size', fontsize=14)
plt.title('Learning curves for a linear regression model', fontsize=18, y=1.03)
plt.legend()
plt.ylim(0, 100)
(0.0, 100.0)
# Testing using a single decision tree (same preprocessing pipeline).
from sklearn.tree import DecisionTreeRegressor
tree_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor())
])
tree_reg.fit(X_train, y_train)
# Training RMSE (an unpruned tree will fit the training data very closely).
y_pred = tree_reg.predict(X_train)
mean_squared_error(y_train, y_pred, squared=False)
22.089845626294636
# RMSE on test set — much worse than the training RMSE, indicating overfitting.
y_pred = tree_reg.predict(X_test)
mean_squared_error(y_test, y_pred, squared=False)
118.8768649459989
# Testing using a random forest (bagged trees should overfit less than one tree).
from sklearn.ensemble import RandomForestRegressor
forest_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])
forest_reg.fit(X_train, y_train)
# Training RMSE.
y_pred = forest_reg.predict(X_train)
mean_squared_error(y_train, y_pred, squared=False)
40.037828415449844
# RMSE on test set — best of the models tried so far.
y_pred = forest_reg.predict(X_test)
mean_squared_error(y_test, y_pred, squared=False)
85.33254270104557
# Learning curve for the random forest: 5-fold CV at 10 training-set sizes.
from sklearn.model_selection import learning_curve
# Scores are negated RMSE (sklearn maximizes), hence the sign flips below.
train_sizes, train_scores, test_scores = learning_curve(forest_reg, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10))
train_scores_mean = -train_scores.mean(axis=1)
test_scores_mean = -test_scores.mean(axis=1)
plt.plot(train_sizes, train_scores_mean, label='Training error')
plt.plot(train_sizes, test_scores_mean, label='Validation error')
plt.ylabel('RMSE', fontsize=14)
plt.xlabel('Training set size', fontsize=14)
plt.title('Learning curves for a random forest regressor', fontsize=18, y=1.03)
plt.legend()
plt.ylim(0, 100)
(0.0, 100.0)
# Testing using lasso regression (L1-regularized linear model, default alpha).
from sklearn.linear_model import Lasso
lasso_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Lasso())
])
lasso_reg.fit(X_train, y_train)
# Training RMSE.
y_pred = lasso_reg.predict(X_train)
mean_squared_error(y_train, y_pred, squared=False)
101.45708378436902
# RMSE on test set.
y_pred = lasso_reg.predict(X_test)
mean_squared_error(y_test, y_pred, squared=False)
103.31870550049732
# Testing ridge regression (L2-regularized linear model, default alpha).
from sklearn.linear_model import Ridge
ridge_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge())
])
ridge_reg.fit(X_train, y_train)
# Training RMSE.
y_pred = ridge_reg.predict(X_train)
mean_squared_error(y_train, y_pred, squared=False)
105.10485272712643
# RMSE on test set.
y_pred = ridge_reg.predict(X_test)
mean_squared_error(y_test, y_pred, squared=False)
106.60569205771647
# Testing elastic net (combined L1/L2 regularization, default parameters).
from sklearn.linear_model import ElasticNet
elastic_net = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ElasticNet())
])
elastic_net.fit(X_train, y_train)
# Training RMSE only — NOTE(review): unlike the other models, no test-set
# RMSE is computed for elastic net.
y_pred = elastic_net.predict(X_train)
mean_squared_error(y_train, y_pred, squared=False)
108.85157564759555
# Apply the same cleaning pipeline to the held-out test file.
# NOTE(review): impute_df computes its fill statistics from the test frame
# itself rather than reusing the train statistics — potential leakage/skew.
test_data = pd.read_csv('test.csv')
test_data = remove_useless(test_data)
test_data = impute_df(test_data)
test_data = create_features(test_data)
test_data_to_predict = test_data.drop(columns=['id'])
test_data.isnull().sum()
id 0 neighbourhood_cleansed 0 property_type 0 room_type 0 accommodates 0 bathrooms 0 bedrooms 0 beds 0 bed_type 0 square_feet 0 guests_included 0 review_scores_rating 0 review_scores_location 0 has_wifi 0 has_kitchen 0 has_washer 0 has_dryer 0 has_gym 0 has_heating 0 has_air_conditioning 0 has_tv 0 has_breakfast 0 has_essentials 0 dtype: int64
# Distribution of accommodates values in the test set.
test_data['accommodates'].value_counts()
2 7782 1 2752 4 2584 3 2012 6 822 5 701 8 256 7 165 10 98 9 49 16 43 12 37 14 13 15 8 11 8 13 7 Name: accommodates, dtype: int64
# Sanity check: rows with null square_feet after imputation (expect an empty result).
test_data[test_data['square_feet'].isnull()]['accommodates'].value_counts()
Series([], Name: accommodates, dtype: int64)
# Refit the best model (random forest) on the full training data, then predict.
train = train_data.drop(columns=['id', 'price'])
train_y = train_data['price']
final = forest_reg.fit(train, train_y)
pred = final.predict(test_data_to_predict)
# Submission-style frame: one predicted price per listing id.
output = pd.DataFrame({'id': test_data['id'], 'Predicted': pred})
output.head()
id | Predicted | |
---|---|---|
0 | 19307997 | 357.088333 |
1 | 20176193 | 124.355000 |
2 | 19485371 | 89.835000 |
3 | 13079990 | 140.276667 |
4 | 22339757 | 60.760000 |
# Write the submission file (to_csv returns None; the assigned variable is unused).
to_csv = output.to_csv('output.csv', index=False)