Date: 27 June, 2025 (Explore)
Country/City | File Name | Description |
---|---|---|
Denver | listings.csv | Summary information and metrics for listings in Denver (useful for visualisations) |
Denver | reviews.csv | Summary Review data and Listing ID (to facilitate time-based analytics and visualisations linked to a listing) |
Denver | neighbourhoods.csv | Neighbourhood list for geo filters (sourced from city/open source GIS files) |
Denver | neighbourhoods.geojson | GeoJSON file of neighbourhoods of the city. Source |
🔎 This analysis explores Airbnb pricing, demand, host strategies, and investment opportunities in Denver, using listings, calendar, reviews, and geospatial datasets.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import folium
from folium.plugins import HeatMap
from geopy.distance import geodesic
from statsmodels.tsa.seasonal import seasonal_decompose
from prophet import Prophet
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns
sns.set_style("whitegrid")
sns.set_palette("Set2")
listings_file = "listings.csv"
reviews_file = "reviews.csv"
neighbourhoods_file = "neighbourhoods.csv"
def load_data(listings_file, reviews_file, neighbourhoods_file):
listings = pd.read_csv(listings_file)
reviews = pd.read_csv(reviews_file)
neighbourhoods = pd.read_csv(neighbourhoods_file)
return listings, reviews, neighbourhoods
listings, reviews, neighbourhoods = load_data(listings_file, reviews_file, neighbourhoods_file)
def merge_data(listings, reviews, neighbourhoods):
listings_reviews = pd.merge(listings, reviews, left_on='id', right_on='listing_id', how='left')
listings_full = pd.merge(listings_reviews, neighbourhoods, on='neighbourhood', how='left')
if 'neighbourhood_group_x' in listings_full.columns and 'neighbourhood_group_y' in listings_full.columns:
listings_full.drop(columns=['neighbourhood_group_x'], inplace=True)
listings_full.rename(columns={'neighbourhood_group_y': 'neighbourhood_group'}, inplace=True)
return listings_full
full_data = merge_data(listings, reviews, neighbourhoods)
def analyze_data(listings, reviews, neighbourhoods, df):
#print("Dataset shape:", df.shape)
#print("\nColumns:", df.columns.tolist())
#print("\nSummary statistics:")
#print(df.describe(include='all'))
# Clean numeric fields
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['reviews_per_month'] = pd.to_numeric(df['reviews_per_month'], errors='coerce')
df['availability_365'] = pd.to_numeric(df['availability_365'], errors='coerce')
# Price Analysis
avg_price = df.groupby('neighbourhood')['price'].mean().sort_values(ascending=False)
print("\nAverage price per neighbourhood:")
print(avg_price.head(10))
avg_price.head(10).plot(kind='bar', figsize=(10, 5),
color=sns.color_palette("viridis", 10))
plt.title("Top 10 Neighbourhoods by Avg Price")
plt.ylabel("Average Price ($)")
plt.show()
#Demand Analysis
reviews_count = listings.groupby('neighbourhood')['number_of_reviews'].sum().sort_values(ascending=False)
print("\nTotal reviews per neighbourhood:")
print(reviews_count.head(10))
reviews_count.head(10).plot(kind='bar', figsize=(10, 5),
color=sns.color_palette("magma", 10))
plt.title("Top 10 Neighbourhoods by Total Reviews")
plt.ylabel("Number of Reviews")
plt.show()
#Listings Count
listings_count = listings['neighbourhood'].value_counts().head(10)
print("\nTop 10 neighbourhoods by number of listings:")
print(listings_count)
listings_count.plot(kind='bar', figsize=(10, 5),
color=sns.color_palette("pastel", 10))
plt.title("Top 10 Neighbourhoods by Listings Count")
plt.ylabel("Number of Listings")
plt.show()
#Price Distribution
plt.figure(figsize=(10,6))
df['price'].clip(upper=500).hist(bins=50, color=sns.color_palette("cubehelix", 8)[4])
plt.title("Price Distribution (Clipped at $500)")
plt.xlabel("Price ($)")
plt.ylabel("Count")
plt.show()
#Revenue Estimation
df['est_revenue'] = (
df['price'] *
df['reviews_per_month'].fillna(0) *
(df['availability_365'] / 30)
)
revenue_by_neighbourhood = df.groupby('neighbourhood')['est_revenue'].mean().sort_values(ascending=False)
print("\nAverage estimated revenue per neighbourhood:")
print(revenue_by_neighbourhood.head(10))
revenue_by_neighbourhood.head(10).plot(kind='bar', figsize=(10,5),
color=sns.color_palette("coolwarm", 10))
plt.title("Top 10 Neighbourhoods by Avg Estimated Revenue")
plt.ylabel("Estimated Revenue ($)")
plt.show()
# Occupancy Proxy
plt.figure(figsize=(8,6))
plt.scatter(df['availability_365'], df['reviews_per_month'],
alpha=0.5, c=df['price'].clip(upper=500), cmap="viridis")
plt.colorbar(label="Price ($, clipped at 500)")
plt.xlabel("Availability (days per year)")
plt.ylabel("Reviews per Month")
plt.title("Occupancy Proxy: Availability vs Reviews (Colored by Price)")
plt.show()
analyze_data(listings, reviews, neighbourhoods, full_data)
Average price per neighbourhood: neighbourhood Southmoor Park 399.839080 Belcaro 399.528302 Union Station 362.669429 Civic Center 359.879508 University Park 354.165195 CBD 316.517794 Hampden 314.521186 Chaffee Park 263.144940 Country Club 249.785211 Cherry Creek 236.462508 Name: price, dtype: float64
Total reviews per neighbourhood: neighbourhood Five Points 38302 Highland 24683 West Highland 12772 Berkeley 12419 Capitol Hill 12001 West Colfax 11601 Sunnyside 10762 Gateway - Green Valley Ranch 10687 Whittier 10467 Speer 10045 Name: number_of_reviews, dtype: int64
Top 10 neighbourhoods by number of listings: neighbourhood Five Points 397 Highland 279 Union Station 208 West Colfax 197 Gateway - Green Valley Ranch 185 Berkeley 181 West Highland 162 CBD 151 Capitol Hill 134 Sunnyside 133 Name: count, dtype: int64
Average estimated revenue per neighbourhood: neighbourhood Union Station 15927.793818 Jefferson Park 12375.036026 Highland 8091.679783 CBD 6913.921048 City Park West 6828.302944 Five Points 6220.461016 Elyria Swansea 5991.105472 Civic Center 5743.830543 Harvey Park 5586.956113 Gateway - Green Valley Ranch 5186.130440 Name: est_revenue, dtype: float64
def demand_forecasting(reviews, neighbourhood=None):
reviews['date'] = pd.to_datetime(reviews['date'], errors='coerce')
df = reviews.dropna(subset=['date'])
if neighbourhood and neighbourhood != "All" and 'neighbourhood' in df.columns:
df = df[df['neighbourhood'] == neighbourhood]
ts = df.groupby(pd.Grouper(key='date', freq='MS')).size().reset_index(name='reviews')
ts.rename(columns={'date': 'ds', 'reviews': 'y'}, inplace=True)
# Prophet Forecast
model = Prophet(yearly_seasonality=True, daily_seasonality=False)
model.fit(ts)
future = model.make_future_dataframe(periods=12, freq='MS')
forecast = model.predict(future)
# Forecast plot
model.plot(forecast)
plt.title(f"Forecast of Monthly Reviews ({neighbourhood if neighbourhood else 'All'})")
plt.show()
# Seasonal decomposition
decomposition = seasonal_decompose(ts.set_index('ds')['y'], model='additive', period=12)
decomposition.plot()
plt.show()
demand_forecasting(reviews, neighbourhood="All")
07:45:58 - cmdstanpy - INFO - Chain [1] start processing 07:45:58 - cmdstanpy - INFO - Chain [1] done processing
def host_strategy_analysis(listings):
listings['price'] = pd.to_numeric(listings['price'], errors='coerce')
listings['reviews_per_month'] = pd.to_numeric(listings['reviews_per_month'], errors='coerce')
listings['availability_365'] = pd.to_numeric(listings['availability_365'], errors='coerce')
# Identify host type
host_counts = listings.groupby('host_id')['id'].count().reset_index(name='listing_count')
listings = listings.merge(host_counts, on='host_id', how='left')
listings['host_type'] = np.where(listings['listing_count'] > 1, 'Multi-property', 'Single-property')
# Estimate revenue
listings['est_revenue'] = (
listings['price'] *
listings['reviews_per_month'].fillna(0) *
(listings['availability_365'] / 30)
)
# Market share
host_share = listings['host_type'].value_counts(normalize=True) * 100
host_share.plot(kind='bar', figsize=(6,4), title="Market Share by Host Type (%)")
plt.ylabel("Percentage of Listings")
plt.show()
# Revenue comparison
listings.groupby('host_type')['est_revenue'].mean().plot(
kind='bar', figsize=(6,4), title="Avg Estimated Revenue by Host Type"
)
plt.ylabel("Estimated Revenue ($)")
plt.show()
print("\n--- Host Strategy Insights ---")
print("Market Share (% of listings):")
print(host_share)
print("\nAverage Estimated Revenue by Host Type:")
print(listings.groupby('host_type')['est_revenue'].mean())
host_strategy_analysis(listings)
--- Host Strategy Insights --- Market Share (% of listings): host_type Single-property 57.978723 Multi-property 42.021277 Name: proportion, dtype: float64 Average Estimated Revenue by Host Type: host_type Multi-property 2298.460062 Single-property 2466.294476 Name: est_revenue, dtype: float64
def market_efficiency_anomalies(df):
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['minimum_nights'] = pd.to_numeric(df['minimum_nights'], errors='coerce')
df['calculated_host_listings_count'] = pd.to_numeric(df['calculated_host_listings_count'], errors='coerce')
# Detect anomalies
anomalies = df[
(df['minimum_nights'] > 365) |
((df['price'] > 1000) & (df['number_of_reviews'] == 0)) |
(df['calculated_host_listings_count'] > 100)
]
print(f"Found {len(anomalies)} anomalous listings")
# Outlier detection plot (Price vs Reviews)
plt.figure(figsize=(8,6))
sns.scatterplot(data=df, x='number_of_reviews', y='price', alpha=0.4)
sns.scatterplot(data=anomalies, x='number_of_reviews', y='price', color='red', label="Anomalies")
plt.ylim(0, 2000)
plt.title("Outlier Detection: Price vs Reviews")
plt.legend()
plt.show()
# Supply vs Demand (Neighbourhood saturation)
supply_demand = df.groupby('neighbourhood').agg(
listings=('id','count'),
total_reviews=('number_of_reviews','sum')
).reset_index()
plt.figure(figsize=(8,6))
sns.scatterplot(data=supply_demand, x='listings', y='total_reviews', alpha=0.7)
for _, row in supply_demand.nlargest(5,'listings').iterrows():
plt.text(row['listings'], row['total_reviews'], row['neighbourhood'], fontsize=9)
plt.xlabel("Number of Listings (Supply)")
plt.ylabel("Total Reviews (Demand)")
plt.title("Market Saturation: Supply vs Demand by Neighbourhood")
plt.show()
market_efficiency_anomalies(full_data)
Found 8 anomalous listings
def segmentation_clustering(df):
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['availability_365'] = pd.to_numeric(df['availability_365'], errors='coerce')
df['number_of_reviews'] = pd.to_numeric(df['number_of_reviews'], errors='coerce')
df['minimum_nights'] = pd.to_numeric(df['minimum_nights'], errors='coerce')
#Cluster neighbourhoods by market features
neigh_stats = df.groupby('neighbourhood').agg(
avg_price=('price','mean'),
avg_reviews=('number_of_reviews','mean'),
avg_avail=('availability_365','mean')
).dropna()
scaler = StandardScaler()
X = scaler.fit_transform(neigh_stats)
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
neigh_stats['cluster'] = kmeans.fit_predict(X)
plt.figure(figsize=(8,6))
sns.scatterplot(data=neigh_stats, x='avg_price', y='avg_reviews',
hue='cluster', palette="Set2", s=80, alpha=0.8)
plt.title("Neighbourhood Segmentation: Price vs Reviews")
plt.xlabel("Average Price")
plt.ylabel("Average Reviews")
plt.show()
#Cluster individual listings (customer segments)
listings_sub = df[['price','minimum_nights','number_of_reviews']]
listings_sub = listings_sub.dropna()
scaler2 = StandardScaler()
X2 = scaler2.fit_transform(listings_sub)
kmeans2 = KMeans(n_clusters=4, random_state=42, n_init=10)
clusters = kmeans2.fit_predict(X2)
# Reduce to 2D for plotting
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X2)
plt.figure(figsize=(8,6))
sns.scatterplot(x=pca_result[:,0], y=pca_result[:,1], hue=clusters,
palette="tab10", alpha=0.6)
plt.title("Listing Segmentation (PCA 2D)")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.show()
segmentation_clustering(full_data)
High-price, high-demand clusters (prime downtown/tourist areas)
Low-price, moderate-demand clusters (residential zones)
Budget clusters with low demand (fringe/remote areas)
Short-stay budget seekers (private rooms/shared spaces)
Mid-tier family/group travelers (entire homes at moderate prices)
Luxury travelers (premium entire-home stays, longer durations
def investment_strategy(df):
df = df.copy()
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['reviews_per_month'] = pd.to_numeric(df['reviews_per_month'], errors='coerce')
df['availability_365'] = pd.to_numeric(df['availability_365'], errors='coerce')
df = df.dropna(subset=['price', 'reviews_per_month', 'availability_365', 'neighbourhood'])
# Estimated annual revenue (proxy ROI)
df['estimated_annual_revenue'] = df['price'] * df['reviews_per_month'] * df['availability_365']
roi = df.groupby('neighbourhood')['estimated_annual_revenue'].mean().sort_values(ascending=False).head(10)
# Bar chart: Top 10 ROI neighbourhoods
plt.figure(figsize=(10, 5))
roi.plot(kind='bar', color='orange', edgecolor='black')
plt.title("Top 10 Neighbourhoods by ROI Potential")
plt.ylabel("Estimated Annual Revenue ($)")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Hidden gems analysis
neighbourhood_stats = df.groupby('neighbourhood').agg({
'price': 'mean',
'reviews_per_month': 'mean',
'id': 'count',
'estimated_annual_revenue': 'mean'
}).reset_index().rename(columns={'id': 'listing_count'})
plt.figure(figsize=(10, 6))
scatter = plt.scatter(
neighbourhood_stats['price'],
neighbourhood_stats['reviews_per_month'],
s=neighbourhood_stats['listing_count'] * 5,
c=neighbourhood_stats['estimated_annual_revenue'],
cmap='coolwarm',
alpha=0.7,
edgecolors='k'
)
plt.colorbar(scatter, label="Avg ROI ($)")
plt.xlabel("Average Price ($)")
plt.ylabel("Average Reviews per Month (Demand)")
plt.title("Hidden Gem Neighbourhoods (Demand vs Price)")
for _, row in neighbourhood_stats.sort_values('estimated_annual_revenue', ascending=False).head(5).iterrows():
plt.text(row['price'], row['reviews_per_month'], row['neighbourhood'], fontsize=8, weight='bold')
plt.tight_layout()
plt.show()
investment_strategy(full_data)
def revenue_optimization(df):
df = df.copy()
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['reviews_per_month'] = pd.to_numeric(df['reviews_per_month'], errors='coerce')
df['availability_365'] = pd.to_numeric(df['availability_365'], errors='coerce')
df['calculated_host_listings_count'] = pd.to_numeric(
df['calculated_host_listings_count'], errors='coerce'
)
df = df.dropna(
subset=['price', 'neighbourhood', 'room_type',
'availability_365', 'reviews_per_month',
'calculated_host_listings_count']
)
# Features
X = pd.get_dummies(
df[['neighbourhood', 'room_type',
'availability_365', 'reviews_per_month',
'calculated_host_listings_count']],
drop_first=True
)
y = df['price'].clip(upper=1000) # cap extreme outliers
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
model = RandomForestRegressor(
n_estimators=200,
max_depth=15,
random_state=42,
n_jobs=-1
)
model.fit(X_train, y_train)
# Predictions
y_pred = model.predict(X_test)
print("R² Score:", r2_score(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))
# Feature Importance Plot
importances = pd.Series(model.feature_importances_, index=X.columns)
top_features = importances.sort_values(ascending=False).head(15)
plt.figure(figsize=(10, 6))
sns.barplot(x=top_features.values, y=top_features.index, palette="viridis")
plt.title("Top Factors Driving Airbnb Price")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()
# Actual vs Predicted Price Plot
df_test = X_test.copy()
df_test['actual_price'] = y_test
df_test['predicted_price'] = y_pred
plt.figure(figsize=(8, 6))
sns.scatterplot(
x=df_test['actual_price'],
y=df_test['predicted_price'],
alpha=0.5
)
plt.plot([0, 1000], [0, 1000], color='red', linestyle='--') # perfect prediction line
plt.title("Actual vs Predicted Airbnb Prices")
plt.xlabel("Actual Price ($)")
plt.ylabel("Predicted Price ($)")
plt.show()
return model, df_test
model, df_test = revenue_optimization(full_data)
R² Score: 0.7248801639213218 RMSE: 74.35417784694954
/Applications/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'. warnings.warn(
from IPython.display import display
def geo_maps(df):
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df = df.dropna(subset=['latitude', 'longitude', 'price'])
center = [df['latitude'].mean(), df['longitude'].mean()]
# Choropleth + dots
avg_price = df.groupby('neighbourhood')['price'].mean().reset_index()
neighbourhood_geo = "neighbourhoods.geojson"
combined = folium.Map(location=center, zoom_start=12)
folium.Choropleth(
geo_data=neighbourhood_geo,
data=avg_price,
columns=['neighbourhood', 'price'],
key_on='feature.properties.neighbourhood',
fill_color='YlOrRd',
legend_name='Average Price ($)'
).add_to(combined)
for _, row in df.sample(2000).iterrows():
folium.CircleMarker(
location=[row['latitude'], row['longitude']],
radius=2,
color='blue',
fill=True,
fill_opacity=0.5
).add_to(combined)
display(combined)
# Heatmap
heat_data = df[['latitude', 'longitude', 'number_of_reviews']].dropna()
heat_data['number_of_reviews'] = heat_data['number_of_reviews'].clip(upper=100)
hm = folium.Map(location=center, zoom_start=12)
HeatMap(heat_data.values.tolist()).add_to(hm)
display(hm)
geo_maps(full_data)