import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
# Global Style
sns.set_theme(style="whitegrid")
plt.rcParams.update({
"figure.facecolor": "white",
"axes.titlesize": 16,
"axes.titleweight": "bold",
"axes.labelsize": 13,
"axes.labelweight": "bold",
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"axes.edgecolor": "#333333",
"axes.linewidth": 1.2,
"grid.color": "#DDDDDD",
"grid.linestyle": "--",
"grid.linewidth": 0.7
})
#Data Cleaning Function
def data_cleaning(input_file):
df = pd.read_csv(input_file)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.sort_values(by=['Store', 'Date']).reset_index(drop=True)
df['Date'] = df['Date'].dt.strftime("%Y-%m-%d") # keep SQL-friendly format
df['Weekly_Sales'] = df['Weekly_Sales'].round(2)
df['Temperature'] = df['Temperature'].round(0).astype("Int64")
df['Fuel_Price'] = df['Fuel_Price'].round(2)
df['CPI'] = df['CPI'].round(3)
df['Unemployment'] = df['Unemployment'].round(3)
df = df.dropna().reset_index(drop=True)
print("✅ Data cleaned and saved into SQLite")
# Load into SQLite
conn = sqlite3.connect(":memory:") # or "walmart.db" if you want a file
df.to_sql("Walmart_Sales", conn, index=False, if_exists="replace")
return df, conn
df, conn = data_cleaning("Walmart_Sales.csv")
✅ Data cleaned and saved into SQLite
def sql_top_stores(conn):
query = """
SELECT Store, ROUND(AVG(Weekly_Sales), 2) AS Avg_Weekly_Sales
FROM Walmart_Sales
GROUP BY Store
ORDER BY Avg_Weekly_Sales DESC
LIMIT 10;
"""
result = pd.read_sql(query, conn)
plt.figure(figsize=(12,6))
sns.barplot(x="Store", y="Avg_Weekly_Sales", data=result, palette="Greens", edgecolor="black")
plt.title("Top 10 Stores by Average Weekly Sales", pad=15)
plt.ylabel("Avg Weekly Sales ($)")
plt.xlabel("Store")
plt.tight_layout()
plt.show()
return result
The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
print("\n5) Top 10 Performing Stores (SQLite)")
print(sql_top_stores(conn))
5) Top 10 Performing Stores (SQLite)
Store Avg_Weekly_Sales 0 20 2169370.45 1 4 2088557.17 2 14 2075189.47 3 13 1998168.43 4 2 1951954.53 5 10 1905260.18 6 27 1782782.45 7 1 1601845.14 8 6 1588921.57 9 19 1468691.88
def analyze_holiday_sales(data):
holiday_sales = data[data["Holiday_Flag"] == 1]
holiday_avg_sales = holiday_sales.groupby("Date")["Weekly_Sales"].mean().sort_values(ascending=False)
plt.figure(figsize=(12,6))
sns.barplot(
x=holiday_avg_sales.head(10).index,
y=holiday_avg_sales.head(10).values,
palette="Blues_d",
edgecolor="black"
)
plt.title("Top 10 Holiday Weeks by Average Weekly Sales", pad=15)
plt.ylabel("Average Weekly Sales ($)")
plt.xlabel("Date (Holiday Weeks)")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
print("Holidays (especially Thanksgiving and Christmas) significantly boost sales.")
return holiday_avg_sales
print("\n1) Holidays Affecting Weekly Sales")
print(analyze_holiday_sales(df).head(10))
1) Holidays Affecting Weekly Sales
Holidays (especially Thanksgiving and Christmas) significantly boost sales. Date 2012-10-02 1.111320e+06 2010-12-02 1.074148e+06 2012-07-09 1.074001e+06 2011-11-02 1.051915e+06 2011-09-09 1.039183e+06 2010-10-09 1.014098e+06 Name: Weekly_Sales, dtype: float64
def analyze_unemployment(data):
store_unemp = data.groupby("Store")["Unemployment"].mean()
plt.figure(figsize=(14,6))
sns.barplot(
x=store_unemp.index,
y=store_unemp.values,
palette="Oranges_r",
edgecolor="black"
)
plt.title("Average Unemployment Rate by Store", pad=15)
plt.ylabel("Unemployment Rate (%)")
plt.xlabel("Store")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
lowest_store = (store_unemp.idxmin(), store_unemp.min())
highest_store = (store_unemp.idxmax(), store_unemp.max())
print("Geographic and economic factors drive unemployment differences.")
return lowest_store, highest_store
print("\n2) Lowest and Highest Unemployment by Store")
lowest, highest = analyze_unemployment(df)
print("Lowest Unemployment Store:", lowest[0], "Rate:", lowest[1])
print("Highest Unemployment Store:", highest[0], "Rate:", highest[1])
2) Lowest and Highest Unemployment by Store
Geographic and economic factors drive unemployment differences. Lowest Unemployment Store: 23 Rate: 4.824877192982456 Highest Unemployment Store: 12 Rate: 13.16838596491228
def sql_temp_sales(conn):
query = """
SELECT
ROUND(Temperature, 0) AS Temp_Bucket,
ROUND(AVG(Weekly_Sales), 2) AS Avg_Weekly_Sales
FROM Walmart_Sales
GROUP BY Temp_Bucket
ORDER BY Temp_Bucket;
"""
result = pd.read_sql(query, conn)
plt.figure(figsize=(10,6))
sns.scatterplot(x="Temp_Bucket", y="Avg_Weekly_Sales", data=result, color="darkorange", edgecolor="black")
plt.title("Impact of Temperature on Weekly Sales", pad=15)
plt.xlabel("Temperature (°F)")
plt.ylabel("Avg Weekly Sales ($)")
plt.tight_layout()
plt.show()
return result
print("\n7) Temperature vs Sales Impact (SQLite)")
print(sql_temp_sales(conn))
7) Temperature vs Sales Impact (SQLite)
Temp_Bucket Avg_Weekly_Sales 0 -2.0 558027.77 1 6.0 1083071.14 2 10.0 751728.76 3 11.0 719745.24 4 12.0 573545.96 .. ... ... 87 96.0 544599.89 88 97.0 242456.39 89 98.0 247099.06 90 99.0 239198.36 91 100.0 297753.49 [92 rows x 2 columns]
def sql_monthly_sales(conn):
query = """
SELECT
CAST(STRFTIME('%m', Date) AS INT) AS Month,
ROUND(AVG(Weekly_Sales), 2) AS Avg_Weekly_Sales
FROM Walmart_Sales
GROUP BY Month
ORDER BY Month;
"""
result = pd.read_sql(query, conn)
plt.figure(figsize=(10,6))
sns.lineplot(x="Month", y="Avg_Weekly_Sales", data=result, marker="o", color="purple")
plt.title("Average Weekly Sales by Month", pad=15)
plt.xlabel("Month")
plt.ylabel("Avg Weekly Sales ($)")
plt.xticks(range(1,13))
plt.tight_layout()
plt.show()
return result
print("\n6) Monthly Seasonality Trends (SQLite)")
print(sql_monthly_sales(conn))
6) Monthly Seasonality Trends (SQLite)
/Applications/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. with pd.option_context('mode.use_inf_as_na', True): /Applications/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. with pd.option_context('mode.use_inf_as_na', True):
Month Avg_Weekly_Sales 0 1 1008661.31 1 2 1070975.69 2 3 1064207.94 3 4 1062675.56 4 5 1057716.88 5 6 1087900.96 6 7 1037893.22 7 8 1047227.09 8 9 1089198.44 9 10 1095038.94 10 11 1045289.61 11 12 1034510.93
def analyze_cpi_sales_correlation(data):
overall = data["CPI"].corr(data["Weekly_Sales"])
non_holiday = data[data["Holiday_Flag"] == 0]["CPI"].corr(data[data["Holiday_Flag"] == 0]["Weekly_Sales"])
holiday = data[data["Holiday_Flag"] == 1]["CPI"].corr(data[data["Holiday_Flag"] == 1]["Weekly_Sales"])
plt.figure(figsize=(9,6))
sns.scatterplot(
x="CPI", y="Weekly_Sales",
hue="Holiday_Flag", data=data,
palette={0:"steelblue", 1:"crimson"},
alpha=0.7, edgecolor="black"
)
plt.title("CPI vs Weekly Sales\n(Red = Holiday Weeks, Blue = Non-Holiday)", pad=15)
plt.xlabel("CPI")
plt.ylabel("Weekly Sales ($)")
plt.legend(title="Holiday", labels=["Non-Holiday","Holiday"], loc="best")
plt.tight_layout()
plt.show()
print("As CPI increases, sales slightly decrease, but holidays weaken this effect.")
return overall, non_holiday, holiday
print("\n3) CPI vs Weekly Sales Correlation")
overall, non_holiday, holiday = analyze_cpi_sales_correlation(df)
print("Overall:", overall)
print("Non-Holiday Weeks:", non_holiday)
print("Holiday Weeks:", holiday)
3) CPI vs Weekly Sales Correlation
As CPI increases, sales slightly decrease, but holidays weaken this effect. Overall: -0.07778526993418897 Non-Holiday Weeks: -0.07592603753248182 Holiday Weeks: -0.09362087936935959
def analyze_fuel_sales(data):
corr = data["Fuel_Price"].corr(data["Weekly_Sales"])
plt.figure(figsize=(9,6))
sns.regplot(
x="Fuel_Price", y="Weekly_Sales",
data=data, scatter_kws={"alpha":0.5, "color":"seagreen"},
line_kws={"color":"black", "lw":2}
)
plt.title(f"Fuel Price vs Weekly Sales (Correlation = {corr:.2f})", pad=15)
plt.xlabel("Fuel Price ($)")
plt.ylabel("Weekly Sales ($)")
plt.tight_layout()
plt.show()
print("Fuel prices have little impact on Walmart sales — seen as a low-cost retailer.")
return corr
print("\n4) Fuel Price vs Weekly Sales Correlation")
print("Correlation:", analyze_fuel_sales(df))
4) Fuel Price vs Weekly Sales Correlation
Fuel prices have little impact on Walmart sales — seen as a low-cost retailer. Correlation: 0.027823812967352958