Problem Overview¶
Business Context¶
- Electricity systems must continuously balance supply and demand to ensure reliable grid operations.
- Electricity demand varies due to human activity, seasonality, and weather, especially temperature.
- Forecast errors can lead to:
  - Over-forecasting → unnecessary generation and reserve costs
  - Under-forecasting → reliability risks and price volatility
Data Science Formulation¶
- Supervised time-series regression problem
- Inputs: historical demand, calendar features, and temperature-derived variables
- Output: hourly electricity demand (MW)
- Key challenges:
  - Strong temporal dependence and seasonality
  - Non-linear demand response to extreme temperatures
Dataset Overview¶
The dataset combines historical electricity demand with weather information, which are the primary drivers of short-term electricity consumption.
Data Sources¶
- Hourly electricity demand data for PJM East (from Kaggle: https://www.kaggle.com/datasets/robikscube/hourly-energy-consumption)
- Weather data sourced from the Meteostat library
Scope¶
- Hourly granularity
- Over 140,000 observations spanning from January 1, 2002 to August 3, 2018
Core Variables¶
- Electricity demand (MW)
- Average temperature (°C)
- Calendar and time-based features
⬇️ See tables below¶
# Merge demand and averaged temperature
df = df_load.join(df_temp_all["temp_avg"], how="left").asfreq("h")
# Conservative interpolation for short gaps only (up to 6 consecutive hours)
df["temp_avg"] = df["temp_avg"].interpolate(method="time", limit=6)
print("Missing temperature %:", df["temp_avg"].isna().mean() * 100)
df.head()
Missing temperature %: 0.0
| Datetime | PJME_MW | temp_avg |
|---|---|---|
| 2002-01-01 01:00:00 | 30393.0 | -2.033333 |
| 2002-01-01 02:00:00 | 29265.0 | -3.533333 |
| 2002-01-01 03:00:00 | 28357.0 | -3.133333 |
| 2002-01-01 04:00:00 | 27899.0 | -4.066667 |
| 2002-01-01 05:00:00 | 28057.0 | -4.633333 |
To address station-level data gaps and better capture regional weather conditions, hourly temperature observations from multiple representative locations (Philadelphia, Baltimore, and Washington, DC) are combined and averaged into a single regional temperature proxy (temp_avg).
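As a sketch of that averaging step (the station column names and values here are illustrative, not the actual pipeline data):

```python
import pandas as pd

# Hypothetical hourly temperature series for three representative stations
idx = pd.date_range("2002-01-01 01:00", periods=4, freq="h")
stations = pd.DataFrame({
    "temp_PHL": [-2.0, -3.5, -3.0, -4.0],   # Philadelphia
    "temp_BWI": [-2.1, -3.6, -3.2, -4.1],   # Baltimore
    "temp_DCA": [-2.0, -3.5, -3.2, -4.1],   # Washington DC
}, index=idx)

# Row-wise mean gives a single regional proxy; with skipna=True, a gap at
# one station is covered by the remaining stations for that hour
stations["temp_avg"] = stations.mean(axis=1, skipna=True)
print(stations["temp_avg"].round(3).tolist())
```

This is why short residual gaps remain only when all stations are missing simultaneously, which the time-based interpolation above then fills.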
df.describe()
| Statistic | PJME_MW | temp_avg |
|---|---|---|
| count | 145392.000000 | 145392.000000 |
| mean | 32078.927654 | 14.064849 |
| std | 6464.283311 | 10.084597 |
| min | 14544.000000 | -16.100000 |
| 25% | 27571.000000 | 5.933333 |
| 50% | 31420.000000 | 14.600000 |
| 75% | 35647.000000 | 22.566667 |
| max | 62009.000000 | 39.466667 |
The dataset contains 145,392 hourly observations with no missing values after preprocessing. Electricity demand averages approximately 32 GW with substantial variability, reflecting strong daily and seasonal dynamics. Observed demand ranges from roughly 14.5 GW to 62 GW, highlighting the importance of accurately forecasting extreme conditions. Temperature exhibits wide seasonal variation, ranging from severe winter cold to extreme summer heat, reinforcing the role of weather as a key driver of electricity demand.
Exploratory Data Analysis¶
Demand Patterns¶
Observations¶
- The hourly demand pattern shows a clear daily cycle, with the lowest electricity use in the early morning, a sharp increase during the morning as economic activity begins, and a peak in the early evening driven mainly by residential consumption. Demand then declines late at night.
- Electricity demand in PJM East is highest during the summer months (June–August), driven mainly by cooling loads, lowest during the spring and fall shoulder months (April–May and September–October), and moderately elevated in winter (December–February), reflecting temperature-driven usage patterns.
⬇️ See graphs below¶
import matplotlib.pyplot as plt

tmp = df.copy()
tmp["hour"] = tmp.index.hour
tmp["dayofweek"] = tmp.index.dayofweek
tmp.groupby("hour")["PJME_MW"].mean().plot(figsize=(10,3))
plt.xticks(range(24))  # label hours 0–23 as they appear in the data
plt.title("Average Demand by Hour of Day")
plt.xlabel("Hour of Day")
plt.ylabel("Average Load (MW)")
plt.grid(True, alpha=0.3)
plt.show()
# Average demand per month
tmp["month"] = tmp.index.month
month_avg = tmp.groupby("month")["PJME_MW"].mean()
plt.figure(figsize=(10,3))
plt.plot(month_avg.index, month_avg.values)
plt.xticks(
ticks=range(1, 13),
labels=["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"])
plt.title("Average Demand by Month")
plt.xlabel("Month")
plt.ylabel("Average Load (MW)")
plt.grid(True, alpha=0.3)
plt.show()
import matplotlib.pyplot as plt
df["PJME_MW"].plot(figsize=(14,4))
plt.title("Hourly Electricity Demand (MW) Over Time")
plt.ylabel("MW")
plt.xlabel("Datetime")
plt.show()
Observation: Electricity demand exhibits strong daily, weekly, and seasonal cycles, confirming suitability for time-series forecasting. A sharp decline in PJM East electricity demand is observed on October 29–30, 2012, corresponding to Hurricane Sandy, which caused widespread outages and temporary load loss across the region.
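Event-driven drops like this can be surfaced programmatically. A minimal sketch on synthetic daily data (the 20% threshold and 7-day window are illustrative choices, not part of the original analysis): flag days whose mean demand falls far below the trailing rolling mean.

```python
import pandas as pd

# Synthetic daily demand with one storm-like collapse
idx = pd.date_range("2012-10-01", periods=30, freq="D")
daily = pd.Series(32000.0, index=idx)
daily.loc["2012-10-29":"2012-10-30"] = 22000.0  # outage-scale drop

# Flag days more than 20% below the trailing 7-day mean
trailing = daily.rolling(7, min_periods=1).mean().shift(1)
flagged = daily[daily < 0.8 * trailing]
print(flagged.index.strftime("%Y-%m-%d").tolist())
```

The `shift(1)` keeps the comparison strictly backward-looking, so the anomalous day does not dilute its own reference window.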
Exploratory Data Analysis¶
Temporal & Weather Effects¶
Observations¶
- The Electricity Demand vs Temperature plot shows a U-shaped relationship between electricity demand and temperature. Demand is lowest at mild temperatures and increases in cold conditions due to heating and in hot conditions due to air-conditioning use. The spread of points reflects the influence of other factors like time of day and economic activity, confirming temperature as a key driver of electricity demand.
- Demand shows strong autocorrelation at daily (24h) and weekly (168h) lags
- Electricity demand increases during extreme temperatures
- Indicates strong temporal dependence and non-linear weather effects
⬇️ See graphs below¶
plt.figure(figsize=(6,5))
plt.scatter(df["temp_avg"], df["PJME_MW"], alpha=0.15)
plt.xlabel("Averaged Temperature (°C)")
plt.ylabel("Electricity Demand (MW)")
plt.title("Electricity Demand vs Temperature")
plt.show()
df["temp_avg"].plot(figsize=(14,4))
plt.title("Hourly Averaged Temperature (°C) Over Time")
plt.ylabel("°C")
plt.xlabel("Datetime")
plt.show()
Observation: The averaged temperature series displays clear annual seasonality and avoids flat-line artifacts seen in single-station data.
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(df["PJME_MW"].dropna(), lags=168)
plt.title("Autocorrelation of Electricity Demand (up to 168 hours or 7 days)")
plt.show()
Feature Engineering & Dimensionality Reduction¶
Features were engineered to capture the key drivers of electricity demand while reducing redundancy among highly correlated variables.
Engineered Features¶
- Temporal: lagged demand (t−1, t−24, t−168), rolling means and variability
- Weather: average temperature, Heating Degree Days (HDD), Cooling Degree Days (CDD)
- Calendar: hour of day, day of week, month, weekend indicator
Dimensionality Reduction¶
- Features were standardized and transformed using Principal Component Analysis (PCA)
- PCA retained components explaining ~90% of total variance
- Helps mitigate multicollinearity and improve model stability
Dimensionality reduction was applied as a preprocessing step for selected models.
⬇️ See tables and graphs below¶
df["hour"] = df.index.hour
df["dayofweek"] = df.index.dayofweek
df["month"] = df.index.month
df["is_weekend"] = df["dayofweek"].isin([5,6]).astype(int)
# Display the first 5 rows with new time-based features
df[["hour", "dayofweek", "month", "is_weekend"]].head()
| Datetime | hour | dayofweek | month | is_weekend |
|---|---|---|---|---|
| 2002-01-01 01:00:00 | 1 | 1 | 1 | 0 |
| 2002-01-01 02:00:00 | 2 | 1 | 1 | 0 |
| 2002-01-01 03:00:00 | 3 | 1 | 1 | 0 |
| 2002-01-01 04:00:00 | 4 | 1 | 1 | 0 |
| 2002-01-01 05:00:00 | 5 | 1 | 1 | 0 |
df["lag_1"] = df["PJME_MW"].shift(1)
df["lag_24"] = df["PJME_MW"].shift(24)
df["lag_168"] = df["PJME_MW"].shift(168)
df["roll_mean_24"] = df["PJME_MW"].rolling(24).mean()
df["roll_std_24"] = df["PJME_MW"].rolling(24).std()
df["roll_mean_168"] = df["PJME_MW"].rolling(168).mean()
df["roll_std_168"] = df["PJME_MW"].rolling(168).std()
df[["PJME_MW","lag_1", "lag_24", "lag_168","roll_mean_24", "roll_std_24","roll_mean_168", "roll_std_168"]].head(10)
| Datetime | PJME_MW | lag_1 | lag_24 | lag_168 | roll_mean_24 | roll_std_24 | roll_mean_168 | roll_std_168 |
|---|---|---|---|---|---|---|---|---|
| 2002-01-01 01:00:00 | 30393.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2002-01-01 02:00:00 | 29265.0 | 30393.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2002-01-01 03:00:00 | 28357.0 | 29265.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2002-01-01 04:00:00 | 27899.0 | 28357.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2002-01-01 05:00:00 | 28057.0 | 27899.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2002-01-01 06:00:00 | 28654.0 | 28057.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2002-01-01 07:00:00 | 29308.0 | 28654.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2002-01-01 08:00:00 | 29595.0 | 29308.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2002-01-01 09:00:00 | 29943.0 | 29595.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2002-01-01 10:00:00 | 30692.0 | 29943.0 | NaN | NaN | NaN | NaN | NaN | NaN |
base_temp = 18.0 # °C
df["HDD"] = (base_temp - df["temp_avg"]).clip(lower=0)
df["CDD"] = (df["temp_avg"] - base_temp).clip(lower=0)
df["temp_lag_24"] = df["temp_avg"].shift(24)
df["temp_roll_mean_24"] = df["temp_avg"].rolling(24).mean()
df[["temp_avg","HDD", "CDD","temp_lag_24","temp_roll_mean_24"]].head(10)
| Datetime | temp_avg | HDD | CDD | temp_lag_24 | temp_roll_mean_24 |
|---|---|---|---|---|---|
| 2002-01-01 01:00:00 | -2.033333 | 20.033333 | 0.0 | NaN | NaN |
| 2002-01-01 02:00:00 | -3.533333 | 21.533333 | 0.0 | NaN | NaN |
| 2002-01-01 03:00:00 | -3.133333 | 21.133333 | 0.0 | NaN | NaN |
| 2002-01-01 04:00:00 | -4.066667 | 22.066667 | 0.0 | NaN | NaN |
| 2002-01-01 05:00:00 | -4.633333 | 22.633333 | 0.0 | NaN | NaN |
| 2002-01-01 06:00:00 | -5.200000 | 23.200000 | 0.0 | NaN | NaN |
| 2002-01-01 07:00:00 | -5.766667 | 23.766667 | 0.0 | NaN | NaN |
| 2002-01-01 08:00:00 | -6.500000 | 24.500000 | 0.0 | NaN | NaN |
| 2002-01-01 09:00:00 | -6.666667 | 24.666667 | 0.0 | NaN | NaN |
| 2002-01-01 10:00:00 | -6.666667 | 24.666667 | 0.0 | NaN | NaN |
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
feature_cols = [
"temp_avg","HDD","CDD",
"hour","dayofweek","month","is_weekend",
"lag_1","lag_24","lag_168",
"roll_mean_24","roll_std_24","roll_mean_168","roll_std_168",
"temp_lag_24","temp_roll_mean_24"
]
df_feat = df[feature_cols + ["PJME_MW"]].dropna()
X = df_feat[feature_cols]
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=0.90)
X_pca = pca.fit_transform(X_scaled)
print("PCA components retained:", pca.n_components_)
print("Explained variance retained:", pca.explained_variance_ratio_.sum())
PCA components retained: 6
Explained variance retained: 0.9232996165553495
pca_full = PCA()
pca_full.fit(X_scaled)
plt.figure(figsize=(7,3))
plt.plot(
range(1, len(pca_full.explained_variance_ratio_) + 1),
pca_full.explained_variance_ratio_.cumsum(),
marker="o")
plt.axhline(y=0.9, linestyle="--")
plt.xlabel("Number of Principal Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA Cumulative Explained Variance")
plt.grid(True, alpha=0.3)
plt.show()
Model-Ready Dataset & Train–Test Strategy¶
Final Model-Ready Dataset¶
- Features: engineered demand, weather, and calendar variables
- Target: hourly electricity demand (MW)
- Dataset filtered to retain only complete observations
Validation Approach¶
- Time-series aware split (80% training, 20% testing)
- No random shuffling to prevent data leakage
- Test set represents unseen future demand
⬇️ See tables and graphs below¶
df_model = df[feature_cols + ["PJME_MW"]].dropna()
print("Final dataset shape:", df_model.shape)
df_model.head()
Final dataset shape: (145224, 17)
| Datetime | temp_avg | HDD | CDD | hour | dayofweek | month | is_weekend | lag_1 | lag_24 | lag_168 | roll_mean_24 | roll_std_24 | roll_mean_168 | roll_std_168 | temp_lag_24 | temp_roll_mean_24 | PJME_MW |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2002-01-08 01:00:00 | 1.866667 | 16.133333 | 0.0 | 1 | 1 | 1 | 0 | 31187.0 | 26862.0 | 30393.0 | 33560.208333 | 4425.965952 | 32513.869048 | 3861.770954 | 1.466667 | 1.952778 | 29445.0 |
| 2002-01-08 02:00:00 | 1.300000 | 16.700000 | 0.0 | 2 | 1 | 1 | 0 | 29445.0 | 25976.0 | 29265.0 | 33672.458333 | 4256.159403 | 32510.327381 | 3865.039821 | 1.666667 | 1.937500 | 28670.0 |
| 2002-01-08 03:00:00 | 0.566667 | 17.433333 | 0.0 | 3 | 1 | 1 | 0 | 28670.0 | 25641.0 | 28357.0 | 33786.375000 | 4064.104959 | 32510.434524 | 3864.924245 | 1.666667 | 1.891667 | 28375.0 |
| 2002-01-08 04:00:00 | -0.600000 | 18.600000 | 0.0 | 4 | 1 | 1 | 0 | 28375.0 | 25666.0 | 27899.0 | 33906.208333 | 3851.076461 | 32514.261905 | 3860.646270 | 1.333333 | 1.811111 | 28542.0 |
| 2002-01-08 05:00:00 | -1.300000 | 19.300000 | 0.0 | 5 | 1 | 1 | 0 | 28542.0 | 26328.0 | 28057.0 | 34028.416667 | 3640.941409 | 32521.428571 | 3853.433314 | 1.500000 | 1.694444 | 29261.0 |
# Define features and target
target = "PJME_MW"
features = feature_cols
df_model = df[features + [target]].dropna()
# Train-test split (80% train, 20% test)
split_idx = int(len(df_model) * 0.8)
train = df_model.iloc[:split_idx]
test = df_model.iloc[split_idx:]
X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]
print("Train period:", train.index.min(), "to", train.index.max())
print("Test period:", test.index.min(), "to", test.index.max())
Train period: 2002-01-08 01:00:00 to 2015-04-10 19:00:00
Test period: 2015-04-10 20:00:00 to 2018-08-03 00:00:00
plt.figure(figsize=(12,4))
# Plot full series
plt.plot(df_model.index, df_model[target], color="lightgray", label="Full Dataset")
# Highlight training set
plt.plot(train.index,train[target],label="Training Set",linewidth=1.5)
# Highlight test set
plt.plot(test.index,test[target],label="Test Set",linewidth=1.5)
plt.axvline(test.index.min(), linestyle="--", alpha=0.7, label="Train/Test Split")
plt.title("Train–Test Split for Electricity Demand Forecasting")
plt.xlabel("Time")
plt.ylabel("Electricity Demand (MW)")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Models Evaluated¶
Multiple model classes were evaluated to balance simplicity, performance, and interpretability.
- Baseline: Lag-24 persistence model
- Linear Regression
- Random Forest Regressor
- Gradient Boosting Regressor
Success Metrics¶
- RMSE (Root Mean Squared Error): prioritizes accuracy during high-impact peak demand periods
- MAE (Mean Absolute Error): measures typical forecast error
- MAPE (Mean Absolute Percentage Error): expresses error as a percentage for business interpretation
Models were compared using the same test period to ensure fair and consistent evaluation.
⬇️ See results per model below¶
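The `evaluate` helper called in each model section is not defined in this excerpt; a minimal sketch consistent with the metrics described above (RMSE and MAE in MW, MAPE in percent) could look like this:

```python
import numpy as np

def evaluate(y_true, y_pred):
    """Return (RMSE, MAE, MAPE %) for a forecast against actuals."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    err = y_pred - y_true
    rmse = np.sqrt(np.mean(err ** 2))            # penalizes large peak-period errors
    mae = np.mean(np.abs(err))                   # typical error magnitude
    mape = np.mean(np.abs(err / y_true)) * 100   # percentage error for reporting
    return rmse, mae, mape
```

Because demand never approaches zero in this dataset (minimum ≈ 14.5 GW), the division inside MAPE is numerically safe here.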
Baseline Model (Lag-24 Persistence)¶
y_pred_baseline = X_test["lag_24"]
rmse_b, mae_b, mape_b = evaluate(y_test, y_pred_baseline)
print("Baseline Model: Lag-24 Persistence")
print(f"RMSE: {rmse_b:.2f} MW")
print(f"MAE: {mae_b:.2f} MW")
print(f"MAPE: {mape_b:.2f} %")
Baseline Model: Lag-24 Persistence
RMSE: 3009.01 MW
MAE: 2184.24 MW
MAPE: 6.93 %
first_month = y_test.index.to_period("M")[0].strftime("%Y-%m")
y_test_m = y_test.loc[first_month]
y_pred_m = y_pred_baseline.loc[y_test_m.index]
plt.figure(figsize=(12,4))
plt.plot(y_test_m.index, y_test_m, label="Actual Demand", linewidth=1.5)
plt.plot(y_test_m.index, y_pred_m, label="Baseline (Lag-24)", linestyle="--", linewidth=1.5)
plt.title(f"Baseline Model: Actual vs Prediction ({first_month})")
plt.xlabel("Time")
plt.ylabel("Electricity Demand (MW)")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Linear Regression Model¶
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
rmse_lr, mae_lr, mape_lr = evaluate(y_test, y_pred_lr)
print("Linear Regression")
print(f"RMSE: {rmse_lr:.2f}")
print(f"MAE: {mae_lr:.2f}")
print(f"MAPE: {mape_lr:.2f}%")
Linear Regression
RMSE: 1156.81
MAE: 915.77
MAPE: 2.94%
# Generate predictions
y_pred_lr = lr.predict(X_test)
y_pred_lr = pd.Series(y_pred_lr, index=y_test.index)
# First month in test set
first_month = y_test.index.to_period("M")[0].strftime("%Y-%m")
y_test_m = y_test.loc[first_month]
y_pred_m = y_pred_lr.loc[y_test_m.index]
plt.figure(figsize=(12,4))
plt.plot(y_test_m.index, y_test_m, label="Actual Demand", linewidth=1.5)
plt.plot(y_test_m.index, y_pred_m, label="Linear Regression Prediction", linestyle="--", linewidth=1.5)
plt.title(f"Linear Regression: Actual vs Prediction ({first_month})")
plt.xlabel("Time")
plt.ylabel("Electricity Demand (MW)")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Random Forest Regressor¶
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(
n_estimators=200,
max_depth=15,
random_state=42,
n_jobs=-1
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rmse_rf, mae_rf, mape_rf = evaluate(y_test, y_pred_rf)
print("Random Forest")
print(f"RMSE: {rmse_rf:.2f}")
print(f"MAE: {mae_rf:.2f}")
print(f"MAPE: {mape_rf:.2f}%")
Random Forest
RMSE: 434.16
MAE: 314.17
MAPE: 0.99%
y_pred_rf = rf.predict(X_test)
y_pred_rf = pd.Series(y_pred_rf, index=y_test.index)
first_month = y_test.index.to_period("M")[0].strftime("%Y-%m")
y_test_m = y_test.loc[first_month]
y_pred_m = y_pred_rf.loc[y_test_m.index]
plt.figure(figsize=(12,4))
plt.plot(y_test_m.index,y_test_m,label="Actual Demand",linewidth=1.5)
plt.plot(y_test_m.index,y_pred_m,label="Random Forest Prediction",linestyle="--",linewidth=1.5)
plt.title(f"Random Forest: Actual vs Prediction ({first_month})")
plt.xlabel("Time")
plt.ylabel("Electricity Demand (MW)")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Gradient Boosting¶
from sklearn.ensemble import HistGradientBoostingRegressor
gbr = HistGradientBoostingRegressor(
max_depth=8,
learning_rate=0.05,
max_iter=300,
random_state=42
)
gbr.fit(X_train, y_train)
y_pred_gbr = gbr.predict(X_test)
rmse_gbr, mae_gbr, mape_gbr = evaluate(y_test, y_pred_gbr)
print("Gradient Boosting")
print(f"RMSE: {rmse_gbr:.2f}")
print(f"MAE: {mae_gbr:.2f}")
print(f"MAPE: {mape_gbr:.2f}%")
Gradient Boosting
RMSE: 426.41
MAE: 320.39
MAPE: 1.02%
y_pred_gbr = gbr.predict(X_test)
y_pred_gbr = pd.Series(y_pred_gbr, index=y_test.index)
first_month = y_test.index.to_period("M")[0].strftime("%Y-%m")
y_test_m = y_test.loc[first_month]
y_pred_m = y_pred_gbr.loc[y_test_m.index]
plt.figure(figsize=(12,4))
plt.plot(y_test_m.index,y_test_m,label="Actual Demand",linewidth=1.5)
plt.plot(y_test_m.index,y_pred_m,label="Gradient Boosting Prediction",linestyle="--",linewidth=1.5)
plt.title(f"Gradient Boosting: Actual vs Prediction ({first_month})")
plt.xlabel("Time")
plt.ylabel("Electricity Demand (MW)")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Model Performance Comparison¶
Key Results¶
- Machine learning models significantly outperform the baseline
- Linear regression improves accuracy but remains limited
- Tree-based models achieve the lowest forecast errors
- Gradient Boosting performs best overall
Forecast error reduced from approximately 7% to around 1%.
The lag-24 baseline performs poorly, showing that persistence alone is insufficient for accurate forecasting. Linear regression improves accuracy, confirming the value of weather and calendar features.
Random Forest and Gradient Boosting both perform well, but Gradient Boosting is selected due to its slightly lower RMSE and more consistent handling of large errors during peak demand. Overall, Gradient Boosting offers the best balance of accuracy and robustness for this task.
⬇️ See tables and graphs below¶
import numpy as np
import pandas as pd
results = pd.DataFrame({
"Model": [
"Baseline (Lag-24)",
"Linear Regression",
"Random Forest",
"Gradient Boosting"
],
"RMSE": [rmse_b, rmse_lr, rmse_rf, rmse_gbr],
"MAE": [mae_b, mae_lr, mae_rf, mae_gbr],
"MAPE (%)": [mape_b, mape_lr, mape_rf, mape_gbr]
})
results
| | Model | RMSE | MAE | MAPE (%) |
|---|---|---|---|---|
| 0 | Baseline (Lag-24) | 3009.005888 | 2184.238217 | 6.931294 |
| 1 | Linear Regression | 1156.808528 | 915.766818 | 2.938434 |
| 2 | Random Forest | 434.163865 | 314.169763 | 0.992615 |
| 3 | Gradient Boosting | 426.411469 | 320.394633 | 1.020462 |
models = results["Model"]
x = np.arange(len(models))
width = 0.35
plt.figure(figsize=(7,3))
plt.bar(x - width/2, results["RMSE"], width, label="RMSE")
plt.bar(x + width/2, results["MAE"], width, label="MAE")
plt.xticks(x, models, rotation=15)
plt.ylabel("Error (MW)")
plt.title("Model Comparison: RMSE and MAE")
plt.legend()
plt.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.show()
Model Explainability¶
Interpretation¶
- Short-term lagged demand is the dominant predictor
- Weather features provide secondary but meaningful influence
- Calendar features add structural context
For tree-based models such as Random Forest and Gradient Boosting, feature importance provides insight into which predictors contribute most to demand forecasts.
The correlation heatmap confirms strong temporal dependence in electricity demand while highlighting that weather and calendar variables contribute primarily through non-linear interactions rather than direct linear relationships. This supports the choice of tree-based models, which are well-suited to capturing these complex dependencies.
Model behavior aligns with known electricity demand dynamics.
⬇️ See graphs below¶
from sklearn.inspection import permutation_importance
perm = permutation_importance(gbr,X_test,y_test,n_repeats=10,random_state=42,n_jobs=-1)
feat_importance = pd.Series(perm.importances_mean,index=feature_cols).sort_values(ascending=False)
plt.figure(figsize=(8,4))
feat_importance.head(10).plot(kind="bar")
plt.yscale("log")
plt.title("Permutation Importance (Log Scale)")
plt.ylabel("Mean Importance (log)")
plt.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.show()
import seaborn as sns
# Get top 10 features from permutation importance
top10_features = feat_importance.head(10).index.tolist()
# Include target variable for correlation
heatmap_cols = ["PJME_MW"] + top10_features
corr_matrix = df_model[heatmap_cols].corr()
plt.figure(figsize=(8,4))
sns.heatmap(corr_matrix,annot=True,fmt=".2f",cmap="coolwarm",center=0,linewidths=0.5)
plt.title("Correlation Heatmap: PJME Demand vs Top 10 Important Features")
plt.tight_layout()
plt.show()
Bias & Error Diagnostics¶
Findings¶
- Forecast errors remain centered around zero
- No large systematic hourly or seasonal bias detected
- Slight under-forecasting during summer peak periods
Hourly error analysis indicates minor time-of-day biases, with slight over-forecasting in late morning and under-forecasting during afternoon peak periods. However, average errors remain relatively small and centered around zero, demonstrating consistent model performance across all hours.
Seasonal error analysis shows no significant systematic bias, with mean errors remaining small across all seasons. Slight under-forecasting in summer reflects the challenge of capturing temperature-driven peak demand, while minor over-forecasting in other seasons remains operationally acceptable.
⬇️ See graphs below¶
# Create evaluation dataframe
df_eval = pd.DataFrame({"actual": y_test,"predicted": y_pred_gbr})
df_eval["error"] = df_eval["predicted"] - df_eval["actual"]
df_eval["hour"] = df_eval.index.hour
df_eval["season"] = df_eval.index.month % 12 // 3 + 1
hourly_error = df_eval.groupby("hour")["error"].mean()
plt.figure(figsize=(7,3))
hourly_error.plot()
plt.axhline(0, color="black", linewidth=1)
plt.title("Mean Forecast Error by Hour of Day")
plt.xlabel("Hour of Day")
plt.ylabel("Mean Error (MW)")
# Force all hours (0–23)
plt.xticks(range(24), range(24))
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Map season numbers to labels
season_map = {1: "Winter",2: "Spring",3: "Summer",4: "Fall"}
seasonal_error = (df_eval.groupby("season")["error"].mean().rename(index=season_map))
plt.figure(figsize=(6,4))
seasonal_error.plot(kind="bar")
plt.axhline(0, color="black", linewidth=1)
plt.title("Mean Forecast Error by Season")
plt.ylabel("Mean Error (MW)")
plt.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.show()
Limitations & Conclusion¶
Key Limitations¶
- Extreme events (e.g., heatwaves, storms, outages) are rare and difficult to fully capture from historical data
- Long-term behavioral and structural changes may introduce concept drift over time
- Regional averaging of weather data may not reflect localized demand variations
Conclusion¶
- Gradient Boosting provides the best overall forecasting performance
- The model effectively captures temporal patterns and temperature-driven demand
- Explainability and bias analysis confirm stable and reliable behavior
- The approach is suitable as a decision-support tool for day-ahead operational planning
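For day-ahead use, lag-1 and rolling features are not observable for future hours, so forecasts must be produced recursively, feeding each prediction back as the next hour's lag. A minimal sketch of that loop, with a deliberately simplified feature set and a stand-in persistence model (the real pipeline would use the fitted Gradient Boosting model and the full `feature_cols`):

```python
import pandas as pd

def forecast_day_ahead(history, model, horizon=24):
    """Recursively forecast the next `horizon` hours.

    history: pd.Series of hourly demand up to 'now'.
    model: object with .predict(feats_dict) -> float (stand-in interface).
    """
    demand = history.copy()
    preds = []
    for _ in range(horizon):
        t = demand.index[-1] + pd.Timedelta(hours=1)
        feats = {
            "lag_1": demand.iloc[-1],    # fed back from the prior prediction
            "lag_24": demand.iloc[-24],
            "hour": t.hour,
        }
        y_hat = model.predict(feats)
        demand.loc[t] = y_hat            # append so later steps see this lag
        preds.append(y_hat)
    return pd.Series(preds, index=demand.index[-horizon:])

# Stand-in "model": yesterday's value (lag-24 persistence)
class Persistence:
    def predict(self, feats):
        return feats["lag_24"]

hist = pd.Series(
    [30000.0 + 100 * h for h in range(48)],
    index=pd.date_range("2018-08-01", periods=48, freq="h"),
)
fc = forecast_day_ahead(hist, Persistence())
```

Recursive forecasting compounds errors over the horizon, which is one reason the 24-hour horizon of day-ahead planning is a natural operational scope for this model.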