Regression¶
Import Libraries¶
InĀ [1]:
Copied!
%%capture
!pip install ucimlrepo
%%capture
!pip install ucimlrepo
InĀ [2]:
Copied!
import gdown
# Replace with your Google Drive shareable link
url = 'https://drive.google.com/file/d/12vVwZp09AQ8j-QgwgHtyecIEde6bHaiG/view?usp=sharing'
# Convert to the direct download link
file_id = url.split('/d/')[1].split('/')[0]
direct_url = f'https://drive.google.com/uc?id={file_id}'
# Download
gdown.download(direct_url, 'homoscedasticity.jpg', quiet=False)
import gdown
# Replace with your Google Drive shareable link
url = 'https://drive.google.com/file/d/12vVwZp09AQ8j-QgwgHtyecIEde6bHaiG/view?usp=sharing'
# Convert to the direct download link
file_id = url.split('/d/')[1].split('/')[0]
direct_url = f'https://drive.google.com/uc?id={file_id}'
# Download
gdown.download(direct_url, 'homoscedasticity.jpg', quiet=False)
Downloading... From: https://drive.google.com/uc?id=12vVwZp09AQ8j-QgwgHtyecIEde6bHaiG To: /content/homoscedasticity.jpg 100%|āāāāāāāāāā| 23.7k/23.7k [00:00<00:00, 27.3MB/s]
Out[2]:
'homoscedasticity.jpg'
InĀ [3]:
Copied!
import copy
import numpy as np
import pandas as pd
from PIL import Image
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import tensorflow as tf
%matplotlib inline
plt.style.use('Solarize_Light2')
import copy
import numpy as np
import pandas as pd
from PIL import Image
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import tensorflow as tf
%matplotlib inline
plt.style.use('Solarize_Light2')
InĀ [4]:
Copied!
from ucimlrepo import fetch_ucirepo
# fetch dataset
seoul_bike_sharing_demand = fetch_ucirepo(id=560)
# data (as pandas dataframes)
X = seoul_bike_sharing_demand.data.features
y = seoul_bike_sharing_demand.data.targets
from ucimlrepo import fetch_ucirepo
# fetch dataset
seoul_bike_sharing_demand = fetch_ucirepo(id=560)
# data (as pandas dataframes)
X = seoul_bike_sharing_demand.data.features
y = seoul_bike_sharing_demand.data.targets
InĀ [5]:
Copied!
X.head(2)
X.head(2)
Out[5]:
Date | Rented Bike Count | Hour | Temperature | Humidity | Wind speed | Visibility | Dew point temperature | Solar Radiation | Rainfall | Snowfall | Seasons | Holiday | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1/12/2017 | 254 | 0 | -5.2 | 37 | 2.2 | 2000 | -17.6 | 0.0 | 0.0 | 0.0 | Winter | No Holiday |
1 | 1/12/2017 | 204 | 1 | -5.5 | 38 | 0.8 | 2000 | -17.6 | 0.0 | 0.0 | 0.0 | Winter | No Holiday |
InĀ [6]:
Copied!
y.head(2)
y.head(2)
Out[6]:
Functioning Day | |
---|---|
0 | Yes |
1 | Yes |
Processing Dataset¶
InĀ [7]:
Copied!
df = X.drop(["Date", "Holiday", "Seasons"], axis=1)
df.head(2)
df = X.drop(["Date", "Holiday", "Seasons"], axis=1)
df.head(2)
Out[7]:
Rented Bike Count | Hour | Temperature | Humidity | Wind speed | Visibility | Dew point temperature | Solar Radiation | Rainfall | Snowfall | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 254 | 0 | -5.2 | 37 | 2.2 | 2000 | -17.6 | 0.0 | 0.0 | 0.0 |
1 | 204 | 1 | -5.5 | 38 | 0.8 | 2000 | -17.6 | 0.0 | 0.0 | 0.0 |
InĀ [8]:
Copied!
# convert y target to boolboolean
df["Functioning Day"] = (y == "Yes").astype(np.int8)
df.head(2)
# convert y target to boolboolean
df["Functioning Day"] = (y == "Yes").astype(np.int8)
df.head(2)
Out[8]:
Rented Bike Count | Hour | Temperature | Humidity | Wind speed | Visibility | Dew point temperature | Solar Radiation | Rainfall | Snowfall | Functioning Day | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 254 | 0 | -5.2 | 37 | 2.2 | 2000 | -17.6 | 0.0 | 0.0 | 0.0 | 1 |
1 | 204 | 1 | -5.5 | 38 | 0.8 | 2000 | -17.6 | 0.0 | 0.0 | 0.0 | 1 |
InĀ [9]:
Copied!
df_noon = df[df["Hour"].isin(range(10, 19))]
df_noon = df_noon.drop(["Hour"], axis=1)
# df_noon = df.drop(["Hour"], axis=1)
df_noon = df[df["Hour"].isin(range(10, 19))]
df_noon = df_noon.drop(["Hour"], axis=1)
# df_noon = df.drop(["Hour"], axis=1)
Visualization¶
InĀ [10]:
Copied!
def plots(data):
for label in data.columns[1:]:
plt.scatter(data[label], data["Rented Bike Count"])
plt.title(label)
plt.xlabel(label)
plt.ylabel("Rented Bike Count")
plt.show()
def plots(data):
for label in data.columns[1:]:
plt.scatter(data[label], data["Rented Bike Count"])
plt.title(label)
plt.xlabel(label)
plt.ylabel("Rented Bike Count")
plt.show()
InĀ [11]:
Copied!
plots(df_noon)
plots(df_noon)
Get rid of columns that not satisfy Linear Regression¶
InĀ [12]:
Copied!
df_clean = df_noon.drop(["Functioning Day", "Wind speed", "Visibility", "Rainfall", "Snowfall"], axis=1)
df_clean.head(2)
df_clean = df_noon.drop(["Functioning Day", "Wind speed", "Visibility", "Rainfall", "Snowfall"], axis=1)
df_clean.head(2)
Out[12]:
Rented Bike Count | Temperature | Humidity | Dew point temperature | Solar Radiation | |
---|---|---|---|---|---|
10 | 339 | -3.5 | 24 | -21.2 | 0.65 |
11 | 360 | -0.5 | 21 | -20.2 | 0.94 |
InĀ [13]:
Copied!
plots(df_clean)
plots(df_clean)
Train and Test¶
InĀ [14]:
Copied!
# MulMultiple Reg
X_train, X_test, y_train, y_test = train_test_split(df_clean.drop(["Rented Bike Count"], axis=1), df_clean["Rented Bike Count"], test_size=0.33, random_state=42)
# Simple Reg
X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(df_clean["Temperature"], df_clean["Rented Bike Count"], test_size=0.33, random_state=42)
# MulMultiple Reg
X_train, X_test, y_train, y_test = train_test_split(df_clean.drop(["Rented Bike Count"], axis=1), df_clean["Rented Bike Count"], test_size=0.33, random_state=42)
# Simple Reg
X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(df_clean["Temperature"], df_clean["Rented Bike Count"], test_size=0.33, random_state=42)
Normalization¶
InĀ [15]:
Copied!
def normalization(train_data, test_data, all_data):
# Create a scaler object
scaler = preprocessing.StandardScaler()
# Fit the scaler to training data
scaler.fit(train_data)
# Transform both training and testing data
train_norm = scaler.transform(train_data)
test_norm = scaler.transform(test_data)
return train_norm, test_norm
def normalization(train_data, test_data, all_data):
# Create a scaler object
scaler = preprocessing.StandardScaler()
# Fit the scaler to training data
scaler.fit(train_data)
# Transform both training and testing data
train_norm = scaler.transform(train_data)
test_norm = scaler.transform(test_data)
return train_norm, test_norm
InĀ [16]:
Copied!
X_train, X_test = normalization(X_train, X_test, df_clean.drop(["Rented Bike Count"], axis=1))
# X_train_temp, X_test_temp = normalization(X_train_temp.values.reshape(-1, 1), X_test_temp.values.reshape(-1, 1))
X_train, X_test = normalization(X_train, X_test, df_clean.drop(["Rented Bike Count"], axis=1))
# X_train_temp, X_test_temp = normalization(X_train_temp.values.reshape(-1, 1), X_test_temp.values.reshape(-1, 1))
Linear Regression¶
Dataset should have these attributes:¶
Linearity¶
Independent: All the samples shouldn't rely in each other¶
Normality¶
Homoscedasticity(Spread should be constant)¶
Error Calculation:¶
MAE : Mean Absolute Error => sum(yi - y^i) / n, i in range(0, n)¶
MSE : Mean Square Error sum((yi - y^i) ^ 2) / n, i in range(0, n)¶
RMSE: Root Mean Square Error => sqrt(sum((yi - y^i) ^ 2) / n), i in range(0, n)¶
R2 Coeff of Determination(R2 == 1 => good line):¶
- R^2 = 1 - (RSS / TSS)
- RSS = Sum of Squared Residuals
- sum((yi - y^i)^2)
- TSS = Total Sum of Squares
- sum((yi - mean(y))^2)
- RSS = Sum of Squared Residuals
- R^2 = 1 - (RSS / TSS)
We use Gradient Descent to find best w and b (y = wx + b)¶
InĀ [17]:
Copied!
# Load the image
img = Image.open('./homoscedasticity.jpg')
# Display the image
plt.imshow(img)
plt.axis('off') # Optional: turn off axis
plt.show()
# Load the image
img = Image.open('./homoscedasticity.jpg')
# Display the image
plt.imshow(img)
plt.axis('off') # Optional: turn off axis
plt.show()
Single Linear Regression¶
InĀ [18]:
Copied!
temp_reg = LinearRegression()
X_train_temp = pd.DataFrame(X_train_temp, columns=["Temperature"])
temp_reg.fit(X_train_temp, y_train_temp)
temp_reg = LinearRegression()
X_train_temp = pd.DataFrame(X_train_temp, columns=["Temperature"])
temp_reg.fit(X_train_temp, y_train_temp)
Out[18]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
InĀ [19]:
Copied!
print(temp_reg.coef_, temp_reg.intercept_)
X_test_temp = pd.DataFrame(X_test_temp, columns=["Temperature"])
print(temp_reg.score(X_test_temp, y_test_temp))
print(temp_reg.coef_, temp_reg.intercept_)
X_test_temp = pd.DataFrame(X_test_temp, columns=["Temperature"])
print(temp_reg.score(X_test_temp, y_test_temp))
[27.9594049] 414.1728882380206 0.2862170774263961
InĀ [20]:
Copied!
plt.scatter(x=X_train_temp, y=y_train_temp, label="Data", color="blue", alpha=0.85)
x = np.array(tf.linspace(-20, 40, 100)).reshape(-1, 1)
plt.plot(x, temp_reg.predict(x), label="Fit", color="red")
plt.legend()
plt.title("Bikes vs Temperature")
plt.xlabel("Temperature")
plt.ylabel("Bikes")
plt.show()
plt.scatter(x=X_train_temp, y=y_train_temp, label="Data", color="blue", alpha=0.85)
x = np.array(tf.linspace(-20, 40, 100)).reshape(-1, 1)
plt.plot(x, temp_reg.predict(x), label="Fit", color="red")
plt.legend()
plt.title("Bikes vs Temperature")
plt.xlabel("Temperature")
plt.ylabel("Bikes")
plt.show()
/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
Multiple Linear Regression¶
InĀ [21]:
Copied!
mul_reg = LinearRegression()
mul_reg.fit(X_train, y_train)
mul_reg = LinearRegression()
mul_reg.fit(X_train, y_train)
Out[21]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
InĀ [22]:
Copied!
print(mul_reg.coef_, mul_reg.intercept_) # w(s) and b
print(mul_reg.score(X_test, y_test))
print(mul_reg.coef_, mul_reg.intercept_) # w(s) and b
print(mul_reg.score(X_test, y_test))
[ 5.39270766 -492.88366452 551.33873797 -179.14262585] 859.2713636363636 0.41321153859657567
Regression with NN¶
InĀ [23]:
Copied!
def plot_loss(history):
"""Plots training and validation loss."""
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.xlabel('Epochs')
plt.ylabel('MSE')
plt.legend()
plt.grid(True)
plt.show()
def plot_loss(history):
"""Plots training and validation loss."""
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.xlabel('Epochs')
plt.ylabel('MSE')
plt.legend()
plt.grid(True)
plt.show()
Single with NN¶
InĀ [24]:
Copied!
temp_normalizer = tf.keras.layers.Normalization(input_shape=(1, ), axis=None)
temp_normalizer.adapt(X_train_temp.values.reshape(-1))
temp_normalizer = tf.keras.layers.Normalization(input_shape=(1, ), axis=None)
temp_normalizer.adapt(X_train_temp.values.reshape(-1))
/usr/local/lib/python3.11/dist-packages/keras/src/layers/preprocessing/tf_data_layer.py:19: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead. super().__init__(**kwargs)
InĀ [25]:
Copied!
temp_nn_model = tf.keras.Sequential([
temp_normalizer,
tf.keras.layers.Dense(32, activation='relu'),
tf.keras.layers.Dense(32, activation='relu'),
tf.keras.layers.Dense(32, activation='relu'),
tf.keras.layers.Dense(units=1)
])
temp_nn_model = tf.keras.Sequential([
temp_normalizer,
tf.keras.layers.Dense(32, activation='relu'),
tf.keras.layers.Dense(32, activation='relu'),
tf.keras.layers.Dense(32, activation='relu'),
tf.keras.layers.Dense(units=1)
])
InĀ [26]:
Copied!
temp_nn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
loss=tf.keras.losses.MeanSquaredError())
temp_nn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
loss=tf.keras.losses.MeanSquaredError())
InĀ [27]:
Copied!
# Without extra Denses MSE equal to Linear Regression
history = temp_nn_model.fit(X_train_temp.values.reshape(-1,),
y_train_temp,
epochs=500,
verbose = 0,
validation_split=0.2)
# Without extra Denses MSE equal to Linear Regression
history = temp_nn_model.fit(X_train_temp.values.reshape(-1,),
y_train_temp,
epochs=500,
verbose = 0,
validation_split=0.2)
InĀ [28]:
Copied!
plot_loss(history)
plot_loss(history)
InĀ [29]:
Copied!
plt.scatter(x=X_train_temp, y=y_train_temp, label="Data", color="blue", alpha=0.85)
x = np.array(tf.linspace(-20, 40, 100)).reshape(-1, 1)
plt.plot(x, temp_nn_model.predict(x), label="Fit", color="red")
plt.legend()
plt.title("Bikes vs Temperature")
plt.xlabel("Temperature")
plt.ylabel("Bikes")
plt.show()
plt.scatter(x=X_train_temp, y=y_train_temp, label="Data", color="blue", alpha=0.85)
x = np.array(tf.linspace(-20, 40, 100)).reshape(-1, 1)
plt.plot(x, temp_nn_model.predict(x), label="Fit", color="red")
plt.legend()
plt.title("Bikes vs Temperature")
plt.xlabel("Temperature")
plt.ylabel("Bikes")
plt.show()
4/4 āāāāāāāāāāāāāāāāāāāā 0s 23ms/step
Multiple with NN¶
InĀ [30]:
Copied!
# axis=-1, which normalizes each feature independentl
nn_normalizer = tf.keras.layers.Normalization(input_shape=(len(X_train[0]), ), axis=-1)
nn_normalizer.adapt(X_train)
# axis=-1, which normalizes each feature independentl
nn_normalizer = tf.keras.layers.Normalization(input_shape=(len(X_train[0]), ), axis=-1)
nn_normalizer.adapt(X_train)
InĀ [31]:
Copied!
# Without extra Denses MSE equal to Linear Regression
nn_model = tf.keras.Sequential([
nn_normalizer,
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(32, activation='relu'),
tf.keras.layers.Dense(16, activation='relu'),
tf.keras.layers.Dense(1)
])
# Without extra Denses MSE equal to Linear Regression
nn_model = tf.keras.Sequential([
nn_normalizer,
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(32, activation='relu'),
tf.keras.layers.Dense(16, activation='relu'),
tf.keras.layers.Dense(1)
])
InĀ [32]:
Copied!
nn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
loss=tf.keras.losses.MeanSquaredError())
nn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
loss=tf.keras.losses.MeanSquaredError())
InĀ [33]:
Copied!
history = nn_model.fit(X_train,
y_train,
epochs=1000,
validation_split=0.2,
verbose = 0,
)
history = nn_model.fit(X_train,
y_train,
epochs=1000,
validation_split=0.2,
verbose = 0,
)
InĀ [34]:
Copied!
plot_loss(history)
plot_loss(history)
Evaluating Models¶
InĀ [35]:
Copied!
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
InĀ [36]:
Copied!
# Singles
y_true = y_test_temp
y_pred_temp = temp_reg.predict(X_test_temp)
y_pred_temp_nn = temp_nn_model.predict(X_test_temp)
# Multiples
y_true_mul = y_test
y_pred_mul = mul_reg.predict(X_test)
y_pred_mul_nn = nn_model.predict(X_test)
# Singles
y_true = y_test_temp
y_pred_temp = temp_reg.predict(X_test_temp)
y_pred_temp_nn = temp_nn_model.predict(X_test_temp)
# Multiples
y_true_mul = y_test
y_pred_mul = mul_reg.predict(X_test)
y_pred_mul_nn = nn_model.predict(X_test)
34/34 āāāāāāāāāāāāāāāāāāāā 0s 3ms/step 34/34 āāāāāāāāāāāāāāāāāāāā 0s 4ms/step
InĀ [37]:
Copied!
# Single
mse_single_reg = mean_squared_error(y_true, y_pred_temp)
mse_single_reg_nn = mean_squared_error(y_true, y_pred_temp_nn)
print(f"Mean Squared Error Linear Regression: {mse_single_reg}")
print(f"Mean Squared Error NN: {mse_single_reg_nn}")
# Mul
mse_mul_reg = mean_squared_error(y_true_mul, y_pred_mul)
mse_mul_reg_nn = mean_squared_error(y_true_mul, y_pred_mul_nn)
print(f"Mean Squared Error Linear Regression mul: {mse_mul_reg}")
print(f"Mean Squared Error mul NN: {mse_mul_reg_nn}")
# Single
mse_single_reg = mean_squared_error(y_true, y_pred_temp)
mse_single_reg_nn = mean_squared_error(y_true, y_pred_temp_nn)
print(f"Mean Squared Error Linear Regression: {mse_single_reg}")
print(f"Mean Squared Error NN: {mse_single_reg_nn}")
# Mul
mse_mul_reg = mean_squared_error(y_true_mul, y_pred_mul)
mse_mul_reg_nn = mean_squared_error(y_true_mul, y_pred_mul_nn)
print(f"Mean Squared Error Linear Regression mul: {mse_mul_reg}")
print(f"Mean Squared Error mul NN: {mse_mul_reg_nn}")
Mean Squared Error Linear Regression: 315400.22600332374 Mean Squared Error NN: 285309.6875 Mean Squared Error Linear Regression mul: 259285.01157674904 Mean Squared Error mul NN: 169355.328125
InĀ [38]:
Copied!
ax = plt.axes(aspect="equal")
plt.scatter(y_true, y_pred_temp, label="Lin Reg Preds")
plt.scatter(y_true, y_pred_temp_nn, label="NN Preds", alpha=0.8)
plt.title("Single")
plt.xlabel("True Values")
plt.ylabel("Predictions")
lims = [0, 1800]
plt.xlim(lims)
plt.ylim(lims)
plt.legend()
_ = plt.plot(lims, lims, c="purple")
plt.show()
ax2 = plt.axes(aspect="equal")
plt.scatter(y_true_mul, y_pred_mul, label="Lin Reg Preds")
plt.scatter(y_true_mul, y_pred_mul_nn, label="NN Preds", alpha=0.8)
plt.title("Multiple")
plt.xlabel("True Values")
plt.ylabel("Predictions")
lims = [0, 1800]
plt.xlim(lims)
plt.ylim(lims)
plt.legend()
_ = plt.plot(lims, lims, c="purple")
plt.show()
ax = plt.axes(aspect="equal")
plt.scatter(y_true, y_pred_temp, label="Lin Reg Preds")
plt.scatter(y_true, y_pred_temp_nn, label="NN Preds", alpha=0.8)
plt.title("Single")
plt.xlabel("True Values")
plt.ylabel("Predictions")
lims = [0, 1800]
plt.xlim(lims)
plt.ylim(lims)
plt.legend()
_ = plt.plot(lims, lims, c="purple")
plt.show()
ax2 = plt.axes(aspect="equal")
plt.scatter(y_true_mul, y_pred_mul, label="Lin Reg Preds")
plt.scatter(y_true_mul, y_pred_mul_nn, label="NN Preds", alpha=0.8)
plt.title("Multiple")
plt.xlabel("True Values")
plt.ylabel("Predictions")
lims = [0, 1800]
plt.xlim(lims)
plt.ylim(lims)
plt.legend()
_ = plt.plot(lims, lims, c="purple")
plt.show()
InĀ [38]:
Copied!