Alright, let's put theory into practice. We've covered defining network structures, initializing parameters, calculating predictions via forward propagation, measuring error with loss functions, computing gradients through backpropagation, and updating parameters using gradient descent. Now, we'll integrate these steps to build and train a simple neural network classifier from scratch using Python and NumPy.
Our goal is to train a network that can classify data points belonging to one of two classes based on two input features. This is a classic binary classification task, perfect for illustrating the core training loop.
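First, we need our primary tool for numerical computation, NumPy. For visualization, we will describe Plotly figures as plain Python dictionaries, so the only additional import is the standard-library `json` module, which lets us serialize those figure specifications for display.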
import numpy as np
import json # For embedding Plotly JSON
# Fix the seed of NumPy's global random generator so every run draws the same numbers
np.random.seed(seed=42)
Next, let's generate some simple synthetic data. We'll create two distinct clusters of points in a 2D plane, representing our two classes (labeled 0 and 1).
def generate_data(n_samples=100, noise=0.1):
    """Generate two Gaussian clusters of 2-D points for binary classification.

    Class 0 is centered at (1, 1) and class 1 at (-1, -1). Samples are drawn
    from NumPy's global random generator and returned in shuffled order.

    Args:
        n_samples: Total number of points to generate. When odd, class 1
            receives the extra point so exactly ``n_samples`` rows are returned.
        noise: Standard deviation of each cluster around its center.

    Returns:
        A tuple ``(X, Y)`` where ``X`` has shape ``(n_samples, 2)`` and ``Y``
        has shape ``(n_samples, 1)`` holding the labels 0.0 or 1.0.
    """
    # Split explicitly so that odd totals still add up to n_samples; sizing both
    # classes with n_samples // 2 would make the shuffle index out of bounds.
    n_class0 = n_samples // 2
    n_class1 = n_samples - n_class0

    # Class 0: centered around (1, 1)
    X0 = np.random.randn(n_class0, 2) * noise + np.array([1, 1])
    Y0 = np.zeros((n_class0, 1))

    # Class 1: centered around (-1, -1)
    X1 = np.random.randn(n_class1, 2) * noise + np.array([-1, -1])
    Y1 = np.ones((n_class1, 1))

    X = np.vstack((X0, X1))
    Y = np.vstack((Y0, Y1))

    # Shuffle features and labels with the same permutation to keep them aligned
    permutation = np.random.permutation(n_samples)
    return X[permutation], Y[permutation]
# Draw the training set: 200 points with a cluster spread of 0.2
X_train, Y_train = generate_data(n_samples=200, noise=0.2)

# Describe the scatter plot as Plotly-style dictionaries, one trace per class
trace0 = dict(
    type="scatter",
    mode="markers",
    x=X_train[Y_train.ravel() == 0, 0].tolist(),
    y=X_train[Y_train.ravel() == 0, 1].tolist(),
    name="Class 0",
    marker=dict(color="#fa5252", size=8),  # red
)
trace1 = dict(
    type="scatter",
    mode="markers",
    x=X_train[Y_train.ravel() == 1, 0].tolist(),
    y=X_train[Y_train.ravel() == 1, 1].tolist(),
    name="Class 1",
    marker=dict(color="#4c6ef5", size=8),  # blue
)
layout = dict(
    title=dict(text="Synthetic Classification Data"),
    xaxis=dict(title="Feature 1"),
    yaxis=dict(title="Feature 2"),
    width=600,
    height=400,
    showlegend=True,
    plot_bgcolor="#e9ecef",
)
The synthetic dataset contains two classes, visually separated in a 2D feature space. Our network should learn to draw a boundary between them.
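We'll define a simple feedforward network with three layers: an input layer with 2 units (one per feature), a hidden layer with 4 units using the ReLU activation, and an output layer with a single unit using the sigmoid activation, whose output is interpreted as the probability that a point belongs to class 1.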
Let's define the layer sizes:
# The input width is read from the data; the other two widths are chosen by hand
n_input = X_train.shape[1]  # one input unit per feature column (2 here)
n_hidden, n_output = 4, 1  # four hidden units feeding a single output unit
We need weights ($W$) and biases ($b$) for the connection between the input and hidden layer ($W_1, b_1$) and between the hidden and output layer ($W_2, b_2$). We'll initialize the weights with small random numbers (scaled by 0.01 to prevent overly large initial values) and the biases to zero.
def initialize_parameters(n_in, n_hid, n_out, scale=0.01):
    """Create the initial weights and biases of the two-layer network.

    Weights are drawn from a standard normal distribution (NumPy's global
    random generator) and multiplied by ``scale``; biases start at zero.

    Args:
        n_in: Number of input features.
        n_hid: Number of hidden units.
        n_out: Number of output units.
        scale: Factor applied to the random weights to keep them small.

    Returns:
        A dict with keys ``"W1"`` of shape ``(n_in, n_hid)``, ``"b1"`` of
        shape ``(1, n_hid)``, ``"W2"`` of shape ``(n_hid, n_out)`` and
        ``"b2"`` of shape ``(1, n_out)``.
    """
    # W1 is drawn before W2, so a fixed seed always yields the same pair
    W1 = np.random.randn(n_in, n_hid) * scale
    b1 = np.zeros((1, n_hid))
    W2 = np.random.randn(n_hid, n_out) * scale
    b2 = np.zeros((1, n_out))
    return {"W1": W1, "b1": b1, "W2": W2, "b2": b2}
parameters = initialize_parameters(n_input, n_hidden, n_output)
# Report the shape of every parameter array, one per line
print(
    *(
        f"Initial {key} shape: {parameters[key].shape}"
        for key in ("W1", "b1", "W2", "b2")
    ),
    sep="\n",
)
Now, let's implement the core functions we discussed in previous sections.
Activation Functions:
def sigmoid(Z):
    """Apply the sigmoid activation ``1 / (1 + exp(-Z))`` element-wise.

    The value is computed from ``exp(-|Z|)``, which always lies in (0, 1], so
    large-magnitude inputs cannot overflow the exponential.

    Args:
        Z: Scalar or array of pre-activation values.

    Returns:
        Values in [0, 1] with the same shape as ``Z`` (a NumPy scalar when
        ``Z`` is a scalar).
    """
    Z = np.asarray(Z)
    decay = np.exp(-np.abs(Z))
    # For Z >= 0 this is the textbook form; for Z < 0 the algebraically equal
    # exp(Z) / (1 + exp(Z)) is used so the exponent is never positive.
    A = np.where(Z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple turns a 0-d result back into a scalar and
    # leaves arrays of any other rank unchanged.
    return A[()]
def relu(Z):
    """Apply the ReLU activation ``max(0, Z)`` element-wise.

    Args:
        Z: Scalar or array of pre-activation values.

    Returns:
        ``Z`` with every negative entry replaced by zero.
    """
    return np.maximum(0, Z)
Forward Propagation: This function takes the input data X and the network parameters, performs the linear transformations and applies activation functions layer by layer, returning the final prediction A2 and intermediate values (cache) needed for backpropagation.
def forward_propagation(X, parameters):
    """Run the forward pass of the two-layer network.

    Args:
        X: Input array with one sample per row; its column count must match
            the row count of ``parameters["W1"]``.
        parameters: Dict holding the arrays ``"W1"``, ``"b1"``, ``"W2"`` and
            ``"b2"``.

    Returns:
        A tuple ``(A2, cache)``. ``A2`` is the sigmoid output of the network,
        and ``cache`` is a dict with the intermediate arrays ``"Z1"``,
        ``"A1"``, ``"Z2"`` and ``"A2"`` needed for backpropagation.
    """
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]

    # Layer 1 (hidden): linear step followed by ReLU
    Z1 = np.dot(X, W1) + b1
    A1 = relu(Z1)

    # Layer 2 (output): linear step followed by sigmoid for binary classification
    Z2 = np.dot(A1, W2) + b2
    A2 = sigmoid(Z2)

    cache = {"Z1": Z1, "A1": A1, "Z2": Z2, "A2": A2}
    return A2, cache
Loss Function: We'll use Binary Cross-Entropy loss, suitable for binary classification problems where the output is a probability.
# Binary Cross-Entropy loss:
#   L = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log a^{(i)} + (1 - y^{(i)}) \log\left(1 - a^{(i)}\right) \right]
def compute_loss(A2, Y):
    """Compute the Binary Cross-Entropy loss averaged over the examples.

    Args:
        A2: Predicted probabilities, same shape as ``Y``.
        Y: True labels (0 or 1) with one example per row.

    Returns:
        The scalar loss value.
    """
    m = Y.shape[0]  # Number of examples

    # Keep the probabilities strictly inside (0, 1) so log never sees 0.
    # Clipping, unlike adding epsilon inside the log, cannot push the argument
    # above 1 and therefore never produces a negative loss.
    epsilon = 1e-8
    A2_safe = np.clip(A2, epsilon, 1 - epsilon)

    loss = -(1 / m) * np.sum(
        Y * np.log(A2_safe) + (1 - Y) * np.log(1 - A2_safe)
    )
    return np.squeeze(loss)  # Ensure loss is a scalar
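Backward Propagation: This is where we calculate the gradients ($\frac{\partial L}{\partial W_1}, \frac{\partial L}{\partial b_1}, \frac{\partial L}{\partial W_2}, \frac{\partial L}{\partial b_2}$) using the chain rule, working backward from the output layer.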
def backward_propagation(parameters, cache, X, Y):
    """Compute the loss gradients for all weights and biases.

    Args:
        parameters: Dict of network parameters; only ``"W2"`` is read here.
        cache: Dict from the forward pass holding ``"Z1"``, ``"A1"`` and
            ``"A2"``.
        X: Input array with one sample per row.
        Y: True labels, same shape as ``cache["A2"]``.

    Returns:
        A dict with the gradients ``"dW1"``, ``"db1"``, ``"dW2"`` and
        ``"db2"``, each averaged over the samples.
    """
    m = X.shape[0]
    W2 = parameters["W2"]
    A1 = cache["A1"]
    A2 = cache["A2"]
    Z1 = cache["Z1"]

    # Output layer: derivative of the BCE loss w.r.t. Z2
    dZ2 = A2 - Y
    dW2 = (1 / m) * np.dot(A1.T, dZ2)
    db2 = (1 / m) * np.sum(dZ2, axis=0, keepdims=True)

    # Hidden layer: push the error back through W2, then through ReLU,
    # whose gradient is 1 where Z1 > 0 and 0 elsewhere
    dA1 = np.dot(dZ2, W2.T)
    dZ1 = dA1 * (Z1 > 0)
    dW1 = (1 / m) * np.dot(X.T, dZ1)
    db1 = (1 / m) * np.sum(dZ1, axis=0, keepdims=True)

    return {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}
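Parameter Update: Apply the gradient descent rule: $W = W - \alpha \frac{\partial L}{\partial W}$, $b = b - \alpha \frac{\partial L}{\partial b}$, where $\alpha$ is the learning rate.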
def update_parameters(parameters, gradients, learning_rate):
    """Take one gradient descent step on every parameter.

    Args:
        parameters: Dict holding ``"W1"``, ``"b1"``, ``"W2"`` and ``"b2"``.
        gradients: Dict holding the matching ``"dW1"``, ``"db1"``, ``"dW2"``
            and ``"db2"`` arrays.
        learning_rate: Step size multiplied with each gradient.

    Returns:
        A new dict with the updated ``"W1"``, ``"b1"``, ``"W2"`` and ``"b2"``;
        the input arrays are not modified.
    """
    # Each gradient is stored under its parameter's name prefixed with "d"
    return {
        name: parameters[name] - learning_rate * gradients["d" + name]
        for name in ("W1", "b1", "W2", "b2")
    }
Now we assemble everything into the main training loop. We'll iterate multiple times (epochs) over the entire dataset, performing forward propagation, calculating loss, performing backpropagation, and updating parameters in each iteration.
def train_network(X, Y, n_hidden, num_epochs=10000, learning_rate=0.1, print_loss=True):
    """Build and train the two-layer network with full-batch gradient descent.

    Args:
        X: Training inputs with one sample per row.
        Y: Training labels with one row per sample; its column count sets the
            number of output units.
        n_hidden: Number of hidden units.
        num_epochs: Number of passes over the whole dataset.
        learning_rate: Step size for the parameter updates.
        print_loss: When true, print the loss every 1000 epochs and once more
            after training.

    Returns:
        A tuple ``(parameters, losses)`` with the trained parameter dict and
        the list of loss values, one per epoch.
    """
    n_input = X.shape[1]
    n_output = Y.shape[1]
    losses = []

    # 1. Initialize parameters
    parameters = initialize_parameters(n_input, n_hidden, n_output)

    # 2. Training loop (gradient descent)
    for epoch in range(num_epochs):
        # 3. Forward propagation
        A2, cache = forward_propagation(X, parameters)

        # 4. Compute loss (measured before this epoch's update is applied)
        loss = compute_loss(A2, Y)
        losses.append(loss)

        # 5. Backward propagation
        gradients = backward_propagation(parameters, cache, X, Y)

        # 6. Update parameters
        parameters = update_parameters(parameters, gradients, learning_rate)

        # Print the loss every 1000 epochs
        if print_loss and epoch % 1000 == 0:
            print(f"Loss after epoch {epoch}: {loss:.4f}")

    # With num_epochs == 0 no loss was ever computed, so there is nothing to report
    if print_loss and losses:
        print(f"Final Loss after epoch {num_epochs}: {losses[-1]:.4f}")

    return parameters, losses
# --- Train the model ---
# 20000 full-batch epochs with a learning rate of 0.5
trained_parameters, training_losses = train_network(
    X=X_train,
    Y=Y_train,
    n_hidden=n_hidden,
    num_epochs=20000,
    learning_rate=0.5,
)
A standard way to monitor training is to plot the loss over epochs. We expect the loss to decrease as the network learns.
# Plotting the loss curve: one point per epoch
epochs = list(range(len(training_losses)))
loss_trace = {
    "type": "scatter", "mode": "lines",
    "x": epochs, "y": training_losses,
    "name": "Training Loss", "line": {"color": "#7048e8"}  # violet
}
loss_layout = {
    "title": {"text": "Training Loss Over Epochs"},
    "xaxis": {"title": "Epoch"},
    "yaxis": {
        "title": "Binary Cross-Entropy Loss",
        # The axis range belongs inside the "yaxis" object; a top-level
        # "yaxis_range" key is a plotly.py shorthand that a raw layout
        # dictionary serialized to JSON does not support.
        # Leave 10% headroom above the largest loss value.
        "range": [0, float(max(training_losses)) * 1.1],
    },
    "width": 600, "height": 400, "showlegend": False,
}
The training loss decreases significantly over epochs, indicating that the network is learning to minimize the prediction error.
Let's evaluate the performance by calculating the accuracy on the training set. We'll make predictions using the final trained parameters and compare them to the true labels. A threshold of 0.5 is commonly used for binary classification with sigmoid output.
def predict(parameters, X):
    """Predict class labels for ``X`` using the given network parameters.

    Args:
        parameters: Dict of network parameters accepted by
            ``forward_propagation``.
        X: Input array with one sample per row.

    Returns:
        An integer array of 0/1 labels with the same shape as the network
        output; a sample is labeled 1 when its predicted probability is
        strictly greater than 0.5.
    """
    A2, _ = forward_propagation(X, parameters)
    return (A2 > 0.5).astype(int)  # Threshold at 0.5
# Classify every training sample with the trained parameters
predictions = predict(trained_parameters, X_train)
# Accuracy: percentage of samples whose predicted label matches the true label
accuracy = 100 * (predictions == Y_train).mean()
print(f"Training Accuracy: {accuracy:.2f}%")
You should see a high accuracy (likely close to 100% for this simple dataset), confirming that the network learned to classify the data points correctly.
To better understand what the network learned, we can visualize the decision boundary. This involves creating a grid of points spanning the feature space, predicting the class for each point, and plotting the regions corresponding to each predicted class.
def plot_decision_boundary(pred_func, X, Y, parameters, h=0.01):
    """Build a Plotly figure specification showing the learned decision boundary.

    The feature space around the data is covered with a regular grid, every
    grid point is classified with ``pred_func``, and the result is drawn as a
    contour underneath the original data points.

    Args:
        pred_func: Callable invoked as ``pred_func(parameters, points)`` where
            ``points`` has one 2-D grid point per row; it must return one
            prediction per point.
        X: Data points with the two features in columns 0 and 1.
        Y: Labels (0 or 1) for the rows of ``X``.
        parameters: Passed through unchanged to ``pred_func``.
        h: Step size of the mesh; smaller values give a finer boundary at the
            cost of more grid points.

    Returns:
        A dict with the keys ``"layout"`` and ``"data"``; ``"data"`` lists the
        contour trace followed by the class 0 and class 1 scatter traces.

    Raises:
        ValueError: If ``h`` is not positive.
    """
    if h <= 0:
        raise ValueError("h must be a positive step size")

    # Set min and max values and give the plot some padding
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5

    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Predict the class for every grid point, then restore the grid layout
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = np.asarray(pred_func(parameters, grid_points)).reshape(xx.shape)

    contour_trace = {
        "type": 'contour',
        "x": xx[0, :].tolist(), "y": yy[:, 0].tolist(), "z": Z.tolist(),
        "colorscale": [[0, '#ffc9c9'], [1, '#a5d8ff']],  # red to blue
        "opacity": 0.4, "showscale": False, "name": "Decision Boundary"
    }

    labels = Y.flatten()  # computed once and reused for both class masks

    def _class_trace(label, color):
        """Return the scatter trace for the data points of one class."""
        mask = labels == label
        return {
            "type": "scatter", "mode": "markers",
            "x": X[mask, 0].tolist(), "y": X[mask, 1].tolist(),
            "name": f"Class {label}", "marker": {"color": color, "size": 8}
        }

    trace0 = _class_trace(0, '#fa5252')  # red
    trace1 = _class_trace(1, '#4c6ef5')  # blue

    layout = {
        "title": {"text": "Decision Boundary"},
        "xaxis": {"title": "Feature 1"}, "yaxis": {"title": "Feature 2"},
        "width": 600, "height": 450, "showlegend": True,
        "plot_bgcolor": "#e9ecef"
    }
    return {"layout": layout, "data": [contour_trace, trace0, trace1]}
# Build the figure specification for the decision boundary of the trained model
boundary_plot_json = plot_decision_boundary(
    pred_func=predict,
    X=X_train,
    Y=Y_train,
    parameters=trained_parameters,
)
# (Optional) Display the plot using the Plotly library if available, or just show the JSON
# import plotly.graph_objects as go
# fig = go.Figure(data=boundary_plot_json['data'], layout=boundary_plot_json['layout'])
# fig.show()
# Embed the JSON for web display
Was this section helpful?
© 2025 ApX Machine Learning