```r
# Prepare environment ----------------------------------------------------------
# Packages
library(nnet)       # for multinom() function
library(MLmetrics)  # for LogLoss() function
library(FactoMineR) # for tab.disjonctif() function

# Default rounding for this session
options("digits" = 5)
```
## Introduction
Cross-entropy (CE) quantifies the difference between two probability distributions. As such, it comes in handy as a loss function in multi-class classification tasks (e.g., multinomial logistic regression). It also provides an elegant way to measure the difference between observed and predicted categorical values, which makes it useful for assessing the predictive performance of a classification model: the cross-entropy is higher the more the predicted class probabilities diverge from the true labels.
## Learn by coding
In a multi-class classification task, we calculate a separate “loss” for each class for each observation and sum the results:
\[
CE = - \sum^{N}_{i = 1} \sum^{K}_{k = 1} p_{ik} \log(\hat{p}_{ik})
(\#eq:CE)
\]
where
- \(N\) is the sample size.
- \(K\) is the number of categories of the variable we are trying to predict.
- \(p_{ik}\) is a scalar taking value \(0 = \text{no}\) or \(1 = \text{yes}\) to indicate whether observation \(i\) belongs to class \(k\). This can also be thought of as the true probability of the observation belonging to that class.
- \(\hat{p}_{ik}\) is a scalar indicating the predicted probability of observation \(i\) belonging to class \(k\).
- \(\log\) is the natural logarithm.
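To make the formula concrete, here is a quick hand computation for a single observation. The probabilities below are made up for illustration, not taken from any fitted model:

```r
# Toy example: the true class is the first of K = 3, so p = (1, 0, 0);
# suppose a model predicts p_hat = (0.7, 0.2, 0.1)
p_toy     <- c(1, 0, 0)
p_hat_toy <- c(0.7, 0.2, 0.1)

# Only the true class contributes to the sum: -log(0.7)
-sum(p_toy * log(p_hat_toy))
```

Only the term for the true class survives, because \(p_{ik} = 0\) for every other class.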
Let’s see an example in R. The `iris` data records the petal and sepal dimensions of 150 flowers, along with their species. Consider the task of predicting each flower’s species from its sepal length. We will fit a multinomial logistic regression to the data and compute the cross-entropy between the observed and predicted class memberships.
To start, we should prepare the R environment by loading a few packages we will use:

- `nnet` to estimate the multinomial logistic model;
- `MLmetrics` to check someone else’s implementation of the cross-entropy computation;
- `FactoMineR` to create a disjunctive table from an R factor.
Then, we should estimate the multinomial logistic model of interest. We will use this model to create predictions.
```r
# Fit multinomial logistic model -------------------------------------------------
# Fit model
glm_mln <- multinom(Species ~ Sepal.Length, data = iris)
```
We can now create two R matrices, `p` and `p_hat`, storing all the scalars \(p_{ik}\) and \(\hat{p}_{ik}\) we need to compute @ref(eq:CE).
First, we want to store all the \(p_{ik}\) in one matrix. To do so, we can create a disjunctive table based on the `Species` factor. This is an \(N \times K\) matrix storing 0s and 1s to indicate which class every observation belongs to.

```r
# Obtain p and p_hat -------------------------------------------------------------
# store true labels in a matrix p
p <- FactoMineR::tab.disjonctif(iris$Species)

# check it
head(p)
```

```
  setosa versicolor virginica
1      1          0         0
2      1          0         0
3      1          0         0
4      1          0         0
5      1          0         0
6      1          0         0
```
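If you prefer not to depend on `FactoMineR` for this step, the same indicator matrix can be built with base R’s `model.matrix()`. This is just a sketch to double-check the result; the column names will differ, but the 0/1 entries should match:

```r
# Base-R alternative: one dummy column per factor level, no intercept
p_base <- model.matrix(~ Species - 1, data = iris)

# The 0/1 entries should be identical to p
all(p_base == p)
```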
Second, we want to obtain the predicted class probabilities for every observation:
```r
# obtain predictions
p_hat <- predict(glm_mln, type = "probs")

# check it
head(p_hat)
```

```
   setosa versicolor virginica
1 0.80657   0.176155 0.0172792
2 0.91844   0.076558 0.0050018
3 0.96787   0.030792 0.0013399
4 0.98005   0.019262 0.0006841
5 0.87281   0.117765 0.0094276
6 0.47769   0.442466 0.0798435
```
We can now write a loop to perform the computation in @ref(eq:CE) for every \(i\) and \(k\).
```r
# Compute CE with a loop -------------------------------------------------------
# Define parameters
N <- nrow(iris)            # sample size
K <- nlevels(iris$Species) # number of classes

# Create storing object for CE
CE <- 0

# Compute CE with a loop
for (i in 1:N) {
  for (k in 1:K) {
    CE <- CE - p[i, k] * log(p_hat[i, k])
  }
}

# Print the value of CE
CE
```

```
[1] 91.034
```
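As a quick sanity check, the residual deviance reported by `multinom()` is minus twice the maximized log-likelihood, and the log-likelihood of a multinomial model is exactly \(-CE\). Assuming that is how the deviance is stored (and that no case weights are used), halving it should reproduce our result:

```r
# Deviance = 2 * negative log-likelihood, so deviance / 2 should equal CE
glm_mln$deviance / 2
```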
We can also work with the matrices `p` and `p_hat` directly to avoid using a loop:
```r
# Compute CE using the matrices directly ---------------------------------------
ce <- -sum(diag(p %*% t(log(p_hat))))

# Print the value of ce
ce
```

```
[1] 91.034
```
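Note that `diag(p %*% t(log(p_hat)))` builds an entire \(N \times N\) matrix only to keep its diagonal. An element-wise product gives the same sum with far less work:

```r
# Same computation, element by element: only the (i, k) pairs with p = 1 count
-sum(p * log(p_hat))
```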
This approach works for a binary prediction just as well. We only need to pay attention to storing the true and predicted probabilities in matrix form. For example, consider the task of predicting the transmission type (automatic or not) for the cars recorded in the `mtcars` dataset.
```r
# Binary cross-entropy ----------------------------------------------------------
# Fit model
glm_log <- glm(am ~ hp + wt,
               family = binomial(link = "logit"),
               data = mtcars)

# store true labels in a matrix p
p <- FactoMineR::tab.disjonctif(as.factor(mtcars$am))

# obtain predicted probabilities in matrix form
pred_probs <- predict(glm_log, type = "response")
p_hat <- cbind(k_0 = 1 - pred_probs,
               k_1 = pred_probs)
class(p_hat)
```

```
[1] "matrix" "array"
```
The objects `p` and `p_hat` are all the information we need to compute the cross-entropy for this binary prediction task:
```r
# check the first few rows of p
head(p)
```

```
  0 1
1 0 1
2 0 1
3 0 1
4 1 0
5 1 0
6 1 0
```
```r
# check the first few rows of p_hat
head(p_hat)
```

```
                       k_0       k_1
Mazda RX4         0.157664 0.8423355
Mazda RX4 Wag     0.595217 0.4047825
Datsun 710        0.029759 0.9702408
Hornet 4 Drive    0.958272 0.0417280
Hornet Sportabout 0.930612 0.0693881
Valiant           0.995012 0.0049882
```
We can use these new objects to obtain the binary CE with the same computation we used for the multiclass CE:
```r
# Compute CE using the matrices directly
ce <- -sum(diag(p %*% t(log(p_hat))))

# Print the value of ce
ce
```

```
[1] 5.0296
```
It is not uncommon to divide the value of the cross-entropy by the number of units on which the computation is performed, effectively producing an average loss across the units.
```r
# Express as average
ce / nrow(mtcars)
```

```
[1] 0.15717
```
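One practical caveat: if a model ever returns a predicted probability of exactly 0 or 1, `log()` produces `-Inf` and the cross-entropy blows up. A common guard is to clip the probabilities away from the boundary before taking logs. This is only a sketch; the `eps` value below is an arbitrary choice for illustration:

```r
# Clip predicted probabilities to [eps, 1 - eps] before taking logs
eps <- 1e-15
p_hat_safe <- pmin(pmax(p_hat, eps), 1 - eps)

# Average binary CE on the clipped probabilities (unchanged here, since no
# predicted probability sits exactly on the boundary)
-sum(p * log(p_hat_safe)) / nrow(mtcars)
```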
Just to be sure, we can use the `LogLoss()` function from the `MLmetrics` package to compute the same binary CE. However, this function requires the true and predicted probabilities to be stored as vectors instead of matrices. So first we need to obtain the vector versions of `p` and `p_hat`:
```r
# Compute binary CE with MLmetrics implementation ------------------------------
# Obtain vector of true probabilities
p_vec <- mtcars$am

# Obtain vector of predicted probabilities
p_hat_vec <- predict(glm_log, type = "response")
```
and then we can simply provide these objects to the `LogLoss()` function:
```r
# Compute and print binary CE with MLmetrics implementation
MLmetrics::LogLoss(y_pred = p_hat_vec,
                   y_true = p_vec)
```

```
[1] 0.15717
```
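For reference, `LogLoss()` is just the average binary cross-entropy (up to internal numerical safeguards), so the same number can be obtained directly in base R:

```r
# Average binary CE written out explicitly
-mean(p_vec * log(p_hat_vec) + (1 - p_vec) * log(1 - p_hat_vec))
```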
## TL;DR, just give me the code!
```r
# Prepare environment ----------------------------------------------------------
# Packages
library(nnet)       # for multinom() function
library(MLmetrics)  # for LogLoss() function
library(FactoMineR) # for tab.disjonctif() function

# Default rounding for this session
options("digits" = 5)

# Fit multinomial logistic model -------------------------------------------------
# Fit model
glm_mln <- multinom(Species ~ Sepal.Length, data = iris)

# Obtain p and p_hat -------------------------------------------------------------
# store true labels in a matrix p
p <- FactoMineR::tab.disjonctif(iris$Species)

# check it
head(p)

# obtain predictions
p_hat <- predict(glm_mln, type = "probs")

# check it
head(p_hat)

# Compute CE with a loop -------------------------------------------------------
# Define parameters
N <- nrow(iris)            # sample size
K <- nlevels(iris$Species) # number of classes

# Create storing object for CE
CE <- 0

# Compute CE with a loop
for (i in 1:N) {
  for (k in 1:K) {
    CE <- CE - p[i, k] * log(p_hat[i, k])
  }
}

# Print the value of CE
CE

# Compute CE using the matrices directly ---------------------------------------
ce <- -sum(diag(p %*% t(log(p_hat))))

# Print the value of ce
ce

# Binary cross-entropy ----------------------------------------------------------
# Fit model
glm_log <- glm(am ~ hp + wt,
               family = binomial(link = "logit"),
               data = mtcars)

# store true labels in a matrix p
p <- FactoMineR::tab.disjonctif(as.factor(mtcars$am))

# obtain predicted probabilities in matrix form
pred_probs <- predict(glm_log, type = "response")
p_hat <- cbind(k_0 = 1 - pred_probs,
               k_1 = pred_probs)
class(p_hat)

# check the first few rows of p
head(p)

# check the first few rows of p_hat
head(p_hat)

# Compute CE using the matrices directly
ce <- -sum(diag(p %*% t(log(p_hat))))

# Print the value of ce
ce

# Express as average
ce / nrow(mtcars)

# Compute binary CE with MLmetrics implementation ------------------------------
# Obtain vector of true probabilities
p_vec <- mtcars$am

# Obtain vector of predicted probabilities
p_hat_vec <- predict(glm_log, type = "response")

# Compute and print binary CE with MLmetrics implementation
MLmetrics::LogLoss(y_pred = p_hat_vec,
                   y_true = p_vec)
```