Replicate logloss function in R from one given in Kaggle Plankton tutorial by Aaron Sander.
y_true <- read.csv("y_true.csv")[,1]
y_true <- y_true + 1 # make 1 origin
length(y_true)
## [1] 30336
y_true[1]
## [1] 1
y_pred <- as.matrix(read.csv("y_pred.csv"))
dim(y_pred)
## [1] 30336 121
colnames(y_pred) <-NULL
y_pred[1,]
## [1] 0.27 0.00 0.00 0.00 0.00 0.01 0.01 0.01 0.00 0.00 0.00 0.00 0.00 0.00
## [15] 0.07 0.01 0.02 0.04 0.00 0.02 0.00 0.01 0.02 0.00 0.03 0.15 0.00 0.01
## [29] 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.00 0.04 0.00 0.00 0.00 0.01 0.00
## [43] 0.00 0.01 0.00 0.00 0.00 0.01 0.00 0.01 0.00 0.00 0.00 0.00 0.00 0.00
## [57] 0.00 0.01 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.00 0.00 0.03 0.00
## [71] 0.00 0.00 0.07 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## [85] 0.00 0.01 0.00 0.00 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## [99] 0.03 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.01
## [113] 0.00 0.00 0.01 0.00 0.00 0.00 0.00 0.01 0.00
No clip function (that I know of) in R, so brute force:
epsilon <- 1E-15
predictions <- y_pred
predictions[y_pred < epsilon] <- epsilon
predictions[y_pred > 1-epsilon] <- 1-epsilon
predictions[1,]
## [1] 2.7e-01 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-02 1.0e-02 1.0e-02
## [9] 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-15 7.0e-02 1.0e-02
## [17] 2.0e-02 4.0e-02 1.0e-15 2.0e-02 1.0e-15 1.0e-02 2.0e-02 1.0e-15
## [25] 3.0e-02 1.5e-01 1.0e-15 1.0e-02 1.0e-15 1.0e-15 1.0e-15 1.0e-15
## [33] 1.0e-15 1.0e-15 2.0e-02 1.0e-15 4.0e-02 1.0e-15 1.0e-15 1.0e-15
## [41] 1.0e-02 1.0e-15 1.0e-15 1.0e-02 1.0e-15 1.0e-15 1.0e-15 1.0e-02
## [49] 1.0e-15 1.0e-02 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-15
## [57] 1.0e-15 1.0e-02 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-15
## [65] 1.0e-15 2.0e-02 1.0e-15 1.0e-15 3.0e-02 1.0e-15 1.0e-15 1.0e-15
## [73] 7.0e-02 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-15
## [81] 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-02 1.0e-15 1.0e-15
## [89] 1.0e-15 1.0e-15 2.0e-02 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-15
## [97] 1.0e-15 1.0e-15 3.0e-02 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-15
## [105] 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-02
## [113] 1.0e-15 1.0e-15 1.0e-02 1.0e-15 1.0e-15 1.0e-15 1.0e-15 1.0e-02
## [121] 1.0e-15
actual <- matrix(0, nrow(y_pred), ncol(y_pred))
n_samples <- nrow(actual)
“easy” python-like way does not work in R
actual[1:121, y_true] <- 1
for (i in 1:n_samples)
{
actual[i, y_true[i]] <- 1
}
actual[1,]
## [1] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [36] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [71] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [106] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Element-wise product
prod <- actual * log(predictions)
prod[1,]
## [1] -1.309333 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [8] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [15] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [22] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [29] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [36] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [43] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [50] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [57] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [64] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [71] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [78] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [85] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [92] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [99] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [106] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [113] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [120] 0.000000 0.000000
logloss by image
byImage <- apply(prod,1, sum)
hist(byImage, main="Image logloss distribution")
loss <- -1.0 * sum(byImage) / n_samples
loss
## [1] 3.711261
table(y_true[byImage < 5*median(byImage)])
##
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
## 13 5 15 8 4 22 5 2 2 7 4 5 1 3 15 15 11 12
## 20 21 22 23 24 25 27 28 29 30 31 32 33 34 35 36 37 38
## 3 8 15 6 11 1 15 10 22 17 31 5 13 18 34 15 1 8
## 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
## 10 9 12 8 4 1 16 12 2 13 11 6 7 20 9 6 14 13
## 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
## 9 15 14 7 1 4 11 30 14 13 11 5 1 6 20 29 3 11
## 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
## 24 10 10 5 12 14 8 40 23 8 3 5 3 14 12 15 23 22
## 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
## 25 4 33 17 16 16 14 10 36 25 25 4 46 16 12 3 8 6
## 111 112 113 114 115 117 118 119 120 121
## 4 8 20 8 21 12 14 36 18 51
multiclassLogLoss <- function(y_true, y_pred, epsilon=1E-15)
{
predictions <- y_pred
predictions[y_pred < epsilon] <- epsilon
predictions[y_pred > 1-epsilon] <- 1-epsilon
actual <- matrix(0, nrow(y_pred), ncol(y_pred))
n_samples <- nrow(actual)
for (i in 1:n_samples)
{
actual[i, y_true[i]] <- 1
}
prod <- actual * log(predictions) # element-wise matrix multiplication
byImage <- apply(prod,1, sum)
loss <- -1.0 * sum(byImage) / n_samples
invisible(list(loss=loss, byImage=byImage))
}
Example use of R function
result <- multiclassLogLoss(y_true, y_pred)
result$loss
[1] 3.711261
heuristicCut <- -5
hist(result$byImage, main="Image logloss distribution")
abline(v=heuristicCut, col="skyblue", lwd=3)
Problem counts by class
counts <- table(y_true[result$byImage < heuristicCut])
counts
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
13 5 15 8 4 22 5 2 2 7 4 5 1 3 15 15 11 12
20 21 22 23 24 25 27 28 29 30 31 32 33 34 35 36 37 38
3 8 15 6 11 1 15 10 22 17 31 5 13 18 34 15 1 8
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
10 9 12 8 4 1 16 12 2 13 11 6 7 20 9 6 14 13
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
9 15 14 7 1 4 11 30 14 13 11 5 1 6 20 29 3 11
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
24 10 10 5 12 14 8 40 23 8 3 5 3 14 12 15 23 22
93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
25 4 33 17 16 16 14 10 36 25 25 4 46 16 12 3 8 6
111 112 113 114 115 117 118 119 120 121
4 8 20 8 21 12 14 36 18 51
sum(counts)
[1] 1518