Load wndchrm feature files for TEST cases and save to .Rdata file.
efg, 2015-02-14
# Record the start time; it is used at the end of the script to report the
# total elapsed time.
time.1 <- Sys.time()
format(time.1, "%Y-%m-%d-%H%M%S")
## [1] "2015-02-15-151006"
# Directory that is scanned for the wndchrm .sig feature files.
TEST.BASE <- "test"
# Parse the lines of one wndchrm .sig feature file.
#
# Args:
#   wndchrm: character vector of file lines (e.g. from readLines()).
#            Line 1 is taken as the header, line 2 as the filename, and
#            every remaining line is split on tabs into a numeric value
#            (field 1) and a feature name (field 2).
#
# Returns (invisibly) a list with elements:
#   header:   the first line, unmodified
#   filename: the second line, unmodified
#   values:   numeric vector, one entry per remaining line
#   features: character vector, one entry per remaining line
parseWndchrm <- function(wndchrm)
{
  # The first two lines are kept verbatim; no validation is performed here.
  header <- wndchrm[1]
  filename <- wndchrm[2]

  body <- wndchrm[-(1:2)]
  # The separator is a literal tab, so a fixed-string split is sufficient.
  splits <- strsplit(body, "\t", fixed = TRUE)

  # vapply() keeps the result types stable: a file with no feature lines
  # yields numeric(0) / character(0) rather than NULL. A line with a
  # missing field yields NA for that field.
  values <- as.numeric(vapply(splits, function(fields) fields[1], character(1)))
  features <- vapply(splits, function(fields) fields[2], character(1))

  invisible(list(header = header, filename = filename,
                 values = values, features = features))
}
Set up the feature matrix based on the first .sig file in the directory
# Collect the names of all entries in TEST.BASE that end in ".sig".
files <- dir(path = TEST.BASE, pattern = "^.*\\.sig$")
Nfiles <- length(files)
# Sanity check: stop unless the expected number of files was found.
stopifnot(Nfiles == 130400)
Read wndchrm file but ignore first two rows for now.
# Read and parse the first .sig file; its feature names are used below to
# size and label the feature matrix.
wndchrm <- readLines(file.path(TEST.BASE, files[1]))
parsed <- parseWndchrm(wndchrm)
Allocate feature matrix:
# Preallocate a zero-filled matrix: one row per file, one column per feature.
feature.matrix <- matrix(data = 0, nrow = Nfiles, ncol = length(parsed$features))
dim(feature.matrix)
## [1] 130400 2919
object.size(feature.matrix)
## 3045101000 bytes
# Label rows with the file names and columns with the feature names
# (R accepts almost any string as a column name).
rowNames <- files
columnNames <- parsed$features
dimnames(feature.matrix) <- list(rowNames, columnNames)
# Fill feature.matrix, one row per .sig file.
# seq_len() yields an empty sequence when Nfiles is 0, whereas 1:Nfiles
# would iterate over c(1, 0).
for (i in seq_len(Nfiles))
{
  # Progress report on the first file and every 5000th file after it.
  if (i %% 5000 == 1)
  {
    cat(i, files[i], "\n")
    flush.console()
  }

  wndchrm <- readLines(file.path(TEST.BASE, files[i]))
  parsed <- parseWndchrm(wndchrm)

  # Every file must list exactly the same features in the same order.
  # identical() also rejects a different number of features or NA names,
  # which an element-wise == comparison would recycle over or propagate.
  stopifnot(identical(columnNames, parsed$features))
  feature.matrix[i, ] <- parsed$values
}
## 1 1-l.sig
## 5001 105522-l.sig
## 10001 111097-l.sig
## 15001 11669-l.sig
## 20001 122210-l.sig
## 25001 127730-l.sig
## 30001 133285-l.sig
## 35001 138861-l.sig
## 40001 144413-l.sig
## 45001 149982-l.sig
## 50001 155502-l.sig
## 55001 16387-l.sig
## 60001 21916-l.sig
## 65001 27499-l.sig
## 70001 3303-l.sig
## 75001 38592-l.sig
## 80001 44105-l.sig
## 85001 49665-l.sig
## 90001 55183-l.sig
## 95001 60765-l.sig
## 100001 66309-l.sig
## 105001 71850-l.sig
## 110001 77417-l.sig
## 115001 82970-l.sig
## 120001 885-l.sig
## 125001 94005-l.sig
## 130001 99559-l.sig
Column is marked as constant if column max equals min.
How is it possible to have so many constant columns?
# Per-column minimum and maximum over all rows (MARGIN 2 = columns).
# NOTE(review): min()/max() are called without na.rm, so a column holding
# an NA gives NA here and in the comparisons below -- confirm that no
# value in the .sig files parses to NA.
min.col <- apply(feature.matrix, 2, min)
max.col <- apply(feature.matrix, 2, max)
# Count of constant columns, i.e. columns whose minimum equals their maximum.
N.removed <- sum(min.col == max.col)
# Table of the constant columns: feature name and column position.
minmax <- which(min.col == max.col)
minmax <- data.frame(ColName=names(minmax), ColNumber=as.integer(minmax))
minmax
## ColName ColNumber
## 1 Chebyshev Coefficients () [29] 165
## 2 Chebyshev Coefficients () [30] 166
## 3 Chebyshev Coefficients () [31] 167
## 4 Multiscale Histograms (Fourier ()) [0] 596
## 5 Multiscale Histograms (Fourier ()) [1] 597
## 6 Multiscale Histograms (Fourier ()) [5] 601
## 7 Multiscale Histograms (Fourier ()) [6] 602
## 8 Multiscale Histograms (Fourier ()) [10] 606
## 9 Multiscale Histograms (Fourier ()) [11] 607
## 10 Multiscale Histograms (Fourier ()) [12] 608
## 11 Multiscale Histograms (Fourier ()) [13] 609
## 12 Multiscale Histograms (Fourier ()) [18] 614
## 13 Multiscale Histograms (Fourier ()) [19] 615
## 14 Multiscale Histograms (Fourier ()) [20] 616
## 15 Multiscale Histograms (Fourier ()) [21] 617
## 16 Multiscale Histograms (Fourier ()) [22] 618
## 17 Multiscale Histograms (Fourier (Wavelet ())) [0] 1724
## 18 Tamura Textures (Fourier (Chebyshev ())) [0] 1892
## 19 Multiscale Histograms (Chebyshev (Wavelet ())) [1] 2013
## 20 Pixel Intensity Statistics (Edge ()) [3] 2357
## 21 Multiscale Histograms (Fourier (Edge ())) [0] 2572
# Write the table of constant columns to a CSV file.
write.csv(minmax, "plankton-test-constant-columns.csv")
# Extract the constant columns and show their value in the first row.
# drop = FALSE keeps the result a matrix (with its column names) even when
# exactly one column is constant; without it the result collapses to a vector.
constant.columns <- feature.matrix[, min.col == max.col, drop = FALSE]
head(constant.columns, 1)
## Chebyshev Coefficients () [29] Chebyshev Coefficients () [30]
## 1-l.sig 0 0
## Chebyshev Coefficients () [31]
## 1-l.sig 1
## Multiscale Histograms (Fourier ()) [0]
## 1-l.sig 1
## Multiscale Histograms (Fourier ()) [1]
## 1-l.sig 0
## Multiscale Histograms (Fourier ()) [5]
## 1-l.sig 0
## Multiscale Histograms (Fourier ()) [6]
## 1-l.sig 0
## Multiscale Histograms (Fourier ()) [10]
## 1-l.sig 0
## Multiscale Histograms (Fourier ()) [11]
## 1-l.sig 0
## Multiscale Histograms (Fourier ()) [12]
## 1-l.sig 0
## Multiscale Histograms (Fourier ()) [13]
## 1-l.sig 0
## Multiscale Histograms (Fourier ()) [18]
## 1-l.sig 0
## Multiscale Histograms (Fourier ()) [19]
## 1-l.sig 0
## Multiscale Histograms (Fourier ()) [20]
## 1-l.sig 0
## Multiscale Histograms (Fourier ()) [21]
## 1-l.sig 0
## Multiscale Histograms (Fourier ()) [22]
## 1-l.sig 0
## Multiscale Histograms (Fourier (Wavelet ())) [0]
## 1-l.sig 1
## Tamura Textures (Fourier (Chebyshev ())) [0]
## 1-l.sig 1
## Multiscale Histograms (Chebyshev (Wavelet ())) [1]
## 1-l.sig 1
## Pixel Intensity Statistics (Edge ()) [3]
## 1-l.sig 0
## Multiscale Histograms (Fourier (Edge ())) [0]
## 1-l.sig 1
dim(feature.matrix)
## [1] 130400 2919
N.removed
## [1] 21
# Keep only the non-constant columns. drop = FALSE guarantees the result
# stays a matrix even if only a single column survives; otherwise it would
# collapse to a vector and dim() below would return NULL.
feature.matrix <- feature.matrix[, min.col != max.col, drop = FALSE]
dim(feature.matrix)
## [1] 130400 2898
format(Sys.time(), "%Y-%m-%d-%H%M%S")
## [1] "2015-02-15-163415"
# Save the reduced feature matrix; timestamps are printed before and after
# so the duration of the save is visible in the log.
save(feature.matrix, file="plankton-test-wndchrm-features.Rdata")
format(Sys.time(), "%Y-%m-%d-%H%M%S")
## [1] "2015-02-15-163614"
# Report total elapsed time in seconds since time.1 was recorded.
time.2 <- Sys.time()
cat(sprintf("%.1f", as.numeric(difftime(time.2, time.1, units="secs"))), " secs\n")
## 5168.5 secs
efg @EarlGlynn
2015-02-15 1636