Load wndchrm feature files for TEST cases and save to .Rdata file.

efg, 2015-02-14

# Record the start time so the total run time can be reported at the end.
time.1 <- Sys.time()
format(time.1, format = "%Y-%m-%d-%H%M%S")
## [1] "2015-02-15-151006"
# Directory that holds the TEST-case .sig files.
TEST.BASE <- "test"

Parse wndchrm feature file

# Parse the lines of one wndchrm feature (.sig) file.
#
# Args:
#   wndchrm: character vector of lines, e.g. as returned by readLines().
#            Element 1 is taken as the header and element 2 as the filename.
#            Every remaining element is split on tabs; field 1 is the value
#            and field 2 is the feature name.
#
# Returns (invisibly):
#   list(header, filename, values, features).  `values` is numeric and
#   `features` is character, each with one entry per feature line.
parseWndchrm <- function(wndchrm)
{
  # The intent is to be strict about the header eventually; for now the
  # header and filename rows are captured as-is without validation.
  header   <- wndchrm[1]
  filename <- wndchrm[2]

  # Everything after the first two rows is "value<TAB>feature name".
  # fixed = TRUE splits on the literal tab without using the regex engine.
  splits <- strsplit(wndchrm[-1:-2], "\t", fixed = TRUE)

  # vapply() keeps the result types stable: a file with no feature rows gives
  # numeric(0) / character(0) instead of NULL.  A row with a missing field
  # still yields NA, because indexing past the end of a vector returns NA.
  values <- as.numeric(
    vapply(splits, function(fields) fields[1], character(1), USE.NAMES = FALSE)
  )
  features <- vapply(splits, function(fields) fields[2], character(1),
                     USE.NAMES = FALSE)

  invisible( list(header = header, filename = filename,
                  values = values, features = features) )
}

Create empty feature.matrix

Set up the feature matrix based on the first .sig file in the directory

# All .sig files found in TEST.BASE; each one becomes a row of the matrix.
files  <- list.files(path = TEST.BASE, pattern = "^.*\\.sig$")
Nfiles <- length(files)

# Sanity check: stop unless the expected number of files was found.
stopifnot(Nfiles == 130400)

Read the first wndchrm file. Its first two rows (header and filename) are kept separately from the feature rows and are not used further for now.

# Read and parse the first file; its feature names define the matrix columns.
wndchrm <- readLines(paste(TEST.BASE, files[1], sep = "/"))
parsed  <- parseWndchrm(wndchrm)

Allocate feature matrix:

# One row per file and one column per feature, zero-filled until populated.
feature.matrix <- matrix(0, nrow = Nfiles, ncol = length(parsed$features))
dim(feature.matrix)
## [1] 130400   2919
object.size(feature.matrix)
## 3045101000 bytes
# Label rows by file name and columns by wndchrm feature name.
rowNames    <- files
columnNames <- parsed$features # R accepts almost any string as column name
dimnames(feature.matrix) <- list(rowNames, columnNames)

Loop through files to fill feature matrix

# Fill feature.matrix one row per .sig file, in the order given by `files`.
for (i in seq_len(Nfiles))  # seq_len() iterates zero times when Nfiles is 0
{
  # Progress report on the 1st, 5001st, 10001st, ... file.
  if (i %% 5000 == 1)
  {
    cat(i, files[i], "\n")
    flush.console()
  }
  wndchrm <- readLines(paste0(TEST.BASE, "/", files[i]))
  parsed <- parseWndchrm(wndchrm)
  # Every file must list exactly the same features in the same order.  The
  # length test comes first because `==` alone would silently recycle a
  # shorter vector and could let a truncated file pass.
  stopifnot(length(parsed$features) == length(columnNames),
            all(columnNames == parsed$features))
  feature.matrix[i,] <- parsed$values
}
## 1 1-l.sig 
## 5001 105522-l.sig 
## 10001 111097-l.sig 
## 15001 11669-l.sig 
## 20001 122210-l.sig 
## 25001 127730-l.sig 
## 30001 133285-l.sig 
## 35001 138861-l.sig 
## 40001 144413-l.sig 
## 45001 149982-l.sig 
## 50001 155502-l.sig 
## 55001 16387-l.sig 
## 60001 21916-l.sig 
## 65001 27499-l.sig 
## 70001 3303-l.sig 
## 75001 38592-l.sig 
## 80001 44105-l.sig 
## 85001 49665-l.sig 
## 90001 55183-l.sig 
## 95001 60765-l.sig 
## 100001 66309-l.sig 
## 105001 71850-l.sig 
## 110001 77417-l.sig 
## 115001 82970-l.sig 
## 120001 885-l.sig 
## 125001 94005-l.sig 
## 130001 99559-l.sig

Remove constant columns.

A column is marked as constant if its maximum equals its minimum.

How is it possible to have so many constant columns?

# Per-column extremes over all rows (MARGIN = 2 applies FUN to each column).
min.col <- apply(feature.matrix, MARGIN = 2, FUN = min)
max.col <- apply(feature.matrix, MARGIN = 2, FUN = max)
N.removed <- sum(min.col == max.col)

# Name and position of every constant column, shown as a table.
minmax <- which(min.col == max.col)
minmax <- data.frame(ColName = names(minmax), ColNumber = as.integer(minmax))
minmax
##                                               ColName ColNumber
## 1                      Chebyshev Coefficients () [29]       165
## 2                      Chebyshev Coefficients () [30]       166
## 3                      Chebyshev Coefficients () [31]       167
## 4              Multiscale Histograms (Fourier ()) [0]       596
## 5              Multiscale Histograms (Fourier ()) [1]       597
## 6              Multiscale Histograms (Fourier ()) [5]       601
## 7              Multiscale Histograms (Fourier ()) [6]       602
## 8             Multiscale Histograms (Fourier ()) [10]       606
## 9             Multiscale Histograms (Fourier ()) [11]       607
## 10            Multiscale Histograms (Fourier ()) [12]       608
## 11            Multiscale Histograms (Fourier ()) [13]       609
## 12            Multiscale Histograms (Fourier ()) [18]       614
## 13            Multiscale Histograms (Fourier ()) [19]       615
## 14            Multiscale Histograms (Fourier ()) [20]       616
## 15            Multiscale Histograms (Fourier ()) [21]       617
## 16            Multiscale Histograms (Fourier ()) [22]       618
## 17   Multiscale Histograms (Fourier (Wavelet ())) [0]      1724
## 18       Tamura Textures (Fourier (Chebyshev ())) [0]      1892
## 19 Multiscale Histograms (Chebyshev (Wavelet ())) [1]      2013
## 20           Pixel Intensity Statistics (Edge ()) [3]      2357
## 21      Multiscale Histograms (Fourier (Edge ())) [0]      2572
# Save the table of constant columns for later reference.
write.csv(minmax, "plankton-test-constant-columns.csv")

# Keep a copy of the constant columns.  drop = FALSE keeps the result a
# matrix even when exactly one column is constant.
constant.columns <- feature.matrix[, min.col == max.col, drop = FALSE]
head(constant.columns, 1)
##         Chebyshev Coefficients () [29] Chebyshev Coefficients () [30]
## 1-l.sig                              0                              0
##         Chebyshev Coefficients () [31]
## 1-l.sig                              1
##         Multiscale Histograms (Fourier ()) [0]
## 1-l.sig                                      1
##         Multiscale Histograms (Fourier ()) [1]
## 1-l.sig                                      0
##         Multiscale Histograms (Fourier ()) [5]
## 1-l.sig                                      0
##         Multiscale Histograms (Fourier ()) [6]
## 1-l.sig                                      0
##         Multiscale Histograms (Fourier ()) [10]
## 1-l.sig                                       0
##         Multiscale Histograms (Fourier ()) [11]
## 1-l.sig                                       0
##         Multiscale Histograms (Fourier ()) [12]
## 1-l.sig                                       0
##         Multiscale Histograms (Fourier ()) [13]
## 1-l.sig                                       0
##         Multiscale Histograms (Fourier ()) [18]
## 1-l.sig                                       0
##         Multiscale Histograms (Fourier ()) [19]
## 1-l.sig                                       0
##         Multiscale Histograms (Fourier ()) [20]
## 1-l.sig                                       0
##         Multiscale Histograms (Fourier ()) [21]
## 1-l.sig                                       0
##         Multiscale Histograms (Fourier ()) [22]
## 1-l.sig                                       0
##         Multiscale Histograms (Fourier (Wavelet ())) [0]
## 1-l.sig                                                1
##         Tamura Textures (Fourier (Chebyshev ())) [0]
## 1-l.sig                                            1
##         Multiscale Histograms (Chebyshev (Wavelet ())) [1]
## 1-l.sig                                                  1
##         Pixel Intensity Statistics (Edge ()) [3]
## 1-l.sig                                        0
##         Multiscale Histograms (Fourier (Edge ())) [0]
## 1-l.sig                                             1
dim(feature.matrix)
## [1] 130400   2919
N.removed
## [1] 21
# Remove the constant columns.  drop = FALSE keeps feature.matrix a matrix
# (with its dimnames) even if only one non-constant column were left;
# otherwise it would collapse to a plain vector and dim() would return NULL.
feature.matrix <- feature.matrix[, min.col != max.col, drop = FALSE]
dim(feature.matrix)
## [1] 130400   2898

Save TEST feature.matrix file

# Timestamps before and after save() show how long writing the file takes.
format(Sys.time(), format = "%Y-%m-%d-%H%M%S")
## [1] "2015-02-15-163415"
save(feature.matrix, file = "plankton-test-wndchrm-features.Rdata")
format(Sys.time(), format = "%Y-%m-%d-%H%M%S")
## [1] "2015-02-15-163614"

# Total elapsed time since time.1, in seconds.
time.2 <- Sys.time()
cat(sprintf("%.1f", as.numeric(difftime(time.2, time.1, units = "secs"))), " secs\n")
## 5168.5  secs

efg @EarlGlynn

2015-02-15 1636