Load wndchrm feature files for TRAINING cases and save to .Rdata file.

efg, 2015-02-09

# Record the start time; it is used at the end of the script to report
# the total elapsed time.
time.1 <- Sys.time()
format(time.1, "%Y-%m-%d-%H%M%S")
## [1] "2015-02-15-143053"
# Base directory whose immediate subdirectories are scanned for .sig files.
TRAIN.BASE   <- "train"

Parse wndchrm feature file

# Parse the lines of one wndchrm feature (.sig) file.
#
# Args:
#   wndchrm: character vector of the file's lines (e.g. from readLines()).
#            Line 1 is taken as the header and line 2 as the filename; each
#            remaining line is split on tabs into value (field 1) and
#            feature name (field 2).
#
# Returns (invisibly) a list with elements:
#   header:   line 1, passed through unchanged
#   filename: line 2, passed through unchanged
#   values:   numeric vector built from field 1 of each remaining line
#   features: character vector built from field 2 of each remaining line
#             (NA where a line has no second field)
parseWndchrm <- function(wndchrm)
{
  # Header and filename are passed through as-is; no validation is done yet.
  header   <- wndchrm[1]
  filename <- wndchrm[2]

  # Everything after the first two lines is "value<TAB>feature name".
  splits <- strsplit(wndchrm[-(1:2)], "\t")

  # vapply keeps the result types stable: with no feature lines this yields
  # numeric(0) / character(0) rather than the NULL that unlist(lapply()) gives.
  values   <- as.numeric(vapply(splits, "[", character(1), 1))
  features <- vapply(splits, "[", character(1), 2)

  invisible(list(header = header, filename = filename,
                 values = values, features = features))
}

Count files in all TRAIN.BASE directories

# Immediate subdirectories of TRAIN.BASE (names only, no recursion).
dirs <- list.dirs(path = TRAIN.BASE, full.names = FALSE, recursive = FALSE)

# Number of .sig files in each subdirectory. vapply() replaces the
# 1:length(dirs) loop, which would iterate over c(1, 0) when dirs is empty;
# here an empty dirs simply yields integer(0).
file_counts <- vapply(
  dirs,
  function(directory) {
    length(list.files(path = file.path(TRAIN.BASE, directory),
                      pattern = "^.*\\.sig$"))
  },
  integer(1),
  USE.NAMES = FALSE
)

Nfiles <- sum(file_counts)
stopifnot(Nfiles == 30336)  # Make sure all files counted

Create empty feature.matrix

Setup feature matrix based on first .sig file in first directory

# List the .sig files in the first subdirectory; the first of them serves as
# the template for the feature matrix columns.
directory <- dirs[1]
files <- list.files(
  path = file.path(TRAIN.BASE, directory),
  pattern = "^.*\\.sig$"
)

Read the first wndchrm file; its first two rows (header and filename) are split off by the parser and not used for now.

# Read and parse the first .sig file of the first subdirectory.
wndchrm <- readLines(file.path(TRAIN.BASE, directory, files[1]))
parsed <- parseWndchrm(wndchrm)

Allocate feature matrix:

# Preallocate one row per .sig file. Column 1 holds the class index; the
# remaining columns hold one value per feature found in the file parsed above.
feature.matrix <- matrix(0, nrow=Nfiles, ncol=1+length(parsed$features))
dim(feature.matrix)
## [1] 30336  2920
object.size(feature.matrix)
## 708649160 bytes
# Row names start out empty and are filled with file names in the read loop.
rowNames    <- rep("", Nfiles)
columnNames <- c("class", parsed$features) # R accepts almost any string as column name
# set dimnames below after read loop when filenames are known

Loop through directories and files to fill feature matrix

# Fill feature.matrix one row per .sig file, walking the subdirectories in
# order. rowIndex is the next row to write; k (the directory's position in
# dirs) is stored in column 1 as the class index.
rowIndex <- 1
for (k in seq_along(dirs))
{
  directory <- dirs[k]
  files <- list.files(path = file.path(TRAIN.BASE, directory),
                      pattern = "^.*\\.sig$")

  # Progress line: class index, directory name, number of files.
  cat(k, directory, length(files), "\n")
  flush.console()

  # seq_along() makes both the row-name assignment and the inner loop no-ops
  # for a directory with no .sig files; 1:length(files) would instead try to
  # read files[1] (NA) and the row-name assignment would fail.
  rowNames[rowIndex + seq_along(files) - 1] <- files

  for (i in seq_along(files))
  {
    wndchrm <- readLines(file.path(TRAIN.BASE, directory, files[i]))
    parsed <- parseWndchrm(wndchrm)

    # Every file must list exactly the same features in the same order;
    # the length check stops recycling from masking a mismatch.
    stopifnot(length(parsed$features) == length(columnNames) - 1,
              all(columnNames[-1] == parsed$features))

    feature.matrix[rowIndex, ] <- c(k, parsed$values)
    rowIndex <- rowIndex + 1
  }
}
## 1 acantharia_protist 889 
## 2 acantharia_protist_big_center 13 
## 3 acantharia_protist_halo 71 
## 4 amphipods 49 
## 5 appendicularian_fritillaridae 16 
## 6 appendicularian_s_shape 696 
## 7 appendicularian_slight_curve 532 
## 8 appendicularian_straight 242 
## 9 artifacts 393 
## 10 artifacts_edge 170 
## 11 chaetognath_non_sagitta 815 
## 12 chaetognath_other 1934 
## 13 chaetognath_sagitta 694 
## 14 chordate_type1 77 
## 15 copepod_calanoid 681 
## 16 copepod_calanoid_eggs 173 
## 17 copepod_calanoid_eucalanus 96 
## 18 copepod_calanoid_flatheads 178 
## 19 copepod_calanoid_frillyAntennae 63 
## 20 copepod_calanoid_large 286 
## 21 copepod_calanoid_large_side_antennatucked 106 
## 22 copepod_calanoid_octomoms 49 
## 23 copepod_calanoid_small_longantennae 87 
## 24 copepod_cyclopoid_copilia 30 
## 25 copepod_cyclopoid_oithona 899 
## 26 copepod_cyclopoid_oithona_eggs 1189 
## 27 copepod_other 24 
## 28 crustacean_other 201 
## 29 ctenophore_cestid 113 
## 30 ctenophore_cydippid_no_tentacles 42 
## 31 ctenophore_cydippid_tentacles 53 
## 32 ctenophore_lobate 38 
## 33 decapods 55 
## 34 detritus_blob 363 
## 35 detritus_filamentous 394 
## 36 detritus_other 914 
## 37 diatom_chain_string 519 
## 38 diatom_chain_tube 500 
## 39 echinoderm_larva_pluteus_brittlestar 36 
## 40 echinoderm_larva_pluteus_early 92 
## 41 echinoderm_larva_pluteus_typeC 80 
## 42 echinoderm_larva_pluteus_urchin 88 
## 43 echinoderm_larva_seastar_bipinnaria 385 
## 44 echinoderm_larva_seastar_brachiolaria 536 
## 45 echinoderm_seacucumber_auricularia_larva 96 
## 46 echinopluteus 27 
## 47 ephyra 14 
## 48 euphausiids 136 
## 49 euphausiids_young 38 
## 50 fecal_pellet 511 
## 51 fish_larvae_deep_body 10 
## 52 fish_larvae_leptocephali 31 
## 53 fish_larvae_medium_body 85 
## 54 fish_larvae_myctophids 114 
## 55 fish_larvae_thin_body 64 
## 56 fish_larvae_very_thin_body 16 
## 57 heteropod 10 
## 58 hydromedusae_aglaura 127 
## 59 hydromedusae_bell_and_tentacles 75 
## 60 hydromedusae_h15 35 
## 61 hydromedusae_haliscera 229 
## 62 hydromedusae_haliscera_small_sideview 9 
## 63 hydromedusae_liriope 19 
## 64 hydromedusae_narco_dark 23 
## 65 hydromedusae_narco_young 336 
## 66 hydromedusae_narcomedusae 132 
## 67 hydromedusae_other 12 
## 68 hydromedusae_partial_dark 190 
## 69 hydromedusae_shapeA 412 
## 70 hydromedusae_shapeA_sideview_small 274 
## 71 hydromedusae_shapeB 150 
## 72 hydromedusae_sideview_big 76 
## 73 hydromedusae_solmaris 703 
## 74 hydromedusae_solmundella 123 
## 75 hydromedusae_typeD 43 
## 76 hydromedusae_typeD_bell_and_tentacles 56 
## 77 hydromedusae_typeE 14 
## 78 hydromedusae_typeF 61 
## 79 invertebrate_larvae_other_A 14 
## 80 invertebrate_larvae_other_B 24 
## 81 jellies_tentacles 141 
## 82 polychaete 131 
## 83 protist_dark_center 108 
## 84 protist_fuzzy_olive 372 
## 85 protist_noctiluca 625 
## 86 protist_other 1172 
## 87 protist_star 113 
## 88 pteropod_butterfly 108 
## 89 pteropod_theco_dev_seq 13 
## 90 pteropod_triangle 65 
## 91 radiolarian_chain 287 
## 92 radiolarian_colony 158 
## 93 shrimp-like_other 52 
## 94 shrimp_caridean 49 
## 95 shrimp_sergestidae 153 
## 96 shrimp_zoea 174 
## 97 siphonophore_calycophoran_abylidae 212 
## 98 siphonophore_calycophoran_rocketship_adult 135 
## 99 siphonophore_calycophoran_rocketship_young 483 
## 100 siphonophore_calycophoran_sphaeronectes 179 
## 101 siphonophore_calycophoran_sphaeronectes_stem 57 
## 102 siphonophore_calycophoran_sphaeronectes_young 247 
## 103 siphonophore_other_parts 29 
## 104 siphonophore_partial 30 
## 105 siphonophore_physonect 128 
## 106 siphonophore_physonect_young 21 
## 107 stomatopod 24 
## 108 tornaria_acorn_worm_larvae 38 
## 109 trichodesmium_bowtie 708 
## 110 trichodesmium_multiple 54 
## 111 trichodesmium_puff 1979 
## 112 trichodesmium_tuft 678 
## 113 trochophore_larvae 29 
## 114 tunicate_doliolid 439 
## 115 tunicate_doliolid_nurse 417 
## 116 tunicate_partial 352 
## 117 tunicate_salp 236 
## 118 tunicate_salp_chains 73 
## 119 unknown_blobs_and_smudges 317 
## 120 unknown_sticks 175 
## 121 unknown_unclassified 425
# Label rows with the .sig file names and columns with "class" + feature names.
dimnames(feature.matrix) <- list(rowNames, columnNames)

Compute a table of class IDs to check that files were assigned correctly to rows in the feature matrix.

# Tabulate column 1 (the class index written by the read loop); the counts
# should match the per-directory file counts printed by that loop.
table(feature.matrix[,1])
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##  889   13   71   49   16  696  532  242  393  170  815 1934  694   77  681 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30 
##  173   96  178   63  286  106   49   87   30  899 1189   24  201  113   42 
##   31   32   33   34   35   36   37   38   39   40   41   42   43   44   45 
##   53   38   55  363  394  914  519  500   36   92   80   88  385  536   96 
##   46   47   48   49   50   51   52   53   54   55   56   57   58   59   60 
##   27   14  136   38  511   10   31   85  114   64   16   10  127   75   35 
##   61   62   63   64   65   66   67   68   69   70   71   72   73   74   75 
##  229    9   19   23  336  132   12  190  412  274  150   76  703  123   43 
##   76   77   78   79   80   81   82   83   84   85   86   87   88   89   90 
##   56   14   61   14   24  141  131  108  372  625 1172  113  108   13   65 
##   91   92   93   94   95   96   97   98   99  100  101  102  103  104  105 
##  287  158   52   49  153  174  212  135  483  179   57  247   29   30  128 
##  106  107  108  109  110  111  112  113  114  115  116  117  118  119  120 
##   21   24   38  708   54 1979  678   29  439  417  352  236   73  317  175 
##  121 
##  425

Remove constant columns.

A column is marked as constant if its maximum equals its minimum.

How is it possible to have so many constant columns?

# Per-column minimum and maximum across all rows.
min.col <- apply(feature.matrix, MARGIN = 2, FUN = min)
max.col <- apply(feature.matrix, MARGIN = 2, FUN = max)

# A column is constant when its minimum equals its maximum.
N.removed <- sum(min.col == max.col)

# Name and position of each constant column, printed as a data frame.
minmax <- which(min.col == max.col)
minmax <- data.frame(
  ColName   = names(minmax),
  ColNumber = as.integer(minmax)
)
minmax
##                                               ColName ColNumber
## 1                      Chebyshev Coefficients () [28]       165
## 2                      Chebyshev Coefficients () [29]       166
## 3                      Chebyshev Coefficients () [30]       167
## 4                      Chebyshev Coefficients () [31]       168
## 5                   Pixel Intensity Statistics () [4]       383
## 6              Multiscale Histograms (Fourier ()) [0]       597
## 7              Multiscale Histograms (Fourier ()) [1]       598
## 8              Multiscale Histograms (Fourier ()) [5]       602
## 9              Multiscale Histograms (Fourier ()) [6]       603
## 10            Multiscale Histograms (Fourier ()) [10]       607
## 11            Multiscale Histograms (Fourier ()) [11]       608
## 12            Multiscale Histograms (Fourier ()) [12]       609
## 13            Multiscale Histograms (Fourier ()) [13]       610
## 14            Multiscale Histograms (Fourier ()) [18]       615
## 15            Multiscale Histograms (Fourier ()) [19]       616
## 16            Multiscale Histograms (Fourier ()) [20]       617
## 17            Multiscale Histograms (Fourier ()) [21]       618
## 18            Multiscale Histograms (Fourier ()) [22]       619
## 19                Radon Coefficients (Fourier ()) [2]       629
## 20   Multiscale Histograms (Wavelet (Fourier ())) [0]      1445
## 21   Multiscale Histograms (Fourier (Wavelet ())) [0]      1725
## 22       Tamura Textures (Fourier (Chebyshev ())) [0]      1893
## 23 Multiscale Histograms (Chebyshev (Wavelet ())) [1]      2014
## 24           Pixel Intensity Statistics (Edge ()) [3]      2358
## 25      Multiscale Histograms (Fourier (Edge ())) [0]      2573
# Save the list of constant columns (name and column number) for reference.
write.csv(minmax, "plankton-train-constant-columns.csv")

# Show the constant value of each such column using the first row.
# drop = FALSE keeps the result a matrix (with its column name) even when
# exactly one column is constant.
constant.columns <- feature.matrix[, min.col == max.col, drop = FALSE]
head(constant.columns, 1)
##              Chebyshev Coefficients () [28] Chebyshev Coefficients () [29]
## 100224-l.sig                              0                              0
##              Chebyshev Coefficients () [30] Chebyshev Coefficients () [31]
## 100224-l.sig                              0                              1
##              Pixel Intensity Statistics () [4]
## 100224-l.sig                               255
##              Multiscale Histograms (Fourier ()) [0]
## 100224-l.sig                                      1
##              Multiscale Histograms (Fourier ()) [1]
## 100224-l.sig                                      0
##              Multiscale Histograms (Fourier ()) [5]
## 100224-l.sig                                      0
##              Multiscale Histograms (Fourier ()) [6]
## 100224-l.sig                                      0
##              Multiscale Histograms (Fourier ()) [10]
## 100224-l.sig                                       0
##              Multiscale Histograms (Fourier ()) [11]
## 100224-l.sig                                       0
##              Multiscale Histograms (Fourier ()) [12]
## 100224-l.sig                                       0
##              Multiscale Histograms (Fourier ()) [13]
## 100224-l.sig                                       0
##              Multiscale Histograms (Fourier ()) [18]
## 100224-l.sig                                       0
##              Multiscale Histograms (Fourier ()) [19]
## 100224-l.sig                                       0
##              Multiscale Histograms (Fourier ()) [20]
## 100224-l.sig                                       0
##              Multiscale Histograms (Fourier ()) [21]
## 100224-l.sig                                       0
##              Multiscale Histograms (Fourier ()) [22]
## 100224-l.sig                                       0
##              Radon Coefficients (Fourier ()) [2]
## 100224-l.sig                                   1
##              Multiscale Histograms (Wavelet (Fourier ())) [0]
## 100224-l.sig                                                1
##              Multiscale Histograms (Fourier (Wavelet ())) [0]
## 100224-l.sig                                                1
##              Tamura Textures (Fourier (Chebyshev ())) [0]
## 100224-l.sig                                            1
##              Multiscale Histograms (Chebyshev (Wavelet ())) [1]
## 100224-l.sig                                                  1
##              Pixel Intensity Statistics (Edge ()) [3]
## 100224-l.sig                                        0
##              Multiscale Histograms (Fourier (Edge ())) [0]
## 100224-l.sig                                             1
dim(feature.matrix)
## [1] 30336  2920
N.removed
## [1] 25
# Keep only the columns whose values vary. drop = FALSE guarantees the result
# stays a matrix (so dim() and dimnames remain valid) even if only a single
# column survives.
feature.matrix <- feature.matrix[, min.col != max.col, drop = FALSE]
dim(feature.matrix)
## [1] 30336  2895

Save TRAINING feature.matrix file

# Print a timestamp before and after save() to show how long writing takes.
format(Sys.time(), "%Y-%m-%d-%H%M%S")
## [1] "2015-02-15-144153"
save(feature.matrix, file="plankton-train-wndchrm-features.Rdata")
format(Sys.time(), "%Y-%m-%d-%H%M%S")
## [1] "2015-02-15-144220"

# Report total elapsed seconds since time.1 was recorded at the top of the
# script. cat() separates its arguments with a space, so together with the
# leading space in " secs\n" the output has two spaces before "secs".
time.2 <- Sys.time()
cat(sprintf("%.1f", as.numeric(difftime(time.2, time.1, units="secs"))), " secs\n")
## 686.8  secs

efg @EarlGlynn

2015-02-15 1442