Load wndchrm feature files for TRAINING cases and save to .Rdata file.
efg, 2015-02-09
# Record the start time so the total run time can be reported at the end.
time.1 <- Sys.time()
format(x = time.1, format = "%Y-%m-%d-%H%M%S")
## [1] "2015-02-15-143053"

# Root directory; each sub-directory holds the .sig files of one class.
TRAIN.BASE <- "train"
# Split the lines of one wndchrm signature file into its parts.
#
# wndchrm: character vector of file lines, e.g. from readLines(). Line 1 is
#          taken as the header and line 2 as the filename. Every remaining
#          line is split on tabs: field 1 is the value, field 2 the feature
#          name.
#
# Returns (invisibly) a list with elements header, filename, values
# (numeric) and features (character).
parseWndchrm <- function(wndchrm) {
  # The first two lines are stored as-is; no validation is done on them.
  rows <- wndchrm[-(1:2)]
  fields <- strsplit(rows, split = "\t")

  # Pull the n-th tab-separated field out of every row (NA when absent).
  nth_field <- function(n) {
    unlist(lapply(fields, function(parts) parts[n]))
  }

  parsed <- list(
    header   = wndchrm[1],
    filename = wndchrm[2],
    values   = as.numeric(nth_field(1)),
    features = nth_field(2)
  )
  invisible(parsed)
}
# Count the .sig files in every directory directly under TRAIN.BASE.
dirs <- list.dirs(path = TRAIN.BASE, full.names = FALSE, recursive = FALSE)

# vapply() copes with an empty directory list, whereas 1:length(dirs)
# would iterate over c(1, 0) and count files in the wrong places.
file_counts <- vapply(
  dirs,
  function(directory) {
    length(list.files(path = file.path(TRAIN.BASE, directory),
                      pattern = "^.*\\.sig$"))
  },
  FUN.VALUE = integer(1),
  USE.NAMES = FALSE
)

# Total number of rows needed in the feature matrix.
Nfiles <- sum(file_counts)
stopifnot(Nfiles == 30336) # Make sure all files counted
Set up the feature matrix based on the first .sig file in the first directory.
# List the .sig files of the first directory found under TRAIN.BASE.
directory <- dirs[1]
files <- list.files(
  pattern = "^.*\\.sig$",
  path = paste(TRAIN.BASE, directory, sep = "/")
)
Read the first wndchrm file; its first two rows (header and filename) are set aside for now rather than used as features.
# Parse the first listed .sig file to obtain the feature names.
wndchrm <- readLines(con = paste(TRAIN.BASE, directory, files[1], sep = "/"))
parsed <- parseWndchrm(wndchrm = wndchrm)
Allocate feature matrix:
# One row per .sig file; one column for the class ID plus one per feature.
feature.matrix <- matrix(
  data = 0,
  nrow = Nfiles,
  ncol = length(parsed$features) + 1
)
dim(feature.matrix)
## [1] 30336 2920
object.size(feature.matrix)
## 708649160 bytes

# Row names are collected while reading; the dimnames are attached after
# the read loop, once all file names are known.
columnNames <- c("class", parsed$features) # R accepts almost any string as column name
rowNames <- character(Nfiles)
rowIndex <- 1 # next row of feature.matrix to fill
# Read every .sig file, directory by directory, into the pre-allocated
# feature matrix. Column 1 receives the directory index k as the class ID;
# the remaining columns receive the parsed feature values.
for (k in seq_along(dirs)) {
  directory <- dirs[k]
  files <- list.files(path = file.path(TRAIN.BASE, directory),
                      pattern = "^.*\\.sig$")
  cat(k, directory, length(files), "\n")
  flush.console()

  # seq_along() makes an empty directory a no-op. The range
  # rowIndex:(rowIndex - 1) would count backwards and fail the assignment.
  rowNames[rowIndex + seq_along(files) - 1] <- files

  for (i in seq_along(files)) {
    wndchrm <- readLines(file.path(TRAIN.BASE, directory, files[i]))
    parsed <- parseWndchrm(wndchrm)

    # Every file must list the same features, in the same order, as the
    # file used to set up the columns. The length is checked first so that
    # recycling in `==` cannot hide a mismatch.
    stopifnot(length(parsed$features) == length(columnNames) - 1,
              all(columnNames[-1] == parsed$features)) # check column names

    feature.matrix[rowIndex, ] <- c(k, parsed$values)
    rowIndex <- rowIndex + 1
  }
}

# Every pre-allocated row must have been filled; unfilled rows would
# otherwise stay at zero unnoticed.
stopifnot(rowIndex - 1 == nrow(feature.matrix))
## 1 acantharia_protist 889
## 2 acantharia_protist_big_center 13
## 3 acantharia_protist_halo 71
## 4 amphipods 49
## 5 appendicularian_fritillaridae 16
## 6 appendicularian_s_shape 696
## 7 appendicularian_slight_curve 532
## 8 appendicularian_straight 242
## 9 artifacts 393
## 10 artifacts_edge 170
## 11 chaetognath_non_sagitta 815
## 12 chaetognath_other 1934
## 13 chaetognath_sagitta 694
## 14 chordate_type1 77
## 15 copepod_calanoid 681
## 16 copepod_calanoid_eggs 173
## 17 copepod_calanoid_eucalanus 96
## 18 copepod_calanoid_flatheads 178
## 19 copepod_calanoid_frillyAntennae 63
## 20 copepod_calanoid_large 286
## 21 copepod_calanoid_large_side_antennatucked 106
## 22 copepod_calanoid_octomoms 49
## 23 copepod_calanoid_small_longantennae 87
## 24 copepod_cyclopoid_copilia 30
## 25 copepod_cyclopoid_oithona 899
## 26 copepod_cyclopoid_oithona_eggs 1189
## 27 copepod_other 24
## 28 crustacean_other 201
## 29 ctenophore_cestid 113
## 30 ctenophore_cydippid_no_tentacles 42
## 31 ctenophore_cydippid_tentacles 53
## 32 ctenophore_lobate 38
## 33 decapods 55
## 34 detritus_blob 363
## 35 detritus_filamentous 394
## 36 detritus_other 914
## 37 diatom_chain_string 519
## 38 diatom_chain_tube 500
## 39 echinoderm_larva_pluteus_brittlestar 36
## 40 echinoderm_larva_pluteus_early 92
## 41 echinoderm_larva_pluteus_typeC 80
## 42 echinoderm_larva_pluteus_urchin 88
## 43 echinoderm_larva_seastar_bipinnaria 385
## 44 echinoderm_larva_seastar_brachiolaria 536
## 45 echinoderm_seacucumber_auricularia_larva 96
## 46 echinopluteus 27
## 47 ephyra 14
## 48 euphausiids 136
## 49 euphausiids_young 38
## 50 fecal_pellet 511
## 51 fish_larvae_deep_body 10
## 52 fish_larvae_leptocephali 31
## 53 fish_larvae_medium_body 85
## 54 fish_larvae_myctophids 114
## 55 fish_larvae_thin_body 64
## 56 fish_larvae_very_thin_body 16
## 57 heteropod 10
## 58 hydromedusae_aglaura 127
## 59 hydromedusae_bell_and_tentacles 75
## 60 hydromedusae_h15 35
## 61 hydromedusae_haliscera 229
## 62 hydromedusae_haliscera_small_sideview 9
## 63 hydromedusae_liriope 19
## 64 hydromedusae_narco_dark 23
## 65 hydromedusae_narco_young 336
## 66 hydromedusae_narcomedusae 132
## 67 hydromedusae_other 12
## 68 hydromedusae_partial_dark 190
## 69 hydromedusae_shapeA 412
## 70 hydromedusae_shapeA_sideview_small 274
## 71 hydromedusae_shapeB 150
## 72 hydromedusae_sideview_big 76
## 73 hydromedusae_solmaris 703
## 74 hydromedusae_solmundella 123
## 75 hydromedusae_typeD 43
## 76 hydromedusae_typeD_bell_and_tentacles 56
## 77 hydromedusae_typeE 14
## 78 hydromedusae_typeF 61
## 79 invertebrate_larvae_other_A 14
## 80 invertebrate_larvae_other_B 24
## 81 jellies_tentacles 141
## 82 polychaete 131
## 83 protist_dark_center 108
## 84 protist_fuzzy_olive 372
## 85 protist_noctiluca 625
## 86 protist_other 1172
## 87 protist_star 113
## 88 pteropod_butterfly 108
## 89 pteropod_theco_dev_seq 13
## 90 pteropod_triangle 65
## 91 radiolarian_chain 287
## 92 radiolarian_colony 158
## 93 shrimp-like_other 52
## 94 shrimp_caridean 49
## 95 shrimp_sergestidae 153
## 96 shrimp_zoea 174
## 97 siphonophore_calycophoran_abylidae 212
## 98 siphonophore_calycophoran_rocketship_adult 135
## 99 siphonophore_calycophoran_rocketship_young 483
## 100 siphonophore_calycophoran_sphaeronectes 179
## 101 siphonophore_calycophoran_sphaeronectes_stem 57
## 102 siphonophore_calycophoran_sphaeronectes_young 247
## 103 siphonophore_other_parts 29
## 104 siphonophore_partial 30
## 105 siphonophore_physonect 128
## 106 siphonophore_physonect_young 21
## 107 stomatopod 24
## 108 tornaria_acorn_worm_larvae 38
## 109 trichodesmium_bowtie 708
## 110 trichodesmium_multiple 54
## 111 trichodesmium_puff 1979
## 112 trichodesmium_tuft 678
## 113 trochophore_larvae 29
## 114 tunicate_doliolid 439
## 115 tunicate_doliolid_nurse 417
## 116 tunicate_partial 352
## 117 tunicate_salp 236
## 118 tunicate_salp_chains 73
## 119 unknown_blobs_and_smudges 317
## 120 unknown_sticks 175
## 121 unknown_unclassified 425
# Label rows with the collected file names and columns with the class and
# feature names.
dimnames(feature.matrix) <- list(rowNames, columnNames)
Tabulate the class IDs to check that files were assigned correctly to rows in the feature matrix.
# Count the rows per class ID, which is stored in column 1 of the matrix.
table(feature.matrix[,1])
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 889 13 71 49 16 696 532 242 393 170 815 1934 694 77 681
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 173 96 178 63 286 106 49 87 30 899 1189 24 201 113 42
## 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
## 53 38 55 363 394 914 519 500 36 92 80 88 385 536 96
## 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 27 14 136 38 511 10 31 85 114 64 16 10 127 75 35
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 229 9 19 23 336 132 12 190 412 274 150 76 703 123 43
## 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 56 14 61 14 24 141 131 108 372 625 1172 113 108 13 65
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
## 287 158 52 49 153 174 212 135 483 179 57 247 29 30 128
## 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 21 24 38 708 54 1979 678 29 439 417 352 236 73 317 175
## 121
## 425
A column is marked as constant if its maximum equals its minimum.
How is it possible to have so many constant columns?
# Per-column extremes; a column whose minimum equals its maximum holds the
# same value in every row.
min.col <- apply(X = feature.matrix, MARGIN = 2, FUN = min)
max.col <- apply(X = feature.matrix, MARGIN = 2, FUN = max)

is.constant <- min.col == max.col
N.removed <- sum(is.constant)

# Table of the constant columns by name and position.
constant.index <- which(is.constant)
minmax <- data.frame(ColName = names(constant.index),
                     ColNumber = as.integer(constant.index))
minmax
## ColName ColNumber
## 1 Chebyshev Coefficients () [28] 165
## 2 Chebyshev Coefficients () [29] 166
## 3 Chebyshev Coefficients () [30] 167
## 4 Chebyshev Coefficients () [31] 168
## 5 Pixel Intensity Statistics () [4] 383
## 6 Multiscale Histograms (Fourier ()) [0] 597
## 7 Multiscale Histograms (Fourier ()) [1] 598
## 8 Multiscale Histograms (Fourier ()) [5] 602
## 9 Multiscale Histograms (Fourier ()) [6] 603
## 10 Multiscale Histograms (Fourier ()) [10] 607
## 11 Multiscale Histograms (Fourier ()) [11] 608
## 12 Multiscale Histograms (Fourier ()) [12] 609
## 13 Multiscale Histograms (Fourier ()) [13] 610
## 14 Multiscale Histograms (Fourier ()) [18] 615
## 15 Multiscale Histograms (Fourier ()) [19] 616
## 16 Multiscale Histograms (Fourier ()) [20] 617
## 17 Multiscale Histograms (Fourier ()) [21] 618
## 18 Multiscale Histograms (Fourier ()) [22] 619
## 19 Radon Coefficients (Fourier ()) [2] 629
## 20 Multiscale Histograms (Wavelet (Fourier ())) [0] 1445
## 21 Multiscale Histograms (Fourier (Wavelet ())) [0] 1725
## 22 Tamura Textures (Fourier (Chebyshev ())) [0] 1893
## 23 Multiscale Histograms (Chebyshev (Wavelet ())) [1] 2014
## 24 Pixel Intensity Statistics (Edge ()) [3] 2358
## 25 Multiscale Histograms (Fourier (Edge ())) [0] 2573
# Save the list of constant columns for later reference.
write.csv(minmax, "plankton-train-constant-columns.csv")

# drop = FALSE keeps the result a matrix even when exactly one column is
# constant, so head() below always shows a labelled row.
constant.columns <- feature.matrix[, min.col == max.col, drop = FALSE]
head(constant.columns, 1)
## Chebyshev Coefficients () [28] Chebyshev Coefficients () [29]
## 100224-l.sig 0 0
## Chebyshev Coefficients () [30] Chebyshev Coefficients () [31]
## 100224-l.sig 0 1
## Pixel Intensity Statistics () [4]
## 100224-l.sig 255
## Multiscale Histograms (Fourier ()) [0]
## 100224-l.sig 1
## Multiscale Histograms (Fourier ()) [1]
## 100224-l.sig 0
## Multiscale Histograms (Fourier ()) [5]
## 100224-l.sig 0
## Multiscale Histograms (Fourier ()) [6]
## 100224-l.sig 0
## Multiscale Histograms (Fourier ()) [10]
## 100224-l.sig 0
## Multiscale Histograms (Fourier ()) [11]
## 100224-l.sig 0
## Multiscale Histograms (Fourier ()) [12]
## 100224-l.sig 0
## Multiscale Histograms (Fourier ()) [13]
## 100224-l.sig 0
## Multiscale Histograms (Fourier ()) [18]
## 100224-l.sig 0
## Multiscale Histograms (Fourier ()) [19]
## 100224-l.sig 0
## Multiscale Histograms (Fourier ()) [20]
## 100224-l.sig 0
## Multiscale Histograms (Fourier ()) [21]
## 100224-l.sig 0
## Multiscale Histograms (Fourier ()) [22]
## 100224-l.sig 0
## Radon Coefficients (Fourier ()) [2]
## 100224-l.sig 1
## Multiscale Histograms (Wavelet (Fourier ())) [0]
## 100224-l.sig 1
## Multiscale Histograms (Fourier (Wavelet ())) [0]
## 100224-l.sig 1
## Tamura Textures (Fourier (Chebyshev ())) [0]
## 100224-l.sig 1
## Multiscale Histograms (Chebyshev (Wavelet ())) [1]
## 100224-l.sig 1
## Pixel Intensity Statistics (Edge ()) [3]
## 100224-l.sig 0
## Multiscale Histograms (Fourier (Edge ())) [0]
## 100224-l.sig 1
# Dimensions before removal and the number of columns about to be removed.
dim(feature.matrix)
## [1] 30336 2920
N.removed
## [1] 25
# Keep only the non-constant columns. drop = FALSE guarantees the result is
# still a matrix even if a single column survives.
feature.matrix <- feature.matrix[, min.col != max.col, drop = FALSE]
dim(feature.matrix)
## [1] 30336 2895
# Time stamps printed before and after the save bracket the write.
format(x = Sys.time(), format = "%Y-%m-%d-%H%M%S")
## [1] "2015-02-15-144153"
save(feature.matrix, file = "plankton-train-wndchrm-features.Rdata")
format(x = Sys.time(), format = "%Y-%m-%d-%H%M%S")
## [1] "2015-02-15-144220"

# Report the total elapsed time since time.1 was recorded.
time.2 <- Sys.time()
elapsed <- difftime(time.2, time.1, units = "secs")
cat(sprintf("%.1f", as.numeric(elapsed)), " secs\n")
## 686.8 secs
efg @EarlGlynn
2015-02-15 1442