Plankton image properties for machine learning

This Notebook describes the creation of files of skimage regionprops for Kaggle's National Data Science Bowl "Predict ocean health, one plankton at a time" competition. The Python code below creates separate files of features for the training and test sets to be used for numerical experiments in R.

I previously posted in the online Kaggle forum about the many duplicates, mostly in the test set:

  • There are 30,336 training set images in 121 defined classes stored in separate folders -- two of these images are identical.
  • There are 130,400 test set images of which only 93,502 are unique.

For now I made no attempt to remove the duplicate images.

Many ideas in this Notebook were adapted from the online tutorials by Aaron Sander and Ehud Ben-Reuven.

In [1]:
import numpy  as np
import os
import pandas

from   matplotlib import pyplot    as plt
from   pylab      import cm
from   skimage    import measure
from   skimage    import morphology
from   skimage.io import imread

# make graphics appear inline
%matplotlib inline
In [2]:
# Silence all warnings (e.g. skimage/matplotlib deprecation notices) so the
# notebook output stays readable.
# NOTE(review): blanket suppression also hides genuine problems — consider
# filtering only the specific categories that are noisy.
import warnings
warnings.filterwarnings("ignore")
In [3]:
# Record wall-clock start time; elapsed time is reported after each major
# processing stage below.
import datetime
start_time = datetime.datetime.now()
print start_time
2015-02-01 02:11:54.580000

In [4]:
cd C:\Kaggle\2015\Plankton  
C:\Kaggle\2015\Plankton

train and test directories of plankton images should be under this directory.

Python functions

In [5]:
# Read image, analyze, find region properties
def getImageRegionList(filename):
    """Read one plankton image and return its skimage region properties.

    The image is binarized against its mean intensity, dilated to join
    nearby foreground pixels, labeled into connected components, and
    measured with measure.regionprops.
    """
    # Load as greyscale.
    img = imread(filename, as_grey=True)

    # Threshold: pixels darker than the mean become foreground (1.0).
    binary = np.where(img > np.mean(img), 0., 1.0)

    # Dilate with a square structuring element so nearby fragments of the
    # same organism merge into a single labeled region.
    neighborhood = 4
    dilated = morphology.dilation(binary,
                                  np.ones((neighborhood, neighborhood)))

    # Label the dilated components, then mask the labels back onto the
    # un-dilated thresholded pixels so each region keeps its original extent.
    labels = (binary * measure.label(dilated)).astype(int)

    return measure.regionprops(labels)

# Find the region with the largest area
def getMaxArea(filename):
    """Return the region properties of the largest region in the image.

    Returns None when the image yields no labeled regions (e.g. a blank
    or near-uniform image).
    """
    region_list = getImageRegionList(filename)

    # Renamed loop variable from `property` (which shadowed the builtin)
    # and collapsed the nested comparison into a single guard condition.
    largest = None
    for region in region_list:
        if largest is None or region.area > largest.area:
            largest = region
    return largest

def getMaxAreaDict(filename):
    """Return a dict of skimage region properties for the image's largest region.

    When the image has no labeled regions, only {'area': 0} is returned;
    pandas fills the remaining columns with NaN when the per-image dicts
    are assembled into a dataframe.
    """
    # Renamed from `property`, which shadowed the builtin.
    region = getMaxArea(filename)

    if region is None:
        return {'area': 0}

    return {'label'              : region.label,

            'centroid_row'       : region.centroid[0],          # 0D:  location
            'centroid_col'       : region.centroid[1],

            'diameter_equivalent': region.equivalent_diameter,  # 1D
            'length_minor_axis'  : region.minor_axis_length,
            'length_major_axis'  : region.major_axis_length,
            'ratio_eccentricity' : region.eccentricity,
            'perimeter'          : region.perimeter,
            'orientation'        : region.orientation,          # ranges from -pi/2 to pi/2

            'area'               : region.area,                 # 2D
            'area_convex'        : region.convex_area,
            'area_filled'        : region.filled_area,
            'box_min_row'        : region.bbox[0],
            'box_max_row'        : region.bbox[2],
            'box_min_col'        : region.bbox[1],
            'box_max_col'        : region.bbox[3],
            'ratio_extent'       : region.extent,
            'ratio_solidity'     : region.solidity,

            'inertia_tensor_eigenvalue1': region.inertia_tensor_eigvals[0],
            'inertia_tensor_eigenvalue2': region.inertia_tensor_eigvals[1],

            'moments_hu1'        : region.moments_hu[0],        # translation, scale and rotation invariant
            'moments_hu2'        : region.moments_hu[1],
            'moments_hu3'        : region.moments_hu[2],
            'moments_hu4'        : region.moments_hu[3],
            'moments_hu5'        : region.moments_hu[4],
            'moments_hu6'        : region.moments_hu[5],
            'moments_hu7'        : region.moments_hu[6],

            'euler_number'       : region.euler_number,         # miscellaneous

            'countCoords'        : len(region.coords)}          # eventually grab these coordinates?

Training Images

In [6]:
directory_names = os.listdir("train")
# Loop through all the training folders

# imagePropertiesList will contain a list of dictionaries with each
# dictionary the skiimage region properties for one plankton image.
imagePropertiesList = []

for train_index in range(len(directory_names)):  
    folder = directory_names[train_index] 
    basedir = os.path.join("train", folder)   
    filenames = os.listdir(basedir)
    
    print train_index, folder, len(filenames)
    for index in range(len(filenames)):
        filename = filenames[index]
        fullname = os.path.join(basedir, filename)
        
        imagePropertyDict = getMaxAreaDict(fullname)
    
        # Add filename and train_index for training set.
        # filenames are unique across all training and test sets
        imagePropertyDict['filename'] = filename 
        imagePropertyDict['train'] = train_index
    
        imagePropertiesList.append(imagePropertyDict)
        
# Convert list of dictionaries to pandas dataframe.
df = pandas.DataFrame(imagePropertiesList)
print df.shape

# Save to .csv file
df.to_csv('train-properties.csv', index=False)  
0 acantharia_protist 889
1 acantharia_protist_big_center 13
2 acantharia_protist_halo 71
3 amphipods 49
4 appendicularian_fritillaridae 16
5 appendicularian_slight_curve 532
6 appendicularian_straight 242
7 appendicularian_s_shape 696
8 artifacts 393
9 artifacts_edge 170
10 chaetognath_non_sagitta 815
11 chaetognath_other 1934
12 chaetognath_sagitta 694
13 chordate_type1 77
14 copepod_calanoid 681
15 copepod_calanoid_eggs 173
16 copepod_calanoid_eucalanus 96
17 copepod_calanoid_flatheads 178
18 copepod_calanoid_frillyAntennae 63
19 copepod_calanoid_large 286
20 copepod_calanoid_large_side_antennatucked 106
21 copepod_calanoid_octomoms 49
22 copepod_calanoid_small_longantennae 87
23 copepod_cyclopoid_copilia 30
24 copepod_cyclopoid_oithona 899
25 copepod_cyclopoid_oithona_eggs 1189
26 copepod_other 24
27 crustacean_other 201
28 ctenophore_cestid 113
29 ctenophore_cydippid_no_tentacles 42
30 ctenophore_cydippid_tentacles 53
31 ctenophore_lobate 38
32 decapods 55
33 detritus_blob 363
34 detritus_filamentous 394
35 detritus_other 914
36 diatom_chain_string 519
37 diatom_chain_tube 500
38 echinoderm_larva_pluteus_brittlestar 36
39 echinoderm_larva_pluteus_early 92
40 echinoderm_larva_pluteus_typeC 80
41 echinoderm_larva_pluteus_urchin 88
42 echinoderm_larva_seastar_bipinnaria 385
43 echinoderm_larva_seastar_brachiolaria 536
44 echinoderm_seacucumber_auricularia_larva 96
45 echinopluteus 27
46 ephyra 14
47 euphausiids 136
48 euphausiids_young 38
49 fecal_pellet 511
50 fish_larvae_deep_body 10
51 fish_larvae_leptocephali 31
52 fish_larvae_medium_body 85
53 fish_larvae_myctophids 114
54 fish_larvae_thin_body 64
55 fish_larvae_very_thin_body 16
56 heteropod 10
57 hydromedusae_aglaura 127
58 hydromedusae_bell_and_tentacles 75
59 hydromedusae_h15 35
60 hydromedusae_haliscera 229
61 hydromedusae_haliscera_small_sideview 9
62 hydromedusae_liriope 19
63 hydromedusae_narcomedusae 132
64 hydromedusae_narco_dark 23
65 hydromedusae_narco_young 336
66 hydromedusae_other 12
67 hydromedusae_partial_dark 190
68 hydromedusae_shapeA 412
69 hydromedusae_shapeA_sideview_small 274
70 hydromedusae_shapeB 150
71 hydromedusae_sideview_big 76
72 hydromedusae_solmaris 703
73 hydromedusae_solmundella 123
74 hydromedusae_typeD 43
75 hydromedusae_typeD_bell_and_tentacles 56
76 hydromedusae_typeE 14
77 hydromedusae_typeF 61
78 invertebrate_larvae_other_A 14
79 invertebrate_larvae_other_B 24
80 jellies_tentacles 141
81 polychaete 131
82 protist_dark_center 108
83 protist_fuzzy_olive 372
84 protist_noctiluca 625
85 protist_other 1172
86 protist_star 113
87 pteropod_butterfly 108
88 pteropod_theco_dev_seq 13
89 pteropod_triangle 65
90 radiolarian_chain 287
91 radiolarian_colony 158
92 shrimp-like_other 52
93 shrimp_caridean 49
94 shrimp_sergestidae 153
95 shrimp_zoea 174
96 siphonophore_calycophoran_abylidae 212
97 siphonophore_calycophoran_rocketship_adult 135
98 siphonophore_calycophoran_rocketship_young 483
99 siphonophore_calycophoran_sphaeronectes 179
100 siphonophore_calycophoran_sphaeronectes_stem 57
101 siphonophore_calycophoran_sphaeronectes_young 247
102 siphonophore_other_parts 29
103 siphonophore_partial 30
104 siphonophore_physonect 128
105 siphonophore_physonect_young 21
106 stomatopod 24
107 tornaria_acorn_worm_larvae 38
108 trichodesmium_bowtie 708
109 trichodesmium_multiple 54
110 trichodesmium_puff 1979
111 trichodesmium_tuft 678
112 trochophore_larvae 29
113 tunicate_doliolid 439
114 tunicate_doliolid_nurse 417
115 tunicate_partial 352
116 tunicate_salp 236
117 tunicate_salp_chains 73
118 unknown_blobs_and_smudges 317
119 unknown_sticks 175
120 unknown_unclassified 425
(30336, 31)

In [7]:
# Report elapsed wall-clock time for the training-set pass, then reset the
# timer so the test-set pass is measured separately.
stop_time = datetime.datetime.now()
print stop_time
print (stop_time - start_time), "elapsed time [TRAIN]"
start_time = stop_time
2015-02-01 02:18:02.316000
0:06:07.736000 elapsed time [TRAIN]

Test Images

In [8]:
# imagePropertiesList will contain a list of dictionaries with each
# dictionary the skiimage region properties for one plankton image.
imagePropertiesList = []

filenames = os.listdir("test")    
print len(filenames)

for index in range(len(filenames)):
    filename = filenames[index]
    fullname = os.path.join("test", filename)
        
    imagePropertyDict = getMaxAreaDict(fullname)
    
    # Add filename for test set.
    imagePropertyDict['filename'] = filename 
    
    imagePropertiesList.append(imagePropertyDict)
    
    if index % 5000 == 0:
        print index, "/", len(filenames)
        
# Convert list of dictionaries to pandas dataframe.
df = pandas.DataFrame(imagePropertiesList)
print df.shape

# Save to .csv file
df.to_csv('test-properties.csv', index=False)  
130400
0 / 130400
5000 / 130400
10000 / 130400
15000 / 130400
20000 / 130400
25000 / 130400
30000 / 130400
35000 / 130400
40000 / 130400
45000 / 130400
50000 / 130400
55000 / 130400
60000 / 130400
65000 / 130400
70000 / 130400
75000 / 130400
80000 / 130400
85000 / 130400
90000 / 130400
95000 / 130400
100000 / 130400
105000 / 130400
110000 / 130400
115000 / 130400
120000 / 130400
125000 / 130400
130000 / 130400
(130400, 30)

In [9]:
# Report elapsed wall-clock time for the test-set pass.
stop_time = datetime.datetime.now()
print stop_time
print (stop_time - start_time), "elapsed time [TEST]"
2015-02-01 02:59:59.740000
0:41:57.424000 elapsed time [TEST]