This notebook describes the creation of files of skimage regionprops features for Kaggle's National Data Science Bowl "Predict ocean health, one plankton at a time" competition. The Python code below creates separate feature files for the training and test sets, to be used for numerical experiments in R.
I previously posted in the online Kaggle forum about the many duplicates, mostly in the test set:
For now, I have made no attempt to remove the duplicate images.
Many ideas in this Notebook were adapted from the online tutorials by Aaron Sander and Ehud Ben-Reuven.
import numpy  as np
import os
import pandas
from   matplotlib import pyplot    as plt
from   pylab      import cm
from   skimage    import measure
from   skimage    import morphology
from   skimage.io import imread
# Render matplotlib figures inline in the notebook (IPython magic, not plain Python)
%matplotlib inline
import warnings
# Suppress every warning category for the rest of the session
warnings.filterwarnings("ignore")
import datetime
# Wall-clock start; used further down to report the elapsed time of the training pass
start_time = datetime.datetime.now()
print start_time
# Change the working directory (IPython automagic); the relative "train" and
# "test" paths used further down are resolved against it
cd C:\Kaggle\2015\Plankton  
The train and test directories of plankton images should be located under this directory.
# Segment one image file and measure the properties of its labelled regions
def getImageRegionList(filename):
    """Return the skimage region properties found in the image at *filename*.

    The image is read with ``as_grey=True``, thresholded against its own mean
    intensity, dilated, labelled, and the labels are then restricted to the
    thresholded pixels before ``measure.regionprops`` is applied.

    filename -- path of the image file to read.

    Returns whatever ``measure.regionprops`` returns for the masked label
    image (one entry per labelled region).
    """
    grey = imread(filename, as_grey=True)

    # Binary mask: 0.0 where a pixel is brighter than the image mean,
    # 1.0 everywhere else.
    mask = np.where(grey > np.mean(grey), 0., 1.0)

    # Dilate the mask with a 4x4 block of ones before labelling.
    footprint = np.ones((4, 4))
    dilated = morphology.dilation(mask, footprint)

    # Label the dilated mask, then zero out every label that lies outside
    # the original (undilated) mask.
    labels = measure.label(dilated)
    masked_labels = (mask * labels).astype(int)

    return measure.regionprops(masked_labels)
# Pick the largest-area region of an image
def getMaxArea(filename):
    """Return the region with the greatest ``area`` in the image at *filename*.

    filename -- path of the image file, passed on to getImageRegionList.

    Returns the region-properties object with the largest ``area``; when
    several regions share the largest area the first one in the list wins.
    Returns None when the image yields no regions at all.
    """
    regions = getImageRegionList(filename)

    # Nothing was labelled: there is no largest region to report.
    if not regions:
        return None

    # max() keeps the first maximal element, matching a strict ">" scan.
    return max(regions, key=lambda region: region.area)
def getMaxAreaDict(filename):
    """Return a flat dict of features for the largest region of an image.

    filename -- path of the image file, passed on to getMaxArea.

    Returns a dict mapping feature names to the corresponding skimage region
    properties of the largest region. When getMaxArea finds no region, the
    dict contains only ``{'area': 0}``.
    """
    region = getMaxArea(filename)

    # No region at all: report a zero area and nothing else.
    if region is None:
        return {'area': 0}

    # Fetch the multi-valued properties once and index into them below.
    centroid = region.centroid
    bbox = region.bbox
    eigvals = region.inertia_tensor_eigvals
    hu = region.moments_hu

    return {
        'label': region.label,

        # 0D: location
        'centroid_row': centroid[0],
        'centroid_col': centroid[1],

        # 1D
        'diameter_equivalent': region.equivalent_diameter,
        'length_minor_axis': region.minor_axis_length,
        'length_major_axis': region.major_axis_length,
        'ratio_eccentricity': region.eccentricity,
        'perimeter': region.perimeter,
        'orientation': region.orientation,  # ranges from -pi/2 to pi/2

        # 2D
        'area': region.area,
        'area_convex': region.convex_area,
        'area_filled': region.filled_area,
        'box_min_row': bbox[0],
        'box_max_row': bbox[2],
        'box_min_col': bbox[1],
        'box_max_col': bbox[3],
        'ratio_extent': region.extent,
        'ratio_solidity': region.solidity,

        'inertia_tensor_eigenvalue1': eigvals[0],
        'inertia_tensor_eigenvalue2': eigvals[1],

        # Hu moments: translation, scale and rotation invariant
        'moments_hu1': hu[0],
        'moments_hu2': hu[1],
        'moments_hu3': hu[2],
        'moments_hu4': hu[3],
        'moments_hu5': hu[4],
        'moments_hu6': hu[5],
        'moments_hu7': hu[6],

        # miscellaneous
        'euler_number': region.euler_number,

        # Only the number of coordinates is kept, not the coordinates themselves.
        'countCoords': len(region.coords),
    }
# Build the training feature table: one row of region properties per image.
# NOTE: os.listdir returns entries in arbitrary order, so the numeric index
# assigned to each class folder below follows that listing order.
directory_names = os.listdir("train")

# Each element is the dict of skimage region properties for one plankton image.
imagePropertiesList = []
for train_index, folder in enumerate(directory_names):
    basedir = os.path.join("train", folder)
    filenames = os.listdir(basedir)

    # Progress line: class index, folder name, number of files in the folder.
    print("%s %s %s" % (train_index, folder, len(filenames)))

    for filename in filenames:
        fullname = os.path.join(basedir, filename)
        imagePropertyDict = getMaxAreaDict(fullname)

        # Tag the row with its file name and the index of its class folder.
        # (Original author's note: filenames are unique across all training
        # and test sets.)
        imagePropertyDict['filename'] = filename
        imagePropertyDict['train'] = train_index

        imagePropertiesList.append(imagePropertyDict)

# Turn the list of dicts into a pandas dataframe and write it out as .csv.
df = pandas.DataFrame(imagePropertiesList)
print(df.shape)
df.to_csv('train-properties.csv', index=False)

# Report the elapsed time and restart the clock for the next pass.
stop_time = datetime.datetime.now()
print(stop_time)
print("%s %s" % (stop_time - start_time, "elapsed time [TRAIN]"))
start_time = stop_time
# Build the test feature table: one row of region properties per image.
filenames = os.listdir("test")
print(len(filenames))

# Each element is the dict of skimage region properties for one plankton image.
imagePropertiesList = []
for index, filename in enumerate(filenames):
    fullname = os.path.join("test", filename)
    imagePropertyDict = getMaxAreaDict(fullname)

    # Test rows carry only the file name (there is no class index here).
    imagePropertyDict['filename'] = filename

    imagePropertiesList.append(imagePropertyDict)

    # Progress line every 5000 images.
    if index % 5000 == 0:
        print("%s %s %s" % (index, "/", len(filenames)))

# Turn the list of dicts into a pandas dataframe and write it out as .csv.
df = pandas.DataFrame(imagePropertiesList)
print(df.shape)
df.to_csv('test-properties.csv', index=False)

# Report the elapsed time of the test pass.
stop_time = datetime.datetime.now()
print(stop_time)
print("%s %s" % (stop_time - start_time, "elapsed time [TEST]"))