This notebook describes the creation of files of skimage regionprops features for Kaggle's National Data Science Bowl competition, "Predict ocean health, one plankton at a time". The Python code below creates separate feature files for the training and test sets, to be used for numerical experiments in R.
I previously posted in the online Kaggle forum about the many duplicate images, mostly in the test set.
For now, I have made no attempt to remove the duplicate images.
Many ideas in this Notebook were adapted from the online tutorials by Aaron Sander and Ehud Ben-Reuven.
import numpy as np
import os
import pandas
from matplotlib import pyplot as plt
from pylab import cm
from skimage import measure
from skimage import morphology
from skimage.io import imread
# make graphics appear inline
%matplotlib inline
import warnings
# Suppress every warning category from here on.
warnings.filterwarnings("ignore")
import datetime
# Start of the wall-clock timer; the training-set pass reports its elapsed
# time relative to this value.
start_time = datetime.datetime.now()
# Single-argument print(...) prints the same on Python 2 and Python 3.
print(start_time)
cd C:\Kaggle\2015\Plankton
The "train" and "test" directories of plankton images should be located under this directory.
# Read image, analyze, find region properties
def getImageRegionList(filename):
    """Return the skimage region properties found in one image file.

    The image is read as greyscale and thresholded at its mean intensity:
    pixels brighter than the mean become 0.0, all others 1.0. The
    thresholded mask is dilated with a 4x4 square of ones and labelled;
    the labels are then multiplied by the undilated mask, so pixels that
    only the dilation switched on end up with label 0.

    NOTE(review): later scikit-image releases spell the ``imread`` keyword
    ``as_gray`` -- confirm against the installed version.

    filename -- path of the image file to analyse.
    Returns the result of ``measure.regionprops`` on the masked labels.
    """
    grey = imread(filename, as_grey=True)

    # 1.0 marks every pixel that is not brighter than the image mean.
    foreground = np.where(grey > np.mean(grey), 0., 1.0)

    # Square structuring element used for the dilation.
    side = 4
    footprint = np.ones((side, side))
    dilated = morphology.dilation(foreground, footprint)

    # Label the dilated mask, then mask the labels with the undilated
    # threshold image and convert them to integers.
    labels = (foreground * measure.label(dilated)).astype(int)

    return measure.regionprops(labels)
# Find the region with the largest area
def getMaxArea(filename):
    """Return the region with the largest area in an image file.

    Returns None when the image yields no regions. When several regions
    share the largest area, the first one in the region list wins, because
    a later region only replaces it if its area is strictly greater.
    """
    largest = None
    for region in getImageRegionList(filename):
        if largest is None or region.area > largest.area:
            largest = region
    return largest
def getMaxAreaDict(filename):
    """Return a dictionary of features for the largest region of an image.

    When the image has no regions, the dictionary holds only
    ``{'area': 0}``. Otherwise it holds the region properties listed
    below, keyed by the column names used in the output files.
    """
    region = getMaxArea(filename)

    # No region at all: report a zero area and nothing else.
    if region is None:
        return {'area': 0}

    # Look up the sequence-valued properties once and index them below.
    centroid = region.centroid
    bbox = region.bbox
    eigvals = region.inertia_tensor_eigvals
    hu = region.moments_hu

    return {
        'label': region.label,

        # 0D: location
        'centroid_row': centroid[0],
        'centroid_col': centroid[1],

        # 1D
        'diameter_equivalent': region.equivalent_diameter,
        'length_minor_axis': region.minor_axis_length,
        'length_major_axis': region.major_axis_length,
        'ratio_eccentricity': region.eccentricity,
        'perimeter': region.perimeter,
        'orientation': region.orientation,  # ranges from -pi/2 to pi/2

        # 2D
        'area': region.area,
        'area_convex': region.convex_area,
        'area_filled': region.filled_area,
        'box_min_row': bbox[0],
        'box_max_row': bbox[2],
        'box_min_col': bbox[1],
        'box_max_col': bbox[3],
        'ratio_extent': region.extent,
        'ratio_solidity': region.solidity,
        'inertia_tensor_eigenvalue1': eigvals[0],
        'inertia_tensor_eigenvalue2': eigvals[1],

        # translation, scale and rotation invariant
        'moments_hu1': hu[0],
        'moments_hu2': hu[1],
        'moments_hu3': hu[2],
        'moments_hu4': hu[3],
        'moments_hu5': hu[4],
        'moments_hu6': hu[5],
        'moments_hu7': hu[6],

        # miscellaneous
        'euler_number': region.euler_number,
        'countCoords': len(region.coords),  # eventually grab these coordinates?
    }
# Every entry of "train" is treated as a folder holding the images of one
# class.
# NOTE: os.listdir returns entries in arbitrary order, so the class index
# written to the 'train' column follows whatever order it returns here.
directory_names = os.listdir("train")

# Loop through all the training folders.
# imagePropertiesList will contain a list of dictionaries, each dictionary
# holding the skimage region properties for one plankton image.
imagePropertiesList = []
for train_index, folder in enumerate(directory_names):
    basedir = os.path.join("train", folder)
    filenames = os.listdir(basedir)
    # Progress line: class index, folder name, number of images.
    print("%s %s %s" % (train_index, folder, len(filenames)))
    for filename in filenames:
        fullname = os.path.join(basedir, filename)
        imagePropertyDict = getMaxAreaDict(fullname)
        # Add filename and train_index for training set.
        # filenames are unique across all training and test sets
        imagePropertyDict['filename'] = filename
        imagePropertyDict['train'] = train_index
        imagePropertiesList.append(imagePropertyDict)

# Convert list of dictionaries to pandas dataframe.
df = pandas.DataFrame(imagePropertiesList)
print(df.shape)

# Save to .csv file
df.to_csv('train-properties.csv', index=False)

stop_time = datetime.datetime.now()
print(stop_time)
print("%s elapsed time [TRAIN]" % (stop_time - start_time,))

# Restart the timer so the next pass reports its own elapsed time.
start_time = stop_time
# imagePropertiesList will contain a list of dictionaries, each dictionary
# holding the skimage region properties for one plankton image.
imagePropertiesList = []
filenames = os.listdir("test")
print(len(filenames))
for index, filename in enumerate(filenames):
    fullname = os.path.join("test", filename)
    imagePropertyDict = getMaxAreaDict(fullname)
    # Add filename for test set.
    imagePropertyDict['filename'] = filename
    imagePropertiesList.append(imagePropertyDict)
    # Progress line every 5000 images (including the very first one).
    if index % 5000 == 0:
        print("%s / %s" % (index, len(filenames)))

# Convert list of dictionaries to pandas dataframe.
df = pandas.DataFrame(imagePropertiesList)
print(df.shape)

# Save to .csv file
df.to_csv('test-properties.csv', index=False)

stop_time = datetime.datetime.now()
print(stop_time)
# start_time must already be set by an earlier cell of the notebook.
print("%s elapsed time [TEST]" % (stop_time - start_time,))