AI for Medical Diagnosis - Lab 1
Notes on AI for Medical Diagnosis – Lab 1
Loading and Cleaning up Data
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
sns.set()
# Load the training labels into a dataframe
train_df = pd.read_csv('/content/train-small.csv')
# Report the dataframe dimensions (rows x columns)
n_rows, n_cols = train_df.shape
print(f'There are {n_rows} rows and {n_cols} columns in this dataframe')
# Preview the first 5 records
train_df.head()
Image | Atelectasis | Cardiomegaly | Consolidation | Edema | Effusion | Emphysema | Fibrosis | Hernia | Infiltration | Mass | Nodule | PatientId | Pleural_Thickening | Pneumonia | Pneumothorax | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 00008270_015.png | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8270 | 0 | 0 | 0 |
1 | 00029855_001.png | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 29855 | 0 | 0 | 0 |
2 | 00001297_000.png | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1297 | 1 | 0 | 0 |
3 | 00012359_002.png | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12359 | 0 | 0 | 0 |
4 | 00017951_001.png | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 17951 | 0 | 0 | 0 |
# Summarise the dataframe: per-column non-null counts, dtypes and memory usage
train_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Image 1000 non-null object 1 Atelectasis 1000 non-null int64 2 Cardiomegaly 1000 non-null int64 3 Consolidation 1000 non-null int64 4 Edema 1000 non-null int64 5 Effusion 1000 non-null int64 6 Emphysema 1000 non-null int64 7 Fibrosis 1000 non-null int64 8 Hernia 1000 non-null int64 9 Infiltration 1000 non-null int64 10 Mass 1000 non-null int64 11 Nodule 1000 non-null int64 12 PatientId 1000 non-null int64 13 Pleural_Thickening 1000 non-null int64 14 Pneumonia 1000 non-null int64 15 Pneumothorax 1000 non-null int64 dtypes: int64(15), object(1) memory usage: 125.1+ KB
Check for unique IDs
“PatientId” holds an ID number for each patient. In this dataset we need to determine whether each image corresponds to a unique patient. Duplicate IDs indicate that a single patient has multiple images.
# Compare the number of records with the number of distinct patient IDs;
# fewer distinct IDs than records means some patients have several images.
total_records = train_df['PatientId'].count()
unique_patients = train_df['PatientId'].nunique()
# The message is split into two adjacent f-strings because a single
# double-quoted string cannot span multiple lines.
print(f"The total number of patients are {total_records}, "
      f"from those the unique IDs are {unique_patients}")
As we can see, the number of unique patient IDs is 928 out of 1000, which indicates that some patients have more than one image. For patients with multiple records, we want to make sure that they don’t show up in both the training and test datasets, in order to avoid data leakage.
Explore data labels.
# Collect the dataframe's column names into a plain Python list
columns = train_df.columns.tolist()
print(columns)
['Image', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia',
'Infiltration', 'Mass', 'Nodule', 'PatientId', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
# Remove the non-label columns. Filtering (rather than list.remove) keeps
# this cell safe to re-run: list.remove raises ValueError once the name
# has already been removed from the list.
non_label_columns = ('Image', 'PatientId')
columns = [name for name in columns if name not in non_label_columns]
# Get the total classes
print(f"There are {len(columns)} columns of labels for these conditions: {columns}")
There are 14 columns of labels for these conditions: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
# Print out the number of positive labels (1) for each class.
# Summing a label column counts its positive samples.
for column in columns:
    print(f"The class {column} has {train_df[column].sum()} samples")
The class Atelectasis has 106 samples
The class Cardiomegaly has 20 samples
The class Consolidation has 33 samples
The class Edema has 16 samples
The class Effusion has 128 samples
The class Emphysema has 13 samples
The class Fibrosis has 14 samples
The class Hernia has 2 samples
The class Infiltration has 175 samples
The class Mass has 45 samples
The class Nodule has 54 samples
The class Pleural_Thickening has 21 samples
The class Pneumonia has 10 samples
The class Pneumothorax has 38 samples
Data Visualization
Now we are going to visualize some of the images in the dataset
# Grab the file names of the first 10 images as a NumPy array
images = train_df['Image'].iloc[:10].to_numpy()
images
array(['00008270_015.png', '00029855_001.png', '00001297_000.png',
'00012359_002.png', '00017951_001.png', '00001232_002.png',
'00017135_000.png', '00027235_000.png', '00014197_007.png',
'00011584_002.png'], dtype=object)
# The full dataset is very large and only a handful of images were uploaded
# to Colab, so the available file names are listed explicitly here.
# If you have the whole dataset, use this line instead:
#   images = train_df['Image'].head(10).values
images = ['00001855_037.png', '00005132_000.png', '00009804_001.png', '00011553_006.png',
          '00012051_001.png', '00012276_018.png', '00013249_033.png', '00013615_007.png',
          '00015007_006.png', '00019967_011.png', '00028341_002.png', '00030061_000.png']

# Pick 9 images at random (drawn independently, so repeats are possible)
random_images = [np.random.choice(images) for _ in range(9)]

# Define location of the image dir
img_dir = '/content/images-small'

print("Display Random Images")

# Adjust size of the images
plt.figure(figsize=(20, 10))

# Iterate and plot the random images on a 3x3 grid (subplot indices start at 1)
for position, image_name in enumerate(random_images, start=1):
    plt.subplot(3, 3, position)
    img = plt.imread(os.path.join(img_dir, image_name))
    plt.imshow(img, cmap='gray')
    plt.axis('off')

# Adjust subplot padding
plt.tight_layout()
Investigate a Single Image
Display some information about the image as well as the pixel value distribution
# Explore any single image in more detail
sample_img = '00005132_000.png'
raw_image = plt.imread(os.path.join(img_dir, sample_img))
plt.imshow(raw_image, cmap='gray')
plt.colorbar()
plt.title('Raw Chest X Ray Image')

# Image arrays are indexed (rows, columns): axis 0 is the height and
# axis 1 is the width.
img_height, img_width = raw_image.shape[:2]
print(f"The dimensions of the image are {img_width} pixels width and {img_height} pixels height, one single color channel")
print(f"The maximum pixel value is: {raw_image.max():.4f} and the min pixel value is {raw_image.min():.4f}")
print(f"The mean value of the pixel is {raw_image.mean():.4f} and the standard deviation is {raw_image.std():.4f}\n")
The dimensions of the image are 1024 pixels width and 1024 pixels height, one single color chanel
The maximun pixel value is: 0.9608 and the min pixel value is 0.0078
The mean value of the pixel is 0.6178 and the standard deviation is 0.2126
# Plot a histogram of the distribution of pixel intensities
plt.figure(figsize=(10, 8))
# NOTE(review): sns.distplot is deprecated in newer seaborn releases;
# sns.histplot is the suggested replacement once the environment is upgraded.
sns.distplot(
    raw_image.ravel(),
    label=f"Pixel Mean {np.mean(raw_image):.4f} & Standard Deviation {np.std(raw_image):.4f}",
    kde=False,  # counts only, no density estimate
)
plt.legend(loc='upper center')
plt.title("Distribution of Pixel Intensities in an Image")
plt.xlabel("Pixel Intensity")
# Trailing semicolon suppresses the notebook's echo of the return value
plt.ylabel("# of Pixels in Image");
Image Pre-Processing in Keras
Before we train a CNN model, the images must be pre-processed. To achieve that we’ll use Keras’s ImageDataGenerator to perform data pre-processing and data augmentation.
# Build the Keras generator used to pre-process (standardize) the images
from keras.preprocessing.image import ImageDataGenerator

# Per-sample normalization settings
normalization_options = {
    'samplewise_center': True,             # Set each sample mean to 0
    'samplewise_std_normalization': True,  # Divide each input by its STD
}
image_generator = ImageDataGenerator(**normalization_options)
Standardization
The image_generator
created above will adjust the image such that the new mean of the data will be 0 and the STD of the data will be 1.
The generator will replace each pixel value of the image with a new calculated value by subtracting the mean and dividing by the STD.
# Flow from the dataframe with the specified batch and target image size
generator = image_generator.flow_from_dataframe(
    dataframe=train_df,
    directory=img_dir,
    x_col='Image',           # features: image file names
    y_col='Mass',            # labels: the 'Mass' column must be in train_df
    class_mode='raw',        # pass the y_col values through unchanged
    batch_size=1,            # images per batch
    shuffle=False,           # keep the dataframe order
    target_size=(320, 320),  # (height, width) of the output image
)
# Plot a processed image
# set style
sns.set_style("white")
# Fetch the first batch (images, labels) from the generator
generated_image, label = generator[0]
plt.imshow(generated_image[0], cmap='gray')
plt.colorbar()
# This is the standardized output of the generator, not the raw file
plt.title("Pre-processed Chest X Ray Image")

# print info about the image
# NOTE(review): assumes the default channels-last batch layout
# (batch, height, width, channels), so axis 1 is height and axis 2 is width.
print(f"The dimensions of the image are {generated_image.shape[2]} pixels width and {generated_image.shape[1]} pixels height \n")
print(f"The maximum pixel value is {generated_image.max():.4f} and the minimum value is {generated_image.min():.4f}\n")
print(f"The mean value of the pixels is {generated_image.mean():.4f} and the STD is {generated_image.std():.4f}\n")
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
The dimensions of the image are 320 pixels width and 320 pixels height
The maximum pixel value is 2.7594 and the minimum value is -0.7400
The mean value of the pixels is -0.0000 and the STD is 1.0000
# Plot a histogram with the distribution of the pixels, before and after
# standardization
sns.set()
plt.figure(figsize=(10, 8))

# Plot histogram for the original image.
# The min/max entries of the label must report np.min / np.max (they
# previously printed the mean and the min respectively).
sns.distplot(
    raw_image.ravel(),
    label=(
        f"Original Image: mean {np.mean(raw_image):.4f} -- STD {np.std(raw_image):.4f} \n"
        f"Min pixel value {np.min(raw_image):.4f} -- Max pixel value {np.max(raw_image):.4f}"
    ),
    color='blue',
    kde=False,
)

# Plot histogram of the new generated image
sns.distplot(
    generated_image[0].ravel(),
    label=(
        f"Generated Image: mean {np.mean(generated_image[0]):.4f} -- STD {np.std(generated_image[0]):.4f} \n"
        f"Min pixel value {np.min(generated_image[0]):.4f} -- Max pixel value {np.max(generated_image[0]):.4f}"
    ),
    color='red',
    kde=False,
)

# Place legends
plt.legend()
plt.title("Distribution of Pixel Intensities of the Image")
plt.xlabel("Pixel Intensity")
# Trailing semicolon suppresses the notebook's echo of the return value
plt.ylabel("# of Pixels");