# Importing packages!
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Loads the breast cancer dataset from 'breastcancer_rv.csv' (path is relative
# to the current working directory) into a DataFrame
cancer_df = pd.read_csv('breastcancer_rv.csv')
# Previews the first rows of the data; the result is only displayed when this
# is the last expression of a notebook cell, otherwise it is discarded
cancer_df.head()

# Creates the figure and axes used by the first scatter plot
fig, ax1 = plt.subplots()

# Creates a random jitter because many points have the same values
def make_random(a):
    """Return `a` with independent Gaussian jitter added to every element.

    The jitter is drawn from a normal distribution with mean 0 and standard
    deviation 0.15, one draw per element of `a`. It is used to spread out
    plotted points that would otherwise sit exactly on top of each other.

    Parameters
    ----------
    a : sequence supporting `len()` and element-wise `+` with a NumPy array
        (for example a pandas Series or a NumPy array).

    Returns
    -------
    The result of `a + noise`; the type is whatever that addition yields.
    """
    # One noise sample per element of the input
    jitter = np.random.normal(loc=0, scale=0.15, size=len(a))
    return a + jitter

# Outcome classes to draw, and the two feature columns used as the axes
outcomes = ['cancer', 'no_cancer']
xcol = 'Bland Chromatin'
ycol = 'Uniformity of Cell Size'

# One colour per outcome, matched by position (magenta, green)
colors = ['m', 'g']

# Draws each class as its own scatter series so that it gets its own
# colour and legend entry
for index, outcome in enumerate(outcomes):

    # Boolean mask selecting the rows that belong to the current class
    truth_indices = cancer_df['Class'].eq(outcome)
    # Jitters both coordinates so overlapping points become visible
    ax1.scatter(
        make_random(cancer_df.loc[truth_indices, xcol]),
        make_random(cancer_df.loc[truth_indices, ycol]),
        s=3,
        color=colors[index],
        label=outcome,
    )

# Axis labels are the column names; the legend lists the outcome classes
ax1.set_xlabel(xcol)
ax1.set_ylabel(ycol)
ax1.legend()

<matplotlib.legend.Legend at 0x1f28d112900>

def distance(row1, row2):
    """Return the Euclidean distance between two rows, ignoring 'Class'.

    Parameters
    ----------
    row1, row2 : pandas Series
        Rows that each contain a 'Class' entry; every remaining entry is
        treated as a numeric feature. Both rows must share the same labels.

    Returns
    -------
    The square root of the summed squared feature differences.

    Raises
    ------
    KeyError
        If either row has no 'Class' label (raised by `Series.drop`).
    """
    # The class label is not a feature, so it is stripped before comparing
    features1, features2 = row1.drop('Class'), row2.drop('Class')

    squared_diffs = (features1 - features2) ** 2
    return np.sqrt(np.sum(squared_diffs))

# Sanity check: distance between the rows labelled 0 and 1 of the dataset
test_distance = distance(cancer_df.loc[0], cancer_df.loc[1])
print(f'The Euclidean distance between these two points is {test_distance}.')

The Euclidean distance between these two points is 11.874342087037917.

def calc_distance_to_all_points(df, testrow):
    """Return the distance from `testrow` to every row of `df`.

    Parameters
    ----------
    df : pandas DataFrame
        Rows to measure against; each row is passed to `distance`.
    testrow : pandas Series
        The reference row.

    Returns
    -------
    list
        One distance per row of `df`, in positional (row) order.
    """
    # Rows are fetched by position, so the output order matches df's row order
    return [
        distance(df.iloc[position, :], testrow)
        for position in range(df.shape[0])
    ]

# Sanity check: distances from the row labelled 0 to every row in the dataset
# (the first entry is the row compared with itself)
test_distances = calc_distance_to_all_points(cancer_df, cancer_df.loc[0])
print(f'The first four distances are {test_distances[:4]}.')

The first four distances are [np.float64(0.0), np.float64(11.874342087037917), np.float64(2.23606797749979), np.float64(12.041594578792296)].

def find_k_closest(df, testrow, k, drop_closest=True):
    """Return the positions of the `k` rows of `df` closest to `testrow`.

    Parameters
    ----------
    df : pandas DataFrame
        Candidate neighbours.
    testrow : pandas Series
        The row whose neighbours are wanted.
    k : int
        Number of neighbours to return.
    drop_closest : bool, default True
        If True, the single closest row is skipped and the next `k` rows are
        returned. This is meant for the case where `testrow` is itself a row
        of `df` and would otherwise be its own nearest neighbour.
        If False, the `k` closest rows are returned, including the closest.

    Returns
    -------
    numpy.ndarray
        Positional (0-based) row indices into `df`, ordered from nearest to
        farthest. These are positions from `np.argsort`, not index labels,
        so use them with `.iloc`.

    Notes
    -----
    With `drop_closest=True` the first sorted entry is dropped on the
    assumption that it is `testrow` itself. If several rows tie for the
    smallest distance, the sort order decides which one is dropped.
    """
    # Calculates distances
    distances = calc_distance_to_all_points(df, testrow)
    # Positions of the rows ordered by increasing distance
    sorted_indices = np.argsort(distances)

    # Skip the nearest entry only when asked to; in both cases return exactly
    # k indices (the previous [1:k] slice for drop_closest=False dropped the
    # nearest neighbour anyway and returned only k-1 indices)
    start = 1 if drop_closest else 0
    return sorted_indices[start:start + k]

# Finds the 5 nearest neighbours of the row labelled 121; the default
# drop_closest=True skips the closest match, which is the row itself
closest_inds = find_k_closest(cancer_df, cancer_df.loc[121], 5)
print(f'The closest indices are {closest_inds}')

The closest indices are [ 56 147 303 142 173]

# Creates the figure and axes for the nearest-neighbour plot
fig, ax2 = plt.subplots()

# Outcome classes to draw, and the two feature columns used as the axes
outcomes = ['cancer', 'no_cancer']
xcol = 'Bland Chromatin'
ycol = 'Uniformity of Cell Size'

# One colour per outcome, matched by position (magenta, green)
colors = ['m', 'g']

# Draws each class as its own scatter series so that it gets its own
# colour and legend entry
for index, outcome in enumerate(outcomes):
    # Boolean mask selecting the rows that belong to the current class
    truth_indices = cancer_df['Class'] == outcome
    # Jitters both coordinates so overlapping points become visible
    ax2.scatter(
        make_random(cancer_df.loc[truth_indices, xcol]),
        make_random(cancer_df.loc[truth_indices, ycol]),
        s=3,
        color=colors[index],
        label=outcome,
    )

# Labels the plot
ax2.set_xlabel(xcol)
ax2.set_ylabel(ycol)
ax2.legend()

# Marks the individual observation (row labelled 121) as a red dot, unjittered
ax2.plot(cancer_df.loc[121, xcol], cancer_df.loc[121, ycol], 'ro')
# Highlights the k-nearest neighbours in cyan. closest_inds holds row
# positions produced by np.argsort, so they are looked up with .iloc rather
# than the label-based .loc (the two only coincide for a default RangeIndex)
ax2.scatter(
    make_random(cancer_df[xcol].iloc[closest_inds]),
    make_random(cancer_df[ycol].iloc[closest_inds]),
    color='c',
)

<matplotlib.collections.PathCollection at 0x1f28eb7bb10>

def classify(df, testrow, k, drop_closest=True):
    """Classify `testrow` by majority vote among its nearest rows in `df`.

    Parameters
    ----------
    df : pandas DataFrame
        Labelled rows; must contain a 'Class' column.
    testrow : pandas Series
        The row to classify.
    k : int
        Number of neighbours requested from `find_k_closest`.
    drop_closest : bool, default True
        Forwarded unchanged to `find_k_closest`.

    Returns
    -------
    str
        'cancer' or 'no_cancer'. A tied vote is resolved as 'cancer'.
    """
    neighbor_positions = find_k_closest(df, testrow, k, drop_closest)

    # Labels of the selected neighbours, looked up by position
    neighbor_labels = df['Class'].iloc[neighbor_positions]
    votes_cancer = sum(neighbor_labels == 'cancer')
    votes_no_cancer = sum(neighbor_labels == 'no_cancer')

    # ">=" means ties go to 'cancer'
    return 'cancer' if votes_cancer >= votes_no_cancer else 'no_cancer'

# Classifies the row at position 121 using 15 neighbours from the full dataset;
# the default drop_closest=True keeps the row from voting for itself
classification = classify(cancer_df, cancer_df.iloc[121,:], 15)
print(f'The cell classification is {classification}.')

The cell classification is cancer.

# Gets the number of rows
num_rows = cancer_df.shape[0]

# Permutes the row positions so the split is random.
# The generator is not seeded here, so the split differs from run to run.
permuted_indices = np.random.permutation(np.arange(num_rows))

# Fraction of the data used for training; the rest becomes the test set
frac_training = 0.8

# Number of training rows, derived once from frac_training so that changing
# the fraction above actually changes the split
num_training = int(frac_training * num_rows)

# Gets the training and test set positions (disjoint, together covering all rows)
training_ind = permuted_indices[:num_training]
test_ind = permuted_indices[num_training:]

# Subsets the training and test data by position
training_df = cancer_df.iloc[training_ind, :]
test_df = cancer_df.iloc[test_ind, :]

def evaluate_accuracy(training_df, test_df, k):
    """Return the fraction of `test_df` rows that are classified correctly.

    Each test row is classified by `classify` against `training_df` and the
    prediction is compared with the row's own 'Class' value.

    Parameters
    ----------
    training_df : pandas DataFrame
        Labelled rows used as the neighbour pool.
    test_df : pandas DataFrame
        Held-out rows to classify; must contain a 'Class' column.
    k : int
        Number of neighbours passed to `classify`.

    Returns
    -------
    float
        Number of correct predictions divided by the number of test rows.

    Raises
    ------
    ZeroDivisionError
        If `test_df` has no rows.
    """
    # Extracts the number of test rows
    num_test_rows = test_df.shape[0]
    # Counter for the number of correct classifications
    correct_total = 0

    # Iterates through test data
    for rowind in range(num_test_rows):
        # Takes the row from test_df. Reading from the global cancer_df here
        # would score the first rows of the full dataset, most of which are
        # also in the training set, instead of the held-out rows.
        test_row = test_df.iloc[rowind, :]

        # Test rows are not part of the training data, so the closest
        # training row is a genuine neighbour and must not be dropped
        predicted_outcome = classify(training_df, test_row, k, drop_closest=False)

        # Checks if classification is correct
        if predicted_outcome == test_row['Class']:
            correct_total += 1

    # Returns accuracy
    return correct_total / num_test_rows

# Measures the classifier's accuracy on the held-out test set using k = 3
acc_k3 = evaluate_accuracy(training_df, test_df, 3)
print(f'The accuracy is {acc_k3}.')

The accuracy is 0.948905109489051.

Classification

What is Machine Learning?

Introduction to Classification and K-nearest Neighbors

Performing K-nearest Neighbors Classification

	Clump Thickness	Uniformity of Cell Size	Uniformity of Cell Shape	Marginal Adhesion	Single Epithelial Cell Size	Bare Nuclei	Bland Chromatin	Normal Nucleoli	Mitoses	Class
0	5	1	1	1	2	1	3	1	1	no_cancer
1	5	4	4	5	7	10	3	2	1	no_cancer
2	3	1	1	1	2	2	3	1	1	no_cancer
3	6	8	8	1	3	4	3	7	1	no_cancer
4	4	1	1	3	2	1	3	1	1	no_cancer