import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset from a CSV file located in the current working directory.
df = pd.read_csv('breastcancer_rv.csv')
# Preview the first rows of the table.
df.head()

# Visualizing the data

# Creating a random jitter because many points have the same values
def make_random(a):
    """Return ``a`` with independent Gaussian noise added to each element.

    One noise sample per element of ``a`` is drawn from a normal
    distribution with mean 0 and standard deviation 0.15, using NumPy's
    global random state.
    """
    noise = np.random.normal(loc=0, scale=0.15, size=len(a))
    return a + noise

# Scatter plot of two features, one color per class. Jitter is applied to
# both axes because many points have the same values.
outcomes = ['cancer', 'no_cancer']
colors = ['m', 'g']
xcol = 'Bland Chromatin'
ycol = 'Uniformity of Cell Size'
fig, ax = plt.subplots(figsize=(4, 4))
for index, outcome in enumerate(outcomes):
    truth_indices = df['Class'] == outcome
    # x jitter is drawn before y jitter, one call per axis.
    jittered_x = make_random(df.loc[truth_indices, xcol])
    jittered_y = make_random(df.loc[truth_indices, ycol])
    ax.scatter(jittered_x, jittered_y, 3, color=colors[index], label=outcome)
# The axis labels are the same for every class, so set them once.
ax.set_xlabel(xcol)
ax.set_ylabel(ycol)
ax.legend()

<matplotlib.legend.Legend at 0x26bf3169310>

def row_distance(row1, row2):
    """Return the Euclidean distance between two rows, ignoring 'Class'.

    Both rows must carry a 'Class' entry; it is dropped before the
    remaining entries are compared.
    """
    features1 = row1.drop('Class')
    features2 = row2.drop('Class')
    squared_diff = (features1 - features2) ** 2
    return np.sqrt(np.sum(squared_diff))

def calc_distance_to_all_rows(df, testrow):
    """Return a list with the distance from ``testrow`` to each row of ``df``.

    The list follows the positional row order of ``df``; each distance is
    computed by ``row_distance``.
    """
    return [
        row_distance(df.iloc[position, :], testrow)
        for position in range(df.shape[0])
    ]

def find_k_closest(df, testrow, k, drop_closest=True):
    """Return the positions of the ``k`` rows of ``df`` nearest to ``testrow``.

    Args:
        df: DataFrame to search.
        testrow: Row to measure distances from.
        k: Number of neighbours to return.
        drop_closest: If true, skip the single nearest row. Use this when
            ``testrow`` is itself a row of ``df`` and would otherwise come
            back as its own neighbour at distance 0.

    Returns:
        Array of positional (``iloc``-style) indices ordered nearest first.
        It holds fewer than ``k`` entries when ``df`` has too few rows.
    """
    distances = calc_distance_to_all_rows(df, testrow)
    sorted_indices = np.argsort(distances)
    # Position 0 is skipped only on request. When the test row is not part
    # of df, the nearest row is a genuine neighbour and must be kept, so
    # the slice starts at 0 and still returns k entries.
    start = 1 if drop_closest else 0
    return sorted_indices[start:start + k]

# Positions of 5 rows near row 121; the single nearest match is skipped
# because drop_closest defaults to True.
closest_inds=find_k_closest(df, df.iloc[121,:],5)

def plot_k_closest(df, testrow, k=5):
    """Plot the data by class and highlight ``testrow`` and its neighbours.

    Args:
        df: DataFrame with 'Class', 'Bland Chromatin' and
            'Uniformity of Cell Size' columns.
        testrow: Row to highlight (drawn as a red dot). It is expected to
            be a row of ``df``, because the nearest match is skipped when
            the neighbours are looked up.
        k: Number of nearest neighbours to highlight in cyan.
    """
    outcomes = ['cancer', 'no_cancer']
    colors = ['m', 'g']
    xcol = 'Bland Chromatin'
    ycol = 'Uniformity of Cell Size'

    # Look up the neighbours of the row actually passed in, rather than
    # relying on a module-level variable computed for some other row.
    closest_inds = find_k_closest(df, testrow, k)

    fig, ax = plt.subplots(figsize=(4, 4))
    for outcome, color in zip(outcomes, colors):
        in_class = df['Class'] == outcome
        ax.scatter(
            make_random(df.loc[in_class, xcol]),
            make_random(df.loc[in_class, ycol]),
            3,
            color=color,
            label=outcome,
        )
    ax.set_xlabel(xcol)
    ax.set_ylabel(ycol)
    ax.legend()
    ax.plot(testrow[xcol], testrow[ycol], 'ro')

    # find_k_closest returns positions, so select with iloc; loc would only
    # agree when the index happens to be the default 0..n-1 range.
    neighbours = df.iloc[closest_inds]
    ax.scatter(make_random(neighbours[xcol]), make_random(neighbours[ycol]), color='c')
# Draw the plot with row 121 marked as the test point.
plot_k_closest(df, df.iloc[121,:])

def classify(df, testrow, k, drop_closest=True):
    """Predict the class of ``testrow`` by majority vote of its neighbours.

    The neighbours come from ``find_k_closest(df, testrow, k, drop_closest)``.
    Returns 'cancer' when at least as many neighbours are labelled 'cancer'
    as 'no_cancer' (so ties go to 'cancer'), otherwise 'no_cancer'.
    """
    neighbour_positions = find_k_closest(df, testrow, k, drop_closest)
    neighbour_labels = df['Class'].iloc[neighbour_positions]
    votes_cancer = (neighbour_labels == 'cancer').sum()
    votes_no_cancer = (neighbour_labels == 'no_cancer').sum()
    return 'cancer' if votes_cancer >= votes_no_cancer else 'no_cancer'

# Classify row 88 from 17 neighbours in df; the nearest match is skipped
# because drop_closest defaults to True.
classify(df,df.iloc[88,:],17)

'no_cancer'

# Get training and test data: shuffle the row positions, then split them.
num_rows = df.shape[0]
permuted_indices = np.random.permutation(np.arange(num_rows))
frac_training = 0.8
# Derive the boundary from frac_training so that changing the fraction
# actually changes the split (it was previously hard-coded as 0.8 twice).
num_training = int(frac_training * num_rows)
training_ind = permuted_indices[:num_training]
test_ind = permuted_indices[num_training:]
training_df = df.iloc[training_ind, :]
test_df = df.iloc[test_ind, :]

def evaluate_accuracy(training_df, test_df, k):
    """Return the fraction of ``test_df`` rows that are classified correctly.

    Each test row is classified from its ``k`` nearest rows in
    ``training_df`` and the prediction is compared with the row's own
    'Class' value.

    Args:
        training_df: DataFrame used as the neighbour pool.
        test_df: DataFrame whose rows are classified and scored.
        k: Number of neighbours passed to ``classify``.

    Returns:
        Number of correct predictions divided by the number of test rows.

    Raises:
        ZeroDivisionError: If ``test_df`` has no rows.
    """
    num_test_rows = test_df.shape[0]
    correct_total = 0

    for rowind in range(num_test_rows):
        # Read the row from test_df, not from the full dataset: the first
        # rows of the full dataset are mostly training rows, which would
        # leak training data into the score.
        test_row = test_df.iloc[rowind, :]
        # Test rows are not in training_df, so the nearest row is a real
        # neighbour and must not be dropped.
        predicted_outcome = classify(training_df, test_row, k, drop_closest=False)
        if predicted_outcome == test_row['Class']:
            correct_total += 1

    return correct_total / num_test_rows

# Evaluate the classifier with k=3 and show the resulting accuracy.
acc_k3=evaluate_accuracy(training_df, test_df, 3)
acc_k3

0.9562043795620438

Building a Classifier

K-nearest neighbors

Can we use classification to predict whether a cell is cancerous or not based on this data?

Classification seems plausible with this data

Classification Approach:

	Clump Thickness	Uniformity of Cell Size	Uniformity of Cell Shape	Marginal Adhesion	Single Epithelial Cell Size	Bare Nuclei	Bland Chromatin	Normal Nucleoli	Mitoses	Class
0	5	1	1	1	2	1	3	1	1	no_cancer
1	5	4	4	5	7	10	3	2	1	no_cancer
2	3	1	1	1	2	2	3	1	1	no_cancer
3	6	8	8	1	3	4	3	7	1	no_cancer
4	4	1	1	3	2	1	3	1	1	no_cancer