import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
# Load the data set; the 'eruptions' and 'waiting' columns are used below.
df = pd.read_csv("old_faithful.csv")
# Bare expression: presumably kept so a notebook cell displays the frame.
df

# Scatter plot of the 'waiting' column against the 'eruptions' column.
fig, ax1 = plt.subplots()
ax1.scatter(df.eruptions, df.waiting)
ax1.set_xlabel('eruption time')
ax1.set_ylabel('waiting time')

Text(0, 0.5, 'waiting time')

# Initial guess for the linear model, stored as [slope, intercept].
model = np.array([
    20.0,  # guess slope
    10.0,  # guess intercept
])

def plot_raw_error(xdata, ydata, model):
    """Plot the data, the model line, and each point's raw error.

    Creates a new figure containing a scatter of ``ydata`` against ``xdata``,
    the line ``model[0] * xdata + model[1]``, and a cyan vertical segment
    from every observed point to the model's value at the same x.

    Args:
        xdata: x values supporting elementwise arithmetic (the calls in this
            file pass a pandas Series).
        ydata: Observed y values, one per entry of ``xdata``.
        model: Two-element sequence ``[slope, intercept]``.
    """
    _, ax2 = plt.subplots()

    # plot raw data points
    ax2.scatter(xdata, ydata)

    # y values predicted by the linear model
    model_ydata = xdata * model[0] + model[1]

    # plot the model line
    ax2.plot(xdata, model_ydata)

    # Draw the raw errors on the same axes as everything else rather than via
    # the global pyplot state. With two-row inputs, each column is drawn as
    # its own line, i.e. one segment from (x, observed y) to (x, model y).
    ax2.plot([xdata, xdata], [ydata, model_ydata], 'c')

# Column aliases; calc_rms_error reads these module-level names.
xdata = df['eruptions']
ydata = df['waiting']

# Show the raw errors of the initial guess.
plot_raw_error(xdata, ydata, model)

def calc_rms_error(model, x=None, y=None):
    """Return the root-mean-square error of a linear model.

    Args:
        model: Two-element sequence ``[slope, intercept]``.
        x: x values supporting elementwise arithmetic. When omitted, the
            module-level ``xdata`` is used.
        y: Observed y values. When omitted, the module-level ``ydata`` is
            used.

    Returns:
        ``sqrt(mean((y - (x * model[0] + model[1])) ** 2))``.
    """
    # Fall back to the module-level data so the one-argument form (as passed
    # to scipy.optimize.minimize) keeps working unchanged.
    if x is None:
        x = xdata
    if y is None:
        y = ydata

    residuals = y - (x * model[0] + model[1])
    return np.sqrt(np.mean(residuals ** 2))

# RMS error of the initial guess.
calc_rms_error(model=model)

14.991246220221123

import scipy.optimize as opt

# Search for the [slope, intercept] pair that minimises the RMS error,
# starting from the guess [15, 20].
model_fit = opt.minimize(calc_rms_error, x0=[15, 20])
# The optimised parameters live under the result's 'x' entry.
model_fit.x

array([10.72963549, 33.47440846])

# Re-draw the raw errors using the optimised parameters.
plot_raw_error(xdata, ydata, model_fit.x)

# Predicted waiting time for an eruption time of 6.
eruption_time = 6
# model_fit['x'] is [slope, intercept], so evaluating it as a degree-1
# polynomial gives slope * eruption_time + intercept.
waiting = np.polyval(model_fit['x'], eruption_time)
waiting

97.85222136878929

	eruptions	waiting
0	3.600	79
1	1.800	54
2	3.333	74
3	2.283	62
4	4.533	85
...	...	...
267	4.117	81
268	2.150	46
269	4.417	90
270	1.817	46
271	4.467	74

# Perfecting Models with Error Minimization

## Plotting data

## Visualize the raw error

## Calculate root mean square error

## Optimizing RMS error

## Prediction using optimized model