Creating Sample Data for Testing
https://pythonprogramming.net/sample-data-testing-machine-learning-tutorial/
Generating and using sample data to test the functions and equations – similar to unit testing.
Key Points:
- When starting to define a function, first build out its inputs, then decide what it needs to return, and finally connect the two with the intermediate steps.
- Once the test data is generated, we can change the variance and correlation arguments and check that the R^2 value changes as expected:
- create_dataset(40,10,2,correlation='pos') – gives a higher R^2
- create_dataset(40,80,2,correlation='pos') – gives a lower R^2
- create_dataset(40,10,2,correlation='neg') – draws the line the other way around (downward slope)
- create_dataset(40,10,2,correlation=False) – gives a very low R^2 (no correlation)
# Import Libs
from statistics import mean
import numpy as np
import matplotlib.pyplot as plt
import random # pseudo-random :)
# To set charts to save as images we need to change the default behaviour
from matplotlib import style # import style to change default behaviour of plot
style.use('ggplot') # use ggplot
# Define values
#xs = np.array([1,2,3,4,5], dtype=np.float64) # dtype lets you set the data type. Not needed for this example but useful in future
#ys = np.array([5,4,6,5,6], dtype=np.float64)
def create_dataset(how_many, variance, step=2, correlation=False):
    """Generate a noisy sample dataset for testing the regression functions.

    Args:
        how_many: Number of points to generate.
        variance: Integer bound on the noise. Each y is offset by a random
            integer drawn from ``[-variance, variance)``; ``0`` means no
            noise at all.
        step: Amount the underlying value moves per point when a correlation
            is requested.
        correlation: ``'pos'`` steps the underlying value up after each
            point, ``'neg'`` steps it down; any other value (default
            ``False``) leaves it constant.

    Returns:
        A tuple ``(xs, ys)`` of ``float64`` NumPy arrays, where ``xs`` is
        ``0, 1, ..., how_many - 1`` and ``ys`` starts from an underlying
        value of 1.

    Raises:
        ValueError: Propagated from ``random.randrange`` when ``variance``
            is negative (the requested range is empty).
    """
    # Translate the correlation flag into a per-point drift of the base value.
    if correlation == 'pos':
        drift = step
    elif correlation == 'neg':
        drift = -step
    else:
        drift = 0

    val = 1
    ys = []
    for _ in range(how_many):
        # randrange(0, 0) is an empty range and raises ValueError, so skip
        # the draw when no noise was requested.
        noise = random.randrange(-variance, variance) if variance else 0
        ys.append(val + noise)
        val += drift

    xs = np.arange(len(ys), dtype=np.float64)
    return xs, np.array(ys, dtype=np.float64)
# Define best fit function
def best_fit_slope_and_intercept(xs, ys):
    """Return the slope and intercept of the least-squares line through the data.

    The slope is computed as
    ``m = (mean(x) * mean(y) - mean(x * y)) / (mean(x)**2 - mean(x**2))``
    and the intercept as ``b = mean(y) - m * mean(x)``.

    Args:
        xs: Sequence or array of x values.
        ys: Sequence or array of y values, the same length as ``xs``.

    Returns:
        A tuple ``(m, b)`` with the slope and the y-intercept.

    Note:
        The xs need at least two distinct values; otherwise the slope's
        denominator is zero.
    """
    # Coerce to float64 arrays so plain lists and integer data work with the
    # element-wise arithmetic and the means below.
    xs = np.asarray(xs, dtype=np.float64)
    ys = np.asarray(ys, dtype=np.float64)

    # Each mean is a full pass over the data, so compute it only once.
    x_bar = mean(xs)
    y_bar = mean(ys)

    m = (((x_bar * y_bar) - mean(xs * ys)) /
         ((x_bar ** 2) - mean(xs ** 2)))
    b = y_bar - m * x_bar
    return m, b
# Generate a positively correlated sample dataset, then fit a line to it
xs, ys = create_dataset(how_many=40, variance=40, step=2, correlation='pos')
m, b = best_fit_slope_and_intercept(xs, ys)
# Define function to square error
def squared_error(ys_orig, ys_line):
    """Return the sum of squared differences between a line and the data.

    Args:
        ys_orig: Observed y values (sequence or array).
        ys_line: y values of the line being compared, the same length as
            ``ys_orig``.

    Returns:
        The sum over all points of ``(ys_line - ys_orig) ** 2``.
    """
    # asarray lets two plain lists be subtracted element-wise; arrays pass
    # through unchanged. The difference is computed once and reused.
    residuals = np.asarray(ys_line) - np.asarray(ys_orig)
    return sum(residuals * residuals)
def coefficient_of_determination(ys_orig, ys_line):
    """Return the coefficient of determination (R^2) of a line against the data.

    R^2 is ``1 - SE(line) / SE(mean)``, where ``SE(line)`` is the squared
    error of ``ys_line`` against ``ys_orig`` and ``SE(mean)`` is the squared
    error of a flat line at the mean of ``ys_orig``.

    Args:
        ys_orig: Observed y values.
        ys_line: y values of the fitted line, the same length as ``ys_orig``.

    Returns:
        The R^2 value. It is undefined when every value in ``ys_orig`` is
        the same, because the denominator is then zero.
    """
    # Compute the mean once; evaluating mean() inside the comprehension
    # repeated a full pass over the data for every element.
    y_bar = mean(ys_orig)
    y_mean_line = [y_bar for _ in ys_orig]

    squared_error_regr = squared_error(ys_orig, ys_line)
    squared_error_y_mean = squared_error(ys_orig, y_mean_line)
    return 1 - (squared_error_regr / squared_error_y_mean)
# m and b were already fitted right after the dataset was created, so the
# fit is not repeated here.
regression_line = m * xs + b  # predicted y for every x (xs is a NumPy array)

r_squared = coefficient_of_determination(ys, regression_line)
print(r_squared)

plt.scatter(xs, ys, color='#003F72', label='data')
plt.plot(xs, regression_line, label='regression line')
plt.legend(loc=4)  # loc=4 places the legend in the lower right
# Write the figure to disk before show(); bbox_inches='tight' trims the margins
plt.savefig('ML_Tutorial12.png', bbox_inches='tight')
plt.show()  # display the figure