# First Let's import the necessary packages
import pandas as pd # Package used for working with dataframes
import numpy as np # Package used for working with arrays
import matplotlib.pyplot as plt # Package used for visualizing our data
%matplotlib inline
# This command causes plots to show up in the notebook
# scikit-learn (sklearn) is the machine learning package
# First we load the tools for preprocessing our data and selecting features
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
# Next we load the various algorithms we will be using:
# logistic regression, k-nearest neighbors, support vector machines, decision trees, and random forests
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
# Finally, GridSearchCV will allow us to fine-tune our algorithms
from sklearn.model_selection import GridSearchCV
# Now we read in the data and pick out the second-down plays that resulted in a run or a pass
file_loc = "/home/matt/Downloads/nfl.csv"
data = pd.read_csv(file_loc)
# .copy() lets us add columns to this subset later without a SettingWithCopyWarning
second = data[(data['down'] == 2) & ((data['PlayType'] == 'Pass') | (data['PlayType'] == 'Run'))].copy()
breakdown = second['PlayType'].value_counts()
# Fraction of second-down plays that were passes
breakdown['Pass'] / (breakdown['Pass'] + breakdown['Run'])
passing = second[second['PlayType'] == 'Pass']
running = second[second['PlayType'] == 'Run']
plt.figure(figsize = (10, 5))
for i in range(1, 11):
    plt.subplot(2, 5, i)
    plt.pie([second[second['ydstogo'] == i]['PlayType'].value_counts()['Pass'],
             second[second['ydstogo'] == i]['PlayType'].value_counts()['Run']], colors = ['Blue', 'Red'])
    plt.title('Second and %i' %i)
plt.tight_layout()
plt.legend(['Pass','Run'], bbox_to_anchor = (2, 1.5))
# Naive baseline: predict a run whenever 5 or fewer yards are needed
second['Prediction'] = second['ydstogo'] <= 5
second['Ran'] = second['PlayType'] == 'Run'
check = second['Prediction'] == second['Ran']
# Accuracy of the naive rule
check.mean()
#TimeInHalf will contain seconds left in the half
second['TimeInHalf'] = second['TimeSecs']%(60*30)
pre_X = second[['ydstogo', 'yrdline100', 'TimeInHalf', 'ScoreDiff']]
y = second['PlayType']
#Separate 20% of the data for testing
pre_X_train, pre_X_test, y_train, y_test = train_test_split(pre_X, y, test_size = .2, random_state = 0)
# Standardize each feature and L2-normalize each sample for better results
X_train = preprocessing.scale(pre_X_train)
X_test = preprocessing.scale(pre_X_test)
X_train = preprocessing.normalize(X_train, norm = 'l2')
X_test = preprocessing.normalize(X_test, norm = 'l2')
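# Note that scale() and normalize() above are fit to each split separately. A minimal
# alternative sketch (not what the rest of this notebook uses; the names scaler,
# X_train_alt, and X_test_alt are just for illustration) fits the scaler on the
# training split only and reuses it on the test split, so the test data never
# influences the scaling parameters:
scaler = preprocessing.StandardScaler().fit(pre_X_train)
X_train_alt = preprocessing.normalize(scaler.transform(pre_X_train), norm = 'l2')
X_test_alt = preprocessing.normalize(scaler.transform(pre_X_test), norm = 'l2')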
#Choose the 2 best features
selector = SelectKBest(f_classif, k = 2)
selector.fit(X_train, y_train)
scores = -np.log10(selector.pvalues_)
plt.bar([1, 2, 3, 4],scores, align = 'center')
plt.title("Feature Selection")
plt.xlabel("Features")
plt.ylabel("Score")
plt.xticks([1,2,3,4],['Yards to Go','Yard Line', 'Time in Half', 'Score Differential'])
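# As a quick cross-check (a sketch; the next lines simply hard-code the columns),
# the fitted selector can report the indices of the columns it kept:
print(selector.get_support(indices = True))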
#Pick the first and fourth columns, which contain the best predictors
X_train = X_train[:, [0, 3]]
X_test = X_test[:, [0, 3]]
plt.figure(figsize = (10, 5))
plt.subplot(121)
color = []
for value in y_test:
    if value == 'Run':
        color.append('Red')
    else:
        color.append('Blue')
plt.scatter(pre_X_test['ydstogo'], pre_X_test['ScoreDiff'], color = color)
plt.xlabel('Yards To Go')
plt.ylabel('Score Differential')
plt.title("Actual Decisions from Testing Data")
color = []
for value in pre_X_test['ydstogo']:
    if value <= 5:
        color.append('Red')
    else:
        color.append('Blue')
plt.subplot(122)
plt.scatter(pre_X_test['ydstogo'], pre_X_test['ScoreDiff'], color = color)
plt.xlabel('Yards To Go')
plt.ylabel('Score Differential')
plt.title("Naive Prediction on Testing Data")
plt.tight_layout()
plt.figure(figsize = (10, 5))
color = []
for value in y_test:
    if value == 'Run':
        color.append('Red')
    else:
        color.append('Blue')
plt.subplot(121)
plt.scatter(X_test[:,0], X_test[:, 1], color = color)
plt.xlabel('Yards To Go')
plt.ylabel('Score Differential')
plt.title("Actual Decisions, Normalized")
color = []
for value in pre_X_test['ydstogo']:
    if value <= 5:
        color.append('Red')
    else:
        color.append('Blue')
plt.subplot(122)
plt.scatter(X_test[:, 0], X_test[:, 1], color = color)
plt.xlabel('Yards To Go')
plt.ylabel('Score Differential')
plt.title("Naive Prediction, Normalized")
plt.tight_layout()
def graphPrediction(prediction, pred_name):
    # Plot the actual play calls next to a model's predictions and print its accuracy.
    # Uses the pre_X_test and y_test variables defined above.
    plt.figure(figsize = (10, 5))
    color = []
    for value in y_test:
        if value == 'Run':
            color.append('Red')
        else:
            color.append('Blue')
    plt.subplot(121)
    plt.scatter(pre_X_test['ydstogo'], pre_X_test['ScoreDiff'], color = color)
    plt.xlabel('Yards To Go')
    plt.ylabel('Score Differential')
    plt.title("Actual Decisions from Testing Data")
    color = []
    for value in prediction:
        if value == 'Run':
            color.append('Red')
        else:
            color.append('Blue')
    plt.subplot(122)
    plt.scatter(pre_X_test['ydstogo'], pre_X_test['ScoreDiff'], color = color)
    plt.xlabel('Yards To Go')
    plt.ylabel('Score Differential')
    plt.title(pred_name + " Prediction")
    plt.tight_layout()
    # Print the accuracy of the prediction model
    check = prediction == y_test
    accuracy = check.mean()
    print("%s Accuracy: %f" %(pred_name, accuracy))
#Run logistic regression on our training data
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
graphPrediction(logistic.predict(X_test), "Logistic Regression")
# Use the k-nearest neighbors algorithm. I chose 61 neighbors through trial and error,
# but sklearn can search for good parameters automatically; a GridSearchCV sketch follows below.
neighbors = KNeighborsClassifier(n_neighbors = 61)
neighbors.fit(X_train,y_train)
graphPrediction(neighbors.predict(X_test), "61-Nearest Neighbors")
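# The parameter search mentioned above can be done with the GridSearchCV class imported
# earlier. A minimal sketch (the candidate n_neighbors values are arbitrary guesses,
# not tuned results): each candidate is evaluated with cross-validation on the
# training data and the best one is kept.
param_grid = {'n_neighbors': [11, 31, 61, 91, 121]}
knn_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv = 5)
knn_search.fit(X_train, y_train)
print(knn_search.best_params_)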
# Run a support vector machine with a linear kernel. Again, it may be possible to do
# better by tweaking parameters such as C; a grid-search sketch follows below.
support = svm.SVC(kernel = "linear")
support.fit(X_train, y_train)
graphPrediction(support.predict(X_test), "Support Vector Machine")
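# As with the nearest-neighbors model, GridSearchCV can tune the SVM. A minimal sketch
# (the candidate C values are arbitrary, not tuned results):
svm_search = GridSearchCV(svm.SVC(kernel = "linear"), {'C': [0.01, 0.1, 1, 10]}, cv = 5)
svm_search.fit(X_train, y_train)
print(svm_search.best_params_)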
# Fit a decision tree classifier with the default settings
dectree = tree.DecisionTreeClassifier()
dectree.fit(X_train, y_train)
graphPrediction(dectree.predict(X_test), 'Decision Tree')
# Fit a random forest of 50 trees
Rfor = RandomForestClassifier(n_estimators = 50)
Rfor.fit(X_train, y_train)
graphPrediction(Rfor.predict(X_test), "Random Forest")
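# A single train/test split gives only one estimate of accuracy. As a follow-up sketch
# (not part of the original analysis), cross-validation on the training data averages
# over several splits; the 5-fold choice here is arbitrary.
from sklearn.model_selection import cross_val_score
for name, model in [('Logistic Regression', logistic),
                    ('61-Nearest Neighbors', neighbors),
                    ('Support Vector Machine', support),
                    ('Decision Tree', dectree),
                    ('Random Forest', Rfor)]:
    cv_scores = cross_val_score(model, X_train, y_train, cv = 5)
    print("%s mean CV accuracy: %f" %(name, cv_scores.mean()))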