import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
# Load the raw survey export; the path is relative to the working directory.
data = pd.read_csv(
    './Raw_Data.csv',
    sep=',',
)
# (rows, columns) of the unfiltered table.
data.shape
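Apply a filter to remove troll entries: keep only rows where both grades are above 12 and at most 45, and the difference is at most 20.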
# Signed prediction error: positive when the actual grade is above the prediction.
data['Difference'] = data['Actual Grade'] - data['Predicted Grade']

# Keep only rows where both grades lie in (12, 45] and the difference is at
# most 20. The .copy() makes the result an independent frame, so later column
# assignments on `data` do not raise SettingWithCopyWarning.
data = data[
    (data['Actual Grade'] > 12)
    & (data['Actual Grade'] <= 45)
    & (data['Predicted Grade'] > 12)
    & (data['Predicted Grade'] <= 45)
    & (data['Difference'] <= 20)
].copy()

# Show floats with thousands separators and three decimals (global pandas option).
pd.options.display.float_format = '{:,.3f}'.format
data.describe()
data['Actual Grade'].mode()
data['Predicted Grade'].mode()
data['Difference'].mode()
# Correlate numeric columns only: recent pandas versions raise when corr()
# meets a non-numeric column instead of silently dropping it.
data.select_dtypes(include='number').corr()
# Histogram of actual grades with one bin per grade point (bin count = max - min).
# int() keeps the bin count valid even when the column is stored as float,
# matching the other histograms in this file.
ax = data.hist(
    column='Actual Grade',
    figsize=(20, 10),
    bins=int(data['Actual Grade'].max() - data['Actual Grade'].min()),
    edgecolor='000',
)
plt.ylabel('Frequency')
plt.xlabel('Grade')

# Same histogram for the predicted grades.
data.hist(
    column='Predicted Grade',
    figsize=(20, 10),
    bins=int(data['Predicted Grade'].max() - data['Predicted Grade'].min()),
    edgecolor='000',
)
plt.ylabel('Frequency')
plt.xlabel('Grade')

# Box plots of both grade columns and of their difference.
data.boxplot(column='Actual Grade', figsize=(10, 30))
data.boxplot(column='Predicted Grade', figsize=(10, 30))
data.boxplot(column='Difference', figsize=(10, 30))
Let's see how many predicted grades were correct.
# Boolean mask: True where the predicted grade equals the actual grade.
correctly_predicted = data['Difference'].eq(0)
correctly_predicted.sum()
# Report the number of exact predictions against the number of entries.
print(
    'Just '
    + str(correctly_predicted.sum())
    + ' out of '
    + str(data['Actual Grade'].count())
    + ' were correctly predicted!'
)
# Parse the submission timestamps so they can be compared chronologically.
data['Timestamp'] = pd.to_datetime(data['Timestamp'])

# Cutoff separating the two subsets. This is the ISO spelling of the original
# '07-06-2017 14:00:00', which pandas parses month-first (6 July 2017), so the
# parsed instant is unchanged but no longer ambiguous to the reader.
official_release_cutoff = '2017-07-06 14:00:00'

# Entries submitted strictly before the cutoff.
before_offical_release_date = data[data['Timestamp'] < official_release_cutoff]
before_offical_release_date.describe()

# Entries submitted at or after the cutoff. Using >= means a row stamped
# exactly at the cutoff is no longer dropped from both subsets.
after_offical_release_date = data[data['Timestamp'] >= official_release_cutoff]
after_offical_release_date.describe()

# sort_values returns a sorted copy; `data` itself keeps its original order.
data.sort_values('Difference', ascending=True)
# Describes only the first row of `data` in its current (unsorted) order.
data[:1].describe()
# Running mean of 'Actual Grade' in entry order: element i is the mean of the
# first i + 1 entries. expanding().mean() computes this in a single pass
# instead of re-averaging a growing slice on every iteration.
mot = data['Actual Grade'].expanding().mean().tolist()
# Rebuild as a Series with a fresh 0..n-1 index so the x axis is the entry number.
mean_over_time = pd.Series(mot)
mean_over_time.plot(kind='area', ylim=(0, 45), y='Grades', figsize=(20, 10))
plt.xlabel('Data Entry #')
plt.ylabel('Mean Grade')
# Red vertical marker at x = 213.5 spanning the full y range.
# NOTE(review): presumably the boundary between entries made before and after
# the official release; confirm where 213.5 comes from.
# The colour is given once, via the format string, to avoid matplotlib's
# "color is redundantly defined" warning.
plt.plot([213.5, 213.5], [0, 45], 'r-', lw=2)
# Load the aggregated results table; its 'Grade' and 'Frequency' columns are
# used for the summary statistics computed further down.
ib_m16_results = pd.read_csv(
    './ib_m16_results.csv',
    sep=',',
)
# Display up to the first 46 rows.
ib_m16_results.head(46)
# Note: some frequencies are rounded where the frequency is below 10, or
# assumed to be 5 (as for a grade of 1).
def descriptives_from_agg(values, freqs):
    """Compute describe()-style statistics from a value/frequency table.

    Each entry of ``values`` is treated as occurring ``freqs[i]`` times. The
    pairs may be given in any order; they are sorted by value internally
    because the quartile lookup walks the cumulative frequencies.

    Args:
        values: 1-D sequence of distinct observed values.
        freqs: 1-D sequence of the same length holding the number of
            occurrences of each value.

    Returns:
        A ``pd.Series`` indexed by ``count``, ``mean``, ``std``, ``min``,
        ``25%``, ``50%``, ``75%`` and ``max``. ``std`` is the sample standard
        deviation (n - 1 denominator) and is NaN when the total count is at
        most 1. ``min`` and ``max`` are taken over every supplied value,
        including values whose frequency is zero. Each quartile is the first
        value whose cumulative frequency reaches that fraction of the total
        count (no interpolation).

    Raises:
        ValueError: If ``values`` and ``freqs`` differ in shape, or are empty.
    """
    values = np.asarray(values)
    freqs = np.asarray(freqs)
    if values.shape != freqs.shape:
        raise ValueError('values and freqs must have the same shape')

    # The cumulative-frequency lookup below is only meaningful when the
    # values ascend, so order both arrays by value first.
    order = np.argsort(values, kind='stable')
    values = values[order]
    freqs = freqs[order]

    count = freqs.sum()
    mean = (values * freqs).sum() / count

    if count > 1:
        # Sum of squared deviations avoids the cancellation of the
        # E[x^2] - mean^2 form, which can go slightly negative in floats.
        squared_deviations = freqs * (values - mean) ** 2
        std = np.sqrt(squared_deviations.sum() / (count - 1))
    else:
        # Sample standard deviation is undefined for fewer than two observations.
        std = np.nan

    minimum = np.min(values)
    maximum = np.max(values)

    cumcount = np.cumsum(freqs)
    Q1, Q2, Q3 = (
        values[np.searchsorted(cumcount, fraction * count)]
        for fraction in (0.25, 0.50, 0.75)
    )

    return pd.Series(
        [count, mean, std, minimum, Q1, Q2, Q3, maximum],
        index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'],
    )
# Summary statistics rebuilt from the aggregated Grade/Frequency table.
descriptives_from_agg(
    values=ib_m16_results['Grade'],
    freqs=ib_m16_results['Frequency'],
)
# Summary statistics of the survey data, for comparison.
data.describe()

# Histogram of the prediction error with one bin per grade point of difference.
data['Difference'].hist(
    bins=int(data['Difference'].max() - data['Difference'].min()),
    edgecolor='000',
    figsize=(20, 10),
)
plt.xlabel('Difference')
plt.ylabel('Frequency')
# Number of entries whose predicted grade is exactly 44.
# (The mask name is reused here; it does not mean "correct" for these counts.)
correctly_predicted = data['Predicted Grade'].eq(44)
correctly_predicted.sum()
# Number of entries whose actual grade is exactly one point above the prediction.
correctly_predicted = data['Difference'].eq(1)
correctly_predicted.sum()