In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
# Load the raw data from the CSV file that sits next to the notebook.
data = pd.read_csv('./Raw_Data.csv', sep=',')
# Display (rows, columns) of the unfiltered data.
data.shape
Out[1]:
(492, 3)

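Apply a filter to remove troll submissions: keep only rows where both the actual and the predicted grade are greater than 12 and at most 45, and where the difference (actual − predicted) is at most 20.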

In [2]:
# Signed prediction error: positive when the actual grade beat the prediction.
data['Difference'] = data['Actual Grade'] - data['Predicted Grade']

# Keep only plausible submissions: both grades in (12, 45] and Difference <= 20.
# NOTE(review): the Difference bound is one-sided, so large negative
# differences are kept -- confirm whether abs(Difference) was intended.
actual_in_range = (data['Actual Grade'] > 12) & (data['Actual Grade'] <= 45)
predicted_in_range = (data['Predicted Grade'] > 12) & (data['Predicted Grade'] <= 45)
difference_in_range = data['Difference'] <= 20

# .copy() makes the filtered frame independent of the raw one, so later column
# assignments on `data` do not raise SettingWithCopyWarning.
data = data[actual_in_range & predicted_in_range & difference_in_range].copy()

# Show floats with thousands separators and three decimals in every table.
pd.options.display.float_format = '{:,.3f}'.format
In [3]:
# Summary statistics of the numeric columns after filtering.
data.describe()
Out[3]:
Actual Grade Predicted Grade Difference
count 451.000 451.000 451.000
mean 35.882 37.224 -1.341
std 6.033 5.447 4.871
min 18.000 19.000 -24.000
25% 32.000 34.000 -3.000
50% 37.000 38.000 -1.000
75% 40.000 42.000 1.000
max 45.000 45.000 19.000
In [4]:
# Most frequent actual grade(s); mode() returns a Series because ties are possible.
data['Actual Grade'].mode()
Out[4]:
0    39
dtype: int64
In [5]:
# Most frequent predicted grade(s); mode() returns a Series because ties are possible.
data['Predicted Grade'].mode()
Out[5]:
0   38.000
dtype: float64
In [6]:
# Most frequent value(s) of Difference (actual minus predicted grade).
data['Difference'].mode()
Out[6]:
0   -1.000
dtype: float64
In [7]:
# Pairwise Pearson correlation of the numeric columns only. The Timestamp
# column is not numeric at this point, and pandas >= 2.0 raises on non-numeric
# columns instead of silently dropping them, so select the numeric ones explicitly.
data.select_dtypes(include='number').corr()
Out[7]:
Actual Grade Predicted Grade Difference
Actual Grade 1.000 0.644 0.518
Predicted Grade 0.644 1.000 -0.320
Difference 0.518 -0.320 1.000
In [8]:
# One bin per integer grade: edges sit on the half-integers so that every grade,
# including the highest one, gets its own bar. (Using `max - min` as the bin
# count yields one bin too few and merges the top two grades into the last bar.)
actual_bin_edges = np.arange(np.floor(data['Actual Grade'].min()),
                             np.ceil(data['Actual Grade'].max()) + 2) - 0.5
ax = data.hist(column='Actual Grade', figsize=(20,10), bins=actual_bin_edges, edgecolor='000')
plt.ylabel('Frequency')
plt.xlabel('Grade')
Out[8]:
<matplotlib.text.Text at 0x115baa0f0>
In [9]:
# One bin per integer grade: edges sit on the half-integers so that every grade,
# including the highest one, gets its own bar. (Using `max - min` as the bin
# count yields one bin too few and merges the top two grades into the last bar.)
predicted_bin_edges = np.arange(np.floor(data['Predicted Grade'].min()),
                                np.ceil(data['Predicted Grade'].max()) + 2) - 0.5
data.hist(column='Predicted Grade', figsize=(20,10), bins=predicted_bin_edges, edgecolor='000')
plt.ylabel('Frequency')
plt.xlabel('Grade')
Out[9]:
<matplotlib.text.Text at 0x1160dafd0>
In [10]:
# Box plot of the actual grades.
data.boxplot(column='Actual Grade', figsize=(10,30))
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x116435e10>
In [11]:
# Box plot of the predicted grades.
data.boxplot(column='Predicted Grade', figsize=(10,30))
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x1164ddac8>
In [12]:
# Box plot of Difference (actual minus predicted grade).
data.boxplot(column='Difference', figsize=(10,30))
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x11681f208>

Let's see how many predicted grades were correct.

In [13]:
# Boolean mask of the rows whose prediction matched the actual grade exactly.
correctly_predicted = data['Difference'] == 0
n_correct = correctly_predicted.sum()
n_total = data['Actual Grade'].count()
# sep='' glues the pieces together without the default single-space separator.
print('Just ', n_correct, ' out of ', n_total, ' were correctly predicted!', sep='')
Just 62 out of 451 were correctly predicted!
In [14]:
# Parse the timestamp column to datetimes so rows can be compared chronologically.
data['Timestamp'] = pd.to_datetime(data['Timestamp'])
# Cutoff written in ISO order (6 July 2017, 14:00) so it cannot be misread as
# 7 June; the former '07-06-2017' literal depended on month-first parsing.
release_cutoff = pd.Timestamp('2017-07-06 14:00:00')
# Rows submitted strictly before the cutoff.
before_offical_release_date = data[data['Timestamp'] < release_cutoff]
before_offical_release_date.describe()
Out[14]:
Actual Grade Predicted Grade Difference
count 213.000 213.000 213.000
mean 35.178 37.488 -2.310
std 6.815 5.344 5.742
min 18.000 21.000 -24.000
25% 31.000 35.000 -5.000
50% 37.000 38.000 -2.000
75% 41.000 42.000 0.000
max 45.000 45.000 19.000
In [15]:
# Rows submitted at or after the cutoff (6 July 2017, 14:00, written in ISO
# order to avoid day/month ambiguity). `>=` makes this the exact complement of
# the "before" subset, which uses `<`; with a strict `>` a row stamped exactly
# at the cutoff would belong to neither group.
after_offical_release_date = data[data['Timestamp'] >= pd.Timestamp('2017-07-06 14:00:00')]
after_offical_release_date.describe()
Out[15]:
Actual Grade Predicted Grade Difference
count 238.000 238.000 238.000
mean 36.513 36.987 -0.475
std 5.169 5.538 3.737
min 22.000 19.000 -22.000
25% 33.000 33.250 -2.000
50% 37.000 38.000 0.000
75% 40.000 41.000 2.000
max 45.000 45.000 13.000
In [16]:
# All rows ordered by Difference, from the most negative (prediction far above
# the actual grade) to the most positive (actual grade far above the prediction).
data.sort_values('Difference', ascending=True)
Out[16]:
Timestamp Actual Grade Predicted Grade Difference
38 2017-07-05 18:01:46 21 45.000 -24.000
406 2017-07-07 06:40:08 23 45.000 -22.000
73 2017-07-05 19:12:17 24 45.000 -21.000
33 2017-07-05 18:01:18 23 43.000 -20.000
21 2017-07-05 17:59:10 19 38.000 -19.000
17 2017-07-05 17:57:47 23 41.000 -18.000
14 2017-07-05 17:56:48 24 42.000 -18.000
31 2017-07-05 18:01:09 20 36.000 -16.000
118 2017-07-05 21:37:05 24 39.000 -15.000
23 2017-07-05 17:59:23 21 35.000 -14.000
13 2017-07-05 17:56:36 22 35.000 -13.000
22 2017-07-05 17:59:18 24 37.000 -13.000
30 2017-07-05 18:01:01 22 34.000 -12.000
36 2017-07-05 18:01:36 22 34.000 -12.000
37 2017-07-05 18:01:43 21 32.000 -11.000
377 2017-07-06 22:41:19 27 38.000 -11.000
2 2017-07-05 17:45:44 26 37.000 -11.000
378 2017-07-06 22:42:00 30 41.000 -11.000
20 2017-07-05 17:58:58 22 33.000 -11.000
99 2017-07-05 20:42:57 25 35.000 -10.000
299 2017-07-06 16:39:22 28 38.000 -10.000
16 2017-07-05 17:57:33 22 32.000 -10.000
64 2017-07-05 19:05:02 27 37.000 -10.000
12 2017-07-05 17:56:23 24 34.000 -10.000
390 2017-07-07 02:11:10 29 39.000 -10.000
75 2017-07-05 19:14:03 26 36.000 -10.000
157 2017-07-06 00:37:22 29 39.000 -10.000
374 2017-07-06 21:40:58 29 38.000 -9.000
44 2017-07-05 18:14:59 27 36.000 -9.000
84 2017-07-05 19:56:20 29 38.000 -9.000
... ... ... ... ...
401 2017-07-07 04:49:24 40 36.000 4.000
122 2017-07-05 21:50:35 33 28.000 5.000
214 2017-07-06 09:43:22 29 24.000 5.000
221 2017-07-06 10:38:39 29 24.000 5.000
280 2017-07-06 15:33:36 39 34.000 5.000
296 2017-07-06 16:28:13 33 28.000 5.000
474 2017-07-11 00:19:14 43 38.000 5.000
262 2017-07-06 14:51:23 38 33.000 5.000
327 2017-07-06 17:44:55 39 34.000 5.000
117 2017-07-05 21:31:13 44 39.000 5.000
232 2017-07-06 12:51:27 34 29.000 5.000
445 2017-07-08 09:58:35 43 37.000 6.000
124 2017-07-05 21:58:30 32 26.000 6.000
313 2017-07-06 17:15:34 29 23.000 6.000
314 2017-07-06 17:15:59 29 23.000 6.000
83 2017-07-05 19:54:05 44 38.000 6.000
213 2017-07-06 09:35:46 32 26.000 6.000
395 2017-07-07 02:51:23 35 28.000 7.000
291 2017-07-06 16:08:16 35 28.000 7.000
432 2017-07-07 15:17:30 39 32.000 7.000
4 2017-07-05 17:50:52 42 35.000 7.000
238 2017-07-06 14:02:05 36 29.000 7.000
113 2017-07-05 21:19:48 42 33.000 9.000
255 2017-07-06 14:39:52 39 30.000 9.000
7 2017-07-05 17:51:37 42 33.000 9.000
476 2017-07-11 14:29:50 31 19.000 12.000
330 2017-07-06 17:47:48 37 24.000 13.000
179 2017-07-06 04:23:28 45 32.000 13.000
101 2017-07-05 20:50:23 45 27.000 18.000
55 2017-07-05 18:51:16 42 23.000 19.000

451 rows × 4 columns

In [17]:
# Summary statistics of the first row only; with a single observation the
# standard deviation is NaN and every other statistic equals that row's value.
data[:1].describe()
Out[17]:
Actual Grade Predicted Grade Difference
count 1.000 1.000 1.000
mean 37.000 40.000 -3.000
std nan nan nan
min 37.000 40.000 -3.000
25% 37.000 40.000 -3.000
50% 37.000 40.000 -3.000
75% 37.000 40.000 -3.000
max 37.000 40.000 -3.000
In [18]:
# Running mean of the actual grade in row order. expanding().mean() computes it
# in a single pass instead of re-averaging a growing slice for every row, and
# reset_index(drop=True) re-labels the result 0..n-1 so the x-axis is the entry
# number rather than the original (gappy, post-filter) row label.
mean_over_time = data['Actual Grade'].expanding().mean().reset_index(drop=True)
# Plain-list copy of the running mean, kept under its original name.
mot = mean_over_time.tolist()

mean_over_time.plot(kind='area', ylim=(0,45), figsize=(20,10))
plt.xlabel('Data Entry #')
plt.ylabel('Mean Grade')
# Red vertical marker at x = 213.5.
# NOTE(review): the position is hard-coded; it appears to mark the end of the
# 213 entries submitted before the release cutoff and assumes the rows are in
# chronological order -- confirm, and update it if the filter changes.
plt.plot([213.5, 213.5], [0, 45], '-', lw=2, color='r')
Out[18]:
[<matplotlib.lines.Line2D at 0x116807550>]
In [19]:
# Load the aggregate grade/frequency table from the CSV next to the notebook.
ib_m16_results = pd.read_csv('./ib_m16_results.csv', sep=',')
# Show up to the first 46 rows (the table is shorter than that, so all of it is displayed).
ib_m16_results.head(46)
# Note: some frequencies are approximate -- a frequency below 10 was either
# rounded or assumed to be 5 (as for a grade of 1).
Out[19]:
Grade Frequency
0 1 5
1 2 11
2 3 29
3 4 39
4 5 37
5 6 40
6 7 41
7 8 67
8 9 68
9 10 97
10 11 145
11 12 173
12 13 232
13 14 307
14 15 376
15 16 515
16 17 652
17 18 851
18 19 1060
19 20 1330
20 21 1589
21 22 2043
22 23 2349
23 24 3006
24 25 3285
25 26 3605
26 27 3788
27 28 4165
28 29 4309
29 30 4454
30 31 4248
31 32 4294
32 33 3992
33 34 3730
34 35 3530
35 36 3320
36 37 2830
37 38 2441
38 39 2013
39 40 1662
40 41 1325
41 42 992
42 43 820
43 44 430
44 45 201
In [20]:
def descriptives_from_agg(values, freqs):
    """Describe a frequency table the way ``Series.describe()`` describes raw data.

    Parameters
    ----------
    values : array-like
        The distinct observed values (for example grades), in any order.
    freqs : array-like
        Non-negative frequency of each entry of ``values``; must have the
        same shape as ``values``.

    Returns
    -------
    pandas.Series
        Indexed by ``count, mean, std, min, 25%, 50%, 75%, max``.
        ``std`` is the sample standard deviation (ddof = 1) and is NaN when
        ``count`` is 1 or less. Each quartile is the smallest value whose
        cumulative frequency reaches that fraction of ``count`` (no
        interpolation). When ``count`` is 0 every statistic except ``count``
        is NaN.

    Raises
    ------
    ValueError
        If ``values`` and ``freqs`` differ in shape, or a frequency is negative.
    """
    values = np.asarray(values)
    freqs = np.asarray(freqs)
    if values.shape != freqs.shape:
        raise ValueError('values and freqs must have the same shape')
    if (freqs < 0).any():
        raise ValueError('freqs must be non-negative')

    labels = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']

    # Values that never occur must not influence min, max or the quartiles.
    observed = freqs > 0
    values = values[observed]
    freqs = freqs[observed]

    # The cumulative-count lookup below is only valid for ascending values.
    order = np.argsort(values, kind='stable')
    values = values[order]
    freqs = freqs[order]

    count = freqs.sum()
    if count == 0:
        return pd.Series([0.0] + [np.nan] * 7, index=labels)

    mean = (values * freqs).sum() / count

    if count > 1:
        # Sum of squared deviations from the mean: avoids the cancellation
        # error of E[x^2] - mean^2 when the spread is small relative to the mean.
        std = np.sqrt((freqs * (values - mean) ** 2).sum() / (count - 1))
    else:
        std = np.nan

    # First value whose cumulative frequency reaches q * count, for each quartile.
    cumcount = np.cumsum(freqs)
    targets = [0.25 * count, 0.50 * count, 0.75 * count]
    q1, q2, q3 = values[np.searchsorted(cumcount, targets)]

    return pd.Series([count, mean, std, values[0], q1, q2, q3, values[-1]],
                     index=labels)


# Descriptive statistics of the grade distribution stored as a Grade/Frequency table.
descriptives_from_agg(ib_m16_results['Grade'], ib_m16_results['Frequency'])
Out[20]:
count   74,496.000
mean        29.952
std          6.717
min          1.000
25%         26.000
50%         30.000
75%         35.000
max         45.000
dtype: float64
In [21]:
# Summary statistics of the filtered data again -- presumably repeated here for
# comparison with the aggregate statistics of ib_m16_results.
data.describe()
Out[21]:
Actual Grade Predicted Grade Difference
count 451.000 451.000 451.000
mean 35.882 37.224 -1.341
std 6.033 5.447 4.871
min 18.000 19.000 -24.000
25% 32.000 34.000 -3.000
50% 37.000 38.000 -1.000
75% 40.000 42.000 1.000
max 45.000 45.000 19.000
In [22]:
# One bin per integer difference: edges sit on the half-integers so that every
# value, including the largest, gets its own bar. (Using `max - min` as the bin
# count yields one bin too few and merges the top two values into the last bar.)
difference_bin_edges = np.arange(np.floor(data['Difference'].min()),
                                 np.ceil(data['Difference'].max()) + 2) - 0.5
data['Difference'].hist(figsize=(20,10), bins=difference_bin_edges, edgecolor='000')
plt.ylabel('Frequency')
plt.xlabel('Difference')
Out[22]:
<matplotlib.text.Text at 0x116b53898>
In [23]:
# Count the rows whose predicted grade is exactly 44.
# NOTE(review): the name `correctly_predicted` is reused here although this
# mask does not test whether the prediction was correct.
correctly_predicted=(data['Predicted Grade']==44)
correctly_predicted.sum()
Out[23]:
17
In [24]:
# Count the rows where the actual grade exceeded the prediction by exactly 1.
# NOTE(review): the name `correctly_predicted` is reused here although this
# mask selects predictions that were off by one, not correct ones.
correctly_predicted=(data['Difference']==1)
correctly_predicted.sum()
Out[24]:
35