import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
# Load the survey responses (comma-separated, which is read_csv's default)
# and show the table's (rows, columns) dimensions.
data = pd.read_csv('./Raw_Data.csv')
data.shape

(492, 3)

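Filter out troll submissions: keep only rows where both grades are above 12 and at most 45, and where the difference is at most 20.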

# Signed prediction error: positive when the actual grade exceeds the prediction.
data['Difference'] = data['Actual Grade'] - data['Predicted Grade']

# Keep only plausible submissions: both grades strictly above 12 and at most 45.
# NOTE(review): the difference is capped on the positive side only (<= 20), so
# large negative differences are kept -- confirm this asymmetry is intended.
# .copy() makes the filtered frame independent of the unfiltered one, so later
# column assignments on `data` cannot trigger pandas' chained-assignment warning.
data = data[
    (data['Actual Grade'] > 12) & (data['Actual Grade'] <= 45)
    & (data['Predicted Grade'] > 12) & (data['Predicted Grade'] <= 45)
    & (data['Difference'] <= 20)
].copy()

# Show floats with thousands separators and three decimals in all output.
pd.options.display.float_format = '{:,.3f}'.format

# Summary statistics of the filtered data.
data.describe()

# Most frequently reported actual grade.
data['Actual Grade'].mode()

0    39
dtype: int64

# Most frequently reported predicted grade.
data['Predicted Grade'].mode()

0   38.000
dtype: float64

# Most frequent value of (actual - predicted).
data['Difference'].mode()

0   -1.000
dtype: float64

# Pairwise Pearson correlations between the numeric columns. The columns are
# selected explicitly because `data` still holds the text Timestamp column at
# this point, and DataFrame.corr() on a frame with non-numeric columns raises
# on recent pandas versions (numeric_only no longer defaults to dropping them).
data[['Actual Grade', 'Predicted Grade', 'Difference']].corr()

# Histogram of actual grades with exactly one bar per integer grade.
# Passing a bin *count* of (max - min) makes the right-most bin closed on both
# sides, so the two highest grades would be merged into a single bar. Explicit
# unit-width edges centred on each integer avoid that.
actual_grade_edges = np.arange(data['Actual Grade'].min(),
                               data['Actual Grade'].max() + 2) - 0.5
ax = data.hist(column='Actual Grade', figsize=(20, 10),
               bins=actual_grade_edges, edgecolor='000')
plt.ylabel('Frequency')
plt.xlabel('Grade')

<matplotlib.text.Text at 0x115baa0f0>

# Histogram of predicted grades with one bar per integer grade.
# A bin *count* of (max - min) would leave the right-most bin closed on both
# sides and merge the two highest grades into one bar, so build unit-width
# edges centred on each integer instead. floor/ceil keep the edges valid for
# this float-typed column.
predicted_grade_edges = np.arange(np.floor(data['Predicted Grade'].min()),
                                  np.ceil(data['Predicted Grade'].max()) + 2) - 0.5
data.hist(column='Predicted Grade', figsize=(20, 10),
          bins=predicted_grade_edges, edgecolor='000')
plt.ylabel('Frequency')
plt.xlabel('Grade')

<matplotlib.text.Text at 0x1160dafd0>

# Box plot of the actual grades (tall 10x30 figure).
data.boxplot(column='Actual Grade', figsize=(10,30))

<matplotlib.axes._subplots.AxesSubplot at 0x116435e10>

# Box plot of the predicted grades (tall 10x30 figure).
data.boxplot(column='Predicted Grade', figsize=(10,30))

<matplotlib.axes._subplots.AxesSubplot at 0x1164ddac8>

# Box plot of (actual - predicted) (tall 10x30 figure).
data.boxplot(column='Difference', figsize=(10,30))

<matplotlib.axes._subplots.AxesSubplot at 0x11681f208>

Let's see how many predicted grades were correct.

# Boolean mask of rows whose prediction matched the actual grade exactly.
correctly_predicted = data['Difference'].eq(0)

# Report the number of exact matches against the number of recorded actual grades.
print('Just {} out of {} were correctly predicted!'.format(
    correctly_predicted.sum(), data['Actual Grade'].count()))

Just 62 out of 451 were correctly predicted!

# Parse the submission times so they can be compared chronologically.
data['Timestamp'] = pd.to_datetime(data['Timestamp'])

# Cutoff written in unambiguous ISO order (6 July 2017, 14:00); the previous
# '07-06-2017' literal only meant July because pandas parses month-first.
release_time = pd.Timestamp('2017-07-06 14:00:00')
before_offical_release_date = data[data['Timestamp'] < release_time]
before_offical_release_date.describe()

# >= (not >) so a submission made exactly at the cutoff is not dropped from
# both groups.
after_offical_release_date = data[data['Timestamp'] >= pd.Timestamp('2017-07-06 14:00:00')]
after_offical_release_date.describe()

# Display the rows ordered from the most negative to the most positive
# difference (the result is only shown, `data` itself is not reordered).
data.sort_values('Difference', ascending=True)

# Summary statistics of the first row only.
data[:1].describe()

# Running mean of the actual grade after each successive entry. expanding()
# computes this in a single pass, replacing a loop that re-averaged the whole
# prefix for every row; the index is reset so the x axis is the entry number.
mean_over_time = data['Actual Grade'].expanding().mean().reset_index(drop=True)
mot = mean_over_time.tolist()  # kept as a plain list for any later use

mean_over_time.plot(kind='area', ylim=(0, 45), figsize=(20, 10))
plt.xlabel('Data Entry #')
plt.ylabel('Mean Grade')
# Red vertical marker. The format string no longer names a colour, so it does
# not conflict with the explicit color='r'.
# NOTE(review): 213.5 is hard-coded; it presumably separates the 213 entries
# submitted before the official release from the later ones -- confirm, and
# recompute if the data or the filter changes.
plt.plot([213.5, 213.5], [0, 45], '-', lw=2, color='r')

[<matplotlib.lines.Line2D at 0x116807550>]

# Load the grade/frequency table (comma-separated, read_csv's default) and
# display up to its first 46 rows.
ib_m16_results = pd.read_csv('./ib_m16_results.csv')
ib_m16_results.head(n=46)
# Note: some frequencies are rounded when below 10, or assumed to be 5 (as for a grade of 1).

def descriptives_from_agg(values, freqs):
    """Compute ``describe()``-style statistics from a frequency table.

    Parameters
    ----------
    values : array-like
        The distinct observed values, in any order.
    freqs : array-like
        Number of observations of each entry of ``values`` (same length).

    Returns
    -------
    pandas.Series
        Indexed by 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'.
        'std' is the sample standard deviation (n - 1 denominator) and is NaN
        when there are fewer than two observations. Each quartile is the
        smallest value whose cumulative frequency reaches that fraction of
        the total count; no interpolation is performed.

    Raises
    ------
    ValueError
        If the inputs are empty or are not one-dimensional and equally long.
    """
    values = np.asarray(values)
    freqs = np.asarray(freqs)
    if values.ndim != 1 or values.shape != freqs.shape:
        raise ValueError('values and freqs must be 1-D sequences of equal length')
    if values.size == 0:
        raise ValueError('values and freqs must not be empty')

    # The cumulative-frequency lookup below is only meaningful when the values
    # are ascending, so sort both arrays together instead of assuming it.
    order = np.argsort(values, kind='mergesort')
    values = values[order]
    freqs = freqs[order]

    count = freqs.sum()
    mean = (values * freqs).sum() / count

    if count > 1:
        # Weighted squared deviations: unlike E[x^2] - mean^2 this cannot turn
        # slightly negative through rounding and produce a NaN from sqrt.
        variance = (freqs * (values - mean) ** 2).sum() / (count - 1)
        std = np.sqrt(variance)
    else:
        std = np.nan

    minimum = values[0]
    maximum = values[-1]

    cumcount = np.cumsum(freqs)
    Q1, Q2, Q3 = (values[np.searchsorted(cumcount, fraction * count)]
                  for fraction in (0.25, 0.50, 0.75))

    result = pd.Series([count, mean, std, minimum, Q1, Q2, Q3, maximum],
                       index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
    return result


# Summary statistics of the loaded grade/frequency table.
descriptives_from_agg(ib_m16_results['Grade'], ib_m16_results['Frequency'])

count   74,496.000
mean        29.952
std          6.717
min          1.000
25%         26.000
50%         30.000
75%         35.000
max         45.000
dtype: float64

# Summary statistics of the survey data, shown again for comparison.
data.describe()

# Histogram of (actual - predicted) with one bar per integer difference.
# A bin *count* of (max - min) would leave the right-most bin closed on both
# sides and merge the two largest differences into one bar, so build
# unit-width edges centred on each integer instead.
difference_edges = np.arange(np.floor(data['Difference'].min()),
                             np.ceil(data['Difference'].max()) + 2) - 0.5
data['Difference'].hist(figsize=(20, 10), bins=difference_edges, edgecolor='000')
plt.ylabel('Frequency')
plt.xlabel('Difference')

<matplotlib.text.Text at 0x116b53898>

# Count the rows whose predicted grade is 44.
# NOTE(review): this reuses the name `correctly_predicted` although the mask
# is not about correct predictions; a clearer name may be worth introducing.
correctly_predicted = data['Predicted Grade'].eq(44)
correctly_predicted.sum()

17

# Count the rows where the actual grade is exactly one above the prediction.
# NOTE(review): this reuses the name `correctly_predicted` although the mask
# is not about correct predictions; a clearer name may be worth introducing.
correctly_predicted = data['Difference'].eq(1)
correctly_predicted.sum()

35

	Actual Grade	Predicted Grade	Difference
count	451.000	451.000	451.000
mean	35.882	37.224	-1.341
std	6.033	5.447	4.871
min	18.000	19.000	-24.000
25%	32.000	34.000	-3.000
50%	37.000	38.000	-1.000
75%	40.000	42.000	1.000
max	45.000	45.000	19.000

	Actual Grade	Predicted Grade	Difference
count	213.000	213.000	213.000
mean	35.178	37.488	-2.310
std	6.815	5.344	5.742
min	18.000	21.000	-24.000
25%	31.000	35.000	-5.000
50%	37.000	38.000	-2.000
75%	41.000	42.000	0.000
max	45.000	45.000	19.000

	Actual Grade	Predicted Grade	Difference
count	238.000	238.000	238.000
mean	36.513	36.987	-0.475
std	5.169	5.538	3.737
min	22.000	19.000	-22.000
25%	33.000	33.250	-2.000
50%	37.000	38.000	0.000
75%	40.000	41.000	2.000
max	45.000	45.000	13.000

	Timestamp	Actual Grade	Predicted Grade	Difference
38	2017-07-05 18:01:46	21	45.000	-24.000
406	2017-07-07 06:40:08	23	45.000	-22.000
73	2017-07-05 19:12:17	24	45.000	-21.000
33	2017-07-05 18:01:18	23	43.000	-20.000
21	2017-07-05 17:59:10	19	38.000	-19.000
17	2017-07-05 17:57:47	23	41.000	-18.000
14	2017-07-05 17:56:48	24	42.000	-18.000
31	2017-07-05 18:01:09	20	36.000	-16.000
118	2017-07-05 21:37:05	24	39.000	-15.000
23	2017-07-05 17:59:23	21	35.000	-14.000
13	2017-07-05 17:56:36	22	35.000	-13.000
22	2017-07-05 17:59:18	24	37.000	-13.000
30	2017-07-05 18:01:01	22	34.000	-12.000
36	2017-07-05 18:01:36	22	34.000	-12.000
37	2017-07-05 18:01:43	21	32.000	-11.000
377	2017-07-06 22:41:19	27	38.000	-11.000
2	2017-07-05 17:45:44	26	37.000	-11.000
378	2017-07-06 22:42:00	30	41.000	-11.000
20	2017-07-05 17:58:58	22	33.000	-11.000
99	2017-07-05 20:42:57	25	35.000	-10.000
299	2017-07-06 16:39:22	28	38.000	-10.000
16	2017-07-05 17:57:33	22	32.000	-10.000
64	2017-07-05 19:05:02	27	37.000	-10.000
12	2017-07-05 17:56:23	24	34.000	-10.000
390	2017-07-07 02:11:10	29	39.000	-10.000
75	2017-07-05 19:14:03	26	36.000	-10.000
157	2017-07-06 00:37:22	29	39.000	-10.000
374	2017-07-06 21:40:58	29	38.000	-9.000
44	2017-07-05 18:14:59	27	36.000	-9.000
84	2017-07-05 19:56:20	29	38.000	-9.000
...	...	...	...	...
401	2017-07-07 04:49:24	40	36.000	4.000
122	2017-07-05 21:50:35	33	28.000	5.000
214	2017-07-06 09:43:22	29	24.000	5.000
221	2017-07-06 10:38:39	29	24.000	5.000
280	2017-07-06 15:33:36	39	34.000	5.000
296	2017-07-06 16:28:13	33	28.000	5.000
474	2017-07-11 00:19:14	43	38.000	5.000
262	2017-07-06 14:51:23	38	33.000	5.000
327	2017-07-06 17:44:55	39	34.000	5.000
117	2017-07-05 21:31:13	44	39.000	5.000
232	2017-07-06 12:51:27	34	29.000	5.000
445	2017-07-08 09:58:35	43	37.000	6.000
124	2017-07-05 21:58:30	32	26.000	6.000
313	2017-07-06 17:15:34	29	23.000	6.000
314	2017-07-06 17:15:59	29	23.000	6.000
83	2017-07-05 19:54:05	44	38.000	6.000
213	2017-07-06 09:35:46	32	26.000	6.000
395	2017-07-07 02:51:23	35	28.000	7.000
291	2017-07-06 16:08:16	35	28.000	7.000
432	2017-07-07 15:17:30	39	32.000	7.000
4	2017-07-05 17:50:52	42	35.000	7.000
238	2017-07-06 14:02:05	36	29.000	7.000
113	2017-07-05 21:19:48	42	33.000	9.000
255	2017-07-06 14:39:52	39	30.000	9.000
7	2017-07-05 17:51:37	42	33.000	9.000
476	2017-07-11 14:29:50	31	19.000	12.000
330	2017-07-06 17:47:48	37	24.000	13.000
179	2017-07-06 04:23:28	45	32.000	13.000
101	2017-07-05 20:50:23	45	27.000	18.000
55	2017-07-05 18:51:16	42	23.000	19.000

	Actual Grade	Predicted Grade	Difference
count	1.000	1.000	1.000
mean	37.000	40.000	-3.000
std	nan	nan	nan
min	37.000	40.000	-3.000
25%	37.000	40.000	-3.000
50%	37.000	40.000	-3.000
75%	37.000	40.000	-3.000
max	37.000	40.000	-3.000

	Actual Grade	Predicted Grade	Difference
Actual Grade	1.000	0.644	0.518
Predicted Grade	0.644	1.000	-0.320
Difference	0.518	-0.320	1.000

	Grade	Frequency
0	1	5
1	2	11
2	3	29
3	4	39
4	5	37
5	6	40
6	7	41
7	8	67
8	9	68
9	10	97
10	11	145
11	12	173
12	13	232
13	14	307
14	15	376
15	16	515
16	17	652
17	18	851
18	19	1060
19	20	1330
20	21	1589
21	22	2043
22	23	2349
23	24	3006
24	25	3285
25	26	3605
26	27	3788
27	28	4165
28	29	4309
29	30	4454
30	31	4248
31	32	4294
32	33	3992
33	34	3730
34	35	3530
35	36	3320
36	37	2830
37	38	2441
38	39	2013
39	40	1662
40	41	1325
41	42	992
42	43	820
43	44	430
44	45	201