with open("Colors.txt", 'rb') as open_file:
    print 'Colors.txt content:\n' + open_file.read()
Colors.txt content:
Color Value
Red 1
Orange 2
Yellow 3
Green 4
Blue 5
Purple 6
Black 7
White 8
with open("Colors.txt", 'rb') as open_file:
    for observation in open_file:
        print 'Reading Data: ' + observation
Reading Data: Color Value
Reading Data: Red 1
Reading Data: Orange 2
Reading Data: Yellow 3
Reading Data: Green 4
Reading Data: Blue 5
Reading Data: Purple 6
Reading Data: Black 7
Reading Data: White 8
n = 3
with open("Colors.txt", 'rb') as open_file:
    for j, observation in enumerate(open_file):
        if j % n == 0:
            print('Reading Line: ' + str(j) +
                  ' Content: ' + observation)
Reading Line: 0 Content: Color Value
Reading Line: 3 Content: Yellow 3
Reading Line: 6 Content: Purple 6
from random import random
sample_size = 0.25
with open("Colors.txt", 'rb') as open_file:
    for j, observation in enumerate(open_file):
        if random() <= sample_size:
            print('Reading Line: ' + str(j) +
                  ' Content: ' + observation)
Reading Line: 2 Content: Orange 2
Reading Line: 3 Content: Yellow 3
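Because random() draws fresh values on every run, the lines selected above change each time. A small sketch (not part of the original listing): seeding the generator before the loop makes the sample repeatable; the seed value is illustrative.
from random import seed
seed(4)    # call once before the sampling loop above; any fixed integer works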
import pandas as pd
color_table = pd.io.parsers.read_table("Colors.txt")
print color_table
Color Value
0 Red 1
1 Orange 2
2 Yellow 3
3 Green 4
4 Blue 5
5 Purple 6
6 Black 7
7 White 8
import pandas as pd
titanic = pd.io.parsers.read_csv("Titanic.csv")
X = titanic[['age']]
#X = titanic[['age']].values
print X
age
0 29.0000
1 0.9167
2 2.0000
3 30.0000
4 25.0000
5 48.0000
6 63.0000
7 39.0000
8 53.0000
9 71.0000
10 47.0000
11 18.0000
12 24.0000
13 26.0000
14 80.0000
15 9999.0000
16 24.0000
17 50.0000
18 32.0000
19 36.0000
20 37.0000
21 47.0000
22 26.0000
23 42.0000
24 29.0000
25 25.0000
26 25.0000
27 19.0000
28 35.0000
29 28.0000
... ...
1279 14.0000
1280 22.0000
1281 22.0000
1282 9999.0000
1283 9999.0000
1284 9999.0000
1285 32.5000
1286 38.0000
1287 51.0000
1288 18.0000
1289 21.0000
1290 47.0000
1291 9999.0000
1292 9999.0000
1293 9999.0000
1294 28.5000
1295 21.0000
1296 27.0000
1297 9999.0000
1298 36.0000
1299 27.0000
1300 15.0000
1301 45.5000
1302 9999.0000
1303 9999.0000
1304 14.5000
1305 9999.0000
1306 26.5000
1307 27.0000
1308 29.0000
[1309 rows x 1 columns]
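The 9999.0000 entries in the age column are clearly placeholder values rather than real ages. A minimal sketch (not part of the original listing) of flagging them as missing before any further analysis:
import pandas as pd
import numpy as np
titanic = pd.io.parsers.read_csv("Titanic.csv")
X = titanic[['age']].replace(9999.0, np.nan)   # treat the placeholder as NaN
print X['age'].isnull().sum()                  # number of ages now marked missing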
import pandas as pd
xls = pd.ExcelFile("Values.xls")
trig_values = xls.parse('Sheet1', index_col=None,
                        na_values=['NA'])
#trig_values = pd.read_excel("Values.xls", 'Sheet1', index_col=None, na_values=['NA'])
print trig_values
Angle (Degrees) Sine Cosine Tangent
0 138.550574 0.661959 -0.749540 -0.883153
1 305.535745 -0.813753 0.581211 -1.400100
2 280.518695 -0.983195 0.182556 -5.385709
3 216.363795 -0.592910 -0.805269 0.736289
4 36.389247 0.593268 0.805005 0.736974
5 31.474311 0.522116 0.852874 0.612184
6 120.121669 0.864962 -0.501838 -1.723588
7 293.947055 -0.913921 0.405892 -2.251634
8 179.882632 0.002048 -0.999998 -0.002048
9 120.927562 0.857818 -0.513954 -1.669056
10 71.349485 0.947487 0.319795 2.962796
11 241.971082 -0.882711 -0.469917 1.878439
12 297.208817 -0.889346 0.457235 -1.945053
13 142.004551 0.615599 -0.788060 -0.781158
14 173.770696 0.108508 -0.994096 -0.109152
15 229.232002 -0.757360 -0.652998 1.159820
16 67.926976 0.926706 0.375788 2.466033
17 261.866575 -0.989941 -0.141479 6.997102
18 59.185450 0.858830 0.512261 1.676547
19 98.029275 0.990197 -0.139679 -7.089086
20 9.336088 0.162225 0.986754 0.164403
21 90.746371 0.999915 -0.013026 -76.761483
22 217.798087 -0.612881 -0.790175 0.775626
23 58.616049 0.853697 0.520771 1.639295
24 197.196367 -0.295647 -0.955297 0.309482
25 331.194892 -0.481832 0.876264 -0.549871
26 6.509875 0.113374 0.993552 0.114110
27 266.390707 -0.998017 -0.062952 15.853513
28 230.323819 -0.769665 -0.638448 1.205525
29 314.224257 -0.716615 0.697469 -1.027452
.. ... ... ... ...
42 324.080564 -0.586647 0.809843 -0.724396
43 140.893727 0.630761 -0.775977 -0.812860
44 128.226889 0.785567 -0.618777 -1.269547
45 82.770054 0.992049 0.125852 7.882680
46 303.112455 -0.837600 0.546284 -1.533268
47 164.824273 0.261780 -0.965127 -0.271239
48 218.829827 -0.627009 -0.779012 0.804878
49 28.649593 0.479452 0.877568 0.546341
50 349.336296 -0.185044 0.982730 -0.188296
51 84.889713 0.996025 0.089073 11.182105
52 197.935862 -0.307952 -0.951402 0.323683
53 303.049380 -0.838201 0.545362 -1.536963
54 183.737235 -0.065181 -0.997873 0.065320
55 346.153919 -0.239314 0.970942 -0.246477
56 218.822745 -0.626913 -0.779089 0.804674
57 243.969070 -0.898557 -0.438856 2.047498
58 115.600771 0.901827 -0.432098 -2.087089
59 125.906606 0.809974 -0.586466 -1.381111
60 200.094363 -0.343567 -0.939128 0.365836
61 337.860807 -0.376858 0.926271 -0.406855
62 168.176975 0.204889 -0.978785 -0.209330
63 305.708155 -0.812000 0.583657 -1.391229
64 162.656078 0.298107 -0.954533 -0.312306
65 219.007899 -0.629428 -0.777059 0.810012
66 222.830465 -0.679831 -0.733368 0.926998
67 324.199562 -0.584964 0.811059 -0.721234
68 187.948172 -0.138277 -0.990394 0.139619
69 270.678249 -0.999930 0.011837 -84.472139
70 270.779159 -0.999908 0.013598 -73.530885
71 200.213513 -0.345520 -0.938412 0.368196
[72 rows x 4 columns]
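If the workbook held more than one worksheet, you could check which sheets are available before parsing; a short sketch:
import pandas as pd
xls = pd.ExcelFile("Values.xls")
print xls.sheet_names          # lists every worksheet in the workbook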
from skimage.io import imread
from skimage.transform import resize
from matplotlib import pyplot as plt
import matplotlib.cm as cm
example_file = ("http://upload.wikimedia.org/" +
                "wikipedia/commons/7/7d/Dog_face.png")
image = imread(example_file, as_grey=True)
plt.imshow(image, cmap=cm.gray)
plt.show()
print("data type: %s, shape: %s" %
(type(image), image.shape))
data type: <type 'numpy.ndarray'>, shape: (90L, 90L)
image2 = image[5:70,0:70]
plt.imshow(image2, cmap=cm.gray)
plt.show()
image3 = resize(image2, (30, 30), mode='nearest')
plt.imshow(image3, cmap=cm.gray)
print("data type: %s, shape: %s" %
(type(image3), image3.shape))
data type: <type 'numpy.ndarray'>, shape: (30L, 30L)
image_row = image3.flatten()
print("data type: %s, shape: %s" %
(type(image_row), image_row.shape))
data type: <type 'numpy.ndarray'>, shape: (900L,)
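Flattened images usually become rows of a dataset matrix. A quick sketch that stacks two copies of image_row, purely to show the resulting shape:
import numpy as np
dataset = np.vstack([image_row, image_row])    # two identical rows, just for illustration
print("data type: %s, shape: %s" %
      (type(dataset), dataset.shape))          # (2, 900): one row per image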
from lxml import objectify
import pandas as pd
xml = objectify.parse(open('XMLData.xml'))
root = xml.getroot()
df = pd.DataFrame(columns=('Number', 'String', 'Boolean'))
for i in range(0, 4):
    obj = root.getchildren()[i].getchildren()
    row = dict(zip(['Number', 'String', 'Boolean'],
                   [obj[0].text, obj[1].text,
                    obj[2].text]))
    row_s = pd.Series(row)
    row_s.name = i
    df = df.append(row_s)
print df
Number String Boolean
0 1 First True
1 2 Second False
2 3 Third True
3 4 Fourth False
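The loop above assumes that each record in XMLData.xml holds a Number, a String, and a Boolean child. A hypothetical layout (the MyDataset and Record tag names are illustrative, not taken from the original file), parsed directly from a string:
from lxml import objectify
sample = """<MyDataset>
    <Record><Number>1</Number><String>First</String><Boolean>True</Boolean></Record>
    <Record><Number>2</Number><String>Second</String><Boolean>False</Boolean></Record>
</MyDataset>"""
root = objectify.fromstring(sample)
print root.getchildren()[0].getchildren()[1].text   # prints: First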
from lxml import objectify
import pandas as pd
xml = objectify.parse(open('XMLData2.xml'))
root = xml.getroot()
df = pd.DataFrame(columns=('Number', 'String', 'Boolean'))
for i in range(0, 4):
    obj = root.getchildren()[i].getchildren()
    row = dict(zip(['Number', 'String', 'Boolean'],
                   [obj[0].text, obj[1].text,
                    obj[2].text]))
    row_s = pd.Series(row)
    row_s.name = i
    df = df.append(row_s)
search = pd.DataFrame.duplicated(df)
print df
print
print search[search == True]
Number String Boolean
0 1 First True
1 1 First True
2 2 Second False
3 3 Third True
1 True
dtype: bool
from lxml import objectify
import pandas as pd
xml = objectify.parse(open('XMLData2.xml'))
root = xml.getroot()
df = pd.DataFrame(columns=('Number', 'String', 'Boolean'))
for i in range(0, 4):
    obj = root.getchildren()[i].getchildren()
    row = dict(zip(['Number', 'String', 'Boolean'],
                   [obj[0].text, obj[1].text,
                    obj[2].text]))
    row_s = pd.Series(row)
    row_s.name = i
    df = df.append(row_s)
print df.drop_duplicates()
Number String Boolean
0 1 First True
2 2 Second False
3 3 Third True
import pandas as pd
df = pd.DataFrame({'A': [0,0,0,0,0,1,1],
                   'B': [1,2,3,5,4,2,5],
                   'C': [5,3,4,1,1,2,3]})
a_group_desc = df.groupby('A').describe()
print a_group_desc
B C
A
0 count 5.000000 5.000000
mean 3.000000 2.800000
std 1.581139 1.788854
min 1.000000 1.000000
25% 2.000000 1.000000
50% 3.000000 3.000000
75% 4.000000 4.000000
max 5.000000 5.000000
1 count 2.000000 2.000000
mean 3.500000 2.500000
std 2.121320 0.707107
min 2.000000 2.000000
25% 2.750000 2.250000
50% 3.500000 2.500000
75% 4.250000 2.750000
max 5.000000 3.000000
unstacked = a_group_desc.unstack()
print unstacked
B C \
count mean std min 25% 50% 75% max count mean std min
A
0 5 3.0 1.581139 1 2.00 3.0 4.00 5 5 2.8 1.788854 1
1 2 3.5 2.121320 2 2.75 3.5 4.25 5 2 2.5 0.707107 2
25% 50% 75% max
A
0 1.00 3.0 4.00 5
1 2.25 2.5 2.75 3
print unstacked.loc[:,(slice(None),['count','mean']),]
B C
count mean count mean
A
0 5 3.0 5 2.8
1 2 3.5 2 2.5
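An equivalent way to make that column selection, assuming a pandas release that provides pd.IndexSlice (0.14 or later), is a labelled slicer; a short sketch:
idx = pd.IndexSlice
print unstacked.loc[:, idx[:, ['count', 'mean']]]   # same count/mean columns for B and C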
import pandas as pd
car_colors = pd.Series(['Blue', 'Red', 'Green'], dtype='category')
car_data = pd.Series(
    pd.Categorical(['Yellow', 'Green', 'Red', 'Blue', 'Purple'],
                   categories=car_colors, ordered=False))
find_entries = pd.isnull(car_data)
print car_colors
print
print car_data
print
print find_entries[find_entries == True]
0 Blue
1 Red
2 Green
dtype: category
Categories (3, object): [Blue < Green < Red]
0 NaN
1 Green
2 Red
3 Blue
4 NaN
dtype: category
Categories (3, object): [Blue, Red, Green]
0 True
4 True
dtype: bool
import pandas as pd
car_colors = pd.Series(['Blue', 'Red', 'Green'],
                       dtype='category')
car_data = pd.Series(
    pd.Categorical(
        ['Blue', 'Green', 'Red', 'Blue', 'Red'],
        categories=car_colors, ordered=False))
car_colors.cat.categories = ["Purple", "Yellow", "Mauve"]
car_data.cat.categories = car_colors
print car_data
0 Purple
1 Yellow
2 Mauve
3 Purple
4 Mauve
dtype: category
Categories (3, object): [Purple, Mauve, Yellow]
import pandas as pd
car_colors = pd.Series(['Blue', 'Red', 'Green'],
                       dtype='category')
car_data = pd.Series(
    pd.Categorical(
        ['Blue', 'Green', 'Red', 'Green', 'Red', 'Green'],
        categories=car_colors, ordered=False))
car_data.cat.categories = ["Blue_Red", "Red", "Green"]
print car_data.ix[car_data.isin(['Red'])]
car_data.ix[car_data.isin(['Red'])] = 'Blue_Red'
print
print car_data
2 Red
4 Red
dtype: category
Categories (3, object): [Blue_Red, Red, Green]
0 Blue_Red
1 Green
2 Blue_Red
3 Green
4 Blue_Red
5 Green
dtype: category
Categories (3, object): [Blue_Red, Red, Green]
import datetime as dt
now = dt.datetime.now()
print str(now)
print now.strftime('%a, %d %B %Y')
2015-04-19 18:00:08.427000
Sun, 19 April 2015
import datetime as dt
now = dt.datetime.now()
timevalue = now + dt.timedelta(hours=2)
print now.strftime('%H:%M:%S')
print timevalue.strftime('%H:%M:%S')
print timevalue - now
18:00:10
20:00:10
2:00:00
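Going the other way, strptime() turns a formatted string back into a datetime object; a small sketch with an illustrative timestamp:
import datetime as dt
parsed = dt.datetime.strptime('2015-04-19 18:00:08', '%Y-%m-%d %H:%M:%S')
print parsed.strftime('%a, %d %B %Y')   # Sun, 19 April 2015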
import pandas as pd
import numpy as np
s = pd.Series([1, 2, 3, np.NaN, 5, 6, None])
print s.isnull()
print
print s[s.isnull()]
0 False
1 False
2 False
3 True
4 False
5 False
6 True
dtype: bool
3 NaN
6 NaN
dtype: float64
import pandas as pd
import numpy as np
s = pd.Series([1, 2, 3, np.NaN, 5, 6, None])
print s.fillna(int(s.mean()))
print
print s.dropna()
0 1
1 2
2 3
3 3
4 5
5 6
6 3
dtype: float64
0 1
1 2
2 3
4 5
5 6
dtype: float64
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
s = pd.Series([1, 2, 3, np.NaN, 5, 6, None])
imp = Imputer(missing_values='NaN',
              strategy='mean', axis=0)
imp.fit([1, 2, 3, 4, 5, 6, 7])
x = pd.Series(imp.transform(s).tolist()[0])
print x
0 1
1 2
2 3
3 4
4 5
5 6
6 7
dtype: float64
x = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9],],
              [[11,12,13], [14,15,16], [17,18,19],],
              [[21,22,23], [24,25,26], [27,28,29]]])
x[1]
array([[11, 12, 13],
       [14, 15, 16],
       [17, 18, 19]])
x = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9],],
              [[11,12,13], [14,15,16], [17,18,19],],
              [[21,22,23], [24,25,26], [27,28,29]]])
x[:,1]
array([[ 4,  5,  6],
       [14, 15, 16],
       [24, 25, 26]])
x = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9],],
              [[11,12,13], [14,15,16], [17,18,19],],
              [[21,22,23], [24,25,26], [27,28,29]]])
print x[1,1]
print x[:,1,1]
print x[1,:,1]
print
print x[1:3, 1:3]
[14 15 16]
[ 5 15 25]
[12 15 18]
[[[14 15 16]
  [17 18 19]]

 [[24 25 26]
  [27 28 29]]]
import pandas as pd
df = pd.DataFrame({'A': [2,3,1],
                   'B': [1,2,3],
                   'C': [5,3,4]})
df1 = pd.DataFrame({'A': [4],
                    'B': [4],
                    'C': [4]})
df = df.append(df1)
df = df.reset_index(drop=True)
print df
df.loc[df.last_valid_index() + 1] = [5, 5, 5]
print
print df
df2 = pd.DataFrame({'D': [1, 2, 3, 4, 5]})
df = pd.DataFrame.join(df, df2)
print
print df
A B C
0 2 1 5
1 3 2 3
2 1 3 4
3 4 4 4
A B C
0 2 1 5
1 3 2 3
2 1 3 4
3 4 4 4
4 5 5 5
A B C D
0 2 1 5 1
1 3 2 3 2
2 1 3 4 3
3 4 4 4 4
4 5 5 5 5
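pd.concat offers another route to the same kind of row stacking; a minimal sketch with two illustrative frames:
import pandas as pd
left = pd.DataFrame({'A': [1, 2]})
right = pd.DataFrame({'A': [3, 4]})
print pd.concat([left, right], ignore_index=True)   # four rows, reindexed 0..3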
import pandas as pd
df = pd.DataFrame({'A': [2,3,1],
                   'B': [1,2,3],
                   'C': [5,3,4]})
df = df.drop(df.index[[1]])
print df
df = df.drop('B', 1)
print
print df
A B C
0 2 1 5
2 1 3 4
A C
0 2 5
2 1 4
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [2,1,2,3,3,5,4],
                   'B': [1,2,3,5,4,2,5],
                   'C': [5,3,4,1,1,2,3]})
df = df.sort_index(by=['A', 'B'], ascending=[True, True])
df = df.reset_index(drop=True)
print df
index = df.index.tolist()
np.random.shuffle(index)
df = df.ix[index]
df = df.reset_index(drop=True)
print
print df
A B C
0 1 2 3
1 2 1 5
2 2 3 4
3 3 4 1
4 3 5 1
5 4 5 3
6 5 2 2
A B C
0 2 1 5
1 5 2 2
2 4 5 3
3 3 5 1
4 3 4 1
5 1 2 3
6 2 3 4
import pandas as pd
import numpy as np
df = pd.DataFrame({'Map': [0,0,0,1,1,2,2],
                   'Values': [1,2,3,5,4,2,5]})
df['S'] = df.groupby('Map')['Values'].transform(np.sum)
df['M'] = df.groupby('Map')['Values'].transform(np.mean)
df['V'] = df.groupby('Map')['Values'].transform(np.var)
print df
Map Values S M V
0 0 1 6 2.0 1.0
1 0 2 6 2.0 1.0
2 0 3 6 2.0 1.0
3 1 5 9 4.5 0.5
4 1 4 9 4.5 0.5
5 2 2 7 3.5 4.5
6 2 5 7 3.5 4.5
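If you want one summary row per group instead of broadcasting the statistics back onto every observation, .agg() does that; a quick sketch reusing the same frame:
# one row per Map value, with the three statistics as columns
print df.groupby('Map')['Values'].agg([np.sum, np.mean, np.var])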
Measuring central tendency with mean and median
Measuring variance and range
Working with percentiles
Defining measures of skewness and kurtosis
import pandas as pd
import numpy as np
print 'Your pandas version is: %s' % pd.__version__
print 'Your NumPy version is %s' % np.__version__
from sklearn.datasets import load_iris
iris = load_iris()
iris_nparray = iris.data
iris_dataframe = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_dataframe['group'] = pd.Series([iris.target_names[item] for item in iris.target], dtype="category")
Your pandas version is: 0.15.2
Your NumPy version is 1.8.1
print iris.feature_names
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
iris_dataframe.dtypes
sepal length (cm) float64
sepal width (cm) float64
petal length (cm) float64
petal width (cm) float64
group category
dtype: object
print iris_dataframe['group'].value_counts()
virginica 50
versicolor 50
setosa 50
dtype: int64
print iris_dataframe.mean(numeric_only=True)
# means = np.mean(iris_dataframe, axis=0)
# medians = np.median(iris_dataframe, axis=0)
sepal length (cm) 5.843333
sepal width (cm) 3.054000
petal length (cm) 3.758667
petal width (cm) 1.198667
dtype: float64
print iris_dataframe.median(numeric_only=True)
sepal length (cm) 5.80
sepal width (cm) 3.00
petal length (cm) 4.35
petal width (cm) 1.30
dtype: float64
print iris_dataframe.var(numeric_only=True)
print iris_dataframe.std(numeric_only=True)
sepal length (cm) 0.685694
sepal width (cm) 0.188004
petal length (cm) 3.113179
petal width (cm) 0.582414
dtype: float64
sepal length (cm) 0.828066
sepal width (cm) 0.433594
petal length (cm) 1.764420
petal width (cm) 0.763161
dtype: float64
print iris_dataframe.max(numeric_only=True)-iris_dataframe.min(numeric_only=True)
variances = np.var(iris_nparray, axis=0)
stdeviations = np.std(iris_nparray, axis=0)
maxs = np.max(iris_nparray, axis=0)
mins =np.min(iris_nparray, axis=0)
sepal length (cm) 3.6
sepal width (cm) 2.4
petal length (cm) 5.9
petal width (cm) 2.4
dtype: float64
percentiles = np.percentile(iris_nparray, q=[0,25,50,75,100], axis=0)
print percentiles
[array([ 4.3, 2. , 1. , 0.1]), array([ 5.1, 2.8, 1.6, 0.3]), array([ 5.8 , 3. , 4.35, 1.3 ]), array([ 6.4, 3.3, 5.1, 1.8]), array([ 7.9, 4.4, 6.9, 2.5])]
print iris_dataframe.quantile(np.array([0,.25,.50,.75,1]))
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0.00 4.3 2.0 1.00 0.1
0.25 5.1 2.8 1.60 0.3
0.50 5.8 3.0 4.35 1.3
0.75 6.4 3.3 5.10 1.8
1.00 7.9 4.4 6.90 2.5
print iris_dataframe.describe()
sepal length (cm) sepal width (cm) petal length (cm) \
count 150.000000 150.000000 150.000000
mean 5.843333 3.054000 3.758667
std 0.828066 0.433594 1.764420
min 4.300000 2.000000 1.000000
25% 5.100000 2.800000 1.600000
50% 5.800000 3.000000 4.350000
75% 6.400000 3.300000 5.100000
max 7.900000 4.400000 6.900000
petal width (cm)
count 150.000000
mean 1.198667
std 0.763161
min 0.100000
25% 0.300000
50% 1.300000
75% 1.800000
max 2.500000
from scipy.stats import kurtosis, kurtosistest
k = kurtosis(iris_dataframe['petal length (cm)'])
zscore, pvalue = kurtosistest(iris_dataframe['petal length (cm)'])
print 'Kurtosis %0.3f z-score %0.3f p-value %0.3f' % (k, zscore, pvalue)
"""
allontanamento dalla normalità distributiva
width of peak), tail weight, and lack of shoulders
> 0 la curva si definisce leptocurtica, cioè più "appuntita" di una normale.
< 0 la curva si definisce platicurtica, cioè più "piatta" di una normale.
= 0 la curva si definisce normocurtica, cioè "piatta" come una normale.
"""
Kurtosis -1.395 z-score -14.811 p-value 0.000
from scipy.stats import skew, skewtest
s = skew(iris_dataframe['petal length (cm)'])
zscore, pvalue = skewtest(iris_dataframe['petal length (cm)'])
print 'Skewness %0.3f z-score %0.3f p-value %0.3f' % (s, zscore, pvalue)
"""
In probability theory and statistics, skewness is a measure of the asymmetry
of the probability distribution
of a real-valued random variable about its mean.
The skewness value can be positive or negative, or even undefined.
negative skew: The left tail is longer; the mass of the distribution is concentrated on the right of the figure.
The distribution is said to be left-skewed, left-tailed, or skewed to the left.
positive skew: The right tail is longer; the mass of the distribution is concentrated on the left of the figure.
The distribution is said to be right-skewed, right-tailed, or skewed to the right.
"""
Skewness -0.272 z-score -1.398 p-value 0.162
Understanding frequency and the mode
Creating contingency tables
iris_binned = pd.concat([
    pd.qcut(iris_dataframe['sepal length (cm)'], [0, .25, .5, .75, 1]),
    pd.qcut(iris_dataframe['sepal width (cm)'], [0, .25, .5, .75, 1]),
    pd.qcut(iris_dataframe['petal length (cm)'], [0, .25, .5, .75, 1]),
    pd.qcut(iris_dataframe['petal width (cm)'], [0, .25, .5, .75, 1])
], join='outer', axis=1)
print iris_binned['petal length (cm)'].value_counts()
[1, 1.6] 44
(4.35, 5.1] 41
(5.1, 6.9] 34
(1.6, 4.35] 31
dtype: int64
print iris_dataframe['group'].value_counts()
2 50
1 50
0 50
dtype: int64
iris_binned.describe()
print pd.crosstab(iris_dataframe['group'], iris_binned['petal length (cm)'])
petal length (cm) (1.6, 4.35] (4.35, 5.1] (5.1, 6.9] [1, 1.6]
group
setosa 6 0 0 44
versicolor 25 25 0 0
virginica 0 16 34 0
Using covariance and correlation
Using non-parametric correlation
Considering chi-square for contingency tables
print iris_dataframe.cov()
sepal length (cm) sepal width (cm) petal length (cm) \
sepal length (cm) 0.685694 -0.039268 1.273682
sepal width (cm) -0.039268 0.188004 -0.321713
petal length (cm) 1.273682 -0.321713 3.113179
petal width (cm) 0.516904 -0.117981 1.296387
petal width (cm)
sepal length (cm) 0.516904
sepal width (cm) -0.117981
petal length (cm) 1.296387
petal width (cm) 0.582414
print iris_dataframe.corr()
sepal length (cm) sepal width (cm) petal length (cm) \
sepal length (cm) 1.000000 -0.109369 0.871754
sepal width (cm) -0.109369 1.000000 -0.420516
petal length (cm) 0.871754 -0.420516 1.000000
petal width (cm) 0.817954 -0.356544 0.962757
petal width (cm)
sepal length (cm) 0.817954
sepal width (cm) -0.356544
petal length (cm) 0.962757
petal width (cm) 1.000000
print iris_dataframe.corr()**2
sepal length (cm) sepal width (cm) petal length (cm) \
sepal length (cm) 1.000000 0.011962 0.759955
sepal width (cm) 0.011962 1.000000 0.176834
petal length (cm) 0.759955 0.176834 1.000000
petal width (cm) 0.669048 0.127124 0.926901
petal width (cm)
sepal length (cm) 0.669048
sepal width (cm) 0.127124
petal length (cm) 0.926901
petal width (cm) 1.000000
covariance_matrix = np.cov(iris_nparray, rowvar=0, bias=1)
correlation_matrix= np.corrcoef(iris_nparray, rowvar=0, bias=1)
# Notes
print np.diag(covariance_matrix)
print np.var(iris_nparray, axis=0)
from sklearn.preprocessing import scale
st_covariance_matrix = np.cov(scale(iris_nparray), rowvar=0, bias=1)
print st_covariance_matrix
print correlation_matrix
[ 0.68112222 0.18675067 3.09242489 0.57853156]
[ 0.68112222 0.18675067 3.09242489 0.57853156]
[[ 1. -0.10936925 0.87175416 0.81795363]
[-0.10936925 1. -0.4205161 -0.35654409]
[ 0.87175416 -0.4205161 1. 0.9627571 ]
[ 0.81795363 -0.35654409 0.9627571 1. ]]
[[ 1. -0.10936925 0.87175416 0.81795363]
[-0.10936925 1. -0.4205161 -0.35654409]
[ 0.87175416 -0.4205161 1. 0.9627571 ]
[ 0.81795363 -0.35654409 0.9627571 1. ]]
from scipy.stats import spearmanr
from scipy.stats.stats import pearsonr
spearmanr_coef, spearmanr_p = spearmanr(iris_dataframe['sepal length (cm)'], iris_dataframe['sepal width (cm)'])
pearsonr_coef, pearsonr_p = pearsonr(iris_dataframe['sepal length (cm)'], iris_dataframe['sepal width (cm)'])
print 'Pearson correlation %0.3f | Spearman correlation %0.3f' % (pearsonr_coef, spearmanr_coef)
Pearson correlation -0.109 | Spearman correlation -0.159
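Kendall's tau is another non-parametric correlation worth checking alongside Spearman's; a short sketch:
from scipy.stats import kendalltau
kendalltau_coef, kendalltau_p = kendalltau(iris_dataframe['sepal length (cm)'], iris_dataframe['sepal width (cm)'])
print 'Kendall tau correlation %0.3f' % kendalltau_coef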
from scipy.stats import chi2_contingency
table = pd.crosstab(iris_dataframe['group'], iris_binned['petal length (cm)'])
chi2, p, dof, expected = chi2_contingency(table.values)
print 'Chi-square %0.2f p-value %0.3f' % (chi2, p)
Chi-square 212.43 p-value 0.000
from scipy.stats import chi2_contingency
table = pd.crosstab(iris_binned['sepal width (cm)'], iris_binned['sepal length (cm)'])
chi2, p, dof, expected = chi2_contingency(table.values)
print 'Chi-square %0.2f p-value %0.3f' % (chi2, p)
Chi-square 44.25 p-value 0.000
print pd.crosstab(iris_binned['sepal width (cm)'], iris_binned['sepal length (cm)'])
sepal length (cm) (5.1, 5.8] (5.8, 6.4] (6.4, 7.9] [4.3, 5.1]
sepal width (cm)
(2.8, 3] 6 10 13 7
(3, 3.3] 0 6 13 12
(3.3, 4.4] 14 3 3 16
[2, 2.8] 19 16 6 6
Plotting boxplots
Graphing histograms
Developing scatterplots
Parallel plots
boxplots = iris_dataframe.boxplot(return_type='axes')
from pandas.tools.plotting import parallel_coordinates
iris_dataframe['labels'] = [iris.target_names[k] for k in iris_dataframe['group']]
pll = parallel_coordinates(iris_dataframe,'labels')
densityplot = iris_dataframe[iris_dataframe.columns[:4]].plot(kind='density')
densityplot = iris_dataframe['petal length (cm)'].plot(kind='density')
densityplot1 = iris_dataframe['sepal width (cm)'].plot(kind='hist')
densityplot2 = np.sqrt(iris_dataframe['sepal width (cm)']).plot(kind='hist')
from scipy.stats import skew, skewtest
from scipy.stats import kurtosis, kurtosistest
s = skew(iris_dataframe['sepal length (cm)'])
k = kurtosis(iris_dataframe['sepal length (cm)'])
print s,k
_, spvalue = skewtest(iris_dataframe['sepal width (cm)'])
_, kpvalue = kurtosistest(iris_dataframe['sepal width (cm)'])
print 'Skewness p-value %0.3f | Kurtosis p-value %0.3f' % (spvalue, kpvalue)
0.311753058502 -0.573567948925
Skewness p-value 0.091 | Kurtosis p-value 0.395
single_distribution = iris_dataframe['petal length (cm)'].plot(kind='hist', alpha=0.4)
colors_palette = {0: 'red', 1: 'yellow', 2:'blue'}
colors = [colors_palette[c] for c in iris_dataframe['group']]
simple_scatterplot = iris_dataframe.plot(kind='scatter', x='petal length (cm)', y='petal width (cm)', c=colors)
from pandas.tools.plotting import scatter_matrix
colors_palette = {0: "red", 1: "yellow", 2: "blue"}
colors = [colors_palette[c] for c in iris_dataframe['group']]
matrix_of_scatterplots = scatter_matrix(iris_dataframe, figsize=(6, 6), color=colors, diagonal='kde')
Performing t-tests
Considering non-parametric tests
from scipy.stats import ttest_ind
group0 = iris_dataframe['group'] == 'setosa'
group1 = iris_dataframe['group'] == 'versicolor'
group2 = iris_dataframe['group'] == 'virginica'
print 'var1 %0.3f var2 %0.3f' % (iris_dataframe['petal length (cm)'][group1].var(), iris_dataframe['petal length (cm)'][group2].var())
t, pvalue = ttest_ind(iris_dataframe['petal length (cm)'][group1], iris_dataframe['petal length (cm)'][group2], axis=0, equal_var=False)
print 't statistic %0.3f p-value %0.3f' % (t, pvalue)
var1 0.221 var2 0.305
t statistic -12.604 p-value 0.000
boxplots = iris_dataframe.boxplot(column='petal length (cm)', by='group', return_type='axes')
#help(pd.DataFrame.boxplot)
t, pvalue = ttest_ind(iris_dataframe['sepal width (cm)'][group1], iris_dataframe['sepal width (cm)'][group2], axis=0, equal_var=False)
print 't statistic %0.3f p-value %0.3f' % (t, pvalue)
t statistic -3.206 p-value 0.002
from scipy.stats import f_oneway
f, pvalue = f_oneway(iris_dataframe['sepal width (cm)'][group0],
                     iris_dataframe['sepal width (cm)'][group1],
                     iris_dataframe['sepal width (cm)'][group2])
print "One-way ANOVA F-value %0.3f p-value %0.3f" % (f,pvalue)
One-way ANOVA F-value 47.364 p-value 0.000
from scipy.stats import wilcoxon
T, pvalue = wilcoxon(iris_dataframe['sepal width (cm)'][group1], iris_dataframe['sepal width (cm)'][group2])
print 'Wilcoxon T statistic %0.3f p-value %0.3f' % (T, pvalue)
Wilcoxon T statistic -3.206 p-value 0.006
from scipy.stats import kruskal
H, pvalue = kruskal(iris_dataframe['sepal width (cm)'][group0], iris_dataframe['sepal width (cm)'][group1], iris_dataframe['sepal width (cm)'][group2])
print 'Kruskal-Wallis H statistic %0.3f p-value %0.3f' % (H, pvalue)
Kruskal-Wallis H statistic 62.495 p-value 0.000
from sklearn.preprocessing import scale
stand_sepal_width = scale(iris_dataframe['sepal width (cm)'])
from scipy.stats.stats import pearsonr
transformations = {'x': lambda x: x, '1/x': lambda x: 1/x, 'x**2': lambda x: x**2, 'x**3': lambda x: x**3, 'log(x)': lambda x: np.log(x)}
for transformation in transformations:
    pearsonr_coef, pearsonr_p = pearsonr(iris_dataframe['sepal length (cm)'], transformations[transformation](iris_dataframe['sepal width (cm)']))
    print 'Transformation: %s \t Pearson\'s r: %0.3f' % (transformation, pearsonr_coef)
Transformation: x Pearson's r: -0.109
Transformation: x**2 Pearson's r: -0.122
Transformation: x**3 Pearson's r: -0.131
Transformation: log(x) Pearson's r: -0.093
Transformation: 1/x Pearson's r: 0.073
from sklearn.datasets import load_boston
from sklearn.preprocessing import scale
boston = load_boston()
X, y = scale(boston.data), boston.target
print X.shape, y.shape
(506L, 13L) (506L,)
from sklearn.linear_model import LinearRegression
regression = LinearRegression(normalize=True)
regression.fit(X,y)
LinearRegression(copy_X=True, fit_intercept=True, normalize=True)
print regression.score(X,y)
0.740607742865
print [a+':'+str(round(b,1)) for a, b in zip(boston.feature_names, regression.coef_,)]
['CRIM:-0.9', 'ZN:1.1', 'INDUS:0.1', 'CHAS:0.7', 'NOX:-2.1', 'RM:2.7', 'AGE:0.0', 'DIS:-3.1', 'RAD:2.7', 'TAX:-2.1', 'PTRATIO:-2.1', 'B:0.9', 'LSTAT:-3.7']
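Once fitted, the same regression object also produces predictions; a quick sketch comparing the first three predictions with the observed targets:
print regression.predict(X[:3])   # predicted median values (in $1000s)
print y[:3]                       # observed targets for the same cases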
print boston.DESCR
Boston House Prices dataset
Notes
------
Data Set Characteristics:
:Number of Instances: 506
:Number of Attributes: 13 numeric/categorical predictive
:Median Value (attribute 14) is usually the target
:Attribute Information (in order):
- CRIM per capita crime rate by town
- ZN proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS proportion of non-retail business acres per town
- CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX nitric oxides concentration (parts per 10 million)
- RM average number of rooms per dwelling
- AGE proportion of owner-occupied units built prior to 1940
- DIS weighted distances to five Boston employment centres
- RAD index of accessibility to radial highways
- TAX full-value property-tax rate per $10,000
- PTRATIO pupil-teacher ratio by town
- B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT % lower status of the population
- MEDV Median value of owner-occupied homes in $1000's
:Missing Attribute Values: None
:Creator: Harrison, D. and Rubinfeld, D.L.
This is a copy of UCI ML housing dataset.
http://archive.ics.uci.edu/ml/datasets/Housing
This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.
The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980. N.B. Various transformations are used in the table on
pages 244-261 of the latter.
The Boston house-price data has been used in many machine learning papers that address regression
problems.
**References**
- Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
- Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
- many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data[:-1,:], iris.target[:-1]
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic.fit(X,y)
print 'Predicted class %s, real class %s' % (logistic.predict(iris.data[-1,:]),iris.target[-1])
print 'Probabilities for each class from 0 to 2: %s' % logistic.predict_proba(iris.data[-1,:])
Predicted class [2], real class 2
Probabilities for each class from 0 to 2: [[ 0.00168787 0.28720074 0.71111138]]
from sklearn.datasets import load_digits
digits = load_digits()
X, y = digits.data[:1700,:], digits.target[:1700]
tX, ty = digits.data[1700:,:], digits.target[1700:]
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
OVR = OneVsRestClassifier(logistic).fit(X,y)
OVO = OneVsOneClassifier(logistic).fit(X,y)
print 'One vs rest accuracy: %.3f' % OVR.score(tX,ty)
print 'One vs one accuracy: %.3f' % OVO.score(tX,ty)
One vs rest accuracy: 0.938
One vs one accuracy: 0.969
C:\WinPython-64bit-2.7.6.4\python-2.7.6.amd64\lib\site-packages\sklearn\utils\__init__.py:93: DeprecationWarning: Function multilabel_ is deprecated; Attribute multilabel_ is deprecated and will be removed in 0.17. Use 'y_type_.startswith('multilabel')' instead
warnings.warn(msg, category=DeprecationWarning)
LR = LogisticRegression()
LR.fit(X,y)
print 'One vs rest accuracy: %.3f' % LR.score(tX,ty)
One vs rest accuracy: 0.938
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
print 'number of posts in training: %i' % len(newsgroups_train.data)
D={word:True for post in newsgroups_train.data for word in post.split(' ')}
print 'number of distinct words in training: %i' % len(D)
print 'number of posts in test: %i' % len(newsgroups_test.data)
number of posts in training: 11314
number of distinct words in training: 300972
number of posts in test: 7532
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
Bernoulli = BernoulliNB(alpha=0.01)
Multinomial = MultinomialNB(alpha=0.01)
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
multinomial_hashing_trick = HashingVectorizer(stop_words='english', binary=False, norm=None, non_negative=True)
binary_hashing_trick = HashingVectorizer(stop_words='english', binary=True, norm=None, non_negative=True)
Multinomial.fit(multinomial_hashing_trick.transform(newsgroups_train.data),newsgroups_train.target)
Bernoulli.fit(binary_hashing_trick.transform(newsgroups_train.data),newsgroups_train.target)
from sklearn.metrics import accuracy_score
for m, h in [(Bernoulli, binary_hashing_trick), (Multinomial, multinomial_hashing_trick)]:
    print 'Accuracy for %s: %.3f' % (m, accuracy_score(y_true=newsgroups_test.target, y_pred=m.predict(h.transform(newsgroups_test.data))))
Accuracy for BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True): 0.570
Accuracy for MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True): 0.651
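TfidfVectorizer is imported above but never used; a hedged sketch of how TF-IDF features could replace the hashing trick in the same pipeline (the parameters are illustrative):
tfidf = TfidfVectorizer(stop_words='english')
tfidf_train = tfidf.fit_transform(newsgroups_train.data)
Multinomial.fit(tfidf_train, newsgroups_train.target)
predicted = Multinomial.predict(tfidf.transform(newsgroups_test.data))
print 'Accuracy with TF-IDF features: %.3f' % accuracy_score(
    y_true=newsgroups_test.target, y_pred=predicted)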
from sklearn.datasets import load_boston
boston = load_boston()
from sklearn.naive_bayes import GaussianNB
Gaussian = GaussianNB()
y_ord = pd.cut(boston.target, bins=4, labels=False)
Gaussian.fit(boston.data,y_ord)
print np.corrcoef(Gaussian.predict(boston.data),boston.target)[0,1]
0.734907024299
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
digits = load_digits()
pca = PCA(n_components=25)
pca.fit(digits.data[:1700,:])
X, y = pca.transform(digits.data[:1700,:]), digits.target[:1700]
tX, ty = pca.transform(digits.data[1700:,:]), digits.target[1700:]
from sklearn.neighbors import KNeighborsClassifier
kNN = KNeighborsClassifier(n_neighbors=5, p=2)
kNN.fit(X,y)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_neighbors=5, p=2, weights='uniform')
print 'Accuracy: %.3f' % kNN.score(tX,ty)
print 'Prediction: %s actual: %s' % (kNN.predict(tX[:10,:]),ty[:10])
Accuracy: 0.990
Prediction: [5 6 5 0 9 8 9 8 4 1] actual: [5 6 5 0 9 8 9 8 4 1]
for k in [1, 5, 10, 20, 50, 100, 200]:
    kNN = KNeighborsClassifier(n_neighbors=k).fit(X, y)
    print 'for k=%3i accuracy is %.3f' % (k, kNN.score(tX, ty))
for k= 1 accuracy is 0.979
for k= 5 accuracy is 0.990
for k= 10 accuracy is 0.969
for k= 20 accuracy is 0.969
for k= 50 accuracy is 0.959
for k=100 accuracy is 0.959
for k=200 accuracy is 0.907
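Distance-weighted voting is a common kNN variant that often helps at larger k; a quick sketch on the same train/test split:
kNN = KNeighborsClassifier(n_neighbors=5, weights='distance').fit(X, y)
print 'Accuracy with distance weighting: %.3f' % kNN.score(tX, ty)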