PY Fun!

Playing with Data in Python

CSJP

Uploading, Streaming, and Sampling Data

Uploading small amounts of data into memory

with open("Colors.txt", 'rb') as open_file:
    print 'Colors.txt content:\n' + open_file.read()
Colors.txt content:
Color   Value
Red 1
Orange  2
Yellow  3
Green   4
Blue    5
Purple  6
Black   7
White   8

Streaming large amounts of data into memory

with open("Colors.txt", 'rb') as open_file:
    for observation in open_file:
        print 'Reading Data: ' + observation
Reading Data: Color Value

Reading Data: Red   1

Reading Data: Orange    2

Reading Data: Yellow    3

Reading Data: Green 4

Reading Data: Blue  5

Reading Data: Purple    6

Reading Data: Black 7

Reading Data: White 8

Sampling data

Fixed samples

n = 3
with open("Colors.txt", 'rb') as open_file:
    for j, observation in enumerate(open_file):
        if j % n==0:
            print('Reading Line: ' + str(j) + 
            ' Content: ' + observation)
Reading Line: 0 Content: Color  Value

Reading Line: 3 Content: Yellow 3

Reading Line: 6 Content: Purple 6

Random samples

from random import random
sample_size = 0.25
with open("Colors.txt", 'rb') as open_file:
    for j, observation in enumerate(open_file):
        if random()<=sample_size:
            print('Reading Line: ' + str(j) + 
            ' Content: ' + observation)
Reading Line: 2 Content: Orange 2

Reading Line: 3 Content: Yellow 3

Accessing Data in Structured Flat File Form

Reading from a text file

import pandas as pd
color_table = pd.io.parsers.read_table("Colors.txt")
print color_table
    Color  Value
0     Red      1
1  Orange      2
2  Yellow      3
3   Green      4
4    Blue      5
5  Purple      6
6   Black      7
7   White      8

Reading CSV delimited format

import pandas as pd
titanic = pd.io.parsers.read_csv("Titanic.csv")
X = titanic[['age']]
#X = titanic[['age']].values
print X
            age
0       29.0000
1        0.9167
2        2.0000
3       30.0000
4       25.0000
5       48.0000
6       63.0000
7       39.0000
8       53.0000
9       71.0000
10      47.0000
11      18.0000
12      24.0000
13      26.0000
14      80.0000
15    9999.0000
16      24.0000
17      50.0000
18      32.0000
19      36.0000
20      37.0000
21      47.0000
22      26.0000
23      42.0000
24      29.0000
25      25.0000
26      25.0000
27      19.0000
28      35.0000
29      28.0000
...         ...
1279    14.0000
1280    22.0000
1281    22.0000
1282  9999.0000
1283  9999.0000
1284  9999.0000
1285    32.5000
1286    38.0000
1287    51.0000
1288    18.0000
1289    21.0000
1290    47.0000
1291  9999.0000
1292  9999.0000
1293  9999.0000
1294    28.5000
1295    21.0000
1296    27.0000
1297  9999.0000
1298    36.0000
1299    27.0000
1300    15.0000
1301    45.5000
1302  9999.0000
1303  9999.0000
1304    14.5000
1305  9999.0000
1306    26.5000
1307    27.0000
1308    29.0000

[1309 rows x 1 columns]
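
The recurring 9999.0000 entries in the age column look like a sentinel code for missing ages rather than real values. Assuming that interpretation (the 9999 code is a guess, not documented in the file itself), a minimal sketch of converting them to proper missing values after loading:

import pandas as pd
import numpy as np

titanic = pd.io.parsers.read_csv("Titanic.csv")
# hypothetical: treat the 9999 sentinel as a missing age
titanic['age'] = titanic['age'].replace(9999, np.nan)
print titanic['age'].isnull().sum()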

Reading Excel and other Microsoft Office files

import pandas as pd
xls = pd.ExcelFile("Values.xls")
trig_values = xls.parse('Sheet1', index_col=None, 
                        na_values=['NA'])
#trig_values = pd.read_excel("Values.xls", 'Sheet1', index_col=None, na_values=['NA'])
print trig_values
    Angle (Degrees)      Sine    Cosine    Tangent
0        138.550574  0.661959 -0.749540  -0.883153
1        305.535745 -0.813753  0.581211  -1.400100
2        280.518695 -0.983195  0.182556  -5.385709
3        216.363795 -0.592910 -0.805269   0.736289
4         36.389247  0.593268  0.805005   0.736974
5         31.474311  0.522116  0.852874   0.612184
6        120.121669  0.864962 -0.501838  -1.723588
7        293.947055 -0.913921  0.405892  -2.251634
8        179.882632  0.002048 -0.999998  -0.002048
9        120.927562  0.857818 -0.513954  -1.669056
10        71.349485  0.947487  0.319795   2.962796
11       241.971082 -0.882711 -0.469917   1.878439
12       297.208817 -0.889346  0.457235  -1.945053
13       142.004551  0.615599 -0.788060  -0.781158
14       173.770696  0.108508 -0.994096  -0.109152
15       229.232002 -0.757360 -0.652998   1.159820
16        67.926976  0.926706  0.375788   2.466033
17       261.866575 -0.989941 -0.141479   6.997102
18        59.185450  0.858830  0.512261   1.676547
19        98.029275  0.990197 -0.139679  -7.089086
20         9.336088  0.162225  0.986754   0.164403
21        90.746371  0.999915 -0.013026 -76.761483
22       217.798087 -0.612881 -0.790175   0.775626
23        58.616049  0.853697  0.520771   1.639295
24       197.196367 -0.295647 -0.955297   0.309482
25       331.194892 -0.481832  0.876264  -0.549871
26         6.509875  0.113374  0.993552   0.114110
27       266.390707 -0.998017 -0.062952  15.853513
28       230.323819 -0.769665 -0.638448   1.205525
29       314.224257 -0.716615  0.697469  -1.027452
..              ...       ...       ...        ...
42       324.080564 -0.586647  0.809843  -0.724396
43       140.893727  0.630761 -0.775977  -0.812860
44       128.226889  0.785567 -0.618777  -1.269547
45        82.770054  0.992049  0.125852   7.882680
46       303.112455 -0.837600  0.546284  -1.533268
47       164.824273  0.261780 -0.965127  -0.271239
48       218.829827 -0.627009 -0.779012   0.804878
49        28.649593  0.479452  0.877568   0.546341
50       349.336296 -0.185044  0.982730  -0.188296
51        84.889713  0.996025  0.089073  11.182105
52       197.935862 -0.307952 -0.951402   0.323683
53       303.049380 -0.838201  0.545362  -1.536963
54       183.737235 -0.065181 -0.997873   0.065320
55       346.153919 -0.239314  0.970942  -0.246477
56       218.822745 -0.626913 -0.779089   0.804674
57       243.969070 -0.898557 -0.438856   2.047498
58       115.600771  0.901827 -0.432098  -2.087089
59       125.906606  0.809974 -0.586466  -1.381111
60       200.094363 -0.343567 -0.939128   0.365836
61       337.860807 -0.376858  0.926271  -0.406855
62       168.176975  0.204889 -0.978785  -0.209330
63       305.708155 -0.812000  0.583657  -1.391229
64       162.656078  0.298107 -0.954533  -0.312306
65       219.007899 -0.629428 -0.777059   0.810012
66       222.830465 -0.679831 -0.733368   0.926998
67       324.199562 -0.584964  0.811059  -0.721234
68       187.948172 -0.138277 -0.990394   0.139619
69       270.678249 -0.999930  0.011837 -84.472139
70       270.779159 -0.999908  0.013598 -73.530885
71       200.213513 -0.345520 -0.938412   0.368196

[72 rows x 4 columns]

Accessing Data in Unstructured File Form

Rendering the image

from skimage.io import imread
from skimage.transform import resize 
from matplotlib import pyplot as plt
import matplotlib.cm as cm

example_file = ("http://upload.wikimedia.org/" +
    "wikipedia/commons/7/7d/Dog_face.png")
image = imread(example_file, as_grey=True)
plt.imshow(image, cmap=cm.gray)
plt.show()

Displaying the image information

print("data type: %s, shape: %s" % 
      (type(image), image.shape))
data type: <type 'numpy.ndarray'>, shape: (90L, 90L)

Cropping the image

image2 = image[5:70,0:70]
plt.imshow(image2, cmap=cm.gray)
plt.show()

Resizing the image

image3 = resize(image2, (30, 30), mode='nearest')
plt.imshow(image3, cmap=cm.gray)
print("data type: %s, shape: %s" % 
      (type(image3), image3.shape))
data type: <type 'numpy.ndarray'>, shape: (30L, 30L)

Flattening the image

image_row = image3.flatten()
print("data type: %s, shape: %s" % 
      (type(image_row), image_row.shape))
data type: <type 'numpy.ndarray'>, shape: (900L,)
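
Flattening matters because most scikit-learn estimators expect one row of features per observation. A minimal sketch, assuming several images all resized to the same 30 x 30 shape (here image3 is simply reused twice for illustration):

import numpy as np

images = [image3, image3]                       # placeholder list of same-sized images
dataset = np.array([img.flatten() for img in images])
print dataset.shape                             # one row of 900 pixel features per image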

Accessing Data from the Web

from lxml import objectify
import pandas as pd

xml = objectify.parse(open('XMLData.xml'))
root = xml.getroot()
df = pd.DataFrame(columns=('Number', 'String', 'Boolean'))

for i in range(0,4):
    obj = root.getchildren()[i].getchildren()
    row = dict(zip(['Number', 'String', 'Boolean'], 
                   [obj[0].text, obj[1].text, 
                    obj[2].text]))
    row_s = pd.Series(row)
    row_s.name = i
    df = df.append(row_s)
    
print df
  Number  String Boolean
0      1   First    True
1      2  Second   False
2      3   Third    True
3      4  Fourth   False
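
Every value read from the XML tree arrives as text, so all three columns hold strings. A small sketch of converting them to more convenient types, using the df built above:

df['Number'] = df['Number'].astype(int)
df['Boolean'] = df['Boolean'] == 'True'    # string comparison yields real booleans
print df.dtypes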

Validating Your Data

Figuring out what’s in your data

from lxml import objectify
import pandas as pd

xml = objectify.parse(open('XMLData2.xml'))
root = xml.getroot()
df = pd.DataFrame(columns=('Number', 'String', 'Boolean'))

for i in range(0,4):
    obj = root.getchildren()[i].getchildren()
    row = dict(zip(['Number', 'String', 'Boolean'], 
                   [obj[0].text, obj[1].text, 
                    obj[2].text]))
    row_s = pd.Series(row)
    row_s.name = i
    df = df.append(row_s)
    
search = pd.DataFrame.duplicated(df)

print df
print
print search[search == True]
  Number  String Boolean
0      1   First    True
1      1   First    True
2      2  Second   False
3      3   Third    True

1    True
dtype: bool

Removing duplicates

from lxml import objectify
import pandas as pd

xml = objectify.parse(open('XMLData2.xml'))
root = xml.getroot()
df = pd.DataFrame(columns=('Number', 'String', 'Boolean'))

for i in range(0,4):
    obj = root.getchildren()[i].getchildren()
    row = dict(zip(['Number', 'String', 'Boolean'], 
                   [obj[0].text, obj[1].text, 
                    obj[2].text]))
    row_s = pd.Series(row)
    row_s.name = i
    df = df.append(row_s)
    
print df.drop_duplicates()
  Number  String Boolean
0      1   First    True
2      2  Second   False
3      3   Third    True

Creating a data map and data plan

import pandas as pd

df = pd.DataFrame({'A': [0,0,0,0,0,1,1],
                   'B': [1,2,3,5,4,2,5],
                   'C': [5,3,4,1,1,2,3]})

a_group_desc = df.groupby('A').describe()
print a_group_desc
                B         C
A                          
0 count  5.000000  5.000000
  mean   3.000000  2.800000
  std    1.581139  1.788854
  min    1.000000  1.000000
  25%    2.000000  1.000000
  50%    3.000000  3.000000
  75%    4.000000  4.000000
  max    5.000000  5.000000
1 count  2.000000  2.000000
  mean   3.500000  2.500000
  std    2.121320  0.707107
  min    2.000000  2.000000
  25%    2.750000  2.250000
  50%    3.500000  2.500000
  75%    4.250000  2.750000
  max    5.000000  3.000000
unstacked = a_group_desc.unstack()
print unstacked
      B                                             C                     \
  count mean       std min   25%  50%   75% max count mean       std min   
A                                                                          
0     5  3.0  1.581139   1  2.00  3.0  4.00   5     5  2.8  1.788854   1   
1     2  3.5  2.121320   2  2.75  3.5  4.25   5     2  2.5  0.707107   2   

                        
    25%  50%   75% max  
A                       
0  1.00  3.0  4.00   5  
1  2.25  2.5  2.75   3  
print unstacked.loc[:,(slice(None),['count','mean']),]
      B          C     
  count mean count mean
A                      
0     5  3.0     5  2.8
1     2  3.5     2  2.5
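
The slice(None) trick selects every top-level column while filtering the second level of the MultiIndex. An equivalent, arguably more readable selection uses pd.IndexSlice (available in recent pandas releases); a short sketch on the unstacked frame from above:

idx = pd.IndexSlice
print unstacked.loc[:, idx[:, ['count', 'mean']]]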

Manipulating Categorical Variables

Creating categorical variables

import pandas as pd

car_colors = pd.Series(['Blue', 'Red', 'Green'], dtype='category')

car_data = pd.Series(
    pd.Categorical(['Yellow', 'Green', 'Red', 'Blue', 'Purple'],
                   categories=car_colors, ordered=False))

find_entries = pd.isnull(car_data)

print car_colors
print
print car_data
print
print find_entries[find_entries == True]
0     Blue
1      Red
2    Green
dtype: category
Categories (3, object): [Blue < Green < Red]

0      NaN
1    Green
2      Red
3     Blue
4      NaN
dtype: category
Categories (3, object): [Blue, Red, Green]

0    True
4    True
dtype: bool

Renaming levels

import pandas as pd

car_colors = pd.Series(['Blue', 'Red', 'Green'], 
                       dtype='category')
car_data = pd.Series(
    pd.Categorical(
        ['Blue', 'Green', 'Red', 'Blue', 'Red'],
        categories=car_colors, ordered=False))


car_colors.cat.categories = ["Purple", "Yellow", "Mauve"]
car_data.cat.categories = car_colors

print car_data
0    Purple
1    Yellow
2     Mauve
3    Purple
4     Mauve
dtype: category
Categories (3, object): [Purple, Mauve, Yellow]

Combining levels

import pandas as pd

car_colors = pd.Series(['Blue', 'Red', 'Green'], 
                       dtype='category')
car_data = pd.Series(
    pd.Categorical(
        ['Blue', 'Green', 'Red', 'Green', 'Red', 'Green'],
        categories=car_colors, ordered=False))

car_data.cat.categories = ["Blue_Red", "Red", "Green"]
print car_data.ix[car_data.isin(['Red'])]

car_data.ix[car_data.isin(['Red'])] = 'Blue_Red'

print
print car_data
2    Red
4    Red
dtype: category
Categories (3, object): [Blue_Red, Red, Green]

0    Blue_Red
1       Green
2    Blue_Red
3       Green
4    Blue_Red
5       Green
dtype: category
Categories (3, object): [Blue_Red, Red, Green]
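
After the recoding, Red no longer occurs in the data but still appears in the list of categories. Assuming a pandas version whose .cat accessor provides remove_unused_categories (recent releases do), a minimal sketch of dropping the now-empty level:

car_data = car_data.cat.remove_unused_categories()
print car_data.cat.categories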

Dealing with Dates in Your Data

Formatting time values

import datetime as dt

now = dt.datetime.now()

print str(now)
print now.strftime('%a, %d %B %Y')
2015-04-19 18:00:08.427000
Sun, 19 April 2015

Using the right time transformation

import datetime as dt

now = dt.datetime.now()
timevalue = now + dt.timedelta(hours=2)

print now.strftime('%H:%M:%S')
print timevalue.strftime('%H:%M:%S')
print timevalue - now
18:00:10
20:00:10
2:00:00
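
strftime turns a datetime into text; the opposite direction, parsing a date string that arrives from a file or a log, uses strptime with the same directive codes. A small sketch with a hypothetical timestamp string:

import datetime as dt

timestamp = dt.datetime.strptime('19 April 2015 18:00:08',
                                 '%d %B %Y %H:%M:%S')
print timestamp    # 2015-04-19 18:00:08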

Dealing with Missing Data

Finding missing data

import pandas as pd
import numpy as np

s = pd.Series([1, 2, 3, np.NaN, 5, 6, None])

print s.isnull()

print
print s[s.isnull()]
0    False
1    False
2    False
3     True
4    False
5    False
6     True
dtype: bool

3   NaN
6   NaN
dtype: float64

Encoding missingness

import pandas as pd
import numpy as np

s = pd.Series([1, 2, 3, np.NaN, 5, 6, None])

print s.fillna(int(s.mean()))
print
print s.dropna()
0    1
1    2
2    3
3    3
4    5
5    6
6    3
dtype: float64

0    1
1    2
2    3
4    5
5    6
dtype: float64

Imputing missing data

import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer

s = pd.Series([1, 2, 3, np.NaN, 5, 6, None])

imp = Imputer(missing_values='NaN', 
              strategy='mean', axis=0)

imp.fit([1, 2, 3, 4, 5, 6, 7])

x = pd.Series(imp.transform(s).tolist()[0])

print x
0    1
1    2
2    3
3    4
4    5
5    6
6    7
dtype: float64

Slicing and Dicing

Slicing rows

import numpy as np

x = np.array([[[1, 2, 3],  [4, 5, 6],  [7, 8, 9],],
              [[11,12,13], [14,15,16], [17,18,19],],
              [[21,22,23], [24,25,26], [27,28,29]]])

x[1]
array([[11, 12, 13],
       [14, 15, 16],
       [17, 18, 19]])

Slicing columns

x = np.array([[[1, 2, 3],  [4, 5, 6],  [7, 8, 9],],
              [[11,12,13], [14,15,16], [17,18,19],],
              [[21,22,23], [24,25,26], [27,28,29]]])

x[:,1]
array([[ 4,  5,  6],
       [14, 15, 16],
       [24, 25, 26]])

Dicing

x = np.array([[[1, 2, 3],  [4, 5, 6],  [7, 8, 9],],
              [[11,12,13], [14,15,16], [17,18,19],],
              [[21,22,23], [24,25,26], [27,28,29]]])

print x[1,1]
print x[:,1,1]
print x[1,:,1]
print
print x[1:3, 1:3]
[14 15 16]
[ 5 15 25]
[12 15 18]

[[[14 15 16]
  [17 18 19]]

 [[24 25 26]
  [27 28 29]]]

Concatenating and Transforming

Adding new cases and variables

import pandas as pd

df = pd.DataFrame({'A': [2,3,1],
                   'B': [1,2,3],
                   'C': [5,3,4]})

df1 = pd.DataFrame({'A': [4],
                    'B': [4],
                    'C': [4]})

df = df.append(df1)
df = df.reset_index(drop=True)
print df

df.loc[df.last_valid_index() + 1] = [5, 5, 5]
print
print df

df2 = pd.DataFrame({'D': [1, 2, 3, 4, 5]})

df = pd.DataFrame.join(df, df2)
print
print df
   A  B  C
0  2  1  5
1  3  2  3
2  1  3  4
3  4  4  4

   A  B  C
0  2  1  5
1  3  2  3
2  1  3  4
3  4  4  4
4  5  5  5

   A  B  C  D
0  2  1  5  1
1  3  2  3  2
2  1  3  4  3
3  4  4  4  4
4  5  5  5  5
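
pd.concat offers an equivalent route: axis=0 stacks frames vertically to add cases, axis=1 glues them side by side to add variables. A minimal self-contained sketch with small hypothetical frames:

import pandas as pd

df_a = pd.DataFrame({'A': [2, 3, 1], 'B': [1, 2, 3]})
df_b = pd.DataFrame({'A': [4], 'B': [4]})

print pd.concat([df_a, df_b], ignore_index=True)                  # new cases
print
print pd.concat([df_a, pd.DataFrame({'C': [5, 3, 4]})], axis=1)   # new variable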

Removing data

import pandas as pd

df = pd.DataFrame({'A': [2,3,1],
                   'B': [1,2,3],
                   'C': [5,3,4]})

df = df.drop(df.index[[1]])
print df

df = df.drop('B', 1)
print
print df
   A  B  C
0  2  1  5
2  1  3  4

   A  C
0  2  5
2  1  4

Sorting and shuffling

import pandas as pd
import numpy as np

df = pd.DataFrame({'A': [2,1,2,3,3,5,4],
                   'B': [1,2,3,5,4,2,5],
                   'C': [5,3,4,1,1,2,3]})

df = df.sort_index(by=['A', 'B'], ascending=[True, True])
df = df.reset_index(drop=True)
print df

index = df.index.tolist()
np.random.shuffle(index)
df = df.ix[index]
df = df.reset_index(drop=True)
print
print df
   A  B  C
0  1  2  3
1  2  1  5
2  2  3  4
3  3  4  1
4  3  5  1
5  4  5  3
6  5  2  2

   A  B  C
0  2  1  5
1  5  2  2
2  4  5  3
3  3  5  1
4  3  4  1
5  1  2  3
6  2  3  4

Aggregating Data at Any Level

import pandas as pd
import numpy as np

df = pd.DataFrame({'Map': [0,0,0,1,1,2,2],
                   'Values': [1,2,3,5,4,2,5]})

df['S'] = df.groupby('Map')['Values'].transform(np.sum)
df['M'] = df.groupby('Map')['Values'].transform(np.mean)
df['V'] = df.groupby('Map')['Values'].transform(np.var)

print df
   Map  Values  S    M    V
0    0       1  6  2.0  1.0
1    0       2  6  2.0  1.0
2    0       3  6  2.0  1.0
3    1       5  9  4.5  0.5
4    1       4  9  4.5  0.5
5    2       2  7  3.5  4.5
6    2       5  7  3.5  4.5
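
transform broadcasts each group statistic back onto every row of that group. When one row per group is enough, agg on the same groupby gives a compact summary table; a short sketch reusing the df and functions above:

print df.groupby('Map')['Values'].agg([np.sum, np.mean, np.var])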

Defining Descriptive Statistics for Numeric Data

Measuring central tendency with mean and median
Measuring variance and range
Working with percentiles
Defining measures of skewness and kurtosis

import pandas as pd
import numpy as np

print 'Your pandas version is: %s' % pd.__version__
print 'Your NumPy version is %s' % np.__version__
from sklearn.datasets import load_iris
iris = load_iris()
iris_nparray = iris.data

iris_dataframe = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_dataframe['group'] = pd.Series([iris.target_names[item] for item in iris.target], dtype="category") 
Your pandas version is: 0.15.2
Your NumPy version is 1.8.1
print iris.feature_names
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
iris_dataframe.dtypes
sepal length (cm)     float64
sepal width (cm)      float64
petal length (cm)     float64
petal width (cm)      float64
group                category
dtype: object
print iris_dataframe['group'].value_counts()
virginica     50
versicolor    50
setosa        50
dtype: int64
print iris_dataframe.mean(numeric_only=True)

# means = np.mean(iris_dataframe, axis=0)
# medians = np.median(iris_dataframe, axis=0)
sepal length (cm)    5.843333
sepal width (cm)     3.054000
petal length (cm)    3.758667
petal width (cm)     1.198667
dtype: float64
print iris_dataframe.median(numeric_only=True)
sepal length (cm)    5.80
sepal width (cm)     3.00
petal length (cm)    4.35
petal width (cm)     1.30
dtype: float64
print iris_dataframe.var(numeric_only=True)
print iris_dataframe.std(numeric_only=True)
sepal length (cm)    0.685694
sepal width (cm)     0.188004
petal length (cm)    3.113179
petal width (cm)     0.582414
dtype: float64
sepal length (cm)    0.828066
sepal width (cm)     0.433594
petal length (cm)    1.764420
petal width (cm)     0.763161
dtype: float64
print iris_dataframe.max(numeric_only=True)-iris_dataframe.min(numeric_only=True)

variances = np.var(iris_nparray, axis=0)
stdeviations = np.std(iris_nparray, axis=0)
maxs = np.max(iris_nparray, axis=0)
mins =np.min(iris_nparray, axis=0)
sepal length (cm)    3.6
sepal width (cm)     2.4
petal length (cm)    5.9
petal width (cm)     2.4
dtype: float64
percentiles = np.percentile(iris_nparray, q=[0,25,50,75,100], axis=0)
print percentiles
[array([ 4.3,  2. ,  1. ,  0.1]), array([ 5.1,  2.8,  1.6,  0.3]), array([ 5.8 ,  3.  ,  4.35,  1.3 ]), array([ 6.4,  3.3,  5.1,  1.8]), array([ 7.9,  4.4,  6.9,  2.5])]
print iris_dataframe.quantile(np.array([0,.25,.50,.75,1]))
      sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0.00                4.3               2.0               1.00               0.1
0.25                5.1               2.8               1.60               0.3
0.50                5.8               3.0               4.35               1.3
0.75                6.4               3.3               5.10               1.8
1.00                7.9               4.4               6.90               2.5
print iris_dataframe.describe()
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.054000           3.758667   
std             0.828066          0.433594           1.764420   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000           5.100000   
max             7.900000          4.400000           6.900000   

       petal width (cm)  
count        150.000000  
mean           1.198667  
std            0.763161  
min            0.100000  
25%            0.300000  
50%            1.300000  
75%            1.800000  
max            2.500000  
from scipy.stats import kurtosis, kurtosistest
k = kurtosis(iris_dataframe['petal length (cm)'])
zscore, pvalue = kurtosistest(iris_dataframe['petal length (cm)'])
print 'Kurtosis %0.3f z-score %0.3f p-value %0.3f' % (k, zscore, pvalue)

"""
allontanamento dalla normalità distributiva
width of peak), tail weight, and lack of shoulders
> 0 la curva si definisce leptocurtica, cioè più "appuntita" di una normale.
< 0 la curva si definisce platicurtica, cioè più "piatta" di una normale.
= 0 la curva si definisce normocurtica, cioè "piatta" come una normale.
"""
Kurtosis -1.395 z-score -14.811 p-value 0.000
from scipy.stats import skew, skewtest
s = skew(iris_dataframe['petal length (cm)'])
zscore, pvalue = skewtest(iris_dataframe['petal length (cm)'])
print 'Skewness %0.3f z-score %0.3f p-value %0.3f' % (s, zscore, pvalue)
"""
n probability theory and statistics, skewness is a measure of the asymmetry 
of the probability distribution 
of a real-valued random variable about its mean. 
The skewness value can be positive or negative, or even undefined.
negative skew: The left tail is longer; the mass of the distribution is concentrated on the right of the figure. 
The distribution is said to be left-skewed, left-tailed, or skewed to the left.
positive skew: The right tail is longer; the mass of the distribution is concentrated on the left of the figure. 
The distribution is said to be right-skewed, right-tailed, or skewed to the right.
"""
Skewness -0.272 z-score -1.398 p-value 0.162

Counting Data Using Frequency and Contingency Tables

Understanding frequency and the mode
Creating contingency tables
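
The mode is simply the most frequent value in a variable. A tiny sketch on a standalone Series (pandas returns a Series because there can be more than one mode):

import pandas as pd

print pd.Series([1, 2, 2, 3, 3, 3, 4]).mode()    # 3 occurs most often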

iris_binned = pd.concat([
pd.qcut(iris_dataframe['sepal length (cm)'], [0, .25, .5, .75, 1]),
pd.qcut(iris_dataframe['sepal width (cm)'], [0, .25, .5, .75, 1]),
pd.qcut(iris_dataframe['petal length (cm)'], [0, .25, .5, .75, 1]),
pd.qcut(iris_dataframe['petal width (cm)'], [0, .25, .5, .75, 1])
], join='outer', axis = 1)
print iris_binned['petal length (cm)'].value_counts()
[1, 1.6]       44
(4.35, 5.1]    41
(5.1, 6.9]     34
(1.6, 4.35]    31
dtype: int64
print iris_dataframe['group'].value_counts()
2    50
1    50
0    50
dtype: int64
iris_binned.describe()
       sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
count                150             150              150              150
unique                 4               4                4                4
top           [4.3, 5.1]        [2, 2.8]         [1, 1.6]       [0.1, 0.3]
freq                  41              47               44               41
print pd.crosstab(iris_dataframe['group'], iris_binned['petal length (cm)'])
petal length (cm)  (1.6, 4.35]  (4.35, 5.1]  (5.1, 6.9]  [1, 1.6]
group                                                            
setosa                       6            0           0        44
versicolor                  25           25           0         0
virginica                    0           16          34         0

Understanding Correlation and Association

Using covariance and correlation
Using non-parametric correlation
Considering chi-square for contingency tables

print iris_dataframe.cov()
                   sepal length (cm)  sepal width (cm)  petal length (cm)  \
sepal length (cm)           0.685694         -0.039268           1.273682   
sepal width (cm)           -0.039268          0.188004          -0.321713   
petal length (cm)           1.273682         -0.321713           3.113179   
petal width (cm)            0.516904         -0.117981           1.296387   

                   petal width (cm)  
sepal length (cm)          0.516904  
sepal width (cm)          -0.117981  
petal length (cm)          1.296387  
petal width (cm)           0.582414  
print iris_dataframe.corr()
                   sepal length (cm)  sepal width (cm)  petal length (cm)  \
sepal length (cm)           1.000000         -0.109369           0.871754   
sepal width (cm)           -0.109369          1.000000          -0.420516   
petal length (cm)           0.871754         -0.420516           1.000000   
petal width (cm)            0.817954         -0.356544           0.962757   

                   petal width (cm)  
sepal length (cm)          0.817954  
sepal width (cm)          -0.356544  
petal length (cm)          0.962757  
petal width (cm)           1.000000  
print iris_dataframe.corr()**2
                   sepal length (cm)  sepal width (cm)  petal length (cm)  \
sepal length (cm)           1.000000          0.011962           0.759955   
sepal width (cm)            0.011962          1.000000           0.176834   
petal length (cm)           0.759955          0.176834           1.000000   
petal width (cm)            0.669048          0.127124           0.926901   

                   petal width (cm)  
sepal length (cm)          0.669048  
sepal width (cm)           0.127124  
petal length (cm)          0.926901  
petal width (cm)           1.000000  
covariance_matrix = np.cov(iris_nparray, rowvar=0, bias=1)
correlation_matrix= np.corrcoef(iris_nparray, rowvar=0, bias=1)

# Notes
print np.diag(covariance_matrix)
print np.var(iris_nparray, axis=0)

from sklearn.preprocessing import scale 
st_covariance_matrix = np.cov(scale(iris_nparray), rowvar=0, bias=1)
print st_covariance_matrix 
print correlation_matrix
[ 0.68112222  0.18675067  3.09242489  0.57853156]
[ 0.68112222  0.18675067  3.09242489  0.57853156]
[[ 1.         -0.10936925  0.87175416  0.81795363]
 [-0.10936925  1.         -0.4205161  -0.35654409]
 [ 0.87175416 -0.4205161   1.          0.9627571 ]
 [ 0.81795363 -0.35654409  0.9627571   1.        ]]
[[ 1.         -0.10936925  0.87175416  0.81795363]
 [-0.10936925  1.         -0.4205161  -0.35654409]
 [ 0.87175416 -0.4205161   1.          0.9627571 ]
 [ 0.81795363 -0.35654409  0.9627571   1.        ]]
from scipy.stats import spearmanr
from scipy.stats.stats import pearsonr
spearmanr_coef, spearmanr_p = spearmanr(iris_dataframe['sepal length (cm)'], iris_dataframe['sepal width (cm)'])
pearsonr_coef, pearsonr_p = pearsonr(iris_dataframe['sepal length (cm)'], iris_dataframe['sepal width (cm)'])
print 'Pearson correlation %0.3f | Spearman correlation %0.3f' % (pearsonr_coef, spearmanr_coef)
Pearson correlation -0.109 | Spearman correlation -0.159
from scipy.stats import chi2_contingency
table = pd.crosstab(iris_dataframe['group'], iris_binned['petal length (cm)'])
chi2, p, dof, expected = chi2_contingency(table.values)
print 'Chi-square %0.2f p-value %0.3f' % (chi2, p)
Chi-square 212.43 p-value 0.000
from scipy.stats import chi2_contingency
table = pd.crosstab(iris_binned['sepal width (cm)'], iris_binned['sepal length (cm)'])
chi2, p, dof, expected = chi2_contingency(table.values)
print 'Chi-square %0.2f p-value %0.3f' % (chi2, p)
Chi-square 44.25 p-value 0.000
print pd.crosstab(iris_binned['sepal width (cm)'], iris_binned['sepal length (cm)'])
sepal length (cm)  (5.1, 5.8]  (5.8, 6.4]  (6.4, 7.9]  [4.3, 5.1]
sepal width (cm)                                                 
(2.8, 3]                    6          10          13           7
(3, 3.3]                    0           6          13          12
(3.3, 4.4]                 14           3           3          16
[2, 2.8]                   19          16           6           6

Creating Applied Visualization for Data Exploration

Plotting boxplots
Graphing histograms
Developing scatterplots
Creating parallel plots

boxplots = iris_dataframe.boxplot(return_type='axes')

(figure: boxplots of the four Iris features)

from pandas.tools.plotting import parallel_coordinates
iris_dataframe['labels'] = [iris.target_names[k] for k in iris_dataframe['group']]
pll = parallel_coordinates(iris_dataframe,'labels')

(figure: parallel coordinates plot of the Iris features by label)

densityplot = iris_dataframe[iris_dataframe.columns[:4]].plot(kind='density')

(figure: density plots of the four Iris features)

densityplot = iris_dataframe['petal length (cm)'].plot(kind='density')

(figure: density plot of petal length)

densityplot1 = iris_dataframe['sepal width (cm)'].plot(kind='hist')

(figure: histogram of sepal width)

densityplot2 = np.sqrt(iris_dataframe['sepal width (cm)']).plot(kind='hist')

from scipy.stats import skew, skewtest
from scipy.stats import kurtosis, kurtosistest
s = skew(iris_dataframe['sepal length (cm)'])
k = kurtosis(iris_dataframe['sepal length (cm)'])
print s,k

_, spvalue = skewtest(iris_dataframe['sepal width (cm)'])
_, kpvalue = kurtosistest(iris_dataframe['sepal width (cm)'])
print 'Skewness p-value %0.3f | Kurtosis p-value %0.3f' % (spvalue, kpvalue)
0.311753058502 -0.573567948925
Skewness p-value 0.091 | Kurtosis p-value 0.395

(figure: histogram of the square-root-transformed sepal width)

single_distribution = iris_dataframe['petal length (cm)'].plot(kind='hist', alpha=0.4)

(figure: histogram of petal length)

colors_palette = {0: 'red', 1: 'yellow', 2:'blue'}
colors = [colors_palette[c] for c in iris_dataframe['group']]
simple_scatterplot = iris_dataframe.plot(kind='scatter', x='petal length (cm)', y='petal width (cm)', c=colors)

(figure: scatterplot of petal length versus petal width, colored by group)

from pandas.tools.plotting import scatter_matrix
colors_palette = {0: "red", 1: "yellow", 2: "blue"}
colors = [colors_palette[c] for c in iris_dataframe['group']]
matrix_of_scatterplots = scatter_matrix(iris_dataframe, figsize=(6, 6), color=colors, diagonal='kde')

(figure: scatter matrix of the Iris features with kernel density estimates on the diagonal)

Exploring T-tests and Other Tests of Group Difference

Performing t-tests
Considering non-parametric tests

from scipy.stats import ttest_ind

group0 = iris_dataframe['group'] == 'setosa'
group1 = iris_dataframe['group'] == 'versicolor'
group2 = iris_dataframe['group'] == 'virginica'

print 'var1 %0.3f var2 %0.3f' % (iris_dataframe['petal length (cm)'][group1].var(), iris_dataframe['petal length (cm)'][group2].var())

t, pvalue = ttest_ind(iris_dataframe['petal length (cm)'][group1], iris_dataframe['petal length (cm)'][group2], axis=0, equal_var=False)
print 't statistic %0.3f p-value %0.3f' % (t, pvalue)
var1 0.221 var2 0.305
t statistic -12.604 p-value 0.000
boxplots = iris_dataframe.boxplot(column='petal length (cm)', by='group', return_type='axes')
#help(pd.DataFrame.boxplot)

(figure: boxplots of petal length grouped by species)

t, pvalue = ttest_ind(iris_dataframe['sepal width (cm)'][group1], iris_dataframe['sepal width (cm)'][group2], axis=0, equal_var=False)
print 't statistic %0.3f p-value %0.3f' % (t, pvalue)
t statistic -3.206 p-value 0.002
from scipy.stats import f_oneway      
f, pvalue = f_oneway(iris_dataframe['sepal width (cm)'][group0], 
                     iris_dataframe['sepal width (cm)'][group1], 
                     iris_dataframe['sepal width (cm)'][group2])
print "One-way ANOVA F-value %0.3f p-value %0.3f" % (f,pvalue)  
One-way ANOVA F-value 47.364 p-value 0.000
from scipy.stats import wilcoxon
T, pvalue = wilcoxon(iris_dataframe['sepal width (cm)'][group1], iris_dataframe['sepal width (cm)'][group2])
print 'Wilcoxon T statistic %0.3f p-value %0.3f' % (T, pvalue)
Wilcoxon T statistic -3.206 p-value 0.006
from scipy.stats import kruskal
H, pvalue = kruskal(iris_dataframe['sepal width (cm)'][group0], iris_dataframe['sepal width (cm)'][group1], iris_dataframe['sepal width (cm)'][group2])
print 'Kruskal-Wallis H statistic %0.3f p-value %0.3f' % (H, pvalue)
Kruskal-Wallis H statistic 62.495 p-value 0.000
from sklearn.preprocessing import scale
stand_sepal_width = scale(iris_dataframe['sepal width (cm)'])
from scipy.stats.stats import pearsonr
transformations = {'x': lambda x: x, '1/x': lambda x: 1/x, 'x**2': lambda x: x**2, 'x**3': lambda x: x**3, 'log(x)': lambda x: np.log(x)}
for transformation in transformations:
    pearsonr_coef, pearsonr_p = pearsonr(iris_dataframe['sepal length (cm)'], transformations[transformation](iris_dataframe['sepal width (cm)']))
    print 'Transformation: %s \t Pearson\'s r: %0.3f' % (transformation, pearsonr_coef)
Transformation: x    Pearson's r: -0.109
Transformation: x**2     Pearson's r: -0.122
Transformation: x**3     Pearson's r: -0.131
Transformation: log(x)   Pearson's r: -0.093
Transformation: 1/x      Pearson's r: 0.073

Guessing the number: linear regression

Using more variables

from sklearn.datasets import load_boston
from sklearn.preprocessing import scale
boston = load_boston()
X, y = scale(boston.data), boston.target
print X.shape, y.shape
(506L, 13L) (506L,)
from sklearn.linear_model import LinearRegression
regression = LinearRegression(normalize=True)
regression.fit(X,y)
LinearRegression(copy_X=True, fit_intercept=True, normalize=True)
print regression.score(X,y)
0.740607742865
print [a+':'+str(round(b,1)) for a, b in zip(boston.feature_names, regression.coef_,)]
['CRIM:-0.9', 'ZN:1.1', 'INDUS:0.1', 'CHAS:0.7', 'NOX:-2.1', 'RM:2.7', 'AGE:0.0', 'DIS:-3.1', 'RAD:2.7', 'TAX:-2.1', 'PTRATIO:-2.1', 'B:0.9', 'LSTAT:-3.7']
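
The R2 reported above is measured on the same observations used for fitting, which tends to flatter the model. A minimal sketch of checking the fit on held-out data instead, using the train_test_split helper (located in sklearn.cross_validation in this scikit-learn generation; newer releases moved it to sklearn.model_selection):

from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)
regression.fit(X_train, y_train)
print 'Out-of-sample R2: %.3f' % regression.score(X_test, y_test)
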
print boston.DESCR
Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    :Missing Attribute Values: None

    :Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
http://archive.ics.uci.edu/ml/datasets/Housing


This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers that address regression
problems.   
     
**References**

   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
   - many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)

Moving to Logistic Regression

Applying logistic regression

from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data[:-1,:], iris.target[:-1]
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic.fit(X,y)
print 'Predicted class %s, real class %s' % (logistic.predict(iris.data[-1,:]),iris.target[-1])
print 'Probabilities for each class from 0 to 2: %s' % logistic.predict_proba(iris.data[-1,:])
Predicted class [2], real class 2
Probabilities for each class from 0 to 2: [[ 0.00168787  0.28720074  0.71111138]]

Considering when classes are more than two

from sklearn.datasets import load_digits
digits = load_digits()
X, y = digits.data[:1700,:], digits.target[:1700]
tX, ty = digits.data[1700:,:], digits.target[1700:]
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
OVR = OneVsRestClassifier(logistic).fit(X,y)
OVO = OneVsOneClassifier(logistic).fit(X,y)
print 'One vs rest accuracy: %.3f' % OVR.score(tX,ty)
print 'One vs one accuracy: %.3f' % OVO.score(tX,ty)
One vs rest accuracy: 0.938
One vs one accuracy: 0.969


LR = LogisticRegression()
LR.fit(X,y)
print 'One vs rest accuracy: %.3f' % LR.score(tX,ty)
One vs rest accuracy: 0.938

Making Things as Simple as Naïve Bayes

Predicting text classifications

from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
print 'number of posts in training: %i' % len(newsgroups_train.data)
D={word:True for post in newsgroups_train.data for word in post.split(' ')}
print 'number of distinct words in training: %i' % len(D)
print 'number of posts in test: %i' % len(newsgroups_test.data)
number of posts in training: 11314
number of distinct words in training: 300972
number of posts in test: 7532
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
Bernoulli = BernoulliNB(alpha=0.01)
Multinomial = MultinomialNB(alpha=0.01)
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
multinomial_hashing_trick = HashingVectorizer(stop_words='english', binary=False, norm=None, non_negative=True)
binary_hashing_trick = HashingVectorizer(stop_words='english', binary=True, norm=None, non_negative=True)
Multinomial.fit(multinomial_hashing_trick.transform(newsgroups_train.data),newsgroups_train.target)
Bernoulli.fit(binary_hashing_trick.transform(newsgroups_train.data),newsgroups_train.target)
from sklearn.metrics import accuracy_score
for m,h in [(Bernoulli,binary_hashing_trick), (Multinomial,multinomial_hashing_trick)]:
    print 'Accuracy for %s: %.3f' % (m, accuracy_score(y_true=newsgroups_test.target, y_pred=m.predict(h.transform(newsgroups_test.data))))
Accuracy for BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True): 0.570
Accuracy for MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True): 0.651
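
TfidfVectorizer is imported above but never used. A minimal sketch of feeding TF-IDF weighted features to the multinomial model instead of the raw hashed counts, to compare against the hashing-trick results:

tfidf = TfidfVectorizer(stop_words='english')
Multinomial_tfidf = MultinomialNB(alpha=0.01)
Multinomial_tfidf.fit(tfidf.fit_transform(newsgroups_train.data),
                      newsgroups_train.target)
predictions = Multinomial_tfidf.predict(tfidf.transform(newsgroups_test.data))
print 'Accuracy with TF-IDF features: %.3f' % accuracy_score(
    y_true=newsgroups_test.target, y_pred=predictions)
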
from sklearn.datasets import load_boston
boston = load_boston()
from sklearn.naive_bayes import GaussianNB
Gaussian = GaussianNB()
y_ord = pd.cut(boston.target, bins=4, labels=False)
Gaussian.fit(boston.data,y_ord)
print np.corrcoef(Gaussian.predict(boston.data),boston.target)[0,1]
0.734907024299

Exploring Lazy Learning with K-nearest Neighbors

Predicting after observing neighbors

from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
digits = load_digits()
pca = PCA(n_components=25)
pca.fit(digits.data[:1700,:])
X, y = pca.transform(digits.data[:1700,:]), digits.target[:1700]
tX, ty = pca.transform(digits.data[1700:,:]), digits.target[1700:]
from sklearn.neighbors import KNeighborsClassifier
kNN = KNeighborsClassifier(n_neighbors=5, p=2)
kNN.fit(X,y)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')
print 'Accuracy: %.3f' % kNN.score(tX,ty) 
print 'Prediction: %s actual: %s' % (kNN.predict(tX[:10,:]),ty[:10])
Accuracy: 0.990
Prediction: [5 6 5 0 9 8 9 8 4 1] actual: [5 6 5 0 9 8 9 8 4 1]

Choosing your k parameter wisely

for k in [1,5,10,20,50,100,200]:
    kNN = KNeighborsClassifier(n_neighbors=k).fit(X,y)
    print 'for k=%3i accuracy is %.3f' % (k, kNN.score(tX,ty))
for k=  1 accuracy is 0.979
for k=  5 accuracy is 0.990
for k= 10 accuracy is 0.969
for k= 20 accuracy is 0.969
for k= 50 accuracy is 0.959
for k=100 accuracy is 0.959
for k=200 accuracy is 0.907
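
A single fixed test block can make the comparison noisy; cross-validation averages accuracy over several splits and gives a steadier basis for choosing k. A minimal sketch (cross_val_score sits in sklearn.cross_validation for this scikit-learn generation):

from sklearn.cross_validation import cross_val_score

for k in [1, 5, 10, 20, 50, 100, 200]:
    kNN = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(kNN, X, y, cv=10)
    print 'for k=%3i mean cross-validated accuracy is %.3f' % (k, scores.mean())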