DataFrame
import numpy as np
import pandas as pd
In [59]:
data = np.random.randn(5,3) # Generate random data in matrix/tabular form 5
rows and 3 columns
In [60]:
data
Out[60]:
array([[-0.62162763, 0.9921252 , -0.3884227 ],
[ 0.25709088, -0.1365675 , 0.77113954],
[-0.96153538, -1.26281479, -1.58631299],
[-2.38202279, -2.19703702, 0.42556743],
[-0.42338357, -0.6131884 ,
-0.66370211]])
In [61]:
type(data)
Out[61]:
numpy.ndarray
In [62]:
pdata = pd.DataFrame(data) # Creating a DataFrame out of
numpy array using DataFrame function
In [63]:
type(pdata)
Out[63]:
pandas.core.frame.DataFrame
In [64]:
pdata
Out[64]:
0 |
1 |
2 |
|
0 |
-0.621628 |
0.992125 |
-0.388423 |
1 |
0.257091 |
-0.136568 |
0.771140 |
2 |
-0.961535 |
-1.262815 |
-1.586313 |
3 |
-2.382023 |
-2.197037 |
0.425567 |
4 |
-0.423384 |
-0.613188 |
-0.663702 |
In [65]:
pd.DataFrame(data, columns = ['A','B','C'])
Out[65]:
A |
B |
C |
|
0 |
-0.621628 |
0.992125 |
-0.388423 |
1 |
0.257091 |
-0.136568 |
0.771140 |
2 |
-0.961535 |
-1.262815 |
-1.586313 |
3 |
-2.382023 |
-2.197037 |
0.425567 |
4 |
-0.423384 |
-0.613188 |
-0.663702 |
In [66]:
pd.DataFrame(data, columns = ['A','B','C'], index = ['a','b','c','d','e'])
Out[66]:
A |
B |
C |
|
a |
-0.621628 |
0.992125 |
-0.388423 |
b |
0.257091 |
-0.136568 |
0.771140 |
c |
-0.961535 |
-1.262815 |
-1.586313 |
d |
-2.382023 |
-2.197037 |
0.425567 |
e |
-0.423384 |
-0.613188 |
-0.663702 |
In [67]:
df = pd.read_csv('C:/Users/kuldip_s/Documents/Python
Scripts/Stock.csv')
# Creating a DataFrame out of numpy array using read_csv function
df
Out[67]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
|
0 |
1 |
16.770000 |
17.559999 |
16.730000 |
17.010000 |
16.934999 |
271700 |
1 |
2 |
17.100000 |
17.469999 |
16.950001 |
17.200001 |
17.200001 |
229800 |
2 |
3 |
17.059999 |
17.230000 |
16.870001 |
17.049999 |
17.049999 |
146300 |
3 |
4 |
16.900000 |
17.330000 |
16.900000 |
16.920000 |
16.920000 |
139700 |
4 |
5 |
16.969999 |
16.990000 |
16.200001 |
16.290001 |
16.290001 |
216600 |
5 |
6 |
16.240000 |
16.530001 |
16.110001 |
16.480000 |
16.480000 |
115700 |
In [68]:
df.head(2) # BUT by default 5 records
Out[68]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
|
0 |
1 |
16.77 |
17.559999 |
16.730000 |
17.010000 |
16.934999 |
271700 |
1 |
2 |
17.10 |
17.469999 |
16.950001 |
17.200001 |
17.200001 |
229800 |
In [69]:
df.tail(2) # by default 5 records
Out[69]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
|
18 |
19 |
16.299999 |
16.629999 |
16.00 |
16.459999 |
16.459999 |
324900 |
19 |
20 |
16.459999 |
16.580000 |
15.98 |
16.040001 |
16.040001 |
141600 |
In [70]:
df[2:5] # To show
data for index number 2,3,4
Out[70]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
|
2 |
3 |
17.059999 |
17.23 |
16.870001 |
17.049999 |
17.049999 |
146300 |
3 |
4 |
16.900000 |
17.33 |
16.900000 |
16.920000 |
16.920000 |
139700 |
4 |
5 |
16.969999 |
16.99 |
16.200001 |
16.290001 |
16.290001 |
216600 |
In [71]:
df.index
Out[71]:
RangeIndex(start=0,
stop=20, step=1)
In [72]:
df.shape # To show number of rows and columns
Out[72]:
(20,
7)
In [73]:
df['Week'] = 'All Day' # Adding a new column
'Week' at the end and assign a value to all records
df.head(5)
Out[73]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
Week |
|
0 |
1 |
16.770000 |
17.559999 |
16.730000 |
17.010000 |
16.934999 |
271700 |
All Day |
1 |
2 |
17.100000 |
17.469999 |
16.950001 |
17.200001 |
17.200001 |
229800 |
All Day |
2 |
3 |
17.059999 |
17.230000 |
16.870001 |
17.049999 |
17.049999 |
146300 |
All Day |
3 |
4 |
16.900000 |
17.330000 |
16.900000 |
16.920000 |
16.920000 |
139700 |
All Day |
4 |
5 |
16.969999 |
16.990000 |
16.200001 |
16.290001 |
16.290001 |
216600 |
All Day |
In [74]:
df['Difference'] = df['High'] - df['Low'] # Another new column
df.head(5)
Out[74]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
Week |
Difference |
|
0 |
1 |
16.770000 |
17.559999 |
16.730000 |
17.010000 |
16.934999 |
271700 |
All Day |
0.829999 |
1 |
2 |
17.100000 |
17.469999 |
16.950001 |
17.200001 |
17.200001 |
229800 |
All Day |
0.519998 |
2 |
3 |
17.059999 |
17.230000 |
16.870001 |
17.049999 |
17.049999 |
146300 |
All Day |
0.359999 |
3 |
4 |
16.900000 |
17.330000 |
16.900000 |
16.920000 |
16.920000 |
139700 |
All Day |
0.430000 |
4 |
5 |
16.969999 |
16.990000 |
16.200001 |
16.290001 |
16.290001 |
216600 |
All Day |
0.789999 |
In [75]:
df.shape
Out[75]:
(20,
9)
In [76]:
df.High # Selecting one column
named "High"
Out[76]:
0 17.559999
1 17.469999
2 17.230000
3 17.330000
4 16.990000
5 16.530001
6 16.709999
7 16.629999
8 16.320000
Name:
High, dtype: float64
In [77]:
df.Adj Close
# Selecting
one column – error because space is there in between
File "<ipython-input-77-70fafd7a9d32>", line 1
df.Adj Close # Selecting one column --
error
^
SyntaxError: invalid syntax
In [78]:
df['Adj Close']
# Selecting
one column
Out[78]:
0 16.934999
1 17.200001
2 17.049999
3 16.920000
4 16.290001
5 16.480000
6 16.610001
Name:
Adj Close, dtype: float64
In [79]:
df[['Day','High','Low']] # Selecting more than one column
Out[79]:
Day |
High |
Low |
|
0 |
1 |
17.559999 |
16.730000 |
1 |
2 |
17.469999 |
16.950001 |
2 |
3 |
17.230000 |
16.870001 |
df[['Day','High','Low']].head(5) # Selecting more than one column with head function
Out[80]:
Day |
High |
Low |
|
0 |
1 |
17.559999 |
16.730000 |
1 |
2 |
17.469999 |
16.950001 |
2 |
3 |
17.230000 |
16.870001 |
3 |
4 |
17.330000 |
16.900000 |
4 |
5 |
16.990000 |
16.200001 |
In [81]:
select = ['Day','High','Low'] # create a list and pass it into the df
selecteddf = df[select].head(3)
selecteddf
Out[81]:
Day |
High |
Low |
|
0 |
1 |
17.559999 |
16.730000 |
1 |
2 |
17.469999 |
16.950001 |
2 |
3 |
17.230000 |
16.870001 |
In [82]:
type(selecteddf)
Out[82]:
pandas.core.frame.DataFrame
In [83]:
df['Open'] = df['Open'] + 0.25 # sum applied to all values in Open column
df.head(5)
Out[83]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
Week |
Difference |
|
0 |
1 |
17.020000 |
17.559999 |
16.730000 |
17.010000 |
16.934999 |
271700 |
All Day |
0.829999 |
1 |
2 |
17.350000 |
17.469999 |
16.950001 |
17.200001 |
17.200001 |
229800 |
All Day |
0.519998 |
2 |
3 |
17.309999 |
17.230000 |
16.870001 |
17.049999 |
17.049999 |
146300 |
All Day |
0.359999 |
3 |
4 |
17.150000 |
17.330000 |
16.900000 |
16.920000 |
16.920000 |
139700 |
All Day |
0.430000 |
4 |
5 |
17.219999 |
16.990000 |
16.200001 |
16.290001 |
16.290001 |
216600 |
All Day |
0.789999 |
Drop missing rows or column
In [84]:
df1 = pd.DataFrame([[np.nan,1,2,np.nan],[3,4,np.nan,np.nan],[5,np.nan,6,7],
[8,9,np.nan,10],[11,12,13,10],[np.nan,np.nan,np.nan,np.nan]],
columns=list('ABCD'),index=list('abcdef'))
df1
Out[84]:
A |
B |
C |
D |
|
a |
NaN |
1.0 |
2.0 |
NaN |
b |
3.0 |
4.0 |
NaN |
NaN |
c |
5.0 |
NaN |
6.0 |
7.0 |
d |
8.0 |
9.0 |
NaN |
10.0 |
e |
11.0 |
12.0 |
13.0 |
10.0 |
f |
NaN |
NaN |
NaN |
NaN |
In [85]:
df1.dropna(axis=0,how='any')
# axis = 0
means operation on Row
level
# how =
'any' means It will drop rows wherever there is atleast a NaN in the row
record
Out[85]:
A |
B |
C |
D |
|
e |
11.0 |
12.0 |
13.0 |
10.0 |
In [86]:
df1.dropna(axis=0,how='all')
# axis = 0 means operation on Row level
# how = 'all'
means It will drop rows wherever all record values are NaN in the row
Out[86]:
A |
B |
C |
D |
|
a |
NaN |
1.0 |
2.0 |
NaN |
b |
3.0 |
4.0 |
NaN |
NaN |
c |
5.0 |
NaN |
6.0 |
7.0 |
d |
8.0 |
9.0 |
NaN |
10.0 |
e |
11.0 |
12.0 |
13.0 |
10.0 |
In [87]:
df1 = pd.DataFrame([[np.nan,1,2,np.nan],[3,4,np.nan,np.nan],[5,np.nan,6,7],
[8,9,np.nan,10],[11,12,13,10],[np.nan,np.nan,np.nan,np.nan]],
columns=list('ABCD'))
df1
Out[87]:
A |
B |
C |
D |
|
0 |
NaN |
1.0 |
2.0 |
NaN |
1 |
3.0 |
4.0 |
NaN |
NaN |
2 |
5.0 |
NaN |
6.0 |
7.0 |
3 |
8.0 |
9.0 |
NaN |
10.0 |
4 |
11.0 |
12.0 |
13.0 |
10.0 |
5 |
NaN |
NaN |
NaN |
NaN |
In [88]:
df1.dropna(axis='index',how='any') # axis = index (same as axis = 0)
Out[88]:
A |
B |
C |
D |
|
4 |
11.0 |
12.0 |
13.0 |
10.0 |
In [89]:
df1.dropna(axis='columns',how='any') # axis = columns (same as axis = 1)
Out[89]:
0 |
1 |
2 |
3 |
4 |
5 |
In [90]:
df1.drop( axis = 0, labels=[2]) # Drop index no 2 i.e. 3rd row
Out[90]:
A |
B |
C |
D |
|
0 |
NaN |
1.0 |
2.0 |
NaN |
1 |
3.0 |
4.0 |
NaN |
NaN |
3 |
8.0 |
9.0 |
NaN |
10.0 |
4 |
11.0 |
12.0 |
13.0 |
10.0 |
5 |
NaN |
NaN |
NaN |
NaN |
In [91]:
df1.drop( axis = 0, labels=[2,4])
Out[91]:
A |
B |
C |
D |
|
0 |
NaN |
1.0 |
2.0 |
NaN |
1 |
3.0 |
4.0 |
NaN |
NaN |
3 |
8.0 |
9.0 |
NaN |
10.0 |
5 |
NaN |
NaN |
NaN |
NaN |
In [92]:
df1.drop( axis = 1, labels=['C']) # Drop column C
Out[92]:
A |
B |
D |
|
0 |
NaN |
1.0 |
NaN |
1 |
3.0 |
4.0 |
NaN |
2 |
5.0 |
NaN |
7.0 |
3 |
8.0 |
9.0 |
10.0 |
4 |
11.0 |
12.0 |
10.0 |
5 |
NaN |
NaN |
NaN |
In [93]:
df1.drop( axis = 1, labels=['A','C']) # Drop column A & C
Out[93]:
B |
D |
|
0 |
1.0 |
NaN |
1 |
4.0 |
NaN |
2 |
NaN |
7.0 |
3 |
9.0 |
10.0 |
4 |
12.0 |
10.0 |
5 |
NaN |
NaN |
In [94]:
df2 = pd.DataFrame([[np.nan,1,2,np.nan,23,np.nan],[3,4,np.nan,np.nan,24,np.nan],[5,np.nan,6,7,25,np.nan],
[8,9,np.nan,10,25,np.nan],[11,12,13,10,26,np.nan],[np.nan,np.nan,np.nan,np.nan,27,np.nan]],
columns=list('ABCDEF'),index=list('abcdef'))
df2
Out[94]:
A |
B |
C |
D |
E |
F |
|
a |
NaN |
1.0 |
2.0 |
NaN |
23 |
NaN |
b |
3.0 |
4.0 |
NaN |
NaN |
24 |
NaN |
c |
5.0 |
NaN |
6.0 |
7.0 |
25 |
NaN |
d |
8.0 |
9.0 |
NaN |
10.0 |
25 |
NaN |
e |
11.0 |
12.0 |
13.0 |
10.0 |
26 |
NaN |
f |
NaN |
NaN |
NaN |
NaN |
27 |
NaN |
In [95]:
df2.dropna(axis=1,how='any')
# axis = 1 means operation on Column level
# how =
'any' means It will drop columns wherever there is atleast a NaN
Out[95]:
E |
|
a |
23 |
b |
24 |
c |
25 |
d |
25 |
e |
26 |
f |
27 |
In [96]:
df2.dropna(axis=1,how='all')
# axis = 1 means operation on Column level
# how =
'all' means It will drop columns wherever all columns values are NaN
Out[96]:
A |
B |
C |
D |
E |
|
a |
NaN |
1.0 |
2.0 |
NaN |
23 |
b |
3.0 |
4.0 |
NaN |
NaN |
24 |
c |
5.0 |
NaN |
6.0 |
7.0 |
25 |
d |
8.0 |
9.0 |
NaN |
10.0 |
25 |
e |
11.0 |
12.0 |
13.0 |
10.0 |
26 |
f |
NaN |
NaN |
NaN |
NaN |
27 |
Filter data
In [97]:
df.head()
Out[97]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
Week |
Difference |
|
0 |
1 |
17.020000 |
17.559999 |
16.730000 |
17.010000 |
16.934999 |
271700 |
All Day |
0.829999 |
1 |
2 |
17.350000 |
17.469999 |
16.950001 |
17.200001 |
17.200001 |
229800 |
All Day |
0.519998 |
2 |
3 |
17.309999 |
17.230000 |
16.870001 |
17.049999 |
17.049999 |
146300 |
All Day |
0.359999 |
3 |
4 |
17.150000 |
17.330000 |
16.900000 |
16.920000 |
16.920000 |
139700 |
All Day |
0.430000 |
4 |
5 |
17.219999 |
16.990000 |
16.200001 |
16.290001 |
16.290001 |
216600 |
All Day |
0.789999 |
In [98]:
df['Day'] == 10
Out[98]:
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 True
10 False
11 False
12 False
13 False
14 False
15 False
16 False
17 False
18 False
19
False
Name:
Day, dtype: bool
In [99]:
con=df['Day'] == 10 # Give a name to the condition and
pass the variable name in df to filter
df[con]
Out[99]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
Week |
Difference |
|
9 |
10 |
16.02 |
16.6 |
15.77 |
16.41 |
16.41 |
121700 |
All Day |
0.83 |
In [100]:
df[df['Day'] > 15] # Without giving the condition
name
Out[100]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
Week |
Difference |
|
15 |
16 |
17.040001 |
17.049999 |
16.480000 |
17.040001 |
17.040001 |
177100 |
All Day |
0.569999 |
16 |
17 |
17.350000 |
17.480000 |
16.790001 |
17.190001 |
17.190001 |
236800 |
All Day |
0.689999 |
17 |
18 |
17.549999 |
17.389999 |
16.450001 |
16.510000 |
16.510000 |
300000 |
All Day |
0.939998 |
18 |
19 |
16.549999 |
16.629999 |
16.000000 |
16.459999 |
16.459999 |
324900 |
All Day |
0.629999 |
19 |
20 |
16.709999 |
16.580000 |
15.980000 |
16.040001 |
16.040001 |
141600 |
All Day |
0.600000 |
In [101]:
con1= df['Day'] >15 # Multiple
Conditions
con2 = df['Close'] > 16.5
df[con1 & con2] # AND
logical operator
Out[102]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
Week |
Difference |
|
15 |
16 |
17.040001 |
17.049999 |
16.480000 |
17.040001 |
17.040001 |
177100 |
All Day |
0.569999 |
16 |
17 |
17.350000 |
17.480000 |
16.790001 |
17.190001 |
17.190001 |
236800 |
All Day |
0.689999 |
17 |
18 |
17.549999 |
17.389999 |
16.450001 |
16.510000 |
16.510000 |
300000 |
All Day |
0.939998 |
In [103]:
df[con1 | con2] # OR
operator
Out[103]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
Week |
Difference |
|
0 |
1 |
17.020000 |
17.559999 |
16.730000 |
17.010000 |
16.934999 |
271700 |
All Day |
0.829999 |
1 |
2 |
17.350000 |
17.469999 |
16.950001 |
17.200001 |
17.200001 |
229800 |
All Day |
0.519998 |
2 |
3 |
17.309999 |
17.230000 |
16.870001 |
17.049999 |
17.049999 |
146300 |
All Day |
0.359999 |
3 |
4 |
17.150000 |
17.330000 |
16.900000 |
16.920000 |
16.920000 |
139700 |
All Day |
0.430000 |
6 |
7 |
16.639999 |
16.709999 |
16.320000 |
16.610001 |
16.610001 |
93000 |
All Day |
0.389999 |
14 |
15 |
16.719999 |
16.870001 |
16.340000 |
16.790001 |
16.790001 |
174100 |
All Day |
0.530001 |
15 |
16 |
17.040001 |
17.049999 |
16.480000 |
17.040001 |
17.040001 |
177100 |
All Day |
0.569999 |
16 |
17 |
17.350000 |
17.480000 |
16.790001 |
17.190001 |
17.190001 |
236800 |
All Day |
0.689999 |
17 |
18 |
17.549999 |
17.389999 |
16.450001 |
16.510000 |
16.510000 |
300000 |
All Day |
0.939998 |
18 |
19 |
16.549999 |
16.629999 |
16.000000 |
16.459999 |
16.459999 |
324900 |
All Day |
0.629999 |
19 |
20 |
16.709999 |
16.580000 |
15.980000 |
16.040001 |
16.040001 |
141600 |
All Day |
0.600000 |
In [104]:
con3 = df['Volume'] > 200000
In [105]:
df[(con1 | con2) & con3] # AND OR
operators together
Out[105]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
Week |
Difference |
|
0 |
1 |
17.020000 |
17.559999 |
16.730000 |
17.010000 |
16.934999 |
271700 |
All Day |
0.829999 |
1 |
2 |
17.350000 |
17.469999 |
16.950001 |
17.200001 |
17.200001 |
229800 |
All Day |
0.519998 |
16 |
17 |
17.350000 |
17.480000 |
16.790001 |
17.190001 |
17.190001 |
236800 |
All Day |
0.689999 |
17 |
18 |
17.549999 |
17.389999 |
16.450001 |
16.510000 |
16.510000 |
300000 |
All Day |
0.939998 |
18 |
19 |
16.549999 |
16.629999 |
16.000000 |
16.459999 |
16.459999 |
324900 |
All Day |
0.629999 |
In [111]:
df[df['Day'].isin([1,2,3,10,15,19])] # Use of isin function to pass multiple conditions
at one time
Out[111]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
Week |
Difference |
|
0 |
1 |
17.020000 |
17.559999 |
16.730000 |
17.010000 |
16.934999 |
271700 |
All Day |
0.829999 |
1 |
2 |
17.350000 |
17.469999 |
16.950001 |
17.200001 |
17.200001 |
229800 |
All Day |
0.519998 |
2 |
3 |
17.309999 |
17.230000 |
16.870001 |
17.049999 |
17.049999 |
146300 |
All Day |
0.359999 |
9 |
10 |
16.020000 |
16.600000 |
15.770000 |
16.410000 |
16.410000 |
121700 |
All Day |
0.830000 |
14 |
15 |
16.719999 |
16.870001 |
16.340000 |
16.790001 |
16.790001 |
174100 |
All Day |
0.530001 |
18 |
19 |
16.549999 |
16.629999 |
16.000000 |
16.459999 |
16.459999 |
324900 |
All Day |
0.629999 |
In [113]:
df[df['Day'].between(10,15)] # Use of
between function to pass 2 values
Out[113]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
Week |
Difference |
|
9 |
10 |
16.020000 |
16.600000 |
15.770000 |
16.410000 |
16.410000 |
121700 |
All Day |
0.830000 |
10 |
11 |
16.639999 |
16.389999 |
16.100000 |
16.150000 |
16.150000 |
32300 |
All Day |
0.289999 |
11 |
12 |
16.580000 |
16.480000 |
16.059999 |
16.360001 |
16.360001 |
143000 |
All Day |
0.420001 |
12 |
13 |
16.590000 |
16.340000 |
16.110001 |
16.219999 |
16.219999 |
83900 |
All Day |
0.229999 |
13 |
14 |
16.490000 |
16.500000 |
16.049999 |
16.500000 |
16.500000 |
147700 |
All Day |
0.450001 |
14 |
15 |
16.719999 |
16.870001 |
16.340000 |
16.790001 |
16.790001 |
174100 |
All Day |
0.530001 |
In [115]:
df['Week'].unique() # it will return all available unique values
Out[115]:
array(['All Day'], dtype=object)
In [116]:
df['Week'].nunique() # it will show the number of unique values
Out[116]:
1
In [124]:
df.sort_values(by='Day', ascending = False).head() # sort_values sorts ascending by default; ascending=False sorts descending
Out[124]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
Week |
Difference |
|
19 |
20 |
16.709999 |
16.580000 |
15.980000 |
16.040001 |
16.040001 |
141600 |
All Day |
0.600000 |
18 |
19 |
16.549999 |
16.629999 |
16.000000 |
16.459999 |
16.459999 |
324900 |
All Day |
0.629999 |
17 |
18 |
17.549999 |
17.389999 |
16.450001 |
16.510000 |
16.510000 |
300000 |
All Day |
0.939998 |
16 |
17 |
17.350000 |
17.480000 |
16.790001 |
17.190001 |
17.190001 |
236800 |
All Day |
0.689999 |
15 |
16 |
17.040001 |
17.049999 |
16.480000 |
17.040001 |
17.040001 |
177100 |
All Day |
0.569999 |
In [123]:
df.sort_values(by=['Volume','Day'], ascending = [False,True]).head()
# Descending
(False) for Volume and Ascending (True) for Day
Out[123]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
Week |
Difference |
|
18 |
19 |
16.549999 |
16.629999 |
16.000000 |
16.459999 |
16.459999 |
324900 |
All Day |
0.629999 |
17 |
18 |
17.549999 |
17.389999 |
16.450001 |
16.510000 |
16.510000 |
300000 |
All Day |
0.939998 |
0 |
1 |
17.020000 |
17.559999 |
16.730000 |
17.010000 |
16.934999 |
271700 |
All Day |
0.829999 |
16 |
17 |
17.350000 |
17.480000 |
16.790001 |
17.190001 |
17.190001 |
236800 |
All Day |
0.689999 |
1 |
2 |
17.350000 |
17.469999 |
16.950001 |
17.200001 |
17.200001 |
229800 |
All Day |
0.519998 |
In [125]:
df.sort_index(ascending=False) # sorting by index value
Out[125]:
Day |
Open |
High |
Low |
Close |
Adj Close |
Volume |
Week |
Difference |
|
4 |
5 |
17.219999 |
16.990000 |
16.200001 |
16.290001 |
16.290001 |
216600 |
All Day |
0.789999 |
3 |
4 |
17.150000 |
17.330000 |
16.900000 |
16.920000 |
16.920000 |
139700 |
All Day |
0.430000 |
2 |
3 |
17.309999 |
17.230000 |
16.870001 |
17.049999 |
17.049999 |
146300 |
All Day |
0.359999 |
1 |
2 |
17.350000 |
17.469999 |
16.950001 |
17.200001 |
17.200001 |
229800 |
All Day |
0.519998 |
0 |
1 |
17.020000 |
17.559999 |
16.730000 |
17.010000 |
16.934999 |
271700 |
All Day |
0.829999 |
In [130]:
data=pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/drinks.csv')
data.head()
Out[135]:
country |
beer_servings |
spirit_servings |
wine_servings |
total_litres_of_pure_alcohol |
continent |
|
0 |
Afghanistan |
0 |
0 |
0 |
0.0 |
AS |
1 |
Albania |
89 |
132 |
54 |
4.9 |
EU |
2 |
Algeria |
25 |
0 |
14 |
0.7 |
AF |
3 |
Andorra |
245 |
138 |
312 |
12.4 |
EU |
4 |
Angola |
217 |
57 |
45 |
5.9 |
AF |
In [143]:
data.loc[3] # Index number 3 (Row# 4) data
Out[143]:
country Andorra
beer_servings 245
spirit_servings 138
wine_servings 312
total_litres_of_pure_alcohol 12.4
continent EU
Name:
3, dtype: object
In [149]:
data.loc[[4,1]] # more than one row of data
Out[149]:
country |
beer_servings |
spirit_servings |
wine_servings |
total_litres_of_pure_alcohol |
continent |
|
4 |
Angola |
217 |
57 |
45 |
5.9 |
AF |
1 |
Albania |
89 |
132 |
54 |
4.9 |
EU |
In [151]:
data.loc[[0,4,1]] [['country','continent']]
Out[151]:
country |
continent |
|
0 |
Afghanistan |
AS |
4 |
Angola |
AF |
1 |
Albania |
EU |
In [154]:
data.loc[0:2]
Out[154]:
country |
beer_servings |
spirit_servings |
wine_servings |
total_litres_of_pure_alcohol |
continent |
|
0 |
Afghanistan |
0 |
0 |
0 |
0.0 |
AS |
1 |
Albania |
89 |
132 |
54 |
4.9 |
EU |
2 |
Algeria |
25 |
0 |
14 |
0.7 |
AF |
In [158]:
data.loc[5:8] [['country','continent']] # 5th 6th 7th 8th rows LABEL based INDEXING
Out[158]:
country |
continent |
|
5 |
Antigua & Barbuda |
NaN |
6 |
Argentina |
SA |
7 |
Armenia |
EU |
8 |
Australia |
OC |
In [157]:
data.iloc[6]
Out[157]:
country Argentina
beer_servings 193
spirit_servings 25
wine_servings 221
total_litres_of_pure_alcohol 8.3
continent SA
Name:
6, dtype: object
In [159]:
data.iloc[5:8] [['country','continent']] # 5th 6th 7th rows note that there is no 8th Row. POSITIONAL based INDEXING
Out[159]:
country |
continent |
|
5 |
Antigua & Barbuda |
NaN |
6 |
Argentina |
SA |
7 |
Armenia |
EU |
loc gets rows (or columns) with
particular labels from the index.
iloc gets rows
(or columns) at particular positions in the index (so it only takes integers).
ix usually
tries to behave like loc but falls back to behaving like iloc if a label is not
present in the index.
In [160]:
s = pd.Series(np.nan, index=[49,48,47,46,45, 1, 2, 3, 4, 5])
s
Out[160]:
49 NaN
48 NaN
47 NaN
46 NaN
45 NaN
1 NaN
2 NaN
3 NaN
4 NaN
5 NaN
dtype:
float64
In [161]:
s.loc[:3] # LABEL based. So, slice up to and including label
3
Out[161]:
49 NaN
48 NaN
47 NaN
46 NaN
45 NaN
1 NaN
2 NaN
3 NaN
dtype:
float64
In [162]:
s.iloc[:3] # POSITION based. So, 1st 3 rows
Out[162]:
49 NaN
48 NaN
47 NaN
dtype:
float64
In [163]:
s.ix[:3] # the integer is in the index so s.ix[:3] works like loc
C:\Users\kuldip_s\Downloads\Anaconda\lib\site-packages\ipykernel_launcher.py:1:
FutureWarning:
.ix is deprecated. Please use
.loc
for label based indexing or
.iloc
for positional indexing
See
the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
"""Entry point for launching
an IPython kernel.
Out[163]:
49 NaN
48 NaN
47 NaN
46 NaN
45 NaN
1 NaN
2 NaN
3 NaN
dtype:
float64
In [165]:
data=pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/drinks.csv')
data
Out[165]:
country |
beer_servings |
spirit_servings |
wine_servings |
total_litres_of_pure_alcohol |
continent |
|
0 |
Afghanistan |
0 |
0 |
0 |
0.0 |
AS |
1 |
Albania |
89 |
132 |
54 |
4.9 |
EU |
2 |
Algeria |
25 |
0 |
14 |
0.7 |
AF |
3 |
Andorra |
245 |
138 |
312 |
12.4 |
EU |
4 |
Angola |
217 |
57 |
45 |
5.9 |
AF |
... |
... |
... |
... |
... |
... |
... |
188 |
Venezuela |
333 |
100 |
3 |
7.7 |
SA |
189 |
Vietnam |
111 |
2 |
1 |
2.0 |
AS |
190 |
Yemen |
6 |
0 |
0 |
0.1 |
AS |
191 |
Zambia |
32 |
19 |
4 |
2.5 |
AF |
192 |
Zimbabwe |
64 |
18 |
4 |
4.7 |
AF |
193 rows × 6 columns
In [171]:
# Changing
of by default Index column - 2 ways : index_col & set_index
data=pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/drinks.csv', index_col='country')
data
Out[171]:
beer_servings |
spirit_servings |
wine_servings |
total_litres_of_pure_alcohol |
continent |
|
country |
|||||
Afghanistan |
0 |
0 |
0 |
0.0 |
AS |
Albania |
89 |
132 |
54 |
4.9 |
EU |
Algeria |
25 |
0 |
14 |
0.7 |
AF |
Andorra |
245 |
138 |
312 |
12.4 |
EU |
Angola |
217 |
57 |
45 |
5.9 |
AF |
... |
... |
... |
... |
... |
... |
Venezuela |
333 |
100 |
3 |
7.7 |
SA |
Vietnam |
111 |
2 |
1 |
2.0 |
AS |
Yemen |
6 |
0 |
0 |
0.1 |
AS |
Zambia |
32 |
19 |
4 |
2.5 |
AF |
Zimbabwe |
64 |
18 |
4 |
4.7 |
AF |
193 rows × 5 columns
In [174]:
data=pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/drinks.csv')
data.set_index('continent')
Out[174]:
country |
beer_servings |
spirit_servings |
wine_servings |
total_litres_of_pure_alcohol |
|
continent |
|||||
AS |
Afghanistan |
0 |
0 |
0 |
0.0 |
EU |
Albania |
89 |
132 |
54 |
4.9 |
AF |
Algeria |
25 |
0 |
14 |
0.7 |
EU |
Andorra |
245 |
138 |
312 |
12.4 |
AF |
Angola |
217 |
57 |
45 |
5.9 |
... |
... |
... |
... |
... |
... |
SA |
Venezuela |
333 |
100 |
3 |
7.7 |
AS |
Vietnam |
111 |
2 |
1 |
2.0 |
AS |
Yemen |
6 |
0 |
0 |
0.1 |
AF |
Zambia |
32 |
19 |
4 |
2.5 |
AF |
Zimbabwe |
64 |
18 |
4 |
4.7 |
193 rows × 5 columns
In [177]:
data.set_index(['continent','country'])
Out[177]:
beer_servings |
spirit_servings |
wine_servings |
total_litres_of_pure_alcohol |
||
continent |
country |
||||
AS |
Afghanistan |
0 |
0 |
0 |
0.0 |
EU |
Albania |
89 |
132 |
54 |
4.9 |
AF |
Algeria |
25 |
0 |
14 |
0.7 |
EU |
Andorra |
245 |
138 |
312 |
12.4 |
AF |
Angola |
217 |
57 |
45 |
5.9 |
... |
... |
... |
... |
... |
... |
SA |
Venezuela |
333 |
100 |
3 |
7.7 |
AS |
Vietnam |
111 |
2 |
1 |
2.0 |
Yemen |
6 |
0 |
0 |
0.1 |
|
AF |
Zambia |
32 |
19 |
4 |
2.5 |
Zimbabwe |
64 |
18 |
4 |
4.7 |
193 rows × 4 columns
In [178]:
data
Out[178]:
country |
beer_servings |
spirit_servings |
wine_servings |
total_litres_of_pure_alcohol |
continent |
|
0 |
Afghanistan |
0 |
0 |
0 |
0.0 |
AS |
1 |
Albania |
89 |
132 |
54 |
4.9 |
EU |
2 |
Algeria |
25 |
0 |
14 |
0.7 |
AF |
In [198]:
def geograhy(info):
    """Return the total servings (beer + spirit + wine) for one row.

    NOTE(review): the name 'geograhy' is a typo for 'geography', but it is
    kept unchanged because ``data.apply(func=geograhy, axis='columns')``
    below refers to it by this exact name.

    Parameters
    ----------
    info : sequence / pandas row
        One row of the drinks DataFrame, accessed positionally:
        [0]=country, [1]=beer_servings, [2]=spirit_servings,
        [3]=wine_servings, ...  (assumes the original column order of
        drinks.csv — verify if the CSV layout changes).

    Returns
    -------
    Sum of the beer, spirit and wine serving counts.
    """
    beer_servings = info[1]    # capture beer_servings value
    spirit_servings = info[2]  # capture spirit_servings value
    wine_servings = info[3]    # capture wine_servings value
    return beer_servings + spirit_servings + wine_servings
In [199]:
data['New_Column'] =
data.apply(func=geograhy, axis='columns') # New column added to DF
data
Out[199]:
country |
beer_servings |
spirit_servings |
wine_servings |
total_litres_of_pure_alcohol |
continent |
New_Column |
|
0 |
Afghanistan |
0 |
0 |
0 |
0.0 |
AS |
0 |
1 |
Albania |
89 |
132 |
54 |
4.9 |
EU |
275 |
2 |
Algeria |
25 |
0 |
14 |
0.7 |
AF |
39 |
193 rows × 7 columns
In [200]:
len(data) # total number of records
Out[200]:
193
In [203]:
data.sample(n =5) # randomly select 5 sample records
Out[203]:
country |
beer_servings |
spirit_servings |
wine_servings |
total_litres_of_pure_alcohol |
continent |
New_Column |
|
86 |
Jordan |
6 |
21 |
1 |
0.5 |
AS |
28 |
15 |
Belarus |
142 |
373 |
42 |
14.4 |
EU |
557 |
54 |
El Salvador |
52 |
69 |
2 |
2.2 |
NaN |
123 |
64 |
Georgia |
52 |
100 |
149 |
5.4 |
EU |
301 |
160 |
Spain |
284 |
157 |
112 |
10.0 |
EU |
553 |
In [206]:
data.sample(frac = 2/100) # Extract 2 % of sample data randomly
Out[206]:
country |
beer_servings |
spirit_servings |
wine_servings |
total_litres_of_pure_alcohol |
continent |
New_Column |
|
39 |
Congo |
76 |
1 |
9 |
1.7 |
AF |
86 |
89 |
Kiribati |
21 |
34 |
1 |
1.0 |
OC |
56 |
46 |
North Korea |
0 |
0 |
0 |
0.0 |
AS |
0 |
104 |
Mali |
5 |
1 |
1 |
0.6 |
AF |
7 |
In [207]:
data.info() # column data types and Total
number of record in Columns
<class
'pandas.core.frame.DataFrame'>
RangeIndex:
193 entries, 0 to 192
Data
columns (total 7 columns):
country 193 non-null object
beer_servings 193 non-null int64
spirit_servings 193 non-null int64
wine_servings 193 non-null int64
total_litres_of_pure_alcohol 193 non-null float64
continent 170 non-null object
New_Column
193 non-null int64
dtypes:
float64(1), int64(4), object(2)
memory
usage: 10.7+ KB
In [208]:
data.describe() # Statistical information for numerical column
Out[208]:
beer_servings |
spirit_servings |
wine_servings |
total_litres_of_pure_alcohol |
New_Column |
|
count |
193.000000 |
193.000000 |
193.000000 |
193.000000 |
193.000000 |
mean |
106.160622 |
80.994819 |
49.450777 |
4.717098 |
236.606218 |
std |
101.143103 |
88.284312 |
79.697598 |
3.773298 |
209.307863 |
min |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
25% |
20.000000 |
4.000000 |
1.000000 |
1.300000 |
45.000000 |
50% |
76.000000 |
56.000000 |
8.000000 |
4.200000 |
173.000000 |
75% |
188.000000 |
128.000000 |
59.000000 |
7.200000 |
392.000000 |
max |
376.000000 |
438.000000 |
370.000000 |
14.400000 |
695.000000 |
In [209]:
data['continent'].value_counts() # Count the number of countries in
each continent
Out[209]:
AF 53
EU 45
AS 44
OC 16
SA 12
Name:
continent, dtype: int64
In [217]:
max_countries = data['continent'].value_counts().max()
# Get the continent which has most number of Countries
max_countries
Out[217]:
53
In [222]:
data['continent'].value_counts().sort_values(ascending=False).head(1)
Out[222]:
AF 53
Name:
continent, dtype: int64
In [231]:
top3_continent = list(data['continent'].value_counts().sort_values(ascending=False).head(3).index)
top3_continent
Out[231]:
['AF',
'EU', 'AS']
In [230]:
data[data['continent'].isin(top3_continent)]  # NOTE: iloc takes integer positions, not labels like 'AF'; filter by label with isin
No comments:
Post a Comment