Monday, 28 September 2020

Python Pandas : Dataframe - i

 DataFrame

 

import numpy as np

import pandas as pd

 

In [59]:

data = np.random.randn(5,3)  # Generate random data in matrix/tabular form 5 rows and 3 columns

 

In [60]:

data

Out[60]:

array([[-0.62162763,  0.9921252 , -0.3884227 ],

       [ 0.25709088, -0.1365675 ,  0.77113954],

       [-0.96153538, -1.26281479, -1.58631299],

       [-2.38202279, -2.19703702,  0.42556743],

       [-0.42338357, -0.6131884 , -0.66370211]])

 

In [61]:

type(data)

Out[61]:

numpy.ndarray

 

In [62]:

pdata = pd.DataFrame(data)  # Creating a DataFrame out of numpy array using  DataFrame function

 

In [63]:

type(pdata)

Out[63]:

pandas.core.frame.DataFrame

 

In [64]:

pdata

Out[64]:

0

1

2

0

-0.621628

0.992125

-0.388423

1

0.257091

-0.136568

0.771140

2

-0.961535

-1.262815

-1.586313

3

-2.382023

-2.197037

0.425567

4

-0.423384

-0.613188

-0.663702

 

In [65]:

pd.DataFrame(data, columns = ['A','B','C'])

Out[65]:

A

B

C

0

-0.621628

0.992125

-0.388423

1

0.257091

-0.136568

0.771140

2

-0.961535

-1.262815

-1.586313

3

-2.382023

-2.197037

0.425567

4

-0.423384

-0.613188

-0.663702

 

In [66]:

pd.DataFrame(data, columns = ['A','B','C'], index = ['a','b','c','d','e'])

Out[66]:

A

B

C

a

-0.621628

0.992125

-0.388423

b

0.257091

-0.136568

0.771140

c

-0.961535

-1.262815

-1.586313

d

-2.382023

-2.197037

0.425567

e

-0.423384

-0.613188

-0.663702

 

In [67]:

df = pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/Stock.csv')

# Creating a DataFrame from a CSV file using the read_csv function

df

Out[67]:

Day

Open

High

Low

Close

Adj Close

Volume

0

1

16.770000

17.559999

16.730000

17.010000

16.934999

271700

1

2

17.100000

17.469999

16.950001

17.200001

17.200001

229800

2

3

17.059999

17.230000

16.870001

17.049999

17.049999

146300

3

4

16.900000

17.330000

16.900000

16.920000

16.920000

139700

4

5

16.969999

16.990000

16.200001

16.290001

16.290001

216600

5

6

16.240000

16.530001

16.110001

16.480000

16.480000

115700

In [68]:

df.head(2)  # BUT by default 5 records

Out[68]:

Day

Open

High

Low

Close

Adj Close

Volume

0

1

16.77

17.559999

16.730000

17.010000

16.934999

271700

1

2

17.10

17.469999

16.950001

17.200001

17.200001

229800

 

In [69]:

df.tail(2)  # by default 5 records

Out[69]:

Day

Open

High

Low

Close

Adj Close

Volume

18

19

16.299999

16.629999

16.00

16.459999

16.459999

324900

19

20

16.459999

16.580000

15.98

16.040001

16.040001

141600

 

In [70]:

df[2:5]  # To show data for index number 2,3,4

Out[70]:

Day

Open

High

Low

Close

Adj Close

Volume

2

3

17.059999

17.23

16.870001

17.049999

17.049999

146300

3

4

16.900000

17.33

16.900000

16.920000

16.920000

139700

4

5

16.969999

16.99

16.200001

16.290001

16.290001

216600

 

In [71]:

df.index

Out[71]:

RangeIndex(start=0, stop=20, step=1)

 

In [72]:

df.shape                                                   # To show number of rows and columns

Out[72]:

(20, 7)

 

In [73]:

df['Week'] = 'All Day'  # Adding a new column 'Week' at the end and assign a value to all records

df.head(5)

Out[73]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

0

1

16.770000

17.559999

16.730000

17.010000

16.934999

271700

All Day

1

2

17.100000

17.469999

16.950001

17.200001

17.200001

229800

All Day

2

3

17.059999

17.230000

16.870001

17.049999

17.049999

146300

All Day

3

4

16.900000

17.330000

16.900000

16.920000

16.920000

139700

All Day

4

5

16.969999

16.990000

16.200001

16.290001

16.290001

216600

All Day

 

 

In [74]:

df['Difference'] = df['High'] - df['Low']  # Another new column

df.head(5)

 

Out[74]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

0

1

16.770000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

1

2

17.100000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

2

3

17.059999

17.230000

16.870001

17.049999

17.049999

146300

All Day

0.359999

3

4

16.900000

17.330000

16.900000

16.920000

16.920000

139700

All Day

0.430000

4

5

16.969999

16.990000

16.200001

16.290001

16.290001

216600

All Day

0.789999

 

In [75]:

df.shape

Out[75]:

(20, 9)

 

In [76]:

df.High                   # Selecting one column named “High”

Out[76]:

0     17.559999

1     17.469999

2     17.230000

3     17.330000

4     16.990000

5     16.530001

6     16.709999

7     16.629999

8     16.320000

Name: High, dtype: float64

In [77]:

df.Adj Close                 # Selecting one column – error because space is there in between

  File "<ipython-input-77-70fafd7a9d32>", line 1

    df.Adj Close                 # Selecting one column -- error

               ^

SyntaxError: invalid syntax

 

 

In [78]:

df['Adj Close']                  # Selecting one column

Out[78]:

0     16.934999

1     17.200001

2     17.049999

3     16.920000

4     16.290001

5     16.480000

6     16.610001

Name: Adj Close, dtype: float64

 

In [79]:

df[['Day','High','Low']]            # Selecting more than one column

Out[79]:

Day

High

Low

0

1

17.559999

16.730000

1

2

17.469999

16.950001

2

3

17.230000

16.870001

In [80]:

df[['Day','High','Low']].head(5)         # Selecting more than one column with head function

Out[80]:

Day

High

Low

0

1

17.559999

16.730000

1

2

17.469999

16.950001

2

3

17.230000

16.870001

3

4

17.330000

16.900000

4

5

16.990000

16.200001

 

In [81]:

select = ['Day','High','Low']  # create  a list and pass it into the df

selecteddf = df[select].head(3)

selecteddf

 

Out[81]:

Day

High

Low

0

1

17.559999

16.730000

1

2

17.469999

16.950001

2

3

17.230000

16.870001

 

In [82]:

type(selecteddf)

Out[82]:

pandas.core.frame.DataFrame

 

In [83]:

df['Open'] = df['Open'] + 0.25  # sum applied to all values in Open column

df.head(5)

 

Out[83]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

0

1

17.020000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

1

2

17.350000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

2

3

17.309999

17.230000

16.870001

17.049999

17.049999

146300

All Day

0.359999

3

4

17.150000

17.330000

16.900000

16.920000

16.920000

139700

All Day

0.430000

4

5

17.219999

16.990000

16.200001

16.290001

16.290001

216600

All Day

0.789999

 

Drop missing rows or columns

 

In [84]:

df1 = pd.DataFrame([[np.nan,1,2,np.nan],[3,4,np.nan,np.nan],[5,np.nan,6,7],

                    [8,9,np.nan,10],[11,12,13,10],[np.nan,np.nan,np.nan,np.nan]],

                   columns=list('ABCD'),index=list('abcdef'))

df1

Out[84]:

A

B

C

D

a

NaN

1.0

2.0

NaN

b

3.0

4.0

NaN

NaN

c

5.0

NaN

6.0

7.0

d

8.0

9.0

NaN

10.0

e

11.0

12.0

13.0

10.0

f

NaN

NaN

NaN

NaN

In [85]:

df1.dropna(axis=0, how='any')

# axis = 0 means operation on Row level

# how = 'any' means It will drop rows wherever there is atleast a NaN in the row record

Out[85]:

A

B

C

D

e

11.0

12.0

13.0

10.0

 

In [86]:

df1.dropna(axis=0,how='all')

# axis = 0 means operation on Row level

# how = 'all' means It will drop rows wherever all record values are NaN in the row

Out[86]:

A

B

C

D

a

NaN

1.0

2.0

NaN

b

3.0

4.0

NaN

NaN

c

5.0

NaN

6.0

7.0

d

8.0

9.0

NaN

10.0

e

11.0

12.0

13.0

10.0

 

In [87]:

df1 = pd.DataFrame([[np.nan,1,2,np.nan],[3,4,np.nan,np.nan],[5,np.nan,6,7],

                    [8,9,np.nan,10],[11,12,13,10],[np.nan,np.nan,np.nan,np.nan]],

                   columns=list('ABCD'))

df1

Out[87]:

A

B

C

D

0

NaN

1.0

2.0

NaN

1

3.0

4.0

NaN

NaN

2

5.0

NaN

6.0

7.0

3

8.0

9.0

NaN

10.0

4

11.0

12.0

13.0

10.0

5

NaN

NaN

NaN

NaN

 

In [88]:

df1.dropna(axis='index',how='any') # axis = index (same as axis = 0)

Out[88]:

A

B

C

D

4

11.0

12.0

13.0

10.0

 

In [89]:

df1.dropna(axis='columns',how='any') # axis = columns (same as axis = 1)

Out[89]:

0

1

2

3

4

5

In [90]:

df1.drop( axis = 0, labels=[2])  # Drop index no 2 i.e. 3rd row

Out[90]:

A

B

C

D

0

NaN

1.0

2.0

NaN

1

3.0

4.0

NaN

NaN

3

8.0

9.0

NaN

10.0

4

11.0

12.0

13.0

10.0

5

NaN

NaN

NaN

NaN

 

In [91]:

df1.drop( axis = 0, labels=[2,4])

Out[91]:

A

B

C

D

0

NaN

1.0

2.0

NaN

1

3.0

4.0

NaN

NaN

3

8.0

9.0

NaN

10.0

5

NaN

NaN

NaN

NaN

 

In [92]:

df1.drop( axis = 1, labels=['C'])  # Drop column C

Out[92]:

A

B

D

0

NaN

1.0

NaN

1

3.0

4.0

NaN

2

5.0

NaN

7.0

3

8.0

9.0

10.0

4

11.0

12.0

10.0

5

NaN

NaN

NaN

In [93]:

df1.drop( axis = 1, labels=['A','C'])  # Drop column A & C

Out[93]:

B

D

0

1.0

NaN

1

4.0

NaN

2

NaN

7.0

3

9.0

10.0

4

12.0

10.0

5

NaN

NaN

 

In [94]:

df2 = pd.DataFrame([[np.nan,1,2,np.nan,23,np.nan],[3,4,np.nan,np.nan,24,np.nan],[5,np.nan,6,7,25,np.nan],

                    [8,9,np.nan,10,25,np.nan],[11,12,13,10,26,np.nan],[np.nan,np.nan,np.nan,np.nan,27,np.nan]],

                   columns=list('ABCDEF'),index=list('abcdef'))

df2

Out[94]:

A

B

C

D

E

F

a

NaN

1.0

2.0

NaN

23

NaN

b

3.0

4.0

NaN

NaN

24

NaN

c

5.0

NaN

6.0

7.0

25

NaN

d

8.0

9.0

NaN

10.0

25

NaN

e

11.0

12.0

13.0

10.0

26

NaN

f

NaN

NaN

NaN

NaN

27

NaN

In [95]:

df2.dropna(axis=1,how='any')

# axis = 1 means operation on Column level

# how = 'any' means It will drop columns wherever there is atleast a NaN

 

Out[95]:

E

a

23

b

24

c

25

d

25

e

26

f

27

In [96]:

df2.dropna(axis=1,how='all')

# axis = 1 means operation on Column level

# how = 'all' means It will drop columns wherever all columns values are NaN

Out[96]:

A

B

C

D

E

a

NaN

1.0

2.0

NaN

23

b

3.0

4.0

NaN

NaN

24

c

5.0

NaN

6.0

7.0

25

d

8.0

9.0

NaN

10.0

25

e

11.0

12.0

13.0

10.0

26

f

NaN

NaN

NaN

NaN

27


Filter data

In [97]:

df.head()

Out[97]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

0

1

17.020000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

1

2

17.350000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

2

3

17.309999

17.230000

16.870001

17.049999

17.049999

146300

All Day

0.359999

3

4

17.150000

17.330000

16.900000

16.920000

16.920000

139700

All Day

0.430000

4

5

17.219999

16.990000

16.200001

16.290001

16.290001

216600

All Day

0.789999

 

In [98]:

df['Day'] == 10 

Out[98]:

0     False

1     False

2     False

3     False

4     False

5     False

6     False

7     False

8     False

9      True

10    False

11    False

12    False

13    False

14    False

15    False

16    False

17    False

18    False

19    False

Name: Day, dtype: bool

 

In [99]:

con=df['Day'] == 10          # Give a name to the condition and pass the variable name in df to filter

df[con]

Out[99]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

9

10

16.02

16.6

15.77

16.41

16.41

121700

All Day

0.83

 

In [100]:

df[df['Day'] > 15]          # Without giving the condition name

Out[100]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

15

16

17.040001

17.049999

16.480000

17.040001

17.040001

177100

All Day

0.569999

16

17

17.350000

17.480000

16.790001

17.190001

17.190001

236800

All Day

0.689999

17

18

17.549999

17.389999

16.450001

16.510000

16.510000

300000

All Day

0.939998

18

19

16.549999

16.629999

16.000000

16.459999

16.459999

324900

All Day

0.629999

19

20

16.709999

16.580000

15.980000

16.040001

16.040001

141600

All Day

0.600000

 

In [101]:

con1= df['Day'] >15                     # Multiple Conditions

con2 = df['Close'] > 16.5

df[con1 & con2]                      # AND logical operator

Out[102]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

15

16

17.040001

17.049999

16.480000

17.040001

17.040001

177100

All Day

0.569999

16

17

17.350000

17.480000

16.790001

17.190001

17.190001

236800

All Day

0.689999

17

18

17.549999

17.389999

16.450001

16.510000

16.510000

300000

All Day

0.939998

 

In [103]:

df[con1 | con2]            # OR operator

Out[103]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

0

1

17.020000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

1

2

17.350000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

2

3

17.309999

17.230000

16.870001

17.049999

17.049999

146300

All Day

0.359999

3

4

17.150000

17.330000

16.900000

16.920000

16.920000

139700

All Day

0.430000

6

7

16.639999

16.709999

16.320000

16.610001

16.610001

93000

All Day

0.389999

14

15

16.719999

16.870001

16.340000

16.790001

16.790001

174100

All Day

0.530001

15

16

17.040001

17.049999

16.480000

17.040001

17.040001

177100

All Day

0.569999

16

17

17.350000

17.480000

16.790001

17.190001

17.190001

236800

All Day

0.689999

17

18

17.549999

17.389999

16.450001

16.510000

16.510000

300000

All Day

0.939998

18

19

16.549999

16.629999

16.000000

16.459999

16.459999

324900

All Day

0.629999

19

20

16.709999

16.580000

15.980000

16.040001

16.040001

141600

All Day

0.600000

In [104]:

con3 = df['Volume'] > 200000

 

In [105]:

df[(con1 | con2) & con3]            # AND OR operators together

Out[105]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

0

1

17.020000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

1

2

17.350000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

16

17

17.350000

17.480000

16.790001

17.190001

17.190001

236800

All Day

0.689999

17

18

17.549999

17.389999

16.450001

16.510000

16.510000

300000

All Day

0.939998

18

19

16.549999

16.629999

16.000000

16.459999

16.459999

324900

All Day

0.629999

 

In [111]:

df[df['Day'].isin([1,2,3,10,15,19])]  # Use of isin function to pass multiple conditions at one time

Out[111]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

0

1

17.020000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

1

2

17.350000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

2

3

17.309999

17.230000

16.870001

17.049999

17.049999

146300

All Day

0.359999

9

10

16.020000

16.600000

15.770000

16.410000

16.410000

121700

All Day

0.830000

14

15

16.719999

16.870001

16.340000

16.790001

16.790001

174100

All Day

0.530001

18

19

16.549999

16.629999

16.000000

16.459999

16.459999

324900

All Day

0.629999

In [113]:

df[df['Day'].between(10,15)]  # Use of between function to pass 2 values

 

Out[113]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

9

10

16.020000

16.600000

15.770000

16.410000

16.410000

121700

All Day

0.830000

10

11

16.639999

16.389999

16.100000

16.150000

16.150000

32300

All Day

0.289999

11

12

16.580000

16.480000

16.059999

16.360001

16.360001

143000

All Day

0.420001

12

13

16.590000

16.340000

16.110001

16.219999

16.219999

83900

All Day

0.229999

13

14

16.490000

16.500000

16.049999

16.500000

16.500000

147700

All Day

0.450001

14

15

16.719999

16.870001

16.340000

16.790001

16.790001

174100

All Day

0.530001

 

In [115]:

df['Week'].unique()              # it will return all available unique values

Out[115]:

array(['All Day'], dtype=object)

 

In [116]:

df['Week'].nunique()          # it will show the number of unique values

Out[116]:

1

 

In [124]:

df.sort_values(by='Day', ascending = False).head()    # sort_values sorts ascending by default; ascending=False gives descending order

Out[124]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

19

20

16.709999

16.580000

15.980000

16.040001

16.040001

141600

All Day

0.600000

18

19

16.549999

16.629999

16.000000

16.459999

16.459999

324900

All Day

0.629999

17

18

17.549999

17.389999

16.450001

16.510000

16.510000

300000

All Day

0.939998

16

17

17.350000

17.480000

16.790001

17.190001

17.190001

236800

All Day

0.689999

15

16

17.040001

17.049999

16.480000

17.040001

17.040001

177100

All Day

0.569999

 

In [123]:

df.sort_values(by=['Volume','Day'], ascending = [False,True]).head()

# Descending (False) for Volume and Ascending (True) for Day

Out[123]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

18

19

16.549999

16.629999

16.000000

16.459999

16.459999

324900

All Day

0.629999

17

18

17.549999

17.389999

16.450001

16.510000

16.510000

300000

All Day

0.939998

0

1

17.020000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

16

17

17.350000

17.480000

16.790001

17.190001

17.190001

236800

All Day

0.689999

1

2

17.350000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

 

In [125]:

df.sort_index(ascending=False)   # sorting by index value

Out[125]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

4

5

17.219999

16.990000

16.200001

16.290001

16.290001

216600

All Day

0.789999

3

4

17.150000

17.330000

16.900000

16.920000

16.920000

139700

All Day

0.430000

2

3

17.309999

17.230000

16.870001

17.049999

17.049999

146300

All Day

0.359999

1

2

17.350000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

0

1

17.020000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

 

 

In [130]:

data=pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/drinks.csv')

data.head()

Out[135]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

0

Afghanistan

0

0

0

0.0

AS

1

Albania

89

132

54

4.9

EU

2

Algeria

25

0

14

0.7

AF

3

Andorra

245

138

312

12.4

EU

4

Angola

217

57

45

5.9

AF

 

In [143]:

data.loc[3]   # Index number 3 (Row# 4) data

Out[143]:

country                         Andorra

beer_servings                       245

spirit_servings                     138

wine_servings                       312

total_litres_of_pure_alcohol       12.4

continent                            EU

Name: 3, dtype: object

 

In [149]:

data.loc[[4,1]]   # more than one row of data

Out[149]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

4

Angola

217

57

45

5.9

AF

1

Albania

89

132

54

4.9

EU

 

In [151]:

data.loc[[0,4,1]] [['country','continent']]

Out[151]:

country

continent

0

Afghanistan

AS

4

Angola

AF

1

Albania

EU

 

In [154]:

data.loc[0:2]

Out[154]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

0

Afghanistan

0

0

0

0.0

AS

1

Albania

89

132

54

4.9

EU

2

Algeria

25

0

14

0.7

AF

 

 

In [158]:

data.loc[5:8] [['country','continent']]  # 5th 6th 7th 8th rows  LABEL based INDEXING

Out[158]:

country

continent

5

Antigua & Barbuda

NaN

6

Argentina

SA

7

Armenia

EU

8

Australia

OC

 

In [157]:

data.iloc[6]

Out[157]:

country                         Argentina

beer_servings                         193

spirit_servings                        25

wine_servings                         221

total_litres_of_pure_alcohol          8.3

continent                              SA

Name: 6, dtype: object

 

In [159]:

data.iloc[5:8] [['country','continent']]    # 5th 6th 7th rows  note that there is no 8th Row. POSITIONAL based  INDEXING

Out[159]:

country

continent

5

Antigua & Barbuda

NaN

6

Argentina

SA

7

Armenia

EU

 

loc gets rows (or columns) with particular labels from the index.

iloc gets rows (or columns) at particular positions in the index (so it only takes integers).

ix usually tries to behave like loc but falls back to behaving like iloc if a label is not present in the index.

 

In [160]:

s = pd.Series(np.nan, index=[49,48,47,46,45, 1, 2, 3, 4, 5])

s

Out[160]:

49   NaN

48   NaN

47   NaN

46   NaN

45   NaN

1    NaN

2    NaN

3    NaN

4    NaN

5    NaN

dtype: float64

In [161]:

s.loc[:3]   # LABEL based. So, slice up to and including label 3

Out[161]:

49   NaN

48   NaN

47   NaN

46   NaN

45   NaN

1    NaN

2    NaN

3    NaN

dtype: float64

 

In [162]:

s.iloc[:3]    # POSITION based. So, 1st 3 rows

Out[162]:

49   NaN

48   NaN

47   NaN

dtype: float64

 

In [163]:

s.ix[:3] # the integer is in the index so s.ix[:3] works like loc

C:\Users\kuldip_s\Downloads\Anaconda\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:

.ix is deprecated. Please use

.loc for label based indexing or

.iloc for positional indexing

 

See the documentation here:

http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated

  """Entry point for launching an IPython kernel.

 

Out[163]:

49   NaN

48   NaN

47   NaN

46   NaN

45   NaN

1    NaN

2    NaN

3    NaN

dtype: float64

In [165]:

data=pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/drinks.csv')

data

Out[165]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

0

Afghanistan

0

0

0

0.0

AS

1

Albania

89

132

54

4.9

EU

2

Algeria

25

0

14

0.7

AF

3

Andorra

245

138

312

12.4

EU

4

Angola

217

57

45

5.9

AF

...

...

...

...

...

...

...

188

Venezuela

333

100

3

7.7

SA

189

Vietnam

111

2

1

2.0

AS

190

Yemen

6

0

0

0.1

AS

191

Zambia

32

19

4

2.5

AF

192

Zimbabwe

64

18

4

4.7

AF

193 rows × 6 columns

 

In [171]:

# Changing of by default Index column - 2 ways : index_col & set_index

data=pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/drinks.csv', index_col='country')

data

Out[171]:

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

country

Afghanistan

0

0

0

0.0

AS

Albania

89

132

54

4.9

EU

Algeria

25

0

14

0.7

AF

Andorra

245

138

312

12.4

EU

Angola

217

57

45

5.9

AF

...

...

...

...

...

...

Venezuela

333

100

3

7.7

SA

Vietnam

111

2

1

2.0

AS

Yemen

6

0

0

0.1

AS

Zambia

32

19

4

2.5

AF

Zimbabwe

64

18

4

4.7

AF

193 rows × 5 columns

 

In [174]:

data=pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/drinks.csv')

data.set_index('continent')

Out[174]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

AS

Afghanistan

0

0

0

0.0

EU

Albania

89

132

54

4.9

AF

Algeria

25

0

14

0.7

EU

Andorra

245

138

312

12.4

AF

Angola

217

57

45

5.9

...

...

...

...

...

...

SA

Venezuela

333

100

3

7.7

AS

Vietnam

111

2

1

2.0

AS

Yemen

6

0

0

0.1

AF

Zambia

32

19

4

2.5

AF

Zimbabwe

64

18

4

4.7

193 rows × 5 columns

 

In [177]:

data.set_index(['continent','country'])

Out[177]:

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

country

AS

Afghanistan

0

0

0

0.0

EU

Albania

89

132

54

4.9

AF

Algeria

25

0

14

0.7

EU

Andorra

245

138

312

12.4

AF

Angola

217

57

45

5.9

...

...

...

...

...

...

SA

Venezuela

333

100

3

7.7

AS

Vietnam

111

2

1

2.0

Yemen

6

0

0

0.1

AF

Zambia

32

19

4

2.5

Zimbabwe

64

18

4

4.7

193 rows × 4 columns

 

In [178]:

data

Out[178]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

0

Afghanistan

0

0

0

0.0

AS

1

Albania

89

132

54

4.9

EU

2

Algeria

25

0

14

0.7

AF

In [198]:

def geograhy(info):
    """Return the total drink servings (beer + spirit + wine) for one row.

    Intended to be passed to ``DataFrame.apply(..., axis='columns')``, which
    calls it once per row with that row as a Series.

    Parameters
    ----------
    info : mapping (pandas.Series or dict-like)
        One row of the drinks data; must contain the labels
        'beer_servings', 'spirit_servings' and 'wine_servings'.

    Returns
    -------
    The sum of the three servings values for this row.
    """
    # Look the values up by column label rather than by position
    # (info[1], info[2], info[3] in the original): label lookups keep
    # working if the columns are ever reordered, and avoid the deprecated
    # positional fallback when indexing a labeled Series with an integer.
    beer_servings = info['beer_servings']        # to capture beer_servings information
    spirit_servings = info['spirit_servings']    # to capture spirit_servings information
    wine_servings = info['wine_servings']
    total_servings = beer_servings + spirit_servings + wine_servings
    return total_servings

 

In [199]:

data['New_Column'] = data.apply(func=geograhy, axis='columns')  # New column added to DF

data

Out[199]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

New_Column

0

Afghanistan

0

0

0

0.0

AS

0

1

Albania

89

132

54

4.9

EU

275

2

Algeria

25

0

14

0.7

AF

39

193 rows × 7 columns

 

In [200]:

len(data)  # total number of records

Out[200]:

193

 

In [203]:

data.sample(n =5)  # randomly select 5 sample records

Out[203]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

New_Column

86

Jordan

6

21

1

0.5

AS

28

15

Belarus

142

373

42

14.4

EU

557

54

El Salvador

52

69

2

2.2

NaN

123

64

Georgia

52

100

149

5.4

EU

301

160

Spain

284

157

112

10.0

EU

553

 

In [206]:

data.sample(frac = 2/100)  # Extract 2 % of sample data randomly

Out[206]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

New_Column

39

Congo

76

1

9

1.7

AF

86

89

Kiribati

21

34

1

1.0

OC

56

46

North Korea

0

0

0

0.0

AS

0

104

Mali

5

1

1

0.6

AF

7

 

In [207]:

data.info()   # column data types and Total number of  record in Columns

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 193 entries, 0 to 192

Data columns (total 7 columns):

country                         193 non-null object

beer_servings                   193 non-null int64

spirit_servings                 193 non-null int64

wine_servings                   193 non-null int64

total_litres_of_pure_alcohol    193 non-null float64

continent                       170 non-null object

New_Column                      193 non-null int64

dtypes: float64(1), int64(4), object(2)

memory usage: 10.7+ KB

 

In [208]:

data.describe()  # Statistical information for numerical column

Out[208]:

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

New_Column

count

193.000000

193.000000

193.000000

193.000000

193.000000

mean

106.160622

80.994819

49.450777

4.717098

236.606218

std

101.143103

88.284312

79.697598

3.773298

209.307863

min

0.000000

0.000000

0.000000

0.000000

0.000000

25%

20.000000

4.000000

1.000000

1.300000

45.000000

50%

76.000000

56.000000

8.000000

4.200000

173.000000

75%

188.000000

128.000000

59.000000

7.200000

392.000000

max

376.000000

438.000000

370.000000

14.400000

695.000000

 

In [209]:

data['continent'].value_counts()  # Count the number of countries in each continent

Out[209]:

AF    53

EU    45

AS    44

OC    16

SA    12

Name: continent, dtype: int64

 

In [217]:

max_countries = data['continent'].value_counts().max() 

# Get the largest number of countries found in any single continent (a count, not the continent name)

max_countries

Out[217]:

53

In [222]:

data['continent'].value_counts().sort_values(ascending=False).head(1)

Out[222]:

AF    53

Name: continent, dtype: int64

 

In [231]:

top3_continent = list(data['continent'].value_counts().sort_values(ascending=False).head(3).index)

top3_continent

Out[231]:

['AF', 'EU', 'AS']

 

In [230]:

data[data['continent'].isin(top3_continent)]   # filter rows belonging to the top 3 continents (iloc takes integer positions, not labels, so data.iloc[top3_continent] would raise)

 


No comments: