Monday, 28 September 2020

Python Pandas : Dataframe - i

 DataFrame

 

import numpy as np

import pandas as pd

 

In [59]:

data = np.random.randn(5,3)  # Generate random data in matrix/tabular form 5 rows and 3 columns

 

In [60]:

data

Out[60]:

array([[-0.62162763,  0.9921252 , -0.3884227 ],

       [ 0.25709088, -0.1365675 ,  0.77113954],

       [-0.96153538, -1.26281479, -1.58631299],

       [-2.38202279, -2.19703702,  0.42556743],

       [-0.42338357, -0.6131884 , -0.66370211]])

 

In [61]:

type(data)

Out[61]:

numpy.ndarray

 

In [62]:

pdata = pd.DataFrame(data)  # Creating a DataFrame out of numpy array using  DataFrame function

 

In [63]:

type(pdata)

Out[63]:

pandas.core.frame.DataFrame

 

In [64]:

pdata

Out[64]:

0

1

2

0

-0.621628

0.992125

-0.388423

1

0.257091

-0.136568

0.771140

2

-0.961535

-1.262815

-1.586313

3

-2.382023

-2.197037

0.425567

4

-0.423384

-0.613188

-0.663702

 

In [65]:

pd.DataFrame(data, columns = ['A','B','C'])

Out[65]:

A

B

C

0

-0.621628

0.992125

-0.388423

1

0.257091

-0.136568

0.771140

2

-0.961535

-1.262815

-1.586313

3

-2.382023

-2.197037

0.425567

4

-0.423384

-0.613188

-0.663702

 

In [66]:

pd.DataFrame(data, columns = ['A','B','C'], index = ['a','b','c','d','e'])

Out[66]:

A

B

C

a

-0.621628

0.992125

-0.388423

b

0.257091

-0.136568

0.771140

c

-0.961535

-1.262815

-1.586313

d

-2.382023

-2.197037

0.425567

e

-0.423384

-0.613188

-0.663702

 

In [67]:

df = pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/Stock.csv')

# Creating a DataFrame from a CSV file using the read_csv function

df

Out[67]:

Day

Open

High

Low

Close

Adj Close

Volume

0

1

16.770000

17.559999

16.730000

17.010000

16.934999

271700

1

2

17.100000

17.469999

16.950001

17.200001

17.200001

229800

2

3

17.059999

17.230000

16.870001

17.049999

17.049999

146300

3

4

16.900000

17.330000

16.900000

16.920000

16.920000

139700

4

5

16.969999

16.990000

16.200001

16.290001

16.290001

216600

5

6

16.240000

16.530001

16.110001

16.480000

16.480000

115700

In [68]:

df.head(2)  # BUT by default 5 records

Out[68]:

Day

Open

High

Low

Close

Adj Close

Volume

0

1

16.77

17.559999

16.730000

17.010000

16.934999

271700

1

2

17.10

17.469999

16.950001

17.200001

17.200001

229800

 

In [69]:

df.tail(2)  # by default 5 records

Out[69]:

Day

Open

High

Low

Close

Adj Close

Volume

18

19

16.299999

16.629999

16.00

16.459999

16.459999

324900

19

20

16.459999

16.580000

15.98

16.040001

16.040001

141600

 

In [70]:

df[2:5]  # To show data for index number 2,3,4

Out[70]:

Day

Open

High

Low

Close

Adj Close

Volume

2

3

17.059999

17.23

16.870001

17.049999

17.049999

146300

3

4

16.900000

17.33

16.900000

16.920000

16.920000

139700

4

5

16.969999

16.99

16.200001

16.290001

16.290001

216600

 

In [71]:

df.index

Out[71]:

RangeIndex(start=0, stop=20, step=1)

 

In [72]:

df.shape                                                   # To show number of rows and columns

Out[72]:

(20, 7)

 

In [73]:

df['Week'] = 'All Day'  # Adding a new column 'Week' at the end and assign a value to all records

df.head(5)

Out[73]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

0

1

16.770000

17.559999

16.730000

17.010000

16.934999

271700

All Day

1

2

17.100000

17.469999

16.950001

17.200001

17.200001

229800

All Day

2

3

17.059999

17.230000

16.870001

17.049999

17.049999

146300

All Day

3

4

16.900000

17.330000

16.900000

16.920000

16.920000

139700

All Day

4

5

16.969999

16.990000

16.200001

16.290001

16.290001

216600

All Day

 

 

In [74]:

df['Difference'] = df['High'] - df['Low']  # Another new column

df.head(5)

 

Out[74]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

0

1

16.770000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

1

2

17.100000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

2

3

17.059999

17.230000

16.870001

17.049999

17.049999

146300

All Day

0.359999

3

4

16.900000

17.330000

16.900000

16.920000

16.920000

139700

All Day

0.430000

4

5

16.969999

16.990000

16.200001

16.290001

16.290001

216600

All Day

0.789999

 

In [75]:

df.shape

Out[75]:

(20, 9)

 

In [76]:

df.High                   # Selecting one column named “High”

Out[76]:

0     17.559999

1     17.469999

2     17.230000

3     17.330000

4     16.990000

5     16.530001

6     16.709999

7     16.629999

8     16.320000

Name: High, dtype: float64

In [77]:

df.Adj Close                 # Selecting one column – error because space is there in between

  File "<ipython-input-77-70fafd7a9d32>", line 1

    df.Adj Close                 # Selecting one column -- error

               ^

SyntaxError: invalid syntax

 

 

In [78]:

df['Adj Close']                  # Selecting one column

Out[78]:

0     16.934999

1     17.200001

2     17.049999

3     16.920000

4     16.290001

5     16.480000

6     16.610001

Name: Adj Close, dtype: float64

 

In [79]:

df[['Day','High','Low']]            # Selecting more than one column

Out[79]:

Day

High

Low

0

1

17.559999

16.730000

1

2

17.469999

16.950001

2

3

17.230000

16.870001

In [80]:

df[['Day','High','Low']].head(5)         # Selecting more than one column with head function

Out[80]:

Day

High

Low

0

1

17.559999

16.730000

1

2

17.469999

16.950001

2

3

17.230000

16.870001

3

4

17.330000

16.900000

4

5

16.990000

16.200001

 

In [81]:

select = ['Day','High','Low']  # create  a list and pass it into the df

selecteddf = df[select].head(3)

selecteddf

 

Out[81]:

Day

High

Low

0

1

17.559999

16.730000

1

2

17.469999

16.950001

2

3

17.230000

16.870001

 

In [82]:

type(selecteddf)

Out[82]:

pandas.core.frame.DataFrame

 

In [83]:

df['Open'] = df['Open'] + 0.25  # sum applied to all values in Open column

df.head(5)

 

Out[83]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

0

1

17.020000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

1

2

17.350000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

2

3

17.309999

17.230000

16.870001

17.049999

17.049999

146300

All Day

0.359999

3

4

17.150000

17.330000

16.900000

16.920000

16.920000

139700

All Day

0.430000

4

5

17.219999

16.990000

16.200001

16.290001

16.290001

216600

All Day

0.789999

 

Drop missing rows or columns

 

In [84]:

df1 = pd.DataFrame([[np.nan,1,2,np.nan],[3,4,np.nan,np.nan],[5,np.nan,6,7],

                    [8,9,np.nan,10],[11,12,13,10],[np.nan,np.nan,np.nan,np.nan]],

                   columns=list('ABCD'),index=list('abcdef'))

df1

Out[84]:

A

B

C

D

a

NaN

1.0

2.0

NaN

b

3.0

4.0

NaN

NaN

c

5.0

NaN

6.0

7.0

d

8.0

9.0

NaN

10.0

e

11.0

12.0

13.0

10.0

f

NaN

NaN

NaN

NaN

In [85]:

df1.dropna(axis=0, how='any')

# axis = 0 means operation on Row level

# how = 'any' means It will drop rows wherever there is atleast a NaN in the row record

Out[85]:

A

B

C

D

e

11.0

12.0

13.0

10.0

 

In [86]:

df1.dropna(axis=0,how='all')

# axis = 0 means operation on Row level

# how = 'all' means It will drop rows wherever all record values are NaN in the row

Out[86]:

A

B

C

D

a

NaN

1.0

2.0

NaN

b

3.0

4.0

NaN

NaN

c

5.0

NaN

6.0

7.0

d

8.0

9.0

NaN

10.0

e

11.0

12.0

13.0

10.0

 

In [87]:

df1 = pd.DataFrame([[np.nan,1,2,np.nan],[3,4,np.nan,np.nan],[5,np.nan,6,7],

                    [8,9,np.nan,10],[11,12,13,10],[np.nan,np.nan,np.nan,np.nan]],

                   columns=list('ABCD'))

df1

Out[87]:

A

B

C

D

0

NaN

1.0

2.0

NaN

1

3.0

4.0

NaN

NaN

2

5.0

NaN

6.0

7.0

3

8.0

9.0

NaN

10.0

4

11.0

12.0

13.0

10.0

5

NaN

NaN

NaN

NaN

 

In [88]:

df1.dropna(axis='index',how='any') # axis = index (same as axis = 0)

Out[88]:

A

B

C

D

4

11.0

12.0

13.0

10.0

 

In [89]:

df1.dropna(axis='columns',how='any') # axis = columns (same as axis = 1)

Out[89]:

0

1

2

3

4

5

In [90]:

df1.drop( axis = 0, labels=[2])  # Drop index no 2 i.e. 3rd row

Out[90]:

A

B

C

D

0

NaN

1.0

2.0

NaN

1

3.0

4.0

NaN

NaN

3

8.0

9.0

NaN

10.0

4

11.0

12.0

13.0

10.0

5

NaN

NaN

NaN

NaN

 

In [91]:

df1.drop( axis = 0, labels=[2,4])

Out[91]:

A

B

C

D

0

NaN

1.0

2.0

NaN

1

3.0

4.0

NaN

NaN

3

8.0

9.0

NaN

10.0

5

NaN

NaN

NaN

NaN

 

In [92]:

df1.drop( axis = 1, labels=['C'])  # Drop column C

Out[92]:

A

B

D

0

NaN

1.0

NaN

1

3.0

4.0

NaN

2

5.0

NaN

7.0

3

8.0

9.0

10.0

4

11.0

12.0

10.0

5

NaN

NaN

NaN

In [93]:

df1.drop( axis = 1, labels=['A','C'])  # Drop column A & C

Out[93]:

B

D

0

1.0

NaN

1

4.0

NaN

2

NaN

7.0

3

9.0

10.0

4

12.0

10.0

5

NaN

NaN

 

In [94]:

df2 = pd.DataFrame([[np.nan,1,2,np.nan,23,np.nan],[3,4,np.nan,np.nan,24,np.nan],[5,np.nan,6,7,25,np.nan],

                    [8,9,np.nan,10,25,np.nan],[11,12,13,10,26,np.nan],[np.nan,np.nan,np.nan,np.nan,27,np.nan]],

                   columns=list('ABCDEF'),index=list('abcdef'))

df2

Out[94]:

A

B

C

D

E

F

a

NaN

1.0

2.0

NaN

23

NaN

b

3.0

4.0

NaN

NaN

24

NaN

c

5.0

NaN

6.0

7.0

25

NaN

d

8.0

9.0

NaN

10.0

25

NaN

e

11.0

12.0

13.0

10.0

26

NaN

f

NaN

NaN

NaN

NaN

27

NaN

In [95]:

df2.dropna(axis=1,how='any')

# axis = 1 means operation on Column level

# how = 'any' means It will drop columns wherever there is atleast a NaN

 

Out[95]:

E

a

23

b

24

c

25

d

25

e

26

f

27

In [96]:

df2.dropna(axis=1,how='all')

# axis = 1 means operation on Column level

# how = 'all' means It will drop columns wherever all columns values are NaN

Out[96]:

A

B

C

D

E

a

NaN

1.0

2.0

NaN

23

b

3.0

4.0

NaN

NaN

24

c

5.0

NaN

6.0

7.0

25

d

8.0

9.0

NaN

10.0

25

e

11.0

12.0

13.0

10.0

26

f

NaN

NaN

NaN

NaN

27


Filter data

In [97]:

df.head()

Out[97]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

0

1

17.020000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

1

2

17.350000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

2

3

17.309999

17.230000

16.870001

17.049999

17.049999

146300

All Day

0.359999

3

4

17.150000

17.330000

16.900000

16.920000

16.920000

139700

All Day

0.430000

4

5

17.219999

16.990000

16.200001

16.290001

16.290001

216600

All Day

0.789999

 

In [98]:

df['Day'] == 10 

Out[98]:

0     False

1     False

2     False

3     False

4     False

5     False

6     False

7     False

8     False

9      True

10    False

11    False

12    False

13    False

14    False

15    False

16    False

17    False

18    False

19    False

Name: Day, dtype: bool

 

In [99]:

con=df['Day'] == 10          # Give a name to the condition and pass the variable name in df to filter

df[con]

Out[99]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

9

10

16.02

16.6

15.77

16.41

16.41

121700

All Day

0.83

 

In [100]:

df[df['Day'] > 15]          # Without giving the condition name

Out[100]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

15

16

17.040001

17.049999

16.480000

17.040001

17.040001

177100

All Day

0.569999

16

17

17.350000

17.480000

16.790001

17.190001

17.190001

236800

All Day

0.689999

17

18

17.549999

17.389999

16.450001

16.510000

16.510000

300000

All Day

0.939998

18

19

16.549999

16.629999

16.000000

16.459999

16.459999

324900

All Day

0.629999

19

20

16.709999

16.580000

15.980000

16.040001

16.040001

141600

All Day

0.600000

 

In [101]:

con1= df['Day'] >15                     # Multiple Conditions

con2 = df['Close'] > 16.5

df[con1 & con2]                      # AND logical operator

Out[102]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

15

16

17.040001

17.049999

16.480000

17.040001

17.040001

177100

All Day

0.569999

16

17

17.350000

17.480000

16.790001

17.190001

17.190001

236800

All Day

0.689999

17

18

17.549999

17.389999

16.450001

16.510000

16.510000

300000

All Day

0.939998

 

In [103]:

df[con1 | con2]            # OR operator

Out[103]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

0

1

17.020000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

1

2

17.350000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

2

3

17.309999

17.230000

16.870001

17.049999

17.049999

146300

All Day

0.359999

3

4

17.150000

17.330000

16.900000

16.920000

16.920000

139700

All Day

0.430000

6

7

16.639999

16.709999

16.320000

16.610001

16.610001

93000

All Day

0.389999

14

15

16.719999

16.870001

16.340000

16.790001

16.790001

174100

All Day

0.530001

15

16

17.040001

17.049999

16.480000

17.040001

17.040001

177100

All Day

0.569999

16

17

17.350000

17.480000

16.790001

17.190001

17.190001

236800

All Day

0.689999

17

18

17.549999

17.389999

16.450001

16.510000

16.510000

300000

All Day

0.939998

18

19

16.549999

16.629999

16.000000

16.459999

16.459999

324900

All Day

0.629999

19

20

16.709999

16.580000

15.980000

16.040001

16.040001

141600

All Day

0.600000

In [104]:

con3 = df['Volume'] > 200000

 

In [105]:

df[(con1 | con2) & con3]            # AND OR operators together

Out[105]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

0

1

17.020000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

1

2

17.350000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

16

17

17.350000

17.480000

16.790001

17.190001

17.190001

236800

All Day

0.689999

17

18

17.549999

17.389999

16.450001

16.510000

16.510000

300000

All Day

0.939998

18

19

16.549999

16.629999

16.000000

16.459999

16.459999

324900

All Day

0.629999

 

In [111]:

df[df['Day'].isin([1,2,3,10,15,19])]  # Use of isin function to pass multiple conditions at one time

Out[111]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

0

1

17.020000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

1

2

17.350000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

2

3

17.309999

17.230000

16.870001

17.049999

17.049999

146300

All Day

0.359999

9

10

16.020000

16.600000

15.770000

16.410000

16.410000

121700

All Day

0.830000

14

15

16.719999

16.870001

16.340000

16.790001

16.790001

174100

All Day

0.530001

18

19

16.549999

16.629999

16.000000

16.459999

16.459999

324900

All Day

0.629999

In [113]:

df[df['Day'].between(10,15)]  # Use of between function to pass 2 values

 

Out[113]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

9

10

16.020000

16.600000

15.770000

16.410000

16.410000

121700

All Day

0.830000

10

11

16.639999

16.389999

16.100000

16.150000

16.150000

32300

All Day

0.289999

11

12

16.580000

16.480000

16.059999

16.360001

16.360001

143000

All Day

0.420001

12

13

16.590000

16.340000

16.110001

16.219999

16.219999

83900

All Day

0.229999

13

14

16.490000

16.500000

16.049999

16.500000

16.500000

147700

All Day

0.450001

14

15

16.719999

16.870001

16.340000

16.790001

16.790001

174100

All Day

0.530001

 

In [115]:

df['Week'].unique()              # it will return all available unique values

Out[115]:

array(['All Day'], dtype=object)

 

In [116]:

df['Week'].nunique()          # it will show the number of unique values

Out[116]:

1

 

In [124]:

df.sort_values(by='Day', ascending = False).head()    # sort_values sorts ascending by default; ascending=False gives descending order

Out[124]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

19

20

16.709999

16.580000

15.980000

16.040001

16.040001

141600

All Day

0.600000

18

19

16.549999

16.629999

16.000000

16.459999

16.459999

324900

All Day

0.629999

17

18

17.549999

17.389999

16.450001

16.510000

16.510000

300000

All Day

0.939998

16

17

17.350000

17.480000

16.790001

17.190001

17.190001

236800

All Day

0.689999

15

16

17.040001

17.049999

16.480000

17.040001

17.040001

177100

All Day

0.569999

 

In [123]:

df.sort_values(by=['Volume','Day'], ascending = [False,True]).head()

# Descending (False) for Volume and Ascending (True) for Day

Out[123]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

18

19

16.549999

16.629999

16.000000

16.459999

16.459999

324900

All Day

0.629999

17

18

17.549999

17.389999

16.450001

16.510000

16.510000

300000

All Day

0.939998

0

1

17.020000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

16

17

17.350000

17.480000

16.790001

17.190001

17.190001

236800

All Day

0.689999

1

2

17.350000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

 

In [125]:

df.sort_index(ascending=False)   # sorting by index value

Out[125]:

Day

Open

High

Low

Close

Adj Close

Volume

Week

Difference

4

5

17.219999

16.990000

16.200001

16.290001

16.290001

216600

All Day

0.789999

3

4

17.150000

17.330000

16.900000

16.920000

16.920000

139700

All Day

0.430000

2

3

17.309999

17.230000

16.870001

17.049999

17.049999

146300

All Day

0.359999

1

2

17.350000

17.469999

16.950001

17.200001

17.200001

229800

All Day

0.519998

0

1

17.020000

17.559999

16.730000

17.010000

16.934999

271700

All Day

0.829999

 

 

In [130]:

data=pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/drinks.csv')

data.head()

Out[135]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

0

Afghanistan

0

0

0

0.0

AS

1

Albania

89

132

54

4.9

EU

2

Algeria

25

0

14

0.7

AF

3

Andorra

245

138

312

12.4

EU

4

Angola

217

57

45

5.9

AF

 

In [143]:

data.loc[3]   # Index number 3 (Row# 4) data

Out[143]:

country                         Andorra

beer_servings                       245

spirit_servings                     138

wine_servings                       312

total_litres_of_pure_alcohol       12.4

continent                            EU

Name: 3, dtype: object

 

In [149]:

data.loc[[4,1]]   # more than one row of data

Out[149]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

4

Angola

217

57

45

5.9

AF

1

Albania

89

132

54

4.9

EU

 

In [151]:

data.loc[[0,4,1]] [['country','continent']]

Out[151]:

country

continent

0

Afghanistan

AS

4

Angola

AF

1

Albania

EU

 

In [154]:

data.loc[0:2]

Out[154]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

0

Afghanistan

0

0

0

0.0

AS

1

Albania

89

132

54

4.9

EU

2

Algeria

25

0

14

0.7

AF

 

 

In [158]:

data.loc[5:8] [['country','continent']]  # 5th 6th 7th 8th rows  LABEL based INDEXING

Out[158]:

country

continent

5

Antigua & Barbuda

NaN

6

Argentina

SA

7

Armenia

EU

8

Australia

OC

 

In [157]:

data.iloc[6]

Out[157]:

country                         Argentina

beer_servings                         193

spirit_servings                        25

wine_servings                         221

total_litres_of_pure_alcohol          8.3

continent                              SA

Name: 6, dtype: object

 

In [159]:

data.iloc[5:8] [['country','continent']]    # 5th 6th 7th rows  note that there is no 8th Row. POSITIONAL based  INDEXING

Out[159]:

country

continent

5

Antigua & Barbuda

NaN

6

Argentina

SA

7

Armenia

EU

 

loc gets rows (or columns) with particular labels from the index.

iloc gets rows (or columns) at particular positions in the index (so it only takes integers).

ix usually tries to behave like loc but falls back to behaving like iloc if a label is not present in the index.

 

In [160]:

s = pd.Series(np.nan, index=[49,48,47,46,45, 1, 2, 3, 4, 5])

s

Out[160]:

49   NaN

48   NaN

47   NaN

46   NaN

45   NaN

1    NaN

2    NaN

3    NaN

4    NaN

5    NaN

dtype: float64

In [161]:

s.loc[:3]   # LABEL based. So, slice up to and including label 3

Out[161]:

49   NaN

48   NaN

47   NaN

46   NaN

45   NaN

1    NaN

2    NaN

3    NaN

dtype: float64

 

In [162]:

s.iloc[:3]    # POSITION based. So, 1st 3 rows

Out[162]:

49   NaN

48   NaN

47   NaN

dtype: float64

 

In [163]:

s.ix[:3] # the integer is in the index so s.ix[:3] works like loc

C:\Users\kuldip_s\Downloads\Anaconda\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:

.ix is deprecated. Please use

.loc for label based indexing or

.iloc for positional indexing

 

See the documentation here:

http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated

  """Entry point for launching an IPython kernel.

 

Out[163]:

49   NaN

48   NaN

47   NaN

46   NaN

45   NaN

1    NaN

2    NaN

3    NaN

dtype: float64

In [165]:

data=pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/drinks.csv')

data

Out[165]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

0

Afghanistan

0

0

0

0.0

AS

1

Albania

89

132

54

4.9

EU

2

Algeria

25

0

14

0.7

AF

3

Andorra

245

138

312

12.4

EU

4

Angola

217

57

45

5.9

AF

...

...

...

...

...

...

...

188

Venezuela

333

100

3

7.7

SA

189

Vietnam

111

2

1

2.0

AS

190

Yemen

6

0

0

0.1

AS

191

Zambia

32

19

4

2.5

AF

192

Zimbabwe

64

18

4

4.7

AF

193 rows × 6 columns

 

In [171]:

# Changing of by default Index column - 2 ways : index_col & set_index

data=pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/drinks.csv', index_col='country')

data

Out[171]:

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

country

Afghanistan

0

0

0

0.0

AS

Albania

89

132

54

4.9

EU

Algeria

25

0

14

0.7

AF

Andorra

245

138

312

12.4

EU

Angola

217

57

45

5.9

AF

...

...

...

...

...

...

Venezuela

333

100

3

7.7

SA

Vietnam

111

2

1

2.0

AS

Yemen

6

0

0

0.1

AS

Zambia

32

19

4

2.5

AF

Zimbabwe

64

18

4

4.7

AF

193 rows × 5 columns

 

In [174]:

data=pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/drinks.csv')

data.set_index('continent')

Out[174]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

AS

Afghanistan

0

0

0

0.0

EU

Albania

89

132

54

4.9

AF

Algeria

25

0

14

0.7

EU

Andorra

245

138

312

12.4

AF

Angola

217

57

45

5.9

...

...

...

...

...

...

SA

Venezuela

333

100

3

7.7

AS

Vietnam

111

2

1

2.0

AS

Yemen

6

0

0

0.1

AF

Zambia

32

19

4

2.5

AF

Zimbabwe

64

18

4

4.7

193 rows × 5 columns

 

In [177]:

data.set_index(['continent','country'])

Out[177]:

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

country

AS

Afghanistan

0

0

0

0.0

EU

Albania

89

132

54

4.9

AF

Algeria

25

0

14

0.7

EU

Andorra

245

138

312

12.4

AF

Angola

217

57

45

5.9

...

...

...

...

...

...

SA

Venezuela

333

100

3

7.7

AS

Vietnam

111

2

1

2.0

Yemen

6

0

0

0.1

AF

Zambia

32

19

4

2.5

Zimbabwe

64

18

4

4.7

193 rows × 4 columns

 

In [178]:

data

Out[178]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

0

Afghanistan

0

0

0

0.0

AS

1

Albania

89

132

54

4.9

EU

2

Algeria

25

0

14

0.7

AF

In [198]:

def geograhy(info):
    """Return the total drink servings (beer + spirit + wine) for one row.

    Intended to be passed to ``DataFrame.apply(..., axis='columns')``, which
    calls it once per row with that row as a Series.

    Parameters
    ----------
    info : mapping (pandas.Series or dict-like)
        One row of the drinks data; must contain the labels
        'beer_servings', 'spirit_servings' and 'wine_servings'.

    Returns
    -------
    The sum of the three servings values for this row.
    """
    # Look the values up by column label rather than by position
    # (info[1], info[2], info[3] in the original): label lookups keep
    # working if the columns are ever reordered, and avoid the deprecated
    # positional fallback when indexing a labeled Series with an integer.
    beer_servings = info['beer_servings']        # to capture beer_servings information
    spirit_servings = info['spirit_servings']    # to capture spirit_servings information
    wine_servings = info['wine_servings']
    total_servings = beer_servings + spirit_servings + wine_servings
    return total_servings

 

In [199]:

data['New_Column'] = data.apply(func=geograhy, axis='columns')  # New column added to DF

data

Out[199]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

New_Column

0

Afghanistan

0

0

0

0.0

AS

0

1

Albania

89

132

54

4.9

EU

275

2

Algeria

25

0

14

0.7

AF

39

193 rows × 7 columns

 

In [200]:

len(data)  # total number of records

Out[200]:

193

 

In [203]:

data.sample(n =5)  # randomly select 5 sample records

Out[203]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

New_Column

86

Jordan

6

21

1

0.5

AS

28

15

Belarus

142

373

42

14.4

EU

557

54

El Salvador

52

69

2

2.2

NaN

123

64

Georgia

52

100

149

5.4

EU

301

160

Spain

284

157

112

10.0

EU

553

 

In [206]:

data.sample(frac = 2/100)  # Extract 2 % of sample data randomly

Out[206]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

New_Column

39

Congo

76

1

9

1.7

AF

86

89

Kiribati

21

34

1

1.0

OC

56

46

North Korea

0

0

0

0.0

AS

0

104

Mali

5

1

1

0.6

AF

7

 

In [207]:

data.info()   # column data types and Total number of  record in Columns

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 193 entries, 0 to 192

Data columns (total 7 columns):

country                         193 non-null object

beer_servings                   193 non-null int64

spirit_servings                 193 non-null int64

wine_servings                   193 non-null int64

total_litres_of_pure_alcohol    193 non-null float64

continent                       170 non-null object

New_Column                      193 non-null int64

dtypes: float64(1), int64(4), object(2)

memory usage: 10.7+ KB

 

In [208]:

data.describe()  # Statistical information for numerical column

Out[208]:

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

New_Column

count

193.000000

193.000000

193.000000

193.000000

193.000000

mean

106.160622

80.994819

49.450777

4.717098

236.606218

std

101.143103

88.284312

79.697598

3.773298

209.307863

min

0.000000

0.000000

0.000000

0.000000

0.000000

25%

20.000000

4.000000

1.000000

1.300000

45.000000

50%

76.000000

56.000000

8.000000

4.200000

173.000000

75%

188.000000

128.000000

59.000000

7.200000

392.000000

max

376.000000

438.000000

370.000000

14.400000

695.000000

 

In [209]:

data['continent'].value_counts()  # Count the number of countries in each continent

Out[209]:

AF    53

EU    45

AS    44

OC    16

SA    12

Name: continent, dtype: int64

 

In [217]:

max_countries = data['continent'].value_counts().max() 

# Get the largest number of countries found in any single continent (a count, not the continent name)

max_countries

Out[217]:

53

In [222]:

data['continent'].value_counts().sort_values(ascending=False).head(1)

Out[222]:

AF    53

Name: continent, dtype: int64

 

In [231]:

top3_continent = list(data['continent'].value_counts().sort_values(ascending=False).head(3).index)

top3_continent

Out[231]:

['AF', 'EU', 'AS']

 

In [230]:

data[data['continent'].isin(top3_continent)]   # filter rows belonging to the top 3 continents (iloc takes integer positions, not labels, so data.iloc[top3_continent] would raise)

 


No comments: