Data and Database: Python : Pandas

#Pandas work with these 3 types data structures to work with data.

#series – It’s For single dimensional data

#dataframe – it’s for 2 dimensional data

#panel – It’s for Multi dimensional data

import pandas as pd

# 1 dimensional array

s = pd.Series([10,20,30,40,50,60],dtype=float)

print(s)

print(s[1:3])

0 10.0

1 20.0

2 30.0

3 40.0

4 50.0

5 60.0

dtype: float64

1 20.0

2 30.0

dtype: float64

data = [10,20,30,40,40,50,60]

s1 = pd.Series(data)

print (s1)

0 10

1 20

2 30

3 40

4 40

5 50

6 60

dtype: int64

print(s1.index)

print(s1.values)

RangeIndex(start=0, stop=7, step=1)

[10 20 30 40 40 50 60]

s2 = pd.Series((data), index=range(15,22))

print(s2)

15 10

16 20

17 30

18 40

19 40

20 50

21 60

dtype: int64

print(s2[15:18])

Series([], dtype: int64)

s2 = pd.Series((data), index=['a','b','c','d','e','f','g'])

print(s2)

print('--------------------------')

print(s2['a':'d'])

a 10

b 20

c 30

d 40

e 40

f 50

g 60

dtype: int64

--------------------------

a 10

b 20

c 30

d 40

dtype: int64

print(type(s2))

# Dataframes

data = [10,20,30,40,50],[60,70,80,90,100]

p1 = pd.DataFrame(data)

print(p1)

0 1 2 3 4

0 10 20 30 40 50

1 60 70 80 90 100

data = [10,20,30,40,50],[60,70,80,90,100],['a','b','c','d','e'],[1.0,2.0,3.0,4.0,5.0]

p1 = pd.DataFrame(data)

print(p1)

0 1 2 3 4

0 10 20 30 40 50

1 60 70 80 90 100

2 a b c d e

3 1 2 3 4 5

data = [10,20,30,40,50],['a','b','c','d','e']

p2 = pd.DataFrame(data)

print(p2)

0 1 2 3 4

0 10 20 30 40 50

1 a b c d e

data = [10,20,30,40,50],['a','c','d']

p2 = pd.DataFrame(data)

print(p2)

0 1 2 3 4

0 10 20 30 40.0 50.0

1 a c d NaN NaN

# 2D arrays converts into 2D Panda Dataframe

data = [[1,2,3],[4,5]]

print(data)

print(type(data))

print("-------------------------------------------------------")

d1 = pd.DataFrame(data)

print(d1)

print(type(d1))

[[1, 2, 3], [4, 5]]

-------------------------------------------------------

0 1 2

0 1 2 3.0

1 4 5 NaN

# Dictionary to 2D DataFrame

data1 = {"A":[1,2,3,4,5],

"B":[4,5,6,7,8],

"C":[14,15,16,17,18],

"D":[24,25,26,27,28]}

print(data1)

print(type(data1))

print("-------------------------------------------------------")

d1 = pd.DataFrame(data1)

print(d1)

print(type(d1))

{'A': [1, 2, 3, 4, 5], 'B': [4, 5, 6, 7, 8], 'C': [14, 15, 16, 17, 18], 'D': [24, 25, 26, 27, 28]}

-------------------------------------------------------

A B C D

0 1 4 14 24

1 2 5 15 25

2 3 6 16 26

3 4 7 17 27

4 5 8 18 28

data3 = [10,20,30,40,40,50,60]

s3 = pd.Series((data3), index=['a','b','c','d','e','f','g'])

print(s3)

a 10

b 20

c 30

d 40

e 40

f 50

g 60

dtype: int64

data4 = pd.Series(([6,12]),index=['a','b'])

d4 = pd.DataFrame([data4])

print(d4)

a b

0 6 12

import pandas as pd

# cities is a dictionary with key as city_name, population, state and values as a list

# city_frame is a DataFrame

cities = {

"city_name" : ["Bangalore","Chennai","Mumbai","BBSR","Hyd'bad","Kolkatta"],

"Population" : [200000,300000,600000,500000,350000,550000],

"state" : ["KA","TN","MH","OD","TS",'WB']

}

city_frame = pd.DataFrame(cities)

print(city_frame)

city_name Population state

0 Bangalore 200000 KA

1 Chennai 300000 TN

2 Mumbai 600000 MH

3 BBSR 500000 OD

4 Hyd'bad 350000 TS

5 Kolkatta 550000 WB

# Custom Indexing

order = ["First","Second","Third","Fourth","Fifth","Sixth"]

city_frame = pd.DataFrame(cities, index=order)

city_frame

Out[18]:

	city_name	Population	state
First	Bangalore	200000	KA
Second	Chennai	300000	TN
Third	Mumbai	600000	MH
Fourth	BBSR	500000	OD
Fifth	Hyd'bad	350000	TS
Sixth	Kolkatta	550000	WB

# Rearranging the columns

city_frame = pd.DataFrame(city_frame,

columns=["city_name","state","Population"]

)

city_frame

Out[19]:

	city_name	state	Population
First	Bangalore	KA	200000
Second	Chennai	TN	300000
Third	Mumbai	MH	600000
Fourth	BBSR	OD	500000
Fifth	Hyd'bad	TS	350000
Sixth	Kolkatta	WB	550000

# Accessing a cell

print(city_frame ['city_name']['Fourth'])

print(city_frame ['Population']['Fourth'])

print("----------------------Other way--------------------")

print(city_frame.city_name['Fourth'])

BBSR

500000

----------------------Other way--------------------

BBSR

# Update a column's value

# set_value

# set_value is deprecated and will be removed in a future release. Please use .at[] or .iat[] accessors instead

city_frame.set_value('Fourth','city_name','Cuttack')

C:\Users\kuldip_s\Downloads\Anaconda\lib\site-packages\ipykernel_launcher.py:4: FutureWarning: set_value is deprecated and will be removed in a future release. Please use .at[] or .iat[] accessors instead

after removing the cwd from sys.path.

Out[21]:

	city_name	state	Population
First	Bangalore	KA	200000
Second	Chennai	TN	300000
Third	Mumbai	MH	600000
Fourth	Cuttack	OD	500000
Fifth	Hyd'bad	TS	350000
Sixth	Kolkatta	WB	550000

# Adding a new column

import numpy as np

city_frame['area'] = np.nan

city_frame

Out[22]:

	city_name	state	Population	area
First	Bangalore	KA	200000	NaN
Second	Chennai	TN	300000	NaN
Third	Mumbai	MH	600000	NaN
Fourth	Cuttack	OD	500000	NaN
Fifth	Hyd'bad	TS	350000	NaN
Sixth	Kolkatta	WB	550000	NaN

ar = [19191,203030,304040,101000,20022002,404004000]

city_frame['area'] = ar

city_frame

Out[23]:

	city_name	state	Population	area
First	Bangalore	KA	200000	19191
Second	Chennai	TN	300000	203030
Third	Mumbai	MH	600000	304040
Fourth	Cuttack	OD	500000	101000
Fifth	Hyd'bad	TS	350000	20022002
Sixth	Kolkatta	WB	550000	404004000

# Slicing using .loc & .iloc

city_frame.loc['Second':'Fifth','city_name':'Population']

Out[24]:

	city_name	state	Population
Second	Chennai	TN	300000
Third	Mumbai	MH	600000
Fourth	Cuttack	OD	500000
Fifth	Hyd'bad	TS	350000

city_frame.loc['Second':'Fifth':2,'city_name':'Population']

Out[25]:

	city_name	state	Population
Second	Chennai	TN	300000
Fourth	Cuttack	OD	500000

# Accessing specific columns and rows

# This function used access a group of rows and columns by label(s) or a boolean array.

city_frame.loc[['Second','Fifth','Sixth'],['city_name','Population','area']]

Out[26]:

	city_name	Population	area
Second	Chennai	300000	203030
Fifth	Hyd'bad	350000	20022002
Sixth	Kolkatta	550000	404004000

city_frame.loc['Second':'Fifth',['city_name','Population','area']]

Out[27]:

	city_name	Population	area
Second	Chennai	300000	203030
Third	Mumbai	600000	304040
Fourth	Cuttack	500000	101000
Fifth	Hyd'bad	350000	20022002

# use of iloc

city_frame.iloc[2:5,[1,3]]

Out[28]:

	state	area
Third	MH	304040
Fourth	OD	101000
Fifth	TS	20022002

# Addition - sum()

city_frame['area'].sum()

Out[29]:

424653263

city_frame

Out[30]:

	city_name	state	Population	area
First	Bangalore	KA	200000	19191
Second	Chennai	TN	300000	203030
Third	Mumbai	MH	600000	304040
Fourth	Cuttack	OD	500000	101000
Fifth	Hyd'bad	TS	350000	20022002
Sixth	Kolkatta	WB	550000	404004000

# Adding a row into DataFrame

city_frame2 = pd.DataFrame([["Delhi","DL",700000,2992833992]],

columns = ['city_name','state','Population','area'],

index=['Seventh'])

city_frame = city_frame.append(city_frame2)

print(city_frame)

city_name state Population area

First Bangalore KA 200000 19191

Second Chennai TN 300000 203030

Third Mumbai MH 600000 304040

Fourth Cuttack OD 500000 101000

Fifth Hyd'bad TS 350000 20022002

Sixth Kolkatta WB 550000 404004000

Seventh Delhi DL 700000 2992833992

# Dropping rows

city_frame.drop('Seventh',inplace=True)

print(city_frame)

city_name state Population area

First Bangalore KA 200000 19191

Second Chennai TN 300000 203030

Third Mumbai MH 600000 304040

Fourth Cuttack OD 500000 101000

Fifth Hyd'bad TS 350000 20022002

Sixth Kolkatta WB 550000 404004000

import pandas as pd

df1 = pd.DataFrame({'x': [1, 2, 2,3], 'y': [10,20,30,40]})

print("-------------------------Original df1------------------------------")

print(df1)

df2 = pd.DataFrame({'x': [4,5], 'y': [50,60]})

df1=df1.append(df2)

print("-------------------------After addition of rows to df1------------------------------")

print(df1)

-------------------------Original df1------------------------------

x y

0 1 10

1 2 20

2 2 30

3 3 40

-------------------------After addition of rows to df1------------------------------

x y

0 1 10

1 2 20

2 2 30

3 3 40

0 4 50

1 5 60

city_frame

Out[34]:

	city_name	state	Population	area
First	Bangalore	KA	200000	19191
Second	Chennai	TN	300000	203030
Third	Mumbai	MH	600000	304040
Fourth	Cuttack	OD	500000	101000
Fifth	Hyd'bad	TS	350000	20022002
Sixth	Kolkatta	WB	550000	404004000

cf1 = city_frame.copy()

cf1

Out[35]:

	city_name	state	Population	area
First	Bangalore	KA	200000	19191
Second	Chennai	TN	300000	203030
Third	Mumbai	MH	600000	304040
Fourth	Cuttack	OD	500000	101000
Fifth	Hyd'bad	TS	350000	20022002
Sixth	Kolkatta	WB	550000	404004000

cf1.head()

Out[36]:

	city_name	state	Population	area
First	Bangalore	KA	200000	19191
Second	Chennai	TN	300000	203030
Third	Mumbai	MH	600000	304040
Fourth	Cuttack	OD	500000	101000
Fifth	Hyd'bad	TS	350000	20022002

cf1.tail()

Out[37]:

	city_name	state	Population	area
Second	Chennai	TN	300000	203030
Third	Mumbai	MH	600000	304040
Fourth	Cuttack	OD	500000	101000
Fifth	Hyd'bad	TS	350000	20022002
Sixth	Kolkatta	WB	550000	404004000

cf1[cf1['Population'] > 500000]

Out[38]:

	city_name	state	Population	area
Third	Mumbai	MH	600000	304040
Sixth	Kolkatta	WB	550000	404004000

cf1[(cf1['Population'] > 500000) & (cf1['area'] > 400000) ]

Out[39]:

	city_name	state	Population	area
Sixth	Kolkatta	WB	550000	404004000

# pandas.DataFrame.copy() function: This function make a copy of this object’s indices and data.

#When deep=True (default), a new object will be created with a copy of the calling object’s data and indices.

#Modifications to the data or indices of the copy will not be reflected in the original object.

#When deep=False, a new object will be created without copying the calling object’s data or index (only references to the data and index are copied).

#Any changes to the data of the original will be reflected in the shallow copy (and vice versa).

import numpy as np

import pandas as pd

print("-------------------------df1------------------------------")

df1 = pd.DataFrame({'A': [5, 2], 'B': [4, 8]})

print(df1)

print("--------------------------df2-----------------------------")

df2 = df1.copy()

print(df2)

print("-------------------------df3------------------------------")

df3 = df1.copy(deep=True)

print(df3)

print("--------------------------df4-----------------------------")

df4 = df1.copy(deep=False)

print(df4)

print("--------------------------Updated-----------------------------")

df4['C'] = np.nan

c =[3,6]

df4['C']=c

print(df4)

-------------------------df1------------------------------

A B

0 5 4

1 2 8

--------------------------df2-----------------------------

A B

0 5 4

1 2 8

-------------------------df3------------------------------

A B

0 5 4

1 2 8

--------------------------df4-----------------------------

A B

0 5 4

1 2 8

--------------------------Updated-----------------------------

A B C

0 5 4 3

1 2 8 6

# Deleting a cloumn

df4.pop('C')

print("--------------------------Updated-----------------------------")

print(df4)

--------------------------Updated-----------------------------

A B

0 5 4

1 2 8

df4['C'] = np.nan

c =[3,6]

df4['C']=c

print(df4)

df4.sort_values("B",ascending=True)

print(df4)

df4.sort_values("B",ascending=False)

print(df4)

A B C

0 5 4 3

1 2 8 6

A B C

0 5 4 3

1 2 8 6

A B C

0 5 4 3

1 2 8 6

df1 = pd.DataFrame({'A': [1,2,3],'B':[3,4,5]})

df2 = pd.DataFrame({'A':[2,3,4,5], 'C':[3,4,7,9]})

print ('------------df1----------')

print(df1)

print ('------------df2----------')

print(df2)

------------df1----------

A B

0 1 3

1 2 4

2 3 5

------------df2----------

A C

0 2 3

1 3 4

2 4 7

3 5 9

# Full Outer join in Oracle

frames = [df1,df2]

result = pd.concat(frames) # default axis = 0

print(result)

A B C

0 1 3.0 NaN

1 2 4.0 NaN

2 3 5.0 NaN

0 2 NaN 3.0

1 3 NaN 4.0

2 4 NaN 7.0

3 5 NaN 9.0

C:\Users\kuldip_s\Downloads\Anaconda\lib\site-packages\ipykernel_launcher.py:4: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

after removing the cwd from sys.path.

result1 = pd.concat(frames,axis=1)

print(result1)

A B A C

0 1.0 3.0 2 3

1 2.0 4.0 3 4

2 3.0 5.0 4 7

3 NaN NaN 5 9

#same as above

result2 = pd.concat(frames,axis=1,join='outer')

print(result2)

A B A C

0 1.0 3.0 2 3

1 2.0 4.0 3 4

2 3.0 5.0 4 7

3 NaN NaN 5 9

# Inner / Equi join in Oracle

result3 = pd.concat(frames,axis=1,join='inner')

print(result3)

A B A C

0 1 3 2 3

1 2 4 3 4

2 3 5 4 7

# Common column between datasets df1 & df2

result4 = pd.concat(frames,axis=0,join='inner')

print(result4)

0 1

1 2

2 3

0 2

1 3

2 4

3 5

print ('------------df1----------')

print(df1)

print ('------------df2----------')

print(df2)

------------df1----------

A B

0 1 3

1 2 4

2 3 5

------------df2----------

A C

0 2 3

1 3 4

2 4 7

3 5 9

# Merge

pd.merge(df1,df2,how="left")

Out[51]:

	A	B	C
0	1	3	NaN
1	2	4	3.0
2	3	5	4.0

pd.merge(df1,df2,how="right")

Out[52]:

	A	B	C
0	2	4.0	3
1	3	5.0	4
2	4	NaN	7
3	5	NaN	9

pd.merge(df1,df2,how="inner")

Out[53]:

	A	B	C
0	2	4	3
1	3	5	4

pd.merge(df1,df2,how="outer")

Out[54]:

	A	B	C
0	1	3.0	NaN
1	2	4.0	3.0
2	3	5.0	4.0
3	4	NaN	7.0
4	5	NaN	9.0

Data and Database

Thursday, 11 June 2020

Python : Pandas - 1

No comments:

Total Pageviews

Labels