# Pandas provides three data structures for working with data:
# Series    - one-dimensional labelled data
# DataFrame - two-dimensional (tabular) data
# Panel     - three-dimensional data (deprecated and removed in modern pandas; a MultiIndex DataFrame is used instead)
import pandas as pd
# 1 dimensional array
s = pd.Series([10,20,30,40,50,60],dtype=float)
print(s)
print(s[1:3])
0    10.0
1    20.0
2    30.0
3    40.0
4    50.0
5    60.0
dtype: float64
1    20.0
2    30.0
dtype: float64
data = [10,20,30,40,40,50,60]
s1 = pd.Series(data)
print (s1)
0    10
1    20
2    30
3    40
4    40
5    50
6    60
dtype: int64
print(s1.index)
print(s1.values)
RangeIndex(start=0, stop=7, step=1)
[10 20 30 40 40 50 60]
s2 = pd.Series((data), index=range(15,22))
print(s2)
15    10
16    20
17    30
18    40
19    40
20    50
21    60
dtype: int64
print(s2[15:18])
Series([], dtype: int64)
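# Why is the slice empty? With an integer index, plain [] slicing with integers
# is positional in this version of pandas, so s2[15:18] looks for positions 15-17,
# which do not exist in a 7-element Series. A minimal sketch (assuming the same s2
# as above) of the label-based and position-based alternatives:
print(s2.loc[15:18])    # label-based slice, inclusive of both endpoints -> labels 15..18
print(s2.iloc[0:3])     # position-based slice, exclusive of the stop -> first three rows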
s2 = pd.Series((data), index=['a','b','c','d','e','f','g'])
print(s2)
print('--------------------------')
print(s2['a':'d'])
a    10
b    20
c    30
d    40
e    40
f    50
g    60
dtype: int64
--------------------------
a    10
b    20
c    30
d    40
dtype: int64
print(type(s2))
<class 'pandas.core.series.Series'>
# Dataframes
data = [10,20,30,40,50],[60,70,80,90,100]
p1 = pd.DataFrame(data)
print(p1)
    0   1   2   3    4
0  10  20  30  40   50
1  60  70  80  90  100
data = [10,20,30,40,50],[60,70,80,90,100],['a','b','c','d','e'],[1.0,2.0,3.0,4.0,5.0]
p1 = pd.DataFrame(data)
print(p1)
    0   1   2   3    4
0  10  20  30  40   50
1  60  70  80  90  100
2   a   b   c   d    e
3   1   2   3   4    5
data = [10,20,30,40,50],['a','b','c','d','e']
p2 = pd.DataFrame(data)
print(p2)
    0   1   2   3   4
0  10  20  30  40  50
1   a   b   c   d   e
data = [10,20,30,40,50],['a','c','d']
p2 = pd.DataFrame(data)
print(p2)
    0   1   2     3     4
0  10  20  30  40.0  50.0
1   a   c   d   NaN   NaN
# A nested list (a 2-D structure) converts into a 2-D pandas DataFrame
data = [[1,2,3],[4,5]]
print(data)
print(type(data))
print("-------------------------------------------------------")
d1 = pd.DataFrame(data)
print(d1)
print(type(d1))
[[1, 2, 3], [4, 5]]
<class 'list'>
-------------------------------------------------------
   0  1    2
0  1  2  3.0
1  4  5  NaN
<class 'pandas.core.frame.DataFrame'>
# Dictionary to 2D DataFrame
data1 = {"A":[1,2,3,4,5],
"B":[4,5,6,7,8],
"C":[14,15,16,17,18],
"D":[24,25,26,27,28]}
print(data1)
print(type(data1))
print("-------------------------------------------------------")
d1 = pd.DataFrame(data1)
print(d1)
print(type(d1))
{'A': [1, 2, 3, 4, 5], 'B': [4, 5, 6, 7, 8], 'C': [14, 15, 16, 17, 18], 'D': [24, 25, 26, 27, 28]}
<class 'dict'>
-------------------------------------------------------
   A  B   C   D
0  1  4  14  24
1  2  5  15  25
2  3  6  16  26
3  4  7  17  27
4  5  8  18  28
<class 'pandas.core.frame.DataFrame'>
data3 = [10,20,30,40,40,50,60]
s3 = pd.Series((data3), index=['a','b','c','d','e','f','g'])
print(s3)
a    10
b    20
c    30
d    40
e    40
f    50
g    60
dtype: int64
data4 = pd.Series(([6,12]),index=['a','b'])
d4 = pd.DataFrame([data4])
print(d4)
   a   b
0  6  12
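# A small sketch (names are illustrative, not from the cell above): a dict of
# Series with the same labels can also be turned into a DataFrame; pandas aligns
# the Series on their index.
population = pd.Series([6, 12], index=['a', 'b'])
area = pd.Series([100, 200], index=['a', 'b'])
d5 = pd.DataFrame({'population': population, 'area': area})
print(d5)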
import pandas as pd
# cities is a dictionary whose keys are city_name, Population and state,
# and whose values are lists
# city_frame is a DataFrame built from that dictionary
cities = {
    "city_name": ["Bangalore","Chennai","Mumbai","BBSR","Hyd'bad","Kolkatta"],
    "Population": [200000,300000,600000,500000,350000,550000],
    "state": ["KA","TN","MH","OD","TS","WB"]
}
city_frame = pd.DataFrame(cities)
print(city_frame)
   city_name  Population state
0  Bangalore      200000    KA
1    Chennai      300000    TN
2     Mumbai      600000    MH
3       BBSR      500000    OD
4    Hyd'bad      350000    TS
5   Kolkatta      550000    WB
# Custom Indexing
order = ["First","Second","Third","Fourth","Fifth","Sixth"]
city_frame = pd.DataFrame(cities, index=order)
city_frame
Out[18]:
        city_name  Population state
First   Bangalore      200000    KA
Second    Chennai      300000    TN
Third      Mumbai      600000    MH
Fourth       BBSR      500000    OD
Fifth     Hyd'bad      350000    TS
Sixth    Kolkatta      550000    WB
# Rearranging the columns
city_frame = pd.DataFrame(city_frame,
columns=["city_name","state","Population"]
)
city_frame
Out[19]:
        city_name state  Population
First   Bangalore    KA      200000
Second    Chennai    TN      300000
Third      Mumbai    MH      600000
Fourth       BBSR    OD      500000
Fifth     Hyd'bad    TS      350000
Sixth    Kolkatta    WB      550000
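# An equivalent sketch (assuming the same city_frame as above): selecting the
# columns with a plain column list also returns them in the requested order.
city_frame[["city_name", "state", "Population"]]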
# Accessing a cell
print(city_frame['city_name']['Fourth'])
print(city_frame['Population']['Fourth'])
print("----------------------Other way--------------------")
print(city_frame.city_name['Fourth'])
BBSR
500000
----------------------Other way--------------------
BBSR
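# A sketch of the label-based accessors (assuming the same city_frame): .at and
# .loc avoid the chained [][] lookups above, which can misbehave when used for
# assignment (SettingWithCopyWarning).
print(city_frame.at['Fourth', 'city_name'])     # single cell -> BBSR
print(city_frame.loc['Fourth', 'Population'])   # single cell -> 500000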
# Updating a single cell value
# set_value is deprecated and will be removed in a future release;
# use the .at[] or .iat[] accessors instead (see the sketch after the output below)
city_frame.set_value('Fourth','city_name','Cuttack')
FutureWarning: set_value is deprecated and will be removed in a future release.
Please use .at[] or .iat[] accessors instead
Out[21]:
        city_name state  Population
First   Bangalore    KA      200000
Second    Chennai    TN      300000
Third      Mumbai    MH      600000
Fourth    Cuttack    OD      500000
Fifth     Hyd'bad    TS      350000
Sixth    Kolkatta    WB      550000
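# A sketch of the .at[] / .iat[] replacement the FutureWarning suggests
# (assuming the same city_frame; .at is label-based, .iat is position-based):
city_frame.at['Fourth', 'city_name'] = 'Cuttack'
city_frame.iat[3, 0] = 'Cuttack'                 # row position 3, column position 0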
# Adding a new column
import numpy as np
city_frame['area'] = np.nan
city_frame
Out[22]:
        city_name state  Population  area
First   Bangalore    KA      200000   NaN
Second    Chennai    TN      300000   NaN
Third      Mumbai    MH      600000   NaN
Fourth    Cuttack    OD      500000   NaN
Fifth     Hyd'bad    TS      350000   NaN
Sixth    Kolkatta    WB      550000   NaN
ar = [19191,203030,304040,101000,20022002,404004000]
city_frame['area'] = ar
city_frame
Out[23]:
        city_name state  Population       area
First   Bangalore    KA      200000      19191
Second    Chennai    TN      300000     203030
Third      Mumbai    MH      600000     304040
Fourth    Cuttack    OD      500000     101000
Fifth     Hyd'bad    TS      350000   20022002
Sixth    Kolkatta    WB      550000  404004000
# Slicing using .loc & .iloc
city_frame.loc['Second':'Fifth','city_name':'Population']
Out[24]:
        city_name state  Population
Second    Chennai    TN      300000
Third      Mumbai    MH      600000
Fourth    Cuttack    OD      500000
Fifth     Hyd'bad    TS      350000
city_frame.loc['Second':'Fifth':2,'city_name':'Population']
Out[25]:
        city_name state  Population
Second    Chennai    TN      300000
Fourth    Cuttack    OD      500000
# Accessing specific columns and rows
# .loc accesses a group of rows and columns by label(s) or a boolean array
city_frame.loc[['Second','Fifth','Sixth'],['city_name','Population','area']]
Out[26]:
       city_name  Population       area
Second   Chennai      300000     203030
Fifth    Hyd'bad      350000   20022002
Sixth   Kolkatta      550000  404004000
city_frame.loc['Second':'Fifth',['city_name','Population','area']]
Out[27]:
       city_name  Population      area
Second   Chennai      300000    203030
Third     Mumbai      600000    304040
Fourth   Cuttack      500000    101000
Fifth    Hyd'bad      350000  20022002
# Use of .iloc (position-based selection)
city_frame.iloc[2:5,[1,3]]
Out[28]:
       state      area
Third     MH    304040
Fourth    OD    101000
Fifth     TS  20022002
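# The same selection written with .loc (a sketch, assuming the row labels and
# column order shown above): .iloc stops are exclusive, .loc label slices are inclusive.
city_frame.loc['Third':'Fifth', ['state', 'area']]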
# Aggregating a column with sum()
city_frame['area'].sum()
Out[29]: 424653263
city_frame
Out[30]:
        city_name state  Population       area
First   Bangalore    KA      200000      19191
Second    Chennai    TN      300000     203030
Third      Mumbai    MH      600000     304040
Fourth    Cuttack    OD      500000     101000
Fifth     Hyd'bad    TS      350000   20022002
Sixth    Kolkatta    WB      550000  404004000
# Adding a row to the DataFrame
city_frame2 = pd.DataFrame([["Delhi","DL",700000,2992833992]],
                           columns=['city_name','state','Population','area'],
                           index=['Seventh'])
city_frame = city_frame.append(city_frame2)   # deprecated in newer pandas; see the pd.concat sketch after the output
print(city_frame)
         city_name state  Population        area
First    Bangalore    KA      200000       19191
Second     Chennai    TN      300000      203030
Third       Mumbai    MH      600000      304040
Fourth     Cuttack    OD      500000      101000
Fifth      Hyd'bad    TS      350000    20022002
Sixth     Kolkatta    WB      550000   404004000
Seventh      Delhi    DL      700000  2992833992
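# On pandas 2.0 and later DataFrame.append no longer exists; a sketch of the
# equivalent that would replace (not follow) the .append call above:
city_frame = pd.concat([city_frame, city_frame2])
print(city_frame)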
# Dropping rows
city_frame.drop('Seventh',inplace=True)
print(city_frame)
        city_name state  Population       area
First   Bangalore    KA      200000      19191
Second    Chennai    TN      300000     203030
Third      Mumbai    MH      600000     304040
Fourth    Cuttack    OD      500000     101000
Fifth     Hyd'bad    TS      350000   20022002
Sixth    Kolkatta    WB      550000  404004000
import pandas as pd
df1 = pd.DataFrame({'x': [1, 2, 2, 3], 'y': [10, 20, 30, 40]})
print("-------------------------Original df1------------------------------")
print(df1)
df2 = pd.DataFrame({'x': [4, 5], 'y': [50, 60]})
df1 = df1.append(df2)   # deprecated in newer pandas; see the pd.concat sketch after the output
print("-------------------------After addition of rows to df1------------------------------")
print(df1)
-------------------------Original df1------------------------------
   x   y
0  1  10
1  2  20
2  2  30
3  3  40
-------------------------After addition of rows to df1------------------------------
   x   y
0  1  10
1  2  20
2  2  30
3  3  40
0  4  50
1  5  60
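# A sketch (frame names here are illustrative): passing ignore_index=True to
# pd.concat rebuilds a clean 0..n-1 index instead of keeping the duplicated
# labels 0 and 1 seen above.
a = pd.DataFrame({'x': [1, 2, 2, 3], 'y': [10, 20, 30, 40]})
b = pd.DataFrame({'x': [4, 5], 'y': [50, 60]})
print(pd.concat([a, b], ignore_index=True))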
city_frame
Out[34]:
        city_name state  Population       area
First   Bangalore    KA      200000      19191
Second    Chennai    TN      300000     203030
Third      Mumbai    MH      600000     304040
Fourth    Cuttack    OD      500000     101000
Fifth     Hyd'bad    TS      350000   20022002
Sixth    Kolkatta    WB      550000  404004000
cf1 = city_frame.copy()
cf1
Out[35]:
        city_name state  Population       area
First   Bangalore    KA      200000      19191
Second    Chennai    TN      300000     203030
Third      Mumbai    MH      600000     304040
Fourth    Cuttack    OD      500000     101000
Fifth     Hyd'bad    TS      350000   20022002
Sixth    Kolkatta    WB      550000  404004000
cf1.head()
Out[36]:
        city_name state  Population      area
First   Bangalore    KA      200000     19191
Second    Chennai    TN      300000    203030
Third      Mumbai    MH      600000    304040
Fourth    Cuttack    OD      500000    101000
Fifth     Hyd'bad    TS      350000  20022002
cf1.tail()
Out[37]:
       city_name state  Population       area
Second   Chennai    TN      300000     203030
Third     Mumbai    MH      600000     304040
Fourth   Cuttack    OD      500000     101000
Fifth    Hyd'bad    TS      350000   20022002
Sixth   Kolkatta    WB      550000  404004000
# Boolean filtering: rows where Population > 500000
cf1[cf1['Population'] > 500000]
Out[38]:
      city_name state  Population       area
Third    Mumbai    MH      600000     304040
Sixth  Kolkatta    WB      550000  404004000
cf1[(cf1['Population'] > 500000) & (cf1['area'] > 400000)]
Out[39]:
      city_name state  Population       area
Sixth  Kolkatta    WB      550000  404004000
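# The same filter written with DataFrame.query (a sketch, assuming the same cf1):
cf1.query("Population > 500000 and area > 400000")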
# pandas.DataFrame.copy(): this function makes a copy of the object's indices and data.
# When deep=True (the default), a new object is created with a copy of the calling
# object's data and indices; modifications to the data or indices of the copy are not
# reflected in the original object.
# When deep=False, a new object is created without copying the calling object's data
# or index (only references to the data and index are copied); any changes to the data
# of the original will be reflected in the shallow copy (and vice versa).
import numpy as np
import pandas as pd
print("-------------------------df1------------------------------")
df1 = pd.DataFrame({'A': [5, 2], 'B': [4, 8]})
print(df1)
print("--------------------------df2-----------------------------")
df2 = df1.copy()
print(df2)
print("-------------------------df3------------------------------")
df3 = df1.copy(deep=True)
print(df3)
print("--------------------------df4-----------------------------")
df4 = df1.copy(deep=False)
print(df4)
print("--------------------------Updated-----------------------------")
df4['C'] = np.nan
c =[3,6]
df4['C']=c
print(df4)
-------------------------df1------------------------------
A B
0 5 4
1 2 8
--------------------------df2-----------------------------
A B
0 5 4
1 2 8
-------------------------df3------------------------------
A B
0 5 4
1 2 8
--------------------------df4-----------------------------
A B
0 5 4
1 2 8
--------------------------Updated-----------------------------
   A  B  C
0  5  4  3
1  2  8  6
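# A sketch of the deep vs shallow difference described above (illustrative values;
# note that with Copy-on-Write enabled in newer pandas a shallow copy no longer
# shares writes like this):
src = pd.DataFrame({'A': [5, 2], 'B': [4, 8]})
shallow = src.copy(deep=False)
deep = src.copy()                 # deep=True is the default
shallow.loc[0, 'A'] = 99          # data is shared, so src sees the change
print(src.loc[0, 'A'])            # 99
print(deep.loc[0, 'A'])           # 5 - the deep copy is independent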
# Deleting a column
df4.pop('C')
print("--------------------------Updated-----------------------------")
print(df4)
--------------------------Updated-----------------------------
A B
0 5 4
1 2 8
df4['C'] = np.nan
c =[3,6]
df4['C']=c
print(df4)
# sort_values returns a new, sorted DataFrame; the result is not kept here, so
# df4 itself stays unchanged and both prints below show the original order
# (see the sketch after the output for keeping the sorted result)
df4.sort_values("B",ascending=True)
print(df4)
df4.sort_values("B",ascending=False)
print(df4)
   A  B  C
0  5  4  3
1  2  8  6
   A  B  C
0  5  4  3
1  2  8  6
   A  B  C
0  5  4  3
1  2  8  6
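# A sketch of keeping the sorted result by reassigning it
# (passing inplace=True would also work):
df4 = df4.sort_values("B", ascending=False)
print(df4)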
df1 = pd.DataFrame({'A': [1,2,3],'B':[3,4,5]})
df2 = pd.DataFrame({'A':[2,3,4,5], 'C':[3,4,7,9]})
print ('------------df1----------')
print(df1)
print ('------------df2----------')
print(df2)
------------df1----------
A B
0 1 3
1 2 4
2 3 5
------------df2----------
A C
0 2 3
1 3 4
2 4 7
3 5 9
# Stacking rows with concat; the default join='outer' keeps all columns (like a FULL OUTER JOIN in Oracle)
frames = [df1,df2]
result = pd.concat(frames) # default axis = 0
print(result)
   A    B    C
0  1  3.0  NaN
1  2  4.0  NaN
2  3  5.0  NaN
0  2  NaN  3.0
1  3  NaN  4.0
2  4  NaN  7.0
3  5  NaN  9.0
FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
result1 = pd.concat(frames,axis=1)
print(result1)
     A    B  A  C
0  1.0  3.0  2  3
1  2.0  4.0  3  4
2  3.0  5.0  4  7
3  NaN  NaN  5  9
# Same as above: join='outer' is the default
result2 = pd.concat(frames,axis=1,join='outer')
print(result2)
     A    B  A  C
0  1.0  3.0  2  3
1  2.0  4.0  3  4
2  3.0  5.0  4  7
3  NaN  NaN  5  9
# join='inner' keeps only the common index labels (like an inner / equi join in Oracle)
result3 = pd.concat(frames,axis=1,join='inner')
print(result3)
   A  B  A  C
0  1  3  2  3
1  2  4  3  4
2  3  5  4  7
# With axis=0 and join='inner', only the column common to df1 & df2 is kept
result4 = pd.concat(frames,axis=0,join='inner')
print(result4)
   A
0  1
1  2
2  3
0  2
1  3
2  4
3  5
print ('------------df1----------')
print(df1)
print ('------------df2----------')
print(df2)
------------df1----------
A B
0 1 3
1 2 4
2 3 5
------------df2----------
A C
0 2 3
1 3 4
2 4 7
3 5 9
# Merge: SQL-style joins on the common column 'A'
pd.merge(df1,df2,how="left")
Out[51]:
   A  B    C
0  1  3  NaN
1  2  4  3.0
2  3  5  4.0
pd.merge(df1,df2,how="right")
Out[52]:
   A    B  C
0  2  4.0  3
1  3  5.0  4
2  4  NaN  7
3  5  NaN  9
pd.merge(df1,df2,how="inner")
Out[53]:
   A  B  C
0  2  4  3
1  3  5  4
pd.merge(df1,df2,how="outer")
Out[54]:
   A    B    C
0  1  3.0  NaN
1  2  4.0  3.0
2  3  5.0  4.0
3  4  NaN  7.0
4  5  NaN  9.0
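# A sketch making the join key explicit instead of relying on the common-column
# default; indicator=True adds a _merge column showing which frame each row came from.
pd.merge(df1, df2, on='A', how='outer', indicator=True)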