Thursday, 11 June 2020

Python : Pandas - 1



#Pandas work with these 3 types data structures to work with data.
#series – It’s For single dimensional data
#dataframe – it’s for 2 dimensional data
#panel – It’s for Multi dimensional data
import pandas as pd
. . .
 # 1 dimensional array
s = pd.Series([10,20,30,40,50,60],dtype=float)
print(s)
print(s[1:3])
0    10.0
1    20.0
2    30.0
3    40.0
4    50.0
5    60.0
dtype: float64
1    20.0
2    30.0
dtype: float64
. . .

data = [10,20,30,40,40,50,60]
s1 = pd.Series(data)
print (s1)
0    10
1    20
2    30
3    40
4    40
5    50
6    60
dtype: int64
. . .

print(s1.index)
print(s1.values)
RangeIndex(start=0, stop=7, step=1)
[10 20 30 40 40 50 60]
. . .

s2 = pd.Series((data), index=range(15,22))
print(s2)
15    10
16    20
17    30
18    40
19    40
20    50
21    60
dtype: int64
. . .

print(s2[15:18])
Series([], dtype: int64)
. . .

s2 = pd.Series((data), index=['a','b','c','d','e','f','g'])
print(s2)
print('--------------------------')
print(s2['a':'d'])
a    10
b    20
c    30
d    40
e    40
f    50
g    60
dtype: int64
--------------------------
a    10
b    20
c    30
d    40
dtype: int64
. . .

print(type(s2))
<class 'pandas.core.series.Series'>
. . .

# Dataframes
data = [10,20,30,40,50],[60,70,80,90,100]
p1 = pd.DataFrame(data)
print(p1)
    0   1   2   3    4
0  10  20  30  40   50
1  60  70  80  90  100
. . .

data = [10,20,30,40,50],[60,70,80,90,100],['a','b','c','d','e'],[1.0,2.0,3.0,4.0,5.0]
p1 = pd.DataFrame(data)
print(p1)
    0   1   2   3    4
0  10  20  30  40   50
1  60  70  80  90  100
2   a   b   c   d    e
3   1   2   3   4    5
. . .
 data = [10,20,30,40,50],['a','b','c','d','e']
p2 = pd.DataFrame(data)
print(p2)
    0   1   2   3   4
0  10  20  30  40  50
1   a   b   c   d   e
. . .
 data = [10,20,30,40,50],['a','c','d']
p2 = pd.DataFrame(data)
print(p2)
    0   1   2     3     4
0  10  20  30  40.0  50.0
1   a   c   d   NaN   NaN
. . .

# 2D arrays converts into 2D Panda Dataframe
data = [[1,2,3],[4,5]]
print(data)
print(type(data))
print("-------------------------------------------------------")
d1 = pd.DataFrame(data)
print(d1)
print(type(d1))
[[1, 2, 3], [4, 5]]
<class 'list'>
-------------------------------------------------------
   0  1    2
0  1  2  3.0
1  4  5  NaN
<class 'pandas.core.frame.DataFrame'>
. . .

# Dictionary to 2D DataFrame
data1 = {"A":[1,2,3,4,5],
         "B":[4,5,6,7,8],
         "C":[14,15,16,17,18],
         "D":[24,25,26,27,28]}
print(data1)
print(type(data1))
print("-------------------------------------------------------")
d1 = pd.DataFrame(data1)
print(d1)
print(type(d1))
{'A': [1, 2, 3, 4, 5], 'B': [4, 5, 6, 7, 8], 'C': [14, 15, 16, 17, 18], 'D': [24, 25, 26, 27, 28]}
<class 'dict'>
-------------------------------------------------------
   A  B   C   D
0  1  4  14  24
1  2  5  15  25
2  3  6  16  26
3  4  7  17  27
4  5  8  18  28
<class 'pandas.core.frame.DataFrame'>
. . .

data3 = [10,20,30,40,40,50,60]
s3 = pd.Series((data3), index=['a','b','c','d','e','f','g'])
print(s3)
a    10
b    20
c    30
d    40
e    40
f    50
g    60
dtype: int64
. . .

data4 = pd.Series(([6,12]),index=['a','b'])
d4 = pd.DataFrame([data4])
print(d4)
   a   b
0  6  12
. . .

import pandas as pd
# cities is a dictionary with key as city_name, population, state and values as a list
# city_frame is a DataFrame
cities = {
    "city_name" : ["Bangalore","Chennai","Mumbai","BBSR","Hyd'bad","Kolkatta"],
    "Population" : [200000,300000,600000,500000,350000,550000],
    "state" : ["KA","TN","MH","OD","TS",'WB']
}
city_frame = pd.DataFrame(cities)
print(city_frame)
   city_name  Population state
0  Bangalore      200000    KA
1    Chennai      300000    TN
2     Mumbai      600000    MH
3       BBSR      500000    OD
4    Hyd'bad      350000    TS
5   Kolkatta      550000    WB
. . .
 # Custom Indexing
order = ["First","Second","Third","Fourth","Fifth","Sixth"]
city_frame = pd.DataFrame(cities, index=order)
city_frame
Out[18]:

city_name
Population
state
First
Bangalore
200000
KA
Second
Chennai
300000
TN
Third
Mumbai
600000
MH
Fourth
BBSR
500000
OD
Fifth
Hyd'bad
350000
TS
Sixth
Kolkatta
550000
WB
. . .

# Rearranging the columns
city_frame = pd.DataFrame(city_frame,
                         columns=["city_name","state","Population"]
                         )
city_frame
Out[19]:
city_name
state
Population
First
Bangalore
KA
200000
Second
Chennai
TN
300000
Third
Mumbai
MH
600000
Fourth
BBSR
OD
500000
Fifth
Hyd'bad
TS
350000
Sixth
Kolkatta
WB
550000
. . .

# Accessing a cell
print(city_frame  ['city_name']['Fourth'])
print(city_frame  ['Population']['Fourth'])
print("----------------------Other way--------------------")
print(city_frame.city_name['Fourth'])
BBSR
500000
----------------------Other way--------------------
BBSR
. . .

# Update a column's value
# set_value
# set_value is deprecated and will be removed in a future release. Please use .at[] or .iat[] accessors instead
city_frame.set_value('Fourth','city_name','Cuttack')
C:\Users\kuldip_s\Downloads\Anaconda\lib\site-packages\ipykernel_launcher.py:4: FutureWarning: set_value is deprecated and will be removed in a future release. Please use .at[] or .iat[] accessors instead
  after removing the cwd from sys.path.

Out[21]:
city_name
state
Population
First
Bangalore
KA
200000
Second
Chennai
TN
300000
Third
Mumbai
MH
600000
Fourth
Cuttack
OD
500000
Fifth
Hyd'bad
TS
350000
Sixth
Kolkatta
WB
550000
. . .

# Adding a new column
import numpy as np
city_frame['area'] = np.nan
city_frame
Out[22]:
city_name
state
Population
area
First
Bangalore
KA
200000
NaN
Second
Chennai
TN
300000
NaN
Third
Mumbai
MH
600000
NaN
Fourth
Cuttack
OD
500000
NaN
Fifth
Hyd'bad
TS
350000
NaN
Sixth
Kolkatta
WB
550000
NaN
. . .

ar = [19191,203030,304040,101000,20022002,404004000]
city_frame['area'] = ar
city_frame


Out[23]:
city_name
state
Population
area
First
Bangalore
KA
200000
19191
Second
Chennai
TN
300000
203030
Third
Mumbai
MH
600000
304040
Fourth
Cuttack
OD
500000
101000
Fifth
Hyd'bad
TS
350000
20022002
Sixth
Kolkatta
WB
550000
404004000
. . .

# Slicing using .loc & .iloc
city_frame.loc['Second':'Fifth','city_name':'Population']
Out[24]:
city_name
state
Population
Second
Chennai
TN
300000
Third
Mumbai
MH
600000
Fourth
Cuttack
OD
500000
Fifth
Hyd'bad
TS
350000
. . .

city_frame.loc['Second':'Fifth':2,'city_name':'Population']
Out[25]:
city_name
state
Population
Second
Chennai
TN
300000
Fourth
Cuttack
OD
500000
. . .

# Accessing specific columns and rows
# This function used access a group of rows and columns by label(s) or a boolean array.
city_frame.loc[['Second','Fifth','Sixth'],['city_name','Population','area']]
Out[26]:
city_name
Population
area
Second
Chennai
300000
203030
Fifth
Hyd'bad
350000
20022002
Sixth
Kolkatta
550000
404004000
. . .

city_frame.loc['Second':'Fifth',['city_name','Population','area']]
Out[27]:
city_name
Population
area
Second
Chennai
300000
203030
Third
Mumbai
600000
304040
Fourth
Cuttack
500000
101000
Fifth
Hyd'bad
350000
20022002
. . .

# use of iloc
city_frame.iloc[2:5,[1,3]]
Out[28]:
state
area
Third
MH
304040
Fourth
OD
101000
Fifth
TS
20022002
. . .

# Addition - sum()
city_frame['area'].sum()
Out[29]:
424653263
. . .

city_frame
Out[30]:
city_name
state
Population
area
First
Bangalore
KA
200000
19191
Second
Chennai
TN
300000
203030
Third
Mumbai
MH
600000
304040
Fourth
Cuttack
OD
500000
101000
Fifth
Hyd'bad
TS
350000
20022002
Sixth
Kolkatta
WB
550000
404004000
. . .

# Adding a row into DataFrame
city_frame2 = pd.DataFrame([["Delhi","DL",700000,2992833992]],
                         columns = ['city_name','state','Population','area'],
                         index=['Seventh'])
city_frame = city_frame.append(city_frame2)
print(city_frame)
         city_name state  Population        area
First    Bangalore    KA      200000       19191
Second     Chennai    TN      300000      203030
Third       Mumbai    MH      600000      304040
Fourth     Cuttack    OD      500000      101000
Fifth      Hyd'bad    TS      350000    20022002
Sixth     Kolkatta    WB      550000   404004000
Seventh      Delhi    DL      700000  2992833992
. . .

# Dropping rows
city_frame.drop('Seventh',inplace=True)
print(city_frame)
        city_name state  Population       area
First   Bangalore    KA      200000      19191
Second    Chennai    TN      300000     203030
Third      Mumbai    MH      600000     304040
Fourth    Cuttack    OD      500000     101000
Fifth     Hyd'bad    TS      350000   20022002
Sixth    Kolkatta    WB      550000  404004000
. . .

import pandas as pd
df1 = pd.DataFrame({'x': [1, 2, 2,3], 'y': [10,20,30,40]})
print("-------------------------Original df1------------------------------")
print(df1)
df2 = pd.DataFrame({'x': [4,5], 'y': [50,60]})
df1=df1.append(df2)
print("-------------------------After addition of rows to df1------------------------------")
print(df1)
-------------------------Original df1------------------------------
   x   y
0  1  10
1  2  20
2  2  30
3  3  40
-------------------------After addition of rows to df1------------------------------
   x   y
0  1  10
1  2  20
2  2  30
3  3  40
0  4  50
1  5  60
. . .

city_frame
Out[34]:
city_name
state
Population
area
First
Bangalore
KA
200000
19191
Second
Chennai
TN
300000
203030
Third
Mumbai
MH
600000
304040
Fourth
Cuttack
OD
500000
101000
Fifth
Hyd'bad
TS
350000
20022002
Sixth
Kolkatta
WB
550000
404004000
. . .

cf1 = city_frame.copy()
cf1
Out[35]:
city_name
state
Population
area
First
Bangalore
KA
200000
19191
Second
Chennai
TN
300000
203030
Third
Mumbai
MH
600000
304040
Fourth
Cuttack
OD
500000
101000
Fifth
Hyd'bad
TS
350000
20022002
Sixth
Kolkatta
WB
550000
404004000
. . .

cf1.head()
Out[36]:
city_name
state
Population
area
First
Bangalore
KA
200000
19191
Second
Chennai
TN
300000
203030
Third
Mumbai
MH
600000
304040
Fourth
Cuttack
OD
500000
101000
Fifth
Hyd'bad
TS
350000
20022002
. . .

cf1.tail()
Out[37]:
city_name
state
Population
area
Second
Chennai
TN
300000
203030
Third
Mumbai
MH
600000
304040
Fourth
Cuttack
OD
500000
101000
Fifth
Hyd'bad
TS
350000
20022002
Sixth
Kolkatta
WB
550000
404004000
. . .

cf1[cf1['Population'] > 500000]
Out[38]:
city_name
state
Population
area
Third
Mumbai
MH
600000
304040
Sixth
Kolkatta
WB
550000
404004000
. . .

cf1[(cf1['Population'] > 500000) & (cf1['area'] > 400000) ]
Out[39]:
city_name
state
Population
area
Sixth
Kolkatta
WB
550000
404004000
. . .
 # pandas.DataFrame.copy() function: This function make a copy of this object’s indices and data.
#When deep=True (default), a new object will be created with a copy of the calling object’s data and indices.
#Modifications to the data or indices of the copy will not be reflected in the original object.
#When deep=False, a new object will be created without copying the calling object’s data or index (only references to the data and index are copied).
#Any changes to the data of the original will be reflected in the shallow copy (and vice versa).
import numpy as np
import pandas as pd
print("-------------------------df1------------------------------")
df1 = pd.DataFrame({'A': [5, 2], 'B': [4, 8]})
print(df1)
print("--------------------------df2-----------------------------")
df2 = df1.copy()
print(df2)
print("-------------------------df3------------------------------")
df3 = df1.copy(deep=True)
print(df3)
print("--------------------------df4-----------------------------")
df4 = df1.copy(deep=False)
print(df4)
print("--------------------------Updated-----------------------------")
df4['C'] = np.nan
c =[3,6]
df4['C']=c
print(df4)
-------------------------df1------------------------------
   A  B
0  5  4
1  2  8
--------------------------df2-----------------------------
   A  B
0  5  4
1  2  8
-------------------------df3------------------------------
   A  B
0  5  4
1  2  8
--------------------------df4-----------------------------
   A  B
0  5  4
1  2  8
--------------------------Updated-----------------------------
   A  B  C
0  5  4  3
1  2  8  6
. . .

# Deleting  a cloumn
df4.pop('C')
print("--------------------------Updated-----------------------------")
print(df4)
--------------------------Updated-----------------------------
   A  B
0  5  4
1  2  8
. . .

df4['C'] = np.nan
c =[3,6]
df4['C']=c
print(df4)
df4.sort_values("B",ascending=True)
print(df4)
df4.sort_values("B",ascending=False)
print(df4)
   A  B  C
0  5  4  3
1  2  8  6
   A  B  C
0  5  4  3
1  2  8  6
   A  B  C
0  5  4  3
1  2  8  6
. . .

df1 = pd.DataFrame({'A': [1,2,3],'B':[3,4,5]})
df2 = pd.DataFrame({'A':[2,3,4,5], 'C':[3,4,7,9]})
print ('------------df1----------')
print(df1)
print ('------------df2----------')
print(df2)
------------df1----------
   A  B
0  1  3
1  2  4
2  3  5
------------df2----------
   A  C
0  2  3
1  3  4
2  4  7
3  5  9
. . .

# Full Outer join in Oracle
frames = [df1,df2]
result = pd.concat(frames) # default axis = 0
print(result)
   A    B    C
0  1  3.0  NaN
1  2  4.0  NaN
2  3  5.0  NaN
0  2  NaN  3.0
1  3  NaN  4.0
2  4  NaN  7.0
3  5  NaN  9.0
C:\Users\kuldip_s\Downloads\Anaconda\lib\site-packages\ipykernel_launcher.py:4: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  after removing the cwd from sys.path.
. . .

result1 = pd.concat(frames,axis=1)
print(result1)
     A    B  A  C
0  1.0  3.0  2  3
1  2.0  4.0  3  4
2  3.0  5.0  4  7
3  NaN  NaN  5  9
. . .

#same as above
result2 = pd.concat(frames,axis=1,join='outer')
print(result2)
     A    B  A  C
0  1.0  3.0  2  3
1  2.0  4.0  3  4
2  3.0  5.0  4  7
3  NaN  NaN  5  9
. . .

# Inner / Equi join in Oracle
result3 = pd.concat(frames,axis=1,join='inner')
print(result3)
   A  B  A  C
0  1  3  2  3
1  2  4  3  4
2  3  5  4  7
. . .







# Common column between datasets df1 & df2
result4 = pd.concat(frames,axis=0,join='inner')
print(result4)
   A
0  1
1  2
2  3
0  2
1  3
2  4
3  5
. . .

print ('------------df1----------')
print(df1)
print ('------------df2----------')
print(df2)
------------df1----------
   A  B
0  1  3
1  2  4
2  3  5
------------df2----------
   A  C
0  2  3
1  3  4
2  4  7
3  5  9
. . .

# Merge
pd.merge(df1,df2,how="left")
Out[51]:
A
B
C
0
1
3
NaN
1
2
4
3.0
2
3
5
4.0
. . .

pd.merge(df1,df2,how="right")
Out[52]:
A
B
C
0
2
4.0
3
1
3
5.0
4
2
4
NaN
7
3
5
NaN
9
. . .


pd.merge(df1,df2,how="inner")
Out[53]:
A
B
C
0
2
4
3
1
3
5
4
. . .

pd.merge(df1,df2,how="outer")
Out[54]:
A
B
C
0
1
3.0
NaN
1
2
4.0
3.0
2
3
5.0
4.0
3
4
NaN
7.0
4
5
NaN
9.0
. . .


No comments: