Monday, 28 September 2020

Python - Pandas: Dataframe - ii

 

import pandas as pd

import numpy as np

df = pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/drinks.csv')

df

Out[13]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

0

Afghanistan

0

0

0

0.0

AS

1

Albania

89

132

54

4.9

EU

2

Algeria

25

0

14

0.7

AF

3

Andorra

245

138

312

12.4

EU

4

Angola

217

57

45

5.9

AF

...

...

...

...

...

...

...

188

Venezuela

333

100

3

7.7

SA

189

Vietnam

111

2

1

2.0

AS

190

Yemen

6

0

0

0.1

AS

191

Zambia

32

19

4

2.5

AF

192

Zimbabwe

64

18

4

4.7

AF

193 rows × 6 columns

 

In [5]:

pd.options.display.max_rows

Out[5]:

60

In [6]:

pd.options.display.max_columns

Out[6]:

20

 

In [7]:

pd.get_option('max_rows')

Out[7]:

60

 

In [8]:

pd.get_option('max_columns')

Out[8]:

20

 

In [10]:

pd.reset_option('max_rows')

df

Out[10]:

country

beer_servings

spirit_servings

wine_servings

total_litres_of_pure_alcohol

continent

0

Afghanistan

0

0

0

0.0

AS

1

Albania

89

132

54

4.9

EU

2

Algeria

25

0

14

0.7

AF

3

Andorra

245

138

312

12.4

EU

4

Angola

217

57

45

5.9

AF

...

...

...

...

...

...

...

192

Zimbabwe

64

18

4

4.7

AF

193 rows × 6 columns

In [11]:

pd.options.display.precision

Out[11]:

6

In [14]:

pdf = pd.DataFrame(np.random.randn(5,5))

pdf

Out[14]:

0

1

2

3

4

0

-1.309994

-0.321530

0.570290

1.319875

1.119663

1

0.542382

1.481424

0.740494

2.020025

0.461774

2

-0.803047

0.648151

-0.527519

0.032987

-0.177919

3

-0.693263

0.252870

-0.072195

-0.719892

0.354932

4

-0.282920

0.180100

0.994994

0.139111

-1.761986

 

In [15]:

type(pdf)

Out[15]:

pandas.core.frame.DataFrame

 

In [16]:

np.random.randn(5,5)

Out[16]:

array([[-0.51844628, -0.72715875, -0.89111341,  0.02909364, -1.73662819],

       [ 0.39410077, -0.25912634,  0.04872252,  0.29090885,  0.70473386],

       [-0.67857312, -1.25726342,  0.17568482,  0.95954918,  0.13878635],

       [-0.98655845,  1.28251441, -0.32269209,  1.75882543,  1.18835483],

       [ 0.26609078,  1.55897722, -1.60704113,  0.76393953,  0.8338056 ]])

 

In [17]:

type(np.random.randn(5,5))

Out[17]:

numpy.ndarray

 

In [18]:

import matplotlib.pyplot as plt

%matplotlib inline

 

In [19]:

sdf = pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/Stock.csv')

sdf

Out[19]:

Day

Open

High

Low

Close

Adj Close

Volume

0

1

16.770000

17.559999

16.730000

17.010000

16.934999

271700

1

2

17.100000

17.469999

16.950001

17.200001

17.200001

229800

2

3

17.059999

17.230000

16.870001

17.049999

17.049999

146300

17

18

17.299999

17.389999

16.450001

16.510000

16.510000

300000

18

19

16.299999

16.629999

16.000000

16.459999

16.459999

324900

19

20

16.459999

16.580000

15.980000

16.040001

16.040001

141600

 

In [21]:

sdf.plot()

Out[21]:

<matplotlib.axes._subplots.AxesSubplot at 0x1819d141ac8>







In [22]:

sdf['Open'].plot()

Out[22]:

<matplotlib.axes._subplots.AxesSubplot at 0x1819d4a8708>



 

In [24]:

plt.style.available

Out[24]:

['bmh',

 'classic',

 'dark_background',

 'fast',

 'fivethirtyeight',

 'ggplot',

 'grayscale',

 'seaborn-bright',

 'seaborn-colorblind',

 'seaborn-dark-palette',

 'seaborn-dark',

 'seaborn-darkgrid',

 'seaborn-deep',

 'seaborn-muted',

 'seaborn-notebook',

 'seaborn-paper',

 'seaborn-pastel',

 'seaborn-poster',

 'seaborn-talk',

 'seaborn-ticks',

 'seaborn-white',

 'seaborn-whitegrid',

 'seaborn',

 'Solarize_Light2',

 'tableau-colorblind10',

 '_classic_test']

 

In [25]:

plt.style.use('seaborn')

sdf['Open'].plot()

Out[25]:

<matplotlib.axes._subplots.AxesSubplot at 0x1819d66adc8>



 

In [29]:

plt.style.use('dark_background')

sdf[['Open','Close']].plot()          # Multiple columns by putting those in a List

Out[29]:

<matplotlib.axes._subplots.AxesSubplot at 0x1819e82b9c8>



 

In [30]:

plt.style.use('ggplot')

sdf['Open'].plot.line()

Out[30]:

<matplotlib.axes._subplots.AxesSubplot at 0x1819e8b0c08>



In [34]:

plt.style.use('seaborn-notebook')

sdf[['Open','Close']].plot.bar()

Out[34]:

<matplotlib.axes._subplots.AxesSubplot at 0x1819eb6de88>



 

In [35]:

df.to_dict()

. . .

In [36]:

# Import csv data to Pandas

 df = pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/drinks.csv')

# Export xls data from Pandas

df.to_excel('C:/Users/kuldip_s/Documents/Python Scripts/drinks_xls.xlsx')

 

In [37]:

df.columns

Out[37]:

Index(['country', 'beer_servings', 'spirit_servings', 'wine_servings',

       'total_litres_of_pure_alcohol', 'continent'],

      dtype='object')

In [39]:

# Export selected columns xls data from Pandas

df[['country','total_litres_of_pure_alcohol','continent']].to_excel('C:/Users/kuldip_s/Documents/Python Scripts/selected_columns.xlsx')

In [42]:

df.to_json('C:/Users/kuldip_s/Documents/Python Scripts/selected_columns.json')

In [43]:

df.to_html('C:/Users/kuldip_s/Documents/Python Scripts/selected_columns.html')

 

Working with Text data

In [53]:

import pandas as pd

data = pd.read_csv('C:/Users/kuldip_s/Documents/Python Scripts/SMSSpamCollection.txt',sep='\t',

            header = None, names=['Status','Message'])

type(data)

Out[53]:

pandas.core.frame.DataFrame

In [70]:

data.head(10)

Out[70]:

Status

Message

0

ham

Go until jurong point, crazy.. Available only ...

1

ham

Ok lar... Joking wif u oni...

2

spam

Free entry in 2 a wkly comp to win FA Cup fina...

3

ham

U dun say so early hor... U c already then say...

4

ham

Nah I don't think he goes to usf, he lives aro...

5

spam

FreeMsg Hey there darling it's been 3 week's n...

6

ham

Even my brother is not like to speak with me. ...

7

ham

As per your request 'Melle Melle (Oru Minnamin...

8

spam

WINNER!! As a valued network customer you have...

9

spam

Had your mobile 11 months or more? U R entitle...

In [56]:

data['Status'].upper()

---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

<ipython-input-56-677f1276ac31> in <module>

----> 1 data['Status'].upper()

 

~\Downloads\Anaconda\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)

   5177             if self._info_axis._can_hold_identifiers_and_holds_name(name):

   5178                 return self[name]

-> 5179             return object.__getattribute__(self, name)

   5180

   5181     def __setattr__(self, name, value):

 

AttributeError: 'Series' object has no attribute 'upper'

 

In [58]:

data['Status'].str.upper().head()  # Upper case

Out[58]:

0     HAM

1     HAM

2    SPAM

3     HAM

4     HAM

Name: Status, dtype: object

In [59]:

data['Status'].str.lower().head()   # Lower case

Out[59]:

0     ham

1     ham

2    spam

3     ham

4     ham

Name: Status, dtype: object

 

In [60]:

data['Status'].str.title().head()  # Camel case

Out[60]:

0     Ham

1     Ham

2    Spam

3     Ham

4     Ham

Name: Status, dtype: object

 

In [61]:

data['Status'].str.len().head()  # find the length of the text column

Out[61]:

0    3

1    3

2    4

3    3

4    3

Name: Status, dtype: int64

 

In [62]:

data['Message'].str.len().head()

Out[62]:

0    111

1     29

2    155

3     49

4     61

Name: Message, dtype: int64

 

In [65]:

data['Message'][0]   # find the value of the 1st row for Message column

Out[65]:

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

 

In [66]:

data['Message'][1]   # find the value of the 2nd row for Message column

Out[66]:

'Ok lar... Joking wif u oni...'

 

In [77]:

spam_msg=data[data['Status'] == 'spam']  # Find all the rows for Status = 'spam'

spam_msg.head()

Out[77]:

Status

Message

2

spam

Free entry in 2 a wkly comp to win FA Cup fina...

5

spam

FreeMsg Hey there darling it's been 3 week's n...

8

spam

WINNER!! As a valued network customer you have...

9

spam

Had your mobile 11 months or more? U R entitle...

11

spam

SIX chances to win CASH! From 100 to 20,000 po...

 

In [76]:

spam_msg['Message'].head()    # Find all the Message column values only for Status = spam

Out[76]:

2     Free entry in 2 a wkly comp to win FA Cup fina...

5     FreeMsg Hey there darling it's been 3 week's n...

8     WINNER!! As a valued network customer you have...

9     Had your mobile 11 months or more? U R entitle...

11    SIX chances to win CASH! From 100 to 20,000 po...

Name: Message, dtype: object

 

In [79]:

'I am learning Pandas'

Out[79]:

'I am learning Pandas'

 

In [80]:

'I am learning Pandas'.replace('I am','You are')   # Replace a string

Out[80]:

'You are learning Pandas'

 

In [83]:

type(spam_msg['Message'])

Out[83]:

pandas.core.series.Series

In [84]:

type(data)

Out[84]:

pandas.core.frame.DataFrame

 

In [85]:

data['Message'][3]

Out[85]:

'U dun say so early hor... U c already then say...'

In [86]:

data['Message'][3].replace('...','.')

Out[86]:

'U dun say so early hor. U c already then say.'

In [94]:

data['Message'][6].replace('.','  .  ')  # Adding white spaces

Out[94]:

'Even my brother is not like to speak with me  .   They treat me like aids patent  .  '

In [95]:

data['Message'][6].strip()  # removes the white spaces

Out[95]:

'Even my brother is not like to speak with me. They treat me like aids patent.'

In [97]:

data['Message']

Out[97]:

0       Go until jurong point, crazy.. Available only ...

1                           Ok lar... Joking wif u oni...

2       Free entry in 2 a wkly comp to win FA Cup fina...

3       U dun say so early hor... U c already then say...

4       Nah I don't think he goes to usf, he lives aro...

                              ...                       

5567    This is the 2nd time we have tried 2 contact u...

5568                 Will ü b going to esplanade fr home?

5569    Pity, * was in mood for that. So...any other s...

5570    The guy did some bitching but I acted like i'd...

5571                           Rofl. Its true to its name

Name: Message, Length: 5572, dtype: object

 

In [105]:

data[data['Message'].str.startswith('This')].head()  # records where Message starts with word 'This'

Out[105]:

Status

Message

545

ham

This girl does not stay in bed. This girl does...

668

ham

This pay is &lt;DECIMAL&gt; lakhs:)

791

ham

This is hoping you enjoyed your game yesterday...

1853

spam

This is the 2nd time we have tried 2 contact u...

1878

ham

This pen thing is beyond a joke. Wont a Biro d...

 

In [106]:

data[data['Message'].str.endswith('...')].head()  # records where Message starts with word '...'

Out[106]:

Status

Message

0

ham

Go until jurong point, crazy.. Available only ...

1

ham

Ok lar... Joking wif u oni...

3

ham

U dun say so early hor... U c already then say...

22

ham

So ü pay first lar... Then when is da stock co...

38

ham

Anything lor... U decide...

In [111]:

Sorry_word = data['Message'].str.contains('sorry')  # Message column conatains 'sorry' word

data[Sorry_word].head()

Out[111]:

Status

Message

53

ham

Wow. I never realized that you were so embaras...

192

ham

I'm sorry. I've joined the league of people th...

624

ham

sorry, no, have got few things to do. may be i...

706

ham

I don't think I can get away for a trek that l...

755

ham

Realy sorry-i don't recognise this number and ...

In [112]:

len(Sorry_word)  # How many records are there where 'sorry' word is available in 'message' column?

Out[112]:

5572

 

In [121]:

sorry_msg=data[Sorry_word]

sorry_msg.head()

Out[121]:

Status

Message

53

ham

Wow. I never realized that you were so embaras...

192

ham

I'm sorry. I've joined the league of people th...

624

ham

sorry, no, have got few things to do. may be i...

706

ham

I don't think I can get away for a trek that l...

755

ham

Realy sorry-i don't recognise this number and ...

 

In [126]:

sorry_msg[sorry_msg['Status']=='ham'].head()  # Out of above result

Out[126]:

Status

Message

53

ham

Wow. I never realized that you were so embaras...

192

ham

I'm sorry. I've joined the league of people th...

624

ham

sorry, no, have got few things to do. may be i...

706

ham

I don't think I can get away for a trek that l...

755

ham

Realy sorry-i don't recognise this number and ...

In [128]:

mystr = 'Hi, Hello, Nmskar : How are you all ?, All Good ? : Be strong, Be positive'

mystr

Out[128]:

'Hi, Hello, Nmskar : How are you all ?, All Good ? : Be strong, Be positive'

 

In [143]:

mystr.split()     # split the string

Out[143]:

['Hi,',

 'Hello,',

 'Nmskar',

 ':',

 'How',

 'are',

 'you',

 'all',

 '?,',

 'All',

 'Good',

 '?',

 ':',

 'Be',

 'strong,',

 'Be',

 'positive']

 

In [129]:

mystr.split(':')   # split the string on the basis of a character

Out[129]:

['Hi, Hello, Nmskar ',

 ' How are you all ?, All Good ? ',

 ' Be strong, Be positive']

In [135]:

mystr.split(maxsplit = 5) # split the string into number of strings

Out[135]:

['Hi,',

 'Hello,',

 'Nmskar',

 ':',

 'How',

 'are you all ?, All Good ? : Be strong, Be positive']

In [140]:

data['Message'][1]

Out[140]:

'Ok lar... Joking wif u oni...'

 

In [141]:

data['Message'][1].split()

Out[141]:

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

 

In [142]:

data['Message'][1].split()[0]  # Get the 1st token of splitted string

Out[142]:

'Ok'

 

In [145]:

data['Message'].str.split().head()  # Split all the rows in Message column

Out[145]:

0    [Go, until, jurong, point,, crazy.., Available...

1                 [Ok, lar..., Joking, wif, u, oni...]

2    [Free, entry, in, 2, a, wkly, comp, to, win, F...

3    [U, dun, say, so, early, hor..., U, c, already...

4    [Nah, I, don't, think, he, goes, to, usf,, he,...

Name: Message, dtype: object

 

In [146]:

data['Message'].str.split().str.get(0).head()  # Get the 1st token of splitted string for all rows

Out[146]:

0      Go

1      Ok

2    Free

3       U

4     Nah

Name: Message, dtype: object

In [147]:

data.columns                     # To know the column names in a Dataset

Out[147]:

Index(['Status', 'Message'], dtype='object')

 

In [149]:

data.columns.str.len()              # To know the length of each column names in a Dataset

Out[149]:

Int64Index([6, 7], dtype='int64')

 

In [154]:

data.columns.str.upper()   # Make the column names into uppercase

Out[154]:

Index(['STATUS', 'MESSAGE'], dtype='object')