Data Analysis with Pandas

0 — The Data

import numpy as np
import pandas as pd
import seaborn as sns

df = sns.load_dataset('tips')
df
png

A — Apply

# function to binarise the data
def smoke_bin(text) -> int:
    """Binarise a smoker label.

    Args:
        text: The label to encode; compared against the string 'Yes'.

    Returns:
        1 if ``text`` equals 'Yes', otherwise 0.
    """
    # Any value other than the exact string 'Yes' maps to 0.
    return 1 if text == 'Yes' else 0


df['smoker'].apply(lambda x : smoke_bin(x))
0 0
1 0
2 0
3 0
4 0
..
239 0
240 1
241 1
242 0
243 0
Name: smoker, Length: 244, dtype: category
Categories (2, int64): [1, 0]

B — Boolean Criterion

is_smoker = df['smoker'] == 'Yes'
df[is_smoker]
png
# ~ is the boolean operator "not"
df[~is_smoker]
png

C — Contains

sundays = df['day'].str.contains('Sun')
df[sundays]
png

D — Describe

df.describe(percentiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99], include='all')
png

E — Explode

# creating a DataFrame with sequences as observations
df_explode = pd.DataFrame({'Col1': [(2,4), [i for i in range(5)], []], 'Col2': [1, 2, 3]})
png
df_explode.explode('Col1')
png

F — Fillna

# altering the DataFrame
df['tip_na'] = df['tip'].replace(to_replace=[2.00, 3.00, 4.00], value=np.nan)

df['tip_na'].fillna(df['tip_na'].mean())
0 1.010000
1 1.660000
2 3.500000
3 3.310000
4 3.610000
...
239 5.920000
240 3.116932
241 3.116932
242 1.750000
243 3.116932
Name: tip_na, Length: 244, dtype: float64
import numpy as np
import pandas as pd
import seaborn as sns
df = sns.load_dataset('tips')
df
png

G — Groupby

df.groupby(by=['day', 'time']).sum()
png

H — Head

df.head(n=10)
png

I — Isna

# altering the DataFrame
df['tip_na'] = df['tip'].replace(to_replace=[2.00, 3.00, 4.00], value=np.nan)

df.isna().value_counts()
total_bill tip sex smoker day time size tip_na
False False False False False False False False 176
True 68
dtype: int64

J — Join

# creating DataFrames to join
df_num = df[['total_bill', 'tip', 'size']]
df_cat = df[['sex', 'smoker', 'day', 'time']]
df_num.join(df_cat)
png

K — Kurtosis

df['total_bill'].plot(kind='kde')
<AxesSubplot:ylabel='Density'>
png
df['tip'].plot(kind='kde')
<AxesSubplot:ylabel='Density'>
png
df.kurtosis()
total_bill 1.218484
tip 3.648376
size 1.731700
tip_na 2.494184
dtype: float64

L — Loc / Iloc

# Creating a DataFrame with alpha index
df_loc = df.loc[[i for i in range(26)]]
df_loc['new_index'] =[i for i in 'qwertyuiopasdfghjklzxcvbnm']
df_loc.set_index('new_index', inplace=True)
df_loc.head(5)
png
df_loc.loc[['w','a','s','d']]
png
df_loc.iloc[[0, 1, 2, 3, 4]]
png

M — Melt

pd.melt(df, id_vars=['time'], value_vars=['total_bill', 'tip'])
png

N — NaN

O — Ordered

df['time'].cat.ordered
False

P — Plot

df.plot()
<AxesSubplot:>
png

Q — Qcut

pd.qcut(df['tip'], 3, labels=['Cheap', 'Decent', 'Generous'])
0 Cheap
1 Cheap
2 Generous
3 Generous
4 Generous
...
239 Generous
240 Cheap
241 Cheap
242 Cheap
243 Decent
Name: tip, Length: 244, dtype: category
Categories (3, object): ['Cheap' < 'Decent' < 'Generous']

R — Read_…

pd.read_parquet('s3://bucket_name', columns=['column_1', 'column_3', 'column_11'])

S — Sample

df.sample(frac=.1, replace=True)
png

T — To_…

df.to_csv('filename.csv')
pd.read_csv('filename.csv', index_col=[0])
png

U — Unique

pd.unique(df['day'])
['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Sun', 'Sat', 'Thur', 'Fri']

V — Value Counts

df['tip'].value_counts().head(20)
2.00 33
3.00 23
4.00 12
5.00 10
2.50 10
3.50 9
1.50 9
1.00 4
1.25 3
3.48 3
2.01 2
4.08 2
2.23 2
2.03 2
3.18 2
2.31 2
2.24 2
6.50 2
4.30 2
2.20 2
Name: tip, dtype: int64

W — Where

df.where(df['smoker'] == 'Yes')
png

X — XS

# creating a DataFrame with multi-index
df_xs = df.set_index(['day', 'size'])
png
df_xs.xs('Fri')
png

Y — Year

# creating a Series of month-end dates
df_time = pd.Series(pd.date_range('2000-01/01', periods=20, freq='M'))
0 2000-01-31
1 2000-02-29
2 2000-03-31
3 2000-04-30
4 2000-05-31
5 2000-06-30
6 2000-07-31
7 2000-08-31
8 2000-09-30
9 2000-10-31
10 2000-11-30
11 2000-12-31
12 2001-01-31
13 2001-02-28
14 2001-03-31
15 2001-04-30
16 2001-05-31
17 2001-06-30
18 2001-07-31
19 2001-08-31
dtype: datetime64[ns]
df_time.dt.year
0 2000
1 2000
2 2000
3 2000
4 2000
5 2000
6 2000
7 2000
8 2000
9 2000
10 2000
11 2000
12 2001
13 2001
14 2001
15 2001
16 2001
17 2001
18 2001
19 2001
dtype: int64

Z — Zfill

df['smoker'].str.zfill(4)
0 00No
1 00No
2 00No
3 00No
4 00No
...
239 00No
240 0Yes
241 0Yes
242 00No
243 00No
Name: smoker, Length: 244, dtype: object

Data Scientist

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store