Pandas Basics¶

In [ ]:

Copied!

import pandas as pd
import numpy as np
import pandas as pd
import numpy as np

Load Dataset¶

In [ ]:

Copied!





# !pip install mlcroissant
# import mlcroissant as mlc

# # Fetch the Croissant JSON-LD
# croissant_dataset = mlc.Dataset('https://www.kaggle.com/datasets/heesoo37/120-years-of-olympic-history-athletes-and-results/croissant/download')

# # Check what record sets are in the dataset
# record_sets = croissant_dataset.metadata.record_sets
# print(record_sets)

# # Fetch the records and put them in a DataFrame
# record_set_df = pd.DataFrame(croissant_dataset.records(record_set=record_sets[0].uuid))
# record_set_df.head(2)
# !pip install mlcroissant
# import mlcroissant as mlc

# # Fetch the Croissant JSON-LD
# croissant_dataset = mlc.Dataset('https://www.kaggle.com/datasets/heesoo37/120-years-of-olympic-history-athletes-and-results/croissant/download')

# # Check what record sets are in the dataset
# record_sets = croissant_dataset.metadata.record_sets
# print(record_sets)

# # Fetch the records and put them in a DataFrame
# record_set_df = pd.DataFrame(croissant_dataset.records(record_set=record_sets[0].uuid))
# record_set_df.head(2)

In [ ]:

Copied!

# cols = map(lambda name: name.split("/")[1], record_set_df.columns)
# df = pd.DataFrame(record_set_df.values[:1401], columns=cols)
# cols = map(lambda name: name.split("/")[1], record_set_df.columns)
# df = pd.DataFrame(record_set_df.values[:1401], columns=cols)

In [ ]:

Copied!

# df.head(2)
# df.head(2)

In [ ]:

Copied!

# df.count()
# df.count()

In [ ]:

Copied!

# df.describe()
# df.describe()

Basics¶

In [ ]:

Copied!

temp_df = pd.DataFrame(data=[[1, 2, 3], [4, 2, 6], [7, 2, 9]], dtype=np.int8, columns=["A", "B", "C"], index=["x", "y", "z"])
temp_df = pd.DataFrame(data=[[1, 2, 3], [4, 2, 6], [7, 2, 9]], dtype=np.int8, columns=["A", "B", "C"], index=["x", "y", "z"])

In [ ]:

Copied!

temp_df.head(1)
temp_df.head(1)

Out[ ]:

	A	B	C
x	1	2	3

In [ ]:

Copied!

temp_df.tail(1)
temp_df.tail(1)

Out[ ]:

	A	B	C
z	7	2	9

In [ ]:

Copied!

temp_df.index
temp_df.index

Out[ ]:

Index(['x', 'y', 'z'], dtype='object')

In [ ]:

Copied!

temp_df.index.to_list()
temp_df.index.to_list()

Out[ ]:

['x', 'y', 'z']

In [ ]:

Copied!

temp_df.info()
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, x to z
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int8 
 1   B       3 non-null      int8 
 2   C       3 non-null      int8 
dtypes: int8(3)
memory usage: 141.0+ bytes

In [ ]:

Copied!

temp_df.describe()
temp_df.describe()

Out[ ]:

	A	B	C
count	3.0	3.0	3.0
mean	4.0	2.0	6.0
std	3.0	0.0	3.0
min	1.0	2.0	3.0
25%	2.5	2.0	4.5
50%	4.0	2.0	6.0
75%	5.5	2.0	7.5
max	7.0	2.0	9.0

In [ ]:

Copied!

temp_df.nunique() # number of unique values in each column
temp_df.nunique() # number of unique values in each column

Out[ ]:

	0
A	3
B	1
C	3

dtype: int64

In [ ]:

Copied!

temp_df["B"].nunique()
temp_df["B"].nunique()

Out[ ]:

In [ ]:

Copied!

temp_df.shape
temp_df.shape

Out[ ]:

(3, 3)

In [ ]:

Copied!

temp_df.memory_usage() # Return the memory usage of each column in bytes.
temp_df.memory_usage() # Return the memory usage of each column in bytes.

Out[ ]:

	0
Index	132
A	3
B	3
C	3

dtype: int64

Access Random Data¶

In [ ]:

Copied!

temp_df.sample(2, replace=False)
temp_df.sample(2, replace=False)

Out[ ]:

	A	B	C
x	1	2	3
z	7	2	9

Save¶

In [ ]:

Copied!

temp_df.to_parquet("./temp.parquet")
!ls
temp_df.to_parquet("./temp.parquet")
!ls

sample_data  temp.parquet

Load From URL¶

https://github.com/KeithGalli/complete-pandas-tutorial/blob/master/warmup-data/coffee.csv

Load csv¶

In [ ]:

Copied!

# load from url
coffee = pd.read_csv("https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/refs/heads/master/warmup-data/coffee.csv")
coffee.head(2)
# load from url
coffee = pd.read_csv("https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/refs/heads/master/warmup-data/coffee.csv")
coffee.head(2)

Out[ ]:

	Day	Coffee Type	Units Sold
0	Monday	Espresso	25
1	Monday	Latte	15

Load Parquet¶

https://github.com/KeithGalli/complete-pandas-tutorial/tree/master/data

In [ ]:

Copied!

olympic_results = pd.read_parquet("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/results.parquet")
olympic_results.head(2)
olympic_results = pd.read_parquet("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/results.parquet")
olympic_results.head(2)

Out[ ]:

	year	type	discipline	event	as	athlete_id	noc	team	place	tied	medal
0	1912.0	Summer	Tennis	Singles, Men (Olympic)	Jean-François Blanchy	1	FRA	None	17.0	True	None
1	1912.0	Summer	Tennis	Doubles, Men (Olympic)	Jean-François Blanchy	1	FRA	Jean Montariol	NaN	False	None

In [ ]:

Copied!

bios = pd.read_csv("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/bios.csv")
bios.head(2)
bios = pd.read_csv("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/bios.csv")
bios.head(2)

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date
0	1	Jean-François Blanchy	1886-12-12	Bordeaux	Gironde	FRA	France	NaN	NaN	1960-10-02
1	2	Arnaud Boetsch	1969-04-01	Meulan	Yvelines	FRA	France	183.0	76.0	NaN

Loc iloc¶

In [ ]:

Copied!

coffee.head()
coffee.head()

Out[ ]:

	Day	Coffee Type	Units Sold
0	Monday	Espresso	25
1	Monday	Latte	15
2	Tuesday	Espresso	30
3	Tuesday	Latte	20
4	Wednesday	Espresso	35

In [ ]:

Copied!

print(coffee.loc[:2, "Units Sold"]) # Row, Col
print(coffee.loc[[1, 2, 7], ["Day", "Units Sold"]]) # Row, Col
print(coffee.loc[[1, 2, 7]]) # Row, Col
print(coffee.loc[:2, "Units Sold"]) # Row, Col
print(coffee.loc[[1, 2, 7], ["Day", "Units Sold"]]) # Row, Col
print(coffee.loc[[1, 2, 7]]) # Row, Col

0    25
1    15
2    30
Name: Units Sold, dtype: int64
        Day  Units Sold
1    Monday          15
2   Tuesday          30
7  Thursday          30
        Day Coffee Type  Units Sold
1    Monday       Latte          15
2   Tuesday    Espresso          30
7  Thursday       Latte          30

In [ ]:

Copied!

coffee.iloc[[1, 2, 7], [0, 2]] # == coffee.loc[[1, 2, 7], ["Day", "Units Sold"]])
coffee.iloc[[1, 2, 7], [0, 2]] # == coffee.loc[[1, 2, 7], ["Day", "Units Sold"]])

Out[ ]:

	Day	Units Sold
1	Monday	15
2	Tuesday	30
7	Thursday	30

In [ ]:

Copied!

coffee.loc[[1, 2, 7], ["Units Sold"]] += 5
coffee.head()
coffee.loc[[1, 2, 7], ["Units Sold"]] += 5
coffee.head()

Out[ ]:

	Day	Coffee Type	Units Sold
0	Monday	Espresso	25
1	Monday	Latte	20
2	Tuesday	Espresso	35
3	Tuesday	Latte	20
4	Wednesday	Espresso	35

Use at, iat for single item¶

Sort¶

In [ ]:

Copied!

coffee.sort_values(["Units Sold", "Day"], ascending=[False, True])
coffee.sort_values(["Units Sold", "Day"], ascending=[False, True])

Out[ ]:

	Day	Coffee Type	Units Sold
8	Friday	Espresso	45
10	Saturday	Espresso	45
12	Sunday	Espresso	45
6	Thursday	Espresso	40
9	Friday	Latte	35
11	Saturday	Latte	35
13	Sunday	Latte	35
7	Thursday	Latte	35
2	Tuesday	Espresso	35
4	Wednesday	Espresso	35
0	Monday	Espresso	25
5	Wednesday	Latte	25
1	Monday	Latte	20
3	Tuesday	Latte	20

Iterate¶

In [ ]:

Copied!





for index, row in coffee.iterrows():
  if index == 5:
    break
  print(index, row)
for index, row in coffee.iterrows():
  if index == 5:
    break
  print(index, row)

0 Day              Monday
Coffee Type    Espresso
Units Sold           25
Name: 0, dtype: object
1 Day            Monday
Coffee Type     Latte
Units Sold         20
Name: 1, dtype: object
2 Day             Tuesday
Coffee Type    Espresso
Units Sold           35
Name: 2, dtype: object
3 Day            Tuesday
Coffee Type      Latte
Units Sold          20
Name: 3, dtype: object
4 Day            Wednesday
Coffee Type     Espresso
Units Sold            35
Name: 4, dtype: object

Filter¶

In [ ]:

Copied!

olympic_results.head()
olympic_results.head()

Out[ ]:

	year	type	discipline	event	as	athlete_id	noc	team	place	tied	medal
0	1912.0	Summer	Tennis	Singles, Men (Olympic)	Jean-François Blanchy	1	FRA	None	17.0	True	None
1	1912.0	Summer	Tennis	Doubles, Men (Olympic)	Jean-François Blanchy	1	FRA	Jean Montariol	NaN	False	None
2	1920.0	Summer	Tennis	Singles, Men (Olympic)	Jean-François Blanchy	1	FRA	None	32.0	True	None
3	1920.0	Summer	Tennis	Doubles, Mixed (Olympic)	Jean-François Blanchy	1	FRA	Jeanne Vaussard	8.0	True	None
4	1920.0	Summer	Tennis	Doubles, Men (Olympic)	Jean-François Blanchy	1	FRA	Jacques Brugnon	4.0	False	None

In [ ]:

Copied!

bios.head()
bios.head()

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date
0	1	Jean-François Blanchy	1886-12-12	Bordeaux	Gironde	FRA	France	NaN	NaN	1960-10-02
1	2	Arnaud Boetsch	1969-04-01	Meulan	Yvelines	FRA	France	183.0	76.0	NaN
2	3	Jean Borotra	1898-08-13	Biarritz	Pyrénées-Atlantiques	FRA	France	183.0	76.0	1994-07-17
3	4	Jacques Brugnon	1895-05-11	Paris VIIIe	Paris	FRA	France	168.0	64.0	1978-03-20
4	5	Albert Canet	1878-04-17	Wandsworth	England	GBR	France	NaN	NaN	1930-07-25

Work on Olympics¶

In [ ]:

Copied!

olympic_results['discipline'].unique()
olympic_results['discipline'].unique()

Out[ ]:

array(['Tennis', 'Table Tennis', 'Fencing', 'Badminton', 'Hockey',
       'Athletics', 'Art Competitions', 'Modern Pentathlon', 'Archery',
       'Boxing', 'Softball (Baseball/Softball)',
       'Baseball (Baseball/Softball)', 'Sailing', 'Golf', 'Jeu De Paume',
       'Basketball (Basketball)', 'Cycling Road (Cycling)', 'Handball',
       'Australian Rules Football', 'Bobsleigh (Bobsleigh)',
       'Canoe Sprint (Canoeing)', 'Canoe Slalom (Canoeing)',
       'Canoe Marathon (Canoeing)', 'Shooting', 'Rowing',
       'Cycling Track (Cycling)', 'Polo', 'Rugby (Rugby)',
       'Equestrian Dressage (Equestrian)',
       'Equestrian Eventing (Equestrian)',
       'Equestrian Jumping (Equestrian)', 'Tug-Of-War', 'Lacrosse',
       'Diving (Aquatics)', 'Winter Pentathlon',
       'Equestrian Vaulting (Equestrian)',
       'Equestrian Driving (Equestrian)',
       'Cycling Mountain Bike (Cycling)', 'Speed Skating (Skating)',
       'Football (Football)', 'Bicycle Polo (Cycling)',
       'Cross Country Skiing (Skiing)', 'Racquets', 'Cricket', 'Croquet',
       'Motorboating', 'Swimming (Aquatics)', 'Ice Hockey (Ice Hockey)',
       'Bandy', 'Artistic Gymnastics (Gymnastics)',
       'Rhythmic Gymnastics (Gymnastics)', 'Weightlifting', 'Wrestling',
       'Alpine Skiing (Skiing)', 'Judo', 'Military Ski Patrol (Skiing)',
       'Nordic Combined (Skiing)', 'Water Polo (Aquatics)', 'Savate',
       'Skeleton (Bobsleigh)', 'Artistic Swimming (Aquatics)',
       'Triathlon', 'Beach Volleyball (Volleyball)',
       'Volleyball (Volleyball)', 'Glíma', 'Biathlon',
       'Figure Skating (Skating)', 'Basque pelota', 'Luge',
       'Short Track Speed Skating (Skating)', 'Freestyle Skiing (Skiing)',
       'Speed Skiing (Skiing)', 'Ski Jumping (Skiing)',
       'Cycling BMX Racing (Cycling)', 'Trampolining (Gymnastics)',
       'Marathon Swimming (Aquatics)', 'Taekwondo',
       'Snowboarding (Skiing)', 'Curling', 'Roque',
       '3x3 Basketball (Basketball)', 'Skateboarding (Roller Sports)',
       'Mixed Sports', 'Rugby Sevens (Rugby)', 'Hockey 5s',
       'Ballooning (Air Sports)', 'Ski Mountaineering',
       '3-on-3 Ice Hockey (Ice Hockey)', 'Karate', 'Surfing',
       'Cycling BMX Freestyle (Cycling)', 'Sport Climbing',
       'Roller Skating (Roller Sports)', None], dtype=object)

In [ ]:

Copied!

olympic_results.info()
olympic_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308408 entries, 0 to 308407
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   year        305807 non-null  float64
 1   type        305807 non-null  object 
 2   discipline  308407 non-null  object 
 3   event       308408 non-null  object 
 4   as          308408 non-null  object 
 5   athlete_id  308408 non-null  int64  
 6   noc         308407 non-null  object 
 7   team        121714 non-null  object 
 8   place       283193 non-null  float64
 9   tied        308408 non-null  bool   
 10  medal       44139 non-null   object 
dtypes: bool(1), float64(2), int64(1), object(7)
memory usage: 23.8+ MB

In [ ]:

Copied!

olympic_results.describe()
olympic_results.describe()

Out[ ]:

	year	athlete_id	place
count	305807.000000	308408.000000	283193.000000
mean	1982.239223	72624.097125	16.102764
std	30.983982	41285.940841	19.322611
min	1896.000000	1.000000	1.000000
25%	1964.000000	34412.000000	5.000000
50%	1992.000000	73739.500000	9.000000
75%	2008.000000	107168.000000	20.000000
max	2022.000000	148986.000000	185.000000

In [ ]:

Copied!

olympic_results.loc[olympic_results["place"] > 180, "discipline"]
olympic_results.loc[olympic_results["place"] > 180, "discipline"]

Out[ ]:

	discipline
58493	Artistic Gymnastics (Gymnastics)
58507	Artistic Gymnastics (Gymnastics)
58510	Artistic Gymnastics (Gymnastics)
58511	Artistic Gymnastics (Gymnastics)
58747	Artistic Gymnastics (Gymnastics)
59433	Artistic Gymnastics (Gymnastics)
59436	Artistic Gymnastics (Gymnastics)
59437	Artistic Gymnastics (Gymnastics)
59439	Artistic Gymnastics (Gymnastics)
59440	Artistic Gymnastics (Gymnastics)
60054	Artistic Gymnastics (Gymnastics)
60636	Artistic Gymnastics (Gymnastics)
60756	Artistic Gymnastics (Gymnastics)
60761	Artistic Gymnastics (Gymnastics)
63681	Artistic Gymnastics (Gymnastics)
64985	Artistic Gymnastics (Gymnastics)
64986	Artistic Gymnastics (Gymnastics)
64987	Artistic Gymnastics (Gymnastics)
64988	Artistic Gymnastics (Gymnastics)
64989	Artistic Gymnastics (Gymnastics)
64990	Artistic Gymnastics (Gymnastics)
64991	Artistic Gymnastics (Gymnastics)
65007	Artistic Gymnastics (Gymnastics)
65010	Artistic Gymnastics (Gymnastics)
65012	Artistic Gymnastics (Gymnastics)
65013	Artistic Gymnastics (Gymnastics)
67125	Artistic Gymnastics (Gymnastics)
68180	Artistic Gymnastics (Gymnastics)
68183	Artistic Gymnastics (Gymnastics)
68186	Artistic Gymnastics (Gymnastics)
68348	Artistic Gymnastics (Gymnastics)
69909	Artistic Gymnastics (Gymnastics)
72808	Artistic Gymnastics (Gymnastics)
72864	Artistic Gymnastics (Gymnastics)
72865	Artistic Gymnastics (Gymnastics)

dtype: object

Work on Bios¶

In [ ]:

Copied!

# tall_players = bios.loc[bios['height_cm'] > 215, ['athlete_id', 'name']]
tall_players = bios.loc[(bios['height_cm'] > 215) & (bios['weight_kg'] > 100) & (bios['died_date'].isnull())][['athlete_id', 'name', 'height_cm', 'weight_kg', 'died_date']]

tall_players.head()
# tall_players = bios.loc[bios['height_cm'] > 215, ['athlete_id', 'name']]
tall_players = bios.loc[(bios['height_cm'] > 215) & (bios['weight_kg'] > 100) & (bios['died_date'].isnull())][['athlete_id', 'name', 'height_cm', 'weight_kg', 'died_date']]

tall_players.head()

Out[ ]:

	athlete_id	name	height_cm	weight_kg	died_date
5583	5606	Paulinho Villas Boas	217.0	106.0	NaN
5673	5696	Gunther Behnke	221.0	114.0	NaN
5716	5739	Uwe Blab	218.0	110.0	NaN
5781	5804	Tommy Burleson	223.0	102.0	NaN
6223	6250	Lars Hansen	216.0	105.0	NaN

Find Tall Alive Players that Won a Gold Medal¶

In [ ]:

Copied!

tall_results = olympic_results.loc[olympic_results['athlete_id'].isin(tall_players['athlete_id'])][['athlete_id', 'medal']]
tall_results = olympic_results.loc[olympic_results['athlete_id'].isin(tall_players['athlete_id'])][['athlete_id', 'medal']]

In [ ]:

Copied!

tall_results = tall_results.loc[tall_results['medal'] == 'Gold']
tall_results = tall_results.loc[tall_results['medal'] == 'Gold']

In [ ]:

Copied!

# Persons' name with Height(above 215cm) and weight(above 100kg) and Not dead that won gold medal
# tall_players.loc[tall_players['athlete_id'].isin(tall_results['athlete_id'])]
tall_players[tall_players['athlete_id'].isin(tall_results['athlete_id'])]
# Persons' name with Height(above 215cm) and weight(above 100kg) and Not dead that won gold medal
# tall_players.loc[tall_players['athlete_id'].isin(tall_results['athlete_id'])]
tall_players[tall_players['athlete_id'].isin(tall_results['athlete_id'])]

Out[ ]:

	athlete_id	name	height_cm	weight_kg	died_date
6722	6755	Shaquille O'Neal	216.0	137.0	NaN
6937	6972	David Robinson	216.0	107.0	NaN
6978	7013	Arvydas Sabonis	223.0	122.0	NaN
118676	120415	Dmitry Musersky	219.0	104.0	NaN
123850	126093	Tyson Chandler	216.0	107.0	NaN

Str Filter¶

In [ ]:

Copied!

bios['NOC'] = bios['NOC'].replace(to_replace='Islamic Republic of Iran', value='Iran')
bios['born_country'] = bios['born_country'].replace(to_replace='IRI', value='IRAN')

bios[bios['NOC'] == 'Iran'].head(5)
bios['NOC'] = bios['NOC'].replace(to_replace='Islamic Republic of Iran', value='Iran')
bios['born_country'] = bios['born_country'].replace(to_replace='IRI', value='IRAN')

bios[bios['NOC'] == 'Iran'].head(5)

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date
783	787	Ibrahim Al-Idokht	1967-10-26	NaN	NaN	NaN	Iran	NaN	NaN	NaN
3886	3899	Ibrahim Afsharpour	1930-10-08	NaN	NaN	NaN	Iran	NaN	NaN	NaN
3887	3900	Emanoul Aghasi	1930-12-25	Salmas	Azarbaijan Gharbi	IRAN	Iran	NaN	NaN	2021-09-24
3888	3901	Nasser Aghaie	1944-01-20	Tehran	Tehran	IRAN	Iran	160.0	51.0	2009-12-27
3889	3902	Sadegh Ali Akbarzadeh Khoi	1932-09-03	NaN	NaN	NaN	Iran	163.0	54.0	NaN

In [ ]:

Copied!

bios[bios['name'].str.contains("reZA", case=False)].head()
bios[bios['name'].str.contains("reZA", case=False)].head()

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date
3917	3931	Mohamed Reza Samadi	1971-01-01	NaN	NaN	NaN	Iran	198.0	104.0	NaN
10255	10311	Itzel Reza	1979-05-29	NaN	NaN	NaN	Mexico	NaN	NaN	NaN
13398	13480	Mohammad Reza Bajool	1960-03-21	NaN	NaN	NaN	Iran	172.0	62.0	NaN
13415	13497	Mohamed Reza Banna	1971-05-05	NaN	NaN	NaN	Iran	178.0	69.0	NaN
25651	25843	Ali Reza Azizi	1949-08-23	Mashhad	Khorasan Razavi	IRAN	Iran	167.0	61.0	2021-08-07

In [ ]:

Copied!

bios[bios['name'].str.contains("reZA|gholam", case=False, regex=True)].head() # You can use regax
bios[bios['name'].str.contains("reZA|gholam", case=False, regex=True)].head() # You can use regax

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date
3917	3931	Mohamed Reza Samadi	1971-01-01	NaN	NaN	NaN	Iran	198.0	104.0	NaN
10255	10311	Itzel Reza	1979-05-29	NaN	NaN	NaN	Mexico	NaN	NaN	NaN
13398	13480	Mohammad Reza Bajool	1960-03-21	NaN	NaN	NaN	Iran	172.0	62.0	NaN
13415	13497	Mohamed Reza Banna	1971-05-05	NaN	NaN	NaN	Iran	178.0	69.0	NaN
15027	15128	Gholam Hossein Koohi	1951-01-10	NaN	NaN	NaN	Iran	172.0	68.0	NaN

In [ ]:

Copied!

bios.query('NOC == "Iran" and born_country == "IRAN"')
bios.query('NOC == "Iran" and born_country == "IRAN"')

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date
3887	3900	Emanoul Aghasi	1930-12-25	Salmas	Azarbaijan Gharbi	IRAN	Iran	NaN	NaN	2021-09-24
3888	3901	Nasser Aghaie	1944-01-20	Tehran	Tehran	IRAN	Iran	160.0	51.0	2009-12-27
3901	3914	Ayoub Pourtaghi	1973-01-01	Orumiyeh (Urmia)	Azarbaijan Gharbi	IRAN	Iran	182.0	81.0	2021-09-29
3905	3918	Vazik Ghazarian	1937-05-04	Bahmanshir	Khuzestan	IRAN	Iran	168.0	63.0	NaN
3907	3920	Iraj Kia Rostami	1963-07-07	Tehran	Tehran	IRAN	Iran	180.0	91.0	2015-08-06
...	...	...	...	...	...	...	...	...	...	...
140331	143837	Morteza Ghiasi	1995-04-07	Dorud	Lorestan	IRAN	Iran	NaN	NaN	NaN
140332	143838	Mostafa Hosseinkhani	1989-03-27	Tehran	Tehran	IRAN	Iran	NaN	NaN	NaN
140333	143839	Reza Atri	1994-08-08	Babol	Mazandaran	IRAN	Iran	NaN	NaN	NaN
144049	147714	Mohammad Fotouhi	1990-12-05	Yazd	Yazd	IRAN	Iran	NaN	NaN	NaN
145469	149196	Danyal Saveh Shemshaki	1997-07-02	Tehran	Tehran	IRAN	Iran	NaN	NaN	NaN

326 rows × 10 columns

Adding and Remove Columns¶

In [ ]:

Copied!

coffee.head()
coffee.head()

Out[ ]:

	Day	Coffee Type	Units Sold
0	Monday	Espresso	25
1	Monday	Latte	20
2	Tuesday	Espresso	35
3	Tuesday	Latte	20
4	Wednesday	Espresso	35

In [ ]:

Copied!

# coffee["Price"] = 5.99
coffee['Price'] = np.where(coffee['Coffee Type'] == 'Espresso', 3.99, 5.99)
# coffee["Price"] = 5.99
coffee['Price'] = np.where(coffee['Coffee Type'] == 'Espresso', 3.99, 5.99)

In [ ]:

Copied!

coffee['Revenue'] = coffee['Units Sold'] * coffee['Price']
coffee.head()
coffee['Revenue'] = coffee['Units Sold'] * coffee['Price']
coffee.head()

Out[ ]:

	Day	Coffee Type	Units Sold	Price	Revenue
0	Monday	Espresso	25	3.99	99.75
1	Monday	Latte	20	5.99	119.80
2	Tuesday	Espresso	35	3.99	139.65
3	Tuesday	Latte	20	5.99	119.80
4	Wednesday	Espresso	35	3.99	139.65

In [ ]:

Copied!

coffee.rename(columns={'Price':'Coffee Price'})
coffee.rename(columns={'Price':'Coffee Price'})

Out[ ]:

	Day	Coffee Type	Units Sold	Coffee Price	Revenue
0	Monday	Espresso	25	3.99	99.75
1	Monday	Latte	20	5.99	119.80
2	Tuesday	Espresso	35	3.99	139.65
3	Tuesday	Latte	20	5.99	119.80
4	Wednesday	Espresso	35	3.99	139.65
5	Wednesday	Latte	25	5.99	149.75
6	Thursday	Espresso	40	3.99	159.60
7	Thursday	Latte	35	5.99	209.65
8	Friday	Espresso	45	3.99	179.55
9	Friday	Latte	35	5.99	209.65
10	Saturday	Espresso	45	3.99	179.55
11	Saturday	Latte	35	5.99	209.65
12	Sunday	Espresso	45	3.99	179.55
13	Sunday	Latte	35	5.99	209.65

In [ ]:

Copied!

# coffee = coffee.drop(['Price'], axis=1) # drop column
# coffee = coffee.drop(0, axis=0) # drop row
# coffee = coffee.drop(['Price'], axis=1) # drop column
# coffee = coffee.drop(0, axis=0) # drop row

DateTime¶

In [ ]:

Copied!

bios_temp = bios.copy()
bios_temp[bios_temp['name'].str.contains("Reza", case=False)].head()
bios_temp = bios.copy()
bios_temp[bios_temp['name'].str.contains("Reza", case=False)].head()

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date
3917	3931	Mohamed Reza Samadi	1971-01-01	NaN	NaN	NaN	Iran	198.0	104.0	NaN
10255	10311	Itzel Reza	1979-05-29	NaN	NaN	NaN	Mexico	NaN	NaN	NaN
13398	13480	Mohammad Reza Bajool	1960-03-21	NaN	NaN	NaN	Iran	172.0	62.0	NaN
13415	13497	Mohamed Reza Banna	1971-05-05	NaN	NaN	NaN	Iran	178.0	69.0	NaN
25651	25843	Ali Reza Azizi	1949-08-23	Mashhad	Khorasan Razavi	IRAN	Iran	167.0	61.0	2021-08-07

In [ ]:

Copied!

# convert born data to datetime
bios_temp['born_date_dt'] = pd.to_datetime(bios_temp['born_date'], format="%Y-%m-%d")
# convert born data to datetime
bios_temp['born_date_dt'] = pd.to_datetime(bios_temp['born_date'], format="%Y-%m-%d")

In [ ]:

Copied!

bios_temp.info()
bios_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145500 entries, 0 to 145499
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   athlete_id    145500 non-null  int64         
 1   name          145500 non-null  object        
 2   born_date     143693 non-null  object        
 3   born_city     110908 non-null  object        
 4   born_region   110908 non-null  object        
 5   born_country  110908 non-null  object        
 6   NOC           145499 non-null  object        
 7   height_cm     106651 non-null  float64       
 8   weight_kg     102070 non-null  float64       
 9   died_date     33940 non-null   object        
 10  born_date_dt  143693 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(1), object(7)
memory usage: 12.2+ MB

In [ ]:

Copied!





# Now extract month and year from datetime column
bios_temp['born_year'] = bios_temp['born_date_dt'].dt.year
bios_temp['born_day_of_week'] = bios_temp['born_date_dt'].dt.day_of_week
bios_temp['is_leap'] = bios_temp['born_date_dt'].dt.is_leap_year

bios_temp.head()
# Now extract month and year from datetime column
bios_temp['born_year'] = bios_temp['born_date_dt'].dt.year
bios_temp['born_day_of_week'] = bios_temp['born_date_dt'].dt.day_of_week
bios_temp['is_leap'] = bios_temp['born_date_dt'].dt.is_leap_year

bios_temp.head()

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date	born_date_dt	born_year	born_day_of_week	is_leap
0	1	Jean-François Blanchy	1886-12-12	Bordeaux	Gironde	FRA	France	NaN	NaN	1960-10-02	1886-12-12	1886.0	6.0	False
1	2	Arnaud Boetsch	1969-04-01	Meulan	Yvelines	FRA	France	183.0	76.0	NaN	1969-04-01	1969.0	1.0	False
2	3	Jean Borotra	1898-08-13	Biarritz	Pyrénées-Atlantiques	FRA	France	183.0	76.0	1994-07-17	1898-08-13	1898.0	5.0	False
3	4	Jacques Brugnon	1895-05-11	Paris VIIIe	Paris	FRA	France	168.0	64.0	1978-03-20	1895-05-11	1895.0	5.0	False
4	5	Albert Canet	1878-04-17	Wandsworth	England	GBR	France	NaN	NaN	1930-07-25	1878-04-17	1878.0	2.0	False

Categories Column¶

In [ ]:

Copied!





def cat_height(h: float):
  if h < 150:
    return 'Short'
  if h > 180:
    return 'Tall'

  return 'Normal' if not np.isnan(h) else 'Not Defined'
def cat_height(h: float):
  if h < 150:
    return 'Short'
  if h > 180:
    return 'Tall'

  return 'Normal' if not np.isnan(h) else 'Not Defined'

In [ ]:

Copied!

# bios_temp['height_category'] = bios_temp['height_cm'].apply(cat_height)
bios_temp['height_category'] = bios_temp['height_cm'].apply(lambda h: "Short" if h < 150 else ("Tall" if h > 180 else ('Normal' if not np.isnan(h) else 'Not Defined')))
# bios_temp['height_category'] = bios_temp['height_cm'].apply(cat_height)
bios_temp['height_category'] = bios_temp['height_cm'].apply(lambda h: "Short" if h < 150 else ("Tall" if h > 180 else ('Normal' if not np.isnan(h) else 'Not Defined')))

In [ ]:

Copied!

bios_temp.head()
bios_temp.head()

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date	born_date_dt	born_year	born_day_of_week	is_leap	height_category
0	1	Jean-François Blanchy	1886-12-12	Bordeaux	Gironde	FRA	France	NaN	NaN	1960-10-02	1886-12-12	1886.0	6.0	False	Not Defined
1	2	Arnaud Boetsch	1969-04-01	Meulan	Yvelines	FRA	France	183.0	76.0	NaN	1969-04-01	1969.0	1.0	False	Tall
2	3	Jean Borotra	1898-08-13	Biarritz	Pyrénées-Atlantiques	FRA	France	183.0	76.0	1994-07-17	1898-08-13	1898.0	5.0	False	Tall
3	4	Jacques Brugnon	1895-05-11	Paris VIIIe	Paris	FRA	France	168.0	64.0	1978-03-20	1895-05-11	1895.0	5.0	False	Normal
4	5	Albert Canet	1878-04-17	Wandsworth	England	GBR	France	NaN	NaN	1930-07-25	1878-04-17	1878.0	2.0	False	Not Defined

Merge¶

In [ ]:

Copied!

noc = pd.read_csv("https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/refs/heads/master/data/noc_regions.csv")
noc.head()
noc = pd.read_csv("https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/refs/heads/master/data/noc_regions.csv")
noc.head()

Out[ ]:

	NOC	region	notes
0	AFG	Afghanistan	NaN
1	AHO	Curacao	Netherlands Antilles
2	ALB	Albania	NaN
3	ALG	Algeria	NaN
4	AND	Andorra	NaN

Get region base on NOC and merge it to bios¶

In [ ]:

Copied!

pd.merge(bios, noc.drop(["notes"], axis=1), left_on='born_country', right_on='NOC', how='left')
pd.merge(bios, noc.drop(["notes"], axis=1), left_on='born_country', right_on='NOC', how='left')

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC_x	height_cm	weight_kg	died_date	NOC_y	region
0	1	Jean-François Blanchy	1886-12-12	Bordeaux	Gironde	FRA	France	NaN	NaN	1960-10-02	FRA	France
1	2	Arnaud Boetsch	1969-04-01	Meulan	Yvelines	FRA	France	183.0	76.0	NaN	FRA	France
2	3	Jean Borotra	1898-08-13	Biarritz	Pyrénées-Atlantiques	FRA	France	183.0	76.0	1994-07-17	FRA	France
3	4	Jacques Brugnon	1895-05-11	Paris VIIIe	Paris	FRA	France	168.0	64.0	1978-03-20	FRA	France
4	5	Albert Canet	1878-04-17	Wandsworth	England	GBR	France	NaN	NaN	1930-07-25	GBR	UK
...	...	...	...	...	...	...	...	...	...	...	...	...
145495	149222	Polina Luchnikova	2002-01-30	Serov	Sverdlovsk	RUS	ROC	167.0	61.0	NaN	RUS	Russia
145496	149223	Valeriya Merkusheva	1999-09-20	Moskva (Moscow)	Moskva	RUS	ROC	168.0	65.0	NaN	RUS	Russia
145497	149224	Yuliya Smirnova	1998-05-08	Kotlas	Arkhangelsk	RUS	ROC	163.0	55.0	NaN	RUS	Russia
145498	149225	André Foussard	1899-05-19	Niort	Deux-Sèvres	FRA	France	166.0	NaN	1986-03-18	FRA	France
145499	149814	Bill Phillips	1913-07-15	Dulwich Hill	New South Wales	AUS	Australia	NaN	NaN	2003-10-20	AUS	Australia

145500 rows × 12 columns

Merge bios with Olympic¶

In [ ]:

Copied!

pd.merge(left=olympic_results, right=bios, left_on='athlete_id', right_on='athlete_id', how='inner')
pd.merge(left=olympic_results, right=bios, left_on='athlete_id', right_on='athlete_id', how='inner')

Out[ ]:

	year	type	discipline	event	as	athlete_id	noc	team	place	tied	medal	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date
0	1912.0	Summer	Tennis	Singles, Men (Olympic)	Jean-François Blanchy	1	FRA	None	17.0	True	None	Jean-François Blanchy	1886-12-12	Bordeaux	Gironde	FRA	France	NaN	NaN	1960-10-02
1	1912.0	Summer	Tennis	Doubles, Men (Olympic)	Jean-François Blanchy	1	FRA	Jean Montariol	NaN	False	None	Jean-François Blanchy	1886-12-12	Bordeaux	Gironde	FRA	France	NaN	NaN	1960-10-02
2	1920.0	Summer	Tennis	Singles, Men (Olympic)	Jean-François Blanchy	1	FRA	None	32.0	True	None	Jean-François Blanchy	1886-12-12	Bordeaux	Gironde	FRA	France	NaN	NaN	1960-10-02
3	1920.0	Summer	Tennis	Doubles, Mixed (Olympic)	Jean-François Blanchy	1	FRA	Jeanne Vaussard	8.0	True	None	Jean-François Blanchy	1886-12-12	Bordeaux	Gironde	FRA	France	NaN	NaN	1960-10-02
4	1920.0	Summer	Tennis	Doubles, Men (Olympic)	Jean-François Blanchy	1	FRA	Jacques Brugnon	4.0	False	None	Jean-François Blanchy	1886-12-12	Bordeaux	Gironde	FRA	France	NaN	NaN	1960-10-02
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
308403	2022.0	Winter	Luge	Singles, Men (Olympic)	Marián Skupek	148983	SVK	None	26.0	False	None	Marián Skupek	2001-07-12	Gelnica	Košice	SVK	Slovakia	196.0	108.0	NaN
308404	2022.0	Winter	Alpine Skiing (Skiing)	Slalom, Women (Olympic)	Elsa Fermbäck	148984	SWE	None	28.0	False	None	Elsa Fermbäck	1998-03-28	Vemdalen	Jämtland	SWE	Sweden	NaN	NaN	NaN
308405	2022.0	Winter	Alpine Skiing (Skiing)	Team, Mixed (Olympic)	Hilma Lövblom	148985	SWE	Sweden	13.0	False	None	Hilma Lövblom	2000-08-16	Täby	Stockholm	SWE	Sweden	NaN	NaN	NaN
308406	2022.0	Winter	Alpine Skiing (Skiing)	Giant Slalom, Women (Olympic)	Hilma Lövblom	148985	SWE	None	NaN	False	None	Hilma Lövblom	2000-08-16	Täby	Stockholm	SWE	Sweden	NaN	NaN	NaN
308407	2022.0	Winter	None	Slalom, Women (Olympic)	Charlotta Säfvenberg	148986	None	None	24.0	False	None	Charlotta Säfvenberg	1994-10-07	Kungsbacka	Halland	SWE	Sweden	NaN	NaN	NaN

308408 rows × 20 columns

Concatenate¶

In [ ]:

Copied!

iran = bios[bios['born_country'] == "IRAN"].copy()
usa = bios[bios['born_country'] == "USA"].copy()
iran = bios[bios['born_country'] == "IRAN"].copy()
usa = bios[bios['born_country'] == "USA"].copy()

In [ ]:

Copied!

iran.head()
iran.head()

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date
3887	3900	Emanoul Aghasi	1930-12-25	Salmas	Azarbaijan Gharbi	IRAN	Iran	NaN	NaN	2021-09-24
3888	3901	Nasser Aghaie	1944-01-20	Tehran	Tehran	IRAN	Iran	160.0	51.0	2009-12-27
3901	3914	Ayoub Pourtaghi	1973-01-01	Orumiyeh (Urmia)	Azarbaijan Gharbi	IRAN	Iran	182.0	81.0	2021-09-29
3905	3918	Vazik Ghazarian	1937-05-04	Bahmanshir	Khuzestan	IRAN	Iran	168.0	63.0	NaN
3907	3920	Iraj Kia Rostami	1963-07-07	Tehran	Tehran	IRAN	Iran	180.0	91.0	2015-08-06

In [ ]:

Copied!

usa.head()
usa.head()

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date
54	55	Monique Javer	1967-07-22	Burlingame	California	USA	Great Britain	177.0	64.0	NaN
960	964	Xóchitl Escobedo	1968-09-17	West Covina	California	USA	Mexico	170.0	60.0	NaN
961	965	Angélica Gavaldón	1973-10-03	El Centro	California	USA	Mexico	160.0	54.0	NaN
1231	1238	Bert Schneider	1897-07-01	Cleveland	Ohio	USA	Canada	NaN	NaN	1986-02-20
1345	1352	Laura Berg	1975-01-06	Santa Fe Springs	California	USA	United States	168.0	61.0	NaN

In [ ]:

Copied!

pd.concat([iran, usa])
pd.concat([iran, usa])

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date
3887	3900	Emanoul Aghasi	1930-12-25	Salmas	Azarbaijan Gharbi	IRAN	Iran	NaN	NaN	2021-09-24
3888	3901	Nasser Aghaie	1944-01-20	Tehran	Tehran	IRAN	Iran	160.0	51.0	2009-12-27
3901	3914	Ayoub Pourtaghi	1973-01-01	Orumiyeh (Urmia)	Azarbaijan Gharbi	IRAN	Iran	182.0	81.0	2021-09-29
3905	3918	Vazik Ghazarian	1937-05-04	Bahmanshir	Khuzestan	IRAN	Iran	168.0	63.0	NaN
3907	3920	Iraj Kia Rostami	1963-07-07	Tehran	Tehran	IRAN	Iran	180.0	91.0	2015-08-06
...	...	...	...	...	...	...	...	...	...	...
145445	149168	Kristen Santos	1994-11-02	Fairfield	Connecticut	USA	United States	NaN	NaN	NaN
145446	149169	Corinne Stoddard	2001-08-15	Seattle	Washington	USA	United States	NaN	NaN	NaN
145454	149180	Anna Hoffmann	2000-03-28	Madison	Wisconsin	USA	United States	NaN	NaN	NaN
145457	149183	Alix Wilkinson	2000-08-02	Mammoth Lakes	California	USA	United States	NaN	NaN	NaN
145468	149195	Justin Abdelkader	1987-02-25	Muskegon	Michigan	USA	United States	187.0	97.0	NaN

9993 rows × 10 columns

In [ ]:

Copied!

pd.concat([iran.drop(['name'], axis=1), usa])
pd.concat([iran.drop(['name'], axis=1), usa])

Out[ ]:

	athlete_id	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date	name
3887	3900	1930-12-25	Salmas	Azarbaijan Gharbi	IRAN	Iran	NaN	NaN	2021-09-24	NaN
3888	3901	1944-01-20	Tehran	Tehran	IRAN	Iran	160.0	51.0	2009-12-27	NaN
3901	3914	1973-01-01	Orumiyeh (Urmia)	Azarbaijan Gharbi	IRAN	Iran	182.0	81.0	2021-09-29	NaN
3905	3918	1937-05-04	Bahmanshir	Khuzestan	IRAN	Iran	168.0	63.0	NaN	NaN
3907	3920	1963-07-07	Tehran	Tehran	IRAN	Iran	180.0	91.0	2015-08-06	NaN
...	...	...	...	...	...	...	...	...	...	...
145445	149168	1994-11-02	Fairfield	Connecticut	USA	United States	NaN	NaN	NaN	Kristen Santos
145446	149169	2001-08-15	Seattle	Washington	USA	United States	NaN	NaN	NaN	Corinne Stoddard
145454	149180	2000-03-28	Madison	Wisconsin	USA	United States	NaN	NaN	NaN	Anna Hoffmann
145457	149183	2000-08-02	Mammoth Lakes	California	USA	United States	NaN	NaN	NaN	Alix Wilkinson
145468	149195	1987-02-25	Muskegon	Michigan	USA	United States	187.0	97.0	NaN	Justin Abdelkader

9993 rows × 10 columns

In [ ]:

Copied!

pd.concat([iran, usa.drop(['name'], axis=1)])
pd.concat([iran, usa.drop(['name'], axis=1)])

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date
3887	3900	Emanoul Aghasi	1930-12-25	Salmas	Azarbaijan Gharbi	IRAN	Iran	NaN	NaN	2021-09-24
3888	3901	Nasser Aghaie	1944-01-20	Tehran	Tehran	IRAN	Iran	160.0	51.0	2009-12-27
3901	3914	Ayoub Pourtaghi	1973-01-01	Orumiyeh (Urmia)	Azarbaijan Gharbi	IRAN	Iran	182.0	81.0	2021-09-29
3905	3918	Vazik Ghazarian	1937-05-04	Bahmanshir	Khuzestan	IRAN	Iran	168.0	63.0	NaN
3907	3920	Iraj Kia Rostami	1963-07-07	Tehran	Tehran	IRAN	Iran	180.0	91.0	2015-08-06
...	...	...	...	...	...	...	...	...	...	...
145445	149168	NaN	1994-11-02	Fairfield	Connecticut	USA	United States	NaN	NaN	NaN
145446	149169	NaN	2001-08-15	Seattle	Washington	USA	United States	NaN	NaN	NaN
145454	149180	NaN	2000-03-28	Madison	Wisconsin	USA	United States	NaN	NaN	NaN
145457	149183	NaN	2000-08-02	Mammoth Lakes	California	USA	United States	NaN	NaN	NaN
145468	149195	NaN	1987-02-25	Muskegon	Michigan	USA	United States	187.0	97.0	NaN

9993 rows × 10 columns

Handle Null Values¶

Load and null some values¶

In [ ]:

Copied!

coffee_temp = coffee.copy()
coffee_temp = coffee.copy()

In [ ]:

Copied!

coffee_temp.loc[:9:2, 'Units Sold'] = np.nan
coffee_temp.loc[:9:2, 'Units Sold'] = np.nan

In [ ]:

Copied!

coffee_temp.head()
coffee_temp.head()

Out[ ]:

	Day	Coffee Type	Units Sold	Price	Revenue
0	Monday	Espresso	NaN	3.99	99.75
1	Monday	Latte	20.0	5.99	119.80
2	Tuesday	Espresso	NaN	3.99	139.65
3	Tuesday	Latte	20.0	5.99	119.80
4	Wednesday	Espresso	NaN	3.99	139.65

Find NaNs¶

In [ ]:

Copied!

coffee_temp.info()
coffee_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Day          14 non-null     object 
 1   Coffee Type  14 non-null     object 
 2   Units Sold   9 non-null      float64
 3   Price        14 non-null     float64
 4   Revenue      14 non-null     float64
dtypes: float64(3), object(2)
memory usage: 692.0+ bytes

In [ ]:

Copied!

coffee_temp.isna().sum()
coffee_temp.isna().sum()

Out[ ]:

	0
Day	0
Coffee Type	0
Units Sold	5
Price	0
Revenue	0

dtype: int64

In [ ]:

Copied!

coffee_temp[coffee_temp['Units Sold'].isna()]
# coffee_temp[coffee_temp['Units Sold'].notna()] # not nan units sold
coffee_temp[coffee_temp['Units Sold'].isna()]
# coffee_temp[coffee_temp['Units Sold'].notna()] # not nan units sold

Out[ ]:

	Day	Coffee Type	Units Sold	Price	Revenue
0	Monday	Espresso	NaN	3.99	99.75
2	Tuesday	Espresso	NaN	3.99	139.65
4	Wednesday	Espresso	NaN	3.99	139.65
6	Thursday	Espresso	NaN	3.99	159.60
8	Friday	Espresso	NaN	3.99	179.55

Handle NaNs¶

Fill¶

In [ ]:

Copied!





# You can use interpolate istead of mean that fill values base on the neighbors
coffee_temp['Units Sold'] = coffee_temp['Units Sold'].fillna(coffee_temp['Units Sold'].mean())
coffee_temp['Units Sold'] = coffee_temp['Units Sold'].astype(np.int32)
coffee_temp.head()
# You can use interpolate istead of mean that fill values base on the neighbors
coffee_temp['Units Sold'] = coffee_temp['Units Sold'].fillna(coffee_temp['Units Sold'].mean())
coffee_temp['Units Sold'] = coffee_temp['Units Sold'].astype(np.int32)
coffee_temp.head()

Out[ ]:

	Day	Coffee Type	Units Sold	Price	Revenue
0	Monday	Espresso	32	3.99	99.75
1	Monday	Latte	20	5.99	119.80
2	Tuesday	Espresso	32	3.99	139.65
3	Tuesday	Latte	20	5.99	119.80
4	Wednesday	Espresso	32	3.99	139.65

Drop¶

In [ ]:

Copied!

coffee_temp.loc[:9:2, 'Units Sold'] = np.nan
coffee_temp.loc[:9:2, 'Units Sold'] = np.nan

In [ ]:

Copied!

coffee_temp = coffee_temp.dropna()
# coffee_temp.dropna(subset=['Price'], inplace=True) # drop rows that doesn't have price
coffee_temp.head()
coffee_temp = coffee_temp.dropna()
# coffee_temp.dropna(subset=['Price'], inplace=True) # drop rows that doesn't have price
coffee_temp.head()

Out[ ]:

	Day	Coffee Type	Units Sold	Price	Revenue
1	Monday	Latte	20.0	5.99	119.80
3	Tuesday	Latte	20.0	5.99	119.80
5	Wednesday	Latte	25.0	5.99	149.75
7	Thursday	Latte	35.0	5.99	209.65
9	Friday	Latte	35.0	5.99	209.65

Aggregating Data¶

In [ ]:

Copied!





bios['NOC'] = bios['NOC'].replace(to_replace='Islamic Republic of Iran', value='Iran')
bios['born_country'] = bios['born_country'].replace(to_replace='IRI', value='IRAN')
bios_agg = bios.copy()
bios_agg.head()
bios['NOC'] = bios['NOC'].replace(to_replace='Islamic Republic of Iran', value='Iran')
bios['born_country'] = bios['born_country'].replace(to_replace='IRI', value='IRAN')
bios_agg = bios.copy()
bios_agg.head()

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date
0	1	Jean-François Blanchy	1886-12-12	Bordeaux	Gironde	FRA	France	NaN	NaN	1960-10-02
1	2	Arnaud Boetsch	1969-04-01	Meulan	Yvelines	FRA	France	183.0	76.0	NaN
2	3	Jean Borotra	1898-08-13	Biarritz	Pyrénées-Atlantiques	FRA	France	183.0	76.0	1994-07-17
3	4	Jacques Brugnon	1895-05-11	Paris VIIIe	Paris	FRA	France	168.0	64.0	1978-03-20
4	5	Albert Canet	1878-04-17	Wandsworth	England	GBR	France	NaN	NaN	1930-07-25

Count¶

In [ ]:

Copied!

bios_agg[bios_agg["NOC"] == "Iran"].value_counts(subset=["NOC", "born_region"], dropna=False)
bios_agg[bios_agg["NOC"] == "Iran"].value_counts(subset=["NOC", "born_region"], dropna=False)

Out[ ]:

		count
NOC	born_region
Iran	NaN	240
	Tehran	108
	Mazandaran	35
	Isfahan	21
	Khuzestan	20
	Azarbaijan Gharbi	15
	Khorasan Razavi	15
	Gilan	12
	Azarbaijan Sharqi	10
	Kermanshah	9
	Lorestan	9
	Fars	9
	Chaharmahal-o-Bakhtiyari	8
	Hamadan	8
	Golestan	7
	Ardabil	6
	Qazvin	5
	Markazi	4
	Alborz	4
	Kerman	4
	Ilam	4
	Zanjan	3
	Kurdistan	3
	Khorasan Jonubi	2
	Yazd	2
	England	2
	Baghdad	1
	Bakı	1
	Genève	1
	California	1
	Bushehr	1
	Moskva	1
	Khorasan Shomali	1
	Région de Bruxelles-Capitale	1
	Rheinland-Pfalz	1
	Qom	1
	Vitebsk	1

dtype: int64

Group By¶

In [ ]:

Copied!

# mean of height and weight on each NOC
# bios_agg.groupby(by=['born_country'])['height_cm'].mean()
bios_agg.groupby(by=['NOC']).agg({'height_cm': 'mean', 'weight_kg': 'mean'})
# mean of height and weight on each NOC
# bios_agg.groupby(by=['born_country'])['height_cm'].mean()
bios_agg.groupby(by=['NOC']).agg({'height_cm': 'mean', 'weight_kg': 'mean'})

Out[ ]:

	height_cm	weight_kg
NOC
Afghanistan	170.170732	67.076923
Albania	174.200000	74.285714
Albania Australia	166.000000	65.000000
Albania Bulgaria	189.000000	100.000000
Albania Greece	165.000000	64.000000
...	...	...
West Indies Federation	176.000000	71.500000
Yemen	169.391304	60.476190
Yugoslavia	180.035980	76.444444
Zambia	169.852174	64.808511
Zimbabwe	173.335196	68.089888

696 rows × 2 columns

Pivot¶

In [ ]:

Copied!

coffee
coffee

Out[ ]:

	Day	Coffee Type	Units Sold	Price	Revenue
0	Monday	Espresso	25	3.99	99.75
1	Monday	Latte	20	5.99	119.80
2	Tuesday	Espresso	35	3.99	139.65
3	Tuesday	Latte	20	5.99	119.80
4	Wednesday	Espresso	35	3.99	139.65
5	Wednesday	Latte	25	5.99	149.75
6	Thursday	Espresso	40	3.99	159.60
7	Thursday	Latte	35	5.99	209.65
8	Friday	Espresso	45	3.99	179.55
9	Friday	Latte	35	5.99	209.65
10	Saturday	Espresso	45	3.99	179.55
11	Saturday	Latte	35	5.99	209.65
12	Sunday	Espresso	45	3.99	179.55
13	Sunday	Latte	35	5.99	209.65

In [ ]:

Copied!

pivot = coffee.pivot(index = "Day", columns="Coffee Type", values="Units Sold")
pivot = coffee.pivot(index = "Day", columns="Coffee Type", values="Units Sold")

In [ ]:

Copied!

pivot
pivot

Out[ ]:

Coffee Type	Espresso	Latte
Day
Friday	45	35
Monday	25	20
Saturday	45	35
Sunday	45	35
Thursday	40	35
Tuesday	35	20
Wednesday	35	25

In [ ]:

Copied!

pivot.sum()
pivot.sum()

Out[ ]:

	0
Coffee Type
Espresso	270
Latte	205

dtype: int64

In [ ]:

Copied!

pivot.sum(axis=1)
pivot.sum(axis=1)

Out[ ]:

	0
Day
Friday	80
Monday	45
Saturday	80
Sunday	80
Thursday	75
Tuesday	55
Wednesday	60

dtype: int64

In [ ]:

Copied!

pivot.loc["Monday"]
pivot.loc["Monday"]

Out[ ]:

	Monday
Coffee Type
Espresso	25
Latte	20

dtype: int64

Advanced Exmple¶

Group by born_date

In [ ]:

Copied!

bios_agg.dropna(subset=["born_date"], inplace=True) # drop if born_date is nan
bios_agg.dropna(subset=["born_date"], inplace=True) # drop if born_date is nan

In [ ]:

Copied!

bios_agg["born_date_dt"] = pd.to_datetime(bios_agg["born_date"])
bios_agg["born_date_dt"] = pd.to_datetime(bios_agg["born_date"])

In [ ]:

Copied!

bios_agg.head(2)
bios_agg.head(2)

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date	born_date_dt
0	1	Jean-François Blanchy	1886-12-12	Bordeaux	Gironde	FRA	France	NaN	NaN	1960-10-02	1886-12-12
1	2	Arnaud Boetsch	1969-04-01	Meulan	Yvelines	FRA	France	183.0	76.0	NaN	1969-04-01

In [ ]:

Copied!

bios_agg.groupby(by=bios_agg["born_date_dt"].dt.year).agg({"height_cm": "mean", "name": "count"}).sort_values(by="name", ascending=False).reset_index()
bios_agg.groupby(by=bios_agg["born_date_dt"].dt.year).agg({"height_cm": "mean", "name": "count"}).sort_values(by="name", ascending=False).reset_index()

Out[ ]:

	born_date_dt	height_cm	name
0	1972	176.674755	2231
1	1985	176.445522	2227
2	1973	177.127033	2216
3	1971	176.950644	2205
4	1970	177.203734	2174
...	...	...	...
172	1837	NaN	1
173	1833	NaN	1
174	1839	NaN	1
175	1845	NaN	1
176	2009	NaN	1

177 rows × 3 columns

Advance Functionalities¶

Shift¶

In [ ]:

Copied!

coffee['yesterday_revenue'] = coffee["Revenue"].shift(2)
coffee['yesterday_revenue'] = coffee["Revenue"].shift(2)

In [ ]:

Copied!

coffee.head()
coffee.head()

Out[ ]:

	Day	Coffee Type	Units Sold	Price	Revenue	yesterday_revenue
0	Monday	Espresso	25	3.99	99.75	NaN
1	Monday	Latte	20	5.99	119.80	NaN
2	Tuesday	Espresso	35	3.99	139.65	99.75
3	Tuesday	Latte	20	5.99	119.80	119.80
4	Wednesday	Espresso	35	3.99	139.65	139.65

Rank¶

compare with other samples

In [ ]:

Copied!

bios_agg.head()
bios_agg.head()

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date	born_date_dt
0	1	Jean-François Blanchy	1886-12-12	Bordeaux	Gironde	FRA	France	NaN	NaN	1960-10-02	1886-12-12
1	2	Arnaud Boetsch	1969-04-01	Meulan	Yvelines	FRA	France	183.0	76.0	NaN	1969-04-01
2	3	Jean Borotra	1898-08-13	Biarritz	Pyrénées-Atlantiques	FRA	France	183.0	76.0	1994-07-17	1898-08-13
3	4	Jacques Brugnon	1895-05-11	Paris VIIIe	Paris	FRA	France	168.0	64.0	1978-03-20	1895-05-11
4	5	Albert Canet	1878-04-17	Wandsworth	England	GBR	France	NaN	NaN	1930-07-25	1878-04-17

In [ ]:

Copied!

bios_agg["weight_rank"] = bios_agg["weight_kg"].rank()
bios_agg["weight_rank"] = bios_agg["weight_kg"].rank()

In [ ]:

Copied!

bios_agg.sort_values(by=["weight_rank"], ascending=False)
bios_agg.sort_values(by=["weight_rank"], ascending=False)

Out[ ]:

	athlete_id	name	born_date	born_city	born_region	born_country	NOC	height_cm	weight_kg	died_date	born_date_dt	weight_rank
105715	106771	Aythami Ruano	1977-06-18	Las Palmas de Gran Canaria	Las Palmas	ESP	Spain	200.0	198.0	NaN	1977-06-18	101977.0
59204	59630	Marek Galiński	1951-05-13	Wrocław	Dolnośląskie	POL	Poland	200.0	190.0	1999-09-28	1951-05-13	101976.0
60344	60777	Chris Taylor	1950-06-13	Dowagiac	Michigan	USA	United States	196.0	182.0	1979-06-30	1950-06-13	101975.0
91712	92442	Valentyn Rusliakov	1972-02-03	NaN	NaN	NaN	Ukraine	187.0	180.0	NaN	1972-02-03	101974.0
105722	106778	Leonel Wilfredo Ruiz	1975-07-27	NaN	NaN	NaN	Venezuela	187.0	178.0	NaN	1975-07-27	101973.0
...	...	...	...	...	...	...	...	...	...	...	...	...
145492	149219	Carlos García-Ordóñez	1927-04-24	La Habana (Havana)	Ciudad de La Habana	CUB	Cuba	NaN	NaN	2019-11-24	1927-04-24	NaN
145493	149220	Landysh Falyakhova	1998-08-31	Dva Polya Artash	Respublika Tatarstan	RUS	ROC	NaN	NaN	NaN	1998-08-31	NaN
145494	149221	George Wright	1890-07-21	Toronto	Ontario	CAN	Canada	183.0	NaN	1973-01-28	1890-07-21	NaN
145498	149225	André Foussard	1899-05-19	Niort	Deux-Sèvres	FRA	France	166.0	NaN	1986-03-18	1899-05-19	NaN
145499	149814	Bill Phillips	1913-07-15	Dulwich Hill	New South Wales	AUS	Australia	NaN	NaN	2003-10-20	1913-07-15	NaN

143693 rows × 12 columns

Rolling¶

In [ ]:

Copied!

# Add previous rows to current by cumsum
coffee["cumsum_Revenue"] = coffee["Revenue"].cumsum()
coffee.head()
# Add previous rows to current by cumsum
coffee["cumsum_Revenue"] = coffee["Revenue"].cumsum()
coffee.head()

Out[ ]:

	Day	Coffee Type	Units Sold	Price	Revenue	yesterday_revenue	cumsum_Revenue
0	Monday	Espresso	25	3.99	99.75	NaN	99.75
1	Monday	Latte	20	5.99	119.80	NaN	219.55
2	Tuesday	Espresso	35	3.99	139.65	99.75	359.20
3	Tuesday	Latte	20	5.99	119.80	119.80	479.00
4	Wednesday	Espresso	35	3.99	139.65	139.65	618.65

In [ ]:

Copied!

# Add previous rows to current by cumsum
coffee["3_days_Revenue"] = coffee["Revenue"].rolling(6).sum() # each 3 days
coffee.head(12)
# Add previous rows to current by cumsum
coffee["3_days_Revenue"] = coffee["Revenue"].rolling(6).sum() # each 3 days
coffee.head(12)

Out[ ]:

	Day	Coffee Type	Units Sold	Price	Revenue	yesterday_revenue	cumsum_Revenue	week_Revenue	3_days_Revenue
0	Monday	Espresso	25	3.99	99.75	NaN	99.75	NaN	NaN
1	Monday	Latte	20	5.99	119.80	NaN	219.55	NaN	NaN
2	Tuesday	Espresso	35	3.99	139.65	99.75	359.20	NaN	NaN
3	Tuesday	Latte	20	5.99	119.80	119.80	479.00	NaN	NaN
4	Wednesday	Espresso	35	3.99	139.65	139.65	618.65	NaN	NaN
5	Wednesday	Latte	25	5.99	149.75	119.80	768.40	NaN	768.40
6	Thursday	Espresso	40	3.99	159.60	139.65	928.00	NaN	828.25
7	Thursday	Latte	35	5.99	209.65	149.75	1137.65	NaN	918.10
8	Friday	Espresso	45	3.99	179.55	159.60	1317.20	NaN	958.00
9	Friday	Latte	35	5.99	209.65	209.65	1526.85	NaN	1047.85
10	Saturday	Espresso	45	3.99	179.55	179.55	1706.40	NaN	1087.75
11	Saturday	Latte	35	5.99	209.65	209.65	1916.05	NaN	1147.65

New Functionalities¶

In [ ]:

Copied!

pd.__version__
pd.__version__

Out[ ]:

'2.2.2'

In [ ]:

Copied!

bios_numpy = pd.read_csv("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/bios.csv")
bios_pyarrow = pd.read_csv("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/bios.csv", engine="pyarrow", dtype_backend="pyarrow")
bios_numpy = pd.read_csv("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/bios.csv")
bios_pyarrow = pd.read_csv("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/bios.csv", engine="pyarrow", dtype_backend="pyarrow")

In [ ]:

Copied!

bios_numpy.info()
bios_numpy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145500 entries, 0 to 145499
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   athlete_id    145500 non-null  int64  
 1   name          145500 non-null  object 
 2   born_date     143693 non-null  object 
 3   born_city     110908 non-null  object 
 4   born_region   110908 non-null  object 
 5   born_country  110908 non-null  object 
 6   NOC           145499 non-null  object 
 7   height_cm     106651 non-null  float64
 8   weight_kg     102070 non-null  float64
 9   died_date     33940 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 11.1+ MB

In [ ]:

Copied!

bios_pyarrow.info()
bios_pyarrow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145500 entries, 0 to 145499
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype               
---  ------        --------------   -----               
 0   athlete_id    145500 non-null  int64[pyarrow]      
 1   name          145500 non-null  string[pyarrow]     
 2   born_date     143693 non-null  date32[day][pyarrow]
 3   born_city     110908 non-null  string[pyarrow]     
 4   born_region   110908 non-null  string[pyarrow]     
 5   born_country  110908 non-null  string[pyarrow]     
 6   NOC           145499 non-null  string[pyarrow]     
 7   height_cm     106651 non-null  double[pyarrow]     
 8   weight_kg     102070 non-null  double[pyarrow]     
 9   died_date     33940 non-null   date32[day][pyarrow]
dtypes: date32[day][pyarrow](2), double[pyarrow](2), int64[pyarrow](1), string[pyarrow](5)
memory usage: 13.0 MB

Compare Speeds¶

In [ ]:

Copied!

bios_numpy["name"].str.contains("Reza")
bios_numpy["name"].str.contains("Reza")

Out[ ]:

	name
0	False
1	False
2	False
3	False
4	False
...	...
145495	False
145496	False
145497	False
145498	False
145499	False

145500 rows × 1 columns

dtype: bool

In [ ]:

Copied!

bios_pyarrow["name"].str.contains("Reza")
bios_pyarrow["name"].str.contains("Reza")

Out[ ]:

	name
0	False
1	False
2	False
3	False
4	False
...	...
145495	False
145496	False
145497	False
145498	False
145499	False

145500 rows × 1 columns

dtype: bool[pyarrow]

Real Project¶

In [ ]: