Pandas Basics¶
In [ ]:
Copied!
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
Load Dataset¶
In [ ]:
Copied!
# !pip install mlcroissant
# import mlcroissant as mlc
# # Fetch the Croissant JSON-LD
# croissant_dataset = mlc.Dataset('https://www.kaggle.com/datasets/heesoo37/120-years-of-olympic-history-athletes-and-results/croissant/download')
# # Check what record sets are in the dataset
# record_sets = croissant_dataset.metadata.record_sets
# print(record_sets)
# # Fetch the records and put them in a DataFrame
# record_set_df = pd.DataFrame(croissant_dataset.records(record_set=record_sets[0].uuid))
# record_set_df.head(2)
# !pip install mlcroissant
# import mlcroissant as mlc
# # Fetch the Croissant JSON-LD
# croissant_dataset = mlc.Dataset('https://www.kaggle.com/datasets/heesoo37/120-years-of-olympic-history-athletes-and-results/croissant/download')
# # Check what record sets are in the dataset
# record_sets = croissant_dataset.metadata.record_sets
# print(record_sets)
# # Fetch the records and put them in a DataFrame
# record_set_df = pd.DataFrame(croissant_dataset.records(record_set=record_sets[0].uuid))
# record_set_df.head(2)
In [ ]:
Copied!
# cols = map(lambda name: name.split("/")[1], record_set_df.columns)
# df = pd.DataFrame(record_set_df.values[:1401], columns=cols)
# cols = map(lambda name: name.split("/")[1], record_set_df.columns)
# df = pd.DataFrame(record_set_df.values[:1401], columns=cols)
In [ ]:
Copied!
# df.head(2)
# df.head(2)
In [ ]:
Copied!
# df.count()
# df.count()
In [ ]:
Copied!
# df.describe()
# df.describe()
Basics¶
In [ ]:
Copied!
temp_df = pd.DataFrame(data=[[1, 2, 3], [4, 2, 6], [7, 2, 9]], dtype=np.int8, columns=["A", "B", "C"], index=["x", "y", "z"])
temp_df = pd.DataFrame(data=[[1, 2, 3], [4, 2, 6], [7, 2, 9]], dtype=np.int8, columns=["A", "B", "C"], index=["x", "y", "z"])
In [ ]:
Copied!
temp_df.head(1)
temp_df.head(1)
Out[ ]:
A | B | C | |
---|---|---|---|
x | 1 | 2 | 3 |
In [ ]:
Copied!
temp_df.tail(1)
temp_df.tail(1)
Out[ ]:
A | B | C | |
---|---|---|---|
z | 7 | 2 | 9 |
In [ ]:
Copied!
temp_df.index
temp_df.index
Out[ ]:
Index(['x', 'y', 'z'], dtype='object')
In [ ]:
Copied!
temp_df.index.to_list()
temp_df.index.to_list()
Out[ ]:
['x', 'y', 'z']
In [ ]:
Copied!
temp_df.info()
temp_df.info()
<class 'pandas.core.frame.DataFrame'> Index: 3 entries, x to z Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 A 3 non-null int8 1 B 3 non-null int8 2 C 3 non-null int8 dtypes: int8(3) memory usage: 141.0+ bytes
In [ ]:
Copied!
temp_df.describe()
temp_df.describe()
Out[ ]:
A | B | C | |
---|---|---|---|
count | 3.0 | 3.0 | 3.0 |
mean | 4.0 | 2.0 | 6.0 |
std | 3.0 | 0.0 | 3.0 |
min | 1.0 | 2.0 | 3.0 |
25% | 2.5 | 2.0 | 4.5 |
50% | 4.0 | 2.0 | 6.0 |
75% | 5.5 | 2.0 | 7.5 |
max | 7.0 | 2.0 | 9.0 |
In [ ]:
Copied!
temp_df.nunique() # number of unique values in each column
temp_df.nunique() # number of unique values in each column
Out[ ]:
0 | |
---|---|
A | 3 |
B | 1 |
C | 3 |
In [ ]:
Copied!
temp_df["B"].nunique()
temp_df["B"].nunique()
Out[ ]:
1
In [ ]:
Copied!
temp_df.shape
temp_df.shape
Out[ ]:
(3, 3)
In [ ]:
Copied!
temp_df.memory_usage() # Return the memory usage of each column in bytes.
temp_df.memory_usage() # Return the memory usage of each column in bytes.
Out[ ]:
0 | |
---|---|
Index | 132 |
A | 3 |
B | 3 |
C | 3 |
Access Random Data¶
In [ ]:
Copied!
temp_df.sample(2, replace=False)
temp_df.sample(2, replace=False)
Out[ ]:
A | B | C | |
---|---|---|---|
x | 1 | 2 | 3 |
z | 7 | 2 | 9 |
Save¶
In [ ]:
Copied!
temp_df.to_parquet("./temp.parquet")
!ls
temp_df.to_parquet("./temp.parquet")
!ls
sample_data temp.parquet
Load csv¶
In [ ]:
Copied!
# load from url
coffee = pd.read_csv("https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/refs/heads/master/warmup-data/coffee.csv")
coffee.head(2)
# load from url
coffee = pd.read_csv("https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/refs/heads/master/warmup-data/coffee.csv")
coffee.head(2)
Out[ ]:
Day | Coffee Type | Units Sold | |
---|---|---|---|
0 | Monday | Espresso | 25 |
1 | Monday | Latte | 15 |
In [ ]:
Copied!
olympic_results = pd.read_parquet("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/results.parquet")
olympic_results.head(2)
olympic_results = pd.read_parquet("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/results.parquet")
olympic_results.head(2)
Out[ ]:
year | type | discipline | event | as | athlete_id | noc | team | place | tied | medal | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1912.0 | Summer | Tennis | Singles, Men (Olympic) | Jean-François Blanchy | 1 | FRA | None | 17.0 | True | None |
1 | 1912.0 | Summer | Tennis | Doubles, Men (Olympic) | Jean-François Blanchy | 1 | FRA | Jean Montariol | NaN | False | None |
In [ ]:
Copied!
bios = pd.read_csv("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/bios.csv")
bios.head(2)
bios = pd.read_csv("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/bios.csv")
bios.head(2)
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Jean-François Blanchy | 1886-12-12 | Bordeaux | Gironde | FRA | France | NaN | NaN | 1960-10-02 |
1 | 2 | Arnaud Boetsch | 1969-04-01 | Meulan | Yvelines | FRA | France | 183.0 | 76.0 | NaN |
Loc iloc¶
In [ ]:
Copied!
coffee.head()
coffee.head()
Out[ ]:
Day | Coffee Type | Units Sold | |
---|---|---|---|
0 | Monday | Espresso | 25 |
1 | Monday | Latte | 15 |
2 | Tuesday | Espresso | 30 |
3 | Tuesday | Latte | 20 |
4 | Wednesday | Espresso | 35 |
In [ ]:
Copied!
print(coffee.loc[:2, "Units Sold"]) # Row, Col
print(coffee.loc[[1, 2, 7], ["Day", "Units Sold"]]) # Row, Col
print(coffee.loc[[1, 2, 7]]) # Row, Col
print(coffee.loc[:2, "Units Sold"]) # Row, Col
print(coffee.loc[[1, 2, 7], ["Day", "Units Sold"]]) # Row, Col
print(coffee.loc[[1, 2, 7]]) # Row, Col
0 25 1 15 2 30 Name: Units Sold, dtype: int64 Day Units Sold 1 Monday 15 2 Tuesday 30 7 Thursday 30 Day Coffee Type Units Sold 1 Monday Latte 15 2 Tuesday Espresso 30 7 Thursday Latte 30
In [ ]:
Copied!
coffee.iloc[[1, 2, 7], [0, 2]] # == coffee.loc[[1, 2, 7], ["Day", "Units Sold"]])
coffee.iloc[[1, 2, 7], [0, 2]] # == coffee.loc[[1, 2, 7], ["Day", "Units Sold"]])
Out[ ]:
Day | Units Sold | |
---|---|---|
1 | Monday | 15 |
2 | Tuesday | 30 |
7 | Thursday | 30 |
In [ ]:
Copied!
coffee.loc[[1, 2, 7], ["Units Sold"]] += 5
coffee.head()
coffee.loc[[1, 2, 7], ["Units Sold"]] += 5
coffee.head()
Out[ ]:
Day | Coffee Type | Units Sold | |
---|---|---|---|
0 | Monday | Espresso | 25 |
1 | Monday | Latte | 20 |
2 | Tuesday | Espresso | 35 |
3 | Tuesday | Latte | 20 |
4 | Wednesday | Espresso | 35 |
Use at, iat for single item¶
Sort¶
In [ ]:
Copied!
coffee.sort_values(["Units Sold", "Day"], ascending=[False, True])
coffee.sort_values(["Units Sold", "Day"], ascending=[False, True])
Out[ ]:
Day | Coffee Type | Units Sold | |
---|---|---|---|
8 | Friday | Espresso | 45 |
10 | Saturday | Espresso | 45 |
12 | Sunday | Espresso | 45 |
6 | Thursday | Espresso | 40 |
9 | Friday | Latte | 35 |
11 | Saturday | Latte | 35 |
13 | Sunday | Latte | 35 |
7 | Thursday | Latte | 35 |
2 | Tuesday | Espresso | 35 |
4 | Wednesday | Espresso | 35 |
0 | Monday | Espresso | 25 |
5 | Wednesday | Latte | 25 |
1 | Monday | Latte | 20 |
3 | Tuesday | Latte | 20 |
Iterate¶
In [ ]:
Copied!
for index, row in coffee.iterrows():
if index == 5:
break
print(index, row)
for index, row in coffee.iterrows():
if index == 5:
break
print(index, row)
0 Day Monday Coffee Type Espresso Units Sold 25 Name: 0, dtype: object 1 Day Monday Coffee Type Latte Units Sold 20 Name: 1, dtype: object 2 Day Tuesday Coffee Type Espresso Units Sold 35 Name: 2, dtype: object 3 Day Tuesday Coffee Type Latte Units Sold 20 Name: 3, dtype: object 4 Day Wednesday Coffee Type Espresso Units Sold 35 Name: 4, dtype: object
Filter¶
In [ ]:
Copied!
olympic_results.head()
olympic_results.head()
Out[ ]:
year | type | discipline | event | as | athlete_id | noc | team | place | tied | medal | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1912.0 | Summer | Tennis | Singles, Men (Olympic) | Jean-François Blanchy | 1 | FRA | None | 17.0 | True | None |
1 | 1912.0 | Summer | Tennis | Doubles, Men (Olympic) | Jean-François Blanchy | 1 | FRA | Jean Montariol | NaN | False | None |
2 | 1920.0 | Summer | Tennis | Singles, Men (Olympic) | Jean-François Blanchy | 1 | FRA | None | 32.0 | True | None |
3 | 1920.0 | Summer | Tennis | Doubles, Mixed (Olympic) | Jean-François Blanchy | 1 | FRA | Jeanne Vaussard | 8.0 | True | None |
4 | 1920.0 | Summer | Tennis | Doubles, Men (Olympic) | Jean-François Blanchy | 1 | FRA | Jacques Brugnon | 4.0 | False | None |
In [ ]:
Copied!
bios.head()
bios.head()
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Jean-François Blanchy | 1886-12-12 | Bordeaux | Gironde | FRA | France | NaN | NaN | 1960-10-02 |
1 | 2 | Arnaud Boetsch | 1969-04-01 | Meulan | Yvelines | FRA | France | 183.0 | 76.0 | NaN |
2 | 3 | Jean Borotra | 1898-08-13 | Biarritz | Pyrénées-Atlantiques | FRA | France | 183.0 | 76.0 | 1994-07-17 |
3 | 4 | Jacques Brugnon | 1895-05-11 | Paris VIIIe | Paris | FRA | France | 168.0 | 64.0 | 1978-03-20 |
4 | 5 | Albert Canet | 1878-04-17 | Wandsworth | England | GBR | France | NaN | NaN | 1930-07-25 |
Work on Olympics¶
In [ ]:
Copied!
olympic_results['discipline'].unique()
olympic_results['discipline'].unique()
Out[ ]:
array(['Tennis', 'Table Tennis', 'Fencing', 'Badminton', 'Hockey', 'Athletics', 'Art Competitions', 'Modern Pentathlon', 'Archery', 'Boxing', 'Softball (Baseball/Softball)', 'Baseball (Baseball/Softball)', 'Sailing', 'Golf', 'Jeu De Paume', 'Basketball (Basketball)', 'Cycling Road (Cycling)', 'Handball', 'Australian Rules Football', 'Bobsleigh (Bobsleigh)', 'Canoe Sprint (Canoeing)', 'Canoe Slalom (Canoeing)', 'Canoe Marathon (Canoeing)', 'Shooting', 'Rowing', 'Cycling Track (Cycling)', 'Polo', 'Rugby (Rugby)', 'Equestrian Dressage (Equestrian)', 'Equestrian Eventing (Equestrian)', 'Equestrian Jumping (Equestrian)', 'Tug-Of-War', 'Lacrosse', 'Diving (Aquatics)', 'Winter Pentathlon', 'Equestrian Vaulting (Equestrian)', 'Equestrian Driving (Equestrian)', 'Cycling Mountain Bike (Cycling)', 'Speed Skating (Skating)', 'Football (Football)', 'Bicycle Polo (Cycling)', 'Cross Country Skiing (Skiing)', 'Racquets', 'Cricket', 'Croquet', 'Motorboating', 'Swimming (Aquatics)', 'Ice Hockey (Ice Hockey)', 'Bandy', 'Artistic Gymnastics (Gymnastics)', 'Rhythmic Gymnastics (Gymnastics)', 'Weightlifting', 'Wrestling', 'Alpine Skiing (Skiing)', 'Judo', 'Military Ski Patrol (Skiing)', 'Nordic Combined (Skiing)', 'Water Polo (Aquatics)', 'Savate', 'Skeleton (Bobsleigh)', 'Artistic Swimming (Aquatics)', 'Triathlon', 'Beach Volleyball (Volleyball)', 'Volleyball (Volleyball)', 'Glíma', 'Biathlon', 'Figure Skating (Skating)', 'Basque pelota', 'Luge', 'Short Track Speed Skating (Skating)', 'Freestyle Skiing (Skiing)', 'Speed Skiing (Skiing)', 'Ski Jumping (Skiing)', 'Cycling BMX Racing (Cycling)', 'Trampolining (Gymnastics)', 'Marathon Swimming (Aquatics)', 'Taekwondo', 'Snowboarding (Skiing)', 'Curling', 'Roque', '3x3 Basketball (Basketball)', 'Skateboarding (Roller Sports)', 'Mixed Sports', 'Rugby Sevens (Rugby)', 'Hockey 5s', 'Ballooning (Air Sports)', 'Ski Mountaineering', '3-on-3 Ice Hockey (Ice Hockey)', 'Karate', 'Surfing', 'Cycling BMX Freestyle (Cycling)', 'Sport Climbing', 'Roller Skating (Roller Sports)', None], dtype=object)
In [ ]:
Copied!
olympic_results.info()
olympic_results.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 308408 entries, 0 to 308407 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 305807 non-null float64 1 type 305807 non-null object 2 discipline 308407 non-null object 3 event 308408 non-null object 4 as 308408 non-null object 5 athlete_id 308408 non-null int64 6 noc 308407 non-null object 7 team 121714 non-null object 8 place 283193 non-null float64 9 tied 308408 non-null bool 10 medal 44139 non-null object dtypes: bool(1), float64(2), int64(1), object(7) memory usage: 23.8+ MB
In [ ]:
Copied!
olympic_results.describe()
olympic_results.describe()
Out[ ]:
year | athlete_id | place | |
---|---|---|---|
count | 305807.000000 | 308408.000000 | 283193.000000 |
mean | 1982.239223 | 72624.097125 | 16.102764 |
std | 30.983982 | 41285.940841 | 19.322611 |
min | 1896.000000 | 1.000000 | 1.000000 |
25% | 1964.000000 | 34412.000000 | 5.000000 |
50% | 1992.000000 | 73739.500000 | 9.000000 |
75% | 2008.000000 | 107168.000000 | 20.000000 |
max | 2022.000000 | 148986.000000 | 185.000000 |
In [ ]:
Copied!
olympic_results.loc[olympic_results["place"] > 180, "discipline"]
olympic_results.loc[olympic_results["place"] > 180, "discipline"]
Out[ ]:
discipline | |
---|---|
58493 | Artistic Gymnastics (Gymnastics) |
58507 | Artistic Gymnastics (Gymnastics) |
58510 | Artistic Gymnastics (Gymnastics) |
58511 | Artistic Gymnastics (Gymnastics) |
58747 | Artistic Gymnastics (Gymnastics) |
59433 | Artistic Gymnastics (Gymnastics) |
59436 | Artistic Gymnastics (Gymnastics) |
59437 | Artistic Gymnastics (Gymnastics) |
59439 | Artistic Gymnastics (Gymnastics) |
59440 | Artistic Gymnastics (Gymnastics) |
60054 | Artistic Gymnastics (Gymnastics) |
60636 | Artistic Gymnastics (Gymnastics) |
60756 | Artistic Gymnastics (Gymnastics) |
60761 | Artistic Gymnastics (Gymnastics) |
63681 | Artistic Gymnastics (Gymnastics) |
64985 | Artistic Gymnastics (Gymnastics) |
64986 | Artistic Gymnastics (Gymnastics) |
64987 | Artistic Gymnastics (Gymnastics) |
64988 | Artistic Gymnastics (Gymnastics) |
64989 | Artistic Gymnastics (Gymnastics) |
64990 | Artistic Gymnastics (Gymnastics) |
64991 | Artistic Gymnastics (Gymnastics) |
65007 | Artistic Gymnastics (Gymnastics) |
65010 | Artistic Gymnastics (Gymnastics) |
65012 | Artistic Gymnastics (Gymnastics) |
65013 | Artistic Gymnastics (Gymnastics) |
67125 | Artistic Gymnastics (Gymnastics) |
68180 | Artistic Gymnastics (Gymnastics) |
68183 | Artistic Gymnastics (Gymnastics) |
68186 | Artistic Gymnastics (Gymnastics) |
68348 | Artistic Gymnastics (Gymnastics) |
69909 | Artistic Gymnastics (Gymnastics) |
72808 | Artistic Gymnastics (Gymnastics) |
72864 | Artistic Gymnastics (Gymnastics) |
72865 | Artistic Gymnastics (Gymnastics) |
Work on Bios¶
In [ ]:
Copied!
# tall_players = bios.loc[bios['height_cm'] > 215, ['athlete_id', 'name']]
tall_players = bios.loc[(bios['height_cm'] > 215) & (bios['weight_kg'] > 100) & (bios['died_date'].isnull())][['athlete_id', 'name', 'height_cm', 'weight_kg', 'died_date']]
tall_players.head()
# tall_players = bios.loc[bios['height_cm'] > 215, ['athlete_id', 'name']]
tall_players = bios.loc[(bios['height_cm'] > 215) & (bios['weight_kg'] > 100) & (bios['died_date'].isnull())][['athlete_id', 'name', 'height_cm', 'weight_kg', 'died_date']]
tall_players.head()
Out[ ]:
athlete_id | name | height_cm | weight_kg | died_date | |
---|---|---|---|---|---|
5583 | 5606 | Paulinho Villas Boas | 217.0 | 106.0 | NaN |
5673 | 5696 | Gunther Behnke | 221.0 | 114.0 | NaN |
5716 | 5739 | Uwe Blab | 218.0 | 110.0 | NaN |
5781 | 5804 | Tommy Burleson | 223.0 | 102.0 | NaN |
6223 | 6250 | Lars Hansen | 216.0 | 105.0 | NaN |
Find Tall Alive Players that Won a Gold Medal¶
In [ ]:
Copied!
tall_results = olympic_results.loc[olympic_results['athlete_id'].isin(tall_players['athlete_id'])][['athlete_id', 'medal']]
tall_results = olympic_results.loc[olympic_results['athlete_id'].isin(tall_players['athlete_id'])][['athlete_id', 'medal']]
In [ ]:
Copied!
tall_results = tall_results.loc[tall_results['medal'] == 'Gold']
tall_results = tall_results.loc[tall_results['medal'] == 'Gold']
In [ ]:
Copied!
# Persons' name with Height(above 215cm) and weight(above 100kg) and Not dead that won gold medal
# tall_players.loc[tall_players['athlete_id'].isin(tall_results['athlete_id'])]
tall_players[tall_players['athlete_id'].isin(tall_results['athlete_id'])]
# Persons' name with Height(above 215cm) and weight(above 100kg) and Not dead that won gold medal
# tall_players.loc[tall_players['athlete_id'].isin(tall_results['athlete_id'])]
tall_players[tall_players['athlete_id'].isin(tall_results['athlete_id'])]
Out[ ]:
athlete_id | name | height_cm | weight_kg | died_date | |
---|---|---|---|---|---|
6722 | 6755 | Shaquille O'Neal | 216.0 | 137.0 | NaN |
6937 | 6972 | David Robinson | 216.0 | 107.0 | NaN |
6978 | 7013 | Arvydas Sabonis | 223.0 | 122.0 | NaN |
118676 | 120415 | Dmitry Musersky | 219.0 | 104.0 | NaN |
123850 | 126093 | Tyson Chandler | 216.0 | 107.0 | NaN |
Str Filter¶
In [ ]:
Copied!
bios['NOC'] = bios['NOC'].replace(to_replace='Islamic Republic of Iran', value='Iran')
bios['born_country'] = bios['born_country'].replace(to_replace='IRI', value='IRAN')
bios[bios['NOC'] == 'Iran'].head(5)
bios['NOC'] = bios['NOC'].replace(to_replace='Islamic Republic of Iran', value='Iran')
bios['born_country'] = bios['born_country'].replace(to_replace='IRI', value='IRAN')
bios[bios['NOC'] == 'Iran'].head(5)
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | |
---|---|---|---|---|---|---|---|---|---|---|
783 | 787 | Ibrahim Al-Idokht | 1967-10-26 | NaN | NaN | NaN | Iran | NaN | NaN | NaN |
3886 | 3899 | Ibrahim Afsharpour | 1930-10-08 | NaN | NaN | NaN | Iran | NaN | NaN | NaN |
3887 | 3900 | Emanoul Aghasi | 1930-12-25 | Salmas | Azarbaijan Gharbi | IRAN | Iran | NaN | NaN | 2021-09-24 |
3888 | 3901 | Nasser Aghaie | 1944-01-20 | Tehran | Tehran | IRAN | Iran | 160.0 | 51.0 | 2009-12-27 |
3889 | 3902 | Sadegh Ali Akbarzadeh Khoi | 1932-09-03 | NaN | NaN | NaN | Iran | 163.0 | 54.0 | NaN |
In [ ]:
Copied!
bios[bios['name'].str.contains("reZA", case=False)].head()
bios[bios['name'].str.contains("reZA", case=False)].head()
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | |
---|---|---|---|---|---|---|---|---|---|---|
3917 | 3931 | Mohamed Reza Samadi | 1971-01-01 | NaN | NaN | NaN | Iran | 198.0 | 104.0 | NaN |
10255 | 10311 | Itzel Reza | 1979-05-29 | NaN | NaN | NaN | Mexico | NaN | NaN | NaN |
13398 | 13480 | Mohammad Reza Bajool | 1960-03-21 | NaN | NaN | NaN | Iran | 172.0 | 62.0 | NaN |
13415 | 13497 | Mohamed Reza Banna | 1971-05-05 | NaN | NaN | NaN | Iran | 178.0 | 69.0 | NaN |
25651 | 25843 | Ali Reza Azizi | 1949-08-23 | Mashhad | Khorasan Razavi | IRAN | Iran | 167.0 | 61.0 | 2021-08-07 |
In [ ]:
Copied!
bios[bios['name'].str.contains("reZA|gholam", case=False, regex=True)].head() # You can use regax
bios[bios['name'].str.contains("reZA|gholam", case=False, regex=True)].head() # You can use regax
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | |
---|---|---|---|---|---|---|---|---|---|---|
3917 | 3931 | Mohamed Reza Samadi | 1971-01-01 | NaN | NaN | NaN | Iran | 198.0 | 104.0 | NaN |
10255 | 10311 | Itzel Reza | 1979-05-29 | NaN | NaN | NaN | Mexico | NaN | NaN | NaN |
13398 | 13480 | Mohammad Reza Bajool | 1960-03-21 | NaN | NaN | NaN | Iran | 172.0 | 62.0 | NaN |
13415 | 13497 | Mohamed Reza Banna | 1971-05-05 | NaN | NaN | NaN | Iran | 178.0 | 69.0 | NaN |
15027 | 15128 | Gholam Hossein Koohi | 1951-01-10 | NaN | NaN | NaN | Iran | 172.0 | 68.0 | NaN |
In [ ]:
Copied!
bios.query('NOC == "Iran" and born_country == "IRAN"')
bios.query('NOC == "Iran" and born_country == "IRAN"')
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | |
---|---|---|---|---|---|---|---|---|---|---|
3887 | 3900 | Emanoul Aghasi | 1930-12-25 | Salmas | Azarbaijan Gharbi | IRAN | Iran | NaN | NaN | 2021-09-24 |
3888 | 3901 | Nasser Aghaie | 1944-01-20 | Tehran | Tehran | IRAN | Iran | 160.0 | 51.0 | 2009-12-27 |
3901 | 3914 | Ayoub Pourtaghi | 1973-01-01 | Orumiyeh (Urmia) | Azarbaijan Gharbi | IRAN | Iran | 182.0 | 81.0 | 2021-09-29 |
3905 | 3918 | Vazik Ghazarian | 1937-05-04 | Bahmanshir | Khuzestan | IRAN | Iran | 168.0 | 63.0 | NaN |
3907 | 3920 | Iraj Kia Rostami | 1963-07-07 | Tehran | Tehran | IRAN | Iran | 180.0 | 91.0 | 2015-08-06 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
140331 | 143837 | Morteza Ghiasi | 1995-04-07 | Dorud | Lorestan | IRAN | Iran | NaN | NaN | NaN |
140332 | 143838 | Mostafa Hosseinkhani | 1989-03-27 | Tehran | Tehran | IRAN | Iran | NaN | NaN | NaN |
140333 | 143839 | Reza Atri | 1994-08-08 | Babol | Mazandaran | IRAN | Iran | NaN | NaN | NaN |
144049 | 147714 | Mohammad Fotouhi | 1990-12-05 | Yazd | Yazd | IRAN | Iran | NaN | NaN | NaN |
145469 | 149196 | Danyal Saveh Shemshaki | 1997-07-02 | Tehran | Tehran | IRAN | Iran | NaN | NaN | NaN |
326 rows × 10 columns
Adding and Remove Columns¶
In [ ]:
Copied!
coffee.head()
coffee.head()
Out[ ]:
Day | Coffee Type | Units Sold | |
---|---|---|---|
0 | Monday | Espresso | 25 |
1 | Monday | Latte | 20 |
2 | Tuesday | Espresso | 35 |
3 | Tuesday | Latte | 20 |
4 | Wednesday | Espresso | 35 |
In [ ]:
Copied!
# coffee["Price"] = 5.99
coffee['Price'] = np.where(coffee['Coffee Type'] == 'Espresso', 3.99, 5.99)
# coffee["Price"] = 5.99
coffee['Price'] = np.where(coffee['Coffee Type'] == 'Espresso', 3.99, 5.99)
In [ ]:
Copied!
coffee['Revenue'] = coffee['Units Sold'] * coffee['Price']
coffee.head()
coffee['Revenue'] = coffee['Units Sold'] * coffee['Price']
coffee.head()
Out[ ]:
Day | Coffee Type | Units Sold | Price | Revenue | |
---|---|---|---|---|---|
0 | Monday | Espresso | 25 | 3.99 | 99.75 |
1 | Monday | Latte | 20 | 5.99 | 119.80 |
2 | Tuesday | Espresso | 35 | 3.99 | 139.65 |
3 | Tuesday | Latte | 20 | 5.99 | 119.80 |
4 | Wednesday | Espresso | 35 | 3.99 | 139.65 |
In [ ]:
Copied!
coffee.rename(columns={'Price':'Coffee Price'})
coffee.rename(columns={'Price':'Coffee Price'})
Out[ ]:
Day | Coffee Type | Units Sold | Coffee Price | Revenue | |
---|---|---|---|---|---|
0 | Monday | Espresso | 25 | 3.99 | 99.75 |
1 | Monday | Latte | 20 | 5.99 | 119.80 |
2 | Tuesday | Espresso | 35 | 3.99 | 139.65 |
3 | Tuesday | Latte | 20 | 5.99 | 119.80 |
4 | Wednesday | Espresso | 35 | 3.99 | 139.65 |
5 | Wednesday | Latte | 25 | 5.99 | 149.75 |
6 | Thursday | Espresso | 40 | 3.99 | 159.60 |
7 | Thursday | Latte | 35 | 5.99 | 209.65 |
8 | Friday | Espresso | 45 | 3.99 | 179.55 |
9 | Friday | Latte | 35 | 5.99 | 209.65 |
10 | Saturday | Espresso | 45 | 3.99 | 179.55 |
11 | Saturday | Latte | 35 | 5.99 | 209.65 |
12 | Sunday | Espresso | 45 | 3.99 | 179.55 |
13 | Sunday | Latte | 35 | 5.99 | 209.65 |
In [ ]:
Copied!
# coffee = coffee.drop(['Price'], axis=1) # drop column
# coffee = coffee.drop(0, axis=0) # drop row
# coffee = coffee.drop(['Price'], axis=1) # drop column
# coffee = coffee.drop(0, axis=0) # drop row
DateTime¶
In [ ]:
Copied!
bios_temp = bios.copy()
bios_temp[bios_temp['name'].str.contains("Reza", case=False)].head()
bios_temp = bios.copy()
bios_temp[bios_temp['name'].str.contains("Reza", case=False)].head()
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | |
---|---|---|---|---|---|---|---|---|---|---|
3917 | 3931 | Mohamed Reza Samadi | 1971-01-01 | NaN | NaN | NaN | Iran | 198.0 | 104.0 | NaN |
10255 | 10311 | Itzel Reza | 1979-05-29 | NaN | NaN | NaN | Mexico | NaN | NaN | NaN |
13398 | 13480 | Mohammad Reza Bajool | 1960-03-21 | NaN | NaN | NaN | Iran | 172.0 | 62.0 | NaN |
13415 | 13497 | Mohamed Reza Banna | 1971-05-05 | NaN | NaN | NaN | Iran | 178.0 | 69.0 | NaN |
25651 | 25843 | Ali Reza Azizi | 1949-08-23 | Mashhad | Khorasan Razavi | IRAN | Iran | 167.0 | 61.0 | 2021-08-07 |
In [ ]:
Copied!
# convert born data to datetime
bios_temp['born_date_dt'] = pd.to_datetime(bios_temp['born_date'], format="%Y-%m-%d")
# convert born data to datetime
bios_temp['born_date_dt'] = pd.to_datetime(bios_temp['born_date'], format="%Y-%m-%d")
In [ ]:
Copied!
bios_temp.info()
bios_temp.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 145500 entries, 0 to 145499 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 athlete_id 145500 non-null int64 1 name 145500 non-null object 2 born_date 143693 non-null object 3 born_city 110908 non-null object 4 born_region 110908 non-null object 5 born_country 110908 non-null object 6 NOC 145499 non-null object 7 height_cm 106651 non-null float64 8 weight_kg 102070 non-null float64 9 died_date 33940 non-null object 10 born_date_dt 143693 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(2), int64(1), object(7) memory usage: 12.2+ MB
In [ ]:
Copied!
# Now extract month and year from datetime column
bios_temp['born_year'] = bios_temp['born_date_dt'].dt.year
bios_temp['born_day_of_week'] = bios_temp['born_date_dt'].dt.day_of_week
bios_temp['is_leap'] = bios_temp['born_date_dt'].dt.is_leap_year
bios_temp.head()
# Now extract month and year from datetime column
bios_temp['born_year'] = bios_temp['born_date_dt'].dt.year
bios_temp['born_day_of_week'] = bios_temp['born_date_dt'].dt.day_of_week
bios_temp['is_leap'] = bios_temp['born_date_dt'].dt.is_leap_year
bios_temp.head()
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | born_date_dt | born_year | born_day_of_week | is_leap | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Jean-François Blanchy | 1886-12-12 | Bordeaux | Gironde | FRA | France | NaN | NaN | 1960-10-02 | 1886-12-12 | 1886.0 | 6.0 | False |
1 | 2 | Arnaud Boetsch | 1969-04-01 | Meulan | Yvelines | FRA | France | 183.0 | 76.0 | NaN | 1969-04-01 | 1969.0 | 1.0 | False |
2 | 3 | Jean Borotra | 1898-08-13 | Biarritz | Pyrénées-Atlantiques | FRA | France | 183.0 | 76.0 | 1994-07-17 | 1898-08-13 | 1898.0 | 5.0 | False |
3 | 4 | Jacques Brugnon | 1895-05-11 | Paris VIIIe | Paris | FRA | France | 168.0 | 64.0 | 1978-03-20 | 1895-05-11 | 1895.0 | 5.0 | False |
4 | 5 | Albert Canet | 1878-04-17 | Wandsworth | England | GBR | France | NaN | NaN | 1930-07-25 | 1878-04-17 | 1878.0 | 2.0 | False |
Categories Column¶
In [ ]:
Copied!
def cat_height(h: float):
if h < 150:
return 'Short'
if h > 180:
return 'Tall'
return 'Normal' if not np.isnan(h) else 'Not Defined'
def cat_height(h: float):
if h < 150:
return 'Short'
if h > 180:
return 'Tall'
return 'Normal' if not np.isnan(h) else 'Not Defined'
In [ ]:
Copied!
# bios_temp['height_category'] = bios_temp['height_cm'].apply(cat_height)
bios_temp['height_category'] = bios_temp['height_cm'].apply(lambda h: "Short" if h < 150 else ("Tall" if h > 180 else ('Normal' if not np.isnan(h) else 'Not Defined')))
# bios_temp['height_category'] = bios_temp['height_cm'].apply(cat_height)
bios_temp['height_category'] = bios_temp['height_cm'].apply(lambda h: "Short" if h < 150 else ("Tall" if h > 180 else ('Normal' if not np.isnan(h) else 'Not Defined')))
In [ ]:
Copied!
bios_temp.head()
bios_temp.head()
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | born_date_dt | born_year | born_day_of_week | is_leap | height_category | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Jean-François Blanchy | 1886-12-12 | Bordeaux | Gironde | FRA | France | NaN | NaN | 1960-10-02 | 1886-12-12 | 1886.0 | 6.0 | False | Not Defined |
1 | 2 | Arnaud Boetsch | 1969-04-01 | Meulan | Yvelines | FRA | France | 183.0 | 76.0 | NaN | 1969-04-01 | 1969.0 | 1.0 | False | Tall |
2 | 3 | Jean Borotra | 1898-08-13 | Biarritz | Pyrénées-Atlantiques | FRA | France | 183.0 | 76.0 | 1994-07-17 | 1898-08-13 | 1898.0 | 5.0 | False | Tall |
3 | 4 | Jacques Brugnon | 1895-05-11 | Paris VIIIe | Paris | FRA | France | 168.0 | 64.0 | 1978-03-20 | 1895-05-11 | 1895.0 | 5.0 | False | Normal |
4 | 5 | Albert Canet | 1878-04-17 | Wandsworth | England | GBR | France | NaN | NaN | 1930-07-25 | 1878-04-17 | 1878.0 | 2.0 | False | Not Defined |
Merge¶
In [ ]:
Copied!
noc = pd.read_csv("https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/refs/heads/master/data/noc_regions.csv")
noc.head()
noc = pd.read_csv("https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/refs/heads/master/data/noc_regions.csv")
noc.head()
Out[ ]:
NOC | region | notes | |
---|---|---|---|
0 | AFG | Afghanistan | NaN |
1 | AHO | Curacao | Netherlands Antilles |
2 | ALB | Albania | NaN |
3 | ALG | Algeria | NaN |
4 | AND | Andorra | NaN |
Get region base on NOC and merge it to bios¶
In [ ]:
Copied!
pd.merge(bios, noc.drop(["notes"], axis=1), left_on='born_country', right_on='NOC', how='left')
pd.merge(bios, noc.drop(["notes"], axis=1), left_on='born_country', right_on='NOC', how='left')
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC_x | height_cm | weight_kg | died_date | NOC_y | region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Jean-François Blanchy | 1886-12-12 | Bordeaux | Gironde | FRA | France | NaN | NaN | 1960-10-02 | FRA | France |
1 | 2 | Arnaud Boetsch | 1969-04-01 | Meulan | Yvelines | FRA | France | 183.0 | 76.0 | NaN | FRA | France |
2 | 3 | Jean Borotra | 1898-08-13 | Biarritz | Pyrénées-Atlantiques | FRA | France | 183.0 | 76.0 | 1994-07-17 | FRA | France |
3 | 4 | Jacques Brugnon | 1895-05-11 | Paris VIIIe | Paris | FRA | France | 168.0 | 64.0 | 1978-03-20 | FRA | France |
4 | 5 | Albert Canet | 1878-04-17 | Wandsworth | England | GBR | France | NaN | NaN | 1930-07-25 | GBR | UK |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
145495 | 149222 | Polina Luchnikova | 2002-01-30 | Serov | Sverdlovsk | RUS | ROC | 167.0 | 61.0 | NaN | RUS | Russia |
145496 | 149223 | Valeriya Merkusheva | 1999-09-20 | Moskva (Moscow) | Moskva | RUS | ROC | 168.0 | 65.0 | NaN | RUS | Russia |
145497 | 149224 | Yuliya Smirnova | 1998-05-08 | Kotlas | Arkhangelsk | RUS | ROC | 163.0 | 55.0 | NaN | RUS | Russia |
145498 | 149225 | André Foussard | 1899-05-19 | Niort | Deux-Sèvres | FRA | France | 166.0 | NaN | 1986-03-18 | FRA | France |
145499 | 149814 | Bill Phillips | 1913-07-15 | Dulwich Hill | New South Wales | AUS | Australia | NaN | NaN | 2003-10-20 | AUS | Australia |
145500 rows × 12 columns
Merge bios with Olympic¶
In [ ]:
Copied!
pd.merge(left=olympic_results, right=bios, left_on='athlete_id', right_on='athlete_id', how='inner')
pd.merge(left=olympic_results, right=bios, left_on='athlete_id', right_on='athlete_id', how='inner')
Out[ ]:
year | type | discipline | event | as | athlete_id | noc | team | place | tied | medal | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1912.0 | Summer | Tennis | Singles, Men (Olympic) | Jean-François Blanchy | 1 | FRA | None | 17.0 | True | None | Jean-François Blanchy | 1886-12-12 | Bordeaux | Gironde | FRA | France | NaN | NaN | 1960-10-02 |
1 | 1912.0 | Summer | Tennis | Doubles, Men (Olympic) | Jean-François Blanchy | 1 | FRA | Jean Montariol | NaN | False | None | Jean-François Blanchy | 1886-12-12 | Bordeaux | Gironde | FRA | France | NaN | NaN | 1960-10-02 |
2 | 1920.0 | Summer | Tennis | Singles, Men (Olympic) | Jean-François Blanchy | 1 | FRA | None | 32.0 | True | None | Jean-François Blanchy | 1886-12-12 | Bordeaux | Gironde | FRA | France | NaN | NaN | 1960-10-02 |
3 | 1920.0 | Summer | Tennis | Doubles, Mixed (Olympic) | Jean-François Blanchy | 1 | FRA | Jeanne Vaussard | 8.0 | True | None | Jean-François Blanchy | 1886-12-12 | Bordeaux | Gironde | FRA | France | NaN | NaN | 1960-10-02 |
4 | 1920.0 | Summer | Tennis | Doubles, Men (Olympic) | Jean-François Blanchy | 1 | FRA | Jacques Brugnon | 4.0 | False | None | Jean-François Blanchy | 1886-12-12 | Bordeaux | Gironde | FRA | France | NaN | NaN | 1960-10-02 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
308403 | 2022.0 | Winter | Luge | Singles, Men (Olympic) | Marián Skupek | 148983 | SVK | None | 26.0 | False | None | Marián Skupek | 2001-07-12 | Gelnica | Košice | SVK | Slovakia | 196.0 | 108.0 | NaN |
308404 | 2022.0 | Winter | Alpine Skiing (Skiing) | Slalom, Women (Olympic) | Elsa Fermbäck | 148984 | SWE | None | 28.0 | False | None | Elsa Fermbäck | 1998-03-28 | Vemdalen | Jämtland | SWE | Sweden | NaN | NaN | NaN |
308405 | 2022.0 | Winter | Alpine Skiing (Skiing) | Team, Mixed (Olympic) | Hilma Lövblom | 148985 | SWE | Sweden | 13.0 | False | None | Hilma Lövblom | 2000-08-16 | Täby | Stockholm | SWE | Sweden | NaN | NaN | NaN |
308406 | 2022.0 | Winter | Alpine Skiing (Skiing) | Giant Slalom, Women (Olympic) | Hilma Lövblom | 148985 | SWE | None | NaN | False | None | Hilma Lövblom | 2000-08-16 | Täby | Stockholm | SWE | Sweden | NaN | NaN | NaN |
308407 | 2022.0 | Winter | None | Slalom, Women (Olympic) | Charlotta Säfvenberg | 148986 | None | None | 24.0 | False | None | Charlotta Säfvenberg | 1994-10-07 | Kungsbacka | Halland | SWE | Sweden | NaN | NaN | NaN |
308408 rows × 20 columns
Concatenate¶
In [ ]:
Copied!
iran = bios[bios['born_country'] == "IRAN"].copy()
usa = bios[bios['born_country'] == "USA"].copy()
iran = bios[bios['born_country'] == "IRAN"].copy()
usa = bios[bios['born_country'] == "USA"].copy()
In [ ]:
Copied!
iran.head()
iran.head()
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | |
---|---|---|---|---|---|---|---|---|---|---|
3887 | 3900 | Emanoul Aghasi | 1930-12-25 | Salmas | Azarbaijan Gharbi | IRAN | Iran | NaN | NaN | 2021-09-24 |
3888 | 3901 | Nasser Aghaie | 1944-01-20 | Tehran | Tehran | IRAN | Iran | 160.0 | 51.0 | 2009-12-27 |
3901 | 3914 | Ayoub Pourtaghi | 1973-01-01 | Orumiyeh (Urmia) | Azarbaijan Gharbi | IRAN | Iran | 182.0 | 81.0 | 2021-09-29 |
3905 | 3918 | Vazik Ghazarian | 1937-05-04 | Bahmanshir | Khuzestan | IRAN | Iran | 168.0 | 63.0 | NaN |
3907 | 3920 | Iraj Kia Rostami | 1963-07-07 | Tehran | Tehran | IRAN | Iran | 180.0 | 91.0 | 2015-08-06 |
In [ ]:
Copied!
usa.head()
usa.head()
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | |
---|---|---|---|---|---|---|---|---|---|---|
54 | 55 | Monique Javer | 1967-07-22 | Burlingame | California | USA | Great Britain | 177.0 | 64.0 | NaN |
960 | 964 | Xóchitl Escobedo | 1968-09-17 | West Covina | California | USA | Mexico | 170.0 | 60.0 | NaN |
961 | 965 | Angélica Gavaldón | 1973-10-03 | El Centro | California | USA | Mexico | 160.0 | 54.0 | NaN |
1231 | 1238 | Bert Schneider | 1897-07-01 | Cleveland | Ohio | USA | Canada | NaN | NaN | 1986-02-20 |
1345 | 1352 | Laura Berg | 1975-01-06 | Santa Fe Springs | California | USA | United States | 168.0 | 61.0 | NaN |
In [ ]:
Copied!
pd.concat([iran, usa])
pd.concat([iran, usa])
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | |
---|---|---|---|---|---|---|---|---|---|---|
3887 | 3900 | Emanoul Aghasi | 1930-12-25 | Salmas | Azarbaijan Gharbi | IRAN | Iran | NaN | NaN | 2021-09-24 |
3888 | 3901 | Nasser Aghaie | 1944-01-20 | Tehran | Tehran | IRAN | Iran | 160.0 | 51.0 | 2009-12-27 |
3901 | 3914 | Ayoub Pourtaghi | 1973-01-01 | Orumiyeh (Urmia) | Azarbaijan Gharbi | IRAN | Iran | 182.0 | 81.0 | 2021-09-29 |
3905 | 3918 | Vazik Ghazarian | 1937-05-04 | Bahmanshir | Khuzestan | IRAN | Iran | 168.0 | 63.0 | NaN |
3907 | 3920 | Iraj Kia Rostami | 1963-07-07 | Tehran | Tehran | IRAN | Iran | 180.0 | 91.0 | 2015-08-06 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
145445 | 149168 | Kristen Santos | 1994-11-02 | Fairfield | Connecticut | USA | United States | NaN | NaN | NaN |
145446 | 149169 | Corinne Stoddard | 2001-08-15 | Seattle | Washington | USA | United States | NaN | NaN | NaN |
145454 | 149180 | Anna Hoffmann | 2000-03-28 | Madison | Wisconsin | USA | United States | NaN | NaN | NaN |
145457 | 149183 | Alix Wilkinson | 2000-08-02 | Mammoth Lakes | California | USA | United States | NaN | NaN | NaN |
145468 | 149195 | Justin Abdelkader | 1987-02-25 | Muskegon | Michigan | USA | United States | 187.0 | 97.0 | NaN |
9993 rows × 10 columns
In [ ]:
Copied!
pd.concat([iran.drop(['name'], axis=1), usa])
pd.concat([iran.drop(['name'], axis=1), usa])
Out[ ]:
athlete_id | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | name | |
---|---|---|---|---|---|---|---|---|---|---|
3887 | 3900 | 1930-12-25 | Salmas | Azarbaijan Gharbi | IRAN | Iran | NaN | NaN | 2021-09-24 | NaN |
3888 | 3901 | 1944-01-20 | Tehran | Tehran | IRAN | Iran | 160.0 | 51.0 | 2009-12-27 | NaN |
3901 | 3914 | 1973-01-01 | Orumiyeh (Urmia) | Azarbaijan Gharbi | IRAN | Iran | 182.0 | 81.0 | 2021-09-29 | NaN |
3905 | 3918 | 1937-05-04 | Bahmanshir | Khuzestan | IRAN | Iran | 168.0 | 63.0 | NaN | NaN |
3907 | 3920 | 1963-07-07 | Tehran | Tehran | IRAN | Iran | 180.0 | 91.0 | 2015-08-06 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
145445 | 149168 | 1994-11-02 | Fairfield | Connecticut | USA | United States | NaN | NaN | NaN | Kristen Santos |
145446 | 149169 | 2001-08-15 | Seattle | Washington | USA | United States | NaN | NaN | NaN | Corinne Stoddard |
145454 | 149180 | 2000-03-28 | Madison | Wisconsin | USA | United States | NaN | NaN | NaN | Anna Hoffmann |
145457 | 149183 | 2000-08-02 | Mammoth Lakes | California | USA | United States | NaN | NaN | NaN | Alix Wilkinson |
145468 | 149195 | 1987-02-25 | Muskegon | Michigan | USA | United States | 187.0 | 97.0 | NaN | Justin Abdelkader |
9993 rows × 10 columns
In [ ]:
Copied!
pd.concat([iran, usa.drop(['name'], axis=1)])
pd.concat([iran, usa.drop(['name'], axis=1)])
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | |
---|---|---|---|---|---|---|---|---|---|---|
3887 | 3900 | Emanoul Aghasi | 1930-12-25 | Salmas | Azarbaijan Gharbi | IRAN | Iran | NaN | NaN | 2021-09-24 |
3888 | 3901 | Nasser Aghaie | 1944-01-20 | Tehran | Tehran | IRAN | Iran | 160.0 | 51.0 | 2009-12-27 |
3901 | 3914 | Ayoub Pourtaghi | 1973-01-01 | Orumiyeh (Urmia) | Azarbaijan Gharbi | IRAN | Iran | 182.0 | 81.0 | 2021-09-29 |
3905 | 3918 | Vazik Ghazarian | 1937-05-04 | Bahmanshir | Khuzestan | IRAN | Iran | 168.0 | 63.0 | NaN |
3907 | 3920 | Iraj Kia Rostami | 1963-07-07 | Tehran | Tehran | IRAN | Iran | 180.0 | 91.0 | 2015-08-06 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
145445 | 149168 | NaN | 1994-11-02 | Fairfield | Connecticut | USA | United States | NaN | NaN | NaN |
145446 | 149169 | NaN | 2001-08-15 | Seattle | Washington | USA | United States | NaN | NaN | NaN |
145454 | 149180 | NaN | 2000-03-28 | Madison | Wisconsin | USA | United States | NaN | NaN | NaN |
145457 | 149183 | NaN | 2000-08-02 | Mammoth Lakes | California | USA | United States | NaN | NaN | NaN |
145468 | 149195 | NaN | 1987-02-25 | Muskegon | Michigan | USA | United States | 187.0 | 97.0 | NaN |
9993 rows × 10 columns
Handle Null Values¶
Load and null some values¶
In [ ]:
Copied!
coffee_temp = coffee.copy()
coffee_temp = coffee.copy()
In [ ]:
Copied!
coffee_temp.loc[:9:2, 'Units Sold'] = np.nan
coffee_temp.loc[:9:2, 'Units Sold'] = np.nan
In [ ]:
Copied!
coffee_temp.head()
coffee_temp.head()
Out[ ]:
Day | Coffee Type | Units Sold | Price | Revenue | |
---|---|---|---|---|---|
0 | Monday | Espresso | NaN | 3.99 | 99.75 |
1 | Monday | Latte | 20.0 | 5.99 | 119.80 |
2 | Tuesday | Espresso | NaN | 3.99 | 139.65 |
3 | Tuesday | Latte | 20.0 | 5.99 | 119.80 |
4 | Wednesday | Espresso | NaN | 3.99 | 139.65 |
Find NaNs¶
In [ ]:
Copied!
coffee_temp.info()
coffee_temp.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 14 entries, 0 to 13 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Day 14 non-null object 1 Coffee Type 14 non-null object 2 Units Sold 9 non-null float64 3 Price 14 non-null float64 4 Revenue 14 non-null float64 dtypes: float64(3), object(2) memory usage: 692.0+ bytes
In [ ]:
Copied!
coffee_temp.isna().sum()
coffee_temp.isna().sum()
Out[ ]:
0 | |
---|---|
Day | 0 |
Coffee Type | 0 |
Units Sold | 5 |
Price | 0 |
Revenue | 0 |
In [ ]:
Copied!
coffee_temp[coffee_temp['Units Sold'].isna()]
# coffee_temp[coffee_temp['Units Sold'].notna()] # not nan units sold
coffee_temp[coffee_temp['Units Sold'].isna()]
# coffee_temp[coffee_temp['Units Sold'].notna()] # not nan units sold
Out[ ]:
Day | Coffee Type | Units Sold | Price | Revenue | |
---|---|---|---|---|---|
0 | Monday | Espresso | NaN | 3.99 | 99.75 |
2 | Tuesday | Espresso | NaN | 3.99 | 139.65 |
4 | Wednesday | Espresso | NaN | 3.99 | 139.65 |
6 | Thursday | Espresso | NaN | 3.99 | 159.60 |
8 | Friday | Espresso | NaN | 3.99 | 179.55 |
Handle NaNs¶
Fill¶
In [ ]:
Copied!
# You can use interpolate istead of mean that fill values base on the neighbors
coffee_temp['Units Sold'] = coffee_temp['Units Sold'].fillna(coffee_temp['Units Sold'].mean())
coffee_temp['Units Sold'] = coffee_temp['Units Sold'].astype(np.int32)
coffee_temp.head()
# You can use interpolate istead of mean that fill values base on the neighbors
coffee_temp['Units Sold'] = coffee_temp['Units Sold'].fillna(coffee_temp['Units Sold'].mean())
coffee_temp['Units Sold'] = coffee_temp['Units Sold'].astype(np.int32)
coffee_temp.head()
Out[ ]:
Day | Coffee Type | Units Sold | Price | Revenue | |
---|---|---|---|---|---|
0 | Monday | Espresso | 32 | 3.99 | 99.75 |
1 | Monday | Latte | 20 | 5.99 | 119.80 |
2 | Tuesday | Espresso | 32 | 3.99 | 139.65 |
3 | Tuesday | Latte | 20 | 5.99 | 119.80 |
4 | Wednesday | Espresso | 32 | 3.99 | 139.65 |
Drop¶
In [ ]:
Copied!
coffee_temp.loc[:9:2, 'Units Sold'] = np.nan
coffee_temp.loc[:9:2, 'Units Sold'] = np.nan
In [ ]:
Copied!
coffee_temp = coffee_temp.dropna()
# coffee_temp.dropna(subset=['Price'], inplace=True) # drop rows that doesn't have price
coffee_temp.head()
coffee_temp = coffee_temp.dropna()
# coffee_temp.dropna(subset=['Price'], inplace=True) # drop rows that doesn't have price
coffee_temp.head()
Out[ ]:
Day | Coffee Type | Units Sold | Price | Revenue | |
---|---|---|---|---|---|
1 | Monday | Latte | 20.0 | 5.99 | 119.80 |
3 | Tuesday | Latte | 20.0 | 5.99 | 119.80 |
5 | Wednesday | Latte | 25.0 | 5.99 | 149.75 |
7 | Thursday | Latte | 35.0 | 5.99 | 209.65 |
9 | Friday | Latte | 35.0 | 5.99 | 209.65 |
Aggregating Data¶
In [ ]:
Copied!
bios['NOC'] = bios['NOC'].replace(to_replace='Islamic Republic of Iran', value='Iran')
bios['born_country'] = bios['born_country'].replace(to_replace='IRI', value='IRAN')
bios_agg = bios.copy()
bios_agg.head()
bios['NOC'] = bios['NOC'].replace(to_replace='Islamic Republic of Iran', value='Iran')
bios['born_country'] = bios['born_country'].replace(to_replace='IRI', value='IRAN')
bios_agg = bios.copy()
bios_agg.head()
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Jean-François Blanchy | 1886-12-12 | Bordeaux | Gironde | FRA | France | NaN | NaN | 1960-10-02 |
1 | 2 | Arnaud Boetsch | 1969-04-01 | Meulan | Yvelines | FRA | France | 183.0 | 76.0 | NaN |
2 | 3 | Jean Borotra | 1898-08-13 | Biarritz | Pyrénées-Atlantiques | FRA | France | 183.0 | 76.0 | 1994-07-17 |
3 | 4 | Jacques Brugnon | 1895-05-11 | Paris VIIIe | Paris | FRA | France | 168.0 | 64.0 | 1978-03-20 |
4 | 5 | Albert Canet | 1878-04-17 | Wandsworth | England | GBR | France | NaN | NaN | 1930-07-25 |
Count¶
In [ ]:
Copied!
bios_agg[bios_agg["NOC"] == "Iran"].value_counts(subset=["NOC", "born_region"], dropna=False)
bios_agg[bios_agg["NOC"] == "Iran"].value_counts(subset=["NOC", "born_region"], dropna=False)
Out[ ]:
count | ||
---|---|---|
NOC | born_region | |
Iran | NaN | 240 |
Tehran | 108 | |
Mazandaran | 35 | |
Isfahan | 21 | |
Khuzestan | 20 | |
Azarbaijan Gharbi | 15 | |
Khorasan Razavi | 15 | |
Gilan | 12 | |
Azarbaijan Sharqi | 10 | |
Kermanshah | 9 | |
Lorestan | 9 | |
Fars | 9 | |
Chaharmahal-o-Bakhtiyari | 8 | |
Hamadan | 8 | |
Golestan | 7 | |
Ardabil | 6 | |
Qazvin | 5 | |
Markazi | 4 | |
Alborz | 4 | |
Kerman | 4 | |
Ilam | 4 | |
Zanjan | 3 | |
Kurdistan | 3 | |
Khorasan Jonubi | 2 | |
Yazd | 2 | |
England | 2 | |
Baghdad | 1 | |
Bakı | 1 | |
Genève | 1 | |
California | 1 | |
Bushehr | 1 | |
Moskva | 1 | |
Khorasan Shomali | 1 | |
Région de Bruxelles-Capitale | 1 | |
Rheinland-Pfalz | 1 | |
Qom | 1 | |
Vitebsk | 1 |
Group By¶
In [ ]:
Copied!
# mean of height and weight on each NOC
# bios_agg.groupby(by=['born_country'])['height_cm'].mean()
bios_agg.groupby(by=['NOC']).agg({'height_cm': 'mean', 'weight_kg': 'mean'})
# mean of height and weight on each NOC
# bios_agg.groupby(by=['born_country'])['height_cm'].mean()
bios_agg.groupby(by=['NOC']).agg({'height_cm': 'mean', 'weight_kg': 'mean'})
Out[ ]:
height_cm | weight_kg | |
---|---|---|
NOC | ||
Afghanistan | 170.170732 | 67.076923 |
Albania | 174.200000 | 74.285714 |
Albania Australia | 166.000000 | 65.000000 |
Albania Bulgaria | 189.000000 | 100.000000 |
Albania Greece | 165.000000 | 64.000000 |
... | ... | ... |
West Indies Federation | 176.000000 | 71.500000 |
Yemen | 169.391304 | 60.476190 |
Yugoslavia | 180.035980 | 76.444444 |
Zambia | 169.852174 | 64.808511 |
Zimbabwe | 173.335196 | 68.089888 |
696 rows × 2 columns
Pivot¶
In [ ]:
Copied!
coffee
coffee
Out[ ]:
Day | Coffee Type | Units Sold | Price | Revenue | |
---|---|---|---|---|---|
0 | Monday | Espresso | 25 | 3.99 | 99.75 |
1 | Monday | Latte | 20 | 5.99 | 119.80 |
2 | Tuesday | Espresso | 35 | 3.99 | 139.65 |
3 | Tuesday | Latte | 20 | 5.99 | 119.80 |
4 | Wednesday | Espresso | 35 | 3.99 | 139.65 |
5 | Wednesday | Latte | 25 | 5.99 | 149.75 |
6 | Thursday | Espresso | 40 | 3.99 | 159.60 |
7 | Thursday | Latte | 35 | 5.99 | 209.65 |
8 | Friday | Espresso | 45 | 3.99 | 179.55 |
9 | Friday | Latte | 35 | 5.99 | 209.65 |
10 | Saturday | Espresso | 45 | 3.99 | 179.55 |
11 | Saturday | Latte | 35 | 5.99 | 209.65 |
12 | Sunday | Espresso | 45 | 3.99 | 179.55 |
13 | Sunday | Latte | 35 | 5.99 | 209.65 |
In [ ]:
Copied!
pivot = coffee.pivot(index = "Day", columns="Coffee Type", values="Units Sold")
pivot = coffee.pivot(index = "Day", columns="Coffee Type", values="Units Sold")
In [ ]:
Copied!
pivot
pivot
Out[ ]:
Coffee Type | Espresso | Latte |
---|---|---|
Day | ||
Friday | 45 | 35 |
Monday | 25 | 20 |
Saturday | 45 | 35 |
Sunday | 45 | 35 |
Thursday | 40 | 35 |
Tuesday | 35 | 20 |
Wednesday | 35 | 25 |
In [ ]:
Copied!
pivot.sum()
pivot.sum()
Out[ ]:
0 | |
---|---|
Coffee Type | |
Espresso | 270 |
Latte | 205 |
In [ ]:
Copied!
pivot.sum(axis=1)
pivot.sum(axis=1)
Out[ ]:
0 | |
---|---|
Day | |
Friday | 80 |
Monday | 45 |
Saturday | 80 |
Sunday | 80 |
Thursday | 75 |
Tuesday | 55 |
Wednesday | 60 |
In [ ]:
Copied!
pivot.loc["Monday"]
pivot.loc["Monday"]
Out[ ]:
Monday | |
---|---|
Coffee Type | |
Espresso | 25 |
Latte | 20 |
Advanced Exmple¶
Group by born_date
In [ ]:
Copied!
bios_agg.dropna(subset=["born_date"], inplace=True) # drop if born_date is nan
bios_agg.dropna(subset=["born_date"], inplace=True) # drop if born_date is nan
In [ ]:
Copied!
bios_agg["born_date_dt"] = pd.to_datetime(bios_agg["born_date"])
bios_agg["born_date_dt"] = pd.to_datetime(bios_agg["born_date"])
In [ ]:
Copied!
bios_agg.head(2)
bios_agg.head(2)
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | born_date_dt | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Jean-François Blanchy | 1886-12-12 | Bordeaux | Gironde | FRA | France | NaN | NaN | 1960-10-02 | 1886-12-12 |
1 | 2 | Arnaud Boetsch | 1969-04-01 | Meulan | Yvelines | FRA | France | 183.0 | 76.0 | NaN | 1969-04-01 |
In [ ]:
Copied!
bios_agg.groupby(by=bios_agg["born_date_dt"].dt.year).agg({"height_cm": "mean", "name": "count"}).sort_values(by="name", ascending=False).reset_index()
bios_agg.groupby(by=bios_agg["born_date_dt"].dt.year).agg({"height_cm": "mean", "name": "count"}).sort_values(by="name", ascending=False).reset_index()
Out[ ]:
born_date_dt | height_cm | name | |
---|---|---|---|
0 | 1972 | 176.674755 | 2231 |
1 | 1985 | 176.445522 | 2227 |
2 | 1973 | 177.127033 | 2216 |
3 | 1971 | 176.950644 | 2205 |
4 | 1970 | 177.203734 | 2174 |
... | ... | ... | ... |
172 | 1837 | NaN | 1 |
173 | 1833 | NaN | 1 |
174 | 1839 | NaN | 1 |
175 | 1845 | NaN | 1 |
176 | 2009 | NaN | 1 |
177 rows × 3 columns
Advance Functionalities¶
Shift¶
In [ ]:
Copied!
coffee['yesterday_revenue'] = coffee["Revenue"].shift(2)
coffee['yesterday_revenue'] = coffee["Revenue"].shift(2)
In [ ]:
Copied!
coffee.head()
coffee.head()
Out[ ]:
Day | Coffee Type | Units Sold | Price | Revenue | yesterday_revenue | |
---|---|---|---|---|---|---|
0 | Monday | Espresso | 25 | 3.99 | 99.75 | NaN |
1 | Monday | Latte | 20 | 5.99 | 119.80 | NaN |
2 | Tuesday | Espresso | 35 | 3.99 | 139.65 | 99.75 |
3 | Tuesday | Latte | 20 | 5.99 | 119.80 | 119.80 |
4 | Wednesday | Espresso | 35 | 3.99 | 139.65 | 139.65 |
Rank¶
compare with other samples
In [ ]:
Copied!
bios_agg.head()
bios_agg.head()
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | born_date_dt | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Jean-François Blanchy | 1886-12-12 | Bordeaux | Gironde | FRA | France | NaN | NaN | 1960-10-02 | 1886-12-12 |
1 | 2 | Arnaud Boetsch | 1969-04-01 | Meulan | Yvelines | FRA | France | 183.0 | 76.0 | NaN | 1969-04-01 |
2 | 3 | Jean Borotra | 1898-08-13 | Biarritz | Pyrénées-Atlantiques | FRA | France | 183.0 | 76.0 | 1994-07-17 | 1898-08-13 |
3 | 4 | Jacques Brugnon | 1895-05-11 | Paris VIIIe | Paris | FRA | France | 168.0 | 64.0 | 1978-03-20 | 1895-05-11 |
4 | 5 | Albert Canet | 1878-04-17 | Wandsworth | England | GBR | France | NaN | NaN | 1930-07-25 | 1878-04-17 |
In [ ]:
Copied!
bios_agg["weight_rank"] = bios_agg["weight_kg"].rank()
bios_agg["weight_rank"] = bios_agg["weight_kg"].rank()
In [ ]:
Copied!
bios_agg.sort_values(by=["weight_rank"], ascending=False)
bios_agg.sort_values(by=["weight_rank"], ascending=False)
Out[ ]:
athlete_id | name | born_date | born_city | born_region | born_country | NOC | height_cm | weight_kg | died_date | born_date_dt | weight_rank | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
105715 | 106771 | Aythami Ruano | 1977-06-18 | Las Palmas de Gran Canaria | Las Palmas | ESP | Spain | 200.0 | 198.0 | NaN | 1977-06-18 | 101977.0 |
59204 | 59630 | Marek Galiński | 1951-05-13 | Wrocław | Dolnośląskie | POL | Poland | 200.0 | 190.0 | 1999-09-28 | 1951-05-13 | 101976.0 |
60344 | 60777 | Chris Taylor | 1950-06-13 | Dowagiac | Michigan | USA | United States | 196.0 | 182.0 | 1979-06-30 | 1950-06-13 | 101975.0 |
91712 | 92442 | Valentyn Rusliakov | 1972-02-03 | NaN | NaN | NaN | Ukraine | 187.0 | 180.0 | NaN | 1972-02-03 | 101974.0 |
105722 | 106778 | Leonel Wilfredo Ruiz | 1975-07-27 | NaN | NaN | NaN | Venezuela | 187.0 | 178.0 | NaN | 1975-07-27 | 101973.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
145492 | 149219 | Carlos García-Ordóñez | 1927-04-24 | La Habana (Havana) | Ciudad de La Habana | CUB | Cuba | NaN | NaN | 2019-11-24 | 1927-04-24 | NaN |
145493 | 149220 | Landysh Falyakhova | 1998-08-31 | Dva Polya Artash | Respublika Tatarstan | RUS | ROC | NaN | NaN | NaN | 1998-08-31 | NaN |
145494 | 149221 | George Wright | 1890-07-21 | Toronto | Ontario | CAN | Canada | 183.0 | NaN | 1973-01-28 | 1890-07-21 | NaN |
145498 | 149225 | André Foussard | 1899-05-19 | Niort | Deux-Sèvres | FRA | France | 166.0 | NaN | 1986-03-18 | 1899-05-19 | NaN |
145499 | 149814 | Bill Phillips | 1913-07-15 | Dulwich Hill | New South Wales | AUS | Australia | NaN | NaN | 2003-10-20 | 1913-07-15 | NaN |
143693 rows × 12 columns
Rolling¶
In [ ]:
Copied!
# Add previous rows to current by cumsum
coffee["cumsum_Revenue"] = coffee["Revenue"].cumsum()
coffee.head()
# Add previous rows to current by cumsum
coffee["cumsum_Revenue"] = coffee["Revenue"].cumsum()
coffee.head()
Out[ ]:
Day | Coffee Type | Units Sold | Price | Revenue | yesterday_revenue | cumsum_Revenue | |
---|---|---|---|---|---|---|---|
0 | Monday | Espresso | 25 | 3.99 | 99.75 | NaN | 99.75 |
1 | Monday | Latte | 20 | 5.99 | 119.80 | NaN | 219.55 |
2 | Tuesday | Espresso | 35 | 3.99 | 139.65 | 99.75 | 359.20 |
3 | Tuesday | Latte | 20 | 5.99 | 119.80 | 119.80 | 479.00 |
4 | Wednesday | Espresso | 35 | 3.99 | 139.65 | 139.65 | 618.65 |
In [ ]:
Copied!
# Add previous rows to current by cumsum
coffee["3_days_Revenue"] = coffee["Revenue"].rolling(6).sum() # each 3 days
coffee.head(12)
# Add previous rows to current by cumsum
coffee["3_days_Revenue"] = coffee["Revenue"].rolling(6).sum() # each 3 days
coffee.head(12)
Out[ ]:
Day | Coffee Type | Units Sold | Price | Revenue | yesterday_revenue | cumsum_Revenue | week_Revenue | 3_days_Revenue | |
---|---|---|---|---|---|---|---|---|---|
0 | Monday | Espresso | 25 | 3.99 | 99.75 | NaN | 99.75 | NaN | NaN |
1 | Monday | Latte | 20 | 5.99 | 119.80 | NaN | 219.55 | NaN | NaN |
2 | Tuesday | Espresso | 35 | 3.99 | 139.65 | 99.75 | 359.20 | NaN | NaN |
3 | Tuesday | Latte | 20 | 5.99 | 119.80 | 119.80 | 479.00 | NaN | NaN |
4 | Wednesday | Espresso | 35 | 3.99 | 139.65 | 139.65 | 618.65 | NaN | NaN |
5 | Wednesday | Latte | 25 | 5.99 | 149.75 | 119.80 | 768.40 | NaN | 768.40 |
6 | Thursday | Espresso | 40 | 3.99 | 159.60 | 139.65 | 928.00 | NaN | 828.25 |
7 | Thursday | Latte | 35 | 5.99 | 209.65 | 149.75 | 1137.65 | NaN | 918.10 |
8 | Friday | Espresso | 45 | 3.99 | 179.55 | 159.60 | 1317.20 | NaN | 958.00 |
9 | Friday | Latte | 35 | 5.99 | 209.65 | 209.65 | 1526.85 | NaN | 1047.85 |
10 | Saturday | Espresso | 45 | 3.99 | 179.55 | 179.55 | 1706.40 | NaN | 1087.75 |
11 | Saturday | Latte | 35 | 5.99 | 209.65 | 209.65 | 1916.05 | NaN | 1147.65 |
New Functionalities¶
In [ ]:
Copied!
pd.__version__
pd.__version__
Out[ ]:
'2.2.2'
In [ ]:
Copied!
bios_numpy = pd.read_csv("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/bios.csv")
bios_pyarrow = pd.read_csv("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/bios.csv", engine="pyarrow", dtype_backend="pyarrow")
bios_numpy = pd.read_csv("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/bios.csv")
bios_pyarrow = pd.read_csv("https://github.com/KeithGalli/complete-pandas-tutorial/raw/refs/heads/master/data/bios.csv", engine="pyarrow", dtype_backend="pyarrow")
In [ ]:
Copied!
bios_numpy.info()
bios_numpy.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 145500 entries, 0 to 145499 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 athlete_id 145500 non-null int64 1 name 145500 non-null object 2 born_date 143693 non-null object 3 born_city 110908 non-null object 4 born_region 110908 non-null object 5 born_country 110908 non-null object 6 NOC 145499 non-null object 7 height_cm 106651 non-null float64 8 weight_kg 102070 non-null float64 9 died_date 33940 non-null object dtypes: float64(2), int64(1), object(7) memory usage: 11.1+ MB
In [ ]:
Copied!
bios_pyarrow.info()
bios_pyarrow.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 145500 entries, 0 to 145499 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 athlete_id 145500 non-null int64[pyarrow] 1 name 145500 non-null string[pyarrow] 2 born_date 143693 non-null date32[day][pyarrow] 3 born_city 110908 non-null string[pyarrow] 4 born_region 110908 non-null string[pyarrow] 5 born_country 110908 non-null string[pyarrow] 6 NOC 145499 non-null string[pyarrow] 7 height_cm 106651 non-null double[pyarrow] 8 weight_kg 102070 non-null double[pyarrow] 9 died_date 33940 non-null date32[day][pyarrow] dtypes: date32[day][pyarrow](2), double[pyarrow](2), int64[pyarrow](1), string[pyarrow](5) memory usage: 13.0 MB
Compare Speeds¶
In [ ]:
Copied!
bios_numpy["name"].str.contains("Reza")
bios_numpy["name"].str.contains("Reza")
Out[ ]:
name | |
---|---|
0 | False |
1 | False |
2 | False |
3 | False |
4 | False |
... | ... |
145495 | False |
145496 | False |
145497 | False |
145498 | False |
145499 | False |
145500 rows × 1 columns
In [ ]:
Copied!
bios_pyarrow["name"].str.contains("Reza")
bios_pyarrow["name"].str.contains("Reza")
Out[ ]:
name | |
---|---|
0 | False |
1 | False |
2 | False |
3 | False |
4 | False |
... | ... |
145495 | False |
145496 | False |
145497 | False |
145498 | False |
145499 | False |
145500 rows × 1 columns
Real Project¶
In [ ]:
Copied!