Explore a dataset on the modern Olympic Games, including all the Games from Athens 1896 to Rio 2016
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
data = pd.read_csv('../input/athlete-events/athlete_events.csv')
regions = pd.read_csv('../input/athlete-events/noc_regions.csv')
data.describe()
data.info()
regions = pd.read_csv('../input/athlete-events/noc_regions.csv')
regions.head()
merged = pd.merge(data, regions, on='NOC', how='left')
merged.head()
goldMedals = merged[(merged.Medal == 'Gold')]
goldMedals.head()
goldMedals.isnull().any()
goldMedals = goldMedals[np.isfinite(goldMedals['Age'])]
plt.figure(figsize=(20, 10))
plt.tight_layout()
sns.countplot(goldMedals['Age'])
plt.title('Distribution of Gold Medals')
goldMedals['ID'][goldMedals['Age'] > 50].count()
masterDisciplines = goldMedals['Sport'][goldMedals['Age'] > 50]
plt.figure(figsize=(20, 10))
plt.tight_layout()
sns.countplot(masterDisciplines)
plt.title('Gold Medals for Athletes Over 50')
womenInOlympics = merged[(merged.Sex == 'F') & (merged.Season == 'Summer')]
womenInOlympics.head(10)
sns.set(style="darkgrid")
plt.figure(figsize=(20, 10))
sns.countplot(x='Year', data=womenInOlympics)
plt.title('Women medals per edition of the Games')
womenInOlympics.loc[womenInOlympics['Year'] == 1900].head(10)
womenInOlympics['ID'].loc[womenInOlympics['Year'] == 1900].count()
goldMedals.region.value_counts().reset_index(name='Medal').head()
totalGoldMedals = goldMedals.region.value_counts().reset_index(name='Medal').head(5)
g = sns.catplot(x="index", y="Medal", data=totalGoldMedals,
height=6, kind="bar", palette="muted")
g.despine(left=True)
g.set_xlabels("Top 5 countries")
g.set_ylabels("Number of Medals")
plt.title('Medals per Country')
goldMedalsUSA = goldMedals.loc[goldMedals['NOC'] == 'USA']
goldMedalsUSA.Event.value_counts().reset_index(name='Medal').head(20)
basketballGoldUSA = goldMedalsUSA.loc[(goldMedalsUSA['Sport'] == 'Basketball') & (goldMedalsUSA['Sex'] == 'M')].sort_values(['Year'])
basketballGoldUSA.head(15)
groupedBasketUSA = basketballGoldUSA.groupby(['Year']).first()
groupedBasketUSA
groupedBasketUSA['ID'].count()
goldMedals.head()
We can see that we have NaN values both in height and weight columns
At this point, we can act as follows:
- Using only the rows that has a value in the Height and Weight columns;
- Replace the value with the mean of the column.
Solution 2 in my opinion it is not the best way to go: we are talking about data of athletes of different ages and different disciplines (that have done different training).
Let’s go with solution 1.
The first thing to do is to collect general information about the dataframe that we have to use: goldMedals.
goldMedals.info()
notNullMedals = goldMedals[(goldMedals['Height'].notnull()) & (goldMedals['Weight'].notnull())]
plt.figure(figsize=(12, 10))
ax = sns.scatterplot(x="Height", y="Weight", data=notNullMedals)
plt.title('Height vs Weight of Olympic Medalists')
notNullMedals.loc[notNullMedals['Weight'] > 160]
MenOverTime = merged[(merged.Sex == 'M') & (merged.Season == 'Summer')]
WomenOverTime = merged[(merged.Sex == 'F') & (merged.Season == 'Summer')]
MenOverTime.head()
part = MenOverTime.groupby('Year')['Sex'].value_counts()
plt.figure(figsize=(20, 10))
part.loc[:,'M'].plot()
plt.title('Variation of Male Athletes over time')
part = WomenOverTime.groupby('Year')['Sex'].value_counts()
plt.figure(figsize=(20, 10))
part.loc[:,'F'].plot()
plt.title('Variation of Female Athletes over time')
What I immediately saw is that for women:
- We have a steep increase in the population;
- The grow is constant.
On the other hand, the grow for men seems less strong:
- After the 1990 we can see a relevant decrease in the number of male athletes at the summer games;
- The growth has slowly restarted recently.
plt.figure(figsize=(20, 10))
sns.boxplot('Year', 'Age', data=MenOverTime)
plt.title('Variation of Age for Male Athletes over time')
MenOverTime.loc[MenOverTime['Age'] > 80].head(10)
plt.figure(figsize=(20, 10))
sns.boxplot('Year', 'Age', data=WomenOverTime)
plt.title('Variation of Age for Female Athletes over time')
WomenOverTime.loc[WomenOverTime['Year'] == 1904]
plt.figure(figsize=(20, 10))
sns.pointplot('Year', 'Weight', data=MenOverTime)
plt.title('Variation of Weight for Male Athletes over time')
plt.figure(figsize=(20, 10))
sns.pointplot('Year', 'Weight', data=WomenOverTime)
plt.title('Variation of Weight for Female Athletes over time')
womenInOlympics.loc[womenInOlympics['Year'] < 1924].head(20)
womenInOlympics.loc[womenInOlympics['Year'] < 1924].head(20)
plt.figure(figsize=(20, 10))
sns.pointplot('Year', 'Height', data=WomenOverTime, palette='Set2')
plt.title('Variation of Height for Female Athletes over time')
WomenOverTime.loc[(WomenOverTime['Year'] > 1924) & (WomenOverTime['Year'] < 1952)].head(10)
MenOverTime.head()
itMenOverTime = MenOverTime.loc[MenOverTime['region'] == 'Italy']
sns.set(style="darkgrid")
plt.figure(figsize=(20, 10))
sns.countplot(x='Year', data=itMenOverTime, palette='Set2')
plt.title('Variation of Age for Italian Male Athletes over time')
itWomenOverTime = WomenOverTime.loc[WomenOverTime['region'] == 'Italy']
sns.set(style="darkgrid")
plt.figure(figsize=(20, 10))
sns.countplot(x='Year', data=itWomenOverTime, palette='Set2')
plt.title('Variation of Age for Italian Female Athletes over time')
MenOverTime['Sport'].unique().tolist()
gymMenOverTime = MenOverTime.loc[MenOverTime['Sport'] == 'Gymnastics']
gymWomenOverTime = WomenOverTime.loc[WomenOverTime['Sport'] == 'Gymnastics']
plt.figure(figsize=(20, 10))
sns.barplot('Year', 'Weight', data=gymMenOverTime)
plt.title('Weight over year for Male Gymnasts')
plt.figure(figsize=(20, 10))
sns.barplot('Year', 'Height', data=gymMenOverTime)
plt.title('Height over year for Male Gymnasts')
plt.figure(figsize=(20, 10))
sns.barplot('Year', 'Weight', data=gymWomenOverTime)
plt.title('Weight over year for Female Gymnasts')
plt.figure(figsize=(20, 10))
sns.barplot('Year', 'Height', data=gymWomenOverTime)
plt.title('Height over year for Female Gymnasts')
gymMenOverTime['Weight'].loc[gymMenOverTime['Year'] == 1924].isnull().all()
wlMenOverTime = MenOverTime.loc[MenOverTime['Sport'] == 'Weightlifting']
wlWomenOverTime = WomenOverTime.loc[WomenOverTime['Sport'] == 'Weightlifting']
plt.figure(figsize=(20, 10))
sns.pointplot('Year', 'Weight', data=wlMenOverTime, palette='Set2')
plt.title('Weight over year for Male Lifters')
plt.figure(figsize=(20, 10))
sns.pointplot('Year', 'Height', data=wlMenOverTime, palette='Set2')
plt.title('Height over year for Male Lifters')
plt.figure(figsize=(20, 10))
sns.pointplot('Year', 'Weight', data=wlWomenOverTime)
plt.title('Weight over year for Female Lifters')
plt.figure(figsize=(20, 10))
sns.pointplot('Year', 'Height', data=wlWomenOverTime)
plt.title('Height over year for Female Lifters')