#import Data Packages to work with the data

import pandas                 as pd       # data science essentials
import numpy                  as np       # arithmetic operations essentials
import matplotlib.pyplot      as plt      # data visualization essentials
import seaborn                as sns      # enhanced data visualization

#Import Final Project Dataset and read it with pd

file  = 'Final_Project_Dataset.xlsx'
final = pd.read_excel(file)


#Subsetting the Data to focus only in our region

df_wheel = final[final["Cool Name"] == "Wheeljack"]
df_bumb = final[final["Cool Name"] == "Bumblebee"]
df_rat  = final[final["Cool Name"] == "Ratchet"]
df_mir  = final[final["Cool Name"] == "Mirage"]
df_side = final[final["Cool Name"] == "Sideswipe"]
df_sm   = final[final["Cool Name"] == "Skids & Mudflap"]
df_iron = final[final["Cool Name"] == "Ironhide"]
df_clif = final[final["Cool Name"] == "Cliffhanger"]
df_jet = final[final["Cool Name"] == "Jetfire"]
df_jaz = final[final["Cool Name"] == "Jazz"]
df_dino = final[final["Cool Name"] == "Dinobots"]
df_hot = final[final["Cool Name"] == "Hot Rod"]
df_opt = final[final["Cool Name"] == "Optimus Prime"]
df_sent = final[final["Cool Name"] == "Sentinel Prime"]

#See the information to see if there are missing values

#df_wheel.info()


#Putting everything in lower case for practicity and changing names for our Region

df_wheel.columns = df_wheel.columns.str.lower()
df_bumb.columns = df_bumb.columns.str.lower()
df_rat.columns = df_rat.columns.str.lower()
df_mir.columns = df_mir.columns.str.lower()
df_side.columns = df_side.columns.str.lower()
df_sm.columns = df_sm.columns.str.lower()
df_iron.columns = df_iron.columns.str.lower()
df_clif.columns = df_clif.columns.str.lower()
df_jet.columns = df_jet.columns.str.lower()
df_jaz.columns = df_jaz.columns.str.lower()
df_dino.columns = df_dino.columns.str.lower()
df_hot.columns = df_hot.columns.str.lower()
df_opt.columns = df_opt.columns.str.lower()
df_sent.columns = df_sent.columns.str.lower()

#Changing Column names

new_names = {
    'country code total'                                                       : 'country_code', 
    'country name'                                                             : 'country_name',
    'hult region'                                                              : 'hult_region', 
    'cool name'                                                                : 'cool_name', 
    'aids estimated deaths (unaids estimates)'                                 : 'est_deaths_by _aids', 
    'adjusted net enrollment rate, primary (% of primary school age children)' : 'enrollment_rate',
    'adolescent fertility rate (births per 1,000 women ages 15-19)'            : 'adolescent _fertility_rate', 
    'antiretroviral therapy coverage (% of people living with hiv)'            : 'art_therapy for aids', 
    'births attended by skilled health staff (% of total)'                     : 'births_under_obstetrician ', 
    'co2 emissions (metric tons per capita)'                                   : 'co2_emissions', 
    'contributing family workers, female (% of female employment)'             : 'female_employment_%',
    'contributing family workers, male (% of male employment)'                 : 'male_employment_%',
    'contributing family workers, total (% of total employment)'               : 'total_employment', 
    'employment to population ratio, 15+, female (%) (modeled ilo estimate)'   : 'female_employment_ratio',
    'employment to population ratio, 15+, male (%) (modeled ilo estimate)'     : 'male_employment _ratio', 
    'employment to population ratio, 15+, total (%) (modeled ilo estimate)'    : 'total_employment_ratio',
    'energy use (kg of oil equivalent) per $1,000 gdp (constant 2011 ppp)'     : 'energy_use',
    'fertility rate, total (births per woman)'                                 : 'fertility_rate', 
    'gdp per person employed (constant 2011 ppp $)'                            : 'gdp_per_person', 
    'gdp per unit of energy use (constant 2011 ppp $ per kg of oil equivalent)': 'gdp_per_energy use', 
    'gni per capita, atlas method (current US$)'                               : 'gross_national_income',
    'immunization, measles (% of children ages 12-23 months)'                  : 'measles_immunization_for kids', 
    'improved sanitation facilities (% of population with access)'             : 'improved_sanitation facilties', 
    'improved water source (% of population with access)'                      : 'improved_water resources', 
    'incidence of tuberculosis (per 100,000 people)'                           : 'tuberculosis_infections',
    'income share held by lowest 20%'                                          : 'income_share_lowest 20',
    'internet users (per 100 people)'                                          : 'internet_users',
    'life expectancy at birth, total (years)'                                  : 'life_expectancy', 
    'literacy rate, adult total (% of people ages 15 and above)'               : 'literacy_rate',
    'maternal mortality ratio (modeled estimate, per 100,000 live births)'     : 'maternal_mortality_ratio',
    'mobile cellular subscriptions (per 100 people)'                           : 'mobile_subscriptions', 
    'mortality rate, infant (per 1,000 live births)'                           : 'infant_mortality_ratio',
    'net oda received per capita (current US$)'                                : 'official_development_received',
    'population, total'                                                        : 'total_population',
    'poverty gap at national poverty lines (%)'                                : 'poverty_gap',
    'pregnant women receiving prenatal care (%)'                               : 'prenatal_care',
    'prevalence of hiv, total (% of population ages 15-49)'                    : 'hiv_prevention', 
    'prevalence of undernourishment (% of population)'                         : 'undernourishment_prevention', 
    'primary completion rate, total (% of relevant age group)'                 : 'primary_education_completion',
    'proportion of seats held by women in national parliaments (%)'            : 'women_in_parliament',
    'reported cases of malaria'                                                : 'malaria_infection',
    'school enrollment, primary (% net)'                                       : 'primary_school_enrolment', 
    'self-employed, total (% of total employment)'                             : 'self_employed', 
    'trade (% of gdp)'                                                         : 'trade', 
    'tuberculosis death rate (per 100,000 people), including hiv'              : 'death_by_tuberculosis_with aids' }

df_wheel_new = df_wheel.rename(columns=new_names)
df_bumb.rename(columns=new_names, inplace=True)
df_rat.rename(columns=new_names, inplace=True)
df_mir.rename(columns=new_names, inplace=True)
df_side.rename(columns=new_names, inplace=True)
df_sm.rename(columns=new_names, inplace=True)
df_iron.rename(columns=new_names, inplace=True)
df_clif.rename(columns=new_names, inplace=True)
df_jet.rename(columns=new_names, inplace=True)
df_jaz.rename(columns=new_names, inplace=True)
df_dino.rename(columns=new_names, inplace=True)
df_hot.rename(columns=new_names, inplace=True)
df_opt.rename(columns=new_names, inplace=True)
df_sent.rename(columns=new_names, inplace=True)

/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py:4295: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


#Calculate the means for each region and assigning them a new name to imput later

wheel= df_wheel_new[['women_in_parliament',
              'maternal_mortality_ratio',
              'tuberculosis_infections',
             'improved_sanitation facilties',
             'mobile_subscriptions',
             'trade',
             'gdp_per_person',
             'co2_emissions',
             'enrollment_rate']].mean(axis = 0).round(decimals = 2)

bumb = df_bumb[['women_in_parliament',
              'maternal_mortality_ratio',
              'tuberculosis_infections',
             'improved_sanitation facilties',
             'mobile_subscriptions',
             'trade',
             'gdp_per_person',
             'co2_emissions',
             'enrollment_rate','literacy_rate']].mean(axis = 0).round(decimals = 2)

rat = df_rat[['women_in_parliament',
              'maternal_mortality_ratio',
              'tuberculosis_infections',
             'improved_sanitation facilties',
             'mobile_subscriptions',
             'trade',
             'gdp_per_person',
             'co2_emissions',
             'enrollment_rate','literacy_rate']].mean(axis = 0).round(decimals = 2)

mir = df_mir[['women_in_parliament',
              'maternal_mortality_ratio',
              'tuberculosis_infections',
             'improved_sanitation facilties',
             'mobile_subscriptions',
             'trade',
             'gdp_per_person',
             'co2_emissions',
             'enrollment_rate','literacy_rate']].mean(axis = 0).round(decimals = 2)

side = df_side[['women_in_parliament',
              'maternal_mortality_ratio',
              'tuberculosis_infections',
             'improved_sanitation facilties',
             'mobile_subscriptions',
             'trade',
             'gdp_per_person',
             'co2_emissions',
             'enrollment_rate','literacy_rate']].mean(axis = 0).round(decimals = 2)

sm = df_sm[['women_in_parliament',
              'maternal_mortality_ratio',
              'tuberculosis_infections',
             'improved_sanitation facilties',
             'mobile_subscriptions',
             'trade',
             'gdp_per_person',
             'co2_emissions',
             'enrollment_rate','literacy_rate']].mean(axis = 0).round(decimals = 2)

iron = df_iron[['women_in_parliament',
              'maternal_mortality_ratio',
              'tuberculosis_infections',
             'improved_sanitation facilties',
             'mobile_subscriptions',
             'trade',
             'gdp_per_person',
             'co2_emissions',
             'enrollment_rate','literacy_rate']].mean(axis = 0).round(decimals = 2)

clif = df_clif[['women_in_parliament',
              'maternal_mortality_ratio',
              'tuberculosis_infections',
             'improved_sanitation facilties',
             'mobile_subscriptions',
             'trade',
             'gdp_per_person',
             'co2_emissions',
             'enrollment_rate','literacy_rate']].mean(axis = 0).round(decimals = 2)

jet = df_jet[['women_in_parliament',
              'maternal_mortality_ratio',
              'tuberculosis_infections',
             'improved_sanitation facilties',
             'mobile_subscriptions',
             'trade',
             'gdp_per_person',
             'co2_emissions',
             'enrollment_rate','literacy_rate']].mean(axis = 0).round(decimals = 2)

jaz = df_jaz[['women_in_parliament',
              'maternal_mortality_ratio',
              'tuberculosis_infections',
             'improved_sanitation facilties',
             'mobile_subscriptions',
             'trade',
             'gdp_per_person',
             'co2_emissions',
             'enrollment_rate','literacy_rate']].mean(axis = 0).round(decimals = 2)

dino = df_dino[['women_in_parliament',
              'maternal_mortality_ratio',
              'tuberculosis_infections',
             'improved_sanitation facilties',
             'mobile_subscriptions',
             'trade',
             'gdp_per_person',
             'co2_emissions',
             'enrollment_rate','literacy_rate']].mean(axis = 0).round(decimals = 2)

hot = df_hot[['women_in_parliament',
              'maternal_mortality_ratio',
              'tuberculosis_infections',
             'improved_sanitation facilties',
             'mobile_subscriptions',
             'trade',
             'gdp_per_person',
             'co2_emissions',
             'enrollment_rate','literacy_rate']].mean(axis = 0).round(decimals = 2)

opt = df_opt[['women_in_parliament',
              'maternal_mortality_ratio',
              'tuberculosis_infections',
             'improved_sanitation facilties',
             'mobile_subscriptions',
             'trade',
             'gdp_per_person',
             'co2_emissions',
             'enrollment_rate','literacy_rate']].mean(axis = 0).round(decimals = 2)

sent = df_sent[['women_in_parliament',
              'maternal_mortality_ratio',
              'tuberculosis_infections',
             'improved_sanitation facilties',
             'mobile_subscriptions',
             'trade',
             'gdp_per_person',
             'co2_emissions',
             'enrollment_rate','literacy_rate']].mean(axis = 0).round(decimals = 2)


#Create a Dataset with the means by region

df = pd.DataFrame({'Arabian Peninsula'                         : wheel,
                   'Western Europe'                            : jet,
                   'East/South Africa'                         : jaz,
                   'Carribean Islands & Parts of Latin America': rat,
                   'Middle East & Russia'                      : mir,
                   'Sub-Saharan Africa'                        : side,
                   'Eastern Europe'                            : sm,
                   'South America'                             : iron,
                   'Australia & Oceania'                       : clif,
                   'Central Asia'                              : dino,
                   'North America & Part of Latin America'     : hot,
                   'East/Southeast Asia'                       : opt,
                   'Northern Sub-Saharan Africa'               : bumb,})

# Transpose the DataFrame for practicity

dft = df.transpose()


#Histogram of the mean for each 


####################Women in parliament#######################


fig, ax = plt.subplots(figsize = [10, 5])

sns.distplot(a     = dft['women_in_parliament'],
             bins  = 10,
             hist  = True,
             kde   = False,
             rug   = False,
             color = 'black')


plt.title(label   = "Distribution of women in parliament")
plt.xlabel(xlabel = 'Women in Parliament')
plt.ylabel(ylabel = 'Frequency')
plt.axvline(x     = dft['women_in_parliament'].mean(),
            color = 'red')
plt.axvline(x     = dft['women_in_parliament'].median(),
            color = 'blue')
plt.legend(labels =  ['mean', 'median'])


################### MATERNAL MORTALITY RATIO####################


fig, ax = plt.subplots(figsize = [10, 5])

sns.distplot(a     = dft['maternal_mortality_ratio'],
             bins  = 10,
             hist  = True,
             kde   = False,
             rug   = False,
             color = 'black')


plt.title(label   = "Distribution of Maternal Mortality")
plt.xlabel(xlabel = 'Maternal Mortality')
plt.ylabel(ylabel = 'Frequency')
plt.axvline(x     = dft['maternal_mortality_ratio'].mean(),
            color = 'red')
plt.axvline(x     = dft['maternal_mortality_ratio'].median(),
            color = 'blue')
plt.legend(labels =  ['mean', 'median'])



################# TUBERCULOSIS INFECTIONS #######################


fig, ax = plt.subplots(figsize = [10, 5])

sns.distplot(a     = dft['tuberculosis_infections'],
             bins  = 10,
             hist  = True,
             kde   = False,
             rug   = False,
             color = 'black')


plt.title(label   = "Distribution of Tuberculosis Infections")
plt.xlabel(xlabel = 'Tuberculosis Infections')
plt.ylabel(ylabel = 'Frequency')
plt.axvline(x     = dft['tuberculosis_infections'].mean(),
            color = 'red')
plt.axvline(x     = dft['tuberculosis_infections'].median(),
            color = 'blue')
plt.legend(labels =  ['mean', 'median'])


################# Improved Sanitation Facilities ################


fig, ax = plt.subplots(figsize = [10, 5])

sns.distplot(a     = dft['improved_sanitation facilties'],
             bins  = 10,
             hist  = True,
             kde   = False,
             rug   = False,
             color = 'black')


plt.title(label    = "Distribution of Improved Sanitation Facilities")
plt.xlabel(xlabel  = 'Improved Sanotation Facilities')
plt.ylabel(ylabel  = 'Frequency')
plt.axvline(x      = dft['improved_sanitation facilties'].mean(),
            color  = 'red')
plt.axvline(x      = dft['improved_sanitation facilties'].median(),
            color  = 'blue')
plt.legend(labels  =  ['mean', 'median'])


################# Mobile Subscriptions #########################


fig, ax = plt.subplots(figsize = [10, 5])

sns.distplot(a     = dft['mobile_subscriptions'],
             bins  = 10,
             hist  = True,
             kde   = False,
             rug   = False,
             color = 'black')


plt.title(label    = "Distribution of Mobile Subscriptions")
plt.xlabel(xlabel  = 'Mobile Subscriptions')
plt.ylabel(ylabel  = 'Frequency')
plt.axvline(x      = dft['mobile_subscriptions'].mean(),
            color  = 'red')
plt.axvline(x      = dft['mobile_subscriptions'].median(),
            color  = 'blue')
plt.legend(labels  =  ['mean', 'median'])


################### Trade ####################################


fig, ax = plt.subplots(figsize = [10, 5])

sns.distplot(a     = dft['trade'],
             bins  = 10,
             hist  = True,
             kde   = False,
             rug   = False,
             color = 'black')


plt.title(label    = "Distribution of Trade")
plt.xlabel(xlabel  = 'Trade')
plt.ylabel(ylabel  = 'Frequency')
plt.axvline(x      = dft['trade'].mean(),
            color  = 'red')
plt.axvline(x      = dft['trade'].median(),
            color  = 'blue')
plt.legend(labels  =  ['mean', 'median'])


#################### GDP per Capita ##########################



fig, ax = plt.subplots(figsize = [10, 5])

sns.distplot(a     = dft['gdp_per_person'],
             bins  = 10,
             hist  = True,
             kde   = False,
             rug   = False,
             color = 'black')


plt.title(label = "Distribution of GDP per Capita")
plt.xlabel(xlabel = 'GDP per Capita')
plt.ylabel(ylabel = 'Frequency')
plt.axvline(x = dft['gdp_per_person'].mean(),
            color = 'red')
plt.axvline(x = dft['gdp_per_person'].median(),
            color = 'blue')
plt.legend(labels =  ['mean', 'median'])

################## CO2 Emissions ###############################


fig, ax = plt.subplots(figsize = [10, 5])

sns.distplot(a     = dft['co2_emissions'],
             bins  = 10,
             hist  = True,
             kde   = False,
             rug   = False,
             color = 'black')


plt.title(label = "Distribution of CO2 Emissions")
plt.xlabel(xlabel = 'CO2 Emissions')
plt.ylabel(ylabel = 'Frequency')
plt.axvline(x = dft['co2_emissions'].mean(),
            color = 'red')
plt.axvline(x = dft['co2_emissions'].median(),
            color = 'blue')
plt.legend(labels =  ['mean', 'median'])


################# Enrollment Rate ###########################



fig, ax = plt.subplots(figsize = [10, 5])

sns.distplot(a     = dft['enrollment_rate'],
             bins  = 10,
             hist  = True,
             kde   = False,
             rug   = False,
             color = 'black')


plt.title(label = "Distribution of Enrollment Rate")
plt.xlabel(xlabel = 'Enrollment Rate')
plt.ylabel(ylabel = 'Frequency')
plt.axvline(x = dft['enrollment_rate'].mean(),
            color = 'red')
plt.axvline(x = dft['enrollment_rate'].median(),
            color = 'blue')
plt.legend(labels =  ['mean', 'median'])

/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

<matplotlib.legend.Legend at 0x123196b80>


#flagging the missing values in our Region

for col in df_wheel_new:

    if df_wheel_new[col].isnull().astype(int).sum() > 0:
        df_wheel_new['m_'+col] = df_wheel_new[col].isnull().astype(int)

#soft coding MEAN for women_in_parliament
women_mean = df_wheel_new['women_in_parliament'].mean()


#filling women_in_parliament NAs with MEDIAN
df_wheel_new['women_in_parliament'].fillna(value = women_mean,
                         inplace = True)



#soft coding MEAN for literacy_rate
literacy_mean = df_wheel_new['literacy_rate'].mean()


#filling literacy_rate NAs with MEAN
df_wheel_new['literacy_rate'].fillna(value = literacy_mean,
                         inplace = True)
#Calculating the mean for our Region for each category
wheel= df_wheel_new[['women_in_parliament',
                     'maternal_mortality_ratio',
                     'tuberculosis_infections',
                     'improved_sanitation facilties',
                     'mobile_subscriptions',
                     'trade',
                     'gdp_per_person',
                     'co2_emissions',
                     'enrollment_rate']].mean(axis = 0).round(decimals = 2)

#Calculating the mean for each Region for each category

df_bumb = final[final["Cool Name"] == "Bumblebee"]
df_rat  = final[final["Cool Name"] == "Ratchet"]
df_mir  = final[final["Cool Name"] == "Mirage"]
df_side = final[final["Cool Name"] == "Sideswipe"]
df_sm   = final[final["Cool Name"] == "Skids & Mudflap"]
df_iron = final[final["Cool Name"] == "Ironhide"]
df_clif = final[final["Cool Name"] == "Cliffhanger"]
df_jet  = final[final["Cool Name"] == "Jetfire"]
df_jaz  = final[final["Cool Name"] == "Jazz"]
df_dino = final[final["Cool Name"] == "Dinobots"]
df_hot  = final[final["Cool Name"] == "Hot Rod"]
df_opt  = final[final["Cool Name"] == "Optimus Prime"]
df_sent = final[final["Cool Name"] == "Sentinel Prime"]

new_names = {
    'country code total'                                                       : 'country_code', 
    'country name'                                                             : 'country_name',
    'hult region'                                                              : 'hult_region', 
    'cool name'                                                                : 'cool_name', 
    'aids estimated deaths (unaids estimates)'                                 : 'est_deaths_by _aids', 
    'adjusted net enrollment rate, primary (% of primary school age children)' : 'enrollment_rate',
    'adolescent fertility rate (births per 1,000 women ages 15-19)'            : 'adolescent _fertility_rate', 
    'antiretroviral therapy coverage (% of people living with hiv)'            : 'art_therapy for aids', 
    'births attended by skilled health staff (% of total)'                     : 'births_under_obstetrician ', 
    'co2 emissions (metric tons per capita)'                                   : 'co2_emissions', 
    'contributing family workers, female (% of female employment)'             : 'female_employment_%',
    'contributing family workers, male (% of male employment)'                 : 'male_employment_%',
    'contributing family workers, total (% of total employment)'               : 'total_employment', 
    'employment to population ratio, 15+, female (%) (modeled ilo estimate)'   : 'female_employment_ratio',
    'employment to population ratio, 15+, male (%) (modeled ilo estimate)'     : 'male_employment _ratio', 
    'employment to population ratio, 15+, total (%) (modeled ilo estimate)'    : 'total_employment_ratio',
    'energy use (kg of oil equivalent) per $1,000 gdp (constant 2011 ppp)'     : 'energy_use',
    'fertility rate, total (births per woman)'                                 : 'fertility_rate', 
    'gdp per person employed (constant 2011 ppp $)'                            : 'gdp_per_person', 
    'gdp per unit of energy use (constant 2011 ppp $ per kg of oil equivalent)': 'gdp_per_energy use', 
    'gni per capita, atlas method (current US$)'                               : 'gross_national_income',
    'immunization, measles (% of children ages 12-23 months)'                  : 'measles_immunization_for kids', 
    'improved sanitation facilities (% of population with access)'             : 'improved_sanitation facilties', 
    'improved water source (% of population with access)'                      : 'improved_water resources', 
    'incidence of tuberculosis (per 100,000 people)'                           : 'tuberculosis_infections',
    'income share held by lowest 20%'                                          : 'income_share_lowest 20',
    'internet users (per 100 people)'                                          : 'internet_users',
    'life expectancy at birth, total (years)'                                  : 'life_expectancy', 
    'literacy rate, adult total (% of people ages 15 and above)'               : 'literacy_rate',
    'maternal mortality ratio (modeled estimate, per 100,000 live births)'     : 'maternal_mortality_ratio',
    'mobile cellular subscriptions (per 100 people)'                           : 'mobile_subscriptions', 
    'mortality rate, infant (per 1,000 live births)'                           : 'infant_mortality_ratio',
    'net oda received per capita (current US$)'                                : 'official_development_received',
    'population, total'                                                        : 'total_population',
    'poverty gap at national poverty lines (%)'                                : 'poverty_gap',
    'pregnant women receiving prenatal care (%)'                               : 'prenatal_care',
    'prevalence of hiv, total (% of population ages 15-49)'                    : 'hiv_prevention', 
    'prevalence of undernourishment (% of population)'                         : 'undernourishment_prevention', 
    'primary completion rate, total (% of relevant age group)'                 : 'primary_education_completion',
    'proportion of seats held by women in national parliaments (%)'            : 'women_in_parliament',
    'reported cases of malaria'                                                : 'malaria_infection',
    'school enrollment, primary (% net)'                                       : 'primary_school_enrolment', 
    'self-employed, total (% of total employment)'                             : 'self_employed', 
    'trade (% of gdp)'                                                         : 'trade', 
    'tuberculosis death rate (per 100,000 people), including hiv'              : 'death_by_tuberculosis_with aids' }

#Replacing the names of the columns

#Everything in lowercase
df_bumb.columns  = df_bumb.columns.str.lower()
df_rat.columns   = df_rat.columns.str.lower()
df_mir.columns   = df_mir.columns.str.lower()
df_side.columns  = df_side.columns.str.lower()
df_sm.columns    = df_sm.columns.str.lower()
df_iron.columns  = df_iron.columns.str.lower()
df_clif.columns  = df_clif.columns.str.lower()
df_jet.columns   = df_jet.columns.str.lower()
df_jaz.columns   = df_jaz.columns.str.lower()
df_dino.columns  = df_dino.columns.str.lower()
df_hot.columns   = df_hot.columns.str.lower()
df_opt.columns   = df_opt.columns.str.lower()
df_sent.columns  = df_sent.columns.str.lower()



#The actual replacement of the names

df_bumb.rename(columns = new_names, inplace = True)
df_rat.rename(columns  = new_names, inplace = True)
df_mir.rename(columns  = new_names, inplace = True)
df_side.rename(columns = new_names, inplace = True)
df_sm.rename(columns   = new_names, inplace = True)
df_iron.rename(columns = new_names, inplace = True)
df_clif.rename(columns = new_names, inplace = True)
df_jet.rename(columns  = new_names, inplace = True)
df_jaz.rename(columns  = new_names, inplace = True)
df_dino.rename(columns = new_names, inplace = True)
df_hot.rename(columns  = new_names, inplace = True)
df_opt.rename(columns  = new_names, inplace = True)
df_sent.rename(columns = new_names, inplace = True)

/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py:4295: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


################################ HEAT MAP ####################################                                                                           #

wheeljack_heatmap = df_wheel_new[['country_code', 'enrollment_rate','total_employment','gdp_per_person', 
                                  'life_expectancy', 'literacy_rate','maternal_mortality_ratio',
                                  'women_in_parliament','improved_sanitation facilties',
                                  'mobile_subscriptions','internet_users','births_under_obstetrician ',
                                  'co2_emissions','measles_immunization_for kids']]

#Pearson correlation to identify the correlation between variables
wheeljack_correlation = wheeljack_heatmap.corr(method = 'pearson').round(decimals = 2)

#Plotting Heat-map
fig, ax = plt.subplots (figsize = (10,10)) 
sns.heatmap (data       = wheeljack_correlation,
             cmap       = 'viridis',
             square     = True,
             annot      = True,
             linecolor  = 'black',
             linewidths = 0.5)

plt.title ("""
Linear Correlation of the Top Features in Arabian Peninsula""")

Text(0.5, 1.0, '\nLinear Correlation of the Top Features in Arabian Peninsula')


ranking_wheeljack_DATA = pd.DataFrame({'country_name': ['ISRAEL', 'UNITED ARAB EMIRATES','CYPRYUS',
'OMAN','JORDAN','KUWAIT','QATAR','BAHRAIN','SAUDI ARABIA','IRAQ','LEBANON','TURKEY','YEMEN'],
                        't_employmen_rt'              : [0,2,0,0,0,1,4,3,0,0,0,0,0],
                        'gdp_person'                  : [0,0,0,1,0,4,3,0,2,0,0,0,0],
                        'gni'                         : [0,2,1,0,0,3,4,0,0,0,0,0,0], 
                        'immunization'                : [2,0,0,0,0,4,1,3,0,0,0,0,0],
                        'life_expectancy'             : [4,0,3,1,0,0,0,0,0,0,2,0,0],
                        'sanitation'                  : [3,0,1,0,0,4,0,0,2,0,0,0,0],
                        'water'                       : [4,0,3,0,0,0,1,2,0,0,0,0,0],
                        'internet_users'              : [3,4,0,0,0,0,1,2,0,0,0,0,0],
                        'mobile_subscription'         : [0,3,2,0,0,0,0,1,4,0,0,0,0],
                        'population'                  : [0,0,4,0,0,1,2,3,0,0,0,0,0],
                        'trade'                       : [0,4,1,0,2,0,0,3,0,0,0,0,0]})

for col in ranking_wheeljack_DATA:
    if col  =='t_employmen_rt' or col    =='gdp_person' or col=='gni' or col=='trade':
        ranking_wheeljack_DATA[col]=ranking_wheeljack_DATA[col]*2
    elif col=='immunization'   or col    =='life_expectancy':
        ranking_wheeljack_DATA[col]=ranking_wheeljack_DATA[col]*5
    elif col=='internet_users' or col    =='mobile_subscription':
        ranking_wheeljack_DATA[col]= ranking_wheeljack_DATA[col]*4
    elif col=='sanitation'     or col    =='water':
        ranking_wheeljack_DATA[col]= ranking_wheeljack_DATA[col]*3
    else:
        ranking_wheeljack_DATA[col]= ranking_wheeljack_DATA[col]*1

ranking_wheeljack_DATA['Total'] = ranking_wheeljack_DATA.sum(axis=1)
ranking_wheeljack_DATA["Rank"]  = ranking_wheeljack_DATA['country_name'].rank() 
ranking_wheeljack_DATA.sort_values("Total", ascending = False, inplace = True)

print("Top 5 Wheeljack region countries:")
ranking_wheeljack_DATA['country_name'].head(n=5)

Top 5 Wheeljack region countries:

0                  ISRAEL
5                  KUWAIT
7                 BAHRAIN
1    UNITED ARAB EMIRATES
2                 CYPRYUS
Name: country_name, dtype: object


#Health-Immunization
users= df_wheel_new.groupby("country_name")["measles_immunization_for kids"].sum()
users.plot(kind="barh", fontsize=10)

<AxesSubplot:ylabel='country_name'>


#Life Expectancy 
users= df_wheel_new.groupby("country_name")["life_expectancy"].sum()
users.plot(kind="barh", fontsize=10)

<AxesSubplot:ylabel='country_name'>


#Internet Users
users= df_wheel_new.groupby("country_name")["internet_users"].sum()
users.plot(kind="barh", fontsize=10)

<AxesSubplot:ylabel='country_name'>


#Mobile Subscription –
users= df_wheel_new.groupby("country_name")["mobile_subscriptions"].sum()
users.plot(kind="barh", fontsize=10)

<AxesSubplot:ylabel='country_name'>


#Sanitation
users= df_wheel_new.groupby("country_name")["improved_sanitation facilties"].sum()
users.plot(kind="barh", fontsize=10)

<AxesSubplot:ylabel='country_name'>


#the Arabian Peninsula vs Other Regions
colors = ['b', 'green', 'y', 'pink','orange','cyan','darkgrey']
#Women in Parliament
ax = dft.plot(y= 'women_in_parliament',kind='barh', color = colors, legend = False, figsize=(10,5),
                  title = 'Women in Parliament')

for bar in ax.patches:
    bar.set_facecolor('blue')

highlight = 'Arabian Peninsula'
pos = dft.index.get_loc(highlight)

ax.patches[pos].set_facecolor('green')

#Maternal Mortality Ratio
ax = dft.plot(y= 'maternal_mortality_ratio',kind='barh', color = colors, legend = False, figsize=(10,5),
                  title = 'Maternal Mortality Ratio')

for bar in ax.patches:
    bar.set_facecolor('blue')

highlight = 'Arabian Peninsula'
pos = dft.index.get_loc(highlight)

ax.patches[pos].set_facecolor('green')

#Tuberculosis Infections

ax = dft.plot(y= 'tuberculosis_infections',kind='barh', color = colors, legend = False, figsize=(10,5),
                  title = 'Tuberculosis Infections')

for bar in ax.patches:
    bar.set_facecolor('blue')

highlight = 'Arabian Peninsula'
pos = dft.index.get_loc(highlight)

ax.patches[pos].set_facecolor('green')


#Improved Sanitation Facilities

ax = dft.plot(y= 'improved_sanitation facilties',kind='barh', color = colors, legend = False, figsize=(10,5),
                  title = 'Improved Sanitation Facilities')

for bar in ax.patches:
    bar.set_facecolor('blue')

highlight = 'Arabian Peninsula'
pos = dft.index.get_loc(highlight)

ax.patches[pos].set_facecolor('green')

#Mobile Subscriptions

ax = dft.plot(y= 'mobile_subscriptions',kind='barh', color = colors, legend = False, figsize=(10,5),
                  title = 'Mobile Subscriptions')

for bar in ax.patches:
    bar.set_facecolor('blue')

highlight = 'Arabian Peninsula'
pos = dft.index.get_loc(highlight)

ax.patches[pos].set_facecolor('green')

#Trade

ax = dft.plot(y= 'trade',kind='barh', color = colors, legend = False, figsize=(10,5),
                  title = 'Trade')

for bar in ax.patches:
    bar.set_facecolor('blue')

highlight = 'Arabian Peninsula'
pos = dft.index.get_loc(highlight)

ax.patches[pos].set_facecolor('green')

#GDP per Capita

ax = dft.plot(y= 'gdp_per_person',kind='barh', color = colors, legend = False, figsize=(10,5),
                  title = 'GDP per Capita')

for bar in ax.patches:
    bar.set_facecolor('blue')

highlight = 'Arabian Peninsula'
pos = dft.index.get_loc(highlight)

ax.patches[pos].set_facecolor('green')

#CO2 Emissions

ax = dft.plot(y= 'co2_emissions',kind='barh', color = colors, legend = False, figsize=(10,5),
                  title = 'CO2 Emissions')

for bar in ax.patches:
    bar.set_facecolor('blue')

highlight = 'Arabian Peninsula'
pos = dft.index.get_loc(highlight)

ax.patches[pos].set_facecolor('green')

#Enrollment Rate

ax = dft.plot(y= 'enrollment_rate',kind='barh', color = colors, legend = False, figsize=(10,5),
                  title = 'Enrollment Rate')

for bar in ax.patches:
    bar.set_facecolor('blue')

highlight = 'Arabian Peninsula'
pos = dft.index.get_loc(highlight)

ax.patches[pos].set_facecolor('green')

About this Project¶

About Arabian Peninsula¶

Importing Dataset and Installing Essential Packages¶

Cleaning and Subsetting Data¶

Missing Value¶

HEATMAP¶

Represent Country - Israel¶

The Arabian Peninsula Compare to the World¶

Conclusion¶

References¶