# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Clone the repository, change the wd
!git clone https://github.com/barisalan00/barisalan00.github.io
%cd /home/jovyan/barisalan00.github.io
!pwd

Cloning into 'barisalan00.github.io'...
remote: Enumerating objects: 141, done.
remote: Counting objects: 100% (89/89), done.
remote: Compressing objects: 100% (86/86), done.
remote: Total 141 (delta 43), reused 2 (delta 2), pack-reused 52 (from 1)
Receiving objects: 100% (141/141), 19.18 MiB | 105.00 KiB/s, done.
Resolving deltas: 100% (57/57), done.
/home/jovyan/barisalan00.github.io
/home/jovyan/barisalan00.github.io

# Main Immigration Dataset: Import Eurostat Immigration/2022 Dataset
euim22 = pd.read_csv('Eurostat-2022Migration-migr_imm1ctz__custom_10841676_linear.csv')
display(euim22.head(3))

# Total number of observations: 119479
display(len(euim22))

# Import Datahub.io 2-digit Country Codes dataset
country_codes2 = pd.read_csv('Datahub-CountryCodes-data_csv.csv')
display(country_codes2.head(3))

# Import WB 3-digit Country Codes dataset
country_codes3 = pd.read_csv('UN-iso3.csv')
display(country_codes3.head(3))

country_codes3_ = pd.read_excel('WB-CountryCodes.xlsx')

# Economic Indicator1: WB 2022 GDP/PC
gdppc = pd.read_csv('WB-2022GDPPC-Const.csv')
display(gdppc.head(3))

# Economic Indicator2: WB 2023 Multidimensional Poverty Measure
mpm = pd.read_excel('WB-2022MPM-Data-AM2022.xlsx')
display(mpm.head(3))

# Conflict Indicator1: ACLED 2022 Battles Dataset
battle = pd.read_csv('ACLED-2022Battles.csv')
display(battle.head(3))

# Conflict Indicator2: ACLED 2022 Riots Dataset
riot = pd.read_csv('ACLED-2022Riots.csv')
display(riot.head(3))

# Conflict Indicator3: ACLED 2022 Violence Dataset
violence = pd.read_csv('ACLED-2022ViolencesCivilians.csv')
display(violence.head(3))

# Political Indicators: WB Governance Indicators
govern = pd.read_csv('WB-2022GovIndic.csv')
display(govern.head(3))

# Climate Indicator: German Watch Climate Risk Index
climate = pd.read_csv('GermanWatch-2018CRI.csv')
display(climate.head(3))

# Population Indicator: CIA World Factbook - Population
population = pd.read_csv('CIA-Population.csv', encoding='latin1')
display(population.head(3))

119479

# Check the dtypes for euim22
# The year (TIME_PERIOD) and flow (OBS_VALUE) columns are integer, and the rest is object as expected.
euim22.dtypes

DATAFLOW       object
LAST UPDATE    object
freq           object
citizen        object
agedef         object
age            object
unit           object
sex            object
geo            object
TIME_PERIOD     int64
OBS_VALUE       int64
OBS_FLAG       object
dtype: object

# Keep only necessary columns and drop redundant ones
euim22 = euim22[['citizen', 'age', 'sex', 'geo', 'TIME_PERIOD', 'OBS_VALUE']]
euim22.head()

# Rename columns for readability
euim22.rename(columns={'citizen':'Migrant_Citizenship',
                        'age': 'Age',
                        'sex': 'Gender',
                        'geo':'Receiving_CCode',
                        'TIME_PERIOD':'Year',
                        'OBS_VALUE':'Flow'},inplace=True)
euim22.head()

# Bring country name information for Migrant Citizenship column (left join to keep all observations at euim_21)
euim22 = pd.merge(euim22, country_codes2, how='left', left_on='Migrant_Citizenship', right_on='Code')

# Drop the redundant "Code" column
euim22.drop('Code', axis=1, inplace=True)

# Rename the 'Name' column to 'Sending_Country'
euim22.rename(columns={'Name':'Sending_Country'}, inplace=True)

# Move Migrant_Country after Migrant_Citizenship
col = euim22.pop('Sending_Country')
euim22.insert(1, col.name, col)

euim22.head(3)

# Bring country name information for Receiving Country Column (left join to keep all observations at euim_22)
euim22 = pd.merge(euim22, country_codes2, how='left',left_on='Receiving_CCode', right_on='Code')

# Drop the redundant "Code" column
euim22.drop('Code', axis=1, inplace=True)

# Rename the 'Name' column to 'Receiving_Country'
euim22.rename(columns={'Name':'Receiving_Country'}, inplace=True)

# Move Receiving_Country after Receiving_CCode
col = euim22.pop('Receiving_Country')
euim22.insert(5, col.name, col)


euim22.head(3)

# Check if we lost any cells at the merge operations.
# We had 119479 observations at the beginning, and and it is still there, we are not missing anything.
len(euim22)

119479

# Is there any missing values under Receiving_Country?
# 'EL' is country code for Greece. Greece is using both 'GR' (in international systems) and 'EL' (in European systems) as its country code.
# 'EU27_2020' is the code for 27 EU countries.
euim22[euim22['Receiving_Country'].isna()]['Receiving_CCode'].unique()

array(['EL', 'EU27_2020'], dtype=object)

# Fill these NaN values for 'EL' with Greece
euim22.loc[euim22['Receiving_CCode'] == 'EL', 'Receiving_Country'] = 'Greece'
euim22.loc[euim22['Receiving_CCode'] == 'EU27_2020', 'Receiving_Country'] = 'EU27'

# Are all receiving countries EU27? Iceland, Liechstein, Norway, Switzerland are not EU27.
display(euim22['Receiving_Country'].unique())

array(['Austria', 'Bulgaria', 'Czech Republic', 'Estonia', 'Finland',
       'Croatia', 'Hungary', 'Iceland', 'Italy', 'Lithuania',
       'Luxembourg', 'Latvia', 'Netherlands', 'Norway', 'Romania',
       'Sweden', 'Slovenia', 'Slovakia', 'Spain', 'France', 'Belgium',
       'Switzerland', 'Cyprus', 'Germany', 'Denmark', 'Greece', 'EU27',
       'Ireland', 'Liechtenstein', 'Malta', 'Poland', 'Portugal'],
      dtype=object)

# Drop these 4 countries: Now we have 27 EU countries + 1 EU27 Aggregated observation
countries_to_drop = ['Iceland', 'Liechtenstein', 'Norway', 'Switzerland']
euim22 = euim22[~euim22['Receiving_Country'].isin(countries_to_drop)]
eu27 = (euim22['Receiving_Country'].unique())
print(eu27)

['Austria' 'Bulgaria' 'Czech Republic' 'Estonia' 'Finland' 'Croatia'
 'Hungary' 'Italy' 'Lithuania' 'Luxembourg' 'Latvia' 'Netherlands'
 'Romania' 'Sweden' 'Slovenia' 'Slovakia' 'Spain' 'France' 'Belgium'
 'Cyprus' 'Germany' 'Denmark' 'Greece' 'EU27' 'Ireland' 'Malta' 'Poland'
 'Portugal']

# Is there any NaN cells under Sending_Country column? --> 20017 observations are missing.
euim22['Sending_Country'].isna().sum()

20017

# Let's check the unique values for these 20017 NaN observations.
euim22[euim22['Sending_Country'].isna()]['Migrant_Citizenship'].unique()

array(['AFR', 'AFR_C', 'AFR_E', 'AFR_N', 'AFR_S', 'AFR_W', 'AME', 'AME_C',
       'AME_N', 'AME_S', 'ASI', 'ASI_C', 'ASI_E', 'ASI_S', 'ASI_S_E',
       'ASI_W', 'AU_NZ', 'CC8_22_FOR', 'CRB', 'CZ_SK', 'EFTA_FOR', 'EL',
       'EU27_2020_FOR', 'EUR', 'EX_SU', 'EX_YU', 'FOR_STLS', 'MEL', 'MIC',
       'NAT', 'NEU27_2020_FOR', 'OCE', 'POL', 'RNC', 'RS_ME', 'STLS',
       'TOTAL', 'UK', 'UNK', 'XK'], dtype=object)

# Replace the NaN Values under Migrant_Country for these ('STLS':Stateless, 'RNC':Recognized Non-Citizens and 'UNK':Unknown) under Migrant_Ciizenship
# Therefore all non-NaN observatoins under Migrant_Country column are part of our analysis.
euim22.loc[euim22[euim22['Migrant_Citizenship'] == 'STLS'].index, 'Sending_Country'] = 'Stateless'
euim22.loc[euim22[euim22['Migrant_Citizenship'] == 'RNC'].index, 'Sending_Country'] = 'Non-Citizens'
euim22.loc[euim22[euim22['Migrant_Citizenship'] == 'UNK'].index, 'Sending_Country'] = 'Unkown'
euim22.loc[euim22[euim22['Migrant_Citizenship'] == 'EU27_2020_FOR'].index, 'Sending_Country'] = 'EU27'
euim22.loc[euim22[euim22['Migrant_Citizenship'] == 'NEU27_2020_FOR'].index, 'Sending_Country'] = 'Non-EU27'
euim22.loc[euim22[euim22['Migrant_Citizenship'] == 'TOTAL'].index, 'Sending_Country'] = 'Total'
euim22.loc[euim22[euim22['Migrant_Citizenship'] == 'EL'].index, 'Sending_Country'] = 'Greece'

# How many missing values now: 15129
display(euim22['Sending_Country'].isna().sum())

15129

# Create a 2nd dataset to keep continental observations.
euim22_continents = euim22

# Now we can delete the continent/region observations from the euim22 (which are NA observations under Sending_Country column)
euim22 = euim22.dropna(subset=['Sending_Country'])

# NEw dataframe is 93370 length.
len(euim22)

93370

#Some extra re-naming for easier coding
euim22.loc[euim22[euim22['Age'] == 'TOTAL'].index, 'Age'] = 'Total'
euim22.loc[euim22[euim22['Gender'] == 'T'].index, 'Gender'] = 'Total'
euim22.loc[euim22[euim22['Migrant_Citizenship'] == 'TOTAL'].index, 'Migrant_Citizenship'] = 'Total'

# Total number of immigration to EU27 from and total number of immigration from non-EU27
euim22[(euim22['Sending_Country'].apply(lambda x: x in ['Total', 'EU27', 'Non-EU27'])) & (euim22['Receiving_Country']=='EU27') & (euim22['Age']=='Total') & (euim22['Gender']=='Total')].sort_values(by='Flow', ascending=False)

# Total number of arrivals by reciving EU27 countries
euim22[(euim22['Sending_Country']=='Total') & (euim22['Age']=='Total') & (euim22['Gender']=='Total')].sort_values(by='Flow', ascending=False).head(5)

# Total number of arrivals from non-EU27 countries
euim22[(euim22['Sending_Country']=='Non-EU27') & (euim22['Age']=='Total') & (euim22['Gender']=='Total')].sort_values(by='Flow', ascending=False).head(5)

# Create a new dataset by dropping the 'Total', 'EU27', and 'Non-EU27' observations under 'Sending_Country', and 'EU27' under 'Receiving'
# Total number of observations decreased to 90958.
immig = euim22[~euim22['Sending_Country'].isin(['Total', 'EU27', 'Non-EU27'])]
len(immig)

90958

# Drop 'EU27' under 'Receiving_Country' 
# Total number of observations decreased to 90952.
immig = immig[~immig['Receiving_Country'].isin(['EU27'])]
len(immig)

90952

# What is total number of arrivals? 3.4 million.
immig[(immig['Age']=='Total') & (immig['Gender']=='Total')]['Flow'].sum()

3406513

# Total number of arrivals from EU27 countries: 1.1 million
immig[(immig['Sending_Country'].isin(eu27)) & (immig['Age']=='Total') & (immig['Gender']=='Total')]['Flow'].sum()

1171307

# Total number of arrivals from non-EU27 countries: 2.2 million.
immig_noneu = immig[~immig['Sending_Country'].isin(eu27)]
immig_noneu[(immig_noneu['Age']=='Total') & (immig['Gender']=='Total')]['Flow'].sum()

/tmp/ipykernel_145/547588598.py:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  immig_noneu[(immig_noneu['Age']=='Total') & (immig['Gender']=='Total')]['Flow'].sum()

2235206

# Total number of arrivals in Germany from non-EU27 countries: 6226
display(immig_noneu[(immig_noneu['Receiving_Country']=='Germany') & (immig_noneu['Age']=='Total') & (immig_noneu['Gender']=='Total')]['Flow'].sum())
display(immig_noneu[immig_noneu['Receiving_Country']=='Germany']['Sending_Country'].unique())

6226

array(['Stateless', 'Unkown'], dtype=object)

# Group by 'Gender' column and sum 'Flow' column
immig_gender = immig_noneu[(immig_noneu['Age'] == 'Total')].groupby('Gender')['Flow'].sum()
print(immig_gender)

plt.figure(figsize=(8, 6))

plot_gender = immig_gender.plot(kind='bar', color=['pink','blue', 'green'])
plt.xlabel('Gender')
plt.ylabel('Total Arrivals')
plt.title('Total Migration Flow from Non-EU27 Countries Based on Gender')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

Gender
F        1142144
M        1092917
Total    2235206
Name: Flow, dtype: int64

# Take the total ('T') from Gender column, and groupby 'Age'.
# Y_LT15: those below 15, and Y_GE65: those above 65. Other age breaks already make sense.
# reindex the age brackets in order
immig_age = immig_noneu[(immig_noneu['Gender'] == 'Total')].groupby('Age')['Flow'].sum()

# Reindex the age breaks from smallest to biggest
display(immig_age.reindex(['TOTAL','Y_LT15', 'Y15-19', 'Y20-24', 'Y25-29', 'Y30-34', 'Y35-39', 'Y40-44', 'Y45-49', 'Y50-54', 'Y55-59', 'Y60-64', 'Y_GE65']))

#Plot
plot_age = immig_age.reindex(['Y_LT15', 'Y15-19', 'Y20-24', 'Y25-29', 'Y30-34', 'Y35-39', 'Y40-44', 'Y45-49', 'Y50-54', 'Y55-59', 'Y60-64', 'Y_GE65']).plot(kind='bar', color='skyblue', edgecolor='black')
plt.xlabel('Age')
plt.ylabel('Total Flow')
plt.title('Total Flow Based on Age Breaks')

Age
TOTAL          NaN
Y_LT15    323229.0
Y15-19    145583.0
Y20-24    199836.0
Y25-29    258376.0
Y30-34    247847.0
Y35-39    212671.0
Y40-44    162976.0
Y45-49    117268.0
Y50-54     83435.0
Y55-59     56993.0
Y60-64     45407.0
Y_GE65     67215.0
Name: Flow, dtype: float64

Text(0.5, 1.0, 'Total Flow Based on Age Breaks')

# Total immigrants by receiving country
total_by_receiving = immig_noneu[(immig_noneu['Age'] == 'Total') & (immig_noneu['Gender'] == 'Total')].groupby('Receiving_Country')['Flow'].sum()

# Sort them
total_by_receiving = total_by_receiving.sort_values(ascending=False)

print(total_by_receiving)

#Plot
total_by_receiving.plot(kind='bar', color='skyblue', edgecolor='black')
plt.xlabel('Receiving Country')
plt.ylabel('Total Flow')
plt.title('Total Flow Based on Receiving Country')

Receiving_Country
Spain             857915
Czech Republic    330362
Italy             283740
Netherlands       189198
Austria           115919
Romania            91019
Lithuania          66139
Sweden             54356
Hungary            43601
Croatia            40073
Estonia            38898
Finland            33014
Latvia             29800
Slovenia           24269
Luxembourg         14555
Bulgaria           13885
Germany             6226
Belgium              922
Slovakia             561
Denmark              380
Ireland              297
Poland                77
France                 0
Malta                  0
Cyprus                 0
Portugal               0
Greece                 0
Name: Flow, dtype: int64

Text(0.5, 1.0, 'Total Flow Based on Receiving Country')

# Total number of immigrants by sending country
total_by_sending = immig_noneu[(immig_noneu['Age'] == 'Total') & (immig_noneu['Gender'] == 'Total')].groupby('Sending_Country')['Flow'].sum()

# Rename countries with long names
total_by_sending.index = total_by_sending.index.str.replace('Venezuela, Bolivarian Republic of', 'Venezuela')

# Sort and display
total_by_sending = total_by_sending.sort_values(ascending=False)
display(total_by_sending.head(10))

# Plot the immigrants by top- 10 sending country
total_by_sending.head(10).plot(kind='bar', color='skyblue', edgecolor='black')
plt.xlabel('Sending Country')
plt.ylabel('Total Flow')
plt.title('Total Flow Based on Sending Country')

Sending_Country
Ukraine                 764356
Colombia                176955
Morocco                 138886
Venezuela                85238
Peru                     74722
India                    59999
Russian Federation       52933
Argentina                49531
Pakistan                 42479
Syrian Arab Republic     42466
Name: Flow, dtype: int64

Text(0.5, 1.0, 'Total Flow Based on Sending Country')

# Keep one aggregate sums for sending countries
immigration = immig_noneu.groupby('Sending_Country').agg({'Flow': 'sum','Migrant_Citizenship': 'first'}).reset_index()
immigration

#Rename countries with long names
immigration['Sending_Country'] = immigration['Sending_Country'].replace({
    'Venezuela, Bolivarian Republic of': 'Venezuela',
    'Syrian Arab Republic': 'Syria',
    "Korea, Democratic People's Republic of": 'North Korea',
    'Taiwan, Province of China': 'Taiwan',
    'Holy See (Vatican City State)': 'Vatican City',
    'Tanzania, United Republic of': 'Tanzania',
    'Macedonia, the Former Yugoslav Republic of': 'Macedonia',
    'Iran, Islamic Republic of': 'Iran',
    'Bolivia, Plurinational State of': 'Bolivia'
})

# Merge immigration dataframe with UN country_codes3
immigration = pd.merge(immigration, country_codes3, left_on='Sending_Country', right_on='name', how='left')
immigration.drop(columns=['name'], inplace=True)
immigration.rename(columns={'iso3': 'Sending_iso3'}, inplace=True)
immigration.rename(columns={'Migrant_Citizenship': 'Sending_iso2'}, inplace=True)


display(len(immigration))
immigration.head(5)

174

# Check for missing ISO3 codes: 21 missing values
display(immigration['Sending_iso3'].isna().sum())

# These countries have longer and shorter version of their names.
immigration[immigration['Sending_iso3'].isna()]['Sending_Country'].unique()

21

array(['Bolivia', 'Cape Verde', 'Congo, the Democratic Republic of the',
       'Vatican City', 'Iran', 'North Korea', 'Korea, Republic of',
       'Macedonia', 'Micronesia, Federated States of',
       'Moldova, Republic of', 'Non-Citizens', 'Palestine, State of',
       'Stateless', 'Swaziland', 'Syria', 'Taiwan', 'Tanzania', 'Turkey',
       'United States', 'Unkown', 'Venezuela'], dtype=object)

# Manually bring the 3 digit country codes for the missing countries
missing_iso3 = {
    'Bolivia': 'BOL',
    'Congo, the Democratic Republic of the': 'COD',
    'Cape Verde': 'CPV',
    'Micronesia, Federated States of': 'FSM',
    'Syria': 'SYR',
    'Iran': 'IRN',
    'North Korea': 'PRK',
    'Korea, Republic of': 'KOR',
    'Moldova, Republic of': 'MDA',
    'Macedonia': 'MKD',
    'Palestine, State of': 'PSE',
    'Non-Citizens': 'XXX',
    'Stateless': 'XXX',
    'Swaziland': 'SWZ',
    'Turkey': 'TUR',
    'Taiwan': 'TWN',
    'Tanzania': 'TZA',
    'Unkown': 'XXX',
    'United States': 'USA',
    'Vatican City': 'VAT',
    'Venezuela': 'VEN'
}

# Update ISO3 column in immig_noneu27 dataset using the dictionary
immigration['Sending_iso3'].fillna(immigration['Sending_Country'].map(missing_iso3), inplace=True)
display(immigration['Sending_iso3'].isna().sum())

0

# Merge WB GDP/PC with Immigration Dataset and bring GDP/PC information
immigration = pd.merge(immigration, gdppc[['Country Code', '2022 [YR2022]']], left_on='Sending_iso3', right_on='Country Code', how='left')
immigration.drop(columns=['Country Code'], inplace=True)

# Rename Sending_gdppc
immigration.rename(columns={'2022 [YR2022]': 'Sending_gdppc'}, inplace=True)

immigration

# Data type of Sending_gdppc = object
print(immigration['Sending_gdppc'].dtype)

# Turn datatype of GDP/PC column into numeric
immigration['Sending_gdppc'] = pd.to_numeric(immigration['Sending_gdppc'], errors='coerce')
print(immigration.dtypes)

object
Sending_Country     object
Flow                 int64
Sending_iso2        object
Sending_iso3        object
Sending_gdppc      float64
dtype: object

# How many NaN values under GDP/PC column
missing_gdppc = immigration[immigration['Sending_gdppc'].isna()]['Sending_Country'].unique()
display(len(missing_gdppc))
display("Countries with NaN GDP/PC: ", missing_gdppc)

24

'Countries with NaN GDP/PC: '

array(['Afghanistan', 'Andorra', 'Bhutan', 'Cuba', 'Eritrea',
       'Vatican City', 'Isle of Man', 'Jersey', 'North Korea', 'Lebanon',
       'Liechtenstein', 'Monaco', 'Non-Citizens', 'Palau', 'San Marino',
       'South Sudan', 'Stateless', 'Syria', 'Taiwan', 'Tonga', 'Unkown',
       'Venezuela', 'Western Sahara', 'Yemen'], dtype=object)

# How many migrants arrived from these countries with missing GDP/PC column?
immigration[(immigration['Sending_Country'].isin(missing_gdppc))].groupby('Sending_Country')['Flow'].sum().sort_values(ascending=False)

Sending_Country
Venezuela         340468
Syria             138876
Cuba               82456
Afghanistan        53037
Unkown             35760
Eritrea            14295
Stateless          11142
Lebanon             9025
Yemen               8918
Non-Citizens        4090
Taiwan              3608
South Sudan          761
San Marino           118
Bhutan                98
Liechtenstein         88
North Korea           78
Andorra               30
Monaco                 6
Palau                  4
Vatican City           2
Jersey                 0
Tonga                  0
Isle of Man            0
Western Sahara         0
Name: Flow, dtype: int64

# Define GDP per capita values for missing countries
missing_gdp = {
    'Venezuela': 3420,
    'Syria': 752,
    'Cuba': 7449,
    'Afghanistan': 372,
    'Eritrea': 1921,
    'Lebanon': 4467,
    'Yemen': 1017,
    'Taiwan': 32716,
    'South Sudan': 340,
    'Western Sahara': 2500,
    'Tonga': 4681,
    'Palau': 14565,
    'North Korea': 1217,
    'Bhutan': 3704
    
}

# Fill missing GDP per capita values based on Sending_Country
immigration['Sending_gdppc'] = immigration.apply(
    lambda row: missing_gdp[row['Sending_Country']] if pd.isna(row['Sending_gdppc']) and row['Sending_Country'] in missing_gdp else row['Sending_gdppc'],
    axis=1
)

# Check the remaining missing countries
display(immigration[immigration['Sending_gdppc'].isna()]['Sending_Country'].unique())

array(['Andorra', 'Vatican City', 'Isle of Man', 'Jersey',
       'Liechtenstein', 'Monaco', 'Non-Citizens', 'San Marino',
       'Stateless', 'Unkown'], dtype=object)

# For the Unkown, Stateless, Recognized Non-Citizens observations, fill the GDP/PC with the average of dataset
average_gdppc = immigration['Sending_gdppc'].mean()

# 3 observations
countries_to_fill = ['Unkown', 'Stateless', 'Non-Citizens']

# Fill missing GDP per capita for specified countries with the dataset's average GDP per capita
immigration['Sending_gdppc'] = immigration.apply(
    lambda row: average_gdppc if pd.isna(row['Sending_gdppc']) and row['Sending_Country'] in countries_to_fill else row['Sending_gdppc'],
    axis=1
)

# Check the remaining missing countries
display(immigration[immigration['Sending_gdppc'].isna()]['Sending_Country'].unique())

array(['Andorra', 'Vatican City', 'Isle of Man', 'Jersey',
       'Liechtenstein', 'Monaco', 'San Marino'], dtype=object)

# Drop NA values
immigration = immigration.dropna(subset=['Sending_gdppc'])

# Verify that no missing GDP per capita values remain
immigration['Sending_gdppc'].isna().sum()

0

import numpy as np
import matplotlib.pyplot as plt

# Scatter plot with size proportional to Flow
plt.figure(figsize=(10, 6))
plt.scatter(
    immigration['Sending_gdppc'], 
    immigration['Flow'], 
    s=immigration['Sending_gdppc'] / 100,  # Size of the dot
    alpha=0.1, 
    color='blue'
)

# Calculate the regression line
x = immigration['Sending_gdppc']
y = immigration['Flow']
coefficients = np.polyfit(x, y, 1) 
regression_line = np.poly1d(coefficients)

# Plot the regression line
plt.plot(
    x, 
    regression_line(x), 
    color='red', 
    linewidth=2, 
    label=f'Regression Line: y = {coefficients[0]:.2f}x + {coefficients[1]:.2f}'
)

# Add labels and title
plt.xlabel('Sending GDP per Capita')
plt.ylabel('Migration Flow')
plt.title('Scatter Plot of Sending GDP/PC vs Migration Flow (Dot Size = GDP/PC)')
plt.grid(True)
plt.legend() 

# Show the plot
plt.show()

# Merge WB GDP/MPM with Immigration Dataset and bring Poverty information
immigration = pd.merge(immigration, mpm[['Country code', 'Multidimensional poverty headcount ratio (%)']], left_on='Sending_iso3', right_on='Country code', how='left')
immigration.drop(columns=['Country code'], inplace=True)

# Rename Sending_mpm
immigration.rename(columns={'Multidimensional poverty headcount ratio (%)': 'Sending_mpm'}, inplace=True)

immigration.head()

# How many missing: 46 observations are missing.
missing_mpm = immigration[immigration['Sending_mpm'].isna()]
display(len(missing_mpm['Sending_Country']))

# Which countries have missing Multidimensional poverty information?
display(missing_mpm['Sending_Country'].unique())

# How many migrants came from these countries with missing MPM: 335,542.
display(immigration[immigration['Sending_Country'].isin(missing_mpm['Sending_Country'])]['Flow'].sum())

46

array(['Afghanistan', 'Algeria', 'Antigua and Barbuda', 'Azerbaijan',
       'Bahamas', 'Bahrain', 'Barbados', 'Belize',
       'Bosnia and Herzegovina', 'Brunei Darussalam', 'Cambodia',
       'Canada', 'Central African Republic', 'China', 'Cuba', 'Dominica',
       'Equatorial Guinea', 'Eritrea', 'Grenada', 'Guyana', 'India',
       'Jamaica', 'North Korea', 'Kuwait', 'Libya', 'New Zealand',
       'Non-Citizens', 'Oman', 'Palau', 'Qatar', 'Saint Kitts and Nevis',
       'Saint Lucia', 'Saint Vincent and the Grenadines', 'Saudi Arabia',
       'Singapore', 'Somalia', 'Stateless', 'Suriname', 'Syria',
       'Trinidad and Tobago', 'Turkmenistan', 'United Arab Emirates',
       'Unkown', 'Uzbekistan', 'Venezuela', 'Western Sahara'],
      dtype=object)

1239427

# How many migrants arrived from the countries with missing MPM:
immigration[immigration['Sending_mpm'].isna()].groupby('Sending_Country')['Flow'].sum().sort_values(ascending=False)[:20]

Sending_Country
Venezuela                 340468
India                     224303
Syria                     138876
China                     135900
Cuba                       82456
Bosnia and Herzegovina     63130
Algeria                    58852
Afghanistan                53037
Unkown                     35760
Uzbekistan                 17739
Somalia                    14831
Eritrea                    14295
Stateless                  11142
Equatorial Guinea          10398
Canada                      9145
Azerbaijan                  7853
Suriname                    5812
Non-Citizens                4090
Libya                       2338
Turkmenistan                1849
Name: Flow, dtype: int64

# Create Income_group variable
bins = [0, 1085, 4255, 13205, float('inf')]
labels = ['Low Income', 'Lower-middle Income', 'Upper-middle Income', 'High Income']

immigration['Sending_incomegroup'] = pd.cut(immigration['Sending_gdppc'], bins=bins, labels=labels, right=False)
immigration.head(3)

# Fill missing Sending_mpm values

#Calculate average mpm for each income group
mpm_avg = immigration.groupby('Sending_incomegroup')['Sending_mpm'].transform('mean')

# Fill missing Sending_mpm values with the average values based on Income_group
immigration['Sending_mpm'] = immigration['Sending_mpm'].fillna(mpm_avg)

immigration.head(3)

/tmp/ipykernel_145/1772377715.py:4: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  mpm_avg = immigration.groupby('Sending_incomegroup')['Sending_mpm'].transform('mean')

# Numeric iso3 codes
immigration = immigration.merge(country_codes3_[['Iso3', 'Code']], how='left', left_on='Sending_iso3', right_on='Iso3')
immigration = immigration.drop(columns=['Iso3'])
immigration.rename(columns={'Code': 'Sending_iso3_num'}, inplace=True)
immigration

# Countries with the most fatalities in battles.
battle1 = battle.groupby('country').agg({
    'fatalities': 'sum',
    'iso': 'first'
}).reset_index()

battle1.sort_values(by='fatalities', ascending=False)[:10]

# Merge with battle1 df
immigration = immigration.merge(battle1, how='left', left_on='Sending_iso3_num', right_on='iso')
immigration = immigration.drop(columns=['country', 'iso']) 
immigration = immigration.rename(columns={'fatalities': 'battle_fatalities'})
immigration

# Merge riot casualties
riot1 = riot.groupby('country').agg({'fatalities': 'sum','iso': 'first'}).reset_index()
print(riot1.sort_values(by='fatalities', ascending=False)[:10])

immigration = immigration.merge(riot1, how='left', left_on='Sending_iso3_num', right_on='iso')
immigration = immigration.drop(columns=['country', 'iso']) 
immigration = immigration.rename(columns={'fatalities': 'riot_fatalities'})
immigration

                          country  fatalities  iso
65                           Iran         430  364
36   Democratic Republic of Congo         244  180
63                          India         214  356
75                          Kenya         211  404
106                       Nigeria         208  566
74                     Kazakhstan         185  398
136                  South Africa         169  710
10                     Bangladesh         151   50
64                      Indonesia         147  360
110                      Pakistan         142  586

# Merge Violence casualties
violence1 = violence.groupby('country').agg({'fatalities': 'sum','iso': 'first'}).reset_index()
print(violence1.sort_values(by='fatalities', ascending=False)[:10])

immigration = immigration.merge(violence1, how='left', left_on='Sending_iso3_num', right_on='iso')
immigration = immigration.drop(columns=['country', 'iso']) 
immigration = immigration.rename(columns={'fatalities': 'violence_fatalities'})
immigration

                          country  fatalities  iso
80                         Mexico        6561  484
16                         Brazil        4034   76
90                        Nigeria        3701  566
31   Democratic Republic of Congo        3046  180
39                       Ethiopia        2614  231
84                        Myanmar        2188  104
76                           Mali        2151  466
26                       Colombia        1680  170
134                       Ukraine        1348  804
17                   Burkina Faso        1177  854

# Fill all NAs with 0
immigration[['battle_fatalities', 'riot_fatalities', 'violence_fatalities']] = immigration[['battle_fatalities', 'riot_fatalities', 'violence_fatalities']].fillna(0)
immigration

# Create conflict_casualties
immigration['Sending_conflict_casualties'] = (immigration['battle_fatalities'] * 0.70) + (immigration['riot_fatalities']*0.15) + (immigration['violence_fatalities']*0.15)
immigration = immigration.drop(columns=['battle_fatalities', 'riot_fatalities', 'violence_fatalities']) 
immigration

# What kind of measures does WB Governance Indicators dataset have?
display(govern.head())
govern["Series Name"].unique()

array(['Political Stability and Absence of Violence/Terrorism: Estimate',
       'Voice and Accountability: Estimate',
       'Voice and Accountability: Percentile Rank',
       'Control of Corruption: Estimate',
       'Regulatory Quality: Percentile Rank',
       'Government Effectiveness: Estimate',
       'Rule of Law: Percentile Rank',
       'Control of Corruption: Percentile Rank',
       'Regulatory Quality: Estimate',
       'Government Effectiveness: Percentile Rank',
       'Rule of Law: Estimate',
       'Political Stability and Absence of Violence/Terrorism: Percentile Rank',
       nan], dtype=object)

# Tidy data using pivot_table function
govern['2022 [YR2022]'] = pd.to_numeric(govern['2022 [YR2022]'], errors='coerce')
govern_p = govern.pivot_table(index='Country Name', columns='Series Name', values='2022 [YR2022]').reset_index()
govern_p.head()

# Add 3.5 to all estimates so that estimates will be above 0.
govern_p[['Control of Corruption: Estimate', 'Government Effectiveness: Estimate', 'Political Stability and Absence of Violence/Terrorism: Estimate', 'Regulatory Quality: Estimate', 'Rule of Law: Estimate',	'Voice and Accountability: Estimate']] += 3.5

# Calculate one governance variable by simply summing up all of the estimate variables
govern_p['Sending_govern'] = (govern_p['Control of Corruption: Estimate']) + (govern_p['Government Effectiveness: Estimate']) + (govern_p['Political Stability and Absence of Violence/Terrorism: Estimate']) + (govern_p['Regulatory Quality: Estimate']) + (govern_p['Rule of Law: Estimate']) + (govern_p['Voice and Accountability: Estimate'])

govern_p.head()

# Merge govern_p with Immigration Dataset and bring governance indicator.
immigration = pd.merge(immigration, govern_p[['Country Name', 'Sending_govern']], left_on='Sending_Country', right_on='Country Name', how='left')
immigration.drop(columns=['Country Name'], inplace=True)

immigration.head(3)

# Fill missing Sending_mpm values

#Calculate average mpm for each income group
govern_average = immigration.groupby('Sending_incomegroup')['Sending_govern'].transform('mean')

# Fill missing Sending_mpm values with the average values based on Income_group
immigration['Sending_govern'].fillna(govern_average, inplace=True)

immigration.head()

/tmp/ipykernel_145/784547835.py:4: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  govern_average = immigration.groupby('Sending_incomegroup')['Sending_govern'].transform('mean')

# Merge govern_p with Immigration Dataset and bring governance indicator.
immigration = pd.merge(immigration, climate[['Country', 'CRI\rscore']], left_on='Sending_Country', right_on='Country', how='left')
immigration.drop(columns=['Country'], inplace=True)

# Rename 'CRI/rscore' Column with 'Sending_cri'
immigration.rename(columns={'CRI\rscore': 'Sending_cri'}, inplace=True)

immigration

# Missing values
missing_cri = immigration[immigration['Sending_cri'].isna()]

display(missing_cri['Sending_Country'].unique())

display(missing_cri.groupby('Sending_Country')['Flow'].sum().sort_values(ascending=False))

display(missing_cri['Flow'].sum())

array(['Afghanistan', 'Bahamas', 'Congo',
       'Congo, the Democratic Republic of the', 'Cuba',
       'Equatorial Guinea', 'Gambia', 'Iran', 'North Korea', 'Kyrgyzstan',
       "Lao People's Democratic Republic", 'Macedonia',
       'Micronesia, Federated States of', 'Moldova, Republic of', 'Nauru',
       'Non-Citizens', 'Palau', 'Palestine, State of',
       'Russian Federation', 'Saint Kitts and Nevis', 'Saint Lucia',
       'Saint Vincent and the Grenadines', 'Sao Tome and Principe',
       'Somalia', 'Stateless', 'Swaziland', 'Syria', 'Taiwan',
       'Timor-Leste', 'Turkmenistan', 'Unkown', 'Viet Nam',
       'Western Sahara', 'Yemen'], dtype=object)

Sending_Country
Russian Federation                       196630
Syria                                    138876
Cuba                                      82456
Afghanistan                               53037
Iran                                      43182
Moldova, Republic of                      40375
Unkown                                    35760
Macedonia                                 28024
Gambia                                    22640
Viet Nam                                  22106
Somalia                                   14831
Stateless                                 11142
Equatorial Guinea                         10398
Kyrgyzstan                                 9474
Yemen                                      8918
Congo, the Democratic Republic of the      6594
Non-Citizens                               4090
Taiwan                                     3608
Palestine, State of                        2424
Turkmenistan                               1849
Congo                                      1426
Lao People's Democratic Republic            328
Saint Kitts and Nevis                       108
North Korea                                  78
Sao Tome and Principe                        68
Saint Lucia                                  62
Swaziland                                    52
Timor-Leste                                  42
Bahamas                                      34
Saint Vincent and the Grenadines             30
Palau                                         4
Nauru                                         0
Western Sahara                                0
Micronesia, Federated States of               0
Name: Flow, dtype: int64

738646

# Replace the missing values under Sending_cri column with the average of the column

# The average of the 'Sending_gdppc' column
average_cri = immigration['Sending_cri'].mean()

# Fill missing values in the 'Sending_gdppc' column with the calculated average
immigration['Sending_cri'].fillna(average_cri, inplace=True)
immigration

# Merge population with Immigration Dataset and bring governance indicator.
#immigration = immigration.drop(columns=['Sending_pop', 'value'])
immigration = pd.merge(immigration, population[['name', 'value']], left_on='Sending_Country', right_on='name', how='left')
immigration.drop(columns=['name'], inplace=True)

immigration.head()

# Rename 'value' Column with 'Sending_pop'
immigration.rename(columns={'value': 'Sending_pop'}, inplace=True)
immigration.head()

# 10 countries with missing population information. This is because of name mismatch.
print(immigration['Sending_pop'].isna().sum())
immigration[immigration['Sending_pop'].isna()]

10

# Fill the missing population information manually
missing_pop = {
    'Bolivia': 12311974,
    "Côte d'Ivoire": 29981758,
    'Iran': 88386937,
    'North Korea': 26298666,
    'Macedonia': 2135622,
    'Syria': 23865423,
    'Taiwan': 23595274,
    'Tanzania': 67462121,
    'Venezuela': 31250306
}

# Fill missing GDP per capita values based on Sending_Country
immigration['Sending_pop'] = immigration.apply(
    lambda row: missing_pop[row['Sending_Country']] if pd.isna(row['Sending_pop']) and row['Sending_Country'] in missing_pop else row['Sending_pop'],
    axis=1
)

# Check the remaining missing countries
print(immigration[immigration['Sending_pop'].isna()])

    Sending_Country  Flow Sending_iso2 Sending_iso3  Sending_gdppc  \
105    Non-Citizens  4090          RNC          XXX   15549.957897   

     Sending_mpm Sending_incomegroup  Sending_iso3_num  \
105     2.721764         High Income               NaN   

     Sending_conflict_casualties  Sending_govern  Sending_cri Sending_pop  
105                          0.0       22.317889    82.451203         NaN

# Calculate the average of Sending_pop
immigration['Sending_pop'] = pd.to_numeric(immigration['Sending_pop'], errors='coerce')
average_pop = immigration['Sending_pop'].mean()

# Fill missing Sending_pop values with the average
immigration['Sending_pop'] = immigration['Sending_pop'].fillna(average_pop)


# Missing values
print(immigration['Sending_pop'].isna().sum())

0

# Individual scatter plots
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

observations = ['Sending_gdppc', 'Sending_mpm', 'Sending_conflict_casualties', 'Sending_govern', 'Sending_cri', 'Sending_pop']

def regress_with_stats(immigration, observations):
    fig, ax = plt.subplots(3, 2, figsize=(20, 10), sharex=False)
    ax = ax.ravel() 
    
    for i, o in enumerate(observations):
        slope, intercept, r_value, p_value, std_err = stats.linregress(
            immigration[o],
            immigration['Flow']
        )
        # A title with statistics
        diag_str = (
            f"p-value={p_value:.1g}\n"
            f"r-value={r_value:.3f}\n"
            f"std err={std_err:.3f}\n"
            f"slope={slope:.3f}\n"
            f"intercept={intercept:.3f}"
        )
        
        # Scatter plot with regression line
        immigration.plot.scatter(x=o, y='Flow', title=diag_str, ax=ax[i])
        pts = np.linspace(immigration[o].min(), immigration[o].max(), 500)
        line = slope * pts + intercept
        ax[i].plot(pts, line, lw=1, color='red')

    for i in range(len(observations), len(ax)):
        fig.delaxes(ax[i])
    
    plt.tight_layout()
    plt.show()

regress_with_stats(immigration, observations)

import seaborn as sns
# Coerce variables into numeric values
immigration['Flow'] = pd.to_numeric(immigration['Flow'], errors='coerce')
immigration['Sending_gdppc'] = pd.to_numeric(immigration['Sending_gdppc'], errors='coerce')
immigration['Sending_mpm'] = pd.to_numeric(immigration['Sending_mpm'], errors='coerce')
immigration['Sending_conflict_casualties'] = pd.to_numeric(immigration['Sending_conflict_casualties'], errors='coerce')
immigration['Sending_govern'] = pd.to_numeric(immigration['Sending_govern'], errors='coerce')
immigration['Sending_cri'] = pd.to_numeric(immigration['Sending_cri'], errors='coerce')
immigration['Sending_pop'] = pd.to_numeric(immigration['Sending_pop'], errors='coerce')

# Correlation Heatmap
variables = ['Flow', 'Sending_gdppc', 'Sending_mpm', 'Sending_conflict_casualties', 'Sending_govern', 'Sending_cri', 'Sending_pop']

# Correlation Matrix
corr_matrix = immigration[variables].corr()

# Heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm_r', fmt=".2f")
plt.title('Correlation Matrix of Selected Columns in Immigration Dataset')
plt.show()

# Multilinear Regression Model
import statsmodels.api as sm


# X and Y variables
predictors = ['Sending_gdppc', 'Sending_mpm', 'Sending_conflict_casualties', 'Sending_govern', 'Sending_cri', 'Sending_pop']
outcome = 'Flow'

# constant term
X = sm.add_constant(immigration[predictors])

# the regression model
model = sm.OLS(immigration[outcome], X)
results = model.fit()

# Print the summary of the regression results
print(results.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   Flow   R-squared:                       0.368
Model:                            OLS   Adj. R-squared:                  0.344
Method:                 Least Squares   F-statistic:                     15.51
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           5.54e-14
Time:                        23:13:44   Log-Likelihood:                -2256.0
No. Observations:                 167   AIC:                             4526.
Df Residuals:                     160   BIC:                             4548.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
===============================================================================================
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const                       -6.987e+04   1.25e+05     -0.559      0.577   -3.17e+05    1.77e+05
Sending_gdppc                  -1.1049      1.055     -1.048      0.296      -3.187       0.978
Sending_mpm                 -1356.6816    713.712     -1.901      0.059   -2766.193      52.830
Sending_conflict_casualties   115.7646     12.551      9.223      0.000      90.977     140.552
Sending_govern               4533.4749   4510.002      1.005      0.316   -4373.335    1.34e+04
Sending_cri                   508.7402    447.709      1.136      0.258    -375.441    1392.921
Sending_pop                  -9.46e-05      0.002     -0.040      0.968      -0.005       0.005
==============================================================================
Omnibus:                      166.355   Durbin-Watson:                   1.983
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            12738.316
Skew:                           3.093   Prob(JB):                         0.00
Kurtosis:                      45.337   Cond. No.                     3.06e+08
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.06e+08. This might indicate that there are
strong multicollinearity or other numerical problems.

import numpy as np
import statsmodels.api as sm
import statsmodels.stats.stattools as st
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate Residuals
predicted_values = results.predict()
residuals = immigration['Flow'] - predicted_values

# Check for Linearity
plt.scatter(predicted_values, residuals)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs. Predicted Values')
plt.show()

# Check for Homoscedasticity
plt.scatter(predicted_values, residuals)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs. Predicted Values')
plt.axhline(y=0, color='r', linestyle='-')
plt.show()

# Normality of Residuals
sns.histplot(residuals, kde=True)
plt.title('Histogram of Residuals')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.show()

# Check for Autocorrelation
# Durbin-Watson statistic using DurbinWatson function
dw_statistic = st.durbin_watson(residuals)
print("Durbin-Watson Statistic:", dw_statistic)

# You can also plot autocorrelation function (ACF) of residuals if needed
sm.graphics.tsa.plot_acf(residuals)
plt.xlabel('Lag')
plt.ylabel('Autocorrelation')
plt.title('Autocorrelation Function (ACF) of Residuals')
plt.show()

Durbin-Watson Statistic: 1.982986745408299

	iso3	name
0	BEL	Belgium
1	CH_	China, mainland
2	GGY	Guernsey

	Country Name	Country Code	Series Name	Series Code	2022 [YR2022]
0	Afghanistan	AFG	GDP per capita, PPP (constant 2017 internation...	NY.GDP.PCAP.PP.KD	..
1	Africa Eastern and Southern	AFE	GDP per capita, PPP (constant 2017 internation...	NY.GDP.PCAP.PP.KD	3566.269439
2	Africa Western and Central	AFW	GDP per capita, PPP (constant 2017 internation...	NY.GDP.PCAP.PP.KD	4066.48323

	Region	Country code	Economy	Reporting year	Survey name	Survey year	Survey coverage	Welfare type	Survey comparability	Monetary (%)	Educational attainment (%)	Educational enrollment (%)	Electricity (%)	Sanitation (%)	Drinking water (%)	Multidimensional poverty headcount ratio (%)
0	ECA	ALB	Albania	2018	HBS	2018	N	c	3.0	0.048107	0.192380	-	0.06025	6.579772	9.594966	0.293161
1	SSA	AGO	Angola	2018	IDREA	2018	N	c	2.0	31.122005	29.753423	27.44306	52.639532	53.637516	32.106507	47.203606
2	LAC	ARG	Argentina	2021	EPHC-S2	2021	U	i	2.0	0.958847	1.085320	0.731351	0	0.193965	0.364048	0.971202

	event_id_cnty	event_date	year	time_precision	disorder_type	event_type	sub_event_type	actor1	assoc_actor_1	inter1	...	location	latitude	longitude	geo_precision	source	source_scale	notes	fatalities	tags	timestamp
0	DRC27768	31-Dec-22	2022	1	Political violence	Battles	Armed clash	M23: March 23 Movement	NaN	2	...	Karenga	-1.4724	29.0655	2	Mediacongo.net; Radio Okapi	National	On 31 December 2022, during a two-day battle, ...	0	NaN	1673291085
1	MZM3154	31-Dec-22	2022	1	Political violence	Battles	Armed clash	Islamist Militia (Mozambique)	NaN	3	...	Namacule	-11.8567	39.8000	1	AIM; Pinnacle News; Twitter; Zitamar	New media-National	On 31 December 2022, Islamist militia clashed ...	2	NaN	1673291088
2	MZM3155	31-Dec-22	2022	1	Political violence	Battles	Armed clash	Islamist Militia (Mozambique)	NaN	3	...	Namande	-11.8278	39.7416	1	AIM; Pinnacle News; Twitter; VOA; Zitamar	New media-National	On 31 December 2022, Islamist militia clashed ...	2	NaN	1673291088

	event_id_cnty	event_date	year	time_precision	disorder_type	event_type	sub_event_type	actor1	assoc_actor_1	inter1	...	location	latitude	longitude	geo_precision	source	source_scale	notes	fatalities	tags	timestamp
0	KEN9717	31 December 2022	2022	1	Political violence	Riots	Mob violence	Rioters (Kenya)	Vigilante Group (Kenya)	5	...	Kutus	-0.5753	37.3269	2	Kenya Standard; NTV (Kenya)	New media-National	On 31 December 2022, a mob lynched a man, part...	1	crowd size=no report	1673291087
1	BRA62473	31 December 2022	2022	1	Political violence	Riots	Mob violence	Rioters (Brazil)	Vigilante Group (Brazil)	5	...	Maues	-3.3795	-57.7196	1	Portal do Holanda	Subnational	On 31 December 2022, in Maues (Amazonas), a su...	0	crowd size=no report	1673295343
2	BRA62488	31 December 2022	2022	1	Political violence	Riots	Mob violence	Rioters (Brazil)	PL: Liberal Party	5	...	Catalao	-18.1670	-47.9448	1	Estado de Minas	National	Property destruction: On 31 December 2022, in ...	0	crowd size=no report	1673295343

Migration Flows to Europe¶

Project Topic and Goals¶

Project Datasets¶

The First Dataset¶

The Second Set of Datasets: ISO Codes¶

The Third-set of Datasets: Economic Indicators¶

The Fourth Dataset: ACLED Conflict Datasets¶

The Fifth Dataset: Governance Indicators¶

The Fifth Dataset: German Watch Climate Risk Index¶

The Seventh Dataset: Population¶

ETL (Extract, Transform, Load)¶

Import Datasets¶

Transform and Tidy Data¶

Basic Summary Statistics¶

Further ETL¶

Merge with Economic Indicator1: GDP/PC¶

Merge with Economic Indicator2: Multi-dimensional Poverty Metric¶

Merge with Conflict Variables: Fatalities in Battles, Riots, and Violence against Civilians¶

Merge with Political Indicators: WB Governance Indicators¶

Merge with Climate Indicator: German Watch Climate Risk Index¶

Population Indicator¶

Statistical Modelling¶

Conclusion¶

	DATAFLOW	LAST UPDATE	freq	citizen	agedef	age	unit	sex	geo	TIME_PERIOD	OBS_FLAG
0	ESTAT:MIGR_IMM1CTZ(1.0)	27/03/24 11:00:00	A	AD	REACH	TOTAL	NR	F	AT	2022	NaN
1	ESTAT:MIGR_IMM1CTZ(1.0)	27/03/24 11:00:00	A	AD	REACH	TOTAL	NR	F	BG	2022	NaN
2	ESTAT:MIGR_IMM1CTZ(1.0)	27/03/24 11:00:00	A	AD	REACH	TOTAL	NR	F	CZ	2022	NaN

	event_id_cnty	event_date	year	time_precision	disorder_type	event_type	sub_event_type	actor1	assoc_actor_1	inter1	...	location	latitude	longitude	geo_precision	source	source_scale	notes	fatalities	tags	timestamp
0	DRC27766	31 December 2022	2022	1	Political violence	Violence against civilians	Abduction/forced disappearance	Twirwaneho Ethnic Militia (Democratic Republic...	Banyamulenge Ethnic Militia (Democratic Republ...	4	...	Mikenge	-3.4497	28.4476	1	Kivu Times	Subnational	On 31 December 2022, Twirwaneho abducted a wom...	0	NaN	1673291085
1	SAF18067	31 December 2022	2022	1	Political violence	Violence against civilians	Attack	Unidentified Armed Group (South Africa)	NaN	3	...	Johannesburg	-26.2023	28.0436	1	Zambia Reports	International	On 31 December 2022, unknown suspects shot and...	1	NaN	1673291088
2	SOM38915	31 December 2022	2022	1	Political violence	Violence against civilians	Abduction/forced disappearance	Al Shabaab	NaN	2	...	Ted	4.4000	43.9167	2	Undisclosed Source	Local partner-Other	On 31 December 2022, Al Shabaab abducted three...	0	NaN	1673291088

	Country Name	Country Code	Series Name	Series Code	2022 [YR2022]
0	Afghanistan	AFG	Political Stability and Absence of Violence/Te...	PV.EST	-2.550801754
1	Afghanistan	AFG	Voice and Accountability: Estimate	VA.EST	-1.751587272
2	Korea, Dem. People's Rep.	PRK	Voice and Accountability: Percentile Rank	VA.PER.RNK	0

	name	slug	value	date_of_information	ranking	region
0	Afghanistan	afghanistan	38,346,720	2022 est.	37.0	South Asia
1	Albania	albania	3,095,344	2022 est.	136.0	Europe
2	Algeria	algeria	44,178,884	2022 est.	34.0	Africa

	Migrant_Citizenship	Sending_Country	Age	Gender	Receiving_CCode	Year
0	AD	Andorra	TOTAL	F	AT	2022
1	AD	Andorra	TOTAL	F	BG	2022
2	AD	Andorra	TOTAL	F	CZ	2022

	Migrant_Citizenship	Sending_Country	Age	Gender	Receiving_CCode	Receiving_Country	Year
0	AD	Andorra	TOTAL	F	AT	Austria	2022
1	AD	Andorra	TOTAL	F	BG	Bulgaria	2022
2	AD	Andorra	TOTAL	F	CZ	Czech Republic	2022

	Migrant_Citizenship	Sending_Country	Age	Gender	Receiving_CCode	Receiving_Country	Year	Flow
106898	Total	Total	Total	Total	EU27_2020	EU27	2022	6977742
78980	NEU27_2020_FOR	Non-EU27	Total	Total	EU27_2020	EU27	2022	4777475
38564	EU27_2020_FOR	EU27	Total	Total	EU27_2020	EU27	2022	1098032

	Sending_Country	Flow	Migrant_Citizenship
0	Afghanistan	53037	AF
1	Albania	131294	AL
2	Algeria	58852	DZ
3	Andorra	30	AD
4	Angola	828	AO
...	...	...	...
169	Viet Nam	22106	VN
170	Western Sahara	0	EH
171	Yemen	8918	YE
172	Zambia	698	ZM
173	Zimbabwe	1610	ZW

	Sending_Country	Flow	Sending_iso2	Sending_iso3	Sending_gdppc	Sending_mpm
0	Afghanistan	53037	AF	AFG	372.000000	NaN
1	Albania	131294	AL	ALB	15491.961000	0.293161
2	Algeria	58852	DZ	DZA	11198.233480	NaN
3	Angola	828	AO	AGO	5906.115677	47.203606
4	Antigua and Barbuda	42	AG	ATG	22321.870020	NaN

	Sending_Country	Flow	Sending_iso2	Sending_iso3	Sending_gdppc	Sending_mpm	Sending_incomegroup
0	Afghanistan	53037	AF	AFG	372.00000	NaN	Low Income
1	Albania	131294	AL	ALB	15491.96100	0.293161	High Income
2	Algeria	58852	DZ	DZA	11198.23348	NaN	Upper-middle Income

	CRI\rRank	Country	CRI\rscore	Fatalities\rin 2018\r(Rank)	Fatalities per\r100 000 inhab-\ritants (Rank)	Losses in mil-\rlion US$ (PPP)\r(Rank)	Losses per\runit GDP in\r% (Rank)
0	1	Japan	5.50	2	2	3	12
1	2	Philippines	11.17	4	14	7	14
2	3	Germany	13.83	3	1	6	36

Series Name	Country Name	Control of Corruption: Estimate	Control of Corruption: Percentile Rank	Government Effectiveness: Estimate	Government Effectiveness: Percentile Rank	Political Stability and Absence of Violence/Terrorism: Estimate	Political Stability and Absence of Violence/Terrorism: Percentile Rank	Regulatory Quality: Estimate	Regulatory Quality: Percentile Rank	Rule of Law: Estimate	Rule of Law: Percentile Rank	Voice and Accountability: Estimate	Voice and Accountability: Percentile Rank
0	Afghanistan	-1.183776	12.264151	-1.879552	1.886792	-2.550802	0.471698	-1.271806	8.962264	-1.658442	5.188679	-1.751587	2.415459
1	Albania	-0.407876	38.679245	0.065063	56.603775	0.114945	50.471699	0.159354	57.547169	-0.165779	47.169811	0.139466	52.173912
2	Algeria	-0.637930	28.301888	-0.513090	32.547169	-0.741772	19.339623	-1.063573	14.150944	-0.832473	22.641510	-1.003874	21.739130
3	American Samoa	1.270204	88.679245	0.667918	74.528305	1.128859	91.037735	0.545900	70.754715	1.221118	86.320755	0.957648	77.294685
4	Andorra	1.270204	88.679245	1.495305	92.452827	1.587736	98.584908	1.398334	90.094337	1.485450	90.566040	1.102833	85.507248

Series Name	Country Name	Control of Corruption: Estimate	Control of Corruption: Percentile Rank	Government Effectiveness: Estimate	Government Effectiveness: Percentile Rank	Political Stability and Absence of Violence/Terrorism: Estimate	Political Stability and Absence of Violence/Terrorism: Percentile Rank	Regulatory Quality: Estimate	Regulatory Quality: Percentile Rank	Rule of Law: Estimate	Rule of Law: Percentile Rank	Voice and Accountability: Estimate	Voice and Accountability: Percentile Rank	Sending_govern
0	Afghanistan	2.316224	12.264151	1.620448	1.886792	0.949198	0.471698	2.228194	8.962264	1.841558	5.188679	1.748413	2.415459	10.704034
1	Albania	3.092124	38.679245	3.565063	56.603775	3.614945	50.471699	3.659354	57.547169	3.334221	47.169811	3.639466	52.173912	20.905173
2	Algeria	2.862070	28.301888	2.986910	32.547169	2.758228	19.339623	2.436427	14.150944	2.667527	22.641510	2.496126	21.739130	16.207288
3	American Samoa	4.770204	88.679245	4.167918	74.528305	4.628859	91.037735	4.045900	70.754715	4.721118	86.320755	4.457648	77.294685	26.791646
4	Andorra	4.770204	88.679245	4.995305	92.452827	5.087736	98.584908	4.898334	90.094337	4.985450	90.566040	4.602833	85.507248	29.339862

	country	fatalities	iso
95	Ukraine	13414	804
65	Myanmar	12901	104
69	Nigeria	5274	566
81	Somalia	4498	706
33	Ethiopia	3618	231
98	Yemen	3576	887
13	Brazil	3008	76
26	Democratic Republic of Congo	2919	180
86	Syria	2599	760
0	Afghanistan	2424	4

	Sending_Country	Flow	Sending_iso2	Sending_iso3	Sending_gdppc	Sending_mpm	Sending_incomegroup	Sending_iso3_num	Sending_conflict_casualties	Sending_govern	Sending_cri	Sending_pop
17	Bolivia	36624	BO	BOL	8244.235658	4.539775	Upper-middle Income	68.0	2.20	16.553992	63.500000	NaN
38	Côte d'Ivoire	8292	CI	CIV	5537.369758	37.273455	Upper-middle Income	384.0	21.05	18.866787	89.500000	NaN
63	Iran	43182	IR	IRN	15461.079340	1.027940	High Income	364.0	136.15	22.317889	82.451203	NaN
72	North Korea	78	KP	PRK	1217.000000	46.103936	Lower-middle Income	408.0	5.10	16.658763	82.451203	NaN
81	Macedonia	28024	MK	MKD	17128.642860	3.205135	High Income	807.0	0.00	22.317889	82.451203	NaN
105	Non-Citizens	4090	RNC	XXX	15549.957897	2.721764	High Income	NaN	0.00	22.317889	82.451203	NaN
140	Syria	138876	SY	SYR	752.000000	68.499527	Low Income	760.0	1945.75	11.075789	82.451203	NaN
141	Taiwan	3608	TW	TWN	32716.000000	0.061490	High Income	158.0	0.00	22.317889	82.451203	NaN
143	Tanzania	1536	TZ	TZA	2623.861572	54.589677	Lower-middle Income	834.0	9.15	18.300585	69.830000	NaN
161	Venezuela	340468	VE	VEN	3420.000000	46.103936	Lower-middle Income	862.0	445.85	16.658763	104.170000	NaN