Data Preprocessing

In this notebook, we will preprocess the raw json file and calculate necessary information

In [1]:
# Load some libraries
import pandas as pd
import json
from pandas import DataFrame
import numpy as np
import warnings
warnings.filterwarnings("ignore")

I. Preprocessing the raw data

In [2]:
# load json file
with open('../data/data.json') as f:
    data = json.load(f)
In [3]:
# convert the json data to a dataframe
df = pd.DataFrame(data)
In [4]:
# check the dataframe
df.head()
Out[4]:
id version year countryCode countryName shortName isoa2 record cropLand grazingLand forestLand fishingGround builtupLand carbon value score
0 3 None 1992 1 Armenia Armenia AM BiocapPerCap 1.598044e-01 0.135261 0.084003 0.013742 0.033398 0.000000e+00 4.262086e-01 3A
1 4 None 1992 1 Armenia Armenia AM BiocapTotGHA 5.501762e+05 465677.972167 289207.107815 47311.551724 114982.279295 0.000000e+00 1.467355e+06 3A
2 5 None 1992 1 Armenia Armenia AM EFConsPerCap 3.875102e-01 0.189462 0.000001 0.004165 0.033398 1.114093e+00 1.728629e+00 3A
3 6 None 1992 1 Armenia Armenia AM EFConsTotGHA 1.334124e+06 652282.314685 4.328087 14339.449236 114982.279295 3.835610e+06 5.951343e+06 3A
4 15 None 1993 1 Armenia Armenia AM BiocapPerCap 1.581890e-01 0.138671 0.085922 0.014068 0.031640 0.000000e+00 4.284895e-01 3A
In [5]:
# drop unused columns
df.drop(['id', 'version','countryCode', 'shortName','score'], axis=1, inplace=True)
In [6]:
# rename remaining columns
df.columns = ['Year','Country','Code','Parameter','Cropland','Grazing','Forest','Fishing','Builtup','Carbon','Total']
In [7]:
df.head()
Out[7]:
Year Country Code Parameter Cropland Grazing Forest Fishing Builtup Carbon Total
0 1992 Armenia AM BiocapPerCap 1.598044e-01 0.135261 0.084003 0.013742 0.033398 0.000000e+00 4.262086e-01
1 1992 Armenia AM BiocapTotGHA 5.501762e+05 465677.972167 289207.107815 47311.551724 114982.279295 0.000000e+00 1.467355e+06
2 1992 Armenia AM EFConsPerCap 3.875102e-01 0.189462 0.000001 0.004165 0.033398 1.114093e+00 1.728629e+00
3 1992 Armenia AM EFConsTotGHA 1.334124e+06 652282.314685 4.328087 14339.449236 114982.279295 3.835610e+06 5.951343e+06
4 1993 Armenia AM BiocapPerCap 1.581890e-01 0.138671 0.085922 0.014068 0.031640 0.000000e+00 4.284895e-01
In [8]:
# remove countries without country codes (eg. we remove Sudan (former) and keep Sudan)
df = df[df.Code != '']
In [9]:
# sort the dataframe
df = df.sort_values(by=['Code','Year','Parameter'], ascending = True)
In [10]:
# reset index
df.reset_index(drop=True, inplace=True)
In [11]:
# remove duplicates
df.drop_duplicates(inplace=True) 
In [12]:
df.head()
Out[12]:
Year Country Code Parameter Cropland Grazing Forest Fishing Builtup Carbon Total
0 1975 United Arab Emirates AE GDP NaN NaN NaN NaN NaN NaN 102479.187500
1 1976 United Arab Emirates AE GDP NaN NaN NaN NaN NaN NaN 102319.726562
2 1977 United Arab Emirates AE GDP NaN NaN NaN NaN NaN NaN 107452.179688
3 1978 United Arab Emirates AE GDP NaN NaN NaN NaN NaN NaN 92822.359375
4 1979 United Arab Emirates AE GDP NaN NaN NaN NaN NaN NaN 100480.437500
In [13]:
# save the full preprocessed data to a csv file
df.to_csv(path_or_buf="../data/data0.csv", index = False)

II. Caculating Deficit/Reserve

In [14]:
# get a list of country names
country_list = df['Country'].tolist()
country_list = list(set(country_list))
In [15]:
# create new dataframe to save the data for deficit/reserve
newdf = df[['Year','Country','Code']]
newdf.drop_duplicates(inplace=True) 
newdf = newdf.reindex(columns = newdf.columns.tolist() + ['Earths','HDI','Population','BiocapPerCap','BiocapTotGHA','EFConsPerCap','EFConsTotGHA','DeRe','Percentage'])
In [16]:
# check the new dataframe newdf
newdf.head()
Out[16]:
Year Country Code Earths HDI Population BiocapPerCap BiocapTotGHA EFConsPerCap EFConsTotGHA DeRe Percentage
0 1975 United Arab Emirates AE NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 1976 United Arab Emirates AE NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 1977 United Arab Emirates AE NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 1978 United Arab Emirates AE NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 1979 United Arab Emirates AE NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [17]:
# fill the values in the newdf using data from df
# loop for the year, countries and the columns in the dataframe newdf
# use try, except to avoid missing values because some countries miss record at certain years

for year in range(1960,2018):
    for country in country_list:
        for column in ['Earths','HDI','Population','BiocapPerCap','BiocapTotGHA','EFConsPerCap','EFConsTotGHA']:
            try:
                newdf.loc[(newdf.Country == country) & (newdf.Year == year),[column]] = float(df[(df.Country == country) & (df.Year == year) & (df.Parameter == column)]['Total'])
            except:
                pass
        
        
        
In [18]:
# calculate the deficit/reserve
newdf['DeRe'] = newdf['BiocapPerCap'] - newdf['EFConsPerCap']
# calculate the difference in percentage
newdf['Percentage'] = newdf['DeRe'] / newdf['BiocapPerCap'] 
In [19]:
# remove missing values if exist
newdf.dropna(inplace = True) 
In [20]:
newdf.head()
Out[20]:
Year Country Code Earths HDI Population BiocapPerCap BiocapTotGHA EFConsPerCap EFConsTotGHA DeRe Percentage
75 1990 United Arab Emirates AE 6.237739 0.727 1860174.0 2.602391 4.840890e+06 12.902908 2.400160e+07 -10.300517 -3.958097
83 1991 United Arab Emirates AE 6.859624 0.738 1970026.0 2.472270 4.870447e+06 13.852905 2.729064e+07 -11.380635 -4.603313
91 1992 United Arab Emirates AE 6.348376 0.742 2086639.0 2.382417 4.971246e+06 12.829025 2.676956e+07 -10.446608 -4.384879
99 1993 United Arab Emirates AE 6.331413 0.748 2207405.0 2.255677 4.979203e+06 12.545192 2.769238e+07 -10.289515 -4.561609
107 1994 United Arab Emirates AE 6.670189 0.756 2328686.0 2.143710 4.992035e+06 13.122528 3.055830e+07 -10.978818 -5.121411
In [21]:
newdf.to_csv(path_or_buf="../data/data1.csv", index = False)
In [ ]: