Data Preprocessing

In this notebook, we will preprocess the raw json file and calculate necessary information¶

# Load some libraries
import pandas as pd
import json
from pandas import DataFrame
import numpy as np
import warnings
warnings.filterwarnings("ignore")

I. Preprocessing the raw data¶

# load json file
with open('../data/data.json') as f:
    data = json.load(f)

# convert the json data to a dataframe
df = pd.DataFrame(data)

# check the dataframe
df.head()

# drop unused columns
df.drop(['id', 'version','countryCode', 'shortName','score'], axis=1, inplace=True)

# rename remaining columns
df.columns = ['Year','Country','Code','Parameter','Cropland','Grazing','Forest','Fishing','Builtup','Carbon','Total']

df.head()

# remove countries without country codes (eg. we remove Sudan (former) and keep Sudan)
df = df[df.Code != '']

# sort the dataframe
df = df.sort_values(by=['Code','Year','Parameter'], ascending = True)

# reset index
df.reset_index(drop=True, inplace=True)

# remove duplicates
df.drop_duplicates(inplace=True)

df.head()

# save the full preprocessed data to a csv file
df.to_csv(path_or_buf="../data/data0.csv", index = False)

II. Caculating Deficit/Reserve¶

# get a list of country names
country_list = df['Country'].tolist()
country_list = list(set(country_list))

# create new dataframe to save the data for deficit/reserve
newdf = df[['Year','Country','Code']]
newdf.drop_duplicates(inplace=True) 
newdf = newdf.reindex(columns = newdf.columns.tolist() + ['Earths','HDI','Population','BiocapPerCap','BiocapTotGHA','EFConsPerCap','EFConsTotGHA','DeRe','Percentage'])

# check the new dataframe newdf
newdf.head()

# fill the values in the newdf using data from df
# loop for the year, countries and the columns in the dataframe newdf
# use try, except to avoid missing values because some countries miss record at certain years

for year in range(1960,2018):
    for country in country_list:
        for column in ['Earths','HDI','Population','BiocapPerCap','BiocapTotGHA','EFConsPerCap','EFConsTotGHA']:
            try:
                newdf.loc[(newdf.Country == country) & (newdf.Year == year),[column]] = float(df[(df.Country == country) & (df.Year == year) & (df.Parameter == column)]['Total'])
            except:
                pass

# calculate the deficit/reserve
newdf['DeRe'] = newdf['BiocapPerCap'] - newdf['EFConsPerCap']
# calculate the difference in percentage
newdf['Percentage'] = newdf['DeRe'] / newdf['BiocapPerCap']

# remove missing values if exist
newdf.dropna(inplace = True)

newdf.head()

newdf.to_csv(path_or_buf="../data/data1.csv", index = False)

	id	version	year	countryCode	countryName	shortName	isoa2	record	cropLand	grazingLand	forestLand	fishingGround	builtupLand	carbon	value	score
0	3	None	1992	1	Armenia	Armenia	AM	BiocapPerCap	1.598044e-01	0.135261	0.084003	0.013742	0.033398	0.000000e+00	4.262086e-01	3A
1	4	None	1992	1	Armenia	Armenia	AM	BiocapTotGHA	5.501762e+05	465677.972167	289207.107815	47311.551724	114982.279295	0.000000e+00	1.467355e+06	3A
2	5	None	1992	1	Armenia	Armenia	AM	EFConsPerCap	3.875102e-01	0.189462	0.000001	0.004165	0.033398	1.114093e+00	1.728629e+00	3A
3	6	None	1992	1	Armenia	Armenia	AM	EFConsTotGHA	1.334124e+06	652282.314685	4.328087	14339.449236	114982.279295	3.835610e+06	5.951343e+06	3A
4	15	None	1993	1	Armenia	Armenia	AM	BiocapPerCap	1.581890e-01	0.138671	0.085922	0.014068	0.031640	0.000000e+00	4.284895e-01	3A

	Year	Country	Code	Parameter	Cropland	Grazing	Forest	Fishing	Builtup	Carbon	Total
0	1992	Armenia	AM	BiocapPerCap	1.598044e-01	0.135261	0.084003	0.013742	0.033398	0.000000e+00	4.262086e-01
1	1992	Armenia	AM	BiocapTotGHA	5.501762e+05	465677.972167	289207.107815	47311.551724	114982.279295	0.000000e+00	1.467355e+06
2	1992	Armenia	AM	EFConsPerCap	3.875102e-01	0.189462	0.000001	0.004165	0.033398	1.114093e+00	1.728629e+00
3	1992	Armenia	AM	EFConsTotGHA	1.334124e+06	652282.314685	4.328087	14339.449236	114982.279295	3.835610e+06	5.951343e+06
4	1993	Armenia	AM	BiocapPerCap	1.581890e-01	0.138671	0.085922	0.014068	0.031640	0.000000e+00	4.284895e-01

	Year	Country	Code	Parameter	Cropland	Grazing	Forest	Fishing	Builtup	Carbon	Total
0	1975	United Arab Emirates	AE	GDP	NaN	NaN	NaN	NaN	NaN	NaN	102479.187500
1	1976	United Arab Emirates	AE	GDP	NaN	NaN	NaN	NaN	NaN	NaN	102319.726562
2	1977	United Arab Emirates	AE	GDP	NaN	NaN	NaN	NaN	NaN	NaN	107452.179688
3	1978	United Arab Emirates	AE	GDP	NaN	NaN	NaN	NaN	NaN	NaN	92822.359375
4	1979	United Arab Emirates	AE	GDP	NaN	NaN	NaN	NaN	NaN	NaN	100480.437500

	Year	Country	Code	Earths	HDI	Population	BiocapPerCap	BiocapTotGHA	EFConsPerCap	EFConsTotGHA	DeRe	Percentage
0	1975	United Arab Emirates	AE	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	1976	United Arab Emirates	AE	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	1977	United Arab Emirates	AE	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	1978	United Arab Emirates	AE	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	1979	United Arab Emirates	AE	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Year	Country	Code	Earths	HDI	Population	BiocapPerCap	BiocapTotGHA	EFConsPerCap	EFConsTotGHA	DeRe	Percentage
75	1990	United Arab Emirates	AE	6.237739	0.727	1860174.0	2.602391	4.840890e+06	12.902908	2.400160e+07	-10.300517	-3.958097
83	1991	United Arab Emirates	AE	6.859624	0.738	1970026.0	2.472270	4.870447e+06	13.852905	2.729064e+07	-11.380635	-4.603313
91	1992	United Arab Emirates	AE	6.348376	0.742	2086639.0	2.382417	4.971246e+06	12.829025	2.676956e+07	-10.446608	-4.384879
99	1993	United Arab Emirates	AE	6.331413	0.748	2207405.0	2.255677	4.979203e+06	12.545192	2.769238e+07	-10.289515	-4.561609
107	1994	United Arab Emirates	AE	6.670189	0.756	2328686.0	2.143710	4.992035e+06	13.122528	3.055830e+07	-10.978818	-5.121411