# Load some libraries
import pandas as pd
import json
from pandas import DataFrame
import numpy as np
import warnings
warnings.filterwarnings("ignore")
# load json file
with open('../data/data.json') as f:
data = json.load(f)
# convert the json data to a dataframe
df = pd.DataFrame(data)
# check the dataframe
df.head()
# drop unused columns
df.drop(['id', 'version','countryCode', 'shortName','score'], axis=1, inplace=True)
# rename remaining columns
df.columns = ['Year','Country','Code','Parameter','Cropland','Grazing','Forest','Fishing','Builtup','Carbon','Total']
df.head()
# remove countries without country codes (eg. we remove Sudan (former) and keep Sudan)
df = df[df.Code != '']
# sort the dataframe
df = df.sort_values(by=['Code','Year','Parameter'], ascending = True)
# reset index
df.reset_index(drop=True, inplace=True)
# remove duplicates
df.drop_duplicates(inplace=True)
df.head()
# save the full preprocessed data to a csv file
df.to_csv(path_or_buf="../data/data0.csv", index = False)
# get a list of country names
country_list = df['Country'].tolist()
country_list = list(set(country_list))
# create new dataframe to save the data for deficit/reserve
newdf = df[['Year','Country','Code']]
newdf.drop_duplicates(inplace=True)
newdf = newdf.reindex(columns = newdf.columns.tolist() + ['Earths','HDI','Population','BiocapPerCap','BiocapTotGHA','EFConsPerCap','EFConsTotGHA','DeRe','Percentage'])
# check the new dataframe newdf
newdf.head()
# fill the values in the newdf using data from df
# loop for the year, countries and the columns in the dataframe newdf
# use try, except to avoid missing values because some countries miss record at certain years
for year in range(1960,2018):
for country in country_list:
for column in ['Earths','HDI','Population','BiocapPerCap','BiocapTotGHA','EFConsPerCap','EFConsTotGHA']:
try:
newdf.loc[(newdf.Country == country) & (newdf.Year == year),[column]] = float(df[(df.Country == country) & (df.Year == year) & (df.Parameter == column)]['Total'])
except:
pass
# calculate the deficit/reserve
newdf['DeRe'] = newdf['BiocapPerCap'] - newdf['EFConsPerCap']
# calculate the difference in percentage
newdf['Percentage'] = newdf['DeRe'] / newdf['BiocapPerCap']
# remove missing values if exist
newdf.dropna(inplace = True)
newdf.head()
newdf.to_csv(path_or_buf="../data/data1.csv", index = False)