# coding: utf-8
# Data Analysis and Data Engineering
#
# Import all libraries first

# In[24]:
"""
Created on Sun Sep 3 01:17:41 2017

@author: Kalyan
"""
import pandas as pd
import numpy as np
from pandas import DataFrame
import os

# Load the Data Set

# In[25]:
data = pd.read_csv("http://makemeanalyst.com/python-data-science/gapminder.csv",
                   low_memory=False)

# In[26]:
data.head()

# In[27]:
print("No of Records : ", len(data))            # This will show number of observations
print("No of Features : ", len(data.columns))   # This will show number of Features

# Check the types of the variables

# In[28]:
print(data.dtypes)

# Setting variables you will be working with to numeric

# In[30]:
# NOTE: Series.convert_objects(convert_numeric=True) was deprecated in pandas
# 0.17 and removed in 0.21. pd.to_numeric(errors='coerce') is the supported
# equivalent: unparseable values become NaN instead of raising.
data['incomeperperson'] = pd.to_numeric(data['incomeperperson'], errors='coerce')
data['urbanrate'] = pd.to_numeric(data['urbanrate'], errors='coerce')
data['employrate'] = pd.to_numeric(data['employrate'], errors='coerce')
print(data.dtypes)

# First I want to see whether there are any countries where incomeperperson,
# urbanrate and employrate are all NA values

# In[36]:
# .isna() is the idiomatic pandas missing-value test (np.isnan only works on
# float dtypes); behavior is identical here since the columns are now numeric.
sub1 = data[data['incomeperperson'].isna()
            & data['urbanrate'].isna()
            & data['employrate'].isna()]
sub1
# Now I made the income variable categorical; I made it into three categories.

# In[37]:
# NOTE: DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0. The
# original data.ix[:, ['incomeperperson']].max() also returned a one-element
# Series rather than the scalar bin edge pd.cut expects; Series.max() yields
# the scalar directly.
data['factor_income'] = pd.cut(
    data['incomeperperson'],
    [0, 1000, 12735, data['incomeperperson'].max()],
    labels=['Lower Income', 'Middle Income', 'Upper Income'])

# In[43]:
data.head()

# In[39]:
print('counts for original incomeperperson')
c1 = data['factor_income'].value_counts(sort=False, dropna=False)
print(c1)  # frequency of each category, including the NA count
print(data['factor_income'].describe())

# Data management for urban rate
#
# I will use a quartile split (use the qcut function & ask for 4 groups —
# gives you a quartile split)

# In[40]:
data['factor_urbanrate'] = pd.qcut(
    data['urbanrate'], 4,
    labels=["1=0%tile", "2=25%tile", "3=50%tile", "4=75%tile"])
print('counts for original incomeperperson')
c2 = data['factor_urbanrate'].value_counts(sort=False, dropna=False)
# normalize=True reports the fraction of rows falling in each category
c3 = data['factor_urbanrate'].value_counts(sort=False, dropna=False,
                                           normalize=True)
print(c2)  # frequency of each category, including the NA count
print("Percentage for each catagory\n", c3*100)

# Now create a new employrate variable which is categorical, with three
# categories: Low, Average, High

# In[41]:
# Same .ix fix as above: use the scalar Series.max() as the upper bin edge.
data['EmpRateCatogory'] = pd.cut(
    data['employrate'],
    [0, 50, 70, data['employrate'].max()],
    labels=['Low', 'Average', 'High'])
c4 = data['EmpRateCatogory'].value_counts(sort=False, dropna=False)
print(c4)

# Try yourself

# In[ ]: