# coding: utf-8
# Data Analysis and Data Engineering
#
# Import all libraries first

# In[24]:
"""
Created on Sun Sep 3 01:17:41 2017

@author: Kalyan
"""
import pandas as pd
import numpy as np
from pandas import DataFrame
import os

# Load the Data Set

# In[25]:
data = pd.read_csv("http://makemeanalyst.com/python-data-science/gapminder.csv",
                   low_memory=False)

# In[26]:
data.head()

# In[27]:
print("No of Records : ", len(data))            # This will show number of observations
print("No of Features : ", len(data.columns))   # This will show number of Features

# Check the types of the variables

# In[28]:
print(data.dtypes)

# Setting variables you will be working with to numeric

# In[30]:
# NOTE: Series.convert_objects(convert_numeric=True) was deprecated in pandas
# 0.17 and removed in 0.21. pd.to_numeric(errors='coerce') is the supported
# equivalent: unparseable values become NaN instead of raising.
data['incomeperperson'] = pd.to_numeric(data['incomeperperson'], errors='coerce')
data['urbanrate'] = pd.to_numeric(data['urbanrate'], errors='coerce')
data['employrate'] = pd.to_numeric(data['employrate'], errors='coerce')
print(data.dtypes)

# First I want to see whether there are any countries where incomeperperson,
# urbanrate and employrate are all NA values

# In[36]:
# .isna() is the idiomatic pandas missing-value test (np.isnan only works on
# float dtypes); behavior is identical here since the columns are now numeric.
sub1 = data[data['incomeperperson'].isna()
            & data['urbanrate'].isna()
            & data['employrate'].isna()]
sub1
# Now I made the income variable categorical; I made it into three categories.

# In[37]:
# NOTE: DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0. The
# original data.ix[:, ['incomeperperson']].max() also returned a one-element
# Series rather than the scalar bin edge pd.cut expects; Series.max() yields
# the scalar directly.
data['factor_income'] = pd.cut(
    data['incomeperperson'],
    [0, 1000, 12735, data['incomeperperson'].max()],
    labels=['Lower Income', 'Middle Income', 'Upper Income'])

# In[43]:
data.head()

# In[39]:
print('counts for original incomeperperson')
c1 = data['factor_income'].value_counts(sort=False, dropna=False)
print(c1)  # frequency of each category, including the NA count
print(data['factor_income'].describe())

# Data management for urban rate
#
# I will use a quartile split (use the qcut function & ask for 4 groups —
# gives you a quartile split)

# In[40]:
data['factor_urbanrate'] = pd.qcut(
    data['urbanrate'], 4,
    labels=["1=0%tile", "2=25%tile", "3=50%tile", "4=75%tile"])
print('counts for original incomeperperson')
c2 = data['factor_urbanrate'].value_counts(sort=False, dropna=False)
# normalize=True reports the fraction of rows falling in each category
c3 = data['factor_urbanrate'].value_counts(sort=False, dropna=False,
                                           normalize=True)
print(c2)  # frequency of each category, including the NA count
print("Percentage for each catagory\n", c3*100)

# Now create a new employrate variable which is categorical, with three
# categories: Low, Average, High

# In[41]:
# Same .ix fix as above: use the scalar Series.max() as the upper bin edge.
data['EmpRateCatogory'] = pd.cut(
    data['employrate'],
    [0, 50, 70, data['employrate'].max()],
    labels=['Low', 'Average', 'High'])
c4 = data['EmpRateCatogory'].value_counts(sort=False, dropna=False)
print(c4)

# Try yourself

# In[ ]: