# import libraries here; add more as necessary
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
# magic word for producing visualizations in notebook
%matplotlib inline
There are four files associated with this project (not including this one):

- Udacity_AZDIAS_Subset.csv: Demographics data for the general population of Germany; 891211 persons (rows) x 85 features (columns).
- Udacity_CUSTOMERS_Subset.csv: Demographics data for customers of a mail-order company; 191652 persons (rows) x 85 features (columns).
- Data_Dictionary.md: Detailed information file about the features in the provided datasets.
- AZDIAS_Feature_Summary.csv: Summary of feature attributes for demographics data; 85 features (rows) x 4 columns.

# Load in the general demographics data.
azdias = pd.read_csv("Udacity_AZDIAS_Subset.csv", sep=';')
# Load in the feature summary file.
feat_info = pd.read_csv("AZDIAS_Feature_Summary.csv", sep=';')
# Check the structure of the data after it's loaded (e.g. print the number of
# rows and columns, print the first few rows).
# TODO: Total number of rows
print("Total number of rows: " + str(azdias.shape[0]))
# TODO: Total number of columns
print("Total number of columns: " + str(azdias.shape[1]))
# First 5 rows
display(azdias.head(n=5))
#randomly pick 10 features to check
feat_info.sample(n=10)
feat_info[feat_info['attribute'] == 'PRAEGENDE_JUGENDJAHRE']
Tip: Add additional cells to keep everything in reasonably-sized chunks! Keyboard shortcut: esc --> a (press escape to enter command mode, then press the 'A' key) adds a new cell before the active cell, and esc --> b adds a new cell after the active cell. If you need to convert an active cell to a markdown cell, use esc --> m; to convert it to a code cell, use esc --> y.
# Identify missing or unknown data values and convert them to NaNs.
col_names = azdias.columns.values.tolist()
def cov2NaNs(number):
    x = feat_info.missing_or_unknown[number]  # string expression of the list, e.g. '[-1,0]'
    x = x.replace(" ", "")                    # remove blanks
    l = x[1:-1].split(',')                    # parse the elements
    for i in l:
        if i.lstrip('-').isdigit():           # numeric code (possibly negative): replace its int form
            azdias.iloc[:, number].replace(int(i), np.nan, inplace=True)
        else:                                 # non-numeric codes such as 'X' or 'XX'
            azdias.iloc[:, number].replace(i, np.nan, inplace=True)

for n in range(0, len(feat_info)):
    cov2NaNs(n)
#save the converted df
#azdias.to_csv('new_azdias.csv', encoding='utf-8', index=False)
#check if successfully converted
print(feat_info.missing_or_unknown[feat_info.attribute =='SEMIO_SOZ'])
print(azdias['SEMIO_SOZ'][azdias['SEMIO_SOZ'] == -1])
print(azdias['SEMIO_SOZ'][azdias['SEMIO_SOZ'] == 9])
azdias['SEMIO_SOZ']
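As a hedged aside, the per-column loop above can also be written as a single vectorized DataFrame.replace call. The sketch below assumes feat_info rows align with azdias columns, exactly as in the loop; parse_codes and azdias_alt are hypothetical names introduced here for illustration.
# Vectorized alternative (sketch): build one {column: codes} map and call replace once.
def parse_codes(s):
    """Parse a string such as '[-1,0]' or '[-1,X]' into a list of codes."""
    tokens = s.strip('[]').replace(' ', '').split(',')
    return [int(t) if t.lstrip('-').isdigit() else t for t in tokens if t != '']

missing_map = {row['attribute']: parse_codes(row['missing_or_unknown'])
               for _, row in feat_info.iterrows()}
azdias_alt = azdias.replace({col: {code: np.nan for code in codes}
                             for col, codes in missing_map.items()})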
How much missing data is present in each column?
Identify and document these columns; some of them might still have justification for keeping or re-encoding their data.
For the remaining features, are there any patterns in which columns have, or share, missing data?
# Perform an assessment of how much missing data there is in each column of the
# dataset.
col_with_missing = pd.DataFrame(data={
    'vars': azdias.columns,
    'count_missing': azdias.isnull().sum(axis=0),
    'missing_perc (%)': round(azdias.isnull().sum(axis=0) / azdias.shape[0], 4) * 100})
vars_with_missing = pd.DataFrame(col_with_missing[col_with_missing['count_missing']>0])
vars_with_missing.sort_values(by='count_missing', ascending=False, inplace=True)
vars_with_missing.reset_index(drop =True, inplace =True)
pd.options.display.max_rows = 61
print('In total, there are {} variables with missing values'.format(len(vars_with_missing)))
print('Columns which share the same proportion of missing values can be checked from the table below')
print('''e.g. columns sharing 14.96% missingness:
KBA05_ANTG3
KBA05_ANTG1
KBA05_ANTG2
KBA05_GBZ
KBA05_ANTG4
MOBI_REGIO''')
display(vars_with_missing)
pd.reset_option("display.max_rows")
# Investigate patterns in the amount of missing data in each column.
col_missing = pd.DataFrame(data={'vars': list(vars_with_missing['vars']), 'count': list(vars_with_missing['count_missing'])})
vars_with_missing = list(vars_with_missing['vars'])
ax = col_missing.plot.bar(x='vars', y='count', rot=90, figsize=(25,5))
#visualize the location of missing values (for all features which have missing values)
fig = plt.subplots(figsize=(15,10))
sns.heatmap(azdias[vars_with_missing].isnull())
# Remove the outlier columns from the dataset. (You'll perform other data
# engineering tasks such as re-encoding and imputation later.)
#Remove columns with 200000 or more missing values (these columns also show long contiguous runs of missing values in the heatmap above)
outlier_vars = list(col_missing['vars'][col_missing['count'] >= 200000])
azdias_new = pd.DataFrame(azdias.drop(outlier_vars, axis=1))
#get the updated list of remaining features
new_vars_with_missing = [v for v in vars_with_missing if v not in outlier_vars]
outlier_vars
azdias_new.head()
#visualize the location of missing values (for the remaining features after outliers removed)
fig = plt.subplots(figsize=(15,10))
sns.heatmap(azdias_new[new_vars_with_missing].isnull())
In total, there are 61 variables with missing values. Most of them have fewer than 200000 missing values; however, the features 'AGER_TYP', 'GEBURTSJAHR', 'TITEL_KZ', 'ALTER_HH', 'KK_KUNDENTYP' and 'KBA05_BAUMAX' have missing counts far beyond 200000 (~22% of rows), so those features are removed from the dataset. Also, from the heatmaps we can see that missingness in affected rows runs through multiple columns. This is a benefit: by simply removing the rows with large amounts of missingness we can resolve most of the missing values at once.
# How much data is missing in each row of the dataset now?
row_with_missing = pd.DataFrame(data={'row_idx':azdias_new.index, 'count_missing': azdias_new.isnull().sum(axis=1)})
#how many rows with missing value
num_miss_row = azdias_new.index[azdias_new.isnull().any(axis=1)]
print('There are {} rows with missing values, totaling {} missing entries.'.format(len(num_miss_row), np.sum(row_with_missing['count_missing'])))
#plot histogram of missingness in rows to choose a suitable threshold
plt.hist(row_with_missing['count_missing'])
# Write code to divide the data into two subsets based on the number of missing
# values in each row.
#assign subset labels to divide dataset into two groups
#from the histogram above, I choose 10 as the threshold of missing values to subset the data (suggested by the reviewer)
azdias_new['subset_label'] = np.where(row_with_missing['count_missing'] < 10, 'missingness < 10', 'missingness >= 10')
print('There are {} rows in first subset and {} rows in the second subset.'.format(sum(azdias_new['subset_label']=='missingness < 10'), sum(azdias_new['subset_label']== 'missingness >= 10')))
azdias_new.head(20)
#select columns with no or few missing values for comparison
col_with_missing = pd.DataFrame(data={'vars':azdias_new.columns, 'count_missing': azdias_new.isnull().sum(axis=0)})
col_with_missing.sort_values(by='count_missing', ascending=True, inplace=True)
#get first 10 features with smallest missingness
vars_for_plot = list(col_with_missing['vars'])[0:10]
#display the 10 features
col_with_missing.head(10)
# Compare the distribution of values for at least five columns where there are
# no or few missing values, between the two subsets.
for f in vars_for_plot:
    fig, ax = plt.subplots(figsize=(15, 10))
    sns.countplot(ax=ax, x=f, hue="subset_label", data=azdias_new)
    plt.ylabel('count', fontsize=10)
    plt.xlabel(f, fontsize=10)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.show();
#create the updated azdias dataset
azdias_new = pd.DataFrame(azdias_new[azdias_new['subset_label'] == 'missingness < 10'])
azdias_new.drop(['subset_label'], axis=1, inplace = True)
#visualize the location of missingness for the new dataset
fig = plt.subplots(figsize=(15,10))
sns.heatmap(azdias_new[new_vars_with_missing].isnull())
From the histogram, we can see that the number of missing values in each row is mostly under 10 (a threshold suggested by the reviewer). So we can remove the rows with 10 or more missing values without losing many rows. From the countplots, we can see that for some of the features the patterns in the data with many missing values show no big difference from the data with few or no missing values; for others, however, there are differences.
# How many features are there of each data type?
for v in outlier_vars:
    feat_info = feat_info[feat_info['attribute'] != v]
pd.DataFrame({'count' : feat_info.groupby(['type']).size()}).reset_index()
#get the list of categorical data
cat_mix_vars = feat_info['attribute'][feat_info['type'] == 'categorical']
cat_mix_vars[0:10]
# Assess categorical variables: which are binary, which are multi-level, and
# which one needs to be re-encoded?
mat_cat_mix = pd.DataFrame(
    [[v, pd.unique(azdias_new[v]), len(pd.unique(azdias_new[v])),
      'binary' if len(pd.unique(azdias_new[v])) == 2 else 'multi'] for v in cat_mix_vars])
mat_cat_mix.columns = ['feature', 'level', 'number_of_levels', 'type']
#display levels of each categorical features
mat_cat_mix
#drop features with more than 10 levels from azdias_new
varstodrop = list(mat_cat_mix['feature'][mat_cat_mix['number_of_levels'] > 10])
varstodrop.append('GEBAEUDETYP') #since there is no 5.0 level in customer data
azdias_new = pd.DataFrame(azdias_new.drop(varstodrop, axis=1))
varstodrop
azdias_new.shape
# Re-encode the levels in ANREDE_KZ, SOHO_KZ and OST_WEST_KZ into [0,1]
azdias_new['ANREDE_KZ'] = azdias_new['ANREDE_KZ'].map({1:0, 2:1})
azdias_new['SOHO_KZ'] = azdias_new['SOHO_KZ'].map({0.0:0, 1.0:1})
azdias_new['OST_WEST_KZ'] = azdias_new['OST_WEST_KZ'].map({'W':0, 'O':1})
#remove from varstodrop
for v in varstodrop:
    mat_cat_mix.drop(mat_cat_mix[mat_cat_mix['feature'] == v].index, inplace=True)
#exempt binary features
#mat_cat_mix.drop(mat_cat_mix[mat_cat_mix['type'] == 'binary'].index, inplace=True)
mat_cat_mix
# Re-encode categorical variable(s) to be kept in the analysis.
vars_to_encode = list(mat_cat_mix['feature'])
azdias_encoded = pd.DataFrame(pd.get_dummies(azdias_new, prefix=vars_to_encode, columns=vars_to_encode))
azdias_encoded.reset_index(drop=True, inplace= True)
print(len(azdias_encoded))
azdias_encoded.head(10)
vars_to_encode
I first build a matrix with the feature name, levels, number of levels, and type of each categorical feature, so that I have a map for the feature engineering. I then drop the features with more than 10 levels from the dataset ('GFK_URLAUBERTYP', 'LP_FAMILIE_FEIN', 'CAMEO_DEU_2015'), plus 'GEBAEUDETYP' (its level 5.0 does not occur in the customer data). After that, I re-encode the levels in ANREDE_KZ, SOHO_KZ and OST_WEST_KZ into 0/1. Finally, I one-hot encode the categorical variables to be kept in the analysis.
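For reference, a toy sketch (made-up values, not project data) of what the one-hot step above does: get_dummies expands a multi-level categorical column into indicator columns.
# Toy illustration of get_dummies on a single multi-level column.
toy = pd.DataFrame({'FINANZTYP': [1, 3, 3, 5]})
print(pd.get_dummies(toy, prefix=['FINANZTYP'], columns=['FINANZTYP']))
#    FINANZTYP_1  FINANZTYP_3  FINANZTYP_5
# 0            1            0            0
# 1            0            1            0
# 2            0            1            0
# 3            0            0            1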
There are a handful of features that are marked as "mixed" in the feature summary and require special treatment in order to be included in the analysis. Two in particular deserve attention; the handling of the rest is up to your own choices:
#display mixed variables
mixed_vars = feat_info['attribute'][feat_info['type'] == 'mixed']
mixed_vars
mat_mixed = pd.DataFrame([[v, pd.unique(azdias_new[v]), len(pd.unique(azdias_new[v]))] for v in mixed_vars])
mat_mixed.columns = ['feature', 'level', 'number_of_levels']
#display levels of each mixed feature
mat_mixed
# Investigate "PRAEGENDE_JUGENDJAHRE" and engineer two new variables.
#it shows there are 16 out of 16 levels including nan.
print(mat_mixed.iloc[2,1])
#duplicate two "PRAEGENDE_JUGENDJAHRE" columns
azdias_encoded['PJ_decade'] = azdias_encoded['PRAEGENDE_JUGENDJAHRE']
azdias_encoded['PJ_movement'] = azdias_encoded['PRAEGENDE_JUGENDJAHRE']
x = {1.0: 40, 2.0: 40, 3.0: 50, 4.0: 50, 5.0: 60, 6.0: 60, 7.0: 60, 8.0: 70, 9.0: 70,
     10.0: 80, 11.0: 80, 12.0: 80, 13.0: 80, 14.0: 90, 15.0: 90}
# movement per Data_Dictionary.md (note: 12 = FDJ / communist party youth organisation is Mainstream)
y = {1.0: 'Mainstream', 2.0: 'Avantgarde', 3.0: 'Mainstream', 4.0: 'Avantgarde',
     5.0: 'Mainstream', 6.0: 'Avantgarde', 7.0: 'Avantgarde', 8.0: 'Mainstream', 9.0: 'Avantgarde',
     10.0: 'Mainstream', 11.0: 'Avantgarde', 12.0: 'Mainstream', 13.0: 'Avantgarde',
     14.0: 'Mainstream', 15.0: 'Avantgarde'}
#mapping and converting two newly created columns
azdias_encoded['PJ_decade'] = azdias_encoded['PJ_decade'].map(x)
azdias_encoded['PJ_movement'] = azdias_encoded['PJ_movement'].map(y)
azdias_encoded['PJ_movement'] = azdias_encoded['PJ_movement'].map({'Mainstream':0, 'Avantgarde':1})
# Investigate "CAMEO_INTL_2015" and engineer two new variables.
#it shows there are 22 out of 25 levels including nan.
print(mat_mixed.iloc[4,1])
#duplicate "CAMEO_INTL_2015" into two new columns
azdias_encoded['CAMEO_wealth'] = azdias_encoded['CAMEO_INTL_2015']
azdias_encoded['CAMEO_life'] = azdias_encoded['CAMEO_INTL_2015']
#map and convert (use .loc to avoid chained-assignment issues)
not_nan = azdias_encoded['CAMEO_INTL_2015'][azdias_encoded['CAMEO_INTL_2015'].notnull()]
azdias_encoded.loc[azdias_encoded['CAMEO_INTL_2015'].notnull(), 'CAMEO_wealth'] = [int(r) % 5 for r in not_nan]
#.map() would require every unique value to be mapped, so remap only the 0s to 5
azdias_encoded.loc[azdias_encoded['CAMEO_wealth'] == 0, 'CAMEO_wealth'] = 5
azdias_encoded.loc[azdias_encoded['CAMEO_INTL_2015'].notnull(), 'CAMEO_life'] = [int(int(r) / 10) for r in not_nan]
#check uniqueness
print(pd.unique(azdias_encoded['CAMEO_wealth']))
print(pd.unique(azdias_encoded['CAMEO_life']))
plt.hist(azdias_encoded['WOHNLAGE'][~np.isnan(azdias_encoded['WOHNLAGE'])]);
plt.hist(azdias_encoded['PLZ8_BAUMAX'][~np.isnan(azdias_encoded['PLZ8_BAUMAX'])]);
#drop original columns and WOHNLAGE, PLZ8_BAUMAX, LP_LEBENSPHASE_FEIN and LP_LEBENSPHASE_GROB
drop_vars = ['PRAEGENDE_JUGENDJAHRE','WOHNLAGE','PLZ8_BAUMAX','CAMEO_INTL_2015','LP_LEBENSPHASE_FEIN','LP_LEBENSPHASE_GROB']
azdias_encoded = pd.DataFrame(azdias_encoded.drop(drop_vars, axis=1))
Similar to what I did for the categorical features, I first build a matrix with the feature name, levels, and number of levels of each mixed feature, so that I have a map for the feature engineering. Then I map and convert 'CAMEO_INTL_2015' and 'PRAEGENDE_JUGENDJAHRE' each into two new columns.
WOHNLAGE: This variable is mostly ordinal, with levels 1-5 describing a building's neighborhood quality from very good to very poor. The issue is that levels 7 and 8 (there is no 6) flag whether the building is in a rural neighborhood, disconnected from neighborhood quality. Since re-encoding the rural flag separately would cause massive missingness, and I do not want to encode the NaNs into a column of their own, I decide to drop it.
PRAEGENDE_JUGENDJAHRE: This variable captures the dominating movement of a person's youth (avantgarde vs. mainstream; east vs. west Germany) on a scale from 1 to 15, with -1 and 0 unknown. I define a dictionary x mapping each level to its decade (40s through 90s) and a dictionary y mapping each level to its movement, as in the code above. Then I duplicate 'PRAEGENDE_JUGENDJAHRE' into 'PJ_decade' and 'PJ_movement', map the former by x and the latter by y, and finally remove 'PRAEGENDE_JUGENDJAHRE'.
CAMEO_INTL_2015: This variable is the German CAMEO Wealth / Life Stage typology mapped to an international two-digit code (11-15, 21-25, ..., 51-55), with -1 and 'XX' unknown. I duplicate 'CAMEO_INTL_2015' into 'CAMEO_wealth' and 'CAMEO_life'. I fill the former with int(r) % 5, which takes the remainder and remaps remainder 0 to 5 so that all scale values are covered, and I fill 'CAMEO_life' with int(int(r) / 10), which takes the tens digit (a short worked check follows this discussion).
PLZ8_BAUMAX: This records the most common building type within the PLZ8 region. The situation is similar to WOHNLAGE: re-encoding it would cause massive missingness, and I do not want to encode the NaNs into a column of their own, so I decide to drop it.
For the features 'LP_LEBENSPHASE_FEIN' (life stage, fine scale) and 'LP_LEBENSPHASE_GROB' (life stage, rough scale), the number of levels is large (especially LP_LEBENSPHASE_FEIN, with more than 40 levels built from 'LP_FAMILIE_FEIN', 'LP_FAMILIE_GROB' and 'LP_STATUS_GROB'), so I decide to drop them both.
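As promised above, a quick standalone check (a sketch, not project data) of the two digit-split expressions used for CAMEO_INTL_2015:
# Worked check of the digit-split trick on a few sample codes.
for r in ['11', '25', '43', '55']:
    units = int(r) % 5
    units = 5 if units == 0 else units   # remap remainder 0 to 5
    tens = int(r) // 10
    print(r, '-> % 5 (0->5) gives', units, '; // 10 gives', tens)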
# How much data is missing in each row of the dataset now?
row_with_missing = pd.DataFrame(data={'row_idx':azdias_encoded.index, 'count_missing': azdias_encoded.isnull().sum(axis=1)})
#how many rows with missing value
num_miss_row = azdias_encoded.index[azdias_encoded.isnull().any(axis=1)]
print('There are {} rows with missing values, totaling {} missing entries.'.format(len(num_miss_row), np.sum(row_with_missing['count_missing'])))
# If there are other re-engineering tasks you need to perform, make sure you
# take care of them here. (Dealing with missing data will come in step 2.1.)
# Do whatever you need to in order to ensure that the dataframe only contains
# the columns that should be passed to the algorithm functions.
#visualize the location of missingness for the new dataset
fig = plt.subplots(figsize=(15,10))
sns.heatmap(azdias_encoded.isnull())
def clean_data(df, n_for_rows):
    """
    Perform feature trimming, re-encoding, and engineering for demographics data.
    INPUT: df - demographics DataFrame; n_for_rows - row-missingness threshold
    OUTPUT: trimmed and cleaned demographics DataFrame
    """
    # convert missing or unknown value codes into NaNs
    feat_info = pd.read_csv("AZDIAS_Feature_Summary.csv", sep=';')
    for number in range(0, len(feat_info)):
        x = feat_info.missing_or_unknown[number]  # string expression of the list
        x = x.replace(" ", "")                    # remove blanks
        l = x[1:-1].split(',')                    # parse the elements
        for i in l:
            if i.lstrip('-').isdigit():           # numeric code (possibly negative): replace its int form
                df.iloc[:, number].replace(int(i), np.nan, inplace=True)
            else:                                 # non-numeric codes such as 'X' or 'XX'
                df.iloc[:, number].replace(i, np.nan, inplace=True)
    # remove the columns identified above as missingness outliers
    outlier_vars = ['AGER_TYP', 'GEBURTSJAHR', 'TITEL_KZ', 'ALTER_HH', 'KK_KUNDENTYP', 'KBA05_BAUMAX']
    df = pd.DataFrame(df.drop(outlier_vars, axis=1))
    # remove rows with n_for_rows or more missing values
    row_with_missing = pd.DataFrame(data={'row_idx': df.index, 'count_missing': df.isnull().sum(axis=1)})
    df = pd.DataFrame(df[row_with_missing['count_missing'] < n_for_rows])
    # drop categorical features with more than 10 levels, plus GEBAEUDETYP (see above)
    varstodrop = ['GFK_URLAUBERTYP', 'LP_FAMILIE_FEIN', 'CAMEO_DEU_2015', 'GEBAEUDETYP']
    df = pd.DataFrame(df.drop(varstodrop, axis=1))
    # re-encode the levels in ANREDE_KZ, SOHO_KZ and OST_WEST_KZ into [0,1]
    df['ANREDE_KZ'] = df['ANREDE_KZ'].map({1: 0, 2: 1})
    df['SOHO_KZ'] = df['SOHO_KZ'].map({0.0: 0, 1.0: 1})
    df['OST_WEST_KZ'] = df['OST_WEST_KZ'].map({'W': 0, 'O': 1})
    # one-hot encode the categorical variables kept in the analysis
    vars_to_encode = ['CJT_GESAMTTYP', 'FINANZTYP', 'LP_FAMILIE_GROB', 'LP_STATUS_FEIN',
                      'LP_STATUS_GROB', 'NATIONALITAET_KZ', 'SHOPPER_TYP', 'VERS_TYP',
                      'ZABEOTYP', 'CAMEO_DEUG_2015', 'OST_WEST_KZ', 'ANREDE_KZ',
                      'GREEN_AVANTGARDE', 'SOHO_KZ']
    df = pd.DataFrame(pd.get_dummies(df, prefix=vars_to_encode, columns=vars_to_encode))
    df.reset_index(drop=True, inplace=True)
    # mixed variables: PRAEGENDE_JUGENDJAHRE -> decade and movement
    df['PJ_decade'] = df['PRAEGENDE_JUGENDJAHRE']
    df['PJ_movement'] = df['PRAEGENDE_JUGENDJAHRE']
    x = {1.0: 40, 2.0: 40, 3.0: 50, 4.0: 50, 5.0: 60, 6.0: 60, 7.0: 60, 8.0: 70, 9.0: 70,
         10.0: 80, 11.0: 80, 12.0: 80, 13.0: 80, 14.0: 90, 15.0: 90}
    # movement per Data_Dictionary.md (note: 12 = FDJ / communist party youth organisation is Mainstream)
    y = {1.0: 'Mainstream', 2.0: 'Avantgarde', 3.0: 'Mainstream', 4.0: 'Avantgarde',
         5.0: 'Mainstream', 6.0: 'Avantgarde', 7.0: 'Avantgarde', 8.0: 'Mainstream', 9.0: 'Avantgarde',
         10.0: 'Mainstream', 11.0: 'Avantgarde', 12.0: 'Mainstream', 13.0: 'Avantgarde',
         14.0: 'Mainstream', 15.0: 'Avantgarde'}
    df['PJ_decade'] = df['PJ_decade'].map(x)
    df['PJ_movement'] = df['PJ_movement'].map(y)
    df['PJ_movement'] = df['PJ_movement'].map({'Mainstream': 0, 'Avantgarde': 1})
    # mixed variables: CAMEO_INTL_2015 -> two digit-based columns (see the cells above)
    df['CAMEO_wealth'] = df['CAMEO_INTL_2015']
    df['CAMEO_life'] = df['CAMEO_INTL_2015']
    not_nan = df['CAMEO_INTL_2015'][df['CAMEO_INTL_2015'].notnull()]
    df.loc[df['CAMEO_INTL_2015'].notnull(), 'CAMEO_wealth'] = [int(r) % 5 for r in not_nan]
    df.loc[df['CAMEO_wealth'] == 0, 'CAMEO_wealth'] = 5
    df.loc[df['CAMEO_INTL_2015'].notnull(), 'CAMEO_life'] = [int(int(r) / 10) for r in not_nan]
    # drop the original mixed columns plus WOHNLAGE, PLZ8_BAUMAX and the two LP_LEBENSPHASE features
    drop_vars = ['PRAEGENDE_JUGENDJAHRE', 'WOHNLAGE', 'PLZ8_BAUMAX', 'CAMEO_INTL_2015',
                 'LP_LEBENSPHASE_FEIN', 'LP_LEBENSPHASE_GROB']
    df_new = pd.DataFrame(df.drop(drop_vars, axis=1))
    # return the cleaned dataframe
    return df_new
azdias = pd.read_csv("Udacity_AZDIAS_Subset.csv", sep=';')
azdias_encoded = clean_data(azdias, 10)
records_na = azdias_encoded.index[azdias_encoded.isnull().any(axis=1)]
print('There are {} rows out of {} with missing values ({:.2%}).'.format(len(records_na), len(azdias_encoded), len(records_na)/len(azdias_encoded)))
print('Since the amount of rows with missing values is not very large, I decide to remove these rows.')
# If you've not yet cleaned the dataset of all NaN values, then investigate and
# do that now.
azdias_no_na = pd.DataFrame(azdias_encoded.dropna())
azdias_no_na.reset_index(drop=True, inplace= True)
# Apply feature scaling to the general population demographics data.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
azdias_no_na = pd.DataFrame(scaler.fit_transform(azdias_no_na))
azdias_no_na.columns = azdias_encoded.columns
azdias_no_na.head()
There are 73898 rows out of 697109 with missing values (10.60%). Since the share of rows with missing values is not very large, I decide to simply remove these rows.
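As a hedged alternative sketch (not used in the analysis below), the remaining NaNs could instead be imputed before scaling, which keeps all rows. SimpleImputer is assumed available (scikit-learn >= 0.20; older versions expose sklearn.preprocessing.Imputer instead).
# Alternative to dropping rows: impute remaining NaNs with the per-column mode.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')  # mode suits ordinal/dummy columns
azdias_imputed = pd.DataFrame(imputer.fit_transform(azdias_encoded),
                              columns=azdias_encoded.columns)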
# Apply PCA to the data.
from sklearn.decomposition import PCA
pca = PCA()
X_pca = pca.fit_transform(azdias_no_na)
#check the dimensionality of pca
pca.components_.shape
# Investigate the variance accounted for by each principal component.
def scree_plot(pca):  # referenced from helper_functions.py for Interpret_PC_Results_SC
    '''
    Creates a scree plot associated with the principal components
    INPUT: pca - the result of instantiation of PCA in scikit-learn
    OUTPUT: None
    '''
    num_components = len(pca.explained_variance_ratio_)
    ind = np.arange(num_components)
    vals = pca.explained_variance_ratio_
    plt.figure(figsize=(25, 10))
    ax = plt.subplot(111)
    cumvals = np.cumsum(vals)
    ax.bar(ind, vals)
    ax.plot(ind, cumvals)
    for i in range(num_components):
        ax.annotate(r"%s%%" % ((str(vals[i]*100)[:4])), (ind[i]+0.2, vals[i]), va="bottom", ha="center", fontsize=12)
    ax.xaxis.set_tick_params(width=0)
    ax.yaxis.set_tick_params(width=2, length=12)
    ax.set_xlabel("Principal Component")
    ax.set_ylabel("Variance Explained (%)")
    plt.title('Explained Variance Per Principal Component')
scree_plot(pca)
#list the explained-variance ratio of each component
ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1)
#check how many components are needed to explain at least 85% of the variability in the original dataset
for i in range(1, len(ratios) + 1):
    sum_variance = ratios[0:i].sum()
    if sum_variance > 0.85:
        num_comps = i
        break
print("Using {} components, we can explain {:.2%} of the variability in the original data.".format(num_comps, sum_variance))
# Re-apply PCA to the data while selecting for number of components to retain.
#Re-apply PCA to select the first 51 components
pca = PCA(num_comps)
X_pca = pca.fit_transform(azdias_no_na)
The intention of feature reduction with PCA is to reduce dimensionality while retaining as much information as we can, so we look for the point where additional components stop adding much explained variability. I therefore decide to retain at least 85% of the variability in the original dataset, since beyond 85% additional components contribute little. This keeps 51 principal components.
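Note that scikit-learn can also pick the component count from a variance target directly: passing a float in (0, 1) as n_components keeps just enough components to reach that fraction. A sketch, assuming the same scaled data, that should agree with the manual loop above:
# Let PCA choose the number of components for >= 85% explained variance.
pca_85 = PCA(n_components=0.85)
X_pca_85 = pca_85.fit_transform(azdias_no_na)
print(pca_85.n_components_, pca_85.explained_variance_ratio_.sum())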
def pca_plot(df, pca, i_th_comp):
    '''
    Create a DataFrame of the PCA results
    Includes dimension feature weights and explained variance
    Visualizes the PCA results
    Note: i_th_comp is a zero-based index into pca.components_
    '''
    components_ = pca.components_[i_th_comp, :]
    explained_variance_ratio_ = pca.explained_variance_ratio_[i_th_comp]
    # PCA components
    components = pd.DataFrame({'weights': np.round(components_, 4)})
    components.index = list(df.columns)
    components.sort_values(by='weights', ascending=True, inplace=True)
    components = pd.DataFrame(components.transpose())
    components.reset_index(drop=True, inplace=True)
    display(components)
    # Create a bar plot visualization
    fig = plt.subplots(figsize=(60, 8))
    features = list(components.columns)
    y_pos = np.arange(len(features))
    weights = list(components.iloc[0])
    plt.bar(y_pos, weights, align='center', alpha=0.5)
    plt.xticks(y_pos, features, rotation=90)
    plt.ylabel('Weights')
    plt.show()
# Map weights for the first principal component to corresponding feature names
# and then print the linked values, sorted by weight.
# HINT: Try defining a function here or in a new cell that you can reuse in the
# other cells.
pca_plot(azdias_no_na, pca, 1)
# Map weights for the second principal component to corresponding feature names
# and then print the linked values, sorted by weight.
pca_plot(azdias_no_na, pca, 2)
# Map weights for the third principal component to corresponding feature names
# and then print the linked values, sorted by weight.
pca_plot(azdias_no_na, pca, 3)
1st PC: Among the negative weights, 'PJ_decade' and 'SEMIO_REL' both have large weights of the same sign, so increases in one are expected to go with increases in the other. Among the positive weights, 'ZABEOTYP_3' and 'ALTERSKATEGORIE_GROB' likewise share a sign and tend to move together. The sign gives the direction of the correlation between a feature and the PC, and the magnitude gives the feature's contribution to it, so these two groups contribute strongly to this PC in opposite directions. By contrast, features with opposite signs can be expected to show a negative correlation: increases in one should come with decreases in the other.
2nd PC: 'ANREDE_KZ_1' and 'SEMIO_KAEM' both have large negative weights, so increases in one are expected to go with increases in the other; 'SEMIO_VERT' and 'ANREDE_KZ_0' both have large positive weights and likewise tend to move together. As above, the two groups contribute strongly to this PC in opposite directions, and features with opposite signs can be expected to show a negative correlation.
3rd PC: 'GREEN_AVANTGARDE_0' and 'LP_STATUS_GROB_4.0' both have large negative weights, so increases in one are expected to go with increases in the other; 'PJ_movement' and 'GREEN_AVANTGARDE_1' both have large positive weights and likewise tend to move together. Again, the two groups contribute strongly to this PC in opposite directions, and features with opposite signs can be expected to show a negative correlation.
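These pairwise readings can also be pulled out as a small table; the helper below is a sketch over the fitted pca and the scaled frame, using the same zero-based indexing as pca_plot, and top_weights is a name introduced here for illustration.
# List the k strongest positive and negative weights of one component.
def top_weights(df, pca, i_th_comp, k=5):
    w = pd.Series(pca.components_[i_th_comp], index=df.columns)
    return pd.concat([w.nlargest(k), w.nsmallest(k)])

top_weights(azdias_no_na, pca, 1)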
# Over a number of different cluster counts...
# run k-means clustering on the data and...
# compute the average within-cluster distances.
def get_kmeans_score(data, center):  # referenced from Changing K - Solution
    # instantiate kmeans
    kmeans = KMeans(n_clusters=center)
    # fit the model to the data
    model = kmeans.fit(data)
    # obtain a score related to the model fit
    score = np.abs(model.score(data))
    return score
# Investigate the change in within-cluster distance across number of clusters.
# HINT: Use matplotlib's plot function to visualize this relationship.
scores = []
centers = list(range(1,30))
for center in centers:
    scores.append(get_kmeans_score(X_pca, center))
plt.plot(centers, scores, linestyle='--', marker='o', color='b');
plt.xlabel('K');
plt.ylabel('SSE');
plt.title('SSE vs. K');
plt.plot(centers, np.gradient(scores))
plt.xlabel('K')
plt.ylabel('Slope of SSE')
plt.title('SSE Slope vs. K')
# Re-fit the k-means model with the selected number of clusters and obtain
# cluster predictions for the general population demographics data.
#instantiate kmeans
kmeans = KMeans(n_clusters=15)
#Then fit the model to data using the fit method
model = kmeans.fit(X_pca)
azdias_label = pd.DataFrame({"Cluster_ID": model.predict(X_pca)})
azdias_pca = pd.DataFrame(data = X_pca, columns = ['Dimension {}'.format(i) for i in range(1,len(pca.components_)+1)])
azdias_pca = pd.concat([azdias_pca, azdias_label], axis=1)
azdias_pca.head()
From the plot, we can see that the SSE decreases as the number of clusters increases. I then calculate the slope at each number of clusters; the slopes converge toward zero and stabilize around 15. Therefore, I use 15 clusters to segment the population.
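The same reading can be done numerically: the relative SSE improvement per added cluster shrinks toward zero as K grows. A rough sketch over the scores/centers lists computed above, where the ~1% cutoff is an assumption for illustration, not a rule:
# Relative SSE improvement per added cluster; list the first Ks below the cutoff.
rel_gain = -np.diff(scores) / np.array(scores[:-1])
elbow_candidates = [k for k, g in zip(centers[1:], rel_gain) if g < 0.01]
print(elbow_candidates[:3])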
# Load in the customer demographics data.
customers = pd.read_csv("Udacity_CUSTOMERS_Subset.csv", sep=';')
customers.shape
# Investigate patterns in the amount of missing data in each column.
col_with_missing = pd.DataFrame(data={'vars':customers.columns, 'count_missing': customers.isnull().sum(axis=0)})
col_with_missing.reset_index(drop =True, inplace=True)
ax = col_with_missing.plot.bar(x='vars', y='count_missing', rot=90, figsize=(25,5))
# How much data is missing in each row of the dataset now?
row_with_missing = pd.DataFrame(data={'row_idx':customers.index, 'count_missing': customers.isnull().sum(axis=1)})
#how many rows with missing value
num_miss_row = customers.index[customers.isnull().any(axis=1)]
print('There are {} rows with missing values, totaling {} missing entries.'.format(len(num_miss_row), np.sum(row_with_missing['count_missing'])))
#plot histogram of missingness in rows to choose a suitable threshold
plt.hist(row_with_missing['count_missing'])
# Apply preprocessing, feature transformation, and clustering from the general
# demographics onto the customer data, obtaining cluster predictions for the
# customer demographics data.
customers_cleaned = clean_data(customers, 5)
customers_no_na = pd.DataFrame(customers_cleaned.dropna())
customers_no_na.reset_index(drop=True, inplace= True)
# Apply feature scaling
from sklearn.preprocessing import StandardScaler
customers_no_na = pd.DataFrame(scaler.transform(customers_no_na))
customers_no_na.columns = customers_cleaned.columns
X_hat_pca = pca.transform(customers_no_na)
#K-Means
customer_label = pd.DataFrame({"Cluster_ID": model.predict(X_hat_pca)})
customers_pca = pd.DataFrame(data = X_hat_pca, columns = ['Dimension {}'.format(i) for i in range(1,len(pca.components_)+1)])
customers_pca = pd.concat([customers_pca, customer_label], axis=1)
customers_pca.head()
Tip: use the .inverse_transform() method of the PCA and StandardScaler objects to transform centroids back to the original data space and interpret the retrieved values directly.
# Compare the proportion of data in each cluster for the customer data to the
# proportion of data in each cluster for the general population.
df_azdias_scaled_pca = azdias_pca
df_customers_scaled_pca = customers_pca
customers_na_cnt = customers.shape[0] - customers_cleaned.shape[0]
azdias_na_cnt = azdias.shape[0] - azdias_encoded.shape[0]
n_clusters = 15
azdias_pca['label'] = 'General'
customers_pca['label'] = 'Customer'
pred_label = pd.concat([azdias_pca, customers_pca])
# some trivial calculations below
plot_data = pred_label.groupby(['label'])['Cluster_ID'].value_counts().astype(float).rename("Count")
plot_data = pd.DataFrame(plot_data)
plot_data.reset_index(inplace=True)
# assign the large missingness data to a new cluster
ls = [['Customer', n_clusters, customers_na_cnt], ['General', n_clusters, azdias_na_cnt]]
for label, cluster_id, cnt in ls:
    new_cluster = pd.Series({'label': label, 'Cluster_ID': cluster_id, 'Count': cnt})
    plot_data = plot_data.append(new_cluster, ignore_index=True)
# total number of counts in population and customer dataset, including the large-NaN data
total_cus = plot_data[plot_data.label == 'Customer'].Count.sum(axis = 0)
total_gen = plot_data[plot_data.label == 'General'].Count.sum(axis = 0)
# calculate the percentage distribution
plot_data['Percentage'] = np.where(plot_data['label']=='Customer', plot_data['Count']/total_cus*100, plot_data['Count']/total_gen*100)
fig = plt.figure(figsize = (15,8))
ax = sns.barplot(x='Cluster_ID', y="Percentage", hue='label', data=plot_data)
ax.set_xlabel("Cluster ID", fontsize = 15)
ax.set_ylabel("Percentage (%)", fontsize = 15)
#ax.set_title(column, fontsize = 20)
ax.grid()
plt.annotate("The {}th Cluster represents the large missingness data".format(n_clusters), xy=(0.35, 1.02), xycoords='axes fraction', fontsize = 15);
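To pick the clusters to interpret programmatically rather than by eye, the percentage table can be pivoted and differenced; a sketch over plot_data from the cell above, where gap is a name introduced here for illustration:
# Customer-minus-General percentage gap per cluster: the max is the most
# overrepresented cluster among customers, the min the most underrepresented.
gap = plot_data.pivot(index='Cluster_ID', columns='label', values='Percentage')
gap = gap.drop(index=n_clusters)   # exclude the synthetic large-missingness cluster
gap['delta'] = gap['Customer'] - gap['General']
print('over:', gap['delta'].idxmax(), ' under:', gap['delta'].idxmin())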
# What kinds of people are part of a cluster that is overrepresented in the
# customer data compared to the general population?
centroid_id = [11, 2]
cluster_center = kmeans.cluster_centers_[centroid_id, :]
result = pd.DataFrame(data=scaler.inverse_transform(pca.inverse_transform(cluster_center)),
                      columns=azdias_no_na.columns,
                      index=['Cluster 11 (overrepresented)', 'Cluster 2 (underrepresented)'])
# income-related features
income_col = ['HH_EINKOMMEN_SCORE','LP_STATUS_GROB_1.0','LP_STATUS_GROB_2.0',\
'LP_STATUS_GROB_3.0','LP_STATUS_GROB_4.0','LP_STATUS_GROB_5.0',\
'FINANZ_MINIMALIST', 'CAMEO_wealth']
# age-related features
age_col = ['ALTERSKATEGORIE_GROB', 'PJ_decade']
# gender-related features
gender_col = ['ANREDE_KZ_0','ANREDE_KZ_1']
# movement features
move_col = ['GREEN_AVANTGARDE_0', 'GREEN_AVANTGARDE_1', 'PJ_movement']
# building location
location_col = ['OST_WEST_KZ_1', 'OST_WEST_KZ_0']
features = income_col + age_col + gender_col + move_col + location_col
income = ['Income' for i in np.arange(len(income_col))]
age = ['Age' for i in np.arange(len(age_col))]
gender = ['Gender' for i in np.arange(len(gender_col))]
movement = ['Movement' for i in np.arange(len(move_col))]
location = ['Apartment Location' for i in np.arange(len(location_col))]
parents = income+age+gender+movement+location
meaning = ['Estimated household net income', 'low-income earners', 'average earners', \
'independents', 'houseowners', 'top earners', 'low financial interest', 'Wealth Scale',\
'Estimated age', 'Generation Decades', 'Male', 'Female', \
'mainstream', 'avantgarde', 'mainstream vs. avantgarde', 'East Germany', 'West Germany']
summary = pd.DataFrame(result[features].T.values, \
index=[parents, features, meaning], columns = result.index)
summary
From the table, we can see that the most overrepresented cluster (Cluster 11) is a group of people with higher income (HH_EINKOMMEN_SCORE: a lower scale value indicates higher income), born around 1968, involved in the green avantgarde, and living in west Germany, whereas people in the most underrepresented cluster (Cluster 2) are mostly female, lower-income earners, born around 1988, and not members of the green avantgarde; roughly the opposite of the people in the most overrepresented cluster.